#!/bin/bash
# SkillDarbar API - Production Server Deployment Script
# Optimized for AI streaming, concurrent requests, and multi-process stability

set -e  # Exit on error

# ============================================================================
# CONFIGURATION
# ============================================================================
# --- Network binding ---
HOST='0.0.0.0'
PORT=30016

# --- Worker pool ---
# Number of worker processes (increased for better concurrency).
WORKERS=4
# Max simultaneous connections per worker.
WORKER_CONNECTIONS=1000

# --- Timeouts ---
# 5 minutes, to accommodate long AI responses.
TIMEOUT=300
# Time to finish existing requests on reload.
GRACEFUL_TIMEOUT=30
KEEP_ALIVE=5

# --- Worker recycling ---
# Restart a worker after N requests (prevents memory leaks from accumulating).
MAX_REQUESTS=500
# Random jitter so that all workers do not restart at once.
MAX_REQUESTS_JITTER=50

# ============================================================================
# RESOURCE LIMITS - Prevent runaway processes
# ============================================================================
# Pin the numeric libraries to one thread each and disable tokenizer
# parallelism (presumably so that every worker process does not spin up its
# own thread pool -- confirm against the application's actual dependencies).
export OPENBLAS_NUM_THREADS=1 \
       MKL_NUM_THREADS=1 \
       OMP_NUM_THREADS=1 \
       NUMEXPR_NUM_THREADS=1 \
       TOKENIZERS_PARALLELISM=false

# Memory allocator tuning: have CPython use the C library malloc, and set the
# allocator's trim threshold (in bytes) via its environment knob.
export PYTHONMALLOC=malloc \
       MALLOC_TRIM_THRESHOLD_=100000

# ============================================================================
# PATHS
# ============================================================================
# Resolve the directory containing this script to an absolute path and make it
# the working directory, so relative paths (venv/, app module) resolve there.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
cd "${SCRIPT_DIR}"

# Everything the server writes at runtime lives under logs/.
LOG_DIR="${SCRIPT_DIR}/logs"
ACCESS_LOG="${LOG_DIR}/access.log"
ERROR_LOG="${LOG_DIR}/error.log"
PID_FILE="${LOG_DIR}/gunicorn.pid"
LOCK_FILE="${LOG_DIR}/.start.lock"

# ============================================================================
# FUNCTIONS
# ============================================================================
# Print a message to stdout, prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp.
# Arguments: $1 - message text (may be empty)
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

# EXIT handler: remove the start lock file, but only when this process is the
# one holding the lock on fd 200.
#
# An instance that lost the race for the lock also runs this handler on exit.
# Unconditionally deleting the file there would unlink the file the running
# instance is locked on, letting a third instance create a fresh file and
# "acquire" the lock while the first one is still working.
# Globals: LOCK_FILE (read)
cleanup_lock() {
    # Re-requesting a lock already held through fd 200 succeeds immediately;
    # it fails if another process holds it or if fd 200 was never opened.
    if flock -n 200 2>/dev/null; then
        rm -f "$LOCK_FILE"
    fi
}
trap cleanup_lock EXIT

# Report whether something is listening on TCP port $PORT.
# Globals: PORT (read)
# Returns: 0 if lsof finds a listener (port in use), 1 otherwise (port free,
#          or lsof could not be run)
check_port() {
    lsof -Pi ":${PORT}" -sTCP:LISTEN -t >/dev/null 2>&1 && return 0
    return 1
}

# Inspect the ./venv virtual environment (relative to the current directory).
# Returns: 0 - venv exists and its Python interpreter runs
#          1 - venv exists but its interpreter cannot be executed
#              (repair instructions are logged)
#          2 - no venv directory exists
check_venv() {
    [ -d "venv" ] || return 2  # no venv exists

    # The venv is usable if its interpreter can at least report a version.
    if venv/bin/python --version >/dev/null 2>&1; then
        return 0  # venv is valid
    fi

    local -a advice=(
        "⚠️  Virtual environment is broken (Python interpreter missing)"
        "   The venv was created with a Python version that no longer exists."
        ""
        "To fix, run these commands on your server:"
        "   rm -rf venv"
        "   python3 -m venv venv"
        "   source venv/bin/activate"
        "   python -m pip install -r requirements.txt"
        ""
    )
    local advice_line
    for advice_line in "${advice[@]}"; do
        log "$advice_line"
    done
    return 1  # venv is broken
}

# Select the Python environment used for the rest of the script.
#
# Prefers ./venv when check_venv reports it usable; otherwise falls back to
# the system interpreter. When only `python` exists, a `python3` shell
# function is defined so later `python3 ...` calls keep working.
# Returns: 0 when an interpreter is available, 1 when none was found
activate_python() {
    # Try to activate virtual environment
    if [ -d "venv" ]; then
        if check_venv; then
            # shellcheck disable=SC1091  # generated by `python -m venv`
            source venv/bin/activate
            log "Virtual environment activated"
            return 0
        fi
        log "Attempting to use system Python instead..."
    fi

    # Fallback: use system Python
    if command -v python3 &>/dev/null; then
        log "Using system Python: $(python3 --version)"
        return 0
    elif command -v python &>/dev/null; then
        log "Using system Python: $(python --version)"
        # Aliases are not expanded in non-interactive shells, so an
        # `alias python3=python` would silently do nothing in this script.
        # A function is visible to every later call in this shell.
        python3() { python "$@"; }
        return 0
    else
        log "ERROR: No Python interpreter found!"
        return 1
    fi
}

# Stop any server instance left over from a previous start.
#
# 1. If a PID file exists and names a live process, send SIGTERM, wait up to
#    10 seconds, then SIGKILL if it is still alive. The PID file is removed.
# 2. Kill any remaining process whose command line matches "gunicorn.*:$PORT".
# 3. Abort the script (exit 1) if the port is still in use afterwards.
# Globals: PID_FILE, PORT (read)
kill_existing() {
    log "Checking for existing processes..."

    # Try graceful shutdown first via PID file
    if [ -f "$PID_FILE" ]; then
        local old_pid
        # An unreadable or empty PID file is treated as "no process".
        old_pid=$(cat "$PID_FILE" 2>/dev/null) || old_pid=""
        if [ -n "$old_pid" ] && ps -p "$old_pid" > /dev/null 2>&1; then
            log "Sending SIGTERM to existing process (PID: $old_pid)..."
            kill -TERM "$old_pid" 2>/dev/null || true

            # Wait for graceful shutdown
            local wait_count=0
            while ps -p "$old_pid" > /dev/null 2>&1 && [ "$wait_count" -lt 10 ]; do
                sleep 1
                # Plain assignment instead of ((wait_count++)): the latter
                # returns status 1 when the old value is 0, which would abort
                # the whole script under `set -e` on the first iteration.
                wait_count=$((wait_count + 1))
            done

            # Force kill if still running
            if ps -p "$old_pid" > /dev/null 2>&1; then
                log "Force killing process..."
                kill -9 "$old_pid" 2>/dev/null || true
                sleep 1
            fi
        fi
        rm -f "$PID_FILE"
    fi

    # Also kill any orphaned gunicorn processes on our port
    pkill -f "gunicorn.*:$PORT" 2>/dev/null || true
    sleep 1

    # Final check
    if check_port; then
        log "ERROR: Port $PORT still in use after cleanup!"
        # Diagnostic listing only; its status must not pre-empt the exit below.
        lsof -Pi ":$PORT" -sTCP:LISTEN || true
        exit 1
    fi
}

# Rotate the access and error logs when they exceed 50MB.
#
# An oversized log is renamed to "<log>.<YYYYmmdd_HHMMSS>.bak" and only the
# five newest backups of that log are kept.
# Globals: ACCESS_LOG, ERROR_LOG (read)
rotate_logs() {
    local max_size=$((50 * 1024 * 1024))  # 50MB
    local keep_backups=5
    local logfile size excess
    local -a backups

    for logfile in "$ACCESS_LOG" "$ERROR_LOG"; do
        if [ -f "$logfile" ]; then
            # BSD stat syntax first, then GNU stat; 0 if neither works.
            size=$(stat -f%z "$logfile" 2>/dev/null || stat -c%s "$logfile" 2>/dev/null || echo 0)
            if [ "$size" -gt "$max_size" ]; then
                log "Rotating $logfile (size: $size bytes)"
                mv "$logfile" "${logfile}.$(date '+%Y%m%d_%H%M%S').bak"

                # Keep only last 5 backups. The timestamp in the name makes
                # the glob expand oldest-first, so no `ls` parsing is needed
                # and paths containing whitespace are handled correctly.
                backups=("${logfile}".*.bak)
                excess=$(( ${#backups[@]} - keep_backups ))
                if [ "$excess" -gt 0 ]; then
                    # Best effort: a failed prune must not stop the deploy.
                    rm -f -- "${backups[@]:0:excess}" 2>/dev/null || true
                fi
            fi
        fi
    done
}

# Make sure the gunicorn and gevent packages are importable by python3,
# installing each one with pip when it is missing.
#
# Returns: 0 when both packages were already present or were installed
#          successfully; 1 as soon as an installation fails.
# A successful install is reported as success: the script runs under
# `set -e`, so a non-zero status from a bare call aborts the whole start-up.
install_dependencies() {
    if ! python3 -c "import gunicorn" 2>/dev/null; then
        log "Installing gunicorn..."
        if ! python3 -m pip install --quiet gunicorn; then
            log "ERROR: Failed to install gunicorn"
            return 1
        fi
    fi

    if ! python3 -c "import gevent" 2>/dev/null; then
        log "Installing gevent (async worker)..."
        if ! python3 -m pip install --quiet gevent; then
            log "ERROR: Failed to install gevent"
            return 1
        fi
    fi

    return 0
}

# ============================================================================
# MAIN
# ============================================================================

# Create directories first: the lock file lives inside $LOG_DIR, so opening it
# below would fail on a fresh checkout where logs/ does not exist yet.
mkdir -p "$LOG_DIR"

# Prevent multiple simultaneous starts: take an exclusive, non-blocking lock
# on fd 200, which stays open for the rest of the script.
exec 200>"$LOCK_FILE"
if ! flock -n 200; then
    log "ERROR: Another start script is already running!"
    exit 1
fi

log "=========================================="
log "SkillDarbar API - Starting Server"
log "=========================================="

# Activate Python environment (venv or system)
if ! activate_python; then
    log "ERROR: Cannot proceed without Python!"
    exit 1
fi

# Install dependencies if needed. A non-zero status from the installer is not
# treated as fatal by itself (it would otherwise abort the script silently
# under `set -e`); what matters is whether the packages are importable now.
if ! install_dependencies; then
    if ! python3 -c "import gunicorn, gevent" 2>/dev/null; then
        log "ERROR: Required packages (gunicorn, gevent) are not available!"
        exit 1
    fi
fi

# Rotate large log files
rotate_logs

# Kill existing processes
kill_existing

# Print the effective settings, one timestamped line per entry.
config_summary=(
    "Configuration:"
    "  Host: $HOST:$PORT"
    "  Workers: $WORKERS (gevent async)"
    "  Worker Connections: $WORKER_CONNECTIONS"
    "  Timeout: ${TIMEOUT}s"
    "  Max Requests: $MAX_REQUESTS (+/- $MAX_REQUESTS_JITTER jitter)"
    "  Logs: $LOG_DIR/"
)
for summary_line in "${config_summary[@]}"; do
    log "$summary_line"
done

# Start Gunicorn with gevent async workers for streaming.
# --daemon makes this command return once the server has detached; --pid
# records the master PID used by the checks below and by the next restart.
# The trailing `200>&-` closes the start-lock descriptor for the server
# process, so the long-running daemon does not inherit (and keep holding) the
# lock that is meant to cover only this script's run.
gunicorn \
    --bind "$HOST:$PORT" \
    --workers "$WORKERS" \
    --worker-class gevent \
    --worker-connections "$WORKER_CONNECTIONS" \
    --timeout "$TIMEOUT" \
    --graceful-timeout "$GRACEFUL_TIMEOUT" \
    --keep-alive "$KEEP_ALIVE" \
    --max-requests "$MAX_REQUESTS" \
    --max-requests-jitter "$MAX_REQUESTS_JITTER" \
    --access-logfile "$ACCESS_LOG" \
    --error-logfile "$ERROR_LOG" \
    --access-logformat '%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s" %(D)s' \
    --capture-output \
    --enable-stdio-inheritance \
    --pid "$PID_FILE" \
    --daemon \
    --preload \
    app:app 200>&-

# Verify startup: give the daemon a moment, then require a PID file that names
# a live process. The HTTP health probe is informational only.
sleep 3

if [ -f "$PID_FILE" ]; then
    PID=$(cat "$PID_FILE")
    if ps -p "$PID" > /dev/null 2>&1; then
        # Quick health check. --max-time bounds the probe so a server that
        # accepts the connection but never answers cannot hang this script
        # (which is still holding the start lock). A failed probe is not an
        # error, hence the `|| true` under `set -e`.
        http_code=$(curl -s -o /dev/null --max-time 5 -w "%{http_code}" \
            "http://localhost:$PORT/api/health" 2>/dev/null) || true
        if [ "$http_code" = "200" ]; then
            log "✅ API started successfully and responding!"
        else
            log "⚠️  API started (PID: $PID) but health check pending..."
        fi
        log "   PID: $PID"
        log "   Access logs: $ACCESS_LOG"
        log "   Error logs: $ERROR_LOG"
        log ""
        log "Management commands:"
        log "   Stop:    $SCRIPT_DIR/stop_webuzo.sh"
        log "   Reload:  kill -HUP $PID"
        log "   Status:  ps aux | grep gunicorn"
        exit 0
    fi
fi

log "❌ API failed to start! Last 30 lines of error log:"
tail -n 30 "$ERROR_LOG" 2>/dev/null || log "No error log available"
exit 1