feat: deploy model api server to chicago-web01

parent 515a0e6d81
commit 0cb89ddc80

6 changed files with 394 additions and 18 deletions

api/.gitignore (vendored, 4 changes)
@@ -9,6 +9,6 @@ venv/
 ENV/

 # NLTK Data (uncomment if you want to download on each deployment)
-nltk_data/
-models_cache/
+cache-nltk/
+cache-huggingface/
 genfiles

api/deploy.sh (new executable file, 118 lines)
@@ -0,0 +1,118 @@
#!/bin/bash
set -eu
# pipefail is not POSIX sh, hence the bash shebang
set -o pipefail

# Deployment topology with Nomad
#
# Directory structure on remote:
# $base/$project/
#   releases/{stamp}-{hash}/
#     salience/        - python package
#     transcript.txt   - default text file
#     pyproject.toml   - dependencies
#     .venv/           - virtual environment (created by uv)
#     job.nomad.hcl    - nomad job definition for this release
#
# Zero-downtime deployment with Nomad:
# 1. rsync new release (salience/ + pyproject.toml + transcript.txt)
# 2. uv sync dependencies
# 3. generate job file with release path
# 4. nomad job run (triggers blue-green deployment)
# 5. nomad waits for health checks to pass
# 6. nomad auto-promotes new allocation
# 7. old allocation enters graceful shutdown (30s kill_timeout)
# 8. consul-template updates nginx config (via service tags)
# 9. cleanup old releases (keep 5 most recent)

ssh=deploy-peoplesgrocers-website
base=/home/peoplesgrocers
project=salience-editor-api

#git diff-index --quiet HEAD || { echo 'git repo dirty'; exit 1; }

hash=$(git rev-parse --short=8 HEAD)
stamp=$(date +%Y-%b-%d-%a-%I_%M%p | tr 'APM' 'apm')
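# e.g. 2025-Nov-27-Thu-06_19pm (tr lowercases the AM/PM, plus any A/P/M in the month)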
release="${stamp}-${hash}"

echo "deploying: $project @ $release"
printf "continue? [y/n] "
read -r ans
test "$ans" = "y" || exit 1

# prepare remote directories
ssh $ssh "mkdir -p $base/$project/releases/$release"

# sync all files using rclone (handles poor network connections better)
echo "syncing release files..."
temp_dir=$(mktemp -d)
trap "rm -rf $temp_dir" EXIT INT TERM

# Copy files to temp directory for single rclone transfer
rsync -a \
  --exclude '__pycache__' \
  --exclude '*.swp' \
  salience pyproject.toml transcript.txt README.md \
  "$temp_dir/"
test -f uv.lock && cp uv.lock "$temp_dir/"
rclone copy "$temp_dir/" "${ssh}:$base/$project/releases/$release/" \
  --progress --retries 10 --checksum

rm -rf "$temp_dir"
echo "installing dependencies with uv..."
ssh $ssh "cd $base/$project/releases/$release && ~/.local/bin/uv sync --link-mode symlink"
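# (--link-mode symlink links packages from uv's shared cache instead of
# copying them, keeping each release's .venv small)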

# generate nomad job file with release path
echo "generating nomad job file..."
release_path="$base/$project/releases/$release"
job_file="$base/$project/releases/$release/job.nomad.hcl"

# Use envsubst with a whitelist to only replace our variables, not Nomad runtime variables
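# (an unrestricted envsubst would also expand Nomad placeholders like
# ${NOMAD_PORT_http} to empty strings, since they are unset in this shell)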
export RELEASE_PLACEHOLDER="$release"
export RELEASE_PATH="$release_path"
envsubst '$RELEASE_PLACEHOLDER $RELEASE_PATH' < salience-editor-api.nomad.hcl | ssh $ssh "cat > $job_file"

echo ""
echo "nomad job file created at: $job_file"
echo ""

# submit job to nomad
echo "submitting job to nomad..."
deployment_id=$(ssh $ssh "source ~/.local/bin/env && nomad job run $job_file | grep -oE 'Deployment ID = [a-f0-9-]+' | awk '{print \$4}'")

if [ -n "$deployment_id" ]; then
  echo "deployment started: $deployment_id"
  echo ""
  echo "monitoring deployment..."

  # Monitor deployment status
  ssh $ssh "source ~/.local/bin/env && nomad deployment status $deployment_id"

  echo ""
  printf "watch deployment progress? [y/n] "
  read -r ans
  if [ "$ans" = "y" ]; then
    ssh $ssh "source ~/.local/bin/env && watch -n 2 'nomad deployment status $deployment_id'"
  fi
else
  echo "warning: could not extract deployment ID"
  echo "check deployment status manually with: nomad job status $project"
fi

echo ""
echo "done: $release"
echo ""
echo "Next steps:"
echo "- Nomad will automatically promote the deployment after health checks pass"
echo "- Consul-template will update nginx config based on healthy service instances"
echo "- Old allocation will gracefully shutdown (30s timeout for in-flight requests)"
echo "- Run ./cleanup-old-releases.sh to remove old releases (keeps 5 most recent)"
echo ""
if [ -n "$deployment_id" ]; then
  echo "Monitor deployment:"
  echo "  nomad deployment status $deployment_id"
  echo "  watch -n 2 'nomad deployment status $deployment_id'"
  echo "  nomad job allocs $project"
  echo ""
fi
echo "Check service health:"
echo "  curl http://localhost:15500/v1/health/service/$project | jq"

api/salience-editor-api.nomad.hcl (new file, 127 lines)
@@ -0,0 +1,127 @@
job "salience-editor-api" {
  datacenters = ["ord10"]
  type        = "service"

  constraint {
    attribute = "${node.unique.name}"
    value     = "chicago-web01"
  }

  group "app" {
    count = 1

    network {
      mode = "host"

      port "http" {
        # Nomad will assign an available port
      }
    }

    update {
      max_parallel     = 1
      health_check     = "checks"
      min_healthy_time = "10s"
      healthy_deadline = "5m"
      auto_promote     = true
      auto_revert      = true
      canary           = 1
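
      # canary = 1 with auto_promote/auto_revert is what gives the blue-green
      # behavior described in deploy.sh: the new allocation starts alongside
      # the old one and only replaces it after health checks pass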
    }

    task "gunicorn-server" {
      driver = "raw_exec"

      user = "peoplesgrocers"

      config {
        work_dir = "$RELEASE_PATH"
        command  = "/home/peoplesgrocers/.local/bin/uv"
        # You can add --log-level debug to gunicorn
        args = ["run", "gunicorn", "--preload", "--workers", "3", "--bind", "127.0.0.1:${NOMAD_PORT_http}", "--timeout", "300", "salience:app"]
      }

      env {
        PORT   = "${NOMAD_PORT_http}"
        ORIGIN = "https://peoplesgrocers.com"
        #PATH = "/home/peoplesgrocers/.local/bin:/usr/local/bin:/usr/bin:/bin"
        HOME         = "/home/peoplesgrocers"
        UV_CACHE_DIR = "/home/peoplesgrocers/.cache/uv"
        HF_HOME      = "/home/peoplesgrocers/cache-huggingface"
        NLTK_DATA    = "/home/peoplesgrocers/cache-nltk"
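        # HF_HOME / NLTK_DATA live outside the release directories, so model
        # and tokenizer downloads are reused across deployments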
      }

      # Release path set during deployment via envsubst
      template {
        data        = <<EOH
RELEASE_PATH="$RELEASE_PATH"
EOH
        destination = "local/env"
        env         = true
      }

      service {
        name = "salience-editor-api"
        port = "http"

        tags = [
          "flask",
          "gunicorn",
          "api",
          "ml"
        ]

        # Health check on stats endpoint (lightweight)
        check {
          type     = "http"
          path     = "/stats"
          interval = "10s"
          timeout  = "5s"

          check_restart {
            limit = 3
            grace = "180s" # 3 minutes for model loading
          }
        }

        meta {
          version = "$RELEASE_PLACEHOLDER"
        }
      }

      template {
        data        = <<EOH
#!/bin/sh
host=http://127.0.0.1:{{ env "NOMAD_PORT_http" }}

echo "=== /models ==="
curl -s "$host/models"
echo

echo "=== /salience ==="
curl -s -X POST -d "The cat sat on the mat. The dog chased the cat." "$host/salience?model=all-mpnet-base-v2"
echo
EOH
        destination = "local/smoke-test.sh"
        perms       = "0755"
        change_mode = "script"
        change_script {
          command = "/bin/sh"
          args    = ["-c", "cp ${NOMAD_TASK_DIR}/smoke-test.sh $RELEASE_PATH/smoke-test.sh"]
        }
      }

      resources {
        cpu = 2000
        # If the task keeps dying with Error code 137, check
        #   sudo dmesg -T | grep -i "killed process"
        # What I saw when the memory limit was too low was
        # [Thu Nov 27 18:19:09 2025] Memory cgroup out of memory: Killed process 2750984 (gunicorn) total-vm:4556920kB, anon-rss:295900kB, file-rss:244188kB, shmem-rss:0kB, UID:1010 pgtables:1920kB oom_score_adj:0
        memory = 8000
      }

      # I manually timed it once. Took a good 18 seconds to shutdown.
      kill_timeout = "30s"
      kill_signal  = "SIGTERM"
    }
  }
}

api/salience/__init__.py
@@ -1,12 +1,74 @@
+# Memory Sharing for ML Models
+# ============================
+# This app is designed to run with Gunicorn's --preload flag, which loads the
+# SentenceTransformer models once in the master process before forking workers.
+# On Linux, fork uses copy-on-write (COW) semantics, so workers share the
+# read-only model weights in memory rather than each loading their own copy.
+# This is critical for keeping memory usage reasonable with large transformer models.
+#
+# ResourceTracker errors on shutdown (Python 3.14):
+# When you Ctrl+C the Gunicorn process, you may see
+# "ChildProcessError: [Errno 10] No child processes"
+# from multiprocessing.resource_tracker.
+#
+# I think this is harmless. I think what happens is each forked worker gets a
+# copy of the ResourceTracker object, then each copy tries to deallocate the
+# same resources. The process still shuts down reasonably quickly, so I'm not
+# concerned.
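+#
+# (To sanity-check the sharing, compare Rss vs Pss for a worker pid:
+#   grep -E '^(Rss|Pss)' /proc/<worker-pid>/smaps_rollup
+# Pss well below Rss means the model pages really are shared.)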
+
+print("Starting salience __init__.py...")
+
 from flask import Flask, request
 from flask_cors import CORS
 import numpy as np
 from .salience import extract, AVAILABLE_MODELS
 import json
+import time
+from collections import deque
+import threading

 app = Flask(__name__)
 CORS(app, origins=["http://localhost:5173"])
+
+# Thread-safe stats tracker for this worker process
+class StatsTracker:
+    def __init__(self):
+        # Store (start_time, end_time, duration) for successful requests
+        self.processing_spans = deque(maxlen=1000)
+        # Store arrival timestamps for overflow requests
+        self.overflow_arrivals = deque(maxlen=1000)
+        self.lock = threading.Lock()
+
+    def add_processing_span(self, start_time, end_time):
+        duration = end_time - start_time
+        with self.lock:
+            self.processing_spans.append((start_time, end_time, duration))
+            # Clean old entries (>5 min)
+            cutoff = time.time() - 300
+            while self.processing_spans and self.processing_spans[0][0] < cutoff:
+                self.processing_spans.popleft()
+
+    def add_overflow_arrival(self, arrival_time):
+        with self.lock:
+            self.overflow_arrivals.append(arrival_time)
+            # Clean old entries (>5 min)
+            cutoff = time.time() - 300
+            while self.overflow_arrivals and self.overflow_arrivals[0] < cutoff:
+                self.overflow_arrivals.popleft()
+
+    def get_stats(self):
+        with self.lock:
+            return {
+                'processing_spans': [
+                    {'start': start, 'end': end, 'duration': duration}
+                    for start, end, duration in self.processing_spans
+                ],
+                'overflow_arrivals': list(self.overflow_arrivals),
+                'window_seconds': 300  # 5 minutes
+            }
+
+stats_tracker = StatsTracker()

 # Load default text from transcript.txt for GET requests
 with open('./transcript.txt', 'r') as file:
     default_source_text = file.read().strip()

@@ -15,9 +77,40 @@ with open('./transcript.txt', 'r') as file:
 def models_view():
     return json.dumps(list(AVAILABLE_MODELS.keys()))
+
+@app.route("/overflow", methods=['GET', 'POST'])
+def overflow_view():
+    """
+    Endpoint hit when HAProxy queue is full.
+    Returns 429 with statistics about processing and overflow.
+    """
+    arrival_time = time.time()
+    stats_tracker.add_overflow_arrival(arrival_time)
+
+    stats = stats_tracker.get_stats()
+
+    response = {
+        'error': 'Queue full',
+        'status': 429,
+        'stats': stats,
+        'message': 'Service is at capacity. Try again or check queue statistics.'
+    }
+
+    return json.dumps(response), 429
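+
+# (Assumption: HAProxy is configured to route requests here once its queue
+# limit is reached, e.g. via a backup backend; that config is not part of
+# this commit.)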
+
+@app.route("/stats")
+def stats_view():
+    """
+    Endpoint for frontend to poll current queue statistics.
+    Returns processing spans and overflow arrivals from last 5 minutes.
+    """
+    stats = stats_tracker.get_stats()
+    return json.dumps(stats)
+
 @app.route("/salience", methods=['GET'])
 def salience_view_default():
     """GET endpoint - processes default text from transcript.txt"""
+    start_time = time.time()
+
     model_name = request.args.get('model', 'all-mpnet-base-v2')

     # Validate model name

@@ -26,6 +119,9 @@ def salience_view_default():
     sentence_ranges, adjacency = extract(default_source_text, model_name)

+    end_time = time.time()
+    stats_tracker.add_processing_span(start_time, end_time)
+
     return json.dumps({
         'source': default_source_text,
         'intervals': sentence_ranges,

@@ -36,6 +132,8 @@ def salience_view_default():
 @app.route("/salience", methods=['POST'])
 def salience_view_custom():
     """POST endpoint - processes text from request body"""
+    start_time = time.time()
+
     model_name = request.args.get('model', 'all-mpnet-base-v2')

     # Validate model name

@@ -50,6 +148,9 @@ def salience_view_custom():
     sentence_ranges, adjacency = extract(source_text, model_name)

+    end_time = time.time()
+    stats_tracker.add_processing_span(start_time, end_time)
+
     return json.dumps({
         'source': source_text,
         'intervals': sentence_ranges,

api/salience/salience.py
@@ -1,24 +1,34 @@
-import numpy as np
-import torch
-from sentence_transformers import SentenceTransformer
-import nltk.data
-import nltk
 import os

-# Set NLTK data path to project directory
+# Set default cache locations BEFORE importing libraries that use them
 PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-NLTK_DATA_DIR = os.path.join(PROJECT_DIR, 'nltk_data')
-TRANSFORMERS_CACHE_DIR = os.path.join(PROJECT_DIR, 'models_cache')
-
-# Add to NLTK's search path
-nltk.data.path.insert(0, NLTK_DATA_DIR)
-
-# Download to the custom location
+if 'NLTK_DATA' not in os.environ:
+    nltk_data_path = os.path.join(PROJECT_DIR, 'cache-nltk')
+    os.makedirs(nltk_data_path, exist_ok=True)
+    os.environ['NLTK_DATA'] = nltk_data_path
+
+if 'HF_HOME' not in os.environ:
+    os.environ['HF_HOME'] = os.path.join(PROJECT_DIR, 'cache-huggingface')
+
+from salience.timed_import import timed_import
+
+with timed_import("import numpy as np"):
+    import numpy as np
+with timed_import("import torch"):
+    import torch
+with timed_import("from sentence_transformers import SentenceTransformer"):
+    from sentence_transformers import SentenceTransformer
+with timed_import("import nltk"):
+    import nltk.data
+    import nltk

+# Download punkt_tab to the configured location
 # Using punkt_tab (the modern tab-separated format introduced in NLTK 3.8+)
 # instead of the older punkt pickle format
 # The punkt_tab model version depends on the NLTK Python package version
 # Check your NLTK version with: uv pip show nltk
-nltk.download('punkt_tab', download_dir=NLTK_DATA_DIR)
+nltk.download('punkt_tab')

 # Available models for the demo
 AVAILABLE_MODELS = {

@@ -46,13 +56,13 @@ AVAILABLE_MODELS = {
 print("Loading sentence transformer models...")
 models = {}

-models['all-mpnet-base-v2'] = SentenceTransformer('all-mpnet-base-v2', cache_folder=TRANSFORMERS_CACHE_DIR)
+models['all-mpnet-base-v2'] = SentenceTransformer('all-mpnet-base-v2')
 print("Loading Alibaba-NLP/gte-large-en-v1.5")
-models['gte-large-en-v1.5'] = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True, cache_folder=TRANSFORMERS_CACHE_DIR)
+models['gte-large-en-v1.5'] = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)
 #print("Loading Qwen/Qwen3-Embedding-4B")
-#models['qwen3-embedding-4b'] = SentenceTransformer('Qwen/Qwen3-Embedding-4B', trust_remote_code=True, cache_folder=TRANSFORMERS_CACHE_DIR)
+#models['qwen3-embedding-4b'] = SentenceTransformer('Qwen/Qwen3-Embedding-4B', trust_remote_code=True)
 print("Loading mixedbread-ai/mxbai-embed-large-v1")
-models["mxbai-embed-large-v1"] = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1', cache_folder=TRANSFORMERS_CACHE_DIR)
+models["mxbai-embed-large-v1"] = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
 print("All models loaded!")

 sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

api/salience/timed_import.py (new file, 20 lines)
@@ -0,0 +1,20 @@
import sys
import time


class timed_import:
    """Context manager for timing imports."""

    def __init__(self, name):
        self.name = name
        self.start = None

    def __enter__(self):
        sys.stdout.write(f"{self.name} ")
        sys.stdout.flush()
        self.start = time.time()
        return self

    def __exit__(self, *args):
        elapsed = time.time() - self.start
        print(f"in {elapsed:.1f}s")
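
# Example of the startup line this produces (timing illustrative):
#   import torch in 3.2s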