From 0cb89ddc8074c2f70278c3cbe3f3191664234dd6 Mon Sep 17 00:00:00 2001
From: nobody
Date: Thu, 27 Nov 2025 11:01:54 -0800
Subject: [PATCH] feat: deploy model api server to chicago-web01

---
 api/.gitignore                    |   4 +-
 api/deploy.sh                     | 118 +++++++++++++++++++++++++++
 api/salience-editor-api.nomad.hcl | 127 ++++++++++++++++++++++++++++++
 api/salience/__init__.py          | 101 ++++++++++++++++++++++++
 api/salience/salience.py          |  42 ++++++----
 api/salience/timed_import.py      |  20 +++++
 6 files changed, 394 insertions(+), 18 deletions(-)
 create mode 100755 api/deploy.sh
 create mode 100644 api/salience-editor-api.nomad.hcl
 create mode 100644 api/salience/timed_import.py

diff --git a/api/.gitignore b/api/.gitignore
index 898f4a6..f8bf22a 100644
--- a/api/.gitignore
+++ b/api/.gitignore
@@ -9,6 +9,6 @@ venv/
 ENV/
 
 # NLTK Data (uncomment if you want to download on each deployment)
-nltk_data/
-models_cache/
+cache-nltk/
+cache-huggingface/
 genfiles

diff --git a/api/deploy.sh b/api/deploy.sh
new file mode 100755
index 0000000..eaabd8f
--- /dev/null
+++ b/api/deploy.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+# bash, not sh: `set -o pipefail` is not POSIX and fails under dash
+set -eu
+set -o pipefail
+
+# Deployment topology with Nomad
+#
+# Directory structure on remote:
+#   $base/$project/
+#     releases/{stamp}-{hash}/
+#       salience/       - python package
+#       transcript.txt  - default text file
+#       pyproject.toml  - dependencies
+#       .venv/          - virtual environment (created by uv)
+#       job.nomad.hcl   - nomad job definition for this release
+#
+# Zero-downtime deployment with Nomad:
+#   1. rsync new release (salience/ + pyproject.toml + transcript.txt)
+#   2. uv sync dependencies
+#   3. generate job file with release path
+#   4. nomad job run (triggers blue-green deployment)
+#   5. nomad waits for health checks to pass
+#   6. nomad auto-promotes new allocation
+#   7. old allocation enters graceful shutdown (30s kill_timeout)
+#   8. consul-template updates nginx config (via service tags)
+#   9. cleanup old releases (keep 5 most recent)
+
+ssh=deploy-peoplesgrocers-website
+base=/home/peoplesgrocers
+project=salience-editor-api
+
+#git diff-index --quiet HEAD || { echo 'git repo dirty'; exit 1; }
+
+hash=$(git rev-parse --short=8 HEAD)
+# e.g. 2025-Nov-27-Thu-11_01am (tr lowercases the AM/PM suffix)
+stamp=$(date +%Y-%b-%d-%a-%I_%M%p | tr 'APM' 'apm')
+release="${stamp}-${hash}"
+
+echo "deploying: $project @ $release"
+printf "continue? [y/n] "
+read -r ans
+test "$ans" = "y" || exit 1
+
+# prepare remote directories
+ssh $ssh "mkdir -p $base/$project/releases/$release"
+
+# sync all files using rclone (handles poor network connections better)
+echo "syncing release files..."
+temp_dir=$(mktemp -d)
+trap 'rm -rf "$temp_dir"' EXIT INT TERM
+
+# Copy files to a temp directory for a single rclone transfer
+rsync -a \
+  --exclude '__pycache__' \
+  --exclude '*.swp' \
+  salience pyproject.toml transcript.txt README.md \
+  "$temp_dir/"
+test -f uv.lock && cp uv.lock "$temp_dir/"
+rclone copy "$temp_dir/" "${ssh}:$base/$project/releases/$release/" \
+  --progress --retries 10 --checksum
+
+rm -rf "$temp_dir"
+echo "installing dependencies with uv..."
+ssh $ssh "cd $base/$project/releases/$release && ~/.local/bin/uv sync --link-mode symlink"
+
+# generate nomad job file with release path
+echo "generating nomad job file..."
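+# The job template mixes these deploy-time variables with Nomad's own
+# runtime interpolation, hence the whitelist passed to envsubst below.
+# Illustrative behavior (path shortened): only whitelisted variables are
+# expanded, while ${NOMAD_PORT_http}-style references pass through untouched:
+#
+#   $ echo 'work_dir = "$RELEASE_PATH" bind = "${NOMAD_PORT_http}"' | \
+#       RELEASE_PATH=/home/peoplesgrocers/... envsubst '$RELEASE_PATH'
+#   work_dir = "/home/peoplesgrocers/..." bind = "${NOMAD_PORT_http}"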
+release_path="$base/$project/releases/$release"
+job_file="$base/$project/releases/$release/job.nomad.hcl"
+
+# Use envsubst with whitelist to only replace our variables, not Nomad runtime variables
+export RELEASE_PLACEHOLDER="$release"
+export RELEASE_PATH="$release_path"
+envsubst '$RELEASE_PLACEHOLDER $RELEASE_PATH' < salience-editor-api.nomad.hcl | ssh $ssh "cat > $job_file"
+
+echo ""
+echo "nomad job file created at: $job_file"
+echo ""
+
+# submit job to nomad
+echo "submitting job to nomad..."
+deployment_id=$(ssh $ssh "source ~/.local/bin/env && nomad job run $job_file | grep -oE 'Deployment ID = [a-f0-9-]+' | awk '{print \$4}'")
+
+if [ -n "$deployment_id" ]; then
+  echo "deployment started: $deployment_id"
+  echo ""
+  echo "monitoring deployment..."
+
+  # Monitor deployment status
+  ssh $ssh "source ~/.local/bin/env && nomad deployment status $deployment_id"
+
+  echo ""
+  printf "watch deployment progress? [y/n] "
+  read -r ans
+  if [ "$ans" = "y" ]; then
+    ssh $ssh "source ~/.local/bin/env && watch -n 2 'nomad deployment status $deployment_id'"
+  fi
+else
+  echo "warning: could not extract deployment ID"
+  echo "check deployment status manually with: nomad job status $project"
+fi
+
+echo ""
+echo "done: $release"
+echo ""
+echo "Next steps:"
+echo "- Nomad will automatically promote the deployment after health checks pass"
+echo "- Consul-template will update nginx config based on healthy service instances"
+echo "- Old allocation will gracefully shut down (30s timeout for in-flight requests)"
+echo "- Run ./cleanup-old-releases.sh to remove old releases (keeps 5 most recent)"
+echo ""
+if [ -n "$deployment_id" ]; then
+  echo "Monitor deployment:"
+  echo "  nomad deployment status $deployment_id"
+  echo "  watch -n 2 'nomad deployment status $deployment_id'"
+  echo "  nomad job allocs $project"
+  echo ""
+fi
+echo "Check service health:"
+echo "  curl http://localhost:15500/v1/health/service/$project | jq"

diff --git a/api/salience-editor-api.nomad.hcl b/api/salience-editor-api.nomad.hcl
new file mode 100644
index 0000000..b4de70e
--- /dev/null
+++ b/api/salience-editor-api.nomad.hcl
@@ -0,0 +1,127 @@
+job "salience-editor-api" {
+  datacenters = ["ord10"]
+  type        = "service"
+
+  constraint {
+    attribute = "${node.unique.name}"
+    value     = "chicago-web01"
+  }
+
+  group "app" {
+    count = 1
+
+    network {
+      mode = "host"
+
+      port "http" {
+        # Nomad will assign an available port
+      }
+    }
+
+    update {
+      max_parallel     = 1
+      health_check     = "checks"
+      min_healthy_time = "10s"
+      healthy_deadline = "5m"
+      auto_promote     = true
+      auto_revert      = true
+      canary           = 1
+    }
+
+    task "gunicorn-server" {
+      driver = "raw_exec"
+
+      user = "peoplesgrocers"
+
+      config {
+        work_dir = "$RELEASE_PATH"
+        command  = "/home/peoplesgrocers/.local/bin/uv"
+        # You can add --log-level debug to gunicorn
+        args = ["run", "gunicorn", "--preload", "--workers", "3", "--bind", "127.0.0.1:${NOMAD_PORT_http}", "--timeout", "300", "salience:app"]
+      }
+
+      env {
+        PORT         = "${NOMAD_PORT_http}"
+        ORIGIN       = "https://peoplesgrocers.com"
+        #PATH        = "/home/peoplesgrocers/.local/bin:/usr/local/bin:/usr/bin:/bin"
+        HOME         = "/home/peoplesgrocers"
+        UV_CACHE_DIR = "/home/peoplesgrocers/.cache/uv"
+        HF_HOME      = "/home/peoplesgrocers/cache-huggingface"
+        NLTK_DATA    = "/home/peoplesgrocers/cache-nltk"
+      }
+
+      # Release path set during deployment via envsubst
+      template {
+        data = <<
[...]

diff --git a/api/salience/__init__.py b/api/salience/__init__.py
[...]
+import time
+import threading
+from collections import deque
+
+
+class StatsTracker:
+    def __init__(self):
+        self.lock = threading.Lock()
+        self.processing_spans = deque()   # (start, end, duration) tuples
+        self.overflow_arrivals = deque()  # arrival timestamps
+
+    def add_processing_span(self, start_time, end_time):
+        with self.lock:
+            self.processing_spans.append((start_time, end_time, end_time - start_time))
+            # Clean old entries (>5 min)
+            cutoff = time.time() - 300
+            while self.processing_spans and self.processing_spans[0][0] < cutoff:
+                self.processing_spans.popleft()
+
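+    # Illustrative get_stats() payload built from these deques (epoch-second
+    # timestamps; the numbers are made up):
+    #   {"processing_spans": [{"start": 1764266514.2, "end": 1764266517.9, "duration": 3.7}],
+    #    "overflow_arrivals": [1764266518.0],
+    #    "window_seconds": 300}
+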
+    def add_overflow_arrival(self, arrival_time):
+        with self.lock:
+            self.overflow_arrivals.append(arrival_time)
+            # Clean old entries (>5 min)
+            cutoff = time.time() - 300
+            while self.overflow_arrivals and self.overflow_arrivals[0] < cutoff:
+                self.overflow_arrivals.popleft()
+
+    def get_stats(self):
+        with self.lock:
+            return {
+                'processing_spans': [
+                    {'start': start, 'end': end, 'duration': duration}
+                    for start, end, duration in self.processing_spans
+                ],
+                'overflow_arrivals': list(self.overflow_arrivals),
+                'window_seconds': 300  # 5 minutes
+            }
+
+stats_tracker = StatsTracker()
+
 # Load default text from transcript.txt for GET requests
 with open('./transcript.txt', 'r') as file:
     default_source_text = file.read().strip()
@@ -15,9 +77,40 @@ with open('./transcript.txt', 'r') as file:
 def models_view():
     return json.dumps(list(AVAILABLE_MODELS.keys()))
 
+@app.route("/overflow", methods=['GET', 'POST'])
+def overflow_view():
+    """
+    Endpoint hit when the HAProxy queue is full.
+    Returns 429 with statistics about processing and overflow.
+    """
+    arrival_time = time.time()
+    stats_tracker.add_overflow_arrival(arrival_time)
+
+    stats = stats_tracker.get_stats()
+
+    response = {
+        'error': 'Queue full',
+        'status': 429,
+        'stats': stats,
+        'message': 'Service is at capacity. Try again or check queue statistics.'
+    }
+
+    return json.dumps(response), 429
+
+@app.route("/stats")
+def stats_view():
+    """
+    Endpoint for the frontend to poll current queue statistics.
+    Returns processing spans and overflow arrivals from the last 5 minutes.
+    """
+    stats = stats_tracker.get_stats()
+    return json.dumps(stats)
+
 @app.route("/salience", methods=['GET'])
 def salience_view_default():
     """GET endpoint - processes default text from transcript.txt"""
+    start_time = time.time()
+
     model_name = request.args.get('model', 'all-mpnet-base-v2')
 
     # Validate model name
@@ -26,6 +119,9 @@ def salience_view_default():
 
     sentence_ranges, adjacency = extract(default_source_text, model_name)
 
+    end_time = time.time()
+    stats_tracker.add_processing_span(start_time, end_time)
+
     return json.dumps({
         'source': default_source_text,
         'intervals': sentence_ranges,
@@ -36,6 +132,8 @@ def salience_view_default():
 @app.route("/salience", methods=['POST'])
 def salience_view_custom():
     """POST endpoint - processes text from request body"""
+    start_time = time.time()
+
     model_name = request.args.get('model', 'all-mpnet-base-v2')
 
     # Validate model name
@@ -50,6 +148,9 @@ def salience_view_custom():
 
     sentence_ranges, adjacency = extract(source_text, model_name)
 
+    end_time = time.time()
+    stats_tracker.add_processing_span(start_time, end_time)
+
     return json.dumps({
         'source': source_text,
         'intervals': sentence_ranges,

diff --git a/api/salience/salience.py b/api/salience/salience.py
index 48359f4..93246fc 100644
--- a/api/salience/salience.py
+++ b/api/salience/salience.py
@@ -1,24 +1,34 @@
-import numpy as np
-import torch
-from sentence_transformers import SentenceTransformer
-import nltk.data
-import nltk
 import os
 
-# Set NLTK data path to project directory
+# Set default cache locations BEFORE importing libraries that use them
 PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-NLTK_DATA_DIR = os.path.join(PROJECT_DIR, 'nltk_data')
-TRANSFORMERS_CACHE_DIR = os.path.join(PROJECT_DIR, 'models_cache')
 
-# Add to NLTK's search path
-nltk.data.path.insert(0, NLTK_DATA_DIR)
+if 'NLTK_DATA' not in os.environ:
+    nltk_data_path = os.path.join(PROJECT_DIR, 'cache-nltk')
+    os.makedirs(nltk_data_path, exist_ok=True)
+    os.environ['NLTK_DATA'] = nltk_data_path
 
-# Download to the custom location
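+# The HF_HOME fallback below, like the NLTK_DATA fallback above, only
+# applies in local development; in production the Nomad job's env block
+# sets both HF_HOME and NLTK_DATA explicitly (see salience-editor-api.nomad.hcl).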
+if 'HF_HOME' not in os.environ:
+    os.environ['HF_HOME'] = os.path.join(PROJECT_DIR, 'cache-huggingface')
+
+from salience.timed_import import timed_import
+
+with timed_import("import numpy as np"):
+    import numpy as np
+with timed_import("import torch"):
+    import torch
+with timed_import("from sentence_transformers import SentenceTransformer"):
+    from sentence_transformers import SentenceTransformer
+with timed_import("import nltk"):
+    import nltk.data
+    import nltk
+
+# Download punkt_tab to the configured location
 # Using punkt_tab (the modern tab-separated format introduced in NLTK 3.8+)
 # instead of the older punkt pickle format
 # The punkt_tab model version depends on the NLTK Python package version
 # Check your NLTK version with: uv pip show nltk
-nltk.download('punkt_tab', download_dir=NLTK_DATA_DIR)
+nltk.download('punkt_tab')
 
 # Available models for the demo
 AVAILABLE_MODELS = {
@@ -46,13 +56,13 @@ AVAILABLE_MODELS = {
 
 print("Loading sentence transformer models...")
 models = {}
-models['all-mpnet-base-v2'] = SentenceTransformer('all-mpnet-base-v2', cache_folder=TRANSFORMERS_CACHE_DIR)
+models['all-mpnet-base-v2'] = SentenceTransformer('all-mpnet-base-v2')
 print("Loading Alibaba-NLP/gte-large-en-v1.5")
-models['gte-large-en-v1.5'] = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True, cache_folder=TRANSFORMERS_CACHE_DIR)
+models['gte-large-en-v1.5'] = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)
 #print("Loading Qwen/Qwen3-Embedding-4B")
-#models['qwen3-embedding-4b'] = SentenceTransformer('Qwen/Qwen3-Embedding-4B', trust_remote_code=True, cache_folder=TRANSFORMERS_CACHE_DIR)
+#models['qwen3-embedding-4b'] = SentenceTransformer('Qwen/Qwen3-Embedding-4B', trust_remote_code=True)
 print("Loading mixedbread-ai/mxbai-embed-large-v1")
-models["mxbai-embed-large-v1"] = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1', cache_folder=TRANSFORMERS_CACHE_DIR)
+models["mxbai-embed-large-v1"] = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
 print("All models loaded!")
 
 sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

diff --git a/api/salience/timed_import.py b/api/salience/timed_import.py
new file mode 100644
index 0000000..dc6ad48
--- /dev/null
+++ b/api/salience/timed_import.py
@@ -0,0 +1,20 @@
+import sys
+import time
+
+
+class timed_import:
+    """Context manager for timing imports."""
+
+    def __init__(self, name):
+        self.name = name
+        self.start = None
+
+    def __enter__(self):
+        sys.stdout.write(f"{self.name} ")
+        sys.stdout.flush()
+        self.start = time.time()
+        return self
+
+    def __exit__(self, *args):
+        elapsed = time.time() - self.start
+        print(f"in {elapsed:.1f}s")
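
Note on startup logs: each timed_import block prints its label immediately,
then appends the elapsed time once the import returns, so a cold start of the
--preload gunicorn master emits one line per heavy import. With illustrative
durations:

    import numpy as np in 0.3s
    import torch in 2.4s
    from sentence_transformers import SentenceTransformer in 1.8s
    import nltk in 0.2s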