feat: make version deployable
parent 4aa8759514
commit 49bd94cda2
22 changed files with 7785 additions and 10962 deletions
@@ -1,10 +1,79 @@
# Text Salience API

A Flask API for computing text salience using sentence transformers, with HAProxy-based queue management to handle resource contention.

## Architecture

```
nginx (SSL termination, :443)
  ↓
HAProxy (queue manager, 127.0.0.2:5000)
  ├─► [2 slots available] → Gunicorn workers (127.0.89.34:5000)
  │       Process request normally
  │       Track processing span
  │
  └─► [Queue full, 120+] → /overflow endpoint (127.0.89.34:5000)
          Return 429 with stats
          Track overflow arrival
```
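
In this setup a 429 from `/overflow` means "queue full", not a hard failure, so clients should back off and retry. A minimal client sketch (not part of this repo): it posts plain text the same way the smoke test does, talks to the API directly on `127.0.0.1:5000` (the public path behind nginx/HAProxy may differ), and uses an illustrative exponential backoff.

```python
import time
import requests  # assumed to be available in the client environment

def get_salience(text: str, model: str = "all-mpnet-base-v2", retries: int = 3):
    """POST text to the salience endpoint, backing off while the queue is full."""
    url = "http://127.0.0.1:5000/salience"  # direct to the API, as in smoke-test.sh
    for attempt in range(retries):
        resp = requests.post(
            url,
            params={"model": model},
            data=text,
            headers={"Content-Type": "text/plain"},
            timeout=600,
        )
        if resp.status_code == 429:
            # Queue full: the body carries recent processing/arrival stats.
            time.sleep(5 * 2 ** attempt)  # illustrative backoff, not from the repo
            continue
        resp.raise_for_status()
        return resp.json()
    raise RuntimeError("salience API queue stayed full after retries")
```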

## Queue Management

- **Processing slots**: 2 concurrent requests
- **Queue depth**: 120 requests
- **Queue timeout**: 10 minutes
- **Processing time**: ~5 seconds per request

When the queue is full, requests are routed to `/overflow`, which returns a 429 status with statistics about:

- Recent processing spans (last 5 minutes)
- Overflow arrival times (last 5 minutes)

The frontend can use these statistics to (see the sketch after this list):

- Calculate queue probability using Poisson distribution
- Display estimated wait times
- Show arrival rate trends
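
A rough sketch of that estimate, assuming the client has already pulled a count of recent arrivals out of the 429 body (the field names and the numbers below are illustrative, not the actual response schema):

```python
import math

SLOTS = 2            # concurrent processing slots
SERVICE_TIME = 5.0   # ~5 seconds per request
WINDOW = 300.0       # stats cover the last 5 minutes

def queue_probability(arrivals_last_window: int) -> float:
    """P(more than SLOTS requests arrive within one service window),
    treating arrivals as Poisson with a rate estimated from recent stats."""
    lam = arrivals_last_window / WINDOW       # arrivals per second
    mean = lam * SERVICE_TIME                 # expected arrivals per ~5 s window
    p_at_most_slots = sum(
        math.exp(-mean) * mean**k / math.factorial(k) for k in range(SLOTS + 1)
    )
    return 1.0 - p_at_most_slots

def estimated_wait(queue_depth: int) -> float:
    """Seconds until a newly queued request reaches a processing slot."""
    return queue_depth * SERVICE_TIME / SLOTS

print(queue_probability(90))   # e.g. 90 arrivals in the last 5 minutes
print(estimated_wait(12))      # 12 requests already queued -> 30.0 seconds
```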

## Run API

### Development (without queue)

```bash
uv run flask --app salience run
```

### Production (with HAProxy queue)

1. **Start Gunicorn** with preloaded models (loads models once, forks 3 workers):

   ```bash
   uv run gunicorn \
     --preload \
     --workers 3 \
     --bind 127.0.89.34:5000 \
     --timeout 300 \
     --access-logfile - \
     salience:app
   ```

   (3 workers: 2 for model processing + 1 for overflow/stats responses)

2. **Start HAProxy** (assumes you're including `haproxy.cfg` in your main HAProxy config):

   ```bash
   # If running standalone HAProxy for this service:
   # Uncomment the global/defaults sections in haproxy.cfg first
   haproxy -f haproxy.cfg

   # If using a global HAProxy instance:
   # Include the frontend/backend sections from haproxy.cfg in your main config
   ```

3. **Configure nginx** to proxy to HAProxy:

   ```nginx
   location /api/salience {
       proxy_pass http://127.0.0.2:5000;
       proxy_http_version 1.1;
       proxy_set_header Host $host;
       proxy_read_timeout 900s;
   }
   ```

## Benchmarks

```bash
# Generate embeddings
```
@@ -24,8 +24,8 @@ set -o pipefail
 # 8. consul-template updates nginx config (via service tags)
 # 9. cleanup old releases (keep 5 most recent)

-ssh=deploy-peoplesgrocers-website
-base=/home/peoplesgrocers
+ssh=pond-nomad
+base=/Users/nomad
 project=salience-editor-api

 #git diff-index --quiet HEAD || { echo 'git repo dirty'; exit 1; }
@@ -11,7 +11,7 @@ requires-python = ">=3.11"
 dependencies = [
     "flask>=2.3.2,<3.0.0",
     "flask-cors>=4.0.0,<5.0.0",
-    "gunicorn>=22.0.0,<23.0.0",
+    "waitress>=3.0.0",
     "transformers>=4.30.2,<5.0.0",
     "nltk>=3.8.1,<4.0.0",
     "sentence-transformers>=2.2.2,<3.0.0",
@@ -4,7 +4,7 @@ job "salience-editor-api" {

   constraint {
     attribute = "${node.unique.name}"
-    value     = "chicago-web01"
+    value     = "mac-mini"
   }

   group "app" {

@@ -28,26 +28,27 @@ job "salience-editor-api" {
       canary = 1
     }

-    task "gunicorn-server" {
+    task "waitress-server" {
       driver = "raw_exec"

-      user = "peoplesgrocers"
+      #user = "nomad"

       config {
         work_dir = "$RELEASE_PATH"
-        command = "/home/peoplesgrocers/.local/bin/uv"
-        # You can add --log-level debug to gunicorn
-        args = ["run", "gunicorn", "--preload", "--workers", "3", "--bind", "127.0.0.1:${NOMAD_PORT_http}", "--timeout", "300", "salience:app"]
+        command = "/Users/nomad/.local/bin/uv"
+        # Waitress is single-process (no fork), avoiding Metal/MPS issues on macOS
+        args = ["run", "waitress-serve", "--listen=10.77.0.2:${NOMAD_PORT_http}", "--listen=127.0.0.1:${NOMAD_PORT_http}", "salience:app"]
       }

       env {
         PORT = "${NOMAD_PORT_http}"
         ORIGIN = "https://peoplesgrocers.com"
-        #PATH = "/home/peoplesgrocers/.local/bin:/usr/local/bin:/usr/bin:/bin"
-        HOME = "/home/peoplesgrocers"
-        UV_CACHE_DIR = "/home/peoplesgrocers/.cache/uv"
-        HF_HOME = "/home/peoplesgrocers/cache-huggingface"
-        NLTK_DATA = "/home/peoplesgrocers/cache-nltk"
+        #PATH = "/Users/nomad/.local/bin:/usr/local/bin:/usr/bin:/bin"
+        HOME = "/Users/nomad"
+        UV_CACHE_DIR = "/Users/nomad/.cache/uv"
+        HF_HOME = "/Users/nomad/cache-huggingface"
+        NLTK_DATA = "/Users/nomad/cache-nltk"
         #TORCH_DEVICE = "cpu" # Force CPU since chicago-web01 has no GPU
       }

       # Release path set during deployment via envsubst

@@ -65,7 +66,7 @@ EOH

       tags = [
         "flask",
-        "gunicorn",
+        "waitress",
         "api",
         "ml"
       ]

@@ -91,7 +92,7 @@ EOH
       template {
         data = <<EOH
 #!/bin/sh
-host=http://127.0.0.1:{{ env "NOMAD_PORT_http" }}
+host=http://10.77.0.2:{{ env "NOMAD_PORT_http" }}

 echo "=== /models ==="
 curl -s "$host/models"
@@ -1,27 +1,14 @@
-# Memory Sharing for ML Models
-# ============================
-# This app is designed to run with Gunicorn's --preload flag, which loads the
-# SentenceTransformer models once in the master process before forking workers.
-# On Linux, fork uses copy-on-write (COW) semantics, so workers share the
-# read-only model weights in memory rather than each loading their own copy.
-# This is critical for keeping memory usage reasonable with large transformer models.
-#
-# ResourceTracker errors on shutdown (Python 3.14):
-# When you Ctrl+C the Gunicorn process, you may see
-# "ChildProcessError: [Errno 10] No child processes"
-# from multiprocessing.resource_tracker.
-#
-# I think this is harmless. I think what happens is each forked worker gets a
-# copy of the ResourceTracker object, then each copy tries to deallocate the
-# same resources. The process still shuts down reasonbly quickly, so I'm not
-# concerned.
+# Salience API
+# ============
+# Uses a worker thread for model inference to avoid fork() issues with Metal/MPS.
+# The worker thread owns all model instances; HTTP handlers submit work via queue.

 print("Starting salience __init__.py...")

 from flask import Flask, request
 from flask_cors import CORS
 import numpy as np
-from .salience import extract, AVAILABLE_MODELS
+from .salience import submit_work, AVAILABLE_MODELS
 import json
 import time
 from collections import deque

@@ -117,7 +104,7 @@ def salience_view_default():
     if model_name not in AVAILABLE_MODELS:
         return json.dumps({'error': f'Invalid model: {model_name}'}), 400

-    sentence_ranges, adjacency = extract(default_source_text, model_name)
+    sentence_ranges, adjacency = submit_work(default_source_text, model_name)

     end_time = time.time()
     stats_tracker.add_processing_span(start_time, end_time)

@@ -146,7 +133,7 @@ def salience_view_custom():
     if not source_text:
         return json.dumps({'error': 'No text provided'}), 400

-    sentence_ranges, adjacency = extract(source_text, model_name)
+    sentence_ranges, adjacency = submit_work(source_text, model_name)

     end_time = time.time()
     stats_tracker.add_processing_span(start_time, end_time)
@@ -1,4 +1,8 @@
 import os
+import threading
+import queue
+from dataclasses import dataclass, field
+from typing import Optional

 # Set default cache locations BEFORE importing libraries that use them
 PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

@@ -11,6 +15,9 @@ if 'NLTK_DATA' not in os.environ:
 if 'HF_HOME' not in os.environ:
     os.environ['HF_HOME'] = os.path.join(PROJECT_DIR, 'cache-huggingface')

+# Device configuration: set TORCH_DEVICE=cpu to force CPU, otherwise auto-detect
+DEVICE = os.environ.get('TORCH_DEVICE', None)  # None = auto-detect (cuda/mps/cpu)
+
 from salience.timed_import import timed_import

 with timed_import("import numpy as np"):

@@ -31,8 +38,9 @@ with timed_import("import nltk"):
     nltk.download('punkt_tab')

 # Available models for the demo
+# Keys are short names for the API, values are full HuggingFace repo IDs
 AVAILABLE_MODELS = {
-    'all-mpnet-base-v2': 'all-mpnet-base-v2', # Dec 2020
+    'all-mpnet-base-v2': 'sentence-transformers/all-mpnet-base-v2', # Dec 2020
     'gte-large-en-v1.5': 'Alibaba-NLP/gte-large-en-v1.5', # Jan 2024
     # 'qwen3-embedding-4b': 'Qwen/Qwen3-Embedding-4B', # April 2025
     'mxbai-embed-large-v1': 'mixedbread-ai/mxbai-embed-large-v1',

@@ -52,18 +60,17 @@ AVAILABLE_MODELS = {
 # Qwen/Qwen3-Embedding-4B: 80.86
 # mixedbread-ai/mxbai-embed-large-v1: 85.00

-# Load all models into memory
-print("Loading sentence transformer models...")
-models = {}
+# Models loaded on first use in worker thread
+_loaded_models = {}

-models['all-mpnet-base-v2'] = SentenceTransformer('all-mpnet-base-v2')
-print("Loading Alibaba-NLP/gte-large-en-v1.5")
-models['gte-large-en-v1.5'] = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)
-#print("Loading Qwen/Qwen3-Embedding-4B")
-#models['qwen3-embedding-4b'] = SentenceTransformer('Qwen/Qwen3-Embedding-4B', trust_remote_code=True)
-print("Loading mixedbread-ai/mxbai-embed-large-v1")
-models["mxbai-embed-large-v1"] = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
-print("All models loaded!")
+def _get_model(model_name):
+    """Load and cache a model. Called only from worker thread."""
+    if model_name not in _loaded_models:
+        repo_id = AVAILABLE_MODELS[model_name]
+        print(f"Loading model {repo_id} into memory...")
+        trust_remote = model_name in ('gte-large-en-v1.5', 'qwen3-embedding-4b')
+        _loaded_models[model_name] = SentenceTransformer(repo_id, trust_remote_code=trust_remote, device=DEVICE)
+    return _loaded_models[model_name]

 sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
@@ -89,7 +96,7 @@ def get_sentences(source_text):
     return sentences, sentence_ranges

 def text_rank(sentences, model_name='all-mpnet-base-v2'):
-    model = models[model_name]
+    model = _get_model(model_name)
     vectors = model.encode(sentences)
     adjacency = torch.tensor(cos_sim(vectors)).fill_diagonal_(0.)
     adjacency[adjacency < 0] = 0

@@ -111,6 +118,65 @@ def extract(source_text, model_name='all-mpnet-base-v2'):
     return sentence_ranges, adjacency


+# =============================================================================
+# Worker Thread for Model Inference
+# =============================================================================
+# All model inference runs in a dedicated worker thread. This:
+# 1. Avoids fork() issues with Metal/MPS (no forking server needed)
+# 2. Serializes inference requests (one at a time)
+# 3. Keeps /stats and other endpoints responsive
+
+@dataclass
+class WorkItem:
+    source_text: str
+    model_name: str
+    event: threading.Event = field(default_factory=threading.Event)
+    result: Optional[tuple] = None
+    error: Optional[str] = None
+
+_work_queue: queue.Queue[WorkItem] = queue.Queue()
+
+def _model_worker():
+    """Worker thread loop - processes inference requests from queue."""
+    while True:
+        item = _work_queue.get()
+        try:
+            item.result = extract(item.source_text, item.model_name)
+        except Exception as e:
+            item.error = str(e)
+        finally:
+            item.event.set()
+
+# Start worker thread
+threading.Thread(target=_model_worker, daemon=True, name="model-worker").start()
+
+def submit_work(source_text: str, model_name: str, timeout: float = 60.0) -> tuple:
+    """Submit text for salience extraction and wait for result.
+
+    Args:
+        source_text: Text to analyze
+        model_name: Name of the model to use (must be in AVAILABLE_MODELS)
+        timeout: Max seconds to wait for result
+
+    Returns:
+        (sentence_ranges, adjacency) tuple
+
+    Raises:
+        TimeoutError: If inference takes longer than timeout
+        RuntimeError: If inference fails
+    """
+    item = WorkItem(source_text=source_text, model_name=model_name)
+    _work_queue.put(item)
+
+    if not item.event.wait(timeout=timeout):
+        raise TimeoutError("Model inference timed out")
+
+    if item.error:
+        raise RuntimeError(item.error)
+
+    return item.result
+
+
 # =============================================================================
 # Unused/Debugging Code
 # =============================================================================
api/smoke-test.sh (new executable file, 11 lines)

@@ -0,0 +1,11 @@
+#!/bin/sh
+host=http://127.0.0.1:5000
+
+echo "=== /models ==="
+curl -s "$host/models"
+echo
+
+echo "=== /salience ==="
+curl -s -H "Content-Type: text/plain" --data-binary "The cat sat on the mat. The dog chased the cat." "$host/salience?model=all-mpnet-base-v2"
+echo
api/uv.lock (generated, 25 lines changed)

@@ -303,18 +303,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/eb/02/a6b21098b1d5d6249b7c5ab69dde30108a71e4e819d4a9778f1de1d5b70d/fsspec-2025.10.0-py3-none-any.whl", hash = "sha256:7c7712353ae7d875407f97715f0e1ffcc21e33d5b24556cb1e090ae9409ec61d", size = 200966 },
 ]

-[[package]]
-name = "gunicorn"
-version = "22.0.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "packaging" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/1e/88/e2f93c5738a4c1f56a458fc7a5b1676fc31dcdbb182bef6b40a141c17d66/gunicorn-22.0.0.tar.gz", hash = "sha256:4a0b436239ff76fb33f11c07a16482c521a7e09c1ce3cc293c2330afe01bec63", size = 3639760 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/29/97/6d610ae77b5633d24b69c2ff1ac3044e0e565ecbd1ec188f02c45073054c/gunicorn-22.0.0-py3-none-any.whl", hash = "sha256:350679f91b24062c86e386e198a15438d53a7a8207235a78ba1b53df4c4378d9", size = 84443 },
-]
-
 [[package]]
 name = "hf-xet"
 version = "1.2.0"

@@ -1257,7 +1245,6 @@ source = { editable = "." }
 dependencies = [
     { name = "flask" },
     { name = "flask-cors" },
-    { name = "gunicorn" },
     { name = "matplotlib" },
     { name = "nltk" },
     { name = "numpy" },

@@ -1267,13 +1254,13 @@ dependencies = [
     { name = "seaborn" },
     { name = "sentence-transformers" },
     { name = "transformers" },
+    { name = "waitress" },
 ]

 [package.metadata]
 requires-dist = [
     { name = "flask", specifier = ">=2.3.2,<3.0.0" },
     { name = "flask-cors", specifier = ">=4.0.0,<5.0.0" },
-    { name = "gunicorn", specifier = ">=22.0.0,<23.0.0" },
     { name = "matplotlib", specifier = ">=3.10.7" },
     { name = "nltk", specifier = ">=3.8.1,<4.0.0" },
     { name = "numpy", specifier = ">=1.25.0,<2.0.0" },

@@ -1283,6 +1270,7 @@ requires-dist = [
     { name = "seaborn", specifier = ">=0.13.2" },
     { name = "sentence-transformers", specifier = ">=2.2.2,<3.0.0" },
     { name = "transformers", specifier = ">=4.30.2,<5.0.0" },
+    { name = "waitress", specifier = ">=3.0.0" },
 ]

 [[package]]

@@ -1621,6 +1609,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795 },
 ]

+[[package]]
+name = "waitress"
+version = "3.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/bf/cb/04ddb054f45faa306a230769e868c28b8065ea196891f09004ebace5b184/waitress-3.0.2.tar.gz", hash = "sha256:682aaaf2af0c44ada4abfb70ded36393f0e307f4ab9456a215ce0020baefc31f", size = 179901 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8d/57/a27182528c90ef38d82b636a11f606b0cbb0e17588ed205435f8affe3368/waitress-3.0.2-py3-none-any.whl", hash = "sha256:c56d67fd6e87c2ee598b76abdd4e96cfad1f24cacdea5078d382b1f9d7b5ed2e", size = 56232 },
+]
+
 [[package]]
 name = "werkzeug"
 version = "3.1.3"