feat: make version deployable

parent 4aa8759514 → commit 49bd94cda2
22 changed files with 7785 additions and 10962 deletions
@ -1,10 +1,79 @@
# Text Salience API

A Flask API for computing text salience using sentence transformers, with HAProxy-based queue management to handle resource contention.

## Architecture

```
nginx (SSL termination, :443)
          ↓
HAProxy (queue manager, 127.0.0.2:5000)
  ├─► [2 slots available] → Gunicorn workers (127.0.89.34:5000)
  │       Process request normally
  │       Track processing span
  │
  └─► [Queue full, 120+] → /overflow endpoint (127.0.89.34:5000)
          Return 429 with stats
          Track overflow arrival
```

## Queue Management

- **Processing slots**: 2 concurrent requests
- **Queue depth**: 120 requests
- **Queue timeout**: 10 minutes
- **Processing time**: ~5 seconds per request

When the queue is full, requests are routed to `/overflow`, which returns a 429 status with statistics about:
- Recent processing spans (last 5 minutes)
- Overflow arrival times (last 5 minutes)

The frontend can use these statistics to:
- Calculate queue probability using the Poisson distribution (see the sketch below this list)
- Display estimated wait times
- Show arrival rate trends
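A minimal sketch (not from this repo) of the Poisson estimate the frontend could compute from the 429 payload. The field names and the exact formula are assumptions; it treats overflow arrivals in the last 5 minutes as a Poisson process and asks how likely it is that as many requests as there are slots arrive within one ~5 s service span:

```python
import math

def poisson_cdf(k: int, lam: float) -> float:
    """P(X <= k) for X ~ Poisson(lam)."""
    return sum(math.exp(-lam) * lam**i / math.factorial(i) for i in range(k + 1))

def queue_probability(arrival_timestamps: list[float], window_s: float = 300.0,
                      slots: int = 2, service_s: float = 5.0) -> float:
    """Chance a new request has to wait: P(arrivals in one service span >= slots)."""
    rate_per_s = len(arrival_timestamps) / window_s   # lambda, arrivals per second
    lam = rate_per_s * service_s                      # expected arrivals per ~5 s span
    return 1.0 - poisson_cdf(slots - 1, lam)
```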
## Run API

### Development (without queue)
```bash
uv run flask --app salience run
```

### Production (with HAProxy queue)

1. **Start Gunicorn** with preloaded models (loads models once, forks 3 workers):
   ```bash
   uv run gunicorn \
     --preload \
     --workers 3 \
     --bind 127.0.89.34:5000 \
     --timeout 300 \
     --access-logfile - \
     salience:app
   ```
   (3 workers: 2 for model processing + 1 for overflow/stats responses)

2. **Start HAProxy** (assumes you're including `haproxy.cfg` in your main HAProxy config):
   ```bash
   # If running standalone HAProxy for this service:
   # Uncomment the global/defaults sections in haproxy.cfg first
   haproxy -f haproxy.cfg

   # If using a global HAProxy instance:
   # Include the frontend/backend sections from haproxy.cfg in your main config
   ```

3. **Configure nginx** to proxy to HAProxy:
   ```nginx
   location /api/salience {
       proxy_pass http://127.0.0.2:5000;
       proxy_http_version 1.1;
       proxy_set_header Host $host;
       proxy_read_timeout 900s;
   }
   ```

## Benchmarks
```bash
# Generate embeddings
@ -24,8 +24,8 @@ set -o pipefail
# 8. consul-template updates nginx config (via service tags)
# 9. cleanup old releases (keep 5 most recent)

-ssh=deploy-peoplesgrocers-website
-base=/home/peoplesgrocers
+ssh=pond-nomad
+base=/Users/nomad
project=salience-editor-api

#git diff-index --quiet HEAD || { echo 'git repo dirty'; exit 1; }
@ -11,7 +11,7 @@ requires-python = ">=3.11"
dependencies = [
    "flask>=2.3.2,<3.0.0",
    "flask-cors>=4.0.0,<5.0.0",
-   "gunicorn>=22.0.0,<23.0.0",
+   "waitress>=3.0.0",
    "transformers>=4.30.2,<5.0.0",
    "nltk>=3.8.1,<4.0.0",
    "sentence-transformers>=2.2.2,<3.0.0",
@ -4,7 +4,7 @@ job "salience-editor-api" {

  constraint {
    attribute = "${node.unique.name}"
-   value     = "chicago-web01"
+   value     = "mac-mini"
  }

  group "app" {
@ -28,26 +28,27 @@ job "salience-editor-api" {
      canary = 1
    }

-   task "gunicorn-server" {
+   task "waitress-server" {
      driver = "raw_exec"

-     user = "peoplesgrocers"
+     #user = "nomad"

      config {
        work_dir = "$RELEASE_PATH"
-       command = "/home/peoplesgrocers/.local/bin/uv"
-       # You can add --log-level debug to gunicorn
-       args = ["run", "gunicorn", "--preload", "--workers", "3", "--bind", "127.0.0.1:${NOMAD_PORT_http}", "--timeout", "300", "salience:app"]
+       command = "/Users/nomad/.local/bin/uv"
+       # Waitress is single-process (no fork), avoiding Metal/MPS issues on macOS
+       args = ["run", "waitress-serve", "--listen=10.77.0.2:${NOMAD_PORT_http}", "--listen=127.0.0.1:${NOMAD_PORT_http}", "salience:app"]
      }

      env {
        PORT   = "${NOMAD_PORT_http}"
        ORIGIN = "https://peoplesgrocers.com"
-       #PATH = "/home/peoplesgrocers/.local/bin:/usr/local/bin:/usr/bin:/bin"
-       HOME = "/home/peoplesgrocers"
-       UV_CACHE_DIR = "/home/peoplesgrocers/.cache/uv"
-       HF_HOME = "/home/peoplesgrocers/cache-huggingface"
-       NLTK_DATA = "/home/peoplesgrocers/cache-nltk"
+       #PATH = "/Users/nomad/.local/bin:/usr/local/bin:/usr/bin:/bin"
+       HOME = "/Users/nomad"
+       UV_CACHE_DIR = "/Users/nomad/.cache/uv"
+       HF_HOME = "/Users/nomad/cache-huggingface"
+       NLTK_DATA = "/Users/nomad/cache-nltk"
        #TORCH_DEVICE = "cpu" # Force CPU since chicago-web01 has no GPU
      }

      # Release path set during deployment via envsubst
@ -65,7 +66,7 @@ EOH

      tags = [
        "flask",
-       "gunicorn",
+       "waitress",
        "api",
        "ml"
      ]
@ -91,7 +92,7 @@ EOH
      template {
        data = <<EOH
#!/bin/sh
-host=http://127.0.0.1:{{ env "NOMAD_PORT_http" }}
+host=http://10.77.0.2:{{ env "NOMAD_PORT_http" }}

echo "=== /models ==="
curl -s "$host/models"
@ -1,27 +1,14 @@
-# Memory Sharing for ML Models
-# ============================
-# This app is designed to run with Gunicorn's --preload flag, which loads the
-# SentenceTransformer models once in the master process before forking workers.
-# On Linux, fork uses copy-on-write (COW) semantics, so workers share the
-# read-only model weights in memory rather than each loading their own copy.
-# This is critical for keeping memory usage reasonable with large transformer models.
-#
-# ResourceTracker errors on shutdown (Python 3.14):
-# When you Ctrl+C the Gunicorn process, you may see
-# "ChildProcessError: [Errno 10] No child processes"
-# from multiprocessing.resource_tracker.
-#
-# I think this is harmless. I think what happens is each forked worker gets a
-# copy of the ResourceTracker object, then each copy tries to deallocate the
-# same resources. The process still shuts down reasonably quickly, so I'm not
-# concerned.
+# Salience API
+# ============
+# Uses a worker thread for model inference to avoid fork() issues with Metal/MPS.
+# The worker thread owns all model instances; HTTP handlers submit work via a queue.

print("Starting salience __init__.py...")

from flask import Flask, request
from flask_cors import CORS
import numpy as np
-from .salience import extract, AVAILABLE_MODELS
+from .salience import submit_work, AVAILABLE_MODELS
import json
import time
from collections import deque
@ -117,7 +104,7 @@ def salience_view_default():
    if model_name not in AVAILABLE_MODELS:
        return json.dumps({'error': f'Invalid model: {model_name}'}), 400

-   sentence_ranges, adjacency = extract(default_source_text, model_name)
+   sentence_ranges, adjacency = submit_work(default_source_text, model_name)

    end_time = time.time()
    stats_tracker.add_processing_span(start_time, end_time)
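The `stats_tracker` class itself isn't shown in this diff; here is a hypothetical sketch of what it plausibly looks like, given the `deque` import above and the 5-minute windows described in the README (names and window size are assumptions):

```python
import time
from collections import deque

class StatsTracker:
    """Keeps a sliding window of recent activity for the /overflow response."""
    def __init__(self, window_s: float = 300.0):
        self.window_s = window_s
        self.processing_spans = deque()   # (start, end) timestamp pairs
        self.overflow_arrivals = deque()  # bare timestamps

    def _prune(self, now: float) -> None:
        cutoff = now - self.window_s
        while self.processing_spans and self.processing_spans[0][1] < cutoff:
            self.processing_spans.popleft()
        while self.overflow_arrivals and self.overflow_arrivals[0] < cutoff:
            self.overflow_arrivals.popleft()

    def add_processing_span(self, start: float, end: float) -> None:
        self._prune(time.time())
        self.processing_spans.append((start, end))

    def add_overflow_arrival(self) -> None:
        now = time.time()
        self._prune(now)
        self.overflow_arrivals.append(now)
```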
@ -146,7 +133,7 @@ def salience_view_custom():
    if not source_text:
        return json.dumps({'error': 'No text provided'}), 400

-   sentence_ranges, adjacency = extract(source_text, model_name)
+   sentence_ranges, adjacency = submit_work(source_text, model_name)

    end_time = time.time()
    stats_tracker.add_processing_span(start_time, end_time)
@ -1,4 +1,8 @@
import os
+import threading
+import queue
+from dataclasses import dataclass, field
+from typing import Optional

# Set default cache locations BEFORE importing libraries that use them
PROJECT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@ -11,6 +15,9 @@ if 'NLTK_DATA' not in os.environ:
if 'HF_HOME' not in os.environ:
    os.environ['HF_HOME'] = os.path.join(PROJECT_DIR, 'cache-huggingface')

+# Device configuration: set TORCH_DEVICE=cpu to force CPU, otherwise auto-detect
+DEVICE = os.environ.get('TORCH_DEVICE', None)  # None = auto-detect (cuda/mps/cpu)
+
from salience.timed_import import timed_import

with timed_import("import numpy as np"):
@ -31,8 +38,9 @@ with timed_import("import nltk"):
    nltk.download('punkt_tab')

# Available models for the demo
+# Keys are short names for the API, values are full HuggingFace repo IDs
AVAILABLE_MODELS = {
-   'all-mpnet-base-v2': 'all-mpnet-base-v2',  # Dec 2020
+   'all-mpnet-base-v2': 'sentence-transformers/all-mpnet-base-v2',  # Dec 2020
    'gte-large-en-v1.5': 'Alibaba-NLP/gte-large-en-v1.5',  # Jan 2024
    # 'qwen3-embedding-4b': 'Qwen/Qwen3-Embedding-4B',  # April 2025
    'mxbai-embed-large-v1': 'mixedbread-ai/mxbai-embed-large-v1',
@ -52,18 +60,17 @@ AVAILABLE_MODELS = {
# Qwen/Qwen3-Embedding-4B: 80.86
# mixedbread-ai/mxbai-embed-large-v1: 85.00

-# Load all models into memory
-print("Loading sentence transformer models...")
-models = {}
-
-models['all-mpnet-base-v2'] = SentenceTransformer('all-mpnet-base-v2')
-print("Loading Alibaba-NLP/gte-large-en-v1.5")
-models['gte-large-en-v1.5'] = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)
-#print("Loading Qwen/Qwen3-Embedding-4B")
-#models['qwen3-embedding-4b'] = SentenceTransformer('Qwen/Qwen3-Embedding-4B', trust_remote_code=True)
-print("Loading mixedbread-ai/mxbai-embed-large-v1")
-models["mxbai-embed-large-v1"] = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')
-print("All models loaded!")
+# Models loaded on first use in worker thread
+_loaded_models = {}
+
+def _get_model(model_name):
+    """Load and cache a model. Called only from worker thread."""
+    if model_name not in _loaded_models:
+        repo_id = AVAILABLE_MODELS[model_name]
+        print(f"Loading model {repo_id} into memory...")
+        trust_remote = model_name in ('gte-large-en-v1.5', 'qwen3-embedding-4b')
+        _loaded_models[model_name] = SentenceTransformer(repo_id, trust_remote_code=trust_remote, device=DEVICE)
+    return _loaded_models[model_name]

sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
@ -89,7 +96,7 @@ def get_sentences(source_text):
    return sentences, sentence_ranges

def text_rank(sentences, model_name='all-mpnet-base-v2'):
-   model = models[model_name]
+   model = _get_model(model_name)
    vectors = model.encode(sentences)
    adjacency = torch.tensor(cos_sim(vectors)).fill_diagonal_(0.)
    adjacency[adjacency < 0] = 0
@ -111,6 +118,65 @@ def extract(source_text, model_name='all-mpnet-base-v2'):
    return sentence_ranges, adjacency


# =============================================================================
# Worker Thread for Model Inference
# =============================================================================
# All model inference runs in a dedicated worker thread. This:
# 1. Avoids fork() issues with Metal/MPS (no forking server needed)
# 2. Serializes inference requests (one at a time)
# 3. Keeps /stats and other endpoints responsive

@dataclass
class WorkItem:
    source_text: str
    model_name: str
    event: threading.Event = field(default_factory=threading.Event)
    result: Optional[tuple] = None
    error: Optional[str] = None

_work_queue: queue.Queue[WorkItem] = queue.Queue()

def _model_worker():
    """Worker thread loop - processes inference requests from queue."""
    while True:
        item = _work_queue.get()
        try:
            item.result = extract(item.source_text, item.model_name)
        except Exception as e:
            item.error = str(e)
        finally:
            item.event.set()

# Start worker thread
threading.Thread(target=_model_worker, daemon=True, name="model-worker").start()

def submit_work(source_text: str, model_name: str, timeout: float = 60.0) -> tuple:
    """Submit text for salience extraction and wait for result.

    Args:
        source_text: Text to analyze
        model_name: Name of the model to use (must be in AVAILABLE_MODELS)
        timeout: Max seconds to wait for result

    Returns:
        (sentence_ranges, adjacency) tuple

    Raises:
        TimeoutError: If inference takes longer than timeout
        RuntimeError: If inference fails
    """
    item = WorkItem(source_text=source_text, model_name=model_name)
    _work_queue.put(item)

    if not item.event.wait(timeout=timeout):
        raise TimeoutError("Model inference timed out")

    if item.error:
        raise RuntimeError(item.error)

    return item.result


# =============================================================================
# Unused/Debugging Code
# =============================================================================

api/smoke-test.sh (new executable file, 11 lines)
@ -0,0 +1,11 @@
#!/bin/sh
host=http://127.0.0.1:5000

echo "=== /models ==="
curl -s "$host/models"
echo

echo "=== /salience ==="
curl -s -H "Content-Type: text/plain" --data-binary "The cat sat on the mat. The dog chased the cat." "$host/salience?model=all-mpnet-base-v2"
echo

api/uv.lock (generated, 25 lines changed)
@ -303,18 +303,6 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/eb/02/a6b21098b1d5d6249b7c5ab69dde30108a71e4e819d4a9778f1de1d5b70d/fsspec-2025.10.0-py3-none-any.whl", hash = "sha256:7c7712353ae7d875407f97715f0e1ffcc21e33d5b24556cb1e090ae9409ec61d", size = 200966 },
]

-[[package]]
-name = "gunicorn"
-version = "22.0.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "packaging" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/1e/88/e2f93c5738a4c1f56a458fc7a5b1676fc31dcdbb182bef6b40a141c17d66/gunicorn-22.0.0.tar.gz", hash = "sha256:4a0b436239ff76fb33f11c07a16482c521a7e09c1ce3cc293c2330afe01bec63", size = 3639760 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/29/97/6d610ae77b5633d24b69c2ff1ac3044e0e565ecbd1ec188f02c45073054c/gunicorn-22.0.0-py3-none-any.whl", hash = "sha256:350679f91b24062c86e386e198a15438d53a7a8207235a78ba1b53df4c4378d9", size = 84443 },
-]
-
[[package]]
name = "hf-xet"
version = "1.2.0"
@ -1257,7 +1245,6 @@ source = { editable = "." }
dependencies = [
    { name = "flask" },
    { name = "flask-cors" },
-   { name = "gunicorn" },
    { name = "matplotlib" },
    { name = "nltk" },
    { name = "numpy" },
@ -1267,13 +1254,13 @@ dependencies = [
    { name = "seaborn" },
    { name = "sentence-transformers" },
    { name = "transformers" },
+   { name = "waitress" },
]

[package.metadata]
requires-dist = [
    { name = "flask", specifier = ">=2.3.2,<3.0.0" },
    { name = "flask-cors", specifier = ">=4.0.0,<5.0.0" },
-   { name = "gunicorn", specifier = ">=22.0.0,<23.0.0" },
    { name = "matplotlib", specifier = ">=3.10.7" },
    { name = "nltk", specifier = ">=3.8.1,<4.0.0" },
    { name = "numpy", specifier = ">=1.25.0,<2.0.0" },
@ -1283,6 +1270,7 @@ requires-dist = [
    { name = "seaborn", specifier = ">=0.13.2" },
    { name = "sentence-transformers", specifier = ">=2.2.2,<3.0.0" },
    { name = "transformers", specifier = ">=4.30.2,<5.0.0" },
+   { name = "waitress", specifier = ">=3.0.0" },
]

[[package]]
@ -1621,6 +1609,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795 },
]

+[[package]]
+name = "waitress"
+version = "3.0.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/bf/cb/04ddb054f45faa306a230769e868c28b8065ea196891f09004ebace5b184/waitress-3.0.2.tar.gz", hash = "sha256:682aaaf2af0c44ada4abfb70ded36393f0e307f4ab9456a215ce0020baefc31f", size = 179901 }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8d/57/a27182528c90ef38d82b636a11f606b0cbb0e17588ed205435f8affe3368/waitress-3.0.2-py3-none-any.whl", hash = "sha256:c56d67fd6e87c2ee598b76abdd4e96cfad1f24cacdea5078d382b1f9d7b5ed2e", size = 56232 },
+]
+
[[package]]
name = "werkzeug"
version = "3.1.3"
@ -1,33 +0,0 @@
#!/bin/sh
set -eu

# rsync local models_cache to server shared cache
# reports server models not in local (manual cleanup required)

ssh=deploy-peoplesgrocers-website
base=/home/peoplesgrocers
project=salience

ssh $ssh "mkdir -p $base/$project/shared/models_cache"

test -d api/models_cache || { echo 'no local api/models_cache'; exit 1; }

echo "local models_cache size:"
du -sh api/models_cache

echo "syncing to $ssh:$base/$project/shared/models_cache/"
# do not use compression because these model files are basically random data
# the SSH connection is pretty spotty when transferring large files
rsync -va --info=progress2 --partial --append-verify api/models_cache/ $ssh:$base/$project/shared/models_cache/

# local models
local=$(cd api/models_cache && find . -maxdepth 2 -name 'models--*' -type d | sed 's|./||' | sort)

# server models
remote=$(ssh $ssh "cd $base/$project/shared/models_cache && find . -maxdepth 2 -name 'models--*' -type d | sed 's|./||'" | sort)

# report server models not in local
echo "checking for unused models on server..."
for r in $remote; do
  echo "$local" | grep -q "^${r}$" || echo "unused: $r"
done

deploy.sh (deleted, 301 lines)
@ -1,301 +0,0 @@
#!/bin/sh
set -eu

# rsync wrapper for unreliable network connections
# my home network is spotty and rsync often dies mid-transfer
# this lets me manually retry the failed rsync and continue the deploy
rsync_retry() {
  if ! rsync "$@"; then
    echo ""
    echo "rsync failed (probably network). run this manually:"
    echo "rsync $*"
    echo ""
    printf "press enter after manual rsync completes to continue..."
    read _
  fi
}

# Deployment topology and zero-downtime process
#
# Directory structure on remote:
# $base/$project/
#   base_port - starting port number (default 3100)
#   releases/{stamp}_{hash}/
#     dist/ - static assets served by nginx
#     server/ - node server from frontend/server/entrypoint.express.js
#     api/ - python flask API (salience/, nltk_data/, etc.)
#     assigned_port - port allocated to frontend for this release
#     assigned_api_port - port allocated to API for this release
#   current -> releases/{latest}
#   systemd/ - unit files per release (frontend + API)
#
# Zero-downtime deployment:
# 1. rsync new release (frontend/dist/ + frontend/server/ + api/)
# 2. install dependencies (npm for frontend, uv for API)
# 3. allocate two ports (frontend + API via get-next-port.sh)
# 4. generate systemd units for new release with unique ports
#    - frontend: node server
#    - API: gunicorn with 4 workers running Flask app
# 5. start new services, wait for health
# 6. update nginx upstream to point to new ports
# 7. reload nginx (graceful, no dropped connections)
# 8. stop old services
# 9. cleanup old releases (keep 3 most recent)
#
# Port allocation: get-next-port.sh reads base_port and existing
# assigned_port files to find first available port.
# Each release runs independently until cutover.

ssh=deploy-peoplesgrocers-website
base=/home/peoplesgrocers
project=salience
nginx_conf=/etc/nginx/sites-available/$project
service_listen_address=127.221.91.58
local_nginx_snippet="$HOME/src/work/infra/servers/chicago-web01/nginx/snippets/qwik-city-apps/salience.conf"

test -d frontend/dist || { echo 'no frontend/dist/'; exit 1; }
test -d frontend/server || { echo 'no frontend/server/'; exit 1; }
test -d .git || { echo 'not a git repo'; exit 1; }
git diff-index --quiet HEAD || { echo 'git repo dirty'; exit 1; }

hash=$(git rev-parse --short=8 HEAD)
stamp=$(date +%Y-%b-%d-%a-%I-%M%p | tr 'APM' 'apm')
release="${stamp}_${hash}"
service_name="${project}-${release}"

echo "deploying: $project @ $release"
printf "continue? [y/n] "
read ans
test "$ans" = "y" || exit 1

# prepare remote directories
ssh $ssh "mkdir -p $base/$project/{releases,systemd} $base/$project/releases/$release"

# sync both dist and server
echo "syncing dist..."
rsync_retry -tvaz frontend/dist/ $ssh:$base/$project/releases/$release/dist/
echo "syncing server..."
rsync_retry -tvaz frontend/server/ $ssh:$base/$project/releases/$release/server/

# copy server package.json and install dependencies
echo "copying server package.json..."
scp frontend/package.json $ssh:$base/$project/releases/$release/package.json
echo "installing server dependencies..."
ssh $ssh "source ~/.nvm/nvm.sh && cd $base/$project/releases/$release && npm install"

# sync api directory (exclude benchmarks, include specific files/dirs)
echo "syncing api..."
rsync_retry -tvaz \
  --include='salience/' --include='salience/**' \
  --include='nltk_data/' --include='nltk_data/**' \
  --include='pyproject.toml' \
  --include='uv.lock' \
  --include='transcript.txt' \
  --include='README.md' \
  --exclude='*' \
  api/ $ssh:$base/$project/releases/$release/api/

# link to shared models cache
echo "linking to shared models_cache..."
ssh $ssh "mkdir -p $base/$project/shared/models_cache"
ssh $ssh "ln -sfn ../../../shared/models_cache $base/$project/releases/$release/api/models_cache"

echo "installing api dependencies..."
ssh $ssh "cd $base/$project/releases/$release/api && ~/.local/bin/uv sync"

set -x
# determine ports for this release (frontend and api)
port=$(sh get-next-port.sh)
echo "frontend port for this release: $port"
api_port=$(sh get-next-port.sh)
echo "api port for this release: $api_port"

# record port assignments
ssh $ssh "echo $port > $base/$project/releases/$release/assigned_port"
ssh $ssh "echo $api_port > $base/$project/releases/$release/assigned_api_port"

set +x

# generate systemd unit file
unit_content="[Unit]
Description=${project} release ${release}
After=network.target

[Service]
Type=simple
User=peoplesgrocers
WorkingDirectory=$base/$project/releases/$release
Environment=\"PORT=$port\"
Environment=\"ORIGIN=https://peoplesgrocers.com\"
ExecStart=/home/peoplesgrocers/.nvm/versions/node/v24.10.0/bin/node server/entry.express.js
Restart=on-failure
RestartSec=10

[Install]
WantedBy=multi-user.target"

echo "$unit_content" | ssh $ssh "cat > $base/$project/systemd/${service_name}.service"

echo ""
echo "systemd unit created at: $base/$project/systemd/${service_name}.service"
echo ""

# generate systemd unit file for API
api_service_name="${project}-api-${release}"
api_unit_content="[Unit]
Description=${project} API release ${release}
After=network.target

[Service]
Type=simple
User=peoplesgrocers
WorkingDirectory=$base/$project/releases/$release/api
Environment=\"PORT=$api_port\"
ExecStart=$base/$project/releases/$release/api/.venv/bin/gunicorn --bind ${service_listen_address}:$api_port --workers 4 salience:app
Restart=on-failure
RestartSec=10

[Install]
WantedBy=multi-user.target"

echo "$api_unit_content" | ssh $ssh "cat > $base/$project/systemd/${api_service_name}.service"

echo ""
echo "API systemd unit created at: $base/$project/systemd/${api_service_name}.service"
echo ""

# find old services
old_service=$(ssh $ssh "systemctl list-units --type=service --state=running | grep '^${project}-' | grep -v 'api' | awk '{print \$1}' | head -1" || true)
if [ -n "$old_service" ]; then
  old_port=$(ssh $ssh "systemctl show $old_service --property=Environment" | sed -n 's/.*PORT=\([0-9]*\).*/\1/p')
  echo "old frontend service: $old_service (port $old_port)"
else
  old_port=""
  echo "no old frontend service running"
fi

old_api_service=$(ssh $ssh "systemctl list-units --type=service --state=running | grep '^${project}-api-' | awk '{print \$1}' | head -1" || true)
if [ -n "$old_api_service" ]; then
  old_api_port=$(ssh $ssh "systemctl show $old_api_service --property=Environment" | sed -n 's/.*PORT=\([0-9]*\).*/\1/p')
  echo "old API service: $old_api_service (port $old_api_port)"
else
  old_api_port=""
  echo "no old API service running"
fi

# Update local nginx snippet with new port
if [ -n "$local_nginx_snippet" ] && [ -f "$local_nginx_snippet" ]; then
  echo "updating local nginx snippet: $local_nginx_snippet"
  if [ -n "$old_port" ]; then
    echo "  changing port $old_port -> $port"
    sed -i.bak "s/${service_listen_address}:${old_port}/${service_listen_address}:${port}/g" "$local_nginx_snippet"
  else
    echo "  setting port to $port"
    sed -i.bak "s/${service_listen_address}:[0-9]\{4,5\}/${service_listen_address}:${port}/g" "$local_nginx_snippet"
  fi
  rm -f "${local_nginx_snippet}.bak"
  echo "nginx snippet updated locally"
else
  echo "warning: local_nginx_snippet not set or file not found"
fi

echo ""
echo "--- run these commands on $ssh ---"
echo ""
echo "# install and start new services"
echo "sudo ln -sf $base/$project/systemd/${service_name}.service /etc/systemd/system/"
echo "sudo ln -sf $base/$project/systemd/${api_service_name}.service /etc/systemd/system/"
echo "sudo systemctl daemon-reload"
echo "sudo systemctl start ${service_name}"
echo "sudo systemctl start ${api_service_name}"
echo ""
echo "# verify services are healthy"
echo "sudo systemctl status ${service_name}"
echo "sudo systemctl status ${api_service_name}"
echo "curl http://${service_listen_address}:$port/"
echo "curl http://${service_listen_address}:$api_port/models"
echo ""
echo "# then deploy your nginx configuration and reload nginx"
echo ""

if [ -n "$old_service" ]; then
  echo "# stop old frontend service"
  echo "sudo systemctl stop $old_service"
  echo ""
fi

if [ -n "$old_api_service" ]; then
  echo "# stop old API service"
  echo "sudo systemctl stop $old_api_service"
  echo ""
fi

echo "# update current symlink"
echo "ln -sfn releases/$release $base/$project/current"
echo ""
echo "--- end commands ---"
echo ""

printf "test health checks? [y/n] "
read ans
if [ "$ans" = "y" ]; then
  echo "testing frontend..."
  ssh $ssh "curl -v http://${service_listen_address}:$port/" || echo "frontend health check failed"
  echo "testing API..."
  ssh $ssh "curl -v http://${service_listen_address}:$api_port/models" || echo "API health check failed"
fi

if [ -n "$old_service" ]; then
  echo ""
  printf "stop old frontend service ($old_service)? [y/n] "
  read ans
  if [ "$ans" = "y" ]; then
    ssh $ssh "sudo systemctl stop $old_service"
    echo "old frontend service stopped"
  fi
fi

if [ -n "$old_api_service" ]; then
  echo ""
  printf "stop old API service ($old_api_service)? [y/n] "
  read ans
  if [ "$ans" = "y" ]; then
    ssh $ssh "sudo systemctl stop $old_api_service"
    echo "old API service stopped"
  fi
fi

echo ""
printf "update current symlink? [y/n] "
read ans
if [ "$ans" = "y" ]; then
  ssh $ssh "ln -sfn releases/$release $base/$project/current"
  echo "current -> $release"
fi

echo ""
echo "cleanup old releases (keep 3):"
old_releases=$(ssh $ssh "cd $base/$project/releases && ls -t | sed -n '4,\$p'" || true)
if [ -n "$old_releases" ]; then
  echo "$old_releases"
  printf "remove these? [y/n] "
  read ans
  if [ "$ans" = "y" ]; then
    ssh $ssh "cd $base/$project/releases && ls -t | sed -n '4,\$p' | while read r; do
      rm -rf \"\$r\"
      sudo systemctl stop ${project}-\${r} 2>/dev/null || true
      sudo systemctl stop ${project}-api-\${r} 2>/dev/null || true
      sudo rm -f /etc/systemd/system/${project}-\${r}.service
      sudo rm -f /etc/systemd/system/${project}-api-\${r}.service
      rm -f $base/$project/systemd/${project}-\${r}.service
      rm -f $base/$project/systemd/${project}-api-\${r}.service
    done"
    ssh $ssh "sudo systemctl daemon-reload"
    echo "cleanup done"
  fi
else
  echo "no old releases to clean"
fi

echo ""
echo "done: $release (frontend port $port, API port $api_port)"

frontend/.gitignore (vendored, 4 lines changed)
@ -37,6 +37,6 @@ lerna-debug.log*
*.sln
*.sw?

-# Yarn
+# Yarn PnP
.yarn/*
!.yarn/releases
.pnp.*

frontend/.yarnrc.yml (new file, 7 lines)
@ -0,0 +1,7 @@
nodeLinker: pnp

packageExtensions:
  "@builder.io/qwik@*":
    dependencies:
      ignore: "*"
      semver: "*"

frontend/deploy.sh (new executable file, 124 lines)
@ -0,0 +1,124 @@
#!/bin/sh
set -eu

# Deployment topology with Nomad
#
# Directory structure on remote:
# $base/$project/
#   releases/{stamp}_{hash}/
#     dist/ - static assets served by nginx
#     server/ - node server (entry.express.js)
#     package.json - dependencies
#     node_modules/ - installed dependencies
#     job.nomad.hcl - nomad job definition for this release
#   current -> releases/{latest}
#
# Zero-downtime deployment with Nomad:
# 1. rsync new release (dist/ + server/)
# 2. yarn install dependencies
# 3. generate job file with release path
# 4. nomad job run (triggers blue-green deployment)
# 5. nomad waits for health checks to pass
# 6. nomad auto-promotes new allocation
# 7. old allocation enters graceful shutdown (30s kill_timeout)
# 8. consul-template updates nginx config (via service tags)
# 9. cleanup old releases (keep 5 most recent)

ssh=deploy-peoplesgrocers-website
base=/home/peoplesgrocers
project=salience-editor-qwik-city

test -d dist || { echo 'no dist/'; exit 1; }
test -d server || { echo 'no server/'; exit 1; }
# git diff-index --quiet HEAD || { echo 'git repo dirty'; exit 1; }

hash=$(git rev-parse --short=8 HEAD)
stamp=$(date +%Y-%b-%d-%a-%I_%M%p | tr 'APM' 'apm')
release="${stamp}-${hash}"

echo "deploying: $project @ $release"
printf "continue? [y/n] "
read ans
test "$ans" = "y" || exit 1

# prepare remote directories
ssh $ssh "mkdir -p $base/$project/releases/$release"

# sync all files using rclone (handles poor network connections better)
echo "syncing release files (dist/, server/, package.json)..."
temp_dir=$(mktemp -d)
trap "rm -rf $temp_dir" EXIT INT TERM

# Copy files to temp directory for single rclone transfer
cp -r dist server package.json .yarnrc.yml "$temp_dir/"
rclone copy "$temp_dir/" "${ssh}:$base/$project/releases/$release/" \
  --progress --retries 10 --checksum

rm -rf "$temp_dir"
echo "installing server dependencies..."
ssh $ssh "source ~/.nvm/nvm.sh && cd $base/$project/releases/$release && yarn install"

# generate nomad job file with release path
echo "generating nomad job file..."
release_path="$base/$project/releases/$release"
job_file="$base/$project/releases/$release/job.nomad.hcl"

# Use envsubst with whitelist to only replace our variables, not Nomad runtime variables
export RELEASE_PLACEHOLDER="$release"
export RELEASE_PATH="$release_path"
envsubst '$RELEASE_PLACEHOLDER $RELEASE_PATH' < salience-editor-qwik-city.nomad.hcl | ssh $ssh "cat > $job_file"

echo ""
echo "nomad job file created at: $job_file"
echo ""

# submit job to nomad
echo "submitting job to nomad..."
deployment_id=$(ssh $ssh "source ~/.local/bin/env && nomad job run $job_file | grep -oE 'Deployment ID = [a-f0-9-]+' | awk '{print \$4}'" )

if [ -n "$deployment_id" ]; then
  echo "deployment started: $deployment_id"
  echo ""
  echo "monitoring deployment..."

  # Monitor deployment status
  ssh $ssh "source ~/.local/bin/env && nomad deployment status $deployment_id"

  echo ""
  printf "watch deployment progress? [y/n] "
  read ans
  if [ "$ans" = "y" ]; then
    ssh $ssh "source ~/.local/bin/env && watch -n 2 'nomad deployment status $deployment_id'"
  fi
else
  echo "warning: could not extract deployment ID"
  echo "check deployment status manually with: nomad job status $project"
fi

# update current symlink
echo ""
printf "update current symlink? [y/n] "
read ans
if [ "$ans" = "y" ]; then
  ssh $ssh "ln -sfn releases/$release $base/$project/current"
  echo "current -> $release"
fi

echo ""
echo "done: $release"
echo ""
echo "Next steps:"
echo "- Nomad will automatically promote the deployment after health checks pass"
echo "- Consul-template will update nginx config based on healthy service instances"
echo "- Old allocation will gracefully shutdown (30s timeout for in-flight requests)"
echo "- Run ./cleanup-old-releases.sh to remove old releases (keeps 5 most recent)"
echo ""
if [ -n "$deployment_id" ]; then
  echo "Monitor deployment:"
  echo "  nomad deployment status $deployment_id"
  echo "  watch -n 2 'nomad deployment status $deployment_id'"
  echo "  nomad job allocs $project"
  echo ""
fi
echo "Check service health:"
echo "  curl http://localhost:15500/v1/health/service/$project | jq"

frontend/package-lock.json (generated, 10472 lines changed; diff suppressed because it is too large)
@ -59,5 +59,6 @@
    "prosemirror-transform": "^1.10.4",
    "prosemirror-view": "^1.41.3",
    "temml": "^0.11.11"
-  }
+  },
+  "packageManager": "yarn@4.11.0"
}

frontend/salience-editor-qwik-city.nomad.hcl (new file, 105 lines)
@ -0,0 +1,105 @@
job "salience-editor-qwik-city" {
  datacenters = ["ord10"]
  type        = "service"

  constraint {
    attribute = "${node.unique.name}"
    value     = "chicago-web01"
  }

  group "app" {
    count = 1

    network {
      mode = "host"

      port "http" {
        # Nomad will assign an available port
      }
    }

    # Blue-green deployment strategy using canary
    update {
      max_parallel     = 1
      health_check     = "checks"
      min_healthy_time = "10s"
      healthy_deadline = "5m"
      auto_promote     = true
      auto_revert      = true
      canary           = 1
    }

    task "node-server" {
      driver = "raw_exec"

      user = "peoplesgrocers"

      config {
        # Set working directory to release path
        # RELEASE_PATH will be interpolated during deployment
        work_dir = "$RELEASE_PATH"
        command  = "/home/peoplesgrocers/.nvm/versions/node/v24.10.0/bin/yarn"
        args     = ["node", "server/entry.express.js"]
      }

      # Template to set working directory path dynamically
      # This will be replaced during deployment
      env {
        PORT   = "${NOMAD_PORT_http}"
        ORIGIN = "https://peoplesgrocers.com"
        PATH   = "/home/peoplesgrocers/.nvm/versions/node/v24.10.0/bin:/usr/local/bin:/usr/bin:/bin"
        HOME   = "/home/peoplesgrocers"
      }

      # Release path set during deployment via envsubst
      template {
        data = <<EOH
RELEASE_PATH="$RELEASE_PATH"
EOH
        destination = "local/env"
        env         = true
      }

      resources {
        cpu    = 500 # MHz
        memory = 256 # MB
      }

      # Consul service registration with health check
      service {
        name = "salience-editor-qwik-city"
        port = "http"

        tags = [
          "qwik-city",
          "ssr",
          "app"
        ]

        # Health check on dedicated health endpoint
        check {
          type     = "http"
          path     = "/p/salience-editor/health"
          interval = "10s"
          timeout  = "2s"

          check_restart {
            limit = 3
            grace = "10s"
          }
        }

        # Service meta for nginx template filtering
        meta {
          version = "$RELEASE_PLACEHOLDER"
        }
      }

      # Allow 30 seconds for graceful shutdown of in-flight requests
      kill_timeout = "30s"

      # Ensure node is available
      kill_signal = "SIGTERM"
    }
  }
}
@ -66,8 +66,8 @@ app.use(router);
// Use Qwik City's 404 handler
app.use(notFound);

-// Start the express server
-app.listen(PORT, '127.221.91.58', () => {
+// If I want to use this with nomad health checks, then I have to listen on 127.0.0.1
+app.listen(PORT, '127.0.0.1', () => {
  /* eslint-disable */
-  console.log(`Server started: http://127.221.91.58:${PORT}/`);
+  console.log(`Server started: http://127.0.0.1:${PORT}/`);
});
@ -8,15 +8,16 @@ import { Math } from "~/components/math/math"

A couple of days ago I came across
[github.com/mattneary/salience](https://github.com/mattneary/salience) by Matt Neary. I thought it
-was quite neat how armed with a good understanding of math he was able to take sentence embeddings and in fewer lines of code
-than this introduction determine the significance of all sentences in a document.
+was quite neat how someone well armed with math can take sentence embeddings
+and determine the significance of all sentences in a document in fewer lines of
+code than my introduction paragraph here.

This is not a description of [all the changes I made and extra book-keeping involved to turn Matt's script into a proper web app demo](/grunt-work).

This post is an outsider's view of how Matt's salience code works. If you're
already working with ML models in Python, this will feel torturously detailed.

-I'm going to be explaining everything 3 times: the equations an ML engineer would doodle out, the element-by-element matrix operations to give you a feel for the dataflow, and the numpy code that implements it.
+My interest in this overly detailed style is to show everything 3 times: the equations an ML engineer would doodle out, the element-by-element matrix operations to give you a feel for the dataflow, and the numpy code that implements it.

When you see `sims /= norms.T` in numpy, I want to explain the matrix dimensions
@ -66,21 +67,19 @@ Where:
- <Math tex="D" /> = embedding dimension (768 for all-mpnet-base-v2, 1024 for gte-large-en-v1.5)
- Each row represents one sentence in semantic space

-First, compute all the dot products at once:
+**Step 3a: Compute all dot products**

<Math display tex="\mathbf{S} = \mathbf{E} \mathbf{E}^T" />

Since <Math tex="\mathbf{E}" /> is <Math tex="N \times D" /> and <Math tex="\mathbf{E}^T" /> is <Math tex="D \times N" />, their product gives us an <Math tex="N \times N" /> matrix where entry <Math tex="S_{ij} = \mathbf{e}_i \cdot \mathbf{e}_j" />.

-Now we complete the cosine similarity formula by dividing each element by the product of the corresponding embedding norms:
-
-<Math display tex="A_{ij} = \frac{S_{ij}}{\|\mathbf{e}_i\| \cdot \|\mathbf{e}_j\|}" />
-
-This gives us the full adjacency matrix:
-
-<Math display tex="\mathbf{A} = \begin{bmatrix} \frac{S_{11}}{\|\mathbf{e}_1\| \cdot \|\mathbf{e}_1\|} & \frac{S_{12}}{\|\mathbf{e}_1\| \cdot \|\mathbf{e}_2\|} & \cdots & \frac{S_{1N}}{\|\mathbf{e}_1\| \cdot \|\mathbf{e}_N\|} \\ \frac{S_{21}}{\|\mathbf{e}_2\| \cdot \|\mathbf{e}_1\|} & \frac{S_{22}}{\|\mathbf{e}_2\| \cdot \|\mathbf{e}_2\|} & \cdots & \frac{S_{2N}}{\|\mathbf{e}_2\| \cdot \|\mathbf{e}_N\|} \\ \vdots & \vdots & \ddots & \vdots \\ \frac{S_{N1}}{\|\mathbf{e}_N\| \cdot \|\mathbf{e}_1\|} & \frac{S_{N2}}{\|\mathbf{e}_N\| \cdot \|\mathbf{e}_2\|} & \cdots & \frac{S_{NN}}{\|\mathbf{e}_N\| \cdot \|\mathbf{e}_N\|} \end{bmatrix}" />
+**Step 3b: Compute the norms and normalize**
+First, compute a vector of norms:
+
+<Math display tex="\mathbf{n} = \begin{bmatrix} \|\mathbf{e}_1\| \\ \|\mathbf{e}_2\| \\ \|\mathbf{e}_3\| \\ \vdots \\ \|\mathbf{e}_N\| \end{bmatrix}" />
+
+This is an <Math tex="(N, 1)" /> vector where each element is the magnitude of one sentence's embedding. Now we need to visit every single element of <Math tex="\mathbf{S}" /> to make the adjacency matrix <Math tex="A_{ij} = \frac{S_{ij}}{n_i \cdot n_j}" />:
+
+<Math display tex="\mathbf{A} = \begin{bmatrix} \frac{S_{11}}{n_1 \cdot n_1} & \frac{S_{12}}{n_1 \cdot n_2} & \cdots & \frac{S_{1N}}{n_1 \cdot n_N} \\ \frac{S_{21}}{n_2 \cdot n_1} & \frac{S_{22}}{n_2 \cdot n_2} & \cdots & \frac{S_{2N}}{n_2 \cdot n_N} \\ \vdots & \vdots & \ddots & \vdots \\ \frac{S_{N1}}{n_N \cdot n_1} & \frac{S_{N2}}{n_N \cdot n_2} & \cdots & \frac{S_{NN}}{n_N \cdot n_N} \end{bmatrix}" />

**Quick benchmark:** For a <Math tex="194 \times 768" /> embeddings matrix (194 sentences):
@ -99,7 +98,12 @@ def cos_sim(a):
    return sims
```

-The `keepdims=True` makes `norms` shape <Math tex="(N, 1)" /> instead of <Math tex="(N,)" />, which is crucial—when transposed, <Math tex="(N, 1)" /> becomes <Math tex="(1, N)" />, allowing the broadcasting to work for column-wise division.
+The `keepdims=True` makes `norms` shape <Math tex="(N, 1)" /> instead of <Math
+tex="(N,)" />, which is crucial. When transposed, <Math tex="(N, 1)" /> becomes
+<Math tex="(1, N)" />, allowing the broadcasting to work for column-wise
+division. Transpose does not do anything to the shape <Math tex="(N,)" />. I
+don't know why transpose works this way, but this seems like a gotcha to look out
+for.
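If it helps, here is a tiny standalone numpy snippet (mine, not from the post) demonstrating the gotcha and the two broadcast divisions:

```python
import numpy as np

v = np.arange(3.0)                  # shape (3,)
print(v.T.shape)                    # (3,): transposing a 1-D array is a no-op

E = np.array([[3., 4.], [6., 8.], [0., 1.]])       # three toy "embeddings"
sims = E @ E.T                                     # S = E E^T, shape (3, 3)
norms = np.linalg.norm(E, axis=1, keepdims=True)   # shape (3, 1), not (3,)
sims /= norms      # (3,3) / (3,1): divides row i by ||e_i||
sims /= norms.T    # (3,3) / (1,3): divides column j by ||e_j||
print(sims[0, 1])  # 1.0, rows 0 and 1 point in the same direction
```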
## Step 4: Clean Up the Graph

@ -109,9 +113,15 @@ We make two adjustments to the adjacency matrix to make our TextRank work:
1. **Remove self-loops:** Set diagonal to zero (<Math tex="A_{ii} = 0" />)
2. **Remove negative edges:** Set <Math tex="A_{ij} = \max(0, A_{ij})" />

-A sentence shouldn't vote for its own importance. And sentences with opposite meanings get disconnected.
-
-**Important assumption:** This assumes your document has a coherent main idea and that sentences are generally on-topic. We're betting that the topic with the most "semantic mass" is the *correct* topic. This is obviously not true for many documents:
+A sentence shouldn't vote for its own importance. And sentences with opposite
+meanings get disconnected. I'll grant you that 2) seems like a bit of a leap:
+my understanding is that the real reason we zero out negative entries is that
+the normalization algorithm we want to use does not work with negative edges.
+We worked backwards from the available normalization algorithms to handwave an
+assumption that your document has a coherent main idea and that sentences are
+generally on-topic. We're betting that the topic with the most "semantic mass"
+is the *correct* topic. This is obviously not true for many documents:

- Dialectical essays that deliberately contrast opposing viewpoints
- Documents heavy with quotes that argue against something
@ -120,9 +130,20 @@ A sentence shouldn't vote for its own importance. And sentences with opposite me

For example: "Nuclear power is dangerous. Critics say it causes meltdowns [...]. However, modern reactors are actually very safe."

The algorithm might highlight the criticism because multiple sentences cluster
around "danger", even though the document's actual position is pro-nuclear.
There's nothing inherent in the math that identifies authorial intent vs.
quoted opposition.

-**Bottom line:** This technique works well for coherent, single-perspective documents. It can fail when multiple competing viewpoints have similar semantic weight.
+Reflecting on my personal use cases, basically all documents I would run
+through such a tool to edit for compactness will be single-topic persuasive
+essays. We should X. It's very unlikely I'll be able to indulge my penchant for
+dialectical essays at work.
+
+Basically, just keep in mind that we've made a pretty big foundational
+assumption that can fail when multiple competing viewpoints have similar
+semantic weight, and the demo gives you no visual indication or warning this has
+happened.
## Step 5: Normalize the Adjacency Matrix

@ -145,34 +166,39 @@ The result is a diagonal matrix that looks like:

Now we use <Math tex="\mathbf{D}" /> to normalize <Math tex="\mathbf{A}" />. There are two approaches:

-Traditional normalization <Math tex="\mathbf{D}^{-1} \mathbf{A}" />:
+Traditional normalization <Math tex="\tilde{\mathbf{A}} = \mathbf{D}^{-1} \mathbf{A}" />:
- This creates a row-stochastic matrix (rows sum to 1)
- Interpretation: "If I'm at sentence <Math tex="i" />, what's the probability of jumping to sentence <Math tex="j" />?"
- This is like a proper Markov chain transition matrix
- Used in standard PageRank and TextRank
+- Supports **directed** graphs, a property useful for modeling web page
+  navigation (page A links to B but B does not link back to A), but one we don't
+  actually need for sentence similarity, where the similarity of A to B is exactly
+  the same value as B to A.

-Spectral normalization <Math tex="\mathbf{D}^{-1/2} \mathbf{A} \mathbf{D}^{-1/2}" />:
-- Used in spectral clustering and graph analysis
+Spectral normalization <Math tex="\tilde{\mathbf{A}} = \mathbf{D}^{-1/2} \mathbf{A} \mathbf{D}^{-1/2}" />:
+- Treats the graph as **undirected** (hey! that's us)
+- Symmetry preservation: if <Math tex="\mathbf{A}" /> is symmetric (which a cosine similarity matrix is), then the normalized version stays symmetric
- The eigenvalues are bounded in [-1, 1]
- More uniform influence from all neighbors
- Better numerical properties for exponentiation

+With traditional normalization, sentences with many connections get their
+influence diluted. A sentence connected to 10 others splits its "voting power"
+into 10 pieces. A sentence connected to 2 others splits its power into just 2
+pieces. This creates a bias against well-connected sentences.
+
-Spectral normalization treats the graph as **undirected**, which matches how
-semantic similarity works. Well-connected sentences keep their influence
-proportional to connectivity. Two sentences that are similar to each other
-should have equal influence on each other, not asymmetric transition
-probabilities.
+Spectral normalization solves this problem. Well-connected sentences keep their
+influence proportional to connectivity.

+I asked an ML engineer to explain the same idea, to give you a
+Rosetta Stone for understanding their jargon.
+
+> The traditional <Math tex="\mathbf{D}^{-1} \mathbf{A}" /> approach introduces potential node bias and lacks symmetry. Spectral normalization
+> provides a more balanced representation by symmetrizing the adjacency matrix and ensuring more uniform
+> neighbor influence. This method prevents high-degree nodes from dominating the graph's structure, creating a
+> more equitable information propagation mechanism.
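To make the comparison concrete, here is a quick numpy sketch (mine, not the post's code) of both normalizations on a tiny symmetric adjacency matrix:

```python
import numpy as np

A = np.array([[0.0, 0.8, 0.3],
              [0.8, 0.0, 0.5],
              [0.3, 0.5, 0.0]])  # symmetric, non-negative, zero diagonal

deg = A.sum(axis=1)                      # degree (total edge weight) per node
D_inv = np.diag(1.0 / deg)
D_inv_sqrt = np.diag(1.0 / np.sqrt(deg))

row_stochastic = D_inv @ A               # rows sum to 1, symmetry is lost
spectral = D_inv_sqrt @ A @ D_inv_sqrt   # stays symmetric

print(row_stochastic.sum(axis=1))                     # [1. 1. 1.]
print(np.allclose(spectral, spectral.T))              # True
print(np.allclose(row_stochastic, row_stochastic.T))  # False
```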
## Step 6: Random Walk Simulation

@ -187,15 +213,35 @@ Where:

**Intuition:** After <Math tex="k" /> steps of random walking through the similarity graph, which sentences have we visited most? Those are the central, important sentences.

You might think we'd need to exponentiate the matrix—compute <Math tex="\tilde{\mathbf{A}}^k" /> first, then multiply by <Math tex="\mathbf{1}^T" />. But there's a trick here. Since <Math tex="\mathbf{1}^T" /> is just a row vector of all ones (shape <Math tex="1 \times N" />), we can evaluate the expression left-to-right:

<Math display tex="\mathbf{s} = ((\mathbf{1}^T \tilde{\mathbf{A}}) \tilde{\mathbf{A}}) \tilde{\mathbf{A}} \cdots" />

Each step is vector times matrix, which produces another vector. So we're doing <Math tex="k" /> iterations of vector-matrix multiplication, where each one is <Math tex="N^2" /> operations. Total cost: <Math tex="kN^2" />.

If we were exponentiating the matrix instead, we'd be doing matrix-matrix multiplication (<Math tex="N^3" /> operations per step). Since <Math tex="k" /> is small (say only 5 or 10) it's way more efficient to just evaluate left-to-right and keep passing a vector through. For a document with 200 sentences and <Math tex="k=5" />, that's roughly 200,000 operations instead of 8,000,000. A 40× speedup!
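A small numpy sketch (mine, not the post's exact code) of the left-to-right evaluation:

```python
import numpy as np

def salience_scores(A_norm: np.ndarray, k: int = 5) -> np.ndarray:
    """s = 1^T A^k evaluated as k vector-matrix products:
    O(k N^2) instead of the O(k N^3) cost of repeated matrix-matrix multiplies."""
    s = np.ones(A_norm.shape[0])  # the row vector 1^T
    for _ in range(k):
        s = s @ A_norm            # vector times matrix stays a vector
    return s
```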
The choice of <Math tex="k" /> is important. A small <Math tex="k" /> (5-10 steps) means the random walk doesn't go very far. A sentence's importance is determined by its immediate neighborhood in the similarity graph. A large <Math tex="k" /> (letting it converge, like PageRank does) means influence propagates across the entire document, and you end up with only the sentences most central to the document's single main theme ranking highly.

For editing, we want the local structure. Documents aren't monolithic. Different paragraphs discuss different aspects of the topic. We want to see which sentences matter within their local context, not just identify the 3-5 globally most central sentences. So we use a small <Math tex="k" /> and deliberately stop before convergence.

As a bonus, this not-fully-converged state also happens to be computationally cheaper.

## Step 7: Map Scores to Highlight Colors

Now we have a vector of raw salience scores from the random walk. Problem: these scores have no physical meaning. Different embedding models produce wildly different ranges:
-- Model A on Doc 1: `[0.461, 1.231]`
-- Model B on Doc 2: `[0.892, 1.059]`
+- Model A on Doc 1: scores range from 0.461 to 1.231
+- Model B on Doc 1: scores range from 0.892 to 1.059

We need to turn this vector of arbitrary numbers into CSS highlight opacities in `[0, 1]`. Here's the reasoning behind creating the remapping function:

-I could do trivial linear scaling - multiply by a constant to get scores into some range like <Math tex="X" /> to <Math tex="X + 2" />. But let's try to make the top sentences stand out more. One trick: exponentiation. Since human perception of brightness is not linear, exponentiation will preserve order but push the top values apart more. It makes the top few sentences really pop out.
+Since I'm using this for editing documents, it seems reasonable that I would only want to see highlights on roughly half the sentences and throw the rest away. (Of course, the threshold is configurable in the settings panel.)
+
+Here's the idea: if we map scores into a range of size 2 (say, <Math tex="X" /> to <Math tex="X + 2" />), then we can threshold at the midpoint (<Math tex="X + 1" />). Sentences scoring <Math tex="X + 1" /> to <Math tex="X + 2" /> get highlighted.
+
+For a typical document, this gives you roughly 50% highlighted. But it's better than just hard-thresholding at exactly the top 50%: if 70% of sentences score above <Math tex="X + 1" />, maybe your document is just really on-topic and you don't need to cut as much. If only 30% score above <Math tex="X + 1" />, the document is scattered and only the truly central sentences get highlighted.
+
+I could do trivial linear scaling to get scores into the range <Math tex="X" /> to <Math tex="X + 2" />. But let's try to make the top sentences stand out more. One trick: exponentiation. Since human perception of brightness is not linear, exponentiation will preserve order but push the top values apart more. It makes the top few sentences really pop out.

**Building the remapping function**
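Here is a sketch of the whole remapping (my reconstruction, not the demo's actual code; a plain bisection stands in for the "simple solver", and edge cases like all-equal scores are glossed over):

```python
import numpy as np

def remap_opacities(s: np.ndarray) -> np.ndarray:
    """Map raw salience scores to CSS opacities in [0, 1]:
    exponentiate so the scores span a range of exactly 2, then
    clamp everything below the midpoint of that range to zero."""
    lo, hi = float(s.min()), float(s.max())
    p_lo, p_hi = 0.0, 64.0
    for _ in range(60):              # bisect for spread(p) == 2
        p = (p_lo + p_hi) / 2
        if hi**p - lo**p < 2.0:
            p_lo = p
        else:
            p_hi = p
    t = s**p                         # order preserved, top values spread out
    tau = t.min() + 1.0              # midpoint of the size-2 range
    return np.clip(t - tau, 0.0, 1.0)
```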
@ -205,9 +251,9 @@ Given a salience vector <Math tex="\mathbf{s}" /> with values ranging from <Math

Sure, it takes more work to find the right exponent for our target spread of 2, but that's still easy with a simple solver.

-2. **Find a threshold** <Math tex="\tau" /> such that 50% of the sentences get clamped to zero.
-
-   Since I'm using this for editing documents, I only want to see highlights on roughly half the sentences—the important half.
+2. **Set the threshold** <Math tex="\tau" /> at the midpoint of the remapped range.
+
+   This is where we draw the line between highlighted and non-highlighted sentences.

The final opacity mapping is:

frontend/src/routes/health/index.ts (new file, 9 lines)
@ -0,0 +1,9 @@
import type { RequestHandler } from "@builder.io/qwik-city";

export const onGet: RequestHandler = async ({ json }) => {
  json(200, {
    status: "ok",
    wizard: "harry",
    timestamp: new Date().toISOString(),
  });
};
@ -319,7 +319,7 @@ export default component$(() => {
        <span class="subtitle">
          sentence highlights based on their significance to the document
        </span>
-       <a href="/about" class="about-link">How it works →</a>
+       <a href="./about" class="about-link">How it works →</a>
      </h1>
      <div class="controls">
        <label for="model-select">Model:</label>

frontend/yarn.lock (new file, 7257 lines; diff suppressed because it is too large)
@ -1,51 +0,0 @@
#!/bin/sh
set -eu

# Port allocation for zero-downtime deployments
#
# Reads remote state to determine next available port.
# State files:
#   $base/$project/base_port - starting port number
#   $base/$project/releases/*/assigned_port - port for each release
#
# Algorithm:
# 1. read base_port (default 3100)
# 2. find all assigned_port files in releases/
# 3. return smallest unused port >= base_port

ssh=deploy-peoplesgrocers-website
base=/home/peoplesgrocers
project=salience

ssh $ssh "
set -eu

base=$base
project=$project

test -d \$base/\$project || { echo 'project dir does not exist' >&2; exit 1; }

# read or initialize base_port
if test -f \$base/\$project/base_port; then
  base_port=\$(cat \$base/\$project/base_port)
else
  base_port=3100
  echo \$base_port > \$base/\$project/base_port
fi

# find all assigned ports
assigned=\$(find \$base/\$project/releases -name assigned_port -exec cat {} \; 2>/dev/null | sort -n || true)

# find first unused port
port=\$base_port
while true; do
  found=0
  for p in \$assigned; do
    test \$p -eq \$port && { found=1; break; }
  done
  test \$found -eq 0 && break
  port=\$((port + 1))
done

echo \$port
"