feat: create deployment scripts
parent 78297efe5c
commit 8d5bce4bfb
22 changed files with 2697 additions and 74 deletions
@@ -63,3 +63,23 @@ The production build will generate client and server modules by running both cli
```shell
npm run build # or `yarn build`
```

## Static Site Generator (Node.js)

Be sure to configure your server to serve very long cache headers for the `build/**/*.js` files.

Typically you'd set the `Cache-Control` header for those files to `public, max-age=31536000, immutable`.

```shell
npm run build.server
```

## Express Server

This app has a minimal [Express server](https://expressjs.com/) implementation. After running a full build, you can preview the build using the command:

```shell
npm run serve
```

Then visit [http://localhost:8080/](http://localhost:8080/)

15 frontend/adapters/express/vite.config.ts Normal file
@@ -0,0 +1,15 @@
import { nodeServerAdapter } from "@builder.io/qwik-city/adapters/node-server/vite";
import { extendConfig } from "@builder.io/qwik-city/vite";
import baseConfig from "../../vite.config";

export default extendConfig(baseConfig, () => {
  return {
    build: {
      ssr: true,
      rollupOptions: {
        input: ["src/entry.express.tsx", "@qwik-city-plan"],
      },
    },
    plugins: [nodeServerAdapter({ name: "express" })],
  };
});

24 frontend/adapters/static/vite.config.ts Normal file
@@ -0,0 +1,24 @@
import { staticAdapter } from "@builder.io/qwik-city/adapters/static/vite";
import { extendConfig } from "@builder.io/qwik-city/vite";
import baseConfig from "../../vite.config";

export default extendConfig(baseConfig, () => {
  return {
    build: {
      ssr: true,
      rollupOptions: {
        input: ["@qwik-city-plan"],
      },
    },
    plugins: [
      staticAdapter({
        origin: "https://peoplesgrocers.com",
        // Specify which routes to statically generate
        routes: [
          "/about",
          // Add more static routes here as needed
        ],
      }),
    ],
  };
});

896 frontend/package-lock.json generated
File diff suppressed because it is too large
@@ -11,6 +11,8 @@
"build": "qwik build",
|
||||
"build.client": "vite build",
|
||||
"build.preview": "vite build --ssr src/entry.preview.tsx",
|
||||
"build.server": "qwik check-client src dist && vite build -c adapters/express/vite.config.ts",
|
||||
"build.static": "vite build -c adapters/static/vite.config.ts",
|
||||
"build.types": "tsc --incremental --noEmit",
|
||||
"deploy": "echo 'Run \"npm run qwik add\" to install a server adapter'",
|
||||
"dev": "vite --mode ssr",
|
||||
|
|
@@ -19,6 +21,7 @@
"fmt.check": "prettier --check .",
|
||||
"lint": "eslint \"src/**/*.ts*\"",
|
||||
"preview": "qwik build preview && vite preview --open",
|
||||
"serve": "node server/entry.express",
|
||||
"start": "vite --open --mode ssr",
|
||||
"qwik": "qwik"
|
||||
},
|
||||
|
|
@@ -27,7 +30,11 @@
"@builder.io/qwik-city": "^1.17.1",
|
||||
"@eslint/js": "latest",
|
||||
"@tailwindcss/vite": "^4.1.16",
|
||||
"@types/compression": "^1.7.2",
|
||||
"@types/express": "^4.17.19",
|
||||
"@types/node": "20.19.0",
|
||||
"compression": "^1.7.4",
|
||||
"dotenv": "^16.3.2",
|
||||
"eslint": "9.32.0",
|
||||
"eslint-plugin-qwik": "^1.17.1",
|
||||
"globals": "16.4.0",
|
||||
|
|
@@ -42,6 +49,7 @@
  },
  "dependencies": {
    "@tailwindcss/typography": "^0.5.19",
    "express": "4.20.0",
    "mathjs": "^15.0.0",
    "prosemirror-commands": "^1.7.1",
    "prosemirror-keymap": "^1.2.3",

73 frontend/src/entry.express.tsx Normal file
@@ -0,0 +1,73 @@
/*
 * WHAT IS THIS FILE?
 *
 * It's the entry point for the Express HTTP server when building for production.
 *
 * Learn more about Node.js server integrations here:
 * - https://qwik.dev/docs/deployments/node/
 *
 */
import {
  createQwikCity,
  type PlatformNode,
} from "@builder.io/qwik-city/middleware/node";
import "dotenv/config";
import qwikCityPlan from "@qwik-city-plan";
import render from "./entry.ssr";
import express from "express";
import { fileURLToPath } from "node:url";
import { join } from "node:path";

declare global {
  type QwikCityPlatform = PlatformNode;
}

// Directories where the static assets are located
const distDir = join(fileURLToPath(import.meta.url), "..", "..", "dist");
const buildDir = join(distDir, "build");
const assetsDir = join(distDir, "assets");

// Allow for dynamic port
const PORT = process.env.PORT ?? 3000;

// Create the Qwik City Node middleware
const { router, notFound } = createQwikCity({
  render,
  qwikCityPlan,
  // getOrigin(req) {
  //   // If deploying under a proxy, you may need to build the origin from the request headers
  //   // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-Proto
  //   const protocol = req.headers["x-forwarded-proto"] ?? "http";
  //   // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Forwarded-Host
  //   const host = req.headers["x-forwarded-host"] ?? req.headers.host;
  //   return `${protocol}://${host}`;
  // }
});

// Create the express server
// https://expressjs.com/
const app = express();

// Enable gzip compression
// app.use(compression());

// Static asset handlers
// https://expressjs.com/en/starter/static-files.html
app.use(`/build`, express.static(buildDir, { immutable: true, maxAge: "1y" }));
app.use(
  `/assets`,
  express.static(assetsDir, { immutable: true, maxAge: "1y" }),
);
app.use(express.static(distDir, { redirect: false }));

// Use Qwik City's page and endpoint request handler
app.use(router);

// Use Qwik City's 404 handler
app.use(notFound);

// Start the express server
app.listen(PORT, () => {
  /* eslint-disable */
  console.log(`Server started: http://localhost:${PORT}/`);
});
@@ -6,21 +6,49 @@ import { Math } from "~/components/math/math"

# How Salience Works

A couple of days ago I came across
[github.com/mattneary/salience](https://github.com/mattneary/salience) by Matt Neary. I thought it
was quite neat how he took sentence embeddings and in just a few lines of code
was able to determine the significance of all sentences in a document.

This post is an outsider's view of how salience works. If you're already working with ML models in Python, this will feel
torturously detailed. I wrote this for the rest of us old-world programmers: compilers, networking, and systems people working in
C++/Go/Rust, or the poor souls in the frontend TypeScript mines.
For us refugees of the barbarian past, the tooling and notation can look foreign. I wanted to walk through the math and
numpy operations in detail to show what's actually happening with the data.

Salience highlights important sentences by treating your document as a graph where sentences that talk about similar things are connected. We then figure out which sentences are most "central" to the document's themes.

## Step 1: Break Text into Sentences

We use NLTK's Punkt tokenizer to split text into sentences. This handles tricky cases where simple punctuation splitting fails:
The first problem we need to solve is finding the sentences in a document. This is not as easy as splitting on newlines or periods. Consider this example:

*"Dr. Smith earned his Ph.D. in 1995."* ← This is **one** sentence, not three!

## Step 2: Convert Sentences to Embeddings
Fortunately, this problem has been adequately solved for decades. We are going to use the **Punkt sentence splitter** (2003) available in the Natural Language Toolkit (NLTK) Python package.
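
Here is a minimal sketch of that with NLTK (the example text is a toy of mine; newer NLTK releases may ask you to download `punkt_tab` instead):

```python
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")  # one-time download of the pretrained Punkt model

text = "Dr. Smith earned his Ph.D. in 1995. He now teaches at MIT."
sentences = sent_tokenize(text)
print(sentences)
# ['Dr. Smith earned his Ph.D. in 1995.', 'He now teaches at MIT.']
```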

Now we have <Math tex="N" /> sentences. We convert each one into a high-dimensional vector that captures its meaning:

## Step 2: Apply an Embedding Model

<Math display tex="\mathbf{E} = \text{model.encode}(\text{sentences}) \in \mathbb{R}^{N \times D}" />

Now we have <Math tex="N" /> sentences. We convert each one into a high-dimensional vector that captures its meaning. For example:

This gives us an **embeddings matrix** <Math tex="\mathbf{E}" /> where each row is one sentence:

<Math tex="\mathbf{Sentence \space A} = [a_1, a_2, a_3, \ldots, a_D]" display="block" />
<Math tex="\mathbf{Sentence \space B} = [b_1, b_2, b_3, \ldots, b_D]" display="block" />
<Math tex="\mathbf{Sentence \space C} = [c_1, c_2, c_3, \ldots, c_D]" display="block" />

## Step 3: Build the Adjacency Matrix

Now we create a new <Math tex="N \times N" /> adjacency matrix <Math tex="\mathbf{A}" /> that measures how similar each pair of sentences is. For every pair of sentences <Math tex="i" /> and <Math tex="j" />, we need the **cosine similarity**:

<Math display tex="A_{ij} = \frac{\mathbf{e}_i \cdot \mathbf{e}_j}{\|\mathbf{e}_i\| \|\mathbf{e}_j\|}" />

Each <Math tex="A_{ij}" /> represents how strongly sentence <Math tex="i" /> is connected to sentence <Math tex="j" />.
- <Math tex="A_{ij} = 1" /> means sentences are identical in meaning
- <Math tex="A_{ij} = 0" /> means sentences are unrelated
- <Math tex="A_{ij} = -1" /> means sentences are opposite in meaning

You could work with these embedding vectors one at a time, using two for loops to build the adjacency matrix leetcode style, as in the sketch above. However, there's a way to delegate the computation to optimized libraries. Instead, organize all embeddings into a single matrix:

<Math display tex="\mathbf{E} = \begin{bmatrix} a_1 & a_2 & a_3 & \cdots & a_D \\ b_1 & b_2 & b_3 & \cdots & b_D \\ c_1 & c_2 & c_3 & \cdots & c_D \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ z_1 & z_2 & z_3 & \cdots & z_D \end{bmatrix}" />

@@ -29,38 +57,59 @@ Where:
- <Math tex="D" /> = embedding dimension (768 for all-mpnet-base-v2, 1024 for gte-large-en-v1.5)
|
||||
- Each row represents one sentence in semantic space
|
||||
|
||||
## Step 3: Build the Adjacency Matrix
|
||||
**Step 3a: Compute all dot products**
|
||||
|
||||
Now we create a new matrix <Math tex="\mathbf{A}" /> that measures how similar each pair of sentences is. For every pair of sentences <Math tex="i" /> and <Math tex="j" />, we compute:
|
||||
<Math display tex="\mathbf{S} = \mathbf{E} \mathbf{E}^T" />
|
||||
|
||||
<Math display tex="A_{ij} = \frac{\mathbf{e}_i \cdot \mathbf{e}_j}{\|\mathbf{e}_i\| \|\mathbf{e}_j\|}" />
|
||||
Since <Math tex="\mathbf{E}" /> is <Math tex="N \times D" /> and <Math tex="\mathbf{E}^T" /> is <Math tex="D \times N" />, their product gives us an <Math tex="N \times N" /> matrix where entry <Math tex="S_{ij} = \mathbf{e}_i \cdot \mathbf{e}_j" />.
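
In numpy that whole step is one expression (a sketch, reusing `E` from the encode example):

```python
S = E @ E.T  # (N, D) @ (D, N) -> (N, N) matrix of all pairwise dot products
```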

This is the **cosine similarity** between their embedding vectors. It tells us:
- <Math tex="A_{ij} = 1" /> means sentences are identical in meaning
- <Math tex="A_{ij} = 0" /> means sentences are unrelated
- <Math tex="A_{ij} = -1" /> means sentences are opposite in meaning
**Step 3b: Compute the norms and normalize**

First, compute a vector of norms:

<Math display tex="\mathbf{n} = \begin{bmatrix} \|\mathbf{e}_1\| \\ \|\mathbf{e}_2\| \\ \|\mathbf{e}_3\| \\ \vdots \\ \|\mathbf{e}_N\| \end{bmatrix}" />

This is an <Math tex="(N, 1)" /> vector where each element is the magnitude of one sentence's embedding. Now we need to visit every single element of <Math tex="\mathbf{S}" /> to make the adjacency matrix <Math tex="A_{ij} = \frac{S_{ij}}{n_i \cdot n_j}" />:

<Math display tex="\mathbf{A} = \begin{bmatrix} \frac{S_{11}}{n_1 \cdot n_1} & \frac{S_{12}}{n_1 \cdot n_2} & \cdots & \frac{S_{1N}}{n_1 \cdot n_N} \\ \frac{S_{21}}{n_2 \cdot n_1} & \frac{S_{22}}{n_2 \cdot n_2} & \cdots & \frac{S_{2N}}{n_2 \cdot n_N} \\ \vdots & \vdots & \ddots & \vdots \\ \frac{S_{N1}}{n_N \cdot n_1} & \frac{S_{N2}}{n_N \cdot n_2} & \cdots & \frac{S_{NN}}{n_N \cdot n_N} \end{bmatrix}" />

**Quick benchmark:** For a <Math tex="194 \times 768" /> embeddings matrix (194 sentences):

- Computing everything in Python for loops: **33.1 ms**
- Using <Math tex="\mathbf{E} \mathbf{E}^T" /> for dot products, but element-by-element normalization in Python: **10.9 ms** (saves 22.2 ms)
- Using numpy **broadcasting** for normalization too: **0.13 ms**

Broadcasting is a numpy feature where dividing arrays of different shapes automatically "stretches" the smaller array to match:

```python
import numpy as np

def cos_sim(a):
    sims = a @ a.T
    norms = np.linalg.norm(a, axis=-1, keepdims=True)
    sims /= norms    # Divides each row i by norm[i]
    sims /= norms.T  # Divides each column j by norm[j]
    return sims
```

The `keepdims=True` makes `norms` shape <Math tex="(N, 1)" /> instead of <Math tex="(N,)" />, which is crucial—when transposed, <Math tex="(N, 1)" /> becomes <Math tex="(1, N)" />, allowing the broadcasting to work for column-wise division.
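
As a quick shape check (toy data, sizes borrowed from the benchmark above):

```python
E = np.random.rand(194, 768)  # 194 sentences, 768-dim embeddings
A = cos_sim(E)
norms = np.linalg.norm(E, axis=-1, keepdims=True)
print(norms.shape, norms.T.shape, A.shape)  # (194, 1) (1, 194) (194, 194)
```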

The result is an <Math tex="N \times N" /> **adjacency matrix** where <Math tex="A_{ij}" /> represents how strongly sentence <Math tex="i" /> is connected to sentence <Math tex="j" />.

## Step 4: Clean Up the Graph

We make two adjustments to the adjacency matrix to get a cleaner graph:
We make two adjustments to the adjacency matrix to make our TextRank work:

1. **Remove self-loops:** Set diagonal to zero (<Math tex="A_{ii} = 0" />)
   - A sentence shouldn't vote for its own importance

2. **Remove negative edges:** Set <Math tex="A_{ij} = \max(0, A_{ij})" />
   - Sentences with opposite meanings get disconnected

A sentence shouldn't vote for its own importance. And sentences with opposite meanings get disconnected.
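
In numpy, both adjustments are one-liners (a sketch, continuing with `A` from `cos_sim` above):

```python
np.fill_diagonal(A, 0)  # 1. remove self-loops: A_ii = 0
A = np.maximum(A, 0)    # 2. remove negative edges: A_ij = max(0, A_ij)
```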

**Important assumption:** This assumes your document has a coherent main idea and that sentences are generally on-topic. We're betting that the topic with the most "semantic mass" is the *correct* topic.
**Important assumption:** This assumes your document has a coherent main idea and that sentences are generally on-topic. We're betting that the topic with the most "semantic mass" is the *correct* topic. This is obviously not true for many documents:

**Where this breaks down:**
- **Dialectical essays** that deliberately contrast opposing viewpoints
- **Documents heavy with quotes** that argue against something
- **Debate transcripts** where both sides are equally important
- **Critical analysis** that spends significant time explaining a position before refuting it
- Dialectical essays that deliberately contrast opposing viewpoints
- Documents heavy with quotes that argue against something
- Debate transcripts where both sides are equally important
- Critical analysis that spends significant time explaining a position before refuting it

For example: "Nuclear power is dangerous. Critics say it causes meltdowns. However, modern reactors are actually very safe."
For example: "Nuclear power is dangerous. Critics say it causes meltdowns [...]. However, modern reactors are actually very safe."

The algorithm might highlight the criticism because multiple sentences cluster around "danger", even though the document's actual position is pro-nuclear. There's nothing inherent in the math that identifies authorial intent vs. quoted opposition.
@@ -2,7 +2,6 @@ import {
  component$,
  useSignal,
  useVisibleTask$,
  useComputed$,
  $,
  noSerialize,
  useStore,
@@ -12,7 +11,7 @@ import {
import { type DocumentHead, routeLoader$ } from "@builder.io/qwik-city";
import { EditorState, Plugin, Transaction } from "prosemirror-state";
import { EditorView, Decoration, DecorationSet } from "prosemirror-view";
import { Schema, DOMParser, Node as PMNode } from "prosemirror-model";
import { Schema, Node as PMNode } from "prosemirror-model";
import { schema as basicSchema } from "prosemirror-schema-basic";
import { keymap } from "prosemirror-keymap";
import { baseKeymap } from "prosemirror-commands";
@@ -123,7 +122,7 @@ export default component$(() => {
  const models = useSignal<string[]>([]);
  const currentModel = useSignal("all-mpnet-base-v2");
  const syncState = useSignal<SyncState>("clean");
  const editorView = useSignal<EditorView | null>(null);
  const editorView = useSignal<NoSerialize<EditorView>>();
  const sentenceDecorations = useSignal<SentenceDecoration[]>([]);
  const salienceData = useSignal<SalienceData | null>(null);
  const debounceTimer = useSignal<number | null>(null);
@@ -224,15 +223,20 @@ export default component$(() => {

  // Parse the initial document text into separate paragraphs
  // Split by single newlines and preserve blank lines as empty paragraphs
  const paragraphs = initialDocument.value
    .split('\n')
    .map(line => {
      // Create paragraph with text if line has content, otherwise empty paragraph
      const content = line.length > 0 ? [salienceSchema.text(line)] : [];
      return salienceSchema.node("paragraph", null, content);
    });
  const lines = initialDocument.value
    .split('\n');

  const paragraphs: PMNode[] = [];
  lines.forEach((line) => {
    // Create paragraph with text if line has content, otherwise empty paragraph
    const content = line.length > 0 ? [salienceSchema.text(line)] : [];
    //content.push(salienceSchema.node("hard_break"));
    paragraphs.push(salienceSchema.node("paragraph", null, content));
  });
  console.log(paragraphs);

  const initialDoc = salienceSchema.node("doc", null, paragraphs);
  console.log(initialDoc.textContent);

  const state = EditorState.create({
    schema: salienceSchema,
@@ -272,7 +276,7 @@ export default component$(() => {
    },
  });

  editorView.value = view;
  editorView.value = noSerialize(view);

  fetchSalienceData(
    currentModel.value,