salience-editor/api/benchmarks/01-generate_embeddings.py

34 lines
1.1 KiB
Python
Raw Normal View History

2025-11-02 13:09:23 -08:00
import sys
import os
import numpy as np
# Benchmark step 1: encode every sentence of the transcript with the default
# model and store the resulting vectors in genfiles/embeddings.npy.

# Put the directory above this script on sys.path so the local `salience`
# package can be imported without being installed. This must run before the
# project import below, which is why that import is not at the top of the file.
script_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(script_dir)
sys.path.insert(0, parent_dir)
from salience.salience import models, get_sentences

# Load the transcript, which is expected in the parent directory.
# The encoding is pinned so decoding does not depend on the platform's locale
# default. NOTE(review): assumes the transcript is UTF-8 -- confirm.
transcript_path = os.path.join(parent_dir, 'transcript-1.txt')
with open(transcript_path, 'r', encoding='utf-8') as f:
    source_text = f.read()

# Split the text into sentences and encode them.
# get_sentences returns a pair; only the sentences are used in this script,
# sentence_ranges is kept solely to unpack the result.
print("Loading transcript and encoding sentences...")
sentences, sentence_ranges = get_sentences(source_text)
print(f"Number of sentences: {len(sentences)}")

# Use the default model for comparison.
model_name = 'all-mpnet-base-v2'
model = models[model_name]
# presumably returns a NumPy array with one row per sentence -- the code below
# only relies on it having `.shape` and being accepted by np.save.
vectors = model.encode(sentences)
print(f"Vector shape: {vectors.shape}")

# Save the embeddings into a `genfiles` directory beside this script,
# creating the directory on first run.
genfiles_dir = os.path.join(script_dir, 'genfiles')
os.makedirs(genfiles_dir, exist_ok=True)
output_path = os.path.join(genfiles_dir, 'embeddings.npy')
np.save(output_path, vectors)

# Report where the file went and its on-disk size (bytes -> MiB).
print(f"\nEmbeddings saved to: {output_path}")
print(f"File size: {os.path.getsize(output_path) / 1024 / 1024:.2f} MB")