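"""Encode the sentences of transcript-1.txt with the 'all-mpnet-base-v2' model
and save the embeddings to genfiles/embeddings.npy."""
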
import sys
import os

import numpy as np

# Add the parent directory to the path so we can import salience
script_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(script_dir)
sys.path.insert(0, parent_dir)

from salience.salience import models, get_sentences

# Load the transcript
transcript_path = os.path.join(parent_dir, 'transcript-1.txt')
with open(transcript_path, 'r') as f:
    source_text = f.read()

# Get sentences and encode them
print("Loading transcript and encoding sentences...")
sentences, sentence_ranges = get_sentences(source_text)
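# sentence_ranges is not used in this script; only the sentence strings are encoded.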
print(f"Number of sentences: {len(sentences)}")

# Use the default model for comparison
model_name = 'all-mpnet-base-v2'
model = models[model_name]
vectors = model.encode(sentences)
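# all-mpnet-base-v2 produces 768-dimensional sentence embeddings, so the shape
# printed below should be (number of sentences, 768).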
print(f"Vector shape: {vectors.shape}")

# Save the embeddings to genfiles directory
genfiles_dir = os.path.join(script_dir, 'genfiles')
os.makedirs(genfiles_dir, exist_ok=True)
output_path = os.path.join(genfiles_dir, 'embeddings.npy')
np.save(output_path, vectors)
print(f"\nEmbeddings saved to: {output_path}")
print(f"File size: {os.path.getsize(output_path) / 1024 / 1024:.2f} MB")