"""
Benchmark different cosine similarity implementations for SELF-SIMILARITY (A vs A).

This specialized version only computes norms once since we're comparing A with itself.

First run: python generate_embeddings.py
Then run: pytest test_bench_self_cosine_sim.py --benchmark-json=genfiles/benchmark_self_results.json
To visualize: python visualize_benchmarks.py genfiles/benchmark_self_results.json
"""

import os

import numpy as np
import pytest

# Load pre-generated embeddings once for all tests.
# NOTE(review): assumes genfiles/embeddings.npy exists (produced by
# generate_embeddings.py) and holds a 2-D (n, dim) float array — confirm.
script_dir = os.path.dirname(os.path.abspath(__file__))
embeddings_path = os.path.join(script_dir, 'genfiles', 'embeddings.npy')
vectors = np.load(embeddings_path)


# Original cos_sim function adapted for self-similarity
def cos_sim_original_self(a):
    """Original implementation specialized for self-similarity.

    Scales the raw Gram matrix by the vector norms row-wise and then
    column-wise, using the double-transpose trick carried over from the
    generic A-vs-B version.
    """
    dots = a @ a.T
    vec_norms = np.linalg.norm(a, axis=-1)
    # NOTE: .T on a 1-D array is a no-op; kept to mirror the generic version.
    row_scaled = (dots.T / vec_norms.T).T
    return row_scaled / vec_norms


# Nested for loop version - PROPERLY IMPLEMENTED (norms calculated once)
def cos_sim_nested_loop_self(a):
    """Naive O(n^2) Python double loop; only the norms are vectorized.

    Kept deliberately slow as a benchmark baseline — do not vectorize.
    """
    count = a.shape[0]
    out = np.zeros((count, count))

    # All norms in one vectorized call, hoisted out of the loop on purpose.
    vec_norms = np.linalg.norm(a, axis=-1)

    for row in range(count):
        for col in range(count):
            out[row, col] = np.dot(a[row], a[col]) / (vec_norms[row] * vec_norms[col])

    return out


# E*E^T with manual in-place normalization
def cos_sim_inplace_norm_self(a):
    """Matrix product followed by element-wise, loop-driven normalization.

    Overwrites each Gram-matrix entry in place rather than allocating a
    second matrix.
    """
    gram = a @ a.T

    # One norm per vector — a single array, unlike the generic A-vs-B version.
    vec_norms = np.linalg.norm(a, axis=-1)

    # Normalize every cell in place.
    rows, cols = gram.shape
    for r in range(rows):
        for c in range(cols):
            gram[r, c] = gram[r, c] / (vec_norms[r] * vec_norms[c])

    return gram


# Broadcast division with in-place normalization
def cos_sim_broadcast_inplace_self(a):
    """Broadcasted, in-place normalization of the Gram matrix."""
    gram = a @ a.T

    # keepdims yields shape (n, 1) so the divisions broadcast per row/column.
    vec_norms = np.linalg.norm(a, axis=-1, keepdims=True)

    # Row-wise, then column-wise division, both reusing the same buffer.
    gram /= vec_norms
    gram /= vec_norms.T

    return gram


# Broadcast division without in-place operations
def cos_sim_broadcast_self(a):
    """Broadcasted normalization that allocates a fresh matrix per division."""
    gram = a @ a.T

    # Shape (n, 1) so the divisions broadcast across columns / rows.
    vec_norms = np.linalg.norm(a, axis=-1, keepdims=True)

    # Each division creates a brand-new array (no in-place mutation).
    return (gram / vec_norms) / vec_norms.T


# Optimized: normalize vectors first, then just do dot product
def cos_sim_prenormalize_self(a):
    """Normalize the rows once; the Gram matrix of unit vectors IS the cosine matrix."""
    unit = a / np.linalg.norm(a, axis=-1, keepdims=True)
    # For unit vectors, dot product == cosine similarity.
    return unit @ unit.T


# Optimized: exploit symmetry (only compute upper triangle)
def cos_sim_symmetric_self(a):
    """Symmetry-aware variant that intentionally delegates to the full product.

    Filling only the upper triangle from Python and mirroring is usually
    slower than letting numpy/BLAS compute the whole (symmetric) matrix, so
    this ends up identical to the pre-normalize strategy.
    """
    unit = a / np.linalg.norm(a, axis=-1, keepdims=True)
    return unit @ unit.T


# Verify all implementations produce the same results
def test_correctness():
    """Every variant must agree with the original implementation."""
    reference = cos_sim_original_self(vectors)
    variants = [
        ("Nested loop", cos_sim_nested_loop_self),
        ("In-place", cos_sim_inplace_norm_self),
        ("Broadcast inplace", cos_sim_broadcast_inplace_self),
        ("Broadcast", cos_sim_broadcast_self),
        ("Pre-normalize", cos_sim_prenormalize_self),
        ("Symmetric", cos_sim_symmetric_self),
    ]
    for label, impl in variants:
        assert np.allclose(reference, impl(vectors), atol=1e-6), f"{label} mismatch"


# Benchmark tests
def test_bench_original_self(benchmark):
    """Original implementation (self-similarity)"""
    n = vectors.shape[0]
    sims = benchmark(cos_sim_original_self, vectors)
    assert sims.shape == (n, n)


def test_bench_nested_loop_self(benchmark):
    """Nested loop (properly implemented with norms calculated once)"""
    n = vectors.shape[0]
    sims = benchmark(cos_sim_nested_loop_self, vectors)
    assert sims.shape == (n, n)


def test_bench_inplace_norm_self(benchmark):
    """E*E^T with in-place normalization (self-similarity)"""
    n = vectors.shape[0]
    sims = benchmark(cos_sim_inplace_norm_self, vectors)
    assert sims.shape == (n, n)


def test_bench_broadcast_inplace_self(benchmark):
    """Broadcast with in-place operations (self-similarity)"""
    n = vectors.shape[0]
    sims = benchmark(cos_sim_broadcast_inplace_self, vectors)
    assert sims.shape == (n, n)


def test_bench_broadcast_self(benchmark):
    """Broadcast without in-place operations (self-similarity)"""
    n = vectors.shape[0]
    sims = benchmark(cos_sim_broadcast_self, vectors)
    assert sims.shape == (n, n)


def test_bench_prenormalize_self(benchmark):
    """Pre-normalize vectors first (self-similarity)"""
    n = vectors.shape[0]
    sims = benchmark(cos_sim_prenormalize_self, vectors)
    assert sims.shape == (n, n)


def test_bench_symmetric_self(benchmark):
    """Exploit symmetry (self-similarity)"""
    n = vectors.shape[0]
    sims = benchmark(cos_sim_symmetric_self, vectors)
    assert sims.shape == (n, n)