feat: experiment with different implementations of LSEQ
commit 1e45ef9314
23 changed files with 3578 additions and 0 deletions
research/src/algorithms/lseq_base64.rs (new file, 613 lines)
@@ -0,0 +1,613 @@
use rand::Rng;
use std::error::Error;
use std::fmt;
use log::{trace, debug};

const BOUNDARY: u64 = 40; // The paper says this can be any constant

// The maximum level is 9 because the maximum value of a level is 2^(6+6*9) - 1,
// which is 2^60 - 1, which fits in u64. At level 10, we would have 2^66 - 1,
// which exceeds u64 capacity.
const MAX_LEVEL: usize = 9;

// Python program used to generate LEVEL_DIGITS_LOOKUP:
// ```python
// def compute_level_digits():
//     digits = []
//     for i in range(10):
//         max_value = (64 * (64 ** i)) - 1  # 64^(i+1) - 1 = 2^(6+6*i) - 1
//         num_digits = len(str(max_value))
//         digits.append(num_digits)
//     return digits
//
// if __name__ == "__main__":
//     digits = compute_level_digits()
//     print(f"const LEVEL_DIGITS_LOOKUP: [usize; 10] = {digits};")
// ```

// Precomputed number of digits needed for each level (0-9)
// Level i has max value of 2^(6+6*i) - 1, so we need enough digits to represent that
const LEVEL_DIGITS_LOOKUP: [usize; 10] = [
    2, 4, 6, 8, 10, 11, 13, 15, 17, 19
];

/// L-SEQ implementation with 64 slots at level 0, multiplying by 64 at each deeper level
pub struct LSEQBase64<R: Rng + std::fmt::Debug> {
    /// Strategy vector - true for + strategy, false for - strategy
    strategies: Vec<bool>,
    /// Random number generator
    rng: R,
}

/// Sort key implementation for 64-slot L-SEQ
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct SortKeyBase64 {
    levels: Vec<u64>,
}

impl SortKeyBase64 {
    pub fn new(levels: Vec<u64>) -> Self {
        Self { levels }
    }

    pub fn levels(&self) -> &[u64] {
        &self.levels
    }

    /// Calculate the number of base64 characters needed for the maximally encoded form.
    /// In this compact encoding, level i needs exactly (i+1) base64 characters:
    /// - Level 0: 1 character (6 bits, 0-63)
    /// - Level 1: 2 characters (12 bits, 0-4095)
    /// - Level 2: 3 characters (18 bits, 0-262143)
    /// - etc.
    /// No separators are needed since we know the structure.
    pub fn max_base64_chars(&self) -> usize {
        self.levels.iter().enumerate().map(|(level, _)| level + 1).sum()
    }
}

/// Get the number of slots for a given level (64 * 64^level = 64^(level+1))
#[allow(dead_code)]
fn get_level_slots(level: usize) -> u64 {
    let base_slots = 64u64;
    let multiplier = 64u64.checked_pow(level as u32)
        .expect("Level exceeds u64 representation capacity");

    base_slots.checked_mul(multiplier)
        .expect("Level slots exceed u64 capacity")
}

impl fmt::Display for SortKeyBase64 {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let parts: Vec<String> = self.levels.iter().map(|&x| x.to_string()).collect();
        write!(f, "{}", parts.join("."))
    }
}

impl fmt::Debug for SortKeyBase64 {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let parts: Vec<String> = self.levels.iter().enumerate().map(|(level, &value)| {
            let digits = if level <= MAX_LEVEL {
                LEVEL_DIGITS_LOOKUP[level]
            } else {
                // For levels beyond MAX_LEVEL, use the same digit count as MAX_LEVEL
                // since we're capping at 2^60 - 1
                LEVEL_DIGITS_LOOKUP[MAX_LEVEL]
            };
            format!("{:0width$}", value, width = digits)
        }).collect();
        write!(f, "{}", parts.join("."))
    }
}

impl<R: Rng + std::fmt::Debug> LSEQBase64<R> {
    pub fn new(rng: R) -> Self {
        Self {
            strategies: Vec::new(),
            rng,
        }
    }

    /// Set strategies for testing purposes
    #[cfg(test)]
    pub fn set_strategies(&mut self, strategies: Vec<bool>) {
        self.strategies = strategies;
    }

    /// Allocate a new identifier between two existing identifiers
    pub fn allocate(&mut self, before: Option<&SortKeyBase64>, after: Option<&SortKeyBase64>) -> Result<SortKeyBase64, Box<dyn Error>> {
        // Convert to the format expected by the paper's algorithm
        let p = before.map_or(vec![0], |k| k.levels().to_vec());
        let q = after.map_or(vec![self.get_depth_max(0)], |k| k.levels().to_vec());

        let levels = self.alloc(&p, &q);
        let key = SortKeyBase64::new(levels);

        // Debug assertions to verify the allocated key is properly ordered
        if let Some(before_key) = before {
            debug_assert!(
                before_key < &key,
                "ORDERING VIOLATION: before < allocated failed\n\
                 before = {:?} (internal: {:?})\n\
                 allocated = {:?} (internal: {:?})\n\
                 after = {} (internal: {:?})\n\
                 Expected: before < allocated < after",
                before_key, before_key.levels(),
                key, key.levels(),
                after.map(|k| format!("{:?}", k)).unwrap_or_else(|| "None".to_string()),
                after.map(|k| k.levels()).unwrap_or(&[])
            );
        }

        if let Some(after_key) = after {
            debug_assert!(
                &key < after_key,
                "ORDERING VIOLATION: allocated < after failed\n\
                 before = {} (internal: {:?})\n\
                 allocated = {:?} (internal: {:?})\n\
                 after = {:?} (internal: {:?})\n\
                 Expected: before < allocated < after",
                before.map(|k| format!("{:?}", k)).unwrap_or_else(|| "None".to_string()),
                before.map(|k| k.levels()).unwrap_or(&[]),
                key, key.levels(),
                after_key, after_key.levels()
            );
        }

        Ok(key)
    }

    /// Get the maximum value for a given level (64^(level+1) - 1 = 2^(6+6*level) - 1)
    /// For levels beyond 9, we cap at 2^60 - 1 to avoid u64 overflow
    fn get_depth_max(&self, depth: usize) -> u64 {
        let max_val = if depth <= MAX_LEVEL {
            (1 << (6 + 6 * depth)) - 1
        } else {
            // Cap at 2^60 - 1 for levels beyond 9
            (1 << 60) - 1
        };
        trace!("get_depth_max({}) -> {}", depth, max_val);
        max_val
    }

    fn alloc(&mut self, p: &[u64], q: &[u64]) -> Vec<u64> {
        debug!("Starting allocation between p={:?} and q={:?}", p, q);
        if !(p.is_empty() && q.is_empty()) {
            debug_assert_ne!(p, q, "Cannot allocate between identical positions: p={:?}, q={:?}", p, q);
        }

        let mut borrow_flag = false;
        let max_levels = std::cmp::max(p.len(), q.len()) + 1;
        let mut result = Vec::with_capacity(max_levels);

        trace!("Initial state: carry_flag={}, max_levels={}", borrow_flag, max_levels);

        // Phase 1: Find the allocation depth
        for depth in 0..max_levels {
            trace!("=== Processing depth {} ===", depth);
            trace!("Current result so far: {:?}", result);
            trace!("Current carry_flag: {}", borrow_flag);

            if self.strategies.len() <= depth {
                let new_strategy = self.rng.gen_bool(0.5);
                trace!("BRANCH: Generating new strategy for depth {}: {} (+ strategy: {})",
                       depth, new_strategy, new_strategy);
                self.strategies.push(new_strategy);
            } else {
                trace!("Using existing strategy for depth {}: {} (+ strategy: {})",
                       depth, self.strategies[depth], self.strategies[depth]);
            }

            let p_val = if depth < p.len() {
                trace!("BRANCH: p_val from p[{}] = {}", depth, p[depth]);
                p[depth]
            } else {
                trace!("BRANCH: p_val defaulted to 0 (depth {} >= p.len() {})", depth, p.len());
                0
            };

            let q_val = if borrow_flag {
                let max_val = self.get_depth_max(depth);
                trace!("BRANCH: q_val from get_depth_max({}) = {} (carry_flag=true)", depth, max_val);
                max_val
            } else if depth < q.len() {
                trace!("BRANCH: q_val from q[{}] = {} (carry_flag=false)", depth, q[depth]);
                q[depth]
            } else {
                trace!("BRANCH: q_val defaulted to 0 (depth {} >= q.len() {}, carry_flag=false)", depth, q.len());
                0
            };

            trace!("At depth {}: p_val={}, q_val={}, gap={}", depth, p_val, q_val, q_val.saturating_sub(p_val));

            if p_val == q_val {
                trace!("BRANCH: Values equal at depth {} (p_val={}, q_val={}), extending prefix and going deeper",
                       depth, p_val, q_val);
                result.push(p_val);
                continue;
            }

            if q_val < p_val {
                trace!("BRANCH: ERROR - q_val < p_val at depth {} (q_val={}, p_val={})", depth, q_val, p_val);
                debug_assert!(q_val > p_val, "q < p at depth {}", depth);
                // We know that q > p overall, and we know that we had a shared
                // prefix up until this point, therefore q_val must be greater than p_val
                // TODO I might want to return an error here instead of panicking
            }

            let gap = q_val - p_val;
            if gap > 1 {
                // Enough space at this level
                trace!("BRANCH: Sufficient space found at depth {} (gap={} > 1)", depth, gap);
                let interval = gap - 1;
                let step = std::cmp::min(BOUNDARY, interval);

                let allocated_value = if self.strategies[depth] {
                    let delta = self.rng.gen_range(1..=step);
                    trace!("Space allocation: interval={}, step={}, delta={}", interval, step, delta);
                    let val = p_val + delta;
                    trace!("BRANCH: Using + strategy, allocated_value = p_val + delta = {} + {} = {}",
                           p_val, delta, val);
                    val
                } else {
                    let delta = if borrow_flag {
                        //self.rng.gen_range(0..step)
                        self.rng.gen_range(1..=step)
                    } else {
                        self.rng.gen_range(1..=step)
                    };
                    trace!("Space allocation: interval={}, step={}, delta={}", interval, step, delta);
                    let val = q_val - delta;
                    trace!("BRANCH: Using - strategy, allocated_value = q_val - delta = {} - {} = {}",
                           q_val, delta, val);
                    val
                };

                result.push(allocated_value);
                trace!("BRANCH: Allocation complete at depth {}, final result: {:?}", depth, result);
                return result;
            } else {
                trace!("BRANCH: Insufficient space at depth {} (gap={} <= 1), extending prefix and setting carry_flag",
                       depth, gap);
                result.push(p_val);
                borrow_flag = true;
                trace!("Updated state: result={:?}, carry_flag={}", result, borrow_flag);
            }
        }

        trace!("BRANCH: Loop completed without allocation, returning result: {:?}", result);
        result
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use rand::rngs::StdRng;
    use rand::SeedableRng;

    #[test]
    fn test_level_max() {
        let lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        // Level 0: 64 slots (0-63)
        assert_eq!(lseq.get_depth_max(0), 63);
        // Level 1: 4096 slots (0-4095)
        assert_eq!(lseq.get_depth_max(1), 4095);
        // Level 2: 262144 slots (0-262143)
        assert_eq!(lseq.get_depth_max(2), 262143);
        // Level 3: 16777216 slots (0-16777215)
        assert_eq!(lseq.get_depth_max(3), 16777215);
    }

    #[test]
    fn test_basic_allocation() {
        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        let key1 = lseq.allocate(None, None).unwrap();
        let key2 = lseq.allocate(Some(&key1), None).unwrap();
        let key3 = lseq.allocate(None, Some(&key1)).unwrap();

        assert!(key3 < key1);
        assert!(key1 < key2);
    }

    #[test]
    fn test_sort_key_ordering() {
        let key1 = SortKeyBase64::new(vec![5]);
        let key2 = SortKeyBase64::new(vec![5, 10]);
        let key3 = SortKeyBase64::new(vec![6]);

        assert!(key1 < key2);
        assert!(key2 < key3);
    }

    #[test]
    fn test_boundary_usage() {
        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        // Create keys with large gaps to test boundary limiting
        let key1 = SortKeyBase64::new(vec![0]);
        let key2 = SortKeyBase64::new(vec![63]);

        // Allocate between them - should use BOUNDARY to limit step
        let key_between = lseq.allocate(Some(&key1), Some(&key2)).unwrap();

        // The new key should be valid
        assert!(key1 < key_between);
        assert!(key_between < key2);
    }

    #[test]
    fn test_allocation_beyond_max_level() {
        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        // Create two identifiers that are identical at every level up to MAX_LEVEL,
        // but differ by 1 at the MAX_LEVEL position. This forces the algorithm
        // to keep going deeper beyond MAX_LEVEL.

        // Build p: [0, 0, 0, ..., 0, max_value_at_MAX_LEVEL - 1]
        let mut p = vec![0u64; MAX_LEVEL + 1];
        let max_value_at_max_level = (1u64 << (6 + 6 * MAX_LEVEL)) - 1;
        p[MAX_LEVEL] = max_value_at_max_level - 1;

        // Build q: [0, 0, 0, ..., 0, max_value_at_MAX_LEVEL]
        let mut q = vec![0u64; MAX_LEVEL + 1];
        q[MAX_LEVEL] = max_value_at_max_level;

        let p_key = SortKeyBase64::new(p);
        let q_key = SortKeyBase64::new(q);

        // This should now succeed by allocating at depth MAX_LEVEL + 1 with capped max value
        let allocated_key = lseq.allocate(Some(&p_key), Some(&q_key)).unwrap();

        // Verify the allocated key is properly ordered
        assert!(p_key < allocated_key, "p_key < allocated_key should be true");
        assert!(allocated_key < q_key, "allocated_key < q_key should be true");

        // The allocated key should be at least MAX_LEVEL + 2 levels deep
        assert!(allocated_key.levels().len() >= MAX_LEVEL + 2,
                "Allocated key should be at least {} levels deep, got {}",
                MAX_LEVEL + 2, allocated_key.levels().len());
    }

    #[test]
    fn test_formatting() {
        // Test with various values to verify digit padding
        let xs = vec![5, 6, 7, 8, 9];
        assert_eq!(SortKeyBase64::new(xs.clone()).to_string(), "5.6.7.8.9");
        assert_eq!(format!("{:?}", SortKeyBase64::new(xs)), "05.0006.000007.00000008.0000000009");

        let ys = vec![5, 10, 63, 127, 4095];
        assert_eq!(SortKeyBase64::new(ys.clone()).to_string(), "5.10.63.127.4095");
        assert_eq!(format!("{:?}", SortKeyBase64::new(ys)), "05.0010.000063.00000127.0000004095");
    }

    #[test]
    fn test_level_digits_lookup_correctness() {
        // Validate that our precomputed lookup table matches the actual calculation
        for i in 0..=MAX_LEVEL {
            let max_value = (1u64 << (6 + 6 * i)) - 1;
            let expected_digits = max_value.to_string().len();

            assert_eq!(
                LEVEL_DIGITS_LOOKUP[i],
                expected_digits,
                "Level {} digit count mismatch: lookup={}, calculated={}, max_value={}",
                i, LEVEL_DIGITS_LOOKUP[i], expected_digits, max_value
            );
        }
    }

    #[test]
    fn test_get_level_slots() {
        // Test that get_level_slots function works correctly
        assert_eq!(get_level_slots(0), 64);       // 64 * 64^0 = 64
        assert_eq!(get_level_slots(1), 4096);     // 64 * 64^1 = 4096
        assert_eq!(get_level_slots(2), 262144);   // 64 * 64^2 = 262144
        assert_eq!(get_level_slots(3), 16777216); // 64 * 64^3 = 16777216
    }

    #[test]
    fn test_max_base64_chars() {
        // Test the compact base64 encoding calculation (no separators)
        // Level i needs exactly (i+1) base64 characters in this encoding
        let key1 = SortKeyBase64::new(vec![5]); // Level 0 only
        assert_eq!(key1.max_base64_chars(), 1); // 1 character for level 0

        let key2 = SortKeyBase64::new(vec![5, 10]); // Levels 0 and 1
        assert_eq!(key2.max_base64_chars(), 3); // 1 + 2 characters for levels 0 and 1

        let key3 = SortKeyBase64::new(vec![5, 10, 15]); // Levels 0, 1, and 2
        assert_eq!(key3.max_base64_chars(), 6); // 1 + 2 + 3 characters for levels 0, 1, and 2

        let key4 = SortKeyBase64::new(vec![1, 2, 3, 4, 5]); // Levels 0-4
        assert_eq!(key4.max_base64_chars(), 15); // 1 + 2 + 3 + 4 + 5 = 15
    }

    #[test]
    fn test_reproduce_ordering_violation_bug() {
        // Initialize logger with trace level for this test
        let _ = env_logger::Builder::from_default_env()
            .filter_level(log::LevelFilter::Trace)
            .is_test(true)
            .try_init();

        // This test reproduces the exact bug found in random insertion:
        // ORDERING VIOLATION: allocated < after failed
        // before = "52.0034" (internal: [52, 34])
        // allocated = 52.0035.262119 (internal: [52, 35, 262119])
        // after = 52.0035 (internal: [52, 35])
        // Expected: before < allocated < after

        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        // Create the before and after keys from the bug report
        let before_key = SortKeyBase64::new(vec![52, 34]);
        let after_key = SortKeyBase64::new(vec![52, 35]);

        // Verify the keys are properly ordered before we start
        assert!(before_key < after_key, "Sanity check: before < after should be true");

        // Try to allocate between them - this should succeed and maintain ordering
        let allocated_key = lseq.allocate(Some(&before_key), Some(&after_key)).unwrap();

        // Verify the allocated key is properly ordered
        assert!(before_key < allocated_key, "before < allocated should be true, got before={:?}, allocated={:?}", before_key, allocated_key);
        assert!(allocated_key < after_key, "allocated < after should be true, got allocated={:?}, after={:?}", allocated_key, after_key);
    }

    #[test]
    fn test_reproduce_specific_ordering_violation_bug() {
        // Initialize logger with trace level for this test
        let _ = env_logger::Builder::from_default_env()
            .filter_level(log::LevelFilter::Trace)
            .is_test(true)
            .try_init();

        // This test reproduces a specific ordering violation bug found in random insertion:
        // ORDERING VIOLATION: before < allocated failed
        // before = 51.0038 (internal: [51, 38])
        // allocated = 51.0017 (internal: [51, 17])
        // after = 52 (internal: [52])
        // Expected: before < allocated < after

        // Create the before and after keys from the bug report
        let before_key = SortKeyBase64::new(vec![51, 38]);
        let after_key = SortKeyBase64::new(vec![52]);

        // Verify the keys are properly ordered before we start
        assert!(before_key < after_key, "Sanity check: before < after should be true");

        let mut violations_found = Vec::new();

        // Loop over 1000 different seeds to see if we can reproduce the failure
        for seed in 0..1000 {
            let mut lseq: LSEQBase64<StdRng> = LSEQBase64::new(StdRng::seed_from_u64(seed));

            // Initialize strategies to match the bug condition: [false, true, true]
            lseq.set_strategies(vec![false, true, true]);

            // Try to allocate between them
            match lseq.allocate(Some(&before_key), Some(&after_key)) {
                Ok(allocated_key) => {
                    // Check for ordering violations
                    let before_violation = !(before_key < allocated_key);
                    let after_violation = !(allocated_key < after_key);

                    if before_violation || after_violation {
                        violations_found.push((seed, allocated_key.clone(), before_violation, after_violation));

                        eprintln!("ORDERING VIOLATION found with seed {}:
                            before = {:?} (internal: {:?})
                            allocated = {:?} (internal: {:?})
                            after = {:?} (internal: {:?})
                            before_violation: {} (before < allocated = {})
                            after_violation: {} (allocated < after = {})",
                            seed,
                            before_key, before_key.levels(),
                            allocated_key, allocated_key.levels(),
                            after_key, after_key.levels(),
                            before_violation, before_key < allocated_key,
                            after_violation, allocated_key < after_key
                        );
                    }
                }
                Err(e) => {
                    eprintln!("Allocation failed with seed {}: {}", seed, e);
                }
            }
        }

        if !violations_found.is_empty() {
            panic!("Found {} ordering violations out of 1000 seeds tested. First violation was with seed {}",
                   violations_found.len(), violations_found[0].0);
        } else {
            println!("No ordering violations found across 1000 different seeds for the specific test case.");
        }
    }

    #[test]
    fn test_allocate_between_prefix_and_deep_extension() {
        // Initialize logger with trace level for this test
        let _ = env_logger::Builder::from_default_env()
            .filter_level(log::LevelFilter::Trace)
            .is_test(true)
            .try_init();

        // Test allocating between [3] and [3, 0, 0, 0, 2]
        // This tests the case where we have a short key and a longer key that extends it deeply
        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        let before_key = SortKeyBase64::new(vec![3]);
        let after_key = SortKeyBase64::new(vec![3, 0, 0, 0, 2]);

        // Verify the keys are properly ordered before we start
        assert!(before_key < after_key, "Sanity check: before < after should be true");

        // Allocate between them
        let allocated_key = lseq.allocate(Some(&before_key), Some(&after_key)).unwrap();

        // Verify the allocated key is properly ordered
        assert!(before_key < allocated_key,
                "before < allocated should be true, got before={:?}, allocated={:?}",
                before_key, allocated_key);
        assert!(allocated_key < after_key,
                "allocated < after should be true, got allocated={:?}, after={:?}",
                allocated_key, after_key);

        // The allocated key should start with [3] since that's the common prefix
        assert_eq!(allocated_key.levels()[0], 3, "Allocated key should start with 3");

        // The allocated key should be exactly 5 levels deep to fit between [3] and [3, 0, 0, 0, 2]
        assert_eq!(allocated_key.levels().len(), 5,
                   "Allocated key should be 5 levels deep, got {:?}", allocated_key.levels());

        println!("Successfully allocated between [3] and [3, 0, 0, 0, 2]: {:?}", allocated_key);
    }

    #[test]
    fn test_allocate_between_max_value_and_next_level() {
        // Initialize logger with trace level for this test
        let _ = env_logger::Builder::from_default_env()
            .filter_level(log::LevelFilter::Trace)
            .is_test(true)
            .try_init();

        // Test allocating between [2, 64^2 - 1] and [3, 0]
        // This tests suffix space allocation when the before key has the max value at a level
        let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(42));

        let level_1_max = 64u64.pow(2) - 1; // 4095
        let before_key = SortKeyBase64::new(vec![2, level_1_max]);
        let after_key = SortKeyBase64::new(vec![3, 0]);

        // Verify the keys are properly ordered before we start
        assert!(before_key < after_key, "Sanity check: before < after should be true");

        // Allocate between them
        let allocated_key = lseq.allocate(Some(&before_key), Some(&after_key)).unwrap();

        // Verify the allocated key is properly ordered
        assert!(before_key < allocated_key,
                "before < allocated should be true, got before={:?}, allocated={:?}",
                before_key, allocated_key);
        assert!(allocated_key < after_key,
                "allocated < after should be true, got allocated={:?}, after={:?}",
                allocated_key, after_key);

        // Since [2] and [3] differ by 1, we should be allocating in suffix space after [2, 4095]
        // The allocated key should start with [2, 4095] as prefix
        assert_eq!(allocated_key.levels()[0], 2, "Allocated key should start with 2");
        assert_eq!(allocated_key.levels()[1], level_1_max, "Allocated key should have max value at level 1");

        // The allocated key should be at least 3 levels deep for suffix space allocation
        assert!(allocated_key.levels().len() >= 3,
                "Allocated key should be at least 3 levels deep for suffix allocation, got {:?}",
                allocated_key.levels());

        println!("Successfully allocated between [2, {}] and [3, 0]: {:?}", level_1_max, allocated_key);
    }
}
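For orientation, here is a minimal usage sketch of the base-64 variant above. It is not part of the commit: the crate path is borrowed from the analyzer binary later in this diff (`peoplesgrocers_lseq_research`), and the RNG setup mirrors the tests.

```rust
use peoplesgrocers_lseq_research::algorithms::lseq_base64::LSEQBase64;
use rand::rngs::StdRng;
use rand::SeedableRng;

fn main() {
    // Deterministic RNG so repeated runs allocate the same keys.
    let mut lseq = LSEQBase64::new(StdRng::seed_from_u64(7));

    // First key, then a key after it, then a key squeezed between the two.
    let first = lseq.allocate(None, None).unwrap();
    let last = lseq.allocate(Some(&first), None).unwrap();
    let middle = lseq.allocate(Some(&first), Some(&last)).unwrap();

    assert!(first < middle && middle < last);

    // Display prints dot-separated level values; max_base64_chars() reports how
    // many characters the compact encoding described in the doc comments needs.
    println!("{} -> {} base64 chars", middle, middle.max_base64_chars());
}
```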
research/src/algorithms/mod.rs (new file, 5 lines)
@@ -0,0 +1,5 @@
pub mod original_paper_reference_impl;
pub mod lseq_base64;

pub use original_paper_reference_impl::ReferenceLSEQ;
pub use lseq_base64::LSEQBase64;
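These re-exports let downstream code import either the flat names or the full module paths. A hypothetical consumer, purely illustrative (the crate name comes from the analyzer binary below; nothing here is in the commit):

```rust
use peoplesgrocers_lseq_research::algorithms::{LSEQBase64, ReferenceLSEQ}; // flat re-exports
use peoplesgrocers_lseq_research::algorithms::lseq_base64::SortKeyBase64;  // or the full path
use rand::rngs::StdRng;
use rand::SeedableRng;

fn main() {
    let mut base64 = LSEQBase64::new(StdRng::seed_from_u64(1));
    let mut reference = ReferenceLSEQ::new(StdRng::seed_from_u64(1));
    let a: SortKeyBase64 = base64.allocate(None, None).unwrap();
    let b = reference.allocate(None, None).unwrap();
    println!("{a} vs {b}");
}
```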
research/src/algorithms/original_paper_reference_impl.rs (new file, 501 lines)
@@ -0,0 +1,501 @@
use rand::Rng;
use std::error::Error;
use std::fmt;
use log::{trace, debug};

const BOUNDARY: u64 = 10; // The paper says this can be any constant

// The maximum level is 58 because the maximum value of a level is 2^(4+58) - 1,
// which is 2^62 - 1 and still fits in a signed 64-bit value, since the coding
// below is lazy and uses i64 to keep track of sign. This could be pushed to 59
// if we used u64 for the calculations.
const MAX_LEVEL: usize = 58;

// Python program used to generate LEVEL_DIGITS_LOOKUP:
// ```python
// def compute_level_digits():
//     digits = []
//     for i in range(59):
//         max_value = (16 * (2 ** i)) - 1  # 2^(4+i) - 1
//         num_digits = len(str(max_value))
//         digits.append(num_digits)
//     return digits
//
// if __name__ == "__main__":
//     digits = compute_level_digits()
//     print(f"const LEVEL_DIGITS_LOOKUP: [usize; 59] = {digits};")
// ```

// Precomputed number of digits needed for each level (0-58)
// Level i has max value of 2^(4+i) - 1, so we need enough digits to represent that
const LEVEL_DIGITS_LOOKUP: [usize; 59] = [
    2, 2, 2, 3, 3, 3, 4, 4, 4, 4,
    5, 5, 5, 6, 6, 6, 7, 7, 7, 7,
    8, 8, 8, 9, 9, 9, 10, 10, 10, 10,
    11, 11, 11, 12, 12, 12, 13, 13, 13, 13,
    14, 14, 14, 15, 15, 15, 16, 16, 16, 16,
    17, 17, 17, 18, 18, 18, 19, 19, 19,
];

/// Reference implementation of L-SEQ following the original paper
/// This is a direct, naive translation without optimizations
pub struct ReferenceLSEQ<R: Rng> {
    /// Strategy vector - true for + strategy, false for - strategy
    strategies: Vec<bool>,
    /// Random number generator
    rng: R,
}

/// Reference sort key implementation for the original paper
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct ReferenceSortKey {
    levels: Vec<u64>,
}

impl ReferenceSortKey {
    pub fn new(levels: Vec<u64>) -> Self {
        Self { levels }
    }

    pub fn levels(&self) -> &[u64] {
        &self.levels
    }

    /// Calculate the number of base64 characters needed to encode the full identifier.
    /// In this compact encoding, we pack all level bits together without separators:
    /// - Level 0: 4 bits (0-15)
    /// - Level 1: 5 bits (0-31)
    /// - Level 2: 6 bits (0-63)
    /// - etc.
    /// We sum all bits and encode as base64 (6 bits per character, rounding up).
    pub fn base64_chars_needed(&self) -> usize {
        let total_bits: usize = self.levels.iter().enumerate()
            .map(|(level, _)| 4 + level)
            .sum();

        // Round up to a whole number of base64 characters (6 bits each)
        (total_bits + 5) / 6
    }
}

impl fmt::Display for ReferenceSortKey {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let parts: Vec<String> = self.levels.iter().map(|&x| x.to_string()).collect();
        write!(f, "{}", parts.join("."))
    }
}

impl fmt::Debug for ReferenceSortKey {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let parts: Vec<String> = self.levels.iter().enumerate().map(|(level, &value)| {
            if level > MAX_LEVEL {
                panic!("Level exceeds u64 representation capacity");
            }
            let digits = LEVEL_DIGITS_LOOKUP[level];
            format!("{:0width$}", value, width = digits)
        }).collect();
        write!(f, "{}", parts.join("."))
    }
}

impl<R: Rng> ReferenceLSEQ<R> {
    pub fn new(rng: R) -> Self {
        Self {
            strategies: Vec::new(),
            rng,
        }
    }

    /// Set strategies for testing purposes
    #[cfg(test)]
    pub fn set_strategies(&mut self, strategies: Vec<bool>) {
        self.strategies = strategies;
    }

    /// Allocate a new identifier between two existing identifiers
    pub fn allocate(&mut self, before: Option<&ReferenceSortKey>, after: Option<&ReferenceSortKey>) -> Result<ReferenceSortKey, Box<dyn Error>> {
        // Convert to the format expected by the paper's algorithm
        let p = before.map_or(vec![0], |k| k.levels().to_vec());
        let q = after.map_or(vec![self.get_depth_max(0)], |k| k.levels().to_vec());

        let levels = self.alloc(&p, &q);
        let key = ReferenceSortKey::new(levels);

        // Debug assertions to verify the allocated key is properly ordered
        if let Some(before_key) = before {
            debug_assert!(
                before_key < &key,
                "ORDERING VIOLATION: before < allocated failed\n\
                 before = {:?} (internal: {:?})\n\
                 allocated = {:?} (internal: {:?})\n\
                 after = {} (internal: {:?})\n\
                 Expected: before < allocated < after",
                before_key, before_key.levels(),
                key, key.levels(),
                after.map(|k| format!("{:?}", k)).unwrap_or_else(|| "None".to_string()),
                after.map(|k| k.levels()).unwrap_or(&[])
            );
        }

        if let Some(after_key) = after {
            debug_assert!(
                &key < after_key,
                "ORDERING VIOLATION: allocated < after failed\n\
                 before = {} (internal: {:?})\n\
                 allocated = {:?} (internal: {:?})\n\
                 after = {:?} (internal: {:?})\n\
                 Expected: before < allocated < after",
                before.map(|k| format!("{:?}", k)).unwrap_or_else(|| "None".to_string()),
                before.map(|k| k.levels()).unwrap_or(&[]),
                key, key.levels(),
                after_key, after_key.levels()
            );
        }

        Ok(key)
    }

    /// Get the maximum value for a given level (16 * 2^level - 1)
    /// For levels beyond MAX_LEVEL, we cap at 2^62 - 1 to avoid u64 overflow
    fn get_depth_max(&self, depth: usize) -> u64 {
        let max_val = if depth <= MAX_LEVEL {
            (1 << (4 + depth)) - 1
        } else {
            // Cap at 2^62 - 1 for levels beyond MAX_LEVEL
            (1 << 62) - 1
        };
        trace!("get_depth_max({}) -> {}", depth, max_val);
        max_val
    }

    fn alloc(&mut self, p: &[u64], q: &[u64]) -> Vec<u64> {
        debug!("Starting allocation between p={:?} and q={:?}", p, q);
        if !(p.is_empty() && q.is_empty()) {
            debug_assert_ne!(p, q, "Cannot allocate between identical positions: p={:?}, q={:?}", p, q);
        }

        let mut borrow_flag = false;
        let max_levels = std::cmp::max(p.len(), q.len()) + 1;
        let mut result = Vec::with_capacity(max_levels);

        trace!("Initial state: carry_flag={}, max_levels={}", borrow_flag, max_levels);

        // Phase 1: Find the allocation depth using continued fraction approach
        for depth in 0..max_levels {
            trace!("=== Processing depth {} ===", depth);
            trace!("Current result so far: {:?}", result);
            trace!("Current carry_flag: {}", borrow_flag);

            if self.strategies.len() <= depth {
                let new_strategy = self.rng.gen_bool(0.5);
                trace!("BRANCH: Generating new strategy for depth {}: {} (+ strategy: {})",
                       depth, new_strategy, new_strategy);
                self.strategies.push(new_strategy);
            } else {
                trace!("Using existing strategy for depth {}: {} (+ strategy: {})",
                       depth, self.strategies[depth], self.strategies[depth]);
            }

            let p_val = if depth < p.len() {
                trace!("BRANCH: p_val from p[{}] = {}", depth, p[depth]);
                p[depth]
            } else {
                trace!("BRANCH: p_val defaulted to 0 (depth {} >= p.len() {})", depth, p.len());
                0
            };

            let q_val = if borrow_flag {
                let max_val = self.get_depth_max(depth);
                trace!("BRANCH: q_val from get_depth_max({}) = {} (carry_flag=true)", depth, max_val);
                max_val
            } else if depth < q.len() {
                trace!("BRANCH: q_val from q[{}] = {} (carry_flag=false)", depth, q[depth]);
                q[depth]
            } else {
                trace!("BRANCH: q_val defaulted to 0 (depth {} >= q.len() {}, carry_flag=false)", depth, q.len());
                0
            };

            trace!("At depth {}: p_val={}, q_val={}, gap={}", depth, p_val, q_val, q_val.saturating_sub(p_val));

            if p_val == q_val {
                trace!("BRANCH: Values equal at depth {} (p_val={}, q_val={}), extending prefix and going deeper",
                       depth, p_val, q_val);
                result.push(p_val);
                continue;
            }

            if q_val < p_val {
                trace!("BRANCH: ERROR - q_val < p_val at depth {} (q_val={}, p_val={})", depth, q_val, p_val);
                debug_assert!(q_val > p_val, "q < p at depth {}", depth);
                // We know that q > p overall, and we know that we had a shared
                // prefix up until this point, therefore q_val must be greater than p_val
                // TODO I might want to return an error here instead of panicking
            }

            let gap = q_val - p_val;
            if gap > 1 {
                // Enough space at this level
                trace!("BRANCH: Sufficient space found at depth {} (gap={} > 1)", depth, gap);
                let interval = gap - 1;
                let step = std::cmp::min(BOUNDARY, interval);

                let allocated_value = if self.strategies[depth] {
                    let delta = self.rng.gen_range(1..=step);
                    trace!("Space allocation: interval={}, step={}, delta={}", interval, step, delta);
                    let val = p_val + delta;
                    trace!("BRANCH: Using + strategy, allocated_value = p_val + delta = {} + {} = {}",
                           p_val, delta, val);
                    val
                } else {
                    let delta = if borrow_flag {
                        self.rng.gen_range(1..=step)
                    } else {
                        self.rng.gen_range(1..=step)
                    };
                    trace!("Space allocation: interval={}, step={}, delta={}", interval, step, delta);
                    let val = q_val - delta;
                    trace!("BRANCH: Using - strategy, allocated_value = q_val - delta = {} - {} = {}",
                           q_val, delta, val);
                    val
                };

                result.push(allocated_value);
                trace!("BRANCH: Allocation complete at depth {}, final result: {:?}", depth, result);
                return result;
            } else {
                trace!("BRANCH: Insufficient space at depth {} (gap={} <= 1), extending prefix and setting carry_flag",
                       depth, gap);
                result.push(p_val);
                borrow_flag = true;
                trace!("Updated state: result={:?}, carry_flag={}", result, borrow_flag);
            }
        }

        trace!("BRANCH: Loop completed without allocation, returning result: {:?}", result);
        result
    }
}

/// Get the number of slots for a given level (16 * 2^level)
#[allow(dead_code)]
fn get_level_slots(level: usize) -> u64 {
    let base_slots = 16u64;
    let multiplier = 2u64.checked_pow(level as u32)
        .expect("Level exceeds u64 representation capacity");

    base_slots.checked_mul(multiplier)
        .expect("Level slots exceed u64 capacity")
}

#[cfg(test)]
mod tests {
    use super::*;
    use rand::rngs::StdRng;
    use rand::SeedableRng;

    #[test]
    fn test_level_max() {
        let lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42));

        assert_eq!(lseq.get_depth_max(0), 15);
        assert_eq!(lseq.get_depth_max(1), 31);
        assert_eq!(lseq.get_depth_max(2), 63);
        assert_eq!(lseq.get_depth_max(3), 127);
    }

    #[test]
    fn test_basic_allocation() {
        let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42));

        let key1 = lseq.allocate(None, None).unwrap();
        let key2 = lseq.allocate(Some(&key1), None).unwrap();
        let key3 = lseq.allocate(None, Some(&key1)).unwrap();

        assert!(key3 < key1);
        assert!(key1 < key2);
    }

    #[test]
    fn test_sort_key_ordering() {
        let key1 = ReferenceSortKey::new(vec![5]);
        let key2 = ReferenceSortKey::new(vec![5, 10]);
        let key3 = ReferenceSortKey::new(vec![6]);

        assert!(key1 < key2);
        assert!(key2 < key3);
    }

    #[test]
    fn test_boundary_usage() {
        let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42));

        // Create keys with large gaps to test boundary limiting
        let key1 = ReferenceSortKey::new(vec![0]);
        let key2 = ReferenceSortKey::new(vec![15]);

        // Allocate between them - should use BOUNDARY to limit step
        let key_between = lseq.allocate(Some(&key1), Some(&key2)).unwrap();

        // The new key should be valid
        assert!(key1 < key_between);
        assert!(key_between < key2);
    }

    #[test]
    fn test_allocation_beyond_max_level() {
        let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42));

        // Create two identifiers that are identical at every level up to MAX_LEVEL,
        // but differ by 1 at the MAX_LEVEL position. This forces the algorithm
        // to keep going deeper beyond MAX_LEVEL.

        // Build p: [0, 0, 0, ..., 0, max_value_at_MAX_LEVEL - 1]
        let mut p = vec![0u64; MAX_LEVEL + 1];
        let max_value_at_max_level = (1u64 << (4 + MAX_LEVEL)) - 1;
        p[MAX_LEVEL] = max_value_at_max_level - 1;

        // Build q: [0, 0, 0, ..., 0, max_value_at_MAX_LEVEL]
        let mut q = vec![0u64; MAX_LEVEL + 1];
        q[MAX_LEVEL] = max_value_at_max_level;

        let p_key = ReferenceSortKey::new(p);
        let q_key = ReferenceSortKey::new(q);

        // This should now succeed by allocating at depth MAX_LEVEL + 1 with capped max value
        let allocated_key = lseq.allocate(Some(&p_key), Some(&q_key)).unwrap();

        // Verify the allocated key is properly ordered
        assert!(p_key < allocated_key, "p_key < allocated_key should be true");
        assert!(allocated_key < q_key, "allocated_key < q_key should be true");

        // The allocated key should be at least MAX_LEVEL + 2 levels deep
        assert!(allocated_key.levels().len() >= MAX_LEVEL + 2,
                "Allocated key should be at least {} levels deep, got {}",
                MAX_LEVEL + 2, allocated_key.levels().len());
    }

    #[test]
    fn test_formatting() {
        // Test with values that need 3 digits at the 4th level (128 slots)

        let xs = vec![5, 6, 7, 8, 9];
        assert_eq!(ReferenceSortKey::new(xs.clone()).to_string(), "5.6.7.8.9");
        assert_eq!(format!("{:?}", ReferenceSortKey::new(xs)), "05.06.07.008.009");

        let ys = vec![5, 10, 63, 127];
        assert_eq!(ReferenceSortKey::new(ys.clone()).to_string(), "5.10.63.127");
        assert_eq!(format!("{:?}", ReferenceSortKey::new(ys)), "05.10.63.127");
    }

    #[test]
    fn test_level_digits_lookup_correctness() {
        // Validate that our precomputed lookup table matches the actual calculation
        for i in 0..=MAX_LEVEL {
            let max_value = (1u64 << (4 + i)) - 1;
            let expected_digits = max_value.to_string().len();

            assert_eq!(
                LEVEL_DIGITS_LOOKUP[i],
                expected_digits,
                "Level {} digit count mismatch: lookup={}, calculated={}, max_value={}",
                i, LEVEL_DIGITS_LOOKUP[i], expected_digits, max_value
            );
        }
    }

    #[test]
    fn test_base64_chars_needed() {
        // Test the compact base64 encoding calculation (no separators)
        let key1 = ReferenceSortKey::new(vec![5]); // Level 0 only: 4 bits
        assert_eq!(key1.base64_chars_needed(), 1); // 4 bits -> 1 base64 character

        let key2 = ReferenceSortKey::new(vec![5, 10]); // Levels 0 and 1: 4 + 5 = 9 bits
        assert_eq!(key2.base64_chars_needed(), 2); // 9 bits -> 2 base64 characters

        let key3 = ReferenceSortKey::new(vec![5, 10, 15]); // Levels 0, 1, and 2: 4 + 5 + 6 = 15 bits
        assert_eq!(key3.base64_chars_needed(), 3); // 15 bits -> 3 base64 characters

        let key4 = ReferenceSortKey::new(vec![1, 2, 3, 4, 5]); // Levels 0-4: 4 + 5 + 6 + 7 + 8 = 30 bits
        assert_eq!(key4.base64_chars_needed(), 5); // 30 bits -> 5 base64 characters

        // Test a 6-level key whose bit count is not a multiple of 6
        let key5 = ReferenceSortKey::new(vec![1, 2, 3, 4, 5, 6]); // Levels 0-5: 4 + 5 + 6 + 7 + 8 + 9 = 39 bits
        assert_eq!(key5.base64_chars_needed(), 7); // 39 bits -> 7 base64 characters (rounded up from 6.5)

        // Test a 7-level key
        let key6 = ReferenceSortKey::new(vec![1, 2, 3, 4, 5, 6, 7]); // Levels 0-6: 4+5+6+7+8+9+10 = 49 bits
        assert_eq!(key6.base64_chars_needed(), 9); // 49 bits -> 9 base64 characters (rounded up from 8.17)
    }

    #[test]
    fn test_continued_fraction_ordering_validation() {
        // Initialize logger with trace level for this test
        let _ = env_logger::Builder::from_default_env()
            .filter_level(log::LevelFilter::Trace)
            .is_test(true)
            .try_init();

        // Test the continued fraction approach with adjacent identifiers
        let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42));

        // Create adjacent keys that need to use the continued fraction approach
        let before_key = ReferenceSortKey::new(vec![5, 10]);
        let after_key = ReferenceSortKey::new(vec![5, 11]);

        // Verify the keys are properly ordered before we start
        assert!(before_key < after_key, "Sanity check: before < after should be true");

        // Try to allocate between them - this should succeed using the continued fraction approach
        let allocated_key = lseq.allocate(Some(&before_key), Some(&after_key)).unwrap();

        // Verify the allocated key is properly ordered
        assert!(before_key < allocated_key, "before < allocated should be true, got before={:?}, allocated={:?}", before_key, allocated_key);
        assert!(allocated_key < after_key, "allocated < after should be true, got allocated={:?}, after={:?}", allocated_key, after_key);

        // The allocated key should be at least 3 levels deep since there's no space at level 1
        assert!(allocated_key.levels().len() >= 3,
                "Allocated key should be at least 3 levels deep for continued fraction, got {:?}",
                allocated_key.levels());
    }

    #[test]
    fn test_allocate_between_prefix_and_deep_extension() {
        // Initialize logger with trace level for this test
        let _ = env_logger::Builder::from_default_env()
            .filter_level(log::LevelFilter::Trace)
            .is_test(true)
            .try_init();

        // Test allocating between [3] and [3, 0, 0, 0, 2]
        // This tests the case where we have a short key and a longer key that extends it deeply
        let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42));

        let before_key = ReferenceSortKey::new(vec![3]);
        let after_key = ReferenceSortKey::new(vec![3, 0, 0, 0, 2]);

        // Verify the keys are properly ordered before we start
        assert!(before_key < after_key, "Sanity check: before < after should be true");

        // Allocate between them
        let allocated_key = lseq.allocate(Some(&before_key), Some(&after_key)).unwrap();

        // Verify the allocated key is properly ordered
        assert!(before_key < allocated_key,
                "before < allocated should be true, got before={:?}, allocated={:?}",
                before_key, allocated_key);
        assert!(allocated_key < after_key,
                "allocated < after should be true, got allocated={:?}, after={:?}",
                allocated_key, after_key);

        // The allocated key should start with [3] since that's the common prefix
        assert_eq!(allocated_key.levels()[0], 3, "Allocated key should start with 3");

        // The allocated key should be exactly 5 levels deep to fit between [3] and [3, 0, 0, 0, 2]
        assert_eq!(allocated_key.levels().len(), 5,
                   "Allocated key should be 5 levels deep, got {:?}", allocated_key.levels());

        println!("Successfully allocated between [3] and [3, 0, 0, 0, 2]: {:?}", allocated_key);
    }
}
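The two implementations above imply different worst-case key sizes. The sketch below is not part of the commit; it simply tabulates the two size formulas stated in the doc comments on `max_base64_chars()` and `base64_chars_needed()` for a key that occupies a given number of levels.

```rust
/// Characters for a key that uses `depth` levels in the 64-slot variant:
/// level i costs i + 1 base64 characters.
fn base64_variant_chars(depth: usize) -> usize {
    (0..depth).map(|i| i + 1).sum()
}

/// Characters for the same depth in the reference variant: level i costs
/// 4 + i bits, and the packed bits round up to whole base64 characters.
fn reference_chars(depth: usize) -> usize {
    let bits: usize = (0..depth).map(|i| 4 + i).sum();
    (bits + 5) / 6
}

fn main() {
    for depth in 1..=6 {
        println!(
            "depth {}: base64 variant = {} chars, reference = {} chars",
            depth,
            base64_variant_chars(depth),
            reference_chars(depth)
        );
    }
    // Depth 3, for example: 1 + 2 + 3 = 6 chars vs ceil((4 + 5 + 6) / 6) = 3 chars,
    // matching the expectations in test_max_base64_chars and test_base64_chars_needed.
}
```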
research/src/bin/encoding_analyzer.rs (new file, 373 lines)
@@ -0,0 +1,373 @@
/*!
# L-SEQ Encoding Analysis Tool

This binary demonstrates the encoding efficiency analysis for L-SEQ algorithms.

It allocates a large number of identifiers (configurable, default 10,000) and shows:
- Base64 encoding size histograms
- Comparison between different L-SEQ variants
- Statistics useful for real-world deployment decisions

## Usage

```bash
cargo run --bin encoding_analyzer
cargo run --bin encoding_analyzer -- --count 1000000
cargo run --bin encoding_analyzer -- --count 10000 --insertion-mode random
cargo run --bin encoding_analyzer -- --count 10000 --insertion-mode tail
cargo run --bin encoding_analyzer -- --count 10000 --insertion-mode head
```

## Options

- `--count <number>`: Number of identifiers to generate (default: 10000)
- `--insertion-mode <mode>`: 'tail' for sequential insertion, 'random' for random insertion, or 'head' for head insertion (default: tail)
*/

use std::env;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use peoplesgrocers_lseq_research::algorithms::lseq_base64::{LSEQBase64, SortKeyBase64};
use peoplesgrocers_lseq_research::algorithms::original_paper_reference_impl::{ReferenceLSEQ, ReferenceSortKey};
use peoplesgrocers_lseq_research::encoding_analysis::{analyze_base64_encoding, analyze_reference_encoding, compare_encodings};

#[derive(Debug, Clone, PartialEq)]
enum InsertionMode {
    Tail,
    Random,
    Head,
}

impl InsertionMode {
    fn from_str(s: &str) -> Result<Self, &'static str> {
        match s.to_lowercase().as_str() {
            "tail" => Ok(InsertionMode::Tail),
            "random" => Ok(InsertionMode::Random),
            "head" => Ok(InsertionMode::Head),
            _ => Err("Invalid insertion mode. Use 'tail', 'random', or 'head'"),
        }
    }
}

/// Verify that all keys are sorted in proper order
fn verify_sorted_base64(keys: &[SortKeyBase64]) -> Result<(), String> {
    for i in 1..keys.len() {
        if keys[i-1] >= keys[i] {
            return Err(format!(
                "I expected key at position {} to be smaller than key at position {}\n\
                 [{}] = {:?} (internal: {:?})\n\
                 [{}] = {:?} (internal: {:?})\n\
                 But {:?} >= {:?}",
                i-1, i,
                i-1, keys[i-1], keys[i-1].levels(),
                i, keys[i], keys[i].levels(),
                keys[i-1], keys[i]
            ));
        }
    }
    Ok(())
}

/// Verify that all keys are sorted in proper order
#[allow(dead_code)]
fn verify_sorted_reference(keys: &[ReferenceSortKey]) -> Result<(), String> {
    for i in 1..keys.len() {
        if keys[i-1] >= keys[i] {
            return Err(format!(
                "I expected key at position {} to be smaller than key at position {}\n\
                 [{}] = {:?} (internal: {:?})\n\
                 [{}] = {:?} (internal: {:?})\n\
                 But {:?} >= {:?}",
                i-1, i,
                i-1, keys[i-1], keys[i-1].levels(),
                i, keys[i], keys[i].levels(),
                keys[i-1], keys[i]
            ));
        }
    }
    Ok(())
}

/// Generate random insertion positions for consistent comparison
fn generate_insertion_positions(count: usize, rng: &mut StdRng) -> Vec<usize> {
    let mut positions = Vec::new();

    for i in 0..count {
        if i == 0 {
            positions.push(0); // First element always goes at position 0
        } else {
            // Insert after position 0 to i-1 (current list has i elements)
            positions.push(rng.gen_range(0..i));
        }
    }

    positions
}

/// Generate identifiers using tail insertion
fn generate_tail_insertion_base64(count: usize, rng: StdRng) -> Vec<SortKeyBase64> {
    let mut keys = Vec::new();
    let mut lseq = LSEQBase64::new(rng);

    for i in 0..count {
        let before = if i == 0 {
            None
        } else {
            Some(&keys[i - 1])
        };

        let key = lseq.allocate(before, None).unwrap();
        keys.push(key);
    }

    keys
}

/// Generate identifiers using tail insertion
fn generate_tail_insertion_reference(count: usize, rng: StdRng) -> Vec<ReferenceSortKey> {
    let mut keys = Vec::new();
    let mut lseq = ReferenceLSEQ::new(rng);

    for i in 0..count {
        let before = if i == 0 {
            None
        } else {
            Some(&keys[i - 1])
        };

        let key = lseq.allocate(before, None).unwrap();
        keys.push(key);
    }

    keys
}

/// Generate identifiers using head insertion
fn generate_head_insertion_base64(count: usize, rng: StdRng) -> Vec<SortKeyBase64> {
    let mut keys = Vec::new();
    let mut lseq = LSEQBase64::new(rng);

    for i in 0..count {
        let after = if i == 0 {
            None
        } else {
            Some(&keys[0])
        };

        let key = lseq.allocate(None, after).unwrap();
        keys.insert(0, key);
    }

    keys
}

/// Generate identifiers using head insertion
fn generate_head_insertion_reference(count: usize, rng: StdRng) -> Vec<ReferenceSortKey> {
    let mut keys = Vec::new();
    let mut lseq = ReferenceLSEQ::new(rng);

    for i in 0..count {
        let after = if i == 0 {
            None
        } else {
            Some(&keys[0])
        };

        let key = lseq.allocate(None, after).unwrap();
        keys.insert(0, key);
    }

    keys
}

/// Generate identifiers using random insertion at the same positions
|
||||
fn generate_random_insertion_base64(count: usize, positions: &[usize], rng: StdRng) -> Vec<SortKeyBase64> {
|
||||
let mut keys = Vec::new();
|
||||
let mut lseq = LSEQBase64::new(rng);
|
||||
|
||||
for i in 0..count {
|
||||
eprintln!("Generating key {} of {}", i, count);
|
||||
let insert_after_pos = positions[i];
|
||||
|
||||
// We want to insert after position insert_after_pos
|
||||
// before = element at insert_after_pos (if valid)
|
||||
// after = element at insert_after_pos + 1 (if valid)
|
||||
// insert at position insert_after_pos + 1
|
||||
|
||||
let before = if insert_after_pos >= keys.len() {
|
||||
// If insert_after_pos is beyond the end, insert at the end
|
||||
keys.last()
|
||||
} else {
|
||||
Some(&keys[insert_after_pos])
|
||||
};
|
||||
|
||||
let after = if insert_after_pos + 1 >= keys.len() {
|
||||
None
|
||||
} else {
|
||||
Some(&keys[insert_after_pos + 1])
|
||||
};
|
||||
|
||||
eprintln!("before: {:?}, after: {:?}", before, after);
|
||||
let key = lseq.allocate(before, after).unwrap();
|
||||
let insert_pos = std::cmp::min(insert_after_pos + 1, keys.len());
|
||||
keys.insert(insert_pos, key);
|
||||
}
|
||||
|
||||
keys
|
||||
}
|
||||
|
||||
/// Generate identifiers using random insertion at the same positions
|
||||
fn generate_random_insertion_reference(count: usize, positions: &[usize], rng: StdRng) -> Vec<ReferenceSortKey> {
|
||||
let mut keys = Vec::new();
|
||||
let mut lseq = ReferenceLSEQ::new(rng);
|
||||
|
||||
for i in 0..count {
|
||||
let insert_after_pos = positions[i];
|
||||
|
||||
// We want to insert after position insert_after_pos
|
||||
// before = element at insert_after_pos (if valid)
|
||||
// after = element at insert_after_pos + 1 (if valid)
|
||||
// insert at position insert_after_pos + 1
|
||||
|
||||
let before = if insert_after_pos >= keys.len() {
|
||||
// If insert_after_pos is beyond the end, insert at the end
|
||||
keys.last()
|
||||
} else {
|
||||
Some(&keys[insert_after_pos])
|
||||
};
|
||||
|
||||
let after = if insert_after_pos + 1 >= keys.len() {
|
||||
None
|
||||
} else {
|
||||
Some(&keys[insert_after_pos + 1])
|
||||
};
|
||||
|
||||
let key = lseq.allocate(before, after).unwrap();
|
||||
let insert_pos = std::cmp::min(insert_after_pos + 1, keys.len());
|
||||
keys.insert(insert_pos, key);
|
||||
}
|
||||
|
||||
keys
|
||||
}
|
||||
|
||||
fn main() {
|
||||
// Parse command line arguments
|
||||
let args: Vec<String> = env::args().collect();
|
||||
let mut count = 10000;
|
||||
let mut insertion_mode = InsertionMode::Tail;
|
||||
|
||||
let mut i = 1;
|
||||
while i < args.len() {
|
||||
match args[i].as_str() {
|
||||
"--count" => {
|
||||
if i + 1 < args.len() {
|
||||
count = args[i + 1].parse::<usize>().unwrap_or(10000);
|
||||
i += 2;
|
||||
} else {
|
||||
eprintln!("Error: --count requires a number");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
"--insertion-mode" => {
|
||||
if i + 1 < args.len() {
|
||||
insertion_mode = InsertionMode::from_str(&args[i + 1]).unwrap_or_else(|err| {
|
||||
eprintln!("Error: {}", err);
|
||||
std::process::exit(1);
|
||||
});
|
||||
i += 2;
|
||||
} else {
|
||||
eprintln!("Error: --insertion-mode requires 'tail', 'random', or 'head'");
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
eprintln!("Unknown argument: {}", args[i]);
|
||||
std::process::exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
println!("L-SEQ Encoding Analysis Tool");
|
||||
println!("============================");
|
||||
println!("Allocating {} identifiers for analysis...", count);
|
||||
println!("Insertion mode: {:?}", insertion_mode);
|
||||
println!();
|
||||
|
||||
// Generate identifiers based on insertion mode
|
||||
let (base64_keys, reference_keys) = match insertion_mode {
|
||||
InsertionMode::Tail => {
|
||||
println!("Using tail insertion (sequential)...");
|
||||
let base64_keys = generate_tail_insertion_base64(count, StdRng::seed_from_u64(42));
|
||||
let reference_keys = generate_tail_insertion_reference(count, StdRng::seed_from_u64(42));
|
||||
(base64_keys, reference_keys)
|
||||
}
|
||||
InsertionMode::Random => {
|
||||
println!("Using random insertion...");
|
||||
let mut rng = StdRng::seed_from_u64(42);
|
||||
let positions = generate_insertion_positions(count, &mut rng);
|
||||
|
||||
let base64_keys = generate_random_insertion_base64(count, &positions, StdRng::seed_from_u64(42));
|
||||
let reference_keys = generate_random_insertion_reference(count, &positions, StdRng::seed_from_u64(42));
|
||||
(base64_keys, reference_keys)
|
||||
}
|
||||
InsertionMode::Head => {
|
||||
println!("Using head insertion (reverse sequential)...");
|
||||
let base64_keys = generate_head_insertion_base64(count, StdRng::seed_from_u64(42));
|
||||
let reference_keys = generate_head_insertion_reference(count, StdRng::seed_from_u64(42));
|
||||
(base64_keys, reference_keys)
|
||||
}
|
||||
};
|
||||
|
||||
    // Verify that all keys are sorted
    println!("Verifying sort order...");
    if let Err(e) = verify_sorted_base64(&base64_keys) {
        eprintln!("ERROR: Base64 keys not sorted: {}", e);
        std::process::exit(1);
    }

    //if let Err(e) = verify_sorted_reference(&reference_keys) {
    //    eprintln!("ERROR: Reference keys not sorted: {}", e);
    //    std::process::exit(1);
    //}

    println!("✓ All keys are properly sorted!");
    println!();

    // Analyze encoding efficiency
    let base64_stats = analyze_base64_encoding(&base64_keys);
    let reference_stats = analyze_reference_encoding(&reference_keys);

    // Print results
    base64_stats.print_summary("Base64 Variant (64 slots per level)");
    reference_stats.print_summary("Reference Implementation (16 * 2^level slots)");

    compare_encodings(&base64_stats, "Base64 Variant", &reference_stats, "Reference");

    // Additional analysis
    println!("\n=== Additional Analysis ===");
    println!("Total base64 characters needed:");
    let base64_total: usize = base64_keys.iter().map(|k| k.max_base64_chars()).sum();
    let reference_total: usize = reference_keys.iter().map(|k| k.base64_chars_needed()).sum();

    println!(" Base64 variant: {} characters", base64_total);
    println!(" Reference impl: {} characters", reference_total);
    println!(" Difference: {} characters ({:.1}% {})",
        base64_total.abs_diff(reference_total),
        (base64_total as f64 - reference_total as f64).abs() / reference_total as f64 * 100.0,
        if base64_total > reference_total { "more" } else { "less" });

    println!("\nAverage bytes per key (assuming 1 byte per base64 character):");
    println!(" Base64 variant: {:.2} bytes", base64_total as f64 / count as f64);
    println!(" Reference impl: {:.2} bytes", reference_total as f64 / count as f64);

    // Show some sample keys for understanding
    println!("\n=== Sample Keys (first 10) ===");
    for i in 0..std::cmp::min(10, count) {
        println!("Key {}: Base64({} chars) = {:?}, Reference({} chars) = {:?}",
            i,
            base64_keys[i].max_base64_chars(),
            base64_keys[i],
            reference_keys[i].base64_chars_needed(),
            reference_keys[i]);
    }
}

180
research/src/encoding_analysis.rs
Normal file

@@ -0,0 +1,180 @@
/*!
# L-SEQ Encoding Efficiency Analysis

This module provides tools for analyzing the encoding efficiency of L-SEQ algorithms.

## Use Case

When implementing L-SEQ in real-world applications (especially web applications), we need to
serialize and transfer sort keys between systems. JavaScript and web APIs commonly use base64
encoding for safely representing binary data in text format.

To measure the practical efficiency of different L-SEQ variants, we:

1. **Allocate large numbers of identifiers** (e.g., 1,000,000) in realistic usage patterns
2. **Calculate base64 encoding requirements** for each identifier using the "maximally encoded"
   compact format (no separators, since the structure is known)
3. **Generate histograms** showing the distribution of encoding sizes
4. **Compare different algorithms** to understand their space efficiency trade-offs

## Encoding Formats

### Base64 Variant (64 slots per level)
- Level 0: 1 base64 character (6 bits, 0-63)
- Level 1: 2 base64 characters (12 bits, 0-4095)
- Level 2: 3 base64 characters (18 bits, 0-262143)
- Sequential parsing: read 1 char, then 2 chars, then 3 chars, etc.

### Original Paper Reference (16 * 2^level slots)
- Level 0: 4 bits (0-15)
- Level 1: 5 bits (0-31)
- Level 2: 6 bits (0-63)
- Packed encoding: concatenate all bits, encode as base64 (6 bits per character)

## Analysis Functions

This module provides functions to:
- Calculate encoding size histograms for collections of sort keys
- Compare efficiency between different L-SEQ variants
- Generate statistics for real-world usage scenarios
*/

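// Worked example of the two size formulas described above. This is a standalone
// sketch derived from the format descriptions, not code used by the analysis
// functions; the authoritative calculations live in
// `SortKeyBase64::max_base64_chars` and `ReferenceSortKey::base64_chars_needed`.
//
// ```rust
// /// Characters needed by the base64 variant for a key with `n` levels:
// /// level i (0-based) takes i + 1 characters, so the total is 1 + 2 + ... + n.
// fn base64_variant_chars(n: usize) -> usize {
//     n * (n + 1) / 2
// }
//
// /// Characters needed by the reference encoding for a key with `n` levels:
// /// level i takes 4 + i bits; the packed bit string is then base64 encoded
// /// at 6 bits per character, rounding up.
// fn reference_chars(n: usize) -> usize {
//     let bits: usize = (0..n).map(|i| 4 + i).sum();
//     (bits + 5) / 6
// }
//
// fn main() {
//     // A 3-level key: 1 + 2 + 3 = 6 chars vs ceil((4 + 5 + 6) / 6) = 3 chars.
//     assert_eq!(base64_variant_chars(3), 6);
//     assert_eq!(reference_chars(3), 3);
//     // The gap widens with depth: a 9-level key needs 45 vs 12 characters.
//     assert_eq!(base64_variant_chars(9), 45);
//     assert_eq!(reference_chars(9), 12);
// }
// ```
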
use std::collections::HashMap;
use crate::algorithms::lseq_base64::SortKeyBase64;
use crate::algorithms::original_paper_reference_impl::ReferenceSortKey;

/// Histogram of base64 encoding sizes
pub type EncodingSizeHistogram = HashMap<usize, usize>;

/// Statistics about encoding sizes
#[derive(Debug, Clone)]
pub struct EncodingStats {
    pub total_keys: usize,
    pub min_size: usize,
    pub max_size: usize,
    pub mean_size: f64,
    pub median_size: usize,
    pub histogram: EncodingSizeHistogram,
}

impl EncodingStats {
    /// Calculate statistics from a list of encoding sizes
    pub fn from_sizes(sizes: Vec<usize>) -> Self {
        let total_keys = sizes.len();
        let min_size = *sizes.iter().min().unwrap_or(&0);
        let max_size = *sizes.iter().max().unwrap_or(&0);
        let mean_size = sizes.iter().sum::<usize>() as f64 / total_keys as f64;

        let mut sorted_sizes = sizes.clone();
        sorted_sizes.sort_unstable();
        let median_size = if total_keys % 2 == 0 {
            (sorted_sizes[total_keys / 2 - 1] + sorted_sizes[total_keys / 2]) / 2
        } else {
            sorted_sizes[total_keys / 2]
        };

        let mut histogram = HashMap::new();
        for size in sizes {
            *histogram.entry(size).or_insert(0) += 1;
        }

        Self {
            total_keys,
            min_size,
            max_size,
            mean_size,
            median_size,
            histogram,
        }
    }

    /// Print a formatted summary of the statistics
    pub fn print_summary(&self, algorithm_name: &str) {
        println!("\n=== {} Encoding Statistics ===", algorithm_name);
        println!("Total keys: {}", self.total_keys);
        println!("Min size: {} base64 characters", self.min_size);
        println!("Max size: {} base64 characters", self.max_size);
        println!("Mean size: {:.2} base64 characters", self.mean_size);
        println!("Median size: {} base64 characters", self.median_size);

        println!("\nSize distribution:");
        let mut sizes: Vec<_> = self.histogram.keys().collect();
        sizes.sort();
        for &size in sizes {
            let count = self.histogram[&size];
            let percentage = (count as f64 / self.total_keys as f64) * 100.0;
            println!(" {} chars: {} keys ({:.1}%)", size, count, percentage);
        }
    }
}

/// Analyze the encoding efficiency of Base64 variant sort keys
pub fn analyze_base64_encoding(keys: &[SortKeyBase64]) -> EncodingStats {
    let sizes: Vec<usize> = keys.iter().map(|key| key.max_base64_chars()).collect();
    EncodingStats::from_sizes(sizes)
}

/// Analyze the encoding efficiency of Reference implementation sort keys
pub fn analyze_reference_encoding(keys: &[ReferenceSortKey]) -> EncodingStats {
    let sizes: Vec<usize> = keys.iter().map(|key| key.base64_chars_needed()).collect();
    EncodingStats::from_sizes(sizes)
}

/// Compare encoding efficiency between two algorithms
pub fn compare_encodings(stats1: &EncodingStats, name1: &str, stats2: &EncodingStats, name2: &str) {
    println!("\n=== Encoding Comparison: {} vs {} ===", name1, name2);
    println!("Mean size: {:.2} vs {:.2} chars ({:.1}% difference)",
        stats1.mean_size, stats2.mean_size,
        ((stats2.mean_size - stats1.mean_size) / stats1.mean_size) * 100.0);
    println!("Max size: {} vs {} chars", stats1.max_size, stats2.max_size);
    println!("Min size: {} vs {} chars", stats1.min_size, stats2.min_size);
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_encoding_stats() {
        let sizes = vec![1, 2, 2, 3, 3, 3, 4, 5];
        let stats = EncodingStats::from_sizes(sizes);

        assert_eq!(stats.total_keys, 8);
        assert_eq!(stats.min_size, 1);
        assert_eq!(stats.max_size, 5);
        assert_eq!(stats.mean_size, 2.875);
        assert_eq!(stats.median_size, 3);
        assert_eq!(stats.histogram[&3], 3);
        assert_eq!(stats.histogram[&2], 2);
    }

    #[test]
    fn test_base64_analysis() {
        let keys = vec![
            SortKeyBase64::new(vec![1]),
            SortKeyBase64::new(vec![1, 2]),
            SortKeyBase64::new(vec![1, 2, 3]),
        ];

        let stats = analyze_base64_encoding(&keys);
        assert_eq!(stats.total_keys, 3);
        assert_eq!(stats.min_size, 1); // 1 level = 1 char
        assert_eq!(stats.max_size, 6); // 3 levels = 1+2+3 = 6 chars
        assert_eq!(stats.mean_size, 10.0 / 3.0); // (1+3+6)/3
    }

    #[test]
    fn test_reference_analysis() {
        let keys = vec![
            ReferenceSortKey::new(vec![1]),
            ReferenceSortKey::new(vec![1, 2]),
            ReferenceSortKey::new(vec![1, 2, 3]),
        ];

        let stats = analyze_reference_encoding(&keys);
        assert_eq!(stats.total_keys, 3);
        assert_eq!(stats.min_size, 1); // 4 bits = 1 char
        assert_eq!(stats.max_size, 3); // 4+5+6=15 bits = 3 chars
        assert_eq!(stats.mean_size, 2.0); // (1+2+3)/3
    }
}

7
research/src/lib.rs
Normal file

@@ -0,0 +1,7 @@
pub mod algorithms;
pub mod encoding_analysis;

pub use algorithms::ReferenceLSEQ;

// Re-export for convenience in benchmarks
pub use rand;

52
research/src/main.rs
Normal file

@@ -0,0 +1,52 @@
use peoplesgrocers_lseq_research::ReferenceLSEQ;
use rand::rngs::StdRng;
use rand::SeedableRng;
use log::trace;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Because this smoke test is so simple, I'm not going to show the module name or timestamp.
    env_logger::Builder::from_default_env()
        .format(|buf, record| {
            use std::io::Write;
            use env_logger::fmt::Color;

            let mut style = buf.style();
            let level_color = match record.level() {
                log::Level::Error => Color::Red,
                log::Level::Warn => Color::Yellow,
                log::Level::Info => Color::Green,
                log::Level::Debug => Color::Blue,
                log::Level::Trace => Color::Cyan,
            };
            style.set_color(level_color).set_bold(true);

            writeln!(buf, "{} {}", style.value(record.level()), record.args())
        })
        .init();

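    // `from_default_env()` reads the standard RUST_LOG filter, so running with e.g.
    // `RUST_LOG=trace cargo run` surfaces the per-allocation trace! output from the
    // algorithm implementations.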
println!("L-SEQ Research - Original Paper Reference Implementation");
|
||||
|
||||
// Test the original paper reference implementation
|
||||
let mut lseq = ReferenceLSEQ::new(StdRng::seed_from_u64(42));
|
||||
let mut keys = Vec::new();
|
||||
|
||||
// Generate 10 sequential insertions
|
||||
for i in 0..10 {
|
||||
let before = keys.last();
|
||||
let key = lseq.allocate(before, None)?;
|
||||
println!("Generated key {}: {}", i + 1, key);
|
||||
trace!("--------------------------------");
|
||||
keys.push(key);
|
||||
}
|
||||
|
||||
// Verify they are sorted
|
||||
println!("\nVerifying sort order:");
|
||||
for i in 0..keys.len() - 1 {
|
||||
println!("{} < {}", keys[i], keys[i + 1]);
|
||||
assert!(keys[i] < keys[i + 1]);
|
||||
}
|
||||
|
||||
println!("\nAll keys are properly sorted!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||