feat: conformance tests pass for first time

Change implementation to exponentially increase search space at each level.
2025-12-12 21:05:29 -08:00 · 2025-12-12 21:05:29 -08:00 · 546d6deb69
commit 546d6deb69
parent 31c454a78c
13 changed files with 1852 additions and 102 deletions
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@ -10,11 +10,39 @@ use std::str::FromStr;

 const ALPHABET: &[u8] = b"-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz";

+/// Minimal RNG trait for LSEQ - only the methods we actually use.
+/// This allows custom implementations (e.g., recording wrappers) without
+/// implementing the full Rng trait.
+pub trait LseqRng {
+    fn gen_bool(&mut self, p: f64) -> bool;
+    fn gen_range(&mut self, range: std::ops::Range<u64>) -> u64;
+}
+
+/// Blanket implementation for anything that implements rand::Rng
+impl<R: Rng> LseqRng for R {
+    fn gen_bool(&mut self, p: f64) -> bool {
+        Rng::gen_bool(self, p)
+    }
+    fn gen_range(&mut self, range: std::ops::Range<u64>) -> u64 {
+        Rng::gen_range(self, range)
+    }
+}
+
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
 #[cfg_attr(feature = "serde", derive(Serialize))]
 #[cfg_attr(feature = "serde", serde(into = "String"))]
 pub struct SortKey {
-    numbers: Vec<u8>,
+    /// Each element is a value at one level of the LSEQ tree.
+    /// Level n (0-indexed) holds values 0 to 64^(n+1) - 1, capped at 64^8 - 1.
+    ///
+    /// String encoding length grows as triangular numbers only up to depth 8:
+    ///   Depths 1-8: 1, 3, 6, 10, 15, 21, 28, 36 chars (triangular)
+    ///   Depths 9+:  44, 52, 60, 68, ... chars (+8 per level, linear)
+    ///
+    /// We cap at 64^8 = 2^48 per level for JavaScript float compatibility (2^53 max).
+    /// But we can still keep going deeper: even at 8 chars per level, the total
+    /// address space is (2^48)^depth which remains astronomically large.
+    numbers: Vec<u64>,
 }

 #[cfg(feature = "serde")]
@ -44,14 +72,34 @@ impl<'de> Deserialize<'de> for SortKey {
    }
 }

+/// Maximum exponent for level values, capped for JavaScript compatibility.
+/// JavaScript numbers are IEEE 754 floats with 53 bits of precision.
+/// 64^8 = 2^48, which is safely within 2^53.
+const MAX_LEVEL_EXPONENT: u32 = 8;
+
 impl SortKey {
-    pub fn from_numbers(numbers: Vec<u8>) -> Self {
+    pub fn from_numbers(numbers: Vec<u64>) -> Self {
        SortKey { numbers }
    }
+
+    /// Returns the maximum value for a given level (0-indexed).
+    /// Level 0: 64^1 - 1 = 63
+    /// Level 1: 64^2 - 1 = 4095
+    /// ...
+    /// Level 7+: 64^8 - 1 (capped for JS compatibility)
+    fn max_value_for_level(level: usize) -> u64 {
+        let exp = (level as u32 + 1).min(MAX_LEVEL_EXPONENT);
+        64u64.pow(exp) - 1
+    }
+
+    /// Returns the number of characters needed to encode a value at this level.
+    fn chars_for_level(level: usize) -> usize {
+        (level + 1).min(MAX_LEVEL_EXPONENT as usize)
+    }
 }

-impl From<SortKey> for Vec<u8> {
-    fn from(key: SortKey) -> Vec<u8> {
+impl From<SortKey> for Vec<u64> {
+    fn from(key: SortKey) -> Vec<u64> {
        key.numbers
    }
 }
@ -62,8 +110,8 @@ impl From<SortKey> for String {
    }
 }

-impl AsRef<[u8]> for SortKey {
-    fn as_ref(&self) -> &[u8] {
+impl AsRef<[u64]> for SortKey {
+    fn as_ref(&self) -> &[u64] {
        &self.numbers
    }
 }
@ -76,8 +124,21 @@ impl From<String> for SortKey {

 impl fmt::Display for SortKey {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        for &n in &self.numbers {
-            write!(f, "{}", ALPHABET[n as usize] as char)?;
+        for (level, &value) in self.numbers.iter().enumerate() {
+            let num_chars = Self::chars_for_level(level);
+            let mut chars = Vec::with_capacity(num_chars);
+            let mut v = value;
+
+            // Extract digits from least significant to most significant
+            for _ in 0..num_chars {
+                chars.push(ALPHABET[(v % 64) as usize] as char);
+                v /= 64;
+            }
+
+            // Write in reverse (most significant first)
+            for c in chars.into_iter().rev() {
+                write!(f, "{}", c)?;
+            }
        }
        Ok(())
    }
@ -85,52 +146,106 @@ impl fmt::Display for SortKey {

 #[allow(dead_code)]
 #[derive(Debug)]
-pub struct LSEQ<R: Rng> {
+pub struct LSEQ<R: LseqRng> {
    strategies: Vec<bool>,
    rng: R,
 }

 #[allow(dead_code)]
-impl<R: Rng> LSEQ<R> {
+impl<R: LseqRng> LSEQ<R> {
    pub fn new(mut rng: R) -> Self {
        let strategies = vec![rng.gen_bool(0.5)];
        LSEQ { strategies, rng }
    }

-    pub fn alloc(&mut self, before: Option<&SortKey>, after: Option<&SortKey>) -> SortKey {
-        // Convert to numeric arrays, using boundary values for null
-        let p = before.map_or(vec![0], |s| s.numbers.clone());
-        let q = after.map_or(vec![63], |s| s.numbers.clone());
+    /// Consume the LSEQ and return the inner RNG
+    pub fn take_rng(self) -> R {
+        self.rng
+    }
+
+    /// Allocate a new sort key between `before` and `after`.
+    ///
+    /// # The invariant
+    ///
+    /// This function guarantees you can always allocate a key that sorts before
+    /// any previously allocated key. This is essential for CRDT list insertion.
+    ///
+    /// # The encoding
+    ///
+    /// Keys use a 64-character alphabet where `'-'` is 0 and `'z'` is 63, chosen
+    /// so that `strcmp` matches numeric comparison. Keys are paths through an
+    /// LSEQ tree where each level has exponentially more space:
+    ///
+    /// ```text
+    /// Level 0:  64¹ values  →  1 char    "-", "0", ..., "z"
+    /// Level 1:  64² values  →  2 chars   "--", "-0", ..., "zz"
+    /// Level 2:  64³ values  →  3 chars   "---", "--0", ..., "zzz"
+    /// ```
+    ///
+    /// A path is encoded by concatenating each level's representation:
+    ///
+    /// ```text
+    /// [0]       = ["-"]              = "-"       (1 char)
+    /// [0, 1]    = ["-", "-0"]        = "--0"     (1 + 2 = 3 chars)
+    /// [0, 0]    = ["-", "--"]        = "---"     (1 + 2 = 3 chars)
+    /// [0, 0, 1] = ["-", "--", "--0"] = "-----0"  (1 + 2 + 3 = 6 chars)
+    /// ```
+    ///
+    /// # Why we go deeper than the LSEQ paper
+    ///
+    /// With `strcmp`, `"-"` == `"---"` == `"------"` in a crucial sense: nothing
+    /// can sort before any of them. All-zeros at any depth is "negative infinity".
+    ///
+    /// The LSEQ paper says: to insert before `[0, 1]` (= `"--0"`), use `[0, 0]`.
+    /// But `[0, 0]` = `"---"`, and nothing can ever sort before that!
+    ///
+    /// This implementation goes one level deeper to preserve the invariant:
+    ///
+    /// ```text
+    /// Insert before "--0" (i.e., [0, 1])?
+    ///   Paper says:  use [0, 0] = "---"           → dead end
+    ///   We say:      use [0, 0, X] = "---" + X    → can still prepend [0, 0, Y] where Y < X
+    /// ```
+    ///
+    /// The cost is longer keys, but we guarantee indefinite prepending.
+    pub fn alloc(&mut self, before: Option<&SortKey>, after: Option<&SortKey>) -> SortKey {
+        let p = before.map_or(vec![], |s| s.numbers.clone());
+        let q = after.map_or(vec![], |s| s.numbers.clone());

-        // Walk through digits looking for space
        let mut depth = 0;
-        let mut result = Vec::new();
+        let mut result: Vec<u64> = Vec::new();

        loop {
-            let p_val = if depth < p.len() { p[depth] } else { 0 };
-            let q_val = if depth < q.len() { q[depth] } else { 63 };
+            let p_val = p.get(depth).copied().unwrap_or(0);
+            let q_upper = q.get(depth).copied();
+            let level_max = SortKey::max_value_for_level(depth);

-            let interval = q_val as i32 - p_val as i32;
+            // Minimum allocatable (inclusive): one above the lower bound.
+            // This naturally reserves value 0 when p_val=0, ensuring we never
+            // allocate an all-zeros key. If we allocate [0, 1] and later need
+            // to prepend before it, we simply go deeper to get [0, 0, X].
+            let min_alloc = p_val + 1;

-            // If we have space between values at this depth
-            if interval > 1 {
-                // Pick a value in the available range
-                let range = interval - 1;
-                let add_val = 1 + self.rng.gen_range(0..range) as u8;
+            // Maximum allocatable (inclusive):
+            // - With upper bound: one below it
+            // - Without upper bound (after=None): full range for this level
+            let max_alloc = q_upper.map_or(level_max, |q| q.saturating_sub(1));
+
+            if min_alloc <= max_alloc {
+                let range = max_alloc - min_alloc + 1;
+                let offset = self.rng.gen_range(0..range);
                let new_value = if self.strategies[depth] {
-                    p_val + add_val
+                    min_alloc + offset
                } else {
-                    q_val - add_val
+                    max_alloc - offset
                };
-
-                // Take the prefix from p up to depth and append our new value
                result.push(new_value);
                return SortKey::from_numbers(result);
            }
+
+            // Descend to next level
            result.push(p_val);

-            // If values are the same or adjacent at this depth,
-            // continue to next depth
            depth += 1;
            if depth >= self.strategies.len() {
                self.strategies.push(self.rng.gen_bool(0.5));
@ -165,21 +280,21 @@ pub struct EvenSpacingIterator {
 }

 impl EvenSpacingIterator {
-    // Static table of (64^k - 2) values for k from 1 to 9
-    // We subtract 2 from each space size because we need to reserve two boundary positions:
-    // 1. Position 0 (represented by "-") is reserved as the lower boundary
-    // 2. Position 63 (represented by "z") is reserved as the upper boundary
-    // This ensures we can always insert elements at the very beginning or end of the sequence
-    const USABLE_SPACE: [usize; 9] = [
-        64 - 2,                // 64^1 - 2
-        4096 - 2,              // 64^2 - 2
-        262144 - 2,            // 64^3 - 2
-        16777216 - 2,          // 64^4 - 2
-        1073741824 - 2,        // 64^5 - 2
-        68719476736 - 2,       // 64^6 - 2
-        4398046511104 - 2,     // 64^7 - 2
-        281474976710656 - 2,   // 64^8 - 2
-        18014398509481984 - 2, // 64^9 - 2
+    // Static table of (64^k - 1) values for k from 1 to 8
+    // We subtract 1 because we reserve only the lower boundary (position 0, all "-"s).
+    // Position 0 cannot be used because nothing can be lexicographically less than it.
+    // The upper boundary (all "z"s) IS usable because we can always insert after it
+    // by extending: "z" < "z-0" lexicographically (prefix comparison).
+    // Capped at 64^8 = 2^48 for JavaScript number compatibility (max safe: 2^53).
+    const USABLE_SPACE: [usize; 8] = [
+        64 - 1,              // 64^1 - 1
+        4096 - 1,            // 64^2 - 1
+        262144 - 1,          // 64^3 - 1
+        16777216 - 1,        // 64^4 - 1
+        1073741824 - 1,      // 64^5 - 1
+        68719476736 - 1,     // 64^6 - 1
+        4398046511104 - 1,   // 64^7 - 1
+        281474976710656 - 1, // 64^8 - 1
    ];

    pub fn new(total_items: usize) -> Result<(u64, Self), SpacingError> {
@ -222,25 +337,25 @@ impl EvenSpacingIterator {
        ))
    }

-    // Helper method to convert a position to a sort key
+    /// Convert a position within a level-k space to a SortKey.
+    ///
+    /// Creates a k-level key where levels 0 through k-2 are 0, and level k-1
+    /// contains the position value.
+    ///
+    /// Example: position_to_key(2, 1) = [0, 1] which displays as "--0"
    pub fn position_to_key(k: u64, position: u64) -> SortKey {
        let mut result = Vec::with_capacity(k as usize);
-        let mut pos = position;
-        const BASE: u64 = 64;

-        // Fill in digits from least significant to most significant
-        for _ in 0..k {
-            // SAFETY: digit is guaranteed to be in bounds because:
-            // 1. digit = pos % base where base is 64
-            // 2. ALPHABET has exactly 64 elements
-            // Therefore digit as u64 will always be 0-63
-            let digit = (pos % BASE) as u8;
-            pos /= BASE;
-            result.push(digit);
+        // Levels 0 through k-2 are 0
+        for _ in 0..k.saturating_sub(1) {
+            result.push(0);
+        }
+
+        // Level k-1 contains the position
+        if k > 0 {
+            result.push(position);
        }

-        // Reverse to get most significant digit first
-        result.reverse();
        SortKey::from_numbers(result)
    }
 }
@ -275,6 +390,7 @@ impl Iterator for EvenSpacingIterator {
 #[derive(Debug)]
 pub enum SortKeyParseError {
    InvalidCharacter(char),
+    InvalidLength(usize),
 }

 impl fmt::Display for SortKeyParseError {
@ -286,6 +402,11 @@ impl fmt::Display for SortKeyParseError {
                c,
                String::from_utf8_lossy(ALPHABET)
            ),
+            SortKeyParseError::InvalidLength(len) => write!(
+                f,
+                "Invalid sort key length {}. Expected triangular number up to 36 (1, 3, 6, 10, 15, 21, 28, 36), then +8 per level (44, 52, 60, ...)",
+                len
+            ),
        }
    }
 }
@ -296,11 +417,33 @@ impl FromStr for SortKey {
    type Err = SortKeyParseError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let numbers = s
-            .bytes()
-            .map(|b| ALPHABET.iter().position(|&x| x == b).map(|pos| pos as u8))
-            .collect::<Option<Vec<u8>>>()
-            .ok_or_else(|| SortKeyParseError::InvalidCharacter(s.chars().next().unwrap()))?;
+        let bytes = s.as_bytes();
+        let mut numbers = Vec::new();
+        let mut pos = 0;
+        let mut level = 0;
+
+        while pos < bytes.len() {
+            let num_chars = SortKey::chars_for_level(level);
+            if pos + num_chars > bytes.len() {
+                return Err(SortKeyParseError::InvalidLength(bytes.len()));
+            }
+
+            // Parse num_chars characters as a base-64 number
+            let mut value: u64 = 0;
+            for i in 0..num_chars {
+                let b = bytes[pos + i];
+                let digit = ALPHABET
+                    .iter()
+                    .position(|&x| x == b)
+                    .ok_or(SortKeyParseError::InvalidCharacter(b as char))?;
+                value = value * 64 + digit as u64;
+            }
+
+            numbers.push(value);
+            pos += num_chars;
+            level += 1;
+        }
+
        Ok(SortKey { numbers })
    }
 }
@ -311,13 +454,20 @@ mod tests {
    use rand::rngs::StdRng;
    use rand::SeedableRng;

+    /// Helper to create a SortKey from a slice of numbers
+    fn key(nums: &[u64]) -> SortKey {
+        SortKey::from_numbers(nums.to_vec())
+    }
+
    #[test]
    fn test_compare_lseq() {
-        let a = "a".parse::<SortKey>().unwrap();
-        let b = "b".parse::<SortKey>().unwrap();
-        assert_eq!(a < b, true);
-        assert_eq!(b < a, false);
-        assert_eq!(a < a, false);
+        // Single-character keys are level-0 values (0-63)
+        // "-" is position 0 in the alphabet, "0" is position 1
+        let a = "-".parse::<SortKey>().unwrap(); // value 0
+        let b = "0".parse::<SortKey>().unwrap(); // value 1
+        assert!(a < b);
+        assert!(!(b < a));
+        assert!(!(a < a));
    }

    #[test]
@ -335,8 +485,28 @@ mod tests {

    #[test]
    fn test_position_to_key() {
+        // k=2 means 2 levels: level 0 (1 char) + level 1 (2 chars) = 3 chars total
+        // position 1 goes into level 1, with level 0 = 0
+        // [0, 1] = "-" + "-0" = "--0"
        const K: u64 = 2;
-        assert_eq!(EvenSpacingIterator::position_to_key(K, 1).to_string(), "-0");
+        assert_eq!(
+            EvenSpacingIterator::position_to_key(K, 1).to_string(),
+            "--0"
+        );
+
+        // k=1 means just level 0 (1 char)
+        // position 1 = "0" (alphabet[1])
+        assert_eq!(
+            EvenSpacingIterator::position_to_key(1, 1).to_string(),
+            "0"
+        );
+
+        // k=2, position 4095 (max for level 1 = 64² - 1)
+        // [0, 4095] = "-" + "zz" = "-zz"
+        assert_eq!(
+            EvenSpacingIterator::position_to_key(2, 4095).to_string(),
+            "-zz"
+        );
    }

    #[test]
@ -379,4 +549,85 @@ mod tests {
            positions.len()
        );
    }
+
+    /// Test the "go deeper" strategy for prepending before left-edge keys
+    #[test]
+    fn test_prepend_before_left_edge() {
+        let rng = StdRng::seed_from_u64(123);
+        let mut lseq = LSEQ::new(rng);
+
+        // Prepend before [0, 1] -> should get [0, 0, X]
+        let target = key(&[0, 1]);
+        let result = lseq.alloc(None, Some(&target));
+        assert!(result < target);
+        assert_eq!(result.numbers.len(), 3);
+        assert_eq!(result.numbers[0], 0);
+        assert_eq!(result.numbers[1], 0);
+
+        // Prepend before [0, 0, 1] -> should get [0, 0, 0, X]
+        let target = key(&[0, 0, 1]);
+        let result = lseq.alloc(None, Some(&target));
+        assert!(result < target);
+        assert_eq!(result.numbers.len(), 4);
+
+        // Prepend before [0, 0, 0, 1] -> should get [0, 0, 0, 0, X]
+        let target = key(&[0, 0, 0, 1]);
+        let result = lseq.alloc(None, Some(&target));
+        assert!(result < target);
+        assert_eq!(result.numbers.len(), 5);
+    }
+
+    /// Verify the ordering: [0, 0, X] < [0, 1] for any X
+    #[test]
+    fn test_left_edge_ordering() {
+        assert!(key(&[0, 0, 63]) < key(&[0, 1]));
+        assert!(key(&[0, 0, 1]) < key(&[0, 1]));
+        assert!(key(&[0, 0, 0, 63]) < key(&[0, 0, 1]));
+    }
+
+    /// Verify roundtrip: key -> display -> parse yields the original numbers
+    #[test]
+    fn test_roundtrip_encoding() {
+        let cases = vec![
+            key(&[0]),                    // "-"
+            key(&[63]),                   // "z"
+            key(&[0, 0]),                 // "---"
+            key(&[0, 1]),                 // "--0"
+            key(&[0, 4095]),              // "-zz"
+            key(&[1, 0]),                 // "0--"
+            key(&[0, 0, 0]),              // "------"
+            key(&[0, 0, 1]),              // "-----0"
+            key(&[0, 0, 262143]),         // "---zzz"
+        ];
+
+        for original in cases {
+            let s = original.to_string();
+            let parsed: SortKey = s.parse().expect(&format!("Failed to parse '{}'", s));
+            assert_eq!(
+                original.numbers, parsed.numbers,
+                "Roundtrip failed for {:?} -> '{}' -> {:?}",
+                original.numbers, s, parsed.numbers
+            );
+        }
+    }
+
+    /// Verify string encoding matches expected format
+    #[test]
+    fn test_string_encoding() {
+        // Level 0 only (1 char)
+        assert_eq!(key(&[0]).to_string(), "-");
+        assert_eq!(key(&[1]).to_string(), "0");
+        assert_eq!(key(&[63]).to_string(), "z");
+
+        // Level 0 + Level 1 (1 + 2 = 3 chars)
+        assert_eq!(key(&[0, 0]).to_string(), "---");
+        assert_eq!(key(&[0, 1]).to_string(), "--0");
+        assert_eq!(key(&[0, 64]).to_string(), "-0-"); // 64 = 1*64 + 0
+        assert_eq!(key(&[0, 4095]).to_string(), "-zz");
+
+        // Level 0 + Level 1 + Level 2 (1 + 2 + 3 = 6 chars)
+        assert_eq!(key(&[0, 0, 0]).to_string(), "------");
+        assert_eq!(key(&[0, 0, 1]).to_string(), "-----0");
+        assert_eq!(key(&[0, 0, 262143]).to_string(), "---zzz");
+    }
 }