1use serde::{Deserialize, Serialize};
41
/// Result of probing a learned index for a key.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum LookupResult {
    /// The exact position is known (the key had a correction entry).
    Exact(usize),
    /// The key, if present, lies within positions `low..=high` (inclusive).
    Range { low: usize, high: usize },
    /// The key is outside the indexed key range, or the index is empty.
    NotFound,
}
52
/// A learned sparse index: a single linear model that maps keys to
/// approximate positions in a sorted array, plus an explicit correction
/// table for keys the model predicts poorly.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LearnedSparseIndex {
    // Linear model: predicted_position = slope * normalize(key) + intercept.
    slope: f64,
    intercept: f64,
    // Largest |actual - predicted| position error observed at build time;
    // used as the default search-window half-width in `lookup`.
    max_error: usize,
    // Sorted (key, position) pairs for keys whose prediction error exceeded
    // `correction_threshold`; consulted by binary search before the model.
    corrections: Vec<(u64, usize)>,
    // Smallest indexed key; lookups below it short-circuit to NotFound.
    min_key: u64,
    // Largest indexed key; lookups above it short-circuit to NotFound.
    max_key: u64,
    // (max_key - min_key) as f64; 0.0 for empty or single-key indexes.
    key_range: f64,
    // Number of keys the index was built over (valid positions are 0..num_keys).
    num_keys: usize,
    // Error threshold above which a key receives a correction entry.
    correction_threshold: usize,
}
78
79impl LearnedSparseIndex {
80 const DEFAULT_CORRECTION_THRESHOLD: usize = 64;
82
83 pub fn empty() -> Self {
85 Self {
86 slope: 0.0,
87 intercept: 0.0,
88 max_error: 0,
89 corrections: Vec::new(),
90 min_key: 0,
91 max_key: 0,
92 key_range: 0.0,
93 num_keys: 0,
94 correction_threshold: Self::DEFAULT_CORRECTION_THRESHOLD,
95 }
96 }
97
98 #[inline]
103 fn normalize_key(&self, key: u64) -> f64 {
104 if self.key_range == 0.0 {
105 return 0.0;
106 }
107 let offset = (key as u128).saturating_sub(self.min_key as u128) as f64;
109 (offset / self.key_range) * (self.num_keys - 1) as f64
110 }
111
112 pub fn build(keys: &[u64]) -> Self {
116 Self::build_with_threshold(keys, Self::DEFAULT_CORRECTION_THRESHOLD)
117 }
118
119 pub fn build_with_threshold(keys: &[u64], correction_threshold: usize) -> Self {
124 let n = keys.len();
125 if n == 0 {
126 return Self::empty();
127 }
128
129 if n == 1 {
130 return Self {
131 slope: 0.0,
132 intercept: 0.0,
133 max_error: 0,
134 corrections: Vec::new(),
135 min_key: keys[0],
136 max_key: keys[0],
137 key_range: 0.0,
138 num_keys: 1,
139 correction_threshold,
140 };
141 }
142
143 let min_key = keys[0];
144 let max_key = keys[n - 1];
145 let key_range = (max_key as u128 - min_key as u128) as f64;
147
148 let (slope, intercept) = Self::linear_regression_normalized(keys, min_key, key_range, n);
150
151 let mut max_error = 0usize;
153 let mut corrections = Vec::new();
154
155 for (actual_pos, &key) in keys.iter().enumerate() {
156 let normalized = if key_range == 0.0 {
158 0.0
159 } else {
160 let offset = (key as u128 - min_key as u128) as f64;
161 (offset / key_range) * (n - 1) as f64
162 };
163 let predicted = slope * normalized + intercept;
164 let predicted_pos = predicted.round() as isize;
165 let error = (actual_pos as isize - predicted_pos).unsigned_abs();
166
167 if error > max_error {
168 max_error = error;
169 }
170
171 if error > correction_threshold {
173 corrections.push((key, actual_pos));
174 }
175 }
176
177 Self {
178 slope,
179 intercept,
180 max_error,
181 corrections,
182 min_key,
183 max_key,
184 key_range,
185 num_keys: n,
186 correction_threshold,
187 }
188 }
189
190 pub fn lookup(&self, key: u64) -> LookupResult {
192 if self.num_keys == 0 {
193 return LookupResult::NotFound;
194 }
195
196 if key < self.min_key || key > self.max_key {
198 return LookupResult::NotFound;
199 }
200
201 if let Ok(idx) = self.corrections.binary_search_by_key(&key, |&(k, _)| k) {
204 return LookupResult::Exact(self.corrections[idx].1);
205 }
206
207 let normalized = self.normalize_key(key);
209 let predicted = self.slope * normalized + self.intercept;
210 let predicted_pos = predicted.round() as isize;
211
212 let low = (predicted_pos - self.max_error as isize).max(0) as usize;
214 let high =
215 (predicted_pos + self.max_error as isize).min(self.num_keys as isize - 1) as usize;
216
217 LookupResult::Range { low, high }
218 }
219
220 pub fn lookup_with_error(&self, key: u64, max_error: usize) -> LookupResult {
222 if self.num_keys == 0 {
223 return LookupResult::NotFound;
224 }
225
226 if key < self.min_key || key > self.max_key {
227 return LookupResult::NotFound;
228 }
229
230 if let Ok(idx) = self.corrections.binary_search_by_key(&key, |&(k, _)| k) {
231 return LookupResult::Exact(self.corrections[idx].1);
232 }
233
234 let normalized = self.normalize_key(key);
236 let predicted = self.slope * normalized + self.intercept;
237 let predicted_pos = predicted.round() as isize;
238
239 let low = (predicted_pos - max_error as isize).max(0) as usize;
240 let high = (predicted_pos + max_error as isize).min(self.num_keys as isize - 1) as usize;
241
242 LookupResult::Range { low, high }
243 }
244
245 pub fn stats(&self) -> LearnedIndexStats {
247 LearnedIndexStats {
248 num_keys: self.num_keys,
249 max_error: self.max_error,
250 num_corrections: self.corrections.len(),
251 slope: self.slope,
252 intercept: self.intercept,
253 correction_ratio: if self.num_keys > 0 {
254 self.corrections.len() as f64 / self.num_keys as f64
255 } else {
256 0.0
257 },
258 }
259 }
260
261 pub fn is_efficient(&self) -> bool {
264 let low_error = self.max_error <= 128;
268 let low_corrections =
269 self.num_keys == 0 || (self.corrections.len() as f64 / self.num_keys as f64) < 0.05;
270 low_error && low_corrections
271 }
272
273 pub fn memory_bytes(&self) -> usize {
275 std::mem::size_of::<Self>()
276 + self.corrections.len() * (std::mem::size_of::<u64>() + std::mem::size_of::<usize>())
277 }
278
279 fn linear_regression_normalized(
284 keys: &[u64],
285 min_key: u64,
286 key_range: f64,
287 n: usize,
288 ) -> (f64, f64) {
289 let n_f64 = n as f64;
290
291 let mut sum_x: f64 = 0.0;
293 let mut sum_y: f64 = 0.0;
294 let mut sum_xy: f64 = 0.0;
295 let mut sum_xx: f64 = 0.0;
296
297 for (i, &key) in keys.iter().enumerate() {
298 let x = if key_range == 0.0 {
300 0.0
301 } else {
302 let offset = (key as u128 - min_key as u128) as f64;
303 (offset / key_range) * (n - 1) as f64
304 };
305 let y = i as f64;
306
307 sum_x += x;
308 sum_y += y;
309 sum_xy += x * y;
310 sum_xx += x * x;
311 }
312
313 let denominator = n_f64 * sum_xx - sum_x * sum_x;
314
315 if denominator.abs() < f64::EPSILON {
317 return (0.0, sum_y / n_f64);
318 }
319
320 let slope = (n_f64 * sum_xy - sum_x * sum_y) / denominator;
321 let intercept = (sum_y - slope * sum_x) / n_f64;
322
323 (slope, intercept)
324 }
325
326 #[allow(dead_code)]
328 fn linear_regression(keys: &[u64]) -> (f64, f64) {
329 let n = keys.len() as f64;
330
331 let mut sum_x: f64 = 0.0;
333 let mut sum_y: f64 = 0.0;
334 let mut sum_xy: f64 = 0.0;
335 let mut sum_xx: f64 = 0.0;
336
337 for (i, &key) in keys.iter().enumerate() {
338 let x = key as f64;
339 let y = i as f64;
340 sum_x += x;
341 sum_y += y;
342 sum_xy += x * y;
343 sum_xx += x * x;
344 }
345
346 let denominator = n * sum_xx - sum_x * sum_x;
347
348 if denominator.abs() < f64::EPSILON {
350 return (0.0, sum_y / n);
351 }
352
353 let slope = (n * sum_xy - sum_x * sum_y) / denominator;
354 let intercept = (sum_y - slope * sum_x) / n;
355
356 (slope, intercept)
357 }
358
359 pub fn insert(&mut self, key: u64, position: usize, keys: &[u64]) -> bool {
362 let normalized = self.normalize_key(key);
364 let predicted = self.slope * normalized + self.intercept;
365 let predicted_pos = predicted.round() as isize;
366 let error = (position as isize - predicted_pos).unsigned_abs();
367
368 self.min_key = self.min_key.min(key);
370 self.max_key = self.max_key.max(key);
371 self.key_range = (self.max_key as u128 - self.min_key as u128) as f64;
373 self.num_keys += 1;
374
375 if error > self.max_error {
376 self.max_error = error;
377 }
378
379 if error > self.correction_threshold {
380 match self.corrections.binary_search_by_key(&key, |&(k, _)| k) {
382 Ok(idx) => self.corrections[idx] = (key, position),
383 Err(idx) => self.corrections.insert(idx, (key, position)),
384 }
385 }
386
387 if self.corrections.len() > self.num_keys / 10 {
389 *self = Self::build_with_threshold(keys, self.correction_threshold);
390 return true;
391 }
392
393 false
394 }
395}
396
/// Snapshot of a [`LearnedSparseIndex`]'s model quality and footprint.
#[derive(Debug, Clone)]
pub struct LearnedIndexStats {
    /// Number of keys the index covers.
    pub num_keys: usize,
    /// Worst observed prediction error, in positions.
    pub max_error: usize,
    /// Number of explicit correction entries.
    pub num_corrections: usize,
    /// Slope of the fitted linear model.
    pub slope: f64,
    /// Intercept of the fitted linear model.
    pub intercept: f64,
    /// `num_corrections / num_keys` (0.0 when the index is empty).
    pub correction_ratio: f64,
}
413
/// A piecewise learned index: the key space is split into contiguous
/// segments, each with its own independently-fitted [`LearnedSparseIndex`].
#[derive(Debug, Clone)]
pub struct PiecewiseLearnedIndex {
    // First key of each segment, sorted ascending; binary-searched to route
    // a lookup to the covering segment.
    boundaries: Vec<u64>,
    // One sub-index per segment, parallel to `boundaries`.
    segments: Vec<LearnedSparseIndex>,
}
424
425impl PiecewiseLearnedIndex {
426 pub fn build(keys: &[u64], max_segments: usize) -> Self {
428 if keys.is_empty() || max_segments == 0 {
429 return Self {
430 boundaries: vec![],
431 segments: vec![],
432 };
433 }
434
435 let segment_size = keys.len().div_ceil(max_segments);
437 let mut boundaries = Vec::with_capacity(max_segments);
438 let mut segments = Vec::with_capacity(max_segments);
439
440 for chunk in keys.chunks(segment_size) {
441 if !chunk.is_empty() {
442 boundaries.push(chunk[0]);
443 segments.push(LearnedSparseIndex::build(chunk));
444 }
445 }
446
447 Self {
448 boundaries,
449 segments,
450 }
451 }
452
453 fn find_segment(&self, key: u64) -> Option<usize> {
455 if self.boundaries.is_empty() {
456 return None;
457 }
458
459 match self.boundaries.binary_search(&key) {
461 Ok(i) => Some(i),
462 Err(i) => {
463 if i == 0 {
464 None
465 } else {
466 Some(i - 1)
467 }
468 }
469 }
470 }
471
472 pub fn lookup(&self, key: u64) -> LookupResult {
474 match self.find_segment(key) {
475 Some(seg_idx) => self.segments[seg_idx].lookup(key),
476 None => LookupResult::NotFound,
477 }
478 }
479
480 pub fn stats(&self) -> PiecewiseStats {
482 let segment_stats: Vec<_> = self.segments.iter().map(|s| s.stats()).collect();
483 let total_keys: usize = segment_stats.iter().map(|s| s.num_keys).sum();
484 let max_error = segment_stats.iter().map(|s| s.max_error).max().unwrap_or(0);
485 let total_corrections: usize = segment_stats.iter().map(|s| s.num_corrections).sum();
486
487 PiecewiseStats {
488 num_segments: self.segments.len(),
489 total_keys,
490 max_error,
491 total_corrections,
492 avg_segment_size: if self.segments.is_empty() {
493 0.0
494 } else {
495 total_keys as f64 / self.segments.len() as f64
496 },
497 }
498 }
499}
500
/// Aggregated statistics for a [`PiecewiseLearnedIndex`].
#[derive(Debug, Clone)]
pub struct PiecewiseStats {
    /// Number of segments (sub-indexes).
    pub num_segments: usize,
    /// Total keys across all segments.
    pub total_keys: usize,
    /// Worst per-segment prediction error, in positions.
    pub max_error: usize,
    /// Total correction entries across all segments.
    pub total_corrections: usize,
    /// Mean number of keys per segment (0.0 with no segments).
    pub avg_segment_size: f64,
}
510
#[cfg(test)]
mod tests {
    use super::*;

    // An index built over zero keys finds nothing and reports zero keys.
    #[test]
    fn test_empty_index() {
        let idx = LearnedSparseIndex::build(&[]);
        assert_eq!(idx.lookup(42), LookupResult::NotFound);
        assert_eq!(idx.stats().num_keys, 0);
    }

    // A single key yields the degenerate range [0, 0]; anything else is out
    // of bounds.
    #[test]
    fn test_single_key() {
        let idx = LearnedSparseIndex::build(&[100]);
        assert!(matches!(
            idx.lookup(100),
            LookupResult::Range { low: 0, high: 0 }
        ));
        assert_eq!(idx.lookup(50), LookupResult::NotFound);
        assert_eq!(idx.lookup(150), LookupResult::NotFound);
    }

    // Perfectly linear data: the model should be nearly exact with no
    // corrections, and lookup windows should be tight.
    #[test]
    fn test_sequential_keys() {
        let keys: Vec<u64> = (0..1000).collect();
        let idx = LearnedSparseIndex::build(&keys);

        let stats = idx.stats();
        assert!(
            stats.max_error <= 1,
            "Sequential keys should have near-zero error"
        );
        assert!(
            stats.num_corrections == 0,
            "No corrections needed for linear data"
        );

        if let LookupResult::Range { low, high } = idx.lookup(500) {
            assert!(low <= 500 && high >= 500, "Key 500 should be in range");
            assert!(high - low <= 2, "Range should be very tight");
        }
    }

    // Monotonic timestamps with a mildly key-dependent stride should still
    // be close to linear and hence efficiently indexable.
    #[test]
    fn test_timestamp_like_keys() {
        let mut keys: Vec<u64> = Vec::new();
        let mut ts: u64 = 1704067200;
        for _ in 0..10000 {
            keys.push(ts);
            ts += 1 + (ts % 10);
        }

        let idx = LearnedSparseIndex::build(&keys);

        assert!(
            idx.is_efficient(),
            "Timestamp data should be efficiently indexable"
        );

        for &key in keys.iter().take(100) {
            let result = idx.lookup(key);
            assert!(
                !matches!(result, LookupResult::NotFound),
                "Existing key should be found"
            );
        }
    }

    // Exponentially spaced keys: every key must still resolve to a result
    // containing its true position (exactly or within the range).
    #[test]
    fn test_sparse_keys() {
        let keys: Vec<u64> = vec![1, 100, 10000, 1000000, 100000000];
        let idx = LearnedSparseIndex::build(&keys);

        for (pos, &key) in keys.iter().enumerate() {
            match idx.lookup(key) {
                LookupResult::NotFound => panic!("Key {} should be found", key),
                LookupResult::Exact(p) => assert_eq!(p, pos),
                LookupResult::Range { low, high } => {
                    assert!(
                        low <= pos && pos <= high,
                        "Key {} should be in range [{}, {}]",
                        key,
                        low,
                        high
                    );
                }
            }
        }
    }

    // Keys below min_key or above max_key short-circuit to NotFound.
    #[test]
    fn test_out_of_bounds() {
        let keys: Vec<u64> = (100..200).collect();
        let idx = LearnedSparseIndex::build(&keys);

        assert_eq!(idx.lookup(50), LookupResult::NotFound);
        assert_eq!(idx.lookup(250), LookupResult::NotFound);
    }

    // Three well-separated clusters split across three segments; a probe in
    // each cluster must land in some segment.
    #[test]
    fn test_piecewise_index() {
        let mut keys: Vec<u64> = Vec::new();
        keys.extend(0..1000);
        keys.extend((100000..101000).step_by(10));
        keys.extend(1000000..1001000);

        let piecewise = PiecewiseLearnedIndex::build(&keys, 3);
        let stats = piecewise.stats();

        assert_eq!(stats.num_segments, 3);

        assert!(!matches!(piecewise.lookup(500), LookupResult::NotFound));
        assert!(!matches!(piecewise.lookup(100500), LookupResult::NotFound));
        assert!(!matches!(piecewise.lookup(1000500), LookupResult::NotFound));
    }

    // The learned index should be far smaller than storing the keys.
    #[test]
    fn test_memory_efficiency() {
        let keys: Vec<u64> = (0..100000).collect();
        let idx = LearnedSparseIndex::build(&keys);

        let lsi_bytes = idx.memory_bytes();
        let btree_bytes = keys.len() * std::mem::size_of::<u64>();
        assert!(
            lsi_bytes < btree_bytes,
            "LSI ({} bytes) should use less memory than keys alone ({} bytes)",
            lsi_bytes,
            btree_bytes
        );
    }

    // A tighter threshold can only produce at least as many corrections as
    // a looser one on the same data.
    #[test]
    fn test_correction_threshold() {
        let mut keys: Vec<u64> = (0..100).map(|x| x * 10).collect();
        keys.push(5000);
        keys.sort();

        let low_thresh = LearnedSparseIndex::build_with_threshold(&keys, 10);
        let high_thresh = LearnedSparseIndex::build_with_threshold(&keys, 1000);

        assert!(
            low_thresh.stats().num_corrections >= high_thresh.stats().num_corrections,
            "Lower threshold should produce more or equal corrections"
        );
    }

    // Keys clustered just below u64::MAX exercise the u128 normalization
    // path; linear data there must still index accurately.
    #[test]
    fn test_large_key_normalization() {
        let base = u64::MAX - 1000;
        let keys: Vec<u64> = (0..100).map(|i| base + i * 10).collect();

        let idx = LearnedSparseIndex::build(&keys);

        assert!(
            idx.max_error < 10,
            "Error should be small for linear data"
        );

        for (pos, &key) in keys.iter().enumerate() {
            match idx.lookup(key) {
                LookupResult::NotFound => {
                    panic!("Key {} should be found", key);
                }
                LookupResult::Exact(p) => {
                    assert_eq!(p, pos, "Exact position should match");
                }
                LookupResult::Range { low, high } => {
                    assert!(
                        low <= pos && pos <= high,
                        "Key {} at position {} should be in range [{}, {}]",
                        key,
                        pos,
                        low,
                        high
                    );
                }
            }
        }
    }

    // Keys spanning the entire u64 domain, including values above 2^53 where
    // a plain f64 cast would lose precision.
    #[test]
    fn test_full_range_keys() {
        let keys: Vec<u64> = vec![
            0,
            1_000_000,
            1_000_000_000,
            1_000_000_000_000,
            1_000_000_000_000_000,
            u64::MAX / 2,
            u64::MAX - 1000,
            u64::MAX - 100,
            u64::MAX - 10,
            u64::MAX - 1,
        ];

        let idx = LearnedSparseIndex::build(&keys);

        for (pos, &key) in keys.iter().enumerate() {
            match idx.lookup(key) {
                LookupResult::NotFound => {
                    panic!("Key {} should be found", key);
                }
                LookupResult::Exact(p) => {
                    assert_eq!(p, pos, "Exact position should match");
                }
                LookupResult::Range { low, high } => {
                    assert!(
                        low <= pos && pos <= high,
                        "Key {} at position {} should be in range [{}, {}]",
                        key,
                        pos,
                        low,
                        high
                    );
                }
            }
        }
    }

    // Microsecond-scale timestamps with a fixed stride are exactly linear.
    #[test]
    fn test_timestamp_keys() {
        let base_ts: u64 = 1_700_000_000_000_000;
        let keys: Vec<u64> = (0..1000).map(|i| base_ts + i * 1000).collect();

        let idx = LearnedSparseIndex::build(&keys);

        assert!(
            idx.max_error <= 1,
            "Error for sequential timestamps should be ≤ 1, got {}",
            idx.max_error
        );
        assert!(
            idx.is_efficient(),
            "Sequential timestamp data should be efficient"
        );
    }

    // Hand-built index over keys 0..=99: normalization should be (near-)exact
    // at both ends and close in the middle.
    #[test]
    fn test_normalization_precision() {
        let idx = LearnedSparseIndex {
            slope: 1.0,
            intercept: 0.0,
            max_error: 0,
            corrections: Vec::new(),
            min_key: 0,
            max_key: 99,
            key_range: 99.0,
            num_keys: 100,
            correction_threshold: 64,
        };

        assert!((idx.normalize_key(0) - 0.0).abs() < f64::EPSILON);
        assert!((idx.normalize_key(99) - 99.0).abs() < f64::EPSILON);
        assert!((idx.normalize_key(49) - 49.0).abs() < 0.5);
    }
}