1use serde::{Deserialize, Serialize};
44
/// Outcome of a key lookup against a learned index.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum LookupResult {
    /// The key's exact position is known (served from the correction table).
    Exact(usize),
    /// The key, if present, lies somewhere in positions `low..=high`.
    Range { low: usize, high: usize },
    /// The key is outside the indexed key range, or the index is empty.
    NotFound,
}
55
/// A learned sparse index: a single linear model (`slope`/`intercept`) fit
/// over normalized keys, plus an exact-position correction table for keys the
/// model predicts poorly.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LearnedSparseIndex {
    // Linear model: predicted position = slope * normalized_key + intercept.
    slope: f64,
    intercept: f64,
    // Largest |actual - predicted| observed; bounds the search range on lookup.
    max_error: usize,
    // (key, exact position) pairs, sorted by key, for badly-predicted keys.
    corrections: Vec<(u64, usize)>,
    // Smallest and largest indexed key; lookups outside are NotFound.
    min_key: u64,
    max_key: u64,
    // (max_key - min_key) as f64; 0.0 when empty or all keys are equal.
    key_range: f64,
    num_keys: usize,
    // Prediction error above which a key is added to `corrections`.
    correction_threshold: usize,
}
81
impl LearnedSparseIndex {
    /// Default prediction-error threshold (in positions) above which a key
    /// receives an exact correction entry during `build`.
    const DEFAULT_CORRECTION_THRESHOLD: usize = 64;

86 pub fn empty() -> Self {
88 Self {
89 slope: 0.0,
90 intercept: 0.0,
91 max_error: 0,
92 corrections: Vec::new(),
93 min_key: 0,
94 max_key: 0,
95 key_range: 0.0,
96 num_keys: 0,
97 correction_threshold: Self::DEFAULT_CORRECTION_THRESHOLD,
98 }
99 }
100
101 #[inline]
106 fn normalize_key(&self, key: u64) -> f64 {
107 if self.key_range == 0.0 {
108 return 0.0;
109 }
110 let offset = (key as u128).saturating_sub(self.min_key as u128) as f64;
112 (offset / self.key_range) * (self.num_keys - 1) as f64
113 }
114
    /// Build an index over `keys` (must be sorted ascending) with the
    /// default correction threshold.
    pub fn build(keys: &[u64]) -> Self {
        Self::build_with_threshold(keys, Self::DEFAULT_CORRECTION_THRESHOLD)
    }
121
122 pub fn build_with_threshold(keys: &[u64], correction_threshold: usize) -> Self {
127 let n = keys.len();
128 if n == 0 {
129 return Self::empty();
130 }
131
132 if n == 1 {
133 return Self {
134 slope: 0.0,
135 intercept: 0.0,
136 max_error: 0,
137 corrections: Vec::new(),
138 min_key: keys[0],
139 max_key: keys[0],
140 key_range: 0.0,
141 num_keys: 1,
142 correction_threshold,
143 };
144 }
145
146 let min_key = keys[0];
147 let max_key = keys[n - 1];
148 let key_range = (max_key as u128 - min_key as u128) as f64;
150
151 let (slope, intercept) = Self::linear_regression_normalized(keys, min_key, key_range, n);
153
154 let mut max_error = 0usize;
156 let mut corrections = Vec::new();
157
158 for (actual_pos, &key) in keys.iter().enumerate() {
159 let normalized = if key_range == 0.0 {
161 0.0
162 } else {
163 let offset = (key as u128 - min_key as u128) as f64;
164 (offset / key_range) * (n - 1) as f64
165 };
166 let predicted = slope * normalized + intercept;
167 let predicted_pos = predicted.round() as isize;
168 let error = (actual_pos as isize - predicted_pos).unsigned_abs();
169
170 if error > max_error {
171 max_error = error;
172 }
173
174 if error > correction_threshold {
176 corrections.push((key, actual_pos));
177 }
178 }
179
180 Self {
181 slope,
182 intercept,
183 max_error,
184 corrections,
185 min_key,
186 max_key,
187 key_range,
188 num_keys: n,
189 correction_threshold,
190 }
191 }
192
193 pub fn lookup(&self, key: u64) -> LookupResult {
195 if self.num_keys == 0 {
196 return LookupResult::NotFound;
197 }
198
199 if key < self.min_key || key > self.max_key {
201 return LookupResult::NotFound;
202 }
203
204 if let Ok(idx) = self.corrections.binary_search_by_key(&key, |&(k, _)| k) {
207 return LookupResult::Exact(self.corrections[idx].1);
208 }
209
210 let normalized = self.normalize_key(key);
212 let predicted = self.slope * normalized + self.intercept;
213 let predicted_pos = predicted.round() as isize;
214
215 let low = (predicted_pos - self.max_error as isize).max(0) as usize;
217 let high =
218 (predicted_pos + self.max_error as isize).min(self.num_keys as isize - 1) as usize;
219
220 LookupResult::Range { low, high }
221 }
222
223 pub fn lookup_with_error(&self, key: u64, max_error: usize) -> LookupResult {
225 if self.num_keys == 0 {
226 return LookupResult::NotFound;
227 }
228
229 if key < self.min_key || key > self.max_key {
230 return LookupResult::NotFound;
231 }
232
233 if let Ok(idx) = self.corrections.binary_search_by_key(&key, |&(k, _)| k) {
234 return LookupResult::Exact(self.corrections[idx].1);
235 }
236
237 let normalized = self.normalize_key(key);
239 let predicted = self.slope * normalized + self.intercept;
240 let predicted_pos = predicted.round() as isize;
241
242 let low = (predicted_pos - max_error as isize).max(0) as usize;
243 let high = (predicted_pos + max_error as isize).min(self.num_keys as isize - 1) as usize;
244
245 LookupResult::Range { low, high }
246 }
247
248 pub fn stats(&self) -> LearnedIndexStats {
250 LearnedIndexStats {
251 num_keys: self.num_keys,
252 max_error: self.max_error,
253 num_corrections: self.corrections.len(),
254 slope: self.slope,
255 intercept: self.intercept,
256 correction_ratio: if self.num_keys > 0 {
257 self.corrections.len() as f64 / self.num_keys as f64
258 } else {
259 0.0
260 },
261 }
262 }
263
264 pub fn is_efficient(&self) -> bool {
267 let low_error = self.max_error <= 128;
271 let low_corrections =
272 self.num_keys == 0 || (self.corrections.len() as f64 / self.num_keys as f64) < 0.05;
273 low_error && low_corrections
274 }
275
276 pub fn memory_bytes(&self) -> usize {
278 std::mem::size_of::<Self>()
279 + self.corrections.len() * (std::mem::size_of::<u64>() + std::mem::size_of::<usize>())
280 }
281
    /// Least-squares fit of sorted position (`y = 0..n-1`) against the
    /// normalized key (`x`), using the same normalization as `normalize_key`
    /// so lookup-time predictions match the fitted model.
    ///
    /// Returns `(slope, intercept)`; falls back to a horizontal line through
    /// the mean position when the x spread is numerically zero.
    fn linear_regression_normalized(
        keys: &[u64],
        min_key: u64,
        key_range: f64,
        n: usize,
    ) -> (f64, f64) {
        let n_f64 = n as f64;

        // Running sums for the closed-form least-squares solution.
        let mut sum_x: f64 = 0.0;
        let mut sum_y: f64 = 0.0;
        let mut sum_xy: f64 = 0.0;
        let mut sum_xx: f64 = 0.0;

        for (i, &key) in keys.iter().enumerate() {
            // Normalize exactly as `normalize_key` does; u128 avoids overflow
            // for keys near u64::MAX.
            let x = if key_range == 0.0 {
                0.0
            } else {
                let offset = (key as u128 - min_key as u128) as f64;
                (offset / key_range) * (n - 1) as f64
            };
            let y = i as f64;

            sum_x += x;
            sum_y += y;
            sum_xy += x * y;
            sum_xx += x * x;
        }

        let denominator = n_f64 * sum_xx - sum_x * sum_x;

        // Degenerate x spread (all keys normalize equal): horizontal line
        // through the mean position.
        if denominator.abs() < f64::EPSILON {
            return (0.0, sum_y / n_f64);
        }

        let slope = (n_f64 * sum_xy - sum_x * sum_y) / denominator;
        let intercept = (sum_y - slope * sum_x) / n_f64;

        (slope, intercept)
    }
328
329 #[allow(dead_code)]
331 fn linear_regression(keys: &[u64]) -> (f64, f64) {
332 let n = keys.len() as f64;
333
334 let mut sum_x: f64 = 0.0;
336 let mut sum_y: f64 = 0.0;
337 let mut sum_xy: f64 = 0.0;
338 let mut sum_xx: f64 = 0.0;
339
340 for (i, &key) in keys.iter().enumerate() {
341 let x = key as f64;
342 let y = i as f64;
343 sum_x += x;
344 sum_y += y;
345 sum_xy += x * y;
346 sum_xx += x * x;
347 }
348
349 let denominator = n * sum_xx - sum_x * sum_x;
350
351 if denominator.abs() < f64::EPSILON {
353 return (0.0, sum_y / n);
354 }
355
356 let slope = (n * sum_xy - sum_x * sum_y) / denominator;
357 let intercept = (sum_y - slope * sum_x) / n;
358
359 (slope, intercept)
360 }
361
362 pub fn insert(&mut self, key: u64, position: usize, keys: &[u64]) -> bool {
365 let normalized = self.normalize_key(key);
367 let predicted = self.slope * normalized + self.intercept;
368 let predicted_pos = predicted.round() as isize;
369 let error = (position as isize - predicted_pos).unsigned_abs();
370
371 self.min_key = self.min_key.min(key);
373 self.max_key = self.max_key.max(key);
374 self.key_range = (self.max_key as u128 - self.min_key as u128) as f64;
376 self.num_keys += 1;
377
378 if error > self.max_error {
379 self.max_error = error;
380 }
381
382 if error > self.correction_threshold {
383 match self.corrections.binary_search_by_key(&key, |&(k, _)| k) {
385 Ok(idx) => self.corrections[idx] = (key, position),
386 Err(idx) => self.corrections.insert(idx, (key, position)),
387 }
388 }
389
390 if self.corrections.len() > self.num_keys / 10 {
392 *self = Self::build_with_threshold(keys, self.correction_threshold);
393 return true;
394 }
395
396 false
397 }
398}
399
/// Read-only snapshot of a `LearnedSparseIndex`'s quality metrics.
#[derive(Debug, Clone)]
pub struct LearnedIndexStats {
    pub num_keys: usize,
    /// Worst observed |actual - predicted| position at build time.
    pub max_error: usize,
    /// Number of exact-position correction entries.
    pub num_corrections: usize,
    pub slope: f64,
    pub intercept: f64,
    /// num_corrections / num_keys (0.0 when the index is empty).
    pub correction_ratio: f64,
}
416
/// Independent learned indexes over contiguous key ranges, for key
/// distributions that a single linear model fits poorly.
#[derive(Debug, Clone)]
pub struct PiecewiseLearnedIndex {
    // First key of each segment, sorted ascending; parallel to `segments`.
    boundaries: Vec<u64>,
    segments: Vec<LearnedSparseIndex>,
}
427
impl PiecewiseLearnedIndex {
    /// Split sorted `keys` into at most `max_segments` contiguous chunks and
    /// fit an independent learned index per chunk. Returns an empty index
    /// when `keys` is empty or `max_segments` is 0.
    pub fn build(keys: &[u64], max_segments: usize) -> Self {
        if keys.is_empty() || max_segments == 0 {
            return Self {
                boundaries: vec![],
                segments: vec![],
            };
        }

        // div_ceil so every key lands in a chunk without exceeding
        // max_segments chunks. `chunks` never yields an empty slice, so the
        // per-chunk emptiness check the original carried was dead code and
        // `chunk[0]` cannot panic.
        let segment_size = keys.len().div_ceil(max_segments);
        let (boundaries, segments) = keys
            .chunks(segment_size)
            .map(|chunk| (chunk[0], LearnedSparseIndex::build(chunk)))
            .unzip();

        Self {
            boundaries,
            segments,
        }
    }
455
456 fn find_segment(&self, key: u64) -> Option<usize> {
458 if self.boundaries.is_empty() {
459 return None;
460 }
461
462 match self.boundaries.binary_search(&key) {
464 Ok(i) => Some(i),
465 Err(i) => {
466 if i == 0 {
467 None
468 } else {
469 Some(i - 1)
470 }
471 }
472 }
473 }
474
475 pub fn lookup(&self, key: u64) -> LookupResult {
477 match self.find_segment(key) {
478 Some(seg_idx) => self.segments[seg_idx].lookup(key),
479 None => LookupResult::NotFound,
480 }
481 }
482
483 pub fn stats(&self) -> PiecewiseStats {
485 let segment_stats: Vec<_> = self.segments.iter().map(|s| s.stats()).collect();
486 let total_keys: usize = segment_stats.iter().map(|s| s.num_keys).sum();
487 let max_error = segment_stats.iter().map(|s| s.max_error).max().unwrap_or(0);
488 let total_corrections: usize = segment_stats.iter().map(|s| s.num_corrections).sum();
489
490 PiecewiseStats {
491 num_segments: self.segments.len(),
492 total_keys,
493 max_error,
494 total_corrections,
495 avg_segment_size: if self.segments.is_empty() {
496 0.0
497 } else {
498 total_keys as f64 / self.segments.len() as f64
499 },
500 }
501 }
502}
503
/// Aggregated statistics over all segments of a `PiecewiseLearnedIndex`.
#[derive(Debug, Clone)]
pub struct PiecewiseStats {
    pub num_segments: usize,
    pub total_keys: usize,
    /// Largest per-segment max_error.
    pub max_error: usize,
    pub total_corrections: usize,
    pub avg_segment_size: f64,
}
513
#[cfg(test)]
mod tests {
    use super::*;

    // An empty index must report NotFound for any key.
    #[test]
    fn test_empty_index() {
        let index = LearnedSparseIndex::build(&[]);
        assert_eq!(index.lookup(42), LookupResult::NotFound);
        assert_eq!(index.stats().num_keys, 0);
    }

    // A single key yields the degenerate Range {0, 0}; others miss.
    #[test]
    fn test_single_key() {
        let index = LearnedSparseIndex::build(&[100]);
        assert!(matches!(
            index.lookup(100),
            LookupResult::Range { low: 0, high: 0 }
        ));
        assert_eq!(index.lookup(50), LookupResult::NotFound);
        assert_eq!(index.lookup(150), LookupResult::NotFound);
    }

    // Perfectly linear data: model near-exact, no corrections expected.
    #[test]
    fn test_sequential_keys() {
        let keys: Vec<u64> = (0..1000).collect();
        let index = LearnedSparseIndex::build(&keys);

        let stats = index.stats();
        assert!(
            stats.max_error <= 1,
            "Sequential keys should have near-zero error"
        );
        assert!(
            stats.num_corrections == 0,
            "No corrections needed for linear data"
        );

        if let LookupResult::Range { low, high } = index.lookup(500) {
            assert!(low <= 500 && high >= 500, "Key 500 should be in range");
            assert!(high - low <= 2, "Range should be very tight");
        }
    }

    // Monotonic timestamp-like keys with a mildly irregular stride.
    #[test]
    fn test_timestamp_like_keys() {
        let mut keys: Vec<u64> = Vec::new();
        let mut ts: u64 = 1704067200;
        for _ in 0..10000 {
            keys.push(ts);
            // Variable (but roughly constant) stride.
            ts += 1 + (ts % 10);
        }

        let index = LearnedSparseIndex::build(&keys);

        assert!(
            index.is_efficient(),
            "Timestamp data should be efficiently indexable"
        );

        for &key in keys.iter().take(100) {
            let result = index.lookup(key);
            assert!(
                !matches!(result, LookupResult::NotFound),
                "Existing key should be found"
            );
        }
    }

    // Exponentially spread keys: every key must still land in its range.
    #[test]
    fn test_sparse_keys() {
        let keys: Vec<u64> = vec![1, 100, 10000, 1000000, 100000000];
        let index = LearnedSparseIndex::build(&keys);

        for (i, &key) in keys.iter().enumerate() {
            match index.lookup(key) {
                LookupResult::Exact(pos) => assert_eq!(pos, i),
                LookupResult::Range { low, high } => {
                    assert!(
                        low <= i && i <= high,
                        "Key {} should be in range [{}, {}]",
                        key,
                        low,
                        high
                    );
                }
                LookupResult::NotFound => panic!("Key {} should be found", key),
            }
        }
    }

    // Keys outside [min_key, max_key] must miss.
    #[test]
    fn test_out_of_bounds() {
        let keys: Vec<u64> = (100..200).collect();
        let index = LearnedSparseIndex::build(&keys);

        assert_eq!(index.lookup(50), LookupResult::NotFound);
        assert_eq!(index.lookup(250), LookupResult::NotFound);
    }

    // Three widely separated clusters, expected to map to one segment each.
    #[test]
    fn test_piecewise_index() {
        let mut keys: Vec<u64> = Vec::new();
        keys.extend(0..1000);
        keys.extend((100000..101000).step_by(10));
        keys.extend(1000000..1001000);

        let piecewise = PiecewiseLearnedIndex::build(&keys, 3);
        let stats = piecewise.stats();

        assert_eq!(stats.num_segments, 3);

        // One probe inside each cluster.
        assert!(!matches!(piecewise.lookup(500), LookupResult::NotFound));
        assert!(!matches!(piecewise.lookup(100500), LookupResult::NotFound));
        assert!(!matches!(piecewise.lookup(1000500), LookupResult::NotFound));
    }

    // The learned index should be far smaller than storing all keys.
    #[test]
    fn test_memory_efficiency() {
        let keys: Vec<u64> = (0..100000).collect();
        let index = LearnedSparseIndex::build(&keys);

        let lsi_bytes = index.memory_bytes();
        let btree_bytes = keys.len() * std::mem::size_of::<u64>();
        assert!(
            lsi_bytes < btree_bytes,
            "LSI ({} bytes) should use less memory than keys alone ({} bytes)",
            lsi_bytes,
            btree_bytes
        );
    }

    // A lower threshold must never produce fewer corrections.
    #[test]
    fn test_correction_threshold() {
        let mut keys: Vec<u64> = (0..100).map(|x| x * 10).collect();
        // One outlier to give the model something to mispredict.
        keys.push(5000);
        keys.sort();

        let low_thresh = LearnedSparseIndex::build_with_threshold(&keys, 10);

        let high_thresh = LearnedSparseIndex::build_with_threshold(&keys, 1000);

        assert!(
            low_thresh.stats().num_corrections >= high_thresh.stats().num_corrections,
            "Lower threshold should produce more or equal corrections"
        );
    }

    // Keys near u64::MAX exercise the u128 normalization path.
    #[test]
    fn test_large_key_normalization() {
        let base = u64::MAX - 1000;
        let keys: Vec<u64> = (0..100).map(|i| base + i * 10).collect();

        let index = LearnedSparseIndex::build(&keys);

        assert!(
            index.max_error < 10,
            "Error should be small for linear data"
        );

        for (i, &key) in keys.iter().enumerate() {
            let result = index.lookup(key);
            match result {
                LookupResult::Range { low, high } => {
                    assert!(
                        low <= i && i <= high,
                        "Key {} at position {} should be in range [{}, {}]",
                        key,
                        i,
                        low,
                        high
                    );
                }
                LookupResult::Exact(pos) => {
                    assert_eq!(pos, i, "Exact position should match");
                }
                LookupResult::NotFound => {
                    panic!("Key {} should be found", key);
                }
            }
        }
    }

    // Keys spanning nearly the full u64 domain.
    #[test]
    fn test_full_range_keys() {
        let keys: Vec<u64> = vec![
            0,
            1_000_000,
            1_000_000_000,
            1_000_000_000_000,
            1_000_000_000_000_000,
            u64::MAX / 2,
            u64::MAX - 1000,
            u64::MAX - 100,
            u64::MAX - 10,
            u64::MAX - 1,
        ];

        let index = LearnedSparseIndex::build(&keys);

        for (i, &key) in keys.iter().enumerate() {
            let result = index.lookup(key);
            match result {
                LookupResult::Range { low, high } => {
                    assert!(
                        low <= i && i <= high,
                        "Key {} at position {} should be in range [{}, {}]",
                        key,
                        i,
                        low,
                        high
                    );
                }
                LookupResult::Exact(pos) => {
                    assert_eq!(pos, i, "Exact position should match");
                }
                LookupResult::NotFound => {
                    panic!("Key {} should be found", key);
                }
            }
        }
    }

    // Microsecond-scale timestamps at a fixed stride.
    #[test]
    fn test_timestamp_keys() {
        let base_ts: u64 = 1_700_000_000_000_000;
        let keys: Vec<u64> = (0..1000).map(|i| base_ts + i * 1000).collect();

        let index = LearnedSparseIndex::build(&keys);

        assert!(
            index.max_error <= 1,
            "Error for sequential timestamps should be ≤ 1, got {}",
            index.max_error
        );

        assert!(
            index.is_efficient(),
            "Sequential timestamp data should be efficient"
        );
    }

    // normalize_key on a hand-built index: endpoints map exactly, midpoint
    // maps close to its position.
    #[test]
    fn test_normalization_precision() {
        let index = LearnedSparseIndex {
            slope: 1.0,
            intercept: 0.0,
            max_error: 0,
            corrections: Vec::new(),
            min_key: 0,
            max_key: 99,
            key_range: 99.0,
            num_keys: 100,
            correction_threshold: 64,
        };

        assert!((index.normalize_key(0) - 0.0).abs() < f64::EPSILON);

        assert!((index.normalize_key(99) - 99.0).abs() < f64::EPSILON);

        assert!((index.normalize_key(49) - 49.0).abs() < 0.5);
    }
}