scry-index 0.1.0

A concurrent sorted key-value map backed by learned index structures
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
//! Linear model fitting for learned index nodes.
//!
//! The core algorithm is FMCD (Fastest Minimum Conflict Degree) from the LIPP
//! paper. Given a sorted set of keys and a target array size, it computes a
//! linear model `f(key) = slope * key + intercept` that maps each key to a
//! unique slot with minimum conflicts.

use crate::key::Key;

/// A linear model that predicts slot positions from keys.
///
/// Given a key `k`, the predicted slot index is:
/// `(slope * k.to_model_input() + intercept).round() as usize`
///
/// When `midpoint` is set, the linear formula is overridden with a binary
/// split: keys with `to_exact_ordinal() <= midpoint` predict to slot 0,
/// keys above predict to the last slot. This handles the degenerate case
/// where `f64` cannot distinguish two distinct keys.
#[derive(Debug, Clone, Copy)]
pub struct LinearModel {
    /// Slope of the linear function.
    pub slope: f64,
    /// Intercept of the linear function.
    pub intercept: f64,
    /// When set, overrides linear prediction with a binary split.
    midpoint: Option<i128>,
}

impl LinearModel {
    /// Create a new linear model with the given parameters.
    pub const fn new(slope: f64, intercept: f64) -> Self {
        Self {
            slope,
            intercept,
            midpoint: None,
        }
    }

    /// Create a binary-split model that separates keys at the given midpoint.
    ///
    /// Keys with `to_exact_ordinal() <= midpoint` predict to slot 0;
    /// keys above predict to `array_size - 1`.
    pub const fn binary_split(midpoint: i128) -> Self {
        Self {
            slope: 0.0,
            intercept: 0.0,
            midpoint: Some(midpoint),
        }
    }

    /// Predict the slot index for a key.
    ///
    /// Returns the predicted position clamped to `[0, array_size - 1]`.
    #[inline]
    pub fn predict<K: Key>(&self, key: &K, array_size: usize) -> usize {
        if let Some(mid) = self.midpoint {
            return if key.to_exact_ordinal() <= mid {
                0
            } else {
                array_size.saturating_sub(1)
            };
        }
        let pos = self.slope.mul_add(key.to_model_input(), self.intercept);
        let pos = pos.round().max(0.0) as usize;
        pos.min(array_size.saturating_sub(1))
    }

    /// Predict the slot index from a raw `f64` model input value.
    ///
    /// This avoids needing a `Key` reference, useful in FMCD fitting where
    /// only the `f64` representation is available via a closure.
    ///
    /// Returns the predicted position clamped to `[0, array_size - 1]`.
    #[inline]
    pub fn predict_raw(&self, value: f64, array_size: usize) -> usize {
        let pos = self.slope.mul_add(value, self.intercept);
        let pos = pos.round().max(0.0) as usize;
        pos.min(array_size.saturating_sub(1))
    }

    /// Create a model that maps a single key to slot 0.
    pub const fn constant() -> Self {
        Self {
            slope: 0.0,
            intercept: 0.0,
            midpoint: None,
        }
    }
}

/// Result of FMCD model fitting.
#[derive(Debug)]
pub struct FmcdResult {
    /// The fitted linear model.
    pub model: LinearModel,
    /// The array size needed to achieve zero (or minimal) conflicts.
    pub array_size: usize,
    /// Number of conflicts (keys mapping to the same slot).
    pub conflicts: usize,
}

/// Maximum number of candidate slopes to try in FMCD.
const MAX_FMCD_CANDIDATES: usize = 32;

/// Fit a linear model to sorted keys using the FMCD algorithm.
///
/// Finds a linear function `f(key) = slope * key + intercept` that maps each
/// key to a slot in an array, minimizing the number of collisions (conflicts).
///
/// The algorithm tries candidate slopes derived from the maximum inter-key gap
/// (the FMCD heuristic from the LIPP paper) and selects the slope with the
/// fewest conflicts. Linear interpolation is always tried as a baseline.
///
/// # Arguments
///
/// - `n`: Number of keys (must be > 0).
/// - `key_input`: Closure returning the `f64` model input for the key at index `i`.
/// - `expansion_factor`: Ratio of array size to number of keys. Must be >= 1.0.
///   A factor of 2.0 means the array is twice as large as the key count (50% gaps).
/// - `range_headroom`: Extra key range coverage beyond the current data. When > 0,
///   the model covers `(1 + range_headroom)` times the actual key range and the
///   array grows proportionally, leaving space for future keys beyond the current
///   max without clamping. Must be >= 0.0.
///
/// # Returns
///
/// A [`FmcdResult`] containing the model, array size, and conflict count.
pub fn fit_fmcd(
    n: usize,
    key_input: impl Fn(usize) -> f64,
    expansion_factor: f64,
    range_headroom: f64,
) -> FmcdResult {
    assert!(n > 0, "keys must be non-empty");
    assert!(
        expansion_factor >= 1.0,
        "expansion_factor must be >= 1.0, got {expansion_factor}"
    );

    // Special case: single key
    if n == 1 {
        return FmcdResult {
            model: LinearModel::constant(),
            array_size: 1,
            conflicts: 0,
        };
    }

    let headroom_mult = 1.0 + range_headroom;
    let array_size = (n as f64 * expansion_factor * headroom_mult)
        .ceil()
        .max(n as f64) as usize;

    let first = key_input(0);
    let last = key_input(n - 1);
    let key_range = last - first;

    if key_range.abs() < f64::EPSILON {
        // All keys are the same value. Map all to the middle.
        return FmcdResult {
            model: LinearModel::new(0.0, (array_size / 2) as f64),
            array_size,
            conflicts: n - 1,
        };
    }

    let effective_range = key_range * headroom_mult;

    // --- Candidate 1: Linear interpolation (baseline) ---
    let lin_slope = (array_size - 1) as f64 / effective_range;
    let lin_intercept = -lin_slope * first;
    let lin_model = LinearModel::new(lin_slope, lin_intercept);
    let lin_conflicts = count_conflicts_fast(n, &key_input, &lin_model, array_size);

    if lin_conflicts == 0 {
        return FmcdResult {
            model: lin_model,
            array_size,
            conflicts: 0,
        };
    }

    let mut best_model = lin_model;
    let mut best_conflicts = lin_conflicts;

    // --- FMCD: try slopes derived from the maximum inter-key gap ---
    // The insight: the optimal slope often places the widest-gap pair at
    // consecutive slots. Trying multiples of 1/max_gap searches
    // for the minimum-conflict model.
    let mut max_gap = 0.0_f64;
    for i in 0..n - 1 {
        let gap = key_input(i + 1) - key_input(i);
        max_gap = max_gap.max(gap);
    }

    if max_gap > f64::EPSILON {
        // Upper bound: slope where predict(last) = array_size - 1
        let max_slope = (array_size - 1) as f64 / key_range;

        for j in 1..=MAX_FMCD_CANDIDATES {
            let slope = j as f64 / max_gap;
            if slope > max_slope {
                break;
            }

            let intercept = -slope * first;
            let model = LinearModel::new(slope, intercept);
            let conflicts = count_conflicts_fast(n, &key_input, &model, array_size);

            if conflicts < best_conflicts {
                best_conflicts = conflicts;
                best_model = model;
            }
            if best_conflicts == 0 {
                break;
            }
        }
    }

    FmcdResult {
        model: best_model,
        array_size,
        conflicts: best_conflicts,
    }
}

/// Fast conflict counting for sorted keys.
///
/// Since keys are sorted and the model is monotonic, we only need to check
/// if adjacent keys map to the same slot (no allocation needed).
fn count_conflicts_fast(
    n: usize,
    key_input: &impl Fn(usize) -> f64,
    model: &LinearModel,
    array_size: usize,
) -> usize {
    let mut conflicts = 0;
    let mut prev_slot = model.predict_raw(key_input(0), array_size);
    for i in 1..n {
        let slot = model.predict_raw(key_input(i), array_size);
        if slot == prev_slot {
            conflicts += 1;
        }
        prev_slot = slot;
    }
    conflicts
}

/// Count how many keys collide (map to the same slot).
///
/// Full version: allocates a boolean array. Works for unsorted keys.
#[cfg(test)]
fn count_conflicts(
    n: usize,
    key_input: &impl Fn(usize) -> f64,
    model: &LinearModel,
    array_size: usize,
) -> usize {
    let mut occupied = vec![false; array_size];
    let mut conflicts = 0;
    for i in 0..n {
        let slot = model.predict_raw(key_input(i), array_size);
        if occupied[slot] {
            conflicts += 1;
        } else {
            occupied[slot] = true;
        }
    }
    conflicts
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn single_key() {
        let keys = [42u64];
        let result = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 0.0);
        assert_eq!(result.array_size, 1);
        assert_eq!(result.conflicts, 0);
        assert_eq!(result.model.predict(&42u64, 1), 0);
    }

    #[test]
    fn two_keys() {
        let keys = [10u64, 20];
        let result = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 0.0);
        assert_eq!(result.conflicts, 0);
        assert!(result.array_size >= 2);
        let s1 = result.model.predict(&10u64, result.array_size);
        let s2 = result.model.predict(&20u64, result.array_size);
        assert_ne!(s1, s2, "two keys should map to different slots");
    }

    #[test]
    fn sequential_keys_no_conflicts() {
        let keys: Vec<u64> = (0..100).collect();
        let result = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 0.0);
        assert_eq!(
            result.conflicts, 0,
            "sequential keys with 2x expansion should have zero conflicts"
        );
    }

    #[test]
    fn dense_keys_some_conflicts() {
        let keys: Vec<u64> = vec![1, 2, 3, 100, 200, 300];
        let result = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 1.0, 0.0);
        assert!(result.array_size >= keys.len());
    }

    #[test]
    fn predict_clamps_to_range() {
        let model = LinearModel::new(1.0, -10.0);
        assert_eq!(model.predict(&5u64, 100), 0);
        let model2 = LinearModel::new(1.0, 1000.0);
        assert_eq!(model2.predict(&5u64, 100), 99);
    }

    #[test]
    fn expansion_factor_affects_size() {
        let keys: Vec<u64> = (0..50).collect();
        let r1 = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 1.5, 0.0);
        let r2 = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 3.0, 0.0);
        assert!(r2.array_size > r1.array_size);
    }

    #[test]
    fn identical_keys_handled() {
        let keys = [5u64; 10];
        let result = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 0.0);
        assert_eq!(result.conflicts, 9);
    }

    #[test]
    fn large_key_range() {
        let keys = [0u64, u64::MAX / 2];
        let result = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 0.0);
        assert_eq!(result.conflicts, 0);
    }

    #[test]
    fn signed_keys() {
        let keys: Vec<i64> = vec![-100, -50, 0, 50, 100];
        let result = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 0.0);
        assert_eq!(result.conflicts, 0);
    }

    #[test]
    #[should_panic(expected = "keys must be non-empty")]
    fn empty_keys_panics() {
        fit_fmcd(0, |i: usize| i as f64, 2.0, 0.0);
    }

    #[test]
    #[should_panic(expected = "expansion_factor must be >= 1.0")]
    fn bad_expansion_panics() {
        let keys = [1u64, 2, 3];
        fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 0.5, 0.0);
    }

    #[test]
    fn model_monotonic_for_sorted_keys() {
        let keys: Vec<u64> = (0..1000).map(|i| i * 7 + 3).collect();
        let result = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 0.0);
        let positions: Vec<usize> = keys
            .iter()
            .map(|k| result.model.predict(k, result.array_size))
            .collect();
        for pair in positions.windows(2) {
            assert!(
                pair[0] <= pair[1],
                "model is not monotonic: {} > {}",
                pair[0],
                pair[1]
            );
        }
    }

    #[test]
    fn binary_split_separates_keys() {
        let base: u64 = 1_700_000_000_000_000_000;
        let lo_ord = base.to_exact_ordinal();
        let hi_ord = (base + 1).to_exact_ordinal();
        let midpoint = lo_ord + (hi_ord - lo_ord) / 2;
        let model = LinearModel::binary_split(midpoint);

        // array_size = 2: should map lo to 0, hi to 1
        assert_eq!(model.predict(&base, 2), 0);
        assert_eq!(model.predict(&(base + 1), 2), 1);

        // array_size = 4: should map lo to 0, hi to 3
        assert_eq!(model.predict(&base, 4), 0);
        assert_eq!(model.predict(&(base + 1), 4), 3);
    }

    #[test]
    fn binary_split_many_keys() {
        let base: u64 = 1_700_000_000_000_000_000;
        // Split at midpoint between base+4 and base+5
        let mid = base.to_exact_ordinal() + 4;
        let model = LinearModel::binary_split(mid);

        for i in 0..=4u64 {
            assert_eq!(
                model.predict(&(base + i), 2),
                0,
                "base+{i} should go to slot 0"
            );
        }
        for i in 5..10u64 {
            assert_eq!(
                model.predict(&(base + i), 2),
                1,
                "base+{i} should go to slot 1"
            );
        }
    }

    #[test]
    fn fast_and_full_conflict_count_agree() {
        // Non-uniform keys where conflicts are likely with tight expansion
        let keys: Vec<u64> = vec![1, 2, 3, 4, 5, 100, 200, 300, 400, 500];
        let array_size = 15;
        let first = keys[0].to_model_input();
        let last = keys[keys.len() - 1].to_model_input();
        let slope = (array_size - 1) as f64 / (last - first);
        let intercept = -slope * first;
        let model = LinearModel::new(slope, intercept);

        let ki = |i: usize| keys[i].to_model_input();
        let fast = count_conflicts_fast(keys.len(), &ki, &model, array_size);
        let full = count_conflicts(keys.len(), &ki, &model, array_size);
        assert_eq!(
            fast, full,
            "fast ({fast}) and full ({full}) conflict counts disagree"
        );
    }

    #[test]
    fn fmcd_reduces_conflicts_for_nonuniform_keys() {
        // Non-uniform spacing where FMCD can find a better slope than
        // linear interpolation. Keys with varying gaps benefit from
        // slope optimization.
        let keys: Vec<u64> = vec![0, 1, 2, 3, 10, 11, 12, 13, 20, 21];
        let result = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 0.0);
        assert!(
            result.conflicts <= 2,
            "FMCD should achieve <= 2 conflicts for mildly clustered keys, got {}",
            result.conflicts
        );
    }

    #[test]
    fn headroom_increases_array_size() {
        let keys: Vec<u64> = (0..100).collect();
        let r0 = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 0.0);
        let r1 = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 1.0);
        assert!(
            r1.array_size > r0.array_size,
            "headroom=1.0 should produce larger array: {} vs {}",
            r1.array_size,
            r0.array_size
        );
    }

    #[test]
    fn headroom_keys_not_at_edge() {
        let keys: Vec<u64> = (0..100).collect();
        let result = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 1.0);
        // With headroom=1.0, the last key should map to approximately
        // the middle of the array, not the end.
        let last_slot = result.model.predict(&99u64, result.array_size);
        let midpoint = result.array_size / 2;
        assert!(
            last_slot <= midpoint + midpoint / 4,
            "last key at slot {last_slot}, expected near {midpoint} (array_size={})",
            result.array_size
        );
    }

    #[test]
    fn uniform_still_zero_conflicts_with_fmcd() {
        // Regression guard: sequential keys with good expansion should
        // still get 0 conflicts after the FMCD rewrite.
        let keys: Vec<u64> = (0..1000).collect();
        let result = fit_fmcd(keys.len(), |i| keys[i].to_model_input(), 2.0, 0.0);
        assert_eq!(
            result.conflicts, 0,
            "sequential keys should have 0 conflicts"
        );
    }
}