spg-storage 7.34.3

In-memory storage primitives for SPG: values, rows, table schema, catalog with foreign-key constraints.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
//! HNSW vector-index algorithms: insert/connect, greedy + beam
//! search, neighbour heuristics, and the f32/SQ8 distance
//! kernels. Split out of lib.rs (monster tier-3 cut 2). The
//! `NswGraph` data type itself stays in lib.rs as index
//! vocabulary; this module is the algorithms over it.

use super::*;

/// Insert one row into the HNSW graph held by index slot `idx_pos`.
/// No-op when the row's value at the indexed column isn't a vector.
/// v6.0.1: handles `Value::Sq8Vector` by dequantising into an f32
/// "query" surface — the existing greedy + beam-search machinery
/// then uses `cell_to_query_metric_distance` to route every
/// distance call through the cell's actual encoding.
pub(crate) fn nsw_insert_at(table: &mut Table, idx_pos: usize, new_row_idx: usize) {
    let col_pos = table.indices[idx_pos].column_position;
    let cell_dim: Option<usize> = match &table.rows[new_row_idx].values[col_pos] {
        Value::Vector(v) => Some(v.len()),
        Value::Sq8Vector(q) => Some(q.bytes.len()),
        Value::HalfVector(h) => Some(h.dim()),
        _ => None,
    };
    let Some(dim) = cell_dim else {
        // Even non-vector rows occupy a level slot so per-node Vec
        // lengths stay aligned with `table.rows.len()`.
        ensure_node_slot(table, idx_pos, new_row_idx, 0);
        return;
    };
    if dim == 0 {
        ensure_node_slot(table, idx_pos, new_row_idx, 0);
        return;
    }
    let level = nsw_assign_level(new_row_idx);
    ensure_node_slot(table, idx_pos, new_row_idx, level);
    let (entry, entry_level, m) = match &table.indices[idx_pos].kind {
        IndexKind::Nsw(g) => (g.entry, g.entry_level, g.m),
        IndexKind::BTree(_)
        | IndexKind::Brin { .. }
        | IndexKind::Gin(_)
        | IndexKind::GinTrgm(_)
        | IndexKind::GinFulltext(_) => {
            unreachable!("nsw_insert_at on a non-NSW index")
        }
    };
    // First node ever — declare it the entry (it gets its own level).
    if entry.is_none() {
        if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
            g.entry = Some(new_row_idx);
            g.entry_level = level;
            *g.levels
                .get_mut(new_row_idx)
                .expect("levels slot padded by ensure_node_slot") = level;
        }
        return;
    }
    // Set the node's recorded level.
    if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
        *g.levels
            .get_mut(new_row_idx)
            .expect("levels slot padded by ensure_node_slot") = level;
    }
    let query = match &table.rows[new_row_idx].values[col_pos] {
        Value::Vector(v) => v.clone(),
        // v6.0.1: dequantise the inserted SQ8 cell into an f32 query
        // surface so the existing greedy / beam machinery can route
        // distances through `cell_to_query_metric_distance`. The
        // small dequantisation error is what the recall@10 ≥ 0.95
        // envelope already accounts for (V6_DESIGN deliberation #3).
        Value::Sq8Vector(q) => quantize::dequantize(q),
        // v6.0.3: halfvec dequant is bit-exact at the storage layer,
        // so the inserted query is a faithful representation.
        Value::HalfVector(h) => h.to_f32_vec(),
        _ => return,
    };
    // Phase 1: greedy descend from `entry` down to `level + 1`, keeping
    // exactly one current best so the next layer starts from it.
    let mut current = entry.expect("entry was Some above");
    let mut current_d = vec_l2_sq(table, col_pos, current, &query);
    if entry_level > level {
        for layer in (level + 1..=entry_level).rev() {
            (current, current_d) =
                greedy_layer_walk(table, idx_pos, layer, current, current_d, &query);
        }
    }
    // Phase 2: from `min(level, entry_level)` down to 0, beam-search
    // `ef_construction` candidates, run the HNSW §4 heuristic neighbour
    // selection over them, and connect bidirectionally.
    let top = level.min(entry_level);
    let ef = (m * 2).max(8);
    for layer in (0..=top).rev() {
        let cap = if layer == 0 { m * 2 } else { m };
        let mut candidates = layer_beam_search(
            table,
            idx_pos,
            layer,
            current,
            current_d,
            &query,
            ef,
            NswMetric::L2,
        );
        candidates.retain(|&(_, n)| n != new_row_idx);
        // Take the closest as the entry for the next layer down — done
        // before heuristic narrowing because the heuristic can reorder.
        if let Some(&(d, n)) = candidates.first() {
            current = n;
            current_d = d;
        }
        let peers = select_neighbours_heuristic(&candidates, cap, table, col_pos);
        connect_at_layer(table, idx_pos, layer, new_row_idx, &peers);
    }
    // Phase 3: if the new node climbed above the current entry, take
    // over as entry so future inserts/searches start from the new top.
    if level > entry_level
        && let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
    {
        g.entry = Some(new_row_idx);
        g.entry_level = level;
    }
}

/// Make sure `layers[*][new_row_idx]` and `levels[new_row_idx]` exist,
/// padding with empty/zero entries as needed. Also grows `layers` to
/// accommodate the node's top `level`.
fn ensure_node_slot(table: &mut Table, idx_pos: usize, new_row_idx: usize, level: u8) {
    let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind else {
        unreachable!("ensure_node_slot on a BTree index");
    };
    while g.layers.len() <= level as usize {
        g.layers.push(PersistentVec::new());
    }
    while g.levels.len() <= new_row_idx {
        g.levels.push_mut(0);
    }
    for layer_vec in &mut g.layers {
        while layer_vec.len() <= new_row_idx {
            layer_vec.push_mut(Vec::new());
        }
    }
}

/// Single-step greedy walk on one layer: from `current` (with cached
/// distance `current_d`), inspect that node's neighbours at `layer` and
/// hop to the closest if it beats `current_d`. Repeat until no move
/// improves the distance. Cheap variant of beam-search used for the
/// "descend" phase that only needs one survivor per layer.
fn greedy_layer_walk(
    table: &Table,
    idx_pos: usize,
    layer: u8,
    mut current: usize,
    mut current_d: f32,
    query: &[f32],
) -> (usize, f32) {
    let g = match &table.indices[idx_pos].kind {
        IndexKind::Nsw(g) => g,
        IndexKind::BTree(_)
        | IndexKind::Brin { .. }
        | IndexKind::Gin(_)
        | IndexKind::GinTrgm(_)
        | IndexKind::GinFulltext(_) => {
            return (current, current_d);
        }
    };
    let col_pos = table.indices[idx_pos].column_position;
    loop {
        let neighbours: &[u32] = g
            .layers
            .get(layer as usize)
            .and_then(|layer_v| layer_v.get(current))
            .map_or(&[][..], Vec::as_slice);
        let mut best = current;
        let mut best_d = current_d;
        for &n in neighbours {
            let n = n as usize;
            let d = vec_l2_sq(table, col_pos, n, query);
            if d < best_d {
                best = n;
                best_d = d;
            }
        }
        if best == current {
            return (current, current_d);
        }
        current = best;
        current_d = best_d;
    }
}

/// Beam search on one layer starting from `entry_node` with cached
/// `entry_d`. Returns the top `ef` candidates in ascending-distance
/// order. Caller picks the closest as the next layer's entry and / or
/// trims to M for connection.
///
/// v3.0.1: uses two `BinaryHeap`s (min-heap for the open frontier,
/// max-heap for the working top-`ef` results) and a `Vec<bool>` visited
/// bitmap, replacing the v2.x `Vec` + `partition_point` + `BTreeSet`
/// implementation. Same algorithm shape (HNSW search algorithm 2 from
/// the paper); the data-structure swap cuts per-visit cost from
/// `O(ef + log row_count)` to amortised `O(log ef)`.
#[allow(clippy::too_many_arguments)] // Beam search threads layer, entry, query, ef, metric — each is intrinsic. Bundling them into a config struct hides the call sites.
fn layer_beam_search(
    table: &Table,
    idx_pos: usize,
    layer: u8,
    entry_node: usize,
    entry_d: f32,
    query: &[f32],
    ef: usize,
    metric: NswMetric,
) -> Vec<(f32, usize)> {
    let g = match &table.indices[idx_pos].kind {
        IndexKind::Nsw(g) => g,
        IndexKind::BTree(_)
        | IndexKind::Brin { .. }
        | IndexKind::Gin(_)
        | IndexKind::GinTrgm(_)
        | IndexKind::GinFulltext(_) => return Vec::new(),
    };
    let col_pos = table.indices[idx_pos].column_position;
    let d0 = if matches!(metric, NswMetric::L2) {
        entry_d
    } else {
        cell_to_query_metric_distance(table, col_pos, entry_node, query, metric)
    };
    let row_count = table.rows.len();
    let mut visited: Vec<bool> = alloc::vec![false; row_count];
    if entry_node < row_count {
        visited[entry_node] = true;
    }
    // candidates: min-heap by distance (Closest wrapper) — frontier
    // results:    max-heap by distance (Furthest wrapper) — top-ef working set
    let mut candidates: alloc::collections::BinaryHeap<NodeClosest> =
        alloc::collections::BinaryHeap::with_capacity(ef);
    let mut results: alloc::collections::BinaryHeap<NodeFurthest> =
        alloc::collections::BinaryHeap::with_capacity(ef);
    candidates.push(NodeClosest {
        dist: d0,
        node: entry_node,
    });
    results.push(NodeFurthest {
        dist: d0,
        node: entry_node,
    });
    while let Some(cur) = candidates.pop() {
        let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
        if cur.dist > worst && results.len() >= ef {
            break;
        }
        let neighbours: &[u32] = g
            .layers
            .get(layer as usize)
            .and_then(|layer_v| layer_v.get(cur.node))
            .map_or(&[][..], Vec::as_slice);
        for &n in neighbours {
            let n = n as usize;
            if n >= row_count || visited[n] {
                continue;
            }
            visited[n] = true;
            // v6.0.1: cell-aware distance — F32 cells take the
            // existing scalar metric, SQ8 cells route through
            // the asymmetric ADC variant for the same metric.
            let dn = cell_to_query_metric_distance(table, col_pos, n, query, metric);
            if !dn.is_finite() {
                continue;
            }
            let worst = results.peek().map_or(f32::INFINITY, |c| c.dist);
            if results.len() < ef || dn < worst {
                results.push(NodeFurthest { dist: dn, node: n });
                if results.len() > ef {
                    results.pop();
                }
                candidates.push(NodeClosest { dist: dn, node: n });
            }
        }
    }
    // Drain results (max-heap order) and re-sort ascending so callers
    // can take `closest = result[0]` without flipping.
    let mut out: Vec<(f32, usize)> = results.into_iter().map(|c| (c.dist, c.node)).collect();
    out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
    out
}

/// Min-heap wrapper: smaller `dist` → higher priority in a `BinaryHeap`
/// (which is a max-heap), so we flip the comparison. NaN sorts last
/// (lowest priority) to keep the heap total-ordered.
#[derive(Debug, Clone, Copy)]
struct NodeClosest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeClosest {
    fn eq(&self, other: &Self) -> bool {
        self.dist == other.dist && self.node == other.node
    }
}
impl Eq for NodeClosest {}
impl PartialOrd for NodeClosest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeClosest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        // Reversed: smaller dist = greater priority.
        other
            .dist
            .partial_cmp(&self.dist)
            .unwrap_or(core::cmp::Ordering::Equal)
    }
}

/// Max-heap wrapper: larger `dist` sits at the top so the worst result
/// can be evicted in O(log n) when a better candidate arrives.
#[derive(Debug, Clone, Copy)]
struct NodeFurthest {
    dist: f32,
    node: usize,
}
impl PartialEq for NodeFurthest {
    fn eq(&self, other: &Self) -> bool {
        self.dist == other.dist && self.node == other.node
    }
}
impl Eq for NodeFurthest {}
impl PartialOrd for NodeFurthest {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for NodeFurthest {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        self.dist
            .partial_cmp(&other.dist)
            .unwrap_or(core::cmp::Ordering::Equal)
    }
}

/// HNSW paper §4 algorithm 4: pick `m` neighbours from `candidates` so
/// that each chosen point isn't already covered by a closer chosen
/// point. Improves graph diversity → fewer hops needed at search time.
///
/// `candidates` arrives sorted ascending by distance-to-query. We walk
/// it in order, keeping a candidate only when no already-chosen point
/// is closer to it than the query is. Result is a vector of row
/// indices (length ≤ `m`).
fn select_neighbours_heuristic(
    candidates: &[(f32, usize)],
    m: usize,
    table: &Table,
    col_pos: usize,
) -> Vec<usize> {
    let mut chosen: Vec<usize> = Vec::with_capacity(m);
    for &(d_q, e) in candidates {
        if chosen.len() >= m {
            break;
        }
        // v6.0.1: works on either `Value::Vector` (F32) or
        // `Value::Sq8Vector` (Sq8) cells — `cell_l2_sq` dispatches
        // on encoding. A non-vector cell yields `f32::INFINITY`
        // which the `< d_q` test will never accept.
        if !matches!(
            table.rows.get(e).and_then(|r| r.values.get(col_pos)),
            Some(Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_))
        ) {
            continue;
        }
        let mut covered = false;
        for &r in &chosen {
            // dist(e, r) measured in the same metric the topology was
            // built with (L2). If a chosen `r` is closer to `e` than
            // the query is, `r` already "covers" `e` for navigation.
            if cell_l2_sq(table, col_pos, e, r) < d_q {
                covered = true;
                break;
            }
        }
        if !covered {
            chosen.push(e);
        }
    }
    chosen
}

/// Bidirectionally connect `new_row_idx` to each of `peers` at `layer`,
/// trimming each endpoint's adjacency to that layer's degree cap by
/// keeping only the closest neighbours.
fn connect_at_layer(
    table: &mut Table,
    idx_pos: usize,
    layer: u8,
    new_row_idx: usize,
    peers: &[usize],
) {
    let col_pos = table.indices[idx_pos].column_position;
    let cap = match &table.indices[idx_pos].kind {
        IndexKind::Nsw(g) => g.cap_for_layer(layer),
        IndexKind::BTree(_)
        | IndexKind::Brin { .. }
        | IndexKind::Gin(_)
        | IndexKind::GinTrgm(_)
        | IndexKind::GinFulltext(_) => return,
    };
    // v6.1.x: NSW adjacency stores neighbour row indices as u32 (4 B
    // each) rather than usize (8 B on 64-bit). Boundary casts here
    // assert the row count fits in u32 — the catalog already enforces
    // ≤ 4G rows per table, so the conversion can't lose data.
    let new_row_u32 = u32::try_from(new_row_idx).expect("row index fits in u32");
    if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
        let layer_v = &mut g.layers[layer as usize];
        if let Some(slot) = layer_v.get_mut(new_row_idx) {
            *slot = peers
                .iter()
                .map(|&p| u32::try_from(p).expect("row index fits in u32"))
                .collect();
        }
    }
    for &peer in peers {
        // Skip peers whose indexed cell isn't a vector — same fence
        // as the F32 path; SQ8 cells flow through `cell_l2_sq`
        // below without dequantising.
        if !matches!(
            &table.rows[peer].values[col_pos],
            Value::Vector(_) | Value::Sq8Vector(_) | Value::HalfVector(_)
        ) {
            continue;
        }
        // 1. add the new node to peer's adjacency
        if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind {
            let layer_v = &mut g.layers[layer as usize];
            if let Some(slot) = layer_v.get_mut(peer)
                && !slot.contains(&new_row_u32)
            {
                slot.push(new_row_u32);
            }
        }
        // 2. if peer is over budget, rebuild its adjacency with the
        //    HNSW §4 heuristic — same diversity criterion as the
        //    insert path so connectivity stays consistent.
        let needs_trim = match &table.indices[idx_pos].kind {
            IndexKind::Nsw(g) => g.layers[layer as usize][peer].len() > cap,
            IndexKind::BTree(_)
            | IndexKind::Brin { .. }
            | IndexKind::Gin(_)
            | IndexKind::GinTrgm(_)
            | IndexKind::GinFulltext(_) => false,
        };
        if needs_trim {
            let current_peers: Vec<usize> = match &table.indices[idx_pos].kind {
                IndexKind::Nsw(g) => g.layers[layer as usize][peer]
                    .iter()
                    .map(|&n| n as usize)
                    .collect(),
                IndexKind::BTree(_)
                | IndexKind::Brin { .. }
                | IndexKind::Gin(_)
                | IndexKind::GinTrgm(_)
                | IndexKind::GinFulltext(_) => continue,
            };
            // Sort by distance from `peer`'s cell ascending so the
            // heuristic receives candidates closest-first. `cell_l2_sq`
            // dispatches on encoding so SQ8 columns trim using
            // symmetric ADC.
            let mut tagged: Vec<(f32, usize)> = current_peers
                .iter()
                .map(|&p| (cell_l2_sq(table, col_pos, peer, p), p))
                .collect();
            tagged.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
            let kept = select_neighbours_heuristic(&tagged, cap, table, col_pos);
            if let IndexKind::Nsw(g) = &mut table.indices[idx_pos].kind
                && let Some(slot) = g.layers[layer as usize].get_mut(peer)
            {
                *slot = kept
                    .into_iter()
                    .map(|p| u32::try_from(p).expect("row index fits in u32"))
                    .collect();
            }
        }
    }
}

/// Squared L2 distance from `query` (raw f32) to the cell at
/// `(row, col_pos)`. Dispatches on cell encoding: `Value::Vector`
/// (F32) uses `l2_distance_sq`; `Value::Sq8Vector` uses
/// `sq8_l2_distance_sq_asymmetric` (the v6.0.1 quantised path).
/// Returns `f32::INFINITY` for any non-vector cell so callers can
/// compare uniformly.
fn vec_l2_sq(table: &Table, col_pos: usize, row: usize, query: &[f32]) -> f32 {
    match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
        Some(Value::Vector(v)) if v.len() == query.len() => l2_distance_sq(v, query),
        Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => {
            quantize::sq8_l2_distance_sq_asymmetric(q, query)
        }
        // v6.0.6: halfvec → fused NEON SIMD kernel; no Vec<f32>
        // allocation. v6.0.3 used `to_f32_vec()` + f32 NEON which
        // was correct but allocated per call (5× slower than F32).
        Some(Value::HalfVector(h)) if h.dim() == query.len() => {
            halfvec::half_l2_distance_sq_asymmetric(h, query)
        }
        _ => f32::INFINITY,
    }
}

/// Squared L2 distance between two stored cells (no f32 query in
/// sight). Used during HNSW graph build — both endpoints are
/// rows already in the table, so symmetric ADC applies for SQ8
/// columns. Mixed-encoding cells within one column are a
/// schema-level impossibility (INSERT-time coercion enforces
/// uniform encoding), so the catch-all is an abort.
fn cell_l2_sq(table: &Table, col_pos: usize, row_a: usize, row_b: usize) -> f32 {
    let Some(cell_a) = table.rows.get(row_a).and_then(|r| r.values.get(col_pos)) else {
        return f32::INFINITY;
    };
    let Some(cell_b) = table.rows.get(row_b).and_then(|r| r.values.get(col_pos)) else {
        return f32::INFINITY;
    };
    match (cell_a, cell_b) {
        (Value::Vector(a), Value::Vector(b)) if a.len() == b.len() => l2_distance_sq(a, b),
        (Value::Sq8Vector(a), Value::Sq8Vector(b)) if a.bytes.len() == b.bytes.len() => {
            quantize::sq8_l2_distance_sq(a, b)
        }
        // v6.0.6: halfvec symmetric NEON — fused SIMD kernel that
        // loads both cells' raw u16 bits, expands to f32 lanes
        // inline, FMA-accumulates the squared diff. No Vec<f32>
        // allocation per call.
        (Value::HalfVector(a), Value::HalfVector(b)) if a.dim() == b.dim() => {
            halfvec::half_l2_distance_sq(a, b)
        }
        _ => f32::INFINITY,
    }
}

/// kNN-search-time distance: stored cell → f32 query under the
/// caller's metric. Dispatches on cell encoding so SQ8 columns
/// take the ADC path with the right asymmetric variant. NaN /
/// dim-mismatch / non-vector → `f32::INFINITY`.
fn cell_to_query_metric_distance(
    table: &Table,
    col_pos: usize,
    row: usize,
    query: &[f32],
    metric: NswMetric,
) -> f32 {
    match table.rows.get(row).and_then(|r| r.values.get(col_pos)) {
        Some(Value::Vector(v)) if v.len() == query.len() => metric_distance(metric, v, query),
        Some(Value::Sq8Vector(q)) if q.bytes.len() == query.len() => match metric {
            NswMetric::L2 => quantize::sq8_l2_distance_sq_asymmetric(q, query),
            NswMetric::InnerProduct => quantize::sq8_inner_product_asymmetric(q, query),
            NswMetric::Cosine => quantize::sq8_cosine_distance_asymmetric(q, query),
        },
        // v6.0.6: halfvec dispatches by metric to fused NEON
        // kernels — no Vec<f32> allocation per call.
        Some(Value::HalfVector(h)) if h.dim() == query.len() => match metric {
            NswMetric::L2 => halfvec::half_l2_distance_sq_asymmetric(h, query),
            NswMetric::InnerProduct => halfvec::half_inner_product_asymmetric(h, query),
            NswMetric::Cosine => halfvec::half_cosine_distance_asymmetric(h, query),
        },
        _ => f32::INFINITY,
    }
}

/// Distance metric used at NSW search time. The graph topology is
/// always built with `L2`; querying with `InnerProduct` / `Cosine`
/// reuses the same edges but ranks candidates by the chosen metric.
/// For the corpus-sized graphs this loses negligible recall vs
/// building separate per-metric graphs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NswMetric {
    /// Squared Euclidean — ranks "smaller = closer" (the sqrt is
    /// monotonic so we skip it for ordering).
    L2,
    /// Negated dot product, matching pgvector `<#>` convention so
    /// "smaller = more similar" holds across all three metrics.
    InnerProduct,
    /// Cosine distance `1 - cos(a, b)`. Zero-norm operand yields
    /// `f32::INFINITY` so it sorts last.
    Cosine,
}

/// Multi-layer HNSW kNN search: greedy-descend from the entry to layer 0,
/// then beam-search there with the requested `ef` to return the top `k`
/// results under the caller-chosen metric. Topology was built with L2 —
/// upper-layer descent uses L2 as a coarse heuristic; final beam search
/// runs in the requested metric so rankings are correct for `<#>` / `<=>`.
pub(crate) fn nsw_search(
    table: &Table,
    idx_pos: usize,
    query: &[f32],
    k: usize,
    ef: usize,
    metric: NswMetric,
) -> Vec<(f32, usize)> {
    let (entry, entry_level) = match &table.indices[idx_pos].kind {
        IndexKind::Nsw(g) => (g.entry, g.entry_level),
        IndexKind::BTree(_)
        | IndexKind::Brin { .. }
        | IndexKind::Gin(_)
        | IndexKind::GinTrgm(_)
        | IndexKind::GinFulltext(_) => return Vec::new(),
    };
    let Some(entry) = entry else {
        return Vec::new();
    };
    let col_pos = table.indices[idx_pos].column_position;
    // v6.0.1 step 5: SQ8 columns over-fetch by `SQ8_RERANK_OVER_FETCH`
    // so the rerank pass below sees enough candidates to recover
    // recall after the ADC re-ordering. F32 + F16 columns skip the
    // over-fetch — F32 distances are exact, F16 dequant is
    // bit-exact at the storage layer so the beam search already
    // ranks under the column's full precision.
    let sq8 = matches!(
        table.schema.columns.get(col_pos).map(|c| c.ty),
        Some(DataType::Vector {
            encoding: VecEncoding::Sq8,
            ..
        })
    );
    let ef = if sq8 {
        ef.max(k).max(k * SQ8_RERANK_OVER_FETCH)
    } else {
        ef.max(k)
    };
    // Descend by L2 (the topology metric) so layers prune consistently.
    let entry_d = vec_l2_sq(table, col_pos, entry, query);
    let mut current = entry;
    let mut current_d = entry_d;
    for layer in (1..=entry_level).rev() {
        (current, current_d) = greedy_layer_walk(table, idx_pos, layer, current, current_d, query);
    }
    // Final beam search on layer 0 under the caller's metric.
    let mut results = layer_beam_search(table, idx_pos, 0, current, current_d, query, ef, metric);
    if sq8 {
        results = sq8_rerank(table, col_pos, &results, query, metric);
    }
    results.truncate(k);
    results
}

/// v6.0.1 step 5: re-score ADC top-`K*3` candidates with the
/// dequantised cell vs the f32 query, then re-sort. Recovers the
/// recall the SQ8 ADC sacrifices for 4× compression — the design's
/// "f32 rerank step is on by default" path (deliberation #3).
/// `metric` is the same metric the beam search used; the rerank
/// arithmetic re-derives the exact distance under that metric.
fn sq8_rerank(
    table: &Table,
    col_pos: usize,
    candidates: &[(f32, usize)],
    query: &[f32],
    metric: NswMetric,
) -> Vec<(f32, usize)> {
    let mut out: Vec<(f32, usize)> = candidates
        .iter()
        .filter_map(|&(adc_d, row)| {
            let cell = table.rows.get(row).and_then(|r| r.values.get(col_pos))?;
            let Value::Sq8Vector(q) = cell else {
                // F32 cells shouldn't reach this path (sq8 fence
                // above), but stay defensive: pass through with
                // the ADC distance unchanged.
                return Some((adc_d, row));
            };
            let deq = quantize::dequantize(q);
            if deq.len() != query.len() {
                return None;
            }
            Some((metric_distance(metric, &deq, query), row))
        })
        .collect();
    out.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
    out
}

/// Multiplier applied to `k` so the SQ8 rerank pass sees a wider
/// candidate set. 3× is the design-stage value; v6.0.5 sweep work
/// can re-tune once full corpus profiling is in.
const SQ8_RERANK_OVER_FETCH: usize = 3;

fn metric_distance(metric: NswMetric, a: &[f32], b: &[f32]) -> f32 {
    match metric {
        NswMetric::L2 => l2_distance_sq(a, b),
        NswMetric::InnerProduct => -inner_product_f32(a, b),
        NswMetric::Cosine => {
            let (dot, na, nb) = cosine_dot_norms_f32(a, b);
            if na == 0.0 || nb == 0.0 {
                return f32::INFINITY;
            }
            // `f32::sqrt` lives in std, so hand-roll Newton-Raphson on
            // f64 — same trick the L2 binary op already uses.
            let denom = sqrt_newton_f32(na) * sqrt_newton_f32(nb);
            1.0 - dot / denom
        }
    }
}

/// v6.0.2: dispatch wrapper for the f32 dot product (used by `<#>` +
/// the cosine numerator). NEON path when `len % 4 == 0 && len >= 4`,
/// scalar fallback otherwise. Returns the positive dot — callers
/// negate for the pgvector `<#>` "smaller = closer" convention.
///
/// Public so perf gates + downstream benches can microbenchmark the
/// dispatch directly; not part of the STABILITY contract — internal
/// SIMD layout can evolve in any release.
#[doc(hidden)]
#[inline]
pub fn inner_product_f32(a: &[f32], b: &[f32]) -> f32 {
    #[cfg(target_arch = "aarch64")]
    {
        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
            // SAFETY: NEON is a baseline aarch64 feature; preconditions
            // (matching lengths, ≥ 1 full lane group) are checked above.
            return unsafe { inner_product_neon(a, b) };
        }
    }
    inner_product_scalar(a, b)
}

pub(crate) fn inner_product_scalar(a: &[f32], b: &[f32]) -> f32 {
    let mut dot: f32 = 0.0;
    for (x, y) in a.iter().zip(b.iter()) {
        dot += x * y;
    }
    dot
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
pub(crate) unsafe fn inner_product_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32,
    };
    unsafe {
        // Two parallel accumulators (same trick as L2 NEON) so the
        // FMA dependency chain doesn't serialise.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc0 = zero;
        let mut acc1 = zero;
        let n = a.len();
        let mut i = 0usize;
        while i + 8 <= n {
            let av0 = vld1q_f32(a.as_ptr().add(i));
            let bv0 = vld1q_f32(b.as_ptr().add(i));
            acc0 = vfmaq_f32(acc0, av0, bv0);
            let av1 = vld1q_f32(a.as_ptr().add(i + 4));
            let bv1 = vld1q_f32(b.as_ptr().add(i + 4));
            acc1 = vfmaq_f32(acc1, av1, bv1);
            i += 8;
        }
        while i + 4 <= n {
            let av = vld1q_f32(a.as_ptr().add(i));
            let bv = vld1q_f32(b.as_ptr().add(i));
            acc0 = vfmaq_f32(acc0, av, bv);
            i += 4;
        }
        vaddvq_f32(vaddq_f32(acc0, acc1))
    }
}

/// v6.0.2: dispatch wrapper for the three accumulators (`dot`, `||a||²`,
/// `||b||²`) cosine needs. Same NEON pre-condition as the L2 / IP
/// paths; same scalar fallback shape.
///
/// Public for benchmarking only (see `inner_product_f32`); not in the
/// STABILITY contract.
#[doc(hidden)]
#[inline]
pub fn cosine_dot_norms_f32(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    #[cfg(target_arch = "aarch64")]
    {
        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
            // SAFETY: see `inner_product_neon`.
            return unsafe { cosine_dot_norms_neon(a, b) };
        }
    }
    cosine_dot_norms_scalar(a, b)
}

pub(crate) fn cosine_dot_norms_scalar(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    let mut dot: f32 = 0.0;
    let mut na: f32 = 0.0;
    let mut nb: f32 = 0.0;
    for (x, y) in a.iter().zip(b.iter()) {
        dot += x * y;
        na += x * x;
        nb += y * y;
    }
    (dot, na, nb)
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names, clippy::similar_names)]
pub(crate) unsafe fn cosine_dot_norms_neon(a: &[f32], b: &[f32]) -> (f32, f32, f32) {
    use core::arch::aarch64::{float32x4_t, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32};
    unsafe {
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc_dot = zero;
        let mut acc_na = zero;
        let mut acc_nb = zero;
        let n = a.len();
        let mut i = 0usize;
        while i + 4 <= n {
            let av = vld1q_f32(a.as_ptr().add(i));
            let bv = vld1q_f32(b.as_ptr().add(i));
            acc_dot = vfmaq_f32(acc_dot, av, bv);
            acc_na = vfmaq_f32(acc_na, av, av);
            acc_nb = vfmaq_f32(acc_nb, bv, bv);
            i += 4;
        }
        (vaddvq_f32(acc_dot), vaddvq_f32(acc_na), vaddvq_f32(acc_nb))
    }
}

fn sqrt_newton_f32(x: f32) -> f32 {
    if x <= 0.0 {
        return 0.0;
    }
    let mut g = x;
    for _ in 0..10 {
        g = 0.5 * (g + x / g);
    }
    g
}

/// Squared Euclidean distance — used for ordering inside NSW (the sqrt
/// preserves the order). Caller takes sqrt before reporting back to SQL.
///
/// v3.3.2: aarch64 NEON path for `len % 4 == 0` (which covers every
/// HNSW-indexed VECTOR(N) where N is a multiple of 4 — i.e. all
/// production-shaped embeddings: 64, 128, 256, 384, 512, 768, 1024,
/// 1536, ...). Other shapes fall back to the scalar loop.
#[inline]
pub(crate) fn l2_distance_sq(a: &[f32], b: &[f32]) -> f32 {
    #[cfg(target_arch = "aarch64")]
    {
        if a.len() == b.len() && a.len() >= 4 && a.len().is_multiple_of(4) {
            // SAFETY: NEON is a baseline aarch64 feature (ARMv8);
            // the precondition is checked above (matching lengths,
            // multiple of 4, at least one 128-bit lane group).
            return unsafe { l2_distance_sq_neon(a, b) };
        }
    }
    l2_distance_sq_scalar(a, b)
}

pub(crate) fn l2_distance_sq_scalar(a: &[f32], b: &[f32]) -> f32 {
    let mut sum: f32 = 0.0;
    for (x, y) in a.iter().zip(b.iter()) {
        let d = *x - *y;
        sum += d * d;
    }
    sum
}

#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[allow(clippy::many_single_char_names)] // NEON intrinsics work in single-letter regs by convention
pub(crate) unsafe fn l2_distance_sq_neon(a: &[f32], b: &[f32]) -> f32 {
    use core::arch::aarch64::{
        float32x4_t, vaddq_f32, vaddvq_f32, vdupq_n_f32, vfmaq_f32, vld1q_f32, vsubq_f32,
    };
    unsafe {
        // Two independent accumulator registers so the FMA dependency
        // chain doesn't serialise (each FMA depends on prior FMA).
        // Pre-conditions checked by caller: `a.len() == b.len()`,
        // `a.len() % 4 == 0`, `a.len() >= 4`.
        let zero: float32x4_t = vdupq_n_f32(0.0);
        let mut acc0 = zero;
        let mut acc1 = zero;
        let n = a.len();
        let mut i = 0usize;
        // Process 8 floats per iter when available (two parallel
        // accumulators). Tail of 4 falls into the second loop.
        while i + 8 <= n {
            let d0 = vsubq_f32(vld1q_f32(a.as_ptr().add(i)), vld1q_f32(b.as_ptr().add(i)));
            acc0 = vfmaq_f32(acc0, d0, d0);
            let d1 = vsubq_f32(
                vld1q_f32(a.as_ptr().add(i + 4)),
                vld1q_f32(b.as_ptr().add(i + 4)),
            );
            acc1 = vfmaq_f32(acc1, d1, d1);
            i += 8;
        }
        while i + 4 <= n {
            let d = vsubq_f32(vld1q_f32(a.as_ptr().add(i)), vld1q_f32(b.as_ptr().add(i)));
            acc0 = vfmaq_f32(acc0, d, d);
            i += 4;
        }
        vaddvq_f32(vaddq_f32(acc0, acc1))
    }
}

/// Public wrapper: run an NSW kNN search and return the top-k row
/// indices ordered by ascending distance under the given metric.
pub fn nsw_query(
    table: &Table,
    idx_name: &str,
    query: &[f32],
    k: usize,
    metric: NswMetric,
) -> Vec<usize> {
    let Some(idx_pos) = table.indices.iter().position(|i| i.name == idx_name) else {
        return Vec::new();
    };
    let ef = (k * 2).max(NSW_DEFAULT_M);
    let mut hits = nsw_search(table, idx_pos, query, k, ef, metric);
    hits.truncate(k);
    hits.into_iter().map(|(_, idx)| idx).collect()
}

/// Find any NSW index on a column. Used by the planner to decide
/// whether an `ORDER BY col <-> literal LIMIT k` query can skip the
/// brute-force scan.
pub fn nsw_index_on(table: &Table, column_position: usize) -> Option<&Index> {
    table
        .indices
        .iter()
        .find(|i| i.column_position == column_position && matches!(i.kind, IndexKind::Nsw(_)))
}