velesdb-core 1.13.3

High-performance vector database engine written in Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
//! GPU-accelerated HNSW graph traversal orchestrator.
//!
//! Implements the SONG 3-stage framework for GPU-based approximate nearest
//! neighbor search on HNSW layer 0:
//!
//! 1. **Expand** — parallel frontier neighbor expansion via CSR adjacency
//! 2. **Distance** — batch distance computation for all new candidates
//! 3. **Select** — parallel top-k selection to form the next frontier
//!
//! All three stages are encoded into a single wgpu command buffer to
//! eliminate per-iteration CPU↔GPU synchronization overhead.
//!
//! # Activation Threshold
//!
//! GPU traversal is only beneficial for large indices (>500K vectors).
//! Below this, the fixed GPU dispatch overhead (~900μs × iterations)
//! exceeds the CPU SIMD search time. Use [`should_traverse_gpu`] to check.

#![allow(clippy::similar_names)]
// Doc comments in this module mix WGSL binding names, shader variables,
// and HNSW-paper terminology that aren't Rust items; backticking every
// one would hurt readability without adding linkable references.
#![allow(clippy::doc_markdown)]
// The TraversalBuffers::new constructor intentionally inlines every GPU
// buffer allocation for a single query's lifecycle. Splitting would
// obscure the ordering relative to the WGSL bind group layout; tracked
// as a refactor for a follow-up PR.
#![allow(clippy::too_many_lines)]

use std::sync::{Arc, OnceLock};
use std::time::Instant;

use super::gpu_backend::GpuAccelerator;
use super::gpu_csr::CsrGraph;
use super::gpu_traversal_buffers::{TraversalBuffers, MAX_CANDIDATES_PER_ITER};
use super::gpu_traversal_pipelines as pipelines;

/// Computes the distance between `query` and `entry_vec` in the same
/// "lower = better" space that the traversal shaders produce.
///
/// This matches the shader conventions exactly so that an entry-node
/// distance computed on CPU can be directly merged with GPU-computed
/// candidate distances during the top-k selection:
/// * Cosine → `1.0 - cosine_similarity`
/// * Euclidean → squared L2 (not `sqrt`-reduced), matching
///   `TRAVERSAL_EUCLIDEAN_SQ_SHADER`
/// * DotProduct → `-dot_product`
///
/// Returns `f32::MAX` for metrics that have no GPU shader (caller must
/// have already bailed out via `should_traverse_gpu` or metric dispatch).
#[must_use]
fn gpu_distance_cpu_fallback(
    query: &[f32],
    entry_vec: &[f32],
    metric: crate::distance::DistanceMetric,
) -> f32 {
    use crate::distance::DistanceMetric;
    debug_assert_eq!(query.len(), entry_vec.len());
    match metric {
        DistanceMetric::Cosine => {
            let (mut dot, mut na, mut nb) = (0.0_f32, 0.0_f32, 0.0_f32);
            for (x, y) in query.iter().zip(entry_vec.iter()) {
                dot += x * y;
                na += x * x;
                nb += y * y;
            }
            let denom = (na * nb).sqrt();
            if denom == 0.0 {
                1.0
            } else {
                1.0 - (dot / denom)
            }
        }
        DistanceMetric::Euclidean => query
            .iter()
            .zip(entry_vec.iter())
            .map(|(a, b)| (a - b).powi(2))
            .sum::<f32>(),
        DistanceMetric::DotProduct => -query
            .iter()
            .zip(entry_vec.iter())
            .map(|(a, b)| a * b)
            .sum::<f32>(),
        DistanceMetric::Hamming | DistanceMetric::Jaccard => f32::MAX,
    }
}

/// Returns the adaptive GPU iteration count based on ef_search.
///
/// Larger beam widths converge faster (more candidates explored per step),
/// so we scale iterations sub-linearly. Capped at 25 for very large ef.
#[must_use]
fn adaptive_gpu_iterations(ef_search: usize) -> u32 {
    match ef_search {
        0..=64 => 20,
        65..=128 => 18,
        129..=256 => 15,
        257..=512 => 12,
        _ => 10,
    }
}

/// Returns `true` if GPU graph traversal is safe and likely faster than CPU
/// for the given index size and dimension.
///
/// Two gates are checked:
/// * **Performance gate** — `num_vectors > 500_000`. Below this, CPU SIMD with
///   prefetch and rayon parallelism is faster than GPU due to dispatch overhead.
/// * **Correctness gate** — `num_vectors * dimension <= u32::MAX`. The distance
///   shaders (`TRAVERSAL_EUCLIDEAN_SQ_SHADER`, `TRAVERSAL_COSINE_SHADER`,
///   `TRAVERSAL_DOT_PRODUCT_SHADER`) compute vector offsets as `node_id * dim`
///   in `u32`; a `10M × 768` index would overflow (~7.68B > 4.29B) and silently
///   return wrong distances. WGSL has no u64 scalar, so the caller must bail
///   out to CPU in this regime.
///
/// Returns `false` (fall back to CPU) when either gate is not satisfied.
#[must_use]
pub fn should_traverse_gpu(num_vectors: usize, dimension: usize) -> bool {
    if num_vectors <= 500_000 {
        return false;
    }
    // Correctness gate: keep `node_id * dim` within u32 range.
    num_vectors
        .checked_mul(dimension)
        .is_some_and(|prod| u32::try_from(prod).is_ok())
}

/// Observable statistics from a single GPU traversal execution.
#[derive(Debug, Clone)]
pub struct GpuTraversalStats {
    /// Number of GPU iterations executed.
    pub iterations: u32,
    /// Whether the graph buffers were served from cache.
    pub cache_hit: bool,
    /// Time spent uploading buffers to GPU (0 on cache hit).
    pub upload_ms: f64,
    /// Time spent in GPU compute (submit + readback).
    pub compute_ms: f64,
    /// Total wall-clock time for the GPU search.
    pub total_ms: f64,
}

impl std::fmt::Display for GpuTraversalStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
            f,
            "GpuTraversal(iters={}, cache={}, upload={:.2}ms, compute={:.2}ms, total={:.2}ms)",
            self.iterations,
            if self.cache_hit { "HIT" } else { "MISS" },
            self.upload_ms,
            self.compute_ms,
            self.total_ms,
        )
    }
}

/// GPU traversal context holding compiled pipelines and the wgpu device.
///
/// Created once via [`global()`](Self::global) and shared across all queries.
/// Holds the three traversal pipelines (expand, distance, select)
/// and references to the shared GPU device.
pub struct GpuTraversalContext {
    gpu: Arc<GpuAccelerator>,
    expand_pipeline: wgpu::ComputePipeline,
    distance_cosine_pipeline: wgpu::ComputePipeline,
    distance_euclidean_sq_pipeline: wgpu::ComputePipeline,
    distance_dot_pipeline: wgpu::ComputePipeline,
    select_pipeline: wgpu::ComputePipeline,
}

impl GpuTraversalContext {
    /// Returns the global singleton `GpuTraversalContext`, creating it on
    /// first call. Pipelines are compiled once and reused across all queries.
    ///
    /// Returns `None` if no GPU is available.
    #[must_use]
    pub fn global() -> Option<Arc<Self>> {
        static INSTANCE: OnceLock<Option<Arc<GpuTraversalContext>>> = OnceLock::new();
        INSTANCE
            .get_or_init(|| GpuTraversalContext::new().map(Arc::new))
            .clone()
    }

    /// Creates a new GPU traversal context, compiling all required pipelines.
    ///
    /// Prefer [`global()`](Self::global) for production use to avoid
    /// recompiling pipelines per query. Use `new()` only in benchmarks
    /// or tests that need isolated contexts.
    ///
    /// Returns `None` if no GPU is available.
    #[must_use]
    pub fn new() -> Option<Self> {
        let gpu = GpuAccelerator::global()?;
        let device = gpu.device();

        let expand_pipeline = pipelines::compile_expand_pipeline(device);
        let distance_cosine_pipeline = pipelines::compile_traversal_distance_pipeline(
            device,
            super::gpu_backend::shaders::TRAVERSAL_COSINE_SHADER,
            "traversal_cosine",
            "Traversal Cosine",
        );
        let distance_euclidean_sq_pipeline = pipelines::compile_traversal_distance_pipeline(
            device,
            super::gpu_backend::shaders::TRAVERSAL_EUCLIDEAN_SQ_SHADER,
            "traversal_euclidean_sq",
            "Traversal Euclidean Sq",
        );
        let distance_dot_pipeline = pipelines::compile_traversal_distance_pipeline(
            device,
            super::gpu_backend::shaders::TRAVERSAL_DOT_PRODUCT_SHADER,
            "traversal_dot",
            "Traversal Dot Product",
        );
        let select_pipeline = pipelines::compile_select_pipeline(device);

        Some(Self {
            gpu,
            expand_pipeline,
            distance_cosine_pipeline,
            distance_euclidean_sq_pipeline,
            distance_dot_pipeline,
            select_pipeline,
        })
    }

    /// Executes GPU-accelerated layer-0 search.
    ///
    /// # Arguments
    ///
    /// * `csr` — CSR representation of layer 0
    /// * `vectors_flat` — contiguous f32 vector storage (N × dim)
    /// * `query` — query vector (dim f32s)
    /// * `entry_node` — entry point from upper-layer greedy descent
    /// * `k` — number of nearest neighbors to return
    /// * `ef_search` — search beam width
    /// * `dimension` — vector dimension
    /// * `metric` — distance metric to use
    ///
    /// # Returns
    ///
    /// Vector of `(node_id, distance)` pairs sorted by distance ascending.
    /// Returns empty vec on any GPU error (caller should fall back to CPU).
    #[allow(clippy::too_many_arguments)]
    pub fn search_layer0(
        &self,
        csr: &CsrGraph,
        vectors_flat: &[f32],
        query: &[f32],
        entry_node: usize,
        k: usize,
        ef_search: usize,
        dimension: usize,
        metric: crate::distance::DistanceMetric,
    ) -> Vec<(usize, f32)> {
        if csr.is_empty() || query.is_empty() || dimension == 0 {
            return Vec::new();
        }

        let total_start = Instant::now();
        if let Some(results) = self.search_layer0_inner(
            csr,
            vectors_flat,
            query,
            entry_node,
            k,
            ef_search,
            dimension,
            metric,
        ) {
            let total_ms = total_start.elapsed().as_secs_f64() * 1000.0;
            tracing::debug!(
                k,
                ef_search,
                num_results = results.len(),
                total_ms = format!("{total_ms:.2}"),
                "GPU traversal completed"
            );
            results
        } else {
            tracing::warn!("GPU traversal failed, returning empty results for CPU fallback");
            Vec::new()
        }
    }

    /// Inner implementation that returns `None` on any GPU error.
    #[allow(clippy::too_many_arguments)]
    fn search_layer0_inner(
        &self,
        csr: &CsrGraph,
        vectors_flat: &[f32],
        query: &[f32],
        entry_node: usize,
        k: usize,
        ef_search: usize,
        dimension: usize,
        metric: crate::distance::DistanceMetric,
    ) -> Option<Vec<(usize, f32)>> {
        let device = self.gpu.device();
        let queue = self.gpu.queue();

        let ef = ef_search.max(k);
        let max_iterations = adaptive_gpu_iterations(ef_search);

        // Select the appropriate distance pipeline
        let distance_pipeline = match metric {
            crate::distance::DistanceMetric::Cosine => &self.distance_cosine_pipeline,
            crate::distance::DistanceMetric::Euclidean => &self.distance_euclidean_sq_pipeline,
            crate::distance::DistanceMetric::DotProduct => &self.distance_dot_pipeline,
            // Hamming and Jaccard don't have GPU shaders — fall back
            _ => return None,
        };

        // Compute the entry node's real query distance in the same space
        // the GPU shaders use. The SELECT shader seeds its output frontier
        // from `frontier_a`, and `frontier_a[0]` represents the entry node;
        // a sentinel/0.0 placeholder here would silently force the entry to
        // pollute every top-k with an artificial minimum.
        let entry_offset = entry_node.checked_mul(dimension).filter(|end| {
            end.checked_add(dimension)
                .is_some_and(|e| e <= vectors_flat.len())
        })?;
        let entry_vec = &vectors_flat[entry_offset..entry_offset + dimension];
        let entry_distance = gpu_distance_cpu_fallback(query, entry_vec, metric);

        // =====================================================================
        // Create GPU buffers — graph buffers may be cached
        // =====================================================================
        let upload_start = Instant::now();
        let buffers = TraversalBuffers::new(
            device,
            csr,
            vectors_flat,
            query,
            entry_node,
            entry_distance,
            ef,
            dimension,
        );
        let _upload_ms = upload_start.elapsed().as_secs_f64() * 1000.0;

        // =====================================================================
        // Encode all iterations into a single command buffer
        // =====================================================================
        let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
            label: Some("HNSW Traversal Encoder"),
        });

        for _iteration in 0..max_iterations {
            // Stage 1: Expand frontier — BFS neighbor expansion via CSR
            self.encode_expand_pass(&mut encoder, &buffers);

            // Stage 2: Compute distances for all new candidates
            self.encode_distance_pass(&mut encoder, distance_pipeline, &buffers);

            // Stage 3: Select top-k candidates as new frontier
            self.encode_select_pass(&mut encoder, &buffers, ef);
        }

        // Copy final frontier (top-k results) to staging buffer for readback
        let result_count = k.min(ef);
        #[allow(clippy::cast_possible_truncation)]
        let result_ids_size = (result_count * std::mem::size_of::<u32>()) as u64;
        #[allow(clippy::cast_possible_truncation)]
        let result_dists_size = (result_count * std::mem::size_of::<f32>()) as u64;

        encoder.copy_buffer_to_buffer(
            &buffers.frontier_a_ids,
            0,
            &buffers.staging_ids,
            0,
            result_ids_size,
        );
        encoder.copy_buffer_to_buffer(
            &buffers.frontier_a_dists,
            0,
            &buffers.staging_dists,
            0,
            result_dists_size,
        );

        queue.submit(std::iter::once(encoder.finish()));

        // =====================================================================
        // Read back results
        // =====================================================================
        let result_ids =
            super::helpers::readback_buffer::<u32>(device, &buffers.staging_ids, result_count)?;
        let result_dists =
            super::helpers::readback_buffer::<f32>(device, &buffers.staging_dists, result_count)?;

        // Combine and sort by distance
        let mut results: Vec<(usize, f32)> = result_ids
            .iter()
            .zip(result_dists.iter())
            .filter(|(&id, &dist)| id != u32::MAX && dist < f32::MAX)
            .map(|(&id, &dist)| (id as usize, dist))
            .collect();

        results.sort_by(|a, b| a.1.total_cmp(&b.1));
        results.truncate(k);

        Some(results)
    }

    // =========================================================================
    // Pass encoding
    // =========================================================================

    fn encode_expand_pass(&self, encoder: &mut wgpu::CommandEncoder, buffers: &TraversalBuffers) {
        // Reset candidate counter to 0 before expansion
        encoder.clear_buffer(&buffers.counters, 0, None);

        // Fill candidates buffer with sentinels (u32::MAX = 0xFFFFFFFF) by
        // copying from `candidates_sentinel` (initialized once at buffer
        // creation to all u32::MAX). Without this, unused slots contain 0
        // — a valid node ID — causing the distance shader to compute
        // distances against node 0 for all ~8000 unused slots.
        // The sentinels are detected by the TRAVERSAL_*_SHADER's
        // `if (node_id == 0xFFFFFFFFu)` guard and assigned f32::MAX.
        //
        // Do NOT `clear_buffer(&candidates_sentinel, …)` here: `clear_buffer`
        // fills with 0x00, which would destroy the u32::MAX pattern in
        // the read-only source buffer and poison every subsequent copy.
        encoder.copy_buffer_to_buffer(
            &buffers.candidates_sentinel,
            0,
            &buffers.candidates,
            0,
            buffers.candidates_byte_size as u64,
        );

        let bind_group = buffers.create_expand_bind_group(self.gpu.device(), &self.expand_pipeline);

        let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: Some("Expand Pass"),
            timestamp_writes: None,
        });
        pass.set_pipeline(&self.expand_pipeline);
        pass.set_bind_group(0, &bind_group, &[]);
        // Fix: dispatch based on ef (full beam width), not frontier_size.
        // When ef > 256, we need multiple workgroups to expand all frontier nodes.
        #[allow(clippy::cast_possible_truncation)]
        let workgroups = buffers.ef.div_ceil(256) as u32;
        pass.dispatch_workgroups(workgroups.max(1), 1, 1);
    }

    fn encode_distance_pass(
        &self,
        encoder: &mut wgpu::CommandEncoder,
        pipeline: &wgpu::ComputePipeline,
        buffers: &TraversalBuffers,
    ) {
        let bind_group = buffers.create_distance_bind_group(self.gpu.device(), pipeline);

        let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: Some("Distance Pass"),
            timestamp_writes: None,
        });
        pass.set_pipeline(pipeline);
        pass.set_bind_group(0, &bind_group, &[]);
        let workgroups = MAX_CANDIDATES_PER_ITER.div_ceil(256);
        pass.dispatch_workgroups(workgroups, 1, 1);
    }

    fn encode_select_pass(
        &self,
        encoder: &mut wgpu::CommandEncoder,
        buffers: &TraversalBuffers,
        ef: usize,
    ) {
        // Reset frontier counter before selection
        encoder.clear_buffer(&buffers.select_counters, 0, None);

        // Initialize frontier_b with sentinels so slots not filled by
        // the select shader remain as "empty" (u32::MAX / f32::MAX).
        // This prevents stale frontier entries from previous iterations
        // contaminating the next expand pass.
        let frontier_bytes = (ef * std::mem::size_of::<u32>()) as u64;
        encoder.copy_buffer_to_buffer(
            &buffers.frontier_ids_sentinel,
            0,
            &buffers.frontier_b_ids,
            0,
            frontier_bytes,
        );
        encoder.copy_buffer_to_buffer(
            &buffers.frontier_dists_sentinel,
            0,
            &buffers.frontier_b_dists,
            0,
            frontier_bytes, // same size: ef × f32
        );

        let bind_group =
            buffers.create_select_bind_group(self.gpu.device(), &self.select_pipeline, ef);

        let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: Some("Select Pass"),
            timestamp_writes: None,
        });
        pass.set_pipeline(&self.select_pipeline);
        pass.set_bind_group(0, &bind_group, &[]);
        // SELECT_TOPK_SHADER is `@workgroup_size(1)` and does a serial
        // insertion-sort over all candidates. Dispatching >1 workgroup
        // would race on the frontier writes — see shader doc comment.
        pass.dispatch_workgroups(1, 1, 1);

        // Copy selected frontier back to frontier_a for next iteration
        let frontier_bytes = (ef * std::mem::size_of::<u32>()) as u64;
        encoder.copy_buffer_to_buffer(
            &buffers.frontier_b_ids,
            0,
            &buffers.frontier_a_ids,
            0,
            frontier_bytes,
        );
        let dists_bytes = (ef * std::mem::size_of::<f32>()) as u64;
        encoder.copy_buffer_to_buffer(
            &buffers.frontier_b_dists,
            0,
            &buffers.frontier_a_dists,
            0,
            dists_bytes,
        );
    }
}

// =============================================================================
// Tests
// =============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_should_traverse_gpu_threshold() {
        // Performance gate (num_vectors > 500_000) — dimension 128 is always safe.
        assert!(!should_traverse_gpu(0, 128));
        assert!(!should_traverse_gpu(100_000, 128));
        assert!(!should_traverse_gpu(500_000, 128));
        assert!(should_traverse_gpu(500_001, 128));
        assert!(should_traverse_gpu(1_000_000, 128));
    }

    #[test]
    fn test_should_traverse_gpu_u32_offset_correctness_gate() {
        // 10M * 768 = 7_680_000_000 > u32::MAX (4_294_967_295) — must bail out to CPU.
        assert!(!should_traverse_gpu(10_000_000, 768));
        // 5M * 768 = 3_840_000_000 < u32::MAX — safe.
        assert!(should_traverse_gpu(5_000_000, 768));
        // Boundary: exactly u32::MAX offsets — safe.
        assert!(should_traverse_gpu((u32::MAX as usize) / 128, 128));
        // Overflow even after checked_mul — must bail out.
        assert!(!should_traverse_gpu(usize::MAX / 2, 4));
    }

    #[test]
    fn test_gpu_traversal_context_new_no_panic() {
        // GpuTraversalContext::new() should not panic even without GPU
        let _ctx = GpuTraversalContext::new();
        // May return None if no GPU — that's fine
    }

    #[test]
    fn test_search_empty_csr_returns_empty() {
        if let Some(ctx) = GpuTraversalContext::new() {
            let csr = CsrGraph {
                offsets: vec![0],
                neighbors: vec![],
                num_nodes: 0,
                max_degree: 0,
                total_edges: 0,
            };
            let result = ctx.search_layer0(
                &csr,
                &[],
                &[1.0, 0.0, 0.0],
                0,
                10,
                64,
                3,
                crate::distance::DistanceMetric::Cosine,
            );
            assert!(result.is_empty());
        }
    }

    #[test]
    fn test_search_unsupported_metric_returns_empty() {
        if let Some(ctx) = GpuTraversalContext::new() {
            let csr = CsrGraph {
                offsets: vec![0, 1],
                neighbors: vec![0],
                num_nodes: 1,
                max_degree: 1,
                total_edges: 1,
            };
            // Hamming has no GPU shader
            let result = ctx.search_layer0(
                &csr,
                &[1.0, 0.0, 0.0],
                &[1.0, 0.0, 0.0],
                0,
                10,
                64,
                3,
                crate::distance::DistanceMetric::Hamming,
            );
            assert!(result.is_empty());
        }
    }
}