sochdb-vector 2.0.6

Streaming elimination vector search engine for SochDB - CPU-first ANN with RDF + BPS
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
// Copyright 2025 SochDB Authors
//
// Licensed under the Apache License, Version 2.0

//! BMI2 Fast Paths for Bit Manipulation
//!
//! This module provides PEXT/PDEP-accelerated bit packing/unpacking operations
//! with proper fallback ladder:
//!
//! 1. **BMI2** (Intel Haswell+, AMD Zen3+): Native PEXT/PDEP
//! 2. **AVX2**: SIMD-based bit extraction (no PEXT)
//! 3. **Scalar**: Portable loop-based implementation
//!
//! # Operations
//!
//! - **PEXT (Parallel Extract)**: Extract bits at mask positions
//! - **PDEP (Parallel Deposit)**: Deposit bits at mask positions
//!
//! # Use Cases
//!
//! - Unpacking 4-bit quantized values from packed storage
//! - Extracting specific dimensions from compressed vectors
//! - Bitmap operations for filtered candidate sets
//!
//! # Performance Warning
//!
//! AMD Zen/Zen2 have slow microcode PEXT/PDEP (~18 cycles vs 3 cycles on Intel).
//! Use feature detection to choose appropriate path.

/// BMI2 availability (cached after first check).
static BMI2_AVAILABLE: std::sync::OnceLock<bool> = std::sync::OnceLock::new();

/// Check if BMI2 is available on current CPU.
#[inline]
pub fn bmi2_available() -> bool {
    *BMI2_AVAILABLE.get_or_init(|| {
        #[cfg(target_arch = "x86_64")]
        {
            is_x86_feature_detected!("bmi2")
        }
        #[cfg(not(target_arch = "x86_64"))]
        {
            false
        }
    })
}

/// Check if BMI2 is fast (Intel or AMD Zen3+).
/// Returns false for AMD Zen/Zen2 where PEXT/PDEP are slow.
#[cfg(target_arch = "x86_64")]
pub fn bmi2_fast() -> bool {
    if !bmi2_available() {
        return false;
    }

    // Check CPU vendor and model
    // AMD Zen/Zen2 have slow microcode implementation
    // Zen3+ have fast implementation

    // For now, assume BMI2 is fast if available
    // A more complete implementation would check CPUID
    true
}

#[cfg(not(target_arch = "x86_64"))]
pub fn bmi2_fast() -> bool {
    false
}

/// Parallel bit extract: extract bits from `src` at positions specified by `mask`.
///
/// Example:
/// ```text
/// src  = 0b_1010_1100
/// mask = 0b_0101_0101
/// result = 0b_0000_0010 (bits at positions 0, 2, 4, 6)
/// ```
#[inline]
pub fn pext_u64(src: u64, mask: u64) -> u64 {
    #[cfg(target_arch = "x86_64")]
    {
        if bmi2_available() {
            return unsafe { pext_u64_bmi2(src, mask) };
        }
    }

    pext_u64_scalar(src, mask)
}

/// Parallel bit deposit: deposit bits from `src` to positions specified by `mask`.
///
/// Example:
/// ```text
/// src  = 0b_0000_0010
/// mask = 0b_0101_0101
/// result = 0b_0000_0100 (bits deposited at positions 0, 2, 4, 6)
/// ```
#[inline]
pub fn pdep_u64(src: u64, mask: u64) -> u64 {
    #[cfg(target_arch = "x86_64")]
    {
        if bmi2_available() {
            return unsafe { pdep_u64_bmi2(src, mask) };
        }
    }

    pdep_u64_scalar(src, mask)
}

/// BMI2 PEXT implementation.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2")]
#[inline]
unsafe fn pext_u64_bmi2(src: u64, mask: u64) -> u64 {
    use std::arch::x86_64::_pext_u64;
    _pext_u64(src, mask)
}

/// BMI2 PDEP implementation.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi2")]
#[inline]
unsafe fn pdep_u64_bmi2(src: u64, mask: u64) -> u64 {
    use std::arch::x86_64::_pdep_u64;
    _pdep_u64(src, mask)
}

/// Scalar PEXT fallback.
#[inline]
fn pext_u64_scalar(src: u64, mask: u64) -> u64 {
    let mut result = 0u64;
    let mut bb = 1u64;
    let mut m = mask;

    while m != 0 {
        if src & m & m.wrapping_neg() != 0 {
            result |= bb;
        }
        m &= m - 1; // Clear lowest set bit
        bb <<= 1;
    }

    result
}

/// Scalar PDEP fallback.
#[inline]
fn pdep_u64_scalar(src: u64, mask: u64) -> u64 {
    let mut result = 0u64;
    let mut bb = 1u64;
    let mut m = mask;

    while m != 0 {
        let bit = m & m.wrapping_neg(); // Lowest set bit
        if src & bb != 0 {
            result |= bit;
        }
        m &= m - 1; // Clear lowest set bit
        bb <<= 1;
    }

    result
}

/// 32-bit versions.
#[inline]
pub fn pext_u32(src: u32, mask: u32) -> u32 {
    pext_u64(src as u64, mask as u64) as u32
}

#[inline]
pub fn pdep_u32(src: u32, mask: u32) -> u32 {
    pdep_u64(src as u64, mask as u64) as u32
}

// ============================================================================
// Bit Packing/Unpacking Utilities
// ============================================================================

/// Pack 4-bit values into a byte array.
/// Each input value should be 0-15.
pub fn pack_4bit(values: &[u8]) -> Vec<u8> {
    let packed_len = (values.len() + 1) / 2;
    let mut packed = vec![0u8; packed_len];

    for (i, chunk) in values.chunks(2).enumerate() {
        let lo = chunk[0] & 0x0F;
        let hi = chunk.get(1).map_or(0, |&v| v & 0x0F);
        packed[i] = lo | (hi << 4);
    }

    packed
}

/// Unpack 4-bit values from a byte array.
pub fn unpack_4bit(packed: &[u8], count: usize) -> Vec<u8> {
    let mut values = Vec::with_capacity(count);

    for &byte in packed {
        if values.len() < count {
            values.push(byte & 0x0F);
        }
        if values.len() < count {
            values.push(byte >> 4);
        }
    }

    values
}

/// Pack N-bit values (1-8 bits per value).
pub fn pack_nbits(values: &[u8], bits_per_value: u8) -> Vec<u8> {
    assert!(bits_per_value >= 1 && bits_per_value <= 8);

    let total_bits = values.len() * bits_per_value as usize;
    let packed_len = (total_bits + 7) / 8;
    let mut packed = vec![0u8; packed_len];

    let mask = (1u8 << bits_per_value) - 1;
    let mut bit_pos = 0usize;

    for &value in values {
        let value = value & mask;
        let byte_idx = bit_pos / 8;
        let bit_offset = bit_pos % 8;

        packed[byte_idx] |= value << bit_offset;

        // Handle overflow to next byte
        if bit_offset + bits_per_value as usize > 8 {
            let overflow_bits = bit_offset + bits_per_value as usize - 8;
            if byte_idx + 1 < packed.len() {
                packed[byte_idx + 1] |= value >> (bits_per_value as usize - overflow_bits);
            }
        }

        bit_pos += bits_per_value as usize;
    }

    packed
}

/// Unpack N-bit values (1-8 bits per value).
pub fn unpack_nbits(packed: &[u8], bits_per_value: u8, count: usize) -> Vec<u8> {
    assert!(bits_per_value >= 1 && bits_per_value <= 8);

    let mut values = Vec::with_capacity(count);
    let mask = (1u16 << bits_per_value) - 1;
    let mut bit_pos = 0usize;

    for _ in 0..count {
        let byte_idx = bit_pos / 8;
        let bit_offset = bit_pos % 8;

        if byte_idx >= packed.len() {
            break;
        }

        // Read up to 16 bits to handle boundary
        let mut raw = packed[byte_idx] as u16;
        if byte_idx + 1 < packed.len() {
            raw |= (packed[byte_idx + 1] as u16) << 8;
        }

        let value = ((raw >> bit_offset) & mask) as u8;
        values.push(value);

        bit_pos += bits_per_value as usize;
    }

    values
}

// ============================================================================
// BMI2-accelerated batch operations
// ============================================================================

/// Extract multiple 4-bit values using PEXT.
/// Processes 16 values per u64 word.
#[inline]
pub fn extract_4bit_batch(packed: u64) -> [u8; 16] {
    let mut result = [0u8; 16];

    #[cfg(target_arch = "x86_64")]
    if bmi2_available() {
        // Use PEXT to extract each nibble
        const NIBBLE_MASK: u64 = 0x0F0F_0F0F_0F0F_0F0F;

        let even = unsafe { pext_u64_bmi2(packed, NIBBLE_MASK) };
        let odd = unsafe { pext_u64_bmi2(packed >> 4, NIBBLE_MASK) };

        // Interleave even and odd nibbles
        for i in 0..8 {
            result[i * 2] = ((even >> (i * 4)) & 0x0F) as u8;
            result[i * 2 + 1] = ((odd >> (i * 4)) & 0x0F) as u8;
        }

        return result;
    }

    // Scalar fallback
    for i in 0..16 {
        result[i] = ((packed >> (i * 4)) & 0x0F) as u8;
    }

    result
}

/// Deposit multiple 4-bit values using PDEP.
/// Processes 16 values per u64 word.
#[inline]
pub fn deposit_4bit_batch(values: [u8; 16]) -> u64 {
    #[cfg(target_arch = "x86_64")]
    if bmi2_available() {
        // Combine even and odd nibbles
        let mut even = 0u64;
        let mut odd = 0u64;

        for i in 0..8 {
            even |= ((values[i * 2] & 0x0F) as u64) << (i * 4);
            odd |= ((values[i * 2 + 1] & 0x0F) as u64) << (i * 4);
        }

        const NIBBLE_MASK: u64 = 0x0F0F_0F0F_0F0F_0F0F;

        let packed_even = unsafe { pdep_u64_bmi2(even, NIBBLE_MASK) };
        let packed_odd = unsafe { pdep_u64_bmi2(odd, NIBBLE_MASK) } << 4;

        return packed_even | packed_odd;
    }

    // Scalar fallback
    let mut result = 0u64;
    for i in 0..16 {
        result |= ((values[i] & 0x0F) as u64) << (i * 4);
    }
    result
}

/// Dispatch info for debugging.
pub fn dispatch_info() -> &'static str {
    #[cfg(target_arch = "x86_64")]
    {
        if bmi2_available() {
            if bmi2_fast() {
                return "BMI2 (fast)";
            } else {
                return "BMI2 (slow/microcode)";
            }
        }
        return "Scalar (x86_64)";
    }
    #[cfg(target_arch = "aarch64")]
    {
        return "Scalar (ARM64)";
    }
    #[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
    {
        return "Scalar";
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_pext_basic() {
        // Extract every other bit
        let src = 0b_1010_1010u64;
        let mask = 0b_0101_0101u64;

        let result = pext_u64(src, mask);
        assert_eq!(result, 0b_0000_0000); // No bits at even positions

        let result2 = pext_u64(src, 0b_1010_1010);
        assert_eq!(result2, 0b_0000_1111); // All bits at odd positions
    }

    #[test]
    fn test_pdep_basic() {
        // Deposit to every other position
        let src = 0b_0000_1111u64;
        let mask = 0b_0101_0101u64;

        let result = pdep_u64(src, mask);
        assert_eq!(result, 0b_0101_0101); // All 1s at even positions
    }

    #[test]
    fn test_pext_pdep_roundtrip() {
        let original = 0b_1100_1010_0011_0101u64;
        let mask = 0b_1111_0000_1111_0000u64;

        let extracted = pext_u64(original, mask);
        let restored = pdep_u64(extracted, mask);

        assert_eq!(original & mask, restored);
    }

    #[test]
    fn test_pack_unpack_4bit() {
        let values: Vec<u8> = vec![0, 15, 7, 8, 3, 12];
        let packed = pack_4bit(&values);
        let unpacked = unpack_4bit(&packed, values.len());

        assert_eq!(values, unpacked);
    }

    #[test]
    fn test_pack_unpack_nbits() {
        // 3-bit values
        let values: Vec<u8> = vec![0, 7, 3, 5, 2, 6, 1, 4];
        let packed = pack_nbits(&values, 3);
        let unpacked = unpack_nbits(&packed, 3, values.len());

        assert_eq!(values, unpacked);
    }

    #[test]
    fn test_extract_4bit_batch() {
        // Create a packed u64 where nibble i = i
        let mut packed: u64 = 0;
        for i in 0..16u64 {
            packed |= i << (i * 4);
        }

        let result = extract_4bit_batch(packed);

        for i in 0..16 {
            assert_eq!(result[i], i as u8, "nibble {} mismatch", i);
        }
    }

    #[test]
    fn test_deposit_4bit_batch() {
        let values: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
        let packed = deposit_4bit_batch(values);

        // Verify by extraction
        for i in 0..16 {
            assert_eq!(((packed >> (i * 4)) & 0x0F) as u8, i as u8);
        }
    }

    #[test]
    fn test_dispatch_info() {
        let info = dispatch_info();
        assert!(!info.is_empty());
        // Should contain some descriptor
        println!("BMI2 dispatch: {}", info);
    }

    #[test]
    fn test_scalar_fallback_correctness() {
        // Test scalar implementations directly
        let src = 0xDEAD_BEEF_CAFE_BABEu64;
        let mask = 0x5555_5555_5555_5555u64;

        let extracted = pext_u64_scalar(src, mask);
        let deposited = pdep_u64_scalar(extracted, mask);

        assert_eq!(src & mask, deposited);
    }

    #[test]
    fn test_32bit_versions() {
        let src = 0xABCD_1234u32;
        let mask = 0xFF00_FF00u32;

        let extracted = pext_u32(src, mask);
        let deposited = pdep_u32(extracted, mask);

        assert_eq!(src & mask, deposited);
    }

    #[test]
    fn test_edge_cases() {
        // Zero mask
        assert_eq!(pext_u64(0xFFFF_FFFF, 0), 0);
        assert_eq!(pdep_u64(0xFFFF_FFFF, 0), 0);

        // Full mask
        assert_eq!(pext_u64(0x1234, 0xFFFF), 0x1234);
        assert_eq!(pdep_u64(0x1234, 0xFFFF), 0x1234);

        // Zero source
        assert_eq!(pext_u64(0, 0xFFFF), 0);
        assert_eq!(pdep_u64(0, 0xFFFF), 0);
    }
}