succinctly 0.7.0

High-performance succinct data structures for Rust
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
//! NEON-accelerated DSV indexing for ARM64.
//!
//! Processes 64 bytes at a time using ARM NEON SIMD instructions to find
//! delimiters, newlines, and quotes in parallel.

#[cfg(not(test))]
use alloc::vec;

use core::arch::aarch64::*;

use super::super::config::DsvConfig;
use super::super::index::DsvIndex;
use super::super::index_lightweight::DsvIndexLightweight;
use crate::json::BitWriter;

/// Build a DsvIndex using NEON SIMD acceleration.
///
/// This function processes 64 bytes at a time, using NEON to:
/// 1. Find all delimiter, newline, and quote positions
/// 2. Compute the in-quote mask using prefix XOR
/// 3. Mask out positions inside quotes
///
/// Returns a lightweight index structure for faster iteration (5-9x speedup).
pub fn build_index_simd(text: &[u8], config: &DsvConfig) -> DsvIndex {
    if text.is_empty() {
        return DsvIndex::new_lightweight(DsvIndexLightweight::new(vec![], vec![], 0));
    }

    // SAFETY: NEON is mandatory on aarch64
    unsafe { build_index_neon(text, config) }
}

#[target_feature(enable = "neon")]
unsafe fn build_index_neon(text: &[u8], config: &DsvConfig) -> DsvIndex {
    let num_words = text.len().div_ceil(64);
    let mut markers_writer = BitWriter::with_capacity(num_words);
    let mut newlines_writer = BitWriter::with_capacity(num_words);

    // Track quote state across chunks
    let mut in_quote = false;

    let delimiter = config.delimiter;
    let quote_char = config.quote_char;
    let newline = config.newline;

    let mut offset = 0;

    // Process 64-byte chunks (4x 16-byte NEON loads)
    while offset + 64 <= text.len() {
        let (markers_word, newlines_word, new_in_quote) = unsafe {
            process_chunk_64(
                text.as_ptr().add(offset),
                delimiter,
                quote_char,
                newline,
                in_quote,
            )
        };

        markers_writer.write_bits(markers_word, 64);
        newlines_writer.write_bits(newlines_word, 64);
        in_quote = new_in_quote;
        offset += 64;
    }

    // Process remaining bytes (less than 64)
    if offset < text.len() {
        let remaining = text.len() - offset;

        // Pad with zeros
        let mut padded = [0u8; 64];
        padded[..remaining].copy_from_slice(&text[offset..]);

        let (mut markers_word, mut newlines_word, _) =
            unsafe { process_chunk_64(padded.as_ptr(), delimiter, quote_char, newline, in_quote) };

        // Mask out the padding bits
        let mask = (1u64 << remaining) - 1;
        markers_word &= mask;
        newlines_word &= mask;

        markers_writer.write_bits(markers_word, remaining);
        newlines_writer.write_bits(newlines_word, remaining);
    }

    let markers_words = markers_writer.finish();
    let newlines_words = newlines_writer.finish();

    let lightweight = DsvIndexLightweight::new(markers_words, newlines_words, text.len());
    DsvIndex::new_lightweight(lightweight)
}

/// Process a 64-byte chunk and return (markers, newlines, new_in_quote).
///
/// The algorithm:
/// 1. Find all quote, delimiter, and newline positions using SIMD
/// 2. Compute the in-quote mask using prefix XOR on quote positions
/// 3. Mask delimiters and newlines to exclude those inside quotes
#[inline]
#[target_feature(enable = "neon")]
unsafe fn process_chunk_64(
    ptr: *const u8,
    delimiter: u8,
    quote_char: u8,
    newline: u8,
    in_quote: bool,
) -> (u64, u64, bool) {
    // Load 4 x 16-byte chunks
    let chunk0 = vld1q_u8(ptr);
    let chunk1 = vld1q_u8(ptr.add(16));
    let chunk2 = vld1q_u8(ptr.add(32));
    let chunk3 = vld1q_u8(ptr.add(48));

    // Create comparison vectors
    let v_delimiter = vdupq_n_u8(delimiter);
    let v_quote = vdupq_n_u8(quote_char);
    let v_newline = vdupq_n_u8(newline);

    // Compare each chunk against delimiter, quote, and newline
    let (delim0, quote0, nl0) = compare_chunk(chunk0, v_delimiter, v_quote, v_newline);
    let (delim1, quote1, nl1) = compare_chunk(chunk1, v_delimiter, v_quote, v_newline);
    let (delim2, quote2, nl2) = compare_chunk(chunk2, v_delimiter, v_quote, v_newline);
    let (delim3, quote3, nl3) = compare_chunk(chunk3, v_delimiter, v_quote, v_newline);

    // Extract bitmasks for each 16-byte chunk
    let delim_mask0 = neon_movemask(delim0) as u64;
    let delim_mask1 = neon_movemask(delim1) as u64;
    let delim_mask2 = neon_movemask(delim2) as u64;
    let delim_mask3 = neon_movemask(delim3) as u64;

    let quote_mask0 = neon_movemask(quote0) as u64;
    let quote_mask1 = neon_movemask(quote1) as u64;
    let quote_mask2 = neon_movemask(quote2) as u64;
    let quote_mask3 = neon_movemask(quote3) as u64;

    let nl_mask0 = neon_movemask(nl0) as u64;
    let nl_mask1 = neon_movemask(nl1) as u64;
    let nl_mask2 = neon_movemask(nl2) as u64;
    let nl_mask3 = neon_movemask(nl3) as u64;

    // Combine into 64-bit masks
    let delim_mask = delim_mask0 | (delim_mask1 << 16) | (delim_mask2 << 32) | (delim_mask3 << 48);
    let quote_mask = quote_mask0 | (quote_mask1 << 16) | (quote_mask2 << 32) | (quote_mask3 << 48);
    let nl_mask = nl_mask0 | (nl_mask1 << 16) | (nl_mask2 << 32) | (nl_mask3 << 48);

    // Compute the in-quote mask using prefix XOR (via NEON PMULL)
    // The prefix XOR of the quote mask tells us which positions are inside quotes.
    // If in_quote is true, we XOR with all 1s to flip the mask.
    let quote_xor = prefix_xor_neon(quote_mask);
    let in_quote_mask = if in_quote { !quote_xor } else { quote_xor };

    // Delimiters and newlines are valid only outside quotes
    let valid_delim = delim_mask & !in_quote_mask;
    let valid_nl = nl_mask & !in_quote_mask;

    // Markers = delimiters OR newlines (both outside quotes)
    let markers = valid_delim | valid_nl;
    let newlines = valid_nl;

    // New in_quote state: count quotes and XOR with current state
    let quote_count = quote_mask.count_ones();
    let new_in_quote = (quote_count & 1 == 1) != in_quote;

    (markers, newlines, new_in_quote)
}

/// Compare a 16-byte chunk against delimiter, quote, and newline characters.
/// Returns (delimiter_mask, quote_mask, newline_mask) as NEON vectors.
#[inline]
#[target_feature(enable = "neon")]
unsafe fn compare_chunk(
    chunk: uint8x16_t,
    v_delimiter: uint8x16_t,
    v_quote: uint8x16_t,
    v_newline: uint8x16_t,
) -> (uint8x16_t, uint8x16_t, uint8x16_t) {
    let eq_delim = vceqq_u8(chunk, v_delimiter);
    let eq_quote = vceqq_u8(chunk, v_quote);
    let eq_newline = vceqq_u8(chunk, v_newline);
    (eq_delim, eq_quote, eq_newline)
}

/// Extract a bitmask from the high bit of each byte in a NEON vector.
/// Returns a u16 where bit i is set if byte i has its high bit set.
#[inline]
#[target_feature(enable = "neon")]
unsafe fn neon_movemask(v: uint8x16_t) -> u16 {
    // Shift right by 7 to get 0 or 1 in each byte
    let high_bits = vshrq_n_u8::<7>(v);

    // Extract the 16 bytes as two u64 values
    let low_u64 = vgetq_lane_u64::<0>(vreinterpretq_u64_u8(high_bits));
    let high_u64 = vgetq_lane_u64::<1>(vreinterpretq_u64_u8(high_bits));

    // Pack 8 bytes into 8 bits using multiplication trick
    const MAGIC: u64 = 0x0102040810204080;

    let low_packed = (low_u64.wrapping_mul(MAGIC) >> 56) as u8;
    let high_packed = (high_u64.wrapping_mul(MAGIC) >> 56) as u8;

    (low_packed as u16) | ((high_packed as u16) << 8)
}

/// Compute inclusive prefix XOR (cumulative XOR) of a 64-bit mask using NEON PMULL.
///
/// The prefix XOR at position i is the XOR of all bits at positions 0..=i.
/// This tells us the "parity" of quotes seen so far (including current),
/// which indicates whether we're inside a quoted field.
///
/// After prefix XOR:
/// - Positions where the bit is 1 have seen an odd number of quotes (inside)
/// - Positions where the bit is 0 have seen an even number of quotes (outside)
///
/// Example: quotes at positions 2 and 5
/// quote_mask = 0b100100 (bits 2 and 5 set)
/// prefix_xor = 0b011100 (bits 2,3,4 are "inside" quotes)
///
/// ## Implementation
///
/// Uses carryless multiplication (PMULL instruction) which computes prefix XOR
/// in O(1) operations instead of O(log n) shifts.
///
/// The key insight: `prefix_xor(x) = clmul(x, 0xFFFFFFFFFFFFFFFF)` truncated to 64 bits.
/// This is because carryless multiplication by all-1s propagates each bit to all
/// higher positions via XOR, which is exactly the prefix XOR operation.
#[inline]
#[target_feature(enable = "neon")]
unsafe fn prefix_xor_neon(x: u64) -> u64 {
    use core::arch::aarch64::*;

    // Load x into a NEON register (low 64 bits of poly64x2)
    let a = vreinterpretq_p64_u64(vdupq_n_u64(x));

    // Load the all-1s constant for carryless multiplication
    let b = vreinterpretq_p64_u64(vdupq_n_u64(!0u64));

    // Carryless multiply: result = x * 0xFFFF...FFFF in GF(2)
    // This produces a 128-bit result, but we only need the low 64 bits
    // vmull_p64 multiplies the low lane of each operand
    let product = vmull_p64(vgetq_lane_p64::<0>(a), vgetq_lane_p64::<0>(b));

    // Extract the low 64 bits which contains the prefix XOR result
    vgetq_lane_u64::<0>(vreinterpretq_u64_p128(product))
}

/// Scalar fallback for prefix XOR (used in tests for comparison).
#[inline]
#[allow(dead_code)]
fn prefix_xor_scalar(x: u64) -> u64 {
    // Parallel prefix XOR using doubling with left shifts
    // After step k, each bit i contains XOR of bits max(0, i-2^k+1)..=i
    // After all steps, each bit i contains XOR of bits 0..=i
    let mut y = x;
    y ^= y << 1;
    y ^= y << 2;
    y ^= y << 4;
    y ^= y << 8;
    y ^= y << 16;
    y ^= y << 32;
    y
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_prefix_xor_neon() {
        // SAFETY: NEON is mandatory on aarch64
        unsafe {
            // No quotes
            assert_eq!(prefix_xor_neon(0b0), 0b0);

            // Single quote at position 0
            assert_eq!(prefix_xor_neon(0b1), !0u64); // All bits after are "inside"

            // Quote at position 2
            // prefix_xor should set all bits from position 2 onwards
            let q = 0b100u64;
            let px = prefix_xor_neon(q);
            // Bits 0,1 should be 0 (outside), bits 2+ should be 1 (inside)
            assert_eq!(px & 0b11, 0);
            assert_eq!(px & 0b100, 0b100);

            // Quotes at positions 2 and 5 (entering and exiting)
            // Bits 2,3,4 should be inside, bits 0,1,5+ should be outside
            let q = 0b100100u64; // positions 2 and 5
            let px = prefix_xor_neon(q);
            // After prefix XOR: bit i = XOR of bits 0..=i
            // pos 0: 0
            // pos 1: 0
            // pos 2: 1 (quote at 2)
            // pos 3: 1
            // pos 4: 1
            // pos 5: 0 (quote at 5 XORs with accumulated 1)
            assert_eq!(px & 0b111111, 0b011100);
        }
    }

    #[test]
    fn test_prefix_xor_neon_matches_scalar() {
        // Test that NEON PMULL implementation matches scalar implementation
        let test_patterns = [
            0u64,
            1,
            0b11,
            0b101,
            0b1001,
            0x8000_0000_0000_0000,
            0xAAAA_AAAA_AAAA_AAAA,
            0x5555_5555_5555_5555,
            0xFF00_FF00_FF00_FF00,
            0x0123_4567_89AB_CDEF,
            !0u64,
        ];

        for &pattern in &test_patterns {
            let scalar_result = prefix_xor_scalar(pattern);
            let neon_result = unsafe { prefix_xor_neon(pattern) };
            assert_eq!(
                neon_result, scalar_result,
                "Mismatch for pattern {:#018x}: NEON={:#018x}, scalar={:#018x}",
                pattern, neon_result, scalar_result
            );
        }
    }

    #[test]
    fn test_simple_csv() {
        let csv = b"a,b,c\n";
        let config = DsvConfig::default();
        let index = build_index_simd(csv, &config);

        // Should have 3 markers: positions 1 (,), 3 (,), 5 (\n)
        assert_eq!(index.marker_count(), 3);
        assert_eq!(index.row_count(), 1);
    }

    #[test]
    fn test_quoted_delimiter() {
        let csv = b"\"a,b\",c\n";
        let config = DsvConfig::default();
        let index = build_index_simd(csv, &config);

        // The comma inside quotes should not be a marker
        assert_eq!(index.marker_count(), 2); // comma after quote + newline
        assert_eq!(index.row_count(), 1);
    }

    #[test]
    fn test_quoted_newline() {
        let csv = b"\"a\nb\",c\n";
        let config = DsvConfig::default();
        let index = build_index_simd(csv, &config);

        // The newline inside quotes should not be counted
        assert_eq!(index.row_count(), 1);
    }

    #[test]
    fn test_multiple_rows() {
        let csv = b"a,b\nc,d\ne,f\n";
        let config = DsvConfig::default();
        let index = build_index_simd(csv, &config);

        assert_eq!(index.row_count(), 3);
        // 3 commas + 3 newlines = 6 markers
        assert_eq!(index.marker_count(), 6);
    }

    #[test]
    fn test_empty() {
        let csv = b"";
        let config = DsvConfig::default();
        let index = build_index_simd(csv, &config);

        assert_eq!(index.row_count(), 0);
        assert_eq!(index.marker_count(), 0);
    }

    #[test]
    fn test_simd_matches_scalar() {
        let csv = b"a,b,c\nd,e,f\n\"g,h\",i\n";
        let config = DsvConfig::default();

        let index_simd = build_index_simd(csv, &config);
        let index_scalar = super::super::super::parser::build_index(csv, &config);

        assert_eq!(index_simd.marker_count(), index_scalar.marker_count());
        assert_eq!(index_simd.row_count(), index_scalar.row_count());
    }

    #[test]
    fn test_long_quoted_field() {
        // Test with quoted field spanning multiple 16-byte chunks
        let csv = b"\"this is a very long quoted field that spans multiple NEON chunks\",b\n";
        let config = DsvConfig::default();

        let index_simd = build_index_simd(csv, &config);
        let index_scalar = super::super::super::parser::build_index(csv, &config);

        assert_eq!(index_simd.marker_count(), index_scalar.marker_count());
        assert_eq!(index_simd.row_count(), index_scalar.row_count());
    }

    #[test]
    fn test_large_csv() {
        // Test with more than 64 bytes
        let csv = b"a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z\n\
                   1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6\n";
        let config = DsvConfig::default();

        let index_simd = build_index_simd(csv, &config);
        let index_scalar = super::super::super::parser::build_index(csv, &config);

        assert_eq!(index_simd.marker_count(), index_scalar.marker_count());
        assert_eq!(index_simd.row_count(), index_scalar.row_count());
    }

    #[test]
    fn test_quoted_spanning_chunks() {
        // Quote that spans a 64-byte boundary
        let mut csv = Vec::new();
        csv.push(b'"');
        // Add content to push past 64 bytes before closing quote
        csv.resize(71, b'x');
        csv.push(b'"');
        csv.push(b',');
        csv.push(b'b');
        csv.push(b'\n');

        let config = DsvConfig::default();
        let index_simd = build_index_simd(&csv, &config);
        let index_scalar = super::super::super::parser::build_index(&csv, &config);

        assert_eq!(
            index_simd.marker_count(),
            index_scalar.marker_count(),
            "Marker count mismatch"
        );
        assert_eq!(
            index_simd.row_count(),
            index_scalar.row_count(),
            "Row count mismatch"
        );
    }
}