vibesql_executor/select/columnar/simd_ops/
mod.rs

1//! Auto-vectorized SIMD operations for columnar data processing.
2//!
3//! # Performance Architecture
4//!
5//! These functions are structured to enable LLVM auto-vectorization. They achieve
6//! equivalent performance to explicit SIMD (e.g., the `wide` crate) without the
7//! complexity of platform-specific code.
8//!
9//! ## Why This Pattern Works
10//!
11//! LLVM can auto-vectorize loops when:
12//! 1. Loop bounds are known or predictable
13//! 2. Memory access is sequential
14//! 3. Operations are independent across lanes
15//!
16//! The 4-accumulator pattern breaks loop-carried dependencies, allowing LLVM to
17//! use SIMD registers effectively:
18//!
19//! ```text
20//! // BAD: Single accumulator creates dependency chain
21//! for x in data { sum += x; }  // Each add waits for previous
22//!
23//! // GOOD: Four accumulators enable parallel execution
24//! for chunk in data.chunks(4) {
25//!     s0 += chunk[0];  // These four adds can execute
26//!     s1 += chunk[1];  // simultaneously in SIMD lanes
27//!     s2 += chunk[2];
28//!     s3 += chunk[3];
29//! }
30//! ```
31//!
32//! ## Benchmark Results (10M elements, Apple Silicon)
33//!
34//! | Operation | wide crate | auto-vectorized | naive iter |
35//! |-----------|------------|-----------------|------------|
36//! | sum_f64   | 2.0 ms     | 2.0 ms (1.0x)   | 7.8 ms     |
37//! | min_f64   | 1.5 ms     | 1.5 ms (1.0x)   | 1.4 ms     |
38//!
39//! ## WARNING
40//!
41//! DO NOT "simplify" these functions to use `.iter().sum()` or similar patterns.
42//! While cleaner-looking, they can be 3-4x slower due to floating-point
43//! associativity constraints preventing vectorization.
44//!
45//! If you need to modify these functions, run the SIMD benchmark first:
46//! ```bash
47//! cargo bench --bench tpch -- Q6
48//! ```
49
50mod aggregate;
51mod comparison;
52mod logical;
53
54pub use aggregate::*;
55pub use comparison::*;
56pub use logical::*;
57
58// ============================================================================
59// PACKED BITMASK TYPE
60// ============================================================================
61// Using packed bitmasks (1 bit per row) instead of Vec<bool> (1 byte per row)
62// provides 8x memory reduction and enables native SIMD bitwise operations.
63
64/// A packed bitmask where each bit represents whether a row passes a filter.
65///
66/// # Memory Efficiency
67///
68/// For 6M rows (lineitem at TPC-H SF 1):
69/// - `Vec<bool>`: 6 MB per predicate
70/// - `PackedMask`: 750 KB per predicate (8x reduction)
71///
72/// # Performance Benefits
73///
74/// - Bitwise AND is a native SIMD instruction (`vpand` on x86, `and` on ARM)
75/// - Better cache utilization due to reduced memory footprint
76/// - Faster popcount using `count_ones()` intrinsic
77///
78/// # Implementation
79///
80/// Bits are stored in little-endian order within each u64 word:
81/// - Bit 0 of word 0 = row 0
82/// - Bit 63 of word 0 = row 63
83/// - Bit 0 of word 1 = row 64
84/// - etc.
85#[derive(Debug, Clone, PartialEq, Eq)]
86pub struct PackedMask {
87    /// The packed bits. Each u64 holds 64 row flags.
88    words: Vec<u64>,
89    /// The actual number of rows (since the last word may be partial).
90    len: usize,
91}
92
93impl PackedMask {
94    /// Creates a new mask with all bits set (all rows pass).
95    #[inline]
96    pub fn new_all_set(len: usize) -> Self {
97        if len == 0 {
98            return Self { words: vec![], len: 0 };
99        }
100        let num_words = len.div_ceil(64);
101        let mut words = vec![u64::MAX; num_words];
102
103        // Clear unused bits in the last word
104        let remainder = len % 64;
105        if remainder != 0 {
106            words[num_words - 1] = (1u64 << remainder) - 1;
107        }
108
109        Self { words, len }
110    }
111
112    /// Creates a new mask with all bits clear (no rows pass).
113    #[inline]
114    pub fn new_all_clear(len: usize) -> Self {
115        if len == 0 {
116            return Self { words: vec![], len: 0 };
117        }
118        let num_words = len.div_ceil(64);
119        Self { words: vec![0; num_words], len }
120    }
121
122    /// Returns the number of rows this mask represents.
123    #[inline]
124    pub fn len(&self) -> usize {
125        self.len
126    }
127
128    /// Returns true if this mask is empty.
129    #[inline]
130    pub fn is_empty(&self) -> bool {
131        self.len == 0
132    }
133
134    /// Gets the value of the bit at the given index.
135    #[inline]
136    pub fn get(&self, index: usize) -> bool {
137        debug_assert!(index < self.len, "index out of bounds");
138        let word_idx = index / 64;
139        let bit_idx = index % 64;
140        (self.words[word_idx] >> bit_idx) & 1 == 1
141    }
142
143    /// Sets the bit at the given index.
144    #[inline]
145    pub fn set(&mut self, index: usize, value: bool) {
146        debug_assert!(index < self.len, "index out of bounds");
147        let word_idx = index / 64;
148        let bit_idx = index % 64;
149        if value {
150            self.words[word_idx] |= 1u64 << bit_idx;
151        } else {
152            self.words[word_idx] &= !(1u64 << bit_idx);
153        }
154    }
155
156    /// Performs in-place bitwise AND with another mask.
157    ///
158    /// This is the key operation for combining multiple predicates.
159    /// Uses native bitwise AND which maps directly to SIMD instructions.
160    #[inline]
161    pub fn and_inplace(&mut self, other: &PackedMask) {
162        debug_assert_eq!(self.len, other.len, "mask lengths must match");
163
164        // Process in chunks of 4 for potential auto-vectorization
165        let chunks = self.words.len() / 4;
166        for i in 0..chunks {
167            let base = i * 4;
168            self.words[base] &= other.words[base];
169            self.words[base + 1] &= other.words[base + 1];
170            self.words[base + 2] &= other.words[base + 2];
171            self.words[base + 3] &= other.words[base + 3];
172        }
173
174        // Handle remainder
175        for i in (chunks * 4)..self.words.len() {
176            self.words[i] &= other.words[i];
177        }
178    }
179
180    /// Performs in-place bitwise OR with another mask.
181    ///
182    /// This is used for combining multiple predicates in IN lists.
183    /// Uses native bitwise OR which maps directly to SIMD instructions.
184    #[inline]
185    pub fn or_inplace(&mut self, other: &PackedMask) {
186        debug_assert_eq!(self.len, other.len, "mask lengths must match");
187
188        // Process in chunks of 4 for potential auto-vectorization
189        let chunks = self.words.len() / 4;
190        for i in 0..chunks {
191            let base = i * 4;
192            self.words[base] |= other.words[base];
193            self.words[base + 1] |= other.words[base + 1];
194            self.words[base + 2] |= other.words[base + 2];
195            self.words[base + 3] |= other.words[base + 3];
196        }
197
198        // Handle remainder
199        for i in (chunks * 4)..self.words.len() {
200            self.words[i] |= other.words[i];
201        }
202    }
203
204    /// Returns a new mask with all bits inverted.
205    ///
206    /// This is used for NOT IN operations.
207    #[inline]
208    pub fn not(&self) -> PackedMask {
209        let mut result = PackedMask {
210            words: vec![0; self.words.len()],
211            len: self.len,
212        };
213
214        // Process in chunks of 4 for potential auto-vectorization
215        let chunks = self.words.len() / 4;
216        for i in 0..chunks {
217            let base = i * 4;
218            result.words[base] = !self.words[base];
219            result.words[base + 1] = !self.words[base + 1];
220            result.words[base + 2] = !self.words[base + 2];
221            result.words[base + 3] = !self.words[base + 3];
222        }
223
224        // Handle remainder
225        for i in (chunks * 4)..self.words.len() {
226            result.words[i] = !self.words[i];
227        }
228
229        // Clear unused bits in the last word
230        let remainder = self.len % 64;
231        let num_words = result.words.len();
232        if remainder != 0 && num_words > 0 {
233            result.words[num_words - 1] &= (1u64 << remainder) - 1;
234        }
235
236        result
237    }
238
239    /// Counts the number of set bits (rows that pass the filter).
240    ///
241    /// Uses the `count_ones()` intrinsic which maps to hardware popcount.
242    #[inline]
243    pub fn count_ones(&self) -> usize {
244        // Use 4-accumulator pattern for better throughput
245        let chunks = self.words.len() / 4;
246        let (mut c0, mut c1, mut c2, mut c3) = (0u32, 0u32, 0u32, 0u32);
247
248        for i in 0..chunks {
249            let base = i * 4;
250            c0 += self.words[base].count_ones();
251            c1 += self.words[base + 1].count_ones();
252            c2 += self.words[base + 2].count_ones();
253            c3 += self.words[base + 3].count_ones();
254        }
255
256        let mut total = (c0 + c1 + c2 + c3) as usize;
257        for i in (chunks * 4)..self.words.len() {
258            total += self.words[i].count_ones() as usize;
259        }
260        total
261    }
262
263    /// Creates a packed mask from a boolean slice.
264    ///
265    /// Used for compatibility with existing code during migration.
266    #[inline]
267    pub fn from_bool_slice(slice: &[bool]) -> Self {
268        let len = slice.len();
269        if len == 0 {
270            return Self { words: vec![], len: 0 };
271        }
272
273        let num_words = len.div_ceil(64);
274        let mut words = vec![0u64; num_words];
275
276        // Process 64 bits at a time
277        for (word_idx, word) in words.iter_mut().enumerate() {
278            let start = word_idx * 64;
279            let end = (start + 64).min(len);
280            let mut bits = 0u64;
281            for (bit_idx, &value) in slice[start..end].iter().enumerate() {
282                if value {
283                    bits |= 1u64 << bit_idx;
284                }
285            }
286            *word = bits;
287        }
288
289        Self { words, len }
290    }
291
292    /// Converts this packed mask back to a boolean vector.
293    ///
294    /// Used for compatibility with existing code during migration.
295    #[inline]
296    pub fn to_bool_vec(&self) -> Vec<bool> {
297        let mut result = Vec::with_capacity(self.len);
298        for i in 0..self.len {
299            result.push(self.get(i));
300        }
301        result
302    }
303
304    /// Returns a reference to the underlying word slice.
305    #[inline]
306    pub fn words(&self) -> &[u64] {
307        &self.words
308    }
309
310    /// Creates a PackedMask from raw words and length.
311    ///
312    /// This is used internally by comparison functions that build masks.
313    /// The caller must ensure the words are valid for the given length.
314    #[inline]
315    pub(crate) fn from_words(words: Vec<u64>, len: usize) -> Self {
316        Self { words, len }
317    }
318}
319
320#[cfg(test)]
321mod tests {
322    use super::*;
323
324    #[test]
325    fn test_packed_mask_new_all_set() {
326        let mask = PackedMask::new_all_set(100);
327        assert_eq!(mask.len(), 100);
328        for i in 0..100 {
329            assert!(mask.get(i), "bit {} should be set", i);
330        }
331
332        // Empty mask
333        let empty = PackedMask::new_all_set(0);
334        assert_eq!(empty.len(), 0);
335        assert!(empty.is_empty());
336    }
337
338    #[test]
339    fn test_packed_mask_new_all_clear() {
340        let mask = PackedMask::new_all_clear(100);
341        assert_eq!(mask.len(), 100);
342        for i in 0..100 {
343            assert!(!mask.get(i), "bit {} should be clear", i);
344        }
345    }
346
347    #[test]
348    fn test_packed_mask_set_get() {
349        let mut mask = PackedMask::new_all_clear(128);
350
351        // Set some bits
352        mask.set(0, true);
353        mask.set(63, true);
354        mask.set(64, true);
355        mask.set(127, true);
356
357        assert!(mask.get(0));
358        assert!(mask.get(63));
359        assert!(mask.get(64));
360        assert!(mask.get(127));
361        assert!(!mask.get(1));
362        assert!(!mask.get(62));
363        assert!(!mask.get(65));
364    }
365
366    #[test]
367    fn test_packed_mask_and_inplace() {
368        let mut mask_a = PackedMask::new_all_set(128);
369        let mut mask_b = PackedMask::new_all_clear(128);
370
371        // Set every other bit in mask_b
372        for i in (0..128).step_by(2) {
373            mask_b.set(i, true);
374        }
375
376        mask_a.and_inplace(&mask_b);
377
378        for i in 0..128 {
379            if i % 2 == 0 {
380                assert!(mask_a.get(i), "bit {} should be set (even)", i);
381            } else {
382                assert!(!mask_a.get(i), "bit {} should be clear (odd)", i);
383            }
384        }
385    }
386
387    #[test]
388    fn test_packed_mask_count_ones() {
389        let mask = PackedMask::new_all_set(100);
390        assert_eq!(mask.count_ones(), 100);
391
392        let mask = PackedMask::new_all_clear(100);
393        assert_eq!(mask.count_ones(), 0);
394
395        let mut mask = PackedMask::new_all_clear(128);
396        for i in (0..128).step_by(2) {
397            mask.set(i, true);
398        }
399        assert_eq!(mask.count_ones(), 64);
400    }
401
402    #[test]
403    fn test_packed_mask_from_bool_slice() {
404        let bools = vec![true, false, true, true, false, true];
405        let mask = PackedMask::from_bool_slice(&bools);
406
407        assert_eq!(mask.len(), 6);
408        assert!(mask.get(0));
409        assert!(!mask.get(1));
410        assert!(mask.get(2));
411        assert!(mask.get(3));
412        assert!(!mask.get(4));
413        assert!(mask.get(5));
414        assert_eq!(mask.count_ones(), 4);
415    }
416
417    #[test]
418    fn test_packed_mask_to_bool_vec() {
419        let mut mask = PackedMask::new_all_clear(6);
420        mask.set(0, true);
421        mask.set(2, true);
422        mask.set(3, true);
423        mask.set(5, true);
424
425        let bools = mask.to_bool_vec();
426        assert_eq!(bools, vec![true, false, true, true, false, true]);
427    }
428
429    #[test]
430    fn test_packed_mask_round_trip() {
431        let original = vec![true, false, true, true, false, true, false, false, true];
432        let mask = PackedMask::from_bool_slice(&original);
433        let result = mask.to_bool_vec();
434        assert_eq!(original, result);
435    }
436
437    #[test]
438    fn test_packed_mask_partial_word() {
439        // Test with sizes that don't fill a complete u64 word
440        for size in [1, 7, 15, 63, 65, 100, 127, 129] {
441            let mask = PackedMask::new_all_set(size);
442            assert_eq!(mask.count_ones(), size, "failed for size {}", size);
443
444            let mask = PackedMask::new_all_clear(size);
445            assert_eq!(mask.count_ones(), 0, "failed for size {}", size);
446
447            // Test conversion round-trip
448            let bools = vec![true; size];
449            let mask = PackedMask::from_bool_slice(&bools);
450            assert_eq!(mask.count_ones(), size, "round-trip failed for size {}", size);
451            assert_eq!(mask.to_bool_vec(), bools, "round-trip failed for size {}", size);
452        }
453    }
454}