vibesql_executor/select/columnar/simd_ops/
mod.rs

1//! Auto-vectorized SIMD operations for columnar data processing.
2//!
3//! # Performance Architecture
4//!
5//! These functions are structured to enable LLVM auto-vectorization. They achieve
6//! equivalent performance to explicit SIMD (e.g., the `wide` crate) without the
7//! complexity of platform-specific code.
8//!
9//! ## Why This Pattern Works
10//!
11//! LLVM can auto-vectorize loops when:
12//! 1. Loop bounds are known or predictable
13//! 2. Memory access is sequential
14//! 3. Operations are independent across lanes
15//!
16//! The 4-accumulator pattern breaks loop-carried dependencies, allowing LLVM to
17//! use SIMD registers effectively:
18//!
19//! ```text
20//! // BAD: Single accumulator creates dependency chain
21//! for x in data { sum += x; }  // Each add waits for previous
22//!
23//! // GOOD: Four accumulators enable parallel execution
24//! for chunk in data.chunks(4) {
25//!     s0 += chunk[0];  // These four adds can execute
26//!     s1 += chunk[1];  // simultaneously in SIMD lanes
27//!     s2 += chunk[2];
28//!     s3 += chunk[3];
29//! }
30//! ```
31//!
32//! ## Benchmark Results (10M elements, Apple Silicon)
33//!
34//! | Operation | wide crate | auto-vectorized | naive iter |
35//! |-----------|------------|-----------------|------------|
36//! | sum_f64   | 2.0 ms     | 2.0 ms (1.0x)   | 7.8 ms     |
37//! | min_f64   | 1.5 ms     | 1.5 ms (1.0x)   | 1.4 ms     |
38//!
39//! ## WARNING
40//!
41//! DO NOT "simplify" these functions to use `.iter().sum()` or similar patterns.
42//! While cleaner-looking, they can be 3-4x slower due to floating-point
43//! associativity constraints preventing vectorization.
44//!
45//! If you need to modify these functions, run the SIMD benchmark first:
46//! ```bash
47//! cargo bench --bench tpch -- Q6
48//! ```
49
50mod aggregate;
51mod comparison;
52mod logical;
53
54pub use aggregate::*;
55pub use comparison::*;
56pub use logical::*;
57
58// ============================================================================
59// PACKED BITMASK TYPE
60// ============================================================================
61// Using packed bitmasks (1 bit per row) instead of Vec<bool> (1 byte per row)
62// provides 8x memory reduction and enables native SIMD bitwise operations.
63
64/// A packed bitmask where each bit represents whether a row passes a filter.
65///
66/// # Memory Efficiency
67///
68/// For 6M rows (lineitem at TPC-H SF 1):
69/// - `Vec<bool>`: 6 MB per predicate
70/// - `PackedMask`: 750 KB per predicate (8x reduction)
71///
72/// # Performance Benefits
73///
74/// - Bitwise AND is a native SIMD instruction (`vpand` on x86, `and` on ARM)
75/// - Better cache utilization due to reduced memory footprint
76/// - Faster popcount using `count_ones()` intrinsic
77///
78/// # Implementation
79///
80/// Bits are stored in little-endian order within each u64 word:
81/// - Bit 0 of word 0 = row 0
82/// - Bit 63 of word 0 = row 63
83/// - Bit 0 of word 1 = row 64
84/// - etc.
85#[derive(Debug, Clone, PartialEq, Eq)]
86pub struct PackedMask {
87    /// The packed bits. Each u64 holds 64 row flags.
88    words: Vec<u64>,
89    /// The actual number of rows (since the last word may be partial).
90    len: usize,
91}
92
93impl PackedMask {
94    /// Creates a new mask with all bits set (all rows pass).
95    #[inline]
96    pub fn new_all_set(len: usize) -> Self {
97        if len == 0 {
98            return Self { words: vec![], len: 0 };
99        }
100        let num_words = len.div_ceil(64);
101        let mut words = vec![u64::MAX; num_words];
102
103        // Clear unused bits in the last word
104        let remainder = len % 64;
105        if remainder != 0 {
106            words[num_words - 1] = (1u64 << remainder) - 1;
107        }
108
109        Self { words, len }
110    }
111
112    /// Creates a new mask with all bits clear (no rows pass).
113    #[inline]
114    pub fn new_all_clear(len: usize) -> Self {
115        if len == 0 {
116            return Self { words: vec![], len: 0 };
117        }
118        let num_words = len.div_ceil(64);
119        Self { words: vec![0; num_words], len }
120    }
121
122    /// Returns the number of rows this mask represents.
123    #[inline]
124    pub fn len(&self) -> usize {
125        self.len
126    }
127
128    /// Returns true if this mask is empty.
129    #[inline]
130    pub fn is_empty(&self) -> bool {
131        self.len == 0
132    }
133
134    /// Gets the value of the bit at the given index.
135    #[inline]
136    pub fn get(&self, index: usize) -> bool {
137        debug_assert!(index < self.len, "index out of bounds");
138        let word_idx = index / 64;
139        let bit_idx = index % 64;
140        (self.words[word_idx] >> bit_idx) & 1 == 1
141    }
142
143    /// Sets the bit at the given index.
144    #[inline]
145    pub fn set(&mut self, index: usize, value: bool) {
146        debug_assert!(index < self.len, "index out of bounds");
147        let word_idx = index / 64;
148        let bit_idx = index % 64;
149        if value {
150            self.words[word_idx] |= 1u64 << bit_idx;
151        } else {
152            self.words[word_idx] &= !(1u64 << bit_idx);
153        }
154    }
155
156    /// Performs in-place bitwise AND with another mask.
157    ///
158    /// This is the key operation for combining multiple predicates.
159    /// Uses native bitwise AND which maps directly to SIMD instructions.
160    #[inline]
161    pub fn and_inplace(&mut self, other: &PackedMask) {
162        debug_assert_eq!(self.len, other.len, "mask lengths must match");
163
164        // Process in chunks of 4 for potential auto-vectorization
165        let chunks = self.words.len() / 4;
166        for i in 0..chunks {
167            let base = i * 4;
168            self.words[base] &= other.words[base];
169            self.words[base + 1] &= other.words[base + 1];
170            self.words[base + 2] &= other.words[base + 2];
171            self.words[base + 3] &= other.words[base + 3];
172        }
173
174        // Handle remainder
175        for i in (chunks * 4)..self.words.len() {
176            self.words[i] &= other.words[i];
177        }
178    }
179
180    /// Performs in-place bitwise OR with another mask.
181    ///
182    /// This is used for combining multiple predicates in IN lists.
183    /// Uses native bitwise OR which maps directly to SIMD instructions.
184    #[inline]
185    pub fn or_inplace(&mut self, other: &PackedMask) {
186        debug_assert_eq!(self.len, other.len, "mask lengths must match");
187
188        // Process in chunks of 4 for potential auto-vectorization
189        let chunks = self.words.len() / 4;
190        for i in 0..chunks {
191            let base = i * 4;
192            self.words[base] |= other.words[base];
193            self.words[base + 1] |= other.words[base + 1];
194            self.words[base + 2] |= other.words[base + 2];
195            self.words[base + 3] |= other.words[base + 3];
196        }
197
198        // Handle remainder
199        for i in (chunks * 4)..self.words.len() {
200            self.words[i] |= other.words[i];
201        }
202    }
203
204    /// Returns a new mask with all bits inverted.
205    ///
206    /// This is used for NOT IN operations.
207    #[inline]
208    pub fn not(&self) -> PackedMask {
209        let mut result = PackedMask { words: vec![0; self.words.len()], len: self.len };
210
211        // Process in chunks of 4 for potential auto-vectorization
212        let chunks = self.words.len() / 4;
213        for i in 0..chunks {
214            let base = i * 4;
215            result.words[base] = !self.words[base];
216            result.words[base + 1] = !self.words[base + 1];
217            result.words[base + 2] = !self.words[base + 2];
218            result.words[base + 3] = !self.words[base + 3];
219        }
220
221        // Handle remainder
222        for i in (chunks * 4)..self.words.len() {
223            result.words[i] = !self.words[i];
224        }
225
226        // Clear unused bits in the last word
227        let remainder = self.len % 64;
228        let num_words = result.words.len();
229        if remainder != 0 && num_words > 0 {
230            result.words[num_words - 1] &= (1u64 << remainder) - 1;
231        }
232
233        result
234    }
235
236    /// Counts the number of set bits (rows that pass the filter).
237    ///
238    /// Uses the `count_ones()` intrinsic which maps to hardware popcount.
239    #[inline]
240    pub fn count_ones(&self) -> usize {
241        // Use 4-accumulator pattern for better throughput
242        let chunks = self.words.len() / 4;
243        let (mut c0, mut c1, mut c2, mut c3) = (0u32, 0u32, 0u32, 0u32);
244
245        for i in 0..chunks {
246            let base = i * 4;
247            c0 += self.words[base].count_ones();
248            c1 += self.words[base + 1].count_ones();
249            c2 += self.words[base + 2].count_ones();
250            c3 += self.words[base + 3].count_ones();
251        }
252
253        let mut total = (c0 + c1 + c2 + c3) as usize;
254        for i in (chunks * 4)..self.words.len() {
255            total += self.words[i].count_ones() as usize;
256        }
257        total
258    }
259
260    /// Creates a packed mask from a boolean slice.
261    ///
262    /// Used for compatibility with existing code during migration.
263    #[inline]
264    pub fn from_bool_slice(slice: &[bool]) -> Self {
265        let len = slice.len();
266        if len == 0 {
267            return Self { words: vec![], len: 0 };
268        }
269
270        let num_words = len.div_ceil(64);
271        let mut words = vec![0u64; num_words];
272
273        // Process 64 bits at a time
274        for (word_idx, word) in words.iter_mut().enumerate() {
275            let start = word_idx * 64;
276            let end = (start + 64).min(len);
277            let mut bits = 0u64;
278            for (bit_idx, &value) in slice[start..end].iter().enumerate() {
279                if value {
280                    bits |= 1u64 << bit_idx;
281                }
282            }
283            *word = bits;
284        }
285
286        Self { words, len }
287    }
288
289    /// Converts this packed mask back to a boolean vector.
290    ///
291    /// Used for compatibility with existing code during migration.
292    #[inline]
293    pub fn to_bool_vec(&self) -> Vec<bool> {
294        let mut result = Vec::with_capacity(self.len);
295        for i in 0..self.len {
296            result.push(self.get(i));
297        }
298        result
299    }
300
301    /// Returns a reference to the underlying word slice.
302    #[inline]
303    pub fn words(&self) -> &[u64] {
304        &self.words
305    }
306
307    /// Creates a PackedMask from raw words and length.
308    ///
309    /// This is used internally by comparison functions that build masks.
310    /// The caller must ensure the words are valid for the given length.
311    #[inline]
312    pub(crate) fn from_words(words: Vec<u64>, len: usize) -> Self {
313        Self { words, len }
314    }
315}
316
317#[cfg(test)]
318mod tests {
319    use super::*;
320
321    #[test]
322    fn test_packed_mask_new_all_set() {
323        let mask = PackedMask::new_all_set(100);
324        assert_eq!(mask.len(), 100);
325        for i in 0..100 {
326            assert!(mask.get(i), "bit {} should be set", i);
327        }
328
329        // Empty mask
330        let empty = PackedMask::new_all_set(0);
331        assert_eq!(empty.len(), 0);
332        assert!(empty.is_empty());
333    }
334
335    #[test]
336    fn test_packed_mask_new_all_clear() {
337        let mask = PackedMask::new_all_clear(100);
338        assert_eq!(mask.len(), 100);
339        for i in 0..100 {
340            assert!(!mask.get(i), "bit {} should be clear", i);
341        }
342    }
343
344    #[test]
345    fn test_packed_mask_set_get() {
346        let mut mask = PackedMask::new_all_clear(128);
347
348        // Set some bits
349        mask.set(0, true);
350        mask.set(63, true);
351        mask.set(64, true);
352        mask.set(127, true);
353
354        assert!(mask.get(0));
355        assert!(mask.get(63));
356        assert!(mask.get(64));
357        assert!(mask.get(127));
358        assert!(!mask.get(1));
359        assert!(!mask.get(62));
360        assert!(!mask.get(65));
361    }
362
363    #[test]
364    fn test_packed_mask_and_inplace() {
365        let mut mask_a = PackedMask::new_all_set(128);
366        let mut mask_b = PackedMask::new_all_clear(128);
367
368        // Set every other bit in mask_b
369        for i in (0..128).step_by(2) {
370            mask_b.set(i, true);
371        }
372
373        mask_a.and_inplace(&mask_b);
374
375        for i in 0..128 {
376            if i % 2 == 0 {
377                assert!(mask_a.get(i), "bit {} should be set (even)", i);
378            } else {
379                assert!(!mask_a.get(i), "bit {} should be clear (odd)", i);
380            }
381        }
382    }
383
384    #[test]
385    fn test_packed_mask_count_ones() {
386        let mask = PackedMask::new_all_set(100);
387        assert_eq!(mask.count_ones(), 100);
388
389        let mask = PackedMask::new_all_clear(100);
390        assert_eq!(mask.count_ones(), 0);
391
392        let mut mask = PackedMask::new_all_clear(128);
393        for i in (0..128).step_by(2) {
394            mask.set(i, true);
395        }
396        assert_eq!(mask.count_ones(), 64);
397    }
398
399    #[test]
400    fn test_packed_mask_from_bool_slice() {
401        let bools = vec![true, false, true, true, false, true];
402        let mask = PackedMask::from_bool_slice(&bools);
403
404        assert_eq!(mask.len(), 6);
405        assert!(mask.get(0));
406        assert!(!mask.get(1));
407        assert!(mask.get(2));
408        assert!(mask.get(3));
409        assert!(!mask.get(4));
410        assert!(mask.get(5));
411        assert_eq!(mask.count_ones(), 4);
412    }
413
414    #[test]
415    fn test_packed_mask_to_bool_vec() {
416        let mut mask = PackedMask::new_all_clear(6);
417        mask.set(0, true);
418        mask.set(2, true);
419        mask.set(3, true);
420        mask.set(5, true);
421
422        let bools = mask.to_bool_vec();
423        assert_eq!(bools, vec![true, false, true, true, false, true]);
424    }
425
426    #[test]
427    fn test_packed_mask_round_trip() {
428        let original = vec![true, false, true, true, false, true, false, false, true];
429        let mask = PackedMask::from_bool_slice(&original);
430        let result = mask.to_bool_vec();
431        assert_eq!(original, result);
432    }
433
434    #[test]
435    fn test_packed_mask_partial_word() {
436        // Test with sizes that don't fill a complete u64 word
437        for size in [1, 7, 15, 63, 65, 100, 127, 129] {
438            let mask = PackedMask::new_all_set(size);
439            assert_eq!(mask.count_ones(), size, "failed for size {}", size);
440
441            let mask = PackedMask::new_all_clear(size);
442            assert_eq!(mask.count_ones(), 0, "failed for size {}", size);
443
444            // Test conversion round-trip
445            let bools = vec![true; size];
446            let mask = PackedMask::from_bool_slice(&bools);
447            assert_eq!(mask.count_ones(), size, "round-trip failed for size {}", size);
448            assert_eq!(mask.to_bool_vec(), bools, "round-trip failed for size {}", size);
449        }
450    }
451}