simd_kernels/kernels/bitmask/
simd.rs

1// Copyright Peter Bower 2025. All Rights Reserved.
2// Licensed under Mozilla Public License (MPL) 2.0.
3
4//! # **Bitmask SIMD Kernels** - *Vectorised High-Performance Bitmask Operations*
5//!
6//! SIMD-accelerated implementations of bitmask operations using portable vectorisation with `std::simd`.
7//! These kernels provide optimal performance for large bitmask operations through
8//! SIMD-parallel processing of multiple 64-bit words simultaneously.
9//!
10//! ## Overview
11//! 
12//! This module contains vectorised implementations of all bitmask operations. 
//! It uses configurable SIMD lane counts to adapt to different CPU architectures whilst maintaining code portability.
14//!
15//! We do not check for SIMD alignment here because it is guaranteed by the `Bitmask` as it is backed by *Minarrow*'s `Vec64`. 
16//! 
17//! ## Architecture Principles
18//! 
19//! - **Portable SIMD**: Uses `std::simd` for cross-platform vectorisation without target-specific code
20//! - **Configurable lanes**: Lane counts determined at build time for optimal performance per architecture
21//! - **Hybrid processing**: SIMD inner loops with scalar tail handling for non-aligned lengths
22//! - **Low-cost abstraction**: `Bitmask` is a light-weight structure over a `Vec64`. See Minarrow for details and benchmarks
23//! demonstrating very low abstraction cost.
24//!
25//!
26//! ### **Memory Access Patterns**
27//! - Vectorised loads process multiple words per memory operation
28//! - Sequential access patterns optimise cache utilisation
29//! - Aligned access where possible for maximum performance
30//! - Streaming patterns for large bitmask operations
31//!
32//! ## Specialised Algorithms
33//! 
34//! ### **Population Count (Popcount)**
35//! Uses SIMD reduction for optimal performance:
36//! ```rust,ignore
37//! let counts = simd_vector.count_ones();
38//! total += counts.reduce_sum() as usize;
39//! ```
40//!
41//! ### **Equality Testing**
42//! Leverages SIMD comparison operations:
43//! ```rust,ignore
44//! let eq_mask = vector_a.simd_eq(vector_b);
45//! if !eq_mask.all() { return false; }
46//! ```
47
48use core::simd::{LaneCount, Simd, SupportedLaneCount};
49
50use minarrow::{Bitmask, BitmaskVT};
51
52use crate::kernels::bitmask::{
53    bitmask_window_bytes, bitmask_window_bytes_mut, clear_trailing_bits, mask_bits_as_words,
54    mask_bits_as_words_mut,
55};
56use crate::operators::{LogicalOperator, UnaryOperator};
57
58/// Primitive bit ops
59
60/// Performs vectorised bitwise binary operations (AND/OR/XOR) with configurable lane counts.
61/// 
62/// Core SIMD implementation for logical operations between bitmask windows. Processes data using
63/// vectorised instructions with automatic scalar tail handling for optimal performance across
64/// different data sizes and architectures.
65/// 
66/// # Type Parameters
67/// - `LANES`: Number of u64 lanes to process simultaneously (typically 8, 16, 32, or 64)
68/// 
69/// # Parameters
70/// - `lhs`: Left-hand side bitmask window as `(mask, offset, length)` tuple
71/// - `rhs`: Right-hand side bitmask window as `(mask, offset, length)` tuple
72/// - `op`: Logical operation to perform (AND, OR, XOR)
73/// 
74/// # Returns
75/// A new `Bitmask` containing the vectorised operation results with proper trailing bit handling.
76/// 
77/// # Performance Characteristics
78/// - Vectorised inner loop processes `LANES` words per iteration
79/// - Scalar tail handling ensures correctness for non-aligned lengths
80/// - Memory access patterns optimised for cache efficiency
81/// - Lane count scaling provides architecture-specific optimisation
82#[inline(always)]
83pub fn bitmask_binop_simd<const LANES: usize>(
84    lhs: BitmaskVT<'_>,
85    rhs: BitmaskVT<'_>,
86    op: LogicalOperator,
87) -> Bitmask
88where
89    LaneCount<LANES>: SupportedLaneCount,
90{
91    let (lhs_mask, lhs_off, len) = lhs;
92    let (rhs_mask, rhs_off, _) = rhs;
93    let mut out = Bitmask::new_set_all(len, false);
94    let nw = (len + 63) / 64;
95    unsafe {
96        let lp = mask_bits_as_words(bitmask_window_bytes(lhs_mask, lhs_off, len));
97        let rp = mask_bits_as_words(bitmask_window_bytes(rhs_mask, rhs_off, len));
98        let dp = mask_bits_as_words_mut(bitmask_window_bytes_mut(&mut out, 0, len));
99        let mut i = 0;
100        while i + LANES <= nw {
101            let a = Simd::<u64, LANES>::from_slice(std::slice::from_raw_parts(lp.add(i), LANES));
102            let b = Simd::<u64, LANES>::from_slice(std::slice::from_raw_parts(rp.add(i), LANES));
103            let r = match op {
104                LogicalOperator::And => a & b,
105                LogicalOperator::Or => a | b,
106                LogicalOperator::Xor => a ^ b,
107            };
108            std::ptr::copy_nonoverlapping(r.as_array().as_ptr(), dp.add(i), LANES);
109            i += LANES;
110        }
111        // Tail often caused by `n % LANES != 0`; uses scalar fallback.
112        for k in i..nw {
113            let a = *lp.add(k);
114            let b = *rp.add(k);
115            *dp.add(k) = match op {
116                LogicalOperator::And => a & b,
117                LogicalOperator::Or => a | b,
118                LogicalOperator::Xor => a ^ b,
119            };
120        }
121    }
122    out.len = len;
123    clear_trailing_bits(&mut out);
124    out
125}
126
127/// Performs vectorised bitwise unary operations (NOT) with configurable lane counts.
128/// 
129/// Core SIMD implementation for unary logical operations on bitmask windows. Processes data using
130/// vectorised instructions with automatic scalar tail handling for optimal performance across
131/// different data sizes and CPU architectures.
132/// 
133/// # Type Parameters
134/// - `LANES`: Number of u64 lanes to process simultaneously (typically 8, 16, 32, or 64)
135/// 
136/// # Parameters
137/// - `src`: Source bitmask window as `(mask, offset, length)` tuple
138/// - `op`: Unary operation to perform (currently only NOT supported)
139/// 
140/// # Returns
141/// A new `Bitmask` containing the vectorised operation results with proper trailing bit handling.
142/// 
143/// # Implementation Details
144/// - Vectorised inner loop processes `LANES` words per iteration using SIMD NOT operations
145/// - Scalar tail handling ensures correctness for non-aligned lengths
146/// - Memory access patterns optimised for cache efficiency and sequential processing
147/// - Lane count scaling provides architecture-specific optimisation for different CPU capabilities
148/// 
149/// # Performance Characteristics
150/// - Memory bandwidth: Vectorised loads/stores improve memory subsystem utilisation
151/// - Instruction throughput: Reduced total instruction count for large operations
152/// - Pipeline efficiency: Better utilisation of modern CPU execution units
153/// - Cache locality: Sequential access patterns optimise cache utilisation
154#[inline(always)]
155pub fn bitmask_unop_simd<const LANES: usize>(src: BitmaskVT<'_>, op: UnaryOperator) -> Bitmask
156where
157    LaneCount<LANES>: SupportedLaneCount,
158{
159    let (mask, offset, len) = src;
160    let mut out = Bitmask::new_set_all(len, false);
161    let nw = (len + 63) / 64;
162    unsafe {
163        let sp = mask_bits_as_words(bitmask_window_bytes(mask, offset, len));
164        let dp = mask_bits_as_words_mut(bitmask_window_bytes_mut(&mut out, 0, len));
165        let mut i = 0;
166        while i + LANES <= nw {
167            let a = Simd::<u64, LANES>::from_slice(std::slice::from_raw_parts(sp.add(i), LANES));
168            let r = match op {
169                UnaryOperator::Not => !a,
170                _ => unreachable!(),
171            };
172            std::ptr::copy_nonoverlapping(r.as_array().as_ptr(), dp.add(i), LANES);
173            i += LANES;
174        }
175        // Tail often caused by `n % LANES != 0`; uses scalar fallback.
176        for k in i..nw {
177            let a = *sp.add(k);
178            *dp.add(k) = match op {
179                UnaryOperator::Not => !a,
180                _ => unreachable!(),
181            };
182        }
183    }
184    out.len = len;
185    clear_trailing_bits(&mut out);
186    out
187}
188
189// ---- Entry points ----
190/// Performs vectorised bitwise AND operation between two bitmask windows.
191/// 
192/// High-performance SIMD implementation of logical AND using configurable lane counts for optimal
193/// CPU architecture utilisation. Delegates to the core `bitmask_binop_simd` implementation with
194/// the AND operator.
195/// 
196/// # Type Parameters
197/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
198/// 
199/// # Parameters
200/// - `lhs`: Left-hand side bitmask window as `(mask, offset, length)` tuple
201/// - `rhs`: Right-hand side bitmask window as `(mask, offset, length)` tuple
202/// 
203/// # Returns
204/// A new `Bitmask` containing bitwise AND results with proper trailing bit masking.
205/// 
206/// # Usage Example
207/// ```rust,ignore
208/// use simd_kernels::kernels::bitmask::simd::and_masks_simd;
209/// 
210/// // Process 8 lanes simultaneously (512 bits per instruction)
211/// let result = and_masks_simd::<8>(lhs_window, rhs_window);
212/// ```
213#[inline(always)]
214pub fn and_masks_simd<const LANES: usize>(lhs: BitmaskVT<'_>, rhs: BitmaskVT<'_>) -> Bitmask
215where
216    LaneCount<LANES>: SupportedLaneCount,
217{
218    bitmask_binop_simd::<LANES>(lhs, rhs, LogicalOperator::And)
219}
220
221/// Performs vectorised bitwise OR operation between two bitmask windows.
222/// 
223/// High-performance SIMD implementation of logical OR using configurable lane counts for optimal
224/// CPU architecture utilisation. Delegates to the core `bitmask_binop_simd` implementation with
225/// the OR operator.
226/// 
227/// # Type Parameters
228/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
229/// 
230/// # Parameters
231/// - `lhs`: Left-hand side bitmask window as `(mask, offset, length)` tuple
232/// - `rhs`: Right-hand side bitmask window as `(mask, offset, length)` tuple
233/// 
234/// # Returns
235/// A new `Bitmask` containing bitwise OR results with proper trailing bit masking.
236/// 
237/// # Usage Example
238/// ```rust,ignore
239/// use simd_kernels::kernels::bitmask::simd::or_masks_simd;
240/// 
241/// // Process 16 lanes simultaneously (1024 bits per instruction)
242/// let result = or_masks_simd::<16>(lhs_window, rhs_window);
243/// ```
244#[inline(always)]
245pub fn or_masks_simd<const LANES: usize>(lhs: BitmaskVT<'_>, rhs: BitmaskVT<'_>) -> Bitmask
246where
247    LaneCount<LANES>: SupportedLaneCount,
248{
249    bitmask_binop_simd::<LANES>(lhs, rhs, LogicalOperator::Or)
250}
251
252/// Performs vectorised bitwise XOR operation between two bitmask windows.
253/// 
254/// High-performance SIMD implementation of logical exclusive-OR using configurable lane counts
255/// for optimal CPU architecture utilisation. Delegates to the core `bitmask_binop_simd`
256/// implementation with the XOR operator.
257/// 
258/// # Type Parameters
259/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
260/// 
261/// # Parameters
262/// - `lhs`: Left-hand side bitmask window as `(mask, offset, length)` tuple
263/// - `rhs`: Right-hand side bitmask window as `(mask, offset, length)` tuple
264/// 
265/// # Returns
266/// A new `Bitmask` containing bitwise XOR results with proper trailing bit masking.
267/// 
268/// # Usage Example
269/// ```rust,ignore
270/// use simd_kernels::kernels::bitmask::simd::xor_masks_simd;
271/// 
272/// // Process 32 lanes simultaneously (2048 bits per instruction)
273/// let result = xor_masks_simd::<32>(lhs_window, rhs_window);
274/// ```
275#[inline(always)]
276pub fn xor_masks_simd<const LANES: usize>(lhs: BitmaskVT<'_>, rhs: BitmaskVT<'_>) -> Bitmask
277where
278    LaneCount<LANES>: SupportedLaneCount,
279{
280    bitmask_binop_simd::<LANES>(lhs, rhs, LogicalOperator::Xor)
281}
282
283/// Performs vectorised bitwise NOT operation on a bitmask window.
284/// 
285/// High-performance SIMD implementation of logical NOT using configurable lane counts for optimal
286/// CPU architecture utilisation. Delegates to the core `bitmask_unop_simd` implementation with
287/// the NOT operator.
288/// 
289/// # Type Parameters
290/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
291/// 
292/// # Parameters
293/// - `src`: Source bitmask window as `(mask, offset, length)` tuple
294/// 
295/// # Returns
296/// A new `Bitmask` containing bitwise NOT results with proper trailing bit masking.
297/// 
298/// # Usage Example
299/// ```rust,ignore
300/// use simd_kernels::kernels::bitmask::simd::not_mask_simd;
301/// 
302/// // Process 8 lanes simultaneously (512 bits per instruction)
303/// let inverted = not_mask_simd::<8>(source_window);
304/// ```
305#[inline(always)]
306pub fn not_mask_simd<const LANES: usize>(src: BitmaskVT<'_>) -> Bitmask
307where
308    LaneCount<LANES>: SupportedLaneCount,
309{
310    bitmask_unop_simd::<LANES>(src, UnaryOperator::Not)
311}
312
313/// Bitwise "in" for boolean bitmasks: each output bit is true if lhs bit is in the set of bits in rhs.
314#[inline]
315pub fn in_mask_simd<const LANES: usize>(lhs: BitmaskVT<'_>, rhs: BitmaskVT<'_>) -> Bitmask
316where
317    LaneCount<LANES>: SupportedLaneCount,
318{
319    let (lhs_mask, lhs_off, len) = lhs;
320    let (rhs_mask, rhs_off, rlen) = rhs;
321    debug_assert_eq!(len, rlen, "in_mask: window length mismatch");
322
323    // Scan rhs to see which values are present (true, false)
324    let mut has_true = false;
325    let mut has_false = false;
326    for i in 0..len {
327        let v = unsafe { rhs_mask.get_unchecked(rhs_off + i) };
328        if v {
329            has_true = true;
330        } else {
331            has_false = true;
332        }
333        if has_true && has_false {
334            break;
335        }
336    }
337
338    match (has_true, has_false) {
339        (true, true) => {
340            // Set contains both: every bit is in the set
341            Bitmask::new_set_all(len, true)
342        }
343        (true, false) => {
344            // Only 'true' in rhs: output bit is set iff lhs bit is true
345            lhs_mask.slice_clone(lhs_off, len)
346        }
347        (false, true) => {
348            // Only 'false' in rhs: output bit is set iff lhs bit is false
349            not_mask_simd::<LANES>((lhs_mask, lhs_off, len))
350        }
351        (false, false) => {
352            // Set is empty: all bits false
353            Bitmask::new_set_all(len, false)
354        }
355    }
356}
357
358/// Performs vectorised bitwise "not in" membership test for boolean bitmasks.
359/// 
360/// Computes the logical complement of the "in" operation where each output bit is true if the
361/// corresponding lhs bit is NOT in the set of bits defined by rhs. This function delegates to
362/// `in_mask_simd` followed by `not_mask_simd` for optimal performance.
363/// 
364/// # Type Parameters
365/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
366/// 
367/// # Parameters
368/// - `lhs`: Left-hand side bitmask window as `(mask, offset, length)` tuple (test values)
369/// - `rhs`: Right-hand side bitmask window as `(mask, offset, length)` tuple (set definition)
370/// 
371/// # Returns
372/// A new `Bitmask` where each bit is true if the corresponding lhs bit is not in the rhs set.
373#[inline]
374pub fn not_in_mask_simd<const LANES: usize>(lhs: BitmaskVT<'_>, rhs: BitmaskVT<'_>) -> Bitmask
375where
376    LaneCount<LANES>: SupportedLaneCount,
377{
378    let mask = in_mask_simd::<LANES>(lhs, rhs);
379    not_mask_simd::<LANES>((&mask, 0, mask.len))
380}
381
382/// Produces a bitmask where each output bit is 1 iff the corresponding bits of `a` and `b` are equal.
383#[inline]
384pub fn eq_mask_simd<const LANES: usize>(a: BitmaskVT<'_>, b: BitmaskVT<'_>) -> Bitmask
385where
386    LaneCount<LANES>: SupportedLaneCount,
387{
388    let (am, ao, len) = a;
389    let (bm, bo, blen) = b;
390    debug_assert_eq!(len, blen, "BitWindow length mismatch in eq_bits_mask");
391    if ao % 64 != 0 || bo % 64 != 0 {
392        panic!(
393            "eq_bits_mask: offsets must be 64-bit aligned (got a: {}, b: {})",
394            ao, bo
395        );
396    }
397    let a_words = ao / 64;
398    let b_words = bo / 64;
399    let n_words = (len + 63) / 64;
400    let mut out = Bitmask::new_set_all(len, false);
401
402    {
403        use core::simd::Simd;
404        let simd_chunks = n_words / LANES;
405        let tail_words = n_words % LANES;
406        for chunk in 0..simd_chunks {
407            let mut arr_a = [0u64; LANES];
408            let mut arr_b = [0u64; LANES];
409            for lane in 0..LANES {
410                arr_a[lane] = unsafe { am.word_unchecked(a_words + chunk * LANES + lane) };
411                arr_b[lane] = unsafe { bm.word_unchecked(b_words + chunk * LANES + lane) };
412            }
413            let sa = Simd::<u64, LANES>::from_array(arr_a);
414            let sb = Simd::<u64, LANES>::from_array(arr_b);
415            let eq = !(sa ^ sb);
416            for lane in 0..LANES {
417                unsafe {
418                    out.set_word_unchecked(chunk * LANES + lane, eq[lane]);
419                }
420            }
421        }
422        let base = simd_chunks * LANES;
423        for k in 0..tail_words {
424            let wa = unsafe { am.word_unchecked(a_words + base + k) };
425            let wb = unsafe { bm.word_unchecked(b_words + base + k) };
426            let eq = !(wa ^ wb);
427            unsafe {
428                out.set_word_unchecked(base + k, eq);
429            }
430        }
431    }
432    #[cfg(not(feature = "simd"))]
433    {
434        for k in 0..n_words {
435            let wa = unsafe { am.word_unchecked(a_words + k) };
436            let wb = unsafe { bm.word_unchecked(b_words + k) };
437            let eq = !(wa ^ wb);
438            unsafe {
439                out.set_word_unchecked(k, eq);
440            }
441        }
442    }
443    out.mask_trailing_bits();
444    out
445}
446
447/// Performs vectorised bitwise inequality comparison between two bitmask windows.
448/// 
449/// Computes the logical complement of equality where each output bit is true if the corresponding
450/// bits from the two input windows are different. This function delegates to `eq_mask_simd`
451/// followed by bitwise NOT for optimal performance.
452/// 
453/// # Type Parameters
454/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
455/// 
456/// # Parameters
457/// - `a`: First bitmask window as `(mask, offset, length)` tuple
458/// - `b`: Second bitmask window as `(mask, offset, length)` tuple
459/// 
460/// # Returns
461/// A new `Bitmask` where each bit is true if the corresponding input bits are different.
462#[inline]
463pub fn ne_mask_simd<const LANES: usize>(a: BitmaskVT<'_>, b: BitmaskVT<'_>) -> Bitmask
464where
465    LaneCount<LANES>: SupportedLaneCount,
466{
467    !eq_mask_simd::<LANES>(a, b)
468}
469
470/// Tests if all corresponding bits between two bitmask windows are different.
471/// 
472/// Performs bulk inequality comparison across entire bitmask windows by computing the logical
473/// complement of `all_eq_mask_simd`. Returns true only if every corresponding bit pair differs
474/// between the two input windows.
475/// 
476/// # Type Parameters
477/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised comparison
478/// 
479/// # Parameters
480/// - `a`: First bitmask window as `(mask, offset, length)` tuple
481/// - `b`: Second bitmask window as `(mask, offset, length)` tuple
482/// 
483/// # Returns
484/// `true` if all corresponding bits are different, `false` if any bits are equal.
485#[inline]
486pub fn all_ne_mask_simd<const LANES: usize>(a: BitmaskVT<'_>, b: BitmaskVT<'_>) -> bool
487where
488    LaneCount<LANES>: SupportedLaneCount,
489{
490    !all_eq_mask_simd::<LANES>(a, b)
491}
492
493/// Vectorised equality test across entire bitmask windows with early termination optimisation.
494/// 
495/// Performs bulk equality comparison between two bitmask windows using SIMD comparison operations.
496/// The implementation processes multiple words simultaneously and uses early termination to avoid
497/// unnecessary work when differences are detected.
498/// 
499/// # Type Parameters
500/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised comparison
501/// 
502/// # Parameters
503/// - `a`: First bitmask window as `(mask, offset, length)` tuple
504/// - `b`: Second bitmask window as `(mask, offset, length)` tuple
505/// 
506/// # Returns
507/// `true` if all corresponding bits are equal (ignoring slack bits), `false` otherwise.
508#[inline]
509pub fn all_eq_mask_simd<const LANES: usize>(a: BitmaskVT<'_>, b: BitmaskVT<'_>) -> bool
510where
511    LaneCount<LANES>: SupportedLaneCount,
512{
513    let (am, ao, len) = a;
514
515    // Mask < 64 bits early exit
516    if len < 64 {
517        for i in 0..len {
518            if a.0.get(a.1 + i) != unsafe { b.0.get_unchecked(b.1 + i) } {
519                return false;
520            }
521        }
522        return true;
523    }
524
525    let (bm, bo, blen) = b;
526    debug_assert_eq!(len, blen, "BitWindow length mismatch in all_eq_mask");
527    if ao % 64 != 0 || bo % 64 != 0 {
528        panic!(
529            "all_eq_mask_simd: offsets must be 64-bit aligned (got a: {}, b: {})",
530            ao, bo
531        );
532    }
533    let a_words = ao / 64;
534    let b_words = bo / 64;
535    let n_words = (len + 63) / 64;
536    let trailing = len & 63;
537
538    use core::simd::Simd;
539    use std::simd::prelude::SimdPartialEq;
540
541    let simd_chunks = n_words / LANES;
542    let tail_words = n_words % LANES;
543
544    for chunk in 0..simd_chunks {
545        let mut arr_a = [0u64; LANES];
546        let mut arr_b = [0u64; LANES];
547        for lane in 0..LANES {
548            arr_a[lane] = unsafe { am.word_unchecked(a_words + chunk * LANES + lane) };
549            arr_b[lane] = unsafe { bm.word_unchecked(b_words + chunk * LANES + lane) };
550        }
551        let sa = Simd::<u64, LANES>::from_array(arr_a);
552        let sb = Simd::<u64, LANES>::from_array(arr_b);
553        let eq_mask = sa.simd_eq(sb);
554        if !eq_mask.all() {
555            return false;
556        }
557    }
558
559    let base = simd_chunks * LANES;
560    for k in 0..tail_words {
561        let idx = base + k;
562        let wa = unsafe { am.word_unchecked(a_words + idx) };
563        let wb = unsafe { bm.word_unchecked(b_words + idx) };
564        // For the last (possibly partial) word, mask slack bits
565        if idx == n_words - 1 && trailing != 0 {
566            let mask = (1u64 << trailing) - 1;
567            if (wa & mask) != (wb & mask) {
568                return false;
569            }
570        } else if wa != wb {
571            return false;
572        }
573    }
574    true
575}
576
577/// Vectorised population count (number of set bits) with SIMD reduction for optimal performance.
578/// 
579/// Computes the total number of set bits in a bitmask window using SIMD population count instructions
580/// followed by horizontal reduction. This implementation provides significant performance improvements
581/// for large bitmasks through parallel processing of multiple words.
582/// 
583/// # Type Parameters
584/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised popcount operations
585/// 
586/// # Parameters
587/// - `m`: Bitmask window as `(mask, offset, length)` tuple
588/// 
589/// # Returns
590/// The total count of set bits in the specified window.
591#[inline]
592pub fn popcount_mask_simd<const LANES: usize>(m: BitmaskVT<'_>) -> usize
593where
594    LaneCount<LANES>: SupportedLaneCount,
595{
596    let (mask, offset, len) = m;
597    let n_words = (len + 63) / 64;
598    let word_start = offset / 64;
599    let mut acc = 0usize;
600
601    {
602        use core::simd::Simd;
603        use std::simd::prelude::SimdUint;
604
605        let simd_chunks = n_words / LANES;
606        let tail_words = n_words % LANES;
607
608        for chunk in 0..simd_chunks {
609            let mut arr = [0u64; LANES];
610            for lane in 0..LANES {
611                arr[lane] = unsafe { mask.word_unchecked(word_start + chunk * LANES + lane) };
612            }
613            let v = Simd::<u64, LANES>::from_array(arr);
614            let counts = v.count_ones();
615            acc += counts.reduce_sum() as usize;
616        }
617
618        // Tail scalar loop for any remaining words
619        let base = simd_chunks * LANES;
620        for k in 0..tail_words {
621            let word = unsafe { mask.word_unchecked(word_start + base + k) };
622            // Mask off slack bits in final word if needed
623            if base + k == n_words - 1 && len % 64 != 0 {
624                let valid = len % 64;
625                let slack_mask = (1u64 << valid) - 1;
626                acc += (word & slack_mask).count_ones() as usize;
627            } else {
628                acc += word.count_ones() as usize;
629            }
630        }
631    }
632    acc
633}
634
635/// Returns true if all bits in the mask are set (1).
636#[inline]
637pub fn all_true_mask_simd<const LANES: usize>(mask: &Bitmask) -> bool
638where
639    LaneCount<LANES>: SupportedLaneCount,
640{
641    if mask.len < 64 {
642        for i in 0..mask.len {
643            if !unsafe { mask.get_unchecked(i) } {
644                return false;
645            }
646        }
647        return true;
648    }
649    let n_bits = mask.len;
650    let n_words = (n_bits + 63) / 64;
651    let words: &[u64] =
652        unsafe { std::slice::from_raw_parts(mask.bits.as_ptr() as *const u64, n_words) };
653
654    let simd_chunks = n_words / LANES;
655
656    let all_ones = Simd::<u64, LANES>::splat(!0u64);
657    for chunk in 0..simd_chunks {
658        let base = chunk * LANES;
659        let arr = Simd::<u64, LANES>::from_slice(&words[base..base + LANES]);
660        if arr != all_ones {
661            return false;
662        }
663    }
664    // Tail often caused by `n % LANES =! 0`; uses scalar fallback
665    let tail_words = n_words % LANES;
666    let base = simd_chunks * LANES;
667    for k in 0..tail_words {
668        if base + k == n_words - 1 && n_bits % 64 != 0 {
669            let valid_bits = n_bits % 64;
670            let slack_mask = (1u64 << valid_bits) - 1;
671            if words[base + k] != slack_mask {
672                return false;
673            }
674        } else {
675            if words[base + k] != !0u64 {
676                return false;
677            }
678        }
679    }
680    true
681}
682
683/// Returns true if all bits in the mask are set to (0).
684pub fn all_false_mask_simd<const LANES: usize>(mask: &Bitmask) -> bool
685where
686    LaneCount<LANES>: SupportedLaneCount,
687{
688    if mask.len < 64 {
689        for i in 0..mask.len {
690            if unsafe { mask.get_unchecked(i) } {
691                return false;
692            }
693        }
694        return true;
695    }
696    let n_bits = mask.len;
697    let n_words = (n_bits + 63) / 64;
698    let words: &[u64] =
699        unsafe { std::slice::from_raw_parts(mask.bits.as_ptr() as *const u64, n_words) };
700
701    let simd_chunks = n_words / LANES;
702    for chunk in 0..simd_chunks {
703        let base = chunk * LANES;
704        let arr = Simd::<u64, LANES>::from_slice(&words[base..base + LANES]);
705        if arr != Simd::<u64, LANES>::splat(0u64) {
706            return false;
707        }
708    }
709    let tail_words = n_words % LANES;
710    let base = simd_chunks * LANES;
711    for k in 0..tail_words {
712        if base + k == n_words - 1 && n_bits % 64 != 0 {
713            let valid_bits = n_bits % 64;
714            let slack_mask = (1u64 << valid_bits) - 1;
715            if words[base + k] & slack_mask != 0 {
716                return false;
717            }
718        } else {
719            if words[base + k] != 0u64 {
720                return false;
721            }
722        }
723    }
724    true
725}
726
727#[cfg(test)]
728mod tests {
729    use minarrow::{Bitmask, BitmaskVT};
730
731    use super::*;
732
733    macro_rules! simd_bitmask_suite {
734        ($mod_name:ident, $lanes:expr) => {
735            mod $mod_name {
736                use super::*;
737                const LANES: usize = $lanes;
738
739                fn bm(bits: &[bool]) -> Bitmask {
740                    let mut m = Bitmask::new_set_all(bits.len(), false);
741                    for (i, b) in bits.iter().enumerate() {
742                        if *b {
743                            m.set(i, true);
744                        }
745                    }
746                    m
747                }
748                fn slice(mask: &Bitmask) -> BitmaskVT<'_> {
749                    (mask, 0, mask.len)
750                }
751
752                #[test]
753                fn test_and_masks_simd() {
754                    let a = bm(&[true, false, true, false, true, true, false, false]);
755                    let b = bm(&[true, true, false, false, true, false, true, false]);
756                    let c = and_masks_simd::<LANES>(slice(&a), slice(&b));
757                    for i in 0..a.len {
758                        assert_eq!(c.get(i), a.get(i) & b.get(i), "bit {i}");
759                    }
760                }
761
762                #[test]
763                fn test_or_masks_simd() {
764                    let a = bm(&[true, false, true, false, true, true, false, false]);
765                    let b = bm(&[true, true, false, false, true, false, true, false]);
766                    let c = or_masks_simd::<LANES>(slice(&a), slice(&b));
767                    for i in 0..a.len {
768                        assert_eq!(c.get(i), a.get(i) | b.get(i), "bit {i}");
769                    }
770                }
771
772                #[test]
773                fn test_xor_masks_simd() {
774                    let a = bm(&[true, false, true, false, true, true, false, false]);
775                    let b = bm(&[true, true, false, false, true, false, true, false]);
776                    let c = xor_masks_simd::<LANES>(slice(&a), slice(&b));
777                    for i in 0..a.len {
778                        assert_eq!(c.get(i), a.get(i) ^ b.get(i), "bit {i}");
779                    }
780                }
781
782                #[test]
783                fn test_not_mask_simd() {
784                    let a = bm(&[true, false, true, false]);
785                    let c = not_mask_simd::<LANES>(slice(&a));
786                    for i in 0..a.len {
787                        assert_eq!(c.get(i), !a.get(i));
788                    }
789                }
790
791                #[test]
792                fn test_in_mask_simd_variants() {
793                    let lhs = bm(&[true, false, true, false]);
794                    // RHS = [true]: only 'true' in rhs
795                    let rhs_true = bm(&[true; 4]);
796                    let out = in_mask_simd::<LANES>(slice(&lhs), slice(&rhs_true));
797                    for i in 0..lhs.len {
798                        assert_eq!(out.get(i), lhs.get(i), "in_mask, only true, bit {i}");
799                    }
800                    // RHS = [false]: only 'false' in rhs
801                    let rhs_false = bm(&[false; 4]);
802                    let out = in_mask_simd::<LANES>(slice(&lhs), slice(&rhs_false));
803                    for i in 0..lhs.len {
804                        assert_eq!(out.get(i), !lhs.get(i), "in_mask, only false, bit {i}");
805                    }
806                    // RHS = [true, false]: both present
807                    let rhs_both = bm(&[true, false, true, false]);
808                    let out = in_mask_simd::<LANES>(slice(&lhs), slice(&rhs_both));
809                    for i in 0..lhs.len {
810                        assert!(out.get(i), "in_mask, both true/false, bit {i}");
811                    }
812                    // RHS empty
813                    let rhs_empty = bm(&[false; 0]);
814                    let out = in_mask_simd::<LANES>((&lhs, 0, 0), (&rhs_empty, 0, 0));
815                    assert_eq!(out.len, 0);
816                }
817
818                #[test]
819                fn test_not_in_mask_simd() {
820                    let lhs = bm(&[true, false, true, false]);
821                    let rhs = bm(&[true, false, true, false]);
822                    let in_mask = in_mask_simd::<LANES>(slice(&lhs), slice(&rhs));
823                    let not_in = not_in_mask_simd::<LANES>(slice(&lhs), slice(&rhs));
824                    for i in 0..lhs.len {
825                        assert_eq!(not_in.get(i), !in_mask.get(i));
826                    }
827                }
828
829                #[test]
830                fn test_eq_mask_simd_and_ne_mask_simd() {
831                    let a = bm(&[true, false, true, false]);
832                    let b = bm(&[true, false, false, true]);
833                    let eq = eq_mask_simd::<LANES>(slice(&a), slice(&b));
834                    let ne = ne_mask_simd::<LANES>(slice(&a), slice(&b));
835                    for i in 0..a.len {
836                        assert_eq!(eq.get(i), a.get(i) == b.get(i), "eq_mask bit {i}");
837                        assert_eq!(ne.get(i), a.get(i) != b.get(i), "ne_mask bit {i}");
838                    }
839                }
840
841                #[test]
842                fn test_all_eq_mask_simd() {
843                    let a = bm(&[true, false, true, false, true, true, false, false]);
844                    let b = bm(&[true, false, true, false, true, true, false, false]);
845                    assert!(all_eq_mask_simd::<LANES>(slice(&a), slice(&b)));
846                    let mut b2 = b.clone();
847                    b2.set(0, false);
848                    assert!(!all_eq_mask_simd::<LANES>(slice(&a), slice(&b2)));
849                }
850
851                #[test]
852                fn test_all_ne_mask_simd() {
853                    let a = bm(&[true, false, true]);
854                    let b = bm(&[false, true, false]);
855                    assert!(all_ne_mask_simd::<LANES>(slice(&a), slice(&b)));
856                    assert!(!all_ne_mask_simd::<LANES>(slice(&a), slice(&a)));
857                }
858
859                #[test]
860                fn test_popcount_mask_simd() {
861                    let a = bm(&[true, false, true, false, true, false, false, true]);
862                    let pop = popcount_mask_simd::<LANES>(slice(&a));
863                    assert_eq!(pop, 4);
864                }
865
866                #[test]
867                fn test_all_true_mask_simd_and_false() {
868                    let all_true = Bitmask::new_set_all(64 * LANES, true);
869                    assert!(all_true_mask_simd::<LANES>(&all_true));
870                    let mut not_true = all_true.clone();
871                    not_true.set(3, false);
872                    assert!(!all_true_mask_simd::<LANES>(&not_true));
873                }
874
875                #[test]
876                fn test_all_false_mask_simd() {
877                    let all_true = Bitmask::new_set_all(64 * LANES, true);
878                    assert!(!all_false_mask_simd::<LANES>(&all_true));
879                    let all_false = Bitmask::new_set_all(64 * LANES, false);
880                    assert!(all_false_mask_simd::<LANES>(&all_false));
881                }
882            }
883        };
884    }
885
    // Splice in the build-script output from OUT_DIR; presumably this defines the
    // `W8`/`W16`/`W32`/`W64` lane-count identifiers used below — TODO confirm
    // against build.rs.
    include!(concat!(env!("OUT_DIR"), "/simd_lanes.rs"));

    // Instantiate the full bitmask test suite once per supported SIMD lane width,
    // each under its own uniquely named module.
    simd_bitmask_suite!(simd_bitmask_w8, W8);
    simd_bitmask_suite!(simd_bitmask_w16, W16);
    simd_bitmask_suite!(simd_bitmask_w32, W32);
    simd_bitmask_suite!(simd_bitmask_w64, W64);
892}