// simd_kernels/kernels/bitmask/simd.rs
1// Copyright Peter Bower 2025. All Rights Reserved.
2// Licensed under Mozilla Public License (MPL) 2.0.
3
4//! # **Bitmask SIMD Kernels** - *Vectorised High-Performance Bitmask Operations*
5//!
6//! SIMD-accelerated implementations of bitmask operations using portable vectorisation with `std::simd`.
7//! These kernels provide optimal performance for large bitmask operations through
8//! SIMD-parallel processing of multiple 64-bit words simultaneously.
9//!
10//! ## Overview
11//!
12//! This module contains vectorised implementations of all bitmask operations.
//! It uses configurable SIMD lane counts to adapt to different CPU architectures whilst maintaining code portability.
14//!
15//! We do not check for SIMD alignment here because it is guaranteed by the `Bitmask` as it is backed by *Minarrow*'s `Vec64`.
16//!
17//! ## Architecture Principles
18//!
19//! - **Portable SIMD**: Uses `std::simd` for cross-platform vectorisation without target-specific code
20//! - **Configurable lanes**: Lane counts determined at build time for optimal performance per architecture
21//! - **Hybrid processing**: SIMD inner loops with scalar tail handling for non-aligned lengths
22//! - **Low-cost abstraction**: `Bitmask` is a light-weight structure over a `Vec64`. See Minarrow for details and benchmarks
23//! demonstrating very low abstraction cost.
24//!
25//!
26//! ### **Memory Access Patterns**
27//! - Vectorised loads process multiple words per memory operation
28//! - Sequential access patterns optimise cache utilisation
29//! - Aligned access where possible for maximum performance
30//! - Streaming patterns for large bitmask operations
31//!
32//! ## Specialised Algorithms
33//!
34//! ### **Population Count (Popcount)**
35//! Uses SIMD reduction for optimal performance:
36//! ```rust,ignore
37//! let counts = simd_vector.count_ones();
38//! total += counts.reduce_sum() as usize;
39//! ```
40//!
41//! ### **Equality Testing**
42//! Leverages SIMD comparison operations:
43//! ```rust,ignore
44//! let eq_mask = vector_a.simd_eq(vector_b);
45//! if !eq_mask.all() { return false; }
46//! ```
47
48use core::simd::{LaneCount, Simd, SupportedLaneCount};
49
50use minarrow::{Bitmask, BitmaskVT};
51
52use crate::kernels::bitmask::{
53 bitmask_window_bytes, bitmask_window_bytes_mut, clear_trailing_bits, mask_bits_as_words,
54 mask_bits_as_words_mut,
55};
56use crate::operators::{LogicalOperator, UnaryOperator};
57
// ---- Primitive bit ops ----
59
60/// Performs vectorised bitwise binary operations (AND/OR/XOR) with configurable lane counts.
61///
62/// Core SIMD implementation for logical operations between bitmask windows. Processes data using
63/// vectorised instructions with automatic scalar tail handling for optimal performance across
64/// different data sizes and architectures.
65///
66/// # Type Parameters
67/// - `LANES`: Number of u64 lanes to process simultaneously (typically 8, 16, 32, or 64)
68///
69/// # Parameters
70/// - `lhs`: Left-hand side bitmask window as `(mask, offset, length)` tuple
71/// - `rhs`: Right-hand side bitmask window as `(mask, offset, length)` tuple
72/// - `op`: Logical operation to perform (AND, OR, XOR)
73///
74/// # Returns
75/// A new `Bitmask` containing the vectorised operation results with proper trailing bit handling.
76///
77/// # Performance Characteristics
78/// - Vectorised inner loop processes `LANES` words per iteration
79/// - Scalar tail handling ensures correctness for non-aligned lengths
80/// - Memory access patterns optimised for cache efficiency
81/// - Lane count scaling provides architecture-specific optimisation
82#[inline(always)]
83pub fn bitmask_binop_simd<const LANES: usize>(
84 lhs: BitmaskVT<'_>,
85 rhs: BitmaskVT<'_>,
86 op: LogicalOperator,
87) -> Bitmask
88where
89 LaneCount<LANES>: SupportedLaneCount,
90{
91 let (lhs_mask, lhs_off, len) = lhs;
92 let (rhs_mask, rhs_off, _) = rhs;
93 let mut out = Bitmask::new_set_all(len, false);
94 let nw = (len + 63) / 64;
95 unsafe {
96 let lp = mask_bits_as_words(bitmask_window_bytes(lhs_mask, lhs_off, len));
97 let rp = mask_bits_as_words(bitmask_window_bytes(rhs_mask, rhs_off, len));
98 let dp = mask_bits_as_words_mut(bitmask_window_bytes_mut(&mut out, 0, len));
99 let mut i = 0;
100 while i + LANES <= nw {
101 let a = Simd::<u64, LANES>::from_slice(std::slice::from_raw_parts(lp.add(i), LANES));
102 let b = Simd::<u64, LANES>::from_slice(std::slice::from_raw_parts(rp.add(i), LANES));
103 let r = match op {
104 LogicalOperator::And => a & b,
105 LogicalOperator::Or => a | b,
106 LogicalOperator::Xor => a ^ b,
107 };
108 std::ptr::copy_nonoverlapping(r.as_array().as_ptr(), dp.add(i), LANES);
109 i += LANES;
110 }
111 // Tail often caused by `n % LANES != 0`; uses scalar fallback.
112 for k in i..nw {
113 let a = *lp.add(k);
114 let b = *rp.add(k);
115 *dp.add(k) = match op {
116 LogicalOperator::And => a & b,
117 LogicalOperator::Or => a | b,
118 LogicalOperator::Xor => a ^ b,
119 };
120 }
121 }
122 out.len = len;
123 clear_trailing_bits(&mut out);
124 out
125}
126
/// Performs vectorised bitwise unary operations (NOT) with configurable lane counts.
///
/// Core SIMD implementation for unary logical operations on bitmask windows. Processes data using
/// vectorised instructions with automatic scalar tail handling for optimal performance across
/// different data sizes and CPU architectures.
///
/// # Type Parameters
/// - `LANES`: Number of u64 lanes to process simultaneously (typically 8, 16, 32, or 64)
///
/// # Parameters
/// - `src`: Source bitmask window as `(mask, offset, length)` tuple
/// - `op`: Unary operation to perform (currently only NOT supported)
///
/// # Returns
/// A new `Bitmask` containing the vectorised operation results with proper trailing bit handling.
///
/// # Panics
/// Reaches `unreachable!()` if `op` is any variant other than `UnaryOperator::Not`.
///
/// # Implementation Details
/// - Vectorised inner loop processes `LANES` words per iteration using SIMD NOT operations
/// - Scalar tail handling ensures correctness for non-aligned lengths
/// - Memory access patterns optimised for cache efficiency and sequential processing
/// - Lane count scaling provides architecture-specific optimisation for different CPU capabilities
///
/// # Performance Characteristics
/// - Memory bandwidth: Vectorised loads/stores improve memory subsystem utilisation
/// - Instruction throughput: Reduced total instruction count for large operations
/// - Pipeline efficiency: Better utilisation of modern CPU execution units
/// - Cache locality: Sequential access patterns optimise cache utilisation
#[inline(always)]
pub fn bitmask_unop_simd<const LANES: usize>(src: BitmaskVT<'_>, op: UnaryOperator) -> Bitmask
where
    LaneCount<LANES>: SupportedLaneCount,
{
    let (mask, offset, len) = src;
    let mut out = Bitmask::new_set_all(len, false);
    // Number of 64-bit words covering `len` bits (ceiling division).
    let nw = (len + 63) / 64;
    // SAFETY: the source window and the freshly-allocated output both expose at
    // least `nw` readable/writable u64 words — `Bitmask` is backed by Vec64,
    // whose storage is word-aligned and sized for `len` bits (see module docs)
    // — so every pointer offset below stays in bounds.
    unsafe {
        let sp = mask_bits_as_words(bitmask_window_bytes(mask, offset, len));
        let dp = mask_bits_as_words_mut(bitmask_window_bytes_mut(&mut out, 0, len));
        let mut i = 0;
        // Vectorised loop: complement LANES words per iteration.
        while i + LANES <= nw {
            let a = Simd::<u64, LANES>::from_slice(std::slice::from_raw_parts(sp.add(i), LANES));
            let r = match op {
                UnaryOperator::Not => !a,
                _ => unreachable!(),
            };
            std::ptr::copy_nonoverlapping(r.as_array().as_ptr(), dp.add(i), LANES);
            i += LANES;
        }
        // Tail often caused by `n % LANES != 0`; uses scalar fallback.
        for k in i..nw {
            let a = *sp.add(k);
            *dp.add(k) = match op {
                UnaryOperator::Not => !a,
                _ => unreachable!(),
            };
        }
    }
    out.len = len;
    // Zero slack bits beyond `len` — NOT would otherwise set them all.
    clear_trailing_bits(&mut out);
    out
}
188
189// ---- Entry points ----
190/// Performs vectorised bitwise AND operation between two bitmask windows.
191///
192/// High-performance SIMD implementation of logical AND using configurable lane counts for optimal
193/// CPU architecture utilisation. Delegates to the core `bitmask_binop_simd` implementation with
194/// the AND operator.
195///
196/// # Type Parameters
197/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
198///
199/// # Parameters
200/// - `lhs`: Left-hand side bitmask window as `(mask, offset, length)` tuple
201/// - `rhs`: Right-hand side bitmask window as `(mask, offset, length)` tuple
202///
203/// # Returns
204/// A new `Bitmask` containing bitwise AND results with proper trailing bit masking.
205///
206/// # Usage Example
207/// ```rust,ignore
208/// use simd_kernels::kernels::bitmask::simd::and_masks_simd;
209///
210/// // Process 8 lanes simultaneously (512 bits per instruction)
211/// let result = and_masks_simd::<8>(lhs_window, rhs_window);
212/// ```
213#[inline(always)]
214pub fn and_masks_simd<const LANES: usize>(lhs: BitmaskVT<'_>, rhs: BitmaskVT<'_>) -> Bitmask
215where
216 LaneCount<LANES>: SupportedLaneCount,
217{
218 bitmask_binop_simd::<LANES>(lhs, rhs, LogicalOperator::And)
219}
220
221/// Performs vectorised bitwise OR operation between two bitmask windows.
222///
223/// High-performance SIMD implementation of logical OR using configurable lane counts for optimal
224/// CPU architecture utilisation. Delegates to the core `bitmask_binop_simd` implementation with
225/// the OR operator.
226///
227/// # Type Parameters
228/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
229///
230/// # Parameters
231/// - `lhs`: Left-hand side bitmask window as `(mask, offset, length)` tuple
232/// - `rhs`: Right-hand side bitmask window as `(mask, offset, length)` tuple
233///
234/// # Returns
235/// A new `Bitmask` containing bitwise OR results with proper trailing bit masking.
236///
237/// # Usage Example
238/// ```rust,ignore
239/// use simd_kernels::kernels::bitmask::simd::or_masks_simd;
240///
241/// // Process 16 lanes simultaneously (1024 bits per instruction)
242/// let result = or_masks_simd::<16>(lhs_window, rhs_window);
243/// ```
244#[inline(always)]
245pub fn or_masks_simd<const LANES: usize>(lhs: BitmaskVT<'_>, rhs: BitmaskVT<'_>) -> Bitmask
246where
247 LaneCount<LANES>: SupportedLaneCount,
248{
249 bitmask_binop_simd::<LANES>(lhs, rhs, LogicalOperator::Or)
250}
251
252/// Performs vectorised bitwise XOR operation between two bitmask windows.
253///
254/// High-performance SIMD implementation of logical exclusive-OR using configurable lane counts
255/// for optimal CPU architecture utilisation. Delegates to the core `bitmask_binop_simd`
256/// implementation with the XOR operator.
257///
258/// # Type Parameters
259/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
260///
261/// # Parameters
262/// - `lhs`: Left-hand side bitmask window as `(mask, offset, length)` tuple
263/// - `rhs`: Right-hand side bitmask window as `(mask, offset, length)` tuple
264///
265/// # Returns
266/// A new `Bitmask` containing bitwise XOR results with proper trailing bit masking.
267///
268/// # Usage Example
269/// ```rust,ignore
270/// use simd_kernels::kernels::bitmask::simd::xor_masks_simd;
271///
272/// // Process 32 lanes simultaneously (2048 bits per instruction)
273/// let result = xor_masks_simd::<32>(lhs_window, rhs_window);
274/// ```
275#[inline(always)]
276pub fn xor_masks_simd<const LANES: usize>(lhs: BitmaskVT<'_>, rhs: BitmaskVT<'_>) -> Bitmask
277where
278 LaneCount<LANES>: SupportedLaneCount,
279{
280 bitmask_binop_simd::<LANES>(lhs, rhs, LogicalOperator::Xor)
281}
282
283/// Performs vectorised bitwise NOT operation on a bitmask window.
284///
285/// High-performance SIMD implementation of logical NOT using configurable lane counts for optimal
286/// CPU architecture utilisation. Delegates to the core `bitmask_unop_simd` implementation with
287/// the NOT operator.
288///
289/// # Type Parameters
290/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
291///
292/// # Parameters
293/// - `src`: Source bitmask window as `(mask, offset, length)` tuple
294///
295/// # Returns
296/// A new `Bitmask` containing bitwise NOT results with proper trailing bit masking.
297///
298/// # Usage Example
299/// ```rust,ignore
300/// use simd_kernels::kernels::bitmask::simd::not_mask_simd;
301///
302/// // Process 8 lanes simultaneously (512 bits per instruction)
303/// let inverted = not_mask_simd::<8>(source_window);
304/// ```
305#[inline(always)]
306pub fn not_mask_simd<const LANES: usize>(src: BitmaskVT<'_>) -> Bitmask
307where
308 LaneCount<LANES>: SupportedLaneCount,
309{
310 bitmask_unop_simd::<LANES>(src, UnaryOperator::Not)
311}
312
313/// Bitwise "in" for boolean bitmasks: each output bit is true if lhs bit is in the set of bits in rhs.
314#[inline]
315pub fn in_mask_simd<const LANES: usize>(lhs: BitmaskVT<'_>, rhs: BitmaskVT<'_>) -> Bitmask
316where
317 LaneCount<LANES>: SupportedLaneCount,
318{
319 let (lhs_mask, lhs_off, len) = lhs;
320 let (rhs_mask, rhs_off, rlen) = rhs;
321 debug_assert_eq!(len, rlen, "in_mask: window length mismatch");
322
323 // Scan rhs to see which values are present (true, false)
324 let mut has_true = false;
325 let mut has_false = false;
326 for i in 0..len {
327 let v = unsafe { rhs_mask.get_unchecked(rhs_off + i) };
328 if v {
329 has_true = true;
330 } else {
331 has_false = true;
332 }
333 if has_true && has_false {
334 break;
335 }
336 }
337
338 match (has_true, has_false) {
339 (true, true) => {
340 // Set contains both: every bit is in the set
341 Bitmask::new_set_all(len, true)
342 }
343 (true, false) => {
344 // Only 'true' in rhs: output bit is set iff lhs bit is true
345 lhs_mask.slice_clone(lhs_off, len)
346 }
347 (false, true) => {
348 // Only 'false' in rhs: output bit is set iff lhs bit is false
349 not_mask_simd::<LANES>((lhs_mask, lhs_off, len))
350 }
351 (false, false) => {
352 // Set is empty: all bits false
353 Bitmask::new_set_all(len, false)
354 }
355 }
356}
357
358/// Performs vectorised bitwise "not in" membership test for boolean bitmasks.
359///
360/// Computes the logical complement of the "in" operation where each output bit is true if the
361/// corresponding lhs bit is NOT in the set of bits defined by rhs. This function delegates to
362/// `in_mask_simd` followed by `not_mask_simd` for optimal performance.
363///
364/// # Type Parameters
365/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
366///
367/// # Parameters
368/// - `lhs`: Left-hand side bitmask window as `(mask, offset, length)` tuple (test values)
369/// - `rhs`: Right-hand side bitmask window as `(mask, offset, length)` tuple (set definition)
370///
371/// # Returns
372/// A new `Bitmask` where each bit is true if the corresponding lhs bit is not in the rhs set.
373#[inline]
374pub fn not_in_mask_simd<const LANES: usize>(lhs: BitmaskVT<'_>, rhs: BitmaskVT<'_>) -> Bitmask
375where
376 LaneCount<LANES>: SupportedLaneCount,
377{
378 let mask = in_mask_simd::<LANES>(lhs, rhs);
379 not_mask_simd::<LANES>((&mask, 0, mask.len))
380}
381
382/// Produces a bitmask where each output bit is 1 iff the corresponding bits of `a` and `b` are equal.
383#[inline]
384pub fn eq_mask_simd<const LANES: usize>(a: BitmaskVT<'_>, b: BitmaskVT<'_>) -> Bitmask
385where
386 LaneCount<LANES>: SupportedLaneCount,
387{
388 let (am, ao, len) = a;
389 let (bm, bo, blen) = b;
390 debug_assert_eq!(len, blen, "BitWindow length mismatch in eq_bits_mask");
391 if ao % 64 != 0 || bo % 64 != 0 {
392 panic!(
393 "eq_bits_mask: offsets must be 64-bit aligned (got a: {}, b: {})",
394 ao, bo
395 );
396 }
397 let a_words = ao / 64;
398 let b_words = bo / 64;
399 let n_words = (len + 63) / 64;
400 let mut out = Bitmask::new_set_all(len, false);
401
402 {
403 use core::simd::Simd;
404 let simd_chunks = n_words / LANES;
405 let tail_words = n_words % LANES;
406 for chunk in 0..simd_chunks {
407 let mut arr_a = [0u64; LANES];
408 let mut arr_b = [0u64; LANES];
409 for lane in 0..LANES {
410 arr_a[lane] = unsafe { am.word_unchecked(a_words + chunk * LANES + lane) };
411 arr_b[lane] = unsafe { bm.word_unchecked(b_words + chunk * LANES + lane) };
412 }
413 let sa = Simd::<u64, LANES>::from_array(arr_a);
414 let sb = Simd::<u64, LANES>::from_array(arr_b);
415 let eq = !(sa ^ sb);
416 for lane in 0..LANES {
417 unsafe {
418 out.set_word_unchecked(chunk * LANES + lane, eq[lane]);
419 }
420 }
421 }
422 let base = simd_chunks * LANES;
423 for k in 0..tail_words {
424 let wa = unsafe { am.word_unchecked(a_words + base + k) };
425 let wb = unsafe { bm.word_unchecked(b_words + base + k) };
426 let eq = !(wa ^ wb);
427 unsafe {
428 out.set_word_unchecked(base + k, eq);
429 }
430 }
431 }
432 #[cfg(not(feature = "simd"))]
433 {
434 for k in 0..n_words {
435 let wa = unsafe { am.word_unchecked(a_words + k) };
436 let wb = unsafe { bm.word_unchecked(b_words + k) };
437 let eq = !(wa ^ wb);
438 unsafe {
439 out.set_word_unchecked(k, eq);
440 }
441 }
442 }
443 out.mask_trailing_bits();
444 out
445}
446
447/// Performs vectorised bitwise inequality comparison between two bitmask windows.
448///
449/// Computes the logical complement of equality where each output bit is true if the corresponding
450/// bits from the two input windows are different. This function delegates to `eq_mask_simd`
451/// followed by bitwise NOT for optimal performance.
452///
453/// # Type Parameters
454/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised operations
455///
456/// # Parameters
457/// - `a`: First bitmask window as `(mask, offset, length)` tuple
458/// - `b`: Second bitmask window as `(mask, offset, length)` tuple
459///
460/// # Returns
461/// A new `Bitmask` where each bit is true if the corresponding input bits are different.
462#[inline]
463pub fn ne_mask_simd<const LANES: usize>(a: BitmaskVT<'_>, b: BitmaskVT<'_>) -> Bitmask
464where
465 LaneCount<LANES>: SupportedLaneCount,
466{
467 !eq_mask_simd::<LANES>(a, b)
468}
469
/// Tests whether two bitmask windows are *not* entirely equal.
///
/// Implemented as the negation of [`all_eq_mask_simd`], so it returns `true` when at
/// least one corresponding bit pair differs between the two windows, and `false` only
/// when the windows are bit-for-bit equal.
///
/// NOTE(review): the name reads as "every bit differs", but the implementation gives
/// "not all equal" semantics (the existing tests pass under either reading) — confirm
/// which contract callers expect before renaming or changing behaviour.
///
/// # Type Parameters
/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised comparison
///
/// # Parameters
/// - `a`: First bitmask window as `(mask, offset, length)` tuple
/// - `b`: Second bitmask window as `(mask, offset, length)` tuple
///
/// # Returns
/// `true` if any corresponding bit pair differs, `false` if all bits are equal.
#[inline]
pub fn all_ne_mask_simd<const LANES: usize>(a: BitmaskVT<'_>, b: BitmaskVT<'_>) -> bool
where
    LaneCount<LANES>: SupportedLaneCount,
{
    !all_eq_mask_simd::<LANES>(a, b)
}
492
493/// Vectorised equality test across entire bitmask windows with early termination optimisation.
494///
495/// Performs bulk equality comparison between two bitmask windows using SIMD comparison operations.
496/// The implementation processes multiple words simultaneously and uses early termination to avoid
497/// unnecessary work when differences are detected.
498///
499/// # Type Parameters
500/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised comparison
501///
502/// # Parameters
503/// - `a`: First bitmask window as `(mask, offset, length)` tuple
504/// - `b`: Second bitmask window as `(mask, offset, length)` tuple
505///
506/// # Returns
507/// `true` if all corresponding bits are equal (ignoring slack bits), `false` otherwise.
508#[inline]
509pub fn all_eq_mask_simd<const LANES: usize>(a: BitmaskVT<'_>, b: BitmaskVT<'_>) -> bool
510where
511 LaneCount<LANES>: SupportedLaneCount,
512{
513 let (am, ao, len) = a;
514
515 // Mask < 64 bits early exit
516 if len < 64 {
517 for i in 0..len {
518 if a.0.get(a.1 + i) != unsafe { b.0.get_unchecked(b.1 + i) } {
519 return false;
520 }
521 }
522 return true;
523 }
524
525 let (bm, bo, blen) = b;
526 debug_assert_eq!(len, blen, "BitWindow length mismatch in all_eq_mask");
527 if ao % 64 != 0 || bo % 64 != 0 {
528 panic!(
529 "all_eq_mask_simd: offsets must be 64-bit aligned (got a: {}, b: {})",
530 ao, bo
531 );
532 }
533 let a_words = ao / 64;
534 let b_words = bo / 64;
535 let n_words = (len + 63) / 64;
536 let trailing = len & 63;
537
538 use core::simd::Simd;
539 use std::simd::prelude::SimdPartialEq;
540
541 let simd_chunks = n_words / LANES;
542 let tail_words = n_words % LANES;
543
544 for chunk in 0..simd_chunks {
545 let mut arr_a = [0u64; LANES];
546 let mut arr_b = [0u64; LANES];
547 for lane in 0..LANES {
548 arr_a[lane] = unsafe { am.word_unchecked(a_words + chunk * LANES + lane) };
549 arr_b[lane] = unsafe { bm.word_unchecked(b_words + chunk * LANES + lane) };
550 }
551 let sa = Simd::<u64, LANES>::from_array(arr_a);
552 let sb = Simd::<u64, LANES>::from_array(arr_b);
553 let eq_mask = sa.simd_eq(sb);
554 if !eq_mask.all() {
555 return false;
556 }
557 }
558
559 let base = simd_chunks * LANES;
560 for k in 0..tail_words {
561 let idx = base + k;
562 let wa = unsafe { am.word_unchecked(a_words + idx) };
563 let wb = unsafe { bm.word_unchecked(b_words + idx) };
564 // For the last (possibly partial) word, mask slack bits
565 if idx == n_words - 1 && trailing != 0 {
566 let mask = (1u64 << trailing) - 1;
567 if (wa & mask) != (wb & mask) {
568 return false;
569 }
570 } else if wa != wb {
571 return false;
572 }
573 }
574 true
575}
576
577/// Vectorised population count (number of set bits) with SIMD reduction for optimal performance.
578///
579/// Computes the total number of set bits in a bitmask window using SIMD population count instructions
580/// followed by horizontal reduction. This implementation provides significant performance improvements
581/// for large bitmasks through parallel processing of multiple words.
582///
583/// # Type Parameters
584/// - `LANES`: Number of u64 lanes to process simultaneously for vectorised popcount operations
585///
586/// # Parameters
587/// - `m`: Bitmask window as `(mask, offset, length)` tuple
588///
589/// # Returns
590/// The total count of set bits in the specified window.
591#[inline]
592pub fn popcount_mask_simd<const LANES: usize>(m: BitmaskVT<'_>) -> usize
593where
594 LaneCount<LANES>: SupportedLaneCount,
595{
596 let (mask, offset, len) = m;
597 let n_words = (len + 63) / 64;
598 let word_start = offset / 64;
599 let mut acc = 0usize;
600
601 {
602 use core::simd::Simd;
603 use std::simd::prelude::SimdUint;
604
605 let simd_chunks = n_words / LANES;
606 let tail_words = n_words % LANES;
607
608 for chunk in 0..simd_chunks {
609 let mut arr = [0u64; LANES];
610 for lane in 0..LANES {
611 arr[lane] = unsafe { mask.word_unchecked(word_start + chunk * LANES + lane) };
612 }
613 let v = Simd::<u64, LANES>::from_array(arr);
614 let counts = v.count_ones();
615 acc += counts.reduce_sum() as usize;
616 }
617
618 // Tail scalar loop for any remaining words
619 let base = simd_chunks * LANES;
620 for k in 0..tail_words {
621 let word = unsafe { mask.word_unchecked(word_start + base + k) };
622 // Mask off slack bits in final word if needed
623 if base + k == n_words - 1 && len % 64 != 0 {
624 let valid = len % 64;
625 let slack_mask = (1u64 << valid) - 1;
626 acc += (word & slack_mask).count_ones() as usize;
627 } else {
628 acc += word.count_ones() as usize;
629 }
630 }
631 }
632 acc
633}
634
635/// Returns true if all bits in the mask are set (1).
636#[inline]
637pub fn all_true_mask_simd<const LANES: usize>(mask: &Bitmask) -> bool
638where
639 LaneCount<LANES>: SupportedLaneCount,
640{
641 if mask.len < 64 {
642 for i in 0..mask.len {
643 if !unsafe { mask.get_unchecked(i) } {
644 return false;
645 }
646 }
647 return true;
648 }
649 let n_bits = mask.len;
650 let n_words = (n_bits + 63) / 64;
651 let words: &[u64] =
652 unsafe { std::slice::from_raw_parts(mask.bits.as_ptr() as *const u64, n_words) };
653
654 let simd_chunks = n_words / LANES;
655
656 let all_ones = Simd::<u64, LANES>::splat(!0u64);
657 for chunk in 0..simd_chunks {
658 let base = chunk * LANES;
659 let arr = Simd::<u64, LANES>::from_slice(&words[base..base + LANES]);
660 if arr != all_ones {
661 return false;
662 }
663 }
664 // Tail often caused by `n % LANES =! 0`; uses scalar fallback
665 let tail_words = n_words % LANES;
666 let base = simd_chunks * LANES;
667 for k in 0..tail_words {
668 if base + k == n_words - 1 && n_bits % 64 != 0 {
669 let valid_bits = n_bits % 64;
670 let slack_mask = (1u64 << valid_bits) - 1;
671 if words[base + k] != slack_mask {
672 return false;
673 }
674 } else {
675 if words[base + k] != !0u64 {
676 return false;
677 }
678 }
679 }
680 true
681}
682
683/// Returns true if all bits in the mask are set to (0).
684pub fn all_false_mask_simd<const LANES: usize>(mask: &Bitmask) -> bool
685where
686 LaneCount<LANES>: SupportedLaneCount,
687{
688 if mask.len < 64 {
689 for i in 0..mask.len {
690 if unsafe { mask.get_unchecked(i) } {
691 return false;
692 }
693 }
694 return true;
695 }
696 let n_bits = mask.len;
697 let n_words = (n_bits + 63) / 64;
698 let words: &[u64] =
699 unsafe { std::slice::from_raw_parts(mask.bits.as_ptr() as *const u64, n_words) };
700
701 let simd_chunks = n_words / LANES;
702 for chunk in 0..simd_chunks {
703 let base = chunk * LANES;
704 let arr = Simd::<u64, LANES>::from_slice(&words[base..base + LANES]);
705 if arr != Simd::<u64, LANES>::splat(0u64) {
706 return false;
707 }
708 }
709 let tail_words = n_words % LANES;
710 let base = simd_chunks * LANES;
711 for k in 0..tail_words {
712 if base + k == n_words - 1 && n_bits % 64 != 0 {
713 let valid_bits = n_bits % 64;
714 let slack_mask = (1u64 << valid_bits) - 1;
715 if words[base + k] & slack_mask != 0 {
716 return false;
717 }
718 } else {
719 if words[base + k] != 0u64 {
720 return false;
721 }
722 }
723 }
724 true
725}
726
727#[cfg(test)]
728mod tests {
729 use minarrow::{Bitmask, BitmaskVT};
730
731 use super::*;
732
733 macro_rules! simd_bitmask_suite {
734 ($mod_name:ident, $lanes:expr) => {
735 mod $mod_name {
736 use super::*;
737 const LANES: usize = $lanes;
738
739 fn bm(bits: &[bool]) -> Bitmask {
740 let mut m = Bitmask::new_set_all(bits.len(), false);
741 for (i, b) in bits.iter().enumerate() {
742 if *b {
743 m.set(i, true);
744 }
745 }
746 m
747 }
748 fn slice(mask: &Bitmask) -> BitmaskVT<'_> {
749 (mask, 0, mask.len)
750 }
751
752 #[test]
753 fn test_and_masks_simd() {
754 let a = bm(&[true, false, true, false, true, true, false, false]);
755 let b = bm(&[true, true, false, false, true, false, true, false]);
756 let c = and_masks_simd::<LANES>(slice(&a), slice(&b));
757 for i in 0..a.len {
758 assert_eq!(c.get(i), a.get(i) & b.get(i), "bit {i}");
759 }
760 }
761
762 #[test]
763 fn test_or_masks_simd() {
764 let a = bm(&[true, false, true, false, true, true, false, false]);
765 let b = bm(&[true, true, false, false, true, false, true, false]);
766 let c = or_masks_simd::<LANES>(slice(&a), slice(&b));
767 for i in 0..a.len {
768 assert_eq!(c.get(i), a.get(i) | b.get(i), "bit {i}");
769 }
770 }
771
772 #[test]
773 fn test_xor_masks_simd() {
774 let a = bm(&[true, false, true, false, true, true, false, false]);
775 let b = bm(&[true, true, false, false, true, false, true, false]);
776 let c = xor_masks_simd::<LANES>(slice(&a), slice(&b));
777 for i in 0..a.len {
778 assert_eq!(c.get(i), a.get(i) ^ b.get(i), "bit {i}");
779 }
780 }
781
782 #[test]
783 fn test_not_mask_simd() {
784 let a = bm(&[true, false, true, false]);
785 let c = not_mask_simd::<LANES>(slice(&a));
786 for i in 0..a.len {
787 assert_eq!(c.get(i), !a.get(i));
788 }
789 }
790
791 #[test]
792 fn test_in_mask_simd_variants() {
793 let lhs = bm(&[true, false, true, false]);
794 // RHS = [true]: only 'true' in rhs
795 let rhs_true = bm(&[true; 4]);
796 let out = in_mask_simd::<LANES>(slice(&lhs), slice(&rhs_true));
797 for i in 0..lhs.len {
798 assert_eq!(out.get(i), lhs.get(i), "in_mask, only true, bit {i}");
799 }
800 // RHS = [false]: only 'false' in rhs
801 let rhs_false = bm(&[false; 4]);
802 let out = in_mask_simd::<LANES>(slice(&lhs), slice(&rhs_false));
803 for i in 0..lhs.len {
804 assert_eq!(out.get(i), !lhs.get(i), "in_mask, only false, bit {i}");
805 }
806 // RHS = [true, false]: both present
807 let rhs_both = bm(&[true, false, true, false]);
808 let out = in_mask_simd::<LANES>(slice(&lhs), slice(&rhs_both));
809 for i in 0..lhs.len {
810 assert!(out.get(i), "in_mask, both true/false, bit {i}");
811 }
812 // RHS empty
813 let rhs_empty = bm(&[false; 0]);
814 let out = in_mask_simd::<LANES>((&lhs, 0, 0), (&rhs_empty, 0, 0));
815 assert_eq!(out.len, 0);
816 }
817
818 #[test]
819 fn test_not_in_mask_simd() {
820 let lhs = bm(&[true, false, true, false]);
821 let rhs = bm(&[true, false, true, false]);
822 let in_mask = in_mask_simd::<LANES>(slice(&lhs), slice(&rhs));
823 let not_in = not_in_mask_simd::<LANES>(slice(&lhs), slice(&rhs));
824 for i in 0..lhs.len {
825 assert_eq!(not_in.get(i), !in_mask.get(i));
826 }
827 }
828
829 #[test]
830 fn test_eq_mask_simd_and_ne_mask_simd() {
831 let a = bm(&[true, false, true, false]);
832 let b = bm(&[true, false, false, true]);
833 let eq = eq_mask_simd::<LANES>(slice(&a), slice(&b));
834 let ne = ne_mask_simd::<LANES>(slice(&a), slice(&b));
835 for i in 0..a.len {
836 assert_eq!(eq.get(i), a.get(i) == b.get(i), "eq_mask bit {i}");
837 assert_eq!(ne.get(i), a.get(i) != b.get(i), "ne_mask bit {i}");
838 }
839 }
840
841 #[test]
842 fn test_all_eq_mask_simd() {
843 let a = bm(&[true, false, true, false, true, true, false, false]);
844 let b = bm(&[true, false, true, false, true, true, false, false]);
845 assert!(all_eq_mask_simd::<LANES>(slice(&a), slice(&b)));
846 let mut b2 = b.clone();
847 b2.set(0, false);
848 assert!(!all_eq_mask_simd::<LANES>(slice(&a), slice(&b2)));
849 }
850
851 #[test]
852 fn test_all_ne_mask_simd() {
853 let a = bm(&[true, false, true]);
854 let b = bm(&[false, true, false]);
855 assert!(all_ne_mask_simd::<LANES>(slice(&a), slice(&b)));
856 assert!(!all_ne_mask_simd::<LANES>(slice(&a), slice(&a)));
857 }
858
859 #[test]
860 fn test_popcount_mask_simd() {
861 let a = bm(&[true, false, true, false, true, false, false, true]);
862 let pop = popcount_mask_simd::<LANES>(slice(&a));
863 assert_eq!(pop, 4);
864 }
865
866 #[test]
867 fn test_all_true_mask_simd_and_false() {
868 let all_true = Bitmask::new_set_all(64 * LANES, true);
869 assert!(all_true_mask_simd::<LANES>(&all_true));
870 let mut not_true = all_true.clone();
871 not_true.set(3, false);
872 assert!(!all_true_mask_simd::<LANES>(¬_true));
873 }
874
875 #[test]
876 fn test_all_false_mask_simd() {
877 let all_true = Bitmask::new_set_all(64 * LANES, true);
878 assert!(!all_false_mask_simd::<LANES>(&all_true));
879 let all_false = Bitmask::new_set_all(64 * LANES, false);
880 assert!(all_false_mask_simd::<LANES>(&all_false));
881 }
882 }
883 };
884 }
885
886 include!(concat!(env!("OUT_DIR"), "/simd_lanes.rs"));
887
888 simd_bitmask_suite!(simd_bitmask_w8, W8);
889 simd_bitmask_suite!(simd_bitmask_w16, W16);
890 simd_bitmask_suite!(simd_bitmask_w32, W32);
891 simd_bitmask_suite!(simd_bitmask_w64, W64);
892}