/// Row-selection bitmap used by the filter kernels in this file.
///
/// Row `i` lives in word `i / 64` at bit position `i % 64` (see `set_bit` and
/// `get_bit`); the filter functions set a row's bit when the row matches.
pub type FilterBitmap = Vec<u64>;
/// Kinds of filter predicate: six comparisons plus two null checks.
///
/// NOTE(review): nothing in the visible part of this file consumes this enum
/// (the kernels are separate per-operation functions), so the exact meaning of
/// each variant is inferred from its name only — confirm against the callers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FilterOp {
Equal,
NotEqual,
LessThan,
LessEqual,
GreaterThan,
GreaterEqual,
IsNull,
IsNotNull,
}
/// Allocates an all-zero bitmap large enough to hold `num_rows` bits.
///
/// The result has `ceil(num_rows / 64)` words, so the last word may contain up
/// to 63 padding bits beyond the final row. `num_rows == 0` yields an empty
/// bitmap.
#[inline]
pub fn allocate_bitmap(num_rows: usize) -> FilterBitmap {
    // Quotient plus carry rather than `(num_rows + 63) / 64`: the addition in
    // the latter overflows for `num_rows > usize::MAX - 63`.
    let words = num_rows / 64 + usize::from(num_rows % 64 != 0);
    vec![0u64; words]
}
/// Sets the bit for row `idx`.
///
/// An `idx` whose word lies beyond the end of `bitmap` is ignored: the bitmap
/// is never grown and no panic is raised.
#[inline]
pub fn set_bit(bitmap: &mut FilterBitmap, idx: usize) {
    if let Some(word) = bitmap.get_mut(idx / 64) {
        *word |= 1u64 << (idx % 64);
    }
}
/// Reads the bit for row `idx`.
///
/// Returns `false` when `idx` falls in a word beyond the end of `bitmap`.
#[inline]
pub fn get_bit(bitmap: &FilterBitmap, idx: usize) -> bool {
    match bitmap.get(idx / 64) {
        Some(&word) => (word & (1u64 << (idx % 64))) != 0,
        None => false,
    }
}
/// Counts the set bits across every word of `bitmap`.
pub fn popcount(bitmap: &FilterBitmap) -> usize {
    bitmap
        .iter()
        .fold(0usize, |total, word| total + word.count_ones() as usize)
}
pub fn bitmap_and(a: &FilterBitmap, b: &FilterBitmap) -> FilterBitmap {
a.iter().zip(b.iter()).map(|(x, y)| x & y).collect()
}
/// Word-wise union of two bitmaps.
///
/// The result is as long as the longer input. Words that exist in only one
/// input are copied through unchanged (the other input is treated as all-zero
/// there), so no set bit from either operand is lost when the lengths differ.
/// For equal-length inputs this is a plain word-by-word OR.
pub fn bitmap_or(a: &FilterBitmap, b: &FilterBitmap) -> FilterBitmap {
    let (longer, shorter) = if a.len() >= b.len() { (a, b) } else { (b, a) };
    let mut out = longer.clone();
    for (dst, &src) in out.iter_mut().zip(shorter.iter()) {
        *dst |= src;
    }
    out
}
/// Returns the bitwise complement of `a`.
///
/// Every bit of every word is flipped. The bitmap does not record its row
/// count, so padding bits in the last word are inverted along with real rows.
pub fn bitmap_not(a: &FilterBitmap) -> FilterBitmap {
    let mut inverted = a.clone();
    for word in inverted.iter_mut() {
        *word = !*word;
    }
    inverted
}
/// Scalar kernel: sets the bit of every row whose value is `> threshold`.
///
/// Bits are only ever set, so anything already present in `result` is kept.
/// Rows that do not fit in `result` are dropped by `set_bit`.
pub fn filter_i64_gt_scalar(data: &[i64], threshold: i64, result: &mut FilterBitmap) {
    data.iter()
        .enumerate()
        .filter(|&(_, &value)| value > threshold)
        .for_each(|(row, _)| set_bit(result, row));
}
/// Scalar kernel: sets the bit of every row whose value is `>= threshold`.
///
/// Bits are only ever set, so anything already present in `result` is kept.
/// Rows that do not fit in `result` are dropped by `set_bit`.
pub fn filter_i64_ge_scalar(data: &[i64], threshold: i64, result: &mut FilterBitmap) {
    for (row, value) in data.iter().copied().enumerate() {
        if value < threshold {
            continue;
        }
        set_bit(result, row);
    }
}
/// Scalar kernel: sets the bit of every row whose value is `< threshold`.
///
/// Bits are only ever set, so anything already present in `result` is kept.
/// Rows that do not fit in `result` are dropped by `set_bit`.
pub fn filter_i64_lt_scalar(data: &[i64], threshold: i64, result: &mut FilterBitmap) {
    let matching_rows = data
        .iter()
        .enumerate()
        .filter_map(|(row, &value)| if value < threshold { Some(row) } else { None });
    for row in matching_rows {
        set_bit(result, row);
    }
}
/// Scalar kernel: sets the bit of every row whose value equals `target`.
///
/// Bits are only ever set, so anything already present in `result` is kept.
/// Rows that do not fit in `result` are dropped by `set_bit`.
pub fn filter_i64_eq_scalar(data: &[i64], target: i64, result: &mut FilterBitmap) {
    let hits = data.iter().enumerate().filter(|&(_, &value)| value == target);
    for (row, _) in hits {
        set_bit(result, row);
    }
}
/// Scalar kernel: sets the bit of every row with `low <= value <= high`.
///
/// Both bounds are inclusive; when `low > high` no row matches. Bits are only
/// ever set, and rows that do not fit in `result` are dropped by `set_bit`.
pub fn filter_i64_between_scalar(data: &[i64], low: i64, high: i64, result: &mut FilterBitmap) {
    let bounds = low..=high;
    for (row, value) in data.iter().enumerate() {
        if bounds.contains(value) {
            set_bit(result, row);
        }
    }
}
/// Scalar kernel: sets the bit of every row whose value is `> threshold`.
///
/// Uses the ordinary `f64` comparison, so a NaN value (or a NaN threshold)
/// never matches. Bits are only ever set, and rows that do not fit in
/// `result` are dropped by `set_bit`.
pub fn filter_f64_gt_scalar(data: &[f64], threshold: f64, result: &mut FilterBitmap) {
    data.iter().enumerate().for_each(|(row, &value)| {
        if threshold < value {
            set_bit(result, row);
        }
    });
}
/// Scalar kernel: sets the bit of every row whose value is `< threshold`.
///
/// Uses the ordinary `f64` comparison, so a NaN value (or a NaN threshold)
/// never matches. Bits are only ever set, and rows that do not fit in
/// `result` are dropped by `set_bit`.
pub fn filter_f64_lt_scalar(data: &[f64], threshold: f64, result: &mut FilterBitmap) {
    let matching_rows = data
        .iter()
        .enumerate()
        .filter(|&(_, &value)| threshold > value)
        .map(|(row, _)| row);
    for row in matching_rows {
        set_bit(result, row);
    }
}
/// AVX2 kernels: each loop iteration compares four 64-bit lanes at once and
/// ORs the resulting 4-bit mask into the output bitmap. Elements left over
/// after the last full group of four are handled one at a time.
#[cfg(target_arch = "x86_64")]
mod avx2 {
    use super::*;
    use std::arch::x86_64::*;

    /// Number of 64-bit lanes in one 256-bit vector.
    const LANES: usize = 4;

    /// Reports whether the running CPU supports AVX2 (runtime detection).
    #[inline]
    pub fn is_available() -> bool {
        is_x86_feature_detected!("avx2")
    }

    /// ORs the low four bits of `mask` into `result`, starting at bit `offset`.
    ///
    /// `offset` is always a multiple of `LANES`, so `offset % 64` is at most 60
    /// and the four bits never straddle a word boundary. Groups whose word
    /// lies beyond the end of `result` are dropped, matching `set_bit`.
    #[inline]
    fn merge_mask(result: &mut FilterBitmap, offset: usize, mask: i32) {
        if let Some(word) = result.get_mut(offset / 64) {
            // `movemask_pd` only populates bits 0..=3; the `& 0xF` documents that.
            *word |= ((mask & 0xF) as u64) << (offset % 64);
        }
    }

    /// Scalar fallback for the elements after the last full vector.
    ///
    /// `start` is the row index of `tail[0]` within the original column.
    #[inline]
    fn mark_tail<T: Copy>(
        tail: &[T],
        start: usize,
        result: &mut FilterBitmap,
        keep: impl Fn(T) -> bool,
    ) {
        for (i, &value) in tail.iter().enumerate() {
            if keep(value) {
                set_bit(result, start + i);
            }
        }
    }

    /// Sets the bit of every row with `value > threshold`.
    ///
    /// Bits are only ever set; rows that do not fit in `result` are ignored.
    ///
    /// # Safety
    /// The CPU must support AVX2 (check `is_available()` first).
    #[target_feature(enable = "avx2")]
    pub unsafe fn filter_i64_gt_avx2(data: &[i64], threshold: i64, result: &mut FilterBitmap) {
        let threshold_vec = _mm256_set1_epi64x(threshold);
        let chunks = data.chunks_exact(LANES);
        let tail = chunks.remainder();
        for (chunk_idx, chunk) in chunks.enumerate() {
            // SAFETY: `chunk` holds exactly four i64 (32 readable bytes) and the
            // unaligned load has no alignment requirement.
            let lanes = unsafe { _mm256_loadu_si256(chunk.as_ptr() as *const __m256i) };
            let hits = _mm256_cmpgt_epi64(lanes, threshold_vec);
            let mask = _mm256_movemask_pd(_mm256_castsi256_pd(hits));
            merge_mask(result, chunk_idx * LANES, mask);
        }
        mark_tail(tail, data.len() - tail.len(), result, |v| v > threshold);
    }

    /// Sets the bit of every row with `value < threshold`.
    ///
    /// Bits are only ever set; rows that do not fit in `result` are ignored.
    ///
    /// # Safety
    /// The CPU must support AVX2 (check `is_available()` first).
    #[target_feature(enable = "avx2")]
    pub unsafe fn filter_i64_lt_avx2(data: &[i64], threshold: i64, result: &mut FilterBitmap) {
        let threshold_vec = _mm256_set1_epi64x(threshold);
        let chunks = data.chunks_exact(LANES);
        let tail = chunks.remainder();
        for (chunk_idx, chunk) in chunks.enumerate() {
            // SAFETY: `chunk` holds exactly four i64 (32 readable bytes) and the
            // unaligned load has no alignment requirement.
            let lanes = unsafe { _mm256_loadu_si256(chunk.as_ptr() as *const __m256i) };
            // AVX2 only offers signed greater-than, so swap the operands.
            let hits = _mm256_cmpgt_epi64(threshold_vec, lanes);
            let mask = _mm256_movemask_pd(_mm256_castsi256_pd(hits));
            merge_mask(result, chunk_idx * LANES, mask);
        }
        mark_tail(tail, data.len() - tail.len(), result, |v| v < threshold);
    }

    /// Sets the bit of every row with `value == target`.
    ///
    /// Bits are only ever set; rows that do not fit in `result` are ignored.
    ///
    /// # Safety
    /// The CPU must support AVX2 (check `is_available()` first).
    #[target_feature(enable = "avx2")]
    pub unsafe fn filter_i64_eq_avx2(data: &[i64], target: i64, result: &mut FilterBitmap) {
        let target_vec = _mm256_set1_epi64x(target);
        let chunks = data.chunks_exact(LANES);
        let tail = chunks.remainder();
        for (chunk_idx, chunk) in chunks.enumerate() {
            // SAFETY: `chunk` holds exactly four i64 (32 readable bytes) and the
            // unaligned load has no alignment requirement.
            let lanes = unsafe { _mm256_loadu_si256(chunk.as_ptr() as *const __m256i) };
            let hits = _mm256_cmpeq_epi64(lanes, target_vec);
            let mask = _mm256_movemask_pd(_mm256_castsi256_pd(hits));
            merge_mask(result, chunk_idx * LANES, mask);
        }
        mark_tail(tail, data.len() - tail.len(), result, |v| v == target);
    }

    /// Sets the bit of every row with `low <= value <= high` (both inclusive).
    ///
    /// When `low > high` no row matches. Bits are only ever set; rows that do
    /// not fit in `result` are ignored.
    ///
    /// # Safety
    /// The CPU must support AVX2 (check `is_available()` first).
    #[target_feature(enable = "avx2")]
    pub unsafe fn filter_i64_between_avx2(
        data: &[i64],
        low: i64,
        high: i64,
        result: &mut FilterBitmap,
    ) {
        let low_vec = _mm256_set1_epi64x(low);
        let high_vec = _mm256_set1_epi64x(high);
        let chunks = data.chunks_exact(LANES);
        let tail = chunks.remainder();
        for (chunk_idx, chunk) in chunks.enumerate() {
            // SAFETY: `chunk` holds exactly four i64 (32 readable bytes) and the
            // unaligned load has no alignment requirement.
            let lanes = unsafe { _mm256_loadu_si256(chunk.as_ptr() as *const __m256i) };
            // Detect lanes OUTSIDE the range and invert the mask. This needs
            // only strict comparisons against the bounds themselves, so it
            // avoids computing `low - 1`, which overflows for `i64::MIN`.
            let below = _mm256_cmpgt_epi64(low_vec, lanes);
            let above = _mm256_cmpgt_epi64(lanes, high_vec);
            let outside = _mm256_or_si256(below, above);
            let mask = !_mm256_movemask_pd(_mm256_castsi256_pd(outside)) & 0xF;
            merge_mask(result, chunk_idx * LANES, mask);
        }
        mark_tail(tail, data.len() - tail.len(), result, |v| v >= low && v <= high);
    }

    /// Sets the bit of every row with `value > threshold`.
    ///
    /// The vector compare uses the ordered predicate `_CMP_GT_OQ`, so NaN
    /// never matches, the same as the scalar `>` used for the tail. Bits are
    /// only ever set; rows that do not fit in `result` are ignored.
    ///
    /// # Safety
    /// The CPU must support AVX2 (check `is_available()` first).
    #[target_feature(enable = "avx2")]
    pub unsafe fn filter_f64_gt_avx2(data: &[f64], threshold: f64, result: &mut FilterBitmap) {
        let threshold_vec = _mm256_set1_pd(threshold);
        let chunks = data.chunks_exact(LANES);
        let tail = chunks.remainder();
        for (chunk_idx, chunk) in chunks.enumerate() {
            // SAFETY: `chunk` holds exactly four f64 (32 readable bytes) and the
            // unaligned load has no alignment requirement.
            let lanes = unsafe { _mm256_loadu_pd(chunk.as_ptr()) };
            let hits = _mm256_cmp_pd(lanes, threshold_vec, _CMP_GT_OQ);
            merge_mask(result, chunk_idx * LANES, _mm256_movemask_pd(hits));
        }
        mark_tail(tail, data.len() - tail.len(), result, |v| v > threshold);
    }
}
/// NEON kernels: each loop iteration compares two 64-bit lanes at once and ORs
/// the resulting 2-bit mask into the output bitmap. A single leftover element
/// (odd-length input) is handled by the scalar tail loop.
#[cfg(target_arch = "aarch64")]
mod neon {
    use super::*;
    use std::arch::aarch64::*;

    /// Always reports `true`.
    // The dispatchers in this file call the NEON kernels without consulting
    // this function, hence the dead-code allowance.
    #[inline]
    #[allow(dead_code)]
    pub fn is_available() -> bool {
        true
    }

    /// Turns two comparison lanes (non-zero means "matched") into a 2-bit mask
    /// and ORs it into `result` at bit `offset`.
    ///
    /// `offset` is always even, so the pair never straddles a word boundary.
    /// Pairs whose word lies beyond the end of `result` are dropped.
    #[inline]
    fn merge_pair(result: &mut FilterBitmap, offset: usize, lane0: u64, lane1: u64) {
        let mask = u64::from(lane0 != 0) | (u64::from(lane1 != 0) << 1);
        if let Some(word) = result.get_mut(offset / 64) {
            *word |= mask << (offset % 64);
        }
    }

    /// Sets the bit of every row with `value > threshold`.
    ///
    /// # Safety
    /// The CPU must support the `neon` target feature.
    #[target_feature(enable = "neon")]
    pub unsafe fn filter_i64_gt_neon(data: &[i64], threshold: i64, result: &mut FilterBitmap) {
        unsafe {
            let threshold_vec = vdupq_n_s64(threshold);
            let pairs = data.chunks_exact(2);
            let tail = pairs.remainder();
            let tail_start = data.len() - tail.len();
            for (pair_idx, pair) in pairs.enumerate() {
                // `pair` holds exactly two i64, which is what `vld1q_s64` reads.
                let cmp = vcgtq_s64(vld1q_s64(pair.as_ptr()), threshold_vec);
                let lane0 = vgetq_lane_u64(cmp, 0);
                let lane1 = vgetq_lane_u64(cmp, 1);
                merge_pair(result, pair_idx * 2, lane0, lane1);
            }
            for (i, &value) in tail.iter().enumerate() {
                if value > threshold {
                    set_bit(result, tail_start + i);
                }
            }
        }
    }

    /// Sets the bit of every row with `value < threshold`.
    ///
    /// # Safety
    /// The CPU must support the `neon` target feature.
    #[target_feature(enable = "neon")]
    pub unsafe fn filter_i64_lt_neon(data: &[i64], threshold: i64, result: &mut FilterBitmap) {
        unsafe {
            let threshold_vec = vdupq_n_s64(threshold);
            let pairs = data.chunks_exact(2);
            let tail = pairs.remainder();
            let tail_start = data.len() - tail.len();
            for (pair_idx, pair) in pairs.enumerate() {
                // `pair` holds exactly two i64, which is what `vld1q_s64` reads.
                let cmp = vcltq_s64(vld1q_s64(pair.as_ptr()), threshold_vec);
                let lane0 = vgetq_lane_u64(cmp, 0);
                let lane1 = vgetq_lane_u64(cmp, 1);
                merge_pair(result, pair_idx * 2, lane0, lane1);
            }
            for (i, &value) in tail.iter().enumerate() {
                if value < threshold {
                    set_bit(result, tail_start + i);
                }
            }
        }
    }

    /// Sets the bit of every row with `value == target`.
    ///
    /// # Safety
    /// The CPU must support the `neon` target feature.
    #[target_feature(enable = "neon")]
    pub unsafe fn filter_i64_eq_neon(data: &[i64], target: i64, result: &mut FilterBitmap) {
        unsafe {
            let target_vec = vdupq_n_s64(target);
            let pairs = data.chunks_exact(2);
            let tail = pairs.remainder();
            let tail_start = data.len() - tail.len();
            for (pair_idx, pair) in pairs.enumerate() {
                // `pair` holds exactly two i64, which is what `vld1q_s64` reads.
                let cmp = vceqq_s64(vld1q_s64(pair.as_ptr()), target_vec);
                let lane0 = vgetq_lane_u64(cmp, 0);
                let lane1 = vgetq_lane_u64(cmp, 1);
                merge_pair(result, pair_idx * 2, lane0, lane1);
            }
            for (i, &value) in tail.iter().enumerate() {
                if value == target {
                    set_bit(result, tail_start + i);
                }
            }
        }
    }

    /// Sets the bit of every row with `value > threshold`.
    ///
    /// # Safety
    /// The CPU must support the `neon` target feature.
    #[target_feature(enable = "neon")]
    pub unsafe fn filter_f64_gt_neon(data: &[f64], threshold: f64, result: &mut FilterBitmap) {
        unsafe {
            let threshold_vec = vdupq_n_f64(threshold);
            let pairs = data.chunks_exact(2);
            let tail = pairs.remainder();
            let tail_start = data.len() - tail.len();
            for (pair_idx, pair) in pairs.enumerate() {
                // `pair` holds exactly two f64, which is what `vld1q_f64` reads.
                let cmp = vcgtq_f64(vld1q_f64(pair.as_ptr()), threshold_vec);
                let lane0 = vgetq_lane_u64(cmp, 0);
                let lane1 = vgetq_lane_u64(cmp, 1);
                merge_pair(result, pair_idx * 2, lane0, lane1);
            }
            for (i, &value) in tail.iter().enumerate() {
                if value > threshold {
                    set_bit(result, tail_start + i);
                }
            }
        }
    }
}
/// Sets the bit of every row with `value > threshold`, using the fastest
/// kernel available.
///
/// On x86_64 the AVX2 kernel is used when the CPU reports AVX2 at runtime,
/// otherwise the scalar kernel; on aarch64 the NEON kernel is always used;
/// every other target uses the scalar kernel.
pub fn filter_i64_gt(data: &[i64], threshold: i64, result: &mut FilterBitmap) {
    #[cfg(target_arch = "x86_64")]
    {
        if avx2::is_available() {
            // SAFETY: AVX2 support was confirmed by the runtime check above.
            unsafe { avx2::filter_i64_gt_avx2(data, threshold, result) };
            return;
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: the kernel needs the `neon` target feature.
        // NOTE(review): assumed present on every aarch64 target this builds for
        // (`neon::is_available()` hard-codes `true`) — confirm.
        unsafe { neon::filter_i64_gt_neon(data, threshold, result) };
    }
    #[cfg(not(target_arch = "aarch64"))]
    {
        filter_i64_gt_scalar(data, threshold, result);
    }
}
/// Sets the bit of every row with `value < threshold`, using the fastest
/// kernel available.
///
/// On x86_64 the AVX2 kernel is used when the CPU reports AVX2 at runtime,
/// otherwise the scalar kernel; on aarch64 the NEON kernel is always used;
/// every other target uses the scalar kernel.
pub fn filter_i64_lt(data: &[i64], threshold: i64, result: &mut FilterBitmap) {
    #[cfg(target_arch = "x86_64")]
    {
        if avx2::is_available() {
            // SAFETY: AVX2 support was confirmed by the runtime check above.
            unsafe { avx2::filter_i64_lt_avx2(data, threshold, result) };
            return;
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: the kernel needs the `neon` target feature.
        // NOTE(review): assumed present on every aarch64 target this builds for
        // (`neon::is_available()` hard-codes `true`) — confirm.
        unsafe { neon::filter_i64_lt_neon(data, threshold, result) };
    }
    #[cfg(not(target_arch = "aarch64"))]
    {
        filter_i64_lt_scalar(data, threshold, result);
    }
}
/// Sets the bit of every row with `value == target`, using the fastest kernel
/// available.
///
/// On x86_64 the AVX2 kernel is used when the CPU reports AVX2 at runtime,
/// otherwise the scalar kernel; on aarch64 the NEON kernel is always used;
/// every other target uses the scalar kernel.
pub fn filter_i64_eq(data: &[i64], target: i64, result: &mut FilterBitmap) {
    #[cfg(target_arch = "x86_64")]
    {
        if avx2::is_available() {
            // SAFETY: AVX2 support was confirmed by the runtime check above.
            unsafe { avx2::filter_i64_eq_avx2(data, target, result) };
            return;
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: the kernel needs the `neon` target feature.
        // NOTE(review): assumed present on every aarch64 target this builds for
        // (`neon::is_available()` hard-codes `true`) — confirm.
        unsafe { neon::filter_i64_eq_neon(data, target, result) };
    }
    #[cfg(not(target_arch = "aarch64"))]
    {
        filter_i64_eq_scalar(data, target, result);
    }
}
/// Sets the bit of every row with `low <= value <= high`, using the fastest
/// kernel available.
///
/// Only an AVX2 fast path exists for this predicate: x86_64 CPUs that report
/// AVX2 at runtime use it, and everything else (including aarch64) runs the
/// scalar kernel.
pub fn filter_i64_between(data: &[i64], low: i64, high: i64, result: &mut FilterBitmap) {
    #[cfg(target_arch = "x86_64")]
    {
        let use_avx2 = avx2::is_available();
        if use_avx2 {
            // SAFETY: AVX2 support was confirmed by the runtime check above.
            unsafe { avx2::filter_i64_between_avx2(data, low, high, result) };
            return;
        }
    }
    filter_i64_between_scalar(data, low, high, result)
}
/// Sets the bit of every row with `value > threshold`, using the fastest
/// kernel available.
///
/// On x86_64 the AVX2 kernel is used when the CPU reports AVX2 at runtime,
/// otherwise the scalar kernel; on aarch64 the NEON kernel is always used;
/// every other target uses the scalar kernel.
pub fn filter_f64_gt(data: &[f64], threshold: f64, result: &mut FilterBitmap) {
    #[cfg(target_arch = "x86_64")]
    {
        if avx2::is_available() {
            // SAFETY: AVX2 support was confirmed by the runtime check above.
            unsafe { avx2::filter_f64_gt_avx2(data, threshold, result) };
            return;
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        // SAFETY: the kernel needs the `neon` target feature.
        // NOTE(review): assumed present on every aarch64 target this builds for
        // (`neon::is_available()` hard-codes `true`) — confirm.
        unsafe { neon::filter_f64_gt_neon(data, threshold, result) };
    }
    #[cfg(not(target_arch = "aarch64"))]
    {
        filter_f64_gt_scalar(data, threshold, result);
    }
}
/// Reports which SIMD instruction sets are usable on this machine.
///
/// AVX2 and AVX-512F are detected at runtime on x86_64 and reported as `false`
/// on every other architecture. `has_neon` is decided at compile time: it is
/// `true` exactly when the target architecture is aarch64.
pub fn simd_info() -> SimdInfo {
    #[cfg(target_arch = "x86_64")]
    let (has_avx2, has_avx512f) = (
        is_x86_feature_detected!("avx2"),
        is_x86_feature_detected!("avx512f"),
    );
    #[cfg(not(target_arch = "x86_64"))]
    let (has_avx2, has_avx512f) = (false, false);

    SimdInfo {
        has_avx2,
        has_avx512f,
        has_neon: cfg!(target_arch = "aarch64"),
    }
}
/// Snapshot of the SIMD capabilities reported by `simd_info`.
#[derive(Debug, Clone)]
pub struct SimdInfo {
/// AVX2 detected at runtime; always `false` off x86_64.
pub has_avx2: bool,
/// AVX-512F detected at runtime; always `false` off x86_64.
pub has_avx512f: bool,
/// `true` when compiled for aarch64, `false` otherwise.
pub has_neon: bool,
}
impl SimdInfo {
    /// Returns a fixed speedup figure for `i64` filtering, chosen by the widest
    /// instruction set flagged: 8.0 for AVX-512F, 4.0 for AVX2, 2.0 for NEON
    /// and 1.0 when none is set.
    ///
    /// NOTE(review): no AVX-512 kernel is visible in this file, so the 8.0
    /// figure may not reflect the code path actually taken — confirm.
    pub fn expected_speedup_i64(&self) -> f64 {
        match (self.has_avx512f, self.has_avx2, self.has_neon) {
            (true, _, _) => 8.0,
            (false, true, _) => 4.0,
            (false, false, true) => 2.0,
            (false, false, false) => 1.0,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The values 0..100 as an `i64` column, shared by the small-input tests.
    fn hundred_rows() -> Vec<i64> {
        (0..100).collect()
    }

    #[test]
    fn test_filter_i64_gt() {
        let data = hundred_rows();
        let mut result = allocate_bitmap(data.len());
        filter_i64_gt(&data, 50, &mut result);

        // Rows 51..=99 match.
        assert_eq!(popcount(&result), 49);
        for row in 0..data.len() {
            assert_eq!(get_bit(&result, row), row > 50, "Failed at index {}", row);
        }
    }

    #[test]
    fn test_filter_i64_lt() {
        let data = hundred_rows();
        let mut result = allocate_bitmap(data.len());
        filter_i64_lt(&data, 50, &mut result);

        // Rows 0..=49 match.
        assert_eq!(popcount(&result), 50);
        for row in 0..data.len() {
            assert_eq!(get_bit(&result, row), row < 50, "Failed at index {}", row);
        }
    }

    #[test]
    fn test_filter_i64_eq() {
        let data = hundred_rows();
        let mut result = allocate_bitmap(data.len());
        filter_i64_eq(&data, 42, &mut result);

        assert_eq!(popcount(&result), 1);
        assert!(get_bit(&result, 42));
    }

    #[test]
    fn test_filter_i64_between() {
        let data = hundred_rows();
        let mut result = allocate_bitmap(data.len());
        filter_i64_between(&data, 25, 75, &mut result);

        // Both bounds are inclusive: rows 25..=75.
        assert_eq!(popcount(&result), 51);
        for row in 0..data.len() {
            let in_range = (25..=75).contains(&row);
            assert_eq!(get_bit(&result, row), in_range, "Failed at index {}", row);
        }
    }

    #[test]
    fn test_filter_f64_gt() {
        let data: Vec<f64> = (0..100).map(f64::from).collect();
        let mut result = allocate_bitmap(data.len());
        filter_f64_gt(&data, 50.0, &mut result);

        assert_eq!(popcount(&result), 49);
    }

    #[test]
    fn test_bitmap_operations() {
        let mut a = allocate_bitmap(64);
        let mut b = allocate_bitmap(64);
        (0..32).for_each(|bit| set_bit(&mut a, bit));
        (16..48).for_each(|bit| set_bit(&mut b, bit));

        // Overlap is 16..32; union is 0..48.
        assert_eq!(popcount(&bitmap_and(&a, &b)), 16);
        assert_eq!(popcount(&bitmap_or(&a, &b)), 48);
    }

    #[test]
    fn test_simd_info() {
        // Smoke test only: prints the detected capabilities.
        let info = simd_info();
        println!("SIMD capabilities: {:?}", info);
        println!("Expected speedup: {}x", info.expected_speedup_i64());
    }

    #[test]
    fn test_large_dataset() {
        let data: Vec<i64> = (0..1_000_000).collect();
        let mut result = allocate_bitmap(data.len());

        let timer = std::time::Instant::now();
        filter_i64_gt(&data, 500_000, &mut result);
        let elapsed = timer.elapsed();

        assert_eq!(popcount(&result), 499_999);
        println!("Filtered 1M rows in {:?}", elapsed);
    }
}