use std::sync::Arc;
use arrow::buffer::{Buffer, OffsetBuffer};
/// Number of buckets a byte can fall into; one bucket per bit of the `u32` fingerprint.
const FINGERPRINT_BUCKETS: u8 = 32;
/// Mask that keeps the low bits of a byte, yielding its bucket index (`0..FINGERPRINT_BUCKETS`).
const FINGERPRINT_MASK: u8 = FINGERPRINT_BUCKETS - 1;

/// A 32-bit byte-presence summary of a byte string.
///
/// Bit `i` is set when at least one byte of the summarised string has `byte & FINGERPRINT_MASK`
/// equal to `i`. Distinct bytes can share a bucket, so the summary is lossy.
#[derive(Clone, Copy, Debug)]
pub(super) struct StringFingerprint(u32);

impl StringFingerprint {
    /// Wraps an already-computed bit pattern without modifying it.
    pub(super) fn from_bits(bits: u32) -> Self {
        StringFingerprint(bits)
    }

    /// Computes the fingerprint of `bytes` by OR-ing together one bit per byte bucket.
    ///
    /// An empty slice produces a fingerprint with no bits set.
    pub(super) fn from_bytes(bytes: &[u8]) -> Self {
        let folded = bytes.iter().fold(0u32, |acc, &b| {
            // The mask keeps the shift amount strictly below 32.
            acc | (1u32 << u32::from(b & FINGERPRINT_MASK))
        });
        StringFingerprint(folded)
    }

    /// Returns the raw bit pattern.
    pub(super) fn bits(self) -> u32 {
        let Self(raw) = self;
        raw
    }

    /// Returns `true` when every bit set in `needle` is also set in `self`.
    ///
    /// A `false` result means `needle` has a byte bucket that `self` lacks; a `true` result is
    /// only a "maybe", because different bytes can map to the same bucket.
    pub(super) fn might_contain(self, needle: Self) -> bool {
        // No needle bit may be missing from `self`.
        (needle.0 & !self.0) == 0
    }
}
/// Computes one fingerprint per string stored in a values/offsets pair.
///
/// Each consecutive pair of entries in `offsets` delimits one byte range of `values`; the
/// returned slice holds the raw `u32` fingerprint bits of every such range, in order. When
/// `offsets` has fewer than two entries there are no ranges and the result is empty.
///
/// Offsets are expected to be non-decreasing and within `values`; this is checked with debug
/// assertions only.
pub(super) fn build_fingerprints(values: &Buffer, offsets: &OffsetBuffer<i32>) -> Arc<[u32]> {
    let data = values.as_slice();
    let bounds: &[i32] = offsets.as_ref();
    bounds
        .windows(2)
        .map(|pair| {
            let (lo, hi) = (pair[0] as usize, pair[1] as usize);
            debug_assert!(lo <= hi);
            debug_assert!(hi <= data.len());
            StringFingerprint::from_bytes(&data[lo..hi]).0
        })
        .collect()
}
/// Extracts the literal needle from a LIKE pattern of the form `%needle%`.
///
/// Returns `Some(needle)` only when `pattern` starts and ends with `%` and the bytes in between
/// are non-empty and free of LIKE metacharacters. Returns `None` for every other pattern
/// (too short, missing a leading/trailing `%`, empty needle, or a needle containing `%`, `_`
/// or `\`).
///
/// A backslash is rejected because Arrow's LIKE kernels treat it as the escape character
/// (assumed to be the evaluator these patterns are written for): in `%abc\%` the trailing `%`
/// is an escaped literal, so the pattern is not a plain substring search and `abc\` must not be
/// used as the needle.
pub(super) fn substring_pattern_bytes(pattern: &[u8]) -> Option<&[u8]> {
    // A lone `%` has its only byte consumed by the prefix strip, so the suffix strip fails.
    let inner = pattern.strip_prefix(b"%")?.strip_suffix(b"%")?;
    if inner.is_empty() {
        return None;
    }
    let has_metachar = inner
        .iter()
        .any(|&b| matches!(b, b'%' | b'_' | b'\\'));
    if has_metachar {
        return None;
    }
    Some(inner)
}