use crate::error::{Result, ZiporaError};
use crate::succinct::rank_select::bmi2_acceleration::{
Bmi2Capabilities, Bmi2BextrOps
};
use std::collections::HashMap;
pub struct Bmi2StringProcessor {
capabilities: Bmi2Capabilities,
}
impl Bmi2StringProcessor {
pub fn new() -> Self {
Self {
capabilities: Bmi2Capabilities::detect(),
}
}
pub fn is_bmi2_available(&self) -> bool {
self.capabilities.has_bmi2
}
pub fn capabilities(&self) -> &Bmi2Capabilities {
&self.capabilities
}
pub fn validate_utf8_bmi2(&self, input: &[u8]) -> bool {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && input.len() >= 8 {
return unsafe { self.validate_utf8_bmi2_impl(input) };
}
}
std::str::from_utf8(input).is_ok()
}
pub fn count_utf8_chars_bmi2(&self, input: &[u8]) -> Result<usize> {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && input.len() >= 8 {
return unsafe { self.count_utf8_chars_bmi2_impl(input) };
}
}
match std::str::from_utf8(input) {
Ok(s) => Ok(s.chars().count()),
Err(_) => Err(ZiporaError::invalid_data("Invalid UTF-8 sequence")),
}
}
pub fn extract_utf8_chars_bmi2(&self, input: &[u8]) -> Result<Vec<u32>> {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && input.len() >= 8 {
return unsafe { self.extract_utf8_chars_bmi2_impl(input) };
}
}
match std::str::from_utf8(input) {
Ok(s) => Ok(s.chars().map(|c| c as u32).collect()),
Err(_) => Err(ZiporaError::invalid_data("Invalid UTF-8 sequence")),
}
}
pub fn utf8_to_utf16_bmi2(&self, input: &[u8]) -> Result<Vec<u16>> {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && input.len() >= 8 {
return unsafe { self.utf8_to_utf16_bmi2_impl(input) };
}
}
match std::str::from_utf8(input) {
Ok(s) => Ok(s.encode_utf16().collect()),
Err(_) => Err(ZiporaError::invalid_data("Invalid UTF-8 sequence")),
}
}
pub fn search_bmi2(&self, haystack: &str, needle: &str) -> Option<usize> {
if needle.is_empty() {
return Some(0);
}
if haystack.len() < needle.len() {
return None;
}
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && haystack.len() >= 16 && needle.len() >= 4 {
return unsafe { self.search_bmi2_impl(haystack.as_bytes(), needle.as_bytes()) };
}
}
haystack.find(needle)
}
pub fn wildcard_match_bmi2(&self, text: &str, pattern: &str) -> bool {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && text.len() >= 8 && pattern.len() >= 4 {
return unsafe { self.wildcard_match_bmi2_impl(text.as_bytes(), pattern.as_bytes()) };
}
}
self.wildcard_match_scalar(text, pattern)
}
pub fn char_class_match_bmi2(&self, text: &str, char_classes: &[CharClass]) -> Vec<bool> {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && text.len() >= 8 {
return unsafe { self.char_class_match_bmi2_impl(text.as_bytes(), char_classes) };
}
}
text.chars()
.map(|c| char_classes.iter().any(|class| class.matches(c)))
.collect()
}
pub fn extract_substrings_bmi2(&self, text: &str, ranges: &[(usize, usize)]) -> Result<Vec<String>> {
let mut results = Vec::with_capacity(ranges.len());
let text_bytes = text.as_bytes();
for &(start, len) in ranges {
if start + len > text.len() {
return Err(ZiporaError::invalid_data("Substring range out of bounds"));
}
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && len >= 8 {
let substring = unsafe { self.extract_substring_bmi2_impl(text_bytes, start, len)? };
results.push(substring);
continue;
}
}
let substring = text[start..start + len].to_string();
results.push(substring);
}
Ok(results)
}
pub fn to_lowercase_ascii_bmi2(&self, input: &str) -> String {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && input.len() >= 8 {
return unsafe { self.to_lowercase_ascii_bmi2_impl(input.as_bytes()) };
}
}
input.to_ascii_lowercase()
}
pub fn to_uppercase_ascii_bmi2(&self, input: &str) -> String {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && input.len() >= 8 {
return unsafe { self.to_uppercase_ascii_bmi2_impl(input.as_bytes()) };
}
}
input.to_ascii_uppercase()
}
pub fn filter_chars_bmi2(&self, input: &str, filter: CharFilter) -> String {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && input.len() >= 8 {
return unsafe { self.filter_chars_bmi2_impl(input.as_bytes(), filter) };
}
}
input.chars().filter(|&c| filter.matches(c)).collect()
}
pub fn hash_string_bmi2(&self, input: &str, seed: u64) -> u64 {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && input.len() >= 8 {
return unsafe { self.hash_string_bmi2_impl(input.as_bytes(), seed) };
}
}
self.hash_string_scalar(input.as_bytes(), seed)
}
pub fn detect_runs_bmi2(&self, input: &str) -> Vec<CharRun> {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && input.len() >= 8 {
return unsafe { self.detect_runs_bmi2_impl(input.as_bytes()) };
}
}
self.detect_runs_scalar(input)
}
pub fn analyze_compression_bmi2(&self, input: &str) -> CompressionAnalysis {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && input.len() >= 16 {
return unsafe { self.analyze_compression_bmi2_impl(input.as_bytes()) };
}
}
self.analyze_compression_scalar(input)
}
pub fn dictionary_lookup_bmi2(&self, text: &str, dictionary: &StringDictionary) -> Vec<DictionaryMatch> {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && text.len() >= 8 {
return unsafe { self.dictionary_lookup_bmi2_impl(text.as_bytes(), dictionary) };
}
}
self.dictionary_lookup_scalar(text, dictionary)
}
pub fn validate_bulk_bmi2(&self, strings: &[&str]) -> Vec<bool> {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && strings.len() >= 4 {
return unsafe { self.validate_bulk_bmi2_impl(strings) };
}
}
strings.iter()
.map(|s| self.validate_utf8_bmi2(s.as_bytes()))
.collect()
}
pub fn hash_bulk_bmi2(&self, strings: &[&str], seed: u64) -> Vec<u64> {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && strings.len() >= 4 {
return unsafe { self.hash_bulk_bmi2_impl(strings, seed) };
}
}
strings.iter()
.map(|s| self.hash_string_bmi2(s, seed))
.collect()
}
pub fn compare_bulk_bmi2(&self, pairs: &[(&str, &str)]) -> Vec<bool> {
#[cfg(target_arch = "x86_64")]
{
if self.capabilities.has_bmi2 && pairs.len() >= 4 {
return unsafe { self.compare_bulk_bmi2_impl(pairs) };
}
}
pairs.iter()
.map(|(a, b)| a == b)
.collect()
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn validate_utf8_bmi2_impl(&self, input: &[u8]) -> bool {
std::str::from_utf8(input).is_ok()
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn count_utf8_chars_bmi2_impl(&self, input: &[u8]) -> Result<usize> {
if std::str::from_utf8(input).is_err() {
return Err(ZiporaError::invalid_data("Invalid UTF-8 sequence"));
}
let mut continuation_bytes = 0;
let mut i = 0;
while i + 8 <= input.len() {
let bytes = unsafe { std::ptr::read_unaligned(input.as_ptr().add(i) as *const u64) };
continuation_bytes += self.count_utf8_continuation_bytes_bmi2(bytes);
i += 8;
}
while i < input.len() {
if (input[i] & 0xC0) == 0x80 {
continuation_bytes += 1;
}
i += 1;
}
Ok(input.len() - continuation_bytes)
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn extract_utf8_chars_bmi2_impl(&self, input: &[u8]) -> Result<Vec<u32>> {
let mut chars = Vec::new();
let mut i = 0;
while i < input.len() {
if i + 4 <= input.len() {
let char_bytes = unsafe { std::ptr::read_unaligned(input.as_ptr().add(i) as *const u32) };
match self.decode_utf8_char_bmi2(char_bytes, &mut i) {
Some(code_point) => chars.push(code_point),
None => return Err(ZiporaError::invalid_data("Invalid UTF-8 character")),
}
} else {
let remainder = &input[i..];
match std::str::from_utf8(remainder) {
Ok(s) => {
chars.extend(s.chars().map(|c| c as u32));
break;
}
Err(_) => return Err(ZiporaError::invalid_data("Invalid UTF-8 sequence")),
}
}
}
Ok(chars)
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn utf8_to_utf16_bmi2_impl(&self, input: &[u8]) -> Result<Vec<u16>> {
let mut utf16_output = Vec::new();
let mut i = 0;
while i < input.len() {
if i + 4 <= input.len() {
let char_bytes = unsafe { std::ptr::read_unaligned(input.as_ptr().add(i) as *const u32) };
match self.decode_utf8_char_bmi2(char_bytes, &mut i) {
Some(code_point) => {
let utf16_chars = self.encode_utf16_bmi2(code_point);
utf16_output.extend_from_slice(&utf16_chars);
}
None => return Err(ZiporaError::invalid_data("Invalid UTF-8 character")),
}
} else {
let remainder = &input[i..];
match std::str::from_utf8(remainder) {
Ok(s) => {
utf16_output.extend(s.encode_utf16());
break;
}
Err(_) => return Err(ZiporaError::invalid_data("Invalid UTF-8 sequence")),
}
}
}
Ok(utf16_output)
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn search_bmi2_impl(&self, haystack: &[u8], needle: &[u8]) -> Option<usize> {
if needle.len() > haystack.len() {
return None;
}
let first_char = needle[0];
let search_range = haystack.len() - needle.len() + 1;
let mut i = 0;
while i < search_range {
if i + 8 <= haystack.len() {
let chunk = unsafe { std::ptr::read_unaligned(haystack.as_ptr().add(i) as *const u64) };
for byte_pos in 0..8 {
if i + byte_pos >= search_range {
break;
}
let byte_val = Bmi2BextrOps::extract_bits_bextr(chunk, (byte_pos * 8) as u32, 8) as u8;
if byte_val == first_char {
if self.compare_substring_bmi2(&haystack[i + byte_pos..], needle) {
return Some(i + byte_pos);
}
}
}
i += 8; } else {
if haystack[i] == first_char {
if haystack[i..].starts_with(needle) {
return Some(i);
}
}
i += 1;
}
}
None
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn wildcard_match_bmi2_impl(&self, text: &[u8], pattern: &[u8]) -> bool {
let mut text_idx = 0;
let mut pattern_idx = 0;
while pattern_idx < pattern.len() && text_idx < text.len() {
match pattern[pattern_idx] {
b'*' => {
while pattern_idx < pattern.len() && pattern[pattern_idx] == b'*' {
pattern_idx += 1;
}
if pattern_idx == pattern.len() {
return true; }
let next_char = pattern[pattern_idx];
while text_idx < text.len() {
let current_char = if text_idx + 8 <= text.len() {
let chunk = unsafe { std::ptr::read_unaligned(text.as_ptr().add(text_idx) as *const u64) };
Bmi2BextrOps::extract_bits_bextr(chunk, 0, 8) as u8
} else {
text[text_idx]
};
if current_char == next_char {
break;
}
text_idx += 1;
}
}
b'?' => {
text_idx += 1;
pattern_idx += 1;
}
c => {
let text_char = if text_idx + 8 <= text.len() {
let chunk = unsafe { std::ptr::read_unaligned(text.as_ptr().add(text_idx) as *const u64) };
Bmi2BextrOps::extract_bits_bextr(chunk, 0, 8) as u8
} else {
text[text_idx]
};
if text_char != c {
return false;
}
text_idx += 1;
pattern_idx += 1;
}
}
}
while pattern_idx < pattern.len() && pattern[pattern_idx] == b'*' {
pattern_idx += 1;
}
pattern_idx == pattern.len()
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn char_class_match_bmi2_impl(&self, text: &[u8], char_classes: &[CharClass]) -> Vec<bool> {
let mut results = Vec::with_capacity(text.len());
let mut i = 0;
while i < text.len() {
if i + 8 <= text.len() {
let chunk = unsafe { std::ptr::read_unaligned(text.as_ptr().add(i) as *const u64) };
for byte_pos in 0..8 {
let char_val = Bmi2BextrOps::extract_bits_bextr(chunk, (byte_pos * 8) as u32, 8) as u8;
let matches = char_classes.iter().any(|class| class.matches_byte(char_val));
results.push(matches);
}
i += 8;
} else {
let char_val = text[i];
let matches = char_classes.iter().any(|class| class.matches_byte(char_val));
results.push(matches);
i += 1;
}
}
results
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn extract_substring_bmi2_impl(&self, text: &[u8], start: usize, len: usize) -> Result<String> {
let end = start + len;
if end > text.len() {
return Err(ZiporaError::invalid_data("Substring range out of bounds"));
}
let substring_bytes = &text[start..end];
if len >= 8 && self.validate_utf8_bmi2(substring_bytes) {
Ok(unsafe { String::from_utf8_unchecked(substring_bytes.to_vec()) })
} else {
match std::str::from_utf8(substring_bytes) {
Ok(s) => Ok(s.to_string()),
Err(_) => Err(ZiporaError::invalid_data("Invalid UTF-8 in substring")),
}
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn to_lowercase_ascii_bmi2_impl(&self, input: &[u8]) -> String {
let mut output = Vec::with_capacity(input.len());
for chunk in input.chunks(8) {
if chunk.len() == 8 {
let bytes = unsafe { std::ptr::read_unaligned(chunk.as_ptr() as *const u64) };
let lowercase_bytes = self.to_lowercase_chunk_bmi2(bytes);
for byte_pos in 0..8 {
let converted = Bmi2BextrOps::extract_bits_bextr(lowercase_bytes, (byte_pos * 8) as u32, 8) as u8;
output.push(converted);
}
} else {
for &byte in chunk {
output.push(byte.to_ascii_lowercase());
}
}
}
unsafe { String::from_utf8_unchecked(output) }
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn to_uppercase_ascii_bmi2_impl(&self, input: &[u8]) -> String {
let mut output = Vec::with_capacity(input.len());
for chunk in input.chunks(8) {
if chunk.len() == 8 {
let bytes = unsafe { std::ptr::read_unaligned(chunk.as_ptr() as *const u64) };
let uppercase_bytes = self.to_uppercase_chunk_bmi2(bytes);
for byte_pos in 0..8 {
let converted = Bmi2BextrOps::extract_bits_bextr(uppercase_bytes, (byte_pos * 8) as u32, 8) as u8;
output.push(converted);
}
} else {
for &byte in chunk {
output.push(byte.to_ascii_uppercase());
}
}
}
unsafe { String::from_utf8_unchecked(output) }
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn filter_chars_bmi2_impl(&self, input: &[u8], filter: CharFilter) -> String {
let mut output = Vec::new();
for chunk in input.chunks(8) {
if chunk.len() == 8 {
let bytes = unsafe { std::ptr::read_unaligned(chunk.as_ptr() as *const u64) };
for byte_pos in 0..8 {
let byte_val = Bmi2BextrOps::extract_bits_bextr(bytes, (byte_pos * 8) as u32, 8) as u8;
if filter.matches_byte(byte_val) {
output.push(byte_val);
}
}
} else {
for &byte in chunk {
if filter.matches_byte(byte) {
output.push(byte);
}
}
}
}
unsafe { String::from_utf8_unchecked(output) }
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn hash_string_bmi2_impl(&self, input: &[u8], mut hash: u64) -> u64 {
for chunk in input.chunks(8) {
if chunk.len() == 8 {
let bytes = unsafe { std::ptr::read_unaligned(chunk.as_ptr() as *const u64) };
hash = hash.rotate_left(5).wrapping_add(bytes);
hash ^= Bmi2BextrOps::extract_bits_bextr(hash, 13, 19);
} else {
for &byte in chunk {
hash = hash.rotate_left(5).wrapping_add(byte as u64);
}
}
}
hash
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn detect_runs_bmi2_impl(&self, input: &[u8]) -> Vec<CharRun> {
let mut runs = Vec::new();
if input.is_empty() {
return runs;
}
let mut current_char = input[0];
let mut run_start = 0;
let mut run_length = 1;
for i in 1..input.len() {
let next_char = if i + 8 <= input.len() {
let chunk = unsafe { std::ptr::read_unaligned(input.as_ptr().add(i) as *const u64) };
Bmi2BextrOps::extract_bits_bextr(chunk, 0, 8) as u8
} else {
input[i]
};
if next_char == current_char {
run_length += 1;
} else {
runs.push(CharRun {
character: current_char,
start: run_start,
length: run_length,
});
current_char = next_char;
run_start = i;
run_length = 1;
}
}
runs.push(CharRun {
character: current_char,
start: run_start,
length: run_length,
});
runs
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn analyze_compression_bmi2_impl(&self, input: &[u8]) -> CompressionAnalysis {
let mut char_freq: HashMap<u8, u32> = HashMap::new();
let mut total_chars = 0;
for chunk in input.chunks(8) {
if chunk.len() == 8 {
let bytes = unsafe { std::ptr::read_unaligned(chunk.as_ptr() as *const u64) };
for byte_pos in 0..8 {
let char_val = Bmi2BextrOps::extract_bits_bextr(bytes, (byte_pos * 8) as u32, 8) as u8;
*char_freq.entry(char_val).or_insert(0) += 1;
total_chars += 1;
}
} else {
for &byte in chunk {
*char_freq.entry(byte).or_insert(0) += 1;
total_chars += 1;
}
}
}
let mut entropy = 0.0;
for &freq in char_freq.values() {
if freq > 0 {
let p = freq as f64 / total_chars as f64;
entropy -= p * p.log2();
}
}
CompressionAnalysis {
unique_chars: char_freq.len(),
total_chars,
entropy,
estimated_compression_ratio: entropy / 8.0,
char_frequencies: char_freq,
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn dictionary_lookup_bmi2_impl(&self, text: &[u8], dictionary: &StringDictionary) -> Vec<DictionaryMatch> {
let mut matches = Vec::new();
for i in 0..text.len() {
for entry in &dictionary.entries {
let remaining = &text[i..];
if remaining.len() >= entry.text.len() {
let match_found = if entry.text.len() >= 8 && self.capabilities.has_bmi2 {
self.compare_strings_bmi2(remaining, entry.text.as_bytes(), entry.text.len())
} else {
remaining.starts_with(entry.text.as_bytes())
};
if match_found {
matches.push(DictionaryMatch {
position: i,
length: entry.text.len(),
dictionary_index: entry.index,
});
}
}
}
}
matches
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn validate_bulk_bmi2_impl(&self, strings: &[&str]) -> Vec<bool> {
strings.iter()
.map(|s| self.validate_utf8_bmi2(s.as_bytes()))
.collect()
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn hash_bulk_bmi2_impl(&self, strings: &[&str], seed: u64) -> Vec<u64> {
strings.iter()
.map(|s| unsafe { self.hash_string_bmi2_impl(s.as_bytes(), seed) })
.collect()
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "bmi1,bmi2")]
unsafe fn compare_bulk_bmi2_impl(&self, pairs: &[(&str, &str)]) -> Vec<bool> {
pairs.iter()
.map(|(a, b)| {
let a_bytes = a.as_bytes();
let b_bytes = b.as_bytes();
if a_bytes.len() != b_bytes.len() {
false
} else {
self.compare_strings_bmi2(a_bytes, b_bytes, a_bytes.len())
}
})
.collect()
}
#[cfg(target_arch = "x86_64")]
#[inline]
fn count_utf8_continuation_bytes_bmi2(&self, bytes: u64) -> usize {
let mut count = 0;
for byte_pos in 0..8 {
let byte_val = Bmi2BextrOps::extract_bits_bextr(bytes, (byte_pos * 8) as u32, 8) as u8;
if (byte_val & 0xC0) == 0x80 {
count += 1;
}
}
count
}
#[cfg(target_arch = "x86_64")]
#[inline]
fn compare_strings_bmi2(&self, text1: &[u8], text2: &[u8], len: usize) -> bool {
if len < 8 {
return text1.starts_with(text2);
}
let mut i = 0;
while i + 8 <= len {
unsafe {
let bytes1 = std::ptr::read_unaligned(text1.as_ptr().add(i) as *const u64);
let bytes2 = std::ptr::read_unaligned(text2.as_ptr().add(i) as *const u64);
if bytes1 != bytes2 {
return false;
}
}
i += 8;
}
while i < len {
if text1[i] != text2[i] {
return false;
}
i += 1;
}
true
}
#[cfg(target_arch = "x86_64")]
#[inline]
fn decode_utf8_char_bmi2(&self, char_bytes: u32, position: &mut usize) -> Option<u32> {
let first_byte = (char_bytes & 0xFF) as u8;
match first_byte {
0x00..=0x7F => {
*position += 1;
Some(first_byte as u32)
}
0xC0..=0xDF => {
if *position + 1 < 4 {
let second_byte = ((char_bytes >> 8) & 0xFF) as u8;
*position += 2;
Some(((first_byte as u32 & 0x1F) << 6) | (second_byte as u32 & 0x3F))
} else {
None
}
}
0xE0..=0xEF => {
if *position + 2 < 4 {
let second_byte = ((char_bytes >> 8) & 0xFF) as u8;
let third_byte = ((char_bytes >> 16) & 0xFF) as u8;
*position += 3;
Some(((first_byte as u32 & 0x0F) << 12) |
((second_byte as u32 & 0x3F) << 6) |
(third_byte as u32 & 0x3F))
} else {
None
}
}
0xF0..=0xF7 => {
if *position + 3 < 4 {
let second_byte = ((char_bytes >> 8) & 0xFF) as u8;
let third_byte = ((char_bytes >> 16) & 0xFF) as u8;
let fourth_byte = ((char_bytes >> 24) & 0xFF) as u8;
*position += 4;
Some(((first_byte as u32 & 0x07) << 18) |
((second_byte as u32 & 0x3F) << 12) |
((third_byte as u32 & 0x3F) << 6) |
(fourth_byte as u32 & 0x3F))
} else {
None
}
}
_ => None,
}
}
#[cfg(target_arch = "x86_64")]
#[inline]
fn encode_utf16_bmi2(&self, code_point: u32) -> Vec<u16> {
if code_point <= 0xFFFF {
vec![code_point as u16]
} else {
let adjusted = code_point - 0x10000;
vec![
0xD800 + (adjusted >> 10) as u16,
0xDC00 + (adjusted & 0x3FF) as u16,
]
}
}
#[cfg(target_arch = "x86_64")]
#[inline]
fn compare_substring_bmi2(&self, haystack: &[u8], needle: &[u8]) -> bool {
if haystack.len() < needle.len() {
return false;
}
for i in 0..needle.len() {
if haystack[i] != needle[i] {
return false;
}
}
true
}
#[cfg(target_arch = "x86_64")]
#[inline]
fn to_lowercase_chunk_bmi2(&self, bytes: u64) -> u64 {
let mut result = 0u64;
for byte_pos in 0..8 {
let byte_val = Bmi2BextrOps::extract_bits_bextr(bytes, (byte_pos * 8) as u32, 8) as u8;
let lowercase = if byte_val >= b'A' && byte_val <= b'Z' {
byte_val + 32
} else {
byte_val
};
result |= (lowercase as u64) << (byte_pos * 8);
}
result
}
#[cfg(target_arch = "x86_64")]
#[inline]
fn to_uppercase_chunk_bmi2(&self, bytes: u64) -> u64 {
let mut result = 0u64;
for byte_pos in 0..8 {
let byte_val = Bmi2BextrOps::extract_bits_bextr(bytes, (byte_pos * 8) as u32, 8) as u8;
let uppercase = if byte_val >= b'a' && byte_val <= b'z' {
byte_val - 32
} else {
byte_val
};
result |= (uppercase as u64) << (byte_pos * 8);
}
result
}
fn wildcard_match_scalar(&self, text: &str, pattern: &str) -> bool {
let text_chars: Vec<char> = text.chars().collect();
let pattern_chars: Vec<char> = pattern.chars().collect();
self.wildcard_match_recursive(&text_chars, &pattern_chars, 0, 0)
}
fn wildcard_match_recursive(&self, text: &[char], pattern: &[char], text_idx: usize, pattern_idx: usize) -> bool {
if pattern_idx == pattern.len() {
return text_idx == text.len();
}
if text_idx == text.len() {
return pattern[pattern_idx..].iter().all(|&c| c == '*');
}
match pattern[pattern_idx] {
'*' => {
self.wildcard_match_recursive(text, pattern, text_idx, pattern_idx + 1) ||
self.wildcard_match_recursive(text, pattern, text_idx + 1, pattern_idx)
}
'?' => {
self.wildcard_match_recursive(text, pattern, text_idx + 1, pattern_idx + 1)
}
c => {
text[text_idx] == c &&
self.wildcard_match_recursive(text, pattern, text_idx + 1, pattern_idx + 1)
}
}
}
fn detect_runs_scalar(&self, input: &str) -> Vec<CharRun> {
let mut runs = Vec::new();
let bytes = input.as_bytes();
if bytes.is_empty() {
return runs;
}
let mut current_char = bytes[0];
let mut run_start = 0;
let mut run_length = 1;
for (i, &byte) in bytes.iter().enumerate().skip(1) {
if byte == current_char {
run_length += 1;
} else {
runs.push(CharRun {
character: current_char,
start: run_start,
length: run_length,
});
current_char = byte;
run_start = i;
run_length = 1;
}
}
runs.push(CharRun {
character: current_char,
start: run_start,
length: run_length,
});
runs
}
fn analyze_compression_scalar(&self, input: &str) -> CompressionAnalysis {
let mut char_freq: HashMap<u8, u32> = HashMap::new();
let bytes = input.as_bytes();
for &byte in bytes {
*char_freq.entry(byte).or_insert(0) += 1;
}
let mut entropy = 0.0;
let total_chars = bytes.len();
for &freq in char_freq.values() {
if freq > 0 {
let p = freq as f64 / total_chars as f64;
entropy -= p * p.log2();
}
}
CompressionAnalysis {
unique_chars: char_freq.len(),
total_chars,
entropy,
estimated_compression_ratio: entropy / 8.0,
char_frequencies: char_freq,
}
}
fn dictionary_lookup_scalar(&self, text: &str, dictionary: &StringDictionary) -> Vec<DictionaryMatch> {
let mut matches = Vec::new();
let bytes = text.as_bytes();
for i in 0..bytes.len() {
for entry in &dictionary.entries {
if bytes[i..].starts_with(entry.text.as_bytes()) {
matches.push(DictionaryMatch {
position: i,
length: entry.text.len(),
dictionary_index: entry.index,
});
}
}
}
matches
}
fn hash_string_scalar(&self, input: &[u8], mut hash: u64) -> u64 {
for &byte in input {
hash = hash.rotate_left(5).wrapping_add(byte as u64);
}
hash
}
}
impl Default for Bmi2StringProcessor {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub enum CharClass {
Alpha,
Digit,
Alnum,
Space,
Punct,
Custom(Vec<u8>),
Range(u8, u8),
}
impl CharClass {
pub fn matches(&self, c: char) -> bool {
self.matches_byte(c as u8)
}
pub fn matches_byte(&self, byte: u8) -> bool {
match self {
CharClass::Alpha => byte.is_ascii_alphabetic(),
CharClass::Digit => byte.is_ascii_digit(),
CharClass::Alnum => byte.is_ascii_alphanumeric(),
CharClass::Space => byte.is_ascii_whitespace(),
CharClass::Punct => byte.is_ascii_punctuation(),
CharClass::Custom(chars) => chars.contains(&byte),
CharClass::Range(start, end) => byte >= *start && byte <= *end,
}
}
}
#[derive(Debug, Clone)]
pub enum CharFilter {
AlphaOnly,
DigitOnly,
AlnumOnly,
NoWhitespace,
KeepChars(Vec<u8>),
RemoveChars(Vec<u8>),
}
impl CharFilter {
pub fn matches(&self, c: char) -> bool {
self.matches_byte(c as u8)
}
pub fn matches_byte(&self, byte: u8) -> bool {
match self {
CharFilter::AlphaOnly => byte.is_ascii_alphabetic(),
CharFilter::DigitOnly => byte.is_ascii_digit(),
CharFilter::AlnumOnly => byte.is_ascii_alphanumeric(),
CharFilter::NoWhitespace => !byte.is_ascii_whitespace(),
CharFilter::KeepChars(chars) => chars.contains(&byte),
CharFilter::RemoveChars(chars) => !chars.contains(&byte),
}
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CharRun {
pub character: u8,
pub start: usize,
pub length: usize,
}
#[derive(Debug, Clone)]
pub struct CompressionAnalysis {
pub unique_chars: usize,
pub total_chars: usize,
pub entropy: f64,
pub estimated_compression_ratio: f64,
pub char_frequencies: HashMap<u8, u32>,
}
#[derive(Debug, Clone)]
pub struct DictionaryEntry {
pub text: String,
pub index: usize,
pub hash: u64,
}
#[derive(Debug, Clone)]
pub struct StringDictionary {
pub entries: Vec<DictionaryEntry>,
pub hash_table: HashMap<u64, usize>,
}
impl StringDictionary {
pub fn new(strings: Vec<String>) -> Self {
let mut entries = Vec::with_capacity(strings.len());
let mut hash_table = HashMap::new();
for (index, text) in strings.into_iter().enumerate() {
let hash = text.as_bytes().iter().fold(0u64, |acc, &b| {
acc.rotate_left(5).wrapping_add(b as u64)
});
entries.push(DictionaryEntry {
text,
index,
hash,
});
hash_table.insert(hash, index);
}
Self {
entries,
hash_table,
}
}
pub fn lookup_by_hash(&self, hash: u64) -> Option<&DictionaryEntry> {
self.hash_table.get(&hash).and_then(|&index| self.entries.get(index))
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DictionaryMatch {
pub position: usize,
pub length: usize,
pub dictionary_index: usize,
}
static GLOBAL_BMI2_PROCESSOR: std::sync::OnceLock<Bmi2StringProcessor> = std::sync::OnceLock::new();
pub fn get_global_bmi2_processor() -> &'static Bmi2StringProcessor {
GLOBAL_BMI2_PROCESSOR.get_or_init(|| Bmi2StringProcessor::new())
}
pub fn validate_utf8_bmi2(input: &[u8]) -> bool {
get_global_bmi2_processor().validate_utf8_bmi2(input)
}
pub fn count_utf8_chars_bmi2(input: &[u8]) -> Result<usize> {
get_global_bmi2_processor().count_utf8_chars_bmi2(input)
}
pub fn search_string_bmi2(haystack: &str, needle: &str) -> Option<usize> {
get_global_bmi2_processor().search_bmi2(haystack, needle)
}
pub fn wildcard_match_bmi2(text: &str, pattern: &str) -> bool {
get_global_bmi2_processor().wildcard_match_bmi2(text, pattern)
}
pub fn to_lowercase_ascii_bmi2(input: &str) -> String {
get_global_bmi2_processor().to_lowercase_ascii_bmi2(input)
}
pub fn to_uppercase_ascii_bmi2(input: &str) -> String {
get_global_bmi2_processor().to_uppercase_ascii_bmi2(input)
}
pub fn hash_string_bmi2(input: &str, seed: u64) -> u64 {
get_global_bmi2_processor().hash_string_bmi2(input, seed)
}
pub fn detect_runs_bmi2(input: &str) -> Vec<CharRun> {
get_global_bmi2_processor().detect_runs_bmi2(input)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_bmi2_processor_creation() {
let processor = Bmi2StringProcessor::new();
println!("BMI2 available: {}", processor.is_bmi2_available());
}
#[test]
fn test_utf8_validation() {
let processor = Bmi2StringProcessor::new();
assert!(processor.validate_utf8_bmi2("Hello, World!".as_bytes()));
assert!(processor.validate_utf8_bmi2("こんにちは".as_bytes())); assert!(processor.validate_utf8_bmi2("🦀 Rust".as_bytes()));
assert!(!processor.validate_utf8_bmi2(&[0xFF, 0xFE, 0xFD]));
}
#[test]
fn test_character_counting() {
let processor = Bmi2StringProcessor::new();
let test_cases = vec![
("Hello", 5),
("こんにちは", 5), ("🦀🔥⚡", 3), ("", 0),
];
for (text, expected) in test_cases {
let count = processor.count_utf8_chars_bmi2(text.as_bytes()).unwrap();
assert_eq!(count, expected, "Failed for text: {}", text);
}
}
#[test]
fn test_string_search() {
let processor = Bmi2StringProcessor::new();
let haystack = "The quick brown fox jumps over the lazy dog";
assert_eq!(processor.search_bmi2(haystack, "quick"), Some(4));
assert_eq!(processor.search_bmi2(haystack, "fox"), Some(16));
assert_eq!(processor.search_bmi2(haystack, "dog"), Some(40));
assert_eq!(processor.search_bmi2(haystack, "cat"), None);
assert_eq!(processor.search_bmi2(haystack, ""), Some(0));
}
#[test]
fn test_wildcard_matching() {
let processor = Bmi2StringProcessor::new();
assert!(processor.wildcard_match_bmi2("hello", "hello"));
assert!(processor.wildcard_match_bmi2("hello", "h*"));
assert!(processor.wildcard_match_bmi2("hello", "*o"));
assert!(processor.wildcard_match_bmi2("hello", "h?llo"));
assert!(processor.wildcard_match_bmi2("hello", "*"));
assert!(!processor.wildcard_match_bmi2("hello", "hi"));
assert!(!processor.wildcard_match_bmi2("hello", "h?"));
assert!(!processor.wildcard_match_bmi2("hello", "h??"));
}
#[test]
fn test_case_conversion() {
let processor = Bmi2StringProcessor::new();
let test_string = "Hello World 123!";
let lowercase = processor.to_lowercase_ascii_bmi2(test_string);
assert_eq!(lowercase, "hello world 123!");
let uppercase = processor.to_uppercase_ascii_bmi2(test_string);
assert_eq!(uppercase, "HELLO WORLD 123!");
}
#[test]
fn test_character_filtering() {
let processor = Bmi2StringProcessor::new();
let test_string = "Hello, World! 123";
let alpha_only = processor.filter_chars_bmi2(test_string, CharFilter::AlphaOnly);
assert_eq!(alpha_only, "HelloWorld");
let digit_only = processor.filter_chars_bmi2(test_string, CharFilter::DigitOnly);
assert_eq!(digit_only, "123");
let no_whitespace = processor.filter_chars_bmi2(test_string, CharFilter::NoWhitespace);
assert_eq!(no_whitespace, "Hello,World!123");
}
#[test]
fn test_run_detection() {
let processor = Bmi2StringProcessor::new();
let test_string = "aaabbccccdd";
let runs = processor.detect_runs_bmi2(test_string);
assert_eq!(runs.len(), 4);
assert_eq!(runs[0], CharRun { character: b'a', start: 0, length: 3 });
assert_eq!(runs[1], CharRun { character: b'b', start: 3, length: 2 });
assert_eq!(runs[2], CharRun { character: b'c', start: 5, length: 4 });
assert_eq!(runs[3], CharRun { character: b'd', start: 9, length: 2 });
}
#[test]
fn test_compression_analysis() {
let processor = Bmi2StringProcessor::new();
let test_string = "aaabbbccc";
let analysis = processor.analyze_compression_bmi2(test_string);
assert_eq!(analysis.unique_chars, 3);
assert_eq!(analysis.total_chars, 9);
assert!(analysis.entropy > 0.0);
assert!(analysis.estimated_compression_ratio < 1.0);
}
#[test]
fn test_string_hashing() {
let processor = Bmi2StringProcessor::new();
let test_string = "Hash this string";
let hash1 = processor.hash_string_bmi2(test_string, 0);
let hash2 = processor.hash_string_bmi2(test_string, 0);
let hash3 = processor.hash_string_bmi2(test_string, 42);
assert_eq!(hash1, hash2);
assert_ne!(hash1, hash3);
}
#[test]
fn test_bulk_operations() {
let processor = Bmi2StringProcessor::new();
let strings = vec!["hello", "world", "test", "string"];
let validations = processor.validate_bulk_bmi2(&strings);
assert!(validations.iter().all(|&v| v));
let hashes = processor.hash_bulk_bmi2(&strings, 0);
assert_eq!(hashes.len(), 4);
assert!(hashes.iter().all(|&h| h != 0));
let pairs = vec![("hello", "hello"), ("world", "word"), ("test", "test")];
let comparisons = processor.compare_bulk_bmi2(&pairs);
assert_eq!(comparisons, vec![true, false, true]);
}
#[test]
fn test_convenience_functions() {
assert!(validate_utf8_bmi2("Hello, World!".as_bytes()));
assert_eq!(count_utf8_chars_bmi2("Hello".as_bytes()).unwrap(), 5);
assert_eq!(search_string_bmi2("hello world", "world"), Some(6));
assert!(wildcard_match_bmi2("hello", "h*o"));
assert_eq!(to_lowercase_ascii_bmi2("HELLO"), "hello");
assert_eq!(to_uppercase_ascii_bmi2("hello"), "HELLO");
let hash = hash_string_bmi2("test", 0);
assert_ne!(hash, 0);
let runs = detect_runs_bmi2("aabbcc");
assert_eq!(runs.len(), 3);
}
#[test]
fn test_character_classes() {
let alpha = CharClass::Alpha;
let digit = CharClass::Digit;
let custom = CharClass::Custom(vec![b'x', b'y', b'z']);
let range = CharClass::Range(b'a', b'z');
assert!(alpha.matches('A'));
assert!(alpha.matches('z'));
assert!(!alpha.matches('1'));
assert!(digit.matches('5'));
assert!(!digit.matches('a'));
assert!(custom.matches('x'));
assert!(!custom.matches('a'));
assert!(range.matches('m'));
assert!(!range.matches('A'));
}
#[test]
fn test_dictionary_operations() {
let processor = Bmi2StringProcessor::new();
let dict_strings = vec![
"the".to_string(),
"quick".to_string(),
"brown".to_string(),
"fox".to_string(),
];
let dictionary = StringDictionary::new(dict_strings);
let text = "the quick brown fox";
let matches = processor.dictionary_lookup_bmi2(text, &dictionary);
assert!(!matches.is_empty());
assert!(matches.iter().any(|m| m.position == 0 && m.length == 3)); assert!(matches.iter().any(|m| m.position == 4 && m.length == 5)); }
#[test]
fn test_edge_cases() {
let processor = Bmi2StringProcessor::new();
assert!(processor.validate_utf8_bmi2(&[]));
assert_eq!(processor.count_utf8_chars_bmi2(&[]).unwrap(), 0);
assert_eq!(processor.search_bmi2("", ""), Some(0));
assert_eq!(processor.search_bmi2("test", ""), Some(0));
assert!(processor.validate_utf8_bmi2("a".as_bytes()));
assert_eq!(processor.count_utf8_chars_bmi2("a".as_bytes()).unwrap(), 1);
let long_string = "a".repeat(1000);
assert!(processor.validate_utf8_bmi2(long_string.as_bytes()));
assert_eq!(processor.count_utf8_chars_bmi2(long_string.as_bytes()).unwrap(), 1000);
}
}