use crate::error::{Result, ZiporaError};
use crate::succinct::rank_select::simd::SimdCapabilities;
use std::char;
use std::str;
#[inline(always)]
pub fn utf8_byte_count(byte: u8) -> usize {
static UTF8_BYTE_COUNT_LUT: [u8; 256] = {
let mut table = [0u8; 256];
let mut i = 0;
while i < 256 {
let byte = i as u8;
table[i] = if byte < 0x80 {
1 } else if byte < 0xC0 {
0 } else if byte < 0xE0 {
2 } else if byte < 0xF0 {
3 } else if byte < 0xF8 {
4 } else {
0 };
i += 1;
}
table
};
UTF8_BYTE_COUNT_LUT[byte as usize] as usize
}
pub fn validate_utf8_and_count_chars(bytes: &[u8]) -> Result<usize> {
let caps = SimdCapabilities::get();
if caps.cpu_features.has_avx2 && bytes.len() >= 32 {
validate_utf8_and_count_chars_simd(bytes)
} else {
validate_utf8_and_count_chars_scalar(bytes)
}
}
fn validate_utf8_and_count_chars_scalar(bytes: &[u8]) -> Result<usize> {
match str::from_utf8(bytes) {
Ok(s) => Ok(s.chars().count()),
Err(e) => Err(ZiporaError::invalid_data(&format!("Invalid UTF-8: {}", e))),
}
}
#[cfg(target_arch = "x86_64")]
fn validate_utf8_and_count_chars_simd(bytes: &[u8]) -> Result<usize> {
#[cfg(target_feature = "avx2")]
{
if is_ascii_simd(bytes) {
return Ok(bytes.len());
}
}
validate_utf8_and_count_chars_scalar(bytes)
}
#[cfg(not(target_arch = "x86_64"))]
fn validate_utf8_and_count_chars_simd(bytes: &[u8]) -> Result<usize> {
validate_utf8_and_count_chars_scalar(bytes)
}
#[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
fn is_ascii_simd(bytes: &[u8]) -> bool {
use std::arch::x86_64::*;
unsafe {
let mut i = 0;
let len = bytes.len();
while i + 32 <= len {
let chunk = _mm256_loadu_si256(bytes.as_ptr().add(i) as *const __m256i);
let has_high_bit = _mm256_movemask_epi8(chunk);
if has_high_bit != 0 {
return false; }
i += 32;
}
for &byte in &bytes[i..] {
if byte >= 0x80 {
return false;
}
}
true
}
}
#[cfg(not(all(target_arch = "x86_64", target_feature = "avx2")))]
fn is_ascii_simd(bytes: &[u8]) -> bool {
bytes.iter().all(|&b| b < 0x80)
}
pub struct UnicodeProcessor {
normalize: bool,
case_fold: bool,
buffer: Vec<u8>,
}
impl UnicodeProcessor {
pub fn new() -> Self {
Self {
normalize: false,
case_fold: false,
buffer: Vec::new(),
}
}
pub fn with_normalization(mut self, enable: bool) -> Self {
self.normalize = enable;
self
}
pub fn with_case_folding(mut self, enable: bool) -> Self {
self.case_fold = enable;
self
}
pub fn process(&mut self, input: &str) -> Result<String> {
let mut result = if self.case_fold {
input.to_lowercase()
} else {
input.to_string()
};
if self.normalize {
result = self.normalize_basic(&result)?;
}
Ok(result)
}
fn normalize_basic(&self, input: &str) -> Result<String> {
Ok(input.chars().collect::<String>())
}
pub fn analyze(&self, input: &str) -> UnicodeAnalysis {
let mut analysis = UnicodeAnalysis::default();
for ch in input.chars() {
analysis.char_count += 1;
if ch.is_ascii() {
analysis.ascii_count += 1;
}
if ch.is_alphabetic() {
analysis.alphabetic_count += 1;
}
if ch.is_numeric() {
analysis.numeric_count += 1;
}
if ch.is_whitespace() {
analysis.whitespace_count += 1;
}
if ch.is_control() {
analysis.control_count += 1;
}
let codepoint = ch as u32;
if codepoint <= 0x7F {
analysis.basic_latin += 1;
} else if codepoint <= 0xFF {
analysis.latin_supplement += 1;
} else if codepoint <= 0x17FF {
analysis.extended_latin += 1;
} else {
analysis.other_unicode += 1;
}
}
analysis.byte_count = input.len();
analysis
}
}
impl Default for UnicodeProcessor {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone, Default)]
pub struct UnicodeAnalysis {
pub byte_count: usize,
pub char_count: usize,
pub ascii_count: usize,
pub alphabetic_count: usize,
pub numeric_count: usize,
pub whitespace_count: usize,
pub control_count: usize,
pub basic_latin: usize,
pub latin_supplement: usize,
pub extended_latin: usize,
pub other_unicode: usize,
}
impl UnicodeAnalysis {
pub fn is_ascii(&self) -> bool {
self.ascii_count == self.char_count
}
pub fn avg_bytes_per_char(&self) -> f64 {
if self.char_count == 0 {
0.0
} else {
self.byte_count as f64 / self.char_count as f64
}
}
pub fn complexity_score(&self) -> f64 {
if self.char_count == 0 {
return 0.0;
}
let non_ascii_ratio = (self.char_count - self.ascii_count) as f64 / self.char_count as f64;
let avg_bytes = self.avg_bytes_per_char();
(non_ascii_ratio * 0.7) + ((avg_bytes - 1.0) / 3.0 * 0.3).min(0.3)
}
}
pub struct Utf8ToUtf32Iterator<'a> {
bytes: &'a [u8],
position: usize,
current_char: Option<char>,
}
impl<'a> Utf8ToUtf32Iterator<'a> {
pub fn new(bytes: &'a [u8]) -> Result<Self> {
str::from_utf8(bytes)
.map_err(|e| ZiporaError::invalid_data(&format!("Invalid UTF-8: {}", e)))?;
Ok(Self {
bytes,
position: 0,
current_char: None,
})
}
pub fn current(&self) -> Option<char> {
self.current_char
}
pub fn next_char(&mut self) -> Option<char> {
if self.position >= self.bytes.len() {
self.current_char = None;
return None;
}
let byte = self.bytes[self.position];
let char_len = utf8_byte_count(byte);
if char_len == 0 || self.position + char_len > self.bytes.len() {
self.current_char = None;
return None;
}
let char_bytes = &self.bytes[self.position..self.position + char_len];
match str::from_utf8(char_bytes) {
Ok(s) => {
self.current_char = s.chars().next();
self.position += char_len;
self.current_char
}
Err(_) => {
self.current_char = None;
None
}
}
}
pub fn prev_char(&mut self) -> Option<char> {
if self.position == 0 {
self.current_char = None;
return None;
}
let mut pos = self.position.saturating_sub(1);
while pos > 0 && (self.bytes[pos] & 0xC0) == 0x80 {
pos -= 1;
}
let byte = self.bytes[pos];
let char_len = utf8_byte_count(byte);
if char_len == 0 || pos + char_len > self.bytes.len() {
self.current_char = None;
return None;
}
let char_bytes = &self.bytes[pos..pos + char_len];
match str::from_utf8(char_bytes) {
Ok(s) => {
self.current_char = s.chars().next();
self.position = pos;
self.current_char
}
Err(_) => {
self.current_char = None;
None
}
}
}
pub fn byte_position(&self) -> usize {
self.position
}
pub fn reset(&mut self) {
self.position = 0;
self.current_char = None;
}
}
pub mod utils {
use super::*;
pub fn to_lowercase_unicode(input: &str) -> String {
input.to_lowercase()
}
pub fn to_uppercase_unicode(input: &str) -> String {
input.to_uppercase()
}
pub fn display_width(input: &str) -> usize {
input.chars().map(|ch| {
if ch.is_control() {
0
} else if ch as u32 > 0x1100 && is_wide_char(ch) {
2 } else {
1 }
}).sum()
}
fn is_wide_char(ch: char) -> bool {
let codepoint = ch as u32;
(codepoint >= 0x1100 && codepoint <= 0x115F) || (codepoint >= 0x2E80 && codepoint <= 0x2EFF) || (codepoint >= 0x2F00 && codepoint <= 0x2FDF) || (codepoint >= 0x3000 && codepoint <= 0x303F) || (codepoint >= 0x3040 && codepoint <= 0x309F) || (codepoint >= 0x30A0 && codepoint <= 0x30FF) || (codepoint >= 0x3100 && codepoint <= 0x312F) || (codepoint >= 0x3130 && codepoint <= 0x318F) || (codepoint >= 0x3190 && codepoint <= 0x319F) || (codepoint >= 0x31A0 && codepoint <= 0x31BF) || (codepoint >= 0x31C0 && codepoint <= 0x31EF) || (codepoint >= 0x31F0 && codepoint <= 0x31FF) || (codepoint >= 0x3200 && codepoint <= 0x32FF) || (codepoint >= 0x3300 && codepoint <= 0x33FF) || (codepoint >= 0x3400 && codepoint <= 0x4DBF) || (codepoint >= 0x4E00 && codepoint <= 0x9FFF) || (codepoint >= 0xA000 && codepoint <= 0xA48F) || (codepoint >= 0xA490 && codepoint <= 0xA4CF) || (codepoint >= 0xAC00 && codepoint <= 0xD7AF) || (codepoint >= 0xF900 && codepoint <= 0xFAFF) || (codepoint >= 0xFE10 && codepoint <= 0xFE1F) || (codepoint >= 0xFE30 && codepoint <= 0xFE4F) || (codepoint >= 0xFE50 && codepoint <= 0xFE6F) || (codepoint >= 0xFF00 && codepoint <= 0xFFEF) || (codepoint >= 0x20000 && codepoint <= 0x2A6DF) || (codepoint >= 0x2A700 && codepoint <= 0x2B73F) || (codepoint >= 0x2B740 && codepoint <= 0x2B81F) || (codepoint >= 0x2F800 && codepoint <= 0x2FA1F) }
pub fn extract_codepoints(input: &str) -> Vec<u32> {
input.chars().map(|ch| ch as u32).collect()
}
pub fn is_printable(input: &str) -> bool {
input.chars().all(|ch| !ch.is_control() || ch == '\t' || ch == '\n' || ch == '\r')
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_utf8_byte_count() {
assert_eq!(utf8_byte_count(0x41), 1); assert_eq!(utf8_byte_count(0xC2), 2); assert_eq!(utf8_byte_count(0xE2), 3); assert_eq!(utf8_byte_count(0xF0), 4); assert_eq!(utf8_byte_count(0x80), 0); }
#[test]
fn test_unicode_analysis() {
let processor = UnicodeProcessor::new();
let analysis = processor.analyze("Hello World!");
assert!(analysis.is_ascii());
assert_eq!(analysis.char_count, 12);
assert_eq!(analysis.byte_count, 12);
assert_eq!(analysis.ascii_count, 12);
assert_eq!(analysis.avg_bytes_per_char(), 1.0);
let analysis = processor.analyze("Hello 世界!");
assert!(!analysis.is_ascii());
assert_eq!(analysis.char_count, 9);
assert!(analysis.byte_count > 9); assert_eq!(analysis.ascii_count, 7);
assert!(analysis.avg_bytes_per_char() > 1.0);
}
#[test]
fn test_utf8_iterator() {
let text = "Hello 世界!";
let mut iter = Utf8ToUtf32Iterator::new(text.as_bytes()).unwrap();
let chars: Vec<char> = text.chars().collect();
let mut collected = Vec::new();
while let Some(ch) = iter.next_char() {
collected.push(ch);
}
assert_eq!(chars, collected);
}
#[test]
fn test_backward_iteration() {
let text = "Héllo!";
let mut iter = Utf8ToUtf32Iterator::new(text.as_bytes()).unwrap();
while iter.next_char().is_some() {}
let mut backward_chars = Vec::new();
while let Some(ch) = iter.prev_char() {
backward_chars.push(ch);
}
let forward_chars: Vec<char> = text.chars().collect();
backward_chars.reverse();
assert_eq!(forward_chars, backward_chars);
}
#[test]
fn test_unicode_validation() {
let valid = "Hello 世界!";
assert!(validate_utf8_and_count_chars(valid.as_bytes()).is_ok());
let invalid = &[0xFF, 0xFE, 0xFD];
assert!(validate_utf8_and_count_chars(invalid).is_err());
}
#[test]
fn test_display_width() {
assert_eq!(utils::display_width("Hello"), 5);
assert_eq!(utils::display_width("世界"), 4); assert_eq!(utils::display_width("Hello世界"), 9); }
#[test]
fn test_case_conversion() {
assert_eq!(utils::to_lowercase_unicode("HELLO"), "hello");
assert_eq!(utils::to_uppercase_unicode("hello"), "HELLO");
assert_eq!(utils::to_lowercase_unicode("WORLD"), "world");
assert_eq!(utils::to_uppercase_unicode("world"), "WORLD");
}
#[test]
fn test_processor_configuration() {
let mut processor = UnicodeProcessor::new()
.with_case_folding(true)
.with_normalization(true);
let result = processor.process("HELLO World").unwrap();
assert_eq!(result, "hello world");
}
#[test]
fn test_codepoint_extraction() {
let text = "A世";
let codepoints = utils::extract_codepoints(text);
assert_eq!(codepoints, vec![0x41, 0x4E16]); }
}