use crate::utils::CoreError;
use wide::u8x16;
#[cfg(not(feature = "std"))]
extern crate alloc;
#[cfg(not(feature = "std"))]
#[cfg(feature = "simd")]
use wide::u8x16;
#[must_use]
pub fn scan_delimiters(text: &str) -> Option<usize> {
let bytes = text.as_bytes();
#[cfg(feature = "simd")]
{
if bytes.len() >= 16 {
return scan_delimiters_simd_impl(bytes);
}
}
scan_delimiters_scalar(bytes)
}
#[cfg(feature = "simd")]
fn scan_delimiters_simd_impl(bytes: &[u8]) -> Option<usize> {
let delim_colon = u8x16::splat(b':');
let delim_comma = u8x16::splat(b',');
let delim_open_brace = u8x16::splat(b'{');
let delim_close_brace = u8x16::splat(b'}');
let delim_open_bracket = u8x16::splat(b'[');
let delim_close_bracket = u8x16::splat(b']');
let delim_newline = u8x16::splat(b'\n');
let delim_carriage = u8x16::splat(b'\r');
let chunks = bytes.chunks_exact(16);
let remainder = chunks.remainder();
for (chunk_idx, chunk) in chunks.enumerate() {
let chunk_array: [u8; 16] = chunk.try_into().unwrap();
let simd_chunk = u8x16::from(chunk_array);
let mask = simd_chunk.cmp_eq(delim_colon)
| simd_chunk.cmp_eq(delim_comma)
| simd_chunk.cmp_eq(delim_open_brace)
| simd_chunk.cmp_eq(delim_close_brace)
| simd_chunk.cmp_eq(delim_open_bracket)
| simd_chunk.cmp_eq(delim_close_bracket)
| simd_chunk.cmp_eq(delim_newline)
| simd_chunk.cmp_eq(delim_carriage);
let mask_bits = mask.move_mask();
if mask_bits != 0 {
let first_match = mask_bits.trailing_zeros() as usize;
return Some(chunk_idx * 16 + first_match);
}
}
if !remainder.is_empty() {
let remainder_offset = bytes.len() - remainder.len();
if let Some(pos) = scan_delimiters_scalar(remainder) {
return Some(remainder_offset + pos);
}
}
None
}
fn scan_delimiters_scalar(bytes: &[u8]) -> Option<usize> {
for (i, &byte) in bytes.iter().enumerate() {
match byte {
b':' | b',' | b'{' | b'}' | b'[' | b']' | b'\n' | b'\r' => return Some(i),
_ => {}
}
}
None
}
#[must_use]
pub fn parse_hex_u32(hex_str: &str) -> Option<u32> {
if hex_str.is_empty() || hex_str.len() > 8 {
return None;
}
#[cfg(feature = "simd")]
{
if hex_str.len() >= 4 {
return parse_hex_simd_impl(hex_str);
}
}
parse_hex_scalar(hex_str)
}
#[cfg(feature = "simd")]
fn parse_hex_simd_impl(hex_str: &str) -> Option<u32> {
let bytes = hex_str.as_bytes();
if bytes.len() <= 8 {
return parse_hex_scalar_direct(bytes);
}
let chunks = bytes.chunks_exact(16);
let remainder = chunks.remainder();
for chunk in chunks {
let chunk_array: [u8; 16] = chunk.try_into().unwrap();
let simd_chunk = u8x16::from(chunk_array);
if !validate_hex_chars_simd(simd_chunk) {
return None;
}
}
for &byte in remainder {
if !byte.is_ascii_hexdigit() {
return None;
}
}
parse_hex_scalar(hex_str)
}
#[cfg(feature = "simd")]
fn validate_hex_chars_simd(simd_chunk: u8x16) -> bool {
let mut valid_mask = u8x16::splat(0);
for digit in b'0'..=b'9' {
valid_mask |= simd_chunk.cmp_eq(u8x16::splat(digit));
}
for hex_char in b'A'..=b'F' {
valid_mask |= simd_chunk.cmp_eq(u8x16::splat(hex_char));
}
for hex_char in b'a'..=b'f' {
valid_mask |= simd_chunk.cmp_eq(u8x16::splat(hex_char));
}
valid_mask.move_mask() == 0xFFFF
}
#[cfg(feature = "simd")]
fn parse_hex_scalar_direct(bytes: &[u8]) -> Option<u32> {
if bytes.is_empty() || bytes.len() > 8 {
return None;
}
let mut result: u32 = 0;
for &byte in bytes {
let digit_value = match byte {
b'0'..=b'9' => byte - b'0',
b'A'..=b'F' => byte - b'A' + 10,
b'a'..=b'f' => byte - b'a' + 10,
_ => return None,
};
result = result
.checked_mul(16)?
.checked_add(u32::from(digit_value))?;
}
Some(result)
}
fn parse_hex_scalar(hex_str: &str) -> Option<u32> {
u32::from_str_radix(hex_str, 16).ok()
}
pub fn validate_utf8_batch(bytes: &[u8]) -> Result<(), CoreError> {
#[cfg(feature = "simd")]
{
if bytes.len() >= 16 {
return validate_utf8_simd_impl(bytes);
}
}
validate_utf8_scalar(bytes)
}
#[cfg(feature = "simd")]
fn validate_utf8_simd_impl(bytes: &[u8]) -> Result<(), CoreError> {
let chunks = bytes.chunks_exact(16);
let remainder = chunks.remainder();
for chunk in chunks {
let chunk_array: [u8; 16] = chunk.try_into().unwrap();
let simd_chunk = u8x16::from(chunk_array);
let ascii_mask = u8x16::splat(0x80);
let has_non_ascii = (simd_chunk & ascii_mask).move_mask();
if has_non_ascii != 0 {
return validate_utf8_scalar(bytes);
}
}
validate_utf8_scalar(remainder)
}
fn validate_utf8_scalar(bytes: &[u8]) -> Result<(), CoreError> {
core::str::from_utf8(bytes)
.map(|_| ())
.map_err(|e| CoreError::utf8_error(e.valid_up_to(), format!("{e}")))
}
#[cfg(test)]
mod tests {
use super::*;
#[cfg(not(feature = "std"))]
use alloc::{format, vec};
#[test]
fn scan_delimiters_finds_colon() {
let text = "key: value";
assert_eq!(scan_delimiters(text), Some(3));
}
#[test]
fn scan_delimiters_finds_comma() {
let text = "value1, value2";
assert_eq!(scan_delimiters(text), Some(6));
}
#[test]
fn scan_delimiters_finds_brace() {
let text = "text{override}";
assert_eq!(scan_delimiters(text), Some(4));
}
#[test]
fn scan_delimiters_no_match() {
let text = "plain text";
assert_eq!(scan_delimiters(text), None);
}
#[test]
fn scan_delimiters_long_text() {
let text = format!("{}:value", "a".repeat(50));
assert_eq!(scan_delimiters(&text), Some(50));
}
#[test]
fn parse_hex_valid() {
assert_eq!(parse_hex_u32("FF"), Some(0xFF));
assert_eq!(parse_hex_u32("00FF00FF"), Some(0x00FF_00FF));
assert_eq!(parse_hex_u32("12345678"), Some(0x1234_5678));
assert_eq!(parse_hex_u32("abcdef"), Some(0x00ab_cdef));
assert_eq!(parse_hex_u32("ABCDEF"), Some(0x00AB_CDEF));
}
#[test]
fn parse_hex_invalid() {
assert_eq!(parse_hex_u32("GG"), None);
assert_eq!(parse_hex_u32("123456789"), None); assert_eq!(parse_hex_u32(""), None);
assert_eq!(parse_hex_u32("XYZ"), None);
}
#[test]
fn validate_utf8_valid() {
assert!(validate_utf8_batch(b"Hello, World!").is_ok());
assert!(validate_utf8_batch("Hello, 世界! 🎵".as_bytes()).is_ok());
assert!(validate_utf8_batch("a".repeat(50).as_bytes()).is_ok());
}
#[test]
fn validate_utf8_invalid() {
assert!(validate_utf8_batch(&[0xFF, 0xFE]).is_err());
}
#[test]
fn scan_delimiters_all_delimiter_types() {
let delimiters = vec![
("text:more", 4, ':'),
("text,more", 4, ','),
("text{more", 4, '{'),
("text}more", 4, '}'),
("text[more", 4, '['),
("text]more", 4, ']'),
("text\nmore", 4, '\n'),
("text\rmore", 4, '\r'),
];
for (text, expected_pos, _delimiter) in delimiters {
assert_eq!(scan_delimiters(text), Some(expected_pos));
}
}
#[test]
fn scan_delimiters_empty_input() {
assert_eq!(scan_delimiters(""), None);
}
#[test]
fn scan_delimiters_single_char() {
assert_eq!(scan_delimiters(":"), Some(0));
assert_eq!(scan_delimiters("a"), None);
}
#[test]
fn scan_delimiters_at_beginning() {
assert_eq!(scan_delimiters(":text"), Some(0));
assert_eq!(scan_delimiters(",text"), Some(0));
assert_eq!(scan_delimiters("{text"), Some(0));
}
#[test]
fn scan_delimiters_at_end() {
assert_eq!(scan_delimiters("text:"), Some(4));
assert_eq!(scan_delimiters("text,"), Some(4));
assert_eq!(scan_delimiters("text}"), Some(4));
}
#[test]
fn scan_delimiters_multiple_delimiters() {
assert_eq!(scan_delimiters("a:b,c{d"), Some(1));
assert_eq!(scan_delimiters("text,more:values"), Some(4));
}
#[test]
fn scan_delimiters_exactly_16_bytes() {
let text = "abcdefghijklmno:"; assert_eq!(scan_delimiters(text), Some(15));
}
#[test]
fn scan_delimiters_less_than_16_bytes() {
let text = "short:text"; assert_eq!(scan_delimiters(text), Some(5));
}
#[test]
fn scan_delimiters_much_longer_than_16_bytes() {
let prefix = "a".repeat(32);
let text = format!("{prefix}:value");
assert_eq!(scan_delimiters(&text), Some(32));
}
#[test]
fn scan_delimiters_unicode_text() {
let text = "café🎭:value";
let colon_pos = text.find(':').unwrap();
assert_eq!(scan_delimiters(text), Some(colon_pos));
}
#[test]
fn parse_hex_edge_cases() {
assert_eq!(parse_hex_u32("0"), Some(0));
assert_eq!(parse_hex_u32("F"), Some(15));
assert_eq!(parse_hex_u32("FFFFFFFF"), Some(0xFFFF_FFFF));
assert_eq!(parse_hex_u32("aBcDeF"), Some(0x00ab_cdef));
assert_eq!(parse_hex_u32("AbCdEf"), Some(0x00ab_cdef));
assert_eq!(parse_hex_u32("00000001"), Some(1));
assert_eq!(parse_hex_u32("0000FF00"), Some(0xFF00));
}
#[test]
fn parse_hex_invalid_length() {
assert_eq!(parse_hex_u32("123456789"), None);
assert_eq!(parse_hex_u32("FFFFFFFFF"), None);
}
#[test]
fn parse_hex_invalid_characters() {
assert_eq!(parse_hex_u32("GHIJ"), None);
assert_eq!(parse_hex_u32("123G"), None);
assert_eq!(parse_hex_u32("12 34"), None); assert_eq!(parse_hex_u32("12-34"), None); assert_eq!(parse_hex_u32("FF\n"), None); }
#[test]
fn parse_hex_overflow_handling() {
assert_eq!(parse_hex_u32("FFFFFFFF"), Some(u32::MAX));
}
#[test]
fn validate_utf8_empty_input() {
assert!(validate_utf8_batch(&[]).is_ok());
}
#[test]
fn validate_utf8_ascii_only() {
let ascii_text = "Hello, World! 123 @#$%";
assert!(validate_utf8_batch(ascii_text.as_bytes()).is_ok());
}
#[test]
fn validate_utf8_exactly_16_bytes() {
let text = "1234567890123456"; assert!(validate_utf8_batch(text.as_bytes()).is_ok());
}
#[test]
fn validate_utf8_less_than_16_bytes() {
let text = "short"; assert!(validate_utf8_batch(text.as_bytes()).is_ok());
}
#[test]
fn validate_utf8_much_longer() {
let text = "a".repeat(100);
assert!(validate_utf8_batch(text.as_bytes()).is_ok());
}
#[test]
fn validate_utf8_mixed_unicode() {
let text = "ASCII中文🎵عربي";
assert!(validate_utf8_batch(text.as_bytes()).is_ok());
}
#[test]
fn validate_utf8_invalid_sequences() {
assert!(validate_utf8_batch(&[0xC0, 0x80]).is_err()); assert!(validate_utf8_batch(&[0xED, 0xA0, 0x80]).is_err()); assert!(validate_utf8_batch(&[0xF4, 0x90, 0x80, 0x80]).is_err()); }
#[test]
fn validate_utf8_incomplete_sequences() {
assert!(validate_utf8_batch(&[0xC2]).is_err()); assert!(validate_utf8_batch(&[0xE0, 0x80]).is_err()); assert!(validate_utf8_batch(&[0xF0, 0x90, 0x80]).is_err()); }
#[test]
fn scan_delimiters_scalar_fallback() {
let short_texts = vec![
"a:b", "test,val", "x{y}z", ];
for text in short_texts {
let result = scan_delimiters(text);
assert!(result.is_some());
}
}
#[test]
fn parse_hex_scalar_fallback() {
assert_eq!(parse_hex_u32("A"), Some(10));
assert_eq!(parse_hex_u32("FF"), Some(255));
assert_eq!(parse_hex_u32("123"), Some(0x123));
}
#[test]
fn scan_delimiters_boundary_at_chunk_edge() {
let text = format!("{}:", "a".repeat(15)); assert_eq!(scan_delimiters(&text), Some(15));
let text2 = format!("{}:", "a".repeat(16)); assert_eq!(scan_delimiters(&text2), Some(16));
}
#[test]
fn validate_utf8_non_ascii_in_chunks() {
let text = format!("{}café", "a".repeat(12)); assert!(validate_utf8_batch(text.as_bytes()).is_ok());
}
#[test]
fn parse_hex_case_sensitivity() {
assert_eq!(parse_hex_u32("abcdef"), parse_hex_u32("ABCDEF"));
assert_eq!(parse_hex_u32("deadbeef"), parse_hex_u32("DEADBEEF"));
}
#[test]
fn scan_delimiters_no_false_positives() {
let text = "abcdefghijklmnopqrstuvwxyz"; assert_eq!(scan_delimiters(text), None);
let text2 = "0123456789"; assert_eq!(scan_delimiters(text2), None);
}
#[test]
fn validate_utf8_chunk_remainder_handling() {
let text = format!("{}café", "a".repeat(17)); assert!(validate_utf8_batch(text.as_bytes()).is_ok());
let text2 = format!("{}🎵", "a".repeat(18)); assert!(validate_utf8_batch(text2.as_bytes()).is_ok());
}
#[test]
fn parse_hex_maximum_value() {
assert_eq!(parse_hex_u32("FFFFFFFF"), Some(u32::MAX));
assert_eq!(parse_hex_u32("ffffffff"), Some(u32::MAX));
}
#[test]
fn scan_delimiters_all_positions() {
for i in 0..10 {
let mut chars: Vec<char> = "abcdefghij".chars().collect();
chars[i] = ':';
let text: String = chars.iter().collect();
assert_eq!(scan_delimiters(&text), Some(i));
}
}
}