ass_core/tokenizer/simd/
utf8.rs1use crate::utils::CoreError;
8use wide::u8x16;
9
10pub fn validate_utf8_batch(bytes: &[u8]) -> Result<(), CoreError> {
20 #[cfg(feature = "simd")]
21 {
22 if bytes.len() >= 16 {
23 return validate_utf8_simd_impl(bytes);
24 }
25 }
26
27 validate_utf8_scalar(bytes)
28}
29
/// Validates `bytes` as UTF-8, using 16-byte SIMD chunks to skip over ASCII.
///
/// Each full 16-byte chunk is tested for bytes with the high bit set. As soon
/// as a chunk contains a non-ASCII byte, the whole input is validated by
/// `validate_utf8_scalar`. If every full chunk is pure ASCII, only the
/// trailing partial chunk (fewer than 16 bytes) still needs checking.
///
/// # Errors
///
/// Returns the error produced by `validate_utf8_scalar` for the *entire*
/// input, so the reported offset is always relative to the start of `bytes`.
#[cfg(feature = "simd")]
fn validate_utf8_simd_impl(bytes: &[u8]) -> Result<(), CoreError> {
    // Loop-invariant mask selecting the high bit of every lane.
    let high_bit_mask = u8x16::splat(0x80);

    let chunks = bytes.chunks_exact(16);
    let remainder = chunks.remainder();

    for chunk in chunks {
        let lanes: [u8; 16] = chunk
            .try_into()
            .expect("chunks_exact(16) only yields 16-byte slices");

        let has_non_ascii = (u8x16::from(lanes) & high_bit_mask).move_mask();
        if has_non_ascii != 0 {
            return validate_utf8_scalar(bytes);
        }
    }

    // Every full chunk is ASCII, so the remainder starts on a character
    // boundary and can be checked on its own.
    if core::str::from_utf8(remainder).is_ok() {
        return Ok(());
    }

    // The tail is invalid. Re-validate the whole input rather than just the
    // remainder so the error offset is relative to `bytes`, matching the
    // other error paths.
    validate_utf8_scalar(bytes)
}
49
50fn validate_utf8_scalar(bytes: &[u8]) -> Result<(), CoreError> {
52 core::str::from_utf8(bytes)
53 .map(|_| ())
54 .map_err(|e| CoreError::utf8_error(e.valid_up_to(), format!("{e}")))
55}