Skip to main content

ass_core/tokenizer/simd/
utf8.rs

1//! SIMD-accelerated UTF-8 validation utilities
2//!
3//! Provides batch validation of UTF-8 byte sequences using vectorized
4//! operations, with automatic fallback to the scalar path when SIMD is
5//! unavailable.
6
7use crate::utils::CoreError;
8use wide::u8x16;
9
10/// Batch validate UTF-8 sequences using SIMD
11///
12/// Validates multiple bytes at once for UTF-8 compliance.
13/// Provides faster validation for large text blocks.
14/// Validate UTF-8 encoding of byte slice using batch processing
15///
16/// # Errors
17///
18/// Returns an error if the byte slice contains invalid UTF-8 sequences.
19pub fn validate_utf8_batch(bytes: &[u8]) -> Result<(), CoreError> {
20    #[cfg(feature = "simd")]
21    {
22        if bytes.len() >= 16 {
23            return validate_utf8_simd_impl(bytes);
24        }
25    }
26
27    validate_utf8_scalar(bytes)
28}
29
/// SIMD fast path for UTF-8 validation
///
/// Scans the input in 16-byte chunks and tests whether any byte in a chunk
/// has its high bit set. Pure-ASCII chunks are accepted without further
/// work; the first chunk containing a non-ASCII byte causes the whole input
/// to be handed to the scalar validator.
///
/// # Errors
///
/// Returns an error if the byte slice contains invalid UTF-8 sequences.
/// Error offsets are always relative to the start of `bytes`.
#[cfg(feature = "simd")]
fn validate_utf8_simd_impl(bytes: &[u8]) -> Result<(), CoreError> {
    // Loop-invariant mask: a set high bit marks a non-ASCII byte.
    let high_bit = u8x16::splat(0x80);

    let chunks = bytes.chunks_exact(16);
    let remainder = chunks.remainder();

    for chunk in chunks {
        let lanes: [u8; 16] = chunk
            .try_into()
            .expect("chunks_exact(16) always yields 16-byte slices");

        if (u8x16::from(lanes) & high_bit).move_mask() != 0 {
            // Multi-byte sequences may straddle chunk boundaries, so let the
            // scalar validator handle the entire input.
            return validate_utf8_scalar(bytes);
        }
    }

    // Every full chunk was ASCII, so the remainder starts on a character
    // boundary and can be validated on its own. If it is invalid, re-run the
    // scalar validator over the whole slice so the reported offset and
    // message are relative to `bytes` rather than to the remainder.
    validate_utf8_scalar(remainder).or_else(|_| validate_utf8_scalar(bytes))
}
49
50/// Scalar UTF-8 validation implementation
51fn validate_utf8_scalar(bytes: &[u8]) -> Result<(), CoreError> {
52    core::str::from_utf8(bytes)
53        .map(|_| ())
54        .map_err(|e| CoreError::utf8_error(e.valid_up_to(), format!("{e}")))
55}