Skip to main content

simd_utf16_len/
lib.rs

1//! SIMD-accelerated UTF-16 length calculation from UTF-8 bytes.
2//!
3//! Formula: `utf16_len = byte_length - continuation_bytes + four_byte_leaders`
4//!
5//! Where:
6//! - continuation bytes: `(byte & 0xC0) == 0x80`
7//! - four-byte leaders: `byte >= 0xF0`
8
/// Find the smallest index `>= i` that is a valid UTF-8 char boundary.
///
/// Equivalent of `str::ceil_char_boundary` for toolchains on which that
/// method is not available on stable.
///
/// # Arguments
///
/// * `s` - the string whose boundaries are searched.
/// * `i` - the byte index to round up.
///
/// # Returns
///
/// The first byte index at or after `i` whose byte is not a UTF-8
/// continuation byte (`0b10xx_xxxx`). Returns `s.len()` when `i >= s.len()`
/// or when only continuation bytes remain; this function never panics.
#[inline(always)]
fn ceil_char_boundary(s: &str, i: usize) -> usize {
    let bytes = s.as_bytes();
    let len = bytes.len();
    if i >= len {
        return len;
    }
    // `i < len` holds here, so the slice below cannot panic. Iterating the
    // tail needs no per-byte bounds check and no `unsafe`.
    bytes[i..]
        .iter()
        .position(|&byte| (byte & 0xC0) != 0x80)
        .map_or(len, |offset| i + offset)
}
26
27#[cfg(target_arch = "x86_64")]
28mod x86_64;
29
30#[cfg(target_arch = "aarch64")]
31mod aarch64;
32
33#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
34mod wasm32;
35
36#[cfg(not(any(
37    target_arch = "x86_64",
38    target_arch = "aarch64",
39    all(target_arch = "wasm32", target_feature = "simd128"),
40)))]
41mod scalar;
42
43#[cfg(target_arch = "x86_64")]
44pub use x86_64::utf16_len;
45
46#[cfg(target_arch = "aarch64")]
47pub use aarch64::utf16_len;
48
49#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
50pub use wasm32::utf16_len;
51
52#[cfg(not(any(
53    target_arch = "x86_64",
54    target_arch = "aarch64",
55    all(target_arch = "wasm32", target_feature = "simd128"),
56)))]
57pub use scalar::utf16_len;
58
#[cfg(test)]
mod tests {
    use super::utf16_len;

    /// Ground truth: count the code units the standard library emits when
    /// re-encoding `s` as UTF-16.
    fn reference(s: &str) -> usize {
        s.encode_utf16().count()
    }

    /// Asserts that `utf16_len` and [`reference`] agree on `input`.
    ///
    /// `#[track_caller]` makes a failure point at the calling test rather
    /// than at this helper.
    #[track_caller]
    fn check(input: &str) {
        assert_eq!(utf16_len(input), reference(input));
    }

    #[test]
    fn empty() {
        check("");
    }

    #[test]
    fn ascii_only() {
        check("hello");
    }

    #[test]
    fn two_byte_chars() {
        // Accented Latin letters, each encoded as two UTF-8 bytes.
        check("café résumé");
    }

    #[test]
    fn three_byte_chars() {
        // CJK ideographs from U+4E00..U+9FFF, three UTF-8 bytes each.
        check("你好世界");
    }

    #[test]
    fn four_byte_chars() {
        // Supplementary-plane emoji: four UTF-8 bytes, a surrogate pair in UTF-16.
        check("😀🎉🚀💯");
    }

    #[test]
    fn mixed() {
        check("Hello, 世界! 🌍🌎🌏 café");
    }

    #[test]
    fn single_char_boundaries() {
        // Exactly one character per UTF-8 encoded width (1, 2, 3 and 4 bytes).
        for c in ['a', 'é', '中', '🦀'] {
            let s = c.to_string();
            assert_eq!(utf16_len(&s), reference(&s), "char: {c}");
        }
    }

    #[test]
    fn longer_than_simd_width() {
        // Inputs longer than 16 bytes, so both the SIMD loop and the scalar
        // tail are exercised, once per UTF-8 width.
        let inputs = [
            "abcdefghijklmnopqrstuvwxyz",
            "αβγδεζηθικλμνξοπρστυφχψω",
            "你好世界你好世界你好世界你好世界",
            "🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀🦀",
        ];
        for input in inputs {
            check(input);
        }
    }

    #[test]
    fn repeated_pattern_large() {
        // Stress test: go past the 255-iteration batch boundary
        // (255 * 16 = 4080 bytes) with 5000 and 1500 * 4 = 6000 bytes.
        for (unit, count) in [("a", 5000), ("🦀", 1500)] {
            let s = unit.repeat(count);
            check(&s);
        }
    }

    #[test]
    fn all_byte_widths_interleaved() {
        // A 1+2+3+4 byte pattern repeated 100 times, so characters land on
        // many different alignments.
        let s = "aé中🦀".repeat(100);
        check(&s);
    }
}