fory_core/util/
string_util.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::mem;
19use std::ptr;
20
/// Bound (`2^31 - 1`) that `compute_string_hash` keeps its running hash strictly below:
/// whenever the hash reaches or exceeds this value it is repeatedly divided by 7.
const MAX_HASH32: u64 = (1 << 31) - 1;
22
/// Tag naming a string encoding, with explicit numeric discriminants.
// NOTE(review): presumably these discriminants are part of a serialized format — confirm
// against the callers before renumbering.
#[allow(dead_code)]
pub enum StringFlag {
    /// Latin-1 tag, discriminant `0`.
    LATIN1 = 0,
    /// UTF-8 tag, discriminant `1`.
    UTF8 = 1,
}
28
/// Swaps the high 8 bits and the low 8 bits of a 16-bit value.
fn swap_endian(value: u16) -> u16 {
    value.rotate_right(8)
}

/// Converts UTF-16 encoded data to UTF-8.
///
/// When `is_little_endian` is `true`, the two bytes of every code unit are swapped before
/// the unit is decoded; otherwise the code units are decoded exactly as given.
///
/// # Errors
///
/// Returns an error message when the input is not well-formed UTF-16:
/// * a high surrogate is the last code unit,
/// * a high surrogate is followed by something other than a low surrogate,
/// * a low surrogate appears without a preceding high surrogate (encoding it would
///   produce bytes that are not valid UTF-8).
pub fn to_utf8(utf16: &[u16], is_little_endian: bool) -> Result<Vec<u8>, String> {
    // Pre-allocating capacity to avoid dynamic resizing.
    // Longest case: 1 u16 to 3 u8 (a surrogate pair is 2 u16 to 4 u8, i.e. 2 per unit).
    let mut utf8_bytes: Vec<u8> = Vec::with_capacity(utf16.len() * 3);
    let ptr = utf8_bytes.as_mut_ptr();
    let mut offset = 0;
    let mut iter = utf16.iter();
    // SAFETY (applies to every raw write below): each consumed code unit contributes at most
    // 3 bytes (4 bytes per 2 units for a surrogate pair), so `offset` plus the bytes being
    // written never exceeds the `utf16.len() * 3` bytes reserved above.
    while let Some(&wc) = iter.next() {
        let wc = if is_little_endian {
            swap_endian(wc)
        } else {
            wc
        };
        match wc {
            // 1-byte sequence: U+0000..=U+007F.
            code_point if code_point < 0x80 => {
                unsafe {
                    ptr.add(offset).write(code_point as u8);
                }
                offset += 1;
            }
            // 2-byte sequence: U+0080..=U+07FF.
            code_point if code_point < 0x800 => {
                let bytes = [
                    ((code_point >> 6) & 0b1_1111) as u8 | 0b1100_0000,
                    (code_point & 0b11_1111) as u8 | 0b1000_0000,
                ];
                unsafe {
                    ptr::copy_nonoverlapping(bytes.as_ptr(), ptr.add(offset), 2);
                }
                offset += 2;
            }
            // High surrogate: must be followed by a low surrogate; together they form a
            // supplementary code point encoded as a 4-byte sequence.
            wc1 if (0xd800..=0xdbff).contains(&wc1) => {
                if let Some(&wc2) = iter.next() {
                    let wc2 = if is_little_endian {
                        swap_endian(wc2)
                    } else {
                        wc2
                    };
                    if !(0xdc00..=0xdfff).contains(&wc2) {
                        return Err("Invalid UTF-16 string: wrong surrogate pair".to_string());
                    }
                    let code_point =
                        ((((wc1 as u32) - 0xd800) << 10) | ((wc2 as u32) - 0xdc00)) + 0x10000;
                    let bytes = [
                        ((code_point >> 18) & 0b111) as u8 | 0b1111_0000,
                        ((code_point >> 12) & 0b11_1111) as u8 | 0b1000_0000,
                        ((code_point >> 6) & 0b11_1111) as u8 | 0b1000_0000,
                        (code_point & 0b11_1111) as u8 | 0b1000_0000,
                    ];
                    unsafe {
                        ptr::copy_nonoverlapping(bytes.as_ptr(), ptr.add(offset), 4);
                    }
                    offset += 4;
                } else {
                    return Err("Invalid UTF-16 string: missing surrogate pair".to_string());
                }
            }
            // A low surrogate that was not consumed as the second half of a pair is
            // unpaired; emitting it as a 3-byte sequence would yield ill-formed UTF-8.
            low if (0xdc00..=0xdfff).contains(&low) => {
                return Err("Invalid UTF-16 string: unexpected low surrogate".to_string());
            }
            // 3-byte sequence: remaining BMP code points U+0800..=U+FFFF.
            _ => {
                let bytes = [
                    ((wc >> 12) | 0b1110_0000) as u8,
                    ((wc >> 6) & 0b11_1111) as u8 | 0b1000_0000,
                    (wc & 0b11_1111) as u8 | 0b1000_0000,
                ];
                unsafe {
                    ptr::copy_nonoverlapping(bytes.as_ptr(), ptr.add(offset), 3);
                }
                offset += 3;
            }
        }
    }
    // SAFETY: exactly `offset` bytes were initialised above and `offset <= capacity`.
    unsafe {
        utf8_bytes.set_len(offset);
    }
    Ok(utf8_bytes)
}
109
/// Converts a camelCase or PascalCase string to snake_case.
///
/// Every ASCII uppercase letter is lowercased. An underscore is inserted in front of it
/// unless it is the first character, or it sits inside/at the end of a run of uppercase
/// letters (previous character uppercase and next character uppercase or absent).
pub fn to_snake_case(name: &str) -> String {
    let mut snake = String::with_capacity(name.len() + 4);
    let mut remaining = name.chars().peekable();
    let mut at_start = true;
    let mut prev_was_upper = false;

    while let Some(ch) = remaining.next() {
        let is_upper = ch.is_ascii_uppercase();
        if is_upper {
            let next_upper_or_end = remaining
                .peek()
                .map_or(true, |next| next.is_ascii_uppercase());
            let inside_acronym = prev_was_upper && next_upper_or_end;
            if !at_start && !inside_acronym {
                snake.push('_');
            }
            snake.push(ch.to_ascii_lowercase());
        } else {
            snake.push(ch);
        }
        prev_was_upper = is_upper;
        at_start = false;
    }
    snake
}
131
/// Converts a snake_case string to lowerCamelCase.
///
/// Underscores are dropped and the first character following one or more underscores is
/// ASCII-uppercased; all other characters are copied unchanged.
pub fn to_camel_case(name: &str) -> String {
    let mut camel = String::with_capacity(name.len());
    let mut pending_upper = false;

    for ch in name.chars() {
        match ch {
            '_' => pending_upper = true,
            _ if pending_upper => {
                camel.push(ch.to_ascii_uppercase());
                pending_upper = false;
            }
            _ => camel.push(ch),
        }
    }
    camel
}
149
150#[allow(dead_code)]
151pub fn compute_string_hash(s: &str) -> u32 {
152    let mut hash: u64 = 17;
153    s.as_bytes().iter().for_each(|b| {
154        hash = (hash * 31) + (*b as u64);
155        while hash >= MAX_HASH32 {
156            hash /= 7;
157        }
158    });
159    hash as u32
160}
161
162#[cfg(target_feature = "neon")]
163use std::arch::aarch64::*;
164
165#[cfg(target_feature = "avx2")]
166use std::arch::x86_64::*;
167
168#[cfg(target_feature = "sse2")]
169use std::arch::x86_64::*;
170
/// Chunk width in bytes of the AVX code paths: they load 32 bytes (one `__m256i`) per
/// iteration, and the dispatchers only take the AVX path for strings at least this long.
#[cfg(target_arch = "x86_64")]
pub const MIN_DIM_SIZE_AVX: usize = 32;

/// Chunk width in bytes of the SSE and NEON code paths: they load 16 bytes per iteration,
/// and the dispatchers only take those paths for strings at least this long.
#[cfg(any(
    target_arch = "x86",
    target_arch = "x86_64",
    all(target_arch = "aarch64", target_feature = "neon")
))]
pub const MIN_DIM_SIZE_SIMD: usize = 16;
180
181#[cfg(target_arch = "x86_64")]
182unsafe fn is_latin_avx(s: &str) -> bool {
183    let bytes = s.as_bytes();
184    let len = bytes.len();
185    let mut i = 0;
186    // SIMD skip ASCII
187    while i + MIN_DIM_SIZE_AVX <= len {
188        let chunk = _mm256_loadu_si256(bytes.as_ptr().add(i) as *const __m256i);
189        let hi_mask = _mm256_set1_epi8(0x80u8 as i8);
190        let masked = _mm256_and_si256(chunk, hi_mask);
191        let cmp = _mm256_cmpeq_epi8(masked, _mm256_setzero_si256());
192        if _mm256_movemask_epi8(cmp) != -1 {
193            break;
194        }
195        i += MIN_DIM_SIZE_AVX;
196    }
197    // check latin in remaining chars
198    let s_tail = &s[i..];
199    for c in s_tail.chars() {
200        if c as u32 > 0xFF {
201            return false;
202        }
203    }
204    true
205}
206
207#[cfg(target_feature = "sse2")]
208unsafe fn is_latin_sse(s: &str) -> bool {
209    let bytes = s.as_bytes();
210    let len = bytes.len();
211    let mut i = 0;
212    // SIMD skip ASCII
213    while i + MIN_DIM_SIZE_SIMD <= len {
214        let chunk = _mm_loadu_si128(bytes.as_ptr().add(i) as *const __m128i);
215        let hi_mask = _mm_set1_epi8(0x80u8 as i8);
216        let masked = _mm_and_si128(chunk, hi_mask);
217        let cmp = _mm_cmpeq_epi8(masked, _mm_setzero_si128());
218        if _mm_movemask_epi8(cmp) != 0xFFFF {
219            break;
220        }
221        i += MIN_DIM_SIZE_SIMD;
222    }
223    // check latin in remaining chars
224    let s_tail = &s[i..];
225    for c in s_tail.chars() {
226        if c as u32 > 0xFF {
227            return false;
228        }
229    }
230    true
231}
232
/// Returns `true` when every character of `s` has a code point of at most `0xFF`,
/// skipping the leading all-ASCII part of the string 16 bytes at a time with NEON loads.
///
/// # Safety
///
/// Uses NEON intrinsics and raw-pointer loads; only compiled when the `neon` target
/// feature is statically enabled.
#[cfg(target_feature = "neon")]
unsafe fn is_latin_neon(s: &str) -> bool {
    let data = s.as_bytes();
    let total = data.len();
    let mut ascii_prefix = 0;
    while ascii_prefix + MIN_DIM_SIZE_SIMD <= total {
        let lane = vld1q_u8(data.as_ptr().add(ascii_prefix));
        // The largest byte of the chunk has its top bit set iff some byte is non-ASCII.
        if vmaxvq_u8(lane) >= 0x80 {
            break;
        }
        ascii_prefix += MIN_DIM_SIZE_SIMD;
    }
    // The skipped prefix is pure ASCII, so slicing here lands on a char boundary.
    s[ascii_prefix..].chars().all(|ch| ch as u32 <= 0xFF)
}
257
/// Scalar check: `true` when no character of `s` has a code point above `0xFF`.
fn is_latin_standard(s: &str) -> bool {
    !s.chars().any(|ch| u32::from(ch) > 0xFF)
}
261
262pub fn is_latin(s: &str) -> bool {
263    #[cfg(target_arch = "x86_64")]
264    {
265        if is_x86_feature_detected!("avx")
266            && is_x86_feature_detected!("fma")
267            && s.len() >= MIN_DIM_SIZE_AVX
268        {
269            return unsafe { is_latin_avx(s) };
270        }
271    }
272
273    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
274    {
275        if is_x86_feature_detected!("sse") && s.len() >= MIN_DIM_SIZE_SIMD {
276            return unsafe { is_latin_sse(s) };
277        }
278    }
279
280    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
281    {
282        if std::arch::is_aarch64_feature_detected!("neon") && s.len() >= MIN_DIM_SIZE_SIMD {
283            return unsafe { is_latin_neon(s) };
284        }
285    }
286    is_latin_standard(s)
287}
288
289#[cfg(target_arch = "x86_64")]
290unsafe fn get_latin1_length_avx(s: &str) -> i32 {
291    let bytes = s.as_bytes();
292    let len = bytes.len();
293    let mut count = 0;
294    // SIMD skip ASCII
295    while count + MIN_DIM_SIZE_AVX <= len {
296        let chunk = _mm256_loadu_si256(bytes.as_ptr().add(count) as *const __m256i);
297        let hi_mask = _mm256_set1_epi8(0x80u8 as i8);
298        let masked = _mm256_and_si256(chunk, hi_mask);
299        let cmp = _mm256_cmpeq_epi8(masked, _mm256_setzero_si256());
300        if _mm256_movemask_epi8(cmp) != -1 {
301            break;
302        }
303        count += MIN_DIM_SIZE_AVX;
304    }
305    // check latin in remaining chars
306    let s_tail = &s[count..];
307    for c in s_tail.chars() {
308        if c as u32 > 0xFF {
309            return -1;
310        }
311        count += 1;
312    }
313    count as i32
314}
315
316#[cfg(target_feature = "sse2")]
317unsafe fn get_latin1_length_sse(s: &str) -> i32 {
318    let bytes = s.as_bytes();
319    let len = bytes.len();
320    let mut count = 0;
321    // SIMD skip ASCII
322    while count + MIN_DIM_SIZE_SIMD <= len {
323        let chunk = _mm_loadu_si128(bytes.as_ptr().add(count) as *const __m128i);
324        let hi_mask = _mm_set1_epi8(0x80u8 as i8);
325        let masked = _mm_and_si128(chunk, hi_mask);
326        let cmp = _mm_cmpeq_epi8(masked, _mm_setzero_si128());
327        if _mm_movemask_epi8(cmp) != 0xFFFF {
328            break;
329        }
330        count += MIN_DIM_SIZE_SIMD;
331    }
332    // check latin in remaining chars
333    let s_tail = &s[count..];
334    for c in s_tail.chars() {
335        if c as u32 > 0xFF {
336            return -1;
337        }
338        count += 1;
339    }
340    count as i32
341}
342
/// Returns the number of characters in `s` when all of them have a code point of at most
/// `0xFF`, or `-1` as soon as a larger one is found. The leading all-ASCII part of the
/// string is skipped 16 bytes at a time with NEON loads.
///
/// # Safety
///
/// Uses NEON intrinsics and raw-pointer loads; only compiled when the `neon` target
/// feature is statically enabled.
#[cfg(target_feature = "neon")]
unsafe fn get_latin1_length_neon(s: &str) -> i32 {
    let data = s.as_bytes();
    let total = data.len();
    let mut ascii_prefix = 0;
    while ascii_prefix + MIN_DIM_SIZE_SIMD <= total {
        let lane = vld1q_u8(data.as_ptr().add(ascii_prefix));
        // The largest byte of the chunk has its top bit set iff some byte is non-ASCII.
        if vmaxvq_u8(lane) >= 0x80 {
            break;
        }
        ascii_prefix += MIN_DIM_SIZE_SIMD;
    }
    // For the ASCII prefix the byte count equals the character count; continue counting
    // one per character from there.
    let mut latin1_len = ascii_prefix;
    for ch in s[ascii_prefix..].chars() {
        if ch as u32 > 0xFF {
            return -1;
        }
        latin1_len += 1;
    }
    latin1_len as i32
}
368
/// Scalar implementation: the number of characters in `s` when all of them have a code
/// point of at most `0xFF`, otherwise `-1`.
fn get_latin1_length_standard(s: &str) -> i32 {
    s.chars()
        .try_fold(0i32, |seen, ch| {
            if ch as u32 > 0xFF {
                None
            } else {
                Some(seen + 1)
            }
        })
        .unwrap_or(-1)
}
379
380pub fn get_latin1_length(s: &str) -> i32 {
381    #[cfg(target_arch = "x86_64")]
382    {
383        if is_x86_feature_detected!("avx")
384            && is_x86_feature_detected!("fma")
385            && s.len() >= MIN_DIM_SIZE_AVX
386        {
387            return unsafe { get_latin1_length_avx(s) };
388        }
389    }
390
391    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
392    {
393        if is_x86_feature_detected!("sse") && s.len() >= MIN_DIM_SIZE_SIMD {
394            return unsafe { get_latin1_length_sse(s) };
395        }
396    }
397
398    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
399    {
400        if std::arch::is_aarch64_feature_detected!("neon") && s.len() >= MIN_DIM_SIZE_SIMD {
401            return unsafe { get_latin1_length_neon(s) };
402        }
403    }
404    get_latin1_length_standard(s)
405}
406
#[cfg(test)]
mod latin_tests {
    // Import content from external modules
    use super::*;
    use rand::Rng;

    /// Builds a random ASCII alphanumeric string of `length` characters.
    fn generate_random_string(length: usize) -> String {
        const CHARSET: &[u8] = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        let mut rng = rand::thread_rng();

        let result: String = (0..length)
            .map(|_| {
                let idx = rng.gen_range(0..CHARSET.len());
                CHARSET[idx] as char
            })
            .collect();

        result
    }

    #[test]
    fn test_is_latin() {
        let s = generate_random_string(1000);
        let not_latin_str = generate_random_string(1000) + "abc\u{1234}";

        #[cfg(target_arch = "x86_64")]
        {
            // The AVX helper runs AVX2 instructions, so gate on AVX2 (not AVX + FMA).
            if is_x86_feature_detected!("avx2") {
                assert!(unsafe { is_latin_avx(&s) });
                assert!(!unsafe { is_latin_avx(&not_latin_str) });
            }
        }

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            // The SSE helper runs SSE2 instructions, so gate on SSE2 (not SSE).
            if is_x86_feature_detected!("sse2") {
                assert!(unsafe { is_latin_sse(&s) });
                assert!(!unsafe { is_latin_sse(&not_latin_str) });
            }
        }

        #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
        {
            if std::arch::is_aarch64_feature_detected!("neon") {
                assert!(unsafe { is_latin_neon(&s) });
                assert!(!unsafe { is_latin_neon(&not_latin_str) });
            }
        }
        assert!(is_latin_standard(&s));
        assert!(!is_latin_standard(&not_latin_str));

        // The public dispatcher must agree with the scalar reference on every platform.
        assert!(is_latin(&s));
        assert!(!is_latin(&not_latin_str));
    }
}
459
/// Finalization mix used by `murmurhash3_x64_128`: alternating xor-shift-right-by-33 and
/// wrapping multiplications by two fixed constants.
fn fmix64(mut k: u64) -> u64 {
    k ^= k >> 33;
    k = k.wrapping_mul(0xff51afd7ed558ccdu64);
    k ^= k >> 33;
    k = k.wrapping_mul(0xc4ceb9fe1a85ec53u64);
    k ^= k >> 33;

    k
}

/// Computes the 128-bit MurmurHash3 (x64 variant) of `bytes` with the given `seed` and
/// returns it as the pair `(h1, h2)`.
///
/// The input is consumed in 16-byte blocks, each read as two little-endian `u64` words,
/// followed by a tail of up to 15 bytes that is zero-extended the same way. Words are
/// assembled from bytes, so the input needs no particular alignment and the result does
/// not depend on the host byte order.
pub fn murmurhash3_x64_128(bytes: &[u8], seed: u64) -> (u64, u64) {
    const C1: u64 = 0x87c37b91114253d5;
    const C2: u64 = 0x4cf5ad432745937f;

    /// Reads up to 8 bytes as a little-endian integer, zero-extending short input.
    fn load_le(bytes: &[u8]) -> u64 {
        let mut word = [0u8; 8];
        word[..bytes.len()].copy_from_slice(bytes);
        u64::from_le_bytes(word)
    }

    /// Scrambles a word before it is folded into `h1`.
    fn mix_k1(k1: u64) -> u64 {
        k1.wrapping_mul(C1).rotate_left(31).wrapping_mul(C2)
    }

    /// Scrambles a word before it is folded into `h2`.
    fn mix_k2(k2: u64) -> u64 {
        k2.wrapping_mul(C2).rotate_left(33).wrapping_mul(C1)
    }

    let (mut h1, mut h2) = (seed, seed);

    // Body: full 16-byte blocks.
    let mut blocks = bytes.chunks_exact(16);
    for block in blocks.by_ref() {
        h1 ^= mix_k1(load_le(&block[..8]));
        h1 = h1
            .rotate_left(27)
            .wrapping_add(h2)
            .wrapping_mul(5)
            .wrapping_add(0x52dce729);

        h2 ^= mix_k2(load_le(&block[8..]));
        h2 = h2
            .rotate_left(31)
            .wrapping_add(h1)
            .wrapping_mul(5)
            .wrapping_add(0x38495ab5);
    }

    // Tail: the 0..=15 bytes left over. Bytes 8.. feed `h2`, bytes ..8 feed `h1`.
    let tail = blocks.remainder();
    if tail.len() > 8 {
        h2 ^= mix_k2(load_le(&tail[8..]));
    }
    if !tail.is_empty() {
        h1 ^= mix_k1(load_le(&tail[..tail.len().min(8)]));
    }

    // Finalization.
    let len = bytes.len() as u64;
    h1 ^= len;
    h2 ^= len;

    h1 = h1.wrapping_add(h2);
    h2 = h2.wrapping_add(h1);

    h1 = fmix64(h1);
    h2 = fmix64(h2);

    h1 = h1.wrapping_add(h2);
    h2 = h2.wrapping_add(h1);

    (h1, h2)
}
574
#[cfg(test)]
mod test_hash {
    use super::murmurhash3_x64_128;

    /// Hashes `input` with seed 0.
    fn hash0(input: &str) -> (u64, u64) {
        murmurhash3_x64_128(input.as_bytes(), 0)
    }

    #[test]
    fn test_empty_string() {
        assert_eq!(hash0(""), (0, 0));
    }

    /// Covers every tail length from 1 to 15 bytes plus one exact 16-byte block.
    #[test]
    fn test_tail_lengths() {
        let cases: [(&str, (u64, u64)); 16] = [
            ("1", (8213365047359667313, 10676604921780958775)),
            ("12", (5355690773644049813, 9855895140584599837)),
            ("123", (10978418110857903978, 4791445053355511657)),
            ("1234", (619023178690193332, 3755592904005385637)),
            ("12345", (2375712675693977547, 17382870096830835188)),
            ("123456", (16435832985690558678, 5882968373513761278)),
            ("1234567", (3232113351312417698, 4025181827808483669)),
            ("12345678", (4272337174398058908, 10464973996478965079)),
            ("123456789", (4360720697772133540, 11094893415607738629)),
            ("123456789a", (12594836289594257748, 2662019112679848245)),
            ("123456789ab", (6978636991469537545, 12243090730442643750)),
            ("123456789abc", (211890993682310078, 16480638721813329343)),
            ("123456789abcd", (12459781455342427559, 3193214493011213179)),
            ("123456789abcde", (12538342858731408721, 9820739847336455216)),
            ("123456789abcdef", (9165946068217512774, 2451472574052603025)),
            ("123456789abcdef1", (9259082041050667785, 12459473952842597282)),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(hash0(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_large_data() {
        let text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam at consequat massa. Cras eleifend pellentesque ex, at dignissim libero maximus ut. Sed eget nulla felis";
        assert_eq!(hash0(text), (9455322759164802692, 17863277201603478371));
    }
}
654
#[cfg(test)]
mod case_tests {
    use super::*;

    #[test]
    fn test_to_snake_case() {
        // (input, expected snake_case output)
        let cases = [
            ("camelCase", "camel_case"),
            ("PascalCase", "pascal_case"),
            ("HTTPRequest", "http_request"),
            ("simpleTest", "simple_test"),
            ("already_snake", "already_snake"),
            ("ABC", "abc"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(to_snake_case(input), expected);
        }
    }

    #[test]
    fn test_to_camel_case() {
        // (input, expected lowerCamelCase output)
        let cases = [
            ("snake_case", "snakeCase"),
            ("simple_test", "simpleTest"),
            ("already", "already"),
            ("a_b_c", "aBC"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(to_camel_case(input), expected);
        }
    }
}
677
678pub mod buffer_rw_string {
679    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
680    use std::arch::aarch64::*;
681    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
682    use std::arch::x86_64::*;
683    #[cfg(all(
684        any(target_arch = "x86", target_arch = "x86_64"),
685        target_feature = "sse2",
686        not(target_feature = "avx2")
687    ))]
688    use std::arch::x86_64::*;
689
690    use crate::buffer::{Reader, Writer};
691    use crate::error::Error;
692
693    #[inline]
694    pub fn write_latin1_standard(writer: &mut Writer, s: &str) {
695        for c in s.chars() {
696            let b = c as u32;
697            assert!(b <= 0xFF, "Non-Latin1 character found");
698            writer.write_u8(b as u8);
699        }
700    }
701
702    #[inline(always)]
703    pub fn write_latin1_string(writer: &mut Writer, s: &str) {
704        if s.len() < 128 {
705            // Fast path for small buffers
706            let bytes = s.as_bytes();
707            // CRITICAL: Only safe if ASCII (UTF-8 == Latin1 for ASCII)
708            let is_ascii = bytes.iter().all(|&b| b < 0x80);
709            if is_ascii {
710                writer.bf.reserve(s.len());
711                writer.bf.extend_from_slice(bytes);
712            } else {
713                // Non-ASCII: must iterate chars to extract Latin1 byte values
714                writer.bf.reserve(s.len());
715                for c in s.chars() {
716                    let v = c as u32;
717                    assert!(v <= 0xFF, "Non-Latin1 character found");
718                    writer.bf.push(v as u8);
719                }
720            }
721            return;
722        }
723        write_latin1_simd(writer, s);
724    }
725
726    #[inline]
727    pub fn write_utf8_standard(writer: &mut Writer, s: &str) {
728        let bytes = s.as_bytes();
729        writer.bf.extend_from_slice(bytes);
730    }
731
732    #[inline]
733    pub fn write_utf16_standard(writer: &mut Writer, utf16: &[u16]) {
734        #[cfg(target_endian = "little")]
735        {
736            let total_bytes = utf16.len() * 2;
737            let old_len = writer.bf.len();
738            writer.bf.reserve(total_bytes);
739            unsafe {
740                let dest = writer.bf.as_mut_ptr().add(old_len);
741                let src = utf16.as_ptr() as *const u8;
742                std::ptr::copy_nonoverlapping(src, dest, total_bytes);
743                writer.bf.set_len(old_len + total_bytes);
744            }
745        }
746        #[cfg(target_endian = "big")]
747        {
748            let total_bytes = utf16.len() * 2;
749            let old_len = writer.bf.len();
750            writer.bf.reserve(total_bytes);
751            unsafe {
752                let dest = writer.bf.as_mut_ptr().add(old_len);
753                // Need to swap bytes for each u16 to little-endian
754                for (i, &unit) in utf16.iter().enumerate() {
755                    let swapped = unit.swap_bytes();
756                    let ptr = dest.add(i * 2) as *mut u16;
757                    std::ptr::write_unaligned(ptr, swapped);
758                }
759                writer.bf.set_len(old_len + total_bytes);
760            }
761        }
762    }
763
764    #[inline]
765    pub fn read_latin1_standard(reader: &mut Reader, len: usize) -> Result<String, Error> {
766        let slice = reader.sub_slice(reader.get_cursor(), reader.get_cursor() + len)?;
767        let result: String = slice.iter().map(|&b| b as char).collect();
768        reader.move_next(len);
769        Ok(result)
770    }
771
772    #[inline]
773    pub fn read_utf8_standard(reader: &mut Reader, len: usize) -> Result<String, Error> {
774        let slice = reader.sub_slice(reader.get_cursor(), reader.get_cursor() + len)?;
775        // Rust is the only runtime that checks UTF-8 string payloads by default; borrow first so
776        // the check adds no temporary Vec before constructing the final String.
777        let value = std::str::from_utf8(slice)
778            .map_err(|_| Error::encoding_error("invalid UTF-8 string"))?
779            .to_owned();
780        reader.move_next(len);
781        Ok(value)
782    }
783
784    #[inline]
785    pub fn read_utf16_standard(reader: &mut Reader, len: usize) -> Result<String, Error> {
786        if len % 2 != 0 {
787            return Err(Error::encoding_error("UTF-16 length must be even"));
788        }
789        unsafe {
790            let slice = std::slice::from_raw_parts(reader.bf.as_ptr().add(reader.cursor), len);
791            let units: Vec<u16> = slice
792                .chunks_exact(2)
793                .map(|c| u16::from_le_bytes([c[0], c[1]]))
794                .collect();
795            reader.move_next(len);
796            Ok(String::from_utf16_lossy(&units))
797        }
798    }
799
800    #[inline]
801    fn is_ascii_bytes(bytes: &[u8]) -> bool {
802        let len = bytes.len();
803        let mut i = 0;
804
805        #[cfg(target_arch = "x86_64")]
806        unsafe {
807            if is_x86_feature_detected!("avx2") && len >= 32 {
808                while i + 32 <= len {
809                    let chunk = _mm256_loadu_si256(bytes.as_ptr().add(i) as *const __m256i);
810                    let mask = _mm256_movemask_epi8(chunk);
811                    if mask != 0 {
812                        return false;
813                    }
814                    i += 32;
815                }
816            }
817        }
818
819        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
820        unsafe {
821            if is_x86_feature_detected!("sse2") && len >= 16 {
822                while i + 16 <= len {
823                    let chunk = _mm_loadu_si128(bytes.as_ptr().add(i) as *const __m128i);
824                    let mask = _mm_movemask_epi8(chunk);
825                    if mask != 0 {
826                        return false;
827                    }
828                    i += 16;
829                }
830            }
831        }
832
833        #[cfg(target_arch = "aarch64")]
834        unsafe {
835            if std::arch::is_aarch64_feature_detected!("neon") && len >= 16 {
836                while i + 16 <= len {
837                    let chunk = vld1q_u8(bytes.as_ptr().add(i));
838                    if vmaxvq_u8(chunk) >= 0x80 {
839                        return false;
840                    }
841                    i += 16;
842                }
843            }
844        }
845
846        // Scalar fallback
847        bytes[i..].iter().all(|&b| b < 0x80)
848    }
849
850    #[inline]
851    pub fn write_latin1_simd(writer: &mut Writer, s: &str) {
852        if s.is_empty() {
853            return;
854        }
855
856        let bytes = s.as_bytes();
857
858        // CRITICAL OPTIMIZATION: For ASCII strings, UTF-8 bytes == Latin1 bytes
859        // Check if all ASCII using SIMD
860        if is_ascii_bytes(bytes) {
861            // Zero-copy fast path: direct write
862            let len = bytes.len();
863            writer.bf.reserve(len);
864            writer.bf.extend_from_slice(bytes);
865        } else {
866            // Non-ASCII: Must iterate chars to extract Latin1 byte values
867            // Example: 'À' in Rust String is UTF-8 [0xC3, 0x80] but Latin1 is [0xC0]
868            let mut buf: Vec<u8> = Vec::with_capacity(s.len());
869            for c in s.chars() {
870                let v = c as u32;
871                assert!(v <= 0xFF, "Non-Latin1 character found");
872                buf.push(v as u8);
873            }
874            let len = buf.len();
875            writer.bf.reserve(len);
876            writer.bf.extend_from_slice(&buf);
877        }
878    }
879
880    #[inline]
881    pub fn read_latin1_simd(reader: &mut Reader, len: usize) -> Result<String, Error> {
882        if len == 0 {
883            return Ok(String::new());
884        }
885        let src = reader.sub_slice(reader.get_cursor(), reader.get_cursor() + len)?;
886
887        // Pessimistic allocation: Latin1 0x80-0xFF expands to 2 bytes in UTF-8
888        let mut out: Vec<u8> = Vec::with_capacity(len * 2);
889
890        unsafe {
891            let out_ptr = out.as_mut_ptr();
892            let mut out_len = 0usize;
893            let mut i = 0usize;
894
895            // ---- AVX2 fast-path: process 32 ASCII bytes at once ----
896            #[cfg(target_arch = "x86_64")]
897            {
898                if std::arch::is_x86_feature_detected!("avx2") {
899                    use std::arch::x86_64::*;
900                    while i + 32 <= len {
901                        let ptr = src.as_ptr().add(i) as *const __m256i;
902                        let chunk = _mm256_loadu_si256(ptr);
903                        let mask = _mm256_movemask_epi8(chunk);
904                        if mask == 0 {
905                            // All ASCII: direct copy (no conversion needed)
906                            _mm256_storeu_si256(out_ptr.add(out_len) as *mut __m256i, chunk);
907                            out_len += 32;
908                            i += 32;
909                            continue;
910                        } else {
911                            // Contains Latin1 bytes, break to scalar
912                            break;
913                        }
914                    }
915                }
916            }
917
918            // ---- SSE2 fast-path: process 16 ASCII bytes at once ----
919            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
920            {
921                if std::arch::is_x86_feature_detected!("sse2") {
922                    use std::arch::x86_64::*;
923                    while i + 16 <= len {
924                        let ptr = src.as_ptr().add(i) as *const __m128i;
925                        let chunk = _mm_loadu_si128(ptr);
926                        let mask = _mm_movemask_epi8(chunk);
927                        if mask == 0 {
928                            // All ASCII: direct copy
929                            _mm_storeu_si128(out_ptr.add(out_len) as *mut __m128i, chunk);
930                            out_len += 16;
931                            i += 16;
932                            continue;
933                        } else {
934                            break;
935                        }
936                    }
937                }
938            }
939
940            // ---- NEON fast-path: process 16 ASCII bytes at once ----
941            #[cfg(target_arch = "aarch64")]
942            {
943                if std::arch::is_aarch64_feature_detected!("neon") {
944                    use std::arch::aarch64::*;
945                    while i + 16 <= len {
946                        let ptr = src.as_ptr().add(i);
947                        let v = vld1q_u8(ptr);
948                        // Check if any byte >= 0x80
949                        if vmaxvq_u8(v) < 0x80 {
950                            // All ASCII: direct copy
951                            vst1q_u8(out_ptr.add(out_len), v);
952                            out_len += 16;
953                            i += 16;
954                            continue;
955                        } else {
956                            break;
957                        }
958                    }
959                }
960            }
961
962            // ---- Scalar fallback: convert Latin1 -> UTF-8 ----
963            // ASCII (0x00-0x7F): copy as-is
964            // Latin1 (0x80-0xFF): encode as 2-byte UTF-8
965            while i < len {
966                let b = *src.get_unchecked(i);
967                if b < 0x80 {
968                    *out_ptr.add(out_len) = b;
969                    out_len += 1;
970                } else {
971                    // Latin1 byte 0x80-0xFF -> UTF-8 encoding
972                    // Example: 0xC0 (À) -> [0xC3, 0x80]
973                    *out_ptr.add(out_len) = 0xC0 | (b >> 6);
974                    *out_ptr.add(out_len + 1) = 0x80 | (b & 0x3F);
975                    out_len += 2;
976                }
977                i += 1;
978            }
979
980            out.set_len(out_len);
981        }
982        reader.move_next(len);
983        Ok(unsafe { String::from_utf8_unchecked(out) })
984    }
985
986    #[cfg(test)]
987    mod tests {
988        use super::*;
989        use crate::buffer::{Reader, Writer};
990
991        #[test]
992        fn test_latin1() {
993            let samples = [
994                "Hello World!",
995                "Rusty Café",
996                "1234567890",
997                "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝ",
998            ];
999
1000            for s in samples {
1001                let mut buffer = vec![];
1002                let mut writer = Writer::from_buffer(&mut buffer);
1003                write_latin1_simd(&mut writer, s);
1004                write_latin1_simd(&mut writer, s);
1005                let bytes = &*writer.dump();
1006                let bytes_len = bytes.len() / 2;
1007                let mut reader = Reader::new(bytes);
1008                assert_eq!(read_latin1_standard(&mut reader, bytes_len).unwrap(), s);
1009                assert_eq!(read_latin1_standard(&mut reader, bytes_len).unwrap(), s);
1010
1011                let mut buffer = vec![];
1012                let mut writer = Writer::from_buffer(&mut buffer);
1013                write_latin1_standard(&mut writer, s);
1014                write_latin1_standard(&mut writer, s);
1015                let bytes = &*writer.dump();
1016                let bytes_len = bytes.len() / 2;
1017                let mut reader = Reader::new(bytes);
1018                assert_eq!(read_latin1_simd(&mut reader, bytes_len).unwrap(), s);
1019                assert_eq!(read_latin1_simd(&mut reader, bytes_len).unwrap(), s);
1020            }
1021        }
1022
1023        #[test]
1024        fn test_utf8() {
1025            let samples = [
1026                "hello",
1027                "rust语言",
1028                "你好，世界",
1029                "emoji 😀😃😄😁",
1030                "mixed ASCII + 中文 + emoji 😁",
1031            ];
1032
1033            for s in samples {
1034                let bytes_len = s.len();
1035
1036                let mut buffer = vec![];
1037                let mut writer = Writer::from_buffer(&mut buffer);
1038                write_utf8_standard(&mut writer, s);
1039                write_utf8_standard(&mut writer, s);
1040                let bytes = &*writer.dump();
1041                let mut reader = Reader::new(bytes);
1042                assert_eq!(read_utf8_standard(&mut reader, bytes_len).unwrap(), s);
1043                assert_eq!(read_utf8_standard(&mut reader, bytes_len).unwrap(), s);
1044            }
1045        }
1046
1047        #[test]
1048        fn test_utf16() {
1049            let samples = [
1050                "hello",
1051                "rust语言",
1052                "你好，世界",
1053                "emoji 😀😃😄😁",
1054                "混合文字 + emoji 🐍💻🦀",
1055            ];
1056            for s in samples {
1057                let utf16: Vec<u16> = s.encode_utf16().collect();
1058                let bytes_len = utf16.len() * 2;
1059
1060                let mut buffer = vec![];
1061                let mut writer = Writer::from_buffer(&mut buffer);
1062                write_utf16_standard(&mut writer, &utf16);
1063                write_utf16_standard(&mut writer, &utf16);
1064
1065                let mut buffer = vec![];
1066                let mut writer = Writer::from_buffer(&mut buffer);
1067                write_utf16_standard(&mut writer, &utf16);
1068                write_utf16_standard(&mut writer, &utf16);
1069                let bytes = &*writer.dump();
1070                let mut reader = Reader::new(bytes);
1071                assert_eq!(read_utf16_standard(&mut reader, bytes_len).unwrap(), s);
1072                assert_eq!(read_utf16_standard(&mut reader, bytes_len).unwrap(), s);
1073            }
1074        }
1075    }
1076}
fory_core/util/string_util.rs

fory_core/util/
string_util.rs