1use std::mem;
19use std::ptr;
20
/// Exclusive upper bound (`2^31 - 1`) that `compute_string_hash` keeps its
/// running hash below: the hash is repeatedly divided while it is `>=` this value.
const MAX_HASH32: u64 = (1 << 31) - 1;
22
/// Tag naming a string encoding, with explicit integer discriminants.
///
/// NOTE(review): presumably the discriminant is stored next to encoded string
/// data so a reader knows how to decode it — confirm against the callers.
#[allow(dead_code)]
pub enum StringFlag {
    /// Latin-1 encoded text (discriminant `0`).
    LATIN1 = 0,
    /// UTF-8 encoded text (discriminant `1`).
    UTF8 = 1,
}
28
/// Returns `value` with its high and low bytes exchanged.
fn swap_endian(value: u16) -> u16 {
    value.swap_bytes()
}
33
34pub fn to_utf8(utf16: &[u16], is_little_endian: bool) -> Result<Vec<u8>, String> {
36 let mut utf8_bytes: Vec<u8> = Vec::with_capacity(utf16.len() * 3);
39 let ptr = utf8_bytes.as_mut_ptr();
40 let mut offset = 0;
41 let mut iter = utf16.iter();
42 while let Some(&wc) = iter.next() {
43 let wc = if is_little_endian {
44 swap_endian(wc)
45 } else {
46 wc
47 };
48 match wc {
49 code_point if code_point < 0x80 => {
50 unsafe {
51 ptr.add(offset).write(code_point as u8);
52 }
53 offset += 1;
54 }
55 code_point if code_point < 0x800 => {
56 let bytes = [
57 ((code_point >> 6) & 0b1_1111) as u8 | 0b1100_0000,
58 (code_point & 0b11_1111) as u8 | 0b1000_0000,
59 ];
60 unsafe {
61 ptr::copy_nonoverlapping(bytes.as_ptr(), ptr.add(offset), 2);
62 }
63 offset += 2;
64 }
65 wc1 if (0xd800..=0xdbff).contains(&wc1) => {
66 if let Some(&wc2) = iter.next() {
67 let wc2 = if is_little_endian {
68 swap_endian(wc2)
69 } else {
70 wc2
71 };
72 if !(0xdc00..=0xdfff).contains(&wc2) {
73 return Err("Invalid UTF-16 string: wrong surrogate pair".to_string());
74 }
75 let code_point =
76 ((((wc1 as u32) - 0xd800) << 10) | ((wc2 as u32) - 0xdc00)) + 0x10000;
77 let bytes = [
78 ((code_point >> 18) & 0b111) as u8 | 0b1111_0000,
79 ((code_point >> 12) & 0b11_1111) as u8 | 0b1000_0000,
80 ((code_point >> 6) & 0b11_1111) as u8 | 0b1000_0000,
81 (code_point & 0b11_1111) as u8 | 0b1000_0000,
82 ];
83 unsafe {
84 ptr::copy_nonoverlapping(bytes.as_ptr(), ptr.add(offset), 4);
85 }
86 offset += 4;
87 } else {
88 return Err("Invalid UTF-16 string: missing surrogate pair".to_string());
89 }
90 }
91 _ => {
92 let bytes = [
93 ((wc >> 12) | 0b1110_0000) as u8,
94 ((wc >> 6) & 0b11_1111) as u8 | 0b1000_0000,
95 (wc & 0b11_1111) as u8 | 0b1000_0000,
96 ];
97 unsafe {
98 ptr::copy_nonoverlapping(bytes.as_ptr(), ptr.add(offset), 3);
99 }
100 offset += 3;
101 }
102 }
103 }
104 unsafe {
105 utf8_bytes.set_len(offset);
106 }
107 Ok(utf8_bytes)
108}
109
/// Converts a camelCase / PascalCase identifier to snake_case.
///
/// Every ASCII uppercase letter is lower-cased. An underscore is inserted in
/// front of it unless it is the first character, or it sits inside a run of
/// capitals (previous character uppercase and next character uppercase or
/// missing). This keeps acronyms together: `HTTPRequest` -> `http_request`.
/// All other characters are copied unchanged.
pub fn to_snake_case(name: &str) -> String {
    let letters: Vec<char> = name.chars().collect();
    let mut snake = String::with_capacity(name.len() + 4);

    for (idx, &ch) in letters.iter().enumerate() {
        if !ch.is_ascii_uppercase() {
            snake.push(ch);
            continue;
        }
        if idx > 0 {
            let follows_upper = letters[idx - 1].is_ascii_uppercase();
            let precedes_non_upper = letters
                .get(idx + 1)
                .is_some_and(|next| !next.is_ascii_uppercase());
            if !follows_upper || precedes_non_upper {
                snake.push('_');
            }
        }
        snake.push(ch.to_ascii_lowercase());
    }
    snake
}
131
/// Converts a snake_case identifier to camelCase.
///
/// Underscores are dropped and the next non-underscore character is
/// ASCII-upper-cased. Consecutive underscores collapse, and a trailing
/// underscore simply disappears.
pub fn to_camel_case(name: &str) -> String {
    let mut camel = String::with_capacity(name.len());
    let mut upper_pending = false;

    for ch in name.chars() {
        match (ch, upper_pending) {
            ('_', _) => upper_pending = true,
            (_, true) => {
                camel.push(ch.to_ascii_uppercase());
                upper_pending = false;
            }
            (_, false) => camel.push(ch),
        }
    }
    camel
}
149
150#[allow(dead_code)]
151pub fn compute_string_hash(s: &str) -> u32 {
152 let mut hash: u64 = 17;
153 s.as_bytes().iter().for_each(|b| {
154 hash = (hash * 31) + (*b as u64);
155 while hash >= MAX_HASH32 {
156 hash /= 7;
157 }
158 });
159 hash as u32
160}
161
162#[cfg(target_feature = "neon")]
163use std::arch::aarch64::*;
164
165#[cfg(target_feature = "avx2")]
166use std::arch::x86_64::*;
167
168#[cfg(target_feature = "sse2")]
169use std::arch::x86_64::*;
170
/// Bytes consumed per iteration by the AVX helpers in this file (one 256-bit
/// load); the dispatchers also use it as the minimum string length for
/// choosing the AVX path.
#[cfg(target_arch = "x86_64")]
pub const MIN_DIM_SIZE_AVX: usize = 32;

/// Bytes consumed per iteration by the SSE and NEON helpers in this file (one
/// 128-bit load); the dispatchers also use it as the minimum string length for
/// choosing those paths.
#[cfg(any(
    target_arch = "x86",
    target_arch = "x86_64",
    all(target_arch = "aarch64", target_feature = "neon")
))]
pub const MIN_DIM_SIZE_SIMD: usize = 16;
180
181#[cfg(target_arch = "x86_64")]
182unsafe fn is_latin_avx(s: &str) -> bool {
183 let bytes = s.as_bytes();
184 let len = bytes.len();
185 let mut i = 0;
186 while i + MIN_DIM_SIZE_AVX <= len {
188 let chunk = _mm256_loadu_si256(bytes.as_ptr().add(i) as *const __m256i);
189 let hi_mask = _mm256_set1_epi8(0x80u8 as i8);
190 let masked = _mm256_and_si256(chunk, hi_mask);
191 let cmp = _mm256_cmpeq_epi8(masked, _mm256_setzero_si256());
192 if _mm256_movemask_epi8(cmp) != -1 {
193 break;
194 }
195 i += MIN_DIM_SIZE_AVX;
196 }
197 let s_tail = &s[i..];
199 for c in s_tail.chars() {
200 if c as u32 > 0xFF {
201 return false;
202 }
203 }
204 true
205}
206
207#[cfg(target_feature = "sse2")]
208unsafe fn is_latin_sse(s: &str) -> bool {
209 let bytes = s.as_bytes();
210 let len = bytes.len();
211 let mut i = 0;
212 while i + MIN_DIM_SIZE_SIMD <= len {
214 let chunk = _mm_loadu_si128(bytes.as_ptr().add(i) as *const __m128i);
215 let hi_mask = _mm_set1_epi8(0x80u8 as i8);
216 let masked = _mm_and_si128(chunk, hi_mask);
217 let cmp = _mm_cmpeq_epi8(masked, _mm_setzero_si128());
218 if _mm_movemask_epi8(cmp) != 0xFFFF {
219 break;
220 }
221 i += MIN_DIM_SIZE_SIMD;
222 }
223 let s_tail = &s[i..];
225 for c in s_tail.chars() {
226 if c as u32 > 0xFF {
227 return false;
228 }
229 }
230 true
231}
232
/// Reports whether every character of `s` is in the Latin-1 range
/// (code point `<= 0xFF`), skipping the leading ASCII part 16 bytes at a time
/// with NEON.
///
/// # Safety
///
/// Uses NEON intrinsics; the function is only compiled when NEON is enabled
/// for the target.
#[cfg(target_feature = "neon")]
unsafe fn is_latin_neon(s: &str) -> bool {
    let data = s.as_bytes();
    let mut offset = 0;

    // `offset` never exceeds `data.len()`, so the subtraction cannot underflow.
    while data.len() - offset >= MIN_DIM_SIZE_SIMD {
        let lanes = vld1q_u8(data.as_ptr().add(offset));
        // The largest byte has its high bit set exactly when some byte does.
        if vmaxvq_u8(lanes) >= 0x80 {
            break;
        }
        offset += MIN_DIM_SIZE_SIMD;
    }

    // Everything before `offset` is ASCII, so slicing there is on a char boundary.
    !s[offset..].chars().any(|c| c as u32 > 0xFF)
}
257
/// Scalar check: `true` when no character of `s` has a code point above `0xFF`.
fn is_latin_standard(s: &str) -> bool {
    !s.chars().map(u32::from).any(|code_point| code_point > 0xFF)
}
261
262pub fn is_latin(s: &str) -> bool {
263 #[cfg(target_arch = "x86_64")]
264 {
265 if is_x86_feature_detected!("avx")
266 && is_x86_feature_detected!("fma")
267 && s.len() >= MIN_DIM_SIZE_AVX
268 {
269 return unsafe { is_latin_avx(s) };
270 }
271 }
272
273 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
274 {
275 if is_x86_feature_detected!("sse") && s.len() >= MIN_DIM_SIZE_SIMD {
276 return unsafe { is_latin_sse(s) };
277 }
278 }
279
280 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
281 {
282 if std::arch::is_aarch64_feature_detected!("neon") && s.len() >= MIN_DIM_SIZE_SIMD {
283 return unsafe { is_latin_neon(s) };
284 }
285 }
286 is_latin_standard(s)
287}
288
289#[cfg(target_arch = "x86_64")]
290unsafe fn get_latin1_length_avx(s: &str) -> i32 {
291 let bytes = s.as_bytes();
292 let len = bytes.len();
293 let mut count = 0;
294 while count + MIN_DIM_SIZE_AVX <= len {
296 let chunk = _mm256_loadu_si256(bytes.as_ptr().add(count) as *const __m256i);
297 let hi_mask = _mm256_set1_epi8(0x80u8 as i8);
298 let masked = _mm256_and_si256(chunk, hi_mask);
299 let cmp = _mm256_cmpeq_epi8(masked, _mm256_setzero_si256());
300 if _mm256_movemask_epi8(cmp) != -1 {
301 break;
302 }
303 count += MIN_DIM_SIZE_AVX;
304 }
305 let s_tail = &s[count..];
307 for c in s_tail.chars() {
308 if c as u32 > 0xFF {
309 return -1;
310 }
311 count += 1;
312 }
313 count as i32
314}
315
316#[cfg(target_feature = "sse2")]
317unsafe fn get_latin1_length_sse(s: &str) -> i32 {
318 let bytes = s.as_bytes();
319 let len = bytes.len();
320 let mut count = 0;
321 while count + MIN_DIM_SIZE_SIMD <= len {
323 let chunk = _mm_loadu_si128(bytes.as_ptr().add(count) as *const __m128i);
324 let hi_mask = _mm_set1_epi8(0x80u8 as i8);
325 let masked = _mm_and_si128(chunk, hi_mask);
326 let cmp = _mm_cmpeq_epi8(masked, _mm_setzero_si128());
327 if _mm_movemask_epi8(cmp) != 0xFFFF {
328 break;
329 }
330 count += MIN_DIM_SIZE_SIMD;
331 }
332 let s_tail = &s[count..];
334 for c in s_tail.chars() {
335 if c as u32 > 0xFF {
336 return -1;
337 }
338 count += 1;
339 }
340 count as i32
341}
342
/// Returns the number of characters in `s` when all of them are in the
/// Latin-1 range (code point `<= 0xFF`), or `-1` as soon as one is not.
///
/// The leading ASCII part is counted 16 bytes at a time with NEON; the rest
/// is counted character by character.
///
/// # Safety
///
/// Uses NEON intrinsics; the function is only compiled when NEON is enabled
/// for the target.
#[cfg(target_feature = "neon")]
unsafe fn get_latin1_length_neon(s: &str) -> i32 {
    let data = s.as_bytes();
    let mut ascii_prefix = 0;

    // `ascii_prefix` never exceeds `data.len()`, so this cannot underflow.
    while data.len() - ascii_prefix >= MIN_DIM_SIZE_SIMD {
        let lanes = vld1q_u8(data.as_ptr().add(ascii_prefix));
        // The largest byte has its high bit set exactly when some byte does.
        if vmaxvq_u8(lanes) >= 0x80 {
            break;
        }
        ascii_prefix += MIN_DIM_SIZE_SIMD;
    }

    // The ASCII prefix has one character per byte; continue from there.
    let mut total = ascii_prefix;
    for ch in s[ascii_prefix..].chars() {
        if ch as u32 > 0xFF {
            return -1;
        }
        total += 1;
    }
    total as i32
}
368
/// Scalar version: counts the characters of `s`, returning `-1` as soon as a
/// character with a code point above `0xFF` is encountered.
fn get_latin1_length_standard(s: &str) -> i32 {
    s.chars()
        .try_fold(0i32, |seen, ch| {
            if ch as u32 > 0xFF {
                None
            } else {
                Some(seen + 1)
            }
        })
        .unwrap_or(-1)
}
379
380pub fn get_latin1_length(s: &str) -> i32 {
381 #[cfg(target_arch = "x86_64")]
382 {
383 if is_x86_feature_detected!("avx")
384 && is_x86_feature_detected!("fma")
385 && s.len() >= MIN_DIM_SIZE_AVX
386 {
387 return unsafe { get_latin1_length_avx(s) };
388 }
389 }
390
391 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
392 {
393 if is_x86_feature_detected!("sse") && s.len() >= MIN_DIM_SIZE_SIMD {
394 return unsafe { get_latin1_length_sse(s) };
395 }
396 }
397
398 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
399 {
400 if std::arch::is_aarch64_feature_detected!("neon") && s.len() >= MIN_DIM_SIZE_SIMD {
401 return unsafe { get_latin1_length_neon(s) };
402 }
403 }
404 get_latin1_length_standard(s)
405}
406
/// Cross-checks every available `is_latin_*` implementation on a random ASCII
/// string and on the same kind of string with a non-Latin-1 character appended.
#[cfg(test)]
mod latin_tests {
    use super::*;
    use rand::Rng;

    /// Builds a random alphanumeric ASCII string of `length` characters.
    fn generate_random_string(length: usize) -> String {
        const CHARSET: &[u8] = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
        let mut rng = rand::thread_rng();

        let result: String = (0..length)
            .map(|_| {
                let idx = rng.gen_range(0..CHARSET.len());
                CHARSET[idx] as char
            })
            .collect();

        result
    }

    #[test]
    fn test_is_latin() {
        let s = generate_random_string(1000);
        let not_latin_str = generate_random_string(1000) + "abc\u{1234}";

        #[cfg(target_arch = "x86_64")]
        {
            // The AVX helper executes AVX2 instructions, so that is the
            // feature that has to be present before calling it.
            if is_x86_feature_detected!("avx2") {
                assert!(unsafe { is_latin_avx(&s) });
                assert!(!unsafe { is_latin_avx(&not_latin_str) });
            }
        }

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            if is_x86_feature_detected!("sse2") && s.len() >= MIN_DIM_SIZE_SIMD {
                assert!(unsafe { is_latin_sse(&s) });
                assert!(!unsafe { is_latin_sse(&not_latin_str) });
            }
        }

        #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
        {
            if std::arch::is_aarch64_feature_detected!("neon") && s.len() >= MIN_DIM_SIZE_SIMD {
                assert!(unsafe { is_latin_neon(&s) });
                assert!(!unsafe { is_latin_neon(&not_latin_str) });
            }
        }
        assert!(is_latin_standard(&s));
        assert!(!is_latin_standard(&not_latin_str));
    }
}
459
/// 64-bit finalization mix: two rounds of "xor with self shifted right by 33,
/// then wrapping-multiply by a constant", followed by one last xor-shift.
fn fmix64(mut k: u64) -> u64 {
    for multiplier in [0xff51afd7ed558ccd_u64, 0xc4ceb9fe1a85ec53_u64] {
        k ^= k >> 33;
        k = k.wrapping_mul(multiplier);
    }
    k ^ (k >> 33)
}
469
470pub fn murmurhash3_x64_128(bytes: &[u8], seed: u64) -> (u64, u64) {
471 let c1 = 0x87c37b91114253d5u64;
472 let c2 = 0x4cf5ad432745937fu64;
473 let read_size = 16;
474 let len = bytes.len() as u64;
475 let block_count = len / read_size;
476
477 let (mut h1, mut h2) = (seed, seed);
478
479 for i in 0..block_count as usize {
480 let b64: &[u64] = unsafe { mem::transmute(bytes) };
481 let (mut k1, mut k2) = (b64[i * 2], b64[i * 2 + 1]);
482
483 k1 = k1.wrapping_mul(c1);
484 k1 = k1.rotate_left(31);
485 k1 = k1.wrapping_mul(c2);
486 h1 ^= k1;
487
488 h1 = h1.rotate_left(27);
489 h1 = h1.wrapping_add(h2);
490 h1 = h1.wrapping_mul(5);
491 h1 = h1.wrapping_add(0x52dce729);
492
493 k2 = k2.wrapping_mul(c2);
494 k2 = k2.rotate_left(33);
495 k2 = k2.wrapping_mul(c1);
496 h2 ^= k2;
497
498 h2 = h2.rotate_left(31);
499 h2 = h2.wrapping_add(h1);
500 h2 = h2.wrapping_mul(5);
501 h2 = h2.wrapping_add(0x38495ab5);
502 }
503 let (mut k1, mut k2) = (0u64, 0u64);
504
505 if len & 15 == 15 {
506 k2 ^= (bytes[(block_count * read_size) as usize + 14] as u64) << 48;
507 }
508 if len & 15 >= 14 {
509 k2 ^= (bytes[(block_count * read_size) as usize + 13] as u64) << 40;
510 }
511 if len & 15 >= 13 {
512 k2 ^= (bytes[(block_count * read_size) as usize + 12] as u64) << 32;
513 }
514 if len & 15 >= 12 {
515 k2 ^= (bytes[(block_count * read_size) as usize + 11] as u64) << 24;
516 }
517 if len & 15 >= 11 {
518 k2 ^= (bytes[(block_count * read_size) as usize + 10] as u64) << 16;
519 }
520 if len & 15 >= 10 {
521 k2 ^= (bytes[(block_count * read_size) as usize + 9] as u64) << 8;
522 }
523 if len & 15 >= 9 {
524 k2 ^= bytes[(block_count * read_size) as usize + 8] as u64;
525 k2 = k2.wrapping_mul(c2);
526 k2 = k2.rotate_left(33);
527 k2 = k2.wrapping_mul(c1);
528 h2 ^= k2;
529 }
530
531 if len & 15 >= 8 {
532 k1 ^= (bytes[(block_count * read_size) as usize + 7] as u64) << 56;
533 }
534 if len & 15 >= 7 {
535 k1 ^= (bytes[(block_count * read_size) as usize + 6] as u64) << 48;
536 }
537 if len & 15 >= 6 {
538 k1 ^= (bytes[(block_count * read_size) as usize + 5] as u64) << 40;
539 }
540 if len & 15 >= 5 {
541 k1 ^= (bytes[(block_count * read_size) as usize + 4] as u64) << 32;
542 }
543 if len & 15 >= 4 {
544 k1 ^= (bytes[(block_count * read_size) as usize + 3] as u64) << 24;
545 }
546 if len & 15 >= 3 {
547 k1 ^= (bytes[(block_count * read_size) as usize + 2] as u64) << 16;
548 }
549 if len & 15 >= 2 {
550 k1 ^= (bytes[(block_count * read_size) as usize + 1] as u64) << 8;
551 }
552 if len & 15 >= 1 {
553 k1 ^= bytes[(block_count * read_size) as usize] as u64;
554 k1 = k1.wrapping_mul(c1);
555 k1 = k1.rotate_left(31);
556 k1 = k1.wrapping_mul(c2);
557 h1 ^= k1;
558 }
559
560 h1 ^= bytes.len() as u64;
561 h2 ^= bytes.len() as u64;
562
563 h1 = h1.wrapping_add(h2);
564 h2 = h2.wrapping_add(h1);
565
566 h1 = fmix64(h1);
567 h2 = fmix64(h2);
568
569 h1 = h1.wrapping_add(h2);
570 h2 = h2.wrapping_add(h1);
571
572 (h1, h2)
573}
574
/// Known-answer tests for `murmurhash3_x64_128` with seed 0.
#[cfg(test)]
mod test_hash {
    use super::murmurhash3_x64_128;

    /// Inputs of length 1..=16 (every tail length plus one full block) with
    /// their expected `(h1, h2)` digests.
    const TAIL_CASES: [(&str, (u64, u64)); 16] = [
        ("1", (8213365047359667313, 10676604921780958775)),
        ("12", (5355690773644049813, 9855895140584599837)),
        ("123", (10978418110857903978, 4791445053355511657)),
        ("1234", (619023178690193332, 3755592904005385637)),
        ("12345", (2375712675693977547, 17382870096830835188)),
        ("123456", (16435832985690558678, 5882968373513761278)),
        ("1234567", (3232113351312417698, 4025181827808483669)),
        ("12345678", (4272337174398058908, 10464973996478965079)),
        ("123456789", (4360720697772133540, 11094893415607738629)),
        ("123456789a", (12594836289594257748, 2662019112679848245)),
        ("123456789ab", (6978636991469537545, 12243090730442643750)),
        ("123456789abc", (211890993682310078, 16480638721813329343)),
        ("123456789abcd", (12459781455342427559, 3193214493011213179)),
        ("123456789abcde", (12538342858731408721, 9820739847336455216)),
        ("123456789abcdef", (9165946068217512774, 2451472574052603025)),
        ("123456789abcdef1", (9259082041050667785, 12459473952842597282)),
    ];

    #[test]
    fn test_empty_string() {
        assert_eq!(murmurhash3_x64_128("".as_bytes(), 0), (0, 0));
    }

    #[test]
    fn test_tail_lengths() {
        for (input, expected) in TAIL_CASES {
            assert_eq!(
                murmurhash3_x64_128(input.as_bytes(), 0),
                expected,
                "input: {:?}",
                input
            );
        }
    }

    #[test]
    fn test_large_data() {
        let text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam at consequat massa. Cras eleifend pellentesque ex, at dignissim libero maximus ut. Sed eget nulla felis";
        assert_eq!(
            murmurhash3_x64_128(text.as_bytes(), 0),
            (9455322759164802692, 17863277201603478371)
        );
    }
}
654
/// Example-based tests for the identifier case converters.
#[cfg(test)]
mod case_tests {
    use super::*;

    #[test]
    fn test_to_snake_case() {
        let expectations = [
            ("camelCase", "camel_case"),
            ("PascalCase", "pascal_case"),
            ("HTTPRequest", "http_request"),
            ("simpleTest", "simple_test"),
            ("already_snake", "already_snake"),
            ("ABC", "abc"),
        ];
        for (input, expected) in expectations {
            assert_eq!(to_snake_case(input), expected);
        }
    }

    #[test]
    fn test_to_camel_case() {
        let expectations = [
            ("snake_case", "snakeCase"),
            ("simple_test", "simpleTest"),
            ("already", "already"),
            ("a_b_c", "aBC"),
        ];
        for (input, expected) in expectations {
            assert_eq!(to_camel_case(input), expected);
        }
    }
}
677
678pub mod buffer_rw_string {
679 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
680 use std::arch::aarch64::*;
681 #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
682 use std::arch::x86_64::*;
683 #[cfg(all(
684 any(target_arch = "x86", target_arch = "x86_64"),
685 target_feature = "sse2",
686 not(target_feature = "avx2")
687 ))]
688 use std::arch::x86_64::*;
689
690 use crate::buffer::{Reader, Writer};
691 use crate::error::Error;
692
693 #[inline]
694 pub fn write_latin1_standard(writer: &mut Writer, s: &str) {
695 for c in s.chars() {
696 let b = c as u32;
697 assert!(b <= 0xFF, "Non-Latin1 character found");
698 writer.write_u8(b as u8);
699 }
700 }
701
702 #[inline(always)]
703 pub fn write_latin1_string(writer: &mut Writer, s: &str) {
704 if s.len() < 128 {
705 let bytes = s.as_bytes();
707 let is_ascii = bytes.iter().all(|&b| b < 0x80);
709 if is_ascii {
710 writer.bf.reserve(s.len());
711 writer.bf.extend_from_slice(bytes);
712 } else {
713 writer.bf.reserve(s.len());
715 for c in s.chars() {
716 let v = c as u32;
717 assert!(v <= 0xFF, "Non-Latin1 character found");
718 writer.bf.push(v as u8);
719 }
720 }
721 return;
722 }
723 write_latin1_simd(writer, s);
724 }
725
726 #[inline]
727 pub fn write_utf8_standard(writer: &mut Writer, s: &str) {
728 let bytes = s.as_bytes();
729 writer.bf.extend_from_slice(bytes);
730 }
731
732 #[inline]
733 pub fn write_utf16_standard(writer: &mut Writer, utf16: &[u16]) {
734 #[cfg(target_endian = "little")]
735 {
736 let total_bytes = utf16.len() * 2;
737 let old_len = writer.bf.len();
738 writer.bf.reserve(total_bytes);
739 unsafe {
740 let dest = writer.bf.as_mut_ptr().add(old_len);
741 let src = utf16.as_ptr() as *const u8;
742 std::ptr::copy_nonoverlapping(src, dest, total_bytes);
743 writer.bf.set_len(old_len + total_bytes);
744 }
745 }
746 #[cfg(target_endian = "big")]
747 {
748 let total_bytes = utf16.len() * 2;
749 let old_len = writer.bf.len();
750 writer.bf.reserve(total_bytes);
751 unsafe {
752 let dest = writer.bf.as_mut_ptr().add(old_len);
753 for (i, &unit) in utf16.iter().enumerate() {
755 let swapped = unit.swap_bytes();
756 let ptr = dest.add(i * 2) as *mut u16;
757 std::ptr::write_unaligned(ptr, swapped);
758 }
759 writer.bf.set_len(old_len + total_bytes);
760 }
761 }
762 }
763
764 #[inline]
765 pub fn read_latin1_standard(reader: &mut Reader, len: usize) -> Result<String, Error> {
766 let slice = reader.sub_slice(reader.get_cursor(), reader.get_cursor() + len)?;
767 let result: String = slice.iter().map(|&b| b as char).collect();
768 reader.move_next(len);
769 Ok(result)
770 }
771
772 #[inline]
773 pub fn read_utf8_standard(reader: &mut Reader, len: usize) -> Result<String, Error> {
774 let slice = reader.sub_slice(reader.get_cursor(), reader.get_cursor() + len)?;
775 let value = std::str::from_utf8(slice)
778 .map_err(|_| Error::encoding_error("invalid UTF-8 string"))?
779 .to_owned();
780 reader.move_next(len);
781 Ok(value)
782 }
783
784 #[inline]
785 pub fn read_utf16_standard(reader: &mut Reader, len: usize) -> Result<String, Error> {
786 if len % 2 != 0 {
787 return Err(Error::encoding_error("UTF-16 length must be even"));
788 }
789 unsafe {
790 let slice = std::slice::from_raw_parts(reader.bf.as_ptr().add(reader.cursor), len);
791 let units: Vec<u16> = slice
792 .chunks_exact(2)
793 .map(|c| u16::from_le_bytes([c[0], c[1]]))
794 .collect();
795 reader.move_next(len);
796 Ok(String::from_utf16_lossy(&units))
797 }
798 }
799
800 #[inline]
801 fn is_ascii_bytes(bytes: &[u8]) -> bool {
802 let len = bytes.len();
803 let mut i = 0;
804
805 #[cfg(target_arch = "x86_64")]
806 unsafe {
807 if is_x86_feature_detected!("avx2") && len >= 32 {
808 while i + 32 <= len {
809 let chunk = _mm256_loadu_si256(bytes.as_ptr().add(i) as *const __m256i);
810 let mask = _mm256_movemask_epi8(chunk);
811 if mask != 0 {
812 return false;
813 }
814 i += 32;
815 }
816 }
817 }
818
819 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
820 unsafe {
821 if is_x86_feature_detected!("sse2") && len >= 16 {
822 while i + 16 <= len {
823 let chunk = _mm_loadu_si128(bytes.as_ptr().add(i) as *const __m128i);
824 let mask = _mm_movemask_epi8(chunk);
825 if mask != 0 {
826 return false;
827 }
828 i += 16;
829 }
830 }
831 }
832
833 #[cfg(target_arch = "aarch64")]
834 unsafe {
835 if std::arch::is_aarch64_feature_detected!("neon") && len >= 16 {
836 while i + 16 <= len {
837 let chunk = vld1q_u8(bytes.as_ptr().add(i));
838 if vmaxvq_u8(chunk) >= 0x80 {
839 return false;
840 }
841 i += 16;
842 }
843 }
844 }
845
846 bytes[i..].iter().all(|&b| b < 0x80)
848 }
849
850 #[inline]
851 pub fn write_latin1_simd(writer: &mut Writer, s: &str) {
852 if s.is_empty() {
853 return;
854 }
855
856 let bytes = s.as_bytes();
857
858 if is_ascii_bytes(bytes) {
861 let len = bytes.len();
863 writer.bf.reserve(len);
864 writer.bf.extend_from_slice(bytes);
865 } else {
866 let mut buf: Vec<u8> = Vec::with_capacity(s.len());
869 for c in s.chars() {
870 let v = c as u32;
871 assert!(v <= 0xFF, "Non-Latin1 character found");
872 buf.push(v as u8);
873 }
874 let len = buf.len();
875 writer.bf.reserve(len);
876 writer.bf.extend_from_slice(&buf);
877 }
878 }
879
    /// Reads `len` Latin-1 bytes at the reader's cursor, converts them to a
    /// UTF-8 `String`, and advances the cursor by `len`.
    ///
    /// Leading 32-byte (AVX2) and 16-byte (SSE2 / NEON) chunks that are pure
    /// ASCII are copied with vector loads and stores. From the first chunk that
    /// contains a byte `>= 0x80` onwards, bytes are handled one at a time:
    /// ASCII bytes are copied, other bytes become a two-byte UTF-8 sequence.
    ///
    /// # Errors
    ///
    /// Propagates the error from `Reader::sub_slice` when the requested range
    /// cannot be obtained.
    #[inline]
    pub fn read_latin1_simd(reader: &mut Reader, len: usize) -> Result<String, Error> {
        if len == 0 {
            return Ok(String::new());
        }
        // NOTE(review): the unchecked reads below assume `sub_slice` returns
        // exactly `len` bytes on success — confirm against `Reader`.
        let src = reader.sub_slice(reader.get_cursor(), reader.get_cursor() + len)?;

        // Worst case: every input byte expands to two UTF-8 bytes.
        let mut out: Vec<u8> = Vec::with_capacity(len * 2);

        unsafe {
            // Output is written through a raw pointer into the spare capacity;
            // `out_len` tracks how many bytes have been initialised so far.
            let out_ptr = out.as_mut_ptr();
            let mut out_len = 0usize;
            let mut i = 0usize;

            #[cfg(target_arch = "x86_64")]
            {
                if std::arch::is_x86_feature_detected!("avx2") {
                    use std::arch::x86_64::*;
                    while i + 32 <= len {
                        let ptr = src.as_ptr().add(i) as *const __m256i;
                        let chunk = _mm256_loadu_si256(ptr);
                        // One bit per byte: set when that byte is >= 0x80.
                        let mask = _mm256_movemask_epi8(chunk);
                        if mask == 0 {
                            // Pure ASCII: Latin-1 and UTF-8 agree, copy as is.
                            _mm256_storeu_si256(out_ptr.add(out_len) as *mut __m256i, chunk);
                            out_len += 32;
                            i += 32;
                            continue;
                        } else {
                            // Leave this chunk to the loops below.
                            break;
                        }
                    }
                }
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                if std::arch::is_x86_feature_detected!("sse2") {
                    use std::arch::x86_64::*;
                    while i + 16 <= len {
                        let ptr = src.as_ptr().add(i) as *const __m128i;
                        let chunk = _mm_loadu_si128(ptr);
                        // One bit per byte: set when that byte is >= 0x80.
                        let mask = _mm_movemask_epi8(chunk);
                        if mask == 0 {
                            _mm_storeu_si128(out_ptr.add(out_len) as *mut __m128i, chunk);
                            out_len += 16;
                            i += 16;
                            continue;
                        } else {
                            break;
                        }
                    }
                }
            }

            #[cfg(target_arch = "aarch64")]
            {
                if std::arch::is_aarch64_feature_detected!("neon") {
                    use std::arch::aarch64::*;
                    while i + 16 <= len {
                        let ptr = src.as_ptr().add(i);
                        let v = vld1q_u8(ptr);
                        // The largest byte is < 0x80 exactly when all bytes are ASCII.
                        if vmaxvq_u8(v) < 0x80 {
                            vst1q_u8(out_ptr.add(out_len), v);
                            out_len += 16;
                            i += 16;
                            continue;
                        } else {
                            break;
                        }
                    }
                }
            }

            // Scalar path for everything the vector loops did not consume.
            while i < len {
                let b = *src.get_unchecked(i);
                if b < 0x80 {
                    *out_ptr.add(out_len) = b;
                    out_len += 1;
                } else {
                    // Code points 0x80..=0xFF encode as 110000xx 10xxxxxx.
                    *out_ptr.add(out_len) = 0xC0 | (b >> 6);
                    *out_ptr.add(out_len + 1) = 0x80 | (b & 0x3F);
                    out_len += 2;
                }
                i += 1;
            }

            // At most two bytes were written per input byte, so `out_len`
            // stays within the `len * 2` capacity reserved above.
            out.set_len(out_len);
        }
        reader.move_next(len);
        // The bytes written above are either ASCII or two-byte sequences built
        // from a 0xC2/0xC3 lead byte and a continuation byte, i.e. valid UTF-8.
        Ok(unsafe { String::from_utf8_unchecked(out) })
    }
985
986 #[cfg(test)]
987 mod tests {
988 use super::*;
989 use crate::buffer::{Reader, Writer};
990
991 #[test]
992 fn test_latin1() {
993 let samples = [
994 "Hello World!",
995 "Rusty Café",
996 "1234567890",
997 "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝ",
998 ];
999
1000 for s in samples {
1001 let mut buffer = vec![];
1002 let mut writer = Writer::from_buffer(&mut buffer);
1003 write_latin1_simd(&mut writer, s);
1004 write_latin1_simd(&mut writer, s);
1005 let bytes = &*writer.dump();
1006 let bytes_len = bytes.len() / 2;
1007 let mut reader = Reader::new(bytes);
1008 assert_eq!(read_latin1_standard(&mut reader, bytes_len).unwrap(), s);
1009 assert_eq!(read_latin1_standard(&mut reader, bytes_len).unwrap(), s);
1010
1011 let mut buffer = vec![];
1012 let mut writer = Writer::from_buffer(&mut buffer);
1013 write_latin1_standard(&mut writer, s);
1014 write_latin1_standard(&mut writer, s);
1015 let bytes = &*writer.dump();
1016 let bytes_len = bytes.len() / 2;
1017 let mut reader = Reader::new(bytes);
1018 assert_eq!(read_latin1_simd(&mut reader, bytes_len).unwrap(), s);
1019 assert_eq!(read_latin1_simd(&mut reader, bytes_len).unwrap(), s);
1020 }
1021 }
1022
1023 #[test]
1024 fn test_utf8() {
1025 let samples = [
1026 "hello",
1027 "rust语言",
1028 "你好,世界",
1029 "emoji 😀😃😄😁",
1030 "mixed ASCII + 中文 + emoji 😁",
1031 ];
1032
1033 for s in samples {
1034 let bytes_len = s.len();
1035
1036 let mut buffer = vec![];
1037 let mut writer = Writer::from_buffer(&mut buffer);
1038 write_utf8_standard(&mut writer, s);
1039 write_utf8_standard(&mut writer, s);
1040 let bytes = &*writer.dump();
1041 let mut reader = Reader::new(bytes);
1042 assert_eq!(read_utf8_standard(&mut reader, bytes_len).unwrap(), s);
1043 assert_eq!(read_utf8_standard(&mut reader, bytes_len).unwrap(), s);
1044 }
1045 }
1046
1047 #[test]
1048 fn test_utf16() {
1049 let samples = [
1050 "hello",
1051 "rust语言",
1052 "你好,世界",
1053 "emoji 😀😃😄😁",
1054 "混合文字 + emoji 🐍💻🦀",
1055 ];
1056 for s in samples {
1057 let utf16: Vec<u16> = s.encode_utf16().collect();
1058 let bytes_len = utf16.len() * 2;
1059
1060 let mut buffer = vec![];
1061 let mut writer = Writer::from_buffer(&mut buffer);
1062 write_utf16_standard(&mut writer, &utf16);
1063 write_utf16_standard(&mut writer, &utf16);
1064
1065 let mut buffer = vec![];
1066 let mut writer = Writer::from_buffer(&mut buffer);
1067 write_utf16_standard(&mut writer, &utf16);
1068 write_utf16_standard(&mut writer, &utf16);
1069 let bytes = &*writer.dump();
1070 let mut reader = Reader::new(bytes);
1071 assert_eq!(read_utf16_standard(&mut reader, bytes_len).unwrap(), s);
1072 assert_eq!(read_utf16_standard(&mut reader, bytes_len).unwrap(), s);
1073 }
1074 }
1075 }
1076}