json_escape_simd/
lib.rs

1//! Borrowed from <https://github.com/cloudwego/sonic-rs/blob/v0.5.5/src/util/string.rs>
2//!
3//! Only takes the string escaping part to avoid the abstraction overhead.
4
5#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
6use std::arch::is_x86_feature_detected;
7
8mod simd;
9
/// Escape-sequence lookup table indexed by input byte value.
///
/// Each entry is `(len, bytes)`: `len` is the number of meaningful bytes at
/// the front of `bytes`, and the rest of the 8-byte array is NUL padding.
/// A `len` of `0` marks a byte that has no escape sequence.
///
/// * `0x00..=0x1f` map to the six-byte `\u00XX` form (lowercase hex), except
///   for `\b`, `\t`, `\n`, `\f` and `\r`, which use their two-byte short form.
/// * `"` maps to `\"` and `\` maps to `\\`.
/// * Every other byte maps to `(0, [0; 8])`.
pub(crate) const QUOTE_TAB: [(u8, [u8; 8]); 256] = {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";
    // (raw byte, letter that follows the backslash) for the two-byte escapes.
    const SHORT_ESCAPES: [(u8, u8); 7] = [
        (0x08, b'b'),
        (0x09, b't'),
        (0x0a, b'n'),
        (0x0c, b'f'),
        (0x0d, b'r'),
        (b'"', b'"'),
        (b'\\', b'\\'),
    ];

    let mut table = [(0u8, [0u8; 8]); 256];

    // Start every control character off with the generic `\u00XX` spelling.
    let mut byte = 0usize;
    while byte < 0x20 {
        table[byte] = (
            6,
            [
                b'\\',
                b'u',
                b'0',
                b'0',
                HEX_DIGITS[byte >> 4],
                HEX_DIGITS[byte & 0x0f],
                0,
                0,
            ],
        );
        byte += 1;
    }

    // Then overwrite the bytes that have a dedicated two-byte escape.
    let mut idx = 0usize;
    while idx < SHORT_ESCAPES.len() {
        let (raw, letter) = SHORT_ESCAPES[idx];
        table[raw as usize] = (2, [b'\\', letter, 0, 0, 0, 0, 0, 0]);
        idx += 1;
    }

    table
};
274
/// Per-byte flag table: `1` when the byte must be escaped, `0` otherwise.
///
/// The flagged bytes are the control characters `0x00..=0x1f`, the double
/// quote (`0x22`) and the backslash (`0x5c`).
pub(crate) const NEED_ESCAPED: [u8; 256] = {
    let mut flags = [0u8; 256];

    // All control characters are flagged.
    let mut byte = 0usize;
    while byte < 0x20 {
        flags[byte] = 1;
        byte += 1;
    }

    // Plus the two printable characters that carry meaning inside a string.
    flags[b'"' as usize] = 1;
    flags[b'\\' as usize] = 1;

    flags
};
285
286#[inline(always)]
287fn format_string(value: &str, dst: &mut [u8]) -> usize {
288    #[cfg(target_arch = "aarch64")]
289    {
290        let has_neon = cfg!(target_os = "macos") || std::arch::is_aarch64_feature_detected!("neon");
291        if has_neon {
292            unsafe { simd::neon::format_string(value, dst) }
293        } else {
294            simd::v128::format_string(value, dst)
295        }
296    }
297
298    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
299    {
300        if is_x86_feature_detected!("avx512f") {
301            unsafe { simd::avx512::format_string(value, dst) }
302        } else if is_x86_feature_detected!("avx2") {
303            unsafe { simd::avx2::format_string(value, dst) }
304        } else if is_x86_feature_detected!("sse2") {
305            unsafe { simd::sse2::format_string(value, dst) }
306        } else {
307            simd::v128::format_string(value, dst)
308        }
309    }
310
311    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))]
312    {
313        simd::v128::format_string(value, dst)
314    }
315}
316
317pub fn escape(value: &str) -> String {
318    let capacity = value.len() * 6 + 32 + 3;
319    let mut buf = Vec::with_capacity(capacity);
320    #[allow(clippy::uninit_vec)]
321    unsafe {
322        buf.set_len(capacity)
323    };
324    let cnt = format_string(value, &mut buf);
325    unsafe { buf.set_len(cnt) };
326    unsafe { String::from_utf8_unchecked(buf) }
327}
328
329pub fn escape_into<S: AsRef<str>>(value: S, dst: &mut Vec<u8>) -> usize {
330    let value = value.as_ref();
331    let needed_capacity = value.len() * 6 + 32 + 3;
332
333    // Ensure we have enough capacity
334    dst.reserve(needed_capacity);
335
336    let old_len = dst.len();
337
338    // SAFETY: We've reserved enough capacity above, and format_string will
339    // write valid UTF-8 bytes. We'll set the correct length after.
340    unsafe {
341        // Get a slice that includes the spare capacity
342        let spare =
343            std::slice::from_raw_parts_mut(dst.as_mut_ptr().add(old_len), dst.capacity() - old_len);
344        let cnt = format_string(value, spare);
345        dst.set_len(old_len + cnt);
346        cnt
347    }
348}
349
/// Tests comparing `escape` / `escape_into` against `serde_json::to_string`.
///
/// `test_rxjs` and `test_sources` read fixture files from
/// `node_modules/rxjs/src` and `fixtures`, resolved relative to the working
/// directory the tests run in.
#[cfg(test)]
mod tests {
    use std::fs::read_dir;
    use std::path::Path;

    use rand::seq::SliceRandom;

    use super::*;

    /// Plain ASCII with nothing to escape.
    #[test]
    fn test_escape_ascii_json_string() {
        let fixture = r#"abcdefghijklmnopqrstuvwxyz .*? hello world escape json string"#;
        assert_eq!(escape(fixture), serde_json::to_string(fixture).unwrap());
    }

    /// Every control character, the short escapes, quote, backslash and
    /// multi-byte UTF-8 in a single string.
    #[test]
    fn test_escape_json_string() {
        let mut fixture = String::new();
        for i in 0u8..=0x1F {
            fixture.push(i as char);
        }
        fixture.push('\t');
        fixture.push('\x08');
        fixture.push('\x09');
        fixture.push('\x0A');
        fixture.push('\x0C');
        fixture.push('\x0D');
        fixture.push('\x22');
        fixture.push('\x5C');
        fixture.push_str("normal string");
        fixture.push('😊');
        fixture.push_str("中文 English 🚀 \n❓ 𝄞");
        assert_eq!(
            escape(fixture.as_str()),
            serde_json::to_string(fixture.as_str()).unwrap(),
            "fixture: {:?}",
            fixture
        );
    }

    // Test cases for various string sizes to cover different SIMD paths

    #[test]
    fn test_empty_string() {
        assert_eq!(escape(""), r#""""#);
    }

    #[test]
    fn test_very_small_strings() {
        // Less than 16 bytes (SSE register size)
        assert_eq!(escape("a"), r#""a""#);
        assert_eq!(escape("ab"), r#""ab""#);
        assert_eq!(escape("hello"), r#""hello""#);
        assert_eq!(escape("hello\n"), r#""hello\n""#);
        assert_eq!(escape("\""), r#""\"""#);
        assert_eq!(escape("\\"), r#""\\""#);
        assert_eq!(escape("\t"), r#""\t""#);
        assert_eq!(escape("\r\n"), r#""\r\n""#);
    }

    #[test]
    fn test_small_strings_16_bytes() {
        // Exactly 16 bytes - SSE register boundary
        let s16 = "0123456789abcdef";
        assert_eq!(s16.len(), 16);
        assert_eq!(escape(s16), serde_json::to_string(s16).unwrap());

        // One byte short of the boundary, with an escape (\t is 1 byte)
        let s15_esc = "01234567\t9abcde";
        assert_eq!(s15_esc.len(), 15);
        assert_eq!(escape(s15_esc), serde_json::to_string(s15_esc).unwrap());

        // Exactly 16 bytes with an escape
        let s16_esc = "01234567\t9abcdef";
        assert_eq!(s16_esc.len(), 16);
        assert_eq!(escape(s16_esc), serde_json::to_string(s16_esc).unwrap());
    }

    #[test]
    fn test_medium_strings_32_bytes() {
        // Exactly 32 bytes - AVX2 register boundary
        let s32 = "0123456789abcdef0123456789abcdef";
        assert_eq!(s32.len(), 32);
        assert_eq!(escape(s32), serde_json::to_string(s32).unwrap());

        // Around 32 bytes with an escape in the middle
        let s32_esc = "0123456789abcde\"0123456789abcde";
        assert_eq!(escape(s32_esc), serde_json::to_string(s32_esc).unwrap());
    }

    #[test]
    fn test_large_strings_128_bytes() {
        // Exactly 128 bytes - main loop size
        let s128 = "0123456789abcdef".repeat(8);
        assert_eq!(s128.len(), 128);
        assert_eq!(escape(&s128), serde_json::to_string(&s128).unwrap());

        // Long string with escapes spread throughout
        let mut s128_esc = String::new();
        for i in 0..8 {
            if i % 2 == 0 {
                s128_esc.push_str("0123456789abcd\n");
            } else {
                s128_esc.push_str("0123456789abcd\"");
            }
        }
        assert_eq!(escape(&s128_esc), serde_json::to_string(&s128_esc).unwrap());
    }

    #[test]
    fn test_unaligned_data() {
        // Test strings that start at various alignments
        for offset in 0..32 {
            let padding = " ".repeat(offset);
            let test_str = format!("{}{}", padding, "test\nstring\"with\\escapes");
            let result = escape(&test_str[offset..]);
            let expected = serde_json::to_string(&test_str[offset..]).unwrap();
            assert_eq!(result, expected, "Failed at offset {}", offset);
        }
    }

    #[test]
    fn test_sparse_escapes() {
        // Large string with escapes only at the beginning and end
        let mut s = String::new();
        s.push('"');
        s.push_str(&"a".repeat(500));
        s.push('\\');
        assert_eq!(escape(&s), serde_json::to_string(&s).unwrap());
    }

    #[test]
    fn test_dense_escapes() {
        // String with many escapes
        let s = "\"\\\"\\\"\\\"\\".repeat(50);
        assert_eq!(escape(&s), serde_json::to_string(&s).unwrap());

        // All control characters
        let mut ctrl = String::new();
        for _ in 0..10 {
            for i in 0u8..32 {
                ctrl.push(i as char);
            }
        }
        assert_eq!(escape(&ctrl), serde_json::to_string(&ctrl).unwrap());
    }

    #[test]
    fn test_boundary_conditions() {
        // Test around the 256 byte boundary
        for size in 250..260 {
            let s = "a".repeat(size);
            assert_eq!(escape(&s), serde_json::to_string(&s).unwrap());

            // With escape at the end
            let mut s_esc = "a".repeat(size - 1);
            s_esc.push('"');
            assert_eq!(escape(&s_esc), serde_json::to_string(&s_esc).unwrap());
        }
    }

    #[test]
    fn test_all_escape_types() {
        // Test each escape type individually
        assert_eq!(escape("\x00"), r#""\u0000""#);
        assert_eq!(escape("\x08"), r#""\b""#);
        assert_eq!(escape("\x09"), r#""\t""#);
        assert_eq!(escape("\x0A"), r#""\n""#);
        assert_eq!(escape("\x0C"), r#""\f""#);
        assert_eq!(escape("\x0D"), r#""\r""#);
        assert_eq!(escape("\x1F"), r#""\u001f""#);
        assert_eq!(escape("\""), r#""\"""#);
        assert_eq!(escape("\\"), r#""\\""#);

        // Every control character must come out as its QUOTE_TAB entry
        // (minus the NUL padding), wrapped in quotes.
        for i in 0u8..32 {
            let s = String::from_utf8(vec![i]).unwrap();
            let result = escape(&s);
            let expected = String::from_utf8(QUOTE_TAB[i as usize].1.to_vec())
                .unwrap()
                .trim_end_matches('\0')
                .to_string();
            assert_eq!(
                result,
                format!("\"{}\"", expected),
                "Failed for byte 0x{:02x}",
                i
            );
        }
    }

    #[test]
    fn test_mixed_content() {
        // Mix of ASCII, escapes, and multi-byte UTF-8
        let mixed = r#"Hello "World"!
    Tab:	Here
    Emoji: 😀 Chinese: 中文
    Math: ∑∫∂ Music: 𝄞
    Escape: \" \\ \n \r \t"#;
        assert_eq!(escape(mixed), serde_json::to_string(mixed).unwrap());
    }

    #[test]
    fn test_repeated_patterns() {
        // Patterns that might benefit from or confuse SIMD operations
        let pattern1 = "abcd".repeat(100);
        assert_eq!(escape(&pattern1), serde_json::to_string(&pattern1).unwrap());

        let pattern2 = "a\"b\"".repeat(100);
        assert_eq!(escape(&pattern2), serde_json::to_string(&pattern2).unwrap());

        let pattern3 = "\t\n".repeat(100);
        assert_eq!(escape(&pattern3), serde_json::to_string(&pattern3).unwrap());
    }

    /// Runs both entry points over the `.ts` files under
    /// `node_modules/rxjs/src` (only ten of them under Miri).
    #[test]
    fn test_rxjs() {
        let mut sources = Vec::new();
        read_dir_recursive("node_modules/rxjs/src", &mut sources, |p| {
            matches!(p.extension().and_then(|e| e.to_str()), Some("ts"))
        })
        .unwrap();
        assert!(!sources.is_empty());
        sources.shuffle(&mut rand::rng());
        for source in sources
            .iter()
            .take(if cfg!(miri) { 10 } else { sources.len() })
        {
            assert_eq!(escape(source), serde_json::to_string(&source).unwrap());
            // Start from an empty String so `escape_into` has to grow it.
            let mut output = String::new();
            escape_into(source, unsafe { output.as_mut_vec() });
            assert_eq!(output, serde_json::to_string(&source).unwrap());
        }
    }

    /// Runs both entry points over the script files under `fixtures`.
    #[test]
    fn test_sources() {
        for source in load_affine_sources().unwrap() {
            assert_eq!(escape(&source), serde_json::to_string(&source).unwrap());
            // Pre-sized so `escape_into` writes into existing capacity.
            let mut output = String::with_capacity(source.len() * 6 + 32 + 3);
            escape_into(&source, unsafe { output.as_mut_vec() });
            assert_eq!(output, serde_json::to_string(&source).unwrap());
        }
    }

    /// Loads every `.ts`/`.tsx`/`.js`/`.mjs`/`.cjs` file under `fixtures`,
    /// shuffled; only ten are yielded under Miri.
    fn load_affine_sources() -> Result<impl Iterator<Item = String>, std::io::Error> {
        let mut sources = Vec::new();
        read_dir_recursive("fixtures", &mut sources, |p| {
            matches!(
                p.extension().and_then(|e| e.to_str()),
                Some("ts") | Some("tsx") | Some("js") | Some("mjs") | Some("cjs")
            )
        })?;
        assert!(!sources.is_empty());
        let len = sources.len();
        sources.shuffle(&mut rand::rng());
        Ok(sources.into_iter().take(if cfg!(miri) { 10 } else { len }))
    }

    /// Walks `dir` recursively and pushes the contents of every regular file
    /// accepted by `f` onto `sources`.
    fn read_dir_recursive<P: AsRef<Path>, F: Fn(&Path) -> bool + Copy>(
        dir: P,
        sources: &mut Vec<String>,
        f: F,
    ) -> Result<(), std::io::Error> {
        for entry in read_dir(dir)? {
            // Build the path once instead of once per use.
            let path = entry?.path();
            // `fs::metadata` follows symlinks, so linked files and
            // directories are treated like their targets.
            let metadata = std::fs::metadata(&path)?;
            if metadata.is_dir() {
                read_dir_recursive(&path, sources, f)?;
            } else if metadata.is_file() && f(&path) {
                sources.push(std::fs::read_to_string(&path)?);
            }
        }
        Ok(())
    }
}
623}