json_escape_simd/
lib.rs

1//! Borrowed from <https://github.com/cloudwego/sonic-rs/blob/v0.5.5/src/util/string.rs>
2//!
3//! Only takes the string escaping part to avoid the abstraction overhead.
4
5#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
6use std::arch::is_x86_feature_detected;
7
8mod simd;
9
/// Escape table indexed by byte value.
///
/// Each entry is `(len, bytes)`: `len` is the number of meaningful leading
/// bytes in `bytes`, the remainder is NUL padding. A `len` of `0` marks a byte
/// that has no escape sequence in this table.
///
/// * `0x00..=0x1f` map to `\u00XX` (lowercase hex, 6 bytes), except the five
///   control characters with a short form: `\b`, `\t`, `\n`, `\f`, `\r`.
/// * `"` (0x22) maps to `\"` and `\` (0x5c) maps to `\\`.
/// * Every other byte, including `0x7f` and all bytes `>= 0x80`, is
///   `(0, [0; 8])`.
pub(crate) const QUOTE_TAB: [(u8, [u8; 8]); 256] = {
    const HEX: [u8; 16] = *b"0123456789abcdef";

    // Start with "no escape" everywhere, then fill in the exceptions.
    let mut table = [(0u8, [0u8; 8]); 256];

    // Generic form for the control range: `\u00` followed by two hex digits.
    let mut byte = 0usize;
    while byte < 0x20 {
        table[byte] = (
            6,
            [b'\\', b'u', b'0', b'0', HEX[byte >> 4], HEX[byte & 0x0f], 0, 0],
        );
        byte += 1;
    }

    // Control characters that have a dedicated two-byte escape override the
    // generic form written above.
    table[0x08] = (2, *b"\\b\0\0\0\0\0\0");
    table[0x09] = (2, *b"\\t\0\0\0\0\0\0");
    table[0x0a] = (2, *b"\\n\0\0\0\0\0\0");
    table[0x0c] = (2, *b"\\f\0\0\0\0\0\0");
    table[0x0d] = (2, *b"\\r\0\0\0\0\0\0");

    // The two printable characters that must be escaped.
    table[0x22] = (2, *b"\\\"\0\0\0\0\0\0");
    table[0x5c] = (2, *b"\\\\\0\0\0\0\0\0");

    table
};
274
/// Per-byte flag table: `1` when the byte must be escaped, `0` otherwise.
///
/// The flagged bytes are the control range `0x00..=0x1f`, the double quote
/// (`0x22`) and the backslash (`0x5c`).
pub(crate) const NEED_ESCAPED: [u8; 256] = {
    let mut flags = [0u8; 256];

    // All control characters need an escape.
    let mut byte = 0usize;
    while byte < 0x20 {
        flags[byte] = 1;
        byte += 1;
    }

    // So do the string delimiter and the escape character itself.
    flags[b'"' as usize] = 1;
    flags[b'\\' as usize] = 1;

    flags
};
285
286#[inline(always)]
287fn format_string(value: &str, dst: &mut [u8]) -> usize {
288    #[cfg(target_arch = "aarch64")]
289    {
290        let has_neon = cfg!(target_os = "macos") || std::arch::is_aarch64_feature_detected!("neon");
291        if has_neon {
292            unsafe { simd::neon::format_string(value, dst) }
293        } else {
294            simd::v128::format_string(value, dst)
295        }
296    }
297
298    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
299    {
300        if is_x86_feature_detected!("avx512f") {
301            unsafe { simd::avx512::format_string(value, dst) }
302        } else if is_x86_feature_detected!("avx2") {
303            unsafe { simd::avx2::format_string(value, dst) }
304        } else if is_x86_feature_detected!("sse2") {
305            unsafe { simd::sse2::format_string(value, dst) }
306        } else {
307            simd::v128::format_string(value, dst)
308        }
309    }
310
311    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))]
312    {
313        simd::v128::format_string(value, dst)
314    }
315}
316
317pub fn escape(value: &str) -> String {
318    let capacity = value.len() * 6 + 32 + 3;
319    let mut buf = Vec::with_capacity(capacity);
320    #[allow(clippy::uninit_vec)]
321    unsafe {
322        buf.set_len(capacity)
323    };
324    let cnt = format_string(value, &mut buf);
325    unsafe { buf.set_len(cnt) };
326    unsafe { String::from_utf8_unchecked(buf) }
327}
328
329/// # Panics
330///
331/// Panics if the buffer is not large enough. Allocate enough capacity for dst.
332pub fn escape_into<S: AsRef<str>>(value: S, dst: &mut Vec<u8>) {
333    let value = value.as_ref();
334    let old_len = dst.len();
335
336    // SAFETY: We've reserved enough capacity above, and format_string will
337    // write valid UTF-8 bytes. We'll set the correct length after.
338    unsafe {
339        // Get a slice that includes the spare capacity
340        let spare =
341            std::slice::from_raw_parts_mut(dst.as_mut_ptr().add(old_len), dst.capacity() - old_len);
342        let cnt = format_string(value, spare);
343        dst.set_len(old_len + cnt);
344    }
345}
346
/// Tests comparing `escape` / `escape_into` against `serde_json::to_string`,
/// with fixtures sized around the vector widths named in each test.
#[cfg(test)]
mod tests {
    use std::fs::read_dir;
    use std::path::{Path, PathBuf};

    use rand::seq::SliceRandom;

    use super::*;

    #[test]
    fn test_escape_ascii_json_string() {
        let fixture = r#"abcdefghijklmnopqrstuvwxyz .*? hello world escape json string"#;
        assert_eq!(escape(fixture), serde_json::to_string(fixture).unwrap());
    }

    #[test]
    fn test_escape_json_string() {
        let mut fixture = String::new();
        for i in 0u8..=0x1F {
            fixture.push(i as char);
        }
        fixture.push('\t');
        fixture.push('\x08');
        fixture.push('\x09');
        fixture.push('\x0A');
        fixture.push('\x0C');
        fixture.push('\x0D');
        fixture.push('\x22');
        fixture.push('\x5C');
        fixture.push_str("normal string");
        fixture.push('😊');
        // Two-, three- and four-byte UTF-8 sequences mixed with an escape.
        fixture.push_str("中文 English 🚀 \n❓ 𝄞");
        assert_eq!(
            escape(fixture.as_str()),
            serde_json::to_string(fixture.as_str()).unwrap(),
            "fixture: {:?}",
            fixture
        );
    }

    // Test cases for various string sizes to cover different SIMD paths

    #[test]
    fn test_empty_string() {
        assert_eq!(escape(""), r#""""#);
    }

    #[test]
    fn test_very_small_strings() {
        // Less than 16 bytes (SSE register size)
        assert_eq!(escape("a"), r#""a""#);
        assert_eq!(escape("ab"), r#""ab""#);
        assert_eq!(escape("hello"), r#""hello""#);
        assert_eq!(escape("hello\n"), r#""hello\n""#);
        assert_eq!(escape("\""), r#""\"""#);
        assert_eq!(escape("\\"), r#""\\""#);
        assert_eq!(escape("\t"), r#""\t""#);
        assert_eq!(escape("\r\n"), r#""\r\n""#);
    }

    #[test]
    fn test_small_strings_16_bytes() {
        // Exactly 16 bytes - SSE register boundary
        let s16 = "0123456789abcdef";
        assert_eq!(s16.len(), 16);
        assert_eq!(escape(s16), serde_json::to_string(s16).unwrap());

        // Exactly 16 bytes with an escape in the middle (\t is 1 byte)
        let s16_esc = "01234567\t9abcdef";
        assert_eq!(s16_esc.len(), 16);
        assert_eq!(escape(s16_esc), serde_json::to_string(s16_esc).unwrap());
    }

    #[test]
    fn test_medium_strings_32_bytes() {
        // Exactly 32 bytes - AVX2 register boundary
        let s32 = "0123456789abcdef0123456789abcdef";
        assert_eq!(s32.len(), 32);
        assert_eq!(escape(s32), serde_json::to_string(s32).unwrap());

        // Exactly 32 bytes with an escape at the 16-byte boundary
        let s32_esc = "0123456789abcde\"0123456789abcdef";
        assert_eq!(s32_esc.len(), 32);
        assert_eq!(escape(s32_esc), serde_json::to_string(s32_esc).unwrap());
    }

    #[test]
    fn test_large_strings_128_bytes() {
        // Exactly 128 bytes - main loop size
        let s128 = "0123456789abcdef".repeat(8);
        assert_eq!(s128.len(), 128);
        assert_eq!(escape(&s128), serde_json::to_string(&s128).unwrap());

        // Exactly 128 bytes with an escape closing every 16-byte chunk
        let mut s128_esc = String::new();
        for i in 0..8 {
            if i % 2 == 0 {
                s128_esc.push_str("0123456789abcde\n");
            } else {
                s128_esc.push_str("0123456789abcde\"");
            }
        }
        assert_eq!(s128_esc.len(), 128);
        assert_eq!(escape(&s128_esc), serde_json::to_string(&s128_esc).unwrap());
    }

    #[test]
    fn test_unaligned_data() {
        // Test strings that start at various offsets into their allocation
        for offset in 0..32 {
            let padding = " ".repeat(offset);
            let test_str = format!("{}{}", padding, "test\nstring\"with\\escapes");
            let result = escape(&test_str[offset..]);
            let expected = serde_json::to_string(&test_str[offset..]).unwrap();
            assert_eq!(result, expected, "Failed at offset {}", offset);
        }
    }

    #[test]
    fn test_sparse_escapes() {
        // Large string with escapes only at the beginning and end
        let mut s = String::new();
        s.push('"');
        s.push_str(&"a".repeat(500));
        s.push('\\');
        assert_eq!(escape(&s), serde_json::to_string(&s).unwrap());
    }

    #[test]
    fn test_dense_escapes() {
        // String with many escapes
        let s = "\"\\\"\\\"\\\"\\".repeat(50);
        assert_eq!(escape(&s), serde_json::to_string(&s).unwrap());

        // All control characters
        let mut ctrl = String::new();
        for _ in 0..10 {
            for i in 0u8..32 {
                ctrl.push(i as char);
            }
        }
        assert_eq!(escape(&ctrl), serde_json::to_string(&ctrl).unwrap());
    }

    #[test]
    fn test_boundary_conditions() {
        // Test lengths on both sides of the 256 byte mark
        for size in 250..260 {
            let s = "a".repeat(size);
            assert_eq!(escape(&s), serde_json::to_string(&s).unwrap());

            // With escape at the end
            let mut s_esc = "a".repeat(size - 1);
            s_esc.push('"');
            assert_eq!(escape(&s_esc), serde_json::to_string(&s_esc).unwrap());
        }
    }

    #[test]
    fn test_all_escape_types() {
        // Test each escape type individually
        assert_eq!(escape("\x00"), r#""\u0000""#);
        assert_eq!(escape("\x08"), r#""\b""#);
        assert_eq!(escape("\x09"), r#""\t""#);
        assert_eq!(escape("\x0A"), r#""\n""#);
        assert_eq!(escape("\x0C"), r#""\f""#);
        assert_eq!(escape("\x0D"), r#""\r""#);
        assert_eq!(escape("\x1F"), r#""\u001f""#);
        assert_eq!(escape("\""), r#""\"""#);
        assert_eq!(escape("\\"), r#""\\""#);

        // Test all control characters against the table entry, using the
        // stored length rather than stripping the NUL padding.
        for i in 0u8..32 {
            let s = String::from_utf8(vec![i]).unwrap();
            let result = escape(&s);
            let (len, bytes) = QUOTE_TAB[i as usize];
            let expected = std::str::from_utf8(&bytes[..len as usize]).unwrap();
            assert_eq!(
                result,
                format!("\"{}\"", expected),
                "Failed for byte 0x{:02x}",
                i
            );
        }
    }

    #[test]
    fn test_mixed_content() {
        // Mix of ASCII, escapes, and multi-byte UTF-8
        let mixed = r#"Hello "World"!
    Tab:	Here
    Emoji: 😀 Chinese: 中文
    Math: ∑∫∂ Music: 𝄞
    Escape: \" \\ \n \r \t"#;
        assert_eq!(escape(mixed), serde_json::to_string(mixed).unwrap());
    }

    #[test]
    fn test_repeated_patterns() {
        // Patterns that might benefit from or confuse SIMD operations
        let pattern1 = "abcd".repeat(100);
        assert_eq!(escape(&pattern1), serde_json::to_string(&pattern1).unwrap());

        let pattern2 = "a\"b\"".repeat(100);
        assert_eq!(escape(&pattern2), serde_json::to_string(&pattern2).unwrap());

        let pattern3 = "\t\n".repeat(100);
        assert_eq!(escape(&pattern3), serde_json::to_string(&pattern3).unwrap());
    }

    /// Escapes every `.ts` file under `node_modules/rxjs/src` (a random
    /// subset of 10 under Miri) through both public entry points.
    #[test]
    fn test_rxjs() {
        let mut sources = Vec::new();
        read_dir_recursive("node_modules/rxjs/src", &mut sources, |p| {
            matches!(p.extension().and_then(|e| e.to_str()), Some("ts"))
        })
        .unwrap();
        assert!(!sources.is_empty());
        sources.shuffle(&mut rand::rng());
        for source in sources
            .iter()
            .take(if cfg!(miri) { 10 } else { sources.len() })
        {
            assert_eq!(escape(source), serde_json::to_string(&source).unwrap());
            let mut output = String::with_capacity(source.len() * 6 + 32 + 3);
            // SAFETY: relies on `escape_into` appending only valid UTF-8.
            escape_into(source, unsafe { output.as_mut_vec() });
            assert_eq!(output, serde_json::to_string(&source).unwrap());
        }
    }

    /// Escapes the script sources found under `fixtures` through both public
    /// entry points.
    #[test]
    fn test_sources() {
        for source in load_affine_sources().unwrap() {
            assert_eq!(escape(&source), serde_json::to_string(&source).unwrap());
            let mut output = String::with_capacity(source.len() * 6 + 32 + 3);
            // SAFETY: relies on `escape_into` appending only valid UTF-8.
            escape_into(&source, unsafe { output.as_mut_vec() });
            assert_eq!(output, serde_json::to_string(&source).unwrap());
        }
    }

    /// Reads every `ts`/`tsx`/`js`/`mjs`/`cjs` file under `fixtures`, shuffled;
    /// under Miri only the first 10 are yielded.
    fn load_affine_sources() -> Result<impl Iterator<Item = String>, std::io::Error> {
        let mut sources = Vec::new();
        read_dir_recursive("fixtures", &mut sources, |p| {
            matches!(
                p.extension().and_then(|e| e.to_str()),
                Some("ts") | Some("tsx") | Some("js") | Some("mjs") | Some("cjs")
            )
        })?;
        assert!(!sources.is_empty());
        let len = sources.len();
        sources.shuffle(&mut rand::rng());
        Ok(sources.into_iter().take(if cfg!(miri) { 10 } else { len }))
    }

    /// Walks `dir` recursively and pushes the contents of every file accepted
    /// by `f` onto `sources`.
    fn read_dir_recursive<P: AsRef<Path>, F: Fn(PathBuf) -> bool + Copy>(
        dir: P,
        sources: &mut Vec<String>,
        f: F,
    ) -> Result<(), std::io::Error> {
        for entry in read_dir(dir)? {
            let path = entry?.path();
            // `fs::metadata` follows symlinks, so linked files and directories
            // are treated like regular ones.
            let metadata = std::fs::metadata(&path)?;
            if metadata.is_file() && f(path.clone()) {
                sources.push(std::fs::read_to_string(&path)?);
            }
            if metadata.is_dir() {
                read_dir_recursive(&path, sources, f)?;
            }
        }
        Ok(())
    }
}