json_escape_simd/
lib.rs

//! Borrowed from <https://github.com/cloudwego/sonic-rs/blob/v0.5.5/src/util/string.rs>
//!
//! Only the string-escaping part is taken, to avoid the abstraction overhead.
4
5#![allow(clippy::incompatible_msrv)]
6
7#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
8use std::arch::is_x86_feature_detected;
9
10mod simd;
11
/// Escape lookup table indexed by byte value.
///
/// Each entry is `(len, bytes)`: `len` is the number of meaningful leading bytes in
/// `bytes` (the rest is NUL padding), and `len == 0` means the byte has no escape
/// sequence. Control bytes `0x00..=0x1f` map to `\u00XX` (lowercase hex) except for the
/// short forms `\b`, `\t`, `\n`, `\f` and `\r`; `"` and `\` map to `\"` and `\\`.
pub(crate) const QUOTE_TAB: [(u8, [u8; 8]); 256] = {
    const HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";

    // Start with "no escape" everywhere, then fill in the bytes that need one.
    let mut table = [(0u8, [0u8; 8]); 256];

    // Every control byte first gets the generic six-byte `\u00XX` form.
    let mut byte = 0usize;
    while byte < 0x20 {
        table[byte] = (
            6,
            [
                b'\\',
                b'u',
                b'0',
                b'0',
                HEX_DIGITS[byte >> 4],
                HEX_DIGITS[byte & 0x0f],
                0,
                0,
            ],
        );
        byte += 1;
    }

    // Two-byte short escapes override the generic form for the bytes that have one.
    table[0x08] = (2, *b"\\b\0\0\0\0\0\0");
    table[0x09] = (2, *b"\\t\0\0\0\0\0\0");
    table[0x0a] = (2, *b"\\n\0\0\0\0\0\0");
    table[0x0c] = (2, *b"\\f\0\0\0\0\0\0");
    table[0x0d] = (2, *b"\\r\0\0\0\0\0\0");

    // The two printable bytes that must be escaped inside a JSON string.
    table[b'"' as usize] = (2, *b"\\\"\0\0\0\0\0\0");
    table[b'\\' as usize] = (2, *b"\\\\\0\0\0\0\0\0");

    table
};
276
/// Per-byte flag table: `1` when the byte must be escaped inside a JSON string, `0` otherwise.
///
/// The flagged bytes are the control range `0x00..=0x1f`, the double quote (`0x22`) and
/// the backslash (`0x5c`).
pub(crate) const NEED_ESCAPED: [u8; 256] = {
    let mut flags = [0u8; 256];

    // All control bytes need escaping.
    let mut byte = 0usize;
    while byte < 0x20 {
        flags[byte] = 1;
        byte += 1;
    }

    // So do the string delimiter and the escape character itself.
    flags[b'"' as usize] = 1;
    flags[b'\\' as usize] = 1;

    flags
};
287
288#[inline(always)]
289fn format_string(value: &str, dst: &mut [u8]) -> usize {
290    #[cfg(target_arch = "aarch64")]
291    {
292        let has_neon = cfg!(target_os = "macos") || std::arch::is_aarch64_feature_detected!("neon");
293        if has_neon {
294            unsafe { simd::neon::format_string(value, dst) }
295        } else {
296            simd::v128::format_string(value, dst)
297        }
298    }
299
300    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
301    {
302        #[cfg(feature = "avx512")]
303        {
304            if is_x86_feature_detected!("avx512f") {
305                return unsafe { simd::avx512::format_string(value, dst) };
306            }
307        }
308        if is_x86_feature_detected!("avx2") {
309            unsafe { simd::avx2::format_string(value, dst) }
310        } else if is_x86_feature_detected!("sse2") {
311            unsafe { simd::sse2::format_string(value, dst) }
312        } else {
313            simd::v128::format_string(value, dst)
314        }
315    }
316
317    #[cfg(not(any(target_arch = "aarch64", target_arch = "x86", target_arch = "x86_64")))]
318    {
319        simd::v128::format_string(value, dst)
320    }
321}
322
323pub fn escape(value: &str) -> String {
324    let capacity = value.len() * 6 + 32 + 3;
325    let mut buf = Vec::with_capacity(capacity);
326    #[allow(clippy::uninit_vec)]
327    unsafe {
328        buf.set_len(capacity)
329    };
330    let cnt = format_string(value, &mut buf);
331    unsafe { buf.set_len(cnt) };
332    unsafe { String::from_utf8_unchecked(buf) }
333}
334
335/// # Panics
336///
337/// Panics if the buffer is not large enough. Allocate enough capacity for dst.
338pub fn escape_into<S: AsRef<str>>(value: S, dst: &mut Vec<u8>) {
339    let value = value.as_ref();
340    let old_len = dst.len();
341
342    // SAFETY: We've reserved enough capacity above, and format_string will
343    // write valid UTF-8 bytes. We'll set the correct length after.
344    unsafe {
345        // Get a slice that includes the spare capacity
346        let spare =
347            std::slice::from_raw_parts_mut(dst.as_mut_ptr().add(old_len), dst.capacity() - old_len);
348        let cnt = format_string(value, spare);
349        dst.set_len(old_len + cnt);
350    }
351}
352
#[cfg(test)]
mod tests {
    use std::fs::read_dir;
    use std::path::{Path, PathBuf};

    use rand::seq::SliceRandom;

    use super::*;

    /// Checks that both `escape` and `escape_into` agree with `serde_json` for `source`.
    fn assert_matches_serde(source: &str) {
        let expected = serde_json::to_string(source).unwrap();
        assert_eq!(escape(source), expected);

        // Pre-size the buffer for the worst case, as `escape` does.
        let mut output = String::with_capacity(source.len() * 6 + 32 + 3);
        // SAFETY: relies on `escape_into` appending only valid UTF-8 for a `&str` input.
        escape_into(source, unsafe { output.as_mut_vec() });
        assert_eq!(output, expected);
    }

    #[test]
    fn test_escape_ascii_json_string() {
        let fixture = r#"abcdefghijklmnopqrstuvwxyz .*? hello world escape json string"#;
        assert_eq!(escape(fixture), serde_json::to_string(fixture).unwrap());
    }

    #[test]
    fn test_escape_json_string() {
        let mut fixture = String::new();
        for i in 0u8..=0x1F {
            fixture.push(i as char);
        }
        fixture.push('\t');
        fixture.push('\x08');
        fixture.push('\x09');
        fixture.push('\x0A');
        fixture.push('\x0C');
        fixture.push('\x0D');
        fixture.push('\x22');
        fixture.push('\x5C');
        fixture.push_str("normal string");
        fixture.push('😊');
        // Multi-byte content, including 4-byte UTF-8 sequences.
        fixture.push_str("中文 English 🚀 \n❓ 𝄞");
        assert_eq!(
            escape(fixture.as_str()),
            serde_json::to_string(fixture.as_str()).unwrap(),
            "fixture: {:?}",
            fixture
        );
    }

    // Test cases for various string sizes to cover different SIMD paths

    #[test]
    fn test_empty_string() {
        assert_eq!(escape(""), r#""""#);
    }

    #[test]
    fn test_very_small_strings() {
        // Less than 16 bytes (SSE register size)
        assert_eq!(escape("a"), r#""a""#);
        assert_eq!(escape("ab"), r#""ab""#);
        assert_eq!(escape("hello"), r#""hello""#);
        assert_eq!(escape("hello\n"), r#""hello\n""#);
        assert_eq!(escape("\""), r#""\"""#);
        assert_eq!(escape("\\"), r#""\\""#);
        assert_eq!(escape("\t"), r#""\t""#);
        assert_eq!(escape("\r\n"), r#""\r\n""#);
    }

    #[test]
    fn test_small_strings_16_bytes() {
        // Exactly 16 bytes - SSE register boundary
        let s16 = "0123456789abcdef";
        assert_eq!(s16.len(), 16);
        assert_eq!(escape(s16), serde_json::to_string(s16).unwrap());

        // Exactly 16 bytes with an escape in the middle (`\t` is one input byte)
        let s16_esc = "01234567\t9abcdef";
        assert_eq!(s16_esc.len(), 16);
        assert_eq!(escape(s16_esc), serde_json::to_string(s16_esc).unwrap());
    }

    #[test]
    fn test_medium_strings_32_bytes() {
        // Exactly 32 bytes - AVX2 register boundary
        let s32 = "0123456789abcdef0123456789abcdef";
        assert_eq!(s32.len(), 32);
        assert_eq!(escape(s32), serde_json::to_string(s32).unwrap());

        // Exactly 32 bytes with an escape in the middle
        let s32_esc = "0123456789abcde\"0123456789abcdef";
        assert_eq!(s32_esc.len(), 32);
        assert_eq!(escape(s32_esc), serde_json::to_string(s32_esc).unwrap());
    }

    #[test]
    fn test_large_strings_128_bytes() {
        // Exactly 128 bytes - main loop size
        let s128 = "0123456789abcdef".repeat(8);
        assert_eq!(s128.len(), 128);
        assert_eq!(escape(&s128), serde_json::to_string(&s128).unwrap());

        // Exactly 128 bytes with an escape closing every 16-byte chunk
        let s128_esc: String = (0..8)
            .map(|i| {
                if i % 2 == 0 {
                    "0123456789abcde\n"
                } else {
                    "0123456789abcde\""
                }
            })
            .collect();
        assert_eq!(s128_esc.len(), 128);
        assert_eq!(escape(&s128_esc), serde_json::to_string(&s128_esc).unwrap());
    }

    #[test]
    fn test_unaligned_data() {
        // Test strings that start at various offsets into their backing buffer
        for offset in 0..32 {
            let padding = " ".repeat(offset);
            let test_str = format!("{}{}", padding, "test\nstring\"with\\escapes");
            let result = escape(&test_str[offset..]);
            let expected = serde_json::to_string(&test_str[offset..]).unwrap();
            assert_eq!(result, expected, "Failed at offset {}", offset);
        }
    }

    #[test]
    fn test_sparse_escapes() {
        // Large string with escapes only at the beginning and end
        let mut s = String::new();
        s.push('"');
        s.push_str(&"a".repeat(500));
        s.push('\\');
        assert_eq!(escape(&s), serde_json::to_string(&s).unwrap());
    }

    #[test]
    fn test_dense_escapes() {
        // String with many escapes
        let s = "\"\\\"\\\"\\\"\\".repeat(50);
        assert_eq!(escape(&s), serde_json::to_string(&s).unwrap());

        // All control characters
        let mut ctrl = String::new();
        for _ in 0..10 {
            for i in 0u8..32 {
                ctrl.push(i as char);
            }
        }
        assert_eq!(escape(&ctrl), serde_json::to_string(&ctrl).unwrap());
    }

    #[test]
    fn test_boundary_conditions() {
        // Test around the 256 byte boundary
        for size in 250..260 {
            let s = "a".repeat(size);
            assert_eq!(escape(&s), serde_json::to_string(&s).unwrap());

            // With escape at the end
            let mut s_esc = "a".repeat(size - 1);
            s_esc.push('"');
            assert_eq!(escape(&s_esc), serde_json::to_string(&s_esc).unwrap());
        }
    }

    #[test]
    fn test_all_escape_types() {
        // Test each escape type individually
        assert_eq!(escape("\x00"), r#""\u0000""#);
        assert_eq!(escape("\x08"), r#""\b""#);
        assert_eq!(escape("\x09"), r#""\t""#);
        assert_eq!(escape("\x0A"), r#""\n""#);
        assert_eq!(escape("\x0C"), r#""\f""#);
        assert_eq!(escape("\x0D"), r#""\r""#);
        assert_eq!(escape("\x1F"), r#""\u001f""#);
        assert_eq!(escape("\""), r#""\"""#);
        assert_eq!(escape("\\"), r#""\\""#);

        // Test all control characters against the lookup table
        for i in 0u8..32 {
            let s = String::from_utf8(vec![i]).unwrap();
            let result = escape(&s);
            // Table entries are NUL-padded to eight bytes; strip the padding.
            let expected = String::from_utf8(QUOTE_TAB[i as usize].1.to_vec())
                .unwrap()
                .trim_end_matches('\0')
                .to_string();
            assert_eq!(
                result,
                format!("\"{}\"", expected),
                "Failed for byte 0x{:02x}",
                i
            );
        }
    }

    #[test]
    fn test_mixed_content() {
        // Mix of ASCII, escapes, and multi-byte UTF-8. Written with explicit escapes so
        // the embedded tab and the literal backslash sequences are visible.
        let mixed = concat!(
            "Hello \"World\"!\n",
            "    Tab:\tHere\n",
            "    Emoji: 😀 Chinese: 中文\n",
            "    Math: ∑∫∂ Music: 𝄞\n",
            "    Escape: \\\" \\\\ \\n \\r \\t",
        );
        assert_eq!(escape(mixed), serde_json::to_string(mixed).unwrap());
    }

    #[test]
    fn test_repeated_patterns() {
        // Patterns that might benefit from or confuse SIMD operations
        let pattern1 = "abcd".repeat(100);
        assert_eq!(escape(&pattern1), serde_json::to_string(&pattern1).unwrap());

        let pattern2 = "a\"b\"".repeat(100);
        assert_eq!(escape(&pattern2), serde_json::to_string(&pattern2).unwrap());

        let pattern3 = "\t\n".repeat(100);
        assert_eq!(escape(&pattern3), serde_json::to_string(&pattern3).unwrap());
    }

    #[test]
    fn test_rxjs() {
        let mut sources = Vec::new();
        read_dir_recursive("node_modules/rxjs/src", &mut sources, |p| {
            matches!(p.extension().and_then(|e| e.to_str()), Some("ts"))
        })
        .unwrap();
        assert!(!sources.is_empty());
        sources.shuffle(&mut rand::rng());
        // Under Miri only a small random sample is checked to keep the run time bounded.
        let limit = if cfg!(miri) { 10 } else { sources.len() };
        for source in sources.iter().take(limit) {
            assert_matches_serde(source);
        }
    }

    #[test]
    fn test_sources() {
        for source in load_affine_sources().unwrap() {
            assert_matches_serde(&source);
        }
    }

    /// Loads the script sources under `fixtures/` in random order (sampled under Miri).
    fn load_affine_sources() -> Result<impl Iterator<Item = String>, std::io::Error> {
        let mut sources = Vec::new();
        read_dir_recursive("fixtures", &mut sources, |p| {
            matches!(
                p.extension().and_then(|e| e.to_str()),
                Some("ts") | Some("tsx") | Some("js") | Some("mjs") | Some("cjs")
            )
        })?;
        assert!(!sources.is_empty());
        let len = sources.len();
        sources.shuffle(&mut rand::rng());
        Ok(sources.into_iter().take(if cfg!(miri) { 10 } else { len }))
    }

    /// Walks `dir` recursively and appends the contents of every file accepted by `f`
    /// to `sources`.
    fn read_dir_recursive<P: AsRef<Path>, F: Fn(PathBuf) -> bool + Copy>(
        dir: P,
        sources: &mut Vec<String>,
        f: F,
    ) -> Result<(), std::io::Error> {
        for entry in read_dir(dir)? {
            let path = entry?.path();
            // `fs::metadata` follows symlinks, so links are classified by their target.
            let metadata = std::fs::metadata(&path)?;
            if metadata.is_dir() {
                read_dir_recursive(&path, sources, f)?;
            } else if metadata.is_file() && f(path.clone()) {
                sources.push(std::fs::read_to_string(&path)?);
            }
        }
        Ok(())
    }
}