json_escape/
token.rs

1//! Provides low-level, granular, token-based JSON string processing.
2//!
3//! This module offers the most fundamental building blocks for both escaping and
4//! unescaping. It provides iterators, [`UnescapeTokens`] and [`EscapeTokens`],
5//! that walk a byte/string slice and yield tokens. This approach is highly flexible
6//! and composable, allowing consumers to handle the data in a zero-copy,
7//! streaming fashion for literal (non-processed) parts.
8//!
9//! ## Unescaping
10//!
11//! The [`UnescapeTokens`] iterator yields [`UnescapedToken`]s, separating literal
12//! byte slices from single unescaped `char`s. This ensures that in the case of an
13//! error (e.g., an invalid escape sequence), all preceding valid literal parts have
14//! already been successfully yielded.
15//!
16//! ## Escaping
17//!
18//! The [`EscapeTokens`] iterator yields [`EscapedToken`]s, separating literal
19//! string slices from the `&'static str` representation of an escaped character.
20//! This allows for efficient, allocation-free iteration over a string to produce
21//! its JSON-escaped form.
22
23#[cfg(feature = "alloc")]
24use crate::DecodeUtf8Error;
25use crate::{
26    InvalidEscapeError, InvalidHexError, LoneSurrogateError, UnescapeError, UnescapeErrorKind,
27};
28use core::{
29    fmt::{self, Write as _},
30    iter::FusedIterator,
31};
32use memchr::memchr;
33
34#[cfg(feature = "alloc")]
35use alloc::{borrow::Cow, string::String, vec::Vec};
36
37//==============================================================================
38// Escaping
39//==============================================================================
40
41/// Creates an iterator that yields tokens of an escaped JSON string.
42///
43/// See the [module-level documentation](self) for more details.
44#[inline]
45pub fn escape_str(s: &str) -> EscapeTokens<'_> {
46    EscapeTokens {
47        bytes: s.as_bytes(),
48    }
49}
50
/// One piece of a JSON-escaped string.
///
/// Values of this type are the items produced by the [`EscapeTokens`] iterator.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EscapedToken<'a> {
    /// A run of the original input that is emitted unchanged.
    Literal(&'a str),
    /// The static text of an escape sequence (e.g., `r#"\n"#`) that stands in
    /// for one escaped character.
    Escaped(&'static str),
}
61
62impl<'a> EscapedToken<'a> {
63    #[inline(always)]
64    pub(crate) fn as_str(&self) -> &'a str {
65        match self {
66            EscapedToken::Literal(s) => s,
67            EscapedToken::Escaped(s) => s,
68        }
69    }
70}
71
72impl fmt::Display for EscapedToken<'_> {
73    #[inline]
74    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
75        f.write_str(self.as_str())
76    }
77}
78
/// Lazily tokenizes a string into [`EscapedToken`]s.
///
/// This is the low-level building block for producing the JSON-escaped form
/// of a string slice without allocating: literal runs are handed back as
/// borrowed slices of the input, and escape sequences as static strings.
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[derive(Debug, Clone)]
pub struct EscapeTokens<'a> {
    /// The part of the input that has not been tokenized yet.
    pub(crate) bytes: &'a [u8],
}
89
90impl<'a> EscapeTokens<'a> {
91    /// Creates a new tokenizing escaper for the given string slice.
92    #[inline]
93    pub const fn new(s: &'a str) -> Self {
94        Self {
95            bytes: s.as_bytes(),
96        }
97    }
98
99    #[inline(always)]
100    pub(crate) fn escape(byte: u8) -> Option<&'static str> {
101        ESCAPE_TABLE[byte as usize]
102    }
103
104    /// Splits the slice at the first byte that needs to be escaped.
105    ///
106    /// The first element of the returned tuple is the literal part, which is
107    /// guaranteed to be valid UTF-8. The second is the rest of the slice,
108    /// which starts with an escapable byte, or is empty.
109    ///
110    /// # SAFETY
111    ///
112    /// The input byte slice `bytes` must be valid UTF-8. This is because the
113    /// function uses `from_utf8_unchecked`, relying on the fact that all
114    /// escapable characters are single-byte ASCII and thus cannot occur in
115    /// the middle of a multi-byte UTF-8 sequence.
116    #[inline(always)]
117    pub(crate) unsafe fn split_at_escape(bytes: &[u8]) -> (&str, &[u8]) {
118        // Find the first byte that needs escaping.
119        let pos = match Self::find_escape_char(bytes) {
120            // Found a backslash, the literal is the part before it.
121            Some(p) => p,
122            // No more backslashes, the rest of the slice is a literal.
123            None => bytes.len(),
124        };
125
126        let (literal_bytes, rest) = bytes.split_at(pos);
127        // SAFETY: `find_escape_char` guarantees `pos` is on a UTF-8 boundary
128        // because escapable characters are single-byte ASCII.
129        (
130            unsafe { std::str::from_utf8_unchecked(literal_bytes) },
131            rest,
132        )
133    }
134
135    // Not public API. Exposed for test
136    #[doc(hidden)]
137    // This is the SIMD version, compiled only when the "simd" feature is enabled on nightly build.
138    #[cfg(all(feature = "simd", nightly))]
139    #[inline]
140    pub fn find_escape_char(bytes: &[u8]) -> Option<usize> {
141        use std::simd::{Simd, prelude::SimdPartialEq, prelude::SimdPartialOrd};
142
143        const LANES: usize = 16; // Process 16 bytes at a time (fits in SSE2/AVX)
144        let mut i = 0;
145
146        // SIMD main loop
147        while i + LANES <= bytes.len() {
148            // Load 16 bytes from the slice into a SIMD vector.
149            let chunk = Simd::<u8, LANES>::from_slice(&bytes[i..]);
150
151            // Create comparison vectors. These are effectively 16 copies of the byte.
152            let space_v = Simd::splat(b' ' - 1); // For the < ' ' check (i.e., <= 0x1F)
153            let quote_v = Simd::splat(b'"');
154            let slash_v = Simd::splat(b'\\');
155
156            // Perform all 16 comparisons at once. The result is a mask.
157            let lt_space_mask = chunk.simd_le(space_v);
158            let eq_quote_mask = chunk.simd_eq(quote_v);
159            let eq_slash_mask = chunk.simd_eq(slash_v);
160
161            // Combine the masks. A byte needs escaping if ANY of the conditions are true.
162            let combined_mask = lt_space_mask | eq_quote_mask | eq_slash_mask;
163
164            // Check if any lane in the combined mask is true.
165            if combined_mask.any() {
166                // If yes, find the index of the *first* true lane.
167                // trailing_zeros() on the bitmask gives us this index directly.
168                let first_match_index = combined_mask.to_bitmask().trailing_zeros() as usize;
169                return Some(i + first_match_index);
170            }
171
172            i += LANES;
173        }
174
175        // Handle the remaining bytes (if any) with the simple iterator method.
176        if i < bytes.len() {
177            if let Some(pos) = bytes[i..]
178                .iter()
179                .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
180            {
181                return Some(i + pos);
182            }
183        }
184
185        None
186    }
187
188    // Not public API. Exposed for test
189    #[doc(hidden)]
190    #[cfg(all(feature = "simd", not(nightly), target_arch = "x86_64"))]
191    #[inline]
192    pub fn find_escape_char(bytes: &[u8]) -> Option<usize> {
193        // This is the stable Rust path using explicit CPU intrinsics.
194        // It's guarded by cfg flags to only compile on x86_64 with the simd feature.
195        use std::arch::x86_64::*;
196
197        let mut i = 0;
198        const LANES: usize = 16; // SSE2 works on 128-bit registers, which is 16 bytes.
199
200        // On x86_64, we can tell the compiler to use SSE2 features in this specific function.
201        // This is safe because we've already checked the target architecture.
202        // SAFETY: calling this unsafe function is only safe if the caller ensures:
203        //  - the CPU supports SSE2, and
204        //  - i + LANES <= bytes.len()
205        #[target_feature(enable = "sse2")]
206        unsafe fn find_in_chunk(bytes: &[u8], i: usize) -> Option<usize> {
207            // Safety check: ensure the 16 bytes we will load are inside the slice.
208            // This is a debug-time assertion to help catch incorrect call.
209            debug_assert!(
210                i + LANES <= bytes.len(),
211                "find_in_chunk: attempted to load past end of slice"
212            );
213
214            // Load 16 bytes of data from the slice.
215            // SAFETY: caller must guarantee `i + LANES <= bytes.len()`. We assert that above.
216            let chunk = unsafe { _mm_loadu_si128(bytes.as_ptr().add(i) as *const _) };
217
218            // Create comparison vectors for quote and slash.
219            let quote_v = _mm_set1_epi8(b'"' as i8);
220            let slash_v = _mm_set1_epi8(b'\\' as i8);
221
222            // Emulate unsigned comparison for control characters
223            // Create a vector with the value 0x80 in each lane.
224            let bias = _mm_set1_epi8(0x80u8 as i8);
225            // Create the comparison vector for bytes < 0x20 (' ').
226            let space_v = _mm_set1_epi8(b' ' as i8);
227
228            // Bias both the input chunk and the comparison vector by XORing with 0x80.
229            let biased_chunk = _mm_xor_si128(chunk, bias);
230            let biased_space_v = _mm_xor_si128(space_v, bias);
231
232            // Now, a signed less-than comparison on the biased values gives the
233            // same result as an unsigned less-than on the original values.
234            let lt_space_mask = _mm_cmplt_epi8(biased_chunk, biased_space_v);
235
236            // Perform the equality comparisons (these are unaffected by signedness).
237            let eq_quote_mask = _mm_cmpeq_epi8(chunk, quote_v);
238            let eq_slash_mask = _mm_cmpeq_epi8(chunk, slash_v);
239
240            // Combine the results.
241            let combined_mask =
242                _mm_or_si128(lt_space_mask, _mm_or_si128(eq_quote_mask, eq_slash_mask));
243
244            // Create a bitmask to find the first match.
245            let mask = _mm_movemask_epi8(combined_mask);
246
247            if mask != 0 {
248                Some(i + mask.trailing_zeros() as usize)
249            } else {
250                None
251            }
252        }
253
254        if cfg!(target_feature = "sse2") {
255            // Main loop (vectorized)
256            while i + LANES <= bytes.len() {
257                // Safety: calling `find_in_chunk` is safe here because:
258                //  - we've checked CPU supports SSE2 via is_x86_feature_detected!
259                //  - loop condition ensures `i + LANES <= bytes.len()`, matching the debug_assert in the function.
260                if let Some(result) = unsafe { find_in_chunk(bytes, i) } {
261                    return Some(result);
262                }
263                i += LANES;
264            }
265        } else {
266            // CPU doesn't support SSE2: fall through to scalar path below.
267            // (We intentionally do not attempt to call the sse2 function.)
268        }
269
270        // Handle the remainder with the fast scalar lookup.
271        if i < bytes.len() {
272            if let Some(pos) = bytes[i..]
273                .iter()
274                .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
275            {
276                return Some(i + pos);
277            }
278        }
279
280        None
281    }
282
283    // Not public API. Exposed for test
284    // A fallback for when SIMD feature is off.
285    #[doc(hidden)]
286    #[cfg(not(feature = "simd"))]
287    #[inline]
288    pub fn find_escape_char(bytes: &[u8]) -> Option<usize> {
289        use core::mem::size_of;
290
291        const WORD_SIZE: usize = size_of::<usize>();
292        const THRESH: u8 = 0x20; // control threshold
293
294        // helper: repeat a byte across a usize (works for any usize width)
295        const fn repeat(b: u8) -> usize {
296            let mut m: usize = 0;
297            let mut i = 0;
298            while i < WORD_SIZE {
299                m = (m << 8) | (b as usize);
300                i += 1;
301            }
302            m
303        }
304
305        // Precompute masks as constants
306        const ONE_MASK: usize = repeat(0x01);
307        const MSB_MASK: usize = repeat(0x80);
308        const QUOTE_MASK: usize = repeat(b'"');
309        const SLASH_MASK: usize = repeat(b'\\');
310        const THR_MASK: usize = repeat(THRESH);
311
312        let mut i = 0usize;
313        while i + WORD_SIZE <= bytes.len() {
314            // SAFETY: we checked bounds; read_unaligned is allowed for any alignment.
315            let word = unsafe { (bytes.as_ptr().add(i) as *const usize).read_unaligned() };
316
317            // equality tests (SWAR zero-byte detection on XOR)
318            let xq = word ^ QUOTE_MASK;
319            let quote_bits = (xq.wrapping_sub(ONE_MASK) & !xq) & MSB_MASK;
320
321            let xs = word ^ SLASH_MASK;
322            let slash_bits = (xs.wrapping_sub(ONE_MASK) & !xs) & MSB_MASK;
323
324            // control: detect bytes < 0x20 using subtract+~word+msb trick
325            // If any byte b satisfies b < 0x20 then the corresponding MSB bit in control_bits is set.
326            let control_bits = (word.wrapping_sub(THR_MASK) & !word) & MSB_MASK;
327
328            // combined mask: MSB-bit set per candidate byte
329            let combined = quote_bits | slash_bits | control_bits;
330
331            if combined != 0 {
332                // Find earliest matching byte inside this word in a portable way:
333                // - on little-endian the least-significant set bit corresponds to the earliest byte
334                // - on big-endian the most-significant set bit corresponds to the earliest byte
335                let byte_index = if cfg!(target_endian = "little") {
336                    (combined.trailing_zeros() as usize) / 8
337                } else {
338                    (combined.leading_zeros() as usize) / 8
339                };
340                return Some(i + byte_index);
341            }
342
343            i += WORD_SIZE;
344        }
345
346        // tail bytes
347        if i < bytes.len() {
348            if let Some(pos) = bytes[i..]
349                .iter()
350                .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
351            {
352                return Some(i + pos);
353            }
354        }
355
356        None
357    }
358
359    #[cfg(all(feature = "simd", not(nightly), not(target_arch = "x86_64")))]
360    compile_error! { "simd requires nightly or target_arch = \"x86_64\"" }
361}
362
363impl<'a> Iterator for EscapeTokens<'a> {
364    type Item = EscapedToken<'a>;
365
366    #[inline]
367    fn next(&mut self) -> Option<Self::Item> {
368        if self.bytes.is_empty() {
369            return None;
370        }
371
372        if let Some(escaped) = Self::escape(self.bytes[0]) {
373            // --- Handle Escape ---
374            // An escapable byte is at the beginning of the slice.
375            self.bytes = &self.bytes[1..];
376            Some(EscapedToken::Escaped(escaped))
377        } else {
378            // --- Handle Literal ---
379            // SAFETY: Input is string
380            let (literal, rest) = unsafe { Self::split_at_escape(self.bytes) };
381            self.bytes = rest;
382            Some(EscapedToken::Literal(literal))
383        }
384    }
385
386    fn size_hint(&self) -> (usize, Option<usize>) {
387        if self.bytes.is_empty() {
388            (0, Some(0))
389        } else {
390            // We'll yield at least 1 slice, and at most `len` slices if every byte is escaped.
391            (1, Some(self.bytes.len()))
392        }
393    }
394}
395
396impl<'a> FusedIterator for EscapeTokens<'a> {}
397
398impl fmt::Display for EscapeTokens<'_> {
399    #[inline]
400    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
401        for token in self.clone() {
402            f.write_str(token.as_str())?;
403        }
404        Ok(())
405    }
406}
407
#[cfg(feature = "alloc")]
impl<'a> From<EscapeTokens<'a>> for Cow<'a, str> {
    /// Collects the escaped output into a `Cow<'a, str>`, borrowing whenever
    /// that is possible:
    ///
    /// - empty input yields `Cow::Borrowed("")`;
    /// - input that needs **no escaping** yields `Cow::Borrowed` with a slice
    ///   of the original string;
    /// - anything else is written into a freshly allocated `String` and
    ///   returned as `Cow::Owned`.
    fn from(mut iter: EscapeTokens<'a>) -> Self {
        let first = match iter.next() {
            Some(token) => token,
            None => return Cow::Borrowed(""),
        };

        // A single literal that consumed the whole input means nothing had to
        // be escaped, so the original slice can be returned as-is.
        if let EscapedToken::Literal(whole) = first {
            if iter.bytes.is_empty() {
                return Cow::Borrowed(whole);
            }
        }

        // More than one token: we must allocate. The remaining input length
        // is used as the initial capacity estimate.
        let head = first.as_str();
        let mut escaped = String::with_capacity(head.len() + iter.bytes.len());
        escaped.push_str(head);
        escaped.extend(iter);
        Cow::Owned(escaped)
    }
}
433
434//==============================================================================
435// Unescaping
436//==============================================================================
437
438/// Creates an iterator that yields tokens of an unescaped JSON string.
439///
440/// See the [module-level documentation](self) for more details.
441#[inline]
442pub fn unescape<I: AsRef<[u8]> + ?Sized>(input: &I) -> UnescapeTokens<'_> {
443    UnescapeTokens {
444        bytes: input.as_ref(),
445    }
446}
447
/// One piece of an unescaped JSON string.
///
/// Values of this type are the items produced by the [`UnescapeTokens`] iterator.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnescapedToken<'a> {
    /// A run of the original input bytes that needed no unescaping.
    Literal(&'a [u8]),
    /// The single character decoded from one escape sequence.
    Unescaped(char),
}
458
459impl UnescapedToken<'_> {
460    /// Returns a wrapper that implements [`fmt::Display`].
461    ///
462    /// If the token is a `Literal` containing invalid UTF-8, a `fmt::Error`
463    /// is returned, which will cause `format!` and friends to panic.
464    pub fn display_utf8(&self) -> DisplayUnescapedToken<'_> {
465        DisplayUnescapedToken {
466            token: self,
467            lossy: true,
468        }
469    }
470
471    /// Returns a wrapper that implements [`fmt::Display`] for lossy UTF-8 decoding.
472    ///
473    /// If the token is a `Literal` containing invalid UTF-8, it will be replaced
474    /// with the replacement character.
475    pub fn display_utf8_lossy(&self) -> DisplayUnescapedToken<'_> {
476        DisplayUnescapedToken {
477            token: self,
478            lossy: true,
479        }
480    }
481
482    #[inline(always)]
483    const fn len(&self) -> usize {
484        match self {
485            UnescapedToken::Literal(literal) => literal.len(),
486            UnescapedToken::Unescaped(ch) => ch.len_utf8(),
487        }
488    }
489}
490
/// Helper struct for safely displaying an [`UnescapedToken`].
///
/// Its [`fmt::Display`] implementation writes an unescaped character
/// directly and hands literal bytes to `crate::display_bytes_utf8`.
pub struct DisplayUnescapedToken<'a> {
    // The token to render.
    token: &'a UnescapedToken<'a>,
    // Forwarded to `crate::display_bytes_utf8` for literal tokens.
    // NOTE(review): presumably `true` selects replacement-character decoding
    // and `false` makes invalid UTF-8 an error — confirm against that helper.
    lossy: bool,
}
496
497impl fmt::Display for DisplayUnescapedToken<'_> {
498    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
499        match self.token {
500            UnescapedToken::Literal(bytes) => crate::display_bytes_utf8(bytes, f, self.lossy),
501            UnescapedToken::Unescaped(c) => f.write_char(*c),
502        }
503    }
504}
505
/// Lazily tokenizes a byte slice into [`UnescapedToken`]s.
///
/// This is the foundational, low-level unescaping API: runs of literal bytes
/// come back as borrowed slices, and each successfully parsed escape sequence
/// comes back as a `char`.
///
/// When parsing an escape fails, the iterator yields an `Err`, after which
/// every further call to `next()` returns `None`.
///
/// Created by [`UnescapeTokens::new`].
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[derive(Debug, Clone)]
pub struct UnescapeTokens<'a> {
    /// The part of the input that has not been processed yet.
    bytes: &'a [u8],
}
521
522impl<'a> UnescapeTokens<'a> {
523    /// Creates a new tokenizing unescaper for the given byte slice.
524    #[inline]
525    pub const fn new(bytes: &'a [u8]) -> Self {
526        Self { bytes }
527    }
528
529    /// Returns the remaining unprocessed slice of bytes.
530    ///
531    /// If the iterator encounters an `UnexpectedEof` error, this method can be
532    /// used to retrieve the incomplete segment that needs to be stitched with
533    /// the next chunk in a streaming context.
534    #[inline]
535    pub const fn remnant(&self) -> &'a [u8] {
536        self.bytes
537    }
538
539    /// Decodes the unescaped byte stream into a UTF-8 string.
540    ///
541    /// This method consumes the iterator and collects all resulting byte chunks
542    /// into a `Cow<[u8]>`, which is then validated as UTF-8. If an unescaping
543    /// error occurs, it's returned immediately. If the final sequence of bytes
544    /// is not valid UTF-8, a UTF-8 error is returned.
545    ///
546    /// This is optimized to return a `Cow::Borrowed` if no escapes were present
547    /// in the input, avoiding allocation.
548    ///
549    /// **Requires the `alloc` feature.**
550    #[cfg(feature = "alloc")]
551    pub fn decode_utf8(self) -> Result<Cow<'a, str>, DecodeUtf8Error> {
552        match self.try_into().map_err(DecodeUtf8Error::Unescape)? {
553            Cow::Borrowed(bytes) => str::from_utf8(bytes)
554                .map(Cow::Borrowed)
555                .map_err(DecodeUtf8Error::Utf8),
556            Cow::Owned(bytes) => String::from_utf8(bytes)
557                .map(Cow::Owned)
558                .map_err(|e| DecodeUtf8Error::Utf8(e.utf8_error())),
559        }
560    }
561
562    /// Decodes the unescaped byte stream lossily into a UTF-8 string.
563    ///
564    /// This is similar to [`UnescapeTokens::decode_utf8`] but replaces any invalid UTF-8 sequences
565    /// with the replacement character (`U+FFFD`) instead of returning an error.
566    ///
567    /// An `UnescapeError` can still be returned if the JSON escaping itself is invalid.
568    ///
569    /// **Requires the `alloc` feature.**
570    #[cfg(feature = "alloc")]
571    pub fn decode_utf8_lossy(self) -> Result<Cow<'a, str>, UnescapeError> {
572        use crate::decode_utf8_lossy;
573
574        Ok(decode_utf8_lossy(self.try_into()?))
575    }
576
577    /// Returns a wrapper that implements [`fmt::Display`].
578    ///
579    /// If an unescaping error or invalid UTF-8 sequence is encountered,
580    /// a `fmt::Error` is returned, which will cause `format!` and friends to panic.
581    pub fn display_utf8(self) -> DisplayUnescapeTokens<'a> {
582        DisplayUnescapeTokens {
583            inner: self,
584            lossy: false,
585        }
586    }
587
588    /// Returns a wrapper that implements [`fmt::Display` for lossy UTF-8 decoding.
589    ///
590    /// Invalid UTF-8 sequences will be replaced with the replacement character.
591    /// An unescaping error will still result in a `fmt::Error`.
592    pub fn display_utf8_lossy(self) -> DisplayUnescapeTokens<'a> {
593        DisplayUnescapeTokens {
594            inner: self,
595            lossy: true,
596        }
597    }
598
599    /// Splits the slice at the first backslash `\`.
600    ///
601    /// The first element of the returned tuple is the literal part before the
602    /// backslash. The second is the rest of the slice, which starts with the
603    /// backslash, or is empty if no backslash was found.
604    #[inline(always)]
605    pub(crate) fn split_at_escape(bytes: &'a [u8]) -> (&'a [u8], &'a [u8]) {
606        let pos = match memchr(b'\\', bytes) {
607            // Found a backslash, the literal is the part before it.
608            Some(p) => p,
609            // No more backslashes, the rest of the slice is a literal.
610            None => bytes.len(),
611        };
612
613        let (literal, rest) = bytes.split_at(pos);
614        (literal, rest)
615    }
616
617    /// Parses any escape sequence (`\uXXXX`, '\n', e.t.c).
618    /// The input slice `bytes` must be positioned *after* the `\`.
619    ///
620    /// On error, returns an `Err` and the input slice may be modified.
621    #[inline(always)]
622    pub(crate) fn handle_escape(bytes: &mut &'a [u8]) -> Result<char, UnescapeError> {
623        match bytes.first() {
624            Some(b'u') => {
625                // Advance past 'u' and parse unicode
626                *bytes = &bytes[1..];
627                Self::handle_unicode_escape(bytes)
628            }
629            Some(&byte) => {
630                // Simple 1-char escape like \n, \t, etc.
631                match UNESCAPE_TABLE[byte as usize] {
632                    Some(c) => {
633                        *bytes = &bytes[1..];
634                        Ok(c)
635                    }
636                    None => {
637                        Err(UnescapeError {
638                            kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError {
639                                found: byte,
640                            }),
641                            // The invalid character is 1 byte after '\'.
642                            offset: 1,
643                        })
644                    }
645                }
646            }
647            None => {
648                // Dangling backslash at the end of input
649                Err(UnescapeError {
650                    kind: UnescapeErrorKind::UnexpectedEof,
651                    // EOF occurred 1 byte after '\'.
652                    offset: 1,
653                })
654            }
655        }
656    }
657
658    /// Parses a unicode escape sequence `\uXXXX` which may be a surrogate pair.
659    /// The input slice `bytes` must be positioned *after* the `\u`.
660    ///
661    /// On success, returns the parsed `char` and advances the slice.
662    /// On error, returns an `Err` and the input slice may be modified.
663    #[inline(always)]
664    fn handle_unicode_escape(bytes: &mut &'a [u8]) -> Result<char, UnescapeError> {
665        // Parse first 4 hex digits (\uXXXX)
666        //
667        // The slice starts *after* '\u'. The first hex digit is at offset 2 from '\'.
668        let first = Self::parse_hex4(bytes, 2)?;
669        *bytes = &bytes[4..];
670
671        // High surrogate → must be followed by another \uXXXX low surrogate
672        if (0xD800..=0xDBFF).contains(&first) {
673            // A high surrogate must be followed by a `\u` sequence.
674            // We check for at least 2 bytes and that they are `\` and `u`.
675            #[allow(clippy::get_first)]
676            match (bytes.get(0), bytes.get(1)) {
677                (Some(b'\\'), Some(b'u')) => {
678                    // We have `\u`, so we now expect 4 more hex digits for the low surrogate.
679                    // The slice for `parse_hex4` starts after `\u`, and the overall offset
680                    // from the beginning of the original escape is 8 (`\uXXXX\u`).
681                    match Self::parse_hex4(&bytes[2..], 8) {
682                        Ok(low) if (0xDC00..=0xDFFF).contains(&low) => {
683                            // Valid low surrogate found. Combine them.
684                            let high_t = first as u32;
685                            let low_t = low as u32;
686                            let code = 0x10000 + (((high_t - 0xD800) << 10) | (low_t - 0xDC00));
687                            let result_char = char::from_u32(code).expect(
688                                "valid surrogate pair math should always produce a valid char",
689                            );
690
691                            // Advance the slice past the entire low surrogate sequence.
692                            *bytes = &bytes[6..];
693                            return Ok(result_char);
694                        }
695                        Ok(_) => {
696                            // We parsed `\uXXXX`, but the value was not a valid low surrogate.
697                            // This makes the initial high surrogate a "lone surrogate".
698                            return Err(UnescapeError {
699                                kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError {
700                                    surrogate: first,
701                                }),
702                                offset: 6,
703                            });
704                        }
705                        Err(err) => {
706                            // `parse_hex4` failed (e.g., incomplete hex, invalid char).
707                            // Propagate the error.
708                            return Err(err);
709                        }
710                    }
711                }
712                (Some(b'\\'), None) => {
713                    return Err(UnescapeError {
714                        kind: UnescapeErrorKind::UnexpectedEof,
715                        offset: 7,
716                    });
717                }
718                (None, None) => {
719                    // The input ended immediately after the high surrogate.
720                    return Err(UnescapeError {
721                        kind: UnescapeErrorKind::UnexpectedEof,
722                        offset: 6,
723                    });
724                }
725                // Something else after high surrogate → LoneSurrogate
726                _ => {
727                    // There are other characters, but they don't form a `\u` sequence.
728                    return Err(UnescapeError {
729                        kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError {
730                            surrogate: first,
731                        }),
732                        offset: 6,
733                    });
734                }
735            }
736        }
737
738        // Not a surrogate → normal path
739        match char::from_u32(first as u32) {
740            Some(c) => Ok(c),
741            None => {
742                // The parsed value is not a valid char (e.g., it's a lone low surrogate).
743                Err(UnescapeError {
744                    kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: first }),
745                    // The error is detected after consuming `\uXXXX` (6 bytes total from '\').
746                    offset: 6,
747                })
748            }
749        }
750    }
751
752    /// Parses 4 hex digits, optimized for the success path.
753    #[inline(always)]
754    fn parse_hex4(slice: &[u8], base_offset: u8) -> Result<u16, UnescapeError> {
755        // --- HOT PATH ---
756        // This is the path we expect to take most of the time.
757        if let Some(chunk) = slice.get(..4) {
758            // By slicing to 4, we've performed a single bounds check.
759            // The compiler now knows any access from chunk[0] to chunk[3] is safe,
760            // so it will not generate additional bounds checks.
761
762            // We can now safely access the bytes.
763            let b0 = chunk[0];
764            let b1 = chunk[1];
765            let b2 = chunk[2];
766            let b3 = chunk[3];
767
768            // Use the LUT to get the values.
769            if let (Some(v0), Some(v1), Some(v2), Some(v3)) = (
770                HEX[b0 as usize],
771                HEX[b1 as usize],
772                HEX[b2 as usize],
773                HEX[b3 as usize],
774            ) {
775                // All characters are valid hex, combine and return.
776                let result = (v0 as u16) << 12 | (v1 as u16) << 8 | (v2 as u16) << 4 | (v3 as u16);
777                return Ok(result);
778            }
779
780            // If we're here, it means the slice was long enough, but one
781            // of the characters was not a valid hex digit. Fall through to the cold path
782            // to correctly identify which character was invalid.
783        }
784
785        // --- COLD PATH ---
786        // This path handles all errors. It's marked as `#[cold]` to hint to the
787        // compiler that it's less frequently executed.
788        #[cold]
789        fn handle_error(slice: &[u8], base_offset: u8) -> UnescapeError {
790            // Loop through the bytes we *do* have.
791            for (i, &b) in slice.iter().enumerate() {
792                if HEX[b as usize].is_none() {
793                    // We found an invalid hex character before running out of bytes.
794                    return UnescapeError {
795                        kind: UnescapeErrorKind::InvalidHex(InvalidHexError { found: b }),
796                        offset: base_offset + i as u8,
797                    };
798                }
799            }
800
801            // If the loop completes, all available characters were valid,
802            // but there weren't enough of them.
803            UnescapeError {
804                kind: UnescapeErrorKind::UnexpectedEof,
805                // The error is at the position of the first *missing* character.
806                offset: base_offset + slice.len() as u8,
807            }
808        }
809
810        Err(handle_error(slice, base_offset))
811    }
812}
813
814impl<'a> Iterator for UnescapeTokens<'a> {
815    type Item = Result<UnescapedToken<'a>, UnescapeError>;
816
817    #[inline]
818    fn next(&mut self) -> Option<Self::Item> {
819        if self.bytes.is_empty() {
820            return None;
821        }
822
823        // Check if the next part is an escape sequence or a literal.
824        if self.bytes[0] == b'\\' {
825            // --- Handle Escape Sequence ---
826            Some({
827                // TODO: Try abstract... repeated in explicit
828                // rest starts with '\\'
829                let mut remainder = &self.bytes[1..];
830                match UnescapeTokens::handle_escape(&mut remainder) {
831                    Ok(unescaped_char) => {
832                        self.bytes = remainder;
833                        Ok(UnescapedToken::Unescaped(unescaped_char))
834                    }
835                    Err(err) => Err(err),
836                }
837            })
838        } else {
839            // --- Handle Literal ---
840            let (literal, rest) = Self::split_at_escape(self.bytes);
841            self.bytes = rest;
842            Some(Ok(UnescapedToken::Literal(literal)))
843        }
844    }
845
846    fn size_hint(&self) -> (usize, Option<usize>) {
847        if self.bytes.is_empty() {
848            (0, Some(0))
849        } else {
850            // Worst-case is \uXXXX -> 1 byte, so 6 -> 1.
851            (
852                self.bytes.len().saturating_add(1) / 6,
853                Some(self.bytes.len()),
854            )
855        }
856    }
857}
858
859impl<'a> FusedIterator for UnescapeTokens<'a> {}
860
#[cfg(feature = "alloc")]
impl<'a> TryFrom<UnescapeTokens<'a>> for Cow<'a, [u8]> {
    type Error = UnescapeError;

    /// Gathers all unescaped bytes of `value` into a `Cow<'a, [u8]>`.
    ///
    /// The result is `Cow::Borrowed` when the input is empty or consists of one
    /// literal run (no allocation). In every other case the bytes are copied
    /// into a `Vec` and returned as `Cow::Owned`.
    ///
    /// # Errors
    ///
    /// The first error yielded by the iterator is returned immediately; bytes
    /// collected up to that point are discarded.
    fn try_from(mut value: UnescapeTokens<'a>) -> Result<Self, Self::Error> {
        let first = match value.next() {
            Some(item) => item?,
            None => return Ok(Cow::Borrowed(b"")),
        };

        // A lone literal that used up the whole input can be handed back as-is.
        if let UnescapedToken::Literal(literal) = first {
            if value.bytes.is_empty() {
                return Ok(Cow::Borrowed(literal));
            }
        }

        // An escape was decoded or more tokens follow: an owned buffer is needed.
        // Capacity is the first token's length plus the bytes still unread.
        let mut out = Vec::with_capacity(first.len() + value.bytes.len());
        for item in core::iter::once(Ok(first)).chain(value) {
            match item? {
                UnescapedToken::Literal(bytes) => out.extend_from_slice(bytes),
                UnescapedToken::Unescaped(ch) => append_char(&mut out, ch),
            }
        }

        Ok(Cow::Owned(out))
    }
}
899
900/// A wrapper struct for implementing `fmt::Display` on an [`UnescapeTokens`] iterator.
901pub struct DisplayUnescapeTokens<'a> {
902    inner: UnescapeTokens<'a>,
903    lossy: bool,
904}
905
906impl<'a> fmt::Display for DisplayUnescapeTokens<'a> {
907    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
908        for chunk_result in self.inner.clone() {
909            match chunk_result {
910                Ok(token) => {
911                    let display_chunk = DisplayUnescapedToken {
912                        token: &token,
913                        lossy: self.lossy,
914                    };
915                    write!(f, "{}", display_chunk)?;
916                }
917                Err(_) => return Err(fmt::Error), // Signal error to formatter
918            }
919        }
920        Ok(())
921    }
922}
923
924// =============================================================================
925// Utilities
926// =============================================================================
927
// Byte-indexed lookup table of JSON escape sequences.
//
// `ESCAPE_TABLE[b]` is `Some(escape)` when byte `b` has to be replaced by the
// static string `escape` inside a JSON string, and `None` when the byte can be
// emitted unchanged.
const ESCAPE_TABLE: [Option<&'static str>; 256] = {
    // Escapes for the control bytes 0x00..=0x1F, positioned by byte value.
    // Backspace, tab, line feed, form feed and carriage return use their short
    // two-character forms; every other control byte is spelled out as `\u00XX`.
    const CONTROL_ESCAPES: [&str; 0x20] = [
        r"\u0000", r"\u0001", r"\u0002", r"\u0003", r"\u0004", r"\u0005", r"\u0006", r"\u0007",
        r"\b", r"\t", r"\n", r"\u000b", r"\f", r"\r", r"\u000e", r"\u000f",
        r"\u0010", r"\u0011", r"\u0012", r"\u0013", r"\u0014", r"\u0015", r"\u0016", r"\u0017",
        r"\u0018", r"\u0019", r"\u001a", r"\u001b", r"\u001c", r"\u001d", r"\u001e", r"\u001f",
    ];

    let mut table: [Option<&'static str>; 256] = [None; 256];

    // `for` loops are not available in const evaluation, hence the manual counter.
    let mut byte = 0;
    while byte < CONTROL_ESCAPES.len() {
        table[byte] = Some(CONTROL_ESCAPES[byte]);
        byte += 1;
    }

    // The two printable characters that must always be escaped.
    table[b'"' as usize] = Some(r#"\""#);
    table[b'\\' as usize] = Some(r"\\");

    table
};
978
979// Not public API. Exposed for test
980#[doc(hidden)]
981// A simple boolean-like lookup table for SIMD.
982// 0 = no escape needed, 1 = escape needed.
983// This is very compact (256 bytes) and fits easily in the L1 cache.
984#[allow(unused)]
985pub const ESCAPE_DECISION_TABLE: [u8; 256] = {
986    let mut table = [0u8; 256];
987    let mut i = 0;
988    while i < 256 {
989        if ESCAPE_TABLE[i].is_some() {
990            table[i] = 1;
991        }
992        i += 1;
993    }
994    table
995};
996
// Unescape table: maps the byte that follows a backslash to the character the
// two-byte escape stands for. `None` means the byte does not form one of the
// simple escapes listed here.
const UNESCAPE_TABLE: [Option<char>; 256] = {
    // (byte after the backslash, decoded character)
    const SIMPLE_ESCAPES: [(u8, char); 8] = [
        (b'"', '"'),
        (b'\\', '\\'),
        (b'/', '/'),
        (b'b', '\x08'),
        (b'f', '\x0C'),
        (b'n', '\n'),
        (b'r', '\r'),
        (b't', '\t'),
    ];

    let mut table: [Option<char>; 256] = [None; 256];
    // `for` loops are not available in const evaluation, hence the manual counter.
    let mut index = 0;
    while index < SIMPLE_ESCAPES.len() {
        let (byte, decoded) = SIMPLE_ESCAPES[index];
        table[byte as usize] = Some(decoded);
        index += 1;
    }
    table
};
1010
// --- Look-Up Table for Hex Decoding ---
// `HEX[b]` is the numeric value (0..=15) of ASCII hex digit `b`, in either
// letter case, and `None` for every byte that is not a hex digit.
const HEX: [Option<u8>; 256] = {
    let mut table: [Option<u8>; 256] = [None; 256];

    // Decimal digits '0'..='9' map to 0..=9.
    let mut digit = 0u8;
    while digit < 10 {
        table[(b'0' + digit) as usize] = Some(digit);
        digit += 1;
    }

    // Letters 'a'..='f' and 'A'..='F' both map to 10..=15.
    let mut letter = 0u8;
    while letter < 6 {
        table[(b'a' + letter) as usize] = Some(10 + letter);
        table[(b'A' + letter) as usize] = Some(10 + letter);
        letter += 1;
    }

    table
};
1026
// Appends the UTF-8 encoding of `c` to `buf`.
// The buffer is grown by exactly the encoded length and the character is then
// encoded straight into the new tail, without an intermediate stack buffer.
#[inline]
pub(crate) fn append_char(buf: &mut Vec<u8>, c: char) {
    let start = buf.len();
    let end = start + c.len_utf8();
    // Zero-fill the tail first so there is an initialized slice to encode into.
    buf.resize(end, 0);
    c.encode_utf8(&mut buf[start..end]);
}
1037
1038//==============================================================================
1039// Iterator Trait Implementations
1040//==============================================================================
1041
#[cfg(feature = "alloc")]
mod iter_traits {
    use super::{EscapedToken, UnescapedToken, append_char};
    use alloc::string::String;
    use alloc::vec::Vec;

    /// Builds a `String` by concatenating the text of every escaped token.
    impl<'a> FromIterator<EscapedToken<'a>> for String {
        #[inline]
        fn from_iter<I>(iter: I) -> Self
        where
            I: IntoIterator<Item = EscapedToken<'a>>,
        {
            let mut out = String::new();
            out.extend(iter);
            out
        }
    }

    /// Appends the text of every escaped token to an existing `String`.
    impl<'a> Extend<EscapedToken<'a>> for String {
        #[inline]
        fn extend<I>(&mut self, iter: I)
        where
            I: IntoIterator<Item = EscapedToken<'a>>,
        {
            for token in iter {
                self.push_str(token.as_str());
            }
        }
    }

    /// Builds a byte vector from the bytes of every unescaped token.
    impl<'a> FromIterator<UnescapedToken<'a>> for Vec<u8> {
        #[inline]
        fn from_iter<I>(iter: I) -> Self
        where
            I: IntoIterator<Item = UnescapedToken<'a>>,
        {
            let mut bytes = Vec::new();
            bytes.extend(iter);
            bytes
        }
    }

    /// Appends the bytes of every unescaped token to an existing byte vector.
    ///
    /// Literal runs are copied verbatim; decoded characters are appended in
    /// their UTF-8 encoding.
    impl<'a> Extend<UnescapedToken<'a>> for Vec<u8> {
        #[inline]
        fn extend<I>(&mut self, iter: I)
        where
            I: IntoIterator<Item = UnescapedToken<'a>>,
        {
            for token in iter {
                match token {
                    UnescapedToken::Literal(run) => self.extend_from_slice(run),
                    UnescapedToken::Unescaped(decoded) => append_char(self, decoded),
                }
            }
        }
    }
}
1089
#[cfg(test)]
mod tests {
    use super::*;

    /// What `UnescapeTokens::next` returns, named once for the helpers below.
    type Yielded<'a> = Option<Result<UnescapedToken<'a>, UnescapeError>>;

    /// Expected `next()` value for a literal run.
    fn literal(bytes: &[u8]) -> Yielded<'_> {
        Some(Ok(UnescapedToken::Literal(bytes)))
    }

    /// Expected `next()` value for one decoded escape.
    fn unescaped(c: char) -> Yielded<'static> {
        Some(Ok(UnescapedToken::Unescaped(c)))
    }

    // --- Unescape Tests ---

    #[test]
    fn test_empty_string() {
        let mut tokens = UnescapeTokens::new(b"");
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn test_pure_literal() {
        let mut tokens = UnescapeTokens::new(b"hello world");
        assert_eq!(tokens.next(), literal(b"hello world"));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn test_simple_escapes() {
        let mut tokens = UnescapeTokens::new(b"a\\nb\\tc");
        let expected = [
            literal(b"a"),
            unescaped('\n'),
            literal(b"b"),
            unescaped('\t'),
            literal(b"c"),
            None,
        ];
        for want in expected {
            assert_eq!(tokens.next(), want);
        }
    }

    #[test]
    fn test_starts_with_escape() {
        let mut tokens = UnescapeTokens::new(b"\\nhello");
        for want in [unescaped('\n'), literal(b"hello"), None] {
            assert_eq!(tokens.next(), want);
        }
    }

    #[test]
    fn test_ends_with_escape() {
        let mut tokens = UnescapeTokens::new(b"hello\\n");
        for want in [literal(b"hello"), unescaped('\n'), None] {
            assert_eq!(tokens.next(), want);
        }
    }

    #[test]
    fn test_unicode_and_surrogate() {
        let mut tokens = UnescapeTokens::new(b"A is \\u0041, smiley is \\uD83D\\uDE00!");
        let expected = [
            literal(b"A is "),
            unescaped('A'),
            literal(b", smiley is "),
            // The surrogate pair decodes to a single character.
            unescaped('😀'),
            literal(b"!"),
            None,
        ];
        for want in expected {
            assert_eq!(tokens.next(), want);
        }
    }

    #[test]
    fn test_invalid_escape_yields_literal_first() {
        let mut tokens = UnescapeTokens::new(b"ValidPart\\zInvalid");
        // The valid literal in front of the bad escape must come out first.
        assert_eq!(tokens.next(), literal(b"ValidPart"));

        // Only then is the error reported.
        let err = tokens.next().unwrap().unwrap_err();
        let expected = UnescapeError {
            kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'z' }),
            offset: 1,
        };
        assert_eq!(err, expected);

        // Nothing was consumed, so the iterator keeps reporting the same error.
        assert_eq!(tokens.remnant(), b"\\zInvalid");
        assert_eq!(tokens.next(), Some(Err(err)));
    }

    #[test]
    fn test_sticky_error_behavior() {
        let mut tokens = UnescapeTokens::new(b"a\\zb");
        assert_eq!(tokens.next(), literal(b"a"));

        // First report of the error.
        let first = tokens.next().unwrap().unwrap_err();
        assert_eq!(
            first.kind,
            UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'z' })
        );
        assert_eq!(tokens.remnant(), b"\\zb");

        // A second call must produce an identical error...
        let second = tokens.next().unwrap().unwrap_err();
        assert_eq!(first, second);
        // ...and must leave the remnant where it was.
        assert_eq!(tokens.remnant(), b"\\zb");
    }

    #[test]
    fn test_incomplete_escape_at_end() {
        let mut tokens = UnescapeTokens::new(b"ValidPart\\u12");
        assert_eq!(tokens.next(), literal(b"ValidPart"));

        // The remnant is already positioned at the escape before the error is pulled.
        assert_eq!(tokens.remnant(), b"\\u12");

        let err = tokens.next().unwrap().unwrap_err();
        let expected = UnescapeError {
            kind: UnescapeErrorKind::UnexpectedEof,
            offset: 4,
        };
        assert_eq!(err, expected);

        assert_eq!(tokens.remnant(), b"\\u12");
        assert_eq!(tokens.next(), Some(Err(err)));
    }

    #[test]
    fn test_dangling_backslash() {
        let mut tokens = UnescapeTokens::new(b"end with \\");
        assert_eq!(tokens.next(), literal(b"end with "));

        let err = tokens.next().unwrap().unwrap_err();
        let expected = UnescapeError {
            kind: UnescapeErrorKind::UnexpectedEof,
            offset: 1,
        };
        assert_eq!(err, expected);
        assert_eq!(tokens.next(), Some(Err(err)));
    }

    #[test]
    fn test_display_unescape_tokens() {
        let tokens = UnescapeTokens::new(b"hello \\u0041\\nworld");
        let shown = tokens.display_utf8();
        assert_eq!(alloc::format!("{}", shown), "hello A\nworld");
    }

    #[test]
    fn test_display_unescape_error() {
        let tokens = UnescapeTokens::new(b"hello\\z");
        let mut out = String::new();
        write!(out, "{}", tokens.display_utf8_lossy()).unwrap_err();
        // Formatting fails without panicking; the output is simply cut short.
        // Its exact content may vary, so only the valid prefix is checked.
        assert!(out.starts_with("hello"));
    }

    // --- Escape Tests ---

    #[test]
    fn test_escape_no_escapes() {
        let mut tokens = EscapeTokens::new("hello world");
        assert_eq!(tokens.next(), Some(EscapedToken::Literal("hello world")));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn test_escape_simple() {
        let mut tokens = EscapeTokens::new("hello\nworld");
        let expected = [
            Some(EscapedToken::Literal("hello")),
            Some(EscapedToken::Escaped(r#"\n"#)),
            Some(EscapedToken::Literal("world")),
            None,
        ];
        for want in expected {
            assert_eq!(tokens.next(), want);
        }
    }

    #[test]
    fn test_display_escape_tokens() {
        let tokens = EscapeTokens::new("a\"b\tc");
        assert_eq!(alloc::format!("{}", tokens), r#"a\"b\tc"#);
    }

    // --- Cow Conversion Tests ---

    #[cfg(feature = "alloc")]
    #[test]
    fn test_escape_to_cow_borrowed() {
        let tokens = EscapeTokens::new("no escapes here");
        let escaped: Cow<'_, str> = tokens.into();
        assert!(matches!(escaped, Cow::Borrowed(_)));
        assert_eq!(escaped, "no escapes here");
    }

    #[cfg(feature = "alloc")]
    #[test]
    fn test_escape_to_cow_owned() {
        let tokens = EscapeTokens::new("has\n an escape");
        let escaped: Cow<'_, str> = tokens.into();
        assert!(matches!(escaped, Cow::Owned(_)));
        assert_eq!(escaped, r#"has\n an escape"#);
    }

    #[cfg(feature = "alloc")]
    #[test]
    fn test_unescape_to_cow_borrowed() {
        let tokens = UnescapeTokens::new(b"no escapes here");
        let unescaped_bytes: Cow<'_, [u8]> = tokens.try_into().unwrap();
        assert!(matches!(unescaped_bytes, Cow::Borrowed(_)));
        assert_eq!(*unescaped_bytes, *b"no escapes here");
    }

    #[cfg(feature = "alloc")]
    #[test]
    fn test_unescape_to_cow_owned() {
        let tokens = UnescapeTokens::new(b"has\\n an escape");
        let unescaped_bytes: Cow<'_, [u8]> = tokens.try_into().unwrap();
        assert!(matches!(unescaped_bytes, Cow::Owned(_)));
        assert_eq!(*unescaped_bytes, *b"has\n an escape");
    }
}
1301
#[cfg(test)]
mod find_escape_char_tests {
    use std::format;

    use super::{ESCAPE_DECISION_TABLE, EscapeTokens};

    /// Runs `EscapeTokens::find_escape_char` on `input` and compares the result
    /// with `expected`, naming the case in the failure message.
    fn run_test(input: &str, expected: Option<usize>, case_name: &str) {
        let result = EscapeTokens::find_escape_char(input.as_bytes());
        assert_eq!(result, expected, "Failed test case: '{}'", case_name);
    }

    /// Inputs without any byte that needs escaping must report `None`.
    #[test]
    fn test_no_escapes() {
        run_test("", None, "Empty string");
        run_test("Hello, world!", None, "Simple ASCII");

        // Exactly one 16-byte chunk with nothing after it. The length is asserted
        // so the fixture cannot silently drift away from the boundary it targets.
        let exactly_16 = "0123456789abcdef";
        assert_eq!(exactly_16.len(), 16);
        run_test(exactly_16, None, "16-byte ASCII");

        // This string is 31 bytes long despite its wording: one full chunk plus a
        // 15-byte remainder.
        run_test("This string is exactly 16 bytes", None, "31-byte ASCII");
        run_test(
            "This string is over 16 bytes long now",
            None,
            "Over 16-byte ASCII",
        );

        // The original source of the bug: non-ASCII UTF-8 characters.
        // This ensures the signedness bug is truly fixed.
        run_test("Hello, éàçüö!", None, "Non-ASCII UTF-8");
        run_test("Testing with emojis 😀❤️✅", None, "Emojis");
    }

    /// A single escapable byte is found at its exact index.
    #[test]
    fn test_single_escapes() {
        run_test("\"", Some(0), "Quote at start");
        run_test("Hello \" world", Some(6), "Quote in middle");
        run_test("Hello\\", Some(5), "Backslash at end");
        run_test("\n", Some(0), "Control char (newline) at start");
        run_test("Hello\tworld", Some(5), "Control char (tab) in middle");
        run_test(
            "Control char at end\u{08}",
            Some(19),
            "Control char (backspace) at end",
        );
    }

    /// With several escapable bytes present, the first one wins.
    #[test]
    fn test_finds_first_of_multiple() {
        // This confirms it always finds the *first* match, not a later one.
        run_test("a\"b\\c\nd", Some(1), "Finds first quote");
        run_test("ab\\c\"d\ne", Some(2), "Finds first backslash");
        run_test("abc\nd\"e\\f", Some(3), "Finds first control char");
        run_test("\"\n\\", Some(0), "Multiple escapes at start");
    }

    /// Escapes placed right around the 16-byte chunk boundary.
    #[test]
    fn test_simd_chunk_boundaries() {
        // These tests are critical for verifying the SIMD logic. A chunk is 16 bytes.
        let s15 = "a".repeat(15);
        let s16 = "a".repeat(16);
        let s17 = "a".repeat(17);

        // Escape at the exact end of the first 16-byte chunk
        run_test(&format!("{}\"", s15), Some(15), "Escape at index 15");

        // Escape at the exact start of the second 16-byte chunk
        run_test(&format!("{}\n", s16), Some(16), "Escape at index 16");

        // Escape within the second chunk
        run_test(&format!("{}\t", s17), Some(17), "Escape at index 17");

        // A long string with an escape several chunks in
        let long = "a".repeat(40);
        run_test(
            &format!("{}\\\\", long),
            Some(40),
            "Escape deep in a long string",
        );
    }

    /// Inputs whose length is not a multiple of 16 bytes.
    #[test]
    fn test_remainder_logic() {
        // These tests ensure the scalar fallback logic works correctly for inputs
        // that are not a multiple of 16 bytes long.

        // String shorter than 16 bytes
        run_test("short\nstring", Some(5), "Short string with escape");
        run_test("no escapes", None, "Short string no escape");

        // String with 17 bytes (16 for SIMD, 1 for remainder)
        let s16 = "a".repeat(16);
        run_test(
            &format!("{}\"", s16),
            Some(16),
            "Escape in 1-byte remainder",
        );

        // String with 31 bytes (16 for SIMD, 15 for remainder)
        let s15 = "b".repeat(15);
        run_test(
            &format!("{}{}\t", s15, s15),
            Some(30),
            "Escape at end of 15-byte remainder",
        );
    }

    /// Every possible byte value, checked against `ESCAPE_DECISION_TABLE`.
    #[test]
    fn test_all_escapable_bytes_individually() {
        // This is the ultimate test. It iterates through all 256 possible byte values
        // and confirms that our function's decision matches the ESCAPE_DECISION_TABLE.
        let prefix = "0123456789abcdef"; // A 16-byte safe prefix to engage the SIMD loop.

        for byte_val in 0..=255u8 {
            // We can't create a &str from invalid UTF-8, so we work with byte slices.
            let mut test_bytes = prefix.as_bytes().to_vec();
            test_bytes.push(byte_val);

            let result = EscapeTokens::find_escape_char(&test_bytes);
            let expected_to_escape = ESCAPE_DECISION_TABLE[byte_val as usize] == 1;

            if expected_to_escape {
                // If this byte SHOULD be escaped, we expect to find it at index 16.
                assert_eq!(
                    result,
                    Some(16),
                    "Failed to find required escape for byte 0x{:02X}",
                    byte_val
                );
            } else {
                // If this byte should NOT be escaped, we expect to find nothing.
                assert_eq!(
                    result, None,
                    "Incorrectly found an escape for byte 0x{:02X}",
                    byte_val
                );
            }
        }
    }
}
json_escape/token.rs

json_escape/
token.rs