json_escape/
lib.rs

1//! # Streaming JSON String Escape/Unescape
2//!
3//! Welcome to a highly efficient, `no_std` compatible library for handling JSON string escaping and unescaping. This crate provides iterator-based tools that process strings on the fly, avoiding heap allocations for the entire result. It's designed for performance-critical applications, such as parsing large JSON files or working in memory-constrained environments. ⚡
4//!
5//! The core of the library is two iterator structs:
6//! - **[`Escape`]**: Takes a string slice (`&str`) and yields escaped string slices ready for JSON serialization.
7//! - **[`Unescape`]**: Takes a byte slice (`&[u8]`) representing the content of a JSON string and yields the decoded byte slices.
8//!
9//! ## Key Features
10//! - **Zero-Copy Slicing**: For sequences of characters that don't need modification, the iterators yield slices that borrow directly from the input, avoiding unnecessary data copying.
11//! - **Comprehensive JSON Support**: Correctly handles all standard JSON escapes: `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`.
12//! - **Full Unicode Handling**: Correctly decodes `\uXXXX` sequences, including full support for UTF-16 surrogate pairs (e.g., `\uD83D\uDE00` for `😀`).
13//! - **Robust Error Handling**: The `Unescape` iterator returns descriptive errors (`UnescapeError`) for invalid or truncated escape sequences, making debugging straightforward.
14//! - **Allocation Control** (with `alloc` feature): Provides convenient methods to collect the iterator's output into owned types like `String` or `Cow<str>`.
15//! - **`std::io` Integration** (with `std` feature): The `Unescape` iterator implements `std::io::Read`, allowing it to be used as an efficient reader for I/O streams.
16//!
17//! ## Quick Start: Escaping a String
18//!
19//! ```
20//! use json_escape::escape_str;
21//!
22//! let input = "Hello, \"world\"!\nThis contains a \\ backslash.";
23//! let expected = r#"Hello, \"world\"!\nThis contains a \\ backslash."#;
24//!
25//! // The `escape_str` function returns an iterator.
26//! let mut escaper = escape_str(input);
27//!
28//! // You can iterate over the chunks:
29//! assert_eq!(escaper.next(), Some("Hello, "));
30//! assert_eq!(escaper.next(), Some(r#"\""#));
31//! assert_eq!(escaper.next(), Some("world"));
32//! // ...and so on.
33//!
34//! // Or, collect it into a String (requires the "alloc" feature).
35//! // let escaped_string: String = escape_str(input).collect();
36//! // assert_eq!(escaped_string, expected);
37//! ```
38//!
39//! ## Quick Start: Unescaping a String
40//!
41//! ```
42//! use json_escape::unescape;
43//!
44//! let input = r#"A 😀 emoji: \uD83D\uDE00 and a tab\t!"#;
45//!
46//! // The unescape iterator yields `Result<&[u8], _>`.
47//! let unescaper = unescape(input);
48//!
49//! // With the "alloc" feature, you can decode it directly into a string.
50//! let decoded_cow = unescaper.decode_utf8().unwrap();
51//! assert_eq!(decoded_cow, "A 😀 emoji: 😀 and a tab\t!");
52//! ```
53//!
54//! ## Performance and the `explicit` Module
55//!
56//! This crate is designed for high-performance, zero-allocation escaping and
57//! unescaping. For most use cases, the functions in this root module provide the
58//! best balance of ergonomics and speed.
59//!
60//! However, for users with extreme performance requirements, the [`explicit`]
61//! module is provided. Its iterators yield structured `Chunk` data instead of
62//! simple slices. As shown by benchmarks, this approach can be slightly faster,
63//! especially on inputs with a high density of escape sequences. If you are
64//! processing a very large volume of JSON strings in a tight loop, consider
65//! using the `explicit` module for a potential performance boost.
66#![no_std]
67#![deny(missing_docs)]
68#![cfg_attr(all(feature = "simd", nightly), feature(portable_simd))]
69
70#[cfg(any(test, feature = "std"))]
71extern crate std;
72
73#[cfg(feature = "alloc")]
74extern crate alloc;
75
76#[cfg(any(test, feature = "alloc"))]
77use alloc::{borrow::Cow, string::String, vec::Vec};
78
79use core::{
80    char,
81    fmt::{self, Write as _},
82    iter::FusedIterator,
83    slice, str,
84};
85use memchr::memchr;
86
87pub mod explicit;
88
89// =============================================================================
90// Escape Implementation
91// =============================================================================
92
93/// Creates a streaming JSON string escaper from a string slice.
94///
95/// The returned [`Escape`] iterator lazily processes the input string, yielding
96/// slices that represent the escaped output.
97///
98/// # Examples
99///
100/// ```
101/// use json_escape::escape_str;
102///
103/// let escaper = escape_str("a\nb");
104/// let escaped_parts: Vec<_> = escaper.collect();
105///
106/// assert_eq!(escaped_parts, vec!["a", r#"\n"#, "b"]);
107/// ```
108#[inline]
109pub fn escape_str(input: &str) -> Escape<'_> {
110    Escape {
111        bytes: input.as_bytes(),
112    }
113}
114
/// Lazy JSON string escaper that produces `&'a str` chunks.
///
/// Values of this type come from [`escape_str`]. Iterating splits the input at
/// every character that JSON requires to be escaped:
///
/// - A run of characters that need no escaping is returned as one slice
///   borrowed from the input (`&'a str`).
/// - Each character that must be escaped is returned as its own `'static`
///   slice holding the escaped form (e.g., `r#"\n"#`).
///
/// No output buffer is built for the whole string; the input is consumed
/// chunk by chunk as the iterator is advanced.
///
/// ### Implemented Traits
/// - **`Iterator<Item = &'a str>`**: Consume the escaped pieces in a loop or through adapters.
/// - **`Display`**: Write the escaped text straight into any formatter without an intermediate allocation.
/// - **`Clone`**, **`Debug`**: Standard utility traits.
/// - **`PartialEq`**, **`PartialEq<B: AsRef<[u8]>>`**: Compare by escaped output. An `Escape` equals another `Escape` or a byte slice when both sides yield the same sequence of escaped bytes.
/// - **`From<Escape<'a>> for Cow<'a, str>`** (requires `alloc` feature): Convert the iterator into a string that is only allocated when necessary.
#[derive(Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct Escape<'a> {
    /// Portion of the input that has not been emitted yet.
    bytes: &'a [u8],
}
139
140impl<'a> Iterator for Escape<'a> {
141    type Item = &'a str;
142
143    #[inline]
144    fn next(&mut self) -> Option<&'a str> {
145        if self.bytes.is_empty() {
146            return None;
147        }
148
149        // Find the first byte that needs escaping.
150        let pos = find_escape_char(self.bytes);
151
152        match pos {
153            // No escapable characters left; return the rest of the slice.
154            None => {
155                let s = self.bytes;
156                self.bytes = &[];
157                // SAFETY: The input was a valid &str, and we're returning the
158                // whole remaining chunk, so it's still valid UTF-8.
159                Some(unsafe { str::from_utf8_unchecked(s) })
160            }
161            // An escapable byte is at the beginning of the slice.
162            Some(0) => {
163                let byte = self.bytes[0];
164                self.bytes = &self.bytes[1..];
165                // The table lookup gives us a &'static str, which is a valid &'a str.
166                //
167                // Some(....unwrap()) is more correct
168                ESCAPE_TABLE[byte as usize]
169            }
170            // Found an escapable byte after a safe prefix. Return the prefix.
171            Some(p) => {
172                let (prefix, rest) = self.bytes.split_at(p);
173                self.bytes = rest;
174                // SAFETY: The soundness of this operation is critical.
175                // We are splitting the byte slice at the position of the first
176                // character that requires escaping. All JSON characters that
177                // require escaping (`"`, `\`, and control characters `\u0000`-`\u001F`)
178                // are single-byte ASCII characters. Therefore, `p` is guaranteed
179                // to be on a valid UTF-8 character boundary.
180                Some(unsafe { str::from_utf8_unchecked(prefix) })
181            }
182        }
183    }
184
185    fn size_hint(&self) -> (usize, Option<usize>) {
186        if self.bytes.is_empty() {
187            (0, Some(0))
188        } else {
189            // We'll yield at least 1 slice, and at most `len` slices if every byte is escaped.
190            (1, Some(self.bytes.len()))
191        }
192    }
193}
194
195impl<'a> FusedIterator for Escape<'a> {}
196
197impl fmt::Display for Escape<'_> {
198    /// Allows direct formatting of the escaped string without intermediate allocation.
199    ///
200    /// This is very useful for writing the escaped output directly to a stream,
201    /// such as a file or a network socket.
202    ///
203    /// # Example
204    ///
205    /// ```
206    /// use json_escape::escape_str;
207    ///
208    /// let escaper = escape_str("User said: \"Hi!\"\n");
209    /// let formatted = format!("{}", escaper);
210    ///
211    /// assert_eq!(formatted, r#"User said: \"Hi!\"\n"#);
212    /// ```
213    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
214        // The `clone()` is cheap as it only copies a slice reference.
215        for s in self.clone() {
216            f.write_str(s)?
217        }
218        Ok(())
219    }
220}
221
222impl fmt::Debug for Escape<'_> {
223    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
224        f.debug_struct("Escape").finish_non_exhaustive()
225    }
226}
227
228impl<B: AsRef<[u8]> + ?Sized> PartialEq<B> for Escape<'_> {
229    /// Compares the escaped output with any byte-slice-like object.
230    ///
231    /// This is primarily a convenience for testing, allowing you to check the
232    /// fully concatenated result of an `Escape` iterator against a known `&str` or `&[u8]`.
233    ///
234    /// The notion of equality is based on the **output**, not the iterator's internal state.
235    ///
236    /// # Example
237    ///
238    /// ```
239    /// use json_escape::escape_str;
240    ///
241    /// let escaper = escape_str("key\tvalue");
242    ///
243    /// // The escaper's output, when concatenated, equals the right-hand side.
244    /// assert_eq!(escaper, r#"key\tvalue"#);
245    /// ```
246    fn eq(&self, other: &B) -> bool {
247        let mut other = other.as_ref();
248        for chunk in self.clone() {
249            if !other.starts_with(chunk.as_bytes()) {
250                return false;
251            }
252            other = &other[chunk.len()..];
253        }
254        // We completely searched it
255        other.is_empty()
256    }
257}
258
259impl<'a, 'b> PartialEq<Escape<'a>> for Escape<'b> {
260    /// Compares two `Escape` iterators for equality.
261    ///
262    /// Two `Escape` iterators are considered equal if they'll produce the same **output**.
263    /// It first performs a fast check on the underlying byte slices.
264    fn eq(&self, other: &Escape<'a>) -> bool {
265        // Fast path: if they are views into the same underlying data.
266        self.bytes == other.bytes || chunks_eq(self.clone(), other.clone())
267    }
268}
269
#[cfg(feature = "alloc")]
impl<'a> From<Escape<'a>> for Cow<'a, str> {
    /// Collects the escaped output into a `Cow<'a, str>`, allocating only when
    /// more than one chunk has to be joined.
    ///
    /// - If the iterator yields nothing, or exactly one chunk (for instance an
    ///   input that requires **no escaping**), the result is `Cow::Borrowed`.
    /// - As soon as a second chunk exists, a `String` is allocated and the
    ///   result is `Cow::Owned`.
    ///
    /// This beats `iter.collect::<String>()`, which always produces an owned
    /// `String`.
    ///
    /// **Requires the `alloc` feature.**
    ///
    /// # Example
    ///
    /// ```
    /// # #[cfg(feature = "alloc")] {
    /// use json_escape::escape_str;
    /// use std::borrow::Cow;
    ///
    /// // No escaping needed, so no allocation occurs.
    /// let cow_borrowed: Cow<str> = escape_str("plain text").into();
    /// assert!(matches!(cow_borrowed, Cow::Borrowed(_)));
    ///
    /// // Escaping is required, so a new String is allocated.
    /// let cow_owned: Cow<str> = escape_str("text with\nnewline").into();
    /// assert!(matches!(cow_owned, Cow::Owned(_)));
    /// assert_eq!(cow_owned, r#"text with\nnewline"#);
    /// # }
    /// ```
    fn from(mut iter: Escape<'a>) -> Self {
        // Zero chunks: nothing to borrow from, use the empty string.
        let Some(head) = iter.next() else {
            return Cow::Borrowed("");
        };
        // Exactly one chunk: hand it out as-is without allocating.
        let Some(follow) = iter.next() else {
            return Cow::Borrowed(head);
        };

        // Two or more chunks: join them. The capacity is an estimate: the
        // two chunks already taken plus the input bytes still to process.
        let mut joined = String::with_capacity(head.len() + follow.len() + iter.bytes.len());
        joined.push_str(head);
        joined.push_str(follow);
        joined.extend(iter);
        Cow::Owned(joined)
    }
}
318
319// =============================================================================
320// Unescape Implementation
321// =============================================================================
322
323/// Creates a streaming JSON string unescaper from a byte slice.
324///
325/// This function creates an iterator to unescape a byte slice representing the
326/// **raw contents** of a JSON string, assuming the outer quotes have already
327/// been removed.
328///
329/// For a more convenient way to handle complete JSON string literals (including
330/// their surrounding `"` quotes), see the [`unescape_quoted`] function, which
331/// automatically trims them.
332///
333/// The iterator will fail if the input contains invalid JSON escape sequences.
334///
335/// # Example
336///
337/// ```
338/// use json_escape::{unescape, unescape_quoted};
339///
340/// // `unescape` works on the raw content, without quotes.
341/// let content = r#"hello\tworld"#;
342/// assert_eq!(unescape(content), "hello\tworld");
343///
344/// // If you pass a full JSON literal, the quotes are treated as literal characters.
345/// let literal = r#""hello\tworld""#;
346/// assert_eq!(unescape(literal), "\"hello\tworld\""); // Note the quotes in the output.
347///
348/// // For full literals like this, `unescape_quoted` is the recommended function.
349/// assert_eq!(unescape_quoted(literal), "hello\tworld");
350/// ```
351#[inline]
352pub fn unescape<I: AsRef<[u8]> + ?Sized>(input: &I) -> Unescape<'_> {
353    Unescape::new(input.as_ref())
354}
355
356/// Creates a streaming JSON string unescaper, trimming enclosing quotes.
357///
358/// This function acts as a convenience wrapper around [`unescape`]. It first
359/// inspects the input byte slice. If the slice begins and ends with a double-quote
360/// character (`"`), these quotes are trimmed before the inner content is passed to
361/// the unescaper.
362///
363/// If the input is not enclosed in quotes, this function behaves exactly like
364/// [`unescape`]. This is useful for directly unescaping a complete JSON string
365/// literal.
366///
367/// # Example
368///
369/// ```
370/// use json_escape::{unescape, unescape_quoted};
371///
372/// // 1. With quotes: The outer quotes are trimmed before unescaping.
373/// let unescaper = unescape_quoted(r#""hello\nworld""#);
374/// assert_eq!(unescaper, b"hello\nworld");
375///
376/// // 2. Without quotes: Behaves exactly like the standard `unescape`.
377/// let unescaper_no_quotes = unescape_quoted(r#"raw string"#);
378/// assert_eq!(unescaper_no_quotes, b"raw string");
379///
380/// // 3. Mismatched quotes: The input is passed through as-is, quotes are not trimmed.
381/// let mismatched_quotes = unescape_quoted(r#"hello""#);
382/// assert_eq!(mismatched_quotes, b"hello\"");
383///
384/// // 4. Empty quoted string: Correctly results in an empty output.
385/// let empty_quoted = unescape_quoted(r#""""#);
386/// assert_eq!(empty_quoted, b"");
387/// ```
388#[inline]
389pub fn unescape_quoted<I: AsRef<[u8]> + ?Sized>(input: &I) -> Unescape<'_> {
390    let bytes = input.as_ref();
391    let input = if bytes.len() >= 2 && bytes[0] == b'\"' && bytes[bytes.len() - 1] == b'\"' {
392        &bytes[1..bytes.len() - 1]
393    } else {
394        bytes
395    };
396
397    unescape(input)
398}
399
400/// A streaming JSON string unescaper.
401///
402/// This struct is created by the [`unescape`] function. It implements an [`Iterator`]
403/// that yields `Result<&'a [u8], UnescapeError>`, lazily decoding the input.
404///
405/// The iterator's output chunks are one of the following:
406/// - **`Ok(&'a [u8])`**: A borrowed slice of the original input for a sequence of non-escaped bytes.
407/// - **`Ok(&'static [u8])`**: A single-byte slice for a decoded escape sequence (e.g., `\n` becomes a slice containing `0x0A`).
408///   For `\uXXXX` sequences, it yields a series of single-byte slices representing the UTF-8 encoding of the character.
409/// - **`Err(UnescapeError)`**: An error indicating an invalid escape sequence, which halts further iteration as described below.
410///
411/// Because the iterator operates on bytes, you can use helper methods like
412/// [`Unescape::decode_utf8`] or [`Unescape::decode_utf8_lossy`] to convert the
413/// final result into a string.
414///
415/// # Error Handling
416///
417/// When the iterator encounters an invalid or incomplete escape, it returns an
418/// `Err(UnescapeError)` describing the problem. The iterator then remains in an
419/// **error state**: subsequent calls to `next()` will continue to return that same
420/// error (i.e., the error is idempotent) and the iterator will not produce further
421/// `Ok` chunks. This makes the behavior deterministic for callers that check the
422/// first error and then stop.
423///
424/// Errors are classified by the precise condition encountered:
425/// - **`InvalidEscape`**: The escape sequence uses an unknown escape character (e.g., `\q`).
426/// - **`InvalidHex`**: A `\u` escape contains a non-hex character where a hex
427///   digit was expected (e.g., `\uZ`).
428/// - **`UnexpectedEof`**: The input ended before a complete escape sequence could be
429///   read. This is used when there isn't enough input yet to decide whether the
430///   sequence would be valid (for instance, an incomplete `\u` or a truncated
431///   surrogate pair).
432/// - **`LoneSurrogate`**: A complete `\uXXXX` was read, and it encodes a *high*
433///   surrogate, but the following bytes definitively do not form a valid low
434///   surrogate escape (for example, the next character is a space or any
435///   non-`\u` character).
436///
437/// The difference between `UnexpectedEof` and `LoneSurrogate` is important:
438/// - `UnexpectedEof` means **we couldn't decide** because the input ended too early.
439/// - `LoneSurrogate` means **we did decide**—we saw a full `\uXXXX` high surrogate,
440///   and the following input proves a pair will not follow.
441///
442/// #### Concrete examples
443///
444/// 1) A high surrogate followed by other data (not a `\u` low-surrogate) → `LoneSurrogate`:
445///
446/// ```rust
447/// use json_escape::{unescape, UnescapeErrorKind, LoneSurrogateError};
448///
449/// let mut iter = unescape(r"\uD83D more data");
450/// let err = iter.next().unwrap().unwrap_err();
451/// assert!(matches!(err.kind(), UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: 0xD83D, .. })));
452///
453/// // Subsequent calls return the same error (iterator remains in the same error state).
454/// let err = iter.next().unwrap().unwrap_err();
455/// assert!(matches!(err.kind(), UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: 0xD83D, .. })));
456/// ```
457///
458/// 2) An invalid escape character → `InvalidEscape`:
459///
460/// ```rust
461/// use json_escape::{unescape, UnescapeErrorKind, InvalidEscapeError};
462///
463/// let mut iter = unescape(r"\q"); // `\q` is not a defined escape
464/// let err = iter.next().unwrap().unwrap_err();
465/// assert!(matches!(err.kind(), UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'q', .. })));
466/// ```
467///
468/// 3) A malformed `\u` with a non-hex character → `InvalidHex`:
469///
470/// ```rust
471/// use json_escape::{unescape, UnescapeErrorKind, InvalidHexError};
472///
473/// let mut iter = unescape(r"\uZ");
474/// let err = iter.next().unwrap().unwrap_err();
475/// assert!(matches!(err.kind(), UnescapeErrorKind::InvalidHex(InvalidHexError { found: b'Z', .. })));
476/// ```
477///
478/// 4) Truncated / incomplete input ⇒ `UnexpectedEof`:
479///
480/// ```rust
481/// use json_escape::{unescape, UnescapeErrorKind};
482///
483/// // a) truncated after the first \uXXXX (no following bytes yet)
484/// let mut iter = unescape(r"\uD83D");
485/// let err = iter.next().unwrap().unwrap_err();
486/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
487///
488/// // b) starts a second \u but is truncated before hex digits
489/// let mut iter = unescape(r"\uD83D\u");
490/// let err = iter.next().unwrap().unwrap_err();
491/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
492///
493/// // c) a lone backslash at end of input
494/// let mut iter = unescape("\\");
495/// let err = iter.next().unwrap().unwrap_err();
496/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
497/// ```
498///
499/// **Note**: This behavior intentionally mirrors common JSON parsers (e.g.,
500/// `serde_json`, Go's `encoding/json`) for the EOF vs. semantic error distinction.
501///
502/// # Implemented Traits and Usage
503///
504/// - **`Iterator<Item = Result<&'a [u8], UnescapeError>>`**: The core trait for
505///   processing the unescaped byte chunks.
506/// - **`std::io::Read`** (requires `std` feature): Lets you use the unescaper as a
507///   standard reader, perfect for integrating with other I/O APIs.
508/// - **`TryFrom<Unescape<'a>> for Cow<'a, [u8]>`** (requires `alloc` feature): An
509///   efficient way to collect the unescaped bytes, propagating any errors.
510/// - **`Clone`**, **`Debug`**: Standard utility traits.
511/// - **`PartialEq<B: AsRef<[u8]>>`**: Compares the fully unescaped output with a byte slice.
512///
513/// ## Reading Unescaped Bytes
514///
515/// With the `std` feature, `Unescape` can be used as any other `std::io::Read`
516/// source. This is ideal for streaming and decoding large JSON string contents
517/// without buffering the entire result in memory first.
518///
519/// ```rust
520/// # #[cfg(feature = "std")] {
521/// use json_escape::unescape;
522/// use std::io::Read;
523///
524/// let mut reader = unescape(r#"chunk1\nchunk2"#);
525/// let mut buf = Vec::new();
526///
527/// // Read all unescaped bytes from the iterator into the buffer.
528/// reader.read_to_end(&mut buf).unwrap();
529///
530/// assert_eq!(buf, b"chunk1\nchunk2");
531/// # }
532/// ```
#[derive(Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct Unescape<'a> {
    // Cursor over the input bytes that have not been consumed yet. A
    // `slice::Iter` is used because it can be cloned to look ahead without
    // extra bookkeeping.
    bytes: slice::Iter<'a, u8>,

    // Scratch space for the UTF-8 encoding of a decoded `\uXXXX` escape
    // (or surrogate pair).
    unicode: [u8; 4],
    // Number of valid bytes in `unicode`; 0 means nothing is pending.
    // (Original note: this field could be eliminated by depending on the
    // header, presumably the UTF-8 lead byte.)
    unicode_len: u8,
    // Number of bytes from `unicode` that have already been emitted.
    unicode_pos: u8,
}
546
547impl<'a> Unescape<'a> {
548    /// Construct from a byte slice which contains the characters inside the JSON string (no quotes).
549    fn new(input: &'a [u8]) -> Self {
550        Self {
551            bytes: input.iter(),
552            unicode: [0; 4],
553            unicode_len: 0,
554            unicode_pos: 0,
555        }
556    }
557
558    // FIXME: Replace iter with slice and match on b with a table
559    /// Helper: parse exactly 4 hex digits from `it`. Returns Ok(u16) or an error.
560    #[inline(always)]
561    fn parse_hex4(iter: &mut slice::Iter<'a, u8>, base_offset: u8) -> Result<u16, UnescapeError> {
562        let mut acc = 0u16;
563        for i in 0..4 {
564            let b = match iter.next() {
565                Some(b) => *b,
566                None => {
567                    return Err(UnescapeError {
568                        kind: UnescapeErrorKind::UnexpectedEof,
569                        // The error occurs where the next digit was expected.
570                        offset: base_offset + i,
571                    });
572                }
573            };
574            let v = match b {
575                b'0'..=b'9' => (b - b'0') as u16,
576                b'a'..=b'f' => (b - b'a' + 10) as u16,
577                b'A'..=b'F' => (b - b'A' + 10) as u16,
578                _ => {
579                    return Err(UnescapeError {
580                        kind: UnescapeErrorKind::InvalidHex(InvalidHexError { found: b }),
581                        // The error is the invalid digit itself.
582                        offset: base_offset + i,
583                    });
584                }
585            };
586            acc = (acc << 4) | v;
587        }
588        Ok(acc)
589    }
590
591    /// Parses a unicode escape sequence `\uXXXX` which may be a surrogate pair.
592    /// The iterator `bytes` must be positioned *after* the `\u`.
593    ///
594    /// NOTE: Doesn't preserve the state of the iterator on error
595    #[inline(always)]
596    fn handle_unicode_escape(bytes: &mut slice::Iter<'a, u8>) -> Result<char, UnescapeError> {
597        // Parse first 4 hex digits (\uXXXX)
598        //
599        // The iterator starts *after* '\u'. The first hex digit is at offset 2 from '\'.
600        let first = Self::parse_hex4(bytes, 2)?;
601
602        // High surrogate → must be followed by another \uXXXX low surrogate
603        if (0xD800..=0xDBFF).contains(&first) {
604            match (bytes.next(), bytes.next()) {
605                (Some(b'\\'), Some(b'u')) => {
606                    // Try parsing the low surrogate
607                    //
608                    // The first hex digit of the second escape is at offset 8.
609                    // (\uXXXX\u -> 8 chars)
610                    match Self::parse_hex4(bytes, 8) {
611                        Ok(low) if (0xDC00..=0xDFFF).contains(&low) => {
612                            let high_t = first as u32;
613                            let low_t = low as u32;
614                            let code = 0x10000 + (((high_t - 0xD800) << 10) | (low_t - 0xDC00));
615                            return Ok(char::from_u32(code).expect(
616                                "valid surrogate pair math should always produce a valid char",
617                            ));
618                        }
619                        Ok(_) => {
620                            // Got a full escape but not a low surrogate → Lone surrogate
621                            return Err(UnescapeError {
622                                kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError {
623                                    surrogate: first,
624                                }),
625                                offset: 6,
626                            });
627                        }
628                        Err(err) => {
629                            // parse_hex4 failed (e.g. ran out of hex digits)
630                            return Err(err);
631                        }
632                    }
633                }
634                // EOF before even seeing '\' or 'u' → UnexpectedEof
635                (None, _) | (_, None) => {
636                    return Err(UnescapeError {
637                        kind: UnescapeErrorKind::UnexpectedEof,
638                        offset: 6,
639                    });
640                }
641                // Something else after high surrogate → LoneSurrogate
642                _ => {
643                    return Err(UnescapeError {
644                        kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError {
645                            surrogate: first,
646                        }),
647                        // The error is detected after consuming `\uXXXX` (6 bytes).
648                        offset: 6,
649                    });
650                }
651            }
652        }
653
654        // Not a surrogate → normal path
655        match char::from_u32(first as u32) {
656            Some(c) => Ok(c),
657            None => Err(UnescapeError {
658                kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: first }),
659                // The error is detected after consuming `\uXXXX` (6 bytes).
660                offset: 6,
661            }),
662        }
663    }
664
665    #[inline]
666    fn store_unicode(&mut self, ch: char) {
667        self.unicode_len = ch.encode_utf8(&mut self.unicode).len() as u8;
668        self.unicode_pos = 0;
669    }
670
671    #[inline]
672    fn emit_pending_byte(&mut self) -> Option<u8> {
673        if self.unicode_pos < self.unicode_len {
674            let b = self.unicode[self.unicode_pos as usize];
675            self.unicode_pos += 1;
676            Some(b)
677        } else {
678            None
679        }
680    }
681
682    /// Helper to emit the full unicode sequence and advance the internal position.
683    #[inline]
684    fn emit_unicode_as_str(&mut self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
685        // The check `unicode_pos > 0` is implicit from the call site.
686        // The buffer is guaranteed to contain a valid UTF-8 sequence.
687        let s = unsafe { str::from_utf8_unchecked(&self.unicode[..self.unicode_len as usize]) };
688        f.write_str(s)?;
689
690        // Mark the entire sequence as emitted.
691        self.unicode_pos = self.unicode_len;
692
693        Ok(())
694    }
695
696    /// The single, authoritative helper for producing unescaped byte chunks.
697    ///
698    /// It takes an optional `max` length to limit the size of the returned slice,
699    /// which is essential for the `std::io::Read` implementation.
700    #[inline(always)]
701    fn next_limit(&mut self, limit: Option<usize>) -> Option<Result<&'a [u8], UnescapeError>> {
702        if limit.is_some_and(|l| l == 0) {
703            return Some(Ok(&[]));
704        }
705
706        // If we have pending bytes, emit them first (fast).
707        //
708        // LIMIT: We're allowed not checking here since we'll only produce 1 byte
709        // and limit is at least 1.
710        if let Some(s) = self.emit_pending_byte() {
711            // s: &'static [u8] coerces to &'a [u8]
712            return Some(Ok(byte_as_static_slice(s)));
713        }
714
715        let bytes = self.bytes.as_slice();
716        if bytes.is_empty() {
717            return None;
718        }
719
720        // Find next backslash in the remaining bytes.
721        let pos = memchr(b'\\', bytes);
722
723        match pos {
724            None => {
725                // No more escapes. Return the rest of the slice as a borrowed chunk.
726                let chunk_len = bytes.len().min(limit.unwrap_or(bytes.len()));
727                let (chunk, rest) = bytes.split_at(chunk_len);
728                self.bytes = rest.iter();
729                Some(Ok(chunk))
730            }
731            // LIMIT: We're allowed not checking here since we'll only produce 1 byte
732            // and limit is at least 1.
733            Some(0) => {
734                // We need to parse 4 hex digits from the iterator. But because
735                // `bytes` implements `Clone`, we can clone it to peek ahead
736                // in order to preserve the state of the iterator on failure.
737                let mut lookahead = self.bytes.clone();
738                // Backslash is the first byte in the slice: handle escape
739                lookahead.next(); // Consume the backslash
740
741                match lookahead.next() {
742                    Some(b'u') => match Self::handle_unicode_escape(&mut lookahead) {
743                        Ok(ch) => {
744                            self.bytes = lookahead; // commit
745                            self.store_unicode(ch);
746                            self.emit_pending_byte()
747                                .map(|b| Ok(byte_as_static_slice(b)))
748                        }
749                        Err(err) => Some(Err(err)),
750                    },
751                    Some(byte) => {
752                        if let Some(slice) = UNESCAPE_TABLE[*byte as usize] {
753                            self.bytes = lookahead; // commit
754                            Some(Ok(slice))
755                        } else {
756                            Some(Err(UnescapeError {
757                                kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError {
758                                    found: *byte,
759                                }),
760                                // The invalid character is 1 byte after '\'.
761                                offset: 1,
762                            }))
763                        }
764                    }
765                    None => Some(Err(UnescapeError {
766                        kind: UnescapeErrorKind::UnexpectedEof,
767                        // EOF occurred 1 byte after '\'.
768                        offset: 1,
769                    })),
770                }
771            }
772            // Found \ after a safe prefix. Return the prefix. We'll handle on next call to next
773            Some(p) => {
774                // Return the safe prefix (borrowed from input)
775                let chunk_len = p.min(limit.unwrap_or(p));
776                let (chunk, rest) = bytes.split_at(chunk_len);
777                self.bytes = rest.iter();
778                Some(Ok(chunk))
779            }
780        }
781    }
782
783    fn _display_utf8(mut self, f: &mut fmt::Formatter<'_>, lossy: bool) -> fmt::Result {
784        // The key insight: Chunks with more than one byte are *always*
785        // borrowed from the original input, as all escaped characters
786        // are yielded byte-by-byte.
787        while let Some(result) = self.next() {
788            match result {
789                Ok(chunk) => {
790                    if chunk.is_empty() {
791                        continue;
792                    }
793
794                    // THE CORE LOGIC:
795                    // Check if the iterator just yielded the *first byte* of a *multi-byte* sequence.
796                    // - `unicode_pos == 1` means the first byte was just emitted.
797                    // - `unicode_len > 1` means it's a multi-byte char (e.g., '¢', '😎').
798                    if self.unicode_pos == 1 && self.unicode_len > 1 {
799                        // This is our special case. We have the first byte in `chunk`, but
800                        // it's more efficient to write the whole character at once from our buffer.
801                        self.emit_unicode_as_str(f)?;
802                        // The iterator will no longer yield the rest of the bytes. Since our helper
803                        // has now advanced it. But to be sure...
804                        self.unicode_pos = self.unicode_len;
805                    } else {
806                        // This is the normal case:
807                        // 1. A large chunk borrowed from the original input.
808                        // 2. A single-byte escape like `\n` or `\t`.
809                        // 3. The last byte of a multi-byte sequence (or the only byte).
810                        // In all these cases, we just need to display the chunk we received.
811                        display_bytes_utf8(chunk, f, lossy)?;
812                    }
813                }
814                Err(_) => {
815                    if lossy {
816                        break;
817                    } else {
818                        return Err(fmt::Error);
819                    }
820                }
821            }
822        }
823
824        Ok(())
825    }
826
827    /// Decodes the unescaped byte stream into a UTF-8 string.
828    ///
829    /// This method consumes the iterator and collects all resulting byte chunks.
830    /// If an unescaping error occurs, it's returned immediately. If the final
831    /// sequence of bytes is not valid UTF-8, a UTF-8 error is returned.
832    ///
833    /// Like `From<Escape>`, this is optimized to return a `Cow::Borrowed` if no
834    /// escapes were present in the input, avoiding allocation.
835    ///
836    /// **Requires the `alloc` feature.**
837    ///
838    /// # Example
839    ///
840    /// ```
841    /// # #[cfg(feature = "alloc")] {
842    /// use json_escape::unescape;
843    ///
844    /// let input = r#"Emoji: \uD83D\uDE00"#;
845    /// let cow = unescape(input).decode_utf8().unwrap();
846    ///
847    /// assert_eq!(cow, "Emoji: 😀");
848    /// # }
849    /// ```
850    #[cfg(feature = "alloc")]
851    pub fn decode_utf8(self) -> Result<Cow<'a, str>, DecodeUtf8Error> {
852        match self.try_into().map_err(DecodeUtf8Error::Unescape)? {
853            Cow::Borrowed(bytes) => str::from_utf8(bytes)
854                .map(Cow::Borrowed)
855                .map_err(DecodeUtf8Error::Utf8),
856            Cow::Owned(bytes) => String::from_utf8(bytes)
857                .map(Cow::Owned)
858                .map_err(|e| DecodeUtf8Error::Utf8(e.utf8_error())),
859        }
860    }
861
862    /// Decodes the unescaped byte stream lossily into a UTF-8 string.
863    ///
864    /// This is similar to [`Unescape::decode_utf8`] but replaces any invalid UTF-8 sequences
865    /// with the replacement character (U+FFFD) instead of returning an error.
866    ///
867    /// An `UnescapeError` can still be returned if the JSON escaping itself is invalid.
868    ///
869    /// **Requires the `alloc` feature.**
870    #[cfg(feature = "alloc")]
871    pub fn decode_utf8_lossy(self) -> Result<Cow<'a, str>, UnescapeError> {
872        Ok(decode_utf8_lossy(self.try_into()?))
873    }
874
875    /// Returns a wrapper that implements [`fmt::Display`].
876    ///
877    /// This allows an `Unescape` iterator to be used directly with formatting
878    /// macros like `println!`, `format!`, etc. It writes the unescaped content
879    /// directly to the formatter's buffer, **avoiding any heap allocations**.
880    ///
881    /// The iterator is consumed, and the resulting unescaped string is written
882    /// to the formatter. Any invalid JSON escape sequences or invalid UTF-8 will
883    /// cause a `fmt::Error`. **You should be cautious when using this method
884    /// with the `format!` macro, as a `fmt::Error` from us will cause the macro
885    /// to panic**.
886    ///
887    /// For a more robust alternative that will not panic on `UnescapeError` or
888    /// invalid bytes, consider using [`Unescape::display_utf8_lossy`] instead.
889    ///
890    /// This method is a **zero-allocation** alternative to [`Unescape::decode_utf8`],
891    /// which might allocate a `String` to return the unescaped content.
892    ///
893    /// # Example
894    ///
895    /// ```
896    /// use json_escape::unescape;
897    ///
898    /// let original = r#"Hello, \uD83C\uDF0E!"#;
899    /// let unescaper = unescape(original);
900    ///
901    /// let formatted = format!("{}", unescaper.display_utf8());
902    /// assert_eq!(formatted, "Hello, 🌎!");
903    /// ```
904    pub fn display_utf8(self) -> DisplayUnescape<'a> {
905        DisplayUnescape { inner: self }
906    }
907
908    /// Returns a wrapper that implements [`fmt::Display`] lossily.
909    ///
910    /// This method is an **allocation-free** way to write unescaped content
911    /// to a formatter. It handles invalid JSON escape sequences and invalid
912    /// UTF-8 gracefully, making it a "lossy" operation.
913    ///
914    /// - **Invalid JSON escape sequences:** Instead of causing an error, the iterator
915    ///   terminates without an error.
916    /// - **Invalid UTF-8 bytes:** These are replaced with the Unicode
917    ///   replacement character (U+FFFD).
918    ///
919    /// This method is the **zero-allocation** counterpart to [`Unescape::decode_utf8_lossy`].
920    pub fn display_utf8_lossy(self) -> DisplayUnescapeLossy<'a> {
921        DisplayUnescapeLossy { inner: self }
922    }
923}
924
925impl<'a> Iterator for Unescape<'a> {
926    type Item = Result<&'a [u8], UnescapeError>;
927
928    fn next(&mut self) -> Option<Self::Item> {
929        self.next_limit(None)
930    }
931
932    fn size_hint(&self) -> (usize, Option<usize>) {
933        // The minimum size is 0 (if the rest of the string is an invalid escape).
934        // The maximum size is the remaining length of the underlying bytes + pending_unicode
935        let (lower, upper) = self.bytes.size_hint();
936        let upper = upper.map(|x| x + (self.unicode_len as usize));
937        // Worst-case is \uXXXX -> 1 byte, so 6 -> 1.
938        (lower.saturating_add(1) / 6, upper)
939    }
940}
941
942impl<'a> FusedIterator for Unescape<'a> {}
943
/// Exposes the unescaped byte stream through `std::io::Read`.
///
/// Malformed escapes are reported as `std::io::ErrorKind::InvalidData` errors
/// wrapping the `UnescapeError`.
#[cfg(feature = "std")]
impl std::io::Read for Unescape<'_> {
    /// Fills `buf` with as many unescaped bytes as are available.
    ///
    /// Returns the number of bytes written; `Ok(0)` means `buf` was empty or
    /// the input is exhausted.
    ///
    /// # Errors
    ///
    /// An error is returned only when no bytes were written by this call. If a
    /// malformed escape is reached after some bytes were already copied, the
    /// partial count is returned and the error is reported by the next call.
    /// That is sound because a failed escape leaves the iterator state
    /// untouched.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let mut written = 0;

        // Read until `buf` is full or the iterator is drained.
        while written < buf.len() {
            match self.next_limit(Some(buf.len() - written)) {
                Some(Ok(chunk)) => {
                    // `next_limit` never returns more than the limit it was given.
                    let end = written + chunk.len();
                    buf[written..end].copy_from_slice(chunk);
                    written = end;
                }
                Some(Err(err)) => {
                    if written > 0 {
                        // Don't discard bytes that were already delivered;
                        // surface the error on the next call instead.
                        break;
                    }
                    return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, err));
                }
                None => break,
            }
        }

        Ok(written)
    }

    /// Appends all remaining unescaped bytes to `buf`, chunk by chunk.
    ///
    /// Returns the number of bytes appended. On a malformed escape, the bytes
    /// decoded before it stay in `buf` and an `InvalidData` error is returned.
    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> std::io::Result<usize> {
        let start_len = buf.len();

        for result in self {
            match result {
                Ok(chunk) => buf.extend_from_slice(chunk),
                Err(err) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, err)),
            }
        }

        Ok(buf.len() - start_len)
    }
}
989
990impl fmt::Debug for Unescape<'_> {
991    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
992        f.debug_struct("Unescape").finish_non_exhaustive()
993    }
994}
995
996impl<B: AsRef<[u8]> + ?Sized> PartialEq<B> for Unescape<'_> {
997    /// Compares the unescaped output with a byte-slice-like object.
998    ///
999    /// An `Unescape` iterator is considered equal to a byte slice if it successfully
1000    /// unescapes to produce a sequence of bytes identical to that slice. If the
1001    /// iterator would produce an error, the comparison returns `false`.
1002    ///
1003    /// # Example
1004    ///
1005    /// ```
1006    /// use json_escape::unescape;
1007    ///
1008    /// let unescaper = unescape(r#"hello\nworld"#);
1009    /// assert_eq!(unescaper, b"hello\nworld");
1010    ///
1011    /// // An iterator that produces an error is not equal to any valid slice.
1012    /// let failing_unescaper = unescape(r#"\k"#);
1013    /// assert_ne!(failing_unescaper, b"k");
1014    /// ```
1015    fn eq(&self, other: &B) -> bool {
1016        let mut other = other.as_ref();
1017        for result in self.clone() {
1018            match result {
1019                Ok(chunk) => {
1020                    if !other.starts_with(chunk) {
1021                        return false;
1022                    }
1023                    other = &other[chunk.len()..];
1024                }
1025                Err(_) => return false, // An erroring iterator cannot be equal to a valid slice.
1026            }
1027        }
1028        other.is_empty()
1029    }
1030}
1031
1032impl<B: AsRef<[u8]>> PartialEq<Unescape<'_>> for Result<B, UnescapeError> {
1033    /// Compares the unescaper's outcome with a `Result`.
1034    ///
1035    /// This implementation allows for precise testing of the `Unescape` iterator
1036    /// by comparing it against either a successful outcome (`Ok`) or a specific
1037    /// failure (`Err`).
1038    ///
1039    /// - If `result` is `Ok(bytes)`, the comparison is `true` only if the iterator
1040    ///   completes successfully and its concatenated output is identical to `bytes`.
1041    ///
1042    /// - If `result` is `Err(error)`, the comparison is `true` only if the iterator
1043    ///   produces the exact same `UnescapeError`.
1044    ///
1045    /// # Example
1046    ///
1047    /// ```
1048    /// use json_escape::{unescape, UnescapeError, InvalidEscapeError};
1049    ///
1050    /// // --- Success Case ---
1051    /// let unescaper = unescape(r#"hello\tworld"#);
1052    /// // The comparison is against an `Ok` variant.
1053    /// assert_eq!(Ok("hello\tworld"), unescaper);
1054    ///
1055    /// // --- Error Case ---
1056    /// let failing_unescaper = unescape(r#"invalid-\u"#);
1057    /// // We can assert that the iterator produces a specific error.
1058    /// # let unexpected_eof = unescape(r"\u").next().unwrap().unwrap_err();
1059    /// assert_eq!(Err::<&str, _>(unexpected_eof), failing_unescaper);
1060    /// ```
1061    fn eq(&self, unescape: &Unescape<'_>) -> bool {
1062        match self {
1063            Ok(expected_bytes) => unescape == expected_bytes,
1064            Err(expected_error) => {
1065                for result in unescape.clone() {
1066                    if let Err(actual_error) = result {
1067                        // The iterator's first error is its final outcome.
1068                        // It must match the expected error exactly.
1069                        return actual_error == *expected_error;
1070                    }
1071                }
1072                // `unescape` completed successfully, but an error was expected.
1073                false
1074            }
1075        }
1076    }
1077}
1078
1079impl<'a, 'b> PartialEq<Unescape<'a>> for Unescape<'b> {
1080    /// Compares two `Unescape` iterators for equality based on their terminal result.
1081    ///
1082    /// The equality of two `Unescape` iterators is determined by the final `Result`
1083    /// that would be obtained if each iterator were fully consumed (e.g., by using `try_collect()`).
1084    ///
1085    /// The specific rules are as follows:
1086    ///
1087    /// 1.  **Error vs. Error**: If both iterators terminate with an `Err`, they are
1088    ///     considered **equal** if and only if their `UnescapeError`s are identical.
1089    ///     Any bytes successfully unescaped *before* the error are ignored in this case.
1090    /// 2.  **Success vs. Success**: If both iterators terminate with `Ok`, they are
1091    ///     considered **equal** if and only if the complete sequence of unescaped bytes
1092    ///     is identical for both.
1093    /// 3.  **Success vs. Error**: If one iterator terminates with `Ok` and the other
1094    ///     with `Err`, they are always **not equal**.
1095    ///
1096    /// # Example
1097    ///
1098    /// ```
1099    /// use json_escape::unescape;
1100    ///
1101    /// // Case 1: Both iterators produce the same error. They are equal,
1102    /// // even though their valid prefixes ("a" and "b") are different.
1103    /// let failing_a = unescape(r#"a\k"#);
1104    /// let failing_b = unescape(r#"b\k"#);
1105    /// assert_eq!(failing_a, failing_b);
1106    ///
1107    /// // Case 2: Both iterators succeed. Equality depends on the byte stream.
1108    /// let successful_a = unescape(r#"hello\nworld"#);
1109    /// let successful_b = unescape(r#"hello\nworld"#);
1110    /// assert_eq!(successful_a, successful_b);
1111    ///
1112    /// let successful_c = unescape(r#"different"#);
1113    /// assert_ne!(successful_a, successful_c);
1114    ///
1115    /// // Case 3: One succeeds and one fails. They are not equal.
1116    /// let succeeding = unescape(r#"stop"#);
1117    /// let failing = unescape(r#"stop\k"#);
1118    /// assert_ne!(succeeding, failing);
1119    ///
1120    /// // Case 4: Both iterators fail differently. They are not equal.
1121    /// let failing_a = unescape(r#"data:\k"#);
1122    /// let failing_b = unescape(r#"data:\"#);
1123    /// assert_ne!(failing_a, failing_b);
1124    /// ```
1125    fn eq(&self, other: &Unescape<'a>) -> bool {
1126        // Fast path: if they are views into the same underlying data with the same state.
1127        ((self.bytes.as_ref() == other.bytes.as_ref())
1128            && (self.unicode == other.unicode)
1129            && (self.unicode_len == other.unicode_len)
1130            && (self.unicode_pos == other.unicode_pos))
1131            || {
1132                let mut a_error = None;
1133                let mut b_error = None;
1134
1135                let mut a = self.clone().map_while(|result| match result {
1136                    Ok(ok) => Some(ok),
1137                    Err(err) => {
1138                        a_error = Some(err);
1139                        None
1140                    }
1141                });
1142
1143                let mut b = other.clone().map_while(|result| match result {
1144                    Ok(ok) => Some(ok),
1145                    Err(err) => {
1146                        b_error = Some(err);
1147                        None
1148                    }
1149                });
1150
1151                let streams_match = chunks_eq(&mut a, &mut b);
1152
1153                // Drain the iterators to ensure the error state is captured,
1154                // especially if chunks_eq returned false early.
1155                // (e.g unescape("a\k") and unescape("b\k") which are actually
1156                // equal)
1157                a.for_each(|_| {});
1158                b.for_each(|_| {});
1159
1160                match (a_error, b_error) {
1161                    // Both errored: equality depends only on the errors being the same.
1162                    (Some(a_err), Some(b_err)) => a_err == b_err,
1163                    // Both succeeded: equality depends on the byte streams having been identical.
1164                    (None, None) => streams_match,
1165                    // One errored and the other didn't: they are not equal.
1166                    _ => false,
1167                }
1168            }
1169    }
1170}
1171
#[cfg(feature = "alloc")]
impl<'a> TryFrom<Unescape<'a>> for Cow<'a, [u8]> {
    type Error = UnescapeError;

    /// Collects the unescaped bytes into a `Cow<'a, [u8]>`.
    ///
    /// When the iterator yields at most one chunk (for instance because the
    /// input contains no escape sequences), that chunk is returned as
    /// `Cow::Borrowed` and nothing is allocated. Otherwise all chunks are
    /// copied into a `Cow::Owned` buffer.
    ///
    /// The first `UnescapeError` encountered stops the collection and is
    /// returned.
    ///
    /// **Requires the `alloc` feature.**
    fn try_from(mut value: Unescape<'a>) -> Result<Self, Self::Error> {
        let Some(first) = value.next().transpose()? else {
            return Ok(Cow::Borrowed(b""));
        };
        let Some(second) = value.next().transpose()? else {
            return Ok(Cow::Borrowed(first));
        };

        // Two or more chunks: size the buffer from what was already produced
        // plus the input bytes that are still unread.
        let capacity = first.len() + second.len() + value.bytes.len();
        let mut buf = Vec::with_capacity(capacity);
        buf.extend_from_slice(first);
        buf.extend_from_slice(second);
        for item in value {
            buf.extend_from_slice(item?);
        }

        Ok(Cow::Owned(buf))
    }
}
1206
1207// =============================================================================
1208// DisplayUnescape Implementation
1209// =============================================================================
1210
1211/// A wrapper for an [`Unescape`] iterator that implements [`fmt::Display`].
1212///
1213/// This struct is created by the [`Unescape::display_utf8()`] method. It allows for
1214/// printing the unescaped content directly to a formatter, which **avoids
1215/// any heap allocations**. The unescaping and UTF-8 decoding are performed on-the-fly as the
1216/// `fmt` method is called.
1217pub struct DisplayUnescape<'a> {
1218    inner: Unescape<'a>,
1219}
1220
1221impl fmt::Display for DisplayUnescape<'_> {
1222    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1223        self.inner.clone()._display_utf8(f, false)
1224    }
1225}
1226
1227/// A wrapper for an [`Unescape`] iterator that implements [`fmt::Display`] lossily.
1228///
1229/// This struct is created by the [`Unescape::display_utf8_lossy()`] method. Like
1230/// `DisplayUnescape`, it performs its operation **without any heap allocations**.
1231///
1232/// This method differs from `display_utf8` in that it handles two types of
1233/// errors gracefully:
1234/// - Invalid JSON escape sequences will be ignored, and the iterator will
1235///   continue to completion without a `fmt::Error`.
1236/// - Invalid UTF-8 byte sequences will be replaced with the Unicode
1237///   replacement character (``, U+FFFD)
1238pub struct DisplayUnescapeLossy<'a> {
1239    inner: Unescape<'a>,
1240}
1241
1242impl fmt::Display for DisplayUnescapeLossy<'_> {
1243    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1244        // Lossy mode: replace invalid sequences with U+FFFD and continue.
1245        self.inner.clone()._display_utf8(f, true)
1246    }
1247}
1248
1249// =============================================================================
1250// Error Types
1251// =============================================================================
1252
1253/// An error that can occur when decoding the final byte stream to a UTF-8 string.
1254#[derive(Copy, Eq, PartialEq, Clone, Debug)]
1255pub enum DecodeUtf8Error {
1256    /// The unescaped byte sequence was not valid UTF-8.
1257    Utf8(str::Utf8Error),
1258    /// An error occurred during the JSON unescaping process itself.
1259    Unescape(UnescapeError),
1260}
1261
1262impl fmt::Display for DecodeUtf8Error {
1263    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1264        match self {
1265            DecodeUtf8Error::Utf8(e) => fmt::Display::fmt(e, f),
1266            DecodeUtf8Error::Unescape(e) => fmt::Display::fmt(e, f),
1267        }
1268    }
1269}
1270
/// Payload of an "invalid escape" error: the byte that followed the backslash.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub struct InvalidEscapeError {
    /// The invalid character found after a `\`.
    pub found: u8,
}
1278
/// Payload of a "lone surrogate" error: the unpaired UTF-16 surrogate value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub struct LoneSurrogateError {
    /// The 16-bit surrogate code point.
    pub surrogate: u16,
}
1286
/// Payload of an "invalid hex digit" error raised inside a `\uXXXX` sequence.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub struct InvalidHexError {
    /// The non-hex character that was found.
    pub found: u8,
}

impl fmt::Display for InvalidHexError {
    /// Prints the offending byte as two uppercase hex digits.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let byte = self.found;
        f.write_fmt(format_args!("found invalid hex digit '0x{:02X}'", byte))
    }
}
1300
1301/// An error that can occur during the JSON string unescaping process.
1302#[derive(Copy, Eq, PartialEq, Clone, Debug)]
1303pub struct UnescapeError {
1304    /// The specific kind of unescaping error.
1305    pub(crate) kind: UnescapeErrorKind,
1306    /// The byte offset from the start of the escape sequence (`\`) where the
1307    /// error was detected.
1308    ///
1309    /// This is guaranteed to be less than 12, as the maximum escape sequence
1310    /// is `\uXXXX\uXXXX`.
1311    pub(crate) offset: u8,
1312}
1313
1314impl UnescapeError {
1315    /// Returns the specific kind of error that occurred.
1316    ///
1317    /// This can be used to programmatically handle different error types,
1318    /// such as distinguishing between a malformed hex sequence and an
1319    /// invalid escape character.
1320    ///
1321    /// ### Example
1322    ///
1323    /// ```
1324    /// # use json_escape::{unescape, UnescapeErrorKind, InvalidHexError};
1325    /// let mut unescaper = unescape(r#"\u123Z"#);
1326    /// let err = unescaper.next().unwrap().unwrap_err();
1327    ///
1328    /// match err.kind() {
1329    ///     UnescapeErrorKind::InvalidHex(InvalidHexError { found, .. }) => {
1330    ///         // We can inspect the exact invalid character found.
1331    ///         assert_eq!(found, b'Z');
1332    ///     }
1333    ///     _ => panic!("Expected an InvalidHex error"),
1334    /// }
1335    /// ```
1336    pub fn kind(&self) -> UnescapeErrorKind {
1337        self.kind
1338    }
1339
1340    /// Returns the byte offset from the start of the escape sequence (`\`)
1341    /// where the error was detected.
1342    ///
1343    /// - For `\x`, the offset is `1` (pointing to `x`).
1344    /// - For `\u123?`, the offset is `5` (pointing to `?`).
1345    /// - For a lone surrogate `\uD800`, the offset is `6` (pointing after the sequence).
1346    ///
1347    /// This is useful for providing detailed error messages that can point
1348    /// to the exact location of the problem in the source string.
1349    ///
1350    /// ### Example
1351    ///
1352    /// ```
1353    /// # use json_escape::unescape;
1354    /// let json_string_content = r#"bad escape \x here"#;
1355    /// let mut unescaper = unescape(json_string_content);
1356    ///
1357    /// // read off 'bad escape '
1358    /// let first = unescaper.next().unwrap().unwrap();
1359    /// assert_eq!(first, b"bad escape ");
1360    ///
1361    /// let err = unescaper.next().unwrap().unwrap_err();
1362    ///
1363    /// // The error occurred at the 'x', which is 1 byte after the '\'
1364    /// assert_eq!(err.offset(), 1);
1365    ///
1366    /// // You could use this to highlight the error in the original input
1367    /// let backslash_pos = json_string_content.find('\\').unwrap();
1368    /// let error_pos = backslash_pos + err.offset() as usize;
1369    /// assert_eq!(json_string_content.as_bytes()[error_pos], b'x');
1370    ///
1371    /// // The generated error message also includes this info.
1372    /// let expected_msg = "invalid escape: '\\0x78' at offset 1";
1373    /// assert_eq!(err.to_string(), expected_msg);
1374    /// ```
1375    pub fn offset(&self) -> u8 {
1376        self.offset
1377    }
1378}
1379
1380/// The specific kind of error that can occur during JSON string unescaping.
1381///
1382/// This enum covers all possible failures described by the JSON standard for string contents.
1383#[derive(Copy, Eq, PartialEq, Clone, Debug)]
1384#[non_exhaustive]
1385pub enum UnescapeErrorKind {
1386    /// Found a backslash followed by an unexpected character (e.g., `\x`).
1387    InvalidEscape(InvalidEscapeError),
1388    /// Found `\u` but the following characters were not 4 valid hex digits.
1389    InvalidHex(InvalidHexError),
1390    /// Input ended unexpectedly while parsing an escape sequence (e.g., `\u12`).
1391    UnexpectedEof,
1392    /// The `\u` sequence yielded a lone high or low surrogate without a matching pair.
1393    LoneSurrogate(LoneSurrogateError),
1394}
1395
1396impl fmt::Display for UnescapeError {
1397    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1398        match self.kind {
1399            UnescapeErrorKind::InvalidEscape(e) => {
1400                write!(
1401                    f,
1402                    "invalid escape: '\\0x{:02X}' at offset {}",
1403                    e.found, self.offset
1404                )
1405            }
1406            UnescapeErrorKind::InvalidHex(ref s) => {
1407                write!(f, "{} at offset {}", s, self.offset)
1408            }
1409            UnescapeErrorKind::UnexpectedEof => {
1410                write!(
1411                    f,
1412                    "unexpected end of input while parsing escape sequence, expected character at offset {}",
1413                    self.offset
1414                )
1415            }
1416            UnescapeErrorKind::LoneSurrogate(e) => write!(
1417                f,
1418                "invalid unicode sequence: lone surrogate found: 0x{:04X} at offset {}",
1419                e.surrogate, self.offset
1420            ),
1421        }
1422    }
1423}
1424
1425impl core::error::Error for UnescapeError {}
1426impl core::error::Error for DecodeUtf8Error {
1427    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
1428        match self {
1429            DecodeUtf8Error::Utf8(e) => Some(e),
1430            DecodeUtf8Error::Unescape(e) => Some(e),
1431        }
1432    }
1433}
1434
1435// =============================================================================
1436// Utilities
1437// =============================================================================
1438
// Lookup table from a byte to its JSON-escaped `&'static str` form.
// `None` means the byte is copied through without escaping.
const ESCAPE_TABLE: [Option<&'static str>; 256] = {
    // Escapes for the control bytes 0x00..=0x1F, indexed by byte value.
    // Bytes with a short form (`\b`, `\t`, `\n`, `\f`, `\r`) use it; all
    // others use the six-character `\uXXXX` form.
    const CONTROL_ESCAPES: [&str; 0x20] = [
        r#"\u0000"#, r#"\u0001"#, r#"\u0002"#, r#"\u0003"#, // 0x00..=0x03
        r#"\u0004"#, r#"\u0005"#, r#"\u0006"#, r#"\u0007"#, // 0x04..=0x07
        r#"\b"#, r#"\t"#, r#"\n"#, r#"\u000b"#, // 0x08..=0x0B
        r#"\f"#, r#"\r"#, r#"\u000e"#, r#"\u000f"#, // 0x0C..=0x0F
        r#"\u0010"#, r#"\u0011"#, r#"\u0012"#, r#"\u0013"#, // 0x10..=0x13
        r#"\u0014"#, r#"\u0015"#, r#"\u0016"#, r#"\u0017"#, // 0x14..=0x17
        r#"\u0018"#, r#"\u0019"#, r#"\u001a"#, r#"\u001b"#, // 0x18..=0x1B
        r#"\u001c"#, r#"\u001d"#, r#"\u001e"#, r#"\u001f"#, // 0x1C..=0x1F
    ];

    let mut table: [Option<&'static str>; 256] = [None; 256];

    // `for` loops are not available in const evaluation, hence `while`.
    let mut byte = 0;
    while byte < CONTROL_ESCAPES.len() {
        table[byte] = Some(CONTROL_ESCAPES[byte]);
        byte += 1;
    }

    // The two printable characters that must be escaped.
    table[b'"' as usize] = Some(r#"\""#);
    table[b'\\' as usize] = Some(r#"\\"#);

    table
};
1489
1490// A simple boolean-like lookup table for SIMD.
1491// 0 = no escape needed, 1 = escape needed.
1492// This is very compact (256 bytes) and fits easily in the L1 cache.
1493#[allow(unused)]
1494const ESCAPE_DECISION_TABLE: [u8; 256] = {
1495    let mut table = [0u8; 256];
1496    let mut i = 0;
1497    while i < 256 {
1498        if ESCAPE_TABLE[i].is_some() {
1499            table[i] = 1;
1500        }
1501        i += 1;
1502    }
1503    table
1504};
1505
// This is the SIMD version, compiled only when the "simd" feature is enabled on nightly build.
//
// Returns the index of the first byte in `bytes` that needs escaping (a byte
// <= 0x1F, a double quote, or a backslash), or `None` if there is no such byte.
// Full 16-byte chunks are tested with vector compares; any shorter tail is
// checked byte-by-byte through `ESCAPE_DECISION_TABLE`.
//
// NOTE(review): this imports from `std::simd`; if the `simd` feature is meant to
// be usable without `std`, the `core::simd` path may be required — confirm.
#[cfg(all(feature = "simd", nightly))]
#[inline]
fn find_escape_char(bytes: &[u8]) -> Option<usize> {
    use std::simd::{Simd, prelude::SimdPartialEq, prelude::SimdPartialOrd};

    const LANES: usize = 16; // Process 16 bytes at a time (fits in SSE2/AVX)
    let mut i = 0;

    // SIMD main loop: only runs while a whole `LANES`-byte chunk is available.
    while i + LANES <= bytes.len() {
        // Load 16 bytes from the slice into a SIMD vector.
        let chunk = Simd::<u8, LANES>::from_slice(&bytes[i..]);

        // Create comparison vectors. These are effectively 16 copies of the byte.
        let space_v = Simd::splat(b' ' - 1); // 0x1F: used with `<=` to express "< ' '"
        let quote_v = Simd::splat(b'"');
        let slash_v = Simd::splat(b'\\');

        // Perform all 16 comparisons at once. The result is a mask.
        // (`lt_space_mask` is computed as `chunk <= 0x1F`, i.e. `chunk < ' '`.)
        let lt_space_mask = chunk.simd_le(space_v);
        let eq_quote_mask = chunk.simd_eq(quote_v);
        let eq_slash_mask = chunk.simd_eq(slash_v);

        // Combine the masks. A byte needs escaping if ANY of the conditions are true.
        let combined_mask = lt_space_mask | eq_quote_mask | eq_slash_mask;

        // Check if any lane in the combined mask is true.
        if combined_mask.any() {
            // If yes, find the index of the *first* true lane.
            // trailing_zeros() on the bitmask gives us this index directly.
            let first_match_index = combined_mask.to_bitmask().trailing_zeros() as usize;
            return Some(i + first_match_index);
        }

        i += LANES;
    }

    // Handle the remaining bytes (if any) with the simple iterator method.
    if i < bytes.len() {
        if let Some(pos) = bytes[i..]
            .iter()
            .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
        {
            // `pos` is relative to the tail slice, so shift it back by `i`.
            return Some(i + pos);
        }
    }

    None
}
1556
// Stable-Rust SIMD version, compiled when the "simd" feature is enabled on a
// non-nightly compiler targeting x86_64.
//
// Returns the index of the first byte in `bytes` that needs escaping (a byte
// < 0x20, a double quote, or a backslash), or `None` if there is no such byte.
#[cfg(all(feature = "simd", not(nightly), target_arch = "x86_64"))]
#[inline]
fn find_escape_char(bytes: &[u8]) -> Option<usize> {
    // This is the stable Rust path using explicit CPU intrinsics.
    // It's guarded by cfg flags to only compile on x86_64 with the simd feature.
    use std::arch::x86_64::*;

    let mut i = 0;
    const LANES: usize = 16; // SSE2 works on 128-bit registers, which is 16 bytes.

    // On x86_64, we can tell the compiler to use SSE2 features in this specific function.
    // This is safe because we've already checked the target architecture.
    //
    // Scans the 16 bytes starting at `bytes[i]` and returns the absolute index of
    // the first one that needs escaping, if any.
    //
    // # Safety
    // The caller must guarantee `i + 16 <= bytes.len()`; the load below reads 16
    // bytes starting at offset `i` without any bounds check of its own.
    #[target_feature(enable = "sse2")]
    unsafe fn find_in_chunk(bytes: &[u8], i: usize) -> Option<usize> {
        // Load 16 bytes of data from the slice.
        // SAFETY: per this function's contract `i + 16 <= bytes.len()`, so the 16
        // bytes read are inside the slice; `_mm_loadu_si128` is the unaligned
        // load, so no alignment requirement applies to the pointer.
        let chunk = unsafe { _mm_loadu_si128(bytes.as_ptr().add(i) as *const _) };

        // Create comparison vectors for quote and slash.
        let quote_v = _mm_set1_epi8(b'"' as i8);
        let slash_v = _mm_set1_epi8(b'\\' as i8);

        // Emulate unsigned comparison for control characters
        // Create a vector with the value 0x80 in each lane.
        let bias = _mm_set1_epi8(0x80u8 as i8);
        // Create the comparison vector for bytes < 0x20 (' ').
        let space_v = _mm_set1_epi8(b' ' as i8);

        // Bias both the input chunk and the comparison vector by XORing with 0x80.
        let biased_chunk = _mm_xor_si128(chunk, bias);
        let biased_space_v = _mm_xor_si128(space_v, bias);

        // Now, a signed less-than comparison on the biased values gives the
        // same result as an unsigned less-than on the original values.
        let lt_space_mask = _mm_cmplt_epi8(biased_chunk, biased_space_v);

        // Perform the equality comparisons (these are unaffected by signedness).
        let eq_quote_mask = _mm_cmpeq_epi8(chunk, quote_v);
        let eq_slash_mask = _mm_cmpeq_epi8(chunk, slash_v);

        // Combine the results.
        let combined_mask = _mm_or_si128(lt_space_mask, _mm_or_si128(eq_quote_mask, eq_slash_mask));

        // Create a bitmask to find the first match.
        // Bit `k` of `mask` is the top bit of byte lane `k`.
        let mask = _mm_movemask_epi8(combined_mask);

        if mask != 0 {
            // The lowest set bit corresponds to the earliest matching byte.
            Some(i + mask.trailing_zeros() as usize)
        } else {
            None
        }
    }
    // Main loop
    while i + LANES <= bytes.len() {
        // SAFETY: the loop condition guarantees `i + LANES <= bytes.len()`, which
        // is exactly `find_in_chunk`'s precondition, and this function is only
        // compiled for x86_64, where SSE2 is part of the baseline instruction set.
        if let Some(result) = unsafe { find_in_chunk(bytes, i) } {
            return Some(result);
        }
        i += LANES;
    }

    // Handle the remainder with the fast scalar lookup.
    if i < bytes.len() {
        if let Some(pos) = bytes[i..]
            .iter()
            .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
        {
            // `pos` is relative to the tail slice, so shift it back by `i`.
            return Some(i + pos);
        }
    }

    None
}
1628
1629// A fallback for when SIMD feature is off.
1630#[cfg(not(feature = "simd"))]
1631#[inline]
1632fn find_escape_char(bytes: &[u8]) -> Option<usize> {
1633    bytes
1634        .iter()
1635        .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
1636}
1637
// Build-time guard: with the "simd" feature on, a `find_escape_char`
// implementation only exists for nightly compilers or x86_64 targets. Any other
// combination is rejected here with an explicit message.
#[cfg(all(feature = "simd", not(nightly), not(target_arch = "x86_64")))]
compile_error! { "simd requires nightly or target_arch = \"x86_64\"" }
1640
// Unescape table: indexed by the byte that follows a `\`, each slot holds the
// decoded bytes for that simple escape, or `None` when the byte does not name
// one of the simple (single-character) escapes.
const UNESCAPE_TABLE: [Option<&[u8]>; 256] = {
    // (escape letter, decoded bytes) for every simple escape.
    const SIMPLE_ESCAPES: [(u8, &[u8]); 8] = [
        (b'"', b"\""),
        (b'\\', b"\\"),
        (b'/', b"/"),
        (b'b', b"\x08"),
        (b'f', b"\x0C"),
        (b'n', b"\n"),
        (b'r', b"\r"),
        (b't', b"\t"),
    ];

    let mut table: [Option<&[u8]>; 256] = [None; 256];
    let mut k = 0;
    while k < SIMPLE_ESCAPES.len() {
        let (letter, decoded) = SIMPLE_ESCAPES[k];
        table[letter as usize] = Some(decoded);
        k += 1;
    }
    table
};
1654
/// Identity table of one-element arrays: entry `i` is `[i as u8]`.
/// Borrowing an entry yields a one-byte slice for any `u8` value without
/// building it at run time.
const U8_TABLE: [[u8; 1]; 256] = {
    let mut cells = [[0u8; 1]; 256];
    let mut value = 0u8;
    loop {
        cells[value as usize][0] = value;
        // Stop after the last byte value; incrementing past it would overflow.
        if value == u8::MAX {
            break;
        }
        value += 1;
    }
    cells
};
1666
1667#[inline(always)]
1668fn byte_as_static_slice(b: u8) -> &'static [u8] {
1669    // coerce from &'static [u8;1] to &'static [u8]
1670    &U8_TABLE[b as usize]
1671}
1672
// The following function is copied from the `percent-encoding` crate, version 2.3.2.
// Source: https://github.com/servo/rust-url/blob/22b925f93ad505a830f1089538a9ed6f5fd90612/percent_encoding/src/lib.rs#L337-L365
//
// It is licensed under the same terms as the `percent-encoding` crate (MIT/Apache-2.0).
//
// This helper is used to efficiently convert a Cow<'_, [u8]> to a Cow<'_, str>
// lossily, with a specific optimization to avoid a re-allocation when the input
// is an owned, valid UTF-8 Vec<u8>.
//
// Behavior by input:
// - Borrowed bytes: delegates to `String::from_utf8_lossy`.
// - Owned, valid UTF-8 bytes: the existing buffer is turned into a `String`.
// - Owned, invalid bytes: the lossily-decoded replacement `String` is returned.
//
// The `allow` below covers the `core::ptr::eq` comparison of two `*const [u8]`
// wide pointers inside the `debug_assert!`.
#[cfg(feature = "alloc")]
#[allow(ambiguous_wide_pointer_comparisons)]
fn decode_utf8_lossy(input: Cow<'_, [u8]>) -> Cow<'_, str> {
    // Note: This function is duplicated in `form_urlencoded/src/query_encoding.rs`.
    // (Comment carried over with the copied code; the path presumably refers to
    // the upstream rust-url repository, not to this crate.)
    match input {
        Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes),
        Cow::Owned(bytes) => {
            match String::from_utf8_lossy(&bytes) {
                Cow::Borrowed(utf8) => {
                    // If from_utf8_lossy returns a Cow::Borrowed, then we can
                    // be sure our original bytes were valid UTF-8. This is because
                    // if the bytes were invalid UTF-8 from_utf8_lossy would have
                    // to allocate a new owned string to back the Cow so it could
                    // replace invalid bytes with a placeholder.

                    // First we do a debug_assert to confirm our description above.
                    let raw_utf8: *const [u8] = utf8.as_bytes();
                    debug_assert!(core::ptr::eq(raw_utf8, &*bytes));

                    // Given we know the original input bytes are valid UTF-8,
                    // and we have ownership of those bytes, we re-use them and
                    // return a Cow::Owned here.
                    // SAFETY: `from_utf8_lossy` returned `Cow::Borrowed`, which it
                    // only does when the whole input is valid UTF-8, and `bytes`
                    // has not been modified since that check.
                    Cow::Owned(unsafe { String::from_utf8_unchecked(bytes) })
                }
                // Invalid input: keep the freshly allocated, repaired string.
                Cow::Owned(s) => Cow::Owned(s),
            }
        }
    }
}
1710
/// Compare two chunk-iterators by their concatenated byte stream (streaming,
/// zero allocations).
///
/// Chunk boundaries are irrelevant: only the bytes obtained by concatenating
/// every chunk of `a` are compared with those obtained from `b`. Empty chunks
/// contribute no bytes and are skipped wherever they occur, including at the
/// very end of either iterator.
///
/// This is allocation-free: it streams through both iterators, comparing
/// overlapping prefixes and carrying the remainder of the longer chunk
/// forward into the next round.
///
/// Returns `true` when both byte streams are identical, `false` otherwise.
fn chunks_eq<'a, I1, A, I2, B>(mut a: I1, mut b: I2) -> bool
where
    A: 'a + AsRef<[u8]> + ?Sized,
    B: 'a + AsRef<[u8]> + ?Sized,
    I1: Iterator<Item = &'a A>,
    I2: Iterator<Item = &'a B>,
{
    // Unconsumed tail of the chunk currently being compared on each side.
    let mut a_rem: &[u8] = &[];
    let mut b_rem: &[u8] = &[];

    loop {
        // Refill `a_rem`, skipping over any empty chunks.
        while a_rem.is_empty() {
            match a.next() {
                Some(chunk) => a_rem = chunk.as_ref(),
                // 'a' is exhausted. The streams are equal only if 'b' has no
                // bytes left either: nothing carried over, and nothing but
                // empty chunks still to come.
                None => {
                    return b_rem.is_empty() && b.all(|chunk| chunk.as_ref().is_empty());
                }
            }
        }

        // Refill `b_rem`, skipping over any empty chunks.
        while b_rem.is_empty() {
            match b.next() {
                Some(chunk) => b_rem = chunk.as_ref(),
                // 'b' is exhausted while 'a' still holds at least one byte
                // (a_rem is non-empty), so the streams differ in length.
                None => return false,
            }
        }

        // Both remainders are non-empty here, so `n >= 1` and every round
        // makes progress.
        let n = a_rem.len().min(b_rem.len());

        // Compare the overlapping parts of the chunks.
        if a_rem[..n] != b_rem[..n] {
            return false;
        }

        // Move the slices past the part we just compared.
        a_rem = &a_rem[n..];
        b_rem = &b_rem[n..];
    }
}
1761
// Writes `bytes` to `f` as text, one UTF-8 chunk at a time.
//
// Valid stretches are written verbatim. For each invalid sequence:
// - `lossy == true`: a single U+FFFD replacement character is written instead;
// - `lossy == false`: formatting stops and `fmt::Error` is returned (valid
//   text preceding the invalid sequence has already been written).
#[inline]
fn display_bytes_utf8(bytes: &[u8], f: &mut fmt::Formatter<'_>, lossy: bool) -> fmt::Result {
    bytes.utf8_chunks().try_for_each(|piece| {
        f.write_str(piece.valid())?;

        match (piece.invalid().is_empty(), lossy) {
            // Nothing invalid follows this stretch of text.
            (true, _) => Ok(()),
            (false, true) => f.write_char(char::REPLACEMENT_CHARACTER),
            (false, false) => Err(fmt::Error),
        }
    })
}
1778
// Unit tests for the escape/unescape iterators and the private helpers above
// (`chunks_eq`, `byte_as_static_slice`), plus the `Read` and `Display`
// front-ends of `Unescape`.
#[cfg(test)]
mod tests {
    use core::fmt::Display;
    use std::{io::Read as _, string::ToString as _, vec};

    use super::*;

    // ===================== Escape ===================== //

    // Checks that escaping `input` produces `want`, both by collecting the
    // iterator into a `String` and through the iterator's `PartialEq` against a
    // string; the collected check is repeated for `explicit::escape_str`.
    fn test_escape_typical(input: &str, want: &str) {
        let got = escape_str(input).collect::<String>();
        assert_eq!(got, want);

        // Test PartialEq too
        assert_eq!(escape_str(input), want);

        // FIXME: Once logic is unified, remove this.
        let got = explicit::escape_str(input).collect::<String>();
        assert_eq!(got, want);

        // Test PartialEq too
        // NOTE(review): this repeats the `escape_str` comparison from above; it
        // looks like `explicit::escape_str(input)` was intended here — confirm
        // that the explicit iterator supports this comparison before changing.
        assert_eq!(escape_str(input), want)
    }

    #[test]
    fn test_empty_string() {
        test_escape_typical("", "");
    }

    #[test]
    fn test_quotes() {
        test_escape_typical("\"hello\"", "\\\"hello\\\"")
    }

    #[test]
    fn test_backslash() {
        test_escape_typical("\\hello\\", "\\\\hello\\\\");
    }

    // A forward slash is expected to pass through unescaped.
    #[test]
    fn test_slash() {
        test_escape_typical("/hello/", "/hello/");
    }

    #[test]
    fn test_control_chars() {
        test_escape_typical("\n\r\t\x08\x0C", "\\n\\r\\t\\b\\f");
    }

    #[test]
    fn test_escape_fully() {
        let input = "Hello, \"world\"!\nThis contains a \\ backslash and a \t tab.";
        let expected = r#"Hello, \"world\"!\nThis contains a \\ backslash and a \t tab."#;
        test_escape_typical(input, expected);
    }

    // Control characters without a short escape use the `\u00XX` form.
    #[test]
    fn test_other_control_chars() {
        let input = "Null:\0, Bell:\x07";
        let expected = r#"Null:\u0000, Bell:\u0007"#;
        test_escape_typical(input, expected);

        test_escape_typical("\x00\x1F", "\\u0000\\u001f");
        test_escape_typical("\x19", "\\u0019");
    }

    // The iterator yields the literal run, the escape, then the next literal run.
    #[test]
    fn test_iterator_chunks() {
        let input = "prefix\npostfix";
        let mut iter = escape_str(input);
        assert_eq!(iter.next(), Some("prefix"));
        assert_eq!(iter.next(), Some(r#"\n"#));
        assert_eq!(iter.next(), Some("postfix"));
        assert_eq!(iter.next(), None);
    }

    // Input without escapable bytes (including non-ASCII text) comes back as a
    // single chunk.
    #[test]
    fn test_no_escape_needed() {
        let input = "A simple string with no escapes.";
        let mut iter = escape_str(input);
        assert_eq!(iter.next(), Some("A simple string with no escapes."));
        assert_eq!(iter.next(), None);

        let input = "café";
        let mut iter = escape_str(input);
        assert_eq!(iter.next(), Some("café"));
        assert_eq!(iter.next(), None);

        let input = "❤️";
        let mut iter = escape_str(input);
        assert_eq!(iter.next(), Some("❤️"));
        assert_eq!(iter.next(), None);
    }

    // ===================== Unescape ===================== //

    #[test]
    fn test_byte_table() {
        assert_eq!(byte_as_static_slice(0), &[0]);
        assert_eq!(byte_as_static_slice(5), &[5]);
        assert_eq!(byte_as_static_slice(255), &[255]);
    }

    // Checks that unescaping `input` yields `want` through three routes —
    // `decode_utf8`, `PartialEq` against a string, and `display_utf8` — for both
    // `unescape` and `explicit::unescape`.
    fn test_unescape_typical<I: AsRef<[u8]> + ?Sized>(input: &I, want: &str) {
        let got = unescape(input).decode_utf8().unwrap();
        assert_eq!(got, want);

        // Test PartialEq too
        assert_eq!(unescape(input), want);

        // Help display
        assert_display(unescape(input).display_utf8(), Ok(want));

        // FIXME: Once logic is unified, remove this.
        let got = explicit::unescape(input).decode_utf8().unwrap();
        assert_eq!(got, want);

        // Test PartialEq too
        assert_eq!(explicit::unescape(input), want);

        // Help display
        assert_display(explicit::unescape(input).display_utf8(), Ok(want));
    }

    #[test]
    fn test_unicode_escape_basic_unescape() {
        // \u4E16 => 世 (E4 B8 96)
        let s = "X\\u4E16Y";
        test_unescape_typical(s, "X世Y");

        let s = "Snow: \\u2603"; // \u2603 => ☃
        test_unescape_typical(s, "Snow: ☃");

        let s = "A \\u03A9 B"; // Ω is U+03A9
        test_unescape_typical(s, "A Ω B");
    }

    #[test]
    fn test_surrogate_pair_unescape() {
        // 😀 is U+1F600 -> in JSON: \uD83D\uDE00
        let s = "A\\uD83D\\uDE00B";
        test_unescape_typical(s, "A😀B")
    }

    // `\x` is not a JSON escape: the first item must be an `InvalidEscape`
    // error that reports the offending byte and offset 1.
    #[test]
    fn test_invalid_escape_unescape() {
        let s = b"\\x";
        let mut u = unescape(s);

        match u.next() {
            Some(Err(UnescapeError {
                kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'x' }),
                offset: 1,
            })) => {}
            _ => panic!("expected invalid escape"),
        }

        // FIXME: Once logic is unified, remove this.
        let mut u = explicit::unescape(s);

        match u.next() {
            Some(Err(UnescapeError {
                kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'x' }),
                offset: 1,
            })) => {}
            _ => panic!("expected invalid escape"),
        }
    }

    #[test]
    fn test_simple_unescape() {
        let input = "Hello\\nWorld\\\"!"; // "Hello\nWorld\"!"
        test_unescape_typical(input, "Hello\nWorld\"!")
    }

    // A `\u` escape cut off before four hex digits must surface as
    // `UnexpectedEof`.
    #[test]
    fn test_truncated_unicode() {
        let input = "Trunc: \\u12"; // too short
        let it = unescape(input);
        let mut found = false;
        for r in it {
            match r {
                Ok(_) => continue,
                Err(UnescapeError {
                    kind: UnescapeErrorKind::UnexpectedEof,
                    offset: 4,
                }) => {
                    found = true;
                    break;
                }
                Err(_) => break,
            }
        }
        assert!(found);

        // FIXME: Once logic is unified, remove this.
        assert_eq!(
            explicit::unescape(input).next(),
            Some(Err(UnescapeError {
                kind: UnescapeErrorKind::UnexpectedEof,
                offset: 4,
            }))
        );
    }

    // ===================== Chunk_Eq ===================== //

    #[test]
    fn test_empty_iterators_are_equal() {
        let a: Vec<&[u8]> = vec![];
        let b: Vec<&[u8]> = vec![];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_empty_vs_non_empty() {
        let a: Vec<&[u8]> = vec![];
        let b = vec![&[1, 2, 3]];
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));

        // And the other way around
        let a = vec![&[1, 2, 3]];
        let b: Vec<&[u8]> = vec![];
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_single_identical_chunks() {
        let a = vec!["hello world"];
        let b = vec!["hello world"];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_different_chunk_boundaries_str() {
        // This is the key test: the concatenated content is identical,
        // but the chunk divisions are different.
        let a = vec!["he", "llo", " ", "world"];
        let b = vec!["hello ", "wo", "rld"];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_different_chunk_boundaries_bytes() {
        let a = vec![&[1, 2], &[3, 4, 5][..]];
        let b = vec![&[1, 2, 3], &[4, 5][..]];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_one_long_vs_many_short() {
        let a = vec!["a-long-single-chunk"];
        let b = vec!["a", "-", "long", "-", "single", "-", "chunk"];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_unequal_content_same_length() {
        let a = vec!["hello"];
        let b = vec!["hallo"];
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_unequal_at_chunk_boundary() {
        let a = vec!["ab", "c"]; // "abc"
        let b = vec!["ab", "d"]; // "abd"
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_one_is_prefix_of_other() {
        // a is shorter
        let a = vec!["user", "name"]; // "username"
        let b = vec!["user", "name", "123"]; // "username123"
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));

        // b is shorter
        let a = vec!["user", "name", "123"];
        let b = vec!["user", "name"];
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_complex_remainer_logic() {
        // This tests the carry-over logic extensively.
        // a: [1,2,3], [4,5], [6,7,8,9], [10]
        // b: [1,2], [3,4,5,6], [7,8], [9,10]
        let a = vec![&[1, 2, 3], &[4, 5][..], &[6, 7, 8, 9], &[10]];
        let b = vec![&[1, 2], &[3, 4, 5, 6][..], &[7, 8], &[9, 10]];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    // `chunks_eq` also accepts iterators over `&Vec<u8>` items.
    #[test]
    fn test_with_vec_references() {
        let v_a1 = vec![1, 2];
        let v_a2 = vec![3, 4, 5];
        let a_data = vec![&v_a1, &v_a2];

        let v_b1 = vec![1, 2, 3];
        let v_b2 = vec![4, 5];
        let b_data = vec![&v_b1, &v_b2];
        assert!(chunks_eq(a_data.into_iter(), b_data.into_iter()));
    }

    // ===================== Unescape Read ===================== //

    #[test]
    fn test_read_simple() {
        let input = br#"hello world"#;
        let mut reader = unescape(input);
        let mut buf = [0u8; 20];

        let bytes_read = reader.read(&mut buf).unwrap();

        assert_eq!(bytes_read, 11);
        assert_eq!(&buf[..bytes_read], b"hello world");

        // Second read should return 0 (EOF)
        let bytes_read_eof = reader.read(&mut buf).unwrap();
        assert_eq!(bytes_read_eof, 0);
    }

    #[test]
    fn test_read_with_simple_escapes() {
        let input = br#"hello\tworld\nline2"#;
        let mut reader = unescape(input);
        let mut buf = Vec::new();

        reader.read_to_end(&mut buf).unwrap();

        assert_eq!(buf, b"hello\tworld\nline2");
    }

    // Repeated reads into a buffer shorter than the input must reassemble the
    // whole input.
    #[test]
    fn test_read_into_small_buffer_multiple_calls() {
        let input = br#"this is a long string with no escapes"#;
        let mut reader = unescape(input);
        let mut buf = [0u8; 10];
        let mut result = Vec::new();

        loop {
            match reader.read(&mut buf) {
                Ok(0) => break, // EOF
                Ok(n) => {
                    result.extend_from_slice(&buf[..n]);
                }
                Err(e) => panic!("Read error: {}", e),
            }
        }

        assert_eq!(result, input);
    }

    #[test]
    fn test_read_multibyte_char_across_buffer_boundary() {
        // The grinning face emoji 😀 is \uD83D\uDE00, which is 4 bytes in UTF-8: 0xF0 0x9F 0x98 0x80
        let input = br#"emoji: \uD83D\uDE00 is here"#;
        let mut reader = unescape(input);

        // Buffer is small, forcing the 4-byte emoji to be written across multiple calls
        let mut buf = [0u8; 8];
        let mut result = Vec::new();

        // First read: "emoji: " (7 bytes) + first byte of emoji
        let n1 = reader.read(&mut buf).unwrap();
        assert_eq!(n1, 8);
        assert_eq!(&buf[..n1], b"emoji: \xF0");
        result.extend_from_slice(&buf[..n1]);

        // Second read: next 3 bytes of emoji + " is h"
        let n2 = reader.read(&mut buf).unwrap();
        assert_eq!(n2, 8);
        assert_eq!(&buf[..n2], b"\x9F\x98\x80 is h");
        result.extend_from_slice(&buf[..n2]);

        // Third read: "ere"
        let n3 = reader.read(&mut buf).unwrap();
        assert_eq!(n3, 3);
        assert_eq!(&buf[..n3], b"ere");
        result.extend_from_slice(&buf[..n3]);

        // Final read should be EOF
        let n4 = reader.read(&mut buf).unwrap();
        assert_eq!(n4, 0);

        assert_eq!(result, b"emoji: \xF0\x9F\x98\x80 is here");
        assert_eq!(result, "emoji: 😀 is here".as_bytes());
    }

    // An invalid escape must surface from `read` as an `InvalidData` I/O error
    // whose message mentions "invalid escape".
    #[test]
    fn test_read_error_invalid_escape() {
        let input = br#"hello \q world"#;
        let mut reader = unescape(input);
        let mut buf = [0u8; 20];

        let result = reader.read(&mut buf);

        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("invalid escape"));
    }

    #[test]
    fn test_read_error_lone_surrogate() {
        let input = br#"\uD83D rest of data seen"#; // High surrogate without a following low one
        let mut reader = unescape(input);
        let mut buf = [0u8; 10];

        let err = reader.read(&mut buf).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("lone surrogate"));
    }

    #[test]
    fn test_read_empty_input() {
        let input = b"";
        let mut reader = unescape(input);
        let mut buf = [0u8; 10];
        let bytes_read = reader.read(&mut buf).unwrap();
        assert_eq!(bytes_read, 0);
    }

    #[test]
    fn test_read_into_empty_buffer() {
        let input = b"hello";
        let mut reader = unescape(input);
        let mut buf = [0u8; 0];
        let bytes_read = reader.read(&mut buf).unwrap();
        // A read into an empty buffer should always succeed and return 0.
        assert_eq!(bytes_read, 0);
    }

    #[test]
    fn test_read_to_end_optimized() {
        let input = br#"first\nsecond\tthird \uD83D\uDE00 last"#;
        let mut reader = unescape(input);
        let mut buf = Vec::new();

        let bytes_read = reader.read_to_end(&mut buf).unwrap();
        let expected = b"first\nsecond\tthird \xF0\x9F\x98\x80 last";

        assert_eq!(bytes_read, expected.len());
        assert_eq!(buf, expected);
    }

    // ===================== Unescape Display ===================== //

    // Formats `display` into a fresh `String` and compares with `want`:
    // `Ok(text)` requires formatting to succeed and produce exactly `text`;
    // `Err(())` only requires formatting to fail (partial output is ignored).
    fn assert_display(display: impl Display, want: Result<&str, ()>) {
        let mut w = String::new();
        let res = fmt::write(&mut w, format_args!("{display}"));

        match want {
            Ok(want) => {
                assert!(res.is_ok());
                assert_eq!(w, want)
            }
            Err(_) => assert!(
                res.is_err(),
                "strict mode should return Err on invalid bytes"
            ),
        }
    }

    // -- NON-LOSSY TESTS (must be perfect) --

    #[test]
    fn test_display_simple_string() {
        let display = unescape("hello world").display_utf8();
        assert_display(display, Ok("hello world"));
    }

    #[test]
    fn test_display_empty_string() {
        assert_display(unescape("").display_utf8(), Ok(""));
    }

    #[test]
    fn test_display_standard_escapes() {
        let input = br#"\" \\ \/ \b \f \n \r \t"#;
        let expected = "\" \\ / \x08 \x0C \n \r \t";
        assert_display(unescape(input).display_utf8(), Ok(expected));
    }

    #[test]
    fn test_display_non_escaped_utf8() {
        let input = "你好, world".as_bytes();
        let expected = "你好, world";
        assert_display(unescape(input).display_utf8(), Ok(expected));
    }

    #[test]
    fn test_display_unicode_escape_bmp() {
        // cent sign: \u00A2 -> C2 A2 (2 bytes)
        let input = br"a\u00A2b";
        let expected = "a¢b";
        assert_display(unescape(input).display_utf8(), Ok(expected));
    }

    #[test]
    fn test_display_mixed_content() {
        let input = br#"Text with \n, \u00A2, and \uD83D\uDE0E emojis."#;
        let expected = "Text with \n, ¢, and 😎 emojis.";
        assert_display(unescape(input).display_utf8(), Ok(expected));
    }

    #[test]
    fn test_display_starts_and_ends_with_escape() {
        let input = br#"\u00A2hello\t"#;
        let expected = "¢hello\t";
        assert_display(unescape(input).display_utf8(), Ok(expected));
    }

    // -- NON-LOSSY ERROR TESTS --

    #[test]
    fn test_display_err_invalid_escape() {
        assert_display(unescape(br"hello \z world").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_incomplete_unicode() {
        assert_display(unescape(br"\u123").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_invalid_hex_in_unicode() {
        assert_display(unescape(br"\u123g").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_lone_high_surrogate() {
        assert_display(unescape(br"\uD800").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_high_surrogate_not_followed_by_low() {
        assert_display(unescape(br"\uD800\uABCD").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_invalid_source_utf8() {
        // A valid UTF-8 sequence for 'h' followed by an invalid byte
        assert_display(unescape(b"h\x80ello").display_utf8(), Err(()));
    }

    #[test]
    fn strict_valid_multi_byte_split() {
        // "€" U+20AC => bytes [0xE2, 0x82, 0xAC]
        let input = &[0xE2, 0x82, 0xAC];
        let display = unescape(input).display_utf8();
        assert_display(display, Ok("€"));
    }

    #[test]
    fn strict_errors_on_invalid_start_byte() {
        let input = &[0xFF, b'a'];
        let display = unescape(input).display_utf8();

        assert_display(display, Err(()));
    }

    // -- LOSSY TESTS --

    #[test]
    fn lossy_replaces_invalid_start_byte() {
        // 0xFF is invalid as a leading UTF-8 byte.
        let input = &[0xFF, b'a']; // invalid byte then ASCII 'a';
        let display = unescape(input).display_utf8_lossy();
        // replacement char + 'a'
        assert_display(display, Ok("\u{FFFD}a"));
    }

    #[test]
    fn lossy_handles_trailing_incomplete_bytes() {
        // A trailing incomplete 3-byte sequence: [0xE2, 0x82] (missing 0xAC)
        let input: &[u8] = &[0xE2, 0x82];
        let display = unescape(input).display_utf8_lossy();
        // Should replace incomplete tail with U+FFFD.
        assert_display(display, Ok("\u{FFFD}"));
    }

    #[test]
    fn test_display_lossy_invalid_source_utf8() {
        // The invalid byte sequence should be replaced.
        let input = b"valid\xF0\x90\x80invalid";
        let expected = "valid\u{FFFD}invalid";
        assert_display(unescape(input).display_utf8_lossy(), Ok(expected));
    }

    #[test]
    fn test_display_lossy_invalid_escape_truncates() {
        // In lossy mode, an invalid JSON escape stops the processing.
        let input = br"this is ok\z but this is not";
        let expected = "this is ok";
        assert_display(unescape(input).display_utf8_lossy(), Ok(expected));
    }

    // A lone high surrogate at the end of input also truncates the lossy
    // output at the point of the error.
    #[test]
    fn test_display_lossy_incomplete_unicode_truncates() {
        let input = br"truncate here \uD83D";
        let expected = "truncate here ";
        assert_display(unescape(input).display_utf8_lossy(), Ok(expected));
    }

    // Inspired by and copied from memchr
    // Compile-time check that the iterator types keep these auto traits.
    #[test]
    fn sync_regression() {
        use core::panic::{RefUnwindSafe, UnwindSafe};

        fn assert_send_sync<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}
        assert_send_sync::<Unescape<'_>>();
        assert_send_sync::<Escape<'_>>();
    }
}
2395
#[cfg(test)]
mod find_escape_char_tests {
    use std::format;

    use super::{ESCAPE_DECISION_TABLE, find_escape_char};

    /// Runs `find_escape_char` over the bytes of `input` and asserts that the
    /// result equals `expected`, labelling any failure with `case_name`.
    fn run_test(input: &str, expected: Option<usize>, case_name: &str) {
        let result = find_escape_char(input.as_bytes());
        assert_eq!(result, expected, "Failed test case: '{}'", case_name);
    }

    /// Inputs that contain no escapable byte must yield `None`, whatever
    /// their length or character set.
    #[test]
    fn test_no_escapes() {
        run_test("", None, "Empty string");
        run_test("Hello, world!", None, "Simple ASCII");

        // Exactly one 16-byte chunk. The length is asserted so this fixture
        // cannot silently drift away from the boundary it is meant to cover.
        let one_chunk = "Exactly 16 bytes";
        assert_eq!(one_chunk.len(), 16, "fixture must be exactly 16 bytes");
        run_test(one_chunk, None, "16-byte ASCII");
        run_test(
            "This string is over 16 bytes long now",
            None,
            "Over 16-byte ASCII",
        );

        // The original source of the bug: non-ASCII UTF-8 characters.
        // This ensures the signedness bug is truly fixed.
        run_test("Hello, éàçüö!", None, "Non-ASCII UTF-8");
        run_test("Testing with emojis 😀❤️✅", None, "Emojis");
    }

    /// A single escapable byte must be reported at its byte index, whether it
    /// sits at the start, in the middle, or at the end of the input.
    #[test]
    fn test_single_escapes() {
        run_test("\"", Some(0), "Quote at start");
        run_test("Hello \" world", Some(6), "Quote in middle");
        run_test("Hello\\", Some(5), "Backslash at end");
        run_test("\n", Some(0), "Control char (newline) at start");
        run_test("Hello\tworld", Some(5), "Control char (tab) in middle");
        run_test(
            "Control char at end\u{08}",
            Some(19),
            "Control char (backspace) at end",
        );
    }

    /// When several escapable bytes are present, the index of the earliest
    /// one must be returned.
    #[test]
    fn test_finds_first_of_multiple() {
        // This confirms it always finds the *first* match, not a later one.
        run_test("a\"b\\c\nd", Some(1), "Finds first quote");
        run_test("ab\\c\"d\ne", Some(2), "Finds first backslash");
        run_test("abc\nd\"e\\f", Some(3), "Finds first control char");
        run_test("\"\n\\", Some(0), "Multiple escapes at start");
    }

    /// Places an escapable byte immediately before, at, and after the
    /// 16-byte boundary, and deep inside a longer input.
    #[test]
    fn test_simd_chunk_boundaries() {
        // These tests are critical for verifying the SIMD logic. A chunk is 16 bytes.
        let s15 = "a".repeat(15);
        let s16 = "a".repeat(16);
        let s17 = "a".repeat(17);

        // Escape at the exact end of the first 16-byte chunk
        run_test(&format!("{}\"", s15), Some(15), "Escape at index 15");

        // Escape at the exact start of the second 16-byte chunk
        run_test(&format!("{}\n", s16), Some(16), "Escape at index 16");

        // Escape within the second chunk
        run_test(&format!("{}\t", s17), Some(17), "Escape at index 17");

        // A long string with an escape several chunks in
        let long = "a".repeat(40);
        run_test(
            &format!("{}\\\\", long),
            Some(40),
            "Escape deep in a long string",
        );
    }

    /// Covers inputs whose length is not a multiple of 16 bytes, so the
    /// bytes left over after the last full chunk are exercised too.
    #[test]
    fn test_remainder_logic() {
        // These tests ensure the scalar fallback logic works correctly for inputs
        // that are not a multiple of 16 bytes long.

        // String shorter than 16 bytes
        run_test("short\nstring", Some(5), "Short string with escape");
        run_test("no escapes", None, "Short string no escape");

        // String with 17 bytes (16 for SIMD, 1 for remainder)
        let s16 = "a".repeat(16);
        run_test(
            &format!("{}\"", s16),
            Some(16),
            "Escape in 1-byte remainder",
        );

        // String with 31 bytes (16 for SIMD, 15 for remainder)
        let s15 = "b".repeat(15);
        run_test(
            &format!("{}{}\t", s15, s15),
            Some(30),
            "Escape at end of 15-byte remainder",
        );
    }

    /// Appends each of the 256 byte values to a 16-byte escape-free prefix and
    /// checks that `find_escape_char` agrees with `ESCAPE_DECISION_TABLE`.
    #[test]
    fn test_all_escapable_bytes_individually() {
        // This is the ultimate test. It iterates through all 256 possible byte values
        // and confirms that our function's decision matches the ESCAPE_DECISION_TABLE.
        let prefix = "0123456789abcdef"; // A 16-byte safe prefix to engage the SIMD loop.

        for byte_val in 0..=255u8 {
            // We can't create a &str from invalid UTF-8, so we work with byte slices.
            let mut test_bytes = prefix.as_bytes().to_vec();
            test_bytes.push(byte_val);

            let result = find_escape_char(&test_bytes);
            let expected_to_escape = ESCAPE_DECISION_TABLE[byte_val as usize] == 1;

            if expected_to_escape {
                // If this byte SHOULD be escaped, we expect to find it at index 16.
                assert_eq!(
                    result,
                    Some(16),
                    "Failed to find required escape for byte 0x{:02X}",
                    byte_val
                );
            } else {
                // If this byte should NOT be escaped, we expect to find nothing.
                assert_eq!(
                    result, None,
                    "Incorrectly found an escape for byte 0x{:02X}",
                    byte_val
                );
            }
        }
    }
}
json_escape/lib.rs

json_escape/
lib.rs