json_escape/
lib.rs

1//! # Streaming JSON String Escape/Unescape
2//!
3//! Welcome to a highly efficient, `no_std` compatible library for handling JSON string escaping and unescaping. This crate provides iterator-based tools that process strings on the fly, avoiding heap allocations for the entire result. It's designed for performance-critical applications, such as parsing large JSON files or working in memory-constrained environments. ⚡
4//!
5//! The core of the library is two iterator structs:
6//! - **[`Escape`]**: Takes a string slice (`&str`) and yields escaped string slices ready for JSON serialization.
7//! - **[`Unescape`]**: Takes a byte slice (`&[u8]`) representing the content of a JSON string and yields the decoded byte slices.
8//!
9//! ## Key Features
10//! - **Zero-Copy Slicing**: For sequences of characters that don't need modification, the iterators yield slices that borrow directly from the input, avoiding unnecessary data copying.
11//! - **Comprehensive JSON Support**: Correctly handles all standard JSON escapes: `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`.
12//! - **Full Unicode Handling**: Correctly decodes `\uXXXX` sequences, including full support for UTF-16 surrogate pairs (e.g., `\uD83D\uDE00` for `😀`).
13//! - **Robust Error Handling**: The `Unescape` iterator returns descriptive errors (`UnescapeError`) for invalid or truncated escape sequences, making debugging straightforward.
14//! - **Allocation Control** (with `alloc` feature): Provides convenient methods to collect the iterator's output into owned types like `String` or `Cow<str>`.
15//! - **`std::io` Integration** (with `std` feature): The `Unescape` iterator implements `std::io::Read`, allowing it to be used as an efficient reader for I/O streams.
16//!
17//! ## Quick Start: Escaping a String
18//!
19//! ```
20//! use json_escape::escape_str;
21//!
22//! let input = "Hello, \"world\"!\nThis contains a \\ backslash.";
23//! let expected = r#"Hello, \"world\"!\nThis contains a \\ backslash."#;
24//!
25//! // The `escape_str` function returns an iterator.
26//! let mut escaper = escape_str(input);
27//!
28//! // You can iterate over the chunks:
29//! assert_eq!(escaper.next(), Some("Hello, "));
30//! assert_eq!(escaper.next(), Some(r#"\""#));
31//! assert_eq!(escaper.next(), Some("world"));
32//! // ...and so on.
33//!
34//! // Or, collect it into a String (requires the "alloc" feature).
35//! // let escaped_string: String = escape_str(input).collect();
36//! // assert_eq!(escaped_string, expected);
37//! ```
38//!
39//! ## Quick Start: Unescaping a String
40//!
41//! ```
42//! use json_escape::unescape;
43//!
44//! let input = r#"A 😀 emoji: \uD83D\uDE00 and a tab\t!"#;
45//!
46//! // The unescape iterator yields `Result<&[u8], _>`.
47//! let unescaper = unescape(input);
48//!
49//! // With the "alloc" feature, you can decode it directly into a string.
50//! let decoded_cow = unescaper.decode_utf8().unwrap();
51//! assert_eq!(decoded_cow, "A 😀 emoji: 😀 and a tab\t!");
52//! ```
53#![no_std]
54#![deny(missing_docs)]
55#![cfg_attr(all(feature = "simd", nightly), feature(portable_simd))]
56
57#[cfg(any(test, feature = "std"))]
58extern crate std;
59
60#[cfg(feature = "alloc")]
61extern crate alloc;
62
63#[cfg(any(test, feature = "alloc"))]
64use alloc::{borrow::Cow, string::String, vec::Vec};
65
66use core::{
67    char,
68    fmt::{self, Write as _},
69    iter::FusedIterator,
70    slice, str,
71};
72use memchr::memchr;
73
74// =============================================================================
75// Escape Implementation
76// =============================================================================
77
78/// Creates a streaming JSON string escaper from a string slice.
79///
80/// The returned [`Escape`] iterator lazily processes the input string, yielding
81/// slices that represent the escaped output.
82///
83/// # Examples
84///
85/// ```
86/// use json_escape::escape_str;
87///
88/// let escaper = escape_str("a\nb");
89/// let escaped_parts: Vec<_> = escaper.collect();
90///
91/// assert_eq!(escaped_parts, vec!["a", r#"\n"#, "b"]);
92/// ```
93#[inline]
94pub fn escape_str(input: &str) -> Escape<'_> {
95    Escape {
96        bytes: input.as_bytes(),
97    }
98}
99
/// A streaming JSON string escaper that yields `&'a str` slices.
///
/// Values of this type come from [`escape_str`]. Iterating splits the input at
/// every character that JSON requires to be escaped:
///
/// - A run of characters that need no escaping is yielded as one slice
///   borrowed from the input (`&'a str`).
/// - Each character that must be escaped is yielded as a `'static` slice
///   holding its escaped representation (e.g., `r#"\n"#`).
///
/// Since the output is produced chunk by chunk, no string has to be allocated
/// for the complete result.
///
/// ### Implemented Traits
/// - **`Iterator<Item = &'a str>`**: Process the escaped parts in a loop or with adapters.
/// - **`Display`**: Write the escaped content directly to any formatter (such as `println!`) without an intermediate allocation.
/// - **`Clone`**, **`Debug`**: Standard utility traits.
/// - **`PartialEq`**, **`PartialEq<B: AsRef<[u8]>>`**: Compare the escaped output. An `Escape` iterator equals another `Escape` or a byte slice when both produce the same sequence of escaped bytes.
/// - **`From<Escape<'a>> for Cow<'a, str>`** (requires `alloc` feature): Convert the iterator into a string that is only owned when necessary.
#[derive(Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct Escape<'a> {
    /// The part of the input that has not been yielded yet.
    bytes: &'a [u8],
}
124
125impl<'a> Iterator for Escape<'a> {
126    type Item = &'a str;
127
128    #[inline]
129    fn next(&mut self) -> Option<&'a str> {
130        if self.bytes.is_empty() {
131            return None;
132        }
133
134        // Find the first byte that needs escaping.
135        let pos = find_escape_char(self.bytes);
136
137        match pos {
138            // No escapable characters left; return the rest of the slice.
139            None => {
140                let s = self.bytes;
141                self.bytes = &[];
142                // SAFETY: The input was a valid &str, and we're returning the
143                // whole remaining chunk, so it's still valid UTF-8.
144                Some(unsafe { str::from_utf8_unchecked(s) })
145            }
146            // An escapable byte is at the beginning of the slice.
147            Some(0) => {
148                let byte = self.bytes[0];
149                self.bytes = &self.bytes[1..];
150                // The table lookup gives us a &'static str, which is a valid &'a str.
151                //
152                // Some(....unwrap()) is more correct
153                ESCAPE_TABLE[byte as usize]
154            }
155            // Found an escapable byte after a safe prefix. Return the prefix.
156            Some(p) => {
157                let (prefix, rest) = self.bytes.split_at(p);
158                self.bytes = rest;
159                // SAFETY: The soundness of this operation is critical.
160                // We are splitting the byte slice at the position of the first
161                // character that requires escaping. All JSON characters that
162                // require escaping (`"`, `\`, and control characters `\u0000`-`\u001F`)
163                // are single-byte ASCII characters. Therefore, `p` is guaranteed
164                // to be on a valid UTF-8 character boundary.
165                Some(unsafe { str::from_utf8_unchecked(prefix) })
166            }
167        }
168    }
169
170    fn size_hint(&self) -> (usize, Option<usize>) {
171        if self.bytes.is_empty() {
172            (0, Some(0))
173        } else {
174            // We'll yield at least 1 slice, and at most `len` slices if every byte is escaped.
175            (1, Some(self.bytes.len()))
176        }
177    }
178}
179
180impl<'a> FusedIterator for Escape<'a> {}
181
182impl fmt::Display for Escape<'_> {
183    /// Allows direct formatting of the escaped string without intermediate allocation.
184    ///
185    /// This is very useful for writing the escaped output directly to a stream,
186    /// such as a file or a network socket.
187    ///
188    /// # Example
189    ///
190    /// ```
191    /// use json_escape::escape_str;
192    ///
193    /// let escaper = escape_str("User said: \"Hi!\"\n");
194    /// let formatted = format!("{}", escaper);
195    ///
196    /// assert_eq!(formatted, r#"User said: \"Hi!\"\n"#);
197    /// ```
198    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
199        // The `clone()` is cheap as it only copies a slice reference.
200        for s in self.clone() {
201            f.write_str(s)?
202        }
203        Ok(())
204    }
205}
206
207impl fmt::Debug for Escape<'_> {
208    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
209        f.debug_struct("Escape").finish_non_exhaustive()
210    }
211}
212
213impl<B: AsRef<[u8]> + ?Sized> PartialEq<B> for Escape<'_> {
214    /// Compares the escaped output with any byte-slice-like object.
215    ///
216    /// This is primarily a convenience for testing, allowing you to check the
217    /// fully concatenated result of an `Escape` iterator against a known `&str` or `&[u8]`.
218    ///
219    /// The notion of equality is based on the **output**, not the iterator's internal state.
220    ///
221    /// # Example
222    ///
223    /// ```
224    /// use json_escape::escape_str;
225    ///
226    /// let escaper = escape_str("key\tvalue");
227    ///
228    /// // The escaper's output, when concatenated, equals the right-hand side.
229    /// assert_eq!(escaper, r#"key\tvalue"#);
230    /// ```
231    fn eq(&self, other: &B) -> bool {
232        let mut other = other.as_ref();
233        for chunk in self.clone() {
234            if !other.starts_with(chunk.as_bytes()) {
235                return false;
236            }
237            other = &other[chunk.len()..];
238        }
239        // We completely searched it
240        other.is_empty()
241    }
242}
243
244impl<'a, 'b> PartialEq<Escape<'a>> for Escape<'b> {
245    /// Compares two `Escape` iterators for equality.
246    ///
247    /// Two `Escape` iterators are considered equal if they'll produce the same **output**.
248    /// It first performs a fast check on the underlying byte slices.
249    fn eq(&self, other: &Escape<'a>) -> bool {
250        // Fast path: if they are views into the same underlying data.
251        self.bytes == other.bytes || chunks_eq(self.clone(), other.clone())
252    }
253}
254
#[cfg(feature = "alloc")]
impl<'a> From<Escape<'a>> for Cow<'a, str> {
    /// Collects the escaped parts into a `Cow<'a, str>`, allocating only when
    /// needed.
    ///
    /// - If the iterator produces at most one chunk (in particular, when the
    ///   input requires **no escaping**), the result is `Cow::Borrowed`.
    /// - Otherwise a `String` is allocated and `Cow::Owned` is returned.
    ///
    /// This is more efficient than `iter.collect::<String>()` because `collect`
    /// will always allocate.
    ///
    /// **Requires the `alloc` feature.**
    ///
    /// # Example
    ///
    /// ```
    /// # #[cfg(feature = "alloc")] {
    /// use json_escape::escape_str;
    /// use std::borrow::Cow;
    ///
    /// // No escaping needed, so no allocation occurs.
    /// let cow_borrowed: Cow<str> = escape_str("plain text").into();
    /// assert!(matches!(cow_borrowed, Cow::Borrowed(_)));
    ///
    /// // Escaping is required, so a new String is allocated.
    /// let cow_owned: Cow<str> = escape_str("text with\nnewline").into();
    /// assert!(matches!(cow_owned, Cow::Owned(_)));
    /// assert_eq!(cow_owned, r#"text with\nnewline"#);
    /// # }
    /// ```
    fn from(mut iter: Escape<'a>) -> Self {
        // Zero chunks: the output is the empty string.
        let Some(first) = iter.next() else {
            return Cow::Borrowed("");
        };
        // Exactly one chunk: it can be handed out as-is.
        let Some(second) = iter.next() else {
            return Cow::Borrowed(first);
        };

        // Two or more chunks have to be joined. Reserve room for the chunks
        // already taken plus the input bytes that are still unprocessed.
        let capacity = first.len() + second.len() + iter.bytes.len();
        let mut joined = String::with_capacity(capacity);
        joined.push_str(first);
        joined.push_str(second);
        for chunk in iter {
            joined.push_str(chunk);
        }
        Cow::Owned(joined)
    }
}
303
304// =============================================================================
305// Unescape Implementation
306// =============================================================================
307
308/// Creates a streaming JSON string unescaper from a byte slice.
309///
310/// This function creates an iterator to unescape a byte slice representing the
311/// **raw contents** of a JSON string, assuming the outer quotes have already
312/// been removed.
313///
314/// For a more convenient way to handle complete JSON string literals (including
315/// their surrounding `"` quotes), see the [`unescape_quoted`] function, which
316/// automatically trims them.
317///
318/// The iterator will fail if the input contains invalid JSON escape sequences.
319///
320/// # Example
321///
322/// ```
323/// use json_escape::{unescape, unescape_quoted};
324///
325/// // `unescape` works on the raw content, without quotes.
326/// let content = r#"hello\tworld"#;
327/// assert_eq!(unescape(content), "hello\tworld");
328///
329/// // If you pass a full JSON literal, the quotes are treated as literal characters.
330/// let literal = r#""hello\tworld""#;
331/// assert_eq!(unescape(literal), "\"hello\tworld\""); // Note the quotes in the output.
332///
333/// // For full literals like this, `unescape_quoted` is the recommended function.
334/// assert_eq!(unescape_quoted(literal), "hello\tworld");
335/// ```
336#[inline]
337pub fn unescape<I: AsRef<[u8]> + ?Sized>(input: &I) -> Unescape<'_> {
338    Unescape::new(input.as_ref())
339}
340
341/// Creates a streaming JSON string unescaper, trimming enclosing quotes.
342///
343/// This function acts as a convenience wrapper around [`unescape`]. It first
344/// inspects the input byte slice. If the slice begins and ends with a double-quote
345/// character (`"`), these quotes are trimmed before the inner content is passed to
346/// the unescaper.
347///
348/// If the input is not enclosed in quotes, this function behaves exactly like
349/// [`unescape`]. This is useful for directly unescaping a complete JSON string
350/// literal.
351///
352/// # Example
353///
354/// ```
355/// use json_escape::{unescape, unescape_quoted};
356///
357/// // 1. With quotes: The outer quotes are trimmed before unescaping.
358/// let unescaper = unescape_quoted(r#""hello\nworld""#);
359/// assert_eq!(unescaper, b"hello\nworld");
360///
361/// // 2. Without quotes: Behaves exactly like the standard `unescape`.
362/// let unescaper_no_quotes = unescape_quoted(r#"raw string"#);
363/// assert_eq!(unescaper_no_quotes, b"raw string");
364///
365/// // 3. Mismatched quotes: The input is passed through as-is, quotes are not trimmed.
366/// let mismatched_quotes = unescape_quoted(r#"hello""#);
367/// assert_eq!(mismatched_quotes, b"hello\"");
368///
369/// // 4. Empty quoted string: Correctly results in an empty output.
370/// let empty_quoted = unescape_quoted(r#""""#);
371/// assert_eq!(empty_quoted, b"");
372/// ```
373#[inline]
374pub fn unescape_quoted<I: AsRef<[u8]> + ?Sized>(input: &I) -> Unescape<'_> {
375    let bytes = input.as_ref();
376    let input = if bytes.len() >= 2 && bytes[0] == b'\"' && bytes[bytes.len() - 1] == b'\"' {
377        &bytes[1..bytes.len() - 1]
378    } else {
379        bytes
380    };
381
382    unescape(input)
383}
384
385/// A streaming JSON string unescaper.
386///
387/// This struct is created by the [`unescape`] function. It implements an [`Iterator`]
388/// that yields `Result<&'a [u8], UnescapeError>`, lazily decoding the input.
389///
390/// The iterator's output chunks are one of the following:
391/// - **`Ok(&'a [u8])`**: A borrowed slice of the original input for a sequence of non-escaped bytes.
392/// - **`Ok(&'static [u8])`**: A single-byte slice for a decoded escape sequence (e.g., `\n` becomes a slice containing `0x0A`).
393///   For `\uXXXX` sequences, it yields a series of single-byte slices representing the UTF-8 encoding of the character.
394/// - **`Err(UnescapeError)`**: An error indicating an invalid escape sequence, which halts further iteration as described below.
395///
396/// Because the iterator operates on bytes, you can use helper methods like
397/// [`Unescape::decode_utf8`] or [`Unescape::decode_utf8_lossy`] to convert the
398/// final result into a string.
399///
400/// # Error Handling
401///
402/// When the iterator encounters an invalid or incomplete escape, it returns an
403/// `Err(UnescapeError)` describing the problem. The iterator then remains in an
404/// **error state**: subsequent calls to `next()` will continue to return that same
405/// error (i.e., the error is idempotent) and the iterator will not produce further
406/// `Ok` chunks. This makes the behavior deterministic for callers that check the
407/// first error and then stop.
408///
409/// Errors are classified by the precise condition encountered:
410/// - **`InvalidEscape`**: The escape sequence uses an unknown escape character (e.g., `\q`).
411/// - **`InvalidHex`**: A `\u` escape contains a non-hex character where a hex
412///   digit was expected (e.g., `\uZ`).
413/// - **`UnexpectedEof`**: The input ended before a complete escape sequence could be
414///   read. This is used when there isn't enough input yet to decide whether the
415///   sequence would be valid (for instance, an incomplete `\u` or a truncated
416///   surrogate pair).
417/// - **`LoneSurrogate`**: A complete `\uXXXX` was read, and it encodes a *high*
418///   surrogate, but the following bytes definitively do not form a valid low
419///   surrogate escape (for example, the next character is a space or any
420///   non-`\u` character).
421///
422/// The difference between `UnexpectedEof` and `LoneSurrogate` is important:
423/// - `UnexpectedEof` means **we couldn't decide** because the input ended too early.
424/// - `LoneSurrogate` means **we did decide**—we saw a full `\uXXXX` high surrogate,
425///   and the following input proves a pair will not follow.
426///
427/// #### Concrete examples
428///
429/// 1) A high surrogate followed by other data (not a `\u` low-surrogate) → `LoneSurrogate`:
430///
431/// ```rust
432/// use json_escape::{unescape, UnescapeErrorKind, LoneSurrogateError};
433///
434/// let mut iter = unescape(r"\uD83D more data");
435/// let err = iter.next().unwrap().unwrap_err();
436/// assert!(matches!(err.kind(), UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: 0xD83D, .. })));
437///
438/// // Subsequent calls return the same error (iterator remains in the same error state).
439/// let err = iter.next().unwrap().unwrap_err();
440/// assert!(matches!(err.kind(), UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: 0xD83D, .. })));
441/// ```
442///
443/// 2) An invalid escape character → `InvalidEscape`:
444///
445/// ```rust
446/// use json_escape::{unescape, UnescapeErrorKind, InvalidEscapeError};
447///
448/// let mut iter = unescape(r"\q"); // `\q` is not a defined escape
449/// let err = iter.next().unwrap().unwrap_err();
450/// assert!(matches!(err.kind(), UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'q', .. })));
451/// ```
452///
453/// 3) A malformed `\u` with a non-hex character → `InvalidHex`:
454///
455/// ```rust
456/// use json_escape::{unescape, UnescapeErrorKind, InvalidHexError};
457///
458/// let mut iter = unescape(r"\uZ");
459/// let err = iter.next().unwrap().unwrap_err();
460/// assert!(matches!(err.kind(), UnescapeErrorKind::InvalidHex(InvalidHexError { found: b'Z', .. })));
461/// ```
462///
463/// 4) Truncated / incomplete input ⇒ `UnexpectedEof`:
464///
465/// ```rust
466/// use json_escape::{unescape, UnescapeErrorKind};
467///
468/// // a) truncated after the first \uXXXX (no following bytes yet)
469/// let mut iter = unescape(r"\uD83D");
470/// let err = iter.next().unwrap().unwrap_err();
471/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
472///
473/// // b) starts a second \u but is truncated before hex digits
474/// let mut iter = unescape(r"\uD83D\u");
475/// let err = iter.next().unwrap().unwrap_err();
476/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
477///
478/// // c) a lone backslash at end of input
479/// let mut iter = unescape("\\");
480/// let err = iter.next().unwrap().unwrap_err();
481/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
482/// ```
483///
484/// **Note**: This behavior intentionally mirrors common JSON parsers (e.g.,
485/// `serde_json`, Go's `encoding/json`) for the EOF vs. semantic error distinction.
486///
487/// # Implemented Traits and Usage
488///
489/// - **`Iterator<Item = Result<&'a [u8], UnescapeError>>`**: The core trait for
490///   processing the unescaped byte chunks.
491/// - **`std::io::Read`** (requires `std` feature): Lets you use the unescaper as a
492///   standard reader, perfect for integrating with other I/O APIs.
493/// - **`TryFrom<Unescape<'a>> for Cow<'a, [u8]>`** (requires `alloc` feature): An
494///   efficient way to collect the unescaped bytes, propagating any errors.
495/// - **`Clone`**, **`Debug`**: Standard utility traits.
496/// - **`PartialEq<B: AsRef<[u8]>>`**: Compares the fully unescaped output with a byte slice.
497///
498/// ## Reading Unescaped Bytes
499///
500/// With the `std` feature, `Unescape` can be used as any other `std::io::Read`
501/// source. This is ideal for streaming and decoding large JSON string contents
502/// without buffering the entire result in memory first.
503///
504/// ```rust
505/// # #[cfg(feature = "std")] {
506/// use json_escape::unescape;
507/// use std::io::Read;
508///
509/// let mut reader = unescape(r#"chunk1\nchunk2"#);
510/// let mut buf = Vec::new();
511///
512/// // Read all unescaped bytes from the iterator into the buffer.
513/// reader.read_to_end(&mut buf).unwrap();
514///
515/// assert_eq!(buf, b"chunk1\nchunk2");
516/// # }
517/// ```
#[derive(Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct Unescape<'a> {
    // Remaining input. A `slice::Iter` is used because it is cheap to clone,
    // which allows looking ahead without manual position bookkeeping.
    pub(crate) bytes: slice::Iter<'a, u8>,

    // Scratch space holding the UTF-8 encoding of the most recently decoded
    // `\uXXXX` escape (or surrogate pair).
    unicode: [u8; 4],
    // Number of valid bytes in `unicode`; 0 means nothing is pending.
    // (Original note: this field could be eliminated by depending on the
    // header.)
    unicode_len: u8,
    // Number of bytes of `unicode` that have already been emitted.
    unicode_pos: u8,
}
531
532impl<'a> Unescape<'a> {
533    /// Construct from a byte slice which contains the characters inside the JSON string (no quotes).
534    fn new(input: &'a [u8]) -> Self {
535        Self {
536            bytes: input.iter(),
537            unicode: [0; 4],
538            unicode_len: 0,
539            unicode_pos: 0,
540        }
541    }
542
543    /// Helper: parse exactly 4 hex digits from `it`. Returns Ok(u16) or an error.
544    #[inline(always)]
545    fn parse_hex4(iter: &mut slice::Iter<'a, u8>, base_offset: u8) -> Result<u16, UnescapeError> {
546        let mut acc = 0u16;
547        for i in 0..4 {
548            let b = match iter.next() {
549                Some(b) => *b,
550                None => {
551                    return Err(UnescapeError {
552                        kind: UnescapeErrorKind::UnexpectedEof,
553                        // The error occurs where the next digit was expected.
554                        offset: base_offset + i,
555                    });
556                }
557            };
558            let v = match b {
559                b'0'..=b'9' => (b - b'0') as u16,
560                b'a'..=b'f' => (b - b'a' + 10) as u16,
561                b'A'..=b'F' => (b - b'A' + 10) as u16,
562                _ => {
563                    return Err(UnescapeError {
564                        kind: UnescapeErrorKind::InvalidHex(InvalidHexError { found: b }),
565                        // The error is the invalid digit itself.
566                        offset: base_offset + i,
567                    });
568                }
569            };
570            acc = (acc << 4) | v;
571        }
572        Ok(acc)
573    }
574
575    /// Parses a unicode escape sequence `\uXXXX` which may be a surrogate pair.
576    /// The iterator `bytes` must be positioned *after* the `\u`.
577    ///
578    /// NOTE: Doesn't preserve the state of the iterator on error
579    #[inline(always)]
580    fn handle_unicode_escape(bytes: &mut slice::Iter<'a, u8>) -> Result<char, UnescapeError> {
581        // Parse first 4 hex digits (\uXXXX)
582        //
583        // The iterator starts *after* '\u'. The first hex digit is at offset 2 from '\'.
584        let first = Self::parse_hex4(bytes, 2)?;
585
586        // High surrogate → must be followed by another \uXXXX low surrogate
587        if (0xD800..=0xDBFF).contains(&first) {
588            match (bytes.next(), bytes.next()) {
589                (Some(b'\\'), Some(b'u')) => {
590                    // Try parsing the low surrogate
591                    //
592                    // The first hex digit of the second escape is at offset 8.
593                    // (\uXXXX\u -> 8 chars)
594                    match Self::parse_hex4(bytes, 8) {
595                        Ok(low) if (0xDC00..=0xDFFF).contains(&low) => {
596                            let high_t = first as u32;
597                            let low_t = low as u32;
598                            let code = 0x10000 + (((high_t - 0xD800) << 10) | (low_t - 0xDC00));
599                            return Ok(char::from_u32(code).expect(
600                                "valid surrogate pair math should always produce a valid char",
601                            ));
602                        }
603                        Ok(_) => {
604                            // Got a full escape but not a low surrogate → Lone surrogate
605                            return Err(UnescapeError {
606                                kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError {
607                                    surrogate: first,
608                                }),
609                                offset: 6,
610                            });
611                        }
612                        Err(err) => {
613                            // parse_hex4 failed (e.g. ran out of hex digits)
614                            return Err(err);
615                        }
616                    }
617                }
618                // EOF before even seeing '\' or 'u' → UnexpectedEof
619                (None, _) | (_, None) => {
620                    return Err(UnescapeError {
621                        kind: UnescapeErrorKind::UnexpectedEof,
622                        offset: 6,
623                    });
624                }
625                // Something else after high surrogate → LoneSurrogate
626                _ => {
627                    return Err(UnescapeError {
628                        kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError {
629                            surrogate: first,
630                        }),
631                        // The error is detected after consuming `\uXXXX` (6 bytes).
632                        offset: 6,
633                    });
634                }
635            }
636        }
637
638        // Not a surrogate → normal path
639        match char::from_u32(first as u32) {
640            Some(c) => Ok(c),
641            None => Err(UnescapeError {
642                kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: first }),
643                // The error is detected after consuming `\uXXXX` (6 bytes).
644                offset: 6,
645            }),
646        }
647    }
648
649    #[inline]
650    fn store_unicode(&mut self, ch: char) {
651        self.unicode_len = ch.encode_utf8(&mut self.unicode).len() as u8;
652        self.unicode_pos = 0;
653    }
654
655    #[inline]
656    fn emit_pending_byte(&mut self) -> Option<u8> {
657        if self.unicode_pos < self.unicode_len {
658            let b = self.unicode[self.unicode_pos as usize];
659            self.unicode_pos += 1;
660            Some(b)
661        } else {
662            None
663        }
664    }
665
666    /// Helper to emit the full unicode sequence and advance the internal position.
667    #[inline]
668    fn emit_unicode_as_str(&mut self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
669        // The check `unicode_pos > 0` is implicit from the call site.
670        // The buffer is guaranteed to contain a valid UTF-8 sequence.
671        let s = unsafe { str::from_utf8_unchecked(&self.unicode[..self.unicode_len as usize]) };
672        f.write_str(s)?;
673
674        // Mark the entire sequence as emitted.
675        self.unicode_pos = self.unicode_len;
676
677        Ok(())
678    }
679
680    /// The single, authoritative helper for producing unescaped byte chunks.
681    ///
682    /// It takes an optional `max` length to limit the size of the returned slice,
683    /// which is essential for the `std::io::Read` implementation.
684    #[inline(always)]
685    fn next_limit(&mut self, limit: Option<usize>) -> Option<Result<&'a [u8], UnescapeError>> {
686        if limit.is_some_and(|l| l == 0) {
687            return Some(Ok(&[]));
688        }
689
690        // If we have pending bytes, emit them first (fast).
691        //
692        // LIMIT: We're allowed not checking here since we'll only produce 1 byte
693        // and limit is at least 1.
694        if let Some(s) = self.emit_pending_byte() {
695            // s: &'static [u8] coerces to &'a [u8]
696            return Some(Ok(byte_as_static_slice(s)));
697        }
698
699        let bytes = self.bytes.as_slice();
700        if bytes.is_empty() {
701            return None;
702        }
703
704        // Find next backslash in the remaining bytes.
705        let pos = memchr(b'\\', bytes);
706
707        match pos {
708            None => {
709                // No more escapes. Return the rest of the slice as a borrowed chunk.
710                let chunk_len = bytes.len().min(limit.unwrap_or(bytes.len()));
711                let (chunk, rest) = bytes.split_at(chunk_len);
712                self.bytes = rest.iter();
713                Some(Ok(chunk))
714            }
715            // LIMIT: We're allowed not checking here since we'll only produce 1 byte
716            // and limit is at least 1.
717            Some(0) => {
718                // We need to parse 4 hex digits from the iterator. But because
719                // `bytes` implements `Clone`, we can clone it to peek ahead
720                // in order to preserve the state of the iterator on failure.
721                let mut lookahead = self.bytes.clone();
722                // Backslash is the first byte in the slice: handle escape
723                lookahead.next(); // Consume the backslash
724
725                match lookahead.next() {
726                    Some(b'u') => match Self::handle_unicode_escape(&mut lookahead) {
727                        Ok(ch) => {
728                            self.bytes = lookahead; // commit
729                            self.store_unicode(ch);
730                            self.emit_pending_byte()
731                                .map(|b| Ok(byte_as_static_slice(b)))
732                        }
733                        Err(err) => Some(Err(err)),
734                    },
735                    Some(byte) => {
736                        if let Some(slice) = UNESCAPE_TABLE[*byte as usize] {
737                            self.bytes = lookahead; // commit
738                            Some(Ok(slice))
739                        } else {
740                            Some(Err(UnescapeError {
741                                kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError {
742                                    found: *byte,
743                                }),
744                                // The invalid character is 1 byte after '\'.
745                                offset: 1,
746                            }))
747                        }
748                    }
749                    None => Some(Err(UnescapeError {
750                        kind: UnescapeErrorKind::UnexpectedEof,
751                        // EOF occurred 1 byte after '\'.
752                        offset: 1,
753                    })),
754                }
755            }
756            // Found \ after a safe prefix. Return the prefix. We'll handle on next call to next
757            Some(p) => {
758                // Return the safe prefix (borrowed from input)
759                let chunk_len = p.min(limit.unwrap_or(p));
760                let (chunk, rest) = bytes.split_at(chunk_len);
761                self.bytes = rest.iter();
762                Some(Ok(chunk))
763            }
764        }
765    }
766
767    fn _display_utf8(mut self, f: &mut fmt::Formatter<'_>, lossy: bool) -> fmt::Result {
768        // The key insight: Chunks with more than one byte are *always*
769        // borrowed from the original input, as all escaped characters
770        // are yielded byte-by-byte.
771        while let Some(result) = self.next() {
772            match result {
773                Ok(chunk) => {
774                    if chunk.is_empty() {
775                        continue;
776                    }
777
778                    // THE CORE LOGIC:
779                    // Check if the iterator just yielded the *first byte* of a *multi-byte* sequence.
780                    // - `unicode_pos == 1` means the first byte was just emitted.
781                    // - `unicode_len > 1` means it's a multi-byte char (e.g., '¢', '😎').
782                    if self.unicode_pos == 1 && self.unicode_len > 1 {
783                        // This is our special case. We have the first byte in `chunk`, but
784                        // it's more efficient to write the whole character at once from our buffer.
785                        self.emit_unicode_as_str(f)?;
786                        // The iterator will no longer yield the rest of the bytes. Since our helper
787                        // has now advanced it. But to be sure...
788                        self.unicode_pos = self.unicode_len;
789                    } else {
790                        // This is the normal case:
791                        // 1. A large chunk borrowed from the original input.
792                        // 2. A single-byte escape like `\n` or `\t`.
793                        // 3. The last byte of a multi-byte sequence (or the only byte).
794                        // In all these cases, we just need to display the chunk we received.
795                        display_bytes_uft8(chunk, f, lossy)?;
796                    }
797                }
798                Err(_) => {
799                    if lossy {
800                        break;
801                    } else {
802                        return Err(fmt::Error);
803                    }
804                }
805            }
806        }
807
808        Ok(())
809    }
810
/// Unescapes the whole input and validates the result as UTF-8.
///
/// The iterator is consumed. The first unescaping error is reported as
/// [`DecodeUtf8Error::Unescape`]; if unescaping succeeds but the bytes are not
/// valid UTF-8, [`DecodeUtf8Error::Utf8`] is returned instead.
///
/// Like `From<Escape>`, this returns `Cow::Borrowed` when the input contained no
/// escapes, so no allocation happens in that case.
///
/// **Requires the `alloc` feature.**
///
/// # Example
///
/// ```
/// # #[cfg(feature = "alloc")] {
/// use json_escape::unescape;
///
/// let input = r#"Emoji: \uD83D\uDE00"#;
/// let cow = unescape(input).decode_utf8().unwrap();
///
/// assert_eq!(cow, "Emoji: 😀");
/// # }
/// ```
#[cfg(feature = "alloc")]
pub fn decode_utf8(self) -> Result<Cow<'a, str>, DecodeUtf8Error> {
    let unescaped: Cow<'a, [u8]> = self.try_into().map_err(DecodeUtf8Error::Unescape)?;

    let decoded = match unescaped {
        // Borrowed bytes stay borrowed: validate in place.
        Cow::Borrowed(bytes) => {
            Cow::Borrowed(str::from_utf8(bytes).map_err(DecodeUtf8Error::Utf8)?)
        }
        // Owned bytes are converted without copying the buffer again.
        Cow::Owned(bytes) => Cow::Owned(
            String::from_utf8(bytes).map_err(|e| DecodeUtf8Error::Utf8(e.utf8_error()))?,
        ),
    };

    Ok(decoded)
}
845
/// Unescapes the whole input and converts it to a string lossily.
///
/// Works like [`Unescape::decode_utf8`], except that invalid UTF-8 sequences are
/// replaced with the replacement character (U+FFFD) rather than reported.
///
/// Invalid JSON escaping is still an error: the first `UnescapeError` is returned.
///
/// **Requires the `alloc` feature.**
#[cfg(feature = "alloc")]
pub fn decode_utf8_lossy(self) -> Result<Cow<'a, str>, UnescapeError> {
    // `decode_utf8_lossy` here is the free helper function, not this method.
    self.try_into().map(decode_utf8_lossy)
}
858
859    /// Returns a wrapper that implements [`fmt::Display`].
860    ///
861    /// This allows an `Unescape` iterator to be used directly with formatting
862    /// macros like `println!`, `format!`, etc. It writes the unescaped content
863    /// directly to the formatter's buffer, **avoiding any heap allocations**.
864    ///
865    /// The iterator is consumed, and the resulting unescaped string is written
866    /// to the formatter. Any invalid JSON escape sequences or invalid UTF-8 will
867    /// cause a `fmt::Error`. **You should be cautious when using this method
868    /// with the `format!` macro, as a `fmt::Error` from us will cause the macro
869    /// to panic**.
870    ///
871    /// For a more robust alternative that will not panic on `UnescapeError` or
872    /// invalid bytes, consider using [`Unescape::display_utf8_lossy`] instead.
873    ///
874    /// This method is a **zero-allocation** alternative to [`Unescape::decode_utf8`],
875    /// which might allocate a `String` to return the unescaped content.
876    ///
877    /// # Example
878    ///
879    /// ```
880    /// use json_escape::unescape;
881    ///
882    /// let original = r#"Hello, \uD83C\uDF0E!"#;
883    /// let unescaper = unescape(original);
884    ///
885    /// let formatted = format!("{}", unescaper.display_utf8());
886    /// assert_eq!(formatted, "Hello, 🌎!");
887    /// ```
888    pub fn display_utf8(self) -> DisplayUnescape<'a> {
889        DisplayUnescape { inner: self }
890    }
891
892    /// Returns a wrapper that implements [`fmt::Display`] lossily.
893    ///
894    /// This method is an **allocation-free** way to write unescaped content
895    /// to a formatter. It handles invalid JSON escape sequences and invalid
896    /// UTF-8 gracefully, making it a "lossy" operation.
897    ///
898    /// - **Invalid JSON escape sequences:** Instead of causing an error, the iterator
899    ///   terminates without an error.
900    /// - **Invalid UTF-8 bytes:** These are replaced with the Unicode
901    ///   replacement character (U+FFFD).
902    ///
903    /// This method is the **zero-allocation** counterpart to [`Unescape::decode_utf8_lossy`].
904    pub fn display_utf8_lossy(self) -> DisplayUnescapeLossy<'a> {
905        DisplayUnescapeLossy { inner: self }
906    }
907}
908
909impl<'a> Iterator for Unescape<'a> {
910    type Item = Result<&'a [u8], UnescapeError>;
911
912    fn next(&mut self) -> Option<Self::Item> {
913        self.next_limit(None)
914    }
915
916    fn size_hint(&self) -> (usize, Option<usize>) {
917        // The minimum size is 0 (if the rest of the string is an invalid escape).
918        // The maximum size is the remaining length of the underlying bytes + pending_unicode
919        let (lower, upper) = self.bytes.size_hint();
920        let upper = upper.map(|x| x + (self.unicode_len as usize));
921        // Worst-case is \uXXXX -> 1 byte, so 6 -> 1.
922        (lower.saturating_add(1) / 6, upper)
923    }
924}
925
926impl<'a> FusedIterator for Unescape<'a> {}
927
/// Exposes the unescaped byte stream as a reader.
///
/// Unescape errors are reported as `std::io::ErrorKind::InvalidData`.
#[cfg(feature = "std")]
impl std::io::Read for Unescape<'_> {
    /// Fills `buf` with as many unescaped bytes as are available.
    ///
    /// Returns `Ok(0)` when `buf` is empty or the input is exhausted.
    ///
    /// # Errors
    ///
    /// An invalid escape yields `InvalidData`, but only when no byte was copied
    /// during this call. `Read::read` requires that an `Err` means nothing was
    /// read, so if some bytes were already copied their count is returned and the
    /// error is reported by the following call. That is safe because `next_limit`
    /// does not advance past a malformed escape.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let mut written = 0;

        // Read until `buf` is full, the iterator is drained, or an error shows up.
        while written < buf.len() {
            match self.next_limit(Some(buf.len() - written)) {
                Some(Ok(chunk)) => {
                    // `next_limit` guarantees `chunk.len() <= buf.len() - written`.
                    let end = written + chunk.len();
                    buf[written..end].copy_from_slice(chunk);
                    written = end;
                }
                // Progress was made: report it now, the error resurfaces next call.
                Some(Err(_)) if written > 0 => break,
                Some(Err(err)) => {
                    return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, err));
                }
                // The iterator is drained.
                None => break,
            }
        }

        Ok(written)
    }

    /// Appends all remaining unescaped bytes to `buf` and returns how many were added.
    ///
    /// This bypasses the generic `read` loop and extends `buf` chunk by chunk.
    /// Bytes appended before an error stay in `buf`.
    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> std::io::Result<usize> {
        let start_len = buf.len();

        for result in self {
            let chunk = result
                .map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))?;
            buf.extend_from_slice(chunk);
        }

        Ok(buf.len() - start_len)
    }
}
973
974impl fmt::Debug for Unescape<'_> {
975    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
976        f.debug_struct("Unescape").finish_non_exhaustive()
977    }
978}
979
980impl<B: AsRef<[u8]> + ?Sized> PartialEq<B> for Unescape<'_> {
981    /// Compares the unescaped output with a byte-slice-like object.
982    ///
983    /// An `Unescape` iterator is considered equal to a byte slice if it successfully
984    /// unescapes to produce a sequence of bytes identical to that slice. If the
985    /// iterator would produce an error, the comparison returns `false`.
986    ///
987    /// # Example
988    ///
989    /// ```
990    /// use json_escape::unescape;
991    ///
992    /// let unescaper = unescape(r#"hello\nworld"#);
993    /// assert_eq!(unescaper, b"hello\nworld");
994    ///
995    /// // An iterator that produces an error is not equal to any valid slice.
996    /// let failing_unescaper = unescape(r#"\k"#);
997    /// assert_ne!(failing_unescaper, b"k");
998    /// ```
999    fn eq(&self, other: &B) -> bool {
1000        let mut other = other.as_ref();
1001        for result in self.clone() {
1002            match result {
1003                Ok(chunk) => {
1004                    if !other.starts_with(chunk) {
1005                        return false;
1006                    }
1007                    other = &other[chunk.len()..];
1008                }
1009                Err(_) => return false, // An erroring iterator cannot be equal to a valid slice.
1010            }
1011        }
1012        other.is_empty()
1013    }
1014}
1015
1016impl<B: AsRef<[u8]>> PartialEq<Unescape<'_>> for Result<B, UnescapeError> {
1017    /// Compares the unescaper's outcome with a `Result`.
1018    ///
1019    /// This implementation allows for precise testing of the `Unescape` iterator
1020    /// by comparing it against either a successful outcome (`Ok`) or a specific
1021    /// failure (`Err`).
1022    ///
1023    /// - If `result` is `Ok(bytes)`, the comparison is `true` only if the iterator
1024    ///   completes successfully and its concatenated output is identical to `bytes`.
1025    ///
1026    /// - If `result` is `Err(error)`, the comparison is `true` only if the iterator
1027    ///   produces the exact same `UnescapeError`.
1028    ///
1029    /// # Example
1030    ///
1031    /// ```
1032    /// use json_escape::{unescape, UnescapeError, InvalidEscapeError};
1033    ///
1034    /// // --- Success Case ---
1035    /// let unescaper = unescape(r#"hello\tworld"#);
1036    /// // The comparison is against an `Ok` variant.
1037    /// assert_eq!(Ok("hello\tworld"), unescaper);
1038    ///
1039    /// // --- Error Case ---
1040    /// let failing_unescaper = unescape(r#"invalid-\u"#);
1041    /// // We can assert that the iterator produces a specific error.
1042    /// # let unexpected_eof = unescape(r"\u").next().unwrap().unwrap_err();
1043    /// assert_eq!(Err::<&str, _>(unexpected_eof), failing_unescaper);
1044    /// ```
1045    fn eq(&self, unescape: &Unescape<'_>) -> bool {
1046        match self {
1047            Ok(expected_bytes) => unescape == expected_bytes,
1048            Err(expected_error) => {
1049                for result in unescape.clone() {
1050                    if let Err(actual_error) = result {
1051                        // The iterator's first error is its final outcome.
1052                        // It must match the expected error exactly.
1053                        return actual_error == *expected_error;
1054                    }
1055                }
1056                // `unescape` completed successfully, but an error was expected.
1057                false
1058            }
1059        }
1060    }
1061}
1062
1063impl<'a, 'b> PartialEq<Unescape<'a>> for Unescape<'b> {
1064    /// Compares two `Unescape` iterators for equality based on their terminal result.
1065    ///
1066    /// The equality of two `Unescape` iterators is determined by the final `Result`
1067    /// that would be obtained if each iterator were fully consumed (e.g., by using `try_collect()`).
1068    ///
1069    /// The specific rules are as follows:
1070    ///
1071    /// 1.  **Error vs. Error**: If both iterators terminate with an `Err`, they are
1072    ///     considered **equal** if and only if their `UnescapeError`s are identical.
1073    ///     Any bytes successfully unescaped *before* the error are ignored in this case.
1074    /// 2.  **Success vs. Success**: If both iterators terminate with `Ok`, they are
1075    ///     considered **equal** if and only if the complete sequence of unescaped bytes
1076    ///     is identical for both.
1077    /// 3.  **Success vs. Error**: If one iterator terminates with `Ok` and the other
1078    ///     with `Err`, they are always **not equal**.
1079    ///
1080    /// # Example
1081    ///
1082    /// ```
1083    /// use json_escape::unescape;
1084    ///
1085    /// // Case 1: Both iterators produce the same error. They are equal,
1086    /// // even though their valid prefixes ("a" and "b") are different.
1087    /// let failing_a = unescape(r#"a\k"#);
1088    /// let failing_b = unescape(r#"b\k"#);
1089    /// assert_eq!(failing_a, failing_b);
1090    ///
1091    /// // Case 2: Both iterators succeed. Equality depends on the byte stream.
1092    /// let successful_a = unescape(r#"hello\nworld"#);
1093    /// let successful_b = unescape(r#"hello\nworld"#);
1094    /// assert_eq!(successful_a, successful_b);
1095    ///
1096    /// let successful_c = unescape(r#"different"#);
1097    /// assert_ne!(successful_a, successful_c);
1098    ///
1099    /// // Case 3: One succeeds and one fails. They are not equal.
1100    /// let succeeding = unescape(r#"stop"#);
1101    /// let failing = unescape(r#"stop\k"#);
1102    /// assert_ne!(succeeding, failing);
1103    ///
1104    /// // Case 4: Both iterators fail differently. They are not equal.
1105    /// let failing_a = unescape(r#"data:\k"#);
1106    /// let failing_b = unescape(r#"data:\"#);
1107    /// assert_ne!(failing_a, failing_b);
1108    /// ```
1109    fn eq(&self, other: &Unescape<'a>) -> bool {
1110        // Fast path: if they are views into the same underlying data with the same state.
1111        ((self.bytes.as_ref() == other.bytes.as_ref())
1112            && (self.unicode == other.unicode)
1113            && (self.unicode_len == other.unicode_len)
1114            && (self.unicode_pos == other.unicode_pos))
1115            || {
1116                let mut a_error = None;
1117                let mut b_error = None;
1118
1119                let mut a = self.clone().map_while(|result| match result {
1120                    Ok(ok) => Some(ok),
1121                    Err(err) => {
1122                        a_error = Some(err);
1123                        None
1124                    }
1125                });
1126
1127                let mut b = other.clone().map_while(|result| match result {
1128                    Ok(ok) => Some(ok),
1129                    Err(err) => {
1130                        b_error = Some(err);
1131                        None
1132                    }
1133                });
1134
1135                let streams_match = chunks_eq(&mut a, &mut b);
1136
1137                // Drain the iterators to ensure the error state is captured,
1138                // especially if chunks_eq returned false early.
1139                // (e.g unescape("a\k") and unescape("b\k") which are actually
1140                // equal)
1141                a.for_each(|_| {});
1142                b.for_each(|_| {});
1143
1144                match (a_error, b_error) {
1145                    // Both errored: equality depends only on the errors being the same.
1146                    (Some(a_err), Some(b_err)) => a_err == b_err,
1147                    // Both succeeded: equality depends on the byte streams having been identical.
1148                    (None, None) => streams_match,
1149                    // One errored and the other didn't: they are not equal.
1150                    _ => false,
1151                }
1152            }
1153    }
1154}
1155
#[cfg(feature = "alloc")]
impl<'a> TryFrom<Unescape<'a>> for Cow<'a, [u8]> {
    type Error = UnescapeError;

    /// Collects the unescaped bytes into a `Cow<'a, [u8]>`.
    ///
    /// If the iterator yields at most one chunk the result is `Cow::Borrowed`
    /// (this is the case for input without escape sequences), so nothing is
    /// allocated. As soon as a second chunk exists, everything is gathered into a
    /// `Cow::Owned` buffer.
    ///
    /// The first `UnescapeError` encountered stops the conversion and is returned.
    ///
    /// **Requires the `alloc` feature.**
    fn try_from(mut value: Unescape<'a>) -> Result<Self, Self::Error> {
        let Some(first) = value.next().transpose()? else {
            return Ok(Cow::Borrowed(b""));
        };
        let Some(second) = value.next().transpose()? else {
            return Ok(Cow::Borrowed(first));
        };

        // Size the buffer for the two chunks in hand plus the input bytes still
        // to be processed.
        let mut buf = Vec::with_capacity(first.len() + second.len() + value.bytes.len());
        buf.extend_from_slice(first);
        buf.extend_from_slice(second);
        for item in value {
            buf.extend_from_slice(item?);
        }

        Ok(Cow::Owned(buf))
    }
}
1190
1191// =============================================================================
1192// DisplayUnescape Implementation
1193// =============================================================================
1194
1195/// A wrapper for an [`Unescape`] iterator that implements [`fmt::Display`].
1196///
1197/// This struct is created by the [`Unescape::display_utf8()`] method. It allows for
1198/// printing the unescaped content directly to a formatter, which **avoids
1199/// any heap allocations**. The unescaping and UTF-8 decoding are performed on-the-fly as the
1200/// `fmt` method is called.
1201pub struct DisplayUnescape<'a> {
1202    inner: Unescape<'a>,
1203}
1204
1205impl fmt::Display for DisplayUnescape<'_> {
1206    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1207        self.inner.clone()._display_utf8(f, false)
1208    }
1209}
1210
1211/// A wrapper for an [`Unescape`] iterator that implements [`fmt::Display`] lossily.
1212///
1213/// This struct is created by the [`Unescape::display_utf8_lossy()`] method. Like
1214/// `DisplayUnescape`, it performs its operation **without any heap allocations**.
1215///
1216/// This method differs from `display_utf8` in that it handles two types of
1217/// errors gracefully:
1218/// - Invalid JSON escape sequences will be ignored, and the iterator will
1219///   continue to completion without a `fmt::Error`.
1220/// - Invalid UTF-8 byte sequences will be replaced with the Unicode
1221///   replacement character (``, U+FFFD)
1222pub struct DisplayUnescapeLossy<'a> {
1223    inner: Unescape<'a>,
1224}
1225
1226impl fmt::Display for DisplayUnescapeLossy<'_> {
1227    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1228        // Lossy mode: replace invalid sequences with U+FFFD and continue.
1229        self.inner.clone()._display_utf8(f, true)
1230    }
1231}
1232
1233// =============================================================================
1234// Error Types
1235// =============================================================================
1236
1237/// An error that can occur when decoding the final byte stream to a UTF-8 string.
1238#[derive(Copy, Eq, PartialEq, Clone, Debug)]
1239pub enum DecodeUtf8Error {
1240    /// The unescaped byte sequence was not valid UTF-8.
1241    Utf8(str::Utf8Error),
1242    /// An error occurred during the JSON unescaping process itself.
1243    Unescape(UnescapeError),
1244}
1245
1246impl fmt::Display for DecodeUtf8Error {
1247    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1248        match self {
1249            DecodeUtf8Error::Utf8(e) => fmt::Display::fmt(e, f),
1250            DecodeUtf8Error::Unescape(e) => fmt::Display::fmt(e, f),
1251        }
1252    }
1253}
1254
/// Payload of an "invalid escape" error: which byte followed the backslash.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub struct InvalidEscapeError {
    /// The byte found right after a `\` that does not start a valid escape.
    pub found: u8,
}
1262
/// Payload of a "lone surrogate" error: the unpaired UTF-16 surrogate.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub struct LoneSurrogateError {
    /// The 16-bit surrogate code point that had no matching partner.
    pub surrogate: u16,
}
1270
/// Payload of an "invalid hex digit" error inside a `\uXXXX` sequence.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub struct InvalidHexError {
    /// The byte that was found where a hex digit was expected.
    pub found: u8,
}
1278
1279impl fmt::Display for InvalidHexError {
1280    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1281        write!(f, "found invalid hex digit '0x{:02X}'", self.found)
1282    }
1283}
1284
1285/// An error that can occur during the JSON string unescaping process.
1286#[derive(Copy, Eq, PartialEq, Clone, Debug)]
1287pub struct UnescapeError {
1288    /// The specific kind of unescaping error.
1289    pub(crate) kind: UnescapeErrorKind,
1290    /// The byte offset from the start of the escape sequence (`\`) where the
1291    /// error was detected.
1292    ///
1293    /// This is guaranteed to be less than 12, as the maximum escape sequence
1294    /// is `\uXXXX\uXXXX`.
1295    pub(crate) offset: u8,
1296}
1297
1298impl UnescapeError {
1299    /// Returns the specific kind of error that occurred.
1300    ///
1301    /// This can be used to programmatically handle different error types,
1302    /// such as distinguishing between a malformed hex sequence and an
1303    /// invalid escape character.
1304    ///
1305    /// ### Example
1306    ///
1307    /// ```
1308    /// # use json_escape::{unescape, UnescapeErrorKind, InvalidHexError};
1309    /// let mut unescaper = unescape(r#"\u123Z"#);
1310    /// let err = unescaper.next().unwrap().unwrap_err();
1311    ///
1312    /// match err.kind() {
1313    ///     UnescapeErrorKind::InvalidHex(InvalidHexError { found, .. }) => {
1314    ///         // We can inspect the exact invalid character found.
1315    ///         assert_eq!(found, b'Z');
1316    ///     }
1317    ///     _ => panic!("Expected an InvalidHex error"),
1318    /// }
1319    /// ```
1320    pub fn kind(&self) -> UnescapeErrorKind {
1321        self.kind
1322    }
1323
1324    /// Returns the byte offset from the start of the escape sequence (`\`)
1325    /// where the error was detected.
1326    ///
1327    /// - For `\x`, the offset is `1` (pointing to `x`).
1328    /// - For `\u123?`, the offset is `5` (pointing to `?`).
1329    /// - For a lone surrogate `\uD800`, the offset is `6` (pointing after the sequence).
1330    ///
1331    /// This is useful for providing detailed error messages that can point
1332    /// to the exact location of the problem in the source string.
1333    ///
1334    /// ### Example
1335    ///
1336    /// ```
1337    /// # use json_escape::unescape;
1338    /// let json_string_content = r#"bad escape \x here"#;
1339    /// let mut unescaper = unescape(json_string_content);
1340    ///
1341    /// // read off 'bad escape '
1342    /// let first = unescaper.next().unwrap().unwrap();
1343    /// assert_eq!(first, b"bad escape ");
1344    ///
1345    /// let err = unescaper.next().unwrap().unwrap_err();
1346    ///
1347    /// // The error occurred at the 'x', which is 1 byte after the '\'
1348    /// assert_eq!(err.offset(), 1);
1349    ///
1350    /// // You could use this to highlight the error in the original input
1351    /// let backslash_pos = json_string_content.find('\\').unwrap();
1352    /// let error_pos = backslash_pos + err.offset() as usize;
1353    /// assert_eq!(json_string_content.as_bytes()[error_pos], b'x');
1354    ///
1355    /// // The generated error message also includes this info.
1356    /// let expected_msg = "invalid escape: '\\0x78' at offset 1";
1357    /// assert_eq!(err.to_string(), expected_msg);
1358    /// ```
1359    pub fn offset(&self) -> u8 {
1360        self.offset
1361    }
1362}
1363
1364/// The specific kind of error that can occur during JSON string unescaping.
1365///
1366/// This enum covers all possible failures described by the JSON standard for string contents.
1367#[derive(Copy, Eq, PartialEq, Clone, Debug)]
1368#[non_exhaustive]
1369pub enum UnescapeErrorKind {
1370    /// Found a backslash followed by an unexpected character (e.g., `\x`).
1371    InvalidEscape(InvalidEscapeError),
1372    /// Found `\u` but the following characters were not 4 valid hex digits.
1373    InvalidHex(InvalidHexError),
1374    /// Input ended unexpectedly while parsing an escape sequence (e.g., `\u12`).
1375    UnexpectedEof,
1376    /// The `\u` sequence yielded a lone high or low surrogate without a matching pair.
1377    LoneSurrogate(LoneSurrogateError),
1378}
1379
1380impl fmt::Display for UnescapeError {
1381    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1382        match self.kind {
1383            UnescapeErrorKind::InvalidEscape(e) => {
1384                write!(
1385                    f,
1386                    "invalid escape: '\\0x{:02X}' at offset {}",
1387                    e.found, self.offset
1388                )
1389            }
1390            UnescapeErrorKind::InvalidHex(ref s) => {
1391                write!(f, "{} at offset {}", s, self.offset)
1392            }
1393            UnescapeErrorKind::UnexpectedEof => {
1394                write!(
1395                    f,
1396                    "unexpected end of input while parsing escape sequence, expected character at offset {}",
1397                    self.offset
1398                )
1399            }
1400            UnescapeErrorKind::LoneSurrogate(e) => write!(
1401                f,
1402                "invalid unicode sequence: lone surrogate found: 0x{:04X} at offset {}",
1403                e.surrogate, self.offset
1404            ),
1405        }
1406    }
1407}
1408
1409impl core::error::Error for UnescapeError {}
1410impl core::error::Error for DecodeUtf8Error {
1411    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
1412        match self {
1413            DecodeUtf8Error::Utf8(e) => Some(e),
1414            DecodeUtf8Error::Unescape(e) => Some(e),
1415        }
1416    }
1417}
1418
1419// =============================================================================
1420// Utilities
1421// =============================================================================
1422
// Compile-time lookup table for JSON escaping, indexed by byte value.
// `Some(s)` is the escaped `&'static str` to emit for that byte;
// `None` means the byte can be copied through unescaped.
const ESCAPE_TABLE: [Option<&'static str>; 256] = {
    // Escapes for the control characters 0x00..=0x1F, in byte order. The five
    // with a short form use it; every other one is spelled out as `\u00XX`.
    const CONTROL_ESCAPES: [&str; 0x20] = [
        r#"\u0000"#,
        r#"\u0001"#,
        r#"\u0002"#,
        r#"\u0003"#,
        r#"\u0004"#,
        r#"\u0005"#,
        r#"\u0006"#,
        r#"\u0007"#,
        r#"\b"#, // 0x08 Backspace
        r#"\t"#, // 0x09 Tab
        r#"\n"#, // 0x0A Line Feed
        r#"\u000b"#,
        r#"\f"#, // 0x0C Form Feed
        r#"\r"#, // 0x0D Carriage Return
        r#"\u000e"#,
        r#"\u000f"#,
        r#"\u0010"#,
        r#"\u0011"#,
        r#"\u0012"#,
        r#"\u0013"#,
        r#"\u0014"#,
        r#"\u0015"#,
        r#"\u0016"#,
        r#"\u0017"#,
        r#"\u0018"#,
        r#"\u0019"#,
        r#"\u001a"#,
        r#"\u001b"#,
        r#"\u001c"#,
        r#"\u001d"#,
        r#"\u001e"#,
        r#"\u001f"#,
    ];

    let mut table: [Option<&'static str>; 256] = [None; 256];

    // Control characters.
    let mut byte = 0;
    while byte < CONTROL_ESCAPES.len() {
        table[byte] = Some(CONTROL_ESCAPES[byte]);
        byte += 1;
    }

    // The two printable characters that must be escaped.
    table[b'"' as usize] = Some(r#"\""#);
    table[b'\\' as usize] = Some(r#"\\"#);

    table
};
1473
1474// A simple boolean-like lookup table for SIMD.
1475// 0 = no escape needed, 1 = escape needed.
1476// This is very compact (256 bytes) and fits easily in the L1 cache.
1477#[allow(unused)]
1478const ESCAPE_DECISION_TABLE: [u8; 256] = {
1479    let mut table = [0u8; 256];
1480    let mut i = 0;
1481    while i < 256 {
1482        if ESCAPE_TABLE[i].is_some() {
1483            table[i] = 1;
1484        }
1485        i += 1;
1486    }
1487    table
1488};
1489
// Portable-SIMD scanner, compiled only when the "simd" feature is enabled on a
// nightly build.
//
// Returns the index of the first byte of `bytes` that needs escaping (a control
// byte `<= 0x1F`, `"` or `\`), or `None` when no byte does.
#[cfg(all(feature = "simd", nightly))]
#[inline]
fn find_escape_char(bytes: &[u8]) -> Option<usize> {
    use std::simd::{Simd, prelude::SimdPartialEq, prelude::SimdPartialOrd};

    // 16 bytes per step (one 128-bit vector).
    const LANES: usize = 16;

    // Comparison operands: each is 16 copies of the same byte.
    let last_control = Simd::<u8, LANES>::splat(b' ' - 1); // 0x1F, for the `< ' '` check
    let quote = Simd::<u8, LANES>::splat(b'"');
    let backslash = Simd::<u8, LANES>::splat(b'\\');

    let mut offset = 0;

    // Vector loop: runs while a full 16-byte block is still available.
    while bytes.len() - offset >= LANES {
        let block = Simd::<u8, LANES>::from_slice(&bytes[offset..]);

        // A lane is set when its byte satisfies ANY of the three conditions.
        let hits = block.simd_le(last_control) | block.simd_eq(quote) | block.simd_eq(backslash);

        if hits.any() {
            // The lowest set bit of the bitmask is the first matching lane.
            let lane = hits.to_bitmask().trailing_zeros() as usize;
            return Some(offset + lane);
        }

        offset += LANES;
    }

    // Fewer than 16 bytes left (possibly none): finish with the lookup table.
    bytes[offset..]
        .iter()
        .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
        .map(|pos| offset + pos)
}
1540
// Stable-Rust SIMD scanner built on explicit SSE2 intrinsics. Compiled only on
// x86_64 with the "simd" feature when the nightly portable-SIMD path is not
// selected.
//
// Returns the index of the first byte of `bytes` that needs escaping (a control
// byte `0x00..=0x1F`, `"` or `\`), or `None` when no byte does.
#[cfg(all(feature = "simd", not(nightly), target_arch = "x86_64"))]
#[inline]
fn find_escape_char(bytes: &[u8]) -> Option<usize> {
    use std::arch::x86_64::*;

    // SSE2 works on 128-bit registers, which is 16 bytes.
    const LANES: usize = 16;

    /// Returns a 16-bit mask (in the low bits of the result) whose bit `k` is
    /// set when `chunk[k]` needs escaping.
    ///
    /// # Safety
    ///
    /// `chunk` must contain at least `LANES` bytes and the CPU must support
    /// SSE2.
    #[target_feature(enable = "sse2")]
    unsafe fn escape_mask(chunk: &[u8]) -> i32 {
        debug_assert!(chunk.len() >= LANES);

        // SAFETY: the caller guarantees 16 readable bytes behind the pointer,
        // and `_mm_loadu_si128` has no alignment requirement.
        let data = unsafe { _mm_loadu_si128(chunk.as_ptr() as *const __m128i) };

        // `_mm_set1_epi8` broadcasts one byte to all 16 lanes.
        let last_control = _mm_set1_epi8(0x1F);
        let quote = _mm_set1_epi8(b'"' as i8);
        let backslash = _mm_set1_epi8(b'\\' as i8);

        // Unsigned `byte <= 0x1F`: min(byte, 0x1F) == byte.
        //
        // A signed comparison (`_mm_cmplt_epi8`) must NOT be used here: bytes
        // 0x80..=0xFF, i.e. every byte of a multi-byte UTF-8 sequence, are
        // negative as `i8` and would be reported as control characters.
        let is_control = _mm_cmpeq_epi8(_mm_min_epu8(data, last_control), data);
        let is_quote = _mm_cmpeq_epi8(data, quote);
        let is_backslash = _mm_cmpeq_epi8(data, backslash);

        // A lane is all ones if any condition held; `movemask` gathers the
        // most significant bit of every lane into an integer.
        _mm_movemask_epi8(_mm_or_si128(is_control, _mm_or_si128(is_quote, is_backslash)))
    }

    let mut chunks = bytes.chunks_exact(LANES);
    let mut offset = 0;

    // Main loop: one full 16-byte chunk per iteration.
    for chunk in chunks.by_ref() {
        // SAFETY: `chunks_exact` yields slices of exactly `LANES` bytes, and
        // this function is only compiled for x86_64, whose baseline includes
        // SSE2.
        let mask = unsafe { escape_mask(chunk) };
        if mask != 0 {
            // The lowest set bit corresponds to the first matching byte.
            return Some(offset + mask.trailing_zeros() as usize);
        }
        offset += LANES;
    }

    // Handle the remainder (fewer than 16 bytes) with the scalar lookup.
    chunks
        .remainder()
        .iter()
        .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
        .map(|pos| offset + pos)
}
1608
1609// A fallback for when SIMD feature is off.
1610#[cfg(not(feature = "simd"))]
1611#[inline]
1612fn find_escape_char(bytes: &[u8]) -> Option<usize> {
1613    bytes
1614        .iter()
1615        .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
1616}
1617
// Reject the one configuration for which no `find_escape_char` implementation
// is compiled: "simd" enabled on a non-nightly build for a non-x86_64 target.
#[cfg(all(feature = "simd", not(nightly), not(target_arch = "x86_64")))]
compile_error!("simd requires nightly or target_arch = \"x86_64\"");
1620
// Unescape table: indexed by the byte that follows a '\', it holds the decoded
// bytes of the simple (single-character) escapes and `None` for every other
// byte.
const UNESCAPE_TABLE: [Option<&[u8]>; 256] = {
    // (byte after the backslash, bytes it decodes to)
    const SIMPLE_ESCAPES: [(u8, &[u8]); 8] = [
        (b'"', b"\""),
        (b'\\', b"\\"),
        (b'/', b"/"),
        (b'b', b"\x08"),
        (b'f', b"\x0C"),
        (b'n', b"\n"),
        (b'r', b"\r"),
        (b't', b"\t"),
    ];

    let mut table: [Option<&[u8]>; 256] = [None; 256];
    let mut k = 0;
    while k < SIMPLE_ESCAPES.len() {
        let (escape, decoded) = SIMPLE_ESCAPES[k];
        table[escape as usize] = Some(decoded);
        k += 1;
    }
    table
};
1634
/// Static table mapping every u8 -> a &'static [u8] of length 1.
/// This lets us return a `'static` slice for any single byte cheaply.
///
/// This is a `static` rather than a `const` on purpose: a `const` is
/// instantiated afresh at every use, so a borrow of one of its elements would
/// point into a temporary. A `static` has a single address that is valid for
/// the whole program.
static U8_TABLE: [[u8; 1]; 256] = {
    let mut table = [[0u8; 1]; 256];
    let mut i = 0usize;
    while i < 256 {
        table[i] = [i as u8];
        i += 1;
    }
    table
};

/// Returns a one-byte `'static` slice whose only element is `b`.
#[inline(always)]
fn byte_as_static_slice(b: u8) -> &'static [u8] {
    // coerce from &'static [u8;1] to &'static [u8]
    &U8_TABLE[usize::from(b)]
}
1652
// The following function is adapted from the `percent-encoding` crate, version 2.3.2.
// Source: https://github.com/servo/rust-url/blob/22b925f93ad505a830f1089538a9ed6f5fd90612/percent_encoding/src/lib.rs#L337-L365
//
// It is licensed under the same terms as the `percent-encoding` crate (MIT/Apache-2.0).
//
// This helper lossily converts a Cow<'_, [u8]> to a Cow<'_, str>. When the
// input is an owned, valid UTF-8 Vec<u8>, its buffer is re-used for the
// resulting String instead of being copied.
//
// Unlike the upstream version it needs no `unsafe`: `String::from_utf8` both
// validates the bytes and takes over the allocation.
#[cfg(feature = "alloc")]
fn decode_utf8_lossy(input: Cow<'_, [u8]>) -> Cow<'_, str> {
    match input {
        Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes),
        Cow::Owned(bytes) => Cow::Owned(match String::from_utf8(bytes) {
            // Valid UTF-8: the Vec's allocation becomes the String's.
            Ok(text) => text,
            // Invalid UTF-8: build a new string with U+FFFD substituted for
            // the invalid sequences.
            Err(err) => String::from_utf8_lossy(err.as_bytes()).into_owned(),
        }),
    }
}
1690
/// Compare two chunk-iterators by their concatenated byte stream (streaming,
/// zero allocations).
///
/// This is allocation-free: it streams through both iterators, comparing
/// overlapping prefixes and carrying the remainder of the longer chunk
/// forward into the next round.
///
/// Only the concatenated bytes matter. Chunk boundaries are irrelevant, and
/// empty chunks (which contribute nothing to the stream) are skipped, so
/// `["abc", ""]` and `["abc"]` compare equal.
///
/// Returns `true` when both iterators produce exactly the same byte sequence.
fn chunks_eq<'a, I1, A, I2, B>(mut a: I1, mut b: I2) -> bool
where
    A: 'a + AsRef<[u8]> + ?Sized,
    B: 'a + AsRef<[u8]> + ?Sized,
    I1: Iterator<Item = &'a A>,
    I2: Iterator<Item = &'a B>,
{
    /// Advances `chunks` to its next non-empty chunk and returns its bytes, or
    /// `None` once the iterator has no non-empty chunk left.
    fn next_non_empty<'c, C, I>(chunks: &mut I) -> Option<&'c [u8]>
    where
        C: 'c + AsRef<[u8]> + ?Sized,
        I: Iterator<Item = &'c C>,
    {
        for chunk in chunks {
            let bytes: &'c [u8] = chunk.as_ref();
            if !bytes.is_empty() {
                return Some(bytes);
            }
        }
        None
    }

    let mut a_rem: &[u8] = &[];
    let mut b_rem: &[u8] = &[];

    loop {
        // Refill the remainder of 'a' when it has been fully consumed.
        if a_rem.is_empty() {
            a_rem = match next_non_empty(&mut a) {
                Some(bytes) => bytes,
                // 'a' is exhausted. They are equal only if 'b' has no bytes
                // left either: neither carried over nor still to come.
                None => return b_rem.is_empty() && next_non_empty(&mut b).is_none(),
            };
        }

        // Refill the remainder of 'b' when it has been fully consumed.
        if b_rem.is_empty() {
            b_rem = match next_non_empty(&mut b) {
                Some(bytes) => bytes,
                // 'b' is exhausted, but 'a' still has bytes (a_rem is
                // non-empty). Therefore, they cannot be equal.
                None => return false,
            };
        }

        // Both remainders are non-empty here. Compare as many bytes as the
        // shorter one provides.
        let n = a_rem.len().min(b_rem.len());
        if a_rem[..n] != b_rem[..n] {
            return false;
        }

        // Move the slices past the part we just compared.
        a_rem = &a_rem[n..];
        b_rem = &b_rem[n..];
    }
}
1741
/// Writes `bytes` to `f` as UTF-8 text.
///
/// Every valid run is written verbatim. For each invalid sequence, lossy mode
/// writes a single U+FFFD replacement character and continues; strict mode
/// returns `fmt::Error` immediately, after the valid text that preceded the
/// invalid sequence has already been written.
#[inline]
fn display_bytes_uft8(bytes: &[u8], f: &mut fmt::Formatter<'_>, lossy: bool) -> fmt::Result {
    bytes.utf8_chunks().try_for_each(|chunk| {
        f.write_str(chunk.valid())?;

        match (chunk.invalid().is_empty(), lossy) {
            // Nothing invalid after this run.
            (true, _) => Ok(()),
            (false, true) => f.write_char(char::REPLACEMENT_CHARACTER),
            (false, false) => Err(fmt::Error),
        }
    })
}
1758
#[cfg(test)]
mod tests {
    use core::fmt::Display;
    use std::{io::Read as _, string::ToString as _, vec};

    use super::*;

    // ===================== Escape ===================== //

    /// Escapes `input` and checks the result against `want`, once by
    /// collecting the chunks and once through the iterator's `PartialEq`.
    fn test_escape_typical(input: &str, want: &str) {
        let collected: String = escape_str(input).collect();
        assert_eq!(collected, want);

        // Test PartialEq too
        assert_eq!(escape_str(input), want)
    }

    #[test]
    fn test_empty_string() {
        test_escape_typical("", "");
    }

    #[test]
    fn test_quotes() {
        test_escape_typical(r#""hello""#, r#"\"hello\""#)
    }

    #[test]
    fn test_backslash() {
        test_escape_typical(r"\hello\", r"\\hello\\");
    }

    #[test]
    fn test_slash() {
        // A forward slash is left untouched when escaping.
        test_escape_typical("/hello/", "/hello/");
    }

    #[test]
    fn test_control_chars() {
        test_escape_typical("\n\r\t\x08\x0C", r"\n\r\t\b\f");
    }

    #[test]
    fn test_escape_fully() {
        test_escape_typical(
            "Hello, \"world\"!\nThis contains a \\ backslash and a \t tab.",
            r#"Hello, \"world\"!\nThis contains a \\ backslash and a \t tab."#,
        );
    }

    #[test]
    fn test_other_control_chars() {
        // Control characters without a short escape use the `\uXXXX` form.
        test_escape_typical("Null:\0, Bell:\x07", r#"Null:\u0000, Bell:\u0007"#);

        test_escape_typical("\x00\x1F", r"\u0000\u001f");
        test_escape_typical("\x19", r"\u0019");
    }

    #[test]
    fn test_iterator_chunks() {
        // The escape splits the input into: text before, escape, text after.
        let mut chunks = escape_str("prefix\npostfix");
        assert_eq!(chunks.next(), Some("prefix"));
        assert_eq!(chunks.next(), Some(r"\n"));
        assert_eq!(chunks.next(), Some("postfix"));
        assert_eq!(chunks.next(), None);
    }

    #[test]
    fn test_no_escape_needed() {
        // Each of these must come back as one single, unmodified chunk.
        for input in ["A simple string with no escapes.", "café", "❤️"] {
            let mut iter = escape_str(input);
            assert_eq!(iter.next(), Some(input));
            assert_eq!(iter.next(), None);
        }
    }

    // ===================== Unescape ===================== //

    #[test]
    fn test_byte_table() {
        assert_eq!(byte_as_static_slice(0), &[0]);
        assert_eq!(byte_as_static_slice(5), &[5]);
        assert_eq!(byte_as_static_slice(255), &[255]);
    }

    /// Unescapes `input` and checks the result against `want` three ways:
    /// `decode_utf8`, the iterator's `PartialEq`, and `display_utf8`.
    fn test_unescape_typical<I: AsRef<[u8]> + ?Sized>(input: &I, want: &str) {
        let decoded = unescape(input).decode_utf8().unwrap();
        assert_eq!(decoded, want);

        // Test PartialEq too
        assert_eq!(unescape(input), want);

        // Help display
        assert_display(unescape(input).display_utf8(), Ok(want));
    }

    #[test]
    fn test_unicode_escape_basic_unescape() {
        let cases = [
            // \u4E16 => 世 (E4 B8 96)
            ("X\\u4E16Y", "X世Y"),
            // \u2603 => ☃
            ("Snow: \\u2603", "Snow: ☃"),
            // Ω is U+03A9
            ("A \\u03A9 B", "A Ω B"),
        ];

        for (input, want) in cases {
            test_unescape_typical(input, want);
        }
    }

    #[test]
    fn test_surrogate_pair_unescape() {
        // 😀 is U+1F600 -> in JSON: \uD83D\uDE00
        test_unescape_typical("A\\uD83D\\uDE00B", "A😀B")
    }

    #[test]
    fn test_invalid_escape_unescape() {
        let mut iter = unescape(b"\\x");
        let first = iter.next();

        assert!(
            matches!(
                first,
                Some(Err(UnescapeError {
                    kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'x' }),
                    offset: 1,
                }))
            ),
            "expected invalid escape"
        );
    }

    #[test]
    fn test_simple_unescape() {
        // "Hello\nWorld\"!"
        test_unescape_typical("Hello\\nWorld\\\"!", "Hello\nWorld\"!")
    }

    #[test]
    fn test_truncated_unicode() {
        // The `\u` escape is too short: only two of four hex digits.
        let first_error = unescape("Trunc: \\u12").find_map(Result::err);

        // The first error produced must be the expected one.
        let found = matches!(
            first_error,
            Some(UnescapeError {
                kind: UnescapeErrorKind::UnexpectedEof,
                offset: 4,
            })
        );
        assert!(found);
    }

    // ===================== Chunk_Eq ===================== //

    #[test]
    fn test_empty_iterators_are_equal() {
        let left: Vec<&[u8]> = vec![];
        let right: Vec<&[u8]> = vec![];
        assert!(chunks_eq(left.into_iter(), right.into_iter()));
    }

    #[test]
    fn test_empty_vs_non_empty() {
        let a: Vec<&[u8]> = vec![];
        let b = vec![&[1, 2, 3]];
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));

        // And the other way around
        let a = vec![&[1, 2, 3]];
        let b: Vec<&[u8]> = vec![];
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_single_identical_chunks() {
        let left = vec!["hello world"];
        let right = vec!["hello world"];
        assert!(chunks_eq(left.into_iter(), right.into_iter()));
    }

    #[test]
    fn test_different_chunk_boundaries_str() {
        // This is the key test: the concatenated content is identical,
        // but the chunk divisions are different.
        let left = vec!["he", "llo", " ", "world"];
        let right = vec!["hello ", "wo", "rld"];
        assert!(chunks_eq(left.into_iter(), right.into_iter()));
    }

    #[test]
    fn test_different_chunk_boundaries_bytes() {
        let a = vec![&[1, 2], &[3, 4, 5][..]];
        let b = vec![&[1, 2, 3], &[4, 5][..]];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_one_long_vs_many_short() {
        let whole = vec!["a-long-single-chunk"];
        let pieces = vec!["a", "-", "long", "-", "single", "-", "chunk"];
        assert!(chunks_eq(whole.into_iter(), pieces.into_iter()));
    }

    #[test]
    fn test_unequal_content_same_length() {
        let left = vec!["hello"];
        let right = vec!["hallo"];
        assert!(!chunks_eq(left.into_iter(), right.into_iter()));
    }

    #[test]
    fn test_unequal_at_chunk_boundary() {
        let left = vec!["ab", "c"]; // "abc"
        let right = vec!["ab", "d"]; // "abd"
        assert!(!chunks_eq(left.into_iter(), right.into_iter()));
    }

    #[test]
    fn test_one_is_prefix_of_other() {
        // a is shorter
        let a = vec!["user", "name"]; // "username"
        let b = vec!["user", "name", "123"]; // "username123"
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));

        // b is shorter
        let a = vec!["user", "name", "123"];
        let b = vec!["user", "name"];
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_complex_remainer_logic() {
        // This tests the carry-over logic extensively.
        // a: [1,2,3], [4,5], [6,7,8,9], [10]
        // b: [1,2], [3,4,5,6], [7,8], [9,10]
        let a = vec![&[1, 2, 3], &[4, 5][..], &[6, 7, 8, 9], &[10]];
        let b = vec![&[1, 2], &[3, 4, 5, 6][..], &[7, 8], &[9, 10]];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_with_vec_references() {
        // Chunks may also be references to owned vectors.
        let v_a1 = vec![1, 2];
        let v_a2 = vec![3, 4, 5];
        let a_data = vec![&v_a1, &v_a2];

        let v_b1 = vec![1, 2, 3];
        let v_b2 = vec![4, 5];
        let b_data = vec![&v_b1, &v_b2];
        assert!(chunks_eq(a_data.into_iter(), b_data.into_iter()));
    }

    // ===================== Unescape Read ===================== //

    #[test]
    fn test_read_simple() {
        let mut reader = unescape(br#"hello world"#);
        let mut buf = [0u8; 20];

        let bytes_read = reader.read(&mut buf).unwrap();

        assert_eq!(bytes_read, 11);
        assert_eq!(&buf[..bytes_read], b"hello world");

        // Second read should return 0 (EOF)
        let bytes_read_eof = reader.read(&mut buf).unwrap();
        assert_eq!(bytes_read_eof, 0);
    }

    #[test]
    fn test_read_with_simple_escapes() {
        let mut reader = unescape(br#"hello\tworld\nline2"#);
        let mut out = Vec::new();

        reader.read_to_end(&mut out).unwrap();

        assert_eq!(out, b"hello\tworld\nline2");
    }

    #[test]
    fn test_read_into_small_buffer_multiple_calls() {
        let input = br#"this is a long string with no escapes"#;
        let mut reader = unescape(input);
        let mut buf = [0u8; 10];
        let mut result = Vec::new();

        // Drain the reader ten bytes at a time until it reports EOF.
        loop {
            match reader.read(&mut buf) {
                Ok(0) => break, // EOF
                Ok(n) => result.extend_from_slice(&buf[..n]),
                Err(e) => panic!("Read error: {}", e),
            }
        }

        assert_eq!(result, input);
    }

    #[test]
    fn test_read_multibyte_char_across_buffer_boundary() {
        // The grinning face emoji 😀 is \uD83D\uDE00, which is 4 bytes in UTF-8: 0xF0 0x9F 0x98 0x80
        let input = br#"emoji: \uD83D\uDE00 is here"#;
        let mut reader = unescape(input);

        // Buffer is small, forcing the 4-byte emoji to be written across multiple calls
        let mut buf = [0u8; 8];
        let mut result = Vec::new();

        let expected_reads: [&[u8]; 4] = [
            // First read: "emoji: " (7 bytes) + first byte of emoji
            b"emoji: \xF0",
            // Second read: next 3 bytes of emoji + " is h"
            b"\x9F\x98\x80 is h",
            // Third read: "ere"
            b"ere",
            // Final read should be EOF
            b"",
        ];

        for want in expected_reads {
            let n = reader.read(&mut buf).unwrap();
            assert_eq!(n, want.len());
            assert_eq!(&buf[..n], want);
            result.extend_from_slice(&buf[..n]);
        }

        assert_eq!(result, b"emoji: \xF0\x9F\x98\x80 is here");
        assert_eq!(result, "emoji: 😀 is here".as_bytes());
    }

    #[test]
    fn test_read_error_invalid_escape() {
        let mut reader = unescape(br#"hello \q world"#);
        let mut buf = [0u8; 20];

        let result = reader.read(&mut buf);

        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("invalid escape"));
    }

    #[test]
    fn test_read_error_lone_surrogate() {
        // High surrogate without a following low one
        let mut reader = unescape(br#"\uD83D rest of data seen"#);
        let mut buf = [0u8; 10];

        let err = reader.read(&mut buf).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("lone surrogate"));
    }

    #[test]
    fn test_read_empty_input() {
        let mut reader = unescape(b"");
        let mut buf = [0u8; 10];
        assert_eq!(reader.read(&mut buf).unwrap(), 0);
    }

    #[test]
    fn test_read_into_empty_buffer() {
        let mut reader = unescape(b"hello");
        let mut buf = [0u8; 0];
        // A read into an empty buffer should always succeed and return 0.
        assert_eq!(reader.read(&mut buf).unwrap(), 0);
    }

    #[test]
    fn test_read_to_end_optimized() {
        let mut reader = unescape(br#"first\nsecond\tthird \uD83D\uDE00 last"#);
        let mut out = Vec::new();

        let bytes_read = reader.read_to_end(&mut out).unwrap();
        let expected = b"first\nsecond\tthird \xF0\x9F\x98\x80 last";

        assert_eq!(bytes_read, expected.len());
        assert_eq!(out, expected);
    }

    // ===================== Unescape Display ===================== //

    /// Formats `display` into a `String` and checks the outcome: `Ok(text)`
    /// requires formatting to succeed and produce exactly `text`; `Err(())`
    /// requires formatting to fail.
    fn assert_display(display: impl Display, want: Result<&str, ()>) {
        let mut rendered = String::new();
        let outcome = fmt::write(&mut rendered, format_args!("{display}"));

        if let Ok(expected) = want {
            assert!(outcome.is_ok());
            assert_eq!(rendered, expected)
        } else {
            assert!(
                outcome.is_err(),
                "strict mode should return Err on invalid bytes"
            )
        }
    }

    // -- NON-LOSSY TESTS (must be perfect) --

    #[test]
    fn test_display_simple_string() {
        assert_display(unescape("hello world").display_utf8(), Ok("hello world"));
    }

    #[test]
    fn test_display_empty_string() {
        assert_display(unescape("").display_utf8(), Ok(""));
    }

    #[test]
    fn test_display_standard_escapes() {
        let escaped = br#"\" \\ \/ \b \f \n \r \t"#;
        assert_display(
            unescape(escaped).display_utf8(),
            Ok("\" \\ / \x08 \x0C \n \r \t"),
        );
    }

    #[test]
    fn test_display_non_escaped_utf8() {
        // Raw multi-byte UTF-8 passes through unchanged.
        let text = "你好, world";
        assert_display(unescape(text.as_bytes()).display_utf8(), Ok(text));
    }

    #[test]
    fn test_display_unicode_escape_bmp() {
        // cent sign: \u00A2 -> C2 A2 (2 bytes)
        assert_display(unescape(br"a\u00A2b").display_utf8(), Ok("a¢b"));
    }

    #[test]
    fn test_display_mixed_content() {
        let escaped = br#"Text with \n, \u00A2, and \uD83D\uDE0E emojis."#;
        assert_display(
            unescape(escaped).display_utf8(),
            Ok("Text with \n, ¢, and 😎 emojis."),
        );
    }

    #[test]
    fn test_display_starts_and_ends_with_escape() {
        assert_display(unescape(br#"\u00A2hello\t"#).display_utf8(), Ok("¢hello\t"));
    }

    // -- NON-LOSSY ERROR TESTS --

    #[test]
    fn test_display_err_invalid_escape() {
        assert_display(unescape(br"hello \z world").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_incomplete_unicode() {
        assert_display(unescape(br"\u123").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_invalid_hex_in_unicode() {
        assert_display(unescape(br"\u123g").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_lone_high_surrogate() {
        assert_display(unescape(br"\uD800").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_high_surrogate_not_followed_by_low() {
        assert_display(unescape(br"\uD800\uABCD").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_invalid_source_utf8() {
        // A valid UTF-8 sequence for 'h' followed by an invalid byte
        assert_display(unescape(b"h\x80ello").display_utf8(), Err(()));
    }

    #[test]
    fn strict_valid_multi_byte_split() {
        // "€" U+20AC => bytes [0xE2, 0x82, 0xAC]
        let euro = &[0xE2, 0x82, 0xAC];
        assert_display(unescape(euro).display_utf8(), Ok("€"));
    }

    #[test]
    fn strict_errors_on_invalid_start_byte() {
        let bytes = &[0xFF, b'a'];
        assert_display(unescape(bytes).display_utf8(), Err(()));
    }

    // -- LOSSY TESTS --

    #[test]
    fn lossy_replaces_invalid_start_byte() {
        // 0xFF is invalid as a leading UTF-8 byte; it is followed by ASCII 'a'.
        let bytes = &[0xFF, b'a'];
        // replacement char + 'a'
        assert_display(unescape(bytes).display_utf8_lossy(), Ok("\u{FFFD}a"));
    }

    #[test]
    fn lossy_handles_trailing_incomplete_bytes() {
        // A trailing incomplete 3-byte sequence: [0xE2, 0x82] (missing 0xAC)
        let bytes: &[u8] = &[0xE2, 0x82];
        // Should replace incomplete tail with U+FFFD.
        assert_display(unescape(bytes).display_utf8_lossy(), Ok("\u{FFFD}"));
    }

    #[test]
    fn test_display_lossy_invalid_source_utf8() {
        // The invalid byte sequence should be replaced.
        assert_display(
            unescape(b"valid\xF0\x90\x80invalid").display_utf8_lossy(),
            Ok("valid\u{FFFD}invalid"),
        );
    }

    #[test]
    fn test_display_lossy_invalid_escape_truncates() {
        // In lossy mode, an invalid JSON escape stops the processing.
        assert_display(
            unescape(br"this is ok\z but this is not").display_utf8_lossy(),
            Ok("this is ok"),
        );
    }

    #[test]
    fn test_display_lossy_incomplete_unicode_truncates() {
        assert_display(
            unescape(br"truncate here \uD83D").display_utf8_lossy(),
            Ok("truncate here "),
        );
    }

    // Inspired by and copied from memchr
    #[test]
    fn sync_regression() {
        use core::panic::{RefUnwindSafe, UnwindSafe};

        // Compiles only if `T` has all four auto traits.
        fn assert_send_sync<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}
        assert_send_sync::<Unescape<'_>>();
        assert_send_sync::<Escape<'_>>();
    }
}
json_escape/lib.rs

json_escape/
lib.rs