json_escape/
lib.rs

1//! # Streaming JSON String Escape/Unescape
2//!
3//! Welcome to a highly efficient, `no_std` compatible library for handling JSON string escaping and unescaping. This crate provides iterator-based tools that process strings on the fly, avoiding heap allocations for the entire result. It's designed for performance-critical applications, such as parsing large JSON files or working in memory-constrained environments. ⚡
4//!
5//! The core of the library is two iterator structs:
6//! - **[`Escape`]**: Takes a string slice (`&str`) and yields escaped string slices ready for JSON serialization.
7//! - **[`Unescape`]**: Takes a byte slice (`&[u8]`) representing the content of a JSON string and yields the decoded byte slices.
8//!
9//! ## Key Features
10//! - **Zero-Copy Slicing**: For sequences of characters that don't need modification, the iterators yield slices that borrow directly from the input, avoiding unnecessary data copying.
11//! - **Comprehensive JSON Support**: Correctly handles all standard JSON escapes: `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`.
12//! - **Full Unicode Handling**: Correctly decodes `\uXXXX` sequences, including full support for UTF-16 surrogate pairs (e.g., `\uD83D\uDE00` for `😀`).
13//! - **Robust Error Handling**: The `Unescape` iterator returns descriptive errors (`UnescapeError`) for invalid or truncated escape sequences, making debugging straightforward.
14//! - **Allocation Control** (with `alloc` feature): Provides convenient methods to collect the iterator's output into owned types like `String` or `Cow<str>`.
15//! - **`std::io` Integration** (with `std` feature): The `Unescape` iterator implements `std::io::Read`, allowing it to be used as an efficient reader for I/O streams.
16//!
17//! ## Quick Start: Escaping a String
18//!
19//! ```
20//! use json_escape::escape_str;
21//!
22//! let input = "Hello, \"world\"!\nThis contains a \\ backslash.";
23//! let expected = r#"Hello, \"world\"!\nThis contains a \\ backslash."#;
24//!
25//! // The `escape_str` function returns an iterator.
26//! let mut escaper = escape_str(input);
27//!
28//! // You can iterate over the chunks:
29//! assert_eq!(escaper.next(), Some("Hello, "));
30//! assert_eq!(escaper.next(), Some(r#"\""#));
31//! assert_eq!(escaper.next(), Some("world"));
32//! // ...and so on.
33//!
34//! // Or, collect it into a String (requires the "alloc" feature).
35//! // let escaped_string: String = escape_str(input).collect();
36//! // assert_eq!(escaped_string, expected);
37//! ```
38//!
39//! ## Quick Start: Unescaping a String
40//!
//! ```
//! # #[cfg(feature = "alloc")] {
//! use json_escape::unescape;
//!
//! let input = r#"A 😀 emoji: \uD83D\uDE00 and a tab\t!"#;
//!
//! // The unescape iterator yields `Result<&[u8], _>`.
//! let unescaper = unescape(input);
//!
//! // With the "alloc" feature, you can decode it directly into a string.
//! let decoded_cow = unescaper.decode_utf8().unwrap();
//! assert_eq!(decoded_cow, "A 😀 emoji: 😀 and a tab\t!");
//! # }
//! ```
53//!
54//! ## Performance and the `explicit` Module
55//!
56//! This crate is designed for high-performance, zero-allocation escaping and
57//! unescaping. For most use cases, the functions in this root module provide the
58//! best balance of ergonomics and speed.
59//!
60//! However, for users with extreme performance requirements, the [`explicit`]
61//! module is provided. Its iterators yield structured `Chunk` data instead of
62//! simple slices. As shown by benchmarks, this approach can be slightly faster,
63//! especially on inputs with a high density of escape sequences. If you are
64//! processing a very large volume of JSON strings in a tight loop, consider
65//! using the `explicit` module for a potential performance boost.
66#![no_std]
67#![deny(missing_docs)]
68#![deny(clippy::undocumented_unsafe_blocks)]
69#![cfg_attr(all(feature = "simd", nightly), feature(portable_simd))]
70
71#[cfg(any(test, feature = "std"))]
72extern crate std;
73
// Link `alloc` for test builds as well: the `use alloc::{...}` import that
// follows is compiled under `cfg(test)` even when the `alloc` feature is
// disabled, and would otherwise fail to resolve.
#[cfg(any(test, feature = "alloc"))]
extern crate alloc;
76
77#[cfg(any(test, feature = "alloc"))]
78use alloc::{borrow::Cow, string::String, vec::Vec};
79use token::EscapeTokens;
80
81use core::{
82    char,
83    fmt::{self, Write as _},
84    iter::FusedIterator,
85    str,
86};
87
88pub mod explicit;
89pub mod stream;
90pub mod token;
91
92// =============================================================================
93// Escape Implementation
94// =============================================================================
95
96/// Creates a streaming JSON string escaper from a string slice.
97///
98/// The returned [`Escape`] iterator lazily processes the input string, yielding
99/// slices that represent the escaped output.
100///
101/// # Examples
102///
103/// ```
104/// use json_escape::escape_str;
105///
106/// let escaper = escape_str("a\nb");
107/// let escaped_parts: Vec<_> = escaper.collect();
108///
109/// assert_eq!(escaped_parts, vec!["a", r#"\n"#, "b"]);
110/// ```
111#[inline]
112pub fn escape_str(input: &str) -> Escape<'_> {
113    Escape {
114        inner: EscapeTokens::new(input),
115    }
116}
117
118/// A streaming JSON string escaper that yields `&'a str` slices.
119///
120/// This struct is created by the [`escape_str`] function. It is an [`Iterator`]
121/// that breaks the input string into chunks at each character that needs to be
122/// escaped according to JSON rules.
123///
124/// - For sequences of safe characters, it yields a single borrowed slice (`&'a str`).
125/// - For each character that must be escaped, it yields a `'static` slice
126///   containing the escaped representation (e.g., `r#"\n"#`).
127///
128/// This approach is highly efficient as it avoids allocating a new string for the
129/// entire output, processing the input in a streaming fashion.
130///
131/// ### Implemented Traits
132/// - **`Iterator<Item = &'a str>`**: Allows you to process the escaped parts in a loop or with adapters.
133/// - **`Display`**: Lets you write the escaped content directly to any formatter, like `println!` or a file, without intermediate allocation.
134/// - **`Clone`**, **`Debug`**: Standard utility traits.
135/// - **`PartialEq`**, **`PartialEq<B: AsRef<[u8]>>`**: Allows direct comparison of the escaped output. An `Escape` iterator is equal to another `Escape` or a byte slice if they produce an identical sequence of escaped bytes.
136/// - **`From<Escape<'a>> for Cow<'a, str>`** (requires `alloc` feature): Provides an efficient way to convert the iterator into a potentially owned string.
137#[derive(Clone)]
138#[must_use = "iterators are lazy and do nothing unless consumed"]
139pub struct Escape<'a> {
140    inner: EscapeTokens<'a>,
141}
142
143impl<'a> Iterator for Escape<'a> {
144    type Item = &'a str;
145
146    #[inline(always)]
147    fn next(&mut self) -> Option<&'a str> {
148        self.inner.next().map(|s| s.as_str())
149    }
150
151    fn size_hint(&self) -> (usize, Option<usize>) {
152        self.inner.size_hint()
153    }
154}
155
156impl<'a> FusedIterator for Escape<'a> {}
157
158impl fmt::Display for Escape<'_> {
159    /// Allows direct formatting of the escaped string without intermediate allocation.
160    ///
161    /// This is very useful for writing the escaped output directly to a stream,
162    /// such as a file or a network socket.
163    ///
164    /// # Example
165    ///
166    /// ```
167    /// use json_escape::escape_str;
168    ///
169    /// let escaper = escape_str("User said: \"Hi!\"\n");
170    /// let formatted = format!("{}", escaper);
171    ///
172    /// assert_eq!(formatted, r#"User said: \"Hi!\"\n"#);
173    /// ```
174    #[inline]
175    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
176        fmt::Display::fmt(&self.inner, f)
177    }
178}
179
180impl fmt::Debug for Escape<'_> {
181    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
182        f.debug_struct("Escape").finish_non_exhaustive()
183    }
184}
185
186impl<B: AsRef<[u8]> + ?Sized> PartialEq<B> for Escape<'_> {
187    /// Compares the escaped output with any byte-slice-like object.
188    ///
189    /// This is primarily a convenience for testing, allowing you to check the
190    /// fully concatenated result of an `Escape` iterator against a known `&str` or `&[u8]`.
191    ///
192    /// The notion of equality is based on the **output**, not the iterator's internal state.
193    ///
194    /// # Example
195    ///
196    /// ```
197    /// use json_escape::escape_str;
198    ///
199    /// let escaper = escape_str("key\tvalue");
200    ///
201    /// // The escaper's output, when concatenated, equals the right-hand side.
202    /// assert_eq!(escaper, r#"key\tvalue"#);
203    /// ```
204    fn eq(&self, other: &B) -> bool {
205        let mut other = other.as_ref();
206        for chunk in self.clone() {
207            if !other.starts_with(chunk.as_bytes()) {
208                return false;
209            }
210            other = &other[chunk.len()..];
211        }
212        // We completely searched it
213        other.is_empty()
214    }
215}
216
217impl<'a, 'b> PartialEq<Escape<'a>> for Escape<'b> {
218    /// Compares two `Escape` iterators for equality.
219    ///
220    /// Two `Escape` iterators are considered equal if they'll produce the same **output**.
221    /// It first performs a fast check on the underlying byte slices.
222    fn eq(&self, other: &Escape<'a>) -> bool {
223        // Fast path: if they are views into the same underlying data.
224        self.inner.bytes == other.inner.bytes || chunks_eq(self.clone(), other.clone())
225    }
226}
227
#[cfg(feature = "alloc")]
impl<'a> From<Escape<'a>> for Cow<'a, str> {
    /// Gathers the escaped pieces into a `Cow<'a, str>`.
    ///
    /// The conversion is delegated to the inner token iterator and is
    /// documented to skip allocation when it can:
    /// - input that needs **no escaping** yields `Cow::Borrowed` pointing at
    ///   the original string;
    /// - otherwise a `String` is allocated and returned as `Cow::Owned`.
    ///
    /// Prefer this over `iter.collect::<String>()`, which always allocates.
    ///
    /// **Requires the `alloc` feature.**
    ///
    /// # Example
    ///
    /// ```
    /// # #[cfg(feature = "alloc")] {
    /// use json_escape::escape_str;
    /// use std::borrow::Cow;
    ///
    /// // No escaping needed, so no allocation occurs.
    /// let cow_borrowed: Cow<str> = escape_str("plain text").into();
    /// assert!(matches!(cow_borrowed, Cow::Borrowed(_)));
    ///
    /// // Escaping is required, so a new String is allocated.
    /// let cow_owned: Cow<str> = escape_str("text with\nnewline").into();
    /// assert!(matches!(cow_owned, Cow::Owned(_)));
    /// assert_eq!(cow_owned, r#"text with\nnewline"#);
    /// # }
    /// ```
    fn from(iter: Escape<'a>) -> Self {
        let Escape { inner } = iter;
        inner.into()
    }
}
263
264// =============================================================================
265// Unescape Implementation
266// =============================================================================
267
268/// Creates a streaming JSON string unescaper from a byte slice.
269///
270/// This function creates an iterator to unescape a byte slice representing the
271/// **raw contents** of a JSON string, assuming the outer quotes have already
272/// been removed.
273///
274/// For a more convenient way to handle complete JSON string literals (including
275/// their surrounding `"` quotes), see the [`unescape_quoted`] function, which
276/// automatically trims them.
277///
278/// The iterator will fail if the input contains invalid JSON escape sequences.
279///
280/// # Example
281///
282/// ```
283/// use json_escape::{unescape, unescape_quoted};
284///
285/// // `unescape` works on the raw content, without quotes.
286/// let content = r#"hello\tworld"#;
287/// assert_eq!(unescape(content), "hello\tworld");
288///
289/// // If you pass a full JSON literal, the quotes are treated as literal characters.
290/// let literal = r#""hello\tworld""#;
291/// assert_eq!(unescape(literal), "\"hello\tworld\""); // Note the quotes in the output.
292///
293/// // For full literals like this, `unescape_quoted` is the recommended function.
294/// assert_eq!(unescape_quoted(literal), "hello\tworld");
295/// ```
296#[inline]
297pub fn unescape<I: AsRef<[u8]> + ?Sized>(input: &I) -> Unescape<'_> {
298    Unescape::new(input.as_ref())
299}
300
301/// Creates a streaming JSON string unescaper, trimming enclosing quotes.
302///
303/// This function acts as a convenience wrapper around [`unescape`]. It first
304/// inspects the input byte slice. If the slice begins and ends with a double-quote
305/// character (`"`), these quotes are trimmed before the inner content is passed to
306/// the unescaper.
307///
308/// If the input is not enclosed in quotes, this function behaves exactly like
309/// [`unescape`]. This is useful for directly unescaping a complete JSON string
310/// literal.
311///
312/// # Example
313///
314/// ```
315/// use json_escape::{unescape, unescape_quoted};
316///
317/// // 1. With quotes: The outer quotes are trimmed before unescaping.
318/// let unescaper = unescape_quoted(r#""hello\nworld""#);
319/// assert_eq!(unescaper, b"hello\nworld");
320///
321/// // 2. Without quotes: Behaves exactly like the standard `unescape`.
322/// let unescaper_no_quotes = unescape_quoted(r#"raw string"#);
323/// assert_eq!(unescaper_no_quotes, b"raw string");
324///
325/// // 3. Mismatched quotes: The input is passed through as-is, quotes are not trimmed.
326/// let mismatched_quotes = unescape_quoted(r#"hello""#);
327/// assert_eq!(mismatched_quotes, b"hello\"");
328///
329/// // 4. Empty quoted string: Correctly results in an empty output.
330/// let empty_quoted = unescape_quoted(r#""""#);
331/// assert_eq!(empty_quoted, b"");
332/// ```
333#[inline]
334pub fn unescape_quoted<I: AsRef<[u8]> + ?Sized>(input: &I) -> Unescape<'_> {
335    let bytes = input.as_ref();
336    let input = if bytes.len() >= 2 && bytes[0] == b'\"' && bytes[bytes.len() - 1] == b'\"' {
337        &bytes[1..bytes.len() - 1]
338    } else {
339        bytes
340    };
341
342    unescape(input)
343}
344
345/// A streaming JSON string unescaper.
346///
/// This struct is created by the [`unescape`] and [`unescape_quoted`] functions. It
/// implements an [`Iterator`] that yields `Result<&'a [u8], UnescapeError>`, lazily
/// decoding the input.
349///
350/// The iterator's output chunks are one of the following:
351/// - **`Ok(&'a [u8])`**: A borrowed slice of the original input for a sequence of non-escaped bytes.
352/// - **`Ok(&'static [u8])`**: A single-byte slice for a decoded escape sequence (e.g., `\n` becomes a slice containing `0x0A`).
353///   For `\uXXXX` sequences, it yields a series of single-byte slices representing the UTF-8 encoding of the character.
354/// - **`Err(UnescapeError)`**: An error indicating an invalid escape sequence, which halts further iteration as described below.
355///
356/// Because the iterator operates on bytes, you can use helper methods like
357/// [`Unescape::decode_utf8`] or [`Unescape::decode_utf8_lossy`] to convert the
358/// final result into a string.
359///
360/// # Error Handling
361///
362/// When the iterator encounters an invalid or incomplete escape, it returns an
363/// `Err(UnescapeError)` describing the problem. The iterator then remains in an
364/// **error state**: subsequent calls to `next()` will continue to return that same
365/// error (i.e., the error is idempotent) and the iterator will not produce further
366/// `Ok` chunks. This makes the behavior deterministic for callers that check the
367/// first error and then stop.
368///
369/// Errors are classified by the precise condition encountered:
370/// - **`InvalidEscape`**: The escape sequence uses an unknown escape character (e.g., `\q`).
371/// - **`InvalidHex`**: A `\u` escape contains a non-hex character where a hex
372///   digit was expected (e.g., `\uZ`).
373/// - **`UnexpectedEof`**: The input ended before a complete escape sequence could be
374///   read. This is used when there isn't enough input yet to decide whether the
375///   sequence would be valid (for instance, an incomplete `\u` or a truncated
376///   surrogate pair).
377/// - **`LoneSurrogate`**: A complete `\uXXXX` was read, and it encodes a *high*
378///   surrogate, but the following bytes definitively do not form a valid low
379///   surrogate escape (for example, the next character is a space or any
380///   non-`\u` character).
381///
382/// The difference between `UnexpectedEof` and `LoneSurrogate` is important:
383/// - `UnexpectedEof` means **we couldn't decide** because the input ended too early.
384/// - `LoneSurrogate` means **we did decide**—we saw a full `\uXXXX` high surrogate,
385///   and the following input proves a pair will not follow.
386///
387/// #### Concrete examples
388///
389/// 1) A high surrogate followed by other data (not a `\u` low-surrogate) → `LoneSurrogate`:
390///
391/// ```rust
392/// use json_escape::{unescape, UnescapeErrorKind, LoneSurrogateError};
393///
394/// let mut iter = unescape(r"\uD83D more data");
395/// let err = iter.next().unwrap().unwrap_err();
396/// assert!(matches!(err.kind(), UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: 0xD83D, .. })));
397///
398/// // Subsequent calls return the same error (iterator remains in the same error state).
399/// let err = iter.next().unwrap().unwrap_err();
400/// assert!(matches!(err.kind(), UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: 0xD83D, .. })));
401/// ```
402///
403/// 2) An invalid escape character → `InvalidEscape`:
404///
405/// ```rust
406/// use json_escape::{unescape, UnescapeErrorKind, InvalidEscapeError};
407///
408/// let mut iter = unescape(r"\q"); // `\q` is not a defined escape
409/// let err = iter.next().unwrap().unwrap_err();
410/// assert!(matches!(err.kind(), UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'q', .. })));
411/// ```
412///
413/// 3) A malformed `\u` with a non-hex character → `InvalidHex`:
414///
415/// ```rust
416/// use json_escape::{unescape, UnescapeErrorKind, InvalidHexError};
417///
418/// let mut iter = unescape(r"\uZ");
419/// let err = iter.next().unwrap().unwrap_err();
420/// assert!(matches!(err.kind(), UnescapeErrorKind::InvalidHex(InvalidHexError { found: b'Z', .. })));
421/// ```
422///
423/// 4) Truncated / incomplete input ⇒ `UnexpectedEof`:
424///
425/// ```rust
426/// use json_escape::{unescape, UnescapeErrorKind};
427///
428/// // a) truncated after the first \uXXXX (no following bytes yet)
429/// let mut iter = unescape(r"\uD83D");
430/// let err = iter.next().unwrap().unwrap_err();
431/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
432///
433/// // b) starts a second \u but is truncated before hex digits
434/// let mut iter = unescape(r"\uD83D\u");
435/// let err = iter.next().unwrap().unwrap_err();
436/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
437///
438/// // c) a lone backslash at end of input
439/// let mut iter = unescape("\\");
440/// let err = iter.next().unwrap().unwrap_err();
441/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
442/// ```
443///
444/// **Note**: This behavior intentionally mirrors common JSON parsers (e.g.,
445/// `serde_json`, Go's `encoding/json`) for the EOF vs. semantic error distinction.
446///
447/// # Implemented Traits and Usage
448///
449/// - **`Iterator<Item = Result<&'a [u8], UnescapeError>>`**: The core trait for
450///   processing the unescaped byte chunks.
451/// - **`std::io::Read`** (requires `std` feature): Lets you use the unescaper as a
452///   standard reader, perfect for integrating with other I/O APIs.
453/// - **`TryFrom<Unescape<'a>> for Cow<'a, [u8]>`** (requires `alloc` feature): An
454///   efficient way to collect the unescaped bytes, propagating any errors.
455/// - **`Clone`**, **`Debug`**: Standard utility traits.
456/// - **`PartialEq<B: AsRef<[u8]>>`**: Compares the fully unescaped output with a byte slice.
457///
458/// ## Reading Unescaped Bytes
459///
460/// With the `std` feature, `Unescape` can be used as any other `std::io::Read`
461/// source. This is ideal for streaming and decoding large JSON string contents
462/// without buffering the entire result in memory first.
463///
464/// ```rust
465/// # #[cfg(feature = "std")] {
466/// use json_escape::unescape;
467/// use std::io::Read;
468///
469/// let mut reader = unescape(r#"chunk1\nchunk2"#);
470/// let mut buf = Vec::new();
471///
472/// // Read all unescaped bytes from the iterator into the buffer.
473/// reader.read_to_end(&mut buf).unwrap();
474///
475/// assert_eq!(buf, b"chunk1\nchunk2");
476/// # }
477/// ```
478#[derive(Clone)]
479#[must_use = "iterators are lazy and do nothing unless consumed"]
480pub struct Unescape<'a> {
481    // The inner, chunk-based iterator.
482    inner: explicit::Unescape<'a>,
483    // scratch buffer for encoded UTF-8 bytes from a \uXXXX (or surrogate pair)
484    unicode: [u8; 4],
485    // We can eliminate this by depending on the header.
486    unicode_len: u8, // how many bytes are valid in buf (0 means no pending)
487    unicode_pos: u8, // how many bytes already emitted
488}
489
490impl<'a> Unescape<'a> {
491    /// Construct from a byte slice which contains the characters inside the JSON string (no quotes).
492    fn new(input: &'a [u8]) -> Self {
493        Self {
494            inner: explicit::Unescape { bytes: input },
495            unicode: [0; 4],
496            unicode_len: 0,
497            unicode_pos: 0,
498        }
499    }
500
501    #[inline]
502    fn store_unicode(&mut self, ch: char) {
503        self.unicode_len = ch.encode_utf8(&mut self.unicode).len() as u8;
504        self.unicode_pos = 0;
505    }
506
507    #[inline]
508    fn emit_pending_byte(&mut self) -> Option<u8> {
509        if self.unicode_pos < self.unicode_len {
510            let b = self.unicode[self.unicode_pos as usize];
511            self.unicode_pos += 1;
512            Some(b)
513        } else {
514            None
515        }
516    }
517
518    /// Helper to emit the full unicode sequence and advance the internal position.
519    #[inline]
520    fn emit_unicode_as_str(&mut self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
521        // The check `unicode_pos > 0` is implicit from the call site.
522        //
523        // SAFETY: The buffer is guaranteed to contain a valid UTF-8 sequence.
524        let s = unsafe { str::from_utf8_unchecked(&self.unicode[..self.unicode_len as usize]) };
525        f.write_str(s)?;
526
527        // Mark the entire sequence as emitted.
528        self.unicode_pos = self.unicode_len;
529
530        Ok(())
531    }
532
533    fn _display_utf8(mut self, f: &mut fmt::Formatter<'_>, lossy: bool) -> fmt::Result {
534        // The key insight: Chunks with more than one byte are *always*
535        // borrowed from the original input, as all escaped characters
536        // are yielded byte-by-byte.
537        while let Some(result) = self.next() {
538            match result {
539                Ok(chunk) => {
540                    if chunk.is_empty() {
541                        continue;
542                    }
543
544                    // THE CORE LOGIC:
545                    // Check if the iterator just yielded the *first byte* of a *multi-byte* sequence.
546                    // - `unicode_pos == 1` means the first byte was just emitted.
547                    // - `unicode_len > 1` means it's a multi-byte char (e.g., '¢', '😎').
548                    if self.unicode_pos == 1 && self.unicode_len > 1 {
549                        // This is our special case. We have the first byte in `chunk`, but
550                        // it's more efficient to write the whole character at once from our buffer.
551                        self.emit_unicode_as_str(f)?;
552                        // The iterator will no longer yield the rest of the bytes. Since our helper
553                        // has now advanced it. But to be sure...
554                        self.unicode_pos = self.unicode_len;
555                    } else {
556                        // This is the normal case:
557                        // 1. A large chunk borrowed from the original input.
558                        // 2. A single-byte escape like `\n` or `\t`.
559                        // 3. The last byte of a multi-byte sequence (or the only byte).
560                        // In all these cases, we just need to display the chunk we received.
561                        display_bytes_utf8(chunk, f, lossy)?;
562                    }
563                }
564                Err(_) => {
565                    if lossy {
566                        break;
567                    } else {
568                        return Err(fmt::Error);
569                    }
570                }
571            }
572        }
573
574        Ok(())
575    }
576
577    /// Decodes the unescaped byte stream into a UTF-8 string.
578    ///
579    /// This method consumes the iterator and collects all resulting byte chunks.
580    /// If an unescaping error occurs, it's returned immediately. If the final
581    /// sequence of bytes is not valid UTF-8, a UTF-8 error is returned.
582    ///
583    /// Like `From<Escape>`, this is optimized to return a `Cow::Borrowed` if no
584    /// escapes were present in the input, avoiding allocation.
585    ///
586    /// **Requires the `alloc` feature.**
587    ///
588    /// # Example
589    ///
590    /// ```
591    /// # #[cfg(feature = "alloc")] {
592    /// use json_escape::unescape;
593    ///
594    /// let input = r#"Emoji: \uD83D\uDE00"#;
595    /// let cow = unescape(input).decode_utf8().unwrap();
596    ///
597    /// assert_eq!(cow, "Emoji: 😀");
598    /// # }
599    /// ```
600    #[cfg(feature = "alloc")]
601    pub fn decode_utf8(self) -> Result<Cow<'a, str>, DecodeUtf8Error> {
602        match self.try_into().map_err(DecodeUtf8Error::Unescape)? {
603            Cow::Borrowed(bytes) => str::from_utf8(bytes)
604                .map(Cow::Borrowed)
605                .map_err(DecodeUtf8Error::Utf8),
606            Cow::Owned(bytes) => String::from_utf8(bytes)
607                .map(Cow::Owned)
608                .map_err(|e| DecodeUtf8Error::Utf8(e.utf8_error())),
609        }
610    }
611
612    /// Decodes the unescaped byte stream lossily into a UTF-8 string.
613    ///
614    /// This is similar to [`Unescape::decode_utf8`] but replaces any invalid UTF-8 sequences
615    /// with the replacement character (U+FFFD) instead of returning an error.
616    ///
617    /// An `UnescapeError` can still be returned if the JSON escaping itself is invalid.
618    ///
619    /// **Requires the `alloc` feature.**
620    #[cfg(feature = "alloc")]
621    pub fn decode_utf8_lossy(self) -> Result<Cow<'a, str>, UnescapeError> {
622        Ok(decode_utf8_lossy(self.try_into()?))
623    }
624
625    /// Returns a wrapper that implements [`fmt::Display`].
626    ///
627    /// This allows an `Unescape` iterator to be used directly with formatting
628    /// macros like `println!`, `format!`, etc. It writes the unescaped content
629    /// directly to the formatter's buffer, **avoiding any heap allocations**.
630    ///
631    /// The iterator is consumed, and the resulting unescaped string is written
632    /// to the formatter. Any invalid JSON escape sequences or invalid UTF-8 will
633    /// cause a `fmt::Error`. **You should be cautious when using this method
634    /// with the `format!` macro, as a `fmt::Error` from us will cause the macro
635    /// to panic**.
636    ///
637    /// For a more robust alternative that will not panic on `UnescapeError` or
638    /// invalid bytes, consider using [`Unescape::display_utf8_lossy`] instead.
639    ///
640    /// This method is a **zero-allocation** alternative to [`Unescape::decode_utf8`],
641    /// which might allocate a `String` to return the unescaped content.
642    ///
643    /// # Example
644    ///
645    /// ```
646    /// use json_escape::unescape;
647    ///
648    /// let original = r#"Hello, \uD83C\uDF0E!"#;
649    /// let unescaper = unescape(original);
650    ///
651    /// let formatted = format!("{}", unescaper.display_utf8());
652    /// assert_eq!(formatted, "Hello, 🌎!");
653    /// ```
654    pub fn display_utf8(self) -> DisplayUnescape<'a> {
655        DisplayUnescape { inner: self }
656    }
657
658    /// Returns a wrapper that implements [`fmt::Display`] lossily.
659    ///
660    /// This method is an **allocation-free** way to write unescaped content
661    /// to a formatter. It handles invalid JSON escape sequences and invalid
662    /// UTF-8 gracefully, making it a "lossy" operation.
663    ///
664    /// - **Invalid JSON escape sequences:** Instead of causing an error, the iterator
665    ///   terminates without an error.
666    /// - **Invalid UTF-8 bytes:** These are replaced with the Unicode
667    ///   replacement character (U+FFFD).
668    ///
669    /// This method is the **zero-allocation** counterpart to [`Unescape::decode_utf8_lossy`].
670    pub fn display_utf8_lossy(self) -> DisplayUnescapeLossy<'a> {
671        DisplayUnescapeLossy { inner: self }
672    }
673}
674
675impl<'a> Iterator for Unescape<'a> {
676    type Item = Result<&'a [u8], UnescapeError>;
677
678    fn next(&mut self) -> Option<Self::Item> {
679        // If we have pending bytes, emit them first (fast).
680        if let Some(s) = self.emit_pending_byte() {
681            // s: &'static [u8] coerces to &'a [u8]
682            return Some(Ok(byte_as_static_slice(s)));
683        }
684
685        match self.inner.next() {
686            Some(Ok(chunk)) => {
687                if let Some(ch) = chunk.unescaped {
688                    self.store_unicode(ch);
689                }
690                Some(Ok(chunk.literal))
691            }
692            Some(Err(err)) => Some(Err(err)),
693            None => None,
694        }
695    }
696}
697
698impl<'a> FusedIterator for Unescape<'a> {}
699
#[cfg(feature = "std")]
impl std::io::Read for Unescape<'_> {
    /// Copies as many unescaped bytes as fit into `buf` and returns that count.
    ///
    /// Each call first drains bytes of a previously decoded character that are
    /// still buffered in `self.unicode`, then pulls further chunks from the
    /// inner iterator until `buf` is full or the input is exhausted.
    ///
    /// # Errors
    ///
    /// An invalid escape sequence is reported as `ErrorKind::InvalidData`
    /// wrapping the `UnescapeError`. In line with the `Read::read` contract
    /// ("if an error is returned, no bytes were read"), an error that is hit
    /// *after* some bytes were already copied in this call is deferred: the
    /// call returns `Ok(n)` for the bytes delivered so far and the inner
    /// iterator is rewound so the next call reports the error.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let mut total_written = 0;
        let mut remaining_buf = buf;

        // Loop until the destination buffer is full or we are completely out of data.
        loop {
            // Priority 1: Drain any pending bytes from an unescaped character first.
            if self.unicode_pos < self.unicode_len {
                let pending_unicode =
                    &self.unicode[self.unicode_pos as usize..self.unicode_len as usize];
                let bytes_to_copy = pending_unicode.len().min(remaining_buf.len());

                remaining_buf[..bytes_to_copy].copy_from_slice(&pending_unicode[..bytes_to_copy]);
                // `bytes_to_copy` is bounded by `unicode_len - unicode_pos`, so it fits in a `u8`.
                self.unicode_pos += bytes_to_copy as u8;
                total_written += bytes_to_copy;
                remaining_buf = &mut remaining_buf[bytes_to_copy..];

                // If buffer is now full, we are done for this call.
                if remaining_buf.is_empty() {
                    break;
                }
            }
            if self.unicode_pos >= self.unicode_len {
                self.unicode_pos = 0;
                self.unicode_len = 0;
            }

            // Snapshot of the unread input *before* this step. It lets us rewind the
            // inner iterator (whose position is its `bytes` slice) without any unsafe
            // code, both for a partially copied literal and for a deferred error.
            let resume_point = self.inner.bytes;

            // Priority 2: Get and process a new chunk from the inner iterator.
            match self.inner.next() {
                Some(Ok(chunk)) => {
                    let bytes_to_copy = chunk.literal.len().min(remaining_buf.len());
                    if bytes_to_copy > 0 {
                        remaining_buf[..bytes_to_copy]
                            .copy_from_slice(&chunk.literal[..bytes_to_copy]);
                        total_written += bytes_to_copy;
                        remaining_buf = &mut remaining_buf[bytes_to_copy..];
                    }

                    // Backtracking: `buf` filled up before the whole literal was copied.
                    // The unread remainder of the stream is the rest of the literal, the
                    // escape sequence that followed it, and everything after that. All of
                    // it is the tail of `resume_point`, starting at the first uncopied
                    // literal byte.
                    if bytes_to_copy < chunk.literal.len() {
                        // The literal is non-empty here and was produced from
                        // `resume_point`, so it lies inside that slice; the address
                        // difference is its offset. Slicing is bounds-checked, so a
                        // broken assumption panics instead of causing undefined behavior.
                        let literal_start =
                            chunk.literal.as_ptr() as usize - resume_point.as_ptr() as usize;
                        self.inner.bytes = &resume_point[literal_start + bytes_to_copy..];

                        // Since the buffer is full, we must stop and return. The next `read`
                        // call will resume from the middle of this chunk.
                        break;
                    }

                    // If we get here, the entire literal was consumed. Now handle the unescaped char.
                    if let Some(ch) = chunk.unescaped {
                        let encoded = ch.encode_utf8(&mut self.unicode);
                        self.unicode_len = encoded.len() as u8;
                        // Loop to immediately process the newly buffered unicode bytes.
                        continue;
                    }
                }
                Some(Err(e)) => {
                    if total_written > 0 {
                        // Bytes were already handed to the caller in this call; returning
                        // `Err` now would silently drop them. Rewind so the same error is
                        // produced by the next `read`, and report the progress made.
                        self.inner.bytes = resume_point;
                        break;
                    }
                    return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e));
                }
                None => break, // Inner iterator is exhausted.
            }
        }

        Ok(total_written)
    }

    /// Appends all remaining unescaped bytes to `buf` and returns how many were added.
    ///
    /// This overrides the default implementation by iterating `self` chunk by
    /// chunk and extending `buf` directly. On an unescape error the bytes
    /// appended so far stay in `buf` and the error is returned as
    /// `ErrorKind::InvalidData`.
    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> std::io::Result<usize> {
        let start_len = buf.len();

        // Consume the rest of the iterator, appending every chunk.
        for result in self {
            match result {
                Ok(chunk) => buf.extend_from_slice(chunk),
                Err(err) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, err)),
            }
        }

        Ok(buf.len() - start_len)
    }
}
813
814impl fmt::Debug for Unescape<'_> {
815    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
816        f.debug_struct("Unescape").finish_non_exhaustive()
817    }
818}
819
820impl<B: AsRef<[u8]> + ?Sized> PartialEq<B> for Unescape<'_> {
821    /// Compares the unescaped output with a byte-slice-like object.
822    ///
823    /// An `Unescape` iterator is considered equal to a byte slice if it successfully
824    /// unescapes to produce a sequence of bytes identical to that slice. If the
825    /// iterator would produce an error, the comparison returns `false`.
826    ///
827    /// # Example
828    ///
829    /// ```
830    /// use json_escape::unescape;
831    ///
832    /// let unescaper = unescape(r#"hello\nworld"#);
833    /// assert_eq!(unescaper, b"hello\nworld");
834    ///
835    /// // An iterator that produces an error is not equal to any valid slice.
836    /// let failing_unescaper = unescape(r#"\k"#);
837    /// assert_ne!(failing_unescaper, b"k");
838    /// ```
839    fn eq(&self, other: &B) -> bool {
840        let mut other = other.as_ref();
841        for result in self.clone() {
842            match result {
843                Ok(chunk) => {
844                    if !other.starts_with(chunk) {
845                        return false;
846                    }
847                    other = &other[chunk.len()..];
848                }
849                Err(_) => return false, // An erroring iterator cannot be equal to a valid slice.
850            }
851        }
852        other.is_empty()
853    }
854}
855
856impl<B: AsRef<[u8]>> PartialEq<Unescape<'_>> for Result<B, UnescapeError> {
857    /// Compares the unescaper's outcome with a `Result`.
858    ///
859    /// This implementation allows for precise testing of the `Unescape` iterator
860    /// by comparing it against either a successful outcome (`Ok`) or a specific
861    /// failure (`Err`).
862    ///
863    /// - If `result` is `Ok(bytes)`, the comparison is `true` only if the iterator
864    ///   completes successfully and its concatenated output is identical to `bytes`.
865    ///
866    /// - If `result` is `Err(error)`, the comparison is `true` only if the iterator
867    ///   produces the exact same `UnescapeError`.
868    ///
869    /// # Example
870    ///
871    /// ```
872    /// use json_escape::{unescape, UnescapeError, InvalidEscapeError};
873    ///
874    /// // --- Success Case ---
875    /// let unescaper = unescape(r#"hello\tworld"#);
876    /// // The comparison is against an `Ok` variant.
877    /// assert_eq!(Ok("hello\tworld"), unescaper);
878    ///
879    /// // --- Error Case ---
880    /// let failing_unescaper = unescape(r#"invalid-\u"#);
881    /// // We can assert that the iterator produces a specific error.
882    /// # let unexpected_eof = unescape(r"\u").next().unwrap().unwrap_err();
883    /// assert_eq!(Err::<&str, _>(unexpected_eof), failing_unescaper);
884    /// ```
885    fn eq(&self, unescape: &Unescape<'_>) -> bool {
886        match self {
887            Ok(expected_bytes) => unescape == expected_bytes,
888            Err(expected_error) => {
889                for result in unescape.clone() {
890                    if let Err(actual_error) = result {
891                        // The iterator's first error is its final outcome.
892                        // It must match the expected error exactly.
893                        return actual_error == *expected_error;
894                    }
895                }
896                // `unescape` completed successfully, but an error was expected.
897                false
898            }
899        }
900    }
901}
902
903impl<'a, 'b> PartialEq<Unescape<'a>> for Unescape<'b> {
904    /// Compares two `Unescape` iterators for equality based on their terminal result.
905    ///
906    /// The equality of two `Unescape` iterators is determined by the final `Result`
907    /// that would be obtained if each iterator were fully consumed (e.g., by using `try_collect()`).
908    ///
909    /// The specific rules are as follows:
910    ///
911    /// 1.  **Error vs. Error**: If both iterators terminate with an `Err`, they are
912    ///     considered **equal** if and only if their `UnescapeError`s are identical.
913    ///     Any bytes successfully unescaped *before* the error are ignored in this case.
914    /// 2.  **Success vs. Success**: If both iterators terminate with `Ok`, they are
915    ///     considered **equal** if and only if the complete sequence of unescaped bytes
916    ///     is identical for both.
917    /// 3.  **Success vs. Error**: If one iterator terminates with `Ok` and the other
918    ///     with `Err`, they are always **not equal**.
919    ///
920    /// # Example
921    ///
922    /// ```
923    /// use json_escape::unescape;
924    ///
925    /// // Case 1: Both iterators produce the same error. They are equal,
926    /// // even though their valid prefixes ("a" and "b") are different.
927    /// let failing_a = unescape(r#"a\k"#);
928    /// let failing_b = unescape(r#"b\k"#);
929    /// assert_eq!(failing_a, failing_b);
930    ///
931    /// // Case 2: Both iterators succeed. Equality depends on the byte stream.
932    /// let successful_a = unescape(r#"hello\nworld"#);
933    /// let successful_b = unescape(r#"hello\nworld"#);
934    /// assert_eq!(successful_a, successful_b);
935    ///
936    /// let successful_c = unescape(r#"different"#);
937    /// assert_ne!(successful_a, successful_c);
938    ///
939    /// // Case 3: One succeeds and one fails. They are not equal.
940    /// let succeeding = unescape(r#"stop"#);
941    /// let failing = unescape(r#"stop\k"#);
942    /// assert_ne!(succeeding, failing);
943    ///
944    /// // Case 4: Both iterators fail differently. They are not equal.
945    /// let failing_a = unescape(r#"data:\k"#);
946    /// let failing_b = unescape(r#"data:\"#);
947    /// assert_ne!(failing_a, failing_b);
948    /// ```
949    fn eq(&self, other: &Unescape<'a>) -> bool {
950        // Fast path: if they are views into the same underlying data with the same state.
951        ((self.inner.bytes == other.inner.bytes)
952            && (self.unicode == other.unicode)
953            && (self.unicode_len == other.unicode_len)
954            && (self.unicode_pos == other.unicode_pos))
955            || {
956                let mut a_error = None;
957                let mut b_error = None;
958
959                let mut a = self.clone().map_while(|result| match result {
960                    Ok(ok) => Some(ok),
961                    Err(err) => {
962                        a_error = Some(err);
963                        None
964                    }
965                });
966
967                let mut b = other.clone().map_while(|result| match result {
968                    Ok(ok) => Some(ok),
969                    Err(err) => {
970                        b_error = Some(err);
971                        None
972                    }
973                });
974
975                let streams_match = chunks_eq(&mut a, &mut b);
976
977                // Drain the iterators to ensure the error state is captured,
978                // especially if chunks_eq returned false early.
979                // (e.g unescape("a\k") and unescape("b\k") which are actually
980                // equal)
981                a.for_each(|_| {});
982                b.for_each(|_| {});
983
984                match (a_error, b_error) {
985                    // Both errored: equality depends only on the errors being the same.
986                    (Some(a_err), Some(b_err)) => a_err == b_err,
987                    // Both succeeded: equality depends on the byte streams having been identical.
988                    (None, None) => streams_match,
989                    // One errored and the other didn't: they are not equal.
990                    _ => false,
991                }
992            }
993    }
994}
995
#[cfg(feature = "alloc")]
impl<'a> TryFrom<Unescape<'a>> for Cow<'a, [u8]> {
    type Error = UnescapeError;

    /// Collects the unescaped output into a `Cow<'a, [u8]>`, borrowing when possible.
    ///
    /// When the iterator yields at most one chunk, that chunk (or an empty
    /// slice) is returned as `Cow::Borrowed` and nothing is allocated. As soon
    /// as a second chunk appears, all chunks are concatenated into a
    /// `Cow::Owned` buffer.
    ///
    /// The first `UnescapeError` encountered stops the conversion and is
    /// returned.
    ///
    /// **Requires the `alloc` feature.**
    fn try_from(mut value: Unescape<'a>) -> Result<Self, Self::Error> {
        // No chunk at all: the output is empty.
        let Some(first) = value.next().transpose()? else {
            return Ok(Cow::Borrowed(b""));
        };
        // Exactly one chunk: hand it back without copying.
        let Some(second) = value.next().transpose()? else {
            return Ok(Cow::Borrowed(first));
        };

        // Two or more chunks: concatenate. The remaining raw input length is
        // used as the size estimate for what is still to come.
        let capacity = first.len() + second.len() + value.inner.bytes.len();
        let mut joined = Vec::with_capacity(capacity);
        joined.extend_from_slice(first);
        joined.extend_from_slice(second);
        for chunk in value {
            joined.extend_from_slice(chunk?);
        }
        Ok(Cow::Owned(joined))
    }
}
1030
1031// =============================================================================
1032// DisplayUnescape Implementation
1033// =============================================================================
1034
/// A wrapper for an [`Unescape`] iterator that implements [`fmt::Display`].
///
/// This struct is created by the [`Unescape::display_utf8()`] method. It allows for
/// printing the unescaped content directly to a formatter, which **avoids
/// any heap allocations**. The unescaping and UTF-8 decoding are performed on-the-fly as the
/// `fmt` method is called.
///
/// Formatting operates on a clone of the wrapped iterator, so the same value
/// can be displayed more than once.
pub struct DisplayUnescape<'a> {
    // The iterator to render; `fmt` clones it instead of consuming it.
    inner: Unescape<'a>,
}
1044
1045impl fmt::Display for DisplayUnescape<'_> {
1046    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1047        self.inner.clone()._display_utf8(f, false)
1048    }
1049}
1050
/// A wrapper for an [`Unescape`] iterator that implements [`fmt::Display`] lossily.
///
/// This struct is created by the [`Unescape::display_utf8_lossy()`] method. Like
/// `DisplayUnescape`, it performs its operation **without any heap allocations**.
///
/// It differs from the wrapper returned by `display_utf8` in that it handles
/// two types of errors gracefully:
/// - Invalid JSON escape sequences will be ignored, and the iterator will
///   continue to completion without a `fmt::Error`.
/// - Invalid UTF-8 byte sequences will be replaced with the Unicode
///   replacement character (`�`, U+FFFD).
pub struct DisplayUnescapeLossy<'a> {
    // The iterator to render; `fmt` clones it instead of consuming it.
    inner: Unescape<'a>,
}
1065
1066impl fmt::Display for DisplayUnescapeLossy<'_> {
1067    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1068        // Lossy mode: replace invalid sequences with U+FFFD and continue.
1069        self.inner.clone()._display_utf8(f, true)
1070    }
1071}
1072
1073// =============================================================================
1074// Error Types
1075// =============================================================================
1076
/// An error that can occur when decoding the final byte stream to a UTF-8 string.
///
/// Either the unescaping itself failed, or it succeeded but the resulting
/// bytes were not valid UTF-8. The wrapped error is available both through
/// the variant payload and through `Error::source`.
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
pub enum DecodeUtf8Error {
    /// The unescaped byte sequence was not valid UTF-8.
    Utf8(str::Utf8Error),
    /// An error occurred during the JSON unescaping process itself.
    Unescape(UnescapeError),
}
1085
1086impl fmt::Display for DecodeUtf8Error {
1087    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1088        match self {
1089            DecodeUtf8Error::Utf8(e) => fmt::Display::fmt(e, f),
1090            DecodeUtf8Error::Unescape(e) => fmt::Display::fmt(e, f),
1091        }
1092    }
1093}
1094
/// Details of an invalid escape sequence error.
///
/// Carried by [`UnescapeErrorKind::InvalidEscape`].
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
#[non_exhaustive]
pub struct InvalidEscapeError {
    /// The invalid byte found after a `\`.
    pub found: u8,
}
1102
/// Details of a lone UTF-16 surrogate error.
///
/// Carried by [`UnescapeErrorKind::LoneSurrogate`].
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
#[non_exhaustive]
pub struct LoneSurrogateError {
    /// The 16-bit surrogate code point.
    pub surrogate: u16,
}
1110
/// Details of an invalid hex digit error within a `\uXXXX` sequence.
///
/// Carried by [`UnescapeErrorKind::InvalidHex`].
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
#[non_exhaustive]
pub struct InvalidHexError {
    /// The non-hex byte that was found.
    pub found: u8,
}
1118
1119impl fmt::Display for InvalidHexError {
1120    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1121        write!(f, "found invalid hex digit '0x{:02X}'", self.found)
1122    }
1123}
1124
/// An error that can occur during the JSON string unescaping process.
///
/// Combines what went wrong (see [`UnescapeError::kind`]) with where inside
/// the escape sequence it was detected (see [`UnescapeError::offset`]).
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
pub struct UnescapeError {
    /// The specific kind of unescaping error.
    pub(crate) kind: UnescapeErrorKind,
    /// The byte offset from the start of the escape sequence (`\`) where the
    /// error was detected.
    ///
    /// This is guaranteed to be less than 12, as the maximum escape sequence
    /// is `\uXXXX\uXXXX`.
    pub(crate) offset: u8,
}
1137
1138impl UnescapeError {
1139    /// Returns the specific kind of error that occurred.
1140    ///
1141    /// This can be used to programmatically handle different error types,
1142    /// such as distinguishing between a malformed hex sequence and an
1143    /// invalid escape character.
1144    ///
1145    /// ### Example
1146    ///
1147    /// ```
1148    /// # use json_escape::{unescape, UnescapeErrorKind, InvalidHexError};
1149    /// let mut unescaper = unescape(r#"\u123Z"#);
1150    /// let err = unescaper.next().unwrap().unwrap_err();
1151    ///
1152    /// match err.kind() {
1153    ///     UnescapeErrorKind::InvalidHex(InvalidHexError { found, .. }) => {
1154    ///         // We can inspect the exact invalid character found.
1155    ///         assert_eq!(found, b'Z');
1156    ///     }
1157    ///     _ => panic!("Expected an InvalidHex error"),
1158    /// }
1159    /// ```
1160    pub fn kind(&self) -> UnescapeErrorKind {
1161        self.kind
1162    }
1163
1164    /// Returns the byte offset from the start of the escape sequence (`\`)
1165    /// where the error was detected.
1166    ///
1167    /// - For `\x`, the offset is `1` (pointing to `x`).
1168    /// - For `\u123?`, the offset is `5` (pointing to `?`).
1169    /// - For a lone surrogate `\uD800`, the offset is `6` (pointing after the sequence).
1170    ///
1171    /// This is useful for providing detailed error messages that can point
1172    /// to the exact location of the problem in the source string.
1173    ///
1174    /// ### Example
1175    ///
1176    /// ```
1177    /// # use json_escape::unescape;
1178    /// let json_string_content = r#"bad escape \x here"#;
1179    /// let mut unescaper = unescape(json_string_content);
1180    ///
1181    /// // previous read
1182    /// // { ... }
1183    ///
1184    /// let err = unescaper.next().unwrap().unwrap_err();
1185    ///
1186    /// // The error occurred at the 'x', which is 1 byte after the '\'
1187    /// assert_eq!(err.offset(), 1);
1188    ///
1189    /// // You could use this to highlight the error in the original input
1190    /// let backslash_pos = json_string_content.find('\\').unwrap();
1191    /// let error_pos = backslash_pos + err.offset() as usize;
1192    /// assert_eq!(json_string_content.as_bytes()[error_pos], b'x');
1193    ///
1194    /// // The generated error message also includes this info.
1195    /// let expected_msg = "invalid escape: '\\0x78' at offset 1";
1196    /// assert_eq!(err.to_string(), expected_msg);
1197    /// ```
1198    pub fn offset(&self) -> u8 {
1199        self.offset
1200    }
1201}
1202
/// The specific kind of error that can occur during JSON string unescaping.
///
/// This enum covers all possible failures described by the JSON standard for string contents.
/// It is `#[non_exhaustive]`, so matches outside this crate need a wildcard arm.
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
#[non_exhaustive]
pub enum UnescapeErrorKind {
    /// Found a backslash followed by an unexpected character (e.g., `\x`).
    InvalidEscape(InvalidEscapeError),
    /// Found `\u` but the following characters were not 4 valid hex digits.
    InvalidHex(InvalidHexError),
    /// Input ended unexpectedly while parsing an escape sequence (e.g., `\u12`).
    UnexpectedEof,
    /// The `\u` sequence yielded a lone high or low surrogate without a matching pair.
    LoneSurrogate(LoneSurrogateError),
}
1218
1219impl fmt::Display for UnescapeError {
1220    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1221        match self.kind {
1222            UnescapeErrorKind::InvalidEscape(e) => {
1223                write!(
1224                    f,
1225                    "invalid escape: '\\0x{:02X}' at offset {}",
1226                    e.found, self.offset
1227                )
1228            }
1229            UnescapeErrorKind::InvalidHex(ref s) => {
1230                write!(f, "{} at offset {}", s, self.offset)
1231            }
1232            UnescapeErrorKind::UnexpectedEof => {
1233                write!(
1234                    f,
1235                    "unexpected end of input while parsing escape sequence, expected character at offset {}",
1236                    self.offset
1237                )
1238            }
1239            UnescapeErrorKind::LoneSurrogate(e) => write!(
1240                f,
1241                "invalid unicode sequence: lone surrogate found: 0x{:04X} at offset {}",
1242                e.surrogate, self.offset
1243            ),
1244        }
1245    }
1246}
1247
// `UnescapeError` wraps no underlying error, so the trait's default methods are kept as they are.
impl core::error::Error for UnescapeError {}
1249impl core::error::Error for DecodeUtf8Error {
1250    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
1251        match self {
1252            DecodeUtf8Error::Utf8(e) => Some(e),
1253            DecodeUtf8Error::Unescape(e) => Some(e),
1254        }
1255    }
1256}
1257
1258// =============================================================================
1259// Utilities
1260// =============================================================================
1261
/// Lookup table in which entry `i` is the one-element array `[i]`, for every `u8` value.
/// Borrowing an entry yields a `'static` single-byte slice without any allocation.
const U8_TABLE: [[u8; 1]; 256] = {
    let mut table = [[0u8; 1]; 256];
    let mut byte = 0u8;
    // `for` is not available in const evaluation, so walk the bytes with a plain loop.
    loop {
        table[byte as usize][0] = byte;
        if byte == u8::MAX {
            break;
        }
        byte += 1;
    }
    table
};
1273
1274#[inline(always)]
1275fn byte_as_static_slice(b: u8) -> &'static [u8] {
1276    // coerce from &'static [u8;1] to &'static [u8]
1277    &U8_TABLE[b as usize]
1278}
1279
// The following function is copied from the `percent-encoding` crate, version 2.3.2.
// Source: https://github.com/servo/rust-url/blob/22b925f93ad505a830f1089538a9ed6f5fd90612/percent_encoding/src/lib.rs#L337-L365
//
// It is licensed under the same terms as the `percent-encoding` crate (MIT/Apache-2.0).
//
// This helper is used to efficiently convert a Cow<'_, [u8]> to a Cow<'_, str>
// lossily, with a specific optimization to avoid a re-allocation when the input
// is an owned, valid UTF-8 Vec<u8>.
//
// Behavior by input:
// - Borrowed bytes: delegated to `String::from_utf8_lossy` (borrows if valid).
// - Owned, valid UTF-8 bytes: the existing buffer is reused as the `String`.
// - Owned, invalid UTF-8 bytes: a new `String` with U+FFFD replacements is returned.
#[cfg(feature = "alloc")]
#[allow(ambiguous_wide_pointer_comparisons)]
fn decode_utf8_lossy(input: Cow<'_, [u8]>) -> Cow<'_, str> {
    // Upstream note, kept from the original source: this function is also
    // duplicated in `form_urlencoded/src/query_encoding.rs` of the rust-url repository.
    match input {
        Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes),
        Cow::Owned(bytes) => {
            match String::from_utf8_lossy(&bytes) {
                Cow::Borrowed(utf8) => {
                    // If from_utf8_lossy returns a Cow::Borrowed, then we can
                    // be sure our original bytes were valid UTF-8. This is because
                    // if the bytes were invalid UTF-8 from_utf8_lossy would have
                    // to allocate a new owned string to back the Cow so it could
                    // replace invalid bytes with a placeholder.

                    // First we do a debug_assert to confirm our description above.
                    let raw_utf8: *const [u8] = utf8.as_bytes();
                    debug_assert!(core::ptr::eq(raw_utf8, &*bytes));

                    // SAFETY: Given we know the original input bytes are valid UTF-8,
                    // and we have ownership of those bytes, we re-use them and
                    // return a Cow::Owned here.
                    Cow::Owned(unsafe { String::from_utf8_unchecked(bytes) })
                }
                // Invalid UTF-8: keep the freshly allocated, repaired string.
                Cow::Owned(s) => Cow::Owned(s),
            }
        }
    }
}
1317
/// Compare two chunk-iterators by their concatenated byte stream (streaming,
/// zero allocations).
///
/// Returns `true` exactly when joining all chunks of `a` gives the same bytes
/// as joining all chunks of `b`. How the bytes are split into chunks does not
/// matter, and empty chunks are ignored wherever they occur.
///
/// This is allocation-free: it streams through both iterators, comparing
/// overlapping prefixes and carrying the remainder of the longer chunk
/// forward into the next round.
fn chunks_eq<'a, I1, A, I2, B>(mut a: I1, mut b: I2) -> bool
where
    A: 'a + AsRef<[u8]> + ?Sized,
    B: 'a + AsRef<[u8]> + ?Sized,
    I1: Iterator<Item = &'a A>,
    I2: Iterator<Item = &'a B>,
{
    let mut a_rem: &[u8] = &[];
    let mut b_rem: &[u8] = &[];

    loop {
        // Refill `a_rem` with the next non-empty chunk. Empty chunks contribute
        // no bytes, so they are skipped rather than treated as pending data.
        while a_rem.is_empty() {
            match a.next() {
                Some(chunk) => a_rem = chunk.as_ref(),
                // 'a' is exhausted. The streams are equal only if 'b' has no
                // bytes left either: nothing carried over, and nothing but
                // (possibly zero) empty chunks still to come.
                None => {
                    return b_rem.is_empty()
                        && b.all(|chunk| {
                            let bytes: &[u8] = chunk.as_ref();
                            bytes.is_empty()
                        });
                }
            }
        }

        // Same for `b_rem`.
        while b_rem.is_empty() {
            match b.next() {
                Some(chunk) => b_rem = chunk.as_ref(),
                // 'b' is exhausted, but 'a' still holds at least one byte
                // (a_rem is non-empty). Therefore, they cannot be equal.
                None => return false,
            }
        }

        // Both remainders are non-empty here; compare their common-length prefix.
        let n = a_rem.len().min(b_rem.len());
        let (a_head, a_tail) = a_rem.split_at(n);
        let (b_head, b_tail) = b_rem.split_at(n);
        if a_head != b_head {
            return false;
        }

        // Carry whatever was not compared into the next round.
        a_rem = a_tail;
        b_rem = b_tail;
    }
}
1368
/// Writes `bytes` to `f` as UTF-8 text.
///
/// Valid stretches are written unchanged. At an invalid sequence the function
/// either writes U+FFFD and carries on (`lossy == true`) or stops with
/// `fmt::Error` (`lossy == false`).
#[inline]
fn display_bytes_utf8(bytes: &[u8], f: &mut fmt::Formatter<'_>, lossy: bool) -> fmt::Result {
    bytes.utf8_chunks().try_for_each(|piece| {
        // The valid prefix of each piece is always emitted first.
        f.write_str(piece.valid())?;

        match (piece.invalid().is_empty(), lossy) {
            // Nothing invalid followed the valid part.
            (true, _) => Ok(()),
            // Invalid bytes, lossy mode: substitute the replacement character.
            (false, true) => f.write_char(char::REPLACEMENT_CHARACTER),
            // Invalid bytes, strict mode: abort formatting.
            (false, false) => Err(fmt::Error),
        }
    })
}
1385
1386#[cfg(test)]
1387mod tests {
1388    use core::fmt::Display;
1389    use std::{io::Read as _, string::ToString as _, vec};
1390
1391    use super::*;
1392
1393    // ===================== Escape ===================== //
1394
1395    fn test_escape_typical(input: &str, want: &str) {
1396        let got = escape_str(input).collect::<String>();
1397        assert_eq!(got, want);
1398
1399        // Test PartialEq too
1400        assert_eq!(escape_str(input), want);
1401
1402        // Let's test explicit regardless
1403        let got = explicit::escape_str(input).collect::<String>();
1404        assert_eq!(got, want);
1405
1406        // Test PartialEq too
1407        assert_eq!(explicit::escape_str(input), want)
1408    }
1409
1410    #[test]
1411    fn test_empty_string() {
1412        test_escape_typical("", "");
1413    }
1414
1415    #[test]
1416    fn test_quotes() {
1417        test_escape_typical("\"hello\"", "\\\"hello\\\"")
1418    }
1419
1420    #[test]
1421    fn test_backslash() {
1422        test_escape_typical("\\hello\\", "\\\\hello\\\\");
1423    }
1424
1425    #[test]
1426    fn test_slash() {
1427        test_escape_typical("/hello/", "/hello/");
1428    }
1429
1430    #[test]
1431    fn test_control_chars() {
1432        test_escape_typical("\n\r\t\x08\x0C", "\\n\\r\\t\\b\\f");
1433    }
1434
1435    #[test]
1436    fn test_escape_fully() {
1437        let input = "Hello, \"world\"!\nThis contains a \\ backslash and a \t tab.";
1438        let expected = r#"Hello, \"world\"!\nThis contains a \\ backslash and a \t tab."#;
1439        test_escape_typical(input, expected);
1440    }
1441
1442    #[test]
1443    fn test_other_control_chars() {
1444        let input = "Null:\0, Bell:\x07";
1445        let expected = r#"Null:\u0000, Bell:\u0007"#;
1446        test_escape_typical(input, expected);
1447
1448        test_escape_typical("\x00\x1F", "\\u0000\\u001f");
1449        test_escape_typical("\x19", "\\u0019");
1450    }
1451
1452    #[test]
1453    fn test_iterator_chunks() {
1454        let input = "prefix\npostfix";
1455        let mut iter = escape_str(input);
1456        assert_eq!(iter.next(), Some("prefix"));
1457        assert_eq!(iter.next(), Some(r#"\n"#));
1458        assert_eq!(iter.next(), Some("postfix"));
1459        assert_eq!(iter.next(), None);
1460    }
1461
1462    #[test]
1463    fn test_no_escape_needed() {
1464        let input = "A simple string with no escapes.";
1465        let mut iter = escape_str(input);
1466        assert_eq!(iter.next(), Some("A simple string with no escapes."));
1467        assert_eq!(iter.next(), None);
1468
1469        let input = "café";
1470        let mut iter = escape_str(input);
1471        assert_eq!(iter.next(), Some("café"));
1472        assert_eq!(iter.next(), None);
1473
1474        let input = "❤️";
1475        let mut iter = escape_str(input);
1476        assert_eq!(iter.next(), Some("❤️"));
1477        assert_eq!(iter.next(), None);
1478    }
1479
1480    // ===================== Unescape ===================== //
1481
1482    #[test]
1483    fn test_byte_table() {
1484        assert_eq!(byte_as_static_slice(0), &[0]);
1485        assert_eq!(byte_as_static_slice(5), &[5]);
1486        assert_eq!(byte_as_static_slice(255), &[255]);
1487    }
1488
1489    fn test_unescape_typical<I: AsRef<[u8]> + ?Sized>(input: &I, want: &str) {
1490        let got = unescape(input).decode_utf8().unwrap();
1491        assert_eq!(got, want);
1492
1493        // Test PartialEq too
1494        assert_eq!(unescape(input), want);
1495
1496        // Help display
1497        assert_display(unescape(input).display_utf8(), Ok(want));
1498
1499        // Let's test explicit regardless
1500        let got = explicit::unescape(input).decode_utf8().unwrap();
1501        assert_eq!(got, want);
1502
1503        // Test PartialEq too
1504        assert_eq!(explicit::unescape(input), want);
1505
1506        // Help display
1507        assert_display(explicit::unescape(input).display_utf8(), Ok(want));
1508    }
1509
/// `\uXXXX` escapes for single code units decode to the matching character.
#[test]
fn test_unicode_escape_basic_unescape() {
    let cases = [
        // \u4E16 => 世 (E4 B8 96)
        ("X\\u4E16Y", "X世Y"),
        // \u2603 => ☃
        ("Snow: \\u2603", "Snow: ☃"),
        // \u03A9 => Ω
        ("A \\u03A9 B", "A Ω B"),
    ];

    for (escaped, decoded) in cases {
        test_unescape_typical(escaped, decoded);
    }
}
1522
/// A high/low surrogate pair (`\uD83D\uDE00`) decodes to the single
/// character 😀 (U+1F600).
#[test]
fn test_surrogate_pair_unescape() {
    test_unescape_typical("A\\uD83D\\uDE00B", "A😀B");
}
1529
/// `\x` is not a JSON escape: the first item from both `unescape` and
/// `explicit::unescape` must be an `InvalidEscape` error reporting the
/// offending byte `x` at offset 1.
#[test]
fn test_invalid_escape_unescape() {
    let raw = b"\\x";

    let mut plain = unescape(raw);
    assert!(
        matches!(
            plain.next(),
            Some(Err(UnescapeError {
                kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'x' }),
                offset: 1,
            }))
        ),
        "expected invalid escape"
    );

    // Same expectation for the `explicit` module's iterator.
    let mut detailed = explicit::unescape(raw);
    assert!(
        matches!(
            detailed.next(),
            Some(Err(UnescapeError {
                kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'x' }),
                offset: 1,
            }))
        ),
        "expected invalid escape"
    );
}
1554
/// The two-character escapes `\n` and `\"` decode to a newline and a quote.
#[test]
fn test_simple_unescape() {
    // JSON content: Hello\nWorld\"!
    let escaped = "Hello\\nWorld\\\"!";
    let decoded = "Hello\nWorld\"!";
    test_unescape_typical(escaped, decoded);
}
1560
/// A `\u` escape with fewer than four hex digits must surface as
/// `UnexpectedEof` with offset 4, from both iterator flavours.
#[test]
fn test_truncated_unicode() {
    let input = "Trunc: \\u12"; // only two hex digits follow `\u`

    // Skip over successful chunks; the first error encountered decides the outcome.
    let first_error = unescape(input).find_map(|item| item.err());
    let found = matches!(
        first_error,
        Some(UnescapeError {
            kind: UnescapeErrorKind::UnexpectedEof,
            offset: 4,
        })
    );
    assert!(found);

    // The `explicit` iterator is expected to report the error on its very first item.
    let first_item = explicit::unescape(input).next();
    assert_eq!(
        first_item,
        Some(Err(UnescapeError {
            kind: UnescapeErrorKind::UnexpectedEof,
            offset: 4,
        }))
    );
}
1590
1591    // ===================== Chunk_Eq ===================== //
1592
/// Two chunk sequences that both yield nothing compare equal.
#[test]
fn test_empty_iterators_are_equal() {
    let left = Vec::<&[u8]>::new();
    let right = Vec::<&[u8]>::new();
    let equal = chunks_eq(left.into_iter(), right.into_iter());
    assert!(equal);
}
1599
/// An empty sequence never equals a non-empty one, whichever side is empty.
#[test]
fn test_empty_vs_non_empty() {
    // Empty on the left.
    let nothing: Vec<&[u8]> = Vec::new();
    let something = vec![&[1, 2, 3]];
    assert!(!chunks_eq(nothing.into_iter(), something.into_iter()));

    // Empty on the right.
    let something = vec![&[1, 2, 3]];
    let nothing: Vec<&[u8]> = Vec::new();
    assert!(!chunks_eq(something.into_iter(), nothing.into_iter()));
}
1611
/// One chunk on each side with identical content compares equal.
#[test]
fn test_single_identical_chunks() {
    let left = vec!["hello world"];
    let right = vec!["hello world"];
    let equal = chunks_eq(left.into_iter(), right.into_iter());
    assert!(equal);
}
1618
/// Core property of `chunks_eq`: only the concatenated content matters, not
/// where the chunk boundaries fall. Both sides spell "hello world".
#[test]
fn test_different_chunk_boundaries_str() {
    let left = vec!["he", "llo", " ", "world"];
    let right = vec!["hello ", "wo", "rld"];
    let equal = chunks_eq(left.into_iter(), right.into_iter());
    assert!(equal);
}
1627
/// Same boundary-independence check as the string variant, on byte slices:
/// both sides concatenate to 1, 2, 3, 4, 5.
#[test]
fn test_different_chunk_boundaries_bytes() {
    let left = vec![&[1, 2], &[3, 4, 5][..]];
    let right = vec![&[1, 2, 3], &[4, 5][..]];
    let equal = chunks_eq(left.into_iter(), right.into_iter());
    assert!(equal);
}
1634
/// A single long chunk equals the same text split into many small chunks.
#[test]
fn test_one_long_vs_many_short() {
    let whole = vec!["a-long-single-chunk"];
    let pieces = vec!["a", "-", "long", "-", "single", "-", "chunk"];
    let equal = chunks_eq(whole.into_iter(), pieces.into_iter());
    assert!(equal);
}
1641
/// Equal length but differing content ("hello" vs "hallo") is not equal.
#[test]
fn test_unequal_content_same_length() {
    let left = vec!["hello"];
    let right = vec!["hallo"];
    let equal = chunks_eq(left.into_iter(), right.into_iter());
    assert!(!equal);
}
1648
/// A mismatch located in the second chunk ("abc" vs "abd") is detected.
#[test]
fn test_unequal_at_chunk_boundary() {
    let left = vec!["ab", "c"];
    let right = vec!["ab", "d"];
    let equal = chunks_eq(left.into_iter(), right.into_iter());
    assert!(!equal);
}
1655
/// When one side is a strict prefix of the other ("username" vs
/// "username123") the sequences are unequal, regardless of which side is shorter.
#[test]
fn test_one_is_prefix_of_other() {
    // Shorter sequence on the left.
    let short = vec!["user", "name"];
    let long = vec!["user", "name", "123"];
    assert!(!chunks_eq(short.into_iter(), long.into_iter()));

    // Shorter sequence on the right.
    let long = vec!["user", "name", "123"];
    let short = vec!["user", "name"];
    assert!(!chunks_eq(long.into_iter(), short.into_iter()));
}
1668
/// Exercises repeated carry-over between misaligned chunks. Both sides
/// concatenate to 1..=10 but never share a chunk boundary until the end:
///   left:  [1,2,3] [4,5] [6,7,8,9] [10]
///   right: [1,2] [3,4,5,6] [7,8] [9,10]
#[test]
fn test_complex_remainer_logic() {
    let left = vec![&[1, 2, 3], &[4, 5][..], &[6, 7, 8, 9], &[10]];
    let right = vec![&[1, 2], &[3, 4, 5, 6][..], &[7, 8], &[9, 10]];
    let equal = chunks_eq(left.into_iter(), right.into_iter());
    assert!(equal);
}
1678
/// `chunks_eq` also accepts iterators over `&Vec<_>` items; the two sides
/// hold the same five values split at different points.
#[test]
fn test_with_vec_references() {
    // Left side: [1, 2] then [3, 4, 5].
    let left_head = vec![1, 2];
    let left_tail = vec![3, 4, 5];
    let left = vec![&left_head, &left_tail];

    // Right side: [1, 2, 3] then [4, 5].
    let right_head = vec![1, 2, 3];
    let right_tail = vec![4, 5];
    let right = vec![&right_head, &right_tail];

    assert!(chunks_eq(left.into_iter(), right.into_iter()));
}
1690
1691    // ===================== Unescape Read ===================== //
/// Regression guard on pointer provenance: once the only literal chunk has
/// been yielded, each iterator's remaining-bytes slice must be the empty tail
/// of the *original* input (same address), not an unrelated empty slice.
/// Per the original note, a detached `&[]` could make later pointer
/// arithmetic underflow.
#[test]
fn bytes_provenance() {
    // -- UNESCAPE (explicit) --
    // Escape-free input, chosen so the final-literal path is taken.
    let raw = b"hello";
    let mut unescaper = explicit::unescape(raw);

    // The whole input arrives as a single literal chunk.
    let first = unescaper.next().unwrap().unwrap();
    assert_eq!(first.literal, b"hello");

    // What is left must point at the end of `raw` itself.
    let raw_tail = &raw[raw.len()..];
    assert!(core::ptr::eq(unescaper.bytes, raw_tail));

    // -- ESCAPE (explicit) --
    let input = "hello";
    let mut escaper = explicit::escape_str(input);

    // Again a single literal chunk covering everything.
    let first = escaper.next().unwrap();
    assert_eq!(first.literal(), "hello");

    let text_tail = &input[input.len()..];
    assert!(core::ptr::eq(
        // SAFETY: the iterator was built from a `&str`, so its bytes are string data
        unsafe { str::from_utf8_unchecked(escaper.bytes) },
        text_tail
    ));

    // -- ESCAPE (top-level wrapper) --
    let mut wrapped = escape_str(input);

    // The wrapper yields the literal directly as a `&str`.
    let first = wrapped.next().unwrap();
    assert_eq!(first, "hello");

    let text_tail = &input[input.len()..];
    assert!(core::ptr::eq(
        // SAFETY: the iterator was built from a `&str`, so its bytes are string data
        unsafe { str::from_utf8_unchecked(wrapped.inner.bytes) },
        text_tail
    ));
}
1740
/// Reading escape-free input into a large enough buffer returns everything
/// in one call; the next call reports EOF with 0.
#[test]
fn test_read_simple() {
    let mut reader = unescape(br#"hello world"#);
    let mut scratch = [0u8; 20];

    let first = reader.read(&mut scratch).unwrap();
    assert_eq!(first, 11);
    assert_eq!(&scratch[..first], b"hello world");

    // Nothing left: the follow-up read signals EOF.
    let second = reader.read(&mut scratch).unwrap();
    assert_eq!(second, 0);
}
1756
/// `read_to_end` decodes `\t` and `\n` escapes while streaming.
#[test]
fn test_read_with_simple_escapes() {
    let mut reader = unescape(br#"hello\tworld\nline2"#);

    let mut decoded = Vec::new();
    reader.read_to_end(&mut decoded).unwrap();

    assert_eq!(decoded, b"hello\tworld\nline2");
}
1767
/// Draining the reader through a 10-byte buffer, call after call, must
/// reassemble the full (escape-free) input.
#[test]
fn test_read_into_small_buffer_multiple_calls() {
    let input = br#"this is a long string with no escapes"#;
    let mut reader = unescape(input);
    let mut window = [0u8; 10];
    let mut collected = Vec::new();

    loop {
        let n = reader
            .read(&mut window)
            .unwrap_or_else(|e| panic!("Read error: {}", e));
        if n == 0 {
            // EOF
            break;
        }
        collected.extend_from_slice(&window[..n]);
    }

    assert_eq!(collected, input);
}
1787
/// The 4-byte UTF-8 encoding of 😀 (`\uD83D\uDE00` => F0 9F 98 80) must be
/// split correctly when the 8-byte read buffer fills up mid-character.
#[test]
fn test_read_multibyte_char_across_buffer_boundary() {
    let mut reader = unescape(br#"emoji: \uD83D\uDE00 is here"#);

    // Small on purpose, so the emoji straddles two reads.
    let mut window = [0u8; 8];
    let mut collected = Vec::new();

    let expected_reads: [&[u8]; 3] = [
        // "emoji: " (7 bytes) + first byte of the emoji
        b"emoji: \xF0",
        // remaining 3 emoji bytes + " is h"
        b"\x9F\x98\x80 is h",
        // the rest
        b"ere",
    ];

    for expected in expected_reads {
        let n = reader.read(&mut window).unwrap();
        assert_eq!(n, expected.len());
        assert_eq!(&window[..n], expected);
        collected.extend_from_slice(&window[..n]);
    }

    // One more read must report EOF.
    let at_eof = reader.read(&mut window).unwrap();
    assert_eq!(at_eof, 0);

    assert_eq!(collected, b"emoji: \xF0\x9F\x98\x80 is here");
    assert_eq!(collected, "emoji: 😀 is here".as_bytes());
}
1823
/// An invalid escape (`\q`) surfaces from `read` as an `InvalidData` I/O
/// error whose message mentions "invalid escape".
#[test]
fn test_read_error_invalid_escape() {
    let mut reader = unescape(br#"hello \q world"#);
    let mut scratch = [0u8; 20];

    let outcome = reader.read(&mut scratch);
    assert!(outcome.is_err());

    let io_err = outcome.unwrap_err();
    assert_eq!(io_err.kind(), std::io::ErrorKind::InvalidData);

    let message = io_err.to_string();
    assert!(message.contains("invalid escape"));
}
1837
/// A high surrogate with no low surrogate after it makes `read` fail with
/// `InvalidData` and a message mentioning "lone surrogate".
#[test]
fn test_read_error_lone_surrogate() {
    let mut reader = unescape(br#"\uD83D rest of data seen"#);
    let mut scratch = [0u8; 10];

    let io_err = reader.read(&mut scratch).unwrap_err();
    assert_eq!(io_err.kind(), std::io::ErrorKind::InvalidData);

    let message = io_err.to_string();
    assert!(message.contains("lone surrogate"));
}
1848
/// Reading from empty input succeeds and returns 0 bytes.
#[test]
fn test_read_empty_input() {
    let mut reader = unescape(b"");
    let mut scratch = [0u8; 10];
    assert_eq!(reader.read(&mut scratch).unwrap(), 0);
}
1857
/// A zero-length destination buffer yields a successful read of 0 bytes,
/// even though input is available.
#[test]
fn test_read_into_empty_buffer() {
    let mut reader = unescape(b"hello");
    let mut no_room = [0u8; 0];
    assert_eq!(reader.read(&mut no_room).unwrap(), 0);
}
1867
/// `read_to_end` returns the decoded byte count and fills the vector with
/// the decoded content (simple escapes plus a surrogate pair).
#[test]
fn test_read_to_end_optimized() {
    let expected = b"first\nsecond\tthird \xF0\x9F\x98\x80 last";

    let mut reader = unescape(br#"first\nsecond\tthird \uD83D\uDE00 last"#);
    let mut decoded = Vec::new();
    let count = reader.read_to_end(&mut decoded).unwrap();

    assert_eq!(count, expected.len());
    assert_eq!(decoded, expected);
}
1880
1881    // ===================== Unescape Display ===================== //
1882
/// Formats `display` into a fresh `String` and checks the outcome.
///
/// * `want = Ok(text)`: formatting must succeed and produce exactly `text`.
/// * `want = Err(())`: formatting must return an error; whatever was written
///   before the error is not inspected.
fn assert_display(display: impl Display, want: Result<&str, ()>) {
    let mut rendered = String::new();
    let outcome = fmt::write(&mut rendered, format_args!("{display}"));

    if let Ok(expected) = want {
        assert!(outcome.is_ok());
        assert_eq!(rendered, expected);
    } else {
        assert!(
            outcome.is_err(),
            "strict mode should return Err on invalid bytes"
        );
    }
}
1898
1899    // -- NON-LOSSY TESTS (must be perfect) --
1900
/// Escape-free text is displayed unchanged.
#[test]
fn test_display_simple_string() {
    assert_display(unescape("hello world").display_utf8(), Ok("hello world"));
}
1906
/// Empty input displays as the empty string.
#[test]
fn test_display_empty_string() {
    let display = unescape("").display_utf8();
    assert_display(display, Ok(""));
}
1911
/// Each two-character JSON escape is displayed as the character it stands for.
#[test]
fn test_display_standard_escapes() {
    let escaped = br#"\" \\ \/ \b \f \n \r \t"#;
    let decoded = "\" \\ / \x08 \x0C \n \r \t";
    let display = unescape(escaped).display_utf8();
    assert_display(display, Ok(decoded));
}
1918
/// Multi-byte UTF-8 that is not escaped passes through the display path intact.
#[test]
fn test_display_non_escaped_utf8() {
    let text = "你好, world";
    let display = unescape(text.as_bytes()).display_utf8();
    assert_display(display, Ok("你好, world"));
}
1925
/// A `\u` escape for a 2-byte character (cent sign, `\u00A2` => C2 A2) is
/// displayed as that character.
#[test]
fn test_display_unicode_escape_bmp() {
    let escaped = br"a\u00A2b";
    let display = unescape(escaped).display_utf8();
    assert_display(display, Ok("a¢b"));
}
1933
/// Literal text, a simple escape, a single `\u` escape and a surrogate pair
/// all display correctly within one input.
#[test]
fn test_display_mixed_content() {
    let escaped = br#"Text with \n, \u00A2, and \uD83D\uDE0E emojis."#;
    let decoded = "Text with \n, ¢, and 😎 emojis.";
    let display = unescape(escaped).display_utf8();
    assert_display(display, Ok(decoded));
}
1940
/// Escapes at the very start and very end of the input are handled.
#[test]
fn test_display_starts_and_ends_with_escape() {
    let escaped = br#"\u00A2hello\t"#;
    let display = unescape(escaped).display_utf8();
    assert_display(display, Ok("¢hello\t"));
}
1947
1948    // -- NON-LOSSY ERROR TESTS --
1949
/// Strict display fails on an unknown escape (`\z`).
#[test]
fn test_display_err_invalid_escape() {
    let display = unescape(br"hello \z world").display_utf8();
    assert_display(display, Err(()));
}
1954
/// Strict display fails when `\u` is followed by only three hex digits.
#[test]
fn test_display_err_incomplete_unicode() {
    let display = unescape(br"\u123").display_utf8();
    assert_display(display, Err(()));
}
1959
/// Strict display fails when a `\u` escape contains a non-hex digit (`g`).
#[test]
fn test_display_err_invalid_hex_in_unicode() {
    let display = unescape(br"\u123g").display_utf8();
    assert_display(display, Err(()));
}
1964
/// Strict display fails on a high surrogate with nothing after it.
#[test]
fn test_display_err_lone_high_surrogate() {
    let display = unescape(br"\uD800").display_utf8();
    assert_display(display, Err(()));
}
1969
/// Strict display fails when a high surrogate is followed by a `\u` escape
/// that is not a low surrogate.
#[test]
fn test_display_err_high_surrogate_not_followed_by_low() {
    let display = unescape(br"\uD800\uABCD").display_utf8();
    assert_display(display, Err(()));
}
1974
/// Strict display fails when the raw input itself is not valid UTF-8:
/// an ASCII `h` followed by the stray byte 0x80.
#[test]
fn test_display_err_invalid_source_utf8() {
    let display = unescape(b"h\x80ello").display_utf8();
    assert_display(display, Err(()));
}
1980
/// The raw 3-byte encoding of "€" (U+20AC => E2 82 AC) is accepted by
/// strict display.
#[test]
fn strict_valid_multi_byte_split() {
    let euro_bytes = &[0xE2, 0x82, 0xAC];
    assert_display(unescape(euro_bytes).display_utf8(), Ok("€"));
}
1988
/// Strict display rejects 0xFF in the input.
#[test]
fn strict_errors_on_invalid_start_byte() {
    let bad_then_ascii = &[0xFF, b'a'];
    assert_display(unescape(bad_then_ascii).display_utf8(), Err(()));
}
1996
1997    // -- LOSSY TESTS --
1998
/// Lossy display substitutes U+FFFD for the invalid leading byte 0xFF and
/// keeps the ASCII `a` that follows it.
#[test]
fn lossy_replaces_invalid_start_byte() {
    let bad_then_ascii = &[0xFF, b'a'];
    assert_display(
        unescape(bad_then_ascii).display_utf8_lossy(),
        Ok("\u{FFFD}a"),
    );
}
2007
/// A 3-byte sequence cut off after two bytes (E2 82, missing AC) at the end
/// of the input is displayed as a single U+FFFD in lossy mode.
#[test]
fn lossy_handles_trailing_incomplete_bytes() {
    let truncated: &[u8] = &[0xE2, 0x82];
    assert_display(unescape(truncated).display_utf8_lossy(), Ok("\u{FFFD}"));
}
2016
/// An invalid byte run in the middle of otherwise valid text is replaced by
/// one U+FFFD in lossy mode; the text on both sides is preserved.
#[test]
fn test_display_lossy_invalid_source_utf8() {
    let raw = b"valid\xF0\x90\x80invalid";
    let display = unescape(raw).display_utf8_lossy();
    assert_display(display, Ok("valid\u{FFFD}invalid"));
}
2024
/// Expected lossy-mode output for an input ending in the invalid escape `\z`:
/// formatting succeeds and the output is only "this is ok \n".
#[test]
fn test_display_lossy_invalid_escape_truncates() {
    let raw = br"this is ok \n but this is not \z";
    let display = unescape(raw).display_utf8_lossy();
    assert_display(display, Ok("this is ok \n"));
}
2032
/// Expected lossy-mode output for an input ending in a lone high surrogate:
/// formatting succeeds and the output is only "truncate after \n".
#[test]
fn test_display_lossy_incomplete_unicode_truncates() {
    let raw = br"truncate after \n \uD83D";
    let display = unescape(raw).display_utf8_lossy();
    assert_display(display, Ok("truncate after \n"));
}
2039
// Idea and shape borrowed from the memchr crate.
/// Compile-time guard: `Unescape` and `Escape` must stay `Send`, `Sync`,
/// `UnwindSafe` and `RefUnwindSafe`. The test body does no runtime work.
#[test]
fn sync_regression() {
    use core::panic::{RefUnwindSafe, UnwindSafe};

    // Instantiating this with a type only compiles if the type meets every bound.
    fn require_auto_traits<T>()
    where
        T: Send + Sync + UnwindSafe + RefUnwindSafe,
    {
    }

    require_auto_traits::<Unescape<'_>>();
    require_auto_traits::<Escape<'_>>();
}
2049}
json_escape/lib.rs

json_escape/
lib.rs