json_escape/lib.rs
1//! # Streaming JSON String Escape/Unescape
2//!
3//! Welcome to a highly efficient, `no_std` compatible library for handling JSON string escaping and unescaping. This crate provides iterator-based tools that process strings on the fly, avoiding heap allocations for the entire result. It's designed for performance-critical applications, such as parsing large JSON files or working in memory-constrained environments. ⚡
4//!
5//! The core of the library is two iterator structs:
6//! - **[`Escape`]**: Takes a string slice (`&str`) and yields escaped string slices ready for JSON serialization.
7//! - **[`Unescape`]**: Takes a byte slice (`&[u8]`) representing the content of a JSON string and yields the decoded byte slices.
8//!
9//! ## Key Features
10//! - **Zero-Copy Slicing**: For sequences of characters that don't need modification, the iterators yield slices that borrow directly from the input, avoiding unnecessary data copying.
11//! - **Comprehensive JSON Support**: Correctly handles all standard JSON escapes: `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`.
12//! - **Full Unicode Handling**: Correctly decodes `\uXXXX` sequences, including full support for UTF-16 surrogate pairs (e.g., `\uD83D\uDE00` for `😀`).
13//! - **Robust Error Handling**: The `Unescape` iterator returns descriptive errors (`UnescapeError`) for invalid or truncated escape sequences, making debugging straightforward.
14//! - **Allocation Control** (with `alloc` feature): Provides convenient methods to collect the iterator's output into owned types like `String` or `Cow<str>`.
15//! - **`std::io` Integration** (with `std` feature): The `Unescape` iterator implements `std::io::Read`, allowing it to be used as an efficient reader for I/O streams.
16//!
17//! ## Quick Start: Escaping a String
18//!
19//! ```
20//! use json_escape::escape_str;
21//!
22//! let input = "Hello, \"world\"!\nThis contains a \\ backslash.";
23//! let expected = r#"Hello, \"world\"!\nThis contains a \\ backslash."#;
24//!
25//! // The `escape_str` function returns an iterator.
26//! let mut escaper = escape_str(input);
27//!
28//! // You can iterate over the chunks:
29//! assert_eq!(escaper.next(), Some("Hello, "));
30//! assert_eq!(escaper.next(), Some(r#"\""#));
31//! assert_eq!(escaper.next(), Some("world"));
32//! // ...and so on.
33//!
34//! // Or, collect it into a String (requires the "alloc" feature).
35//! // let escaped_string: String = escape_str(input).collect();
36//! // assert_eq!(escaped_string, expected);
37//! ```
38//!
39//! ## Quick Start: Unescaping a String
40//!
41//! ```
42//! use json_escape::unescape;
43//!
44//! let input = r#"A 😀 emoji: \uD83D\uDE00 and a tab\t!"#;
45//!
46//! // The unescape iterator yields `Result<&[u8], _>`.
47//! let unescaper = unescape(input);
48//!
49//! // With the "alloc" feature, you can decode it directly into a string.
50//! let decoded_cow = unescaper.decode_utf8().unwrap();
51//! assert_eq!(decoded_cow, "A 😀 emoji: 😀 and a tab\t!");
52//! ```
53#![no_std]
54#![deny(missing_docs)]
55#![cfg_attr(all(feature = "simd", nightly), feature(portable_simd))]
56
57#[cfg(any(test, feature = "std"))]
58extern crate std;
59
60#[cfg(feature = "alloc")]
61extern crate alloc;
62
63#[cfg(any(test, feature = "alloc"))]
64use alloc::{borrow::Cow, string::String, vec::Vec};
65
66use core::{
67 char,
68 fmt::{self, Write as _},
69 iter::FusedIterator,
70 slice, str,
71};
72use memchr::memchr;
73
74// =============================================================================
75// Escape Implementation
76// =============================================================================
77
78/// Creates a streaming JSON string escaper from a string slice.
79///
80/// The returned [`Escape`] iterator lazily processes the input string, yielding
81/// slices that represent the escaped output.
82///
83/// # Examples
84///
85/// ```
86/// use json_escape::escape_str;
87///
88/// let escaper = escape_str("a\nb");
89/// let escaped_parts: Vec<_> = escaper.collect();
90///
91/// assert_eq!(escaped_parts, vec!["a", r#"\n"#, "b"]);
92/// ```
93#[inline]
94pub fn escape_str(input: &str) -> Escape<'_> {
95 Escape {
96 bytes: input.as_bytes(),
97 }
98}
99
/// Streaming JSON string escaper; an iterator over `&'a str` pieces.
///
/// Obtained from [`escape_str`]. The input is cut at every character that JSON
/// requires to be escaped, and the pieces are produced one at a time:
///
/// - a run of characters that need no escaping is returned as one slice
///   borrowed from the input (`&'a str`);
/// - each character that must be escaped is returned as a `'static` slice
///   holding its escaped form (e.g., `r#"\n"#`).
///
/// Because only slices are handed out, no string is allocated for the output
/// as a whole; the input is processed in a streaming fashion.
///
/// ### Implemented Traits
/// - **`Iterator<Item = &'a str>`**: consume the escaped pieces in a loop or through adapters.
/// - **`Display`**: write the escaped text straight into any formatter (`println!`, a file, ...) with no intermediate allocation.
/// - **`Clone`**, **`Debug`**: standard utility traits.
/// - **`PartialEq`**, **`PartialEq<B: AsRef<[u8]>>`**: compare by escaped output. An `Escape` equals another `Escape` or a byte slice when both produce the same sequence of escaped bytes.
/// - **`From<Escape<'a>> for Cow<'a, str>`** (requires `alloc` feature): convert the iterator into a string that is only owned when it has to be.
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[derive(Clone)]
pub struct Escape<'a> {
    /// The part of the input that has not been yielded yet.
    bytes: &'a [u8],
}
124
125impl<'a> Iterator for Escape<'a> {
126 type Item = &'a str;
127
128 #[inline]
129 fn next(&mut self) -> Option<&'a str> {
130 if self.bytes.is_empty() {
131 return None;
132 }
133
134 // Find the first byte that needs escaping.
135 let pos = find_escape_char(self.bytes);
136
137 match pos {
138 // No escapable characters left; return the rest of the slice.
139 None => {
140 let s = self.bytes;
141 self.bytes = &[];
142 // SAFETY: The input was a valid &str, and we're returning the
143 // whole remaining chunk, so it's still valid UTF-8.
144 Some(unsafe { str::from_utf8_unchecked(s) })
145 }
146 // An escapable byte is at the beginning of the slice.
147 Some(0) => {
148 let byte = self.bytes[0];
149 self.bytes = &self.bytes[1..];
150 // The table lookup gives us a &'static str, which is a valid &'a str.
151 //
152 // Some(....unwrap()) is more correct
153 ESCAPE_TABLE[byte as usize]
154 }
155 // Found an escapable byte after a safe prefix. Return the prefix.
156 Some(p) => {
157 let (prefix, rest) = self.bytes.split_at(p);
158 self.bytes = rest;
159 // SAFETY: The soundness of this operation is critical.
160 // We are splitting the byte slice at the position of the first
161 // character that requires escaping. All JSON characters that
162 // require escaping (`"`, `\`, and control characters `\u0000`-`\u001F`)
163 // are single-byte ASCII characters. Therefore, `p` is guaranteed
164 // to be on a valid UTF-8 character boundary.
165 Some(unsafe { str::from_utf8_unchecked(prefix) })
166 }
167 }
168 }
169
170 fn size_hint(&self) -> (usize, Option<usize>) {
171 if self.bytes.is_empty() {
172 (0, Some(0))
173 } else {
174 // We'll yield at least 1 slice, and at most `len` slices if every byte is escaped.
175 (1, Some(self.bytes.len()))
176 }
177 }
178}
179
180impl<'a> FusedIterator for Escape<'a> {}
181
182impl fmt::Display for Escape<'_> {
183 /// Allows direct formatting of the escaped string without intermediate allocation.
184 ///
185 /// This is very useful for writing the escaped output directly to a stream,
186 /// such as a file or a network socket.
187 ///
188 /// # Example
189 ///
190 /// ```
191 /// use json_escape::escape_str;
192 ///
193 /// let escaper = escape_str("User said: \"Hi!\"\n");
194 /// let formatted = format!("{}", escaper);
195 ///
196 /// assert_eq!(formatted, r#"User said: \"Hi!\"\n"#);
197 /// ```
198 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
199 // The `clone()` is cheap as it only copies a slice reference.
200 for s in self.clone() {
201 f.write_str(s)?
202 }
203 Ok(())
204 }
205}
206
207impl fmt::Debug for Escape<'_> {
208 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
209 f.debug_struct("Escape").finish_non_exhaustive()
210 }
211}
212
213impl<B: AsRef<[u8]> + ?Sized> PartialEq<B> for Escape<'_> {
214 /// Compares the escaped output with any byte-slice-like object.
215 ///
216 /// This is primarily a convenience for testing, allowing you to check the
217 /// fully concatenated result of an `Escape` iterator against a known `&str` or `&[u8]`.
218 ///
219 /// The notion of equality is based on the **output**, not the iterator's internal state.
220 ///
221 /// # Example
222 ///
223 /// ```
224 /// use json_escape::escape_str;
225 ///
226 /// let escaper = escape_str("key\tvalue");
227 ///
228 /// // The escaper's output, when concatenated, equals the right-hand side.
229 /// assert_eq!(escaper, r#"key\tvalue"#);
230 /// ```
231 fn eq(&self, other: &B) -> bool {
232 let mut other = other.as_ref();
233 for chunk in self.clone() {
234 if !other.starts_with(chunk.as_bytes()) {
235 return false;
236 }
237 other = &other[chunk.len()..];
238 }
239 // We completely searched it
240 other.is_empty()
241 }
242}
243
244impl<'a, 'b> PartialEq<Escape<'a>> for Escape<'b> {
245 /// Compares two `Escape` iterators for equality.
246 ///
247 /// Two `Escape` iterators are considered equal if they'll produce the same **output**.
248 /// It first performs a fast check on the underlying byte slices.
249 fn eq(&self, other: &Escape<'a>) -> bool {
250 // Fast path: if they are views into the same underlying data.
251 self.bytes == other.bytes || chunks_eq(self.clone(), other.clone())
252 }
253}
254
#[cfg(feature = "alloc")]
impl<'a> From<Escape<'a>> for Cow<'a, str> {
    /// Gathers the escaped pieces into a `Cow<'a, str>`, allocating only when
    /// more than one piece is produced.
    ///
    /// - If the input string requires **no escaping**, the result is
    ///   `Cow::Borrowed` with a slice of the original string.
    /// - If escaping is needed, a `String` is allocated and `Cow::Owned` is returned.
    ///
    /// This beats `iter.collect::<String>()`, which always allocates.
    ///
    /// **Requires the `alloc` feature.**
    ///
    /// # Example
    ///
    /// ```
    /// # #[cfg(feature = "alloc")] {
    /// use json_escape::escape_str;
    /// use std::borrow::Cow;
    ///
    /// // No escaping needed, so no allocation occurs.
    /// let cow_borrowed: Cow<str> = escape_str("plain text").into();
    /// assert!(matches!(cow_borrowed, Cow::Borrowed(_)));
    ///
    /// // Escaping is required, so a new String is allocated.
    /// let cow_owned: Cow<str> = escape_str("text with\nnewline").into();
    /// assert!(matches!(cow_owned, Cow::Owned(_)));
    /// assert_eq!(cow_owned, r#"text with\nnewline"#);
    /// # }
    /// ```
    fn from(mut iter: Escape<'a>) -> Self {
        // Zero pieces: the output is the empty string.
        let Some(head) = iter.next() else {
            return Cow::Borrowed("");
        };
        // Exactly one piece: it can be handed out as-is, no allocation.
        let Some(second) = iter.next() else {
            return Cow::Borrowed(head);
        };

        // Two or more pieces: concatenate. The capacity is the two pieces in
        // hand plus the number of input bytes still to be processed.
        let capacity = head.len() + second.len() + iter.bytes.len();
        let mut owned = String::with_capacity(capacity);
        owned.push_str(head);
        owned.push_str(second);
        for piece in iter {
            owned.push_str(piece);
        }
        Cow::Owned(owned)
    }
}
303
304// =============================================================================
305// Unescape Implementation
306// =============================================================================
307
308/// Creates a streaming JSON string unescaper from a byte slice.
309///
310/// This function creates an iterator to unescape a byte slice representing the
311/// **raw contents** of a JSON string, assuming the outer quotes have already
312/// been removed.
313///
314/// For a more convenient way to handle complete JSON string literals (including
315/// their surrounding `"` quotes), see the [`unescape_quoted`] function, which
316/// automatically trims them.
317///
318/// The iterator will fail if the input contains invalid JSON escape sequences.
319///
320/// # Example
321///
322/// ```
323/// use json_escape::{unescape, unescape_quoted};
324///
325/// // `unescape` works on the raw content, without quotes.
326/// let content = r#"hello\tworld"#;
327/// assert_eq!(unescape(content), "hello\tworld");
328///
329/// // If you pass a full JSON literal, the quotes are treated as literal characters.
330/// let literal = r#""hello\tworld""#;
331/// assert_eq!(unescape(literal), "\"hello\tworld\""); // Note the quotes in the output.
332///
333/// // For full literals like this, `unescape_quoted` is the recommended function.
334/// assert_eq!(unescape_quoted(literal), "hello\tworld");
335/// ```
336#[inline]
337pub fn unescape<I: AsRef<[u8]> + ?Sized>(input: &I) -> Unescape<'_> {
338 Unescape::new(input.as_ref())
339}
340
341/// Creates a streaming JSON string unescaper, trimming enclosing quotes.
342///
343/// This function acts as a convenience wrapper around [`unescape`]. It first
344/// inspects the input byte slice. If the slice begins and ends with a double-quote
345/// character (`"`), these quotes are trimmed before the inner content is passed to
346/// the unescaper.
347///
348/// If the input is not enclosed in quotes, this function behaves exactly like
349/// [`unescape`]. This is useful for directly unescaping a complete JSON string
350/// literal.
351///
352/// # Example
353///
354/// ```
355/// use json_escape::{unescape, unescape_quoted};
356///
357/// // 1. With quotes: The outer quotes are trimmed before unescaping.
358/// let unescaper = unescape_quoted(r#""hello\nworld""#);
359/// assert_eq!(unescaper, b"hello\nworld");
360///
361/// // 2. Without quotes: Behaves exactly like the standard `unescape`.
362/// let unescaper_no_quotes = unescape_quoted(r#"raw string"#);
363/// assert_eq!(unescaper_no_quotes, b"raw string");
364///
365/// // 3. Mismatched quotes: The input is passed through as-is, quotes are not trimmed.
366/// let mismatched_quotes = unescape_quoted(r#"hello""#);
367/// assert_eq!(mismatched_quotes, b"hello\"");
368///
369/// // 4. Empty quoted string: Correctly results in an empty output.
370/// let empty_quoted = unescape_quoted(r#""""#);
371/// assert_eq!(empty_quoted, b"");
372/// ```
373#[inline]
374pub fn unescape_quoted<I: AsRef<[u8]> + ?Sized>(input: &I) -> Unescape<'_> {
375 let bytes = input.as_ref();
376 let input = if bytes.len() >= 2 && bytes[0] == b'\"' && bytes[bytes.len() - 1] == b'\"' {
377 &bytes[1..bytes.len() - 1]
378 } else {
379 bytes
380 };
381
382 unescape(input)
383}
384
/// Streaming JSON string unescaper.
///
/// Obtained from the [`unescape`] function. It is an [`Iterator`] over
/// `Result<&'a [u8], UnescapeError>` that decodes the input lazily.
///
/// Each item is one of:
/// - **`Ok(&'a [u8])`**: a slice borrowed from the original input, covering a run of bytes that contained no escape.
/// - **`Ok(&'static [u8])`**: a single-byte slice for a decoded escape sequence (e.g., `\n` becomes a slice containing `0x0A`). A `\uXXXX` sequence is delivered as a series of single-byte slices that together form the UTF-8 encoding of the character.
/// - **`Err(UnescapeError)`**: reported when an escape sequence is invalid or truncated.
///
/// The output is bytes, not text; use [`Unescape::decode_utf8`] or [`Unescape::decode_utf8_lossy`] to turn the final result into a string.
///
/// ### Implemented Traits
/// - **`Iterator<Item = Result<&'a [u8], UnescapeError>>`**: the core trait for processing the unescaped byte chunks.
/// - **`std::io::Read`** (requires `std` feature): use the unescaper as a standard reader to plug into other I/O APIs.
/// - **`Clone`**, **`Debug`**: standard utility traits.
/// - **`PartialEq<B: AsRef<[u8]>>`**: compares the fully unescaped output with a byte slice.
/// - **`TryFrom<Unescape<'a>> for Cow<'a, [u8]>`** (requires `alloc` feature): collects the unescaped bytes, propagating any errors.
///
/// ### Reading Unescaped Bytes
///
/// With the `std` feature, `Unescape` can be used like any other
/// `std::io::Read` source, so large JSON string contents can be decoded as a
/// stream instead of buffering the whole result in memory first.
///
/// ```
/// # #[cfg(feature = "std")] {
/// use json_escape::unescape;
/// use std::io::Read;
///
/// let mut reader = unescape(r#"chunk1\nchunk2"#);
/// let mut buf = Vec::new();
///
/// // Read all unescaped bytes from the iterator into the buffer.
/// reader.read_to_end(&mut buf).unwrap();
///
/// assert_eq!(buf, b"chunk1\nchunk2");
/// # }
/// ```
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[derive(Clone)]
pub struct Unescape<'a> {
    /// Input bytes not consumed yet. A `slice::Iter` is used because it can be
    /// cloned to look ahead without extra bookkeeping.
    bytes: slice::Iter<'a, u8>,

    /// Scratch buffer with the UTF-8 encoding of the most recently decoded
    /// `\uXXXX` escape (or surrogate pair).
    unicode: [u8; 4],
    /// Number of valid bytes in `unicode`; `0` means nothing is stored.
    /// (Original note: this field could be eliminated "by depending on the
    /// header" — presumably the UTF-8 lead byte; not confirmed.)
    unicode_len: u8,
    /// Number of bytes of `unicode` that have already been emitted.
    unicode_pos: u8,
}
437
438impl<'a> Unescape<'a> {
439 /// Construct from a byte slice which contains the characters inside the JSON string (no quotes).
440 fn new(input: &'a [u8]) -> Self {
441 Self {
442 bytes: input.iter(),
443 unicode: [0; 4],
444 unicode_len: 0,
445 unicode_pos: 0,
446 }
447 }
448
449 /// Helper: parse exactly 4 hex digits from `it`. Returns Ok(u16) or an error.
450 #[inline(always)]
451 fn parse_hex4(iter: &mut slice::Iter<'a, u8>, base_offset: u8) -> Result<u16, UnescapeError> {
452 let mut acc = 0u16;
453 for i in 0..4 {
454 let b = match iter.next() {
455 Some(b) => *b,
456 None => {
457 return Err(UnescapeError {
458 kind: UnescapeErrorKind::UnexpectedEof,
459 // The error occurs where the next digit was expected.
460 offset: base_offset + i,
461 });
462 }
463 };
464 let v = match b {
465 b'0'..=b'9' => (b - b'0') as u16,
466 b'a'..=b'f' => (b - b'a' + 10) as u16,
467 b'A'..=b'F' => (b - b'A' + 10) as u16,
468 _ => {
469 return Err(UnescapeError {
470 kind: UnescapeErrorKind::InvalidHex(InvalidHexError { found: b }),
471 // The error is the invalid digit itself.
472 offset: base_offset + i,
473 });
474 }
475 };
476 acc = (acc << 4) | v;
477 }
478 Ok(acc)
479 }
480
481 #[inline(always)]
482 fn handle_unicode_escape(bytes: &mut slice::Iter<'a, u8>) -> Result<char, UnescapeError> {
483 // We need to parse 4 hex digits from the iterator. But because
484 // `bytes` implements `Clone`, we can clone it to peek ahead
485 // in order to support surrogate pair detection without losing
486 // the original iterator state on failure.
487 let mut cloned_iter = bytes.clone();
488 // parse first 4 hex from cloned_iter to leave original untouched until we commit
489 //
490 // The iterator starts *after* '\u'. The first hex digit is at offset 2 from '\'.
491 let first = Self::parse_hex4(&mut cloned_iter, 2)?;
492
493 // If it's a high surrogate, check for a following `\uXXXX` low surrogate
494 if (0xD800..=0xDBFF).contains(&first) {
495 // cloned_iter currently points after the 4 hex digits; check next two chars
496 if cloned_iter.next() == Some(&b'\\') && cloned_iter.next() == Some(&b'u') {
497 // try parse low
498 //
499 // The first hex digit of the second escape is at offset 8.
500 // (\uXXXX\u -> 8 chars)
501 if let Ok(low) = Self::parse_hex4(&mut cloned_iter, 8) {
502 if (0xDC00..=0xDFFF).contains(&low) {
503 // success: we must advance the real iterator
504 *bytes = cloned_iter;
505 let high_t = first as u32;
506 let low_t = low as u32;
507 let code = 0x10000 + (((high_t - 0xD800) << 10) | (low_t - 0xDC00));
508 return Ok(char::from_u32(code).expect(
509 "valid surrogate pair math should always produce a valid char",
510 ));
511 }
512 }
513 // If parse_hex4 failed, the error would have been returned.
514 // If it succeeded but the value wasn't a low surrogate, we fallthrough.
515 }
516 // If we reach here, no valid surrogate pair followed. That's a lone high surrogate.
517 return Err(UnescapeError {
518 kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: first }),
519 // The error is detected after consuming `\uXXXX` (6 bytes).
520 offset: 6,
521 });
522 }
523
524 // Not a surrogate, or a valid low surrogate on its own (which is an error).
525 // If `first` is a low surrogate, `from_u32` will return None.
526 match char::from_u32(first as u32) {
527 Some(c) => {
528 // Success. Advance the main iterator.
529 *bytes = cloned_iter;
530 Ok(c)
531 }
532
533 None => Err(UnescapeError {
534 kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: first }),
535 // The error is detected after consuming `\uXXXX` (6 bytes).
536 offset: 6,
537 }),
538 }
539 }
540
541 #[inline]
542 fn store_unicode(&mut self, ch: char) {
543 self.unicode_len = ch.encode_utf8(&mut self.unicode).len() as u8;
544 self.unicode_pos = 0;
545 }
546
547 #[inline]
548 fn emit_pending_byte(&mut self) -> Option<u8> {
549 if self.unicode_pos < self.unicode_len {
550 let b = self.unicode[self.unicode_pos as usize];
551 self.unicode_pos += 1;
552 Some(b)
553 } else {
554 None
555 }
556 }
557
558 /// Helper to emit the full unicode sequence and advance the internal position.
559 #[inline]
560 fn emit_unicode_as_str(&mut self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
561 // The check `unicode_pos > 0` is implicit from the call site.
562 // The buffer is guaranteed to contain a valid UTF-8 sequence.
563 let s = unsafe { str::from_utf8_unchecked(&self.unicode[..self.unicode_len as usize]) };
564 f.write_str(s)?;
565
566 // Mark the entire sequence as emitted.
567 self.unicode_pos = self.unicode_len;
568
569 Ok(())
570 }
571
572 /// The single, authoritative helper for producing unescaped byte chunks.
573 ///
574 /// It takes an optional `max` length to limit the size of the returned slice,
575 /// which is essential for the `std::io::Read` implementation.
576 #[inline(always)]
577 fn next_limit(&mut self, limit: Option<usize>) -> Option<Result<&'a [u8], UnescapeError>> {
578 if limit.is_some_and(|l| l == 0) {
579 return Some(Ok(&[]));
580 }
581
582 // If we have pending bytes, emit them first (fast).
583 //
584 // LIMIT: We're allowed not checking here since we'll only produce 1 byte
585 // and limit is at least 1.
586 if let Some(s) = self.emit_pending_byte() {
587 // s: &'static [u8] coerces to &'a [u8]
588 return Some(Ok(byte_as_static_slice(s)));
589 }
590
591 let bytes = self.bytes.as_slice();
592 if bytes.is_empty() {
593 return None;
594 }
595
596 // Find next backslash in the remaining bytes.
597 let pos = memchr(b'\\', bytes);
598
599 match pos {
600 None => {
601 // No more escapes. Return the rest of the slice as a borrowed chunk.
602 let chunk_len = bytes.len().min(limit.unwrap_or(bytes.len()));
603 let (chunk, rest) = bytes.split_at(chunk_len);
604 self.bytes = rest.iter();
605 Some(Ok(chunk))
606 }
607 // LIMIT: We're allowed not checking here since we'll only produce 1 byte
608 // and limit is at least 1.
609 Some(0) => {
610 // Backslash is the first byte in the slice: handle escape
611 self.bytes.next(); // Consume the backslash
612
613 // Next byte dictates the escape form
614 match self.bytes.next() {
615 Some(b'"') => Some(Ok(b"\"")),
616 Some(b'\\') => Some(Ok(b"\\")),
617 Some(b'/') => Some(Ok(b"/")),
618 Some(b'b') => Some(Ok(b"\x08")),
619 Some(b'f') => Some(Ok(b"\x0C")),
620 Some(b'n') => Some(Ok(b"\n")),
621 Some(b'r') => Some(Ok(b"\r")),
622 Some(b't') => Some(Ok(b"\t")),
623 Some(b'u') => match Self::handle_unicode_escape(&mut self.bytes) {
624 Ok(ch) => {
625 self.store_unicode(ch);
626 self.emit_pending_byte()
627 .map(|b| Ok(byte_as_static_slice(b)))
628 }
629 Err(err) => Some(Err(err)),
630 },
631 Some(other) => Some(Err(UnescapeError {
632 kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError {
633 found: *other,
634 }),
635 // The invalid character is 1 byte after '\'.
636 offset: 1,
637 })),
638 None => Some(Err(UnescapeError {
639 kind: UnescapeErrorKind::UnexpectedEof,
640 // EOF occurred 1 byte after '\'.
641 offset: 1,
642 })),
643 }
644 }
645 // Found \ after a safe prefix. Return the prefix. We'll handle on next call to next
646 Some(p) => {
647 // Return the safe prefix (borrowed from input)
648 let chunk_len = p.min(limit.unwrap_or(p));
649 let (chunk, rest) = bytes.split_at(chunk_len);
650 self.bytes = rest.iter();
651 Some(Ok(chunk))
652 }
653 }
654 }
655
656 fn _display_utf8(mut self, f: &mut fmt::Formatter<'_>, lossy: bool) -> fmt::Result {
657 // The key insight: Chunks with more than one byte are *always*
658 // borrowed from the original input, as all escaped characters
659 // are yielded byte-by-byte.
660 while let Some(result) = self.next() {
661 match result {
662 Ok(chunk) => {
663 if chunk.is_empty() {
664 continue;
665 }
666
667 // THE CORE LOGIC:
668 // Check if the iterator just yielded the *first byte* of a *multi-byte* sequence.
669 // - `unicode_pos == 1` means the first byte was just emitted.
670 // - `unicode_len > 1` means it's a multi-byte char (e.g., '¢', '😎').
671 if self.unicode_pos == 1 && self.unicode_len > 1 {
672 // This is our special case. We have the first byte in `chunk`, but
673 // it's more efficient to write the whole character at once from our buffer.
674 self.emit_unicode_as_str(f)?;
675 // The iterator will no longer yield the rest of the bytes. Since our helper
676 // has now advanced it. But to be sure...
677 self.unicode_pos = self.unicode_len;
678 } else {
679 // This is the normal case:
680 // 1. A large chunk borrowed from the original input.
681 // 2. A single-byte escape like `\n` or `\t`.
682 // 3. The last byte of a multi-byte sequence (or the only byte).
683 // In all these cases, we just need to display the chunk we received.
684 display_bytes_uft8(chunk, f, lossy)?;
685 }
686 }
687 Err(_) => {
688 if lossy {
689 break;
690 } else {
691 return Err(fmt::Error);
692 }
693 }
694 }
695 }
696
697 Ok(())
698 }
699
/// Collects the unescaped bytes and interprets them as a UTF-8 string.
///
/// The iterator is consumed. An unescaping error is returned as
/// `DecodeUtf8Error::Unescape`; if the collected bytes are not valid UTF-8,
/// `DecodeUtf8Error::Utf8` is returned.
///
/// Like `From<Escape>`, this is optimized to return a `Cow::Borrowed` if no
/// escapes were present in the input, avoiding allocation.
///
/// **Requires the `alloc` feature.**
///
/// # Example
///
/// ```
/// # #[cfg(feature = "alloc")] {
/// use json_escape::unescape;
///
/// let input = r#"Emoji: \uD83D\uDE00"#;
/// let cow = unescape(input).decode_utf8().unwrap();
///
/// assert_eq!(cow, "Emoji: 😀");
/// # }
/// ```
#[cfg(feature = "alloc")]
pub fn decode_utf8(self) -> Result<Cow<'a, str>, DecodeUtf8Error> {
    let bytes: Cow<'a, [u8]> = self.try_into().map_err(DecodeUtf8Error::Unescape)?;

    // Validate in place so a borrowed result stays borrowed and an owned
    // buffer is reused rather than copied.
    match bytes {
        Cow::Borrowed(slice) => match str::from_utf8(slice) {
            Ok(text) => Ok(Cow::Borrowed(text)),
            Err(err) => Err(DecodeUtf8Error::Utf8(err)),
        },
        Cow::Owned(buffer) => match String::from_utf8(buffer) {
            Ok(text) => Ok(Cow::Owned(text)),
            Err(err) => Err(DecodeUtf8Error::Utf8(err.utf8_error())),
        },
    }
}
734
/// Collects the unescaped bytes and converts them to a string lossily.
///
/// Works like [`Unescape::decode_utf8`], except that invalid UTF-8 sequences
/// are replaced with the replacement character (U+FFFD) instead of producing
/// an error.
///
/// An `UnescapeError` can still be returned if the JSON escaping itself is invalid.
///
/// **Requires the `alloc` feature.**
#[cfg(feature = "alloc")]
pub fn decode_utf8_lossy(self) -> Result<Cow<'a, str>, UnescapeError> {
    let bytes = self.try_into()?;
    Ok(decode_utf8_lossy(bytes))
}
747
748 /// Returns a wrapper that implements [`fmt::Display`].
749 ///
750 /// This allows an `Unescape` iterator to be used directly with formatting
751 /// macros like `println!`, `format!`, etc. It writes the unescaped content
752 /// directly to the formatter's buffer, **avoiding any heap allocations**.
753 ///
754 /// The iterator is consumed, and the resulting unescaped string is written
755 /// to the formatter. Any invalid JSON escape sequences or invalid UTF-8 will
756 /// cause a `fmt::Error`. **You should be cautious when using this method
757 /// with the `format!` macro, as a `fmt::Error` from us will cause the macro
758 /// to panic**.
759 ///
760 /// For a more robust alternative that will not panic on `UnescapeError` or
761 /// invalid bytes, consider using [`Unescape::display_utf8_lossy`] instead.
762 ///
763 /// This method is a **zero-allocation** alternative to [`Unescape::decode_utf8`],
764 /// which might allocate a `String` to return the unescaped content.
765 ///
766 /// # Example
767 ///
768 /// ```
769 /// use json_escape::unescape;
770 ///
771 /// let original = r#"Hello, \uD83C\uDF0E!"#;
772 /// let unescaper = unescape(original);
773 ///
774 /// let formatted = format!("{}", unescaper.display_utf8());
775 /// assert_eq!(formatted, "Hello, 🌎!");
776 /// ```
777 pub fn display_utf8(self) -> DisplayUnescape<'a> {
778 DisplayUnescape { inner: self }
779 }
780
781 /// Returns a wrapper that implements [`fmt::Display`] lossily.
782 ///
783 /// This method is an **allocation-free** way to write unescaped content
784 /// to a formatter. It handles invalid JSON escape sequences and invalid
785 /// UTF-8 gracefully, making it a "lossy" operation.
786 ///
787 /// - **Invalid JSON escape sequences:** Instead of causing an error, the iterator
788 /// terminates without an error.
789 /// - **Invalid UTF-8 bytes:** These are replaced with the Unicode
790 /// replacement character (U+FFFD).
791 ///
792 /// This method is the **zero-allocation** counterpart to [`Unescape::decode_utf8_lossy`].
793 pub fn display_utf8_lossy(self) -> DisplayUnescapeLossy<'a> {
794 DisplayUnescapeLossy { inner: self }
795 }
796}
797
798impl<'a> Iterator for Unescape<'a> {
799 type Item = Result<&'a [u8], UnescapeError>;
800
801 fn next(&mut self) -> Option<Self::Item> {
802 self.next_limit(None)
803 }
804
805 fn size_hint(&self) -> (usize, Option<usize>) {
806 // The minimum size is 0 (if the rest of the string is an invalid escape).
807 // The maximum size is the remaining length of the underlying bytes + pending_unicode
808 let (lower, upper) = self.bytes.size_hint();
809 let upper = upper.map(|x| x + (self.unicode_len as usize));
810 // Worst-case is \uXXXX -> 1 byte, so 6 -> 1.
811 (lower.saturating_add(1) / 6, upper)
812 }
813}
814
815impl<'a> FusedIterator for Unescape<'a> {}
816
#[cfg(feature = "std")]
impl std::io::Read for Unescape<'_> {
    /// Fills `buf` with unescaped bytes and returns how many were written.
    ///
    /// Reading continues until `buf` is full or the input is exhausted.
    ///
    /// # Errors
    /// An invalid escape sequence is reported as `ErrorKind::InvalidData`
    /// wrapping the `UnescapeError`. In line with the `Read::read` contract,
    /// an error is only returned when no bytes were read by this call: if
    /// some bytes were already copied, they are returned via `Ok(n)` and the
    /// error is reported by the next call.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let mut written = 0;

        while written < buf.len() {
            // Cheap snapshot (a slice iterator plus six bytes) so a failing
            // step can be rolled back and replayed by the next `read` call.
            let checkpoint = self.clone();

            match self.next_limit(Some(buf.len() - written)) {
                Some(Ok(chunk)) => {
                    // `next_limit` never returns more than the space left.
                    let end = written + chunk.len();
                    buf[written..end].copy_from_slice(chunk);
                    written = end;
                }
                Some(Err(err)) => {
                    if written > 0 {
                        // Hand out what was read; the restored state makes the
                        // next call hit the same error with nothing read.
                        *self = checkpoint;
                        return Ok(written);
                    }
                    return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, err));
                }
                // Input drained.
                None => break,
            }
        }

        Ok(written)
    }

    /// Appends all remaining unescaped bytes to `buf`, chunk by chunk, and
    /// returns the number of bytes appended.
    ///
    /// On an invalid escape sequence an `ErrorKind::InvalidData` error is
    /// returned; bytes decoded before the error stay in `buf`.
    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> std::io::Result<usize> {
        let start_len = buf.len();

        for result in self {
            match result {
                Ok(chunk) => buf.extend_from_slice(chunk),
                Err(err) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, err)),
            }
        }

        Ok(buf.len() - start_len)
    }
}
862
863impl fmt::Debug for Unescape<'_> {
864 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
865 f.debug_struct("Unescape").finish_non_exhaustive()
866 }
867}
868
869impl<B: AsRef<[u8]> + ?Sized> PartialEq<B> for Unescape<'_> {
870 /// Compares the unescaped output with a byte-slice-like object.
871 ///
872 /// An `Unescape` iterator is considered equal to a byte slice if it successfully
873 /// unescapes to produce a sequence of bytes identical to that slice. If the
874 /// iterator would produce an error, the comparison returns `false`.
875 ///
876 /// # Example
877 ///
878 /// ```
879 /// use json_escape::unescape;
880 ///
881 /// let unescaper = unescape(r#"hello\nworld"#);
882 /// assert_eq!(unescaper, b"hello\nworld");
883 ///
884 /// // An iterator that produces an error is not equal to any valid slice.
885 /// let failing_unescaper = unescape(r#"\k"#);
886 /// assert_ne!(failing_unescaper, b"k");
887 /// ```
888 fn eq(&self, other: &B) -> bool {
889 let mut other = other.as_ref();
890 for result in self.clone() {
891 match result {
892 Ok(chunk) => {
893 if !other.starts_with(chunk) {
894 return false;
895 }
896 other = &other[chunk.len()..];
897 }
898 Err(_) => return false, // An erroring iterator cannot be equal to a valid slice.
899 }
900 }
901 other.is_empty()
902 }
903}
904
905impl<B: AsRef<[u8]>> PartialEq<Unescape<'_>> for Result<B, UnescapeError> {
906 /// Compares the unescaper's outcome with a `Result`.
907 ///
908 /// This implementation allows for precise testing of the `Unescape` iterator
909 /// by comparing it against either a successful outcome (`Ok`) or a specific
910 /// failure (`Err`).
911 ///
912 /// - If `result` is `Ok(bytes)`, the comparison is `true` only if the iterator
913 /// completes successfully and its concatenated output is identical to `bytes`.
914 ///
915 /// - If `result` is `Err(error)`, the comparison is `true` only if the iterator
916 /// produces the exact same `UnescapeError`.
917 ///
918 /// # Example
919 ///
920 /// ```
921 /// use json_escape::{unescape, UnescapeError, InvalidEscapeError};
922 ///
923 /// // --- Success Case ---
924 /// let unescaper = unescape(r#"hello\tworld"#);
925 /// // The comparison is against an `Ok` variant.
926 /// assert_eq!(Ok("hello\tworld"), unescaper);
927 ///
928 /// // --- Error Case ---
929 /// let failing_unescaper = unescape(r#"invalid-\u"#);
930 /// // We can assert that the iterator produces a specific error.
931 /// # let unexpected_eof = unescape(r"\u").next().unwrap().unwrap_err();
932 /// assert_eq!(Err::<&str, _>(unexpected_eof), failing_unescaper);
933 /// ```
934 fn eq(&self, unescape: &Unescape<'_>) -> bool {
935 match self {
936 Ok(expected_bytes) => unescape == expected_bytes,
937 Err(expected_error) => {
938 for result in unescape.clone() {
939 if let Err(actual_error) = result {
940 // The iterator's first error is its final outcome.
941 // It must match the expected error exactly.
942 return actual_error == *expected_error;
943 }
944 }
945 // `unescape` completed successfully, but an error was expected.
946 false
947 }
948 }
949 }
950}
951
952impl<'a, 'b> PartialEq<Unescape<'a>> for Unescape<'b> {
953 /// Compares two `Unescape` iterators for equality based on their terminal result.
954 ///
955 /// The equality of two `Unescape` iterators is determined by the final `Result`
956 /// that would be obtained if each iterator were fully consumed (e.g., by using `try_collect()`).
957 ///
958 /// The specific rules are as follows:
959 ///
960 /// 1. **Error vs. Error**: If both iterators terminate with an `Err`, they are
961 /// considered **equal** if and only if their `UnescapeError`s are identical.
962 /// Any bytes successfully unescaped *before* the error are ignored in this case.
963 /// 2. **Success vs. Success**: If both iterators terminate with `Ok`, they are
964 /// considered **equal** if and only if the complete sequence of unescaped bytes
965 /// is identical for both.
966 /// 3. **Success vs. Error**: If one iterator terminates with `Ok` and the other
967 /// with `Err`, they are always **not equal**.
968 ///
969 /// # Example
970 ///
971 /// ```
972 /// use json_escape::unescape;
973 ///
974 /// // Case 1: Both iterators produce the same error. They are equal,
975 /// // even though their valid prefixes ("a" and "b") are different.
976 /// let failing_a = unescape(r#"a\k"#);
977 /// let failing_b = unescape(r#"b\k"#);
978 /// assert_eq!(failing_a, failing_b);
979 ///
980 /// // Case 2: Both iterators succeed. Equality depends on the byte stream.
981 /// let successful_a = unescape(r#"hello\nworld"#);
982 /// let successful_b = unescape(r#"hello\nworld"#);
983 /// assert_eq!(successful_a, successful_b);
984 ///
985 /// let successful_c = unescape(r#"different"#);
986 /// assert_ne!(successful_a, successful_c);
987 ///
988 /// // Case 3: One succeeds and one fails. They are not equal.
989 /// let succeeding = unescape(r#"stop"#);
990 /// let failing = unescape(r#"stop\k"#);
991 /// assert_ne!(succeeding, failing);
992 ///
993 /// // Case 4: Both iterators fail differently. They are not equal.
994 /// let failing_a = unescape(r#"data:\k"#);
995 /// let failing_b = unescape(r#"data:\"#);
996 /// assert_ne!(failing_a, failing_b);
997 /// ```
998 fn eq(&self, other: &Unescape<'a>) -> bool {
999 // Fast path: if they are views into the same underlying data with the same state.
1000 ((self.bytes.as_ref() == other.bytes.as_ref())
1001 && (self.unicode == other.unicode)
1002 && (self.unicode_len == other.unicode_len)
1003 && (self.unicode_pos == other.unicode_pos))
1004 || {
1005 let mut a_error = None;
1006 let mut b_error = None;
1007
1008 let mut a = self.clone().map_while(|result| match result {
1009 Ok(ok) => Some(ok),
1010 Err(err) => {
1011 a_error = Some(err);
1012 None
1013 }
1014 });
1015
1016 let mut b = other.clone().map_while(|result| match result {
1017 Ok(ok) => Some(ok),
1018 Err(err) => {
1019 b_error = Some(err);
1020 None
1021 }
1022 });
1023
1024 let streams_match = chunks_eq(&mut a, &mut b);
1025
1026 // Drain the iterators to ensure the error state is captured,
1027 // especially if chunks_eq returned false early.
1028 // (e.g unescape("a\k") and unescape("b\k") which are actually
1029 // equal)
1030 a.for_each(|_| {});
1031 b.for_each(|_| {});
1032
1033 match (a_error, b_error) {
1034 // Both errored: equality depends only on the errors being the same.
1035 (Some(a_err), Some(b_err)) => a_err == b_err,
1036 // Both succeeded: equality depends on the byte streams having been identical.
1037 (None, None) => streams_match,
1038 // One errored and the other didn't: they are not equal.
1039 _ => false,
1040 }
1041 }
1042 }
1043}
1044
#[cfg(feature = "alloc")]
impl<'a> TryFrom<Unescape<'a>> for Cow<'a, [u8]> {
    type Error = UnescapeError;

    /// Collects the unescaped bytes into a `Cow<'a, [u8]>`, allocating only
    /// when it has to.
    ///
    /// If the iterator yields no chunk or exactly one chunk, that chunk is
    /// returned as `Cow::Borrowed` (this is the case for input without escape
    /// sequences). As soon as a second chunk exists, all chunks are copied
    /// into a `Vec` and returned as `Cow::Owned`.
    ///
    /// # Errors
    ///
    /// The first `UnescapeError` produced by the iterator aborts the
    /// conversion and is returned.
    ///
    /// **Requires the `alloc` feature.**
    fn try_from(mut value: Unescape<'a>) -> Result<Self, Self::Error> {
        let Some(first) = value.next().transpose()? else {
            return Ok(Cow::Borrowed(b""));
        };
        let Some(second) = value.next().transpose()? else {
            return Ok(Cow::Borrowed(first));
        };

        // Reserve for what we already hold plus the input bytes still unread.
        let mut buf = Vec::with_capacity(first.len() + second.len() + value.bytes.len());
        buf.extend_from_slice(first);
        buf.extend_from_slice(second);
        for item in value {
            buf.extend_from_slice(item?);
        }
        Ok(Cow::Owned(buf))
    }
}
1079
1080// =============================================================================
1081// DisplayUnescape Implementation
1082// =============================================================================
1083
1084/// A wrapper for an [`Unescape`] iterator that implements [`fmt::Display`].
1085///
1086/// This struct is created by the [`Unescape::display_utf8()`] method. It allows for
1087/// printing the unescaped content directly to a formatter, which **avoids
1088/// any heap allocations**. The unescaping and UTF-8 decoding are performed on-the-fly as the
1089/// `fmt` method is called.
1090pub struct DisplayUnescape<'a> {
1091 inner: Unescape<'a>,
1092}
1093
1094impl fmt::Display for DisplayUnescape<'_> {
1095 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1096 self.inner.clone()._display_utf8(f, false)
1097 }
1098}
1099
1100/// A wrapper for an [`Unescape`] iterator that implements [`fmt::Display`] lossily.
1101///
1102/// This struct is created by the [`Unescape::display_utf8_lossy()`] method. Like
1103/// `DisplayUnescape`, it performs its operation **without any heap allocations**.
1104///
1105/// This method differs from `display_utf8` in that it handles two types of
1106/// errors gracefully:
1107/// - Invalid JSON escape sequences will be ignored, and the iterator will
1108/// continue to completion without a `fmt::Error`.
1109/// - Invalid UTF-8 byte sequences will be replaced with the Unicode
1110/// replacement character (``, U+FFFD)
1111pub struct DisplayUnescapeLossy<'a> {
1112 inner: Unescape<'a>,
1113}
1114
1115impl fmt::Display for DisplayUnescapeLossy<'_> {
1116 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1117 // Lossy mode: replace invalid sequences with U+FFFD and continue.
1118 self.inner.clone()._display_utf8(f, true)
1119 }
1120}
1121
1122// =============================================================================
1123// Error Types
1124// =============================================================================
1125
1126/// An error that can occur when decoding the final byte stream to a UTF-8 string.
1127#[derive(Copy, Eq, PartialEq, Clone, Debug)]
1128pub enum DecodeUtf8Error {
1129 /// The unescaped byte sequence was not valid UTF-8.
1130 Utf8(str::Utf8Error),
1131 /// An error occurred during the JSON unescaping process itself.
1132 Unescape(UnescapeError),
1133}
1134
1135impl fmt::Display for DecodeUtf8Error {
1136 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1137 match self {
1138 DecodeUtf8Error::Utf8(e) => fmt::Display::fmt(e, f),
1139 DecodeUtf8Error::Unescape(e) => fmt::Display::fmt(e, f),
1140 }
1141 }
1142}
1143
/// Payload of [`UnescapeErrorKind::InvalidEscape`]: a backslash was followed
/// by a byte that does not start a valid JSON escape.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub struct InvalidEscapeError {
    /// The offending byte that came right after the `\`.
    pub found: u8,
}
1151
/// Payload of [`UnescapeErrorKind::LoneSurrogate`]: a `\uXXXX` escape decoded
/// to a UTF-16 surrogate that was not part of a valid pair.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub struct LoneSurrogateError {
    /// The unpaired 16-bit surrogate value.
    pub surrogate: u16,
}
1159
/// Payload of [`UnescapeErrorKind::InvalidHex`]: a byte inside the four hex
/// digits of a `\uXXXX` escape was not a hex digit.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub struct InvalidHexError {
    /// The byte that was found where a hex digit was expected.
    pub found: u8,
}

impl fmt::Display for InvalidHexError {
    /// Renders the offending byte as two upper-case hex digits.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_fmt(format_args!(
            "found invalid hex digit '0x{:02X}'",
            self.found
        ))
    }
}
1173
1174/// An error that can occur during the JSON string unescaping process.
1175#[derive(Copy, Eq, PartialEq, Clone, Debug)]
1176pub struct UnescapeError {
1177 /// The specific kind of unescaping error.
1178 pub(crate) kind: UnescapeErrorKind,
1179 /// The byte offset from the start of the escape sequence (`\`) where the
1180 /// error was detected.
1181 ///
1182 /// This is guaranteed to be less than 12, as the maximum escape sequence
1183 /// is `\uXXXX\uXXXX`.
1184 pub(crate) offset: u8,
1185}
1186
1187impl UnescapeError {
1188 /// Returns the specific kind of error that occurred.
1189 ///
1190 /// This can be used to programmatically handle different error types,
1191 /// such as distinguishing between a malformed hex sequence and an
1192 /// invalid escape character.
1193 ///
1194 /// ### Example
1195 ///
1196 /// ```
1197 /// # use json_escape::{unescape, UnescapeErrorKind, InvalidHexError};
1198 /// let mut unescaper = unescape(r#"\u123Z"#);
1199 /// let err = unescaper.next().unwrap().unwrap_err();
1200 ///
1201 /// match err.kind() {
1202 /// UnescapeErrorKind::InvalidHex(InvalidHexError { found, .. }) => {
1203 /// // We can inspect the exact invalid character found.
1204 /// assert_eq!(found, b'Z');
1205 /// }
1206 /// _ => panic!("Expected an InvalidHex error"),
1207 /// }
1208 /// ```
1209 pub fn kind(&self) -> UnescapeErrorKind {
1210 self.kind
1211 }
1212
1213 /// Returns the byte offset from the start of the escape sequence (`\`)
1214 /// where the error was detected.
1215 ///
1216 /// - For `\x`, the offset is `1` (pointing to `x`).
1217 /// - For `\u123?`, the offset is `5` (pointing to `?`).
1218 /// - For a lone surrogate `\uD800`, the offset is `6` (pointing after the sequence).
1219 ///
1220 /// This is useful for providing detailed error messages that can point
1221 /// to the exact location of the problem in the source string.
1222 ///
1223 /// ### Example
1224 ///
1225 /// ```
1226 /// # use json_escape::unescape;
1227 /// let json_string_content = r#"bad escape \x here"#;
1228 /// let mut unescaper = unescape(json_string_content);
1229 ///
1230 /// // read off 'bad escape '
1231 /// let first = unescaper.next().unwrap().unwrap();
1232 /// assert_eq!(first, b"bad escape ");
1233 ///
1234 /// let err = unescaper.next().unwrap().unwrap_err();
1235 ///
1236 /// // The error occurred at the 'x', which is 1 byte after the '\'
1237 /// assert_eq!(err.offset(), 1);
1238 ///
1239 /// // You could use this to highlight the error in the original input
1240 /// let backslash_pos = json_string_content.find('\\').unwrap();
1241 /// let error_pos = backslash_pos + err.offset() as usize;
1242 /// assert_eq!(json_string_content.as_bytes()[error_pos], b'x');
1243 ///
1244 /// // The generated error message also includes this info.
1245 /// let expected_msg = "invalid escape: '\\0x78' at offset 1";
1246 /// assert_eq!(err.to_string(), expected_msg);
1247 /// ```
1248 pub fn offset(&self) -> u8 {
1249 self.offset
1250 }
1251}
1252
1253/// The specific kind of error that can occur during JSON string unescaping.
1254///
1255/// This enum covers all possible failures described by the JSON standard for string contents.
1256#[derive(Copy, Eq, PartialEq, Clone, Debug)]
1257#[non_exhaustive]
1258pub enum UnescapeErrorKind {
1259 /// Found a backslash followed by an unexpected character (e.g., `\x`).
1260 InvalidEscape(InvalidEscapeError),
1261 /// Found `\u` but the following characters were not 4 valid hex digits.
1262 InvalidHex(InvalidHexError),
1263 /// Input ended unexpectedly while parsing an escape sequence (e.g., `\u12`).
1264 UnexpectedEof,
1265 /// The `\u` sequence yielded a lone high or low surrogate without a matching pair.
1266 LoneSurrogate(LoneSurrogateError),
1267}
1268
1269impl fmt::Display for UnescapeError {
1270 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1271 match self.kind {
1272 UnescapeErrorKind::InvalidEscape(e) => {
1273 write!(
1274 f,
1275 "invalid escape: '\\0x{:02X}' at offset {}",
1276 e.found, self.offset
1277 )
1278 }
1279 UnescapeErrorKind::InvalidHex(ref s) => {
1280 write!(f, "{} at offset {}", s, self.offset)
1281 }
1282 UnescapeErrorKind::UnexpectedEof => {
1283 write!(
1284 f,
1285 "unexpected end of input while parsing escape sequence, expected character at offset {}",
1286 self.offset
1287 )
1288 }
1289 UnescapeErrorKind::LoneSurrogate(e) => write!(
1290 f,
1291 "invalid unicode sequence: lone surrogate found: 0x{:04X} at offset {}",
1292 e.surrogate, self.offset
1293 ),
1294 }
1295 }
1296}
1297
1298impl core::error::Error for UnescapeError {}
1299impl core::error::Error for DecodeUtf8Error {
1300 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
1301 match self {
1302 DecodeUtf8Error::Utf8(e) => Some(e),
1303 DecodeUtf8Error::Unescape(e) => Some(e),
1304 }
1305 }
1306}
1307
1308// =============================================================================
1309// Utilities
1310// =============================================================================
1311
// Lookup table from a byte to its JSON escape sequence.
// `Some(s)` is the text to emit instead of the byte; `None` means the byte
// can be copied through unchanged.
const ESCAPE_TABLE: [Option<&'static str>; 256] = {
    // Every byte that needs escaping: the two special characters plus all
    // control characters below 0x20. Control characters use the short form
    // where JSON defines one and `\u00XX` otherwise.
    const ENTRIES: [(u8, &str); 34] = [
        (b'"', r#"\""#),
        (b'\\', r#"\\"#),
        (0x00, r#"\u0000"#),
        (0x01, r#"\u0001"#),
        (0x02, r#"\u0002"#),
        (0x03, r#"\u0003"#),
        (0x04, r#"\u0004"#),
        (0x05, r#"\u0005"#),
        (0x06, r#"\u0006"#),
        (0x07, r#"\u0007"#),
        (0x08, r#"\b"#), // Backspace
        (0x09, r#"\t"#), // Tab
        (0x0A, r#"\n"#), // Line Feed
        (0x0B, r#"\u000b"#),
        (0x0C, r#"\f"#), // Form Feed
        (0x0D, r#"\r"#), // Carriage Return
        (0x0E, r#"\u000e"#),
        (0x0F, r#"\u000f"#),
        (0x10, r#"\u0010"#),
        (0x11, r#"\u0011"#),
        (0x12, r#"\u0012"#),
        (0x13, r#"\u0013"#),
        (0x14, r#"\u0014"#),
        (0x15, r#"\u0015"#),
        (0x16, r#"\u0016"#),
        (0x17, r#"\u0017"#),
        (0x18, r#"\u0018"#),
        (0x19, r#"\u0019"#),
        (0x1A, r#"\u001a"#),
        (0x1B, r#"\u001b"#),
        (0x1C, r#"\u001c"#),
        (0x1D, r#"\u001d"#),
        (0x1E, r#"\u001e"#),
        (0x1F, r#"\u001f"#),
    ];

    let mut table: [Option<&'static str>; 256] = [None; 256];
    // `for` is not available in const evaluation, hence the manual loop.
    let mut next = 0;
    while next < ENTRIES.len() {
        let (byte, escaped) = ENTRIES[next];
        table[byte as usize] = Some(escaped);
        next += 1;
    }
    table
};
1362
1363// A simple boolean-like lookup table for SIMD.
1364// 0 = no escape needed, 1 = escape needed.
1365// This is very compact (256 bytes) and fits easily in the L1 cache.
1366#[allow(unused)]
1367const ESCAPE_DECISION_TABLE: [u8; 256] = {
1368 let mut table = [0u8; 256];
1369 let mut i = 0;
1370 while i < 256 {
1371 if ESCAPE_TABLE[i].is_some() {
1372 table[i] = 1;
1373 }
1374 i += 1;
1375 }
1376 table
1377};
1378
// SIMD implementation, compiled only when the "simd" feature is enabled on a nightly build.
//
// Returns the index of the first byte that needs escaping (a control byte
// <= 0x1F, `"` or `\`), or `None` if there is no such byte.
#[cfg(all(feature = "simd", nightly))]
#[inline]
fn find_escape_char(bytes: &[u8]) -> Option<usize> {
    use std::simd::{Simd, prelude::SimdPartialEq, prelude::SimdPartialOrd};

    const LANES: usize = 16; // Process 16 bytes at a time (fits in SSE2/AVX)

    // Comparison vectors: each holds 16 copies of one byte.
    let control_max = Simd::<u8, LANES>::splat(b' ' - 1); // `<= 0x1F` means "control byte"
    let quote = Simd::<u8, LANES>::splat(b'"');
    let backslash = Simd::<u8, LANES>::splat(b'\\');

    // Main loop over all complete 16-byte blocks.
    let mut blocks = bytes.chunks_exact(LANES);
    for (block_index, block) in blocks.by_ref().enumerate() {
        let lanes = Simd::<u8, LANES>::from_slice(block);

        // A lane is set if its byte satisfies ANY of the three conditions.
        let needs_escape =
            lanes.simd_le(control_max) | lanes.simd_eq(quote) | lanes.simd_eq(backslash);

        if needs_escape.any() {
            // The lowest set bit of the bitmask is the first matching lane.
            let lane = needs_escape.to_bitmask().trailing_zeros() as usize;
            return Some(block_index * LANES + lane);
        }
    }

    // Fewer than 16 bytes are left: fall back to the table lookup.
    let tail = blocks.remainder();
    let tail_start = bytes.len() - tail.len();
    tail.iter()
        .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
        .map(|pos| tail_start + pos)
}
1429
1430// A fallback for when SIMD feature is off.
1431#[cfg(not(nightly))]
1432#[inline]
1433fn find_escape_char(bytes: &[u8]) -> Option<usize> {
1434 bytes
1435 .iter()
1436 .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
1437}
1438
/// Table in which entry `i` is the one-element array `[i]`.
///
/// It exists so that any single byte can be handed out as a `&'static [u8]`
/// without allocating. It is a `static` rather than a `const` so that every
/// borrow refers to the same memory location and is `'static` by
/// construction, without depending on constant promotion.
static U8_TABLE: [[u8; 1]; 256] = {
    let mut arr = [[0u8; 1]; 256];
    let mut i = 0usize;
    while i < 256 {
        arr[i] = [i as u8];
        i += 1;
    }
    arr
};

/// Returns a `'static` one-byte slice whose only element is `b`.
#[inline(always)]
fn byte_as_static_slice(b: u8) -> &'static [u8] {
    // Unsizes `&'static [u8; 1]` to `&'static [u8]`; every `u8` is a valid index.
    &U8_TABLE[usize::from(b)]
}
1456
// The following function is adapted from the `percent-encoding` crate, version 2.3.2.
// Source: https://github.com/servo/rust-url/blob/22b925f93ad505a830f1089538a9ed6f5fd90612/percent_encoding/src/lib.rs#L337-L365
//
// It is licensed under the same terms as the `percent-encoding` crate (MIT/Apache-2.0).
//
// This helper converts a Cow<'_, [u8]> to a Cow<'_, str> lossily, replacing
// invalid UTF-8 with U+FFFD. When the input is an owned, valid UTF-8 Vec<u8>
// its allocation is reused instead of being copied.
//
// Unlike the upstream version it needs no `unsafe`: `String::from_utf8`
// validates the owned bytes and, on success, takes over the buffer. Only in
// the invalid case are the bytes scanned a second time to build the
// replacement string.
#[cfg(feature = "alloc")]
fn decode_utf8_lossy(input: Cow<'_, [u8]>) -> Cow<'_, str> {
    match input {
        // Borrowed input: stays borrowed if valid, otherwise a fixed-up copy.
        Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes),
        Cow::Owned(bytes) => match String::from_utf8(bytes) {
            // Valid UTF-8: reuse the existing allocation.
            Ok(valid) => Cow::Owned(valid),
            // Invalid UTF-8: `from_utf8_lossy` returns an owned string here,
            // so `into_owned` does not copy again.
            Err(invalid) => Cow::Owned(String::from_utf8_lossy(invalid.as_bytes()).into_owned()),
        },
    }
}
1494
/// Compare two chunk-iterators by their concatenated byte stream (streaming,
/// zero allocations).
///
/// Only the concatenation matters: chunk boundaries may differ between the
/// two sides, and empty chunks are ignored wherever they occur. The function
/// walks both iterators in lock-step, compares the overlapping prefix of the
/// current chunks and carries the rest of the longer chunk into the next
/// round.
///
/// Returns `true` if both sides produce exactly the same bytes. It returns as
/// soon as the answer is known, so the iterators are not necessarily
/// exhausted afterwards.
fn chunks_eq<'a, I1, A, I2, B>(mut a: I1, mut b: I2) -> bool
where
    A: 'a + AsRef<[u8]> + ?Sized,
    B: 'a + AsRef<[u8]> + ?Sized,
    I1: Iterator<Item = &'a A>,
    I2: Iterator<Item = &'a B>,
{
    /// Pulls chunks until a non-empty one is found; `None` once exhausted.
    fn next_non_empty<'c, I, T>(iter: &mut I) -> Option<&'c [u8]>
    where
        T: 'c + AsRef<[u8]> + ?Sized,
        I: Iterator<Item = &'c T>,
    {
        iter.map(|chunk| chunk.as_ref())
            .find(|bytes| !bytes.is_empty())
    }

    let mut a_rem: &[u8] = &[];
    let mut b_rem: &[u8] = &[];

    loop {
        // Refill 'a' when its current chunk has been fully compared.
        if a_rem.is_empty() {
            match next_non_empty(&mut a) {
                Some(chunk) => a_rem = chunk,
                // 'a' is exhausted: equal only if 'b' has no bytes left either.
                None => return b_rem.is_empty() && next_non_empty(&mut b).is_none(),
            }
        }

        // Refill 'b' the same way.
        if b_rem.is_empty() {
            match next_non_empty(&mut b) {
                Some(chunk) => b_rem = chunk,
                // 'b' is exhausted while 'a' still holds bytes: not equal.
                None => return false,
            }
        }

        // Both remainders are non-empty here. Compare as many bytes as both
        // sides currently have available.
        let n = a_rem.len().min(b_rem.len());
        if a_rem[..n] != b_rem[..n] {
            return false;
        }

        // Advance past the compared part; at least one remainder becomes empty.
        a_rem = &a_rem[n..];
        b_rem = &b_rem[n..];
    }
}
1545
/// Writes `bytes` to `f` as UTF-8 text.
///
/// Valid stretches are written unchanged. On an invalid sequence the
/// behaviour depends on `lossy`: when `true`, one U+FFFD replacement
/// character is written and processing continues; when `false`, the function
/// stops with `fmt::Error` (the valid text before the invalid sequence has
/// already been written at that point).
#[inline]
fn display_bytes_uft8(bytes: &[u8], f: &mut fmt::Formatter<'_>, lossy: bool) -> fmt::Result {
    bytes.utf8_chunks().try_for_each(|piece| {
        f.write_str(piece.valid())?;

        match (piece.invalid().is_empty(), lossy) {
            // Nothing invalid follows this stretch.
            (true, _) => Ok(()),
            (false, true) => f.write_char(char::REPLACEMENT_CHARACTER),
            (false, false) => Err(fmt::Error),
        }
    })
}
1562
1563#[cfg(test)]
1564mod tests {
1565 use core::fmt::Display;
1566 use std::{io::Read as _, string::ToString as _, vec};
1567
1568 use super::*;
1569
1570 // ===================== Escape ===================== //
1571
1572 fn test_escape_typical(input: &str, want: &str) {
1573 let got = escape_str(input).collect::<String>();
1574 assert_eq!(got, want);
1575
1576 // Test PartialEq too
1577 assert_eq!(escape_str(input), want)
1578 }
1579
1580 #[test]
1581 fn test_empty_string() {
1582 test_escape_typical("", "");
1583 }
1584
1585 #[test]
1586 fn test_quotes() {
1587 test_escape_typical("\"hello\"", "\\\"hello\\\"")
1588 }
1589
1590 #[test]
1591 fn test_backslash() {
1592 test_escape_typical("\\hello\\", "\\\\hello\\\\");
1593 }
1594
1595 #[test]
1596 fn test_slash() {
1597 test_escape_typical("/hello/", "/hello/");
1598 }
1599
1600 #[test]
1601 fn test_control_chars() {
1602 test_escape_typical("\n\r\t\x08\x0C", "\\n\\r\\t\\b\\f");
1603 }
1604
1605 #[test]
1606 fn test_escape_fully() {
1607 let input = "Hello, \"world\"!\nThis contains a \\ backslash and a \t tab.";
1608 let expected = r#"Hello, \"world\"!\nThis contains a \\ backslash and a \t tab."#;
1609 test_escape_typical(input, expected);
1610 }
1611
1612 #[test]
1613 fn test_other_control_chars() {
1614 let input = "Null:\0, Bell:\x07";
1615 let expected = r#"Null:\u0000, Bell:\u0007"#;
1616 test_escape_typical(input, expected);
1617
1618 test_escape_typical("\x00\x1F", "\\u0000\\u001f");
1619 test_escape_typical("\x19", "\\u0019");
1620 }
1621
1622 #[test]
1623 fn test_iterator_chunks() {
1624 let input = "prefix\npostfix";
1625 let mut iter = escape_str(input);
1626 assert_eq!(iter.next(), Some("prefix"));
1627 assert_eq!(iter.next(), Some(r#"\n"#));
1628 assert_eq!(iter.next(), Some("postfix"));
1629 assert_eq!(iter.next(), None);
1630 }
1631
1632 #[test]
1633 fn test_no_escape_needed() {
1634 let input = "A simple string with no escapes.";
1635 let mut iter = escape_str(input);
1636 assert_eq!(iter.next(), Some("A simple string with no escapes."));
1637 assert_eq!(iter.next(), None);
1638
1639 let input = "café";
1640 let mut iter = escape_str(input);
1641 assert_eq!(iter.next(), Some("café"));
1642 assert_eq!(iter.next(), None);
1643
1644 let input = "❤️";
1645 let mut iter = escape_str(input);
1646 assert_eq!(iter.next(), Some("❤️"));
1647 assert_eq!(iter.next(), None);
1648 }
1649
1650 // ===================== Unescape ===================== //
1651
1652 #[test]
1653 fn test_byte_table() {
1654 assert_eq!(byte_as_static_slice(0), &[0]);
1655 assert_eq!(byte_as_static_slice(5), &[5]);
1656 assert_eq!(byte_as_static_slice(255), &[255]);
1657 }
1658
1659 fn test_unescape_typical<I: AsRef<[u8]> + ?Sized>(input: &I, want: &str) {
1660 let got = unescape(input).decode_utf8().unwrap();
1661 assert_eq!(got, want);
1662
1663 // Test PartialEq too
1664 assert_eq!(unescape(input), want);
1665
1666 // Help display
1667 assert_display(unescape(input).display_utf8(), Ok(want));
1668 }
1669
1670 #[test]
1671 fn test_unicode_escape_basic_unescape() {
1672 // \u4E16 => 世 (E4 B8 96)
1673 let s = "X\\u4E16Y";
1674 test_unescape_typical(s, "X世Y");
1675
1676 let s = "Snow: \\u2603"; // \u2603 => ☃
1677 test_unescape_typical(s, "Snow: ☃");
1678
1679 let s = "A \\u03A9 B"; // Ω is U+03A9
1680 test_unescape_typical(s, "A Ω B");
1681 }
1682
1683 #[test]
1684 fn test_surrogate_pair_unescape() {
1685 // 😀 is U+1F600 -> in JSON: \uD83D\uDE00
1686 let s = "A\\uD83D\\uDE00B";
1687 test_unescape_typical(s, "A😀B")
1688 }
1689
1690 #[test]
1691 fn test_invalid_escape_unescape() {
1692 let s = b"\\x";
1693 let mut u = unescape(s);
1694
1695 match u.next() {
1696 Some(Err(UnescapeError {
1697 kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'x' }),
1698 offset: 1,
1699 })) => {}
1700 _ => panic!("expected invalid escape"),
1701 }
1702 }
1703
1704 #[test]
1705 fn test_simple_unescape() {
1706 let input = "Hello\\nWorld\\\"!"; // "Hello\nWorld\"!"
1707 test_unescape_typical(input, "Hello\nWorld\"!")
1708 }
1709
1710 #[test]
1711 fn test_truncated_unicode() {
1712 let input = "Trunc: \\u12"; // too short
1713 let it = unescape(input);
1714 let mut found = false;
1715 for r in it {
1716 match r {
1717 Ok(_) => continue,
1718 Err(UnescapeError {
1719 kind: UnescapeErrorKind::UnexpectedEof,
1720 offset: 4,
1721 }) => {
1722 found = true;
1723 break;
1724 }
1725 Err(_) => break,
1726 }
1727 }
1728 assert!(found);
1729 }
1730
1731 // ===================== Chunk_Eq ===================== //
1732
1733 #[test]
1734 fn test_empty_iterators_are_equal() {
1735 let a: Vec<&[u8]> = vec![];
1736 let b: Vec<&[u8]> = vec![];
1737 assert!(chunks_eq(a.into_iter(), b.into_iter()));
1738 }
1739
1740 #[test]
1741 fn test_empty_vs_non_empty() {
1742 let a: Vec<&[u8]> = vec![];
1743 let b = vec![&[1, 2, 3]];
1744 assert!(!chunks_eq(a.into_iter(), b.into_iter()));
1745
1746 // And the other way around
1747 let a = vec![&[1, 2, 3]];
1748 let b: Vec<&[u8]> = vec![];
1749 assert!(!chunks_eq(a.into_iter(), b.into_iter()));
1750 }
1751
1752 #[test]
1753 fn test_single_identical_chunks() {
1754 let a = vec!["hello world"];
1755 let b = vec!["hello world"];
1756 assert!(chunks_eq(a.into_iter(), b.into_iter()));
1757 }
1758
1759 #[test]
1760 fn test_different_chunk_boundaries_str() {
1761 // This is the key test: the concatenated content is identical,
1762 // but the chunk divisions are different.
1763 let a = vec!["he", "llo", " ", "world"];
1764 let b = vec!["hello ", "wo", "rld"];
1765 assert!(chunks_eq(a.into_iter(), b.into_iter()));
1766 }
1767
1768 #[test]
1769 fn test_different_chunk_boundaries_bytes() {
1770 let a = vec![&[1, 2], &[3, 4, 5][..]];
1771 let b = vec![&[1, 2, 3], &[4, 5][..]];
1772 assert!(chunks_eq(a.into_iter(), b.into_iter()));
1773 }
1774
1775 #[test]
1776 fn test_one_long_vs_many_short() {
1777 let a = vec!["a-long-single-chunk"];
1778 let b = vec!["a", "-", "long", "-", "single", "-", "chunk"];
1779 assert!(chunks_eq(a.into_iter(), b.into_iter()));
1780 }
1781
1782 #[test]
1783 fn test_unequal_content_same_length() {
1784 let a = vec!["hello"];
1785 let b = vec!["hallo"];
1786 assert!(!chunks_eq(a.into_iter(), b.into_iter()));
1787 }
1788
/// "abc" vs "abd": the first chunks match and the mismatch sits in the
/// second chunk, so the comparison is expected to report inequality.
#[test]
fn test_unequal_at_chunk_boundary() {
    let left = vec!["ab", "c"].into_iter();
    let right = vec!["ab", "d"].into_iter();
    assert!(!chunks_eq(left, right));
}
1795
/// When one side is a strict prefix of the other ("username" vs
/// "username123"), the comparison is expected to be unequal in both
/// argument orders.
#[test]
fn test_one_is_prefix_of_other() {
    let short = || vec!["user", "name"].into_iter(); // "username"
    let long = || vec!["user", "name", "123"].into_iter(); // "username123"

    // Shorter side first.
    assert!(!chunks_eq(short(), long()));

    // Shorter side second.
    assert!(!chunks_eq(long(), short()));
}
1808
/// Exercises the remainder (carry-over) handling with chunk boundaries that
/// never line up between the two sides, while the concatenated content is
/// the same `1..=10` sequence:
///
/// left:  [1,2,3], [4,5], [6,7,8,9], [10]
/// right: [1,2], [3,4,5,6], [7,8], [9,10]
#[test]
fn test_complex_remainer_logic() {
    let left = vec![&[1, 2, 3], &[4, 5][..], &[6, 7, 8, 9], &[10]];
    let right = vec![&[1, 2], &[3, 4, 5, 6][..], &[7, 8], &[9, 10]];
    assert!(chunks_eq(left.into_iter(), right.into_iter()));
}
1818
/// Chunks may also be supplied as `&Vec<_>` references; both sides hold
/// `1..=5` split at different points and are expected to compare equal.
#[test]
fn test_with_vec_references() {
    let (left_head, left_tail) = (vec![1, 2], vec![3, 4, 5]);
    let (right_head, right_tail) = (vec![1, 2, 3], vec![4, 5]);

    let left = vec![&left_head, &left_tail];
    let right = vec![&right_head, &right_tail];
    assert!(chunks_eq(left.into_iter(), right.into_iter()));
}
1830
1831 // ===================== Unescape Read ===================== //
1832
/// Reading escape-free input into a large enough buffer is expected to
/// deliver everything in one call, followed by EOF.
#[test]
fn test_read_simple() {
    let mut reader = unescape(br#"hello world"#);
    let mut scratch = [0u8; 20];

    let first = reader.read(&mut scratch).unwrap();
    assert_eq!(first, 11);
    assert_eq!(&scratch[..first], b"hello world");

    // The input is exhausted, so the next read reports EOF (0 bytes).
    let second = reader.read(&mut scratch).unwrap();
    assert_eq!(second, 0);
}
1848
/// `read_to_end` is expected to decode `\t` and `\n` escapes into the
/// corresponding control bytes.
#[test]
fn test_read_with_simple_escapes() {
    let mut decoded = Vec::new();
    unescape(br#"hello\tworld\nline2"#)
        .read_to_end(&mut decoded)
        .unwrap();

    assert_eq!(decoded, b"hello\tworld\nline2");
}
1859
/// Drains the reader through a 10-byte buffer and checks that the pieces,
/// once reassembled, equal the (escape-free) input.
#[test]
fn test_read_into_small_buffer_multiple_calls() {
    let input = br#"this is a long string with no escapes"#;
    let mut reader = unescape(input);
    let mut window = [0u8; 10];
    let mut collected = Vec::new();

    loop {
        let filled = reader
            .read(&mut window)
            .unwrap_or_else(|e| panic!("Read error: {}", e));
        if filled == 0 {
            // A zero-length read signals EOF.
            break;
        }
        collected.extend_from_slice(&window[..filled]);
    }

    assert_eq!(collected, input);
}
1879
/// A decoded multi-byte character is expected to be split cleanly when it
/// does not fit in what is left of the caller's buffer.
#[test]
fn test_read_multibyte_char_across_buffer_boundary() {
    // The grinning face emoji 😀 is written as \uD83D\uDE00 and decodes to
    // four UTF-8 bytes: 0xF0 0x9F 0x98 0x80.
    let mut reader = unescape(br#"emoji: \uD83D\uDE00 is here"#);

    // An 8-byte buffer forces the emoji to straddle two reads.
    let mut buf = [0u8; 8];
    let mut collected = Vec::new();

    let expected_reads: [&[u8]; 3] = [
        // "emoji: " (7 bytes) + first byte of the emoji
        b"emoji: \xF0",
        // remaining 3 bytes of the emoji + " is h"
        b"\x9F\x98\x80 is h",
        // tail of the text
        b"ere",
    ];
    for want in expected_reads.iter() {
        let n = reader.read(&mut buf).unwrap();
        assert_eq!(n, want.len());
        assert_eq!(&buf[..n], *want);
        collected.extend_from_slice(&buf[..n]);
    }

    // Everything has been consumed; the reader now reports EOF.
    let at_eof = reader.read(&mut buf).unwrap();
    assert_eq!(at_eof, 0);

    assert_eq!(collected, b"emoji: \xF0\x9F\x98\x80 is here");
    assert_eq!(collected, "emoji: 😀 is here".as_bytes());
}
1915
/// An unknown escape (`\q`) is expected to surface from `read` as an
/// `InvalidData` I/O error whose message mentions "invalid escape".
#[test]
fn test_read_error_invalid_escape() {
    let input = br#"hello \q world"#;
    let mut reader = unescape(input);
    let mut buf = [0u8; 20];

    // `unwrap_err` already fails the test if the read succeeds, and its
    // panic message includes the unexpected `Ok` value, so no separate
    // `is_err` assertion is needed.
    let err = reader.read(&mut buf).unwrap_err();
    assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
    assert!(err.to_string().contains("invalid escape"));
}
1929
/// A high surrogate with no low surrogate after it is expected to surface
/// from `read` as an `InvalidData` error mentioning "lone surrogate".
#[test]
fn test_read_error_lone_surrogate() {
    // \uD83D is a high surrogate; nothing follows it.
    let mut reader = unescape(br#"\uD83D"#);
    let mut scratch = [0u8; 10];

    let failure = reader.read(&mut scratch).unwrap_err();
    assert_eq!(failure.kind(), std::io::ErrorKind::InvalidData);

    let message = failure.to_string();
    assert!(message.contains("lone surrogate"));
}
1940
/// Reading from empty input is expected to report EOF (0 bytes) right away.
#[test]
fn test_read_empty_input() {
    let mut reader = unescape(b"");
    let mut scratch = [0u8; 10];

    assert_eq!(reader.read(&mut scratch).unwrap(), 0);
}
1949
/// A read into a zero-length buffer is expected to succeed and return 0,
/// even though input is still available.
#[test]
fn test_read_into_empty_buffer() {
    let mut reader = unescape(b"hello");
    let mut no_room = [0u8; 0];

    let filled = reader.read(&mut no_room).unwrap();
    assert_eq!(filled, 0);
}
1959
/// `read_to_end` over input mixing simple escapes and a surrogate pair is
/// expected to return the decoded length and fill the vector with the
/// decoded bytes.
#[test]
fn test_read_to_end_optimized() {
    let expected = b"first\nsecond\tthird \xF0\x9F\x98\x80 last";

    let mut reader = unescape(br#"first\nsecond\tthird \uD83D\uDE00 last"#);
    let mut sink = Vec::new();
    let total = reader.read_to_end(&mut sink).unwrap();

    assert_eq!(total, expected.len());
    assert_eq!(sink, expected);
}
1972
1973 // ===================== Unescape Display ===================== //
1974
/// Formats `display` into a fresh `String` and checks the outcome.
///
/// * `Ok(text)`: formatting must succeed and produce exactly `text`.
/// * `Err(())`: formatting must fail; any partial output is ignored.
fn assert_display(display: impl Display, want: Result<&str, ()>) {
    let mut rendered = String::new();
    let outcome = fmt::write(&mut rendered, format_args!("{display}"));

    if let Ok(expected) = want {
        assert!(outcome.is_ok());
        assert_eq!(rendered, expected);
    } else {
        assert!(
            outcome.is_err(),
            "strict mode should return Err on invalid bytes"
        );
    }
}
1990
1991 // -- NON-LOSSY TESTS (must be perfect) --
1992
/// Escape-free text is expected to display unchanged.
#[test]
fn test_display_simple_string() {
    assert_display(unescape("hello world").display_utf8(), Ok("hello world"));
}
1998
/// Empty input is expected to display as the empty string.
#[test]
fn test_display_empty_string() {
    let display = unescape("").display_utf8();
    assert_display(display, Ok(""));
}
2003
/// Each of the two-character JSON escapes is expected to display as the
/// character it stands for.
#[test]
fn test_display_standard_escapes() {
    let display = unescape(br#"\" \\ \/ \b \f \n \r \t"#).display_utf8();
    assert_display(display, Ok("\" \\ / \x08 \x0C \n \r \t"));
}
2010
/// Raw (non-escaped) multi-byte UTF-8 in the input is expected to pass
/// through to the display output unchanged.
#[test]
fn test_display_non_escaped_utf8() {
    let text = "你好, world";
    assert_display(unescape(text.as_bytes()).display_utf8(), Ok(text));
}
2017
/// A `\uXXXX` escape for a BMP character is expected to display as that
/// character. The cent sign U+00A2 encodes to two UTF-8 bytes (C2 A2).
#[test]
fn test_display_unicode_escape_bmp() {
    let display = unescape(br"a\u00A2b").display_utf8();
    assert_display(display, Ok("a¢b"));
}
2025
/// Plain text, a simple escape, a BMP `\u` escape and a surrogate pair in
/// one input are all expected to be decoded in the display output.
#[test]
fn test_display_mixed_content() {
    let display = unescape(br#"Text with \n, \u00A2, and \uD83D\uDE0E emojis."#).display_utf8();
    assert_display(display, Ok("Text with \n, ¢, and 😎 emojis."));
}
2032
/// Escapes at the very start and very end of the input are expected to be
/// decoded just like ones in the middle.
#[test]
fn test_display_starts_and_ends_with_escape() {
    let display = unescape(br#"\u00A2hello\t"#).display_utf8();
    assert_display(display, Ok("¢hello\t"));
}
2039
2040 // -- NON-LOSSY ERROR TESTS --
2041
/// Strict display is expected to fail on an unknown escape (`\z`).
#[test]
fn test_display_err_invalid_escape() {
    let display = unescape(br"hello \z world").display_utf8();
    assert_display(display, Err(()));
}
2046
/// Strict display is expected to fail when a `\u` escape has only three
/// hex digits before the input ends.
#[test]
fn test_display_err_incomplete_unicode() {
    let display = unescape(br"\u123").display_utf8();
    assert_display(display, Err(()));
}
2051
/// Strict display is expected to fail when a `\u` escape contains a
/// non-hex digit (`g`).
#[test]
fn test_display_err_invalid_hex_in_unicode() {
    let display = unescape(br"\u123g").display_utf8();
    assert_display(display, Err(()));
}
2056
/// Strict display is expected to fail on a high surrogate (`\uD800`) that
/// is the last thing in the input.
#[test]
fn test_display_err_lone_high_surrogate() {
    let display = unescape(br"\uD800").display_utf8();
    assert_display(display, Err(()));
}
2061
/// Strict display is expected to fail when a high surrogate (`\uD800`) is
/// followed by a `\u` escape that is not a low surrogate (`\uABCD`).
#[test]
fn test_display_err_high_surrogate_not_followed_by_low() {
    let display = unescape(br"\uD800\uABCD").display_utf8();
    assert_display(display, Err(()));
}
2066
/// Strict display is expected to fail when the raw input itself is not
/// valid UTF-8: here an ASCII 'h' is followed by the stray byte 0x80.
#[test]
fn test_display_err_invalid_source_utf8() {
    let display = unescape(b"h\x80ello").display_utf8();
    assert_display(display, Err(()));
}
2072
/// Strict display of a complete three-byte UTF-8 sequence is expected to
/// succeed: "€" is U+20AC, encoded as [0xE2, 0x82, 0xAC].
#[test]
fn strict_valid_multi_byte_split() {
    assert_display(unescape(&[0xE2, 0x82, 0xAC]).display_utf8(), Ok("€"));
}
2080
/// Strict display is expected to fail when the input begins with 0xFF,
/// here followed by an ASCII 'a'.
#[test]
fn strict_errors_on_invalid_start_byte() {
    assert_display(unescape(&[0xFF, b'a']).display_utf8(), Err(()));
}
2088
2089 // -- LOSSY TESTS --
2090
/// Lossy display is expected to substitute U+FFFD for the invalid leading
/// byte 0xFF and keep the ASCII 'a' that follows it.
#[test]
fn lossy_replaces_invalid_start_byte() {
    // Input: invalid byte, then ASCII 'a'. Output: replacement char + 'a'.
    assert_display(
        unescape(&[0xFF, b'a']).display_utf8_lossy(),
        Ok("\u{FFFD}a"),
    );
}
2099
/// Lossy display is expected to replace a truncated trailing sequence with
/// a single U+FFFD.
#[test]
fn lossy_handles_trailing_incomplete_bytes() {
    // First two bytes of a three-byte sequence; the final 0xAC is missing.
    let truncated: &[u8] = &[0xE2, 0x82];
    assert_display(
        unescape(truncated).display_utf8_lossy(),
        Ok("\u{FFFD}"),
    );
}
2108
/// Lossy display is expected to collapse the invalid byte run
/// `F0 90 80` in the middle of the input into one U+FFFD and keep the
/// surrounding text.
#[test]
fn test_display_lossy_invalid_source_utf8() {
    let display = unescape(b"valid\xF0\x90\x80invalid").display_utf8_lossy();
    assert_display(display, Ok("valid\u{FFFD}invalid"));
}
2116
/// In lossy mode an invalid JSON escape (`\z`) is expected to end the
/// output at that point without reporting a formatting error.
#[test]
fn test_display_lossy_invalid_escape_truncates() {
    let display = unescape(br"this is ok\z but this is not").display_utf8_lossy();
    assert_display(display, Ok("this is ok"));
}
2124
/// In lossy mode a trailing high surrogate (`\uD83D`) with nothing after it
/// is expected to end the output just before the escape, without a
/// formatting error.
#[test]
fn test_display_lossy_incomplete_unicode_truncates() {
    let display = unescape(br"truncate here \uD83D").display_utf8_lossy();
    assert_display(display, Ok("truncate here "));
}
2131
// Inspired by and copied from memchr
//
// Compile-time check: both iterator types must be `Send`, `Sync`,
// `UnwindSafe` and `RefUnwindSafe`. The test body does nothing at runtime;
// it fails to build if any of those bounds stops holding.
#[test]
fn sync_regression() {
    use core::panic::{RefUnwindSafe, UnwindSafe};

    fn require_auto_traits<T>()
    where
        T: Send + Sync + UnwindSafe + RefUnwindSafe,
    {
    }

    require_auto_traits::<Escape<'_>>();
    require_auto_traits::<Unescape<'_>>();
}
2141}