json_escape/lib.rs
1//! # Streaming JSON String Escape/Unescape
2//!
3//! Welcome to a highly efficient, `no_std` compatible library for handling JSON string escaping and unescaping. This crate provides iterator-based tools that process strings on the fly, avoiding heap allocations for the entire result. It's designed for performance-critical applications, such as parsing large JSON files or working in memory-constrained environments. ⚡
4//!
5//! The core of the library is two iterator structs:
6//! - **[`Escape`]**: Takes a string slice (`&str`) and yields escaped string slices ready for JSON serialization.
7//! - **[`Unescape`]**: Takes a byte slice (`&[u8]`) representing the content of a JSON string and yields the decoded byte slices.
8//!
9//! ## Key Features
10//! - **Zero-Copy Slicing**: For sequences of characters that don't need modification, the iterators yield slices that borrow directly from the input, avoiding unnecessary data copying.
11//! - **Comprehensive JSON Support**: Correctly handles all standard JSON escapes: `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`.
12//! - **Full Unicode Handling**: Correctly decodes `\uXXXX` sequences, including full support for UTF-16 surrogate pairs (e.g., `\uD83D\uDE00` for `😀`).
13//! - **Robust Error Handling**: The `Unescape` iterator returns descriptive errors (`UnescapeError`) for invalid or truncated escape sequences, making debugging straightforward.
14//! - **Allocation Control** (with `alloc` feature): Provides convenient methods to collect the iterator's output into owned types like `String` or `Cow<str>`.
15//! - **`std::io` Integration** (with `std` feature): The `Unescape` iterator implements `std::io::Read`, allowing it to be used as an efficient reader for I/O streams.
16//!
17//! ## Quick Start: Escaping a String
18//!
19//! ```
20//! use json_escape::escape_str;
21//!
22//! let input = "Hello, \"world\"!\nThis contains a \\ backslash.";
23//! let expected = r#"Hello, \"world\"!\nThis contains a \\ backslash."#;
24//!
25//! // The `escape_str` function returns an iterator.
26//! let mut escaper = escape_str(input);
27//!
28//! // You can iterate over the chunks:
29//! assert_eq!(escaper.next(), Some("Hello, "));
30//! assert_eq!(escaper.next(), Some(r#"\""#));
31//! assert_eq!(escaper.next(), Some("world"));
32//! // ...and so on.
33//!
34//! // Or, collect it into a String (requires the "alloc" feature).
35//! // let escaped_string: String = escape_str(input).collect();
36//! // assert_eq!(escaped_string, expected);
37//! ```
38//!
39//! ## Quick Start: Unescaping a String
40//!
41//! ```
42//! use json_escape::unescape;
43//!
44//! let input = r#"A 😀 emoji: \uD83D\uDE00 and a tab\t!"#;
45//!
46//! // The unescape iterator yields `Result<&[u8], _>`.
47//! let unescaper = unescape(input);
48//!
49//! // With the "alloc" feature, you can decode it directly into a string.
50//! let decoded_cow = unescaper.decode_utf8().unwrap();
51//! assert_eq!(decoded_cow, "A 😀 emoji: 😀 and a tab\t!");
52//! ```
53//!
54//! ## Performance and the `explicit` Module
55//!
56//! This crate is designed for high-performance, zero-allocation escaping and
57//! unescaping. For most use cases, the functions in this root module provide the
58//! best balance of ergonomics and speed.
59//!
60//! However, for users with extreme performance requirements, the [`explicit`]
61//! module is provided. Its iterators yield structured `Chunk` data instead of
62//! simple slices. As shown by benchmarks, this approach can be slightly faster,
63//! especially on inputs with a high density of escape sequences. If you are
64//! processing a very large volume of JSON strings in a tight loop, consider
65//! using the `explicit` module for a potential performance boost.
66#![no_std]
67#![deny(missing_docs)]
68#![cfg_attr(all(feature = "simd", nightly), feature(portable_simd))]
69
70#[cfg(any(test, feature = "std"))]
71extern crate std;
72
// `alloc` has to be linked under exactly the same conditions as the `use`
// below. Previously the `use` was also compiled for `cfg(test)` while the
// `extern crate` was not, so a test build without the `alloc` feature could
// not resolve the `alloc` crate.
#[cfg(any(test, feature = "alloc"))]
extern crate alloc;

#[cfg(any(test, feature = "alloc"))]
use alloc::{borrow::Cow, string::String, vec::Vec};
78
79use core::{
80 char,
81 fmt::{self, Write as _},
82 iter::FusedIterator,
83 slice, str,
84};
85use memchr::memchr;
86
87pub mod explicit;
88
89// =============================================================================
90// Escape Implementation
91// =============================================================================
92
93/// Creates a streaming JSON string escaper from a string slice.
94///
95/// The returned [`Escape`] iterator lazily processes the input string, yielding
96/// slices that represent the escaped output.
97///
98/// # Examples
99///
100/// ```
101/// use json_escape::escape_str;
102///
103/// let escaper = escape_str("a\nb");
104/// let escaped_parts: Vec<_> = escaper.collect();
105///
106/// assert_eq!(escaped_parts, vec!["a", r#"\n"#, "b"]);
107/// ```
108#[inline]
109pub fn escape_str(input: &str) -> Escape<'_> {
110 Escape {
111 bytes: input.as_bytes(),
112 }
113}
114
/// A lazy JSON string escaper whose items are `&'a str` slices.
///
/// Values of this type come from [`escape_str`]. As an [`Iterator`], it cuts
/// the input at every character that JSON requires to be escaped:
///
/// - A run of characters that need no escaping is yielded as one slice
///   borrowed from the input (`&'a str`).
/// - Every character that must be escaped is yielded as a `'static` slice
///   holding its escaped form (e.g., `r#"\n"#`).
///
/// Because the output is produced piece by piece, no string is allocated for
/// the escaped result as a whole.
///
/// ### Implemented Traits
/// - **`Iterator<Item = &'a str>`**: Consume the escaped pieces in a loop or through adapters.
/// - **`Display`**: Write the escaped text straight into any formatter (e.g. `println!`) with no intermediate allocation.
/// - **`Clone`**, **`Debug`**: Standard utility traits.
/// - **`PartialEq`**, **`PartialEq<B: AsRef<[u8]>>`**: Compare by escaped output. An `Escape` equals another `Escape` or a byte slice when both produce the same sequence of escaped bytes.
/// - **`From<Escape<'a>> for Cow<'a, str>`** (requires `alloc` feature): Convert into a string that is only allocated when escaping was actually needed.
#[derive(Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct Escape<'a> {
    // The part of the input that has not been yielded yet.
    bytes: &'a [u8],
}
139
140impl<'a> Iterator for Escape<'a> {
141 type Item = &'a str;
142
143 #[inline]
144 fn next(&mut self) -> Option<&'a str> {
145 if self.bytes.is_empty() {
146 return None;
147 }
148
149 // Find the first byte that needs escaping.
150 let pos = find_escape_char(self.bytes);
151
152 match pos {
153 // No escapable characters left; return the rest of the slice.
154 None => {
155 let s = self.bytes;
156 self.bytes = &[];
157 // SAFETY: The input was a valid &str, and we're returning the
158 // whole remaining chunk, so it's still valid UTF-8.
159 Some(unsafe { str::from_utf8_unchecked(s) })
160 }
161 // An escapable byte is at the beginning of the slice.
162 Some(0) => {
163 let byte = self.bytes[0];
164 self.bytes = &self.bytes[1..];
165 // The table lookup gives us a &'static str, which is a valid &'a str.
166 //
167 // Some(....unwrap()) is more correct
168 ESCAPE_TABLE[byte as usize]
169 }
170 // Found an escapable byte after a safe prefix. Return the prefix.
171 Some(p) => {
172 let (prefix, rest) = self.bytes.split_at(p);
173 self.bytes = rest;
174 // SAFETY: The soundness of this operation is critical.
175 // We are splitting the byte slice at the position of the first
176 // character that requires escaping. All JSON characters that
177 // require escaping (`"`, `\`, and control characters `\u0000`-`\u001F`)
178 // are single-byte ASCII characters. Therefore, `p` is guaranteed
179 // to be on a valid UTF-8 character boundary.
180 Some(unsafe { str::from_utf8_unchecked(prefix) })
181 }
182 }
183 }
184
185 fn size_hint(&self) -> (usize, Option<usize>) {
186 if self.bytes.is_empty() {
187 (0, Some(0))
188 } else {
189 // We'll yield at least 1 slice, and at most `len` slices if every byte is escaped.
190 (1, Some(self.bytes.len()))
191 }
192 }
193}
194
195impl<'a> FusedIterator for Escape<'a> {}
196
197impl fmt::Display for Escape<'_> {
198 /// Allows direct formatting of the escaped string without intermediate allocation.
199 ///
200 /// This is very useful for writing the escaped output directly to a stream,
201 /// such as a file or a network socket.
202 ///
203 /// # Example
204 ///
205 /// ```
206 /// use json_escape::escape_str;
207 ///
208 /// let escaper = escape_str("User said: \"Hi!\"\n");
209 /// let formatted = format!("{}", escaper);
210 ///
211 /// assert_eq!(formatted, r#"User said: \"Hi!\"\n"#);
212 /// ```
213 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
214 // The `clone()` is cheap as it only copies a slice reference.
215 for s in self.clone() {
216 f.write_str(s)?
217 }
218 Ok(())
219 }
220}
221
222impl fmt::Debug for Escape<'_> {
223 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
224 f.debug_struct("Escape").finish_non_exhaustive()
225 }
226}
227
228impl<B: AsRef<[u8]> + ?Sized> PartialEq<B> for Escape<'_> {
229 /// Compares the escaped output with any byte-slice-like object.
230 ///
231 /// This is primarily a convenience for testing, allowing you to check the
232 /// fully concatenated result of an `Escape` iterator against a known `&str` or `&[u8]`.
233 ///
234 /// The notion of equality is based on the **output**, not the iterator's internal state.
235 ///
236 /// # Example
237 ///
238 /// ```
239 /// use json_escape::escape_str;
240 ///
241 /// let escaper = escape_str("key\tvalue");
242 ///
243 /// // The escaper's output, when concatenated, equals the right-hand side.
244 /// assert_eq!(escaper, r#"key\tvalue"#);
245 /// ```
246 fn eq(&self, other: &B) -> bool {
247 let mut other = other.as_ref();
248 for chunk in self.clone() {
249 if !other.starts_with(chunk.as_bytes()) {
250 return false;
251 }
252 other = &other[chunk.len()..];
253 }
254 // We completely searched it
255 other.is_empty()
256 }
257}
258
259impl<'a, 'b> PartialEq<Escape<'a>> for Escape<'b> {
260 /// Compares two `Escape` iterators for equality.
261 ///
262 /// Two `Escape` iterators are considered equal if they'll produce the same **output**.
263 /// It first performs a fast check on the underlying byte slices.
264 fn eq(&self, other: &Escape<'a>) -> bool {
265 // Fast path: if they are views into the same underlying data.
266 self.bytes == other.bytes || chunks_eq(self.clone(), other.clone())
267 }
268}
269
#[cfg(feature = "alloc")]
impl<'a> From<Escape<'a>> for Cow<'a, str> {
    /// Gathers the escaped pieces into a `Cow<'a, str>`, allocating only if needed.
    ///
    /// - When the iterator yields at most one piece, that piece (or the empty
    ///   string) is returned as `Cow::Borrowed`. This is the case for input that
    ///   requires **no escaping**, which comes back as a slice of the original.
    /// - Otherwise a `String` is allocated and returned as `Cow::Owned`.
    ///
    /// Unlike `iter.collect::<String>()`, this skips the allocation whenever
    /// the result can be borrowed.
    ///
    /// **Requires the `alloc` feature.**
    ///
    /// # Example
    ///
    /// ```
    /// # #[cfg(feature = "alloc")] {
    /// use json_escape::escape_str;
    /// use std::borrow::Cow;
    ///
    /// // No escaping needed, so no allocation occurs.
    /// let cow_borrowed: Cow<str> = escape_str("plain text").into();
    /// assert!(matches!(cow_borrowed, Cow::Borrowed(_)));
    ///
    /// // Escaping is required, so a new String is allocated.
    /// let cow_owned: Cow<str> = escape_str("text with\nnewline").into();
    /// assert!(matches!(cow_owned, Cow::Owned(_)));
    /// assert_eq!(cow_owned, r#"text with\nnewline"#);
    /// # }
    /// ```
    fn from(mut iter: Escape<'a>) -> Self {
        // Zero pieces: empty input.
        let Some(head) = iter.next() else {
            return Cow::Borrowed("");
        };
        // Exactly one piece: hand it back without allocating.
        let Some(second) = iter.next() else {
            return Cow::Borrowed(head);
        };

        // Two or more pieces: build an owned string. The capacity covers the
        // two pieces already taken plus the length of the unprocessed input.
        let capacity = head.len() + second.len() + iter.bytes.len();
        let mut owned = String::with_capacity(capacity);
        owned.push_str(head);
        owned.push_str(second);
        for piece in iter {
            owned.push_str(piece);
        }
        Cow::Owned(owned)
    }
}
318
319// =============================================================================
320// Unescape Implementation
321// =============================================================================
322
323/// Creates a streaming JSON string unescaper from a byte slice.
324///
325/// This function creates an iterator to unescape a byte slice representing the
326/// **raw contents** of a JSON string, assuming the outer quotes have already
327/// been removed.
328///
329/// For a more convenient way to handle complete JSON string literals (including
330/// their surrounding `"` quotes), see the [`unescape_quoted`] function, which
331/// automatically trims them.
332///
333/// The iterator will fail if the input contains invalid JSON escape sequences.
334///
335/// # Example
336///
337/// ```
338/// use json_escape::{unescape, unescape_quoted};
339///
340/// // `unescape` works on the raw content, without quotes.
341/// let content = r#"hello\tworld"#;
342/// assert_eq!(unescape(content), "hello\tworld");
343///
344/// // If you pass a full JSON literal, the quotes are treated as literal characters.
345/// let literal = r#""hello\tworld""#;
346/// assert_eq!(unescape(literal), "\"hello\tworld\""); // Note the quotes in the output.
347///
348/// // For full literals like this, `unescape_quoted` is the recommended function.
349/// assert_eq!(unescape_quoted(literal), "hello\tworld");
350/// ```
351#[inline]
352pub fn unescape<I: AsRef<[u8]> + ?Sized>(input: &I) -> Unescape<'_> {
353 Unescape::new(input.as_ref())
354}
355
356/// Creates a streaming JSON string unescaper, trimming enclosing quotes.
357///
358/// This function acts as a convenience wrapper around [`unescape`]. It first
359/// inspects the input byte slice. If the slice begins and ends with a double-quote
360/// character (`"`), these quotes are trimmed before the inner content is passed to
361/// the unescaper.
362///
363/// If the input is not enclosed in quotes, this function behaves exactly like
364/// [`unescape`]. This is useful for directly unescaping a complete JSON string
365/// literal.
366///
367/// # Example
368///
369/// ```
370/// use json_escape::{unescape, unescape_quoted};
371///
372/// // 1. With quotes: The outer quotes are trimmed before unescaping.
373/// let unescaper = unescape_quoted(r#""hello\nworld""#);
374/// assert_eq!(unescaper, b"hello\nworld");
375///
376/// // 2. Without quotes: Behaves exactly like the standard `unescape`.
377/// let unescaper_no_quotes = unescape_quoted(r#"raw string"#);
378/// assert_eq!(unescaper_no_quotes, b"raw string");
379///
380/// // 3. Mismatched quotes: The input is passed through as-is, quotes are not trimmed.
381/// let mismatched_quotes = unescape_quoted(r#"hello""#);
382/// assert_eq!(mismatched_quotes, b"hello\"");
383///
384/// // 4. Empty quoted string: Correctly results in an empty output.
385/// let empty_quoted = unescape_quoted(r#""""#);
386/// assert_eq!(empty_quoted, b"");
387/// ```
388#[inline]
389pub fn unescape_quoted<I: AsRef<[u8]> + ?Sized>(input: &I) -> Unescape<'_> {
390 let bytes = input.as_ref();
391 let input = if bytes.len() >= 2 && bytes[0] == b'\"' && bytes[bytes.len() - 1] == b'\"' {
392 &bytes[1..bytes.len() - 1]
393 } else {
394 bytes
395 };
396
397 unescape(input)
398}
399
400/// A streaming JSON string unescaper.
401///
402/// This struct is created by the [`unescape`] function. It implements an [`Iterator`]
403/// that yields `Result<&'a [u8], UnescapeError>`, lazily decoding the input.
404///
405/// The iterator's output chunks are one of the following:
406/// - **`Ok(&'a [u8])`**: A borrowed slice of the original input for a sequence of non-escaped bytes.
407/// - **`Ok(&'static [u8])`**: A single-byte slice for a decoded escape sequence (e.g., `\n` becomes a slice containing `0x0A`).
408/// For `\uXXXX` sequences, it yields a series of single-byte slices representing the UTF-8 encoding of the character.
409/// - **`Err(UnescapeError)`**: An error indicating an invalid escape sequence, which halts further iteration as described below.
410///
411/// Because the iterator operates on bytes, you can use helper methods like
412/// [`Unescape::decode_utf8`] or [`Unescape::decode_utf8_lossy`] to convert the
413/// final result into a string.
414///
415/// # Error Handling
416///
417/// When the iterator encounters an invalid or incomplete escape, it returns an
418/// `Err(UnescapeError)` describing the problem. The iterator then remains in an
419/// **error state**: subsequent calls to `next()` will continue to return that same
420/// error (i.e., the error is idempotent) and the iterator will not produce further
421/// `Ok` chunks. This makes the behavior deterministic for callers that check the
422/// first error and then stop.
423///
424/// Errors are classified by the precise condition encountered:
425/// - **`InvalidEscape`**: The escape sequence uses an unknown escape character (e.g., `\q`).
426/// - **`InvalidHex`**: A `\u` escape contains a non-hex character where a hex
427/// digit was expected (e.g., `\uZ`).
428/// - **`UnexpectedEof`**: The input ended before a complete escape sequence could be
429/// read. This is used when there isn't enough input yet to decide whether the
430/// sequence would be valid (for instance, an incomplete `\u` or a truncated
431/// surrogate pair).
432/// - **`LoneSurrogate`**: A complete `\uXXXX` was read, and it encodes a *high*
433/// surrogate, but the following bytes definitively do not form a valid low
434/// surrogate escape (for example, the next character is a space or any
435/// non-`\u` character).
436///
437/// The difference between `UnexpectedEof` and `LoneSurrogate` is important:
438/// - `UnexpectedEof` means **we couldn't decide** because the input ended too early.
439/// - `LoneSurrogate` means **we did decide**—we saw a full `\uXXXX` high surrogate,
440/// and the following input proves a pair will not follow.
441///
442/// #### Concrete examples
443///
444/// 1) A high surrogate followed by other data (not a `\u` low-surrogate) → `LoneSurrogate`:
445///
446/// ```rust
447/// use json_escape::{unescape, UnescapeErrorKind, LoneSurrogateError};
448///
449/// let mut iter = unescape(r"\uD83D more data");
450/// let err = iter.next().unwrap().unwrap_err();
451/// assert!(matches!(err.kind(), UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: 0xD83D, .. })));
452///
453/// // Subsequent calls return the same error (iterator remains in the same error state).
454/// let err = iter.next().unwrap().unwrap_err();
455/// assert!(matches!(err.kind(), UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: 0xD83D, .. })));
456/// ```
457///
458/// 2) An invalid escape character → `InvalidEscape`:
459///
460/// ```rust
461/// use json_escape::{unescape, UnescapeErrorKind, InvalidEscapeError};
462///
463/// let mut iter = unescape(r"\q"); // `\q` is not a defined escape
464/// let err = iter.next().unwrap().unwrap_err();
465/// assert!(matches!(err.kind(), UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'q', .. })));
466/// ```
467///
468/// 3) A malformed `\u` with a non-hex character → `InvalidHex`:
469///
470/// ```rust
471/// use json_escape::{unescape, UnescapeErrorKind, InvalidHexError};
472///
473/// let mut iter = unescape(r"\uZ");
474/// let err = iter.next().unwrap().unwrap_err();
475/// assert!(matches!(err.kind(), UnescapeErrorKind::InvalidHex(InvalidHexError { found: b'Z', .. })));
476/// ```
477///
478/// 4) Truncated / incomplete input ⇒ `UnexpectedEof`:
479///
480/// ```rust
481/// use json_escape::{unescape, UnescapeErrorKind};
482///
483/// // a) truncated after the first \uXXXX (no following bytes yet)
484/// let mut iter = unescape(r"\uD83D");
485/// let err = iter.next().unwrap().unwrap_err();
486/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
487///
488/// // b) starts a second \u but is truncated before hex digits
489/// let mut iter = unescape(r"\uD83D\u");
490/// let err = iter.next().unwrap().unwrap_err();
491/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
492///
493/// // c) a lone backslash at end of input
494/// let mut iter = unescape("\\");
495/// let err = iter.next().unwrap().unwrap_err();
496/// assert!(matches!(err.kind(), UnescapeErrorKind::UnexpectedEof));
497/// ```
498///
499/// **Note**: This behavior intentionally mirrors common JSON parsers (e.g.,
500/// `serde_json`, Go's `encoding/json`) for the EOF vs. semantic error distinction.
501///
502/// # Implemented Traits and Usage
503///
504/// - **`Iterator<Item = Result<&'a [u8], UnescapeError>>`**: The core trait for
505/// processing the unescaped byte chunks.
506/// - **`std::io::Read`** (requires `std` feature): Lets you use the unescaper as a
507/// standard reader, perfect for integrating with other I/O APIs.
508/// - **`TryFrom<Unescape<'a>> for Cow<'a, [u8]>`** (requires `alloc` feature): An
509/// efficient way to collect the unescaped bytes, propagating any errors.
510/// - **`Clone`**, **`Debug`**: Standard utility traits.
511/// - **`PartialEq<B: AsRef<[u8]>>`**: Compares the fully unescaped output with a byte slice.
512///
513/// ## Reading Unescaped Bytes
514///
515/// With the `std` feature, `Unescape` can be used as any other `std::io::Read`
516/// source. This is ideal for streaming and decoding large JSON string contents
517/// without buffering the entire result in memory first.
518///
519/// ```rust
520/// # #[cfg(feature = "std")] {
521/// use json_escape::unescape;
522/// use std::io::Read;
523///
524/// let mut reader = unescape(r#"chunk1\nchunk2"#);
525/// let mut buf = Vec::new();
526///
527/// // Read all unescaped bytes from the iterator into the buffer.
528/// reader.read_to_end(&mut buf).unwrap();
529///
530/// assert_eq!(buf, b"chunk1\nchunk2");
531/// # }
532/// ```
#[derive(Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct Unescape<'a> {
    // Input that has not been consumed yet. A `slice::Iter` is used because it
    // is cheap to clone, which lets the parser look ahead and only commit the
    // new position once an escape has been decoded successfully.
    bytes: slice::Iter<'a, u8>,

    // Scratch space for the UTF-8 encoding of the most recently decoded
    // `\uXXXX` escape (or surrogate pair).
    unicode: [u8; 4],
    // Number of valid bytes in `unicode` (0 means nothing has been stored).
    // This could be derived from the UTF-8 leading byte instead of stored.
    unicode_len: u8,
    // Number of bytes of `unicode` that have already been emitted.
    unicode_pos: u8,
}
546
547impl<'a> Unescape<'a> {
548 /// Construct from a byte slice which contains the characters inside the JSON string (no quotes).
549 fn new(input: &'a [u8]) -> Self {
550 Self {
551 bytes: input.iter(),
552 unicode: [0; 4],
553 unicode_len: 0,
554 unicode_pos: 0,
555 }
556 }
557
558 /// Helper: parse exactly 4 hex digits from `it`. Returns Ok(u16) or an error.
559 #[inline(always)]
560 fn parse_hex4(iter: &mut slice::Iter<'a, u8>, base_offset: u8) -> Result<u16, UnescapeError> {
561 let mut acc = 0u16;
562 for i in 0..4 {
563 let b = match iter.next() {
564 Some(b) => *b,
565 None => {
566 return Err(UnescapeError {
567 kind: UnescapeErrorKind::UnexpectedEof,
568 // The error occurs where the next digit was expected.
569 offset: base_offset + i,
570 });
571 }
572 };
573 let v = match b {
574 b'0'..=b'9' => (b - b'0') as u16,
575 b'a'..=b'f' => (b - b'a' + 10) as u16,
576 b'A'..=b'F' => (b - b'A' + 10) as u16,
577 _ => {
578 return Err(UnescapeError {
579 kind: UnescapeErrorKind::InvalidHex(InvalidHexError { found: b }),
580 // The error is the invalid digit itself.
581 offset: base_offset + i,
582 });
583 }
584 };
585 acc = (acc << 4) | v;
586 }
587 Ok(acc)
588 }
589
590 /// Parses a unicode escape sequence `\uXXXX` which may be a surrogate pair.
591 /// The iterator `bytes` must be positioned *after* the `\u`.
592 ///
593 /// NOTE: Doesn't preserve the state of the iterator on error
594 #[inline(always)]
595 fn handle_unicode_escape(bytes: &mut slice::Iter<'a, u8>) -> Result<char, UnescapeError> {
596 // Parse first 4 hex digits (\uXXXX)
597 //
598 // The iterator starts *after* '\u'. The first hex digit is at offset 2 from '\'.
599 let first = Self::parse_hex4(bytes, 2)?;
600
601 // High surrogate → must be followed by another \uXXXX low surrogate
602 if (0xD800..=0xDBFF).contains(&first) {
603 match (bytes.next(), bytes.next()) {
604 (Some(b'\\'), Some(b'u')) => {
605 // Try parsing the low surrogate
606 //
607 // The first hex digit of the second escape is at offset 8.
608 // (\uXXXX\u -> 8 chars)
609 match Self::parse_hex4(bytes, 8) {
610 Ok(low) if (0xDC00..=0xDFFF).contains(&low) => {
611 let high_t = first as u32;
612 let low_t = low as u32;
613 let code = 0x10000 + (((high_t - 0xD800) << 10) | (low_t - 0xDC00));
614 return Ok(char::from_u32(code).expect(
615 "valid surrogate pair math should always produce a valid char",
616 ));
617 }
618 Ok(_) => {
619 // Got a full escape but not a low surrogate → Lone surrogate
620 return Err(UnescapeError {
621 kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError {
622 surrogate: first,
623 }),
624 offset: 6,
625 });
626 }
627 Err(err) => {
628 // parse_hex4 failed (e.g. ran out of hex digits)
629 return Err(err);
630 }
631 }
632 }
633 // EOF before even seeing '\' or 'u' → UnexpectedEof
634 (None, _) | (_, None) => {
635 return Err(UnescapeError {
636 kind: UnescapeErrorKind::UnexpectedEof,
637 offset: 6,
638 });
639 }
640 // Something else after high surrogate → LoneSurrogate
641 _ => {
642 return Err(UnescapeError {
643 kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError {
644 surrogate: first,
645 }),
646 // The error is detected after consuming `\uXXXX` (6 bytes).
647 offset: 6,
648 });
649 }
650 }
651 }
652
653 // Not a surrogate → normal path
654 match char::from_u32(first as u32) {
655 Some(c) => Ok(c),
656 None => Err(UnescapeError {
657 kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate: first }),
658 // The error is detected after consuming `\uXXXX` (6 bytes).
659 offset: 6,
660 }),
661 }
662 }
663
664 #[inline]
665 fn store_unicode(&mut self, ch: char) {
666 self.unicode_len = ch.encode_utf8(&mut self.unicode).len() as u8;
667 self.unicode_pos = 0;
668 }
669
670 #[inline]
671 fn emit_pending_byte(&mut self) -> Option<u8> {
672 if self.unicode_pos < self.unicode_len {
673 let b = self.unicode[self.unicode_pos as usize];
674 self.unicode_pos += 1;
675 Some(b)
676 } else {
677 None
678 }
679 }
680
681 /// Helper to emit the full unicode sequence and advance the internal position.
682 #[inline]
683 fn emit_unicode_as_str(&mut self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
684 // The check `unicode_pos > 0` is implicit from the call site.
685 // The buffer is guaranteed to contain a valid UTF-8 sequence.
686 let s = unsafe { str::from_utf8_unchecked(&self.unicode[..self.unicode_len as usize]) };
687 f.write_str(s)?;
688
689 // Mark the entire sequence as emitted.
690 self.unicode_pos = self.unicode_len;
691
692 Ok(())
693 }
694
695 /// The single, authoritative helper for producing unescaped byte chunks.
696 ///
697 /// It takes an optional `max` length to limit the size of the returned slice,
698 /// which is essential for the `std::io::Read` implementation.
699 #[inline(always)]
700 fn next_limit(&mut self, limit: Option<usize>) -> Option<Result<&'a [u8], UnescapeError>> {
701 if limit.is_some_and(|l| l == 0) {
702 return Some(Ok(&[]));
703 }
704
705 // If we have pending bytes, emit them first (fast).
706 //
707 // LIMIT: We're allowed not checking here since we'll only produce 1 byte
708 // and limit is at least 1.
709 if let Some(s) = self.emit_pending_byte() {
710 // s: &'static [u8] coerces to &'a [u8]
711 return Some(Ok(byte_as_static_slice(s)));
712 }
713
714 let bytes = self.bytes.as_slice();
715 if bytes.is_empty() {
716 return None;
717 }
718
719 // Find next backslash in the remaining bytes.
720 let pos = memchr(b'\\', bytes);
721
722 match pos {
723 None => {
724 // No more escapes. Return the rest of the slice as a borrowed chunk.
725 let chunk_len = bytes.len().min(limit.unwrap_or(bytes.len()));
726 let (chunk, rest) = bytes.split_at(chunk_len);
727 self.bytes = rest.iter();
728 Some(Ok(chunk))
729 }
730 // LIMIT: We're allowed not checking here since we'll only produce 1 byte
731 // and limit is at least 1.
732 Some(0) => {
733 // We need to parse 4 hex digits from the iterator. But because
734 // `bytes` implements `Clone`, we can clone it to peek ahead
735 // in order to preserve the state of the iterator on failure.
736 let mut lookahead = self.bytes.clone();
737 // Backslash is the first byte in the slice: handle escape
738 lookahead.next(); // Consume the backslash
739
740 match lookahead.next() {
741 Some(b'u') => match Self::handle_unicode_escape(&mut lookahead) {
742 Ok(ch) => {
743 self.bytes = lookahead; // commit
744 self.store_unicode(ch);
745 self.emit_pending_byte()
746 .map(|b| Ok(byte_as_static_slice(b)))
747 }
748 Err(err) => Some(Err(err)),
749 },
750 Some(byte) => {
751 if let Some(slice) = UNESCAPE_TABLE[*byte as usize] {
752 self.bytes = lookahead; // commit
753 Some(Ok(slice))
754 } else {
755 Some(Err(UnescapeError {
756 kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError {
757 found: *byte,
758 }),
759 // The invalid character is 1 byte after '\'.
760 offset: 1,
761 }))
762 }
763 }
764 None => Some(Err(UnescapeError {
765 kind: UnescapeErrorKind::UnexpectedEof,
766 // EOF occurred 1 byte after '\'.
767 offset: 1,
768 })),
769 }
770 }
771 // Found \ after a safe prefix. Return the prefix. We'll handle on next call to next
772 Some(p) => {
773 // Return the safe prefix (borrowed from input)
774 let chunk_len = p.min(limit.unwrap_or(p));
775 let (chunk, rest) = bytes.split_at(chunk_len);
776 self.bytes = rest.iter();
777 Some(Ok(chunk))
778 }
779 }
780 }
781
782 fn _display_utf8(mut self, f: &mut fmt::Formatter<'_>, lossy: bool) -> fmt::Result {
783 // The key insight: Chunks with more than one byte are *always*
784 // borrowed from the original input, as all escaped characters
785 // are yielded byte-by-byte.
786 while let Some(result) = self.next() {
787 match result {
788 Ok(chunk) => {
789 if chunk.is_empty() {
790 continue;
791 }
792
793 // THE CORE LOGIC:
794 // Check if the iterator just yielded the *first byte* of a *multi-byte* sequence.
795 // - `unicode_pos == 1` means the first byte was just emitted.
796 // - `unicode_len > 1` means it's a multi-byte char (e.g., '¢', '😎').
797 if self.unicode_pos == 1 && self.unicode_len > 1 {
798 // This is our special case. We have the first byte in `chunk`, but
799 // it's more efficient to write the whole character at once from our buffer.
800 self.emit_unicode_as_str(f)?;
801 // The iterator will no longer yield the rest of the bytes. Since our helper
802 // has now advanced it. But to be sure...
803 self.unicode_pos = self.unicode_len;
804 } else {
805 // This is the normal case:
806 // 1. A large chunk borrowed from the original input.
807 // 2. A single-byte escape like `\n` or `\t`.
808 // 3. The last byte of a multi-byte sequence (or the only byte).
809 // In all these cases, we just need to display the chunk we received.
810 display_bytes_utf8(chunk, f, lossy)?;
811 }
812 }
813 Err(_) => {
814 if lossy {
815 break;
816 } else {
817 return Err(fmt::Error);
818 }
819 }
820 }
821 }
822
823 Ok(())
824 }
825
/// Collects the unescaped bytes and validates them as UTF-8.
///
/// The iterator is consumed. The first unescaping error is reported as
/// [`DecodeUtf8Error::Unescape`]; if unescaping succeeds but the bytes are not
/// valid UTF-8, [`DecodeUtf8Error::Utf8`] is returned instead.
///
/// When the byte collection step borrows from the input (no escapes were
/// present), the result is a `Cow::Borrowed` and nothing is allocated.
///
/// **Requires the `alloc` feature.**
///
/// # Example
///
/// ```
/// # #[cfg(feature = "alloc")] {
/// use json_escape::unescape;
///
/// let input = r#"Emoji: \uD83D\uDE00"#;
/// let cow = unescape(input).decode_utf8().unwrap();
///
/// assert_eq!(cow, "Emoji: 😀");
/// # }
/// ```
#[cfg(feature = "alloc")]
pub fn decode_utf8(self) -> Result<Cow<'a, str>, DecodeUtf8Error> {
    // Step 1: unescape into bytes (borrowed when possible).
    let bytes: Cow<'a, [u8]> = self.try_into().map_err(DecodeUtf8Error::Unescape)?;

    // Step 2: validate as UTF-8, keeping the borrowed/owned distinction.
    let text = match bytes {
        Cow::Borrowed(slice) => {
            Cow::Borrowed(str::from_utf8(slice).map_err(DecodeUtf8Error::Utf8)?)
        }
        Cow::Owned(buffer) => Cow::Owned(
            String::from_utf8(buffer).map_err(|err| DecodeUtf8Error::Utf8(err.utf8_error()))?,
        ),
    };

    Ok(text)
}
860
/// Collects the unescaped bytes and converts them to a string, substituting
/// U+FFFD for any invalid UTF-8.
///
/// This is the lossy sibling of [`Unescape::decode_utf8`]: UTF-8 problems never
/// produce an error. Invalid JSON escapes still do, and are returned as an
/// `UnescapeError`.
///
/// **Requires the `alloc` feature.**
#[cfg(feature = "alloc")]
pub fn decode_utf8_lossy(self) -> Result<Cow<'a, str>, UnescapeError> {
    let bytes: Cow<'a, [u8]> = self.try_into()?;
    Ok(decode_utf8_lossy(bytes))
}
873
/// Returns a wrapper that implements [`fmt::Display`].
///
/// This allows an `Unescape` iterator to be used directly with formatting
/// macros like `println!`, `format!`, etc. It writes the unescaped content
/// directly to the formatter's buffer, **avoiding any heap allocations**.
///
/// The iterator is moved into the returned wrapper; each time the wrapper is
/// formatted it works on a clone of that iterator. Any invalid JSON escape
/// sequences or invalid UTF-8 will cause a `fmt::Error`. **You should be
/// cautious when using this method with the `format!` macro, as a `fmt::Error`
/// from us will cause the macro to panic**.
///
/// For a more robust alternative that will not panic on `UnescapeError` or
/// invalid bytes, consider using [`Unescape::display_utf8_lossy`] instead.
///
/// This method is a **zero-allocation** alternative to [`Unescape::decode_utf8`],
/// which might allocate a `String` to return the unescaped content.
///
/// # Example
///
/// ```
/// use json_escape::unescape;
///
/// let original = r#"Hello, \uD83C\uDF0E!"#;
/// let unescaper = unescape(original);
///
/// let formatted = format!("{}", unescaper.display_utf8());
/// assert_eq!(formatted, "Hello, 🌎!");
/// ```
pub fn display_utf8(self) -> DisplayUnescape<'a> {
    DisplayUnescape { inner: self }
}
906
/// Returns a wrapper that implements [`fmt::Display`] lossily.
///
/// This method is an **allocation-free** way to write unescaped content
/// to a formatter. It handles invalid JSON escape sequences and invalid
/// UTF-8 gracefully, making it a "lossy" operation.
///
/// - **Invalid JSON escape sequences:** Instead of causing an error, output
///   stops at the first invalid escape and formatting still succeeds; anything
///   after that point is not written.
/// - **Invalid UTF-8 bytes:** These are replaced with the Unicode
///   replacement character (U+FFFD).
///
/// This method is the **zero-allocation** counterpart to [`Unescape::decode_utf8_lossy`].
pub fn display_utf8_lossy(self) -> DisplayUnescapeLossy<'a> {
    DisplayUnescapeLossy { inner: self }
}
922}
923
924impl<'a> Iterator for Unescape<'a> {
925 type Item = Result<&'a [u8], UnescapeError>;
926
927 fn next(&mut self) -> Option<Self::Item> {
928 self.next_limit(None)
929 }
930
931 fn size_hint(&self) -> (usize, Option<usize>) {
932 // The minimum size is 0 (if the rest of the string is an invalid escape).
933 // The maximum size is the remaining length of the underlying bytes + pending_unicode
934 let (lower, upper) = self.bytes.size_hint();
935 let upper = upper.map(|x| x + (self.unicode_len as usize));
936 // Worst-case is \uXXXX -> 1 byte, so 6 -> 1.
937 (lower.saturating_add(1) / 6, upper)
938 }
939}
940
// Marker impl: promises that once `next()` has returned `None`, every later call
// returns `None` as well, so callers never need to wrap this iterator in `.fuse()`.
impl<'a> FusedIterator for Unescape<'a> {}
942
#[cfg(feature = "std")]
impl std::io::Read for Unescape<'_> {
    /// Fills `buf` with unescaped bytes until it is full or the input runs out.
    ///
    /// Returns the number of bytes written. An unescape error is surfaced as an
    /// `io::Error` of kind `InvalidData` wrapping the `UnescapeError`.
    fn read(&mut self, mut buf: &mut [u8]) -> std::io::Result<usize> {
        let capacity = buf.len();

        while !buf.is_empty() {
            // Ask for at most as many bytes as still fit.
            let Some(item) = self.next_limit(Some(buf.len())) else {
                // Input exhausted.
                break;
            };
            let chunk = item
                .map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))?;

            // `next_limit` keeps `chunk.len() <= buf.len()`.
            let len = chunk.len();
            buf[..len].copy_from_slice(chunk);
            buf = &mut buf[len..];
        }

        // Whatever is no longer free in `buf` has been written.
        Ok(capacity - buf.len())
    }

    /// Appends all remaining unescaped bytes to `buf`, chunk by chunk, and
    /// returns how many bytes were appended.
    fn read_to_end(&mut self, buf: &mut Vec<u8>) -> std::io::Result<usize> {
        let before = buf.len();

        self.try_for_each(|item| item.map(|chunk| buf.extend_from_slice(chunk)))
            .map_err(|err| std::io::Error::new(std::io::ErrorKind::InvalidData, err))?;

        Ok(buf.len() - before)
    }
}
988
impl fmt::Debug for Unescape<'_> {
    /// Writes `Unescape { .. }`: only the type name, with no field values.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `finish_non_exhaustive` emits the `..` marker that signals hidden fields.
        f.debug_struct("Unescape").finish_non_exhaustive()
    }
}
994
995impl<B: AsRef<[u8]> + ?Sized> PartialEq<B> for Unescape<'_> {
996 /// Compares the unescaped output with a byte-slice-like object.
997 ///
998 /// An `Unescape` iterator is considered equal to a byte slice if it successfully
999 /// unescapes to produce a sequence of bytes identical to that slice. If the
1000 /// iterator would produce an error, the comparison returns `false`.
1001 ///
1002 /// # Example
1003 ///
1004 /// ```
1005 /// use json_escape::unescape;
1006 ///
1007 /// let unescaper = unescape(r#"hello\nworld"#);
1008 /// assert_eq!(unescaper, b"hello\nworld");
1009 ///
1010 /// // An iterator that produces an error is not equal to any valid slice.
1011 /// let failing_unescaper = unescape(r#"\k"#);
1012 /// assert_ne!(failing_unescaper, b"k");
1013 /// ```
1014 fn eq(&self, other: &B) -> bool {
1015 let mut other = other.as_ref();
1016 for result in self.clone() {
1017 match result {
1018 Ok(chunk) => {
1019 if !other.starts_with(chunk) {
1020 return false;
1021 }
1022 other = &other[chunk.len()..];
1023 }
1024 Err(_) => return false, // An erroring iterator cannot be equal to a valid slice.
1025 }
1026 }
1027 other.is_empty()
1028 }
1029}
1030
1031impl<B: AsRef<[u8]>> PartialEq<Unescape<'_>> for Result<B, UnescapeError> {
1032 /// Compares the unescaper's outcome with a `Result`.
1033 ///
1034 /// This implementation allows for precise testing of the `Unescape` iterator
1035 /// by comparing it against either a successful outcome (`Ok`) or a specific
1036 /// failure (`Err`).
1037 ///
1038 /// - If `result` is `Ok(bytes)`, the comparison is `true` only if the iterator
1039 /// completes successfully and its concatenated output is identical to `bytes`.
1040 ///
1041 /// - If `result` is `Err(error)`, the comparison is `true` only if the iterator
1042 /// produces the exact same `UnescapeError`.
1043 ///
1044 /// # Example
1045 ///
1046 /// ```
1047 /// use json_escape::{unescape, UnescapeError, InvalidEscapeError};
1048 ///
1049 /// // --- Success Case ---
1050 /// let unescaper = unescape(r#"hello\tworld"#);
1051 /// // The comparison is against an `Ok` variant.
1052 /// assert_eq!(Ok("hello\tworld"), unescaper);
1053 ///
1054 /// // --- Error Case ---
1055 /// let failing_unescaper = unescape(r#"invalid-\u"#);
1056 /// // We can assert that the iterator produces a specific error.
1057 /// # let unexpected_eof = unescape(r"\u").next().unwrap().unwrap_err();
1058 /// assert_eq!(Err::<&str, _>(unexpected_eof), failing_unescaper);
1059 /// ```
1060 fn eq(&self, unescape: &Unescape<'_>) -> bool {
1061 match self {
1062 Ok(expected_bytes) => unescape == expected_bytes,
1063 Err(expected_error) => {
1064 for result in unescape.clone() {
1065 if let Err(actual_error) = result {
1066 // The iterator's first error is its final outcome.
1067 // It must match the expected error exactly.
1068 return actual_error == *expected_error;
1069 }
1070 }
1071 // `unescape` completed successfully, but an error was expected.
1072 false
1073 }
1074 }
1075 }
1076}
1077
impl<'a, 'b> PartialEq<Unescape<'a>> for Unescape<'b> {
    /// Compares two `Unescape` iterators for equality based on their terminal result.
    ///
    /// The equality of two `Unescape` iterators is determined by the final `Result`
    /// that would be obtained if each iterator were fully consumed (e.g., by using `try_collect()`).
    /// Neither iterator is advanced: the comparison works on clones.
    ///
    /// The specific rules are as follows:
    ///
    /// 1.  **Error vs. Error**: If both iterators terminate with an `Err`, they are
    ///     considered **equal** if and only if their `UnescapeError`s are identical.
    ///     Any bytes successfully unescaped *before* the error are ignored in this case.
    /// 2.  **Success vs. Success**: If both iterators terminate with `Ok`, they are
    ///     considered **equal** if and only if the complete sequence of unescaped bytes
    ///     is identical for both.
    /// 3.  **Success vs. Error**: If one iterator terminates with `Ok` and the other
    ///     with `Err`, they are always **not equal**.
    ///
    /// # Example
    ///
    /// ```
    /// use json_escape::unescape;
    ///
    /// // Case 1: Both iterators produce the same error. They are equal,
    /// // even though their valid prefixes ("a" and "b") are different.
    /// let failing_a = unescape(r#"a\k"#);
    /// let failing_b = unescape(r#"b\k"#);
    /// assert_eq!(failing_a, failing_b);
    ///
    /// // Case 2: Both iterators succeed. Equality depends on the byte stream.
    /// let successful_a = unescape(r#"hello\nworld"#);
    /// let successful_b = unescape(r#"hello\nworld"#);
    /// assert_eq!(successful_a, successful_b);
    ///
    /// let successful_c = unescape(r#"different"#);
    /// assert_ne!(successful_a, successful_c);
    ///
    /// // Case 3: One succeeds and one fails. They are not equal.
    /// let succeeding = unescape(r#"stop"#);
    /// let failing = unescape(r#"stop\k"#);
    /// assert_ne!(succeeding, failing);
    ///
    /// // Case 4: Both iterators fail differently. They are not equal.
    /// let failing_a = unescape(r#"data:\k"#);
    /// let failing_b = unescape(r#"data:\"#);
    /// assert_ne!(failing_a, failing_b);
    /// ```
    fn eq(&self, other: &Unescape<'a>) -> bool {
        // Fast path: identical remaining input (compared by content) together with
        // identical buffered-unicode state means both iterators will behave the same
        // from here on, so they are equal without iterating.
        ((self.bytes.as_ref() == other.bytes.as_ref())
            && (self.unicode == other.unicode)
            && (self.unicode_len == other.unicode_len)
            && (self.unicode_pos == other.unicode_pos))
            || {
                // Slow path: stream both clones and remember the error (if any)
                // that each one stops on.
                let mut a_error = None;
                let mut b_error = None;

                // Each adapter yields plain chunks and ends at the first error,
                // stashing that error in the matching `*_error` slot.
                let mut a = self.clone().map_while(|result| match result {
                    Ok(ok) => Some(ok),
                    Err(err) => {
                        a_error = Some(err);
                        None
                    }
                });

                let mut b = other.clone().map_while(|result| match result {
                    Ok(ok) => Some(ok),
                    Err(err) => {
                        b_error = Some(err);
                        None
                    }
                });

                // Passed by `&mut` so the adapters can still be drained below.
                let streams_match = chunks_eq(&mut a, &mut b);

                // Drain the iterators to ensure the error state is captured,
                // especially if chunks_eq returned false early.
                // (e.g unescape("a\k") and unescape("b\k") which are actually
                // equal)
                // Consuming `a` and `b` here also ends their mutable borrows of
                // `a_error` / `b_error`, which are read right after.
                a.for_each(|_| {});
                b.for_each(|_| {});

                match (a_error, b_error) {
                    // Both errored: equality depends only on the errors being the same.
                    (Some(a_err), Some(b_err)) => a_err == b_err,
                    // Both succeeded: equality depends on the byte streams having been identical.
                    (None, None) => streams_match,
                    // One errored and the other didn't: they are not equal.
                    _ => false,
                }
            }
    }
}
1170
#[cfg(feature = "alloc")]
impl<'a> TryFrom<Unescape<'a>> for Cow<'a, [u8]> {
    type Error = UnescapeError;

    /// Gathers all unescaped bytes into a `Cow<'a, [u8]>`, allocating only when
    /// more than one chunk is produced.
    ///
    /// - No chunks: an empty borrowed slice.
    /// - Exactly one chunk: that chunk, borrowed as-is.
    /// - Two or more chunks: an owned `Vec<u8>` holding their concatenation.
    ///
    /// The first `UnescapeError` encountered stops the collection and is returned.
    ///
    /// **Requires the `alloc` feature.**
    fn try_from(mut value: Unescape<'a>) -> Result<Self, Self::Error> {
        let Some(first) = value.next().transpose()? else {
            return Ok(Cow::Borrowed(b""));
        };
        let Some(second) = value.next().transpose()? else {
            return Ok(Cow::Borrowed(first));
        };

        // Two chunks already: we have to own the data. Reserve room for what
        // we hold plus every input byte that is still unread.
        let capacity = first.len() + second.len() + value.bytes.len();
        let mut buf = Vec::with_capacity(capacity);
        buf.extend_from_slice(first);
        buf.extend_from_slice(second);

        for item in value {
            let chunk = item?;
            buf.extend_from_slice(chunk);
        }

        Ok(Cow::Owned(buf))
    }
}
1205
1206// =============================================================================
1207// DisplayUnescape Implementation
1208// =============================================================================
1209
/// A wrapper for an [`Unescape`] iterator that implements [`fmt::Display`].
///
/// This struct is created by the [`Unescape::display_utf8()`] method. It allows for
/// printing the unescaped content directly to a formatter, which **avoids
/// any heap allocations**. The unescaping and UTF-8 decoding are performed on-the-fly as the
/// `fmt` method is called.
pub struct DisplayUnescape<'a> {
    // The wrapped iterator; `fmt` clones it, so it is never advanced in place.
    inner: Unescape<'a>,
}
1219
impl fmt::Display for DisplayUnescape<'_> {
    /// Writes the unescaped content in strict mode: an invalid escape sequence
    /// makes formatting fail with `fmt::Error`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `fmt` only has `&self`, while `_display_utf8` consumes the iterator,
        // so it runs on a clone. This also lets the wrapper be formatted repeatedly.
        self.inner.clone()._display_utf8(f, false)
    }
}
1225
1226/// A wrapper for an [`Unescape`] iterator that implements [`fmt::Display`] lossily.
1227///
1228/// This struct is created by the [`Unescape::display_utf8_lossy()`] method. Like
1229/// `DisplayUnescape`, it performs its operation **without any heap allocations**.
1230///
1231/// This method differs from `display_utf8` in that it handles two types of
1232/// errors gracefully:
1233/// - Invalid JSON escape sequences will be ignored, and the iterator will
1234/// continue to completion without a `fmt::Error`.
1235/// - Invalid UTF-8 byte sequences will be replaced with the Unicode
1236/// replacement character (``, U+FFFD)
1237pub struct DisplayUnescapeLossy<'a> {
1238 inner: Unescape<'a>,
1239}
1240
impl fmt::Display for DisplayUnescapeLossy<'_> {
    /// Writes the unescaped content in lossy mode.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Lossy mode: invalid UTF-8 is replaced with U+FFFD, and an invalid escape
        // sequence ends the output early instead of returning `fmt::Error`.
        // Runs on a clone because `_display_utf8` consumes the iterator.
        self.inner.clone()._display_utf8(f, true)
    }
}
1247
1248// =============================================================================
1249// Error Types
1250// =============================================================================
1251
/// An error that can occur when decoding the final byte stream to a UTF-8 string.
///
/// Returned by [`Unescape::decode_utf8`], which first unescapes the input and
/// then validates the resulting bytes; each variant corresponds to one of
/// those two steps.
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
pub enum DecodeUtf8Error {
    /// The unescaped byte sequence was not valid UTF-8.
    Utf8(str::Utf8Error),
    /// An error occurred during the JSON unescaping process itself.
    Unescape(UnescapeError),
}
1260
1261impl fmt::Display for DecodeUtf8Error {
1262 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1263 match self {
1264 DecodeUtf8Error::Utf8(e) => fmt::Display::fmt(e, f),
1265 DecodeUtf8Error::Unescape(e) => fmt::Display::fmt(e, f),
1266 }
1267 }
1268}
1269
/// Details of an invalid escape sequence error.
///
/// Carried by [`UnescapeErrorKind::InvalidEscape`].
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
#[non_exhaustive]
pub struct InvalidEscapeError {
    /// The invalid character found after a `\`.
    pub found: u8,
}
1277
/// Details of a lone UTF-16 surrogate error.
///
/// Carried by [`UnescapeErrorKind::LoneSurrogate`].
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
#[non_exhaustive]
pub struct LoneSurrogateError {
    /// The 16-bit surrogate code point.
    pub surrogate: u16,
}
1285
/// Describes a byte that is not a hex digit inside a `\uXXXX` escape.
///
/// Carried by [`UnescapeErrorKind::InvalidHex`].
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
#[non_exhaustive]
pub struct InvalidHexError {
    /// The offending byte that was found where a hex digit was expected.
    pub found: u8,
}

impl fmt::Display for InvalidHexError {
    /// Formats the offending byte as two upper-case hex digits, e.g.
    /// `found invalid hex digit '0x5A'`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { found } = *self;
        f.write_fmt(format_args!("found invalid hex digit '0x{:02X}'", found))
    }
}
1299
/// An error that can occur during the JSON string unescaping process.
///
/// Pairs the kind of failure with the position, relative to the start of the
/// escape sequence, at which it was detected. Use [`UnescapeError::kind`] and
/// [`UnescapeError::offset`] to read the two parts.
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
pub struct UnescapeError {
    /// The specific kind of unescaping error.
    pub(crate) kind: UnescapeErrorKind,
    /// The byte offset from the start of the escape sequence (`\`) where the
    /// error was detected.
    ///
    /// This is guaranteed to be less than 12, as the maximum escape sequence
    /// is `\uXXXX\uXXXX`.
    pub(crate) offset: u8,
}
1312
impl UnescapeError {
    /// Returns the specific kind of error that occurred.
    ///
    /// This can be used to programmatically handle different error types,
    /// such as distinguishing between a malformed hex sequence and an
    /// invalid escape character. The kind is returned by value
    /// (`UnescapeErrorKind` is `Copy`).
    ///
    /// ### Example
    ///
    /// ```
    /// # use json_escape::{unescape, UnescapeErrorKind, InvalidHexError};
    /// let mut unescaper = unescape(r#"\u123Z"#);
    /// let err = unescaper.next().unwrap().unwrap_err();
    ///
    /// match err.kind() {
    ///     UnescapeErrorKind::InvalidHex(InvalidHexError { found, .. }) => {
    ///         // We can inspect the exact invalid character found.
    ///         assert_eq!(found, b'Z');
    ///     }
    ///     _ => panic!("Expected an InvalidHex error"),
    /// }
    /// ```
    pub fn kind(&self) -> UnescapeErrorKind {
        self.kind
    }

    /// Returns the byte offset from the start of the escape sequence (`\`)
    /// where the error was detected.
    ///
    /// The offset is relative to the backslash that starts the escape, not to
    /// the start of the whole input.
    ///
    /// - For `\x`, the offset is `1` (pointing to `x`).
    /// - For `\u123?`, the offset is `5` (pointing to `?`).
    /// - For a lone surrogate `\uD800`, the offset is `6` (pointing after the sequence).
    ///
    /// This is useful for providing detailed error messages that can point
    /// to the exact location of the problem in the source string.
    ///
    /// ### Example
    ///
    /// ```
    /// # use json_escape::unescape;
    /// let json_string_content = r#"bad escape \x here"#;
    /// let mut unescaper = unescape(json_string_content);
    ///
    /// // read off 'bad escape '
    /// let first = unescaper.next().unwrap().unwrap();
    /// assert_eq!(first, b"bad escape ");
    ///
    /// let err = unescaper.next().unwrap().unwrap_err();
    ///
    /// // The error occurred at the 'x', which is 1 byte after the '\'
    /// assert_eq!(err.offset(), 1);
    ///
    /// // You could use this to highlight the error in the original input
    /// let backslash_pos = json_string_content.find('\\').unwrap();
    /// let error_pos = backslash_pos + err.offset() as usize;
    /// assert_eq!(json_string_content.as_bytes()[error_pos], b'x');
    ///
    /// // The generated error message also includes this info.
    /// let expected_msg = "invalid escape: '\\0x78' at offset 1";
    /// assert_eq!(err.to_string(), expected_msg);
    /// ```
    pub fn offset(&self) -> u8 {
        self.offset
    }
}
1378
/// The specific kind of error that can occur during JSON string unescaping.
///
/// This enum covers all possible failures described by the JSON standard for string contents.
/// It is `#[non_exhaustive]`, so code outside this crate that matches on it
/// needs a wildcard arm.
#[derive(Copy, Eq, PartialEq, Clone, Debug)]
#[non_exhaustive]
pub enum UnescapeErrorKind {
    /// Found a backslash followed by an unexpected character (e.g., `\x`).
    InvalidEscape(InvalidEscapeError),
    /// Found `\u` but the following characters were not 4 valid hex digits.
    InvalidHex(InvalidHexError),
    /// Input ended unexpectedly while parsing an escape sequence (e.g., `\u12`).
    UnexpectedEof,
    /// The `\u` sequence yielded a lone high or low surrogate without a matching pair.
    LoneSurrogate(LoneSurrogateError),
}
1394
1395impl fmt::Display for UnescapeError {
1396 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1397 match self.kind {
1398 UnescapeErrorKind::InvalidEscape(e) => {
1399 write!(
1400 f,
1401 "invalid escape: '\\0x{:02X}' at offset {}",
1402 e.found, self.offset
1403 )
1404 }
1405 UnescapeErrorKind::InvalidHex(ref s) => {
1406 write!(f, "{} at offset {}", s, self.offset)
1407 }
1408 UnescapeErrorKind::UnexpectedEof => {
1409 write!(
1410 f,
1411 "unexpected end of input while parsing escape sequence, expected character at offset {}",
1412 self.offset
1413 )
1414 }
1415 UnescapeErrorKind::LoneSurrogate(e) => write!(
1416 f,
1417 "invalid unicode sequence: lone surrogate found: 0x{:04X} at offset {}",
1418 e.surrogate, self.offset
1419 ),
1420 }
1421 }
1422}
1423
// Empty impl: every `Error` method keeps its default, so `source()` returns `None`.
impl core::error::Error for UnescapeError {}
1425impl core::error::Error for DecodeUtf8Error {
1426 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
1427 match self {
1428 DecodeUtf8Error::Utf8(e) => Some(e),
1429 DecodeUtf8Error::Unescape(e) => Some(e),
1430 }
1431 }
1432}
1433
1434// =============================================================================
1435// Utilities
1436// =============================================================================
1437
// Lookup table used when escaping: index it with a byte to get the JSON escape
// sequence for that byte as a `&'static str`.
// `None` means the byte can be emitted unchanged.
const ESCAPE_TABLE: [Option<&'static str>; 256] = {
    // Escapes for the 32 control characters 0x00..=0x1F, in byte order.
    // Five of them have short forms (`\b`, `\t`, `\n`, `\f`, `\r`); the rest
    // use the generic `\u00XX` form with lower-case hex digits.
    const CONTROL_ESCAPES: [&str; 0x20] = [
        r"\u0000", r"\u0001", r"\u0002", r"\u0003", r"\u0004", r"\u0005", r"\u0006", r"\u0007",
        r"\b", r"\t", r"\n", r"\u000b", r"\f", r"\r", r"\u000e", r"\u000f",
        r"\u0010", r"\u0011", r"\u0012", r"\u0013", r"\u0014", r"\u0015", r"\u0016", r"\u0017",
        r"\u0018", r"\u0019", r"\u001a", r"\u001b", r"\u001c", r"\u001d", r"\u001e", r"\u001f",
    ];

    let mut table: [Option<&'static str>; 256] = [None; 256];

    // Every control character needs an escape.
    let mut byte = 0;
    while byte < CONTROL_ESCAPES.len() {
        table[byte] = Some(CONTROL_ESCAPES[byte]);
        byte += 1;
    }

    // The two printable characters that must be escaped inside a JSON string.
    table[b'"' as usize] = Some(r#"\""#);
    table[b'\\' as usize] = Some(r"\\");

    table
};
1488
1489// A simple boolean-like lookup table for SIMD.
1490// 0 = no escape needed, 1 = escape needed.
1491// This is very compact (256 bytes) and fits easily in the L1 cache.
1492#[allow(unused)]
1493const ESCAPE_DECISION_TABLE: [u8; 256] = {
1494 let mut table = [0u8; 256];
1495 let mut i = 0;
1496 while i < 256 {
1497 if ESCAPE_TABLE[i].is_some() {
1498 table[i] = 1;
1499 }
1500 i += 1;
1501 }
1502 table
1503};
1504
// Portable-SIMD implementation, compiled only with the "simd" feature on a nightly build.
//
// Returns the index of the first byte that needs a JSON escape (a control
// character below 0x20, `"` or `\`), or `None` if there is none.
#[cfg(all(feature = "simd", nightly))]
#[inline]
fn find_escape_char(bytes: &[u8]) -> Option<usize> {
    use std::simd::{Simd, prelude::SimdPartialEq, prelude::SimdPartialOrd};

    const LANES: usize = 16; // Process 16 bytes at a time (fits in SSE2/AVX)

    // Comparison vectors: each holds 16 copies of one byte value.
    let last_control = Simd::<u8, LANES>::splat(b' ' - 1); // bytes <= 0x1F are control characters
    let quote = Simd::<u8, LANES>::splat(b'"');
    let backslash = Simd::<u8, LANES>::splat(b'\\');

    let mut offset = 0;

    // Vectorised part: whole 16-byte blocks.
    while offset + LANES <= bytes.len() {
        let lanes = Simd::<u8, LANES>::from_slice(&bytes[offset..]);

        // A lane is set when its byte matches any of the three conditions.
        let hits = lanes.simd_le(last_control) | lanes.simd_eq(quote) | lanes.simd_eq(backslash);

        if hits.any() {
            // The lowest set bit of the bitmask is the first matching lane.
            let first_hit = hits.to_bitmask().trailing_zeros() as usize;
            return Some(offset + first_hit);
        }

        offset += LANES;
    }

    // Scalar part: the (possibly empty) tail shorter than one block.
    bytes[offset..]
        .iter()
        .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
        .map(|pos| offset + pos)
}
1555
// SSE2 implementation for stable Rust on x86_64 (with the "simd" feature).
//
// Returns the index of the first byte that needs a JSON escape (a control
// character below 0x20, `"` or `\`), or `None` if there is none.
#[cfg(all(feature = "simd", not(nightly), target_arch = "x86_64"))]
#[inline]
fn find_escape_char(bytes: &[u8]) -> Option<usize> {
    // This is the stable Rust path using explicit CPU intrinsics.
    // It's guarded by cfg flags to only compile on x86_64 with the simd feature.
    use std::arch::x86_64::*;

    let mut i = 0;
    const LANES: usize = 16; // SSE2 works on 128-bit registers, which is 16 bytes.

    // On x86_64, we can tell the compiler to use SSE2 features in this specific function.
    // This is safe because we've already checked the target architecture.
    //
    // Checks the 16 bytes starting at `bytes[i]` and returns the absolute index
    // of the first one that needs escaping.
    //
    // # Safety
    // The caller must ensure `i + 16 <= bytes.len()`, because 16 bytes are
    // loaded starting at `bytes.as_ptr().add(i)`.
    #[target_feature(enable = "sse2")]
    unsafe fn find_in_chunk(bytes: &[u8], i: usize) -> Option<usize> {
        // Load 16 bytes of data from the slice.
        // SAFETY: the caller guarantees `i + 16 <= bytes.len()`, so the 16 bytes read
        // are inside the slice; `_mm_loadu_si128` is an unaligned load, so no
        // alignment requirement applies.
        let chunk = unsafe { _mm_loadu_si128(bytes.as_ptr().add(i) as *const _) };

        // Create comparison vectors for quote and slash.
        let quote_v = _mm_set1_epi8(b'"' as i8);
        let slash_v = _mm_set1_epi8(b'\\' as i8);

        // Emulate unsigned comparison for control characters
        // (SSE2 only provides a *signed* byte comparison).
        // Create a vector with the value 0x80 in each lane.
        let bias = _mm_set1_epi8(0x80u8 as i8);
        // Create the comparison vector for bytes < 0x20 (' ').
        let space_v = _mm_set1_epi8(b' ' as i8);

        // Bias both the input chunk and the comparison vector by XORing with 0x80.
        let biased_chunk = _mm_xor_si128(chunk, bias);
        let biased_space_v = _mm_xor_si128(space_v, bias);

        // Now, a signed less-than comparison on the biased values gives the
        // same result as an unsigned less-than on the original values.
        let lt_space_mask = _mm_cmplt_epi8(biased_chunk, biased_space_v);

        // Perform the equality comparisons (these are unaffected by signedness).
        let eq_quote_mask = _mm_cmpeq_epi8(chunk, quote_v);
        let eq_slash_mask = _mm_cmpeq_epi8(chunk, slash_v);

        // Combine the results.
        let combined_mask = _mm_or_si128(lt_space_mask, _mm_or_si128(eq_quote_mask, eq_slash_mask));

        // Create a bitmask to find the first match: bit `k` is set when byte `k` matched.
        let mask = _mm_movemask_epi8(combined_mask);

        if mask != 0 {
            // The lowest set bit is the first matching byte within the chunk.
            Some(i + mask.trailing_zeros() as usize)
        } else {
            None
        }
    }
    // Main loop
    while i + LANES <= bytes.len() {
        // SAFETY: the loop condition guarantees `i + LANES <= bytes.len()`, which is the
        // bound `find_in_chunk` requires; SSE2 is part of the x86_64 baseline, and this
        // function is only compiled for `target_arch = "x86_64"`.
        if let Some(result) = unsafe { find_in_chunk(bytes, i) } {
            return Some(result);
        }
        i += LANES;
    }

    // Handle the remainder with the fast scalar lookup.
    if i < bytes.len() {
        if let Some(pos) = bytes[i..]
            .iter()
            .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
        {
            // `pos` is relative to the tail, so shift it back to an absolute index.
            return Some(i + pos);
        }
    }

    None
}
1627
1628// A fallback for when SIMD feature is off.
1629#[cfg(not(feature = "simd"))]
1630#[inline]
1631fn find_escape_char(bytes: &[u8]) -> Option<usize> {
1632 bytes
1633 .iter()
1634 .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
1635}
1636
// Fail the build with a clear message when "simd" is requested on a configuration
// that matches neither SIMD implementation of `find_escape_char` (nightly, or
// stable on x86_64), instead of leaving the function undefined.
#[cfg(all(feature = "simd", not(nightly), not(target_arch = "x86_64")))]
compile_error! { "simd requires nightly or target_arch = \"x86_64\"" }
1639
// Unescape table: maps the byte that follows a `\` to the byte sequence it
// stands for. `None` means the byte is not a simple one-character escape.
const UNESCAPE_TABLE: [Option<&[u8]>; 256] = {
    // (escape character, decoded bytes) for each simple JSON escape.
    const MAPPINGS: [(u8, &[u8]); 8] = [
        (b'"', b"\""),
        (b'\\', b"\\"),
        (b'/', b"/"),
        (b'b', b"\x08"),
        (b'f', b"\x0C"),
        (b'n', b"\n"),
        (b'r', b"\r"),
        (b't', b"\t"),
    ];

    let mut tbl: [Option<&[u8]>; 256] = [None; 256];
    let mut i = 0;
    while i < MAPPINGS.len() {
        let (escape_char, decoded) = MAPPINGS[i];
        tbl[escape_char as usize] = Some(decoded);
        i += 1;
    }
    tbl
};
1653
/// Static table mapping every u8 -> a one-element array holding that byte.
/// This lets us return a `'static` slice for any single byte cheaply.
///
/// This is a `static` rather than a `const` on purpose: a `static` is a single
/// table with a fixed address for the whole program, so a reference to one of
/// its entries is genuinely `'static`. A `const` is instantiated afresh at each
/// use site, which is not what a runtime-indexed lookup wants.
static U8_TABLE: [[u8; 1]; 256] = {
    let mut table = [[0u8; 1]; 256];
    let mut byte = 0usize;
    while byte < 256 {
        // `byte < 256`, so the cast cannot truncate.
        table[byte][0] = byte as u8;
        byte += 1;
    }
    table
};

/// Returns a `'static` one-byte slice containing exactly `b`.
#[inline(always)]
fn byte_as_static_slice(b: u8) -> &'static [u8] {
    // coerce from &'static [u8;1] to &'static [u8]
    &U8_TABLE[usize::from(b)]
}
1671
// The following function is copied from the `percent-encoding` crate, version 2.3.2.
// Source: https://github.com/servo/rust-url/blob/22b925f93ad505a830f1089538a9ed6f5fd90612/percent_encoding/src/lib.rs#L337-L365
//
// It is licensed under the same terms as the `percent-encoding` crate (MIT/Apache-2.0).
//
// It converts a Cow<'_, [u8]> into a Cow<'_, str> lossily. When the input is an
// owned Vec<u8> that is already valid UTF-8, the Vec's allocation is reused for
// the resulting String instead of being copied.
#[cfg(feature = "alloc")]
#[allow(ambiguous_wide_pointer_comparisons)]
fn decode_utf8_lossy(input: Cow<'_, [u8]>) -> Cow<'_, str> {
    // Note: This function is duplicated in `form_urlencoded/src/query_encoding.rs`.

    // Borrowed input: the standard lossy conversion already does the right thing.
    let bytes = match input {
        Cow::Borrowed(slice) => return String::from_utf8_lossy(slice),
        Cow::Owned(vec) => vec,
    };

    match String::from_utf8_lossy(&bytes) {
        // Invalid sequences were found and replaced, so a new String was built.
        Cow::Owned(repaired) => Cow::Owned(repaired),
        Cow::Borrowed(valid) => {
            // `from_utf8_lossy` only borrows when nothing had to be replaced,
            // i.e. when `bytes` was valid UTF-8 all along. In that case the
            // borrowed str must be the very same memory as `bytes`.
            let valid_ptr: *const [u8] = valid.as_bytes();
            debug_assert!(core::ptr::eq(valid_ptr, &*bytes));

            // SAFETY: `from_utf8_lossy` returned `Cow::Borrowed`, which it only
            // does when its whole input is valid UTF-8, and that input was `bytes`.
            Cow::Owned(unsafe { String::from_utf8_unchecked(bytes) })
        }
    }
}
1709
/// Compare two chunk-iterators by their concatenated byte stream (streaming,
/// zero allocations).
///
/// Returns `true` exactly when concatenating every chunk of `a` produces the
/// same bytes as concatenating every chunk of `b`. How the bytes are divided
/// into chunks is irrelevant, and empty chunks contribute nothing: `[]` and
/// `[""]` compare equal, as do `["x", ""]` and `["x"]`.
///
/// This is allocation-free: it streams through both iterators, comparing
/// overlapping prefixes and carrying the remainder of the longer chunk
/// forward into the next round. The comparison stops at the first mismatch.
///
/// Note: an iterator that yields empty chunks forever never produces another
/// byte, so waiting for its next non-empty chunk does not terminate.
fn chunks_eq<'a, I1, A, I2, B>(mut a: I1, mut b: I2) -> bool
where
    A: 'a + AsRef<[u8]> + ?Sized,
    B: 'a + AsRef<[u8]> + ?Sized,
    I1: Iterator<Item = &'a A>,
    I2: Iterator<Item = &'a B>,
{
    /// Advances `chunks` to its next non-empty chunk and returns its bytes,
    /// or `None` once the iterator is exhausted.
    fn next_non_empty<'c, T, I>(chunks: &mut I) -> Option<&'c [u8]>
    where
        T: 'c + AsRef<[u8]> + ?Sized,
        I: Iterator<Item = &'c T>,
    {
        chunks
            .map(|chunk| -> &'c [u8] { chunk.as_ref() })
            .find(|bytes| !bytes.is_empty())
    }

    // Unconsumed tail of the chunk currently being compared on each side.
    let mut a_rem: &[u8] = &[];
    let mut b_rem: &[u8] = &[];

    loop {
        // Refill 'a' when its current chunk has been fully consumed.
        if a_rem.is_empty() {
            match next_non_empty(&mut a) {
                Some(bytes) => a_rem = bytes,
                // 'a' has no more bytes. The streams are equal only if 'b'
                // has none left either (trailing empty chunks do not count).
                None => return b_rem.is_empty() && next_non_empty(&mut b).is_none(),
            }
        }

        // Refill 'b' when its current chunk has been fully consumed.
        if b_rem.is_empty() {
            match next_non_empty(&mut b) {
                Some(bytes) => b_rem = bytes,
                // 'b' has no more bytes but 'a' still holds at least one
                // (a_rem is non-empty here), so the streams differ.
                None => return false,
            }
        }

        // Both remainders are non-empty now; compare their common prefix.
        let n = a_rem.len().min(b_rem.len());
        let (a_head, a_tail) = a_rem.split_at(n);
        let (b_head, b_tail) = b_rem.split_at(n);

        if a_head != b_head {
            return false;
        }

        // Carry whatever is left of the longer chunk into the next round.
        a_rem = a_tail;
        b_rem = b_tail;
    }
}
1760
/// Writes `bytes` to `f` as UTF-8 text.
///
/// Valid runs are written unchanged. When an invalid sequence is found, the
/// outcome depends on `lossy`:
/// * `true`: a single U+FFFD replacement character is written in its place
///   and formatting continues;
/// * `false`: formatting stops with `fmt::Error` (the valid text preceding
///   the invalid sequence has already been written at that point).
#[inline]
fn display_bytes_utf8(bytes: &[u8], f: &mut fmt::Formatter<'_>, lossy: bool) -> fmt::Result {
    bytes.utf8_chunks().try_for_each(|piece| {
        f.write_str(piece.valid())?;

        match (piece.invalid().is_empty(), lossy) {
            // Nothing invalid follows this run.
            (true, _) => Ok(()),
            // Lossy mode: stand in for the broken sequence and carry on.
            (false, true) => f.write_char(char::REPLACEMENT_CHARACTER),
            // Strict mode: abort formatting.
            (false, false) => Err(fmt::Error),
        }
    })
}
1777
// Unit tests for the escape/unescape iterators, the `chunks_eq` helper, the
// `std::io::Read` implementation of the unescaper and its `Display` adapters.
#[cfg(test)]
mod tests {
    use core::fmt::Display;
    use std::{io::Read as _, string::ToString as _, vec};

    use super::*;

    // ===================== Escape ===================== //

    /// Escapes `input` and checks the result against `want`, once by
    /// collecting the yielded chunks into a `String` and once by comparing
    /// the iterator itself with `want`.
    fn test_escape_typical(input: &str, want: &str) {
        let got = escape_str(input).collect::<String>();
        assert_eq!(got, want);

        // Test PartialEq too
        assert_eq!(escape_str(input), want)
    }

    #[test]
    fn test_empty_string() {
        test_escape_typical("", "");
    }

    #[test]
    fn test_quotes() {
        test_escape_typical("\"hello\"", "\\\"hello\\\"")
    }

    #[test]
    fn test_backslash() {
        test_escape_typical("\\hello\\", "\\\\hello\\\\");
    }

    // Forward slashes are expected to pass through unescaped.
    #[test]
    fn test_slash() {
        test_escape_typical("/hello/", "/hello/");
    }

    // Control characters that have a short escape form.
    #[test]
    fn test_control_chars() {
        test_escape_typical("\n\r\t\x08\x0C", "\\n\\r\\t\\b\\f");
    }

    #[test]
    fn test_escape_fully() {
        let input = "Hello, \"world\"!\nThis contains a \\ backslash and a \t tab.";
        let expected = r#"Hello, \"world\"!\nThis contains a \\ backslash and a \t tab."#;
        test_escape_typical(input, expected);
    }

    // Control characters without a short form are expected as `\u00XX` with
    // lowercase hex digits.
    #[test]
    fn test_other_control_chars() {
        let input = "Null:\0, Bell:\x07";
        let expected = r#"Null:\u0000, Bell:\u0007"#;
        test_escape_typical(input, expected);

        test_escape_typical("\x00\x1F", "\\u0000\\u001f");
        test_escape_typical("\x19", "\\u0019");
    }

    // The unescaped runs around an escape are yielded as separate chunks.
    #[test]
    fn test_iterator_chunks() {
        let input = "prefix\npostfix";
        let mut iter = escape_str(input);
        assert_eq!(iter.next(), Some("prefix"));
        assert_eq!(iter.next(), Some(r#"\n"#));
        assert_eq!(iter.next(), Some("postfix"));
        assert_eq!(iter.next(), None);
    }

    // Input without anything to escape comes back as a single chunk,
    // including non-ASCII text.
    #[test]
    fn test_no_escape_needed() {
        let input = "A simple string with no escapes.";
        let mut iter = escape_str(input);
        assert_eq!(iter.next(), Some("A simple string with no escapes."));
        assert_eq!(iter.next(), None);

        let input = "café";
        let mut iter = escape_str(input);
        assert_eq!(iter.next(), Some("café"));
        assert_eq!(iter.next(), None);

        let input = "❤️";
        let mut iter = escape_str(input);
        assert_eq!(iter.next(), Some("❤️"));
        assert_eq!(iter.next(), None);
    }

    // ===================== Unescape ===================== //

    #[test]
    fn test_byte_table() {
        assert_eq!(byte_as_static_slice(0), &[0]);
        assert_eq!(byte_as_static_slice(5), &[5]);
        assert_eq!(byte_as_static_slice(255), &[255]);
    }

    /// Unescapes `input` and checks the result against `want` three ways:
    /// through `decode_utf8`, through equality of the iterator with `want`,
    /// and through the strict `display_utf8` adapter.
    fn test_unescape_typical<I: AsRef<[u8]> + ?Sized>(input: &I, want: &str) {
        let got = unescape(input).decode_utf8().unwrap();
        assert_eq!(got, want);

        // Test PartialEq too
        assert_eq!(unescape(input), want);

        // The Display adapter must produce the same text
        assert_display(unescape(input).display_utf8(), Ok(want));
    }

    #[test]
    fn test_unicode_escape_basic_unescape() {
        // \u4E16 => 世 (E4 B8 96)
        let s = "X\\u4E16Y";
        test_unescape_typical(s, "X世Y");

        let s = "Snow: \\u2603"; // \u2603 => ☃
        test_unescape_typical(s, "Snow: ☃");

        let s = "A \\u03A9 B"; // Ω is U+03A9
        test_unescape_typical(s, "A Ω B");
    }

    #[test]
    fn test_surrogate_pair_unescape() {
        // 😀 is U+1F600 -> in JSON: \uD83D\uDE00
        let s = "A\\uD83D\\uDE00B";
        test_unescape_typical(s, "A😀B")
    }

    // `\x` is not a JSON escape; the error is expected to report the
    // offending byte `x` together with offset 1.
    #[test]
    fn test_invalid_escape_unescape() {
        let s = b"\\x";
        let mut u = unescape(s);

        match u.next() {
            Some(Err(UnescapeError {
                kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'x' }),
                offset: 1,
            })) => {}
            _ => panic!("expected invalid escape"),
        }
    }

    #[test]
    fn test_simple_unescape() {
        let input = "Hello\\nWorld\\\"!"; // "Hello\nWorld\"!"
        test_unescape_typical(input, "Hello\nWorld\"!")
    }

    // A `\u` escape cut short by the end of input must surface as
    // `UnexpectedEof`.
    // NOTE(review): the expected offset of 4 presumably counts from the start
    // of the escape sequence rather than from the start of the input — confirm
    // against the `UnescapeError::offset` documentation.
    #[test]
    fn test_truncated_unicode() {
        let input = "Trunc: \\u12"; // too short
        let it = unescape(input);
        let mut found = false;
        for r in it {
            match r {
                Ok(_) => continue,
                Err(UnescapeError {
                    kind: UnescapeErrorKind::UnexpectedEof,
                    offset: 4,
                }) => {
                    found = true;
                    break;
                }
                // Any other error leaves `found` false and fails the assert below.
                Err(_) => break,
            }
        }
        assert!(found);
    }

    // ===================== Chunk_Eq ===================== //

    #[test]
    fn test_empty_iterators_are_equal() {
        let a: Vec<&[u8]> = vec![];
        let b: Vec<&[u8]> = vec![];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_empty_vs_non_empty() {
        let a: Vec<&[u8]> = vec![];
        let b = vec![&[1, 2, 3]];
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));

        // And the other way around
        let a = vec![&[1, 2, 3]];
        let b: Vec<&[u8]> = vec![];
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_single_identical_chunks() {
        let a = vec!["hello world"];
        let b = vec!["hello world"];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_different_chunk_boundaries_str() {
        // This is the key test: the concatenated content is identical,
        // but the chunk divisions are different.
        let a = vec!["he", "llo", " ", "world"];
        let b = vec!["hello ", "wo", "rld"];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_different_chunk_boundaries_bytes() {
        let a = vec![&[1, 2], &[3, 4, 5][..]];
        let b = vec![&[1, 2, 3], &[4, 5][..]];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_one_long_vs_many_short() {
        let a = vec!["a-long-single-chunk"];
        let b = vec!["a", "-", "long", "-", "single", "-", "chunk"];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_unequal_content_same_length() {
        let a = vec!["hello"];
        let b = vec!["hallo"];
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_unequal_at_chunk_boundary() {
        let a = vec!["ab", "c"]; // "abc"
        let b = vec!["ab", "d"]; // "abd"
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_one_is_prefix_of_other() {
        // a is shorter
        let a = vec!["user", "name"]; // "username"
        let b = vec!["user", "name", "123"]; // "username123"
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));

        // b is shorter
        let a = vec!["user", "name", "123"];
        let b = vec!["user", "name"];
        assert!(!chunks_eq(a.into_iter(), b.into_iter()));
    }

    #[test]
    fn test_complex_remainer_logic() {
        // This tests the carry-over (remainder) logic extensively.
        // a: [1,2,3], [4,5], [6,7,8,9], [10]
        // b: [1,2], [3,4,5,6], [7,8], [9,10]
        let a = vec![&[1, 2, 3], &[4, 5][..], &[6, 7, 8, 9], &[10]];
        let b = vec![&[1, 2], &[3, 4, 5, 6][..], &[7, 8], &[9, 10]];
        assert!(chunks_eq(a.into_iter(), b.into_iter()));
    }

    // Chunks may be references to any `AsRef<[u8]>` type, here `&Vec<u8>`.
    #[test]
    fn test_with_vec_references() {
        let v_a1 = vec![1, 2];
        let v_a2 = vec![3, 4, 5];
        let a_data = vec![&v_a1, &v_a2];

        let v_b1 = vec![1, 2, 3];
        let v_b2 = vec![4, 5];
        let b_data = vec![&v_b1, &v_b2];
        assert!(chunks_eq(a_data.into_iter(), b_data.into_iter()));
    }

    // ===================== Unescape Read ===================== //

    #[test]
    fn test_read_simple() {
        let input = br#"hello world"#;
        let mut reader = unescape(input);
        let mut buf = [0u8; 20];

        let bytes_read = reader.read(&mut buf).unwrap();

        assert_eq!(bytes_read, 11);
        assert_eq!(&buf[..bytes_read], b"hello world");

        // Second read should return 0 (EOF)
        let bytes_read_eof = reader.read(&mut buf).unwrap();
        assert_eq!(bytes_read_eof, 0);
    }

    #[test]
    fn test_read_with_simple_escapes() {
        let input = br#"hello\tworld\nline2"#;
        let mut reader = unescape(input);
        let mut buf = Vec::new();

        reader.read_to_end(&mut buf).unwrap();

        assert_eq!(buf, b"hello\tworld\nline2");
    }

    // The destination buffer is smaller than the input, so the content must
    // arrive over several `read` calls.
    #[test]
    fn test_read_into_small_buffer_multiple_calls() {
        let input = br#"this is a long string with no escapes"#;
        let mut reader = unescape(input);
        let mut buf = [0u8; 10];
        let mut result = Vec::new();

        loop {
            match reader.read(&mut buf) {
                Ok(0) => break, // EOF
                Ok(n) => {
                    result.extend_from_slice(&buf[..n]);
                }
                Err(e) => panic!("Read error: {}", e),
            }
        }

        assert_eq!(result, input);
    }

    #[test]
    fn test_read_multibyte_char_across_buffer_boundary() {
        // The grinning face emoji 😀 is \uD83D\uDE00, which is 4 bytes in UTF-8: 0xF0 0x9F 0x98 0x80
        let input = br#"emoji: \uD83D\uDE00 is here"#;
        let mut reader = unescape(input);

        // Buffer is small, forcing the 4-byte emoji to be written across multiple calls
        let mut buf = [0u8; 8];
        let mut result = Vec::new();

        // First read: "emoji: " (7 bytes) + first byte of emoji
        let n1 = reader.read(&mut buf).unwrap();
        assert_eq!(n1, 8);
        assert_eq!(&buf[..n1], b"emoji: \xF0");
        result.extend_from_slice(&buf[..n1]);

        // Second read: next 3 bytes of emoji + " is h"
        let n2 = reader.read(&mut buf).unwrap();
        assert_eq!(n2, 8);
        assert_eq!(&buf[..n2], b"\x9F\x98\x80 is h");
        result.extend_from_slice(&buf[..n2]);

        // Third read: "ere"
        let n3 = reader.read(&mut buf).unwrap();
        assert_eq!(n3, 3);
        assert_eq!(&buf[..n3], b"ere");
        result.extend_from_slice(&buf[..n3]);

        // Final read should be EOF
        let n4 = reader.read(&mut buf).unwrap();
        assert_eq!(n4, 0);

        assert_eq!(result, b"emoji: \xF0\x9F\x98\x80 is here");
        assert_eq!(result, "emoji: 😀 is here".as_bytes());
    }

    // An unescape failure must surface from `read` as `InvalidData`, with the
    // underlying message preserved.
    #[test]
    fn test_read_error_invalid_escape() {
        let input = br#"hello \q world"#;
        let mut reader = unescape(input);
        let mut buf = [0u8; 20];

        let result = reader.read(&mut buf);

        assert!(result.is_err());
        let err = result.unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("invalid escape"));
    }

    #[test]
    fn test_read_error_lone_surrogate() {
        let input = br#"\uD83D rest of data seen"#; // High surrogate without a following low one
        let mut reader = unescape(input);
        let mut buf = [0u8; 10];

        let err = reader.read(&mut buf).unwrap_err();
        assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
        assert!(err.to_string().contains("lone surrogate"));
    }

    #[test]
    fn test_read_empty_input() {
        let input = b"";
        let mut reader = unescape(input);
        let mut buf = [0u8; 10];
        let bytes_read = reader.read(&mut buf).unwrap();
        assert_eq!(bytes_read, 0);
    }

    #[test]
    fn test_read_into_empty_buffer() {
        let input = b"hello";
        let mut reader = unescape(input);
        let mut buf = [0u8; 0];
        let bytes_read = reader.read(&mut buf).unwrap();
        // A read into an empty buffer should always succeed and return 0.
        assert_eq!(bytes_read, 0);
    }

    // `read_to_end` must deliver the fully unescaped content and report its
    // length.
    #[test]
    fn test_read_to_end_optimized() {
        let input = br#"first\nsecond\tthird \uD83D\uDE00 last"#;
        let mut reader = unescape(input);
        let mut buf = Vec::new();

        let bytes_read = reader.read_to_end(&mut buf).unwrap();
        let expected = b"first\nsecond\tthird \xF0\x9F\x98\x80 last";

        assert_eq!(bytes_read, expected.len());
        assert_eq!(buf, expected);
    }

    // ===================== Unescape Display ===================== //

    /// Formats `display` into a `String` and checks the outcome.
    ///
    /// `Ok(text)` requires formatting to succeed and to produce exactly
    /// `text`; `Err(())` only requires formatting to fail (whatever was
    /// written before the failure is not inspected).
    fn assert_display(display: impl Display, want: Result<&str, ()>) {
        let mut w = String::new();
        let res = fmt::write(&mut w, format_args!("{display}"));

        match want {
            Ok(want) => {
                assert!(res.is_ok());
                assert_eq!(w, want)
            }
            Err(_) => assert!(
                res.is_err(),
                "strict mode should return Err on invalid bytes"
            ),
        }
    }

    // -- NON-LOSSY TESTS (must be perfect) --

    #[test]
    fn test_display_simple_string() {
        let display = unescape("hello world").display_utf8();
        assert_display(display, Ok("hello world"));
    }

    #[test]
    fn test_display_empty_string() {
        assert_display(unescape("").display_utf8(), Ok(""));
    }

    #[test]
    fn test_display_standard_escapes() {
        let input = br#"\" \\ \/ \b \f \n \r \t"#;
        let expected = "\" \\ / \x08 \x0C \n \r \t";
        assert_display(unescape(input).display_utf8(), Ok(expected));
    }

    #[test]
    fn test_display_non_escaped_utf8() {
        let input = "你好, world".as_bytes();
        let expected = "你好, world";
        assert_display(unescape(input).display_utf8(), Ok(expected));
    }

    #[test]
    fn test_display_unicode_escape_bmp() {
        // cent sign: \u00A2 -> C2 A2 (2 bytes)
        let input = br"a\u00A2b";
        let expected = "a¢b";
        assert_display(unescape(input).display_utf8(), Ok(expected));
    }

    #[test]
    fn test_display_mixed_content() {
        let input = br#"Text with \n, \u00A2, and \uD83D\uDE0E emojis."#;
        let expected = "Text with \n, ¢, and 😎 emojis.";
        assert_display(unescape(input).display_utf8(), Ok(expected));
    }

    #[test]
    fn test_display_starts_and_ends_with_escape() {
        let input = br#"\u00A2hello\t"#;
        let expected = "¢hello\t";
        assert_display(unescape(input).display_utf8(), Ok(expected));
    }

    // -- NON-LOSSY ERROR TESTS --

    #[test]
    fn test_display_err_invalid_escape() {
        assert_display(unescape(br"hello \z world").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_incomplete_unicode() {
        assert_display(unescape(br"\u123").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_invalid_hex_in_unicode() {
        assert_display(unescape(br"\u123g").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_lone_high_surrogate() {
        assert_display(unescape(br"\uD800").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_high_surrogate_not_followed_by_low() {
        assert_display(unescape(br"\uD800\uABCD").display_utf8(), Err(()));
    }

    #[test]
    fn test_display_err_invalid_source_utf8() {
        // A valid UTF-8 sequence for 'h' followed by an invalid byte
        assert_display(unescape(b"h\x80ello").display_utf8(), Err(()));
    }

    #[test]
    fn strict_valid_multi_byte_split() {
        // "€" U+20AC => bytes [0xE2, 0x82, 0xAC]
        let input = &[0xE2, 0x82, 0xAC];
        let display = unescape(input).display_utf8();
        assert_display(display, Ok("€"));
    }

    #[test]
    fn strict_errors_on_invalid_start_byte() {
        let input = &[0xFF, b'a'];
        let display = unescape(input).display_utf8();

        assert_display(display, Err(()));
    }

    // -- LOSSY TESTS --

    #[test]
    fn lossy_replaces_invalid_start_byte() {
        // 0xFF is invalid as a leading UTF-8 byte.
        let input = &[0xFF, b'a']; // invalid byte then ASCII 'a';
        let display = unescape(input).display_utf8_lossy();
        // replacement char + 'a'
        assert_display(display, Ok("\u{FFFD}a"));
    }

    #[test]
    fn lossy_handles_trailing_incomplete_bytes() {
        // A trailing incomplete 3-byte sequence: [0xE2, 0x82] (missing 0xAC)
        let input: &[u8] = &[0xE2, 0x82];
        let display = unescape(input).display_utf8_lossy();
        // Should replace incomplete tail with U+FFFD.
        assert_display(display, Ok("\u{FFFD}"));
    }

    #[test]
    fn test_display_lossy_invalid_source_utf8() {
        // The invalid byte sequence should be replaced.
        let input = b"valid\xF0\x90\x80invalid";
        let expected = "valid\u{FFFD}invalid";
        assert_display(unescape(input).display_utf8_lossy(), Ok(expected));
    }

    #[test]
    fn test_display_lossy_invalid_escape_truncates() {
        // In lossy mode, an invalid JSON escape stops the processing.
        let input = br"this is ok\z but this is not";
        let expected = "this is ok";
        assert_display(unescape(input).display_utf8_lossy(), Ok(expected));
    }

    #[test]
    fn test_display_lossy_incomplete_unicode_truncates() {
        // A lone high surrogate at the end also stops the output, without an error.
        let input = br"truncate here \uD83D";
        let expected = "truncate here ";
        assert_display(unescape(input).display_utf8_lossy(), Ok(expected));
    }

    // Inspired by and copied from memchr
    // Compile-time check: the test fails to build if either iterator stops
    // being `Send + Sync + UnwindSafe + RefUnwindSafe`.
    #[test]
    fn sync_regression() {
        use core::panic::{RefUnwindSafe, UnwindSafe};

        fn assert_send_sync<T: Send + Sync + UnwindSafe + RefUnwindSafe>() {}
        assert_send_sync::<Unescape<'_>>();
        assert_send_sync::<Escape<'_>>();
    }
}
2357
// Tests for `find_escape_char`, which reports the index of the first byte
// that needs escaping (or `None`). The cases focus on the 16-byte chunking
// described in the comments below and on agreement with
// `ESCAPE_DECISION_TABLE`.
#[cfg(test)]
mod find_escape_char_tests {
    use std::format;

    use super::{ESCAPE_DECISION_TABLE, find_escape_char};

    /// Helper function to run a single test case and provide a clear error message on failure.
    fn run_test(input: &str, expected: Option<usize>, case_name: &str) {
        let result = find_escape_char(input.as_bytes());
        assert_eq!(result, expected, "Failed test case: '{}'", case_name);
    }

    #[test]
    fn test_no_escapes() {
        run_test("", None, "Empty string");
        run_test("Hello, world!", None, "Simple ASCII");

        // Exactly one full 16-byte chunk with no remainder. The length is
        // asserted so the case cannot silently drift off the boundary.
        let one_chunk = "Exactly 16 bytes";
        assert_eq!(one_chunk.len(), 16);
        run_test(one_chunk, None, "16-byte ASCII");

        // One full chunk followed by a 15-byte remainder.
        run_test(&"a".repeat(31), None, "31-byte ASCII");

        run_test(
            "This string is over 16 bytes long now",
            None,
            "Over 16-byte ASCII",
        );

        // The original source of the bug: non-ASCII UTF-8 characters.
        // This ensures the signedness bug is truly fixed.
        run_test("Hello, éàçüö!", None, "Non-ASCII UTF-8");
        run_test("Testing with emojis 😀❤️✅", None, "Emojis");
    }

    #[test]
    fn test_single_escapes() {
        run_test("\"", Some(0), "Quote at start");
        run_test("Hello \" world", Some(6), "Quote in middle");
        run_test("Hello\\", Some(5), "Backslash at end");
        run_test("\n", Some(0), "Control char (newline) at start");
        run_test("Hello\tworld", Some(5), "Control char (tab) in middle");
        run_test(
            "Control char at end\u{08}",
            Some(19),
            "Control char (backspace) at end",
        );
    }

    #[test]
    fn test_finds_first_of_multiple() {
        // This confirms it always finds the *first* match, not a later one.
        run_test("a\"b\\c\nd", Some(1), "Finds first quote");
        run_test("ab\\c\"d\ne", Some(2), "Finds first backslash");
        run_test("abc\nd\"e\\f", Some(3), "Finds first control char");
        run_test("\"\n\\", Some(0), "Multiple escapes at start");
    }

    #[test]
    fn test_simd_chunk_boundaries() {
        // These tests are critical for verifying the SIMD logic. A chunk is 16 bytes.
        let s15 = "a".repeat(15);
        let s16 = "a".repeat(16);
        let s17 = "a".repeat(17);

        // Escape at the exact end of the first 16-byte chunk
        run_test(&format!("{}\"", s15), Some(15), "Escape at index 15");

        // Escape at the exact start of the second 16-byte chunk
        run_test(&format!("{}\n", s16), Some(16), "Escape at index 16");

        // Escape within the second chunk
        run_test(&format!("{}\t", s17), Some(17), "Escape at index 17");

        // A long string with an escape several chunks in. Two backslashes
        // follow the prefix; the first one (index 40) must be reported.
        let long = "a".repeat(40);
        run_test(
            &format!("{}\\\\", long),
            Some(40),
            "Escape deep in a long string",
        );
    }

    #[test]
    fn test_remainder_logic() {
        // These tests ensure the scalar fallback logic works correctly for inputs
        // that are not a multiple of 16 bytes long.

        // String shorter than 16 bytes
        run_test("short\nstring", Some(5), "Short string with escape");
        run_test("no escapes", None, "Short string no escape");

        // String with 17 bytes (16 for SIMD, 1 for remainder)
        let s16 = "a".repeat(16);
        run_test(
            &format!("{}\"", s16),
            Some(16),
            "Escape in 1-byte remainder",
        );

        // String with 31 bytes (16 for SIMD, 15 for remainder)
        let s15 = "b".repeat(15);
        run_test(
            &format!("{}{}\t", s15, s15),
            Some(30),
            "Escape at end of 15-byte remainder",
        );
    }

    #[test]
    fn test_all_escapable_bytes_individually() {
        // This is the ultimate test. It iterates through all 256 possible byte values
        // and confirms that our function's decision matches the ESCAPE_DECISION_TABLE.
        let prefix = "0123456789abcdef"; // A 16-byte safe prefix to engage the SIMD loop.

        for byte_val in 0..=255u8 {
            // We can't create a &str from invalid UTF-8, so we work with byte slices.
            let mut test_bytes = prefix.as_bytes().to_vec();
            test_bytes.push(byte_val);

            let result = find_escape_char(&test_bytes);
            let expected_to_escape = ESCAPE_DECISION_TABLE[byte_val as usize] == 1;

            if expected_to_escape {
                // If this byte SHOULD be escaped, we expect to find it at index 16.
                assert_eq!(
                    result,
                    Some(16),
                    "Failed to find required escape for byte 0x{:02X}",
                    byte_val
                );
            } else {
                // If this byte should NOT be escaped, we expect to find nothing.
                assert_eq!(
                    result, None,
                    "Incorrectly found an escape for byte 0x{:02X}",
                    byte_val
                );
            }
        }
    }
}