bufjson/lexical.rs
1//! Scan JSON text, extracting a stream of tokens (lexical analysis).
2//!
3//! This module provides the traits, helpers, and type definitions needed to perform stream-oriented
4//! lexical analysis on JSON text.
5//!
6//! The fundamental types are the enum [`Token`], which represents the type of a JSON token, and
7//! the traits [`Analyzer`] (does the lexical analysis); [`Content`] (efficiently provides the
8//! actual content of a token from the JSON text); and [`Error`] (describes errors encountered by
9//! the lexical analyzer).
10//!
11//! The sub-modules provide concrete implementations of JSON tokenizers:
12//!
13//! - [`state`] is a lower-level module containing a simple reusable finite state machine; all the
14//! concrete lexical analyzers in this crate use this state machine for their core logic.
15//! - [`fixed`] contains an implementation of [`Analyzer`] for tokenizing fixed-size in-memory
16//! buffers.
17#![cfg_attr(feature = "read", doc = "- [`read`][`crate::lexical::read`]")]
18#![cfg_attr(not(feature = "read"), doc = "- `read`")]
//! contains an implementation of [`Analyzer`] for tokenizing streams that implement the
//! `std::io::Read` trait. Requires the `read` feature to be enabled.
21//!
22//! # Performance
23//!
24//! Performance characteristics are documented on all relevant types at the trait level (this
25//! module) and at the concrete implementation level (in the sub-modules).
26//!
27//! In all cases, allocations and copies are avoided except where it is technically infeasible. When
28//! they have to be done, they are minimized.
29//!
30//! # Token content
31//!
32//! By design, the [`Content`] trait provides the literal text of all tokens appearing in the input
33//! JSON, including whitespace, without any change whatsoever. This policy facilitates use cases
34//! such as stream editing, where you might want to make changes to the JSON text, such as deleting
35//! some JSON elements or inserting new ones, while leaving everything else unchanged.
36//!
37//! # Numbers
38//!
39//! For number tokens ([`Token::Num`]), the [`Content`] trait provides the literal content of the
40//! number as it appears in the JSON text, without attempting to coerce it into a Rust numeric type.
41//!
42//! The reason for leaving numbers as text is that the [JSON spec][rfc] places no limits on the
43//! range and precision of numbers \[1\]. Since this module aims to faithfully implement the spec at
44//! the lexical level, it will recognize any valid JSON number, no matter the magnitude or
45//! precision. This would not be possible if it coerced the text into a numeric type, which all have
46//! their own limits on range and precision.
47//!
48//! \[1\]: The spec *does* urge software developers using JSON to be thoughtful about
49//! interoperability and, kinda sorta, to just stay within the IEEE double-precision floating point
50//! range, *a.k.a.*, `f64`. But that's not a requirement.
51//!
52//! # Strings
53//!
54//! For string tokens ([`Token::Str`]), the [`Content`] trait provides the literal content of the
55//! string as it appears in the JSON text, *including* the quotation marks that surround it, without
56//! attempting to expand the escape sequences.
57//!
58//! Escape sequences can be expanded by explicitly requesting [`Content::unescaped`] instead of
//! [`Content::literal`]. Note that getting the unescaped content will trigger an allocation if the
//! string does contain at least one escape sequence, which may not be desirable in all
//! circumstances.
62//!
63//! Example of a string token without any escape sequences.
64//!
65//! ```
66//! # use bufjson::lexical::{Token, fixed::FixedAnalyzer};
67//! let mut lexer = FixedAnalyzer::new(&br#""foo""#[..]);
68//! assert_eq!(Token::Str, lexer.next());
69//! assert_eq!(r#""foo""#, lexer.content().literal()); // Note the surrounding quotes.
70//! assert_eq!(r#""foo""#, lexer.content().unescaped()); // No allocation, returns same value.
71//! ```
72//!
73//! Example of a string token containing an escape sequence.
74//!
75//! ```
76//! # use bufjson::lexical::{Token, fixed::FixedAnalyzer};
77//! let mut lexer = FixedAnalyzer::new(&br#""foo\u0020bar""#[..]);
78//! assert_eq!(Token::Str, lexer.next());
79//! assert_eq!(r#""foo\u0020bar""#, lexer.content().literal()); // Note the surrounding quotes.
80//! assert_eq!(r#""foo bar""#, lexer.content().unescaped()); // Allocates, expands \u0020 -> ' '.
81//! ```
82//!
83//! # Roll your own lexer
84//!
85//! The sub-module [`state`] provides the basic state machine for tokenizing JSON text. You can use
86//! it to build your own implementation of [`Analyzer`] or any other application that needs a
87//! low-level ability to identify JSON tokens that is faithful to the [JSON spec][rfc].
88//!
89//! For string tokens, you can use the [`unescape`] function standalone to expand escape sequences.
90//!
91//! [rfc]: https://datatracker.ietf.org/doc/html/rfc8259
92
93use crate::{Buf, EqStr, IntoBuf, OrdStr, Pos, StringBuf};
94use std::{
95 borrow::Borrow,
96 cmp::{Ord, Ordering},
97 fmt,
98 hash::{Hash, Hasher},
99 ops::Deref,
100};
101
102pub mod fixed;
103pub mod state;
104
105#[cfg(feature = "read")]
106pub mod read;
107
/// Kind of lexical token in a JSON text, such as begin object `{`, literal `true`, or string.
///
/// This is a list of the JSON lexical token types as described in the [JSON spec][rfc]. The names
/// of enumeration members are aligned with the names as they appear in the spec.
///
/// Note that `Token` just models the token *type*, not the content. Some token types have static
/// content that never changes (*e.g.*, [`ArrBegin`] is always `'['`) while others have variable
/// content that depends on the specific JSON text being analyzed (*e.g.* [`Str`]).
///
/// [rfc]: https://datatracker.ietf.org/doc/html/rfc8259
/// [`ArrBegin`]: Token::ArrBegin
/// [`Str`]: Token::Str
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum Token {
    /// The begin array token, which has the literal value `[`.
    ArrBegin,
    /// The end array token, which has the literal value `]`.
    ArrEnd,
    /// Pseudo-token representing the end of the JSON text (end of file).
    Eof,
    /// Pseudo-token representing an unrecoverable lexical error detected in the JSON text.
    Err,
    /// The value literal `false`.
    LitFalse,
    /// The value literal `null`.
    LitNull,
    /// The value literal `true`.
    LitTrue,
    /// The name separator token, which has the literal value `:`.
    NameSep,
    /// A number token such as `0`, `123.4`, or `-1.25e+6`.
    Num,
    /// The begin object token, which has the literal value `{`.
    ObjBegin,
    /// The end object token, which has the literal value `}`.
    ObjEnd,
    /// A string token, such as `""`, `"foo"`, or `"Hello,\u0020world! 🌎"`.
    Str,
    /// The value separator token, which has the literal value `,`.
    ValueSep,
    /// A maximal string of insignificant whitespace.
    ///
    /// "Maximal" here means that a run of consecutive whitespace characters is reported as a
    /// single `White` token, never as one token per character.
    White,
}
151
152impl Token {
153 /// Returns `true` for lexical tokens that are literal JSON values and `false` otherwise.
154 ///
155 /// The following tokens are considered literal values:
156 /// - [`LitFalse`][Token::LitFalse]
157 /// - [`LitNull`][Token::LitNull]
158 /// - [`LitTrue`][Token::LitTrue]
159 ///
160 /// # Examples
161 ///
162 /// ```
163 /// # use bufjson::lexical::Token;
164 /// assert!(Token::LitFalse.is_literal());
165 /// assert!(Token::LitNull.is_literal());
166 /// assert!(Token::LitTrue.is_literal());
167 ///
168 /// assert!(!Token::Str.is_literal());
169 /// assert!(!Token::Eof.is_literal());
170 /// ```
171 #[inline]
172 pub const fn is_literal(&self) -> bool {
173 matches!(self, Self::LitFalse | Self::LitNull | Self::LitTrue)
174 }
175
176 /// Returns `true` for pseudo-tokens and `false` otherwise.
177 ///
178 /// The following are considered pseudo-tokens:
179 /// - [`Eof`][Token::Eof]
180 /// - [`Err`][Token::Err]
181 /// - [`White`][Token::White]
182 ///
183 /// # Examples
184 ///
185 /// ```
186 /// # use bufjson::lexical::Token;
187 /// assert!(Token::Eof.is_pseudo());
188 /// assert!(Token::Err.is_pseudo());
189 /// assert!(Token::White.is_pseudo());
190 ///
191 /// assert!(!Token::ArrEnd.is_pseudo());
192 /// assert!(!Token::LitNull.is_pseudo());
193 /// assert!(!Token::Num.is_pseudo());
194 /// ```
195 #[inline]
196 pub const fn is_pseudo(&self) -> bool {
197 matches!(self, Self::Eof | Self::Err | Self::White)
198 }
199
200 /// Returns `true` for lexical tokens that are primitive JSON values and `false` otherwise.
201 ///
202 /// The following tokens are considered primitive values:
203 /// - [`LitFalse`][Token::LitFalse]
204 /// - [`LitNull`][Token::LitNull]
205 /// - [`LitTrue`][Token::LitTrue]
206 /// - [`Num`][Token::Num]
207 /// - [`Str`][Token::Str]
208 ///
209 /// # Examples
210 ///
211 /// ```
212 /// # use bufjson::lexical::Token;
213 /// assert!(Token::LitNull.is_primitive());
214 /// assert!(Token::Num.is_primitive());
215 /// assert!(Token::Str.is_primitive());
216 ///
217 /// assert!(!Token::ObjEnd.is_primitive());
218 /// assert!(!Token::White.is_primitive());
219 /// ```
220 #[inline]
221 pub const fn is_primitive(&self) -> bool {
222 matches!(
223 self,
224 Self::LitFalse | Self::LitNull | Self::LitTrue | Self::Num | Self::Str
225 )
226 }
227
228 /// Returns `true` for lexical tokens that are punctuation and `false` otherwise.
229 ///
230 /// The following tokens are considered punctuation:
231 ///
232 /// - [`ArrBegin`][Token::ArrBegin]
233 /// - [`ArrEnd`][Token::ArrEnd]
234 /// - [`NameSep`][Token::NameSep]
235 /// - [`ObjBegin`][Token::ObjBegin]
236 /// - [`ObjEnd`][Token::ObjEnd]
237 /// - [`ValueSep`][Token::ValueSep]
238 ///
239 /// # Examples
240 ///
241 /// ```
242 /// # use bufjson::lexical::Token;
243 /// assert!(Token::ArrBegin.is_punct());
244 /// assert!(Token::ValueSep.is_punct());
245 ///
246 /// assert!(!Token::Num.is_punct());
247 /// assert!(!Token::White.is_punct());
248 /// assert!(!Token::Err.is_punct());
249 /// ```
250 #[inline]
251 pub const fn is_punct(&self) -> bool {
252 matches!(
253 self,
254 Self::ArrBegin
255 | Self::ArrEnd
256 | Self::NameSep
257 | Self::ObjBegin
258 | Self::ObjEnd
259 | Self::ValueSep
260 )
261 }
262
263 /// Returns `true` for pseudo-tokens that terminate a stream of lexical tokens and `false`
264 /// otherwise.
265 ///
266 /// The following tokens are considered terminal:
267 /// - [`Eof`][Token::Eof]
268 /// - [`Err`][Token::Err]
269 ///
270 /// # Examples
271 ///
272 /// ```
273 /// # use bufjson::lexical::Token;
274 /// assert!(Token::Eof.is_terminal());
275 /// assert!(Token::Err.is_terminal());
276 ///
277 /// assert!(!Token::Num.is_terminal());
278 /// assert!(!Token::ObjBegin.is_terminal());
279 /// assert!(!Token::White.is_terminal());
280 /// ```
281 #[inline]
282 pub const fn is_terminal(&self) -> bool {
283 matches!(self, Self::Eof | Self::Err)
284 }
285
286 /// Returns the static content for lexical tokens that always have the same static text content.
287 ///
288 /// # Examples
289 ///
290 /// ```
291 /// # use bufjson::lexical::Token;
292 /// assert_eq!(Some("["), Token::ArrBegin.static_content());
293 /// assert_eq!(Some("true"), Token::LitTrue.static_content());
294 ///
295 /// assert_eq!(None, Token::Str.static_content());
296 /// assert_eq!(None, Token::White.static_content());
297 /// ```
298 #[inline]
299 pub const fn static_content(&self) -> Option<&'static str> {
300 match self {
301 Self::ArrBegin => Some("["),
302 Self::ArrEnd => Some("]"),
303 Self::LitFalse => Some("false"),
304 Self::LitNull => Some("null"),
305 Self::LitTrue => Some("true"),
306 Self::NameSep => Some(":"),
307 Self::ObjBegin => Some("{"),
308 Self::ObjEnd => Some("}"),
309 Self::ValueSep => Some(","),
310 _ => None,
311 }
312 }
313}
314
315impl fmt::Display for Token {
316 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
317 let s = match self {
318 Self::ArrBegin => "[",
319 Self::ArrEnd => "]",
320 Self::Eof => "EOF",
321 Self::Err => "error",
322 Self::LitFalse => "false",
323 Self::LitNull => "null",
324 Self::LitTrue => "true",
325 Self::NameSep => ":",
326 Self::Num => "number",
327 Self::ObjBegin => "{",
328 Self::ObjEnd => "}",
329 Self::Str => "string",
330 Self::ValueSep => ",",
331 Self::White => "whitespace",
332 };
333
334 f.write_str(s)
335 }
336}
337
/// Result of expanding escape sequences in a JSON string token.
///
/// An `Unescaped` value is a valid UTF-8 string that is free of JSON escape sequences. It either
/// contains the literal content of a JSON token exactly as it appears in the JSON text, if the
/// token did not contain any escape sequences; or it contains the normalized version of the token
/// content with escape sequences expanded, if the token had at least one escape sequence.
/// Evidently, the latter case can only occur for string tokens.
///
/// # Example
///
/// ```
/// use bufjson::lexical::{Token, Unescaped, fixed::FixedAnalyzer};
///
/// let mut lexer = FixedAnalyzer::new(&br#"["foo\u0020bar"]"#[..]);
///
/// assert_eq!(Token::ArrBegin, lexer.next());
/// let content = lexer.content();
/// let u: Unescaped<_> = content.unescaped();
/// assert!(u.is_literal());
/// assert_eq!("[", u);
///
/// assert_eq!(Token::Str, lexer.next());
/// let content = lexer.content();
/// let u: Unescaped<_> = content.unescaped();
/// assert!(u.is_expanded());
/// assert_eq!(r#""foo bar""#, u);
/// ```
#[derive(Clone, Debug)]
pub enum Unescaped<T> {
    /// Literal content of the JSON token exactly as it appears in the JSON text.
    Literal(T),

    /// Normalized content of the JSON token with all escape sequences expanded.
    Expanded(String),
}

impl<T> Unescaped<T> {
    /// Returns the literal content if available.
    ///
    /// The return value is `Some(...)` if `self` is [`Literal`], and `None` otherwise.
    ///
    /// [`Literal`]: Self::Literal
    pub fn literal(&self) -> Option<&T> {
        if let Self::Literal(lit) = self {
            Some(lit)
        } else {
            None
        }
    }

    /// Returns the expanded content if available.
    ///
    /// The return value is `Some(...)` if `self` is [`Expanded`], and `None` otherwise.
    ///
    /// [`Expanded`]: Self::Expanded
    pub fn expanded(&self) -> Option<&str> {
        if let Self::Expanded(exp) = self {
            Some(exp.as_str())
        } else {
            None
        }
    }

    /// Returns `true` if `self` is [`Literal`], and `false` otherwise.
    ///
    /// [`Literal`]: Self::Literal
    pub fn is_literal(&self) -> bool {
        self.literal().is_some()
    }

    /// Returns `true` if `self` is [`Expanded`], and `false` otherwise.
    ///
    /// [`Expanded`]: Self::Expanded
    pub fn is_expanded(&self) -> bool {
        self.expanded().is_some()
    }
}
413
/// Converts an [`Unescaped`] value into buffer form: a literal becomes the inner type's
/// `T::Buf`, while expanded text becomes the buffer form of its owned `String`.
impl<T: IntoBuf> IntoBuf for Unescaped<T> {
    type Buf = UnescapedBuf<T::Buf>;

    fn into_buf(self) -> Self::Buf {
        match self {
            // Wrap in the private inner enum so callers only see the opaque `UnescapedBuf`.
            Self::Literal(t) => UnescapedBuf(UnescapedBufInner::Literal(t.into_buf())),
            Self::Expanded(e) => UnescapedBuf(UnescapedBufInner::Expanded(e.into_buf())),
        }
    }
}
424
425impl AsRef<str> for Unescaped<&str> {
426 fn as_ref(&self) -> &str {
427 match self {
428 Unescaped::Literal(t) => t,
429 Unescaped::Expanded(e) => e.as_str(),
430 }
431 }
432}
433
434impl AsRef<[u8]> for Unescaped<&str> {
435 fn as_ref(&self) -> &[u8] {
436 match self {
437 Unescaped::Literal(t) => t.as_bytes(),
438 Unescaped::Expanded(e) => e.as_bytes(),
439 }
440 }
441}
442
443impl Deref for Unescaped<&str> {
444 type Target = str;
445
446 fn deref(&self) -> &str {
447 match self {
448 Unescaped::Literal(t) => t,
449 Unescaped::Expanded(e) => e.as_str(),
450 }
451 }
452}
453
454impl Borrow<str> for Unescaped<&str> {
455 fn borrow(&self) -> &str {
456 match self {
457 Unescaped::Literal(t) => t,
458 Unescaped::Expanded(e) => e.as_str(),
459 }
460 }
461}
462
463impl<T> fmt::Display for Unescaped<T>
464where
465 T: fmt::Display,
466{
467 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
468 match self {
469 Unescaped::Literal(t) => t.fmt(f),
470 Unescaped::Expanded(e) => e.fmt(f),
471 }
472 }
473}
474
475impl<T> Eq for Unescaped<T> where T: Eq + EqStr {}
476
477impl<T> From<Unescaped<T>> for String
478where
479 String: From<T>,
480{
481 fn from(u: Unescaped<T>) -> Self {
482 match u {
483 Unescaped::Literal(t) => t.into(),
484 Unescaped::Expanded(e) => e,
485 }
486 }
487}
488
489impl<T> Hash for Unescaped<T>
490where
491 T: Hash,
492{
493 fn hash<H: Hasher>(&self, state: &mut H) {
494 match self {
495 Unescaped::Literal(t) => t.hash(state),
496 Unescaped::Expanded(e) => e.hash(state),
497 }
498 }
499}
500
impl<T> Ord for Unescaped<T>
where
    T: Eq + Ord + EqStr + OrdStr,
    Self: Eq + PartialOrd,
{
    /// Totally orders two values by content, comparing across variants as strings.
    fn cmp(&self, other: &Unescaped<T>) -> Ordering {
        match (self, other) {
            // Same variant on both sides: defer to the inner type's own total order.
            (Unescaped::Literal(t1), Unescaped::Literal(t2)) => Ord::cmp(t1, t2),
            (Unescaped::Expanded(e1), Unescaped::Expanded(e2)) => e1.cmp(e2),
            // Mixed variants: `OrdStr::cmp` compares a `T` against a `&str`. The second arm has
            // the operands swapped relative to `self`/`other`, so its result is reversed to keep
            // the ordering antisymmetric.
            (Unescaped::Literal(t), Unescaped::Expanded(e)) => OrdStr::cmp(t, e.as_str()),
            (Unescaped::Expanded(e), Unescaped::Literal(t)) => OrdStr::cmp(t, e.as_str()).reverse(),
        }
    }
}
515
516impl<T> PartialEq<Unescaped<T>> for Unescaped<T>
517where
518 T: for<'a> PartialEq<&'a str>,
519 T: PartialEq<T>,
520{
521 fn eq(&self, other: &Unescaped<T>) -> bool {
522 match (self, other) {
523 (Unescaped::Literal(t1), Unescaped::Literal(t2)) => t1 == t2,
524 (Unescaped::Expanded(e1), Unescaped::Expanded(e2)) => e1 == e2,
525 (Unescaped::Literal(t1), Unescaped::Expanded(e2)) => *t1 == e2.as_str(),
526 (Unescaped::Expanded(e1), Unescaped::Literal(t2)) => *t2 == e1.as_str(),
527 }
528 }
529}
530
531impl<T> PartialEq<&str> for Unescaped<T>
532where
533 T: for<'a> PartialEq<&'a str>,
534{
535 fn eq(&self, other: &&str) -> bool {
536 match self {
537 Unescaped::Literal(t) => *t == *other,
538 Unescaped::Expanded(e) => e == other,
539 }
540 }
541}
542
// Mirror impl enabling `"x" == u` with the string slice on the left-hand side.
// The `'a: 'b` outlives-bound lets the longer-lived `&'a str` stand in where `T`'s
// `PartialEq<&'b str>` expects the shorter lifetime.
impl<'a, 'b, T> PartialEq<Unescaped<T>> for &'a str
where
    T: PartialEq<&'b str>,
    'a: 'b,
{
    /// Compares a string slice against the unescaped content.
    fn eq(&self, other: &Unescaped<T>) -> bool {
        match other {
            Unescaped::Literal(t) => *t == *self,
            Unescaped::Expanded(e) => self == e,
        }
    }
}
555
556impl<T> PartialEq<String> for Unescaped<T>
557where
558 T: PartialEq<String>,
559{
560 fn eq(&self, other: &String) -> bool {
561 match self {
562 Unescaped::Literal(t) => t == other,
563 Unescaped::Expanded(e) => e == other,
564 }
565 }
566}
567
568impl<T> PartialEq<Unescaped<T>> for String
569where
570 T: PartialEq<String>,
571{
572 fn eq(&self, other: &Unescaped<T>) -> bool {
573 match other {
574 Unescaped::Literal(t) => t == self,
575 Unescaped::Expanded(e) => self == e,
576 }
577 }
578}
579
impl<T> PartialOrd<Unescaped<T>> for Unescaped<T>
where
    T: for<'a> PartialOrd<&'a str>,
    for<'a> &'a str: PartialOrd<T>,
    T: PartialOrd<T>,
    Self: PartialEq,
{
    /// Orders two values by content, comparing across variants via `&str`.
    fn partial_cmp(&self, other: &Unescaped<T>) -> Option<Ordering> {
        match (self, other) {
            // Same variant on both sides: defer to the inner type's ordering.
            (Unescaped::Literal(t1), Unescaped::Literal(t2)) => t1.partial_cmp(t2),
            (Unescaped::Expanded(e1), Unescaped::Expanded(e2)) => e1.partial_cmp(e2),
            // Mixed variants: compare the literal `T` against the expanded text as `&str`.
            (Unescaped::Literal(t), Unescaped::Expanded(e)) => t.partial_cmp(&e.as_str()),
            // Swapped operand order uses the `&str: PartialOrd<T>` bound instead of reversing.
            (Unescaped::Expanded(e), Unescaped::Literal(t)) => {
                PartialOrd::<T>::partial_cmp(&e.as_str(), t)
            }
        }
    }
}
598
599impl<T> PartialOrd<&str> for Unescaped<T>
600where
601 T: for<'a> PartialOrd<&'a str>,
602 Self: for<'a> PartialEq<&'a str>,
603{
604 fn partial_cmp(&self, other: &&str) -> Option<Ordering> {
605 match self {
606 Unescaped::Literal(t) => t.partial_cmp(other),
607 Unescaped::Expanded(e) => e.as_str().partial_cmp(*other),
608 }
609 }
610}
611
// Mirror impl enabling `"x" < u` comparisons with the string slice on the left-hand side.
impl<T> PartialOrd<Unescaped<T>> for &str
where
    Self: PartialOrd<T>,
    Self: for<'c> PartialEq<Unescaped<T>>,
{
    /// Orders a string slice relative to the unescaped content.
    fn partial_cmp(&self, other: &Unescaped<T>) -> Option<Ordering> {
        match other {
            Unescaped::Literal(t) => self.partial_cmp(t),
            // Fully-qualified call selects `&str`'s own ordering against the expanded text.
            Unescaped::Expanded(e) => PartialOrd::<&str>::partial_cmp(self, &e.as_str()),
        }
    }
}
624
/// Private storage behind [`UnescapedBuf`]: either the literal token's own buffer type, or a
/// [`StringBuf`] over the freshly expanded string.
#[derive(Debug)]
enum UnescapedBufInner<B> {
    /// Buffer over the token's literal text (no escapes were expanded).
    Literal(B),
    /// Buffer over an owned, escape-expanded string.
    Expanded(StringBuf),
}
630
/// A [`Buf`] implementation for [`Unescaped`].
///
/// Values of this type are obtained by calling [`IntoBuf::into_buf`] on an [`Unescaped`] value;
/// the inner enum is private, so `UnescapedBuf` is opaque apart from its [`Buf`] behavior.
///
/// # Example
///
/// ```
/// use bufjson::{Buf, IntoBuf, lexical::{Token, UnescapedBuf, fixed::FixedAnalyzer}};
///
/// let mut lexer = FixedAnalyzer::new(&b"123.456"[..]);
///
/// assert_eq!(Token::Num, lexer.next());
///
/// let content = lexer.content();
/// let unescaped = content.unescaped();
/// let mut buf: UnescapedBuf<_> = unescaped.into_buf();
///
/// buf.advance(4);
/// assert_eq!(3, buf.remaining());
/// assert_eq!(b"456", buf.chunk());
/// ```
#[derive(Debug)]
pub struct UnescapedBuf<B>(UnescapedBufInner<B>);
652
653impl<B: Buf> Buf for UnescapedBuf<B> {
654 fn advance(&mut self, n: usize) {
655 match &mut self.0 {
656 UnescapedBufInner::Literal(b) => b.advance(n),
657 UnescapedBufInner::Expanded(e) => e.advance(n),
658 }
659 }
660
661 fn chunk(&self) -> &[u8] {
662 match &self.0 {
663 UnescapedBufInner::Literal(b) => b.chunk(),
664 UnescapedBufInner::Expanded(e) => e.chunk(),
665 }
666 }
667
668 fn remaining(&self) -> usize {
669 match &self.0 {
670 UnescapedBufInner::Literal(b) => b.remaining(),
671 UnescapedBufInner::Expanded(e) => e.remaining(),
672 }
673 }
674
675 fn try_copy_to_slice(&mut self, dst: &mut [u8]) -> Result<(), crate::BufUnderflow> {
676 match &mut self.0 {
677 UnescapedBufInner::Literal(b) => b.try_copy_to_slice(dst),
678 UnescapedBufInner::Expanded(e) => e.try_copy_to_slice(dst),
679 }
680 }
681}
682
/// Text content of a JSON token.
///
/// Contains the actual textual *content* of the JSON token read from the JSON text. This is in
/// distinction to [`Token`], which only indicates the *type* of the token.
///
/// For example, consider the following JSON text:
///
/// ```json
/// "foo"
/// ```
///
/// The above JSON text contains one token whose type is [`Token::Str`] and whose content is `"foo"`.
pub trait Content: fmt::Debug {
    /// Type that contains the literal string of the token exactly as it appears in the JSON text.
    ///
    /// The [`IntoBuf`] bound lets callers turn the literal content into a byte buffer without an
    /// extra copy.
    type Literal<'a>: IntoBuf
    where
        Self: 'a;

    /// Returns the literal content of the token exactly as it appears in the JSON text.
    ///
    /// # Static content tokens
    ///
    /// For token types with a static text content, *e.g.* [`Token::NameSep`], the value returned
    /// is the static content.
    ///
    /// # Numbers
    ///
    /// For number tokens, the value returned is the literal text of the number token.
    ///
    /// # Strings
    ///
    /// For string tokens, the value returned is the literal text of the string token *including*
    /// the opening and closing double quote (`"`) characters. Therefore, every string token has
    /// length at least two and the unquoted value can be extracted by dropping the first and last
    /// characters.
    ///
    /// Because the return value contains the entire literal string token as it appears in the JSON
    /// text, any escape sequences the string may contain are not expanded. This has the benefit
    /// of supporting the following use cases: allowing lexical analyzer implementations to minimize
    /// or eliminate allocations when returning token values; and allowing applications to observe
    /// or edit a stream of JSON tokens without making any unintended changes to the raw JSON input.
    ///
    /// Some applications need to have escape sequences expanded in order to work with normalized
    /// strings. For example, it's pretty hard to reliably do a dictionary lookup for the name
    /// `"foo"` if the literal value might be `"fo\u006f"`, `"f\u006f\u006f"`, `"\u0066oo"`, *etc.*
    /// To check if the string contains an escape sequence, use [`is_escaped`]; and to obtain the
    /// normalized value with all escape sequences expanded, use [`unescaped`].
    ///
    /// [`is_escaped`]: method@Self::is_escaped
    /// [`unescaped`]: method@Self::unescaped
    ///
    /// # Whitespace
    ///
    /// For whitespace tokens, the value returned is the literal string of whitespace characters.
    ///
    /// # End of file
    ///
    /// For the pseudo-token [`Token::Eof`], the value is the empty string.
    fn literal<'a>(&'a self) -> Self::Literal<'a>;

    /// Indicates whether the token content contains escape sequences.
    ///
    /// This method must always return `false` for all token types except [`Token::Str`]. For
    /// [`Token::Str`], it must return `true` if the literal text of the string token contains at
    /// least one escape sequence, and `false` otherwise.
    fn is_escaped(&self) -> bool;

    /// Returns a normalized version of [`literal`] with all escape sequences in the JSON text fully
    /// expanded.
    ///
    /// For non-string tokens, and string tokens for which [`is_escaped`] returns `false`, this
    /// method returns an [`Unescaped::Literal`] containing the same value returned by [`literal`].
    ///
    /// For string tokens with one or more escape sequences, this method returns an
    /// [`Unescaped::Expanded`] containing a normalized version of the string value with the escape
    /// sequences expanded. An allocation will be triggered by this expansion, so it may be
    /// preferable to cache the value returned rather than calling this method repeatedly on the
    /// same content.
    ///
    /// As described in the [JSON spec][rfc], the following escape sequence expansions are done:
    ///
    /// | Sequence | Expands to |
    /// |-|-|
    /// | `\"` | Quotation mark, `"`, U+0022 |
    /// | `\\` | Reverse solidus, `\`, U+005c |
    /// | `\/` | Solidus, `/`, U+002f |
    /// | `\b` | Backspace, U+0008 |
    /// | `\f` | Form feed, U+000c |
    /// | `\n` | Line feed, U+000a |
    /// | `\r` | Carriage return, U+000d |
    /// | `\t` | Horizontal tab, U+0009 |
    /// | `\uXXXX` | Any Unicode character in basic multilingual plane, U+0000 through U+ffff |
    /// | `\uHHHH\uLLLL` | Unicode characters outside the basic multilingual plane represented as a high/low surrogate pair |
    ///
    /// [`literal`]: method@Self::literal
    /// [`is_escaped`]: method@Self::is_escaped
    /// [rfc]: https://datatracker.ietf.org/doc/html/rfc8259
    fn unescaped<'a>(&'a self) -> Unescaped<Self::Literal<'a>>;
}
782
/// Character or class of characters expected at the next input position of a JSON text.
///
/// This enumeration provides detail information for [`ErrorKind::UnexpectedByte`].
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum Expect {
    /// Any token boundary character.
    ///
    /// One of:
    /// - `'{'` (opening brace, U+007B)
    /// - `'}'` (closing brace, U+007D)
    /// - `'['` (opening bracket, U+005B)
    /// - `']'` (closing bracket, U+005D)
    /// - `':'` (colon, U+003A)
    /// - `','` (comma, U+002C)
    /// - `' '` (space, U+0020)
    /// - `'\t'` (horizontal tab, U+0009)
    /// - `'\n'` (line feed, U+000A)
    /// - `'\r'` (carriage return, U+000D).
    Boundary,

    /// A specific character.
    Char(char),

    /// Any decimal digit character, `'0'`..`'9'` (U+0030..U+0039).
    Digit,

    /// Any decimal digit character ([`Digit`]); or one of the two exponent indicator characters 'E'
    /// (U+0045) or 'e' (U+0065); or any token boundary character ([`Boundary`]).
    ///
    /// [`Digit`]: Expect::Digit
    /// [`Boundary`]: Expect::Boundary
    DigitExpOrBoundary,

    /// Any decimal digit character ([`Digit`]) or one of the two exponent sign characters `'+'`
    /// (U+002B) or `'-'` (U+002D).
    ///
    /// [`Digit`]: Expect::Digit
    DigitOrExpSign,

    /// Any decimal digit character ([`Digit`]) or token boundary character ([`Boundary`]).
    ///
    /// [`Digit`]: Expect::Digit
    /// [`Boundary`]: Expect::Boundary
    DigitOrBoundary,

    /// The dot or period character `'.'` (U+002E); one of the two exponent indicator characters 'E'
    /// (U+0045) or 'e' (U+0065); or any token boundary character ([`Boundary`]).
    ///
    /// [`Boundary`]: Expect::Boundary
    DotExpOrBoundary,

    /// Any character that completes a short-form escape sequence or starts a Unicode escape
    /// sequence.
    ///
    /// One of:
    /// - `'"'` (double quotation mark, U+0022)
    /// - `'\\'` (reverse solidus, U+005C)
    /// - `'/'` (solidus, U+002F)
    /// - `'b'` (lowercase 'b', U+0062)
    /// - `'f'` (lowercase 'f', U+0066)
    /// - `'n'` (lowercase 'n', U+006E)
    /// - `'r'` (lowercase 'r', U+0072)
    /// - `'t'` (lowercase 't', U+0074)
    /// - `'u'` (lowercase 'u', U+0075)
    EscChar,

    /// Any character that is valid in a JSON string token, or the string token termination
    /// character `'"'` (double quotation mark, U+0022).
    ///
    /// This essentially means any valid Unicode character at or above the space `' '` (U+0020).
    StrChar,

    /// Any character that validly starts a JSON token.
    ///
    /// One of:
    ///
    /// - A boundary character ([`Boundary`])
    /// - A digit ([`Digit`])
    /// - `'"'` (double quotation mark, U+0022)
    /// - `'f'` (U+0066)
    /// - `'n'` (U+006E)
    /// - `'t'` (U+0074)
    ///
    /// NOTE(review): a minus sign `'-'` (U+002D) can also start a number token per RFC 8259 —
    /// confirm whether the analyzer reports it under this class or [`Digit`].
    ///
    /// [`Digit`]: Expect::Digit
    /// [`Boundary`]: Expect::Boundary
    TokenStartChar,

    /// Any hexadecimal digit character allowed in a Unicode escape sequence.
    ///
    /// One of:
    /// - A decimal digit character ([`Digit`])
    /// - An uppercase letter `'A'`..`'F'` (U+0041..U+0046)
    /// - A lowercase letter `'a'`..`'f'` (U+0061..U+0066)
    ///
    /// [`Digit`]: Expect::Digit
    UnicodeEscHexDigit,
}

impl fmt::Display for Expect {
    /// Writes a human-readable description of the expected character class.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Boundary => f.write_str("boundary character or EOF"),
            Self::Char(c) => write!(f, "character '{c}'"),
            Self::Digit => f.write_str("digit character '0'..'9'"),
            Self::DigitOrBoundary => f.write_str("digit character '0'..'9', boundary character, or EOF"),
            Self::DigitExpOrBoundary => f.write_str("digit character '0'..'9', exponent character 'E' or 'e', boundary character, or EOF"),
            Self::DigitOrExpSign => f.write_str("exponent sign character '+' or '-', or exponent digit character '0'..'9'"),
            // Fixed: previous text had a stray apostrophe before "exponent".
            Self::DotExpOrBoundary => f.write_str("character '.', exponent character 'E' or 'e', boundary character, or EOF"),
            // Fixed: previous text omitted 'b' and 'f', contradicting the `EscChar` docs.
            Self::EscChar => f.write_str("escape sequence character '\"', '\\', '/', 'b', 'f', 'n', 'r', 't', or 'u'"),
            Self::StrChar => f.write_str("string character"),
            Self::TokenStartChar => f.write_str("token start character"),
            Self::UnicodeEscHexDigit => f.write_str("Unicode escape sequence hex digit '0'..'9', 'A'..'F', or 'a'..'f'"),
        }
    }
}
898
/// Category of error that can occur while tokenizing a JSON text.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum ErrorKind {
    /// A Unicode escape sequence of the form `\uLLLL` or `\uHHHH\uLLLL` within a
    /// [string token][Token::Str] has a bad Unicode surrogate.
    BadSurrogate {
        /// The 16-bit number read from the first Unicode escape sequence.
        ///
        /// This will always be a valid Unicode surrogate code unit, either a high surrogate or a
        /// low surrogate code unit.
        first: u16,

        /// The 16-bit number read from Unicode escape sequence that immediately followed the first
        /// escape sequence (if there was one).
        ///
        /// This may be a Unicode high surrogate code unit, or it may be a valid Unicode code point,
        /// but will never be a low surrogate code unit.
        second: Option<u16>,

        /// Byte offset from the start of the last Unicode escape sequence (`first` if `second` is
        /// `None`, otherwise `second`) where the error was detected.
        offset: u8,
    },

    /// A UTF-8 continuation byte within a [string token][Token::Str] has an invalid value.
    BadUtf8ContByte {
        /// Length of the UTF-8 byte sequence.
        seq_len: u8,

        /// Zero-based offset of the invalid byte value within the sequence.
        offset: u8,

        /// Invalid byte value.
        value: u8,
    },

    /// The underlying source of JSON text (*e.g.* a file or stream) reported an error when the
    /// lexical analyzer tried to read the next block of JSON text to analyze.
    Read,

    /// An unexpected byte was encountered in a token.
    UnexpectedByte {
        /// Type of token within which the unexpected byte was encountered.
        token: Option<Token>,

        /// Character or characters expected.
        expect: Expect,

        /// The unexpected byte actually encountered.
        actual: u8,
    },

    /// The JSON text ended abruptly in the middle of an incomplete lexical token.
    UnexpectedEof(Token),
}
954
955impl ErrorKind {
956 pub(crate) fn bad_utf8_cont_byte(seq_len: u8, offset: u8, value: u8) -> ErrorKind {
957 ErrorKind::BadUtf8ContByte {
958 seq_len,
959 offset,
960 value,
961 }
962 }
963
964 pub(crate) fn expect_boundary(token: Token, actual: u8) -> ErrorKind {
965 let expect = Expect::Boundary;
966
967 ErrorKind::UnexpectedByte {
968 token: Some(token),
969 expect,
970 actual,
971 }
972 }
973
974 pub(crate) fn expect_char(token: Token, actual: u8, expect: char) -> ErrorKind {
975 let expect = Expect::Char(expect);
976
977 ErrorKind::UnexpectedByte {
978 token: Some(token),
979 expect,
980 actual,
981 }
982 }
983
984 pub(crate) fn expect_digit(actual: u8) -> ErrorKind {
985 let expect = Expect::Digit;
986
987 ErrorKind::UnexpectedByte {
988 token: Some(Token::Num),
989 expect,
990 actual,
991 }
992 }
993
994 pub(crate) fn expect_digit_exp_or_boundary(actual: u8) -> ErrorKind {
995 let expect = Expect::DigitExpOrBoundary;
996
997 ErrorKind::UnexpectedByte {
998 token: Some(Token::Num),
999 expect,
1000 actual,
1001 }
1002 }
1003
1004 pub(crate) fn expect_digit_or_boundary(actual: u8) -> ErrorKind {
1005 let expect = Expect::DigitOrBoundary;
1006
1007 ErrorKind::UnexpectedByte {
1008 token: Some(Token::Num),
1009 expect,
1010 actual,
1011 }
1012 }
1013
1014 pub(crate) fn expect_dot_exp_or_boundary(actual: u8) -> ErrorKind {
1015 let expect = Expect::DotExpOrBoundary;
1016
1017 ErrorKind::UnexpectedByte {
1018 token: Some(Token::Num),
1019 expect,
1020 actual,
1021 }
1022 }
1023
1024 pub(crate) fn expect_esc_char(actual: u8) -> ErrorKind {
1025 let expect = Expect::EscChar;
1026
1027 ErrorKind::UnexpectedByte {
1028 token: Some(Token::Str),
1029 expect,
1030 actual,
1031 }
1032 }
1033
1034 pub(crate) fn expect_exp_sign_or_digit(actual: u8) -> ErrorKind {
1035 let expect = Expect::DigitOrExpSign;
1036
1037 ErrorKind::UnexpectedByte {
1038 token: Some(Token::Num),
1039 expect,
1040 actual,
1041 }
1042 }
1043
1044 pub(crate) fn expect_str_char(actual: u8) -> ErrorKind {
1045 let expect = Expect::StrChar;
1046
1047 ErrorKind::UnexpectedByte {
1048 token: Some(Token::Str),
1049 expect,
1050 actual,
1051 }
1052 }
1053
1054 pub(crate) fn expect_token_start_char(actual: u8) -> ErrorKind {
1055 let expect = Expect::TokenStartChar;
1056
1057 ErrorKind::UnexpectedByte {
1058 token: None,
1059 expect,
1060 actual,
1061 }
1062 }
1063
1064 pub(crate) fn expect_unicode_esc_hex_digit(actual: u8) -> ErrorKind {
1065 let expect = Expect::UnicodeEscHexDigit;
1066
1067 ErrorKind::UnexpectedByte {
1068 token: Some(Token::Str),
1069 expect,
1070 actual,
1071 }
1072 }
1073
1074 pub(crate) fn fmt_at(&self, f: &mut fmt::Formatter, pos: Option<&Pos>) -> fmt::Result {
1075 match self {
1076 Self::BadSurrogate {
1077 first: lo,
1078 second: None,
1079 offset: _,
1080 } if (0xdc00..=0xdfff).contains(lo) => {
1081 write!(
1082 f,
1083 "bad Unicode escape sequence: low surrogate '\\u{lo:04X}' without preceding high surrogate"
1084 )?;
1085 }
1086
1087 Self::BadSurrogate {
1088 first: hi,
1089 second: None,
1090 offset: _,
1091 } => {
1092 write!(
1093 f,
1094 "bad Unicode escape sequence: high surrogate '\\u{hi:04X}' not followed by low surrogate"
1095 )?;
1096 }
1097
1098 Self::BadSurrogate {
1099 first: hi,
1100 second: Some(lo),
1101 offset: _,
1102 } => {
1103 write!(
1104 f,
1105 "bad Unicode escape sequence surrogate pair: high surrogate '\\u{hi:04X}' followed by invalid low surrogate '\\u{lo:04X}'"
1106 )?;
1107 }
1108
1109 Self::BadUtf8ContByte {
1110 seq_len,
1111 offset,
1112 value,
1113 } => {
1114 write!(
1115 f,
1116 "bad UTF-8 continuation byte 0x{value:02x} in {seq_len}-byte UTF-8 sequence (byte #{offset})"
1117 )?;
1118 }
1119
1120 Self::Read => write!(f, "read error")?,
1121
1122 Self::UnexpectedByte {
1123 token,
1124 expect,
1125 actual,
1126 } if (b' '..=0x7e).contains(actual) => {
1127 write!(
1128 f,
1129 "expected {expect} but got character '{}' (ASCII 0x{actual:02x})",
1130 *actual as char
1131 )?;
1132 if let Some(t) = token {
1133 write!(f, " in {t} token")?;
1134 }
1135 }
1136
1137 Self::UnexpectedByte {
1138 token,
1139 expect,
1140 actual,
1141 } => {
1142 write!(f, "expected {expect} but got byte {actual:02x}")?;
1143 if let Some(t) = token {
1144 write!(f, " in {t} token")?;
1145 }
1146 }
1147
1148 Self::UnexpectedEof(token) => {
1149 write!(f, "unexpected EOF in {token} token")?;
1150 }
1151 };
1152
1153 if let Some(p) = pos {
1154 write!(f, " at {}", *p)?;
1155 }
1156
1157 Ok(())
1158 }
1159}
1160
1161impl fmt::Display for ErrorKind {
1162 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1163 self.fmt_at(f, None)
1164 }
1165}
1166
/// An error encountered during lexical analysis of JSON text.
pub trait Error: std::error::Error + Send + Sync {
    /// Returns the category of error.
    ///
    /// If the category is [`ErrorKind::Read`], the underlying I/O error is available from the
    /// [`source`] method.
    ///
    /// [`source`]: method@std::error::Error::source
    fn kind(&self) -> ErrorKind;

    /// Returns the position in the JSON text where the error was encountered.
    ///
    /// The error position returned by this method is more precise than the position returned by
    /// [`Analyzer::pos`]. This is because [`Analyzer::pos`] returns the position of the start of
    /// the token returned by [`Analyzer::next`], while this method provides the granular position
    /// where the error occurred.
    ///
    /// For example, consider the following lexically-invalid JSON text:
    ///
    /// ```json
    /// "foo
    /// ```
    ///
    /// The above text contains an unterminated string token. A lexical analyzer tokenizing this
    /// text will return:
    ///
    /// 1. [`Token::Err`] on the first call to its [`next`][`Analyzer::next`] method, since the very
    ///    first token has an error.
    /// 2. The position of the first `"` character in the text on a subsequent call to its
    ///    [`pos`][Analyzer::pos] method, because that is the position of the start of the token
    ///    returned by [`next`][Analyzer::next].
    /// 3. An `Err` result containing an `Error` whose `pos` method (this method) returns the
    ///    position immediately right of the last `o` character in the text, because this is where
    ///    the actual error, an unexpected end of file, was encountered.
    fn pos(&self) -> &Pos;
}
1203
/// Lexical analyzer for JSON text.
///
/// Converts JSON text into a stream of lexical tokens.
pub trait Analyzer {
    /// The type that contains token content, returned by the [`content`] and [`try_content`]
    /// methods.
    ///
    /// [`content`]: method@Self::content
    /// [`try_content`]: method@Self::try_content
    type Content: Content;

    /// The type that reports errors during the lexical analysis process, returned by the [`err`]
    /// and [`try_content`] methods.
    ///
    /// [`err`]: method@Self::err
    /// [`try_content`]: method@Self::try_content
    type Error: Error;

    /// Recognizes the next lexical token in the JSON text.
    ///
    /// Returns the type of the token recognized. After this method returns, the text content of the
    /// recognized token can be obtained by calling the [`content`] method.
    ///
    /// If the end of the JSON text is reached, without encountering an error, the token type
    /// returned is `Token::Eof`; and this token type is also returned on all subsequent calls. For
    /// `Token::Eof`, the [`content`] method returns empty text.
    ///
    /// If an error is encountered while analyzing the JSON text, the token type returned is
    /// `Token::Err`; and this token type is also returned on all subsequent calls. For
    /// `Token::Err`, the error value is available from the [`err`] method; calling [`content`]
    /// will panic, and [`try_content`] returns an `Err` result.
    ///
    /// [`content`]: method@Self::content
    /// [`err`]: method@Self::err
    /// [`try_content`]: method@Self::try_content
    ///
    /// # Performance considerations
    ///
    /// Implementations of this method should not trigger an allocation unless an allocation is
    /// required to read in more input from an underlying source of JSON text, *e.g.* a file or
    /// stream. Outside this singular scenario, the process of recognizing the next JSON token
    /// should never allocate.
    fn next(&mut self) -> Token;

    /// Returns the text content of the non-error token most recently recognized by [`next`].
    ///
    /// If called before any call to `next`, returns empty content.
    ///
    /// If called repeatedly between calls to `next`, subsequent calls return a value equivalent to
    /// the value returned by the first call.
    ///
    /// # Panics
    ///
    /// Panics if the token most recently returned by `next` was [`Token::Err`].
    ///
    /// # Performance considerations
    ///
    /// A call to this method may allocate, although implementations should avoid allocation if
    /// possible. Therefore, it is best to cache the result of this method rather than calling it
    /// repeatedly to fetch the same value between calls to `next`. If the text content of the last
    /// token is not needed for some reason, the best course is not to call this method at all.
    ///
    /// [`next`]: method@Self::next
    #[inline]
    fn content(&self) -> Self::Content {
        self.try_content().unwrap()
    }

    /// Returns the error value associated with the error token most recently returned by [`next`].
    ///
    /// If called repeatedly after a call to `next` that returned [`Token::Err`], subsequent calls
    /// return a value equivalent to the value returned by the first call.
    ///
    /// # Panics
    ///
    /// If the token most recently returned by `next` was not [`Token::Err`].
    ///
    /// [`next`]: method@Self::next
    #[inline]
    fn err(&self) -> Self::Error {
        self.try_content().unwrap_err()
    }

    /// Returns the position of the lexical analyzer's cursor within the JSON text.
    ///
    /// Before the first call to [`next`], the return value is [`Pos::default`].
    ///
    /// After `next` is called, the return value is the position of the first character of the
    /// recognized token. In the case where `next` returns `Token::Err`, the return value is the
    /// position of the first character of the token that was being recognized at the time when the
    /// error was detected.
    ///
    /// [`next`]: method@Self::next
    fn pos(&self) -> &Pos;

    /// Returns the content or error value associated with the token most recently recognized by
    /// [`next`].
    ///
    /// If the most recent call to `next` returned [`Token::Err`], an `Err` result is returned.
    /// Otherwise, an `Ok` result containing the text content of the recognized lexical token is
    /// returned.
    ///
    /// If called before any call to `next`, this method returns an `Ok` result containing empty
    /// text.
    ///
    /// If called repeatedly between calls to `next`, subsequent calls return a value equivalent to
    /// the value returned by the first call.
    ///
    /// When the value of the most recent token is known, calling [`content`] or [`err`] directly,
    /// as the case may be, will produce cleaner and more compact code than calling this method and
    /// unwrapping the result.
    ///
    /// # Performance considerations
    ///
    /// A call to this method may allocate, although implementations should avoid allocation if
    /// possible. Therefore, it is best to cache the result of this method rather than calling it
    /// repeatedly to fetch the same value between calls to `next`. If the text content of the last
    /// token is not needed for some reason, the best course is not to call this method at all.
    ///
    /// [`next`]: method@Self::next
    /// [`content`]: method@Self::content
    /// [`err`]: method@Self::err
    fn try_content(&self) -> Result<Self::Content, Self::Error>;
}
1325
/// Converts a single ASCII hex digit to its numeric value.
///
/// Panics on any byte that is not `0-9`, `a-f`, or `A-F`; callers are expected to have
/// validated the byte already.
pub(crate) fn hex2u16(b: u8) -> u16 {
    let nibble = match b {
        b'0'..=b'9' => b - b'0',
        b'a'..=b'f' => b - b'a' + 10,
        b'A'..=b'F' => b - b'A' + 10,
        _ => panic!("invalid hex character: 0x{b:02x}"),
    };

    u16::from(nibble)
}
1334
/// Expands escape sequences in the content of a valid JSON string.
///
/// The [`Buf`] to unescape must contain the literal content of a valid JSON string value, as it
/// appears in the JSON text (with or without the surrounding double quotation mark characters).
///
/// The unescaped text is appended to the given byte vector.
///
/// # Panics
///
/// Panics if the input `Buf` contains an invalid or unterminated JSON escape sequence.
///
/// # Examples
///
/// Unescape a string with surrounding double quote characters...
///
/// ```
/// use bufjson::{Buf, lexical::unescape};
///
/// let mut dst = Vec::new();
/// unescape(r#""foo\nbar""#, &mut dst);
/// assert_eq!(
///     &br#""foo
/// bar""#[..],
///     &dst);
/// ```
///
/// ...Or without them...
///
/// ```
/// use bufjson::{Buf, lexical::unescape};
///
/// let mut dst = Vec::new();
/// unescape(r#"hello\u002c\u0020world"#, &mut dst);
/// assert_eq!(&b"hello, world"[..], &dst);
/// ```
///
/// # Notes
///
/// If the input contains no escape sequences, its bytes are copied to `dst` unchanged. Existing
/// content in `dst` is never modified; this function only appends.
pub fn unescape(literal: impl IntoBuf, dst: &mut Vec<u8>) {
    let mut literal = literal.into_buf();

    // Reserve bytes in the destination. If the incoming literal has at least one escape sequence,
    // the length should shrink by one, but if called erroneously, it might not shrink, and might
    // even be empty.
    if !literal.has_remaining() {
        return;
    }
    dst.reserve(literal.remaining());

    // State for a partially-consumed escape sequence. `len` encodes the parser state as the
    // number of bytes consumed after the opening '\':
    //   0      → just consumed '\'; awaiting the escape character
    //   1..=4  → inside the first `\uXXXX` escape; `len - 1` hex digits consumed so far
    //   5      → first `\uXXXX` was a high surrogate; awaiting '\' of the low surrogate escape
    //   6      → consumed '\' of the low surrogate escape; awaiting 'u'
    //   7..=10 → inside the second `\uXXXX` escape; `len - 7` hex digits consumed so far
    #[derive(Default)]
    struct Esc {
        len: u32, // Parser state: bytes consumed after the opening '\' (see table above).
        hi: u32, // High surrogate.
        lo: u32, // Low surrogate.
    }
    let mut esc: Option<Esc> = None;

    // Outer loop: one iteration per chunk of the input buffer. Escape sequences may straddle
    // chunk boundaries, which is why `esc` lives outside this loop.
    loop {
        let chunk = literal.chunk();
        // `i..j` is the span of plain (non-escape) bytes pending a bulk copy into `dst`.
        let (mut i, mut j) = (0usize, 0usize);

        loop {
            let b = chunk[j];
            match &mut esc {
                // Plain byte: just extend the pending copy span.
                None if b != b'\\' => j += 1,

                // Start of an escape sequence: flush the pending span first.
                None => {
                    dst.extend_from_slice(&chunk[i..j]);
                    esc = Some(Esc::default());
                    j += 1;
                    i = j;
                }

                // Byte immediately after '\': either a single-character escape or 'u'.
                Some(e) if e.len == 0 => {
                    // Emit the expansion of a single-character escape and reset the state.
                    let mut single = |b: u8, esc: &mut Option<Esc>| {
                        dst.push(b);
                        *esc = None;
                        j += 1;
                        i = j;
                    };

                    match b {
                        b'"' | b'\\' | b'/' => single(b, &mut esc),
                        b'b' => single(b'\x08', &mut esc),
                        b't' => single(b'\t', &mut esc),
                        b'f' => single(b'\x0c', &mut esc),
                        b'n' => single(b'\n', &mut esc),
                        b'r' => single(b'\r', &mut esc),
                        b'u' => {
                            e.len = 1;
                            j += 1;
                            i = j;
                        }
                        _ => panic!(r#"invalid escape sequence byte after '\': 0x{b:02x}"#),
                    }
                }

                // Hex digits of the first `\uXXXX` escape: accumulate into `hi`, most
                // significant nibble first.
                Some(e) if (1..=4).contains(&e.len) => {
                    let shift = 4 * (4 - e.len);
                    e.hi |= (hex2u16(b) as u32) << shift;
                    e.len += 1;
                    if e.len == 5 {
                        match e.hi {
                            // High surrogate: must be followed by a low surrogate escape.
                            0xd800..=0xdbff => (),

                            0xdc00..=0xdfff => panic!(
                                "Unicode escape low surrogate without preceding high surrogate: 0x{:02x}",
                                e.hi
                            ),

                            // Ordinary BMP code point: emit it immediately.
                            _ => {
                                append_code_point(e.hi, dst);
                                esc = None;
                            }
                        }
                    }
                    j += 1;
                    i = j;
                }

                // After a high surrogate: expect '\' starting the low surrogate escape.
                Some(e) if e.len == 5 && b == b'\\' => {
                    e.len = 6;
                    j += 1;
                    i = j;
                }

                Some(e) if e.len == 5 => panic!(
                    r#"expected '\' to start low surrogate Unicode escape after high surrogate 0x{:04x}, found byte 0x{b:02x}"#,
                    e.hi
                ),

                // After that '\': expect 'u' for the low surrogate escape.
                Some(e) if e.len == 6 && b == b'u' => {
                    e.len = 7;
                    j += 1;
                    i = j;
                }

                Some(e) if e.len == 6 => panic!(
                    r#"expected '\u' to start low surrogate Unicode escape after high surrogate 0x{:04x}, found '\' followed by byte {b:02x}"#,
                    e.hi
                ),

                // Hex digits of the second `\uXXXX` escape (the low surrogate).
                Some(e) if (7..=10).contains(&e.len) => {
                    let shift = 4 * (10 - e.len);
                    e.lo |= (hex2u16(b) as u32) << shift;
                    e.len += 1;
                    if e.len == 11 {
                        match e.lo {
                            0xdc00..=0xdfff => {
                                // Combine the surrogate pair into a supplementary code point
                                // (UTF-16 decoding: 0x10000 + (hi - 0xD800) * 0x400 + (lo - 0xDC00)).
                                let code_point =
                                    0x10000 + (((e.hi - 0xd800) << 10) | (e.lo - 0xdc00));
                                append_code_point(code_point, dst);
                                esc = None;
                            }

                            _ => {
                                panic!(
                                    "Unicode escape high surrogate not followed by low surrogate: 0x{:04x} and then 0x{:04x}",
                                    e.hi, e.lo
                                )
                            }
                        }
                    }
                    j += 1;
                    i = j;
                }

                // Every `len` value is covered by the arms above, so this is unreachable.
                _ => unreachable!(),
            }

            if j == chunk.len() {
                break;
            }
        }

        // Flush any trailing plain bytes from this chunk, then advance to the next one.
        dst.extend_from_slice(&chunk[i..j]);
        literal.advance(chunk.len());
        if !literal.has_remaining() {
            break;
        }
    }

    // Input ended mid-escape (the message is used for any incomplete escape, including a bare
    // trailing '\').
    if esc.is_some() {
        panic!("unexpected end of input within Unicode escape sequence");
    }
}
1520
/// Encodes `code_point` as UTF-8 and appends the bytes to `dst`.
///
/// Callers guarantee `code_point` is a valid Unicode scalar value (never an unpaired
/// surrogate), so the conversion cannot fail.
fn append_code_point(code_point: u32, dst: &mut Vec<u8>) {
    let Some(c) = char::from_u32(code_point) else {
        // Callers only pass validated scalar values.
        unreachable!()
    };

    let mut seq = [0u8; 4];
    dst.extend_from_slice(c.encode_utf8(&mut seq).as_bytes());
}
1532
#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;
    use std::collections::{BTreeMap, HashMap};

    #[rstest]
    #[case(Token::ArrBegin, false)]
    #[case(Token::ArrEnd, false)]
    #[case(Token::Eof, false)]
    #[case(Token::Err, false)]
    #[case(Token::LitFalse, true)]
    #[case(Token::LitNull, true)]
    #[case(Token::LitTrue, true)]
    #[case(Token::NameSep, false)]
    #[case(Token::Num, false)]
    #[case(Token::ObjBegin, false)]
    #[case(Token::ObjEnd, false)]
    #[case(Token::Str, false)]
    #[case(Token::ValueSep, false)]
    #[case(Token::White, false)]
    fn test_token_is_literal(#[case] token: Token, #[case] is_literal: bool) {
        assert_eq!(is_literal, token.is_literal());
    }

    #[rstest]
    #[case(Token::ArrBegin, false)]
    #[case(Token::ArrEnd, false)]
    #[case(Token::Eof, true)]
    #[case(Token::Err, true)]
    #[case(Token::LitFalse, false)]
    #[case(Token::LitNull, false)]
    #[case(Token::LitTrue, false)]
    #[case(Token::NameSep, false)]
    #[case(Token::Num, false)]
    #[case(Token::ObjBegin, false)]
    #[case(Token::ObjEnd, false)]
    #[case(Token::Str, false)]
    #[case(Token::ValueSep, false)]
    #[case(Token::White, true)]
    fn test_token_is_pseudo(#[case] token: Token, #[case] is_pseudo: bool) {
        assert_eq!(is_pseudo, token.is_pseudo());
    }

    #[rstest]
    #[case(Token::ArrBegin, false)]
    #[case(Token::ArrEnd, false)]
    #[case(Token::Eof, false)]
    #[case(Token::Err, false)]
    #[case(Token::LitFalse, true)]
    #[case(Token::LitNull, true)]
    #[case(Token::LitTrue, true)]
    #[case(Token::NameSep, false)]
    #[case(Token::Num, true)]
    #[case(Token::ObjBegin, false)]
    #[case(Token::ObjEnd, false)]
    #[case(Token::Str, true)]
    #[case(Token::ValueSep, false)]
    #[case(Token::White, false)]
    fn test_token_is_primitive(#[case] token: Token, #[case] is_primitive: bool) {
        assert_eq!(is_primitive, token.is_primitive());
    }

    #[rstest]
    #[case(Token::ArrBegin, true)]
    #[case(Token::ArrEnd, true)]
    #[case(Token::Eof, false)]
    #[case(Token::Err, false)]
    #[case(Token::LitFalse, false)]
    #[case(Token::LitNull, false)]
    #[case(Token::LitTrue, false)]
    #[case(Token::NameSep, true)]
    #[case(Token::Num, false)]
    #[case(Token::ObjBegin, true)]
    #[case(Token::ObjEnd, true)]
    #[case(Token::Str, false)]
    #[case(Token::ValueSep, true)]
    #[case(Token::White, false)]
    fn test_token_is_punct(#[case] token: Token, #[case] is_punct: bool) {
        assert_eq!(is_punct, token.is_punct());
    }

    #[rstest]
    #[case(Token::ArrBegin, false)]
    #[case(Token::ArrEnd, false)]
    #[case(Token::Eof, true)]
    #[case(Token::Err, true)]
    #[case(Token::LitFalse, false)]
    #[case(Token::LitNull, false)]
    #[case(Token::LitTrue, false)]
    #[case(Token::NameSep, false)]
    #[case(Token::Num, false)]
    #[case(Token::ObjBegin, false)]
    #[case(Token::ObjEnd, false)]
    #[case(Token::Str, false)]
    #[case(Token::ValueSep, false)]
    #[case(Token::White, false)]
    fn test_token_is_terminal(#[case] token: Token, #[case] is_terminal: bool) {
        assert_eq!(is_terminal, token.is_terminal());
    }

    #[rstest]
    #[case(Token::ArrBegin, Some("["))]
    #[case(Token::ArrEnd, Some("]"))]
    #[case(Token::Eof, None)]
    #[case(Token::Err, None)]
    #[case(Token::LitFalse, Some("false"))]
    #[case(Token::LitNull, Some("null"))]
    #[case(Token::LitTrue, Some("true"))]
    #[case(Token::NameSep, Some(":"))]
    #[case(Token::Num, None)]
    #[case(Token::ObjBegin, Some("{"))]
    #[case(Token::ObjEnd, Some("}"))]
    #[case(Token::Str, None)]
    #[case(Token::ValueSep, Some(","))]
    #[case(Token::White, None)]
    fn test_token_static_content(#[case] token: Token, #[case] static_content: Option<&str>) {
        assert_eq!(static_content, token.static_content());
    }

    #[rstest]
    #[case(Token::ArrBegin, "[")]
    #[case(Token::ArrEnd, "]")]
    #[case(Token::Eof, "EOF")]
    #[case(Token::Err, "error")]
    #[case(Token::LitFalse, "false")]
    #[case(Token::LitNull, "null")]
    #[case(Token::LitTrue, "true")]
    #[case(Token::NameSep, ":")]
    #[case(Token::Num, "number")]
    #[case(Token::ObjBegin, "{")]
    #[case(Token::ObjEnd, "}")]
    #[case(Token::Str, "string")]
    #[case(Token::ValueSep, ",")]
    #[case(Token::White, "whitespace")]
    fn test_token_display(#[case] token: Token, #[case] expect: &str) {
        assert_eq!(expect, format!("{token}"));
    }

    #[rstest]
    #[case(Unescaped::Literal("foo"), "foo")]
    #[case(Unescaped::Expanded("bar".to_string()), "bar")]
    fn test_unescaped_str_into_buf(#[case] u: Unescaped<&str>, #[case] expect: &str) {
        let mut b = u.into_buf();

        assert_eq!(expect.len(), b.remaining());
        assert_eq!(expect, str::from_utf8(b.chunk()).unwrap());

        if b.remaining() > 0 {
            b.advance(1);

            assert_eq!(expect.len() - 1, b.remaining());
            assert_eq!(&expect[1..], str::from_utf8(b.chunk()).unwrap());
        }

        let mut v = vec![0; expect.len() - 1];
        b.copy_to_slice(&mut v);

        assert_eq!(0, b.remaining());
        assert_eq!(b"", b.chunk())
    }

    // Exercises the full `Unescaped` API surface: conversions, accessors, `Display`,
    // `AsRef`, cross-variant equality/ordering, and use as a map key.
    #[test]
    fn test_unescaped_str() {
        let a1 = Unescaped::Literal("a");
        let b1 = Unescaped::Expanded("bb".to_string());
        let a2 = Unescaped::Expanded("a".to_string());
        let b2 = Unescaped::Literal("bb");

        assert_eq!("a", Into::<String>::into(a1.clone()));
        assert_eq!("bb", Into::<String>::into(b1.clone()));
        assert_eq!("a", Into::<String>::into(a2.clone()));
        assert_eq!("bb", Into::<String>::into(b2.clone()));

        assert!(matches!(a1.literal(), Some(&"a")));
        assert!(b1.literal().is_none());
        assert!(a2.literal().is_none());
        assert!(matches!(b2.literal(), Some(&"bb")));

        assert!(a1.expanded().is_none());
        assert!(matches!(b1.expanded(), Some("bb")));
        assert!(matches!(a2.expanded(), Some("a")));
        assert!(b2.expanded().is_none());

        assert!(a1.is_literal());
        assert!(!a1.is_expanded());
        assert!(!b1.is_literal());
        assert!(b1.is_expanded());

        assert_eq!(1, a1.len());
        assert_eq!(2, b1.len());
        assert_eq!(1, a2.len());
        assert_eq!(2, b2.len());

        let a3: &str = a1.as_ref();
        let b3: &str = b1.as_ref();
        let a4: &str = a2.as_ref();
        let b4: &str = b2.as_ref();

        assert_eq!("a", format!("{a1}"));
        assert_eq!("bb", format!("{b1}"));
        assert_eq!("a", format!("{a2}"));
        assert_eq!("bb", format!("{b2}"));

        assert_eq!("a", a3);
        assert_eq!("bb", b3);
        assert_eq!("a", a4);
        assert_eq!("bb", b4);

        let x1: &[u8] = a1.as_ref();
        let y1: &[u8] = b1.as_ref();
        let x2: &[u8] = a2.as_ref();
        let y2: &[u8] = b2.as_ref();

        assert_eq!(b"a", x1);
        assert_eq!(b"bb", y1);
        assert_eq!(b"a", x2);
        assert_eq!(b"bb", y2);

        assert_eq!(a1, a2);
        assert_eq!(a2, a1);
        assert_eq!(b1, b2);
        assert_eq!(b2, b1);

        assert_ne!(a1, b1);
        assert_ne!(b1, a1);
        assert_ne!(a1, b2);
        assert_ne!(b1, a2);

        assert!(a1 < b1);
        assert!(a1 < b2);
        assert!(a2 < b1);
        assert!(a2 < b2);
        assert!(b1 > a1);
        assert!(b1 > a2);
        assert!(b2 > a1);
        assert!(b2 > a2);

        assert_eq!("a", a1);
        assert_eq!(a1, "a");
        assert_eq!("bb", b1);
        assert_eq!(b1, "bb");
        assert_eq!("a", a2);
        assert_eq!(a2, "a");
        assert_eq!("bb", b2);
        assert_eq!(b2, "bb");

        assert_eq!("a".to_string(), a1);
        assert_eq!(a1, "a".to_string());
        assert_eq!("bb".to_string(), b1);
        assert_eq!(b1, "bb".to_string());
        assert_eq!("a".to_string(), a2);
        assert_eq!(a2, "a".to_string());
        assert_eq!("bb".to_string(), b2);
        assert_eq!(b2, "bb".to_string());

        assert!(a1 < "bb");
        assert!("bb" > a1);
        assert!(b1 > "a");
        assert!("a" < b1);

        let mut m1 = HashMap::new();
        m1.insert(a1.clone(), "a1");
        m1.insert(b1.clone(), "b1");

        assert_eq!(Some(&"a1"), m1.get("a"));
        assert_eq!(Some(&"a1"), m1.get(&a2));
        assert_eq!(Some(&"b1"), m1.get("bb"));
        assert_eq!(Some(&"b1"), m1.get(&b2));
        assert!(!m1.contains_key("aa"));

        let mut m2 = BTreeMap::new();
        m2.insert(a1.clone(), "a1");
        m2.insert(b1.clone(), "b1");

        assert_eq!(Some(&"a1"), m2.get("a"));
        assert_eq!(Some(&"a1"), m2.get(&a2));
        assert_eq!(Some(&"b1"), m2.get("bb"));
        assert_eq!(Some(&"b1"), m2.get(&b2));
        assert!(!m2.contains_key("aa"));
        assert_eq!(Some("a1"), m2.remove(&a2));
        assert_eq!(Some("b1"), m2.remove(&b2));

        m2.insert(b2.clone(), "b2");
        m2.insert(a2.clone(), "a2");

        assert_eq!(Some(&"a2"), m2.get("a"));
        assert_eq!(Some(&"a2"), m2.get(&a1));
        assert_eq!(Some(&"b2"), m2.get("bb"));
        assert_eq!(Some(&"b2"), m2.get(&b1));
        assert!(!m2.contains_key("aa"));
        assert_eq!(Some("a2"), m2.remove("a"));
        assert_eq!(Some("b2"), m2.remove("bb"));
    }

    #[rstest]
    #[case(ErrorKind::BadSurrogate {
        first: 0xD800,
        second: None,
        offset: 5,
    }, "bad Unicode escape sequence: high surrogate '\\uD800' not followed by low surrogate")]
    #[case(ErrorKind::BadUtf8ContByte {
        seq_len: 3,
        offset: 2,
        value: 0x20,
    }, "bad UTF-8 continuation byte 0x20 in 3-byte UTF-8 sequence (byte #2)")]
    #[case(ErrorKind::Read, "read error")]
    #[case(ErrorKind::UnexpectedByte {
        token: Some(Token::Num),
        expect: Expect::Digit,
        actual: 0x41,
    }, "expected digit character '0'..'9' but got character 'A' (ASCII 0x41) in number token")]
    #[case(ErrorKind::UnexpectedEof(Token::Str), "unexpected EOF in string token")]
    fn test_error_kind_display(#[case] kind: ErrorKind, #[case] expect: &str) {
        assert_eq!(expect, format!("{kind}"));

        // Verify that `fmt_at` appends the position suffix when a position is given.
        struct Wrapper(ErrorKind);

        impl fmt::Display for Wrapper {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                let pos = Pos::default();

                self.0.fmt_at(f, Some(&pos))
            }
        }

        assert_eq!(
            format!("{expect} at line 1, column 1 (offset: 0)"),
            format!("{}", Wrapper(kind))
        );
    }

    #[rstest]
    #[case(r#""""#, r#""""#)]
    #[case(r#""f""#, r#""f""#)]
    #[case(r#""fo""#, r#""fo""#)]
    #[case(r#""foo""#, r#""foo""#)]
    #[case(r#""\\""#, r#""\""#)]
    #[case(r#""\/""#, r#""/""#)]
    #[case(r#""\"""#, r#"""""#)]
    #[case(r#""\b""#, "\"\x08\"")]
    #[case(r#""\t""#, "\"\t\"")]
    #[case(r#""\f""#, "\"\x0c\"")]
    #[case(r#""\n""#, "\"\n\"")]
    #[case(r#""\r""#, "\"\r\"")]
    #[case(r#""\u0000""#, "\"\0\"")]
    #[case(r#""\u0008""#, "\"\x08\"")]
    #[case(r#""\u0009""#, "\"\t\"")]
    #[case(r#""\u000c""#, "\"\x0c\"")]
    #[case(r#""\u000C""#, "\"\x0C\"")]
    #[case(r#""\u000a""#, "\"\n\"")]
    #[case(r#""\u000A""#, "\"\n\"")]
    #[case(r#""\u000d""#, "\"\r\"")]
    #[case(r#""\u000D""#, "\"\r\"")]
    #[case(r#""\u0021""#, r#""!""#)]
    #[case(r#""\u0030""#, r#""0""#)]
    #[case(r#""\u0041""#, r#""A""#)]
    #[case(r#""\u0062""#, r#""b""#)]
    #[case(r#""\u007F""#, "\"\x7f\"")] // DEL (U+007F, highest 1-byte UTF-8)
    #[case(r#""\u00A9""#, r#""©""#)] // Copyright sign (U+00A9, 2-byte UTF-8)
    #[case(r#""\u03A9""#, r#""Ω""#)] // Greek capital Omega (U+03A9, 2-byte UTF-8)
    #[case(r#""\u0080""#, "\"\u{80}\"")] // First 2-byte UTF-8 code point
    #[case(r#""\u07FF""#, "\"\u{7ff}\"")] // Last 2-byte UTF-8 code point
    #[case(r#""\u20AC""#, r#""€""#)] // Euro sign (U+20AC, 3-byte UTF-8)
    #[case(r#""\u2603""#, r#""☃""#)] // Snowman (U+2603, 3-byte UTF-8)
    #[case(r#""\u0800""#, "\"\u{800}\"")] // First 3-byte UTF-8 code point
    #[case(r#""\uFFFF""#, "\"\u{ffff}\"")] // Last valid BMP code point (3-byte UTF-8)
    #[case(r#""\ud83D\uDe00""#, r#""😀""#)] // Grinning face emoji (U+1F600, 4-byte UTF-8)
    #[case(r#""\ud800\uDC00""#, "\"\u{10000}\"")] // First 4-byte UTF-8 code point
    #[case(r#""\uDBFF\udfff""#, "\"\u{10FFFF}\"")] // Highest valid Unicode scalar value
    fn test_unescape_ok(#[case] input: &str, #[case] expect: &str) {
        // Test with an empty buffer.
        {
            let mut buf = Vec::new();

            unescape(input, &mut buf);
            let actual = String::from_utf8(buf).unwrap();

            assert_eq!(actual, expect);
        }

        // Test with a non-empty buffer (unescape must append, not overwrite).
        {
            let mut buf = Vec::new();

            buf.extend_from_slice(b"foo");
            unescape(input, &mut buf);
            let actual = String::from_utf8(buf).unwrap();

            assert_eq!(actual, format!("foo{expect}"));
        }
    }

    #[rstest]
    #[case(r#""\a""#)]
    #[case(r#""\U""#)]
    #[case(r#""\:""#)]
    #[should_panic(expected = "invalid escape sequence byte after '\\'")]
    fn test_unescape_panic_invalid_esc_seq_byte(#[case] literal: &str) {
        let mut buf = Vec::new();

        unescape(literal, &mut buf);
    }

    // A lone low surrogate escape with no preceding high surrogate.
    #[rstest]
    #[case(r#"\udc00"#)]
    #[case(r#"\udfff"#)]
    #[should_panic(expected = "Unicode escape low surrogate without preceding high surrogate")]
    fn test_unescape_panic_low_surrogate_no_high(#[case] literal: &str) {
        let mut buf = Vec::new();

        unescape(literal, &mut buf);
    }

    // A high surrogate escape whose following escape is complete but not a low surrogate.
    // (Renamed from `test_unescape_panic_low_surrogate_no_high`, which did not match the
    // cases being exercised.)
    #[rstest]
    #[case(r#"\ud800\u0000"#)]
    #[case(r#"\ud800\ud7ff"#)]
    #[case(r#"\ud800\ud800"#)]
    #[case(r#"\ud800\ue000"#)]
    #[case(r#"\ud800\uffff"#)]
    #[case(r#"\udbff\u0000"#)]
    #[case(r#"\udbff\ud7ff"#)]
    #[case(r#"\udbff\ud800"#)]
    #[case(r#"\udbff\ue000"#)]
    #[case(r#"\udbff\uffff"#)]
    #[should_panic(expected = "Unicode escape high surrogate not followed by low surrogate")]
    fn test_unescape_panic_high_surrogate_bad_low(#[case] literal: &str) {
        let mut buf = Vec::new();

        unescape(literal, &mut buf);
    }

    #[rstest]
    #[case(r#""\ud800\u0000""#)]
    #[case(r#""\uDBFF\ud800""#)]
    #[should_panic(expected = "Unicode escape high surrogate not followed by low surrogate")]
    fn test_unescape_panic_high_surrogate_no_low(#[case] literal: &str) {
        let mut buf = Vec::new();

        unescape(literal, &mut buf);
    }

    #[rstest]
    #[case(r#"\ud800 "#)]
    #[case(r#"\udbff "#)]
    #[should_panic(
        expected = r#"expected '\' to start low surrogate Unicode escape after high surrogate"#
    )]
    fn test_unescape_panic_high_surrogate_no_backslash(#[case] literal: &str) {
        let mut buf = Vec::new();

        unescape(literal, &mut buf);
    }

    #[rstest]
    #[case(r#"\ud800\n"#)]
    #[case(r#"\udbff\a"#)]
    #[should_panic(
        expected = r#"expected '\u' to start low surrogate Unicode escape after high surrogate"#
    )]
    fn test_unescape_panic_high_surrogate_no_backslash_u(#[case] literal: &str) {
        let mut buf = Vec::new();

        unescape(literal, &mut buf);
    }

    #[rstest]
    #[case(r#"\"#)]
    #[case(r#"\u"#)]
    #[case(r#"\u0"#)]
    #[case(r#"\u00"#)]
    #[case(r#"\u000"#)]
    #[case(r#"\u0000\"#)]
    #[case(r#"\u0000\u"#)]
    #[case(r#"\u0000\u1"#)]
    #[case(r#"\u0000\u11"#)]
    #[case(r#"\u0000\u111"#)]
    #[case(r#"\ud800\u111"#)]
    #[case(r#"\udbff\u111"#)]
    #[should_panic(expected = "unexpected end of input within Unicode escape sequence")]
    fn test_unescape_panic_unexpected_eof(#[case] literal: &str) {
        let mut buf = Vec::new();

        unescape(literal, &mut buf);
    }
}
2009}