// bufjson/lexical/read.rs

1//! Convert a [`std::io::Read`] into a stream of JSON lexical tokens.
2
3use crate::{
4    Buf, BufUnderflow, EqStr, IntoBuf, OrdStr, Pos,
5    lexical::{
6        self, state, {Analyzer, ErrorKind, Token, Unescaped},
7    },
8    syntax,
9};
10use std::{
11    borrow::Cow,
12    cmp::{Ordering, min},
13    collections::VecDeque,
14    convert::Infallible,
15    fmt,
16    hash::{Hash, Hasher},
17    io::{self, Read},
18    ops::Range,
19    str::FromStr,
20    sync::Arc,
21};
22
// Use a smaller inline buffer size in tests to push more test cases out of the inline
// representation and into the more complex representations that contain references into the actual
// read buffers.
#[cfg(test)]
const INLINE_LEN: usize = 4;
// NOTE(review): 21 presumably keeps the inline variant (tag plus two `u8` indices plus the array)
// compact while still covering most short tokens without heap allocation — confirm sizing intent.
#[cfg(not(test))]
const INLINE_LEN: usize = 21;

// Backing storage for literals short enough to be copied inline rather than referenced.
type InlineBuf = [u8; INLINE_LEN];
32
/// Reference to a byte sub-range of a single shared read buffer.
#[derive(Debug, Clone)]
struct UniRef {
    // Shared ownership keeps the backing buffer alive for as long as this reference exists.
    buf: Arc<Vec<u8>>,
    // Byte range into `buf`; `rng.start` advances as bytes are consumed. Stored as `u32` to keep
    // the struct small (`InnerLiteral::uni` rejects buffers longer than `u32::MAX`).
    rng: Range<u32>,
}
38
impl UniRef {
    /// Wraps the range `rng` of the shared buffer `buf`.
    ///
    /// The range must be well-formed and lie within `buf`; this is checked only in debug builds.
    fn new(buf: Arc<Vec<u8>>, rng: Range<u32>) -> Self {
        debug_assert!(rng.start <= rng.end);
        debug_assert!(rng.end as usize <= buf.len());

        Self { buf, rng }
    }

    /// Test-only convenience constructor that owns an arbitrary byte buffer and wraps `rng` of it.
    #[cfg(test)]
    fn test_new(buf: impl Into<Vec<u8>>, rng: Range<u32>) -> Self {
        Self::new(Arc::new(buf.into()), rng)
    }
}
52
53impl Buf for UniRef {
54    fn advance(&mut self, n: usize) {
55        if self.remaining() < n {
56            panic!(
57                "{}",
58                &BufUnderflow {
59                    requested: n,
60                    remaining: self.remaining(),
61                }
62            );
63        } else {
64            debug_assert!(n <= u32::MAX as usize);
65            self.rng.start += n as u32;
66        }
67    }
68
69    fn chunk(&self) -> &[u8] {
70        &self.buf[self.rng.start as usize..self.rng.end as usize]
71    }
72
73    fn remaining(&self) -> usize {
74        (self.rng.end - self.rng.start) as usize
75    }
76
77    fn try_copy_to_slice(&mut self, dst: &mut [u8]) -> Result<(), crate::BufUnderflow> {
78        if self.remaining() < dst.len() {
79            Err(BufUnderflow {
80                requested: dst.len(),
81                remaining: self.remaining(),
82            })
83        } else {
84            let start = self.rng.start as usize;
85            dst.copy_from_slice(&self.buf[start..start + dst.len()]);
86            self.rng.start += dst.len() as u32;
87
88            Ok(())
89        }
90    }
91}
92
impl IntoBuf for UniRef {
    type Buf = Self;

    /// `UniRef` already implements [`Buf`], so the conversion is the identity.
    fn into_buf(self) -> Self::Buf {
        self
    }
}
100
/// Reference to a byte range that spans two or more shared read buffers.
#[derive(Debug, Clone)]
struct MultiRef {
    // Method to the madness of the very nesty Arc/Vec/Arc/Vec:
    // - Outer Arc allows MultiRef to be cloned without an allocation.
    // - Inner Arc maintains a read-only ownership stake in the buffers, preventing them from dropping.
    bufs: Arc<Vec<Arc<Vec<u8>>>>,
    // Index into `bufs`.
    //   INVARIANT: `buf <= bufs.len()`; so it can be one past the end, hence invalid
    //   INVARIANT: `buf == bufs.len()` <=> rng.start == rng.end
    buf: usize,
    // Byte offset of the cursor within `bufs[buf]`.
    off: usize,
    // Logical byte range over the concatenation of `bufs`; `rng.start` advances as bytes are
    // consumed, so `rng.end - rng.start` is the unread count (see `remaining`).
    rng: Range<usize>,
}
114
impl MultiRef {
    /// Wraps the byte range `rng` of the logical concatenation of `bufs`.
    ///
    /// NOTE(review): the cursor is initialized as `buf: 0, off: rng.start`, which treats
    /// `rng.start` as an offset into the *first* buffer — this only holds if callers guarantee
    /// `rng.start` falls within `bufs[0]`; confirm at the construction sites.
    fn new(bufs: Arc<Vec<Arc<Vec<u8>>>>, rng: Range<usize>) -> Self {
        #[cfg(debug_assertions)]
        {
            debug_assert!(rng.start <= rng.end);
            // The range must fit inside the concatenated buffers.
            let len = bufs.iter().map(|v| v.len()).sum();
            debug_assert!(
                rng.end <= len,
                "rng.end ({}) must not exceed total length of buffers ({})",
                rng.end,
                len
            );
            // Only the final buffer may be empty; interior buffers must contribute bytes.
            bufs.iter()
                .take(bufs.len().saturating_sub(1))
                .enumerate()
                .for_each(|(i, buf)| {
                    debug_assert!(!buf.is_empty(), "empty buffer not allowed at index {i}")
                });
        }

        Self {
            bufs,
            buf: 0,
            off: rng.start,
            rng,
        }
    }

    /// Test-only constructor that builds the nested `Arc` structure from plain byte buffers.
    #[cfg(test)]
    fn test_new<I, T>(bufs: I, rng: Range<usize>) -> Self
    where
        I: IntoIterator<Item = T>,
        T: Into<Vec<u8>>,
    {
        let bufs = Arc::new(
            bufs.into_iter()
                .map(Into::into)
                .map(Arc::new)
                .collect::<Vec<_>>(),
        );

        Self::new(bufs, rng)
    }

    /// Number of unread bytes left in the logical range.
    fn remaining(&self) -> usize {
        self.rng.end - self.rng.start
    }

    /// Exclusive end index of the readable portion of `buf`: the whole buffer, unless the
    /// logical range ends inside it.
    fn usable_len(&self, buf: &[u8]) -> usize {
        let n = min(buf.len(), self.off + self.remaining());

        debug_assert!(
            self.off <= n,
            "self.off ({}) > usable_len {n} for {}-byte buffer {buf:?}",
            self.off,
            buf.len()
        );

        n
    }
}
176
impl Buf for MultiRef {
    fn advance(&mut self, mut n: usize) {
        // Advancing past the end is a contract violation; panic as `Buf` requires.
        if self.remaining() < n {
            panic!(
                "{}",
                &BufUnderflow {
                    requested: n,
                    remaining: self.remaining(),
                }
            );
        }

        // Walk forward buffer by buffer. `step` is the number of bytes left in the current
        // buffer; once the rest of `n` fits inside it, bump the intra-buffer offset and stop.
        while n > 0 {
            let step = self.bufs[self.buf].len() - self.off;
            if n < step {
                self.off += n;
                self.rng.start += n;
                break;
            }
            // Consume the remainder of the current buffer and move the cursor to the next one.
            self.off = 0;
            self.rng.start += step;
            self.buf += 1;
            n -= step;
        }
    }

    fn chunk(&self) -> &[u8] {
        if self.buf < self.bufs.len() {
            let buf = &self.bufs[self.buf];

            // Clamp to the logical end of the range, which may fall inside this buffer.
            &buf[self.off..self.usable_len(buf)]
        } else {
            // Cursor sits one past the final buffer: everything has been consumed.
            &[]
        }
    }

    fn remaining(&self) -> usize {
        // Explicit path disambiguates from this trait method's own name.
        MultiRef::remaining(self)
    }

    fn try_copy_to_slice(&mut self, mut dst: &mut [u8]) -> Result<(), crate::BufUnderflow> {
        let mut n = dst.len();

        // Unlike `advance`, underflow here is reported as an error rather than a panic.
        if self.remaining() < n {
            return Err(BufUnderflow {
                requested: n,
                remaining: self.remaining(),
            });
        }

        // Copy chunk by chunk, clamping each step to the readable portion of the current buffer.
        while n > 0 {
            let buf = &self.bufs[self.buf];
            let step = self.usable_len(buf) - self.off;
            if n < step {
                // Final partial copy: `dst` is filled without exhausting the current buffer.
                dst[..n].copy_from_slice(&buf[self.off..self.off + n]);
                self.off += n;
                self.rng.start += n;
                break;
            }
            // Copy this buffer's remaining readable bytes, then move to the next buffer.
            dst[..step].copy_from_slice(&buf[self.off..self.off + step]);
            dst = &mut dst[step..];
            self.off = 0;
            self.buf += 1;
            self.rng.start += step;
            n -= step;
            debug_assert!(n == dst.len());
        }

        Ok(())
    }
}
248
impl IntoBuf for MultiRef {
    type Buf = Self;

    /// `MultiRef` already implements [`Buf`], so the conversion is the identity.
    fn into_buf(self) -> Self::Buf {
        self
    }
}
256
/// Borrowed view of literal text: either one contiguous `str`, or text split across buffers.
#[derive(Debug)]
enum Repr<'a> {
    // The whole literal is contiguous in memory and known to be valid UTF-8.
    Together(&'a str),
    // The literal spans multiple read buffers and must be consumed chunk-wise.
    Split(&'a MultiRef),
}

/// Internal storage for [`Literal`].
#[derive(Clone, Debug)]
enum InnerLiteral {
    // Borrowed static string: cheapest representation, no ownership involved.
    Static(&'static str),
    // Short literal copied inline: (start, end) byte indices into the fixed-size array.
    Inline(u8, u8, InlineBuf),
    // Reference into a single shared read buffer.
    Uni(UniRef),
    // Reference spanning multiple read buffers; boxed to keep the enum variant small.
    Multi(Box<MultiRef>),
}
270
impl InnerLiteral {
    /// Length of the literal in bytes.
    fn len(&self) -> usize {
        match self {
            Self::Static(s) => s.len(),
            Self::Inline(i, j, _b) => (*j - *i) as usize,
            Self::Uni(r) => r.remaining(),
            Self::Multi(r) => r.remaining(),
        }
    }

    /// Copies `s` (at most `INLINE_LEN` bytes) into a fresh inline representation.
    fn inline(s: &str) -> Self {
        debug_assert!(s.len() <= INLINE_LEN);

        let mut b = [0; INLINE_LEN];
        b[0..s.len()].copy_from_slice(s.as_bytes());

        Self::Inline(0, s.len() as u8, b)
    }

    /// Takes ownership of `b` as a single-buffer reference covering the whole vector.
    fn uni(b: Vec<u8>) -> Self {
        // `UniRef` stores `u32` offsets, so over-long buffers are rejected up front.
        let n: u32 = b
            .len()
            .try_into()
            .expect("buffer length cannot exceed u32::MAX");

        Self::Uni(UniRef::new(Arc::new(b), 0..n))
    }

    /// Borrows the literal as contiguous text where possible, or as a split reference.
    fn repr(&self) -> Repr<'_> {
        match self {
            Self::Static(s) => Repr::Together(s),
            Self::Inline(i, j, b) => {
                // SAFETY: inline bytes are only ever copied from a `&str` (see `inline` above and
                // `Content::from_bufs`), and `Literal` never moves the `i..j` range, so the bytes
                // are valid UTF-8. NOTE(review): confirm no producer stores non-boundary ranges.
                Repr::Together(unsafe { str::from_utf8_unchecked(&b[*i as usize..*j as usize]) })
            }
            // SAFETY: the referenced range covers a whole JSON token, which is valid UTF-8 in the
            // source text. NOTE(review): this invariant is assumed from the visible construction
            // sites — confirm for all producers of `Uni`.
            Self::Uni(r) => Repr::Together(unsafe {
                str::from_utf8_unchecked(&r.buf[r.rng.start as usize..r.rng.end as usize])
            }),
            Self::Multi(r) => Repr::Split(r.as_ref()),
        }
    }
}
312
impl From<InnerContent> for InnerLiteral {
    /// Converts token content into its literal representation, discarding the escaped /
    /// not-escaped distinction (which only matters when unescaping).
    fn from(value: InnerContent) -> Self {
        match value {
            InnerContent::Static(s) => Self::Static(s),
            InnerContent::Inline(len, b) => Self::Inline(0, len, b),
            InnerContent::NotEscapedUni(r) | InnerContent::EscapedUni(r) => Self::Uni(r),
            InnerContent::NotEscapedMulti(r) | InnerContent::EscapedMulti(r) => Self::Multi(r),
        }
    }
}
323
324/// Zero allocation view of the literal text content of a JSON token.
325///
326/// To prevent allocation and minimize copying, a `Literal` may provide a direct view into the
327/// buffers used by the [`ReadAnalyzer`]. Since these buffers have a uniform size, but JSON tokens
328/// can have arbitrary lengths, the text content of a token may be split across two or more buffers.
329/// In other words, the full text of the content may be non-contiguous in memory. To make this data
330/// structure usable in the widest range of use cases, `Literal` implements the [`Buf`] trait, which
331/// provides a uniform interface for reading data from potentially non-contiguous sources.
332///
333/// # Performance considerations
334///
335/// Clones are cheap and do not allocate. However, for the memory considerations described below, it
336/// is preferable to use short-lifetime clones for discrete tasks and not to proliferate long-lived
337/// clones.
338///
339/// # Memory considerations
340///
341/// Because a `Literal` may hold references to the internal buffers of a `ReadAnalyzer`, holding on
342/// to a `Literal` instance may prevent the `ReadAnalyzer` from reusing buffers. This can lead to
343/// increased allocation activity, which will inevitably have a small performance cost. A somewhat
344/// more problematic effect is increased memory usage. If all `Literal` instances produced by a
345/// `ReadAnalyzer` are retained, it will require memory roughly equal to the total length of the
346/// JSON text being analyzed. This undermines a key value proposition of a streaming analyzer and,
347/// for large enough JSON texts, may lead to out-of-memory conditions. Therefore, it is strongly
348/// advised that you retain `Literal` instances only as long as necessary to process them,
349/// extracting owned copies of their data if you need long-lived access to the token text.
#[derive(Clone, Debug)]
// Thin newtype over the shared internal representation; all behavior lives in the impls below.
pub struct Literal(InnerLiteral);
352
353impl Literal {
354    /// Converts a static lifetime string slice to a literal value.
355    ///
356    /// This function is the most efficient way to wrap a static string as a `Literal`. It does not
357    /// allocate and produces the lightest-weight `Literal` value.
358    ///
359    /// If you have a non-static string slice, use [`from_ref`], one of the [`From`] trait
360    /// implementations, or the [`FromStr`] implementation. If creating a literal value from an
361    /// owned `String`, use [`from_string`].
362    ///
363    /// # Examples
364    ///
365    /// Populate and use a hash set of allowed JSON object keys.
366    ///
367    /// ```
368    /// use bufjson::lexical::{Token, read::{Literal, ReadAnalyzer}};
369    /// use std::collections::HashSet;
370    ///
371    /// // Populate the set of allowed JSON object keys.
372    /// let mut allowed = HashSet::with_capacity(3);
373    /// allowed.insert(Literal::from_static(r#""foo""#)); // Note: store `"foo"`, not `foo`
374    /// allowed.insert(Literal::from_static(r#""baz""#)); // Note: store `"baz"`, not `baz`
375    ///
376    /// // Parse some JSON.
377    /// let mut parser = ReadAnalyzer::new(&br#"{"foo":"bar","baz":"qux"}"#[..]).into_parser();
378    ///
379    /// // Verify that the literal value of every object key is allowed.
380    /// assert_eq!(Token::ObjBegin, parser.next());
381    /// loop {
382    ///     match parser.next_meaningful() {
383    ///         Token::Str => {
384    ///             let key = parser.content().literal();
385    ///             assert!(allowed.contains(&key));
386    ///             assert_eq!(Token::Str, parser.next_meaningful()); // Skip corresponding value.
387    ///         },
388    ///         Token::ObjEnd => (),
389    ///         Token::Eof => break,
390    ///         _ => unreachable!(),
391    ///     }
392    /// }
393    /// ```
394    ///
395    /// [`from_ref`]: method@Self::from_ref
396    /// [`from_string`]: method@Self::from_str
397    pub const fn from_static(s: &'static str) -> Self {
398        Self(InnerLiteral::Static(s))
399    }
400
401    /// Creates a literal value from anything that cheaply converts to a string slice reference.
402    ///
403    /// If you have a static string slice, prefer [`from_static`], which has a lower construction
404    /// cost and a more efficient implementation. If you have an owned `String` you can consume,
405    /// prefer [`from_string`], which will avoid allocation. If you have a `Cow` you can consume,
406    /// prefer `From<Cow<'_, str>>`, which will avoid allocation if the `Cow` contains an owned
407    /// value.
408    ///
409    /// [`from_static`]: method@Self::from_static
410    /// [`from_string`]: method@Self::from_string
411    pub fn from_ref<T: AsRef<str> + ?Sized>(s: &T) -> Self {
412        let t = s.as_ref();
413
414        if t.len() <= INLINE_LEN {
415            Self(InnerLiteral::inline(t))
416        } else {
417            Self(InnerLiteral::uni(t.as_bytes().to_vec()))
418        }
419    }
420
421    /// Creates a literal value by consuming an owned string value.
422    ///
423    /// # Examples
424    ///
425    /// Create a literal from an owned string.
426    ///
427    /// ```
428    /// # use bufjson::lexical::read::Literal;
429    /// let s = "foo".to_string();
430    /// let lit = Literal::from_string(s);
431    /// assert_eq!("foo", lit);
432    /// ```
433    ///
434    /// There is a `From<String>` implementation that is functionally equivalent.
435    ///
436    /// ```
437    /// # use bufjson::lexical::read::Literal;
438    /// let s = "bar".to_string();
439    /// let lit: Literal = s.into();
440    /// assert_eq!("bar", lit);
441    /// ```
442    pub fn from_string(s: String) -> Self {
443        if s.len() <= INLINE_LEN {
444            Self(InnerLiteral::inline(&s))
445        } else {
446            Self(InnerLiteral::uni(s.into_bytes()))
447        }
448    }
449
450    /// Returns the length of `self`.
451    ///
452    /// This length is in bytes, not `char` values or graphemes. In other words, it might not be
453    /// what a human considers the length of the string.
454    ///
455    /// # Examples
456    ///
457    /// Get the length of a literal.
458    ///
459    /// ```
460    /// # use bufjson::lexical::read::Literal;
461    /// let boring = Literal::from_static("foo");
462    /// assert_eq!(3, boring.len());
463    ///
464    /// let fancy = Literal::from_static("ƒoo"); // fancy f!
465    /// assert_eq!(fancy.len(), 4);
466    /// ```
467    pub fn len(&self) -> usize {
468        self.0.len()
469    }
470
471    /// Returns `true` if `self` has a length of zero bytes.
472    ///
473    /// # Examples
474    ///
475    /// ```
476    /// # use bufjson::lexical::read::Literal;
477    /// assert_eq!(true, Literal::from_static("").is_empty());
478    /// ```
479    pub fn is_empty(&self) -> bool {
480        self.len() == 0
481    }
482
483    fn repr(&self) -> Repr<'_> {
484        self.0.repr()
485    }
486}
487
impl IntoBuf for Literal {
    type Buf = LiteralBuf;

    /// Converts the literal into its cursor-bearing [`Buf`] form without copying.
    fn into_buf(self) -> Self::Buf {
        LiteralBuf(self.0)
    }
}

impl fmt::Display for Literal {
    /// Writes the literal text, streaming chunk-wise when the text is split across buffers.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.repr() {
            Repr::Together(s) => f.write_str(s),
            Repr::Split(r) => crate::buf_display(r.clone(), f),
        }
    }
}

impl EqStr for Literal {}

impl Eq for Literal {}

impl From<Literal> for String {
    /// Produces an owned copy of the literal text.
    fn from(value: Literal) -> Self {
        match value.repr() {
            Repr::Together(s) => s.to_string(),
            Repr::Split(r) => crate::buf_to_string(r.clone()),
        }
    }
}

impl<T: ?Sized + AsRef<str>> From<&T> for Literal {
    fn from(value: &T) -> Self {
        Literal::from_ref(&value)
    }
}

impl<'a> From<Cow<'a, str>> for Literal {
    /// Consumes the `Cow`, avoiding a copy when it holds an owned string longer than the inline
    /// threshold (see `Literal::from_string`).
    fn from(value: Cow<'a, str>) -> Self {
        match value {
            Cow::Borrowed(s) => Literal::from_ref(&s),
            Cow::Owned(s) => Literal::from_string(s),
        }
    }
}

impl From<String> for Literal {
    fn from(value: String) -> Self {
        Literal::from_string(value)
    }
}

impl FromStr for Literal {
    type Err = Infallible;

    /// Never fails; provided so `str::parse::<Literal>()` works in generic contexts.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(Literal::from_ref(&s))
    }
}
546
547impl Hash for Literal {
548    fn hash<H: Hasher>(&self, state: &mut H) {
549        match self.repr() {
550            Repr::Together(s) => state.write(s.as_bytes()),
551            Repr::Split(r) => {
552                let mut x = r.clone();
553                while x.remaining() > 0 {
554                    let b = x.chunk();
555                    state.write(b);
556                    x.advance(b.len());
557                }
558            }
559        }
560    }
561}
562
impl Ord for Literal {
    /// Total order consistent with byte-wise comparison of the literal text.
    fn cmp(&self, other: &Self) -> Ordering {
        match (self.repr(), other.repr()) {
            (Repr::Together(a), Repr::Together(b)) => Ord::cmp(a, b),
            // Mixed and split cases compare chunk-wise via `buf_cmp`; clones are cheap.
            (Repr::Together(a), Repr::Split(b)) => crate::buf_cmp(a, b.clone()),
            (Repr::Split(a), Repr::Together(b)) => crate::buf_cmp(a.clone(), b),
            (Repr::Split(a), Repr::Split(b)) => crate::buf_cmp(a.clone(), b.clone()),
        }
    }
}

impl OrdStr for Literal {
    /// Compares the literal's text against a string slice without allocating.
    fn cmp(&self, other: &str) -> Ordering {
        match self.repr() {
            Repr::Together(s) => Ord::cmp(s, other),
            Repr::Split(r) => crate::buf_cmp(r.clone(), other),
        }
    }
}
582
583impl PartialEq for Literal {
584    fn eq(&self, other: &Self) -> bool {
585        if self.len() != other.len() {
586            false
587        } else {
588            match (self.repr(), other.repr()) {
589                (Repr::Together(a), Repr::Together(b)) => a == b,
590                (Repr::Together(a), Repr::Split(b)) => {
591                    crate::buf_cmp(a, b.clone()) == Ordering::Equal
592                }
593                (Repr::Split(a), Repr::Together(b)) => {
594                    crate::buf_cmp(a.clone(), b) == Ordering::Equal
595                }
596                (Repr::Split(a), Repr::Split(b)) => {
597                    crate::buf_cmp(a.clone(), b.clone()) == Ordering::Equal
598                }
599            }
600        }
601    }
602}
603
604impl PartialEq<str> for Literal {
605    fn eq(&self, other: &str) -> bool {
606        if self.len() != other.len() {
607            false
608        } else {
609            match self.repr() {
610                Repr::Together(s) => s == other,
611                Repr::Split(r) => crate::buf_cmp(r.clone(), other) == Ordering::Equal,
612            }
613        }
614    }
615}
616
// Symmetric equality impls so comparisons read naturally in either direction
// (`lit == "x"` and `"x" == lit`); all delegate to `PartialEq<str> for Literal`.

impl PartialEq<&str> for Literal {
    fn eq(&self, other: &&str) -> bool {
        self == *other
    }
}

impl PartialEq<String> for Literal {
    fn eq(&self, other: &String) -> bool {
        self == other.as_str()
    }
}

impl PartialEq<Literal> for str {
    fn eq(&self, other: &Literal) -> bool {
        other == self
    }
}

impl PartialEq<Literal> for &str {
    fn eq(&self, other: &Literal) -> bool {
        other == self
    }
}

impl PartialEq<Literal> for String {
    fn eq(&self, other: &Literal) -> bool {
        other == self
    }
}
646
// Ordering against `str`, `&str`, and `String` in both directions; all delegate to the `Ord` /
// `OrdStr` implementations. Reversed-direction impls flip the result with `Ordering::reverse`
// because `OrdStr::cmp` always compares `Literal`-to-`str`.

impl PartialOrd for Literal {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl PartialOrd<str> for Literal {
    fn partial_cmp(&self, other: &str) -> Option<Ordering> {
        Some(OrdStr::cmp(self, other))
    }
}

impl PartialOrd<Literal> for str {
    fn partial_cmp(&self, other: &Literal) -> Option<Ordering> {
        Some(OrdStr::cmp(other, self).reverse())
    }
}

impl PartialOrd<&str> for Literal {
    fn partial_cmp(&self, other: &&str) -> Option<Ordering> {
        Some(OrdStr::cmp(self, other))
    }
}

impl PartialOrd<Literal> for &str {
    fn partial_cmp(&self, other: &Literal) -> Option<Ordering> {
        Some(OrdStr::cmp(other, self).reverse())
    }
}

impl PartialOrd<String> for Literal {
    fn partial_cmp(&self, other: &String) -> Option<Ordering> {
        self.partial_cmp(other.as_str())
    }
}

impl PartialOrd<Literal> for String {
    fn partial_cmp(&self, other: &Literal) -> Option<Ordering> {
        self.as_str().partial_cmp(other)
    }
}
688
/// A [`Buf`] implementation for [`Literal`].
///
/// # Example
///
/// ```
/// use bufjson::{Buf, IntoBuf, lexical::read::Literal};
///
/// let lit = Literal::from_static("hello, world!");
/// let mut buf = lit.into_buf();
///
/// assert_eq!(13, buf.remaining());
///
/// let mut dst = [0; 5];
/// buf.copy_to_slice(&mut dst);
///
/// assert_eq!(b"hello", &dst);
/// assert_eq!(8, buf.remaining());
/// ```
// Same storage as `Literal`, plus cursor state mutated by `advance`/`try_copy_to_slice`.
pub struct LiteralBuf(InnerLiteral);
708
709impl LiteralBuf {
710    /// Advances the internal cursor.
711    ///
712    /// The next call to [`chunk`] will return a slice starting `n` bytes further into the literal.
713    ///
714    /// This is an inherent implementation of [`Buf::advance`] for convenience, so it is available
715    /// even when you don't have the trait imported.
716    ///
717    /// # Panics
718    ///
719    /// Panics if `n > self.remaining()`.
720    ///
721    /// [`chunk`]: method@Self::chunk
722    pub fn advance(&mut self, n: usize) {
723        match &mut self.0 {
724            InnerLiteral::Static(s) => {
725                if s.len() < n {
726                    panic!(
727                        "{}",
728                        &BufUnderflow {
729                            requested: n,
730                            remaining: s.len(),
731                        }
732                    );
733                } else {
734                    self.0 = InnerLiteral::Static(&s[n..]);
735                }
736            }
737
738            InnerLiteral::Inline(i, j, b) => {
739                let len = (*j - *i) as usize;
740                if len < n {
741                    panic!(
742                        "{}",
743                        &BufUnderflow {
744                            requested: n,
745                            remaining: len,
746                        }
747                    );
748                } else {
749                    self.0 = InnerLiteral::Inline(*i + n as u8, *j, *b);
750                }
751            }
752
753            InnerLiteral::Uni(r) => r.advance(n),
754            InnerLiteral::Multi(r) => r.advance(n),
755        }
756    }
757
758    /// Returns a slice of bytes starting at the current position, with length between 0 and
759    /// [`remaining`].
760    ///
761    /// The returned slice may be shorter than [`remaining`] to if the internal representation is
762    /// not contiguous. An empty slice is returned only when [`remaining`] returns 0, and is always
763    /// returned in this case since this method never panics.
764    ///
765    /// Calling `chunk` does not advance the internal cursor.
766    ///
767    /// This is an inherent implementation of [`Buf::chunk`] for convenience, so it is available
768    /// even when you don't have the trait imported.
769    ///
770    /// [`remaining`]: method@Self::remaining
771    pub fn chunk(&self) -> &[u8] {
772        match &self.0 {
773            InnerLiteral::Static(s) => s.as_bytes(),
774            InnerLiteral::Inline(i, j, b) => &b[*i as usize..*j as usize],
775            InnerLiteral::Uni(r) => r.chunk(),
776            InnerLiteral::Multi(r) => r.chunk(),
777        }
778    }
779
780    /// Returns the number of bytes between the current position and the end of the `Literal`.
781    ///
782    /// This value is always greater than or equal to the length of the slice returned by [`chunk`].
783    ///
784    /// This is an inherent implementation of [`Buf::remaining`] for convenience, so it is available
785    /// even when you don't have the trait imported.
786    ///
787    /// [`chunk`]: method@Self::chunk
788    pub fn remaining(&self) -> usize {
789        self.0.len()
790    }
791
792    /// Copies bytes from `self` into `dst`.
793    ///
794    /// Advances the internal cursor by the number of bytes copied.
795    ///
796    /// Returns a buffer underflow error without advancing the cursor if `self` does not have enough
797    /// bytes [`remaining`] to fill `dst`.
798    ///
799    /// This is an inherent implementation of [`Buf::try_copy_to_slice`] for convenience, so it is
800    /// available even when you don't have the trait imported.
801    ///
802    /// [`remaining`]: method@Self::remaining
803    pub fn try_copy_to_slice(&mut self, dst: &mut [u8]) -> Result<(), crate::BufUnderflow> {
804        match &mut self.0 {
805            InnerLiteral::Static(s) => {
806                if s.len() < dst.len() {
807                    Err(BufUnderflow {
808                        requested: dst.len(),
809                        remaining: s.len(),
810                    })
811                } else {
812                    dst.copy_from_slice(&s.as_bytes()[..dst.len()]);
813                    *self = Self(InnerLiteral::Static(&s[dst.len()..]));
814
815                    Ok(())
816                }
817            }
818
819            InnerLiteral::Inline(i, j, b) => {
820                let len = (*j - *i) as usize;
821                if len < dst.len() {
822                    Err(BufUnderflow {
823                        requested: dst.len(),
824                        remaining: len,
825                    })
826                } else {
827                    dst.copy_from_slice(&b[*i as usize..*i as usize + dst.len()]);
828                    *i += dst.len() as u8;
829
830                    Ok(())
831                }
832            }
833
834            InnerLiteral::Uni(r) => r.try_copy_to_slice(dst),
835            InnerLiteral::Multi(r) => r.try_copy_to_slice(dst),
836        }
837    }
838}
839
impl Buf for LiteralBuf {
    // Each method forwards to the identically-named inherent method above, which carries the full
    // documentation (including panic and error conditions).
    fn advance(&mut self, n: usize) {
        LiteralBuf::advance(self, n);
    }

    fn chunk(&self) -> &[u8] {
        LiteralBuf::chunk(self)
    }

    fn remaining(&self) -> usize {
        LiteralBuf::remaining(self)
    }

    fn try_copy_to_slice(&mut self, dst: &mut [u8]) -> Result<(), crate::BufUnderflow> {
        LiteralBuf::try_copy_to_slice(self, dst)
    }
}
857
/// Internal storage for [`Content`]: the same shapes as [`InnerLiteral`], but buffer-referencing
/// variants additionally record whether the text contains escape sequences (which determines
/// whether `unescaped` must allocate).
#[derive(Debug, Clone)]
enum InnerContent {
    Static(&'static str),
    // Short content copied inline: (length, bytes). Only used for unescaped content — see
    // `Content::from_bufs`, which takes this path only when `!escaped`.
    Inline(u8, InlineBuf),
    NotEscapedUni(UniRef),
    NotEscapedMulti(Box<MultiRef>),
    EscapedUni(UniRef),
    EscapedMulti(Box<MultiRef>),
}
867
868/// Text content of a JSON token identified by a [`ReadAnalyzer`].
869///
870/// See the [`lexical::Content`] trait, implemented by this struct, for detailed conceptual
871/// documentation.
872///
873/// # Memory considerations
874///
875/// A `Content` value may hold references to the internal buffers of a `ReadAnalyzer`. Consequently,
876/// holding on to a `Content` value may prevent the `ReadAnalyzer` from reusing buffers. This can
877/// lead to increased allocation activity, which will inevitably have a small performance cost, but
878/// the bigger and more nefarious effect is increased memory usage. If all `Content` values produced
879/// by a `ReadAnalyzer` are retained, it will require memory roughly equal to the total length of
880/// the JSON text being analyzed. This undermines a key value proposition of a streaming analyzer
881/// and, for large enough JSON texts, may lead to out-of-memory conditions. Therefore, it is advised
882/// that you retain `Content` values only as long as necessary to examine them.
#[derive(Debug)]
// Newtype over the internal representation; behavior is provided by the inherent methods below.
pub struct Content(InnerContent);
885
886impl Content {
887    /// Returns the literal content of the token exactly as it appears in the JSON text.
888    ///
889    /// This is an inherent implementation of [`lexical::Content::literal`] for convenience, so it
890    /// is available even when you don't have the trait imported. Refer to the trait documentation
891    /// for conceptual details.
892    pub fn literal(&self) -> Literal {
893        Literal(self.0.clone().into())
894    }
895
896    /// Indicates whether the token content contains escape sequences.
897    ///
898    /// This is an inherent implementation of [`lexical::Content::is_escaped`] for convenience, so
899    /// it is available even when you don't have the trait imported. Refer to the trait
900    /// documentation for conceptual details.
901    pub fn is_escaped(&self) -> bool {
902        matches!(
903            self.0,
904            InnerContent::EscapedUni(_) | InnerContent::EscapedMulti(_)
905        )
906    }
907
908    /// Returns a normalized version of literal with all escape sequences in the JSON text fully
909    /// expanded.
910    ///
911    /// This is an inherent implementation of [`lexical::Content::unescaped`] for convenience, so
912    /// it is available even when you don't have the trait imported. Refer to the trait
913    /// documentation for conceptual details.
914    ///
915    /// # Performance considerations
916    ///
917    /// - If this content belongs to a non-string token, or a string token that contains no escape
918    ///   sequences, does not allocate, and simply returns an [`Unescaped::Literal`] wrapping the
919    ///   `Literal` returned by [`literal`], which is a reference to the internals of this content.
920    /// - If this content belongs to a string token containing at least one escape sequence,
921    ///   allocates a new owned string value containing the unescaped string content and returns it
922    ///   wrapped in [`Unescaped::Expanded`].
923    ///
924    /// [`literal`]: method@Self::literal
925    pub fn unescaped(&self) -> Unescaped<Literal> {
926        match &self.0 {
927            InnerContent::EscapedUni(r) => {
928                let mut buf = Vec::new();
929                lexical::unescape(r.clone(), &mut buf);
930
931                // SAFETY: `r` was valid UTF-8 before it was de-escaped, and the de-escaping process
932                //         maintains UTF-8 safety.
933                let s = unsafe { String::from_utf8_unchecked(buf) };
934
935                Unescaped::Expanded(s)
936            }
937
938            InnerContent::EscapedMulti(r) => {
939                let mut buf = Vec::new();
940                lexical::unescape(r.as_ref().clone(), &mut buf);
941
942                // SAFETY: `r` was valid UTF-8 before it was de-escaped, and the de-escaping process
943                //         maintains UTF-8 safety.
944                let s = unsafe { String::from_utf8_unchecked(buf) };
945
946                Unescaped::Expanded(s)
947            }
948
949            _ => Unescaped::Literal(self.literal()),
950        }
951    }
952
953    fn from_static(s: &'static str) -> Self {
954        Self(InnerContent::Static(s))
955    }
956
957    fn from_bufs(bufs: &Bufs, rng: Range<usize>, escaped: bool) -> Self {
958        let len = rng.end - rng.start;
959
960        if len <= INLINE_LEN && !escaped {
961            let mut buf = [0u8; INLINE_LEN];
962            let mut off = 0;
963            let mut rem = len;
964
965            let mut used_iter = bufs.used.iter();
966            let cur_off = if let Some(used_0) = used_iter.next() {
967                let n = used_0.len() - rng.start;
968                buf[0..n].copy_from_slice(&used_0[rng.start..]);
969                off = n;
970                rem = len - n;
971                debug_assert!(off <= len && rem <= len && off + rem == len);
972
973                for used_i in used_iter {
974                    let n = used_i.len();
975                    buf[off..off + n].copy_from_slice(&used_i[..n]);
976                    off += n;
977                    rem -= n;
978                    debug_assert!(off <= len && rem <= len && off + rem == len);
979                }
980
981                0
982            } else {
983                rng.start
984            };
985
986            buf[off..off + rem].copy_from_slice(&bufs.current[cur_off..cur_off + rem]);
987
988            Self(InnerContent::Inline(len as u8, buf))
989        } else if rng.end <= bufs.current.len() && rng.end < u32::MAX as usize {
990            let r = UniRef::new(Arc::clone(&bufs.current), rng.start as u32..rng.end as u32);
991
992            if escaped {
993                Self(InnerContent::EscapedUni(r))
994            } else {
995                Self(InnerContent::NotEscapedUni(r))
996            }
997        } else {
998            let mut all = Vec::with_capacity(bufs.used.len() + 1);
999            all.extend(bufs.used.iter().cloned());
1000            all.push(Arc::clone(&bufs.current));
1001
1002            let r = MultiRef::new(Arc::new(all), rng);
1003
1004            if escaped {
1005                Self(InnerContent::EscapedMulti(Box::new(r)))
1006            } else {
1007                Self(InnerContent::NotEscapedMulti(Box::new(r)))
1008            }
1009        }
1010    }
1011}
1012
/// Displays the token's literal text, delegating to the literal form returned by
/// [`Content::literal`].
impl fmt::Display for Content {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Forward to the literal's formatter so `Display` output always matches `literal()`.
        self.literal().fmt(f)
    }
}
1018
/// Implements the [`lexical::Content`] trait by forwarding to the inherent methods of the same
/// names, keeping a single implementation for each operation.
impl super::Content for Content {
    type Literal<'a> = Literal;

    #[inline(always)]
    fn literal<'a>(&'a self) -> Self::Literal<'a> {
        // Fully qualified call selects the inherent method, not this trait method.
        Content::literal(self)
    }

    #[inline(always)]
    fn is_escaped(&self) -> bool {
        Content::is_escaped(self)
    }

    #[inline(always)]
    fn unescaped<'a>(&'a self) -> Unescaped<Self::Literal<'a>> {
        Content::unescaped(self)
    }
}
1037
// Assert at compile time that `Literal` is exactly 24 bytes (three 64-bit words): the array
// length only matches `size_of` when the two are equal, so any size change fails the build.
const _: [(); 24] = [(); std::mem::size_of::<Literal>()];

// Assert at compile time that `Content` is exactly 24 bytes (three 64-bit words).
const _: [(); 24] = [(); std::mem::size_of::<Content>()];
1043
/// Lexical analysis error detected by a [`ReadAnalyzer`].
///
/// See the [`lexical::Error`] trait, implemented by this struct, for further documentation.
#[derive(Clone, Debug)]
pub struct Error {
    // Category of error (a lexical error classification, or `ErrorKind::Read` for I/O failures).
    kind: ErrorKind,
    // Exact position in the JSON text at which the error was detected.
    pos: Pos,
    // Underlying I/O error, set only for read failures; shared via `Arc` so this struct stays
    // cheaply cloneable.
    source: Option<Arc<io::Error>>,
}
1053
1054impl Error {
1055    /// Returns the category of error.
1056    ///
1057    /// This is an inherent implementation of [`lexical::Error::kind`] for convenience, so it is
1058    /// available even when you don't have the trait imported.
1059    pub fn kind(&self) -> ErrorKind {
1060        self.kind
1061    }
1062
1063    /// Returns the position in the JSON text where the error was encountered.
1064    ///
1065    /// This is an inherent implementation of [`lexical::Error::pos`] for convenience, so it is
1066    /// available even when you don't have the trait imported.
1067    pub fn pos(&self) -> &Pos {
1068        &self.pos
1069    }
1070}
1071
1072impl fmt::Display for Error {
1073    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1074        self.kind.fmt_at(f, Some(&self.pos))
1075    }
1076}
1077
1078impl std::error::Error for Error {
1079    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
1080        self.source.as_ref().map(|e| &**e as &dyn std::error::Error)
1081    }
1082}
1083
/// Implements the [`lexical::Error`] trait by forwarding to the inherent methods of the same
/// names.
impl lexical::Error for Error {
    fn kind(&self) -> ErrorKind {
        // Fully qualified call selects the inherent method, not this trait method.
        Error::kind(self)
    }

    fn pos(&self) -> &Pos {
        Error::pos(self)
    }
}
1093
// Buffer manager for the analyzer: owns the buffer currently being filled from the reader, any
// earlier buffers still holding part of an in-progress token, and a pool of retired buffers that
// may be reusable.
#[derive(Debug)]
struct Bufs {
    // Buffer currently being scanned and filled from the reader.
    current: Arc<Vec<u8>>,
    // Earlier buffers containing the start of the in-progress token; drained on `reset`.
    used: Vec<Arc<Vec<u8>>>,
    i: usize, // Start index into `used[0]` or `current`
    j: usize, // End index into `current`
    k: usize, // Length of token is k - i, but k is not a real index if token spans buffers.
    // Retired buffers, reusable once no live `Content` value references them.
    maybe_free: VecDeque<Arc<Vec<u8>>>,
    // Size, in bytes, of each read buffer.
    buf_size: usize,
    // Set once the reader reports end of input (a zero-byte read).
    eof: bool,
}
1105
1106impl Bufs {
1107    const DEFAULT_BUF_SIZE: usize = 8 * 1024;
1108
1109    // Use miniature minimum buffer size in tests to allow test cases involving tokens that span
1110    // multiple buffers to be trivially set up. In production, set the minimum buffer size to a
1111    // value that should lead to tolerably efficient reads in most cases.
1112    #[cfg(test)]
1113    const MIN_BUF_SIZE: usize = 1;
1114    #[cfg(not(test))]
1115    const MIN_BUF_SIZE: usize = 512;
1116
1117    fn new(buf_size: usize) -> Self {
1118        if buf_size < Self::MIN_BUF_SIZE {
1119            panic!(
1120                "buffer size too low: minimum is {} bytes, but {} was given",
1121                Self::MIN_BUF_SIZE,
1122                buf_size
1123            );
1124        }
1125
1126        Self {
1127            current: Arc::new(Vec::new()), // empty Vec does not allocate
1128            used: Vec::new(),              // empty Vec does not allocate
1129            i: 0,
1130            j: 0,
1131            k: 0,
1132            maybe_free: VecDeque::new(), // empty VecDeque does not allocate
1133            buf_size,
1134            eof: false,
1135        }
1136    }
1137
1138    #[inline]
1139    fn rewind(&mut self) {
1140        self.j -= 1;
1141        self.k -= 1;
1142    }
1143
1144    #[inline]
1145    fn reset(&mut self) {
1146        if !self.used.is_empty() {
1147            self.maybe_free.extend(self.used.drain(..));
1148        }
1149
1150        self.i = self.j;
1151        self.k = self.j;
1152    }
1153
1154    #[inline(always)]
1155    fn byte(&mut self) -> Option<u8> {
1156        if self.j < self.current.len() {
1157            let b = unsafe { self.current.get_unchecked(self.j) };
1158            self.j += 1;
1159            self.k += 1;
1160
1161            Some(*b)
1162        } else {
1163            None
1164        }
1165    }
1166
1167    fn read<R: Read>(&mut self, r: &mut R) -> io::Result<bool> {
1168        debug_assert!(self.j == self.current.len());
1169
1170        if self.eof {
1171            return Ok(true);
1172        }
1173
1174        // Obtain a buffer to read into. This may be a new buffer or a previously allocated buffer
1175        // that is no longer used by any live `Content` values.
1176        let mut buf = Arc::new(self.alloc_or_reuse());
1177        let inner =
1178            Arc::get_mut(&mut buf).expect("buffer must be exclusively owned to use for read");
1179        debug_assert!(
1180            inner.len() == self.buf_size,
1181            "allocated buffer must have len buf_size = {}, but its len is {}",
1182            self.buf_size,
1183            inner.len()
1184        );
1185
1186        match r.read(inner.as_mut_slice()) {
1187            Ok(0) => {
1188                self.eof = true;
1189
1190                Ok(true)
1191            }
1192
1193            Ok(n) if n <= inner.len() => {
1194                // If fewer bytes were read than the buffer since, truncate the buffer to the
1195                // number of bytes actually read. This ensures that `byte()` knows when to stop.
1196                //
1197                // Note one subtle consequence of this behavior: if `r.read` provides substantially
1198                // fewer bytes than the buffer size, the buffer will be truncated to a smaller size,
1199                // much of the buffer will go unused. If this keeps happening, while earlier buffers
1200                // are not getting freed due to long-lived `Content` values, the analyzer may end up
1201                // consuming more buffer memory than the length of the JSON text. Doing it this way
1202                // avoids multi-tenant safety risks. Otherwise, re-lengthening a truncated current
1203                // buffer to get to full capacity utilization requires us to get a mutable reference
1204                // to it while there may be one or more immutable references to it alive in
1205                // `Content` values. This is easy to do safely with `bytes::Bytes` but not with
1206                // `Vec`.
1207                inner.truncate(n);
1208
1209                if self.j != self.i {
1210                    // Incomplete token in progress...
1211                    debug_assert!(!self.current.is_empty());
1212
1213                    self.used.push(Arc::clone(&self.current));
1214                } else if !self.current.is_empty() {
1215                    // Beginning of the new buffer starts a new token.
1216                    debug_assert!(self.k > 0);
1217
1218                    self.i = 0;
1219                    self.k = 0;
1220                    self.maybe_free.push_back(Arc::clone(&self.current));
1221                } else {
1222                    // Initial state only.
1223                    debug_assert!(self.i == 0 && self.j == 0 && self.k == 0);
1224                }
1225
1226                self.current = buf;
1227                self.j = 0;
1228
1229                Ok(false)
1230            }
1231            Ok(n) => panic!("read {n} bytes but buffer size is only {}", inner.len()),
1232            Err(err) => Err(err),
1233        }
1234    }
1235
1236    fn alloc_or_reuse(&mut self) -> Vec<u8> {
1237        if let Some(buf) = self.maybe_free.pop_front() {
1238            let mut replace: Option<Arc<Vec<u8>>> = None;
1239
1240            // Pop the first old buffer from the list. If it is unused, return it. If it is used and
1241            // it was the only old buffer in the list, replace it in case it becomes free soon.
1242            match Arc::try_unwrap(buf) {
1243                Ok(inner) => return inner,
1244                Err(buf) => {
1245                    if self.maybe_free.is_empty() {
1246                        replace = Some(buf);
1247                    }
1248                }
1249            };
1250
1251            // The first buffer was not free. Scan through the remaining buffers to see if a free
1252            // one can be found. Discard non-free buffers as we go, to avoid building up an enormous
1253            // free list that will make buffer allocation slow.
1254            while let Some(buf) = self.maybe_free.pop_front() {
1255                if let Ok(inner) = Arc::try_unwrap(buf) {
1256                    return inner;
1257                }
1258            }
1259
1260            // If the first buffer was the only buffer, replace it onto the list in case it becomes
1261            // free soon.
1262            if let Some(buf) = replace {
1263                self.maybe_free.push_back(buf);
1264            }
1265        }
1266
1267        // There was no free buffer to reuse. Allocate a new one.
1268        let mut v = Vec::with_capacity(self.buf_size);
1269        #[allow(clippy::uninit_vec)]
1270        unsafe {
1271            v.set_len(self.buf_size);
1272        };
1273        v
1274    }
1275}
1276
// Compact record of the most recent token's content, resolved lazily into a `Content` (or an
// `Error`) by `try_content`.
#[derive(Debug)]
enum StoredContent {
    // Statically known token text (punctuation and literal tokens).
    Literal(&'static str),
    // Token text located at this range within `Bufs`; the flag records whether the token
    // contains at least one escape sequence.
    Range(Range<usize>, bool),
    // The most recent `next()` recorded an error instead of a token.
    Err(Error),
}
1283
1284impl Default for StoredContent {
1285    fn default() -> Self {
1286        StoredContent::Literal("")
1287    }
1288}
1289
1290/// A [`lexical::Analyzer`] to tokenize JSON text read from a [`std::io::Read`].
1291///
1292/// Use `ReadAnalyzer` for low allocation, low-copy, stream-oriented lexical analysis of any stream
1293/// of JSON text.
1294///
1295/// As with any [`lexical::Analyzer`] implementation, you can construct a [`syntax::Parser`] from a
1296/// `ReadAnalyzer` to unlock richer stream-oriented syntactic analysis while retaining low overhead
1297/// guarantees of the underlying lexical analyzer.
1298///
1299/// # Performance considerations
1300///
1301/// ## Buffering
1302///
1303/// Since `ReadAnalyzer` already buffers its reads from the input [`Read`] stream, wrapping the
1304/// input stream in an added layer of buffering (for example, a [`std::io::BufReader`]) will only
1305/// result in double copying from one buffer to the next, decreasing efficiency rather than
1306/// increasing it. Avoid adding extra buffering layers.
1307///
1308/// `ReadAnalyzer` is the best choice when your `Read` implementation represents some kind of I/O
1309/// device, such as a file or network stream. If you already have your entire JSON text in memory,
1310/// it is preferable to use [`FixedAnalyzer`], which operates directly on the in-memory JSON text
1311/// without any extra buffering overhead. Even if your in-memory value implements `Read`, as `&[u8]`
1312/// does, for example, [`FixedAnalyzer`] will be a better choice.
1313///
1314/// ## Method performance
1315///
1316/// The [`next`] method may allocate a new buffer if the current buffer is full. Before allocating
1317/// a new buffer, it will make reasonable efforts to reuse one previously allocated. If a new buffer
1318/// is required, `next` will read from the input stream to fill it. For very long tokens, multiple
1319/// buffers may be needed in sequence. Beyond obtaining and reading into buffers, the method does no
1320/// other allocating or copying.
1321///
1322/// The [`content`] method typically does not allocate, although it may do so in edge cases. For
1323/// punctuation and literal tokens, it never copies. For number and string tokens, it may copy if
1324/// the token is very short; otherwise, it just returns a reference-counted view of its internal
1325/// buffers.
1326///
1327/// It should be noted that the `Content` structure returned by [`content`] is somewhat "fat", at 24
1328/// bytes, and that creating the structure, while very cheap, is not entirely free. It is therefore
1329/// preferable not to fetch it for tokens where the content is statically knowable (literals and
1330/// punctuation) or not required (*e.g.*, whitespace in some applications).
1331///
1332/// # Memory considerations
1333///
1334/// Because [`Content`] can refer directly to the internal buffers, keeping `Content` values alive
1335/// for long lifetimes can prevent the internal buffers from being dropped or reused. In the worst
1336/// case, if values referencing into every internal buffer are kept alive, the `ReadAnalyzer` can
1337/// use memory proportionate to the length of the JSON text being analyzed. Since this type of usage
1338/// reduces the value proposition of a truly streaming JSON processor, it is recommended that you
/// drop `Content` values soon after inspecting them; and, when a longer lifetime is required,
1340/// convert them into some other convenient owned value.
1341///
1342/// # Examples
1343///
1344/// Scan the contents of a file into tokens.
1345///
1346/// ```
1347/// use bufjson::lexical::{Token, read::ReadAnalyzer};
1348/// # let example_dir = tempfile::tempdir_in(".").unwrap();
1349/// # let example_file = example_dir.path().join("example.json");
1350/// use std::fs::{self, File};
1351///
1352/// fs::write(&example_file, r#"{"user":"alice","score":95,"tags":["admin"]}"#).unwrap();
1353///
1354/// let mut lexer = ReadAnalyzer::new(File::open(&example_file).unwrap());
1355///
1356/// assert_eq!(Token::ObjBegin, lexer.next());
1357/// assert_eq!(Token::Str, lexer.next());
1358/// assert_eq!(Token::NameSep, lexer.next());
1359/// assert_eq!(Token::Str, lexer.next());
1360/// assert_eq!(Token::ValueSep, lexer.next());
1361/// assert_eq!(Token::Str, lexer.next());
1362/// assert_eq!(Token::NameSep, lexer.next());
1363/// assert_eq!(Token::Num, lexer.next());
1364/// assert_eq!(Token::ValueSep, lexer.next());
1365/// assert_eq!(Token::Str, lexer.next());
1366/// assert_eq!(Token::NameSep, lexer.next());
1367/// assert_eq!(Token::ArrBegin, lexer.next());
1368/// assert_eq!(Token::Str, lexer.next());
1369/// assert_eq!(Token::ArrEnd, lexer.next());
1370/// assert_eq!(Token::ObjEnd, lexer.next());
1371/// assert_eq!(Token::Eof, lexer.next());
1372/// ```
1373///
1374/// [`content`]: method@Self::content
1375/// [`next`]: method@Self::next
1376/// [`FixedAnalyzer`]: crate::lexical::fixed::FixedAnalyzer
#[derive(Debug)]
pub struct ReadAnalyzer<R: Read> {
    // Read-buffer manager: current buffer, spanning-token buffers, and the reuse pool.
    bufs: Bufs,
    // Content (or error) of the most recent token, resolved lazily by `try_content`.
    content: StoredContent,
    // Position of the start of the most recently scanned token.
    content_pos: Pos,
    // Lexical state machine that recognizes tokens and tracks the text position.
    mach: state::Machine,
    // Underlying reader supplying the JSON text.
    read: R,
}
1385
1386impl<R: Read> ReadAnalyzer<R> {
    /// Constructs a new lexer to tokenize JSON text streamed from a reader.
    ///
    /// The reader can be anything that implements [`std::io::Read`], such as a file or network
    /// connection.
    ///
    /// This method creates a `ReadAnalyzer` with a default buffer size of 8 KiB. To control the
    /// buffer size, construct using [`with_buf_size`] instead.
    ///
    /// # Examples
    ///
    /// ```no_run
    /// # use bufjson::lexical::read::ReadAnalyzer;
    /// use std::fs::File;
    ///
    /// let mut lexer = ReadAnalyzer::new(File::open("example.json").unwrap());
    /// ```
    ///
    /// [`with_buf_size`]: method@Self::with_buf_size
    pub fn new(read: R) -> Self {
        // Delegate to `with_buf_size` using the default 8 KiB buffer size.
        Self::with_buf_size(read, Bufs::DEFAULT_BUF_SIZE)
    }
1408
1409    /// Recognizes the next lexical token in the buffer without allocating or copying.
1410    ///
1411    /// This is an inherent implementation of [`lexical::Analyzer::next`] for convenience, so it is
1412    /// available even when you don't have the trait imported.
1413    ///
1414    /// # Example
1415    ///
1416    /// ```
1417    /// # use bufjson::lexical::{Token, read::ReadAnalyzer};
1418    /// # let example_dir = tempfile::tempdir_in(".").unwrap();
1419    /// # let example_file = example_dir.path().join("example.json");
1420    /// use std::fs::{self, File};
1421    ///
1422    /// fs::write(&example_file, "99.9e-1").unwrap();
1423    ///
1424    /// let mut lexer = ReadAnalyzer::new(File::open(&example_file).unwrap());
1425    ///
1426    /// assert_eq!(Token::Num, lexer.next());
1427    /// assert_eq!(Token::Eof, lexer.next());
1428    /// assert_eq!(Token::Eof, lexer.next());
1429    /// ```
1430    #[allow(clippy::should_implement_trait)]
1431    pub fn next(&mut self) -> Token {
1432        if matches!(self.content, StoredContent::Err(_)) {
1433            return Token::Err;
1434        }
1435
1436        self.content_pos = *self.mach.pos();
1437        self.bufs.reset();
1438
1439        let mut b = match self.byte() {
1440            Ok(b) => b,
1441            Err(err) => {
1442                self.content = StoredContent::Err(err);
1443
1444                return Token::Err;
1445            }
1446        };
1447
1448        loop {
1449            match self.mach.next(b) {
1450                state::State::Mid => match self.byte() {
1451                    Ok(v) => b = v,
1452                    Err(err) => {
1453                        self.content = StoredContent::Err(err);
1454
1455                        return Token::Err;
1456                    }
1457                },
1458
1459                state::State::End {
1460                    token,
1461                    escaped,
1462                    repeat,
1463                } => {
1464                    if repeat && b.is_some() {
1465                        self.bufs.rewind();
1466                    }
1467
1468                    self.content = match token {
1469                        Token::ObjBegin => StoredContent::Literal("{"),
1470                        Token::ObjEnd => StoredContent::Literal("}"),
1471                        Token::ArrBegin => StoredContent::Literal("["),
1472                        Token::NameSep => StoredContent::Literal(":"),
1473                        Token::ValueSep => StoredContent::Literal(","),
1474                        Token::LitFalse => StoredContent::Literal("false"),
1475                        Token::LitNull => StoredContent::Literal("null"),
1476                        Token::LitTrue => StoredContent::Literal("true"),
1477                        _ => StoredContent::Range(self.bufs.i..self.bufs.k, escaped),
1478                    };
1479
1480                    return token;
1481                }
1482
1483                state::State::Err(kind) => {
1484                    let mut pos = *self.mach.pos();
1485
1486                    match &kind {
1487                        ErrorKind::BadSurrogate {
1488                            first: _,
1489                            second: _,
1490                            offset,
1491                        } => {
1492                            pos.offset -= *offset as usize;
1493                            pos.col -= *offset as usize;
1494                        }
1495
1496                        ErrorKind::BadUtf8ContByte {
1497                            seq_len,
1498                            offset: _,
1499                            value: _,
1500                        } => {
1501                            // Current `pos.offset` is at the end of the multibyte UTF-8 sequence.
1502                            // Rewind it to the start of the sequence.
1503                            let rewind = seq_len - 1;
1504                            pos.offset -= rewind as usize;
1505                        }
1506
1507                        _ => (),
1508                    }
1509
1510                    self.content = StoredContent::Err(Error {
1511                        kind,
1512                        pos,
1513                        source: None,
1514                    });
1515
1516                    return Token::Err;
1517                }
1518            }
1519        }
1520    }
1521
1522    /// Fetches the text content of the most recent non-error token.
1523    ///
1524    /// This is an inherent implementation of [`lexical::Analyzer::content`] for convenience, so it
1525    /// is available even when you don't have the trait imported.
1526    ///
1527    /// # Panics
1528    ///
1529    /// Panics if the most recent token returned by [`next`] was [`Token::Err`].
1530    ///
1531    /// # Example
1532    ///
1533    /// ```
1534    /// # use bufjson::lexical::{Token, read::ReadAnalyzer};
1535    /// # let example_dir = tempfile::tempdir_in(".").unwrap();
1536    /// # let example_file = example_dir.path().join("example.json");
1537    /// use std::fs::{self, File};
1538    ///
1539    /// fs::write(&example_file, r#"  null"#).unwrap();
1540    ///
1541    /// let mut lexer = ReadAnalyzer::new(File::open(&example_file).unwrap());
1542    ///
1543    /// assert_eq!(Token::White, lexer.next());
1544    /// assert_eq!("  ", lexer.content().literal());
1545    ///
1546    /// assert_eq!(Token::LitNull, lexer.next());
1547    /// assert_eq!("null", lexer.content().literal());
1548    /// ```
1549    ///
1550    /// [`next`]: method@Self::next
1551    #[inline]
1552    pub fn content(&self) -> Content {
1553        if let Ok(content) = self.try_content() {
1554            content
1555        } else {
1556            panic!("no content: last `next()` returned `Token::Err` (use `err()` instead)");
1557        }
1558    }
1559
1560    /// Fetches the error value associated with the most recent error token.
1561    ///
1562    /// This is an inherent implementation of [`lexical::Analyzer::err`] for convenience, so it is
1563    /// available even when you don't have the trait imported.
1564    ///
1565    /// # Panics
1566    ///
1567    /// Panics if the most recent token returned by [`next`] was not [`Token::Err`].
1568    ///
1569    /// # Example
1570    ///
1571    /// ```
1572    /// use bufjson::lexical::{ErrorKind, Expect, Token, read::ReadAnalyzer};
1573    /// # let example_dir = tempfile::tempdir_in(".").unwrap();
1574    /// # let example_file = example_dir.path().join("example.json");
1575    /// use std::fs::{self, File};
1576    ///
1577    /// fs::write(&example_file, "garbage!").unwrap();
1578    ///
1579    /// let mut lexer = ReadAnalyzer::new(File::open(&example_file).unwrap());
1580    ///
1581    /// assert_eq!(Token::Err, lexer.next());
1582    /// assert!(matches!(
1583    ///     lexer.err().kind(),
1584    ///     ErrorKind::UnexpectedByte { token: None, expect: Expect::TokenStartChar, actual: b'g'}
1585    /// ));
1586    /// ```
1587    ///
1588    /// [`next`]: method@Self::next
1589    #[inline]
1590    pub fn err(&self) -> Error {
1591        if let Err(err) = self.try_content() {
1592            err
1593        } else {
1594            panic!("no error: last `next()` did not return `Token::Err` (use `content()` instead)");
1595        }
1596    }
1597
    /// Returns the position of the start of the token most recently scanned by [`next`].
    ///
    /// This is an inherent implementation of [`lexical::Analyzer::pos`] for convenience, so it is
    /// available even when you don't have the trait imported.
    ///
    /// # Examples
    ///
    /// Before any token is scanned, the position is the default position.
    ///
    /// ```
    /// # use bufjson::{Pos, lexical::read::ReadAnalyzer};
    /// assert_eq!(Pos::default(), *ReadAnalyzer::new(&b""[..]).pos());
    /// ```
    ///
    /// The position of the first token returned is always the start of the buffer.
    ///
    /// ```
    /// use bufjson::{Pos, lexical::{Token, read::ReadAnalyzer}};
    ///
    /// let mut lexer = ReadAnalyzer::new(&b" \n"[..]);
    ///
    /// // Read the two-byte whitespace token that starts at offset 0.
    /// assert_eq!(Token::White, lexer.next());
    /// assert_eq!(Pos::default(), *lexer.pos());
    ///
    /// // The EOF token starts at the end of the whitespace token.
    /// assert_eq!(Token::Eof, lexer.next());
    /// assert_eq!(Pos { offset: 2, line: 2, col: 1}, *lexer.pos());
    /// ```
    ///
    /// On errors, the position reported by `pos` may be different than the position reported by
    /// the error returned from [`content`]. This is because the `pos` indicates the start of the
    /// token where the error occurred, and the error position is the exact position of the error.
    ///
    /// ```
    /// use bufjson::{Pos, lexical::{Token, read::ReadAnalyzer}};
    ///
    /// let mut lexer = ReadAnalyzer::new(&b"123_"[..]);
    ///
    /// assert_eq!(Token::Err, lexer.next());
    /// // `pos` is at the start of the number token that has the problem...
    /// assert_eq!(Pos::default(), *lexer.pos());
    /// // ...but the error contains the exact problem position: offset 3, column 4.
    /// assert_eq!(Pos { offset: 3, line: 1, col: 4 }, *lexer.err().pos())
    /// ```
    ///
    /// [`next`]: method@Self::next
    /// [`content`]: method@Self::content
    #[inline(always)]
    pub fn pos(&self) -> &Pos {
        // Captured from the state machine at the start of the most recent `next()` call.
        &self.content_pos
    }
1650
1651    /// Fetches the content or error associated with the most recent token.
1652    ///
1653    /// This is an inherent implementation of [`lexical::Analyzer::try_content`] for convenience, so
1654    /// it is available even when you don't have the trait imported.
1655    ///
1656    /// # Examples
1657    ///
1658    /// An `Ok` value is returned as long as the lexical analyzer isn't in an error state.
1659    ///
1660    /// ```
1661    /// # use bufjson::lexical::{Token, read::ReadAnalyzer};
1662    /// let mut lexer = ReadAnalyzer::new(&b"99.9e-1"[..]);
1663    /// assert_eq!(Token::Num, lexer.next());
1664    /// assert!(matches!(lexer.try_content(), Ok(c) if c.literal() == "99.9e-1"));
1665    /// ```
1666    ///
1667    /// Once the lexical analyzer encounters a lexical error, it will return an `Err` value
1668    /// describing that error.
1669    ///
1670    /// ```
1671    /// use bufjson::{Pos, lexical::{Token, read::ReadAnalyzer}};
1672    ///
1673    /// let mut lexer = ReadAnalyzer::new(&b"[unquoted]"[..]);
1674    /// assert_eq!(Token::ArrBegin, lexer.next());
1675    /// assert_eq!(Token::Err, lexer.next());
1676    /// assert_eq!(Pos { offset: 1, line: 1, col: 2}, *lexer.try_content().unwrap_err().pos());
1677    /// ```
1678    pub fn try_content(&self) -> Result<Content, Error> {
1679        match &self.content {
1680            StoredContent::Literal(s) => Ok(Content::from_static(s)),
1681            StoredContent::Range(rng, escaped) => {
1682                Ok(Content::from_bufs(&self.bufs, rng.clone(), *escaped))
1683            }
1684            StoredContent::Err(err) => Err(err.clone()),
1685        }
1686    }
1687
    /// Converts a lexical analyzer into a syntax parser, consuming the lexical analyzer in the
    /// process.
    ///
    /// You can convert the parser back into the underlying lexical analyzer using
    /// [`Parser::into_inner`].
    ///
    /// # Examples
    ///
    /// ```
    /// use bufjson::lexical::{Token, read::ReadAnalyzer};
    /// # let example_dir = tempfile::tempdir_in(".").unwrap();
    /// # let example_file = example_dir.path().join("example.json");
    /// use std::fs::{self, File};
    ///
    /// // Write some example JSON text to a file.
    /// fs::write(&example_file, r#"true false"#).unwrap();
    ///
    /// // Create a lexical analyzer and consume the first token.
    /// let mut lexer = ReadAnalyzer::new(File::open(&example_file).unwrap());
    /// assert_eq!(Token::LitTrue, lexer.next());
    ///
    /// // Convert the lexer into a parser. Since `true` is consumed, the next meaningful token is
    /// // `false`.
    /// let mut parser = lexer.into_parser();
    /// assert_eq!(Token::LitFalse, parser.next_meaningful());
    /// ```
    ///
    /// [`Parser::into_inner`]: syntax::Parser::into_inner
    pub fn into_parser(self) -> syntax::Parser<ReadAnalyzer<R>> {
        // Wrap this analyzer; the parser drives it through the `Analyzer` trait.
        syntax::Parser::new(self)
    }
1719
1720    /// Constructs a new lexer with a specified buffer size to tokenize JSON text from a reader.
1721    ///
1722    /// The minimum buffer size is 512 bytes.
1723    ///
1724    /// The reader can be anything that implements [`std::io::Read`], such as a file or network
1725    /// connection.
1726    ///
1727    /// # Panics
1728    ///
1729    /// Panics if the specified buffer size is less than 512 bytes.
1730    ///
1731    /// # Example
1732    ///
1733    /// ```no_run
1734    /// # use bufjson::lexical::read::ReadAnalyzer;
1735    /// # let example_dir = tempfile::tempdir_in(".").unwrap();
1736    /// # let example_file = example_dir.path().join("example.json");
1737    /// use std::fs::File;
1738    ///
1739    /// let mut lexer = ReadAnalyzer::with_buf_size(
1740    ///     File::open(&example_file).unwrap(),
1741    ///     16 * 1024                           // Use 16 KiB buffers instead of the default 8 KiB.
1742    /// );
1743    ///
1744    /// let _ = lexer.next();
1745    /// ```
1746    pub fn with_buf_size(read: R, buf_size: usize) -> Self {
1747        Self {
1748            bufs: Bufs::new(buf_size),
1749            content: StoredContent::default(),
1750            content_pos: Pos::default(),
1751            mach: state::Machine::default(),
1752            read,
1753        }
1754    }
1755
1756    #[inline]
1757    fn byte(&mut self) -> Result<Option<u8>, Error> {
1758        if let Some(b) = self.bufs.byte() {
1759            Ok(Some(b))
1760        } else {
1761            match self.bufs.read(&mut self.read) {
1762                Ok(eof) if eof => Ok(None),
1763                Ok(_) => Ok(self.bufs.byte()),
1764                Err(err) => Err(Error {
1765                    kind: ErrorKind::Read,
1766                    pos: *self.mach.pos(),
1767                    source: Some(Arc::new(err)),
1768                }),
1769            }
1770        }
1771    }
1772}
1773
impl<R: Read> Analyzer for ReadAnalyzer<R> {
    type Content = Content;
    type Error = Error;

    // Each trait method simply forwards to the inherent method of the same name. The calls are
    // written in fully-qualified form (`ReadAnalyzer::next(self)`) to name the inherent
    // implementation explicitly and avoid any ambiguity with the trait method itself.
    #[inline(always)]
    fn next(&mut self) -> Token {
        ReadAnalyzer::next(self)
    }

    #[inline(always)]
    fn try_content(&self) -> Result<Self::Content, Error> {
        ReadAnalyzer::try_content(self)
    }

    #[inline(always)]
    fn pos(&self) -> &Pos {
        ReadAnalyzer::pos(self)
    }
}
1793
1794#[cfg(test)]
1795mod tests {
1796    use super::*;
1797    use crate::{IntoBuf, lexical::Expect};
1798    use rstest::rstest;
1799    use std::{
1800        collections::{BTreeMap, HashMap},
1801        error::Error as _,
1802    };
1803
1804    #[test]
1805    #[should_panic(expected = "not enough bytes in buffer (4 requested, but only 3 remain)")]
1806    fn test_uniref_buf_advance_panic() {
1807        let mut b = UniRef::test_new("foo", 0..3);
1808
1809        b.advance(4);
1810    }
1811
1812    #[rstest]
1813    #[case("", 0..0, 0, "")]
1814    #[case("x", 0..0, 0, "")]
1815    #[case("x", 1..1, 0, "")]
1816    #[case("x", 0..1, 0, "x")]
1817    #[case("x", 0..1, 1, "")]
1818    #[case("hello", 0..5, 0, "hello")]
1819    #[case("hello", 0..5, 5, "")]
1820    #[case("hello", 0..2, 0, "he")]
1821    #[case("hello", 0..2, 1, "e")]
1822    #[case("hello", 0..2, 2, "")]
1823    #[case("hello", 1..5, 2, "lo")]
1824    #[case("hello", 1..5, 1, "llo")]
1825    #[case("hello", 1..5, 0, "ello")]
1826    #[case("hello", 1..4, 0, "ell")]
1827    fn test_uniref_buf_advance_ok(
1828        #[case] buf: &str,
1829        #[case] rng: Range<u32>,
1830        #[case] n: usize,
1831        #[case] chunk: &str,
1832    ) {
1833        let mut b = UniRef::test_new(buf, rng);
1834
1835        b.advance(n);
1836
1837        assert_eq!(chunk, str::from_utf8(b.chunk()).unwrap());
1838        assert_eq!(chunk.len(), b.remaining());
1839    }
1840
1841    #[rstest]
1842    #[case("", 0..0, "")]
1843    #[case("a", 0..0, "")]
1844    #[case("a", 0..1, "a")]
1845    #[case("a", 1..1, "")]
1846    #[case("foo", 0..3, "foo")]
1847    fn test_uniref_buf_chunk(#[case] buf: &str, #[case] rng: Range<u32>, #[case] expect: &str) {
1848        let b = UniRef::test_new(buf, rng);
1849
1850        assert_eq!(expect, str::from_utf8(b.chunk()).unwrap());
1851    }
1852
1853    #[rstest]
1854    #[case("", 0..0, 0, false)]
1855    #[case("a", 0..1, 1, true)]
1856    #[case("foo", 0..3, 3, true)]
1857    fn test_uniref_buf_remaining(
1858        #[case] buf: &str,
1859        #[case] rng: Range<u32>,
1860        #[case] expect_remaining: usize,
1861        #[case] expect_has_remaining: bool,
1862    ) {
1863        let b = UniRef::test_new(buf, rng);
1864
1865        assert_eq!(expect_remaining, b.remaining());
1866        assert_eq!(expect_has_remaining, b.has_remaining());
1867    }
1868
1869    #[rstest]
1870    #[case("", 0..0, b"", "")]
1871    #[case("a", 0..0, b"", "")]
1872    #[case("a", 0..1, b"", "a")]
1873    #[case("a", 0..1, b"a", "")]
1874    #[case("bar", 0..3, b"", "bar")]
1875    #[case("bar", 0..3, b"b", "ar")]
1876    #[case("bar", 0..3, b"ba", "r")]
1877    #[case("bar", 0..3, b"bar", "")]
1878    #[case("bar", 0..2, b"b", "a")]
1879    #[case("bar", 1..3, b"ar", "")]
1880    fn test_uniref_buf_try_copy_to_slice_ok<const N: usize>(
1881        #[case] buf: &str,
1882        #[case] rng: Range<u32>,
1883        #[case] expect: &[u8; N],
1884        #[case] rem: &str,
1885    ) {
1886        let mut b = UniRef::test_new(buf, rng);
1887        let mut actual = [0; N];
1888
1889        let result = b.try_copy_to_slice(&mut actual);
1890
1891        assert_eq!(Ok(()), result);
1892        assert_eq!(expect, &actual);
1893        assert_eq!(rem, str::from_utf8(b.chunk()).unwrap());
1894    }
1895
1896    #[rstest]
1897    #[case("", 0..0, [0; 1])]
1898    #[case("", 0..0, [0; 2])]
1899    #[case("a", 0..1, [0; 2])]
1900    #[case("foo", 0..3, [0; 4])]
1901    #[case("foo", 1..2, [0; 99])]
1902    fn test_uniref_buf_try_copy_to_slice_err<const N: usize>(
1903        #[case] buf: &str,
1904        #[case] rng: Range<u32>,
1905        #[case] mut dst: [u8; N],
1906    ) {
1907        let expect = &buf[rng.start as usize..rng.end as usize];
1908        let mut b = UniRef::test_new(buf, rng.clone());
1909
1910        let result = b.try_copy_to_slice(&mut dst);
1911
1912        assert_eq!(
1913            Err(BufUnderflow {
1914                remaining: (rng.end - rng.start) as usize,
1915                requested: N
1916            }),
1917            result
1918        );
1919        assert_eq!(expect, str::from_utf8(b.chunk()).unwrap());
1920    }
1921
    #[rstest]
    #[case(MultiRef::test_new([""; 0], 0..0), 1)]
    #[case(MultiRef::test_new([""], 0..0), 1)]
    #[case(MultiRef::test_new(["foo", ""], 0..3), 4)]
    #[case(MultiRef::test_new(["f", "o", "o", ""], 0..3), 4)]
    #[case(MultiRef::test_new(["hell", "o worl", "d"], 6..11), 6)]
    #[should_panic(expected = "not enough bytes in buffer")]
    fn test_multiref_buf_advance_panic(#[case] mut b: MultiRef, #[case] n: usize) {
        // Advancing by more bytes than remain in the multi-chunk buffer must panic.
        b.advance(n);
    }
1932
1933    #[rstest]
1934    #[case(MultiRef::test_new([""; 0], 0..0), "", 0, b"")]
1935    #[case(MultiRef::test_new([""], 0..0), "", 0, b"")]
1936    #[case(MultiRef::test_new(["a"], 0..0), "", 0, b"")]
1937    #[case(MultiRef::test_new(["a"], 0..1), "a", 0, b"a")]
1938    #[case(MultiRef::test_new(["a", ""], 0..1), "a", 1, b"")]
1939    #[case(MultiRef::test_new(["f", "o", "o"], 0..3), "f", 1, b"oo")]
1940    #[case(MultiRef::test_new(["f", "o", "o"], 0..3), "f", 2, b"o")]
1941    #[case(MultiRef::test_new(["f", "o", "o"], 0..3), "f", 3, b"")]
1942    #[case(MultiRef::test_new(["fo", "o", ""], 0..3), "fo", 1, b"oo")]
1943    #[case(MultiRef::test_new(["fo", "o", ""], 0..3), "fo", 2, b"o")]
1944    #[case(MultiRef::test_new(["fo", "o", ""], 0..3), "fo", 3, b"")]
1945    #[case(MultiRef::test_new(["he", "ll", "o world"], 0..5), "he", 0, b"hello")]
1946    #[case(MultiRef::test_new(["he", "ll", "o world"], 0..5), "he", 1, b"ello")]
1947    #[case(MultiRef::test_new(["he", "ll", "o world"], 0..5), "he", 2, b"llo")]
1948    #[case(MultiRef::test_new(["he", "ll", "o world"], 0..5), "he", 3, b"lo")]
1949    #[case(MultiRef::test_new(["he", "ll", "o world"], 0..5), "he", 4, b"o")]
1950    #[case(MultiRef::test_new(["he", "ll", "o world"], 0..5), "he", 5, b"")]
1951    fn test_multiref_buf_advance_ok<const N: usize>(
1952        #[case] mut b: MultiRef,
1953        #[case] expect_chunk: &str,
1954        #[case] n: usize,
1955        #[case] expect_tail: &[u8; N],
1956    ) {
1957        let before = b.chunk();
1958        assert_eq!(expect_chunk, str::from_utf8(before).unwrap());
1959
1960        b.advance(n);
1961
1962        assert_eq!(N, b.remaining());
1963
1964        let after = b.chunk();
1965        if N > 0 {
1966            assert!(after.len() > 0);
1967        } else {
1968            assert!(after.is_empty());
1969        }
1970
1971        let mut dst = [0u8; N];
1972        b.copy_to_slice(&mut dst);
1973        assert_eq!(expect_tail, &dst);
1974
1975        assert_eq!(0, b.remaining());
1976        assert_eq!(b"", b.chunk());
1977    }
1978
1979    #[test]
1980    #[should_panic(expected = "not enough bytes in buffer (1 requested, but only 0 remain)")]
1981    fn test_multiref_copy_to_slice_underflow_panic() {
1982        let mut b = MultiRef::test_new([""], 0..0).into_buf();
1983        let mut dst = [0u8; 1];
1984
1985        b.copy_to_slice(&mut dst);
1986    }
1987
1988    #[test]
1989    fn test_multiref_copy_to_slice_partial_buf() {
1990        let mut b = MultiRef::test_new([" f", "oolishness"], 1..8);
1991        let mut dst = [0u8; 3];
1992
1993        b.copy_to_slice(&mut dst);
1994
1995        assert_eq!(b"foo", &dst);
1996        assert_eq!(4, b.remaining());
1997        assert_eq!(b"lish", b.chunk());
1998    }
1999
2000    #[test]
2001    fn test_multiref_copy_to_slice_full_buf() {
2002        let mut b = MultiRef::test_new([" f", "oolishness"], 1..5);
2003        let mut dst = [0u8; 4];
2004
2005        b.copy_to_slice(&mut dst);
2006
2007        assert_eq!(b"fool", &dst);
2008    }
2009
2010    #[test]
2011    fn test_multiref_copy_to_slice_blarg() {
2012        let mut b = MultiRef::test_new(["foo", "li", "shness"], 0..7);
2013        let mut dst = [0u8; 4];
2014
2015        b.copy_to_slice(&mut dst);
2016
2017        assert_eq!(b"fool", &dst);
2018        assert_eq!(3, b.remaining());
2019        assert_eq!(b"i", b.chunk());
2020    }
2021
    #[rstest]
    #[case(InnerLiteral::Static(""), 0)]
    #[case(InnerLiteral::Static("a"), 1)]
    #[case(InnerLiteral::Inline(0, 0, [0; INLINE_LEN]), 0)]
    #[case(InnerLiteral::Inline(0, 1, [0; INLINE_LEN]), 1)]
    #[case(InnerLiteral::Inline(1, 1, [0; INLINE_LEN]), 0)]
    #[case(InnerLiteral::Inline(1, 2, [0; INLINE_LEN]), 1)]
    #[case(InnerLiteral::Inline(3, 7, [0; INLINE_LEN]), 4)]
    #[case(InnerLiteral::Uni(UniRef::test_new("", 0..0)), 0)]
    #[case(InnerLiteral::Uni(UniRef::test_new("a", 0..0)), 0)]
    #[case(InnerLiteral::Uni(UniRef::test_new("a", 0..1)), 1)]
    #[case(InnerLiteral::Uni(UniRef::test_new("ab", 1..2)), 1)]
    #[case(InnerLiteral::Uni(UniRef::test_new("abcd", 1..3)), 2)]
    #[case(InnerLiteral::Multi(Box::new(MultiRef::test_new([""; 0], 0..0))), 0)]
    #[case(InnerLiteral::Multi(Box::new(MultiRef::test_new([""], 0..0))), 0)]
    #[case(InnerLiteral::Multi(Box::new(MultiRef::test_new(["a"], 0..0))), 0)]
    #[case(InnerLiteral::Multi(Box::new(MultiRef::test_new(["a", ""], 0..0))), 0)]
    #[case(InnerLiteral::Multi(Box::new(MultiRef::test_new(["a"], 0..1))), 1)]
    #[case(InnerLiteral::Multi(Box::new(MultiRef::test_new(["a", "b"], 0..2))), 2)]
    #[case(InnerLiteral::Multi(Box::new(MultiRef::test_new(["a", "b", "cd"], 1..4))), 3)]
    fn test_inner_literal_len(#[case] inner: InnerLiteral, #[case] expect: usize) {
        // `len` must report the selected byte count for every representation variant.
        assert_eq!(expect, inner.len());
    }
2045
2046    #[rstest]
2047    #[case(InnerLiteral::Static(""), "")]
2048    #[case(InnerLiteral::Static("a"), "a")]
2049    #[case(InnerLiteral::Inline(0, 0, [0; INLINE_LEN]), "")]
2050    #[case(InnerLiteral::Inline(0, 1, [b'a'; INLINE_LEN]), "a")]
2051    #[case(InnerLiteral::Inline(0, INLINE_LEN as u8, [b'b'; INLINE_LEN]), "b".repeat(INLINE_LEN))]
2052    #[case(InnerLiteral::Uni(UniRef::test_new("c", 0..1)), "c")]
2053    #[case(InnerLiteral::Uni(UniRef::test_new("def".repeat(u8::MAX as usize), 0..(3 * u8::MAX as u32))), "def".repeat(u8::MAX as usize))]
2054    fn test_inner_literal_repr_together(
2055        #[case] inner: InnerLiteral,
2056        #[case] expect: impl AsRef<str>,
2057    ) {
2058        assert!(matches!(inner.repr(), Repr::Together(s) if s == expect.as_ref()));
2059    }
2060
2061    #[test]
2062    fn test_inner_literal_repr_split() {
2063        let inner = InnerLiteral::Multi(Box::new(MultiRef::test_new(["xfoo", " ", "barx"], 1..8)));
2064        let repr = inner.repr();
2065
2066        if let Repr::Split(m) = repr {
2067            let mut b = m.clone();
2068            let mut dst = [0u8; 7];
2069
2070            b.copy_to_slice(&mut dst);
2071
2072            assert_eq!(b"foo bar", &dst);
2073            assert_eq!(0, b.remaining());
2074            assert_eq!(0, b.chunk().len());
2075        } else {
2076            panic!("expected {:?} to be Repr::Split", repr);
2077        }
2078    }
2079
2080    #[rstest]
2081    #[case(Literal::from_static(""), 0)]
2082    #[case(Literal::from_static("a"), 1)]
2083    #[case(Literal::from_static(concat!(
2084        "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
2085        "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
2086        "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
2087        "aaaaaaaaaaaaaab",
2088    )), u8::MAX as usize)]
2089    #[case(Literal::from_ref(""), 0)]
2090    #[case(Literal::from_ref(&"a".repeat(INLINE_LEN)), INLINE_LEN)]
2091    #[case(Literal::from_ref(&"b".repeat(INLINE_LEN+1)), INLINE_LEN+1)]
2092    #[case(Literal::from_ref(&Cow::Borrowed("foo")), 3)]
2093    #[case(Literal::from_ref(&Cow::Owned("bar".to_string())), 3)]
2094    #[case(Literal::from_string("".to_string()), 0)]
2095    #[case(Literal::from_string("c".to_string()), 1)]
2096    #[case(Literal::from_string("d".repeat(100 * INLINE_LEN)), 100 * INLINE_LEN)]
2097    #[case("baz".into(), 3)]
2098    #[case(Cow::Borrowed("").into(), 0)]
2099    #[case(Cow::<str>::Owned("e".repeat(INLINE_LEN-1)).into(), INLINE_LEN-1)]
2100    #[case("qux".to_string().into(), 3)]
2101    #[case(Literal::from_str("hello, world").unwrap(), 12)]
2102    #[case(Literal(InnerLiteral::Multi(Box::new(MultiRef::test_new(["b", "a", "z"], 0..3)))), 3)]
2103    fn test_literal_convert(#[case] literal: Literal, #[case] expect_len: usize) {
2104        assert_eq!(expect_len, literal.len());
2105        assert_eq!(expect_len == 0, literal.is_empty());
2106
2107        let mut b = literal.clone().into_buf();
2108
2109        assert_eq!(expect_len, b.remaining());
2110        assert_eq!(expect_len == 0, !b.has_remaining());
2111
2112        let mut dst = vec![0u8; expect_len];
2113        b.copy_to_slice(&mut dst);
2114
2115        let s = String::from_utf8(dst).unwrap();
2116
2117        assert_eq!(literal.to_string(), s);
2118        assert_eq!(Into::<String>::into(literal), s);
2119    }
2120
2121    #[test]
2122    fn test_literal_compare() {
2123        let a_s = vec![
2124            Literal::from_static("a"),
2125            Literal::from_ref("a"),
2126            Literal::from_string("a".to_string()),
2127            Literal(InnerLiteral::Multi(Box::new(MultiRef::test_new(
2128                ["aaa"],
2129                1..2,
2130            )))),
2131        ];
2132        let aa_s: Vec<Literal> = vec![
2133            Literal::from_ref(&"a".repeat(INLINE_LEN)),
2134            Literal::from_string("a".repeat(INLINE_LEN)),
2135            Literal(InnerLiteral::Multi(Box::new(MultiRef::test_new(
2136                [[b'a'; INLINE_LEN]],
2137                0..INLINE_LEN,
2138            )))),
2139            Literal(InnerLiteral::Multi(Box::new(MultiRef::test_new(
2140                ["a"; INLINE_LEN],
2141                0..INLINE_LEN,
2142            )))),
2143        ];
2144        let aab_s: Vec<Literal> = vec![
2145            Literal::from_static(concat!(
2146                "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
2147                "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
2148                "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
2149                "aaaaaaaaaaaaaab",
2150            )),
2151            Literal::from_ref(("a".repeat(u8::MAX as usize - 1) + "b").as_str()),
2152            Literal::from_string("a".repeat(u8::MAX as usize - 1) + "b"),
2153            Literal(InnerLiteral::Multi(Box::new(MultiRef::test_new(
2154                ["a".repeat(u8::MAX as usize - 1), "abc".to_string()],
2155                1..u8::MAX as usize + 1,
2156            )))),
2157        ];
2158
2159        macro_rules! assert_all_eq {
2160            ($a:expr, $b:expr) => {
2161                assert_eq!($a, $a);
2162                assert_eq!($b, $a);
2163                assert_eq!($a, $b);
2164                assert!($a <= $a);
2165                assert!(!($a < $a));
2166                assert!($a >= $a);
2167                assert!(!($a > $a));
2168            };
2169        }
2170
2171        macro_rules! assert_all_ne {
2172            ($a:expr, $b:expr) => {
2173                assert_ne!($a, $b);
2174                assert_ne!($b, $a);
2175            };
2176        }
2177
2178        macro_rules! assert_all_lt {
2179            ($a:expr, $b:expr) => {
2180                assert!($a < $b);
2181                assert!(!($b < $a));
2182                assert!(!($a > $b));
2183                assert!($b > $a);
2184                assert!($a <= $b);
2185                assert!($b >= $a);
2186            };
2187        }
2188
2189        macro_rules! assert_all_gt {
2190            ($a:expr, $b:expr) => {
2191                assert!($a > $b);
2192                assert!(!($b > $a));
2193                assert!(!($a < $b));
2194                assert!($b < $a);
2195                assert!($a >= $b);
2196                assert!($b <= $a);
2197            };
2198        }
2199
2200        for a in &a_s {
2201            assert_all_eq!(a, "a");
2202            assert_all_eq!(Unescaped::Literal(a), "a");
2203            assert_all_ne!(a, "ab");
2204            assert_all_ne!(Unescaped::Literal(a), "aa");
2205            assert_eq!(&"a", a);
2206            assert_eq!(&"a".to_string(), a);
2207            assert_eq!(a, &"a");
2208            assert_eq!(a, &"a".to_string());
2209
2210            assert!(a <= &"a");
2211            assert!(a <= &"a".to_string());
2212            assert!(!(a < &"a"));
2213            assert!(!(a < &"a".to_string()));
2214            assert!(a >= &"a");
2215            assert!(a >= &"a".to_string());
2216            assert!(!(a > &"a"));
2217            assert!(!(a > &"a".to_string()));
2218
2219            for other in aa_s.iter().chain(aab_s.iter()) {
2220                assert_all_ne!(a, other);
2221                assert_all_lt!(a, other);
2222                assert_all_gt!(other, a);
2223            }
2224        }
2225
2226        for aa in &aa_s {
2227            assert_all_eq!(aa, "a".repeat(INLINE_LEN).as_str());
2228            assert_all_eq!(Unescaped::Literal(aa), "a".repeat(INLINE_LEN).as_str());
2229            assert_all_ne!(aa, "aab");
2230            assert_all_ne!(Unescaped::Literal(aa), "aab");
2231
2232            assert_all_gt!(aa, "a");
2233            assert_all_gt!(Unescaped::Literal(aa), "a");
2234            assert_all_lt!(aa, "aab");
2235            assert_all_lt!(Unescaped::Literal(aa), "aab");
2236
2237            assert!(aa < &"aab");
2238            assert!(aa < &"aab".to_string());
2239            assert!(aa <= &"aab");
2240            assert!(aa <= &"aab".to_string());
2241            assert!(&"aab" > aa);
2242            assert!(&"aab".to_string() > aa);
2243            assert!(aa <= &"aab");
2244            assert!(aa <= &"aab".to_string());
2245            assert!(&"aab" > aa);
2246            assert!(&"aab".to_string() > aa);
2247
2248            for aab in &aab_s {
2249                assert_all_ne!(aa, aab);
2250                assert_all_lt!(aa, aab);
2251                assert_all_gt!(aab, aa);
2252            }
2253        }
2254
2255        macro_rules! check_map {
2256            ($map:ident, $patient_zero:expr, $iter:expr) => {
2257                assert!($map.insert($patient_zero, $patient_zero).is_none());
2258                for item in $iter {
2259                    assert_eq!($patient_zero, *$map.get(&item).unwrap());
2260                }
2261            };
2262        }
2263
2264        let mut hash_map1 = HashMap::new();
2265
2266        check_map!(hash_map1, a_s[0].clone(), a_s.clone());
2267        check_map!(hash_map1, aa_s[0].clone(), aa_s.clone());
2268        check_map!(hash_map1, aab_s[0].clone(), aab_s.clone());
2269
2270        let mut hash_map2 = HashMap::new();
2271
2272        let unescaped_a = Unescaped::Literal(a_s[0].clone());
2273        let unescaped_aa = Unescaped::Literal(aa_s[0].clone());
2274        let unescaped_aab = Unescaped::Literal(aab_s[0].clone());
2275
2276        check_map!(
2277            hash_map2,
2278            unescaped_a.clone(),
2279            a_s.iter().cloned().map(Unescaped::Literal)
2280        );
2281        check_map!(
2282            hash_map2,
2283            unescaped_aa.clone(),
2284            aa_s.iter().cloned().map(Unescaped::Literal)
2285        );
2286        check_map!(
2287            hash_map2,
2288            unescaped_aab.clone(),
2289            aab_s.iter().cloned().map(Unescaped::Literal)
2290        );
2291
2292        let mut btree_map1 = BTreeMap::new();
2293
2294        check_map!(btree_map1, a_s[0].clone(), a_s.clone());
2295        check_map!(btree_map1, aa_s[0].clone(), aa_s.clone());
2296        check_map!(btree_map1, aab_s[0].clone(), aab_s.clone());
2297
2298        let mut btree_map2 = BTreeMap::new();
2299
2300        check_map!(
2301            btree_map2,
2302            unescaped_a.clone(),
2303            a_s.iter().cloned().map(Unescaped::Literal)
2304        );
2305        check_map!(
2306            btree_map2,
2307            unescaped_aa.clone(),
2308            aa_s.iter().cloned().map(Unescaped::Literal)
2309        );
2310        check_map!(
2311            btree_map2,
2312            unescaped_aab.clone(),
2313            aab_s.iter().cloned().map(Unescaped::Literal)
2314        );
2315    }
2316
2317    #[rstest]
2318    #[case(Literal::from_static(""))]
2319    #[case(Literal::from_ref(""))]
2320    #[case(Literal::from_string("".into()))]
2321    #[case(Literal(InnerLiteral::Uni(UniRef::test_new("", 0..0))))]
2322    #[case(Literal(InnerLiteral::Uni(UniRef::test_new("a", 1..1))))]
2323    #[case(Literal(InnerLiteral::Uni(UniRef::test_new("ab", 1..1))))]
2324    #[case(Literal(InnerLiteral::Multi(Box::new(MultiRef::test_new(["0"], 0..0)))))]
2325    #[case(Literal(InnerLiteral::Multi(Box::new(MultiRef::test_new(["a"], 1..1)))))]
2326    #[case(Literal(InnerLiteral::Multi(Box::new(MultiRef::test_new(["a", "b"], 1..1)))))]
2327    #[should_panic(expected = "not enough bytes in buffer (1 requested, but only 0 remain)")]
2328    fn test_literal_buf_advance_panic(#[case] literal: Literal) {
2329        let _ = literal.into_buf().advance(1);
2330    }
2331
2332    #[rstest]
2333    #[case(Literal::from_static(""))]
2334    #[case(Literal::from_ref(""))]
2335    #[case(Literal::from_string("".into()))]
2336    #[case(Literal(InnerLiteral::Uni(UniRef::test_new("", 0..0))))]
2337    #[case(Literal(InnerLiteral::Uni(UniRef::test_new("a", 1..1))))]
2338    #[case(Literal(InnerLiteral::Uni(UniRef::test_new("ab", 1..1))))]
2339    #[case(Literal(InnerLiteral::Multi(Box::new(MultiRef::test_new(["0"], 0..0)))))]
2340    #[case(Literal(InnerLiteral::Multi(Box::new(MultiRef::test_new(["a"], 1..1)))))]
2341    #[case(Literal(InnerLiteral::Multi(Box::new(MultiRef::test_new(["a", "b"], 1..1)))))]
2342    #[should_panic(expected = "not enough bytes in buffer (1 requested, but only 0 remain)")]
2343    fn test_literal_buf_copy_to_slice_panic(#[case] literal: Literal) {
2344        let mut dst = [0; 1];
2345
2346        let _ = literal.into_buf().copy_to_slice(&mut dst);
2347    }
2348
2349    #[rstest]
2350    #[case(Content::from_static(""), "", None)]
2351    #[case(Content::from_static(""), "", None)]
2352    #[case(
2353        Content::from_static(concat!(
2354            "................................................................................",
2355            ",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,",
2356            "________________________________________________________________________________",
2357            "+++++++++++++++",
2358        )),
2359        concat!(
2360            "................................................................................",
2361            ",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,",
2362            "________________________________________________________________________________",
2363            "+++++++++++++++",
2364        ),
2365        None,
2366    )]
2367    #[case(Content(InnerContent::Inline(0, [0; INLINE_LEN])), "", None)]
2368    #[case(Content(InnerContent::NotEscapedUni(UniRef::test_new("", 0..0))), "", None)]
2369    #[case(Content(InnerContent::NotEscapedUni(UniRef::test_new("foo", 0..3))), "foo", None)]
2370    #[case(Content(InnerContent::NotEscapedUni(UniRef::test_new("a barge", 2..5))), "bar", None)]
2371    #[case(Content(InnerContent::NotEscapedMulti(Box::new(MultiRef::test_new([""], 0..0)))), "", None)]
2372    #[case(Content(InnerContent::NotEscapedMulti(Box::new(MultiRef::test_new(["a b", "a", "rge"], 2..5)))), "bar", None)]
2373    #[case(Content(InnerContent::EscapedUni(UniRef::test_new("", 0..0))), "", Some(""))]
2374    #[case(Content(InnerContent::EscapedUni(UniRef::test_new("foo", 0..3))), "foo", Some("foo"))]
2375    #[case(Content(InnerContent::EscapedUni(UniRef::test_new("a b\\u0061rge", 2..10))), "b\\u0061r", Some("bar"))]
2376    #[case(Content(InnerContent::EscapedMulti(Box::new(MultiRef::test_new([""], 0..0)))), "", Some(""))]
2377    #[case(Content(InnerContent::EscapedMulti(Box::new(MultiRef::test_new(["tomf", "oo", "lery"], 3..6)))), "foo", Some("foo"))]
2378    #[case(Content(InnerContent::EscapedMulti(Box::new(MultiRef::test_new(["\\", "u", "006", "6\\u", "0", "06", "fox"], 0..13)))), "\\u0066\\u006fo", Some("foo"))]
2379    #[case(Content::from_bufs(&Bufs::new(Bufs::MIN_BUF_SIZE), 0..0, false), "", None)]
2380    #[case(Content::from_bufs(&Bufs::new(Bufs::MIN_BUF_SIZE), 0..0, true), "", Some(""))]
2381    fn test_content(
2382        #[case] content: Content,
2383        #[case] expect_literal: &str,
2384        #[case] expect_unescaped: Option<&str>,
2385    ) {
2386        assert_eq!(expect_literal, content.literal().into_string());
2387        assert_eq!(expect_unescaped.is_some(), content.is_escaped());
2388        if let Some(expect) = expect_unescaped {
2389            assert_eq!(expect, content.unescaped().into_string());
2390        }
2391    }
2392
2393    #[rstest]
2394    #[case(
2395        ErrorKind::Read,
2396        "read error at line 2, column 1 (offset: 3)",
2397        Some(io::ErrorKind::BrokenPipe)
2398    )]
2399    #[case(
2400        ErrorKind::UnexpectedEof(Token::LitNull),
2401        "unexpected EOF in null token at line 2, column 1 (offset: 3)",
2402        None
2403    )]
2404    fn test_error(
2405        #[case] kind: ErrorKind,
2406        #[case] expect_display: &str,
2407        #[case] source: Option<std::io::ErrorKind>,
2408    ) {
2409        let pos = Pos::new(3, 2, 1);
2410        let err = Error {
2411            kind,
2412            pos,
2413            source: source.map(io::Error::from).map(Arc::new),
2414        };
2415
2416        assert_eq!(kind, err.kind());
2417        assert_eq!(&pos, err.pos());
2418        assert_eq!(
2419            source,
2420            err.source()
2421                .and_then(|e| e.downcast_ref::<io::Error>())
2422                .map(|e| e.kind()),
2423        );
2424
2425        let actual_display = format!("{err}");
2426        assert_eq!(expect_display, actual_display);
2427    }
2428
    #[test]
    #[should_panic(expected = "buffer size too low: minimum is 1 bytes, but 0 was given")]
    fn test_bufs_new_panic() {
        // A zero buffer size is below the minimum and must be rejected at construction.
        let _ = Bufs::new(0);
    }
2434
2435    #[test]
2436    fn test_bufs_new_reset() {
2437        let mut bufs = Bufs::new(Bufs::MIN_BUF_SIZE);
2438
2439        bufs.reset();
2440
2441        assert!(bufs.current.is_empty());
2442        assert!(bufs.used.is_empty());
2443        assert_eq!(0, bufs.i);
2444        assert_eq!(0, bufs.j);
2445        assert_eq!(0, bufs.k);
2446        assert!(bufs.maybe_free.is_empty());
2447        assert_eq!(Bufs::MIN_BUF_SIZE, bufs.buf_size);
2448        assert!(!bufs.eof);
2449
2450        assert!(bufs.byte().is_none());
2451    }
2452
2453    #[test]
2454    fn test_bufs_new_byte() {
2455        let mut bufs = Bufs::new(Bufs::MIN_BUF_SIZE);
2456
2457        assert!(bufs.byte().is_none());
2458
2459        assert!(bufs.current.is_empty());
2460        assert!(bufs.used.is_empty());
2461        assert_eq!(0, bufs.i);
2462        assert_eq!(0, bufs.j);
2463        assert_eq!(0, bufs.k);
2464        assert!(bufs.maybe_free.is_empty());
2465        assert_eq!(Bufs::MIN_BUF_SIZE, bufs.buf_size);
2466        assert!(!bufs.eof);
2467    }
2468
2469    #[test]
2470    fn test_bufs_read_empty() {
2471        let mut bufs = Bufs::new(Bufs::MIN_BUF_SIZE);
2472        let mut empty: &[u8] = &[];
2473
2474        assert!(matches!(bufs.read(&mut empty), Ok(true)));
2475
2476        assert!(bufs.current.is_empty());
2477        assert!(bufs.used.is_empty());
2478        assert_eq!(0, bufs.i);
2479        assert_eq!(0, bufs.j);
2480        assert_eq!(0, bufs.k);
2481        assert!(bufs.maybe_free.is_empty());
2482        assert_eq!(Bufs::MIN_BUF_SIZE, bufs.buf_size);
2483        assert!(bufs.eof);
2484
2485        assert!(matches!(bufs.read(&mut empty), Ok(true)));
2486    }
2487
2488    #[rstest]
2489    #[case(Bufs::MIN_BUF_SIZE, "a", 0)]
2490    #[case(Bufs::DEFAULT_BUF_SIZE, "b", 0)]
2491    #[case(Bufs::MIN_BUF_SIZE, "foo", 2)]
2492    #[case(Bufs::DEFAULT_BUF_SIZE, "bar", 0)]
2493    fn test_bufs_read_to_end(
2494        #[case] buf_size: usize,
2495        #[case] input: &str,
2496        #[case] expect_used: usize,
2497    ) {
2498        let mut bufs = Bufs::new(buf_size);
2499        let mut reader = input.as_bytes();
2500        let mut dst = Vec::with_capacity(input.len());
2501
2502        loop {
2503            assert!(bufs.used.len() <= expect_used);
2504
2505            loop {
2506                match bufs.byte() {
2507                    Some(b) => dst.push(b),
2508                    None => break,
2509                }
2510            }
2511
2512            match bufs.read(&mut reader) {
2513                Ok(true) => break,
2514                Ok(false) => continue,
2515                Err(err) => panic!("unexpected error: {err},"),
2516            }
2517        }
2518
2519        assert!(bufs.eof);
2520        assert_eq!(input, str::from_utf8(&dst).unwrap());
2521        assert_eq!(expect_used, bufs.used.len());
2522        assert_eq!(buf_size, bufs.current.capacity());
2523        bufs.used.iter().enumerate().for_each(|(i, u)| {
2524            assert_eq!(
2525                buf_size,
2526                u.len(),
2527                "expected used[{i}] to have length {buf_size}, but it is {}",
2528                u.len()
2529            )
2530        });
2531    }
2532
2533    #[test]
2534    #[should_panic(expected = "read 2 bytes but buffer size is only 1")]
2535    fn test_bufs_read_too_much() {
2536        struct ReadTooMuch;
2537
2538        impl Read for ReadTooMuch {
2539            fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
2540                Ok(buf.len() + 1)
2541            }
2542        }
2543
2544        let mut bufs = Bufs::new(Bufs::MIN_BUF_SIZE);
2545        let mut reader = ReadTooMuch;
2546
2547        let _ = bufs.read(&mut reader);
2548    }
2549
2550    #[test]
2551    fn test_bufs_read_error() {
2552        struct ReadError;
2553
2554        impl Read for ReadError {
2555            fn read(&mut self, _buf: &mut [u8]) -> io::Result<usize> {
2556                Err(io::Error::new(io::ErrorKind::Other, "snafu"))
2557            }
2558        }
2559
2560        let mut bufs = Bufs::new(Bufs::MIN_BUF_SIZE);
2561        let mut reader = ReadError;
2562
2563        let result = bufs.read(&mut reader);
2564
2565        assert!(
2566            matches!(result, Err(err) if err.kind() == io::ErrorKind::Other && err.to_string() == "snafu")
2567        );
2568    }
2569
2570    #[test]
2571    fn test_analyzer_empty() {
2572        let mut an = ReadAnalyzer::new(io::empty());
2573
2574        assert_eq!(an.next(), Token::Eof);
2575        assert_eq!("", an.content().literal().into_string());
2576        assert_eq!("", an.content().unescaped().into_string());
2577    }
2578
2579    #[test]
2580    fn test_analyzer_initial_state_content() {
2581        let an = ReadAnalyzer::new(io::empty());
2582
2583        for _ in 0..5 {
2584            let content = an.content();
2585            assert_eq!("", content.literal().into_string());
2586            assert!(!content.is_escaped());
2587            assert_eq!("", content.unescaped().into_string());
2588
2589            let content = an.try_content().unwrap();
2590            assert_eq!("", content.literal().into_string());
2591            assert!(!content.is_escaped());
2592            assert_eq!("", content.unescaped().into_string());
2593        }
2594    }
2595
2596    #[test]
2597    #[should_panic(
2598        expected = "no error: last `next()` did not return `Token::Err` (use `content()` instead)"
2599    )]
2600    fn test_analyzer_initial_state_err() {
2601        let _ = ReadAnalyzer::new(io::empty()).err();
2602    }
2603
2604    #[rstest]
2605    #[case("", Token::Eof, None)]
2606    #[case("{", Token::ObjBegin, None)]
2607    #[case("}", Token::ObjEnd, None)]
2608    #[case("[", Token::ArrBegin, None)]
2609    #[case("]", Token::ArrEnd, None)]
2610    #[case(":", Token::NameSep, None)]
2611    #[case(",", Token::ValueSep, None)]
2612    #[case("false", Token::LitFalse, None)]
2613    #[case("null", Token::LitNull, None)]
2614    #[case("true", Token::LitTrue, None)]
2615    #[case("0", Token::Num, None)]
2616    #[case("-0", Token::Num, None)]
2617    #[case("1", Token::Num, None)]
2618    #[case("-1", Token::Num, None)]
2619    #[case("12", Token::Num, None)]
2620    #[case("-12", Token::Num, None)]
2621    #[case("0.0", Token::Num, None)]
2622    #[case("-0.0", Token::Num, None)]
2623    #[case("0.123456789", Token::Num, None)]
2624    #[case("-123.456789", Token::Num, None)]
2625    #[case("0E0", Token::Num, None)]
2626    #[case("0e0", Token::Num, None)]
2627    #[case("0E+0", Token::Num, None)]
2628    #[case("0e+0", Token::Num, None)]
2629    #[case("0E-0", Token::Num, None)]
2630    #[case("0e-0", Token::Num, None)]
2631    #[case("0.0E0", Token::Num, None)]
2632    #[case("0.0e0", Token::Num, None)]
2633    #[case("0.0E+0", Token::Num, None)]
2634    #[case("0.0e+0", Token::Num, None)]
2635    #[case("0.0E0", Token::Num, None)]
2636    #[case("0.0e0", Token::Num, None)]
2637    #[case("0E0", Token::Num, None)]
2638    #[case("0e0", Token::Num, None)]
2639    #[case("-0E+0", Token::Num, None)]
2640    #[case("-0e+0", Token::Num, None)]
2641    #[case("-0E-0", Token::Num, None)]
2642    #[case("-0e-0", Token::Num, None)]
2643    #[case("-0.0E0", Token::Num, None)]
2644    #[case("-0.0e0", Token::Num, None)]
2645    #[case("-0.0E+0", Token::Num, None)]
2646    #[case("-0.0e+0", Token::Num, None)]
2647    #[case("-0.0E0", Token::Num, None)]
2648    #[case("-0.0e0", Token::Num, None)]
2649    #[case("123E456", Token::Num, None)]
2650    #[case("123e456", Token::Num, None)]
2651    #[case("123.456E+7", Token::Num, None)]
2652    #[case("123.456e+7", Token::Num, None)]
2653    #[case("123.456E-89", Token::Num, None)]
2654    #[case("123.456e-89", Token::Num, None)]
2655    #[case("-123E456", Token::Num, None)]
2656    #[case("-123e456", Token::Num, None)]
2657    #[case("-123.456E+7", Token::Num, None)]
2658    #[case("-123.456e+7", Token::Num, None)]
2659    #[case("-123.456E-89", Token::Num, None)]
2660    #[case("-123.456e-89", Token::Num, None)]
2661    #[case(r#""""#, Token::Str, None)]
2662    #[case(r#"" ""#, Token::Str, None)]
2663    #[case(r#""foo""#, Token::Str, None)]
2664    #[case(r#""The quick brown fox jumped over the lazy dog!""#, Token::Str, None)]
2665    #[case(r#""\\""#, Token::Str, Some(r#""\""#))]
2666    #[case(r#""\/""#, Token::Str, Some(r#""/""#))]
2667    #[case(r#""\t""#, Token::Str, Some("\"\t\""))]
2668    #[case(r#""\r""#, Token::Str, Some("\"\r\""))]
2669    #[case(r#""\n""#, Token::Str, Some("\"\n\""))]
2670    #[case(r#""\f""#, Token::Str, Some("\"\u{000c}\""))]
2671    #[case(r#""\b""#, Token::Str, Some("\"\u{0008}\""))]
2672    #[case(r#""\u0000""#, Token::Str, Some("\"\u{0000}\""))]
2673    #[case(r#""\u001f""#, Token::Str, Some("\"\u{001f}\""))]
2674    #[case(r#""\u0020""#, Token::Str, Some(r#"" ""#))]
2675    #[case(r#""\u007E""#, Token::Str, Some(r#""~""#))]
2676    #[case(r#""\u007F""#, Token::Str, Some("\"\u{007f}\""))]
2677    #[case(r#""\u0080""#, Token::Str, Some("\"\u{0080}\""))]
2678    #[case(r#""\u0100""#, Token::Str, Some("\"\u{0100}\""))]
2679    #[case(r#""\uE000""#, Token::Str, Some("\"\u{e000}\""))]
2680    #[case(r#""\ufDCf""#, Token::Str, Some("\"\u{fdcf}\""))]
2681    #[case(r#""\uFdeF""#, Token::Str, Some("\"\u{fdef}\""))]
2682    #[case(r#""\ufffd""#, Token::Str, Some("\"\u{fffd}\""))]
2683    #[case(r#""\uFFFE""#, Token::Str, Some("\"\u{fffe}\""))]
2684    #[case(r#""\uFFFF""#, Token::Str, Some("\"\u{ffff}\""))]
2685    #[case(r#""\ud800\udc00""#, Token::Str, Some("\"\u{10000}\""))] // Lowest valid surrogate pair → U+10000
2686    #[case(r#""\uD800\uDFFF""#, Token::Str, Some("\"\u{103ff}\""))] // High surrogate with highest low surrogate → U+103FF
2687    #[case(r#""\uDBFF\uDC00""#, Token::Str, Some("\"\u{10fc00}\""))] // Highest high surrogate with lowest low surrogate → U+10FC00
2688    #[case(r#""\udbFf\udfff""#, Token::Str, Some("\"\u{10ffff}\""))] // Highest valid surrogate pair → U+10FFFF (max Unicode scalar value)
2689    #[case(" ", Token::White, None)]
2690    #[case("\t", Token::White, None)]
2691    #[case("  ", Token::White, None)]
2692    #[case("\t\t", Token::White, None)]
2693    #[case(" \t \t    \t          \t\t", Token::White, None)]
2694    fn test_analyzer_single_token(
2695        #[case] input: &str,
2696        #[case] expect: Token,
2697        #[case] unescaped: Option<&str>,
2698    ) {
2699        const BUF_SIZES: [usize; 7] = [
2700            1,
2701            2,
2702            INLINE_LEN - 1,
2703            INLINE_LEN,
2704            INLINE_LEN + 1,
2705            10,
2706            Bufs::DEFAULT_BUF_SIZE,
2707        ];
2708
2709        for buf_size in BUF_SIZES {
2710            // With content fetch.
2711            {
2712                let mut an =
2713                    ReadAnalyzer::with_buf_size(io::Cursor::new(input.as_bytes()), buf_size);
2714                assert_eq!(Pos::default(), *an.pos());
2715
2716                assert_eq!(expect, an.next());
2717                assert_eq!(Pos::default(), *an.pos());
2718
2719                let content = an.content();
2720                assert_eq!(
2721                    input,
2722                    content.literal().into_string(),
2723                    "buf_size = {buf_size}, input = {input:?}, content = {content}"
2724                );
2725                assert_eq!(unescaped.is_some(), content.is_escaped());
2726                if let Some(u) = unescaped {
2727                    assert_eq!(u, content.unescaped().into_string());
2728                } else {
2729                    assert_eq!(input, content.unescaped().into_string());
2730                }
2731
2732                assert_eq!(Token::Eof, an.next());
2733                assert_eq!(
2734                    Pos {
2735                        offset: input.len(),
2736                        line: 1,
2737                        col: input.len() + 1,
2738                    },
2739                    *an.pos()
2740                );
2741
2742                assert_eq!(Token::Eof, an.next());
2743                assert_eq!(
2744                    Pos {
2745                        offset: input.len(),
2746                        line: 1,
2747                        col: input.len() + 1,
2748                    },
2749                    *an.pos()
2750                );
2751            }
2752
2753            // Without content fetch.
2754            {
2755                let mut an =
2756                    ReadAnalyzer::with_buf_size(io::Cursor::new(input.as_bytes()), buf_size);
2757                assert_eq!(Pos::default(), *an.pos());
2758
2759                assert_eq!(expect, an.next());
2760                assert_eq!(Pos::default(), *an.pos());
2761
2762                assert_eq!(Token::Eof, an.next());
2763                assert_eq!(
2764                    Pos {
2765                        offset: input.len(),
2766                        line: 1,
2767                        col: input.len() + 1,
2768                    },
2769                    *an.pos()
2770                );
2771
2772                assert_eq!(Token::Eof, an.next());
2773                assert_eq!(
2774                    Pos {
2775                        offset: input.len(),
2776                        line: 1,
2777                        col: input.len() + 1,
2778                    },
2779                    *an.pos()
2780                );
2781            }
2782        }
2783    }
2784
    #[rstest]
    #[case(r#"["#)]
    #[case(r#"]"#)]
    #[case(r#"false"#)]
    #[case(r#":"#)]
    #[case(r#"null"#)]
    #[case(r#"3.14159e+0"#)]
    #[case(r#"{"#)]
    #[case(r#"}"#)]
    #[case(r#""foo\/\u1234\/bar""#)]
    #[case(r#"true"#)]
    #[case(r#","#)]
    #[case("\n\n\n   ")]
    #[should_panic(
        expected = "no error: last `next()` did not return `Token::Err` (use `content()` instead)"
    )]
    // Verifies that `err()` panics when the last `next()` returned a
    // non-error token.
    //
    // NOTE(review): the first `an.err()` call panics (which is what satisfies
    // `should_panic`), so only BUF_SIZES[0] is actually exercised — the
    // remaining buffer sizes are unreachable. Confirm whether looping over
    // all sizes is intentional here.
    fn test_analyzer_single_token_panic_no_err(#[case] input: &str) {
        const BUF_SIZES: [usize; 7] = [
            1,
            2,
            INLINE_LEN - 1,
            INLINE_LEN,
            INLINE_LEN + 1,
            10,
            Bufs::DEFAULT_BUF_SIZE,
        ];

        for buf_size in BUF_SIZES {
            let mut an = ReadAnalyzer::with_buf_size(io::Cursor::new(input.as_bytes()), buf_size);

            let token = an.next();
            assert!(!token.is_terminal(), "input = {input:?}, token = {token:?}");

            // Must panic with the `should_panic` message above.
            let _ = an.err();
        }
    }
2821
    #[test]
    #[should_panic(expected = "last `next()` returned `Token::Err` (use `err()` instead)")]
    fn test_analyzer_single_error_panic_no_content() {
        // A bare `a` is not a valid start of any JSON token, so `next()`
        // reports an error; requesting `content()` afterwards must panic.
        let mut an = ReadAnalyzer::new("a".as_bytes());

        assert_eq!(Token::Err, an.next());

        let _ = an.content();
    }
2831
2832    #[rstest]
2833    #[case(r#""\uDC00""#, ErrorKind::BadSurrogate { first: 0xdc00, second: None, offset: 5 }, 1)]
2834    #[case(&[b'"', 0xc2, 0xc0], ErrorKind::BadUtf8ContByte { seq_len: 2, offset: 1, value: 0xc0 }, 1)]
2835    #[case(&b"\"\x80", ErrorKind::UnexpectedByte { token: Some(Token::Str), expect: Expect::StrChar, actual: 0x80 }, 1)]
2836    #[case([b'"'], ErrorKind::UnexpectedEof(Token::Str), 1)]
2837    #[case("10.", ErrorKind::UnexpectedEof(Token::Num), 3)]
2838    fn test_analyzer_single_lexical_error<T>(
2839        #[case] input: T,
2840        #[case] kind: ErrorKind,
2841        #[case] pos_offset: usize,
2842    ) where
2843        T: AsRef<[u8]> + fmt::Debug,
2844    {
2845        const BUF_SIZES: [usize; 7] = [
2846            1,
2847            2,
2848            INLINE_LEN - 1,
2849            INLINE_LEN,
2850            INLINE_LEN + 1,
2851            10,
2852            Bufs::DEFAULT_BUF_SIZE,
2853        ];
2854
2855        for buf_size in BUF_SIZES {
2856            // With error fetch.
2857            {
2858                let mut an = ReadAnalyzer::with_buf_size(input.as_ref(), buf_size);
2859                assert_eq!(Pos::default(), *an.pos());
2860
2861                assert_eq!(Token::Err, an.next());
2862                assert_eq!(Pos::default(), *an.pos());
2863
2864                let err = an.err();
2865                assert_eq!(kind, err.kind());
2866                assert_eq!(
2867                    Pos {
2868                        offset: pos_offset,
2869                        line: 1,
2870                        col: pos_offset + 1
2871                    },
2872                    *err.pos()
2873                );
2874                assert!(err.source().is_none());
2875
2876                assert_eq!(Token::Err, an.next());
2877                assert_eq!(Pos::default(), *an.pos());
2878            }
2879
2880            // Without error fetch.
2881            {
2882                let mut an = ReadAnalyzer::with_buf_size(input.as_ref(), buf_size);
2883                assert_eq!(Pos::default(), *an.pos());
2884
2885                assert_eq!(Token::Err, an.next());
2886                assert_eq!(Pos::default(), *an.pos());
2887
2888                assert_eq!(Token::Err, an.next());
2889                assert_eq!(Pos::default(), *an.pos());
2890            }
2891        }
2892    }
2893
2894    #[rstest]
2895    #[case(1, r#"{"#, [Token::ObjBegin], Pos::new(1, 1, 2), Pos::new(1, 1, 2))]
2896    #[case(1, r#"fals"#, [], Pos::default(), Pos::new(4, 1, 5))]
2897    #[case(2, r#"fals"#, [], Pos::default(), Pos::new(4, 1, 5))]
2898    #[case(Bufs::DEFAULT_BUF_SIZE, r#"fals"#, [], Pos::default(), Pos::new(4, 1, 5))]
2899    #[case(1, r#"[3.141592653589793238462643383279"#, [Token::ArrBegin], Pos::new(1, 1, 2), Pos::new(33, 1, 34))]
2900    #[case(2, r#"[3.141592653589793238462643383279"#, [Token::ArrBegin], Pos::new(1, 1, 2), Pos::new(33, 1, 34))]
2901    #[case(1, r#"[3.141592653589793238462643383279,"#, [Token::ArrBegin, Token::Num, Token::ValueSep], Pos::new(34, 1, 35), Pos::new(34, 1, 35))]
2902    #[case(2, r#"[3.141592653589793238462643383279,"#, [Token::ArrBegin, Token::Num, Token::ValueSep], Pos::new(34, 1, 35), Pos::new(34, 1, 35))]
2903    #[case(INLINE_LEN-1, r#"[314.1592653589793238462643383279e-2"#, [Token::ArrBegin], Pos::new(1, 1, 2), Pos::new(36, 1, 37))]
2904    #[case(INLINE_LEN-1, r#"[314.1592653589793238462643383279e-2 :"#, [Token::ArrBegin, Token::Num, Token::White, Token::NameSep], Pos::new(38, 1, 39), Pos::new(38, 1, 39))]
2905    #[case(INLINE_LEN, r#"[314.1592653589793238462643383279e-2"#, [Token::ArrBegin], Pos::new(1, 1, 2), Pos::new(36, 1, 37))]
2906    #[case(INLINE_LEN, r#"[314.1592653589793238462643383279e-2 :"#, [Token::ArrBegin, Token::Num, Token::White, Token::NameSep], Pos::new(38, 1, 39), Pos::new(38, 1, 39))]
2907    #[case(INLINE_LEN+1, r#"[314.1592653589793238462643383279e-2"#, [Token::ArrBegin], Pos::new(1, 1, 2), Pos::new(36, 1, 37))]
2908    #[case(INLINE_LEN+1, r#"[314.1592653589793238462643383279E+999 :"#, [Token::ArrBegin, Token::Num, Token::White, Token::NameSep], Pos::new(40, 1, 41), Pos::new(40, 1, 41))]
2909    #[case(Bufs::DEFAULT_BUF_SIZE, r#"[3141.592653589793238462643383279e-3,{"aaaaaaaaaaaaaaaaaaaaaaaaaaaa":true}]    "#, [Token::ArrBegin, Token::Num, Token::ValueSep, Token::ObjBegin, Token::Str, Token::NameSep, Token::LitTrue,  Token::ObjEnd, Token::ArrEnd], Pos::new(75, 1, 76), Pos::new(79, 1, 80))]
2910    fn test_analyzer_single_read_error<T>(
2911        #[case] buf_size: usize,
2912        #[case] input: &str,
2913        #[case] expect_tokens: T,
2914        #[case] expect_token_pos: Pos,
2915        #[case] expect_err_pos: Pos,
2916    ) where
2917        T: IntoIterator<Item = Token>,
2918    {
2919        struct ErrorRead<'a>(&'a [u8]);
2920
2921        impl<'a> Read for ErrorRead<'a> {
2922            fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
2923                let n = min(buf.len(), self.0.len());
2924                if n == 0 {
2925                    Err(io::Error::new(io::ErrorKind::Other, "snafu"))
2926                } else {
2927                    buf[..n].copy_from_slice(&self.0[..n]);
2928                    self.0 = &self.0[n..];
2929
2930                    Ok(n)
2931                }
2932            }
2933        }
2934
2935        let reader = ErrorRead(input.as_bytes());
2936        let mut an = ReadAnalyzer::with_buf_size(reader, buf_size);
2937
2938        for expect_token in expect_tokens.into_iter() {
2939            let actual_token = an.next();
2940
2941            assert_eq!(expect_token, actual_token);
2942        }
2943
2944        assert_eq!(Token::Err, an.next());
2945        assert_eq!(expect_token_pos, *an.pos());
2946        let err = an.err();
2947        assert_eq!(ErrorKind::Read, err.kind());
2948        assert_eq!(expect_err_pos, *err.pos());
2949
2950        assert_eq!(Token::Err, an.next());
2951        assert_eq!(expect_token_pos, *an.pos());
2952        let err = an.try_content().unwrap_err();
2953        assert_eq!(ErrorKind::Read, err.kind());
2954        assert_eq!(expect_err_pos, *err.pos());
2955        assert_eq!(
2956            io::ErrorKind::Other,
2957            err.source()
2958                .and_then(|e| e.downcast_ref::<io::Error>())
2959                .map(|e| e.kind())
2960                .unwrap(),
2961        );
2962        assert_eq!("snafu", format!("{}", err.source().unwrap()));
2963
2964        assert_eq!(Token::Err, an.next());
2965    }
2966
2967    #[rstest]
2968    #[case(1)]
2969    #[case(2)]
2970    #[case(INLINE_LEN - 1)]
2971    #[case(INLINE_LEN)]
2972    #[case(INLINE_LEN + 1)]
2973    #[case(Bufs::DEFAULT_BUF_SIZE)]
2974    fn test_analyzer_into_parser(#[case] buf_size: usize) {
2975        let input = r#"{"hello":["🌍"]}"#;
2976        let mut parser = ReadAnalyzer::with_buf_size(input.as_bytes(), buf_size).into_parser();
2977
2978        assert_eq!(Token::ObjBegin, parser.next());
2979        assert_eq!("{", parser.content().literal());
2980        assert_eq!(Pos::default(), *parser.pos());
2981        assert_eq!(1, parser.level());
2982
2983        assert_eq!(Token::Str, parser.next());
2984        assert_eq!(r#""hello""#, parser.content().literal());
2985        assert_eq!(Pos::new(1, 1, 2), *parser.pos());
2986        assert_eq!(1, parser.level());
2987
2988        assert_eq!(Token::NameSep, parser.next());
2989        assert_eq!(":", parser.content().literal());
2990        assert_eq!(Pos::new(8, 1, 9), *parser.pos());
2991        assert_eq!(1, parser.level());
2992
2993        assert_eq!(Token::ArrBegin, parser.next());
2994        assert_eq!("[", parser.content().literal());
2995        assert_eq!(Pos::new(9, 1, 10), *parser.pos());
2996        assert_eq!(2, parser.level());
2997
2998        assert_eq!(Token::Str, parser.next());
2999        assert_eq!(r#""🌍""#, parser.content().literal());
3000        assert_eq!(Pos::new(10, 1, 11), *parser.pos());
3001        assert_eq!(2, parser.level());
3002
3003        assert_eq!(Token::ArrEnd, parser.next());
3004        assert_eq!("]", parser.content().literal());
3005        assert_eq!(Pos::new(16, 1, 14), *parser.pos());
3006        assert_eq!(1, parser.level());
3007
3008        assert_eq!(Token::ObjEnd, parser.next());
3009        assert_eq!("}", parser.content().literal());
3010        assert_eq!(Pos::new(17, 1, 15), *parser.pos());
3011        assert_eq!(0, parser.level());
3012
3013        for _ in 0..5 {
3014            assert_eq!(Token::Eof, parser.next());
3015            assert_eq!(Pos::new(18, 1, 16), *parser.pos());
3016            assert_eq!(0, parser.level());
3017        }
3018    }
3019
    #[rstest]
    #[case(1)]
    #[case(2)]
    #[case(INLINE_LEN - 1)]
    #[case(INLINE_LEN)]
    #[case(INLINE_LEN + 1)]
    #[case(Bufs::DEFAULT_BUF_SIZE)]
    fn test_analyzer_smoke(#[case] buf_size: usize) {
        // End-to-end sweep over a document mixing every token kind, multi-line
        // whitespace, `\u` escapes, and multi-byte UTF-8, checked against an
        // exact (token, position, literal, unescaped) table for several buffer
        // sizes around the inline-content boundary.
        const JSON_TEXT: &str = r#"

[
  [],
  {},
  [true, false, null, "foo",-9, -9.9, -99.99e-99, {"❤️😊":1}, 10000000],
  "\u0068\u0065\u006c\u006c\u006f\u002c\u0020\u0077\u006f\u0072\u006c\u0064",
  "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt.\nUt labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco.\nLaboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in."
]"#;

        // Each entry: (expected token, position of the token's first byte,
        // literal text, unescaped text when the literal contains escapes).
        const EXPECT: &[(Token, Pos, &str, Option<&str>)] = &[
            // Line 1.
            (Token::White, Pos::new(0, 1, 1), "\n\n", None),
            // Line 3.
            (Token::ArrBegin, Pos::new(2, 3, 1), "[", None),
            (Token::White, Pos::new(3, 3, 2), "\n  ", None),
            // Line 4.
            (Token::ArrBegin, Pos::new(6, 4, 3), "[", None),
            (Token::ArrEnd, Pos::new(7, 4, 4), "]", None),
            (Token::ValueSep, Pos::new(8, 4, 5), ",", None),
            (Token::White, Pos::new(9, 4, 6), "\n  ", None),
            // Line 5.
            (Token::ObjBegin, Pos::new(12, 5, 3), "{", None),
            (Token::ObjEnd, Pos::new(13, 5, 4), "}", None),
            (Token::ValueSep, Pos::new(14, 5, 5), ",", None),
            (Token::White, Pos::new(15, 5, 6), "\n  ", None),
            // Line 6.
            (Token::ArrBegin, Pos::new(18, 6, 3), "[", None),
            (Token::LitTrue, Pos::new(19, 6, 4), "true", None),
            (Token::ValueSep, Pos::new(23, 6, 8), ",", None),
            (Token::White, Pos::new(24, 6, 9), " ", None),
            (Token::LitFalse, Pos::new(25, 6, 10), "false", None),
            (Token::ValueSep, Pos::new(30, 6, 15), ",", None),
            (Token::White, Pos::new(31, 6, 16), " ", None),
            (Token::LitNull, Pos::new(32, 6, 17), "null", None),
            (Token::ValueSep, Pos::new(36, 6, 21), ",", None),
            (Token::White, Pos::new(37, 6, 22), " ", None),
            (Token::Str, Pos::new(38, 6, 23), r#""foo""#, None),
            (Token::ValueSep, Pos::new(43, 6, 28), ",", None),
            (Token::Num, Pos::new(44, 6, 29), "-9", None),
            (Token::ValueSep, Pos::new(46, 6, 31), ",", None),
            (Token::White, Pos::new(47, 6, 32), " ", None),
            (Token::Num, Pos::new(48, 6, 33), "-9.9", None),
            (Token::ValueSep, Pos::new(52, 6, 37), ",", None),
            (Token::White, Pos::new(53, 6, 38), " ", None),
            (Token::Num, Pos::new(54, 6, 39), "-99.99e-99", None),
            (Token::ValueSep, Pos::new(64, 6, 49), ",", None),
            (Token::White, Pos::new(65, 6, 50), " ", None),
            (Token::ObjBegin, Pos::new(66, 6, 51), "{", None),
            // Multi-byte UTF-8: offsets advance by byte count, columns by chars.
            (Token::Str, Pos::new(67, 6, 52), r#""❤️😊""#, None),
            (Token::NameSep, Pos::new(79, 6, 57), ":", None),
            (Token::Num, Pos::new(80, 6, 58), "1", None),
            (Token::ObjEnd, Pos::new(81, 6, 59), "}", None),
            (Token::ValueSep, Pos::new(82, 6, 60), ",", None),
            (Token::White, Pos::new(83, 6, 61), " ", None),
            (Token::Num, Pos::new(84, 6, 62), "10000000", None),
            (Token::ArrEnd, Pos::new(92, 6, 70), "]", None),
            (Token::ValueSep, Pos::new(93, 6, 71), ",", None),
            (Token::White, Pos::new(94, 6, 72), "\n  ", None),
            // Line 7.
            (
                Token::Str,
                Pos::new(97, 7, 3),
                r#""\u0068\u0065\u006c\u006c\u006f\u002c\u0020\u0077\u006f\u0072\u006c\u0064""#,
                Some(r#""hello, world""#),
            ),
            (Token::ValueSep, Pos::new(171, 7, 77), ",", None),
            (Token::White, Pos::new(172, 7, 78), "\n  ", None),
            // Line 8.
            (
                Token::Str,
                Pos::new(175, 8, 3),
                concat!(
                    r#""Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt.\n"#,
                    r#"Ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco.\n"#,
                    r#"Laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in.""#,
                ),
                Some(concat!(
                    "\"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt.\n",
                    "Ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco.\n",
                    "Laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in.\"",
                )),
            ),
            // Line 9.
            (Token::White, Pos::new(455, 8, 283), "\n", None),
            (Token::ArrEnd, Pos::new(456, 9, 1), "]", None),
            (Token::Eof, Pos::new(457, 9, 2), "", None),
        ];

        let mut an = ReadAnalyzer::with_buf_size(JSON_TEXT.as_bytes(), buf_size);

        for (i, (expect_token, expect_pos, expect_literal, expect_unescaped)) in
            EXPECT.iter().enumerate()
        {
            let actual_token = an.next();
            let actual_pos = *an.pos();
            let content = an.content();

            assert_eq!(
                *expect_token, actual_token,
                "i = {i}, actual_pos = {actual_pos}, expect_pos = {expect_pos}"
            );
            assert_eq!(
                *expect_pos, actual_pos,
                "i = {i}, token = {actual_token}, content = {content}"
            );
            assert_eq!(
                *expect_literal,
                content.literal(),
                "i = {i}, token = {actual_token}, expect_literal = {expect_literal:?}, content.literal() = {}",
                content.literal(),
            );
            // `is_escaped()` and `unescaped()` must agree with whether the
            // table supplies an unescaped form for this token.
            if let Some(u) = expect_unescaped {
                assert!(
                    content.is_escaped(),
                    "i = {i}, token = {actual_token}, literal = {expect_literal:?}"
                );
                assert_eq!(*u, content.unescaped());
            } else {
                assert!(
                    !content.is_escaped(),
                    "i = {i}, token = {actual_token}, literal = {expect_literal:?}"
                );
                assert_eq!(*expect_literal, content.unescaped());
            }
        }
    }
3155
3156    #[rstest]
3157    #[case(29)]
3158    #[case(30)]
3159    #[case(31)]
3160    fn test_analyzer_replace_buf(#[case] buf_size: usize) {
3161        // The purpose of this test is to cover the code branch in which there is a single "maybe
3162        // free" buffer, but when it gets taken off the "maybe free" list, it turns out to still
3163        // have active references, so it gets put back.
3164        //
3165        // The structure of the test is as follows:
3166        //     - The buffer size is ~30.
3167        //     - A string token that fits within the first buffer is read and its content is held,
3168        //       keeping a reference to the first buffer alive.
3169        //     - Once all tokens from the first buffer have been read, it goes on the "maybe free"
3170        //       list.
3171        //     - Tokens are read until the second buffer is filled and a new one is allocated, which
3172        //       causes the first buffer to come off the "maybe free" list, checked, and replaced
3173        //       because it still has active references. A third buffer is then allocated.
3174        //     - The string token is now dropped, freeing excess references to the first buffer.
3175        //     - Tokens are read until the third buffer is filled. A fourth buffer is needed, and
3176        //       at this point the first buffer can be reused.
3177        //
3178        // In the input string below, the commas and trailing `]` occur on the 30, 60, and 90 byte
3179        // boundaries.
3180
3181        const INPUT: &str = r#"["_________xxxxxxxxxx_______",             true            ,1.000000001111111111000000000] null"#;
3182        let mut an = ReadAnalyzer::with_buf_size(INPUT.as_bytes(), buf_size);
3183
3184        // Read tokens from the first buffer.
3185        assert_eq!(Token::ArrBegin, an.next());
3186        assert_eq!(Token::Str, an.next());
3187        let str_content = an.content();
3188        assert_eq!(r#""_________xxxxxxxxxx_______""#, str_content.literal());
3189        assert_eq!(Token::ValueSep, an.next());
3190
3191        // Read tokens from the second buffer.
3192        assert_eq!(Token::White, an.next());
3193        assert_eq!(Token::LitTrue, an.next());
3194        assert_eq!(Token::White, an.next());
3195        assert_eq!(Token::ValueSep, an.next());
3196
3197        // Start reading tokens from the third buffer.
3198        assert_eq!(Token::Num, an.next());
3199
3200        // Drop the string content, which held a reference to the first buffer, allowing the first
3201        // buffer to be reused.
3202        drop(str_content);
3203
3204        // Finish reading tokens from the third buffer.
3205        assert_eq!(Token::ArrEnd, an.next());
3206
3207        // Read tokens from the fourth buffer.
3208        assert_eq!(Token::White, an.next());
3209        assert_eq!(Token::LitNull, an.next());
3210        assert_eq!(Token::Eof, an.next());
3211    }
3212
3213    trait IntoString {
3214        fn into_string(self) -> String;
3215    }
3216
3217    impl<T: IntoBuf> IntoString for T {
3218        fn into_string(self) -> String {
3219            let mut src = self.into_buf();
3220            let mut dst = Vec::with_capacity(src.remaining());
3221            while src.remaining() > 0 {
3222                let chunk = src.chunk();
3223                dst.extend_from_slice(chunk);
3224                src.advance(chunk.len());
3225            }
3226
3227            String::from_utf8(dst).expect("valid UTF-8")
3228        }
3229    }
3230}