facet_json/
scanner.rs

//! Low-level JSON scanner that finds token boundaries without materializing strings.
//!
//! The scanner's job is to identify where tokens are in a buffer, not to interpret them.
//! String content is returned as indices + a `has_escapes` flag. The deserializer
//! decides whether to decode escapes based on the target type.
//!
//! This design enables:
//! - Zero-copy borrowed strings (when no escapes)
//! - Streaming from `std::io::Read` with buffer refills
//! - Skipping values without allocation (RawJson, unknown fields)
11
12use core::str;
13
14use facet_reflect::Span;
15
16/// Token kinds with minimal data - strings/numbers are just indices into the buffer.
17#[derive(Debug, Clone, PartialEq)]
18pub enum Token {
19    /// `{`
20    ObjectStart,
21    /// `}`
22    ObjectEnd,
23    /// `[`
24    ArrayStart,
25    /// `]`
26    ArrayEnd,
27    /// `:`
28    Colon,
29    /// `,`
30    Comma,
31    /// `null`
32    Null,
33    /// `true`
34    True,
35    /// `false`
36    False,
37    /// A string literal - indices point to content (excluding quotes)
38    String {
39        /// Start index of string content (after opening quote)
40        start: usize,
41        /// End index of string content (before closing quote)
42        end: usize,
43        /// True if the string contains escape sequences that need processing
44        has_escapes: bool,
45    },
46    /// A number literal - indices point to the raw number text
47    Number {
48        /// Start index of number
49        start: usize,
50        /// End index of number
51        end: usize,
52        /// Hint about number format
53        hint: NumberHint,
54    },
55    /// End of input reached
56    Eof,
57    /// Buffer exhausted mid-token - need refill for streaming
58    NeedMore {
59        /// How many bytes were consumed before hitting the boundary
60        consumed: usize,
61    },
62}
63
/// Coarse classification of a number literal, recorded while scanning so the
/// parser knows which numeric representation to try.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum NumberHint {
    /// Plain digits: no leading `-`, no fraction, no exponent
    Unsigned,
    /// Leading `-`, but no fraction and no exponent
    Signed,
    /// Contains a `.` fraction and/or an `e`/`E` exponent
    Float,
}
74
75/// Spanned token with location information
76#[derive(Debug, Clone, PartialEq)]
77pub struct SpannedToken {
78    /// The token
79    pub token: Token,
80    /// Source span
81    pub span: Span,
82}
83
84/// Scanner error
85#[derive(Debug, Clone, PartialEq)]
86pub struct ScanError {
87    /// The error kind
88    pub kind: ScanErrorKind,
89    /// Source span
90    pub span: Span,
91}
92
/// The category of a [`ScanError`].
#[derive(Debug, Clone, PartialEq)]
pub enum ScanErrorKind {
    /// A character that is not allowed at this point of the input
    UnexpectedChar(char),
    /// The input ended too early; the payload says what was being scanned
    UnexpectedEof(&'static str),
    /// Bytes that do not form valid UTF-8 (or an invalid surrogate escape)
    InvalidUtf8,
}
103
104/// Result type for scanner operations
105pub type ScanResult = Result<SpannedToken, ScanError>;
106
107/// JSON scanner state machine.
108///
109/// The scanner operates on a byte buffer and tracks position. For streaming,
110/// the buffer can be refilled when `Token::NeedMore` is returned.
111pub struct Scanner {
112    /// Current position in the buffer
113    pos: usize,
114    /// State for resuming after NeedMore (for streaming)
115    state: ScanState,
116}
117
118/// Internal state for resuming mid-token after buffer refill
119#[derive(Debug, Clone, Default)]
120enum ScanState {
121    #[default]
122    Ready,
123    /// In the middle of scanning a string
124    InString {
125        start: usize,
126        has_escapes: bool,
127        escape_next: bool,
128    },
129    /// In the middle of scanning a number
130    InNumber { start: usize, hint: NumberHint },
131    /// In the middle of scanning a literal (true/false/null)
132    InLiteral {
133        start: usize,
134        expected: &'static [u8],
135        matched: usize,
136    },
137}
138
139impl Scanner {
140    /// Create a new scanner starting at position 0
141    pub fn new() -> Self {
142        Self {
143            pos: 0,
144            state: ScanState::Ready,
145        }
146    }
147
148    /// Create a scanner starting at a specific position
149    pub fn at_position(pos: usize) -> Self {
150        Self {
151            pos,
152            state: ScanState::Ready,
153        }
154    }
155
156    /// Current position in the buffer
157    pub fn pos(&self) -> usize {
158        self.pos
159    }
160
161    /// Set position (used after buffer operations)
162    pub fn set_pos(&mut self, pos: usize) {
163        self.pos = pos;
164    }
165
166    /// Finalize any pending token at true EOF.
167    ///
168    /// Call this when the scanner returned `NeedMore` but no more data is available.
169    /// Returns the completed token if one is pending (e.g., a number at EOF),
170    /// or an error if the token is incomplete (e.g., unterminated string).
171    pub fn finalize_at_eof(&mut self, buf: &[u8]) -> ScanResult {
172        match core::mem::take(&mut self.state) {
173            ScanState::Ready => {
174                // Nothing pending
175                Ok(SpannedToken {
176                    token: Token::Eof,
177                    span: Span::new(self.pos, 0),
178                })
179            }
180            ScanState::InNumber { start, hint } => {
181                // Number is complete at EOF (numbers don't need closing delimiter)
182                let end = self.pos;
183                if end == start || (end == start + 1 && buf.get(start) == Some(&b'-')) {
184                    return Err(ScanError {
185                        kind: ScanErrorKind::UnexpectedEof("in number"),
186                        span: Span::new(start, end - start),
187                    });
188                }
189                Ok(SpannedToken {
190                    token: Token::Number { start, end, hint },
191                    span: Span::new(start, end - start),
192                })
193            }
194            ScanState::InString { start, .. } => {
195                // Unterminated string
196                Err(ScanError {
197                    kind: ScanErrorKind::UnexpectedEof("in string"),
198                    span: Span::new(start, self.pos - start),
199                })
200            }
201            ScanState::InLiteral {
202                start,
203                expected,
204                matched,
205            } => {
206                // Check if the literal is complete
207                if matched == expected.len() {
208                    let token = match expected {
209                        b"true" => Token::True,
210                        b"false" => Token::False,
211                        b"null" => Token::Null,
212                        _ => unreachable!(),
213                    };
214                    Ok(SpannedToken {
215                        token,
216                        span: Span::new(start, expected.len()),
217                    })
218                } else {
219                    Err(ScanError {
220                        kind: ScanErrorKind::UnexpectedEof("in literal"),
221                        span: Span::new(start, self.pos - start),
222                    })
223                }
224            }
225        }
226    }
227
228    /// Scan the next token from the buffer.
229    ///
230    /// Returns `Token::NeedMore` if the buffer is exhausted mid-token,
231    /// allowing the caller to refill and retry.
232    pub fn next_token(&mut self, buf: &[u8]) -> ScanResult {
233        // If we have pending state from a previous NeedMore, resume
234        match core::mem::take(&mut self.state) {
235            ScanState::Ready => {}
236            ScanState::InString {
237                start,
238                has_escapes,
239                escape_next,
240            } => {
241                return self.resume_string(buf, start, has_escapes, escape_next);
242            }
243            ScanState::InNumber { start, hint } => {
244                return self.resume_number(buf, start, hint);
245            }
246            ScanState::InLiteral {
247                start,
248                expected,
249                matched,
250            } => {
251                return self.resume_literal(buf, start, expected, matched);
252            }
253        }
254
255        self.skip_whitespace(buf);
256
257        let start = self.pos;
258        let Some(&byte) = buf.get(self.pos) else {
259            return Ok(SpannedToken {
260                token: Token::Eof,
261                span: Span::new(self.pos, 0),
262            });
263        };
264
265        match byte {
266            b'{' => {
267                self.pos += 1;
268                Ok(SpannedToken {
269                    token: Token::ObjectStart,
270                    span: Span::new(start, 1),
271                })
272            }
273            b'}' => {
274                self.pos += 1;
275                Ok(SpannedToken {
276                    token: Token::ObjectEnd,
277                    span: Span::new(start, 1),
278                })
279            }
280            b'[' => {
281                self.pos += 1;
282                Ok(SpannedToken {
283                    token: Token::ArrayStart,
284                    span: Span::new(start, 1),
285                })
286            }
287            b']' => {
288                self.pos += 1;
289                Ok(SpannedToken {
290                    token: Token::ArrayEnd,
291                    span: Span::new(start, 1),
292                })
293            }
294            b':' => {
295                self.pos += 1;
296                Ok(SpannedToken {
297                    token: Token::Colon,
298                    span: Span::new(start, 1),
299                })
300            }
301            b',' => {
302                self.pos += 1;
303                Ok(SpannedToken {
304                    token: Token::Comma,
305                    span: Span::new(start, 1),
306                })
307            }
308            b'"' => self.scan_string(buf, start),
309            b'-' | b'0'..=b'9' => self.scan_number(buf, start),
310            b't' => self.scan_literal(buf, start, b"true", Token::True),
311            b'f' => self.scan_literal(buf, start, b"false", Token::False),
312            b'n' => self.scan_literal(buf, start, b"null", Token::Null),
313            _ => Err(ScanError {
314                kind: ScanErrorKind::UnexpectedChar(byte as char),
315                span: Span::new(start, 1),
316            }),
317        }
318    }
319
320    fn skip_whitespace(&mut self, buf: &[u8]) {
321        while let Some(&b) = buf.get(self.pos) {
322            match b {
323                b' ' | b'\t' | b'\n' | b'\r' => self.pos += 1,
324                _ => break,
325            }
326        }
327    }
328
329    /// Scan a string, finding its boundaries and noting if it has escapes.
330    fn scan_string(&mut self, buf: &[u8], start: usize) -> ScanResult {
331        // Skip opening quote
332        self.pos += 1;
333        let content_start = self.pos;
334
335        self.scan_string_content(buf, start, content_start, false, false)
336    }
337
338    fn resume_string(
339        &mut self,
340        buf: &[u8],
341        start: usize,
342        has_escapes: bool,
343        escape_next: bool,
344    ) -> ScanResult {
345        let content_start = start + 1; // After opening quote
346        self.scan_string_content(buf, start, content_start, has_escapes, escape_next)
347    }
348
349    fn scan_string_content(
350        &mut self,
351        buf: &[u8],
352        start: usize,
353        content_start: usize,
354        mut has_escapes: bool,
355        mut escape_next: bool,
356    ) -> ScanResult {
357        // SIMD-friendly fast path: scan 16 bytes at a time looking for quotes/backslashes
358        const STEP_SIZE: usize = 16;
359        type Window = u128;
360        type Chunk = [u8; STEP_SIZE];
361
362        // SIMD fast path: only if we're not in escape mode
363        if !escape_next {
364            loop {
365                if let Some(Ok(chunk)) = buf
366                    .get(self.pos..)
367                    .and_then(|s| s.get(..STEP_SIZE))
368                    .map(Chunk::try_from)
369                {
370                    let window = Window::from_ne_bytes(chunk);
371                    let has_quote = contains_byte(window, b'"');
372                    let has_backslash = contains_byte(window, b'\\');
373
374                    if !has_quote && !has_backslash {
375                        // Fast path: no special chars in this chunk
376                        self.pos += STEP_SIZE;
377                        continue;
378                    }
379                }
380                // Fall through to byte-by-byte scanning
381                break;
382            }
383        }
384
385        // Byte-by-byte scanning
386        while let Some(&byte) = buf.get(self.pos) {
387            if escape_next {
388                // Previous char was backslash, skip this char
389                escape_next = false;
390                self.pos += 1;
391
392                // Handle \uXXXX - need to skip 4 more hex digits
393                if byte == b'u' {
394                    // Check if we have 4 more bytes
395                    if self.pos + 4 > buf.len() {
396                        // Need more data
397                        self.state = ScanState::InString {
398                            start,
399                            has_escapes: true,
400                            escape_next: false,
401                        };
402                        return Ok(SpannedToken {
403                            token: Token::NeedMore { consumed: start },
404                            span: Span::new(start, self.pos - start),
405                        });
406                    }
407                    self.pos += 4;
408
409                    // Check for surrogate pair (\uXXXX\uXXXX)
410                    if self.pos + 2 <= buf.len()
411                        && buf.get(self.pos) == Some(&b'\\')
412                        && buf.get(self.pos + 1) == Some(&b'u')
413                    {
414                        if self.pos + 6 > buf.len() {
415                            // Need more data for second surrogate
416                            self.state = ScanState::InString {
417                                start,
418                                has_escapes: true,
419                                escape_next: false,
420                            };
421                            return Ok(SpannedToken {
422                                token: Token::NeedMore { consumed: start },
423                                span: Span::new(start, self.pos - start),
424                            });
425                        }
426                        // Skip \uXXXX
427                        self.pos += 6;
428                    }
429                }
430                continue;
431            }
432
433            match byte {
434                b'"' => {
435                    // Found closing quote
436                    let content_end = self.pos;
437                    self.pos += 1; // Skip closing quote
438
439                    return Ok(SpannedToken {
440                        token: Token::String {
441                            start: content_start,
442                            end: content_end,
443                            has_escapes,
444                        },
445                        span: Span::new(start, self.pos - start),
446                    });
447                }
448                b'\\' => {
449                    has_escapes = true;
450                    escape_next = true;
451                    self.pos += 1;
452                }
453                _ => {
454                    self.pos += 1;
455                }
456            }
457        }
458
459        // Reached end of buffer without closing quote
460        if escape_next || self.pos > start {
461            // Mid-string, need more data
462            self.state = ScanState::InString {
463                start,
464                has_escapes,
465                escape_next,
466            };
467            Ok(SpannedToken {
468                token: Token::NeedMore { consumed: start },
469                span: Span::new(start, self.pos - start),
470            })
471        } else {
472            Err(ScanError {
473                kind: ScanErrorKind::UnexpectedEof("in string"),
474                span: Span::new(start, self.pos - start),
475            })
476        }
477    }
478
479    /// Scan a number, finding its boundaries and determining its type hint.
480    fn scan_number(&mut self, buf: &[u8], start: usize) -> ScanResult {
481        let mut hint = NumberHint::Unsigned;
482
483        if buf.get(self.pos) == Some(&b'-') {
484            hint = NumberHint::Signed;
485            self.pos += 1;
486        }
487
488        self.scan_number_content(buf, start, hint)
489    }
490
491    fn resume_number(&mut self, buf: &[u8], start: usize, hint: NumberHint) -> ScanResult {
492        self.scan_number_content(buf, start, hint)
493    }
494
495    fn scan_number_content(
496        &mut self,
497        buf: &[u8],
498        start: usize,
499        mut hint: NumberHint,
500    ) -> ScanResult {
501        // Integer part
502        while let Some(&b) = buf.get(self.pos) {
503            if b.is_ascii_digit() {
504                self.pos += 1;
505            } else {
506                break;
507            }
508        }
509
510        // Check for decimal part
511        if buf.get(self.pos) == Some(&b'.') {
512            hint = NumberHint::Float;
513            self.pos += 1;
514
515            // Fractional digits
516            while let Some(&b) = buf.get(self.pos) {
517                if b.is_ascii_digit() {
518                    self.pos += 1;
519                } else {
520                    break;
521                }
522            }
523        }
524
525        // Check for exponent
526        if matches!(buf.get(self.pos), Some(b'e') | Some(b'E')) {
527            hint = NumberHint::Float;
528            self.pos += 1;
529
530            // Optional sign
531            if matches!(buf.get(self.pos), Some(b'+') | Some(b'-')) {
532                self.pos += 1;
533            }
534
535            // Exponent digits
536            while let Some(&b) = buf.get(self.pos) {
537                if b.is_ascii_digit() {
538                    self.pos += 1;
539                } else {
540                    break;
541                }
542            }
543        }
544
545        // Check if we're at end of buffer - might need more data
546        // Numbers end at whitespace, punctuation, or true EOF
547        if self.pos == buf.len() {
548            // At end of buffer - need more data to see terminator
549            self.state = ScanState::InNumber { start, hint };
550            return Ok(SpannedToken {
551                token: Token::NeedMore { consumed: start },
552                span: Span::new(start, self.pos - start),
553            });
554        }
555
556        let end = self.pos;
557
558        // Validate we actually parsed something
559        if end == start || (end == start + 1 && buf.get(start) == Some(&b'-')) {
560            return Err(ScanError {
561                kind: ScanErrorKind::UnexpectedChar(
562                    buf.get(self.pos).map(|&b| b as char).unwrap_or('?'),
563                ),
564                span: Span::new(start, 1),
565            });
566        }
567
568        Ok(SpannedToken {
569            token: Token::Number { start, end, hint },
570            span: Span::new(start, end - start),
571        })
572    }
573
574    /// Scan a literal keyword (true, false, null)
575    fn scan_literal(
576        &mut self,
577        buf: &[u8],
578        start: usize,
579        expected: &'static [u8],
580        token: Token,
581    ) -> ScanResult {
582        self.scan_literal_content(buf, start, expected, 0, token)
583    }
584
585    fn resume_literal(
586        &mut self,
587        buf: &[u8],
588        start: usize,
589        expected: &'static [u8],
590        matched: usize,
591    ) -> ScanResult {
592        let token = match expected {
593            b"true" => Token::True,
594            b"false" => Token::False,
595            b"null" => Token::Null,
596            _ => unreachable!(),
597        };
598        self.scan_literal_content(buf, start, expected, matched, token)
599    }
600
601    fn scan_literal_content(
602        &mut self,
603        buf: &[u8],
604        start: usize,
605        expected: &'static [u8],
606        mut matched: usize,
607        token: Token,
608    ) -> ScanResult {
609        while matched < expected.len() {
610            match buf.get(self.pos) {
611                Some(&b) if b == expected[matched] => {
612                    self.pos += 1;
613                    matched += 1;
614                }
615                Some(&b) => {
616                    return Err(ScanError {
617                        kind: ScanErrorKind::UnexpectedChar(b as char),
618                        span: Span::new(self.pos, 1),
619                    });
620                }
621                None => {
622                    // Need more data
623                    self.state = ScanState::InLiteral {
624                        start,
625                        expected,
626                        matched,
627                    };
628                    return Ok(SpannedToken {
629                        token: Token::NeedMore { consumed: start },
630                        span: Span::new(start, self.pos - start),
631                    });
632                }
633            }
634        }
635
636        Ok(SpannedToken {
637            token,
638            span: Span::new(start, expected.len()),
639        })
640    }
641}
642
643impl Default for Scanner {
644    fn default() -> Self {
645        Self::new()
646    }
647}
648
/// Report whether any of the 16 bytes packed into `window` equals `byte`.
///
/// Branch-free: XOR with `byte` repeated in every lane turns matching lanes
/// into zero, then the classic "has a zero byte" bit trick detects them.
#[inline]
fn contains_byte(window: u128, byte: u8) -> bool {
    /// `0x01` in every byte lane.
    const LOW_BITS: u128 = u128::MAX / 0xFF;
    /// `0x80` in every byte lane.
    const HIGH_BITS: u128 = LOW_BITS << 7;

    // Multiplying by `0x0101…01` replicates `byte` into all lanes.
    let diff = window ^ (LOW_BITS * u128::from(byte));
    // A lane of `diff` is zero exactly where `window` held `byte`; the
    // expression below is non-zero iff at least one lane is zero.
    (diff.wrapping_sub(LOW_BITS) & !diff & HIGH_BITS) != 0
}
659
// =============================================================================
// String decoding utilities (second pass)
// =============================================================================
663
664/// Decode a JSON string from a buffer, handling escape sequences.
665///
666/// This is the "second pass" - only called when the deserializer actually needs
667/// the string content. For borrowed strings without escapes, use `decode_string_borrowed`.
668///
669/// # Arguments
670/// * `buf` - The buffer containing the string
671/// * `start` - Start index (after opening quote)
672/// * `end` - End index (before closing quote)
673///
674/// # Returns
675/// The decoded string, or an error if the string contains invalid escapes.
676pub fn decode_string_owned(
677    buf: &[u8],
678    start: usize,
679    end: usize,
680) -> Result<alloc::string::String, ScanError> {
681    use alloc::string::String;
682
683    let slice = &buf[start..end];
684    let mut result = String::with_capacity(end - start);
685    let mut i = 0;
686
687    while i < slice.len() {
688        let byte = slice[i];
689        if byte == b'\\' {
690            i += 1;
691            if i >= slice.len() {
692                return Err(ScanError {
693                    kind: ScanErrorKind::UnexpectedEof("in escape sequence"),
694                    span: Span::new(start + i - 1, 1),
695                });
696            }
697
698            match slice[i] {
699                b'"' => result.push('"'),
700                b'\\' => result.push('\\'),
701                b'/' => result.push('/'),
702                b'b' => result.push('\x08'),
703                b'f' => result.push('\x0c'),
704                b'n' => result.push('\n'),
705                b'r' => result.push('\r'),
706                b't' => result.push('\t'),
707                b'u' => {
708                    i += 1;
709                    if i + 4 > slice.len() {
710                        return Err(ScanError {
711                            kind: ScanErrorKind::UnexpectedEof("in unicode escape"),
712                            span: Span::new(start + i - 2, slice.len() - i + 2),
713                        });
714                    }
715
716                    let hex = &slice[i..i + 4];
717                    let hex_str = str::from_utf8(hex).map_err(|_| ScanError {
718                        kind: ScanErrorKind::InvalidUtf8,
719                        span: Span::new(start + i, 4),
720                    })?;
721
722                    let code_unit = u16::from_str_radix(hex_str, 16).map_err(|_| ScanError {
723                        kind: ScanErrorKind::UnexpectedChar('?'),
724                        span: Span::new(start + i, 4),
725                    })?;
726
727                    i += 4;
728
729                    // Check for surrogate pairs
730                    let code_point = if (0xD800..=0xDBFF).contains(&code_unit) {
731                        // High surrogate - expect \uXXXX to follow
732                        if i + 6 > slice.len() || slice[i] != b'\\' || slice[i + 1] != b'u' {
733                            return Err(ScanError {
734                                kind: ScanErrorKind::InvalidUtf8,
735                                span: Span::new(start + i - 6, 6),
736                            });
737                        }
738
739                        i += 2; // Skip \u
740                        let low_hex = &slice[i..i + 4];
741                        let low_hex_str = str::from_utf8(low_hex).map_err(|_| ScanError {
742                            kind: ScanErrorKind::InvalidUtf8,
743                            span: Span::new(start + i, 4),
744                        })?;
745
746                        let low_unit =
747                            u16::from_str_radix(low_hex_str, 16).map_err(|_| ScanError {
748                                kind: ScanErrorKind::UnexpectedChar('?'),
749                                span: Span::new(start + i, 4),
750                            })?;
751
752                        i += 4;
753
754                        if !(0xDC00..=0xDFFF).contains(&low_unit) {
755                            return Err(ScanError {
756                                kind: ScanErrorKind::InvalidUtf8,
757                                span: Span::new(start + i - 4, 4),
758                            });
759                        }
760
761                        // Combine surrogates
762                        let high = code_unit as u32;
763                        let low = low_unit as u32;
764                        0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF)
765                    } else if (0xDC00..=0xDFFF).contains(&code_unit) {
766                        // Lone low surrogate
767                        return Err(ScanError {
768                            kind: ScanErrorKind::InvalidUtf8,
769                            span: Span::new(start + i - 4, 4),
770                        });
771                    } else {
772                        code_unit as u32
773                    };
774
775                    let c = char::from_u32(code_point).ok_or_else(|| ScanError {
776                        kind: ScanErrorKind::InvalidUtf8,
777                        span: Span::new(start + i - 4, 4),
778                    })?;
779
780                    result.push(c);
781                    continue; // Don't increment i again
782                }
783                other => {
784                    // Unknown escape - just push the character
785                    result.push(other as char);
786                }
787            }
788            i += 1;
789        } else {
790            // Regular UTF-8 byte
791            // Fast path for ASCII
792            if byte < 0x80 {
793                result.push(byte as char);
794                i += 1;
795            } else {
796                // Multi-byte UTF-8 sequence - find the end of valid UTF-8
797                let remaining = &slice[i..];
798                match str::from_utf8(remaining) {
799                    Ok(s) => {
800                        result.push_str(s);
801                        break;
802                    }
803                    Err(e) => {
804                        // Partial valid UTF-8
805                        let valid_len = e.valid_up_to();
806                        if valid_len > 0 {
807                            // Re-validate the valid portion (safe, no unsafe)
808                            let valid = str::from_utf8(&remaining[..valid_len])
809                                .expect("valid_up_to guarantees valid UTF-8");
810                            result.push_str(valid);
811                            i += valid_len;
812                        } else {
813                            return Err(ScanError {
814                                kind: ScanErrorKind::InvalidUtf8,
815                                span: Span::new(start + i, 1),
816                            });
817                        }
818                    }
819                }
820            }
821        }
822    }
823
824    Ok(result)
825}
826
/// Borrow a string's content straight out of the buffer, without copying.
///
/// Succeeds only when the content can be used verbatim: it must contain no
/// backslash (i.e. no escape sequences) and must be valid UTF-8. In every
/// other case `None` is returned.
///
/// # Arguments
/// * `buf` - The buffer containing the string
/// * `start` - Start index (after opening quote)
/// * `end` - End index (before closing quote)
///
/// # Panics
/// Panics if `start..end` is not a valid range within `buf`.
pub fn decode_string_borrowed(buf: &[u8], start: usize, end: usize) -> Option<&str> {
    let raw = &buf[start..end];

    // Any backslash means the content needs decoding and cannot be borrowed.
    let needs_decoding = raw.iter().any(|&b| b == b'\\');
    if needs_decoding {
        None
    } else {
        str::from_utf8(raw).ok()
    }
}
846
847/// Decode a JSON string, returning either a borrowed or owned string.
848///
849/// Uses `Cow<str>` to avoid allocation when possible.
850pub fn decode_string<'a>(
851    buf: &'a [u8],
852    start: usize,
853    end: usize,
854    has_escapes: bool,
855) -> Result<alloc::borrow::Cow<'a, str>, ScanError> {
856    use alloc::borrow::Cow;
857
858    if has_escapes {
859        decode_string_owned(buf, start, end).map(Cow::Owned)
860    } else {
861        decode_string_borrowed(buf, start, end)
862            .map(Cow::Borrowed)
863            .ok_or_else(|| ScanError {
864                kind: ScanErrorKind::InvalidUtf8,
865                span: Span::new(start, end - start),
866            })
867    }
868}
869
/// A number parsed out of the buffer.
///
/// The variant reflects the narrowest representation that was tried
/// successfully for the literal's hint.
#[derive(Debug, Clone, PartialEq)]
pub enum ParsedNumber {
    /// Fits an unsigned 64-bit integer
    U64(u64),
    /// Fits a signed 64-bit integer
    I64(i64),
    /// Fits an unsigned 128-bit integer
    U128(u128),
    /// Fits a signed 128-bit integer
    I128(i128),
    /// A 64-bit floating point value
    F64(f64),
}
886
887/// Parse a number from the buffer slice.
888pub fn parse_number(
889    buf: &[u8],
890    start: usize,
891    end: usize,
892    hint: NumberHint,
893) -> Result<ParsedNumber, ScanError> {
894    use lexical_parse_float::FromLexical as _;
895    use lexical_parse_integer::FromLexical as _;
896
897    let slice = &buf[start..end];
898
899    match hint {
900        NumberHint::Float => f64::from_lexical(slice)
901            .map(ParsedNumber::F64)
902            .map_err(|_| ScanError {
903                kind: ScanErrorKind::UnexpectedChar('?'),
904                span: Span::new(start, end - start),
905            }),
906        NumberHint::Signed => {
907            if let Ok(n) = i64::from_lexical(slice) {
908                Ok(ParsedNumber::I64(n))
909            } else if let Ok(n) = i128::from_lexical(slice) {
910                Ok(ParsedNumber::I128(n))
911            } else {
912                Err(ScanError {
913                    kind: ScanErrorKind::UnexpectedChar('?'),
914                    span: Span::new(start, end - start),
915                })
916            }
917        }
918        NumberHint::Unsigned => {
919            if let Ok(n) = u64::from_lexical(slice) {
920                Ok(ParsedNumber::U64(n))
921            } else if let Ok(n) = u128::from_lexical(slice) {
922                Ok(ParsedNumber::U128(n))
923            } else {
924                Err(ScanError {
925                    kind: ScanErrorKind::UnexpectedChar('?'),
926                    span: Span::new(start, end - start),
927                })
928            }
929        }
930    }
931}
932
#[cfg(test)]
mod tests {
    use super::*;

    /// Scan one token from `$input` with `$scanner` (unwrapping the result) and
    /// assert that it matches the pattern `$expected`.
    macro_rules! assert_next_token {
        ($scanner:expr, $input:expr, $expected:pat $(,)?) => {
            assert!(matches!(
                $scanner.next_token($input).unwrap().token,
                $expected
            ))
        };
    }

    /// Every single-byte structural token is recognized, followed by `Eof`.
    #[test]
    fn test_simple_tokens() {
        let input = b"{}[],:";
        let mut scanner = Scanner::new();

        assert_next_token!(scanner, input, Token::ObjectStart);
        assert_next_token!(scanner, input, Token::ObjectEnd);
        assert_next_token!(scanner, input, Token::ArrayStart);
        assert_next_token!(scanner, input, Token::ArrayEnd);
        assert_next_token!(scanner, input, Token::Comma);
        assert_next_token!(scanner, input, Token::Colon);
        assert_next_token!(scanner, input, Token::Eof);
    }

    /// A plain string reports its content indices and no escapes.
    #[test]
    fn test_string_no_escapes() {
        let input = b"\"hello world\"";
        let mut scanner = Scanner::new();

        assert_next_token!(
            scanner,
            input,
            Token::String {
                start: 1,
                end: 12,
                has_escapes: false
            },
        );
    }

    /// A string containing a backslash escape is flagged with `has_escapes`.
    #[test]
    fn test_string_with_escapes() {
        let input = br#""hello\nworld""#;
        let mut scanner = Scanner::new();

        assert_next_token!(
            scanner,
            input,
            Token::String {
                start: 1,
                end: 13,
                has_escapes: true
            },
        );
    }

    /// Number tokens carry the hint matching their textual form.
    #[test]
    fn test_numbers() {
        let mut scanner = Scanner::new();

        // Unsigned (with terminator so scanner knows number is complete)
        assert_next_token!(
            scanner,
            b"42,",
            Token::Number {
                hint: NumberHint::Unsigned,
                ..
            },
        );

        // Signed
        scanner.set_pos(0);
        assert_next_token!(
            scanner,
            b"-42]",
            Token::Number {
                hint: NumberHint::Signed,
                ..
            },
        );

        // Float
        scanner.set_pos(0);
        assert_next_token!(
            scanner,
            b"3.14}",
            Token::Number {
                hint: NumberHint::Float,
                ..
            },
        );

        // Exponent
        scanner.set_pos(0);
        assert_next_token!(
            scanner,
            b"1e10 ",
            Token::Number {
                hint: NumberHint::Float,
                ..
            },
        );

        // Number at end of buffer returns NeedMore (streaming behavior)
        scanner.set_pos(0);
        assert_next_token!(scanner, b"42", Token::NeedMore { .. });
    }

    /// `true`, `false` and `null` are recognized when followed by a terminator.
    #[test]
    fn test_literals() {
        let mut scanner = Scanner::new();

        // Literals need terminators too (scanner can't know if "truex" is coming)
        assert_next_token!(scanner, b"true,", Token::True);

        scanner.set_pos(0);
        assert_next_token!(scanner, b"false]", Token::False);

        scanner.set_pos(0);
        assert_next_token!(scanner, b"null}", Token::Null);
    }

    /// Whitespace between tokens is skipped.
    #[test]
    fn test_whitespace_handling() {
        let input = b"  {\n\t\"key\"  :  42  }  ";
        let mut scanner = Scanner::new();

        assert_next_token!(scanner, input, Token::ObjectStart);
        assert_next_token!(scanner, input, Token::String { .. });
        assert_next_token!(scanner, input, Token::Colon);
        assert_next_token!(scanner, input, Token::Number { .. });
        assert_next_token!(scanner, input, Token::ObjectEnd);
        assert_next_token!(scanner, input, Token::Eof);
    }

    /// Escape-free content is returned as a borrowed `&str`.
    #[test]
    fn test_decode_string_no_escapes() {
        let raw = b"hello world";
        assert_eq!(
            decode_string_borrowed(raw, 0, raw.len()),
            Some("hello world")
        );
    }

    /// A simple `\n` escape is decoded into a newline.
    #[test]
    fn test_decode_string_with_escapes() {
        let raw = br#"hello\nworld"#;
        let decoded = decode_string_owned(raw, 0, raw.len()).unwrap();
        assert_eq!(decoded, "hello\nworld");
    }

    /// `\uXXXX` escapes are decoded into their characters.
    #[test]
    fn test_decode_string_unicode() {
        // \u0048 = 'H', \u0065 = 'e', \u006C = 'l', \u006C = 'l', \u006F = 'o'
        let raw = br#"\u0048\u0065\u006C\u006C\u006F"#;
        let decoded = decode_string_owned(raw, 0, raw.len()).unwrap();
        assert_eq!(decoded, "Hello");
    }

    /// A UTF-16 surrogate pair is combined into a single character.
    #[test]
    fn test_decode_string_surrogate_pair() {
        // U+1F600 (grinning face) = \uD83D\uDE00
        let raw = br#"\uD83D\uDE00"#;
        let decoded = decode_string_owned(raw, 0, raw.len()).unwrap();
        assert_eq!(decoded, "😀");
    }

    /// Without escapes, `decode_string` yields `Cow::Borrowed`.
    #[test]
    fn test_decode_cow_borrowed() {
        let raw = b"simple";
        let text = decode_string(raw, 0, raw.len(), false).unwrap();
        assert!(matches!(text, alloc::borrow::Cow::Borrowed(_)));
        assert_eq!(&*text, "simple");
    }

    /// With escapes, `decode_string` yields `Cow::Owned`.
    #[test]
    fn test_decode_cow_owned() {
        let raw = br#"has\tescape"#;
        let text = decode_string(raw, 0, raw.len(), true).unwrap();
        assert!(matches!(text, alloc::borrow::Cow::Owned(_)));
        assert_eq!(&*text, "has\tescape");
    }

    /// Each hint produces the matching `ParsedNumber` variant.
    #[test]
    fn test_parse_numbers() {
        let unsigned = parse_number(b"42", 0, 2, NumberHint::Unsigned).unwrap();
        assert_eq!(unsigned, ParsedNumber::U64(42));

        let signed = parse_number(b"-42", 0, 3, NumberHint::Signed).unwrap();
        assert_eq!(signed, ParsedNumber::I64(-42));

        #[allow(clippy::approx_constant)]
        {
            let float = parse_number(b"3.14", 0, 4, NumberHint::Float).unwrap();
            assert_eq!(float, ParsedNumber::F64(3.14));
        }
    }
}
1170
#[cfg(all(test, feature = "bolero-inline-tests"))]
#[allow(clippy::while_let_loop, clippy::same_item_push)]
mod fuzz_tests {
    use super::*;
    use bolero::check;

    /// Run a fresh scanner over `input` until it reports `Eof`, `NeedMore`,
    /// or an error. Errors are fine, panics are not.
    fn scan_until_done(input: &[u8]) {
        let mut scanner = Scanner::new();
        while let Ok(spanned) = scanner.next_token(input) {
            if matches!(spanned.token, Token::Eof | Token::NeedMore { .. }) {
                break;
            }
        }
    }

    /// Return `content` with `open` prepended and `close` appended.
    fn delimited(open: u8, content: &[u8], close: u8) -> Vec<u8> {
        let mut bytes = Vec::with_capacity(content.len() + 2);
        bytes.push(open);
        bytes.extend_from_slice(content);
        bytes.push(close);
        bytes
    }

    /// Fuzz the scanner with arbitrary bytes - it should never panic
    #[test]
    fn fuzz_scanner_arbitrary_bytes() {
        check!().for_each(|input: &[u8]| {
            scan_until_done(input);
        });
    }

    /// Fuzz with valid JSON-like input - scanner should handle it
    #[test]
    fn fuzz_scanner_json_like() {
        check!().for_each(|input: &[u8]| {
            // Wrap input in array to make it more JSON-like
            let wrapped = delimited(b'[', input, b']');
            scan_until_done(&wrapped);
        });
    }

    /// Fuzz string decoding - should never panic
    #[test]
    fn fuzz_decode_string() {
        check!().for_each(|input: &[u8]| {
            if input.len() < 2 {
                return;
            }
            // Try decoding as if it were string content
            let _ = decode_string_owned(input, 0, input.len());
        });
    }

    /// Fuzz with strings that might have escapes
    #[test]
    fn fuzz_scanner_strings() {
        check!().for_each(|content: &[u8]| {
            // Build a quoted string
            let quoted = delimited(b'"', content, b'"');

            let mut scanner = Scanner::new();
            let _ = scanner.next_token(&quoted);
        });
    }

    /// Fuzz with numbers of various formats
    #[test]
    fn fuzz_scanner_numbers() {
        check!().for_each(|content: &[u8]| {
            // Only try if it looks number-like (starts with digit or minus)
            match content.first() {
                Some(&lead) if lead.is_ascii_digit() || lead == b'-' => {
                    let mut scanner = Scanner::new();
                    let _ = scanner.next_token(content);
                }
                _ => {}
            }
        });
    }

    /// Fuzz number parsing directly
    #[test]
    fn fuzz_parse_number() {
        check!().for_each(|input: &[u8]| {
            if input.is_empty() {
                return;
            }
            // Try all hints
            let len = input.len();
            let _ = parse_number(input, 0, len, NumberHint::Unsigned);
            let _ = parse_number(input, 0, len, NumberHint::Signed);
            let _ = parse_number(input, 0, len, NumberHint::Float);
        });
    }

    /// Fuzz with deeply nested structures
    #[test]
    fn fuzz_scanner_nested() {
        check!().for_each(|input: &[u8]| {
            // Use first byte as depth indicator
            let depth = usize::from(input.first().copied().unwrap_or(0)) % 100;

            // `depth` opening brackets followed by `depth` closing brackets.
            let mut nested = Vec::new();
            nested.resize(depth, b'[');
            nested.resize(depth * 2, b']');

            scan_until_done(&nested);
        });
    }
}
facet_json/scanner.rs

facet_json/
scanner.rs