facet_json/
scanner.rs

1//! Low-level JSON scanner that finds token boundaries without materializing strings.
2//!
3//! The scanner's job is to identify where tokens are in a buffer, not to interpret them.
4//! String content is returned as indices + a `has_escapes` flag. The deserializer
5//! decides whether to decode escapes based on the target type.
6//!
7//! This design enables:
8//! - Zero-copy borrowed strings (when no escapes)
9//! - Streaming from `std::io::Read` with buffer refills
10//! - Skipping values without allocation (RawJson, unknown fields)
11
12use core::str;
13
14use facet_reflect::Span;
15
16/// Token kinds with minimal data - strings/numbers are just indices into the buffer.
17#[derive(Debug, Clone, PartialEq)]
18pub enum Token {
19    /// `{`
20    ObjectStart,
21    /// `}`
22    ObjectEnd,
23    /// `[`
24    ArrayStart,
25    /// `]`
26    ArrayEnd,
27    /// `:`
28    Colon,
29    /// `,`
30    Comma,
31    /// `null`
32    Null,
33    /// `true`
34    True,
35    /// `false`
36    False,
37    /// A string literal - indices point to content (excluding quotes)
38    String {
39        /// Start index of string content (after opening quote)
40        start: usize,
41        /// End index of string content (before closing quote)
42        end: usize,
43        /// True if the string contains escape sequences that need processing
44        has_escapes: bool,
45    },
46    /// A number literal - indices point to the raw number text
47    Number {
48        /// Start index of number
49        start: usize,
50        /// End index of number
51        end: usize,
52        /// Hint about number format
53        hint: NumberHint,
54    },
55    /// End of input reached
56    Eof,
57    /// Buffer exhausted mid-token - need refill for streaming
58    NeedMore {
59        /// How many bytes were consumed before hitting the boundary
60        consumed: usize,
61    },
62}
63
/// Coarse syntactic classification of a number literal, used to pick a parser.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum NumberHint {
    /// Digits only: no `-` sign, no fraction, no exponent
    Unsigned,
    /// Leading `-` sign, but no fraction and no exponent
    Signed,
    /// Contains a `.` fraction and/or an `e`/`E` exponent
    Float,
}
74
75/// Spanned token with location information
76#[derive(Debug, Clone, PartialEq)]
77pub struct SpannedToken {
78    /// The token
79    pub token: Token,
80    /// Source span
81    pub span: Span,
82}
83
84/// Scanner error
85#[derive(Debug, Clone, PartialEq)]
86pub struct ScanError {
87    /// The error kind
88    pub kind: ScanErrorKind,
89    /// Source span
90    pub span: Span,
91}
92
/// The categories of failure a [`ScanError`] can describe.
#[derive(Clone, Debug, PartialEq)]
pub enum ScanErrorKind {
    /// A character that is not allowed at this point of the input
    UnexpectedChar(char),
    /// The input ended too early; the string names what was being read
    UnexpectedEof(&'static str),
    /// The bytes do not form valid UTF-8
    InvalidUtf8,
}
103
104/// Result type for scanner operations
105pub type ScanResult = Result<SpannedToken, ScanError>;
106
107/// JSON scanner state machine.
108///
109/// The scanner operates on a byte buffer and tracks position. For streaming,
110/// the buffer can be refilled when `Token::NeedMore` is returned.
111pub struct Scanner {
112    /// Current position in the buffer
113    pos: usize,
114    /// State for resuming after NeedMore (for streaming)
115    state: ScanState,
116}
117
118/// Internal state for resuming mid-token after buffer refill
119#[derive(Debug, Clone, Default)]
120enum ScanState {
121    #[default]
122    Ready,
123    /// In the middle of scanning a string
124    InString {
125        start: usize,
126        has_escapes: bool,
127        escape_next: bool,
128    },
129    /// In the middle of scanning a number
130    InNumber { start: usize, hint: NumberHint },
131    /// In the middle of scanning a literal (true/false/null)
132    InLiteral {
133        start: usize,
134        expected: &'static [u8],
135        matched: usize,
136    },
137}
138
139impl Scanner {
140    /// Create a new scanner starting at position 0
141    pub fn new() -> Self {
142        Self {
143            pos: 0,
144            state: ScanState::Ready,
145        }
146    }
147
148    /// Create a scanner starting at a specific position
149    pub fn at_position(pos: usize) -> Self {
150        Self {
151            pos,
152            state: ScanState::Ready,
153        }
154    }
155
156    /// Current position in the buffer
157    pub fn pos(&self) -> usize {
158        self.pos
159    }
160
161    /// Set position (used after buffer operations)
162    pub fn set_pos(&mut self, pos: usize) {
163        self.pos = pos;
164    }
165
166    /// Finalize any pending token at true EOF.
167    ///
168    /// Call this when the scanner returned `NeedMore` but no more data is available.
169    /// Returns the completed token if one is pending (e.g., a number at EOF),
170    /// or an error if the token is incomplete (e.g., unterminated string).
171    pub fn finalize_at_eof(&mut self, buf: &[u8]) -> ScanResult {
172        match core::mem::take(&mut self.state) {
173            ScanState::Ready => {
174                // Nothing pending
175                Ok(SpannedToken {
176                    token: Token::Eof,
177                    span: Span::new(self.pos, 0),
178                })
179            }
180            ScanState::InNumber { start, hint } => {
181                // Number is complete at EOF (numbers don't need closing delimiter)
182                let end = self.pos;
183                if end == start || (end == start + 1 && buf.get(start) == Some(&b'-')) {
184                    return Err(ScanError {
185                        kind: ScanErrorKind::UnexpectedEof("in number"),
186                        span: Span::new(start, end - start),
187                    });
188                }
189                Ok(SpannedToken {
190                    token: Token::Number { start, end, hint },
191                    span: Span::new(start, end - start),
192                })
193            }
194            ScanState::InString { start, .. } => {
195                // Unterminated string
196                Err(ScanError {
197                    kind: ScanErrorKind::UnexpectedEof("in string"),
198                    span: Span::new(start, self.pos - start),
199                })
200            }
201            ScanState::InLiteral {
202                start,
203                expected,
204                matched,
205            } => {
206                // Check if the literal is complete
207                if matched == expected.len() {
208                    let token = match expected {
209                        b"true" => Token::True,
210                        b"false" => Token::False,
211                        b"null" => Token::Null,
212                        _ => unreachable!(),
213                    };
214                    Ok(SpannedToken {
215                        token,
216                        span: Span::new(start, expected.len()),
217                    })
218                } else {
219                    Err(ScanError {
220                        kind: ScanErrorKind::UnexpectedEof("in literal"),
221                        span: Span::new(start, self.pos - start),
222                    })
223                }
224            }
225        }
226    }
227
228    /// Scan the next token from the buffer.
229    ///
230    /// Returns `Token::NeedMore` if the buffer is exhausted mid-token,
231    /// allowing the caller to refill and retry.
232    pub fn next_token(&mut self, buf: &[u8]) -> ScanResult {
233        // If we have pending state from a previous NeedMore, resume
234        match core::mem::take(&mut self.state) {
235            ScanState::Ready => {}
236            ScanState::InString {
237                start,
238                has_escapes,
239                escape_next,
240            } => {
241                return self.resume_string(buf, start, has_escapes, escape_next);
242            }
243            ScanState::InNumber { start, hint } => {
244                return self.resume_number(buf, start, hint);
245            }
246            ScanState::InLiteral {
247                start,
248                expected,
249                matched,
250            } => {
251                return self.resume_literal(buf, start, expected, matched);
252            }
253        }
254
255        self.skip_whitespace(buf);
256
257        let start = self.pos;
258        let Some(&byte) = buf.get(self.pos) else {
259            return Ok(SpannedToken {
260                token: Token::Eof,
261                span: Span::new(self.pos, 0),
262            });
263        };
264
265        match byte {
266            b'{' => {
267                self.pos += 1;
268                Ok(SpannedToken {
269                    token: Token::ObjectStart,
270                    span: Span::new(start, 1),
271                })
272            }
273            b'}' => {
274                self.pos += 1;
275                Ok(SpannedToken {
276                    token: Token::ObjectEnd,
277                    span: Span::new(start, 1),
278                })
279            }
280            b'[' => {
281                self.pos += 1;
282                Ok(SpannedToken {
283                    token: Token::ArrayStart,
284                    span: Span::new(start, 1),
285                })
286            }
287            b']' => {
288                self.pos += 1;
289                Ok(SpannedToken {
290                    token: Token::ArrayEnd,
291                    span: Span::new(start, 1),
292                })
293            }
294            b':' => {
295                self.pos += 1;
296                Ok(SpannedToken {
297                    token: Token::Colon,
298                    span: Span::new(start, 1),
299                })
300            }
301            b',' => {
302                self.pos += 1;
303                Ok(SpannedToken {
304                    token: Token::Comma,
305                    span: Span::new(start, 1),
306                })
307            }
308            b'"' => self.scan_string(buf, start),
309            b'-' | b'0'..=b'9' => self.scan_number(buf, start),
310            b't' => self.scan_literal(buf, start, b"true", Token::True),
311            b'f' => self.scan_literal(buf, start, b"false", Token::False),
312            b'n' => self.scan_literal(buf, start, b"null", Token::Null),
313            _ => Err(ScanError {
314                kind: ScanErrorKind::UnexpectedChar(byte as char),
315                span: Span::new(start, 1),
316            }),
317        }
318    }
319
320    fn skip_whitespace(&mut self, buf: &[u8]) {
321        while let Some(&b) = buf.get(self.pos) {
322            match b {
323                b' ' | b'\t' | b'\n' | b'\r' => self.pos += 1,
324                _ => break,
325            }
326        }
327    }
328
329    /// Scan a string, finding its boundaries and noting if it has escapes.
330    fn scan_string(&mut self, buf: &[u8], start: usize) -> ScanResult {
331        // Skip opening quote
332        self.pos += 1;
333        let content_start = self.pos;
334
335        self.scan_string_content(buf, start, content_start, false, false)
336    }
337
338    fn resume_string(
339        &mut self,
340        buf: &[u8],
341        start: usize,
342        has_escapes: bool,
343        escape_next: bool,
344    ) -> ScanResult {
345        let content_start = start + 1; // After opening quote
346        self.scan_string_content(buf, start, content_start, has_escapes, escape_next)
347    }
348
349    fn scan_string_content(
350        &mut self,
351        buf: &[u8],
352        start: usize,
353        content_start: usize,
354        mut has_escapes: bool,
355        mut escape_next: bool,
356    ) -> ScanResult {
357        // SIMD-friendly fast path: scan 16 bytes at a time looking for quotes/backslashes
358        const STEP_SIZE: usize = 16;
359        type Window = u128;
360        type Chunk = [u8; STEP_SIZE];
361
362        // SIMD fast path: only if we're not in escape mode
363        if !escape_next {
364            loop {
365                if let Some(Ok(chunk)) = buf
366                    .get(self.pos..)
367                    .and_then(|s| s.get(..STEP_SIZE))
368                    .map(Chunk::try_from)
369                {
370                    let window = Window::from_ne_bytes(chunk);
371                    let has_quote = contains_byte(window, b'"');
372                    let has_backslash = contains_byte(window, b'\\');
373
374                    if !has_quote && !has_backslash {
375                        // Fast path: no special chars in this chunk
376                        self.pos += STEP_SIZE;
377                        continue;
378                    }
379                }
380                // Fall through to byte-by-byte scanning
381                break;
382            }
383        }
384
385        // Byte-by-byte scanning
386        while let Some(&byte) = buf.get(self.pos) {
387            if escape_next {
388                // Previous char was backslash, skip this char
389                escape_next = false;
390                self.pos += 1;
391
392                // Handle \uXXXX - need to skip 4 more hex digits
393                if byte == b'u' {
394                    // Check if we have 4 more bytes
395                    if self.pos + 4 > buf.len() {
396                        // Need more data
397                        self.state = ScanState::InString {
398                            start,
399                            has_escapes: true,
400                            escape_next: false,
401                        };
402                        return Ok(SpannedToken {
403                            token: Token::NeedMore { consumed: start },
404                            span: Span::new(start, self.pos - start),
405                        });
406                    }
407                    self.pos += 4;
408
409                    // Check for surrogate pair (\uXXXX\uXXXX)
410                    if self.pos + 2 <= buf.len()
411                        && buf.get(self.pos) == Some(&b'\\')
412                        && buf.get(self.pos + 1) == Some(&b'u')
413                    {
414                        if self.pos + 6 > buf.len() {
415                            // Need more data for second surrogate
416                            self.state = ScanState::InString {
417                                start,
418                                has_escapes: true,
419                                escape_next: false,
420                            };
421                            return Ok(SpannedToken {
422                                token: Token::NeedMore { consumed: start },
423                                span: Span::new(start, self.pos - start),
424                            });
425                        }
426                        // Skip \uXXXX
427                        self.pos += 6;
428                    }
429                }
430                continue;
431            }
432
433            match byte {
434                b'"' => {
435                    // Found closing quote
436                    let content_end = self.pos;
437                    self.pos += 1; // Skip closing quote
438
439                    return Ok(SpannedToken {
440                        token: Token::String {
441                            start: content_start,
442                            end: content_end,
443                            has_escapes,
444                        },
445                        span: Span::new(start, self.pos - start),
446                    });
447                }
448                b'\\' => {
449                    has_escapes = true;
450                    escape_next = true;
451                    self.pos += 1;
452                }
453                _ => {
454                    self.pos += 1;
455                }
456            }
457        }
458
459        // Reached end of buffer without closing quote
460        if escape_next || self.pos > start {
461            // Mid-string, need more data
462            self.state = ScanState::InString {
463                start,
464                has_escapes,
465                escape_next,
466            };
467            Ok(SpannedToken {
468                token: Token::NeedMore { consumed: start },
469                span: Span::new(start, self.pos - start),
470            })
471        } else {
472            Err(ScanError {
473                kind: ScanErrorKind::UnexpectedEof("in string"),
474                span: Span::new(start, self.pos - start),
475            })
476        }
477    }
478
479    /// Scan a number, finding its boundaries and determining its type hint.
480    fn scan_number(&mut self, buf: &[u8], start: usize) -> ScanResult {
481        let mut hint = NumberHint::Unsigned;
482
483        if buf.get(self.pos) == Some(&b'-') {
484            hint = NumberHint::Signed;
485            self.pos += 1;
486        }
487
488        self.scan_number_content(buf, start, hint)
489    }
490
491    fn resume_number(&mut self, buf: &[u8], start: usize, hint: NumberHint) -> ScanResult {
492        self.scan_number_content(buf, start, hint)
493    }
494
495    fn scan_number_content(
496        &mut self,
497        buf: &[u8],
498        start: usize,
499        mut hint: NumberHint,
500    ) -> ScanResult {
501        // Integer part
502        while let Some(&b) = buf.get(self.pos) {
503            if b.is_ascii_digit() {
504                self.pos += 1;
505            } else {
506                break;
507            }
508        }
509
510        // Check for decimal part
511        if buf.get(self.pos) == Some(&b'.') {
512            hint = NumberHint::Float;
513            self.pos += 1;
514
515            // Fractional digits
516            while let Some(&b) = buf.get(self.pos) {
517                if b.is_ascii_digit() {
518                    self.pos += 1;
519                } else {
520                    break;
521                }
522            }
523        }
524
525        // Check for exponent
526        if matches!(buf.get(self.pos), Some(b'e') | Some(b'E')) {
527            hint = NumberHint::Float;
528            self.pos += 1;
529
530            // Optional sign
531            if matches!(buf.get(self.pos), Some(b'+') | Some(b'-')) {
532                self.pos += 1;
533            }
534
535            // Exponent digits
536            while let Some(&b) = buf.get(self.pos) {
537                if b.is_ascii_digit() {
538                    self.pos += 1;
539                } else {
540                    break;
541                }
542            }
543        }
544
545        // Check if we're at end of buffer - might need more data
546        // Numbers end at whitespace, punctuation, or true EOF
547        if self.pos == buf.len() {
548            // At end of buffer - need more data to see terminator
549            self.state = ScanState::InNumber { start, hint };
550            return Ok(SpannedToken {
551                token: Token::NeedMore { consumed: start },
552                span: Span::new(start, self.pos - start),
553            });
554        }
555
556        let end = self.pos;
557
558        // Validate we actually parsed something
559        if end == start || (end == start + 1 && buf.get(start) == Some(&b'-')) {
560            return Err(ScanError {
561                kind: ScanErrorKind::UnexpectedChar(
562                    buf.get(self.pos).map(|&b| b as char).unwrap_or('?'),
563                ),
564                span: Span::new(start, 1),
565            });
566        }
567
568        Ok(SpannedToken {
569            token: Token::Number { start, end, hint },
570            span: Span::new(start, end - start),
571        })
572    }
573
574    /// Scan a literal keyword (true, false, null)
575    fn scan_literal(
576        &mut self,
577        buf: &[u8],
578        start: usize,
579        expected: &'static [u8],
580        token: Token,
581    ) -> ScanResult {
582        self.scan_literal_content(buf, start, expected, 0, token)
583    }
584
585    fn resume_literal(
586        &mut self,
587        buf: &[u8],
588        start: usize,
589        expected: &'static [u8],
590        matched: usize,
591    ) -> ScanResult {
592        let token = match expected {
593            b"true" => Token::True,
594            b"false" => Token::False,
595            b"null" => Token::Null,
596            _ => unreachable!(),
597        };
598        self.scan_literal_content(buf, start, expected, matched, token)
599    }
600
601    fn scan_literal_content(
602        &mut self,
603        buf: &[u8],
604        start: usize,
605        expected: &'static [u8],
606        mut matched: usize,
607        token: Token,
608    ) -> ScanResult {
609        while matched < expected.len() {
610            match buf.get(self.pos) {
611                Some(&b) if b == expected[matched] => {
612                    self.pos += 1;
613                    matched += 1;
614                }
615                Some(&b) => {
616                    return Err(ScanError {
617                        kind: ScanErrorKind::UnexpectedChar(b as char),
618                        span: Span::new(self.pos, 1),
619                    });
620                }
621                None => {
622                    // Need more data
623                    self.state = ScanState::InLiteral {
624                        start,
625                        expected,
626                        matched,
627                    };
628                    return Ok(SpannedToken {
629                        token: Token::NeedMore { consumed: start },
630                        span: Span::new(start, self.pos - start),
631                    });
632                }
633            }
634        }
635
636        Ok(SpannedToken {
637            token,
638            span: Span::new(start, expected.len()),
639        })
640    }
641}
642
643impl Default for Scanner {
644    fn default() -> Self {
645        Self::new()
646    }
647}
648
/// Report whether any of the 16 bytes packed into `window` equals `byte`.
///
/// Branch-free: XOR-ing with `byte` repeated 16 times turns every matching byte into
/// zero, and the bit trick `(x - 0x01..01) & !x & 0x80..80` is non-zero exactly when `x`
/// contains a zero byte.
#[inline]
fn contains_byte(window: u128, byte: u8) -> bool {
    // 0x0101…01: the value 1 in each of the 16 byte lanes
    const ONES: u128 = u128::MAX / 0xFF;
    // 0x8080…80: the top bit of each byte lane
    const HIGH_BITS: u128 = ONES << 7;

    // Multiplying by ONES copies `byte` into every lane (255 * ONES == u128::MAX, so
    // this cannot overflow).
    let diff = window ^ (ONES * u128::from(byte));
    (diff.wrapping_sub(ONES) & !diff & HIGH_BITS) != 0
}
659
660// =============================================================================
661// String decoding utilities (second pass)
662// =============================================================================
663
664/// Decode a JSON string from a buffer, handling escape sequences.
665///
666/// This is the "second pass" - only called when the deserializer actually needs
667/// the string content. For borrowed strings without escapes, use `decode_string_borrowed`.
668///
669/// # Arguments
670/// * `buf` - The buffer containing the string
671/// * `start` - Start index (after opening quote)
672/// * `end` - End index (before closing quote)
673///
674/// # Returns
675/// The decoded string, or an error if the string contains invalid escapes.
676pub fn decode_string_owned(
677    buf: &[u8],
678    start: usize,
679    end: usize,
680) -> Result<alloc::string::String, ScanError> {
681    use alloc::string::String;
682
683    let slice = &buf[start..end];
684    let mut result = String::with_capacity(end - start);
685    let mut i = 0;
686
687    while i < slice.len() {
688        let byte = slice[i];
689        if byte == b'\\' {
690            i += 1;
691            if i >= slice.len() {
692                return Err(ScanError {
693                    kind: ScanErrorKind::UnexpectedEof("in escape sequence"),
694                    span: Span::new(start + i - 1, 1),
695                });
696            }
697
698            match slice[i] {
699                b'"' => result.push('"'),
700                b'\\' => result.push('\\'),
701                b'/' => result.push('/'),
702                b'b' => result.push('\x08'),
703                b'f' => result.push('\x0c'),
704                b'n' => result.push('\n'),
705                b'r' => result.push('\r'),
706                b't' => result.push('\t'),
707                b'u' => {
708                    i += 1;
709                    if i + 4 > slice.len() {
710                        return Err(ScanError {
711                            kind: ScanErrorKind::UnexpectedEof("in unicode escape"),
712                            span: Span::new(start + i - 2, slice.len() - i + 2),
713                        });
714                    }
715
716                    let hex = &slice[i..i + 4];
717                    let hex_str = str::from_utf8(hex).map_err(|_| ScanError {
718                        kind: ScanErrorKind::InvalidUtf8,
719                        span: Span::new(start + i, 4),
720                    })?;
721
722                    let code_unit = u16::from_str_radix(hex_str, 16).map_err(|_| ScanError {
723                        kind: ScanErrorKind::UnexpectedChar('?'),
724                        span: Span::new(start + i, 4),
725                    })?;
726
727                    i += 4;
728
729                    // Check for surrogate pairs
730                    let code_point = if (0xD800..=0xDBFF).contains(&code_unit) {
731                        // High surrogate - expect \uXXXX to follow
732                        if i + 6 > slice.len() || slice[i] != b'\\' || slice[i + 1] != b'u' {
733                            return Err(ScanError {
734                                kind: ScanErrorKind::InvalidUtf8,
735                                span: Span::new(start + i - 6, 6),
736                            });
737                        }
738
739                        i += 2; // Skip \u
740                        let low_hex = &slice[i..i + 4];
741                        let low_hex_str = str::from_utf8(low_hex).map_err(|_| ScanError {
742                            kind: ScanErrorKind::InvalidUtf8,
743                            span: Span::new(start + i, 4),
744                        })?;
745
746                        let low_unit =
747                            u16::from_str_radix(low_hex_str, 16).map_err(|_| ScanError {
748                                kind: ScanErrorKind::UnexpectedChar('?'),
749                                span: Span::new(start + i, 4),
750                            })?;
751
752                        i += 4;
753
754                        if !(0xDC00..=0xDFFF).contains(&low_unit) {
755                            return Err(ScanError {
756                                kind: ScanErrorKind::InvalidUtf8,
757                                span: Span::new(start + i - 4, 4),
758                            });
759                        }
760
761                        // Combine surrogates
762                        let high = code_unit as u32;
763                        let low = low_unit as u32;
764                        0x10000 + ((high & 0x3FF) << 10) + (low & 0x3FF)
765                    } else if (0xDC00..=0xDFFF).contains(&code_unit) {
766                        // Lone low surrogate
767                        return Err(ScanError {
768                            kind: ScanErrorKind::InvalidUtf8,
769                            span: Span::new(start + i - 4, 4),
770                        });
771                    } else {
772                        code_unit as u32
773                    };
774
775                    let c = char::from_u32(code_point).ok_or_else(|| ScanError {
776                        kind: ScanErrorKind::InvalidUtf8,
777                        span: Span::new(start + i - 4, 4),
778                    })?;
779
780                    result.push(c);
781                    continue; // Don't increment i again
782                }
783                other => {
784                    // Unknown escape - just push the character
785                    result.push(other as char);
786                }
787            }
788            i += 1;
789        } else {
790            // Regular UTF-8 byte
791            // Fast path for ASCII
792            if byte < 0x80 {
793                result.push(byte as char);
794                i += 1;
795            } else {
796                // Multi-byte UTF-8. We must *not* fast-path the entire remaining slice, because it
797                // might contain escape sequences (e.g. `\\n`) that still need decoding.
798                let remaining = &slice[i..];
799                match str::from_utf8(remaining) {
800                    Ok(s) => {
801                        // Consume exactly one UTF-8 char, then continue scanning (so later escapes
802                        // are still handled).
803                        let ch = s.chars().next().expect("non-empty remaining slice");
804                        result.push(ch);
805                        i += ch.len_utf8();
806                    }
807                    Err(e) => {
808                        // Partial valid UTF-8
809                        let valid_len = e.valid_up_to();
810                        if valid_len > 0 {
811                            // Re-validate the valid portion (safe, no unsafe). We still only
812                            // consume what we can prove is valid.
813                            let valid = str::from_utf8(&remaining[..valid_len])
814                                .expect("valid_up_to guarantees valid UTF-8");
815                            result.push_str(valid);
816                            i += valid_len;
817                        } else {
818                            return Err(ScanError {
819                                kind: ScanErrorKind::InvalidUtf8,
820                                span: Span::new(start + i, 1),
821                            });
822                        }
823                    }
824                }
825            }
826        }
827    }
828
829    Ok(result)
830}
831
/// Borrow a string's content straight out of the buffer, without copying.
///
/// Succeeds only when the content needs no processing at all. Returns `None` if it
/// contains a backslash (i.e. an escape sequence) or is not valid UTF-8.
///
/// # Arguments
/// * `buf` - The buffer containing the string
/// * `start` - Start index of the content (after the opening quote)
/// * `end` - End index of the content (before the closing quote)
///
/// # Panics
/// Panics if `start..end` is not a valid range within `buf`.
pub fn decode_string_borrowed(buf: &[u8], start: usize, end: usize) -> Option<&str> {
    let content = &buf[start..end];

    // Any backslash means the text must be unescaped, which rules out borrowing.
    let needs_unescaping = content.iter().any(|&b| b == b'\\');
    if needs_unescaping {
        None
    } else {
        str::from_utf8(content).ok()
    }
}
851
852/// Decode a JSON string, returning either a borrowed or owned string.
853///
854/// Uses `Cow<str>` to avoid allocation when possible.
855pub fn decode_string<'a>(
856    buf: &'a [u8],
857    start: usize,
858    end: usize,
859    has_escapes: bool,
860) -> Result<alloc::borrow::Cow<'a, str>, ScanError> {
861    use alloc::borrow::Cow;
862
863    if has_escapes {
864        decode_string_owned(buf, start, end).map(Cow::Owned)
865    } else {
866        decode_string_borrowed(buf, start, end)
867            .map(Cow::Borrowed)
868            .ok_or_else(|| ScanError {
869                kind: ScanErrorKind::InvalidUtf8,
870                span: Span::new(start, end - start),
871            })
872    }
873}
874
/// A number literal after parsing, held in the numeric type that was used to parse it.
#[derive(Clone, Debug, PartialEq)]
pub enum ParsedNumber {
    /// Value held as an unsigned 64-bit integer
    U64(u64),
    /// Value held as a signed 64-bit integer
    I64(i64),
    /// Value held as an unsigned 128-bit integer
    U128(u128),
    /// Value held as a signed 128-bit integer
    I128(i128),
    /// Value held as a 64-bit float
    F64(f64),
}
891
892/// Parse a number from the buffer slice.
893pub fn parse_number(
894    buf: &[u8],
895    start: usize,
896    end: usize,
897    hint: NumberHint,
898) -> Result<ParsedNumber, ScanError> {
899    use lexical_parse_float::FromLexical as _;
900    use lexical_parse_integer::FromLexical as _;
901
902    let slice = &buf[start..end];
903
904    match hint {
905        NumberHint::Float => f64::from_lexical(slice)
906            .map(ParsedNumber::F64)
907            .map_err(|_| ScanError {
908                kind: ScanErrorKind::UnexpectedChar('?'),
909                span: Span::new(start, end - start),
910            }),
911        NumberHint::Signed => {
912            if let Ok(n) = i64::from_lexical(slice) {
913                Ok(ParsedNumber::I64(n))
914            } else if let Ok(n) = i128::from_lexical(slice) {
915                Ok(ParsedNumber::I128(n))
916            } else {
917                Err(ScanError {
918                    kind: ScanErrorKind::UnexpectedChar('?'),
919                    span: Span::new(start, end - start),
920                })
921            }
922        }
923        NumberHint::Unsigned => {
924            if let Ok(n) = u64::from_lexical(slice) {
925                Ok(ParsedNumber::U64(n))
926            } else if let Ok(n) = u128::from_lexical(slice) {
927                Ok(ParsedNumber::U128(n))
928            } else {
929                Err(ScanError {
930                    kind: ScanErrorKind::UnexpectedChar('?'),
931                    span: Span::new(start, end - start),
932                })
933            }
934        }
935    }
936}
937
#[cfg(test)]
mod tests {
    use super::*;

    /// Each punctuation byte yields its structural token, followed by `Eof`.
    #[test]
    fn test_simple_tokens() {
        let input = b"{}[],:";
        let mut scanner = Scanner::new();
        let mut next = || scanner.next_token(input).unwrap().token;

        assert_eq!(next(), Token::ObjectStart);
        assert_eq!(next(), Token::ObjectEnd);
        assert_eq!(next(), Token::ArrayStart);
        assert_eq!(next(), Token::ArrayEnd);
        assert_eq!(next(), Token::Comma);
        assert_eq!(next(), Token::Colon);
        assert_eq!(next(), Token::Eof);
    }

    /// A plain string reports the content range between the quotes and no escapes.
    #[test]
    fn test_string_no_escapes() {
        let mut scanner = Scanner::new();
        let token = scanner.next_token(b"\"hello world\"").unwrap().token;

        assert_eq!(
            token,
            Token::String {
                start: 1,
                end: 12,
                has_escapes: false,
            }
        );
    }

    /// A string containing a backslash sequence is flagged as having escapes.
    #[test]
    fn test_string_with_escapes() {
        let mut scanner = Scanner::new();
        let token = scanner.next_token(br#""hello\nworld""#).unwrap().token;

        assert_eq!(
            token,
            Token::String {
                start: 1,
                end: 13,
                has_escapes: true,
            }
        );
    }

    /// Number tokens carry the hint matching their textual shape.
    #[test]
    fn test_numbers() {
        /// Extract the hint of a number token, failing the test for anything else.
        fn hint_of(token: Token) -> NumberHint {
            match token {
                Token::Number { hint, .. } => hint,
                other => panic!("expected a number token, got {:?}", other),
            }
        }

        let mut scanner = Scanner::new();

        // Plain digits; the trailing comma tells the scanner the number is complete.
        let unsigned = scanner.next_token(b"42,").unwrap().token;
        assert_eq!(hint_of(unsigned), NumberHint::Unsigned);

        // Leading minus sign.
        scanner.set_pos(0);
        let signed = scanner.next_token(b"-42]").unwrap().token;
        assert_eq!(hint_of(signed), NumberHint::Signed);

        // Fractional part.
        scanner.set_pos(0);
        let fractional = scanner.next_token(b"3.14}").unwrap().token;
        assert_eq!(hint_of(fractional), NumberHint::Float);

        // Exponent without a fraction.
        scanner.set_pos(0);
        let exponent = scanner.next_token(b"1e10 ").unwrap().token;
        assert_eq!(hint_of(exponent), NumberHint::Float);

        // With no terminator the scanner asks for more input (streaming behavior).
        scanner.set_pos(0);
        let unterminated = scanner.next_token(b"42").unwrap().token;
        assert!(matches!(unterminated, Token::NeedMore { .. }));
    }

    /// `true`, `false` and `null` are recognised when followed by a terminator.
    #[test]
    fn test_literals() {
        let mut scanner = Scanner::new();

        // A terminator is required: the scanner cannot rule out e.g. "truex" otherwise.
        let token = scanner.next_token(b"true,").unwrap().token;
        assert_eq!(token, Token::True);

        scanner.set_pos(0);
        let token = scanner.next_token(b"false]").unwrap().token;
        assert_eq!(token, Token::False);

        scanner.set_pos(0);
        let token = scanner.next_token(b"null}").unwrap().token;
        assert_eq!(token, Token::Null);
    }

    /// Spaces, tabs and newlines between tokens are skipped.
    #[test]
    fn test_whitespace_handling() {
        let input = b"  {\n\t\"key\"  :  42  }  ";
        let mut scanner = Scanner::new();
        let mut next = || scanner.next_token(input).unwrap().token;

        assert_eq!(next(), Token::ObjectStart);
        assert!(matches!(next(), Token::String { .. }));
        assert_eq!(next(), Token::Colon);
        assert!(matches!(next(), Token::Number { .. }));
        assert_eq!(next(), Token::ObjectEnd);
        assert_eq!(next(), Token::Eof);
    }

    /// Escape-free content can be borrowed straight from the buffer.
    #[test]
    fn test_decode_string_no_escapes() {
        let input = b"hello world";
        assert_eq!(
            decode_string_borrowed(input, 0, input.len()),
            Some("hello world")
        );
    }

    /// `\n` in the source decodes to a real newline.
    #[test]
    fn test_decode_string_with_escapes() {
        let input = br#"hello\nworld"#;
        let decoded = decode_string_owned(input, 0, input.len()).unwrap();
        assert_eq!(decoded, "hello\nworld");
    }

    /// `\uXXXX` escapes decode to the corresponding characters.
    #[test]
    fn test_decode_string_unicode() {
        // H = \u0048, e = \u0065, l = \u006C (twice), o = \u006F
        let input = br#"\u0048\u0065\u006C\u006C\u006F"#;
        let decoded = decode_string_owned(input, 0, input.len()).unwrap();
        assert_eq!(decoded, "Hello");
    }

    /// A UTF-16 surrogate pair decodes to a single code point.
    #[test]
    fn test_decode_string_surrogate_pair() {
        // \uD83D\uDE00 encodes U+1F600 (grinning face)
        let input = br#"\uD83D\uDE00"#;
        let decoded = decode_string_owned(input, 0, input.len()).unwrap();
        assert_eq!(decoded, "😀");
    }

    /// Without escapes, `decode_string` hands back a borrowed `Cow`.
    #[test]
    fn test_decode_cow_borrowed() {
        use alloc::borrow::Cow;

        let input = b"simple";
        match decode_string(input, 0, input.len(), false).unwrap() {
            Cow::Borrowed(text) => assert_eq!(text, "simple"),
            Cow::Owned(text) => panic!("expected a borrowed string, got owned {:?}", text),
        }
    }

    /// With escapes, `decode_string` hands back an owned `Cow`.
    #[test]
    fn test_decode_cow_owned() {
        use alloc::borrow::Cow;

        let input = br#"has\tescape"#;
        match decode_string(input, 0, input.len(), true).unwrap() {
            Cow::Owned(text) => assert_eq!(text, "has\tescape"),
            Cow::Borrowed(text) => panic!("expected an owned string, got borrowed {:?}", text),
        }
    }

    /// Each hint produces the matching `ParsedNumber` variant.
    #[test]
    fn test_parse_numbers() {
        let unsigned = parse_number(b"42", 0, 2, NumberHint::Unsigned).unwrap();
        assert_eq!(unsigned, ParsedNumber::U64(42));

        let signed = parse_number(b"-42", 0, 3, NumberHint::Signed).unwrap();
        assert_eq!(signed, ParsedNumber::I64(-42));

        // 3.14 is deliberate test data, not a sloppy PI.
        #[allow(clippy::approx_constant)]
        {
            let float = parse_number(b"3.14", 0, 4, NumberHint::Float).unwrap();
            assert_eq!(float, ParsedNumber::F64(3.14));
        }
    }
}
1175
#[cfg(all(test, feature = "bolero-inline-tests"))]
#[allow(clippy::while_let_loop, clippy::same_item_push)]
mod fuzz_tests {
    use super::*;
    use bolero::check;

    /// Pull tokens from a fresh scanner until it reports `Eof`, `NeedMore`,
    /// or an error. Errors are acceptable outcomes; only panics are failures.
    fn scan_until_done(bytes: &[u8]) {
        let mut scanner = Scanner::new();
        while let Ok(spanned) = scanner.next_token(bytes) {
            if matches!(spanned.token, Token::Eof | Token::NeedMore { .. }) {
                break;
            }
        }
    }

    /// Build `open ++ inner ++ close` as a new byte vector.
    fn delimited(open: u8, inner: &[u8], close: u8) -> Vec<u8> {
        let mut bytes = Vec::with_capacity(inner.len() + 2);
        bytes.push(open);
        bytes.extend_from_slice(inner);
        bytes.push(close);
        bytes
    }

    /// Fuzz the scanner with arbitrary bytes - it should never panic
    #[test]
    fn fuzz_scanner_arbitrary_bytes() {
        check!().for_each(|input: &[u8]| scan_until_done(input));
    }

    /// Fuzz with input wrapped in `[` ... `]` so it looks more like JSON
    #[test]
    fn fuzz_scanner_json_like() {
        check!().for_each(|input: &[u8]| {
            let wrapped = delimited(b'[', input, b']');
            scan_until_done(&wrapped);
        });
    }

    /// Fuzz string decoding - should never panic
    #[test]
    fn fuzz_decode_string() {
        check!().for_each(|input: &[u8]| {
            if input.len() < 2 {
                return;
            }
            // Treat the raw bytes as if they were the content of a string.
            let _ = decode_string_owned(input, 0, input.len());
        });
    }

    /// Fuzz with quoted content that might contain escapes
    #[test]
    fn fuzz_scanner_strings() {
        check!().for_each(|content: &[u8]| {
            let quoted = delimited(b'"', content, b'"');

            let mut scanner = Scanner::new();
            let _ = scanner.next_token(&quoted);
        });
    }

    /// Fuzz with numbers of various formats
    #[test]
    fn fuzz_scanner_numbers() {
        check!().for_each(|content: &[u8]| {
            // Only number-like input is interesting: a leading digit or minus sign.
            let number_like = match content.first() {
                Some(&lead) => lead.is_ascii_digit() || lead == b'-',
                None => false,
            };
            if number_like {
                let mut scanner = Scanner::new();
                let _ = scanner.next_token(content);
            }
        });
    }

    /// Fuzz number parsing directly
    #[test]
    fn fuzz_parse_number() {
        check!().for_each(|input: &[u8]| {
            if input.is_empty() {
                return;
            }
            // Exercise every hint against the same bytes.
            let whole = input.len();
            let _ = parse_number(input, 0, whole, NumberHint::Unsigned);
            let _ = parse_number(input, 0, whole, NumberHint::Signed);
            let _ = parse_number(input, 0, whole, NumberHint::Float);
        });
    }

    /// Fuzz with deeply nested structures
    #[test]
    fn fuzz_scanner_nested() {
        check!().for_each(|input: &[u8]| {
            // The first byte picks the nesting depth (0..100).
            let depth = usize::from(input.first().copied().unwrap_or(0)) % 100;

            // `depth` opening brackets followed by `depth` closing brackets.
            let mut nested = Vec::new();
            nested.resize(depth, b'[');
            nested.resize(depth * 2, b']');

            scan_until_done(&nested);
        });
    }
}
facet_json/scanner.rs

facet_json/
scanner.rs