facet_json/deserialize/
tokenizer.rs

1use alloc::string::String;
2use alloc::string::ToString;
3use alloc::vec::Vec;
4use core::str;
5
/// Byte offset into the tokenizer input.
pub type Pos = usize;
8
9/// A span in the input, with a start position and length
10#[derive(Debug, Clone, Copy, PartialEq, Eq)]
11pub struct Span {
12    /// Starting position of the span in bytes
13    start: Pos,
14    /// Length of the span in bytes
15    len: usize,
16}
17
18impl Span {
19    /// Creates a new span with the given start position and length
20    pub fn new(start: Pos, len: usize) -> Self {
21        Span { start, len }
22    }
23    /// Start position of the span
24    pub fn start(&self) -> Pos {
25        self.start
26    }
27    /// Length of the span
28    pub fn len(&self) -> usize {
29        self.len
30    }
31    /// Returns `true` if this span has zero length
32    pub fn is_empty(&self) -> bool {
33        self.len == 0
34    }
35    /// End position (start + length)
36    pub fn end(&self) -> Pos {
37        self.start + self.len
38    }
39}
40
41/// A value of type `T` annotated with its `Span`
42#[derive(Debug, Clone, PartialEq, Eq)]
43pub struct Spanned<T> {
44    /// The actual data/value being wrapped
45    pub node: T,
46    /// The span information indicating the position and length in the source
47    pub span: Span,
48}
49
50/// Error encountered during tokenization
51#[derive(Debug, Clone, PartialEq)]
52pub struct TokenError {
53    /// The specific type of error that occurred during tokenization
54    pub kind: TokenErrorKind,
55    /// The location in the source where the error occurred
56    pub span: Span,
57}
58
/// The different ways tokenization can fail
#[derive(Clone, Debug, PartialEq)]
pub enum TokenErrorKind {
    /// A character appeared where it is not allowed
    UnexpectedCharacter(char),
    /// The input ended too early; the payload names the context
    UnexpectedEof(&'static str),
    /// The bytes do not form valid UTF-8; the payload carries the detail
    InvalidUtf8(String),
    /// A number does not fit the target representation
    NumberOutOfRange(f64),
}
71
72impl Display for TokenErrorKind {
73    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
74        match self {
75            TokenErrorKind::UnexpectedCharacter(c) => write!(f, "unexpected character: '{}'", c),
76            TokenErrorKind::UnexpectedEof(context) => write!(f, "unexpected EOF {}", context),
77            TokenErrorKind::InvalidUtf8(detail) => write!(f, "invalid UTF-8: {}", detail),
78            TokenErrorKind::NumberOutOfRange(n) => write!(f, "number out of range: {}", n),
79        }
80    }
81}
82
83/// Tokenization result, yielding a spanned token
84pub type TokenizeResult = Result<Spanned<Token>, TokenError>;
85
/// The lexical units of JSON, without any position information
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// Opening brace: '{'
    LBrace,
    /// Closing brace: '}'
    RBrace,
    /// Opening bracket: '['
    LBracket,
    /// Closing bracket: ']'
    RBracket,
    /// Colon: ':'
    Colon,
    /// Comma: ','
    Comma,
    /// A JSON string value (todo: should become a `Cow`)
    String(String),
    /// A 64-bit float, used when the number has a decimal point
    F64(f64),
    /// A signed 64-bit integer, used when the number has a sign but no decimal point
    I64(i64),
    /// An unsigned 64-bit integer, used when the number has neither a sign nor a decimal point
    U64(u64),
    /// The boolean literal 'true'
    True,
    /// The boolean literal 'false'
    False,
    /// The literal 'null'
    Null,
    /// Marks the end of the input
    EOF,
}
118
119use core::fmt::{self, Display, Formatter};
120
121impl Display for Token {
122    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
123        match self {
124            Token::LBrace => write!(f, "{{"),
125            Token::RBrace => write!(f, "}}"),
126            Token::LBracket => write!(f, "["),
127            Token::RBracket => write!(f, "]"),
128            Token::Colon => write!(f, ":"),
129            Token::Comma => write!(f, ","),
130            Token::String(s) => write!(f, "\"{}\"", s),
131            Token::F64(n) => write!(f, "{}", n),
132            Token::I64(n) => write!(f, "{}", n),
133            Token::U64(n) => write!(f, "{}", n),
134            Token::True => write!(f, "true"),
135            Token::False => write!(f, "false"),
136            Token::Null => write!(f, "null"),
137            Token::EOF => write!(f, "EOF"),
138        }
139    }
140}
141
142/// Simple JSON tokenizer producing spanned tokens from byte input.
143pub struct Tokenizer<'input> {
144    input: &'input [u8],
145    pos: Pos,
146}
147
148impl<'input> Tokenizer<'input> {
149    /// Create a new tokenizer for the given input slice.
150    pub fn new(input: &'input [u8]) -> Self {
151        Tokenizer { input, pos: 0 }
152    }
153
154    /// Current cursor position in the input
155    pub fn position(&self) -> Pos {
156        self.pos
157    }
158
159    /// Return the next spanned token or a TokenizeError
160    pub fn next_token(&mut self) -> TokenizeResult {
161        self.skip_whitespace();
162        let start = self.pos;
163        let c = match self.input.get(self.pos).copied() {
164            Some(c) => c,
165            None => {
166                // EOF at this position
167                let span = Span::new(self.pos, 0);
168                return Ok(Spanned {
169                    node: Token::EOF,
170                    span,
171                });
172            }
173        };
174        let sp = match c {
175            b'{' => {
176                self.pos += 1;
177                Spanned {
178                    node: Token::LBrace,
179                    span: Span::new(start, 1),
180                }
181            }
182            b'}' => {
183                self.pos += 1;
184                Spanned {
185                    node: Token::RBrace,
186                    span: Span::new(start, 1),
187                }
188            }
189            b'[' => {
190                self.pos += 1;
191                Spanned {
192                    node: Token::LBracket,
193                    span: Span::new(start, 1),
194                }
195            }
196            b']' => {
197                self.pos += 1;
198                Spanned {
199                    node: Token::RBracket,
200                    span: Span::new(start, 1),
201                }
202            }
203            b':' => {
204                self.pos += 1;
205                Spanned {
206                    node: Token::Colon,
207                    span: Span::new(start, 1),
208                }
209            }
210            b',' => {
211                self.pos += 1;
212                Spanned {
213                    node: Token::Comma,
214                    span: Span::new(start, 1),
215                }
216            }
217            b'"' => return self.parse_string(start),
218            b'-' | b'0'..=b'9' => return self.parse_number(start),
219            b't' => return self.parse_literal(start, b"true", || Token::True),
220            b'f' => return self.parse_literal(start, b"false", || Token::False),
221            b'n' => return self.parse_literal(start, b"null", || Token::Null),
222            _ => {
223                return Err(TokenError {
224                    kind: TokenErrorKind::UnexpectedCharacter(c as char),
225                    span: Span::new(start, 1),
226                });
227            }
228        };
229        Ok(sp)
230    }
231
232    /// Skip whitespace characters
233    fn skip_whitespace(&mut self) {
234        while let Some(&b) = self.input.get(self.pos) {
235            match b {
236                b' ' | b'\t' | b'\n' | b'\r' => self.pos += 1,
237                _ => break,
238            }
239        }
240    }
241
242    fn parse_string(&mut self, start: Pos) -> TokenizeResult {
243        // Skip opening quote
244        self.pos += 1;
245        let mut buf = Vec::new();
246        let content_start = self.pos;
247
248        while let Some(&b) = self.input.get(self.pos) {
249            match b {
250                b'"' => {
251                    self.pos += 1;
252                    break;
253                }
254                b'\\' => {
255                    self.pos += 1;
256                    if let Some(&esc) = self.input.get(self.pos) {
257                        match esc {
258                            b'"' | b'\\' | b'/' => buf.push(esc),
259                            b'b' => buf.push(b'\x08'), // backspace
260                            b'f' => buf.push(b'\x0C'), // form feed
261                            b'n' => buf.push(b'\n'),   // line feed
262                            b'r' => buf.push(b'\r'),   // carriage return
263                            b't' => buf.push(b'\t'),   // tab
264                            _ => buf.push(esc), // other escapes (should handle \uXXXX properly)
265                        }
266                        self.pos += 1;
267                    } else {
268                        return Err(TokenError {
269                            kind: TokenErrorKind::UnexpectedEof("in string escape"),
270                            span: Span::new(self.pos, 0),
271                        });
272                    }
273                }
274                _ => {
275                    buf.push(b);
276                    self.pos += 1;
277                }
278            }
279        }
280
281        // Check if we reached the end without finding a closing quote
282        if self.pos > self.input.len()
283            || (self.pos == self.input.len() && self.input[self.pos - 1] != b'"')
284        {
285            return Err(TokenError {
286                kind: TokenErrorKind::UnexpectedEof("in string literal"),
287                span: Span::new(start, self.pos - start),
288            });
289        }
290
291        let s = match str::from_utf8(&buf) {
292            Ok(st) => st.to_string(),
293            Err(e) => {
294                return Err(TokenError {
295                    kind: TokenErrorKind::InvalidUtf8(e.to_string()),
296                    span: Span::new(content_start, buf.len()),
297                });
298            }
299        };
300
301        let len = self.pos - start;
302        let span = Span::new(start, len);
303        Ok(Spanned {
304            node: Token::String(s),
305            span,
306        })
307    }
308
309    fn parse_number(&mut self, start: Pos) -> TokenizeResult {
310        let mut end = self.pos;
311        if self.input[end] == b'-' {
312            end += 1;
313        }
314        while end < self.input.len() && self.input[end].is_ascii_digit() {
315            end += 1;
316        }
317        if end < self.input.len() && self.input[end] == b'.' {
318            end += 1;
319            while end < self.input.len() && self.input[end].is_ascii_digit() {
320                end += 1;
321            }
322        }
323        if end < self.input.len() && (self.input[end] == b'e' || self.input[end] == b'E') {
324            end += 1;
325            if end < self.input.len() && (self.input[end] == b'+' || self.input[end] == b'-') {
326                end += 1;
327            }
328            while end < self.input.len() && self.input[end].is_ascii_digit() {
329                end += 1;
330            }
331        }
332        let slice = &self.input[start..end];
333        let span = Span::new(start, end - start);
334
335        let text = match str::from_utf8(slice) {
336            Ok(t) => t,
337            Err(e) => {
338                return Err(TokenError {
339                    kind: TokenErrorKind::InvalidUtf8(e.to_string()),
340                    span,
341                });
342            }
343        };
344
345        let token = if text.contains('.') || text.contains('e') || text.contains('E') {
346            // If the number contains a decimal point or exponent, parse as f64
347            match text.parse::<f64>() {
348                Ok(n) => Token::F64(n),
349                Err(_) => {
350                    return Err(TokenError {
351                        kind: TokenErrorKind::NumberOutOfRange(0.0),
352                        span,
353                    });
354                }
355            }
356        } else if text.starts_with('-') {
357            // If the number starts with a negative sign, parse as i64
358            match text.parse::<i64>() {
359                Ok(n) => Token::I64(n),
360                Err(_) => {
361                    // If i64 parsing fails, try to parse as f64 for error reporting
362                    let num = text.parse::<f64>().unwrap_or(0.0);
363                    return Err(TokenError {
364                        kind: TokenErrorKind::NumberOutOfRange(num),
365                        span,
366                    });
367                }
368            }
369        } else {
370            // Otherwise, parse as u64
371            match text.parse::<u64>() {
372                Ok(n) => Token::U64(n),
373                Err(_) => {
374                    // If u64 parsing fails, try to parse as f64 for error reporting
375                    let num = text.parse::<f64>().unwrap_or(0.0);
376                    return Err(TokenError {
377                        kind: TokenErrorKind::NumberOutOfRange(num),
378                        span,
379                    });
380                }
381            }
382        };
383
384        self.pos = end;
385        Ok(Spanned { node: token, span })
386    }
387
388    fn parse_literal<F>(&mut self, start: Pos, pat: &[u8], ctor: F) -> TokenizeResult
389    where
390        F: FnOnce() -> Token,
391    {
392        let end = start + pat.len();
393        if end <= self.input.len() && &self.input[start..end] == pat {
394            self.pos = end;
395            let span = Span::new(start, pat.len());
396            Ok(Spanned { node: ctor(), span })
397        } else {
398            // Determine how much of the pattern matched before mismatch
399            let actual_len = self.input.len().saturating_sub(start).min(pat.len());
400            let span = Span::new(start, actual_len.max(1)); // Ensure span covers at least one character
401
402            let got = self.input.get(start).copied().unwrap_or(b'?') as char;
403            Err(TokenError {
404                kind: TokenErrorKind::UnexpectedCharacter(got),
405                span,
406            })
407        }
408    }
409}