json_fixer/jsonfixer/
json_tokenizer.rs

1//! JSON tokenizer module that converts input text into a stream of JSON tokens.
2//!
3//! This module handles the lexical analysis of JSON input, including support for
4//! various numeric formats, string escape sequences, and tracking of position information.
5
6use std::iter::Peekable;
7use std::str::Chars;
8
9use super::jsonfixer_error::{SyntaxError, JsonFixerError};
10
/// Represents a position in the input text.
///
/// The type is a pair of plain counters, so it is `Copy` and can be handed
/// around by value without explicit clones.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Position {
    /// Line number (1-based).
    pub line: usize,
    /// Column counter at the moment the position was captured.
    ///
    /// `JsonTokenizer` in this file initializes the counter to 0 and bumps it
    /// for each character it consumes before recording a position, so the
    /// first character of the input is reported at column 1.
    pub column: usize,
}
19
20#[derive(Debug, PartialEq, Clone)]
21pub enum Token {
22    LeftBrace(Position),      // '{'
23    RightBrace(Position),     // '}'
24    LeftBracket(Position),    // '['
25    RightBracket(Position),   // ']'
26    Colon(Position),          // ':'
27    Comma(Position),          // ','
28    String(String, Position), // JSON string
29    Number(String, Position), // JSON number will kept as string to preserve the numbers like 1e5
30    Boolean(bool, Position),  // true or false
31    Null(Position),
32    Whitespace(String, Position),     // null
33    UnquotedString(String, Position), // null
34}
35
36impl Token {
37    /// Converts the token to its string representation.
38    pub fn get(&self) -> String {
39        match self {
40            Self::LeftBrace(_) => "'{'".to_string(),
41            Self::RightBrace(_) => "'}'".to_string(),
42            Self::LeftBracket(_) => "'['".to_string(),
43            Self::RightBracket(_) => "']'".to_string(),
44            Self::Colon(_) => "':'".to_string(),
45            Self::Comma(_) => "','".to_string(),
46            Self::String(s, _) => format!("String({s})"),
47            Self::Number(n, _) => format!("Number({n})"),
48            Self::Boolean(b, _) => format!("Boolean({b})"),
49            Self::Null(_) => "null".to_string(),
50            Self::Whitespace(s, _) => format!("{}", s),
51            Self::UnquotedString(s, _) => format!("{}", s),
52        }
53    }
54    pub fn pos(&self) -> &Position {
55        match self {
56            Self::LeftBrace(pos) => pos,
57            Self::RightBrace(pos) => pos,
58            Self::LeftBracket(pos) => pos,
59            Self::RightBracket(pos) => pos,
60            Self::Colon(pos) => pos,
61            Self::Comma(pos) => pos,
62            Self::String(_, pos) => pos,
63            Self::Number(_, pos) => pos,
64            Self::Boolean(_, pos) => pos,
65            Self::Null(pos) => pos,
66            Self::Whitespace(_, pos) => pos,
67            Self::UnquotedString(_, pos) => pos,
68        }
69    }
70}
71
72/// Tokenizer that converts JSON input text into a stream of tokens.
73pub struct JsonTokenizer<'a> {
74    input: Peekable<Chars<'a>>,
75    line: usize,
76    column: usize,
77}
78
79impl<'a> JsonTokenizer<'a> {
80    /// Creates a new tokenizer instance.
81    pub fn new(input: &'a str) -> Self {
82        Self {
83            input: input.chars().peekable(),
84            line: 1,
85            column: 0,
86        }
87    }
88
89    /// Returns the next token from the input stream.
90    ///
91    /// # Errors
92    ///
93    /// Returns `JsonFixerError` if an invalid token is encountered.
94    pub fn next_token(&mut self) -> Result<Option<Token>, JsonFixerError> {
95        if let Some(ch) = self.advance() {
96            match ch {
97                ch if ch.is_whitespace() => self.tokenize_whitespaces(ch).map(Some),
98                '{' => Ok(Some(Token::LeftBrace(self.current_position()))),
99                '}' => Ok(Some(Token::RightBrace(self.current_position()))),
100                '[' => Ok(Some(Token::LeftBracket(self.current_position()))),
101                ']' => Ok(Some(Token::RightBracket(self.current_position()))),
102                ':' => Ok(Some(Token::Colon(self.current_position()))),
103                ',' => Ok(Some(Token::Comma(self.current_position()))),
104                '\'' | '"' => self.tokenize_string(ch).map(Some),
105                '.' | '+' | '-' | '0'..='9' => self.tokenize_number(ch).map(Some),
106                'a'..='z' | 'A'..='Z' | '_' => self.tokenize_identifier(ch).map(Some),
107                ch => Err(JsonFixerError::Syntax(SyntaxError::UnexpectedCharacter(
108                    ch,
109                    Position {
110                        line: self.line,
111                        column: self.column,
112                    },
113                ))),
114            }
115        } else {
116            Ok(None)
117        }
118    }
119
120    fn tokenize_whitespaces(&mut self, first_space: char) -> Result<Token, JsonFixerError> {
121        let start_pos = self.current_position();
122        let mut whitespaces = String::new();
123        whitespaces.push(first_space);
124
125        while let Some(next_ch) = self.input.peek() {
126            if !next_ch.is_whitespace() {
127                break;
128            }
129
130            whitespaces.push(self.advance().unwrap());
131        }
132
133        Ok(Token::Whitespace(whitespaces, start_pos))
134    }
135
136    fn peek(&mut self) -> Option<&char> {
137        self.input.peek()
138    }
139    fn advance(&mut self) -> Option<char> {
140        if let Some(ch) = self.input.next() {
141            self.column += 1;
142
143            if ch == '\n' {
144                self.line += 1;
145                self.column = 1;
146            }
147            Some(ch)
148        } else {
149            None
150        }
151    }
152    pub fn current_position(&self) -> Position {
153        Position {
154            line: self.line,
155            column: self.column,
156        }
157    }
158    fn tokenize_string(&mut self, quote_char: char) -> Result<Token, JsonFixerError> {
159        let start_pos = self.current_position();
160        let mut result = String::new();
161
162        while let Some(ch) = self.advance() {
163            match ch {
164                ch if ch == quote_char => return Ok(Token::String(result, start_pos)),
165                '\\' => {
166                    if let Some(next_ch) = self.advance() {
167                        match next_ch {
168                            '"' | '\\' | '/' => result.push(next_ch),
169                            // handle controle characters
170                            'b' => result.push('\x08'), // \b = backspace
171                            'f' => result.push('\x0C'),
172                            'n' => result.push('\n'),
173                            'r' => result.push('\r'),
174                            't' => result.push('\t'),
175                            'u' => {
176                                // Handle unicode escape sequences
177                                let mut hex = String::with_capacity(4);
178                                for _ in 0..4 {
179                                    if let Some(h) = self.advance() {
180                                        hex.push(h);
181                                    }
182                                }
183                                if let Ok(code) = u32::from_str_radix(&hex, 16) {
184                                    if let Some(chr) = std::char::from_u32(code) {
185                                        result.push(chr);
186                                    }
187                                }
188                            }
189                            _ => result.push(next_ch),
190                        }
191                    }
192                }
193                _ => result.push(ch),
194            }
195        }
196        Err(JsonFixerError::Syntax(SyntaxError::UnmatchedQuotes(
197            start_pos,
198        ))) // placeholder
199    }
200
201    fn tokenize_number(&mut self, first_char: char) -> Result<Token, JsonFixerError> {
202        let start_pos = self.current_position();
203        let mut number = String::from(first_char);
204
205        // Handle numbers that start with plus
206        if first_char == '+' || first_char == '.' {
207            // If there is no digit after +, it's invalid
208            if let Some(next_char) = self.peek() {
209                if !next_char.is_digit(10) {
210                    return Err(JsonFixerError::Syntax(SyntaxError::InvalidNumber(
211                        number, start_pos,
212                    )));
213                }
214            } else {
215                return Err(JsonFixerError::Syntax(SyntaxError::InvalidNumber(
216                    number, start_pos,
217                )));
218            }
219
220            if first_char == '+' {
221                // Remove the +
222                number.clear();
223            }
224
225            if first_char == '.' {
226                // Add 0 before the . eg. .123 -> 0.123
227                number.clear();
228                number.push('0');
229                number.push('.');
230            }
231        }
232
233        let mut multi_dots = false;
234        while let Some(&ch) = self.peek() {
235            if !ch.is_digit(10) && ch != '.' && ch != 'e' && ch != 'E' && ch != '+' && ch != '-' {
236                break;
237            }
238            if first_char == '.' && ch == '.' {
239                // Cannot accept two dots, a first dot already accepted
240                multi_dots = true;
241            }
242
243            number.push(self.advance().unwrap());
244        }
245
246        // it's a number that includes many dots
247        if multi_dots {
248            return Err(JsonFixerError::Syntax(SyntaxError::InvalidNumber(
249                number, start_pos,
250            )));
251        }
252
253        if number.chars().last().unwrap() == '.' {
254            // remove the .
255            number.pop();
256        }
257
258        Ok(Token::Number(number, self.current_position()))
259    }
260
261    fn tokenize_identifier(&mut self, first_char: char) -> Result<Token, JsonFixerError> {
262        let start_pos = self.current_position();
263        let mut ident = String::from(first_char);
264        while let Some(&ch) = self.input.peek() {
265            if !ch.is_alphanumeric() && ch != '_' {
266                break;
267            }
268
269            ident.push(self.advance().unwrap());
270        }
271
272        match ident.as_str() {
273            "true" => Ok(Token::Boolean(true, start_pos)),
274            "false" => Ok(Token::Boolean(false, start_pos)),
275            "null" => Ok(Token::Null(start_pos)),
276            _ => Ok(Token::UnquotedString(ident, start_pos)),
277        }
278    }
279}