// oak_json/lexer/mod.rs

1#![doc = include_str!("readme.md")]
2/// Token types for JSON.
3pub mod token_type;
4
5use crate::{language::JsonLanguage, lexer::token_type::JsonTokenType};
6use oak_core::{
7    errors::OakError,
8    lexer::{CommentConfig, LexOutput, Lexer, LexerCache, LexerState, StringConfig},
9    source::{Source, TextEdit},
10};
11use std::sync::LazyLock;
12
13pub(crate) type State<'a, S> = LexerState<'a, S, JsonLanguage>;
14
15static JSON_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: false });
16static JSON_SINGLE_QUOTE_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
17
/// Lexer for JSON.
///
/// Holds a borrowed [`JsonLanguage`] configuration whose flags (`comments`,
/// `single_quotes`, `bare_keys`, `hex_numbers`) enable JSON5-style extensions
/// during lexing.
#[derive(Clone)]
pub struct JsonLexer<'config> {
    // Dialect configuration consulted by `run` and the number lexer.
    config: &'config JsonLanguage,
}
23
24impl<'config> Lexer<JsonLanguage> for JsonLexer<'config> {
25    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<JsonLanguage>) -> LexOutput<JsonLanguage> {
26        let mut state = State::new(source);
27        let result = self.run(&mut state);
28        if result.is_ok() {
29            state.add_eof();
30        }
31        state.finish_with_cache(result, cache)
32    }
33}
34
35impl<'config> JsonLexer<'config> {
36    /// Creates a new `JsonLexer` with the given language configuration.
37    pub fn new(config: &'config JsonLanguage) -> Self {
38        Self { config }
39    }
40
41    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
42        while state.not_at_end() {
43            let safe_point = state.get_position();
44            let Some(ch) = state.peek()
45            else {
46                break;
47            };
48
49            match ch {
50                ' ' | '\t' | '\n' | '\r' => {
51                    self.skip_whitespace_fast(state);
52                }
53                '"' => {
54                    self.lex_string_fast(state);
55                }
56                '/' if self.config.comments => {
57                    JSON_COMMENT.scan(state, JsonTokenType::Comment, JsonTokenType::Comment);
58                }
59                '-' | '0'..='9' => {
60                    self.lex_number(state);
61                }
62                '{' | '}' | '[' | ']' | ',' | ':' => {
63                    self.lex_operator_or_delimiter(state);
64                }
65                't' | 'f' | 'n' => {
66                    if !self.lex_keyword(state) {
67                        if self.config.bare_keys {
68                            self.lex_bare_key(state);
69                        }
70                    }
71                }
72                '\'' if self.config.single_quotes => {
73                    JSON_SINGLE_QUOTE_STRING.scan(state, JsonTokenType::StringLiteral);
74                }
75                _ => {
76                    let mut handled = false;
77                    if self.config.bare_keys && (ch.is_alphabetic() || ch == '_' || ch == '$') {
78                        handled = self.lex_bare_key(state);
79                    }
80
81                    if !handled {
82                        // If no rules match, skip current character and mark as error
83                        state.advance(ch.len_utf8());
84                        state.add_token(JsonTokenType::Error, safe_point, state.get_position());
85                    }
86                }
87            }
88
89            state.advance_if_dead_lock(safe_point);
90        }
91
92        Ok(())
93    }
94
95    /// Handles number literals.
96    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
97        let start_pos = state.get_position();
98
99        // Handle negative sign
100        state.consume_if_starts_with("-");
101
102        let mut has_digits = false;
103
104        // Handle hexadecimal numbers (if allowed)
105        if self.config.hex_numbers && state.starts_with("0") {
106            let n1 = state.peek_next_n(1);
107            if n1 == Some('x') || n1 == Some('X') {
108                state.advance(2); // Skip '0x'
109                let range = state.take_while(|c| c.is_ascii_hexdigit() || c == '_');
110                if range.end > range.start {
111                    state.add_token(JsonTokenType::NumberLiteral, start_pos, state.get_position());
112                    return true;
113                }
114                // Fallback to decimal handling if no hex digits
115            }
116        }
117
118        // Handle integer part
119        let r1 = state.take_while(|c| c.is_ascii_digit());
120        if r1.end > r1.start {
121            has_digits = true;
122        }
123
124        // Handle decimal point and fractional part
125        if state.consume_if_starts_with(".") {
126            let r2 = state.take_while(|c| c.is_ascii_digit());
127            if r2.end > r2.start {
128                has_digits = true;
129            }
130        }
131
132        // Handle scientific notation
133        if let Some(ch) = state.peek() {
134            if ch == 'e' || ch == 'E' {
135                state.advance(1);
136                if let Some(sign) = state.peek() {
137                    if sign == '+' || sign == '-' {
138                        state.advance(1);
139                    }
140                }
141                state.take_while(|c| c.is_ascii_digit());
142            }
143        }
144
145        if has_digits {
146            state.add_token(JsonTokenType::NumberLiteral, start_pos, state.get_position());
147            true
148        }
149        else {
150            false
151        }
152    }
153
154    /// Handles keywords (true, false, null).
155    fn lex_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
156        let start_pos = state.get_position();
157        if state.consume_if_starts_with("true") || state.consume_if_starts_with("false") {
158            state.add_token(JsonTokenType::BooleanLiteral, start_pos, state.get_position());
159            return true;
160        }
161        if state.consume_if_starts_with("null") {
162            state.add_token(JsonTokenType::NullLiteral, start_pos, state.get_position());
163            return true;
164        }
165        false
166    }
167
168    /// Handles bare keys (JSON5 feature).
169    fn lex_bare_key<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
170        let start_pos = state.get_position();
171        if let Some(ch) = state.peek() {
172            if ch.is_alphabetic() || ch == '_' || ch == '$' {
173                state.advance(ch.len_utf8());
174                state.take_while(|c| c.is_alphanumeric() || c == '_' || c == '$');
175                state.add_token(JsonTokenType::BareKey, start_pos, state.get_position());
176                return true;
177            }
178        }
179        false
180    }
181
182    /// Handles operators and delimiters.
183    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
184        let start_pos = state.get_position();
185        if let Some(ch) = state.peek() {
186            let token_kind = match ch {
187                '{' => JsonTokenType::LeftBrace,
188                '}' => JsonTokenType::RightBrace,
189                '[' => JsonTokenType::LeftBracket,
190                ']' => JsonTokenType::RightBracket,
191                ',' => JsonTokenType::Comma,
192                ':' => JsonTokenType::Colon,
193                _ => return false,
194            };
195
196            state.advance(ch.len_utf8());
197            state.add_token(token_kind, start_pos, state.get_position());
198            true
199        }
200        else {
201            false
202        }
203    }
204
205    fn skip_whitespace_fast<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
206        let start_pos = state.get_position();
207        let mut count = 0;
208        while let Some(ch) = state.peek() {
209            if ch.is_whitespace() {
210                state.advance(ch.len_utf8());
211                count += 1;
212            }
213            else {
214                break;
215            }
216        }
217        if count > 0 {
218            state.add_token(JsonTokenType::Whitespace, start_pos, state.get_position());
219            true
220        }
221        else {
222            false
223        }
224    }
225
226    fn lex_string_fast<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
227        let start_pos = state.get_position();
228        if !state.consume_if_starts_with("\"") {
229            return false;
230        }
231
232        let mut escaped = false;
233        while let Some(ch) = state.peek() {
234            state.advance(ch.len_utf8());
235            if escaped {
236                escaped = false;
237                continue;
238            }
239            if ch == '\\' {
240                escaped = true;
241                continue;
242            }
243            if ch == '"' {
244                state.add_token(JsonTokenType::StringLiteral, start_pos, state.get_position());
245                return true;
246            }
247        }
248        // Unclosed string
249        state.add_token(JsonTokenType::Error, start_pos, state.get_position());
250        false
251    }
252}