// oak_json/lexer/mod.rs

1use crate::{kind::JsonSyntaxKind, language::JsonLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, SourceText, lexer::LexOutput, source::Source};
3
/// Shorthand for the shared lexer state specialized to the JSON language.
type State<S> = LexerState<S, JsonLanguage>;
5
/// JSON lexer, parameterized by a borrowed [`JsonLanguage`] configuration
/// holding dialect flags (`comments`, `single_quotes`, `bare_keys`,
/// `hex_numbers`) that individual lex rules consult.
#[derive(Clone)]
pub struct JsonLexer<'config> {
    // Dialect/feature configuration; only read, never mutated.
    config: &'config JsonLanguage,
}
11
12impl<'config> JsonLexer<'config> {
13    pub fn new(config: &'config JsonLanguage) -> Self {
14        Self { config }
15    }
16
    /// Backward-compatibility alias: forwards a [`SourceText`] to [`Lexer::lex`].
    pub fn tokenize_source(&self, source: &SourceText) -> LexOutput<JsonLanguage> {
        self.lex(source)
    }
21
22    /// 跳过空白字符
23    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
24        let start_pos = state.get_position();
25
26        while let Some(ch) = state.peek() {
27            if ch == ' ' || ch == '\t' {
28                state.advance(ch.len_utf8());
29            }
30            else {
31                break;
32            }
33        }
34
35        if state.get_position() > start_pos {
36            state.add_token(JsonSyntaxKind::Whitespace, start_pos, state.get_position());
37            true
38        }
39        else {
40            false
41        }
42    }
43
44    /// 处理换行
45    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
46        let start_pos = state.get_position();
47
48        if let Some('\n') = state.peek() {
49            state.advance(1);
50            state.add_token(JsonSyntaxKind::Whitespace, start_pos, state.get_position());
51            true
52        }
53        else if let Some('\r') = state.peek() {
54            state.advance(1);
55            if let Some('\n') = state.peek() {
56                state.advance(1);
57            }
58            state.add_token(JsonSyntaxKind::Whitespace, start_pos, state.get_position());
59            true
60        }
61        else {
62            false
63        }
64    }
65
66    /// 处理注释
67    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
68        if !self.config.comments {
69            return false;
70        }
71
72        let start_pos = state.get_position();
73
74        if let Some('/') = state.peek() {
75            // 检查下一个字符
76            let remaining_text = state.get_text_in((start_pos..state.length()).into());
77            if remaining_text.len() > 1 {
78                let next_ch = remaining_text.chars().nth(1).unwrap();
79                match next_ch {
80                    '/' => {
81                        // 单行注释
82                        state.advance(2); // 跳过 '//'
83
84                        // 读取到行
85                        while let Some(ch) = state.peek() {
86                            if ch == '\n' || ch == '\r' {
87                                break;
88                            }
89                            state.advance(ch.len_utf8());
90                        }
91
92                        state.add_token(JsonSyntaxKind::Comment, start_pos, state.get_position());
93                        return true;
94                    }
95                    '*' => {
96                        // 多行注释
97                        state.advance(2); // 跳过 '/*'
98                        let mut closed = false;
99
100                        while let Some(ch) = state.peek() {
101                            if ch == '*' {
102                                let current_pos = state.get_position();
103                                let remaining = state.get_text_in((current_pos..state.length()).into());
104                                if remaining.len() > 1 && remaining.chars().nth(1) == Some('/') {
105                                    state.advance(2); // 跳过 '*/'
106                                    closed = true;
107                                    break;
108                                }
109                            }
110                            state.advance(ch.len_utf8());
111                        }
112
113                        if !closed {
114                            // 未闭合的注释,添加错误但仍然创建 kind
115                        }
116
117                        state.add_token(JsonSyntaxKind::Comment, start_pos, state.get_position());
118                        return true;
119                    }
120                    _ => {}
121                }
122            }
123        }
124        false
125    }
126
127    /// 处理字符串字面量
128    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
129        let start_pos = state.get_position();
130
131        let quote_char = if let Some('"') = state.peek() {
132            '"'
133        }
134        else if self.config.single_quotes && matches!(state.peek(), Some('\'')) {
135            '\''
136        }
137        else {
138            return false;
139        };
140
141        state.advance(quote_char.len_utf8()); // 跳过开始的引号
142        let mut escaped = false;
143
144        while let Some(ch) = state.peek() {
145            if escaped {
146                escaped = false;
147                state.advance(ch.len_utf8());
148            }
149            else if ch == '\\' {
150                escaped = true;
151                state.advance(ch.len_utf8());
152            }
153            else if ch == quote_char {
154                state.advance(ch.len_utf8()); // 跳过结束的引
155                break;
156            }
157            else if ch == '\n' || ch == '\r' {
158                // 字符串不能跨
159                break;
160            }
161            else {
162                state.advance(ch.len_utf8());
163            }
164        }
165
166        state.add_token(JsonSyntaxKind::StringLiteral, start_pos, state.get_position());
167        true
168    }
169
    /// Lex a number literal: optional minus, integer part, optional fraction,
    /// optional exponent, and (when the dialect allows it) `0x`/`0X` hex
    /// literals.
    ///
    /// On failure the cursor is rolled back to `start_pos`, so a lone `-` or
    /// a bare `0x` with no digits is left for the other rules / the error
    /// fallback.
    ///
    /// NOTE(review): a dangling exponent such as `1e` or `1e+` is folded into
    /// the `NumberLiteral` token with no exponent digits — confirm whether the
    /// parser is expected to reject that form.
    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
        let start_pos = state.get_position();

        // Optional leading minus sign.
        if let Some('-') = state.peek() {
            state.advance(1);
        }

        let mut has_digits = false;

        // Hexadecimal literals, only when enabled by the configuration.
        if self.config.hex_numbers
            && let Some('0') = state.peek()
        {
            if let Some(next_ch) = state.peek_next_n(1) {
                if next_ch == 'x' || next_ch == 'X' {
                    state.advance(2); // skip '0x'
                    while let Some(ch) = state.peek() {
                        if ch.is_ascii_hexdigit() {
                            has_digits = true;
                            state.advance(1);
                        }
                        else {
                            break;
                        }
                    }

                    if has_digits {
                        state.add_token(JsonSyntaxKind::NumberLiteral, start_pos, state.get_position());
                        return true;
                    }
                    else {
                        // '0x' with no hex digits: roll back to the start.
                        state.set_position(start_pos);
                        return false;
                    }
                }
            }
        }

        // Integer part.
        while let Some(ch) = state.peek() {
            if ch.is_ascii_digit() {
                has_digits = true;
                state.advance(1);
            }
            else {
                break;
            }
        }

        // Decimal point and fraction part.
        if let Some('.') = state.peek() {
            state.advance(1);
            while let Some(ch) = state.peek() {
                if ch.is_ascii_digit() {
                    has_digits = true;
                    state.advance(1);
                }
                else {
                    break;
                }
            }
        }

        // Exponent part: 'e'/'E', optional sign, then digits.
        if let Some(ch) = state.peek() {
            if ch == 'e' || ch == 'E' {
                state.advance(1);
                if let Some(sign) = state.peek() {
                    if sign == '+' || sign == '-' {
                        state.advance(1);
                    }
                }
                while let Some(digit) = state.peek() {
                    if digit.is_ascii_digit() {
                        state.advance(1);
                    }
                    else {
                        break;
                    }
                }
            }
        }

        if has_digits && state.get_position() > start_pos {
            state.add_token(JsonSyntaxKind::NumberLiteral, start_pos, state.get_position());
            true
        }
        else {
            // Nothing numeric consumed: roll back so the char can be re-lexed.
            state.set_position(start_pos);
            false
        }
    }
266
267    /// 处理布尔值和 null
268    fn lex_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
269        let start_pos = state.get_position();
270
271        // Check for "true"
272        if start_pos + 4 <= state.length() {
273            let text = state.get_text_in((start_pos..start_pos + 4).into());
274            if text == "true" {
275                state.advance(4);
276                state.add_token(JsonSyntaxKind::BooleanLiteral, start_pos, state.get_position());
277                return true;
278            }
279        }
280
281        // Check for "false"
282        if start_pos + 5 <= state.length() {
283            let text = state.get_text_in((start_pos..start_pos + 5).into());
284            if text == "false" {
285                state.advance(5);
286                state.add_token(JsonSyntaxKind::BooleanLiteral, start_pos, state.get_position());
287                return true;
288            }
289        }
290
291        // Check for "null"
292        if start_pos + 4 <= state.length() {
293            let text = state.get_text_in((start_pos..start_pos + 4).into());
294            if text == "null" {
295                state.advance(4);
296                state.add_token(JsonSyntaxKind::NullLiteral, start_pos, state.get_position());
297                return true;
298            }
299        }
300
301        false
302    }
303
304    /// 处理裸键(JSON5 特性)
305    fn lex_bare_key<S: Source>(&self, state: &mut State<S>) -> bool {
306        if !self.config.bare_keys {
307            return false;
308        }
309
310        let start_pos = state.get_position();
311
312        if let Some(ch) = state.peek() {
313            if ch.is_alphabetic() || ch == '_' || ch == '$' {
314                state.advance(ch.len_utf8());
315
316                // 继续读取标识符字
317                while let Some(ch) = state.peek() {
318                    if ch.is_alphanumeric() || ch == '_' || ch == '$' {
319                        state.advance(ch.len_utf8());
320                    }
321                    else {
322                        break;
323                    }
324                }
325
326                state.add_token(JsonSyntaxKind::BareKey, start_pos, state.get_position());
327                true
328            }
329            else {
330                false
331            }
332        }
333        else {
334            false
335        }
336    }
337
338    /// 处理操作符和分隔
339    fn lex_operator_or_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
340        let start_pos = state.get_position();
341
342        if let Some(ch) = state.peek() {
343            let token_kind = match ch {
344                '{' => JsonSyntaxKind::LeftBrace,
345                '}' => JsonSyntaxKind::RightBrace,
346                '[' => JsonSyntaxKind::LeftBracket,
347                ']' => JsonSyntaxKind::RightBracket,
348                ',' => JsonSyntaxKind::Comma,
349                ':' => JsonSyntaxKind::Colon,
350                _ => return false,
351            };
352
353            state.advance(ch.len_utf8());
354            state.add_token(token_kind, start_pos, state.get_position());
355            true
356        }
357        else {
358            false
359        }
360    }
361}
362
363impl<'config> Lexer<JsonLanguage> for JsonLexer<'config> {
364    fn lex_incremental(
365        &self,
366        source: impl Source,
367        _start_offset: usize,
368        _cache: IncrementalCache<'_, JsonLanguage>,
369    ) -> LexOutput<JsonLanguage> {
370        let mut state = LexerState::new_with_cache(source, _start_offset, _cache);
371        let result = self.run(&mut state);
372        state.finish(result)
373    }
374
375    fn lex(&self, source: impl Source) -> LexOutput<JsonLanguage> {
376        let mut state = LexerState::new(source);
377        let result = self.run(&mut state);
378        state.finish(result)
379    }
380}
381
382impl<'config> JsonLexer<'config> {
383    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
384        while state.not_at_end() {
385            // 尝试各种词法规则
386            if self.skip_whitespace(state) {
387                continue;
388            }
389
390            if self.lex_newline(state) {
391                continue;
392            }
393
394            if self.lex_comment(state) {
395                continue;
396            }
397
398            if self.lex_string_literal(state) {
399                continue;
400            }
401
402            if self.lex_number(state) {
403                continue;
404            }
405
406            if self.lex_keyword(state) {
407                continue;
408            }
409
410            if self.lex_bare_key(state) {
411                continue;
412            }
413
414            if self.lex_operator_or_delimiter(state) {
415                continue;
416            }
417
418            // 如果所有规则都不匹配,跳过当前字符并标记为错误
419            let start_pos = state.get_position();
420            if let Some(ch) = state.peek() {
421                state.advance(ch.len_utf8());
422                state.add_token(JsonSyntaxKind::Error, start_pos, state.get_position());
423            }
424            else {
425                break;
426            }
427        }
428
429        // 添加 EOF kind
430        let eof_pos = state.get_position();
431        state.add_token(JsonSyntaxKind::Eof, eof_pos, eof_pos);
432
433        Ok(())
434    }
435}