oak_json/lexer/
mod.rs

1use crate::{kind::JsonSyntaxKind, language::JsonLanguage};
2use oak_core::{
3    errors::OakError,
4    lexer::{CommentConfig, LexOutput, Lexer, LexerCache, LexerState, StringConfig},
5    source::{Source, TextEdit},
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to the JSON language; `S` is the source type being lexed.
type State<'a, S> = LexerState<'a, S, JsonLanguage>;

/// Comment syntax used when the `comments` option is enabled: `//` line comments and
/// non-nested `/* ... */` block comments.
static JSON_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: false });
/// String syntax used when the `single_quotes` option is enabled: `'`-delimited strings
/// with `\` as the escape character.
static JSON_SINGLE_QUOTE_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
13
/// JSON lexer.
///
/// Borrows a [`JsonLanguage`] configuration for the lifetime `'config`; the lexer itself
/// stores no other state.
#[derive(Clone)]
pub struct JsonLexer<'config> {
    // Language options consulted while lexing (the lexer reads `comments`, `bare_keys`,
    // `single_quotes` and `hex_numbers` from it).
    _config: &'config JsonLanguage,
}
19
20impl<'config> Lexer<JsonLanguage> for JsonLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<JsonLanguage>) -> LexOutput<JsonLanguage> {
22        let mut state = State::new(source);
23        let result = self.run(&mut state);
24        if result.is_ok() {
25            state.add_eof();
26        }
27        state.finish_with_cache(result, cache)
28    }
29}
30
31impl<'config> JsonLexer<'config> {
32    pub fn new(config: &'config JsonLanguage) -> Self {
33        Self { _config: config }
34    }
35
36    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
37        while state.not_at_end() {
38            let safe_point = state.get_position();
39            let Some(ch) = state.peek()
40            else {
41                break;
42            };
43
44            match ch {
45                ' ' | '\t' | '\n' | '\r' => {
46                    self.skip_whitespace_fast(state);
47                }
48                '"' => {
49                    self.lex_string_fast(state);
50                }
51                '/' if self._config.comments => {
52                    JSON_COMMENT.scan(state, JsonSyntaxKind::Comment, JsonSyntaxKind::Comment);
53                }
54                '-' | '0'..='9' => {
55                    self.lex_number(state);
56                }
57                '{' | '}' | '[' | ']' | ',' | ':' => {
58                    self.lex_operator_or_delimiter(state);
59                }
60                't' | 'f' | 'n' => {
61                    if !self.lex_keyword(state) {
62                        if self._config.bare_keys {
63                            self.lex_bare_key(state);
64                        }
65                    }
66                }
67                '\'' if self._config.single_quotes => {
68                    JSON_SINGLE_QUOTE_STRING.scan(state, JsonSyntaxKind::StringLiteral);
69                }
70                _ => {
71                    let mut handled = false;
72                    if self._config.bare_keys && (ch.is_alphabetic() || ch == '_' || ch == '$') {
73                        handled = self.lex_bare_key(state);
74                    }
75
76                    if !handled {
77                        // 如果所有规则都不匹配,跳过当前字符并标记为错误
78                        state.advance(ch.len_utf8());
79                        state.add_token(JsonSyntaxKind::Error, safe_point, state.get_position());
80                    }
81                }
82            }
83
84            state.advance_if_dead_lock(safe_point);
85        }
86
87        Ok(())
88    }
89
90    /// 处理数字字面
91    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
92        let start_pos = state.get_position();
93
94        // 处理负号
95        state.consume_if_starts_with("-");
96
97        let mut has_digits = false;
98
99        // 处理十六进制数字(如果配置允许)
100        if self._config.hex_numbers && state.starts_with("0") {
101            let n1 = state.peek_next_n(1);
102            if n1 == Some('x') || n1 == Some('X') {
103                state.advance(2); // 跳过 '0x'
104                let range = state.take_while(|c| c.is_ascii_hexdigit() || c == '_');
105                if range.end > range.start {
106                    state.add_token(JsonSyntaxKind::NumberLiteral, start_pos, state.get_position());
107                    return true;
108                }
109                // Fallback to decimal handling if no hex digits
110            }
111        }
112
113        // 处理整数部分
114        let r1 = state.take_while(|c| c.is_ascii_digit());
115        if r1.end > r1.start {
116            has_digits = true;
117        }
118
119        // 处理小数点和小数部分
120        if state.consume_if_starts_with(".") {
121            let r2 = state.take_while(|c| c.is_ascii_digit());
122            if r2.end > r2.start {
123                has_digits = true;
124            }
125        }
126
127        // 处理科学计数
128        if let Some(ch) = state.peek() {
129            if ch == 'e' || ch == 'E' {
130                state.advance(1);
131                if let Some(sign) = state.peek() {
132                    if sign == '+' || sign == '-' {
133                        state.advance(1);
134                    }
135                }
136                state.take_while(|c| c.is_ascii_digit());
137            }
138        }
139
140        if has_digits {
141            state.add_token(JsonSyntaxKind::NumberLiteral, start_pos, state.get_position());
142            true
143        }
144        else {
145            false
146        }
147    }
148
149    /// 处理布尔值和 null
150    fn lex_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
151        let start_pos = state.get_position();
152        if state.consume_if_starts_with("true") || state.consume_if_starts_with("false") {
153            state.add_token(JsonSyntaxKind::BooleanLiteral, start_pos, state.get_position());
154            return true;
155        }
156        if state.consume_if_starts_with("null") {
157            state.add_token(JsonSyntaxKind::NullLiteral, start_pos, state.get_position());
158            return true;
159        }
160        false
161    }
162
163    /// 处理裸键(JSON5 特性)
164    fn lex_bare_key<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
165        let start_pos = state.get_position();
166        if let Some(ch) = state.peek() {
167            if ch.is_alphabetic() || ch == '_' || ch == '$' {
168                state.advance(ch.len_utf8());
169                state.take_while(|c| c.is_alphanumeric() || c == '_' || c == '$');
170                state.add_token(JsonSyntaxKind::BareKey, start_pos, state.get_position());
171                return true;
172            }
173        }
174        false
175    }
176
177    /// 处理操作符和分隔
178    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
179        let start_pos = state.get_position();
180        if let Some(ch) = state.peek() {
181            let token_kind = match ch {
182                '{' => JsonSyntaxKind::LeftBrace,
183                '}' => JsonSyntaxKind::RightBrace,
184                '[' => JsonSyntaxKind::LeftBracket,
185                ']' => JsonSyntaxKind::RightBracket,
186                ',' => JsonSyntaxKind::Comma,
187                ':' => JsonSyntaxKind::Colon,
188                _ => return false,
189            };
190
191            state.advance(ch.len_utf8());
192            state.add_token(token_kind, start_pos, state.get_position());
193            true
194        }
195        else {
196            false
197        }
198    }
199
200    fn skip_whitespace_fast<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
201        let start_pos = state.get_position();
202        let mut count = 0;
203        while let Some(ch) = state.peek() {
204            if ch.is_whitespace() {
205                state.advance(ch.len_utf8());
206                count += 1;
207            }
208            else {
209                break;
210            }
211        }
212
213        if count > 0 {
214            state.add_token(JsonSyntaxKind::Whitespace, start_pos, state.get_position());
215            true
216        }
217        else {
218            false
219        }
220    }
221
222    fn lex_string_fast<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
223        let start_pos = state.get_position();
224        if !state.consume_if_starts_with("\"") {
225            return false;
226        }
227
228        while let Some(ch) = state.peek() {
229            if ch == '"' {
230                state.advance(ch.len_utf8());
231                state.add_token(JsonSyntaxKind::StringLiteral, start_pos, state.get_position());
232                return true;
233            }
234            else if ch == '\\' {
235                state.advance(ch.len_utf8());
236                if let Some(escaped) = state.peek() {
237                    state.advance(escaped.len_utf8());
238                }
239            }
240            else {
241                state.advance(ch.len_utf8());
242            }
243        }
244
245        state.add_token(JsonSyntaxKind::StringLiteral, start_pos, state.get_position());
246        true
247    }
248}