// oak_json/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::JsonLanguage, lexer::token_type::JsonTokenType};
5use oak_core::{
6    errors::OakError,
7    lexer::{CommentConfig, LexOutput, Lexer, LexerCache, LexerState, StringConfig},
8    source::{Source, TextEdit},
9};
10use std::sync::LazyLock;
11
/// Shorthand for the shared lexer state specialized to the JSON language.
type State<'a, S> = LexerState<'a, S, JsonLanguage>;

/// `//` line comments and non-nesting `/* */` block comments; only scanned
/// when `config.comments` is enabled (JSONC/JSON5-style dialects).
static JSON_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: false });
/// Single-quoted strings with backslash escapes; only scanned when
/// `config.single_quotes` is enabled (JSON5-style dialect).
static JSON_SINGLE_QUOTE_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['\''], escape: Some('\\') });
16
/// JSON lexical analyzer.
///
/// Tokenizes JSON source text; optional dialect extensions (comments,
/// single-quoted strings, bare keys, hex numbers) are switched on through
/// the referenced [`JsonLanguage`] configuration.
#[derive(Clone)]
pub struct JsonLexer<'config> {
    // Dialect switches consulted during scanning (comments, single_quotes,
    // bare_keys, hex_numbers).
    config: &'config JsonLanguage,
}
22
23impl<'config> Lexer<JsonLanguage> for JsonLexer<'config> {
24    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<JsonLanguage>) -> LexOutput<JsonLanguage> {
25        let mut state = State::new(source);
26        let result = self.run(&mut state);
27        if result.is_ok() {
28            state.add_eof();
29        }
30        state.finish_with_cache(result, cache)
31    }
32}
33
34impl<'config> JsonLexer<'config> {
35    pub fn new(config: &'config JsonLanguage) -> Self {
36        Self { config }
37    }
38
39    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            let safe_point = state.get_position();
42            let Some(ch) = state.peek()
43            else {
44                break;
45            };
46
47            match ch {
48                ' ' | '\t' | '\n' | '\r' => {
49                    self.skip_whitespace_fast(state);
50                }
51                '"' => {
52                    self.lex_string_fast(state);
53                }
54                '/' if self.config.comments => {
55                    JSON_COMMENT.scan(state, JsonTokenType::Comment, JsonTokenType::Comment);
56                }
57                '-' | '0'..='9' => {
58                    self.lex_number(state);
59                }
60                '{' | '}' | '[' | ']' | ',' | ':' => {
61                    self.lex_operator_or_delimiter(state);
62                }
63                't' | 'f' | 'n' => {
64                    if !self.lex_keyword(state) {
65                        if self.config.bare_keys {
66                            self.lex_bare_key(state);
67                        }
68                    }
69                }
70                '\'' if self.config.single_quotes => {
71                    JSON_SINGLE_QUOTE_STRING.scan(state, JsonTokenType::StringLiteral);
72                }
73                _ => {
74                    let mut handled = false;
75                    if self.config.bare_keys && (ch.is_alphabetic() || ch == '_' || ch == '$') {
76                        handled = self.lex_bare_key(state);
77                    }
78
79                    if !handled {
80                        // 如果所有规则都不匹配,跳过当前字符并标记为错误
81                        state.advance(ch.len_utf8());
82                        state.add_token(JsonTokenType::Error, safe_point, state.get_position());
83                    }
84                }
85            }
86
87            state.advance_if_dead_lock(safe_point);
88        }
89
90        Ok(())
91    }
92
93    /// 处理数字字面
94    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
95        let start_pos = state.get_position();
96
97        // 处理负号
98        state.consume_if_starts_with("-");
99
100        let mut has_digits = false;
101
102        // 处理十六进制数字(如果配置允许)
103        if self.config.hex_numbers && state.starts_with("0") {
104            let n1 = state.peek_next_n(1);
105            if n1 == Some('x') || n1 == Some('X') {
106                state.advance(2); // 跳过 '0x'
107                let range = state.take_while(|c| c.is_ascii_hexdigit() || c == '_');
108                if range.end > range.start {
109                    state.add_token(JsonTokenType::NumberLiteral, start_pos, state.get_position());
110                    return true;
111                }
112                // Fallback to decimal handling if no hex digits
113            }
114        }
115
116        // 处理整数部分
117        let r1 = state.take_while(|c| c.is_ascii_digit());
118        if r1.end > r1.start {
119            has_digits = true;
120        }
121
122        // 处理小数点和小数部分
123        if state.consume_if_starts_with(".") {
124            let r2 = state.take_while(|c| c.is_ascii_digit());
125            if r2.end > r2.start {
126                has_digits = true;
127            }
128        }
129
130        // 处理科学计数
131        if let Some(ch) = state.peek() {
132            if ch == 'e' || ch == 'E' {
133                state.advance(1);
134                if let Some(sign) = state.peek() {
135                    if sign == '+' || sign == '-' {
136                        state.advance(1);
137                    }
138                }
139                state.take_while(|c| c.is_ascii_digit());
140            }
141        }
142
143        if has_digits {
144            state.add_token(JsonTokenType::NumberLiteral, start_pos, state.get_position());
145            true
146        }
147        else {
148            false
149        }
150    }
151
152    /// 处理布尔值和 null
153    fn lex_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
154        let start_pos = state.get_position();
155        if state.consume_if_starts_with("true") || state.consume_if_starts_with("false") {
156            state.add_token(JsonTokenType::BooleanLiteral, start_pos, state.get_position());
157            return true;
158        }
159        if state.consume_if_starts_with("null") {
160            state.add_token(JsonTokenType::NullLiteral, start_pos, state.get_position());
161            return true;
162        }
163        false
164    }
165
166    /// 处理裸键(JSON5 特性)
167    fn lex_bare_key<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
168        let start_pos = state.get_position();
169        if let Some(ch) = state.peek() {
170            if ch.is_alphabetic() || ch == '_' || ch == '$' {
171                state.advance(ch.len_utf8());
172                state.take_while(|c| c.is_alphanumeric() || c == '_' || c == '$');
173                state.add_token(JsonTokenType::BareKey, start_pos, state.get_position());
174                return true;
175            }
176        }
177        false
178    }
179
180    /// 处理操作符和分隔
181    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
182        let start_pos = state.get_position();
183        if let Some(ch) = state.peek() {
184            let token_kind = match ch {
185                '{' => JsonTokenType::LeftBrace,
186                '}' => JsonTokenType::RightBrace,
187                '[' => JsonTokenType::LeftBracket,
188                ']' => JsonTokenType::RightBracket,
189                ',' => JsonTokenType::Comma,
190                ':' => JsonTokenType::Colon,
191                _ => return false,
192            };
193
194            state.advance(ch.len_utf8());
195            state.add_token(token_kind, start_pos, state.get_position());
196            true
197        }
198        else {
199            false
200        }
201    }
202
203    fn skip_whitespace_fast<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
204        let start_pos = state.get_position();
205        let mut count = 0;
206        while let Some(ch) = state.peek() {
207            if ch.is_whitespace() {
208                state.advance(ch.len_utf8());
209                count += 1;
210            }
211            else {
212                break;
213            }
214        }
215        if count > 0 {
216            state.add_token(JsonTokenType::Whitespace, start_pos, state.get_position());
217            true
218        }
219        else {
220            false
221        }
222    }
223
224    fn lex_string_fast<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
225        let start_pos = state.get_position();
226        if !state.consume_if_starts_with("\"") {
227            return false;
228        }
229
230        let mut escaped = false;
231        while let Some(ch) = state.peek() {
232            state.advance(ch.len_utf8());
233            if escaped {
234                escaped = false;
235                continue;
236            }
237            if ch == '\\' {
238                escaped = true;
239                continue;
240            }
241            if ch == '"' {
242                state.add_token(JsonTokenType::StringLiteral, start_pos, state.get_position());
243                return true;
244            }
245        }
246        // 未闭合的字符串
247        state.add_token(JsonTokenType::Error, start_pos, state.get_position());
248        false
249    }
250}