Skip to main content

oak_toml/lexer/
mod.rs

1use crate::{TomlSyntaxKind, language::TomlLanguage};
2use oak_core::{
3    Lexer, LexerState, OakError, TextEdit,
4    lexer::{LexOutput, LexerCache},
5    source::Source,
6};
7
/// Shorthand for the shared lexer state specialized to the TOML language.
type State<'a, S> = LexerState<'a, S, TomlLanguage>;
9
/// Hand-written lexer for TOML documents.
#[derive(Clone, Debug)]
pub struct TomlLexer<'config> {
    // Held only for lifetime/ownership purposes; no field of the config is
    // read by the current implementation (hence the leading underscore).
    _config: &'config TomlLanguage,
}
14
15impl<'config> Lexer<TomlLanguage> for TomlLexer<'config> {
16    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<TomlLanguage>) -> LexOutput<TomlLanguage> {
17        let mut state = State::new(source);
18        let result = self.run(&mut state);
19        if result.is_ok() {
20            state.add_eof();
21        }
22        state.finish_with_cache(result, cache)
23    }
24}
25
26impl<'config> TomlLexer<'config> {
27    pub fn new(config: &'config TomlLanguage) -> Self {
28        Self { _config: config }
29    }
30
31    /// 主要的词法分析循环
32    fn run<S: Source + ?Sized>(&self, state: &mut State<S>) -> Result<(), OakError> {
33        while state.not_at_end() {
34            if let Some(ch) = state.peek() {
35                match ch {
36                    ' ' | '\t' | '\n' | '\r' => {
37                        self.skip_whitespace(state);
38                    }
39                    '#' => {
40                        self.skip_comment(state);
41                    }
42                    '"' | '\'' => {
43                        self.lex_string(state);
44                    }
45                    '0'..='9' | '+' | '-' => {
46                        self.lex_number(state);
47                    }
48                    '[' | ']' | '{' | '}' | ',' | '.' | '=' => {
49                        self.lex_punctuation(state);
50                    }
51                    'a'..='z' | 'A'..='Z' | '_' => {
52                        self.lex_identifier(state);
53                    }
54                    _ => {
55                        // Fallback for any other punctuation or unknown characters
56                        if self.lex_punctuation(state) {
57                            continue;
58                        }
59                        // 如果没有匹配任何模式,跳过当前字符
60                        state.advance(1);
61                    }
62                }
63            }
64            else {
65                break;
66            }
67        }
68        Ok(())
69    }
70
71    /// 跳过空白字符
72    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
73        let start_pos = state.get_position();
74
75        while let Some(ch) = state.current() {
76            if ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' {
77                state.advance(1);
78            }
79            else {
80                break;
81            }
82        }
83
84        if state.get_position() > start_pos {
85            state.add_token(TomlSyntaxKind::Whitespace, start_pos, state.get_position());
86            true
87        }
88        else {
89            false
90        }
91    }
92
93    /// 跳过注释
94    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
95        if state.current() == Some('#') {
96            let start_pos = state.get_position();
97            state.advance(1);
98
99            // 读取到行尾
100            while let Some(ch) = state.current() {
101                if ch == '\n' || ch == '\r' {
102                    break;
103                }
104                state.advance(ch.len_utf8());
105            }
106
107            state.add_token(TomlSyntaxKind::Comment, start_pos, state.get_position());
108            true
109        }
110        else {
111            false
112        }
113    }
114
115    /// 解析字符串
116    fn lex_string<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
117        match state.current() {
118            Some('"') => {
119                let start = state.get_position();
120                state.advance(1);
121
122                // 简单的字符串解析
123                while let Some(ch) = state.current() {
124                    if ch == '"' {
125                        state.advance(1);
126                        break;
127                    }
128                    if ch == '\\' {
129                        state.advance(1); // 跳过转义字符
130                        if state.current().is_some() {
131                            state.advance(1);
132                        }
133                    }
134                    else {
135                        state.advance(1);
136                    }
137                }
138
139                let end = state.get_position();
140                state.add_token(TomlSyntaxKind::BasicString, start, end);
141                true
142            }
143            Some('\'') => {
144                let start = state.get_position();
145                state.advance(1);
146
147                // 字面字符串解析
148                while let Some(ch) = state.current() {
149                    if ch == '\'' {
150                        state.advance(1);
151                        break;
152                    }
153                    state.advance(1);
154                }
155
156                let end = state.get_position();
157                state.add_token(TomlSyntaxKind::LiteralString, start, end);
158                true
159            }
160            _ => false,
161        }
162    }
163
164    /// 解析数字
165    fn lex_number<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
166        if !state.current().map_or(false, |c| c.is_ascii_digit() || c == '-' || c == '+') {
167            return false;
168        }
169
170        let start = state.get_position();
171
172        // 跳过符号
173        if matches!(state.current(), Some('-') | Some('+')) {
174            state.advance(1);
175        }
176
177        // 解析数字
178        while state.current().map_or(false, |c| c.is_ascii_digit()) {
179            state.advance(1);
180        }
181
182        // 检查是否是浮点数
183        let mut is_float = false;
184        if state.current() == Some('.') {
185            is_float = true;
186            state.advance(1);
187            while state.current().map_or(false, |c| c.is_ascii_digit()) {
188                state.advance(1);
189            }
190        }
191
192        let end = state.get_position();
193        let kind = if is_float { TomlSyntaxKind::Float } else { TomlSyntaxKind::Integer };
194        state.add_token(kind, start, end);
195        true
196    }
197
198    /// 解析标点符号
199    fn lex_punctuation<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
200        let start = state.get_position();
201
202        match state.current() {
203            Some('[') => {
204                state.advance(1);
205                if state.current() == Some('[') {
206                    state.advance(1);
207                    let end = state.get_position();
208                    state.add_token(TomlSyntaxKind::DoubleLeftBracket, start, end);
209                }
210                else {
211                    let end = state.get_position();
212                    state.add_token(TomlSyntaxKind::LeftBracket, start, end);
213                }
214                true
215            }
216            Some(']') => {
217                state.advance(1);
218                if state.current() == Some(']') {
219                    state.advance(1);
220                    let end = state.get_position();
221                    state.add_token(TomlSyntaxKind::DoubleRightBracket, start, end);
222                }
223                else {
224                    let end = state.get_position();
225                    state.add_token(TomlSyntaxKind::RightBracket, start, end);
226                }
227                true
228            }
229            Some('{') => {
230                state.advance(1);
231                let end = state.get_position();
232                state.add_token(TomlSyntaxKind::LeftBrace, start, end);
233                true
234            }
235            Some('}') => {
236                state.advance(1);
237                let end = state.get_position();
238                state.add_token(TomlSyntaxKind::RightBrace, start, end);
239                true
240            }
241            Some(',') => {
242                state.advance(1);
243                let end = state.get_position();
244                state.add_token(TomlSyntaxKind::Comma, start, end);
245                true
246            }
247            Some('.') => {
248                state.advance(1);
249                let end = state.get_position();
250                state.add_token(TomlSyntaxKind::Dot, start, end);
251                true
252            }
253            Some('=') => {
254                state.advance(1);
255                let end = state.get_position();
256                state.add_token(TomlSyntaxKind::Equal, start, end);
257                true
258            }
259            _ => false,
260        }
261    }
262
263    /// 解析标识符和键
264    fn lex_identifier<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
265        if !state.current().map_or(false, |c| c.is_ascii_alphabetic() || c == '_') {
266            return false;
267        }
268
269        let start = state.get_position();
270
271        while state.current().map_or(false, |c| c.is_ascii_alphanumeric() || c == '_' || c == '-') {
272            state.advance(1);
273        }
274
275        let end = state.get_position();
276
277        // 检查是否为关键字
278        let text = state.get_text_in((start..end).into());
279        let kind = match text.as_ref() {
280            "true" | "false" => TomlSyntaxKind::Boolean,
281            _ => TomlSyntaxKind::BareKey,
282        };
283
284        state.add_token(kind, start, end);
285        true
286    }
287}