Skip to main content

oak_ini/lexer/
mod.rs

1use crate::{kind::IniSyntaxKind, language::IniLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7
/// Shorthand for the `oak_core` lexer state specialised to [`IniLanguage`],
/// generic over the source type `S` and the borrow lifetime `'a`.
type State<'a, S> = LexerState<'a, S, IniLanguage>;
9
// NOTE(review): none of the three configs below is referenced by the lexing
// routines visible in this file; the leading underscores suggest they are kept
// intentionally unused — confirm before removing.

/// Whitespace configuration with Unicode whitespace enabled.
static _INI_WHITESPACE: WhitespaceConfig = WhitespaceConfig { unicode_whitespace: true };
/// Comment configuration: `;` as the line marker, empty block delimiters, no nesting.
/// Note that `skip_comment` in this file additionally accepts `#` as a line marker.
static _INI_COMMENT: CommentConfig = CommentConfig { line_marker: ";", block_start: "", block_end: "", nested_blocks: false };
/// String configuration: double or single quotes, with backslash as the escape character.
static _INI_STRING: StringConfig = StringConfig { quotes: &['"', '\''], escape: Some('\\') };
13
/// Lexer for INI source text.
///
/// Borrows an [`IniLanguage`] configuration for the lifetime `'config`.
#[derive(Clone, Debug)]
pub struct IniLexer<'config> {
    /// Language configuration passed to [`IniLexer::new`]. It is stored but not
    /// read by any lexing routine visible in this file.
    _config: &'config IniLanguage,
}
18
19impl<'config> Lexer<IniLanguage> for IniLexer<'config> {
20    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<IniLanguage>) -> LexOutput<IniLanguage> {
21        let mut state: State<'_, S> = State::new(source);
22        let result = self.run(&mut state);
23        if result.is_ok() {
24            state.add_eof();
25        }
26        state.finish_with_cache(result, cache)
27    }
28}
29
30impl<'config> IniLexer<'config> {
31    pub fn new(config: &'config IniLanguage) -> Self {
32        Self { _config: config }
33    }
34
35    /// 主要的词法分析循环
36    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
37        while state.not_at_end() {
38            let safe_point = state.get_position();
39
40            if self.skip_whitespace(state) {
41                continue;
42            }
43
44            if self.lex_newline(state) {
45                continue;
46            }
47
48            if self.skip_comment(state) {
49                continue;
50            }
51
52            if self.lex_string_literal(state) {
53                continue;
54            }
55
56            if self.lex_number_literal(state) {
57                continue;
58            }
59
60            if self.lex_identifier(state) {
61                continue;
62            }
63
64            if self.lex_punctuation(state) {
65                continue;
66            }
67
68            state.advance_if_dead_lock(safe_point);
69        }
70
71        Ok(())
72    }
73
74    /// 跳过空白字符(不包括换行符)
75    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
76        let start = state.get_position();
77
78        while let Some(ch) = state.peek() {
79            if ch == ' ' || ch == '\t' || ch == '\r' {
80                state.advance(ch.len_utf8());
81            }
82            else {
83                break;
84            }
85        }
86
87        if state.get_position() > start {
88            state.add_token(IniSyntaxKind::Whitespace, start, state.get_position());
89            return true;
90        }
91        false
92    }
93
94    /// 处理换行
95    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
96        let start = state.get_position();
97
98        if state.current() == Some('\n') {
99            state.advance(1);
100            state.add_token(IniSyntaxKind::Newline, start, state.get_position());
101            return true;
102        }
103        false
104    }
105
106    /// 跳过注释
107    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
108        let start = state.get_position();
109
110        if let Some(ch) = state.current() {
111            if ch == ';' || ch == '#' {
112                // 跳过注释字符
113                state.advance(1);
114
115                // 读取到行尾
116                while let Some(ch) = state.peek() {
117                    if ch != '\n' {
118                        state.advance(ch.len_utf8());
119                    }
120                    else {
121                        break;
122                    }
123                }
124
125                state.add_token(IniSyntaxKind::Comment, start, state.get_position());
126                return true;
127            }
128        }
129        false
130    }
131
132    /// 处理字符串字面量
133    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
134        let start = state.get_position();
135
136        if let Some(quote_char) = state.current() {
137            if quote_char == '"' || quote_char == '\'' {
138                // 跳过开始引号
139                state.advance(1);
140
141                while let Some(ch) = state.peek() {
142                    if ch != quote_char {
143                        if ch == '\\' {
144                            state.advance(1); // 转义字符
145                            if let Some(_) = state.peek() {
146                                state.advance(1); // 被转义的字符
147                            }
148                        }
149                        else {
150                            state.advance(ch.len_utf8());
151                        }
152                    }
153                    else {
154                        // 找到结束引号
155                        state.advance(1);
156                        break;
157                    }
158                }
159
160                state.add_token(IniSyntaxKind::String, start, state.get_position());
161                return true;
162            }
163        }
164        false
165    }
166
167    /// 处理数字字面量
168    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
169        let start = state.get_position();
170        let first = match state.current() {
171            Some(c) => c,
172            None => return false,
173        };
174
175        // 检查是否以数字或负号开始
176        if !first.is_ascii_digit() && first != '-' && first != '+' {
177            return false;
178        }
179
180        // 如果是符号,检查后面是否跟数字
181        if first == '-' || first == '+' {
182            if let Some(next) = state.peek_next_n(1) {
183                if !next.is_ascii_digit() {
184                    return false;
185                }
186            }
187            else {
188                return false;
189            }
190        }
191
192        state.advance(1);
193        let mut has_dot = false;
194        let mut has_exp = false;
195
196        while let Some(ch) = state.peek() {
197            if ch.is_ascii_digit() {
198                state.advance(1);
199            }
200            else if ch == '.' && !has_dot && !has_exp {
201                has_dot = true;
202                state.advance(1);
203            }
204            else if (ch == 'e' || ch == 'E') && !has_exp {
205                has_exp = true;
206                state.advance(1);
207                // 处理指数符号
208                if let Some(sign) = state.peek() {
209                    if sign == '+' || sign == '-' {
210                        state.advance(1);
211                    }
212                }
213            }
214            else {
215                break;
216            }
217        }
218
219        // 检查是否为有效数字
220        let end = state.get_position();
221        let text = state.get_text_in((start..end).into());
222
223        // 简单验证:不能只是符号或只是点
224        if text.as_ref() == "-" || text.as_ref() == "+" || text.as_ref() == "." {
225            // 回退
226            state.set_position(start);
227            return false;
228        }
229
230        // 判断是整数还是浮点数
231        let kind = if has_dot || has_exp { IniSyntaxKind::Float } else { IniSyntaxKind::Integer };
232
233        state.add_token(kind, start, state.get_position());
234        true
235    }
236
237    /// 处理标识符
238    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
239        let start = state.get_position();
240        let ch = match state.current() {
241            Some(c) => c,
242            None => return false,
243        };
244
245        // 标识符必须以字母或下划线开始
246        if !(ch.is_ascii_alphabetic() || ch == '_') {
247            return false;
248        }
249
250        state.advance(1);
251        while let Some(c) = state.current() {
252            if c.is_ascii_alphanumeric() || c == '_' || c == '-' {
253                state.advance(1);
254            }
255            else {
256                break;
257            }
258        }
259
260        let end = state.get_position();
261        let text = state.get_text_in((start..end).into());
262
263        // 检查是否为布尔值或日期时间
264        let kind = match text.to_lowercase().as_str() {
265            "true" | "false" => IniSyntaxKind::Boolean,
266            _ => {
267                if self.is_datetime_like(text.as_ref()) {
268                    IniSyntaxKind::DateTime
269                }
270                else {
271                    IniSyntaxKind::Identifier
272                }
273            }
274        };
275
276        state.add_token(kind, start, state.get_position());
277        true
278    }
279
280    /// 处理标点符号
281    fn lex_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
282        let start = state.get_position();
283
284        // 优先匹配较长的符号
285        if state.starts_with("[[") {
286            state.advance(2);
287            state.add_token(IniSyntaxKind::DoubleLeftBracket, start, state.get_position());
288            return true;
289        }
290
291        if state.starts_with("]]") {
292            state.advance(2);
293            state.add_token(IniSyntaxKind::DoubleRightBracket, start, state.get_position());
294            return true;
295        }
296
297        if let Some(ch) = state.current() {
298            let kind = match ch {
299                '{' => IniSyntaxKind::LeftBrace,
300                '}' => IniSyntaxKind::RightBrace,
301                '[' => IniSyntaxKind::LeftBracket,
302                ']' => IniSyntaxKind::RightBracket,
303                ',' => IniSyntaxKind::Comma,
304                '.' => IniSyntaxKind::Dot,
305                '=' => IniSyntaxKind::Equal,
306                _ => return false,
307            };
308
309            state.advance(ch.len_utf8());
310            state.add_token(kind, start, state.get_position());
311            return true;
312        }
313
314        false
315    }
316
317    /// 判断是否类似日期时间格式
318    fn is_datetime_like(&self, text: &str) -> bool {
319        // 简单的日期时间格式检查
320        // 支持 ISO 8601 格式:YYYY-MM-DD, YYYY-MM-DDTHH:MM:SS 等
321        if text.len() < 8 {
322            return false;
323        }
324
325        // 检查是否包含日期分隔符
326        if text.contains('-') || text.contains(':') || text.contains('T') {
327            // 更详细的检查可以在这里添加
328            let chars: Vec<char> = text.chars().collect();
329            let mut digit_count = 0;
330            let mut separator_count = 0;
331
332            for ch in chars {
333                if ch.is_ascii_digit() {
334                    digit_count += 1;
335                }
336                else if ch == '-' || ch == ':' || ch == 'T' || ch == 'Z' || ch == '+' {
337                    separator_count += 1;
338                }
339            }
340
341            // 简单启发式:如果数字多于分隔符,可能是日期时间
342            digit_count > separator_count && digit_count >= 6
343        }
344        else {
345            false
346        }
347    }
348}