Skip to main content

oak_ini/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError, Source,
4    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5};
6pub mod token_type;
7
8use crate::{language::IniLanguage, lexer::token_type::IniTokenType};
9
10type State<'a, S> = LexerState<'a, S, IniLanguage>;
11
12static _INI_WHITESPACE: WhitespaceConfig = WhitespaceConfig { unicode_whitespace: true };
13static _INI_COMMENT: CommentConfig = CommentConfig { line_marker: ";", block_start: "", block_end: "", nested_blocks: false };
14static _INI_STRING: StringConfig = StringConfig { quotes: &['"', '\''], escape: Some('\\') };
15
16#[derive(Clone, Debug)]
17pub struct IniLexer<'config> {
18    _config: &'config IniLanguage,
19}
20
21impl<'config> Lexer<IniLanguage> for IniLexer<'config> {
22    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<IniLanguage>) -> LexOutput<IniLanguage> {
23        let mut state: State<'_, S> = State::new(source);
24        let result = self.run(&mut state);
25        if result.is_ok() {
26            state.add_eof();
27        }
28        state.finish_with_cache(result, cache)
29    }
30}
31
32impl<'config> IniLexer<'config> {
33    pub fn new(config: &'config IniLanguage) -> Self {
34        Self { _config: config }
35    }
36
37    /// 主要的词法分析循环
38    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
39        while state.not_at_end() {
40            let safe_point = state.get_position();
41
42            if self.skip_whitespace(state) {
43                continue;
44            }
45
46            if self.lex_newline(state) {
47                continue;
48            }
49
50            if self.skip_comment(state) {
51                continue;
52            }
53
54            if self.lex_string_literal(state) {
55                continue;
56            }
57
58            if self.lex_number_literal(state) {
59                continue;
60            }
61
62            if self.lex_identifier(state) {
63                continue;
64            }
65
66            if self.lex_punctuation(state) {
67                continue;
68            }
69
70            state.advance_if_dead_lock(safe_point);
71        }
72
73        Ok(())
74    }
75
76    /// 跳过空白字符(不包括换行符)
77    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
78        let start = state.get_position();
79
80        while let Some(ch) = state.peek() {
81            if ch == ' ' || ch == '\t' || ch == '\r' {
82                state.advance(ch.len_utf8());
83            }
84            else {
85                break;
86            }
87        }
88
89        if state.get_position() > start {
90            state.add_token(IniTokenType::Whitespace, start, state.get_position());
91            return true;
92        }
93        false
94    }
95
96    /// 处理换行
97    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
98        let start = state.get_position();
99
100        if state.current() == Some('\n') {
101            state.advance(1);
102            state.add_token(IniTokenType::Newline, start, state.get_position());
103            return true;
104        }
105        false
106    }
107
108    /// 跳过注释
109    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
110        let start = state.get_position();
111
112        if let Some(ch) = state.current() {
113            if ch == ';' || ch == '#' {
114                // 跳过注释字符
115                state.advance(1);
116
117                // 读取到行尾
118                while let Some(ch) = state.peek() {
119                    if ch != '\n' {
120                        state.advance(ch.len_utf8());
121                    }
122                    else {
123                        break;
124                    }
125                }
126
127                state.add_token(IniTokenType::Comment, start, state.get_position());
128                return true;
129            }
130        }
131        false
132    }
133
134    /// 处理字符串字面量
135    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
136        let start = state.get_position();
137
138        if let Some(quote_char) = state.current() {
139            if quote_char == '"' || quote_char == '\'' {
140                // 跳过开始引号
141                state.advance(1);
142
143                while let Some(ch) = state.peek() {
144                    if ch != quote_char {
145                        if ch == '\\' {
146                            state.advance(1); // 转义字符
147                            if let Some(_) = state.peek() {
148                                state.advance(1); // 被转义的字符
149                            }
150                        }
151                        else {
152                            state.advance(ch.len_utf8());
153                        }
154                    }
155                    else {
156                        // 找到结束引号
157                        state.advance(1);
158                        break;
159                    }
160                }
161
162                state.add_token(IniTokenType::String, start, state.get_position());
163                return true;
164            }
165        }
166        false
167    }
168
169    /// 处理数字字面量
170    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
171        let start = state.get_position();
172        let first = match state.current() {
173            Some(c) => c,
174            None => return false,
175        };
176
177        // 检查是否以数字或负号开始
178        if !first.is_ascii_digit() && first != '-' && first != '+' {
179            return false;
180        }
181
182        // 如果是符号,检查后面是否跟数字
183        if first == '-' || first == '+' {
184            if let Some(next) = state.peek_next_n(1) {
185                if !next.is_ascii_digit() {
186                    return false;
187                }
188            }
189            else {
190                return false;
191            }
192        }
193
194        state.advance(1);
195        let mut has_dot = false;
196        let mut has_exp = false;
197
198        while let Some(ch) = state.peek() {
199            if ch.is_ascii_digit() {
200                state.advance(1);
201            }
202            else if ch == '.' && !has_dot && !has_exp {
203                has_dot = true;
204                state.advance(1);
205            }
206            else if (ch == 'e' || ch == 'E') && !has_exp {
207                has_exp = true;
208                state.advance(1);
209                // 处理指数符号
210                if let Some(sign) = state.peek() {
211                    if sign == '+' || sign == '-' {
212                        state.advance(1);
213                    }
214                }
215            }
216            else {
217                break;
218            }
219        }
220
221        // 检查是否为有效数字
222        let end = state.get_position();
223        let text = state.get_text_in((start..end).into());
224
225        // 简单验证:不能只是符号或只是点
226        if text.as_ref() == "-" || text.as_ref() == "+" || text.as_ref() == "." {
227            // 回退
228            state.set_position(start);
229            return false;
230        }
231
232        // 判断是整数还是浮点数
233        let kind = if has_dot || has_exp { IniTokenType::Float } else { IniTokenType::Integer };
234
235        state.add_token(kind, start, state.get_position());
236        true
237    }
238
239    /// 处理标识符
240    fn lex_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
241        let start = state.get_position();
242        let ch = match state.current() {
243            Some(c) => c,
244            None => return false,
245        };
246
247        // 标识符必须以字母或下划线开始
248        if !(ch.is_ascii_alphabetic() || ch == '_') {
249            return false;
250        }
251
252        state.advance(1);
253        while let Some(c) = state.current() {
254            if c.is_ascii_alphanumeric() || c == '_' || c == '-' {
255                state.advance(1);
256            }
257            else {
258                break;
259            }
260        }
261
262        let end = state.get_position();
263        let text = state.get_text_in((start..end).into());
264
265        // 检查是否为布尔值或日期时间
266        let kind = match text.to_lowercase().as_str() {
267            "true" | "false" => IniTokenType::Boolean,
268            _ => {
269                if self.is_datetime_like(text.as_ref()) {
270                    IniTokenType::DateTime
271                }
272                else {
273                    IniTokenType::Identifier
274                }
275            }
276        };
277
278        state.add_token(kind, start, state.get_position());
279        true
280    }
281
282    /// 处理标点符号
283    fn lex_punctuation<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
284        let start = state.get_position();
285
286        // 优先匹配较长的符号
287        if state.starts_with("[[") {
288            state.advance(2);
289            state.add_token(IniTokenType::DoubleLeftBracket, start, state.get_position());
290            return true;
291        }
292
293        if state.starts_with("]]") {
294            state.advance(2);
295            state.add_token(IniTokenType::DoubleRightBracket, start, state.get_position());
296            return true;
297        }
298
299        if let Some(ch) = state.current() {
300            let kind = match ch {
301                '{' => IniTokenType::LeftBrace,
302                '}' => IniTokenType::RightBrace,
303                '[' => IniTokenType::LeftBracket,
304                ']' => IniTokenType::RightBracket,
305                ',' => IniTokenType::Comma,
306                '.' => IniTokenType::Dot,
307                '=' => IniTokenType::Equal,
308                _ => return false,
309            };
310
311            state.advance(ch.len_utf8());
312            state.add_token(kind, start, state.get_position());
313            return true;
314        }
315
316        false
317    }
318
319    fn is_datetime_like(&self, text: &str) -> bool {
320        // 极简判断:包含 - 和 : 的可能是日期时间
321        text.contains('-') && text.contains(':')
322    }
323}