// oak_dot/lexer/mod.rs

1use crate::{kind::DotSyntaxKind, language::DotLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
3
4// State type alias removed - using generic methods instead
5
/// Lexer for the Graphviz DOT language.
///
/// Borrows the language configuration for `'config`; the configuration is
/// stored but not read by any rule visible in this module — presumably kept
/// for API symmetry with other oak lexers (TODO confirm against siblings).
#[derive(Clone)]
pub struct DotLexer<'config> {
    /// Shared, read-only language configuration.
    config: &'config DotLanguage,
}
10
11impl<'config> DotLexer<'config> {
12    pub fn new(config: &'config DotLanguage) -> Self {
13        Self { config }
14    }
15
16    /// 跳过空白字符
17    fn skip_whitespace<S: Source>(&self, state: &mut LexerState<S, DotLanguage>) -> bool {
18        let start_pos = state.get_position();
19
20        while let Some(ch) = state.peek() {
21            if ch == ' ' || ch == '\t' {
22                state.advance(ch.len_utf8());
23            }
24            else {
25                break;
26            }
27        }
28
29        if state.get_position() > start_pos {
30            state.add_token(DotSyntaxKind::Whitespace, start_pos, state.get_position());
31            true
32        }
33        else {
34            false
35        }
36    }
37
38    /// 处理换行
39    fn lex_newline<S: Source>(&self, state: &mut LexerState<S, DotLanguage>) -> bool {
40        let start_pos = state.get_position();
41
42        if let Some('\n') = state.peek() {
43            state.advance(1);
44            state.add_token(DotSyntaxKind::Newline, start_pos, state.get_position());
45            true
46        }
47        else if let Some('\r') = state.peek() {
48            state.advance(1);
49            if let Some('\n') = state.peek() {
50                state.advance(1);
51            }
52            state.add_token(DotSyntaxKind::Newline, start_pos, state.get_position());
53            true
54        }
55        else {
56            false
57        }
58    }
59
60    /// 处理注释
61    fn lex_comment<S: Source>(&self, state: &mut LexerState<S, DotLanguage>) -> bool {
62        let start_pos = state.get_position();
63
64        if let Some('/') = state.peek() {
65            match state.peek_next_n(1) {
66                Some('/') => {
67                    // 单行注释
68                    state.advance(1);
69                    state.advance(1);
70
71                    while let Some(ch) = state.peek() {
72                        if ch == '\n' || ch == '\r' {
73                            break;
74                        }
75                        state.advance(ch.len_utf8());
76                    }
77
78                    state.add_token(DotSyntaxKind::Comment, start_pos, state.get_position());
79                    true
80                }
81                Some('*') => {
82                    // 多行注释
83                    state.advance(1);
84                    state.advance(1);
85
86                    while let Some(ch) = state.peek() {
87                        if ch == '*' {
88                            if state.peek_next_n(1) == Some('/') {
89                                state.advance(1);
90                                state.advance(1);
91                                break;
92                            }
93                        }
94                        state.advance(ch.len_utf8());
95                    }
96
97                    state.add_token(DotSyntaxKind::Comment, start_pos, state.get_position());
98                    true
99                }
100                _ => false,
101            }
102        }
103        else if let Some('#') = state.peek() {
104            // # 风格注释
105            state.advance(1);
106
107            while let Some(ch) = state.peek() {
108                if ch == '\n' || ch == '\r' {
109                    break;
110                }
111                state.advance(ch.len_utf8());
112            }
113
114            state.add_token(DotSyntaxKind::Comment, start_pos, state.get_position());
115            true
116        }
117        else {
118            false
119        }
120    }
121
122    /// 处理标识符和关键字
123    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut LexerState<S, DotLanguage>) -> bool {
124        let start_pos = state.get_position();
125
126        if let Some(ch) = state.peek() {
127            if ch.is_alphabetic() || ch == '_' {
128                state.advance(ch.len_utf8());
129
130                while let Some(ch) = state.peek() {
131                    if ch.is_alphanumeric() || ch == '_' {
132                        state.advance(ch.len_utf8());
133                    }
134                    else {
135                        break;
136                    }
137                }
138
139                let end_pos = state.get_position();
140                let text = state.get_text_in(core::range::Range { start: start_pos, end: end_pos });
141
142                let token_kind = match text.to_lowercase().as_str() {
143                    "graph" => DotSyntaxKind::Graph,
144                    "digraph" => DotSyntaxKind::Digraph,
145                    "subgraph" => DotSyntaxKind::Subgraph,
146                    "node" => DotSyntaxKind::Node,
147                    "edge" => DotSyntaxKind::Edge,
148                    "strict" => DotSyntaxKind::Strict,
149                    _ => DotSyntaxKind::Identifier,
150                };
151
152                state.add_token(token_kind, start_pos, state.get_position());
153                true
154            }
155            else {
156                false
157            }
158        }
159        else {
160            false
161        }
162    }
163
164    /// 处理数字
165    fn lex_number<S: Source>(&self, state: &mut LexerState<S, DotLanguage>) -> bool {
166        let start_pos = state.get_position();
167
168        if let Some(ch) = state.peek() {
169            let is_negative = ch == '-';
170            let mut has_digit = false;
171
172            if is_negative {
173                // 检查负号后面是否有数字
174                let next_pos = state.get_position() + 1;
175                if next_pos < state.length() {
176                    let next_ch = state.get_char_at(next_pos);
177                    if next_ch.map_or(false, |c| c.is_ascii_digit()) {
178                        state.advance(1); // 跳过负号
179                    }
180                    else {
181                        return false;
182                    }
183                }
184                else {
185                    return false;
186                }
187            }
188
189            if let Some(ch) = state.peek() {
190                if ch.is_ascii_digit() {
191                    has_digit = true;
192                    state.advance(ch.len_utf8());
193
194                    // 处理整数部分
195                    while let Some(ch) = state.peek() {
196                        if ch.is_ascii_digit() {
197                            state.advance(ch.len_utf8());
198                        }
199                        else {
200                            break;
201                        }
202                    }
203
204                    // 处理小数部分
205                    if let Some('.') = state.peek() {
206                        let dot_pos = state.get_position();
207                        state.advance(1);
208
209                        if let Some(ch) = state.peek() {
210                            if ch.is_ascii_digit() {
211                                while let Some(ch) = state.peek() {
212                                    if ch.is_ascii_digit() {
213                                        state.advance(ch.len_utf8());
214                                    }
215                                    else {
216                                        break;
217                                    }
218                                }
219                            }
220                            else {
221                                // 回退点号
222                                state.set_position(dot_pos);
223                            }
224                        }
225                        else {
226                            // 回退点号
227                            state.set_position(dot_pos);
228                        }
229                    }
230                }
231            }
232
233            if has_digit || (is_negative && state.get_position() > start_pos + 1) {
234                state.add_token(DotSyntaxKind::Number, start_pos, state.get_position());
235                true
236            }
237            else {
238                // 回退到开始位                state.set_position(start_pos);
239                false
240            }
241        }
242        else {
243            false
244        }
245    }
246
247    /// 处理字符
248    fn lex_string<S: Source>(&self, state: &mut LexerState<S, DotLanguage>) -> bool {
249        let start_pos = state.get_position();
250
251        if let Some('"') = state.peek() {
252            state.advance(1);
253
254            while let Some(ch) = state.peek() {
255                if ch == '"' {
256                    state.advance(1);
257                    state.add_token(DotSyntaxKind::String, start_pos, state.get_position());
258                    return true;
259                }
260                else if ch == '\\' {
261                    state.advance(1);
262                    if state.peek().is_some() {
263                        state.advance(1);
264                    }
265                }
266                else {
267                    state.advance(ch.len_utf8());
268                }
269            }
270
271            // 未闭合的字符            state.add_token(DotSyntaxKind::Error, start_pos, state.get_position());
272            true
273        }
274        else {
275            false
276        }
277    }
278
279    /// 处理操作
280    fn lex_operator<S: Source>(&self, state: &mut LexerState<S, DotLanguage>) -> bool {
281        let start_pos = state.get_position();
282
283        if let Some(ch) = state.peek() {
284            match ch {
285                '-' => {
286                    let next_pos = state.get_position() + 1;
287                    if next_pos < state.length() {
288                        let next_ch = state.get_char_at(next_pos);
289                        match next_ch {
290                            Some('>') => {
291                                state.advance(1);
292                                state.advance(1);
293                                state.add_token(DotSyntaxKind::Arrow, start_pos, state.get_position());
294                                true
295                            }
296                            Some('-') => {
297                                state.advance(1);
298                                state.advance(1);
299                                state.add_token(DotSyntaxKind::Line, start_pos, state.get_position());
300                                true
301                            }
302                            _ => false,
303                        }
304                    }
305                    else {
306                        false
307                    }
308                }
309                '=' => {
310                    state.advance(1);
311                    state.add_token(DotSyntaxKind::Equal, start_pos, state.get_position());
312                    true
313                }
314                ';' => {
315                    state.advance(1);
316                    state.add_token(DotSyntaxKind::Semicolon, start_pos, state.get_position());
317                    true
318                }
319                ',' => {
320                    state.advance(1);
321                    state.add_token(DotSyntaxKind::Comma, start_pos, state.get_position());
322                    true
323                }
324                _ => false,
325            }
326        }
327        else {
328            false
329        }
330    }
331
332    /// 处理分隔
333    fn lex_delimiter<S: Source>(&self, state: &mut LexerState<S, DotLanguage>) -> bool {
334        let start_pos = state.get_position();
335
336        if let Some(ch) = state.peek() {
337            let token_kind = match ch {
338                '{' => DotSyntaxKind::LeftBrace,
339                '}' => DotSyntaxKind::RightBrace,
340                '[' => DotSyntaxKind::LeftBracket,
341                ']' => DotSyntaxKind::RightBracket,
342                '(' => DotSyntaxKind::LeftParen,
343                ')' => DotSyntaxKind::RightParen,
344                _ => return false,
345            };
346
347            state.advance(ch.len_utf8());
348            state.add_token(token_kind, start_pos, state.get_position());
349            true
350        }
351        else {
352            false
353        }
354    }
355}
356
357impl<'config> Lexer<DotLanguage> for DotLexer<'config> {
358    fn lex(&self, source: impl Source) -> LexOutput<DotLanguage> {
359        let mut state = LexerState::new(source);
360        let result = self.run(&mut state);
361        state.finish(result)
362    }
363
364    fn lex_incremental(
365        &self,
366        source: impl Source,
367        changed: usize,
368        cache: IncrementalCache<DotLanguage>,
369    ) -> LexOutput<DotLanguage> {
370        let mut state = LexerState::new_with_cache(source, changed, cache);
371        let result = self.run(&mut state);
372        state.finish(result)
373    }
374}
375
376impl<'config> DotLexer<'config> {
377    /// 主要的词法分析逻辑
378    fn run<S: Source>(&self, state: &mut LexerState<S, DotLanguage>) -> Result<(), OakError> {
379        while state.not_at_end() {
380            let safe_point = state.get_position();
381
382            // 尝试各种词法规则
383            if self.skip_whitespace(state) {
384                continue;
385            }
386
387            if self.lex_newline(state) {
388                continue;
389            }
390
391            if self.lex_comment(state) {
392                continue;
393            }
394
395            if self.lex_identifier_or_keyword(state) {
396                continue;
397            }
398
399            if self.lex_number(state) {
400                continue;
401            }
402
403            if self.lex_string(state) {
404                continue;
405            }
406
407            if self.lex_operator(state) {
408                continue;
409            }
410
411            if self.lex_delimiter(state) {
412                continue;
413            }
414
415            // 如果所有规则都不匹配,跳过当前字符并标记为错误
416            let start_pos = state.get_position();
417            if let Some(ch) = state.peek() {
418                state.advance(ch.len_utf8());
419                state.add_token(DotSyntaxKind::Error, start_pos, state.get_position());
420            }
421
422            state.safe_check(safe_point);
423        }
424
425        // 添加 EOF token
426        let eof_pos = state.get_position();
427        state.add_token(DotSyntaxKind::Eof, eof_pos, eof_pos);
428
429        Ok(())
430    }
431}