// oak_dot/lexer/mod.rs

1use crate::{kind::DotSyntaxKind, language::DotLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
3
/// Shorthand for the generic lexer state specialized to the DOT language.
type State<'a, S> = LexerState<'a, S, DotLanguage>;
5
/// Hand-written lexer for the Graphviz DOT language.
#[derive(Clone)]
pub struct DotLexer<'config> {
    // Currently unread (note the underscore); kept so lexing rules can
    // consult language configuration later without an interface change.
    _config: &'config DotLanguage,
}
10
11impl<'config> DotLexer<'config> {
12    pub fn new(config: &'config DotLanguage) -> Self {
13        Self { _config: config }
14    }
15
16    /// 跳过空白字符
17    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
18        let start_pos = state.get_position();
19
20        while let Some(ch) = state.peek() {
21            if ch == ' ' || ch == '\t' {
22                state.advance(ch.len_utf8());
23            }
24            else {
25                break;
26            }
27        }
28
29        if state.get_position() > start_pos {
30            state.add_token(DotSyntaxKind::Whitespace, start_pos, state.get_position());
31            true
32        }
33        else {
34            false
35        }
36    }
37
38    /// 处理换行
39    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
40        let start_pos = state.get_position();
41
42        if let Some('\n') = state.peek() {
43            state.advance(1);
44            state.add_token(DotSyntaxKind::Newline, start_pos, state.get_position());
45            true
46        }
47        else if let Some('\r') = state.peek() {
48            state.advance(1);
49            if let Some('\n') = state.peek() {
50                state.advance(1);
51            }
52            state.add_token(DotSyntaxKind::Newline, start_pos, state.get_position());
53            true
54        }
55        else {
56            false
57        }
58    }
59
60    /// 处理注释
61    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
62        let start_pos = state.get_position();
63
64        if state.consume_if_starts_with("//") {
65            // 单行注释
66            while let Some(ch) = state.peek() {
67                if ch == '\n' || ch == '\r' {
68                    break;
69                }
70                state.advance(ch.len_utf8());
71            }
72
73            state.add_token(DotSyntaxKind::Comment, start_pos, state.get_position());
74            true
75        }
76        else if state.consume_if_starts_with("/*") {
77            // 多行注释
78            while let Some(ch) = state.peek() {
79                if ch == '*' && state.peek_next_n(1) == Some('/') {
80                    state.advance(2); // Skip */
81                    break;
82                }
83                state.advance(ch.len_utf8());
84            }
85
86            state.add_token(DotSyntaxKind::Comment, start_pos, state.get_position());
87            true
88        }
89        else if state.consume_if_starts_with("#") {
90            // # 风格注释
91            while let Some(ch) = state.peek() {
92                if ch == '\n' || ch == '\r' {
93                    break;
94                }
95                state.advance(ch.len_utf8());
96            }
97
98            state.add_token(DotSyntaxKind::Comment, start_pos, state.get_position());
99            true
100        }
101        else {
102            false
103        }
104    }
105
106    /// 处理标识符或关键字
107    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
108        let start_pos = state.get_position();
109
110        if let Some(ch) = state.peek() {
111            if ch.is_alphabetic() || ch == '_' {
112                state.advance(ch.len_utf8());
113
114                while let Some(ch) = state.peek() {
115                    if ch.is_alphanumeric() || ch == '_' {
116                        state.advance(ch.len_utf8());
117                    }
118                    else {
119                        break;
120                    }
121                }
122
123                let end_pos = state.get_position();
124                let text = state.get_text_in((start_pos..end_pos).into());
125
126                let token_kind = match text.to_lowercase().as_str() {
127                    "graph" => DotSyntaxKind::Graph,
128                    "digraph" => DotSyntaxKind::Digraph,
129                    "subgraph" => DotSyntaxKind::Subgraph,
130                    "node" => DotSyntaxKind::Node,
131                    "edge" => DotSyntaxKind::Edge,
132                    "strict" => DotSyntaxKind::Strict,
133                    _ => DotSyntaxKind::Identifier,
134                };
135
136                state.add_token(token_kind, start_pos, state.get_position());
137                true
138            }
139            else {
140                false
141            }
142        }
143        else {
144            false
145        }
146    }
147
148    /// 处理数字
149    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
150        let start_pos = state.get_position();
151
152        if let Some(ch) = state.peek() {
153            let is_negative = ch == '-';
154            let mut has_digit = false;
155
156            if is_negative {
157                // 检查负号后面是否有数字
158                if let Some(next_ch) = state.peek_next_n(1) {
159                    if next_ch.is_ascii_digit() {
160                        state.advance(1); // 跳过负号
161                    }
162                    else {
163                        return false;
164                    }
165                }
166                else {
167                    return false;
168                }
169            }
170
171            if let Some(ch) = state.peek() {
172                if ch.is_ascii_digit() {
173                    has_digit = true;
174                    state.advance(ch.len_utf8());
175
176                    // 处理整数部分
177                    while let Some(ch) = state.peek() {
178                        if ch.is_ascii_digit() {
179                            state.advance(ch.len_utf8());
180                        }
181                        else {
182                            break;
183                        }
184                    }
185
186                    // 处理小数部分
187                    if let Some('.') = state.peek() {
188                        let dot_pos = state.get_position();
189                        state.advance(1);
190
191                        if let Some(ch) = state.peek() {
192                            if ch.is_ascii_digit() {
193                                while let Some(ch) = state.peek() {
194                                    if ch.is_ascii_digit() {
195                                        state.advance(ch.len_utf8());
196                                    }
197                                    else {
198                                        break;
199                                    }
200                                }
201                            }
202                            else {
203                                // 回退点号
204                                state.set_position(dot_pos);
205                            }
206                        }
207                        else {
208                            // 回退点号
209                            state.set_position(dot_pos);
210                        }
211                    }
212                }
213            }
214
215            if has_digit || (is_negative && state.get_position() > start_pos + 1) {
216                state.add_token(DotSyntaxKind::Number, start_pos, state.get_position());
217                true
218            }
219            else {
220                // 回退到开始位
221                state.set_position(start_pos);
222                false
223            }
224        }
225        else {
226            false
227        }
228    }
229
230    /// 处理字符
231    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
232        let start_pos = state.get_position();
233
234        if let Some('"') = state.peek() {
235            state.advance(1);
236
237            while let Some(ch) = state.peek() {
238                if ch == '"' {
239                    state.advance(1);
240                    state.add_token(DotSyntaxKind::String, start_pos, state.get_position());
241                    return true;
242                }
243                else if ch == '\\' {
244                    state.advance(1);
245                    if state.peek().is_some() {
246                        state.advance(1);
247                    }
248                }
249                else {
250                    state.advance(ch.len_utf8());
251                }
252            }
253
254            // 未闭合的字符
255            state.add_token(DotSyntaxKind::Error, start_pos, state.get_position());
256            true
257        }
258        else {
259            false
260        }
261    }
262
263    /// 处理操作
264    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
265        let start_pos = state.get_position();
266
267        if state.consume_if_starts_with("->") {
268            state.add_token(DotSyntaxKind::Arrow, start_pos, state.get_position());
269            return true;
270        }
271        if state.consume_if_starts_with("--") {
272            state.add_token(DotSyntaxKind::Line, start_pos, state.get_position());
273            return true;
274        }
275
276        if let Some(ch) = state.peek() {
277            match ch {
278                '=' => {
279                    state.advance(1);
280                    state.add_token(DotSyntaxKind::Equal, start_pos, state.get_position());
281                    true
282                }
283                ';' => {
284                    state.advance(1);
285                    state.add_token(DotSyntaxKind::Semicolon, start_pos, state.get_position());
286                    true
287                }
288                ',' => {
289                    state.advance(1);
290                    state.add_token(DotSyntaxKind::Comma, start_pos, state.get_position());
291                    true
292                }
293                _ => false,
294            }
295        }
296        else {
297            false
298        }
299    }
300
301    /// 处理分隔
302    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
303        let start_pos = state.get_position();
304
305        if let Some(ch) = state.peek() {
306            let token_kind = match ch {
307                '{' => DotSyntaxKind::LeftBrace,
308                '}' => DotSyntaxKind::RightBrace,
309                '[' => DotSyntaxKind::LeftBracket,
310                ']' => DotSyntaxKind::RightBracket,
311                '(' => DotSyntaxKind::LeftParen,
312                ')' => DotSyntaxKind::RightParen,
313                _ => return false,
314            };
315
316            state.advance(ch.len_utf8());
317            state.add_token(token_kind, start_pos, state.get_position());
318            true
319        }
320        else {
321            false
322        }
323    }
324}
325
impl<'config> Lexer<DotLanguage> for DotLexer<'config> {
    /// Tokenizes `source` from scratch and finalizes the result through
    /// `cache`. `_edits` is ignored, so lexing is never incremental here —
    /// NOTE(review): confirm whether the `Lexer` trait expects incremental
    /// relexing to be implemented.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<DotLanguage>) -> LexOutput<DotLanguage> {
        let mut state = State::new(source);
        let result = self.run(&mut state);
        // Only append the EOF token on a successful run.
        if result.is_ok() {
            state.add_eof();
        }
        state.finish_with_cache(result, cache)
    }
}
336
337impl<'config> DotLexer<'config> {
338    /// 主要的词法分析逻辑
339    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
340        while state.not_at_end() {
341            let safe_point = state.get_position();
342
343            // 尝试各种词法规则
344            if self.skip_whitespace(state) {
345                continue;
346            }
347
348            if self.lex_newline(state) {
349                continue;
350            }
351
352            if self.lex_comment(state) {
353                continue;
354            }
355
356            if self.lex_identifier_or_keyword(state) {
357                continue;
358            }
359
360            if self.lex_number(state) {
361                continue;
362            }
363
364            if self.lex_string(state) {
365                continue;
366            }
367
368            if self.lex_operator(state) {
369                continue;
370            }
371
372            if self.lex_delimiter(state) {
373                continue;
374            }
375
376            // 如果所有规则都不匹配,跳过当前字符并标记为错误
377            let start_pos = state.get_position();
378            if let Some(ch) = state.peek() {
379                state.advance(ch.len_utf8());
380                state.add_token(DotSyntaxKind::Error, start_pos, state.get_position());
381            }
382
383            state.advance_if_dead_lock(safe_point);
384        }
385
386        Ok(())
387    }
388}