// oak_dot/lexer/mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::DotLanguage, lexer::token_type::DotTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
6
/// Shorthand for the shared lexer state, specialized to the DOT language.
type State<'a, S> = LexerState<'a, S, DotLanguage>;
8
/// Lexer for the Graphviz DOT language.
#[derive(Clone)]
pub struct DotLexer<'config> {
    // Held for future configurable lexing; currently unused by any rule.
    _config: &'config DotLanguage,
}
13
14impl<'config> DotLexer<'config> {
15    pub fn new(config: &'config DotLanguage) -> Self {
16        Self { _config: config }
17    }
18
19    /// 跳过空白字符
20    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
21        let start_pos = state.get_position();
22
23        while let Some(ch) = state.peek() {
24            if ch == ' ' || ch == '\t' {
25                state.advance(ch.len_utf8());
26            }
27            else {
28                break;
29            }
30        }
31
32        if state.get_position() > start_pos {
33            state.add_token(DotTokenType::Whitespace, start_pos, state.get_position());
34            true
35        }
36        else {
37            false
38        }
39    }
40
41    /// 处理换行
42    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
43        let start_pos = state.get_position();
44
45        if let Some('\n') = state.peek() {
46            state.advance(1);
47            state.add_token(DotTokenType::Newline, start_pos, state.get_position());
48            true
49        }
50        else if let Some('\r') = state.peek() {
51            state.advance(1);
52            if let Some('\n') = state.peek() {
53                state.advance(1);
54            }
55            state.add_token(DotTokenType::Newline, start_pos, state.get_position());
56            true
57        }
58        else {
59            false
60        }
61    }
62
63    /// 处理注释
64    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
65        let start_pos = state.get_position();
66
67        if state.consume_if_starts_with("//") {
68            // 单行注释
69            while let Some(ch) = state.peek() {
70                if ch == '\n' || ch == '\r' {
71                    break;
72                }
73                state.advance(ch.len_utf8());
74            }
75
76            state.add_token(DotTokenType::Comment, start_pos, state.get_position());
77            true
78        }
79        else if state.consume_if_starts_with("/*") {
80            // 多行注释
81            while let Some(ch) = state.peek() {
82                if ch == '*' && state.peek_next_n(1) == Some('/') {
83                    state.advance(2); // Skip */
84                    break;
85                }
86                state.advance(ch.len_utf8());
87            }
88
89            state.add_token(DotTokenType::Comment, start_pos, state.get_position());
90            true
91        }
92        else if state.consume_if_starts_with("#") {
93            // # 风格注释
94            while let Some(ch) = state.peek() {
95                if ch == '\n' || ch == '\r' {
96                    break;
97                }
98                state.advance(ch.len_utf8());
99            }
100
101            state.add_token(DotTokenType::Comment, start_pos, state.get_position());
102            true
103        }
104        else {
105            false
106        }
107    }
108
109    /// 处理标识符或关键字
110    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
111        let start_pos = state.get_position();
112
113        if let Some(ch) = state.peek() {
114            if ch.is_alphabetic() || ch == '_' {
115                state.advance(ch.len_utf8());
116
117                while let Some(ch) = state.peek() {
118                    if ch.is_alphanumeric() || ch == '_' {
119                        state.advance(ch.len_utf8());
120                    }
121                    else {
122                        break;
123                    }
124                }
125
126                let end_pos = state.get_position();
127                let text = state.get_text_in((start_pos..end_pos).into());
128
129                let token_kind = match text.to_lowercase().as_str() {
130                    "graph" => DotTokenType::Graph,
131                    "digraph" => DotTokenType::Digraph,
132                    "subgraph" => DotTokenType::Subgraph,
133                    "node" => DotTokenType::Node,
134                    "edge" => DotTokenType::Edge,
135                    "strict" => DotTokenType::Strict,
136                    _ => DotTokenType::Identifier,
137                };
138
139                state.add_token(token_kind, start_pos, state.get_position());
140                true
141            }
142            else {
143                false
144            }
145        }
146        else {
147            false
148        }
149    }
150
151    /// 处理数字
152    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
153        let start_pos = state.get_position();
154
155        if let Some(ch) = state.peek() {
156            let is_negative = ch == '-';
157            let mut has_digit = false;
158
159            if is_negative {
160                // 检查负号后面是否有数字
161                if let Some(next_ch) = state.peek_next_n(1) {
162                    if next_ch.is_ascii_digit() {
163                        state.advance(1); // 跳过负号
164                    }
165                    else {
166                        return false;
167                    }
168                }
169                else {
170                    return false;
171                }
172            }
173
174            if let Some(ch) = state.peek() {
175                if ch.is_ascii_digit() {
176                    has_digit = true;
177                    state.advance(ch.len_utf8());
178
179                    // 处理整数部分
180                    while let Some(ch) = state.peek() {
181                        if ch.is_ascii_digit() {
182                            state.advance(ch.len_utf8());
183                        }
184                        else {
185                            break;
186                        }
187                    }
188
189                    // 处理小数部分
190                    if let Some('.') = state.peek() {
191                        let dot_pos = state.get_position();
192                        state.advance(1);
193
194                        if let Some(ch) = state.peek() {
195                            if ch.is_ascii_digit() {
196                                while let Some(ch) = state.peek() {
197                                    if ch.is_ascii_digit() {
198                                        state.advance(ch.len_utf8());
199                                    }
200                                    else {
201                                        break;
202                                    }
203                                }
204                            }
205                            else {
206                                // 回退点号
207                                state.set_position(dot_pos);
208                            }
209                        }
210                        else {
211                            // 回退点号
212                            state.set_position(dot_pos);
213                        }
214                    }
215                }
216            }
217
218            if has_digit || (is_negative && state.get_position() > start_pos + 1) {
219                state.add_token(DotTokenType::Number, start_pos, state.get_position());
220                true
221            }
222            else {
223                // 回退到开始位
224                state.set_position(start_pos);
225                false
226            }
227        }
228        else {
229            false
230        }
231    }
232
233    /// 处理字符
234    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
235        let start_pos = state.get_position();
236
237        if let Some('"') = state.peek() {
238            state.advance(1);
239
240            while let Some(ch) = state.peek() {
241                if ch == '"' {
242                    state.advance(1);
243                    state.add_token(DotTokenType::String, start_pos, state.get_position());
244                    return true;
245                }
246                else if ch == '\\' {
247                    state.advance(1);
248                    if state.peek().is_some() {
249                        state.advance(1);
250                    }
251                }
252                else {
253                    state.advance(ch.len_utf8());
254                }
255            }
256
257            // 未闭合的字符
258            state.add_token(DotTokenType::Error, start_pos, state.get_position());
259            true
260        }
261        else {
262            false
263        }
264    }
265
266    /// 处理操作
267    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
268        let start_pos = state.get_position();
269
270        if state.consume_if_starts_with("->") {
271            state.add_token(DotTokenType::Arrow, start_pos, state.get_position());
272            return true;
273        }
274        if state.consume_if_starts_with("--") {
275            state.add_token(DotTokenType::Line, start_pos, state.get_position());
276            return true;
277        }
278
279        if let Some(ch) = state.peek() {
280            match ch {
281                '=' => {
282                    state.advance(1);
283                    state.add_token(DotTokenType::Equal, start_pos, state.get_position());
284                    true
285                }
286                ';' => {
287                    state.advance(1);
288                    state.add_token(DotTokenType::Semicolon, start_pos, state.get_position());
289                    true
290                }
291                ',' => {
292                    state.advance(1);
293                    state.add_token(DotTokenType::Comma, start_pos, state.get_position());
294                    true
295                }
296                _ => false,
297            }
298        }
299        else {
300            false
301        }
302    }
303
304    /// 处理分隔
305    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
306        let start_pos = state.get_position();
307
308        if let Some(ch) = state.peek() {
309            let token_kind = match ch {
310                '{' => DotTokenType::LeftBrace,
311                '}' => DotTokenType::RightBrace,
312                '[' => DotTokenType::LeftBracket,
313                ']' => DotTokenType::RightBracket,
314                '(' => DotTokenType::LeftParen,
315                ')' => DotTokenType::RightParen,
316                _ => return false,
317            };
318
319            state.advance(ch.len_utf8());
320            state.add_token(token_kind, start_pos, state.get_position());
321            true
322        }
323        else {
324            false
325        }
326    }
327}
328
329impl<'config> Lexer<DotLanguage> for DotLexer<'config> {
330    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<DotLanguage>) -> LexOutput<DotLanguage> {
331        let mut state = State::new(source);
332        let result = self.run(&mut state);
333        if result.is_ok() {
334            state.add_eof();
335        }
336        state.finish_with_cache(result, cache)
337    }
338}
339
340impl<'config> DotLexer<'config> {
341    /// 主要的词法分析逻辑
342    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
343        while state.not_at_end() {
344            let safe_point = state.get_position();
345
346            // 尝试各种词法规则
347            if self.skip_whitespace(state) {
348                continue;
349            }
350
351            if self.lex_newline(state) {
352                continue;
353            }
354
355            if self.lex_comment(state) {
356                continue;
357            }
358
359            if self.lex_identifier_or_keyword(state) {
360                continue;
361            }
362
363            if self.lex_number(state) {
364                continue;
365            }
366
367            if self.lex_string(state) {
368                continue;
369            }
370
371            if self.lex_operator(state) {
372                continue;
373            }
374
375            if self.lex_delimiter(state) {
376                continue;
377            }
378
379            // 如果所有规则都不匹配,跳过当前字符并标记为错误
380            let start_pos = state.get_position();
381            if let Some(ch) = state.peek() {
382                state.advance(ch.len_utf8());
383                state.add_token(DotTokenType::Error, start_pos, state.get_position());
384            }
385
386            state.advance_if_dead_lock(safe_point);
387        }
388
389        Ok(())
390    }
391}