// oak_dot/lexer/mod.rs
#![doc = include_str!("readme.md")]
/// Token types for the DOT language.
pub mod token_type;

use crate::{language::DotLanguage, lexer::token_type::DotTokenType};
use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
7
/// Shorthand for the shared lexer state, specialized to the DOT language.
pub(crate) type State<'a, S> = LexerState<'a, S, DotLanguage>;
9
/// Lexical analyzer for the DOT language.
#[derive(Clone)]
pub struct DotLexer<'config> {
    // Language configuration borrowed for the lexer's lifetime. None of the
    // visible lexing rules read it yet — presumably kept for configurability;
    // confirm before removing.
    config: &'config DotLanguage,
}
15
16impl<'config> DotLexer<'config> {
17    /// Creates a new DOT lexer with the given configuration.
18    pub fn new(config: &'config DotLanguage) -> Self {
19        Self { config }
20    }
21
22    /// Skips whitespace characters.
23    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
24        let start_pos = state.get_position();
25
26        while let Some(ch) = state.peek() {
27            if ch == ' ' || ch == '\t' {
28                state.advance(ch.len_utf8());
29            }
30            else {
31                break;
32            }
33        }
34
35        if state.get_position() > start_pos {
36            state.add_token(DotTokenType::Whitespace, start_pos, state.get_position());
37            true
38        }
39        else {
40            false
41        }
42    }
43
44    /// Handles newlines.
45    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
46        let start_pos = state.get_position();
47
48        if let Some('\n') = state.peek() {
49            state.advance(1);
50            state.add_token(DotTokenType::Newline, start_pos, state.get_position());
51            true
52        }
53        else if let Some('\r') = state.peek() {
54            state.advance(1);
55            if let Some('\n') = state.peek() {
56                state.advance(1);
57            }
58            state.add_token(DotTokenType::Newline, start_pos, state.get_position());
59            true
60        }
61        else {
62            false
63        }
64    }
65
66    /// Handles comments.
67    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
68        let start_pos = state.get_position();
69
70        if state.consume_if_starts_with("//") {
71            // Single-line comment
72            while let Some(ch) = state.peek() {
73                if ch == '\n' || ch == '\r' {
74                    break;
75                }
76                state.advance(ch.len_utf8());
77            }
78
79            state.add_token(DotTokenType::Comment, start_pos, state.get_position());
80            true
81        }
82        else if state.consume_if_starts_with("/*") {
83            // Multi-line comment
84            while let Some(ch) = state.peek() {
85                if ch == '*' && state.peek_next_n(1) == Some('/') {
86                    state.advance(2); // Skip */
87                    break;
88                }
89                state.advance(ch.len_utf8());
90            }
91
92            state.add_token(DotTokenType::Comment, start_pos, state.get_position());
93            true
94        }
95        else if state.consume_if_starts_with("#") {
96            // # style comment
97            while let Some(ch) = state.peek() {
98                if ch == '\n' || ch == '\r' {
99                    break;
100                }
101                state.advance(ch.len_utf8());
102            }
103
104            state.add_token(DotTokenType::Comment, start_pos, state.get_position());
105            true
106        }
107        else {
108            false
109        }
110    }
111
112    /// Handles identifiers or keywords.
113    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
114        let start_pos = state.get_position();
115
116        if let Some(ch) = state.peek() {
117            if ch.is_alphabetic() || ch == '_' {
118                state.advance(ch.len_utf8());
119
120                while let Some(ch) = state.peek() {
121                    if ch.is_alphanumeric() || ch == '_' {
122                        state.advance(ch.len_utf8());
123                    }
124                    else {
125                        break;
126                    }
127                }
128
129                let end_pos = state.get_position();
130                let text = state.get_text_in((start_pos..end_pos).into());
131
132                let token_kind = match text.to_lowercase().as_str() {
133                    "graph" => DotTokenType::Graph,
134                    "digraph" => DotTokenType::Digraph,
135                    "subgraph" => DotTokenType::Subgraph,
136                    "node" => DotTokenType::Node,
137                    "edge" => DotTokenType::Edge,
138                    "strict" => DotTokenType::Strict,
139                    _ => DotTokenType::Identifier,
140                };
141
142                state.add_token(token_kind, start_pos, state.get_position());
143                true
144            }
145            else {
146                false
147            }
148        }
149        else {
150            false
151        }
152    }
153
154    /// Handles numbers.
155    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
156        let start_pos = state.get_position();
157
158        if let Some(ch) = state.peek() {
159            let is_negative = ch == '-';
160            let mut has_digit = false;
161
162            if is_negative {
163                // Check if there is a digit after the negative sign
164                if let Some(next_ch) = state.peek_next_n(1) {
165                    if next_ch.is_ascii_digit() {
166                        state.advance(1); // Skip negative sign
167                    }
168                    else {
169                        return false;
170                    }
171                }
172                else {
173                    return false;
174                }
175            }
176
177            if let Some(ch) = state.peek() {
178                if ch.is_ascii_digit() {
179                    has_digit = true;
180                    state.advance(ch.len_utf8());
181
182                    // Handle integer part
183                    while let Some(ch) = state.peek() {
184                        if ch.is_ascii_digit() {
185                            state.advance(ch.len_utf8());
186                        }
187                        else {
188                            break;
189                        }
190                    }
191
192                    // Handle fractional part
193                    if let Some('.') = state.peek() {
194                        let dot_pos = state.get_position();
195                        state.advance(1);
196
197                        if let Some(ch) = state.peek() {
198                            if ch.is_ascii_digit() {
199                                while let Some(ch) = state.peek() {
200                                    if ch.is_ascii_digit() {
201                                        state.advance(ch.len_utf8());
202                                    }
203                                    else {
204                                        break;
205                                    }
206                                }
207                            }
208                            else {
209                                // Backtrack dot
210                                state.set_position(dot_pos);
211                            }
212                        }
213                        else {
214                            // Backtrack dot
215                            state.set_position(dot_pos);
216                        }
217                    }
218                }
219            }
220
221            if has_digit || (is_negative && state.get_position() > start_pos + 1) {
222                state.add_token(DotTokenType::Number, start_pos, state.get_position());
223                true
224            }
225            else {
226                // Backtrack to start position
227                state.set_position(start_pos);
228                false
229            }
230        }
231        else {
232            false
233        }
234    }
235
236    /// Handles strings.
237    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
238        let start_pos = state.get_position();
239
240        if let Some('"') = state.peek() {
241            state.advance(1);
242
243            while let Some(ch) = state.peek() {
244                if ch == '"' {
245                    state.advance(1);
246                    state.add_token(DotTokenType::String, start_pos, state.get_position());
247                    return true;
248                }
249                else if ch == '\\' {
250                    state.advance(1);
251                    if state.peek().is_some() {
252                        state.advance(1);
253                    }
254                }
255                else {
256                    state.advance(ch.len_utf8());
257                }
258            }
259
260            // Unclosed string
261            state.add_token(DotTokenType::Error, start_pos, state.get_position());
262            true
263        }
264        else {
265            false
266        }
267    }
268
269    /// Handles operators.
270    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
271        let start_pos = state.get_position();
272
273        if state.consume_if_starts_with("->") {
274            state.add_token(DotTokenType::Arrow, start_pos, state.get_position());
275            return true;
276        }
277        if state.consume_if_starts_with("--") {
278            state.add_token(DotTokenType::Line, start_pos, state.get_position());
279            return true;
280        }
281
282        if let Some(ch) = state.peek() {
283            match ch {
284                '=' => {
285                    state.advance(1);
286                    state.add_token(DotTokenType::Equal, start_pos, state.get_position());
287                    true
288                }
289                ';' => {
290                    state.advance(1);
291                    state.add_token(DotTokenType::Semicolon, start_pos, state.get_position());
292                    true
293                }
294                ',' => {
295                    state.advance(1);
296                    state.add_token(DotTokenType::Comma, start_pos, state.get_position());
297                    true
298                }
299                _ => false,
300            }
301        }
302        else {
303            false
304        }
305    }
306
307    /// Handles delimiters.
308    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
309        let start_pos = state.get_position();
310
311        if let Some(ch) = state.peek() {
312            let token_kind = match ch {
313                '{' => DotTokenType::LeftBrace,
314                '}' => DotTokenType::RightBrace,
315                '[' => DotTokenType::LeftBracket,
316                ']' => DotTokenType::RightBracket,
317                '(' => DotTokenType::LeftParen,
318                ')' => DotTokenType::RightParen,
319                _ => return false,
320            };
321
322            state.advance(ch.len_utf8());
323            state.add_token(token_kind, start_pos, state.get_position());
324            true
325        }
326        else {
327            false
328        }
329    }
330}
331
332impl<'config> Lexer<DotLanguage> for DotLexer<'config> {
333    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<DotLanguage>) -> LexOutput<DotLanguage> {
334        let mut state = State::new(source);
335        let result = self.run(&mut state);
336        if result.is_ok() {
337            state.add_eof();
338        }
339        state.finish_with_cache(result, cache)
340    }
341}
342
343impl<'config> DotLexer<'config> {
344    /// Main lexical analysis logic.
345    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
346        while state.not_at_end() {
347            let safe_point = state.get_position();
348
349            // Try various lexical rules
350            if self.skip_whitespace(state) {
351                continue;
352            }
353
354            if self.lex_newline(state) {
355                continue;
356            }
357
358            if self.lex_comment(state) {
359                continue;
360            }
361
362            if self.lex_identifier_or_keyword(state) {
363                continue;
364            }
365
366            if self.lex_number(state) {
367                continue;
368            }
369
370            if self.lex_string(state) {
371                continue;
372            }
373
374            if self.lex_operator(state) {
375                continue;
376            }
377
378            if self.lex_delimiter(state) {
379                continue;
380            }
381
382            // If no rules match, skip the current character and mark it as an error
383            let start_pos = state.get_position();
384            if let Some(ch) = state.peek() {
385                state.advance(ch.len_utf8());
386                state.add_token(DotTokenType::Error, start_pos, state.get_position());
387            }
388
389            state.advance_if_dead_lock(safe_point);
390        }
391
392        Ok(())
393    }
394}