oak_wolfram/lexer/
mod.rs

1use crate::{kind::WolframSyntaxKind, language::WolframLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5    source::{Source, TextEdit},
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, WolframLanguage>;
10
11static WL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static WL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "(*", block_end: "*)", nested_blocks: true });
13static WL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Lexer for Wolfram Language source text.
///
/// The type has no fields, so every instance behaves identically.
#[derive(Debug, Default, Clone)]
pub struct WolframLexer;
17
18impl Lexer<WolframLanguage> for WolframLexer {
19    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WolframLanguage>) -> LexOutput<WolframLanguage> {
20        let mut state = LexerState::new(source);
21        let result = self.run(&mut state);
22        if result.is_ok() {
23            state.add_eof();
24        }
25        state.finish_with_cache(result, cache)
26    }
27}
28
29impl WolframLexer {
30    pub fn new(_config: &WolframLanguage) -> Self {
31        Self
32    }
33
34    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
35        while state.not_at_end() {
36            let safe_point = state.get_position();
37
38            if self.skip_whitespace(state) {
39                continue;
40            }
41
42            if self.skip_comment(state) {
43                continue;
44            }
45
46            if self.lex_string_literal(state) {
47                continue;
48            }
49
50            if self.lex_number_literal(state) {
51                continue;
52            }
53
54            if self.lex_identifier_or_keyword(state) {
55                continue;
56            }
57
58            if self.lex_operators(state) {
59                continue;
60            }
61
62            if self.lex_single_char_tokens(state) {
63                continue;
64            }
65
66            state.advance_if_dead_lock(safe_point);
67        }
68
69        Ok(())
70    }
71
72    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
73        if WL_WHITESPACE.scan(state, WolframSyntaxKind::Whitespace) {
74            return true;
75        }
76
77        // Handle newlines separately
78        if let Some(ch) = state.peek() {
79            if ch == '\n' || ch == '\r' {
80                let start = state.get_position();
81                state.advance(ch.len_utf8());
82                if ch == '\r' && state.peek() == Some('\n') {
83                    state.advance(1);
84                }
85                state.add_token(WolframSyntaxKind::Newline, start, state.get_position());
86                return true;
87            }
88        }
89        false
90    }
91
92    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
93        WL_COMMENT.scan(state, WolframSyntaxKind::Comment, WolframSyntaxKind::Comment)
94    }
95
96    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
97        WL_STRING.scan(state, WolframSyntaxKind::String)
98    }
99
100    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
101        let start = state.get_position();
102        let first = match state.peek() {
103            Some(c) => c,
104            None => return false,
105        };
106
107        if !first.is_ascii_digit() {
108            return false;
109        }
110
111        let mut is_real = false;
112
113        // Integer part
114        state.advance(first.len_utf8());
115        while let Some(c) = state.peek() {
116            if c.is_ascii_digit() {
117                state.advance(1);
118            }
119            else {
120                break;
121            }
122        }
123
124        // Decimal part
125        if state.peek() == Some('.') {
126            let next = state.peek_next_n(1);
127            if next.map(|c| c.is_ascii_digit()).unwrap_or(false) {
128                is_real = true;
129                state.advance(1); // consume '.'
130                while let Some(c) = state.peek() {
131                    if c.is_ascii_digit() {
132                        state.advance(1);
133                    }
134                    else {
135                        break;
136                    }
137                }
138            }
139        }
140
141        // Scientific notation
142        if let Some(c) = state.peek() {
143            if c == 'e' || c == 'E' {
144                let next = state.peek_next_n(1);
145                if next == Some('+') || next == Some('-') || next.map(|d| d.is_ascii_digit()).unwrap_or(false) {
146                    is_real = true;
147                    state.advance(1);
148                    if let Some(sign) = state.peek() {
149                        if sign == '+' || sign == '-' {
150                            state.advance(1);
151                        }
152                    }
153                    while let Some(d) = state.peek() {
154                        if d.is_ascii_digit() {
155                            state.advance(1);
156                        }
157                        else {
158                            break;
159                        }
160                    }
161                }
162            }
163        }
164
165        let end = state.get_position();
166        state.add_token(if is_real { WolframSyntaxKind::Real } else { WolframSyntaxKind::Integer }, start, end);
167        true
168    }
169
170    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
171        let start = state.get_position();
172        let ch = match state.peek() {
173            Some(c) => c,
174            None => return false,
175        };
176
177        if !(ch.is_ascii_alphabetic() || ch == '$') {
178            return false;
179        }
180
181        state.advance(ch.len_utf8());
182        while let Some(c) = state.peek() {
183            if c.is_ascii_alphanumeric() || c == '$' {
184                state.advance(c.len_utf8());
185            }
186            else {
187                break;
188            }
189        }
190
191        let end = state.get_position();
192        let text = state.source().get_text_in((start..end).into());
193        let kind = match text.as_ref() {
194            "If" => WolframSyntaxKind::If,
195            "Then" => WolframSyntaxKind::Then,
196            "Else" => WolframSyntaxKind::Else,
197            "While" => WolframSyntaxKind::While,
198            "For" => WolframSyntaxKind::For,
199            "Do" => WolframSyntaxKind::Do,
200            "Function" => WolframSyntaxKind::Function,
201            "Module" => WolframSyntaxKind::Module,
202            "Block" => WolframSyntaxKind::Block,
203            "With" => WolframSyntaxKind::With,
204            "Table" => WolframSyntaxKind::Table,
205            "Map" => WolframSyntaxKind::Map,
206            "Apply" => WolframSyntaxKind::Apply,
207            "Select" => WolframSyntaxKind::Select,
208            "Cases" => WolframSyntaxKind::Cases,
209            "Rule" => WolframSyntaxKind::Rule,
210            "RuleDelayed" => WolframSyntaxKind::RuleDelayed,
211            "Set" => WolframSyntaxKind::Set,
212            "SetDelayed" => WolframSyntaxKind::SetDelayed,
213            "Unset" => WolframSyntaxKind::Unset,
214            "Clear" => WolframSyntaxKind::Clear,
215            "ClearAll" => WolframSyntaxKind::ClearAll,
216            "Return" => WolframSyntaxKind::Return,
217            "Break" => WolframSyntaxKind::Break,
218            "Continue" => WolframSyntaxKind::Continue,
219            "True" => WolframSyntaxKind::True,
220            "False" => WolframSyntaxKind::False,
221            "Null" => WolframSyntaxKind::Null,
222            "Export" => WolframSyntaxKind::Export,
223            "Import" => WolframSyntaxKind::Import,
224            _ => WolframSyntaxKind::Identifier,
225        };
226        state.add_token(kind, start, state.get_position());
227        true
228    }
229
230    fn lex_operators<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
231        let start = state.get_position();
232
233        // Multi-character operators (prefer longest matches first)
234        let patterns: &[(&str, WolframSyntaxKind)] = &[
235            ("===", WolframSyntaxKind::Equal),    // SameQ
236            ("=!=", WolframSyntaxKind::NotEqual), // UnsameQ
237            ("->", WolframSyntaxKind::Arrow),
238            ("=>", WolframSyntaxKind::DoubleArrow),
239            ("==", WolframSyntaxKind::Equal),
240            ("!=", WolframSyntaxKind::NotEqual),
241            ("<=", WolframSyntaxKind::LessEqual),
242            (">=", WolframSyntaxKind::GreaterEqual),
243            ("&&", WolframSyntaxKind::And),
244            ("||", WolframSyntaxKind::Or),
245            ("+=", WolframSyntaxKind::AddTo),
246            ("-=", WolframSyntaxKind::SubtractFrom),
247            ("*=", WolframSyntaxKind::TimesBy),
248            ("/=", WolframSyntaxKind::DivideBy),
249            ("___", WolframSyntaxKind::TripleUnderscore),
250            ("__", WolframSyntaxKind::DoubleUnderscore),
251            ("##", WolframSyntaxKind::SlotSequence),
252        ];
253
254        for (pat, kind) in patterns {
255            if state.starts_with(pat) {
256                state.advance(pat.len());
257                state.add_token(*kind, start, state.get_position());
258                return true;
259            }
260        }
261
262        // Single-character operators
263        if let Some(ch) = state.peek() {
264            let kind = match ch {
265                '+' => Some(WolframSyntaxKind::Plus),
266                '-' => Some(WolframSyntaxKind::Minus),
267                '*' => Some(WolframSyntaxKind::Times),
268                '/' => Some(WolframSyntaxKind::Divide),
269                '^' => Some(WolframSyntaxKind::Power),
270                '=' => Some(WolframSyntaxKind::Assign),
271                '<' => Some(WolframSyntaxKind::Less),
272                '>' => Some(WolframSyntaxKind::Greater),
273                '!' => Some(WolframSyntaxKind::Not),
274                '?' => Some(WolframSyntaxKind::Question),
275                '_' => Some(WolframSyntaxKind::Underscore),
276                '#' => Some(WolframSyntaxKind::Slot),
277                '.' => Some(WolframSyntaxKind::Dot),
278                ':' => Some(WolframSyntaxKind::Colon),
279                _ => None,
280            };
281            if let Some(k) = kind {
282                state.advance(ch.len_utf8());
283                state.add_token(k, start, state.get_position());
284                return true;
285            }
286        }
287        false
288    }
289
290    fn lex_single_char_tokens<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
291        let start = state.get_position();
292        if let Some(ch) = state.peek() {
293            let kind = match ch {
294                '(' => WolframSyntaxKind::LeftParen,
295                ')' => WolframSyntaxKind::RightParen,
296                '[' => WolframSyntaxKind::LeftBracket,
297                ']' => WolframSyntaxKind::RightBracket,
298                '{' => WolframSyntaxKind::LeftBrace,
299                '}' => WolframSyntaxKind::RightBrace,
300                ',' => WolframSyntaxKind::Comma,
301                ';' => WolframSyntaxKind::Semicolon,
302                _ => {
303                    // Unknown character, treat as error
304                    state.advance(ch.len_utf8());
305                    state.add_token(WolframSyntaxKind::Error, start, state.get_position());
306                    return true;
307                }
308            };
309            state.advance(ch.len_utf8());
310            state.add_token(kind, start, state.get_position());
311            true
312        }
313        else {
314            false
315        }
316    }
317}