Skip to main content

oak_wolfram/lexer/
mod.rs

1use crate::{kind::WolframSyntaxKind, language::WolframLanguage};
2use oak_core::{
3    Lexer, LexerCache, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5    source::{Source, TextEdit},
6};
7use std::sync::LazyLock;
8
/// Shorthand for the `oak_core` lexer state specialised to [`WolframLanguage`];
/// `'a` and `S` are passed through to `LexerState` unchanged.
type State<'a, S> = LexerState<'a, S, WolframLanguage>;
10
/// Whitespace scanner configuration with Unicode whitespace enabled.
static WL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner configuration: block comments are written `(* ... *)` and may nest.
// NOTE(review): `line_marker` is the empty string, presumably meaning "no line-comment syntax";
// confirm that `CommentConfig::scan` treats an empty marker as disabled rather than as always matching.
static WL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "(*", block_end: "*)", nested_blocks: true });
/// String scanner configuration: double-quoted strings with backslash as the escape character.
static WL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Lexer that turns Wolfram Language source text into `WolframSyntaxKind` tokens.
#[derive(Clone, Debug)]
pub struct WolframLexer<'config> {
    // Borrowed language configuration. The leading underscore reflects that no
    // method in this file reads it yet.
    _config: &'config WolframLanguage,
}
19
20impl<'config> Lexer<WolframLanguage> for WolframLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WolframLanguage>) -> LexOutput<WolframLanguage> {
22        let mut state = LexerState::new(source);
23        let result = self.run(&mut state);
24        if result.is_ok() {
25            state.add_eof();
26        }
27        state.finish_with_cache(result, cache)
28    }
29}
30
31impl<'config> WolframLexer<'config> {
32    pub fn new(config: &'config WolframLanguage) -> Self {
33        Self { _config: config }
34    }
35
36    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
37        while state.not_at_end() {
38            let safe_point = state.get_position();
39
40            if self.skip_whitespace(state) {
41                continue;
42            }
43
44            if self.skip_comment(state) {
45                continue;
46            }
47
48            if self.lex_string_literal(state) {
49                continue;
50            }
51
52            if self.lex_number_literal(state) {
53                continue;
54            }
55
56            if self.lex_identifier_or_keyword(state) {
57                continue;
58            }
59
60            if self.lex_operators(state) {
61                continue;
62            }
63
64            if self.lex_single_char_tokens(state) {
65                continue;
66            }
67
68            state.advance_if_dead_lock(safe_point);
69        }
70
71        Ok(())
72    }
73
74    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
75        if WL_WHITESPACE.scan(state, WolframSyntaxKind::Whitespace) {
76            return true;
77        }
78
79        // Handle newlines separately
80        if let Some(ch) = state.peek() {
81            if ch == '\n' || ch == '\r' {
82                let start = state.get_position();
83                state.advance(ch.len_utf8());
84                if ch == '\r' && state.peek() == Some('\n') {
85                    state.advance(1);
86                }
87                state.add_token(WolframSyntaxKind::Newline, start, state.get_position());
88                return true;
89            }
90        }
91        false
92    }
93
94    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
95        WL_COMMENT.scan(state, WolframSyntaxKind::Comment, WolframSyntaxKind::Comment)
96    }
97
    /// Delegates string-literal recognition to the shared `WL_STRING` scanner
    /// (double-quoted, backslash-escaped), tagging a match as `String`.
    /// Returns whatever the scanner reports.
    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        WL_STRING.scan(state, WolframSyntaxKind::String)
    }
101
102    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
103        let start = state.get_position();
104        let first = match state.peek() {
105            Some(c) => c,
106            None => return false,
107        };
108
109        if !first.is_ascii_digit() {
110            return false;
111        }
112
113        let mut is_real = false;
114
115        // Integer part
116        state.advance(first.len_utf8());
117        while let Some(c) = state.peek() {
118            if c.is_ascii_digit() {
119                state.advance(1);
120            }
121            else {
122                break;
123            }
124        }
125
126        // Decimal part
127        if state.peek() == Some('.') {
128            let next = state.peek_next_n(1);
129            if next.map(|c| c.is_ascii_digit()).unwrap_or(false) {
130                is_real = true;
131                state.advance(1); // consume '.'
132                while let Some(c) = state.peek() {
133                    if c.is_ascii_digit() {
134                        state.advance(1);
135                    }
136                    else {
137                        break;
138                    }
139                }
140            }
141        }
142
143        // Scientific notation
144        if let Some(c) = state.peek() {
145            if c == 'e' || c == 'E' {
146                let next = state.peek_next_n(1);
147                if next == Some('+') || next == Some('-') || next.map(|d| d.is_ascii_digit()).unwrap_or(false) {
148                    is_real = true;
149                    state.advance(1);
150                    if let Some(sign) = state.peek() {
151                        if sign == '+' || sign == '-' {
152                            state.advance(1);
153                        }
154                    }
155                    while let Some(d) = state.peek() {
156                        if d.is_ascii_digit() {
157                            state.advance(1);
158                        }
159                        else {
160                            break;
161                        }
162                    }
163                }
164            }
165        }
166
167        let end = state.get_position();
168        state.add_token(if is_real { WolframSyntaxKind::Real } else { WolframSyntaxKind::Integer }, start, end);
169        true
170    }
171
172    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
173        let start = state.get_position();
174        let ch = match state.peek() {
175            Some(c) => c,
176            None => return false,
177        };
178
179        if !(ch.is_ascii_alphabetic() || ch == '$') {
180            return false;
181        }
182
183        state.advance(ch.len_utf8());
184        while let Some(c) = state.peek() {
185            if c.is_ascii_alphanumeric() || c == '$' {
186                state.advance(c.len_utf8());
187            }
188            else {
189                break;
190            }
191        }
192
193        let end = state.get_position();
194        let text = state.source().get_text_in((start..end).into());
195        let kind = match text.as_ref() {
196            "If" => WolframSyntaxKind::If,
197            "Then" => WolframSyntaxKind::Then,
198            "Else" => WolframSyntaxKind::Else,
199            "While" => WolframSyntaxKind::While,
200            "For" => WolframSyntaxKind::For,
201            "Do" => WolframSyntaxKind::Do,
202            "Function" => WolframSyntaxKind::Function,
203            "Module" => WolframSyntaxKind::Module,
204            "Block" => WolframSyntaxKind::Block,
205            "With" => WolframSyntaxKind::With,
206            "Table" => WolframSyntaxKind::Table,
207            "Map" => WolframSyntaxKind::Map,
208            "Apply" => WolframSyntaxKind::Apply,
209            "Select" => WolframSyntaxKind::Select,
210            "Cases" => WolframSyntaxKind::Cases,
211            "Rule" => WolframSyntaxKind::Rule,
212            "RuleDelayed" => WolframSyntaxKind::RuleDelayed,
213            "Set" => WolframSyntaxKind::Set,
214            "SetDelayed" => WolframSyntaxKind::SetDelayed,
215            "Unset" => WolframSyntaxKind::Unset,
216            "Clear" => WolframSyntaxKind::Clear,
217            "ClearAll" => WolframSyntaxKind::ClearAll,
218            "Return" => WolframSyntaxKind::Return,
219            "Break" => WolframSyntaxKind::Break,
220            "Continue" => WolframSyntaxKind::Continue,
221            "True" => WolframSyntaxKind::True,
222            "False" => WolframSyntaxKind::False,
223            "Null" => WolframSyntaxKind::Null,
224            "Export" => WolframSyntaxKind::Export,
225            "Import" => WolframSyntaxKind::Import,
226            _ => WolframSyntaxKind::Identifier,
227        };
228        state.add_token(kind, start, state.get_position());
229        true
230    }
231
232    fn lex_operators<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
233        let start = state.get_position();
234
235        // Multi-character operators (prefer longest matches first)
236        let patterns: &[(&str, WolframSyntaxKind)] = &[
237            ("===", WolframSyntaxKind::Equal),    // SameQ
238            ("=!=", WolframSyntaxKind::NotEqual), // UnsameQ
239            ("->", WolframSyntaxKind::Arrow),
240            ("=>", WolframSyntaxKind::DoubleArrow),
241            ("==", WolframSyntaxKind::Equal),
242            ("!=", WolframSyntaxKind::NotEqual),
243            ("<=", WolframSyntaxKind::LessEqual),
244            (">=", WolframSyntaxKind::GreaterEqual),
245            ("&&", WolframSyntaxKind::And),
246            ("||", WolframSyntaxKind::Or),
247            ("+=", WolframSyntaxKind::AddTo),
248            ("-=", WolframSyntaxKind::SubtractFrom),
249            ("*=", WolframSyntaxKind::TimesBy),
250            ("/=", WolframSyntaxKind::DivideBy),
251            ("___", WolframSyntaxKind::TripleUnderscore),
252            ("__", WolframSyntaxKind::DoubleUnderscore),
253            ("##", WolframSyntaxKind::SlotSequence),
254        ];
255
256        for (pat, kind) in patterns {
257            if state.starts_with(pat) {
258                state.advance(pat.len());
259                state.add_token(*kind, start, state.get_position());
260                return true;
261            }
262        }
263
264        // Single-character operators
265        if let Some(ch) = state.peek() {
266            let kind = match ch {
267                '+' => Some(WolframSyntaxKind::Plus),
268                '-' => Some(WolframSyntaxKind::Minus),
269                '*' => Some(WolframSyntaxKind::Times),
270                '/' => Some(WolframSyntaxKind::Divide),
271                '^' => Some(WolframSyntaxKind::Power),
272                '=' => Some(WolframSyntaxKind::Assign),
273                '<' => Some(WolframSyntaxKind::Less),
274                '>' => Some(WolframSyntaxKind::Greater),
275                '!' => Some(WolframSyntaxKind::Not),
276                '?' => Some(WolframSyntaxKind::Question),
277                '_' => Some(WolframSyntaxKind::Underscore),
278                '#' => Some(WolframSyntaxKind::Slot),
279                '.' => Some(WolframSyntaxKind::Dot),
280                ':' => Some(WolframSyntaxKind::Colon),
281                _ => None,
282            };
283            if let Some(k) = kind {
284                state.advance(ch.len_utf8());
285                state.add_token(k, start, state.get_position());
286                return true;
287            }
288        }
289        false
290    }
291
292    fn lex_single_char_tokens<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
293        let start = state.get_position();
294        if let Some(ch) = state.peek() {
295            let kind = match ch {
296                '(' => WolframSyntaxKind::LeftParen,
297                ')' => WolframSyntaxKind::RightParen,
298                '[' => WolframSyntaxKind::LeftBracket,
299                ']' => WolframSyntaxKind::RightBracket,
300                '{' => WolframSyntaxKind::LeftBrace,
301                '}' => WolframSyntaxKind::RightBrace,
302                ',' => WolframSyntaxKind::Comma,
303                ';' => WolframSyntaxKind::Semicolon,
304                _ => {
305                    // Unknown character, treat as error
306                    state.advance(ch.len_utf8());
307                    state.add_token(WolframSyntaxKind::Error, start, state.get_position());
308                    return true;
309                }
310            };
311            state.advance(ch.len_utf8());
312            state.add_token(kind, start, state.get_position());
313            true
314        }
315        else {
316            false
317        }
318    }
319}