Skip to main content

oak_wolfram/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::WolframLanguage, lexer::token_type::WolframTokenType};
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError,
7    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8    source::{Source, TextEdit},
9};
10use std::sync::LazyLock;
11
12type State<'a, S> = LexerState<'a, S, WolframLanguage>;
13
14static WL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
15static WL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "(*", block_end: "*)", nested_blocks: true });
16static WL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
17
18#[derive(Clone, Debug)]
19pub struct WolframLexer<'config> {
20    _config: &'config WolframLanguage,
21}
22
23impl<'config> Lexer<WolframLanguage> for WolframLexer<'config> {
24    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WolframLanguage>) -> LexOutput<WolframLanguage> {
25        let mut state = LexerState::new(source);
26        let result = self.run(&mut state);
27        if result.is_ok() {
28            state.add_eof();
29        }
30        state.finish_with_cache(result, cache)
31    }
32}
33
34impl<'config> WolframLexer<'config> {
35    pub fn new(config: &'config WolframLanguage) -> Self {
36        Self { _config: config }
37    }
38
39    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
40        while state.not_at_end() {
41            let safe_point = state.get_position();
42
43            if self.skip_whitespace(state) {
44                continue;
45            }
46
47            if self.skip_comment(state) {
48                continue;
49            }
50
51            if self.lex_string_literal(state) {
52                continue;
53            }
54
55            if self.lex_number_literal(state) {
56                continue;
57            }
58
59            if self.lex_identifier_or_keyword(state) {
60                continue;
61            }
62
63            if self.lex_operators(state) {
64                continue;
65            }
66
67            if self.lex_single_char_tokens(state) {
68                continue;
69            }
70
71            state.advance_if_dead_lock(safe_point);
72        }
73
74        Ok(())
75    }
76
77    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
78        // Handle newlines first
79        if let Some(ch) = state.peek() {
80            if ch == '\n' || ch == '\r' {
81                let start = state.get_position();
82                state.advance(ch.len_utf8());
83                if ch == '\r' && state.peek() == Some('\n') {
84                    state.advance(1);
85                }
86                state.add_token(WolframTokenType::Newline, start, state.get_position());
87                return true;
88            }
89        }
90
91        if WL_WHITESPACE.scan(state, WolframTokenType::Whitespace) {
92            return true;
93        }
94
95        false
96    }
97
98    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
99        WL_COMMENT.scan(state, WolframTokenType::Comment, WolframTokenType::Comment)
100    }
101
102    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
103        WL_STRING.scan(state, WolframTokenType::String)
104    }
105
106    fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
107        let start = state.get_position();
108        let first = match state.peek() {
109            Some(c) => c,
110            None => return false,
111        };
112
113        if !first.is_ascii_digit() {
114            return false;
115        }
116
117        let mut is_real = false;
118
119        // Integer part
120        state.advance(first.len_utf8());
121        while let Some(c) = state.peek() {
122            if c.is_ascii_digit() {
123                state.advance(1);
124            }
125            else {
126                break;
127            }
128        }
129
130        // Decimal part
131        if state.peek() == Some('.') {
132            let next = state.peek_next_n(1);
133            if next.map(|c| c.is_ascii_digit()).unwrap_or(false) {
134                is_real = true;
135                state.advance(1); // consume '.'
136                while let Some(c) = state.peek() {
137                    if c.is_ascii_digit() {
138                        state.advance(1);
139                    }
140                    else {
141                        break;
142                    }
143                }
144            }
145        }
146
147        // Scientific notation
148        if let Some(c) = state.peek() {
149            if c == 'e' || c == 'E' {
150                let next = state.peek_next_n(1);
151                if next == Some('+') || next == Some('-') || next.map(|d| d.is_ascii_digit()).unwrap_or(false) {
152                    is_real = true;
153                    state.advance(1);
154                    if let Some(sign) = state.peek() {
155                        if sign == '+' || sign == '-' {
156                            state.advance(1);
157                        }
158                    }
159                    while let Some(d) = state.peek() {
160                        if d.is_ascii_digit() {
161                            state.advance(1);
162                        }
163                        else {
164                            break;
165                        }
166                    }
167                }
168            }
169        }
170
171        let end = state.get_position();
172        state.add_token(if is_real { WolframTokenType::Real } else { WolframTokenType::Integer }, start, end);
173        true
174    }
175
176    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
177        let start = state.get_position();
178        let ch = match state.peek() {
179            Some(c) => c,
180            None => return false,
181        };
182
183        if !(ch.is_ascii_alphabetic() || ch == '$') {
184            return false;
185        }
186
187        state.advance(ch.len_utf8());
188        while let Some(c) = state.peek() {
189            if c.is_ascii_alphanumeric() || c == '$' {
190                state.advance(c.len_utf8());
191            }
192            else {
193                break;
194            }
195        }
196
197        let end = state.get_position();
198        let text = state.source().get_text_in((start..end).into());
199        let kind = match text.as_ref() {
200            "If" => WolframTokenType::If,
201            "Then" => WolframTokenType::Then,
202            "Else" => WolframTokenType::Else,
203            "While" => WolframTokenType::While,
204            "For" => WolframTokenType::For,
205            "Do" => WolframTokenType::Do,
206            "Function" => WolframTokenType::Function,
207            "Module" => WolframTokenType::Module,
208            "Block" => WolframTokenType::Block,
209            "With" => WolframTokenType::With,
210            "Table" => WolframTokenType::Table,
211            "Map" => WolframTokenType::Map,
212            "Apply" => WolframTokenType::Apply,
213            "Select" => WolframTokenType::Select,
214            "Cases" => WolframTokenType::Cases,
215            "Rule" => WolframTokenType::Rule,
216            "RuleDelayed" => WolframTokenType::RuleDelayed,
217            "Set" => WolframTokenType::Set,
218            "SetDelayed" => WolframTokenType::SetDelayed,
219            "Unset" => WolframTokenType::Unset,
220            "Clear" => WolframTokenType::Clear,
221            "ClearAll" => WolframTokenType::ClearAll,
222            "Return" => WolframTokenType::Return,
223            "Break" => WolframTokenType::Break,
224            "Continue" => WolframTokenType::Continue,
225            "True" => WolframTokenType::True,
226            "False" => WolframTokenType::False,
227            "Null" => WolframTokenType::Null,
228            "Export" => WolframTokenType::Export,
229            "Import" => WolframTokenType::Import,
230            _ => WolframTokenType::Identifier,
231        };
232        state.add_token(kind, start, end);
233        true
234    }
235
236    fn lex_operators<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
237        let start = state.get_position();
238
239        // Multi-character operators (prefer longest matches first)
240        let patterns: &[(&str, WolframTokenType)] = &[
241            ("===", WolframTokenType::Equal),    // SameQ
242            ("=!=", WolframTokenType::NotEqual), // UnsameQ
243            ("@@@", WolframTokenType::ApplyLevelOperator),
244            ("//@", WolframTokenType::MapAllOperator),
245            (":=", WolframTokenType::SetDelayed),
246            (":>", WolframTokenType::RuleDelayedOp),
247            ("->", WolframTokenType::Arrow),
248            ("=>", WolframTokenType::DoubleArrow),
249            ("/@", WolframTokenType::MapOperator),
250            ("@@", WolframTokenType::ApplyOperator),
251            ("//", WolframTokenType::SlashSlash),
252            ("@*", WolframTokenType::AtStar),
253            ("/*", WolframTokenType::StarSlash),
254            ("<>", WolframTokenType::StringJoin),
255            ("==", WolframTokenType::Equal),
256            ("!=", WolframTokenType::NotEqual),
257            ("<=", WolframTokenType::LessEqual),
258            (">=", WolframTokenType::GreaterEqual),
259            ("&&", WolframTokenType::And),
260            ("||", WolframTokenType::Or),
261            ("+=", WolframTokenType::AddTo),
262            ("-=", WolframTokenType::SubtractFrom),
263            ("*=", WolframTokenType::TimesBy),
264            ("/=", WolframTokenType::DivideBy),
265            ("!!", WolframTokenType::Factorial), // Double Factorial
266            ("___", WolframTokenType::TripleUnderscore),
267            ("__", WolframTokenType::DoubleUnderscore),
268            ("##", WolframTokenType::SlotSequence),
269        ];
270
271        for (pat, kind) in patterns {
272            if state.starts_with(pat) {
273                state.advance(pat.len());
274                state.add_token(*kind, start, state.get_position());
275                return true;
276            }
277        }
278
279        // Single-character operators
280        if let Some(ch) = state.peek() {
281            let kind = match ch {
282                '+' => Some(WolframTokenType::Plus),
283                '-' => Some(WolframTokenType::Minus),
284                '*' => Some(WolframTokenType::Times),
285                '/' => Some(WolframTokenType::Divide),
286                '^' => Some(WolframTokenType::Power),
287                '=' => Some(WolframTokenType::Assign),
288                '<' => Some(WolframTokenType::Less),
289                '>' => Some(WolframTokenType::Greater),
290                '?' => Some(WolframTokenType::Question),
291                '_' => Some(WolframTokenType::Underscore),
292                '#' => Some(WolframTokenType::Slot),
293                '.' => Some(WolframTokenType::Dot),
294                ':' => Some(WolframTokenType::Colon),
295                '@' => Some(WolframTokenType::At),
296                '&' => Some(WolframTokenType::Ampersand),
297                '!' => Some(WolframTokenType::Factorial),
298                _ => None,
299            };
300            if let Some(k) = kind {
301                state.advance(ch.len_utf8());
302                state.add_token(k, start, state.get_position());
303                return true;
304            }
305        }
306        false
307    }
308
309    fn lex_single_char_tokens<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
310        let start = state.get_position();
311        if let Some(ch) = state.peek() {
312            let kind = match ch {
313                '(' => WolframTokenType::LeftParen,
314                ')' => WolframTokenType::RightParen,
315                '[' => WolframTokenType::LeftBracket,
316                ']' => WolframTokenType::RightBracket,
317                '{' => WolframTokenType::LeftBrace,
318                '}' => WolframTokenType::RightBrace,
319                ',' => WolframTokenType::Comma,
320                ';' => WolframTokenType::Semicolon,
321                _ => {
322                    // Unknown character, treat as error
323                    state.advance(ch.len_utf8());
324                    state.add_token(WolframTokenType::Error, start, state.get_position());
325                    return true;
326                }
327            };
328            state.advance(ch.len_utf8());
329            state.add_token(kind, start, state.get_position());
330            true
331        }
332        else {
333            false
334        }
335    }
336}