oak_wolfram/lexer/
mod.rs

1use crate::{kind::WolframSyntaxKind, language::WolframLanguage};
2use oak_core::{
3    IncrementalCache, Lexer, LexerState, OakError,
4    lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7use std::sync::LazyLock;
8
/// Lexer state specialised to the Wolfram language definition.
type State<S> = LexerState<S, WolframLanguage>;

/// Shared whitespace scanning configuration (Unicode whitespace enabled).
static WL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Line-comment configuration with no markers: Wolfram uses block comments.
/// Not referenced by the lexer methods visible in this file.
static WL_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &[] });
/// String configuration: double-quote delimiter with backslash as the escape character.
/// Not referenced by the lexer methods visible in this file; `lex_string_literal` scans strings by hand.
static WL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Lexer for Wolfram Language source text.
///
/// Holds a borrowed [`WolframLanguage`] configuration for the lifetime `'config`.
#[derive(Clone)]
pub struct WolframLexer<'config> {
    /// Language configuration supplied at construction.
    /// NOTE(review): not read by any method visible in this file.
    config: &'config WolframLanguage,
}
19
20impl<'config> Lexer<WolframLanguage> for WolframLexer<'config> {
21    fn lex_incremental(
22        &self,
23        source: impl Source,
24        changed: usize,
25        cache: IncrementalCache<WolframLanguage>,
26    ) -> LexOutput<WolframLanguage> {
27        let mut state = LexerState::new_with_cache(source, changed, cache);
28        let result = self.run(&mut state);
29        state.finish(result)
30    }
31}
32
33impl<'config> WolframLexer<'config> {
34    pub fn new(config: &'config WolframLanguage) -> Self {
35        Self { config }
36    }
37
38    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
39        while state.not_at_end() {
40            let safe_point = state.get_position();
41
42            if self.skip_whitespace(state) {
43                continue;
44            }
45
46            if self.skip_comment(state) {
47                continue;
48            }
49
50            if self.lex_string_literal(state) {
51                continue;
52            }
53
54            if self.lex_number_literal(state) {
55                continue;
56            }
57
58            if self.lex_identifier_or_keyword(state) {
59                continue;
60            }
61
62            if self.lex_operators(state) {
63                continue;
64            }
65
66            if self.lex_single_char_tokens(state) {
67                continue;
68            }
69
70            state.safe_check(safe_point);
71        }
72
73        // 添加 EOF token
74        let eof_pos = state.get_position();
75        state.add_token(WolframSyntaxKind::Eof, eof_pos, eof_pos);
76        Ok(())
77    }
78
79    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
80        match WL_WHITESPACE.scan(state.rest(), state.get_position(), WolframSyntaxKind::Whitespace) {
81            Some(token) => {
82                state.advance_with(token);
83                return true;
84            }
85            None => {}
86        }
87
88        // Handle newlines separately
89        if let Some(ch) = state.current() {
90            if ch == '\n' || ch == '\r' {
91                let start = state.get_position();
92                state.advance(1);
93                if ch == '\r' && state.current() == Some('\n') {
94                    state.advance(1);
95                }
96                state.add_token(WolframSyntaxKind::Newline, start, state.get_position());
97                return true;
98            }
99        }
100        false
101    }
102
103    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
104        let start = state.get_position();
105        let rest = state.rest();
106
107        // Wolfram block comment: (* ... *) with nesting support
108        if rest.starts_with("(*") {
109            state.advance(2);
110            let mut depth = 1usize;
111            while let Some(ch) = state.peek() {
112                if ch == '(' && state.peek_next_n(1) == Some('*') {
113                    state.advance(2);
114                    depth += 1;
115                    continue;
116                }
117                if ch == '*' && state.peek_next_n(1) == Some(')') {
118                    state.advance(2);
119                    depth -= 1;
120                    if depth == 0 {
121                        break;
122                    }
123                    continue;
124                }
125                state.advance(ch.len_utf8());
126            }
127            state.add_token(WolframSyntaxKind::Comment, start, state.get_position());
128            return true;
129        }
130        false
131    }
132
133    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
134        let start = state.get_position();
135
136        // Normal string: "..."
137        if state.current() == Some('"') {
138            state.advance(1);
139            let mut escaped = false;
140            while let Some(ch) = state.peek() {
141                if ch == '"' && !escaped {
142                    state.advance(1); // consume closing quote
143                    break;
144                }
145                state.advance(ch.len_utf8());
146                if escaped {
147                    escaped = false;
148                    continue;
149                }
150                if ch == '\\' {
151                    escaped = true;
152                    continue;
153                }
154                if ch == '\n' || ch == '\r' {
155                    break;
156                }
157            }
158            state.add_token(WolframSyntaxKind::String, start, state.get_position());
159            return true;
160        }
161        false
162    }
163
164    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
165        let start = state.get_position();
166        let first = match state.current() {
167            Some(c) => c,
168            None => return false,
169        };
170
171        if !first.is_ascii_digit() {
172            return false;
173        }
174
175        let mut is_real = false;
176
177        // Integer part
178        state.advance(1);
179        while let Some(c) = state.peek() {
180            if c.is_ascii_digit() {
181                state.advance(1);
182            }
183            else {
184                break;
185            }
186        }
187
188        // Decimal part
189        if state.peek() == Some('.') {
190            let next = state.peek_next_n(1);
191            if next.map(|c| c.is_ascii_digit()).unwrap_or(false) {
192                is_real = true;
193                state.advance(1); // consume '.'
194                while let Some(c) = state.peek() {
195                    if c.is_ascii_digit() {
196                        state.advance(1);
197                    }
198                    else {
199                        break;
200                    }
201                }
202            }
203        }
204
205        // Scientific notation
206        if let Some(c) = state.peek() {
207            if c == 'e' || c == 'E' {
208                let next = state.peek_next_n(1);
209                if next == Some('+') || next == Some('-') || next.map(|d| d.is_ascii_digit()).unwrap_or(false) {
210                    is_real = true;
211                    state.advance(1);
212                    if let Some(sign) = state.peek() {
213                        if sign == '+' || sign == '-' {
214                            state.advance(1);
215                        }
216                    }
217                    while let Some(d) = state.peek() {
218                        if d.is_ascii_digit() {
219                            state.advance(1);
220                        }
221                        else {
222                            break;
223                        }
224                    }
225                }
226            }
227        }
228
229        let end = state.get_position();
230        state.add_token(if is_real { WolframSyntaxKind::Real } else { WolframSyntaxKind::Integer }, start, end);
231        true
232    }
233
234    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
235        let start = state.get_position();
236        let ch = match state.current() {
237            Some(c) => c,
238            None => return false,
239        };
240
241        if !(ch.is_ascii_alphabetic() || ch == '$') {
242            return false;
243        }
244
245        state.advance(1);
246        while let Some(c) = state.current() {
247            if c.is_ascii_alphanumeric() || c == '$' {
248                state.advance(1);
249            }
250            else {
251                break;
252            }
253        }
254
255        let end = state.get_position();
256        let text = state.get_text_in((start..end).into());
257        let kind = match text {
258            "If" => WolframSyntaxKind::If,
259            "Then" => WolframSyntaxKind::Then,
260            "Else" => WolframSyntaxKind::Else,
261            "While" => WolframSyntaxKind::While,
262            "For" => WolframSyntaxKind::For,
263            "Do" => WolframSyntaxKind::Do,
264            "Function" => WolframSyntaxKind::Function,
265            "Module" => WolframSyntaxKind::Module,
266            "Block" => WolframSyntaxKind::Block,
267            "With" => WolframSyntaxKind::With,
268            "Table" => WolframSyntaxKind::Table,
269            "Map" => WolframSyntaxKind::Map,
270            "Apply" => WolframSyntaxKind::Apply,
271            "Select" => WolframSyntaxKind::Select,
272            "Cases" => WolframSyntaxKind::Cases,
273            "Rule" => WolframSyntaxKind::Rule,
274            "RuleDelayed" => WolframSyntaxKind::RuleDelayed,
275            "Set" => WolframSyntaxKind::Set,
276            "SetDelayed" => WolframSyntaxKind::SetDelayed,
277            "Unset" => WolframSyntaxKind::Unset,
278            "Clear" => WolframSyntaxKind::Clear,
279            "ClearAll" => WolframSyntaxKind::ClearAll,
280            "Return" => WolframSyntaxKind::Return,
281            "Break" => WolframSyntaxKind::Break,
282            "Continue" => WolframSyntaxKind::Continue,
283            "True" => WolframSyntaxKind::True,
284            "False" => WolframSyntaxKind::False,
285            "Null" => WolframSyntaxKind::Null,
286            "Export" => WolframSyntaxKind::Export,
287            "Import" => WolframSyntaxKind::Import,
288            _ => WolframSyntaxKind::Identifier,
289        };
290        state.add_token(kind, start, state.get_position());
291        true
292    }
293
294    fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
295        let start = state.get_position();
296        let rest = state.rest();
297
298        // Multi-character operators (prefer longest matches first)
299        let patterns: &[(&str, WolframSyntaxKind)] = &[
300            ("===", WolframSyntaxKind::Equal),    // SameQ
301            ("=!=", WolframSyntaxKind::NotEqual), // UnsameQ
302            ("->", WolframSyntaxKind::Arrow),
303            ("=>", WolframSyntaxKind::DoubleArrow),
304            ("==", WolframSyntaxKind::Equal),
305            ("!=", WolframSyntaxKind::NotEqual),
306            ("<=", WolframSyntaxKind::LessEqual),
307            (">=", WolframSyntaxKind::GreaterEqual),
308            ("&&", WolframSyntaxKind::And),
309            ("||", WolframSyntaxKind::Or),
310            ("+=", WolframSyntaxKind::AddTo),
311            ("-=", WolframSyntaxKind::SubtractFrom),
312            ("*=", WolframSyntaxKind::TimesBy),
313            ("/=", WolframSyntaxKind::DivideBy),
314            ("___", WolframSyntaxKind::TripleUnderscore),
315            ("__", WolframSyntaxKind::DoubleUnderscore),
316            ("##", WolframSyntaxKind::SlotSequence),
317        ];
318
319        for (pat, kind) in patterns {
320            if rest.starts_with(pat) {
321                state.advance(pat.len());
322                state.add_token(*kind, start, state.get_position());
323                return true;
324            }
325        }
326
327        // Single-character operators
328        if let Some(ch) = state.current() {
329            let kind = match ch {
330                '+' => Some(WolframSyntaxKind::Plus),
331                '-' => Some(WolframSyntaxKind::Minus),
332                '*' => Some(WolframSyntaxKind::Times),
333                '/' => Some(WolframSyntaxKind::Divide),
334                '^' => Some(WolframSyntaxKind::Power),
335                '=' => Some(WolframSyntaxKind::Assign),
336                '<' => Some(WolframSyntaxKind::Less),
337                '>' => Some(WolframSyntaxKind::Greater),
338                '!' => Some(WolframSyntaxKind::Not),
339                '?' => Some(WolframSyntaxKind::Question),
340                '_' => Some(WolframSyntaxKind::Underscore),
341                '#' => Some(WolframSyntaxKind::Slot),
342                '.' => Some(WolframSyntaxKind::Dot),
343                ':' => Some(WolframSyntaxKind::Colon),
344                _ => None,
345            };
346            if let Some(k) = kind {
347                state.advance(ch.len_utf8());
348                state.add_token(k, start, state.get_position());
349                return true;
350            }
351        }
352        false
353    }
354
355    fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
356        let start = state.get_position();
357        if let Some(ch) = state.current() {
358            let kind = match ch {
359                '(' => WolframSyntaxKind::LeftParen,
360                ')' => WolframSyntaxKind::RightParen,
361                '[' => WolframSyntaxKind::LeftBracket,
362                ']' => WolframSyntaxKind::RightBracket,
363                '{' => WolframSyntaxKind::LeftBrace,
364                '}' => WolframSyntaxKind::RightBrace,
365                ',' => WolframSyntaxKind::Comma,
366                ';' => WolframSyntaxKind::Semicolon,
367                _ => {
368                    // Unknown character, treat as error
369                    state.advance(ch.len_utf8());
370                    state.add_token(WolframSyntaxKind::Error, start, state.get_position());
371                    return true;
372                }
373            };
374            state.advance(ch.len_utf8());
375            state.add_token(kind, start, state.get_position());
376            true
377        }
378        else {
379            false
380        }
381    }
382}