1use crate::{kind::WolframSyntaxKind, language::WolframLanguage};
2use oak_core::{
3 IncrementalCache, Lexer, LexerState, OakError,
4 lexer::{CommentLine, LexOutput, StringConfig, WhitespaceConfig},
5 source::Source,
6};
7use std::sync::LazyLock;
8
9type State<S> = LexerState<S, WolframLanguage>;
10
11static WL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static WL_COMMENT: LazyLock<CommentLine> = LazyLock::new(|| CommentLine { line_markers: &[] }); static WL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone)]
16pub struct WolframLexer<'config> {
17 config: &'config WolframLanguage,
18}
19
20impl<'config> Lexer<WolframLanguage> for WolframLexer<'config> {
21 fn lex_incremental(
22 &self,
23 source: impl Source,
24 changed: usize,
25 cache: IncrementalCache<WolframLanguage>,
26 ) -> LexOutput<WolframLanguage> {
27 let mut state = LexerState::new_with_cache(source, changed, cache);
28 let result = self.run(&mut state);
29 state.finish(result)
30 }
31}
32
33impl<'config> WolframLexer<'config> {
34 pub fn new(config: &'config WolframLanguage) -> Self {
35 Self { config }
36 }
37
38 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
39 while state.not_at_end() {
40 let safe_point = state.get_position();
41
42 if self.skip_whitespace(state) {
43 continue;
44 }
45
46 if self.skip_comment(state) {
47 continue;
48 }
49
50 if self.lex_string_literal(state) {
51 continue;
52 }
53
54 if self.lex_number_literal(state) {
55 continue;
56 }
57
58 if self.lex_identifier_or_keyword(state) {
59 continue;
60 }
61
62 if self.lex_operators(state) {
63 continue;
64 }
65
66 if self.lex_single_char_tokens(state) {
67 continue;
68 }
69
70 state.safe_check(safe_point);
71 }
72
73 let eof_pos = state.get_position();
75 state.add_token(WolframSyntaxKind::Eof, eof_pos, eof_pos);
76 Ok(())
77 }
78
79 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
80 match WL_WHITESPACE.scan(state.rest(), state.get_position(), WolframSyntaxKind::Whitespace) {
81 Some(token) => {
82 state.advance_with(token);
83 return true;
84 }
85 None => {}
86 }
87
88 if let Some(ch) = state.current() {
90 if ch == '\n' || ch == '\r' {
91 let start = state.get_position();
92 state.advance(1);
93 if ch == '\r' && state.current() == Some('\n') {
94 state.advance(1);
95 }
96 state.add_token(WolframSyntaxKind::Newline, start, state.get_position());
97 return true;
98 }
99 }
100 false
101 }
102
103 fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
104 let start = state.get_position();
105 let rest = state.rest();
106
107 if rest.starts_with("(*") {
109 state.advance(2);
110 let mut depth = 1usize;
111 while let Some(ch) = state.peek() {
112 if ch == '(' && state.peek_next_n(1) == Some('*') {
113 state.advance(2);
114 depth += 1;
115 continue;
116 }
117 if ch == '*' && state.peek_next_n(1) == Some(')') {
118 state.advance(2);
119 depth -= 1;
120 if depth == 0 {
121 break;
122 }
123 continue;
124 }
125 state.advance(ch.len_utf8());
126 }
127 state.add_token(WolframSyntaxKind::Comment, start, state.get_position());
128 return true;
129 }
130 false
131 }
132
133 fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
134 let start = state.get_position();
135
136 if state.current() == Some('"') {
138 state.advance(1);
139 let mut escaped = false;
140 while let Some(ch) = state.peek() {
141 if ch == '"' && !escaped {
142 state.advance(1); break;
144 }
145 state.advance(ch.len_utf8());
146 if escaped {
147 escaped = false;
148 continue;
149 }
150 if ch == '\\' {
151 escaped = true;
152 continue;
153 }
154 if ch == '\n' || ch == '\r' {
155 break;
156 }
157 }
158 state.add_token(WolframSyntaxKind::String, start, state.get_position());
159 return true;
160 }
161 false
162 }
163
164 fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
165 let start = state.get_position();
166 let first = match state.current() {
167 Some(c) => c,
168 None => return false,
169 };
170
171 if !first.is_ascii_digit() {
172 return false;
173 }
174
175 let mut is_real = false;
176
177 state.advance(1);
179 while let Some(c) = state.peek() {
180 if c.is_ascii_digit() {
181 state.advance(1);
182 }
183 else {
184 break;
185 }
186 }
187
188 if state.peek() == Some('.') {
190 let next = state.peek_next_n(1);
191 if next.map(|c| c.is_ascii_digit()).unwrap_or(false) {
192 is_real = true;
193 state.advance(1); while let Some(c) = state.peek() {
195 if c.is_ascii_digit() {
196 state.advance(1);
197 }
198 else {
199 break;
200 }
201 }
202 }
203 }
204
205 if let Some(c) = state.peek() {
207 if c == 'e' || c == 'E' {
208 let next = state.peek_next_n(1);
209 if next == Some('+') || next == Some('-') || next.map(|d| d.is_ascii_digit()).unwrap_or(false) {
210 is_real = true;
211 state.advance(1);
212 if let Some(sign) = state.peek() {
213 if sign == '+' || sign == '-' {
214 state.advance(1);
215 }
216 }
217 while let Some(d) = state.peek() {
218 if d.is_ascii_digit() {
219 state.advance(1);
220 }
221 else {
222 break;
223 }
224 }
225 }
226 }
227 }
228
229 let end = state.get_position();
230 state.add_token(if is_real { WolframSyntaxKind::Real } else { WolframSyntaxKind::Integer }, start, end);
231 true
232 }
233
234 fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
235 let start = state.get_position();
236 let ch = match state.current() {
237 Some(c) => c,
238 None => return false,
239 };
240
241 if !(ch.is_ascii_alphabetic() || ch == '$') {
242 return false;
243 }
244
245 state.advance(1);
246 while let Some(c) = state.current() {
247 if c.is_ascii_alphanumeric() || c == '$' {
248 state.advance(1);
249 }
250 else {
251 break;
252 }
253 }
254
255 let end = state.get_position();
256 let text = state.get_text_in((start..end).into());
257 let kind = match text {
258 "If" => WolframSyntaxKind::If,
259 "Then" => WolframSyntaxKind::Then,
260 "Else" => WolframSyntaxKind::Else,
261 "While" => WolframSyntaxKind::While,
262 "For" => WolframSyntaxKind::For,
263 "Do" => WolframSyntaxKind::Do,
264 "Function" => WolframSyntaxKind::Function,
265 "Module" => WolframSyntaxKind::Module,
266 "Block" => WolframSyntaxKind::Block,
267 "With" => WolframSyntaxKind::With,
268 "Table" => WolframSyntaxKind::Table,
269 "Map" => WolframSyntaxKind::Map,
270 "Apply" => WolframSyntaxKind::Apply,
271 "Select" => WolframSyntaxKind::Select,
272 "Cases" => WolframSyntaxKind::Cases,
273 "Rule" => WolframSyntaxKind::Rule,
274 "RuleDelayed" => WolframSyntaxKind::RuleDelayed,
275 "Set" => WolframSyntaxKind::Set,
276 "SetDelayed" => WolframSyntaxKind::SetDelayed,
277 "Unset" => WolframSyntaxKind::Unset,
278 "Clear" => WolframSyntaxKind::Clear,
279 "ClearAll" => WolframSyntaxKind::ClearAll,
280 "Return" => WolframSyntaxKind::Return,
281 "Break" => WolframSyntaxKind::Break,
282 "Continue" => WolframSyntaxKind::Continue,
283 "True" => WolframSyntaxKind::True,
284 "False" => WolframSyntaxKind::False,
285 "Null" => WolframSyntaxKind::Null,
286 "Export" => WolframSyntaxKind::Export,
287 "Import" => WolframSyntaxKind::Import,
288 _ => WolframSyntaxKind::Identifier,
289 };
290 state.add_token(kind, start, state.get_position());
291 true
292 }
293
294 fn lex_operators<S: Source>(&self, state: &mut State<S>) -> bool {
295 let start = state.get_position();
296 let rest = state.rest();
297
298 let patterns: &[(&str, WolframSyntaxKind)] = &[
300 ("===", WolframSyntaxKind::Equal), ("=!=", WolframSyntaxKind::NotEqual), ("->", WolframSyntaxKind::Arrow),
303 ("=>", WolframSyntaxKind::DoubleArrow),
304 ("==", WolframSyntaxKind::Equal),
305 ("!=", WolframSyntaxKind::NotEqual),
306 ("<=", WolframSyntaxKind::LessEqual),
307 (">=", WolframSyntaxKind::GreaterEqual),
308 ("&&", WolframSyntaxKind::And),
309 ("||", WolframSyntaxKind::Or),
310 ("+=", WolframSyntaxKind::AddTo),
311 ("-=", WolframSyntaxKind::SubtractFrom),
312 ("*=", WolframSyntaxKind::TimesBy),
313 ("/=", WolframSyntaxKind::DivideBy),
314 ("___", WolframSyntaxKind::TripleUnderscore),
315 ("__", WolframSyntaxKind::DoubleUnderscore),
316 ("##", WolframSyntaxKind::SlotSequence),
317 ];
318
319 for (pat, kind) in patterns {
320 if rest.starts_with(pat) {
321 state.advance(pat.len());
322 state.add_token(*kind, start, state.get_position());
323 return true;
324 }
325 }
326
327 if let Some(ch) = state.current() {
329 let kind = match ch {
330 '+' => Some(WolframSyntaxKind::Plus),
331 '-' => Some(WolframSyntaxKind::Minus),
332 '*' => Some(WolframSyntaxKind::Times),
333 '/' => Some(WolframSyntaxKind::Divide),
334 '^' => Some(WolframSyntaxKind::Power),
335 '=' => Some(WolframSyntaxKind::Assign),
336 '<' => Some(WolframSyntaxKind::Less),
337 '>' => Some(WolframSyntaxKind::Greater),
338 '!' => Some(WolframSyntaxKind::Not),
339 '?' => Some(WolframSyntaxKind::Question),
340 '_' => Some(WolframSyntaxKind::Underscore),
341 '#' => Some(WolframSyntaxKind::Slot),
342 '.' => Some(WolframSyntaxKind::Dot),
343 ':' => Some(WolframSyntaxKind::Colon),
344 _ => None,
345 };
346 if let Some(k) = kind {
347 state.advance(ch.len_utf8());
348 state.add_token(k, start, state.get_position());
349 return true;
350 }
351 }
352 false
353 }
354
355 fn lex_single_char_tokens<S: Source>(&self, state: &mut State<S>) -> bool {
356 let start = state.get_position();
357 if let Some(ch) = state.current() {
358 let kind = match ch {
359 '(' => WolframSyntaxKind::LeftParen,
360 ')' => WolframSyntaxKind::RightParen,
361 '[' => WolframSyntaxKind::LeftBracket,
362 ']' => WolframSyntaxKind::RightBracket,
363 '{' => WolframSyntaxKind::LeftBrace,
364 '}' => WolframSyntaxKind::RightBrace,
365 ',' => WolframSyntaxKind::Comma,
366 ';' => WolframSyntaxKind::Semicolon,
367 _ => {
368 state.advance(ch.len_utf8());
370 state.add_token(WolframSyntaxKind::Error, start, state.get_position());
371 return true;
372 }
373 };
374 state.advance(ch.len_utf8());
375 state.add_token(kind, start, state.get_position());
376 true
377 }
378 else {
379 false
380 }
381 }
382}