1use crate::{kind::WolframSyntaxKind, language::WolframLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5 source::{Source, TextEdit},
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, WolframLanguage>;
10
11static WL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static WL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "(*", block_end: "*)", nested_blocks: true });
13static WL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
/// Stateless lexer for Wolfram Language source text.
///
/// The type carries no data; every scanning rule lives in the module-level
/// configuration statics and in the methods of its `impl` blocks.
#[derive(
    Clone,
    Debug,
    Default,
)]
pub struct WolframLexer;
17
18impl Lexer<WolframLanguage> for WolframLexer {
19 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WolframLanguage>) -> LexOutput<WolframLanguage> {
20 let mut state = LexerState::new(source);
21 let result = self.run(&mut state);
22 if result.is_ok() {
23 state.add_eof();
24 }
25 state.finish_with_cache(result, cache)
26 }
27}
28
29impl WolframLexer {
30 pub fn new(_config: &WolframLanguage) -> Self {
31 Self
32 }
33
34 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
35 while state.not_at_end() {
36 let safe_point = state.get_position();
37
38 if self.skip_whitespace(state) {
39 continue;
40 }
41
42 if self.skip_comment(state) {
43 continue;
44 }
45
46 if self.lex_string_literal(state) {
47 continue;
48 }
49
50 if self.lex_number_literal(state) {
51 continue;
52 }
53
54 if self.lex_identifier_or_keyword(state) {
55 continue;
56 }
57
58 if self.lex_operators(state) {
59 continue;
60 }
61
62 if self.lex_single_char_tokens(state) {
63 continue;
64 }
65
66 state.advance_if_dead_lock(safe_point);
67 }
68
69 Ok(())
70 }
71
72 fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
73 if WL_WHITESPACE.scan(state, WolframSyntaxKind::Whitespace) {
74 return true;
75 }
76
77 if let Some(ch) = state.peek() {
79 if ch == '\n' || ch == '\r' {
80 let start = state.get_position();
81 state.advance(ch.len_utf8());
82 if ch == '\r' && state.peek() == Some('\n') {
83 state.advance(1);
84 }
85 state.add_token(WolframSyntaxKind::Newline, start, state.get_position());
86 return true;
87 }
88 }
89 false
90 }
91
92 fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
93 WL_COMMENT.scan(state, WolframSyntaxKind::Comment, WolframSyntaxKind::Comment)
94 }
95
96 fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
97 WL_STRING.scan(state, WolframSyntaxKind::String)
98 }
99
100 fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
101 let start = state.get_position();
102 let first = match state.peek() {
103 Some(c) => c,
104 None => return false,
105 };
106
107 if !first.is_ascii_digit() {
108 return false;
109 }
110
111 let mut is_real = false;
112
113 state.advance(first.len_utf8());
115 while let Some(c) = state.peek() {
116 if c.is_ascii_digit() {
117 state.advance(1);
118 }
119 else {
120 break;
121 }
122 }
123
124 if state.peek() == Some('.') {
126 let next = state.peek_next_n(1);
127 if next.map(|c| c.is_ascii_digit()).unwrap_or(false) {
128 is_real = true;
129 state.advance(1); while let Some(c) = state.peek() {
131 if c.is_ascii_digit() {
132 state.advance(1);
133 }
134 else {
135 break;
136 }
137 }
138 }
139 }
140
141 if let Some(c) = state.peek() {
143 if c == 'e' || c == 'E' {
144 let next = state.peek_next_n(1);
145 if next == Some('+') || next == Some('-') || next.map(|d| d.is_ascii_digit()).unwrap_or(false) {
146 is_real = true;
147 state.advance(1);
148 if let Some(sign) = state.peek() {
149 if sign == '+' || sign == '-' {
150 state.advance(1);
151 }
152 }
153 while let Some(d) = state.peek() {
154 if d.is_ascii_digit() {
155 state.advance(1);
156 }
157 else {
158 break;
159 }
160 }
161 }
162 }
163 }
164
165 let end = state.get_position();
166 state.add_token(if is_real { WolframSyntaxKind::Real } else { WolframSyntaxKind::Integer }, start, end);
167 true
168 }
169
170 fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
171 let start = state.get_position();
172 let ch = match state.peek() {
173 Some(c) => c,
174 None => return false,
175 };
176
177 if !(ch.is_ascii_alphabetic() || ch == '$') {
178 return false;
179 }
180
181 state.advance(ch.len_utf8());
182 while let Some(c) = state.peek() {
183 if c.is_ascii_alphanumeric() || c == '$' {
184 state.advance(c.len_utf8());
185 }
186 else {
187 break;
188 }
189 }
190
191 let end = state.get_position();
192 let text = state.source().get_text_in((start..end).into());
193 let kind = match text.as_ref() {
194 "If" => WolframSyntaxKind::If,
195 "Then" => WolframSyntaxKind::Then,
196 "Else" => WolframSyntaxKind::Else,
197 "While" => WolframSyntaxKind::While,
198 "For" => WolframSyntaxKind::For,
199 "Do" => WolframSyntaxKind::Do,
200 "Function" => WolframSyntaxKind::Function,
201 "Module" => WolframSyntaxKind::Module,
202 "Block" => WolframSyntaxKind::Block,
203 "With" => WolframSyntaxKind::With,
204 "Table" => WolframSyntaxKind::Table,
205 "Map" => WolframSyntaxKind::Map,
206 "Apply" => WolframSyntaxKind::Apply,
207 "Select" => WolframSyntaxKind::Select,
208 "Cases" => WolframSyntaxKind::Cases,
209 "Rule" => WolframSyntaxKind::Rule,
210 "RuleDelayed" => WolframSyntaxKind::RuleDelayed,
211 "Set" => WolframSyntaxKind::Set,
212 "SetDelayed" => WolframSyntaxKind::SetDelayed,
213 "Unset" => WolframSyntaxKind::Unset,
214 "Clear" => WolframSyntaxKind::Clear,
215 "ClearAll" => WolframSyntaxKind::ClearAll,
216 "Return" => WolframSyntaxKind::Return,
217 "Break" => WolframSyntaxKind::Break,
218 "Continue" => WolframSyntaxKind::Continue,
219 "True" => WolframSyntaxKind::True,
220 "False" => WolframSyntaxKind::False,
221 "Null" => WolframSyntaxKind::Null,
222 "Export" => WolframSyntaxKind::Export,
223 "Import" => WolframSyntaxKind::Import,
224 _ => WolframSyntaxKind::Identifier,
225 };
226 state.add_token(kind, start, state.get_position());
227 true
228 }
229
230 fn lex_operators<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
231 let start = state.get_position();
232
233 let patterns: &[(&str, WolframSyntaxKind)] = &[
235 ("===", WolframSyntaxKind::Equal), ("=!=", WolframSyntaxKind::NotEqual), ("->", WolframSyntaxKind::Arrow),
238 ("=>", WolframSyntaxKind::DoubleArrow),
239 ("==", WolframSyntaxKind::Equal),
240 ("!=", WolframSyntaxKind::NotEqual),
241 ("<=", WolframSyntaxKind::LessEqual),
242 (">=", WolframSyntaxKind::GreaterEqual),
243 ("&&", WolframSyntaxKind::And),
244 ("||", WolframSyntaxKind::Or),
245 ("+=", WolframSyntaxKind::AddTo),
246 ("-=", WolframSyntaxKind::SubtractFrom),
247 ("*=", WolframSyntaxKind::TimesBy),
248 ("/=", WolframSyntaxKind::DivideBy),
249 ("___", WolframSyntaxKind::TripleUnderscore),
250 ("__", WolframSyntaxKind::DoubleUnderscore),
251 ("##", WolframSyntaxKind::SlotSequence),
252 ];
253
254 for (pat, kind) in patterns {
255 if state.starts_with(pat) {
256 state.advance(pat.len());
257 state.add_token(*kind, start, state.get_position());
258 return true;
259 }
260 }
261
262 if let Some(ch) = state.peek() {
264 let kind = match ch {
265 '+' => Some(WolframSyntaxKind::Plus),
266 '-' => Some(WolframSyntaxKind::Minus),
267 '*' => Some(WolframSyntaxKind::Times),
268 '/' => Some(WolframSyntaxKind::Divide),
269 '^' => Some(WolframSyntaxKind::Power),
270 '=' => Some(WolframSyntaxKind::Assign),
271 '<' => Some(WolframSyntaxKind::Less),
272 '>' => Some(WolframSyntaxKind::Greater),
273 '!' => Some(WolframSyntaxKind::Not),
274 '?' => Some(WolframSyntaxKind::Question),
275 '_' => Some(WolframSyntaxKind::Underscore),
276 '#' => Some(WolframSyntaxKind::Slot),
277 '.' => Some(WolframSyntaxKind::Dot),
278 ':' => Some(WolframSyntaxKind::Colon),
279 _ => None,
280 };
281 if let Some(k) = kind {
282 state.advance(ch.len_utf8());
283 state.add_token(k, start, state.get_position());
284 return true;
285 }
286 }
287 false
288 }
289
290 fn lex_single_char_tokens<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
291 let start = state.get_position();
292 if let Some(ch) = state.peek() {
293 let kind = match ch {
294 '(' => WolframSyntaxKind::LeftParen,
295 ')' => WolframSyntaxKind::RightParen,
296 '[' => WolframSyntaxKind::LeftBracket,
297 ']' => WolframSyntaxKind::RightBracket,
298 '{' => WolframSyntaxKind::LeftBrace,
299 '}' => WolframSyntaxKind::RightBrace,
300 ',' => WolframSyntaxKind::Comma,
301 ';' => WolframSyntaxKind::Semicolon,
302 _ => {
303 state.advance(ch.len_utf8());
305 state.add_token(WolframSyntaxKind::Error, start, state.get_position());
306 return true;
307 }
308 };
309 state.advance(ch.len_utf8());
310 state.add_token(kind, start, state.get_position());
311 true
312 }
313 else {
314 false
315 }
316 }
317}