1use crate::{kind::WolframSyntaxKind, language::WolframLanguage};
2use oak_core::{
3 Lexer, LexerCache, LexerState, OakError,
4 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
5 source::{Source, TextEdit},
6};
7use std::sync::LazyLock;
8
9type State<'a, S> = LexerState<'a, S, WolframLanguage>;
10
11static WL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
12static WL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "(*", block_end: "*)", nested_blocks: true });
13static WL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
14
15#[derive(Clone, Debug)]
16pub struct WolframLexer<'config> {
17 _config: &'config WolframLanguage,
18}
19
20impl<'config> Lexer<WolframLanguage> for WolframLexer<'config> {
21 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WolframLanguage>) -> LexOutput<WolframLanguage> {
22 let mut state = LexerState::new(source);
23 let result = self.run(&mut state);
24 if result.is_ok() {
25 state.add_eof();
26 }
27 state.finish_with_cache(result, cache)
28 }
29}
30
31impl<'config> WolframLexer<'config> {
32 pub fn new(config: &'config WolframLanguage) -> Self {
33 Self { _config: config }
34 }
35
36 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
37 while state.not_at_end() {
38 let safe_point = state.get_position();
39
40 if self.skip_whitespace(state) {
41 continue;
42 }
43
44 if self.skip_comment(state) {
45 continue;
46 }
47
48 if self.lex_string_literal(state) {
49 continue;
50 }
51
52 if self.lex_number_literal(state) {
53 continue;
54 }
55
56 if self.lex_identifier_or_keyword(state) {
57 continue;
58 }
59
60 if self.lex_operators(state) {
61 continue;
62 }
63
64 if self.lex_single_char_tokens(state) {
65 continue;
66 }
67
68 state.advance_if_dead_lock(safe_point);
69 }
70
71 Ok(())
72 }
73
74 fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
75 if WL_WHITESPACE.scan(state, WolframSyntaxKind::Whitespace) {
76 return true;
77 }
78
79 if let Some(ch) = state.peek() {
81 if ch == '\n' || ch == '\r' {
82 let start = state.get_position();
83 state.advance(ch.len_utf8());
84 if ch == '\r' && state.peek() == Some('\n') {
85 state.advance(1);
86 }
87 state.add_token(WolframSyntaxKind::Newline, start, state.get_position());
88 return true;
89 }
90 }
91 false
92 }
93
94 fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
95 WL_COMMENT.scan(state, WolframSyntaxKind::Comment, WolframSyntaxKind::Comment)
96 }
97
98 fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
99 WL_STRING.scan(state, WolframSyntaxKind::String)
100 }
101
102 fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
103 let start = state.get_position();
104 let first = match state.peek() {
105 Some(c) => c,
106 None => return false,
107 };
108
109 if !first.is_ascii_digit() {
110 return false;
111 }
112
113 let mut is_real = false;
114
115 state.advance(first.len_utf8());
117 while let Some(c) = state.peek() {
118 if c.is_ascii_digit() {
119 state.advance(1);
120 }
121 else {
122 break;
123 }
124 }
125
126 if state.peek() == Some('.') {
128 let next = state.peek_next_n(1);
129 if next.map(|c| c.is_ascii_digit()).unwrap_or(false) {
130 is_real = true;
131 state.advance(1); while let Some(c) = state.peek() {
133 if c.is_ascii_digit() {
134 state.advance(1);
135 }
136 else {
137 break;
138 }
139 }
140 }
141 }
142
143 if let Some(c) = state.peek() {
145 if c == 'e' || c == 'E' {
146 let next = state.peek_next_n(1);
147 if next == Some('+') || next == Some('-') || next.map(|d| d.is_ascii_digit()).unwrap_or(false) {
148 is_real = true;
149 state.advance(1);
150 if let Some(sign) = state.peek() {
151 if sign == '+' || sign == '-' {
152 state.advance(1);
153 }
154 }
155 while let Some(d) = state.peek() {
156 if d.is_ascii_digit() {
157 state.advance(1);
158 }
159 else {
160 break;
161 }
162 }
163 }
164 }
165 }
166
167 let end = state.get_position();
168 state.add_token(if is_real { WolframSyntaxKind::Real } else { WolframSyntaxKind::Integer }, start, end);
169 true
170 }
171
172 fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
173 let start = state.get_position();
174 let ch = match state.peek() {
175 Some(c) => c,
176 None => return false,
177 };
178
179 if !(ch.is_ascii_alphabetic() || ch == '$') {
180 return false;
181 }
182
183 state.advance(ch.len_utf8());
184 while let Some(c) = state.peek() {
185 if c.is_ascii_alphanumeric() || c == '$' {
186 state.advance(c.len_utf8());
187 }
188 else {
189 break;
190 }
191 }
192
193 let end = state.get_position();
194 let text = state.source().get_text_in((start..end).into());
195 let kind = match text.as_ref() {
196 "If" => WolframSyntaxKind::If,
197 "Then" => WolframSyntaxKind::Then,
198 "Else" => WolframSyntaxKind::Else,
199 "While" => WolframSyntaxKind::While,
200 "For" => WolframSyntaxKind::For,
201 "Do" => WolframSyntaxKind::Do,
202 "Function" => WolframSyntaxKind::Function,
203 "Module" => WolframSyntaxKind::Module,
204 "Block" => WolframSyntaxKind::Block,
205 "With" => WolframSyntaxKind::With,
206 "Table" => WolframSyntaxKind::Table,
207 "Map" => WolframSyntaxKind::Map,
208 "Apply" => WolframSyntaxKind::Apply,
209 "Select" => WolframSyntaxKind::Select,
210 "Cases" => WolframSyntaxKind::Cases,
211 "Rule" => WolframSyntaxKind::Rule,
212 "RuleDelayed" => WolframSyntaxKind::RuleDelayed,
213 "Set" => WolframSyntaxKind::Set,
214 "SetDelayed" => WolframSyntaxKind::SetDelayed,
215 "Unset" => WolframSyntaxKind::Unset,
216 "Clear" => WolframSyntaxKind::Clear,
217 "ClearAll" => WolframSyntaxKind::ClearAll,
218 "Return" => WolframSyntaxKind::Return,
219 "Break" => WolframSyntaxKind::Break,
220 "Continue" => WolframSyntaxKind::Continue,
221 "True" => WolframSyntaxKind::True,
222 "False" => WolframSyntaxKind::False,
223 "Null" => WolframSyntaxKind::Null,
224 "Export" => WolframSyntaxKind::Export,
225 "Import" => WolframSyntaxKind::Import,
226 _ => WolframSyntaxKind::Identifier,
227 };
228 state.add_token(kind, start, state.get_position());
229 true
230 }
231
232 fn lex_operators<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
233 let start = state.get_position();
234
235 let patterns: &[(&str, WolframSyntaxKind)] = &[
237 ("===", WolframSyntaxKind::Equal), ("=!=", WolframSyntaxKind::NotEqual), ("->", WolframSyntaxKind::Arrow),
240 ("=>", WolframSyntaxKind::DoubleArrow),
241 ("==", WolframSyntaxKind::Equal),
242 ("!=", WolframSyntaxKind::NotEqual),
243 ("<=", WolframSyntaxKind::LessEqual),
244 (">=", WolframSyntaxKind::GreaterEqual),
245 ("&&", WolframSyntaxKind::And),
246 ("||", WolframSyntaxKind::Or),
247 ("+=", WolframSyntaxKind::AddTo),
248 ("-=", WolframSyntaxKind::SubtractFrom),
249 ("*=", WolframSyntaxKind::TimesBy),
250 ("/=", WolframSyntaxKind::DivideBy),
251 ("___", WolframSyntaxKind::TripleUnderscore),
252 ("__", WolframSyntaxKind::DoubleUnderscore),
253 ("##", WolframSyntaxKind::SlotSequence),
254 ];
255
256 for (pat, kind) in patterns {
257 if state.starts_with(pat) {
258 state.advance(pat.len());
259 state.add_token(*kind, start, state.get_position());
260 return true;
261 }
262 }
263
264 if let Some(ch) = state.peek() {
266 let kind = match ch {
267 '+' => Some(WolframSyntaxKind::Plus),
268 '-' => Some(WolframSyntaxKind::Minus),
269 '*' => Some(WolframSyntaxKind::Times),
270 '/' => Some(WolframSyntaxKind::Divide),
271 '^' => Some(WolframSyntaxKind::Power),
272 '=' => Some(WolframSyntaxKind::Assign),
273 '<' => Some(WolframSyntaxKind::Less),
274 '>' => Some(WolframSyntaxKind::Greater),
275 '!' => Some(WolframSyntaxKind::Not),
276 '?' => Some(WolframSyntaxKind::Question),
277 '_' => Some(WolframSyntaxKind::Underscore),
278 '#' => Some(WolframSyntaxKind::Slot),
279 '.' => Some(WolframSyntaxKind::Dot),
280 ':' => Some(WolframSyntaxKind::Colon),
281 _ => None,
282 };
283 if let Some(k) = kind {
284 state.advance(ch.len_utf8());
285 state.add_token(k, start, state.get_position());
286 return true;
287 }
288 }
289 false
290 }
291
292 fn lex_single_char_tokens<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
293 let start = state.get_position();
294 if let Some(ch) = state.peek() {
295 let kind = match ch {
296 '(' => WolframSyntaxKind::LeftParen,
297 ')' => WolframSyntaxKind::RightParen,
298 '[' => WolframSyntaxKind::LeftBracket,
299 ']' => WolframSyntaxKind::RightBracket,
300 '{' => WolframSyntaxKind::LeftBrace,
301 '}' => WolframSyntaxKind::RightBrace,
302 ',' => WolframSyntaxKind::Comma,
303 ';' => WolframSyntaxKind::Semicolon,
304 _ => {
305 state.advance(ch.len_utf8());
307 state.add_token(WolframSyntaxKind::Error, start, state.get_position());
308 return true;
309 }
310 };
311 state.advance(ch.len_utf8());
312 state.add_token(kind, start, state.get_position());
313 true
314 }
315 else {
316 false
317 }
318 }
319}