1pub mod token_type;
5
6use crate::{language::WolframLanguage, lexer::token_type::WolframTokenType};
7use oak_core::{
8 Lexer, LexerCache, LexerState, OakError,
9 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
10 source::{Source, TextEdit},
11};
12use std::sync::LazyLock;
13
/// Shorthand for the shared lexer state specialised to the Wolfram language.
pub(crate) type State<'a, S> = LexerState<'a, S, WolframLanguage>;

/// Whitespace scanner; `unicode_whitespace: true` presumably also accepts non-ASCII space characters — confirm against `WhitespaceConfig` docs.
static WL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Comment scanner: Wolfram has only `(* ... *)` block comments (nestable), so `line_marker` is empty — assumes an empty marker disables line comments; verify `CommentConfig` semantics.
static WL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "(*", block_end: "*)", nested_blocks: true });
/// String scanner: double-quoted strings with backslash escapes.
static WL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
19
/// Lexer for the Wolfram language, borrowing its language configuration.
#[derive(Clone, Debug)]
pub struct WolframLexer<'config> {
    // Language configuration; currently only stored, not consulted by the scan rules.
    config: &'config WolframLanguage,
}
26
27impl<'config> Lexer<WolframLanguage> for WolframLexer<'config> {
28 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WolframLanguage>) -> LexOutput<WolframLanguage> {
29 let mut state = LexerState::new(source);
30 let result = self.run(&mut state);
31 if result.is_ok() {
32 state.add_eof();
33 }
34 state.finish_with_cache(result, cache)
35 }
36}
37
38impl<'config> WolframLexer<'config> {
39 pub fn new(config: &'config WolframLanguage) -> Self {
41 Self { config }
42 }
43
44 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
45 while state.not_at_end() {
46 let safe_point = state.get_position();
47
48 if self.skip_whitespace(state) {
49 continue;
50 }
51
52 if self.skip_comment(state) {
53 continue;
54 }
55
56 if self.lex_string_literal(state) {
57 continue;
58 }
59
60 if self.lex_number_literal(state) {
61 continue;
62 }
63
64 if self.lex_identifier_or_keyword(state) {
65 continue;
66 }
67
68 if self.lex_operators(state) {
69 continue;
70 }
71
72 if self.lex_single_char_tokens(state) {
73 continue;
74 }
75
76 state.advance_if_dead_lock(safe_point);
77 }
78
79 Ok(())
80 }
81
82 fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
83 if let Some(ch) = state.peek() {
85 if ch == '\n' || ch == '\r' {
86 let start = state.get_position();
87 state.advance(ch.len_utf8());
88 if ch == '\r' && state.peek() == Some('\n') {
89 state.advance(1);
90 }
91 state.add_token(WolframTokenType::Newline, start, state.get_position());
92 return true;
93 }
94 }
95
96 if WL_WHITESPACE.scan(state, WolframTokenType::Whitespace) {
97 return true;
98 }
99
100 false
101 }
102
103 fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
104 WL_COMMENT.scan(state, WolframTokenType::Comment, WolframTokenType::Comment)
105 }
106
107 fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
108 WL_STRING.scan(state, WolframTokenType::String)
109 }
110
111 fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
112 let start = state.get_position();
113 let first = match state.peek() {
114 Some(c) => c,
115 None => return false,
116 };
117
118 if !first.is_ascii_digit() {
119 return false;
120 }
121
122 let mut is_real = false;
123
124 state.advance(first.len_utf8());
126 while let Some(c) = state.peek() {
127 if c.is_ascii_digit() {
128 state.advance(1);
129 }
130 else {
131 break;
132 }
133 }
134
135 if state.peek() == Some('.') {
137 let next = state.peek_next_n(1);
138 if next.map(|c| c.is_ascii_digit()).unwrap_or(false) {
139 is_real = true;
140 state.advance(1); while let Some(c) = state.peek() {
142 if c.is_ascii_digit() {
143 state.advance(1);
144 }
145 else {
146 break;
147 }
148 }
149 }
150 }
151
152 if let Some(c) = state.peek() {
154 if c == 'e' || c == 'E' {
155 let next = state.peek_next_n(1);
156 if next == Some('+') || next == Some('-') || next.map(|d| d.is_ascii_digit()).unwrap_or(false) {
157 is_real = true;
158 state.advance(1);
159 if let Some(sign) = state.peek() {
160 if sign == '+' || sign == '-' {
161 state.advance(1);
162 }
163 }
164 while let Some(d) = state.peek() {
165 if d.is_ascii_digit() {
166 state.advance(1);
167 }
168 else {
169 break;
170 }
171 }
172 }
173 }
174 }
175
176 let end = state.get_position();
177 state.add_token(if is_real { WolframTokenType::Real } else { WolframTokenType::Integer }, start, end);
178 true
179 }
180
181 fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
182 let start = state.get_position();
183 let ch = match state.peek() {
184 Some(c) => c,
185 None => return false,
186 };
187
188 if !(ch.is_ascii_alphabetic() || ch == '$') {
189 return false;
190 }
191
192 state.advance(ch.len_utf8());
193 while let Some(c) = state.peek() {
194 if c.is_ascii_alphanumeric() || c == '$' {
195 state.advance(c.len_utf8());
196 }
197 else {
198 break;
199 }
200 }
201
202 let end = state.get_position();
203 let text = state.source().get_text_in((start..end).into());
204 let kind = match text.as_ref() {
205 "If" => WolframTokenType::If,
206 "Then" => WolframTokenType::Then,
207 "Else" => WolframTokenType::Else,
208 "While" => WolframTokenType::While,
209 "For" => WolframTokenType::For,
210 "Do" => WolframTokenType::Do,
211 "Function" => WolframTokenType::Function,
212 "Module" => WolframTokenType::Module,
213 "Block" => WolframTokenType::Block,
214 "With" => WolframTokenType::With,
215 "Table" => WolframTokenType::Table,
216 "Map" => WolframTokenType::Map,
217 "Apply" => WolframTokenType::Apply,
218 "Select" => WolframTokenType::Select,
219 "Cases" => WolframTokenType::Cases,
220 "Rule" => WolframTokenType::Rule,
221 "RuleDelayed" => WolframTokenType::RuleDelayed,
222 "Set" => WolframTokenType::Set,
223 "SetDelayed" => WolframTokenType::SetDelayed,
224 "Unset" => WolframTokenType::Unset,
225 "Clear" => WolframTokenType::Clear,
226 "ClearAll" => WolframTokenType::ClearAll,
227 "Return" => WolframTokenType::Return,
228 "Break" => WolframTokenType::Break,
229 "Continue" => WolframTokenType::Continue,
230 "True" => WolframTokenType::True,
231 "False" => WolframTokenType::False,
232 "Null" => WolframTokenType::Null,
233 "Export" => WolframTokenType::Export,
234 "Import" => WolframTokenType::Import,
235 _ => WolframTokenType::Identifier,
236 };
237 state.add_token(kind, start, end);
238 true
239 }
240
241 fn lex_operators<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
242 let start = state.get_position();
243
244 let patterns: &[(&str, WolframTokenType)] = &[
246 ("===", WolframTokenType::Equal), ("=!=", WolframTokenType::NotEqual), ("@@@", WolframTokenType::ApplyLevelOperator),
249 ("//@", WolframTokenType::MapAllOperator),
250 (":=", WolframTokenType::SetDelayed),
251 (":>", WolframTokenType::RuleDelayedOp),
252 ("->", WolframTokenType::Arrow),
253 ("=>", WolframTokenType::DoubleArrow),
254 ("/@", WolframTokenType::MapOperator),
255 ("@@", WolframTokenType::ApplyOperator),
256 ("//", WolframTokenType::SlashSlash),
257 ("@*", WolframTokenType::AtStar),
258 ("/*", WolframTokenType::StarSlash),
259 ("<>", WolframTokenType::StringJoin),
260 ("==", WolframTokenType::Equal),
261 ("!=", WolframTokenType::NotEqual),
262 ("<=", WolframTokenType::LessEqual),
263 (">=", WolframTokenType::GreaterEqual),
264 ("&&", WolframTokenType::And),
265 ("||", WolframTokenType::Or),
266 ("+=", WolframTokenType::AddTo),
267 ("-=", WolframTokenType::SubtractFrom),
268 ("*=", WolframTokenType::TimesBy),
269 ("/=", WolframTokenType::DivideBy),
270 ("!!", WolframTokenType::Factorial), ("___", WolframTokenType::TripleUnderscore),
272 ("__", WolframTokenType::DoubleUnderscore),
273 ("##", WolframTokenType::SlotSequence),
274 ];
275
276 for (pat, kind) in patterns {
277 if state.starts_with(pat) {
278 state.advance(pat.len());
279 state.add_token(*kind, start, state.get_position());
280 return true;
281 }
282 }
283
284 if let Some(ch) = state.peek() {
286 let kind = match ch {
287 '+' => Some(WolframTokenType::Plus),
288 '-' => Some(WolframTokenType::Minus),
289 '*' => Some(WolframTokenType::Times),
290 '/' => Some(WolframTokenType::Divide),
291 '^' => Some(WolframTokenType::Power),
292 '=' => Some(WolframTokenType::Assign),
293 '<' => Some(WolframTokenType::Less),
294 '>' => Some(WolframTokenType::Greater),
295 '?' => Some(WolframTokenType::Question),
296 '_' => Some(WolframTokenType::Underscore),
297 '#' => Some(WolframTokenType::Slot),
298 '.' => Some(WolframTokenType::Dot),
299 ':' => Some(WolframTokenType::Colon),
300 '@' => Some(WolframTokenType::At),
301 '&' => Some(WolframTokenType::Ampersand),
302 '!' => Some(WolframTokenType::Factorial),
303 _ => None,
304 };
305 if let Some(k) = kind {
306 state.advance(ch.len_utf8());
307 state.add_token(k, start, state.get_position());
308 return true;
309 }
310 }
311 false
312 }
313
314 fn lex_single_char_tokens<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
315 let start = state.get_position();
316 if let Some(ch) = state.peek() {
317 let kind = match ch {
318 '(' => WolframTokenType::LeftParen,
319 ')' => WolframTokenType::RightParen,
320 '[' => WolframTokenType::LeftBracket,
321 ']' => WolframTokenType::RightBracket,
322 '{' => WolframTokenType::LeftBrace,
323 '}' => WolframTokenType::RightBrace,
324 ',' => WolframTokenType::Comma,
325 ';' => WolframTokenType::Semicolon,
326 _ => {
327 state.advance(ch.len_utf8());
329 state.add_token(WolframTokenType::Error, start, state.get_position());
330 return true;
331 }
332 };
333 state.advance(ch.len_utf8());
334 state.add_token(kind, start, state.get_position());
335 true
336 }
337 else {
338 false
339 }
340 }
341}