1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::WolframLanguage, lexer::token_type::WolframTokenType};
5use oak_core::{
6 Lexer, LexerCache, LexerState, OakError,
7 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8 source::{Source, TextEdit},
9};
10use std::sync::LazyLock;
11
/// Lexer state specialized to the Wolfram language.
type State<'a, S> = LexerState<'a, S, WolframLanguage>;

/// Shared whitespace scanner; Unicode whitespace characters count as whitespace.
static WL_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
/// Shared comment scanner: only `(* ... *)` block comments, which may nest; no line comments (empty `line_marker`).
static WL_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "", block_start: "(*", block_end: "*)", nested_blocks: true });
/// Shared string scanner: double-quoted literals with backslash escapes.
static WL_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
17
/// Tokenizer for Wolfram Language source text.
///
/// Borrows the language configuration for its lifetime; the configuration is
/// not consulted anywhere in this file yet (hence the `_config` name).
#[derive(Clone, Debug)]
pub struct WolframLexer<'config> {
    _config: &'config WolframLanguage,
}
22
23impl<'config> Lexer<WolframLanguage> for WolframLexer<'config> {
24 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<WolframLanguage>) -> LexOutput<WolframLanguage> {
25 let mut state = LexerState::new(source);
26 let result = self.run(&mut state);
27 if result.is_ok() {
28 state.add_eof();
29 }
30 state.finish_with_cache(result, cache)
31 }
32}
33
impl<'config> WolframLexer<'config> {
    /// Creates a lexer that borrows the given language configuration.
    pub fn new(config: &'config WolframLanguage) -> Self {
        Self { _config: config }
    }
38
39 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
40 while state.not_at_end() {
41 let safe_point = state.get_position();
42
43 if self.skip_whitespace(state) {
44 continue;
45 }
46
47 if self.skip_comment(state) {
48 continue;
49 }
50
51 if self.lex_string_literal(state) {
52 continue;
53 }
54
55 if self.lex_number_literal(state) {
56 continue;
57 }
58
59 if self.lex_identifier_or_keyword(state) {
60 continue;
61 }
62
63 if self.lex_operators(state) {
64 continue;
65 }
66
67 if self.lex_single_char_tokens(state) {
68 continue;
69 }
70
71 state.advance_if_dead_lock(safe_point);
72 }
73
74 Ok(())
75 }
76
77 fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
78 if let Some(ch) = state.peek() {
80 if ch == '\n' || ch == '\r' {
81 let start = state.get_position();
82 state.advance(ch.len_utf8());
83 if ch == '\r' && state.peek() == Some('\n') {
84 state.advance(1);
85 }
86 state.add_token(WolframTokenType::Newline, start, state.get_position());
87 return true;
88 }
89 }
90
91 if WL_WHITESPACE.scan(state, WolframTokenType::Whitespace) {
92 return true;
93 }
94
95 false
96 }
97
    /// Skips a `(* ... *)` comment (nesting allowed), emitting a Comment token.
    fn skip_comment<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        WL_COMMENT.scan(state, WolframTokenType::Comment, WolframTokenType::Comment)
    }

    /// Lexes a double-quoted string literal, emitting a String token.
    fn lex_string_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
        WL_STRING.scan(state, WolframTokenType::String)
    }
105
106 fn lex_number_literal<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
107 let start = state.get_position();
108 let first = match state.peek() {
109 Some(c) => c,
110 None => return false,
111 };
112
113 if !first.is_ascii_digit() {
114 return false;
115 }
116
117 let mut is_real = false;
118
119 state.advance(first.len_utf8());
121 while let Some(c) = state.peek() {
122 if c.is_ascii_digit() {
123 state.advance(1);
124 }
125 else {
126 break;
127 }
128 }
129
130 if state.peek() == Some('.') {
132 let next = state.peek_next_n(1);
133 if next.map(|c| c.is_ascii_digit()).unwrap_or(false) {
134 is_real = true;
135 state.advance(1); while let Some(c) = state.peek() {
137 if c.is_ascii_digit() {
138 state.advance(1);
139 }
140 else {
141 break;
142 }
143 }
144 }
145 }
146
147 if let Some(c) = state.peek() {
149 if c == 'e' || c == 'E' {
150 let next = state.peek_next_n(1);
151 if next == Some('+') || next == Some('-') || next.map(|d| d.is_ascii_digit()).unwrap_or(false) {
152 is_real = true;
153 state.advance(1);
154 if let Some(sign) = state.peek() {
155 if sign == '+' || sign == '-' {
156 state.advance(1);
157 }
158 }
159 while let Some(d) = state.peek() {
160 if d.is_ascii_digit() {
161 state.advance(1);
162 }
163 else {
164 break;
165 }
166 }
167 }
168 }
169 }
170
171 let end = state.get_position();
172 state.add_token(if is_real { WolframTokenType::Real } else { WolframTokenType::Integer }, start, end);
173 true
174 }
175
176 fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
177 let start = state.get_position();
178 let ch = match state.peek() {
179 Some(c) => c,
180 None => return false,
181 };
182
183 if !(ch.is_ascii_alphabetic() || ch == '$') {
184 return false;
185 }
186
187 state.advance(ch.len_utf8());
188 while let Some(c) = state.peek() {
189 if c.is_ascii_alphanumeric() || c == '$' {
190 state.advance(c.len_utf8());
191 }
192 else {
193 break;
194 }
195 }
196
197 let end = state.get_position();
198 let text = state.source().get_text_in((start..end).into());
199 let kind = match text.as_ref() {
200 "If" => WolframTokenType::If,
201 "Then" => WolframTokenType::Then,
202 "Else" => WolframTokenType::Else,
203 "While" => WolframTokenType::While,
204 "For" => WolframTokenType::For,
205 "Do" => WolframTokenType::Do,
206 "Function" => WolframTokenType::Function,
207 "Module" => WolframTokenType::Module,
208 "Block" => WolframTokenType::Block,
209 "With" => WolframTokenType::With,
210 "Table" => WolframTokenType::Table,
211 "Map" => WolframTokenType::Map,
212 "Apply" => WolframTokenType::Apply,
213 "Select" => WolframTokenType::Select,
214 "Cases" => WolframTokenType::Cases,
215 "Rule" => WolframTokenType::Rule,
216 "RuleDelayed" => WolframTokenType::RuleDelayed,
217 "Set" => WolframTokenType::Set,
218 "SetDelayed" => WolframTokenType::SetDelayed,
219 "Unset" => WolframTokenType::Unset,
220 "Clear" => WolframTokenType::Clear,
221 "ClearAll" => WolframTokenType::ClearAll,
222 "Return" => WolframTokenType::Return,
223 "Break" => WolframTokenType::Break,
224 "Continue" => WolframTokenType::Continue,
225 "True" => WolframTokenType::True,
226 "False" => WolframTokenType::False,
227 "Null" => WolframTokenType::Null,
228 "Export" => WolframTokenType::Export,
229 "Import" => WolframTokenType::Import,
230 _ => WolframTokenType::Identifier,
231 };
232 state.add_token(kind, start, end);
233 true
234 }
235
236 fn lex_operators<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
237 let start = state.get_position();
238
239 let patterns: &[(&str, WolframTokenType)] = &[
241 ("===", WolframTokenType::Equal), ("=!=", WolframTokenType::NotEqual), ("@@@", WolframTokenType::ApplyLevelOperator),
244 ("//@", WolframTokenType::MapAllOperator),
245 (":=", WolframTokenType::SetDelayed),
246 (":>", WolframTokenType::RuleDelayedOp),
247 ("->", WolframTokenType::Arrow),
248 ("=>", WolframTokenType::DoubleArrow),
249 ("/@", WolframTokenType::MapOperator),
250 ("@@", WolframTokenType::ApplyOperator),
251 ("//", WolframTokenType::SlashSlash),
252 ("@*", WolframTokenType::AtStar),
253 ("/*", WolframTokenType::StarSlash),
254 ("<>", WolframTokenType::StringJoin),
255 ("==", WolframTokenType::Equal),
256 ("!=", WolframTokenType::NotEqual),
257 ("<=", WolframTokenType::LessEqual),
258 (">=", WolframTokenType::GreaterEqual),
259 ("&&", WolframTokenType::And),
260 ("||", WolframTokenType::Or),
261 ("+=", WolframTokenType::AddTo),
262 ("-=", WolframTokenType::SubtractFrom),
263 ("*=", WolframTokenType::TimesBy),
264 ("/=", WolframTokenType::DivideBy),
265 ("!!", WolframTokenType::Factorial), ("___", WolframTokenType::TripleUnderscore),
267 ("__", WolframTokenType::DoubleUnderscore),
268 ("##", WolframTokenType::SlotSequence),
269 ];
270
271 for (pat, kind) in patterns {
272 if state.starts_with(pat) {
273 state.advance(pat.len());
274 state.add_token(*kind, start, state.get_position());
275 return true;
276 }
277 }
278
279 if let Some(ch) = state.peek() {
281 let kind = match ch {
282 '+' => Some(WolframTokenType::Plus),
283 '-' => Some(WolframTokenType::Minus),
284 '*' => Some(WolframTokenType::Times),
285 '/' => Some(WolframTokenType::Divide),
286 '^' => Some(WolframTokenType::Power),
287 '=' => Some(WolframTokenType::Assign),
288 '<' => Some(WolframTokenType::Less),
289 '>' => Some(WolframTokenType::Greater),
290 '?' => Some(WolframTokenType::Question),
291 '_' => Some(WolframTokenType::Underscore),
292 '#' => Some(WolframTokenType::Slot),
293 '.' => Some(WolframTokenType::Dot),
294 ':' => Some(WolframTokenType::Colon),
295 '@' => Some(WolframTokenType::At),
296 '&' => Some(WolframTokenType::Ampersand),
297 '!' => Some(WolframTokenType::Factorial),
298 _ => None,
299 };
300 if let Some(k) = kind {
301 state.advance(ch.len_utf8());
302 state.add_token(k, start, state.get_position());
303 return true;
304 }
305 }
306 false
307 }
308
309 fn lex_single_char_tokens<S: Source + ?Sized>(&self, state: &mut State<S>) -> bool {
310 let start = state.get_position();
311 if let Some(ch) = state.peek() {
312 let kind = match ch {
313 '(' => WolframTokenType::LeftParen,
314 ')' => WolframTokenType::RightParen,
315 '[' => WolframTokenType::LeftBracket,
316 ']' => WolframTokenType::RightBracket,
317 '{' => WolframTokenType::LeftBrace,
318 '}' => WolframTokenType::RightBrace,
319 ',' => WolframTokenType::Comma,
320 ';' => WolframTokenType::Semicolon,
321 _ => {
322 state.advance(ch.len_utf8());
324 state.add_token(WolframTokenType::Error, start, state.get_position());
325 return true;
326 }
327 };
328 state.advance(ch.len_utf8());
329 state.add_token(kind, start, state.get_position());
330 true
331 }
332 else {
333 false
334 }
335 }
336}