1use crate::{kind::MatlabSyntaxKind, language::MatlabLanguage};
2
3use oak_core::{IncrementalCache, Lexer, LexerState, lexer::LexOutput, source::Source};
4
5type State<S> = LexerState<S, MatlabLanguage>;
6
/// Lexer that splits MATLAB source text into `MatlabSyntaxKind` tokens.
///
/// The lexer only borrows its language configuration, so it cannot outlive the
/// `MatlabLanguage` value it was created from.
pub struct MatlabLexer<'config> {
    /// Language configuration handed to `MatlabLexer::new`.
    config: &'config MatlabLanguage,
}
10
11impl<'config> MatlabLexer<'config> {
12 pub fn new(config: &'config MatlabLanguage) -> Self {
13 Self { config }
14 }
15
16 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
18 let start_pos = state.get_position();
19
20 while let Some(ch) = state.peek() {
21 if ch == ' ' || ch == '\t' {
22 state.advance(ch.len_utf8());
23 }
24 else {
25 break;
26 }
27 }
28
29 if state.get_position() > start_pos {
30 state.add_token(MatlabSyntaxKind::Whitespace, start_pos, state.get_position());
31 true
32 }
33 else {
34 false
35 }
36 }
37
38 fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
40 let start_pos = state.get_position();
41
42 if let Some('\n') = state.peek() {
43 state.advance(1);
44 state.add_token(MatlabSyntaxKind::Newline, start_pos, state.get_position());
45 true
46 }
47 else if let Some('\r') = state.peek() {
48 state.advance(1);
49 if let Some('\n') = state.peek() {
50 state.advance(1);
51 }
52 state.add_token(MatlabSyntaxKind::Newline, start_pos, state.get_position());
53 true
54 }
55 else {
56 false
57 }
58 }
59
60 fn lex_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
62 let start_pos = state.get_position();
63
64 if let Some(ch) = state.peek() {
65 if !ch.is_ascii_alphabetic() && ch != '_' {
66 return false;
67 }
68
69 let mut identifier = String::new();
71 while let Some(ch) = state.peek() {
72 if ch.is_ascii_alphanumeric() || ch == '_' {
73 identifier.push(ch);
74 state.advance(1);
75 }
76 else {
77 break;
78 }
79 }
80
81 let token_kind = match identifier.as_str() {
83 "function" => MatlabSyntaxKind::Function,
84 "end" => MatlabSyntaxKind::End,
85 "if" => MatlabSyntaxKind::If,
86 "else" => MatlabSyntaxKind::Else,
87 "elseif" => MatlabSyntaxKind::Elseif,
88 "while" => MatlabSyntaxKind::While,
89 "for" => MatlabSyntaxKind::For,
90 "break" => MatlabSyntaxKind::Break,
91 "continue" => MatlabSyntaxKind::Continue,
92 "return" => MatlabSyntaxKind::Return,
93 "switch" => MatlabSyntaxKind::Switch,
94 "case" => MatlabSyntaxKind::Case,
95 "otherwise" => MatlabSyntaxKind::Otherwise,
96 "try" => MatlabSyntaxKind::Try,
97 "catch" => MatlabSyntaxKind::Catch,
98 "global" => MatlabSyntaxKind::Global,
99 "persistent" => MatlabSyntaxKind::Persistent,
100 "classdef" => MatlabSyntaxKind::Classdef,
101 "properties" => MatlabSyntaxKind::Properties,
102 "methods" => MatlabSyntaxKind::Methods,
103 "events" => MatlabSyntaxKind::Events,
104 _ => MatlabSyntaxKind::Identifier,
105 };
106
107 state.add_token(token_kind, start_pos, state.get_position());
108 true
109 }
110 else {
111 false
112 }
113 }
114
115 fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
117 let start_pos = state.get_position();
118
119 if let Some(ch) = state.peek() {
120 if !ch.is_ascii_digit() {
121 return false;
122 }
123
124 while let Some(ch) = state.peek() {
126 if ch.is_ascii_digit() {
127 state.advance(ch.len_utf8());
128 }
129 else {
130 break;
131 }
132 }
133
134 if let Some('.') = state.peek() {
136 if let Some(next_ch) = state.peek_next_n(1) {
137 if next_ch.is_ascii_digit() {
138 state.advance(1); while let Some(ch) = state.peek() {
140 if ch.is_ascii_digit() {
141 state.advance(ch.len_utf8());
142 }
143 else {
144 break;
145 }
146 }
147 }
148 }
149 }
150
151 if let Some(ch) = state.peek() {
153 if ch == 'e' || ch == 'E' {
154 state.advance(1);
155 if let Some(sign) = state.peek() {
156 if sign == '+' || sign == '-' {
157 state.advance(1);
158 }
159 }
160 while let Some(ch) = state.peek() {
161 if ch.is_ascii_digit() {
162 state.advance(ch.len_utf8());
163 }
164 else {
165 break;
166 }
167 }
168 }
169 }
170
171 state.add_token(MatlabSyntaxKind::Number, start_pos, state.get_position());
172 true
173 }
174 else {
175 false
176 }
177 }
178
179 fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
181 let start_pos = state.get_position();
182
183 if let Some(quote) = state.peek() {
184 if quote != '\'' && quote != '"' {
185 return false;
186 }
187
188 state.advance(1); while let Some(ch) = state.peek() {
190 if ch == quote {
191 state.advance(1); break;
193 }
194 else if ch == '\\' {
195 state.advance(1); if state.peek().is_some() {
197 state.advance(state.peek().unwrap().len_utf8());
198 }
199 }
200 else {
201 state.advance(ch.len_utf8());
202 }
203 }
204
205 let token_kind = if quote == '\'' { MatlabSyntaxKind::Character } else { MatlabSyntaxKind::String };
206
207 state.add_token(token_kind, start_pos, state.get_position());
208 true
209 }
210 else {
211 false
212 }
213 }
214
215 fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
217 let start_pos = state.get_position();
218
219 if let Some('%') = state.peek() {
220 state.advance(1);
221
222 if let Some('{') = state.peek() {
224 state.advance(1);
225
226 while let Some(ch) = state.peek() {
228 if ch == '%' {
229 if let Some('}') = state.peek_next_n(1) {
230 state.advance(2); break;
232 }
233 }
234 state.advance(ch.len_utf8());
235 }
236
237 state.add_token(MatlabSyntaxKind::BlockComment, start_pos, state.get_position());
238 }
239 else {
240 while let Some(ch) = state.peek() {
242 if ch == '\n' || ch == '\r' {
243 break;
244 }
245 state.advance(ch.len_utf8());
246 }
247
248 state.add_token(MatlabSyntaxKind::Comment, start_pos, state.get_position());
249 }
250 true
251 }
252 else {
253 false
254 }
255 }
256
257 fn lex_operator<S: Source>(&self, state: &mut State<S>) -> bool {
259 let start_pos = state.get_position();
260
261 if let Some(ch) = state.peek() {
262 let token_kind = match ch {
263 '+' => {
264 state.advance(1);
265 MatlabSyntaxKind::Plus
266 }
267 '-' => {
268 state.advance(1);
269 MatlabSyntaxKind::Minus
270 }
271 '*' => {
272 state.advance(1);
273 MatlabSyntaxKind::Times
274 }
275 '/' => {
276 state.advance(1);
277 MatlabSyntaxKind::Divide
278 }
279 '^' => {
280 state.advance(1);
281 MatlabSyntaxKind::Power
282 }
283 '\\' => {
284 state.advance(1);
285 MatlabSyntaxKind::LeftDivide
286 }
287 '=' => {
288 state.advance(1);
289 if let Some('=') = state.peek() {
290 state.advance(1);
291 MatlabSyntaxKind::Equal
292 }
293 else {
294 MatlabSyntaxKind::Assign
295 }
296 }
297 '~' => {
298 state.advance(1);
299 if let Some('=') = state.peek() {
300 state.advance(1);
301 MatlabSyntaxKind::NotEqual
302 }
303 else {
304 MatlabSyntaxKind::Not
305 }
306 }
307 '<' => {
308 state.advance(1);
309 if let Some('=') = state.peek() {
310 state.advance(1);
311 MatlabSyntaxKind::LessEqual
312 }
313 else {
314 MatlabSyntaxKind::Less
315 }
316 }
317 '>' => {
318 state.advance(1);
319 if let Some('=') = state.peek() {
320 state.advance(1);
321 MatlabSyntaxKind::GreaterEqual
322 }
323 else {
324 MatlabSyntaxKind::Greater
325 }
326 }
327 '&' => {
328 state.advance(1);
329 if let Some('&') = state.peek() {
330 state.advance(1);
331 MatlabSyntaxKind::AndAnd
332 }
333 else {
334 MatlabSyntaxKind::And
335 }
336 }
337 '|' => {
338 state.advance(1);
339 if let Some('|') = state.peek() {
340 state.advance(1);
341 MatlabSyntaxKind::OrOr
342 }
343 else {
344 MatlabSyntaxKind::Or
345 }
346 }
347 '.' => {
348 state.advance(1);
349 if let Some(next_ch) = state.peek() {
350 match next_ch {
351 '*' => {
352 state.advance(1);
353 MatlabSyntaxKind::DotTimes
354 }
355 '/' => {
356 state.advance(1);
357 MatlabSyntaxKind::DotDivide
358 }
359 '^' => {
360 state.advance(1);
361 MatlabSyntaxKind::DotPower
362 }
363 '\\' => {
364 state.advance(1);
365 MatlabSyntaxKind::DotLeftDivide
366 }
367 '\'' => {
368 state.advance(1);
369 MatlabSyntaxKind::DotTranspose
370 }
371 _ => MatlabSyntaxKind::Dot,
372 }
373 }
374 else {
375 MatlabSyntaxKind::Dot
376 }
377 }
378 '\'' => {
379 state.advance(1);
380 MatlabSyntaxKind::Transpose
381 }
382 _ => return false,
383 };
384
385 state.add_token(token_kind, start_pos, state.get_position());
386 true
387 }
388 else {
389 false
390 }
391 }
392
393 fn lex_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
395 let start_pos = state.get_position();
396
397 if let Some(ch) = state.peek() {
398 let token_kind = match ch {
399 '(' => MatlabSyntaxKind::LeftParen,
400 ')' => MatlabSyntaxKind::RightParen,
401 '[' => MatlabSyntaxKind::LeftBracket,
402 ']' => MatlabSyntaxKind::RightBracket,
403 '{' => MatlabSyntaxKind::LeftBrace,
404 '}' => MatlabSyntaxKind::RightBrace,
405 ';' => MatlabSyntaxKind::Semicolon,
406 ',' => MatlabSyntaxKind::Comma,
407 ':' => MatlabSyntaxKind::Colon,
408 '?' => MatlabSyntaxKind::Question,
409 '@' => MatlabSyntaxKind::At,
410 _ => return false,
411 };
412
413 state.advance(ch.len_utf8());
414 state.add_token(token_kind, start_pos, state.get_position());
415 true
416 }
417 else {
418 false
419 }
420 }
421}
422
423impl<'config> Lexer<MatlabLanguage> for MatlabLexer<'config> {
424 fn lex_incremental(
425 &self,
426 source: impl Source,
427 _start_offset: usize,
428 _cache: IncrementalCache<'_, MatlabLanguage>,
429 ) -> LexOutput<MatlabLanguage> {
430 self.lex_internal(source)
431 }
432
433 fn lex(&self, source: impl Source) -> LexOutput<MatlabLanguage> {
434 self.lex_internal(source)
435 }
436}
437
438impl<'config> MatlabLexer<'config> {
439 fn lex_internal<S: Source>(&self, source: S) -> LexOutput<MatlabLanguage> {
440 let mut state = LexerState::new(source);
441
442 while state.not_at_end() {
443 if self.skip_whitespace(&mut state) {
445 continue;
446 }
447
448 if self.lex_newline(&mut state) {
450 continue;
451 }
452
453 if self.lex_comment(&mut state) {
455 continue;
456 }
457
458 if self.lex_string(&mut state) {
460 continue;
461 }
462
463 if self.lex_number(&mut state) {
465 continue;
466 }
467
468 if self.lex_identifier(&mut state) {
470 continue;
471 }
472
473 if self.lex_operator(&mut state) {
475 continue;
476 }
477
478 if self.lex_delimiter(&mut state) {
480 continue;
481 }
482
483 let start_pos = state.get_position();
485 if let Some(_ch) = state.peek() {
486 state.advance(1);
487 state.add_token(MatlabSyntaxKind::Error, start_pos, state.get_position());
488 }
489 }
490
491 state.finish(Ok(()))
492 }
493}