oak_matlab/lexer/
mod.rs

1use crate::{kind::MatlabSyntaxKind, language::MatlabLanguage};
2use oak_core::{
3    Lexer, LexerState,
4    lexer::{LexOutput, LexerCache},
5    source::{Source, TextEdit},
6};
7
8type State<'s, S> = LexerState<'s, S, MatlabLanguage>;
9
10pub struct MatlabLexer<'config> {
11    _config: &'config MatlabLanguage,
12}
13
14impl<'config> Lexer<MatlabLanguage> for MatlabLexer<'config> {
15    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<MatlabLanguage>) -> LexOutput<MatlabLanguage> {
16        let mut state: State<'_, S> = LexerState::new(source);
17        let result = self.run(&mut state);
18        if result.is_ok() {
19            state.add_eof();
20        }
21        state.finish_with_cache(result, cache)
22    }
23}
24
25impl<'config> MatlabLexer<'config> {
26    pub fn new(config: &'config MatlabLanguage) -> Self {
27        Self { _config: config }
28    }
29
30    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), oak_core::OakError> {
31        while state.not_at_end() {
32            let safe_point = state.get_position();
33
34            if self.skip_whitespace(state) {
35                continue;
36            }
37
38            if self.lex_newline(state) {
39                continue;
40            }
41
42            if self.lex_comment(state) {
43                continue;
44            }
45
46            if self.lex_string(state) {
47                continue;
48            }
49
50            if self.lex_number(state) {
51                continue;
52            }
53
54            if self.lex_identifier(state) {
55                continue;
56            }
57
58            if self.lex_operator(state) {
59                continue;
60            }
61
62            if self.lex_delimiter(state) {
63                continue;
64            }
65
66            let start_pos = state.get_position();
67            if let Some(ch) = state.peek() {
68                state.advance(ch.len_utf8());
69                state.add_token(MatlabSyntaxKind::Error, start_pos, state.get_position());
70            }
71
72            state.advance_if_dead_lock(safe_point);
73        }
74
75        Ok(())
76    }
77
78    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
79        let start_pos = state.get_position();
80        while let Some(ch) = state.peek() {
81            if ch == ' ' || ch == '\t' {
82                state.advance(ch.len_utf8());
83            }
84            else {
85                break;
86            }
87        }
88        if state.get_position() > start_pos {
89            state.add_token(MatlabSyntaxKind::Whitespace, start_pos, state.get_position());
90            true
91        }
92        else {
93            false
94        }
95    }
96
97    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
98        let start_pos = state.get_position();
99        if state.consume_if_starts_with("\n") || state.consume_if_starts_with("\r\n") || state.consume_if_starts_with("\r") {
100            state.add_token(MatlabSyntaxKind::Newline, start_pos, state.get_position());
101            true
102        }
103        else {
104            false
105        }
106    }
107
108    fn lex_identifier<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
109        let start_pos = state.get_position();
110        if let Some(ch) = state.peek() {
111            if ch.is_ascii_alphabetic() || ch == '_' {
112                state.advance(ch.len_utf8());
113                state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
114
115                let text = state.get_text_in((start_pos..state.get_position()).into());
116                let token_kind = match text.as_ref() {
117                    "function" => MatlabSyntaxKind::Function,
118                    "end" => MatlabSyntaxKind::End,
119                    "if" => MatlabSyntaxKind::If,
120                    "else" => MatlabSyntaxKind::Else,
121                    "elseif" => MatlabSyntaxKind::Elseif,
122                    "while" => MatlabSyntaxKind::While,
123                    "for" => MatlabSyntaxKind::For,
124                    "break" => MatlabSyntaxKind::Break,
125                    "continue" => MatlabSyntaxKind::Continue,
126                    "return" => MatlabSyntaxKind::Return,
127                    "switch" => MatlabSyntaxKind::Switch,
128                    "case" => MatlabSyntaxKind::Case,
129                    "otherwise" => MatlabSyntaxKind::Otherwise,
130                    "try" => MatlabSyntaxKind::Try,
131                    "catch" => MatlabSyntaxKind::Catch,
132                    "global" => MatlabSyntaxKind::Global,
133                    "persistent" => MatlabSyntaxKind::Persistent,
134                    "classdef" => MatlabSyntaxKind::Classdef,
135                    "properties" => MatlabSyntaxKind::Properties,
136                    "methods" => MatlabSyntaxKind::Methods,
137                    "events" => MatlabSyntaxKind::Events,
138                    _ => MatlabSyntaxKind::Identifier,
139                };
140
141                state.add_token(token_kind, start_pos, state.get_position());
142                return true;
143            }
144        }
145        false
146    }
147
148    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
149        let start_pos = state.get_position();
150        if let Some(ch) = state.peek() {
151            if ch.is_ascii_digit() || (ch == '.' && state.peek_next_n(1).map(|c| c.is_ascii_digit()).unwrap_or(false)) {
152                if ch == '.' {
153                    state.advance(1);
154                }
155                state.take_while(|c| c.is_ascii_digit());
156
157                if ch != '.' && state.consume_if_starts_with(".") {
158                    state.take_while(|c| c.is_ascii_digit());
159                }
160
161                if state.consume_if_starts_with("e") || state.consume_if_starts_with("E") {
162                    if let Some(sign) = state.peek() {
163                        if sign == '+' || sign == '-' {
164                            state.advance(1);
165                        }
166                    }
167                    state.take_while(|c| c.is_ascii_digit());
168                }
169
170                if state.consume_if_starts_with("i") || state.consume_if_starts_with("j") {
171                    // complex
172                }
173
174                state.add_token(MatlabSyntaxKind::Number, start_pos, state.get_position());
175                return true;
176            }
177        }
178        false
179    }
180
181    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
182        let start_pos = state.get_position();
183        if let Some(quote) = state.peek() {
184            if quote == '\'' || quote == '"' {
185                state.advance(1);
186                while let Some(ch) = state.peek() {
187                    if ch == quote {
188                        state.advance(1);
189                        if state.peek() == Some(quote) {
190                            state.advance(1);
191                            continue;
192                        }
193                        break;
194                    }
195                    else if ch == '\\' {
196                        state.advance(1);
197                        if let Some(next) = state.peek() {
198                            state.advance(next.len_utf8());
199                        }
200                    }
201                    else {
202                        state.advance(ch.len_utf8());
203                    }
204                }
205                let kind = if quote == '\'' { MatlabSyntaxKind::Character } else { MatlabSyntaxKind::String };
206                state.add_token(kind, start_pos, state.get_position());
207                return true;
208            }
209        }
210        false
211    }
212
213    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
214        let start_pos = state.get_position();
215        if state.consume_if_starts_with("%") {
216            if state.consume_if_starts_with("{") {
217                let mut depth = 1;
218                while depth > 0 && state.not_at_end() {
219                    if state.starts_with("%{") {
220                        depth += 1;
221                        state.advance(2);
222                    }
223                    else if state.starts_with("%}") {
224                        depth -= 1;
225                        state.advance(2);
226                    }
227                    else if let Some(ch) = state.current() {
228                        state.advance(ch.len_utf8());
229                    }
230                }
231                state.add_token(MatlabSyntaxKind::BlockComment, start_pos, state.get_position());
232            }
233            else {
234                state.take_while(|c| c != '\n' && c != '\r');
235                state.add_token(MatlabSyntaxKind::Comment, start_pos, state.get_position());
236            }
237            return true;
238        }
239        false
240    }
241
242    fn lex_operator<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
243        let start_pos = state.get_position();
244        let ops = [".*", "./", ".^", ".\\", "==", "~=", "<=", ">=", "&&", "||", "++", "--", ".'"];
245        for op in ops {
246            if state.consume_if_starts_with(op) {
247                state.add_token(MatlabSyntaxKind::Operator, start_pos, state.get_position());
248                return true;
249            }
250        }
251
252        if let Some(ch) = state.peek() {
253            let kind = match ch {
254                '+' | '-' | '*' | '/' | '\\' | '^' | '<' | '>' | '=' | '~' | '&' | '|' | '\'' => MatlabSyntaxKind::Operator,
255                _ => return false,
256            };
257            state.advance(1);
258            state.add_token(kind, start_pos, state.get_position());
259            return true;
260        }
261        false
262    }
263
264    fn lex_delimiter<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
265        let start_pos = state.get_position();
266        if let Some(ch) = state.peek() {
267            let kind = match ch {
268                '(' | ')' | '[' | ']' | '{' | '}' | ';' | ',' | ':' | '?' | '@' | '.' => MatlabSyntaxKind::Delimiter,
269                _ => return false,
270            };
271            state.advance(1);
272            state.add_token(kind, start_pos, state.get_position());
273            true
274        }
275        else {
276            false
277        }
278    }
279}