Skip to main content

oak_matlab/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token types for the Matlab language.
3pub mod token_type;
4
5use crate::{language::MatlabLanguage, lexer::token_type::MatlabTokenType};
6use oak_core::{
7    Lexer, LexerState,
8    lexer::{LexOutput, LexerCache},
9    source::{Source, TextEdit},
10};
11
12type State<'s, S> = LexerState<'s, S, MatlabLanguage>;
13
14/// Lexer for the Matlab language.
15#[derive(Clone)]
16pub struct MatlabLexer<'config> {
17    config: &'config MatlabLanguage,
18}
19
20impl<'config> Lexer<MatlabLanguage> for MatlabLexer<'config> {
21    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<MatlabLanguage>) -> LexOutput<MatlabLanguage> {
22        let mut state: State<'_, S> = LexerState::new(source);
23        let result = self.run(&mut state);
24        if result.is_ok() {
25            state.add_eof();
26        }
27        state.finish_with_cache(result, cache)
28    }
29}
30
31impl<'config> MatlabLexer<'config> {
32    /// Creates a new `MatlabLexer` with the given configuration.
33    pub fn new(config: &'config MatlabLanguage) -> Self {
34        Self { config }
35    }
36
37    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), oak_core::OakError> {
38        while state.not_at_end() {
39            let safe_point = state.get_position();
40
41            if self.skip_whitespace(state) {
42                continue;
43            }
44
45            if self.lex_newline(state) {
46                continue;
47            }
48
49            if self.lex_comment(state) {
50                continue;
51            }
52
53            if self.lex_string(state) {
54                continue;
55            }
56
57            if self.lex_number(state) {
58                continue;
59            }
60
61            if self.lex_identifier(state) {
62                continue;
63            }
64
65            if self.lex_operator(state) {
66                continue;
67            }
68
69            if self.lex_delimiter(state) {
70                continue;
71            }
72
73            let start_pos = state.get_position();
74            if let Some(ch) = state.peek() {
75                state.advance(ch.len_utf8());
76                state.add_token(MatlabTokenType::Error, start_pos, state.get_position());
77            }
78
79            state.advance_if_dead_lock(safe_point);
80        }
81
82        Ok(())
83    }
84
85    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
86        let start_pos = state.get_position();
87        while let Some(ch) = state.peek() {
88            if ch == ' ' || ch == '\t' {
89                state.advance(ch.len_utf8());
90            }
91            else {
92                break;
93            }
94        }
95        if state.get_position() > start_pos {
96            state.add_token(MatlabTokenType::Whitespace, start_pos, state.get_position());
97            true
98        }
99        else {
100            false
101        }
102    }
103
104    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
105        let start_pos = state.get_position();
106        if state.consume_if_starts_with("\n") || state.consume_if_starts_with("\r\n") || state.consume_if_starts_with("\r") {
107            state.add_token(MatlabTokenType::Newline, start_pos, state.get_position());
108            true
109        }
110        else {
111            false
112        }
113    }
114
115    fn lex_identifier<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
116        let start_pos = state.get_position();
117        if let Some(ch) = state.peek() {
118            if ch.is_ascii_alphabetic() || ch == '_' {
119                state.advance(ch.len_utf8());
120                state.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
121
122                let text = state.get_text_in((start_pos..state.get_position()).into());
123                let token_kind = match text.as_ref() {
124                    "function" => MatlabTokenType::Function,
125                    "end" => MatlabTokenType::End,
126                    "if" => MatlabTokenType::If,
127                    "else" => MatlabTokenType::Else,
128                    "elseif" => MatlabTokenType::Elseif,
129                    "while" => MatlabTokenType::While,
130                    "for" => MatlabTokenType::For,
131                    "break" => MatlabTokenType::Break,
132                    "continue" => MatlabTokenType::Continue,
133                    "return" => MatlabTokenType::Return,
134                    "switch" => MatlabTokenType::Switch,
135                    "case" => MatlabTokenType::Case,
136                    "otherwise" => MatlabTokenType::Otherwise,
137                    "try" => MatlabTokenType::Try,
138                    "catch" => MatlabTokenType::Catch,
139                    "global" => MatlabTokenType::Global,
140                    "persistent" => MatlabTokenType::Persistent,
141                    "classdef" => MatlabTokenType::Classdef,
142                    "properties" => MatlabTokenType::Properties,
143                    "methods" => MatlabTokenType::Methods,
144                    "events" => MatlabTokenType::Events,
145                    _ => MatlabTokenType::Identifier,
146                };
147
148                state.add_token(token_kind, start_pos, state.get_position());
149                return true;
150            }
151        }
152        false
153    }
154
155    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
156        let start_pos = state.get_position();
157        if let Some(ch) = state.peek() {
158            if ch.is_ascii_digit() || (ch == '.' && state.peek_next_n(1).map(|c| c.is_ascii_digit()).unwrap_or(false)) {
159                if ch == '.' {
160                    state.advance(1);
161                }
162                state.take_while(|c| c.is_ascii_digit());
163
164                if ch != '.' && state.consume_if_starts_with(".") {
165                    state.take_while(|c| c.is_ascii_digit());
166                }
167
168                if state.consume_if_starts_with("e") || state.consume_if_starts_with("E") {
169                    if let Some(sign) = state.peek() {
170                        if sign == '+' || sign == '-' {
171                            state.advance(1);
172                        }
173                    }
174                    state.take_while(|c| c.is_ascii_digit());
175                }
176
177                if state.consume_if_starts_with("i") || state.consume_if_starts_with("j") {
178                    // complex
179                }
180
181                state.add_token(MatlabTokenType::Number, start_pos, state.get_position());
182                return true;
183            }
184        }
185        false
186    }
187
188    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
189        let start_pos = state.get_position();
190        if let Some(quote) = state.peek() {
191            if quote == '\'' || quote == '"' {
192                state.advance(1);
193                while let Some(ch) = state.peek() {
194                    if ch == quote {
195                        state.advance(1);
196                        if state.peek() == Some(quote) {
197                            state.advance(1);
198                            continue;
199                        }
200                        break;
201                    }
202                    else if ch == '\\' {
203                        state.advance(1);
204                        if let Some(next) = state.peek() {
205                            state.advance(next.len_utf8());
206                        }
207                    }
208                    else {
209                        state.advance(ch.len_utf8());
210                    }
211                }
212                let kind = if quote == '\'' { MatlabTokenType::Character } else { MatlabTokenType::String };
213                state.add_token(kind, start_pos, state.get_position());
214                return true;
215            }
216        }
217        false
218    }
219
220    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
221        let start_pos = state.get_position();
222        if state.consume_if_starts_with("%") {
223            if state.consume_if_starts_with("{") {
224                let mut depth = 1;
225                while depth > 0 && state.not_at_end() {
226                    if state.starts_with("%{") {
227                        depth += 1;
228                        state.advance(2);
229                    }
230                    else if state.starts_with("%}") {
231                        depth -= 1;
232                        state.advance(2);
233                    }
234                    else if let Some(ch) = state.current() {
235                        state.advance(ch.len_utf8());
236                    }
237                }
238                state.add_token(MatlabTokenType::BlockComment, start_pos, state.get_position());
239            }
240            else {
241                state.take_while(|c| c != '\n' && c != '\r');
242                state.add_token(MatlabTokenType::Comment, start_pos, state.get_position());
243            }
244            return true;
245        }
246        false
247    }
248
249    fn lex_operator<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
250        let start_pos = state.get_position();
251        let ops = [".*", "./", ".^", ".\\", "==", "~=", "<=", ">=", "&&", "||", "++", "--", ".'"];
252        for op in ops {
253            if state.consume_if_starts_with(op) {
254                state.add_token(MatlabTokenType::Operator, start_pos, state.get_position());
255                return true;
256            }
257        }
258
259        if let Some(ch) = state.peek() {
260            let kind = match ch {
261                '+' | '-' | '*' | '/' | '\\' | '^' | '<' | '>' | '=' | '~' | '&' | '|' | '\'' => MatlabTokenType::Operator,
262                _ => return false,
263            };
264            state.advance(1);
265            state.add_token(kind, start_pos, state.get_position());
266            return true;
267        }
268        false
269    }
270
271    fn lex_delimiter<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
272        let start_pos = state.get_position();
273        if let Some(ch) = state.peek() {
274            let kind = match ch {
275                '(' | ')' | '[' | ']' | '{' | '}' | ';' | ',' | ':' | '?' | '@' | '.' => MatlabTokenType::Delimiter,
276                _ => return false,
277            };
278            state.advance(1);
279            state.add_token(kind, start_pos, state.get_position());
280            true
281        }
282        else {
283            false
284        }
285    }
286}