Skip to main content

oak_r/lexer/
mod.rs

1use crate::{kind::RSyntaxKind, language::RLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, Range, lexer::LexOutput, source::Source};
3
4type State<'s, S> = LexerState<'s, S, RLanguage>;
5
6#[derive(Clone)]
7pub struct RLexer<'config> {
8    _config: &'config RLanguage,
9}
10
11impl<'config> Lexer<RLanguage> for RLexer<'config> {
12    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<RLanguage>) -> LexOutput<RLanguage> {
13        let mut state = State::new(source);
14        let result = self.run(&mut state);
15        if result.is_ok() {
16            state.add_eof();
17        }
18        state.finish_with_cache(result, cache)
19    }
20}
21
22impl<'config> RLexer<'config> {
23    pub fn new(_config: &'config RLanguage) -> Self {
24        Self { _config }
25    }
26
27    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), oak_core::OakError> {
28        while state.not_at_end() {
29            let safe_point = state.get_position();
30            if self.skip_whitespace(state) {
31                continue;
32            }
33
34            if self.lex_comment(state) {
35                continue;
36            }
37
38            if self.lex_string_literal(state) {
39                continue;
40            }
41
42            if self.lex_number_literal(state) {
43                continue;
44            }
45
46            if self.lex_identifier_or_keyword(state) {
47                continue;
48            }
49
50            if self.lex_operators(state) {
51                continue;
52            }
53
54            if self.lex_single_char_tokens(state) {
55                continue;
56            }
57
58            if self.lex_other(state) {
59                continue;
60            }
61
62            state.advance_if_dead_lock(safe_point);
63        }
64        Ok(())
65    }
66
67    /// 跳过空白符
68    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
69        if let Some(ch) = state.current() {
70            if ch.is_whitespace() {
71                state.advance(ch.len_utf8());
72                return true;
73            }
74        }
75        false
76    }
77
78    /// 处理注释
79    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
80        if let Some('#') = state.current() {
81            let start_pos = state.get_position();
82            state.advance(1); // 跳过 '#'
83
84            // 读取到行尾
85            while let Some(ch) = state.current() {
86                if ch == '\n' || ch == '\r' {
87                    break;
88                }
89                state.advance(ch.len_utf8());
90            }
91
92            state.add_token(RSyntaxKind::Comment, start_pos, state.get_position());
93            return true;
94        }
95        false
96    }
97
98    /// 处理字符串字面量
99    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
100        if let Some(quote) = state.current() {
101            if quote == '"' || quote == '\'' {
102                let start_pos = state.get_position();
103                state.advance(1); // 跳过引号
104
105                while let Some(ch) = state.current() {
106                    if ch == quote {
107                        state.advance(1); // 跳过结束引号
108                        state.add_token(RSyntaxKind::StringLiteral, start_pos, state.get_position());
109                        return true;
110                    }
111                    if ch == '\\' {
112                        state.advance(1);
113                        if let Some(escaped) = state.current() {
114                            state.advance(escaped.len_utf8());
115                            continue;
116                        }
117                    }
118                    state.advance(ch.len_utf8());
119                }
120
121                // 未闭合字符串
122                state.add_token(RSyntaxKind::StringLiteral, start_pos, state.get_position());
123                return true;
124            }
125        }
126        false
127    }
128
129    /// 处理数字字面量
130    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
131        if let Some(ch) = state.current() {
132            if ch.is_ascii_digit() || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
133                let start_pos = state.get_position();
134                let mut has_dot = false;
135
136                while let Some(c) = state.current() {
137                    if c.is_ascii_digit() {
138                        state.advance(1);
139                    }
140                    else if c == '.' && !has_dot {
141                        has_dot = true;
142                        state.advance(1);
143                    }
144                    else if (c == 'e' || c == 'E') && !state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit() || c == '+' || c == '-') {
145                        break;
146                    }
147                    else if c == 'e' || c == 'E' {
148                        state.advance(1);
149                        if let Some(next) = state.current() {
150                            if next == '+' || next == '-' {
151                                state.advance(1);
152                            }
153                        }
154                        while let Some(digit) = state.current() {
155                            if digit.is_ascii_digit() {
156                                state.advance(1);
157                            }
158                            else {
159                                break;
160                            }
161                        }
162                        break;
163                    }
164                    else if c == 'L' {
165                        state.advance(1);
166                        state.add_token(RSyntaxKind::IntegerLiteral, start_pos, state.get_position());
167                        return true;
168                    }
169                    else if c == 'i' {
170                        state.advance(1);
171                        state.add_token(RSyntaxKind::FloatLiteral, start_pos, state.get_position());
172                        return true;
173                    }
174                    else {
175                        break;
176                    }
177                }
178
179                let kind = if has_dot { RSyntaxKind::FloatLiteral } else { RSyntaxKind::IntegerLiteral };
180                state.add_token(kind, start_pos, state.get_position());
181                return true;
182            }
183        }
184        false
185    }
186
187    /// 处理标识符或关键字
188    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
189        if let Some(ch) = state.current() {
190            if ch.is_alphabetic() || ch == '.' || ch == '_' {
191                let start_pos = state.get_position();
192                state.advance(ch.len_utf8());
193
194                while let Some(c) = state.current() {
195                    if c.is_alphanumeric() || c == '.' || c == '_' {
196                        state.advance(c.len_utf8());
197                    }
198                    else {
199                        break;
200                    }
201                }
202
203                let text = state.get_text_in(Range { start: start_pos, end: state.get_position() });
204                let kind = match text.as_ref() {
205                    "if" => RSyntaxKind::If,
206                    "else" => RSyntaxKind::Else,
207                    "for" => RSyntaxKind::For,
208                    "in" => RSyntaxKind::In,
209                    "while" => RSyntaxKind::While,
210                    "repeat" => RSyntaxKind::Repeat,
211                    "next" => RSyntaxKind::Next,
212                    "break" => RSyntaxKind::Break,
213                    "function" => RSyntaxKind::Function,
214                    "TRUE" => RSyntaxKind::True,
215                    "FALSE" => RSyntaxKind::False,
216                    "NULL" => RSyntaxKind::Null,
217                    "Inf" => RSyntaxKind::Inf,
218                    "NaN" => RSyntaxKind::NaN,
219                    "NA" => RSyntaxKind::NA,
220                    "NA_integer_" => RSyntaxKind::NaInteger,
221                    "NA_real_" => RSyntaxKind::NaReal,
222                    "NA_complex_" => RSyntaxKind::NaComplex,
223                    "NA_character_" => RSyntaxKind::NaCharacter,
224                    _ => RSyntaxKind::Identifier,
225                };
226
227                state.add_token(kind, start_pos, state.get_position());
228                return true;
229            }
230        }
231        false
232    }
233
234    /// 处理操作符
235    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
236        let start_pos = state.get_position();
237        if let Some(ch) = state.current() {
238            match ch {
239                '<' => {
240                    state.advance(1);
241                    if let Some('-') = state.current() {
242                        state.advance(1);
243                        state.add_token(RSyntaxKind::LeftArrow, start_pos, state.get_position());
244                        return true;
245                    }
246                    if let Some('<') = state.current() {
247                        state.advance(1);
248                        if let Some('-') = state.current() {
249                            state.advance(1);
250                            state.add_token(RSyntaxKind::DoubleLeftArrow, start_pos, state.get_position());
251                            return true;
252                        }
253                    }
254                    if let Some('=') = state.current() {
255                        state.advance(1);
256                        state.add_token(RSyntaxKind::LessEqual, start_pos, state.get_position());
257                        return true;
258                    }
259                    state.add_token(RSyntaxKind::Less, start_pos, state.get_position());
260                    return true;
261                }
262                '-' => {
263                    state.advance(1);
264                    if let Some('>') = state.current() {
265                        state.advance(1);
266                        if let Some('>') = state.current() {
267                            state.advance(1);
268                            state.add_token(RSyntaxKind::DoubleRightArrow, start_pos, state.get_position());
269                            return true;
270                        }
271                        state.add_token(RSyntaxKind::RightArrow, start_pos, state.get_position());
272                        return true;
273                    }
274                    state.add_token(RSyntaxKind::Minus, start_pos, state.get_position());
275                    return true;
276                }
277                '=' => {
278                    state.advance(1);
279                    if let Some('=') = state.current() {
280                        state.advance(1);
281                        state.add_token(RSyntaxKind::EqualEqual, start_pos, state.get_position());
282                        return true;
283                    }
284                    state.add_token(RSyntaxKind::Equal, start_pos, state.get_position());
285                    return true;
286                }
287                '!' => {
288                    state.advance(1);
289                    if let Some('=') = state.current() {
290                        state.advance(1);
291                        state.add_token(RSyntaxKind::NotEqual, start_pos, state.get_position());
292                        return true;
293                    }
294                    state.add_token(RSyntaxKind::Not, start_pos, state.get_position());
295                    return true;
296                }
297                '>' => {
298                    state.advance(1);
299                    if let Some('=') = state.current() {
300                        state.advance(1);
301                        state.add_token(RSyntaxKind::GreaterEqual, start_pos, state.get_position());
302                        return true;
303                    }
304                    state.add_token(RSyntaxKind::Greater, start_pos, state.get_position());
305                    return true;
306                }
307                '&' => {
308                    state.advance(1);
309                    if let Some('&') = state.current() {
310                        state.advance(1);
311                        state.add_token(RSyntaxKind::AndAnd, start_pos, state.get_position());
312                        return true;
313                    }
314                    state.add_token(RSyntaxKind::And, start_pos, state.get_position());
315                    return true;
316                }
317                '|' => {
318                    state.advance(1);
319                    if let Some('|') = state.current() {
320                        state.advance(1);
321                        state.add_token(RSyntaxKind::OrOr, start_pos, state.get_position());
322                        return true;
323                    }
324                    if let Some('>') = state.current() {
325                        state.advance(1);
326                        state.add_token(RSyntaxKind::Pipe, start_pos, state.get_position());
327                        return true;
328                    }
329                    state.add_token(RSyntaxKind::Or, start_pos, state.get_position());
330                    return true;
331                }
332                '%' => {
333                    state.advance(1);
334                    while let Some(c) = state.current() {
335                        state.advance(c.len_utf8());
336                        if c == '%' {
337                            state.add_token(RSyntaxKind::Operator, start_pos, state.get_position());
338                            return true;
339                        }
340                    }
341                    // 未闭合的操作符
342                    state.add_token(RSyntaxKind::Operator, start_pos, state.get_position());
343                    return true;
344                }
345                _ => {}
346            }
347        }
348        false
349    }
350
351    /// 处理单字符标记
352    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
353        if let Some(ch) = state.current() {
354            let start_pos = state.get_position();
355            let kind = match ch {
356                '(' => Some(RSyntaxKind::LeftParen),
357                ')' => Some(RSyntaxKind::RightParen),
358                '[' => Some(RSyntaxKind::LeftBracket),
359                ']' => Some(RSyntaxKind::RightBracket),
360                '{' => Some(RSyntaxKind::LeftBrace),
361                '}' => Some(RSyntaxKind::RightBrace),
362                ',' => Some(RSyntaxKind::Comma),
363                ';' => Some(RSyntaxKind::Semicolon),
364                '+' => Some(RSyntaxKind::Plus),
365                '*' => Some(RSyntaxKind::Star),
366                '/' => Some(RSyntaxKind::Slash),
367                '^' => Some(RSyntaxKind::Caret),
368                '$' => Some(RSyntaxKind::Dollar),
369                '@' => Some(RSyntaxKind::At),
370                '~' => Some(RSyntaxKind::Tilde),
371                ':' => {
372                    state.advance(1);
373                    if let Some(':') = state.current() {
374                        state.advance(1);
375                        if let Some(':') = state.current() {
376                            state.advance(1);
377                            Some(RSyntaxKind::TripleColon)
378                        }
379                        else {
380                            Some(RSyntaxKind::DoubleColon)
381                        }
382                    }
383                    else {
384                        return {
385                            state.add_token(RSyntaxKind::Colon, start_pos, state.get_position());
386                            true
387                        };
388                    }
389                }
390                '?' => Some(RSyntaxKind::Question),
391                _ => None,
392            };
393
394            if let Some(k) = kind {
395                if !matches!(k, RSyntaxKind::TripleColon | RSyntaxKind::DoubleColon) {
396                    state.advance(1);
397                }
398                state.add_token(k, start_pos, state.get_position());
399                return true;
400            }
401        }
402        false
403    }
404
405    /// 处理其他字符
406    fn lex_other<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
407        if let Some(ch) = state.current() {
408            let start_pos = state.get_position();
409            let len = ch.len_utf8();
410            state.advance(len);
411            state.add_token(RSyntaxKind::Error, start_pos, state.get_position());
412            return true;
413        }
414        false
415    }
416}