Skip to main content

oak_r/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3pub use token_type::RTokenType;
4
5use crate::language::RLanguage;
6use oak_core::{Lexer, LexerCache, LexerState, Range, lexer::LexOutput, source::Source};
7
8type State<'s, S> = LexerState<'s, S, RLanguage>;
9
10#[derive(Clone)]
11pub struct RLexer<'config> {
12    _config: &'config RLanguage,
13}
14
15impl<'config> Lexer<RLanguage> for RLexer<'config> {
16    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<RLanguage>) -> LexOutput<RLanguage> {
17        let mut state = State::new(source);
18        let result = self.run(&mut state);
19        if result.is_ok() {
20            state.add_eof()
21        }
22        state.finish_with_cache(result, cache)
23    }
24}
25
26impl<'config> RLexer<'config> {
27    pub fn new(_config: &'config RLanguage) -> Self {
28        Self { _config }
29    }
30
31    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), oak_core::OakError> {
32        while state.not_at_end() {
33            let safe_point = state.get_position();
34            if self.skip_whitespace(state) {
35                continue;
36            }
37
38            if self.lex_comment(state) {
39                continue;
40            }
41
42            if self.lex_string_literal(state) {
43                continue;
44            }
45
46            if self.lex_number_literal(state) {
47                continue;
48            }
49
50            if self.lex_identifier_or_keyword(state) {
51                continue;
52            }
53
54            if self.lex_operators(state) {
55                continue;
56            }
57
58            if self.lex_single_char_tokens(state) {
59                continue;
60            }
61
62            if self.lex_other(state) {
63                continue;
64            }
65
66            state.advance_if_dead_lock(safe_point)
67        }
68        Ok(())
69    }
70
71    /// 跳过空白符
72    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
73        if let Some(ch) = state.current() {
74            if ch.is_whitespace() {
75                state.advance(ch.len_utf8());
76                return true;
77            }
78        }
79        false
80    }
81
82    /// 处理注释
83    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
84        if let Some('#') = state.current() {
85            let start_pos = state.get_position();
86            state.advance(1); // 跳过 '#'
87
88            // 读取到行尾
89            while let Some(ch) = state.current() {
90                if ch == '\n' || ch == '\r' {
91                    break;
92                }
93                state.advance(ch.len_utf8())
94            }
95
96            state.add_token(RTokenType::Comment, start_pos, state.get_position());
97            return true;
98        }
99        false
100    }
101
102    /// 处理字符串字面量
103    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
104        if let Some(quote) = state.current() {
105            if quote == '"' || quote == '\'' {
106                let start_pos = state.get_position();
107                state.advance(1); // 跳过引号
108
109                while let Some(ch) = state.current() {
110                    if ch == quote {
111                        state.advance(1); // 跳过结束引号
112                        state.add_token(RTokenType::StringLiteral, start_pos, state.get_position());
113                        return true;
114                    }
115                    if ch == '\\' {
116                        state.advance(1);
117                        if let Some(escaped) = state.current() {
118                            state.advance(escaped.len_utf8());
119                            continue;
120                        }
121                    }
122                    state.advance(ch.len_utf8())
123                }
124
125                // 未闭合字符串
126                state.add_token(RTokenType::StringLiteral, start_pos, state.get_position());
127                return true;
128            }
129        }
130        false
131    }
132
133    /// 处理数字字面量
134    fn lex_number_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
135        if let Some(ch) = state.current() {
136            if ch.is_ascii_digit() || (ch == '.' && state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit())) {
137                let start_pos = state.get_position();
138                let mut has_dot = false;
139
140                while let Some(c) = state.current() {
141                    if c.is_ascii_digit() {
142                        state.advance(1)
143                    }
144                    else if c == '.' && !has_dot {
145                        has_dot = true;
146                        state.advance(1)
147                    }
148                    else if (c == 'e' || c == 'E') && !state.peek_next_n(1).map_or(false, |c| c.is_ascii_digit() || c == '+' || c == '-') {
149                        break;
150                    }
151                    else if c == 'e' || c == 'E' {
152                        state.advance(1);
153                        if let Some(next) = state.current() {
154                            if next == '+' || next == '-' {
155                                state.advance(1)
156                            }
157                        }
158                        while let Some(digit) = state.current() {
159                            if digit.is_ascii_digit() { state.advance(1) } else { break }
160                        }
161                        break;
162                    }
163                    else if c == 'L' {
164                        state.advance(1);
165                        state.add_token(RTokenType::IntegerLiteral, start_pos, state.get_position());
166                        return true;
167                    }
168                    else if c == 'i' {
169                        state.advance(1);
170                        state.add_token(RTokenType::FloatLiteral, start_pos, state.get_position());
171                        return true;
172                    }
173                    else {
174                        break;
175                    }
176                }
177
178                let kind = if has_dot { RTokenType::FloatLiteral } else { RTokenType::IntegerLiteral };
179                state.add_token(kind, start_pos, state.get_position());
180                return true;
181            }
182        }
183        false
184    }
185
186    /// 处理标识符或关键字
187    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
188        if let Some(ch) = state.current() {
189            if ch.is_alphabetic() || ch == '.' || ch == '_' {
190                let start_pos = state.get_position();
191                state.advance(ch.len_utf8());
192
193                while let Some(c) = state.current() {
194                    if c.is_alphanumeric() || c == '.' || c == '_' { state.advance(c.len_utf8()) } else { break }
195                }
196
197                let text = state.get_text_in(Range { start: start_pos, end: state.get_position() });
198                let kind = match text.as_ref() {
199                    "if" => RTokenType::If,
200                    "else" => RTokenType::Else,
201                    "for" => RTokenType::For,
202                    "in" => RTokenType::In,
203                    "while" => RTokenType::While,
204                    "repeat" => RTokenType::Repeat,
205                    "next" => RTokenType::Next,
206                    "break" => RTokenType::Break,
207                    "function" => RTokenType::Function,
208                    "TRUE" => RTokenType::True,
209                    "FALSE" => RTokenType::False,
210                    "NULL" => RTokenType::Null,
211                    "Inf" => RTokenType::Inf,
212                    "NaN" => RTokenType::NaN,
213                    "NA" => RTokenType::NA,
214                    "NA_integer_" => RTokenType::NaInteger,
215                    "NA_real_" => RTokenType::NaReal,
216                    "NA_complex_" => RTokenType::NaComplex,
217                    "NA_character_" => RTokenType::NaCharacter,
218                    _ => RTokenType::Identifier,
219                };
220
221                state.add_token(kind, start_pos, state.get_position());
222                return true;
223            }
224        }
225        false
226    }
227
228    /// 处理操作符
229    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
230        let start_pos = state.get_position();
231        if let Some(ch) = state.current() {
232            match ch {
233                '<' => {
234                    state.advance(1);
235                    if let Some('-') = state.current() {
236                        state.advance(1);
237                        state.add_token(RTokenType::LeftArrow, start_pos, state.get_position());
238                        return true;
239                    }
240                    if let Some('<') = state.current() {
241                        state.advance(1);
242                        if let Some('-') = state.current() {
243                            state.advance(1);
244                            state.add_token(RTokenType::DoubleLeftArrow, start_pos, state.get_position());
245                            return true;
246                        }
247                    }
248                    if let Some('=') = state.current() {
249                        state.advance(1);
250                        state.add_token(RTokenType::LessEqual, start_pos, state.get_position());
251                        return true;
252                    }
253                    state.add_token(RTokenType::Less, start_pos, state.get_position());
254                    return true;
255                }
256                '-' => {
257                    state.advance(1);
258                    if let Some('>') = state.current() {
259                        state.advance(1);
260                        if let Some('>') = state.current() {
261                            state.advance(1);
262                            state.add_token(RTokenType::DoubleRightArrow, start_pos, state.get_position());
263                            return true;
264                        }
265                        state.add_token(RTokenType::RightArrow, start_pos, state.get_position());
266                        return true;
267                    }
268                    state.add_token(RTokenType::Minus, start_pos, state.get_position());
269                    return true;
270                }
271                '=' => {
272                    state.advance(1);
273                    if let Some('=') = state.current() {
274                        state.advance(1);
275                        state.add_token(RTokenType::EqualEqual, start_pos, state.get_position());
276                        return true;
277                    }
278                    state.add_token(RTokenType::Equal, start_pos, state.get_position());
279                    return true;
280                }
281                '!' => {
282                    state.advance(1);
283                    if let Some('=') = state.current() {
284                        state.advance(1);
285                        state.add_token(RTokenType::NotEqual, start_pos, state.get_position());
286                        return true;
287                    }
288                    state.add_token(RTokenType::Not, start_pos, state.get_position());
289                    return true;
290                }
291                '>' => {
292                    state.advance(1);
293                    if let Some('=') = state.current() {
294                        state.advance(1);
295                        state.add_token(RTokenType::GreaterEqual, start_pos, state.get_position());
296                        return true;
297                    }
298                    state.add_token(RTokenType::Greater, start_pos, state.get_position());
299                    return true;
300                }
301                '&' => {
302                    state.advance(1);
303                    if let Some('&') = state.current() {
304                        state.advance(1);
305                        state.add_token(RTokenType::AndAnd, start_pos, state.get_position());
306                        return true;
307                    }
308                    state.add_token(RTokenType::And, start_pos, state.get_position());
309                    return true;
310                }
311                '|' => {
312                    state.advance(1);
313                    if let Some('|') = state.current() {
314                        state.advance(1);
315                        state.add_token(RTokenType::OrOr, start_pos, state.get_position());
316                        return true;
317                    }
318                    if let Some('>') = state.current() {
319                        state.advance(1);
320                        state.add_token(RTokenType::Pipe, start_pos, state.get_position());
321                        return true;
322                    }
323                    state.add_token(RTokenType::Or, start_pos, state.get_position());
324                    return true;
325                }
326                '%' => {
327                    state.advance(1);
328                    while let Some(c) = state.current() {
329                        state.advance(c.len_utf8());
330                        if c == '%' {
331                            state.add_token(RTokenType::Operator, start_pos, state.get_position());
332                            return true;
333                        }
334                    }
335                    // 未闭合的操作符
336                    state.add_token(RTokenType::Operator, start_pos, state.get_position());
337                    return true;
338                }
339                _ => {}
340            }
341        }
342        false
343    }
344
345    /// 处理单字符标记
346    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
347        if let Some(ch) = state.current() {
348            let start_pos = state.get_position();
349            let kind = match ch {
350                '(' => Some(RTokenType::LeftParen),
351                ')' => Some(RTokenType::RightParen),
352                '[' => Some(RTokenType::LeftBracket),
353                ']' => Some(RTokenType::RightBracket),
354                '{' => Some(RTokenType::LeftBrace),
355                '}' => Some(RTokenType::RightBrace),
356                ',' => Some(RTokenType::Comma),
357                ';' => Some(RTokenType::Semicolon),
358                '+' => Some(RTokenType::Plus),
359                '*' => Some(RTokenType::Star),
360                '/' => Some(RTokenType::Slash),
361                '^' => Some(RTokenType::Caret),
362                '$' => Some(RTokenType::Dollar),
363                '@' => Some(RTokenType::At),
364                '~' => Some(RTokenType::Tilde),
365                ':' => {
366                    state.advance(1);
367                    if let Some(':') = state.current() {
368                        state.advance(1);
369                        if let Some(':') = state.current() {
370                            state.advance(1);
371                            Some(RTokenType::TripleColon)
372                        }
373                        else {
374                            Some(RTokenType::DoubleColon)
375                        }
376                    }
377                    else {
378                        return {
379                            state.add_token(RTokenType::Colon, start_pos, state.get_position());
380                            true
381                        };
382                    }
383                }
384                '?' => Some(RTokenType::Question),
385                _ => None,
386            };
387
388            if let Some(k) = kind {
389                if !matches!(k, RTokenType::TripleColon | RTokenType::DoubleColon) {
390                    state.advance(1);
391                }
392                state.add_token(k, start_pos, state.get_position());
393                return true;
394            }
395        }
396        false
397    }
398
399    /// 处理其他字符
400    fn lex_other<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
401        if let Some(ch) = state.current() {
402            let start_pos = state.get_position();
403            let len = ch.len_utf8();
404            state.advance(len);
405            state.add_token(RTokenType::Error, start_pos, state.get_position());
406            return true;
407        }
408        false
409    }
410}