Skip to main content

oak_rhombus/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2use crate::{language::RhombusLanguage, lexer::token_type::RhombusTokenType};
3/// Token type definitions for Rhombus lexer.
4pub mod token_type;
5use oak_core::{
6    Lexer, LexerCache, LexerState, OakError, Source, TextEdit,
7    lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8};
9use std::sync::LazyLock;
10
11pub(crate) type State<'a, S> = LexerState<'a, S, RhombusLanguage>;
12
13static RHOMBUS_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
14static RHOMBUS_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
15static RHOMBUS_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
16
17/// Lexer for Rhombus source code.
18#[derive(Clone, Debug)]
19pub struct RhombusLexer<'config> {
20    config: &'config RhombusLanguage,
21}
22
23impl<'config> Lexer<RhombusLanguage> for RhombusLexer<'config> {
24    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<RhombusLanguage>) -> LexOutput<RhombusLanguage> {
25        let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
26        let result = self.run(&mut state);
27        state.finish_with_cache(result, cache)
28    }
29}
30
31impl<'config> RhombusLexer<'config> {
32    /// Creates a new RhombusLexer with the given language configuration.
33    pub fn new(config: &'config RhombusLanguage) -> Self {
34        Self { config }
35    }
36
37    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
38        while state.not_at_end() {
39            let safe_point = state.get_position();
40
41            if self.skip_whitespace(state) {
42                continue;
43            }
44
45            if self.lex_newline(state) {
46                continue;
47            }
48
49            if self.skip_comment(state) {
50                continue;
51            }
52
53            if self.lex_string_literal(state) {
54                continue;
55            }
56
57            if self.lex_number_literal(state) {
58                continue;
59            }
60
61            if self.lex_identifier_or_keyword(state) {
62                continue;
63            }
64
65            if self.lex_single_char_tokens(state) {
66                continue;
67            }
68
69            // Error handling: if no rules match, skip the current character and mark as error
70            let start_pos = state.get_position();
71            if let Some(ch) = state.peek() {
72                state.advance(ch.len_utf8());
73                state.add_token(RhombusTokenType::Error, start_pos, state.get_position());
74            }
75
76            state.advance_if_dead_lock(safe_point)
77        }
78
79        // Add EOF token
80        state.add_eof();
81        Ok(())
82    }
83
84    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
85        RHOMBUS_WHITESPACE.scan(state, RhombusTokenType::Whitespace)
86    }
87
88    /// Handles newlines
89    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
90        let start_pos = state.get_position();
91
92        if let Some('\n') = state.peek() {
93            state.advance(1);
94            state.add_token(RhombusTokenType::Newline, start_pos, state.get_position());
95            true
96        }
97        else if let Some('\r') = state.peek() {
98            state.advance(1);
99            if let Some('\n') = state.peek() {
100                state.advance(1);
101            }
102            state.add_token(RhombusTokenType::Newline, start_pos, state.get_position());
103            true
104        }
105        else {
106            false
107        }
108    }
109
110    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
111        RHOMBUS_COMMENT.scan(state, RhombusTokenType::LineComment, RhombusTokenType::Comment)
112    }
113
114    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
115        RHOMBUS_STRING.scan(state, RhombusTokenType::StringLiteral)
116    }
117
118    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
119        let start = state.get_position();
120        let mut len = 0;
121        let mut has_digits = false;
122
123        {
124            let rest = state.rest();
125            if rest.is_empty() {
126                return false;
127            }
128
129            let first_char = rest.chars().next().unwrap();
130            if !first_char.is_ascii_digit() {
131                return false;
132            }
133
134            let mut chars = rest.chars();
135            while let Some(ch) = chars.next() {
136                if ch.is_ascii_digit() || ch == '.' || ch == '_' {
137                    len += ch.len_utf8();
138                    if ch.is_ascii_digit() {
139                        has_digits = true;
140                    }
141                }
142                else {
143                    break;
144                }
145            }
146        }
147
148        if has_digits {
149            state.advance(len);
150            let end = state.get_position();
151            state.add_token(RhombusTokenType::NumberLiteral, start, end);
152            true
153        }
154        else {
155            false
156        }
157    }
158
159    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
160        let start = state.get_position();
161        let mut len;
162
163        {
164            let rest = state.rest();
165            if rest.is_empty() {
166                return false;
167            }
168
169            let first_char = rest.chars().next().unwrap();
170            if !self.is_identifier_start(first_char) {
171                return false;
172            }
173
174            len = first_char.len_utf8();
175            let mut chars = rest.chars().skip(1);
176
177            while let Some(ch) = chars.next() {
178                if self.is_identifier_continue(ch) {
179                    len += ch.len_utf8();
180                }
181                else {
182                    break;
183                }
184            }
185        }
186
187        let text = state.get_text_in(oak_core::Range { start, end: start + len }).to_string();
188        state.advance(len);
189        let end = state.get_position();
190
191        let kind = match text.as_str() {
192            "fun" => RhombusTokenType::Fun,
193            "val" => RhombusTokenType::Val,
194            "var" => RhombusTokenType::Var,
195            "let" => RhombusTokenType::Let,
196            "if" => RhombusTokenType::If,
197            "else" => RhombusTokenType::Else,
198            "match" => RhombusTokenType::Match,
199            "case" => RhombusTokenType::Case,
200            "block" => RhombusTokenType::Block,
201            "module" => RhombusTokenType::Module,
202            "import" => RhombusTokenType::Import,
203            "export" => RhombusTokenType::Export,
204            "require" => RhombusTokenType::Require,
205            "provide" => RhombusTokenType::Provide,
206            "true" | "false" => RhombusTokenType::BooleanLiteral,
207            _ => RhombusTokenType::Identifier,
208        };
209
210        state.add_token(kind, start, end);
211        true
212    }
213
214    fn is_identifier_start(&self, ch: char) -> bool {
215        ch.is_alphabetic() || ch == '_'
216    }
217
218    fn is_identifier_continue(&self, ch: char) -> bool {
219        self.is_identifier_start(ch) || ch.is_ascii_digit()
220    }
221
222    fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
223        let start = state.get_position();
224        let ch = match state.peek() {
225            Some(ch) => ch,
226            None => return false,
227        };
228
229        let kind = match ch {
230            '(' => Some(RhombusTokenType::LeftParen),
231            ')' => Some(RhombusTokenType::RightParen),
232            '[' => Some(RhombusTokenType::LeftBracket),
233            ']' => Some(RhombusTokenType::RightBracket),
234            '{' => Some(RhombusTokenType::LeftBrace),
235            '}' => Some(RhombusTokenType::RightBrace),
236            '.' => Some(RhombusTokenType::Dot),
237            ',' => Some(RhombusTokenType::Comma),
238            ':' => Some(RhombusTokenType::Colon),
239            ';' => Some(RhombusTokenType::Semicolon),
240            _ => None,
241        };
242
243        if let Some(kind) = kind {
244            state.advance(ch.len_utf8());
245            state.add_token(kind, start, state.get_position());
246            true
247        }
248        else {
249            false
250        }
251    }
252}