1#![doc = include_str!("readme.md")]
2use crate::{language::RhombusLanguage, lexer::token_type::RhombusTokenType};
3pub mod token_type;
5use oak_core::{
6 Lexer, LexerCache, LexerState, OakError, Source, TextEdit,
7 lexer::{CommentConfig, LexOutput, StringConfig, WhitespaceConfig},
8};
9use std::sync::LazyLock;
10
11pub(crate) type State<'a, S> = LexerState<'a, S, RhombusLanguage>;
12
13static RHOMBUS_WHITESPACE: LazyLock<WhitespaceConfig> = LazyLock::new(|| WhitespaceConfig { unicode_whitespace: true });
14static RHOMBUS_COMMENT: LazyLock<CommentConfig> = LazyLock::new(|| CommentConfig { line_marker: "//", block_start: "/*", block_end: "*/", nested_blocks: true });
15static RHOMBUS_STRING: LazyLock<StringConfig> = LazyLock::new(|| StringConfig { quotes: &['"'], escape: Some('\\') });
16
17#[derive(Clone, Debug)]
19pub struct RhombusLexer<'config> {
20 config: &'config RhombusLanguage,
21}
22
23impl<'config> Lexer<RhombusLanguage> for RhombusLexer<'config> {
24 fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<RhombusLanguage>) -> LexOutput<RhombusLanguage> {
25 let mut state: State<'_, S> = LexerState::new_with_cache(source, 0, cache);
26 let result = self.run(&mut state);
27 state.finish_with_cache(result, cache)
28 }
29}
30
31impl<'config> RhombusLexer<'config> {
32 pub fn new(config: &'config RhombusLanguage) -> Self {
34 Self { config }
35 }
36
37 fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
38 while state.not_at_end() {
39 let safe_point = state.get_position();
40
41 if self.skip_whitespace(state) {
42 continue;
43 }
44
45 if self.lex_newline(state) {
46 continue;
47 }
48
49 if self.skip_comment(state) {
50 continue;
51 }
52
53 if self.lex_string_literal(state) {
54 continue;
55 }
56
57 if self.lex_number_literal(state) {
58 continue;
59 }
60
61 if self.lex_identifier_or_keyword(state) {
62 continue;
63 }
64
65 if self.lex_single_char_tokens(state) {
66 continue;
67 }
68
69 let start_pos = state.get_position();
71 if let Some(ch) = state.peek() {
72 state.advance(ch.len_utf8());
73 state.add_token(RhombusTokenType::Error, start_pos, state.get_position());
74 }
75
76 state.advance_if_dead_lock(safe_point)
77 }
78
79 state.add_eof();
81 Ok(())
82 }
83
84 fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
85 RHOMBUS_WHITESPACE.scan(state, RhombusTokenType::Whitespace)
86 }
87
88 fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
90 let start_pos = state.get_position();
91
92 if let Some('\n') = state.peek() {
93 state.advance(1);
94 state.add_token(RhombusTokenType::Newline, start_pos, state.get_position());
95 true
96 }
97 else if let Some('\r') = state.peek() {
98 state.advance(1);
99 if let Some('\n') = state.peek() {
100 state.advance(1);
101 }
102 state.add_token(RhombusTokenType::Newline, start_pos, state.get_position());
103 true
104 }
105 else {
106 false
107 }
108 }
109
110 fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
111 RHOMBUS_COMMENT.scan(state, RhombusTokenType::LineComment, RhombusTokenType::Comment)
112 }
113
114 fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
115 RHOMBUS_STRING.scan(state, RhombusTokenType::StringLiteral)
116 }
117
118 fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
119 let start = state.get_position();
120 let mut len = 0;
121 let mut has_digits = false;
122
123 {
124 let rest = state.rest();
125 if rest.is_empty() {
126 return false;
127 }
128
129 let first_char = rest.chars().next().unwrap();
130 if !first_char.is_ascii_digit() {
131 return false;
132 }
133
134 let mut chars = rest.chars();
135 while let Some(ch) = chars.next() {
136 if ch.is_ascii_digit() || ch == '.' || ch == '_' {
137 len += ch.len_utf8();
138 if ch.is_ascii_digit() {
139 has_digits = true;
140 }
141 }
142 else {
143 break;
144 }
145 }
146 }
147
148 if has_digits {
149 state.advance(len);
150 let end = state.get_position();
151 state.add_token(RhombusTokenType::NumberLiteral, start, end);
152 true
153 }
154 else {
155 false
156 }
157 }
158
159 fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
160 let start = state.get_position();
161 let mut len;
162
163 {
164 let rest = state.rest();
165 if rest.is_empty() {
166 return false;
167 }
168
169 let first_char = rest.chars().next().unwrap();
170 if !self.is_identifier_start(first_char) {
171 return false;
172 }
173
174 len = first_char.len_utf8();
175 let mut chars = rest.chars().skip(1);
176
177 while let Some(ch) = chars.next() {
178 if self.is_identifier_continue(ch) {
179 len += ch.len_utf8();
180 }
181 else {
182 break;
183 }
184 }
185 }
186
187 let text = state.get_text_in(oak_core::Range { start, end: start + len }).to_string();
188 state.advance(len);
189 let end = state.get_position();
190
191 let kind = match text.as_str() {
192 "fun" => RhombusTokenType::Fun,
193 "val" => RhombusTokenType::Val,
194 "var" => RhombusTokenType::Var,
195 "let" => RhombusTokenType::Let,
196 "if" => RhombusTokenType::If,
197 "else" => RhombusTokenType::Else,
198 "match" => RhombusTokenType::Match,
199 "case" => RhombusTokenType::Case,
200 "block" => RhombusTokenType::Block,
201 "module" => RhombusTokenType::Module,
202 "import" => RhombusTokenType::Import,
203 "export" => RhombusTokenType::Export,
204 "require" => RhombusTokenType::Require,
205 "provide" => RhombusTokenType::Provide,
206 "true" | "false" => RhombusTokenType::BooleanLiteral,
207 _ => RhombusTokenType::Identifier,
208 };
209
210 state.add_token(kind, start, end);
211 true
212 }
213
214 fn is_identifier_start(&self, ch: char) -> bool {
215 ch.is_alphabetic() || ch == '_'
216 }
217
218 fn is_identifier_continue(&self, ch: char) -> bool {
219 self.is_identifier_start(ch) || ch.is_ascii_digit()
220 }
221
222 fn lex_single_char_tokens<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
223 let start = state.get_position();
224 let ch = match state.peek() {
225 Some(ch) => ch,
226 None => return false,
227 };
228
229 let kind = match ch {
230 '(' => Some(RhombusTokenType::LeftParen),
231 ')' => Some(RhombusTokenType::RightParen),
232 '[' => Some(RhombusTokenType::LeftBracket),
233 ']' => Some(RhombusTokenType::RightBracket),
234 '{' => Some(RhombusTokenType::LeftBrace),
235 '}' => Some(RhombusTokenType::RightBrace),
236 '.' => Some(RhombusTokenType::Dot),
237 ',' => Some(RhombusTokenType::Comma),
238 ':' => Some(RhombusTokenType::Colon),
239 ';' => Some(RhombusTokenType::Semicolon),
240 _ => None,
241 };
242
243 if let Some(kind) = kind {
244 state.advance(ch.len_utf8());
245 state.add_token(kind, start, state.get_position());
246 true
247 }
248 else {
249 false
250 }
251 }
252}