1use std::ops::Range;
4
5use logos::Logos;
6
7use crate::syntax_kind::SyntaxKind;
8
9#[derive(Clone, Debug, Eq, PartialEq)]
17pub struct LexedSource<'a> {
18 source: &'a str,
19 tokens: Vec<Token<'a>>,
20}
21
22impl<'a> LexedSource<'a> {
23 pub fn as_str(&self) -> &'a str {
25 self.source
26 }
27
28 pub fn tokens(&self) -> &[Token<'a>] {
30 &self.tokens
31 }
32
33 pub fn iter(&self) -> impl Iterator<Item = &Token<'a>> {
35 self.tokens.iter()
36 }
37
38 pub fn into_tokens(self) -> Vec<Token<'a>> {
40 self.tokens
41 }
42
43 pub fn len(&self) -> usize {
45 self.tokens.len()
46 }
47
48 pub fn is_empty(&self) -> bool {
50 self.tokens.is_empty()
51 }
52}
53
54impl<'a> IntoIterator for LexedSource<'a> {
55 type IntoIter = std::vec::IntoIter<Token<'a>>;
56 type Item = Token<'a>;
57
58 fn into_iter(self) -> Self::IntoIter {
59 self.tokens.into_iter()
60 }
61}
62
63impl<'a, 's> IntoIterator for &'s LexedSource<'a> {
64 type IntoIter = std::slice::Iter<'s, Token<'a>>;
65 type Item = &'s Token<'a>;
66
67 fn into_iter(self) -> Self::IntoIter {
68 self.tokens.iter()
69 }
70}
71
72#[derive(Clone, Debug, Eq, PartialEq)]
77pub struct Token<'a> {
78 kind: SyntaxKind,
79 text: &'a str,
80 range: Range<usize>,
81}
82
83impl<'a> Token<'a> {
84 pub fn kind(&self) -> SyntaxKind {
86 self.kind
87 }
88
89 pub fn text(&self) -> &'a str {
91 self.text
92 }
93
94 pub fn range(&self) -> Range<usize> {
96 self.range.clone()
97 }
98}
99
/// Raw token kinds recognised by the `logos`-derived lexer.
///
/// `lex` translates each variant into a `SyntaxKind`. Two variants are refined
/// there: `Word` is checked against the keyword list, and `BlockComment` is
/// split into terminated and unterminated kinds.
#[derive(Clone, Copy, Debug, Logos, PartialEq)]
enum RawTokenKind {
    /// One or more spaces, tabs, line feeds, or carriage returns.
    #[regex(r"[ \t\n\r]+")]
    Whitespace,
    /// A double- or single-quoted string closed by a matching quote. The body
    /// may not contain a raw line break; a backslash pairs with any following
    /// character other than a line break.
    #[regex(r#""([^"\\\n\r]|\\[^\n\r])*""#)]
    #[regex(r#"'([^'\\\n\r]|\\[^\n\r])*'"#)]
    String,
    /// Same opening quote and body as `String`, but without the closing quote.
    /// The `unterminated_string` callback then advances the token to the next
    /// line break or the end of input.
    #[regex(r#""([^"\\\n\r]|\\[^\n\r])*"#, unterminated_string)]
    #[regex(r#"'([^'\\\n\r]|\\[^\n\r])*"#, unterminated_string)]
    UnterminatedString,
    /// `//` followed by everything up to, but not including, the line break.
    #[regex(r"//[^\n\r]*", allow_greedy = true)]
    LineComment,
    /// Only the opening `/*` is matched by the pattern; the `block_comment`
    /// callback advances the token through the first `*/`, or to the end of
    /// input when there is none.
    #[regex(r"/\*", block_comment)]
    BlockComment,
    /// Digits with an optional fractional part, or a fraction with a leading
    /// dot, optionally followed by `_` and then a unit suffix or `?`.
    #[regex(r"[0-9]+(\.[0-9]+)?_?(mm|cm|m|inch|in|ft|yd|deg|rad|\?)?")]
    #[regex(r"\.[0-9]+_?(mm|cm|m|inch|in|ft|yd|deg|rad|\?)?")]
    Number,
    // Multi-character period/colon tokens.
    #[token("..<")]
    DoublePeriodLessThan,
    #[token("..")]
    DoublePeriod,
    #[token("::")]
    DoubleColon,
    /// An alphabetic character or `_`, followed by any number of alphabetic
    /// characters, ASCII digits, or `_`.
    #[regex(r"[\p{Alphabetic}_][\p{Alphabetic}0-9_]*")]
    Word,
    // Two-character operators.
    #[token(">=")]
    GtEq,
    #[token("<=")]
    LtEq,
    #[token("==")]
    EqEq,
    #[token("=>")]
    FatArrow,
    #[token("!=")]
    BangEq,
    #[token("|>")]
    PipeGt,
    // Single-character operators.
    #[token("*")]
    Star,
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    #[token("=")]
    Eq,
    #[token("<")]
    Lt,
    #[token(">")]
    Gt,
    #[token("\\")]
    Backslash,
    #[token("^")]
    Caret,
    // Pipe and ampersand, doubled and single.
    #[token("||")]
    PipePipe,
    #[token("&&")]
    AmpAmp,
    #[token("|")]
    Pipe,
    #[token("&")]
    Amp,
    // Brackets.
    #[token("(")]
    OpenParen,
    #[token(")")]
    CloseParen,
    #[token("{")]
    OpenBrace,
    #[token("}")]
    CloseBrace,
    #[token("[")]
    OpenBracket,
    #[token("]")]
    CloseBracket,
    // Remaining single-character punctuation.
    #[token("#")]
    Hash,
    #[token("!")]
    Bang,
    #[token("$")]
    Dollar,
    #[token(",")]
    Comma,
    #[token(":")]
    Colon,
    #[token(".")]
    Period,
    #[token("?")]
    QuestionMark,
    #[token("@")]
    At,
    #[token(";")]
    SemiColon,
}
196
197pub fn lex(source: &str) -> LexedSource<'_> {
202 let mut lexer = RawTokenKind::lexer(source);
203 let mut tokens = Vec::new();
204
205 while let Some(raw_kind) = lexer.next() {
206 let range = lexer.span();
207 let text = &source[range.clone()];
208 let kind = match raw_kind {
209 Ok(RawTokenKind::Whitespace) => SyntaxKind::Whitespace,
210 Ok(RawTokenKind::String) => SyntaxKind::String,
211 Ok(RawTokenKind::UnterminatedString) => SyntaxKind::UnterminatedString,
212 Ok(RawTokenKind::LineComment) => SyntaxKind::LineComment,
213 Ok(RawTokenKind::BlockComment) if text.ends_with("*/") => SyntaxKind::BlockComment,
214 Ok(RawTokenKind::BlockComment) => SyntaxKind::UnterminatedBlockComment,
215 Ok(RawTokenKind::Number) => SyntaxKind::Number,
216 Ok(RawTokenKind::DoublePeriodLessThan) => SyntaxKind::DoublePeriodLessThan,
217 Ok(RawTokenKind::DoublePeriod) => SyntaxKind::DoublePeriod,
218 Ok(RawTokenKind::DoubleColon) => SyntaxKind::DoubleColon,
219 Ok(RawTokenKind::Word) => keyword_or_word(text),
220 Ok(RawTokenKind::GtEq) => SyntaxKind::GtEq,
221 Ok(RawTokenKind::LtEq) => SyntaxKind::LtEq,
222 Ok(RawTokenKind::EqEq) => SyntaxKind::EqEq,
223 Ok(RawTokenKind::FatArrow) => SyntaxKind::FatArrow,
224 Ok(RawTokenKind::BangEq) => SyntaxKind::BangEq,
225 Ok(RawTokenKind::PipeGt) => SyntaxKind::PipeGt,
226 Ok(RawTokenKind::Star) => SyntaxKind::Star,
227 Ok(RawTokenKind::Plus) => SyntaxKind::Plus,
228 Ok(RawTokenKind::Minus) => SyntaxKind::Minus,
229 Ok(RawTokenKind::Slash) => SyntaxKind::Slash,
230 Ok(RawTokenKind::Percent) => SyntaxKind::Percent,
231 Ok(RawTokenKind::Eq) => SyntaxKind::Eq,
232 Ok(RawTokenKind::Lt) => SyntaxKind::Lt,
233 Ok(RawTokenKind::Gt) => SyntaxKind::Gt,
234 Ok(RawTokenKind::Backslash) => SyntaxKind::Backslash,
235 Ok(RawTokenKind::Caret) => SyntaxKind::Caret,
236 Ok(RawTokenKind::PipePipe) => SyntaxKind::PipePipe,
237 Ok(RawTokenKind::AmpAmp) => SyntaxKind::AmpAmp,
238 Ok(RawTokenKind::Pipe) => SyntaxKind::Pipe,
239 Ok(RawTokenKind::Amp) => SyntaxKind::Amp,
240 Ok(RawTokenKind::OpenParen) => SyntaxKind::OpenParen,
241 Ok(RawTokenKind::CloseParen) => SyntaxKind::CloseParen,
242 Ok(RawTokenKind::OpenBrace) => SyntaxKind::OpenBrace,
243 Ok(RawTokenKind::CloseBrace) => SyntaxKind::CloseBrace,
244 Ok(RawTokenKind::OpenBracket) => SyntaxKind::OpenBracket,
245 Ok(RawTokenKind::CloseBracket) => SyntaxKind::CloseBracket,
246 Ok(RawTokenKind::Hash) => SyntaxKind::Hash,
247 Ok(RawTokenKind::Bang) => SyntaxKind::Bang,
248 Ok(RawTokenKind::Dollar) => SyntaxKind::Dollar,
249 Ok(RawTokenKind::Comma) => SyntaxKind::Comma,
250 Ok(RawTokenKind::Colon) => SyntaxKind::Colon,
251 Ok(RawTokenKind::Period) => SyntaxKind::Period,
252 Ok(RawTokenKind::QuestionMark) => SyntaxKind::QuestionMark,
253 Ok(RawTokenKind::At) => SyntaxKind::At,
254 Ok(RawTokenKind::SemiColon) => SyntaxKind::SemiColon,
255 Err(()) => SyntaxKind::Unknown,
256 };
257 tokens.push(Token { kind, text, range });
258 }
259
260 LexedSource { source, tokens }
261}
262
263fn keyword_or_word(text: &str) -> SyntaxKind {
264 match text {
265 "if" => SyntaxKind::IfKw,
266 "else" => SyntaxKind::ElseKw,
267 "for" => SyntaxKind::ForKw,
268 "while" => SyntaxKind::WhileKw,
269 "return" => SyntaxKind::ReturnKw,
270 "break" => SyntaxKind::BreakKw,
271 "continue" => SyntaxKind::ContinueKw,
272 "fn" => SyntaxKind::FnKw,
273 "let" => SyntaxKind::LetKw,
274 "mut" => SyntaxKind::MutKw,
275 "as" => SyntaxKind::AsKw,
276 "loop" => SyntaxKind::LoopKw,
277 "true" => SyntaxKind::TrueKw,
278 "false" => SyntaxKind::FalseKw,
279 "nil" => SyntaxKind::NilKw,
280 "and" => SyntaxKind::AndKw,
281 "or" => SyntaxKind::OrKw,
282 "not" => SyntaxKind::NotKw,
283 "var" => SyntaxKind::VarKw,
284 "const" => SyntaxKind::ConstKw,
285 "import" => SyntaxKind::ImportKw,
286 "export" => SyntaxKind::ExportKw,
287 "type" => SyntaxKind::TypeKw,
288 "interface" => SyntaxKind::InterfaceKw,
289 "new" => SyntaxKind::NewKw,
290 "self" => SyntaxKind::SelfKw,
291 "record" => SyntaxKind::RecordKw,
292 "struct" => SyntaxKind::StructKw,
293 "object" => SyntaxKind::ObjectKw,
294 _ => SyntaxKind::Word,
295 }
296}
297
298fn block_comment(lexer: &mut logos::Lexer<'_, RawTokenKind>) {
299 if let Some(end) = lexer.remainder().find("*/") {
300 lexer.bump(end + 2);
301 } else {
302 lexer.bump(lexer.remainder().len());
303 }
304}
305
306fn unterminated_string(lexer: &mut logos::Lexer<'_, RawTokenKind>) {
307 let until_line_end = lexer
308 .remainder()
309 .find(['\n', '\r'])
310 .unwrap_or_else(|| lexer.remainder().len());
311 lexer.bump(until_line_end);
312}