use anyhow::Result;
use regex::Regex;
use std::collections::HashMap;
use stdext::function_name;
use strum::IntoEnumIterator;

use crate::syntax::lexer::token::{Builtin, Delimiter, Operator, TokenKind};

use self::{
    error::LexError,
    token::{Keyword, Token},
};

use super::source::{range::Range, Source};

pub(crate) mod error;
pub(crate) mod token;

#[derive(Debug)]
pub struct Comment {
    pub content: String,
    pub range: Range,
}

impl Comment {
    pub fn new(content: String, range: Range) -> Self {
        Self { content, range }
    }
}

#[derive(Debug)]
pub struct Lexer {
    source: Box<dyn Source>,
    peek: Option<Result<Token, LexError>>,
    keys: HashMap<String, Keyword>,
    pub comments: Vec<Comment>,
}

impl Lexer {
    pub fn new(source: Box<dyn Source>) -> Self {
        log::trace!("{}", function_name!());
        let mut keys = HashMap::new();

        for key in Keyword::iter() {
            keys.insert(key.to_string(), key);
        }

        Self {
            source,
            peek: None,
            keys,
            comments: vec![],
        }
    }

    fn next(&mut self) -> Result<Token, LexError> {
        log::trace!("{}", function_name!());
        let from = self.source.pos();
        let next;
        match self.source.peek() {
            Some(c) => match c {
                // Whitespace carries no token; skip it and lex the next character.
                c if c.is_whitespace() => {
                    self.source.eat();
                    return self.next();
                }
                // "--" starts a line comment; its text is recorded and lexing continues after it.
                '-' => {
                    self.source.eat();
                    if self.source.peek() == Some('-') {
                        log::trace!("comment");
                        self.source.eat();
                        let mut comment = String::new();
                        while let Some(c) = self.source.peek() {
                            comment.push(c);
                            if c == '\n' {
                                self.comments.push(Comment::new(
                                    comment.clone(),
                                    Range::new(from, self.source.pos()),
                                ));
                                self.source.eat();
                                break;
                            }
                            self.source.eat();
                        }
                        return self.next();
                    }
                    return Err(LexError::new(
                        "unfinished comment: missing '-'".to_string(),
                        Range::new(from, self.source.pos()),
                    ));
                }
                ',' => {
                    self.source.eat();
                    next = Token {
                        range: Range::new(from, self.source.pos()),
                        kind: Delimiter::Comma.into(),
                    }
                }
                '|' => {
                    self.source.eat();
                    next = Token {
                        range: Range::new(from, self.source.pos()),
                        kind: Delimiter::Pipe.into(),
                    }
                }
                '(' => {
                    self.source.eat();
                    next = Token {
                        range: Range::new(from, self.source.pos()),
                        kind: Delimiter::LParen.into(),
                    }
                }
                ')' => {
                    self.source.eat();
                    next = Token {
                        range: Range::new(from, self.source.pos()),
                        kind: Delimiter::RParen.into(),
                    }
                }
                '[' => {
                    self.source.eat();
                    next = Token {
                        range: Range::new(from, self.source.pos()),
                        kind: Delimiter::LBracket.into(),
                    }
                }
                ']' => {
                    self.source.eat();
                    next = Token {
                        range: Range::new(from, self.source.pos()),
                        kind: Delimiter::RBracket.into(),
                    }
                }
                // "{{" lexes as a left double brace, mirroring the "}}", "<<" and ">>" cases below.
                '{' => {
                    self.source.eat();
                    if self.source.peek() == Some('{') {
                        self.source.eat();
                        next = Token {
                            range: Range::new(from, self.source.pos()),
                            kind: Delimiter::LDBrace.into(),
                        }
                    } else {
                        next = Token {
                            range: Range::new(from, self.source.pos()),
                            kind: Delimiter::LBrace.into(),
                        }
                    }
                }
                '}' => {
                    self.source.eat();
                    if self.source.peek() == Some('}') {
                        self.source.eat();
                        next = Token {
                            range: Range::new(from, self.source.pos()),
                            kind: Delimiter::RDBrace.into(),
                        }
                    } else {
                        next = Token {
                            range: Range::new(from, self.source.pos()),
                            kind: Delimiter::RBrace.into(),
                        }
                    }
                }
                '<' => {
                    self.source.eat();
                    if self.source.peek() == Some('<') {
                        self.source.eat();
                        next = Token {
                            range: Range::new(from, self.source.pos()),
                            kind: Delimiter::LDAngle.into(),
                        }
                    } else {
                        next = Token {
                            range: Range::new(from, self.source.pos()),
                            kind: Delimiter::LAngle.into(),
                        }
                    }
                }
                '>' => {
                    self.source.eat();
                    if self.source.peek() == Some('>') {
                        self.source.eat();
                        next = Token {
                            range: Range::new(from, self.source.pos()),
                            kind: Delimiter::RDAngle.into(),
                        }
                    } else {
                        next = Token {
                            range: Range::new(from, self.source.pos()),
                            kind: Delimiter::RAngle.into(),
                        }
                    }
                }
                '≔' => {
                    self.source.eat();
                    next = Token {
                        range: Range::new(from, self.source.pos()),
                        kind: Operator::Def.into(),
                    }
                }
                '∷' => {
                    self.source.eat();
                    next = Token {
                        range: Range::new(from, self.source.pos()),
                        kind: Operator::Concat.into(),
                    }
                }
                ':' => {
                    self.source.eat();
                    if self.source.peek() == Some('=') {
                        self.source.eat();
                        next = Token {
                            range: Range::new(from, self.source.pos()),
                            kind: Operator::Def.into(),
                        }
                    } else if self.source.peek() == Some(':') {
                        self.source.eat();
                        next = Token {
                            range: Range::new(from, self.source.pos()),
                            kind: Operator::Concat.into(),
                        }
                    } else {
                        next = Token {
                            range: Range::new(from, self.source.pos()),
                            kind: Operator::TypeDef.into(),
                        }
                    }
                }
                '\'' => next = self.character()?,
                '"' => next = self.string()?,
                '/' => next = self.regex()?,
                _ => next = self.identifier(),
            },
            None => {
                next = Token {
                    range: Range::new(from, self.source.pos()),
                    kind: TokenKind::EOF,
                }
            }
        };

        Ok(next)
    }

    pub fn eat(&mut self) -> Result<Token> {
        log::trace!("{}", function_name!());
        match self.peek.clone() {
            Some(p) => {
                self.peek = None;
                match p {
                    Ok(t) => Ok(t),
                    Err(e) => anyhow::bail!(e),
                }
            }
            None => match self.next() {
                Ok(t) => Ok(t),
                Err(e) => anyhow::bail!(e),
            },
        }
    }

    pub fn peek(&mut self) -> Result<Token> {
        log::trace!("{}", function_name!());
        match self.peek.clone() {
            Some(p) => match p {
                Ok(t) => Ok(t),
                Err(e) => anyhow::bail!(e),
            },
            None => {
                let next = self.next();
                self.peek = Some(next.clone());
                match next {
                    Ok(o) => Ok(o),
                    Err(e) => anyhow::bail!(e),
                }
            }
        }
    }

    fn identifier(&mut self) -> Token {
        log::trace!("{}", function_name!());
        let from = self.source.pos();
        let mut id = String::new();
        let mut mc = self.source.peek();

        while let Some(c) = mc {
            if c.is_whitespace() || (c.is_ascii_punctuation() && c != '-' && c != '_') {
                break;
            }

            id.push(c);
            self.source.eat();
            mc = self.source.peek();
        }

        if let Some(k) = self.keys.get(&id) {
            return Token {
                range: Range::new(from, self.source.pos()),
                kind: (*k).into(),
            };
        }

        Token {
            range: Range::new(from, self.source.pos()),
            kind: TokenKind::Identifier(id),
        }
    }

    fn character(&mut self) -> Result<Token, LexError> {
        log::trace!("{}", function_name!());
        let from = self.source.pos();
        if self.source.peek() != Some('\'') {
            return Err(LexError::new(
                "missing '\\'' when lexing character".to_string(),
                Range::new(from, self.source.pos()),
            ));
        }

        self.source.eat();
        let mut string = String::new();

        match self.source.peek() {
            Some(c) => {
                self.source.eat();
                if c == '\\' {
                    // Escape sequences are kept in their escaped (two character) form.
                    match self.source.eat() {
                        Some(c) => match c {
                            '\'' => string.push_str("\\\'"),
                            '/' => string.push_str("\\/"),
                            '\\' => string.push_str("\\\\"),
                            '0' => string.push_str("\\0"),
                            'n' => string.push_str("\\n"),
                            't' => string.push_str("\\t"),
                            'r' => string.push_str("\\r"),
                            _ => {
                                return Err(LexError::new(
                                    format!("invalid escaped literal '\\{c}'"),
                                    Range::new(from, self.source.pos()),
                                ));
                            }
                        },
                        None => {
                            return Err(LexError::new(
                                "empty escaped literal".to_string(),
                                Range::new(from, self.source.pos()),
                            ));
                        }
                    }
                } else if c == '\'' {
                    return Err(LexError::new(
                        "invalid empty literal".to_string(),
                        Range::new(from, self.source.pos()),
                    ));
                } else {
                    string.push(c);
                }
            }
            None => {
                return Err(LexError::new(
                    "invalid literal".to_string(),
                    Range::new(from, self.source.pos()),
                ));
            }
        }

        if let Some(c) = self.source.peek() {
            if c == '\'' {
                self.source.eat();
                return Ok(Token {
                    kind: Builtin::Char(string).into(),
                    range: Range::new(from, self.source.pos()),
                });
            }
        }

        Err(LexError::new(
            format!("unclosed literal, missing '\\'' when lexing character {string}"),
            Range::new(from, self.source.pos()),
        ))
    }

    fn string(&mut self) -> Result<Token, LexError> {
        log::trace!("{}", function_name!());
        let from = self.source.pos();
        if self.source.peek() != Some('"') {
            return Err(LexError::new(
                "missing '\"' when lexing string".to_string(),
                Range::new(from, self.source.pos()),
            ));
        }

        self.source.eat();
        let mut string = String::new();

        while let Some(c) = self.source.peek() {
            if c == '"' {
                self.source.eat();
                return Ok(Token {
                    kind: Builtin::String(string).into(),
                    range: Range::new(from, self.source.pos()),
                });
            } else if c == '\n' {
                return Err(LexError::new(
                    "unclosed string literal: missing '\"'".to_string(),
                    Range::new(from, self.source.pos()),
                ));
            } else if c == '\\' {
                self.source.eat();
                if let Some(c) = self.source.peek() {
                    match c {
                        '"' => string.push_str("\\\""),
                        '\\' => string.push_str("\\\\"),
                        '/' => string.push_str("\\/"),
                        '0' => string.push_str("\\0"),
                        'n' => string.push_str("\\n"),
                        't' => string.push_str("\\t"),
                        'r' => string.push_str("\\r"),
                        _ => {
                            return Err(LexError::new(
                                format!("unknown escape sequence '\\{c}'"),
                                Range::new(from, self.source.pos()),
                            ))
                        }
                    }
                    self.source.eat();
                }
            } else {
                string.push(c);
                self.source.eat();
            }
        }

        Err(LexError::new(
            "unclosed string literal: missing '\"'".to_string(),
            Range::new(from, self.source.pos()),
        ))
    }

    fn regex(&mut self) -> Result<Token, LexError> {
        log::trace!("{}", function_name!());

        let from = self.source.pos();
        if self.source.peek() != Some('/') {
            return Err(LexError::new(
                "missing '/' when lexing regex".to_string(),
                Range::new(from, self.source.pos()),
            ));
        }

        self.source.eat();
        let mut string = String::new();

        while let Some(c) = self.source.peek() {
            if c == '/' {
                self.source.eat();
                let regex = match Regex::new(string.as_str()) {
                    Ok(r) => r,
                    Err(e) => {
                        return Err(LexError::new(
                            format!("failed parsing regex: {e}"),
                            Range::new(from, self.source.pos()),
                        ))
                    }
                };
                return Ok(Token {
                    kind: Builtin::Regex(regex.into()).into(),
                    range: Range::new(from, self.source.pos()),
                });
            } else if c == '\\' {
                self.source.eat();
                if let Some(c) = self.source.peek() {
                    string.push('\\');
                    string.push(c);
                    self.source.eat();
                }
            } else if c == '\n' {
                return Err(LexError::new(
                    "unclosed regex: missing '/'".to_string(),
                    Range::new(from, self.source.pos()),
                ));
            } else {
                string.push(c);
                self.source.eat();
            }
        }

        Err(LexError::new(
            "unclosed regex: missing '/'".to_string(),
            Range::new(from, self.source.pos()),
        ))
    }
}

#[cfg(test)]
pub mod test {
    use crate::syntax::source::string::StringSource;

    use super::*;

    #[test]
    pub fn smoke_test() {
        let source = String::from("test");
        let mut lex = Lexer::new(Box::<StringSource>::new(source.into()));

        _ = lex.peek();
    }
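
    // A sketch of a test for the "--" comment handling in `next`: comments are
    // collected on `lexer.comments` and skipped in the token stream. The stored
    // content keeps the text after "--" (leading space and trailing newline
    // included), so only a substring is asserted here.
    #[test]
    pub fn comments() {
        let source = String::from("-- note\nname");

        let mut lex = Lexer::new(Box::<StringSource>::new(source.into()));

        let t = lex.next();
        assert!(t.is_ok());
        assert!(t.unwrap().kind.is_identifier(String::from("name")));

        assert_eq!(lex.comments.len(), 1);
        assert!(lex.comments[0].content.contains("note"));
    }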

    #[test]
    pub fn keys() {
        let mut keys = vec![];

        for key in Keyword::iter() {
            keys.push(key);
        }

        let source = keys
            .iter()
            .map(|k| k.to_string())
            .collect::<Vec<String>>()
            .join(" ");
        let mut lex = Lexer::new(Box::<StringSource>::new(source.into()));
        for key in keys {
            let nt = lex.next();
            assert!(nt.is_ok());
            let t: Token = nt.unwrap();

            assert_eq!(t.kind, key.into());
        }
    }
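
    // A sketch of a delimiter test; like the keyword test above it compares
    // token kinds through the `Delimiter -> TokenKind` conversion used in
    // `next`.
    #[test]
    pub fn delimiters() {
        let source = String::from("()[],|");

        let mut lex = Lexer::new(Box::<StringSource>::new(source.into()));
        let expected = vec![
            Delimiter::LParen,
            Delimiter::RParen,
            Delimiter::LBracket,
            Delimiter::RBracket,
            Delimiter::Comma,
            Delimiter::Pipe,
        ];

        for delim in expected {
            let t = lex.next();
            assert!(t.is_ok());
            assert_eq!(t.unwrap().kind, delim.into());
        }
    }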

    #[test]
    pub fn strings() {
        let source = String::from("\"--\"");

        let mut lex = Lexer::new(Box::<StringSource>::new(source.into()));
        let t = lex.next();
        assert!(t.is_ok());
        let t = t.unwrap();

        println!("{:?}", t);
        assert_eq!(
            t.kind,
            TokenKind::Builtin(Builtin::String(String::from("--")))
        );
    }
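
    // A sketch of escape handling in `string`: recognized escapes are stored in
    // their escaped two-character form, and a string that reaches end of input
    // without a closing '"' is an error.
    #[test]
    pub fn string_escapes() {
        let source = String::from("\"a\\nb\"");

        let mut lex = Lexer::new(Box::<StringSource>::new(source.into()));
        let t = lex.next();
        assert!(t.is_ok());
        assert_eq!(
            t.unwrap().kind,
            TokenKind::Builtin(Builtin::String(String::from("a\\nb")))
        );

        let unclosed = String::from("\"abc");
        let mut lex = Lexer::new(Box::<StringSource>::new(unclosed.into()));
        assert!(lex.next().is_err());
    }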

    #[test]
    pub fn ids() {
        let source = String::from("name:");

        let mut lex = Lexer::new(Box::<StringSource>::new(source.into()));

        let t = lex.next();
        assert!(t.is_ok());
        let t = t.unwrap();
        println!("{:?}", t);
        assert!(t.kind.is_identifier(String::from("name")));

        let t = lex.next();
        assert!(t.is_ok());
        let t = t.unwrap();
        assert!(t.kind.is_operator(Operator::TypeDef));
    }
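
    // A sketch of a character-literal test for `character`: a quoted character
    // lexes to `Builtin::Char`, while the empty literal '' is rejected.
    #[test]
    pub fn chars() {
        let source = String::from("'a'");

        let mut lex = Lexer::new(Box::<StringSource>::new(source.into()));
        let t = lex.next();
        assert!(t.is_ok());
        assert_eq!(
            t.unwrap().kind,
            TokenKind::Builtin(Builtin::Char(String::from("a")))
        );

        let empty = String::from("''");
        let mut lex = Lexer::new(Box::<StringSource>::new(empty.into()));
        assert!(lex.next().is_err());
    }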

    #[test]
    pub fn regexes() {
        let source = String::from("/a///");

        let mut lex = Lexer::new(Box::<StringSource>::new(source.into()));
        let t = lex.next();
        assert!(t.is_ok());
        let t = t.unwrap();

        println!("{:?}", t);
        assert_eq!(
            t.kind,
            TokenKind::Builtin(Builtin::Regex(token::RegexW {
                regex: Regex::new("a").unwrap()
            }))
        );
    }
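
    // A sketch of an operator test covering the ASCII spellings handled in
    // `next`: ":=" lexes as Def, "::" as Concat, and a lone ':' as TypeDef.
    #[test]
    pub fn operators() {
        let source = String::from(":= :: :");

        let mut lex = Lexer::new(Box::<StringSource>::new(source.into()));

        let t = lex.next().unwrap();
        assert!(t.kind.is_operator(Operator::Def));

        let t = lex.next().unwrap();
        assert!(t.kind.is_operator(Operator::Concat));

        let t = lex.next().unwrap();
        assert!(t.kind.is_operator(Operator::TypeDef));
    }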
}