morgana_core/syntax/lexer/
mod.rs

1use anyhow::Result;
2use regex::Regex;
3use std::collections::HashMap;
4use stdext::function_name;
5use strum::IntoEnumIterator;
6
7use crate::syntax::lexer::token::{Builtin, Delimiter, Operator, TokenKind};
8
9use self::{
10    error::LexError,
11    token::{Keyword, Token},
12};
13
14use super::source::{range::Range, Source};
15
16pub(crate) mod error;
17pub(crate) mod token;
18
19#[derive(Debug)]
20pub struct Comment {
21    pub content: String,
22    pub range: Range,
23}
24
25impl Comment {
26    pub fn new(content: String, range: Range) -> Self {
27        Self { content, range }
28    }
29}
30
31#[derive(Debug)]
32pub struct Lexer {
33    source: Box<dyn Source>,
34    peek: Option<Result<Token, LexError>>,
35    keys: HashMap<String, Keyword>,
36    pub comments: Vec<Comment>,
37}
38
39impl Lexer {
40    pub fn new(source: Box<dyn Source>) -> Self {
41        log::trace!("{}", function_name!());
42        let mut keys = HashMap::new();
43
44        for key in Keyword::iter() {
45            keys.insert(key.to_string(), key);
46        }
47
48        Self {
49            source,
50            peek: None,
51            keys,
52            comments: vec![],
53        }
54    }
55    fn next(&mut self) -> Result<Token, LexError> {
56        log::trace!("{}", function_name!());
57        let from = self.source.pos();
58        let next;
59        match self.source.peek() {
60            Some(c) => match c {
61                c if c.is_whitespace() => {
62                    self.source.eat();
63                    return self.next();
64                }
65                '-' => {
66                    self.source.eat();
67                    if self.source.peek() == Some('-') {
68                        log::trace!("comment");
69                        self.source.eat();
70                        let mut comment = String::new();
71                        while let Some(c) = self.source.peek() {
72                            comment.push(c);
73                            if c == '\n' {
74                                self.comments.push(Comment::new(
75                                    comment.clone(),
76                                    Range::new(from, self.source.pos()),
77                                ));
78                                self.source.eat();
79                                break;
80                            }
81                            self.source.eat();
82                        }
83                        return self.next();
84                    }
85                    return Err(LexError::new(
86                        "unfinished comment: missing '-'".to_string(),
87                        Range::new(from, self.source.pos()),
88                    ));
89                }
90                ',' => {
91                    self.source.eat();
92                    next = Token {
93                        range: Range::new(from, self.source.pos()),
94                        kind: Delimiter::Comma.into(),
95                    }
96                }
97
98                '|' => {
99                    self.source.eat();
100                    next = Token {
101                        range: Range::new(from, self.source.pos()),
102                        kind: Delimiter::Pipe.into(),
103                    }
104                }
105                '(' => {
106                    self.source.eat();
107                    next = Token {
108                        range: Range::new(from, self.source.pos()),
109                        kind: Delimiter::LParen.into(),
110                    }
111                }
112
113                ')' => {
114                    self.source.eat();
115                    next = Token {
116                        range: Range::new(from, self.source.pos()),
117                        kind: Delimiter::RParen.into(),
118                    }
119                }
120
121                '[' => {
122                    self.source.eat();
123                    next = Token {
124                        range: Range::new(from, self.source.pos()),
125                        kind: Delimiter::LBracket.into(),
126                    }
127                }
128
129                ']' => {
130                    self.source.eat();
131                    next = Token {
132                        range: Range::new(from, self.source.pos()),
133                        kind: Delimiter::RBracket.into(),
134                    }
135                }
136
137                '{' => {
138                    self.source.eat();
139                    if self.source.peek() == Some('}') {
140                        self.source.eat();
141                        next = Token {
142                            range: Range::new(from, self.source.pos()),
143                            kind: Delimiter::LDBrace.into(),
144                        }
145                    } else {
146                        next = Token {
147                            range: Range::new(from, self.source.pos()),
148                            kind: Delimiter::LBrace.into(),
149                        }
150                    }
151                }
152
153                '}' => {
154                    self.source.eat();
155                    if self.source.peek() == Some('}') {
156                        self.source.eat();
157                        next = Token {
158                            range: Range::new(from, self.source.pos()),
159                            kind: Delimiter::RDBrace.into(),
160                        }
161                    } else {
162                        next = Token {
163                            range: Range::new(from, self.source.pos()),
164                            kind: Delimiter::RBrace.into(),
165                        }
166                    }
167                }
168
169                '<' => {
170                    self.source.eat();
171                    if self.source.peek() == Some('<') {
172                        self.source.eat();
173                        next = Token {
174                            range: Range::new(from, self.source.pos()),
175                            kind: Delimiter::LDAngle.into(),
176                        }
177                    } else {
178                        next = Token {
179                            range: Range::new(from, self.source.pos()),
180                            kind: Delimiter::LAngle.into(),
181                        }
182                    }
183                }
184
185                '>' => {
186                    self.source.eat();
187                    if self.source.peek() == Some('>') {
188                        self.source.eat();
189                        next = Token {
190                            range: Range::new(from, self.source.pos()),
191                            kind: Delimiter::RDAngle.into(),
192                        }
193                    } else {
194                        next = Token {
195                            range: Range::new(from, self.source.pos()),
196                            kind: Delimiter::RAngle.into(),
197                        }
198                    }
199                }
200
201                '≔' => {
202                    self.source.eat();
203                    next = Token {
204                        range: Range::new(from, self.source.pos()),
205                        kind: Operator::Def.into(),
206                    }
207                }
208                '∷' => {
209                    self.source.eat();
210                    next = Token {
211                        range: Range::new(from, self.source.pos()),
212                        kind: Operator::Concat.into(),
213                    }
214                }
215                ':' => {
216                    self.source.eat();
217                    if self.source.peek() == Some('=') {
218                        self.source.eat();
219                        next = Token {
220                            range: Range::new(from, self.source.pos()),
221                            kind: Operator::Def.into(),
222                        }
223                    } else if self.source.peek() == Some(':') {
224                        self.source.eat();
225                        next = Token {
226                            range: Range::new(from, self.source.pos()),
227                            kind: Operator::Concat.into(),
228                        }
229                    } else {
230                        next = Token {
231                            range: Range::new(from, self.source.pos()),
232                            kind: Operator::TypeDef.into(),
233                        }
234                    }
235                }
236                '\'' => next = self.character()?,
237                '"' => next = self.string()?,
238                '/' => next = self.regex()?,
239                _ => next = self.identifier(),
240            },
241            None => {
242                next = Token {
243                    range: Range::new(from, self.source.pos()),
244                    kind: TokenKind::EOF,
245                }
246            }
247        };
248
249        Ok(next)
250    }
251
252    pub fn eat(&mut self) -> Result<Token> {
253        log::trace!("{}", function_name!());
254        match self.peek.clone() {
255            Some(p) => {
256                self.peek = None;
257                match p {
258                    Ok(t) => Ok(t),
259                    Err(e) => anyhow::bail!(e),
260                }
261            }
262            None => match self.next() {
263                Ok(t) => Ok(t),
264                Err(e) => anyhow::bail!(e),
265            },
266        }
267    }
268
269    pub fn peek(&mut self) -> Result<Token> {
270        log::trace!("{}", function_name!());
271        match self.peek.clone() {
272            Some(p) => match p {
273                Ok(t) => Ok(t),
274                Err(e) => anyhow::bail!(e),
275            },
276            None => {
277                let next = self.next();
278                self.peek = Some(next.clone());
279                match next {
280                    Ok(o) => Ok(o),
281                    Err(e) => anyhow::bail!(e),
282                }
283            }
284        }
285    }
286
287    fn identifier(&mut self) -> Token {
288        log::trace!("{}", function_name!());
289        let from = self.source.pos();
290        let mut id = String::new();
291        let mut mc = self.source.peek();
292
293        while let Some(c) = mc {
294            if c.is_whitespace() || (c.is_ascii_punctuation() && c != '-' && c != '_') {
295                break;
296            }
297
298            id.push(c);
299            self.source.eat();
300            mc = self.source.peek();
301        }
302
303        if let Some(k) = self.keys.get(&id) {
304            return Token {
305                range: Range::new(from, self.source.pos()),
306                kind: (*k).into(),
307            };
308        }
309
310        Token {
311            range: Range::new(from, self.source.pos()),
312            kind: TokenKind::Identifier(id),
313        }
314    }
315
316    fn character(&mut self) -> Result<Token, LexError> {
317        log::trace!("{}", function_name!());
318        let from = self.source.pos();
319        if self.source.peek() != Some('\'') {
320            return Err(LexError::new(
321                "missing '\' when lexing string".to_string(),
322                Range::new(from, self.source.pos()),
323            ));
324        }
325
326        self.source.eat();
327        let mut string = String::new();
328
329        match self.source.peek() {
330            Some(c) => {
331                self.source.eat();
332                if c == '\\' {
333                    match self.source.eat() {
334                        Some(c) => match c {
335                            '\'' => string.push_str("\\\'"),
336                            '/' => string.push_str("\\/"),
337                            '\\' => string.push_str("\\\\"),
338                            '0' => string.push_str("\\0"),
339                            'n' => string.push_str("\\n"),
340                            't' => string.push_str("\\t"),
341                            'r' => string.push_str("\\r"),
342                            _ => {
343                                return Err(LexError::new(
344                                    format!("invalid escaped literal '\\{c}'"),
345                                    Range::new(from, self.source.pos()),
346                                ));
347                            }
348                        },
349                        None => {
350                            return Err(LexError::new(
351                                "empty escaped literal".to_string(),
352                                Range::new(from, self.source.pos()),
353                            ));
354                        }
355                    }
356                } else if c == '\'' {
357                    return Err(LexError::new(
358                        "invalid empty literal".to_string(),
359                        Range::new(from, self.source.pos()),
360                    ));
361                } else {
362                    string.push(c);
363                }
364            }
365            None => {
366                return Err(LexError::new(
367                    "invalid literal".to_string(),
368                    Range::new(from, self.source.pos()),
369                ));
370            }
371        }
372
373        if let Some(c) = self.source.peek() {
374            if c == '\'' {
375                self.source.eat();
376                return Ok(Token {
377                    kind: Builtin::Char(string).into(),
378                    range: Range::new(from, self.source.pos()),
379                });
380            }
381        }
382
383        return Err(LexError::new(
384            format!("unclosed literal, missing '\'' when lexing character {string}"),
385            Range::new(from, self.source.pos()),
386        ));
387    }
388
389    fn string(&mut self) -> Result<Token, LexError> {
390        log::trace!("{}", function_name!());
391        let from = self.source.pos();
392        if self.source.peek() != Some('"') {
393            return Err(LexError::new(
394                "missing '\"' when lexing string".to_string(),
395                Range::new(from, self.source.pos()),
396            ));
397        }
398
399        self.source.eat();
400        let mut string = String::new();
401
402        while let Some(c) = self.source.peek() {
403            if c == '"' {
404                self.source.eat();
405                return Ok(Token {
406                    kind: Builtin::String(string).into(),
407                    range: Range::new(from, self.source.pos()),
408                });
409            } else if c == '\n' {
410                return Err(LexError::new(
411                    "unclosed string literal: missing '\"'".to_string(),
412                    Range::new(from, self.source.pos()),
413                ));
414            } else if c == '\\' {
415                self.source.eat();
416                if let Some(c) = self.source.peek() {
417                    match c {
418                        '"' => string.push_str("\\\""),
419                        '\\' => string.push_str("\\\\"),
420                        '/' => string.push_str("\\/"),
421                        '0' => string.push_str("\\0"),
422                        'n' => string.push_str("\\n"),
423                        't' => string.push_str("\\t"),
424                        'r' => string.push_str("\\r"),
425                        _ => {
426                            return Err(LexError::new(
427                                "unknown escape sequence '\\{c}".to_string(),
428                                Range::new(from, self.source.pos()),
429                            ))
430                        }
431                    }
432                    self.source.eat();
433                }
434            } else {
435                string.push(c);
436                self.source.eat();
437            }
438        }
439
440        Err(LexError::new(
441            "unclosed string literal: missing '\"'".to_string(),
442            Range::new(from, self.source.pos()),
443        ))
444    }
445
446    fn regex(&mut self) -> Result<Token, LexError> {
447        log::trace!("{}", function_name!());
448
449        let from = self.source.pos();
450        if self.source.peek() != Some('/') {
451            return Err(LexError::new(
452                "missing '/' when lexing regex".to_string(),
453                Range::new(from, self.source.pos()),
454            ));
455        }
456
457        self.source.eat();
458        let mut string = String::new();
459
460        while let Some(c) = self.source.peek() {
461            if c == '/' {
462                self.source.eat();
463                let regex = match Regex::new(string.as_str()) {
464                    Ok(r) => r,
465                    Err(e) => {
466                        return Err(LexError::new(
467                            format!("failed parsing regex: {e}"),
468                            Range::new(from, self.source.pos()),
469                        ))
470                    }
471                };
472                return Ok(Token {
473                    kind: Builtin::Regex(regex.into()).into(),
474                    range: Range::new(from, self.source.pos()),
475                });
476            } else if c == '\\' {
477                self.source.eat();
478                if let Some(c) = self.source.peek() {
479                    string.push('\\');
480                    string.push(c);
481                    self.source.eat();
482                }
483            } else if c == '\n' {
484                return Err(LexError::new(
485                    "unclosed regex: missing '/'".to_string(),
486                    Range::new(from, self.source.pos()),
487                ));
488            } else {
489                string.push(c);
490                self.source.eat();
491            }
492        }
493
494        Err(LexError::new(
495            "unclosed regex: missing '/'".to_string(),
496            Range::new(from, self.source.pos()),
497        ))
498    }
499}
500
#[cfg(test)]
pub mod test {
    use crate::syntax::source::string::StringSource;

    use super::*;

    /// Builds a lexer that reads from an in-memory copy of `text`.
    fn lexer_over(text: &str) -> Lexer {
        Lexer::new(Box::<StringSource>::new(String::from(text).into()))
    }

    /// Lexes one token, asserting that lexing succeeded.
    fn lex_ok(lex: &mut Lexer) -> Token {
        let lexed = lex.next();
        assert!(lexed.is_ok());
        lexed.unwrap()
    }

    /// Peeking at a trivial input must not panic.
    #[test]
    pub fn smoke_test() {
        let mut lex = lexer_over("test");

        _ = lex.peek();
    }

    /// Every keyword, written out space-separated, lexes back to itself in order.
    #[test]
    pub fn keys() {
        let keys: Vec<Keyword> = Keyword::iter().collect();

        let source = keys
            .iter()
            .map(|k| k.to_string())
            .collect::<Vec<String>>()
            .join(" ");
        let mut lex = Lexer::new(Box::<StringSource>::new(source.into()));

        for key in keys {
            let t: Token = lex_ok(&mut lex);

            assert_eq!(t.kind, key.into())
        }
    }

    /// A string containing `--` is a string literal, not a comment.
    #[test]
    pub fn strings() {
        let mut lex = lexer_over("\"--\"");
        let t = lex_ok(&mut lex);

        println!("{:?}", t);
        assert_eq!(
            t.kind,
            TokenKind::Builtin(Builtin::String(String::from("--")))
        );
    }

    /// An identifier ends at `:`, which then lexes as the type-definition operator.
    #[test]
    pub fn ids() {
        let mut lex = lexer_over("name:");

        let t = lex_ok(&mut lex);
        println!("{:?}", t);
        assert!(t.kind.is_identifier(String::from("name")));

        let t = lex_ok(&mut lex);
        assert!(t.kind.is_operator(Operator::TypeDef));
    }

    /// A regex literal stops at the first unescaped `/`.
    #[test]
    pub fn regexes() {
        let mut lex = lexer_over("/a///");
        let t = lex_ok(&mut lex);

        println!("{:?}", t);
        assert_eq!(
            t.kind,
            TokenKind::Builtin(Builtin::Regex(token::RegexW {
                regex: Regex::new("a").unwrap()
            }))
        );
    }
}