//! prune_lang/syntax/lexer.rs — lexical analysis for the prune language.
1use logos::Logos;
2
/// Byte-offset range (`start..end`) of a token in the source string.
pub type Span = logos::Span;
4
/// All lexical tokens of the prune language, produced by the
/// `logos`-derived lexer.
///
/// Inline whitespace (space, `\r`, `\t`, `\f`) is skipped; `\n` is kept as
/// its own token (`NewLine`) — together with the comment tokens it is
/// filtered out in `tokenize` but retained here for a future formatter.
#[derive(Clone, Copy, Debug, Eq, Logos, PartialEq)]
#[logos(skip r"[ \r\t\f]+")]
pub enum Token {
    // --- delimiters ---
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token("[")]
    LBracket,
    #[token("]")]
    RBracket,
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    // --- punctuation ---
    #[token(":")]
    Colon,
    #[token(";")]
    Semi,
    #[token(",")]
    Comma,
    #[token(".")]
    Dot,
    #[token("|")]
    Bar,
    #[token("=")]
    Equal,
    // --- arithmetic operators ---
    #[token("+")]
    Plus,
    #[token("-")]
    Minus,
    #[token("*")]
    Star,
    #[token("/")]
    Slash,
    #[token("%")]
    Percent,
    // --- comparison operators ---
    #[token("<")]
    Less,
    #[token("<=")]
    LessEqual,
    #[token("==")]
    EqualEqual,
    #[token(">=")]
    GreaterEqual,
    #[token(">")]
    Greater,
    #[token("!=")]
    BangEqual,
    // --- logical operators ---
    #[token("^")]
    Caret,
    #[token("&&")]
    DoubleAmpersand,
    #[token("||")]
    DoubleBar,
    #[token("!")]
    Bang,
    // --- arrows / binders ---
    #[token("->")]
    Arrow,
    #[token("=>")]
    FatArrow,
    #[token("<-")]
    LeftArrow,
    #[token(":=")]
    ColonEqual,
    // --- keywords ---
    #[token("let")]
    Let,
    #[token("if")]
    If,
    #[token("then")]
    Then,
    #[token("else")]
    Else,
    #[token("condition")]
    Condition,
    #[token("alternative")]
    Alternative,
    #[token("match")]
    Match,
    #[token("with")]
    With,
    #[token("case")]
    Case,
    #[token("of")]
    Of,
    #[token("as")]
    As,
    #[token("begin")]
    Begin,
    #[token("end")]
    End,
    #[token("fresh")]
    Fresh,
    #[token("and")]
    And,
    #[token("or")]
    Or,
    #[token("guard")]
    Guard,
    #[token("undefined")]
    Undefined,
    #[token("datatype")]
    Datatype,
    #[token("function")]
    Function,
    #[token("query")]
    Query,
    #[token("where")]
    Where,
    // --- literals (text is recovered from the span by later phases) ---
    // NOTE(review): the leading `-?` means `a-1` lexes as `LowerIdent Int`,
    // not `LowerIdent Minus Int` — presumably intended; confirm with parser.
    #[regex(r"-?[0-9]([0-9])*")]
    Int,
    #[regex(r"-?[0-9]([0-9])*\.[0-9]([0-9])*")]
    Float,
    #[token("true")]
    #[token("false")]
    Bool,
    // single char or a backslash escape between single quotes
    #[regex(r"'(.|\\.)'")]
    Char,
    // --- builtin type names ---
    #[token("Int")]
    TyInt,
    #[token("Float")]
    TyFloat,
    #[token("Bool")]
    TyBool,
    #[token("Char")]
    TyChar,
    #[token("()")] // both for unit type and unit value
    Unit,
    // --- identifiers ---
    // LowerIdent could be just wildcard "_", it is handled in parser
    #[regex(r"([a-z]|_)([a-zA-Z0-9]|_)*")]
    LowerIdent,
    #[regex(r"[A-Z]([a-zA-Z0-9]|_)*")]
    UpperIdent,
    // primitive operator, e.g. `@add`
    #[regex(r"@[a-zA-Z]([a-zA-Z0-9]|_)*")]
    PrimOpr,
    // --- trivia (dropped by `tokenize`, kept for a future formatter) ---
    #[token("//", line_comment)]
    LineComment,
    #[token("/*", block_comment)]
    BlockComment,
    #[token("\n")]
    NewLine,
    /// lexer failed, skip till next whitespace
    TokError,
    EndOfFile,
}
150
151fn line_comment(lex: &mut logos::Lexer<Token>) -> bool {
152    let mut rest = lex.remainder().chars();
153    loop {
154        if let Some(ch) = rest.next() {
155            lex.bump(ch.len_utf8());
156            if ch == '\n' {
157                return true;
158            }
159        } else {
160            return false;
161        }
162    }
163}
164
165fn block_comment(lex: &mut logos::Lexer<Token>) -> bool {
166    let mut rest = lex.remainder().chars();
167    let mut last_char = ' ';
168    let mut nested_level: usize = 1;
169    loop {
170        if let Some(ch) = rest.next() {
171            lex.bump(ch.len_utf8());
172            match ch {
173                '/' if last_char == '*' => {
174                    nested_level -= 1;
175                }
176                '*' if last_char == '/' => {
177                    nested_level += 1;
178                }
179                _ => {}
180            }
181            if nested_level == 0 {
182                return true;
183            }
184            last_char = ch;
185        } else {
186            return false;
187        }
188    }
189}
190
191pub struct TokenSpan {
192    pub token: Token,
193    pub span: Span,
194}
195
196pub fn tokenize(source: &str) -> Vec<TokenSpan> {
197    let mut lex = Token::lexer(source);
198    let mut vec = Vec::new();
199    while let Some(tok) = lex.next() {
200        let span = lex.span();
201        match tok {
202            // we don't leak these three tokens to parser
203            // but they will be useful in the future, if we want to write a formatter
204            Ok(Token::NewLine) | Ok(Token::LineComment) | Ok(Token::BlockComment) => {}
205            Ok(token) => {
206                vec.push(TokenSpan { token, span });
207            }
208            Err(()) => {
209                let token = Token::TokError;
210                vec.push(TokenSpan { token, span });
211            }
212        }
213    }
214    let token = Token::EndOfFile;
215    let span = lex.span();
216    vec.push(TokenSpan { token, span });
217    vec
218}
219
#[test]
#[ignore = "just to see result"]
fn lexer_test() {
    // Small program exercising comments, datatypes, and pattern matching.
    let s = r#"
// test line comment
/*
    /*
        test block comment
    */
*/
datatype IntList where
| Cons(Int, IntList)
| Nil
end

function append(xs: IntList, x: Int) -> Int
begin
    match xs with
    | Cons(head, tail) => Cons(head, append(tail, x))
    | Nil => Cons(x, Nil)
    end
end
"#;

    let mut lex = Token::lexer(s);
    // Dump every raw token (including trivia) with its span and slice.
    while let Some(tok) = lex.next() {
        println!("{:?} {:?} {}", tok, lex.span(), lex.slice());
    }
}
253}