Skip to main content

lex_syntax/
token.rs

1use logos::Logos;
2use std::ops::Range;
3
4#[derive(Logos, Debug, Clone, PartialEq)]
5#[logos(skip r"[ \t\r\f]+")]
6#[logos(skip(r"#[^\n]*", allow_greedy = true))]
7pub enum TokenKind {
8    // keywords
9    #[token("fn")]      Fn,
10    #[token("let")]     Let,
11    #[token("type")]    Type,
12    #[token("match")]   Match,
13    #[token("if")]      If,
14    #[token("else")]    Else,
15    #[token("return")]  Return,
16    #[token("import")]  Import,
17    #[token("as")]      As,
18    #[token("true")]    True,
19    #[token("false")]   False,
20    #[token("and")]     And,
21    #[token("or")]      Or,
22    #[token("not")]     Not,
23
24    // multi-char operators (longer first to win the match race)
25    #[token("|>")] Pipe,
26    #[token("->")] Arrow,
27    #[token("=>")] FatArrow,
28    #[token(":=")] ColonEq,
29    #[token("::")] ColonColon,
30    #[token("==")] EqEq,
31    #[token("!=")] BangEq,
32    #[token("<=")] LtEq,
33    #[token(">=")] GtEq,
34
35    // spread operator (record type spread `{ ...TypeName }`)
36    #[token("...")] DotDotDot,
37
38    // single-char operators
39    #[token("+")] Plus,
40    #[token("-")] Minus,
41    #[token("*")] Star,
42    #[token("/")] Slash,
43    #[token("%")] Percent,
44    #[token("<")] Lt,
45    #[token(">")] Gt,
46    #[token(".")] Dot,
47    #[token(",")] Comma,
48    #[token(";")] Semi,
49    #[token(":")] Colon,
50    #[token("?")] Question,
51    #[token("(")] LParen,
52    #[token(")")] RParen,
53    #[token("{")] LBrace,
54    #[token("}")] RBrace,
55    #[token("[")] LBracket,
56    #[token("]")] RBracket,
57    #[token("=")] Eq,
58    #[token("|")] Bar,
59    #[token("_")] Underscore,
60    #[token("\n")] Newline,
61
62    // literals
63    #[regex(r"[0-9][0-9_]*[eE][+-]?[0-9]+", |lex| lex.slice().replace('_', "").parse::<f64>().ok())]
64    #[regex(r"[0-9][0-9_]*\.[0-9][0-9_]*([eE][+-]?[0-9]+)?", |lex| lex.slice().replace('_', "").parse::<f64>().ok())]
65    Float(f64),
66
67    #[regex(r"[0-9][0-9_]*", |lex| lex.slice().replace('_', "").parse::<i64>().ok(), priority = 3)]
68    Int(i64),
69
70    #[regex(r#""([^"\\]|\\.)*""#, |lex| unescape(&lex.slice()[1..lex.slice().len()-1]))]
71    Str(String),
72
73    #[regex(r#"b"([^"\\]|\\.)*""#, |lex| unescape(&lex.slice()[2..lex.slice().len()-1]).map(|s| s.into_bytes()))]
74    Bytes(Vec<u8>),
75
76    /// String interpolation literal `f"hello {name}"` (#562). The content
77    /// is unescaped the same way as `Str`; `{...}` segments are desugared
78    /// to `str.concat` chains by the parser.
79    #[regex(r#"f"([^"\\]|\\.)*""#, |lex| unescape(&lex.slice()[2..lex.slice().len()-1]))]
80    FStr(String),
81
82    // Identifier. Two alternatives so a bare `_` keeps lexing as
83    // the discard token (used by `match _ => ...` and the new
84    // `let _ := ...`) while `_name` is recognized as a real
85    // identifier (#200). Logos picks the longer match: for `_`
86    // alone only Underscore matches (Ident requires ≥2 chars on
87    // the underscore branch); for `_x` the Ident branch wins.
88    #[regex(r"[a-zA-Z][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
89    #[regex(r"_[a-zA-Z0-9_]+", |lex| lex.slice().to_string())]
90    Ident(String),
91}
92
/// Decodes backslash escapes in the body of a string literal.
///
/// Recognized escapes are `\n`, `\t`, `\r`, `\\`, `\"` and `\0`; every other
/// character is copied through unchanged.
///
/// Returns `None` if a backslash is followed by any other character, or if
/// the input ends right after a backslash.
fn unescape(s: &str) -> Option<String> {
    // The decoded text is never longer than the input, so reserve once.
    let mut decoded = String::with_capacity(s.len());
    let mut rest = s.chars();
    loop {
        let ch = match rest.next() {
            None => return Some(decoded),
            // An escape consumes one extra character; `?` covers a trailing `\`.
            Some('\\') => match rest.next()? {
                'n' => '\n',
                't' => '\t',
                'r' => '\r',
                '\\' => '\\',
                '"' => '"',
                '0' => '\0',
                _ => return None,
            },
            Some(plain) => plain,
        };
        decoded.push(ch);
    }
}
113
114#[derive(Debug, Clone)]
115pub struct Token {
116    pub kind: TokenKind,
117    pub span: Range<usize>,
118}
119
120pub fn lex(src: &str) -> Result<Vec<Token>, LexError> {
121    let mut toks = Vec::new();
122    let mut lx = TokenKind::lexer(src);
123    while let Some(res) = lx.next() {
124        match res {
125            Ok(kind) => toks.push(Token { kind, span: lx.span() }),
126            Err(_) => {
127                return Err(LexError {
128                    span: lx.span(),
129                    snippet: lx.slice().to_string(),
130                });
131            }
132        }
133    }
134    Ok(toks)
135}
136
137#[derive(Debug, thiserror::Error)]
138#[error("unrecognized token `{snippet}` at {span:?}")]
139pub struct LexError {
140    pub span: Range<usize>,
141    pub snippet: String,
142}