// bubbles/compiler/lexer.rs
//! Logos-based lexer that tokenises `.bub` source and expression strings.

use logos::Logos;
5/// A lexical token produced by the lexer.
6#[derive(Logos, Debug, Clone, PartialEq)]
7#[logos(skip r"[ \t\r\f]+")] // skip horizontal whitespace; newlines are significant in the parser
8pub enum Token {
9    // ── literals ──────────────────────────────────────────────────────────────
10    /// Floating-point or integer literal.
11    #[regex(r"[0-9]+(\.[0-9]+)?", |lex| lex.slice().parse::<f64>().ok())]
12    Number(f64),
13
14    /// Double-quoted string literal.
15    #[regex(r#""([^"\\]|\\.)*""#, |lex| {
16        let s = lex.slice();
17        Some(s[1..s.len()-1].replace("\\\"", "\"").replace("\\\\", "\\").replace("\\n", "\n"))
18    })]
19    Str(String),
20
21    // ── identifiers / keywords ─────────────────────────────────────────────────
22    /// Variable beginning with `$`.
23    #[regex(r"\$[A-Za-z_][A-Za-z0-9_]*", |lex| lex.slice().to_owned())]
24    Var(String),
25
26    /// Plain identifier or keyword.
27    #[regex(r"[A-Za-z_][A-Za-z0-9_]*", |lex| lex.slice().to_owned())]
28    Ident(String),
29
30    // ── delimiters ─────────────────────────────────────────────────────────────
31    /// `(` – opens a parenthesised sub-expression or argument list.
32    #[token("(")]
33    LParen,
34    /// `)` – closes a parenthesised sub-expression or argument list.
35    #[token(")")]
36    RParen,
37    /// `,` – argument separator.
38    #[token(",")]
39    Comma,
40    /// `<<` – opens a command/statement block.
41    #[token("<<")]
42    CmdOpen,
43    /// `>>` – closes a command/statement block.
44    #[token(">>")]
45    CmdClose,
46    /// `{` – opens an inline expression.
47    #[token("{")]
48    BraceOpen,
49    /// `}` – closes an inline expression.
50    #[token("}")]
51    BraceClose,
52
53    // ── arithmetic ─────────────────────────────────────────────────────────────
54    /// `+`
55    #[token("+")]
56    Plus,
57    /// `-`
58    #[token("-")]
59    Minus,
60    /// `*`
61    #[token("*")]
62    Star,
63    /// `/`
64    #[token("/")]
65    Slash,
66    /// `%`
67    #[token("%")]
68    Percent,
69
70    // ── comparison (order matters: `>=` before `>`) ───────────────────────────
71    /// `>=`
72    #[token(">=")]
73    Gte,
74    /// `<=`
75    #[token("<=")]
76    Lte,
77    /// `>`
78    #[token(">")]
79    Gt,
80    /// `<`
81    #[token("<")]
82    Lt,
83    /// `==`
84    #[token("==")]
85    EqEq,
86    /// `!=`
87    #[token("!=")]
88    Neq,
89
90    // ── logical ────────────────────────────────────────────────────────────────
91    /// `&&`
92    #[token("&&")]
93    AndAnd,
94    /// `||`
95    #[token("||")]
96    OrOr,
97    /// `!`
98    #[token("!")]
99    Bang,
100
101    // ── assignment / misc ──────────────────────────────────────────────────────
102    /// `=` (used in `<<set $x = …>>`)
103    #[token("=")]
104    Eq,
105    /// `:`
106    #[token(":")]
107    Colon,
108    /// `->`
109    #[token("->")]
110    Arrow,
111    /// `=>`
112    #[token("=>")]
113    FatArrow,
114    /// `---` body-start delimiter.
115    #[token("---")]
116    BodyStart,
117    /// `===` node-end delimiter.
118    #[token("===")]
119    NodeEnd,
120    /// `#` tag prefix.
121    #[token("#")]
122    Hash,
123    /// Newline.
124    #[token("\n")]
125    Newline,
126}
127
/// A token paired with the byte range it occupies in the source string.
pub type Spanned = (Token, std::ops::Range<usize>);
130
131/// Lexes `input` into a [`Vec`] of spanned tokens, returning an error on
132/// any character that does not match a known token.
133///
134/// # Errors
135///
136/// Returns [`crate::error::DialogueError::Parse`] with `file` / `line` context
137/// when an unrecognised character is encountered, so the caller receives a
138/// precise pointer into the source rather than a confusing downstream failure.
139pub fn tokenise(input: &str, file: &str, line: usize) -> crate::error::Result<Vec<Spanned>> {
140    let mut tokens = Vec::new();
141    for (result, span) in Token::lexer(input).spanned() {
142        if let Ok(tok) = result {
143            tokens.push((tok, span));
144        } else {
145            let ch = input[span].chars().next().unwrap_or('?');
146            return Err(crate::error::DialogueError::Parse {
147                file: file.to_owned(),
148                line,
149                message: format!(
150                    "unexpected character `{ch}` in expression; \
151                     did you mean `$` for a variable?"
152                ),
153            });
154        }
155    }
156    Ok(tokens)
157}
158
// Unit tests live in a sibling file (`lexer_tests.rs`) via `#[path]`,
// keeping this file focused on the lexer itself.
#[cfg(test)]
#[path = "lexer_tests.rs"]
mod tests;