// gollum_parser/lexer.rs
//! Logos-based lexer for Gollum source text.
#![allow(missing_docs)]
use logos::Logos;
use std::fmt;

6/// A single Gollum token produced by the lexer.
7#[derive(Logos, Debug, PartialEq, Clone)]
8#[logos(skip r"[ \t\r\n\f]+")] // whitespace
9#[logos(skip r"%[^\n]*")] // line comments (% ...)
10#[logos(skip r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/")] // block comments /* ... */
11pub enum Token {
12    // --- Structural ---
13    #[token(":-")]
14    Neck,
15    #[token("?-")]
16    QueryNeck,
17    #[token("::")]
18    ColonColon,
19    #[token(":")]
20    Colon,
21    #[token(",")]
22    Comma,
23    #[token(".")]
24    Dot,
25    #[token("(")]
26    LParen,
27    #[token(")")]
28    RParen,
29    #[token("[")]
30    LBracket,
31    #[token("]")]
32    RBracket,
33    #[token("|")]
34    Pipe,
35    #[token("@")]
36    At,
37
38    // --- Arithmetic ---
39    #[token("+")]
40    Plus,
41    #[token("-")]
42    Minus,
43    #[token("*")]
44    Star,
45    #[token("/")]
46    Slash,
47    #[token("mod")]
48    Mod,
49
50    // --- Comparison / unification (longer tokens first to win over prefixes) ---
51    #[token("=:=")]
52    ArithEq,
53    #[token("=\\=")]
54    ArithNeq,
55    #[token("\\+")]
56    NotPlus,
57    #[token("\\=")]
58    NotEq,
59    #[token("=<")]
60    Lte,
61    #[token(">=")]
62    Gte,
63    #[token("<")]
64    Lt,
65    #[token(">")]
66    Gt,
67    #[token("=")]
68    Eq,
69
70    // --- Modal logic (Unicode) ---
71    #[token("□")]
72    Box,
73    #[token("◇")]
74    Diamond,
75
76    // --- Keywords ---
77    #[token("is")]
78    Is,
79    #[token("not")]
80    Not,
81    #[token("before")]
82    Before,
83    #[token("after")]
84    After,
85    #[token("during")]
86    During,
87    #[token("until")]
88    Until,
89    #[token("using")]
90    Using,
91    #[token("minimize")]
92    Minimize,
93    #[token("maximize")]
94    Maximize,
95
96    // --- Literals ---
97    /// Float: must come before Integer so "3.14" is not split into Integer(3) + Dot + Integer(14)
98    #[regex(r"[0-9]+\.[0-9]+", |lex| lex.slice().parse::<f64>().ok())]
99    Float(f64),
100
101    /// Integer: bare number without unit
102    #[regex(r"[0-9]+", |lex| lex.slice().parse::<i64>().ok())]
103    Integer(i64),
104
105    /// Number with time unit (e.g., 100s, -1h, 1500ms)
106    #[regex(r"-?[0-9]+(\.[0-9]+)?[a-zµμ]+", parse_unit_literal)]
107    UnitLiteral((i64, &'static str)),
108
109    // --- Variables and anonymous ---
110    /// Anonymous variable `_`
111    #[token("_")]
112    Anon,
113
114    /// Named variable: uppercase start, or underscore + alphanumeric
115    #[regex(r"[A-Z][a-zA-Z0-9_]*|_[a-zA-Z0-9_]+", |lex| lex.slice().to_string())]
116    Var(String),
117
118    // --- Atoms ---
119    /// Unquoted atom: lowercase start
120    #[regex(r"[a-z][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
121    Atom(String),
122
123    /// Quoted atom: 'foo bar'
124    #[regex(r"'[^']*'", |lex| { let s = lex.slice(); s[1..s.len()-1].to_string() })]
125    QuotedAtom(String),
126
127    /// String literal: "hello"
128    #[regex(r#""[^"]*""#, |lex| { let s = lex.slice(); s[1..s.len()-1].to_string() })]
129    Str(String),
130}
131
132fn parse_unit_literal(lex: &mut logos::Lexer<Token>) -> Option<(i64, &'static str)> {
133    let slice = lex.slice();
134
135    let mut num_end = 0;
136    for (i, c) in slice.char_indices() {
137        match c {
138            '0'..='9' | '-' | '.' => num_end = i + 1,
139            _ => break,
140        }
141    }
142
143    if num_end == 0 {
144        return None;
145    }
146
147    let num_str = &slice[..num_end];
148    let unit = &slice[num_end..];
149
150    if unit.is_empty() {
151        return None;
152    }
153
154    let num: i64 = num_str.parse().ok()?;
155    let unit_static: &'static str = match unit {
156        "ns" => "ns",
157        "us" | "µs" | "μs" => "us",
158        "ms" => "ms",
159        "s" => "s",
160        "min" => "min",
161        "h" => "h",
162        "d" => "d",
163        "w" => "w",
164        "y" => "y",
165        _ => return None,
166    };
167
168    Some((num, unit_static))
169}
170
171impl fmt::Display for Token {
172    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
173        match self {
174            Self::Atom(s) => write!(f, r#"Atom("{s}")"#),
175            Self::Var(s) => write!(f, r#"Var("{s}")"#),
176            Self::QuotedAtom(s) => write!(f, r#"QuotedAtom("{s}")"#),
177            Self::Str(s) => write!(f, r#"Str("{s}")"#),
178            Self::Integer(n) => write!(f, "Integer({n})"),
179            Self::Float(n) => write!(f, "Float({n})"),
180            Self::Anon => write!(f, "Anon"),
181            Self::Neck => write!(f, "Neck"),
182            Self::QueryNeck => write!(f, "QueryNeck"),
183            Self::ColonColon => write!(f, "ColonColon"),
184            Self::Colon => write!(f, "Colon"),
185            Self::Comma => write!(f, "Comma"),
186            Self::Dot => write!(f, "Dot"),
187            Self::LParen => write!(f, "LParen"),
188            Self::RParen => write!(f, "RParen"),
189            Self::LBracket => write!(f, "LBracket"),
190            Self::RBracket => write!(f, "RBracket"),
191            Self::Pipe => write!(f, "Pipe"),
192            Self::Plus => write!(f, "Plus"),
193            Self::Minus => write!(f, "Minus"),
194            Self::Star => write!(f, "Star"),
195            Self::Slash => write!(f, "Slash"),
196            Self::Mod => write!(f, "Mod"),
197            Self::Eq => write!(f, "Eq"),
198            Self::NotEq => write!(f, "NotEq"),
199            Self::ArithEq => write!(f, "ArithEq"),
200            Self::ArithNeq => write!(f, "ArithNeq"),
201            Self::Lt => write!(f, "Lt"),
202            Self::Gt => write!(f, "Gt"),
203            Self::Lte => write!(f, "Lte"),
204            Self::Gte => write!(f, "Gte"),
205            Self::NotPlus => write!(f, "NotPlus"),
206            Self::Is => write!(f, "Is"),
207            Self::Not => write!(f, "Not"),
208            Self::Before => write!(f, "Before"),
209            Self::After => write!(f, "After"),
210            Self::During => write!(f, "During"),
211            Self::Until => write!(f, "Until"),
212            Self::Using => write!(f, "Using"),
213            Self::Minimize => write!(f, "Minimize"),
214            Self::Maximize => write!(f, "Maximize"),
215            Self::Box => write!(f, "Box"),
216            Self::Diamond => write!(f, "Diamond"),
217            Self::At => write!(f, "At"),
218            Self::UnitLiteral((n, u)) => write!(f, "UnitLiteral({}{})", n, u),
219        }
220    }
221}
222
223/// Tokenize a Gollum source string.
224/// Returns `Ok(Token)` for each recognized token, `Err(())` for unrecognized input.
225pub fn tokenize(source: &str) -> Vec<Result<Token, ()>> {
226    Token::lexer(source).collect()
227}