Skip to main content

gollum_parser/
lexer.rs

1//! Logos-based lexer for Gollum source text.
2#![allow(missing_docs)]
3use logos::Logos;
4use std::fmt;
5
/// A single Gollum token produced by the lexer.
///
/// Whitespace, `%` line comments and `/* ... */` block comments are declared as
/// skip patterns below, so they never surface as tokens.
#[derive(Logos, Debug, PartialEq, Clone)]
#[logos(skip r"[ \t\r\n\f]+")] // whitespace
#[logos(skip r"%[^\n]*")] // line comments (% ...)
#[logos(skip r"/\*[^*]*\*+(?:[^/*][^*]*\*+)*/")] // block comments /* ... */
pub enum Token {
    // --- Structural ---
    /// The `:-` symbol.
    #[token(":-")]
    Neck,
    /// The `?-` symbol.
    #[token("?-")]
    QueryNeck,
    /// The `::` symbol.
    #[token("::")]
    ColonColon,
    /// The `:` symbol.
    #[token(":")]
    Colon,
    /// The `,` symbol.
    #[token(",")]
    Comma,
    /// The `.` symbol.
    #[token(".")]
    Dot,
    /// The `(` symbol.
    #[token("(")]
    LParen,
    /// The `)` symbol.
    #[token(")")]
    RParen,
    /// The `[` symbol.
    #[token("[")]
    LBracket,
    /// The `]` symbol.
    #[token("]")]
    RBracket,
    /// The `|` symbol.
    #[token("|")]
    Pipe,
    /// The `!` symbol.
    #[token("!")]
    Cut,
    /// The `@` symbol.
    #[token("@")]
    At,

    // --- Arithmetic ---
    /// The `+` symbol.
    #[token("+")]
    Plus,
    /// The `-` symbol.
    #[token("-")]
    Minus,
    /// The `*` symbol.
    #[token("*")]
    Star,
    /// The `/` symbol.
    #[token("/")]
    Slash,
    /// The word `mod`.
    #[token("mod")]
    Mod,

    // --- CLP(FD) constraint operators (longer tokens first) ---
    /// The `#>=` symbol.
    #[token("#>=")]
    ClpGte,
    /// The `#=<` symbol.
    #[token("#=<")]
    ClpLte,
    /// The `#\=` symbol (the backslash is escaped in the Rust string literal).
    #[token("#\\=")]
    ClpNeq,
    /// The `#>` symbol.
    #[token("#>")]
    ClpGt,
    /// The `#<` symbol.
    #[token("#<")]
    ClpLt,
    /// The `#=` symbol.
    #[token("#=")]
    ClpEq,

    // --- Range operator ---
    /// The `..` symbol.
    #[token("..")]
    DotDot,

    // --- Comparison / unification (longer tokens first to win over prefixes) ---
    /// The `=:=` symbol.
    #[token("=:=")]
    ArithEq,
    /// The `=\=` symbol (the backslash is escaped in the Rust string literal).
    #[token("=\\=")]
    ArithNeq,
    /// The `\+` symbol (the backslash is escaped in the Rust string literal).
    #[token("\\+")]
    NotPlus,
    /// The `\=` symbol (the backslash is escaped in the Rust string literal).
    #[token("\\=")]
    NotEq,
    /// The `=<` symbol.
    #[token("=<")]
    Lte,
    /// The `>=` symbol.
    #[token(">=")]
    Gte,
    /// The `<` symbol.
    #[token("<")]
    Lt,
    /// The `>` symbol.
    #[token(">")]
    Gt,
    /// The `=` symbol.
    #[token("=")]
    Eq,

    // --- Modal logic (Unicode) ---
    /// The `□` character.
    #[token("□")]
    Box,
    /// The `◇` character.
    #[token("◇")]
    Diamond,

    // --- Keywords ---
    /// The word `in`.
    #[token("in")]
    In,
    /// The word `is`.
    #[token("is")]
    Is,
    /// The word `not`.
    #[token("not")]
    Not,
    /// The word `before`.
    #[token("before")]
    Before,
    /// The word `after`.
    #[token("after")]
    After,
    /// The word `during`.
    #[token("during")]
    During,
    /// The word `until`.
    #[token("until")]
    Until,
    /// The word `using`.
    #[token("using")]
    Using,
    /// The word `minimize`.
    #[token("minimize")]
    Minimize,
    /// The word `maximize`.
    #[token("maximize")]
    Maximize,

    // --- Literals ---
    /// Float literal such as `3.14`: one or more digits, a dot, one or more digits.
    /// The payload is the matched text parsed as `f64`.
    ///
    /// NOTE(review): the earlier comment here said this variant must come before
    /// `Integer` so that "3.14" is not split into Integer(3) + Dot + Integer(14).
    /// Confirm against the Logos disambiguation rules whether declaration order
    /// actually plays a role.
    #[regex(r"[0-9]+\.[0-9]+", |lex| lex.slice().parse::<f64>().ok())]
    Float(f64),

    /// Integer: bare number without unit. The payload is the matched digits parsed
    /// as `i64`; the callback returns `None` if that parse fails.
    #[regex(r"[0-9]+", |lex| lex.slice().parse::<i64>().ok())]
    Integer(i64),

    /// Number with time unit (e.g., 100s, -1h, 1500ms).
    ///
    /// The payload is `(value, unit)` as produced by `parse_unit_literal`. The pattern
    /// also admits a fractional part and arbitrary lowercase suffixes, but the callback
    /// parses the number as `i64` and only accepts a fixed set of units, so it returns
    /// `None` for anything else.
    #[regex(r"-?[0-9]+(\.[0-9]+)?[a-zµμ]+", parse_unit_literal)]
    UnitLiteral((i64, &'static str)),

    // --- Variables and anonymous ---
    /// Anonymous variable `_`
    #[token("_")]
    Anon,

    /// Named variable: uppercase start, or underscore + alphanumeric.
    /// The payload is the full matched text, including any leading underscore.
    #[regex(r"[A-Z][a-zA-Z0-9_]*|_[a-zA-Z0-9_]+", |lex| lex.slice().to_string())]
    Var(String),

    // --- Atoms ---
    /// Unquoted atom: lowercase start. The payload is the full matched text.
    #[regex(r"[a-z][a-zA-Z0-9_]*", |lex| lex.slice().to_string())]
    Atom(String),

    /// Quoted atom: 'foo bar'
    ///
    /// The payload is the text between the quotes (the quotes themselves are sliced
    /// off). The pattern has no escape syntax, so the body cannot contain `'`.
    #[regex(r"'[^']*'", |lex| { let s = lex.slice(); s[1..s.len()-1].to_string() })]
    QuotedAtom(String),

    /// String literal: "hello"
    ///
    /// The payload is the text between the double quotes (the quotes themselves are
    /// sliced off). The pattern has no escape syntax, so the body cannot contain `"`.
    #[regex(r#""[^"]*""#, |lex| { let s = lex.slice(); s[1..s.len()-1].to_string() })]
    Str(String),
}
153
154fn parse_unit_literal(lex: &mut logos::Lexer<Token>) -> Option<(i64, &'static str)> {
155    let slice = lex.slice();
156
157    let mut num_end = 0;
158    for (i, c) in slice.char_indices() {
159        match c {
160            '0'..='9' | '-' | '.' => num_end = i + 1,
161            _ => break,
162        }
163    }
164
165    if num_end == 0 {
166        return None;
167    }
168
169    let num_str = &slice[..num_end];
170    let unit = &slice[num_end..];
171
172    if unit.is_empty() {
173        return None;
174    }
175
176    let num: i64 = num_str.parse().ok()?;
177    let unit_static: &'static str = match unit {
178        "ns" => "ns",
179        "us" | "µs" | "μs" => "us",
180        "ms" => "ms",
181        "s" => "s",
182        "min" => "min",
183        "h" => "h",
184        "d" => "d",
185        "w" => "w",
186        "y" => "y",
187        _ => return None,
188    };
189
190    Some((num, unit_static))
191}
192
193impl fmt::Display for Token {
194    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
195        match self {
196            Self::Atom(s) => write!(f, r#"Atom("{s}")"#),
197            Self::Var(s) => write!(f, r#"Var("{s}")"#),
198            Self::QuotedAtom(s) => write!(f, r#"QuotedAtom("{s}")"#),
199            Self::Str(s) => write!(f, r#"Str("{s}")"#),
200            Self::Integer(n) => write!(f, "Integer({n})"),
201            Self::Float(n) => write!(f, "Float({n})"),
202            Self::Anon => write!(f, "Anon"),
203            Self::Neck => write!(f, "Neck"),
204            Self::QueryNeck => write!(f, "QueryNeck"),
205            Self::ColonColon => write!(f, "ColonColon"),
206            Self::Colon => write!(f, "Colon"),
207            Self::Comma => write!(f, "Comma"),
208            Self::Dot => write!(f, "Dot"),
209            Self::LParen => write!(f, "LParen"),
210            Self::RParen => write!(f, "RParen"),
211            Self::LBracket => write!(f, "LBracket"),
212            Self::RBracket => write!(f, "RBracket"),
213            Self::Pipe => write!(f, "Pipe"),
214            Self::Cut => write!(f, "Cut"),
215            Self::Plus => write!(f, "Plus"),
216            Self::Minus => write!(f, "Minus"),
217            Self::Star => write!(f, "Star"),
218            Self::Slash => write!(f, "Slash"),
219            Self::Mod => write!(f, "Mod"),
220            Self::Eq => write!(f, "Eq"),
221            Self::NotEq => write!(f, "NotEq"),
222            Self::ArithEq => write!(f, "ArithEq"),
223            Self::ArithNeq => write!(f, "ArithNeq"),
224            Self::Lt => write!(f, "Lt"),
225            Self::Gt => write!(f, "Gt"),
226            Self::Lte => write!(f, "Lte"),
227            Self::Gte => write!(f, "Gte"),
228            Self::NotPlus => write!(f, "NotPlus"),
229            Self::Is => write!(f, "Is"),
230            Self::Not => write!(f, "Not"),
231            Self::Before => write!(f, "Before"),
232            Self::After => write!(f, "After"),
233            Self::During => write!(f, "During"),
234            Self::Until => write!(f, "Until"),
235            Self::Using => write!(f, "Using"),
236            Self::Minimize => write!(f, "Minimize"),
237            Self::Maximize => write!(f, "Maximize"),
238            Self::Box => write!(f, "Box"),
239            Self::Diamond => write!(f, "Diamond"),
240            Self::At => write!(f, "At"),
241            Self::UnitLiteral((n, u)) => write!(f, "UnitLiteral({}{})", n, u),
242            Self::In => write!(f, "In"),
243            Self::DotDot => write!(f, "DotDot"),
244            Self::ClpEq => write!(f, "ClpEq"),
245            Self::ClpNeq => write!(f, "ClpNeq"),
246            Self::ClpLt => write!(f, "ClpLt"),
247            Self::ClpLte => write!(f, "ClpLte"),
248            Self::ClpGt => write!(f, "ClpGt"),
249            Self::ClpGte => write!(f, "ClpGte"),
250        }
251    }
252}
253
254/// Tokenize a Gollum source string.
255/// Returns `Ok(Token)` for each recognized token, `Err(())` for unrecognized input.
256pub fn tokenize(source: &str) -> Vec<Result<Token, ()>> {
257    Token::lexer(source).collect()
258}