Skip to main content

mimir_core/
lex.rs

1//! Lisp S-expression lexer for the Mimir write surface.
2//!
3//! Implements the lexical grammar in `docs/concepts/ir-write-surface.md`
4//! § 3. Produces a stream of [`Token`]s from UTF-8 input; errors carry a
5//! [`Position`] pointing at the offending byte.
6
7use std::str::Chars;
8
9use thiserror::Error;
10
/// A byte-and-line position in the input, 1-based line/column.
///
/// Produced by the lexer for every token and carried in [`LexError`]
/// variants so diagnostics can point at the offending byte.
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct Position {
    /// 0-based byte offset into the input.
    pub offset: usize,
    /// 1-based line number (newline-delimited).
    pub line: usize,
    /// 1-based column number, incremented once per character consumed
    /// (not per byte — multi-byte UTF-8 characters count as one column).
    pub column: usize,
}
21
22impl Position {
23    /// Start-of-input position.
24    #[must_use]
25    pub const fn start() -> Self {
26        Self {
27            offset: 0,
28            line: 1,
29            column: 1,
30        }
31    }
32}
33
/// A lexical token.
///
/// Matches the token classes in `ir-write-surface.md` § 3.1. String-
/// bearing variants own their content (copying out of the input); this
/// trades a small allocation per token for simpler parser lifetimes.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// `@name` — a symbol reference. Leading `@` stripped.
    Symbol(String),
    /// `@name:Kind` — a symbol reference with a kind annotation. The
    /// `Kind` string is passed through for later validation in the
    /// parser / binder.
    TypedSymbol {
        /// The `@name` part without the `@` prefix.
        name: String,
        /// The `:Kind` part without the leading `:`.
        kind: String,
    },
    /// A bareword identifier — matches `[a-z_][a-z0-9_]*`. Serves as
    /// opcode at form heads, predicate in predicate slots, string
    /// literal elsewhere (disambiguation is the parser's job).
    /// Note: `true`, `false`, and `nil` are lexed as [`Token::Boolean`]
    /// and [`Token::Nil`], never as barewords.
    Bareword(String),
    /// An ISO-8601-UTC timestamp bareword (date-only or full date-time
    /// with optional millisecond fraction and a `Z` suffix). The text
    /// is kept raw here; range/format normalisation happens later.
    Timestamp(String),
    /// A signed 64-bit integer literal.
    Integer(i64),
    /// An IEEE 754 binary64 float literal (contains a `.`).
    Float(f64),
    /// A double-quoted UTF-8 string with escape sequences resolved.
    String(String),
    /// Boolean literal `true` or `false`.
    Boolean(bool),
    /// `nil` null literal.
    Nil,
    /// `:keyword` — keyword argument tag, without the leading `:`.
    Keyword(String),
    /// Open parenthesis `(`.
    LParen,
    /// Close parenthesis `)`.
    RParen,
}
76
/// A [`Token`] paired with its source position.
///
/// The position marks the token's first byte; tokens carry no end
/// position in this representation.
#[derive(Clone, Debug, PartialEq)]
pub struct Spanned {
    /// The token.
    pub token: Token,
    /// Start position of the token in the input.
    pub position: Position,
}
85
/// Errors produced by [`tokenize`].
///
/// Every variant carries a [`Position`] so callers can render a
/// line/column diagnostic.
#[derive(Debug, Error, PartialEq)]
pub enum LexError {
    /// A `"`-quoted string was not terminated before end-of-input.
    #[error("unterminated string starting at {start:?}")]
    UnterminatedString {
        /// Position of the opening `"`.
        start: Position,
    },

    /// A `\x` escape sequence used an unsupported character.
    #[error("invalid escape '\\{escape}' at {pos:?}")]
    InvalidEscape {
        /// The character after the backslash.
        escape: char,
        /// Position of the backslash.
        pos: Position,
    },

    /// A numeric token could not be parsed.
    #[error("invalid number {text:?} at {pos:?}")]
    InvalidNumber {
        /// The raw text.
        text: String,
        /// Start position.
        pos: Position,
    },

    /// An identifier or kind annotation is ill-formed.
    #[error("invalid identifier {text:?} at {pos:?}")]
    InvalidIdentifier {
        /// The raw text.
        text: String,
        /// Start position.
        pos: Position,
    },

    /// A byte that cannot start any token (e.g. stray punctuation).
    #[error("unexpected byte {byte:#04x} at {pos:?}")]
    UnexpectedByte {
        /// The offending byte.
        byte: u8,
        /// Start position.
        pos: Position,
    },

    /// Input was not valid UTF-8 at the cursor.
    ///
    /// NOTE(review): no code path in this module constructs this
    /// variant — [`tokenize`] takes `&str`, which is UTF-8 by
    /// construction. Presumably kept for callers lexing raw bytes
    /// upstream; confirm before removing.
    #[error("invalid UTF-8 at {pos:?}")]
    InvalidUtf8 {
        /// Position of the bad byte.
        pos: Position,
    },
}
139
140/// Tokenize a UTF-8 input into a vector of [`Spanned`] tokens.
141///
142/// Comments (`; … \n`) and whitespace are dropped.
143///
144/// # Errors
145///
146/// Returns a [`LexError`] on any lexical violation; the error carries a
147/// [`Position`] pointing at the offending byte.
148///
149/// # Examples
150///
151/// ```
152/// # #![allow(clippy::unwrap_used)]
153/// use mimir_core::lex::{tokenize, Token};
154///
155/// let tokens = tokenize("(sem @alice email \"alice@example.com\")").unwrap();
156/// assert_eq!(tokens.first().map(|s| &s.token), Some(&Token::LParen));
157/// assert_eq!(tokens.last().map(|s| &s.token), Some(&Token::RParen));
158/// ```
159pub fn tokenize(input: &str) -> Result<Vec<Spanned>, LexError> {
160    let mut lexer = Lexer::new(input);
161    let mut out = Vec::new();
162    while let Some(spanned) = lexer.next_token()? {
163        out.push(spanned);
164    }
165    Ok(out)
166}
167
/// Internal cursor over the input; hands out one token per call to
/// `next_token`.
struct Lexer<'a> {
    /// The full input, retained so token text can be sliced out by
    /// byte offset instead of accumulated character-by-character.
    input: &'a str,
    /// Character iterator positioned at the cursor.
    chars: Chars<'a>,
    /// Current position; kept in sync with `chars` by `bump`.
    pos: Position,
}
173
174impl<'a> Lexer<'a> {
175    fn new(input: &'a str) -> Self {
176        Self {
177            input,
178            chars: input.chars(),
179            pos: Position::start(),
180        }
181    }
182
183    fn peek(&self) -> Option<char> {
184        self.chars.clone().next()
185    }
186
187    fn bump(&mut self) -> Option<char> {
188        let c = self.chars.next()?;
189        let len = c.len_utf8();
190        self.pos.offset += len;
191        if c == '\n' {
192            self.pos.line += 1;
193            self.pos.column = 1;
194        } else {
195            self.pos.column += 1;
196        }
197        Some(c)
198    }
199
200    fn skip_whitespace_and_comments(&mut self) {
201        while let Some(c) = self.peek() {
202            if c.is_whitespace() {
203                self.bump();
204            } else if c == ';' {
205                // Line comment runs to newline (not included).
206                while let Some(cc) = self.peek() {
207                    if cc == '\n' {
208                        break;
209                    }
210                    self.bump();
211                }
212            } else {
213                break;
214            }
215        }
216    }
217
218    fn next_token(&mut self) -> Result<Option<Spanned>, LexError> {
219        self.skip_whitespace_and_comments();
220        let start = self.pos;
221        let Some(c) = self.peek() else {
222            return Ok(None);
223        };
224        let token = match c {
225            '(' => {
226                self.bump();
227                Token::LParen
228            }
229            ')' => {
230                self.bump();
231                Token::RParen
232            }
233            '"' => self.lex_string(start)?,
234            '@' => self.lex_symbol_or_typed(start)?,
235            ':' => self.lex_keyword(start)?,
236            '-' | '0'..='9' => self.lex_number_or_timestamp(start)?,
237            'a'..='z' | '_' => self.lex_bareword_or_reserved(start)?,
238            _ => {
239                let byte = c as u32;
240                #[allow(clippy::cast_possible_truncation)]
241                return Err(LexError::UnexpectedByte {
242                    byte: byte as u8,
243                    pos: start,
244                });
245            }
246        };
247        Ok(Some(Spanned {
248            token,
249            position: start,
250        }))
251    }
252
253    fn lex_string(&mut self, start: Position) -> Result<Token, LexError> {
254        self.bump(); // consume opening quote
255        let mut buf = String::new();
256        loop {
257            let pos = self.pos;
258            let Some(c) = self.bump() else {
259                return Err(LexError::UnterminatedString { start });
260            };
261            match c {
262                '"' => return Ok(Token::String(buf)),
263                '\\' => {
264                    let Some(esc) = self.bump() else {
265                        return Err(LexError::UnterminatedString { start });
266                    };
267                    let resolved = match esc {
268                        'n' => '\n',
269                        'r' => '\r',
270                        't' => '\t',
271                        '\\' => '\\',
272                        '"' => '"',
273                        other => return Err(LexError::InvalidEscape { escape: other, pos }),
274                    };
275                    buf.push(resolved);
276                }
277                other => buf.push(other),
278            }
279        }
280    }
281
282    fn lex_symbol_or_typed(&mut self, start: Position) -> Result<Token, LexError> {
283        self.bump(); // consume '@'
284        let name_start = self.pos.offset;
285        self.consume_identifier();
286        let name_end = self.pos.offset;
287        let name = self.input[name_start..name_end].to_string();
288        if name.is_empty() || !is_valid_identifier_start(&name) {
289            return Err(LexError::InvalidIdentifier {
290                text: format!("@{name}"),
291                pos: start,
292            });
293        }
294        if self.peek() == Some(':') {
295            self.bump();
296            let kind_start = self.pos.offset;
297            self.consume_kind_annotation();
298            let kind_end = self.pos.offset;
299            let kind = self.input[kind_start..kind_end].to_string();
300            if kind.is_empty() || !is_valid_kind_annotation(&kind) {
301                return Err(LexError::InvalidIdentifier {
302                    text: format!("@{name}:{kind}"),
303                    pos: start,
304                });
305            }
306            Ok(Token::TypedSymbol { name, kind })
307        } else {
308            Ok(Token::Symbol(name))
309        }
310    }
311
312    fn lex_keyword(&mut self, start: Position) -> Result<Token, LexError> {
313        self.bump(); // consume ':'
314        let name_start = self.pos.offset;
315        self.consume_identifier();
316        let name_end = self.pos.offset;
317        let name = self.input[name_start..name_end].to_string();
318        if name.is_empty() || !is_valid_identifier_start(&name) {
319            return Err(LexError::InvalidIdentifier {
320                text: format!(":{name}"),
321                pos: start,
322            });
323        }
324        Ok(Token::Keyword(name))
325    }
326
327    fn lex_number_or_timestamp(&mut self, start: Position) -> Result<Token, LexError> {
328        let begin = self.pos.offset;
329        // Consume the number / timestamp body — digits, `-`, `.`, `T`, `:`, `Z`.
330        // The classifier below decides whether the result is an Integer,
331        // Float, or Timestamp.
332        while let Some(c) = self.peek() {
333            if c.is_ascii_digit() || matches!(c, '-' | '.' | ':' | 'T' | 'Z') {
334                self.bump();
335            } else {
336                break;
337            }
338        }
339        let end = self.pos.offset;
340        let text = &self.input[begin..end];
341        if looks_like_timestamp(text) {
342            return Ok(Token::Timestamp(text.to_string()));
343        }
344        if text.contains('.') {
345            text.parse::<f64>()
346                .map(Token::Float)
347                .map_err(|_| LexError::InvalidNumber {
348                    text: text.to_string(),
349                    pos: start,
350                })
351        } else {
352            text.parse::<i64>()
353                .map(Token::Integer)
354                .map_err(|_| LexError::InvalidNumber {
355                    text: text.to_string(),
356                    pos: start,
357                })
358        }
359    }
360
361    fn lex_bareword_or_reserved(&mut self, start: Position) -> Result<Token, LexError> {
362        let begin = self.pos.offset;
363        self.consume_identifier();
364        let end = self.pos.offset;
365        let text = &self.input[begin..end];
366        let token = match text {
367            "true" => Token::Boolean(true),
368            "false" => Token::Boolean(false),
369            "nil" => Token::Nil,
370            _ => {
371                if is_valid_identifier_start(text) {
372                    Token::Bareword(text.to_string())
373                } else {
374                    return Err(LexError::InvalidIdentifier {
375                        text: text.to_string(),
376                        pos: start,
377                    });
378                }
379            }
380        };
381        Ok(token)
382    }
383
384    fn consume_identifier(&mut self) {
385        while let Some(c) = self.peek() {
386            if c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_' {
387                self.bump();
388            } else {
389                break;
390            }
391        }
392    }
393
394    fn consume_kind_annotation(&mut self) {
395        // Kind annotations start with an ASCII uppercase letter and
396        // continue with alphanumeric characters.
397        while let Some(c) = self.peek() {
398            if c.is_ascii_alphabetic() || c.is_ascii_digit() {
399                self.bump();
400            } else {
401                break;
402            }
403        }
404    }
405}
406
/// True when `s` matches the identifier grammar `[a-z_][a-z0-9_]*`.
/// The empty string is invalid.
fn is_valid_identifier_start(s: &str) -> bool {
    let mut rest = s.chars();
    let Some(first) = rest.next() else {
        return false;
    };
    (first.is_ascii_lowercase() || first == '_')
        && rest.all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '_')
}
416
/// True when `s` matches the kind-annotation grammar
/// `[A-Z][A-Za-z0-9]*`. The empty string is invalid.
///
/// Restricted to ASCII for consistency with `consume_kind_annotation`,
/// which only ever consumes ASCII characters — the previous Unicode
/// `char::is_alphanumeric` check was unreachable leniency for
/// lexer-produced input and disagreed with the documented grammar.
fn is_valid_kind_annotation(s: &str) -> bool {
    let mut chars = s.chars();
    match chars.next() {
        Some(c) if c.is_ascii_uppercase() => chars.all(|c| c.is_ascii_alphanumeric()),
        _ => false,
    }
}
424
/// Heuristic check for the canonical timestamp shapes in
/// ir-write-surface.md § 3.1:
///   `YYYY-MM-DD`
///   `YYYY-MM-DDTHH:MM:SS[Z|.<frac>Z]`
///
/// Only the date skeleton is checked strictly; the time portion gets a
/// minimal sanity check (a `:` after the `T`) and is normalised later
/// by the binder.
fn looks_like_timestamp(text: &str) -> bool {
    let b = text.as_bytes();
    let date_ok = b.len() >= 10
        && b[..4].iter().all(u8::is_ascii_digit)
        && b[4] == b'-'
        && b[5..7].iter().all(u8::is_ascii_digit)
        && b[7] == b'-'
        && b[8..10].iter().all(u8::is_ascii_digit);
    if !date_ok {
        return false;
    }
    // Date-only, or `T` + a time portion containing at least one `:`.
    match b.get(10) {
        None => true,
        Some(&b'T') => b[11..].contains(&b':'),
        Some(_) => false,
    }
}
454
// Unit tests exercise `tokenize` end-to-end: roughly one test per token
// class, plus the error paths and position tracking.
#[cfg(test)]
mod tests {
    use super::*;

    // Shorthand for the first token of a lexed stream.
    fn first(tokens: &[Spanned]) -> &Token {
        &tokens[0].token
    }

    #[test]
    fn empty_input_produces_no_tokens() {
        assert!(tokenize("").unwrap().is_empty());
        assert!(tokenize("   \t\n  ").unwrap().is_empty());
    }

    #[test]
    fn parens_are_tokens() {
        let t = tokenize("( )").unwrap();
        assert_eq!(t.len(), 2);
        assert_eq!(first(&t), &Token::LParen);
        assert_eq!(&t[1].token, &Token::RParen);
    }

    #[test]
    fn symbol_with_and_without_kind() {
        let t = tokenize("@alice @alice:Agent").unwrap();
        assert_eq!(first(&t), &Token::Symbol("alice".into()));
        assert_eq!(
            &t[1].token,
            &Token::TypedSymbol {
                name: "alice".into(),
                kind: "Agent".into(),
            }
        );
    }

    #[test]
    fn bareword_and_reserved_words() {
        // `true` / `false` / `nil` lex as literals; everything else
        // matching the identifier grammar is a bareword.
        let t = tokenize("email true false nil sem").unwrap();
        assert_eq!(first(&t), &Token::Bareword("email".into()));
        assert_eq!(&t[1].token, &Token::Boolean(true));
        assert_eq!(&t[2].token, &Token::Boolean(false));
        assert_eq!(&t[3].token, &Token::Nil);
        assert_eq!(&t[4].token, &Token::Bareword("sem".into()));
    }

    #[test]
    fn numbers_distinguish_int_and_float() {
        let t = tokenize("42 -17 3.14 -0.5").unwrap();
        assert_eq!(first(&t), &Token::Integer(42));
        assert_eq!(&t[1].token, &Token::Integer(-17));
        // Floats compared with a tolerance rather than `==`.
        match &t[2].token {
            Token::Float(f) => assert!((f - 3.14).abs() < 1e-9),
            other => panic!("expected Float, got {other:?}"),
        }
        match &t[3].token {
            Token::Float(f) => assert!((f + 0.5).abs() < 1e-9),
            other => panic!("expected Float, got {other:?}"),
        }
    }

    #[test]
    fn timestamps_are_distinct_from_numbers() {
        let t = tokenize("2024-01-15 2026-04-17T10:00:00Z").unwrap();
        match first(&t) {
            Token::Timestamp(s) => assert_eq!(s, "2024-01-15"),
            other => panic!("expected Timestamp, got {other:?}"),
        }
        match &t[1].token {
            Token::Timestamp(s) => assert_eq!(s, "2026-04-17T10:00:00Z"),
            other => panic!("expected Timestamp, got {other:?}"),
        }
    }

    #[test]
    fn strings_resolve_escapes() {
        let t = tokenize(r#" "hello\nworld" "a\"b" "#).unwrap();
        assert_eq!(first(&t), &Token::String("hello\nworld".into()));
        assert_eq!(&t[1].token, &Token::String("a\"b".into()));
    }

    #[test]
    fn keyword_stripped_of_colon() {
        let t = tokenize(":src :confidence_threshold").unwrap();
        assert_eq!(first(&t), &Token::Keyword("src".into()));
        assert_eq!(&t[1].token, &Token::Keyword("confidence_threshold".into()));
    }

    #[test]
    fn line_comments_skipped() {
        let t = tokenize("; a comment\n@alice").unwrap();
        assert_eq!(t.len(), 1);
        assert_eq!(first(&t), &Token::Symbol("alice".into()));
    }

    #[test]
    fn unterminated_string_errors() {
        let result = tokenize(r#" "no close "#);
        assert!(matches!(result, Err(LexError::UnterminatedString { .. })));
    }

    #[test]
    fn invalid_escape_errors() {
        let result = tokenize(r#" "\q" "#);
        assert!(matches!(
            result,
            Err(LexError::InvalidEscape { escape: 'q', .. })
        ));
    }

    #[test]
    fn unexpected_byte_errors() {
        let result = tokenize("$");
        assert!(matches!(result, Err(LexError::UnexpectedByte { .. })));
    }

    #[test]
    fn positions_track_line_and_column() {
        // `(` at line 1 col 1; the newline resets the column, so
        // `@alice` starts at line 2 col 1.
        let t = tokenize("(\n@alice").unwrap();
        assert_eq!(t[0].position.line, 1);
        assert_eq!(t[0].position.column, 1);
        assert_eq!(t[1].position.line, 2);
        assert_eq!(t[1].position.column, 1);
    }
}