sentience_tokenize/
lib.rs

//! sentience-tokenize — tiny zero-dep tokenizer for a simple DSL.
//!
//! ## Stable API surface (guaranteed across compatible releases)
//! - `TokenKind`, `Token`, `Span`
//! - `tokenize(&str) -> Result<Vec<Token>, LexError>`
//! - `tokenize_iter(&str)` returning an iterator of `Result<Token, LexError>`
//! - `LineMap` for byte→(line,col) mapping
//! - `LexError` and `LexErrorKind`
//!
//! ## Versioning
//! - Minor releases (`0.x.y` → `0.(x+1).0`) may add new `TokenKind` variants but will not remove or change existing variants or fields.
//! - Patch releases only fix bugs; they do not change public types, and they change behavior only where it deviated from this spec.
//! - Any breaking change to the surface above arrives in a minor bump (the semver-visible level for `0.x`) and is noted in the changelog.
//!
//! ## Spec (summary)
//! - **Identifiers**: `[A-Za-z_][A-Za-z0-9_]*`, ASCII only.
//! - **Numbers**: decimal integers/decimals with optional exponent (`e|E[+|-]d+`). At most one dot is allowed; `..` is not consumed by numbers.
//! - **Strings**: double-quoted with escapes `\n \t \r \" \\`. Unknown escapes are errors.
//! - **Comments**: `//` to end-of-line.
//! - **Delimiters**: `( ) { } [ ] , : ;`.
//! - **Operators**: `= + - * / ->`.
//! - **Keywords**: `true false if then else let rule and or`.
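//!
//! ## Example
//!
//! A minimal end-to-end run, using only the stable surface listed above:
//!
//! ```
//! use sentience_tokenize::{tokenize, TokenKind};
//!
//! let toks = tokenize("let x = 1").unwrap();
//! assert!(matches!(toks[0].kind, TokenKind::Let));
//! assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "x"));
//! assert!(matches!(toks[2].kind, TokenKind::Eq));
//! assert!(matches!(toks[3].kind, TokenKind::Number(ref s) if s == "1"));
//! ```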

use std::iter::Peekable;
use std::str::CharIndices;

mod error;
/// Error type and categories returned by the lexer; stable across minor versions.
pub use error::{LexError, LexErrorKind};
mod span;
/// Utility for mapping byte offsets to `(line, column)`; stable part of the public API.
pub use span::LineMap;
mod iter;
/// Iterator-based API over tokens. Yields `Result<Token, LexError>`.
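///
/// A minimal streaming sketch; each item is a `Result` and must be checked:
///
/// ```
/// use sentience_tokenize::tokenize_iter;
///
/// for tok in tokenize_iter("x + 1") {
///     let tok = tok.expect("input lexes cleanly");
///     println!("{:?} at {}..{}", tok.kind, tok.span.start, tok.span.end);
/// }
/// ```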
pub use iter::{tokenize_iter, Tokens};

/// Token kind for the DSL. Existing variants are stable across minor releases; new variants may be added in minor versions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    Ident(String),
    Number(String),
    String(String),
    True,
    False,
    If,
    Then,
    Else,
    Let,
    Rule,
    And,
    Or,
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Comma,
    Colon,
    Semicolon,
    Arrow,
    Eq,
    Plus,
    Minus,
    Star,
    Slash,
}

/// Byte span `[start, end)` into the original source.
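///
/// Slicing the source with a span recovers a token's exact lexeme:
///
/// ```
/// use sentience_tokenize::tokenize;
///
/// let src = "let x";
/// let tok = &tokenize(src).unwrap()[0];
/// assert_eq!(&src[tok.span.start..tok.span.end], "let");
/// ```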
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}

/// A token with its [`TokenKind`] and [`Span`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}

/// Streaming lexer. Prefer [`tokenize`] / [`tokenize_iter`] unless you need manual control.
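///
/// A construct-and-drain sketch; [`Lexer::new`] followed by
/// [`Lexer::tokenize`](Self::tokenize) is equivalent to the top-level [`tokenize`]:
///
/// ```
/// use sentience_tokenize::Lexer;
///
/// let toks = Lexer::new("a + b").tokenize().unwrap();
/// assert_eq!(toks.len(), 3); // Ident(a), Plus, Ident(b)
/// ```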
pub struct Lexer<'a> {
    src: &'a str,
    it: Peekable<CharIndices<'a>>,
    // `cur` holds the current (offset, char); `it` is always one position
    // ahead, giving one extra character of lookahead beyond `cur`.
    cur: Option<(usize, char)>,
}

impl<'a> Lexer<'a> {
    pub fn new(src: &'a str) -> Self {
        let mut it = src.char_indices().peekable();
        let cur = it.next();
        Self { src, it, cur }
    }

    /// Consume and return the current char, advancing to the next one.
    fn bump(&mut self) -> Option<(usize, char)> {
        let out = self.cur;
        self.cur = self.it.next();
        out
    }

    /// Look at the current char without consuming it.
    fn peek(&self) -> Option<(usize, char)> {
        self.cur
    }

    fn skip_ws_and_comments(&mut self) {
        loop {
            let mut progressed = false;
            while let Some((_, c)) = self.peek() {
                if c.is_whitespace() {
                    self.bump();
                    progressed = true;
                } else {
                    break;
                }
            }
            // A `//` comment runs to end-of-line; a lone `/` is left for the
            // main loop to lex as `Slash`.
            if let Some((_, '/')) = self.peek() {
                let mut clone = self.it.clone();
                if let Some((_, '/')) = clone.next() {
                    self.bump();
                    self.bump();
                    while let Some((_, c)) = self.peek() {
                        if c == '\n' {
                            break;
                        }
                        self.bump();
                    }
                    continue;
                }
            }
            if !progressed {
                break;
            }
        }
    }

    fn kw_or_ident(s: &str) -> TokenKind {
        match s {
            "true" => TokenKind::True,
            "false" => TokenKind::False,
            "if" => TokenKind::If,
            "then" => TokenKind::Then,
            "else" => TokenKind::Else,
            "let" => TokenKind::Let,
            "rule" => TokenKind::Rule,
            "and" => TokenKind::And,
            "or" => TokenKind::Or,
            _ => TokenKind::Ident(s.to_string()),
        }
    }

    fn lex_number(&mut self, start: usize) -> Result<Token, LexError> {
        let mut seen_dot = false;
        let mut seen_exp = false;
        self.bump(); // consume first digit

        while let Some((idx, ch)) = self.peek() {
            if ch.is_ascii_digit() {
                self.bump();
            } else if ch == '.' {
                // Lookahead before consuming: `..` is a range operator and a
                // bare trailing dot ("0.") is not part of the number, so in
                // both cases the literal ends here and the dot is left for
                // the main loop.
                let mut clone = self.it.clone();
                match clone.next() {
                    Some((_, next)) if next.is_ascii_digit() => {}
                    _ => break,
                }
                // A second dot, or a dot inside the exponent, is an invalid number.
                if seen_dot || seen_exp {
                    return Err(LexError::new(
                        LexErrorKind::InvalidNumber,
                        Span {
                            start,
                            end: idx + ch.len_utf8(),
                        },
                    ));
                }
                seen_dot = true;
                self.bump();
            } else if (ch == 'e' || ch == 'E') && !seen_exp {
                seen_exp = true;
                self.bump();
                if let Some((_, sign)) = self.peek() {
                    if sign == '+' || sign == '-' {
                        self.bump();
                    }
                }
                // The exponent must contain at least one digit.
                match self.peek() {
                    Some((_, d)) if d.is_ascii_digit() => {}
                    _ => {
                        return Err(LexError::new(
                            LexErrorKind::InvalidNumber,
                            Span {
                                start,
                                end: idx + ch.len_utf8(),
                            },
                        ));
                    }
                }
            } else {
                break;
            }
        }

        let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
        Ok(Token {
            kind: TokenKind::Number(self.src[start..end].to_string()),
            span: Span { start, end },
        })
    }

    pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
        let mut out = Vec::new();
        loop {
            self.skip_ws_and_comments();
            let Some((i, c)) = self.peek() else {
                break;
            };

            if c == '"' {
                let start = i;
                self.bump();
                let mut s = String::new();
                loop {
                    let Some((j, ch)) = self.bump() else {
                        return Err(LexError::new(
                            LexErrorKind::UnterminatedString,
                            Span {
                                start,
                                end: self.src.len(),
                            },
                        ));
                    };
                    match ch {
                        '\\' => {
                            let Some((k, esc)) = self.bump() else {
                                return Err(LexError::new(
                                    LexErrorKind::UnterminatedEscape,
                                    Span { start, end: j + 1 },
                                ));
                            };
                            let ch = match esc {
                                'n' => '\n',
                                't' => '\t',
                                'r' => '\r',
                                '"' => '"',
                                '\\' => '\\',
                                _ => {
                                    // Invalid escape: the span covers the string's
                                    // start through the offending escape character.
                                    return Err(LexError::new(
                                        LexErrorKind::InvalidEscape,
                                        Span {
                                            start,
                                            end: k + esc.len_utf8(),
                                        },
                                    ));
                                }
                            };
                            s.push(ch);
                        }
                        '"' => {
                            out.push(Token {
                                kind: TokenKind::String(s),
                                span: Span { start, end: j + 1 },
                            });
                            break;
                        }
                        _ => s.push(ch),
                    }
                }
                continue;
            }

            if c.is_ascii_digit() {
                out.push(self.lex_number(i)?);
                continue;
            }

            if c.is_ascii_alphabetic() || c == '_' {
                let start = i;
                self.bump();
                while let Some((_, p)) = self.peek() {
                    if p.is_ascii_alphanumeric() || p == '_' {
                        self.bump();
                    } else {
                        break;
                    }
                }
                let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
                let kind = Self::kw_or_ident(&self.src[start..end]);
                out.push(Token {
                    kind,
                    span: Span { start, end },
                });
                continue;
            }

            if c == '-' {
                let start = i;
                self.bump();
                if let Some((j, '>')) = self.peek() {
                    self.bump();
                    out.push(Token {
                        kind: TokenKind::Arrow,
                        span: Span { start, end: j + 1 },
                    });
                } else {
                    out.push(Token {
                        kind: TokenKind::Minus,
                        span: Span {
                            start,
                            end: start + 1,
                        },
                    });
                }
                continue;
            }

            // Remaining single-character tokens (all ASCII, one byte wide).
            let start = i;
            self.bump();
            let tk = match c {
                '(' => TokenKind::LParen,
                ')' => TokenKind::RParen,
                '{' => TokenKind::LBrace,
                '}' => TokenKind::RBrace,
                '[' => TokenKind::LBracket,
                ']' => TokenKind::RBracket,
                ',' => TokenKind::Comma,
                ':' => TokenKind::Colon,
                ';' => TokenKind::Semicolon,
                '=' => TokenKind::Eq,
                '+' => TokenKind::Plus,
                '*' => TokenKind::Star,
                '/' => TokenKind::Slash,
                other => {
                    return Err(LexError::new(
                        LexErrorKind::UnexpectedChar,
                        Span {
                            start,
                            end: start + other.len_utf8(),
                        },
                    ))
                }
            };
            out.push(Token {
                kind: tk,
                span: Span {
                    start,
                    end: start + 1,
                },
            });
        }
        Ok(out)
    }
}

/// Tokenize the entire input and return a vector of tokens.
/// Errors include unterminated strings/escapes, invalid escapes, invalid numbers, and unexpected characters.
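///
/// A small sketch of the error path; `kind` and `span` are the public fields
/// exercised by the tests below:
///
/// ```
/// use sentience_tokenize::{tokenize, LexErrorKind};
///
/// let err = tokenize("\"unterminated").unwrap_err();
/// assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
/// ```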
pub fn tokenize(src: &str) -> Result<Vec<Token>, LexError> {
    Lexer::new(src).tokenize()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn numbers_second_dot_invalid_unless_range() {
        // second dot with digits on both sides -> invalid number
        let err = tokenize("123.45.6").expect_err("second dot should be invalid unless range");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        // but `1..2` must stay split: the number ends before `..` and the
        // lone `.` is then rejected as an unexpected character
        let err = tokenize("1..2").expect_err("range dot should not be consumed by number");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    #[test]
    fn numbers_exponent_rules() {
        // valid exponent forms
        let toks = tokenize("1e10 1E+10 1.23e-4").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1e10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1E+10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.23e-4")));

        // missing exponent digits is invalid
        let err = tokenize("1e+").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        let err = tokenize("2E-").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
    }

    #[test]
    fn basic() {
        let code = r#"
            // sample
            let rule greet(name) = "hi, " + name
            if true and false then x = 1 else x = 2;
        "#;
        let toks = tokenize(code).unwrap();
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Let)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Rule)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::String(_))));
    }

    #[test]
    fn numbers_and_ranges() {
        // valid decimals and exponents
        let toks = tokenize("1 1.0 1.2e-3").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.0")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.2e-3")));

        // ensure we don't swallow `..` as part of a number
        let err = tokenize("1..2").expect_err("should error on unexpected '.'");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    #[test]
    fn string_escapes() {
        // valid escapes
        let toks = tokenize("\"a\\n\\t\\r\\\\\\\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(_)));

        // invalid escape
        let err = tokenize("\"\\x\"").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
    }

    #[test]
    fn numbers_trailing_dot_is_error() {
        let err = tokenize("0.").expect_err("trailing dot should error");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    #[test]
    fn strings_empty_and_raw_newline_and_escapes() {
        // empty string
        let toks = tokenize("\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s.is_empty()));

        // raw newline inside string is allowed by this lexer
        let toks = tokenize("\"a\nb\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "a\nb"));

        // escaped quote, escaped backslash, then a raw tab -> contents are `"`, `\`, TAB
        let toks = tokenize("\"\\\"\\\\\t\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "\"\\\t"));
    }

    #[test]
    fn strings_unterminated_and_unterminated_escape() {
        // unterminated string
        let err = tokenize("\"abc").expect_err("unterminated string");
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));

        // unterminated escape
        let err = tokenize("\"abc\\").expect_err("unterminated escape");
        assert!(matches!(err.kind, LexErrorKind::UnterminatedEscape));
    }

    #[test]
    fn idents_and_keywords() {
        let toks = tokenize("let letx _x1").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Let));
        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "letx"));
        assert!(matches!(toks[2].kind, TokenKind::Ident(ref s) if s == "_x1"));
    }

    #[test]
    fn comments_do_not_leak() {
        let toks = tokenize("foo // comment\nbar").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Ident(ref s) if s == "foo"));
        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "bar"));
        assert_eq!(toks.len(), 2);
    }

    #[test]
    fn unknown_char_errors_with_span() {
        let err = tokenize("a @ b").expect_err("unknown char '@'");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
        assert!(err.span.start < err.span.end);
    }

    #[test]
    fn golden_small_input() {
        let src = "let rule f(x) = \"hi\" + x";
        let toks = tokenize(src).unwrap();
        use TokenKind::*;
        let kinds: Vec<&'static str> = toks
            .iter()
            .map(|t| match &t.kind {
                Let => "Let",
                Rule => "Rule",
                Ident(s) if s == "f" => "Ident(f)",
                LParen => "LParen",
                // this arm also covers the trailing `x` after `+`
                Ident(s) if s == "x" => "Ident(x)",
                RParen => "RParen",
                Eq => "Eq",
                String(s) if s == "hi" => "String(hi)",
                Plus => "Plus",
                other => panic!("unexpected token in golden: {:?}", other),
            })
            .collect();
        assert_eq!(
            kinds,
            vec![
                "Let",
                "Rule",
                "Ident(f)",
                "LParen",
                "Ident(x)",
                "RParen",
                "Eq",
                "String(hi)",
                "Plus",
                "Ident(x)"
            ]
        );
    }
}