sentience_tokenize/
lib.rs

//! sentience-tokenize — tiny zero-dep tokenizer for a simple DSL.
//!
//! ## Stable API surface (guaranteed across compatible releases)
//! - `TokenKind`, `Token`, `Span`
//! - `tokenize(&str) -> Result<Vec<Token>, LexError>`
//! - `tokenize_iter(&str)` returning an iterator of `Result<Token, LexError>`
//! - `LineMap` for byte→(line,col) mapping
//! - `LexError` and `LexErrorKind`
//!
//! ## Versioning
//! - Patch releases fix bugs only; no public API changes.
//! - Minor releases (`0.x.y` → `0.(x+1).0`) may add new `TokenKind` variants or utilities without removing existing ones.
//!   Downstream code should avoid exhaustive `match` over `TokenKind`; prefer a `_` catch-all to remain forward-compatible (see the example on [`TokenKind`]).
//! - Any removal or change of existing public types/fields is treated as a breaking change and called out explicitly.
//!
//! ## Spec (summary)
//! - **Identifiers**: `[A-Za-z_][A-Za-z0-9_]*`, ASCII only.
//! - **Numbers**: decimal integers/decimals with an optional exponent (`e|E[+|-]d+`). A single dot is allowed once; `..` is not consumed by numbers.
//! - **Strings**: double-quoted with escapes `\n \t \r \" \\`. Raw newlines are accepted. Unknown escapes are errors.
//! - **Comments**: `//` to end-of-line.
//! - **Delimiters**: `( ) { } [ ] , : ;`.
//! - **Operators**: `= + - * / ->`.
//! - **Keywords**: `true false if then else let rule and or`.
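//!
//! ## Example
//! A minimal end-to-end sketch (the crate name `sentience_tokenize` is assumed
//! from the package directory):
//! ```
//! use sentience_tokenize::{tokenize, TokenKind};
//!
//! let toks = tokenize("let x = 1;").unwrap();
//! // `let` is a keyword, `x` an identifier, `1` a number.
//! assert!(matches!(toks[0].kind, TokenKind::Let));
//! assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "x"));
//! assert!(matches!(toks[3].kind, TokenKind::Number(ref n) if n == "1"));
//! ```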

use std::iter::Peekable;
use std::str::CharIndices;

mod error;
/// Error type and categories returned by the lexer; stable across minor versions.
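///
/// Errors carry a [`Span`]; a small sketch of inspecting a failure:
///
/// ```
/// use sentience_tokenize::{tokenize, LexErrorKind};
///
/// let err = tokenize("\"oops").unwrap_err();
/// assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
/// // Display output includes the byte span, as exercised by this crate's unit tests.
/// assert_eq!(err.to_string(), "unterminated string at 0..5");
/// ```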
pub use error::{LexError, LexErrorKind};
mod span;
/// Utility for mapping byte offsets to `(line, column)`; stable part of the public API.
pub use span::LineMap;
mod iter;
/// Iterator-based API over tokens. Yields `Result<Token, LexError>`.
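///
/// A minimal sketch of streaming use, stopping at the first error instead of
/// collecting all tokens up front:
///
/// ```
/// use sentience_tokenize::tokenize_iter;
///
/// for tok in tokenize_iter("a + b") {
///     let tok = tok.expect("input lexes cleanly");
///     println!("{:?} at {}..{}", tok.kind, tok.span.start, tok.span.end);
/// }
/// ```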
pub use iter::{tokenize_iter, Tokens};

/// Token kind for the DSL. Existing variants are stable across minor releases; new variants may be added in minor versions.
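///
/// Because minor releases may add variants, downstream matches should keep a
/// `_` catch-all. A sketch with a hypothetical `describe` helper:
///
/// ```
/// use sentience_tokenize::TokenKind;
///
/// fn describe(kind: &TokenKind) -> &'static str {
///     match kind {
///         TokenKind::Ident(_) => "identifier",
///         TokenKind::Number(_) => "number",
///         TokenKind::String(_) => "string",
///         // the catch-all keeps this compiling when new variants appear
///         _ => "other",
///     }
/// }
///
/// assert_eq!(describe(&TokenKind::Comma), "other");
/// ```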
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    Ident(String),
    Number(String),
    String(String),
    True,
    False,
    If,
    Then,
    Else,
    Let,
    Rule,
    And,
    Or,
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Comma,
    Colon,
    Semicolon,
    Arrow,
    Eq,
    Plus,
    Minus,
    Star,
    Slash,
}

/// Byte span `[start, end)` into the original source.
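///
/// Offsets are byte indices, so a token's lexeme can be recovered by slicing
/// the original source:
///
/// ```
/// use sentience_tokenize::tokenize;
///
/// let src = "let x";
/// let toks = tokenize(src).unwrap();
/// assert_eq!(&src[toks[0].span.start..toks[0].span.end], "let");
/// ```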
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    pub start: usize,
    pub end: usize,
}

/// A token with its [`TokenKind`] and [`Span`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    pub kind: TokenKind,
    pub span: Span,
}

/// Streaming lexer. Prefer [`tokenize`] / [`tokenize_iter`] unless you need manual control.
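///
/// A minimal sketch of driving the lexer by hand:
///
/// ```
/// use sentience_tokenize::Lexer;
///
/// let toks = Lexer::new("1 + 2").tokenize().unwrap();
/// assert_eq!(toks.len(), 3); // Number("1"), Plus, Number("2")
/// ```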
pub struct Lexer<'a> {
    src: &'a str,
    it: Peekable<CharIndices<'a>>,
    cur: Option<(usize, char)>,
}

impl<'a> Lexer<'a> {
    pub fn new(src: &'a str) -> Self {
        let mut it = src.char_indices().peekable();
        let cur = it.next();
        Self { src, it, cur }
    }

    fn bump(&mut self) -> Option<(usize, char)> {
        let out = self.cur;
        self.cur = self.it.next();
        out
    }

    fn peek(&self) -> Option<(usize, char)> {
        self.cur
    }

    fn skip_ws_and_comments(&mut self) {
        loop {
            let mut progressed = false;
            while let Some((_, c)) = self.peek() {
                if c.is_whitespace() {
                    self.bump();
                    progressed = true;
                } else {
                    break;
                }
            }
            if let Some((_, '/')) = self.peek() {
                let mut clone = self.it.clone();
                if let Some((_, '/')) = clone.next() {
                    // Consume `//` and everything up to (not including) the newline.
                    self.bump();
                    self.bump();
                    while let Some((_, c)) = self.peek() {
                        if c == '\n' {
                            break;
                        }
                        self.bump();
                    }
                    continue;
                }
            }
            if !progressed {
                break;
            }
        }
    }

    fn kw_or_ident(s: &str) -> TokenKind {
        match s {
            "true" => TokenKind::True,
            "false" => TokenKind::False,
            "if" => TokenKind::If,
            "then" => TokenKind::Then,
            "else" => TokenKind::Else,
            "let" => TokenKind::Let,
            "rule" => TokenKind::Rule,
            "and" => TokenKind::And,
            "or" => TokenKind::Or,
            _ => TokenKind::Ident(s.to_string()),
        }
    }

    fn lex_number(&mut self, start: usize) -> Result<Token, LexError> {
        let mut seen_dot = false;
        let mut seen_exp = false;
        let mut last_was_dot = false;
        self.bump(); // consume first digit

        while let Some((idx, ch)) = self.peek() {
            if ch.is_ascii_digit() {
                self.bump();
                last_was_dot = false;
            } else if ch == '.' {
                if seen_dot {
                    // A dot immediately after a dot is a range `..` -> stop the number here.
                    if last_was_dot {
                        break;
                    }
                    // Otherwise this is a second dot inside one literal -> invalid number.
                    return Err(LexError::new(
                        LexErrorKind::InvalidNumber,
                        Span {
                            start,
                            end: idx + ch.len_utf8(),
                        },
                    ));
                }
                // Lookahead: do not consume the dot for `..` or when no digit follows (`0.`).
                let mut clone = self.it.clone();
                if let Some((_, next)) = clone.next() {
                    if next == '.' {
                        break;
                    }
                    if !next.is_ascii_digit() {
                        break;
                    }
                } else {
                    break;
                }
                seen_dot = true;
                last_was_dot = true;
                self.bump();
            } else if (ch == 'e' || ch == 'E') && !seen_exp {
                seen_exp = true;
                last_was_dot = false;
                self.bump();
                if let Some((_, sign)) = self.peek() {
                    if sign == '+' || sign == '-' {
                        self.bump();
                    }
                }
                // The exponent must contain at least one digit.
                match self.peek() {
                    Some((_, d)) if d.is_ascii_digit() => {}
                    _ => {
                        return Err(LexError::new(
                            LexErrorKind::InvalidNumber,
                            Span {
                                start,
                                end: idx + ch.len_utf8(),
                            },
                        ));
                    }
                }
            } else {
                break;
            }
        }

        let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
        Ok(Token {
            kind: TokenKind::Number(self.src[start..end].to_string()),
            span: Span { start, end },
        })
    }

    pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
        let mut out = Vec::new();
        loop {
            self.skip_ws_and_comments();
            let Some((i, c)) = self.peek() else {
                break;
            };

            if c == '"' {
                let start = i;
                self.bump();
                let mut s = String::new();
                loop {
                    let Some((j, ch)) = self.bump() else {
                        return Err(LexError::new(
                            LexErrorKind::UnterminatedString,
                            Span {
                                start,
                                end: self.src.len(),
                            },
                        ));
                    };
                    match ch {
                        '\\' => {
                            let Some((_, esc)) = self.bump() else {
                                return Err(LexError::new(
                                    LexErrorKind::UnterminatedEscape,
                                    Span { start, end: j + 1 },
                                ));
                            };
                            let ch = match esc {
                                'n' => '\n',
                                't' => '\t',
                                'r' => '\r',
                                '"' => '"',
                                '\\' => '\\',
                                _ => {
                                    // Invalid escape: span runs from the string start through the bad escape char.
                                    return Err(LexError::new(
                                        LexErrorKind::InvalidEscape,
                                        Span {
                                            start,
                                            end: j + 1 + esc.len_utf8(),
                                        },
                                    ));
                                }
                            };
                            s.push(ch);
                        }
                        '"' => {
                            out.push(Token {
                                kind: TokenKind::String(s),
                                span: Span { start, end: j + 1 },
                            });
                            break;
                        }
                        _ => s.push(ch),
                    }
                }
                continue;
            }

            if c.is_ascii_digit() {
                out.push(self.lex_number(i)?);
                continue;
            }

            if c.is_ascii_alphabetic() || c == '_' {
                let start = i;
                self.bump();
                while let Some((_, p)) = self.peek() {
                    if p.is_ascii_alphanumeric() || p == '_' {
                        self.bump();
                    } else {
                        break;
                    }
                }
                let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
                let kind = Self::kw_or_ident(&self.src[start..end]);
                out.push(Token {
                    kind,
                    span: Span { start, end },
                });
                continue;
            }

            if c == '-' {
                let start = i;
                self.bump();
                if let Some((j, '>')) = self.peek() {
                    self.bump();
                    out.push(Token {
                        kind: TokenKind::Arrow,
                        span: Span { start, end: j + 1 },
                    });
                } else {
                    out.push(Token {
                        kind: TokenKind::Minus,
                        span: Span {
                            start,
                            end: start + 1,
                        },
                    });
                }
                continue;
            }

            // Single-character delimiters and operators (all ASCII, so `end` is `start + 1`).
            let start = i;
            self.bump();
            let tk = match c {
                '(' => TokenKind::LParen,
                ')' => TokenKind::RParen,
                '{' => TokenKind::LBrace,
                '}' => TokenKind::RBrace,
                '[' => TokenKind::LBracket,
                ']' => TokenKind::RBracket,
                ',' => TokenKind::Comma,
                ':' => TokenKind::Colon,
                ';' => TokenKind::Semicolon,
                '=' => TokenKind::Eq,
                '+' => TokenKind::Plus,
                '*' => TokenKind::Star,
                '/' => TokenKind::Slash,
                other => {
                    return Err(LexError::new(
                        LexErrorKind::UnexpectedChar,
                        Span {
                            start,
                            end: start + other.len_utf8(),
                        },
                    ))
                }
            };
            out.push(Token {
                kind: tk,
                span: Span {
                    start,
                    end: start + 1,
                },
            });
        }
        Ok(out)
    }
}

/// Tokenize the entire input and return a vector of tokens.
/// Errors include unterminated strings/escapes, invalid escapes, invalid numbers, and unexpected characters.
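///
/// # Examples
///
/// ```
/// use sentience_tokenize::{tokenize, TokenKind};
///
/// let toks = tokenize("x -> y").unwrap();
/// assert!(matches!(toks[1].kind, TokenKind::Arrow));
/// ```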
pub fn tokenize(src: &str) -> Result<Vec<Token>, LexError> {
    Lexer::new(src).tokenize()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn error_kind_as_str_and_display_messages() {
        let span = Span { start: 1, end: 3 };
        let cases: &[(LexErrorKind, &str, &str)] = &[
            (
                LexErrorKind::UnexpectedChar,
                "unexpected character",
                "unexpected char",
            ),
            (
                LexErrorKind::UnterminatedString,
                "unterminated string",
                "unterminated string",
            ),
            (
                LexErrorKind::UnterminatedEscape,
                "unterminated escape",
                "unterminated escape",
            ),
            (
                LexErrorKind::InvalidNumber,
                "invalid number",
                "invalid number",
            ),
            (
                LexErrorKind::InvalidEscape,
                "invalid escape sequence",
                "invalid escape",
            ),
        ];

        for (kind, as_str_msg, display_msg) in cases.iter().cloned() {
            assert_eq!(kind.as_str(), as_str_msg);
            let err = LexError::new(kind, span);
            let rendered = format!("{}", err);
            assert_eq!(
                rendered,
                format!("{} at {}..{}", display_msg, span.start, span.end)
            );
            // `LexError` implements `std::error::Error`, `Clone`, and `Debug`.
            let _e: &dyn std::error::Error = &err;
            let dbg = format!("{:?}", err.clone());
            assert!(!dbg.is_empty());
        }
    }

    #[test]
    fn numbers_second_dot_invalid_unless_range() {
        // A second dot with digits on both sides is an invalid number.
        let err = tokenize("123.45.6").expect_err("second dot should be invalid unless range");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        // A range like `1..2` must stay split: the number stops before `..`, and since `.`
        // is not a token of this DSL, the lexer then reports UnexpectedChar for the dot.
        let err = tokenize("1..2").expect_err("range dot should not be consumed by number");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    #[test]
    fn numbers_exponent_rules() {
        // valid exponent forms
        let toks = tokenize("1e10 1E+10 1.23e-4").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1e10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1E+10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.23e-4")));

        // missing exponent digits is invalid
        let err = tokenize("1e+").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        let err = tokenize("2E-").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
    }

    #[test]
    fn basic() {
        let code = r#"
            // sample
            let rule greet(name) = "hi, " + name
            if true and false then x = 1 else x = 2;
        "#;
        let toks = tokenize(code).unwrap();
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Let)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Rule)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::String(_))));
    }

    #[test]
    fn numbers_and_ranges() {
        // valid decimals and exponents
        let toks = tokenize("1 1.0 1.2e-3").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.0")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.2e-3")));

        // ensure we don't swallow `..` as part of a number
        let err = tokenize("1..2").expect_err("should error on unexpected '.'");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    #[test]
    fn string_escapes() {
        // valid escapes
        let toks = tokenize("\"a\\n\\t\\r\\\\\\\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(_)));

        // invalid escape
        let err = tokenize("\"\\x\"").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
    }

    #[test]
    fn numbers_trailing_dot_is_error() {
        let err = tokenize("0.").expect_err("trailing dot should error");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
    }

    #[test]
    fn strings_empty_and_raw_newline_and_escapes() {
        // empty string
        let toks = tokenize("\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s.is_empty()));

        // raw newline inside string is allowed by this lexer
        let toks = tokenize("\"a\nb\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "a\nb"));

        // escaped quote, escaped backslash, then a raw tab -> the lexed string
        // contains a quote, a backslash, and a tab
        let toks = tokenize("\"\\\"\\\\\t\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "\"\\\t"));
    }

    #[test]
    fn strings_unterminated_and_unterminated_escape() {
        // unterminated string
        let err = tokenize("\"abc").expect_err("unterminated string");
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));

        // unterminated escape
        let err = tokenize("\"abc\\").expect_err("unterminated escape");
        assert!(matches!(err.kind, LexErrorKind::UnterminatedEscape));
    }

    #[test]
    fn idents_and_keywords() {
        let toks = tokenize("let letx _x1").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Let));
        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "letx"));
        assert!(matches!(toks[2].kind, TokenKind::Ident(ref s) if s == "_x1"));
    }

    #[test]
    fn comments_do_not_leak() {
        let toks = tokenize("foo // comment\nbar").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Ident(ref s) if s == "foo"));
        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "bar"));
        assert_eq!(toks.len(), 2);
    }

    #[test]
    fn unknown_char_errors_with_span() {
        let err = tokenize("a @ b").expect_err("unknown char '@'");
        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
        assert!(err.span.start < err.span.end);
    }

    #[test]
    fn golden_small_input() {
        let src = "let rule f(x) = \"hi\" + x";
        let toks = tokenize(src).unwrap();
        use TokenKind::*;
        let kinds: Vec<&'static str> = toks
            .iter()
            .map(|t| match &t.kind {
                Let => "Let",
                Rule => "Rule",
                Ident(s) if s == "f" => "Ident(f)",
                LParen => "LParen",
                // this arm also covers the trailing `x` after `+`
                Ident(s) if s == "x" => "Ident(x)",
                RParen => "RParen",
                Eq => "Eq",
                String(s) if s == "hi" => "String(hi)",
                Plus => "Plus",
                other => panic!("unexpected token in golden: {:?}", other),
            })
            .collect();
        assert_eq!(
            kinds,
            vec![
                "Let",
                "Rule",
                "Ident(f)",
                "LParen",
                "Ident(x)",
                "RParen",
                "Eq",
                "String(hi)",
                "Plus",
                "Ident(x)"
            ]
        );
    }
}