sentience_tokenize/
lib.rs

1#![forbid(unsafe_code)]
2//! sentience-tokenize — tiny zero-dep tokenizer for a simple DSL.
3//!
4//! ## Stable API surface (guaranteed across compatible releases)
5//! - `TokenKind`, `Token`, `Span`
6//! - `tokenize(&str) -> Result<Vec<Token>, LexError>`
7//! - `tokenize_iter(&str)` returning an iterator of `Result<Token, LexError>`
8//! - `LineMap` for byte→(line,col) mapping
9//! - `LexError` and `LexErrorKind`
10//!
11//! ## Versioning
12//! - Patch releases fix bugs only; no public API changes.
13//! - Minor releases (`0.x.y` → `0.(x+1).0`) may add new `TokenKind` variants or utilities without removing existing ones.
14//!   Downstream code should avoid exhaustive `match` over `TokenKind`; prefer a `_` catch-all to remain forward-compatible.
15//! - Any removal or change of existing public types/fields will be treated as a breaking change and called out explicitly.
16//!
17//! ## Spec (summary)
18//! - **Identifiers**: `[A-Za-z_][A-Za-z0-9_]*`, ASCII only.
19//! - **Numbers**: decimal integers/decimals with optional exponent (`e|E[+|-]d+`). A single dot is allowed once; `..` is not consumed by numbers.
20//! - **Strings**: double-quoted with escapes `\n \t \r \" \\`. Raw newlines are accepted. Unknown escapes are errors.
21//! - **Comments**: `//` to end-of-line.
22//! - **Delimiters**: `() { } [ ] , : ;`.
23//! - **Operators**: `= + - * / ->`.
24//! - **Keywords**: `true false if then else let rule and or`.
25
26use std::iter::Peekable;
27use std::str::CharIndices;
28
29mod error;
30/// Error type and categories returned by the lexer; stable across minor versions.
31pub use error::{LexError, LexErrorKind};
32mod span;
33/// Utility for mapping byte offsets to `(line, column)`; stable part of the public API.
34pub use span::LineMap;
35mod iter;
36/// Iterator-based API over tokens. Yields `Result<Token, LexError>`.
37pub use iter::{tokenize_iter, Tokens};
38
/// Zero-copy token kind borrowing slices from the source.
/// Note: `String(&str)` contains the *literal contents between quotes* without unquoting; escapes (e.g. `\n`) are left as two characters.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BorrowedTokenKind<'a> {
    /// Identifier slice: ASCII `[A-Za-z_][A-Za-z0-9_]*`, not a keyword.
    Ident(&'a str),
    /// Numeric literal slice, exactly as written in the source.
    Number(&'a str),
    /// Raw contents between the quotes; escapes are validated but NOT decoded.
    String(&'a str),
    // Keywords: true false if then else let rule and or
    True,
    False,
    If,
    Then,
    Else,
    Let,
    Rule,
    And,
    Or,
    // Delimiters: ( ) { } [ ] , : ;
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Comma,
    Colon,
    Semicolon,
    // Operators: -> = + - * /
    Arrow,
    Eq,
    Plus,
    Minus,
    Star,
    Slash,
}
72
/// A zero-copy token with its [`BorrowedTokenKind`] and [`Span`].
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BorrowedToken<'a> {
    /// What was lexed; text-carrying variants borrow from the source.
    pub kind: BorrowedTokenKind<'a>,
    /// Byte range of the token in the original source.
    pub span: Span,
}
80
/// Token kind for the DSL. Variant set is stable across minor releases; new variants may be added in minor versions.
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// Identifier: ASCII `[A-Za-z_][A-Za-z0-9_]*`, not a keyword.
    Ident(String),
    /// Numeric literal kept as its source text; no numeric parsing is done.
    Number(String),
    /// String literal contents with escape sequences decoded (`\n` -> newline, etc.).
    String(String),
    // Keywords: true false if then else let rule and or
    True,
    False,
    If,
    Then,
    Else,
    Let,
    Rule,
    And,
    Or,
    // Delimiters: ( ) { } [ ] , : ;
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Comma,
    Colon,
    Semicolon,
    // Operators: -> = + - * /
    Arrow,
    Eq,
    Plus,
    Minus,
    Star,
    Slash,
}
114
/// Byte span `[start, end)` into the original source.
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Byte offset of the first byte of the token (inclusive).
    pub start: usize,
    /// Byte offset just past the last byte of the token (exclusive).
    pub end: usize,
}
122
/// A token with its [`TokenKind`] and [`Span`].
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// What was lexed; text-carrying variants own their text.
    pub kind: TokenKind,
    /// Byte range of the token in the original source.
    pub span: Span,
}
130
/// Streaming lexer. Prefer [`tokenize`] / [`tokenize_iter`] unless you need manual control.
#[derive(Debug)]
pub struct Lexer<'a> {
    // Full source text; all token spans index into this.
    src: &'a str,
    // Char stream positioned one past `cur` — cloning it gives one-char lookahead.
    it: Peekable<CharIndices<'a>>,
    // Current (byte_offset, char), or `None` at end of input.
    cur: Option<(usize, char)>,
}
138
139impl<'a> Lexer<'a> {
140    pub fn new(src: &'a str) -> Self {
141        let mut it = src.char_indices().peekable();
142        let cur = it.next();
143        Self { src, it, cur }
144    }
145
146    fn bump(&mut self) -> Option<(usize, char)> {
147        let out = self.cur;
148        self.cur = self.it.next();
149        out
150    }
151
152    fn peek(&self) -> Option<(usize, char)> {
153        self.cur
154    }
155
156    fn skip_ws_and_comments(&mut self) {
157        loop {
158            let mut progressed = false;
159            while let Some((_, c)) = self.peek() {
160                if c.is_whitespace() {
161                    self.bump();
162                    progressed = true;
163                } else {
164                    break;
165                }
166            }
167            if let Some((_, '/')) = self.peek() {
168                let mut clone = self.it.clone();
169                if let Some((_, '/')) = clone.next() {
170                    self.bump();
171                    self.bump();
172                    while let Some((_, c)) = self.peek() {
173                        if c == '\n' {
174                            break;
175                        }
176                        self.bump();
177                    }
178                    continue;
179                }
180            }
181            if !progressed {
182                break;
183            }
184        }
185    }
186
187    fn kw_or_ident(s: &str) -> TokenKind {
188        match s {
189            "true" => TokenKind::True,
190            "false" => TokenKind::False,
191            "if" => TokenKind::If,
192            "then" => TokenKind::Then,
193            "else" => TokenKind::Else,
194            "let" => TokenKind::Let,
195            "rule" => TokenKind::Rule,
196            "and" => TokenKind::And,
197            "or" => TokenKind::Or,
198            _ => TokenKind::Ident(s.to_string()),
199        }
200    }
201
202    fn lex_number(&mut self, start: usize) -> Result<Token, LexError> {
203        let mut seen_dot = false;
204        let mut seen_exp = false;
205        let mut last_was_dot = false;
206        self.bump(); // consume first digit
207
208        while let Some((idx, ch)) = self.peek() {
209            if ch.is_ascii_digit() {
210                self.bump();
211                last_was_dot = false;
212            } else if ch == '.' {
213                if seen_dot {
214                    // If we just consumed a dot and see another dot, it's a range `..` -> stop number here
215                    if last_was_dot {
216                        break;
217                    }
218                    // Otherwise, this is a second dot in the same numeric literal -> invalid number
219                    return Err(LexError::new(
220                        LexErrorKind::InvalidNumber,
221                        Span {
222                            start,
223                            end: idx + ch.len_utf8(),
224                        },
225                    ));
226                }
227                // lookahead: do not consume if `..` or if no digit follows ("0.")
228                let mut clone = self.it.clone();
229                if let Some((_, next)) = clone.next() {
230                    if next == '.' {
231                        break;
232                    }
233                    if !next.is_ascii_digit() {
234                        break;
235                    }
236                } else {
237                    break;
238                }
239                seen_dot = true;
240                last_was_dot = true;
241                self.bump();
242            } else if (ch == 'e' || ch == 'E') && !seen_exp {
243                seen_exp = true;
244                last_was_dot = false;
245                self.bump();
246                if let Some((_, sign)) = self.peek() {
247                    if sign == '+' || sign == '-' {
248                        self.bump();
249                    }
250                }
251                match self.peek() {
252                    Some((_, d)) if d.is_ascii_digit() => {}
253                    _ => {
254                        return Err(LexError::new(
255                            LexErrorKind::InvalidNumber,
256                            Span {
257                                start,
258                                end: idx + ch.len_utf8(),
259                            },
260                        ));
261                    }
262                }
263            } else {
264                break;
265            }
266        }
267
268        let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
269        Ok(Token {
270            kind: TokenKind::Number(self.src[start..end].to_string()),
271            span: Span { start, end },
272        })
273    }
274
275    fn lex_number_borrowed(&mut self, start: usize) -> Result<BorrowedToken<'a>, LexError> {
276        let mut seen_dot = false;
277        let mut seen_exp = false;
278        let mut last_was_dot = false;
279        self.bump(); // consume first digit
280
281        while let Some((idx, ch)) = self.peek() {
282            if ch.is_ascii_digit() {
283                self.bump();
284                last_was_dot = false;
285            } else if ch == '.' {
286                if seen_dot {
287                    if last_was_dot {
288                        break;
289                    }
290                    return Err(LexError::new(
291                        LexErrorKind::InvalidNumber,
292                        Span {
293                            start,
294                            end: idx + ch.len_utf8(),
295                        },
296                    ));
297                }
298                let mut clone = self.it.clone();
299                if let Some((_, next)) = clone.next() {
300                    if next == '.' {
301                        break;
302                    }
303                    if !next.is_ascii_digit() {
304                        break;
305                    }
306                } else {
307                    break;
308                }
309                seen_dot = true;
310                last_was_dot = true;
311                self.bump();
312            } else if (ch == 'e' || ch == 'E') && !seen_exp {
313                seen_exp = true;
314                last_was_dot = false;
315                self.bump();
316                if let Some((_, sign)) = self.peek() {
317                    if sign == '+' || sign == '-' {
318                        self.bump();
319                    }
320                }
321                match self.peek() {
322                    Some((_, d)) if d.is_ascii_digit() => {}
323                    _ => {
324                        return Err(LexError::new(
325                            LexErrorKind::InvalidNumber,
326                            Span {
327                                start,
328                                end: idx + ch.len_utf8(),
329                            },
330                        ))
331                    }
332                }
333            } else {
334                break;
335            }
336        }
337
338        let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
339        Ok(BorrowedToken {
340            kind: BorrowedTokenKind::Number(&self.src[start..end]),
341            span: Span { start, end },
342        })
343    }
344
345    /// Return next borrowed token or error without allocations. Strings validate escapes but keep them as-is in the slice.
346    fn next_token_borrowed(&mut self) -> Option<Result<BorrowedToken<'a>, LexError>> {
347        self.skip_ws_and_comments();
348        let (i, c) = self.peek()?;
349
350        // Strings: validate and borrow the raw contents between quotes
351        if c == '"' {
352            let start = i; // points at opening quote
353            self.bump();
354            let content_start = start + 1;
355            loop {
356                let Some((j, ch)) = self.bump() else {
357                    return Some(Err(LexError::new(
358                        LexErrorKind::UnterminatedString,
359                        Span {
360                            start,
361                            end: self.src.len(),
362                        },
363                    )));
364                };
365                match ch {
366                    '\\' => {
367                        // require a following valid escape char, but do not build the string
368                        let Some((k, esc)) = self.bump() else {
369                            return Some(Err(LexError::new(
370                                LexErrorKind::UnterminatedEscape,
371                                Span {
372                                    start: j,
373                                    end: j + 1,
374                                },
375                            )));
376                        };
377                        match esc {
378                            'n' | 't' | 'r' | '"' | '\\' => {
379                                let _ = k;
380                            }
381                            _ => {
382                                let escape_end = k + esc.len_utf8();
383                                return Some(Err(LexError::new(
384                                    LexErrorKind::InvalidEscape,
385                                    Span {
386                                        start: j,
387                                        end: escape_end,
388                                    },
389                                )));
390                            }
391                        }
392                    }
393                    '"' => {
394                        let end = j + 1; // closing quote included in token span
395                        return Some(Ok(BorrowedToken {
396                            kind: BorrowedTokenKind::String(&self.src[content_start..j]),
397                            span: Span { start, end },
398                        }));
399                    }
400                    _ => {}
401                }
402            }
403        }
404
405        // Numbers
406        if c.is_ascii_digit() {
407            match self.lex_number_borrowed(i) {
408                Ok(tok) => return Some(Ok(tok)),
409                Err(e) => return Some(Err(e)),
410            }
411        }
412
413        // Idents / keywords
414        if c.is_ascii_alphabetic() || c == '_' {
415            let start = i;
416            self.bump();
417            while let Some((_, p)) = self.peek() {
418                if p.is_ascii_alphanumeric() || p == '_' {
419                    self.bump();
420                } else {
421                    break;
422                }
423            }
424            let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
425            let kind = match &self.src[start..end] {
426                "true" => BorrowedTokenKind::True,
427                "false" => BorrowedTokenKind::False,
428                "if" => BorrowedTokenKind::If,
429                "then" => BorrowedTokenKind::Then,
430                "else" => BorrowedTokenKind::Else,
431                "let" => BorrowedTokenKind::Let,
432                "rule" => BorrowedTokenKind::Rule,
433                "and" => BorrowedTokenKind::And,
434                "or" => BorrowedTokenKind::Or,
435                s => BorrowedTokenKind::Ident(s),
436            };
437            return Some(Ok(BorrowedToken {
438                kind,
439                span: Span { start, end },
440            }));
441        }
442
443        // Arrow / minus
444        if c == '-' {
445            let start = i;
446            self.bump();
447            if let Some((j, '>')) = self.peek() {
448                self.bump();
449                return Some(Ok(BorrowedToken {
450                    kind: BorrowedTokenKind::Arrow,
451                    span: Span { start, end: j + 1 },
452                }));
453            } else {
454                return Some(Ok(BorrowedToken {
455                    kind: BorrowedTokenKind::Minus,
456                    span: Span {
457                        start,
458                        end: start + 1,
459                    },
460                }));
461            }
462        }
463
464        // Singles / error
465        let start = i;
466        self.bump();
467        let tk = match c {
468            '(' => BorrowedTokenKind::LParen,
469            ')' => BorrowedTokenKind::RParen,
470            '{' => BorrowedTokenKind::LBrace,
471            '}' => BorrowedTokenKind::RBrace,
472            '[' => BorrowedTokenKind::LBracket,
473            ']' => BorrowedTokenKind::RBracket,
474            ',' => BorrowedTokenKind::Comma,
475            ':' => BorrowedTokenKind::Colon,
476            ';' => BorrowedTokenKind::Semicolon,
477            '=' => BorrowedTokenKind::Eq,
478            '+' => BorrowedTokenKind::Plus,
479            '*' => BorrowedTokenKind::Star,
480            '/' => BorrowedTokenKind::Slash,
481            other => {
482                return Some(Err(LexError::new(
483                    LexErrorKind::UnexpectedChar,
484                    Span {
485                        start,
486                        end: start + other.len_utf8(),
487                    },
488                )));
489            }
490        };
491        Some(Ok(BorrowedToken {
492            kind: tk,
493            span: Span {
494                start,
495                end: start + 1,
496            },
497        }))
498    }
499
500    /// Return next token or error without buffering the entire input.
501    /// `None` means end.
502    #[inline]
503    pub(crate) fn next_token(&mut self) -> Option<Result<Token, LexError>> {
504        self.skip_ws_and_comments();
505        let (i, c) = self.peek()?;
506
507        // Strings
508        if c == '"' {
509            let start = i;
510            self.bump();
511            let mut s = String::new();
512            loop {
513                let Some((j, ch)) = self.bump() else {
514                    return Some(Err(LexError::new(
515                        LexErrorKind::UnterminatedString,
516                        Span {
517                            start,
518                            end: self.src.len(),
519                        },
520                    )));
521                };
522                match ch {
523                    '\\' => {
524                        // precizan span za escape sekvence
525                        let Some((k, esc)) = self.bump() else {
526                            return Some(Err(LexError::new(
527                                LexErrorKind::UnterminatedEscape,
528                                Span {
529                                    start: j,
530                                    end: j + 1,
531                                },
532                            )));
533                        };
534                        let ch = match esc {
535                            'n' => '\n',
536                            't' => '\t',
537                            'r' => '\r',
538                            '"' => '"',
539                            '\\' => '\\',
540                            _ => {
541                                let escape_end = k + esc.len_utf8();
542                                return Some(Err(LexError::new(
543                                    LexErrorKind::InvalidEscape,
544                                    Span {
545                                        start: j,
546                                        end: escape_end,
547                                    },
548                                )));
549                            }
550                        };
551                        s.push(ch);
552                    }
553                    '"' => {
554                        return Some(Ok(Token {
555                            kind: TokenKind::String(s),
556                            span: Span { start, end: j + 1 },
557                        }));
558                    }
559                    _ => s.push(ch),
560                }
561            }
562        }
563
564        // Numbers
565        if c.is_ascii_digit() {
566            match self.lex_number(i) {
567                Ok(tok) => return Some(Ok(tok)),
568                Err(e) => return Some(Err(e)),
569            }
570        }
571
572        // Idents / keywords
573        if c.is_ascii_alphabetic() || c == '_' {
574            let start = i;
575            self.bump();
576            while let Some((_, p)) = self.peek() {
577                if p.is_ascii_alphanumeric() || p == '_' {
578                    self.bump();
579                } else {
580                    break;
581                }
582            }
583            let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
584            let kind = Self::kw_or_ident(&self.src[start..end]);
585            return Some(Ok(Token {
586                kind,
587                span: Span { start, end },
588            }));
589        }
590
591        // Arrow / minus
592        if c == '-' {
593            let start = i;
594            self.bump();
595            if let Some((j, '>')) = self.peek() {
596                self.bump();
597                return Some(Ok(Token {
598                    kind: TokenKind::Arrow,
599                    span: Span { start, end: j + 1 },
600                }));
601            } else {
602                return Some(Ok(Token {
603                    kind: TokenKind::Minus,
604                    span: Span {
605                        start,
606                        end: start + 1,
607                    },
608                }));
609            }
610        }
611
612        // Singles / error
613        let start = i;
614        self.bump();
615        let tk = match c {
616            '(' => TokenKind::LParen,
617            ')' => TokenKind::RParen,
618            '{' => TokenKind::LBrace,
619            '}' => TokenKind::RBrace,
620            '[' => TokenKind::LBracket,
621            ']' => TokenKind::RBracket,
622            ',' => TokenKind::Comma,
623            ':' => TokenKind::Colon,
624            ';' => TokenKind::Semicolon,
625            '=' => TokenKind::Eq,
626            '+' => TokenKind::Plus,
627            '*' => TokenKind::Star,
628            '/' => TokenKind::Slash,
629            other => {
630                return Some(Err(LexError::new(
631                    LexErrorKind::UnexpectedChar,
632                    Span {
633                        start,
634                        end: start + other.len_utf8(),
635                    },
636                )));
637            }
638        };
639        Some(Ok(Token {
640            kind: tk,
641            span: Span {
642                start,
643                end: start + 1,
644            },
645        }))
646    }
647
648    pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
649        let mut out = Vec::new();
650        while let Some(res) = self.next_token() {
651            match res {
652                Ok(tok) => out.push(tok),
653                Err(e) => return Err(e),
654            }
655        }
656        Ok(out)
657    }
658}
659
660/// Tokenize the entire input and return a vector of tokens.
661/// Errors include unterminated strings/escapes, invalid escapes, invalid numbers, and unexpected characters.
662pub fn tokenize(src: &str) -> Result<Vec<Token>, LexError> {
663    Lexer::new(src).tokenize()
664}
665
666/// Tokenize the entire input returning zero-copy tokens that borrow from `src`.
667/// Strings are validated (including escapes) but their contents are *not* unescaped; the returned `&str` is the raw slice between quotes.
668pub fn tokenize_borrowed(src: &str) -> Result<Vec<BorrowedToken<'_>>, LexError> {
669    let mut lx = Lexer::new(src);
670    let mut out = Vec::new();
671    while let Some(res) = lx.next_token_borrowed() {
672        match res {
673            Ok(t) => out.push(t),
674            Err(e) => return Err(e),
675        }
676    }
677    Ok(out)
678}
679
680#[cfg(test)]
681mod tests {
682    use super::*;
683    #[test]
684    fn error_kind_as_str_and_display_messages() {
685        use super::{LexError, LexErrorKind, Span};
686        let span = Span { start: 1, end: 3 };
687        let cases: &[(LexErrorKind, &str, &str)] = &[
688            (
689                LexErrorKind::UnexpectedChar,
690                "unexpected character",
691                "unexpected char",
692            ),
693            (
694                LexErrorKind::UnterminatedString,
695                "unterminated string",
696                "unterminated string",
697            ),
698            (
699                LexErrorKind::UnterminatedEscape,
700                "unterminated escape",
701                "unterminated escape",
702            ),
703            (
704                LexErrorKind::InvalidNumber,
705                "invalid number",
706                "invalid number",
707            ),
708            (
709                LexErrorKind::InvalidEscape,
710                "invalid escape sequence",
711                "invalid escape",
712            ),
713        ];
714
715        for (kind, as_str_msg, display_msg) in cases.iter().cloned() {
716            assert_eq!(kind.as_str(), as_str_msg);
717            let err = LexError::new(kind, span);
718            let rendered = format!("{}", err);
719            assert_eq!(
720                rendered,
721                format!("{} at {}..{}", display_msg, span.start, span.end)
722            );
723            let _e: &dyn std::error::Error = &err;
724            let _dbg = format!("{:?}", err.clone());
725            assert!(!_dbg.is_empty());
726        }
727    }
728    #[test]
729    fn numbers_second_dot_invalid_unless_range() {
730        // second dot with digits on both sides -> invalid number
731        let err = tokenize("123.45.6").expect_err("second dot should be invalid unless range");
732        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
733
734        // but range `1..2` must remain split (we already check UnexpectedChar for the dot itself)
735        let err = tokenize("1..2").expect_err("range dot should not be consumed by number");
736        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
737    }
738
739    #[test]
740    fn numbers_exponent_rules() {
741        // valid exponent forms
742        let toks = tokenize("1e10 1E+10 1.23e-4").unwrap();
743        assert!(toks
744            .iter()
745            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1e10")));
746        assert!(toks
747            .iter()
748            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1E+10")));
749        assert!(toks
750            .iter()
751            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.23e-4")));
752
753        // missing exponent digits is invalid
754        let err = tokenize("1e+").expect_err("missing exponent digits");
755        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
756
757        let err = tokenize("2E-").expect_err("missing exponent digits");
758        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
759    }
760    #[test]
761    fn basic() {
762        let code = r#"
763            // sample
764            let rule greet(name) = "hi, " + name
765            if true and false then x = 1 else x = 2;
766        "#;
767        let toks = tokenize(code).unwrap();
768        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Let)));
769        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Rule)));
770        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::String(_))));
771    }
772
773    #[test]
774    fn numbers_and_ranges() {
775        // valid decimals and exponents
776        let toks = tokenize("1 1.0 1.2e-3").unwrap();
777        assert!(toks
778            .iter()
779            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1")));
780        assert!(toks
781            .iter()
782            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.0")));
783        assert!(toks
784            .iter()
785            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.2e-3")));
786
787        // ensure don't swallow `..` as part of a number
788        let err = tokenize("1..2").expect_err("should error on unexpected '.'");
789        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
790    }
791
792    #[test]
793    fn string_escapes() {
794        // valid escapes
795        let toks = tokenize("\"a\\n\\t\\r\\\\\\\"\"").unwrap();
796        assert!(matches!(toks[0].kind, TokenKind::String(_)));
797
798        // invalid escape
799        let err = tokenize("\"\\x\"").unwrap_err();
800        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
801    }
802
803    #[test]
804    fn numbers_trailing_dot_is_error() {
805        let err = tokenize("0.").expect_err("trailing dot should error");
806        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
807    }
808
809    #[test]
810    fn strings_empty_and_raw_newline_and_escapes() {
811        // empty string
812        let toks = tokenize("\"\"").unwrap();
813        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s.is_empty()));
814
815        // raw newline inside string is allowed by this lexer
816        let toks = tokenize("\"a\nb\"").unwrap();
817        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "a\nb"));
818
819        // complex escapes: quote, backslash, tab -> resulting string is "\t
820        let toks = tokenize("\"\\\"\\\\\t\"").unwrap();
821        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "\"\\\t"));
822    }
823
824    #[test]
825    fn streaming_iterator_matches_tokenize_and_propagates_error() {
826        // identičan izlaz kao tokenize()
827        let src = "let x = 1 + 2\nrule r() = \"ok\"";
828        let vec_tokens = tokenize(src).unwrap();
829        let iter_tokens: Result<Vec<_>, _> = tokenize_iter(src).collect();
830        let iter_tokens = iter_tokens.unwrap();
831        assert_eq!(vec_tokens, iter_tokens);
832
833        // greška: invalid escape — prvi element je Err, posle toga iteracija se završava
834        let src_err = "\"abc\\x\" rest";
835        let mut it = tokenize_iter(src_err);
836        match it.next() {
837            Some(Err(e)) => assert!(matches!(e.kind, LexErrorKind::InvalidEscape)),
838            other => panic!("expected first item to be Err, got {:?}", other),
839        }
840        assert!(it.next().is_none(), "iterator should end after error");
841    }
842
843    #[test]
844    fn invalid_escape_span_is_precise() {
845        // src contents: \"abc\\x\"
846        let src = "\"abc\\x\"";
847        let err = tokenize(src).unwrap_err();
848        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
849        // backslash at idx 4, 'x' at idx 5 -> span 4..6
850        assert_eq!(err.span, Span { start: 4, end: 6 });
851    }
852
853    #[test]
854    fn strings_unterminated_and_unterminated_escape() {
855        // unterminated string
856        let err = tokenize("\"abc").expect_err("unterminated string");
857        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
858
859        // unterminated escape
860        let err = tokenize("\"abc\\").expect_err("unterminated escape");
861        assert!(matches!(err.kind, LexErrorKind::UnterminatedEscape));
862    }
863
864    #[test]
865    fn idents_and_keywords() {
866        let toks = tokenize("let letx _x1").unwrap();
867        assert!(matches!(toks[0].kind, TokenKind::Let));
868        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "letx"));
869        assert!(matches!(toks[2].kind, TokenKind::Ident(ref s) if s == "_x1"));
870    }
871
872    #[test]
873    fn comments_do_not_leak() {
874        let toks = tokenize("foo // comment\nbar").unwrap();
875        assert!(matches!(toks[0].kind, TokenKind::Ident(ref s) if s == "foo"));
876        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "bar"));
877        assert_eq!(toks.len(), 2);
878    }
879
880    #[test]
881    fn unknown_char_errors_with_span() {
882        let err = tokenize("a @ b").expect_err("unknown char '@'");
883        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
884        assert!(err.span.start < err.span.end);
885    }
886
887    #[test]
888    fn golden_small_input() {
889        let src = "let rule f(x) = \"hi\" + x";
890        let toks = tokenize(src).unwrap();
891        use TokenKind::*;
892        let kinds: Vec<&'static str> = toks
893            .iter()
894            .map(|t| match &t.kind {
895                Let => "Let",
896                Rule => "Rule",
897                Ident(s) if s == "f" => "Ident(f)",
898                LParen => "LParen",
899                Ident(s) if s == "x" => "Ident(x)",
900                RParen => "RParen",
901                Eq => "Eq",
902                String(s) if s == "hi" => "String(hi)",
903                Plus => "Plus",
904                Ident(s) if s == "x" => "Ident(x)",
905                other => panic!("unexpected token in golden: {:?}", other),
906            })
907            .collect();
908        assert_eq!(
909            kinds,
910            vec![
911                "Let",
912                "Rule",
913                "Ident(f)",
914                "LParen",
915                "Ident(x)",
916                "RParen",
917                "Eq",
918                "String(hi)",
919                "Plus",
920                "Ident(x)"
921            ]
922        );
923    }
924
925    #[cfg(feature = "serde")]
926    #[test]
927    fn serde_round_trip_token() {
928        let toks = tokenize("let x = 1").unwrap();
929        let json = serde_json::to_string(&toks).unwrap();
930        let back: Vec<Token> = serde_json::from_str(&json).unwrap();
931        assert_eq!(toks, back);
932    }
933
934    #[test]
935    fn borrowed_basic_no_escapes() {
936        let toks = tokenize_borrowed("let x = \"hi\" 123").unwrap();
937        use BorrowedTokenKind as K;
938        assert!(matches!(toks[0].kind, K::Let));
939        assert!(matches!(toks[1].kind, K::Ident("x")));
940        assert!(matches!(toks[3].kind, K::String("hi")));
941        assert!(matches!(toks[4].kind, K::Number("123")));
942    }
943
944    #[test]
945    fn borrowed_string_keeps_escapes() {
946        let toks = tokenize_borrowed("\"a\\n\"").unwrap();
947        use BorrowedTokenKind as K;
948        assert!(matches!(toks[0].kind, K::String("a\\n")));
949    }
950
951    // --- Extra coverage for borrowed API ---
952    #[test]
953    fn borrowed_operators_and_singles() {
954        use BorrowedTokenKind as K;
955        // covers: Arrow vs Minus, and all singles
956        let src = "()->{}[],:;=+ - * / ->";
957        let toks = tokenize_borrowed(src).unwrap();
958        let kinds: Vec<&'static str> = toks
959            .iter()
960            .map(|t| match t.kind {
961                K::LParen => "LParen",
962                K::RParen => "RParen",
963                K::Arrow => "Arrow",
964                K::LBrace => "LBrace",
965                K::RBrace => "RBrace",
966                K::LBracket => "LBracket",
967                K::RBracket => "RBracket",
968                K::Comma => "Comma",
969                K::Colon => "Colon",
970                K::Semicolon => "Semicolon",
971                K::Eq => "Eq",
972                K::Plus => "Plus",
973                K::Minus => "Minus",
974                K::Star => "Star",
975                K::Slash => "Slash",
976                _ => "Other",
977            })
978            .collect();
979        assert_eq!(
980            kinds,
981            vec![
982                "LParen",
983                "RParen",
984                "Arrow",
985                "LBrace",
986                "RBrace",
987                "LBracket",
988                "RBracket",
989                "Comma",
990                "Colon",
991                "Semicolon",
992                "Eq",
993                "Plus",
994                "Minus",
995                "Star",
996                "Slash",
997                "Arrow"
998            ]
999        );
1000    }
1001
1002    #[test]
1003    fn borrowed_keywords_and_idents() {
1004        use BorrowedTokenKind as K;
1005        let toks =
1006            tokenize_borrowed("true false if then else let rule and or foo _bar a1").unwrap();
1007        // Pick some spot checks
1008        assert!(matches!(toks[0].kind, K::True));
1009        assert!(matches!(toks[1].kind, K::False));
1010        assert!(matches!(toks[2].kind, K::If));
1011        assert!(matches!(toks[3].kind, K::Then));
1012        assert!(matches!(toks[4].kind, K::Else));
1013        assert!(matches!(toks[5].kind, K::Let));
1014        assert!(matches!(toks[6].kind, K::Rule));
1015        assert!(matches!(toks[7].kind, K::And));
1016        assert!(matches!(toks[8].kind, K::Or));
1017        assert!(matches!(toks[9].kind, K::Ident("foo")));
1018        assert!(matches!(toks[10].kind, K::Ident("_bar")));
1019        assert!(matches!(toks[11].kind, K::Ident("a1")));
1020    }
1021
1022    #[test]
1023    fn borrowed_comments_skipped() {
1024        use BorrowedTokenKind as K;
1025        let toks = tokenize_borrowed("foo // comment\nbar").unwrap();
1026        assert!(matches!(toks[0].kind, K::Ident("foo")));
1027        assert!(matches!(toks[1].kind, K::Ident("bar")));
1028        assert_eq!(toks.len(), 2);
1029    }
1030
1031    #[test]
1032    fn borrowed_numbers_errors_and_valid() {
1033        use BorrowedTokenKind as K;
1034        // valid
1035        let toks = tokenize_borrowed("1 1.0 1.2e-3").unwrap();
1036        assert!(matches!(toks[0].kind, K::Number("1")));
1037        assert!(matches!(toks[1].kind, K::Number("1.0")));
1038        assert!(matches!(toks[2].kind, K::Number("1.2e-3")));
1039        // invalid: second dot that's not a range
1040        let err = tokenize_borrowed("123.45.6").expect_err("second dot invalid");
1041        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
1042        // invalid: exponent without digits
1043        let err = tokenize_borrowed("1e+").expect_err("missing exponent digits");
1044        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
1045    }
1046
1047    #[test]
1048    fn borrowed_string_errors() {
1049        // invalid escape
1050        let err = tokenize_borrowed("\"\\x\"").unwrap_err();
1051        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
1052        // unterminated string
1053        let err = tokenize_borrowed("\"abc").unwrap_err();
1054        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
1055        // unterminated escape
1056        let err = tokenize_borrowed("\"abc\\").unwrap_err();
1057        assert!(matches!(err.kind, LexErrorKind::UnterminatedEscape));
1058    }
1059
1060    #[test]
1061    fn borrowed_unexpected_char_error() {
1062        let err = tokenize_borrowed("a @ b").expect_err("unexpected '@'");
1063        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
1064        assert!(err.span.start < err.span.end);
1065    }
1066}