// sentience_tokenize/lib.rs

1#![forbid(unsafe_code)]
2//! sentience-tokenize — tiny zero-dep tokenizer for a simple DSL.
3//!
4//! ## Stable API surface (guaranteed across compatible releases)
5//! - `TokenKind`, `Token`, `Span`
6//! - `tokenize(&str) -> Result<Vec<Token>, LexError>`
7//! - `tokenize_iter(&str)` returning an iterator of `Result<Token, LexError>`
8//! - `LineMap` for byte→(line,col) mapping
9//! - `LexError` and `LexErrorKind`
10//!
11//! ## Versioning
12//! - Patch releases fix bugs only; no public API changes.
13//! - Minor releases (`0.x.y` → `0.(x+1).0`) may add new `TokenKind` variants or utilities without removing existing ones.
14//!   Downstream code should avoid exhaustive `match` over `TokenKind`; prefer a `_` catch-all to remain forward-compatible.
15//! - Any removal or change of existing public types/fields will be treated as a breaking change and called out explicitly.
16//!
17//! ## Spec (summary)
18//! - **Identifiers**: `[A-Za-z_][A-Za-z0-9_]*`, ASCII only.
19//! - **Numbers**: decimal integers/decimals with optional exponent (`e|E[+|-]d+`). A single dot is allowed once; `..` is not consumed by numbers.
20//! - **Strings**: double-quoted with escapes `\n \t \r \" \\`. Raw newlines are accepted. Unknown escapes are errors.
21//! - **Comments**: `//` to end-of-line.
22//! - **Delimiters**: `() { } [ ] , : ;`.
23//! - **Operators**: `= + - * / ->`.
24//! - **Keywords**: `true false if then else let rule and or`.
25
26use std::iter::Peekable;
27use std::str::CharIndices;
28
29mod error;
30/// Error type and categories returned by the lexer; stable across minor versions.
31pub use error::{LexError, LexErrorKind};
32mod span;
33/// Utility for mapping byte offsets to `(line, column)`; stable part of the public API.
34pub use span::LineMap;
35mod iter;
36/// Iterator-based API over tokens. Yields `Result<Token, LexError>`.
37pub use iter::{tokenize_iter, Tokens};
38
/// Zero-copy token kind borrowing slices from the source.
/// Note: `String(&str)` contains the *literal contents between quotes* without unquoting; escapes (e.g. `\n`) are left as two characters.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BorrowedTokenKind<'a> {
    /// Identifier slice, e.g. `foo_bar`.
    Ident(&'a str),
    /// Numeric literal slice exactly as written, e.g. `1.2e-3`.
    Number(&'a str),
    /// Raw slice between the quotes; escape sequences are NOT decoded.
    String(&'a str),
    // --- keywords ---
    True,
    False,
    If,
    Then,
    Else,
    Let,
    Rule,
    And,
    Or,
    // --- delimiters ---
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Comma,
    Colon,
    Semicolon,
    // --- operators ---
    Arrow,
    Eq,
    Plus,
    Minus,
    Star,
    Slash,
    Dot,
    DoubleDot,
    At,
}
75
/// A zero-copy token with its [`BorrowedTokenKind`] and [`Span`].
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct BorrowedToken<'a> {
    /// What was lexed; string-like kinds borrow from the original source.
    pub kind: BorrowedTokenKind<'a>,
    /// Byte range of the token in the original source.
    pub span: Span,
}
83
/// Token kind for the DSL. Variant set is stable across minor releases; new variants may be added in minor versions.
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(tag = "type", content = "value"))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum TokenKind {
    /// Identifier, e.g. `foo_bar` (owned copy of the source slice).
    Ident(String),
    /// Numeric literal exactly as written, e.g. `1.2e-3`.
    Number(String),
    /// String literal contents with escapes decoded (`\n` becomes a real newline).
    String(String),
    // --- keywords ---
    True,
    False,
    If,
    Then,
    Else,
    Let,
    Rule,
    And,
    Or,
    // --- delimiters ---
    LParen,
    RParen,
    LBrace,
    RBrace,
    LBracket,
    RBracket,
    Comma,
    Colon,
    Semicolon,
    // --- operators ---
    Arrow,
    Eq,
    Plus,
    Minus,
    Star,
    Slash,
    Dot,
    DoubleDot,
    At,
}
120
/// Byte span `[start, end)` into the original source.
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Span {
    /// Inclusive start byte offset.
    pub start: usize,
    /// Exclusive end byte offset.
    pub end: usize,
}
128
/// A token with its [`TokenKind`] and [`Span`].
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    /// What was lexed; string-like kinds own their text.
    pub kind: TokenKind,
    /// Byte range of the token in the original source.
    pub span: Span,
}
136
/// Streaming lexer. Prefer [`tokenize`] / [`tokenize_iter`] unless you need manual control.
#[derive(Debug)]
pub struct Lexer<'a> {
    /// Full source text; all spans index into this.
    src: &'a str,
    /// Char iterator positioned one character *past* `cur`.
    it: Peekable<CharIndices<'a>>,
    /// Current (not yet consumed) char and its byte offset; `None` at end of input.
    cur: Option<(usize, char)>,
}
144
145impl<'a> Lexer<'a> {
146    pub fn new(src: &'a str) -> Self {
147        let mut it = src.char_indices().peekable();
148        let cur = it.next();
149        Self { src, it, cur }
150    }
151
152    fn bump(&mut self) -> Option<(usize, char)> {
153        let out = self.cur;
154        self.cur = self.it.next();
155        out
156    }
157
    /// Look at the current char (with its byte offset) without consuming it.
    fn peek(&self) -> Option<(usize, char)> {
        self.cur
    }
161
162    fn skip_ws_and_comments(&mut self) {
163        loop {
164            let mut progressed = false;
165            while let Some((_, c)) = self.peek() {
166                if c.is_whitespace() {
167                    self.bump();
168                    progressed = true;
169                } else {
170                    break;
171                }
172            }
173            if let Some((_, '/')) = self.peek() {
174                let mut clone = self.it.clone();
175                if let Some((_, '/')) = clone.next() {
176                    self.bump();
177                    self.bump();
178                    while let Some((_, c)) = self.peek() {
179                        if c == '\n' {
180                            break;
181                        }
182                        self.bump();
183                    }
184                    continue;
185                }
186            }
187            if !progressed {
188                break;
189            }
190        }
191    }
192
193    fn kw_or_ident(s: &str) -> TokenKind {
194        match s {
195            "true" => TokenKind::True,
196            "false" => TokenKind::False,
197            "if" => TokenKind::If,
198            "then" => TokenKind::Then,
199            "else" => TokenKind::Else,
200            "let" => TokenKind::Let,
201            "rule" => TokenKind::Rule,
202            "and" => TokenKind::And,
203            "or" => TokenKind::Or,
204            _ => TokenKind::Ident(s.to_string()),
205        }
206    }
207
208    fn lex_number(&mut self, start: usize) -> Result<Token, LexError> {
209        let mut seen_dot = false;
210        let mut seen_exp = false;
211        let mut last_was_dot = false;
212        self.bump(); // consume first digit
213
214        while let Some((idx, ch)) = self.peek() {
215            if ch.is_ascii_digit() {
216                self.bump();
217                last_was_dot = false;
218            } else if ch == '.' {
219                if seen_dot {
220                    // If we just consumed a dot and see another dot, it's a range `..` -> stop number here
221                    if last_was_dot {
222                        break;
223                    }
224                    // Otherwise, this is a second dot in the same numeric literal -> invalid number
225                    return Err(LexError::new(
226                        LexErrorKind::InvalidNumber,
227                        Span {
228                            start,
229                            end: idx + ch.len_utf8(),
230                        },
231                    ));
232                }
233                // lookahead: do not consume if `..` or if no digit follows ("0.")
234                let mut clone = self.it.clone();
235                if let Some((_, next)) = clone.next() {
236                    if next == '.' {
237                        break;
238                    }
239                    if !next.is_ascii_digit() {
240                        break;
241                    }
242                } else {
243                    break;
244                }
245                seen_dot = true;
246                last_was_dot = true;
247                self.bump();
248            } else if (ch == 'e' || ch == 'E') && !seen_exp {
249                seen_exp = true;
250                last_was_dot = false;
251                self.bump();
252                if let Some((_, sign)) = self.peek() {
253                    if sign == '+' || sign == '-' {
254                        self.bump();
255                    }
256                }
257                match self.peek() {
258                    Some((_, d)) if d.is_ascii_digit() => {}
259                    _ => {
260                        return Err(LexError::new(
261                            LexErrorKind::InvalidNumber,
262                            Span {
263                                start,
264                                end: idx + ch.len_utf8(),
265                            },
266                        ));
267                    }
268                }
269            } else {
270                break;
271            }
272        }
273
274        let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
275        Ok(Token {
276            kind: TokenKind::Number(self.src[start..end].to_string()),
277            span: Span { start, end },
278        })
279    }
280
281    fn lex_number_borrowed(&mut self, start: usize) -> Result<BorrowedToken<'a>, LexError> {
282        let mut seen_dot = false;
283        let mut seen_exp = false;
284        let mut last_was_dot = false;
285        self.bump(); // consume first digit
286
287        while let Some((idx, ch)) = self.peek() {
288            if ch.is_ascii_digit() {
289                self.bump();
290                last_was_dot = false;
291            } else if ch == '.' {
292                if seen_dot {
293                    if last_was_dot {
294                        break;
295                    }
296                    return Err(LexError::new(
297                        LexErrorKind::InvalidNumber,
298                        Span {
299                            start,
300                            end: idx + ch.len_utf8(),
301                        },
302                    ));
303                }
304                let mut clone = self.it.clone();
305                if let Some((_, next)) = clone.next() {
306                    if next == '.' {
307                        break;
308                    }
309                    if !next.is_ascii_digit() {
310                        break;
311                    }
312                } else {
313                    break;
314                }
315                seen_dot = true;
316                last_was_dot = true;
317                self.bump();
318            } else if (ch == 'e' || ch == 'E') && !seen_exp {
319                seen_exp = true;
320                last_was_dot = false;
321                self.bump();
322                if let Some((_, sign)) = self.peek() {
323                    if sign == '+' || sign == '-' {
324                        self.bump();
325                    }
326                }
327                match self.peek() {
328                    Some((_, d)) if d.is_ascii_digit() => {}
329                    _ => {
330                        return Err(LexError::new(
331                            LexErrorKind::InvalidNumber,
332                            Span {
333                                start,
334                                end: idx + ch.len_utf8(),
335                            },
336                        ))
337                    }
338                }
339            } else {
340                break;
341            }
342        }
343
344        let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
345        Ok(BorrowedToken {
346            kind: BorrowedTokenKind::Number(&self.src[start..end]),
347            span: Span { start, end },
348        })
349    }
350
351    /// Return next borrowed token or error without allocations. Strings validate escapes but keep them as-is in the slice.
352    fn next_token_borrowed(&mut self) -> Option<Result<BorrowedToken<'a>, LexError>> {
353        self.skip_ws_and_comments();
354        let (i, c) = self.peek()?;
355
356        // Strings: validate and borrow the raw contents between quotes
357        if c == '"' {
358            let start = i; // points at opening quote
359            self.bump();
360            let content_start = start + 1;
361            loop {
362                let Some((j, ch)) = self.bump() else {
363                    return Some(Err(LexError::new(
364                        LexErrorKind::UnterminatedString,
365                        Span {
366                            start,
367                            end: self.src.len(),
368                        },
369                    )));
370                };
371                match ch {
372                    '\\' => {
373                        // require a following valid escape char, but do not build the string
374                        let Some((k, esc)) = self.bump() else {
375                            return Some(Err(LexError::new(
376                                LexErrorKind::UnterminatedEscape,
377                                Span {
378                                    start: j,
379                                    end: j + 1,
380                                },
381                            )));
382                        };
383                        match esc {
384                            'n' | 't' | 'r' | '"' | '\\' => {
385                                let _ = k;
386                            }
387                            _ => {
388                                let escape_end = k + esc.len_utf8();
389                                return Some(Err(LexError::new(
390                                    LexErrorKind::InvalidEscape,
391                                    Span {
392                                        start: j,
393                                        end: escape_end,
394                                    },
395                                )));
396                            }
397                        }
398                    }
399                    '"' => {
400                        let end = j + 1; // closing quote included in token span
401                        return Some(Ok(BorrowedToken {
402                            kind: BorrowedTokenKind::String(&self.src[content_start..j]),
403                            span: Span { start, end },
404                        }));
405                    }
406                    _ => {}
407                }
408            }
409        }
410
411        // Double dot (must come before numbers to handle 1..2 correctly)
412        if c == '.' {
413            let start = i;
414            self.bump();
415            if let Some((j, '.')) = self.peek() {
416                self.bump();
417                return Some(Ok(BorrowedToken {
418                    kind: BorrowedTokenKind::DoubleDot,
419                    span: Span { start, end: j + 1 },
420                }));
421            } else {
422                return Some(Ok(BorrowedToken {
423                    kind: BorrowedTokenKind::Dot,
424                    span: Span {
425                        start,
426                        end: start + 1,
427                    },
428                }));
429            }
430        }
431
432        // Numbers
433        if c.is_ascii_digit() {
434            match self.lex_number_borrowed(i) {
435                Ok(tok) => return Some(Ok(tok)),
436                Err(e) => return Some(Err(e)),
437            }
438        }
439
440        // Idents / keywords
441        if c.is_ascii_alphabetic() || c == '_' {
442            let start = i;
443            self.bump();
444            while let Some((_, p)) = self.peek() {
445                if p.is_ascii_alphanumeric() || p == '_' {
446                    self.bump();
447                } else {
448                    break;
449                }
450            }
451            let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
452            let kind = match &self.src[start..end] {
453                "true" => BorrowedTokenKind::True,
454                "false" => BorrowedTokenKind::False,
455                "if" => BorrowedTokenKind::If,
456                "then" => BorrowedTokenKind::Then,
457                "else" => BorrowedTokenKind::Else,
458                "let" => BorrowedTokenKind::Let,
459                "rule" => BorrowedTokenKind::Rule,
460                "and" => BorrowedTokenKind::And,
461                "or" => BorrowedTokenKind::Or,
462                s => BorrowedTokenKind::Ident(s),
463            };
464            return Some(Ok(BorrowedToken {
465                kind,
466                span: Span { start, end },
467            }));
468        }
469
470        // Arrow / minus
471        if c == '-' {
472            let start = i;
473            self.bump();
474            if let Some((j, '>')) = self.peek() {
475                self.bump();
476                return Some(Ok(BorrowedToken {
477                    kind: BorrowedTokenKind::Arrow,
478                    span: Span { start, end: j + 1 },
479                }));
480            } else {
481                return Some(Ok(BorrowedToken {
482                    kind: BorrowedTokenKind::Minus,
483                    span: Span {
484                        start,
485                        end: start + 1,
486                    },
487                }));
488            }
489        }
490
491        // Singles / error
492        let start = i;
493        self.bump();
494        let tk = match c {
495            '(' => BorrowedTokenKind::LParen,
496            ')' => BorrowedTokenKind::RParen,
497            '{' => BorrowedTokenKind::LBrace,
498            '}' => BorrowedTokenKind::RBrace,
499            '[' => BorrowedTokenKind::LBracket,
500            ']' => BorrowedTokenKind::RBracket,
501            ',' => BorrowedTokenKind::Comma,
502            ':' => BorrowedTokenKind::Colon,
503            ';' => BorrowedTokenKind::Semicolon,
504            '=' => BorrowedTokenKind::Eq,
505            '+' => BorrowedTokenKind::Plus,
506            '*' => BorrowedTokenKind::Star,
507            '/' => BorrowedTokenKind::Slash,
508            '@' => BorrowedTokenKind::At,
509            other => {
510                return Some(Err(LexError::new(
511                    LexErrorKind::UnexpectedChar,
512                    Span {
513                        start,
514                        end: start + other.len_utf8(),
515                    },
516                )));
517            }
518        };
519        Some(Ok(BorrowedToken {
520            kind: tk,
521            span: Span {
522                start,
523                end: start + 1,
524            },
525        }))
526    }
527
    /// Return next token or error without buffering the entire input.
    /// `None` means end.
    #[inline]
    pub(crate) fn next_token(&mut self) -> Option<Result<Token, LexError>> {
        self.skip_ws_and_comments();
        let (i, c) = self.peek()?;

        // Strings: escapes are decoded into an owned buffer
        if c == '"' {
            let start = i; // points at opening quote
            self.bump();
            let mut s = String::new();
            loop {
                let Some((j, ch)) = self.bump() else {
                    // ran off the end without a closing quote
                    return Some(Err(LexError::new(
                        LexErrorKind::UnterminatedString,
                        Span {
                            start,
                            end: self.src.len(),
                        },
                    )));
                };
                match ch {
                    '\\' => {
                        // precise span for escape sequences
                        let Some((k, esc)) = self.bump() else {
                            return Some(Err(LexError::new(
                                LexErrorKind::UnterminatedEscape,
                                Span {
                                    start: j,
                                    end: j + 1,
                                },
                            )));
                        };
                        let ch = match esc {
                            'n' => '\n',
                            't' => '\t',
                            'r' => '\r',
                            '"' => '"',
                            '\\' => '\\',
                            _ => {
                                // span covers the backslash and the bad escape char
                                let escape_end = k + esc.len_utf8();
                                return Some(Err(LexError::new(
                                    LexErrorKind::InvalidEscape,
                                    Span {
                                        start: j,
                                        end: escape_end,
                                    },
                                )));
                            }
                        };
                        s.push(ch);
                    }
                    '"' => {
                        // closing quote included in token span
                        return Some(Ok(Token {
                            kind: TokenKind::String(s),
                            span: Span { start, end: j + 1 },
                        }));
                    }
                    // raw characters (including newlines) are accepted verbatim
                    _ => s.push(ch),
                }
            }
        }

        // Double dot (must come before numbers to handle 1..2 correctly)
        if c == '.' {
            let start = i;
            self.bump();
            if let Some((j, '.')) = self.peek() {
                self.bump();
                return Some(Ok(Token {
                    kind: TokenKind::DoubleDot,
                    span: Span { start, end: j + 1 },
                }));
            } else {
                return Some(Ok(Token {
                    kind: TokenKind::Dot,
                    span: Span {
                        start,
                        end: start + 1,
                    },
                }));
            }
        }

        // Numbers
        if c.is_ascii_digit() {
            match self.lex_number(i) {
                Ok(tok) => return Some(Ok(tok)),
                Err(e) => return Some(Err(e)),
            }
        }

        // Idents / keywords (ASCII only per the spec)
        if c.is_ascii_alphabetic() || c == '_' {
            let start = i;
            self.bump();
            while let Some((_, p)) = self.peek() {
                if p.is_ascii_alphanumeric() || p == '_' {
                    self.bump();
                } else {
                    break;
                }
            }
            let end = self.peek().map(|(j, _)| j).unwrap_or(self.src.len());
            let kind = Self::kw_or_ident(&self.src[start..end]);
            return Some(Ok(Token {
                kind,
                span: Span { start, end },
            }));
        }

        // Arrow / minus
        if c == '-' {
            let start = i;
            self.bump();
            if let Some((j, '>')) = self.peek() {
                self.bump();
                return Some(Ok(Token {
                    kind: TokenKind::Arrow,
                    span: Span { start, end: j + 1 },
                }));
            } else {
                return Some(Ok(Token {
                    kind: TokenKind::Minus,
                    span: Span {
                        start,
                        end: start + 1,
                    },
                }));
            }
        }

        // Singles / error (all single tokens here are one ASCII byte wide)
        let start = i;
        self.bump();
        let tk = match c {
            '(' => TokenKind::LParen,
            ')' => TokenKind::RParen,
            '{' => TokenKind::LBrace,
            '}' => TokenKind::RBrace,
            '[' => TokenKind::LBracket,
            ']' => TokenKind::RBracket,
            ',' => TokenKind::Comma,
            ':' => TokenKind::Colon,
            ';' => TokenKind::Semicolon,
            '=' => TokenKind::Eq,
            '+' => TokenKind::Plus,
            '*' => TokenKind::Star,
            '/' => TokenKind::Slash,
            '@' => TokenKind::At,
            other => {
                return Some(Err(LexError::new(
                    LexErrorKind::UnexpectedChar,
                    Span {
                        start,
                        end: start + other.len_utf8(),
                    },
                )));
            }
        };
        Some(Ok(Token {
            kind: tk,
            span: Span {
                start,
                end: start + 1,
            },
        }))
    }
697
698    pub fn tokenize(mut self) -> Result<Vec<Token>, LexError> {
699        let mut out = Vec::new();
700        while let Some(res) = self.next_token() {
701            match res {
702                Ok(tok) => out.push(tok),
703                Err(e) => return Err(e),
704            }
705        }
706        Ok(out)
707    }
708}
709
710/// Tokenize the entire input and return a vector of tokens.
711/// Errors include unterminated strings/escapes, invalid escapes, invalid numbers, and unexpected characters.
712pub fn tokenize(src: &str) -> Result<Vec<Token>, LexError> {
713    Lexer::new(src).tokenize()
714}
715
716/// Tokenize the entire input returning zero-copy tokens that borrow from `src`.
717/// Strings are validated (including escapes) but their contents are *not* unescaped; the returned `&str` is the raw slice between quotes.
718pub fn tokenize_borrowed(src: &str) -> Result<Vec<BorrowedToken<'_>>, LexError> {
719    let mut lx = Lexer::new(src);
720    let mut out = Vec::new();
721    while let Some(res) = lx.next_token_borrowed() {
722        match res {
723            Ok(t) => out.push(t),
724            Err(e) => return Err(e),
725        }
726    }
727    Ok(out)
728}
729
730#[cfg(test)]
731mod tests {
732    use super::*;
    // Verifies that `LexErrorKind::as_str` and `LexError`'s `Display` output
    // match the documented message strings, and that `LexError` is usable as a
    // `std::error::Error` and supports `Clone`/`Debug`.
    #[test]
    fn error_kind_as_str_and_display_messages() {
        use super::{LexError, LexErrorKind, Span};
        let span = Span { start: 1, end: 3 };
        // (kind, expected as_str() text, expected Display message prefix)
        let cases: &[(LexErrorKind, &str, &str)] = &[
            (
                LexErrorKind::UnexpectedChar,
                "unexpected character",
                "unexpected char",
            ),
            (
                LexErrorKind::UnterminatedString,
                "unterminated string",
                "unterminated string",
            ),
            (
                LexErrorKind::UnterminatedEscape,
                "unterminated escape",
                "unterminated escape",
            ),
            (
                LexErrorKind::InvalidNumber,
                "invalid number",
                "invalid number",
            ),
            (
                LexErrorKind::InvalidEscape,
                "invalid escape sequence",
                "invalid escape",
            ),
        ];

        for (kind, as_str_msg, display_msg) in cases.iter().cloned() {
            assert_eq!(kind.as_str(), as_str_msg);
            let err = LexError::new(kind, span);
            let rendered = format!("{}", err);
            assert_eq!(
                rendered,
                format!("{} at {}..{}", display_msg, span.start, span.end)
            );
            // type-check: LexError implements std::error::Error
            let _e: &dyn std::error::Error = &err;
            let _dbg = format!("{:?}", err.clone());
            assert!(!_dbg.is_empty());
        }
    }
    // A second decimal point inside one literal is an error, but `..` right
    // after a number is the range operator and must not be swallowed.
    #[test]
    fn numbers_second_dot_invalid_unless_range() {
        // second dot with digits on both sides -> invalid number
        let err = tokenize("123.45.6").expect_err("second dot should be invalid unless range");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        // range `1..2` should be tokenized as separate tokens
        let toks = tokenize("1..2").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Number(ref s) if s == "1"));
        assert!(matches!(toks[1].kind, TokenKind::DoubleDot));
        assert!(matches!(toks[2].kind, TokenKind::Number(ref s) if s == "2"));
    }
790
    // Exponents accept an optional sign but require at least one digit.
    #[test]
    fn numbers_exponent_rules() {
        // valid exponent forms
        let toks = tokenize("1e10 1E+10 1.23e-4").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1e10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1E+10")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.23e-4")));

        // missing exponent digits is invalid
        let err = tokenize("1e+").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));

        let err = tokenize("2E-").expect_err("missing exponent digits");
        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
    }
    // Smoke test: comments are skipped and keywords/strings are recognized.
    #[test]
    fn basic() {
        let code = r#"
            // sample
            let rule greet(name) = "hi, " + name
            if true and false then x = 1 else x = 2;
        "#;
        let toks = tokenize(code).unwrap();
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Let)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::Rule)));
        assert!(toks.iter().any(|t| matches!(t.kind, TokenKind::String(_))));
    }
824
    // `.`, `..` and `@` tokens are produced by the owned API.
    #[test]
    fn new_token_types_owned() {
        use TokenKind as K;
        // tokens: a Dot b DoubleDot c At d
        let toks = tokenize("a.b ..c @d").unwrap();
        assert!(matches!(toks[1].kind, K::Dot));
        assert!(matches!(toks[3].kind, K::DoubleDot));
        assert!(matches!(toks[5].kind, K::At));
    }
833
    // Decimal and exponent literals lex whole; `..` is never absorbed by a number.
    #[test]
    fn numbers_and_ranges() {
        // valid decimals and exponents
        let toks = tokenize("1 1.0 1.2e-3").unwrap();
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.0")));
        assert!(toks
            .iter()
            .any(|t| matches!(t.kind, TokenKind::Number(ref s) if s == "1.2e-3")));

        // ensure don't swallow `..` as part of a number
        let toks = tokenize("1..2").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Number(ref s) if s == "1"));
        assert!(matches!(toks[1].kind, TokenKind::DoubleDot));
        assert!(matches!(toks[2].kind, TokenKind::Number(ref s) if s == "2"));
    }
854
    // The documented escapes are accepted; anything else is InvalidEscape.
    #[test]
    fn string_escapes() {
        // valid escapes
        let toks = tokenize("\"a\\n\\t\\r\\\\\\\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(_)));

        // invalid escape
        let err = tokenize("\"\\x\"").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
    }
865
    // A dot with no digit after it is not part of the number: `0.` lexes as
    // Number("0") followed by a Dot token.
    #[test]
    fn numbers_trailing_dot_is_error() {
        let toks = tokenize("0.").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::Number(ref s) if s == "0"));
        assert!(matches!(toks[1].kind, TokenKind::Dot));
    }
872
    // Dot and double-dot tokens interleave correctly with identifiers.
    #[test]
    fn dot_tokens_work() {
        // tokens: a Dot b DoubleDot c d Dot e
        let toks = tokenize("a.b ..c d.e").unwrap();
        use TokenKind as K;
        assert!(matches!(toks[1].kind, K::Dot));
        assert!(matches!(toks[3].kind, K::DoubleDot));
        assert!(matches!(toks[6].kind, K::Dot));
    }
881
    #[test]
    fn strings_empty_and_raw_newline_and_escapes() {
        // `""` lexes to a String token with empty contents.
        let toks = tokenize("\"\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s.is_empty()));

        // A raw (unescaped) newline inside a string literal is accepted by
        // this lexer and preserved in the token value.
        let toks = tokenize("\"a\nb\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "a\nb"));

        // DSL source is `"\"\\<TAB>"` — escaped quote, escaped backslash,
        // raw tab. The assertion shows the owned token carries the decoded
        // value: quote, backslash, tab.
        let toks = tokenize("\"\\\"\\\\\t\"").unwrap();
        assert!(matches!(toks[0].kind, TokenKind::String(ref s) if s == "\"\\\t"));
    }
896
    #[test]
    fn streaming_iterator_matches_tokenize_and_propagates_error() {
        // Output must be identical to tokenize().
        let src = "let x = 1 + 2\nrule r() = \"ok\"";
        let vec_tokens = tokenize(src).unwrap();
        let iter_tokens: Result<Vec<_>, _> = tokenize_iter(src).collect();
        let iter_tokens = iter_tokens.unwrap();
        assert_eq!(vec_tokens, iter_tokens);

        // Error case: invalid escape — the first item is Err, and iteration
        // terminates after the error (no tokens for the trailing `rest`).
        let src_err = "\"abc\\x\" rest";
        let mut it = tokenize_iter(src_err);
        match it.next() {
            Some(Err(e)) => assert!(matches!(e.kind, LexErrorKind::InvalidEscape)),
            other => panic!("expected first item to be Err, got {:?}", other),
        }
        assert!(it.next().is_none(), "iterator should end after error");
    }
915
916    #[test]
917    fn invalid_escape_span_is_precise() {
918        // src contents: \"abc\\x\"
919        let src = "\"abc\\x\"";
920        let err = tokenize(src).unwrap_err();
921        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
922        // backslash at idx 4, 'x' at idx 5 -> span 4..6
923        assert_eq!(err.span, Span { start: 4, end: 6 });
924    }
925
926    #[test]
927    fn strings_unterminated_and_unterminated_escape() {
928        // unterminated string
929        let err = tokenize("\"abc").expect_err("unterminated string");
930        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
931
932        // unterminated escape
933        let err = tokenize("\"abc\\").expect_err("unterminated escape");
934        assert!(matches!(err.kind, LexErrorKind::UnterminatedEscape));
935    }
936
937    #[test]
938    fn idents_and_keywords() {
939        let toks = tokenize("let letx _x1").unwrap();
940        assert!(matches!(toks[0].kind, TokenKind::Let));
941        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "letx"));
942        assert!(matches!(toks[2].kind, TokenKind::Ident(ref s) if s == "_x1"));
943    }
944
945    #[test]
946    fn comments_do_not_leak() {
947        let toks = tokenize("foo // comment\nbar").unwrap();
948        assert!(matches!(toks[0].kind, TokenKind::Ident(ref s) if s == "foo"));
949        assert!(matches!(toks[1].kind, TokenKind::Ident(ref s) if s == "bar"));
950        assert_eq!(toks.len(), 2);
951    }
952
953    #[test]
954    fn unknown_char_errors_with_span() {
955        let err = tokenize("a # b").expect_err("unknown char '#'");
956        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
957        assert!(err.span.start < err.span.end);
958    }
959
960    #[test]
961    fn golden_small_input() {
962        let src = "let rule f(x) = \"hi\" + x";
963        let toks = tokenize(src).unwrap();
964        use TokenKind::*;
965        let kinds: Vec<&'static str> = toks
966            .iter()
967            .map(|t| match &t.kind {
968                Let => "Let",
969                Rule => "Rule",
970                Ident(s) if s == "f" => "Ident(f)",
971                LParen => "LParen",
972                Ident(s) if s == "x" => "Ident(x)",
973                RParen => "RParen",
974                Eq => "Eq",
975                String(s) if s == "hi" => "String(hi)",
976                Plus => "Plus",
977                Ident(s) if s == "x" => "Ident(x)",
978                other => panic!("unexpected token in golden: {:?}", other),
979            })
980            .collect();
981        assert_eq!(
982            kinds,
983            vec![
984                "Let",
985                "Rule",
986                "Ident(f)",
987                "LParen",
988                "Ident(x)",
989                "RParen",
990                "Eq",
991                "String(hi)",
992                "Plus",
993                "Ident(x)"
994            ]
995        );
996    }
997
998    #[cfg(feature = "serde")]
999    #[test]
1000    fn serde_round_trip_token() {
1001        let toks = tokenize("let x = 1").unwrap();
1002        let json = serde_json::to_string(&toks).unwrap();
1003        let back: Vec<Token> = serde_json::from_str(&json).unwrap();
1004        assert_eq!(toks, back);
1005    }
1006
1007    #[test]
1008    fn borrowed_basic_no_escapes() {
1009        let toks = tokenize_borrowed("let x = \"hi\" 123").unwrap();
1010        use BorrowedTokenKind as K;
1011        assert!(matches!(toks[0].kind, K::Let));
1012        assert!(matches!(toks[1].kind, K::Ident("x")));
1013        assert!(matches!(toks[3].kind, K::String("hi")));
1014        assert!(matches!(toks[4].kind, K::Number("123")));
1015    }
1016
    #[test]
    fn borrowed_string_keeps_escapes() {
        // The borrowed API does not unescape: the token for `"a\n"` keeps
        // the two-character sequence backslash + `n` verbatim.
        let toks = tokenize_borrowed("\"a\\n\"").unwrap();
        use BorrowedTokenKind as K;
        assert!(matches!(toks[0].kind, K::String("a\\n")));
    }
1023
1024    // --- Extra coverage for borrowed API ---
1025    #[test]
1026    fn borrowed_operators_and_singles() {
1027        use BorrowedTokenKind as K;
1028        // covers: Arrow vs Minus, and all singles
1029        let src = "()->{}[],:;=+ - * / ->";
1030        let toks = tokenize_borrowed(src).unwrap();
1031        let kinds: Vec<&'static str> = toks
1032            .iter()
1033            .map(|t| match t.kind {
1034                K::LParen => "LParen",
1035                K::RParen => "RParen",
1036                K::Arrow => "Arrow",
1037                K::LBrace => "LBrace",
1038                K::RBrace => "RBrace",
1039                K::LBracket => "LBracket",
1040                K::RBracket => "RBracket",
1041                K::Comma => "Comma",
1042                K::Colon => "Colon",
1043                K::Semicolon => "Semicolon",
1044                K::Eq => "Eq",
1045                K::Plus => "Plus",
1046                K::Minus => "Minus",
1047                K::Star => "Star",
1048                K::Slash => "Slash",
1049                _ => "Other",
1050            })
1051            .collect();
1052        assert_eq!(
1053            kinds,
1054            vec![
1055                "LParen",
1056                "RParen",
1057                "Arrow",
1058                "LBrace",
1059                "RBrace",
1060                "LBracket",
1061                "RBracket",
1062                "Comma",
1063                "Colon",
1064                "Semicolon",
1065                "Eq",
1066                "Plus",
1067                "Minus",
1068                "Star",
1069                "Slash",
1070                "Arrow"
1071            ]
1072        );
1073    }
1074
1075    #[test]
1076    fn borrowed_keywords_and_idents() {
1077        use BorrowedTokenKind as K;
1078        let toks =
1079            tokenize_borrowed("true false if then else let rule and or foo _bar a1").unwrap();
1080        // Pick some spot checks
1081        assert!(matches!(toks[0].kind, K::True));
1082        assert!(matches!(toks[1].kind, K::False));
1083        assert!(matches!(toks[2].kind, K::If));
1084        assert!(matches!(toks[3].kind, K::Then));
1085        assert!(matches!(toks[4].kind, K::Else));
1086        assert!(matches!(toks[5].kind, K::Let));
1087        assert!(matches!(toks[6].kind, K::Rule));
1088        assert!(matches!(toks[7].kind, K::And));
1089        assert!(matches!(toks[8].kind, K::Or));
1090        assert!(matches!(toks[9].kind, K::Ident("foo")));
1091        assert!(matches!(toks[10].kind, K::Ident("_bar")));
1092        assert!(matches!(toks[11].kind, K::Ident("a1")));
1093    }
1094
1095    #[test]
1096    fn borrowed_comments_skipped() {
1097        use BorrowedTokenKind as K;
1098        let toks = tokenize_borrowed("foo // comment\nbar").unwrap();
1099        assert!(matches!(toks[0].kind, K::Ident("foo")));
1100        assert!(matches!(toks[1].kind, K::Ident("bar")));
1101        assert_eq!(toks.len(), 2);
1102    }
1103
1104    #[test]
1105    fn borrowed_numbers_errors_and_valid() {
1106        use BorrowedTokenKind as K;
1107        // valid
1108        let toks = tokenize_borrowed("1 1.0 1.2e-3").unwrap();
1109        assert!(matches!(toks[0].kind, K::Number("1")));
1110        assert!(matches!(toks[1].kind, K::Number("1.0")));
1111        assert!(matches!(toks[2].kind, K::Number("1.2e-3")));
1112        // invalid: second dot that's not a range
1113        let err = tokenize_borrowed("123.45.6").expect_err("second dot invalid");
1114        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
1115        // invalid: exponent without digits
1116        let err = tokenize_borrowed("1e+").expect_err("missing exponent digits");
1117        assert!(matches!(err.kind, LexErrorKind::InvalidNumber));
1118    }
1119
    #[test]
    fn borrowed_string_errors() {
        // `\x` is not a recognized escape -> InvalidEscape.
        let err = tokenize_borrowed("\"\\x\"").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::InvalidEscape));
        // Missing closing quote -> UnterminatedString.
        let err = tokenize_borrowed("\"abc").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedString));
        // Backslash at end of input -> UnterminatedEscape (distinct kind).
        let err = tokenize_borrowed("\"abc\\").unwrap_err();
        assert!(matches!(err.kind, LexErrorKind::UnterminatedEscape));
    }
1132
1133    #[test]
1134    fn borrowed_unexpected_char_error() {
1135        let err = tokenize_borrowed("a # b").expect_err("unexpected '#'");
1136        assert!(matches!(err.kind, LexErrorKind::UnexpectedChar));
1137        assert!(err.span.start < err.span.end);
1138    }
1139
1140    #[test]
1141    fn new_token_types() {
1142        use BorrowedTokenKind as K;
1143        let toks = tokenize_borrowed("a.b ..c @d").unwrap();
1144        assert!(matches!(toks[1].kind, K::Dot));
1145        assert!(matches!(toks[3].kind, K::DoubleDot));
1146        assert!(matches!(toks[5].kind, K::At));
1147    }
1148
1149    #[test]
1150    fn dot_vs_double_dot() {
1151        use BorrowedTokenKind as K;
1152        let toks = tokenize_borrowed("a.b ..c d.e").unwrap();
1153        assert!(matches!(toks[1].kind, K::Dot));
1154        assert!(matches!(toks[3].kind, K::DoubleDot));
1155        assert!(matches!(toks[6].kind, K::Dot));
1156    }
1157}