// icl_core/parser/tokenizer.rs

//! ICL Tokenizer — converts ICL text into token stream
//!
//! Handles: keywords, identifiers, string literals, integer/float literals,
//! ISO8601 timestamps, UUIDs, symbols (braces, colons, commas, brackets).
//! Comments (//) are discarded.
//!
//! Guarantees:
//! - Deterministic: same input always produces same token stream
//! - Complete error reporting: line:column for every error
/// The lexical tokens produced by the ICL tokenizer.
///
/// Covers section keywords, type keywords, literal values, punctuation
/// symbols, identifiers, and the end-of-input marker.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // ── Section keywords ───────────────────────────────
    Contract,
    Identity,
    PurposeStatement,
    DataSemantics,
    BehavioralSemantics,
    ExecutionConstraints,
    HumanMachineContract,
    Extensions,

    // ── Type keywords ──────────────────────────────────
    IntegerType,
    FloatType,
    StringType,
    BooleanType,
    Iso8601Type,
    UuidType,
    ArrayType,
    MapType,
    ObjectType,
    EnumType,

    // ── Literal values ─────────────────────────────────
    StringLiteral(String),
    IntegerLiteral(i64),
    FloatLiteral(f64),
    BooleanLiteral(bool),

    // ── Punctuation ────────────────────────────────────
    LBrace,   // `{`
    RBrace,   // `}`
    LBracket, // `[`
    RBracket, // `]`
    LAngle,   // `<`
    RAngle,   // `>`
    Colon,    // `:`
    Comma,    // `,`
    Equals,   // `=`

    // ── Everything else ────────────────────────────────
    Identifier(String),
    Eof,
}
57
/// A location in the source text, used for diagnostics.
///
/// `line` and `column` are 1-based (human-readable); `offset` is the
/// 0-based character index into the tokenizer's input.
#[derive(Debug, Clone, PartialEq)]
pub struct Span {
    pub line: usize,
    pub column: usize,
    pub offset: usize,
}
65
66impl std::fmt::Display for Span {
67    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
68        write!(f, "{}:{}", self.line, self.column)
69    }
70}
71
72/// Token with source position
73#[derive(Debug, Clone, PartialEq)]
74pub struct SpannedToken {
75    pub token: Token,
76    pub span: Span,
77}
78
/// Streaming tokenizer over ICL source text.
///
/// Input is held as a char vector so multi-byte characters index cleanly;
/// `line`/`column` track the 1-based human-readable position while
/// `position` is the 0-based char offset.
pub struct Tokenizer {
    input: Vec<char>,
    position: usize,
    line: usize,
    column: usize,
}
86
87impl Tokenizer {
88    /// Create a new tokenizer for the given input text
89    pub fn new(text: &str) -> Self {
90        Tokenizer {
91            input: text.chars().collect(),
92            position: 0,
93            line: 1,
94            column: 1,
95        }
96    }
97
98    /// Tokenize the entire input into a stream of spanned tokens
99    pub fn tokenize(&mut self) -> crate::Result<Vec<SpannedToken>> {
100        let mut tokens = Vec::new();
101
102        loop {
103            self.skip_whitespace_and_comments();
104
105            if self.is_at_end() {
106                tokens.push(SpannedToken {
107                    token: Token::Eof,
108                    span: self.current_span(),
109                });
110                break;
111            }
112
113            let token = self.next_token()?;
114            tokens.push(token);
115        }
116
117        Ok(tokens)
118    }
119
120    // ── Character helpers ──────────────────────────────────
121
122    fn is_at_end(&self) -> bool {
123        self.position >= self.input.len()
124    }
125
126    fn peek(&self) -> Option<char> {
127        self.input.get(self.position).copied()
128    }
129
130    fn peek_ahead(&self, offset: usize) -> Option<char> {
131        self.input.get(self.position + offset).copied()
132    }
133
134    fn advance(&mut self) -> Option<char> {
135        let ch = self.input.get(self.position).copied();
136        if let Some(c) = ch {
137            self.position += 1;
138            if c == '\n' {
139                self.line += 1;
140                self.column = 1;
141            } else {
142                self.column += 1;
143            }
144        }
145        ch
146    }
147
148    fn current_span(&self) -> Span {
149        Span {
150            line: self.line,
151            column: self.column,
152            offset: self.position,
153        }
154    }
155
156    // ── Whitespace & Comments ──────────────────────────────
157
158    fn skip_whitespace_and_comments(&mut self) {
159        loop {
160            // Skip whitespace
161            while let Some(ch) = self.peek() {
162                if ch.is_ascii_whitespace() {
163                    self.advance();
164                } else {
165                    break;
166                }
167            }
168
169            // Skip line comments: //
170            if self.peek() == Some('/') && self.peek_ahead(1) == Some('/') {
171                while let Some(ch) = self.peek() {
172                    if ch == '\n' {
173                        break;
174                    }
175                    self.advance();
176                }
177                continue; // Loop back to skip more whitespace after comment
178            }
179
180            break;
181        }
182    }
183
184    // ── Main dispatch ──────────────────────────────────────
185
186    fn next_token(&mut self) -> crate::Result<SpannedToken> {
187        let span = self.current_span();
188        let ch = self.peek().unwrap();
189
190        match ch {
191            '{' => {
192                self.advance();
193                Ok(SpannedToken {
194                    token: Token::LBrace,
195                    span,
196                })
197            }
198            '}' => {
199                self.advance();
200                Ok(SpannedToken {
201                    token: Token::RBrace,
202                    span,
203                })
204            }
205            '[' => {
206                self.advance();
207                Ok(SpannedToken {
208                    token: Token::LBracket,
209                    span,
210                })
211            }
212            ']' => {
213                self.advance();
214                Ok(SpannedToken {
215                    token: Token::RBracket,
216                    span,
217                })
218            }
219            '<' => {
220                self.advance();
221                Ok(SpannedToken {
222                    token: Token::LAngle,
223                    span,
224                })
225            }
226            '>' => {
227                self.advance();
228                Ok(SpannedToken {
229                    token: Token::RAngle,
230                    span,
231                })
232            }
233            ':' => {
234                self.advance();
235                Ok(SpannedToken {
236                    token: Token::Colon,
237                    span,
238                })
239            }
240            ',' => {
241                self.advance();
242                Ok(SpannedToken {
243                    token: Token::Comma,
244                    span,
245                })
246            }
247            '=' => {
248                self.advance();
249                Ok(SpannedToken {
250                    token: Token::Equals,
251                    span,
252                })
253            }
254            '"' => self.read_string(span),
255            c if c.is_ascii_digit() => self.read_number(span),
256            c if c.is_ascii_alphabetic() || c == '_' => self.read_identifier_or_keyword(span),
257            _ => Err(crate::Error::ParseError(format!(
258                "Unexpected character '{}' at {}",
259                ch, span
260            ))),
261        }
262    }
263
264    // ── String literals ────────────────────────────────────
265
266    fn read_string(&mut self, span: Span) -> crate::Result<SpannedToken> {
267        self.advance(); // consume opening "
268        let mut value = String::new();
269
270        loop {
271            match self.advance() {
272                None => {
273                    return Err(crate::Error::ParseError(format!(
274                        "Unterminated string starting at {}",
275                        span
276                    )));
277                }
278                Some('"') => break,
279                Some('\\') => match self.advance() {
280                    Some('n') => value.push('\n'),
281                    Some('t') => value.push('\t'),
282                    Some('\\') => value.push('\\'),
283                    Some('"') => value.push('"'),
284                    Some(c) => {
285                        return Err(crate::Error::ParseError(format!(
286                            "Invalid escape sequence '\\{}' at {}",
287                            c,
288                            self.current_span()
289                        )));
290                    }
291                    None => {
292                        return Err(crate::Error::ParseError(format!(
293                            "Unterminated escape sequence at {}",
294                            self.current_span()
295                        )));
296                    }
297                },
298                Some(c) => value.push(c),
299            }
300        }
301
302        Ok(SpannedToken {
303            token: Token::StringLiteral(value),
304            span,
305        })
306    }
307
308    // ── Numbers & ISO8601 timestamps ───────────────────────
309
310    fn read_number(&mut self, span: Span) -> crate::Result<SpannedToken> {
311        let start = self.position;
312        let mut has_dot = false;
313
314        // Collect all digits
315        while let Some(ch) = self.peek() {
316            if ch.is_ascii_digit() {
317                self.advance();
318            } else if ch == '.' {
319                has_dot = true;
320                self.advance();
321            } else {
322                break;
323            }
324        }
325
326        // Check for ISO8601: digits followed by '-' (like 2026-02-01T...)
327        // Pattern: NNNN-NN-NNTNN:NN:NNZ
328        if self.peek() == Some('-') && !has_dot {
329            // Could be ISO8601 timestamp — collect the rest
330            while let Some(ch) = self.peek() {
331                if ch.is_ascii_alphanumeric()
332                    || ch == '-'
333                    || ch == ':'
334                    || ch == 'T'
335                    || ch == 'Z'
336                    || ch == '+'
337                    || ch == '.'
338                {
339                    self.advance();
340                } else {
341                    break;
342                }
343            }
344            let text: String = self.input[start..self.position].iter().collect();
345            // Validate basic ISO8601 shape
346            if is_iso8601_like(&text) {
347                return Ok(SpannedToken {
348                    token: Token::StringLiteral(text),
349                    span,
350                });
351            } else {
352                return Err(crate::Error::ParseError(format!(
353                    "Invalid timestamp '{}' at {}",
354                    text, span
355                )));
356            }
357        }
358
359        let text: String = self.input[start..self.position].iter().collect();
360
361        if has_dot {
362            let val: f64 = text.parse().map_err(|_| {
363                crate::Error::ParseError(format!("Invalid float '{}' at {}", text, span))
364            })?;
365            Ok(SpannedToken {
366                token: Token::FloatLiteral(val),
367                span,
368            })
369        } else {
370            let val: i64 = text.parse().map_err(|_| {
371                crate::Error::ParseError(format!("Invalid integer '{}' at {}", text, span))
372            })?;
373            Ok(SpannedToken {
374                token: Token::IntegerLiteral(val),
375                span,
376            })
377        }
378    }
379
380    // ── Identifiers & Keywords ─────────────────────────────
381
382    fn read_identifier_or_keyword(&mut self, span: Span) -> crate::Result<SpannedToken> {
383        let start = self.position;
384
385        while let Some(ch) = self.peek() {
386            if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
387                self.advance();
388            } else {
389                break;
390            }
391        }
392
393        let text: String = self.input[start..self.position].iter().collect();
394
395        let token = match text.as_str() {
396            // Section keywords
397            "Contract" => Token::Contract,
398            "Identity" => Token::Identity,
399            "PurposeStatement" => Token::PurposeStatement,
400            "DataSemantics" => Token::DataSemantics,
401            "BehavioralSemantics" => Token::BehavioralSemantics,
402            "ExecutionConstraints" => Token::ExecutionConstraints,
403            "HumanMachineContract" => Token::HumanMachineContract,
404            "Extensions" => Token::Extensions,
405
406            // Type keywords
407            "Integer" => Token::IntegerType,
408            "Float" => Token::FloatType,
409            "String" => Token::StringType,
410            "Boolean" => Token::BooleanType,
411            "ISO8601" => Token::Iso8601Type,
412            "UUID" => Token::UuidType,
413            "Array" => Token::ArrayType,
414            "Map" => Token::MapType,
415            "Object" => Token::ObjectType,
416            "Enum" => Token::EnumType,
417
418            // Boolean literals
419            "true" => Token::BooleanLiteral(true),
420            "false" => Token::BooleanLiteral(false),
421
422            // Everything else is an identifier
423            _ => Token::Identifier(text),
424        };
425
426        Ok(SpannedToken { token, span })
427    }
428}
429
/// Basic shape check for ISO8601-like timestamps (`YYYY-MM-DDTHH:MM:SSZ`).
///
/// Accepts a trailing `Z` or an explicit UTC offset in either direction
/// (`...+05:00`, `...-05:00`). This is a lexical sanity check only — it
/// does not validate field ranges (month 13, hour 25, etc. pass).
fn is_iso8601_like(s: &str) -> bool {
    // Minimal pattern NNNN-NN-NNTNN:NN:NNZ is 20 chars.
    if s.len() < 20 {
        return false;
    }
    // The date/time separator must be present.
    let t_pos = match s.find('T') {
        Some(p) => p,
        None => return false,
    };
    // Timezone designator: trailing 'Z', or a '+'/'-' offset. Offsets are
    // only looked for after the 'T' so the date's hyphens don't match.
    let time_part = &s[t_pos + 1..];
    s.ends_with('Z') || time_part.contains('+') || time_part.contains('-')
}
439
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `input`, panic on error, and strip span information.
    fn tokenize(input: &str) -> Vec<Token> {
        let spanned = Tokenizer::new(input).tokenize().unwrap();
        spanned.into_iter().map(|st| st.token).collect()
    }

    /// Tokenize `input`, expecting failure; returns the error's display text.
    fn tokenize_err(input: &str) -> String {
        Tokenizer::new(input).tokenize().unwrap_err().to_string()
    }

    // ── Keywords ───────────────────────────────────────

    #[test]
    fn test_tokenize_section_keywords() {
        let expected = vec![
            Token::Contract,
            Token::Identity,
            Token::PurposeStatement,
            Token::Eof,
        ];
        assert_eq!(tokenize("Contract Identity PurposeStatement"), expected);
    }

    #[test]
    fn test_tokenize_all_section_keywords() {
        let input = "Contract Identity PurposeStatement DataSemantics BehavioralSemantics ExecutionConstraints HumanMachineContract Extensions";
        let expected = vec![
            Token::Contract,
            Token::Identity,
            Token::PurposeStatement,
            Token::DataSemantics,
            Token::BehavioralSemantics,
            Token::ExecutionConstraints,
            Token::HumanMachineContract,
            Token::Extensions,
            Token::Eof,
        ];
        assert_eq!(tokenize(input), expected);
    }

    #[test]
    fn test_tokenize_type_keywords() {
        let expected = vec![
            Token::IntegerType,
            Token::FloatType,
            Token::StringType,
            Token::BooleanType,
            Token::Iso8601Type,
            Token::UuidType,
            Token::ArrayType,
            Token::MapType,
            Token::ObjectType,
            Token::EnumType,
            Token::Eof,
        ];
        assert_eq!(
            tokenize("Integer Float String Boolean ISO8601 UUID Array Map Object Enum"),
            expected
        );
    }

    // ── String literals ────────────────────────────────

    #[test]
    fn test_tokenize_string_literal() {
        assert_eq!(
            tokenize(r#""hello world""#),
            vec![Token::StringLiteral("hello world".to_string()), Token::Eof]
        );
    }

    #[test]
    fn test_tokenize_string_escape_sequences() {
        assert_eq!(
            tokenize(r#""line\none\ttab\\slash\"quote""#),
            vec![
                Token::StringLiteral("line\none\ttab\\slash\"quote".to_string()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_empty_string() {
        assert_eq!(
            tokenize(r#""""#),
            vec![Token::StringLiteral(String::new()), Token::Eof]
        );
    }

    #[test]
    fn test_unterminated_string() {
        assert!(tokenize_err(r#""hello"#).contains("Unterminated string"));
    }

    // ── Numbers ────────────────────────────────────────

    #[test]
    fn test_tokenize_integer() {
        assert_eq!(
            tokenize("42 0 999999"),
            vec![
                Token::IntegerLiteral(42),
                Token::IntegerLiteral(0),
                Token::IntegerLiteral(999999),
                Token::Eof,
            ]
        );
    }

    #[test]
    #[allow(clippy::approx_constant)]
    fn test_tokenize_float() {
        assert_eq!(
            tokenize("3.14 0.0 1.0"),
            vec![
                Token::FloatLiteral(3.14),
                Token::FloatLiteral(0.0),
                Token::FloatLiteral(1.0),
                Token::Eof,
            ]
        );
    }

    // ── ISO8601 timestamps ─────────────────────────────

    #[test]
    fn test_tokenize_timestamp() {
        assert_eq!(
            tokenize("2026-02-01T00:00:00Z"),
            vec![
                Token::StringLiteral("2026-02-01T00:00:00Z".to_string()),
                Token::Eof,
            ]
        );
    }

    // ── Booleans ───────────────────────────────────────

    #[test]
    fn test_tokenize_booleans() {
        assert_eq!(
            tokenize("true false"),
            vec![
                Token::BooleanLiteral(true),
                Token::BooleanLiteral(false),
                Token::Eof,
            ]
        );
    }

    // ── Symbols ────────────────────────────────────────

    #[test]
    fn test_tokenize_symbols() {
        assert_eq!(
            tokenize("{ } [ ] < > : , ="),
            vec![
                Token::LBrace,
                Token::RBrace,
                Token::LBracket,
                Token::RBracket,
                Token::LAngle,
                Token::RAngle,
                Token::Colon,
                Token::Comma,
                Token::Equals,
                Token::Eof,
            ]
        );
    }

    // ── Comments ───────────────────────────────────────

    #[test]
    fn test_skip_line_comments() {
        let tokens = tokenize("Contract // this is a comment\nIdentity");
        assert_eq!(tokens, vec![Token::Contract, Token::Identity, Token::Eof]);
    }

    #[test]
    fn test_skip_comment_at_start() {
        assert_eq!(
            tokenize("// comment\nContract"),
            vec![Token::Contract, Token::Eof]
        );
    }

    #[test]
    fn test_skip_multiple_comments() {
        assert_eq!(
            tokenize("// first\n// second\nContract"),
            vec![Token::Contract, Token::Eof]
        );
    }

    // ── Identifiers ────────────────────────────────────

    #[test]
    fn test_tokenize_identifiers() {
        assert_eq!(
            tokenize("stable_id version count"),
            vec![
                Token::Identifier("stable_id".to_string()),
                Token::Identifier("version".to_string()),
                Token::Identifier("count".to_string()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_identifier_with_hyphens() {
        assert_eq!(
            tokenize("custom-system my-extension"),
            vec![
                Token::Identifier("custom-system".to_string()),
                Token::Identifier("my-extension".to_string()),
                Token::Eof,
            ]
        );
    }

    // ── Span tracking ──────────────────────────────────

    #[test]
    fn test_span_tracking() {
        let tokens = Tokenizer::new("Contract {\n  Identity\n}")
            .tokenize()
            .unwrap();

        // (token, line, column, offset) for each of the first four tokens.
        let expected = [
            (Token::Contract, 1, 1, 0),
            (Token::LBrace, 1, 10, 9),
            (Token::Identity, 2, 3, 13),
            (Token::RBrace, 3, 1, 22),
        ];
        for (i, (token, line, column, offset)) in expected.iter().enumerate() {
            assert_eq!(&tokens[i].token, token);
            assert_eq!(
                tokens[i].span,
                Span {
                    line: *line,
                    column: *column,
                    offset: *offset,
                }
            );
        }
    }

    // ── Edge cases ─────────────────────────────────────

    #[test]
    fn test_empty_input() {
        assert_eq!(tokenize(""), vec![Token::Eof]);
    }

    #[test]
    fn test_only_whitespace() {
        assert_eq!(tokenize("   \n\n\t  "), vec![Token::Eof]);
    }

    #[test]
    fn test_only_comments() {
        assert_eq!(tokenize("// nothing here\n// or here\n"), vec![Token::Eof]);
    }

    #[test]
    fn test_unexpected_character() {
        assert!(tokenize_err("@").contains("Unexpected character"));
    }

    // ── Integration: minimal contract tokens ───────────

    #[test]
    fn test_tokenize_minimal_contract_fragment() {
        let input = r#"Contract {
  Identity {
    stable_id: "test-001",
    version: 1
  }
}"#;
        assert_eq!(
            tokenize(input),
            vec![
                Token::Contract,
                Token::LBrace,
                Token::Identity,
                Token::LBrace,
                Token::Identifier("stable_id".to_string()),
                Token::Colon,
                Token::StringLiteral("test-001".to_string()),
                Token::Comma,
                Token::Identifier("version".to_string()),
                Token::Colon,
                Token::IntegerLiteral(1),
                Token::RBrace,
                Token::RBrace,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_type_expression() {
        assert_eq!(
            tokenize("Array<String>"),
            vec![
                Token::ArrayType,
                Token::LAngle,
                Token::StringType,
                Token::RAngle,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_map_type() {
        assert_eq!(
            tokenize("Map<String, Integer>"),
            vec![
                Token::MapType,
                Token::LAngle,
                Token::StringType,
                Token::Comma,
                Token::IntegerType,
                Token::RAngle,
                Token::Eof,
            ]
        );
    }

    // ── Determinism proof ──────────────────────────────

    #[test]
    fn test_tokenize_determinism_100_iterations() {
        let input = r#"Contract {
  Identity {
    stable_id: "test",
    version: 1,
    created_timestamp: 2026-01-01T00:00:00Z,
    owner: "test",
    semantic_hash: "abc123"
  }
}"#;
        let first = Tokenizer::new(input).tokenize().unwrap();

        for i in 0..100 {
            let result = Tokenizer::new(input).tokenize().unwrap();
            assert_eq!(first, result, "Determinism failure at iteration {}", i);
        }
    }
}