/// Lexical tokens produced by the [`Tokenizer`].
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    // Top-level section keywords of the contract language.
    Contract,
    Identity,
    PurposeStatement,
    DataSemantics,
    BehavioralSemantics,
    ExecutionConstraints,
    HumanMachineContract,
    Extensions,

    // Built-in type-name keywords.
    IntegerType,
    FloatType,
    StringType,
    BooleanType,
    Iso8601Type,
    UuidType,
    ArrayType,
    MapType,
    ObjectType,
    EnumType,

    // Literal values, carrying their parsed payload.
    StringLiteral(String),
    IntegerLiteral(i64),
    FloatLiteral(f64),
    BooleanLiteral(bool),

    // Single-character punctuation, plus free-form identifiers.
    LBrace, RBrace, LBracket, RBracket, LAngle, RAngle, Colon, Comma, Equals, Identifier(String),
    // End-of-input marker; always the final token emitted by `tokenize`.
    Eof,
}
57
/// Source location of a token: human-oriented line/column plus a raw offset.
#[derive(Debug, Clone, PartialEq)]
pub struct Span {
    /// 1-based line number.
    pub line: usize,
    /// 1-based column number, counted in chars (not bytes).
    pub column: usize,
    /// 0-based index into the tokenizer's char buffer.
    pub offset: usize,
}
65
66impl std::fmt::Display for Span {
67 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
68 write!(f, "{}:{}", self.line, self.column)
69 }
70}
71
/// A token paired with the source location where it begins.
#[derive(Debug, Clone, PartialEq)]
pub struct SpannedToken {
    /// The lexed token.
    pub token: Token,
    /// Location of the token's first character.
    pub span: Span,
}
78
/// Single-pass tokenizer over a contract source string.
pub struct Tokenizer {
    /// Input decoded to chars so indexing is per-character, not per-byte.
    input: Vec<char>,
    /// Index of the next unread char in `input`.
    position: usize,
    /// Current 1-based line, advanced on every '\n'.
    line: usize,
    /// Current 1-based column, reset to 1 after every '\n'.
    column: usize,
}
86
87impl Tokenizer {
88 pub fn new(text: &str) -> Self {
90 Tokenizer {
91 input: text.chars().collect(),
92 position: 0,
93 line: 1,
94 column: 1,
95 }
96 }
97
98 pub fn tokenize(&mut self) -> crate::Result<Vec<SpannedToken>> {
100 let mut tokens = Vec::new();
101
102 loop {
103 self.skip_whitespace_and_comments();
104
105 if self.is_at_end() {
106 tokens.push(SpannedToken {
107 token: Token::Eof,
108 span: self.current_span(),
109 });
110 break;
111 }
112
113 let token = self.next_token()?;
114 tokens.push(token);
115 }
116
117 Ok(tokens)
118 }
119
120 fn is_at_end(&self) -> bool {
123 self.position >= self.input.len()
124 }
125
126 fn peek(&self) -> Option<char> {
127 self.input.get(self.position).copied()
128 }
129
130 fn peek_ahead(&self, offset: usize) -> Option<char> {
131 self.input.get(self.position + offset).copied()
132 }
133
134 fn advance(&mut self) -> Option<char> {
135 let ch = self.input.get(self.position).copied();
136 if let Some(c) = ch {
137 self.position += 1;
138 if c == '\n' {
139 self.line += 1;
140 self.column = 1;
141 } else {
142 self.column += 1;
143 }
144 }
145 ch
146 }
147
148 fn current_span(&self) -> Span {
149 Span {
150 line: self.line,
151 column: self.column,
152 offset: self.position,
153 }
154 }
155
156 fn skip_whitespace_and_comments(&mut self) {
159 loop {
160 while let Some(ch) = self.peek() {
162 if ch.is_ascii_whitespace() {
163 self.advance();
164 } else {
165 break;
166 }
167 }
168
169 if self.peek() == Some('/') && self.peek_ahead(1) == Some('/') {
171 while let Some(ch) = self.peek() {
172 if ch == '\n' {
173 break;
174 }
175 self.advance();
176 }
177 continue; }
179
180 break;
181 }
182 }
183
184 fn next_token(&mut self) -> crate::Result<SpannedToken> {
187 let span = self.current_span();
188 let ch = self.peek().unwrap();
189
190 match ch {
191 '{' => {
192 self.advance();
193 Ok(SpannedToken {
194 token: Token::LBrace,
195 span,
196 })
197 }
198 '}' => {
199 self.advance();
200 Ok(SpannedToken {
201 token: Token::RBrace,
202 span,
203 })
204 }
205 '[' => {
206 self.advance();
207 Ok(SpannedToken {
208 token: Token::LBracket,
209 span,
210 })
211 }
212 ']' => {
213 self.advance();
214 Ok(SpannedToken {
215 token: Token::RBracket,
216 span,
217 })
218 }
219 '<' => {
220 self.advance();
221 Ok(SpannedToken {
222 token: Token::LAngle,
223 span,
224 })
225 }
226 '>' => {
227 self.advance();
228 Ok(SpannedToken {
229 token: Token::RAngle,
230 span,
231 })
232 }
233 ':' => {
234 self.advance();
235 Ok(SpannedToken {
236 token: Token::Colon,
237 span,
238 })
239 }
240 ',' => {
241 self.advance();
242 Ok(SpannedToken {
243 token: Token::Comma,
244 span,
245 })
246 }
247 '=' => {
248 self.advance();
249 Ok(SpannedToken {
250 token: Token::Equals,
251 span,
252 })
253 }
254 '"' => self.read_string(span),
255 c if c.is_ascii_digit() => self.read_number(span),
256 c if c.is_ascii_alphabetic() || c == '_' => self.read_identifier_or_keyword(span),
257 _ => Err(crate::Error::ParseError(format!(
258 "Unexpected character '{}' at {}",
259 ch, span
260 ))),
261 }
262 }
263
264 fn read_string(&mut self, span: Span) -> crate::Result<SpannedToken> {
267 self.advance(); let mut value = String::new();
269
270 loop {
271 match self.advance() {
272 None => {
273 return Err(crate::Error::ParseError(format!(
274 "Unterminated string starting at {}",
275 span
276 )));
277 }
278 Some('"') => break,
279 Some('\\') => match self.advance() {
280 Some('n') => value.push('\n'),
281 Some('t') => value.push('\t'),
282 Some('\\') => value.push('\\'),
283 Some('"') => value.push('"'),
284 Some(c) => {
285 return Err(crate::Error::ParseError(format!(
286 "Invalid escape sequence '\\{}' at {}",
287 c,
288 self.current_span()
289 )));
290 }
291 None => {
292 return Err(crate::Error::ParseError(format!(
293 "Unterminated escape sequence at {}",
294 self.current_span()
295 )));
296 }
297 },
298 Some(c) => value.push(c),
299 }
300 }
301
302 Ok(SpannedToken {
303 token: Token::StringLiteral(value),
304 span,
305 })
306 }
307
308 fn read_number(&mut self, span: Span) -> crate::Result<SpannedToken> {
311 let start = self.position;
312 let mut has_dot = false;
313
314 while let Some(ch) = self.peek() {
316 if ch.is_ascii_digit() {
317 self.advance();
318 } else if ch == '.' {
319 has_dot = true;
320 self.advance();
321 } else {
322 break;
323 }
324 }
325
326 if self.peek() == Some('-') && !has_dot {
329 while let Some(ch) = self.peek() {
331 if ch.is_ascii_alphanumeric()
332 || ch == '-'
333 || ch == ':'
334 || ch == 'T'
335 || ch == 'Z'
336 || ch == '+'
337 || ch == '.'
338 {
339 self.advance();
340 } else {
341 break;
342 }
343 }
344 let text: String = self.input[start..self.position].iter().collect();
345 if is_iso8601_like(&text) {
347 return Ok(SpannedToken {
348 token: Token::StringLiteral(text),
349 span,
350 });
351 } else {
352 return Err(crate::Error::ParseError(format!(
353 "Invalid timestamp '{}' at {}",
354 text, span
355 )));
356 }
357 }
358
359 let text: String = self.input[start..self.position].iter().collect();
360
361 if has_dot {
362 let val: f64 = text.parse().map_err(|_| {
363 crate::Error::ParseError(format!("Invalid float '{}' at {}", text, span))
364 })?;
365 Ok(SpannedToken {
366 token: Token::FloatLiteral(val),
367 span,
368 })
369 } else {
370 let val: i64 = text.parse().map_err(|_| {
371 crate::Error::ParseError(format!("Invalid integer '{}' at {}", text, span))
372 })?;
373 Ok(SpannedToken {
374 token: Token::IntegerLiteral(val),
375 span,
376 })
377 }
378 }
379
380 fn read_identifier_or_keyword(&mut self, span: Span) -> crate::Result<SpannedToken> {
383 let start = self.position;
384
385 while let Some(ch) = self.peek() {
386 if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
387 self.advance();
388 } else {
389 break;
390 }
391 }
392
393 let text: String = self.input[start..self.position].iter().collect();
394
395 let token = match text.as_str() {
396 "Contract" => Token::Contract,
398 "Identity" => Token::Identity,
399 "PurposeStatement" => Token::PurposeStatement,
400 "DataSemantics" => Token::DataSemantics,
401 "BehavioralSemantics" => Token::BehavioralSemantics,
402 "ExecutionConstraints" => Token::ExecutionConstraints,
403 "HumanMachineContract" => Token::HumanMachineContract,
404 "Extensions" => Token::Extensions,
405
406 "Integer" => Token::IntegerType,
408 "Float" => Token::FloatType,
409 "String" => Token::StringType,
410 "Boolean" => Token::BooleanType,
411 "ISO8601" => Token::Iso8601Type,
412 "UUID" => Token::UuidType,
413 "Array" => Token::ArrayType,
414 "Map" => Token::MapType,
415 "Object" => Token::ObjectType,
416 "Enum" => Token::EnumType,
417
418 "true" => Token::BooleanLiteral(true),
420 "false" => Token::BooleanLiteral(false),
421
422 _ => Token::Identifier(text),
424 };
425
426 Ok(SpannedToken { token, span })
427 }
428}
429
430fn is_iso8601_like(s: &str) -> bool {
432 if s.len() < 20 {
435 return false;
436 }
437 s.contains('T') && (s.ends_with('Z') || s.contains('+'))
438}
439
// Unit tests: keyword, literal, symbol, comment, span, error, and
// determinism coverage for the tokenizer.
#[cfg(test)]
mod tests {
    use super::*;

    // Tokenizes `input` and strips spans, leaving just the token stream.
    fn tokenize(input: &str) -> Vec<Token> {
        Tokenizer::new(input)
            .tokenize()
            .unwrap()
            .into_iter()
            .map(|st| st.token)
            .collect()
    }

    // Tokenizes `input` expecting failure, returning the error's text.
    fn tokenize_err(input: &str) -> String {
        Tokenizer::new(input).tokenize().unwrap_err().to_string()
    }

    // --- Keywords ---

    #[test]
    fn test_tokenize_section_keywords() {
        let tokens = tokenize("Contract Identity PurposeStatement");
        assert_eq!(
            tokens,
            vec![
                Token::Contract,
                Token::Identity,
                Token::PurposeStatement,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_all_section_keywords() {
        let input = "Contract Identity PurposeStatement DataSemantics BehavioralSemantics ExecutionConstraints HumanMachineContract Extensions";
        let tokens = tokenize(input);
        assert_eq!(
            tokens,
            vec![
                Token::Contract,
                Token::Identity,
                Token::PurposeStatement,
                Token::DataSemantics,
                Token::BehavioralSemantics,
                Token::ExecutionConstraints,
                Token::HumanMachineContract,
                Token::Extensions,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_type_keywords() {
        let tokens = tokenize("Integer Float String Boolean ISO8601 UUID Array Map Object Enum");
        assert_eq!(
            tokens,
            vec![
                Token::IntegerType,
                Token::FloatType,
                Token::StringType,
                Token::BooleanType,
                Token::Iso8601Type,
                Token::UuidType,
                Token::ArrayType,
                Token::MapType,
                Token::ObjectType,
                Token::EnumType,
                Token::Eof,
            ]
        );
    }

    // --- String literals ---

    #[test]
    fn test_tokenize_string_literal() {
        let tokens = tokenize(r#""hello world""#);
        assert_eq!(
            tokens,
            vec![Token::StringLiteral("hello world".to_string()), Token::Eof,]
        );
    }

    #[test]
    fn test_tokenize_string_escape_sequences() {
        let tokens = tokenize(r#""line\none\ttab\\slash\"quote""#);
        assert_eq!(
            tokens,
            vec![
                Token::StringLiteral("line\none\ttab\\slash\"quote".to_string()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_empty_string() {
        let tokens = tokenize(r#""""#);
        assert_eq!(
            tokens,
            vec![Token::StringLiteral(String::new()), Token::Eof,]
        );
    }

    #[test]
    fn test_unterminated_string() {
        let err = tokenize_err(r#""hello"#);
        assert!(err.contains("Unterminated string"));
    }

    // --- Numeric literals ---

    #[test]
    fn test_tokenize_integer() {
        let tokens = tokenize("42 0 999999");
        assert_eq!(
            tokens,
            vec![
                Token::IntegerLiteral(42),
                Token::IntegerLiteral(0),
                Token::IntegerLiteral(999999),
                Token::Eof,
            ]
        );
    }

    #[test]
    #[allow(clippy::approx_constant)]
    fn test_tokenize_float() {
        let tokens = tokenize("3.14 0.0 1.0");
        assert_eq!(
            tokens,
            vec![
                Token::FloatLiteral(3.14),
                Token::FloatLiteral(0.0),
                Token::FloatLiteral(1.0),
                Token::Eof,
            ]
        );
    }

    // Bare ISO-8601 timestamps lex as string literals.
    #[test]
    fn test_tokenize_timestamp() {
        let tokens = tokenize("2026-02-01T00:00:00Z");
        assert_eq!(
            tokens,
            vec![
                Token::StringLiteral("2026-02-01T00:00:00Z".to_string()),
                Token::Eof,
            ]
        );
    }

    // --- Booleans and symbols ---

    #[test]
    fn test_tokenize_booleans() {
        let tokens = tokenize("true false");
        assert_eq!(
            tokens,
            vec![
                Token::BooleanLiteral(true),
                Token::BooleanLiteral(false),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_symbols() {
        let tokens = tokenize("{ } [ ] < > : , =");
        assert_eq!(
            tokens,
            vec![
                Token::LBrace,
                Token::RBrace,
                Token::LBracket,
                Token::RBracket,
                Token::LAngle,
                Token::RAngle,
                Token::Colon,
                Token::Comma,
                Token::Equals,
                Token::Eof,
            ]
        );
    }

    // --- Comments ---

    #[test]
    fn test_skip_line_comments() {
        let tokens = tokenize("Contract // this is a comment\nIdentity");
        assert_eq!(tokens, vec![Token::Contract, Token::Identity, Token::Eof,]);
    }

    #[test]
    fn test_skip_comment_at_start() {
        let tokens = tokenize("// comment\nContract");
        assert_eq!(tokens, vec![Token::Contract, Token::Eof,]);
    }

    #[test]
    fn test_skip_multiple_comments() {
        let tokens = tokenize("// first\n// second\nContract");
        assert_eq!(tokens, vec![Token::Contract, Token::Eof,]);
    }

    // --- Identifiers ---

    #[test]
    fn test_tokenize_identifiers() {
        let tokens = tokenize("stable_id version count");
        assert_eq!(
            tokens,
            vec![
                Token::Identifier("stable_id".to_string()),
                Token::Identifier("version".to_string()),
                Token::Identifier("count".to_string()),
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_identifier_with_hyphens() {
        let tokens = tokenize("custom-system my-extension");
        assert_eq!(
            tokens,
            vec![
                Token::Identifier("custom-system".to_string()),
                Token::Identifier("my-extension".to_string()),
                Token::Eof,
            ]
        );
    }

    // --- Span / position tracking ---

    #[test]
    fn test_span_tracking() {
        let tokens = Tokenizer::new("Contract {\n  Identity\n}")
            .tokenize()
            .unwrap();
        assert_eq!(
            tokens[0].span,
            Span {
                line: 1,
                column: 1,
                offset: 0
            }
        );
        assert_eq!(tokens[0].token, Token::Contract);
        assert_eq!(
            tokens[1].span,
            Span {
                line: 1,
                column: 10,
                offset: 9
            }
        );
        assert_eq!(tokens[1].token, Token::LBrace);
        assert_eq!(
            tokens[2].span,
            Span {
                line: 2,
                column: 3,
                offset: 13
            }
        );
        assert_eq!(tokens[2].token, Token::Identity);
        assert_eq!(
            tokens[3].span,
            Span {
                line: 3,
                column: 1,
                offset: 22
            }
        );
        assert_eq!(tokens[3].token, Token::RBrace);
    }

    // --- Edge cases and errors ---

    #[test]
    fn test_empty_input() {
        let tokens = tokenize("");
        assert_eq!(tokens, vec![Token::Eof]);
    }

    #[test]
    fn test_only_whitespace() {
        let tokens = tokenize("   \n\n\t  ");
        assert_eq!(tokens, vec![Token::Eof]);
    }

    #[test]
    fn test_only_comments() {
        let tokens = tokenize("// nothing here\n// or here\n");
        assert_eq!(tokens, vec![Token::Eof]);
    }

    #[test]
    fn test_unexpected_character() {
        let err = tokenize_err("@");
        assert!(err.contains("Unexpected character"));
    }

    // --- Composite inputs ---

    #[test]
    fn test_tokenize_minimal_contract_fragment() {
        let input = r#"Contract {
    Identity {
        stable_id: "test-001",
        version: 1
    }
}"#;
        let tokens = tokenize(input);
        assert_eq!(
            tokens,
            vec![
                Token::Contract,
                Token::LBrace,
                Token::Identity,
                Token::LBrace,
                Token::Identifier("stable_id".to_string()),
                Token::Colon,
                Token::StringLiteral("test-001".to_string()),
                Token::Comma,
                Token::Identifier("version".to_string()),
                Token::Colon,
                Token::IntegerLiteral(1),
                Token::RBrace,
                Token::RBrace,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_type_expression() {
        let tokens = tokenize("Array<String>");
        assert_eq!(
            tokens,
            vec![
                Token::ArrayType,
                Token::LAngle,
                Token::StringType,
                Token::RAngle,
                Token::Eof,
            ]
        );
    }

    #[test]
    fn test_tokenize_map_type() {
        let tokens = tokenize("Map<String, Integer>");
        assert_eq!(
            tokens,
            vec![
                Token::MapType,
                Token::LAngle,
                Token::StringType,
                Token::Comma,
                Token::IntegerType,
                Token::RAngle,
                Token::Eof,
            ]
        );
    }

    // Repeated tokenization of the same input must yield identical output.
    #[test]
    fn test_tokenize_determinism_100_iterations() {
        let input = r#"Contract {
    Identity {
        stable_id: "test",
        version: 1,
        created_timestamp: 2026-01-01T00:00:00Z,
        owner: "test",
        semantic_hash: "abc123"
    }
}"#;
        let first = Tokenizer::new(input).tokenize().unwrap();

        for i in 0..100 {
            let result = Tokenizer::new(input).tokenize().unwrap();
            assert_eq!(first, result, "Determinism failure at iteration {}", i);
        }
    }
}