1use chumsky::prelude::*;
45
/// All lexical tokens produced by [`lexer`].
///
/// Payload-carrying variants (`Ident`, `StringLit`, `Comment`,
/// `UnicodeText`) borrow slices of the original source, hence the `'src`
/// lifetime. `Indent`/`Dedent` are synthetic and are only inserted by the
/// indentation post-pass, never by the lexer itself.
#[derive(Clone, Debug, PartialEq)]
pub enum Token<'src> {
    // Section / declaration keywords (spellings in the Display impl).
    Config,
    Variables,
    System,
    StartAgent,
    Topic,
    Actions,
    Inputs,
    Outputs,
    Target,
    Reasoning,
    Instructions,
    BeforeReasoning,
    AfterReasoning,
    Messages,
    Welcome,
    Error,
    Connection,
    Connections,
    Knowledge,
    Language,

    // Attribute keywords.
    Mutable,
    Linked,
    Description,
    Source,
    Label,

    // Field-modifier keywords.
    IsRequired,
    IsDisplayable,
    IsUsedByPlanner,
    ComplexDataTypeName,
    FilterFromAgent,

    // Action-configuration keywords.
    RequireUserConfirmation,
    IncludeInProgressIndicator,
    ProgressIndicatorMessage,

    // Built-in type-name keywords.
    String,
    Number,
    Boolean,
    Object,
    List,
    Date,
    Timestamp,
    Currency,
    Id,
    Datetime,
    Time,
    Integer,
    Long,

    // Statement / expression keywords.
    If,
    Else,
    Run,
    With,
    Set,
    To,
    As,
    Transition,
    Available,
    When,

    // Literal keywords (note the capitalised, Python-style spellings).
    True,
    False,
    None,

    // Operators and punctuation.
    Eq,           // ==
    Ne,           // !=
    Lt,           // <
    Gt,           // >
    Le,           // <=
    Ge,           // >=
    Assign,       // =
    Is,           // is
    Not,          // not
    And,          // and
    Or,           // or
    Plus,         // +
    Minus,        // -
    Colon,        // :
    Dot,          // .
    Comma,        // ,
    At,           // @
    Pipe,         // |
    Arrow,        // ->
    ColonPipe,    // :|
    ColonArrow,   // :->
    LParen,       // (
    RParen,       // )
    LBracket,     // [
    RBracket,     // ]
    LBrace,       // {
    RBrace,       // }
    ExclBrace,    // {!
    DoubleLBrace, // {{
    DoubleBrace,  // }}
    Ellipsis,     // ...
    Slash,        // /
    Question,     // ?
    Exclamation,  // !
    Dollar,       // $
    Percent,      // %
    Star,         // *
    Ampersand,    // &
    Semicolon,    // ;
    Backtick,     // `
    Tilde,        // ~
    Caret,        // ^
    Backslash,    // \
    Underscore,   // _
    Apostrophe,   // '
    /// A run of one or more non-ASCII characters, kept verbatim.
    UnicodeText(&'src str),

    /// An identifier that is not one of the keywords above.
    Ident(&'src str),

    /// Contents of a double-quoted string literal (quotes stripped).
    StringLit(&'src str),

    /// Integer or decimal numeric literal, stored as `f64`.
    NumberLit(f64),

    /// Text following a `#` up to end of line.
    Comment(&'src str),

    /// A literal `'\n'` in the source.
    Newline,

    /// Synthetic layout tokens inserted by [`add_indentation_tokens`].
    Indent, Dedent, }
198
199impl std::fmt::Display for Token<'_> {
200 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
201 match self {
202 Token::Config => write!(f, "config"),
203 Token::Variables => write!(f, "variables"),
204 Token::System => write!(f, "system"),
205 Token::StartAgent => write!(f, "start_agent"),
206 Token::Topic => write!(f, "topic"),
207 Token::Actions => write!(f, "actions"),
208 Token::Inputs => write!(f, "inputs"),
209 Token::Outputs => write!(f, "outputs"),
210 Token::Target => write!(f, "target"),
211 Token::Reasoning => write!(f, "reasoning"),
212 Token::Instructions => write!(f, "instructions"),
213 Token::BeforeReasoning => write!(f, "before_reasoning"),
214 Token::AfterReasoning => write!(f, "after_reasoning"),
215 Token::Messages => write!(f, "messages"),
216 Token::Welcome => write!(f, "welcome"),
217 Token::Error => write!(f, "error"),
218 Token::Connection => write!(f, "connection"),
219 Token::Connections => write!(f, "connections"),
220 Token::Knowledge => write!(f, "knowledge"),
221 Token::Language => write!(f, "language"),
222 Token::Mutable => write!(f, "mutable"),
223 Token::Linked => write!(f, "linked"),
224 Token::Description => write!(f, "description"),
225 Token::Source => write!(f, "source"),
226 Token::Label => write!(f, "label"),
227 Token::IsRequired => write!(f, "is_required"),
228 Token::IsDisplayable => write!(f, "is_displayable"),
229 Token::IsUsedByPlanner => write!(f, "is_used_by_planner"),
230 Token::ComplexDataTypeName => write!(f, "complex_data_type_name"),
231 Token::FilterFromAgent => write!(f, "filter_from_agent"),
232 Token::RequireUserConfirmation => write!(f, "require_user_confirmation"),
233 Token::IncludeInProgressIndicator => write!(f, "include_in_progress_indicator"),
234 Token::ProgressIndicatorMessage => write!(f, "progress_indicator_message"),
235 Token::String => write!(f, "string"),
236 Token::Number => write!(f, "number"),
237 Token::Boolean => write!(f, "boolean"),
238 Token::Object => write!(f, "object"),
239 Token::List => write!(f, "list"),
240 Token::Date => write!(f, "date"),
241 Token::Timestamp => write!(f, "timestamp"),
242 Token::Currency => write!(f, "currency"),
243 Token::Id => write!(f, "id"),
244 Token::Datetime => write!(f, "datetime"),
245 Token::Time => write!(f, "time"),
246 Token::Integer => write!(f, "integer"),
247 Token::Long => write!(f, "long"),
248 Token::If => write!(f, "if"),
249 Token::Else => write!(f, "else"),
250 Token::Run => write!(f, "run"),
251 Token::With => write!(f, "with"),
252 Token::Set => write!(f, "set"),
253 Token::To => write!(f, "to"),
254 Token::As => write!(f, "as"),
255 Token::Transition => write!(f, "transition"),
256 Token::Available => write!(f, "available"),
257 Token::When => write!(f, "when"),
258 Token::True => write!(f, "True"),
259 Token::False => write!(f, "False"),
260 Token::None => write!(f, "None"),
261 Token::Eq => write!(f, "=="),
262 Token::Ne => write!(f, "!="),
263 Token::Lt => write!(f, "<"),
264 Token::Gt => write!(f, ">"),
265 Token::Le => write!(f, "<="),
266 Token::Ge => write!(f, ">="),
267 Token::Assign => write!(f, "="),
268 Token::Is => write!(f, "is"),
269 Token::Not => write!(f, "not"),
270 Token::And => write!(f, "and"),
271 Token::Or => write!(f, "or"),
272 Token::Plus => write!(f, "+"),
273 Token::Minus => write!(f, "-"),
274 Token::Colon => write!(f, ":"),
275 Token::Dot => write!(f, "."),
276 Token::Comma => write!(f, ","),
277 Token::At => write!(f, "@"),
278 Token::Pipe => write!(f, "|"),
279 Token::Arrow => write!(f, "->"),
280 Token::ColonPipe => write!(f, ":|"),
281 Token::ColonArrow => write!(f, ":->"),
282 Token::LParen => write!(f, "("),
283 Token::RParen => write!(f, ")"),
284 Token::LBracket => write!(f, "["),
285 Token::RBracket => write!(f, "]"),
286 Token::LBrace => write!(f, "{{"),
287 Token::RBrace => write!(f, "}}"),
288 Token::ExclBrace => write!(f, "{{!"),
289 Token::DoubleLBrace => write!(f, "{{{{"),
290 Token::DoubleBrace => write!(f, "}}}}"),
291 Token::Ellipsis => write!(f, "..."),
292 Token::Slash => write!(f, "/"),
293 Token::Question => write!(f, "?"),
294 Token::Exclamation => write!(f, "!"),
295 Token::Dollar => write!(f, "$"),
296 Token::Percent => write!(f, "%"),
297 Token::Star => write!(f, "*"),
298 Token::Ampersand => write!(f, "&"),
299 Token::Semicolon => write!(f, ";"),
300 Token::Backtick => write!(f, "`"),
301 Token::Tilde => write!(f, "~"),
302 Token::Caret => write!(f, "^"),
303 Token::Backslash => write!(f, "\\"),
304 Token::Underscore => write!(f, "_"),
305 Token::Apostrophe => write!(f, "'"),
306 Token::UnicodeText(s) => write!(f, "{}", s),
307 Token::Ident(s) => write!(f, "{}", s),
308 Token::StringLit(s) => write!(f, "\"{}\"", s),
309 Token::NumberLit(n) => write!(f, "{}", n),
310 Token::Comment(s) => write!(f, "# {}", s),
311 Token::Newline => write!(f, "\\n"),
312 Token::Indent => write!(f, "INDENT"),
313 Token::Dedent => write!(f, "DEDENT"),
314 }
315 }
316}
317
/// Byte-offset span type attached to every token.
pub type Span = SimpleSpan<usize>;

/// A token (or other value) paired with the source span it came from.
pub type Spanned<T> = (T, Span);
323
324pub fn lexer<'src>(
326) -> impl Parser<'src, &'src str, Vec<Spanned<Token<'src>>>, extra::Err<Rich<'src, char, Span>>> {
327 let comment = just('#')
328 .ignore_then(none_of('\n').repeated().to_slice())
329 .map(Token::Comment);
330
331 let string_lit = just('"')
333 .ignore_then(none_of('"').repeated().to_slice())
334 .then_ignore(just('"'))
335 .map(Token::StringLit);
336
337 let number = text::int(10)
339 .then(just('.').then(text::digits(10)).or_not())
340 .to_slice()
341 .map(|s: &str| Token::NumberLit(s.parse().unwrap()));
342
343 let multi_char_ops = choice((
345 just(":->").to(Token::ColonArrow),
346 just(":|").to(Token::ColonPipe),
347 just("->").to(Token::Arrow),
348 just("...").to(Token::Ellipsis),
349 just("==").to(Token::Eq),
350 just("!=").to(Token::Ne),
351 just("<=").to(Token::Le),
352 just(">=").to(Token::Ge),
353 just("{!").to(Token::ExclBrace),
354 just("{{").to(Token::DoubleLBrace),
355 just("}}").to(Token::DoubleBrace),
356 ));
357
358 let single_char_ops = choice((
360 just('<').to(Token::Lt),
361 just('>').to(Token::Gt),
362 just('=').to(Token::Assign),
363 just('+').to(Token::Plus),
364 just('-').to(Token::Minus),
365 just(':').to(Token::Colon),
366 just('.').to(Token::Dot),
367 just(',').to(Token::Comma),
368 just('@').to(Token::At),
369 just('|').to(Token::Pipe),
370 just('(').to(Token::LParen),
371 just(')').to(Token::RParen),
372 just('[').to(Token::LBracket),
373 just(']').to(Token::RBracket),
374 just('{').to(Token::LBrace),
375 just('}').to(Token::RBrace),
376 ));
377
378 let text_punctuation = choice((
380 just('/').to(Token::Slash),
381 just('?').to(Token::Question),
382 just('!').to(Token::Exclamation),
383 just('$').to(Token::Dollar),
384 just('%').to(Token::Percent),
385 just('*').to(Token::Star),
386 just('&').to(Token::Ampersand),
387 just(';').to(Token::Semicolon),
388 just('`').to(Token::Backtick),
389 just('~').to(Token::Tilde),
390 just('^').to(Token::Caret),
391 just('\\').to(Token::Backslash),
392 just('_').to(Token::Underscore),
393 just('\'').to(Token::Apostrophe),
394 ));
395
396 let unicode_text = any()
399 .filter(|c: &char| !c.is_ascii())
400 .repeated()
401 .at_least(1)
402 .to_slice()
403 .map(Token::UnicodeText);
404
405 let ident_or_keyword = text::ident().map(|s: &str| match s {
410 "config" => Token::Config,
412 "variables" => Token::Variables,
413 "system" => Token::System,
414 "start_agent" => Token::StartAgent,
415 "topic" => Token::Topic,
416 "actions" => Token::Actions,
417 "inputs" => Token::Inputs,
418 "outputs" => Token::Outputs,
419 "target" => Token::Target,
420 "reasoning" => Token::Reasoning,
421 "instructions" => Token::Instructions,
422 "before_reasoning" => Token::BeforeReasoning,
423 "after_reasoning" => Token::AfterReasoning,
424 "messages" => Token::Messages,
425 "welcome" => Token::Welcome,
427 "error" => Token::Error,
428 "connection" => Token::Connection,
429 "connections" => Token::Connections,
430 "knowledge" => Token::Knowledge,
431 "language" => Token::Language,
432 "mutable" => Token::Mutable,
433 "linked" => Token::Linked,
434 "description" => Token::Description,
435 "source" => Token::Source,
436 "label" => Token::Label,
437 "is_required" => Token::IsRequired,
438 "is_displayable" => Token::IsDisplayable,
439 "is_used_by_planner" => Token::IsUsedByPlanner,
440 "complex_data_type_name" => Token::ComplexDataTypeName,
441 "filter_from_agent" => Token::FilterFromAgent,
442 "require_user_confirmation" => Token::RequireUserConfirmation,
443 "include_in_progress_indicator" => Token::IncludeInProgressIndicator,
444 "progress_indicator_message" => Token::ProgressIndicatorMessage,
445 "string" => Token::String,
447 "number" => Token::Number,
448 "boolean" => Token::Boolean,
449 "object" => Token::Object,
450 "list" => Token::List,
451 "date" => Token::Date,
452 "timestamp" => Token::Timestamp,
453 "currency" => Token::Currency,
454 "datetime" => Token::Datetime,
455 "time" => Token::Time,
456 "integer" => Token::Integer,
457 "long" => Token::Long,
458 "id" => Token::Id,
459 "if" => Token::If,
461 "else" => Token::Else,
462 "run" => Token::Run,
463 "with" => Token::With,
464 "set" => Token::Set,
465 "to" => Token::To,
466 "as" => Token::As,
467 "transition" => Token::Transition,
468 "available" => Token::Available,
469 "when" => Token::When,
470 "True" => Token::True,
472 "False" => Token::False,
473 "None" => Token::None,
474 "is" => Token::Is,
475 "not" => Token::Not,
476 "and" => Token::And,
477 "or" => Token::Or,
478 _ => Token::Ident(s),
480 });
481
482 let newline = just('\n').to(Token::Newline);
484
485 let token = choice((
487 comment,
488 string_lit,
489 number,
490 multi_char_ops,
491 single_char_ops,
492 text_punctuation,
493 unicode_text,
494 ident_or_keyword,
495 newline,
496 ));
497
498 let horizontal_ws = one_of(" \t").repeated();
500
501 token
502 .map_with(|tok, e| (tok, e.span()))
503 .padded_by(horizontal_ws)
504 .repeated()
505 .collect()
506}
507
508pub fn add_indentation_tokens<'src>(
516 source: &'src str,
517 tokens: Vec<Spanned<Token<'src>>>,
518) -> Vec<Spanned<Token<'src>>> {
519 let mut result = Vec::with_capacity(tokens.len() * 2);
520 let mut indent_stack: Vec<usize> = vec![0]; let line_indents: Vec<(usize, usize)> = source
524 .lines()
525 .scan(0usize, |pos, line| {
526 let start = *pos;
527 *pos += line.len() + 1; let indent = line.len() - line.trim_start().len();
529 Some((start, indent))
530 })
531 .collect();
532
533 let get_indent_at = |pos: usize| -> usize {
535 match line_indents.binary_search_by_key(&pos, |&(start, _)| start) {
536 Ok(i) => line_indents[i].1,
537 Err(0) => 0,
538 Err(i) => line_indents[i - 1].1,
539 }
540 };
541
542 let mut i = 0;
543 while i < tokens.len() {
544 let (tok, span) = &tokens[i];
545
546 if matches!(tok, Token::Newline) {
547 result.push((tok.clone(), *span));
548
549 let mut next_idx = i + 1;
551 while next_idx < tokens.len() {
552 match &tokens[next_idx].0 {
553 Token::Comment(_) => {
554 result.push(tokens[next_idx].clone());
556 next_idx += 1;
557 }
558 Token::Newline => {
559 result.push(tokens[next_idx].clone());
561 next_idx += 1;
562 }
563 _ => break,
564 }
565 }
566
567 if next_idx < tokens.len() {
568 let next_span = &tokens[next_idx].1;
569 let new_indent = get_indent_at(next_span.start);
570 let current_indent = *indent_stack.last().unwrap_or(&0);
571
572 if new_indent > current_indent {
573 indent_stack.push(new_indent);
576 result.push((Token::Indent, Span::new((), next_span.start..next_span.start)));
577 } else if new_indent < current_indent {
578 while indent_stack.len() > 1 && *indent_stack.last().unwrap() > new_indent {
581 indent_stack.pop();
582 result
583 .push((Token::Dedent, Span::new((), next_span.start..next_span.start)));
584 }
585 }
588 }
590 i = next_idx;
591 } else {
592 result.push((tok.clone(), *span));
593 i += 1;
594 }
595 }
596
597 let eof_pos = source.len();
599 while indent_stack.len() > 1 {
600 indent_stack.pop();
601 result.push((Token::Dedent, Span::new((), eof_pos..eof_pos)));
602 }
603
604 result
605}
606
607pub fn lex_with_indentation<'src>(
609 source: &'src str,
610) -> Result<Vec<Spanned<Token<'src>>>, Vec<Rich<'src, char, Span>>> {
611 let tokens = lexer().parse(source).into_result()?;
612 Ok(add_indentation_tokens(source, tokens))
613}
614
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the plain lexer on `input` and returns the tokens with their
    /// spans stripped, failing the test on any lex error.
    fn lexed(input: &str) -> Vec<Token<'_>> {
        let result = lexer().parse(input).into_result();
        assert!(result.is_ok());
        result.unwrap().into_iter().map(|(t, _)| t).collect()
    }

    /// Runs the full pipeline (lexer + indentation pass) and strips spans,
    /// failing the test on any lex error.
    fn lexed_indented(input: &str) -> Vec<Token<'_>> {
        let result = lex_with_indentation(input);
        assert!(result.is_ok());
        result.unwrap().into_iter().map(|(t, _)| t).collect()
    }

    #[test]
    fn test_basic_tokens() {
        assert_eq!(
            lexed("config: agent_name"),
            vec![Token::Config, Token::Colon, Token::Ident("agent_name")]
        );
    }

    #[test]
    fn test_string_literal() {
        assert_eq!(lexed(r#""hello world""#), vec![Token::StringLit("hello world")]);
    }

    #[test]
    fn test_reference_tokens() {
        assert_eq!(
            lexed("@variables.user_id"),
            vec![Token::At, Token::Variables, Token::Dot, Token::Ident("user_id")]
        );
    }

    #[test]
    fn test_operators() {
        assert_eq!(
            lexed("== != < > <= >= = + -"),
            vec![
                Token::Eq,
                Token::Ne,
                Token::Lt,
                Token::Gt,
                Token::Le,
                Token::Ge,
                Token::Assign,
                Token::Plus,
                Token::Minus,
            ]
        );
    }

    #[test]
    fn test_ellipsis() {
        assert_eq!(
            lexed("with value=..."),
            vec![Token::With, Token::Ident("value"), Token::Assign, Token::Ellipsis]
        );
    }

    #[test]
    fn test_colon_variants() {
        assert_eq!(lexed(": :| :->"), vec![Token::Colon, Token::ColonPipe, Token::ColonArrow]);
    }

    #[test]
    fn test_number_literals() {
        assert_eq!(
            lexed("42 3.15 0"),
            vec![Token::NumberLit(42.0), Token::NumberLit(3.15), Token::NumberLit(0.0)]
        );
    }

    #[test]
    fn test_interpolation_brace() {
        assert_eq!(
            lexed("{!@variables.name}"),
            vec![
                Token::ExclBrace,
                Token::At,
                Token::Variables,
                Token::Dot,
                Token::Ident("name"),
                Token::RBrace,
            ]
        );
    }

    #[test]
    fn test_indentation_tokens() {
        let input = r#"config:
    agent_name: "Test"
    description: "Desc"

topic main:
    description: "Main"
"#;
        let tokens = lexed_indented(input);

        assert!(tokens.contains(&Token::Indent));
        assert!(tokens.contains(&Token::Dedent));

        let indents = tokens.iter().filter(|t| matches!(t, Token::Indent)).count();
        let dedents = tokens.iter().filter(|t| matches!(t, Token::Dedent)).count();
        assert_eq!(indents, dedents, "INDENT/DEDENT should balance");
    }

    #[test]
    fn test_nested_indentation() {
        let input = r#"topic main:
    reasoning:
        instructions: "test"
"#;
        let tokens = lexed_indented(input);

        let indents = tokens.iter().filter(|t| matches!(t, Token::Indent)).count();
        let dedents = tokens.iter().filter(|t| matches!(t, Token::Dedent)).count();
        assert_eq!(indents, 2, "Should have 2 INDENTs");
        assert_eq!(dedents, 2, "Should have 2 DEDENTs");
    }
}