use chumsky::prelude::*;
/// Lexical tokens for the agent-definition DSL.
///
/// Fixed-spelling variants carry no data; `UnicodeText`, `Ident`, `StringLit`
/// and `Comment` borrow slices of the source text (lifetime `'src`).
/// `Indent`/`Dedent` are never produced by `lexer` itself — they are
/// synthesized afterwards by `add_indentation_tokens`.
#[derive(Clone, Debug, PartialEq)]
pub enum Token<'src> {
// --- Section keywords ---
Config,
Variables,
System,
StartAgent,
Topic,
Actions,
Inputs,
Outputs,
Target,
Reasoning,
Instructions,
BeforeReasoning,
AfterReasoning,
Messages,
Welcome,
Error,
Connection, Connections, Knowledge,
Language,
// --- Field / attribute keywords ---
Mutable,
Linked,
Description,
Source,
Label,
IsRequired,
IsDisplayable,
IsUsedByPlanner,
ComplexDataTypeName,
FilterFromAgent,
RequireUserConfirmation,
IncludeInProgressIndicator,
ProgressIndicatorMessage,
// --- Built-in data-type names ---
String,
Number,
Boolean,
Object,
List,
Date,
Timestamp,
Currency,
Id,
Datetime,
Time,
Integer,
Long,
// --- Statement / expression keywords ---
If,
Else,
Run,
With,
Set,
To,
As,
Transition,
Available,
When,
// --- Literal keywords (Python-style capitalized spellings) ---
True,
False,
None,
// --- Operators (surface spellings are given by the `Display` impl) ---
Eq, Ne, Lt, Gt, Le, Ge, Assign, Is, Not, And, Or, Plus, Minus,
// --- Structural punctuation, including interpolation braces ({!, {{, }}) ---
Colon, Dot, Comma, At, Pipe, Arrow, ColonPipe, ColonArrow, LParen, RParen, LBracket, RBracket, LBrace, RBrace, ExclBrace, DoubleLBrace, DoubleBrace, Ellipsis,
// --- Additional single-character punctuation (presumably lexed so that
// free-form text passes through the lexer; confirm against the parser) ---
Slash, Question, Exclamation, Dollar, Percent, Star, Ampersand, Semicolon, Backtick, Tilde, Caret, Backslash, Underscore, Apostrophe,
// --- Data-carrying tokens (borrowed from the source) ---
UnicodeText(&'src str),
Ident(&'src str),
StringLit(&'src str),
NumberLit(f64),
Comment(&'src str),
// --- Layout tokens ---
Newline,
Indent, Dedent, }
impl std::fmt::Display for Token<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Token::Config => write!(f, "config"),
Token::Variables => write!(f, "variables"),
Token::System => write!(f, "system"),
Token::StartAgent => write!(f, "start_agent"),
Token::Topic => write!(f, "topic"),
Token::Actions => write!(f, "actions"),
Token::Inputs => write!(f, "inputs"),
Token::Outputs => write!(f, "outputs"),
Token::Target => write!(f, "target"),
Token::Reasoning => write!(f, "reasoning"),
Token::Instructions => write!(f, "instructions"),
Token::BeforeReasoning => write!(f, "before_reasoning"),
Token::AfterReasoning => write!(f, "after_reasoning"),
Token::Messages => write!(f, "messages"),
Token::Welcome => write!(f, "welcome"),
Token::Error => write!(f, "error"),
Token::Connection => write!(f, "connection"),
Token::Connections => write!(f, "connections"),
Token::Knowledge => write!(f, "knowledge"),
Token::Language => write!(f, "language"),
Token::Mutable => write!(f, "mutable"),
Token::Linked => write!(f, "linked"),
Token::Description => write!(f, "description"),
Token::Source => write!(f, "source"),
Token::Label => write!(f, "label"),
Token::IsRequired => write!(f, "is_required"),
Token::IsDisplayable => write!(f, "is_displayable"),
Token::IsUsedByPlanner => write!(f, "is_used_by_planner"),
Token::ComplexDataTypeName => write!(f, "complex_data_type_name"),
Token::FilterFromAgent => write!(f, "filter_from_agent"),
Token::RequireUserConfirmation => write!(f, "require_user_confirmation"),
Token::IncludeInProgressIndicator => write!(f, "include_in_progress_indicator"),
Token::ProgressIndicatorMessage => write!(f, "progress_indicator_message"),
Token::String => write!(f, "string"),
Token::Number => write!(f, "number"),
Token::Boolean => write!(f, "boolean"),
Token::Object => write!(f, "object"),
Token::List => write!(f, "list"),
Token::Date => write!(f, "date"),
Token::Timestamp => write!(f, "timestamp"),
Token::Currency => write!(f, "currency"),
Token::Id => write!(f, "id"),
Token::Datetime => write!(f, "datetime"),
Token::Time => write!(f, "time"),
Token::Integer => write!(f, "integer"),
Token::Long => write!(f, "long"),
Token::If => write!(f, "if"),
Token::Else => write!(f, "else"),
Token::Run => write!(f, "run"),
Token::With => write!(f, "with"),
Token::Set => write!(f, "set"),
Token::To => write!(f, "to"),
Token::As => write!(f, "as"),
Token::Transition => write!(f, "transition"),
Token::Available => write!(f, "available"),
Token::When => write!(f, "when"),
Token::True => write!(f, "True"),
Token::False => write!(f, "False"),
Token::None => write!(f, "None"),
Token::Eq => write!(f, "=="),
Token::Ne => write!(f, "!="),
Token::Lt => write!(f, "<"),
Token::Gt => write!(f, ">"),
Token::Le => write!(f, "<="),
Token::Ge => write!(f, ">="),
Token::Assign => write!(f, "="),
Token::Is => write!(f, "is"),
Token::Not => write!(f, "not"),
Token::And => write!(f, "and"),
Token::Or => write!(f, "or"),
Token::Plus => write!(f, "+"),
Token::Minus => write!(f, "-"),
Token::Colon => write!(f, ":"),
Token::Dot => write!(f, "."),
Token::Comma => write!(f, ","),
Token::At => write!(f, "@"),
Token::Pipe => write!(f, "|"),
Token::Arrow => write!(f, "->"),
Token::ColonPipe => write!(f, ":|"),
Token::ColonArrow => write!(f, ":->"),
Token::LParen => write!(f, "("),
Token::RParen => write!(f, ")"),
Token::LBracket => write!(f, "["),
Token::RBracket => write!(f, "]"),
Token::LBrace => write!(f, "{{"),
Token::RBrace => write!(f, "}}"),
Token::ExclBrace => write!(f, "{{!"),
Token::DoubleLBrace => write!(f, "{{{{"),
Token::DoubleBrace => write!(f, "}}}}"),
Token::Ellipsis => write!(f, "..."),
Token::Slash => write!(f, "/"),
Token::Question => write!(f, "?"),
Token::Exclamation => write!(f, "!"),
Token::Dollar => write!(f, "$"),
Token::Percent => write!(f, "%"),
Token::Star => write!(f, "*"),
Token::Ampersand => write!(f, "&"),
Token::Semicolon => write!(f, ";"),
Token::Backtick => write!(f, "`"),
Token::Tilde => write!(f, "~"),
Token::Caret => write!(f, "^"),
Token::Backslash => write!(f, "\\"),
Token::Underscore => write!(f, "_"),
Token::Apostrophe => write!(f, "'"),
Token::UnicodeText(s) => write!(f, "{}", s),
Token::Ident(s) => write!(f, "{}", s),
Token::StringLit(s) => write!(f, "\"{}\"", s),
Token::NumberLit(n) => write!(f, "{}", n),
Token::Comment(s) => write!(f, "# {}", s),
Token::Newline => write!(f, "\\n"),
Token::Indent => write!(f, "INDENT"),
Token::Dedent => write!(f, "DEDENT"),
}
}
}
/// Byte-offset span into the source, shared by lexer and parser.
pub type Span = SimpleSpan<usize>;
/// A value paired with the source span it was produced from.
pub type Spanned<T> = (T, Span);
/// Builds the chumsky lexer: raw source text → flat stream of spanned
/// [`Token`]s.
///
/// Indentation structure is NOT handled here; `add_indentation_tokens`
/// performs the INDENT/DEDENT pass afterwards, which is why `Newline` tokens
/// are kept in the output and only horizontal whitespace is skipped.
///
/// The alternatives inside each `choice` and the overall `token` choice are
/// order-sensitive: longer operators must be tried before their prefixes
/// (e.g. `:->` before `:|` before `:`, `{!`/`{{` before `{`), and comments
/// must be tried before punctuation so a `#` always starts a comment.
pub fn lexer<'src>(
) -> impl Parser<'src, &'src str, Vec<Spanned<Token<'src>>>, extra::Err<Rich<'src, char, Span>>> {
// A comment runs from '#' to (but not including) the end of the line.
let comment = just('#')
.ignore_then(none_of('\n').repeated().to_slice())
.map(Token::Comment);
// Double-quoted string literal. NOTE(review): no escape sequences are
// supported, so a literal can never contain '"' itself — confirm that this
// is the intended language rule.
let string_lit = just('"')
.ignore_then(none_of('"').repeated().to_slice())
.then_ignore(just('"'))
.map(Token::StringLit);
// Integer with optional fractional part; both int and float lex to f64.
// The slice matches `digits ('.' digits)?`, which always parses as f64,
// so the unwrap cannot panic.
let number = text::int(10)
.then(just('.').then(text::digits(10)).or_not())
.to_slice()
.map(|s: &str| Token::NumberLit(s.parse().unwrap()));
// Multi-character operators — must come before `single_char_ops` so that
// e.g. "==" is not lexed as two `Assign` tokens.
let multi_char_ops = choice((
just(":->").to(Token::ColonArrow),
just(":|").to(Token::ColonPipe),
just("->").to(Token::Arrow),
just("...").to(Token::Ellipsis),
just("==").to(Token::Eq),
just("!=").to(Token::Ne),
just("<=").to(Token::Le),
just(">=").to(Token::Ge),
just("{!").to(Token::ExclBrace),
just("{{").to(Token::DoubleLBrace),
just("}}").to(Token::DoubleBrace),
));
// Single-character structural operators and brackets.
let single_char_ops = choice((
just('<').to(Token::Lt),
just('>').to(Token::Gt),
just('=').to(Token::Assign),
just('+').to(Token::Plus),
just('-').to(Token::Minus),
just(':').to(Token::Colon),
just('.').to(Token::Dot),
just(',').to(Token::Comma),
just('@').to(Token::At),
just('|').to(Token::Pipe),
just('(').to(Token::LParen),
just(')').to(Token::RParen),
just('[').to(Token::LBracket),
just(']').to(Token::RBracket),
just('{').to(Token::LBrace),
just('}').to(Token::RBrace),
));
// Remaining ASCII punctuation, lexed so free-form text survives the lexer.
let text_punctuation = choice((
just('/').to(Token::Slash),
just('?').to(Token::Question),
just('!').to(Token::Exclamation),
just('$').to(Token::Dollar),
just('%').to(Token::Percent),
just('*').to(Token::Star),
just('&').to(Token::Ampersand),
just(';').to(Token::Semicolon),
just('`').to(Token::Backtick),
just('~').to(Token::Tilde),
just('^').to(Token::Caret),
just('\\').to(Token::Backslash),
just('_').to(Token::Underscore),
just('\'').to(Token::Apostrophe),
));
// A maximal run of non-ASCII characters is kept as one opaque text token.
// Note '\n' is ASCII, so this can never swallow a newline.
let unicode_text = any()
.filter(|c: &char| !c.is_ascii())
.repeated()
.at_least(1)
.to_slice()
.map(Token::UnicodeText);
// Identifier per chumsky's `text::ident`, post-classified into keywords.
// Matching the full identifier first guarantees maximal munch: "iffy" is an
// `Ident`, never `If` + "fy".
let ident_or_keyword = text::ident().map(|s: &str| match s {
"config" => Token::Config,
"variables" => Token::Variables,
"system" => Token::System,
"start_agent" => Token::StartAgent,
"topic" => Token::Topic,
"actions" => Token::Actions,
"inputs" => Token::Inputs,
"outputs" => Token::Outputs,
"target" => Token::Target,
"reasoning" => Token::Reasoning,
"instructions" => Token::Instructions,
"before_reasoning" => Token::BeforeReasoning,
"after_reasoning" => Token::AfterReasoning,
"messages" => Token::Messages,
"welcome" => Token::Welcome,
"error" => Token::Error,
"connection" => Token::Connection,
"connections" => Token::Connections,
"knowledge" => Token::Knowledge,
"language" => Token::Language,
"mutable" => Token::Mutable,
"linked" => Token::Linked,
"description" => Token::Description,
"source" => Token::Source,
"label" => Token::Label,
"is_required" => Token::IsRequired,
"is_displayable" => Token::IsDisplayable,
"is_used_by_planner" => Token::IsUsedByPlanner,
"complex_data_type_name" => Token::ComplexDataTypeName,
"filter_from_agent" => Token::FilterFromAgent,
"require_user_confirmation" => Token::RequireUserConfirmation,
"include_in_progress_indicator" => Token::IncludeInProgressIndicator,
"progress_indicator_message" => Token::ProgressIndicatorMessage,
"string" => Token::String,
"number" => Token::Number,
"boolean" => Token::Boolean,
"object" => Token::Object,
"list" => Token::List,
"date" => Token::Date,
"timestamp" => Token::Timestamp,
"currency" => Token::Currency,
"datetime" => Token::Datetime,
"time" => Token::Time,
"integer" => Token::Integer,
"long" => Token::Long,
"id" => Token::Id,
"if" => Token::If,
"else" => Token::Else,
"run" => Token::Run,
"with" => Token::With,
"set" => Token::Set,
"to" => Token::To,
"as" => Token::As,
"transition" => Token::Transition,
"available" => Token::Available,
"when" => Token::When,
"True" => Token::True,
"False" => Token::False,
"None" => Token::None,
"is" => Token::Is,
"not" => Token::Not,
"and" => Token::And,
"or" => Token::Or,
_ => Token::Ident(s),
});
// Newlines are significant (they drive the indentation pass), so they are
// emitted as tokens rather than skipped.
let newline = just('\n').to(Token::Newline);
let token = choice((
comment,
string_lit,
number,
multi_char_ops,
single_char_ops,
text_punctuation,
unicode_text,
ident_or_keyword,
newline,
));
// Only horizontal whitespace is skipped; '\n' must survive as a token.
// NOTE(review): '\r' matches no rule, so CRLF input fails to lex — confirm
// whether Windows line endings should be supported.
let horizontal_ws = one_of(" \t").repeated();
token
.map_with(|tok, e| (tok, e.span()))
.padded_by(horizontal_ws)
.repeated()
.collect()
}
/// Inserts synthetic `Indent`/`Dedent` tokens into a flat token stream based
/// on the leading whitespace of each source line (Python-style layout).
///
/// The stack-based algorithm mirrors CPython's tokenizer: a deeper indent
/// after a `Newline` pushes one `Indent`; returning to a shallower column
/// emits one `Dedent` per stack level deeper than the new column. Any levels
/// still open at end of input are closed with zero-width `Dedent`s at EOF.
///
/// Comment and blank-line tokens between a `Newline` and the next real token
/// are passed through unchanged and do not affect the indentation level.
pub fn add_indentation_tokens<'src>(
    source: &'src str,
    tokens: Vec<Spanned<Token<'src>>>,
) -> Vec<Spanned<Token<'src>>> {
    let mut result = Vec::with_capacity(tokens.len() * 2);
    // Stack of currently-open indent widths; the bottom entry 0 is never popped.
    let mut indent_stack: Vec<usize> = vec![0];
    // (byte offset of line start, indentation width) for every source line.
    // `split_inclusive` keeps the terminator inside each slice, so the start
    // offsets stay exact for any line ending. The previous bookkeeping used
    // `lines()` with `pos += line.len() + 1`, which assumes a 1-byte
    // terminator and drifted one byte per line on "\r\n" input, potentially
    // attributing a token to the wrong line.
    let mut line_indents: Vec<(usize, usize)> = Vec::new();
    let mut line_start = 0usize;
    for line in source.split_inclusive('\n') {
        let indent = line.len() - line.trim_start().len();
        line_indents.push((line_start, indent));
        line_start += line.len();
    }
    // Maps a byte offset to the indentation of the line containing it.
    let get_indent_at = |pos: usize| -> usize {
        match line_indents.binary_search_by_key(&pos, |&(start, _)| start) {
            Ok(i) => line_indents[i].1,
            Err(0) => 0,
            Err(i) => line_indents[i - 1].1,
        }
    };
    let mut i = 0;
    while i < tokens.len() {
        let (tok, span) = &tokens[i];
        if matches!(tok, Token::Newline) {
            result.push((tok.clone(), *span));
            // Skip trailing comments and blank lines: the indentation level is
            // decided by the first token that starts actual content.
            let mut next_idx = i + 1;
            while next_idx < tokens.len() {
                match &tokens[next_idx].0 {
                    Token::Comment(_) | Token::Newline => {
                        result.push(tokens[next_idx].clone());
                        next_idx += 1;
                    }
                    _ => break,
                }
            }
            if next_idx < tokens.len() {
                let next_span = &tokens[next_idx].1;
                let new_indent = get_indent_at(next_span.start);
                let current_indent = *indent_stack.last().unwrap_or(&0);
                if new_indent > current_indent {
                    indent_stack.push(new_indent);
                    result.push((Token::Indent, Span::new((), next_span.start..next_span.start)));
                } else if new_indent < current_indent {
                    // NOTE(review): a dedent to a column that matches no open
                    // level is silently accepted after popping — confirm
                    // whether that should instead be reported as an error.
                    while indent_stack.len() > 1 && *indent_stack.last().unwrap() > new_indent {
                        indent_stack.pop();
                        result
                            .push((Token::Dedent, Span::new((), next_span.start..next_span.start)));
                    }
                }
            }
            i = next_idx;
        } else {
            result.push((tok.clone(), *span));
            i += 1;
        }
    }
    // Close every still-open level with a zero-width Dedent at EOF so that
    // INDENT/DEDENT always balance.
    let eof_pos = source.len();
    while indent_stack.len() > 1 {
        indent_stack.pop();
        result.push((Token::Dedent, Span::new((), eof_pos..eof_pos)));
    }
    result
}
/// Convenience entry point: lexes `source`, then weaves INDENT/DEDENT tokens
/// into the stream. Returns the lexer's errors unchanged on failure.
pub fn lex_with_indentation<'src>(
    source: &'src str,
) -> Result<Vec<Spanned<Token<'src>>>, Vec<Rich<'src, char, Span>>> {
    lexer()
        .parse(source)
        .into_result()
        .map(|tokens| add_indentation_tokens(source, tokens))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `input`, asserting success, and strips the spans for comparison.
    fn lex_tokens(input: &str) -> Vec<Token<'_>> {
        let result = lexer().parse(input).into_result();
        assert!(result.is_ok());
        result.unwrap().into_iter().map(|(tok, _)| tok).collect()
    }

    /// Like `lex_tokens`, but also runs the INDENT/DEDENT post-pass.
    fn lex_indented(input: &str) -> Vec<Token<'_>> {
        let result = lex_with_indentation(input);
        assert!(result.is_ok());
        result.unwrap().into_iter().map(|(tok, _)| tok).collect()
    }

    /// Counts occurrences of `target` in `tokens`.
    fn count_of(tokens: &[Token<'_>], target: &Token<'_>) -> usize {
        tokens.iter().filter(|tok| *tok == target).count()
    }

    #[test]
    fn test_basic_tokens() {
        assert_eq!(
            lex_tokens("config: agent_name"),
            vec![Token::Config, Token::Colon, Token::Ident("agent_name")]
        );
    }

    #[test]
    fn test_string_literal() {
        assert_eq!(
            lex_tokens(r#""hello world""#),
            vec![Token::StringLit("hello world")]
        );
    }

    #[test]
    fn test_reference_tokens() {
        assert_eq!(
            lex_tokens("@variables.user_id"),
            vec![Token::At, Token::Variables, Token::Dot, Token::Ident("user_id")]
        );
    }

    #[test]
    fn test_operators() {
        assert_eq!(
            lex_tokens("== != < > <= >= = + -"),
            vec![
                Token::Eq,
                Token::Ne,
                Token::Lt,
                Token::Gt,
                Token::Le,
                Token::Ge,
                Token::Assign,
                Token::Plus,
                Token::Minus,
            ]
        );
    }

    #[test]
    fn test_ellipsis() {
        assert_eq!(
            lex_tokens("with value=..."),
            vec![Token::With, Token::Ident("value"), Token::Assign, Token::Ellipsis]
        );
    }

    #[test]
    fn test_colon_variants() {
        assert_eq!(
            lex_tokens(": :| :->"),
            vec![Token::Colon, Token::ColonPipe, Token::ColonArrow]
        );
    }

    #[test]
    fn test_number_literals() {
        assert_eq!(
            lex_tokens("42 3.15 0"),
            vec![
                Token::NumberLit(42.0),
                Token::NumberLit(3.15),
                Token::NumberLit(0.0),
            ]
        );
    }

    #[test]
    fn test_interpolation_brace() {
        assert_eq!(
            lex_tokens("{!@variables.name}"),
            vec![
                Token::ExclBrace,
                Token::At,
                Token::Variables,
                Token::Dot,
                Token::Ident("name"),
                Token::RBrace,
            ]
        );
    }

    #[test]
    fn test_indentation_tokens() {
        let input = "config:\n  agent_name: \"Test\"\n  description: \"Desc\"\n\ntopic main:\n  description: \"Main\"\n";
        let tokens = lex_indented(input);
        assert!(tokens.contains(&Token::Indent));
        assert!(tokens.contains(&Token::Dedent));
        assert_eq!(
            count_of(&tokens, &Token::Indent),
            count_of(&tokens, &Token::Dedent),
            "INDENT/DEDENT should balance"
        );
    }

    #[test]
    fn test_nested_indentation() {
        let input = "topic main:\n  reasoning:\n    instructions: \"test\"\n";
        let tokens = lex_indented(input);
        assert_eq!(count_of(&tokens, &Token::Indent), 2, "Should have 2 INDENTs");
        assert_eq!(count_of(&tokens, &Token::Dedent), 2, "Should have 2 DEDENTs");
    }
}