Skip to main content

busbar_sf_agentscript/parser/
mod.rs

//! Parser for AgentScript source code.
//!
//! This module provides a complete parser that converts AgentScript source
//! into a typed Abstract Syntax Tree ([`AgentFile`]).
//!
//! # Architecture
//!
//! The parser uses a two-phase approach:
//!
//! 1. **Lexical analysis** - Source → Tokens (via [`crate::lexer`])
//! 2. **Parsing** - Tokens → AST (via chumsky combinators)
//!
//! # Usage
//!
//! ```rust
//! use busbar_sf_agentscript::parser::parse;
//!
//! let source = r#"
//! config:
//!    agent_name: "MyAgent"
//!
//! topic main:
//!    description: "Main topic"
//! "#;
//!
//! match parse(source) {
//!     Ok(agent) => {
//!         println!("Parsed {} topics", agent.topics.len());
//!     }
//!     Err(errors) => {
//!         for err in errors {
//!             eprintln!("{}", err);
//!         }
//!     }
//! }
//! ```
//!
//! # Error Handling
//!
//! Use [`parse_with_errors()`] for partial parsing that returns both
//! the result and any errors encountered:
//!
//! ```rust
//! use busbar_sf_agentscript::parser::parse_with_errors;
//!
//! let source = "config:\n   agent_name: \"Test\"";
//! let (result, errors) = parse_with_errors(source);
//!
//! if let Some(agent) = result {
//!     println!("Parsed successfully");
//! }
//! for err in errors {
//!     eprintln!("Warning: {}", err);
//! }
//! ```
//!
//! # Module Structure
//!
//! The parser is split into submodules for each block type:
//!
//! - `config` - Config block parsing
//! - `variables` - Variable declarations
//! - `system` - System instructions and messages
//! - `topics` - Topic and start_agent blocks
//! - `actions` - Action definitions
//! - `reasoning` - Reasoning blocks
//! - `expressions` - Expression parsing
//! - `instructions` - Static and dynamic instructions
//!
//! [`AgentFile`]: crate::ast::AgentFile

72mod actions;
73mod config;
74mod connections;
75mod directives;
76mod expressions;
77mod instructions;
78mod language;
79mod primitives;
80mod reasoning;
81mod system;
82#[cfg(not(test))]
83mod tests;
84mod topics;
85mod variables;
86
87use crate::ast::AgentFile;
88use crate::lexer;
89
90// Re-export the span type
91pub use primitives::Span;
92
93/// Convert a character offset to (line, column) - both 1-indexed
/// Convert a byte offset into 1-indexed `(line, column)` coordinates.
///
/// Walks the characters of `source` that start before `offset`: each
/// newline advances the line and resets the column, every other character
/// advances the column. Offsets past the end of `source` yield the
/// position just after the final character.
fn offset_to_line_col(source: &str, offset: usize) -> (usize, usize) {
    source
        .char_indices()
        .take_while(|&(idx, _)| idx < offset)
        .fold((1, 1), |(line, col), (_, ch)| {
            if ch == '\n' {
                (line + 1, 1)
            } else {
                (line, col + 1)
            }
        })
}
110
111/// Get the line content at a given line number (1-indexed)
/// Return the text of the given 1-indexed line, or `""` when the line
/// number is out of range (including line 0, which maps to line 1 via
/// saturating subtraction).
fn get_line_content(source: &str, line_num: usize) -> &str {
    let index = line_num.saturating_sub(1);
    match source.lines().nth(index) {
        Some(text) => text,
        None => "",
    }
}
115
116/// Format a chumsky Rich error into a human-readable string with line/column info
117fn format_parse_error<'tokens, 'src>(
118    source: &str,
119    error: &Rich<'tokens, crate::lexer::Token<'src>, primitives::Span>,
120) -> String {
121    let span = error.span();
122    let (line, col) = offset_to_line_col(source, span.start);
123    let line_content = get_line_content(source, line);
124
125    // Build expected string
126    let expected: Vec<String> = error.expected().map(|e| format!("{}", e)).collect();
127    let expected_str = if expected.is_empty() {
128        String::new()
129    } else if expected.len() == 1 {
130        format!(", expected {}", expected[0])
131    } else {
132        format!(", expected one of: {}", expected.join(", "))
133    };
134
135    // Build found string
136    let found_str = match error.found() {
137        Some(tok) => format!("found '{}'", tok),
138        None => "found end of input".to_string(),
139    };
140
141    // Build context chain from .labelled() calls - shows WHERE in the parse tree we failed
142    let contexts: Vec<_> = error.contexts().collect();
143    let context_str = if contexts.is_empty() {
144        String::new()
145    } else {
146        let ctx_labels: Vec<String> = contexts
147            .iter()
148            .map(|(label, ctx_span)| {
149                let (ctx_line, _) = offset_to_line_col(source, ctx_span.start);
150                format!("{} (line {})", label, ctx_line)
151            })
152            .collect();
153        format!("\n  while parsing: {}", ctx_labels.join(" > "))
154    };
155
156    // Format with context
157    format!(
158        "Error at line {}, column {}: {}{}{}\n  |\n{:>3} | {}\n  | {}{}",
159        line,
160        col,
161        found_str,
162        expected_str,
163        context_str,
164        line,
165        line_content,
166        " ".repeat(col.saturating_sub(1)),
167        "^".repeat(
168            (span.end - span.start)
169                .max(1)
170                .min(line_content.len().saturating_sub(col - 1).max(1))
171        )
172    )
173}
174
175/// Format a lexer error into a human-readable string
176fn format_lexer_error(
177    source: &str,
178    error: &impl std::fmt::Debug,
179    span_start: usize,
180    span_end: usize,
181) -> String {
182    let (line, col) = offset_to_line_col(source, span_start);
183    let line_content = get_line_content(source, line);
184
185    format!(
186        "Lexer error at line {}, column {}: {:?}\n  |\n{:>3} | {}\n  | {}{}",
187        line,
188        col,
189        error,
190        line,
191        line_content,
192        " ".repeat(col.saturating_sub(1)),
193        "^".repeat(
194            (span_end - span_start)
195                .max(1)
196                .min(line_content.len().saturating_sub(col - 1).max(1))
197        )
198    )
199}
200
201// Re-export primitives needed by agent_file_parser
202use primitives::{skip_toplevel_noise, ParserInput};
203
204use chumsky::input::Input as _;
205use chumsky::prelude::*;
206use chumsky::recovery::skip_then_retry_until;
207
208use config::config_block;
209use connections::{connection_block, legacy_connections_block};
210use language::language_block;
211use system::system_block;
212use topics::{start_agent_block, topic_block};
213use variables::variables_block;
214
215/// Parse an AgentScript file from source code.
216///
217/// Returns Ok only if parsing succeeds with no errors.
218/// Use `parse_with_errors` to get partial results and all errors.
219pub fn parse(source: &str) -> Result<AgentFile, Vec<String>> {
220    let (result, errors) = parse_with_errors(source);
221    if errors.is_empty() {
222        result.ok_or_else(|| vec!["Unknown parse error".to_string()])
223    } else {
224        Err(errors)
225    }
226}
227
228/// Parse an AgentScript file from source with full error reporting.
229///
230/// Returns both a partial AST (if recovery succeeded) and ALL errors found.
231/// This allows collecting multiple errors in a single parse pass.
232pub fn parse_with_errors(source: &str) -> (Option<AgentFile>, Vec<String>) {
233    // Phase 1: Lexical analysis with indentation tokens
234    let tokens = match lexer::lex_with_indentation(source) {
235        Ok(tokens) => tokens,
236        Err(errs) => {
237            let errors: Vec<String> = errs
238                .iter()
239                .map(|e| {
240                    let span = e.span();
241                    format_lexer_error(source, &e.reason(), span.start, span.end)
242                })
243                .collect();
244            return (None, errors);
245        }
246    };
247
248    // Phase 2: Parse into AST using token-based parser
249    let eoi_span = primitives::Span::new((), source.len()..source.len());
250    let token_stream = tokens.as_slice().split_token_span(eoi_span);
251
252    // Use into_output_errors to get BOTH partial results AND all errors
253    let (result, errs) = agent_file_parser().parse(token_stream).into_output_errors();
254
255    let errors: Vec<String> = errs.iter().map(|e| format_parse_error(source, e)).collect();
256    (result, errors)
257}
258
259/// Parse an AgentScript file and return structured errors with span information.
260///
261/// Returns Ok only if parsing succeeds with no errors.
262/// Use `parse_with_structured_errors_all` to get partial results and all errors.
263pub fn parse_with_structured_errors(
264    source: &str,
265) -> Result<AgentFile, Vec<crate::error::ParseErrorInfo>> {
266    let (result, errors) = parse_with_structured_errors_all(source);
267    if errors.is_empty() {
268        result.ok_or_else(|| {
269            vec![crate::error::ParseErrorInfo {
270                message: "Unknown parse error".to_string(),
271                span: None,
272                expected: vec![],
273                found: None,
274                contexts: vec![],
275            }]
276        })
277    } else {
278        Err(errors)
279    }
280}
281
282/// Parse an AgentScript file and return structured errors with span information.
283///
284/// Returns both a partial AST (if recovery succeeded) and ALL errors found.
/// Parse an AgentScript file and return structured errors with span information.
///
/// Returns both a partial AST (if recovery succeeded) and ALL errors found.
///
/// Unlike `parse_with_errors`, which renders errors into display strings,
/// this variant preserves machine-readable detail (byte span, expected
/// tokens, found token, and `.labelled()` context chain) in
/// `ParseErrorInfo` values.
pub fn parse_with_structured_errors_all(
    source: &str,
) -> (Option<AgentFile>, Vec<crate::error::ParseErrorInfo>) {
    use crate::error::ParseErrorInfo;

    // Phase 1: Lexical analysis with indentation tokens
    let tokens = match lexer::lex_with_indentation(source) {
        Ok(tokens) => tokens,
        Err(errs) => {
            // Lexer failure is fatal: with no token stream there is nothing
            // to parse, so return one structured error per lexer diagnostic.
            let errors: Vec<ParseErrorInfo> = errs
                .iter()
                .map(|e| {
                    let span = e.span();
                    let (line, col) = offset_to_line_col(source, span.start);
                    ParseErrorInfo {
                        message: format!(
                            "Lexer error at line {}, column {}: {}",
                            line,
                            col,
                            e.reason()
                        ),
                        span: Some(span.start..span.end),
                        // Lexer errors carry no parser-level expectations.
                        expected: vec![],
                        found: None,
                        contexts: vec![],
                    }
                })
                .collect();
            return (None, errors);
        }
    };

    // Phase 2: Parse into AST using token-based parser
    let eoi_span = primitives::Span::new((), source.len()..source.len());
    let token_stream = tokens.as_slice().split_token_span(eoi_span);

    // Use into_output_errors to get BOTH partial results AND all errors
    let (result, errs) = agent_file_parser().parse(token_stream).into_output_errors();

    let errors: Vec<ParseErrorInfo> = errs
        .iter()
        .map(|e| {
            let span = e.span();
            let (line, col) = offset_to_line_col(source, span.start);
            // Collect contexts from labelled parsers
            let contexts: Vec<(String, std::ops::Range<usize>)> = e
                .contexts()
                .map(|(label, ctx_span)| (label.to_string(), ctx_span.start..ctx_span.end))
                .collect();

            ParseErrorInfo {
                message: format!("Parse error at line {}, column {}: {}", line, col, e.reason()),
                span: Some(span.start..span.end),
                expected: e.expected().map(|exp| format!("{}", exp)).collect(),
                found: e.found().map(|tok| format!("{}", tok)),
                contexts,
            }
        })
        .collect();

    (result, errors)
}
347
348// ============================================================================
349// Top-Level Agent File Parser
350// ============================================================================
351
352use crate::ast::{
353    ConfigBlock, ConnectionBlock, LanguageBlock, Spanned, StartAgentBlock, SystemBlock, TopicBlock,
354    VariablesBlock,
355};
356use crate::lexer::Token;
357
358/// Enum for tracking parsed top-level blocks.
/// Enum for tracking parsed top-level blocks.
///
/// Used internally by `agent_file_parser` so a single `choice(...)` over the
/// block parsers can yield a uniform type; each parsed block is later routed
/// into the matching field of `AgentFile`. Every variant wraps its AST node
/// in `Spanned` to preserve source locations.
enum TopLevelBlock {
    /// A `config:` block.
    Config(Spanned<ConfigBlock>),
    /// A `variables:` block.
    Variables(Spanned<VariablesBlock>),
    /// A `system:` block.
    System(Spanned<SystemBlock>),
    /// A `start_agent` block.
    StartAgent(Spanned<StartAgentBlock>),
    /// A `topic` block; a file may contain several (pushed onto `topics`).
    Topic(Spanned<TopicBlock>),
    /// A `language:` block.
    Language(Spanned<LanguageBlock>),
    /// A `connection` block; a file may contain several (pushed onto `connections`).
    Connection(Spanned<ConnectionBlock>),
}
368
369/// Parse a complete agent file.
370fn agent_file_parser<'tokens, 'src: 'tokens>() -> impl Parser<
371    'tokens,
372    ParserInput<'tokens, 'src>,
373    AgentFile,
374    extra::Err<Rich<'tokens, Token<'src>, primitives::Span>>,
375> + Clone {
376    // Top-level blocks - each with noise handling before them (including DEDENTs)
377    let config = skip_toplevel_noise()
378        .ignore_then(config_block())
379        .map(TopLevelBlock::Config);
380    let variables = skip_toplevel_noise()
381        .ignore_then(variables_block())
382        .map(TopLevelBlock::Variables);
383    let system = skip_toplevel_noise()
384        .ignore_then(system_block())
385        .map(TopLevelBlock::System);
386    let start_agent = skip_toplevel_noise()
387        .ignore_then(start_agent_block())
388        .map(TopLevelBlock::StartAgent);
389    let topic = skip_toplevel_noise()
390        .ignore_then(topic_block())
391        .map(TopLevelBlock::Topic);
392    let language = skip_toplevel_noise()
393        .ignore_then(language_block())
394        .map(TopLevelBlock::Language);
395    let connection = skip_toplevel_noise()
396        .ignore_then(connection_block())
397        .map(TopLevelBlock::Connection);
398
399    // Legacy connections: block - emit helpful error message
400    let legacy_connections = skip_toplevel_noise().ignore_then(legacy_connections_block());
401
402    // Skip trailing whitespace including any final DEDENTs
403    let trailing_noise = skip_toplevel_noise();
404
405    // Recovery strategy: when parsing fails, skip until we find a top-level keyword
406    // and retry. This captures errors with proper context from .labelled() calls.
407    let recovery_until = choice((
408        just(Token::Topic).ignored(),
409        just(Token::StartAgent).ignored(),
410        just(Token::Config).ignored(),
411        just(Token::Variables).ignored(),
412        just(Token::System).ignored(),
413        just(Token::Language).ignored(),
414        just(Token::Connection).ignored(),
415    ));
416
417    // Parse blocks with choice (try each parser)
418    // Note: legacy_connections must come AFTER connection to avoid early matching
419    choice((config, variables, system, start_agent, topic, language, connection))
420        .or(legacy_connections.map(|_| {
421            // This shouldn't be reached since legacy_connections emits an error,
422            // but we need to return something for type checking
423            TopLevelBlock::Config(Spanned::new(
424                ConfigBlock {
425                    agent_name: Spanned::new("error".to_string(), 0..0),
426                    agent_label: None,
427                    description: None,
428                    agent_type: None,
429                    default_agent_user: None,
430                },
431                0..0,
432            ))
433        }))
434        .recover_with(skip_then_retry_until(any().ignored(), recovery_until))
435        .repeated()
436        .collect::<Vec<_>>()
437        .then_ignore(trailing_noise)
438        .then_ignore(end())
439        .map(|blocks| {
440            let mut file = AgentFile::default();
441
442            for block in blocks {
443                match block {
444                    TopLevelBlock::Config(c) => file.config = Some(c),
445                    TopLevelBlock::Variables(v) => file.variables = Some(v),
446                    TopLevelBlock::System(s) => file.system = Some(s),
447                    TopLevelBlock::StartAgent(sa) => file.start_agent = Some(sa),
448                    TopLevelBlock::Topic(t) => file.topics.push(t),
449                    TopLevelBlock::Language(l) => file.language = Some(l),
450                    TopLevelBlock::Connection(c) => file.connections.push(c),
451                }
452            }
453
454            file
455        })
456}