Skip to main content

aiproof_parse/
markdown.rs

1use aiproof_core::document::{Document, Kind, PromptText, Role};
2use aiproof_core::span::Span;
3use std::path::Path;
4use tree_sitter::Parser;
5
6pub fn parse(path: &Path, source: &str) -> anyhow::Result<Vec<Document>> {
7    // Verify tree-sitter-md accepts the source (future rules will walk the AST).
8    // API: tree_sitter_md::language() and tree_sitter_md::inline_language() for block/inline grammars.
9    let mut parser = Parser::new();
10    parser.set_language(&tree_sitter_md::language())?;
11    let _tree = parser
12        .parse(source, None)
13        .ok_or_else(|| anyhow::anyhow!("tree-sitter-md failed to parse"))?;
14    // TODO: expose AST to rules when needed
15
16    let (role, body_offset) = extract_frontmatter(source);
17    let body = &source[body_offset..];
18    let trimmed = body.trim_start();
19    let trim_offset = body_offset + (body.len() - trimmed.len());
20    let origin_span = Span::from_byte_range(source, trim_offset..source.len());
21
22    Ok(vec![Document {
23        path: path.to_path_buf(),
24        role,
25        source: source.to_string(),
26        prompt: PromptText {
27            text: trimmed.trim_end().to_string(),
28            origin_span: Some(origin_span),
29        },
30        kind: Kind::Markdown,
31    }])
32}
33
34/// Returns (role, byte offset at which body content starts).
35fn extract_frontmatter(src: &str) -> (Role, usize) {
36    let Some(rest) = src.strip_prefix("---\n") else {
37        return (Role::Unknown, 0);
38    };
39    let Some(end) = rest.find("\n---\n") else {
40        return (Role::Unknown, 0);
41    };
42    let frontmatter = &rest[..end];
43    let body_offset = "---\n".len() + end + "\n---\n".len();
44    let role = frontmatter
45        .lines()
46        .filter_map(|line| line.split_once(':'))
47        .find(|(k, _)| k.trim() == "role")
48        .map(|(_, v)| v.trim().trim_matches('"').trim_matches('\''))
49        .and_then(|v| match v {
50            "system" => Some(Role::System),
51            "user" => Some(Role::User),
52            "assistant" => Some(Role::Assistant),
53            "tool" => Some(Role::Tool),
54            _ => None,
55        })
56        .unwrap_or(Role::Unknown);
57    (role, body_offset)
58}
59
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `parse` on `src` under a fixed placeholder path, panicking on failure.
    fn parse_md(src: &str) -> Vec<Document> {
        parse(Path::new("x.md"), src).unwrap()
    }

    /// A `role: system` frontmatter sets the role; the body becomes the prompt text.
    #[test]
    fn extracts_system_role_and_body() {
        let docs = parse_md("---\nrole: system\n---\nYou are a helpful assistant.\n");
        assert_eq!(docs.len(), 1);
        assert_eq!(docs[0].role, Role::System);
        insta::assert_yaml_snapshot!(docs[0].prompt.text);
    }

    /// Without any frontmatter the role is unknown and the whole file is the prompt.
    #[test]
    fn no_frontmatter_whole_body_is_prompt() {
        let docs = parse_md("# Prompt\n\nAnswer concisely.\n");
        assert_eq!(docs[0].role, Role::Unknown);
        insta::assert_yaml_snapshot!(docs[0].prompt.text);
    }

    /// An opening fence with no closing `---` is not frontmatter; it stays in the body.
    #[test]
    fn malformed_frontmatter_treated_as_body() {
        let docs = parse_md("---\nrole: system\nYou are helpful.\n");
        assert_eq!(docs[0].role, Role::Unknown);
        insta::assert_yaml_snapshot!(docs[0].prompt.text);
    }

    /// A role value outside the recognised set maps to `Role::Unknown`.
    #[test]
    fn unknown_role_falls_back() {
        let docs = parse_md("---\nrole: sidekick\n---\nHi.\n");
        assert_eq!(docs[0].role, Role::Unknown);
    }

    /// Double quotes around the role value are stripped before matching.
    #[test]
    fn quoted_role_value() {
        let docs = parse_md("---\nrole: \"user\"\n---\nTest message.\n");
        assert_eq!(docs[0].role, Role::User);
        insta::assert_yaml_snapshot!(docs[0].prompt.text);
    }

    /// The `role` key is found even when surrounded by other frontmatter keys.
    #[test]
    fn multiple_frontmatter_keys() {
        let docs =
            parse_md("---\nname: test\nrole: assistant\nmodel: gpt-4\n---\nAssistant response.\n");
        assert_eq!(docs[0].role, Role::Assistant);
        insta::assert_yaml_snapshot!(docs[0].prompt.text);
    }
}