aiproof_parse/
markdown.rs1use aiproof_core::document::{Document, Kind, PromptText, Role};
2use aiproof_core::span::Span;
3use std::path::Path;
4use tree_sitter::Parser;
5
6pub fn parse(path: &Path, source: &str) -> anyhow::Result<Vec<Document>> {
7 let mut parser = Parser::new();
10 parser.set_language(&tree_sitter_md::language())?;
11 let _tree = parser
12 .parse(source, None)
13 .ok_or_else(|| anyhow::anyhow!("tree-sitter-md failed to parse"))?;
14 let (role, body_offset) = extract_frontmatter(source);
17 let body = &source[body_offset..];
18 let trimmed = body.trim_start();
19 let trim_offset = body_offset + (body.len() - trimmed.len());
20 let origin_span = Span::from_byte_range(source, trim_offset..source.len());
21
22 Ok(vec![Document {
23 path: path.to_path_buf(),
24 role,
25 source: source.to_string(),
26 prompt: PromptText {
27 text: trimmed.trim_end().to_string(),
28 origin_span: Some(origin_span),
29 },
30 kind: Kind::Markdown,
31 }])
32}
33
34fn extract_frontmatter(src: &str) -> (Role, usize) {
36 let Some(rest) = src.strip_prefix("---\n") else {
37 return (Role::Unknown, 0);
38 };
39 let Some(end) = rest.find("\n---\n") else {
40 return (Role::Unknown, 0);
41 };
42 let frontmatter = &rest[..end];
43 let body_offset = "---\n".len() + end + "\n---\n".len();
44 let role = frontmatter
45 .lines()
46 .filter_map(|line| line.split_once(':'))
47 .find(|(k, _)| k.trim() == "role")
48 .map(|(_, v)| v.trim().trim_matches('"').trim_matches('\''))
49 .and_then(|v| match v {
50 "system" => Some(Role::System),
51 "user" => Some(Role::User),
52 "assistant" => Some(Role::Assistant),
53 "tool" => Some(Role::Tool),
54 _ => None,
55 })
56 .unwrap_or(Role::Unknown);
57 (role, body_offset)
58}
59
#[cfg(test)]
mod tests {
    use super::*;

    // Test function names and the `docs[0].prompt.text` snapshot expression
    // are what insta uses to locate and describe the stored snapshots, so
    // both must stay stable.

    /// A `role: system` frontmatter sets the role; the body becomes the prompt.
    #[test]
    fn extracts_system_role_and_body() {
        let docs = parse(
            Path::new("x.md"),
            "---\nrole: system\n---\nYou are a helpful assistant.\n",
        )
        .unwrap();
        assert_eq!(docs.len(), 1);
        assert_eq!(docs[0].role, Role::System);
        insta::assert_yaml_snapshot!(docs[0].prompt.text);
    }

    /// Without frontmatter the role is unknown and the whole file is the prompt.
    #[test]
    fn no_frontmatter_whole_body_is_prompt() {
        let docs = parse(Path::new("x.md"), "# Prompt\n\nAnswer concisely.\n").unwrap();
        assert_eq!(docs[0].role, Role::Unknown);
        insta::assert_yaml_snapshot!(docs[0].prompt.text);
    }

    /// An opening fence with no closing fence is not frontmatter.
    #[test]
    fn malformed_frontmatter_treated_as_body() {
        let docs = parse(
            Path::new("x.md"),
            "---\nrole: system\nYou are helpful.\n",
        )
        .unwrap();
        assert_eq!(docs[0].role, Role::Unknown);
        insta::assert_yaml_snapshot!(docs[0].prompt.text);
    }

    /// A role value outside the known set maps to `Role::Unknown`.
    #[test]
    fn unknown_role_falls_back() {
        let docs = parse(Path::new("x.md"), "---\nrole: sidekick\n---\nHi.\n").unwrap();
        assert_eq!(docs[0].role, Role::Unknown);
    }

    /// Quotes around the role value are ignored.
    #[test]
    fn quoted_role_value() {
        let docs = parse(
            Path::new("x.md"),
            "---\nrole: \"user\"\n---\nTest message.\n",
        )
        .unwrap();
        assert_eq!(docs[0].role, Role::User);
        insta::assert_yaml_snapshot!(docs[0].prompt.text);
    }

    /// The role is found even when other keys surround it.
    #[test]
    fn multiple_frontmatter_keys() {
        let docs = parse(
            Path::new("x.md"),
            "---\nname: test\nrole: assistant\nmodel: gpt-4\n---\nAssistant response.\n",
        )
        .unwrap();
        assert_eq!(docs[0].role, Role::Assistant);
        insta::assert_yaml_snapshot!(docs[0].prompt.text);
    }
}