ricecoder_storage/markdown_config/
parser.rs

1//! Markdown parser for extracting YAML frontmatter and content
2
3use crate::markdown_config::error::{MarkdownConfigError, MarkdownConfigResult};
4use crate::markdown_config::types::ParsedMarkdown;
5use std::path::Path;
6
/// Stateless parser that splits a markdown document into its YAML
/// frontmatter and its body.
///
/// The type is a unit struct: it holds no data, so creating or cloning an
/// instance costs nothing.
#[derive(Clone, Debug)]
pub struct MarkdownParser;
10
11impl MarkdownParser {
12    /// Create a new markdown parser
13    pub fn new() -> Self {
14        Self
15    }
16
17    /// Parse markdown content and extract frontmatter and body
18    ///
19    /// Expects frontmatter to be delimited by `---` at the start of the file.
20    /// Format:
21    /// ```text
22    /// ---
23    /// yaml: frontmatter
24    /// ---
25    /// # Markdown content
26    /// ```
27    pub fn parse(&self, content: &str) -> MarkdownConfigResult<ParsedMarkdown> {
28        self.parse_with_context(content, None)
29    }
30
31    /// Parse markdown content with file path context for better error messages
32    pub fn parse_with_context(
33        &self,
34        content: &str,
35        file_path: Option<&Path>,
36    ) -> MarkdownConfigResult<ParsedMarkdown> {
37        let trimmed = content.trim();
38
39        // Check if content starts with frontmatter delimiter
40        if !trimmed.starts_with("---") {
41            // No frontmatter, entire content is body
42            return Ok(ParsedMarkdown::new(None, content.to_string()));
43        }
44
45        // Find the closing delimiter
46        let rest = &trimmed[3..]; // Skip opening "---"
47        let closing_delimiter_pos = rest.find("---");
48
49        match closing_delimiter_pos {
50            Some(pos) => {
51                // Extract frontmatter and body
52                let frontmatter = rest[..pos].trim().to_string();
53                let body_start = pos + 3; // Skip closing "---"
54                let body = rest[body_start..].trim().to_string();
55
56                // Validate that frontmatter is not empty
57                if frontmatter.is_empty() {
58                    let msg = match file_path {
59                        Some(path) => format!(
60                            "Frontmatter cannot be empty in {}",
61                            path.display()
62                        ),
63                        None => "Frontmatter cannot be empty".to_string(),
64                    };
65                    return Err(MarkdownConfigError::parse_error(msg));
66                }
67
68                Ok(ParsedMarkdown::new(Some(frontmatter), body))
69            }
70            None => {
71                // Opening delimiter found but no closing delimiter
72                let msg = match file_path {
73                    Some(path) => format!(
74                        "Unclosed frontmatter in {}: found opening '---' but no closing '---'",
75                        path.display()
76                    ),
77                    None => "Unclosed frontmatter: found opening '---' but no closing '---'"
78                        .to_string(),
79                };
80                Err(MarkdownConfigError::parse_error(msg))
81            }
82        }
83    }
84
85}
86
87impl Default for MarkdownParser {
88    fn default() -> Self {
89        Self::new()
90    }
91}
92
#[cfg(test)]
mod tests {
    use super::*;

    /// Joins `lines` with `\n`, producing a document without a trailing newline.
    fn doc(lines: &[&str]) -> String {
        lines.join("\n")
    }

    /// Parses `content` with a fresh parser and panics if parsing fails.
    fn parse_ok(content: &str) -> ParsedMarkdown {
        MarkdownParser::new()
            .parse(content)
            .expect("document should parse")
    }

    /// Parses `content` with a fresh parser and returns the rendered error,
    /// panicking if parsing unexpectedly succeeds.
    fn parse_err(content: &str) -> String {
        MarkdownParser::new()
            .parse(content)
            .expect_err("document should be rejected")
            .to_string()
    }

    #[test]
    fn test_parse_with_frontmatter() {
        let content = doc(&[
            "---",
            "name: test-agent",
            "description: A test agent",
            "---",
            "# Test Content",
            "This is the body",
        ]);

        let parsed = parse_ok(&content);
        assert_eq!(
            parsed.frontmatter.as_deref(),
            Some("name: test-agent\ndescription: A test agent")
        );
        assert_eq!(parsed.content, "# Test Content\nThis is the body");
    }

    #[test]
    fn test_parse_without_frontmatter() {
        let parsed = parse_ok("# Test Content\nThis is the body");
        assert_eq!(parsed.frontmatter, None);
        assert_eq!(parsed.content, "# Test Content\nThis is the body");
    }

    #[test]
    fn test_parse_empty_frontmatter() {
        let message = parse_err(&doc(&["---", "---", "# Test Content"]));
        // Pin the error kind, not just the fact that parsing failed.
        assert!(message.contains("Frontmatter cannot be empty"));
    }

    #[test]
    fn test_parse_unclosed_frontmatter() {
        let message = parse_err(&doc(&["---", "name: test", "# Test Content"]));
        assert!(message.contains("Unclosed frontmatter"));
    }

    #[test]
    fn test_parse_with_whitespace() {
        let content = doc(&["  ---", "name: test", "  ---", "  # Content"]);

        let parsed = parse_ok(&content);
        assert_eq!(parsed.frontmatter.as_deref(), Some("name: test"));
        assert_eq!(parsed.content, "# Content");
    }

    #[test]
    fn test_parse_multiline_frontmatter() {
        let content = doc(&[
            "---",
            "name: test-agent",
            "description: A test agent",
            "model: gpt-4",
            "temperature: 0.7",
            "---",
            "# Test Content",
        ]);

        let parsed = parse_ok(&content);
        assert_eq!(
            parsed.frontmatter.as_deref(),
            Some("name: test-agent\ndescription: A test agent\nmodel: gpt-4\ntemperature: 0.7")
        );
        assert_eq!(parsed.content, "# Test Content");
    }

    #[test]
    fn test_parse_empty_body() {
        let parsed = parse_ok(&doc(&["---", "name: test", "---"]));
        assert_eq!(parsed.frontmatter.as_deref(), Some("name: test"));
        assert_eq!(parsed.content, "");
    }

    #[test]
    fn test_parse_complex_yaml_frontmatter() {
        let yaml = [
            "name: complex-agent",
            "description: Complex agent",
            "model: gpt-4",
            "temperature: 0.7",
            "max_tokens: 2000",
            "tools:",
            "  - tool1",
            "  - tool2",
        ];
        let body = ["# Complex Content", "With multiple lines", "And formatting"];
        let content = format!("---\n{}\n---\n{}", yaml.join("\n"), body.join("\n"));

        let parsed = parse_ok(&content);
        // Nested list indentation must survive extraction untouched.
        assert_eq!(parsed.frontmatter, Some(yaml.join("\n")));
        assert_eq!(parsed.content, body.join("\n"));
    }

    #[test]
    fn test_parse_frontmatter_with_special_characters() {
        let content = doc(&[
            "---",
            "name: test-agent",
            "description: \"Agent with special chars: @#$%^&*()\"",
            "---",
            "Content",
        ]);

        let parsed = parse_ok(&content);
        assert_eq!(
            parsed.frontmatter.as_deref(),
            Some("name: test-agent\ndescription: \"Agent with special chars: @#$%^&*()\"")
        );
        assert_eq!(parsed.content, "Content");
    }

    #[test]
    fn test_parse_frontmatter_with_quotes() {
        let content = doc(&[
            "---",
            "name: \"test-agent\"",
            "description: 'Single quoted'",
            "---",
            "Content",
        ]);

        let parsed = parse_ok(&content);
        // Quotes are YAML syntax; the parser must hand them over verbatim.
        assert_eq!(
            parsed.frontmatter.as_deref(),
            Some("name: \"test-agent\"\ndescription: 'Single quoted'")
        );
        assert_eq!(parsed.content, "Content");
    }

    #[test]
    fn test_parse_body_with_code_blocks() {
        let body = [
            "# Content",
            "",
            "```rust",
            "fn main() {",
            "    println!(\"Hello\");",
            "}",
            "```",
            "",
            "More content",
        ];
        let content = format!("---\nname: test\n---\n{}", body.join("\n"));

        let parsed = parse_ok(&content);
        assert_eq!(parsed.frontmatter.as_deref(), Some("name: test"));
        assert_eq!(parsed.content, body.join("\n"));
    }

    #[test]
    fn test_parse_body_with_frontmatter_like_content() {
        let body = [
            "# Content",
            "",
            "This mentions --- but it's in the body",
            "So it should be fine",
        ];
        let content = format!("---\nname: test\n---\n{}", body.join("\n"));

        let parsed = parse_ok(&content);
        assert_eq!(parsed.frontmatter.as_deref(), Some("name: test"));
        assert_eq!(parsed.content, body.join("\n"));
    }

    #[test]
    fn test_parse_with_context_error_message() {
        let content = doc(&["---", "---", "Content"]);
        let path = Path::new("test.agent.md");

        let message = MarkdownParser::new()
            .parse_with_context(&content, Some(path))
            .expect_err("empty frontmatter should be rejected")
            .to_string();
        assert!(message.contains("test.agent.md"));
        assert!(message.contains("Frontmatter cannot be empty"));
    }

    #[test]
    fn test_parse_with_context_unclosed_error_message() {
        let content = doc(&["---", "name: test", "# Content"]);
        let path = Path::new("test.agent.md");

        let message = MarkdownParser::new()
            .parse_with_context(&content, Some(path))
            .expect_err("unclosed frontmatter should be rejected")
            .to_string();
        assert!(message.contains("test.agent.md"));
        assert!(message.contains("Unclosed frontmatter"));
    }

    #[test]
    fn test_parse_consistency() {
        let parser = MarkdownParser::new();
        let content = doc(&[
            "---",
            "name: test-agent",
            "description: Test",
            "---",
            "Body content",
        ]);

        let first = parser.parse(&content).unwrap();
        let second = parser.parse(&content).unwrap();

        assert_eq!(first, second);
    }

    #[test]
    fn test_parse_only_frontmatter_delimiter() {
        // A lone "---" opens a frontmatter block that is never closed.
        let message = parse_err("---");
        assert!(message.contains("Unclosed frontmatter"));
    }

    #[test]
    fn test_parse_multiple_delimiters_in_body() {
        let content = doc(&[
            "---",
            "name: test",
            "---",
            "First section",
            "---",
            "Second section",
            "---",
            "Third section",
        ]);

        let parsed = parse_ok(&content);
        assert_eq!(parsed.frontmatter.as_deref(), Some("name: test"));
        // Only the first closing delimiter ends the frontmatter; later rules
        // stay in the body.
        assert_eq!(
            parsed.content,
            "First section\n---\nSecond section\n---\nThird section"
        );
    }

    #[test]
    fn test_parse_very_long_frontmatter() {
        let fields: Vec<String> = (0..100)
            .map(|i| format!("field{}: value{}", i, i))
            .collect();
        let content = format!("---\n{}\n---\nBody", fields.join("\n"));

        let parsed = parse_ok(&content);
        let frontmatter = parsed.frontmatter.expect("frontmatter should be present");
        assert!(frontmatter.starts_with("field0: value0"));
        assert!(frontmatter.ends_with("field99: value99"));
        assert_eq!(frontmatter.lines().count(), 100);
        assert_eq!(parsed.content, "Body");
    }

    #[test]
    fn test_parse_very_long_body() {
        let lines: Vec<String> = (0..1000).map(|i| format!("Line {}", i)).collect();
        let content = format!("---\nname: test\n---\n# Content\n{}\n", lines.join("\n"));

        let parsed = parse_ok(&content);
        assert_eq!(parsed.frontmatter.as_deref(), Some("name: test"));
        assert!(parsed.content.starts_with("# Content\nLine 0\n"));
        assert!(parsed.content.ends_with("Line 999"));
        assert_eq!(parsed.content.lines().count(), 1001);
    }

    #[test]
    fn test_parse_unicode_content() {
        let content = doc(&[
            "---",
            "name: 测试代理",
            "description: 日本語のテスト",
            "---",
            "# 内容",
            "Ελληνικά",
            "العربية",
        ]);

        let parsed = parse_ok(&content);
        assert_eq!(
            parsed.frontmatter.as_deref(),
            Some("name: 测试代理\ndescription: 日本語のテスト")
        );
        assert_eq!(parsed.content, "# 内容\nΕλληνικά\nالعربية");
    }

    #[test]
    fn test_parse_windows_line_endings() {
        let parsed = parse_ok("---\r\nname: test\r\n---\r\nBody");
        // Carriage returns must not leak into either part.
        assert_eq!(parsed.frontmatter.as_deref(), Some("name: test"));
        assert_eq!(parsed.content, "Body");
    }

    #[test]
    fn test_parse_mixed_line_endings() {
        let parsed = parse_ok("---\nname: test\r\n---\nBody");
        assert_eq!(parsed.frontmatter.as_deref(), Some("name: test"));
        assert_eq!(parsed.content, "Body");
    }

    #[test]
    fn test_parse_tabs_in_frontmatter() {
        let parsed = parse_ok("---\nname:\ttest\n---\nBody");
        assert_eq!(parsed.frontmatter.as_deref(), Some("name:\ttest"));
        assert_eq!(parsed.content, "Body");
    }

    #[test]
    fn test_parse_empty_content() {
        let parsed = parse_ok("");
        assert_eq!(parsed.frontmatter, None);
        assert_eq!(parsed.content, "");
    }

    #[test]
    fn test_parse_only_whitespace() {
        let content = "   \n  \n   ";

        let parsed = parse_ok(content);
        assert_eq!(parsed.frontmatter, None);
        // Without frontmatter the input is handed back exactly as given.
        assert_eq!(parsed.content, content);
    }
}