agentroot_core/index/
parser.rs

1//! Document parsing utilities
2
3use lazy_static::lazy_static;
4use regex::Regex;
5use std::path::Path;
6
7lazy_static! {
8    static ref HEADING_RE: Regex = Regex::new(r"^##?\s+(.+)$").unwrap();
9    static ref SECOND_HEADING_RE: Regex = Regex::new(r"^##\s+(.+)$").unwrap();
10}
11
12/// Generic headings to skip
13const SKIP_TITLES: &[&str] = &["Notes", "README", "Index"];
14
15/// Extract title from markdown content
16pub fn extract_title(content: &str, filename: &str) -> String {
17    for line in content.lines().take(50) {
18        if let Some(caps) = HEADING_RE.captures(line) {
19            let title = caps.get(1).map(|m| m.as_str().trim()).unwrap_or("");
20
21            if SKIP_TITLES
22                .iter()
23                .any(|&s| title == s || title.contains("Notes"))
24            {
25                for line2 in content.lines().skip(1).take(50) {
26                    if let Some(caps2) = SECOND_HEADING_RE.captures(line2) {
27                        if let Some(title2) = caps2.get(1) {
28                            return title2.as_str().trim().to_string();
29                        }
30                    }
31                }
32            }
33
34            if !title.is_empty() {
35                return title.to_string();
36            }
37        }
38    }
39
40    Path::new(filename)
41        .file_stem()
42        .and_then(|s| s.to_str())
43        .map(|s| s.replace(['-', '_'], " "))
44        .unwrap_or_else(|| filename.to_string())
45}
46
/// Normalize a path into a storage handle ("handelize").
///
/// Steps, in order:
/// 1. lowercase the whole path;
/// 2. turn every `___` into a `/` separator;
/// 3. replace every character that is not alphanumeric, `/`, `.` or `-`
///    with `-`;
/// 4. collapse each run of consecutive `-` into a single `-`;
/// 5. strip leading and trailing `-` from the result.
pub fn handelize(path: &str) -> String {
    let lowered = path.to_lowercase().replace("___", "/");
    let mut handle = String::with_capacity(lowered.len());

    for c in lowered.chars() {
        let mapped = if c.is_alphanumeric() || matches!(c, '/' | '.' | '-') {
            c
        } else {
            '-'
        };

        // Collapse dash runs of any length as we go; a single
        // `replace("--", "-")` pass would leave "--" behind for 3+ dashes.
        if mapped == '-' && handle.ends_with('-') {
            continue;
        }
        handle.push(mapped);
    }

    handle.trim_matches('-').to_string()
}
64
#[cfg(test)]
mod tests {
    use super::{extract_title, handelize};

    /// A leading `#` heading becomes the title.
    #[test]
    fn test_extract_title_heading() {
        let title = extract_title("# My Document\n\nSome content here.", "doc.md");
        assert_eq!(title, "My Document");
    }

    /// Without any heading, the title comes from the file stem with dashes
    /// turned into spaces.
    #[test]
    fn test_extract_title_fallback() {
        let title = extract_title("No heading here, just text.", "my-doc.md");
        assert_eq!(title, "my doc");
    }

    /// Paths are lowercased, spaces become dashes, and `___` becomes `/`.
    #[test]
    fn test_handelize() {
        let cases = [
            ("My Docs/2024/Report.md", "my-docs/2024/report.md"),
            ("foo___bar.md", "foo/bar.md"),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(handelize(input), expected);
        }
    }
}