Skip to main content

graphrag_core/text/parsers/
markdown.rs

1//! Markdown layout parser
2
3use crate::text::{
4    document_structure::{DocumentStructure, Heading, HeadingHierarchy, Section},
5    layout_parser::LayoutParser,
6};
7
/// Parser for Markdown documents.
///
/// The parser is a stateless unit struct: it carries no configuration, so
/// values are trivially copyable and can be shared freely.
#[derive(Debug, Clone, Copy)]
pub struct MarkdownLayoutParser;
10
11impl MarkdownLayoutParser {
12    /// Create new Markdown parser
13    pub fn new() -> Self {
14        Self
15    }
16
17    /// Build sections from headings
18    fn build_sections(&self, headings: &[Heading], content: &str) -> Vec<Section> {
19        let mut sections = Vec::new();
20
21        for (i, heading) in headings.iter().enumerate() {
22            let content_start = heading.end_offset;
23            let content_end = headings
24                .get(i + 1)
25                .map(|h| h.start_offset)
26                .unwrap_or(content.len());
27
28            sections.push(Section::new(heading.clone(), content_start, content_end));
29        }
30
31        sections
32    }
33
34    /// Build hierarchy from sections
35    fn build_hierarchy(&self, sections: &mut [Section]) -> HeadingHierarchy {
36        let mut hierarchy = HeadingHierarchy::new();
37        let mut stack: Vec<usize> = Vec::new();
38
39        for idx in 0..sections.len() {
40            let section_level = sections[idx].heading.level;
41
42            // Pop stack until we find parent
43            while let Some(&parent_idx) = stack.last() {
44                if sections[parent_idx].heading.level < section_level {
45                    break;
46                }
47                stack.pop();
48            }
49
50            if let Some(&parent_idx) = stack.last() {
51                sections[parent_idx].child_sections.push(idx);
52                sections[idx].parent_section = Some(parent_idx);
53            } else {
54                hierarchy.root_sections.push(idx);
55            }
56
57            stack.push(idx);
58        }
59
60        // Build depth map
61        for (idx, section) in sections.iter().enumerate() {
62            let mut depth = 0;
63            let mut current = section.parent_section;
64            while let Some(parent_idx) = current {
65                depth += 1;
66                current = sections[parent_idx].parent_section;
67            }
68            hierarchy.depth_map.insert(idx, depth);
69        }
70
71        hierarchy
72    }
73}
74
75impl Default for MarkdownLayoutParser {
76    fn default() -> Self {
77        Self::new()
78    }
79}
80
81impl LayoutParser for MarkdownLayoutParser {
82    fn parse(&self, content: &str) -> DocumentStructure {
83        let mut headings = Vec::new();
84        let mut current_offset = 0;
85
86        for (line_num, line) in content.lines().enumerate() {
87            // Detect markdown headings: # ## ### etc.
88            if line.trim_start().starts_with('#') {
89                let trimmed = line.trim();
90                let level = trimmed.chars().take_while(|&c| c == '#').count();
91
92                if level > 0 && level <= 6 {
93                    // Verify proper markdown (space after hashes)
94                    if trimmed.len() > level {
95                        let after_hashes = trimmed.chars().nth(level);
96                        if after_hashes == Some(' ') || after_hashes.is_none() {
97                            let text = trimmed[level..].trim().to_string();
98                            if !text.is_empty() {
99                                headings.push(
100                                    Heading::new(
101                                        level.min(255) as u8,
102                                        text,
103                                        current_offset,
104                                        current_offset + line.len(),
105                                    )
106                                    .with_line_number(line_num),
107                                );
108                            }
109                        }
110                    }
111                }
112            }
113
114            current_offset += line.len() + 1; // +1 for newline
115        }
116
117        let mut sections = self.build_sections(&headings, content);
118        let hierarchy = self.build_hierarchy(&mut sections);
119
120        DocumentStructure {
121            headings,
122            sections,
123            hierarchy,
124        }
125    }
126
127    fn supports_format(&self, format: &str) -> bool {
128        matches!(format.to_lowercase().as_str(), "markdown" | "md")
129    }
130
131    fn name(&self) -> &'static str {
132        "MarkdownLayoutParser"
133    }
134}
135
#[cfg(test)]
mod tests {
    use super::*;

    /// Three headings of increasing depth, separated by body text, are all
    /// detected with the expected levels and titles.
    #[test]
    fn test_markdown_parsing() {
        let content = "# Chapter 1\n\nSome text\n\n## Section 1.1\n\nMore text\n\n### Subsection 1.1.1\n\nDetails";

        let structure = MarkdownLayoutParser::new().parse(content);
        let found = &structure.headings;

        assert_eq!(found.len(), 3);

        let chapter = &found[0];
        assert_eq!(chapter.level, 1);
        assert_eq!(chapter.text, "Chapter 1");

        let section = &found[1];
        assert_eq!(section.level, 2);
        assert_eq!(section.text, "Section 1.1");

        assert_eq!(found[2].level, 3);
    }

    /// Two top-level headings yield two roots, and the first `##` heading is
    /// attached to the first `#` heading.
    #[test]
    fn test_hierarchy_building() {
        let content = "# H1\n## H2\n### H3\n## H2b\n# H1b";

        let structure = MarkdownLayoutParser::new().parse(content);

        // Two H1s
        assert_eq!(structure.hierarchy.root_sections.len(), 2);
        // H2 parent is H1
        assert_eq!(structure.sections[1].parent_section, Some(0));
    }
}