// graphrag_core/text/parsers/markdown.rs
use crate::text::{
4 document_structure::{DocumentStructure, Heading, HeadingHierarchy, Section},
5 layout_parser::LayoutParser,
6};
7
/// Layout parser that extracts the heading structure of Markdown text.
///
/// The parser is a stateless unit struct, so it is cheap to copy and can be
/// printed in diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct MarkdownLayoutParser;
10
11impl MarkdownLayoutParser {
12 pub fn new() -> Self {
14 Self
15 }
16
17 fn build_sections(&self, headings: &[Heading], content: &str) -> Vec<Section> {
19 let mut sections = Vec::new();
20
21 for (i, heading) in headings.iter().enumerate() {
22 let content_start = heading.end_offset;
23 let content_end = headings
24 .get(i + 1)
25 .map(|h| h.start_offset)
26 .unwrap_or(content.len());
27
28 sections.push(Section::new(heading.clone(), content_start, content_end));
29 }
30
31 sections
32 }
33
34 fn build_hierarchy(&self, sections: &mut [Section]) -> HeadingHierarchy {
36 let mut hierarchy = HeadingHierarchy::new();
37 let mut stack: Vec<usize> = Vec::new();
38
39 for idx in 0..sections.len() {
40 let section_level = sections[idx].heading.level;
41
42 while let Some(&parent_idx) = stack.last() {
44 if sections[parent_idx].heading.level < section_level {
45 break;
46 }
47 stack.pop();
48 }
49
50 if let Some(&parent_idx) = stack.last() {
51 sections[parent_idx].child_sections.push(idx);
52 sections[idx].parent_section = Some(parent_idx);
53 } else {
54 hierarchy.root_sections.push(idx);
55 }
56
57 stack.push(idx);
58 }
59
60 for (idx, section) in sections.iter().enumerate() {
62 let mut depth = 0;
63 let mut current = section.parent_section;
64 while let Some(parent_idx) = current {
65 depth += 1;
66 current = sections[parent_idx].parent_section;
67 }
68 hierarchy.depth_map.insert(idx, depth);
69 }
70
71 hierarchy
72 }
73}
74
75impl Default for MarkdownLayoutParser {
76 fn default() -> Self {
77 Self::new()
78 }
79}
80
81impl LayoutParser for MarkdownLayoutParser {
82 fn parse(&self, content: &str) -> DocumentStructure {
83 let mut headings = Vec::new();
84 let mut current_offset = 0;
85
86 for (line_num, line) in content.lines().enumerate() {
87 if line.trim_start().starts_with('#') {
89 let trimmed = line.trim();
90 let level = trimmed.chars().take_while(|&c| c == '#').count();
91
92 if level > 0 && level <= 6 {
93 if trimmed.len() > level {
95 let after_hashes = trimmed.chars().nth(level);
96 if after_hashes == Some(' ') || after_hashes.is_none() {
97 let text = trimmed[level..].trim().to_string();
98 if !text.is_empty() {
99 headings.push(
100 Heading::new(
101 level.min(255) as u8,
102 text,
103 current_offset,
104 current_offset + line.len(),
105 )
106 .with_line_number(line_num),
107 );
108 }
109 }
110 }
111 }
112 }
113
114 current_offset += line.len() + 1; }
116
117 let mut sections = self.build_sections(&headings, content);
118 let hierarchy = self.build_hierarchy(&mut sections);
119
120 DocumentStructure {
121 headings,
122 sections,
123 hierarchy,
124 }
125 }
126
127 fn supports_format(&self, format: &str) -> bool {
128 matches!(format.to_lowercase().as_str(), "markdown" | "md")
129 }
130
131 fn name(&self) -> &'static str {
132 "MarkdownLayoutParser"
133 }
134}
135
#[cfg(test)]
mod tests {
    use super::*;

    /// Headings at three nesting levels are detected in document order with
    /// the expected levels and text.
    #[test]
    fn test_markdown_parsing() {
        let content = "# Chapter 1\n\nSome text\n\n## Section 1.1\n\nMore text\n\n### Subsection 1.1.1\n\nDetails";
        let structure = MarkdownLayoutParser::new().parse(content);

        // Comparing the collected levels checks both the count and each level.
        let levels: Vec<_> = structure.headings.iter().map(|h| h.level).collect();
        assert_eq!(levels, [1, 2, 3]);

        assert_eq!(structure.headings[0].text, "Chapter 1");
        assert_eq!(structure.headings[1].text, "Section 1.1");
    }

    /// Two top-level headings yield two roots, and the first `##` heading is
    /// attached to the first `#` heading.
    #[test]
    fn test_hierarchy_building() {
        let content = "# H1\n## H2\n### H3\n## H2b\n# H1b";
        let structure = MarkdownLayoutParser::new().parse(content);

        let root_count = structure.hierarchy.root_sections.len();
        assert_eq!(root_count, 2);

        let first_child_parent = structure.sections[1].parent_section;
        assert_eq!(first_child_parent, Some(0));
    }
}