graphrag_core/text/parsers/
html.rs1use crate::text::{
7 document_structure::{DocumentStructure, Heading, HeadingHierarchy, Section},
8 layout_parser::LayoutParser,
9};
10
/// Layout parser that recovers document structure from `<h1>`–`<h6>` headings
/// in HTML text.
///
/// The parser holds no state, so values are free to create, copy and share.
#[derive(Debug, Clone, Copy)]
pub struct HtmlLayoutParser;
13
14impl HtmlLayoutParser {
15 pub fn new() -> Self {
17 Self
18 }
19
20 fn extract_text_content(tag_content: &str) -> String {
22 let mut text = String::new();
24 let mut inside_tag = false;
25
26 for ch in tag_content.chars() {
27 match ch {
28 '<' => inside_tag = true,
29 '>' => inside_tag = false,
30 _ if !inside_tag => text.push(ch),
31 _ => {}
32 }
33 }
34
35 text.trim().to_string()
36 }
37
38 fn parse_headings(&self, content: &str) -> Vec<Heading> {
40 let mut headings = Vec::new();
41 let mut current_offset = 0;
42
43 let lines: Vec<&str> = content.lines().collect();
45
46 for (line_num, line) in lines.iter().enumerate() {
47 for level in 1..=6 {
49 let open_tag = format!("<h{}", level);
50 let close_tag = format!("</h{}>", level);
51
52 if let Some(start_idx) = line.to_lowercase().find(&open_tag) {
53 if let Some(end_idx) = line.to_lowercase().find(&close_tag) {
54 if let Some(content_start) = line[start_idx..].find('>') {
56 let actual_start = start_idx + content_start + 1;
57 let tag_content = &line[actual_start..end_idx];
58 let text = Self::extract_text_content(tag_content);
59
60 if !text.is_empty() {
61 let heading = Heading::new(
62 level as u8,
63 text,
64 current_offset + start_idx,
65 current_offset + end_idx + close_tag.len(),
66 )
67 .with_line_number(line_num);
68
69 headings.push(heading);
70 }
71 }
72 }
73 }
74 }
75
76 current_offset += line.len() + 1; }
78
79 headings
80 }
81
82 fn build_sections(&self, headings: &[Heading], content: &str) -> Vec<Section> {
84 let mut sections = Vec::new();
85
86 for (i, heading) in headings.iter().enumerate() {
87 let content_start = heading.end_offset;
88 let content_end = headings
89 .get(i + 1)
90 .map(|h| h.start_offset)
91 .unwrap_or(content.len());
92
93 sections.push(Section::new(heading.clone(), content_start, content_end));
94 }
95
96 sections
97 }
98
99 fn build_hierarchy(&self, sections: &mut [Section]) -> HeadingHierarchy {
101 let mut hierarchy = HeadingHierarchy::new();
102 let mut stack: Vec<usize> = Vec::new();
103
104 for idx in 0..sections.len() {
105 let section_level = sections[idx].heading.level;
106
107 while let Some(&parent_idx) = stack.last() {
109 if sections[parent_idx].heading.level < section_level {
110 break;
111 }
112 stack.pop();
113 }
114
115 if let Some(&parent_idx) = stack.last() {
116 sections[parent_idx].child_sections.push(idx);
117 sections[idx].parent_section = Some(parent_idx);
118 } else {
119 hierarchy.root_sections.push(idx);
120 }
121
122 stack.push(idx);
123 }
124
125 for (idx, section) in sections.iter().enumerate() {
127 let mut depth = 0;
128 let mut current = section.parent_section;
129 while let Some(parent_idx) = current {
130 depth += 1;
131 current = sections[parent_idx].parent_section;
132 }
133 hierarchy.depth_map.insert(idx, depth);
134 }
135
136 hierarchy
137 }
138}
139
140impl Default for HtmlLayoutParser {
141 fn default() -> Self {
142 Self::new()
143 }
144}
145
146impl LayoutParser for HtmlLayoutParser {
147 fn parse(&self, content: &str) -> DocumentStructure {
148 let headings = self.parse_headings(content);
149 let mut sections = self.build_sections(&headings, content);
150 let hierarchy = self.build_hierarchy(&mut sections);
151
152 DocumentStructure {
153 headings,
154 sections,
155 hierarchy,
156 }
157 }
158
159 fn supports_format(&self, format: &str) -> bool {
160 matches!(format.to_lowercase().as_str(), "html" | "htm")
161 }
162
163 fn name(&self) -> &'static str {
164 "HtmlLayoutParser"
165 }
166}
167
#[cfg(test)]
mod tests {
    use super::*;

    /// Headings of several levels are found with the expected level and text.
    #[test]
    fn test_html_heading_parsing() {
        let content = r#"
<html>
<body>
<h1>Chapter 1: Introduction</h1>
<p>Some introductory text.</p>
<h2>Section 1.1: Background</h2>
<p>Background information.</p>
<h3>Subsection 1.1.1: Details</h3>
<p>Detailed information.</p>
<h2>Section 1.2: Methods</h2>
<p>Methodology.</p>
</body>
</html>
"#;
        let expected = [
            (1, "Chapter 1: Introduction"),
            (2, "Section 1.1: Background"),
            (3, "Subsection 1.1.1: Details"),
            (2, "Section 1.2: Methods"),
        ];

        let structure = HtmlLayoutParser::new().parse(content);

        assert_eq!(structure.headings.len(), 4);
        for (heading, &(level, text)) in structure.headings.iter().zip(expected.iter()) {
            assert_eq!(heading.level, level);
            assert_eq!(heading.text, text);
        }
    }

    /// Sections are nested under the nearest preceding shallower heading.
    #[test]
    fn test_html_hierarchy() {
        let content = r#"<h1>Main</h1>
<h2>Sub1</h2>
<h3>SubSub1</h3>
<h2>Sub2</h2>"#;
        // (section index, expected parent index)
        let expected_parents: [(usize, Option<usize>); 3] =
            [(1, Some(0)), (2, Some(1)), (3, Some(0))];

        let structure = HtmlLayoutParser::new().parse(content);

        assert_eq!(structure.hierarchy.root_sections.len(), 1);
        assert_eq!(structure.sections.len(), 4);
        for &(child, parent) in expected_parents.iter() {
            assert_eq!(structure.sections[child].parent_section, parent);
        }
    }

    /// Inline markup inside a heading is removed from the heading text.
    #[test]
    fn test_nested_tags_in_heading() {
        let structure =
            HtmlLayoutParser::new().parse("<h1>Chapter <em>One</em></h1><p>Content</p>");

        assert_eq!(structure.headings.len(), 1);
        assert_eq!(structure.headings[0].text, "Chapter One");
    }

    /// Only the HTML format names are accepted, regardless of case.
    #[test]
    fn test_format_support() {
        let parser = HtmlLayoutParser::new();

        for accepted in ["html", "HTML", "htm"].iter() {
            assert!(parser.supports_format(accepted));
        }
        assert!(!parser.supports_format("md"));
    }
}