Skip to main content

graphrag_core/text/parsers/
html.rs

1//! HTML layout parser
2//!
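//! Extracts document structure from HTML documents by locating heading tags (`h1`-`h6`).
//! Headings are matched one line at a time, so a heading whose opening and closing tags are on
//! different lines is not detected. This is a simplified parser; for production use, consider
//! using a proper HTML parsing library.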
5
6use crate::text::{
7    document_structure::{DocumentStructure, Heading, HeadingHierarchy, Section},
8    layout_parser::LayoutParser,
9};
10
/// Parser for HTML documents.
///
/// The parser is a field-less unit struct, so it carries no configuration or state.
#[derive(Debug, Clone, Copy)]
pub struct HtmlLayoutParser;
13
14impl HtmlLayoutParser {
15    /// Create new HTML parser
16    pub fn new() -> Self {
17        Self
18    }
19
20    /// Extract text content from an HTML tag
21    fn extract_text_content(tag_content: &str) -> String {
22        // Remove any nested HTML tags
23        let mut text = String::new();
24        let mut inside_tag = false;
25
26        for ch in tag_content.chars() {
27            match ch {
28                '<' => inside_tag = true,
29                '>' => inside_tag = false,
30                _ if !inside_tag => text.push(ch),
31                _ => {}
32            }
33        }
34
35        text.trim().to_string()
36    }
37
38    /// Parse HTML headings from content
39    fn parse_headings(&self, content: &str) -> Vec<Heading> {
40        let mut headings = Vec::new();
41        let mut current_offset = 0;
42
43        // Simple regex-like pattern matching for heading tags
44        let lines: Vec<&str> = content.lines().collect();
45
46        for (line_num, line) in lines.iter().enumerate() {
47            // Look for heading tags h1-h6
48            for level in 1..=6 {
49                let open_tag = format!("<h{}", level);
50                let close_tag = format!("</h{}>", level);
51
52                if let Some(start_idx) = line.to_lowercase().find(&open_tag) {
53                    if let Some(end_idx) = line.to_lowercase().find(&close_tag) {
54                        // Extract the tag content (everything between > and </h)
55                        if let Some(content_start) = line[start_idx..].find('>') {
56                            let actual_start = start_idx + content_start + 1;
57                            let tag_content = &line[actual_start..end_idx];
58                            let text = Self::extract_text_content(tag_content);
59
60                            if !text.is_empty() {
61                                let heading = Heading::new(
62                                    level as u8,
63                                    text,
64                                    current_offset + start_idx,
65                                    current_offset + end_idx + close_tag.len(),
66                                )
67                                .with_line_number(line_num);
68
69                                headings.push(heading);
70                            }
71                        }
72                    }
73                }
74            }
75
76            current_offset += line.len() + 1; // +1 for newline
77        }
78
79        headings
80    }
81
82    /// Build sections from headings
83    fn build_sections(&self, headings: &[Heading], content: &str) -> Vec<Section> {
84        let mut sections = Vec::new();
85
86        for (i, heading) in headings.iter().enumerate() {
87            let content_start = heading.end_offset;
88            let content_end = headings
89                .get(i + 1)
90                .map(|h| h.start_offset)
91                .unwrap_or(content.len());
92
93            sections.push(Section::new(heading.clone(), content_start, content_end));
94        }
95
96        sections
97    }
98
99    /// Build hierarchy from sections
100    fn build_hierarchy(&self, sections: &mut [Section]) -> HeadingHierarchy {
101        let mut hierarchy = HeadingHierarchy::new();
102        let mut stack: Vec<usize> = Vec::new();
103
104        for idx in 0..sections.len() {
105            let section_level = sections[idx].heading.level;
106
107            // Pop stack until we find parent
108            while let Some(&parent_idx) = stack.last() {
109                if sections[parent_idx].heading.level < section_level {
110                    break;
111                }
112                stack.pop();
113            }
114
115            if let Some(&parent_idx) = stack.last() {
116                sections[parent_idx].child_sections.push(idx);
117                sections[idx].parent_section = Some(parent_idx);
118            } else {
119                hierarchy.root_sections.push(idx);
120            }
121
122            stack.push(idx);
123        }
124
125        // Build depth map
126        for (idx, section) in sections.iter().enumerate() {
127            let mut depth = 0;
128            let mut current = section.parent_section;
129            while let Some(parent_idx) = current {
130                depth += 1;
131                current = sections[parent_idx].parent_section;
132            }
133            hierarchy.depth_map.insert(idx, depth);
134        }
135
136        hierarchy
137    }
138}
139
140impl Default for HtmlLayoutParser {
141    fn default() -> Self {
142        Self::new()
143    }
144}
145
146impl LayoutParser for HtmlLayoutParser {
147    fn parse(&self, content: &str) -> DocumentStructure {
148        let headings = self.parse_headings(content);
149        let mut sections = self.build_sections(&headings, content);
150        let hierarchy = self.build_hierarchy(&mut sections);
151
152        DocumentStructure {
153            headings,
154            sections,
155            hierarchy,
156        }
157    }
158
159    fn supports_format(&self, format: &str) -> bool {
160        matches!(format.to_lowercase().as_str(), "html" | "htm")
161    }
162
163    fn name(&self) -> &'static str {
164        "HtmlLayoutParser"
165    }
166}
167
#[cfg(test)]
mod tests {
    use super::*;

    /// Headings of several levels are reported in document order with their text intact.
    #[test]
    fn test_html_heading_parsing() {
        let content = r#"
<html>
<body>
<h1>Chapter 1: Introduction</h1>
<p>Some introductory text.</p>
<h2>Section 1.1: Background</h2>
<p>Background information.</p>
<h3>Subsection 1.1.1: Details</h3>
<p>Detailed information.</p>
<h2>Section 1.2: Methods</h2>
<p>Methodology.</p>
</body>
</html>
"#;

        let structure = HtmlLayoutParser::new().parse(content);

        // (level, text) of every heading, in the order it appears in the document.
        let expected = [
            (1, "Chapter 1: Introduction"),
            (2, "Section 1.1: Background"),
            (3, "Subsection 1.1.1: Details"),
            (2, "Section 1.2: Methods"),
        ];

        assert_eq!(structure.headings.len(), expected.len());
        for (heading, (level, text)) in structure.headings.iter().zip(expected.iter()) {
            assert_eq!(heading.level, *level);
            assert_eq!(heading.text, *text);
        }
    }

    /// Each section is attached to the nearest preceding section with a smaller level.
    #[test]
    fn test_html_hierarchy() {
        let content = r#"<h1>Main</h1>
<h2>Sub1</h2>
<h3>SubSub1</h3>
<h2>Sub2</h2>"#;

        let structure = HtmlLayoutParser::new().parse(content);

        // The single h1 is the only root.
        assert_eq!(structure.hierarchy.root_sections.len(), 1);
        assert_eq!(structure.sections.len(), 4);

        // (section index, expected parent): h2 -> h1, h3 -> h2, second h2 -> h1.
        let expected_parents: [(usize, Option<usize>); 3] =
            [(1, Some(0)), (2, Some(1)), (3, Some(0))];
        for &(child, parent) in expected_parents.iter() {
            assert_eq!(structure.sections[child].parent_section, parent);
        }
    }

    /// Markup nested inside a heading is stripped from the heading text.
    #[test]
    fn test_nested_tags_in_heading() {
        let structure =
            HtmlLayoutParser::new().parse("<h1>Chapter <em>One</em></h1><p>Content</p>");

        assert_eq!(structure.headings.len(), 1);
        assert_eq!(structure.headings[0].text, "Chapter One");
    }

    /// Only `html` / `htm` are accepted, regardless of letter case.
    #[test]
    fn test_format_support() {
        let parser = HtmlLayoutParser::new();

        for accepted in ["html", "HTML", "htm"].iter() {
            assert!(parser.supports_format(accepted));
        }
        assert!(!parser.supports_format("md"));
    }
}