html_to_markdown_rs/hocr/
extractor.rs

1//! hOCR element extraction
2//!
3//! Extracts structured hOCR elements from HTML DOM.
4
5use super::parser::parse_properties;
6use super::types::{HocrElement, HocrElementType, HocrMetadata};
7
8/// Extract complete hOCR document structure from HTML DOM
9///
10/// Parses an HTML document containing hOCR annotations and extracts all hOCR elements
11/// along with document metadata.
12///
13/// # Arguments
14///
15/// * `dom` - The parsed HTML DOM (from tl parser)
16/// * `debug` - Enable debug logging for property parsing
17///
18/// # Returns
19///
20/// A tuple containing:
21/// * `Vec<HocrElement>` - All top-level hOCR elements with their full hierarchies
22/// * `HocrMetadata` - Document metadata from `<head>` meta tags
23///
24/// # hOCR 1.2 Compliance
25///
26/// Supports all 40 element types:
27/// - Logical structure (12): ocr_title, ocr_chapter, ocr_section, ocr_par, etc.
28/// - Typesetting (6): ocr_page, ocr_carea, ocr_line, etc.
29/// - Float elements (13): ocr_image, ocr_table, ocr_math, etc.
30/// - Inline elements (6): ocr_dropcap, ocr_glyph, etc.
31/// - Engine-specific (3): ocrx_block, ocrx_line, ocrx_word
32///
33/// Extracts all 20+ properties from title attributes (bbox, x_wconf, baseline, order, etc.)
34/// and all 5 metadata fields (ocr-system, ocr-capabilities, ocr-langs, etc.)
35///
36/// # Example
37///
38/// ```rust
39/// use html_to_markdown_rs::hocr::extract_hocr_document;
40///
41/// let html = r#"<div class="ocr_page" title="bbox 0 0 1000 1500">
42///     <p class="ocr_par" title="bbox 100 100 900 200">
43///         <span class="ocrx_word" title="bbox 100 100 150 130; x_wconf 95">Hello</span>
44///     </p>
45/// </div>"#;
46/// let dom = tl::parse(html, tl::ParserOptions::default()).unwrap();
47/// let (elements, metadata) = extract_hocr_document(&dom, false);
48/// ```
49pub fn extract_hocr_document(dom: &tl::VDom, debug: bool) -> (Vec<HocrElement>, HocrMetadata) {
50    let parser = dom.parser();
51    let mut elements = Vec::new();
52    let metadata = extract_metadata(dom);
53
54    // Recursively search for hOCR elements starting from root
55    for child_handle in dom.children().iter() {
56        collect_hocr_elements(child_handle, parser, &mut elements, debug);
57    }
58
59    (elements, metadata)
60}
61
62/// Recursively collect hOCR elements from DOM tree
63fn collect_hocr_elements(
64    node_handle: &tl::NodeHandle,
65    parser: &tl::Parser,
66    elements: &mut Vec<HocrElement>,
67    debug: bool,
68) {
69    // Try to extract as hOCR element
70    if let Some(element) = extract_element(node_handle, parser, debug) {
71        elements.push(element);
72    } else if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
73        // Not an hOCR element, but continue searching in children
74        let children = tag.children();
75        for child_handle in children.top().iter() {
76            collect_hocr_elements(child_handle, parser, elements, debug);
77        }
78    }
79}
80
81/// Extract hOCR metadata from HTML head
82fn extract_metadata(dom: &tl::VDom) -> HocrMetadata {
83    let mut metadata = HocrMetadata::default();
84    let parser = dom.parser();
85
86    // Recursively search for head element
87    fn find_head_and_extract<'a>(
88        node_handle: &tl::NodeHandle,
89        parser: &'a tl::Parser<'a>,
90        metadata: &mut HocrMetadata,
91    ) {
92        if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
93            let tag_name = tag.name().as_utf8_str();
94
95            if tag_name == "head" {
96                // Found head, extract meta tags
97                let children = tag.children();
98                for meta_handle in children.top().iter() {
99                    if let Some(tl::Node::Tag(meta_tag)) = meta_handle.get(parser) {
100                        if meta_tag.name().as_utf8_str() == "meta" {
101                            let attrs = meta_tag.attributes();
102                            if let (Some(name), Some(content)) =
103                                (attrs.get("name").flatten(), attrs.get("content").flatten())
104                            {
105                                let name_str = name.as_utf8_str();
106                                let content_str = content.as_utf8_str().to_string();
107
108                                match name_str.as_ref() {
109                                    "ocr-system" => metadata.ocr_system = Some(content_str),
110                                    "ocr-capabilities" => {
111                                        metadata.ocr_capabilities =
112                                            content_str.split_whitespace().map(|s| s.to_string()).collect();
113                                    }
114                                    "ocr-number-of-pages" => {
115                                        metadata.ocr_number_of_pages = content_str.parse().ok();
116                                    }
117                                    "ocr-langs" => {
118                                        metadata.ocr_langs =
119                                            content_str.split_whitespace().map(|s| s.to_string()).collect();
120                                    }
121                                    "ocr-scripts" => {
122                                        metadata.ocr_scripts =
123                                            content_str.split_whitespace().map(|s| s.to_string()).collect();
124                                    }
125                                    _ => {}
126                                }
127                            }
128                        }
129                    }
130                }
131            } else {
132                // Keep searching in children
133                let children = tag.children();
134                for child_handle in children.top().iter() {
135                    find_head_and_extract(child_handle, parser, metadata);
136                }
137            }
138        }
139    }
140
141    // Search from root
142    for child_handle in dom.children().iter() {
143        find_head_and_extract(child_handle, parser, &mut metadata);
144    }
145
146    metadata
147}
148
149/// Extract a single hOCR element and its children
150fn extract_element(node_handle: &tl::NodeHandle, parser: &tl::Parser, debug: bool) -> Option<HocrElement> {
151    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
152        let attrs = tag.attributes();
153        let class_attr = attrs.get("class").flatten()?;
154        let classes = class_attr.as_utf8_str();
155
156        // Find hOCR element type from classes
157        let element_type = classes.split_whitespace().find_map(HocrElementType::from_class)?;
158
159        // Parse properties from title attribute
160        let properties = if let Some(title) = attrs.get("title").flatten() {
161            parse_properties(&title.as_utf8_str(), debug)
162        } else {
163            Default::default()
164        };
165
166        // Extract text content and children
167        let mut text = String::new();
168        let mut children = Vec::new();
169
170        let tag_children = tag.children();
171        for child_handle in tag_children.top().iter() {
172            if let Some(tl::Node::Raw(bytes)) = child_handle.get(parser) {
173                text.push_str(&bytes.as_utf8_str());
174            } else if let Some(child_element) = extract_element(child_handle, parser, debug) {
175                children.push(child_element);
176            }
177        }
178
179        Some(HocrElement {
180            element_type,
181            properties,
182            text: text.trim().to_string(),
183            children,
184        })
185    } else {
186        None
187    }
188}
189
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse an HTML snippet with the default `tl` parser options.
    fn parse_dom(html: &str) -> tl::VDom<'_> {
        tl::parse(html, tl::ParserOptions::default()).unwrap()
    }

    /// A lone word span yields its type, text, bbox and confidence.
    #[test]
    fn test_extract_simple_word() {
        let dom = parse_dom(r#"<span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">Hello</span>"#);
        let root = &dom.children()[0];

        let word = extract_element(root, dom.parser(), false).unwrap();

        assert!(matches!(word.element_type, HocrElementType::OcrxWord));
        assert_eq!(word.text, "Hello");
        assert!(word.properties.bbox.is_some());
        assert_eq!(word.properties.x_wconf, Some(95.0));
    }

    /// A paragraph keeps its word spans as children.
    #[test]
    fn test_extract_paragraph() {
        let dom = parse_dom(
            r#"<p class="ocr_par" title="bbox 0 0 200 100">
            <span class="ocrx_word" title="bbox 10 10 50 30; x_wconf 90">First</span>
            <span class="ocrx_word" title="bbox 60 10 100 30; x_wconf 92">Word</span>
        </p>"#,
        );
        let root = &dom.children()[0];

        let paragraph = extract_element(root, dom.parser(), false).unwrap();

        assert!(matches!(paragraph.element_type, HocrElementType::OcrPar));
        assert_eq!(paragraph.children.len(), 2);
        assert!(matches!(paragraph.children[0].element_type, HocrElementType::OcrxWord));
    }

    /// Meta tags in `<head>` populate the document metadata.
    #[test]
    fn test_extract_metadata() {
        let dom = parse_dom(
            r#"<!DOCTYPE html>
<html>
<head>
<meta name="ocr-system" content="tesseract 4.1.1" />
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
<meta name="ocr-number-of-pages" content="5" />
</head>
<body>
<div class="ocr_page"></div>
</body>
</html>"#,
        );

        let (_elements, meta) = extract_hocr_document(&dom, false);

        assert_eq!(meta.ocr_system, Some("tesseract 4.1.1".to_string()));
        assert!(meta.ocr_capabilities.contains(&"ocr_page".to_string()));
        assert_eq!(meta.ocr_number_of_pages, Some(5));
    }
}