html_to_markdown_rs/hocr/
extractor.rs1use super::parser::parse_properties;
6use super::types::{HocrElement, HocrElementType, HocrMetadata};
7
8pub fn extract_hocr_document(dom: &tl::VDom, debug: bool) -> (Vec<HocrElement>, HocrMetadata) {
50 let parser = dom.parser();
51 let mut elements = Vec::new();
52 let metadata = extract_metadata(dom);
53
54 for child_handle in dom.children().iter() {
55 collect_hocr_elements(child_handle, parser, &mut elements, debug);
56 }
57
58 (elements, metadata)
59}
60
61fn collect_hocr_elements(
63 node_handle: &tl::NodeHandle,
64 parser: &tl::Parser,
65 elements: &mut Vec<HocrElement>,
66 debug: bool,
67) {
68 if let Some(element) = extract_element(node_handle, parser, debug) {
69 elements.push(element);
70 } else if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
71 let children = tag.children();
72 for child_handle in children.top().iter() {
73 collect_hocr_elements(child_handle, parser, elements, debug);
74 }
75 }
76}
77
78fn extract_metadata(dom: &tl::VDom) -> HocrMetadata {
80 let mut metadata = HocrMetadata::default();
81 let parser = dom.parser();
82
83 fn extract_from_meta_tag(meta_tag: &tl::HTMLTag, metadata: &mut HocrMetadata) {
84 let attrs = meta_tag.attributes();
85 if let (Some(name), Some(content)) = (attrs.get("name").flatten(), attrs.get("content").flatten()) {
86 let name_str = name.as_utf8_str();
87 let content_str = content.as_utf8_str().to_string();
88
89 match name_str.as_ref() {
90 "ocr-system" => metadata.ocr_system = Some(content_str),
91 "ocr-capabilities" => {
92 metadata.ocr_capabilities = content_str.split_whitespace().map(|s| s.to_string()).collect();
93 }
94 "ocr-number-of-pages" => {
95 metadata.ocr_number_of_pages = content_str.parse().ok();
96 }
97 "ocr-langs" => {
98 metadata.ocr_langs = content_str.split_whitespace().map(|s| s.to_string()).collect();
99 }
100 "ocr-scripts" => {
101 metadata.ocr_scripts = content_str.split_whitespace().map(|s| s.to_string()).collect();
102 }
103 _ => {}
104 }
105 }
106 }
107
108 fn find_meta_tags<'a>(node_handle: &tl::NodeHandle, parser: &'a tl::Parser<'a>, metadata: &mut HocrMetadata) {
109 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
110 let tag_name = tag.name().as_utf8_str();
111
112 if tag_name == "meta" {
113 extract_from_meta_tag(tag, metadata);
114 }
115
116 let children = tag.children();
117 for child_handle in children.top().iter() {
118 find_meta_tags(child_handle, parser, metadata);
119 }
120 }
121 }
122
123 for child_handle in dom.children().iter() {
124 find_meta_tags(child_handle, parser, &mut metadata);
125 }
126
127 metadata
128}
129
130fn extract_element(node_handle: &tl::NodeHandle, parser: &tl::Parser, debug: bool) -> Option<HocrElement> {
132 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
133 let attrs = tag.attributes();
134 let class_attr = attrs.get("class").flatten()?;
135 let classes = class_attr.as_utf8_str();
136
137 let element_type = classes.split_whitespace().find_map(HocrElementType::from_class)?;
138
139 let properties = if let Some(title) = attrs.get("title").flatten() {
140 parse_properties(&title.as_utf8_str(), debug)
141 } else {
142 Default::default()
143 };
144
145 let mut text = String::new();
146 let mut children = Vec::new();
147
148 let tag_children = tag.children();
149 for child_handle in tag_children.top().iter() {
150 if let Some(tl::Node::Raw(bytes)) = child_handle.get(parser) {
151 text.push_str(&bytes.as_utf8_str());
152 } else if let Some(child_element) = extract_element(child_handle, parser, debug) {
153 children.push(child_element);
154 }
155 }
156
157 Some(HocrElement {
158 element_type,
159 properties,
160 text: text.trim().to_string(),
161 children,
162 })
163 } else {
164 None
165 }
166}
167
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `html` with the default parser options, panicking on failure.
    fn parse_html(html: &str) -> tl::VDom<'_> {
        tl::parse(html, tl::ParserOptions::default()).unwrap()
    }

    /// A single word span yields its type, text, bounding box and confidence.
    #[test]
    fn test_extract_simple_word() {
        let dom = parse_html(r#"<span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">Hello</span>"#);

        let word = extract_element(&dom.children()[0], dom.parser(), false).unwrap();

        assert!(matches!(word.element_type, HocrElementType::OcrxWord));
        assert_eq!(word.text, "Hello");
        assert!(word.properties.bbox.is_some());
        assert_eq!(word.properties.x_wconf, Some(95.0));
    }

    /// A paragraph keeps its nested word elements as children.
    #[test]
    fn test_extract_paragraph() {
        let dom = parse_html(
            r#"<p class="ocr_par" title="bbox 0 0 200 100">
            <span class="ocrx_word" title="bbox 10 10 50 30; x_wconf 90">First</span>
            <span class="ocrx_word" title="bbox 60 10 100 30; x_wconf 92">Word</span>
        </p>"#,
        );

        let paragraph = extract_element(&dom.children()[0], dom.parser(), false).unwrap();

        assert!(matches!(paragraph.element_type, HocrElementType::OcrPar));
        assert_eq!(paragraph.children.len(), 2);
        assert!(matches!(paragraph.children[0].element_type, HocrElementType::OcrxWord));
    }

    /// `<meta>` tags in the document head populate the metadata fields.
    #[test]
    fn test_extract_metadata() {
        let dom = parse_html(
            r#"<!DOCTYPE html>
<html>
<head>
<meta name="ocr-system" content="tesseract 4.1.1" />
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
<meta name="ocr-number-of-pages" content="5" />
</head>
<body>
<div class="ocr_page"></div>
</body>
</html>"#,
        );

        let (_elements, metadata) = extract_hocr_document(&dom, false);

        assert_eq!(metadata.ocr_system.as_deref(), Some("tesseract 4.1.1"));
        assert!(metadata.ocr_capabilities.contains(&"ocr_page".to_string()));
        assert_eq!(metadata.ocr_number_of_pages, Some(5));
    }
}