html_to_markdown_rs/hocr/
extractor.rs1use super::parser::parse_properties;
6use super::types::{HocrElement, HocrElementType, HocrMetadata};
7
8pub fn extract_hocr_document(dom: &tl::VDom, debug: bool) -> (Vec<HocrElement>, HocrMetadata) {
50 let parser = dom.parser();
51 let mut elements = Vec::new();
52 let metadata = extract_metadata(dom);
53
54 for child_handle in dom.children().iter() {
56 collect_hocr_elements(child_handle, parser, &mut elements, debug);
57 }
58
59 (elements, metadata)
60}
61
62fn collect_hocr_elements(
64 node_handle: &tl::NodeHandle,
65 parser: &tl::Parser,
66 elements: &mut Vec<HocrElement>,
67 debug: bool,
68) {
69 if let Some(element) = extract_element(node_handle, parser, debug) {
71 elements.push(element);
72 } else if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
73 let children = tag.children();
75 for child_handle in children.top().iter() {
76 collect_hocr_elements(child_handle, parser, elements, debug);
77 }
78 }
79}
80
81fn extract_metadata(dom: &tl::VDom) -> HocrMetadata {
83 let mut metadata = HocrMetadata::default();
84 let parser = dom.parser();
85
86 fn find_head_and_extract<'a>(
88 node_handle: &tl::NodeHandle,
89 parser: &'a tl::Parser<'a>,
90 metadata: &mut HocrMetadata,
91 ) {
92 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
93 let tag_name = tag.name().as_utf8_str();
94
95 if tag_name == "head" {
96 let children = tag.children();
98 for meta_handle in children.top().iter() {
99 if let Some(tl::Node::Tag(meta_tag)) = meta_handle.get(parser) {
100 if meta_tag.name().as_utf8_str() == "meta" {
101 let attrs = meta_tag.attributes();
102 if let (Some(name), Some(content)) =
103 (attrs.get("name").flatten(), attrs.get("content").flatten())
104 {
105 let name_str = name.as_utf8_str();
106 let content_str = content.as_utf8_str().to_string();
107
108 match name_str.as_ref() {
109 "ocr-system" => metadata.ocr_system = Some(content_str),
110 "ocr-capabilities" => {
111 metadata.ocr_capabilities =
112 content_str.split_whitespace().map(|s| s.to_string()).collect();
113 }
114 "ocr-number-of-pages" => {
115 metadata.ocr_number_of_pages = content_str.parse().ok();
116 }
117 "ocr-langs" => {
118 metadata.ocr_langs =
119 content_str.split_whitespace().map(|s| s.to_string()).collect();
120 }
121 "ocr-scripts" => {
122 metadata.ocr_scripts =
123 content_str.split_whitespace().map(|s| s.to_string()).collect();
124 }
125 _ => {}
126 }
127 }
128 }
129 }
130 }
131 } else {
132 let children = tag.children();
134 for child_handle in children.top().iter() {
135 find_head_and_extract(child_handle, parser, metadata);
136 }
137 }
138 }
139 }
140
141 for child_handle in dom.children().iter() {
143 find_head_and_extract(child_handle, parser, &mut metadata);
144 }
145
146 metadata
147}
148
149fn extract_element(node_handle: &tl::NodeHandle, parser: &tl::Parser, debug: bool) -> Option<HocrElement> {
151 if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
152 let attrs = tag.attributes();
153 let class_attr = attrs.get("class").flatten()?;
154 let classes = class_attr.as_utf8_str();
155
156 let element_type = classes.split_whitespace().find_map(HocrElementType::from_class)?;
158
159 let properties = if let Some(title) = attrs.get("title").flatten() {
161 parse_properties(&title.as_utf8_str(), debug)
162 } else {
163 Default::default()
164 };
165
166 let mut text = String::new();
168 let mut children = Vec::new();
169
170 let tag_children = tag.children();
171 for child_handle in tag_children.top().iter() {
172 if let Some(tl::Node::Raw(bytes)) = child_handle.get(parser) {
173 text.push_str(&bytes.as_utf8_str());
174 } else if let Some(child_element) = extract_element(child_handle, parser, debug) {
175 children.push(child_element);
176 }
177 }
178
179 Some(HocrElement {
180 element_type,
181 properties,
182 text: text.trim().to_string(),
183 children,
184 })
185 } else {
186 None
187 }
188}
189
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `html` with the default parser options, panicking on failure.
    fn parse_html(html: &str) -> tl::VDom<'_> {
        tl::parse(html, tl::ParserOptions::default()).unwrap()
    }

    /// A lone word span yields its type, text, bounding box and confidence.
    #[test]
    fn test_extract_simple_word() {
        let dom = parse_html(r#"<span class="ocrx_word" title="bbox 100 50 150 80; x_wconf 95">Hello</span>"#);
        let root = &dom.children()[0];

        let word = extract_element(root, dom.parser(), false).unwrap();

        assert!(matches!(word.element_type, HocrElementType::OcrxWord));
        assert_eq!(word.text, "Hello");
        assert!(word.properties.bbox.is_some());
        assert_eq!(word.properties.x_wconf, Some(95.0));
    }

    /// A paragraph exposes its word spans as children.
    #[test]
    fn test_extract_paragraph() {
        let dom = parse_html(
            r#"<p class="ocr_par" title="bbox 0 0 200 100">
            <span class="ocrx_word" title="bbox 10 10 50 30; x_wconf 90">First</span>
            <span class="ocrx_word" title="bbox 60 10 100 30; x_wconf 92">Word</span>
        </p>"#,
        );
        let root = &dom.children()[0];

        let paragraph = extract_element(root, dom.parser(), false).unwrap();

        assert!(matches!(paragraph.element_type, HocrElementType::OcrPar));
        assert_eq!(paragraph.children.len(), 2);
        assert!(matches!(paragraph.children[0].element_type, HocrElementType::OcrxWord));
    }

    /// `<meta>` entries in `<head>` populate the document metadata.
    #[test]
    fn test_extract_metadata() {
        let dom = parse_html(
            r#"<!DOCTYPE html>
<html>
<head>
<meta name="ocr-system" content="tesseract 4.1.1" />
<meta name="ocr-capabilities" content="ocr_page ocr_carea ocr_par ocr_line ocrx_word" />
<meta name="ocr-number-of-pages" content="5" />
</head>
<body>
<div class="ocr_page"></div>
</body>
</html>"#,
        );

        let (_, metadata) = extract_hocr_document(&dom, false);

        assert_eq!(metadata.ocr_system.as_deref(), Some("tesseract 4.1.1"));
        assert!(metadata.ocr_capabilities.iter().any(|cap| cap == "ocr_page"));
        assert_eq!(metadata.ocr_number_of_pages, Some(5));
    }
}