use crate::types::extraction::BoundingBox;
use crate::types::internal::{ElementKind, InternalDocument, InternalElement};
use crate::types::ocr_elements::{OcrBoundingGeometry, OcrConfidence, OcrElementLevel};
pub fn parse_hocr_to_internal_document(hocr_html: &str) -> InternalDocument {
let mut doc = InternalDocument::new("ocr");
doc.mime_type = std::borrow::Cow::Borrowed("application/x-hocr");
let mut element_index: u32 = 0;
let mut last_page: Option<u32> = None;
let bytes = hocr_html.as_bytes();
let mut pos = 0;
while pos < bytes.len() {
let Some(tag_start) = memchr(b'<', &bytes[pos..]).map(|i| pos + i) else {
break;
};
let Some(tag_end) = memchr(b'>', &bytes[tag_start..]).map(|i| tag_start + i) else {
break;
};
let tag_content = &hocr_html[tag_start + 1..tag_end];
pos = tag_end + 1;
if tag_content.starts_with('/') || tag_content.ends_with('/') {
continue;
}
if has_class(tag_content, "ocr_page") {
let title = extract_title_attr(tag_content);
let props = parse_title_properties(&title);
let page_number = props.ppageno.map(|p| p + 1);
if let Some(prev) = last_page
&& page_number != Some(prev)
{
let pb = InternalElement::text(ElementKind::PageBreak, "", 0).with_index(element_index);
element_index += 1;
doc.push_element(pb);
}
last_page = page_number;
continue;
}
if is_paragraph_tag(tag_content) {
let (paragraph, end_pos) = parse_paragraph(hocr_html, pos, last_page.unwrap_or(1), element_index);
pos = end_pos;
if let Some(elem) = paragraph {
element_index += 1;
doc.push_element(elem);
}
}
}
doc
}
#[derive(Debug, Default)]
struct HocrProperties {
bbox: Option<(u32, u32, u32, u32)>,
x_wconf: Option<f64>,
ppageno: Option<u32>,
textangle: Option<f64>,
baseline: Option<(f64, i32)>,
x_font: Option<String>,
x_fsize: Option<u32>,
}
fn parse_title_properties(title: &str) -> HocrProperties {
let mut props = HocrProperties::default();
for part in title.split(';') {
let part = part.trim();
if part.is_empty() {
continue;
}
let mut tokens = part.split_whitespace();
let Some(key) = tokens.next() else {
continue;
};
match key {
"bbox" => {
let coords: Vec<u32> = tokens.filter_map(|s| s.parse().ok()).collect();
if coords.len() == 4 {
props.bbox = Some((coords[0], coords[1], coords[2], coords[3]));
}
}
"x_wconf" => {
if let Some(val) = tokens.next().and_then(|s| s.parse::<f64>().ok()) {
props.x_wconf = Some(val);
}
}
"ppageno" => {
if let Some(val) = tokens.next().and_then(|s| s.parse::<u32>().ok()) {
props.ppageno = Some(val);
}
}
"textangle" => {
if let Some(val) = tokens.next().and_then(|s| s.parse::<f64>().ok()) {
props.textangle = Some(val);
}
}
"baseline" => {
let slope = tokens.next().and_then(|s| s.parse::<f64>().ok());
let constant = tokens.next().and_then(|s| s.parse::<i32>().ok());
if let (Some(s), Some(c)) = (slope, constant) {
props.baseline = Some((s, c));
}
}
"x_font" => {
props.x_font = parse_quoted_value(part);
}
"x_fsize" => {
if let Some(val) = tokens.next().and_then(|s| s.parse::<u32>().ok()) {
props.x_fsize = Some(val);
}
}
_ => {}
}
}
props
}
fn parse_quoted_value(part: &str) -> Option<String> {
let start = part.find('"')?;
let end = part[start + 1..].find('"')?;
Some(part[start + 1..start + 1 + end].to_string())
}
struct HocrWordInfo {
text: String,
x0: u32,
y0: u32,
x1: u32,
y1: u32,
confidence: Option<f64>,
}
fn parse_paragraph(html: &str, start: usize, page: u32, element_index: u32) -> (Option<InternalElement>, usize) {
let bytes = html.as_bytes();
let mut pos = start;
let mut lines: Vec<Vec<HocrWordInfo>> = Vec::new();
let mut current_line: Vec<HocrWordInfo> = Vec::new();
let mut in_line = false;
let mut depth: u32 = 1;
while pos < bytes.len() {
let Some(tag_start) = memchr(b'<', &bytes[pos..]).map(|i| pos + i) else {
break;
};
let Some(tag_end) = memchr(b'>', &bytes[tag_start..]).map(|i| tag_start + i) else {
break;
};
let tag_content = &html[tag_start + 1..tag_end];
pos = tag_end + 1;
if let Some(stripped) = tag_content.strip_prefix('/') {
let closing_name = stripped.trim().to_ascii_lowercase();
if closing_name == "p" || closing_name == "span" || closing_name == "div" {
depth = depth.saturating_sub(1);
if depth == 0 {
if !current_line.is_empty() {
lines.push(std::mem::take(&mut current_line));
}
break;
}
}
continue;
}
if tag_content.ends_with('/') {
continue;
}
let tag_name = tag_content.split_whitespace().next().unwrap_or("").to_ascii_lowercase();
if has_class(tag_content, "ocr_line") || has_class(tag_content, "ocrx_line") {
if in_line && !current_line.is_empty() {
lines.push(std::mem::take(&mut current_line));
}
in_line = true;
if tag_name == "p" || tag_name == "span" || tag_name == "div" {
depth += 1;
}
continue;
}
if has_class(tag_content, "ocrx_word") {
let title = extract_title_attr(tag_content);
let props = parse_title_properties(&title);
let word_text = extract_inner_text(html, pos);
let trimmed = decode_html_entities(&word_text);
let trimmed = trimmed.trim();
pos = skip_to_matching_close(html, pos, &tag_name);
if !trimmed.is_empty() {
let (x0, y0, x1, y1) = props.bbox.unwrap_or((0, 0, 0, 0));
current_line.push(HocrWordInfo {
text: trimmed.to_string(),
x0,
y0,
x1,
y1,
confidence: props.x_wconf,
});
}
continue;
}
if tag_name == "p" || tag_name == "span" || tag_name == "div" {
depth += 1;
}
}
let all_words: Vec<&HocrWordInfo> = lines.iter().flat_map(|l| l.iter()).collect();
if all_words.is_empty() {
return (None, pos);
}
let text: String = lines
.iter()
.map(|line| line.iter().map(|w| w.text.as_str()).collect::<Vec<_>>().join(" "))
.collect::<Vec<_>>()
.join("\n");
let mut min_x0 = u32::MAX;
let mut min_y0 = u32::MAX;
let mut max_x1 = 0u32;
let mut max_y1 = 0u32;
let mut conf_sum = 0.0f64;
let mut conf_count = 0u32;
for word in &all_words {
if word.x1 > 0 || word.y1 > 0 {
min_x0 = min_x0.min(word.x0);
min_y0 = min_y0.min(word.y0);
max_x1 = max_x1.max(word.x1);
max_y1 = max_y1.max(word.y1);
}
if let Some(c) = word.confidence {
conf_sum += c;
conf_count += 1;
}
}
let has_valid_bbox = max_x1 > 0 || max_y1 > 0;
let bbox = if has_valid_bbox {
Some(BoundingBox {
x0: min_x0 as f64,
y0: min_y0 as f64,
x1: max_x1 as f64,
y1: max_y1 as f64,
})
} else {
None
};
let ocr_geometry = if has_valid_bbox {
Some(OcrBoundingGeometry::Rectangle {
left: min_x0,
top: min_y0,
width: max_x1.saturating_sub(min_x0),
height: max_y1.saturating_sub(min_y0),
})
} else {
None
};
let ocr_confidence = if conf_count > 0 {
Some(OcrConfidence::from_tesseract(conf_sum / conf_count as f64))
} else {
None
};
let kind = ElementKind::OcrText {
level: OcrElementLevel::Block,
};
let mut elem = InternalElement::text(kind, text, 0)
.with_page(page)
.with_index(element_index);
elem.bbox = bbox;
elem.ocr_geometry = ocr_geometry;
elem.ocr_confidence = ocr_confidence;
(Some(elem), pos)
}
#[inline]
fn memchr(needle: u8, haystack: &[u8]) -> Option<usize> {
haystack.iter().position(|&b| b == needle)
}
fn has_class(tag_content: &str, cls: &str) -> bool {
if let Some(class_start) = tag_content.find("class=") {
let rest = &tag_content[class_start + 6..];
let quote = rest.as_bytes().first().copied().unwrap_or(b'"');
if quote == b'"' || quote == b'\'' {
let inner = &rest[1..];
if let Some(end) = inner.find(quote as char) {
let class_value = &inner[..end];
return class_value.split_whitespace().any(|c| c == cls);
}
}
}
false
}
fn is_paragraph_tag(tag_content: &str) -> bool {
has_class(tag_content, "ocr_par")
}
fn extract_title_attr(tag_content: &str) -> String {
if let Some(title_start) = tag_content.find("title=") {
let rest = &tag_content[title_start + 6..];
let quote = rest.as_bytes().first().copied().unwrap_or(b'"');
if quote == b'"' || quote == b'\'' {
let inner = &rest[1..];
if let Some(end) = inner.find(quote as char) {
return inner[..end].to_string();
}
}
}
String::new()
}
fn extract_inner_text(html: &str, start: usize) -> String {
let bytes = html.as_bytes();
let mut result = String::new();
let mut pos = start;
let mut depth: u32 = 1;
while pos < bytes.len() && depth > 0 {
if let Some(lt) = memchr(b'<', &bytes[pos..]).map(|i| pos + i) {
result.push_str(&html[pos..lt]);
if let Some(gt) = memchr(b'>', &bytes[lt..]).map(|i| lt + i) {
let tag = &html[lt + 1..gt];
if tag.starts_with('/') {
depth -= 1;
} else if !tag.ends_with('/') {
depth += 1;
}
pos = gt + 1;
} else {
break;
}
} else {
result.push_str(&html[pos..]);
break;
}
}
result
}
fn skip_to_matching_close(html: &str, start: usize, tag_name: &str) -> usize {
let bytes = html.as_bytes();
let mut pos = start;
let mut depth: u32 = 1;
while pos < bytes.len() && depth > 0 {
let Some(lt) = memchr(b'<', &bytes[pos..]).map(|i| pos + i) else {
break;
};
let Some(gt) = memchr(b'>', &bytes[lt..]).map(|i| lt + i) else {
break;
};
let tag = &html[lt + 1..gt];
if let Some(stripped) = tag.strip_prefix('/') {
let name = stripped.split_whitespace().next().unwrap_or("");
if name.eq_ignore_ascii_case(tag_name) {
depth -= 1;
}
} else if !tag.ends_with('/') {
let name = tag.split_whitespace().next().unwrap_or("");
if name.eq_ignore_ascii_case(tag_name) {
depth += 1;
}
}
pos = gt + 1;
}
pos
}
fn decode_html_entities(text: &str) -> String {
if !text.contains('&') {
return text.to_string();
}
text.replace("&", "&")
.replace("<", "<")
.replace(">", ">")
.replace(""", "\"")
.replace("'", "'")
.replace("'", "'")
.replace("'", "'")
.replace(" ", " ")
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_empty_hocr() {
let doc = parse_hocr_to_internal_document("");
assert!(doc.elements.is_empty());
}
#[test]
fn test_single_page_single_paragraph() {
let hocr = r#"<div class="ocr_page" title="bbox 0 0 1000 1500; ppageno 0">
<p class="ocr_par" title="bbox 100 100 900 200">
<span class="ocr_line" title="bbox 100 100 900 150">
<span class="ocrx_word" title="bbox 100 100 200 140; x_wconf 95">Hello</span>
<span class="ocrx_word" title="bbox 210 100 350 140; x_wconf 90">World</span>
</span>
</p>
</div>"#;
let doc = parse_hocr_to_internal_document(hocr);
let elements = doc.elements;
assert_eq!(elements.len(), 1);
let elem = &elements[0];
assert_eq!(elem.text, "Hello World");
assert_eq!(elem.page, Some(1));
let bbox = elem.bbox.as_ref().unwrap();
assert_eq!(bbox.x0, 100.0);
assert_eq!(bbox.y0, 100.0);
assert_eq!(bbox.x1, 350.0);
assert_eq!(bbox.y1, 140.0);
let conf = elem.ocr_confidence.as_ref().unwrap();
assert!((conf.recognition - 0.925).abs() < 0.01);
}
#[test]
fn test_multi_line_paragraph() {
let hocr = r#"<div class="ocr_page" title="ppageno 0">
<p class="ocr_par">
<span class="ocr_line" title="bbox 10 10 200 30">
<span class="ocrx_word" title="bbox 10 10 50 30">Line</span>
<span class="ocrx_word" title="bbox 60 10 100 30">one</span>
</span>
<span class="ocr_line" title="bbox 10 40 200 60">
<span class="ocrx_word" title="bbox 10 40 50 60">Line</span>
<span class="ocrx_word" title="bbox 60 40 100 60">two</span>
</span>
</p>
</div>"#;
let doc = parse_hocr_to_internal_document(hocr);
let elements = doc.elements;
assert_eq!(elements.len(), 1);
assert_eq!(elements[0].text, "Line one\nLine two");
}
#[test]
fn test_multi_page_inserts_page_breaks() {
let hocr = r#"
<div class="ocr_page" title="ppageno 0">
<p class="ocr_par">
<span class="ocrx_word" title="bbox 10 10 50 30">Page1</span>
</p>
</div>
<div class="ocr_page" title="ppageno 1">
<p class="ocr_par">
<span class="ocrx_word" title="bbox 10 10 50 30">Page2</span>
</p>
</div>"#;
let doc = parse_hocr_to_internal_document(hocr);
let elements = doc.elements;
assert_eq!(elements.len(), 3);
assert!(matches!(elements[0].kind, ElementKind::OcrText { .. }));
assert!(matches!(elements[1].kind, ElementKind::PageBreak));
assert!(matches!(elements[2].kind, ElementKind::OcrText { .. }));
assert_eq!(elements[0].page, Some(1));
assert_eq!(elements[2].page, Some(2));
}
#[test]
fn test_html_entity_decoding() {
let hocr = r#"<div class="ocr_page" title="ppageno 0">
<p class="ocr_par">
<span class="ocrx_word" title="bbox 10 10 50 30">&foo<bar></span>
</p>
</div>"#;
let doc = parse_hocr_to_internal_document(hocr);
assert_eq!(doc.elements[0].text, "&foo<bar>");
}
#[test]
fn test_words_without_bbox_still_included() {
let hocr = r#"<div class="ocr_page" title="ppageno 0">
<p class="ocr_par">
<span class="ocrx_word">NoBbox</span>
</p>
</div>"#;
let doc = parse_hocr_to_internal_document(hocr);
assert_eq!(doc.elements.len(), 1);
assert_eq!(doc.elements[0].text, "NoBbox");
assert!(doc.elements[0].bbox.is_none());
}
#[test]
fn test_nested_formatting_tags() {
let hocr = r#"<div class="ocr_page" title="ppageno 0">
<p class="ocr_par">
<span class="ocrx_word" title="bbox 10 10 50 30"><strong>Bold</strong></span>
<span class="ocrx_word" title="bbox 60 10 100 30"><em>Italic</em></span>
</p>
</div>"#;
let doc = parse_hocr_to_internal_document(hocr);
assert_eq!(doc.elements[0].text, "Bold Italic");
}
#[test]
fn test_property_parsing() {
let props = parse_title_properties("bbox 100 50 200 150; x_wconf 95.5; ppageno 3; textangle 7.2");
assert_eq!(props.bbox, Some((100, 50, 200, 150)));
assert_eq!(props.x_wconf, Some(95.5));
assert_eq!(props.ppageno, Some(3));
assert_eq!(props.textangle, Some(7.2));
}
#[test]
fn test_baseline_parsing() {
let props = parse_title_properties("baseline 0.015 -18");
assert_eq!(props.baseline, Some((0.015, -18)));
}
#[test]
fn test_font_parsing() {
let props = parse_title_properties("x_font \"Comic Sans MS\"; x_fsize 12");
assert_eq!(props.x_font, Some("Comic Sans MS".to_string()));
assert_eq!(props.x_fsize, Some(12));
}
#[test]
fn test_has_class() {
assert!(has_class(
r#"div class="ocr_page" title="bbox 0 0 100 100""#,
"ocr_page"
));
assert!(!has_class(r#"div class="ocr_page""#, "ocr_par"));
assert!(has_class(r#"span class="ocrx_word ocr_line""#, "ocrx_word"));
assert!(has_class(r#"span class="ocrx_word ocr_line""#, "ocr_line"));
}
#[test]
fn test_extract_title_attr() {
let title = extract_title_attr(r#"div class="ocr_page" title="bbox 0 0 100 200; ppageno 0""#);
assert_eq!(title, "bbox 0 0 100 200; ppageno 0");
}
#[test]
fn test_ocr_geometry_set() {
let hocr = r#"<div class="ocr_page" title="ppageno 0">
<p class="ocr_par">
<span class="ocrx_word" title="bbox 50 60 150 100; x_wconf 88">test</span>
</p>
</div>"#;
let doc = parse_hocr_to_internal_document(hocr);
let elem = &doc.elements[0];
let geom = elem.ocr_geometry.as_ref().unwrap();
match geom {
OcrBoundingGeometry::Rectangle {
left,
top,
width,
height,
} => {
assert_eq!(left, &50);
assert_eq!(top, &60);
assert_eq!(width, &100);
assert_eq!(height, &40);
}
_ => panic!("Expected Rectangle geometry"),
}
}
#[test]
fn test_english_pdf_real_data() {
let hocr = include_str!("../../test_data/hocr/english_pdf_default.hocr");
let doc = parse_hocr_to_internal_document(hocr);
assert!(
!doc.elements.is_empty(),
"Should extract elements from English PDF hOCR"
);
let total_text: String = doc
.elements
.iter()
.map(|e| e.text.as_str())
.collect::<Vec<_>>()
.join(" ");
assert!(!total_text.trim().is_empty(), "Should have non-empty text");
let has_pages = doc.elements.iter().any(|e| e.page.is_some());
assert!(has_pages, "Should have page numbers");
}
#[test]
fn test_german_pdf_real_data() {
let hocr = include_str!("../../test_data/hocr/german_pdf_default.hocr");
let doc = parse_hocr_to_internal_document(hocr);
assert!(!doc.elements.is_empty(), "Should extract elements from German PDF hOCR");
let total_text: String = doc
.elements
.iter()
.map(|e| e.text.as_str())
.collect::<Vec<_>>()
.join(" ");
assert!(!total_text.trim().is_empty(), "Should have non-empty German text");
}
#[test]
fn test_invoice_image_real_data() {
let hocr = include_str!("../../test_data/hocr/invoice_image_default.hocr");
let doc = parse_hocr_to_internal_document(hocr);
assert!(!doc.elements.is_empty(), "Should extract elements from invoice hOCR");
let total_text: String = doc
.elements
.iter()
.map(|e| e.text.as_str())
.collect::<Vec<_>>()
.join(" ");
assert!(
total_text.chars().any(|c| c.is_ascii_digit()),
"Invoice should contain numbers"
);
}
#[test]
fn test_word_confidence_real_data() {
let hocr = include_str!("../../test_data/hocr/word_confidence.hocr");
let doc = parse_hocr_to_internal_document(hocr);
assert!(
doc.elements.is_empty(),
"Non-hOCR-classed elements should not be extracted"
);
}
#[test]
fn test_utf8_encoding_real_data() {
let hocr = include_str!("../../test_data/hocr/utf8_encoding.hocr");
let doc = parse_hocr_to_internal_document(hocr);
assert!(
doc.elements.is_empty(),
"Non-hOCR-classed UTF-8 content should not be extracted"
);
}
#[test]
fn test_v4_with_tables_and_code() {
let hocr = include_str!("../../test_data/hocr/v4_code_formula.hocr");
let doc = parse_hocr_to_internal_document(hocr);
assert!(
!doc.elements.is_empty(),
"Should extract from v4 hOCR with code/formula"
);
}
#[test]
fn test_v4_embedded_tables() {
let hocr = include_str!("../../test_data/hocr/v4_embedded_tables.hocr");
let doc = parse_hocr_to_internal_document(hocr);
assert!(
!doc.elements.is_empty(),
"Should extract from v4 hOCR with embedded tables"
);
}
}