use super::error::OcrError;
use crate::core::config::OutputFormat as KreuzbergOutputFormat;
use html_to_markdown_rs::{ConversionOptions, OutputFormat as LibOutputFormat, convert};
pub fn convert_hocr_to_markdown(
hocr_html: &str,
options: Option<ConversionOptions>,
output_format: Option<KreuzbergOutputFormat>,
) -> Result<String, OcrError> {
let use_default = options.is_none();
let mut opts = options.unwrap_or_default();
if use_default {
opts.hocr_spatial_tables = false;
opts.extract_metadata = false;
}
let format = output_format.unwrap_or(KreuzbergOutputFormat::Markdown);
opts.output_format = match format {
KreuzbergOutputFormat::Markdown => LibOutputFormat::Markdown,
KreuzbergOutputFormat::Djot => LibOutputFormat::Djot,
KreuzbergOutputFormat::Plain | KreuzbergOutputFormat::Html | KreuzbergOutputFormat::Structured => {
LibOutputFormat::Markdown
}
};
convert(hocr_html, Some(opts)).map_err(|e| OcrError::ProcessingFailed(format!("hOCR conversion failed: {}", e)))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_simple_hocr_conversion() {
let hocr = r#"<div class="ocr_page">
<p class="ocr_par">
<span class="ocrx_word">Hello</span>
<span class="ocrx_word">World</span>
</p>
</div>"#;
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(markdown.contains("Hello"));
assert!(markdown.contains("World"));
}
#[test]
fn test_hocr_with_formatting() {
let hocr = r#"<div class="ocr_page">
<p class="ocr_par">
<strong class="ocrx_word">Bold</strong>
<em class="ocrx_word">Italic</em>
</p>
</div>"#;
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(!markdown.is_empty());
}
#[test]
fn test_empty_hocr() {
let hocr = "";
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(markdown.is_empty() || markdown.trim().is_empty());
}
#[test]
fn test_hocr_with_headings() {
let hocr = r#"<div class="ocr_page">
<h1>Title</h1>
<p class="ocr_par">
<span class="ocrx_word">Content</span>
</p>
</div>"#;
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(!markdown.is_empty());
assert!(markdown.contains("Content"));
}
#[test]
fn test_hocr_with_multiple_paragraphs() {
let hocr = r#"<div class="ocr_page">
<p class="ocr_par">
<span class="ocrx_word">First</span>
<span class="ocrx_word">paragraph</span>
</p>
<p class="ocr_par">
<span class="ocrx_word">Second</span>
<span class="ocrx_word">paragraph</span>
</p>
</div>"#;
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(markdown.contains("First"));
assert!(markdown.contains("Second"));
}
#[test]
fn test_hocr_with_line_breaks() {
let hocr = r#"<div class="ocr_page">
<p class="ocr_par">
<span class="ocrx_line">
<span class="ocrx_word">Line</span>
<span class="ocrx_word">one</span>
</span>
<span class="ocrx_line">
<span class="ocrx_word">Line</span>
<span class="ocrx_word">two</span>
</span>
</p>
</div>"#;
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(!markdown.is_empty());
}
#[test]
fn test_hocr_whitespace_handling() {
let hocr = r#"<div class="ocr_page">
<p class="ocr_par">
<span class="ocrx_word"> Padded </span>
<span class="ocrx_word"> Text </span>
</p>
</div>"#;
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(!markdown.is_empty());
}
#[test]
fn test_hocr_special_characters() {
let hocr = r#"<div class="ocr_page">
<p class="ocr_par">
<span class="ocrx_word"><special></span>
<span class="ocrx_word">&chars&</span>
</p>
</div>"#;
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(!markdown.is_empty());
}
#[test]
fn test_hocr_nested_structure() {
let hocr = r#"<div class="ocr_page">
<div class="ocr_carea">
<p class="ocr_par">
<span class="ocr_line">
<span class="ocrx_word">Nested</span>
</span>
</p>
</div>
</div>"#;
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(markdown.contains("Nested"));
}
#[test]
fn test_hocr_malformed_html() {
let hocr = r#"<div class="ocr_page">
<p class="ocr_par">
<span class="ocrx_word">Unclosed
</div>"#;
let result = convert_hocr_to_markdown(hocr, None, None);
assert!(result.is_ok());
}
#[test]
fn test_hocr_no_ocr_classes() {
let hocr = r#"<div>
<p>
<span>Regular HTML</span>
</p>
</div>"#;
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(!markdown.is_empty());
}
#[test]
fn test_hocr_mixed_content() {
let hocr = r#"<div class="ocr_page">
<h1>Heading</h1>
<p class="ocr_par">
<span class="ocrx_word">Paragraph</span>
</p>
<ul>
<li>List item</li>
</ul>
</div>"#;
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(markdown.contains("Heading") || markdown.contains("heading") || !markdown.is_empty());
}
#[test]
fn test_hocr_unicode_content() {
let hocr = r#"<div class="ocr_page">
<p class="ocr_par">
<span class="ocrx_word">Ñoño</span>
<span class="ocrx_word">日本語</span>
<span class="ocrx_word">العربية</span>
</p>
</div>"#;
let markdown = convert_hocr_to_markdown(hocr, None, None).unwrap();
assert!(markdown.contains("Ñoño") || !markdown.is_empty());
}
#[test]
fn test_hocr_large_document() {
use std::fmt::Write;
let mut hocr = String::from(r#"<div class="ocr_page">"#);
for i in 0..100 {
let _ = write!(
hocr,
r#"<p class="ocr_par"><span class="ocrx_word">Word{}</span></p>"#,
i
);
}
hocr.push_str("</div>");
let result = convert_hocr_to_markdown(&hocr, None, None);
assert!(result.is_ok());
let markdown = result.unwrap();
assert!(!markdown.is_empty());
}
#[test]
fn test_hocr_to_djot_conversion() {
let hocr = r#"<div class="ocr_page">
<p class="ocr_par">
<span class="ocrx_word">Hello</span>
<span class="ocrx_word">World</span>
</p>
</div>"#;
let result = convert_hocr_to_markdown(hocr, None, Some(KreuzbergOutputFormat::Djot)).unwrap();
assert!(result.contains("Hello"));
assert!(result.contains("World"));
}
#[test]
fn test_hocr_to_djot_with_formatting() {
let hocr = r#"<div class="ocr_page">
<p class="ocr_par">
<strong class="ocrx_word">Bold</strong>
<em class="ocrx_word">Italic</em>
</p>
</div>"#;
let result = convert_hocr_to_markdown(hocr, None, Some(KreuzbergOutputFormat::Djot)).unwrap();
assert!(!result.is_empty());
}
}