html_to_markdown_rs/hocr/
mod.rs

1//! hOCR 1.2 document processing.
2//!
3//! Complete hOCR 1.2 specification support for extracting structured content from OCR documents.
4//!
5//! ## Features
6//!
7//! - **Full Element Support**: All 40+ hOCR 1.2 element types
8//! - **Complete Property Parsing**: All 20+ hOCR properties (bbox, baseline, fonts, etc.)
9//! - **Document Structure**: Logical hierarchy (paragraphs, sections, chapters)
10//! - **Spatial Table Reconstruction**: Automatic table detection from bbox coordinates
11//! - **Metadata Extraction**: OCR system info, capabilities, languages
12//!
13//! ## Modules
14//!
15//! - [`types`]: Core hOCR element and property types
16//! - [`parser`]: Property parsing from title attributes
17//! - [`extractor`]: DOM to hOCR element tree extraction
18//! - [`converter`]: hOCR to Markdown conversion
19//! - [`spatial`]: Spatial table reconstruction from bounding boxes
20
// Submodules, declared in alphabetical order. Each is `pub`, so its items
// are also reachable through the full `<this module>::<submodule>::` path.
21pub mod converter;
22pub mod extractor;
23pub mod parser;
24pub mod spatial;
25pub mod types;
26
// Re-exports: expose selected submodule items at this module's level so
// callers can name them without spelling out the defining submodule.
// Nothing from `parser` is re-exported here; it is only reachable via
// its own module path.
27pub use converter::convert_to_markdown;
28pub use extractor::extract_hocr_document;
// From `spatial`: three functions plus the `HocrWord` type.
29pub use spatial::{extract_hocr_words, reconstruct_table, table_to_markdown, HocrWord};
// From `types`: the element/property types re-exported as a group.
30pub use types::{BBox, Baseline, HocrElement, HocrElementType, HocrMetadata, HocrProperties};