Skip to main content

djvu_rs/
ocr_export.rs

1//! hOCR and ALTO XML export for the DjVu text layer.
2//!
3//! Converts the structured [`TextLayer`] / [`TextZone`] hierarchy into two
4//! widely-used OCR interchange formats:
5//!
6//! - **hOCR** — HTML micro-format used by Tesseract, Google Books, Internet Archive.
7//! - **ALTO XML** — XML schema maintained under the Library of Congress, used by national libraries (LoC, Europeana, BnF).
8//!
9//! ## Key public types
10//!
11//! - [`HocrOptions`] — options for hOCR output (page selection, DPI scale)
12//! - [`AltoOptions`] — options for ALTO output (page selection, DPI scale)
13//! - [`to_hocr`] — generate hOCR HTML string for a document
14//! - [`to_alto`] — generate ALTO XML string for a document
15//! - [`OcrExportError`] — typed errors from this module
16
17use std::fmt::Write as FmtWrite;
18
19use crate::djvu_document::DjVuDocument;
20use crate::text::{TextLayer, TextZone, TextZoneKind};
21
22// ---- Error ------------------------------------------------------------------
23
24/// Errors from OCR export.
25#[derive(Debug, thiserror::Error)]
26pub enum OcrExportError {
27    /// Accessing a page failed.
28    #[error("document error: {0}")]
29    Doc(#[from] crate::djvu_document::DocError),
30
31    /// Text layer extraction failed.
32    #[error("text layer error: {0}")]
33    Text(#[from] crate::text::TextError),
34
35    /// String formatting error (infallible in practice).
36    #[error("format error: {0}")]
37    Fmt(#[from] std::fmt::Error),
38}
39
40// ---- Options ----------------------------------------------------------------
41
/// Settings that control hOCR generation.
///
/// The `Default` value selects every page and leaves `dpi` unset.
#[derive(Clone, Debug, Default)]
pub struct HocrOptions {
    /// Restrict output to a single page, identified by its 0-based index.
    /// `None` (the default) exports all pages.
    pub page_index: Option<usize>,
    /// Placeholder for future DPI-based coordinate scaling. Not used yet:
    /// coordinates are always emitted in native page pixels.
    pub dpi: Option<u32>,
}
51
/// Settings that control ALTO XML generation.
///
/// The `Default` value selects every page and leaves `dpi` unset.
#[derive(Clone, Debug, Default)]
pub struct AltoOptions {
    /// Restrict output to a single page, identified by its 0-based index.
    /// `None` (the default) exports all pages.
    pub page_index: Option<usize>,
    /// Placeholder for future DPI-based coordinate scaling. Not used yet:
    /// coordinates are always emitted in native page pixels.
    pub dpi: Option<u32>,
}
61
62// ---- Public API -------------------------------------------------------------
63
64/// Generate hOCR HTML for the text layer of a [`DjVuDocument`].
65///
66/// Returns the complete HTML document as a `String`. Pages without a text
67/// layer produce an empty `ocr_page` div (with correct dimensions) so that
68/// the page count in the output always matches the document.
69///
70/// # Errors
71///
72/// Returns [`OcrExportError`] if a page cannot be accessed or its text layer
73/// cannot be decoded.
74pub fn to_hocr(doc: &DjVuDocument, opts: &HocrOptions) -> Result<String, OcrExportError> {
75    let mut out = String::with_capacity(4096);
76
77    writeln!(out, "<!DOCTYPE html>")?;
78    writeln!(out, r#"<html xmlns="http://www.w3.org/1999/xhtml">"#)?;
79    writeln!(out, "<head>")?;
80    writeln!(out, r#"  <meta charset="utf-8"/>"#)?;
81    writeln!(out, r#"  <meta name="ocr-system" content="djvu-rs"/>"#)?;
82    writeln!(
83        out,
84        r#"  <meta name="ocr-capabilities" content="ocr_page ocr_block ocr_par ocr_line ocrx_word"/>"#
85    )?;
86    writeln!(out, "</head>")?;
87    writeln!(out, "<body>")?;
88
89    let page_range: Box<dyn Iterator<Item = usize>> = match opts.page_index {
90        Some(i) => Box::new(std::iter::once(i)),
91        None => Box::new(0..doc.page_count()),
92    };
93
94    for page_idx in page_range {
95        let page = doc.page(page_idx)?;
96        let pw = page.width() as u32;
97        let ph = page.height() as u32;
98
99        // bbox for the full page
100        write!(
101            out,
102            r#"  <div class="ocr_page" id="page_{idx}" title="image page_{idx}.djvu; bbox 0 0 {w} {h}; ppageno {idx}">"#,
103            idx = page_idx,
104            w = pw,
105            h = ph,
106        )?;
107        writeln!(out)?;
108
109        if let Some(layer) = page.text_layer()? {
110            write_hocr_zones(&mut out, &layer, page_idx)?;
111        }
112
113        writeln!(out, "  </div>")?;
114    }
115
116    writeln!(out, "</body>")?;
117    writeln!(out, "</html>")?;
118
119    Ok(out)
120}
121
122/// Generate ALTO XML for the text layer of a [`DjVuDocument`].
123///
124/// Returns a complete ALTO 4.x XML document as a `String`.
125///
126/// # Errors
127///
128/// Returns [`OcrExportError`] if a page cannot be accessed or its text layer
129/// cannot be decoded.
130pub fn to_alto(doc: &DjVuDocument, opts: &AltoOptions) -> Result<String, OcrExportError> {
131    let mut out = String::with_capacity(4096);
132
133    writeln!(out, r#"<?xml version="1.0" encoding="UTF-8"?>"#)?;
134    writeln!(
135        out,
136        r#"<alto xmlns="http://www.loc.gov/standards/alto/ns-v4#""#
137    )?;
138    writeln!(
139        out,
140        r#"      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance""#
141    )?;
142    writeln!(
143        out,
144        r#"      xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v4# https://www.loc.gov/standards/alto/v4/alto.xsd">"#
145    )?;
146    writeln!(out, "  <Description>")?;
147    writeln!(out, "    <MeasurementUnit>pixel</MeasurementUnit>")?;
148    writeln!(out, "    <sourceImageInformation>")?;
149    writeln!(out, "      <fileName>document.djvu</fileName>")?;
150    writeln!(out, "    </sourceImageInformation>")?;
151    writeln!(out, "  </Description>")?;
152    writeln!(out, "  <Layout>")?;
153
154    let page_range: Box<dyn Iterator<Item = usize>> = match opts.page_index {
155        Some(i) => Box::new(std::iter::once(i)),
156        None => Box::new(0..doc.page_count()),
157    };
158
159    for page_idx in page_range {
160        let page = doc.page(page_idx)?;
161        let pw = page.width() as u32;
162        let ph = page.height() as u32;
163
164        writeln!(
165            out,
166            r#"    <Page ID="page_{idx}" WIDTH="{w}" HEIGHT="{h}" PHYSICAL_IMG_NR="{idx}">"#,
167            idx = page_idx,
168            w = pw,
169            h = ph,
170        )?;
171        writeln!(
172            out,
173            "      <PrintSpace WIDTH=\"{w}\" HEIGHT=\"{h}\" HPOS=\"0\" VPOS=\"0\">",
174            w = pw,
175            h = ph
176        )?;
177
178        if let Some(layer) = page.text_layer()? {
179            write_alto_zones(&mut out, &layer, page_idx)?;
180        }
181
182        writeln!(out, "      </PrintSpace>")?;
183        writeln!(out, "    </Page>")?;
184    }
185
186    writeln!(out, "  </Layout>")?;
187    writeln!(out, "</alto>")?;
188
189    Ok(out)
190}
191
192// ---- hOCR helpers -----------------------------------------------------------
193
194fn write_hocr_zones(
195    out: &mut String,
196    layer: &TextLayer,
197    page_idx: usize,
198) -> Result<(), OcrExportError> {
199    let mut block_id = 0usize;
200    let mut line_id = 0usize;
201    let mut word_id = 0usize;
202
203    for zone in &layer.zones {
204        write_hocr_zone(
205            out,
206            zone,
207            page_idx,
208            &mut block_id,
209            &mut line_id,
210            &mut word_id,
211            3,
212        )?;
213    }
214    Ok(())
215}
216
217fn write_hocr_zone(
218    out: &mut String,
219    zone: &TextZone,
220    page_idx: usize,
221    block_id: &mut usize,
222    line_id: &mut usize,
223    word_id: &mut usize,
224    indent: usize,
225) -> Result<(), OcrExportError> {
226    let pad = " ".repeat(indent);
227    let r = &zone.rect;
228    let bbox = format!("bbox {} {} {} {}", r.x, r.y, r.x + r.width, r.y + r.height);
229
230    match zone.kind {
231        TextZoneKind::Page => {
232            // Page zone is handled by the caller
233            for child in &zone.children {
234                write_hocr_zone(out, child, page_idx, block_id, line_id, word_id, indent)?;
235            }
236        }
237        TextZoneKind::Column | TextZoneKind::Region => {
238            let id = *block_id;
239            *block_id += 1;
240            writeln!(
241                out,
242                r#"{pad}<div class="ocr_block" id="block_{page}_{id}" title="{bbox}">"#,
243                page = page_idx
244            )?;
245            for child in &zone.children {
246                write_hocr_zone(out, child, page_idx, block_id, line_id, word_id, indent + 2)?;
247            }
248            writeln!(out, "{pad}</div>")?;
249        }
250        TextZoneKind::Para => {
251            let id = *block_id;
252            *block_id += 1;
253            writeln!(
254                out,
255                r#"{pad}<p class="ocr_par" id="par_{page}_{id}" title="{bbox}">"#,
256                page = page_idx
257            )?;
258            for child in &zone.children {
259                write_hocr_zone(out, child, page_idx, block_id, line_id, word_id, indent + 2)?;
260            }
261            writeln!(out, "{pad}</p>")?;
262        }
263        TextZoneKind::Line => {
264            let id = *line_id;
265            *line_id += 1;
266            writeln!(
267                out,
268                r#"{pad}<span class="ocr_line" id="line_{page}_{id}" title="{bbox}">"#,
269                page = page_idx
270            )?;
271            for child in &zone.children {
272                write_hocr_zone(out, child, page_idx, block_id, line_id, word_id, indent + 2)?;
273            }
274            writeln!(out, "{pad}</span>")?;
275        }
276        TextZoneKind::Word => {
277            let id = *word_id;
278            *word_id += 1;
279            let text = escape_html(&zone.text);
280            writeln!(
281                out,
282                r#"{pad}<span class="ocrx_word" id="word_{page}_{id}" title="{bbox}">{text}</span>"#,
283                page = page_idx
284            )?;
285            // Words may have character children — skip sub-word nesting in hOCR
286        }
287        TextZoneKind::Character => {
288            // Characters are not a standard hOCR class; skip.
289        }
290    }
291    Ok(())
292}
293
/// Escape `s` for inclusion in HTML text or a quoted attribute value.
///
/// Replaces `&`, `<`, `>`, `"` and `'` with `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&#39;` respectively; every other character is copied
/// through unchanged.
fn escape_html(s: &str) -> String {
    // The output is at least as long as the input, so reserve that up front
    // and append in place rather than allocating per character.
    let mut escaped = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
306
307// ---- ALTO helpers -----------------------------------------------------------
308
309fn write_alto_zones(
310    out: &mut String,
311    layer: &TextLayer,
312    page_idx: usize,
313) -> Result<(), OcrExportError> {
314    let mut block_id = 0usize;
315    let mut line_id = 0usize;
316    let mut word_id = 0usize;
317
318    for zone in &layer.zones {
319        write_alto_zone(
320            out,
321            zone,
322            page_idx,
323            &mut block_id,
324            &mut line_id,
325            &mut word_id,
326            4,
327        )?;
328    }
329    Ok(())
330}
331
332fn write_alto_zone(
333    out: &mut String,
334    zone: &TextZone,
335    page_idx: usize,
336    block_id: &mut usize,
337    line_id: &mut usize,
338    word_id: &mut usize,
339    indent: usize,
340) -> Result<(), OcrExportError> {
341    let pad = " ".repeat(indent);
342    let r = &zone.rect;
343
344    match zone.kind {
345        TextZoneKind::Page => {
346            for child in &zone.children {
347                write_alto_zone(out, child, page_idx, block_id, line_id, word_id, indent)?;
348            }
349        }
350        TextZoneKind::Column | TextZoneKind::Region | TextZoneKind::Para => {
351            let id = *block_id;
352            *block_id += 1;
353            writeln!(
354                out,
355                r#"{pad}<TextBlock ID="block_{page}_{id}" HPOS="{hpos}" VPOS="{vpos}" WIDTH="{w}" HEIGHT="{h}">"#,
356                page = page_idx,
357                hpos = r.x,
358                vpos = r.y,
359                w = r.width,
360                h = r.height,
361            )?;
362            for child in &zone.children {
363                write_alto_zone(out, child, page_idx, block_id, line_id, word_id, indent + 2)?;
364            }
365            writeln!(out, "{pad}</TextBlock>")?;
366        }
367        TextZoneKind::Line => {
368            let id = *line_id;
369            *line_id += 1;
370            writeln!(
371                out,
372                r#"{pad}<TextLine ID="line_{page}_{id}" HPOS="{hpos}" VPOS="{vpos}" WIDTH="{w}" HEIGHT="{h}">"#,
373                page = page_idx,
374                hpos = r.x,
375                vpos = r.y,
376                w = r.width,
377                h = r.height,
378            )?;
379            for child in &zone.children {
380                write_alto_zone(out, child, page_idx, block_id, line_id, word_id, indent + 2)?;
381            }
382            writeln!(out, "{pad}</TextLine>")?;
383        }
384        TextZoneKind::Word => {
385            let id = *word_id;
386            *word_id += 1;
387            let text = escape_xml(&zone.text);
388            writeln!(
389                out,
390                r#"{pad}<String ID="word_{page}_{id}" HPOS="{hpos}" VPOS="{vpos}" WIDTH="{w}" HEIGHT="{h}" CONTENT="{text}"/>"#,
391                page = page_idx,
392                hpos = r.x,
393                vpos = r.y,
394                w = r.width,
395                h = r.height,
396            )?;
397        }
398        TextZoneKind::Character => {
399            // Glyph-level elements not included in the basic ALTO export.
400        }
401    }
402    Ok(())
403}
404
/// Escape `s` so it can be embedded in an XML 1.0 attribute value or text.
///
/// - `&`, `<`, `>`, `"` and `'` become `&amp;`, `&lt;`, `&gt;`, `&quot;` and
///   `&apos;`.
/// - TAB, LF and CR become the character references `&#9;`, `&#10;` and
///   `&#13;`. Written literally inside an attribute value they would be
///   normalised to spaces by a conforming parser.
/// - Characters that XML 1.0 does not allow in a document at all (the other
///   C0 control characters, U+FFFE and U+FFFF) are dropped, since emitting
///   them would make the output not well-formed.
///
/// All other characters are copied through unchanged.
fn escape_xml(s: &str) -> String {
    // The output is usually at least as long as the input; reserve that up
    // front and append in place rather than allocating per character.
    let mut escaped = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            // Legal whitespace controls: keep them, but as references so they
            // survive attribute-value normalisation.
            '\t' => escaped.push_str("&#9;"),
            '\n' => escaped.push_str("&#10;"),
            '\r' => escaped.push_str("&#13;"),
            // Remaining code points outside the XML 1.0 `Char` production.
            '\u{0}'..='\u{1F}' | '\u{FFFE}' | '\u{FFFF}' => {}
            other => escaped.push(other),
        }
    }
    escaped
}