1use std::fmt::Write as FmtWrite;
18
19use crate::djvu_document::DjVuDocument;
20use crate::text::{TextLayer, TextZone, TextZoneKind};
21
22#[derive(Debug, thiserror::Error)]
26pub enum OcrExportError {
27 #[error("document error: {0}")]
29 Doc(#[from] crate::djvu_document::DocError),
30
31 #[error("text layer error: {0}")]
33 Text(#[from] crate::text::TextError),
34
35 #[error("format error: {0}")]
37 Fmt(#[from] std::fmt::Error),
38}
39
/// Options accepted by [`to_hocr`].
///
/// The `Default` value leaves both fields as `None`, which exports every
/// page of the document.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct HocrOptions {
    /// Zero-based index of the only page to export; `None` exports all pages
    /// in order.
    pub page_index: Option<usize>,
    /// Resolution hint, presumably in dots per inch.
    ///
    /// NOTE(review): none of the export code visible in this part of the file
    /// reads this field — confirm whether it is consumed elsewhere.
    pub dpi: Option<u32>,
}
51
/// Options accepted by [`to_alto`].
///
/// The `Default` value leaves both fields as `None`, which exports every
/// page of the document.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct AltoOptions {
    /// Zero-based index of the only page to export; `None` exports all pages
    /// in order.
    pub page_index: Option<usize>,
    /// Resolution hint, presumably in dots per inch.
    ///
    /// NOTE(review): none of the export code visible in this part of the file
    /// reads this field — confirm whether it is consumed elsewhere.
    pub dpi: Option<u32>,
}
61
62pub fn to_hocr(doc: &DjVuDocument, opts: &HocrOptions) -> Result<String, OcrExportError> {
75 let mut out = String::with_capacity(4096);
76
77 writeln!(out, "<!DOCTYPE html>")?;
78 writeln!(out, r#"<html xmlns="http://www.w3.org/1999/xhtml">"#)?;
79 writeln!(out, "<head>")?;
80 writeln!(out, r#" <meta charset="utf-8"/>"#)?;
81 writeln!(out, r#" <meta name="ocr-system" content="djvu-rs"/>"#)?;
82 writeln!(
83 out,
84 r#" <meta name="ocr-capabilities" content="ocr_page ocr_block ocr_par ocr_line ocrx_word"/>"#
85 )?;
86 writeln!(out, "</head>")?;
87 writeln!(out, "<body>")?;
88
89 let page_range: Box<dyn Iterator<Item = usize>> = match opts.page_index {
90 Some(i) => Box::new(std::iter::once(i)),
91 None => Box::new(0..doc.page_count()),
92 };
93
94 for page_idx in page_range {
95 let page = doc.page(page_idx)?;
96 let pw = page.width() as u32;
97 let ph = page.height() as u32;
98
99 write!(
101 out,
102 r#" <div class="ocr_page" id="page_{idx}" title="image page_{idx}.djvu; bbox 0 0 {w} {h}; ppageno {idx}">"#,
103 idx = page_idx,
104 w = pw,
105 h = ph,
106 )?;
107 writeln!(out)?;
108
109 if let Some(layer) = page.text_layer()? {
110 write_hocr_zones(&mut out, &layer, page_idx)?;
111 }
112
113 writeln!(out, " </div>")?;
114 }
115
116 writeln!(out, "</body>")?;
117 writeln!(out, "</html>")?;
118
119 Ok(out)
120}
121
122pub fn to_alto(doc: &DjVuDocument, opts: &AltoOptions) -> Result<String, OcrExportError> {
131 let mut out = String::with_capacity(4096);
132
133 writeln!(out, r#"<?xml version="1.0" encoding="UTF-8"?>"#)?;
134 writeln!(
135 out,
136 r#"<alto xmlns="http://www.loc.gov/standards/alto/ns-v4#""#
137 )?;
138 writeln!(
139 out,
140 r#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance""#
141 )?;
142 writeln!(
143 out,
144 r#" xsi:schemaLocation="http://www.loc.gov/standards/alto/ns-v4# https://www.loc.gov/standards/alto/v4/alto.xsd">"#
145 )?;
146 writeln!(out, " <Description>")?;
147 writeln!(out, " <MeasurementUnit>pixel</MeasurementUnit>")?;
148 writeln!(out, " <sourceImageInformation>")?;
149 writeln!(out, " <fileName>document.djvu</fileName>")?;
150 writeln!(out, " </sourceImageInformation>")?;
151 writeln!(out, " </Description>")?;
152 writeln!(out, " <Layout>")?;
153
154 let page_range: Box<dyn Iterator<Item = usize>> = match opts.page_index {
155 Some(i) => Box::new(std::iter::once(i)),
156 None => Box::new(0..doc.page_count()),
157 };
158
159 for page_idx in page_range {
160 let page = doc.page(page_idx)?;
161 let pw = page.width() as u32;
162 let ph = page.height() as u32;
163
164 writeln!(
165 out,
166 r#" <Page ID="page_{idx}" WIDTH="{w}" HEIGHT="{h}" PHYSICAL_IMG_NR="{idx}">"#,
167 idx = page_idx,
168 w = pw,
169 h = ph,
170 )?;
171 writeln!(
172 out,
173 " <PrintSpace WIDTH=\"{w}\" HEIGHT=\"{h}\" HPOS=\"0\" VPOS=\"0\">",
174 w = pw,
175 h = ph
176 )?;
177
178 if let Some(layer) = page.text_layer()? {
179 write_alto_zones(&mut out, &layer, page_idx)?;
180 }
181
182 writeln!(out, " </PrintSpace>")?;
183 writeln!(out, " </Page>")?;
184 }
185
186 writeln!(out, " </Layout>")?;
187 writeln!(out, "</alto>")?;
188
189 Ok(out)
190}
191
192fn write_hocr_zones(
195 out: &mut String,
196 layer: &TextLayer,
197 page_idx: usize,
198) -> Result<(), OcrExportError> {
199 let mut block_id = 0usize;
200 let mut line_id = 0usize;
201 let mut word_id = 0usize;
202
203 for zone in &layer.zones {
204 write_hocr_zone(
205 out,
206 zone,
207 page_idx,
208 &mut block_id,
209 &mut line_id,
210 &mut word_id,
211 3,
212 )?;
213 }
214 Ok(())
215}
216
217fn write_hocr_zone(
218 out: &mut String,
219 zone: &TextZone,
220 page_idx: usize,
221 block_id: &mut usize,
222 line_id: &mut usize,
223 word_id: &mut usize,
224 indent: usize,
225) -> Result<(), OcrExportError> {
226 let pad = " ".repeat(indent);
227 let r = &zone.rect;
228 let bbox = format!("bbox {} {} {} {}", r.x, r.y, r.x + r.width, r.y + r.height);
229
230 match zone.kind {
231 TextZoneKind::Page => {
232 for child in &zone.children {
234 write_hocr_zone(out, child, page_idx, block_id, line_id, word_id, indent)?;
235 }
236 }
237 TextZoneKind::Column | TextZoneKind::Region => {
238 let id = *block_id;
239 *block_id += 1;
240 writeln!(
241 out,
242 r#"{pad}<div class="ocr_block" id="block_{page}_{id}" title="{bbox}">"#,
243 page = page_idx
244 )?;
245 for child in &zone.children {
246 write_hocr_zone(out, child, page_idx, block_id, line_id, word_id, indent + 2)?;
247 }
248 writeln!(out, "{pad}</div>")?;
249 }
250 TextZoneKind::Para => {
251 let id = *block_id;
252 *block_id += 1;
253 writeln!(
254 out,
255 r#"{pad}<p class="ocr_par" id="par_{page}_{id}" title="{bbox}">"#,
256 page = page_idx
257 )?;
258 for child in &zone.children {
259 write_hocr_zone(out, child, page_idx, block_id, line_id, word_id, indent + 2)?;
260 }
261 writeln!(out, "{pad}</p>")?;
262 }
263 TextZoneKind::Line => {
264 let id = *line_id;
265 *line_id += 1;
266 writeln!(
267 out,
268 r#"{pad}<span class="ocr_line" id="line_{page}_{id}" title="{bbox}">"#,
269 page = page_idx
270 )?;
271 for child in &zone.children {
272 write_hocr_zone(out, child, page_idx, block_id, line_id, word_id, indent + 2)?;
273 }
274 writeln!(out, "{pad}</span>")?;
275 }
276 TextZoneKind::Word => {
277 let id = *word_id;
278 *word_id += 1;
279 let text = escape_html(&zone.text);
280 writeln!(
281 out,
282 r#"{pad}<span class="ocrx_word" id="word_{page}_{id}" title="{bbox}">{text}</span>"#,
283 page = page_idx
284 )?;
285 }
287 TextZoneKind::Character => {
288 }
290 }
291 Ok(())
292}
293
/// Escapes `s` for safe embedding in HTML/XHTML element content or a quoted
/// attribute value.
///
/// The five markup-significant characters are replaced by character
/// references (`&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`, `"` → `&quot;`,
/// `'` → `&#39;`); every other character is copied through unchanged.
fn escape_html(s: &str) -> String {
    // The result is at least as long as the input, so reserve that up front.
    let mut escaped = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            // Numeric reference: understood by every HTML version.
            '\'' => escaped.push_str("&#39;"),
            other => escaped.push(other),
        }
    }
    escaped
}
306
307fn write_alto_zones(
310 out: &mut String,
311 layer: &TextLayer,
312 page_idx: usize,
313) -> Result<(), OcrExportError> {
314 let mut block_id = 0usize;
315 let mut line_id = 0usize;
316 let mut word_id = 0usize;
317
318 for zone in &layer.zones {
319 write_alto_zone(
320 out,
321 zone,
322 page_idx,
323 &mut block_id,
324 &mut line_id,
325 &mut word_id,
326 4,
327 )?;
328 }
329 Ok(())
330}
331
332fn write_alto_zone(
333 out: &mut String,
334 zone: &TextZone,
335 page_idx: usize,
336 block_id: &mut usize,
337 line_id: &mut usize,
338 word_id: &mut usize,
339 indent: usize,
340) -> Result<(), OcrExportError> {
341 let pad = " ".repeat(indent);
342 let r = &zone.rect;
343
344 match zone.kind {
345 TextZoneKind::Page => {
346 for child in &zone.children {
347 write_alto_zone(out, child, page_idx, block_id, line_id, word_id, indent)?;
348 }
349 }
350 TextZoneKind::Column | TextZoneKind::Region | TextZoneKind::Para => {
351 let id = *block_id;
352 *block_id += 1;
353 writeln!(
354 out,
355 r#"{pad}<TextBlock ID="block_{page}_{id}" HPOS="{hpos}" VPOS="{vpos}" WIDTH="{w}" HEIGHT="{h}">"#,
356 page = page_idx,
357 hpos = r.x,
358 vpos = r.y,
359 w = r.width,
360 h = r.height,
361 )?;
362 for child in &zone.children {
363 write_alto_zone(out, child, page_idx, block_id, line_id, word_id, indent + 2)?;
364 }
365 writeln!(out, "{pad}</TextBlock>")?;
366 }
367 TextZoneKind::Line => {
368 let id = *line_id;
369 *line_id += 1;
370 writeln!(
371 out,
372 r#"{pad}<TextLine ID="line_{page}_{id}" HPOS="{hpos}" VPOS="{vpos}" WIDTH="{w}" HEIGHT="{h}">"#,
373 page = page_idx,
374 hpos = r.x,
375 vpos = r.y,
376 w = r.width,
377 h = r.height,
378 )?;
379 for child in &zone.children {
380 write_alto_zone(out, child, page_idx, block_id, line_id, word_id, indent + 2)?;
381 }
382 writeln!(out, "{pad}</TextLine>")?;
383 }
384 TextZoneKind::Word => {
385 let id = *word_id;
386 *word_id += 1;
387 let text = escape_xml(&zone.text);
388 writeln!(
389 out,
390 r#"{pad}<String ID="word_{page}_{id}" HPOS="{hpos}" VPOS="{vpos}" WIDTH="{w}" HEIGHT="{h}" CONTENT="{text}"/>"#,
391 page = page_idx,
392 hpos = r.x,
393 vpos = r.y,
394 w = r.width,
395 h = r.height,
396 )?;
397 }
398 TextZoneKind::Character => {
399 }
401 }
402 Ok(())
403}
404
/// Escapes `s` for safe embedding in XML character data or a quoted
/// attribute value.
///
/// The five characters with predefined XML entities are replaced
/// (`&` → `&amp;`, `<` → `&lt;`, `>` → `&gt;`, `"` → `&quot;`,
/// `'` → `&apos;`); every other character is copied through unchanged.
fn escape_xml(s: &str) -> String {
    // The result is at least as long as the input, so reserve that up front.
    let mut escaped = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}