1use crate::engine::{text_document_from_text, ExtractionEngine};
2use crate::error::Result;
3use crate::ir::{
4 BBox, Block, Confidence, Document, Line, Metadata, Page, SourceAnchor, Span, TextBlock,
5 SCHEMA_VERSION,
6};
7use crate::source::Source;
8
/// Extraction engine for HTML sources; reports itself as `html-native`.
///
/// hOCR markup is turned into positioned OCR blocks, anything else is flattened
/// to plain text.
#[derive(Clone, Copy, Debug, Default)]
pub struct HtmlEngine;
11
/// Extraction engine for e-mail sources; reports itself as `email-native`.
///
/// The extracted text is the subject followed by the trimmed body.
#[derive(Clone, Copy, Debug, Default)]
pub struct EmailEngine;
14
/// Extraction engine for XML sources; reports itself as `xml-native`.
///
/// PAGE XML, ALTO and Pascal VOC layouts are tried in that order before falling
/// back to plain text.
#[derive(Clone, Copy, Debug, Default)]
pub struct XmlEngine;
17
18impl ExtractionEngine for HtmlEngine {
19 fn name(&self) -> &'static str {
20 "html-native"
21 }
22
23 fn extract(&self, source: &Source) -> Result<Document> {
24 if let Some(document) = hocr_document(source, self.name()) {
25 return Ok(document);
26 }
27 text_document_from_text(source, self.name(), &html_to_text(&source.content), None)
28 }
29}
30
31impl ExtractionEngine for EmailEngine {
32 fn name(&self) -> &'static str {
33 "email-native"
34 }
35
36 fn extract(&self, source: &Source) -> Result<Document> {
37 let email = parse_email(&source.content);
38 let text = match (&email.subject, email.body.trim()) {
39 (Some(subject), body) if !body.is_empty() => format!("{subject}\n\n{body}"),
40 (Some(subject), _) => subject.clone(),
41 (None, body) => body.to_owned(),
42 };
43 text_document_from_text(source, self.name(), &text, email.subject)
44 }
45}
46
47impl ExtractionEngine for XmlEngine {
48 fn name(&self) -> &'static str {
49 "xml-native"
50 }
51
52 fn extract(&self, source: &Source) -> Result<Document> {
53 if let Some(document) = page_xml_document(source, self.name()) {
54 return Ok(document);
55 }
56 if let Some(document) = alto_document(source, self.name()) {
57 return Ok(document);
58 }
59 if let Some(document) = pascal_voc_document(source, self.name()) {
60 return Ok(document);
61 }
62 text_document_from_text(source, self.name(), &html_to_text(&source.content), None)
63 }
64}
65
/// Pieces of an e-mail that the e-mail engine turns into text.
#[derive(Default, Debug)]
struct EmailParts {
    /// Subject line, when one is available.
    subject: Option<String>,
    /// Message body; the engine trims it before use.
    body: String,
}
71
72#[derive(Debug)]
73struct PascalVocObject {
74 name: String,
75 bbox: BBox,
76}
77
/// Borrowed view of one element found by the lightweight tag scanners.
#[derive(Clone, Copy, Debug)]
struct XmlElement<'a> {
    /// The opening tag, from `<` through `>` inclusive.
    start_tag: &'a str,
    /// Everything between the opening tag and its closing tag.
    content: &'a str,
}
83
84#[derive(Debug, Clone)]
85struct AltoWord {
86 text: String,
87 bbox: Option<BBox>,
88 confidence: Option<f32>,
89}
90
91#[derive(Debug, Clone)]
92struct HocrWord {
93 text: String,
94 bbox: Option<BBox>,
95 confidence: Option<f32>,
96}
97
98#[derive(Debug, Clone)]
99struct PageXmlWord {
100 text: String,
101 bbox: Option<BBox>,
102 confidence: Option<f32>,
103}
104
105fn hocr_document(source: &Source, engine_name: &str) -> Option<Document> {
106 if !source.content.contains("ocr_page")
107 && !source.content.contains("ocr_line")
108 && !source.content.contains("ocrx_word")
109 {
110 return None;
111 }
112
113 let page_element = hocr_elements_with_class(&source.content, "ocr_page")
114 .into_iter()
115 .next();
116 let page_content = page_element
117 .as_ref()
118 .map(|element| element.content)
119 .unwrap_or(source.content.as_str());
120 let page_bbox = page_element
121 .as_ref()
122 .and_then(|element| hocr_bbox_from_tag(element.start_tag));
123 let mut blocks = hocr_elements_with_any_class(page_content, &["ocr_line", "ocrx_line"])
124 .into_iter()
125 .filter_map(|line| hocr_line_block(line, 1))
126 .collect::<Vec<_>>();
127
128 if blocks.is_empty() {
129 blocks = hocr_elements_with_class(page_content, "ocrx_word")
130 .into_iter()
131 .filter_map(hocr_word_from_element)
132 .map(|word| hocr_word_block(word, 1))
133 .collect();
134 }
135 if blocks.is_empty() {
136 return None;
137 }
138
139 let page_bbox = page_bbox.or_else(|| inferred_block_bbox(&blocks));
140 let text = blocks
141 .iter()
142 .filter_map(|block| match block {
143 Block::Text(text) => Some(text.text.as_str()),
144 _ => None,
145 })
146 .collect::<Vec<_>>()
147 .join("\n");
148
149 Some(Document {
150 schema_version: SCHEMA_VERSION.to_owned(),
151 metadata: Metadata {
152 format: source.format.clone(),
153 engine: engine_name.to_owned(),
154 source: source.path.clone(),
155 title: None,
156 character_count: text.chars().count(),
157 word_count: text.split_whitespace().count(),
158 block_count: blocks.len(),
159 file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
160 pdf_version: None,
161 encrypted: false,
162 },
163 pages: vec![Page {
164 number: 1,
165 width: page_bbox.map(|bbox| bbox.width),
166 height: page_bbox.map(|bbox| bbox.height),
167 rotation: None,
168 bbox: page_bbox,
169 blocks,
170 images: Vec::new(),
171 assets: Vec::new(),
172 warnings: Vec::new(), ..Default::default()
173 }],
174 assets: Vec::new(),
175 warnings: Vec::new(),
176 })
177}
178
179fn hocr_line_block(line: XmlElement<'_>, page_number: usize) -> Option<Block> {
180 let words = hocr_elements_with_class(line.content, "ocrx_word")
181 .into_iter()
182 .filter_map(hocr_word_from_element)
183 .collect::<Vec<_>>();
184 if words.is_empty() {
185 let text = html_to_text(line.content);
186 if text.trim().is_empty() {
187 return None;
188 }
189 let bbox = hocr_bbox_from_tag(line.start_tag);
190 return Some(Block::Text(TextBlock {
191 text: text.split_whitespace().collect::<Vec<_>>().join(" "),
192 kind: "ocr_line".to_owned(),
193 bbox,
194 lines: Vec::new(),
195 source_anchors: vec![html_source_anchor(page_number, bbox)],
196 confidence: Some(Confidence {
197 score: 0.9,
198 calibrated: false,
199 }), ..Default::default()
200 }));
201 }
202
203 let text = words
204 .iter()
205 .map(|word| word.text.as_str())
206 .collect::<Vec<_>>()
207 .join(" ");
208 let bbox = hocr_bbox_from_tag(line.start_tag).or_else(|| inferred_hocr_word_bbox(&words));
209 let confidence = mean_confidence(words.iter().filter_map(|word| word.confidence));
210 let spans = words
211 .iter()
212 .map(|word| Span {
213 text: word.text.clone(),
214 bbox: word.bbox,
215 font: None,
216 size: None,
217 bold: false,
218 italic: false,
219 })
220 .collect::<Vec<_>>();
221
222 Some(Block::Text(TextBlock {
223 text: text.clone(),
224 kind: "ocr_line".to_owned(),
225 bbox,
226 lines: vec![Line { text, bbox, spans }],
227 source_anchors: vec![html_source_anchor(page_number, bbox)],
228 confidence: Some(Confidence {
229 score: confidence.unwrap_or(0.9),
230 calibrated: false,
231 }), ..Default::default()
232 }))
233}
234
235fn hocr_word_block(word: HocrWord, page_number: usize) -> Block {
236 Block::Text(TextBlock {
237 text: word.text.clone(),
238 kind: "ocr_word".to_owned(),
239 bbox: word.bbox,
240 lines: Vec::new(),
241 source_anchors: vec![html_source_anchor(page_number, word.bbox)],
242 confidence: Some(Confidence {
243 score: word.confidence.unwrap_or(0.9),
244 calibrated: false,
245 }), ..Default::default()
246 })
247}
248
249fn hocr_word_from_element(element: XmlElement<'_>) -> Option<HocrWord> {
250 let text = html_to_text(element.content)
251 .split_whitespace()
252 .collect::<Vec<_>>()
253 .join(" ");
254 if text.is_empty() {
255 return None;
256 }
257 Some(HocrWord {
258 text,
259 bbox: hocr_bbox_from_tag(element.start_tag),
260 confidence: hocr_confidence_from_tag(element.start_tag),
261 })
262}
263
264fn page_xml_document(source: &Source, engine_name: &str) -> Option<Document> {
265 if !source.content.contains("PcGts") && !source.content.contains("TextRegion") {
266 return None;
267 }
268
269 let page_element = xml_elements_by_local_name(&source.content, "Page")
270 .into_iter()
271 .next()?;
272 let width = first_xml_attr_f32(
273 page_element.start_tag,
274 &["imageWidth", "image_width", "WIDTH", "width"],
275 );
276 let height = first_xml_attr_f32(
277 page_element.start_tag,
278 &["imageHeight", "image_height", "HEIGHT", "height"],
279 );
280 let blocks = xml_elements_by_local_name(page_element.content, "TextLine")
281 .into_iter()
282 .filter_map(|line| page_xml_line_block(line, 1))
283 .collect::<Vec<_>>();
284 if blocks.is_empty() {
285 return None;
286 }
287
288 let page_bbox = page_bbox(width, height).or_else(|| inferred_block_bbox(&blocks));
289 let text = blocks
290 .iter()
291 .filter_map(|block| match block {
292 Block::Text(text) => Some(text.text.as_str()),
293 _ => None,
294 })
295 .collect::<Vec<_>>()
296 .join("\n");
297
298 Some(Document {
299 schema_version: SCHEMA_VERSION.to_owned(),
300 metadata: Metadata {
301 format: source.format.clone(),
302 engine: engine_name.to_owned(),
303 source: source.path.clone(),
304 title: None,
305 character_count: text.chars().count(),
306 word_count: text.split_whitespace().count(),
307 block_count: blocks.len(),
308 file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
309 pdf_version: None,
310 encrypted: false,
311 },
312 pages: vec![Page {
313 number: 1,
314 width: width.or_else(|| page_bbox.map(|bbox| bbox.width)),
315 height: height.or_else(|| page_bbox.map(|bbox| bbox.height)),
316 rotation: None,
317 bbox: page_bbox,
318 blocks,
319 images: Vec::new(),
320 assets: Vec::new(),
321 warnings: Vec::new(), ..Default::default()
322 }],
323 assets: Vec::new(),
324 warnings: Vec::new(),
325 })
326}
327
328fn page_xml_line_block(line: XmlElement<'_>, page_number: usize) -> Option<Block> {
329 let words = xml_elements_by_local_name(line.content, "Word")
330 .into_iter()
331 .filter_map(page_xml_word_from_element)
332 .collect::<Vec<_>>();
333 let bbox =
334 page_xml_bbox_from_content(line.content).or_else(|| inferred_page_xml_word_bbox(&words));
335
336 if words.is_empty() {
337 let text = page_xml_text_from_content(line.content)?;
338 if text.is_empty() {
339 return None;
340 }
341 return Some(Block::Text(TextBlock {
342 text,
343 kind: "ocr_line".to_owned(),
344 bbox,
345 lines: Vec::new(),
346 source_anchors: vec![xml_source_anchor(page_number, bbox)],
347 confidence: Some(Confidence {
348 score: page_xml_confidence_from_content(line.content).unwrap_or(0.9),
349 calibrated: false,
350 }), ..Default::default()
351 }));
352 }
353
354 let text = page_xml_text_from_content(line.content).unwrap_or_else(|| {
355 words
356 .iter()
357 .map(|word| word.text.as_str())
358 .collect::<Vec<_>>()
359 .join(" ")
360 });
361 let confidence = mean_confidence(words.iter().filter_map(|word| word.confidence));
362 let spans = words
363 .iter()
364 .map(|word| Span {
365 text: word.text.clone(),
366 bbox: word.bbox,
367 font: None,
368 size: None,
369 bold: false,
370 italic: false,
371 })
372 .collect::<Vec<_>>();
373
374 Some(Block::Text(TextBlock {
375 text: text.clone(),
376 kind: "ocr_line".to_owned(),
377 bbox,
378 lines: vec![Line { text, bbox, spans }],
379 source_anchors: vec![xml_source_anchor(page_number, bbox)],
380 confidence: Some(Confidence {
381 score: confidence.unwrap_or(0.9),
382 calibrated: false,
383 }), ..Default::default()
384 }))
385}
386
387fn page_xml_word_from_element(element: XmlElement<'_>) -> Option<PageXmlWord> {
388 let text = page_xml_text_from_content(element.content)?;
389 if text.is_empty() {
390 return None;
391 }
392 Some(PageXmlWord {
393 text,
394 bbox: page_xml_bbox_from_content(element.content),
395 confidence: page_xml_confidence_from_content(element.content),
396 })
397}
398
399fn alto_document(source: &Source, engine_name: &str) -> Option<Document> {
400 let page_element = xml_elements_by_local_name(&source.content, "Page")
401 .into_iter()
402 .next()?;
403 let width = xml_attr_f32(page_element.start_tag, "WIDTH");
404 let height = xml_attr_f32(page_element.start_tag, "HEIGHT");
405 let mut blocks = xml_elements_by_local_name(page_element.content, "TextLine")
406 .into_iter()
407 .filter_map(|line| alto_line_block(line, 1))
408 .collect::<Vec<_>>();
409
410 if blocks.is_empty() {
411 blocks = xml_start_tags_by_local_name(page_element.content, "String")
412 .into_iter()
413 .filter_map(|tag| alto_word_from_tag(tag))
414 .map(|word| alto_word_block(word, 1))
415 .collect();
416 }
417 if blocks.is_empty() {
418 return None;
419 }
420
421 let page_bbox = page_bbox(width, height).or_else(|| inferred_block_bbox(&blocks));
422 let text = blocks
423 .iter()
424 .filter_map(|block| match block {
425 Block::Text(text) => Some(text.text.as_str()),
426 _ => None,
427 })
428 .collect::<Vec<_>>()
429 .join("\n");
430
431 Some(Document {
432 schema_version: SCHEMA_VERSION.to_owned(),
433 metadata: Metadata {
434 format: source.format.clone(),
435 engine: engine_name.to_owned(),
436 source: source.path.clone(),
437 title: None,
438 character_count: text.chars().count(),
439 word_count: text.split_whitespace().count(),
440 block_count: blocks.len(),
441 file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
442 pdf_version: None,
443 encrypted: false,
444 },
445 pages: vec![Page {
446 number: 1,
447 width: width.or_else(|| page_bbox.map(|bbox| bbox.width)),
448 height: height.or_else(|| page_bbox.map(|bbox| bbox.height)),
449 rotation: None,
450 bbox: page_bbox,
451 blocks,
452 images: Vec::new(),
453 assets: Vec::new(),
454 warnings: Vec::new(), ..Default::default()
455 }],
456 assets: Vec::new(),
457 warnings: Vec::new(),
458 })
459}
460
461fn alto_line_block(line: XmlElement<'_>, page_number: usize) -> Option<Block> {
462 let words = xml_start_tags_by_local_name(line.content, "String")
463 .into_iter()
464 .filter_map(alto_word_from_tag)
465 .collect::<Vec<_>>();
466 if words.is_empty() {
467 return None;
468 }
469 let text = words
470 .iter()
471 .map(|word| word.text.as_str())
472 .collect::<Vec<_>>()
473 .join(" ");
474 let bbox = alto_bbox_from_tag(line.start_tag).or_else(|| inferred_word_bbox(&words));
475 let confidence = mean_confidence(words.iter().filter_map(|word| word.confidence));
476 let spans = words
477 .iter()
478 .map(|word| Span {
479 text: word.text.clone(),
480 bbox: word.bbox,
481 font: None,
482 size: None,
483 bold: false,
484 italic: false,
485 })
486 .collect::<Vec<_>>();
487
488 Some(Block::Text(TextBlock {
489 text,
490 kind: "ocr_line".to_owned(),
491 bbox,
492 lines: vec![Line {
493 text: words
494 .iter()
495 .map(|word| word.text.as_str())
496 .collect::<Vec<_>>()
497 .join(" "),
498 bbox,
499 spans,
500 }],
501 source_anchors: vec![xml_source_anchor(page_number, bbox)],
502 confidence: Some(Confidence {
503 score: confidence.unwrap_or(0.9),
504 calibrated: false,
505 }), ..Default::default()
506 }))
507}
508
509fn alto_word_block(word: AltoWord, page_number: usize) -> Block {
510 Block::Text(TextBlock {
511 text: word.text.clone(),
512 kind: "ocr_word".to_owned(),
513 bbox: word.bbox,
514 lines: Vec::new(),
515 source_anchors: vec![xml_source_anchor(page_number, word.bbox)],
516 confidence: Some(Confidence {
517 score: word.confidence.unwrap_or(0.9),
518 calibrated: false,
519 }), ..Default::default()
520 })
521}
522
523fn alto_word_from_tag(tag: &str) -> Option<AltoWord> {
524 let text = xml_attr_value(tag, "CONTENT")
525 .map(|value| html_to_text(&value))
526 .map(|text| text.split_whitespace().collect::<Vec<_>>().join(" "))?;
527 if text.is_empty() {
528 return None;
529 }
530 Some(AltoWord {
531 text,
532 bbox: alto_bbox_from_tag(tag),
533 confidence: xml_attr_f32(tag, "WC"),
534 })
535}
536
537fn alto_bbox_from_tag(tag: &str) -> Option<BBox> {
538 Some(BBox {
539 x: xml_attr_f32(tag, "HPOS")?,
540 y: xml_attr_f32(tag, "VPOS")?,
541 width: xml_attr_f32(tag, "WIDTH")?,
542 height: xml_attr_f32(tag, "HEIGHT")?,
543 })
544}
545
546fn hocr_bbox_from_tag(tag: &str) -> Option<BBox> {
547 let title = xml_attr_value(tag, "title")?;
548 let mut parts = title
549 .split(';')
550 .find_map(|part| part.trim().strip_prefix("bbox "))?
551 .split_whitespace();
552 let left = parts.next()?.parse::<f32>().ok()?;
553 let top = parts.next()?.parse::<f32>().ok()?;
554 let right = parts.next()?.parse::<f32>().ok()?;
555 let bottom = parts.next()?.parse::<f32>().ok()?;
556 Some(BBox {
557 x: left.min(right),
558 y: top.min(bottom),
559 width: (right - left).abs(),
560 height: (bottom - top).abs(),
561 })
562}
563
564fn hocr_confidence_from_tag(tag: &str) -> Option<f32> {
565 let title = xml_attr_value(tag, "title")?;
566 let value = title
567 .split(';')
568 .find_map(|part| part.trim().strip_prefix("x_wconf "))?
569 .split_whitespace()
570 .next()?
571 .parse::<f32>()
572 .ok()?;
573 Some((value / 100.0).clamp(0.0, 1.0))
574}
575
576fn page_xml_text_from_content(content: &str) -> Option<String> {
577 xml_elements_by_local_name(content, "Unicode")
578 .into_iter()
579 .last()
580 .map(|unicode| html_to_text(unicode.content))
581 .map(|text| text.split_whitespace().collect::<Vec<_>>().join(" "))
582 .filter(|text| !text.is_empty())
583}
584
585fn page_xml_confidence_from_content(content: &str) -> Option<f32> {
586 xml_elements_by_local_name(content, "TextEquiv")
587 .into_iter()
588 .last()
589 .and_then(|element| xml_attr_f32(element.start_tag, "conf"))
590}
591
592fn page_xml_bbox_from_content(content: &str) -> Option<BBox> {
593 let coords = xml_start_tags_by_local_name(content, "Coords")
594 .into_iter()
595 .next()?;
596 let points = xml_attr_value(coords, "points")?;
597 bbox_from_points(&points)
598}
599
600fn bbox_from_points(points: &str) -> Option<BBox> {
601 let mut min_x = f32::INFINITY;
602 let mut min_y = f32::INFINITY;
603 let mut max_x = f32::NEG_INFINITY;
604 let mut max_y = f32::NEG_INFINITY;
605 let mut count = 0usize;
606
607 for point in points.split_whitespace() {
608 let Some((x, y)) = point.split_once(',') else {
609 continue;
610 };
611 let x = x.parse::<f32>().ok()?;
612 let y = y.parse::<f32>().ok()?;
613 min_x = min_x.min(x);
614 min_y = min_y.min(y);
615 max_x = max_x.max(x);
616 max_y = max_y.max(y);
617 count += 1;
618 }
619
620 (count > 0).then_some(BBox {
621 x: min_x,
622 y: min_y,
623 width: max_x - min_x,
624 height: max_y - min_y,
625 })
626}
627
628fn inferred_page_xml_word_bbox(words: &[PageXmlWord]) -> Option<BBox> {
629 let mut min_x = f32::INFINITY;
630 let mut min_y = f32::INFINITY;
631 let mut max_x = f32::NEG_INFINITY;
632 let mut max_y = f32::NEG_INFINITY;
633 let mut has_bbox = false;
634 for word in words {
635 let Some(bbox) = word.bbox else {
636 continue;
637 };
638 has_bbox = true;
639 min_x = min_x.min(bbox.x);
640 min_y = min_y.min(bbox.y);
641 max_x = max_x.max(bbox.x + bbox.width);
642 max_y = max_y.max(bbox.y + bbox.height);
643 }
644 has_bbox.then_some(BBox {
645 x: min_x,
646 y: min_y,
647 width: max_x - min_x,
648 height: max_y - min_y,
649 })
650}
651
652fn inferred_hocr_word_bbox(words: &[HocrWord]) -> Option<BBox> {
653 let mut min_x = f32::INFINITY;
654 let mut min_y = f32::INFINITY;
655 let mut max_x = f32::NEG_INFINITY;
656 let mut max_y = f32::NEG_INFINITY;
657 let mut has_bbox = false;
658 for word in words {
659 let Some(bbox) = word.bbox else {
660 continue;
661 };
662 has_bbox = true;
663 min_x = min_x.min(bbox.x);
664 min_y = min_y.min(bbox.y);
665 max_x = max_x.max(bbox.x + bbox.width);
666 max_y = max_y.max(bbox.y + bbox.height);
667 }
668 has_bbox.then_some(BBox {
669 x: min_x,
670 y: min_y,
671 width: max_x - min_x,
672 height: max_y - min_y,
673 })
674}
675
676fn inferred_word_bbox(words: &[AltoWord]) -> Option<BBox> {
677 let mut min_x = f32::INFINITY;
678 let mut min_y = f32::INFINITY;
679 let mut max_x = f32::NEG_INFINITY;
680 let mut max_y = f32::NEG_INFINITY;
681 let mut has_bbox = false;
682 for word in words {
683 let Some(bbox) = word.bbox else {
684 continue;
685 };
686 has_bbox = true;
687 min_x = min_x.min(bbox.x);
688 min_y = min_y.min(bbox.y);
689 max_x = max_x.max(bbox.x + bbox.width);
690 max_y = max_y.max(bbox.y + bbox.height);
691 }
692 has_bbox.then_some(BBox {
693 x: min_x,
694 y: min_y,
695 width: max_x - min_x,
696 height: max_y - min_y,
697 })
698}
699
700fn inferred_block_bbox(blocks: &[Block]) -> Option<BBox> {
701 let mut max_x = 0.0f32;
702 let mut max_y = 0.0f32;
703 let mut has_bbox = false;
704 for block in blocks {
705 let Some(bbox) = block_bbox(block) else {
706 continue;
707 };
708 has_bbox = true;
709 max_x = max_x.max(bbox.x + bbox.width);
710 max_y = max_y.max(bbox.y + bbox.height);
711 }
712 has_bbox.then_some(BBox {
713 x: 0.0,
714 y: 0.0,
715 width: max_x,
716 height: max_y,
717 })
718}
719
720fn block_bbox(block: &Block) -> Option<BBox> {
721 match block {
722 Block::Text(text) => text.bbox,
723 Block::Table(table) => table.bbox,
724 Block::Figure(figure) => figure.bbox,
725 }
726}
727
728fn page_bbox(width: Option<f32>, height: Option<f32>) -> Option<BBox> {
729 Some(BBox {
730 x: 0.0,
731 y: 0.0,
732 width: width?,
733 height: height?,
734 })
735}
736
737fn xml_source_anchor(page_number: usize, bbox: Option<BBox>) -> SourceAnchor {
738 SourceAnchor {
739 page_number,
740 pdf_object_ids: Vec::new(),
741 bbox,
742 extraction_method: "xml_native".to_owned(),
743 }
744}
745
746fn html_source_anchor(page_number: usize, bbox: Option<BBox>) -> SourceAnchor {
747 SourceAnchor {
748 page_number,
749 pdf_object_ids: Vec::new(),
750 bbox,
751 extraction_method: "html_native".to_owned(),
752 }
753}
754
/// Returns the arithmetic mean of `values`, or `None` when there are none.
fn mean_confidence(values: impl Iterator<Item = f32>) -> Option<f32> {
    let (sum, seen) = values.fold((0.0f32, 0usize), |(sum, seen), value| {
        (sum + value, seen + 1)
    });
    if seen == 0 {
        None
    } else {
        Some(sum / seen as f32)
    }
}
764
765fn pascal_voc_document(source: &Source, engine_name: &str) -> Option<Document> {
766 let width = tag_text(&source.content, "width")?.parse::<f32>().ok()?;
767 let height = tag_text(&source.content, "height")?.parse::<f32>().ok()?;
768 let objects = pascal_voc_objects(&source.content);
769 if objects.is_empty() {
770 return None;
771 }
772
773 let blocks = objects
774 .into_iter()
775 .map(|object| {
776 Block::Text(TextBlock {
777 kind: object.name.clone(),
778 text: object.name,
779 bbox: Some(object.bbox),
780 lines: Vec::new(),
781 source_anchors: vec![SourceAnchor {
782 page_number: 1,
783 pdf_object_ids: Vec::new(),
784 bbox: Some(object.bbox),
785 extraction_method: "xml_native".to_owned(),
786 }],
787 confidence: Some(Confidence {
788 score: 0.9,
789 calibrated: false,
790 }), ..Default::default()
791 })
792 })
793 .collect::<Vec<_>>();
794 let text = blocks
795 .iter()
796 .filter_map(|block| match block {
797 Block::Text(text) => Some(text.text.as_str()),
798 _ => None,
799 })
800 .collect::<Vec<_>>()
801 .join("\n");
802
803 Some(Document {
804 schema_version: SCHEMA_VERSION.to_owned(),
805 metadata: Metadata {
806 format: source.format.clone(),
807 engine: engine_name.to_owned(),
808 source: source.path.clone(),
809 title: None,
810 character_count: text.chars().count(),
811 word_count: text.split_whitespace().count(),
812 block_count: blocks.len(),
813 file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
814 pdf_version: None,
815 encrypted: false,
816 },
817 pages: vec![Page {
818 number: 1,
819 width: Some(width),
820 height: Some(height),
821 rotation: None,
822 bbox: Some(BBox {
823 x: 0.0,
824 y: 0.0,
825 width,
826 height,
827 }),
828 blocks,
829 images: Vec::new(),
830 assets: Vec::new(),
831 warnings: Vec::new(), ..Default::default()
832 }],
833 assets: Vec::new(),
834 warnings: Vec::new(),
835 })
836}
837
838fn pascal_voc_objects(xml: &str) -> Vec<PascalVocObject> {
839 tag_ranges(xml, "object")
840 .into_iter()
841 .filter_map(|range| {
842 let object_xml = &xml[range.0..range.1];
843 let name = tag_text(object_xml, "name")?;
844 let xmin = tag_text(object_xml, "xmin")?.parse::<f32>().ok()?;
845 let ymin = tag_text(object_xml, "ymin")?.parse::<f32>().ok()?;
846 let xmax = tag_text(object_xml, "xmax")?.parse::<f32>().ok()?;
847 let ymax = tag_text(object_xml, "ymax")?.parse::<f32>().ok()?;
848 Some(PascalVocObject {
849 name,
850 bbox: BBox {
851 x: xmin.min(xmax),
852 y: ymin.min(ymax),
853 width: (xmax - xmin).abs(),
854 height: (ymax - ymin).abs(),
855 },
856 })
857 })
858 .collect()
859}
860
861fn tag_text(xml: &str, tag: &str) -> Option<String> {
862 let range = tag_ranges(xml, tag).into_iter().next()?;
863 Some(html_to_text(&xml[range.0..range.1]).trim().to_owned())
864}
865
/// Finds the content ranges of every `<tag>…</tag>` element in `xml`.
///
/// Matching is ASCII case-insensitive. Each returned pair is the byte range
/// `(content_start, content_end)` between an opening tag and the next matching
/// closing tag; nested elements of the same name are not balanced.
///
/// The opening tag must be followed by a tag-name boundary, so looking for
/// `name` does not match `<names>`. Self-closing tags (`<tag/>`) have no
/// content and are skipped instead of being paired with a later closing tag.
/// Scanning stops at the first opening tag that is unterminated or unclosed.
fn tag_ranges(xml: &str, tag: &str) -> Vec<(usize, usize)> {
    // ASCII lower-casing keeps byte offsets identical, so offsets found in
    // `lower` are valid for `xml` as well.
    let lower = xml.to_ascii_lowercase();
    let tag = tag.to_ascii_lowercase();
    let open = format!("<{tag}");
    let close = format!("</{tag}>");
    let mut ranges = Vec::new();
    let mut search_start = 0;

    while let Some(offset) = lower[search_start..].find(&open) {
        let open_start = search_start + offset;
        let after_name = open_start + open.len();
        let at_boundary = matches!(
            lower.as_bytes().get(after_name),
            Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r')
        );
        if !at_boundary {
            // Prefix of a longer tag name (e.g. `<names>` while looking for `name`).
            search_start = after_name;
            continue;
        }

        let Some(open_end_offset) = lower[after_name..].find('>') else {
            break;
        };
        let content_start = after_name + open_end_offset + 1;
        if lower[open_start..content_start].ends_with("/>") {
            // Self-closing element: nothing to report, keep scanning after it.
            search_start = content_start;
            continue;
        }

        let Some(close_offset) = lower[content_start..].find(&close) else {
            break;
        };
        let content_end = content_start + close_offset;
        ranges.push((content_start, content_end));
        search_start = content_end + close.len();
    }

    ranges
}
889
890fn xml_elements_by_local_name<'a>(xml: &'a str, local_name: &str) -> Vec<XmlElement<'a>> {
891 let mut elements = Vec::new();
892 let mut pos = 0usize;
893 while let Some(relative_start) = xml[pos..].find('<') {
894 let start = pos + relative_start;
895 let Some(relative_end) = xml[start..].find('>') else {
896 break;
897 };
898 let tag_end = start + relative_end;
899 let start_tag = &xml[start..=tag_end];
900 let Some(tag_name) = opening_tag_name(start_tag) else {
901 pos = tag_end + 1;
902 continue;
903 };
904 if tag_local_name(tag_name).eq_ignore_ascii_case(local_name)
905 && !start_tag.trim_end().ends_with("/>")
906 {
907 let close = format!("</{tag_name}>");
908 let content_start = tag_end + 1;
909 if let Some(relative_close) = xml[content_start..].find(&close) {
910 let content_end = content_start + relative_close;
911 elements.push(XmlElement {
912 start_tag,
913 content: &xml[content_start..content_end],
914 });
915 pos = content_end + close.len();
916 continue;
917 }
918 }
919 pos = tag_end + 1;
920 }
921 elements
922}
923
924fn hocr_elements_with_class<'a>(html: &'a str, class_name: &str) -> Vec<XmlElement<'a>> {
925 hocr_elements_with_any_class(html, &[class_name])
926}
927
928fn hocr_elements_with_any_class<'a>(html: &'a str, class_names: &[&str]) -> Vec<XmlElement<'a>> {
929 let mut elements = Vec::new();
930 let mut pos = 0usize;
931 while let Some(relative_start) = html[pos..].find('<') {
932 let start = pos + relative_start;
933 let Some(relative_end) = html[start..].find('>') else {
934 break;
935 };
936 let tag_end = start + relative_end;
937 let start_tag = &html[start..=tag_end];
938 let Some(tag_name) = opening_tag_name(start_tag) else {
939 pos = tag_end + 1;
940 continue;
941 };
942 if tag_has_any_class(start_tag, class_names) && !start_tag.trim_end().ends_with("/>") {
943 let content_start = tag_end + 1;
944 if let Some(content_end) = matching_element_content_end(html, tag_name, content_start) {
945 elements.push(XmlElement {
946 start_tag,
947 content: &html[content_start..content_end],
948 });
949 pos = content_end + closing_tag_len(tag_name);
950 continue;
951 }
952 }
953 pos = tag_end + 1;
954 }
955 elements
956}
957
958fn tag_has_any_class(tag: &str, class_names: &[&str]) -> bool {
959 let Some(classes) = xml_attr_value(tag, "class") else {
960 return false;
961 };
962 classes.split_whitespace().any(|class| {
963 class_names
964 .iter()
965 .any(|name| class.eq_ignore_ascii_case(name))
966 })
967}
968
969fn matching_element_content_end(
970 input: &str,
971 tag_name: &str,
972 content_start: usize,
973) -> Option<usize> {
974 let lower = input.to_ascii_lowercase();
975 let tag = tag_name.to_ascii_lowercase();
976 let open = format!("<{tag}");
977 let close = format!("</{tag}>");
978 let mut pos = content_start;
979 let mut depth = 1usize;
980
981 loop {
982 let next_open = lower[pos..].find(&open).map(|offset| pos + offset);
983 let next_close = lower[pos..].find(&close).map(|offset| pos + offset)?;
984
985 if next_open
986 .map(|open_pos| open_pos < next_close)
987 .unwrap_or(false)
988 {
989 let open_pos = next_open.unwrap();
990 let after_name = open_pos + open.len();
991 if is_tag_name_boundary(lower.as_bytes().get(after_name).copied()) {
992 let Some(open_end_offset) = lower[open_pos..].find('>') else {
993 return None;
994 };
995 let open_end = open_pos + open_end_offset;
996 if !lower[open_pos..=open_end].trim_end().ends_with("/>") {
997 depth += 1;
998 }
999 pos = open_end + 1;
1000 } else {
1001 pos = after_name;
1002 }
1003 continue;
1004 }
1005
1006 depth -= 1;
1007 if depth == 0 {
1008 return Some(next_close);
1009 }
1010 pos = next_close + close.len();
1011 }
1012}
1013
/// Byte length of the closing tag `</tag_name>`.
fn closing_tag_len(tag_name: &str) -> usize {
    "</".len() + tag_name.len() + ">".len()
}
1017
/// Reports whether `byte` can directly follow a tag name: `>`, `/`, space, tab, LF or CR.
///
/// `None` (end of input) is not a boundary.
fn is_tag_name_boundary(byte: Option<u8>) -> bool {
    match byte {
        Some(b'>') | Some(b'/') => true,
        Some(b' ') | Some(b'\t') | Some(b'\n') | Some(b'\r') => true,
        _ => false,
    }
}
1022
1023fn xml_start_tags_by_local_name<'a>(xml: &'a str, local_name: &str) -> Vec<&'a str> {
1024 let mut tags = Vec::new();
1025 let mut pos = 0usize;
1026 while let Some(relative_start) = xml[pos..].find('<') {
1027 let start = pos + relative_start;
1028 let Some(relative_end) = xml[start..].find('>') else {
1029 break;
1030 };
1031 let tag_end = start + relative_end;
1032 let start_tag = &xml[start..=tag_end];
1033 if opening_tag_name(start_tag)
1034 .map(|name| tag_local_name(name).eq_ignore_ascii_case(local_name))
1035 .unwrap_or(false)
1036 {
1037 tags.push(start_tag);
1038 }
1039 pos = tag_end + 1;
1040 }
1041 tags
1042}
1043
/// Extracts the element name from an opening tag such as `<ns:Tag attr="x">`.
///
/// Returns `None` for closing tags (`</…>`), comments and declarations (`<!…>`),
/// processing instructions (`<?…>`), input that does not start with `<`, and
/// tags without a name.
fn opening_tag_name(tag: &str) -> Option<&str> {
    let inner = tag.trim().strip_prefix('<')?.trim_start();
    if inner.starts_with(|c: char| matches!(c, '/' | '!' | '?')) {
        return None;
    }
    inner
        .split_whitespace()
        .next()
        // Strip `>` before `/`: for `<br/>` the first token is `br/>`, and
        // trimming `/` first would leave the slash in the name.
        .map(|name| name.trim_end_matches('>').trim_end_matches('/'))
        .filter(|name| !name.is_empty())
}
1055
/// Strips the namespace prefix from a qualified name (`pc:Page` becomes `Page`).
///
/// Everything after the last `:` is returned; a name without `:` is returned as is.
fn tag_local_name(name: &str) -> &str {
    match name.rfind(':') {
        Some(colon) => &name[colon + 1..],
        None => name,
    }
}
1061
1062fn xml_attr_f32(tag: &str, name: &str) -> Option<f32> {
1063 xml_attr_value(tag, name)?.parse::<f32>().ok()
1064}
1065
1066fn first_xml_attr_f32(tag: &str, names: &[&str]) -> Option<f32> {
1067 names.iter().find_map(|name| xml_attr_f32(tag, name))
1068}
1069
1070fn xml_attr_value(tag: &str, name: &str) -> Option<String> {
1071 let bytes = tag.as_bytes();
1072 let mut pos = 0usize;
1073 while pos < bytes.len() {
1074 while pos < bytes.len() && !is_xml_name_start(bytes[pos]) {
1075 pos += 1;
1076 }
1077 let key_start = pos;
1078 while pos < bytes.len() && is_xml_name_continue(bytes[pos]) {
1079 pos += 1;
1080 }
1081 if key_start == pos {
1082 break;
1083 }
1084 let key = &tag[key_start..pos];
1085 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
1086 pos += 1;
1087 }
1088 if bytes.get(pos) != Some(&b'=') {
1089 continue;
1090 }
1091 pos += 1;
1092 while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
1093 pos += 1;
1094 }
1095 let quote = *bytes.get(pos)?;
1096 if quote != b'"' && quote != b'\'' {
1097 continue;
1098 }
1099 pos += 1;
1100 let value_start = pos;
1101 while pos < bytes.len() && bytes[pos] != quote {
1102 pos += 1;
1103 }
1104 let value = &tag[value_start..pos];
1105 if key.eq_ignore_ascii_case(name) || tag_local_name(key).eq_ignore_ascii_case(name) {
1106 return Some(value.to_owned());
1107 }
1108 pos += 1;
1109 }
1110 None
1111}
1112
/// Reports whether `byte` may start a name here: an ASCII letter, `_` or `:`.
fn is_xml_name_start(byte: u8) -> bool {
    matches!(byte, b'a'..=b'z' | b'A'..=b'Z' | b'_' | b':')
}
1116
1117fn is_xml_name_continue(byte: u8) -> bool {
1118 is_xml_name_start(byte) || byte.is_ascii_digit() || byte == b'-' || byte == b'.'
1119}
1120
1121pub(crate) fn html_to_text(html: &str) -> String {
1122 let without_ignored = remove_html_ranges(html, &["script", "style", "title", "head"]);
1123 let mut output = String::new();
1124 let bytes = without_ignored.as_bytes();
1125 let mut pos = 0;
1126 let mut pending_space = false;
1127
1128 while pos < bytes.len() {
1129 if bytes[pos] == b'<' {
1130 if let Some(end) = without_ignored[pos..].find('>') {
1131 let tag = without_ignored[pos + 1..pos + end].trim();
1132 if is_block_tag(tag) {
1133 push_newline(&mut output);
1134 }
1135 pos += end + 1;
1136 pending_space = false;
1137 continue;
1138 }
1139 }
1140
1141 let Some(character) = without_ignored[pos..].chars().next() else {
1142 break;
1143 };
1144 if character == '&' {
1145 if let Some((decoded, consumed)) = decode_entity(&without_ignored[pos..]) {
1146 if pending_space {
1147 output.push(' ');
1148 }
1149 output.push_str(&decoded);
1150 pos += consumed;
1151 pending_space = false;
1152 continue;
1153 }
1154 }
1155 if character.is_whitespace() {
1156 pending_space = !output.ends_with('\n') && !output.is_empty();
1157 } else {
1158 if pending_space {
1159 output.push(' ');
1160 }
1161 output.push(character);
1162 pending_space = false;
1163 }
1164 pos += character.len_utf8();
1165 }
1166
1167 normalize_text_lines(&output)
1168}
1169
1170fn remove_html_ranges(input: &str, tags: &[&str]) -> String {
1171 let mut output = String::new();
1172 let mut pos = 0;
1173 while pos < input.len() {
1174 let lower_rest = input[pos..].to_ascii_lowercase();
1175 let Some((tag, start)) = find_ignored_tag_start(&lower_rest, tags) else {
1176 output.push_str(&input[pos..]);
1177 break;
1178 };
1179
1180 output.push_str(&input[pos..pos + start]);
1181 let after_open = pos + start;
1182 let close = format!("</{tag}>");
1183 let lower_after_open = input[after_open..].to_ascii_lowercase();
1184 if let Some(end) = lower_after_open.find(&close) {
1185 pos = after_open + end + close.len();
1186 } else {
1187 break;
1188 }
1189 }
1190 output
1191}
1192
1193fn find_ignored_tag_start<'a>(lower_input: &str, tags: &[&'a str]) -> Option<(&'a str, usize)> {
1194 tags.iter()
1195 .filter_map(|tag| find_tag_start(lower_input, tag).map(|start| (*tag, start)))
1196 .min_by_key(|(_, start)| *start)
1197}
1198
/// Returns the byte offset of the first opening tag named `tag` in `input`.
///
/// An occurrence of `<tag` only counts when the byte right after the name is
/// `>`, `/`, a space, a tab, or a line break, so that `<head` does not match
/// inside `<header>`. The comparison is case-sensitive.
fn find_tag_start(input: &str, tag: &str) -> Option<usize> {
    let needle = format!("<{tag}");
    let haystack = input.as_bytes();
    input
        .match_indices(needle.as_str())
        .map(|(index, _)| index)
        .find(|&index| {
            matches!(
                haystack.get(index + needle.len()),
                Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r')
            )
        })
}
1217
/// Reports whether the inside of a tag (the text between `<` and `>`) names an
/// element that should start a new paragraph in the extracted text.
///
/// Leading slashes (closing tags), attributes after the first whitespace and
/// trailing slashes (self-closing tags) are ignored; the name comparison is
/// ASCII case-insensitive.
fn is_block_tag(tag: &str) -> bool {
    const BLOCK_TAGS: &[&str] = &[
        "address",
        "article",
        "article-title",
        "aside",
        "abstract",
        "back",
        "blockquote",
        "body",
        "br",
        "caption",
        "div",
        "footer",
        "front",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "item",
        "li",
        "list",
        "main",
        "mixed-citation",
        "p",
        "ref",
        "sec",
        "section",
        "table-wrap",
        "tr",
    ];

    let without_slash = tag.trim_start_matches('/');
    let first_token = without_slash.split_whitespace().next().unwrap_or("");
    let name = first_token.trim_end_matches('/');
    BLOCK_TAGS
        .iter()
        .any(|known| name.eq_ignore_ascii_case(known))
}
1260
/// Decodes a single character reference located at the start of `input`.
///
/// `input` must begin with `&`. Supported references are the named entities
/// `amp`, `lt`, `gt`, `quot`, `apos` and `nbsp` (decoded to a plain space),
/// plus decimal (`&#65;`) and hexadecimal (`&#x41;`) numeric references.
///
/// Returns the decoded text together with the number of bytes of `input` the
/// reference occupies (including the leading `&` and the terminating `;`).
/// Returns `None` when `input` does not start with `&`, when no `;` follows
/// within the length limit, when the name is unknown, or when a numeric
/// reference is malformed or not a valid Unicode scalar value.
fn decode_entity(input: &str) -> Option<(String, usize)> {
    /// Longest reference body (the text between `&` and `;`) that is accepted.
    const MAX_ENTITY_LEN: usize = 15;

    let rest = input.strip_prefix('&')?;
    // Only look at a bounded window: this keeps a stray `&` from scanning the
    // whole remaining document, and because `;` is ASCII the byte position
    // found here is always a valid char boundary for slicing.
    let body_len = rest
        .bytes()
        .take(MAX_ENTITY_LEN + 1)
        .position(|byte| byte == b';')?;
    let entity = &rest[..body_len];

    let decoded = match entity {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        "nbsp" => ' ',
        _ => {
            let number = entity.strip_prefix('#')?;
            let (digits, radix) = match number.strip_prefix(|c: char| c == 'x' || c == 'X') {
                Some(hex) => (hex, 16),
                None => (number, 10),
            };
            // `from_str_radix` tolerates a leading `+`, which is not part of a
            // valid numeric reference, so insist on digits only.
            if !digits.chars().all(|c| c.is_digit(radix)) {
                return None;
            }
            char::from_u32(u32::from_str_radix(digits, radix).ok()?)?
        }
    };
    // `&` + body + `;`
    Some((decoded.to_string(), body_len + 2))
}
1281
1282fn parse_email(raw: &str) -> EmailParts {
1283 let normalized = raw.replace("\r\n", "\n").replace('\r', "\n");
1284 let (headers, body) = normalized
1285 .split_once("\n\n")
1286 .unwrap_or((normalized.as_str(), ""));
1287 let mut subject_lines = Vec::new();
1288 let mut active_header = String::new();
1289
1290 for line in headers.lines() {
1291 if line.starts_with(' ') || line.starts_with('\t') {
1292 if active_header.eq_ignore_ascii_case("subject") {
1293 subject_lines.push(line.trim().to_owned());
1294 }
1295 continue;
1296 }
1297
1298 let Some((name, value)) = line.split_once(':') else {
1299 active_header.clear();
1300 continue;
1301 };
1302 active_header = name.trim().to_owned();
1303 if active_header.eq_ignore_ascii_case("subject") {
1304 subject_lines.push(value.trim().to_owned());
1305 }
1306 }
1307
1308 EmailParts {
1309 subject: (!subject_lines.is_empty())
1310 .then(|| decode_rfc2047_words(&subject_lines.join(" "))),
1311 body: normalize_text_lines(body),
1312 }
1313}
1314
/// Returns an owned copy of a header value.
///
/// NOTE(review): despite the name, no encoded-word (`=?charset?Q?...?=`)
/// decoding is performed here; the value is returned unchanged.
fn decode_rfc2047_words(value: &str) -> String {
    String::from(value)
}
1320
/// Ends the current paragraph in `output`.
///
/// Trailing spaces are removed, then line breaks are appended until the text
/// ends with exactly one blank line (`"\n\n"`). Empty output is left empty so
/// the text never starts with blank lines.
fn push_newline(output: &mut String) {
    let kept = output.trim_end_matches(' ').len();
    output.truncate(kept);

    if output.is_empty() || output.ends_with("\n\n") {
        return;
    }
    let missing = if output.ends_with('\n') { "\n" } else { "\n\n" };
    output.push_str(missing);
}
1333
/// Tidies multi-line text.
///
/// Within each line, runs of whitespace collapse to a single space and
/// leading/trailing whitespace is removed. Consecutive blank lines collapse to
/// one, and blank lines at the start or end of the text are dropped. Lines are
/// joined with `\n`.
fn normalize_text_lines(text: &str) -> String {
    let mut kept: Vec<String> = Vec::new();

    for raw_line in text.lines() {
        let collapsed = raw_line.split_whitespace().collect::<Vec<_>>().join(" ");
        // A blank line is only kept when it follows a non-blank line.
        let previous_is_blank = kept.last().map_or(true, |line| line.is_empty());
        if collapsed.is_empty() && previous_is_blank {
            continue;
        }
        kept.push(collapsed);
    }

    while matches!(kept.last(), Some(line) if line.is_empty()) {
        kept.pop();
    }
    kept.join("\n")
}