Skip to main content

dongler_core/
textual.rs

1use crate::engine::{text_document_from_text, ExtractionEngine};
2use crate::error::Result;
3use crate::ir::{
4    BBox, Block, Confidence, Document, Line, Metadata, Page, SourceAnchor, Span, TextBlock,
5    SCHEMA_VERSION,
6};
7use crate::source::Source;
8
/// Extraction engine for HTML input.
///
/// Stateless marker type; all behaviour lives in its `ExtractionEngine` impl.
#[derive(Clone, Copy, Debug, Default)]
pub struct HtmlEngine;
11
/// Extraction engine for e-mail input.
///
/// Stateless marker type; all behaviour lives in its `ExtractionEngine` impl.
#[derive(Clone, Copy, Debug, Default)]
pub struct EmailEngine;
14
/// Extraction engine for XML input.
///
/// Stateless marker type; all behaviour lives in its `ExtractionEngine` impl.
#[derive(Clone, Copy, Debug, Default)]
pub struct XmlEngine;
17
18impl ExtractionEngine for HtmlEngine {
19    fn name(&self) -> &'static str {
20        "html-native"
21    }
22
23    fn extract(&self, source: &Source) -> Result<Document> {
24        if let Some(document) = hocr_document(source, self.name()) {
25            return Ok(document);
26        }
27        text_document_from_text(source, self.name(), &html_to_text(&source.content), None)
28    }
29}
30
31impl ExtractionEngine for EmailEngine {
32    fn name(&self) -> &'static str {
33        "email-native"
34    }
35
36    fn extract(&self, source: &Source) -> Result<Document> {
37        let email = parse_email(&source.content);
38        let text = match (&email.subject, email.body.trim()) {
39            (Some(subject), body) if !body.is_empty() => format!("{subject}\n\n{body}"),
40            (Some(subject), _) => subject.clone(),
41            (None, body) => body.to_owned(),
42        };
43        text_document_from_text(source, self.name(), &text, email.subject)
44    }
45}
46
47impl ExtractionEngine for XmlEngine {
48    fn name(&self) -> &'static str {
49        "xml-native"
50    }
51
52    fn extract(&self, source: &Source) -> Result<Document> {
53        if let Some(document) = page_xml_document(source, self.name()) {
54            return Ok(document);
55        }
56        if let Some(document) = alto_document(source, self.name()) {
57            return Ok(document);
58        }
59        if let Some(document) = pascal_voc_document(source, self.name()) {
60            return Ok(document);
61        }
62        text_document_from_text(source, self.name(), &html_to_text(&source.content), None)
63    }
64}
65
/// Pieces of an e-mail message consumed by `EmailEngine::extract`.
#[derive(Default, Debug)]
struct EmailParts {
    /// Subject line, when one is available.
    subject: Option<String>,
    /// Message body; the engine trims it before use.
    body: String,
}
71
72#[derive(Debug)]
73struct PascalVocObject {
74    name: String,
75    bbox: BBox,
76}
77
/// Borrowed view of one markup element located by the scanners in this file.
#[derive(Clone, Copy, Debug)]
struct XmlElement<'a> {
    /// The opening tag, from `<` through `>` inclusive.
    start_tag: &'a str,
    /// Everything between the opening tag and its closing tag.
    content: &'a str,
}
83
84#[derive(Debug, Clone)]
85struct AltoWord {
86    text: String,
87    bbox: Option<BBox>,
88    confidence: Option<f32>,
89}
90
91#[derive(Debug, Clone)]
92struct HocrWord {
93    text: String,
94    bbox: Option<BBox>,
95    confidence: Option<f32>,
96}
97
98#[derive(Debug, Clone)]
99struct PageXmlWord {
100    text: String,
101    bbox: Option<BBox>,
102    confidence: Option<f32>,
103}
104
105fn hocr_document(source: &Source, engine_name: &str) -> Option<Document> {
106    if !source.content.contains("ocr_page")
107        && !source.content.contains("ocr_line")
108        && !source.content.contains("ocrx_word")
109    {
110        return None;
111    }
112
113    let page_element = hocr_elements_with_class(&source.content, "ocr_page")
114        .into_iter()
115        .next();
116    let page_content = page_element
117        .as_ref()
118        .map(|element| element.content)
119        .unwrap_or(source.content.as_str());
120    let page_bbox = page_element
121        .as_ref()
122        .and_then(|element| hocr_bbox_from_tag(element.start_tag));
123    let mut blocks = hocr_elements_with_any_class(page_content, &["ocr_line", "ocrx_line"])
124        .into_iter()
125        .filter_map(|line| hocr_line_block(line, 1))
126        .collect::<Vec<_>>();
127
128    if blocks.is_empty() {
129        blocks = hocr_elements_with_class(page_content, "ocrx_word")
130            .into_iter()
131            .filter_map(hocr_word_from_element)
132            .map(|word| hocr_word_block(word, 1))
133            .collect();
134    }
135    if blocks.is_empty() {
136        return None;
137    }
138
139    let page_bbox = page_bbox.or_else(|| inferred_block_bbox(&blocks));
140    let text = blocks
141        .iter()
142        .filter_map(|block| match block {
143            Block::Text(text) => Some(text.text.as_str()),
144            _ => None,
145        })
146        .collect::<Vec<_>>()
147        .join("\n");
148
149    Some(Document {
150        schema_version: SCHEMA_VERSION.to_owned(),
151        metadata: Metadata {
152            format: source.format.clone(),
153            engine: engine_name.to_owned(),
154            source: source.path.clone(),
155            title: None,
156            character_count: text.chars().count(),
157            word_count: text.split_whitespace().count(),
158            block_count: blocks.len(),
159            file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
160            pdf_version: None,
161            encrypted: false,
162        },
163        pages: vec![Page {
164            number: 1,
165            width: page_bbox.map(|bbox| bbox.width),
166            height: page_bbox.map(|bbox| bbox.height),
167            rotation: None,
168            bbox: page_bbox,
169            blocks,
170            images: Vec::new(),
171            assets: Vec::new(),
172            warnings: Vec::new(), ..Default::default()
173        }],
174        assets: Vec::new(),
175        warnings: Vec::new(),
176    })
177}
178
179fn hocr_line_block(line: XmlElement<'_>, page_number: usize) -> Option<Block> {
180    let words = hocr_elements_with_class(line.content, "ocrx_word")
181        .into_iter()
182        .filter_map(hocr_word_from_element)
183        .collect::<Vec<_>>();
184    if words.is_empty() {
185        let text = html_to_text(line.content);
186        if text.trim().is_empty() {
187            return None;
188        }
189        let bbox = hocr_bbox_from_tag(line.start_tag);
190        return Some(Block::Text(TextBlock {
191            text: text.split_whitespace().collect::<Vec<_>>().join(" "),
192            kind: "ocr_line".to_owned(),
193            bbox,
194            lines: Vec::new(),
195            source_anchors: vec![html_source_anchor(page_number, bbox)],
196            confidence: Some(Confidence {
197                score: 0.9,
198                calibrated: false,
199            }), ..Default::default()
200        }));
201    }
202
203    let text = words
204        .iter()
205        .map(|word| word.text.as_str())
206        .collect::<Vec<_>>()
207        .join(" ");
208    let bbox = hocr_bbox_from_tag(line.start_tag).or_else(|| inferred_hocr_word_bbox(&words));
209    let confidence = mean_confidence(words.iter().filter_map(|word| word.confidence));
210    let spans = words
211        .iter()
212        .map(|word| Span {
213            text: word.text.clone(),
214            bbox: word.bbox,
215            font: None,
216            size: None,
217            bold: false,
218            italic: false,
219        })
220        .collect::<Vec<_>>();
221
222    Some(Block::Text(TextBlock {
223        text: text.clone(),
224        kind: "ocr_line".to_owned(),
225        bbox,
226        lines: vec![Line { text, bbox, spans }],
227        source_anchors: vec![html_source_anchor(page_number, bbox)],
228        confidence: Some(Confidence {
229            score: confidence.unwrap_or(0.9),
230            calibrated: false,
231        }), ..Default::default()
232    }))
233}
234
235fn hocr_word_block(word: HocrWord, page_number: usize) -> Block {
236    Block::Text(TextBlock {
237        text: word.text.clone(),
238        kind: "ocr_word".to_owned(),
239        bbox: word.bbox,
240        lines: Vec::new(),
241        source_anchors: vec![html_source_anchor(page_number, word.bbox)],
242        confidence: Some(Confidence {
243            score: word.confidence.unwrap_or(0.9),
244            calibrated: false,
245        }), ..Default::default()
246    })
247}
248
249fn hocr_word_from_element(element: XmlElement<'_>) -> Option<HocrWord> {
250    let text = html_to_text(element.content)
251        .split_whitespace()
252        .collect::<Vec<_>>()
253        .join(" ");
254    if text.is_empty() {
255        return None;
256    }
257    Some(HocrWord {
258        text,
259        bbox: hocr_bbox_from_tag(element.start_tag),
260        confidence: hocr_confidence_from_tag(element.start_tag),
261    })
262}
263
264fn page_xml_document(source: &Source, engine_name: &str) -> Option<Document> {
265    if !source.content.contains("PcGts") && !source.content.contains("TextRegion") {
266        return None;
267    }
268
269    let page_element = xml_elements_by_local_name(&source.content, "Page")
270        .into_iter()
271        .next()?;
272    let width = first_xml_attr_f32(
273        page_element.start_tag,
274        &["imageWidth", "image_width", "WIDTH", "width"],
275    );
276    let height = first_xml_attr_f32(
277        page_element.start_tag,
278        &["imageHeight", "image_height", "HEIGHT", "height"],
279    );
280    let blocks = xml_elements_by_local_name(page_element.content, "TextLine")
281        .into_iter()
282        .filter_map(|line| page_xml_line_block(line, 1))
283        .collect::<Vec<_>>();
284    if blocks.is_empty() {
285        return None;
286    }
287
288    let page_bbox = page_bbox(width, height).or_else(|| inferred_block_bbox(&blocks));
289    let text = blocks
290        .iter()
291        .filter_map(|block| match block {
292            Block::Text(text) => Some(text.text.as_str()),
293            _ => None,
294        })
295        .collect::<Vec<_>>()
296        .join("\n");
297
298    Some(Document {
299        schema_version: SCHEMA_VERSION.to_owned(),
300        metadata: Metadata {
301            format: source.format.clone(),
302            engine: engine_name.to_owned(),
303            source: source.path.clone(),
304            title: None,
305            character_count: text.chars().count(),
306            word_count: text.split_whitespace().count(),
307            block_count: blocks.len(),
308            file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
309            pdf_version: None,
310            encrypted: false,
311        },
312        pages: vec![Page {
313            number: 1,
314            width: width.or_else(|| page_bbox.map(|bbox| bbox.width)),
315            height: height.or_else(|| page_bbox.map(|bbox| bbox.height)),
316            rotation: None,
317            bbox: page_bbox,
318            blocks,
319            images: Vec::new(),
320            assets: Vec::new(),
321            warnings: Vec::new(), ..Default::default()
322        }],
323        assets: Vec::new(),
324        warnings: Vec::new(),
325    })
326}
327
328fn page_xml_line_block(line: XmlElement<'_>, page_number: usize) -> Option<Block> {
329    let words = xml_elements_by_local_name(line.content, "Word")
330        .into_iter()
331        .filter_map(page_xml_word_from_element)
332        .collect::<Vec<_>>();
333    let bbox =
334        page_xml_bbox_from_content(line.content).or_else(|| inferred_page_xml_word_bbox(&words));
335
336    if words.is_empty() {
337        let text = page_xml_text_from_content(line.content)?;
338        if text.is_empty() {
339            return None;
340        }
341        return Some(Block::Text(TextBlock {
342            text,
343            kind: "ocr_line".to_owned(),
344            bbox,
345            lines: Vec::new(),
346            source_anchors: vec![xml_source_anchor(page_number, bbox)],
347            confidence: Some(Confidence {
348                score: page_xml_confidence_from_content(line.content).unwrap_or(0.9),
349                calibrated: false,
350            }), ..Default::default()
351        }));
352    }
353
354    let text = page_xml_text_from_content(line.content).unwrap_or_else(|| {
355        words
356            .iter()
357            .map(|word| word.text.as_str())
358            .collect::<Vec<_>>()
359            .join(" ")
360    });
361    let confidence = mean_confidence(words.iter().filter_map(|word| word.confidence));
362    let spans = words
363        .iter()
364        .map(|word| Span {
365            text: word.text.clone(),
366            bbox: word.bbox,
367            font: None,
368            size: None,
369            bold: false,
370            italic: false,
371        })
372        .collect::<Vec<_>>();
373
374    Some(Block::Text(TextBlock {
375        text: text.clone(),
376        kind: "ocr_line".to_owned(),
377        bbox,
378        lines: vec![Line { text, bbox, spans }],
379        source_anchors: vec![xml_source_anchor(page_number, bbox)],
380        confidence: Some(Confidence {
381            score: confidence.unwrap_or(0.9),
382            calibrated: false,
383        }), ..Default::default()
384    }))
385}
386
387fn page_xml_word_from_element(element: XmlElement<'_>) -> Option<PageXmlWord> {
388    let text = page_xml_text_from_content(element.content)?;
389    if text.is_empty() {
390        return None;
391    }
392    Some(PageXmlWord {
393        text,
394        bbox: page_xml_bbox_from_content(element.content),
395        confidence: page_xml_confidence_from_content(element.content),
396    })
397}
398
399fn alto_document(source: &Source, engine_name: &str) -> Option<Document> {
400    let page_element = xml_elements_by_local_name(&source.content, "Page")
401        .into_iter()
402        .next()?;
403    let width = xml_attr_f32(page_element.start_tag, "WIDTH");
404    let height = xml_attr_f32(page_element.start_tag, "HEIGHT");
405    let mut blocks = xml_elements_by_local_name(page_element.content, "TextLine")
406        .into_iter()
407        .filter_map(|line| alto_line_block(line, 1))
408        .collect::<Vec<_>>();
409
410    if blocks.is_empty() {
411        blocks = xml_start_tags_by_local_name(page_element.content, "String")
412            .into_iter()
413            .filter_map(|tag| alto_word_from_tag(tag))
414            .map(|word| alto_word_block(word, 1))
415            .collect();
416    }
417    if blocks.is_empty() {
418        return None;
419    }
420
421    let page_bbox = page_bbox(width, height).or_else(|| inferred_block_bbox(&blocks));
422    let text = blocks
423        .iter()
424        .filter_map(|block| match block {
425            Block::Text(text) => Some(text.text.as_str()),
426            _ => None,
427        })
428        .collect::<Vec<_>>()
429        .join("\n");
430
431    Some(Document {
432        schema_version: SCHEMA_VERSION.to_owned(),
433        metadata: Metadata {
434            format: source.format.clone(),
435            engine: engine_name.to_owned(),
436            source: source.path.clone(),
437            title: None,
438            character_count: text.chars().count(),
439            word_count: text.split_whitespace().count(),
440            block_count: blocks.len(),
441            file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
442            pdf_version: None,
443            encrypted: false,
444        },
445        pages: vec![Page {
446            number: 1,
447            width: width.or_else(|| page_bbox.map(|bbox| bbox.width)),
448            height: height.or_else(|| page_bbox.map(|bbox| bbox.height)),
449            rotation: None,
450            bbox: page_bbox,
451            blocks,
452            images: Vec::new(),
453            assets: Vec::new(),
454            warnings: Vec::new(), ..Default::default()
455        }],
456        assets: Vec::new(),
457        warnings: Vec::new(),
458    })
459}
460
461fn alto_line_block(line: XmlElement<'_>, page_number: usize) -> Option<Block> {
462    let words = xml_start_tags_by_local_name(line.content, "String")
463        .into_iter()
464        .filter_map(alto_word_from_tag)
465        .collect::<Vec<_>>();
466    if words.is_empty() {
467        return None;
468    }
469    let text = words
470        .iter()
471        .map(|word| word.text.as_str())
472        .collect::<Vec<_>>()
473        .join(" ");
474    let bbox = alto_bbox_from_tag(line.start_tag).or_else(|| inferred_word_bbox(&words));
475    let confidence = mean_confidence(words.iter().filter_map(|word| word.confidence));
476    let spans = words
477        .iter()
478        .map(|word| Span {
479            text: word.text.clone(),
480            bbox: word.bbox,
481            font: None,
482            size: None,
483            bold: false,
484            italic: false,
485        })
486        .collect::<Vec<_>>();
487
488    Some(Block::Text(TextBlock {
489        text,
490        kind: "ocr_line".to_owned(),
491        bbox,
492        lines: vec![Line {
493            text: words
494                .iter()
495                .map(|word| word.text.as_str())
496                .collect::<Vec<_>>()
497                .join(" "),
498            bbox,
499            spans,
500        }],
501        source_anchors: vec![xml_source_anchor(page_number, bbox)],
502        confidence: Some(Confidence {
503            score: confidence.unwrap_or(0.9),
504            calibrated: false,
505        }), ..Default::default()
506    }))
507}
508
509fn alto_word_block(word: AltoWord, page_number: usize) -> Block {
510    Block::Text(TextBlock {
511        text: word.text.clone(),
512        kind: "ocr_word".to_owned(),
513        bbox: word.bbox,
514        lines: Vec::new(),
515        source_anchors: vec![xml_source_anchor(page_number, word.bbox)],
516        confidence: Some(Confidence {
517            score: word.confidence.unwrap_or(0.9),
518            calibrated: false,
519        }), ..Default::default()
520    })
521}
522
523fn alto_word_from_tag(tag: &str) -> Option<AltoWord> {
524    let text = xml_attr_value(tag, "CONTENT")
525        .map(|value| html_to_text(&value))
526        .map(|text| text.split_whitespace().collect::<Vec<_>>().join(" "))?;
527    if text.is_empty() {
528        return None;
529    }
530    Some(AltoWord {
531        text,
532        bbox: alto_bbox_from_tag(tag),
533        confidence: xml_attr_f32(tag, "WC"),
534    })
535}
536
537fn alto_bbox_from_tag(tag: &str) -> Option<BBox> {
538    Some(BBox {
539        x: xml_attr_f32(tag, "HPOS")?,
540        y: xml_attr_f32(tag, "VPOS")?,
541        width: xml_attr_f32(tag, "WIDTH")?,
542        height: xml_attr_f32(tag, "HEIGHT")?,
543    })
544}
545
546fn hocr_bbox_from_tag(tag: &str) -> Option<BBox> {
547    let title = xml_attr_value(tag, "title")?;
548    let mut parts = title
549        .split(';')
550        .find_map(|part| part.trim().strip_prefix("bbox "))?
551        .split_whitespace();
552    let left = parts.next()?.parse::<f32>().ok()?;
553    let top = parts.next()?.parse::<f32>().ok()?;
554    let right = parts.next()?.parse::<f32>().ok()?;
555    let bottom = parts.next()?.parse::<f32>().ok()?;
556    Some(BBox {
557        x: left.min(right),
558        y: top.min(bottom),
559        width: (right - left).abs(),
560        height: (bottom - top).abs(),
561    })
562}
563
564fn hocr_confidence_from_tag(tag: &str) -> Option<f32> {
565    let title = xml_attr_value(tag, "title")?;
566    let value = title
567        .split(';')
568        .find_map(|part| part.trim().strip_prefix("x_wconf "))?
569        .split_whitespace()
570        .next()?
571        .parse::<f32>()
572        .ok()?;
573    Some((value / 100.0).clamp(0.0, 1.0))
574}
575
576fn page_xml_text_from_content(content: &str) -> Option<String> {
577    xml_elements_by_local_name(content, "Unicode")
578        .into_iter()
579        .last()
580        .map(|unicode| html_to_text(unicode.content))
581        .map(|text| text.split_whitespace().collect::<Vec<_>>().join(" "))
582        .filter(|text| !text.is_empty())
583}
584
585fn page_xml_confidence_from_content(content: &str) -> Option<f32> {
586    xml_elements_by_local_name(content, "TextEquiv")
587        .into_iter()
588        .last()
589        .and_then(|element| xml_attr_f32(element.start_tag, "conf"))
590}
591
592fn page_xml_bbox_from_content(content: &str) -> Option<BBox> {
593    let coords = xml_start_tags_by_local_name(content, "Coords")
594        .into_iter()
595        .next()?;
596    let points = xml_attr_value(coords, "points")?;
597    bbox_from_points(&points)
598}
599
600fn bbox_from_points(points: &str) -> Option<BBox> {
601    let mut min_x = f32::INFINITY;
602    let mut min_y = f32::INFINITY;
603    let mut max_x = f32::NEG_INFINITY;
604    let mut max_y = f32::NEG_INFINITY;
605    let mut count = 0usize;
606
607    for point in points.split_whitespace() {
608        let Some((x, y)) = point.split_once(',') else {
609            continue;
610        };
611        let x = x.parse::<f32>().ok()?;
612        let y = y.parse::<f32>().ok()?;
613        min_x = min_x.min(x);
614        min_y = min_y.min(y);
615        max_x = max_x.max(x);
616        max_y = max_y.max(y);
617        count += 1;
618    }
619
620    (count > 0).then_some(BBox {
621        x: min_x,
622        y: min_y,
623        width: max_x - min_x,
624        height: max_y - min_y,
625    })
626}
627
628fn inferred_page_xml_word_bbox(words: &[PageXmlWord]) -> Option<BBox> {
629    let mut min_x = f32::INFINITY;
630    let mut min_y = f32::INFINITY;
631    let mut max_x = f32::NEG_INFINITY;
632    let mut max_y = f32::NEG_INFINITY;
633    let mut has_bbox = false;
634    for word in words {
635        let Some(bbox) = word.bbox else {
636            continue;
637        };
638        has_bbox = true;
639        min_x = min_x.min(bbox.x);
640        min_y = min_y.min(bbox.y);
641        max_x = max_x.max(bbox.x + bbox.width);
642        max_y = max_y.max(bbox.y + bbox.height);
643    }
644    has_bbox.then_some(BBox {
645        x: min_x,
646        y: min_y,
647        width: max_x - min_x,
648        height: max_y - min_y,
649    })
650}
651
652fn inferred_hocr_word_bbox(words: &[HocrWord]) -> Option<BBox> {
653    let mut min_x = f32::INFINITY;
654    let mut min_y = f32::INFINITY;
655    let mut max_x = f32::NEG_INFINITY;
656    let mut max_y = f32::NEG_INFINITY;
657    let mut has_bbox = false;
658    for word in words {
659        let Some(bbox) = word.bbox else {
660            continue;
661        };
662        has_bbox = true;
663        min_x = min_x.min(bbox.x);
664        min_y = min_y.min(bbox.y);
665        max_x = max_x.max(bbox.x + bbox.width);
666        max_y = max_y.max(bbox.y + bbox.height);
667    }
668    has_bbox.then_some(BBox {
669        x: min_x,
670        y: min_y,
671        width: max_x - min_x,
672        height: max_y - min_y,
673    })
674}
675
676fn inferred_word_bbox(words: &[AltoWord]) -> Option<BBox> {
677    let mut min_x = f32::INFINITY;
678    let mut min_y = f32::INFINITY;
679    let mut max_x = f32::NEG_INFINITY;
680    let mut max_y = f32::NEG_INFINITY;
681    let mut has_bbox = false;
682    for word in words {
683        let Some(bbox) = word.bbox else {
684            continue;
685        };
686        has_bbox = true;
687        min_x = min_x.min(bbox.x);
688        min_y = min_y.min(bbox.y);
689        max_x = max_x.max(bbox.x + bbox.width);
690        max_y = max_y.max(bbox.y + bbox.height);
691    }
692    has_bbox.then_some(BBox {
693        x: min_x,
694        y: min_y,
695        width: max_x - min_x,
696        height: max_y - min_y,
697    })
698}
699
700fn inferred_block_bbox(blocks: &[Block]) -> Option<BBox> {
701    let mut max_x = 0.0f32;
702    let mut max_y = 0.0f32;
703    let mut has_bbox = false;
704    for block in blocks {
705        let Some(bbox) = block_bbox(block) else {
706            continue;
707        };
708        has_bbox = true;
709        max_x = max_x.max(bbox.x + bbox.width);
710        max_y = max_y.max(bbox.y + bbox.height);
711    }
712    has_bbox.then_some(BBox {
713        x: 0.0,
714        y: 0.0,
715        width: max_x,
716        height: max_y,
717    })
718}
719
720fn block_bbox(block: &Block) -> Option<BBox> {
721    match block {
722        Block::Text(text) => text.bbox,
723        Block::Table(table) => table.bbox,
724        Block::Figure(figure) => figure.bbox,
725    }
726}
727
728fn page_bbox(width: Option<f32>, height: Option<f32>) -> Option<BBox> {
729    Some(BBox {
730        x: 0.0,
731        y: 0.0,
732        width: width?,
733        height: height?,
734    })
735}
736
737fn xml_source_anchor(page_number: usize, bbox: Option<BBox>) -> SourceAnchor {
738    SourceAnchor {
739        page_number,
740        pdf_object_ids: Vec::new(),
741        bbox,
742        extraction_method: "xml_native".to_owned(),
743    }
744}
745
746fn html_source_anchor(page_number: usize, bbox: Option<BBox>) -> SourceAnchor {
747    SourceAnchor {
748        page_number,
749        pdf_object_ids: Vec::new(),
750        bbox,
751        extraction_method: "html_native".to_owned(),
752    }
753}
754
/// Returns the arithmetic mean of `values`, or `None` when there are none.
fn mean_confidence(values: impl Iterator<Item = f32>) -> Option<f32> {
    let (total, count) = values.fold((0.0f32, 0usize), |(sum, seen), value| {
        (sum + value, seen + 1)
    });
    if count == 0 {
        None
    } else {
        Some(total / count as f32)
    }
}
764
765fn pascal_voc_document(source: &Source, engine_name: &str) -> Option<Document> {
766    let width = tag_text(&source.content, "width")?.parse::<f32>().ok()?;
767    let height = tag_text(&source.content, "height")?.parse::<f32>().ok()?;
768    let objects = pascal_voc_objects(&source.content);
769    if objects.is_empty() {
770        return None;
771    }
772
773    let blocks = objects
774        .into_iter()
775        .map(|object| {
776            Block::Text(TextBlock {
777                kind: object.name.clone(),
778                text: object.name,
779                bbox: Some(object.bbox),
780                lines: Vec::new(),
781                source_anchors: vec![SourceAnchor {
782                    page_number: 1,
783                    pdf_object_ids: Vec::new(),
784                    bbox: Some(object.bbox),
785                    extraction_method: "xml_native".to_owned(),
786                }],
787                confidence: Some(Confidence {
788                    score: 0.9,
789                    calibrated: false,
790                }), ..Default::default()
791            })
792        })
793        .collect::<Vec<_>>();
794    let text = blocks
795        .iter()
796        .filter_map(|block| match block {
797            Block::Text(text) => Some(text.text.as_str()),
798            _ => None,
799        })
800        .collect::<Vec<_>>()
801        .join("\n");
802
803    Some(Document {
804        schema_version: SCHEMA_VERSION.to_owned(),
805        metadata: Metadata {
806            format: source.format.clone(),
807            engine: engine_name.to_owned(),
808            source: source.path.clone(),
809            title: None,
810            character_count: text.chars().count(),
811            word_count: text.split_whitespace().count(),
812            block_count: blocks.len(),
813            file_size_bytes: source.bytes.as_ref().map(|bytes| bytes.len() as u64),
814            pdf_version: None,
815            encrypted: false,
816        },
817        pages: vec![Page {
818            number: 1,
819            width: Some(width),
820            height: Some(height),
821            rotation: None,
822            bbox: Some(BBox {
823                x: 0.0,
824                y: 0.0,
825                width,
826                height,
827            }),
828            blocks,
829            images: Vec::new(),
830            assets: Vec::new(),
831            warnings: Vec::new(), ..Default::default()
832        }],
833        assets: Vec::new(),
834        warnings: Vec::new(),
835    })
836}
837
838fn pascal_voc_objects(xml: &str) -> Vec<PascalVocObject> {
839    tag_ranges(xml, "object")
840        .into_iter()
841        .filter_map(|range| {
842            let object_xml = &xml[range.0..range.1];
843            let name = tag_text(object_xml, "name")?;
844            let xmin = tag_text(object_xml, "xmin")?.parse::<f32>().ok()?;
845            let ymin = tag_text(object_xml, "ymin")?.parse::<f32>().ok()?;
846            let xmax = tag_text(object_xml, "xmax")?.parse::<f32>().ok()?;
847            let ymax = tag_text(object_xml, "ymax")?.parse::<f32>().ok()?;
848            Some(PascalVocObject {
849                name,
850                bbox: BBox {
851                    x: xmin.min(xmax),
852                    y: ymin.min(ymax),
853                    width: (xmax - xmin).abs(),
854                    height: (ymax - ymin).abs(),
855                },
856            })
857        })
858        .collect()
859}
860
861fn tag_text(xml: &str, tag: &str) -> Option<String> {
862    let range = tag_ranges(xml, tag).into_iter().next()?;
863    Some(html_to_text(&xml[range.0..range.1]).trim().to_owned())
864}
865
/// Finds the content ranges of every `<tag>…</tag>` element in `xml`.
///
/// Matching is ASCII case-insensitive for both the document and `tag`. Each
/// returned pair is the byte range `(content_start, content_end)` between the
/// end of the opening tag and the start of the closing tag, valid for slicing
/// `xml`. Nested elements of the same name are not tracked: the first closing
/// tag after an opening tag ends the element.
///
/// Opening tags are only accepted when the name is followed by `>`, `/` or
/// whitespace, so searching for `name` does not match `<names>`. Self-closing
/// tags (`<tag/>`) have no content and are skipped.
fn tag_ranges(xml: &str, tag: &str) -> Vec<(usize, usize)> {
    // ASCII lowercasing never changes byte lengths, so offsets found in
    // `lower` are valid offsets into `xml`.
    let lower = xml.to_ascii_lowercase();
    let tag = tag.to_ascii_lowercase();
    let open = format!("<{tag}");
    let close = format!("</{tag}>");
    let mut ranges = Vec::new();
    let mut search_start = 0;

    while let Some(offset) = lower[search_start..].find(&open) {
        let open_start = search_start + offset;
        let after_name = open_start + open.len();

        // Reject longer names sharing the prefix, e.g. `<names` for `name`.
        let at_name_boundary = matches!(
            lower.as_bytes().get(after_name),
            Some(b'>' | b'/' | b' ' | b'\t' | b'\r' | b'\n')
        );
        if !at_name_boundary {
            search_start = after_name;
            continue;
        }

        let Some(open_end_offset) = lower[open_start..].find('>') else {
            break;
        };
        let open_end = open_start + open_end_offset;
        let content_start = open_end + 1;

        // `<tag ... />` has no body and no closing tag of its own.
        if lower[open_start..open_end].trim_end().ends_with('/') {
            search_start = content_start;
            continue;
        }

        let Some(close_offset) = lower[content_start..].find(&close) else {
            break;
        };
        let content_end = content_start + close_offset;
        ranges.push((content_start, content_end));
        search_start = content_end + close.len();
    }

    ranges
}
889
890fn xml_elements_by_local_name<'a>(xml: &'a str, local_name: &str) -> Vec<XmlElement<'a>> {
891    let mut elements = Vec::new();
892    let mut pos = 0usize;
893    while let Some(relative_start) = xml[pos..].find('<') {
894        let start = pos + relative_start;
895        let Some(relative_end) = xml[start..].find('>') else {
896            break;
897        };
898        let tag_end = start + relative_end;
899        let start_tag = &xml[start..=tag_end];
900        let Some(tag_name) = opening_tag_name(start_tag) else {
901            pos = tag_end + 1;
902            continue;
903        };
904        if tag_local_name(tag_name).eq_ignore_ascii_case(local_name)
905            && !start_tag.trim_end().ends_with("/>")
906        {
907            let close = format!("</{tag_name}>");
908            let content_start = tag_end + 1;
909            if let Some(relative_close) = xml[content_start..].find(&close) {
910                let content_end = content_start + relative_close;
911                elements.push(XmlElement {
912                    start_tag,
913                    content: &xml[content_start..content_end],
914                });
915                pos = content_end + close.len();
916                continue;
917            }
918        }
919        pos = tag_end + 1;
920    }
921    elements
922}
923
924fn hocr_elements_with_class<'a>(html: &'a str, class_name: &str) -> Vec<XmlElement<'a>> {
925    hocr_elements_with_any_class(html, &[class_name])
926}
927
928fn hocr_elements_with_any_class<'a>(html: &'a str, class_names: &[&str]) -> Vec<XmlElement<'a>> {
929    let mut elements = Vec::new();
930    let mut pos = 0usize;
931    while let Some(relative_start) = html[pos..].find('<') {
932        let start = pos + relative_start;
933        let Some(relative_end) = html[start..].find('>') else {
934            break;
935        };
936        let tag_end = start + relative_end;
937        let start_tag = &html[start..=tag_end];
938        let Some(tag_name) = opening_tag_name(start_tag) else {
939            pos = tag_end + 1;
940            continue;
941        };
942        if tag_has_any_class(start_tag, class_names) && !start_tag.trim_end().ends_with("/>") {
943            let content_start = tag_end + 1;
944            if let Some(content_end) = matching_element_content_end(html, tag_name, content_start) {
945                elements.push(XmlElement {
946                    start_tag,
947                    content: &html[content_start..content_end],
948                });
949                pos = content_end + closing_tag_len(tag_name);
950                continue;
951            }
952        }
953        pos = tag_end + 1;
954    }
955    elements
956}
957
958fn tag_has_any_class(tag: &str, class_names: &[&str]) -> bool {
959    let Some(classes) = xml_attr_value(tag, "class") else {
960        return false;
961    };
962    classes.split_whitespace().any(|class| {
963        class_names
964            .iter()
965            .any(|name| class.eq_ignore_ascii_case(name))
966    })
967}
968
969fn matching_element_content_end(
970    input: &str,
971    tag_name: &str,
972    content_start: usize,
973) -> Option<usize> {
974    let lower = input.to_ascii_lowercase();
975    let tag = tag_name.to_ascii_lowercase();
976    let open = format!("<{tag}");
977    let close = format!("</{tag}>");
978    let mut pos = content_start;
979    let mut depth = 1usize;
980
981    loop {
982        let next_open = lower[pos..].find(&open).map(|offset| pos + offset);
983        let next_close = lower[pos..].find(&close).map(|offset| pos + offset)?;
984
985        if next_open
986            .map(|open_pos| open_pos < next_close)
987            .unwrap_or(false)
988        {
989            let open_pos = next_open.unwrap();
990            let after_name = open_pos + open.len();
991            if is_tag_name_boundary(lower.as_bytes().get(after_name).copied()) {
992                let Some(open_end_offset) = lower[open_pos..].find('>') else {
993                    return None;
994                };
995                let open_end = open_pos + open_end_offset;
996                if !lower[open_pos..=open_end].trim_end().ends_with("/>") {
997                    depth += 1;
998                }
999                pos = open_end + 1;
1000            } else {
1001                pos = after_name;
1002            }
1003            continue;
1004        }
1005
1006        depth -= 1;
1007        if depth == 0 {
1008            return Some(next_close);
1009        }
1010        pos = next_close + close.len();
1011    }
1012}
1013
/// Byte length of the closing tag `</tag_name>`.
fn closing_tag_len(tag_name: &str) -> usize {
    "</".len() + tag_name.len() + ">".len()
}
1017
/// Reports whether `byte` can directly follow a complete tag name: `>`, `/`,
/// a space, a tab, or a line break. A missing byte (`None`) is not a boundary.
fn is_tag_name_boundary(byte: Option<u8>) -> bool {
    matches!(
        byte,
        Some(b'>') | Some(b'/') | Some(b' ') | Some(b'\t') | Some(b'\n') | Some(b'\r')
    )
}
1022
1023fn xml_start_tags_by_local_name<'a>(xml: &'a str, local_name: &str) -> Vec<&'a str> {
1024    let mut tags = Vec::new();
1025    let mut pos = 0usize;
1026    while let Some(relative_start) = xml[pos..].find('<') {
1027        let start = pos + relative_start;
1028        let Some(relative_end) = xml[start..].find('>') else {
1029            break;
1030        };
1031        let tag_end = start + relative_end;
1032        let start_tag = &xml[start..=tag_end];
1033        if opening_tag_name(start_tag)
1034            .map(|name| tag_local_name(name).eq_ignore_ascii_case(local_name))
1035            .unwrap_or(false)
1036        {
1037            tags.push(start_tag);
1038        }
1039        pos = tag_end + 1;
1040    }
1041    tags
1042}
1043
/// Extracts the element name from the text of an opening tag.
///
/// `tag` is expected to be a whole tag such as `<div class="x">` or `<br/>`.
/// Returns `None` for closing tags (`</...>`), declarations and comments
/// (`<!...>`), processing instructions (`<?...>`), input that does not start
/// with `<`, and tags with an empty name.
///
/// The name is the first whitespace-delimited token after `<` with any
/// trailing `>` and `/` removed, so `<br/>`, `<br />` and `<br>` all yield
/// `br`. Any namespace prefix is kept (`<ns:Word>` yields `ns:Word`).
fn opening_tag_name(tag: &str) -> Option<&str> {
    let inner = tag.trim().strip_prefix('<')?.trim_start();
    if inner.starts_with(|c: char| matches!(c, '/' | '!' | '?')) {
        return None;
    }
    inner
        .split_whitespace()
        .next()
        // Strip both terminators in one pass. Trimming `/` before `>` left
        // the slash in place for self-closing tags written without a space
        // (`<br/>` produced `br/`).
        .map(|name| name.trim_end_matches(|c: char| matches!(c, '/' | '>')))
        .filter(|name| !name.is_empty())
}
1055
/// Strips a namespace prefix from a qualified name: returns everything after
/// the last `:` in `name`, or `name` itself when it contains no `:`.
fn tag_local_name(name: &str) -> &str {
    match name.rfind(':') {
        Some(colon) => &name[colon + 1..],
        None => name,
    }
}
1061
1062fn xml_attr_f32(tag: &str, name: &str) -> Option<f32> {
1063    xml_attr_value(tag, name)?.parse::<f32>().ok()
1064}
1065
1066fn first_xml_attr_f32(tag: &str, names: &[&str]) -> Option<f32> {
1067    names.iter().find_map(|name| xml_attr_f32(tag, name))
1068}
1069
1070fn xml_attr_value(tag: &str, name: &str) -> Option<String> {
1071    let bytes = tag.as_bytes();
1072    let mut pos = 0usize;
1073    while pos < bytes.len() {
1074        while pos < bytes.len() && !is_xml_name_start(bytes[pos]) {
1075            pos += 1;
1076        }
1077        let key_start = pos;
1078        while pos < bytes.len() && is_xml_name_continue(bytes[pos]) {
1079            pos += 1;
1080        }
1081        if key_start == pos {
1082            break;
1083        }
1084        let key = &tag[key_start..pos];
1085        while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
1086            pos += 1;
1087        }
1088        if bytes.get(pos) != Some(&b'=') {
1089            continue;
1090        }
1091        pos += 1;
1092        while pos < bytes.len() && bytes[pos].is_ascii_whitespace() {
1093            pos += 1;
1094        }
1095        let quote = *bytes.get(pos)?;
1096        if quote != b'"' && quote != b'\'' {
1097            continue;
1098        }
1099        pos += 1;
1100        let value_start = pos;
1101        while pos < bytes.len() && bytes[pos] != quote {
1102            pos += 1;
1103        }
1104        let value = &tag[value_start..pos];
1105        if key.eq_ignore_ascii_case(name) || tag_local_name(key).eq_ignore_ascii_case(name) {
1106            return Some(value.to_owned());
1107        }
1108        pos += 1;
1109    }
1110    None
1111}
1112
/// Reports whether `byte` may begin a name as recognised by this scanner:
/// an ASCII letter, `_`, or `:`.
fn is_xml_name_start(byte: u8) -> bool {
    matches!(byte, b'a'..=b'z' | b'A'..=b'Z' | b'_' | b':')
}
1116
1117fn is_xml_name_continue(byte: u8) -> bool {
1118    is_xml_name_start(byte) || byte.is_ascii_digit() || byte == b'-' || byte == b'.'
1119}
1120
1121pub(crate) fn html_to_text(html: &str) -> String {
1122    let without_ignored = remove_html_ranges(html, &["script", "style", "title", "head"]);
1123    let mut output = String::new();
1124    let bytes = without_ignored.as_bytes();
1125    let mut pos = 0;
1126    let mut pending_space = false;
1127
1128    while pos < bytes.len() {
1129        if bytes[pos] == b'<' {
1130            if let Some(end) = without_ignored[pos..].find('>') {
1131                let tag = without_ignored[pos + 1..pos + end].trim();
1132                if is_block_tag(tag) {
1133                    push_newline(&mut output);
1134                }
1135                pos += end + 1;
1136                pending_space = false;
1137                continue;
1138            }
1139        }
1140
1141        let Some(character) = without_ignored[pos..].chars().next() else {
1142            break;
1143        };
1144        if character == '&' {
1145            if let Some((decoded, consumed)) = decode_entity(&without_ignored[pos..]) {
1146                if pending_space {
1147                    output.push(' ');
1148                }
1149                output.push_str(&decoded);
1150                pos += consumed;
1151                pending_space = false;
1152                continue;
1153            }
1154        }
1155        if character.is_whitespace() {
1156            pending_space = !output.ends_with('\n') && !output.is_empty();
1157        } else {
1158            if pending_space {
1159                output.push(' ');
1160            }
1161            output.push(character);
1162            pending_space = false;
1163        }
1164        pos += character.len_utf8();
1165    }
1166
1167    normalize_text_lines(&output)
1168}
1169
1170fn remove_html_ranges(input: &str, tags: &[&str]) -> String {
1171    let mut output = String::new();
1172    let mut pos = 0;
1173    while pos < input.len() {
1174        let lower_rest = input[pos..].to_ascii_lowercase();
1175        let Some((tag, start)) = find_ignored_tag_start(&lower_rest, tags) else {
1176            output.push_str(&input[pos..]);
1177            break;
1178        };
1179
1180        output.push_str(&input[pos..pos + start]);
1181        let after_open = pos + start;
1182        let close = format!("</{tag}>");
1183        let lower_after_open = input[after_open..].to_ascii_lowercase();
1184        if let Some(end) = lower_after_open.find(&close) {
1185            pos = after_open + end + close.len();
1186        } else {
1187            break;
1188        }
1189    }
1190    output
1191}
1192
1193fn find_ignored_tag_start<'a>(lower_input: &str, tags: &[&'a str]) -> Option<(&'a str, usize)> {
1194    tags.iter()
1195        .filter_map(|tag| find_tag_start(lower_input, tag).map(|start| (*tag, start)))
1196        .min_by_key(|(_, start)| *start)
1197}
1198
/// Finds the byte offset of the first start tag named `tag` in `input`.
///
/// A candidate `<tag` only counts when the byte right after the name is `>`,
/// `/`, a space, a tab or a line break, so looking for `head` does not match
/// `<header>`. The comparison is case-sensitive; callers pass lowercased
/// input.
fn find_tag_start(input: &str, tag: &str) -> Option<usize> {
    let needle = format!("<{tag}");
    let mut from = 0;
    loop {
        let start = from + input[from..].find(&needle)?;
        let name_end = start + needle.len();
        match input.as_bytes().get(name_end).copied() {
            Some(b'>') | Some(b'/') | Some(b' ') | Some(b'\t') | Some(b'\n') | Some(b'\r') => {
                return Some(start);
            }
            // Prefix of a longer name, or end of input: keep searching.
            _ => from = name_end,
        }
    }
}
1217
/// Reports whether a tag should start a new paragraph in extracted text.
///
/// `tag` is the text between `<` and `>`. Leading slashes (closing tags),
/// attributes and a trailing slash (self-closing form) are ignored, and the
/// remaining name is compared ASCII-case-insensitively against a fixed list
/// of block-level element names.
fn is_block_tag(tag: &str) -> bool {
    const BLOCK_TAGS: &[&str] = &[
        "address",
        "article",
        "article-title",
        "aside",
        "abstract",
        "back",
        "blockquote",
        "body",
        "br",
        "caption",
        "div",
        "footer",
        "front",
        "h1",
        "h2",
        "h3",
        "h4",
        "h5",
        "h6",
        "header",
        "item",
        "li",
        "list",
        "main",
        "mixed-citation",
        "p",
        "ref",
        "sec",
        "section",
        "table-wrap",
        "tr",
    ];

    let Some(first_token) = tag.trim_start_matches('/').split_whitespace().next() else {
        return false;
    };
    let name = first_token.trim_end_matches('/');
    BLOCK_TAGS
        .iter()
        .any(|candidate| name.eq_ignore_ascii_case(candidate))
}
1260
/// Decodes the character reference at the start of `input`.
///
/// `input` must begin with `&`. Supported forms are the named references
/// `amp`, `lt`, `gt`, `quot`, `apos` and `nbsp`, decimal references
/// (`&#65;`) and hexadecimal references (`&#x41;` / `&#X41;`).
///
/// Returns the decoded text and the number of input bytes it replaces
/// (including the leading `&` and the terminating `;`). Returns `None` when
/// `input` does not start with `&`, when no `;` occurs within the first 17
/// bytes, when the name is unknown, or when a numeric reference is malformed
/// or does not denote a valid Unicode scalar value.
fn decode_entity(input: &str) -> Option<(String, usize)> {
    /// Longest entity body (the text between `&` and `;`) that is considered.
    const MAX_ENTITY_LEN: usize = 15;

    /// Parses the digits of a numeric reference; signs and empty input are
    /// rejected rather than left to the integer parser, which accepts `+`.
    fn code_point(digits: &str, radix: u32) -> Option<String> {
        if digits.is_empty() || !digits.chars().all(|digit| digit.is_digit(radix)) {
            return None;
        }
        let value = u32::from_str_radix(digits, radix).ok()?;
        char::from_u32(value).map(|character| character.to_string())
    }

    let body = input.strip_prefix('&')?;
    // Bail out when the terminator is too far away. Clamping the offset
    // instead could slice through a multi-byte character (a panic) or decode
    // a truncated entity and swallow a byte that is not `;`.
    let entity_len = body.find(';').filter(|&len| len <= MAX_ENTITY_LEN)?;
    let entity = &body[..entity_len];

    let decoded = match entity {
        "amp" => "&".to_owned(),
        "lt" => "<".to_owned(),
        "gt" => ">".to_owned(),
        "quot" => "\"".to_owned(),
        "apos" => "'".to_owned(),
        "nbsp" => " ".to_owned(),
        value if value.starts_with("#x") || value.starts_with("#X") => {
            code_point(&value[2..], 16)?
        }
        value if value.starts_with('#') => code_point(&value[1..], 10)?,
        _ => return None,
    };
    // `&` + entity body + `;`
    Some((decoded, entity_len + 2))
}
1281
1282fn parse_email(raw: &str) -> EmailParts {
1283    let normalized = raw.replace("\r\n", "\n").replace('\r', "\n");
1284    let (headers, body) = normalized
1285        .split_once("\n\n")
1286        .unwrap_or((normalized.as_str(), ""));
1287    let mut subject_lines = Vec::new();
1288    let mut active_header = String::new();
1289
1290    for line in headers.lines() {
1291        if line.starts_with(' ') || line.starts_with('\t') {
1292            if active_header.eq_ignore_ascii_case("subject") {
1293                subject_lines.push(line.trim().to_owned());
1294            }
1295            continue;
1296        }
1297
1298        let Some((name, value)) = line.split_once(':') else {
1299            active_header.clear();
1300            continue;
1301        };
1302        active_header = name.trim().to_owned();
1303        if active_header.eq_ignore_ascii_case("subject") {
1304            subject_lines.push(value.trim().to_owned());
1305        }
1306    }
1307
1308    EmailParts {
1309        subject: (!subject_lines.is_empty())
1310            .then(|| decode_rfc2047_words(&subject_lines.join(" "))),
1311        body: normalize_text_lines(body),
1312    }
1313}
1314
/// Placeholder for RFC 2047 encoded-word decoding of header values.
///
/// Currently returns `value` unchanged.
fn decode_rfc2047_words(value: &str) -> String {
    // Intentionally conservative: subjects in most benchmark and archive
    // emails are plain ASCII/UTF-8, and a word that cannot be decoded is
    // safer left exactly as it was received.
    String::from(value)
}
1320
/// Ends the current paragraph in `output` with a blank line.
///
/// Trailing spaces are removed first. Afterwards, non-empty output is padded
/// with line breaks until it ends in exactly `"\n\n"`; empty output and
/// output that already ends in a blank line are left as they are.
fn push_newline(output: &mut String) {
    let kept = output.trim_end_matches(' ').len();
    output.truncate(kept);

    if output.is_empty() || output.ends_with("\n\n") {
        return;
    }
    let missing = if output.ends_with('\n') { "\n" } else { "\n\n" };
    output.push_str(missing);
}
1333
/// Normalises whitespace line by line.
///
/// Within each line, runs of whitespace collapse to single spaces and
/// leading/trailing whitespace is removed. Blank lines at the start and end
/// of the text are dropped, and consecutive blank lines collapse into one.
/// The resulting lines are joined with `\n`.
fn normalize_text_lines(text: &str) -> String {
    let mut kept: Vec<String> = Vec::new();

    for raw_line in text.lines() {
        let collapsed = raw_line.split_whitespace().collect::<Vec<_>>().join(" ");
        // A blank line is only worth keeping directly after a line of text.
        let follows_text = kept.last().map_or(false, |previous| !previous.is_empty());
        if !collapsed.is_empty() || follows_text {
            kept.push(collapsed);
        }
    }

    while matches!(kept.last(), Some(last) if last.is_empty()) {
        kept.pop();
    }

    kept.join("\n")
}