Skip to main content

a3s_code_core/
default_parser.rs

1use anyhow::{Context, Result};
2use base64::Engine as _;
3use roxmltree::Document;
4use std::fs::File;
5use std::io::Read;
6use std::path::Path;
7use std::sync::Arc;
8use zip::ZipArchive;
9
10use crate::document_parser::{DocumentBlock, DocumentBlockKind, ParsedDocument};
11
12/// Built-in rich document parser inspired by Kreuzberg's multi-format extraction model.
13///
14/// This parser handles common binary and containerized document formats and returns
15/// plain text suitable for `agentic_parse` and `agentic_search`.
16pub trait DefaultParserOcrProvider: Send + Sync {
17    fn name(&self) -> &str;
18
19    fn ocr_pdf(
20        &self,
21        path: &Path,
22        config: &crate::config::DefaultParserOcrConfig,
23    ) -> Result<Option<String>>;
24}
25
26#[derive(Default)]
27pub struct DefaultParser {
28    config: crate::config::DefaultParserConfig,
29    ocr_provider: Option<Arc<dyn DefaultParserOcrProvider>>,
30}
31
32impl DefaultParser {
33    pub fn new() -> Self {
34        Self::default()
35    }
36
37    pub fn with_config(config: crate::config::DefaultParserConfig) -> Self {
38        Self {
39            config,
40            ocr_provider: None,
41        }
42    }
43
44    pub fn with_config_and_ocr(
45        config: crate::config::DefaultParserConfig,
46        ocr_provider: Arc<dyn DefaultParserOcrProvider>,
47    ) -> Self {
48        Self {
49            config,
50            ocr_provider: Some(ocr_provider),
51        }
52    }
53
54    pub fn config(&self) -> &crate::config::DefaultParserConfig {
55        &self.config
56    }
57
58    pub fn ocr_provider(&self) -> Option<&Arc<dyn DefaultParserOcrProvider>> {
59        self.ocr_provider.as_ref()
60    }
61}
62
63impl crate::document_parser::DocumentParser for DefaultParser {
64    fn name(&self) -> &str {
65        "default-parser"
66    }
67
68    fn supported_extensions(&self) -> &[&str] {
69        &[
70            "pdf", "docx", "xlsx", "xlsm", "pptx", "odt", "ods", "odp", "epub", "rtf", "html",
71            "htm", "xhtml", "xml", "eml",
72        ]
73    }
74
75    fn parse(&self, path: &Path) -> Result<String> {
76        Ok(self.parse_document(path)?.to_text())
77    }
78
79    fn parse_document(&self, path: &Path) -> Result<ParsedDocument> {
80        let ext = path
81            .extension()
82            .and_then(|ext| ext.to_str())
83            .map(|ext| ext.to_ascii_lowercase())
84            .unwrap_or_default();
85
86        match ext.as_str() {
87            "pdf" => parse_pdf_document(path, &self.config, self.ocr_provider.as_deref()),
88            "docx" => parse_docx(path),
89            "xlsx" | "xlsm" => parse_xlsx(path),
90            "pptx" => parse_pptx(path),
91            "odt" | "ods" | "odp" => parse_odf(path),
92            "epub" => parse_epub(path),
93            "eml" => parse_eml(path),
94            "rtf" => parsed_text_document(path, parse_rtf(path)?, DocumentBlockKind::Paragraph),
95            "html" | "htm" | "xhtml" => parse_html_document(path),
96            "xml" => parse_xml_document(path),
97            _ => anyhow::bail!("unsupported extension for kreuzberg parser"),
98        }
99    }
100
101    fn max_file_size(&self) -> u64 {
102        self.config.max_file_size_mb * 1024 * 1024
103    }
104}
105
106fn parse_pdf(path: &Path) -> Result<String> {
107    pdf_extract::extract_text(path)
108        .with_context(|| format!("failed to extract text from PDF {}", path.display()))
109}
110
111fn parse_pdf_document(
112    path: &Path,
113    config: &crate::config::DefaultParserConfig,
114    ocr_provider: Option<&dyn DefaultParserOcrProvider>,
115) -> Result<ParsedDocument> {
116    let extracted_text = parse_pdf(path).unwrap_or_default();
117    let text = maybe_run_pdf_ocr(path, extracted_text, config, ocr_provider)?;
118    parsed_text_document(path, text, DocumentBlockKind::Paragraph)
119}
120
121fn maybe_run_pdf_ocr(
122    path: &Path,
123    extracted_text: String,
124    config: &crate::config::DefaultParserConfig,
125    ocr_provider: Option<&dyn DefaultParserOcrProvider>,
126) -> Result<String> {
127    if !should_attempt_pdf_ocr(&extracted_text, config) {
128        return Ok(extracted_text);
129    }
130
131    let Some(ocr_config) = config.ocr.as_ref().filter(|ocr| ocr.enabled) else {
132        return Ok(extracted_text);
133    };
134    let Some(provider) = ocr_provider else {
135        tracing::debug!(
136            "DefaultParser OCR enabled for {} but no OCR provider was configured",
137            path.display()
138        );
139        return Ok(extracted_text);
140    };
141
142    match provider.ocr_pdf(path, ocr_config) {
143        Ok(Some(ocr_text)) if !ocr_text.trim().is_empty() => {
144            tracing::info!(
145                "DefaultParser used OCR provider '{}' for {}",
146                provider.name(),
147                path.display()
148            );
149            Ok(ocr_text)
150        }
151        Ok(_) => Ok(extracted_text),
152        Err(err) => {
153            tracing::warn!(
154                "DefaultParser OCR provider '{}' failed on {}: {}",
155                provider.name(),
156                path.display(),
157                err
158            );
159            Ok(extracted_text)
160        }
161    }
162}
163
164fn should_attempt_pdf_ocr(text: &str, config: &crate::config::DefaultParserConfig) -> bool {
165    let Some(ocr) = config.ocr.as_ref() else {
166        return false;
167    };
168    if !ocr.enabled {
169        return false;
170    }
171
172    let trimmed = text.trim();
173    if trimmed.is_empty() {
174        return true;
175    }
176
177    let char_count = trimmed.chars().count();
178    let word_count = trimmed.split_whitespace().count();
179    let alnum_count = trimmed.chars().filter(|ch| ch.is_alphanumeric()).count();
180    let alnum_ratio = alnum_count as f32 / char_count.max(1) as f32;
181
182    char_count < 80 || word_count < 20 || alnum_ratio < 0.45
183}
184
185fn parse_html_document(path: &Path) -> Result<ParsedDocument> {
186    let html = std::fs::read_to_string(path)
187        .with_context(|| format!("failed to read HTML file {}", path.display()))?;
188    parse_markup_document(path, &html, true)
189}
190
191fn parse_xml_document(path: &Path) -> Result<ParsedDocument> {
192    let xml = std::fs::read_to_string(path)
193        .with_context(|| format!("failed to read XML file {}", path.display()))?;
194    parse_markup_document(path, &xml, false)
195}
196
197fn parse_rtf(path: &Path) -> Result<String> {
198    let raw = std::fs::read_to_string(path)
199        .with_context(|| format!("failed to read RTF file {}", path.display()))?;
200    Ok(strip_rtf(&raw))
201}
202
203fn parse_eml(path: &Path) -> Result<ParsedDocument> {
204    let raw = std::fs::read_to_string(path)
205        .with_context(|| format!("failed to read EML file {}", path.display()))?;
206    let mail = parse_email_part(&raw);
207
208    let mut doc = ParsedDocument::new();
209    doc.title = file_title(path);
210    if !mail.headers.is_empty() {
211        let mut header_lines = Vec::new();
212        for key in ["Subject", "From", "To", "Cc", "Date"] {
213            if let Some(value) = mail.headers.get(key) {
214                header_lines.push(format!("{key}: {value}"));
215            }
216        }
217        if !header_lines.is_empty() {
218            doc.push(
219                DocumentBlock::new(
220                    DocumentBlockKind::EmailHeader,
221                    Some("headers"),
222                    header_lines.join("\n"),
223                )
224                .with_source("message")
225                .with_ordinal(1),
226            );
227        }
228    }
229
230    let body = collect_best_mail_body(&mail);
231    if !body.trim().is_empty() {
232        doc.push(
233            DocumentBlock::new(DocumentBlockKind::Paragraph, Some("body"), body)
234                .with_source("message")
235                .with_ordinal(2),
236        );
237    }
238
239    ensure_document(doc, path)
240}
241
242fn parse_epub(path: &Path) -> Result<ParsedDocument> {
243    let mut zip = open_zip(path)?;
244    let mut names: Vec<String> = zip.file_names().map(|s| s.to_string()).collect();
245    names.sort();
246    let mut doc = ParsedDocument::new();
247    doc.title = file_title(path);
248
249    for name in names {
250        let lower = name.to_ascii_lowercase();
251        if !(lower.ends_with(".xhtml") || lower.ends_with(".html") || lower.ends_with(".htm")) {
252            continue;
253        }
254
255        let content = read_zip_entry(&mut zip, &name)?;
256        let section_doc = parse_markup_string(&content, true).unwrap_or_else(|| {
257            fallback_text_blocks(&render_html_to_text(&content).unwrap_or_default())
258        });
259        if section_doc.is_empty() {
260            continue;
261        }
262
263        doc.push(
264            DocumentBlock::new(
265                DocumentBlockKind::Metadata,
266                Some(name.clone()),
267                format!("source: {}", name),
268            )
269            .with_source(name.clone()),
270        );
271        for (idx, block) in section_doc.into_iter().enumerate() {
272            let label = block
273                .label
274                .as_ref()
275                .map(|label| format!("{}: {}", name, label))
276                .or_else(|| Some(name.clone()));
277            doc.push(
278                DocumentBlock::new(block.kind, label, block.content)
279                    .with_source(name.clone())
280                    .with_ordinal(idx + 1),
281            );
282        }
283    }
284
285    ensure_document(doc, path)
286}
287
288fn parse_docx(path: &Path) -> Result<ParsedDocument> {
289    let mut zip = open_zip(path)?;
290    let mut names: Vec<String> = zip.file_names().map(|s| s.to_string()).collect();
291    names.sort();
292
293    let mut doc = ParsedDocument::new();
294    doc.title = file_title(path);
295    for name in names {
296        if !name.starts_with("word/") || !name.ends_with(".xml") {
297            continue;
298        }
299        if !(name == "word/document.xml"
300            || name.starts_with("word/header")
301            || name.starts_with("word/footer")
302            || name.starts_with("word/footnotes")
303            || name.starts_with("word/endnotes"))
304        {
305            continue;
306        }
307
308        let content = read_zip_entry(&mut zip, &name)?;
309        let blocks = extract_docx_blocks(&content)?;
310        if !blocks.is_empty() {
311            for (idx, block) in blocks.into_iter().enumerate() {
312                let label = block
313                    .label
314                    .as_ref()
315                    .map(|label| format!("{}: {}", name, label))
316                    .or_else(|| Some(name.clone()));
317                doc.push(
318                    DocumentBlock::new(block.kind, label, block.content)
319                        .with_source(name.clone())
320                        .with_ordinal(idx + 1),
321                );
322            }
323        }
324    }
325
326    ensure_document(doc, path)
327}
328
329fn parse_xlsx(path: &Path) -> Result<ParsedDocument> {
330    let mut zip = open_zip(path)?;
331    let shared_strings = read_shared_strings(&mut zip).unwrap_or_default();
332    let mut names: Vec<String> = zip.file_names().map(|s| s.to_string()).collect();
333    names.sort();
334
335    let mut doc = ParsedDocument::new();
336    doc.title = file_title(path);
337    for name in names {
338        if !name.starts_with("xl/worksheets/") || !name.ends_with(".xml") {
339            continue;
340        }
341        let content = read_zip_entry(&mut zip, &name)?;
342        let text = parse_xlsx_sheet(&content, &shared_strings)?;
343        if !text.trim().is_empty() {
344            doc.push(
345                DocumentBlock::new(DocumentBlockKind::Table, Some(name.clone()), text)
346                    .with_source(name.clone())
347                    .with_ordinal(1),
348            );
349        }
350    }
351
352    ensure_document(doc, path)
353}
354
355fn parse_pptx(path: &Path) -> Result<ParsedDocument> {
356    let mut zip = open_zip(path)?;
357    let mut names: Vec<String> = zip.file_names().map(|s| s.to_string()).collect();
358    names.sort();
359
360    let mut doc = ParsedDocument::new();
361    doc.title = file_title(path);
362    for name in names {
363        if !name.starts_with("ppt/slides/slide") || !name.ends_with(".xml") {
364            continue;
365        }
366        let content = read_zip_entry(&mut zip, &name)?;
367        let text = extract_xml_text(&content)?;
368        let blocks = text_blocks(&text, DocumentBlockKind::Paragraph);
369        if blocks.is_empty() {
370            continue;
371        }
372        for (idx, block) in blocks.into_iter().enumerate() {
373            let kind = if idx == 0 && looks_like_heading(&block.content) {
374                DocumentBlockKind::Heading
375            } else {
376                DocumentBlockKind::Slide
377            };
378            let label = if idx == 0 {
379                Some(name.clone())
380            } else {
381                Some(format!("{}: block {}", name, idx + 1))
382            };
383            doc.push(
384                DocumentBlock::new(kind, label, block.content)
385                    .with_source(name.clone())
386                    .with_page(extract_slide_number(&name).unwrap_or(idx + 1))
387                    .with_ordinal(idx + 1),
388            );
389        }
390    }
391
392    ensure_document(doc, path)
393}
394
395fn parse_odf(path: &Path) -> Result<ParsedDocument> {
396    let mut zip = open_zip(path)?;
397    let mut doc = ParsedDocument::new();
398    doc.title = file_title(path);
399
400    for name in ["meta.xml", "styles.xml", "content.xml"] {
401        if let Ok(content) = read_zip_entry(&mut zip, name) {
402            let blocks = if name == "content.xml" {
403                parse_odf_content_blocks(&content)?
404            } else {
405                text_blocks(&extract_xml_text(&content)?, DocumentBlockKind::Metadata)
406            };
407            for (idx, block) in blocks.into_iter().enumerate() {
408                let label = block
409                    .label
410                    .as_ref()
411                    .map(|label| format!("{}: {}", name, label))
412                    .or_else(|| {
413                        if idx == 0 {
414                            Some(name.to_string())
415                        } else {
416                            Some(format!("{}: block {}", name, idx + 1))
417                        }
418                    });
419                doc.push(
420                    DocumentBlock::new(block.kind, label, block.content)
421                        .with_source(name)
422                        .with_ordinal(idx + 1),
423                );
424            }
425        }
426    }
427
428    ensure_document(doc, path)
429}
430
431fn read_shared_strings(zip: &mut ZipArchive<File>) -> Result<Vec<String>> {
432    let content = read_zip_entry(zip, "xl/sharedStrings.xml")?;
433    let doc = Document::parse(&content).context("failed to parse xlsx sharedStrings.xml")?;
434    let mut values = Vec::new();
435
436    for si in doc.descendants().filter(|n| n.tag_name().name() == "si") {
437        let value = si
438            .descendants()
439            .filter(|n| n.tag_name().name() == "t")
440            .filter_map(|n| n.text())
441            .map(str::trim)
442            .filter(|t| !t.is_empty())
443            .collect::<Vec<_>>()
444            .join("");
445        if !value.is_empty() {
446            values.push(value);
447        }
448    }
449
450    Ok(values)
451}
452
453fn parse_xlsx_sheet(xml: &str, shared_strings: &[String]) -> Result<String> {
454    let doc = Document::parse(xml).context("failed to parse worksheet xml")?;
455    let mut rows = Vec::new();
456
457    for row in doc.descendants().filter(|n| n.tag_name().name() == "row") {
458        let mut cells = Vec::new();
459        for cell in row.children().filter(|n| n.tag_name().name() == "c") {
460            let value = extract_xlsx_cell(cell, shared_strings);
461            if !value.is_empty() {
462                cells.push(value);
463            }
464        }
465        if !cells.is_empty() {
466            rows.push(cells.join("\t"));
467        }
468    }
469
470    Ok(rows.join("\n"))
471}
472
473fn parse_markup_document(path: &Path, input: &str, is_html: bool) -> Result<ParsedDocument> {
474    let mut doc = ParsedDocument::new();
475    doc.title = file_title(path);
476
477    let blocks = parse_markup_string(input, is_html).unwrap_or_else(|| {
478        let rendered = if is_html {
479            render_html_to_text(input).unwrap_or_default()
480        } else {
481            extract_xml_text(input).unwrap_or_default()
482        };
483        fallback_text_blocks(&rendered)
484    });
485
486    if doc.title.is_none() {
487        doc.title = extract_markup_title(input);
488    }
489    for block in blocks {
490        doc.push(block);
491    }
492
493    ensure_document(doc, path)
494}
495
496fn parse_markup_string(input: &str, is_html: bool) -> Option<Vec<DocumentBlock>> {
497    let doc = Document::parse(input).ok()?;
498    let mut blocks = Vec::new();
499
500    for node in doc.descendants().filter(|node| node.is_element()) {
501        let tag = node.tag_name().name();
502        let kind = match tag {
503            "title" => continue,
504            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => DocumentBlockKind::Heading,
505            "p" | "li" | "blockquote" => DocumentBlockKind::Paragraph,
506            "pre" | "code" => DocumentBlockKind::Code,
507            "table" => DocumentBlockKind::Table,
508            "meta" if is_html => DocumentBlockKind::Metadata,
509            "section" | "article" => DocumentBlockKind::Section,
510            _ => continue,
511        };
512
513        let content = collect_node_text(node);
514        if content.trim().is_empty() {
515            continue;
516        }
517
518        let label = match tag {
519            "meta" => node
520                .attribute("name")
521                .or_else(|| node.attribute("property"))
522                .or_else(|| node.attribute("http-equiv"))
523                .map(str::to_string),
524            _ => None,
525        };
526        blocks.push(DocumentBlock::new(kind, label, content));
527    }
528
529    if blocks.is_empty() {
530        None
531    } else {
532        Some(dedupe_adjacent_blocks(blocks))
533    }
534}
535
536fn extract_markup_title(input: &str) -> Option<String> {
537    let doc = Document::parse(input).ok()?;
538    doc.descendants()
539        .find(|node| node.has_tag_name("title"))
540        .map(collect_node_text)
541        .filter(|title| !title.trim().is_empty())
542}
543
544fn render_html_to_text(input: &str) -> Result<String> {
545    html2text::from_read(input.as_bytes(), 80).context("failed to render HTML as text")
546}
547
548fn collect_node_text(node: roxmltree::Node<'_, '_>) -> String {
549    let text = node
550        .descendants()
551        .filter_map(|child| child.text())
552        .map(str::trim)
553        .filter(|text| !text.is_empty())
554        .collect::<Vec<_>>()
555        .join(" ");
556    normalize_text(&text)
557}
558
559fn dedupe_adjacent_blocks(blocks: Vec<DocumentBlock>) -> Vec<DocumentBlock> {
560    let mut deduped = Vec::new();
561    for block in blocks {
562        let is_duplicate = deduped.last().is_some_and(|last: &DocumentBlock| {
563            last.kind == block.kind && last.label == block.label && last.content == block.content
564        });
565        if !is_duplicate {
566            deduped.push(block);
567        }
568    }
569    deduped
570}
571
572fn extract_docx_blocks(xml: &str) -> Result<Vec<DocumentBlock>> {
573    let doc = Document::parse(xml).context("failed to parse docx xml")?;
574    let mut blocks = Vec::new();
575
576    for para in doc
577        .descendants()
578        .filter(|node| node.tag_name().name() == "p")
579    {
580        let content = para
581            .descendants()
582            .filter(|node| node.tag_name().name() == "t")
583            .filter_map(|node| node.text())
584            .map(str::trim)
585            .filter(|text| !text.is_empty())
586            .collect::<Vec<_>>()
587            .join("");
588        let content = normalize_text(&content);
589        if content.is_empty() {
590            continue;
591        }
592
593        let kind = if paragraph_style(para)
594            .map(|style| style.to_ascii_lowercase().contains("heading"))
595            .unwrap_or(false)
596            || looks_like_heading(&content)
597        {
598            DocumentBlockKind::Heading
599        } else {
600            DocumentBlockKind::Paragraph
601        };
602
603        blocks.push(DocumentBlock::new(kind, None::<String>, content));
604    }
605
606    if blocks.is_empty() {
607        let fallback = extract_xml_text(xml)?;
608        Ok(fallback_text_blocks(&fallback))
609    } else {
610        Ok(blocks)
611    }
612}
613
614fn paragraph_style(node: roxmltree::Node<'_, '_>) -> Option<String> {
615    node.descendants()
616        .find(|child| child.tag_name().name() == "pStyle")
617        .and_then(|child| child.attribute("val").or_else(|| child.attribute("w:val")))
618        .map(str::to_string)
619}
620
621fn parse_odf_content_blocks(xml: &str) -> Result<Vec<DocumentBlock>> {
622    let doc = Document::parse(xml).context("failed to parse odf content xml")?;
623    let mut blocks = Vec::new();
624
625    for node in doc.descendants().filter(|node| node.is_element()) {
626        let tag = node.tag_name().name();
627        let kind = match tag {
628            "h" => DocumentBlockKind::Heading,
629            "p" => DocumentBlockKind::Paragraph,
630            "list-item" => DocumentBlockKind::Paragraph,
631            _ => continue,
632        };
633        let content = collect_node_text(node);
634        if content.is_empty() {
635            continue;
636        }
637        blocks.push(DocumentBlock::new(kind, None::<String>, content));
638    }
639
640    if blocks.is_empty() {
641        let fallback = extract_xml_text(xml)?;
642        Ok(fallback_text_blocks(&fallback))
643    } else {
644        Ok(blocks)
645    }
646}
647
648fn parsed_text_document(
649    path: &Path,
650    text: String,
651    default_kind: DocumentBlockKind,
652) -> Result<ParsedDocument> {
653    let mut doc = ParsedDocument::new();
654    doc.title = file_title(path);
655    let source = doc
656        .title
657        .clone()
658        .unwrap_or_else(|| path.display().to_string());
659    for (idx, block) in text_blocks(&text, default_kind).into_iter().enumerate() {
660        doc.push(block.with_source(source.clone()).with_ordinal(idx + 1));
661    }
662    ensure_document(doc, path)
663}
664
665fn fallback_text_blocks(text: &str) -> Vec<DocumentBlock> {
666    text_blocks(text, DocumentBlockKind::Paragraph)
667}
668
669fn text_blocks(text: &str, default_kind: DocumentBlockKind) -> Vec<DocumentBlock> {
670    let normalized = normalize_text(text);
671    normalized
672        .split("\n\n")
673        .filter_map(|chunk| {
674            let chunk = chunk.trim();
675            if chunk.is_empty() {
676                return None;
677            }
678
679            let kind = if looks_like_heading(chunk) {
680                DocumentBlockKind::Heading
681            } else {
682                default_kind.clone()
683            };
684            Some(DocumentBlock::new(kind, None::<String>, chunk))
685        })
686        .collect()
687}
688
/// Heuristic: does `text` look like a heading?
///
/// The text must span at most two lines and have a non-blank first line. A
/// first line starting with `#` always counts. Otherwise the first line must be
/// at most 80 characters and must not end in sentence punctuation
/// (`.`, `!`, `?`, `:`, `;`).
fn looks_like_heading(text: &str) -> bool {
    let mut lines = text.lines();
    let Some(first) = lines.next().map(str::trim).filter(|line| !line.is_empty()) else {
        return false;
    };
    // One line is already consumed; a line two further on means three or more in total.
    if lines.nth(1).is_some() {
        return false;
    }
    if first.starts_with('#') {
        return true;
    }

    let sentence_like = matches!(
        first.chars().next_back(),
        Some('.' | '!' | '?' | ':' | ';')
    );
    !sentence_like && first.chars().count() <= 80
}
701
/// Parses the first run of ASCII digits in `name` as a number.
///
/// Returns `None` when there are no digits or the value does not fit in `usize`.
fn extract_slide_number(name: &str) -> Option<usize> {
    let start = name.find(|ch: char| ch.is_ascii_digit())?;
    let tail = &name[start..];
    let end = tail
        .find(|ch: char| !ch.is_ascii_digit())
        .unwrap_or(tail.len());
    tail[..end].parse().ok()
}
710
711fn extract_xlsx_cell(cell: roxmltree::Node<'_, '_>, shared_strings: &[String]) -> String {
712    let cell_type = cell.attribute("t").unwrap_or_default();
713
714    if cell_type == "inlineStr" {
715        return cell
716            .descendants()
717            .filter(|n| n.tag_name().name() == "t")
718            .filter_map(|n| n.text())
719            .map(str::trim)
720            .filter(|t| !t.is_empty())
721            .collect::<Vec<_>>()
722            .join("");
723    }
724
725    let raw = cell
726        .children()
727        .find(|n| n.tag_name().name() == "v")
728        .and_then(|n| n.text())
729        .map(str::trim)
730        .unwrap_or_default();
731
732    if raw.is_empty() {
733        return String::new();
734    }
735
736    if cell_type == "s" {
737        return raw
738            .parse::<usize>()
739            .ok()
740            .and_then(|idx| shared_strings.get(idx))
741            .cloned()
742            .unwrap_or_else(|| raw.to_string());
743    }
744
745    raw.to_string()
746}
747
748fn open_zip(path: &Path) -> Result<ZipArchive<File>> {
749    let file = File::open(path)
750        .with_context(|| format!("failed to open zip container {}", path.display()))?;
751    ZipArchive::new(file)
752        .with_context(|| format!("failed to read zip container {}", path.display()))
753}
754
755fn read_zip_entry(zip: &mut ZipArchive<File>, name: &str) -> Result<String> {
756    let mut file = zip
757        .by_name(name)
758        .with_context(|| format!("zip entry not found: {name}"))?;
759    let mut buf = String::new();
760    file.read_to_string(&mut buf)
761        .with_context(|| format!("failed to read zip entry: {name}"))?;
762    Ok(buf)
763}
764
765fn extract_xml_text(xml: &str) -> Result<String> {
766    let doc = Document::parse(xml).context("failed to parse XML")?;
767    let mut out = String::new();
768    let mut last_was_space = true;
769
770    for node in doc.descendants() {
771        if let Some(text) = node.text() {
772            let trimmed = text.trim();
773            if trimmed.is_empty() {
774                continue;
775            }
776            if !last_was_space && !needs_newline(node.tag_name().name()) {
777                out.push(' ');
778            }
779            out.push_str(trimmed);
780            if needs_newline(node.tag_name().name()) {
781                out.push('\n');
782                last_was_space = true;
783            } else {
784                last_was_space = false;
785            }
786        }
787    }
788
789    Ok(normalize_text(&out))
790}
791
/// Reports whether `tag` (an element's local name) is treated as block-level,
/// i.e. whether its text ends a line during text extraction.
fn needs_newline(tag: &str) -> bool {
    const BLOCK_TAGS: [&str; 10] = [
        "p",
        "div",
        "br",
        "section",
        "li",
        "tr",
        "row",
        "sheetData",
        "worksheet",
        "text-box",
    ];
    BLOCK_TAGS.contains(&tag)
}
806
/// Normalizes whitespace in `text`.
///
/// * Runs of whitespace inside a line collapse to one space; lines are trimmed.
/// * Any run of blank lines between two non-blank lines becomes exactly one
///   blank line, so paragraph breaks survive as `"\n\n"`.
/// * Leading and trailing blank lines are removed.
///
/// Keeping the paragraph break matters because block splitting relies on
/// `"\n\n"` as the separator.
fn normalize_text(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut paragraph_break = false;

    for line in text.lines() {
        let mut words = line.split_whitespace();
        let Some(first) = words.next() else {
            // Blank line: remember it, and emit the break only if text follows.
            paragraph_break = true;
            continue;
        };

        if !out.is_empty() {
            out.push_str(if paragraph_break { "\n\n" } else { "\n" });
        }
        paragraph_break = false;

        out.push_str(first);
        for word in words {
            out.push(' ');
            out.push_str(word);
        }
    }

    out
}
830
/// Returns the final path component (file name including extension) as a
/// title, or `None` when the path has no file name or it is not valid UTF-8.
fn file_title(path: &Path) -> Option<String> {
    let name = path.file_name()?;
    name.to_str().map(str::to_owned)
}
836
837fn ensure_document(doc: ParsedDocument, path: &Path) -> Result<ParsedDocument> {
838    if doc.is_empty() {
839        anyhow::bail!("no extractable text found in {}", path.display());
840    }
841    Ok(doc)
842}
843
/// One MIME entity of an email: either a leaf with a decoded body or a
/// multipart container holding nested parts.
#[derive(Clone, Debug, Default)]
struct EmailPart {
    /// Header fields of this entity, keyed by the header name as written.
    headers: std::collections::HashMap<String, String>,
    /// Raw `Content-Type` value of this entity.
    content_type: String,
    /// Decoded body text; left empty for multipart containers.
    body: String,
    /// Child entities; empty for leaf parts.
    parts: Vec<EmailPart>,
}
851
852fn parse_email_part(raw: &str) -> EmailPart {
853    let (header_block, body_block) = split_headers_body(raw);
854    let headers = parse_headers(header_block);
855    let content_type = headers
856        .get("Content-Type")
857        .cloned()
858        .unwrap_or_else(|| "text/plain; charset=utf-8".to_string());
859    let encoding = headers
860        .get("Content-Transfer-Encoding")
861        .cloned()
862        .unwrap_or_default();
863
864    if let Some(boundary) = extract_content_type_param(&content_type, "boundary") {
865        let parts = split_multipart_body(body_block, &boundary)
866            .into_iter()
867            .map(|part| parse_email_part(&part))
868            .collect::<Vec<_>>();
869        EmailPart {
870            headers,
871            content_type,
872            body: String::new(),
873            parts,
874        }
875    } else {
876        let decoded = decode_email_body(body_block, &encoding);
877        EmailPart {
878            headers,
879            content_type,
880            body: decoded,
881            parts: Vec::new(),
882        }
883    }
884}
885
/// Splits a raw MIME entity into `(header block, body)` at the first empty line.
///
/// * CRLF and LF line endings are both supported; whichever blank-line separator
///   occurs first in the input wins.
/// * Input that starts with an empty line has an empty header block (a MIME part
///   without headers); the rest is the body.
/// * Input without any empty line is treated as all body.
fn split_headers_body(raw: &str) -> (&str, &str) {
    if let Some(body) = raw
        .strip_prefix("\r\n")
        .or_else(|| raw.strip_prefix('\n'))
    {
        return ("", body);
    }

    let crlf = raw.find("\r\n\r\n").map(|idx| (idx, 4));
    let lf = raw.find("\n\n").map(|idx| (idx, 2));
    let separator = match (crlf, lf) {
        (Some(a), Some(b)) => Some(if a.0 <= b.0 { a } else { b }),
        (a, b) => a.or(b),
    };

    match separator {
        Some((idx, len)) => (&raw[..idx], &raw[idx + len..]),
        None => ("", raw),
    }
}
895
/// Parses a header block into a name → value map.
///
/// * Lines starting with a space or tab continue the previous header and are
///   appended to its value with a single space.
/// * Continuation lines with no open header are ignored, so stray text cannot
///   leak into the value of the next header.
/// * Lines without a colon close the current header and are otherwise ignored.
/// * When a name repeats, the last occurrence wins. Names keep their original
///   casing.
fn parse_headers(raw: &str) -> std::collections::HashMap<String, String> {
    let mut headers = std::collections::HashMap::new();
    // The header currently being assembled, as (name, value so far).
    let mut current: Option<(String, String)> = None;

    for line in raw.lines() {
        if line.starts_with(' ') || line.starts_with('\t') {
            if let Some((_, value)) = current.as_mut() {
                if !value.is_empty() {
                    value.push(' ');
                }
                value.push_str(line.trim());
            }
            continue;
        }

        if let Some((key, value)) = current.take() {
            headers.insert(key, value.trim().to_string());
        }

        if let Some((key, value)) = line.split_once(':') {
            current = Some((key.trim().to_string(), value.trim().to_string()));
        }
    }

    if let Some((key, value)) = current {
        headers.insert(key, value.trim().to_string());
    }

    headers
}
927
/// Returns the value of parameter `name` (matched case-insensitively) from a
/// `Content-Type` header value, with surrounding double quotes removed.
///
/// Segments that are not `key=value` pairs (an empty segment from a stray `;`,
/// a bare token) are skipped rather than aborting the search.
fn extract_content_type_param(content_type: &str, name: &str) -> Option<String> {
    content_type
        .split(';')
        .skip(1) // the first segment is the media type itself
        .filter_map(|segment| segment.split_once('='))
        .find(|(key, _)| key.trim().eq_ignore_ascii_case(name))
        .map(|(_, value)| value.trim().trim_matches('"').to_string())
}
937
/// Splits a multipart body into the raw text of its parts.
///
/// Parts are delimited by `--boundary` lines and the list ends at
/// `--boundary--`. Text before the first delimiter and after the closing one is
/// ignored, and empty parts are skipped. Returned parts use `\n` line endings.
///
/// Two leniencies for real-world mail:
/// * trailing whitespace on a delimiter line is ignored;
/// * if the closing delimiter is missing (a truncated message), the part being
///   collected is still returned instead of being dropped.
fn split_multipart_body(body: &str, boundary: &str) -> Vec<String> {
    let marker = format!("--{boundary}");
    let end_marker = format!("--{boundary}--");
    let mut parts = Vec::new();
    let mut current: Vec<&str> = Vec::new();
    let mut in_part = false;

    // `lines()` already strips both `\n` and `\r\n` terminators.
    for line in body.lines() {
        let delimiter = line.trim_end();
        if delimiter == end_marker {
            break;
        }
        if delimiter == marker {
            if in_part && !current.is_empty() {
                parts.push(current.join("\n"));
                current.clear();
            }
            in_part = true;
            continue;
        }
        if in_part {
            current.push(line);
        }
    }

    // Flush the last part, reached either via the closing delimiter or at end of input.
    if in_part && !current.is_empty() {
        parts.push(current.join("\n"));
    }

    parts
}
968
969fn decode_email_body(body: &str, encoding: &str) -> String {
970    let normalized = body.replace("\r\n", "\n");
971    let decoded = if encoding.eq_ignore_ascii_case("base64") {
972        decode_base64_text(&normalized).unwrap_or(normalized)
973    } else if encoding.eq_ignore_ascii_case("quoted-printable") {
974        decode_quoted_printable(&normalized)
975    } else {
976        normalized
977    };
978
979    decoded.trim().to_string()
980}
981
982fn decode_base64_text(input: &str) -> Option<String> {
983    let compact = input.lines().map(str::trim).collect::<String>();
984    let bytes = base64::engine::general_purpose::STANDARD
985        .decode(compact)
986        .ok()?;
987    String::from_utf8(bytes).ok()
988}
989
/// Decodes quoted-printable text.
///
/// * `=XY` with two hexadecimal digits (either case) becomes the byte `0xXY`.
/// * `=` directly followed by `\n` or `\r\n` is a soft line break and is
///   removed together with the line ending.
/// * Any other `=` is kept literally.
/// * `_` is decoded as a space.
///
/// The decoded bytes are converted with `String::from_utf8_lossy`, so invalid
/// UTF-8 sequences become U+FFFD. The function works on bytes throughout and
/// never panics, including when multi-byte characters follow an `=`.
fn decode_quoted_printable(input: &str) -> String {
    /// Value of a single ASCII hex digit, or `None` for any other byte.
    fn hex_digit(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let bytes = input.as_bytes();
    let mut out = Vec::with_capacity(bytes.len());
    let mut i = 0;

    while i < bytes.len() {
        match bytes[i] {
            b'=' => {
                // Decide from the bytes after '=' how much input this escape consumes.
                let consumed = match &bytes[i + 1..] {
                    [b'\n', ..] => 2,
                    [b'\r', b'\n', ..] => 3,
                    [hi, lo, ..] => match (hex_digit(*hi), hex_digit(*lo)) {
                        (Some(hi), Some(lo)) => {
                            out.push((hi << 4) | lo);
                            3
                        }
                        // Not a valid escape: keep '=' and re-scan what follows.
                        _ => {
                            out.push(b'=');
                            1
                        }
                    },
                    // Fewer than two bytes remain: '=' is literal.
                    _ => {
                        out.push(b'=');
                        1
                    }
                };
                i += consumed;
            }
            // NOTE(review): '_' → space is the RFC 2047 header ("Q") rule; in
            // RFC 2045 body encoding '_' is literal. Kept for compatibility
            // with existing callers — confirm whether bodies should keep '_'.
            b'_' => {
                out.push(b' ');
                i += 1;
            }
            other => {
                out.push(other);
                i += 1;
            }
        }
    }

    String::from_utf8_lossy(&out).into_owned()
}
1030
1031fn collect_best_mail_body(part: &EmailPart) -> String {
1032    if !part.parts.is_empty() {
1033        let preferred_plain = part
1034            .parts
1035            .iter()
1036            .map(collect_best_mail_body)
1037            .find(|body| !body.trim().is_empty());
1038        if let Some(body) = preferred_plain {
1039            return body;
1040        }
1041    }
1042
1043    if part
1044        .content_type
1045        .to_ascii_lowercase()
1046        .starts_with("text/html")
1047    {
1048        return html2text::from_read(part.body.as_bytes(), 80)
1049            .unwrap_or_else(|_| part.body.clone())
1050            .trim()
1051            .to_string();
1052    }
1053
1054    if part.content_type.is_empty() || part.content_type.to_ascii_lowercase().starts_with("text/") {
1055        return part.body.trim().to_string();
1056    }
1057
1058    String::new()
1059}
1060
1061fn strip_rtf(input: &str) -> String {
1062    let mut out = String::new();
1063    let mut chars = input.chars().peekable();
1064
1065    while let Some(ch) = chars.next() {
1066        match ch {
1067            '{' | '}' => {}
1068            '\\' => match chars.peek().copied() {
1069                Some('\\') | Some('{') | Some('}') => {
1070                    out.push(chars.next().unwrap_or_default());
1071                }
1072                Some('\'') => {
1073                    chars.next();
1074                    let hi = chars.next();
1075                    let lo = chars.next();
1076                    if let (Some(hi), Some(lo)) = (hi, lo) {
1077                        let hex = format!("{hi}{lo}");
1078                        if let Ok(byte) = u8::from_str_radix(&hex, 16) {
1079                            out.push(byte as char);
1080                        }
1081                    }
1082                }
1083                Some(_) => {
1084                    let mut word = String::new();
1085                    while let Some(c) = chars.peek().copied() {
1086                        if c.is_ascii_alphabetic() {
1087                            word.push(c);
1088                            chars.next();
1089                        } else {
1090                            break;
1091                        }
1092                    }
1093                    while let Some(c) = chars.peek().copied() {
1094                        if c.is_ascii_digit() || c == '-' {
1095                            chars.next();
1096                        } else {
1097                            break;
1098                        }
1099                    }
1100                    if chars.peek() == Some(&' ') {
1101                        chars.next();
1102                    }
1103                    if matches!(word.as_str(), "par" | "line") {
1104                        out.push('\n');
1105                    }
1106                }
1107                None => break,
1108            },
1109            '\r' => {}
1110            '\n' => out.push('\n'),
1111            _ => out.push(ch),
1112        }
1113    }
1114
1115    normalize_text(&out)
1116}
1117
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;
    use zip::write::FileOptions;

    /// OCR provider double that always answers with a preset result.
    struct MockOcrProvider {
        text: Option<String>,
    }

    impl DefaultParserOcrProvider for MockOcrProvider {
        fn name(&self) -> &str {
            "mock-ocr"
        }

        fn ocr_pdf(
            &self,
            _path: &Path,
            _config: &crate::config::DefaultParserOcrConfig,
        ) -> Result<Option<String>> {
            Ok(self.text.clone())
        }
    }

    /// Writes `content` to `name` inside the temp dir and returns the full path.
    fn write_file(dir: &TempDir, name: &str, content: &str) -> std::path::PathBuf {
        let target = dir.path().join(name);
        std::fs::write(&target, content).unwrap();
        target
    }

    /// Creates a zip archive `name` inside the temp dir holding the given
    /// `(entry name, entry content)` pairs and returns its path.
    fn write_zip(dir: &TempDir, name: &str, entries: &[(&str, &str)]) -> std::path::PathBuf {
        let target = dir.path().join(name);
        let mut archive = zip::ZipWriter::new(File::create(&target).unwrap());
        let options = FileOptions::default();

        for &(entry_name, data) in entries {
            archive.start_file(entry_name, options).unwrap();
            archive.write_all(data.as_bytes()).unwrap();
        }

        archive.finish().unwrap();
        target
    }

    /// Parser configuration with OCR switched on, shared by the PDF OCR tests.
    fn ocr_enabled_config() -> crate::config::DefaultParserConfig {
        crate::config::DefaultParserConfig {
            enabled: true,
            max_file_size_mb: 50,
            ocr: Some(crate::config::DefaultParserOcrConfig {
                enabled: true,
                ..Default::default()
            }),
        }
    }

    #[test]
    fn parses_html() {
        let tmp = TempDir::new().unwrap();
        let file = write_file(
            &tmp,
            "sample.html",
            "<html><body><h1>Hello</h1><p>World</p></body></html>",
        );

        let doc = parse_html_document(&file).unwrap();
        let has_heading = doc
            .blocks
            .iter()
            .any(|b| b.kind == DocumentBlockKind::Heading);
        assert!(has_heading);

        let text = doc.to_text();
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    #[test]
    fn parses_docx_like_zip() {
        let tmp = TempDir::new().unwrap();
        let file = write_zip(
            &tmp,
            "sample.docx",
            &[(
                "word/document.xml",
                r#"<w:document xmlns:w="urn:test"><w:body><w:p><w:r><w:t>Hello</w:t></w:r></w:p><w:p><w:r><w:t>World</w:t></w:r></w:p></w:body></w:document>"#,
            )],
        );

        let doc = parse_docx(&file).unwrap();
        let has_heading = doc
            .blocks
            .iter()
            .any(|b| b.kind == DocumentBlockKind::Heading);
        assert!(has_heading);

        let text = doc.to_text();
        assert!(text.contains("Hello"));
        assert!(text.contains("World"));
    }

    #[test]
    fn parses_xlsx_shared_strings_and_inline_cells() {
        let tmp = TempDir::new().unwrap();
        let file = write_zip(
            &tmp,
            "sample.xlsx",
            &[
                (
                    "xl/sharedStrings.xml",
                    r#"<sst xmlns="urn:test"><si><t>Name</t></si><si><t>Alice</t></si></sst>"#,
                ),
                (
                    "xl/worksheets/sheet1.xml",
                    r#"<worksheet xmlns="urn:test"><sheetData><row r="1"><c r="A1" t="s"><v>0</v></c><c r="B1" t="inlineStr"><is><t>Score</t></is></c></row><row r="2"><c r="A2" t="s"><v>1</v></c><c r="B2"><v>42</v></c></row></sheetData></worksheet>"#,
                ),
            ],
        );

        let text = parse_xlsx(&file).unwrap().to_text();
        for expected in ["Name", "Score", "Alice", "42"] {
            assert!(text.contains(expected));
        }
    }

    #[test]
    fn parses_pptx_slides() {
        let tmp = TempDir::new().unwrap();
        let file = write_zip(
            &tmp,
            "slides.pptx",
            &[(
                "ppt/slides/slide1.xml",
                r#"<p:sld xmlns:p="urn:test" xmlns:a="urn:test-a"><p:cSld><p:spTree><p:sp><p:txBody><a:p><a:r><a:t>Quarterly Review</a:t></a:r></a:p></p:txBody></p:sp></p:spTree></p:cSld></p:sld>"#,
            )],
        );

        let doc = parse_pptx(&file).unwrap();
        let has_heading = doc
            .blocks
            .iter()
            .any(|b| b.kind == DocumentBlockKind::Heading);
        assert!(has_heading);

        let on_first_page = doc
            .blocks
            .iter()
            .any(|b| b.location.as_ref().and_then(|loc| loc.page) == Some(1));
        assert!(on_first_page);

        assert!(doc.to_text().contains("Quarterly Review"));
    }

    #[test]
    fn parses_odf_content() {
        let tmp = TempDir::new().unwrap();
        let file = write_zip(
            &tmp,
            "document.odt",
            &[(
                "content.xml",
                r#"<office:document-content xmlns:office="urn:test" xmlns:text="urn:test-text"><office:body><office:text><text:p>Hello ODF</text:p><text:p>Second line</text:p></office:text></office:body></office:document-content>"#,
            )],
        );

        let doc = parse_odf(&file).unwrap();
        let has_paragraph = doc
            .blocks
            .iter()
            .any(|b| b.kind == DocumentBlockKind::Paragraph);
        assert!(has_paragraph);

        let text = doc.to_text();
        assert!(text.contains("Hello ODF"));
        assert!(text.contains("Second line"));
    }

    #[test]
    fn parses_epub_html_entries() {
        let tmp = TempDir::new().unwrap();
        let file = write_zip(
            &tmp,
            "book.epub",
            &[(
                "OPS/ch1.xhtml",
                "<html><body><p>Chapter One</p></body></html>",
            )],
        );

        let doc = parse_epub(&file).unwrap();
        let has_paragraph = doc
            .blocks
            .iter()
            .any(|b| b.kind == DocumentBlockKind::Paragraph);
        assert!(has_paragraph);
        assert!(doc.to_text().contains("Chapter One"));
    }

    #[test]
    fn parses_plain_eml() {
        let tmp = TempDir::new().unwrap();
        let file = write_file(
            &tmp,
            "mail.eml",
            "Subject: Hello\nFrom: alice@example.com\nTo: bob@example.com\nContent-Type: text/plain; charset=utf-8\n\nThis is a plain email body.\n",
        );

        let text = parse_eml(&file).unwrap().to_text();
        assert!(text.contains("Subject: Hello"));
        assert!(text.contains("alice@example.com"));
        assert!(text.contains("This is a plain email body."));
    }

    #[test]
    fn parsed_text_document_sets_block_locations() {
        let tmp = TempDir::new().unwrap();
        let file = write_file(&tmp, "notes.rtf", "{\\rtf1\\ansi Hello \\par World}");

        let doc = parsed_text_document(
            &file,
            parse_rtf(&file).unwrap(),
            DocumentBlockKind::Paragraph,
        )
        .unwrap();

        // Ordinals are expected to be 1-based and to follow block order.
        for (position, block) in doc.blocks.iter().enumerate() {
            let ordinal = block.location.as_ref().and_then(|loc| loc.ordinal);
            assert!(ordinal == Some(position + 1));
        }
    }

    #[test]
    fn parses_xml_document_into_structured_blocks() {
        let tmp = TempDir::new().unwrap();
        let file = write_file(
            &tmp,
            "sample.xml",
            "<root><title>Spec</title><section><p>Intro text</p><p>More text</p></section></root>",
        );

        let doc = parse_xml_document(&file).unwrap();
        assert!(matches!(
            doc.title.as_deref(),
            Some("sample.xml") | Some("Spec")
        ));

        let has_paragraph = doc
            .blocks
            .iter()
            .any(|b| b.kind == DocumentBlockKind::Paragraph);
        assert!(has_paragraph);
        assert!(doc.to_text().contains("Intro text"));
    }

    #[test]
    fn parses_multipart_eml_with_html_and_quoted_printable() {
        let tmp = TempDir::new().unwrap();
        let message = concat!(
            "Subject: Multipart Test\n",
            "From: sender@example.com\n",
            "To: receiver@example.com\n",
            "Content-Type: multipart/alternative; boundary=\"abc123\"\n",
            "\n",
            "--abc123\n",
            "Content-Type: text/plain; charset=utf-8\n",
            "Content-Transfer-Encoding: quoted-printable\n",
            "\n",
            "Hello=20World=21\n",
            "--abc123\n",
            "Content-Type: text/html; charset=utf-8\n",
            "\n",
            "<html><body><p>Ignored HTML fallback</p></body></html>\n",
            "--abc123--\n"
        );
        let file = write_file(&tmp, "multipart.eml", message);

        let text = parse_eml(&file).unwrap().to_text();
        assert!(text.contains("Subject: Multipart Test"));
        assert!(text.contains("Hello World!"));
    }

    #[test]
    fn strips_rtf_control_words() {
        let plain = strip_rtf(r"{\rtf1\ansi Hello \par World}");
        assert!(plain.contains("Hello"));
        assert!(plain.contains("World"));
    }

    #[test]
    fn pdf_ocr_heuristic_detects_weak_text() {
        let config = ocr_enabled_config();

        assert!(should_attempt_pdf_ocr("", &config));
        assert!(should_attempt_pdf_ocr("%%% ---", &config));
        assert!(!should_attempt_pdf_ocr(
            "This is a reasonably healthy PDF text extraction with enough words and letters to avoid OCR fallback across multiple paragraphs and sections of the document body.",
            &config
        ));
    }

    #[test]
    fn pdf_ocr_fallback_uses_provider_when_text_is_weak() {
        let tmp = TempDir::new().unwrap();
        let file = write_file(&tmp, "sample.pdf", "not-a-real-pdf");
        let config = ocr_enabled_config();
        let provider = MockOcrProvider {
            text: Some("OCR recovered text".to_string()),
        };

        let recovered =
            maybe_run_pdf_ocr(&file, String::new(), &config, Some(&provider)).unwrap();
        assert_eq!(recovered, "OCR recovered text");
    }

    #[test]
    fn pdf_ocr_fallback_preserves_extracted_text_without_provider() {
        let tmp = TempDir::new().unwrap();
        let file = write_file(&tmp, "sample.pdf", "not-a-real-pdf");
        let config = ocr_enabled_config();

        let kept = maybe_run_pdf_ocr(&file, "weak".to_string(), &config, None).unwrap();
        assert_eq!(kept, "weak");
    }

    #[test]
    fn default_parser_can_hold_ocr_provider() {
        let provider = Arc::new(MockOcrProvider { text: None });
        let parser = DefaultParser::with_config_and_ocr(
            crate::config::DefaultParserConfig::default(),
            provider,
        );
        assert!(parser.ocr_provider().is_some());
    }
}