1use anyhow::{Context, Result};
2use base64::Engine as _;
3use roxmltree::Document;
4use std::fs::File;
5use std::io::Read;
6use std::path::Path;
7use std::sync::Arc;
8use zip::ZipArchive;
9
10use crate::document_parser::{DocumentBlock, DocumentBlockKind, ParsedDocument};
11
/// Pluggable OCR backend used by [`DefaultParser`] as a fallback for PDFs
/// whose embedded text layer is missing or too sparse.
pub trait DefaultParserOcrProvider: Send + Sync {
    /// Human-readable provider name, used in log messages.
    fn name(&self) -> &str;

    /// Runs OCR over the PDF at `path`. `Ok(None)` means the provider
    /// produced no text; errors are treated as non-fatal by the caller.
    fn ocr_pdf(
        &self,
        path: &Path,
        config: &crate::config::DefaultParserOcrConfig,
    ) -> Result<Option<String>>;
}
25
/// Built-in document parser covering PDF, Office, ODF, EPUB, RTF, markup,
/// and e-mail formats, with optional OCR fallback for PDFs.
#[derive(Default)]
pub struct DefaultParser {
    // Parser limits and OCR settings.
    config: crate::config::DefaultParserConfig,
    // Optional OCR backend; when `None`, PDF OCR is skipped (debug-logged).
    ocr_provider: Option<Arc<dyn DefaultParserOcrProvider>>,
}
31
32impl DefaultParser {
33 pub fn new() -> Self {
34 Self::default()
35 }
36
37 pub fn with_config(config: crate::config::DefaultParserConfig) -> Self {
38 Self {
39 config,
40 ocr_provider: None,
41 }
42 }
43
44 pub fn with_config_and_ocr(
45 config: crate::config::DefaultParserConfig,
46 ocr_provider: Arc<dyn DefaultParserOcrProvider>,
47 ) -> Self {
48 Self {
49 config,
50 ocr_provider: Some(ocr_provider),
51 }
52 }
53
54 pub fn config(&self) -> &crate::config::DefaultParserConfig {
55 &self.config
56 }
57
58 pub fn ocr_provider(&self) -> Option<&Arc<dyn DefaultParserOcrProvider>> {
59 self.ocr_provider.as_ref()
60 }
61}
62
63impl crate::document_parser::DocumentParser for DefaultParser {
64 fn name(&self) -> &str {
65 "default-parser"
66 }
67
68 fn supported_extensions(&self) -> &[&str] {
69 &[
70 "pdf", "docx", "xlsx", "xlsm", "pptx", "odt", "ods", "odp", "epub", "rtf", "html",
71 "htm", "xhtml", "xml", "eml",
72 ]
73 }
74
75 fn parse(&self, path: &Path) -> Result<String> {
76 Ok(self.parse_document(path)?.to_text())
77 }
78
79 fn parse_document(&self, path: &Path) -> Result<ParsedDocument> {
80 let ext = path
81 .extension()
82 .and_then(|ext| ext.to_str())
83 .map(|ext| ext.to_ascii_lowercase())
84 .unwrap_or_default();
85
86 match ext.as_str() {
87 "pdf" => parse_pdf_document(path, &self.config, self.ocr_provider.as_deref()),
88 "docx" => parse_docx(path),
89 "xlsx" | "xlsm" => parse_xlsx(path),
90 "pptx" => parse_pptx(path),
91 "odt" | "ods" | "odp" => parse_odf(path),
92 "epub" => parse_epub(path),
93 "eml" => parse_eml(path),
94 "rtf" => parsed_text_document(path, parse_rtf(path)?, DocumentBlockKind::Paragraph),
95 "html" | "htm" | "xhtml" => parse_html_document(path),
96 "xml" => parse_xml_document(path),
97 _ => anyhow::bail!("unsupported extension for kreuzberg parser"),
98 }
99 }
100
101 fn max_file_size(&self) -> u64 {
102 self.config.max_file_size_mb * 1024 * 1024
103 }
104}
105
/// Extracts the embedded text layer from a PDF via `pdf_extract`.
fn parse_pdf(path: &Path) -> Result<String> {
    pdf_extract::extract_text(path)
        .with_context(|| format!("failed to extract text from PDF {}", path.display()))
}
110
/// Parses a PDF: extracts the text layer, then optionally replaces it with
/// OCR output when the result looks too sparse (see `maybe_run_pdf_ocr`).
fn parse_pdf_document(
    path: &Path,
    config: &crate::config::DefaultParserConfig,
    ocr_provider: Option<&dyn DefaultParserOcrProvider>,
) -> Result<ParsedDocument> {
    // Extraction errors are swallowed on purpose: an empty string triggers
    // the OCR path below, which may still recover text from scanned PDFs.
    let extracted_text = parse_pdf(path).unwrap_or_default();
    let text = maybe_run_pdf_ocr(path, extracted_text, config, ocr_provider)?;
    parsed_text_document(path, text, DocumentBlockKind::Paragraph)
}
120
/// Replaces `extracted_text` with OCR output when the heuristic says the
/// text layer is too sparse, OCR is enabled in config, and a provider is
/// available. OCR failures and empty OCR results fall back to the
/// originally extracted text.
fn maybe_run_pdf_ocr(
    path: &Path,
    extracted_text: String,
    config: &crate::config::DefaultParserConfig,
    ocr_provider: Option<&dyn DefaultParserOcrProvider>,
) -> Result<String> {
    if !should_attempt_pdf_ocr(&extracted_text, config) {
        return Ok(extracted_text);
    }

    let Some(ocr_config) = config.ocr.as_ref().filter(|ocr| ocr.enabled) else {
        return Ok(extracted_text);
    };
    let Some(provider) = ocr_provider else {
        tracing::debug!(
            "DefaultParser OCR enabled for {} but no OCR provider was configured",
            path.display()
        );
        return Ok(extracted_text);
    };

    match provider.ocr_pdf(path, ocr_config) {
        // Only adopt OCR output that actually contains text.
        Ok(Some(ocr_text)) if !ocr_text.trim().is_empty() => {
            tracing::info!(
                "DefaultParser used OCR provider '{}' for {}",
                provider.name(),
                path.display()
            );
            Ok(ocr_text)
        }
        Ok(_) => Ok(extracted_text),
        // OCR errors are non-fatal: log and keep the extracted text.
        Err(err) => {
            tracing::warn!(
                "DefaultParser OCR provider '{}' failed on {}: {}",
                provider.name(),
                path.display(),
                err
            );
            Ok(extracted_text)
        }
    }
}
163
164fn should_attempt_pdf_ocr(text: &str, config: &crate::config::DefaultParserConfig) -> bool {
165 let Some(ocr) = config.ocr.as_ref() else {
166 return false;
167 };
168 if !ocr.enabled {
169 return false;
170 }
171
172 let trimmed = text.trim();
173 if trimmed.is_empty() {
174 return true;
175 }
176
177 let char_count = trimmed.chars().count();
178 let word_count = trimmed.split_whitespace().count();
179 let alnum_count = trimmed.chars().filter(|ch| ch.is_alphanumeric()).count();
180 let alnum_ratio = alnum_count as f32 / char_count.max(1) as f32;
181
182 char_count < 80 || word_count < 20 || alnum_ratio < 0.45
183}
184
185fn parse_html_document(path: &Path) -> Result<ParsedDocument> {
186 let html = std::fs::read_to_string(path)
187 .with_context(|| format!("failed to read HTML file {}", path.display()))?;
188 parse_markup_document(path, &html, true)
189}
190
191fn parse_xml_document(path: &Path) -> Result<ParsedDocument> {
192 let xml = std::fs::read_to_string(path)
193 .with_context(|| format!("failed to read XML file {}", path.display()))?;
194 parse_markup_document(path, &xml, false)
195}
196
197fn parse_rtf(path: &Path) -> Result<String> {
198 let raw = std::fs::read_to_string(path)
199 .with_context(|| format!("failed to read RTF file {}", path.display()))?;
200 Ok(strip_rtf(&raw))
201}
202
/// Parses an RFC 822/MIME e-mail file: selected headers become an
/// `EmailHeader` block and the best displayable body part becomes a
/// paragraph block.
fn parse_eml(path: &Path) -> Result<ParsedDocument> {
    let raw = std::fs::read_to_string(path)
        .with_context(|| format!("failed to read EML file {}", path.display()))?;
    let mail = parse_email_part(&raw);

    let mut doc = ParsedDocument::new();
    doc.title = file_title(path);
    if !mail.headers.is_empty() {
        let mut header_lines = Vec::new();
        // Only surface the headers a reader typically cares about.
        for key in ["Subject", "From", "To", "Cc", "Date"] {
            if let Some(value) = mail.headers.get(key) {
                header_lines.push(format!("{key}: {value}"));
            }
        }
        if !header_lines.is_empty() {
            doc.push(
                DocumentBlock::new(
                    DocumentBlockKind::EmailHeader,
                    Some("headers"),
                    header_lines.join("\n"),
                )
                .with_source("message")
                .with_ordinal(1),
            );
        }
    }

    let body = collect_best_mail_body(&mail);
    if !body.trim().is_empty() {
        doc.push(
            DocumentBlock::new(DocumentBlockKind::Paragraph, Some("body"), body)
                .with_source("message")
                .with_ordinal(2),
        );
    }

    ensure_document(doc, path)
}
241
/// Parses an EPUB container: every XHTML/HTML entry is parsed into blocks,
/// each preceded by a metadata block naming the source entry.
fn parse_epub(path: &Path) -> Result<ParsedDocument> {
    let mut zip = open_zip(path)?;
    // Sort entry names so chapter ordering is deterministic across runs.
    let mut names: Vec<String> = zip.file_names().map(|s| s.to_string()).collect();
    names.sort();
    let mut doc = ParsedDocument::new();
    doc.title = file_title(path);

    for name in names {
        let lower = name.to_ascii_lowercase();
        if !(lower.ends_with(".xhtml") || lower.ends_with(".html") || lower.ends_with(".htm")) {
            continue;
        }

        let content = read_zip_entry(&mut zip, &name)?;
        // Structured parse first; fall back to a flat text rendering when the
        // markup isn't well-formed XML.
        let section_doc = parse_markup_string(&content, true).unwrap_or_else(|| {
            fallback_text_blocks(&render_html_to_text(&content).unwrap_or_default())
        });
        if section_doc.is_empty() {
            continue;
        }

        // Record which container entry the following blocks came from.
        doc.push(
            DocumentBlock::new(
                DocumentBlockKind::Metadata,
                Some(name.clone()),
                format!("source: {}", name),
            )
            .with_source(name.clone()),
        );
        for (idx, block) in section_doc.into_iter().enumerate() {
            // Prefix any existing label with the entry name.
            let label = block
                .label
                .as_ref()
                .map(|label| format!("{}: {}", name, label))
                .or_else(|| Some(name.clone()));
            doc.push(
                DocumentBlock::new(block.kind, label, block.content)
                    .with_source(name.clone())
                    .with_ordinal(idx + 1),
            );
        }
    }

    ensure_document(doc, path)
}
287
/// Parses a DOCX container: extracts block-level text from the main
/// document plus headers, footers, footnotes, and endnotes.
fn parse_docx(path: &Path) -> Result<ParsedDocument> {
    let mut zip = open_zip(path)?;
    // Sort for deterministic block ordering across runs.
    let mut names: Vec<String> = zip.file_names().map(|s| s.to_string()).collect();
    names.sort();

    let mut doc = ParsedDocument::new();
    doc.title = file_title(path);
    for name in names {
        if !name.starts_with("word/") || !name.ends_with(".xml") {
            continue;
        }
        // Only the parts that carry document text; styles, numbering, etc.
        // are skipped.
        if !(name == "word/document.xml"
            || name.starts_with("word/header")
            || name.starts_with("word/footer")
            || name.starts_with("word/footnotes")
            || name.starts_with("word/endnotes"))
        {
            continue;
        }

        let content = read_zip_entry(&mut zip, &name)?;
        let blocks = extract_docx_blocks(&content)?;
        if !blocks.is_empty() {
            for (idx, block) in blocks.into_iter().enumerate() {
                // Prefix any existing label with the part name.
                let label = block
                    .label
                    .as_ref()
                    .map(|label| format!("{}: {}", name, label))
                    .or_else(|| Some(name.clone()));
                doc.push(
                    DocumentBlock::new(block.kind, label, block.content)
                        .with_source(name.clone())
                        .with_ordinal(idx + 1),
                );
            }
        }
    }

    ensure_document(doc, path)
}
328
/// Parses an XLSX/XLSM workbook: resolves shared strings, then renders each
/// worksheet as one tab/newline separated table block.
fn parse_xlsx(path: &Path) -> Result<ParsedDocument> {
    let mut zip = open_zip(path)?;
    // A missing sharedStrings.xml is fine: the workbook may only contain
    // inline or numeric cells.
    let shared_strings = read_shared_strings(&mut zip).unwrap_or_default();
    let mut names: Vec<String> = zip.file_names().map(|s| s.to_string()).collect();
    names.sort();

    let mut doc = ParsedDocument::new();
    doc.title = file_title(path);
    for name in names {
        if !name.starts_with("xl/worksheets/") || !name.ends_with(".xml") {
            continue;
        }
        let content = read_zip_entry(&mut zip, &name)?;
        let text = parse_xlsx_sheet(&content, &shared_strings)?;
        if !text.trim().is_empty() {
            // One block per sheet, hence the fixed ordinal.
            doc.push(
                DocumentBlock::new(DocumentBlockKind::Table, Some(name.clone()), text)
                    .with_source(name.clone())
                    .with_ordinal(1),
            );
        }
    }

    ensure_document(doc, path)
}
354
/// Parses a PPTX deck: each `ppt/slides/slideN.xml` becomes text blocks,
/// with a short first block promoted to a heading.
fn parse_pptx(path: &Path) -> Result<ParsedDocument> {
    let mut zip = open_zip(path)?;
    let mut names: Vec<String> = zip.file_names().map(|s| s.to_string()).collect();
    names.sort();

    let mut doc = ParsedDocument::new();
    doc.title = file_title(path);
    for name in names {
        if !name.starts_with("ppt/slides/slide") || !name.ends_with(".xml") {
            continue;
        }
        let content = read_zip_entry(&mut zip, &name)?;
        let text = extract_xml_text(&content)?;
        let blocks = text_blocks(&text, DocumentBlockKind::Paragraph);
        if blocks.is_empty() {
            continue;
        }
        for (idx, block) in blocks.into_iter().enumerate() {
            let kind = if idx == 0 && looks_like_heading(&block.content) {
                DocumentBlockKind::Heading
            } else {
                DocumentBlockKind::Slide
            };
            let label = if idx == 0 {
                Some(name.clone())
            } else {
                Some(format!("{}: block {}", name, idx + 1))
            };
            // Page = slide number from the entry name; falls back to the
            // block index when no digits are found.
            doc.push(
                DocumentBlock::new(kind, label, block.content)
                    .with_source(name.clone())
                    .with_page(extract_slide_number(&name).unwrap_or(idx + 1))
                    .with_ordinal(idx + 1),
            );
        }
    }

    ensure_document(doc, path)
}
394
/// Parses an OpenDocument file (ODT/ODS/ODP): metadata and styles become
/// metadata blocks, `content.xml` becomes structured text blocks.
fn parse_odf(path: &Path) -> Result<ParsedDocument> {
    let mut zip = open_zip(path)?;
    let mut doc = ParsedDocument::new();
    doc.title = file_title(path);

    // Fixed, ordered entry list; entries missing from the container are
    // silently skipped.
    for name in ["meta.xml", "styles.xml", "content.xml"] {
        if let Ok(content) = read_zip_entry(&mut zip, name) {
            let blocks = if name == "content.xml" {
                parse_odf_content_blocks(&content)?
            } else {
                text_blocks(&extract_xml_text(&content)?, DocumentBlockKind::Metadata)
            };
            for (idx, block) in blocks.into_iter().enumerate() {
                // Prefix labels with the entry name; synthesize one when the
                // block has none.
                let label = block
                    .label
                    .as_ref()
                    .map(|label| format!("{}: {}", name, label))
                    .or_else(|| {
                        if idx == 0 {
                            Some(name.to_string())
                        } else {
                            Some(format!("{}: block {}", name, idx + 1))
                        }
                    });
                doc.push(
                    DocumentBlock::new(block.kind, label, block.content)
                        .with_source(name)
                        .with_ordinal(idx + 1),
                );
            }
        }
    }

    ensure_document(doc, path)
}
430
431fn read_shared_strings(zip: &mut ZipArchive<File>) -> Result<Vec<String>> {
432 let content = read_zip_entry(zip, "xl/sharedStrings.xml")?;
433 let doc = Document::parse(&content).context("failed to parse xlsx sharedStrings.xml")?;
434 let mut values = Vec::new();
435
436 for si in doc.descendants().filter(|n| n.tag_name().name() == "si") {
437 let value = si
438 .descendants()
439 .filter(|n| n.tag_name().name() == "t")
440 .filter_map(|n| n.text())
441 .map(str::trim)
442 .filter(|t| !t.is_empty())
443 .collect::<Vec<_>>()
444 .join("");
445 if !value.is_empty() {
446 values.push(value);
447 }
448 }
449
450 Ok(values)
451}
452
/// Flattens one worksheet to text: each row becomes a line, with non-empty
/// cell values joined by tabs.
fn parse_xlsx_sheet(xml: &str, shared_strings: &[String]) -> Result<String> {
    let doc = Document::parse(xml).context("failed to parse worksheet xml")?;
    let mut rows = Vec::new();

    for row in doc.descendants().filter(|n| n.tag_name().name() == "row") {
        let mut cells = Vec::new();
        for cell in row.children().filter(|n| n.tag_name().name() == "c") {
            let value = extract_xlsx_cell(cell, shared_strings);
            if !value.is_empty() {
                cells.push(value);
            }
        }
        // Rows with no textual content are dropped entirely.
        if !cells.is_empty() {
            rows.push(cells.join("\t"));
        }
    }

    Ok(rows.join("\n"))
}
472
/// Parses HTML/XML content into blocks, falling back to a flat text
/// rendering when strict XML parsing fails (real-world HTML is often not
/// well-formed).
fn parse_markup_document(path: &Path, input: &str, is_html: bool) -> Result<ParsedDocument> {
    let mut doc = ParsedDocument::new();
    doc.title = file_title(path);

    let blocks = parse_markup_string(input, is_html).unwrap_or_else(|| {
        let rendered = if is_html {
            render_html_to_text(input).unwrap_or_default()
        } else {
            extract_xml_text(input).unwrap_or_default()
        };
        fallback_text_blocks(&rendered)
    });

    // Fall back to the <title> element when the path yields no title.
    if doc.title.is_none() {
        doc.title = extract_markup_title(input);
    }
    for block in blocks {
        doc.push(block);
    }

    ensure_document(doc, path)
}
495
/// Extracts typed blocks from well-formed markup; returns `None` when the
/// input cannot be parsed as XML or yields no blocks (callers then fall
/// back to a flat text rendering).
fn parse_markup_string(input: &str, is_html: bool) -> Option<Vec<DocumentBlock>> {
    let doc = Document::parse(input).ok()?;
    let mut blocks = Vec::new();

    for node in doc.descendants().filter(|node| node.is_element()) {
        let tag = node.tag_name().name();
        let kind = match tag {
            // <title> is handled separately by `extract_markup_title`.
            "title" => continue,
            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => DocumentBlockKind::Heading,
            "p" | "li" | "blockquote" => DocumentBlockKind::Paragraph,
            "pre" | "code" => DocumentBlockKind::Code,
            "table" => DocumentBlockKind::Table,
            "meta" if is_html => DocumentBlockKind::Metadata,
            "section" | "article" => DocumentBlockKind::Section,
            _ => continue,
        };

        // NOTE(review): container tags (section/article) include their
        // children's text, so some content can appear twice; only exact
        // adjacent duplicates are removed below.
        let content = collect_node_text(node);
        if content.trim().is_empty() {
            continue;
        }

        let label = match tag {
            "meta" => node
                .attribute("name")
                .or_else(|| node.attribute("property"))
                .or_else(|| node.attribute("http-equiv"))
                .map(str::to_string),
            _ => None,
        };
        blocks.push(DocumentBlock::new(kind, label, content));
    }

    if blocks.is_empty() {
        None
    } else {
        Some(dedupe_adjacent_blocks(blocks))
    }
}
535
/// Returns the text of the first non-empty <title> element, if any.
fn extract_markup_title(input: &str) -> Option<String> {
    let doc = Document::parse(input).ok()?;
    doc.descendants()
        .find(|node| node.has_tag_name("title"))
        .map(collect_node_text)
        .filter(|title| !title.trim().is_empty())
}
543
/// Renders HTML to plain text, wrapped at 80 columns.
fn render_html_to_text(input: &str) -> Result<String> {
    html2text::from_read(input.as_bytes(), 80).context("failed to render HTML as text")
}
547
/// Joins all non-empty descendant text nodes with single spaces and
/// normalizes the result.
fn collect_node_text(node: roxmltree::Node<'_, '_>) -> String {
    let text = node
        .descendants()
        .filter_map(|child| child.text())
        .map(str::trim)
        .filter(|text| !text.is_empty())
        .collect::<Vec<_>>()
        .join(" ");
    normalize_text(&text)
}
558
559fn dedupe_adjacent_blocks(blocks: Vec<DocumentBlock>) -> Vec<DocumentBlock> {
560 let mut deduped = Vec::new();
561 for block in blocks {
562 let is_duplicate = deduped.last().is_some_and(|last: &DocumentBlock| {
563 last.kind == block.kind && last.label == block.label && last.content == block.content
564 });
565 if !is_duplicate {
566 deduped.push(block);
567 }
568 }
569 deduped
570}
571
/// Extracts paragraph blocks from WordprocessingML: each <p> becomes one
/// block whose text is the concatenation of its <t> runs. Falls back to a
/// flat text extraction when no paragraphs are found.
fn extract_docx_blocks(xml: &str) -> Result<Vec<DocumentBlock>> {
    let doc = Document::parse(xml).context("failed to parse docx xml")?;
    let mut blocks = Vec::new();

    for para in doc
        .descendants()
        .filter(|node| node.tag_name().name() == "p")
    {
        let content = para
            .descendants()
            .filter(|node| node.tag_name().name() == "t")
            .filter_map(|node| node.text())
            .map(str::trim)
            .filter(|text| !text.is_empty())
            .collect::<Vec<_>>()
            .join("");
        let content = normalize_text(&content);
        if content.is_empty() {
            continue;
        }

        // Heading when the paragraph style says so, or when the text itself
        // looks like a heading.
        let kind = if paragraph_style(para)
            .map(|style| style.to_ascii_lowercase().contains("heading"))
            .unwrap_or(false)
            || looks_like_heading(&content)
        {
            DocumentBlockKind::Heading
        } else {
            DocumentBlockKind::Paragraph
        };

        blocks.push(DocumentBlock::new(kind, None::<String>, content));
    }

    if blocks.is_empty() {
        let fallback = extract_xml_text(xml)?;
        Ok(fallback_text_blocks(&fallback))
    } else {
        Ok(blocks)
    }
}
613
614fn paragraph_style(node: roxmltree::Node<'_, '_>) -> Option<String> {
615 node.descendants()
616 .find(|child| child.tag_name().name() == "pStyle")
617 .and_then(|child| child.attribute("val").or_else(|| child.attribute("w:val")))
618 .map(str::to_string)
619}
620
/// Extracts heading/paragraph blocks from ODF `content.xml`, falling back
/// to a flat text extraction when no recognized elements are found.
fn parse_odf_content_blocks(xml: &str) -> Result<Vec<DocumentBlock>> {
    let doc = Document::parse(xml).context("failed to parse odf content xml")?;
    let mut blocks = Vec::new();

    for node in doc.descendants().filter(|node| node.is_element()) {
        let tag = node.tag_name().name();
        let kind = match tag {
            "h" => DocumentBlockKind::Heading,
            "p" => DocumentBlockKind::Paragraph,
            "list-item" => DocumentBlockKind::Paragraph,
            _ => continue,
        };
        let content = collect_node_text(node);
        if content.is_empty() {
            continue;
        }
        blocks.push(DocumentBlock::new(kind, None::<String>, content));
    }

    if blocks.is_empty() {
        let fallback = extract_xml_text(xml)?;
        Ok(fallback_text_blocks(&fallback))
    } else {
        Ok(blocks)
    }
}
647
/// Wraps plain text in a `ParsedDocument`: the text is split into blocks
/// of `default_kind` and sourced from the file's title (or path).
fn parsed_text_document(
    path: &Path,
    text: String,
    default_kind: DocumentBlockKind,
) -> Result<ParsedDocument> {
    let mut doc = ParsedDocument::new();
    doc.title = file_title(path);
    let source = doc
        .title
        .clone()
        .unwrap_or_else(|| path.display().to_string());
    for (idx, block) in text_blocks(&text, default_kind).into_iter().enumerate() {
        doc.push(block.with_source(source.clone()).with_ordinal(idx + 1));
    }
    ensure_document(doc, path)
}
664
/// Splits plain text into paragraph blocks; used when structured parsing
/// of a format yields nothing.
fn fallback_text_blocks(text: &str) -> Vec<DocumentBlock> {
    text_blocks(text, DocumentBlockKind::Paragraph)
}
668
669fn text_blocks(text: &str, default_kind: DocumentBlockKind) -> Vec<DocumentBlock> {
670 let normalized = normalize_text(text);
671 normalized
672 .split("\n\n")
673 .filter_map(|chunk| {
674 let chunk = chunk.trim();
675 if chunk.is_empty() {
676 return None;
677 }
678
679 let kind = if looks_like_heading(chunk) {
680 DocumentBlockKind::Heading
681 } else {
682 default_kind.clone()
683 };
684 Some(DocumentBlock::new(kind, None::<String>, chunk))
685 })
686 .collect()
687}
688
/// Heuristic heading test: at most two lines, first line non-empty, and
/// either Markdown-style ('#') or short (≤80 chars) without sentence-final
/// punctuation.
fn looks_like_heading(text: &str) -> bool {
    if text.lines().count() > 2 {
        return false;
    }
    let first = text.lines().next().unwrap_or("").trim();
    if first.is_empty() {
        return false;
    }
    if first.starts_with('#') {
        return true;
    }
    let short = first.chars().count() <= 80;
    let sentence_end = first.chars().last().is_some_and(|ch| ".!?:;".contains(ch));
    short && !sentence_end
}
701
/// Parses the first run of ASCII digits in `name` (e.g. the "12" in
/// "ppt/slides/slide12.xml"); `None` when there are no digits.
fn extract_slide_number(name: &str) -> Option<usize> {
    let mut digits = String::new();
    for ch in name.chars() {
        if ch.is_ascii_digit() {
            digits.push(ch);
        } else if !digits.is_empty() {
            break;
        }
    }
    digits.parse().ok()
}
710
/// Resolves one SpreadsheetML cell to text. `t="inlineStr"` cells carry
/// their text inline; `t="s"` cells hold an index into the shared-string
/// table; anything else uses the raw <v> value.
fn extract_xlsx_cell(cell: roxmltree::Node<'_, '_>, shared_strings: &[String]) -> String {
    let cell_type = cell.attribute("t").unwrap_or_default();

    if cell_type == "inlineStr" {
        return cell
            .descendants()
            .filter(|n| n.tag_name().name() == "t")
            .filter_map(|n| n.text())
            .map(str::trim)
            .filter(|t| !t.is_empty())
            .collect::<Vec<_>>()
            .join("");
    }

    let raw = cell
        .children()
        .find(|n| n.tag_name().name() == "v")
        .and_then(|n| n.text())
        .map(str::trim)
        .unwrap_or_default();

    if raw.is_empty() {
        return String::new();
    }

    if cell_type == "s" {
        // Shared-string reference; fall back to the raw index text when the
        // index is out of range or unparsable.
        return raw
            .parse::<usize>()
            .ok()
            .and_then(|idx| shared_strings.get(idx))
            .cloned()
            .unwrap_or_else(|| raw.to_string());
    }

    raw.to_string()
}
747
/// Opens a ZIP-based container (docx/xlsx/pptx/odf/epub) for reading.
fn open_zip(path: &Path) -> Result<ZipArchive<File>> {
    let file = File::open(path)
        .with_context(|| format!("failed to open zip container {}", path.display()))?;
    ZipArchive::new(file)
        .with_context(|| format!("failed to read zip container {}", path.display()))
}
754
/// Reads a named zip entry as a UTF-8 string. Errors when the entry is
/// missing or its contents are not valid UTF-8.
fn read_zip_entry(zip: &mut ZipArchive<File>, name: &str) -> Result<String> {
    let mut file = zip
        .by_name(name)
        .with_context(|| format!("zip entry not found: {name}"))?;
    let mut buf = String::new();
    file.read_to_string(&mut buf)
        .with_context(|| format!("failed to read zip entry: {name}"))?;
    Ok(buf)
}
764
/// Flattens arbitrary XML to text: text nodes are concatenated with spaces,
/// and a newline is emitted after block-like tags (see `needs_newline`).
fn extract_xml_text(xml: &str) -> Result<String> {
    let doc = Document::parse(xml).context("failed to parse XML")?;
    let mut out = String::new();
    // Tracks whether the output already ends with separating whitespace,
    // so consecutive inline text nodes get exactly one space between them.
    let mut last_was_space = true;

    for node in doc.descendants() {
        if let Some(text) = node.text() {
            let trimmed = text.trim();
            if trimmed.is_empty() {
                continue;
            }
            if !last_was_space && !needs_newline(node.tag_name().name()) {
                out.push(' ');
            }
            out.push_str(trimmed);
            if needs_newline(node.tag_name().name()) {
                out.push('\n');
                last_was_space = true;
            } else {
                last_was_space = false;
            }
        }
    }

    Ok(normalize_text(&out))
}
791
/// Tags that terminate a line when flattening XML/HTML to plain text.
fn needs_newline(tag: &str) -> bool {
    [
        "p",
        "div",
        "br",
        "section",
        "li",
        "tr",
        "row",
        "sheetData",
        "worksheet",
        "text-box",
    ]
    .contains(&tag)
}
806
/// Normalizes whitespace: runs of spaces within a line collapse to one
/// space, consecutive non-empty lines stay separated by a single '\n', and
/// one or more blank lines collapse to exactly one paragraph break
/// ("\n\n"). Leading/trailing whitespace is trimmed.
///
/// Preserving the "\n\n" break is what makes `text_blocks` work: it splits
/// on "\n\n". The previous version pushed a single '\n' for a blank line,
/// so "\n\n" never appeared in the output (its own `ends_with("\n\n")`
/// guard was dead code) and all input collapsed into one block.
fn normalize_text(text: &str) -> String {
    let mut out = String::new();
    let mut blank_lines = 0usize;

    for line in text.lines() {
        let line = line.split_whitespace().collect::<Vec<_>>().join(" ");
        if line.is_empty() {
            blank_lines += 1;
            // Emit the paragraph break once per run of blank lines.
            if blank_lines <= 1 && !out.is_empty() && !out.ends_with("\n\n") {
                out.push_str("\n\n");
            }
            continue;
        }

        blank_lines = 0;
        if !out.is_empty() && !out.ends_with('\n') {
            out.push('\n');
        }
        out.push_str(&line);
    }

    out.trim().to_string()
}
830
/// Uses the file name (with extension) as a human-readable title; `None`
/// when the path has no file name or it is not valid UTF-8.
fn file_title(path: &Path) -> Option<String> {
    let name = path.file_name()?.to_str()?;
    Some(name.to_owned())
}
836
/// Final gate for every parser: an empty document becomes an error so
/// callers never receive a document with zero blocks.
fn ensure_document(doc: ParsedDocument, path: &Path) -> Result<ParsedDocument> {
    if doc.is_empty() {
        anyhow::bail!("no extractable text found in {}", path.display());
    }
    Ok(doc)
}
843
/// Minimal recursive MIME part: leaf parts carry a decoded `body`,
/// multipart containers carry child `parts` and an empty body.
#[derive(Debug, Default, Clone)]
struct EmailPart {
    // Header name -> unfolded header value.
    headers: std::collections::HashMap<String, String>,
    // Raw Content-Type header value (defaults to "text/plain; charset=utf-8").
    content_type: String,
    // Decoded body text; empty for multipart containers.
    body: String,
    // Child parts for multipart messages; empty for leaf parts.
    parts: Vec<EmailPart>,
}
851
/// Recursively parses a raw RFC 822 message (or MIME sub-part): splits
/// headers from body, then either descends into multipart children (when a
/// boundary parameter is present) or decodes the leaf body.
fn parse_email_part(raw: &str) -> EmailPart {
    let (header_block, body_block) = split_headers_body(raw);
    let headers = parse_headers(header_block);
    let content_type = headers
        .get("Content-Type")
        .cloned()
        .unwrap_or_else(|| "text/plain; charset=utf-8".to_string());
    let encoding = headers
        .get("Content-Transfer-Encoding")
        .cloned()
        .unwrap_or_default();

    if let Some(boundary) = extract_content_type_param(&content_type, "boundary") {
        // Multipart: the container's own body text is discarded.
        let parts = split_multipart_body(body_block, &boundary)
            .into_iter()
            .map(|part| parse_email_part(&part))
            .collect::<Vec<_>>();
        EmailPart {
            headers,
            content_type,
            body: String::new(),
            parts,
        }
    } else {
        let decoded = decode_email_body(body_block, &encoding);
        EmailPart {
            headers,
            content_type,
            body: decoded,
            parts: Vec::new(),
        }
    }
}
885
/// Splits a message at the first blank line (CRLF convention preferred,
/// then bare LF). Without a blank line, everything is treated as the body.
fn split_headers_body(raw: &str) -> (&str, &str) {
    if let Some((headers, body)) = raw.split_once("\r\n\r\n") {
        return (headers, body);
    }
    if let Some((headers, body)) = raw.split_once("\n\n") {
        return (headers, body);
    }
    ("", raw)
}
895
/// Parses a header block into a name -> value map, unfolding RFC 5322
/// continuation lines (lines starting with space/tab extend the previous
/// header's value). Lines without a colon are ignored.
fn parse_headers(raw: &str) -> std::collections::HashMap<String, String> {
    let mut map = std::collections::HashMap::new();
    let mut key_in_progress: Option<String> = None;
    let mut value_in_progress = String::new();

    for line in raw.lines() {
        let is_continuation = line.starts_with(' ') || line.starts_with('\t');
        if is_continuation {
            if !value_in_progress.is_empty() {
                value_in_progress.push(' ');
            }
            value_in_progress.push_str(line.trim());
            continue;
        }

        // A fresh header line finishes the previous one, if any.
        if let Some(finished) = key_in_progress.take() {
            map.insert(finished, value_in_progress.trim().to_string());
            value_in_progress.clear();
        }

        if let Some((name, value)) = line.split_once(':') {
            key_in_progress = Some(name.trim().to_string());
            value_in_progress.push_str(value.trim());
        }
    }

    if let Some(finished) = key_in_progress {
        map.insert(finished, value_in_progress.trim().to_string());
    }

    map
}
927
/// Returns the value of a Content-Type parameter (e.g. `boundary` in
/// `multipart/mixed; boundary="abc"`), unquoted and trimmed.
///
/// Segments without '=' are skipped; the previous version used `?` on
/// `split_once`, which returned `None` for the whole header as soon as one
/// malformed segment appeared before the target parameter.
fn extract_content_type_param(content_type: &str, name: &str) -> Option<String> {
    for part in content_type.split(';').skip(1) {
        let Some((key, value)) = part.split_once('=') else {
            continue;
        };
        if key.trim().eq_ignore_ascii_case(name) {
            return Some(value.trim().trim_matches('"').to_string());
        }
    }
    None
}
937
/// Splits a multipart body into its raw parts using the boundary delimiter
/// lines (`--boundary`), stopping at the terminator (`--boundary--`). Any
/// preamble before the first delimiter is discarded.
fn split_multipart_body(body: &str, boundary: &str) -> Vec<String> {
    let delimiter = format!("--{boundary}");
    let terminator = format!("--{boundary}--");
    let unified = body.replace("\r\n", "\n");

    let mut parts = Vec::new();
    let mut buffer: Vec<&str> = Vec::new();
    let mut seen_delimiter = false;

    for line in unified.lines() {
        if line == terminator {
            if seen_delimiter && !buffer.is_empty() {
                parts.push(buffer.join("\n"));
            }
            break;
        }
        if line == delimiter {
            if seen_delimiter && !buffer.is_empty() {
                parts.push(buffer.join("\n"));
                buffer.clear();
            }
            seen_delimiter = true;
        } else if seen_delimiter {
            buffer.push(line);
        }
    }

    parts
}
968
/// Normalizes line endings and decodes the transfer encoding (base64 or
/// quoted-printable; any other encoding is passed through unchanged).
fn decode_email_body(body: &str, encoding: &str) -> String {
    let normalized = body.replace("\r\n", "\n");
    let decoded = if encoding.eq_ignore_ascii_case("base64") {
        // Undecodable base64 falls back to the raw text.
        decode_base64_text(&normalized).unwrap_or(normalized)
    } else if encoding.eq_ignore_ascii_case("quoted-printable") {
        decode_quoted_printable(&normalized)
    } else {
        normalized
    };

    decoded.trim().to_string()
}
981
/// Decodes line-wrapped base64 into UTF-8 text; `None` when the input is
/// not valid base64 (standard alphabet) or not valid UTF-8.
fn decode_base64_text(input: &str) -> Option<String> {
    // Body base64 is usually wrapped at 76 columns; strip the line breaks.
    let compact = input.lines().map(str::trim).collect::<String>();
    let bytes = base64::engine::general_purpose::STANDARD
        .decode(compact)
        .ok()?;
    String::from_utf8(bytes).ok()
}
989
/// Decodes a quoted-printable body (RFC 2045 §6.7): `=XX` hex escapes,
/// soft line breaks (`=` at end of line), and `_` as space. Malformed
/// escapes are passed through verbatim.
fn decode_quoted_printable(input: &str) -> String {
    let bytes = input.as_bytes();
    let mut out = Vec::with_capacity(bytes.len());
    let mut i = 0;

    while i < bytes.len() {
        match bytes[i] {
            b'=' => {
                // Soft line break: "=\n" or "=\r\n" disappears entirely.
                if i + 1 < bytes.len() && bytes[i + 1] == b'\n' {
                    i += 2;
                    continue;
                }
                if i + 2 < bytes.len() && bytes[i + 1] == b'\r' && bytes[i + 2] == b'\n' {
                    i += 3;
                    continue;
                }
                // "=XX" hex escape. Decode from the byte slice: the previous
                // `&input[i + 1..i + 3]` string slice panicked when malformed
                // input put a multi-byte UTF-8 character right after '='.
                if i + 2 < bytes.len() {
                    if let Ok(hex) = std::str::from_utf8(&bytes[i + 1..i + 3]) {
                        if let Ok(byte) = u8::from_str_radix(hex, 16) {
                            out.push(byte);
                            i += 3;
                            continue;
                        }
                    }
                }
                // Malformed escape: keep the '=' literally.
                out.push(bytes[i]);
                i += 1;
            }
            b'_' => {
                out.push(b' ');
                i += 1;
            }
            b => {
                out.push(b);
                i += 1;
            }
        }
    }

    String::from_utf8_lossy(&out).into_owned()
}
1030
/// Picks a displayable body from the part tree: recurses into multipart
/// children first, renders HTML parts to text, and returns text parts
/// verbatim. Non-text leaf parts (attachments) yield an empty string.
///
/// NOTE(review): despite the `preferred_plain` name, this takes the first
/// non-empty child in document order, not specifically text/plain —
/// confirm whether a plain-text preference was intended.
fn collect_best_mail_body(part: &EmailPart) -> String {
    if !part.parts.is_empty() {
        let preferred_plain = part
            .parts
            .iter()
            .map(collect_best_mail_body)
            .find(|body| !body.trim().is_empty());
        if let Some(body) = preferred_plain {
            return body;
        }
    }

    if part
        .content_type
        .to_ascii_lowercase()
        .starts_with("text/html")
    {
        // Render HTML to text; on failure fall back to the raw markup.
        return html2text::from_read(part.body.as_bytes(), 80)
            .unwrap_or_else(|_| part.body.clone())
            .trim()
            .to_string();
    }

    if part.content_type.is_empty() || part.content_type.to_ascii_lowercase().starts_with("text/") {
        return part.body.trim().to_string();
    }

    String::new()
}
1060
/// Very small RTF-to-text converter: drops group braces and control words,
/// decodes `\'xx` hex escapes, and maps `\par`/`\line` to newlines.
///
/// NOTE(review): `\'xx` bytes are pushed as `byte as char`, i.e. interpreted
/// as Latin-1 regardless of the document's declared code page — confirm
/// this is acceptable for the expected inputs.
fn strip_rtf(input: &str) -> String {
    let mut out = String::new();
    let mut chars = input.chars().peekable();

    while let Some(ch) = chars.next() {
        match ch {
            // Group delimiters carry no text.
            '{' | '}' => {}
            '\\' => match chars.peek().copied() {
                // Escaped literal characters: \\ \{ \}
                Some('\\') | Some('{') | Some('}') => {
                    out.push(chars.next().unwrap_or_default());
                }
                // \'xx — two-digit hex escape.
                Some('\'') => {
                    chars.next();
                    let hi = chars.next();
                    let lo = chars.next();
                    if let (Some(hi), Some(lo)) = (hi, lo) {
                        let hex = format!("{hi}{lo}");
                        if let Ok(byte) = u8::from_str_radix(&hex, 16) {
                            out.push(byte as char);
                        }
                    }
                }
                // Control word: letters, optional numeric argument, then an
                // optional single space that belongs to the control word.
                Some(_) => {
                    let mut word = String::new();
                    while let Some(c) = chars.peek().copied() {
                        if c.is_ascii_alphabetic() {
                            word.push(c);
                            chars.next();
                        } else {
                            break;
                        }
                    }
                    while let Some(c) = chars.peek().copied() {
                        if c.is_ascii_digit() || c == '-' {
                            chars.next();
                        } else {
                            break;
                        }
                    }
                    if chars.peek() == Some(&' ') {
                        chars.next();
                    }
                    // Only paragraph/line breaks produce output.
                    if matches!(word.as_str(), "par" | "line") {
                        out.push('\n');
                    }
                }
                None => break,
            },
            '\r' => {}
            '\n' => out.push('\n'),
            _ => out.push(ch),
        }
    }

    normalize_text(&out)
}
1117
1118#[cfg(test)]
1119mod tests {
1120 use super::*;
1121 use std::io::Write;
1122 use tempfile::TempDir;
1123 use zip::write::FileOptions;
1124
    /// OCR stub that returns a fixed, configurable result.
    struct MockOcrProvider {
        // The text `ocr_pdf` should report; `None` simulates "no OCR output".
        text: Option<String>,
    }

    impl DefaultParserOcrProvider for MockOcrProvider {
        fn name(&self) -> &str {
            "mock-ocr"
        }

        fn ocr_pdf(
            &self,
            _path: &Path,
            _config: &crate::config::DefaultParserOcrConfig,
        ) -> Result<Option<String>> {
            Ok(self.text.clone())
        }
    }
1142
    /// Writes `content` to `name` inside the temp dir and returns the path.
    fn write_file(dir: &TempDir, name: &str, content: &str) -> std::path::PathBuf {
        let path = dir.path().join(name);
        std::fs::write(&path, content).unwrap();
        path
    }
1148
    /// Builds a zip file at `name` in the temp dir from (entry, content)
    /// pairs and returns its path.
    fn write_zip(dir: &TempDir, name: &str, entries: &[(&str, &str)]) -> std::path::PathBuf {
        let path = dir.path().join(name);
        let file = File::create(&path).unwrap();
        let mut zip = zip::ZipWriter::new(file);
        let options = FileOptions::default();

        for (entry, content) in entries {
            zip.start_file(*entry, options).unwrap();
            zip.write_all(content.as_bytes()).unwrap();
        }

        zip.finish().unwrap();
        path
    }
1163
    // HTML: h1 should become a Heading block and all text should survive.
    #[test]
    fn parses_html() {
        let dir = TempDir::new().unwrap();
        let path = write_file(
            &dir,
            "sample.html",
            "<html><body><h1>Hello</h1><p>World</p></body></html>",
        );
        let doc = parse_html_document(&path).unwrap();
        assert!(doc
            .blocks
            .iter()
            .any(|block| block.kind == DocumentBlockKind::Heading));
        assert!(doc.to_text().contains("Hello"));
        assert!(doc.to_text().contains("World"));
    }
1180
    // DOCX: w:p/w:t paragraphs are extracted; a short first paragraph is
    // classified as a heading by the heuristic.
    #[test]
    fn parses_docx_like_zip() {
        let dir = TempDir::new().unwrap();
        let path = write_zip(
            &dir,
            "sample.docx",
            &[(
                "word/document.xml",
                r#"<w:document xmlns:w="urn:test"><w:body><w:p><w:r><w:t>Hello</w:t></w:r></w:p><w:p><w:r><w:t>World</w:t></w:r></w:p></w:body></w:document>"#,
            )],
        );
        let doc = parse_docx(&path).unwrap();
        assert!(doc
            .blocks
            .iter()
            .any(|block| block.kind == DocumentBlockKind::Heading));
        assert!(doc.to_text().contains("Hello"));
        assert!(doc.to_text().contains("World"));
    }
1200
1201 #[test]
1202 fn parses_xlsx_shared_strings_and_inline_cells() {
1203 let dir = TempDir::new().unwrap();
1204 let path = write_zip(
1205 &dir,
1206 "sample.xlsx",
1207 &[
1208 (
1209 "xl/sharedStrings.xml",
1210 r#"<sst xmlns="urn:test"><si><t>Name</t></si><si><t>Alice</t></si></sst>"#,
1211 ),
1212 (
1213 "xl/worksheets/sheet1.xml",
1214 r#"<worksheet xmlns="urn:test"><sheetData><row r="1"><c r="A1" t="s"><v>0</v></c><c r="B1" t="inlineStr"><is><t>Score</t></is></c></row><row r="2"><c r="A2" t="s"><v>1</v></c><c r="B2"><v>42</v></c></row></sheetData></worksheet>"#,
1215 ),
1216 ],
1217 );
1218 let text = parse_xlsx(&path).unwrap().to_text();
1219 assert!(text.contains("Name"));
1220 assert!(text.contains("Score"));
1221 assert!(text.contains("Alice"));
1222 assert!(text.contains("42"));
1223 }
1224
1225 #[test]
1226 fn parses_pptx_slides() {
1227 let dir = TempDir::new().unwrap();
1228 let path = write_zip(
1229 &dir,
1230 "slides.pptx",
1231 &[(
1232 "ppt/slides/slide1.xml",
1233 r#"<p:sld xmlns:p="urn:test" xmlns:a="urn:test-a"><p:cSld><p:spTree><p:sp><p:txBody><a:p><a:r><a:t>Quarterly Review</a:t></a:r></a:p></p:txBody></p:sp></p:spTree></p:cSld></p:sld>"#,
1234 )],
1235 );
1236 let doc = parse_pptx(&path).unwrap();
1237 assert!(doc
1238 .blocks
1239 .iter()
1240 .any(|block| block.kind == DocumentBlockKind::Heading));
1241 assert!(doc
1242 .blocks
1243 .iter()
1244 .any(|block| block.location.as_ref().and_then(|loc| loc.page) == Some(1)));
1245 assert!(doc.to_text().contains("Quarterly Review"));
1246 }
1247
1248 #[test]
1249 fn parses_odf_content() {
1250 let dir = TempDir::new().unwrap();
1251 let path = write_zip(
1252 &dir,
1253 "document.odt",
1254 &[(
1255 "content.xml",
1256 r#"<office:document-content xmlns:office="urn:test" xmlns:text="urn:test-text"><office:body><office:text><text:p>Hello ODF</text:p><text:p>Second line</text:p></office:text></office:body></office:document-content>"#,
1257 )],
1258 );
1259 let doc = parse_odf(&path).unwrap();
1260 assert!(doc
1261 .blocks
1262 .iter()
1263 .any(|block| block.kind == DocumentBlockKind::Paragraph));
1264 assert!(doc.to_text().contains("Hello ODF"));
1265 assert!(doc.to_text().contains("Second line"));
1266 }
1267
1268 #[test]
1269 fn parses_epub_html_entries() {
1270 let dir = TempDir::new().unwrap();
1271 let path = write_zip(
1272 &dir,
1273 "book.epub",
1274 &[(
1275 "OPS/ch1.xhtml",
1276 "<html><body><p>Chapter One</p></body></html>",
1277 )],
1278 );
1279 let doc = parse_epub(&path).unwrap();
1280 assert!(doc
1281 .blocks
1282 .iter()
1283 .any(|block| block.kind == DocumentBlockKind::Paragraph));
1284 assert!(doc.to_text().contains("Chapter One"));
1285 }
1286
1287 #[test]
1288 fn parses_plain_eml() {
1289 let dir = TempDir::new().unwrap();
1290 let path = write_file(
1291 &dir,
1292 "mail.eml",
1293 "Subject: Hello\nFrom: alice@example.com\nTo: bob@example.com\nContent-Type: text/plain; charset=utf-8\n\nThis is a plain email body.\n",
1294 );
1295 let text = parse_eml(&path).unwrap().to_text();
1296 assert!(text.contains("Subject: Hello"));
1297 assert!(text.contains("alice@example.com"));
1298 assert!(text.contains("This is a plain email body."));
1299 }
1300
1301 #[test]
1302 fn parsed_text_document_sets_block_locations() {
1303 let dir = TempDir::new().unwrap();
1304 let path = write_file(&dir, "notes.rtf", "{\\rtf1\\ansi Hello \\par World}");
1305 let doc = parsed_text_document(
1306 &path,
1307 parse_rtf(&path).unwrap(),
1308 DocumentBlockKind::Paragraph,
1309 )
1310 .unwrap();
1311 assert!(doc.blocks.iter().enumerate().all(|(idx, block)| {
1312 block.location.as_ref().and_then(|loc| loc.ordinal) == Some(idx + 1)
1313 }));
1314 }
1315
1316 #[test]
1317 fn parses_xml_document_into_structured_blocks() {
1318 let dir = TempDir::new().unwrap();
1319 let path = write_file(
1320 &dir,
1321 "sample.xml",
1322 "<root><title>Spec</title><section><p>Intro text</p><p>More text</p></section></root>",
1323 );
1324 let doc = parse_xml_document(&path).unwrap();
1325 assert!(doc.title.as_deref() == Some("sample.xml") || doc.title.as_deref() == Some("Spec"));
1326 assert!(doc
1327 .blocks
1328 .iter()
1329 .any(|block| block.kind == DocumentBlockKind::Paragraph));
1330 assert!(doc.to_text().contains("Intro text"));
1331 }
1332
1333 #[test]
1334 fn parses_multipart_eml_with_html_and_quoted_printable() {
1335 let dir = TempDir::new().unwrap();
1336 let path = write_file(
1337 &dir,
1338 "multipart.eml",
1339 concat!(
1340 "Subject: Multipart Test\n",
1341 "From: sender@example.com\n",
1342 "To: receiver@example.com\n",
1343 "Content-Type: multipart/alternative; boundary=\"abc123\"\n",
1344 "\n",
1345 "--abc123\n",
1346 "Content-Type: text/plain; charset=utf-8\n",
1347 "Content-Transfer-Encoding: quoted-printable\n",
1348 "\n",
1349 "Hello=20World=21\n",
1350 "--abc123\n",
1351 "Content-Type: text/html; charset=utf-8\n",
1352 "\n",
1353 "<html><body><p>Ignored HTML fallback</p></body></html>\n",
1354 "--abc123--\n"
1355 ),
1356 );
1357 let text = parse_eml(&path).unwrap().to_text();
1358 assert!(text.contains("Subject: Multipart Test"));
1359 assert!(text.contains("Hello World!"));
1360 }
1361
1362 #[test]
1363 fn strips_rtf_control_words() {
1364 let text = strip_rtf(r"{\rtf1\ansi Hello \par World}");
1365 assert!(text.contains("Hello"));
1366 assert!(text.contains("World"));
1367 }
1368
1369 #[test]
1370 fn pdf_ocr_heuristic_detects_weak_text() {
1371 let config = crate::config::DefaultParserConfig {
1372 enabled: true,
1373 max_file_size_mb: 50,
1374 ocr: Some(crate::config::DefaultParserOcrConfig {
1375 enabled: true,
1376 ..Default::default()
1377 }),
1378 };
1379
1380 assert!(should_attempt_pdf_ocr("", &config));
1381 assert!(should_attempt_pdf_ocr("%%% ---", &config));
1382 assert!(!should_attempt_pdf_ocr(
1383 "This is a reasonably healthy PDF text extraction with enough words and letters to avoid OCR fallback across multiple paragraphs and sections of the document body.",
1384 &config
1385 ));
1386 }
1387
1388 #[test]
1389 fn pdf_ocr_fallback_uses_provider_when_text_is_weak() {
1390 let dir = TempDir::new().unwrap();
1391 let path = write_file(&dir, "sample.pdf", "not-a-real-pdf");
1392 let config = crate::config::DefaultParserConfig {
1393 enabled: true,
1394 max_file_size_mb: 50,
1395 ocr: Some(crate::config::DefaultParserOcrConfig {
1396 enabled: true,
1397 ..Default::default()
1398 }),
1399 };
1400 let provider = MockOcrProvider {
1401 text: Some("OCR recovered text".to_string()),
1402 };
1403
1404 let text = maybe_run_pdf_ocr(&path, String::new(), &config, Some(&provider)).unwrap();
1405 assert_eq!(text, "OCR recovered text");
1406 }
1407
1408 #[test]
1409 fn pdf_ocr_fallback_preserves_extracted_text_without_provider() {
1410 let dir = TempDir::new().unwrap();
1411 let path = write_file(&dir, "sample.pdf", "not-a-real-pdf");
1412 let config = crate::config::DefaultParserConfig {
1413 enabled: true,
1414 max_file_size_mb: 50,
1415 ocr: Some(crate::config::DefaultParserOcrConfig {
1416 enabled: true,
1417 ..Default::default()
1418 }),
1419 };
1420
1421 let text = maybe_run_pdf_ocr(&path, "weak".to_string(), &config, None).unwrap();
1422 assert_eq!(text, "weak");
1423 }
1424
1425 #[test]
1426 fn default_parser_can_hold_ocr_provider() {
1427 let parser = DefaultParser::with_config_and_ocr(
1428 crate::config::DefaultParserConfig::default(),
1429 Arc::new(MockOcrProvider { text: None }),
1430 );
1431 assert!(parser.ocr_provider().is_some());
1432 }
1433}