dbmd_core/extract.rs
1//! Document text extraction — the `dbmd extract` engine.
2//!
3//! `sources/` is where raw evidence lands: invoices, contracts, reports,
4//! exports. Most of it arrives as binary documents (PDF, Word, Excel, EPUB) or
5//! HTML, not markdown. Before an agent can reason over that evidence — wiki-link
6//! it, summarize it into the wiki layer, file a typed record that cites it — the
7//! text has to come out. This module is that step: a binary document in, plain
8//! UTF-8 text out, format chosen by file extension.
9//!
10//! # What this is, and is not
11//!
12//! - **Deterministic decoders only.** Every adapter is a format parser
13//! (`pdf-extract`, `calamine`, `html2text`, `quick-xml`+`zip`). There is **no
14//! AI, no OCR, no embeddings** here — consistent with the crate-wide invariant
15//! (`lib.rs`). The agent driving `dbmd` is the semantic layer; this is plumbing.
16//! - **Text layer, not pixels.** A scanned PDF with no text layer yields the
17//! empty string — *empty in, empty out, never hallucinated text.* OCR is an
18//! explicit non-goal (a future `dbmd-ocr`).
19//! - **Single document, single call.** [`extract`] handles one file. Walking a
20//! store and extracting every document is the caller's loop, not this module's.
21//!
22//! # Format dispatch
23//!
24//! [`Format::from_path`] maps the file extension to an adapter; [`extract`]
25//! dispatches:
26//!
27//! | Extension | Format | Adapter |
28//! |--------------------------|-------------------|----------------------------------|
29//! | `.pdf` | [`Format::Pdf`] | `pdf-extract` |
30//! | `.docx` | [`Format::Docx`] | `zip` + `quick-xml` (`w:t` runs) |
31//! | `.xlsx` / `.xlsm` / `.xlsb` / `.ods` | [`Format::Spreadsheet`] | `calamine` |
32//! | `.epub` | [`Format::Epub`] | `zip` + `quick-xml` + `html2text`|
33//! | `.html` / `.htm` / `.xhtml` | [`Format::Html`] | `html2text` |
34//!
35//! Anything else is [`ExtractError::UnsupportedFormat`] — a typed refusal the
36//! CLI surfaces with a stable code, never a panic.
37
38use std::collections::BTreeMap;
39use std::io::Read;
40use std::panic::{catch_unwind, AssertUnwindSafe};
41use std::path::Path;
42
43use serde::Serialize;
44
/// The result of extracting one document: the plain text plus a small,
/// format-tagged metadata map.
///
/// This is the `--json` shape the CLI emits verbatim (`{text, metadata}`); in
/// plain mode the CLI prints [`Extracted::text`] and discards the metadata.
/// Metadata is intentionally minimal and best-effort — extraction never *fails*
/// for want of a title; it just omits the key.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct Extracted {
    /// The extracted plain text (UTF-8), normalized to `\n` line endings with
    /// trailing whitespace trimmed per line and a single trailing newline. For
    /// a document with no recoverable text layer (e.g. a scanned, image-only
    /// PDF) this is the empty string — the contract is "empty in, empty out."
    pub text: String,

    /// Best-effort key/value metadata. Always carries `format` (the adapter
    /// that ran, e.g. `"pdf"`). Adapters add what they cheaply know:
    /// `pages`/`sheets`/`chapters` (counts), `sheet_names` (the sheet names
    /// joined into one string), and `title` (when the container declares
    /// one). A `BTreeMap` so `--json` output is key-ordered and stable.
    pub metadata: BTreeMap<String, MetaValue>,
}
66
67impl Extracted {
68 /// Build an [`Extracted`] from raw adapter text + the detected format,
69 /// applying the canonical text normalization ([`normalize_text`]) and
70 /// seeding the `format` metadata key.
71 fn new(raw_text: String, format: Format) -> Self {
72 let mut metadata = BTreeMap::new();
73 metadata.insert(
74 "format".to_string(),
75 MetaValue::Str(format.tag().to_string()),
76 );
77 Extracted {
78 text: normalize_text(&raw_text),
79 metadata,
80 }
81 }
82
83 /// Insert a string metadata key only when the value is non-empty (keeps the
84 /// map free of empty `title: ""` noise).
85 fn put_str(&mut self, key: &str, value: impl Into<String>) {
86 let v = value.into();
87 if !v.trim().is_empty() {
88 self.metadata.insert(key.to_string(), MetaValue::Str(v));
89 }
90 }
91
92 /// Insert a numeric (count) metadata key.
93 fn put_num(&mut self, key: &str, value: u64) {
94 self.metadata.insert(key.to_string(), MetaValue::Num(value));
95 }
96}
97
/// A metadata value: a string (title, format tag, sheet name list joined) or a
/// non-negative count (pages, sheets, chapters). Marked `#[serde(untagged)]`
/// so it serializes to a bare JSON string or number — no wrapper object — and
/// `{text, metadata}` stays flat and readable.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
pub enum MetaValue {
    /// A textual value (e.g. document title, the `format` tag).
    Str(String),
    /// A non-negative count (e.g. page count, sheet count).
    Num(u64),
}
109
/// The document formats `dbmd extract` understands, one per adapter. Detected
/// from the file extension by [`Format::from_path`]; each variant has a stable
/// short name available through [`Format::tag`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// Portable Document Format (`.pdf`) — text layer via `pdf-extract`.
    Pdf,
    /// Office Open XML WordprocessingML (`.docx`) — `w:t` runs via `quick-xml`.
    Docx,
    /// A spreadsheet (`.xlsx`/`.xlsm`/`.xlsb`/`.ods`) — cells via `calamine`.
    Spreadsheet,
    /// EPUB e-book (`.epub`) — spine XHTML via `zip` + `quick-xml` + `html2text`.
    Epub,
    /// HTML (`.html`/`.htm`/`.xhtml`) — plain text via `html2text`.
    Html,
}
125
126impl Format {
127 /// Detect the format from a path's extension (case-insensitive). Returns
128 /// `None` for an unrecognized or missing extension; [`extract`] turns that
129 /// into [`ExtractError::UnsupportedFormat`] with the offending extension.
130 pub fn from_path(path: &Path) -> Option<Format> {
131 let ext = path.extension()?.to_str()?.to_ascii_lowercase();
132 Some(match ext.as_str() {
133 "pdf" => Format::Pdf,
134 "docx" => Format::Docx,
135 "xlsx" | "xlsm" | "xlsb" | "ods" => Format::Spreadsheet,
136 "epub" => Format::Epub,
137 "html" | "htm" | "xhtml" => Format::Html,
138 _ => return None,
139 })
140 }
141
142 /// The short, stable tag recorded in `metadata.format` and used in error
143 /// messages. Distinct from the file extension (one tag can cover several
144 /// extensions, e.g. `spreadsheet`).
145 pub fn tag(self) -> &'static str {
146 match self {
147 Format::Pdf => "pdf",
148 Format::Docx => "docx",
149 Format::Spreadsheet => "spreadsheet",
150 Format::Epub => "epub",
151 Format::Html => "html",
152 }
153 }
154}
155
/// Errors from document extraction. Every variant is a typed refusal the CLI
/// maps to a stable machine code (see [`ExtractError::code`]) — extraction
/// never panics on a bad or encrypted input.
#[derive(Debug, thiserror::Error)]
pub enum ExtractError {
    /// The file extension is missing or not one of the supported document
    /// formats. Carries the offending extension (or `""` when absent).
    #[error("unsupported document format: {0:?} (supported: pdf, docx, xlsx/xlsm/xlsb/ods, epub, html/htm/xhtml)")]
    UnsupportedFormat(String),

    /// The document is encrypted/password-protected and could not be opened
    /// without a password (or with the wrong one). A clean refusal — the
    /// extractor must never emit partial/garbled bytes for a locked file.
    /// Carries the underlying parser message.
    #[error("document is encrypted or password-protected: {0}")]
    Encrypted(String),

    /// A format adapter failed to parse a structurally invalid or corrupt
    /// document. Carries the adapter's diagnostic.
    #[error("failed to parse {format} document: {message}")]
    Parse {
        /// The format tag whose adapter failed (e.g. `"pdf"`, `"docx"`).
        format: &'static str,
        /// The underlying parser diagnostic.
        message: String,
    },

    /// An underlying I/O failure (file missing, unreadable, etc.). Converted
    /// automatically from `std::io::Error` by `?` via `#[from]`.
    #[error(transparent)]
    Io(#[from] std::io::Error),
}
186
187impl ExtractError {
188 /// A short, stable machine code for this error, mirrored at the CLI
189 /// boundary for `--json` output and exit-code mapping.
190 pub fn code(&self) -> &'static str {
191 match self {
192 ExtractError::UnsupportedFormat(_) => "UNSUPPORTED_FORMAT",
193 ExtractError::Encrypted(_) => "DOCUMENT_ENCRYPTED",
194 ExtractError::Parse { .. } => "EXTRACT_PARSE_ERROR",
195 ExtractError::Io(_) => "IO_ERROR",
196 }
197 }
198}
199
/// Result alias for extraction operations: `std::result::Result` with the
/// error type fixed to [`ExtractError`].
pub type Result<T> = std::result::Result<T, ExtractError>;
202
203/// Extract plain text (and best-effort metadata) from a document, choosing the
204/// adapter by the file's extension.
205///
206/// This is the single entry point the CLI calls. It reads exactly one file and
207/// returns one [`Extracted`]; there is no whole-store walk here (per the
208/// crate-wide O(changed) invariant — a store-wide extraction is the caller's
209/// loop). An unsupported extension is [`ExtractError::UnsupportedFormat`]; an
210/// encrypted PDF is [`ExtractError::Encrypted`]; neither panics.
211///
212/// # Examples
213///
214/// ```no_run
215/// use std::path::Path;
216/// let out = dbmd_core::extract::extract(Path::new("sources/docs/invoice.pdf"))?;
217/// println!("{}", out.text);
218/// # Ok::<(), dbmd_core::extract::ExtractError>(())
219/// ```
220pub fn extract(path: &Path) -> Result<Extracted> {
221 let format = Format::from_path(path).ok_or_else(|| {
222 let ext = path
223 .extension()
224 .and_then(|e| e.to_str())
225 .unwrap_or("")
226 .to_string();
227 ExtractError::UnsupportedFormat(ext)
228 })?;
229
230 match format {
231 Format::Pdf => extract_pdf(path),
232 Format::Docx => extract_docx(path),
233 Format::Spreadsheet => extract_spreadsheet(path),
234 Format::Epub => extract_epub(path),
235 Format::Html => extract_html(path),
236 }
237}
238
239// ─────────────────────────────────────────────────────────────────────────────
240// Text normalization
241// ─────────────────────────────────────────────────────────────────────────────
242
/// Canonicalize extracted text so output is stable across adapters:
///
/// 1. Normalize line endings to `\n` (`\r\n` and lone `\r` both become `\n`).
/// 2. Trim trailing whitespace on each line.
/// 3. Collapse every run of consecutive blank lines to a single blank line.
/// 4. Drop leading/trailing blank lines and end every remaining line with
///    exactly one `\n` (unless the whole text is empty, which stays empty —
///    the image-only-PDF contract).
///
/// This is *layout* tidy-up only; it never reorders or drops words. Word-level
/// content is whatever the adapter recovered.
///
/// Runs in a single pass over the lines, so a document that starts with a very
/// long run of blank lines costs linear, not quadratic, time.
pub fn normalize_text(raw: &str) -> String {
    let unix = raw.replace("\r\n", "\n").replace('\r', "\n");

    let mut out = String::with_capacity(unix.len());
    // True when at least one blank line has been seen since the last emitted
    // text line. It is only ever set once some text is already in `out`, which
    // is what drops leading blank lines; it is only flushed when another text
    // line follows, which is what drops trailing blank lines.
    let mut pending_blank = false;

    for line in unix.lines().map(str::trim_end) {
        if line.is_empty() {
            pending_blank = !out.is_empty();
            continue;
        }
        if pending_blank {
            // The whole blank run collapses to one empty line.
            out.push('\n');
            pending_blank = false;
        }
        out.push_str(line);
        out.push('\n');
    }
    out
}
288
289// ─────────────────────────────────────────────────────────────────────────────
290// PDF — pdf-extract
291// ─────────────────────────────────────────────────────────────────────────────
292
293/// Extract a PDF's text layer via `pdf-extract`.
294///
295/// A PDF with no text layer (a scanned image) yields the empty string — that is
296/// correct, not an error (OCR is out of scope). A password-protected PDF that
297/// cannot be opened is mapped to [`ExtractError::Encrypted`] rather than a raw
298/// parse error so the caller can branch on it. Metadata carries the page count
299/// when the document tree exposes it.
300///
301/// `pdf-extract`/`lopdf` `panic!` internally on some malformed-but-openable
302/// PDFs (e.g. an out-of-set base `/Encoding` name), so both parser calls are
303/// wrapped in [`std::panic::catch_unwind`]: an internal abort is contained and
304/// surfaced as [`ExtractError::Parse`], upholding this module's "never panics"
305/// contract on untrusted `sources/` input.
306fn extract_pdf(path: &Path) -> Result<Extracted> {
307 // Read the bytes ourselves so a missing/unreadable file is a clean
308 // `ExtractError::Io` (via `?`) before we hand anything to the PDF parser.
309 let bytes = std::fs::read(path)?;
310
311 let text = match guard_pdf_panic(|| pdf_extract::extract_text_from_mem(&bytes))? {
312 Ok(t) => t,
313 Err(e) => return Err(classify_pdf_error(e)),
314 };
315
316 let mut out = Extracted::new(text, Format::Pdf);
317
318 // Page count is best-effort; derive it from the parsed document. A parse
319 // failure OR an internal panic here is non-fatal — the text already
320 // succeeded — so a contained panic (outer `Err`) and a load failure (inner
321 // `Err`) are both silently skipped.
322 if let Ok(Ok(doc)) = guard_pdf_panic(|| pdf_extract::Document::load_mem(&bytes)) {
323 out.put_num("pages", doc.get_pages().len() as u64);
324 }
325
326 Ok(out)
327}
328
329/// Run a panic-prone `pdf-extract`/`lopdf` call, converting an internal unwind
330/// into a typed [`ExtractError::Parse`] tagged `pdf` so the module's "never
331/// panics" contract holds on adversarial PDFs. `AssertUnwindSafe` is sound: the
332/// closure borrows only `&[u8]`, and on a caught unwind we discard any partial
333/// state and return an owned error. The default panic hook still writes the
334/// panic line to stderr — library code must not mutate the process-global hook.
335fn guard_pdf_panic<T>(f: impl FnOnce() -> T) -> Result<T> {
336 catch_unwind(AssertUnwindSafe(f)).map_err(|_| ExtractError::Parse {
337 format: "pdf",
338 message: "pdf parser aborted on malformed input".to_string(),
339 })
340}
341
342/// Map a `pdf-extract` error onto the right [`ExtractError`] variant.
343/// Decryption failures become [`ExtractError::Encrypted`]; everything else is a
344/// [`ExtractError::Parse`] tagged `pdf`.
345fn classify_pdf_error(err: pdf_extract::OutputError) -> ExtractError {
346 let msg = err.to_string();
347 let lower = msg.to_ascii_lowercase();
348 if lower.contains("password") || lower.contains("decrypt") || lower.contains("encrypt") {
349 ExtractError::Encrypted(msg)
350 } else {
351 ExtractError::Parse {
352 format: "pdf",
353 message: msg,
354 }
355 }
356}
357
358// ─────────────────────────────────────────────────────────────────────────────
359// DOCX — zip + quick-xml (no docx-rs dependency; quick-xml is already needed
360// for epub, so docx, xlsx-via-calamine, and epub share one XML/zip surface)
361// ─────────────────────────────────────────────────────────────────────────────
362
363/// Extract a `.docx` (WordprocessingML) by unzipping `word/document.xml` and
364/// concatenating the `<w:t>` run text, one logical line per `<w:p>` paragraph.
365///
366/// `<w:tab/>` becomes a tab and `<w:br/>` / `<w:cr>` a newline so table-ish and
367/// line-broken content keeps its shape; everything else is structural and
368/// ignored. This is the same minimal-but-faithful path `docx-rs` takes for text
369/// extraction, without pulling in a second XML/zip stack.
370fn extract_docx(path: &Path) -> Result<Extracted> {
371 let file = std::fs::File::open(path)?;
372 let mut archive = open_zip(file, "docx")?;
373
374 let xml = read_zip_entry(&mut archive, "word/document.xml", "docx")?;
375 let text = wordprocessing_text(&xml, "docx")?;
376
377 Ok(Extracted::new(text, Format::Docx))
378}
379
380/// Pull paragraph text out of a WordprocessingML / DrawingML XML body.
381///
382/// Shared by [`extract_docx`]. Walks the event stream collecting `<w:t>` text;
383/// `<w:p>` ends a line, `<w:tab/>` is a tab, `<w:br>`/`<w:cr>` a newline.
384fn wordprocessing_text(xml: &str, format: &'static str) -> Result<String> {
385 use quick_xml::events::Event;
386 use quick_xml::reader::Reader;
387
388 let mut reader = Reader::from_str(xml);
389 let mut buf = Vec::new();
390 let mut out = String::new();
391 let mut in_text_run = false;
392
393 loop {
394 match reader.read_event_into(&mut buf) {
395 Ok(Event::Start(e)) => {
396 if local_name(e.name().as_ref()) == b"t" {
397 in_text_run = true;
398 }
399 }
400 Ok(Event::End(e)) => {
401 let name = e.name();
402 match local_name(name.as_ref()) {
403 b"t" => in_text_run = false,
404 b"p" => out.push('\n'),
405 _ => {}
406 }
407 }
408 Ok(Event::Empty(e)) => {
409 // Self-closing run-level breaks inside a paragraph.
410 match local_name(e.name().as_ref()) {
411 b"tab" => out.push('\t'),
412 b"br" | b"cr" => out.push('\n'),
413 _ => {}
414 }
415 }
416 // quick-xml 0.40 yields already-unescaped text in `Event::Text`.
417 Ok(Event::Text(t)) => {
418 if in_text_run {
419 out.push_str(&String::from_utf8_lossy(&t.into_inner()));
420 }
421 }
422 Ok(Event::Eof) => break,
423 Err(e) => {
424 return Err(ExtractError::Parse {
425 format,
426 message: format!("malformed XML: {e}"),
427 });
428 }
429 _ => {}
430 }
431 buf.clear();
432 }
433
434 Ok(out)
435}
436
/// Strip any namespace prefix from an XML qualified name, keeping what follows
/// the last `:` (`w:t` → `t`; a name without a colon is returned unchanged).
/// Matching on the local part makes the docx/epub parsers independent of
/// whichever prefix (`w:`, `dc:`, …) the document's writer picked.
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece; the first one is the tail
    // after the final colon, or the whole name when there is no colon.
    qname.rsplit(|&byte| byte == b':').next().unwrap_or(qname)
}
446
447// ─────────────────────────────────────────────────────────────────────────────
448// Spreadsheet — calamine (xlsx / xlsm / xlsb / ods)
449// ─────────────────────────────────────────────────────────────────────────────
450
451/// Extract every sheet of a spreadsheet via `calamine`, rendering each row as
452/// tab-separated cells, one row per line, sheets in workbook order separated by
453/// a blank line.
454///
455/// Cell rendering: text verbatim; integers and whole-valued floats without a
456/// trailing `.0` (`1200`, not `1200.0`); other floats via their default
457/// formatting; booleans as `TRUE`/`FALSE`; empty/error cells as the empty
458/// string. Metadata carries the sheet count and the joined sheet-name list.
459fn extract_spreadsheet(path: &Path) -> Result<Extracted> {
460 use calamine::{open_workbook_auto, Reader};
461
462 let mut workbook = open_workbook_auto(path).map_err(|e| ExtractError::Parse {
463 format: "spreadsheet",
464 message: e.to_string(),
465 })?;
466
467 let sheet_names = workbook.sheet_names().to_vec();
468 let mut text = String::new();
469
470 for (idx, name) in sheet_names.iter().enumerate() {
471 if idx > 0 {
472 text.push('\n'); // blank line between sheets
473 }
474 let range = workbook
475 .worksheet_range(name)
476 .map_err(|e| ExtractError::Parse {
477 format: "spreadsheet",
478 message: format!("sheet {name:?}: {e}"),
479 })?;
480
481 for row in range.rows() {
482 let cells: Vec<String> = row.iter().map(render_cell).collect();
483 text.push_str(&cells.join("\t"));
484 text.push('\n');
485 }
486 }
487
488 let mut out = Extracted::new(text, Format::Spreadsheet);
489 out.put_num("sheets", sheet_names.len() as u64);
490 if !sheet_names.is_empty() {
491 out.put_str("sheet_names", sheet_names.join(", "));
492 }
493 Ok(out)
494}
495
496/// Render one spreadsheet cell to its text form. Whole-valued floats drop the
497/// `.0` (so `3450.0` → `3450`), matching how spreadsheet apps display an
498/// integer-typed amount.
499fn render_cell(cell: &calamine::Data) -> String {
500 use calamine::Data;
501 match cell {
502 Data::Empty => String::new(),
503 Data::String(s) => s.clone(),
504 Data::Int(i) => i.to_string(),
505 Data::Float(f) => {
506 if f.fract() == 0.0 && f.is_finite() && f.abs() < 1e15 {
507 format!("{}", *f as i64)
508 } else {
509 f.to_string()
510 }
511 }
512 Data::Bool(b) => {
513 if *b {
514 "TRUE".to_string()
515 } else {
516 "FALSE".to_string()
517 }
518 }
519 Data::DateTime(dt) => dt.to_string(),
520 Data::DateTimeIso(s) => s.clone(),
521 Data::DurationIso(s) => s.clone(),
522 Data::Error(e) => format!("{e:?}"),
523 }
524}
525
526// ─────────────────────────────────────────────────────────────────────────────
527// EPUB — zip + quick-xml (spine order) + html2text (per-chapter)
528// ─────────────────────────────────────────────────────────────────────────────
529//
530// We do NOT use the `epub` crate: it is GPL-3.0, which violates the toolkit's
531// permissive-only license rule. An EPUB is a zip whose OPF package declares a
532// reading-order `spine`; each spine item is an XHTML document. zip + quick-xml
533// (already dependencies) read the container/OPF, and html2text (already a
534// dependency for `.html`) flattens each chapter. Same machinery, no GPL.
535
536/// Extract an EPUB's reading-order text:
537/// 1. read `META-INF/container.xml` → the OPF package path;
538/// 2. parse the OPF `manifest` (id→href) and `spine` (ordered idref list);
539/// 3. for each spine item, read its XHTML and flatten it with [`html_to_text`];
540/// 4. join chapters with a blank line.
541///
542/// Metadata carries `title` (the OPF `dc:title`) and `chapters` (spine length).
543fn extract_epub(path: &Path) -> Result<Extracted> {
544 let file = std::fs::File::open(path)?;
545 let mut archive = open_zip(file, "epub")?;
546
547 // 1. container.xml → OPF path.
548 let container = read_zip_entry(&mut archive, "META-INF/container.xml", "epub")?;
549 let opf_path = epub_opf_path(&container)?;
550
551 // 2. OPF → base dir, manifest, spine, title.
552 let opf = read_zip_entry(&mut archive, &opf_path, "epub")?;
553 let parsed = parse_opf(&opf)?;
554 let base = opf_base_dir(&opf_path);
555
556 // 3. Spine items in order → flattened chapter text.
557 let mut text = String::new();
558 let mut chapters = 0u64;
559 for idref in &parsed.spine {
560 let Some(href) = parsed.manifest.get(idref) else {
561 continue; // dangling spine ref; skip rather than fail
562 };
563 let entry = join_zip_path(&base, href);
564 // A missing spine target is skipped (best-effort), not fatal.
565 let Ok(chapter_xhtml) = read_zip_entry(&mut archive, &entry, "epub") else {
566 continue;
567 };
568 let chapter_text = html_to_text(chapter_xhtml.as_bytes())?;
569 if !chapter_text.trim().is_empty() {
570 if chapters > 0 {
571 text.push('\n');
572 }
573 text.push_str(&chapter_text);
574 text.push('\n');
575 chapters += 1;
576 }
577 }
578
579 let mut out = Extracted::new(text, Format::Epub);
580 out.put_num("chapters", chapters);
581 if let Some(title) = parsed.title {
582 out.put_str("title", title);
583 }
584 Ok(out)
585}
586
587/// The full-path of the OPF package file, read from `META-INF/container.xml`'s
588/// first `<rootfile full-path="…">`.
589fn epub_opf_path(container_xml: &str) -> Result<String> {
590 use quick_xml::events::Event;
591 use quick_xml::reader::Reader;
592
593 let mut reader = Reader::from_str(container_xml);
594 let mut buf = Vec::new();
595 loop {
596 match reader.read_event_into(&mut buf) {
597 Ok(Event::Start(e)) | Ok(Event::Empty(e)) => {
598 if local_name(e.name().as_ref()) == b"rootfile" {
599 if let Some(p) = attr_value(&e, b"full-path") {
600 return Ok(p);
601 }
602 }
603 }
604 Ok(Event::Eof) => break,
605 Err(e) => {
606 return Err(ExtractError::Parse {
607 format: "epub",
608 message: format!("container.xml: {e}"),
609 })
610 }
611 _ => {}
612 }
613 buf.clear();
614 }
615 Err(ExtractError::Parse {
616 format: "epub",
617 message: "container.xml has no <rootfile full-path>".to_string(),
618 })
619}
620
/// The parsed-out pieces of an OPF package we need for reading-order text.
/// Produced by [`parse_opf`] and consumed by [`extract_epub`].
struct OpfParsed {
    /// Manifest: item id → href (relative to the OPF's directory).
    manifest: BTreeMap<String, String>,
    /// Spine: ordered list of manifest item ids (the reading order).
    spine: Vec<String>,
    /// The first non-blank title text found in the package, if any.
    title: Option<String>,
}
630
631/// Parse an OPF package document into its manifest, spine, and title.
632fn parse_opf(opf_xml: &str) -> Result<OpfParsed> {
633 use quick_xml::events::Event;
634 use quick_xml::reader::Reader;
635
636 let mut reader = Reader::from_str(opf_xml);
637 let mut buf = Vec::new();
638
639 let mut manifest = BTreeMap::new();
640 let mut spine = Vec::new();
641 let mut title: Option<String> = None;
642 let mut in_title = false;
643
644 loop {
645 match reader.read_event_into(&mut buf) {
646 Ok(Event::Start(e)) | Ok(Event::Empty(e)) => match local_name(e.name().as_ref()) {
647 b"item" => {
648 if let (Some(id), Some(href)) = (attr_value(&e, b"id"), attr_value(&e, b"href"))
649 {
650 manifest.insert(id, href);
651 }
652 }
653 b"itemref" => {
654 if let Some(idref) = attr_value(&e, b"idref") {
655 spine.push(idref);
656 }
657 }
658 b"title" => in_title = true,
659 _ => {}
660 },
661 Ok(Event::End(e)) => {
662 if local_name(e.name().as_ref()) == b"title" {
663 in_title = false;
664 }
665 }
666 Ok(Event::Text(t)) => {
667 if in_title && title.is_none() {
668 let s = String::from_utf8_lossy(&t.into_inner()).trim().to_string();
669 if !s.is_empty() {
670 title = Some(s);
671 }
672 }
673 }
674 Ok(Event::Eof) => break,
675 Err(e) => {
676 return Err(ExtractError::Parse {
677 format: "epub",
678 message: format!("OPF: {e}"),
679 })
680 }
681 _ => {}
682 }
683 buf.clear();
684 }
685
686 Ok(OpfParsed {
687 manifest,
688 spine,
689 title,
690 })
691}
692
/// The directory part of an OPF path — everything before the last `/`
/// (`"OEBPS/content.opf"` → `"OEBPS"`), or the empty string when the path has
/// no `/` (`"content.opf"` → `""`). Manifest hrefs are resolved against this
/// directory inside the zip.
fn opf_base_dir(opf_path: &str) -> String {
    opf_path
        .rsplit_once('/')
        .map(|(dir, _file)| dir.to_string())
        .unwrap_or_default()
}
702
/// Join an OPF base dir with a manifest href into a zip entry name.
/// Forward-slash only — zip paths are always `/`-separated.
///
/// Leading `./` prefixes on the href are dropped. Relative segments in the
/// joined path are then resolved, because zip members are looked up by exact
/// name and an unresolved `OEBPS/package/../text/ch1.xhtml` would never match
/// the real `OEBPS/text/ch1.xhtml` entry:
///
/// - a `.` segment is removed;
/// - a `..` segment removes the segment before it, and is simply dropped when
///   there is nothing left to remove (the result never climbs above the
///   archive root).
///
/// Paths without `.`/`..` segments are returned exactly as joined.
fn join_zip_path(base: &str, href: &str) -> String {
    let href = href.trim_start_matches("./");
    let joined = if base.is_empty() {
        href.to_string()
    } else {
        format!("{base}/{href}")
    };

    // Fast path: nothing to resolve, keep the joined name byte-for-byte.
    if !joined.split('/').any(|seg| seg == "." || seg == "..") {
        return joined;
    }

    let mut resolved: Vec<&str> = Vec::new();
    for seg in joined.split('/') {
        match seg {
            "." => {}
            ".." => {
                resolved.pop();
            }
            other => resolved.push(other),
        }
    }
    resolved.join("/")
}
713
714// ─────────────────────────────────────────────────────────────────────────────
715// HTML — html2text + light markdown-decoration cleanup
716// ─────────────────────────────────────────────────────────────────────────────
717
718/// Extract plain text from an `.html` file.
719fn extract_html(path: &Path) -> Result<Extracted> {
720 let bytes = std::fs::read(path)?;
721 let text = html_to_text(&bytes)?;
722 Ok(Extracted::new(text, Format::Html))
723}
724
725/// Flatten an HTML/XHTML byte stream to clean plain text.
726///
727/// Uses `html2text`'s non-decorating plain renderer (which already drops
728/// `<script>`/`<style>`/comments and flattens lists), then strips the two
729/// markdown-ish decorations that renderer still emits — leading `#` heading
730/// markers and `[text]` link brackets — so headings and link text read as plain
731/// prose. Unordered list items keep their `*` marker and ordered items their
732/// `N.` marker (those are content-faithful and match the corpus convention).
733///
734/// A very wide wrap width (10_000) is used so paragraphs are not hard-wrapped by
735/// the renderer; paragraph structure comes from the source's block elements, and
736/// final layout is canonicalized by [`normalize_text`].
737fn html_to_text(html: &[u8]) -> Result<String> {
738 let rendered = html2text::config::plain_no_decorate()
739 .string_from_read(html, 10_000)
740 .map_err(|e| ExtractError::Parse {
741 format: "html",
742 message: e.to_string(),
743 })?;
744
745 Ok(strip_markdown_decorations(&rendered))
746}
747
748/// Strip the residual markdown decorations `html2text`'s plain renderer emits:
749/// leading run of `#` (ATX heading markers) at the start of a line, and `[...]`
750/// brackets around link/anchor text (the reference-style `[n]` suffix is already
751/// gone under `plain_no_decorate`). Bullet (`*`) and ordered (`N.`) markers are
752/// left intact — they are content, not decoration.
753fn strip_markdown_decorations(text: &str) -> String {
754 let mut out = String::with_capacity(text.len());
755 for line in text.lines() {
756 // Strip a leading "#"-run + the single space after it (ATX heading).
757 let trimmed = line.trim_start();
758 let after_hashes = trimmed.trim_start_matches('#');
759 let line = if after_hashes.len() != trimmed.len() {
760 // It was a heading line: keep indentation-free heading text.
761 after_hashes.trim_start()
762 } else {
763 line
764 };
765 out.push_str(&unwrap_brackets(line));
766 out.push('\n');
767 }
768 out
769}
770
/// Drop the brackets from every `[inner]` span, keeping `inner` (single
/// left-to-right pass, no nesting). `html2text`'s plain renderer wraps
/// link/anchor text in single brackets; this restores the bare text.
///
/// A `[` pairs with the *next* `]` on the line, whatever lies between them. A
/// `[` with no `]` after it is kept literally together with the rest of the
/// line, and a `]` that is not closing anything is kept as well.
fn unwrap_brackets(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    let mut rest = line;

    while let Some(open) = rest.find('[') {
        out.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];
        match after_open.find(']') {
            Some(close) => {
                // Matched pair: emit only what was between the brackets.
                out.push_str(&after_open[..close]);
                rest = &after_open[close + 1..];
            }
            None => {
                // Unmatched '[': everything from it to the end is literal.
                out.push('[');
                out.push_str(after_open);
                rest = "";
            }
        }
    }

    out.push_str(rest);
    out
}
804
805// ─────────────────────────────────────────────────────────────────────────────
806// Shared zip helpers (docx + epub)
807// ─────────────────────────────────────────────────────────────────────────────
808
809/// Open a zip archive from a reader, mapping any failure to a typed
810/// [`ExtractError::Parse`] tagged with the calling format.
811fn open_zip<R: Read + std::io::Seek>(
812 reader: R,
813 format: &'static str,
814) -> Result<zip::ZipArchive<R>> {
815 zip::ZipArchive::new(reader).map_err(|e| ExtractError::Parse {
816 format,
817 message: format!("not a valid zip container: {e}"),
818 })
819}
820
/// Cap on a single decompressed zip entry: 256 MiB. docx/epub members are XML
/// text — a member that inflates past this ceiling is a decompression bomb or
/// corruption, not real evidence. `sources/` is untrusted input, so bound the
/// read rather than let `read_to_end` follow a hostile DEFLATE stream until
/// OOM. Enforced by [`read_zip_entry`].
const MAX_ZIP_ENTRY_BYTES: u64 = 256 * 1024 * 1024;
826
827/// Read a single zip entry to a UTF-8 string, bounded by [`MAX_ZIP_ENTRY_BYTES`]
828/// so a zip-bomb member cannot exhaust memory. A missing entry, an over-cap
829/// entry, or a read failure is a typed [`ExtractError::Parse`]; invalid UTF-8 is
830/// lossily decoded (OOXML / XHTML are declared UTF-8, but we never panic on a
831/// stray byte).
832fn read_zip_entry<R: Read + std::io::Seek>(
833 archive: &mut zip::ZipArchive<R>,
834 name: &str,
835 format: &'static str,
836) -> Result<String> {
837 let entry = archive.by_name(name).map_err(|e| ExtractError::Parse {
838 format,
839 message: format!("missing zip entry {name:?}: {e}"),
840 })?;
841 // Reject up front when the central directory declares an over-cap size...
842 let declared = entry.size();
843 if declared > MAX_ZIP_ENTRY_BYTES {
844 return Err(ExtractError::Parse {
845 format,
846 message: format!(
847 "zip entry {name:?} declares {declared} bytes, over the {MAX_ZIP_ENTRY_BYTES}-byte cap"
848 ),
849 });
850 }
851 // ...and bound the actual decompressed read so a lying header (a bomb that
852 // understates its uncompressed size) still cannot allocate past the cap.
853 let mut bytes = Vec::new();
854 entry
855 .take(MAX_ZIP_ENTRY_BYTES + 1)
856 .read_to_end(&mut bytes)
857 .map_err(|e| ExtractError::Parse {
858 format,
859 message: format!("reading {name:?}: {e}"),
860 })?;
861 if bytes.len() as u64 > MAX_ZIP_ENTRY_BYTES {
862 return Err(ExtractError::Parse {
863 format,
864 message: format!(
865 "zip entry {name:?} exceeds the {MAX_ZIP_ENTRY_BYTES}-byte cap (decompression bomb?)"
866 ),
867 });
868 }
869 Ok(String::from_utf8_lossy(&bytes).into_owned())
870}
871
872/// Look up a start/empty element's attribute value by local name, returning it
873/// unescaped as an owned `String`. Prefix-agnostic on the attribute key.
874fn attr_value(elem: &quick_xml::events::BytesStart<'_>, key: &[u8]) -> Option<String> {
875 elem.attributes().flatten().find_map(|attr| {
876 if local_name(attr.key.as_ref()) == key {
877 // `unescape_value` returns an XML-unescaped `Cow<str>` — exactly the
878 // owned attribute text we want. It is soft-deprecated in quick-xml
879 // 0.40 in favor of `normalized_value(XmlVersion)`, whose extra
880 // version arg and byte-Cow return buy us nothing here; the simple
881 // form is correct for the UTF-8 OOXML/OPF attributes we read.
882 #[allow(deprecated)]
883 attr.unescape_value().ok().map(|cow| cow.into_owned())
884 } else {
885 None
886 }
887 })
888}
889
890#[cfg(test)]
891mod tests {
892 use super::*;
893 use std::path::PathBuf;
894
895 /// Absolute path to a corpus-c-formats fixture under `sources/docs/`.
896 fn fixture(name: &str) -> PathBuf {
897 PathBuf::from(env!("CARGO_MANIFEST_DIR"))
898 .join("../../tests/corpora/corpus-c-formats/sources/docs")
899 .join(name)
900 }
901
902 /// Read the known-good `.txt` sibling of a fixture.
903 fn expected(name: &str) -> String {
904 std::fs::read_to_string(fixture(&format!("{name}.txt"))).unwrap()
905 }
906
907 /// Token-level normalization: collapse every run of whitespace (incl.
908 /// newlines) to one space and trim. This is the corpus's recommended,
909 /// layout-agnostic comparison ("same words, same order").
910 fn tokens(s: &str) -> String {
911 s.split_whitespace().collect::<Vec<_>>().join(" ")
912 }
913
914 /// The sorted set of non-blank, token-normalized lines — order-agnostic
915 /// content comparison (used where extractor reading-order legitimately
916 /// differs, e.g. multi-column PDF).
917 fn line_set(s: &str) -> Vec<String> {
918 let mut v: Vec<String> = s.lines().map(tokens).filter(|l| !l.is_empty()).collect();
919 v.sort();
920 v
921 }
922
923 // ── format detection ────────────────────────────────────────────────────
924
925 #[test]
926 fn detects_format_by_extension_case_insensitively() {
927 assert_eq!(Format::from_path(Path::new("a.pdf")), Some(Format::Pdf));
928 assert_eq!(Format::from_path(Path::new("a.PDF")), Some(Format::Pdf));
929 assert_eq!(Format::from_path(Path::new("a.docx")), Some(Format::Docx));
930 assert_eq!(
931 Format::from_path(Path::new("a.xlsx")),
932 Some(Format::Spreadsheet)
933 );
934 assert_eq!(
935 Format::from_path(Path::new("a.ods")),
936 Some(Format::Spreadsheet)
937 );
938 assert_eq!(Format::from_path(Path::new("a.epub")), Some(Format::Epub));
939 assert_eq!(Format::from_path(Path::new("a.html")), Some(Format::Html));
940 assert_eq!(Format::from_path(Path::new("a.htm")), Some(Format::Html));
941 assert_eq!(Format::from_path(Path::new("a.txt")), None);
942 assert_eq!(Format::from_path(Path::new("noext")), None);
943 }
944
945 #[test]
946 fn unsupported_extension_is_typed_error() {
947 let err = extract(Path::new("/tmp/whatever.txt")).unwrap_err();
948 assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e == "txt"));
949 assert_eq!(err.code(), "UNSUPPORTED_FORMAT");
950 }
951
952 #[test]
953 fn missing_extension_is_unsupported() {
954 let err = extract(Path::new("/tmp/noext")).unwrap_err();
955 assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e.is_empty()));
956 }
957
958 // ── normalization ─────────────────────────────────────────────────────────
959
960 #[test]
961 fn normalize_collapses_blanks_and_trims() {
962 let raw = "\r\n\r\nHeading\r\n\r\n\r\n\r\nBody line \r\n\r\n";
963 assert_eq!(normalize_text(raw), "Heading\n\nBody line\n");
964 }
965
966 #[test]
967 fn normalize_empty_stays_empty() {
968 assert_eq!(normalize_text(""), "");
969 assert_eq!(normalize_text(" \n\n \n"), "");
970 }
971
972 // ── per-format extraction against corpus-c fixtures ───────────────────────
973
974 #[test]
975 fn extract_text_pdf_matches_known_good() {
976 let got = extract(&fixture("text.pdf")).unwrap();
977 assert_eq!(got.metadata["format"], MetaValue::Str("pdf".into()));
978 assert_eq!(got.metadata["pages"], MetaValue::Num(1));
979 assert_eq!(tokens(&got.text), tokens(&expected("text.pdf")));
980 }
981
982 #[test]
983 fn extract_weird_fonts_pdf_matches_known_good() {
984 let got = extract(&fixture("weird-fonts.pdf")).unwrap();
985 assert_eq!(tokens(&got.text), tokens(&expected("weird-fonts.pdf")));
986 }
987
988 #[test]
989 fn extract_multi_column_pdf_matches_content_order_agnostic() {
990 // pdf-extract reads column-by-column; the known-good `.txt` captures the
991 // interleaved (pdftotext) order. Both carry identical content — assert
992 // the line SET, not the order. (README § multi-column.)
993 let got = extract(&fixture("multi-column.pdf")).unwrap();
994 assert_eq!(line_set(&got.text), line_set(&expected("multi-column.pdf")));
995 }
996
997 #[test]
998 fn extract_image_only_pdf_yields_empty() {
999 // No text layer → empty out, never hallucinated text. OCR out of scope.
1000 let got = extract(&fixture("image-only.pdf")).unwrap();
1001 assert_eq!(got.text, "");
1002 assert!(expected("image-only.pdf").trim().is_empty());
1003 }
1004
1005 #[test]
1006 fn extract_encrypted_pdf_without_password_refuses_cleanly() {
1007 let err = extract(&fixture("encrypted.pdf")).unwrap_err();
1008 assert!(
1009 matches!(err, ExtractError::Encrypted(_)),
1010 "expected Encrypted, got {err:?}"
1011 );
1012 assert_eq!(err.code(), "DOCUMENT_ENCRYPTED");
1013 }
1014
1015 #[test]
1016 fn guard_pdf_panic_contains_unwind_as_parse_error() {
1017 // The "never panics" contract: an internal pdf-extract/lopdf panic must
1018 // surface as a typed ExtractError::Parse, not abort the process. (cargo
1019 // captures the unwind's stderr line for a passing test.)
1020 let contained: Result<()> = guard_pdf_panic(|| panic!("simulated pdf-extract abort"));
1021 assert!(
1022 matches!(contained, Err(ExtractError::Parse { format: "pdf", .. })),
1023 "panic must be contained as a pdf Parse error, got {contained:?}"
1024 );
1025 // The success path is transparent — the value passes straight through.
1026 let ok: Result<u32> = guard_pdf_panic(|| 42);
1027 assert_eq!(ok.unwrap(), 42);
1028 }
1029
1030 #[test]
1031 fn extract_docx_matches_known_good() {
1032 let got = extract(&fixture("sample.docx")).unwrap();
1033 assert_eq!(got.metadata["format"], MetaValue::Str("docx".into()));
1034 assert_eq!(tokens(&got.text), tokens(&expected("sample.docx")));
1035 }
1036
1037 #[test]
1038 fn extract_xlsx_matches_known_good() {
1039 let got = extract(&fixture("sample.xlsx")).unwrap();
1040 assert_eq!(got.metadata["format"], MetaValue::Str("spreadsheet".into()));
1041 assert_eq!(got.metadata["sheets"], MetaValue::Num(1));
1042 assert_eq!(
1043 got.metadata["sheet_names"],
1044 MetaValue::Str("Expenses".into())
1045 );
1046 // Tab-separated, integers without `.0` — exact match (no soft-wrap risk).
1047 assert_eq!(got.text.trim_end(), expected("sample.xlsx").trim_end());
1048 }
1049
1050 #[test]
1051 fn extract_epub_matches_known_good() {
1052 let got = extract(&fixture("sample.epub")).unwrap();
1053 assert_eq!(got.metadata["format"], MetaValue::Str("epub".into()));
1054 assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
1055 assert_eq!(
1056 got.metadata["title"],
1057 MetaValue::Str("Operations Playbook".into())
1058 );
1059 assert_eq!(tokens(&got.text), tokens(&expected("sample.epub")));
1060 }
1061
1062 #[test]
1063 fn extract_html_matches_known_good() {
1064 let got = extract(&fixture("sample.html")).unwrap();
1065 assert_eq!(got.metadata["format"], MetaValue::Str("html".into()));
1066 assert_eq!(tokens(&got.text), tokens(&expected("sample.html")));
1067 }
1068
1069 // ── helper-level unit tests ───────────────────────────────────────────────
1070
1071 #[test]
1072 fn unwrap_brackets_flattens_link_text() {
1073 assert_eq!(
1074 unwrap_brackets("contact [ops@acme.example] or the [handbook]."),
1075 "contact ops@acme.example or the handbook."
1076 );
1077 // Unmatched '[' is preserved.
1078 assert_eq!(unwrap_brackets("a [b c"), "a [b c");
1079 // No brackets → untouched.
1080 assert_eq!(unwrap_brackets("plain text"), "plain text");
1081 }
1082
1083 #[test]
1084 fn strip_markdown_decorations_drops_heading_hashes() {
1085 let input = "# Title\n## Section\n* bullet\n1. ordered\nplain\n";
1086 let out = strip_markdown_decorations(input);
1087 assert_eq!(out, "Title\nSection\n* bullet\n1. ordered\nplain\n");
1088 }
1089
1090 #[test]
1091 fn local_name_strips_prefix() {
1092 assert_eq!(local_name(b"w:t"), b"t");
1093 assert_eq!(local_name(b"t"), b"t");
1094 assert_eq!(local_name(b"dc:title"), b"title");
1095 }
1096
1097 #[test]
1098 fn extracted_serializes_to_text_metadata_json() {
1099 let got = extract(&fixture("sample.xlsx")).unwrap();
1100 let json = serde_json::to_value(&got).unwrap();
1101 assert!(json.get("text").is_some());
1102 assert_eq!(json["metadata"]["format"], "spreadsheet");
1103 assert_eq!(json["metadata"]["sheets"], 1);
1104 // MetaValue::Num serializes as a bare JSON number, Str as a bare string.
1105 assert!(json["metadata"]["sheets"].is_number());
1106 assert!(json["metadata"]["format"].is_string());
1107 }
1108}