dbmd_core/
extract.rs

1//! Document text extraction — the `dbmd extract` engine.
2//!
3//! `sources/` is where raw evidence lands: invoices, contracts, reports,
4//! exports. Most of it arrives as binary documents (PDF, Word, Excel, EPUB) or
5//! HTML, not markdown. Before an agent can reason over that evidence — wiki-link
6//! it, summarize it into the wiki layer, file a typed record that cites it — the
7//! text has to come out. This module is that step: a binary document in, plain
8//! UTF-8 text out, format chosen by file extension.
9//!
10//! # What this is, and is not
11//!
12//! - **Deterministic decoders only.** Every adapter is a format parser
13//!   (`pdf-extract`, `calamine`, `html2text`, `quick-xml`+`zip`). There is **no
14//!   AI, no OCR, no embeddings** here — consistent with the crate-wide invariant
15//!   (`lib.rs`). The agent driving `dbmd` is the semantic layer; this is plumbing.
16//! - **Text layer, not pixels.** A scanned PDF with no text layer yields the
17//!   empty string — *empty in, empty out, never hallucinated text.* OCR is an
18//!   explicit non-goal (a future `dbmd-ocr`).
19//! - **Single document, single call.** [`extract`] handles one file. Walking a
20//!   store and extracting every document is the caller's loop, not this module's.
21//!
22//! # Format dispatch
23//!
24//! [`Format::from_path`] maps the file extension to an adapter; [`extract`]
25//! dispatches:
26//!
27//! | Extension                | Format            | Adapter                          |
28//! |--------------------------|-------------------|----------------------------------|
29//! | `.pdf`                   | [`Format::Pdf`]   | `pdf-extract`                    |
30//! | `.docx`                  | [`Format::Docx`]  | `zip` + `quick-xml` (`w:t` runs) |
31//! | `.xlsx` / `.xlsm` / `.xlsb` / `.ods` | [`Format::Spreadsheet`] | `calamine` |
32//! | `.epub`                  | [`Format::Epub`]  | `zip` + `quick-xml` + `html2text`|
33//! | `.html` / `.htm` / `.xhtml` | [`Format::Html`] | `html2text`                    |
34//!
35//! Anything else is [`ExtractError::UnsupportedFormat`] — a typed refusal the
36//! CLI surfaces with a stable code, never a panic.
37
38use std::collections::BTreeMap;
39use std::io::Read;
40use std::panic::{catch_unwind, AssertUnwindSafe};
41use std::path::Path;
42
43use serde::Serialize;
44
/// The result of extracting one document: the plain text plus a small,
/// format-tagged metadata map.
///
/// This is the `--json` shape the CLI emits verbatim (`{text, metadata}`); in
/// plain mode the CLI prints [`Extracted::text`] and discards the metadata.
/// Metadata is intentionally minimal and best-effort — extraction never *fails*
/// for want of a title; it just omits the key.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct Extracted {
    /// The extracted plain text (UTF-8) after [`normalize_text`]: `\n` line
    /// endings, trailing whitespace trimmed per line, and a single trailing
    /// newline. For a document with no recoverable text layer (e.g. a scanned,
    /// image-only PDF) this is the empty string — the contract is "empty in,
    /// empty out."
    pub text: String,

    /// Best-effort key/value metadata. Always carries `format` (the adapter
    /// that ran, e.g. `"pdf"`). The adapters in this module add what they
    /// cheaply know: `pages` (PDF page count), `sheets` (sheet count) and
    /// `sheet_names` (the names joined with `", "`) for spreadsheets,
    /// `chapters` (count) and `title` for EPUB. A `BTreeMap` so `--json`
    /// output is key-ordered and stable.
    pub metadata: BTreeMap<String, MetaValue>,
}
66
67impl Extracted {
68    /// Build an [`Extracted`] from raw adapter text + the detected format,
69    /// applying the canonical text normalization ([`normalize_text`]) and
70    /// seeding the `format` metadata key.
71    fn new(raw_text: String, format: Format) -> Self {
72        let mut metadata = BTreeMap::new();
73        metadata.insert(
74            "format".to_string(),
75            MetaValue::Str(format.tag().to_string()),
76        );
77        Extracted {
78            text: normalize_text(&raw_text),
79            metadata,
80        }
81    }
82
83    /// Insert a string metadata key only when the value is non-empty (keeps the
84    /// map free of empty `title: ""` noise).
85    fn put_str(&mut self, key: &str, value: impl Into<String>) {
86        let v = value.into();
87        if !v.trim().is_empty() {
88            self.metadata.insert(key.to_string(), MetaValue::Str(v));
89        }
90    }
91
92    /// Insert a numeric (count) metadata key.
93    fn put_num(&mut self, key: &str, value: u64) {
94        self.metadata.insert(key.to_string(), MetaValue::Num(value));
95    }
96}
97
/// A metadata value: a string (title, format tag, the joined sheet-name list)
/// or a non-negative count (pages, sheets, chapters).
///
/// `#[serde(untagged)]` makes each variant serialize as its bare payload — a
/// JSON string or number with no wrapper object — so `{text, metadata}` stays
/// flat and readable.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
pub enum MetaValue {
    /// A textual value (e.g. document title, the `format` tag).
    Str(String),
    /// A non-negative count (e.g. page count, sheet count).
    Num(u64),
}
109
/// The document formats `dbmd extract` understands — one variant per adapter.
/// [`Format::from_path`] picks the variant from the file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// Portable Document Format (`.pdf`) — text layer via `pdf-extract`.
    Pdf,
    /// Office Open XML WordprocessingML (`.docx`) — `w:t` runs via `quick-xml`.
    Docx,
    /// A spreadsheet (`.xlsx`/`.xlsm`/`.xlsb`/`.ods`) — cells via `calamine`.
    Spreadsheet,
    /// EPUB e-book (`.epub`) — spine XHTML via `zip` + `quick-xml` + `html2text`.
    Epub,
    /// HTML (`.html`/`.htm`/`.xhtml`) — plain text via `html2text`.
    Html,
}

impl Format {
    /// Map a path's extension to a [`Format`], ignoring ASCII case.
    ///
    /// Returns `None` when the path has no extension, the extension is not
    /// valid UTF-8, or it is not one of the recognized extensions; [`extract`]
    /// reports that case as [`ExtractError::UnsupportedFormat`].
    pub fn from_path(path: &Path) -> Option<Format> {
        // Every recognized extension (lowercase) paired with its adapter.
        const BY_EXTENSION: &[(&str, Format)] = &[
            ("pdf", Format::Pdf),
            ("docx", Format::Docx),
            ("xlsx", Format::Spreadsheet),
            ("xlsm", Format::Spreadsheet),
            ("xlsb", Format::Spreadsheet),
            ("ods", Format::Spreadsheet),
            ("epub", Format::Epub),
            ("html", Format::Html),
            ("htm", Format::Html),
            ("xhtml", Format::Html),
        ];

        let ext = path.extension()?.to_str()?;
        BY_EXTENSION
            .iter()
            .find(|(known, _)| ext.eq_ignore_ascii_case(known))
            .map(|&(_, format)| format)
    }

    /// The short, stable tag stored under `metadata.format` and used in error
    /// messages. It is not the file extension: one tag can stand for several
    /// extensions (e.g. `spreadsheet`).
    pub fn tag(self) -> &'static str {
        match self {
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Spreadsheet => "spreadsheet",
            Self::Epub => "epub",
            Self::Html => "html",
        }
    }
}
155
/// Errors from document extraction. Every variant is a typed refusal the CLI
/// maps to a stable machine code (see [`ExtractError::code`]) — extraction
/// never panics on a bad or encrypted input.
#[derive(Debug, thiserror::Error)]
pub enum ExtractError {
    /// The file extension is missing or not one of the supported document
    /// formats. Carries the offending extension as written in the path (or
    /// `""` when absent); the message prints it in `Debug` (quoted) form.
    #[error("unsupported document format: {0:?} (supported: pdf, docx, xlsx/xlsm/xlsb/ods, epub, html/htm/xhtml)")]
    UnsupportedFormat(String),

    /// The document is encrypted/password-protected and could not be opened
    /// without a password (or with the wrong one). A clean refusal — the
    /// extractor must never emit partial/garbled bytes for a locked file.
    /// Carries the underlying parser's message.
    #[error("document is encrypted or password-protected: {0}")]
    Encrypted(String),

    /// A format adapter failed to parse a structurally invalid or corrupt
    /// document. Carries the adapter's diagnostic.
    #[error("failed to parse {format} document: {message}")]
    Parse {
        /// The format tag whose adapter failed (e.g. `"pdf"`, `"docx"`).
        format: &'static str,
        /// The underlying parser diagnostic.
        message: String,
    },

    /// An underlying I/O failure (file missing, unreadable, etc.). Thanks to
    /// `#[from]`, `?` on a `std::io::Result` converts into this variant, and
    /// `transparent` forwards the I/O error's own message unchanged.
    #[error(transparent)]
    Io(#[from] std::io::Error),
}
186
187impl ExtractError {
188    /// A short, stable machine code for this error, mirrored at the CLI
189    /// boundary for `--json` output and exit-code mapping.
190    pub fn code(&self) -> &'static str {
191        match self {
192            ExtractError::UnsupportedFormat(_) => "UNSUPPORTED_FORMAT",
193            ExtractError::Encrypted(_) => "DOCUMENT_ENCRYPTED",
194            ExtractError::Parse { .. } => "EXTRACT_PARSE_ERROR",
195            ExtractError::Io(_) => "IO_ERROR",
196        }
197    }
198}
199
/// Result alias for extraction operations: `std::result::Result` with the
/// error type fixed to [`ExtractError`].
pub type Result<T> = std::result::Result<T, ExtractError>;
202
203/// Extract plain text (and best-effort metadata) from a document, choosing the
204/// adapter by the file's extension.
205///
206/// This is the single entry point the CLI calls. It reads exactly one file and
207/// returns one [`Extracted`]; there is no whole-store walk here (per the
208/// crate-wide O(changed) invariant — a store-wide extraction is the caller's
209/// loop). An unsupported extension is [`ExtractError::UnsupportedFormat`]; an
210/// encrypted PDF is [`ExtractError::Encrypted`]; neither panics.
211///
212/// # Examples
213///
214/// ```no_run
215/// use std::path::Path;
216/// let out = dbmd_core::extract::extract(Path::new("sources/docs/invoice.pdf"))?;
217/// println!("{}", out.text);
218/// # Ok::<(), dbmd_core::extract::ExtractError>(())
219/// ```
220pub fn extract(path: &Path) -> Result<Extracted> {
221    let format = Format::from_path(path).ok_or_else(|| {
222        let ext = path
223            .extension()
224            .and_then(|e| e.to_str())
225            .unwrap_or("")
226            .to_string();
227        ExtractError::UnsupportedFormat(ext)
228    })?;
229
230    match format {
231        Format::Pdf => extract_pdf(path),
232        Format::Docx => extract_docx(path),
233        Format::Spreadsheet => extract_spreadsheet(path),
234        Format::Epub => extract_epub(path),
235        Format::Html => extract_html(path),
236    }
237}
238
239// ─────────────────────────────────────────────────────────────────────────────
240// Text normalization
241// ─────────────────────────────────────────────────────────────────────────────
242
/// Canonicalize extracted text so output is stable across adapters:
///
/// 1. Normalize line endings to `\n`: `\r\n` becomes `\n`, and a lone `\r` is
///    treated as a line break too.
/// 2. Trim trailing whitespace on each line (a whitespace-only line becomes a
///    blank line).
/// 3. Collapse every run of two or more consecutive blank lines to a single
///    blank line.
/// 4. Drop leading and trailing blank lines, then end the text with exactly
///    one `\n` — unless there is no non-blank line at all, in which case the
///    result is the empty string (the image-only-PDF contract).
///
/// This is *layout* tidy-up only; it never reorders or drops words. Word-level
/// content is whatever the adapter recovered.
pub fn normalize_text(raw: &str) -> String {
    let unix = raw.replace("\r\n", "\n").replace('\r', "\n");

    // Single pass, no intermediate line vector and no panic path: a blank line
    // is only *remembered*, and is written out when a later non-blank line
    // proves it is interior. That drops leading blanks (nothing emitted yet),
    // collapses blank runs (one flag per run) and drops trailing blanks (the
    // flag is never flushed) in one O(n) sweep.
    let mut out = String::with_capacity(unix.len() + 1);
    let mut pending_blank = false;
    for line in unix.lines().map(str::trim_end) {
        if line.is_empty() {
            pending_blank = !out.is_empty();
            continue;
        }
        if pending_blank {
            out.push('\n');
            pending_blank = false;
        }
        out.push_str(line);
        out.push('\n');
    }
    out
}
291
292// ─────────────────────────────────────────────────────────────────────────────
293// PDF — pdf-extract
294// ─────────────────────────────────────────────────────────────────────────────
295
296/// Extract a PDF's text layer via `pdf-extract`.
297///
298/// A PDF with no text layer (a scanned image) yields the empty string — that is
299/// correct, not an error (OCR is out of scope). A password-protected PDF that
300/// cannot be opened is mapped to [`ExtractError::Encrypted`] rather than a raw
301/// parse error so the caller can branch on it. Metadata carries the page count
302/// when the document tree exposes it.
303///
304/// `pdf-extract`/`lopdf` `panic!` internally on some malformed-but-openable
305/// PDFs (e.g. an out-of-set base `/Encoding` name), so both parser calls are
306/// wrapped in [`std::panic::catch_unwind`]: an internal abort is contained and
307/// surfaced as [`ExtractError::Parse`], upholding this module's "never panics"
308/// contract on untrusted `sources/` input.
309fn extract_pdf(path: &Path) -> Result<Extracted> {
310    // Read the bytes ourselves so a missing/unreadable file is a clean
311    // `ExtractError::Io` (via `?`) before we hand anything to the PDF parser.
312    let bytes = std::fs::read(path)?;
313
314    let text = match guard_pdf_panic(|| pdf_extract::extract_text_from_mem(&bytes))? {
315        Ok(t) => t,
316        Err(e) => return Err(classify_pdf_error(e)),
317    };
318
319    let mut out = Extracted::new(text, Format::Pdf);
320
321    // Page count is best-effort; derive it from the parsed document. A parse
322    // failure OR an internal panic here is non-fatal — the text already
323    // succeeded — so a contained panic (outer `Err`) and a load failure (inner
324    // `Err`) are both silently skipped.
325    if let Ok(Ok(doc)) = guard_pdf_panic(|| pdf_extract::Document::load_mem(&bytes)) {
326        out.put_num("pages", doc.get_pages().len() as u64);
327    }
328
329    Ok(out)
330}
331
332/// Run a panic-prone `pdf-extract`/`lopdf` call, converting an internal unwind
333/// into a typed [`ExtractError::Parse`] tagged `pdf` so the module's "never
334/// panics" contract holds on adversarial PDFs. `AssertUnwindSafe` is sound: the
335/// closure borrows only `&[u8]`, and on a caught unwind we discard any partial
336/// state and return an owned error. The default panic hook still writes the
337/// panic line to stderr — library code must not mutate the process-global hook.
338fn guard_pdf_panic<T>(f: impl FnOnce() -> T) -> Result<T> {
339    catch_unwind(AssertUnwindSafe(f)).map_err(|_| ExtractError::Parse {
340        format: "pdf",
341        message: "pdf parser aborted on malformed input".to_string(),
342    })
343}
344
345/// Map a `pdf-extract` error onto the right [`ExtractError`] variant.
346/// Decryption failures become [`ExtractError::Encrypted`]; everything else is a
347/// [`ExtractError::Parse`] tagged `pdf`.
348fn classify_pdf_error(err: pdf_extract::OutputError) -> ExtractError {
349    let msg = err.to_string();
350    let lower = msg.to_ascii_lowercase();
351    if lower.contains("password") || lower.contains("decrypt") || lower.contains("encrypt") {
352        ExtractError::Encrypted(msg)
353    } else {
354        ExtractError::Parse {
355            format: "pdf",
356            message: msg,
357        }
358    }
359}
360
361// ─────────────────────────────────────────────────────────────────────────────
362// DOCX — zip + quick-xml (no docx-rs dependency; quick-xml is already needed
363// for epub, so docx, xlsx-via-calamine, and epub share one XML/zip surface)
364// ─────────────────────────────────────────────────────────────────────────────
365
366/// Extract a `.docx` (WordprocessingML) by unzipping `word/document.xml` and
367/// concatenating the `<w:t>` run text, one logical line per `<w:p>` paragraph.
368///
369/// `<w:tab/>` becomes a tab and `<w:br/>` / `<w:cr>` a newline so table-ish and
370/// line-broken content keeps its shape; everything else is structural and
371/// ignored. This is the same minimal-but-faithful path `docx-rs` takes for text
372/// extraction, without pulling in a second XML/zip stack.
373fn extract_docx(path: &Path) -> Result<Extracted> {
374    let file = std::fs::File::open(path)?;
375    let mut archive = open_zip(file, "docx")?;
376
377    let xml = read_zip_entry(&mut archive, "word/document.xml", "docx")?;
378    let text = wordprocessing_text(&xml, "docx")?;
379
380    Ok(Extracted::new(text, Format::Docx))
381}
382
383/// Pull paragraph text out of a WordprocessingML / DrawingML XML body.
384///
385/// Shared by [`extract_docx`]. Walks the event stream collecting `<w:t>` text;
386/// `<w:p>` ends a line, `<w:tab/>` is a tab, `<w:br>`/`<w:cr>` a newline.
387fn wordprocessing_text(xml: &str, format: &'static str) -> Result<String> {
388    use quick_xml::events::Event;
389    use quick_xml::reader::Reader;
390
391    let mut reader = Reader::from_str(xml);
392    let mut buf = Vec::new();
393    let mut out = String::new();
394    let mut in_text_run = false;
395
396    loop {
397        match reader.read_event_into(&mut buf) {
398            Ok(Event::Start(e)) => {
399                if local_name(e.name().as_ref()) == b"t" {
400                    in_text_run = true;
401                }
402            }
403            Ok(Event::End(e)) => {
404                let name = e.name();
405                match local_name(name.as_ref()) {
406                    b"t" => in_text_run = false,
407                    b"p" => out.push('\n'),
408                    _ => {}
409                }
410            }
411            Ok(Event::Empty(e)) => {
412                // Self-closing run-level breaks inside a paragraph.
413                match local_name(e.name().as_ref()) {
414                    b"tab" => out.push('\t'),
415                    b"br" | b"cr" => out.push('\n'),
416                    _ => {}
417                }
418            }
419            // quick-xml 0.40 yields already-unescaped text in `Event::Text`.
420            Ok(Event::Text(t)) => {
421                if in_text_run {
422                    out.push_str(&String::from_utf8_lossy(&t.into_inner()));
423                }
424            }
425            Ok(Event::Eof) => break,
426            Err(e) => {
427                return Err(ExtractError::Parse {
428                    format,
429                    message: format!("malformed XML: {e}"),
430                });
431            }
432            _ => {}
433        }
434        buf.clear();
435    }
436
437    Ok(out)
438}
439
/// The local part of a possibly-prefixed XML name: everything after the last
/// `:` (`w:t` → `t`), or the whole name when there is no `:` (`t` → `t`).
///
/// docx/epub XML uses whatever prefixes (`w:`, `dc:`) the writer chose, so
/// matching on the local name keeps the parsers independent of that choice.
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece; the first one it yields is
    // the part after the final colon (or the whole input when none exists).
    qname.rsplit(|&byte| byte == b':').next().unwrap_or(qname)
}
449
450// ─────────────────────────────────────────────────────────────────────────────
451// Spreadsheet — calamine (xlsx / xlsm / xlsb / ods)
452// ─────────────────────────────────────────────────────────────────────────────
453
/// Ceiling on a single sheet's dense cell grid (`rows × cols`). `calamine`
/// materializes a worksheet as a DENSE `Vec<Data>` sized from the MIN/MAX cell
/// positions (`Range::from_sparse`), so two cells at `A1` and `XFD1048576` in a
/// few-hundred-byte file force a ~1.7e10-element (~400 GB) allocation that
/// **aborts** the process — bypassing the docx/epub zip-entry cap and the
/// PDF panic guard (an allocation failure aborts, it does not unwind, so
/// `catch_unwind` cannot contain it). `sources/` is untrusted input, so we
/// bound the read the same way docx/epub do: refuse before the allocation.
///
/// 50M cells is ~1.2 GB worst-case dense (`Data` ≈ 24 bytes) — far above any
/// real spreadsheet's used range, far below the weaponizable extreme. A sheet
/// is refused only when its grid is strictly larger than this value.
const MAX_SPREADSHEET_CELLS: u64 = 50_000_000;
466
467/// Extract every sheet of a spreadsheet via `calamine`, rendering each row as
468/// tab-separated cells, one row per line, sheets in workbook order separated by
469/// a blank line.
470///
471/// Cell rendering: text verbatim; integers and whole-valued floats without a
472/// trailing `.0` (`1200`, not `1200.0`); other floats via their default
473/// formatting; booleans as `TRUE`/`FALSE`; empty/error cells as the empty
474/// string. Metadata carries the sheet count and the joined sheet-name list.
475///
476/// Before materializing each sheet, [`spreadsheet_dense_cells`] bounds the
477/// would-be dense grid against [`MAX_SPREADSHEET_CELLS`] and returns a typed
478/// [`ExtractError::Parse`] refusal rather than letting an attacker-supplied
479/// sheet OOM/abort the process — upholding the module's "never panics on
480/// untrusted `sources/` input" contract for the spreadsheet adapter.
481fn extract_spreadsheet(path: &Path) -> Result<Extracted> {
482    use calamine::{open_workbook_auto, Reader};
483
484    let mut workbook = open_workbook_auto(path).map_err(|e| ExtractError::Parse {
485        format: "spreadsheet",
486        message: e.to_string(),
487    })?;
488
489    let sheet_names = workbook.sheet_names().to_vec();
490    let mut text = String::new();
491
492    for (idx, name) in sheet_names.iter().enumerate() {
493        if idx > 0 {
494            text.push('\n'); // blank line between sheets
495        }
496
497        // Bound the dense grid BEFORE calamine allocates it. For the zip-XML /
498        // record backends that expose a sparse cell iterator (xlsx-family,
499        // xlsb) this never densely allocates; over-cap sheets refuse cleanly.
500        if let Some(cells) = spreadsheet_dense_cells(&mut workbook, name)? {
501            if cells > MAX_SPREADSHEET_CELLS {
502                return Err(ExtractError::Parse {
503                    format: "spreadsheet",
504                    message: format!(
505                        "sheet {name:?} declares a {cells}-cell grid, over the \
506                         {MAX_SPREADSHEET_CELLS}-cell cap (malformed or hostile spreadsheet)"
507                    ),
508                });
509            }
510        }
511
512        let range = workbook
513            .worksheet_range(name)
514            .map_err(|e| ExtractError::Parse {
515                format: "spreadsheet",
516                message: format!("sheet {name:?}: {e}"),
517            })?;
518
519        for row in range.rows() {
520            let cells: Vec<String> = row.iter().map(render_cell).collect();
521            text.push_str(&cells.join("\t"));
522            text.push('\n');
523        }
524    }
525
526    let mut out = Extracted::new(text, Format::Spreadsheet);
527    out.put_num("sheets", sheet_names.len() as u64);
528    if !sheet_names.is_empty() {
529        out.put_str("sheet_names", sheet_names.join(", "));
530    }
531    Ok(out)
532}
533
534/// Compute the would-be dense cell count (`rows × cols`) of one sheet WITHOUT
535/// the dense allocation, by streaming the sheet's sparse cells and tracking the
536/// MIN/MAX non-empty position — exactly the bounds `Range::from_sparse` uses.
537///
538/// Returns `Some(rows * cols)` for the formats that expose a sparse cell
539/// iterator (`.xlsx`/`.xlsm`/`.xlsb`/`.xlam`), which are the realistic
540/// decompression/dimension-bomb vectors (an OOXML/record sheet can place two
541/// cells 1e10 apart in a few hundred bytes). Returns `None` for `.xls` (BIFF,
542/// format-bounded to ≤ 65 536 × 256 ≈ 1.7e7 cells) and `.ods`, neither of which
543/// exposes a sparse iterator on the auto-detected reader; those fall through to
544/// the normal materialization path. A row/col delta is saturated into `u64` so
545/// the multiply cannot overflow.
546fn spreadsheet_dense_cells(
547    workbook: &mut calamine::Sheets<std::io::BufReader<std::fs::File>>,
548    name: &str,
549) -> Result<Option<u64>> {
550    use calamine::{DataRef, Sheets};
551
552    // Stream cells, tracking the non-empty MIN/MAX extent that `from_sparse`
553    // would allocate. Empty cells are excluded (calamine drops them before
554    // computing the dense bounds), matching the dense grid exactly.
555    fn extent<E: std::fmt::Display>(
556        mut next: impl FnMut() -> std::result::Result<Option<((u32, u32), bool)>, E>,
557    ) -> Result<Option<u64>> {
558        let (mut r0, mut r1, mut c0, mut c1) = (u32::MAX, 0u32, u32::MAX, 0u32);
559        let mut any = false;
560        loop {
561            match next() {
562                Ok(Some(((r, c), is_empty))) => {
563                    if is_empty {
564                        continue;
565                    }
566                    any = true;
567                    r0 = r0.min(r);
568                    r1 = r1.max(r);
569                    c0 = c0.min(c);
570                    c1 = c1.max(c);
571                }
572                Ok(None) => break,
573                Err(e) => {
574                    return Err(ExtractError::Parse {
575                        format: "spreadsheet",
576                        message: format!("scanning sheet dimensions: {e}"),
577                    })
578                }
579            }
580        }
581        if !any {
582            return Ok(Some(0));
583        }
584        let rows = u64::from(r1 - r0) + 1;
585        let cols = u64::from(c1 - c0) + 1;
586        Ok(Some(rows.saturating_mul(cols)))
587    }
588
589    match workbook {
590        Sheets::Xlsx(xlsx) => {
591            let mut reader =
592                xlsx.worksheet_cells_reader(name)
593                    .map_err(|e| ExtractError::Parse {
594                        format: "spreadsheet",
595                        message: format!("sheet {name:?}: {e}"),
596                    })?;
597            extent(|| {
598                reader.next_cell().map(|opt| {
599                    opt.map(|c| (c.get_position(), matches!(c.get_value(), DataRef::Empty)))
600                })
601            })
602        }
603        Sheets::Xlsb(xlsb) => {
604            let mut reader =
605                xlsb.worksheet_cells_reader(name)
606                    .map_err(|e| ExtractError::Parse {
607                        format: "spreadsheet",
608                        message: format!("sheet {name:?}: {e}"),
609                    })?;
610            extent(|| {
611                reader.next_cell().map(|opt| {
612                    opt.map(|c| (c.get_position(), matches!(c.get_value(), DataRef::Empty)))
613                })
614            })
615        }
616        // `.xls` (BIFF, format-bounded) and `.ods` expose no sparse iterator on
617        // the auto reader; let them materialize normally.
618        Sheets::Xls(_) | Sheets::Ods(_) => Ok(None),
619    }
620}
621
622/// Render one spreadsheet cell to its text form. Whole-valued floats drop the
623/// `.0` (so `3450.0` → `3450`), matching how spreadsheet apps display an
624/// integer-typed amount.
625fn render_cell(cell: &calamine::Data) -> String {
626    use calamine::Data;
627    match cell {
628        Data::Empty => String::new(),
629        Data::String(s) => s.clone(),
630        Data::Int(i) => i.to_string(),
631        Data::Float(f) => {
632            if f.fract() == 0.0 && f.is_finite() && f.abs() < 1e15 {
633                format!("{}", *f as i64)
634            } else {
635                f.to_string()
636            }
637        }
638        Data::Bool(b) => {
639            if *b {
640                "TRUE".to_string()
641            } else {
642                "FALSE".to_string()
643            }
644        }
645        Data::DateTime(dt) => dt.to_string(),
646        Data::DateTimeIso(s) => s.clone(),
647        Data::DurationIso(s) => s.clone(),
648        Data::Error(e) => format!("{e:?}"),
649    }
650}
651
652// ─────────────────────────────────────────────────────────────────────────────
653// EPUB — zip + quick-xml (spine order) + html2text (per-chapter)
654// ─────────────────────────────────────────────────────────────────────────────
655//
656// We do NOT use the `epub` crate: it is GPL-3.0, which violates the toolkit's
657// permissive-only license rule. An EPUB is a zip whose OPF package declares a
658// reading-order `spine`; each spine item is an XHTML document. zip + quick-xml
659// (already dependencies) read the container/OPF, and html2text (already a
660// dependency for `.html`) flattens each chapter. Same machinery, no GPL.
661
662/// Extract an EPUB's reading-order text:
663/// 1. read `META-INF/container.xml` → the OPF package path;
664/// 2. parse the OPF `manifest` (id→href) and `spine` (ordered idref list);
665/// 3. for each spine item, read its XHTML and flatten it with [`html_to_text`];
666/// 4. join chapters with a blank line.
667///
668/// Metadata carries `title` (the OPF `dc:title`) and `chapters` (spine length).
669fn extract_epub(path: &Path) -> Result<Extracted> {
670    let file = std::fs::File::open(path)?;
671    let mut archive = open_zip(file, "epub")?;
672
673    // 1. container.xml → OPF path.
674    let container = read_zip_entry(&mut archive, "META-INF/container.xml", "epub")?;
675    let opf_path = epub_opf_path(&container)?;
676
677    // 2. OPF → base dir, manifest, spine, title.
678    let opf = read_zip_entry(&mut archive, &opf_path, "epub")?;
679    let parsed = parse_opf(&opf)?;
680    let base = opf_base_dir(&opf_path);
681
682    // 3. Spine items in order → flattened chapter text.
683    let mut text = String::new();
684    let mut chapters = 0u64;
685    for idref in &parsed.spine {
686        let Some(href) = parsed.manifest.get(idref) else {
687            continue; // dangling spine ref; skip rather than fail
688        };
689        let entry = join_zip_path(&base, href);
690        // A missing spine target is skipped (best-effort), not fatal.
691        let Ok(chapter_xhtml) = read_zip_entry(&mut archive, &entry, "epub") else {
692            continue;
693        };
694        let chapter_text = html_to_text(chapter_xhtml.as_bytes())?;
695        if !chapter_text.trim().is_empty() {
696            if chapters > 0 {
697                text.push('\n');
698            }
699            text.push_str(&chapter_text);
700            text.push('\n');
701            chapters += 1;
702        }
703    }
704
705    let mut out = Extracted::new(text, Format::Epub);
706    out.put_num("chapters", chapters);
707    if let Some(title) = parsed.title {
708        out.put_str("title", title);
709    }
710    Ok(out)
711}
712
713/// The full-path of the OPF package file, read from `META-INF/container.xml`'s
714/// first `<rootfile full-path="…">`.
715fn epub_opf_path(container_xml: &str) -> Result<String> {
716    use quick_xml::events::Event;
717    use quick_xml::reader::Reader;
718
719    let mut reader = Reader::from_str(container_xml);
720    let mut buf = Vec::new();
721    loop {
722        match reader.read_event_into(&mut buf) {
723            Ok(Event::Start(e)) | Ok(Event::Empty(e)) => {
724                if local_name(e.name().as_ref()) == b"rootfile" {
725                    if let Some(p) = attr_value(&e, b"full-path") {
726                        return Ok(p);
727                    }
728                }
729            }
730            Ok(Event::Eof) => break,
731            Err(e) => {
732                return Err(ExtractError::Parse {
733                    format: "epub",
734                    message: format!("container.xml: {e}"),
735                })
736            }
737            _ => {}
738        }
739        buf.clear();
740    }
741    Err(ExtractError::Parse {
742        format: "epub",
743        message: "container.xml has no <rootfile full-path>".to_string(),
744    })
745}
746
/// What [`parse_opf`] pulls out of an OPF package document — just enough to
/// emit chapter text in reading order.
struct OpfParsed {
    /// Manifest lookup table: item `id` → `href`. The href is relative to the
    /// directory that holds the OPF file.
    manifest: BTreeMap<String, String>,
    /// Reading order: the spine's `idref` values, in document order. Each one
    /// names a manifest item id.
    spine: Vec<String>,
    /// The package's `dc:title`, when one was found.
    title: Option<String>,
}
756
757/// Parse an OPF package document into its manifest, spine, and title.
758fn parse_opf(opf_xml: &str) -> Result<OpfParsed> {
759    use quick_xml::events::Event;
760    use quick_xml::reader::Reader;
761
762    let mut reader = Reader::from_str(opf_xml);
763    let mut buf = Vec::new();
764
765    let mut manifest = BTreeMap::new();
766    let mut spine = Vec::new();
767    let mut title: Option<String> = None;
768    let mut in_title = false;
769
770    loop {
771        match reader.read_event_into(&mut buf) {
772            Ok(Event::Start(e)) | Ok(Event::Empty(e)) => match local_name(e.name().as_ref()) {
773                b"item" => {
774                    if let (Some(id), Some(href)) = (attr_value(&e, b"id"), attr_value(&e, b"href"))
775                    {
776                        manifest.insert(id, href);
777                    }
778                }
779                b"itemref" => {
780                    if let Some(idref) = attr_value(&e, b"idref") {
781                        spine.push(idref);
782                    }
783                }
784                b"title" => in_title = true,
785                _ => {}
786            },
787            Ok(Event::End(e)) => {
788                if local_name(e.name().as_ref()) == b"title" {
789                    in_title = false;
790                }
791            }
792            Ok(Event::Text(t)) => {
793                if in_title && title.is_none() {
794                    let s = String::from_utf8_lossy(&t.into_inner()).trim().to_string();
795                    if !s.is_empty() {
796                        title = Some(s);
797                    }
798                }
799            }
800            Ok(Event::Eof) => break,
801            Err(e) => {
802                return Err(ExtractError::Parse {
803                    format: "epub",
804                    message: format!("OPF: {e}"),
805                })
806            }
807            _ => {}
808        }
809        buf.clear();
810    }
811
812    Ok(OpfParsed {
813        manifest,
814        spine,
815        title,
816    })
817}
818
/// Directory part of an OPF path inside the zip: everything before the last
/// `/`, or the empty string when the path has no `/` at all
/// (`"OEBPS/content.opf"` → `"OEBPS"`, `"content.opf"` → `""`). Manifest hrefs
/// are resolved against this directory.
fn opf_base_dir(opf_path: &str) -> String {
    opf_path
        .rsplit_once('/')
        .map(|(dir, _file)| dir.to_owned())
        .unwrap_or_default()
}
828
/// Join an OPF base dir with a manifest href into a zip entry name.
/// Forward-slash only — zip paths are always `/`-separated.
///
/// The href is resolved segment by segment against `base`:
///
/// - `.` and empty segments are dropped (`./ch1.xhtml`, `Text/./ch1.xhtml`);
/// - `..` removes the previous segment, so an OPF in `OEBPS/pkg` can reference
///   `../Text/ch1.xhtml` and get `OEBPS/Text/ch1.xhtml`. A `..` with nothing
///   left to remove is ignored, so the result never climbs above the zip root.
///
/// `base` segments are kept verbatim. Percent-encoding and `#fragment`
/// suffixes in the href are not interpreted here.
fn join_zip_path(base: &str, href: &str) -> String {
    let mut segments: Vec<&str> = if base.is_empty() {
        Vec::new()
    } else {
        base.split('/').collect()
    };
    for segment in href.split('/') {
        match segment {
            "" | "." => {}
            ".." => {
                segments.pop();
            }
            name => segments.push(name),
        }
    }
    segments.join("/")
}
839
840// ─────────────────────────────────────────────────────────────────────────────
841// HTML — html2text + light markdown-decoration cleanup
842// ─────────────────────────────────────────────────────────────────────────────
843
844/// Extract plain text from an `.html` file.
845fn extract_html(path: &Path) -> Result<Extracted> {
846    let bytes = std::fs::read(path)?;
847    let text = html_to_text(&bytes)?;
848    Ok(Extracted::new(text, Format::Html))
849}
850
851/// Flatten an HTML/XHTML byte stream to clean plain text.
852///
853/// Uses `html2text`'s non-decorating plain renderer (which already drops
854/// `<script>`/`<style>`/comments and flattens lists), then strips the two
855/// markdown-ish decorations that renderer still emits — leading `#` heading
856/// markers and `[text]` link brackets — so headings and link text read as plain
857/// prose. Unordered list items keep their `*` marker and ordered items their
858/// `N.` marker (those are content-faithful and match the corpus convention).
859///
860/// A very wide wrap width (10_000) is used so paragraphs are not hard-wrapped by
861/// the renderer; paragraph structure comes from the source's block elements, and
862/// final layout is canonicalized by [`normalize_text`].
863fn html_to_text(html: &[u8]) -> Result<String> {
864    let rendered = html2text::config::plain_no_decorate()
865        .string_from_read(html, 10_000)
866        .map_err(|e| ExtractError::Parse {
867            format: "html",
868            message: e.to_string(),
869        })?;
870
871    Ok(strip_markdown_decorations(&rendered))
872}
873
874/// Strip the residual markdown decorations `html2text`'s plain renderer emits:
875/// leading run of `#` (ATX heading markers) at the start of a line, and `[...]`
876/// brackets around link/anchor text (the reference-style `[n]` suffix is already
877/// gone under `plain_no_decorate`). Bullet (`*`) and ordered (`N.`) markers are
878/// left intact — they are content, not decoration.
879fn strip_markdown_decorations(text: &str) -> String {
880    let mut out = String::with_capacity(text.len());
881    for line in text.lines() {
882        // Strip a leading "#"-run + the single space after it (ATX heading).
883        let trimmed = line.trim_start();
884        let after_hashes = trimmed.trim_start_matches('#');
885        let line = if after_hashes.len() != trimmed.len() {
886            // It was a heading line: keep indentation-free heading text.
887            after_hashes.trim_start()
888        } else {
889            line
890        };
891        out.push_str(&unwrap_brackets(line));
892        out.push('\n');
893    }
894    out
895}
896
/// Drop the brackets from every `[inner]` span, keeping `inner` (single pass,
/// no nesting: a span runs from a `[` to the first `]` after it, so any `[`
/// inside is kept as text). `html2text`'s plain renderer wraps link/anchor
/// text in single brackets; unwrapping yields the bare text.
///
/// A `[` with no `]` after it is emitted literally along with the rest of the
/// line, and a `]` with no opener is ordinary text. Backslashes get no special
/// treatment.
fn unwrap_brackets(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    let mut rest = line;
    while let Some(open) = rest.find('[') {
        // Text before the opener is copied through untouched.
        out.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];
        match after_open.find(']') {
            Some(close) => {
                out.push_str(&after_open[..close]);
                rest = &after_open[close + 1..];
            }
            None => {
                // Unmatched opener: keep it and everything after it verbatim.
                out.push_str(&rest[open..]);
                rest = "";
            }
        }
    }
    out.push_str(rest);
    out
}
930
931// ─────────────────────────────────────────────────────────────────────────────
932// Shared zip helpers (docx + epub)
933// ─────────────────────────────────────────────────────────────────────────────
934
935/// Open a zip archive from a reader, mapping any failure to a typed
936/// [`ExtractError::Parse`] tagged with the calling format.
937fn open_zip<R: Read + std::io::Seek>(
938    reader: R,
939    format: &'static str,
940) -> Result<zip::ZipArchive<R>> {
941    zip::ZipArchive::new(reader).map_err(|e| ExtractError::Parse {
942        format,
943        message: format!("not a valid zip container: {e}"),
944    })
945}
946
/// Upper bound (256 MiB) on the decompressed size of one zip entry.
///
/// docx/epub members are XML text, so a member that inflates beyond this is a
/// decompression bomb or a corrupt archive rather than real evidence.
/// `sources/` is untrusted input: reads are bounded by this cap instead of
/// letting `read_to_end` chase a hostile DEFLATE stream until memory runs out.
const MAX_ZIP_ENTRY_BYTES: u64 = 256 << 20;
952
953/// Read a single zip entry to a UTF-8 string, bounded by [`MAX_ZIP_ENTRY_BYTES`]
954/// so a zip-bomb member cannot exhaust memory. A missing entry, an over-cap
955/// entry, or a read failure is a typed [`ExtractError::Parse`]; invalid UTF-8 is
956/// lossily decoded (OOXML / XHTML are declared UTF-8, but we never panic on a
957/// stray byte).
958fn read_zip_entry<R: Read + std::io::Seek>(
959    archive: &mut zip::ZipArchive<R>,
960    name: &str,
961    format: &'static str,
962) -> Result<String> {
963    let entry = archive.by_name(name).map_err(|e| ExtractError::Parse {
964        format,
965        message: format!("missing zip entry {name:?}: {e}"),
966    })?;
967    // Reject up front when the central directory declares an over-cap size...
968    let declared = entry.size();
969    if declared > MAX_ZIP_ENTRY_BYTES {
970        return Err(ExtractError::Parse {
971            format,
972            message: format!(
973                "zip entry {name:?} declares {declared} bytes, over the {MAX_ZIP_ENTRY_BYTES}-byte cap"
974            ),
975        });
976    }
977    // ...and bound the actual decompressed read so a lying header (a bomb that
978    // understates its uncompressed size) still cannot allocate past the cap.
979    let mut bytes = Vec::new();
980    entry
981        .take(MAX_ZIP_ENTRY_BYTES + 1)
982        .read_to_end(&mut bytes)
983        .map_err(|e| ExtractError::Parse {
984            format,
985            message: format!("reading {name:?}: {e}"),
986        })?;
987    if bytes.len() as u64 > MAX_ZIP_ENTRY_BYTES {
988        return Err(ExtractError::Parse {
989            format,
990            message: format!(
991                "zip entry {name:?} exceeds the {MAX_ZIP_ENTRY_BYTES}-byte cap (decompression bomb?)"
992            ),
993        });
994    }
995    Ok(String::from_utf8_lossy(&bytes).into_owned())
996}
997
998/// Look up a start/empty element's attribute value by local name, returning it
999/// unescaped as an owned `String`. Prefix-agnostic on the attribute key.
1000fn attr_value(elem: &quick_xml::events::BytesStart<'_>, key: &[u8]) -> Option<String> {
1001    elem.attributes().flatten().find_map(|attr| {
1002        if local_name(attr.key.as_ref()) == key {
1003            // `unescape_value` returns an XML-unescaped `Cow<str>` — exactly the
1004            // owned attribute text we want. It is soft-deprecated in quick-xml
1005            // 0.40 in favor of `normalized_value(XmlVersion)`, whose extra
1006            // version arg and byte-Cow return buy us nothing here; the simple
1007            // form is correct for the UTF-8 OOXML/OPF attributes we read.
1008            #[allow(deprecated)]
1009            attr.unescape_value().ok().map(|cow| cow.into_owned())
1010        } else {
1011            None
1012        }
1013    })
1014}
1015
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Resolve `name` inside the corpus-c-formats `sources/docs/` fixture
    /// directory, anchored at this crate's manifest directory.
    fn fixture(name: &str) -> PathBuf {
        let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        path.push("../../tests/corpora/corpus-c-formats/sources/docs");
        path.push(name);
        path
    }

    /// Load the known-good text for a fixture: its `<name>.txt` sibling.
    fn expected(name: &str) -> String {
        let sibling = format!("{name}.txt");
        std::fs::read_to_string(fixture(&sibling)).unwrap()
    }

    /// Layout-agnostic form of a text: its whitespace-separated words joined by
    /// single spaces ("same words, same order"). Newlines count as whitespace.
    fn tokens(s: &str) -> String {
        let mut joined = String::new();
        for word in s.split_whitespace() {
            if !joined.is_empty() {
                joined.push(' ');
            }
            joined.push_str(word);
        }
        joined
    }

    /// Order-agnostic form of a text: every non-blank line, token-normalized,
    /// in sorted order. Used where an extractor's reading order may
    /// legitimately differ from the reference (e.g. multi-column PDF).
    fn line_set(s: &str) -> Vec<String> {
        let mut lines = Vec::new();
        for raw in s.lines() {
            let normalized = tokens(raw);
            if !normalized.is_empty() {
                lines.push(normalized);
            }
        }
        lines.sort();
        lines
    }

    // ── format detection ────────────────────────────────────────────────────

    #[test]
    fn detects_format_by_extension_case_insensitively() {
        let cases = [
            ("a.pdf", Some(Format::Pdf)),
            ("a.PDF", Some(Format::Pdf)),
            ("a.docx", Some(Format::Docx)),
            ("a.xlsx", Some(Format::Spreadsheet)),
            ("a.ods", Some(Format::Spreadsheet)),
            ("a.epub", Some(Format::Epub)),
            ("a.html", Some(Format::Html)),
            ("a.htm", Some(Format::Html)),
            ("a.txt", None),
            ("noext", None),
        ];
        for (file, want) in cases {
            assert_eq!(Format::from_path(Path::new(file)), want);
        }
    }

    #[test]
    fn unsupported_extension_is_typed_error() {
        let failure = extract(Path::new("/tmp/whatever.txt")).unwrap_err();
        assert!(matches!(failure, ExtractError::UnsupportedFormat(ref ext) if ext == "txt"));
        assert_eq!(failure.code(), "UNSUPPORTED_FORMAT");
    }

    #[test]
    fn missing_extension_is_unsupported() {
        let failure = extract(Path::new("/tmp/noext")).unwrap_err();
        assert!(matches!(failure, ExtractError::UnsupportedFormat(ref ext) if ext.is_empty()));
    }

    // ── normalization ─────────────────────────────────────────────────────────

    #[test]
    fn normalize_collapses_blanks_and_trims() {
        let messy = "\r\n\r\nHeading\r\n\r\n\r\n\r\nBody line   \r\n\r\n";
        assert_eq!(normalize_text(messy), "Heading\n\nBody line\n");
    }

    #[test]
    fn normalize_empty_stays_empty() {
        assert_eq!(normalize_text(""), "");
        assert_eq!(normalize_text("   \n\n  \n"), "");
    }

    // ── per-format extraction against corpus-c fixtures ───────────────────────

    #[test]
    fn extract_text_pdf_matches_known_good() {
        let doc = extract(&fixture("text.pdf")).unwrap();
        assert_eq!(doc.metadata["format"], MetaValue::Str("pdf".into()));
        assert_eq!(doc.metadata["pages"], MetaValue::Num(1));
        assert_eq!(tokens(&doc.text), tokens(&expected("text.pdf")));
    }

    #[test]
    fn extract_weird_fonts_pdf_matches_known_good() {
        let doc = extract(&fixture("weird-fonts.pdf")).unwrap();
        assert_eq!(tokens(&doc.text), tokens(&expected("weird-fonts.pdf")));
    }

    #[test]
    fn extract_multi_column_pdf_matches_content_order_agnostic() {
        // The known-good `.txt` holds the interleaved (pdftotext) order while
        // pdf-extract reads column by column. The content is the same, so the
        // comparison is on the SET of lines, not their order.
        // (README § multi-column.)
        let doc = extract(&fixture("multi-column.pdf")).unwrap();
        assert_eq!(line_set(&doc.text), line_set(&expected("multi-column.pdf")));
    }

    #[test]
    fn extract_image_only_pdf_yields_empty() {
        // A PDF without a text layer must produce the empty string — no OCR,
        // no invented text.
        let doc = extract(&fixture("image-only.pdf")).unwrap();
        assert_eq!(doc.text, "");
        assert!(expected("image-only.pdf").trim().is_empty());
    }

    #[test]
    fn extract_encrypted_pdf_without_password_refuses_cleanly() {
        let failure = extract(&fixture("encrypted.pdf")).unwrap_err();
        assert!(
            matches!(failure, ExtractError::Encrypted(_)),
            "expected Encrypted, got {failure:?}"
        );
        assert_eq!(failure.code(), "DOCUMENT_ENCRYPTED");
    }

    #[test]
    fn guard_pdf_panic_contains_unwind_as_parse_error() {
        // "Never panics" contract: a panic inside the PDF adapter has to come
        // back as a typed ExtractError::Parse instead of aborting the process.
        // (cargo captures the unwind's stderr line for a passing test.)
        let contained: Result<()> = guard_pdf_panic(|| panic!("simulated pdf-extract abort"));
        assert!(
            matches!(contained, Err(ExtractError::Parse { format: "pdf", .. })),
            "panic must be contained as a pdf Parse error, got {contained:?}"
        );
        // Without a panic the guard is transparent: the value comes straight out.
        let passthrough: Result<u32> = guard_pdf_panic(|| 42);
        assert_eq!(passthrough.unwrap(), 42);
    }

    #[test]
    fn extract_docx_matches_known_good() {
        let doc = extract(&fixture("sample.docx")).unwrap();
        assert_eq!(doc.metadata["format"], MetaValue::Str("docx".into()));
        assert_eq!(tokens(&doc.text), tokens(&expected("sample.docx")));
    }

    #[test]
    fn extract_xlsx_matches_known_good() {
        let doc = extract(&fixture("sample.xlsx")).unwrap();
        assert_eq!(doc.metadata["format"], MetaValue::Str("spreadsheet".into()));
        assert_eq!(doc.metadata["sheets"], MetaValue::Num(1));
        assert_eq!(
            doc.metadata["sheet_names"],
            MetaValue::Str("Expenses".into())
        );
        // Cells are tab-separated and integers carry no `.0`, so this can be an
        // exact comparison (no soft-wrap risk) modulo trailing whitespace.
        assert_eq!(doc.text.trim_end(), expected("sample.xlsx").trim_end());
    }

    #[test]
    fn extract_epub_matches_known_good() {
        let doc = extract(&fixture("sample.epub")).unwrap();
        assert_eq!(doc.metadata["format"], MetaValue::Str("epub".into()));
        assert_eq!(doc.metadata["chapters"], MetaValue::Num(1));
        assert_eq!(
            doc.metadata["title"],
            MetaValue::Str("Operations Playbook".into())
        );
        assert_eq!(tokens(&doc.text), tokens(&expected("sample.epub")));
    }

    #[test]
    fn extract_html_matches_known_good() {
        let doc = extract(&fixture("sample.html")).unwrap();
        assert_eq!(doc.metadata["format"], MetaValue::Str("html".into()));
        assert_eq!(tokens(&doc.text), tokens(&expected("sample.html")));
    }

    // ── helper-level unit tests ───────────────────────────────────────────────

    #[test]
    fn unwrap_brackets_flattens_link_text() {
        let flattened = unwrap_brackets("contact [ops@acme.example] or the [handbook].");
        assert_eq!(flattened, "contact ops@acme.example or the handbook.");
        // An opener without a closer stays in the output.
        assert_eq!(unwrap_brackets("a [b c"), "a [b c");
        // Bracket-free input comes back unchanged.
        assert_eq!(unwrap_brackets("plain text"), "plain text");
    }

    #[test]
    fn strip_markdown_decorations_drops_heading_hashes() {
        let rendered = "# Title\n## Section\n* bullet\n1. ordered\nplain\n";
        assert_eq!(
            strip_markdown_decorations(rendered),
            "Title\nSection\n* bullet\n1. ordered\nplain\n"
        );
    }

    #[test]
    fn local_name_strips_prefix() {
        assert_eq!(local_name(b"w:t"), b"t");
        assert_eq!(local_name(b"t"), b"t");
        assert_eq!(local_name(b"dc:title"), b"title");
    }

    #[test]
    fn extracted_serializes_to_text_metadata_json() {
        let doc = extract(&fixture("sample.xlsx")).unwrap();
        let json = serde_json::to_value(&doc).unwrap();
        assert!(json.get("text").is_some());
        assert_eq!(json["metadata"]["format"], "spreadsheet");
        assert_eq!(json["metadata"]["sheets"], 1);
        // A Num must serialize as a bare JSON number and a Str as a bare string.
        assert!(json["metadata"]["sheets"].is_number());
        assert!(json["metadata"]["format"].is_string());
    }

    // ── regression: leading-blank normalization is linear (finding #13) ────────

    /// `normalize_text` has to trim leading blank lines in O(n), not O(n²).
    ///
    /// Before the fix, each leading blank line was dropped with
    /// `lines.remove(0)` — an O(n) shift per line — so a document that is
    /// mostly leading blanks cost O(n²) and effectively hung extraction.
    ///
    /// With 500_000 leading blank lines the old code needs on the order of
    /// 2.5e11 element shifts (minutes to hours), while the index-and-slice path
    /// is instant. The input mirrors the finding's trigger — adapter output
    /// that is almost all leading blanks followed by one real line — and the
    /// assertion checks the fully-trimmed result. Against the pre-fix code this
    /// test does not finish in reasonable time, which is how it encodes the
    /// regression.
    #[test]
    fn regression_normalize_text_leading_blanks_is_linear() {
        let mut raw = "\n".repeat(500_000);
        raw.push_str("only real line\n");
        // All leading blanks gone, body intact, exactly one trailing newline.
        assert_eq!(normalize_text(&raw), "only real line\n");

        // The other branch: a huge input with no content at all collapses to "".
        let all_blank = "   \n".repeat(500_000);
        assert_eq!(normalize_text(&all_blank), "");
    }

    // ── regression: spreadsheet dense-grid bomb is refused (finding #4) ────────

    /// Write a VALID `.xlsx` to `dest` whose only sheet has two real cells, at
    /// opposite corners of Excel's grid (`A1` and `XFD1048576`).
    ///
    /// `calamine` materializes a sheet as a DENSE `Vec<Data>` sized from the
    /// MIN/MAX cell positions, so these two cells would demand a ~1.7e10-element
    /// (~400 GB) allocation and abort the process. The corpus `sample.xlsx`
    /// container is copied as-is with ONLY `xl/worksheets/sheet1.xml` replaced,
    /// so workbook, rels, and content-types remain those of a real, openable
    /// workbook.
    fn write_dense_bomb_xlsx(dest: &Path) {
        use std::io::Write;

        const SHEET_ENTRY: &str = "xl/worksheets/sheet1.xml";

        let bomb_sheet = b"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\
<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\">\
<sheetData>\
<row r=\"1\"><c r=\"A1\"><v>1</v></c></row>\
<row r=\"1048576\"><c r=\"XFD1048576\"><v>2</v></c></row>\
</sheetData></worksheet>";

        let template = std::fs::read(fixture("sample.xlsx")).expect("corpus sample.xlsx exists");
        let mut source = zip::ZipArchive::new(std::io::Cursor::new(template))
            .expect("sample.xlsx is a valid zip");

        let mut writer = zip::ZipWriter::new(std::fs::File::create(dest).unwrap());
        let stored = zip::write::SimpleFileOptions::default()
            .compression_method(zip::CompressionMethod::Stored);

        for index in 0..source.len() {
            let entry = source.by_index(index).unwrap();
            if entry.name() == SHEET_ENTRY {
                writer.start_file(SHEET_ENTRY, stored).unwrap();
                writer.write_all(bomb_sheet).unwrap();
            } else {
                // Everything else is carried over in its already-compressed form.
                writer.raw_copy_file(entry).unwrap();
            }
        }
        writer.finish().unwrap();
    }

    /// A sheet whose declared dense grid is larger than
    /// [`MAX_SPREADSHEET_CELLS`] must be refused with a typed
    /// [`ExtractError::Parse`] BEFORE calamine allocates the dense matrix —
    /// never an OOM/abort.
    ///
    /// Before the fix, `extract_spreadsheet` called `worksheet_range` directly
    /// and the allocation aborted the process; this test would then never
    /// return (it would take the test runner down), which is how it encodes the
    /// resource-exhaustion regression.
    #[test]
    fn regression_spreadsheet_dense_bomb_refused_not_oom() {
        let scratch = tempfile::TempDir::new().unwrap();
        let bomb_path = scratch.path().join("invoice.xlsx");
        write_dense_bomb_xlsx(&bomb_path);

        // The file itself is small — that is what makes it a bomb.
        let on_disk = std::fs::metadata(&bomb_path).unwrap().len();
        assert!(
            on_disk < 10_000,
            "the bomb must be tiny on disk; the danger is the in-memory expansion"
        );

        let failure = extract(&bomb_path).unwrap_err();
        assert!(
            matches!(
                failure,
                ExtractError::Parse {
                    format: "spreadsheet",
                    ..
                }
            ),
            "an over-cap dense grid must be a typed spreadsheet Parse refusal, got {failure:?}"
        );
        assert_eq!(failure.code(), "EXTRACT_PARSE_ERROR");
    }

    /// The cap is a guard, not a wall: an ordinary workbook must still extract.
    /// This pins that the preflight bound leaves the legitimate path alone (the
    /// corpus `sample.xlsx` is a 3×3 grid, far under the cap).
    #[test]
    fn regression_spreadsheet_cap_allows_real_workbook() {
        let doc = extract(&fixture("sample.xlsx")).unwrap();
        assert_eq!(doc.metadata["sheets"], MetaValue::Num(1));
        assert!(!doc.text.is_empty());
    }
}
dbmd_core/extract.rs

dbmd_core/
extract.rs