dbmd_core/
extract.rs

1//! Document text extraction — the `dbmd extract` engine.
2//!
3//! `sources/` is where raw evidence lands: invoices, contracts, reports,
4//! exports. Most of it arrives as binary documents (PDF, Word, Excel, EPUB) or
5//! HTML, not markdown. Before an agent can reason over that evidence — wiki-link
6//! it, summarize it into the wiki layer, file a typed record that cites it — the
7//! text has to come out. This module is that step: a binary document in, plain
8//! UTF-8 text out, format chosen by file extension.
9//!
10//! # What this is, and is not
11//!
12//! - **Deterministic decoders only.** Every adapter is a format parser
13//!   (`pdf-extract`, `calamine`, `html2text`, `quick-xml`+`zip`). There is **no
14//!   AI, no OCR, no embeddings** here — consistent with the crate-wide invariant
15//!   (`lib.rs`). The agent driving `dbmd` is the semantic layer; this is plumbing.
16//! - **Text layer, not pixels.** A scanned PDF with no text layer yields the
17//!   empty string — *empty in, empty out, never hallucinated text.* OCR is an
18//!   explicit non-goal (a future `dbmd-ocr`).
19//! - **Single document, single call.** [`extract`] handles one file. Walking a
20//!   store and extracting every document is the caller's loop, not this module's.
21//!
22//! # Format dispatch
23//!
24//! [`Format::from_path`] maps the file extension to an adapter; [`extract`]
25//! dispatches:
26//!
27//! | Extension                | Format            | Adapter                          |
28//! |--------------------------|-------------------|----------------------------------|
29//! | `.pdf`                   | [`Format::Pdf`]   | `pdf-extract`                    |
30//! | `.docx`                  | [`Format::Docx`]  | `zip` + `quick-xml` (`w:t` runs) |
31//! | `.xlsx` / `.xlsm` / `.xlsb` / `.ods` | [`Format::Spreadsheet`] | `calamine` |
32//! | `.epub`                  | [`Format::Epub`]  | `zip` + `quick-xml` + `html2text`|
33//! | `.html` / `.htm` / `.xhtml` | [`Format::Html`] | `html2text`                    |
34//!
35//! Anything else is [`ExtractError::UnsupportedFormat`] — a typed refusal the
36//! CLI surfaces with a stable code, never a panic.
37
38use std::collections::BTreeMap;
39use std::io::Read;
40use std::panic::{catch_unwind, AssertUnwindSafe};
41use std::path::Path;
42
43use serde::Serialize;
44
45/// The result of extracting one document: the plain text plus a small,
46/// format-tagged metadata map.
47///
48/// This is the `--json` shape the CLI emits verbatim (`{text, metadata}`); in
49/// plain mode the CLI prints [`Extracted::text`] and discards the metadata.
50/// Metadata is intentionally minimal and best-effort — extraction never *fails*
51/// for want of a title; it just omits the key.
52#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
53pub struct Extracted {
54    /// The extracted plain text (UTF-8), normalized to `\n` line endings with
55    /// trailing whitespace trimmed per line and a single trailing newline. For
56    /// a document with no recoverable text layer (e.g. a scanned, image-only
57    /// PDF) this is the empty string — the contract is "empty in, empty out."
58    pub text: String,
59
60    /// Best-effort key/value metadata. Always carries `format` (the adapter
61    /// that ran, e.g. `"pdf"`). Adapters add what they cheaply know:
62    /// `pages`/`sheets`/`sheet_names` (counts), `title` (when the container
63    /// declares one). A `BTreeMap` so `--json` output is key-ordered and stable.
64    pub metadata: BTreeMap<String, MetaValue>,
65}
66
67impl Extracted {
68    /// Build an [`Extracted`] from raw adapter text + the detected format,
69    /// applying the canonical text normalization ([`normalize_text`]) and
70    /// seeding the `format` metadata key.
71    fn new(raw_text: String, format: Format) -> Self {
72        let mut metadata = BTreeMap::new();
73        metadata.insert(
74            "format".to_string(),
75            MetaValue::Str(format.tag().to_string()),
76        );
77        Extracted {
78            text: normalize_text(&raw_text),
79            metadata,
80        }
81    }
82
83    /// Insert a string metadata key only when the value is non-empty (keeps the
84    /// map free of empty `title: ""` noise).
85    fn put_str(&mut self, key: &str, value: impl Into<String>) {
86        let v = value.into();
87        if !v.trim().is_empty() {
88            self.metadata.insert(key.to_string(), MetaValue::Str(v));
89        }
90    }
91
92    /// Insert a numeric (count) metadata key.
93    fn put_num(&mut self, key: &str, value: u64) {
94        self.metadata.insert(key.to_string(), MetaValue::Num(value));
95    }
96}
97
98/// A metadata value: a string (title, format tag, sheet name list joined) or a
99/// non-negative count (pages, sheets). Serializes to a bare JSON string or
100/// number — no wrapper object — so `{text, metadata}` stays flat and readable.
101#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
102#[serde(untagged)]
103pub enum MetaValue {
104    /// A textual value (e.g. document title, the `format` tag).
105    Str(String),
106    /// A non-negative count (e.g. page count, sheet count).
107    Num(u64),
108}
109
/// Every document format `dbmd extract` can decode — one variant per adapter.
/// [`Format::from_path`] picks the variant from the file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// Portable Document Format (`.pdf`); text layer read via `pdf-extract`.
    Pdf,
    /// Office Open XML WordprocessingML (`.docx`); `w:t` runs read via
    /// `quick-xml`.
    Docx,
    /// Spreadsheet (`.xlsx`/`.xlsm`/`.xlsb`/`.ods`); cells read via `calamine`.
    Spreadsheet,
    /// EPUB e-book (`.epub`); spine XHTML read via `zip` + `quick-xml` +
    /// `html2text`.
    Epub,
    /// HTML (`.html`/`.htm`/`.xhtml`); plain text produced via `html2text`.
    Html,
}

impl Format {
    /// Choose the format matching `path`'s extension, ignoring ASCII case.
    ///
    /// Yields `None` when the path has no extension, the extension is not
    /// valid UTF-8, or it is not one of the recognized extensions;
    /// [`extract`] reports that case as [`ExtractError::UnsupportedFormat`]
    /// carrying the offending extension.
    pub fn from_path(path: &Path) -> Option<Format> {
        // Recognized extensions (lowercase) and the adapter each selects.
        const BY_EXTENSION: &[(&str, Format)] = &[
            ("pdf", Format::Pdf),
            ("docx", Format::Docx),
            ("xlsx", Format::Spreadsheet),
            ("xlsm", Format::Spreadsheet),
            ("xlsb", Format::Spreadsheet),
            ("ods", Format::Spreadsheet),
            ("epub", Format::Epub),
            ("html", Format::Html),
            ("htm", Format::Html),
            ("xhtml", Format::Html),
        ];

        let ext = path.extension().and_then(|raw| raw.to_str())?;
        BY_EXTENSION
            .iter()
            .find(|(known, _)| ext.eq_ignore_ascii_case(known))
            .map(|&(_, format)| format)
    }

    /// Short, stable tag stored in `metadata.format` and used in error
    /// messages. It is not the file extension: a single tag may stand for
    /// several extensions (e.g. `spreadsheet`).
    pub fn tag(self) -> &'static str {
        match self {
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Spreadsheet => "spreadsheet",
            Self::Epub => "epub",
            Self::Html => "html",
        }
    }
}
155
156/// Errors from document extraction. Every variant is a typed refusal the CLI
157/// maps to a stable machine code — extraction never panics on a bad or
158/// encrypted input.
159#[derive(Debug, thiserror::Error)]
160pub enum ExtractError {
161    /// The file extension is missing or not one of the supported document
162    /// formats. Carries the offending extension (or `""` when absent).
163    #[error("unsupported document format: {0:?} (supported: pdf, docx, xlsx/xlsm/xlsb/ods, epub, html/htm/xhtml)")]
164    UnsupportedFormat(String),
165
166    /// The document is encrypted/password-protected and could not be opened
167    /// without a password (or with the wrong one). A clean refusal — the
168    /// extractor must never emit partial/garbled bytes for a locked file.
169    #[error("document is encrypted or password-protected: {0}")]
170    Encrypted(String),
171
172    /// A format adapter failed to parse a structurally invalid or corrupt
173    /// document. Carries the adapter's diagnostic.
174    #[error("failed to parse {format} document: {message}")]
175    Parse {
176        /// The format tag whose adapter failed (e.g. `"pdf"`, `"docx"`).
177        format: &'static str,
178        /// The underlying parser diagnostic.
179        message: String,
180    },
181
182    /// An underlying I/O failure (file missing, unreadable, etc.).
183    #[error(transparent)]
184    Io(#[from] std::io::Error),
185}
186
187impl ExtractError {
188    /// A short, stable machine code for this error, mirrored at the CLI
189    /// boundary for `--json` output and exit-code mapping.
190    pub fn code(&self) -> &'static str {
191        match self {
192            ExtractError::UnsupportedFormat(_) => "UNSUPPORTED_FORMAT",
193            ExtractError::Encrypted(_) => "DOCUMENT_ENCRYPTED",
194            ExtractError::Parse { .. } => "EXTRACT_PARSE_ERROR",
195            ExtractError::Io(_) => "IO_ERROR",
196        }
197    }
198}
199
200/// Result alias for extraction operations.
201pub type Result<T> = std::result::Result<T, ExtractError>;
202
203/// Extract plain text (and best-effort metadata) from a document, choosing the
204/// adapter by the file's extension.
205///
206/// This is the single entry point the CLI calls. It reads exactly one file and
207/// returns one [`Extracted`]; there is no whole-store walk here (per the
208/// crate-wide O(changed) invariant — a store-wide extraction is the caller's
209/// loop). An unsupported extension is [`ExtractError::UnsupportedFormat`]; an
210/// encrypted PDF is [`ExtractError::Encrypted`]; neither panics.
211///
212/// # Examples
213///
214/// ```no_run
215/// use std::path::Path;
216/// let out = dbmd_core::extract::extract(Path::new("sources/docs/invoice.pdf"))?;
217/// println!("{}", out.text);
218/// # Ok::<(), dbmd_core::extract::ExtractError>(())
219/// ```
220pub fn extract(path: &Path) -> Result<Extracted> {
221    let format = Format::from_path(path).ok_or_else(|| {
222        let ext = path
223            .extension()
224            .and_then(|e| e.to_str())
225            .unwrap_or("")
226            .to_string();
227        ExtractError::UnsupportedFormat(ext)
228    })?;
229
230    match format {
231        Format::Pdf => extract_pdf(path),
232        Format::Docx => extract_docx(path),
233        Format::Spreadsheet => extract_spreadsheet(path),
234        Format::Epub => extract_epub(path),
235        Format::Html => extract_html(path),
236    }
237}
238
239// ─────────────────────────────────────────────────────────────────────────────
240// Text normalization
241// ─────────────────────────────────────────────────────────────────────────────
242
/// Canonicalize extracted text so output is stable across adapters:
///
/// 1. Normalize line endings to `\n` (both `\r\n` and a lone `\r` become `\n`).
/// 2. Trim trailing whitespace on each line.
/// 3. Collapse every run of two-or-more consecutive blank lines to a single
///    blank line.
/// 4. Drop leading/trailing blank lines and terminate every remaining line
///    with `\n`, so non-empty output ends in exactly one `\n`. Text with no
///    non-blank line stays the empty string — the image-only-PDF contract.
///
/// This is *layout* tidy-up only; it never reorders or drops words. Word-level
/// content is whatever the adapter recovered.
pub fn normalize_text(raw: &str) -> String {
    let unix = raw.replace("\r\n", "\n").replace('\r', "\n");

    // Single O(n) pass with no intermediate line vector and no panic path.
    // A blank line is never written when it is seen; it is only remembered,
    // and at most one `\n` is emitted for it once a later non-blank line
    // proves the blank run was interior. Leading blanks are never remembered
    // (nothing has been written yet) and trailing blanks are never flushed.
    let mut out = String::with_capacity(unix.len());
    let mut pending_blank = false;
    for line in unix.lines().map(str::trim_end) {
        if line.is_empty() {
            pending_blank = !out.is_empty();
            continue;
        }
        if pending_blank {
            out.push('\n');
            pending_blank = false;
        }
        out.push_str(line);
        out.push('\n');
    }
    out
}
291
292// ─────────────────────────────────────────────────────────────────────────────
293// PDF — pdf-extract
294// ─────────────────────────────────────────────────────────────────────────────
295
296/// Extract a PDF's text layer via `pdf-extract`.
297///
298/// A PDF with no text layer (a scanned image) yields the empty string — that is
299/// correct, not an error (OCR is out of scope). A password-protected PDF that
300/// cannot be opened is mapped to [`ExtractError::Encrypted`] rather than a raw
301/// parse error so the caller can branch on it. Metadata carries the page count
302/// when the document tree exposes it.
303///
304/// `pdf-extract`/`lopdf` `panic!` internally on some malformed-but-openable
305/// PDFs (e.g. an out-of-set base `/Encoding` name), so both parser calls are
306/// wrapped in [`std::panic::catch_unwind`]: an internal abort is contained and
307/// surfaced as [`ExtractError::Parse`], upholding this module's "never panics"
308/// contract on untrusted `sources/` input.
309fn extract_pdf(path: &Path) -> Result<Extracted> {
310    // Read the bytes ourselves so a missing/unreadable file is a clean
311    // `ExtractError::Io` (via `?`) before we hand anything to the PDF parser.
312    let bytes = std::fs::read(path)?;
313
314    let text = match guard_pdf_panic(|| pdf_extract::extract_text_from_mem(&bytes))? {
315        Ok(t) => t,
316        Err(e) => return Err(classify_pdf_error(e)),
317    };
318
319    let mut out = Extracted::new(text, Format::Pdf);
320
321    // Page count is best-effort; derive it from the parsed document. A parse
322    // failure OR an internal panic here is non-fatal — the text already
323    // succeeded — so a contained panic (outer `Err`) and a load failure (inner
324    // `Err`) are both silently skipped.
325    if let Ok(Ok(doc)) = guard_pdf_panic(|| pdf_extract::Document::load_mem(&bytes)) {
326        out.put_num("pages", doc.get_pages().len() as u64);
327    }
328
329    Ok(out)
330}
331
332/// Run a panic-prone `pdf-extract`/`lopdf` call, converting an internal unwind
333/// into a typed [`ExtractError::Parse`] tagged `pdf` so the module's "never
334/// panics" contract holds on adversarial PDFs. `AssertUnwindSafe` is sound: the
335/// closure borrows only `&[u8]`, and on a caught unwind we discard any partial
336/// state and return an owned error. The default panic hook still writes the
337/// panic line to stderr — library code must not mutate the process-global hook.
338fn guard_pdf_panic<T>(f: impl FnOnce() -> T) -> Result<T> {
339    catch_unwind(AssertUnwindSafe(f)).map_err(|_| ExtractError::Parse {
340        format: "pdf",
341        message: "pdf parser aborted on malformed input".to_string(),
342    })
343}
344
345/// Map a `pdf-extract` error onto the right [`ExtractError`] variant.
346/// Decryption failures become [`ExtractError::Encrypted`]; everything else is a
347/// [`ExtractError::Parse`] tagged `pdf`.
348fn classify_pdf_error(err: pdf_extract::OutputError) -> ExtractError {
349    let msg = err.to_string();
350    let lower = msg.to_ascii_lowercase();
351    if lower.contains("password") || lower.contains("decrypt") || lower.contains("encrypt") {
352        ExtractError::Encrypted(msg)
353    } else {
354        ExtractError::Parse {
355            format: "pdf",
356            message: msg,
357        }
358    }
359}
360
361// ─────────────────────────────────────────────────────────────────────────────
362// DOCX — zip + quick-xml (no docx-rs dependency; quick-xml is already needed
363// for epub, so docx, xlsx-via-calamine, and epub share one XML/zip surface)
364// ─────────────────────────────────────────────────────────────────────────────
365
366/// Extract a `.docx` (WordprocessingML) by unzipping `word/document.xml` and
367/// concatenating the `<w:t>` run text, one logical line per `<w:p>` paragraph.
368///
369/// `<w:tab/>` becomes a tab and `<w:br/>` / `<w:cr>` a newline so table-ish and
370/// line-broken content keeps its shape; everything else is structural and
371/// ignored. This is the same minimal-but-faithful path `docx-rs` takes for text
372/// extraction, without pulling in a second XML/zip stack.
373fn extract_docx(path: &Path) -> Result<Extracted> {
374    let file = std::fs::File::open(path)?;
375    let mut archive = open_zip(file, "docx")?;
376
377    let xml = read_zip_entry(&mut archive, "word/document.xml", "docx")?;
378    let text = wordprocessing_text(&xml, "docx")?;
379
380    Ok(Extracted::new(text, Format::Docx))
381}
382
383/// Pull paragraph text out of a WordprocessingML / DrawingML XML body.
384///
385/// Shared by [`extract_docx`]. Walks the event stream collecting `<w:t>` text;
386/// `<w:p>` ends a line, `<w:tab/>` is a tab, `<w:br>`/`<w:cr>` a newline.
387///
388/// Output-bounded for parity with the HTML/EPUB adapters. A docx is a zip, and
389/// `word/document.xml` is attacker-controlled `sources/` input that can compress
390/// enormously: a few-hundred-KB `.docx` whose `document.xml` inflates to hundreds
391/// of MB of `<w:t>` runs would otherwise accumulate without bound. We cap the
392/// running output at [`MAX_EXTRACT_OUTPUT_BYTES`] *during* accumulation — the
393/// same ceiling EPUB enforces — so peak memory stays bounded rather than only
394/// being checked after the full string is materialized.
395fn wordprocessing_text(xml: &str, format: &'static str) -> Result<String> {
396    use quick_xml::events::Event;
397    use quick_xml::reader::Reader;
398
399    let mut reader = Reader::from_str(xml);
400    let mut buf = Vec::new();
401    let mut out = String::new();
402    let mut in_text_run = false;
403
404    // Refuse once accumulated text crosses the cap. Checked after each append so a
405    // single huge run can't blow past the ceiling before the next loop turn.
406    macro_rules! bound_output {
407        () => {
408            if out.len() > MAX_EXTRACT_OUTPUT_BYTES {
409                return Err(ExtractError::Parse {
410                    format,
411                    message: format!(
412                        "extracted text exceeds the {MAX_EXTRACT_OUTPUT_BYTES} byte cap \
413                         (malformed or hostile input)"
414                    ),
415                });
416            }
417        };
418    }
419
420    loop {
421        match reader.read_event_into(&mut buf) {
422            Ok(Event::Start(e)) => {
423                if local_name(e.name().as_ref()) == b"t" {
424                    in_text_run = true;
425                }
426            }
427            Ok(Event::End(e)) => {
428                let name = e.name();
429                match local_name(name.as_ref()) {
430                    b"t" => in_text_run = false,
431                    b"p" => {
432                        out.push('\n');
433                        bound_output!();
434                    }
435                    _ => {}
436                }
437            }
438            Ok(Event::Empty(e)) => {
439                // Self-closing run-level breaks inside a paragraph.
440                match local_name(e.name().as_ref()) {
441                    b"tab" => out.push('\t'),
442                    b"br" | b"cr" => out.push('\n'),
443                    _ => {}
444                }
445            }
446            // quick-xml 0.40 surfaces text verbatim in `Event::Text` but routes
447            // every entity reference to a separate `Event::GeneralRef` and CDATA
448            // to `Event::CData` — all three carry run content.
449            Ok(Event::Text(t)) => {
450                if in_text_run {
451                    out.push_str(&String::from_utf8_lossy(&t.into_inner()));
452                    bound_output!();
453                }
454            }
455            // `Smith &amp; Co` arrives as Text("Smith ") + GeneralRef("amp") +
456            // Text(" Co"); resolve the ref so `&`/`<`/`>`/numeric chars survive.
457            Ok(Event::GeneralRef(r)) => {
458                if in_text_run {
459                    out.push_str(&resolve_entity_ref(&r));
460                    bound_output!();
461                }
462            }
463            // CDATA inside a `<w:t>` run is valid WordprocessingML; its payload
464            // is literal text and must be appended like `Event::Text`.
465            Ok(Event::CData(c)) => {
466                if in_text_run {
467                    out.push_str(&String::from_utf8_lossy(&c.into_inner()));
468                    bound_output!();
469                }
470            }
471            Ok(Event::Eof) => break,
472            Err(e) => {
473                return Err(ExtractError::Parse {
474                    format,
475                    message: format!("malformed XML: {e}"),
476                });
477            }
478            _ => {}
479        }
480        buf.clear();
481    }
482
483    Ok(out)
484}
485
/// Strip any namespace prefix from an XML name, returning the bytes after the
/// last `:` — `w:t` → `t`, `t` → `t`. A name without a colon is returned
/// unchanged.
///
/// docx/epub XML carries prefixes (`w:`, `dc:`) chosen by whoever wrote the
/// file; comparing local names makes matching independent of that choice.
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece, and its first piece is
    // whatever follows the final colon (the whole name when there is none).
    qname.rsplit(|&byte| byte == b':').next().unwrap_or(qname)
}
495
496/// Resolve a `quick_xml` general-entity / character reference to its literal
497/// text. quick-xml 0.40 does NOT inline-resolve entity references inside
498/// `Event::Text`; instead it surfaces each `&name;` / `&#nnn;` as a separate
499/// `Event::GeneralRef`. Routing those to a `_ => {}` arm silently drops `&`,
500/// `<`, `>`, numeric refs, etc. from extracted text — corrupting any title,
501/// company name, or amount that contains them. This resolves the five
502/// XML-predefined named entities and any numeric character reference; an
503/// unknown named entity falls back to its bare name (best-effort, never a
504/// panic), matching the "recover what we can" stance of `sources/` extraction.
505fn resolve_entity_ref(reference: &quick_xml::events::BytesRef<'_>) -> String {
506    // Numeric character reference (`&#8212;`, `&#x2014;`): resolve to the char.
507    if let Ok(Some(ch)) = reference.resolve_char_ref() {
508        return ch.to_string();
509    }
510    // Named entity: map the five XML-predefined names; fall back to the bare
511    // name for anything else (custom DTD entities are out of scope here).
512    match reference.decode().as_deref() {
513        Ok("amp") => "&".to_string(),
514        Ok("lt") => "<".to_string(),
515        Ok("gt") => ">".to_string(),
516        Ok("quot") => "\"".to_string(),
517        Ok("apos") => "'".to_string(),
518        Ok(other) => other.to_string(),
519        Err(_) => String::new(),
520    }
521}
522
523// ─────────────────────────────────────────────────────────────────────────────
524// Spreadsheet — calamine (xlsx / xlsm / xlsb / ods)
525// ─────────────────────────────────────────────────────────────────────────────
526
/// Upper bound on the dense cell grid (`rows × cols`) accepted for any single
/// sheet.
///
/// `calamine` materializes a worksheet as a DENSE `Vec<Data>` sized from the
/// MIN/MAX cell positions (`Range::from_sparse`). Two cells at `A1` and
/// `XFD1048576` in a few-hundred-byte file therefore demand a ~1.7e10-element
/// (~400 GB) allocation that **aborts** the process. Neither the docx/epub
/// zip-entry cap nor the PDF panic guard helps: an allocation failure aborts
/// rather than unwinds, so `catch_unwind` cannot contain it. Because
/// `sources/` is untrusted input, the spreadsheet adapter refuses such sheets
/// before the allocation happens.
///
/// 50M cells is ~1.2 GB worst-case dense (`Data` ≈ 24 bytes) — far above any
/// real spreadsheet's used range, far below the weaponizable extreme.
const MAX_SPREADSHEET_CELLS: u64 = 50_000_000;
539
540/// Extract every sheet of a spreadsheet via `calamine`, rendering each row as
541/// tab-separated cells, one row per line, sheets in workbook order separated by
542/// a blank line.
543///
544/// Cell rendering: text verbatim; integers and whole-valued floats without a
545/// trailing `.0` (`1200`, not `1200.0`); other floats via their default
546/// formatting; booleans as `TRUE`/`FALSE`; empty/error cells as the empty
547/// string. Metadata carries the sheet count and the joined sheet-name list.
548///
549/// Before materializing each sheet, [`spreadsheet_dense_cells`] bounds the
550/// would-be dense grid against [`MAX_SPREADSHEET_CELLS`] and returns a typed
551/// [`ExtractError::Parse`] refusal rather than letting an attacker-supplied
552/// sheet OOM/abort the process — upholding the module's "never panics on
553/// untrusted `sources/` input" contract for the spreadsheet adapter.
554fn extract_spreadsheet(path: &Path) -> Result<Extracted> {
555    use calamine::{open_workbook_auto, Reader};
556
557    // ODS has no sparse-iterator pre-scan (see `spreadsheet_dense_cells`), so the
558    // xlsx-family fail-fast on a truncated/unclosed `content.xml` does not protect
559    // it: a `.ods` whose `content.xml` opens `<table:table>` then hits EOF makes
560    // calamine's ODS reader spin forever (an UNBOUNDED loop, not a panic —
561    // `catch_unwind` cannot recover it). The hang is reachable from the very first
562    // calamine call (`open_workbook_auto` parses the ODS document on open), so the
563    // structural validity gate has to run BEFORE we hand the file to calamine at
564    // all — not merely before `worksheet_range`. Gate by extension (the `.ods`
565    // backend is the only one with this unbounded shape; `.xls`/BIFF is
566    // format-bounded and the xlsx-family is pre-scanned). A truncated/unclosed
567    // document fails fast here with a typed Parse refusal — the same shape the
568    // xlsx pre-scan produces on a truncated sheet.
569    let is_ods = path
570        .extension()
571        .and_then(|e| e.to_str())
572        .is_some_and(|e| e.eq_ignore_ascii_case("ods"));
573    if is_ods {
574        ods_content_xml_well_formed(path)?;
575    }
576
577    let mut workbook = open_workbook_auto(path).map_err(|e| ExtractError::Parse {
578        format: "spreadsheet",
579        message: e.to_string(),
580    })?;
581
582    let sheet_names = workbook.sheet_names().to_vec();
583    let mut text = String::new();
584
585    for (idx, name) in sheet_names.iter().enumerate() {
586        if idx > 0 {
587            text.push('\n'); // blank line between sheets
588        }
589
590        // Bound the dense grid BEFORE calamine allocates it. For the zip-XML /
591        // record backends that expose a sparse cell iterator (xlsx-family,
592        // xlsb) this never densely allocates; over-cap sheets refuse cleanly.
593        if let Some(cells) = spreadsheet_dense_cells(&mut workbook, name)? {
594            if cells > MAX_SPREADSHEET_CELLS {
595                return Err(ExtractError::Parse {
596                    format: "spreadsheet",
597                    message: format!(
598                        "sheet {name:?} declares a {cells}-cell grid, over the \
599                         {MAX_SPREADSHEET_CELLS}-cell cap (malformed or hostile spreadsheet)"
600                    ),
601                });
602            }
603        }
604
605        let range = workbook
606            .worksheet_range(name)
607            .map_err(|e| ExtractError::Parse {
608                format: "spreadsheet",
609                message: format!("sheet {name:?}: {e}"),
610            })?;
611
612        for row in range.rows() {
613            let cells: Vec<String> = row.iter().map(render_cell).collect();
614            text.push_str(&cells.join("\t"));
615            text.push('\n');
616        }
617    }
618
619    let mut out = Extracted::new(text, Format::Spreadsheet);
620    out.put_num("sheets", sheet_names.len() as u64);
621    if !sheet_names.is_empty() {
622        out.put_str("sheet_names", sheet_names.join(", "));
623    }
624    Ok(out)
625}
626
627/// Structurally validate an `.ods` `content.xml` before the unbounded calamine
628/// ODS reader touches it.
629///
630/// calamine's ODS backend exposes no sparse-cell iterator, so it gets none of the
631/// streaming pre-scan that bounds (and fails fast on truncated input) the
632/// xlsx/xlsb path in [`spreadsheet_dense_cells`]. On a `.ods` whose `content.xml`
633/// opens `<table:table>` and then hits EOF before the matching `</table:table>`,
634/// `worksheet_range` spins forever at full CPU — a resource-exhaustion DoS on
635/// untrusted `sources/` input, and an *infinite loop* that [`catch_unwind`]
636/// cannot recover (it catches panics, not hangs).
637///
638/// This gate reuses the shared zip helpers ([`open_zip`] / [`read_zip_entry`],
639/// bounded by [`MAX_ZIP_ENTRY_BYTES`]) to read `content.xml`, then streams it
640/// through `quick-xml` exactly like [`wordprocessing_text`] does for docx. A
641/// truncated/unclosed document surfaces as a `quick-xml` error (e.g. "Unexpected
642/// end of xml") or as an at-EOF tag-balance mismatch; either way we return a
643/// typed [`ExtractError::Parse`] (format `"spreadsheet"`) in well under a second,
644/// matching how a truncated `.xlsx` already fails — instead of letting calamine
645/// hang. A well-formed `content.xml` passes through untouched, so valid `.ods`
646/// extraction is unchanged. Peak memory stays bounded by the zip-entry cap; the
647/// scan never densely materializes anything.
648fn ods_content_xml_well_formed(path: &Path) -> Result<()> {
649    use quick_xml::events::Event;
650    use quick_xml::reader::Reader;
651
652    let file = std::fs::File::open(path)?;
653    let mut archive = open_zip(file, "spreadsheet")?;
654    let xml = read_zip_entry(&mut archive, "content.xml", "spreadsheet")?;
655
656    let mut reader = Reader::from_str(&xml);
657    let mut depth: i64 = 0;
658    loop {
659        match reader.read_event() {
660            // Any structural malformation (including the unclosed `<table:table>`
661            // at EOF, which quick-xml reports as "Unexpected end of xml") is a
662            // typed refusal — never a hang.
663            Err(e) => {
664                return Err(ExtractError::Parse {
665                    format: "spreadsheet",
666                    message: format!("malformed ODS content.xml: {e}"),
667                });
668            }
669            Ok(Event::Start(_)) => depth += 1,
670            Ok(Event::End(_)) => depth -= 1,
671            Ok(Event::Eof) => break,
672            _ => {}
673        }
674    }
675
676    // Belt-and-suspenders: even if a quirk let the stream reach EOF with elements
677    // still open, an unbalanced tree is not a document the ODS reader can finish.
678    // Refuse rather than risk the unbounded path.
679    if depth != 0 {
680        return Err(ExtractError::Parse {
681            format: "spreadsheet",
682            message: "malformed ODS content.xml: unbalanced elements (truncated document)"
683                .to_string(),
684        });
685    }
686
687    Ok(())
688}
689
690/// Compute the would-be dense cell count (`rows × cols`) of one sheet WITHOUT
691/// the dense allocation, by streaming the sheet's sparse cells and tracking the
692/// MIN/MAX non-empty position — exactly the bounds `Range::from_sparse` uses.
693///
694/// Returns `Some(rows * cols)` for the formats that expose a sparse cell
695/// iterator (`.xlsx`/`.xlsm`/`.xlsb`/`.xlam`), which are the realistic
696/// decompression/dimension-bomb vectors (an OOXML/record sheet can place two
697/// cells 1e10 apart in a few hundred bytes). Returns `None` for `.xls` (BIFF,
698/// format-bounded to ≤ 65 536 × 256 ≈ 1.7e7 cells) and `.ods`, neither of which
699/// exposes a sparse iterator on the auto-detected reader; those fall through to
700/// the normal materialization path. A row/col delta is saturated into `u64` so
701/// the multiply cannot overflow.
702fn spreadsheet_dense_cells(
703    workbook: &mut calamine::Sheets<std::io::BufReader<std::fs::File>>,
704    name: &str,
705) -> Result<Option<u64>> {
706    use calamine::{DataRef, Sheets};
707
708    // Stream cells, tracking the non-empty MIN/MAX extent that `from_sparse`
709    // would allocate. Empty cells are excluded (calamine drops them before
710    // computing the dense bounds), matching the dense grid exactly.
711    fn extent<E: std::fmt::Display>(
712        mut next: impl FnMut() -> std::result::Result<Option<((u32, u32), bool)>, E>,
713    ) -> Result<Option<u64>> {
714        let (mut r0, mut r1, mut c0, mut c1) = (u32::MAX, 0u32, u32::MAX, 0u32);
715        let mut any = false;
716        loop {
717            match next() {
718                Ok(Some(((r, c), is_empty))) => {
719                    if is_empty {
720                        continue;
721                    }
722                    any = true;
723                    r0 = r0.min(r);
724                    r1 = r1.max(r);
725                    c0 = c0.min(c);
726                    c1 = c1.max(c);
727                }
728                Ok(None) => break,
729                Err(e) => {
730                    return Err(ExtractError::Parse {
731                        format: "spreadsheet",
732                        message: format!("scanning sheet dimensions: {e}"),
733                    })
734                }
735            }
736        }
737        if !any {
738            return Ok(Some(0));
739        }
740        let rows = u64::from(r1 - r0) + 1;
741        let cols = u64::from(c1 - c0) + 1;
742        Ok(Some(rows.saturating_mul(cols)))
743    }
744
745    match workbook {
746        Sheets::Xlsx(xlsx) => {
747            let mut reader =
748                xlsx.worksheet_cells_reader(name)
749                    .map_err(|e| ExtractError::Parse {
750                        format: "spreadsheet",
751                        message: format!("sheet {name:?}: {e}"),
752                    })?;
753            extent(|| {
754                reader.next_cell().map(|opt| {
755                    opt.map(|c| (c.get_position(), matches!(c.get_value(), DataRef::Empty)))
756                })
757            })
758        }
759        Sheets::Xlsb(xlsb) => {
760            let mut reader =
761                xlsb.worksheet_cells_reader(name)
762                    .map_err(|e| ExtractError::Parse {
763                        format: "spreadsheet",
764                        message: format!("sheet {name:?}: {e}"),
765                    })?;
766            extent(|| {
767                reader.next_cell().map(|opt| {
768                    opt.map(|c| (c.get_position(), matches!(c.get_value(), DataRef::Empty)))
769                })
770            })
771        }
772        // `.xls` (BIFF, format-bounded) and `.ods` expose no sparse iterator on
773        // the auto reader; let them materialize normally.
774        Sheets::Xls(_) | Sheets::Ods(_) => Ok(None),
775    }
776}
777
778/// Render one spreadsheet cell to its text form. Whole-valued floats drop the
779/// `.0` (so `3450.0` → `3450`), matching how spreadsheet apps display an
780/// integer-typed amount.
781fn render_cell(cell: &calamine::Data) -> String {
782    use calamine::Data;
783    match cell {
784        Data::Empty => String::new(),
785        Data::String(s) => s.clone(),
786        Data::Int(i) => i.to_string(),
787        Data::Float(f) => {
788            if f.fract() == 0.0 && f.is_finite() && f.abs() < 1e15 {
789                format!("{}", *f as i64)
790            } else {
791                f.to_string()
792            }
793        }
794        Data::Bool(b) => {
795            if *b {
796                "TRUE".to_string()
797            } else {
798                "FALSE".to_string()
799            }
800        }
801        // A date/datetime cell is an Excel SERIAL number (days since the 1900
802        // epoch, fractional part = time of day). `ExcelDateTime`'s `Display`
803        // writes the raw serial (`46188`, `46143.5`), which is meaningless to an
804        // agent filing the value into a record, so render the calendar date
805        // instead. `to_ymd_hms_milli` is available without the `chrono` feature.
806        Data::DateTime(dt) => render_excel_datetime(dt),
807        Data::DateTimeIso(s) => s.clone(),
808        Data::DurationIso(s) => s.clone(),
809        Data::Error(e) => format!("{e:?}"),
810    }
811}
812
813/// Render an Excel serial date/datetime to an ISO calendar string. A pure date
814/// (midnight, no sub-day component) renders `YYYY-MM-DD`; a datetime with a time
815/// component renders `YYYY-MM-DD HH:MM:SS`. A duration (Excel `[hh]:mm:ss`
816/// elapsed-time format) is not a calendar date, so it keeps its raw serial form
817/// (the prior behavior) rather than being misrendered as a date.
818fn render_excel_datetime(dt: &calamine::ExcelDateTime) -> String {
819    // Guard the serial BEFORE calling `to_ymd_hms_milli`. A date cell carries an
820    // arbitrary (attacker-controlled in `sources/`) f64; calamine's conversion is
821    // only defined over its calendar window (~1899-12-31..9999-12-31, i.e. serial
822    // 0..=2_958_465). Outside it, calamine saturates `floor() as u64` and then
823    // overflows on `days += 109_571` — a panic in debug (abort, exit 101) and a
824    // fabricated far-past date in release (`1e308` → `1899-12-29`), both of which
825    // violate the module contract ("never panics on untrusted input, never
826    // hallucinated text"). A duration is likewise not a calendar point. In every
827    // such case keep the raw serial, exactly as the duration branch always did.
828    let serial = dt.as_f64();
829    if dt.is_duration() || !(0.0..=2_958_465.0).contains(&serial) {
830        return serial.to_string();
831    }
832    let (y, mo, d, h, mi, s, _ms) = dt.to_ymd_hms_milli();
833    if h == 0 && mi == 0 && s == 0 {
834        format!("{y:04}-{mo:02}-{d:02}")
835    } else {
836        format!("{y:04}-{mo:02}-{d:02} {h:02}:{mi:02}:{s:02}")
837    }
838}
839
840// ─────────────────────────────────────────────────────────────────────────────
841// EPUB — zip + quick-xml (spine order) + html2text (per-chapter)
842// ─────────────────────────────────────────────────────────────────────────────
843//
844// We do NOT use the `epub` crate: it is GPL-3.0, which violates the toolkit's
845// permissive-only license rule. An EPUB is a zip whose OPF package declares a
846// reading-order `spine`; each spine item is an XHTML document. zip + quick-xml
847// (already dependencies) read the container/OPF, and html2text (already a
848// dependency for `.html`) flattens each chapter. Same machinery, no GPL.
849
/// Max spine itemrefs an `.epub` may declare before extraction refuses it. The
/// spine is attacker-controlled (`parse_opf` pushes every `<itemref>`), so a
/// few-KB file can declare millions; this bounds the read loop. Far above any
/// real book (which has well under a few hundred reading-order items).
///
/// Enforced by `extract_epub` with a strict `>` comparison before the chapter
/// loop starts, so a spine of exactly this many items is still accepted; a
/// longer one is refused with an `ExtractError::Parse` (format `"epub"`).
const MAX_EPUB_SPINE_ITEMS: usize = 10_000;
855
/// Hard cap on accumulated extracted-text bytes, shared by every adapter that
/// concatenates or materializes a large string from untrusted `sources/` input:
/// EPUB chapter concatenation, the HTML/XHTML flattener ([`html_to_text`]), and
/// the WordprocessingML run accumulator ([`wordprocessing_text`]). The common
/// backstop against output amplification — a long EPUB spine, a renderer
/// pathology, or a docx whose `document.xml` inflates to hundreds of MB — so
/// extracted text (and stdout) can't balloon without bound.
///
/// How it is applied differs per adapter: the EPUB path checks it after every
/// appended chapter, so accumulation stops early. [`html_to_text`] can only
/// measure the string once html2text has returned it, so there the cap bounds
/// what is *returned*, and the structural pre-checks (nesting depth, table cell
/// counts) are what keep peak memory down. NOTE(review): the
/// `wordprocessing_text` check is outside this part of the file — confirm it
/// runs during accumulation.
///
/// Far above any real document's flattened text; only hostile/corrupt input hits.
const MAX_EXTRACT_OUTPUT_BYTES: usize = 64 * 1024 * 1024;
866
867/// Extract an EPUB's reading-order text:
868/// 1. read `META-INF/container.xml` → the OPF package path;
869/// 2. parse the OPF `manifest` (id→href) and `spine` (ordered idref list);
870/// 3. for each spine item, read its XHTML and flatten it with [`html_to_text`];
871/// 4. join chapters with a blank line.
872///
873/// Bounded against spine amplification: the spine length is capped, each
874/// distinct chapter is rendered at most once (memoized), and the total output is
875/// capped — so a tiny crafted `.epub` can neither peg a core nor balloon memory.
876///
877/// Metadata carries `title` (the OPF `dc:title`) and `chapters` (spine length).
878fn extract_epub(path: &Path) -> Result<Extracted> {
879    let file = std::fs::File::open(path)?;
880    let mut archive = open_zip(file, "epub")?;
881
882    // 1. container.xml → OPF path.
883    let container = read_zip_entry(&mut archive, "META-INF/container.xml", "epub")?;
884    let opf_path = epub_opf_path(&container)?;
885
886    // 2. OPF → base dir, manifest, spine, title.
887    let opf = read_zip_entry(&mut archive, &opf_path, "epub")?;
888    let parsed = parse_opf(&opf)?;
889    let base = opf_base_dir(&opf_path);
890
891    // Bound the spine length BEFORE the loop: `parse_opf` pushes every
892    // attacker-controlled `<itemref idref>` verbatim, so a tiny crafted .epub can
893    // declare millions of items. Even spine entries that render to empty text
894    // still cost a zip read each, so the output cap below can't bound the loop on
895    // its own — this guard does. Real books have well under a few hundred items.
896    if parsed.spine.len() > MAX_EPUB_SPINE_ITEMS {
897        return Err(ExtractError::Parse {
898            format: "epub",
899            message: format!(
900                "spine declares {} items, exceeding the {} cap",
901                parsed.spine.len(),
902                MAX_EPUB_SPINE_ITEMS
903            ),
904        });
905    }
906
907    // 3. Spine items in order → flattened chapter text.
908    let mut text = String::new();
909    let mut chapters = 0u64;
910    // Memoize rendered chapters by zip-entry path: a spine that references the
911    // SAME manifest item repeatedly must re-render it in O(1), not re-decode the
912    // zip entry and re-flatten its XHTML each time (the dominant CPU cost of the
913    // spine-amplification DoS — a few-KB file could peg a core indefinitely).
914    let mut rendered: std::collections::HashMap<String, String> = std::collections::HashMap::new();
915    for idref in &parsed.spine {
916        let Some(href) = parsed.manifest.get(idref) else {
917            continue; // dangling spine ref; skip rather than fail
918        };
919        let entry = join_zip_path(&base, href);
920        let chapter_text = match rendered.get(&entry) {
921            Some(cached) => cached.clone(),
922            None => {
923                // A missing spine target is skipped (best-effort), not fatal.
924                let Ok(chapter_xhtml) = read_zip_entry(&mut archive, &entry, "epub") else {
925                    continue;
926                };
927                let t = html_to_text(chapter_xhtml.as_bytes())?;
928                rendered.insert(entry.clone(), t.clone());
929                t
930            }
931        };
932        if !chapter_text.trim().is_empty() {
933            if chapters > 0 {
934                text.push('\n');
935            }
936            text.push_str(&chapter_text);
937            text.push('\n');
938            chapters += 1;
939            // Hard output backstop: a long spine of DISTINCT items, or a near-cap
940            // chapter referenced many times, must not balloon the extracted text
941            // (and stdout) without bound.
942            if text.len() > MAX_EXTRACT_OUTPUT_BYTES {
943                return Err(ExtractError::Parse {
944                    format: "epub",
945                    message: format!(
946                        "extracted text exceeds the {} byte cap",
947                        MAX_EXTRACT_OUTPUT_BYTES
948                    ),
949                });
950            }
951        }
952    }
953
954    let mut out = Extracted::new(text, Format::Epub);
955    out.put_num("chapters", chapters);
956    if let Some(title) = parsed.title {
957        out.put_str("title", title);
958    }
959    Ok(out)
960}
961
962/// The full-path of the OPF package file, read from `META-INF/container.xml`'s
963/// first `<rootfile full-path="…">`.
964fn epub_opf_path(container_xml: &str) -> Result<String> {
965    use quick_xml::events::Event;
966    use quick_xml::reader::Reader;
967
968    let mut reader = Reader::from_str(container_xml);
969    let mut buf = Vec::new();
970    loop {
971        match reader.read_event_into(&mut buf) {
972            Ok(Event::Start(e)) | Ok(Event::Empty(e)) => {
973                if local_name(e.name().as_ref()) == b"rootfile" {
974                    if let Some(p) = attr_value(&e, b"full-path") {
975                        return Ok(p);
976                    }
977                }
978            }
979            Ok(Event::Eof) => break,
980            Err(e) => {
981                return Err(ExtractError::Parse {
982                    format: "epub",
983                    message: format!("container.xml: {e}"),
984                })
985            }
986            _ => {}
987        }
988        buf.clear();
989    }
990    Err(ExtractError::Parse {
991        format: "epub",
992        message: "container.xml has no <rootfile full-path>".to_string(),
993    })
994}
995
/// The parsed-out pieces of an OPF package we need for reading-order text.
///
/// Produced by `parse_opf` and consumed by `extract_epub`.
struct OpfParsed {
    /// Manifest: item id → href (relative to the OPF's directory).
    /// Filled from every `item` element that has both an `id` and an `href`;
    /// a later item with the same id replaces the earlier href.
    manifest: BTreeMap<String, String>,
    /// Spine: ordered list of manifest item ids (the reading order), one per
    /// `itemref` element that has an `idref`, in document order. Entries are
    /// not validated against the manifest here and may repeat.
    spine: Vec<String>,
    /// `dc:title`, if present: the trimmed text of the first `title` element
    /// whose content is non-empty after trimming.
    title: Option<String>,
}
1005
1006/// Parse an OPF package document into its manifest, spine, and title.
1007fn parse_opf(opf_xml: &str) -> Result<OpfParsed> {
1008    use quick_xml::events::Event;
1009    use quick_xml::reader::Reader;
1010
1011    let mut reader = Reader::from_str(opf_xml);
1012    let mut buf = Vec::new();
1013
1014    let mut manifest = BTreeMap::new();
1015    let mut spine = Vec::new();
1016    let mut title: Option<String> = None;
1017    // Whether we are inside the FIRST `<dc:title>` element, and the text we have
1018    // accumulated for it. We accumulate across every Text/GeneralRef/CData event
1019    // until the matching End so an entity, comment, or nested element inside the
1020    // title does not truncate it.
1021    let mut in_title = false;
1022    let mut title_buf = String::new();
1023
1024    loop {
1025        match reader.read_event_into(&mut buf) {
1026            Ok(Event::Start(e)) => match local_name(e.name().as_ref()) {
1027                b"item" => {
1028                    if let (Some(id), Some(href)) = (attr_value(&e, b"id"), attr_value(&e, b"href"))
1029                    {
1030                        manifest.insert(id, href);
1031                    }
1032                }
1033                b"itemref" => {
1034                    if let Some(idref) = attr_value(&e, b"idref") {
1035                        spine.push(idref);
1036                    }
1037                }
1038                // Only a Start (not a self-closing Empty) opens the title: an
1039                // Empty `<dc:title/>` has no content and produces no End event,
1040                // so latching `in_title` on it would wrongly capture the next
1041                // text node (e.g. the author) as the title.
1042                b"title" if title.is_none() => in_title = true,
1043                _ => {}
1044            },
1045            // Self-closing manifest/spine entries are Empty events; the title is
1046            // never captured from Empty (see the Start arm's note).
1047            Ok(Event::Empty(e)) => match local_name(e.name().as_ref()) {
1048                b"item" => {
1049                    if let (Some(id), Some(href)) = (attr_value(&e, b"id"), attr_value(&e, b"href"))
1050                    {
1051                        manifest.insert(id, href);
1052                    }
1053                }
1054                b"itemref" => {
1055                    if let Some(idref) = attr_value(&e, b"idref") {
1056                        spine.push(idref);
1057                    }
1058                }
1059                _ => {}
1060            },
1061            Ok(Event::End(e)) => {
1062                if in_title && local_name(e.name().as_ref()) == b"title" {
1063                    in_title = false;
1064                    let s = title_buf.trim();
1065                    if !s.is_empty() {
1066                        title = Some(s.to_string());
1067                    }
1068                }
1069            }
1070            Ok(Event::Text(t)) => {
1071                if in_title {
1072                    title_buf.push_str(&String::from_utf8_lossy(&t.into_inner()));
1073                }
1074            }
1075            // An entity (`&amp;`) or numeric ref inside the title resolves into
1076            // the accumulated value rather than truncating it.
1077            Ok(Event::GeneralRef(r)) => {
1078                if in_title {
1079                    title_buf.push_str(&resolve_entity_ref(&r));
1080                }
1081            }
1082            // CDATA inside `<dc:title>` is literal title text.
1083            Ok(Event::CData(c)) => {
1084                if in_title {
1085                    title_buf.push_str(&String::from_utf8_lossy(&c.into_inner()));
1086                }
1087            }
1088            Ok(Event::Eof) => break,
1089            Err(e) => {
1090                return Err(ExtractError::Parse {
1091                    format: "epub",
1092                    message: format!("OPF: {e}"),
1093                })
1094            }
1095            _ => {}
1096        }
1097        buf.clear();
1098    }
1099
1100    Ok(OpfParsed {
1101        manifest,
1102        spine,
1103        title,
1104    })
1105}
1106
/// Return the directory part of an OPF path: everything before the last `/`
/// (`"OEBPS/content.opf"` → `"OEBPS"`), or the empty string when the path has
/// no `/` at all (`"content.opf"` → `""`). Manifest hrefs are resolved against
/// this directory inside the zip.
fn opf_base_dir(opf_path: &str) -> String {
    opf_path
        .rsplit_once('/')
        .map_or_else(String::new, |(dir, _file)| dir.to_owned())
}
1116
1117/// Join an OPF base dir with a (possibly `./`-prefixed) manifest href into a zip
1118/// entry name. Forward-slash only — zip paths are always `/`-separated.
1119///
1120/// OPF manifest hrefs are URLs: the EPUB spec requires reserved characters
1121/// (spaces, non-ASCII) to be percent-encoded, but zip entry NAMES are raw. So an
1122/// href `my%20chapter.xhtml` must be percent-decoded to `my chapter.xhtml`
1123/// before it can match the zip entry, or the chapter is silently dropped. We
1124/// percent-decode the href and then normalize `.`/`..` segments so a relative
1125/// href like `../text/ch1.xhtml` resolves against the OPF's directory.
1126fn join_zip_path(base: &str, href: &str) -> String {
1127    let decoded = percent_decode(href);
1128    let combined = if base.is_empty() {
1129        decoded
1130    } else {
1131        format!("{base}/{decoded}")
1132    };
1133    normalize_zip_path(&combined)
1134}
1135
/// Decode `%XX` escapes in a URL path component (`%20` → space, `%C3%A9` → `é`).
///
/// Escapes are decoded to raw bytes first and the whole byte string is then
/// read as UTF-8 (lossily), which is what lets a multi-byte codepoint spread
/// over several escapes come back intact. A `%` that is not followed by two
/// hex digits is kept as a literal `%` — best-effort, never a panic.
fn percent_decode(s: &str) -> String {
    let raw = s.as_bytes();
    let mut decoded = Vec::with_capacity(raw.len());
    let mut pos = 0;
    while let Some(&byte) = raw.get(pos) {
        if byte == b'%' {
            // `get` on the two-byte range is `None` when the escape would run
            // past the end of the input.
            let escaped = raw.get(pos + 1..pos + 3).and_then(|pair| {
                let hi = char::from(pair[0]).to_digit(16)?;
                let lo = char::from(pair[1]).to_digit(16)?;
                Some((hi * 16 + lo) as u8)
            });
            if let Some(value) = escaped {
                decoded.push(value);
                pos += 3;
                continue;
            }
        }
        decoded.push(byte);
        pos += 1;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
1159
/// Collapse a `/`-separated zip path to its canonical entry name.
///
/// Empty segments (from leading, trailing or doubled slashes) and `.` segments
/// are discarded; a `..` removes the segment before it. A `..` with nothing
/// left to remove is simply ignored, so a path can never climb above the
/// archive root. This is what maps an href like `../text/ch1.xhtml`, joined
/// onto the OPF's directory, to the real entry name.
fn normalize_zip_path(path: &str) -> String {
    let mut kept: Vec<&str> = Vec::new();
    let meaningful = path
        .split('/')
        .filter(|segment| !segment.is_empty() && *segment != ".");
    for segment in meaningful {
        if segment == ".." {
            kept.pop();
        } else {
            kept.push(segment);
        }
    }
    kept.join("/")
}
1177
1178// ─────────────────────────────────────────────────────────────────────────────
1179// HTML — html2text + light markdown-decoration cleanup
1180// ─────────────────────────────────────────────────────────────────────────────
1181
1182/// Extract plain text from an `.html` file.
1183fn extract_html(path: &Path) -> Result<Extracted> {
1184    let bytes = std::fs::read(path)?;
1185    let text = html_to_text(&bytes)?;
1186    Ok(Extracted::new(text, Format::Html))
1187}
1188
1189/// Flatten an HTML/XHTML byte stream to clean plain text.
1190///
1191/// Renders with [`PlainContentDecorator`] — `html2text`'s plain renderer driven
1192/// by a decorator that emits **no** link brackets and **no** `#` heading
1193/// markers, while keeping list-item markers (`*` / `N.`). This removes the two
1194/// decorations at the source instead of post-stripping them: the previous
1195/// approach blindly deleted every `[bracketed]` substring and every leading `#`
1196/// run from the rendered text, which also destroyed *literal* content —
1197/// citation markers (`[1]`, `[sic]`), code subscripts (`x[i]`), and ranking
1198/// prose (`#1 in sales`). The renderer knows which `[`/`#` it produced; literal
1199/// brackets and hashes in the source now survive untouched.
1200///
1201/// A very wide wrap width (10_000) is used so paragraphs are not hard-wrapped by
1202/// the renderer; paragraph structure comes from the source's block elements, and
1203/// final layout is canonicalized by [`normalize_text`].
1204fn html_to_text(html: &[u8]) -> Result<String> {
1205    // Bound block-element nesting BEFORE handing the bytes to html2text. The
1206    // layout engine is super-linear in nesting depth (O(depth^2) observed), so a
1207    // tiny crafted file (`<div>`×40_000 …`</div>`×40_000`, ~440 KB) hangs
1208    // extraction for tens of seconds. `sources/` is untrusted, and every other
1209    // adapter bounds its untrusted input (MAX_ZIP_ENTRY_BYTES, MAX_SPREADSHEET_
1210    // CELLS); the HTML path is the lone unbounded one. This is the missing bound.
1211    // A pure byte cap can't distinguish a 440 KB bomb from a 440 KB legitimate
1212    // article, so we bound the structural cause (depth) rather than size. EPUB
1213    // chapters route through here too, so the guard covers them as well.
1214    if let Some(depth) = html_block_nesting_exceeds(html, MAX_HTML_NESTING_DEPTH) {
1215        return Err(ExtractError::Parse {
1216            format: "html",
1217            message: format!(
1218                "HTML block nesting depth exceeds the {MAX_HTML_NESTING_DEPTH} cap (reached {depth}; \
1219                 malformed or hostile input)"
1220            ),
1221        });
1222    }
1223    // Bound table size BEFORE html2text lays the table out. Depth alone misses
1224    // the *width* amplification: a flat `<table><tr><td>x</td>×200_000</tr>` is
1225    // only ~3 deep, so the nesting guard never fires — but html2text lays the row
1226    // out at the 10_000 wrap width and draws full-width U+2500 box rules per row
1227    // boundary, turning a ~2 MB input into multi-GB output and 9 GB+ peak RSS
1228    // (resource-exhaustion DoS on untrusted `sources/` input). The MAX_EXTRACT_
1229    // OUTPUT_BYTES backstop below cannot prevent that spike — html2text has
1230    // already materialized the giant string by the time it's measured. So we
1231    // refuse the layout BEFORE it happens, on the structural cause (table cell
1232    // counts — both single-row width and the overall total), mirroring the
1233    // refuse-before-allocate precedent of MAX_SPREADSHEET_CELLS / MAX_ZIP_ENTRY_
1234    // BYTES. EPUB/xhtml chapters route through here too, so this covers them.
1235    if let Some(bomb) =
1236        html_table_amplification(html, MAX_HTML_TABLE_ROW_CELLS, MAX_HTML_TABLE_CELLS)
1237    {
1238        let message = match bomb {
1239            TableBomb::RowTooWide(width) => format!(
1240                "a table row declares {width} cells, exceeding the \
1241                 {MAX_HTML_TABLE_ROW_CELLS}-cell-per-row cap (malformed or hostile input)"
1242            ),
1243            TableBomb::TooManyCells(total) => format!(
1244                "HTML declares over {total} table cells, exceeding the \
1245                 {MAX_HTML_TABLE_CELLS}-cell cap (malformed or hostile input)"
1246            ),
1247        };
1248        return Err(ExtractError::Parse {
1249            format: "html",
1250            message,
1251        });
1252    }
1253    let text = html2text::config::with_decorator(PlainContentDecorator)
1254        .string_from_read(html, 10_000)
1255        .map_err(|e| ExtractError::Parse {
1256            format: "html",
1257            message: e.to_string(),
1258        })?;
1259    // Hard output backstop. The structural pre-checks above stop the known
1260    // amplifier (wide tables) before the layout pass, but they cannot anticipate
1261    // every renderer pathology; this final byte cap guarantees the HTML path can
1262    // never return (or stream to stdout) more than the same ceiling EPUB enforces,
1263    // independent of *why* the output grew. A real document's flattened text is
1264    // far under 64 MB; only hostile or corrupt input reaches it.
1265    if text.len() > MAX_EXTRACT_OUTPUT_BYTES {
1266        return Err(ExtractError::Parse {
1267            format: "html",
1268            message: format!(
1269                "extracted text exceeds the {MAX_EXTRACT_OUTPUT_BYTES} byte cap \
1270                 (malformed or hostile input)"
1271            ),
1272        });
1273    }
1274    Ok(text)
1275}
1276
/// The deepest element nesting `html_to_text` tolerates. No legitimate
/// document nests containers anywhere near this deep; the cap exists purely to
/// refuse the deeply-nested bomb that makes html2text's layout pass run for a
/// very long time. Set with large headroom so it can only fire on pathological
/// input.
///
/// Measured by `html_block_nesting_exceeds`, whose scanner counts every open
/// tag that is neither a void element nor self-closing — inline elements
/// included, not only block-level ones. The comparison is a strict `>`, so a
/// document nested exactly this deep is still accepted.
const MAX_HTML_NESTING_DEPTH: usize = 4_096;
1282
/// Ceiling on the number of cells (`<td>`/`<th>`) in any SINGLE table row before
/// extraction refuses the document. This is the primary structural guard against
/// the wide-table amplification DoS: html2text lays a table out at the 10_000
/// wrap width and draws full-width U+2500 box rules sized to the row, so a flat
/// `<td>`×N single row is the worst case — N=200_000 in a ~2 MB file balloons to
/// multi-GB output and 9 GB+ peak RSS. *Row width* is what drives the spike (a
/// tall narrow table of the same total cell count costs an order of magnitude
/// less), so we bound it directly and BEFORE html2text runs — the same
/// refuse-before-allocate precedent as MAX_SPREADSHEET_CELLS / MAX_ZIP_ENTRY_BYTES.
///
/// 4_096 columns is far beyond any real document's table width — a spreadsheet
/// export with thousands of columns is already unreadable as flattened text —
/// yet keeps the worst-case (all in one row) layout under ~16 MB peak, measured.
///
/// `html_to_text` passes this to `html_table_amplification` as the per-row
/// limit; a violation comes back as `TableBomb::RowTooWide` and is reported as
/// an `ExtractError::Parse` (format `"html"`) that names this cap.
const MAX_HTML_TABLE_ROW_CELLS: usize = 4_096;
1297
/// Ceiling on the TOTAL number of table cells (`<td>`/`<th>`) across the whole
/// document. The backstop to [`MAX_HTML_TABLE_ROW_CELLS`] for the *tall* shape:
/// even narrow rows, if there are enough of them, grow html2text's layout memory
/// roughly linearly in total cells (independent of output size). The row-width
/// cap alone wouldn't bound a million-row × few-column table, so this caps the
/// aggregate too. Checked in the same single scan, before html2text runs.
///
/// 200_000 cells is far above any real tabular document (a 20_000-row × 10-column
/// table) yet keeps the worst measured tall-table peak under ~450 MB. Set
/// generously so it can only fire on pathological input.
///
/// `html_to_text` passes this to `html_table_amplification` as the whole-document
/// limit; a violation comes back as `TableBomb::TooManyCells` and is reported as
/// an `ExtractError::Parse` (format `"html"`) that names this cap.
const MAX_HTML_TABLE_CELLS: usize = 200_000;
1309
/// HTML5 void elements — they have no closing tag, so they must NOT increment
/// the nesting depth (a document of many sibling `<br>`/`<img>` is flat, not
/// deep). Kept lowercase; the scan lowercases the tag name before matching.
///
/// Consulted by `html_block_nesting_exceeds` for every opening tag it sees: a
/// tag whose name is in this list never raises the depth counter.
const HTML_VOID_ELEMENTS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source",
    "track", "wbr",
];
1317
1318/// Scan an HTML byte stream once and return `Some(depth)` if open-tag nesting
1319/// ever exceeds `limit`, else `None`. This is a deliberately crude, allocation-
1320/// free tag scanner — NOT a parser. It tracks only nesting *depth* to bound
1321/// html2text's super-linear layout cost; correctness of the depth count past the
1322/// limit does not matter (we only care whether it is exceeded). Closing tags
1323/// decrement (saturating at 0), void/self-closing tags and comments/doctype/PI
1324/// are ignored, and a `<` not followed by a tag-ish character is treated as
1325/// literal text rather than a tag open (so `a < b` in prose does not inflate it).
1326fn html_block_nesting_exceeds(html: &[u8], limit: usize) -> Option<usize> {
1327    let mut depth: usize = 0;
1328    let mut i = 0usize;
1329    let n = html.len();
1330    while i < n {
1331        if html[i] != b'<' {
1332            i += 1;
1333            continue;
1334        }
1335        // Look at the byte after `<` to classify the tag.
1336        let Some(&c) = html.get(i + 1) else { break };
1337        if c == b'!' || c == b'?' {
1338            // Comment, doctype, CDATA, or processing instruction — skip to `>`.
1339            i = memchr_gt(html, i + 1);
1340            continue;
1341        }
1342        if c == b'/' {
1343            depth = depth.saturating_sub(1);
1344            i = memchr_gt(html, i + 1);
1345            continue;
1346        }
1347        if !c.is_ascii_alphabetic() {
1348            // A stray `<` in text (`a < b`) — not a tag open.
1349            i += 1;
1350            continue;
1351        }
1352        // Find the tag's end `>` and whether it self-closes (`... />`).
1353        // `memchr_gt` returns the index ONE PAST the `>`, so the `>` byte is at
1354        // `end - 1` and the self-closing `/` (`<div/>`, `<div />`) is at `end - 2`.
1355        // (Reading `end - 1` here always saw the `>`, so the check was dead and
1356        // every self-closing NON-void element was miscounted as an open tag —
1357        // tripping the depth cap on a flat, valid document.)
1358        let end = memchr_gt(html, i + 1);
1359        let self_closing = end >= 2 && html.get(end - 2) == Some(&b'/');
1360        // Extract the tag name (letters/digits after `<`).
1361        let name_end = (i + 1..end.min(n))
1362            .find(|&j| !html[j].is_ascii_alphanumeric())
1363            .unwrap_or(end.min(n));
1364        let name = html[i + 1..name_end].to_ascii_lowercase();
1365        let is_void = std::str::from_utf8(&name)
1366            .map(|s| HTML_VOID_ELEMENTS.contains(&s))
1367            .unwrap_or(false);
1368        if !self_closing && !is_void {
1369            depth += 1;
1370            if depth > limit {
1371                return Some(depth);
1372            }
1373        }
1374        i = end;
1375    }
1376    None
1377}
1378
/// Why a table-cell pre-check refused an HTML document, with the offending count.
/// Returned by [`html_table_amplification`] so the caller can name the exact
/// structural cause (row width vs. total cells) in the typed error.
///
/// Derives `Debug`/`PartialEq` so a verdict can be printed in diagnostics and
/// compared directly in assertions; it is a small `Copy` value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TableBomb {
    /// A single row holds more than [`MAX_HTML_TABLE_ROW_CELLS`] cells — the wide
    /// shape that html2text amplifies into multi-GB output. Carries the row width.
    RowTooWide(usize),
    /// The document holds more than [`MAX_HTML_TABLE_CELLS`] cells in total — the
    /// tall shape whose aggregate grows html2text's layout memory. Carries the
    /// total count (at the moment the cap was crossed).
    TooManyCells(usize),
}
1391
1392/// Scan an HTML byte stream once and return `Some(TableBomb)` if its table cells
1393/// would amplify html2text's layout past a safe bound, else `None`. Two bounds
1394/// are checked in the single pass: the max cells in any one `<tr>` (the *width*
1395/// amplifier, the dominant cost) against `row_limit`, and the total cell count
1396/// (the *tall* aggregate) against `total_limit`. Whichever trips first wins.
1397///
1398/// Like [`html_block_nesting_exceeds`] this is a crude, allocation-free tag
1399/// scanner — NOT a parser. It counts cell *opens* (`<td>`/`<th>`); closing tags
1400/// and self-closing forms add no cell. A `<tr>` open resets the per-row counter.
1401/// Comments/doctype/PI are skipped (so a `<td>` inside a comment isn't counted)
1402/// and a stray `<` in prose is ignored. The exact tally past a limit doesn't
1403/// matter, only whether the limit is crossed — so we can early-return.
1404fn html_table_amplification(
1405    html: &[u8],
1406    row_limit: usize,
1407    total_limit: usize,
1408) -> Option<TableBomb> {
1409    let mut total: usize = 0;
1410    let mut row_cells: usize = 0;
1411    let mut i = 0usize;
1412    let n = html.len();
1413    while i < n {
1414        if html[i] != b'<' {
1415            i += 1;
1416            continue;
1417        }
1418        let Some(&c) = html.get(i + 1) else { break };
1419        if c == b'!' || c == b'?' {
1420            // Comment, doctype, CDATA, or processing instruction — skip to `>`.
1421            i = memchr_gt(html, i + 1);
1422            continue;
1423        }
1424        if c == b'/' {
1425            // Closing tag — not a new cell.
1426            i = memchr_gt(html, i + 1);
1427            continue;
1428        }
1429        if !c.is_ascii_alphabetic() {
1430            // A stray `<` in text (`a < b`) — not a tag open.
1431            i += 1;
1432            continue;
1433        }
1434        let end = memchr_gt(html, i + 1);
1435        // Tag name = the run of letters/digits right after `<`.
1436        let name_end = (i + 1..end.min(n))
1437            .find(|&j| !html[j].is_ascii_alphanumeric())
1438            .unwrap_or(end.min(n));
1439        let name = html[i + 1..name_end].to_ascii_lowercase();
1440        if name == b"tr" {
1441            // A new row resets the per-row width tally. (A `<td>` outside any row
1442            // still counts toward both totals; resetting only on `<tr>` is the
1443            // conservative choice — it can never under-count a real row's width.)
1444            row_cells = 0;
1445        } else if name == b"td" || name == b"th" {
1446            total += 1;
1447            row_cells += 1;
1448            if row_cells > row_limit {
1449                return Some(TableBomb::RowTooWide(row_cells));
1450            }
1451            if total > total_limit {
1452                return Some(TableBomb::TooManyCells(total));
1453            }
1454        }
1455        i = end;
1456    }
1457    None
1458}
1459
/// Return the index one past the first `>` found at or after `from`, or
/// `hay.len()` when there is none (including when `from` is already past the
/// end). Because the result is always beyond the byte at `from` or equal to the
/// length, a scanner that resumes from the returned index keeps moving forward.
fn memchr_gt(hay: &[u8], from: usize) -> usize {
    hay.iter()
        .skip(from)
        .position(|&byte| byte == b'>')
        .map_or(hay.len(), |offset| from + offset + 1)
}
1472
1473/// A `html2text` decorator that flattens HTML to plain text WITHOUT emitting the
1474/// markup that would otherwise have to be post-stripped: no `[`/`]` around link
1475/// text, no `#` heading prefix, no `^{…}` superscript braces. List-item markers
1476/// (`* ` for unordered, `N. ` for ordered) ARE emitted — they are content-
1477/// faithful and match the corpus convention. Quote prefixes are kept as in the
1478/// stock plain decorator. This is the fix for the literal-content corruption the
1479/// old `strip_markdown_decorations`/`unwrap_brackets` post-pass caused.
1480#[derive(Clone, Debug)]
1481struct PlainContentDecorator;
1482
1483impl html2text::render::TextDecorator for PlainContentDecorator {
1484    type Annotation = ();
1485
1486    fn decorate_link_start(&mut self, _url: &str) -> (String, Self::Annotation) {
1487        (String::new(), ())
1488    }
1489    fn decorate_link_end(&mut self) -> String {
1490        String::new()
1491    }
1492    fn decorate_em_start(&self) -> (String, Self::Annotation) {
1493        (String::new(), ())
1494    }
1495    fn decorate_em_end(&self) -> String {
1496        String::new()
1497    }
1498    fn decorate_strong_start(&self) -> (String, Self::Annotation) {
1499        (String::new(), ())
1500    }
1501    fn decorate_strong_end(&self) -> String {
1502        String::new()
1503    }
1504    fn decorate_strikeout_start(&self) -> (String, Self::Annotation) {
1505        (String::new(), ())
1506    }
1507    fn decorate_strikeout_end(&self) -> String {
1508        String::new()
1509    }
1510    fn decorate_code_start(&self) -> (String, Self::Annotation) {
1511        (String::new(), ())
1512    }
1513    fn decorate_code_end(&self) -> String {
1514        String::new()
1515    }
1516    fn decorate_preformat_first(&self) -> Self::Annotation {}
1517    fn decorate_preformat_cont(&self) -> Self::Annotation {}
1518    fn decorate_image(&mut self, _src: &str, title: &str) -> (String, Self::Annotation) {
1519        // Alt/title text only — no surrounding brackets (the stock plain
1520        // decorator wraps it in `[...]`, which would read as literal content).
1521        (title.to_string(), ())
1522    }
1523    fn header_prefix(&self, _level: usize) -> String {
1524        // No `#` heading marker — heading text reads as plain prose.
1525        String::new()
1526    }
1527    fn quote_prefix(&self) -> String {
1528        "> ".to_string()
1529    }
1530    fn unordered_item_prefix(&self) -> String {
1531        "* ".to_string()
1532    }
1533    fn ordered_item_prefix(&self, i: i64) -> String {
1534        format!("{i}. ")
1535    }
1536    fn decorate_superscript_start(&self) -> (String, Self::Annotation) {
1537        // Plain text: no `^{…}` braces (which would corrupt literal content).
1538        (String::new(), ())
1539    }
1540    fn decorate_superscript_end(&self) -> String {
1541        String::new()
1542    }
1543    fn make_subblock_decorator(&self) -> Self {
1544        PlainContentDecorator
1545    }
1546}
1547
1548/// Strip the residual markdown decorations `html2text`'s plain renderer emits:
1549/// leading run of `#` (ATX heading markers) at the start of a line, and `[...]`
1550/// brackets around link/anchor text (the reference-style `[n]` suffix is already
1551/// gone under `plain_no_decorate`). Bullet (`*`) and ordered (`N.`) markers are
1552/// left intact — they are content, not decoration.
1553///
1554/// No longer used by [`html_to_text`] (the [`PlainContentDecorator`] now removes
1555/// these decorations at the source so literal `[brackets]`/`#hashes` survive);
1556/// retained only for its unit test documenting the old renderer's behavior.
1557#[allow(dead_code)]
1558fn strip_markdown_decorations(text: &str) -> String {
1559    let mut out = String::with_capacity(text.len());
1560    for line in text.lines() {
1561        // Strip a leading "#"-run + the single space after it (ATX heading).
1562        let trimmed = line.trim_start();
1563        let after_hashes = trimmed.trim_start_matches('#');
1564        let line = if after_hashes.len() != trimmed.len() {
1565            // It was a heading line: keep indentation-free heading text.
1566            after_hashes.trim_start()
1567        } else {
1568            line
1569        };
1570        out.push_str(&unwrap_brackets(line));
1571        out.push('\n');
1572    }
1573    out
1574}
1575
/// Drop the brackets from every `[inner]` span, keeping `inner` (single pass,
/// no nesting: a `[` pairs with the first `]` that follows it). `html2text`'s
/// plain renderer wraps link/anchor text in single brackets; unwrapping yields
/// the bare text. A `[` with no later `]` is emitted literally, as is any `]`
/// that has no `[` before it.
///
/// No longer used by [`html_to_text`] (see [`strip_markdown_decorations`]);
/// retained only for its unit test.
#[allow(dead_code)]
fn unwrap_brackets(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    let mut rest = line;
    while let Some(open) = rest.find('[') {
        // Text before the bracket is copied through unchanged.
        out.push_str(&rest[..open]);
        let after = &rest[open + 1..];
        match after.find(']') {
            Some(close) => {
                // Matched pair: keep the inner text, drop both brackets.
                out.push_str(&after[..close]);
                rest = &after[close + 1..];
            }
            None => {
                // Unmatched `[`: keep it and everything after it verbatim.
                out.push('[');
                out.push_str(after);
                rest = "";
            }
        }
    }
    out.push_str(rest);
    out
}
1613
1614// ─────────────────────────────────────────────────────────────────────────────
1615// Shared zip helpers (docx + epub)
1616// ─────────────────────────────────────────────────────────────────────────────
1617
1618/// Open a zip archive from a reader, mapping any failure to a typed
1619/// [`ExtractError::Parse`] tagged with the calling format.
1620fn open_zip<R: Read + std::io::Seek>(
1621    reader: R,
1622    format: &'static str,
1623) -> Result<zip::ZipArchive<R>> {
1624    zip::ZipArchive::new(reader).map_err(|e| ExtractError::Parse {
1625        format,
1626        message: format!("not a valid zip container: {e}"),
1627    })
1628}
1629
/// Cap on a single decompressed zip entry, in bytes (256 MiB). docx/epub
/// members are XML text — a member that inflates past this ceiling is a
/// decompression bomb or corruption, not real evidence. `sources/` is untrusted
/// input, so bound the read rather than let `read_to_end` follow a hostile
/// DEFLATE stream until OOM.
const MAX_ZIP_ENTRY_BYTES: u64 = 256 * 1024 * 1024;
1635
1636/// Read a single zip entry to a UTF-8 string, bounded by [`MAX_ZIP_ENTRY_BYTES`]
1637/// so a zip-bomb member cannot exhaust memory. A missing entry, an over-cap
1638/// entry, or a read failure is a typed [`ExtractError::Parse`]; invalid UTF-8 is
1639/// lossily decoded (OOXML / XHTML are declared UTF-8, but we never panic on a
1640/// stray byte).
1641fn read_zip_entry<R: Read + std::io::Seek>(
1642    archive: &mut zip::ZipArchive<R>,
1643    name: &str,
1644    format: &'static str,
1645) -> Result<String> {
1646    let entry = archive.by_name(name).map_err(|e| ExtractError::Parse {
1647        format,
1648        message: format!("missing zip entry {name:?}: {e}"),
1649    })?;
1650    // Reject up front when the central directory declares an over-cap size...
1651    let declared = entry.size();
1652    if declared > MAX_ZIP_ENTRY_BYTES {
1653        return Err(ExtractError::Parse {
1654            format,
1655            message: format!(
1656                "zip entry {name:?} declares {declared} bytes, over the {MAX_ZIP_ENTRY_BYTES}-byte cap"
1657            ),
1658        });
1659    }
1660    // ...and bound the actual decompressed read so a lying header (a bomb that
1661    // understates its uncompressed size) still cannot allocate past the cap.
1662    let mut bytes = Vec::new();
1663    entry
1664        .take(MAX_ZIP_ENTRY_BYTES + 1)
1665        .read_to_end(&mut bytes)
1666        .map_err(|e| ExtractError::Parse {
1667            format,
1668            message: format!("reading {name:?}: {e}"),
1669        })?;
1670    if bytes.len() as u64 > MAX_ZIP_ENTRY_BYTES {
1671        return Err(ExtractError::Parse {
1672            format,
1673            message: format!(
1674                "zip entry {name:?} exceeds the {MAX_ZIP_ENTRY_BYTES}-byte cap (decompression bomb?)"
1675            ),
1676        });
1677    }
1678    Ok(String::from_utf8_lossy(&bytes).into_owned())
1679}
1680
1681/// Look up a start/empty element's attribute value by local name, returning it
1682/// unescaped as an owned `String`. Prefix-agnostic on the attribute key.
1683fn attr_value(elem: &quick_xml::events::BytesStart<'_>, key: &[u8]) -> Option<String> {
1684    elem.attributes().flatten().find_map(|attr| {
1685        if local_name(attr.key.as_ref()) == key {
1686            // `unescape_value` returns an XML-unescaped `Cow<str>` — exactly the
1687            // owned attribute text we want. It is soft-deprecated in quick-xml
1688            // 0.40 in favor of `normalized_value(XmlVersion)`, whose extra
1689            // version arg and byte-Cow return buy us nothing here; the simple
1690            // form is correct for the UTF-8 OOXML/OPF attributes we read.
1691            #[allow(deprecated)]
1692            attr.unescape_value().ok().map(|cow| cow.into_owned())
1693        } else {
1694            None
1695        }
1696    })
1697}
1698
1699#[cfg(test)]
1700mod tests {
1701    use super::*;
1702    use std::path::PathBuf;
1703
1704    /// Absolute path to a corpus-c-formats fixture under `sources/docs/`.
1705    fn fixture(name: &str) -> PathBuf {
1706        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
1707            .join("../../tests/corpora/corpus-c-formats/sources/docs")
1708            .join(name)
1709    }
1710
1711    /// Read the known-good `.txt` sibling of a fixture.
1712    fn expected(name: &str) -> String {
1713        std::fs::read_to_string(fixture(&format!("{name}.txt"))).unwrap()
1714    }
1715
1716    /// Token-level normalization: collapse every run of whitespace (incl.
1717    /// newlines) to one space and trim. This is the corpus's recommended,
1718    /// layout-agnostic comparison ("same words, same order").
1719    fn tokens(s: &str) -> String {
1720        s.split_whitespace().collect::<Vec<_>>().join(" ")
1721    }
1722
1723    /// The sorted set of non-blank, token-normalized lines — order-agnostic
1724    /// content comparison (used where extractor reading-order legitimately
1725    /// differs, e.g. multi-column PDF).
1726    fn line_set(s: &str) -> Vec<String> {
1727        let mut v: Vec<String> = s.lines().map(tokens).filter(|l| !l.is_empty()).collect();
1728        v.sort();
1729        v
1730    }
1731
1732    // ── untrusted-input guards (adversarial review) ──────────────────────────
1733
1734    /// A crafted spreadsheet date cell carries an arbitrary f64 serial. An
1735    /// out-of-range serial must NOT panic (debug `attempt to add with overflow`)
1736    /// and must NOT fabricate a calendar date (release `1e308` → `1899-12-29`);
1737    /// it keeps the raw serial, exactly like the duration fallback.
1738    #[test]
1739    fn excel_datetime_out_of_range_serial_stays_raw_and_never_panics() {
1740        use calamine::{ExcelDateTime, ExcelDateTimeType};
1741        // In-range serial → a real calendar date (contains a `-`).
1742        let in_range = render_excel_datetime(&ExcelDateTime::new(
1743            46_188.0,
1744            ExcelDateTimeType::DateTime,
1745            false,
1746        ));
1747        assert!(
1748            in_range.contains('-'),
1749            "an in-range serial should render a calendar date, got {in_range}"
1750        );
1751        // Out-of-range / hostile serials keep the raw serial string, no panic.
1752        for serial in [1e308_f64, 3_000_000.0, 9e18, -5.0] {
1753            let out = render_excel_datetime(&ExcelDateTime::new(
1754                serial,
1755                ExcelDateTimeType::DateTime,
1756                false,
1757            ));
1758            assert_eq!(
1759                out,
1760                serial.to_string(),
1761                "out-of-range serial {serial} must stay raw, got {out}"
1762            );
1763        }
1764    }
1765
1766    /// The HTML adapter's block-nesting guard refuses a deeply-nested bomb (the
1767    /// O(depth^2) html2text blowup) while passing flat documents — including ones
1768    /// with tens of thousands of sibling VOID elements (which must not count as
1769    /// depth) and prose containing a literal `<`.
1770    #[test]
1771    fn html_nesting_guard_refuses_deep_bomb_passes_flat() {
1772        let deep = format!(
1773            "<html><body>{}x{}</body></html>",
1774            "<div>".repeat(8_000),
1775            "</div>".repeat(8_000)
1776        );
1777        assert!(
1778            html_block_nesting_exceeds(deep.as_bytes(), MAX_HTML_NESTING_DEPTH).is_some(),
1779            "an 8000-deep nest must trip the guard"
1780        );
1781        assert!(
1782            html_to_text(deep.as_bytes()).is_err(),
1783            "html_to_text must refuse the bomb (typed error), not hang"
1784        );
1785
1786        let flat = format!("<html><body>{}</body></html>", "<br>".repeat(50_000));
1787        assert!(
1788            html_block_nesting_exceeds(flat.as_bytes(), MAX_HTML_NESTING_DEPTH).is_none(),
1789            "50k sibling void <br> are flat, not deep — must pass"
1790        );
1791
1792        let normal =
1793            "<html><body><div><p>hi <a href=\"u\">link</a>; a < b in prose</p></div></body></html>";
1794        assert!(
1795            html_block_nesting_exceeds(normal.as_bytes(), MAX_HTML_NESTING_DEPTH).is_none(),
1796            "ordinary nesting (and a stray `<`) must pass"
1797        );
1798        assert!(
1799            html_to_text(normal.as_bytes()).is_ok(),
1800            "a normal document must still flatten fine"
1801        );
1802    }
1803
1804    #[test]
1805    fn regression_html_self_closing_non_void_is_flat_not_deep() {
1806        // Adversarial review #17: a self-closing NON-void element (`<div/>`,
1807        // `<section />`) is flat, not a nesting increment. The off-by-one read the
1808        // `>` byte (always present) instead of the `/` (at end-2), so the
1809        // self-closing check was dead and N such elements miscounted as depth N,
1810        // falsely tripping the cap on a valid, flat document (XHTML/EPUB chapters
1811        // commonly self-close).
1812        let flat = "<div/>".repeat(MAX_HTML_NESTING_DEPTH + 1000);
1813        assert!(
1814            html_block_nesting_exceeds(flat.as_bytes(), MAX_HTML_NESTING_DEPTH).is_none(),
1815            "a flat run of self-closing <div/> must not trip the nesting cap"
1816        );
1817        let spaced = "<section />".repeat(MAX_HTML_NESTING_DEPTH + 1000);
1818        assert!(
1819            html_block_nesting_exceeds(spaced.as_bytes(), MAX_HTML_NESTING_DEPTH).is_none(),
1820            "`<section />` (space before slash) is self-closing too"
1821        );
1822        // Defense intact: genuine deep nesting of the SAME tag still trips it.
1823        let deep = "<div>".repeat(MAX_HTML_NESTING_DEPTH + 1);
1824        assert!(
1825            html_block_nesting_exceeds(deep.as_bytes(), MAX_HTML_NESTING_DEPTH).is_some(),
1826            "real deep nesting must still trip the cap"
1827        );
1828    }
1829
1830    /// The table scanner counts `<td>`/`<th>` opens, ignores closing and
1831    /// commented-out cells, resets the per-row tally on `<tr>`, and reports the
1832    /// right bomb variant (row-too-wide vs. too-many-cells). Small-limit probes
1833    /// keep the test fast.
1834    #[test]
1835    fn html_table_scanner_counts_cells_and_classifies_shape() {
1836        // 5 real cells (td + th, case-insensitive) in ONE row; the commented cell
1837        // and the closing tags must NOT be counted.
1838        let one_row = b"<table><tr><td>a</td><TH>b</TH><td>c</td>\
1839<!-- <td>x</td> --><td>d</td><td>e</td></tr></table>";
1840        // Row-width cap of 4 trips on the 5-wide row.
1841        assert!(
1842            matches!(
1843                html_table_amplification(one_row, 4, 1000),
1844                Some(TableBomb::RowTooWide(w)) if w == 5
1845            ),
1846            "a 5-wide row must trip the row-width cap as RowTooWide(5)"
1847        );
1848        // Generous row cap, generous total → no bomb (commented cell not counted).
1849        assert!(
1850            html_table_amplification(one_row, 100, 100).is_none(),
1851            "5 cells under both caps must not fire"
1852        );
1853
1854        // Many narrow rows: width stays at 1, total accumulates → TooManyCells.
1855        let tall: String = "<table>".to_string() + &"<tr><td>x</td></tr>".repeat(20) + "</table>";
1856        assert!(
1857            matches!(
1858                html_table_amplification(tall.as_bytes(), 100, 10),
1859                Some(TableBomb::TooManyCells(t)) if t == 11
1860            ),
1861            "20 single-cell rows must trip the total cap at 11 (width stays under)"
1862        );
1863
1864        // A document with no tables never trips it.
1865        assert!(
1866            html_table_amplification(b"<p>plain prose, a < b</p>", 0, 0).is_none(),
1867            "no table cells means the scanner never fires"
1868        );
1869    }
1870
1871    /// The wide-table amplification bomb (HIGH DoS): a tiny flat `<td>`×N row
1872    /// makes html2text emit gigantic U+2500 box rules (multi-GB output, 9 GB+
1873    /// RSS) from a ~MB input. The row-width pre-check refuses it BEFORE the
1874    /// layout pass — fast, typed, never materializing the giant string — while a
1875    /// normal small table still extracts intact (no regression).
1876    #[test]
1877    fn regression_html_wide_table_bomb_is_refused_small_table_ok() {
1878        // Just over the per-row width cap in a single row — the exact shape of the
1879        // real exploit (a flat `<td>`×N row), kept small enough that the test is
1880        // fast precisely BECAUSE the pre-check refuses before html2text runs.
1881        let cells = MAX_HTML_TABLE_ROW_CELLS + 10;
1882        let bomb = format!(
1883            "<html><body><table><tr>{}</tr></table></body></html>",
1884            "<td>x</td>".repeat(cells)
1885        );
1886        // The pre-check fires; html2text is never reached, so no giant string is
1887        // materialized (the test would OOM/hang otherwise).
1888        assert!(
1889            matches!(
1890                html_table_amplification(
1891                    bomb.as_bytes(),
1892                    MAX_HTML_TABLE_ROW_CELLS,
1893                    MAX_HTML_TABLE_CELLS
1894                ),
1895                Some(TableBomb::RowTooWide(_))
1896            ),
1897            "an over-cap wide row must trip the scanner as RowTooWide"
1898        );
1899        let err = html_to_text(bomb.as_bytes()).unwrap_err();
1900        assert!(
1901            matches!(&err, ExtractError::Parse { format, message }
1902                if *format == "html" && message.contains("cell-per-row")),
1903            "the wide-table bomb must be refused with a typed row-width error; got {err:?}"
1904        );
1905        assert_eq!(err.code(), "EXTRACT_PARSE_ERROR");
1906
1907        // A tall table whose TOTAL cells exceed the aggregate cap is also refused
1908        // (narrow rows, but too many of them) — bounding the other shape.
1909        let rows = MAX_HTML_TABLE_CELLS / 2 + 5; // 2 cells/row, just over the total cap
1910        let tall = format!(
1911            "<html><body><table>{}</table></body></html>",
1912            "<tr><td>a</td><td>b</td></tr>".repeat(rows)
1913        );
1914        let err = html_to_text(tall.as_bytes()).unwrap_err();
1915        assert!(
1916            matches!(&err, ExtractError::Parse { message, .. } if message.contains("table cells")),
1917            "an over-cap tall table must be refused with the total-cell error; got {err:?}"
1918        );
1919
1920        // A normal small table still extracts its cell content cleanly.
1921        let ok = "<html><body><table>\
1922<tr><td>Name</td><td>Amount</td></tr>\
1923<tr><td>Acme</td><td>1200</td></tr></table></body></html>";
1924        let out = html_to_text(ok.as_bytes()).unwrap();
1925        for token in ["Name", "Amount", "Acme", "1200"] {
1926            assert!(
1927                out.contains(token),
1928                "small table must keep {token:?}, got {out:?}"
1929            );
1930        }
1931        // And the output is far under the byte cap.
1932        assert!(
1933            out.len() < MAX_EXTRACT_OUTPUT_BYTES,
1934            "a 2x2 table must not approach the output cap (got {} bytes)",
1935            out.len()
1936        );
1937    }
1938
1939    /// Build an `.epub` whose single chapter body is `chapter_body` (spliced
1940    /// inside `<body>…</body>`). Lets a test exercise a hostile chapter shape
1941    /// (e.g. a wide table) through the real EPUB → html_to_text path.
1942    fn write_epub_with_chapter_body(dest: &Path, chapter_body: &str) {
1943        use std::io::Write;
1944        let container = "<?xml version=\"1.0\"?>\
1945<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\
1946<rootfiles><rootfile full-path=\"OEBPS/content.opf\" \
1947media-type=\"application/oebps-package+xml\"/></rootfiles></container>";
1948        let opf = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
1949<package xmlns=\"http://www.idpf.org/2007/opf\" version=\"3.0\" unique-identifier=\"id\">\
1950<metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\"><dc:title>Wide</dc:title></metadata>\
1951<manifest><item id=\"c1\" href=\"chapter.xhtml\" media-type=\"application/xhtml+xml\"/></manifest>\
1952<spine><itemref idref=\"c1\"/></spine></package>";
1953        let chapter = format!(
1954            "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
1955<html xmlns=\"http://www.w3.org/1999/xhtml\"><body>{chapter_body}</body></html>"
1956        );
1957        let file = std::fs::File::create(dest).unwrap();
1958        let mut writer = zip::ZipWriter::new(file);
1959        let stored = zip::write::SimpleFileOptions::default()
1960            .compression_method(zip::CompressionMethod::Stored);
1961        writer.start_file("mimetype", stored).unwrap();
1962        writer.write_all(b"application/epub+zip").unwrap();
1963        writer.start_file("META-INF/container.xml", stored).unwrap();
1964        writer.write_all(container.as_bytes()).unwrap();
1965        writer.start_file("OEBPS/content.opf", stored).unwrap();
1966        writer.write_all(opf.as_bytes()).unwrap();
1967        writer.start_file("OEBPS/chapter.xhtml", stored).unwrap();
1968        writer.write_all(chapter.as_bytes()).unwrap();
1969        writer.finish().unwrap();
1970    }
1971
1972    /// An EPUB chapter that is itself a wide-table bomb routes through
1973    /// `html_to_text` and must be refused with the same typed table-cell error,
1974    /// before any giant chapter string is materialized — so EPUB peak memory
1975    /// stays bounded per chapter, not just at the final concatenation check.
1976    #[test]
1977    fn regression_epub_wide_table_chapter_is_refused() {
1978        let tmp = tempfile::TempDir::new().unwrap();
1979        let bomb = tmp.path().join("wide.epub");
1980        let body = format!(
1981            "<table><tr>{}</tr></table>",
1982            "<td>x</td>".repeat(MAX_HTML_TABLE_ROW_CELLS + 10)
1983        );
1984        write_epub_with_chapter_body(&bomb, &body);
1985        let err = extract(&bomb).unwrap_err();
1986        assert!(
1987            matches!(&err, ExtractError::Parse { message, .. } if message.contains("cell-per-row")),
1988            "a wide-table EPUB chapter must be refused with the row-width error; got {err:?}"
1989        );
1990
1991        // A normal EPUB chapter with a small table still extracts.
1992        let ok = tmp.path().join("ok.epub");
1993        write_epub_with_chapter_body(
1994            &ok,
1995            "<p>Chapter one.</p><table><tr><td>Cell A</td><td>Cell B</td></tr></table>",
1996        );
1997        let got = extract(&ok).unwrap();
1998        assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
1999        assert!(
2000            got.text.contains("Cell A") && got.text.contains("Cell B"),
2001            "small EPUB table must extract, got {:?}",
2002            got.text
2003        );
2004    }
2005
2006    /// A `.docx` whose `word/document.xml` expands to an enormous run of `<w:t>`
2007    /// text must be refused by the output-byte cap during accumulation (docx
2008    /// parity with HTML/EPUB), while a normal docx extracts unchanged.
2009    #[test]
2010    fn regression_docx_oversized_text_is_bounded() {
2011        let tmp = tempfile::TempDir::new().unwrap();
2012        let bomb = tmp.path().join("huge.docx");
2013        // One paragraph whose single run holds > MAX_EXTRACT_OUTPUT_BYTES of text.
2014        // (Built as one big string so the body XML itself is the amplified input;
2015        // a real exploit relies on zip deflate to ship this compactly.)
2016        let big = "A".repeat(MAX_EXTRACT_OUTPUT_BYTES + 1024);
2017        let body = format!("<w:p><w:r><w:t>{big}</w:t></w:r></w:p>");
2018        write_docx(&bomb, &body);
2019        let err = extract(&bomb).unwrap_err();
2020        assert!(
2021            matches!(&err, ExtractError::Parse { format, message }
2022                if *format == "docx" && message.contains("byte cap")),
2023            "an oversized docx must be refused with the output-cap error; got {err:?}"
2024        );
2025
2026        // A normal docx still extracts intact (no regression).
2027        let ok = tmp.path().join("ok.docx");
2028        write_docx(
2029            &ok,
2030            "<w:p><w:r><w:t>Quarterly report total 1200.</w:t></w:r></w:p>",
2031        );
2032        let got = extract(&ok).unwrap();
2033        assert_eq!(got.text, "Quarterly report total 1200.\n");
2034    }
2035
2036    // ── format detection ────────────────────────────────────────────────────
2037
2038    #[test]
2039    fn detects_format_by_extension_case_insensitively() {
2040        assert_eq!(Format::from_path(Path::new("a.pdf")), Some(Format::Pdf));
2041        assert_eq!(Format::from_path(Path::new("a.PDF")), Some(Format::Pdf));
2042        assert_eq!(Format::from_path(Path::new("a.docx")), Some(Format::Docx));
2043        assert_eq!(
2044            Format::from_path(Path::new("a.xlsx")),
2045            Some(Format::Spreadsheet)
2046        );
2047        assert_eq!(
2048            Format::from_path(Path::new("a.ods")),
2049            Some(Format::Spreadsheet)
2050        );
2051        assert_eq!(Format::from_path(Path::new("a.epub")), Some(Format::Epub));
2052        assert_eq!(Format::from_path(Path::new("a.html")), Some(Format::Html));
2053        assert_eq!(Format::from_path(Path::new("a.htm")), Some(Format::Html));
2054        assert_eq!(Format::from_path(Path::new("a.txt")), None);
2055        assert_eq!(Format::from_path(Path::new("noext")), None);
2056    }
2057
2058    #[test]
2059    fn unsupported_extension_is_typed_error() {
2060        let err = extract(Path::new("/tmp/whatever.txt")).unwrap_err();
2061        assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e == "txt"));
2062        assert_eq!(err.code(), "UNSUPPORTED_FORMAT");
2063    }
2064
2065    #[test]
2066    fn missing_extension_is_unsupported() {
2067        let err = extract(Path::new("/tmp/noext")).unwrap_err();
2068        assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e.is_empty()));
2069    }
2070
2071    // ── normalization ─────────────────────────────────────────────────────────
2072
2073    #[test]
2074    fn normalize_collapses_blanks_and_trims() {
2075        let raw = "\r\n\r\nHeading\r\n\r\n\r\n\r\nBody line   \r\n\r\n";
2076        assert_eq!(normalize_text(raw), "Heading\n\nBody line\n");
2077    }
2078
2079    #[test]
2080    fn normalize_empty_stays_empty() {
2081        assert_eq!(normalize_text(""), "");
2082        assert_eq!(normalize_text("   \n\n  \n"), "");
2083    }
2084
2085    // ── per-format extraction against corpus-c fixtures ───────────────────────
2086
2087    #[test]
2088    fn extract_text_pdf_matches_known_good() {
2089        let got = extract(&fixture("text.pdf")).unwrap();
2090        assert_eq!(got.metadata["format"], MetaValue::Str("pdf".into()));
2091        assert_eq!(got.metadata["pages"], MetaValue::Num(1));
2092        assert_eq!(tokens(&got.text), tokens(&expected("text.pdf")));
2093    }
2094
2095    #[test]
2096    fn extract_weird_fonts_pdf_matches_known_good() {
2097        let got = extract(&fixture("weird-fonts.pdf")).unwrap();
2098        assert_eq!(tokens(&got.text), tokens(&expected("weird-fonts.pdf")));
2099    }
2100
2101    #[test]
2102    fn extract_multi_column_pdf_matches_content_order_agnostic() {
2103        // pdf-extract reads column-by-column; the known-good `.txt` captures the
2104        // interleaved (pdftotext) order. Both carry identical content — assert
2105        // the line SET, not the order. (README § multi-column.)
2106        let got = extract(&fixture("multi-column.pdf")).unwrap();
2107        assert_eq!(line_set(&got.text), line_set(&expected("multi-column.pdf")));
2108    }
2109
2110    #[test]
2111    fn extract_image_only_pdf_yields_empty() {
2112        // No text layer → empty out, never hallucinated text. OCR out of scope.
2113        let got = extract(&fixture("image-only.pdf")).unwrap();
2114        assert_eq!(got.text, "");
2115        assert!(expected("image-only.pdf").trim().is_empty());
2116    }
2117
2118    #[test]
2119    fn extract_encrypted_pdf_without_password_refuses_cleanly() {
2120        let err = extract(&fixture("encrypted.pdf")).unwrap_err();
2121        assert!(
2122            matches!(err, ExtractError::Encrypted(_)),
2123            "expected Encrypted, got {err:?}"
2124        );
2125        assert_eq!(err.code(), "DOCUMENT_ENCRYPTED");
2126    }
2127
2128    #[test]
2129    fn guard_pdf_panic_contains_unwind_as_parse_error() {
2130        // The "never panics" contract: an internal pdf-extract/lopdf panic must
2131        // surface as a typed ExtractError::Parse, not abort the process. (cargo
2132        // captures the unwind's stderr line for a passing test.)
2133        let contained: Result<()> = guard_pdf_panic(|| panic!("simulated pdf-extract abort"));
2134        assert!(
2135            matches!(contained, Err(ExtractError::Parse { format: "pdf", .. })),
2136            "panic must be contained as a pdf Parse error, got {contained:?}"
2137        );
2138        // The success path is transparent — the value passes straight through.
2139        let ok: Result<u32> = guard_pdf_panic(|| 42);
2140        assert_eq!(ok.unwrap(), 42);
2141    }
2142
2143    #[test]
2144    fn extract_docx_matches_known_good() {
2145        let got = extract(&fixture("sample.docx")).unwrap();
2146        assert_eq!(got.metadata["format"], MetaValue::Str("docx".into()));
2147        assert_eq!(tokens(&got.text), tokens(&expected("sample.docx")));
2148    }
2149
2150    #[test]
2151    fn extract_xlsx_matches_known_good() {
2152        let got = extract(&fixture("sample.xlsx")).unwrap();
2153        assert_eq!(got.metadata["format"], MetaValue::Str("spreadsheet".into()));
2154        assert_eq!(got.metadata["sheets"], MetaValue::Num(1));
2155        assert_eq!(
2156            got.metadata["sheet_names"],
2157            MetaValue::Str("Expenses".into())
2158        );
2159        // Tab-separated, integers without `.0` — exact match (no soft-wrap risk).
2160        assert_eq!(got.text.trim_end(), expected("sample.xlsx").trim_end());
2161    }
2162
2163    #[test]
2164    fn extract_epub_matches_known_good() {
2165        let got = extract(&fixture("sample.epub")).unwrap();
2166        assert_eq!(got.metadata["format"], MetaValue::Str("epub".into()));
2167        assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
2168        assert_eq!(
2169            got.metadata["title"],
2170            MetaValue::Str("Operations Playbook".into())
2171        );
2172        assert_eq!(tokens(&got.text), tokens(&expected("sample.epub")));
2173    }
2174
2175    #[test]
2176    fn extract_html_matches_known_good() {
2177        let got = extract(&fixture("sample.html")).unwrap();
2178        assert_eq!(got.metadata["format"], MetaValue::Str("html".into()));
2179        assert_eq!(tokens(&got.text), tokens(&expected("sample.html")));
2180    }
2181
2182    // ── helper-level unit tests ───────────────────────────────────────────────
2183
2184    #[test]
2185    fn unwrap_brackets_flattens_link_text() {
2186        assert_eq!(
2187            unwrap_brackets("contact [ops@acme.example] or the [handbook]."),
2188            "contact ops@acme.example or the handbook."
2189        );
2190        // Unmatched '[' is preserved.
2191        assert_eq!(unwrap_brackets("a [b c"), "a [b c");
2192        // No brackets → untouched.
2193        assert_eq!(unwrap_brackets("plain text"), "plain text");
2194    }
2195
2196    #[test]
2197    fn strip_markdown_decorations_drops_heading_hashes() {
2198        let input = "# Title\n## Section\n* bullet\n1. ordered\nplain\n";
2199        let out = strip_markdown_decorations(input);
2200        assert_eq!(out, "Title\nSection\n* bullet\n1. ordered\nplain\n");
2201    }
2202
2203    #[test]
2204    fn local_name_strips_prefix() {
2205        assert_eq!(local_name(b"w:t"), b"t");
2206        assert_eq!(local_name(b"t"), b"t");
2207        assert_eq!(local_name(b"dc:title"), b"title");
2208    }
2209
2210    #[test]
2211    fn extracted_serializes_to_text_metadata_json() {
2212        let got = extract(&fixture("sample.xlsx")).unwrap();
2213        let json = serde_json::to_value(&got).unwrap();
2214        assert!(json.get("text").is_some());
2215        assert_eq!(json["metadata"]["format"], "spreadsheet");
2216        assert_eq!(json["metadata"]["sheets"], 1);
2217        // MetaValue::Num serializes as a bare JSON number, Str as a bare string.
2218        assert!(json["metadata"]["sheets"].is_number());
2219        assert!(json["metadata"]["format"].is_string());
2220    }
2221
2222    // ── regression: leading-blank normalization is linear (finding #13) ────────
2223
2224    /// `normalize_text` must trim leading blank lines in O(n), not O(n²). The
2225    /// pre-fix loop used `lines.remove(0)` per blank line — O(n) shift each, so a
2226    /// document dominated by leading blanks took O(n²) and hung extraction.
2227    ///
2228    /// 500_000 leading blank lines is ~2.5e11 element shifts under the old code
2229    /// (minutes-to-hours, effectively a hang) but instant under the index-and-
2230    /// slice path; the test reconstructs the finding's trigger (an adapter output
2231    /// that is mostly leading blanks then one line of text) and asserts the
2232    /// correct, fully-trimmed result. Against the pre-fix code this test does not
2233    /// complete in a reasonable time — encoding the quadratic regression.
2234    #[test]
2235    fn regression_normalize_text_leading_blanks_is_linear() {
2236        let blanks = "\n".repeat(500_000);
2237        let raw = format!("{blanks}only real line\n");
2238        // Leading blanks fully trimmed; single trailing newline; body intact.
2239        assert_eq!(normalize_text(&raw), "only real line\n");
2240
2241        // A wholly-blank giant input still collapses to empty (the other branch).
2242        assert_eq!(normalize_text(&"   \n".repeat(500_000)), "");
2243    }
2244
2245    // ── regression: spreadsheet dense-grid bomb is refused (finding #4) ────────
2246
2247    /// Build a VALID `.xlsx` whose single sheet declares two real cells at the
2248    /// opposite corners of Excel's grid (`A1` and `XFD1048576`). `calamine`
2249    /// materializes a sheet as a DENSE `Vec<Data>` sized from the MIN/MAX cell
2250    /// positions, so this two-cell sheet would force a ~1.7e10-element (~400 GB)
2251    /// allocation and abort the process. We reuse the corpus `sample.xlsx`
2252    /// container verbatim and swap ONLY `xl/worksheets/sheet1.xml`, so every
2253    /// other part (workbook, rels, content-types) is a real, openable workbook.
2254    fn write_dense_bomb_xlsx(dest: &Path) {
2255        use std::io::Write;
2256
2257        let base = std::fs::read(fixture("sample.xlsx")).expect("corpus sample.xlsx exists");
2258        let mut archive =
2259            zip::ZipArchive::new(std::io::Cursor::new(base)).expect("sample.xlsx is a valid zip");
2260
2261        let bomb_sheet = b"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\
2262<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\">\
2263<sheetData>\
2264<row r=\"1\"><c r=\"A1\"><v>1</v></c></row>\
2265<row r=\"1048576\"><c r=\"XFD1048576\"><v>2</v></c></row>\
2266</sheetData></worksheet>";
2267
2268        let out = std::fs::File::create(dest).unwrap();
2269        let mut writer = zip::ZipWriter::new(out);
2270        let opts = zip::write::SimpleFileOptions::default()
2271            .compression_method(zip::CompressionMethod::Stored);
2272
2273        for i in 0..archive.len() {
2274            let entry = archive.by_index(i).unwrap();
2275            let name = entry.name().to_string();
2276            if name == "xl/worksheets/sheet1.xml" {
2277                writer.start_file(name, opts).unwrap();
2278                writer.write_all(bomb_sheet).unwrap();
2279            } else {
2280                // Copy every other entry's already-compressed bytes verbatim.
2281                writer.raw_copy_file(entry).unwrap();
2282            }
2283        }
2284        writer.finish().unwrap();
2285    }
2286
2287    /// A spreadsheet whose declared dense grid exceeds [`MAX_SPREADSHEET_CELLS`]
2288    /// is refused with a typed [`ExtractError::Parse`] BEFORE calamine allocates
2289    /// the dense matrix — never an OOM/abort. Pre-fix, `extract_spreadsheet`
2290    /// called `worksheet_range` directly and the process aborted on the
2291    /// allocation; this test would not return (it would kill the test runner),
2292    /// so it encodes the resource-exhaustion regression.
2293    #[test]
2294    fn regression_spreadsheet_dense_bomb_refused_not_oom() {
2295        let tmp = tempfile::TempDir::new().unwrap();
2296        let bomb = tmp.path().join("invoice.xlsx");
2297        write_dense_bomb_xlsx(&bomb);
2298
2299        // A few-hundred-byte file on disk — the whole point of the bomb.
2300        assert!(
2301            std::fs::metadata(&bomb).unwrap().len() < 10_000,
2302            "the bomb must be tiny on disk; the danger is the in-memory expansion"
2303        );
2304
2305        let err = extract(&bomb).unwrap_err();
2306        assert!(
2307            matches!(
2308                err,
2309                ExtractError::Parse {
2310                    format: "spreadsheet",
2311                    ..
2312                }
2313            ),
2314            "an over-cap dense grid must be a typed spreadsheet Parse refusal, got {err:?}"
2315        );
2316        assert_eq!(err.code(), "EXTRACT_PARSE_ERROR");
2317    }
2318
2319    /// The cap is a guard, not a wall: a normal spreadsheet still extracts. Locks
2320    /// down that the preflight bound does not regress the legitimate path (the
2321    /// corpus `sample.xlsx` is a 3×3 grid, far under the cap).
2322    #[test]
2323    fn regression_spreadsheet_cap_allows_real_workbook() {
2324        let got = extract(&fixture("sample.xlsx")).unwrap();
2325        assert_eq!(got.metadata["sheets"], MetaValue::Num(1));
2326        assert!(!got.text.is_empty());
2327    }
2328
2329    /// Build a minimal `.ods` (OpenDocument Spreadsheet) whose `content.xml`
2330    /// body is exactly `content_xml`, written to `dest`. Lets a test inject a
2331    /// truncated/unclosed document XML and drive it through the real
2332    /// `extract_spreadsheet` ODS path. The mimetype + manifest members make
2333    /// calamine's auto-detector recognize the package as ODS.
2334    fn write_ods_with_content(dest: &Path, content_xml: &str) {
2335        use std::io::Write;
2336        let manifest = "<?xml version=\"1.0\"?>\
2337<manifest:manifest xmlns:manifest=\"urn:oasis:names:tc:opendocument:xmlns:manifest:1.0\">\
2338<manifest:file-entry manifest:full-path=\"/\" \
2339manifest:media-type=\"application/vnd.oasis.opendocument.spreadsheet\"/></manifest:manifest>";
2340        let file = std::fs::File::create(dest).unwrap();
2341        let mut writer = zip::ZipWriter::new(file);
2342        let stored = zip::write::SimpleFileOptions::default()
2343            .compression_method(zip::CompressionMethod::Stored);
2344        // The mimetype member must be the first, STORED entry for OpenDocument.
2345        writer.start_file("mimetype", stored).unwrap();
2346        writer
2347            .write_all(b"application/vnd.oasis.opendocument.spreadsheet")
2348            .unwrap();
2349        writer.start_file("META-INF/manifest.xml", stored).unwrap();
2350        writer.write_all(manifest.as_bytes()).unwrap();
2351        writer.start_file("content.xml", stored).unwrap();
2352        writer.write_all(content_xml.as_bytes()).unwrap();
2353        writer.finish().unwrap();
2354    }
2355
2356    /// A truncated `.ods` — `content.xml` opens `<table:table>` then hits EOF
2357    /// before the matching `</table:table>` — must be REFUSED fast with a typed
2358    /// Parse error, not spin forever inside calamine's unbounded ODS reader
2359    /// (resource-exhaustion DoS on untrusted `sources/` input). Pre-fix this test
2360    /// hangs (calamine's `worksheet_range` never returns); post-fix the structural
2361    /// pre-scan refuses it in microseconds. A well-formed `.ods` still extracts.
2362    #[test]
2363    fn regression_truncated_ods_is_refused_not_hung() {
2364        let tmp = tempfile::TempDir::new().unwrap();
2365
2366        // Truncated: the spreadsheet opens `<table:table>` and the document ends
2367        // there — exactly the EOF-mid-table shape that hangs the ODS reader.
2368        let trunc = tmp.path().join("trunc.ods");
2369        let truncated_content = "<?xml version=\"1.0\"?>\
2370<office:document-content \
2371xmlns:office=\"urn:oasis:names:tc:opendocument:xmlns:office:1.0\" \
2372xmlns:table=\"urn:oasis:names:tc:opendocument:xmlns:table:1.0\">\
2373<office:body><office:spreadsheet><table:table table:name=\"S\">";
2374        write_ods_with_content(&trunc, truncated_content);
2375
2376        let start = std::time::Instant::now();
2377        let err = extract(&trunc).unwrap_err();
2378        let elapsed = start.elapsed();
2379        assert!(
2380            matches!(&err, ExtractError::Parse { format, .. } if *format == "spreadsheet"),
2381            "a truncated .ods must be a typed spreadsheet Parse refusal, got {err:?}"
2382        );
2383        assert_eq!(err.code(), "EXTRACT_PARSE_ERROR");
2384        assert!(
2385            elapsed < std::time::Duration::from_secs(1),
2386            "the truncated .ods must fail fast (<1s); took {elapsed:?} (would-be hang)"
2387        );
2388
2389        // A well-formed `.ods` with a single 1-row, 2-cell table still extracts
2390        // its cell text — the pre-scan must not regress valid spreadsheets.
2391        let ok = tmp.path().join("ok.ods");
2392        let valid_content = "<?xml version=\"1.0\"?>\
2393<office:document-content \
2394xmlns:office=\"urn:oasis:names:tc:opendocument:xmlns:office:1.0\" \
2395xmlns:table=\"urn:oasis:names:tc:opendocument:xmlns:table:1.0\" \
2396xmlns:text=\"urn:oasis:names:tc:opendocument:xmlns:text:1.0\">\
2397<office:body><office:spreadsheet>\
2398<table:table table:name=\"S\">\
2399<table:table-row>\
2400<table:table-cell office:value-type=\"string\"><text:p>Alpha</text:p></table:table-cell>\
2401<table:table-cell office:value-type=\"string\"><text:p>Beta</text:p></table:table-cell>\
2402</table:table-row>\
2403</table:table>\
2404</office:spreadsheet></office:body></office:document-content>";
2405        write_ods_with_content(&ok, valid_content);
2406        let got = extract(&ok).unwrap();
2407        assert!(
2408            got.text.contains("Alpha") && got.text.contains("Beta"),
2409            "a valid .ods must still extract its cell text, got {:?}",
2410            got.text
2411        );
2412    }
2413
2414    // ── regression: entity-ref / CDATA fidelity (findings #34, #1011) ──────────
2415
2416    /// Build a minimal valid `.docx` whose `word/document.xml` body is the given
2417    /// run XML, written to `dest`. Only the three OOXML members `extract_docx`
2418    /// touches need to be real; the rest of a Word package is optional for text
2419    /// extraction.
2420    fn write_docx(dest: &Path, body_runs: &str) {
2421        use std::io::Write;
2422        let document = format!(
2423            "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\
2424<w:document xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\">\
2425<w:body>{body_runs}</w:body></w:document>"
2426        );
2427        let file = std::fs::File::create(dest).unwrap();
2428        let mut writer = zip::ZipWriter::new(file);
2429        let opts = zip::write::SimpleFileOptions::default()
2430            .compression_method(zip::CompressionMethod::Stored);
2431        writer.start_file("word/document.xml", opts).unwrap();
2432        writer.write_all(document.as_bytes()).unwrap();
2433        writer.finish().unwrap();
2434    }
2435
2436    #[test]
2437    fn regression_docx_resolves_entity_refs() {
2438        // quick-xml 0.40 surfaces `&amp;`/`&lt;`/`&gt;`/`&#8212;` as separate
2439        // GeneralRef events; pre-fix they were routed to `_ => {}` and dropped,
2440        // corrupting `Smith & Co invoice <final> total — 100`.
2441        let tmp = tempfile::TempDir::new().unwrap();
2442        let f = tmp.path().join("entity.docx");
2443        write_docx(
2444            &f,
2445            "<w:p><w:r><w:t>Smith &amp; Co invoice &lt;final&gt; total &#8212; 100</w:t></w:r></w:p>",
2446        );
2447        let got = extract(&f).unwrap();
2448        assert_eq!(got.text, "Smith & Co invoice <final> total — 100\n");
2449    }
2450
2451    #[test]
2452    fn regression_docx_preserves_cdata_run_text() {
2453        // CDATA inside `<w:t>` is valid and literal; pre-fix it fell through the
2454        // wildcard arm and the payload vanished.
2455        let tmp = tempfile::TempDir::new().unwrap();
2456        let f = tmp.path().join("cdata.docx");
2457        write_docx(
2458            &f,
2459            "<w:p><w:r><w:t>Line A.</w:t></w:r></w:p>\
2460<w:p><w:r><w:t><![CDATA[IMPORTANT CDATA CONTENT]]></w:t></w:r></w:p>\
2461<w:p><w:r><w:t>Line C.</w:t></w:r></w:p>",
2462        );
2463        let got = extract(&f).unwrap();
2464        assert_eq!(got.text, "Line A.\nIMPORTANT CDATA CONTENT\nLine C.\n");
2465    }
2466
2467    #[test]
2468    fn resolve_entity_ref_maps_named_and_numeric() {
2469        use quick_xml::events::BytesRef;
2470        let r = |s: &'static str| resolve_entity_ref(&BytesRef::new(s));
2471        assert_eq!(r("amp"), "&");
2472        assert_eq!(r("lt"), "<");
2473        assert_eq!(r("gt"), ">");
2474        assert_eq!(r("quot"), "\"");
2475        assert_eq!(r("apos"), "'");
2476        assert_eq!(r("#8212"), "—");
2477        assert_eq!(r("#x2014"), "—");
2478        // Unknown named entity → bare name (best-effort, never a panic).
2479        assert_eq!(r("nbsp"), "nbsp");
2480    }
2481
2482    // ── regression: EPUB OPF parsing (findings #35, #37, #1012) ────────────────
2483
2484    /// Build a minimal valid EPUB at `dest`. `opf_metadata` is spliced verbatim
2485    /// inside `<metadata>`; `manifest_href` is the chapter item's href; the
2486    /// chapter XHTML is stored under the literal zip entry `chapter_entry`. The
2487    /// mimetype member is written first and stored (per the EPUB OCF spec).
2488    fn write_epub(dest: &Path, opf_metadata: &str, manifest_href: &str, chapter_entry: &str) {
2489        use std::io::Write;
2490        let container = "<?xml version=\"1.0\"?>\
2491<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\
2492<rootfiles><rootfile full-path=\"OEBPS/content.opf\" \
2493media-type=\"application/oebps-package+xml\"/></rootfiles></container>";
2494        let opf = format!(
2495            "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
2496<package xmlns=\"http://www.idpf.org/2007/opf\" version=\"3.0\" unique-identifier=\"id\">\
2497<metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\">{opf_metadata}</metadata>\
2498<manifest><item id=\"c1\" href=\"{manifest_href}\" media-type=\"application/xhtml+xml\"/></manifest>\
2499<spine><itemref idref=\"c1\"/></spine></package>"
2500        );
2501        let chapter = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
2502<html xmlns=\"http://www.w3.org/1999/xhtml\"><body>\
2503<p>Hello world body text.</p></body></html>";
2504
2505        let file = std::fs::File::create(dest).unwrap();
2506        let mut writer = zip::ZipWriter::new(file);
2507        let stored = zip::write::SimpleFileOptions::default()
2508            .compression_method(zip::CompressionMethod::Stored);
2509        // mimetype must be the first member and stored uncompressed.
2510        writer.start_file("mimetype", stored).unwrap();
2511        writer.write_all(b"application/epub+zip").unwrap();
2512        writer.start_file("META-INF/container.xml", stored).unwrap();
2513        writer.write_all(container.as_bytes()).unwrap();
2514        writer.start_file("OEBPS/content.opf", stored).unwrap();
2515        writer.write_all(opf.as_bytes()).unwrap();
2516        writer.start_file(chapter_entry, stored).unwrap();
2517        writer.write_all(chapter.as_bytes()).unwrap();
2518        writer.finish().unwrap();
2519    }
2520
2521    #[test]
2522    fn regression_epub_title_accumulates_entities_and_nested_events() {
2523        // Pre-fix the title was cut at the first Text node, so an entity or a
2524        // comment inside `<dc:title>` truncated it.
2525        let tmp = tempfile::TempDir::new().unwrap();
2526
2527        let f1 = tmp.path().join("entity.epub");
2528        write_epub(
2529            &f1,
2530            "<dc:title>Smith &amp; Jones: A &lt;Tale&gt;</dc:title>",
2531            "chapter.xhtml",
2532            "OEBPS/chapter.xhtml",
2533        );
2534        let got = extract(&f1).unwrap();
2535        assert_eq!(
2536            got.metadata["title"],
2537            MetaValue::Str("Smith & Jones: A <Tale>".into())
2538        );
2539
2540        let f2 = tmp.path().join("comment.epub");
2541        write_epub(
2542            &f2,
2543            "<dc:title>Part One<!-- editorial --> and Part Two</dc:title>",
2544            "chapter.xhtml",
2545            "OEBPS/chapter.xhtml",
2546        );
2547        let got = extract(&f2).unwrap();
2548        assert_eq!(
2549            got.metadata["title"],
2550            MetaValue::Str("Part One and Part Two".into())
2551        );
2552    }
2553
2554    #[test]
2555    fn regression_epub_self_closing_title_does_not_capture_author() {
2556        // A self-closing `<dc:title/>` (an untitled book) must NOT latch the next
2557        // text node (the author) as the title.
2558        let tmp = tempfile::TempDir::new().unwrap();
2559        let f = tmp.path().join("empty-title.epub");
2560        write_epub(
2561            &f,
2562            "<dc:title/><dc:creator>John Doe</dc:creator>",
2563            "chapter.xhtml",
2564            "OEBPS/chapter.xhtml",
2565        );
2566        let got = extract(&f).unwrap();
2567        // No (or empty) title — never the author. `put_str` omits empty values.
2568        assert!(
2569            !got.metadata.contains_key("title"),
2570            "self-closing title must not capture the author, got {:?}",
2571            got.metadata.get("title")
2572        );
2573        // The chapter still extracts.
2574        assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
2575    }
2576
2577    /// Build an `.epub` whose spine references the single chapter `spine_count`
2578    /// times — the spine-amplification shape.
2579    fn write_epub_with_spine(dest: &Path, spine_count: usize) {
2580        use std::io::Write;
2581        let container = "<?xml version=\"1.0\"?>\
2582<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\
2583<rootfiles><rootfile full-path=\"OEBPS/content.opf\" \
2584media-type=\"application/oebps-package+xml\"/></rootfiles></container>";
2585        let itemrefs = "<itemref idref=\"c1\"/>".repeat(spine_count);
2586        let opf = format!(
2587            "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
2588<package xmlns=\"http://www.idpf.org/2007/opf\" version=\"3.0\" unique-identifier=\"id\">\
2589<metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\"><dc:title>Bomb</dc:title></metadata>\
2590<manifest><item id=\"c1\" href=\"chapter.xhtml\" media-type=\"application/xhtml+xml\"/></manifest>\
2591<spine>{itemrefs}</spine></package>"
2592        );
2593        let chapter = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
2594<html xmlns=\"http://www.w3.org/1999/xhtml\"><body><p>Repeated chapter body.</p></body></html>";
2595        let file = std::fs::File::create(dest).unwrap();
2596        let mut writer = zip::ZipWriter::new(file);
2597        let stored = zip::write::SimpleFileOptions::default()
2598            .compression_method(zip::CompressionMethod::Stored);
2599        writer.start_file("mimetype", stored).unwrap();
2600        writer.write_all(b"application/epub+zip").unwrap();
2601        writer.start_file("META-INF/container.xml", stored).unwrap();
2602        writer.write_all(container.as_bytes()).unwrap();
2603        writer.start_file("OEBPS/content.opf", stored).unwrap();
2604        writer.write_all(opf.as_bytes()).unwrap();
2605        writer.start_file("OEBPS/chapter.xhtml", stored).unwrap();
2606        writer.write_all(chapter.as_bytes()).unwrap();
2607        writer.finish().unwrap();
2608    }
2609
2610    #[test]
2611    fn regression_epub_spine_amplification_is_bounded() {
2612        // Adversarial review #8: a tiny .epub whose spine references the same
2613        // chapter a huge number of times pegged a CPU core (re-decoding +
2614        // re-rendering the chapter each time) and ballooned output. The spine
2615        // length is now capped, so an over-cap spine is REFUSED — fast, never
2616        // hung.
2617        let tmp = tempfile::TempDir::new().unwrap();
2618        let bomb = tmp.path().join("bomb.epub");
2619        write_epub_with_spine(&bomb, MAX_EPUB_SPINE_ITEMS + 1);
2620        let err = extract(&bomb).unwrap_err();
2621        assert!(
2622            matches!(&err, ExtractError::Parse { message, .. } if message.contains("spine")),
2623            "an over-cap spine must be refused with a spine error; got {err:?}"
2624        );
2625
2626        // A legitimate small repeat-spine still extracts: memoization renders the
2627        // shared chapter once, but each reading-order reference is still counted.
2628        let ok = tmp.path().join("ok.epub");
2629        write_epub_with_spine(&ok, 5);
2630        let got = extract(&ok).unwrap();
2631        assert_eq!(got.metadata["chapters"], MetaValue::Num(5));
2632    }
2633
2634    #[test]
2635    fn regression_epub_percent_encoded_href_resolves() {
2636        // An href `my%20chapter.xhtml` must match the zip entry
2637        // `OEBPS/my chapter.xhtml`; pre-fix the lookup failed and the chapter was
2638        // silently dropped (empty text, 0 chapters).
2639        let tmp = tempfile::TempDir::new().unwrap();
2640        let f = tmp.path().join("spaced.epub");
2641        write_epub(
2642            &f,
2643            "<dc:title>Spaced</dc:title>",
2644            "my%20chapter.xhtml",
2645            "OEBPS/my chapter.xhtml",
2646        );
2647        let got = extract(&f).unwrap();
2648        assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
2649        assert!(
2650            got.text.contains("Hello world body text."),
2651            "percent-encoded-href chapter must extract, got {:?}",
2652            got.text
2653        );
2654    }
2655
2656    #[test]
2657    fn percent_decode_handles_spaces_and_unicode_and_stray_percent() {
2658        assert_eq!(percent_decode("my%20chapter.xhtml"), "my chapter.xhtml");
2659        // `%C3%A9` is UTF-8 for `é`.
2660        assert_eq!(percent_decode("caf%C3%A9.xhtml"), "café.xhtml");
2661        // A stray `%` not followed by two hex digits is emitted verbatim.
2662        assert_eq!(percent_decode("100%done"), "100%done");
2663        assert_eq!(percent_decode("plain.xhtml"), "plain.xhtml");
2664    }
2665
2666    #[test]
2667    fn normalize_zip_path_resolves_dot_segments() {
2668        assert_eq!(
2669            normalize_zip_path("OEBPS/../text/ch1.xhtml"),
2670            "text/ch1.xhtml"
2671        );
2672        assert_eq!(normalize_zip_path("OEBPS/./ch1.xhtml"), "OEBPS/ch1.xhtml");
2673        assert_eq!(normalize_zip_path("OEBPS/ch1.xhtml"), "OEBPS/ch1.xhtml");
2674    }
2675
2676    // ── regression: spreadsheet date rendering (finding #1013) ─────────────────
2677
2678    #[test]
2679    fn render_excel_datetime_renders_iso_not_serial() {
2680        use calamine::{ExcelDateTime, ExcelDateTimeType};
2681        // 46188 → 2026-06-15 (date only, midnight → no time component).
2682        let date = ExcelDateTime::new(46188.0, ExcelDateTimeType::DateTime, false);
2683        assert_eq!(render_excel_datetime(&date), "2026-06-15");
2684        // 46143.5 → 2026-05-01 12:00:00 (has a time component).
2685        let dt = ExcelDateTime::new(46143.5, ExcelDateTimeType::DateTime, false);
2686        assert_eq!(render_excel_datetime(&dt), "2026-05-01 12:00:00");
2687        // A duration is elapsed time, not a calendar date → keep the serial form.
2688        let dur = ExcelDateTime::new(1.5, ExcelDateTimeType::TimeDelta, false);
2689        assert_eq!(render_excel_datetime(&dur), "1.5");
2690    }
2691
2692    #[test]
2693    fn render_cell_dates_are_iso() {
2694        use calamine::{Data, ExcelDateTime, ExcelDateTimeType};
2695        assert_eq!(
2696            render_cell(&Data::DateTime(ExcelDateTime::new(
2697                46188.0,
2698                ExcelDateTimeType::DateTime,
2699                false
2700            ))),
2701            "2026-06-15"
2702        );
2703        // The integer/float/string paths are unchanged by the date fix.
2704        assert_eq!(render_cell(&Data::Float(3450.0)), "3450");
2705        assert_eq!(render_cell(&Data::Int(7)), "7");
2706    }
2707
2708    // ── regression: HTML/EPUB literal-content fidelity (finding #36) ───────────
2709
2710    /// Render an HTML body string through the production extract path.
2711    fn html_text(body: &str) -> String {
2712        let tmp = tempfile::TempDir::new().unwrap();
2713        let f = tmp.path().join("doc.html");
2714        std::fs::write(&f, format!("<html><body>{body}</body></html>")).unwrap();
2715        extract(&f).unwrap().text
2716    }
2717
2718    #[test]
2719    fn regression_html_keeps_literal_brackets_and_hashes() {
2720        // Pre-fix every `[bracketed]` substring and every leading-`#` run was
2721        // stripped from real prose, fusing `total[net]` into `totalnet` and
2722        // deleting the `#` from `#1 in sales`.
2723        let out = html_text(
2724            "<p>#1 in sales this quarter</p>\
2725<p>see chart[3] for data, array[0] = total[net]</p>",
2726        );
2727        assert!(out.contains("#1 in sales this quarter"), "got {out:?}");
2728        assert!(
2729            out.contains("see chart[3] for data, array[0] = total[net]"),
2730            "got {out:?}"
2731        );
2732
2733        // Citation markers and subscripts survive intact.
2734        let out = html_text("<p>See note [1] and [sic] here.</p><p>x[i] + y[j]</p>");
2735        assert!(out.contains("See note [1] and [sic] here."), "got {out:?}");
2736        assert!(out.contains("x[i] + y[j]"), "got {out:?}");
2737    }
2738
2739    #[test]
2740    fn html_headings_render_as_plain_prose_no_hash() {
2741        // A real `<h1>` heading still renders WITHOUT a `#` marker (the renderer
2742        // emits no heading prefix now), so headings read as prose.
2743        let out = html_text("<h1>Launch Plan</h1><p>Body prose.</p>");
2744        assert!(out.contains("Launch Plan"), "got {out:?}");
2745        assert!(
2746            !out.contains('#'),
2747            "no heading marker expected, got {out:?}"
2748        );
2749    }
2750
2751    #[test]
2752    fn html_links_render_as_bare_text_no_brackets() {
2753        // Link display text renders bare; the surrounding `[...]` the stock plain
2754        // decorator would add is gone.
2755        let out = html_text("<p>See the <a href=\"https://x.example\">handbook</a>.</p>");
2756        assert!(out.contains("See the handbook."), "got {out:?}");
2757    }
2758}
dbmd_core/extract.rs

dbmd_core/
extract.rs