dbmd_core/
extract.rs

1//! Document text extraction — the `dbmd extract` engine.
2//!
3//! `sources/` is where raw evidence lands: invoices, contracts, reports,
4//! exports. Most of it arrives as binary documents (PDF, Word, Excel, EPUB) or
5//! HTML, not markdown. Before an agent can reason over that evidence — wiki-link
6//! it, summarize it into the wiki layer, file a typed record that cites it — the
7//! text has to come out. This module is that step: a binary document in, plain
8//! UTF-8 text out, format chosen by file extension.
9//!
10//! # What this is, and is not
11//!
12//! - **Deterministic decoders only.** Every adapter is a format parser
13//!   (`pdf-extract`, `calamine`, `html2text`, `quick-xml`+`zip`). There is **no
14//!   AI, no OCR, no embeddings** here — consistent with the crate-wide invariant
15//!   (`lib.rs`). The agent driving `dbmd` is the semantic layer; this is plumbing.
16//! - **Text layer, not pixels.** A scanned PDF with no text layer yields the
17//!   empty string — *empty in, empty out, never hallucinated text.* OCR is an
18//!   explicit non-goal (a future `dbmd-ocr`).
19//! - **Single document, single call.** [`extract`] handles one file. Walking a
20//!   store and extracting every document is the caller's loop, not this module's.
21//!
22//! # Format dispatch
23//!
24//! [`Format::from_path`] maps the file extension to an adapter; [`extract`]
25//! dispatches:
26//!
27//! | Extension                | Format            | Adapter                          |
28//! |--------------------------|-------------------|----------------------------------|
29//! | `.pdf`                   | [`Format::Pdf`]   | `pdf-extract`                    |
30//! | `.docx`                  | [`Format::Docx`]  | `zip` + `quick-xml` (`w:t` runs) |
31//! | `.xlsx` / `.xlsm` / `.xlsb` / `.ods` | [`Format::Spreadsheet`] | `calamine` |
32//! | `.epub`                  | [`Format::Epub`]  | `zip` + `quick-xml` + `html2text`|
33//! | `.html` / `.htm` / `.xhtml` | [`Format::Html`] | `html2text`                    |
34//!
35//! Anything else is [`ExtractError::UnsupportedFormat`] — a typed refusal the
36//! CLI surfaces with a stable code, never a panic.
37
38use std::collections::BTreeMap;
39use std::io::Read;
40use std::panic::{catch_unwind, AssertUnwindSafe};
41use std::path::Path;
42
43use serde::Serialize;
44
/// The result of extracting one document: the plain text plus a small,
/// format-tagged metadata map.
///
/// This is the `--json` shape the CLI emits verbatim (`{text, metadata}`); in
/// plain mode the CLI prints [`Extracted::text`] and discards the metadata.
/// Metadata is intentionally minimal and best-effort — extraction never *fails*
/// for want of a title; it just omits the key.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct Extracted {
    /// The extracted plain text (UTF-8), normalized to `\n` line endings with
    /// trailing whitespace trimmed per line and a single trailing newline. For
    /// a document with no recoverable text layer (e.g. a scanned, image-only
    /// PDF) this is the empty string — the contract is "empty in, empty out."
    pub text: String,

    /// Best-effort key/value metadata. Always carries `format` (the adapter
    /// that ran, e.g. `"pdf"`). Adapters add what they cheaply know:
    /// `pages` / `sheets` (counts), `sheet_names` (the workbook's sheet names
    /// joined with `", "` into one string), and `title` (when the container
    /// declares one). A `BTreeMap` so `--json` output is key-ordered and stable.
    pub metadata: BTreeMap<String, MetaValue>,
}
66
67impl Extracted {
68    /// Build an [`Extracted`] from raw adapter text + the detected format,
69    /// applying the canonical text normalization ([`normalize_text`]) and
70    /// seeding the `format` metadata key.
71    fn new(raw_text: String, format: Format) -> Self {
72        let mut metadata = BTreeMap::new();
73        metadata.insert(
74            "format".to_string(),
75            MetaValue::Str(format.tag().to_string()),
76        );
77        Extracted {
78            text: normalize_text(&raw_text),
79            metadata,
80        }
81    }
82
83    /// Insert a string metadata key only when the value is non-empty (keeps the
84    /// map free of empty `title: ""` noise).
85    fn put_str(&mut self, key: &str, value: impl Into<String>) {
86        let v = value.into();
87        if !v.trim().is_empty() {
88            self.metadata.insert(key.to_string(), MetaValue::Str(v));
89        }
90    }
91
92    /// Insert a numeric (count) metadata key.
93    fn put_num(&mut self, key: &str, value: u64) {
94        self.metadata.insert(key.to_string(), MetaValue::Num(value));
95    }
96}
97
/// A metadata value: a string (title, format tag, sheet name list joined) or a
/// non-negative count (pages, sheets). Serializes to a bare JSON string or
/// number — no wrapper object — so `{text, metadata}` stays flat and readable.
///
/// The bare-value encoding comes from `#[serde(untagged)]`: the variant name
/// is never written, only the payload.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
pub enum MetaValue {
    /// A textual value (e.g. document title, the `format` tag).
    Str(String),
    /// A non-negative count (e.g. page count, sheet count).
    Num(u64),
}
109
/// The document formats `dbmd extract` understands — one variant per adapter.
/// [`Format::from_path`] picks the variant from a file's extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// Portable Document Format (`.pdf`) — text layer via `pdf-extract`.
    Pdf,
    /// Office Open XML WordprocessingML (`.docx`) — `w:t` runs via `quick-xml`.
    Docx,
    /// A spreadsheet (`.xlsx`/`.xlsm`/`.xlsb`/`.ods`) — cells via `calamine`.
    Spreadsheet,
    /// EPUB e-book (`.epub`) — spine XHTML via `zip` + `quick-xml` + `html2text`.
    Epub,
    /// HTML (`.html`/`.htm`/`.xhtml`) — plain text via `html2text`.
    Html,
}

impl Format {
    /// Look up the format for `path` by its extension, ignoring ASCII case.
    ///
    /// Yields `None` when the path has no extension, the extension is not
    /// valid UTF-8, or it is not one of the recognized extensions. [`extract`]
    /// reports that case as [`ExtractError::UnsupportedFormat`], carrying the
    /// offending extension.
    pub fn from_path(path: &Path) -> Option<Format> {
        // Every recognized extension (lowercase) and the adapter it selects.
        const BY_EXTENSION: &[(&str, Format)] = &[
            ("pdf", Format::Pdf),
            ("docx", Format::Docx),
            ("xlsx", Format::Spreadsheet),
            ("xlsm", Format::Spreadsheet),
            ("xlsb", Format::Spreadsheet),
            ("ods", Format::Spreadsheet),
            ("epub", Format::Epub),
            ("html", Format::Html),
            ("htm", Format::Html),
            ("xhtml", Format::Html),
        ];

        let ext = path.extension().and_then(|raw| raw.to_str())?;
        BY_EXTENSION
            .iter()
            .find(|(known, _)| ext.eq_ignore_ascii_case(known))
            .map(|&(_, format)| format)
    }

    /// The short, stable tag stored under `metadata.format` and quoted in
    /// error messages. A tag is not a file extension: several extensions can
    /// share one tag (all spreadsheet extensions map to `spreadsheet`).
    pub fn tag(self) -> &'static str {
        match self {
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Spreadsheet => "spreadsheet",
            Self::Epub => "epub",
            Self::Html => "html",
        }
    }
}
155
/// Errors from document extraction. Every variant is a typed refusal the CLI
/// maps to a stable machine code — extraction never panics on a bad or
/// encrypted input.
///
/// The `#[error(...)]` strings below are the `Display` text of each variant;
/// the machine-readable counterpart is [`ExtractError::code`].
#[derive(Debug, thiserror::Error)]
pub enum ExtractError {
    /// The file extension is missing or not one of the supported document
    /// formats. Carries the offending extension (or `""` when absent). The
    /// message prints it with `{0:?}`, i.e. quoted, so an absent extension is
    /// visible as `""` rather than as nothing.
    #[error("unsupported document format: {0:?} (supported: pdf, docx, xlsx/xlsm/xlsb/ods, epub, html/htm/xhtml)")]
    UnsupportedFormat(String),

    /// The document is encrypted/password-protected and could not be opened
    /// without a password (or with the wrong one). A clean refusal — the
    /// extractor must never emit partial/garbled bytes for a locked file.
    /// Carries the underlying parser's diagnostic.
    #[error("document is encrypted or password-protected: {0}")]
    Encrypted(String),

    /// A format adapter failed to parse a structurally invalid or corrupt
    /// document. Carries the adapter's diagnostic.
    #[error("failed to parse {format} document: {message}")]
    Parse {
        /// The format tag whose adapter failed (e.g. `"pdf"`, `"docx"`).
        format: &'static str,
        /// The underlying parser diagnostic.
        message: String,
    },

    /// An underlying I/O failure (file missing, unreadable, etc.). `#[from]`
    /// lets adapters propagate a `std::io::Error` with a plain `?`, and
    /// `transparent` forwards its message unchanged.
    #[error(transparent)]
    Io(#[from] std::io::Error),
}
186
187impl ExtractError {
188    /// A short, stable machine code for this error, mirrored at the CLI
189    /// boundary for `--json` output and exit-code mapping.
190    pub fn code(&self) -> &'static str {
191        match self {
192            ExtractError::UnsupportedFormat(_) => "UNSUPPORTED_FORMAT",
193            ExtractError::Encrypted(_) => "DOCUMENT_ENCRYPTED",
194            ExtractError::Parse { .. } => "EXTRACT_PARSE_ERROR",
195            ExtractError::Io(_) => "IO_ERROR",
196        }
197    }
198}
199
/// Result alias for extraction operations: `Result<T>` in this module means
/// `std::result::Result<T, ExtractError>`. Because the alias shadows the
/// prelude's two-parameter `Result`, code here that needs a different error
/// type spells out `std::result::Result` in full.
pub type Result<T> = std::result::Result<T, ExtractError>;
202
203/// Extract plain text (and best-effort metadata) from a document, choosing the
204/// adapter by the file's extension.
205///
206/// This is the single entry point the CLI calls. It reads exactly one file and
207/// returns one [`Extracted`]; there is no whole-store walk here (per the
208/// crate-wide O(changed) invariant — a store-wide extraction is the caller's
209/// loop). An unsupported extension is [`ExtractError::UnsupportedFormat`]; an
210/// encrypted PDF is [`ExtractError::Encrypted`]; neither panics.
211///
212/// # Examples
213///
214/// ```no_run
215/// use std::path::Path;
216/// let out = dbmd_core::extract::extract(Path::new("sources/docs/invoice.pdf"))?;
217/// println!("{}", out.text);
218/// # Ok::<(), dbmd_core::extract::ExtractError>(())
219/// ```
220pub fn extract(path: &Path) -> Result<Extracted> {
221    let format = Format::from_path(path).ok_or_else(|| {
222        let ext = path
223            .extension()
224            .and_then(|e| e.to_str())
225            .unwrap_or("")
226            .to_string();
227        ExtractError::UnsupportedFormat(ext)
228    })?;
229
230    match format {
231        Format::Pdf => extract_pdf(path),
232        Format::Docx => extract_docx(path),
233        Format::Spreadsheet => extract_spreadsheet(path),
234        Format::Epub => extract_epub(path),
235        Format::Html => extract_html(path),
236    }
237}
238
239// ─────────────────────────────────────────────────────────────────────────────
240// Text normalization
241// ─────────────────────────────────────────────────────────────────────────────
242
/// Canonicalize extracted text so output is stable across adapters:
///
/// 1. Normalize line endings to `\n` (`\r\n` and a lone `\r` both become `\n`).
/// 2. Trim trailing whitespace on each line (a whitespace-only line therefore
///    counts as blank).
/// 3. Collapse every run of two-or-more consecutive blank lines to a single
///    blank line.
/// 4. Trim leading/trailing blank lines, then end the text with exactly one
///    `\n` (unless the whole text is blank, which yields the empty string —
///    the image-only-PDF contract).
///
/// This is *layout* tidy-up only; it never reorders or drops words. Word-level
/// content is whatever the adapter recovered.
///
/// Runs in a single pass over the lines with no intermediate line vector and
/// no panic path, so a document dominated by blank lines (e.g. an adapter that
/// emits millions of empty paragraphs) still normalizes in linear time.
pub fn normalize_text(raw: &str) -> String {
    let unix = raw.replace("\r\n", "\n").replace('\r', "\n");

    // The result can only be shorter than (or as long as) the input plus the
    // final newline, so one allocation up front is enough.
    let mut out = String::with_capacity(unix.len() + 1);

    // A blank line is never written when it is seen. It is only remembered,
    // and materialized as ONE separator right before the next non-blank line.
    // That single rule covers all three blank-line behaviors:
    //   - leading blanks: nothing has been written yet, so nothing is owed;
    //   - runs of blanks: the flag is a bool, so a run owes one separator;
    //   - trailing blanks: no non-blank line follows, so the debt is dropped.
    let mut blank_owed = false;
    for line in unix.lines().map(str::trim_end) {
        if line.is_empty() {
            blank_owed = !out.is_empty();
            continue;
        }
        if blank_owed {
            out.push('\n');
            blank_owed = false;
        }
        out.push_str(line);
        out.push('\n');
    }
    out
}
291
292// ─────────────────────────────────────────────────────────────────────────────
293// PDF — pdf-extract
294// ─────────────────────────────────────────────────────────────────────────────
295
296/// Extract a PDF's text layer via `pdf-extract`.
297///
298/// A PDF with no text layer (a scanned image) yields the empty string — that is
299/// correct, not an error (OCR is out of scope). A password-protected PDF that
300/// cannot be opened is mapped to [`ExtractError::Encrypted`] rather than a raw
301/// parse error so the caller can branch on it. Metadata carries the page count
302/// when the document tree exposes it.
303///
304/// `pdf-extract`/`lopdf` `panic!` internally on some malformed-but-openable
305/// PDFs (e.g. an out-of-set base `/Encoding` name), so both parser calls are
306/// wrapped in [`std::panic::catch_unwind`]: an internal abort is contained and
307/// surfaced as [`ExtractError::Parse`], upholding this module's "never panics"
308/// contract on untrusted `sources/` input.
309fn extract_pdf(path: &Path) -> Result<Extracted> {
310    // Read the bytes ourselves so a missing/unreadable file is a clean
311    // `ExtractError::Io` (via `?`) before we hand anything to the PDF parser.
312    let bytes = std::fs::read(path)?;
313
314    let text = match guard_pdf_panic(|| pdf_extract::extract_text_from_mem(&bytes))? {
315        Ok(t) => t,
316        Err(e) => return Err(classify_pdf_error(e)),
317    };
318
319    let mut out = Extracted::new(text, Format::Pdf);
320
321    // Page count is best-effort; derive it from the parsed document. A parse
322    // failure OR an internal panic here is non-fatal — the text already
323    // succeeded — so a contained panic (outer `Err`) and a load failure (inner
324    // `Err`) are both silently skipped.
325    if let Ok(Ok(doc)) = guard_pdf_panic(|| pdf_extract::Document::load_mem(&bytes)) {
326        out.put_num("pages", doc.get_pages().len() as u64);
327    }
328
329    Ok(out)
330}
331
332/// Run a panic-prone `pdf-extract`/`lopdf` call, converting an internal unwind
333/// into a typed [`ExtractError::Parse`] tagged `pdf` so the module's "never
334/// panics" contract holds on adversarial PDFs. `AssertUnwindSafe` is sound: the
335/// closure borrows only `&[u8]`, and on a caught unwind we discard any partial
336/// state and return an owned error. The default panic hook still writes the
337/// panic line to stderr — library code must not mutate the process-global hook.
338fn guard_pdf_panic<T>(f: impl FnOnce() -> T) -> Result<T> {
339    catch_unwind(AssertUnwindSafe(f)).map_err(|_| ExtractError::Parse {
340        format: "pdf",
341        message: "pdf parser aborted on malformed input".to_string(),
342    })
343}
344
345/// Map a `pdf-extract` error onto the right [`ExtractError`] variant.
346/// Decryption failures become [`ExtractError::Encrypted`]; everything else is a
347/// [`ExtractError::Parse`] tagged `pdf`.
348fn classify_pdf_error(err: pdf_extract::OutputError) -> ExtractError {
349    let msg = err.to_string();
350    let lower = msg.to_ascii_lowercase();
351    if lower.contains("password") || lower.contains("decrypt") || lower.contains("encrypt") {
352        ExtractError::Encrypted(msg)
353    } else {
354        ExtractError::Parse {
355            format: "pdf",
356            message: msg,
357        }
358    }
359}
360
361// ─────────────────────────────────────────────────────────────────────────────
362// DOCX — zip + quick-xml (no docx-rs dependency; quick-xml is already needed
363// for epub, so docx, xlsx-via-calamine, and epub share one XML/zip surface)
364// ─────────────────────────────────────────────────────────────────────────────
365
366/// Extract a `.docx` (WordprocessingML) by unzipping `word/document.xml` and
367/// concatenating the `<w:t>` run text, one logical line per `<w:p>` paragraph.
368///
369/// `<w:tab/>` becomes a tab and `<w:br/>` / `<w:cr>` a newline so table-ish and
370/// line-broken content keeps its shape; everything else is structural and
371/// ignored. This is the same minimal-but-faithful path `docx-rs` takes for text
372/// extraction, without pulling in a second XML/zip stack.
373fn extract_docx(path: &Path) -> Result<Extracted> {
374    let file = std::fs::File::open(path)?;
375    let mut archive = open_zip(file, "docx")?;
376
377    let xml = read_zip_entry(&mut archive, "word/document.xml", "docx")?;
378    let text = wordprocessing_text(&xml, "docx")?;
379
380    Ok(Extracted::new(text, Format::Docx))
381}
382
383/// Pull paragraph text out of a WordprocessingML / DrawingML XML body.
384///
385/// Shared by [`extract_docx`]. Walks the event stream collecting `<w:t>` text;
386/// `<w:p>` ends a line, `<w:tab/>` is a tab, `<w:br>`/`<w:cr>` a newline.
387fn wordprocessing_text(xml: &str, format: &'static str) -> Result<String> {
388    use quick_xml::events::Event;
389    use quick_xml::reader::Reader;
390
391    let mut reader = Reader::from_str(xml);
392    let mut buf = Vec::new();
393    let mut out = String::new();
394    let mut in_text_run = false;
395
396    loop {
397        match reader.read_event_into(&mut buf) {
398            Ok(Event::Start(e)) => {
399                if local_name(e.name().as_ref()) == b"t" {
400                    in_text_run = true;
401                }
402            }
403            Ok(Event::End(e)) => {
404                let name = e.name();
405                match local_name(name.as_ref()) {
406                    b"t" => in_text_run = false,
407                    b"p" => out.push('\n'),
408                    _ => {}
409                }
410            }
411            Ok(Event::Empty(e)) => {
412                // Self-closing run-level breaks inside a paragraph.
413                match local_name(e.name().as_ref()) {
414                    b"tab" => out.push('\t'),
415                    b"br" | b"cr" => out.push('\n'),
416                    _ => {}
417                }
418            }
419            // quick-xml 0.40 surfaces text verbatim in `Event::Text` but routes
420            // every entity reference to a separate `Event::GeneralRef` and CDATA
421            // to `Event::CData` — all three carry run content.
422            Ok(Event::Text(t)) => {
423                if in_text_run {
424                    out.push_str(&String::from_utf8_lossy(&t.into_inner()));
425                }
426            }
427            // `Smith &amp; Co` arrives as Text("Smith ") + GeneralRef("amp") +
428            // Text(" Co"); resolve the ref so `&`/`<`/`>`/numeric chars survive.
429            Ok(Event::GeneralRef(r)) => {
430                if in_text_run {
431                    out.push_str(&resolve_entity_ref(&r));
432                }
433            }
434            // CDATA inside a `<w:t>` run is valid WordprocessingML; its payload
435            // is literal text and must be appended like `Event::Text`.
436            Ok(Event::CData(c)) => {
437                if in_text_run {
438                    out.push_str(&String::from_utf8_lossy(&c.into_inner()));
439                }
440            }
441            Ok(Event::Eof) => break,
442            Err(e) => {
443                return Err(ExtractError::Parse {
444                    format,
445                    message: format!("malformed XML: {e}"),
446                });
447            }
448            _ => {}
449        }
450        buf.clear();
451    }
452
453    Ok(out)
454}
455
/// Strip the namespace prefix from an XML qualified name, returning the part
/// after the last `:` — `w:t` → `t`. A name without a colon is returned whole
/// (`t` → `t`).
///
/// docx/epub producers pick their own prefixes (`w:`, `dc:`, …); comparing
/// local names keeps element matching independent of that choice.
fn local_name(qname: &[u8]) -> &[u8] {
    // Splitting from the right always yields at least one piece: the bytes
    // after the last colon, or the whole name when there is no colon.
    qname.rsplit(|&byte| byte == b':').next().unwrap_or(qname)
}
465
466/// Resolve a `quick_xml` general-entity / character reference to its literal
467/// text. quick-xml 0.40 does NOT inline-resolve entity references inside
468/// `Event::Text`; instead it surfaces each `&name;` / `&#nnn;` as a separate
469/// `Event::GeneralRef`. Routing those to a `_ => {}` arm silently drops `&`,
470/// `<`, `>`, numeric refs, etc. from extracted text — corrupting any title,
471/// company name, or amount that contains them. This resolves the five
472/// XML-predefined named entities and any numeric character reference; an
473/// unknown named entity falls back to its bare name (best-effort, never a
474/// panic), matching the "recover what we can" stance of `sources/` extraction.
475fn resolve_entity_ref(reference: &quick_xml::events::BytesRef<'_>) -> String {
476    // Numeric character reference (`&#8212;`, `&#x2014;`): resolve to the char.
477    if let Ok(Some(ch)) = reference.resolve_char_ref() {
478        return ch.to_string();
479    }
480    // Named entity: map the five XML-predefined names; fall back to the bare
481    // name for anything else (custom DTD entities are out of scope here).
482    match reference.decode().as_deref() {
483        Ok("amp") => "&".to_string(),
484        Ok("lt") => "<".to_string(),
485        Ok("gt") => ">".to_string(),
486        Ok("quot") => "\"".to_string(),
487        Ok("apos") => "'".to_string(),
488        Ok(other) => other.to_string(),
489        Err(_) => String::new(),
490    }
491}
492
493// ─────────────────────────────────────────────────────────────────────────────
494// Spreadsheet — calamine (xlsx / xlsm / xlsb / ods)
495// ─────────────────────────────────────────────────────────────────────────────
496
/// Ceiling on a single sheet's dense cell grid (`rows × cols`).
///
/// Why a cap is needed: `calamine` materializes a worksheet as a DENSE
/// `Vec<Data>` sized from the MIN/MAX cell positions (`Range::from_sparse`),
/// so two cells at `A1` and `XFD1048576` in a few-hundred-byte file force a
/// ~1.7e10-element (~400 GB) allocation that **aborts** the process —
/// bypassing the docx/epub zip-entry cap and the PDF panic guard (an
/// allocation failure aborts, it does not unwind, so `catch_unwind` cannot
/// contain it). `sources/` is untrusted input, so we bound the read the same
/// way docx/epub do: refuse before the allocation.
///
/// Why this value: 50M cells is ~1.2 GB worst-case dense (`Data` ≈ 24 bytes) —
/// far above any real spreadsheet's used range, far below the weaponizable
/// extreme. A sheet is refused only when its cell count is strictly greater
/// than this cap.
const MAX_SPREADSHEET_CELLS: u64 = 50_000_000;
509
510/// Extract every sheet of a spreadsheet via `calamine`, rendering each row as
511/// tab-separated cells, one row per line, sheets in workbook order separated by
512/// a blank line.
513///
514/// Cell rendering: text verbatim; integers and whole-valued floats without a
515/// trailing `.0` (`1200`, not `1200.0`); other floats via their default
516/// formatting; booleans as `TRUE`/`FALSE`; empty/error cells as the empty
517/// string. Metadata carries the sheet count and the joined sheet-name list.
518///
519/// Before materializing each sheet, [`spreadsheet_dense_cells`] bounds the
520/// would-be dense grid against [`MAX_SPREADSHEET_CELLS`] and returns a typed
521/// [`ExtractError::Parse`] refusal rather than letting an attacker-supplied
522/// sheet OOM/abort the process — upholding the module's "never panics on
523/// untrusted `sources/` input" contract for the spreadsheet adapter.
524fn extract_spreadsheet(path: &Path) -> Result<Extracted> {
525    use calamine::{open_workbook_auto, Reader};
526
527    let mut workbook = open_workbook_auto(path).map_err(|e| ExtractError::Parse {
528        format: "spreadsheet",
529        message: e.to_string(),
530    })?;
531
532    let sheet_names = workbook.sheet_names().to_vec();
533    let mut text = String::new();
534
535    for (idx, name) in sheet_names.iter().enumerate() {
536        if idx > 0 {
537            text.push('\n'); // blank line between sheets
538        }
539
540        // Bound the dense grid BEFORE calamine allocates it. For the zip-XML /
541        // record backends that expose a sparse cell iterator (xlsx-family,
542        // xlsb) this never densely allocates; over-cap sheets refuse cleanly.
543        if let Some(cells) = spreadsheet_dense_cells(&mut workbook, name)? {
544            if cells > MAX_SPREADSHEET_CELLS {
545                return Err(ExtractError::Parse {
546                    format: "spreadsheet",
547                    message: format!(
548                        "sheet {name:?} declares a {cells}-cell grid, over the \
549                         {MAX_SPREADSHEET_CELLS}-cell cap (malformed or hostile spreadsheet)"
550                    ),
551                });
552            }
553        }
554
555        let range = workbook
556            .worksheet_range(name)
557            .map_err(|e| ExtractError::Parse {
558                format: "spreadsheet",
559                message: format!("sheet {name:?}: {e}"),
560            })?;
561
562        for row in range.rows() {
563            let cells: Vec<String> = row.iter().map(render_cell).collect();
564            text.push_str(&cells.join("\t"));
565            text.push('\n');
566        }
567    }
568
569    let mut out = Extracted::new(text, Format::Spreadsheet);
570    out.put_num("sheets", sheet_names.len() as u64);
571    if !sheet_names.is_empty() {
572        out.put_str("sheet_names", sheet_names.join(", "));
573    }
574    Ok(out)
575}
576
577/// Compute the would-be dense cell count (`rows × cols`) of one sheet WITHOUT
578/// the dense allocation, by streaming the sheet's sparse cells and tracking the
579/// MIN/MAX non-empty position — exactly the bounds `Range::from_sparse` uses.
580///
581/// Returns `Some(rows * cols)` for the formats that expose a sparse cell
582/// iterator (`.xlsx`/`.xlsm`/`.xlsb`/`.xlam`), which are the realistic
583/// decompression/dimension-bomb vectors (an OOXML/record sheet can place two
584/// cells 1e10 apart in a few hundred bytes). Returns `None` for `.xls` (BIFF,
585/// format-bounded to ≤ 65 536 × 256 ≈ 1.7e7 cells) and `.ods`, neither of which
586/// exposes a sparse iterator on the auto-detected reader; those fall through to
587/// the normal materialization path. A row/col delta is saturated into `u64` so
588/// the multiply cannot overflow.
589fn spreadsheet_dense_cells(
590    workbook: &mut calamine::Sheets<std::io::BufReader<std::fs::File>>,
591    name: &str,
592) -> Result<Option<u64>> {
593    use calamine::{DataRef, Sheets};
594
595    // Stream cells, tracking the non-empty MIN/MAX extent that `from_sparse`
596    // would allocate. Empty cells are excluded (calamine drops them before
597    // computing the dense bounds), matching the dense grid exactly.
598    fn extent<E: std::fmt::Display>(
599        mut next: impl FnMut() -> std::result::Result<Option<((u32, u32), bool)>, E>,
600    ) -> Result<Option<u64>> {
601        let (mut r0, mut r1, mut c0, mut c1) = (u32::MAX, 0u32, u32::MAX, 0u32);
602        let mut any = false;
603        loop {
604            match next() {
605                Ok(Some(((r, c), is_empty))) => {
606                    if is_empty {
607                        continue;
608                    }
609                    any = true;
610                    r0 = r0.min(r);
611                    r1 = r1.max(r);
612                    c0 = c0.min(c);
613                    c1 = c1.max(c);
614                }
615                Ok(None) => break,
616                Err(e) => {
617                    return Err(ExtractError::Parse {
618                        format: "spreadsheet",
619                        message: format!("scanning sheet dimensions: {e}"),
620                    })
621                }
622            }
623        }
624        if !any {
625            return Ok(Some(0));
626        }
627        let rows = u64::from(r1 - r0) + 1;
628        let cols = u64::from(c1 - c0) + 1;
629        Ok(Some(rows.saturating_mul(cols)))
630    }
631
632    match workbook {
633        Sheets::Xlsx(xlsx) => {
634            let mut reader =
635                xlsx.worksheet_cells_reader(name)
636                    .map_err(|e| ExtractError::Parse {
637                        format: "spreadsheet",
638                        message: format!("sheet {name:?}: {e}"),
639                    })?;
640            extent(|| {
641                reader.next_cell().map(|opt| {
642                    opt.map(|c| (c.get_position(), matches!(c.get_value(), DataRef::Empty)))
643                })
644            })
645        }
646        Sheets::Xlsb(xlsb) => {
647            let mut reader =
648                xlsb.worksheet_cells_reader(name)
649                    .map_err(|e| ExtractError::Parse {
650                        format: "spreadsheet",
651                        message: format!("sheet {name:?}: {e}"),
652                    })?;
653            extent(|| {
654                reader.next_cell().map(|opt| {
655                    opt.map(|c| (c.get_position(), matches!(c.get_value(), DataRef::Empty)))
656                })
657            })
658        }
659        // `.xls` (BIFF, format-bounded) and `.ods` expose no sparse iterator on
660        // the auto reader; let them materialize normally.
661        Sheets::Xls(_) | Sheets::Ods(_) => Ok(None),
662    }
663}
664
665/// Render one spreadsheet cell to its text form. Whole-valued floats drop the
666/// `.0` (so `3450.0` → `3450`), matching how spreadsheet apps display an
667/// integer-typed amount.
668fn render_cell(cell: &calamine::Data) -> String {
669    use calamine::Data;
670    match cell {
671        Data::Empty => String::new(),
672        Data::String(s) => s.clone(),
673        Data::Int(i) => i.to_string(),
674        Data::Float(f) => {
675            if f.fract() == 0.0 && f.is_finite() && f.abs() < 1e15 {
676                format!("{}", *f as i64)
677            } else {
678                f.to_string()
679            }
680        }
681        Data::Bool(b) => {
682            if *b {
683                "TRUE".to_string()
684            } else {
685                "FALSE".to_string()
686            }
687        }
688        // A date/datetime cell is an Excel SERIAL number (days since the 1900
689        // epoch, fractional part = time of day). `ExcelDateTime`'s `Display`
690        // writes the raw serial (`46188`, `46143.5`), which is meaningless to an
691        // agent filing the value into a record, so render the calendar date
692        // instead. `to_ymd_hms_milli` is available without the `chrono` feature.
693        Data::DateTime(dt) => render_excel_datetime(dt),
694        Data::DateTimeIso(s) => s.clone(),
695        Data::DurationIso(s) => s.clone(),
696        Data::Error(e) => format!("{e:?}"),
697    }
698}
699
700/// Render an Excel serial date/datetime to an ISO calendar string. A pure date
701/// (midnight, no sub-day component) renders `YYYY-MM-DD`; a datetime with a time
702/// component renders `YYYY-MM-DD HH:MM:SS`. A duration (Excel `[hh]:mm:ss`
703/// elapsed-time format) is not a calendar date, so it keeps its raw serial form
704/// (the prior behavior) rather than being misrendered as a date.
705fn render_excel_datetime(dt: &calamine::ExcelDateTime) -> String {
706    if dt.is_duration() {
707        // Elapsed-time value, not a point on the calendar — leave as the serial.
708        return dt.as_f64().to_string();
709    }
710    let (y, mo, d, h, mi, s, _ms) = dt.to_ymd_hms_milli();
711    if h == 0 && mi == 0 && s == 0 {
712        format!("{y:04}-{mo:02}-{d:02}")
713    } else {
714        format!("{y:04}-{mo:02}-{d:02} {h:02}:{mi:02}:{s:02}")
715    }
716}
717
718// ─────────────────────────────────────────────────────────────────────────────
719// EPUB — zip + quick-xml (spine order) + html2text (per-chapter)
720// ─────────────────────────────────────────────────────────────────────────────
721//
722// We do NOT use the `epub` crate: it is GPL-3.0, which violates the toolkit's
723// permissive-only license rule. An EPUB is a zip whose OPF package declares a
724// reading-order `spine`; each spine item is an XHTML document. zip + quick-xml
725// (already dependencies) read the container/OPF, and html2text (already a
726// dependency for `.html`) flattens each chapter. Same machinery, no GPL.
727
728/// Extract an EPUB's reading-order text:
729/// 1. read `META-INF/container.xml` → the OPF package path;
730/// 2. parse the OPF `manifest` (id→href) and `spine` (ordered idref list);
731/// 3. for each spine item, read its XHTML and flatten it with [`html_to_text`];
732/// 4. join chapters with a blank line.
733///
734/// Metadata carries `title` (the OPF `dc:title`) and `chapters` (spine length).
735fn extract_epub(path: &Path) -> Result<Extracted> {
736    let file = std::fs::File::open(path)?;
737    let mut archive = open_zip(file, "epub")?;
738
739    // 1. container.xml → OPF path.
740    let container = read_zip_entry(&mut archive, "META-INF/container.xml", "epub")?;
741    let opf_path = epub_opf_path(&container)?;
742
743    // 2. OPF → base dir, manifest, spine, title.
744    let opf = read_zip_entry(&mut archive, &opf_path, "epub")?;
745    let parsed = parse_opf(&opf)?;
746    let base = opf_base_dir(&opf_path);
747
748    // 3. Spine items in order → flattened chapter text.
749    let mut text = String::new();
750    let mut chapters = 0u64;
751    for idref in &parsed.spine {
752        let Some(href) = parsed.manifest.get(idref) else {
753            continue; // dangling spine ref; skip rather than fail
754        };
755        let entry = join_zip_path(&base, href);
756        // A missing spine target is skipped (best-effort), not fatal.
757        let Ok(chapter_xhtml) = read_zip_entry(&mut archive, &entry, "epub") else {
758            continue;
759        };
760        let chapter_text = html_to_text(chapter_xhtml.as_bytes())?;
761        if !chapter_text.trim().is_empty() {
762            if chapters > 0 {
763                text.push('\n');
764            }
765            text.push_str(&chapter_text);
766            text.push('\n');
767            chapters += 1;
768        }
769    }
770
771    let mut out = Extracted::new(text, Format::Epub);
772    out.put_num("chapters", chapters);
773    if let Some(title) = parsed.title {
774        out.put_str("title", title);
775    }
776    Ok(out)
777}
778
779/// The full-path of the OPF package file, read from `META-INF/container.xml`'s
780/// first `<rootfile full-path="…">`.
781fn epub_opf_path(container_xml: &str) -> Result<String> {
782    use quick_xml::events::Event;
783    use quick_xml::reader::Reader;
784
785    let mut reader = Reader::from_str(container_xml);
786    let mut buf = Vec::new();
787    loop {
788        match reader.read_event_into(&mut buf) {
789            Ok(Event::Start(e)) | Ok(Event::Empty(e)) => {
790                if local_name(e.name().as_ref()) == b"rootfile" {
791                    if let Some(p) = attr_value(&e, b"full-path") {
792                        return Ok(p);
793                    }
794                }
795            }
796            Ok(Event::Eof) => break,
797            Err(e) => {
798                return Err(ExtractError::Parse {
799                    format: "epub",
800                    message: format!("container.xml: {e}"),
801                })
802            }
803            _ => {}
804        }
805        buf.clear();
806    }
807    Err(ExtractError::Parse {
808        format: "epub",
809        message: "container.xml has no <rootfile full-path>".to_string(),
810    })
811}
812
/// The subset of an OPF package document needed to produce reading-order text.
/// Built by [`parse_opf`] and consumed by [`extract_epub`].
struct OpfParsed {
    /// Manifest: item `id` → `href`. The href is stored exactly as written in
    /// the OPF (relative to the OPF's own directory, possibly percent-encoded).
    manifest: BTreeMap<String, String>,
    /// Spine: manifest item ids in document order — the reading order.
    spine: Vec<String>,
    /// The package title (`dc:title`), when a non-empty one was found.
    title: Option<String>,
}
822
823/// Parse an OPF package document into its manifest, spine, and title.
824fn parse_opf(opf_xml: &str) -> Result<OpfParsed> {
825    use quick_xml::events::Event;
826    use quick_xml::reader::Reader;
827
828    let mut reader = Reader::from_str(opf_xml);
829    let mut buf = Vec::new();
830
831    let mut manifest = BTreeMap::new();
832    let mut spine = Vec::new();
833    let mut title: Option<String> = None;
834    // Whether we are inside the FIRST `<dc:title>` element, and the text we have
835    // accumulated for it. We accumulate across every Text/GeneralRef/CData event
836    // until the matching End so an entity, comment, or nested element inside the
837    // title does not truncate it.
838    let mut in_title = false;
839    let mut title_buf = String::new();
840
841    loop {
842        match reader.read_event_into(&mut buf) {
843            Ok(Event::Start(e)) => match local_name(e.name().as_ref()) {
844                b"item" => {
845                    if let (Some(id), Some(href)) = (attr_value(&e, b"id"), attr_value(&e, b"href"))
846                    {
847                        manifest.insert(id, href);
848                    }
849                }
850                b"itemref" => {
851                    if let Some(idref) = attr_value(&e, b"idref") {
852                        spine.push(idref);
853                    }
854                }
855                // Only a Start (not a self-closing Empty) opens the title: an
856                // Empty `<dc:title/>` has no content and produces no End event,
857                // so latching `in_title` on it would wrongly capture the next
858                // text node (e.g. the author) as the title.
859                b"title" if title.is_none() => in_title = true,
860                _ => {}
861            },
862            // Self-closing manifest/spine entries are Empty events; the title is
863            // never captured from Empty (see the Start arm's note).
864            Ok(Event::Empty(e)) => match local_name(e.name().as_ref()) {
865                b"item" => {
866                    if let (Some(id), Some(href)) = (attr_value(&e, b"id"), attr_value(&e, b"href"))
867                    {
868                        manifest.insert(id, href);
869                    }
870                }
871                b"itemref" => {
872                    if let Some(idref) = attr_value(&e, b"idref") {
873                        spine.push(idref);
874                    }
875                }
876                _ => {}
877            },
878            Ok(Event::End(e)) => {
879                if in_title && local_name(e.name().as_ref()) == b"title" {
880                    in_title = false;
881                    let s = title_buf.trim();
882                    if !s.is_empty() {
883                        title = Some(s.to_string());
884                    }
885                }
886            }
887            Ok(Event::Text(t)) => {
888                if in_title {
889                    title_buf.push_str(&String::from_utf8_lossy(&t.into_inner()));
890                }
891            }
892            // An entity (`&amp;`) or numeric ref inside the title resolves into
893            // the accumulated value rather than truncating it.
894            Ok(Event::GeneralRef(r)) => {
895                if in_title {
896                    title_buf.push_str(&resolve_entity_ref(&r));
897                }
898            }
899            // CDATA inside `<dc:title>` is literal title text.
900            Ok(Event::CData(c)) => {
901                if in_title {
902                    title_buf.push_str(&String::from_utf8_lossy(&c.into_inner()));
903                }
904            }
905            Ok(Event::Eof) => break,
906            Err(e) => {
907                return Err(ExtractError::Parse {
908                    format: "epub",
909                    message: format!("OPF: {e}"),
910                })
911            }
912            _ => {}
913        }
914        buf.clear();
915    }
916
917    Ok(OpfParsed {
918        manifest,
919        spine,
920        title,
921    })
922}
923
/// Return everything before the last `/` of an OPF path — the directory that
/// manifest hrefs are resolved against inside the zip.
///
/// `"OEBPS/content.opf"` → `"OEBPS"`; a path with no `/` (such as
/// `"content.opf"`) yields the empty string.
fn opf_base_dir(opf_path: &str) -> String {
    opf_path
        .rsplit_once('/')
        .map(|(dir, _file)| dir.to_owned())
        .unwrap_or_default()
}
933
934/// Join an OPF base dir with a (possibly `./`-prefixed) manifest href into a zip
935/// entry name. Forward-slash only — zip paths are always `/`-separated.
936///
937/// OPF manifest hrefs are URLs: the EPUB spec requires reserved characters
938/// (spaces, non-ASCII) to be percent-encoded, but zip entry NAMES are raw. So an
939/// href `my%20chapter.xhtml` must be percent-decoded to `my chapter.xhtml`
940/// before it can match the zip entry, or the chapter is silently dropped. We
941/// percent-decode the href and then normalize `.`/`..` segments so a relative
942/// href like `../text/ch1.xhtml` resolves against the OPF's directory.
943fn join_zip_path(base: &str, href: &str) -> String {
944    let decoded = percent_decode(href);
945    let combined = if base.is_empty() {
946        decoded
947    } else {
948        format!("{base}/{decoded}")
949    };
950    normalize_zip_path(&combined)
951}
952
/// Percent-decode a URL path component (`%20` → space, `%C3%A9` → `é`).
///
/// Decoding happens on raw bytes and the result is then reinterpreted as UTF-8
/// (lossily), so a codepoint spread over several escapes is reassembled. A `%`
/// that is not followed by two hexadecimal digits is copied through unchanged;
/// the function never panics.
fn percent_decode(s: &str) -> String {
    /// Value of one ASCII hexadecimal digit, either case.
    fn hex_nibble(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let mut rest = s.as_bytes();
    let mut decoded: Vec<u8> = Vec::with_capacity(rest.len());
    while let Some((&first, tail)) = rest.split_first() {
        if first == b'%' {
            if let [hi, lo, after @ ..] = tail {
                if let (Some(hi), Some(lo)) = (hex_nibble(*hi), hex_nibble(*lo)) {
                    decoded.push((hi << 4) | lo);
                    rest = after;
                    continue;
                }
            }
        }
        // Ordinary byte, or a `%` that does not start a valid escape.
        decoded.push(first);
        rest = tail;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
976
/// Collapse `.` and `..` segments of a `/`-separated zip path, so that an href
/// like `../text/ch1.xhtml` (relative to the OPF's directory) names the real
/// entry. Empty segments (doubled or leading slashes) are discarded as well.
///
/// A `..` with nothing left to remove is ignored: the archive root has no
/// parent, so the path can never climb out of it.
fn normalize_zip_path(path: &str) -> String {
    let kept = path
        .split('/')
        .fold(Vec::<&str>::new(), |mut stack, segment| {
            if segment == ".." {
                // Popping an empty stack is a no-op, which clamps at the root.
                stack.pop();
            } else if !(segment.is_empty() || segment == ".") {
                stack.push(segment);
            }
            stack
        });
    kept.join("/")
}
994
995// ─────────────────────────────────────────────────────────────────────────────
// HTML — html2text rendered through a decoration-free plain-text decorator
997// ─────────────────────────────────────────────────────────────────────────────
998
999/// Extract plain text from an `.html` file.
1000fn extract_html(path: &Path) -> Result<Extracted> {
1001    let bytes = std::fs::read(path)?;
1002    let text = html_to_text(&bytes)?;
1003    Ok(Extracted::new(text, Format::Html))
1004}
1005
1006/// Flatten an HTML/XHTML byte stream to clean plain text.
1007///
1008/// Renders with [`PlainContentDecorator`] — `html2text`'s plain renderer driven
1009/// by a decorator that emits **no** link brackets and **no** `#` heading
1010/// markers, while keeping list-item markers (`*` / `N.`). This removes the two
1011/// decorations at the source instead of post-stripping them: the previous
1012/// approach blindly deleted every `[bracketed]` substring and every leading `#`
1013/// run from the rendered text, which also destroyed *literal* content —
1014/// citation markers (`[1]`, `[sic]`), code subscripts (`x[i]`), and ranking
1015/// prose (`#1 in sales`). The renderer knows which `[`/`#` it produced; literal
1016/// brackets and hashes in the source now survive untouched.
1017///
1018/// A very wide wrap width (10_000) is used so paragraphs are not hard-wrapped by
1019/// the renderer; paragraph structure comes from the source's block elements, and
1020/// final layout is canonicalized by [`normalize_text`].
1021fn html_to_text(html: &[u8]) -> Result<String> {
1022    html2text::config::with_decorator(PlainContentDecorator)
1023        .string_from_read(html, 10_000)
1024        .map_err(|e| ExtractError::Parse {
1025            format: "html",
1026            message: e.to_string(),
1027        })
1028}
1029
1030/// A `html2text` decorator that flattens HTML to plain text WITHOUT emitting the
1031/// markup that would otherwise have to be post-stripped: no `[`/`]` around link
1032/// text, no `#` heading prefix, no `^{…}` superscript braces. List-item markers
1033/// (`* ` for unordered, `N. ` for ordered) ARE emitted — they are content-
1034/// faithful and match the corpus convention. Quote prefixes are kept as in the
1035/// stock plain decorator. This is the fix for the literal-content corruption the
1036/// old `strip_markdown_decorations`/`unwrap_brackets` post-pass caused.
1037#[derive(Clone, Debug)]
1038struct PlainContentDecorator;
1039
1040impl html2text::render::TextDecorator for PlainContentDecorator {
1041    type Annotation = ();
1042
1043    fn decorate_link_start(&mut self, _url: &str) -> (String, Self::Annotation) {
1044        (String::new(), ())
1045    }
1046    fn decorate_link_end(&mut self) -> String {
1047        String::new()
1048    }
1049    fn decorate_em_start(&self) -> (String, Self::Annotation) {
1050        (String::new(), ())
1051    }
1052    fn decorate_em_end(&self) -> String {
1053        String::new()
1054    }
1055    fn decorate_strong_start(&self) -> (String, Self::Annotation) {
1056        (String::new(), ())
1057    }
1058    fn decorate_strong_end(&self) -> String {
1059        String::new()
1060    }
1061    fn decorate_strikeout_start(&self) -> (String, Self::Annotation) {
1062        (String::new(), ())
1063    }
1064    fn decorate_strikeout_end(&self) -> String {
1065        String::new()
1066    }
1067    fn decorate_code_start(&self) -> (String, Self::Annotation) {
1068        (String::new(), ())
1069    }
1070    fn decorate_code_end(&self) -> String {
1071        String::new()
1072    }
1073    fn decorate_preformat_first(&self) -> Self::Annotation {}
1074    fn decorate_preformat_cont(&self) -> Self::Annotation {}
1075    fn decorate_image(&mut self, _src: &str, title: &str) -> (String, Self::Annotation) {
1076        // Alt/title text only — no surrounding brackets (the stock plain
1077        // decorator wraps it in `[...]`, which would read as literal content).
1078        (title.to_string(), ())
1079    }
1080    fn header_prefix(&self, _level: usize) -> String {
1081        // No `#` heading marker — heading text reads as plain prose.
1082        String::new()
1083    }
1084    fn quote_prefix(&self) -> String {
1085        "> ".to_string()
1086    }
1087    fn unordered_item_prefix(&self) -> String {
1088        "* ".to_string()
1089    }
1090    fn ordered_item_prefix(&self, i: i64) -> String {
1091        format!("{i}. ")
1092    }
1093    fn decorate_superscript_start(&self) -> (String, Self::Annotation) {
1094        // Plain text: no `^{…}` braces (which would corrupt literal content).
1095        (String::new(), ())
1096    }
1097    fn decorate_superscript_end(&self) -> String {
1098        String::new()
1099    }
1100    fn make_subblock_decorator(&self) -> Self {
1101        PlainContentDecorator
1102    }
1103}
1104
1105/// Strip the residual markdown decorations `html2text`'s plain renderer emits:
1106/// leading run of `#` (ATX heading markers) at the start of a line, and `[...]`
1107/// brackets around link/anchor text (the reference-style `[n]` suffix is already
1108/// gone under `plain_no_decorate`). Bullet (`*`) and ordered (`N.`) markers are
1109/// left intact — they are content, not decoration.
1110///
1111/// No longer used by [`html_to_text`] (the [`PlainContentDecorator`] now removes
1112/// these decorations at the source so literal `[brackets]`/`#hashes` survive);
1113/// retained only for its unit test documenting the old renderer's behavior.
1114#[allow(dead_code)]
1115fn strip_markdown_decorations(text: &str) -> String {
1116    let mut out = String::with_capacity(text.len());
1117    for line in text.lines() {
1118        // Strip a leading "#"-run + the single space after it (ATX heading).
1119        let trimmed = line.trim_start();
1120        let after_hashes = trimmed.trim_start_matches('#');
1121        let line = if after_hashes.len() != trimmed.len() {
1122            // It was a heading line: keep indentation-free heading text.
1123            after_hashes.trim_start()
1124        } else {
1125            line
1126        };
1127        out.push_str(&unwrap_brackets(line));
1128        out.push('\n');
1129    }
1130    out
1131}
1132
/// Replace each `[inner]` with `inner` in a single left-to-right pass, without
/// nesting: a `[` is paired with the first `]` that follows it. `html2text`'s
/// stock plain renderer wraps link/anchor text in single brackets, so
/// unwrapping yields the bare text.
///
/// A `[` with no `]` after it is kept together with the rest of the line, and a
/// `]` that is not closing a bracket is copied through.
///
/// No longer used by [`html_to_text`] (see [`strip_markdown_decorations`]);
/// retained only for its unit test — hence the `dead_code` allowance.
#[allow(dead_code)]
fn unwrap_brackets(line: &str) -> String {
    let mut flattened = String::with_capacity(line.len());
    let mut rest = line;
    while let Some(open) = rest.find('[') {
        flattened.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];
        match after_open.find(']') {
            Some(close) => {
                // Matched pair: keep the contents, drop both brackets.
                flattened.push_str(&after_open[..close]);
                rest = &after_open[close + 1..];
            }
            None => {
                // Unmatched `[`: emit it and the remainder verbatim.
                flattened.push('[');
                flattened.push_str(after_open);
                rest = "";
            }
        }
    }
    flattened.push_str(rest);
    flattened
}
1170
1171// ─────────────────────────────────────────────────────────────────────────────
1172// Shared zip helpers (docx + epub)
1173// ─────────────────────────────────────────────────────────────────────────────
1174
1175/// Open a zip archive from a reader, mapping any failure to a typed
1176/// [`ExtractError::Parse`] tagged with the calling format.
1177fn open_zip<R: Read + std::io::Seek>(
1178    reader: R,
1179    format: &'static str,
1180) -> Result<zip::ZipArchive<R>> {
1181    zip::ZipArchive::new(reader).map_err(|e| ExtractError::Parse {
1182        format,
1183        message: format!("not a valid zip container: {e}"),
1184    })
1185}
1186
/// Upper bound, in bytes, on a single decompressed zip entry (256 MiB).
///
/// docx/epub members are XML text — a member that inflates past this ceiling is
/// a decompression bomb or corruption, not real evidence. `sources/` is
/// untrusted input, so the read is bounded rather than letting `read_to_end`
/// follow a hostile DEFLATE stream until the process runs out of memory.
const MAX_ZIP_ENTRY_BYTES: u64 = 256 * 1024 * 1024;
1192
1193/// Read a single zip entry to a UTF-8 string, bounded by [`MAX_ZIP_ENTRY_BYTES`]
1194/// so a zip-bomb member cannot exhaust memory. A missing entry, an over-cap
1195/// entry, or a read failure is a typed [`ExtractError::Parse`]; invalid UTF-8 is
1196/// lossily decoded (OOXML / XHTML are declared UTF-8, but we never panic on a
1197/// stray byte).
1198fn read_zip_entry<R: Read + std::io::Seek>(
1199    archive: &mut zip::ZipArchive<R>,
1200    name: &str,
1201    format: &'static str,
1202) -> Result<String> {
1203    let entry = archive.by_name(name).map_err(|e| ExtractError::Parse {
1204        format,
1205        message: format!("missing zip entry {name:?}: {e}"),
1206    })?;
1207    // Reject up front when the central directory declares an over-cap size...
1208    let declared = entry.size();
1209    if declared > MAX_ZIP_ENTRY_BYTES {
1210        return Err(ExtractError::Parse {
1211            format,
1212            message: format!(
1213                "zip entry {name:?} declares {declared} bytes, over the {MAX_ZIP_ENTRY_BYTES}-byte cap"
1214            ),
1215        });
1216    }
1217    // ...and bound the actual decompressed read so a lying header (a bomb that
1218    // understates its uncompressed size) still cannot allocate past the cap.
1219    let mut bytes = Vec::new();
1220    entry
1221        .take(MAX_ZIP_ENTRY_BYTES + 1)
1222        .read_to_end(&mut bytes)
1223        .map_err(|e| ExtractError::Parse {
1224            format,
1225            message: format!("reading {name:?}: {e}"),
1226        })?;
1227    if bytes.len() as u64 > MAX_ZIP_ENTRY_BYTES {
1228        return Err(ExtractError::Parse {
1229            format,
1230            message: format!(
1231                "zip entry {name:?} exceeds the {MAX_ZIP_ENTRY_BYTES}-byte cap (decompression bomb?)"
1232            ),
1233        });
1234    }
1235    Ok(String::from_utf8_lossy(&bytes).into_owned())
1236}
1237
1238/// Look up a start/empty element's attribute value by local name, returning it
1239/// unescaped as an owned `String`. Prefix-agnostic on the attribute key.
1240fn attr_value(elem: &quick_xml::events::BytesStart<'_>, key: &[u8]) -> Option<String> {
1241    elem.attributes().flatten().find_map(|attr| {
1242        if local_name(attr.key.as_ref()) == key {
1243            // `unescape_value` returns an XML-unescaped `Cow<str>` — exactly the
1244            // owned attribute text we want. It is soft-deprecated in quick-xml
1245            // 0.40 in favor of `normalized_value(XmlVersion)`, whose extra
1246            // version arg and byte-Cow return buy us nothing here; the simple
1247            // form is correct for the UTF-8 OOXML/OPF attributes we read.
1248            #[allow(deprecated)]
1249            attr.unescape_value().ok().map(|cow| cow.into_owned())
1250        } else {
1251            None
1252        }
1253    })
1254}
1255
1256#[cfg(test)]
1257mod tests {
1258    use super::*;
1259    use std::path::PathBuf;
1260
1261    /// Absolute path to a corpus-c-formats fixture under `sources/docs/`.
1262    fn fixture(name: &str) -> PathBuf {
1263        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
1264            .join("../../tests/corpora/corpus-c-formats/sources/docs")
1265            .join(name)
1266    }
1267
1268    /// Read the known-good `.txt` sibling of a fixture.
1269    fn expected(name: &str) -> String {
1270        std::fs::read_to_string(fixture(&format!("{name}.txt"))).unwrap()
1271    }
1272
1273    /// Token-level normalization: collapse every run of whitespace (incl.
1274    /// newlines) to one space and trim. This is the corpus's recommended,
1275    /// layout-agnostic comparison ("same words, same order").
1276    fn tokens(s: &str) -> String {
1277        s.split_whitespace().collect::<Vec<_>>().join(" ")
1278    }
1279
1280    /// The sorted set of non-blank, token-normalized lines — order-agnostic
1281    /// content comparison (used where extractor reading-order legitimately
1282    /// differs, e.g. multi-column PDF).
1283    fn line_set(s: &str) -> Vec<String> {
1284        let mut v: Vec<String> = s.lines().map(tokens).filter(|l| !l.is_empty()).collect();
1285        v.sort();
1286        v
1287    }
1288
1289    // ── format detection ────────────────────────────────────────────────────
1290
1291    #[test]
1292    fn detects_format_by_extension_case_insensitively() {
1293        assert_eq!(Format::from_path(Path::new("a.pdf")), Some(Format::Pdf));
1294        assert_eq!(Format::from_path(Path::new("a.PDF")), Some(Format::Pdf));
1295        assert_eq!(Format::from_path(Path::new("a.docx")), Some(Format::Docx));
1296        assert_eq!(
1297            Format::from_path(Path::new("a.xlsx")),
1298            Some(Format::Spreadsheet)
1299        );
1300        assert_eq!(
1301            Format::from_path(Path::new("a.ods")),
1302            Some(Format::Spreadsheet)
1303        );
1304        assert_eq!(Format::from_path(Path::new("a.epub")), Some(Format::Epub));
1305        assert_eq!(Format::from_path(Path::new("a.html")), Some(Format::Html));
1306        assert_eq!(Format::from_path(Path::new("a.htm")), Some(Format::Html));
1307        assert_eq!(Format::from_path(Path::new("a.txt")), None);
1308        assert_eq!(Format::from_path(Path::new("noext")), None);
1309    }
1310
1311    #[test]
1312    fn unsupported_extension_is_typed_error() {
1313        let err = extract(Path::new("/tmp/whatever.txt")).unwrap_err();
1314        assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e == "txt"));
1315        assert_eq!(err.code(), "UNSUPPORTED_FORMAT");
1316    }
1317
1318    #[test]
1319    fn missing_extension_is_unsupported() {
1320        let err = extract(Path::new("/tmp/noext")).unwrap_err();
1321        assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e.is_empty()));
1322    }
1323
1324    // ── normalization ─────────────────────────────────────────────────────────
1325
1326    #[test]
1327    fn normalize_collapses_blanks_and_trims() {
1328        let raw = "\r\n\r\nHeading\r\n\r\n\r\n\r\nBody line   \r\n\r\n";
1329        assert_eq!(normalize_text(raw), "Heading\n\nBody line\n");
1330    }
1331
1332    #[test]
1333    fn normalize_empty_stays_empty() {
1334        assert_eq!(normalize_text(""), "");
1335        assert_eq!(normalize_text("   \n\n  \n"), "");
1336    }
1337
1338    // ── per-format extraction against corpus-c fixtures ───────────────────────
1339
1340    #[test]
1341    fn extract_text_pdf_matches_known_good() {
1342        let got = extract(&fixture("text.pdf")).unwrap();
1343        assert_eq!(got.metadata["format"], MetaValue::Str("pdf".into()));
1344        assert_eq!(got.metadata["pages"], MetaValue::Num(1));
1345        assert_eq!(tokens(&got.text), tokens(&expected("text.pdf")));
1346    }
1347
1348    #[test]
1349    fn extract_weird_fonts_pdf_matches_known_good() {
1350        let got = extract(&fixture("weird-fonts.pdf")).unwrap();
1351        assert_eq!(tokens(&got.text), tokens(&expected("weird-fonts.pdf")));
1352    }
1353
1354    #[test]
1355    fn extract_multi_column_pdf_matches_content_order_agnostic() {
1356        // pdf-extract reads column-by-column; the known-good `.txt` captures the
1357        // interleaved (pdftotext) order. Both carry identical content — assert
1358        // the line SET, not the order. (README § multi-column.)
1359        let got = extract(&fixture("multi-column.pdf")).unwrap();
1360        assert_eq!(line_set(&got.text), line_set(&expected("multi-column.pdf")));
1361    }
1362
1363    #[test]
1364    fn extract_image_only_pdf_yields_empty() {
1365        // No text layer → empty out, never hallucinated text. OCR out of scope.
1366        let got = extract(&fixture("image-only.pdf")).unwrap();
1367        assert_eq!(got.text, "");
1368        assert!(expected("image-only.pdf").trim().is_empty());
1369    }
1370
1371    #[test]
1372    fn extract_encrypted_pdf_without_password_refuses_cleanly() {
1373        let err = extract(&fixture("encrypted.pdf")).unwrap_err();
1374        assert!(
1375            matches!(err, ExtractError::Encrypted(_)),
1376            "expected Encrypted, got {err:?}"
1377        );
1378        assert_eq!(err.code(), "DOCUMENT_ENCRYPTED");
1379    }
1380
1381    #[test]
1382    fn guard_pdf_panic_contains_unwind_as_parse_error() {
1383        // The "never panics" contract: an internal pdf-extract/lopdf panic must
1384        // surface as a typed ExtractError::Parse, not abort the process. (cargo
1385        // captures the unwind's stderr line for a passing test.)
1386        let contained: Result<()> = guard_pdf_panic(|| panic!("simulated pdf-extract abort"));
1387        assert!(
1388            matches!(contained, Err(ExtractError::Parse { format: "pdf", .. })),
1389            "panic must be contained as a pdf Parse error, got {contained:?}"
1390        );
1391        // The success path is transparent — the value passes straight through.
1392        let ok: Result<u32> = guard_pdf_panic(|| 42);
1393        assert_eq!(ok.unwrap(), 42);
1394    }
1395
1396    #[test]
1397    fn extract_docx_matches_known_good() {
1398        let got = extract(&fixture("sample.docx")).unwrap();
1399        assert_eq!(got.metadata["format"], MetaValue::Str("docx".into()));
1400        assert_eq!(tokens(&got.text), tokens(&expected("sample.docx")));
1401    }
1402
1403    #[test]
1404    fn extract_xlsx_matches_known_good() {
1405        let got = extract(&fixture("sample.xlsx")).unwrap();
1406        assert_eq!(got.metadata["format"], MetaValue::Str("spreadsheet".into()));
1407        assert_eq!(got.metadata["sheets"], MetaValue::Num(1));
1408        assert_eq!(
1409            got.metadata["sheet_names"],
1410            MetaValue::Str("Expenses".into())
1411        );
1412        // Tab-separated, integers without `.0` — exact match (no soft-wrap risk).
1413        assert_eq!(got.text.trim_end(), expected("sample.xlsx").trim_end());
1414    }
1415
1416    #[test]
1417    fn extract_epub_matches_known_good() {
1418        let got = extract(&fixture("sample.epub")).unwrap();
1419        assert_eq!(got.metadata["format"], MetaValue::Str("epub".into()));
1420        assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
1421        assert_eq!(
1422            got.metadata["title"],
1423            MetaValue::Str("Operations Playbook".into())
1424        );
1425        assert_eq!(tokens(&got.text), tokens(&expected("sample.epub")));
1426    }
1427
1428    #[test]
1429    fn extract_html_matches_known_good() {
1430        let got = extract(&fixture("sample.html")).unwrap();
1431        assert_eq!(got.metadata["format"], MetaValue::Str("html".into()));
1432        assert_eq!(tokens(&got.text), tokens(&expected("sample.html")));
1433    }
1434
1435    // ── helper-level unit tests ───────────────────────────────────────────────
1436
1437    #[test]
1438    fn unwrap_brackets_flattens_link_text() {
1439        assert_eq!(
1440            unwrap_brackets("contact [ops@acme.example] or the [handbook]."),
1441            "contact ops@acme.example or the handbook."
1442        );
1443        // Unmatched '[' is preserved.
1444        assert_eq!(unwrap_brackets("a [b c"), "a [b c");
1445        // No brackets → untouched.
1446        assert_eq!(unwrap_brackets("plain text"), "plain text");
1447    }
1448
1449    #[test]
1450    fn strip_markdown_decorations_drops_heading_hashes() {
1451        let input = "# Title\n## Section\n* bullet\n1. ordered\nplain\n";
1452        let out = strip_markdown_decorations(input);
1453        assert_eq!(out, "Title\nSection\n* bullet\n1. ordered\nplain\n");
1454    }
1455
1456    #[test]
1457    fn local_name_strips_prefix() {
1458        assert_eq!(local_name(b"w:t"), b"t");
1459        assert_eq!(local_name(b"t"), b"t");
1460        assert_eq!(local_name(b"dc:title"), b"title");
1461    }
1462
1463    #[test]
1464    fn extracted_serializes_to_text_metadata_json() {
1465        let got = extract(&fixture("sample.xlsx")).unwrap();
1466        let json = serde_json::to_value(&got).unwrap();
1467        assert!(json.get("text").is_some());
1468        assert_eq!(json["metadata"]["format"], "spreadsheet");
1469        assert_eq!(json["metadata"]["sheets"], 1);
1470        // MetaValue::Num serializes as a bare JSON number, Str as a bare string.
1471        assert!(json["metadata"]["sheets"].is_number());
1472        assert!(json["metadata"]["format"].is_string());
1473    }
1474
1475    // ── regression: leading-blank normalization is linear (finding #13) ────────
1476
1477    /// `normalize_text` must trim leading blank lines in O(n), not O(n²). The
1478    /// pre-fix loop used `lines.remove(0)` per blank line — O(n) shift each, so a
1479    /// document dominated by leading blanks took O(n²) and hung extraction.
1480    ///
1481    /// 500_000 leading blank lines is ~2.5e11 element shifts under the old code
1482    /// (minutes-to-hours, effectively a hang) but instant under the index-and-
1483    /// slice path; the test reconstructs the finding's trigger (an adapter output
1484    /// that is mostly leading blanks then one line of text) and asserts the
1485    /// correct, fully-trimmed result. Against the pre-fix code this test does not
1486    /// complete in a reasonable time — encoding the quadratic regression.
1487    #[test]
1488    fn regression_normalize_text_leading_blanks_is_linear() {
1489        let blanks = "\n".repeat(500_000);
1490        let raw = format!("{blanks}only real line\n");
1491        // Leading blanks fully trimmed; single trailing newline; body intact.
1492        assert_eq!(normalize_text(&raw), "only real line\n");
1493
1494        // A wholly-blank giant input still collapses to empty (the other branch).
1495        assert_eq!(normalize_text(&"   \n".repeat(500_000)), "");
1496    }
1497
1498    // ── regression: spreadsheet dense-grid bomb is refused (finding #4) ────────
1499
1500    /// Build a VALID `.xlsx` whose single sheet declares two real cells at the
1501    /// opposite corners of Excel's grid (`A1` and `XFD1048576`). `calamine`
1502    /// materializes a sheet as a DENSE `Vec<Data>` sized from the MIN/MAX cell
1503    /// positions, so this two-cell sheet would force a ~1.7e10-element (~400 GB)
1504    /// allocation and abort the process. We reuse the corpus `sample.xlsx`
1505    /// container verbatim and swap ONLY `xl/worksheets/sheet1.xml`, so every
1506    /// other part (workbook, rels, content-types) is a real, openable workbook.
1507    fn write_dense_bomb_xlsx(dest: &Path) {
1508        use std::io::Write;
1509
1510        let base = std::fs::read(fixture("sample.xlsx")).expect("corpus sample.xlsx exists");
1511        let mut archive =
1512            zip::ZipArchive::new(std::io::Cursor::new(base)).expect("sample.xlsx is a valid zip");
1513
1514        let bomb_sheet = b"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\
1515<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\">\
1516<sheetData>\
1517<row r=\"1\"><c r=\"A1\"><v>1</v></c></row>\
1518<row r=\"1048576\"><c r=\"XFD1048576\"><v>2</v></c></row>\
1519</sheetData></worksheet>";
1520
1521        let out = std::fs::File::create(dest).unwrap();
1522        let mut writer = zip::ZipWriter::new(out);
1523        let opts = zip::write::SimpleFileOptions::default()
1524            .compression_method(zip::CompressionMethod::Stored);
1525
1526        for i in 0..archive.len() {
1527            let entry = archive.by_index(i).unwrap();
1528            let name = entry.name().to_string();
1529            if name == "xl/worksheets/sheet1.xml" {
1530                writer.start_file(name, opts).unwrap();
1531                writer.write_all(bomb_sheet).unwrap();
1532            } else {
1533                // Copy every other entry's already-compressed bytes verbatim.
1534                writer.raw_copy_file(entry).unwrap();
1535            }
1536        }
1537        writer.finish().unwrap();
1538    }
1539
1540    /// A spreadsheet whose declared dense grid exceeds [`MAX_SPREADSHEET_CELLS`]
1541    /// is refused with a typed [`ExtractError::Parse`] BEFORE calamine allocates
1542    /// the dense matrix — never an OOM/abort. Pre-fix, `extract_spreadsheet`
1543    /// called `worksheet_range` directly and the process aborted on the
1544    /// allocation; this test would not return (it would kill the test runner),
1545    /// so it encodes the resource-exhaustion regression.
1546    #[test]
1547    fn regression_spreadsheet_dense_bomb_refused_not_oom() {
1548        let tmp = tempfile::TempDir::new().unwrap();
1549        let bomb = tmp.path().join("invoice.xlsx");
1550        write_dense_bomb_xlsx(&bomb);
1551
1552        // A few-hundred-byte file on disk — the whole point of the bomb.
1553        assert!(
1554            std::fs::metadata(&bomb).unwrap().len() < 10_000,
1555            "the bomb must be tiny on disk; the danger is the in-memory expansion"
1556        );
1557
1558        let err = extract(&bomb).unwrap_err();
1559        assert!(
1560            matches!(
1561                err,
1562                ExtractError::Parse {
1563                    format: "spreadsheet",
1564                    ..
1565                }
1566            ),
1567            "an over-cap dense grid must be a typed spreadsheet Parse refusal, got {err:?}"
1568        );
1569        assert_eq!(err.code(), "EXTRACT_PARSE_ERROR");
1570    }
1571
1572    /// The cap is a guard, not a wall: a normal spreadsheet still extracts. Locks
1573    /// down that the preflight bound does not regress the legitimate path (the
1574    /// corpus `sample.xlsx` is a 3×3 grid, far under the cap).
1575    #[test]
1576    fn regression_spreadsheet_cap_allows_real_workbook() {
1577        let got = extract(&fixture("sample.xlsx")).unwrap();
1578        assert_eq!(got.metadata["sheets"], MetaValue::Num(1));
1579        assert!(!got.text.is_empty());
1580    }
1581
1582    // ── regression: entity-ref / CDATA fidelity (findings #34, #1011) ──────────
1583
1584    /// Build a minimal valid `.docx` whose `word/document.xml` body is the given
1585    /// run XML, written to `dest`. Only the three OOXML members `extract_docx`
1586    /// touches need to be real; the rest of a Word package is optional for text
1587    /// extraction.
1588    fn write_docx(dest: &Path, body_runs: &str) {
1589        use std::io::Write;
1590        let document = format!(
1591            "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\
1592<w:document xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\">\
1593<w:body>{body_runs}</w:body></w:document>"
1594        );
1595        let file = std::fs::File::create(dest).unwrap();
1596        let mut writer = zip::ZipWriter::new(file);
1597        let opts = zip::write::SimpleFileOptions::default()
1598            .compression_method(zip::CompressionMethod::Stored);
1599        writer.start_file("word/document.xml", opts).unwrap();
1600        writer.write_all(document.as_bytes()).unwrap();
1601        writer.finish().unwrap();
1602    }
1603
1604    #[test]
1605    fn regression_docx_resolves_entity_refs() {
1606        // quick-xml 0.40 surfaces `&amp;`/`&lt;`/`&gt;`/`&#8212;` as separate
1607        // GeneralRef events; pre-fix they were routed to `_ => {}` and dropped,
1608        // corrupting `Smith & Co invoice <final> total — 100`.
1609        let tmp = tempfile::TempDir::new().unwrap();
1610        let f = tmp.path().join("entity.docx");
1611        write_docx(
1612            &f,
1613            "<w:p><w:r><w:t>Smith &amp; Co invoice &lt;final&gt; total &#8212; 100</w:t></w:r></w:p>",
1614        );
1615        let got = extract(&f).unwrap();
1616        assert_eq!(got.text, "Smith & Co invoice <final> total — 100\n");
1617    }
1618
1619    #[test]
1620    fn regression_docx_preserves_cdata_run_text() {
1621        // CDATA inside `<w:t>` is valid and literal; pre-fix it fell through the
1622        // wildcard arm and the payload vanished.
1623        let tmp = tempfile::TempDir::new().unwrap();
1624        let f = tmp.path().join("cdata.docx");
1625        write_docx(
1626            &f,
1627            "<w:p><w:r><w:t>Line A.</w:t></w:r></w:p>\
1628<w:p><w:r><w:t><![CDATA[IMPORTANT CDATA CONTENT]]></w:t></w:r></w:p>\
1629<w:p><w:r><w:t>Line C.</w:t></w:r></w:p>",
1630        );
1631        let got = extract(&f).unwrap();
1632        assert_eq!(got.text, "Line A.\nIMPORTANT CDATA CONTENT\nLine C.\n");
1633    }
1634
1635    #[test]
1636    fn resolve_entity_ref_maps_named_and_numeric() {
1637        use quick_xml::events::BytesRef;
1638        let r = |s: &'static str| resolve_entity_ref(&BytesRef::new(s));
1639        assert_eq!(r("amp"), "&");
1640        assert_eq!(r("lt"), "<");
1641        assert_eq!(r("gt"), ">");
1642        assert_eq!(r("quot"), "\"");
1643        assert_eq!(r("apos"), "'");
1644        assert_eq!(r("#8212"), "—");
1645        assert_eq!(r("#x2014"), "—");
1646        // Unknown named entity → bare name (best-effort, never a panic).
1647        assert_eq!(r("nbsp"), "nbsp");
1648    }
1649
1650    // ── regression: EPUB OPF parsing (findings #35, #37, #1012) ────────────────
1651
1652    /// Build a minimal valid EPUB at `dest`. `opf_metadata` is spliced verbatim
1653    /// inside `<metadata>`; `manifest_href` is the chapter item's href; the
1654    /// chapter XHTML is stored under the literal zip entry `chapter_entry`. The
1655    /// mimetype member is written first and stored (per the EPUB OCF spec).
1656    fn write_epub(dest: &Path, opf_metadata: &str, manifest_href: &str, chapter_entry: &str) {
1657        use std::io::Write;
1658        let container = "<?xml version=\"1.0\"?>\
1659<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\
1660<rootfiles><rootfile full-path=\"OEBPS/content.opf\" \
1661media-type=\"application/oebps-package+xml\"/></rootfiles></container>";
1662        let opf = format!(
1663            "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
1664<package xmlns=\"http://www.idpf.org/2007/opf\" version=\"3.0\" unique-identifier=\"id\">\
1665<metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\">{opf_metadata}</metadata>\
1666<manifest><item id=\"c1\" href=\"{manifest_href}\" media-type=\"application/xhtml+xml\"/></manifest>\
1667<spine><itemref idref=\"c1\"/></spine></package>"
1668        );
1669        let chapter = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
1670<html xmlns=\"http://www.w3.org/1999/xhtml\"><body>\
1671<p>Hello world body text.</p></body></html>";
1672
1673        let file = std::fs::File::create(dest).unwrap();
1674        let mut writer = zip::ZipWriter::new(file);
1675        let stored = zip::write::SimpleFileOptions::default()
1676            .compression_method(zip::CompressionMethod::Stored);
1677        // mimetype must be the first member and stored uncompressed.
1678        writer.start_file("mimetype", stored).unwrap();
1679        writer.write_all(b"application/epub+zip").unwrap();
1680        writer.start_file("META-INF/container.xml", stored).unwrap();
1681        writer.write_all(container.as_bytes()).unwrap();
1682        writer.start_file("OEBPS/content.opf", stored).unwrap();
1683        writer.write_all(opf.as_bytes()).unwrap();
1684        writer.start_file(chapter_entry, stored).unwrap();
1685        writer.write_all(chapter.as_bytes()).unwrap();
1686        writer.finish().unwrap();
1687    }
1688
1689    #[test]
1690    fn regression_epub_title_accumulates_entities_and_nested_events() {
1691        // Pre-fix the title was cut at the first Text node, so an entity or a
1692        // comment inside `<dc:title>` truncated it.
1693        let tmp = tempfile::TempDir::new().unwrap();
1694
1695        let f1 = tmp.path().join("entity.epub");
1696        write_epub(
1697            &f1,
1698            "<dc:title>Smith &amp; Jones: A &lt;Tale&gt;</dc:title>",
1699            "chapter.xhtml",
1700            "OEBPS/chapter.xhtml",
1701        );
1702        let got = extract(&f1).unwrap();
1703        assert_eq!(
1704            got.metadata["title"],
1705            MetaValue::Str("Smith & Jones: A <Tale>".into())
1706        );
1707
1708        let f2 = tmp.path().join("comment.epub");
1709        write_epub(
1710            &f2,
1711            "<dc:title>Part One<!-- editorial --> and Part Two</dc:title>",
1712            "chapter.xhtml",
1713            "OEBPS/chapter.xhtml",
1714        );
1715        let got = extract(&f2).unwrap();
1716        assert_eq!(
1717            got.metadata["title"],
1718            MetaValue::Str("Part One and Part Two".into())
1719        );
1720    }
1721
1722    #[test]
1723    fn regression_epub_self_closing_title_does_not_capture_author() {
1724        // A self-closing `<dc:title/>` (an untitled book) must NOT latch the next
1725        // text node (the author) as the title.
1726        let tmp = tempfile::TempDir::new().unwrap();
1727        let f = tmp.path().join("empty-title.epub");
1728        write_epub(
1729            &f,
1730            "<dc:title/><dc:creator>John Doe</dc:creator>",
1731            "chapter.xhtml",
1732            "OEBPS/chapter.xhtml",
1733        );
1734        let got = extract(&f).unwrap();
1735        // No (or empty) title — never the author. `put_str` omits empty values.
1736        assert!(
1737            !got.metadata.contains_key("title"),
1738            "self-closing title must not capture the author, got {:?}",
1739            got.metadata.get("title")
1740        );
1741        // The chapter still extracts.
1742        assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
1743    }
1744
1745    #[test]
1746    fn regression_epub_percent_encoded_href_resolves() {
1747        // An href `my%20chapter.xhtml` must match the zip entry
1748        // `OEBPS/my chapter.xhtml`; pre-fix the lookup failed and the chapter was
1749        // silently dropped (empty text, 0 chapters).
1750        let tmp = tempfile::TempDir::new().unwrap();
1751        let f = tmp.path().join("spaced.epub");
1752        write_epub(
1753            &f,
1754            "<dc:title>Spaced</dc:title>",
1755            "my%20chapter.xhtml",
1756            "OEBPS/my chapter.xhtml",
1757        );
1758        let got = extract(&f).unwrap();
1759        assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
1760        assert!(
1761            got.text.contains("Hello world body text."),
1762            "percent-encoded-href chapter must extract, got {:?}",
1763            got.text
1764        );
1765    }
1766
1767    #[test]
1768    fn percent_decode_handles_spaces_and_unicode_and_stray_percent() {
1769        assert_eq!(percent_decode("my%20chapter.xhtml"), "my chapter.xhtml");
1770        // `%C3%A9` is UTF-8 for `é`.
1771        assert_eq!(percent_decode("caf%C3%A9.xhtml"), "café.xhtml");
1772        // A stray `%` not followed by two hex digits is emitted verbatim.
1773        assert_eq!(percent_decode("100%done"), "100%done");
1774        assert_eq!(percent_decode("plain.xhtml"), "plain.xhtml");
1775    }
1776
1777    #[test]
1778    fn normalize_zip_path_resolves_dot_segments() {
1779        assert_eq!(
1780            normalize_zip_path("OEBPS/../text/ch1.xhtml"),
1781            "text/ch1.xhtml"
1782        );
1783        assert_eq!(normalize_zip_path("OEBPS/./ch1.xhtml"), "OEBPS/ch1.xhtml");
1784        assert_eq!(normalize_zip_path("OEBPS/ch1.xhtml"), "OEBPS/ch1.xhtml");
1785    }
1786
1787    // ── regression: spreadsheet date rendering (finding #1013) ─────────────────
1788
1789    #[test]
1790    fn render_excel_datetime_renders_iso_not_serial() {
1791        use calamine::{ExcelDateTime, ExcelDateTimeType};
1792        // 46188 → 2026-06-15 (date only, midnight → no time component).
1793        let date = ExcelDateTime::new(46188.0, ExcelDateTimeType::DateTime, false);
1794        assert_eq!(render_excel_datetime(&date), "2026-06-15");
1795        // 46143.5 → 2026-05-01 12:00:00 (has a time component).
1796        let dt = ExcelDateTime::new(46143.5, ExcelDateTimeType::DateTime, false);
1797        assert_eq!(render_excel_datetime(&dt), "2026-05-01 12:00:00");
1798        // A duration is elapsed time, not a calendar date → keep the serial form.
1799        let dur = ExcelDateTime::new(1.5, ExcelDateTimeType::TimeDelta, false);
1800        assert_eq!(render_excel_datetime(&dur), "1.5");
1801    }
1802
1803    #[test]
1804    fn render_cell_dates_are_iso() {
1805        use calamine::{Data, ExcelDateTime, ExcelDateTimeType};
1806        assert_eq!(
1807            render_cell(&Data::DateTime(ExcelDateTime::new(
1808                46188.0,
1809                ExcelDateTimeType::DateTime,
1810                false
1811            ))),
1812            "2026-06-15"
1813        );
1814        // The integer/float/string paths are unchanged by the date fix.
1815        assert_eq!(render_cell(&Data::Float(3450.0)), "3450");
1816        assert_eq!(render_cell(&Data::Int(7)), "7");
1817    }
1818
1819    // ── regression: HTML/EPUB literal-content fidelity (finding #36) ───────────
1820
1821    /// Render an HTML body string through the production extract path.
1822    fn html_text(body: &str) -> String {
1823        let tmp = tempfile::TempDir::new().unwrap();
1824        let f = tmp.path().join("doc.html");
1825        std::fs::write(&f, format!("<html><body>{body}</body></html>")).unwrap();
1826        extract(&f).unwrap().text
1827    }
1828
1829    #[test]
1830    fn regression_html_keeps_literal_brackets_and_hashes() {
1831        // Pre-fix every `[bracketed]` substring and every leading-`#` run was
1832        // stripped from real prose, fusing `total[net]` into `totalnet` and
1833        // deleting the `#` from `#1 in sales`.
1834        let out = html_text(
1835            "<p>#1 in sales this quarter</p>\
1836<p>see chart[3] for data, array[0] = total[net]</p>",
1837        );
1838        assert!(out.contains("#1 in sales this quarter"), "got {out:?}");
1839        assert!(
1840            out.contains("see chart[3] for data, array[0] = total[net]"),
1841            "got {out:?}"
1842        );
1843
1844        // Citation markers and subscripts survive intact.
1845        let out = html_text("<p>See note [1] and [sic] here.</p><p>x[i] + y[j]</p>");
1846        assert!(out.contains("See note [1] and [sic] here."), "got {out:?}");
1847        assert!(out.contains("x[i] + y[j]"), "got {out:?}");
1848    }
1849
1850    #[test]
1851    fn html_headings_render_as_plain_prose_no_hash() {
1852        // A real `<h1>` heading still renders WITHOUT a `#` marker (the renderer
1853        // emits no heading prefix now), so headings read as prose.
1854        let out = html_text("<h1>Launch Plan</h1><p>Body prose.</p>");
1855        assert!(out.contains("Launch Plan"), "got {out:?}");
1856        assert!(
1857            !out.contains('#'),
1858            "no heading marker expected, got {out:?}"
1859        );
1860    }
1861
1862    #[test]
1863    fn html_links_render_as_bare_text_no_brackets() {
1864        // Link display text renders bare; the surrounding `[...]` the stock plain
1865        // decorator would add is gone.
1866        let out = html_text("<p>See the <a href=\"https://x.example\">handbook</a>.</p>");
1867        assert!(out.contains("See the handbook."), "got {out:?}");
1868    }
1869}
dbmd_core/extract.rs

dbmd_core/
extract.rs