dbmd_core/extract.rs
1//! Document text extraction — the `dbmd extract` engine.
2//!
3//! `sources/` is where raw evidence lands: invoices, contracts, reports,
4//! exports. Most of it arrives as binary documents (PDF, Word, Excel, EPUB) or
5//! HTML, not markdown. Before an agent can reason over that evidence — wiki-link
6//! it, summarize it into the wiki layer, file a typed record that cites it — the
7//! text has to come out. This module is that step: a binary document in, plain
8//! UTF-8 text out, format chosen by file extension.
9//!
10//! # What this is, and is not
11//!
12//! - **Deterministic decoders only.** Every adapter is a format parser
13//! (`pdf-extract`, `calamine`, `html2text`, `quick-xml`+`zip`). There is **no
14//! AI, no OCR, no embeddings** here — consistent with the crate-wide invariant
15//! (`lib.rs`). The agent driving `dbmd` is the semantic layer; this is plumbing.
16//! - **Text layer, not pixels.** A scanned PDF with no text layer yields the
17//! empty string — *empty in, empty out, never hallucinated text.* OCR is an
18//! explicit non-goal (a future `dbmd-ocr`).
19//! - **Single document, single call.** [`extract`] handles one file. Walking a
20//! store and extracting every document is the caller's loop, not this module's.
21//!
22//! # Format dispatch
23//!
24//! [`Format::from_path`] maps the file extension to an adapter; [`extract`]
25//! dispatches:
26//!
27//! | Extension | Format | Adapter |
28//! |--------------------------|-------------------|----------------------------------|
29//! | `.pdf` | [`Format::Pdf`] | `pdf-extract` |
30//! | `.docx` | [`Format::Docx`] | `zip` + `quick-xml` (`w:t` runs) |
31//! | `.xlsx` / `.xlsm` / `.xlsb` / `.ods` | [`Format::Spreadsheet`] | `calamine` |
32//! | `.epub` | [`Format::Epub`] | `zip` + `quick-xml` + `html2text`|
33//! | `.html` / `.htm` / `.xhtml` | [`Format::Html`] | `html2text` |
34//!
35//! Anything else is [`ExtractError::UnsupportedFormat`] — a typed refusal the
36//! CLI surfaces with a stable code, never a panic.
37
38use std::collections::BTreeMap;
39use std::io::Read;
40use std::panic::{catch_unwind, AssertUnwindSafe};
41use std::path::Path;
42
43use serde::Serialize;
44
/// The result of extracting one document: the plain text plus a small,
/// format-tagged metadata map.
///
/// This is the `--json` shape the CLI emits verbatim (`{text, metadata}`); in
/// plain mode the CLI prints [`Extracted::text`] and discards the metadata.
/// Metadata is intentionally minimal and best-effort — extraction never *fails*
/// for want of a title; it just omits the key.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct Extracted {
    /// The extracted plain text (UTF-8) after [`normalize_text`]: `\n` line
    /// endings, trailing whitespace trimmed per line, runs of blank lines
    /// collapsed to one, and a single trailing newline. For a document with no
    /// recoverable text layer (e.g. a scanned, image-only PDF) this is the
    /// empty string — the contract is "empty in, empty out."
    pub text: String,

    /// Best-effort key/value metadata. Always carries `format` (the adapter
    /// that ran, e.g. `"pdf"`). Adapters add what they cheaply know: the PDF
    /// adapter adds `pages` (count); the spreadsheet adapter adds `sheets`
    /// (count) and `sheet_names` (the names joined with `", "`); the EPUB
    /// adapter adds `chapters` (count) and `title` (when the package declares
    /// one). A `BTreeMap` so `--json` output is key-ordered and stable.
    pub metadata: BTreeMap<String, MetaValue>,
}
66
67impl Extracted {
68 /// Build an [`Extracted`] from raw adapter text + the detected format,
69 /// applying the canonical text normalization ([`normalize_text`]) and
70 /// seeding the `format` metadata key.
71 fn new(raw_text: String, format: Format) -> Self {
72 let mut metadata = BTreeMap::new();
73 metadata.insert(
74 "format".to_string(),
75 MetaValue::Str(format.tag().to_string()),
76 );
77 Extracted {
78 text: normalize_text(&raw_text),
79 metadata,
80 }
81 }
82
83 /// Insert a string metadata key only when the value is non-empty (keeps the
84 /// map free of empty `title: ""` noise).
85 fn put_str(&mut self, key: &str, value: impl Into<String>) {
86 let v = value.into();
87 if !v.trim().is_empty() {
88 self.metadata.insert(key.to_string(), MetaValue::Str(v));
89 }
90 }
91
92 /// Insert a numeric (count) metadata key.
93 fn put_num(&mut self, key: &str, value: u64) {
94 self.metadata.insert(key.to_string(), MetaValue::Num(value));
95 }
96}
97
/// A metadata value: a string (title, format tag, the joined sheet-name list)
/// or a non-negative count (pages, sheets, chapters).
///
/// `#[serde(untagged)]` makes each variant serialize as its bare payload — a
/// JSON string or a JSON number, with no wrapper object — so `{text, metadata}`
/// stays flat and readable.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
pub enum MetaValue {
    /// A textual value (e.g. document title, the `format` tag).
    Str(String),
    /// A non-negative count (e.g. page count, sheet count).
    Num(u64),
}
109
/// The document formats `dbmd extract` understands, one per adapter. Detected
/// from the file extension by [`Format::from_path`]; the short tag recorded in
/// the output metadata comes from [`Format::tag`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// Portable Document Format (`.pdf`) — text layer via `pdf-extract`.
    Pdf,
    /// Office Open XML WordprocessingML (`.docx`) — `w:t` runs via `quick-xml`.
    Docx,
    /// A spreadsheet (`.xlsx`/`.xlsm`/`.xlsb`/`.ods`) — cells via `calamine`.
    Spreadsheet,
    /// EPUB e-book (`.epub`) — spine XHTML via `zip` + `quick-xml` + `html2text`.
    Epub,
    /// HTML (`.html`/`.htm`/`.xhtml`) — plain text via `html2text`.
    Html,
}
125
126impl Format {
127 /// Detect the format from a path's extension (case-insensitive). Returns
128 /// `None` for an unrecognized or missing extension; [`extract`] turns that
129 /// into [`ExtractError::UnsupportedFormat`] with the offending extension.
130 pub fn from_path(path: &Path) -> Option<Format> {
131 let ext = path.extension()?.to_str()?.to_ascii_lowercase();
132 Some(match ext.as_str() {
133 "pdf" => Format::Pdf,
134 "docx" => Format::Docx,
135 "xlsx" | "xlsm" | "xlsb" | "ods" => Format::Spreadsheet,
136 "epub" => Format::Epub,
137 "html" | "htm" | "xhtml" => Format::Html,
138 _ => return None,
139 })
140 }
141
142 /// The short, stable tag recorded in `metadata.format` and used in error
143 /// messages. Distinct from the file extension (one tag can cover several
144 /// extensions, e.g. `spreadsheet`).
145 pub fn tag(self) -> &'static str {
146 match self {
147 Format::Pdf => "pdf",
148 Format::Docx => "docx",
149 Format::Spreadsheet => "spreadsheet",
150 Format::Epub => "epub",
151 Format::Html => "html",
152 }
153 }
154}
155
/// Errors from document extraction. Every variant is a typed refusal the CLI
/// maps to a stable machine code (see [`ExtractError::code`]) — extraction
/// never panics on a bad or encrypted input.
#[derive(Debug, thiserror::Error)]
pub enum ExtractError {
    /// The file extension is missing or not one of the supported document
    /// formats. Carries the offending extension (or `""` when absent or not
    /// valid UTF-8).
    #[error("unsupported document format: {0:?} (supported: pdf, docx, xlsx/xlsm/xlsb/ods, epub, html/htm/xhtml)")]
    UnsupportedFormat(String),

    /// The document is encrypted/password-protected and could not be opened
    /// without a password (or with the wrong one). A clean refusal — the
    /// extractor must never emit partial/garbled bytes for a locked file.
    /// Carries the underlying parser's message.
    #[error("document is encrypted or password-protected: {0}")]
    Encrypted(String),

    /// A format adapter failed to parse a structurally invalid or corrupt
    /// document. Carries the adapter's diagnostic.
    #[error("failed to parse {format} document: {message}")]
    Parse {
        /// The format tag whose adapter failed (e.g. `"pdf"`, `"docx"`).
        format: &'static str,
        /// The underlying parser diagnostic.
        message: String,
    },

    /// An underlying I/O failure (file missing, unreadable, etc.). Displayed
    /// transparently as the wrapped `std::io::Error`; `#[from]` lets `?`
    /// convert I/O errors into this variant.
    #[error(transparent)]
    Io(#[from] std::io::Error),
}
186
187impl ExtractError {
188 /// A short, stable machine code for this error, mirrored at the CLI
189 /// boundary for `--json` output and exit-code mapping.
190 pub fn code(&self) -> &'static str {
191 match self {
192 ExtractError::UnsupportedFormat(_) => "UNSUPPORTED_FORMAT",
193 ExtractError::Encrypted(_) => "DOCUMENT_ENCRYPTED",
194 ExtractError::Parse { .. } => "EXTRACT_PARSE_ERROR",
195 ExtractError::Io(_) => "IO_ERROR",
196 }
197 }
198}
199
/// Result alias for extraction operations: `std::result::Result` with the
/// error type fixed to [`ExtractError`]. Note that it shadows the prelude
/// `Result` inside this module, so code that needs a different error type
/// must spell out `std::result::Result`.
pub type Result<T> = std::result::Result<T, ExtractError>;
202
203/// Extract plain text (and best-effort metadata) from a document, choosing the
204/// adapter by the file's extension.
205///
206/// This is the single entry point the CLI calls. It reads exactly one file and
207/// returns one [`Extracted`]; there is no whole-store walk here (per the
208/// crate-wide O(changed) invariant — a store-wide extraction is the caller's
209/// loop). An unsupported extension is [`ExtractError::UnsupportedFormat`]; an
210/// encrypted PDF is [`ExtractError::Encrypted`]; neither panics.
211///
212/// # Examples
213///
214/// ```no_run
215/// use std::path::Path;
216/// let out = dbmd_core::extract::extract(Path::new("sources/docs/invoice.pdf"))?;
217/// println!("{}", out.text);
218/// # Ok::<(), dbmd_core::extract::ExtractError>(())
219/// ```
220pub fn extract(path: &Path) -> Result<Extracted> {
221 let format = Format::from_path(path).ok_or_else(|| {
222 let ext = path
223 .extension()
224 .and_then(|e| e.to_str())
225 .unwrap_or("")
226 .to_string();
227 ExtractError::UnsupportedFormat(ext)
228 })?;
229
230 match format {
231 Format::Pdf => extract_pdf(path),
232 Format::Docx => extract_docx(path),
233 Format::Spreadsheet => extract_spreadsheet(path),
234 Format::Epub => extract_epub(path),
235 Format::Html => extract_html(path),
236 }
237}
238
239// ─────────────────────────────────────────────────────────────────────────────
240// Text normalization
241// ─────────────────────────────────────────────────────────────────────────────
242
/// Canonicalize extracted text so output is stable across adapters:
///
/// 1. Normalize line endings to `\n` (both `\r\n` and a lone `\r` become `\n`).
/// 2. Trim trailing whitespace on each line (a whitespace-only line becomes a
///    blank line).
/// 3. Collapse every run of two-or-more consecutive blank lines to a single
///    blank line.
/// 4. Drop leading/trailing blank lines, then end the text with exactly one
///    `\n` (unless the whole text is blank, which yields the empty string —
///    the image-only-PDF contract).
///
/// This is *layout* tidy-up only; it never reorders or drops words. Word-level
/// content is whatever the adapter recovered.
///
/// The work is a single O(n) pass with no intermediate line vector and no
/// panic path, so a document dominated by blank lines cannot stall or abort
/// extraction.
pub fn normalize_text(raw: &str) -> String {
    let unix = raw.replace("\r\n", "\n").replace('\r', "\n");

    let mut out = String::with_capacity(unix.len());
    // Set when at least one blank line has been seen since the last emitted
    // non-blank line. It is only ever armed once some text has been emitted
    // (so leading blanks vanish) and only flushed right before the next
    // non-blank line (so trailing blanks vanish and runs collapse to one).
    let mut pending_blank = false;

    for line in unix.lines().map(str::trim_end) {
        if line.is_empty() {
            pending_blank = !out.is_empty();
            continue;
        }
        if pending_blank {
            out.push('\n');
            pending_blank = false;
        }
        out.push_str(line);
        out.push('\n');
    }
    out
}
291
292// ─────────────────────────────────────────────────────────────────────────────
293// PDF — pdf-extract
294// ─────────────────────────────────────────────────────────────────────────────
295
296/// Extract a PDF's text layer via `pdf-extract`.
297///
298/// A PDF with no text layer (a scanned image) yields the empty string — that is
299/// correct, not an error (OCR is out of scope). A password-protected PDF that
300/// cannot be opened is mapped to [`ExtractError::Encrypted`] rather than a raw
301/// parse error so the caller can branch on it. Metadata carries the page count
302/// when the document tree exposes it.
303///
304/// `pdf-extract`/`lopdf` `panic!` internally on some malformed-but-openable
305/// PDFs (e.g. an out-of-set base `/Encoding` name), so both parser calls are
306/// wrapped in [`std::panic::catch_unwind`]: an internal abort is contained and
307/// surfaced as [`ExtractError::Parse`], upholding this module's "never panics"
308/// contract on untrusted `sources/` input.
309fn extract_pdf(path: &Path) -> Result<Extracted> {
310 // Read the bytes ourselves so a missing/unreadable file is a clean
311 // `ExtractError::Io` (via `?`) before we hand anything to the PDF parser.
312 let bytes = std::fs::read(path)?;
313
314 let text = match guard_pdf_panic(|| pdf_extract::extract_text_from_mem(&bytes))? {
315 Ok(t) => t,
316 Err(e) => return Err(classify_pdf_error(e)),
317 };
318
319 let mut out = Extracted::new(text, Format::Pdf);
320
321 // Page count is best-effort; derive it from the parsed document. A parse
322 // failure OR an internal panic here is non-fatal — the text already
323 // succeeded — so a contained panic (outer `Err`) and a load failure (inner
324 // `Err`) are both silently skipped.
325 if let Ok(Ok(doc)) = guard_pdf_panic(|| pdf_extract::Document::load_mem(&bytes)) {
326 out.put_num("pages", doc.get_pages().len() as u64);
327 }
328
329 Ok(out)
330}
331
332/// Run a panic-prone `pdf-extract`/`lopdf` call, converting an internal unwind
333/// into a typed [`ExtractError::Parse`] tagged `pdf` so the module's "never
334/// panics" contract holds on adversarial PDFs. `AssertUnwindSafe` is sound: the
335/// closure borrows only `&[u8]`, and on a caught unwind we discard any partial
336/// state and return an owned error. The default panic hook still writes the
337/// panic line to stderr — library code must not mutate the process-global hook.
338fn guard_pdf_panic<T>(f: impl FnOnce() -> T) -> Result<T> {
339 catch_unwind(AssertUnwindSafe(f)).map_err(|_| ExtractError::Parse {
340 format: "pdf",
341 message: "pdf parser aborted on malformed input".to_string(),
342 })
343}
344
345/// Map a `pdf-extract` error onto the right [`ExtractError`] variant.
346/// Decryption failures become [`ExtractError::Encrypted`]; everything else is a
347/// [`ExtractError::Parse`] tagged `pdf`.
348fn classify_pdf_error(err: pdf_extract::OutputError) -> ExtractError {
349 let msg = err.to_string();
350 let lower = msg.to_ascii_lowercase();
351 if lower.contains("password") || lower.contains("decrypt") || lower.contains("encrypt") {
352 ExtractError::Encrypted(msg)
353 } else {
354 ExtractError::Parse {
355 format: "pdf",
356 message: msg,
357 }
358 }
359}
360
361// ─────────────────────────────────────────────────────────────────────────────
362// DOCX — zip + quick-xml (no docx-rs dependency; quick-xml is already needed
363// for epub, so docx, xlsx-via-calamine, and epub share one XML/zip surface)
364// ─────────────────────────────────────────────────────────────────────────────
365
366/// Extract a `.docx` (WordprocessingML) by unzipping `word/document.xml` and
367/// concatenating the `<w:t>` run text, one logical line per `<w:p>` paragraph.
368///
369/// `<w:tab/>` becomes a tab and `<w:br/>` / `<w:cr>` a newline so table-ish and
370/// line-broken content keeps its shape; everything else is structural and
371/// ignored. This is the same minimal-but-faithful path `docx-rs` takes for text
372/// extraction, without pulling in a second XML/zip stack.
373fn extract_docx(path: &Path) -> Result<Extracted> {
374 let file = std::fs::File::open(path)?;
375 let mut archive = open_zip(file, "docx")?;
376
377 let xml = read_zip_entry(&mut archive, "word/document.xml", "docx")?;
378 let text = wordprocessing_text(&xml, "docx")?;
379
380 Ok(Extracted::new(text, Format::Docx))
381}
382
383/// Pull paragraph text out of a WordprocessingML / DrawingML XML body.
384///
385/// Shared by [`extract_docx`]. Walks the event stream collecting `<w:t>` text;
386/// `<w:p>` ends a line, `<w:tab/>` is a tab, `<w:br>`/`<w:cr>` a newline.
387fn wordprocessing_text(xml: &str, format: &'static str) -> Result<String> {
388 use quick_xml::events::Event;
389 use quick_xml::reader::Reader;
390
391 let mut reader = Reader::from_str(xml);
392 let mut buf = Vec::new();
393 let mut out = String::new();
394 let mut in_text_run = false;
395
396 loop {
397 match reader.read_event_into(&mut buf) {
398 Ok(Event::Start(e)) => {
399 if local_name(e.name().as_ref()) == b"t" {
400 in_text_run = true;
401 }
402 }
403 Ok(Event::End(e)) => {
404 let name = e.name();
405 match local_name(name.as_ref()) {
406 b"t" => in_text_run = false,
407 b"p" => out.push('\n'),
408 _ => {}
409 }
410 }
411 Ok(Event::Empty(e)) => {
412 // Self-closing run-level breaks inside a paragraph.
413 match local_name(e.name().as_ref()) {
414 b"tab" => out.push('\t'),
415 b"br" | b"cr" => out.push('\n'),
416 _ => {}
417 }
418 }
419 // quick-xml 0.40 surfaces text verbatim in `Event::Text` but routes
420 // every entity reference to a separate `Event::GeneralRef` and CDATA
421 // to `Event::CData` — all three carry run content.
422 Ok(Event::Text(t)) => {
423 if in_text_run {
424 out.push_str(&String::from_utf8_lossy(&t.into_inner()));
425 }
426 }
427 // `Smith & Co` arrives as Text("Smith ") + GeneralRef("amp") +
428 // Text(" Co"); resolve the ref so `&`/`<`/`>`/numeric chars survive.
429 Ok(Event::GeneralRef(r)) => {
430 if in_text_run {
431 out.push_str(&resolve_entity_ref(&r));
432 }
433 }
434 // CDATA inside a `<w:t>` run is valid WordprocessingML; its payload
435 // is literal text and must be appended like `Event::Text`.
436 Ok(Event::CData(c)) => {
437 if in_text_run {
438 out.push_str(&String::from_utf8_lossy(&c.into_inner()));
439 }
440 }
441 Ok(Event::Eof) => break,
442 Err(e) => {
443 return Err(ExtractError::Parse {
444 format,
445 message: format!("malformed XML: {e}"),
446 });
447 }
448 _ => {}
449 }
450 buf.clear();
451 }
452
453 Ok(out)
454}
455
/// Strip the namespace prefix from an XML qualified name, returning whatever
/// follows the last `:` — `w:t` → `t`, `t` → `t`.
///
/// Writers are free to choose their prefixes (`w:`, `dc:`, …), so comparing
/// local names keeps element matching independent of that choice. A name
/// without a colon is returned unchanged; a name ending in a colon yields the
/// empty slice.
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece (the whole slice when there is
    // no separator), and its first piece is the part after the last colon.
    qname.rsplit(|&byte| byte == b':').next().unwrap_or(qname)
}
465
466/// Resolve a `quick_xml` general-entity / character reference to its literal
467/// text. quick-xml 0.40 does NOT inline-resolve entity references inside
468/// `Event::Text`; instead it surfaces each `&name;` / `&#nnn;` as a separate
469/// `Event::GeneralRef`. Routing those to a `_ => {}` arm silently drops `&`,
470/// `<`, `>`, numeric refs, etc. from extracted text — corrupting any title,
471/// company name, or amount that contains them. This resolves the five
472/// XML-predefined named entities and any numeric character reference; an
473/// unknown named entity falls back to its bare name (best-effort, never a
474/// panic), matching the "recover what we can" stance of `sources/` extraction.
475fn resolve_entity_ref(reference: &quick_xml::events::BytesRef<'_>) -> String {
476 // Numeric character reference (`—`, `—`): resolve to the char.
477 if let Ok(Some(ch)) = reference.resolve_char_ref() {
478 return ch.to_string();
479 }
480 // Named entity: map the five XML-predefined names; fall back to the bare
481 // name for anything else (custom DTD entities are out of scope here).
482 match reference.decode().as_deref() {
483 Ok("amp") => "&".to_string(),
484 Ok("lt") => "<".to_string(),
485 Ok("gt") => ">".to_string(),
486 Ok("quot") => "\"".to_string(),
487 Ok("apos") => "'".to_string(),
488 Ok(other) => other.to_string(),
489 Err(_) => String::new(),
490 }
491}
492
493// ─────────────────────────────────────────────────────────────────────────────
494// Spreadsheet — calamine (xlsx / xlsm / xlsb / ods)
495// ─────────────────────────────────────────────────────────────────────────────
496
/// Ceiling on a single sheet's dense cell grid (`rows × cols`). `calamine`
/// materializes a worksheet as a DENSE `Vec<Data>` sized from the MIN/MAX cell
/// positions (`Range::from_sparse`), so two cells at `A1` and `XFD1048576` in a
/// few-hundred-byte file force a ~1.7e10-element (~400 GB) allocation that
/// **aborts** the process — bypassing the docx/epub zip-entry cap and the
/// PDF panic guard (an allocation failure aborts, it does not unwind, so
/// `catch_unwind` cannot contain it). `sources/` is untrusted input, so we
/// bound the read the same way docx/epub do: refuse before the allocation.
///
/// 50M cells is ~1.2 GB worst-case dense (`Data` ≈ 24 bytes) — far above any
/// real spreadsheet's used range, far below the weaponizable extreme.
///
/// Enforced per sheet in `extract_spreadsheet`; a sheet whose computed grid
/// is strictly greater than this value is refused.
const MAX_SPREADSHEET_CELLS: u64 = 50_000_000;
509
510/// Extract every sheet of a spreadsheet via `calamine`, rendering each row as
511/// tab-separated cells, one row per line, sheets in workbook order separated by
512/// a blank line.
513///
514/// Cell rendering: text verbatim; integers and whole-valued floats without a
515/// trailing `.0` (`1200`, not `1200.0`); other floats via their default
516/// formatting; booleans as `TRUE`/`FALSE`; empty/error cells as the empty
517/// string. Metadata carries the sheet count and the joined sheet-name list.
518///
519/// Before materializing each sheet, [`spreadsheet_dense_cells`] bounds the
520/// would-be dense grid against [`MAX_SPREADSHEET_CELLS`] and returns a typed
521/// [`ExtractError::Parse`] refusal rather than letting an attacker-supplied
522/// sheet OOM/abort the process — upholding the module's "never panics on
523/// untrusted `sources/` input" contract for the spreadsheet adapter.
524fn extract_spreadsheet(path: &Path) -> Result<Extracted> {
525 use calamine::{open_workbook_auto, Reader};
526
527 let mut workbook = open_workbook_auto(path).map_err(|e| ExtractError::Parse {
528 format: "spreadsheet",
529 message: e.to_string(),
530 })?;
531
532 let sheet_names = workbook.sheet_names().to_vec();
533 let mut text = String::new();
534
535 for (idx, name) in sheet_names.iter().enumerate() {
536 if idx > 0 {
537 text.push('\n'); // blank line between sheets
538 }
539
540 // Bound the dense grid BEFORE calamine allocates it. For the zip-XML /
541 // record backends that expose a sparse cell iterator (xlsx-family,
542 // xlsb) this never densely allocates; over-cap sheets refuse cleanly.
543 if let Some(cells) = spreadsheet_dense_cells(&mut workbook, name)? {
544 if cells > MAX_SPREADSHEET_CELLS {
545 return Err(ExtractError::Parse {
546 format: "spreadsheet",
547 message: format!(
548 "sheet {name:?} declares a {cells}-cell grid, over the \
549 {MAX_SPREADSHEET_CELLS}-cell cap (malformed or hostile spreadsheet)"
550 ),
551 });
552 }
553 }
554
555 let range = workbook
556 .worksheet_range(name)
557 .map_err(|e| ExtractError::Parse {
558 format: "spreadsheet",
559 message: format!("sheet {name:?}: {e}"),
560 })?;
561
562 for row in range.rows() {
563 let cells: Vec<String> = row.iter().map(render_cell).collect();
564 text.push_str(&cells.join("\t"));
565 text.push('\n');
566 }
567 }
568
569 let mut out = Extracted::new(text, Format::Spreadsheet);
570 out.put_num("sheets", sheet_names.len() as u64);
571 if !sheet_names.is_empty() {
572 out.put_str("sheet_names", sheet_names.join(", "));
573 }
574 Ok(out)
575}
576
577/// Compute the would-be dense cell count (`rows × cols`) of one sheet WITHOUT
578/// the dense allocation, by streaming the sheet's sparse cells and tracking the
579/// MIN/MAX non-empty position — exactly the bounds `Range::from_sparse` uses.
580///
581/// Returns `Some(rows * cols)` for the formats that expose a sparse cell
582/// iterator (`.xlsx`/`.xlsm`/`.xlsb`/`.xlam`), which are the realistic
583/// decompression/dimension-bomb vectors (an OOXML/record sheet can place two
584/// cells 1e10 apart in a few hundred bytes). Returns `None` for `.xls` (BIFF,
585/// format-bounded to ≤ 65 536 × 256 ≈ 1.7e7 cells) and `.ods`, neither of which
586/// exposes a sparse iterator on the auto-detected reader; those fall through to
587/// the normal materialization path. A row/col delta is saturated into `u64` so
588/// the multiply cannot overflow.
589fn spreadsheet_dense_cells(
590 workbook: &mut calamine::Sheets<std::io::BufReader<std::fs::File>>,
591 name: &str,
592) -> Result<Option<u64>> {
593 use calamine::{DataRef, Sheets};
594
595 // Stream cells, tracking the non-empty MIN/MAX extent that `from_sparse`
596 // would allocate. Empty cells are excluded (calamine drops them before
597 // computing the dense bounds), matching the dense grid exactly.
598 fn extent<E: std::fmt::Display>(
599 mut next: impl FnMut() -> std::result::Result<Option<((u32, u32), bool)>, E>,
600 ) -> Result<Option<u64>> {
601 let (mut r0, mut r1, mut c0, mut c1) = (u32::MAX, 0u32, u32::MAX, 0u32);
602 let mut any = false;
603 loop {
604 match next() {
605 Ok(Some(((r, c), is_empty))) => {
606 if is_empty {
607 continue;
608 }
609 any = true;
610 r0 = r0.min(r);
611 r1 = r1.max(r);
612 c0 = c0.min(c);
613 c1 = c1.max(c);
614 }
615 Ok(None) => break,
616 Err(e) => {
617 return Err(ExtractError::Parse {
618 format: "spreadsheet",
619 message: format!("scanning sheet dimensions: {e}"),
620 })
621 }
622 }
623 }
624 if !any {
625 return Ok(Some(0));
626 }
627 let rows = u64::from(r1 - r0) + 1;
628 let cols = u64::from(c1 - c0) + 1;
629 Ok(Some(rows.saturating_mul(cols)))
630 }
631
632 match workbook {
633 Sheets::Xlsx(xlsx) => {
634 let mut reader =
635 xlsx.worksheet_cells_reader(name)
636 .map_err(|e| ExtractError::Parse {
637 format: "spreadsheet",
638 message: format!("sheet {name:?}: {e}"),
639 })?;
640 extent(|| {
641 reader.next_cell().map(|opt| {
642 opt.map(|c| (c.get_position(), matches!(c.get_value(), DataRef::Empty)))
643 })
644 })
645 }
646 Sheets::Xlsb(xlsb) => {
647 let mut reader =
648 xlsb.worksheet_cells_reader(name)
649 .map_err(|e| ExtractError::Parse {
650 format: "spreadsheet",
651 message: format!("sheet {name:?}: {e}"),
652 })?;
653 extent(|| {
654 reader.next_cell().map(|opt| {
655 opt.map(|c| (c.get_position(), matches!(c.get_value(), DataRef::Empty)))
656 })
657 })
658 }
659 // `.xls` (BIFF, format-bounded) and `.ods` expose no sparse iterator on
660 // the auto reader; let them materialize normally.
661 Sheets::Xls(_) | Sheets::Ods(_) => Ok(None),
662 }
663}
664
665/// Render one spreadsheet cell to its text form. Whole-valued floats drop the
666/// `.0` (so `3450.0` → `3450`), matching how spreadsheet apps display an
667/// integer-typed amount.
668fn render_cell(cell: &calamine::Data) -> String {
669 use calamine::Data;
670 match cell {
671 Data::Empty => String::new(),
672 Data::String(s) => s.clone(),
673 Data::Int(i) => i.to_string(),
674 Data::Float(f) => {
675 if f.fract() == 0.0 && f.is_finite() && f.abs() < 1e15 {
676 format!("{}", *f as i64)
677 } else {
678 f.to_string()
679 }
680 }
681 Data::Bool(b) => {
682 if *b {
683 "TRUE".to_string()
684 } else {
685 "FALSE".to_string()
686 }
687 }
688 // A date/datetime cell is an Excel SERIAL number (days since the 1900
689 // epoch, fractional part = time of day). `ExcelDateTime`'s `Display`
690 // writes the raw serial (`46188`, `46143.5`), which is meaningless to an
691 // agent filing the value into a record, so render the calendar date
692 // instead. `to_ymd_hms_milli` is available without the `chrono` feature.
693 Data::DateTime(dt) => render_excel_datetime(dt),
694 Data::DateTimeIso(s) => s.clone(),
695 Data::DurationIso(s) => s.clone(),
696 Data::Error(e) => format!("{e:?}"),
697 }
698}
699
700/// Render an Excel serial date/datetime to an ISO calendar string. A pure date
701/// (midnight, no sub-day component) renders `YYYY-MM-DD`; a datetime with a time
702/// component renders `YYYY-MM-DD HH:MM:SS`. A duration (Excel `[hh]:mm:ss`
703/// elapsed-time format) is not a calendar date, so it keeps its raw serial form
704/// (the prior behavior) rather than being misrendered as a date.
705fn render_excel_datetime(dt: &calamine::ExcelDateTime) -> String {
706 if dt.is_duration() {
707 // Elapsed-time value, not a point on the calendar — leave as the serial.
708 return dt.as_f64().to_string();
709 }
710 let (y, mo, d, h, mi, s, _ms) = dt.to_ymd_hms_milli();
711 if h == 0 && mi == 0 && s == 0 {
712 format!("{y:04}-{mo:02}-{d:02}")
713 } else {
714 format!("{y:04}-{mo:02}-{d:02} {h:02}:{mi:02}:{s:02}")
715 }
716}
717
718// ─────────────────────────────────────────────────────────────────────────────
719// EPUB — zip + quick-xml (spine order) + html2text (per-chapter)
720// ─────────────────────────────────────────────────────────────────────────────
721//
722// We do NOT use the `epub` crate: it is GPL-3.0, which violates the toolkit's
723// permissive-only license rule. An EPUB is a zip whose OPF package declares a
724// reading-order `spine`; each spine item is an XHTML document. zip + quick-xml
725// (already dependencies) read the container/OPF, and html2text (already a
726// dependency for `.html`) flattens each chapter. Same machinery, no GPL.
727
728/// Extract an EPUB's reading-order text:
729/// 1. read `META-INF/container.xml` → the OPF package path;
730/// 2. parse the OPF `manifest` (id→href) and `spine` (ordered idref list);
731/// 3. for each spine item, read its XHTML and flatten it with [`html_to_text`];
732/// 4. join chapters with a blank line.
733///
734/// Metadata carries `title` (the OPF `dc:title`) and `chapters` (spine length).
735fn extract_epub(path: &Path) -> Result<Extracted> {
736 let file = std::fs::File::open(path)?;
737 let mut archive = open_zip(file, "epub")?;
738
739 // 1. container.xml → OPF path.
740 let container = read_zip_entry(&mut archive, "META-INF/container.xml", "epub")?;
741 let opf_path = epub_opf_path(&container)?;
742
743 // 2. OPF → base dir, manifest, spine, title.
744 let opf = read_zip_entry(&mut archive, &opf_path, "epub")?;
745 let parsed = parse_opf(&opf)?;
746 let base = opf_base_dir(&opf_path);
747
748 // 3. Spine items in order → flattened chapter text.
749 let mut text = String::new();
750 let mut chapters = 0u64;
751 for idref in &parsed.spine {
752 let Some(href) = parsed.manifest.get(idref) else {
753 continue; // dangling spine ref; skip rather than fail
754 };
755 let entry = join_zip_path(&base, href);
756 // A missing spine target is skipped (best-effort), not fatal.
757 let Ok(chapter_xhtml) = read_zip_entry(&mut archive, &entry, "epub") else {
758 continue;
759 };
760 let chapter_text = html_to_text(chapter_xhtml.as_bytes())?;
761 if !chapter_text.trim().is_empty() {
762 if chapters > 0 {
763 text.push('\n');
764 }
765 text.push_str(&chapter_text);
766 text.push('\n');
767 chapters += 1;
768 }
769 }
770
771 let mut out = Extracted::new(text, Format::Epub);
772 out.put_num("chapters", chapters);
773 if let Some(title) = parsed.title {
774 out.put_str("title", title);
775 }
776 Ok(out)
777}
778
779/// The full-path of the OPF package file, read from `META-INF/container.xml`'s
780/// first `<rootfile full-path="…">`.
781fn epub_opf_path(container_xml: &str) -> Result<String> {
782 use quick_xml::events::Event;
783 use quick_xml::reader::Reader;
784
785 let mut reader = Reader::from_str(container_xml);
786 let mut buf = Vec::new();
787 loop {
788 match reader.read_event_into(&mut buf) {
789 Ok(Event::Start(e)) | Ok(Event::Empty(e)) => {
790 if local_name(e.name().as_ref()) == b"rootfile" {
791 if let Some(p) = attr_value(&e, b"full-path") {
792 return Ok(p);
793 }
794 }
795 }
796 Ok(Event::Eof) => break,
797 Err(e) => {
798 return Err(ExtractError::Parse {
799 format: "epub",
800 message: format!("container.xml: {e}"),
801 })
802 }
803 _ => {}
804 }
805 buf.clear();
806 }
807 Err(ExtractError::Parse {
808 format: "epub",
809 message: "container.xml has no <rootfile full-path>".to_string(),
810 })
811}
812
/// The parsed-out pieces of an OPF package we need for reading-order text.
///
/// Produced by [`parse_opf`] and consumed by [`extract_epub`], which walks
/// `spine` in order and resolves each id through `manifest`.
struct OpfParsed {
    /// Manifest: item id → href (relative to the OPF's directory). The href is
    /// stored as written in the OPF; percent-decoding and `.`/`..` resolution
    /// happen later, in [`join_zip_path`].
    manifest: BTreeMap<String, String>,
    /// Spine: ordered list of manifest item ids (the reading order). An id with
    /// no manifest entry is possible here; the caller skips it.
    spine: Vec<String>,
    /// `dc:title`, if present (trimmed, never an empty string).
    title: Option<String>,
}
822
823/// Parse an OPF package document into its manifest, spine, and title.
824fn parse_opf(opf_xml: &str) -> Result<OpfParsed> {
825 use quick_xml::events::Event;
826 use quick_xml::reader::Reader;
827
828 let mut reader = Reader::from_str(opf_xml);
829 let mut buf = Vec::new();
830
831 let mut manifest = BTreeMap::new();
832 let mut spine = Vec::new();
833 let mut title: Option<String> = None;
834 // Whether we are inside the FIRST `<dc:title>` element, and the text we have
835 // accumulated for it. We accumulate across every Text/GeneralRef/CData event
836 // until the matching End so an entity, comment, or nested element inside the
837 // title does not truncate it.
838 let mut in_title = false;
839 let mut title_buf = String::new();
840
841 loop {
842 match reader.read_event_into(&mut buf) {
843 Ok(Event::Start(e)) => match local_name(e.name().as_ref()) {
844 b"item" => {
845 if let (Some(id), Some(href)) = (attr_value(&e, b"id"), attr_value(&e, b"href"))
846 {
847 manifest.insert(id, href);
848 }
849 }
850 b"itemref" => {
851 if let Some(idref) = attr_value(&e, b"idref") {
852 spine.push(idref);
853 }
854 }
855 // Only a Start (not a self-closing Empty) opens the title: an
856 // Empty `<dc:title/>` has no content and produces no End event,
857 // so latching `in_title` on it would wrongly capture the next
858 // text node (e.g. the author) as the title.
859 b"title" if title.is_none() => in_title = true,
860 _ => {}
861 },
862 // Self-closing manifest/spine entries are Empty events; the title is
863 // never captured from Empty (see the Start arm's note).
864 Ok(Event::Empty(e)) => match local_name(e.name().as_ref()) {
865 b"item" => {
866 if let (Some(id), Some(href)) = (attr_value(&e, b"id"), attr_value(&e, b"href"))
867 {
868 manifest.insert(id, href);
869 }
870 }
871 b"itemref" => {
872 if let Some(idref) = attr_value(&e, b"idref") {
873 spine.push(idref);
874 }
875 }
876 _ => {}
877 },
878 Ok(Event::End(e)) => {
879 if in_title && local_name(e.name().as_ref()) == b"title" {
880 in_title = false;
881 let s = title_buf.trim();
882 if !s.is_empty() {
883 title = Some(s.to_string());
884 }
885 }
886 }
887 Ok(Event::Text(t)) => {
888 if in_title {
889 title_buf.push_str(&String::from_utf8_lossy(&t.into_inner()));
890 }
891 }
892 // An entity (`&`) or numeric ref inside the title resolves into
893 // the accumulated value rather than truncating it.
894 Ok(Event::GeneralRef(r)) => {
895 if in_title {
896 title_buf.push_str(&resolve_entity_ref(&r));
897 }
898 }
899 // CDATA inside `<dc:title>` is literal title text.
900 Ok(Event::CData(c)) => {
901 if in_title {
902 title_buf.push_str(&String::from_utf8_lossy(&c.into_inner()));
903 }
904 }
905 Ok(Event::Eof) => break,
906 Err(e) => {
907 return Err(ExtractError::Parse {
908 format: "epub",
909 message: format!("OPF: {e}"),
910 })
911 }
912 _ => {}
913 }
914 buf.clear();
915 }
916
917 Ok(OpfParsed {
918 manifest,
919 spine,
920 title,
921 })
922}
923
/// Directory part of an OPF path inside the zip: everything before the last
/// `/` (`"OEBPS/content.opf"` → `"OEBPS"`), or the empty string when the path
/// has no `/` (`"content.opf"` → `""`). Manifest hrefs are resolved against it.
fn opf_base_dir(opf_path: &str) -> String {
    opf_path
        .rsplit_once('/')
        .map_or_else(String::new, |(dir, _file)| dir.to_owned())
}
933
934/// Join an OPF base dir with a (possibly `./`-prefixed) manifest href into a zip
935/// entry name. Forward-slash only — zip paths are always `/`-separated.
936///
937/// OPF manifest hrefs are URLs: the EPUB spec requires reserved characters
938/// (spaces, non-ASCII) to be percent-encoded, but zip entry NAMES are raw. So an
939/// href `my%20chapter.xhtml` must be percent-decoded to `my chapter.xhtml`
940/// before it can match the zip entry, or the chapter is silently dropped. We
941/// percent-decode the href and then normalize `.`/`..` segments so a relative
942/// href like `../text/ch1.xhtml` resolves against the OPF's directory.
943fn join_zip_path(base: &str, href: &str) -> String {
944 let decoded = percent_decode(href);
945 let combined = if base.is_empty() {
946 decoded
947 } else {
948 format!("{base}/{decoded}")
949 };
950 normalize_zip_path(&combined)
951}
952
/// Percent-decode a URL path component (`%20` → space, `%C3%A9` → `é`).
///
/// Works on bytes and reinterprets the result as UTF-8 (lossily), so a
/// multi-byte codepoint spread over several `%XX` escapes is reassembled. A `%`
/// that is not followed by two hex digits is copied through unchanged.
fn percent_decode(s: &str) -> String {
    /// Numeric value of one ASCII hex digit, or `None` for any other byte.
    fn hex_value(byte: u8) -> Option<u8> {
        match byte {
            b'0'..=b'9' => Some(byte - b'0'),
            b'a'..=b'f' => Some(byte - b'a' + 10),
            b'A'..=b'F' => Some(byte - b'A' + 10),
            _ => None,
        }
    }

    let mut decoded: Vec<u8> = Vec::with_capacity(s.len());
    let mut rest = s.as_bytes();
    while let Some((&first, tail)) = rest.split_first() {
        if first == b'%' {
            if let [hi, lo, after @ ..] = tail {
                if let (Some(hi), Some(lo)) = (hex_value(*hi), hex_value(*lo)) {
                    decoded.push(hi << 4 | lo);
                    rest = after;
                    continue;
                }
            }
        }
        // Ordinary byte, or a stray `%`: emit verbatim.
        decoded.push(first);
        rest = tail;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
976
/// Collapse `.` and `..` segments of a `/`-separated zip path.
///
/// Empty segments (doubled, leading or trailing `/`) and `.` are dropped; `..`
/// removes the previously kept segment. A `..` with nothing left to remove is
/// ignored, so the result never climbs above the archive root.
fn normalize_zip_path(path: &str) -> String {
    let kept = path
        .split('/')
        .fold(Vec::<&str>::new(), |mut kept, segment| {
            match segment {
                "" | "." => {}
                ".." => {
                    kept.pop();
                }
                name => kept.push(name),
            }
            kept
        });
    kept.join("/")
}
994
995// ─────────────────────────────────────────────────────────────────────────────
996// HTML — html2text + light markdown-decoration cleanup
997// ─────────────────────────────────────────────────────────────────────────────
998
999/// Extract plain text from an `.html` file.
1000fn extract_html(path: &Path) -> Result<Extracted> {
1001 let bytes = std::fs::read(path)?;
1002 let text = html_to_text(&bytes)?;
1003 Ok(Extracted::new(text, Format::Html))
1004}
1005
1006/// Flatten an HTML/XHTML byte stream to clean plain text.
1007///
1008/// Renders with [`PlainContentDecorator`] — `html2text`'s plain renderer driven
1009/// by a decorator that emits **no** link brackets and **no** `#` heading
1010/// markers, while keeping list-item markers (`*` / `N.`). This removes the two
1011/// decorations at the source instead of post-stripping them: the previous
1012/// approach blindly deleted every `[bracketed]` substring and every leading `#`
1013/// run from the rendered text, which also destroyed *literal* content —
1014/// citation markers (`[1]`, `[sic]`), code subscripts (`x[i]`), and ranking
1015/// prose (`#1 in sales`). The renderer knows which `[`/`#` it produced; literal
1016/// brackets and hashes in the source now survive untouched.
1017///
1018/// A very wide wrap width (10_000) is used so paragraphs are not hard-wrapped by
1019/// the renderer; paragraph structure comes from the source's block elements, and
1020/// final layout is canonicalized by [`normalize_text`].
1021fn html_to_text(html: &[u8]) -> Result<String> {
1022 html2text::config::with_decorator(PlainContentDecorator)
1023 .string_from_read(html, 10_000)
1024 .map_err(|e| ExtractError::Parse {
1025 format: "html",
1026 message: e.to_string(),
1027 })
1028}
1029
/// A `html2text` decorator that flattens HTML to plain text WITHOUT emitting the
/// markup that would otherwise have to be post-stripped: no `[`/`]` around link
/// text, no `#` heading prefix, no `^{…}` superscript braces. Emphasis, strong,
/// strikeout and inline-code spans likewise get empty start/end markers, and an
/// image contributes only its title text. List-item markers
/// (`* ` for unordered, `N. ` for ordered) ARE emitted — they are content-
/// faithful and match the corpus convention. Quote prefixes are kept as in the
/// stock plain decorator. This is the fix for the literal-content corruption the
/// old `strip_markdown_decorations`/`unwrap_brackets` post-pass caused.
///
/// The type is a stateless unit struct; sub-block decorators are fresh copies.
#[derive(Clone, Debug)]
struct PlainContentDecorator;
1039
// Decorator hooks for the plain-text renderer. Every inline decoration returns
// an empty string, so only the document's own characters reach the output; the
// block-level prefixes (quote, list items) are the only markers emitted.
impl html2text::render::TextDecorator for PlainContentDecorator {
    // No per-span annotations are tracked.
    type Annotation = ();

    // Links: the link text is kept by the renderer; no `[`/`]` and no URL.
    fn decorate_link_start(&mut self, _url: &str) -> (String, Self::Annotation) {
        (String::new(), ())
    }
    fn decorate_link_end(&mut self) -> String {
        String::new()
    }
    // Emphasis: no `*` markers.
    fn decorate_em_start(&self) -> (String, Self::Annotation) {
        (String::new(), ())
    }
    fn decorate_em_end(&self) -> String {
        String::new()
    }
    // Strong: no `**` markers.
    fn decorate_strong_start(&self) -> (String, Self::Annotation) {
        (String::new(), ())
    }
    fn decorate_strong_end(&self) -> String {
        String::new()
    }
    // Strikeout: no surrounding markers.
    fn decorate_strikeout_start(&self) -> (String, Self::Annotation) {
        (String::new(), ())
    }
    fn decorate_strikeout_end(&self) -> String {
        String::new()
    }
    // Inline code: no backticks.
    fn decorate_code_start(&self) -> (String, Self::Annotation) {
        (String::new(), ())
    }
    fn decorate_code_end(&self) -> String {
        String::new()
    }
    // Preformatted lines carry no annotation (the unit value).
    fn decorate_preformat_first(&self) -> Self::Annotation {}
    fn decorate_preformat_cont(&self) -> Self::Annotation {}
    fn decorate_image(&mut self, _src: &str, title: &str) -> (String, Self::Annotation) {
        // Alt/title text only — no surrounding brackets (the stock plain
        // decorator wraps it in `[...]`, which would read as literal content).
        (title.to_string(), ())
    }
    fn header_prefix(&self, _level: usize) -> String {
        // No `#` heading marker — heading text reads as plain prose.
        String::new()
    }
    // Block quotes keep a `> ` prefix.
    fn quote_prefix(&self) -> String {
        "> ".to_string()
    }
    // Unordered list items are marked `* `.
    fn unordered_item_prefix(&self) -> String {
        "* ".to_string()
    }
    // Ordered list items are marked with their number: `1. `, `2. `, …
    fn ordered_item_prefix(&self, i: i64) -> String {
        format!("{i}. ")
    }
    fn decorate_superscript_start(&self) -> (String, Self::Annotation) {
        // Plain text: no `^{…}` braces (which would corrupt literal content).
        (String::new(), ())
    }
    fn decorate_superscript_end(&self) -> String {
        String::new()
    }
    // Nested blocks render with the same (stateless) decorator.
    fn make_subblock_decorator(&self) -> Self {
        PlainContentDecorator
    }
}
1104
1105/// Strip the residual markdown decorations `html2text`'s plain renderer emits:
1106/// leading run of `#` (ATX heading markers) at the start of a line, and `[...]`
1107/// brackets around link/anchor text (the reference-style `[n]` suffix is already
1108/// gone under `plain_no_decorate`). Bullet (`*`) and ordered (`N.`) markers are
1109/// left intact — they are content, not decoration.
1110///
1111/// No longer used by [`html_to_text`] (the [`PlainContentDecorator`] now removes
1112/// these decorations at the source so literal `[brackets]`/`#hashes` survive);
1113/// retained only for its unit test documenting the old renderer's behavior.
1114#[allow(dead_code)]
1115fn strip_markdown_decorations(text: &str) -> String {
1116 let mut out = String::with_capacity(text.len());
1117 for line in text.lines() {
1118 // Strip a leading "#"-run + the single space after it (ATX heading).
1119 let trimmed = line.trim_start();
1120 let after_hashes = trimmed.trim_start_matches('#');
1121 let line = if after_hashes.len() != trimmed.len() {
1122 // It was a heading line: keep indentation-free heading text.
1123 after_hashes.trim_start()
1124 } else {
1125 line
1126 };
1127 out.push_str(&unwrap_brackets(line));
1128 out.push('\n');
1129 }
1130 out
1131}
1132
/// Replace every `[inner]` with `inner` in a single left-to-right pass.
///
/// Each `[` is paired with the next `]` after it; both are dropped and the text
/// between them is kept verbatim (including any further `[`). A `[` with no `]`
/// after it is emitted literally together with the rest of the line. A `]`
/// that is not closing a pair is left in place.
///
/// No longer used by [`html_to_text`] (see [`strip_markdown_decorations`]);
/// retained only for its unit test.
#[allow(dead_code)]
fn unwrap_brackets(line: &str) -> String {
    let mut flattened = String::with_capacity(line.len());
    let mut rest = line;
    while let Some(open) = rest.find('[') {
        flattened.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];
        match after_open.find(']') {
            Some(close) => {
                flattened.push_str(&after_open[..close]);
                rest = &after_open[close + 1..];
            }
            None => {
                // Unmatched '[': keep it and everything after it.
                flattened.push('[');
                flattened.push_str(after_open);
                rest = "";
            }
        }
    }
    flattened.push_str(rest);
    flattened
}
1170
1171// ─────────────────────────────────────────────────────────────────────────────
1172// Shared zip helpers (docx + epub)
1173// ─────────────────────────────────────────────────────────────────────────────
1174
1175/// Open a zip archive from a reader, mapping any failure to a typed
1176/// [`ExtractError::Parse`] tagged with the calling format.
1177fn open_zip<R: Read + std::io::Seek>(
1178 reader: R,
1179 format: &'static str,
1180) -> Result<zip::ZipArchive<R>> {
1181 zip::ZipArchive::new(reader).map_err(|e| ExtractError::Parse {
1182 format,
1183 message: format!("not a valid zip container: {e}"),
1184 })
1185}
1186
/// Cap on a single decompressed zip entry: 256 MiB (268,435,456 bytes).
/// docx/epub members are XML text — a
/// member that inflates past this ceiling is a decompression bomb or corruption,
/// not real evidence. `sources/` is untrusted input, so bound the read rather
/// than let `read_to_end` follow a hostile DEFLATE stream until OOM.
/// Enforced by [`read_zip_entry`] against both the declared and the actual size.
const MAX_ZIP_ENTRY_BYTES: u64 = 256 * 1024 * 1024;
1192
1193/// Read a single zip entry to a UTF-8 string, bounded by [`MAX_ZIP_ENTRY_BYTES`]
1194/// so a zip-bomb member cannot exhaust memory. A missing entry, an over-cap
1195/// entry, or a read failure is a typed [`ExtractError::Parse`]; invalid UTF-8 is
1196/// lossily decoded (OOXML / XHTML are declared UTF-8, but we never panic on a
1197/// stray byte).
1198fn read_zip_entry<R: Read + std::io::Seek>(
1199 archive: &mut zip::ZipArchive<R>,
1200 name: &str,
1201 format: &'static str,
1202) -> Result<String> {
1203 let entry = archive.by_name(name).map_err(|e| ExtractError::Parse {
1204 format,
1205 message: format!("missing zip entry {name:?}: {e}"),
1206 })?;
1207 // Reject up front when the central directory declares an over-cap size...
1208 let declared = entry.size();
1209 if declared > MAX_ZIP_ENTRY_BYTES {
1210 return Err(ExtractError::Parse {
1211 format,
1212 message: format!(
1213 "zip entry {name:?} declares {declared} bytes, over the {MAX_ZIP_ENTRY_BYTES}-byte cap"
1214 ),
1215 });
1216 }
1217 // ...and bound the actual decompressed read so a lying header (a bomb that
1218 // understates its uncompressed size) still cannot allocate past the cap.
1219 let mut bytes = Vec::new();
1220 entry
1221 .take(MAX_ZIP_ENTRY_BYTES + 1)
1222 .read_to_end(&mut bytes)
1223 .map_err(|e| ExtractError::Parse {
1224 format,
1225 message: format!("reading {name:?}: {e}"),
1226 })?;
1227 if bytes.len() as u64 > MAX_ZIP_ENTRY_BYTES {
1228 return Err(ExtractError::Parse {
1229 format,
1230 message: format!(
1231 "zip entry {name:?} exceeds the {MAX_ZIP_ENTRY_BYTES}-byte cap (decompression bomb?)"
1232 ),
1233 });
1234 }
1235 Ok(String::from_utf8_lossy(&bytes).into_owned())
1236}
1237
1238/// Look up a start/empty element's attribute value by local name, returning it
1239/// unescaped as an owned `String`. Prefix-agnostic on the attribute key.
1240fn attr_value(elem: &quick_xml::events::BytesStart<'_>, key: &[u8]) -> Option<String> {
1241 elem.attributes().flatten().find_map(|attr| {
1242 if local_name(attr.key.as_ref()) == key {
1243 // `unescape_value` returns an XML-unescaped `Cow<str>` — exactly the
1244 // owned attribute text we want. It is soft-deprecated in quick-xml
1245 // 0.40 in favor of `normalized_value(XmlVersion)`, whose extra
1246 // version arg and byte-Cow return buy us nothing here; the simple
1247 // form is correct for the UTF-8 OOXML/OPF attributes we read.
1248 #[allow(deprecated)]
1249 attr.unescape_value().ok().map(|cow| cow.into_owned())
1250 } else {
1251 None
1252 }
1253 })
1254}
1255
1256#[cfg(test)]
1257mod tests {
1258 use super::*;
1259 use std::path::PathBuf;
1260
1261 /// Absolute path to a corpus-c-formats fixture under `sources/docs/`.
1262 fn fixture(name: &str) -> PathBuf {
1263 PathBuf::from(env!("CARGO_MANIFEST_DIR"))
1264 .join("../../tests/corpora/corpus-c-formats/sources/docs")
1265 .join(name)
1266 }
1267
1268 /// Read the known-good `.txt` sibling of a fixture.
1269 fn expected(name: &str) -> String {
1270 std::fs::read_to_string(fixture(&format!("{name}.txt"))).unwrap()
1271 }
1272
1273 /// Token-level normalization: collapse every run of whitespace (incl.
1274 /// newlines) to one space and trim. This is the corpus's recommended,
1275 /// layout-agnostic comparison ("same words, same order").
1276 fn tokens(s: &str) -> String {
1277 s.split_whitespace().collect::<Vec<_>>().join(" ")
1278 }
1279
1280 /// The sorted set of non-blank, token-normalized lines — order-agnostic
1281 /// content comparison (used where extractor reading-order legitimately
1282 /// differs, e.g. multi-column PDF).
1283 fn line_set(s: &str) -> Vec<String> {
1284 let mut v: Vec<String> = s.lines().map(tokens).filter(|l| !l.is_empty()).collect();
1285 v.sort();
1286 v
1287 }
1288
1289 // ── format detection ────────────────────────────────────────────────────
1290
1291 #[test]
1292 fn detects_format_by_extension_case_insensitively() {
1293 assert_eq!(Format::from_path(Path::new("a.pdf")), Some(Format::Pdf));
1294 assert_eq!(Format::from_path(Path::new("a.PDF")), Some(Format::Pdf));
1295 assert_eq!(Format::from_path(Path::new("a.docx")), Some(Format::Docx));
1296 assert_eq!(
1297 Format::from_path(Path::new("a.xlsx")),
1298 Some(Format::Spreadsheet)
1299 );
1300 assert_eq!(
1301 Format::from_path(Path::new("a.ods")),
1302 Some(Format::Spreadsheet)
1303 );
1304 assert_eq!(Format::from_path(Path::new("a.epub")), Some(Format::Epub));
1305 assert_eq!(Format::from_path(Path::new("a.html")), Some(Format::Html));
1306 assert_eq!(Format::from_path(Path::new("a.htm")), Some(Format::Html));
1307 assert_eq!(Format::from_path(Path::new("a.txt")), None);
1308 assert_eq!(Format::from_path(Path::new("noext")), None);
1309 }
1310
1311 #[test]
1312 fn unsupported_extension_is_typed_error() {
1313 let err = extract(Path::new("/tmp/whatever.txt")).unwrap_err();
1314 assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e == "txt"));
1315 assert_eq!(err.code(), "UNSUPPORTED_FORMAT");
1316 }
1317
1318 #[test]
1319 fn missing_extension_is_unsupported() {
1320 let err = extract(Path::new("/tmp/noext")).unwrap_err();
1321 assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e.is_empty()));
1322 }
1323
1324 // ── normalization ─────────────────────────────────────────────────────────
1325
1326 #[test]
1327 fn normalize_collapses_blanks_and_trims() {
1328 let raw = "\r\n\r\nHeading\r\n\r\n\r\n\r\nBody line \r\n\r\n";
1329 assert_eq!(normalize_text(raw), "Heading\n\nBody line\n");
1330 }
1331
1332 #[test]
1333 fn normalize_empty_stays_empty() {
1334 assert_eq!(normalize_text(""), "");
1335 assert_eq!(normalize_text(" \n\n \n"), "");
1336 }
1337
1338 // ── per-format extraction against corpus-c fixtures ───────────────────────
1339
1340 #[test]
1341 fn extract_text_pdf_matches_known_good() {
1342 let got = extract(&fixture("text.pdf")).unwrap();
1343 assert_eq!(got.metadata["format"], MetaValue::Str("pdf".into()));
1344 assert_eq!(got.metadata["pages"], MetaValue::Num(1));
1345 assert_eq!(tokens(&got.text), tokens(&expected("text.pdf")));
1346 }
1347
1348 #[test]
1349 fn extract_weird_fonts_pdf_matches_known_good() {
1350 let got = extract(&fixture("weird-fonts.pdf")).unwrap();
1351 assert_eq!(tokens(&got.text), tokens(&expected("weird-fonts.pdf")));
1352 }
1353
1354 #[test]
1355 fn extract_multi_column_pdf_matches_content_order_agnostic() {
1356 // pdf-extract reads column-by-column; the known-good `.txt` captures the
1357 // interleaved (pdftotext) order. Both carry identical content — assert
1358 // the line SET, not the order. (README § multi-column.)
1359 let got = extract(&fixture("multi-column.pdf")).unwrap();
1360 assert_eq!(line_set(&got.text), line_set(&expected("multi-column.pdf")));
1361 }
1362
1363 #[test]
1364 fn extract_image_only_pdf_yields_empty() {
1365 // No text layer → empty out, never hallucinated text. OCR out of scope.
1366 let got = extract(&fixture("image-only.pdf")).unwrap();
1367 assert_eq!(got.text, "");
1368 assert!(expected("image-only.pdf").trim().is_empty());
1369 }
1370
1371 #[test]
1372 fn extract_encrypted_pdf_without_password_refuses_cleanly() {
1373 let err = extract(&fixture("encrypted.pdf")).unwrap_err();
1374 assert!(
1375 matches!(err, ExtractError::Encrypted(_)),
1376 "expected Encrypted, got {err:?}"
1377 );
1378 assert_eq!(err.code(), "DOCUMENT_ENCRYPTED");
1379 }
1380
1381 #[test]
1382 fn guard_pdf_panic_contains_unwind_as_parse_error() {
1383 // The "never panics" contract: an internal pdf-extract/lopdf panic must
1384 // surface as a typed ExtractError::Parse, not abort the process. (cargo
1385 // captures the unwind's stderr line for a passing test.)
1386 let contained: Result<()> = guard_pdf_panic(|| panic!("simulated pdf-extract abort"));
1387 assert!(
1388 matches!(contained, Err(ExtractError::Parse { format: "pdf", .. })),
1389 "panic must be contained as a pdf Parse error, got {contained:?}"
1390 );
1391 // The success path is transparent — the value passes straight through.
1392 let ok: Result<u32> = guard_pdf_panic(|| 42);
1393 assert_eq!(ok.unwrap(), 42);
1394 }
1395
1396 #[test]
1397 fn extract_docx_matches_known_good() {
1398 let got = extract(&fixture("sample.docx")).unwrap();
1399 assert_eq!(got.metadata["format"], MetaValue::Str("docx".into()));
1400 assert_eq!(tokens(&got.text), tokens(&expected("sample.docx")));
1401 }
1402
1403 #[test]
1404 fn extract_xlsx_matches_known_good() {
1405 let got = extract(&fixture("sample.xlsx")).unwrap();
1406 assert_eq!(got.metadata["format"], MetaValue::Str("spreadsheet".into()));
1407 assert_eq!(got.metadata["sheets"], MetaValue::Num(1));
1408 assert_eq!(
1409 got.metadata["sheet_names"],
1410 MetaValue::Str("Expenses".into())
1411 );
1412 // Tab-separated, integers without `.0` — exact match (no soft-wrap risk).
1413 assert_eq!(got.text.trim_end(), expected("sample.xlsx").trim_end());
1414 }
1415
1416 #[test]
1417 fn extract_epub_matches_known_good() {
1418 let got = extract(&fixture("sample.epub")).unwrap();
1419 assert_eq!(got.metadata["format"], MetaValue::Str("epub".into()));
1420 assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
1421 assert_eq!(
1422 got.metadata["title"],
1423 MetaValue::Str("Operations Playbook".into())
1424 );
1425 assert_eq!(tokens(&got.text), tokens(&expected("sample.epub")));
1426 }
1427
1428 #[test]
1429 fn extract_html_matches_known_good() {
1430 let got = extract(&fixture("sample.html")).unwrap();
1431 assert_eq!(got.metadata["format"], MetaValue::Str("html".into()));
1432 assert_eq!(tokens(&got.text), tokens(&expected("sample.html")));
1433 }
1434
1435 // ── helper-level unit tests ───────────────────────────────────────────────
1436
1437 #[test]
1438 fn unwrap_brackets_flattens_link_text() {
1439 assert_eq!(
1440 unwrap_brackets("contact [ops@acme.example] or the [handbook]."),
1441 "contact ops@acme.example or the handbook."
1442 );
1443 // Unmatched '[' is preserved.
1444 assert_eq!(unwrap_brackets("a [b c"), "a [b c");
1445 // No brackets → untouched.
1446 assert_eq!(unwrap_brackets("plain text"), "plain text");
1447 }
1448
1449 #[test]
1450 fn strip_markdown_decorations_drops_heading_hashes() {
1451 let input = "# Title\n## Section\n* bullet\n1. ordered\nplain\n";
1452 let out = strip_markdown_decorations(input);
1453 assert_eq!(out, "Title\nSection\n* bullet\n1. ordered\nplain\n");
1454 }
1455
1456 #[test]
1457 fn local_name_strips_prefix() {
1458 assert_eq!(local_name(b"w:t"), b"t");
1459 assert_eq!(local_name(b"t"), b"t");
1460 assert_eq!(local_name(b"dc:title"), b"title");
1461 }
1462
1463 #[test]
1464 fn extracted_serializes_to_text_metadata_json() {
1465 let got = extract(&fixture("sample.xlsx")).unwrap();
1466 let json = serde_json::to_value(&got).unwrap();
1467 assert!(json.get("text").is_some());
1468 assert_eq!(json["metadata"]["format"], "spreadsheet");
1469 assert_eq!(json["metadata"]["sheets"], 1);
1470 // MetaValue::Num serializes as a bare JSON number, Str as a bare string.
1471 assert!(json["metadata"]["sheets"].is_number());
1472 assert!(json["metadata"]["format"].is_string());
1473 }
1474
1475 // ── regression: leading-blank normalization is linear (finding #13) ────────
1476
1477 /// `normalize_text` must trim leading blank lines in O(n), not O(n²). The
1478 /// pre-fix loop used `lines.remove(0)` per blank line — O(n) shift each, so a
1479 /// document dominated by leading blanks took O(n²) and hung extraction.
1480 ///
1481 /// 500_000 leading blank lines is ~2.5e11 element shifts under the old code
1482 /// (minutes-to-hours, effectively a hang) but instant under the index-and-
1483 /// slice path; the test reconstructs the finding's trigger (an adapter output
1484 /// that is mostly leading blanks then one line of text) and asserts the
1485 /// correct, fully-trimmed result. Against the pre-fix code this test does not
1486 /// complete in a reasonable time — encoding the quadratic regression.
1487 #[test]
1488 fn regression_normalize_text_leading_blanks_is_linear() {
1489 let blanks = "\n".repeat(500_000);
1490 let raw = format!("{blanks}only real line\n");
1491 // Leading blanks fully trimmed; single trailing newline; body intact.
1492 assert_eq!(normalize_text(&raw), "only real line\n");
1493
1494 // A wholly-blank giant input still collapses to empty (the other branch).
1495 assert_eq!(normalize_text(&" \n".repeat(500_000)), "");
1496 }
1497
1498 // ── regression: spreadsheet dense-grid bomb is refused (finding #4) ────────
1499
1500 /// Build a VALID `.xlsx` whose single sheet declares two real cells at the
1501 /// opposite corners of Excel's grid (`A1` and `XFD1048576`). `calamine`
1502 /// materializes a sheet as a DENSE `Vec<Data>` sized from the MIN/MAX cell
1503 /// positions, so this two-cell sheet would force a ~1.7e10-element (~400 GB)
1504 /// allocation and abort the process. We reuse the corpus `sample.xlsx`
1505 /// container verbatim and swap ONLY `xl/worksheets/sheet1.xml`, so every
1506 /// other part (workbook, rels, content-types) is a real, openable workbook.
1507 fn write_dense_bomb_xlsx(dest: &Path) {
1508 use std::io::Write;
1509
1510 let base = std::fs::read(fixture("sample.xlsx")).expect("corpus sample.xlsx exists");
1511 let mut archive =
1512 zip::ZipArchive::new(std::io::Cursor::new(base)).expect("sample.xlsx is a valid zip");
1513
1514 let bomb_sheet = b"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\
1515<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\">\
1516<sheetData>\
1517<row r=\"1\"><c r=\"A1\"><v>1</v></c></row>\
1518<row r=\"1048576\"><c r=\"XFD1048576\"><v>2</v></c></row>\
1519</sheetData></worksheet>";
1520
1521 let out = std::fs::File::create(dest).unwrap();
1522 let mut writer = zip::ZipWriter::new(out);
1523 let opts = zip::write::SimpleFileOptions::default()
1524 .compression_method(zip::CompressionMethod::Stored);
1525
1526 for i in 0..archive.len() {
1527 let entry = archive.by_index(i).unwrap();
1528 let name = entry.name().to_string();
1529 if name == "xl/worksheets/sheet1.xml" {
1530 writer.start_file(name, opts).unwrap();
1531 writer.write_all(bomb_sheet).unwrap();
1532 } else {
1533 // Copy every other entry's already-compressed bytes verbatim.
1534 writer.raw_copy_file(entry).unwrap();
1535 }
1536 }
1537 writer.finish().unwrap();
1538 }
1539
1540 /// A spreadsheet whose declared dense grid exceeds [`MAX_SPREADSHEET_CELLS`]
1541 /// is refused with a typed [`ExtractError::Parse`] BEFORE calamine allocates
1542 /// the dense matrix — never an OOM/abort. Pre-fix, `extract_spreadsheet`
1543 /// called `worksheet_range` directly and the process aborted on the
1544 /// allocation; this test would not return (it would kill the test runner),
1545 /// so it encodes the resource-exhaustion regression.
1546 #[test]
1547 fn regression_spreadsheet_dense_bomb_refused_not_oom() {
1548 let tmp = tempfile::TempDir::new().unwrap();
1549 let bomb = tmp.path().join("invoice.xlsx");
1550 write_dense_bomb_xlsx(&bomb);
1551
1552 // A few-hundred-byte file on disk — the whole point of the bomb.
1553 assert!(
1554 std::fs::metadata(&bomb).unwrap().len() < 10_000,
1555 "the bomb must be tiny on disk; the danger is the in-memory expansion"
1556 );
1557
1558 let err = extract(&bomb).unwrap_err();
1559 assert!(
1560 matches!(
1561 err,
1562 ExtractError::Parse {
1563 format: "spreadsheet",
1564 ..
1565 }
1566 ),
1567 "an over-cap dense grid must be a typed spreadsheet Parse refusal, got {err:?}"
1568 );
1569 assert_eq!(err.code(), "EXTRACT_PARSE_ERROR");
1570 }
1571
1572 /// The cap is a guard, not a wall: a normal spreadsheet still extracts. Locks
1573 /// down that the preflight bound does not regress the legitimate path (the
1574 /// corpus `sample.xlsx` is a 3×3 grid, far under the cap).
1575 #[test]
1576 fn regression_spreadsheet_cap_allows_real_workbook() {
1577 let got = extract(&fixture("sample.xlsx")).unwrap();
1578 assert_eq!(got.metadata["sheets"], MetaValue::Num(1));
1579 assert!(!got.text.is_empty());
1580 }
1581
1582 // ── regression: entity-ref / CDATA fidelity (findings #34, #1011) ──────────
1583
1584 /// Build a minimal valid `.docx` whose `word/document.xml` body is the given
1585 /// run XML, written to `dest`. Only the three OOXML members `extract_docx`
1586 /// touches need to be real; the rest of a Word package is optional for text
1587 /// extraction.
1588 fn write_docx(dest: &Path, body_runs: &str) {
1589 use std::io::Write;
1590 let document = format!(
1591 "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\
1592<w:document xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\">\
1593<w:body>{body_runs}</w:body></w:document>"
1594 );
1595 let file = std::fs::File::create(dest).unwrap();
1596 let mut writer = zip::ZipWriter::new(file);
1597 let opts = zip::write::SimpleFileOptions::default()
1598 .compression_method(zip::CompressionMethod::Stored);
1599 writer.start_file("word/document.xml", opts).unwrap();
1600 writer.write_all(document.as_bytes()).unwrap();
1601 writer.finish().unwrap();
1602 }
1603
1604 #[test]
1605 fn regression_docx_resolves_entity_refs() {
1606 // quick-xml 0.40 surfaces `&`/`<`/`>`/`—` as separate
1607 // GeneralRef events; pre-fix they were routed to `_ => {}` and dropped,
1608 // corrupting `Smith & Co invoice <final> total — 100`.
1609 let tmp = tempfile::TempDir::new().unwrap();
1610 let f = tmp.path().join("entity.docx");
1611 write_docx(
1612 &f,
1613 "<w:p><w:r><w:t>Smith & Co invoice <final> total — 100</w:t></w:r></w:p>",
1614 );
1615 let got = extract(&f).unwrap();
1616 assert_eq!(got.text, "Smith & Co invoice <final> total — 100\n");
1617 }
1618
/// Regression guard: a CDATA section inside `<w:t>` is valid, literal run
/// text. Pre-fix it fell through the wildcard arm and the payload vanished.
#[test]
fn regression_docx_preserves_cdata_run_text() {
    // Three paragraphs; the middle one carries its text in a CDATA section.
    let body = concat!(
        "<w:p><w:r><w:t>Line A.</w:t></w:r></w:p>",
        "<w:p><w:r><w:t><![CDATA[IMPORTANT CDATA CONTENT]]></w:t></w:r></w:p>",
        "<w:p><w:r><w:t>Line C.</w:t></w:r></w:p>",
    );

    let dir = tempfile::TempDir::new().unwrap();
    let docx = dir.path().join("cdata.docx");
    write_docx(&docx, body);

    let extracted = extract(&docx).unwrap();
    assert_eq!(extracted.text, "Line A.\nIMPORTANT CDATA CONTENT\nLine C.\n");
}
1634
/// `resolve_entity_ref` maps the five predefined XML entities and both
/// decimal and hexadecimal character references to their characters.
#[test]
fn resolve_entity_ref_maps_named_and_numeric() {
    use quick_xml::events::BytesRef;

    // (reference name without `&`/`;`, expected resolution)
    let cases: [(&'static str, &str); 8] = [
        ("amp", "&"),
        ("lt", "<"),
        ("gt", ">"),
        ("quot", "\""),
        ("apos", "'"),
        ("#8212", "—"),
        ("#x2014", "—"),
        // Unknown named entity → bare name (best-effort, never a panic).
        ("nbsp", "nbsp"),
    ];

    for (name, expected) in cases {
        let resolved = resolve_entity_ref(&BytesRef::new(name));
        assert_eq!(resolved, expected, "entity reference {name:?}");
    }
}
1649
1650 // ── regression: EPUB OPF parsing (findings #35, #37, #1012) ────────────────
1651
1652 /// Build a minimal valid EPUB at `dest`. `opf_metadata` is spliced verbatim
1653 /// inside `<metadata>`; `manifest_href` is the chapter item's href; the
1654 /// chapter XHTML is stored under the literal zip entry `chapter_entry`. The
1655 /// mimetype member is written first and stored (per the EPUB OCF spec).
1656 fn write_epub(dest: &Path, opf_metadata: &str, manifest_href: &str, chapter_entry: &str) {
1657 use std::io::Write;
1658 let container = "<?xml version=\"1.0\"?>\
1659<container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\
1660<rootfiles><rootfile full-path=\"OEBPS/content.opf\" \
1661media-type=\"application/oebps-package+xml\"/></rootfiles></container>";
1662 let opf = format!(
1663 "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
1664<package xmlns=\"http://www.idpf.org/2007/opf\" version=\"3.0\" unique-identifier=\"id\">\
1665<metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\">{opf_metadata}</metadata>\
1666<manifest><item id=\"c1\" href=\"{manifest_href}\" media-type=\"application/xhtml+xml\"/></manifest>\
1667<spine><itemref idref=\"c1\"/></spine></package>"
1668 );
1669 let chapter = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
1670<html xmlns=\"http://www.w3.org/1999/xhtml\"><body>\
1671<p>Hello world body text.</p></body></html>";
1672
1673 let file = std::fs::File::create(dest).unwrap();
1674 let mut writer = zip::ZipWriter::new(file);
1675 let stored = zip::write::SimpleFileOptions::default()
1676 .compression_method(zip::CompressionMethod::Stored);
1677 // mimetype must be the first member and stored uncompressed.
1678 writer.start_file("mimetype", stored).unwrap();
1679 writer.write_all(b"application/epub+zip").unwrap();
1680 writer.start_file("META-INF/container.xml", stored).unwrap();
1681 writer.write_all(container.as_bytes()).unwrap();
1682 writer.start_file("OEBPS/content.opf", stored).unwrap();
1683 writer.write_all(opf.as_bytes()).unwrap();
1684 writer.start_file(chapter_entry, stored).unwrap();
1685 writer.write_all(chapter.as_bytes()).unwrap();
1686 writer.finish().unwrap();
1687 }
1688
/// Regression guard: the whole content of `<dc:title>` becomes the title.
///
/// Pre-fix the title was cut at the first Text node, so an entity reference
/// or a comment inside `<dc:title>` truncated it. Two EPUBs are checked: one
/// whose title contains entity references, one whose title contains a comment.
#[test]
fn regression_epub_title_accumulates_entities_and_nested_events() {
    let tmp = tempfile::TempDir::new().unwrap();

    // Case 1: entity references. They must be written in escaped form — a bare
    // `&` or `<Tale>` would make the OPF malformed and would not split the
    // title into the Text / reference events this test is about.
    let f1 = tmp.path().join("entity.epub");
    write_epub(
        &f1,
        "<dc:title>Smith &amp; Jones: A &lt;Tale&gt;</dc:title>",
        "chapter.xhtml",
        "OEBPS/chapter.xhtml",
    );
    let got = extract(&f1).unwrap();
    assert_eq!(
        got.metadata["title"],
        MetaValue::Str("Smith & Jones: A <Tale>".into())
    );

    // Case 2: a comment in the middle of the title. The comment is dropped and
    // the text on both sides of it is kept.
    let f2 = tmp.path().join("comment.epub");
    write_epub(
        &f2,
        "<dc:title>Part One<!-- editorial --> and Part Two</dc:title>",
        "chapter.xhtml",
        "OEBPS/chapter.xhtml",
    );
    let got = extract(&f2).unwrap();
    assert_eq!(
        got.metadata["title"],
        MetaValue::Str("Part One and Part Two".into())
    );
}
1721
/// Regression guard: a self-closing `<dc:title/>` (an untitled book) must NOT
/// latch the next text node — here the author — as the title.
#[test]
fn regression_epub_self_closing_title_does_not_capture_author() {
    let dir = tempfile::TempDir::new().unwrap();
    let epub = dir.path().join("empty-title.epub");
    let opf_metadata = "<dc:title/><dc:creator>John Doe</dc:creator>";
    write_epub(&epub, opf_metadata, "chapter.xhtml", "OEBPS/chapter.xhtml");

    let extracted = extract(&epub).unwrap();

    // No (or empty) title — never the author. `put_str` omits empty values,
    // so the key must be absent altogether.
    let title = extracted.metadata.get("title");
    assert!(
        title.is_none(),
        "self-closing title must not capture the author, got {:?}",
        title
    );

    // The chapter itself is still extracted.
    assert_eq!(extracted.metadata["chapters"], MetaValue::Num(1));
}
1744
/// Regression guard: a percent-encoded manifest href must resolve to the
/// decoded zip entry name.
///
/// The href `my%20chapter.xhtml` has to match the entry
/// `OEBPS/my chapter.xhtml`; pre-fix the lookup failed and the chapter was
/// silently dropped (empty text, 0 chapters).
#[test]
fn regression_epub_percent_encoded_href_resolves() {
    let dir = tempfile::TempDir::new().unwrap();
    let epub = dir.path().join("spaced.epub");
    let href = "my%20chapter.xhtml";
    let entry = "OEBPS/my chapter.xhtml";
    write_epub(&epub, "<dc:title>Spaced</dc:title>", href, entry);

    let extracted = extract(&epub).unwrap();
    assert_eq!(extracted.metadata["chapters"], MetaValue::Num(1));

    let has_body = extracted.text.contains("Hello world body text.");
    assert!(
        has_body,
        "percent-encoded-href chapter must extract, got {:?}",
        extracted.text
    );
}
1766
/// `percent_decode` decodes `%XX` escapes (including multi-byte UTF-8) and
/// leaves everything else untouched.
#[test]
fn percent_decode_handles_spaces_and_unicode_and_stray_percent() {
    // (input, expected output)
    let cases = [
        ("my%20chapter.xhtml", "my chapter.xhtml"),
        // `%C3%A9` is UTF-8 for `é`.
        ("caf%C3%A9.xhtml", "café.xhtml"),
        // A stray `%` not followed by two hex digits is emitted verbatim.
        ("100%done", "100%done"),
        ("plain.xhtml", "plain.xhtml"),
    ];

    for (encoded, expected) in cases {
        assert_eq!(percent_decode(encoded), expected, "decoding {encoded:?}");
    }
}
1776
/// `normalize_zip_path` collapses `..` and `.` segments and leaves an
/// already-normal path unchanged.
#[test]
fn normalize_zip_path_resolves_dot_segments() {
    // (input path, expected normalized path)
    let cases = [
        ("OEBPS/../text/ch1.xhtml", "text/ch1.xhtml"),
        ("OEBPS/./ch1.xhtml", "OEBPS/ch1.xhtml"),
        ("OEBPS/ch1.xhtml", "OEBPS/ch1.xhtml"),
    ];

    for (raw, expected) in cases {
        assert_eq!(normalize_zip_path(raw), expected, "normalizing {raw:?}");
    }
}
1786
1787 // ── regression: spreadsheet date rendering (finding #1013) ─────────────────
1788
/// `render_excel_datetime` prints calendar values in ISO form and keeps the
/// raw serial number for durations.
#[test]
fn render_excel_datetime_renders_iso_not_serial() {
    use calamine::{ExcelDateTime, ExcelDateTimeType};

    // 46188 → 2026-06-15 (date only, midnight → no time component).
    assert_eq!(
        render_excel_datetime(&ExcelDateTime::new(
            46188.0,
            ExcelDateTimeType::DateTime,
            false
        )),
        "2026-06-15"
    );

    // 46143.5 → 2026-05-01 12:00:00 (has a time component).
    assert_eq!(
        render_excel_datetime(&ExcelDateTime::new(
            46143.5,
            ExcelDateTimeType::DateTime,
            false
        )),
        "2026-05-01 12:00:00"
    );

    // A duration is elapsed time, not a calendar date → keep the serial form.
    assert_eq!(
        render_excel_datetime(&ExcelDateTime::new(
            1.5,
            ExcelDateTimeType::TimeDelta,
            false
        )),
        "1.5"
    );
}
1802
/// `render_cell` prints a date cell in ISO form; the float, integer, and
/// string paths are unchanged by the date fix.
#[test]
fn render_cell_dates_are_iso() {
    use calamine::{Data, ExcelDateTime, ExcelDateTimeType};

    // Serial 46188 is the date 2026-06-15.
    let serial = ExcelDateTime::new(46188.0, ExcelDateTimeType::DateTime, false);
    let date_cell = Data::DateTime(serial);
    assert_eq!(render_cell(&date_cell), "2026-06-15");

    // A whole-number float renders without a fractional part.
    let float_cell = Data::Float(3450.0);
    assert_eq!(render_cell(&float_cell), "3450");

    let int_cell = Data::Int(7);
    assert_eq!(render_cell(&int_cell), "7");
}
1818
1819 // ── regression: HTML/EPUB literal-content fidelity (finding #36) ───────────
1820
1821 /// Render an HTML body string through the production extract path.
1822 fn html_text(body: &str) -> String {
1823 let tmp = tempfile::TempDir::new().unwrap();
1824 let f = tmp.path().join("doc.html");
1825 std::fs::write(&f, format!("<html><body>{body}</body></html>")).unwrap();
1826 extract(&f).unwrap().text
1827 }
1828
/// Regression guard: literal `[...]` and `#` characters in HTML prose survive
/// extraction.
///
/// Pre-fix every `[bracketed]` substring and every leading-`#` run was
/// stripped from real prose, fusing `total[net]` into `totalnet` and deleting
/// the `#` from `#1 in sales`.
#[test]
fn regression_html_keeps_literal_brackets_and_hashes() {
    let out = html_text(concat!(
        "<p>#1 in sales this quarter</p>",
        "<p>see chart[3] for data, array[0] = total[net]</p>",
    ));
    for expected in [
        "#1 in sales this quarter",
        "see chart[3] for data, array[0] = total[net]",
    ] {
        assert!(out.contains(expected), "got {out:?}");
    }

    // Citation markers and subscripts survive intact.
    let out = html_text("<p>See note [1] and [sic] here.</p><p>x[i] + y[j]</p>");
    for expected in ["See note [1] and [sic] here.", "x[i] + y[j]"] {
        assert!(out.contains(expected), "got {out:?}");
    }
}
1849
/// A real `<h1>` heading renders WITHOUT a `#` marker (the renderer emits no
/// heading prefix now), so headings read as plain prose.
#[test]
fn html_headings_render_as_plain_prose_no_hash() {
    let markup = "<h1>Launch Plan</h1><p>Body prose.</p>";
    let out = html_text(markup);

    // The heading text itself is kept.
    assert!(out.contains("Launch Plan"), "got {out:?}");

    // No `#` appears anywhere in the rendered output.
    let has_marker = out.chars().any(|c| c == '#');
    assert!(!has_marker, "no heading marker expected, got {out:?}");
}
1861
/// Link display text renders bare: the surrounding `[...]` the stock plain
/// decorator would add is gone, so the sentence reads straight through.
#[test]
fn html_links_render_as_bare_text_no_brackets() {
    let markup = "<p>See the <a href=\"https://x.example\">handbook</a>.</p>";
    let out = html_text(markup);
    let reads_as_prose = out.contains("See the handbook.");
    assert!(reads_as_prose, "got {out:?}");
}
1869}