dbmd_core/
extract.rs

1//! Document text extraction — the `dbmd extract` engine.
2//!
3//! `sources/` is where raw evidence lands: invoices, contracts, reports,
4//! exports. Most of it arrives as binary documents (PDF, Word, Excel, EPUB) or
5//! HTML, not markdown. Before an agent can reason over that evidence — wiki-link
6//! it, summarize it into the wiki layer, file a typed record that cites it — the
7//! text has to come out. This module is that step: a binary document in, plain
8//! UTF-8 text out, format chosen by file extension.
9//!
10//! # What this is, and is not
11//!
12//! - **Deterministic decoders only.** Every adapter is a format parser
13//!   (`pdf-extract`, `calamine`, `html2text`, `quick-xml`+`zip`). There is **no
14//!   AI, no OCR, no embeddings** here — consistent with the crate-wide invariant
15//!   (`lib.rs`). The agent driving `dbmd` is the semantic layer; this is plumbing.
16//! - **Text layer, not pixels.** A scanned PDF with no text layer yields the
17//!   empty string — *empty in, empty out, never hallucinated text.* OCR is an
18//!   explicit non-goal (a future `dbmd-ocr`).
19//! - **Single document, single call.** [`extract`] handles one file. Walking a
20//!   store and extracting every document is the caller's loop, not this module's.
21//!
22//! # Format dispatch
23//!
24//! [`Format::from_path`] maps the file extension to an adapter; [`extract`]
25//! dispatches:
26//!
27//! | Extension                | Format            | Adapter                          |
28//! |--------------------------|-------------------|----------------------------------|
29//! | `.pdf`                   | [`Format::Pdf`]   | `pdf-extract`                    |
30//! | `.docx`                  | [`Format::Docx`]  | `zip` + `quick-xml` (`w:t` runs) |
31//! | `.xlsx` / `.xlsm` / `.xlsb` / `.ods` | [`Format::Spreadsheet`] | `calamine` |
32//! | `.epub`                  | [`Format::Epub`]  | `zip` + `quick-xml` + `html2text`|
33//! | `.html` / `.htm` / `.xhtml` | [`Format::Html`] | `html2text`                    |
34//!
35//! Anything else is [`ExtractError::UnsupportedFormat`] — a typed refusal the
36//! CLI surfaces with a stable code, never a panic.
37
38use std::collections::BTreeMap;
39use std::io::Read;
40use std::path::Path;
41
42use serde::Serialize;
43
44/// The result of extracting one document: the plain text plus a small,
45/// format-tagged metadata map.
46///
47/// This is the `--json` shape the CLI emits verbatim (`{text, metadata}`); in
48/// plain mode the CLI prints [`Extracted::text`] and discards the metadata.
49/// Metadata is intentionally minimal and best-effort — extraction never *fails*
50/// for want of a title; it just omits the key.
51#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
52pub struct Extracted {
53    /// The extracted plain text (UTF-8), normalized to `\n` line endings with
54    /// trailing whitespace trimmed per line and a single trailing newline. For
55    /// a document with no recoverable text layer (e.g. a scanned, image-only
56    /// PDF) this is the empty string — the contract is "empty in, empty out."
57    pub text: String,
58
59    /// Best-effort key/value metadata. Always carries `format` (the adapter
60    /// that ran, e.g. `"pdf"`). Adapters add what they cheaply know:
61    /// `pages`/`sheets`/`sheet_names` (counts), `title` (when the container
62    /// declares one). A `BTreeMap` so `--json` output is key-ordered and stable.
63    pub metadata: BTreeMap<String, MetaValue>,
64}
65
66impl Extracted {
67    /// Build an [`Extracted`] from raw adapter text + the detected format,
68    /// applying the canonical text normalization ([`normalize_text`]) and
69    /// seeding the `format` metadata key.
70    fn new(raw_text: String, format: Format) -> Self {
71        let mut metadata = BTreeMap::new();
72        metadata.insert(
73            "format".to_string(),
74            MetaValue::Str(format.tag().to_string()),
75        );
76        Extracted {
77            text: normalize_text(&raw_text),
78            metadata,
79        }
80    }
81
82    /// Insert a string metadata key only when the value is non-empty (keeps the
83    /// map free of empty `title: ""` noise).
84    fn put_str(&mut self, key: &str, value: impl Into<String>) {
85        let v = value.into();
86        if !v.trim().is_empty() {
87            self.metadata.insert(key.to_string(), MetaValue::Str(v));
88        }
89    }
90
91    /// Insert a numeric (count) metadata key.
92    fn put_num(&mut self, key: &str, value: u64) {
93        self.metadata.insert(key.to_string(), MetaValue::Num(value));
94    }
95}
96
97/// A metadata value: a string (title, format tag, sheet name list joined) or a
98/// non-negative count (pages, sheets). Serializes to a bare JSON string or
99/// number — no wrapper object — so `{text, metadata}` stays flat and readable.
100#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
101#[serde(untagged)]
102pub enum MetaValue {
103    /// A textual value (e.g. document title, the `format` tag).
104    Str(String),
105    /// A non-negative count (e.g. page count, sheet count).
106    Num(u64),
107}
108
/// The document formats `dbmd extract` understands — one variant per adapter.
/// [`Format::from_path`] picks the variant from the file extension.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// Portable Document Format (`.pdf`) — text layer via `pdf-extract`.
    Pdf,
    /// Office Open XML WordprocessingML (`.docx`) — `w:t` runs via `quick-xml`.
    Docx,
    /// A spreadsheet (`.xlsx`/`.xlsm`/`.xlsb`/`.ods`) — cells via `calamine`.
    Spreadsheet,
    /// EPUB e-book (`.epub`) — spine XHTML via `zip` + `quick-xml` + `html2text`.
    Epub,
    /// HTML (`.html`/`.htm`/`.xhtml`) — plain text via `html2text`.
    Html,
}

impl Format {
    /// Detect the format from a path's extension, ignoring ASCII case.
    ///
    /// Returns `None` when the path has no extension, the extension is not
    /// valid UTF-8, or it is not one of the recognized extensions; [`extract`]
    /// turns that into [`ExtractError::UnsupportedFormat`].
    pub fn from_path(path: &Path) -> Option<Format> {
        // Every recognized extension (lowercase) and the adapter it selects.
        const BY_EXTENSION: &[(&str, Format)] = &[
            ("pdf", Format::Pdf),
            ("docx", Format::Docx),
            ("xlsx", Format::Spreadsheet),
            ("xlsm", Format::Spreadsheet),
            ("xlsb", Format::Spreadsheet),
            ("ods", Format::Spreadsheet),
            ("epub", Format::Epub),
            ("html", Format::Html),
            ("htm", Format::Html),
            ("xhtml", Format::Html),
        ];

        let ext = path.extension()?.to_str()?;
        BY_EXTENSION
            .iter()
            .find(|(known, _)| ext.eq_ignore_ascii_case(known))
            .map(|&(_, format)| format)
    }

    /// The short, stable tag stored in `metadata.format` and used in error
    /// messages. It is not the file extension: one tag can stand for several
    /// extensions (e.g. `spreadsheet`).
    pub fn tag(self) -> &'static str {
        match self {
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Spreadsheet => "spreadsheet",
            Self::Epub => "epub",
            Self::Html => "html",
        }
    }
}
154
155/// Errors from document extraction. Every variant is a typed refusal the CLI
156/// maps to a stable machine code — extraction never panics on a bad or
157/// encrypted input.
158#[derive(Debug, thiserror::Error)]
159pub enum ExtractError {
160    /// The file extension is missing or not one of the supported document
161    /// formats. Carries the offending extension (or `""` when absent).
162    #[error("unsupported document format: {0:?} (supported: pdf, docx, xlsx, epub, html)")]
163    UnsupportedFormat(String),
164
165    /// The document is encrypted/password-protected and could not be opened
166    /// without a password (or with the wrong one). A clean refusal — the
167    /// extractor must never emit partial/garbled bytes for a locked file.
168    #[error("document is encrypted or password-protected: {0}")]
169    Encrypted(String),
170
171    /// A format adapter failed to parse a structurally invalid or corrupt
172    /// document. Carries the adapter's diagnostic.
173    #[error("failed to parse {format} document: {message}")]
174    Parse {
175        /// The format tag whose adapter failed (e.g. `"pdf"`, `"docx"`).
176        format: &'static str,
177        /// The underlying parser diagnostic.
178        message: String,
179    },
180
181    /// An underlying I/O failure (file missing, unreadable, etc.).
182    #[error(transparent)]
183    Io(#[from] std::io::Error),
184}
185
186impl ExtractError {
187    /// A short, stable machine code for this error, mirrored at the CLI
188    /// boundary for `--json` output and exit-code mapping.
189    pub fn code(&self) -> &'static str {
190        match self {
191            ExtractError::UnsupportedFormat(_) => "UNSUPPORTED_FORMAT",
192            ExtractError::Encrypted(_) => "DOCUMENT_ENCRYPTED",
193            ExtractError::Parse { .. } => "EXTRACT_PARSE_ERROR",
194            ExtractError::Io(_) => "IO_ERROR",
195        }
196    }
197}
198
199/// Result alias for extraction operations.
200pub type Result<T> = std::result::Result<T, ExtractError>;
201
202/// Extract plain text (and best-effort metadata) from a document, choosing the
203/// adapter by the file's extension.
204///
205/// This is the single entry point the CLI calls. It reads exactly one file and
206/// returns one [`Extracted`]; there is no whole-store walk here (per the
207/// crate-wide O(changed) invariant — a store-wide extraction is the caller's
208/// loop). An unsupported extension is [`ExtractError::UnsupportedFormat`]; an
209/// encrypted PDF is [`ExtractError::Encrypted`]; neither panics.
210///
211/// # Examples
212///
213/// ```no_run
214/// use std::path::Path;
215/// let out = dbmd_core::extract::extract(Path::new("sources/docs/invoice.pdf"))?;
216/// println!("{}", out.text);
217/// # Ok::<(), dbmd_core::extract::ExtractError>(())
218/// ```
219pub fn extract(path: &Path) -> Result<Extracted> {
220    let format = Format::from_path(path).ok_or_else(|| {
221        let ext = path
222            .extension()
223            .and_then(|e| e.to_str())
224            .unwrap_or("")
225            .to_string();
226        ExtractError::UnsupportedFormat(ext)
227    })?;
228
229    match format {
230        Format::Pdf => extract_pdf(path),
231        Format::Docx => extract_docx(path),
232        Format::Spreadsheet => extract_spreadsheet(path),
233        Format::Epub => extract_epub(path),
234        Format::Html => extract_html(path),
235    }
236}
237
238// ─────────────────────────────────────────────────────────────────────────────
239// Text normalization
240// ─────────────────────────────────────────────────────────────────────────────
241
/// Canonicalize extracted text so output is stable across adapters:
///
/// 1. Normalize line endings to `\n` (`\r\n` and lone `\r` both become `\n`).
/// 2. Trim trailing whitespace on each line (a whitespace-only line becomes a
///    blank line).
/// 3. Collapse every run of two or more consecutive blank lines to a single
///    blank line.
/// 4. Drop leading/trailing blank lines, then end the text with exactly one
///    `\n` (unless the whole text is empty, which stays empty — the
///    image-only-PDF contract).
///
/// This is *layout* tidy-up only; it never reorders or drops words. Word-level
/// content is whatever the adapter recovered.
///
/// Runs in a single pass over the lines: blank lines are not emitted directly
/// but remembered, and flushed as one separator only when more content
/// follows. That handles leading, trailing, and repeated blank lines with one
/// rule and avoids repeatedly shifting a line vector.
pub fn normalize_text(raw: &str) -> String {
    let unix = raw.replace("\r\n", "\n").replace('\r', "\n");

    let mut out = String::with_capacity(unix.len());
    // True when at least one blank line has been seen since the last content
    // line *and* some content has already been written.
    let mut pending_blank = false;

    for line in unix.lines().map(str::trim_end) {
        if line.is_empty() {
            // Leading blank lines (nothing written yet) are never pending.
            pending_blank = !out.is_empty();
            continue;
        }
        if pending_blank {
            out.push('\n');
            pending_blank = false;
        }
        out.push_str(line);
        out.push('\n');
    }

    // A still-pending blank here means trailing blank lines: dropped.
    out
}
287
288// ─────────────────────────────────────────────────────────────────────────────
289// PDF — pdf-extract
290// ─────────────────────────────────────────────────────────────────────────────
291
292/// Extract a PDF's text layer via `pdf-extract`.
293///
294/// A PDF with no text layer (a scanned image) yields the empty string — that is
295/// correct, not an error (OCR is out of scope). A password-protected PDF that
296/// cannot be opened is mapped to [`ExtractError::Encrypted`] rather than a raw
297/// parse error so the caller can branch on it. Metadata carries the page count
298/// when the document tree exposes it.
299fn extract_pdf(path: &Path) -> Result<Extracted> {
300    // Read the bytes ourselves so a missing/unreadable file is a clean
301    // `ExtractError::Io` (via `?`) before we hand anything to the PDF parser.
302    let bytes = std::fs::read(path)?;
303
304    let text = match pdf_extract::extract_text_from_mem(&bytes) {
305        Ok(t) => t,
306        Err(e) => return Err(classify_pdf_error(e)),
307    };
308
309    let mut out = Extracted::new(text, Format::Pdf);
310
311    // Page count is cheap and useful; derive it from the parsed document. A
312    // failure here is non-fatal — the text already succeeded.
313    if let Ok(doc) = pdf_extract::Document::load_mem(&bytes) {
314        let pages = doc.get_pages().len() as u64;
315        out.put_num("pages", pages);
316    }
317
318    Ok(out)
319}
320
321/// Map a `pdf-extract` error onto the right [`ExtractError`] variant.
322/// Decryption failures become [`ExtractError::Encrypted`]; everything else is a
323/// [`ExtractError::Parse`] tagged `pdf`.
324fn classify_pdf_error(err: pdf_extract::OutputError) -> ExtractError {
325    let msg = err.to_string();
326    let lower = msg.to_ascii_lowercase();
327    if lower.contains("password") || lower.contains("decrypt") || lower.contains("encrypt") {
328        ExtractError::Encrypted(msg)
329    } else {
330        ExtractError::Parse {
331            format: "pdf",
332            message: msg,
333        }
334    }
335}
336
337// ─────────────────────────────────────────────────────────────────────────────
338// DOCX — zip + quick-xml (no docx-rs dependency; quick-xml is already needed
339// for epub, so docx, xlsx-via-calamine, and epub share one XML/zip surface)
340// ─────────────────────────────────────────────────────────────────────────────
341
342/// Extract a `.docx` (WordprocessingML) by unzipping `word/document.xml` and
343/// concatenating the `<w:t>` run text, one logical line per `<w:p>` paragraph.
344///
345/// `<w:tab/>` becomes a tab and `<w:br/>` / `<w:cr>` a newline so table-ish and
346/// line-broken content keeps its shape; everything else is structural and
347/// ignored. This is the same minimal-but-faithful path `docx-rs` takes for text
348/// extraction, without pulling in a second XML/zip stack.
349fn extract_docx(path: &Path) -> Result<Extracted> {
350    let file = std::fs::File::open(path)?;
351    let mut archive = open_zip(file, "docx")?;
352
353    let xml = read_zip_entry(&mut archive, "word/document.xml", "docx")?;
354    let text = wordprocessing_text(&xml, "docx")?;
355
356    Ok(Extracted::new(text, Format::Docx))
357}
358
359/// Pull paragraph text out of a WordprocessingML / DrawingML XML body.
360///
361/// Shared by [`extract_docx`]. Walks the event stream collecting `<w:t>` text;
362/// `<w:p>` ends a line, `<w:tab/>` is a tab, `<w:br>`/`<w:cr>` a newline.
363fn wordprocessing_text(xml: &str, format: &'static str) -> Result<String> {
364    use quick_xml::events::Event;
365    use quick_xml::reader::Reader;
366
367    let mut reader = Reader::from_str(xml);
368    let mut buf = Vec::new();
369    let mut out = String::new();
370    let mut in_text_run = false;
371
372    loop {
373        match reader.read_event_into(&mut buf) {
374            Ok(Event::Start(e)) => {
375                if local_name(e.name().as_ref()) == b"t" {
376                    in_text_run = true;
377                }
378            }
379            Ok(Event::End(e)) => {
380                let name = e.name();
381                match local_name(name.as_ref()) {
382                    b"t" => in_text_run = false,
383                    b"p" => out.push('\n'),
384                    _ => {}
385                }
386            }
387            Ok(Event::Empty(e)) => {
388                // Self-closing run-level breaks inside a paragraph.
389                match local_name(e.name().as_ref()) {
390                    b"tab" => out.push('\t'),
391                    b"br" | b"cr" => out.push('\n'),
392                    _ => {}
393                }
394            }
395            // quick-xml 0.40 yields already-unescaped text in `Event::Text`.
396            Ok(Event::Text(t)) => {
397                if in_text_run {
398                    out.push_str(&String::from_utf8_lossy(&t.into_inner()));
399                }
400            }
401            Ok(Event::Eof) => break,
402            Err(e) => {
403                return Err(ExtractError::Parse {
404                    format,
405                    message: format!("malformed XML: {e}"),
406                });
407            }
408            _ => {}
409        }
410        buf.clear();
411    }
412
413    Ok(out)
414}
415
/// Strip any namespace prefix from an XML qualified name: `w:t` → `t`,
/// `t` → `t`. Everything up to and including the *last* `:` is removed.
///
/// docx/epub XML uses whatever prefixes (`w:`, `dc:`) the writer chose, so
/// matching on the local name keeps callers independent of that choice.
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece (the whole input when there is
    // no `:`), and its first piece is the part after the last separator.
    qname.rsplit(|&byte| byte == b':').next().unwrap_or(qname)
}
425
426// ─────────────────────────────────────────────────────────────────────────────
427// Spreadsheet — calamine (xlsx / xlsm / xlsb / ods)
428// ─────────────────────────────────────────────────────────────────────────────
429
430/// Extract every sheet of a spreadsheet via `calamine`, rendering each row as
431/// tab-separated cells, one row per line, sheets in workbook order separated by
432/// a blank line.
433///
434/// Cell rendering: text verbatim; integers and whole-valued floats without a
435/// trailing `.0` (`1200`, not `1200.0`); other floats via their default
436/// formatting; booleans as `TRUE`/`FALSE`; empty/error cells as the empty
437/// string. Metadata carries the sheet count and the joined sheet-name list.
438fn extract_spreadsheet(path: &Path) -> Result<Extracted> {
439    use calamine::{open_workbook_auto, Reader};
440
441    let mut workbook = open_workbook_auto(path).map_err(|e| ExtractError::Parse {
442        format: "spreadsheet",
443        message: e.to_string(),
444    })?;
445
446    let sheet_names = workbook.sheet_names().to_vec();
447    let mut text = String::new();
448
449    for (idx, name) in sheet_names.iter().enumerate() {
450        if idx > 0 {
451            text.push('\n'); // blank line between sheets
452        }
453        let range = workbook
454            .worksheet_range(name)
455            .map_err(|e| ExtractError::Parse {
456                format: "spreadsheet",
457                message: format!("sheet {name:?}: {e}"),
458            })?;
459
460        for row in range.rows() {
461            let cells: Vec<String> = row.iter().map(render_cell).collect();
462            text.push_str(&cells.join("\t"));
463            text.push('\n');
464        }
465    }
466
467    let mut out = Extracted::new(text, Format::Spreadsheet);
468    out.put_num("sheets", sheet_names.len() as u64);
469    if !sheet_names.is_empty() {
470        out.put_str("sheet_names", sheet_names.join(", "));
471    }
472    Ok(out)
473}
474
475/// Render one spreadsheet cell to its text form. Whole-valued floats drop the
476/// `.0` (so `3450.0` → `3450`), matching how spreadsheet apps display an
477/// integer-typed amount.
478fn render_cell(cell: &calamine::Data) -> String {
479    use calamine::Data;
480    match cell {
481        Data::Empty => String::new(),
482        Data::String(s) => s.clone(),
483        Data::Int(i) => i.to_string(),
484        Data::Float(f) => {
485            if f.fract() == 0.0 && f.is_finite() && f.abs() < 1e15 {
486                format!("{}", *f as i64)
487            } else {
488                f.to_string()
489            }
490        }
491        Data::Bool(b) => {
492            if *b {
493                "TRUE".to_string()
494            } else {
495                "FALSE".to_string()
496            }
497        }
498        Data::DateTime(dt) => dt.to_string(),
499        Data::DateTimeIso(s) => s.clone(),
500        Data::DurationIso(s) => s.clone(),
501        Data::Error(e) => format!("{e:?}"),
502    }
503}
504
505// ─────────────────────────────────────────────────────────────────────────────
506// EPUB — zip + quick-xml (spine order) + html2text (per-chapter)
507// ─────────────────────────────────────────────────────────────────────────────
508//
509// We do NOT use the `epub` crate: it is GPL-3.0, which violates the toolkit's
510// permissive-only license rule. An EPUB is a zip whose OPF package declares a
511// reading-order `spine`; each spine item is an XHTML document. zip + quick-xml
512// (already dependencies) read the container/OPF, and html2text (already a
513// dependency for `.html`) flattens each chapter. Same machinery, no GPL.
514
515/// Extract an EPUB's reading-order text:
516/// 1. read `META-INF/container.xml` → the OPF package path;
517/// 2. parse the OPF `manifest` (id→href) and `spine` (ordered idref list);
518/// 3. for each spine item, read its XHTML and flatten it with [`html_to_text`];
519/// 4. join chapters with a blank line.
520///
521/// Metadata carries `title` (the OPF `dc:title`) and `chapters` (spine length).
522fn extract_epub(path: &Path) -> Result<Extracted> {
523    let file = std::fs::File::open(path)?;
524    let mut archive = open_zip(file, "epub")?;
525
526    // 1. container.xml → OPF path.
527    let container = read_zip_entry(&mut archive, "META-INF/container.xml", "epub")?;
528    let opf_path = epub_opf_path(&container)?;
529
530    // 2. OPF → base dir, manifest, spine, title.
531    let opf = read_zip_entry(&mut archive, &opf_path, "epub")?;
532    let parsed = parse_opf(&opf)?;
533    let base = opf_base_dir(&opf_path);
534
535    // 3. Spine items in order → flattened chapter text.
536    let mut text = String::new();
537    let mut chapters = 0u64;
538    for idref in &parsed.spine {
539        let Some(href) = parsed.manifest.get(idref) else {
540            continue; // dangling spine ref; skip rather than fail
541        };
542        let entry = join_zip_path(&base, href);
543        // A missing spine target is skipped (best-effort), not fatal.
544        let Ok(chapter_xhtml) = read_zip_entry(&mut archive, &entry, "epub") else {
545            continue;
546        };
547        let chapter_text = html_to_text(chapter_xhtml.as_bytes())?;
548        if !chapter_text.trim().is_empty() {
549            if chapters > 0 {
550                text.push('\n');
551            }
552            text.push_str(&chapter_text);
553            text.push('\n');
554            chapters += 1;
555        }
556    }
557
558    let mut out = Extracted::new(text, Format::Epub);
559    out.put_num("chapters", chapters);
560    if let Some(title) = parsed.title {
561        out.put_str("title", title);
562    }
563    Ok(out)
564}
565
566/// The full-path of the OPF package file, read from `META-INF/container.xml`'s
567/// first `<rootfile full-path="…">`.
568fn epub_opf_path(container_xml: &str) -> Result<String> {
569    use quick_xml::events::Event;
570    use quick_xml::reader::Reader;
571
572    let mut reader = Reader::from_str(container_xml);
573    let mut buf = Vec::new();
574    loop {
575        match reader.read_event_into(&mut buf) {
576            Ok(Event::Start(e)) | Ok(Event::Empty(e)) => {
577                if local_name(e.name().as_ref()) == b"rootfile" {
578                    if let Some(p) = attr_value(&e, b"full-path") {
579                        return Ok(p);
580                    }
581                }
582            }
583            Ok(Event::Eof) => break,
584            Err(e) => {
585                return Err(ExtractError::Parse {
586                    format: "epub",
587                    message: format!("container.xml: {e}"),
588                })
589            }
590            _ => {}
591        }
592        buf.clear();
593    }
594    Err(ExtractError::Parse {
595        format: "epub",
596        message: "container.xml has no <rootfile full-path>".to_string(),
597    })
598}
599
600/// The parsed-out pieces of an OPF package we need for reading-order text.
601struct OpfParsed {
602    /// Manifest: item id → href (relative to the OPF's directory).
603    manifest: BTreeMap<String, String>,
604    /// Spine: ordered list of manifest item ids (the reading order).
605    spine: Vec<String>,
606    /// `dc:title`, if present.
607    title: Option<String>,
608}
609
610/// Parse an OPF package document into its manifest, spine, and title.
611fn parse_opf(opf_xml: &str) -> Result<OpfParsed> {
612    use quick_xml::events::Event;
613    use quick_xml::reader::Reader;
614
615    let mut reader = Reader::from_str(opf_xml);
616    let mut buf = Vec::new();
617
618    let mut manifest = BTreeMap::new();
619    let mut spine = Vec::new();
620    let mut title: Option<String> = None;
621    let mut in_title = false;
622
623    loop {
624        match reader.read_event_into(&mut buf) {
625            Ok(Event::Start(e)) | Ok(Event::Empty(e)) => match local_name(e.name().as_ref()) {
626                b"item" => {
627                    if let (Some(id), Some(href)) = (attr_value(&e, b"id"), attr_value(&e, b"href"))
628                    {
629                        manifest.insert(id, href);
630                    }
631                }
632                b"itemref" => {
633                    if let Some(idref) = attr_value(&e, b"idref") {
634                        spine.push(idref);
635                    }
636                }
637                b"title" => in_title = true,
638                _ => {}
639            },
640            Ok(Event::End(e)) => {
641                if local_name(e.name().as_ref()) == b"title" {
642                    in_title = false;
643                }
644            }
645            Ok(Event::Text(t)) => {
646                if in_title && title.is_none() {
647                    let s = String::from_utf8_lossy(&t.into_inner()).trim().to_string();
648                    if !s.is_empty() {
649                        title = Some(s);
650                    }
651                }
652            }
653            Ok(Event::Eof) => break,
654            Err(e) => {
655                return Err(ExtractError::Parse {
656                    format: "epub",
657                    message: format!("OPF: {e}"),
658                })
659            }
660            _ => {}
661        }
662        buf.clear();
663    }
664
665    Ok(OpfParsed {
666        manifest,
667        spine,
668        title,
669    })
670}
671
/// The directory part of an OPF path inside the zip: everything before the
/// last `/` (`"OEBPS/content.opf"` → `"OEBPS"`), or the empty string when the
/// path has no `/` (`"content.opf"` → `""`). Manifest hrefs are resolved
/// against this directory.
fn opf_base_dir(opf_path: &str) -> String {
    opf_path
        .rsplit_once('/')
        .map(|(dir, _file)| dir.to_string())
        .unwrap_or_default()
}
681
/// Resolve a manifest href against the OPF's base directory, producing a zip
/// entry name. Forward-slash only — zip paths are always `/`-separated.
///
/// Manifest hrefs are relative references, so besides a leading `./` they may
/// legitimately contain `.` and `..` segments (e.g. `../text/ch1.xhtml` when
/// the OPF lives in its own subdirectory). Zip entry names contain neither, so
/// the segments are resolved here:
///
/// - `.` segments are dropped;
/// - `..` removes the preceding segment, and is ignored once nothing is left
///   to remove, so a href can never climb above the archive root;
/// - every other segment (including an empty one) is kept as written.
///
/// `base` itself is used as given; an empty `base` means the archive root.
/// Percent-encoding and `#fragment` suffixes in `href` are not decoded or
/// stripped.
fn join_zip_path(base: &str, href: &str) -> String {
    let mut segments: Vec<&str> = if base.is_empty() {
        Vec::new()
    } else {
        base.split('/').collect()
    };

    for segment in href.split('/') {
        match segment {
            "." => {}
            ".." => {
                segments.pop();
            }
            other => segments.push(other),
        }
    }

    segments.join("/")
}
692
693// ─────────────────────────────────────────────────────────────────────────────
694// HTML — html2text + light markdown-decoration cleanup
695// ─────────────────────────────────────────────────────────────────────────────
696
697/// Extract plain text from an `.html` file.
698fn extract_html(path: &Path) -> Result<Extracted> {
699    let bytes = std::fs::read(path)?;
700    let text = html_to_text(&bytes)?;
701    Ok(Extracted::new(text, Format::Html))
702}
703
704/// Flatten an HTML/XHTML byte stream to clean plain text.
705///
706/// Uses `html2text`'s non-decorating plain renderer (which already drops
707/// `<script>`/`<style>`/comments and flattens lists), then strips the two
708/// markdown-ish decorations that renderer still emits — leading `#` heading
709/// markers and `[text]` link brackets — so headings and link text read as plain
710/// prose. Unordered list items keep their `*` marker and ordered items their
711/// `N.` marker (those are content-faithful and match the corpus convention).
712///
713/// A very wide wrap width (10_000) is used so paragraphs are not hard-wrapped by
714/// the renderer; paragraph structure comes from the source's block elements, and
715/// final layout is canonicalized by [`normalize_text`].
716fn html_to_text(html: &[u8]) -> Result<String> {
717    let rendered = html2text::config::plain_no_decorate()
718        .string_from_read(html, 10_000)
719        .map_err(|e| ExtractError::Parse {
720            format: "html",
721            message: e.to_string(),
722        })?;
723
724    Ok(strip_markdown_decorations(&rendered))
725}
726
/// Strip the residual markdown decorations `html2text`'s plain renderer emits:
/// ATX heading markers at the start of a line, and `[...]` brackets around
/// link/anchor text (the reference-style `[n]` suffix is already gone under
/// `plain_no_decorate`). Bullet (`*`) and ordered (`N.`) markers are left
/// intact — they are content, not decoration.
///
/// A line counts as a heading only when, after optional indentation, it starts
/// with a run of `#` that is followed by whitespace or the end of the line —
/// the shape of an ATX marker. For such a line the indentation, the `#` run,
/// and the whitespace after it are removed. A line that merely *begins* with
/// `#` (`#hashtag`, `#1 priority`) is body text and is kept as written, so no
/// character of the document's content is dropped.
///
/// Every input line is emitted followed by `\n`.
fn strip_markdown_decorations(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for line in text.lines() {
        let trimmed = line.trim_start();
        let after_hashes = trimmed.trim_start_matches('#');

        let has_hash_run = after_hashes.len() != trimmed.len();
        let marker_is_delimited =
            after_hashes.is_empty() || after_hashes.starts_with(char::is_whitespace);

        let line = if has_hash_run && marker_is_delimited {
            // Heading line: keep indentation-free heading text.
            after_hashes.trim_start()
        } else {
            line
        };
        out.push_str(&unwrap_brackets(line));
        out.push('\n');
    }
    out
}

/// Replace every `[inner]` with `inner` (one pass, non-nested). `html2text`'s
/// plain renderer wraps link/anchor text in single brackets; unwrapping yields
/// the bare text.
///
/// Each `[` is paired with the *first* `]` after it; any `[` in between stays
/// part of the inner text. A `[` with no closing `]` is emitted literally
/// together with the rest of the line, and a `]` with no opening `[` is left
/// as-is.
fn unwrap_brackets(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    let mut rest = line;

    while let Some(open) = rest.find('[') {
        out.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];
        match after_open.find(']') {
            Some(close) => {
                out.push_str(&after_open[..close]);
                rest = &after_open[close + 1..];
            }
            None => {
                // Unmatched '[': keep it and everything after it verbatim.
                out.push_str(&rest[open..]);
                return out;
            }
        }
    }

    out.push_str(rest);
    out
}
783
784// ─────────────────────────────────────────────────────────────────────────────
785// Shared zip helpers (docx + epub)
786// ─────────────────────────────────────────────────────────────────────────────
787
788/// Open a zip archive from a reader, mapping any failure to a typed
789/// [`ExtractError::Parse`] tagged with the calling format.
790fn open_zip<R: Read + std::io::Seek>(
791    reader: R,
792    format: &'static str,
793) -> Result<zip::ZipArchive<R>> {
794    zip::ZipArchive::new(reader).map_err(|e| ExtractError::Parse {
795        format,
796        message: format!("not a valid zip container: {e}"),
797    })
798}
799
800/// Read a single zip entry to a UTF-8 string. A missing entry or a read failure
801/// is a typed [`ExtractError::Parse`]; invalid UTF-8 is lossily decoded (OOXML /
802/// XHTML are declared UTF-8, but we never panic on a stray byte).
803fn read_zip_entry<R: Read + std::io::Seek>(
804    archive: &mut zip::ZipArchive<R>,
805    name: &str,
806    format: &'static str,
807) -> Result<String> {
808    let mut entry = archive.by_name(name).map_err(|e| ExtractError::Parse {
809        format,
810        message: format!("missing zip entry {name:?}: {e}"),
811    })?;
812    let mut bytes = Vec::new();
813    entry
814        .read_to_end(&mut bytes)
815        .map_err(|e| ExtractError::Parse {
816            format,
817            message: format!("reading {name:?}: {e}"),
818        })?;
819    Ok(String::from_utf8_lossy(&bytes).into_owned())
820}
821
822/// Look up a start/empty element's attribute value by local name, returning it
823/// unescaped as an owned `String`. Prefix-agnostic on the attribute key.
824fn attr_value(elem: &quick_xml::events::BytesStart<'_>, key: &[u8]) -> Option<String> {
825    elem.attributes().flatten().find_map(|attr| {
826        if local_name(attr.key.as_ref()) == key {
827            // `unescape_value` returns an XML-unescaped `Cow<str>` — exactly the
828            // owned attribute text we want. It is soft-deprecated in quick-xml
829            // 0.40 in favor of `normalized_value(XmlVersion)`, whose extra
830            // version arg and byte-Cow return buy us nothing here; the simple
831            // form is correct for the UTF-8 OOXML/OPF attributes we read.
832            #[allow(deprecated)]
833            attr.unescape_value().ok().map(|cow| cow.into_owned())
834        } else {
835            None
836        }
837    })
838}
839
// Unit tests for the extraction engine: extension-based format detection,
// text normalization, per-format extraction compared against the
// corpus-c-formats fixtures, and the small string/XML helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Absolute path to a corpus-c-formats fixture under `sources/docs/`.
    /// Resolved relative to this crate's `CARGO_MANIFEST_DIR`.
    fn fixture(name: &str) -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("../../tests/corpora/corpus-c-formats/sources/docs")
            .join(name)
    }

    /// Read the known-good `.txt` sibling of a fixture (`<name>.txt`).
    /// Panics if the file cannot be read — acceptable in test code.
    fn expected(name: &str) -> String {
        std::fs::read_to_string(fixture(&format!("{name}.txt"))).unwrap()
    }

    /// Token-level normalization: collapse every run of whitespace (incl.
    /// newlines) to one space and trim. This is the corpus's recommended,
    /// layout-agnostic comparison ("same words, same order").
    fn tokens(s: &str) -> String {
        s.split_whitespace().collect::<Vec<_>>().join(" ")
    }

    /// The sorted set of non-blank, token-normalized lines — order-agnostic
    /// content comparison (used where extractor reading-order legitimately
    /// differs, e.g. multi-column PDF).
    fn line_set(s: &str) -> Vec<String> {
        let mut v: Vec<String> = s.lines().map(tokens).filter(|l| !l.is_empty()).collect();
        v.sort();
        v
    }

    // ── format detection ────────────────────────────────────────────────────

    /// Known extensions map to their `Format` (upper-case `.PDF` included);
    /// `.txt` and a path without an extension map to `None`.
    #[test]
    fn detects_format_by_extension_case_insensitively() {
        assert_eq!(Format::from_path(Path::new("a.pdf")), Some(Format::Pdf));
        assert_eq!(Format::from_path(Path::new("a.PDF")), Some(Format::Pdf));
        assert_eq!(Format::from_path(Path::new("a.docx")), Some(Format::Docx));
        assert_eq!(
            Format::from_path(Path::new("a.xlsx")),
            Some(Format::Spreadsheet)
        );
        assert_eq!(
            Format::from_path(Path::new("a.ods")),
            Some(Format::Spreadsheet)
        );
        assert_eq!(Format::from_path(Path::new("a.epub")), Some(Format::Epub));
        assert_eq!(Format::from_path(Path::new("a.html")), Some(Format::Html));
        assert_eq!(Format::from_path(Path::new("a.htm")), Some(Format::Html));
        assert_eq!(Format::from_path(Path::new("a.txt")), None);
        assert_eq!(Format::from_path(Path::new("noext")), None);
    }

    /// An unsupported extension surfaces as `UnsupportedFormat("txt")` with the
    /// `UNSUPPORTED_FORMAT` error code.
    #[test]
    fn unsupported_extension_is_typed_error() {
        let err = extract(Path::new("/tmp/whatever.txt")).unwrap_err();
        assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e == "txt"));
        assert_eq!(err.code(), "UNSUPPORTED_FORMAT");
    }

    /// A path with no extension is reported as `UnsupportedFormat` carrying an
    /// empty extension string.
    #[test]
    fn missing_extension_is_unsupported() {
        let err = extract(Path::new("/tmp/noext")).unwrap_err();
        assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e.is_empty()));
    }

    // ── normalization ─────────────────────────────────────────────────────────

    /// CRLF input with leading/trailing blank lines, a run of blank lines, and
    /// trailing spaces normalizes to `"Heading\n\nBody line\n"`.
    #[test]
    fn normalize_collapses_blanks_and_trims() {
        let raw = "\r\n\r\nHeading\r\n\r\n\r\n\r\nBody line   \r\n\r\n";
        assert_eq!(normalize_text(raw), "Heading\n\nBody line\n");
    }

    /// Empty and whitespace-only input both normalize to the empty string.
    #[test]
    fn normalize_empty_stays_empty() {
        assert_eq!(normalize_text(""), "");
        assert_eq!(normalize_text("   \n\n  \n"), "");
    }

    // ── per-format extraction against corpus-c fixtures ───────────────────────

    /// `text.pdf`: metadata reports format `pdf` and one page; the text matches
    /// the known-good file token-for-token.
    #[test]
    fn extract_text_pdf_matches_known_good() {
        let got = extract(&fixture("text.pdf")).unwrap();
        assert_eq!(got.metadata["format"], MetaValue::Str("pdf".into()));
        assert_eq!(got.metadata["pages"], MetaValue::Num(1));
        assert_eq!(tokens(&got.text), tokens(&expected("text.pdf")));
    }

    /// `weird-fonts.pdf`: extracted text matches the known-good file
    /// token-for-token.
    #[test]
    fn extract_weird_fonts_pdf_matches_known_good() {
        let got = extract(&fixture("weird-fonts.pdf")).unwrap();
        assert_eq!(tokens(&got.text), tokens(&expected("weird-fonts.pdf")));
    }

    #[test]
    fn extract_multi_column_pdf_matches_content_order_agnostic() {
        // pdf-extract reads column-by-column; the known-good `.txt` captures the
        // interleaved (pdftotext) order. Both carry identical content — assert
        // the line SET, not the order. (README § multi-column.)
        let got = extract(&fixture("multi-column.pdf")).unwrap();
        assert_eq!(line_set(&got.text), line_set(&expected("multi-column.pdf")));
    }

    #[test]
    fn extract_image_only_pdf_yields_empty() {
        // No text layer → empty out, never hallucinated text. OCR out of scope.
        let got = extract(&fixture("image-only.pdf")).unwrap();
        assert_eq!(got.text, "");
        // Sanity-check the fixture itself: the known-good file is blank too.
        assert!(expected("image-only.pdf").trim().is_empty());
    }

    /// `encrypted.pdf` is refused with `ExtractError::Encrypted` and the
    /// `DOCUMENT_ENCRYPTED` error code.
    #[test]
    fn extract_encrypted_pdf_without_password_refuses_cleanly() {
        let err = extract(&fixture("encrypted.pdf")).unwrap_err();
        assert!(
            matches!(err, ExtractError::Encrypted(_)),
            "expected Encrypted, got {err:?}"
        );
        assert_eq!(err.code(), "DOCUMENT_ENCRYPTED");
    }

    /// `sample.docx`: metadata reports format `docx`; text matches the
    /// known-good file token-for-token.
    #[test]
    fn extract_docx_matches_known_good() {
        let got = extract(&fixture("sample.docx")).unwrap();
        assert_eq!(got.metadata["format"], MetaValue::Str("docx".into()));
        assert_eq!(tokens(&got.text), tokens(&expected("sample.docx")));
    }

    /// `sample.xlsx`: metadata reports format `spreadsheet`, one sheet named
    /// `Expenses`; text matches the known-good file exactly (modulo trailing
    /// whitespace).
    #[test]
    fn extract_xlsx_matches_known_good() {
        let got = extract(&fixture("sample.xlsx")).unwrap();
        assert_eq!(got.metadata["format"], MetaValue::Str("spreadsheet".into()));
        assert_eq!(got.metadata["sheets"], MetaValue::Num(1));
        assert_eq!(
            got.metadata["sheet_names"],
            MetaValue::Str("Expenses".into())
        );
        // Tab-separated, integers without `.0` — exact match (no soft-wrap risk).
        assert_eq!(got.text.trim_end(), expected("sample.xlsx").trim_end());
    }

    /// `sample.epub`: metadata reports format `epub`, one chapter, and the
    /// title `Operations Playbook`; text matches token-for-token.
    #[test]
    fn extract_epub_matches_known_good() {
        let got = extract(&fixture("sample.epub")).unwrap();
        assert_eq!(got.metadata["format"], MetaValue::Str("epub".into()));
        assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
        assert_eq!(
            got.metadata["title"],
            MetaValue::Str("Operations Playbook".into())
        );
        assert_eq!(tokens(&got.text), tokens(&expected("sample.epub")));
    }

    /// `sample.html`: metadata reports format `html`; text matches the
    /// known-good file token-for-token.
    #[test]
    fn extract_html_matches_known_good() {
        let got = extract(&fixture("sample.html")).unwrap();
        assert_eq!(got.metadata["format"], MetaValue::Str("html".into()));
        assert_eq!(tokens(&got.text), tokens(&expected("sample.html")));
    }

    // ── helper-level unit tests ───────────────────────────────────────────────

    /// Bracketed link text is unwrapped; an unmatched `[` and bracket-free
    /// input pass through unchanged.
    #[test]
    fn unwrap_brackets_flattens_link_text() {
        assert_eq!(
            unwrap_brackets("contact [ops@acme.example] or the [handbook]."),
            "contact ops@acme.example or the handbook."
        );
        // Unmatched '[' is preserved.
        assert_eq!(unwrap_brackets("a [b c"), "a [b c");
        // No brackets → untouched.
        assert_eq!(unwrap_brackets("plain text"), "plain text");
    }

    /// Heading hashes are removed while bullet (`*`) and ordered (`1.`) markers
    /// and plain lines are kept.
    #[test]
    fn strip_markdown_decorations_drops_heading_hashes() {
        let input = "# Title\n## Section\n* bullet\n1. ordered\nplain\n";
        let out = strip_markdown_decorations(input);
        assert_eq!(out, "Title\nSection\n* bullet\n1. ordered\nplain\n");
    }

    /// `local_name` drops a namespace prefix (`w:`, `dc:`) and leaves an
    /// unprefixed name unchanged.
    #[test]
    fn local_name_strips_prefix() {
        assert_eq!(local_name(b"w:t"), b"t");
        assert_eq!(local_name(b"t"), b"t");
        assert_eq!(local_name(b"dc:title"), b"title");
    }

    /// The extraction result serializes to JSON with a `text` field and a
    /// `metadata` object whose values are bare JSON scalars.
    #[test]
    fn extracted_serializes_to_text_metadata_json() {
        let got = extract(&fixture("sample.xlsx")).unwrap();
        let json = serde_json::to_value(&got).unwrap();
        assert!(json.get("text").is_some());
        assert_eq!(json["metadata"]["format"], "spreadsheet");
        assert_eq!(json["metadata"]["sheets"], 1);
        // MetaValue::Num serializes as a bare JSON number, Str as a bare string.
        assert!(json["metadata"]["sheets"].is_number());
        assert!(json["metadata"]["format"].is_string());
    }
}
dbmd_core/extract.rs

dbmd_core/
extract.rs