use std::collections::BTreeMap;
use std::io::Read;
use std::panic::{catch_unwind, AssertUnwindSafe};
use std::path::Path;
use serde::Serialize;
/// Result of extracting one document: its text plus a sorted metadata map.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct Extracted {
/// Extracted text. `Extracted::new` fills this with the output of `normalize_text`.
pub text: String,
/// Metadata entries keyed by name. `Extracted::new` seeds it with a `"format"` entry.
pub metadata: BTreeMap<String, MetaValue>,
}
impl Extracted {
    /// Wraps raw extractor output: the text is run through `normalize_text`
    /// and the metadata map starts with a single `"format"` entry holding
    /// the format's tag.
    fn new(raw_text: String, format: Format) -> Self {
        let format_tag = MetaValue::Str(format.tag().to_string());
        let metadata = BTreeMap::from([("format".to_string(), format_tag)]);
        Self {
            text: normalize_text(&raw_text),
            metadata,
        }
    }

    /// Stores a string metadata value under `key`.
    ///
    /// Values that are empty or whitespace-only are dropped, leaving any
    /// existing entry for `key` untouched.
    fn put_str(&mut self, key: &str, value: impl Into<String>) {
        let text = value.into();
        if text.trim().is_empty() {
            return;
        }
        self.metadata.insert(key.to_string(), MetaValue::Str(text));
    }

    /// Stores a numeric metadata value under `key`, replacing any previous entry.
    fn put_num(&mut self, key: &str, value: u64) {
        let entry = MetaValue::Num(value);
        self.metadata.insert(key.to_string(), entry);
    }
}
/// A single metadata value: either text or an unsigned integer.
///
/// `#[serde(untagged)]` makes the payload serialize directly (a JSON string or
/// number) rather than being wrapped in an object naming the variant.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
pub enum MetaValue {
/// Textual metadata value.
Str(String),
/// Numeric metadata value.
Num(u64),
}
/// Document formats this module knows how to extract text from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// `.pdf`
    Pdf,
    /// `.docx`
    Docx,
    /// `.xlsx`, `.xlsm`, `.xlsb`, `.ods`
    Spreadsheet,
    /// `.epub`
    Epub,
    /// `.html`, `.htm`, `.xhtml`
    Html,
}

impl Format {
    /// Detects the format from the file extension of `path`, ignoring ASCII case.
    ///
    /// Returns `None` when the path has no extension, the extension is not
    /// valid UTF-8, or the extension is not one of the supported ones. Only
    /// the extension is inspected; the file is never opened.
    pub fn from_path(path: &Path) -> Option<Format> {
        let extension = path.extension().and_then(|ext| ext.to_str())?;
        match extension.to_ascii_lowercase().as_str() {
            "pdf" => Some(Self::Pdf),
            "docx" => Some(Self::Docx),
            "xlsx" | "xlsm" | "xlsb" | "ods" => Some(Self::Spreadsheet),
            "epub" => Some(Self::Epub),
            "html" | "htm" | "xhtml" => Some(Self::Html),
            _ => None,
        }
    }

    /// Short lowercase identifier for the format, as stored under the
    /// `"format"` metadata key.
    pub fn tag(self) -> &'static str {
        match self {
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Spreadsheet => "spreadsheet",
            Self::Epub => "epub",
            Self::Html => "html",
        }
    }
}
/// Errors returned by the extraction functions in this module.
#[derive(Debug, thiserror::Error)]
pub enum ExtractError {
/// The path's extension is missing or not one of the supported formats.
/// The payload is the extension text (empty when there was none).
#[error("unsupported document format: {0:?} (supported: pdf, docx, xlsx/xlsm/xlsb/ods, epub, html/htm/xhtml)")]
UnsupportedFormat(String),
/// The document could not be read because it is encrypted; the payload is
/// the underlying parser message.
#[error("document is encrypted or password-protected: {0}")]
Encrypted(String),
/// The document was recognised but could not be parsed.
#[error("failed to parse {format} document: {message}")]
Parse {
/// Tag of the parser that failed (e.g. `"pdf"`, `"epub"`, `"html"`).
format: &'static str,
/// Human-readable description of the failure.
message: String,
},
/// An I/O error, forwarded unchanged (`transparent`) from `std::io::Error`.
#[error(transparent)]
Io(#[from] std::io::Error),
}
impl ExtractError {
pub fn code(&self) -> &'static str {
match self {
ExtractError::UnsupportedFormat(_) => "UNSUPPORTED_FORMAT",
ExtractError::Encrypted(_) => "DOCUMENT_ENCRYPTED",
ExtractError::Parse { .. } => "EXTRACT_PARSE_ERROR",
ExtractError::Io(_) => "IO_ERROR",
}
}
}
/// Shorthand for results whose error type is [`ExtractError`].
pub type Result<T> = std::result::Result<T, ExtractError>;
/// Extracts text and metadata from the document at `path`.
///
/// The format is chosen purely from the file extension (see
/// `Format::from_path`); the matching format-specific extractor then does
/// the work.
///
/// # Errors
///
/// Returns `ExtractError::UnsupportedFormat` carrying the extension (or an
/// empty string when there is none) if the extension is not recognised, and
/// otherwise whatever the selected extractor reports.
pub fn extract(path: &Path) -> Result<Extracted> {
    let Some(format) = Format::from_path(path) else {
        let extension = path.extension().and_then(|e| e.to_str()).unwrap_or("");
        return Err(ExtractError::UnsupportedFormat(extension.to_string()));
    };

    match format {
        Format::Pdf => extract_pdf(path),
        Format::Docx => extract_docx(path),
        Format::Spreadsheet => extract_spreadsheet(path),
        Format::Epub => extract_epub(path),
        Format::Html => extract_html(path),
    }
}
/// Normalizes extracted text into a canonical line-oriented form.
///
/// * `\r\n` and lone `\r` become `\n`.
/// * Trailing whitespace is removed from every line.
/// * Leading and trailing blank lines are dropped.
/// * Runs of blank lines are collapsed to a single blank line.
/// * Every emitted line ends with `\n`.
///
/// Input containing no non-blank line yields an empty string.
pub fn normalize_text(raw: &str) -> String {
    let unified = raw.replace("\r\n", "\n").replace('\r', "\n");
    let trimmed: Vec<&str> = unified.lines().map(str::trim_end).collect();

    let Some(start) = trimmed.iter().position(|l| !l.is_empty()) else {
        return String::new();
    };
    // A non-blank line exists (at `start`), so the reverse search always succeeds.
    let end = trimmed
        .iter()
        .rposition(|l| !l.is_empty())
        .unwrap_or(start);

    let mut normalized = String::new();
    let mut previous_blank = false;
    for line in &trimmed[start..=end] {
        let blank = line.is_empty();
        if blank && previous_blank {
            // Second or later blank line in a run: skip it.
            continue;
        }
        previous_blank = blank;
        normalized.push_str(line);
        normalized.push('\n');
    }
    normalized
}
fn extract_pdf(path: &Path) -> Result<Extracted> {
let bytes = std::fs::read(path)?;
let text = match guard_pdf_panic(|| pdf_extract::extract_text_from_mem(&bytes))? {
Ok(t) => t,
Err(e) => return Err(classify_pdf_error(e)),
};
let mut out = Extracted::new(text, Format::Pdf);
if let Ok(Ok(doc)) = guard_pdf_panic(|| pdf_extract::Document::load_mem(&bytes)) {
out.put_num("pages", doc.get_pages().len() as u64);
}
Ok(out)
}
/// Runs `f` and converts a panic raised inside it into a pdf `Parse` error.
///
/// On normal completion the closure's return value is passed through in `Ok`.
/// The panic payload itself is discarded; the error carries a fixed message.
fn guard_pdf_panic<T>(f: impl FnOnce() -> T) -> Result<T> {
    match catch_unwind(AssertUnwindSafe(f)) {
        Ok(value) => Ok(value),
        Err(_payload) => Err(ExtractError::Parse {
            format: "pdf",
            message: String::from("pdf parser aborted on malformed input"),
        }),
    }
}
/// Maps a PDF library error to `Encrypted` or `Parse`.
///
/// The decision is a case-insensitive substring check on the error's display
/// text: any mention of "password", "decrypt" or "encrypt" is reported as
/// `Encrypted`; everything else becomes a pdf `Parse` error. The original
/// message is kept in both cases.
fn classify_pdf_error(err: pdf_extract::OutputError) -> ExtractError {
    const ENCRYPTION_HINTS: [&str; 3] = ["password", "decrypt", "encrypt"];

    let msg = err.to_string();
    let lowered = msg.to_ascii_lowercase();
    if ENCRYPTION_HINTS.iter().any(|hint| lowered.contains(*hint)) {
        return ExtractError::Encrypted(msg);
    }
    ExtractError::Parse {
        format: "pdf",
        message: msg,
    }
}
/// Extracts the body text of a `.docx` file by reading `word/document.xml`
/// from the zip container and flattening its text runs.
fn extract_docx(path: &Path) -> Result<Extracted> {
    let mut archive = open_zip(std::fs::File::open(path)?, "docx")?;
    let document_xml = read_zip_entry(&mut archive, "word/document.xml", "docx")?;
    wordprocessing_text(&document_xml, "docx").map(|text| Extracted::new(text, Format::Docx))
}
/// Flattens WordprocessingML into plain text.
///
/// Element names are compared by local name (namespace prefix ignored):
///
/// * text, entity references and CDATA are kept only while inside a `t` element;
/// * the end of a `p` element emits a newline;
/// * empty `tab` elements emit a tab, empty `br`/`cr` elements emit a newline.
///
/// `format` is only used to label the `Parse` error returned on malformed XML.
fn wordprocessing_text(xml: &str, format: &'static str) -> Result<String> {
    use quick_xml::events::Event;
    use quick_xml::reader::Reader;

    let mut reader = Reader::from_str(xml);
    let mut scratch = Vec::new();
    let mut text = String::new();
    let mut inside_t = false;

    loop {
        match reader
            .read_event_into(&mut scratch)
            .map_err(|e| ExtractError::Parse {
                format,
                message: format!("malformed XML: {e}"),
            })? {
            Event::Eof => break,
            Event::Start(e) if local_name(e.name().as_ref()) == b"t" => inside_t = true,
            Event::End(e) => match local_name(e.name().as_ref()) {
                b"t" => inside_t = false,
                b"p" => text.push('\n'),
                _ => {}
            },
            Event::Empty(e) => match local_name(e.name().as_ref()) {
                b"tab" => text.push('\t'),
                b"br" | b"cr" => text.push('\n'),
                _ => {}
            },
            Event::Text(t) if inside_t => {
                text.push_str(&String::from_utf8_lossy(&t.into_inner()));
            }
            Event::GeneralRef(r) if inside_t => text.push_str(&resolve_entity_ref(&r)),
            Event::CData(c) if inside_t => {
                text.push_str(&String::from_utf8_lossy(&c.into_inner()));
            }
            _ => {}
        }
        scratch.clear();
    }
    Ok(text)
}
/// Returns the part of a qualified XML name after its last `:`.
///
/// A name without a colon is returned unchanged.
fn local_name(qname: &[u8]) -> &[u8] {
    // The first piece yielded by a reverse split is the suffix after the last
    // separator, or the whole slice when no separator is present.
    qname.rsplit(|&byte| byte == b':').next().unwrap_or(qname)
}
/// Turns an XML entity reference (the text between `&` and `;`) into the
/// string it stands for.
///
/// Numeric character references resolve to their character. The five
/// predefined XML entities map to their literal characters. Any other name is
/// returned as-is (just the name, without `&`/`;`), and a reference that
/// cannot be decoded yields an empty string.
fn resolve_entity_ref(reference: &quick_xml::events::BytesRef<'_>) -> String {
    if let Ok(Some(ch)) = reference.resolve_char_ref() {
        return ch.to_string();
    }
    let Ok(name) = reference.decode() else {
        return String::new();
    };
    let replacement = match &*name {
        "amp" => "&",
        "lt" => "<",
        "gt" => ">",
        "quot" => "\"",
        "apos" => "'",
        unknown => unknown,
    };
    replacement.to_string()
}
/// Largest dense grid size (as computed by `spreadsheet_dense_cells`) a single
/// sheet may have before `extract_spreadsheet` refuses it with a `Parse` error.
const MAX_SPREADSHEET_CELLS: u64 = 50_000_000;
/// Extracts every sheet of a workbook as tab-separated rows.
///
/// Sheets are emitted in workbook order, separated by a blank line. Before a
/// sheet is materialised, `spreadsheet_dense_cells` estimates the size of its
/// dense grid; a sheet above `MAX_SPREADSHEET_CELLS` aborts the extraction
/// with a `Parse` error instead of being loaded.
///
/// Metadata: `"sheets"` (sheet count) and, when there is at least one sheet,
/// `"sheet_names"` (names joined with `", "`).
fn extract_spreadsheet(path: &Path) -> Result<Extracted> {
    use calamine::{open_workbook_auto, Reader};

    let parse_err = |message: String| ExtractError::Parse {
        format: "spreadsheet",
        message,
    };

    let mut workbook = open_workbook_auto(path).map_err(|e| parse_err(e.to_string()))?;
    let sheet_names = workbook.sheet_names().to_vec();

    let mut text = String::new();
    for (idx, name) in sheet_names.iter().enumerate() {
        if idx != 0 {
            text.push('\n');
        }

        // Refuse oversized grids before `worksheet_range` allocates them.
        match spreadsheet_dense_cells(&mut workbook, name)? {
            Some(cells) if cells > MAX_SPREADSHEET_CELLS => {
                return Err(parse_err(format!(
                    "sheet {name:?} declares a {cells}-cell grid, over the \
                     {MAX_SPREADSHEET_CELLS}-cell cap (malformed or hostile spreadsheet)"
                )));
            }
            _ => {}
        }

        let range = workbook
            .worksheet_range(name)
            .map_err(|e| parse_err(format!("sheet {name:?}: {e}")))?;
        for row in range.rows() {
            let rendered: Vec<String> = row.iter().map(render_cell).collect();
            text.push_str(&rendered.join("\t"));
            text.push('\n');
        }
    }

    let mut extracted = Extracted::new(text, Format::Spreadsheet);
    extracted.put_num("sheets", sheet_names.len() as u64);
    if !sheet_names.is_empty() {
        extracted.put_str("sheet_names", sheet_names.join(", "));
    }
    Ok(extracted)
}
/// Estimates how many cells a dense grid covering sheet `name` would hold.
///
/// For xlsx and xlsb workbooks the sheet's cells are streamed through the
/// workbook's cell reader and the bounding box of all non-empty cells is
/// computed; the result is `rows * cols` of that box (saturating), or `0`
/// when the sheet has no non-empty cell. For xls and ods workbooks no scan is
/// performed and `Ok(None)` is returned.
///
/// # Errors
///
/// Returns a spreadsheet `Parse` error if the cell reader cannot be opened or
/// fails while iterating.
fn spreadsheet_dense_cells(
workbook: &mut calamine::Sheets<std::io::BufReader<std::fs::File>>,
name: &str,
) -> Result<Option<u64>> {
use calamine::{DataRef, Sheets};
// Format-independent core: `next` yields `((row, col), is_empty)` for each
// cell, `Ok(None)` at the end of the sheet, or an error. Always returns
// `Some(..)` on success.
fn extent<E: std::fmt::Display>(
mut next: impl FnMut() -> std::result::Result<Option<((u32, u32), bool)>, E>,
) -> Result<Option<u64>> {
// Running min/max of row and column indices over non-empty cells.
let (mut r0, mut r1, mut c0, mut c1) = (u32::MAX, 0u32, u32::MAX, 0u32);
let mut any = false;
loop {
match next() {
Ok(Some(((r, c), is_empty))) => {
// Empty cells do not contribute to the bounding box.
if is_empty {
continue;
}
any = true;
r0 = r0.min(r);
r1 = r1.max(r);
c0 = c0.min(c);
c1 = c1.max(c);
}
Ok(None) => break,
Err(e) => {
return Err(ExtractError::Parse {
format: "spreadsheet",
message: format!("scanning sheet dimensions: {e}"),
})
}
}
}
// No non-empty cell seen: the min/max sentinels are still in place, so
// report an empty grid instead of computing with them.
if !any {
return Ok(Some(0));
}
// Widen to u64 before the `+ 1` so a full-range u32 span cannot overflow.
let rows = u64::from(r1 - r0) + 1;
let cols = u64::from(c1 - c0) + 1;
Ok(Some(rows.saturating_mul(cols)))
}
match workbook {
Sheets::Xlsx(xlsx) => {
let mut reader =
xlsx.worksheet_cells_reader(name)
.map_err(|e| ExtractError::Parse {
format: "spreadsheet",
message: format!("sheet {name:?}: {e}"),
})?;
// Adapt the reader to the `((row, col), is_empty)` shape `extent` expects.
extent(|| {
reader.next_cell().map(|opt| {
opt.map(|c| (c.get_position(), matches!(c.get_value(), DataRef::Empty)))
})
})
}
Sheets::Xlsb(xlsb) => {
let mut reader =
xlsb.worksheet_cells_reader(name)
.map_err(|e| ExtractError::Parse {
format: "spreadsheet",
message: format!("sheet {name:?}: {e}"),
})?;
// Same adaptation as the xlsx arm, over the xlsb reader type.
extent(|| {
reader.next_cell().map(|opt| {
opt.map(|c| (c.get_position(), matches!(c.get_value(), DataRef::Empty)))
})
})
}
// No pre-scan for these formats; the caller skips the size cap on `None`.
Sheets::Xls(_) | Sheets::Ods(_) => Ok(None),
}
}
/// Renders one spreadsheet cell as text.
///
/// * Empty cells become an empty string.
/// * Strings and ISO date-time/duration strings are copied verbatim.
/// * Finite whole-number floats with magnitude below `1e15` are printed as
///   integers (no trailing `.0`); other floats use their default formatting.
/// * Booleans become `TRUE` / `FALSE`.
/// * Excel date-times go through `render_excel_datetime`.
/// * Cell errors are shown using their `Debug` representation.
fn render_cell(cell: &calamine::Data) -> String {
    use calamine::Data;

    match cell {
        Data::Empty => String::new(),
        Data::String(s) | Data::DateTimeIso(s) | Data::DurationIso(s) => s.clone(),
        Data::Int(i) => i.to_string(),
        Data::Float(f) if f.is_finite() && f.fract() == 0.0 && f.abs() < 1e15 => {
            (*f as i64).to_string()
        }
        Data::Float(f) => f.to_string(),
        Data::Bool(true) => "TRUE".to_string(),
        Data::Bool(false) => "FALSE".to_string(),
        Data::DateTime(dt) => render_excel_datetime(dt),
        Data::Error(e) => format!("{e:?}"),
    }
}
/// Renders an Excel date-time value.
///
/// Durations, and serial numbers outside `0.0..=2_958_465.0` (including NaN),
/// are returned as the raw serial number. Otherwise the value is formatted as
/// `YYYY-MM-DD`, with ` HH:MM:SS` appended unless the time of day is exactly
/// midnight. Milliseconds are not shown.
fn render_excel_datetime(dt: &calamine::ExcelDateTime) -> String {
    let serial = dt.as_f64();
    let in_calendar_range = (0.0..=2_958_465.0).contains(&serial);
    if dt.is_duration() || !in_calendar_range {
        return serial.to_string();
    }

    let (y, mo, d, h, mi, s, _ms) = dt.to_ymd_hms_milli();
    let date = format!("{y:04}-{mo:02}-{d:02}");
    if (h, mi, s) == (0, 0, 0) {
        date
    } else {
        format!("{date} {h:02}:{mi:02}:{s:02}")
    }
}
/// Extracts the text of an EPUB in spine (reading) order.
///
/// The OPF package document is located through `META-INF/container.xml`;
/// each spine entry is resolved through the manifest to a zip entry relative
/// to the OPF's directory and flattened with `html_to_text`.
///
/// Spine entries with no manifest item, or whose zip entry cannot be read,
/// are skipped silently. A chapter that is read but fails HTML conversion
/// aborts the extraction. Chapters that contain only whitespace are neither
/// emitted nor counted.
///
/// Metadata: `"chapters"` (number of non-empty chapters) and `"title"` when
/// the OPF provides a non-blank one.
fn extract_epub(path: &Path) -> Result<Extracted> {
    let mut archive = open_zip(std::fs::File::open(path)?, "epub")?;
    let container = read_zip_entry(&mut archive, "META-INF/container.xml", "epub")?;
    let opf_path = epub_opf_path(&container)?;
    let package = parse_opf(&read_zip_entry(&mut archive, &opf_path, "epub")?)?;
    let base = opf_base_dir(&opf_path);

    let mut text = String::new();
    let mut chapters = 0u64;
    let hrefs = package
        .spine
        .iter()
        .filter_map(|idref| package.manifest.get(idref));
    for href in hrefs {
        let entry = join_zip_path(&base, href);
        let chapter_xhtml = match read_zip_entry(&mut archive, &entry, "epub") {
            Ok(xhtml) => xhtml,
            Err(_) => continue,
        };
        let chapter_text = html_to_text(chapter_xhtml.as_bytes())?;
        if chapter_text.trim().is_empty() {
            continue;
        }
        // Blank line between consecutive chapters.
        if chapters > 0 {
            text.push('\n');
        }
        text.push_str(&chapter_text);
        text.push('\n');
        chapters += 1;
    }

    let mut extracted = Extracted::new(text, Format::Epub);
    extracted.put_num("chapters", chapters);
    if let Some(title) = package.title {
        extracted.put_str("title", title);
    }
    Ok(extracted)
}
/// Finds the OPF package path in an EPUB `container.xml`.
///
/// Returns the `full-path` attribute of the first `rootfile` element that has
/// one.
///
/// # Errors
///
/// Returns an epub `Parse` error when the XML is malformed or no `rootfile`
/// element carries a `full-path` attribute.
fn epub_opf_path(container_xml: &str) -> Result<String> {
    use quick_xml::events::Event;
    use quick_xml::reader::Reader;

    let mut reader = Reader::from_str(container_xml);
    let mut scratch = Vec::new();
    loop {
        match reader
            .read_event_into(&mut scratch)
            .map_err(|e| ExtractError::Parse {
                format: "epub",
                message: format!("container.xml: {e}"),
            })? {
            Event::Eof => break,
            Event::Start(e) | Event::Empty(e)
                if local_name(e.name().as_ref()) == b"rootfile" =>
            {
                if let Some(full_path) = attr_value(&e, b"full-path") {
                    return Ok(full_path);
                }
            }
            _ => {}
        }
        scratch.clear();
    }

    Err(ExtractError::Parse {
        format: "epub",
        message: "container.xml has no <rootfile full-path>".to_string(),
    })
}
/// The parts of an EPUB OPF package document that `parse_opf` collects.
struct OpfParsed {
/// Manifest items: `id` attribute mapped to `href` attribute.
manifest: BTreeMap<String, String>,
/// `idref` attributes of the spine's `itemref` elements, in document order.
spine: Vec<String>,
/// Trimmed text of the first non-blank `title` element, if any.
title: Option<String>,
}
/// Parses an OPF package document into its manifest, spine and title.
///
/// Elements are matched by local name (namespace prefix ignored):
///
/// * `item` elements with both `id` and `href` populate the manifest;
/// * `itemref` elements with `idref` are appended to the spine;
/// * text, entity references and CDATA inside a non-empty `title` element
///   are accumulated, and the trimmed result becomes the title when the
///   element closes — unless a title has already been recorded.
///
/// # Errors
///
/// Returns an epub `Parse` error if the XML is malformed.
fn parse_opf(opf_xml: &str) -> Result<OpfParsed> {
    use quick_xml::events::Event;
    use quick_xml::reader::Reader;

    /// Records a manifest `item` or spine `itemref`; other elements are ignored.
    fn record_package_entry(
        elem: &quick_xml::events::BytesStart<'_>,
        manifest: &mut BTreeMap<String, String>,
        spine: &mut Vec<String>,
    ) {
        match local_name(elem.name().as_ref()) {
            b"item" => {
                if let (Some(id), Some(href)) = (attr_value(elem, b"id"), attr_value(elem, b"href"))
                {
                    manifest.insert(id, href);
                }
            }
            b"itemref" => {
                if let Some(idref) = attr_value(elem, b"idref") {
                    spine.push(idref);
                }
            }
            _ => {}
        }
    }

    let mut reader = Reader::from_str(opf_xml);
    let mut scratch = Vec::new();
    let mut manifest = BTreeMap::new();
    let mut spine = Vec::new();
    let mut title: Option<String> = None;
    let mut in_title = false;
    let mut title_buf = String::new();

    loop {
        match reader
            .read_event_into(&mut scratch)
            .map_err(|e| ExtractError::Parse {
                format: "epub",
                message: format!("OPF: {e}"),
            })? {
            Event::Eof => break,
            Event::Start(e) => {
                if local_name(e.name().as_ref()) == b"title" {
                    // Only capture while no title has been recorded yet.
                    if title.is_none() {
                        in_title = true;
                    }
                } else {
                    record_package_entry(&e, &mut manifest, &mut spine);
                }
            }
            // A self-closing `title` has no content, so it never starts capture.
            Event::Empty(e) => record_package_entry(&e, &mut manifest, &mut spine),
            Event::End(e) if in_title && local_name(e.name().as_ref()) == b"title" => {
                in_title = false;
                let candidate = title_buf.trim();
                if !candidate.is_empty() {
                    title = Some(candidate.to_string());
                }
            }
            Event::Text(t) if in_title => {
                title_buf.push_str(&String::from_utf8_lossy(&t.into_inner()));
            }
            Event::GeneralRef(r) if in_title => title_buf.push_str(&resolve_entity_ref(&r)),
            Event::CData(c) if in_title => {
                title_buf.push_str(&String::from_utf8_lossy(&c.into_inner()));
            }
            _ => {}
        }
        scratch.clear();
    }

    Ok(OpfParsed {
        manifest,
        spine,
        title,
    })
}
/// Returns the directory part of a zip path: everything before the last `/`,
/// or an empty string when the path contains no `/`.
fn opf_base_dir(opf_path: &str) -> String {
    opf_path
        .rsplit_once('/')
        .map(|(dir, _file)| dir.to_string())
        .unwrap_or_default()
}
/// Resolves a manifest `href` against the OPF directory `base`.
///
/// The href is percent-decoded, appended to `base` (when `base` is
/// non-empty) and the result is cleaned up by `normalize_zip_path`.
fn join_zip_path(base: &str, href: &str) -> String {
    let decoded = percent_decode(href);
    if base.is_empty() {
        return normalize_zip_path(&decoded);
    }
    normalize_zip_path(&format!("{base}/{decoded}"))
}
/// Decodes `%XX` escapes in `s`.
///
/// A `%` that is not followed by two hexadecimal digits is kept literally.
/// The decoded bytes are interpreted as UTF-8, with invalid sequences
/// replaced by U+FFFD.
fn percent_decode(s: &str) -> String {
    /// Value of an ASCII hex digit, or `None` for any other byte.
    fn hex_value(byte: u8) -> Option<u8> {
        char::from(byte).to_digit(16).map(|digit| digit as u8)
    }

    let src = s.as_bytes();
    let mut decoded: Vec<u8> = Vec::with_capacity(src.len());
    let mut pos = 0;
    while let Some(&byte) = src.get(pos) {
        if byte == b'%' {
            let high = src.get(pos + 1).copied().and_then(hex_value);
            let low = src.get(pos + 2).copied().and_then(hex_value);
            if let (Some(high), Some(low)) = (high, low) {
                // Both nibbles are < 16, so the sum always fits in a u8.
                decoded.push(high * 16 + low);
                pos += 3;
                continue;
            }
        }
        decoded.push(byte);
        pos += 1;
    }
    String::from_utf8_lossy(&decoded).into_owned()
}
/// Canonicalizes a `/`-separated zip path.
///
/// Empty and `.` segments are removed and each `..` removes the preceding
/// segment. A `..` with nothing left to remove is ignored, so the result can
/// never climb above the archive root.
fn normalize_zip_path(path: &str) -> String {
    let kept = path
        .split('/')
        .fold(Vec::new(), |mut stack: Vec<&str>, segment| {
            match segment {
                "" | "." => {}
                ".." => {
                    stack.pop();
                }
                name => stack.push(name),
            }
            stack
        });
    kept.join("/")
}
/// Reads an HTML file from disk and flattens it to plain text.
fn extract_html(path: &Path) -> Result<Extracted> {
    let text = html_to_text(&std::fs::read(path)?)?;
    Ok(Extracted::new(text, Format::Html))
}
/// Converts HTML bytes to plain text using `PlainContentDecorator`.
///
/// The input is first checked by `html_block_nesting_exceeds`; documents that
/// nest deeper than `MAX_HTML_NESTING_DEPTH` are rejected before the renderer
/// sees them. Rendering uses a line width of 10 000 columns.
///
/// # Errors
///
/// Returns an html `Parse` error when the nesting cap is exceeded or the
/// renderer fails.
fn html_to_text(html: &[u8]) -> Result<String> {
    let html_error = |message: String| ExtractError::Parse {
        format: "html",
        message,
    };

    if let Some(depth) = html_block_nesting_exceeds(html, MAX_HTML_NESTING_DEPTH) {
        return Err(html_error(format!(
            "HTML block nesting depth exceeds the {MAX_HTML_NESTING_DEPTH} cap (reached {depth}; \
             malformed or hostile input)"
        )));
    }

    let renderer = html2text::config::with_decorator(PlainContentDecorator);
    renderer
        .string_from_read(html, 10_000)
        .map_err(|e| html_error(e.to_string()))
}
/// Maximum open-element depth `html_to_text` accepts before refusing the input.
const MAX_HTML_NESTING_DEPTH: usize = 4_096;
/// Lower-case names of elements that `html_block_nesting_exceeds` treats as
/// never opening a nesting level (they have no closing tag).
const HTML_VOID_ELEMENTS: &[&str] = &[
"area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source",
"track", "wbr",
];
/// Cheap pre-scan that estimates element nesting depth without building a DOM.
///
/// Walks the bytes looking for `<`. Opening tags of non-void elements raise
/// the depth, closing tags lower it (never below zero). Returns `Some(depth)`
/// as soon as the depth rises above `limit`, otherwise `None`.
///
/// This is a heuristic over raw bytes, not an HTML parser: comments,
/// declarations and processing instructions are skipped only up to the next
/// `>`, and tag-like text inside other content is counted like real tags.
fn html_block_nesting_exceeds(html: &[u8], limit: usize) -> Option<usize> {
let mut depth: usize = 0;
let mut i = 0usize;
let n = html.len();
while i < n {
// Fast-forward to the next '<'.
if html[i] != b'<' {
i += 1;
continue;
}
// A '<' as the very last byte cannot start a tag.
let Some(&c) = html.get(i + 1) else { break };
// `<!...>` (comment/doctype) and `<?...>`: skip to just past the next '>'.
if c == b'!' || c == b'?' {
i = memchr_gt(html, i + 1);
continue;
}
// Closing tag: one level up; saturating so stray closers cannot underflow.
if c == b'/' {
depth = depth.saturating_sub(1);
i = memchr_gt(html, i + 1);
continue;
}
// '<' not followed by a letter is a stray character (e.g. "a < b"), not a tag.
if !c.is_ascii_alphabetic() {
i += 1;
continue;
}
// `end` is one past the tag's '>' (or `n` when the tag is unterminated).
let end = memchr_gt(html, i + 1);
// NOTE(review): because `end` is one past the '>', `html[end - 1]` is the
// '>' itself whenever one was found, so this flag looks like it can only be
// true for an unterminated final tag whose last byte is '/'. In practice
// `<div/>` is counted as an opening tag. That may be the safer behavior for
// this guard (HTML parsers presumably ignore the trailing slash on non-void
// elements) — confirm the intent before changing it.
let self_closing = end > 0 && end <= n && html.get(end - 1) == Some(&b'/');
// Tag name: the run of ASCII alphanumerics right after '<', lower-cased.
let name_end = (i + 1..end.min(n))
.find(|&j| !html[j].is_ascii_alphanumeric())
.unwrap_or(end.min(n));
let name = html[i + 1..name_end].to_ascii_lowercase();
let is_void = std::str::from_utf8(&name)
.map(|s| HTML_VOID_ELEMENTS.contains(&s))
.unwrap_or(false);
if !self_closing && !is_void {
depth += 1;
if depth > limit {
return Some(depth);
}
}
i = end;
}
None
}
/// Returns the index just past the first `>` at or after `from`, or
/// `hay.len()` when there is none (including when `from` is out of range).
fn memchr_gt(hay: &[u8], from: usize) -> usize {
    hay.get(from..)
        .and_then(|tail| tail.iter().position(|&byte| byte == b'>'))
        .map_or(hay.len(), |offset| from + offset + 1)
}
/// `html2text` decorator that keeps content and drops inline markup.
///
/// Links, emphasis, strong, strikeout, code, headers and superscript add no
/// decoration; images are replaced by their title text; block quotes and list
/// items keep a short textual prefix.
#[derive(Clone, Debug)]
struct PlainContentDecorator;
impl html2text::render::TextDecorator for PlainContentDecorator {
// No annotations are attached to the rendered text.
type Annotation = ();
// Links: emit nothing before or after the link text (the URL is dropped).
fn decorate_link_start(&mut self, _url: &str) -> (String, Self::Annotation) {
(String::new(), ())
}
fn decorate_link_end(&mut self) -> String {
String::new()
}
// Emphasis: no markers.
fn decorate_em_start(&self) -> (String, Self::Annotation) {
(String::new(), ())
}
fn decorate_em_end(&self) -> String {
String::new()
}
// Strong: no markers.
fn decorate_strong_start(&self) -> (String, Self::Annotation) {
(String::new(), ())
}
fn decorate_strong_end(&self) -> String {
String::new()
}
// Strikeout: no markers.
fn decorate_strikeout_start(&self) -> (String, Self::Annotation) {
(String::new(), ())
}
fn decorate_strikeout_end(&self) -> String {
String::new()
}
// Inline code: no markers.
fn decorate_code_start(&self) -> (String, Self::Annotation) {
(String::new(), ())
}
fn decorate_code_end(&self) -> String {
String::new()
}
// Preformatted blocks: nothing to annotate.
fn decorate_preformat_first(&self) -> Self::Annotation {}
fn decorate_preformat_cont(&self) -> Self::Annotation {}
// Images: render the title text; the source URL is dropped.
fn decorate_image(&mut self, _src: &str, title: &str) -> (String, Self::Annotation) {
(title.to_string(), ())
}
// Headers: no prefix at any level.
fn header_prefix(&self, _level: usize) -> String {
String::new()
}
// Block quotes are prefixed with "> ".
fn quote_prefix(&self) -> String {
"> ".to_string()
}
// Unordered list items are prefixed with "* ".
fn unordered_item_prefix(&self) -> String {
"* ".to_string()
}
// Ordered list items are prefixed with their number, e.g. "3. ".
fn ordered_item_prefix(&self, i: i64) -> String {
format!("{i}. ")
}
// Superscript: no markers.
fn decorate_superscript_start(&self) -> (String, Self::Annotation) {
(String::new(), ())
}
fn decorate_superscript_end(&self) -> String {
String::new()
}
// The decorator is stateless, so sub-blocks get a fresh unit value.
fn make_subblock_decorator(&self) -> Self {
PlainContentDecorator
}
}
// Not called by the extractors; only its unit test exercises it.
#[allow(dead_code)]
/// Removes simple markdown decorations line by line.
///
/// A line whose first non-whitespace character is `#` loses its leading
/// whitespace, all leading `#` characters and the whitespace after them.
/// Every line then has `[...]` pairs unwrapped by `unwrap_brackets`. Each
/// output line ends with `\n`.
fn strip_markdown_decorations(text: &str) -> String {
    let mut stripped = String::with_capacity(text.len());
    for line in text.lines() {
        let body = match line.trim_start().strip_prefix('#') {
            Some(rest) => rest.trim_start_matches('#').trim_start(),
            // Not a heading: keep the line as-is, indentation included.
            None => line,
        };
        stripped.push_str(&unwrap_brackets(body));
        stripped.push('\n');
    }
    stripped
}
// Only reached from `strip_markdown_decorations` and the unit tests.
#[allow(dead_code)]
/// Removes `[` … `]` pairs from `line` while keeping the text between them.
///
/// Each `[` is matched with the next `]` after it; both brackets are dropped.
/// A `[` with no later `]` is kept literally together with the rest of the line.
fn unwrap_brackets(line: &str) -> String {
    let mut unwrapped = String::with_capacity(line.len());
    let mut rest = line;
    while let Some(open) = rest.find('[') {
        unwrapped.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];
        match after_open.find(']') {
            Some(close) => {
                unwrapped.push_str(&after_open[..close]);
                rest = &after_open[close + 1..];
            }
            None => {
                // Unclosed bracket: keep it and everything after it verbatim.
                unwrapped.push('[');
                unwrapped.push_str(after_open);
                rest = "";
            }
        }
    }
    unwrapped.push_str(rest);
    unwrapped
}
/// Opens `reader` as a zip archive.
///
/// `format` labels the `Parse` error returned when the data is not a valid
/// zip container.
fn open_zip<R: Read + std::io::Seek>(
    reader: R,
    format: &'static str,
) -> Result<zip::ZipArchive<R>> {
    match zip::ZipArchive::new(reader) {
        Ok(archive) => Ok(archive),
        Err(e) => Err(ExtractError::Parse {
            format,
            message: format!("not a valid zip container: {e}"),
        }),
    }
}
/// Largest uncompressed size (256 MiB) `read_zip_entry` will accept for one entry.
const MAX_ZIP_ENTRY_BYTES: u64 = 256 * 1024 * 1024;
/// Reads the zip entry `name` and returns its contents as text.
///
/// The entry is limited to `MAX_ZIP_ENTRY_BYTES` twice over: first against
/// the size declared in the archive, then against the bytes actually
/// produced while decompressing (the declared size can lie). Bytes that are
/// not valid UTF-8 are replaced with U+FFFD.
///
/// `format` labels the `Parse` errors.
///
/// # Errors
///
/// Returns a `Parse` error when the entry is missing, exceeds the size cap,
/// or cannot be read.
fn read_zip_entry<R: Read + std::io::Seek>(
    archive: &mut zip::ZipArchive<R>,
    name: &str,
    format: &'static str,
) -> Result<String> {
    let entry = archive.by_name(name).map_err(|e| ExtractError::Parse {
        format,
        message: format!("missing zip entry {name:?}: {e}"),
    })?;
    let declared = entry.size();
    if declared > MAX_ZIP_ENTRY_BYTES {
        return Err(ExtractError::Parse {
            format,
            message: format!(
                "zip entry {name:?} declares {declared} bytes, over the {MAX_ZIP_ENTRY_BYTES}-byte cap"
            ),
        });
    }
    let mut bytes = Vec::new();
    // Read one byte past the cap so an over-long stream is detectable below.
    entry
        .take(MAX_ZIP_ENTRY_BYTES + 1)
        .read_to_end(&mut bytes)
        .map_err(|e| ExtractError::Parse {
            format,
            message: format!("reading {name:?}: {e}"),
        })?;
    if bytes.len() as u64 > MAX_ZIP_ENTRY_BYTES {
        return Err(ExtractError::Parse {
            format,
            message: format!(
                "zip entry {name:?} exceeds the {MAX_ZIP_ENTRY_BYTES}-byte cap (decompression bomb?)"
            ),
        });
    }
    // Valid UTF-8 (the common case) reuses the buffer instead of copying up to
    // 256 MiB; only invalid input pays for the lossy re-encode.
    Ok(String::from_utf8(bytes)
        .unwrap_or_else(|err| String::from_utf8_lossy(err.as_bytes()).into_owned()))
}
/// Returns the unescaped value of the first attribute on `elem` whose local
/// name (namespace prefix ignored) equals `key`.
///
/// Attributes that fail to parse are skipped, as are matching attributes
/// whose value cannot be unescaped. Returns `None` when nothing qualifies.
fn attr_value(elem: &quick_xml::events::BytesStart<'_>, key: &[u8]) -> Option<String> {
    for attr in elem.attributes().flatten() {
        if local_name(attr.key.as_ref()) != key {
            continue;
        }
        #[allow(deprecated)]
        let unescaped = attr.unescape_value();
        if let Ok(value) = unescaped {
            return Some(value.into_owned());
        }
    }
    None
}
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
fn fixture(name: &str) -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("../../tests/corpora/corpus-c-formats/sources/docs")
.join(name)
}
fn expected(name: &str) -> String {
std::fs::read_to_string(fixture(&format!("{name}.txt"))).unwrap()
}
fn tokens(s: &str) -> String {
s.split_whitespace().collect::<Vec<_>>().join(" ")
}
fn line_set(s: &str) -> Vec<String> {
let mut v: Vec<String> = s.lines().map(tokens).filter(|l| !l.is_empty()).collect();
v.sort();
v
}
#[test]
fn excel_datetime_out_of_range_serial_stays_raw_and_never_panics() {
use calamine::{ExcelDateTime, ExcelDateTimeType};
let in_range = render_excel_datetime(&ExcelDateTime::new(
46_188.0,
ExcelDateTimeType::DateTime,
false,
));
assert!(
in_range.contains('-'),
"an in-range serial should render a calendar date, got {in_range}"
);
for serial in [1e308_f64, 3_000_000.0, 9e18, -5.0] {
let out = render_excel_datetime(&ExcelDateTime::new(
serial,
ExcelDateTimeType::DateTime,
false,
));
assert_eq!(
out,
serial.to_string(),
"out-of-range serial {serial} must stay raw, got {out}"
);
}
}
#[test]
fn html_nesting_guard_refuses_deep_bomb_passes_flat() {
let deep = format!(
"<html><body>{}x{}</body></html>",
"<div>".repeat(8_000),
"</div>".repeat(8_000)
);
assert!(
html_block_nesting_exceeds(deep.as_bytes(), MAX_HTML_NESTING_DEPTH).is_some(),
"an 8000-deep nest must trip the guard"
);
assert!(
html_to_text(deep.as_bytes()).is_err(),
"html_to_text must refuse the bomb (typed error), not hang"
);
let flat = format!("<html><body>{}</body></html>", "<br>".repeat(50_000));
assert!(
html_block_nesting_exceeds(flat.as_bytes(), MAX_HTML_NESTING_DEPTH).is_none(),
"50k sibling void <br> are flat, not deep — must pass"
);
let normal =
"<html><body><div><p>hi <a href=\"u\">link</a>; a < b in prose</p></div></body></html>";
assert!(
html_block_nesting_exceeds(normal.as_bytes(), MAX_HTML_NESTING_DEPTH).is_none(),
"ordinary nesting (and a stray `<`) must pass"
);
assert!(
html_to_text(normal.as_bytes()).is_ok(),
"a normal document must still flatten fine"
);
}
#[test]
fn detects_format_by_extension_case_insensitively() {
assert_eq!(Format::from_path(Path::new("a.pdf")), Some(Format::Pdf));
assert_eq!(Format::from_path(Path::new("a.PDF")), Some(Format::Pdf));
assert_eq!(Format::from_path(Path::new("a.docx")), Some(Format::Docx));
assert_eq!(
Format::from_path(Path::new("a.xlsx")),
Some(Format::Spreadsheet)
);
assert_eq!(
Format::from_path(Path::new("a.ods")),
Some(Format::Spreadsheet)
);
assert_eq!(Format::from_path(Path::new("a.epub")), Some(Format::Epub));
assert_eq!(Format::from_path(Path::new("a.html")), Some(Format::Html));
assert_eq!(Format::from_path(Path::new("a.htm")), Some(Format::Html));
assert_eq!(Format::from_path(Path::new("a.txt")), None);
assert_eq!(Format::from_path(Path::new("noext")), None);
}
#[test]
fn unsupported_extension_is_typed_error() {
let err = extract(Path::new("/tmp/whatever.txt")).unwrap_err();
assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e == "txt"));
assert_eq!(err.code(), "UNSUPPORTED_FORMAT");
}
#[test]
fn missing_extension_is_unsupported() {
let err = extract(Path::new("/tmp/noext")).unwrap_err();
assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e.is_empty()));
}
#[test]
fn normalize_collapses_blanks_and_trims() {
let raw = "\r\n\r\nHeading\r\n\r\n\r\n\r\nBody line \r\n\r\n";
assert_eq!(normalize_text(raw), "Heading\n\nBody line\n");
}
#[test]
fn normalize_empty_stays_empty() {
assert_eq!(normalize_text(""), "");
assert_eq!(normalize_text(" \n\n \n"), "");
}
#[test]
fn extract_text_pdf_matches_known_good() {
let got = extract(&fixture("text.pdf")).unwrap();
assert_eq!(got.metadata["format"], MetaValue::Str("pdf".into()));
assert_eq!(got.metadata["pages"], MetaValue::Num(1));
assert_eq!(tokens(&got.text), tokens(&expected("text.pdf")));
}
#[test]
fn extract_weird_fonts_pdf_matches_known_good() {
let got = extract(&fixture("weird-fonts.pdf")).unwrap();
assert_eq!(tokens(&got.text), tokens(&expected("weird-fonts.pdf")));
}
#[test]
fn extract_multi_column_pdf_matches_content_order_agnostic() {
let got = extract(&fixture("multi-column.pdf")).unwrap();
assert_eq!(line_set(&got.text), line_set(&expected("multi-column.pdf")));
}
#[test]
fn extract_image_only_pdf_yields_empty() {
let got = extract(&fixture("image-only.pdf")).unwrap();
assert_eq!(got.text, "");
assert!(expected("image-only.pdf").trim().is_empty());
}
#[test]
fn extract_encrypted_pdf_without_password_refuses_cleanly() {
let err = extract(&fixture("encrypted.pdf")).unwrap_err();
assert!(
matches!(err, ExtractError::Encrypted(_)),
"expected Encrypted, got {err:?}"
);
assert_eq!(err.code(), "DOCUMENT_ENCRYPTED");
}
#[test]
fn guard_pdf_panic_contains_unwind_as_parse_error() {
let contained: Result<()> = guard_pdf_panic(|| panic!("simulated pdf-extract abort"));
assert!(
matches!(contained, Err(ExtractError::Parse { format: "pdf", .. })),
"panic must be contained as a pdf Parse error, got {contained:?}"
);
let ok: Result<u32> = guard_pdf_panic(|| 42);
assert_eq!(ok.unwrap(), 42);
}
#[test]
fn extract_docx_matches_known_good() {
let got = extract(&fixture("sample.docx")).unwrap();
assert_eq!(got.metadata["format"], MetaValue::Str("docx".into()));
assert_eq!(tokens(&got.text), tokens(&expected("sample.docx")));
}
#[test]
fn extract_xlsx_matches_known_good() {
let got = extract(&fixture("sample.xlsx")).unwrap();
assert_eq!(got.metadata["format"], MetaValue::Str("spreadsheet".into()));
assert_eq!(got.metadata["sheets"], MetaValue::Num(1));
assert_eq!(
got.metadata["sheet_names"],
MetaValue::Str("Expenses".into())
);
assert_eq!(got.text.trim_end(), expected("sample.xlsx").trim_end());
}
#[test]
fn extract_epub_matches_known_good() {
let got = extract(&fixture("sample.epub")).unwrap();
assert_eq!(got.metadata["format"], MetaValue::Str("epub".into()));
assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
assert_eq!(
got.metadata["title"],
MetaValue::Str("Operations Playbook".into())
);
assert_eq!(tokens(&got.text), tokens(&expected("sample.epub")));
}
#[test]
fn extract_html_matches_known_good() {
let got = extract(&fixture("sample.html")).unwrap();
assert_eq!(got.metadata["format"], MetaValue::Str("html".into()));
assert_eq!(tokens(&got.text), tokens(&expected("sample.html")));
}
#[test]
fn unwrap_brackets_flattens_link_text() {
assert_eq!(
unwrap_brackets("contact [ops@acme.example] or the [handbook]."),
"contact ops@acme.example or the handbook."
);
assert_eq!(unwrap_brackets("a [b c"), "a [b c");
assert_eq!(unwrap_brackets("plain text"), "plain text");
}
#[test]
fn strip_markdown_decorations_drops_heading_hashes() {
let input = "# Title\n## Section\n* bullet\n1. ordered\nplain\n";
let out = strip_markdown_decorations(input);
assert_eq!(out, "Title\nSection\n* bullet\n1. ordered\nplain\n");
}
#[test]
fn local_name_strips_prefix() {
assert_eq!(local_name(b"w:t"), b"t");
assert_eq!(local_name(b"t"), b"t");
assert_eq!(local_name(b"dc:title"), b"title");
}
#[test]
fn extracted_serializes_to_text_metadata_json() {
let got = extract(&fixture("sample.xlsx")).unwrap();
let json = serde_json::to_value(&got).unwrap();
assert!(json.get("text").is_some());
assert_eq!(json["metadata"]["format"], "spreadsheet");
assert_eq!(json["metadata"]["sheets"], 1);
assert!(json["metadata"]["sheets"].is_number());
assert!(json["metadata"]["format"].is_string());
}
#[test]
fn regression_normalize_text_leading_blanks_is_linear() {
let blanks = "\n".repeat(500_000);
let raw = format!("{blanks}only real line\n");
assert_eq!(normalize_text(&raw), "only real line\n");
assert_eq!(normalize_text(&" \n".repeat(500_000)), "");
}
fn write_dense_bomb_xlsx(dest: &Path) {
use std::io::Write;
let base = std::fs::read(fixture("sample.xlsx")).expect("corpus sample.xlsx exists");
let mut archive =
zip::ZipArchive::new(std::io::Cursor::new(base)).expect("sample.xlsx is a valid zip");
let bomb_sheet = b"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\
<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\">\
<sheetData>\
<row r=\"1\"><c r=\"A1\"><v>1</v></c></row>\
<row r=\"1048576\"><c r=\"XFD1048576\"><v>2</v></c></row>\
</sheetData></worksheet>";
let out = std::fs::File::create(dest).unwrap();
let mut writer = zip::ZipWriter::new(out);
let opts = zip::write::SimpleFileOptions::default()
.compression_method(zip::CompressionMethod::Stored);
for i in 0..archive.len() {
let entry = archive.by_index(i).unwrap();
let name = entry.name().to_string();
if name == "xl/worksheets/sheet1.xml" {
writer.start_file(name, opts).unwrap();
writer.write_all(bomb_sheet).unwrap();
} else {
writer.raw_copy_file(entry).unwrap();
}
}
writer.finish().unwrap();
}
#[test]
fn regression_spreadsheet_dense_bomb_refused_not_oom() {
let tmp = tempfile::TempDir::new().unwrap();
let bomb = tmp.path().join("invoice.xlsx");
write_dense_bomb_xlsx(&bomb);
assert!(
std::fs::metadata(&bomb).unwrap().len() < 10_000,
"the bomb must be tiny on disk; the danger is the in-memory expansion"
);
let err = extract(&bomb).unwrap_err();
assert!(
matches!(
err,
ExtractError::Parse {
format: "spreadsheet",
..
}
),
"an over-cap dense grid must be a typed spreadsheet Parse refusal, got {err:?}"
);
assert_eq!(err.code(), "EXTRACT_PARSE_ERROR");
}
#[test]
fn regression_spreadsheet_cap_allows_real_workbook() {
let got = extract(&fixture("sample.xlsx")).unwrap();
assert_eq!(got.metadata["sheets"], MetaValue::Num(1));
assert!(!got.text.is_empty());
}
fn write_docx(dest: &Path, body_runs: &str) {
use std::io::Write;
let document = format!(
"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\
<w:document xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\">\
<w:body>{body_runs}</w:body></w:document>"
);
let file = std::fs::File::create(dest).unwrap();
let mut writer = zip::ZipWriter::new(file);
let opts = zip::write::SimpleFileOptions::default()
.compression_method(zip::CompressionMethod::Stored);
writer.start_file("word/document.xml", opts).unwrap();
writer.write_all(document.as_bytes()).unwrap();
writer.finish().unwrap();
}
#[test]
fn regression_docx_resolves_entity_refs() {
let tmp = tempfile::TempDir::new().unwrap();
let f = tmp.path().join("entity.docx");
write_docx(
&f,
"<w:p><w:r><w:t>Smith & Co invoice <final> total — 100</w:t></w:r></w:p>",
);
let got = extract(&f).unwrap();
assert_eq!(got.text, "Smith & Co invoice <final> total — 100\n");
}
#[test]
fn regression_docx_preserves_cdata_run_text() {
let tmp = tempfile::TempDir::new().unwrap();
let f = tmp.path().join("cdata.docx");
write_docx(
&f,
"<w:p><w:r><w:t>Line A.</w:t></w:r></w:p>\
<w:p><w:r><w:t><![CDATA[IMPORTANT CDATA CONTENT]]></w:t></w:r></w:p>\
<w:p><w:r><w:t>Line C.</w:t></w:r></w:p>",
);
let got = extract(&f).unwrap();
assert_eq!(got.text, "Line A.\nIMPORTANT CDATA CONTENT\nLine C.\n");
}
#[test]
fn resolve_entity_ref_maps_named_and_numeric() {
use quick_xml::events::BytesRef;
let r = |s: &'static str| resolve_entity_ref(&BytesRef::new(s));
assert_eq!(r("amp"), "&");
assert_eq!(r("lt"), "<");
assert_eq!(r("gt"), ">");
assert_eq!(r("quot"), "\"");
assert_eq!(r("apos"), "'");
assert_eq!(r("#8212"), "—");
assert_eq!(r("#x2014"), "—");
assert_eq!(r("nbsp"), "nbsp");
}
/// Writes a minimal single-chapter EPUB archive to `dest` for use in tests.
///
/// * `opf_metadata` is spliced verbatim inside the OPF `<metadata>` element.
/// * `manifest_href` is the `href` of the only manifest item.
/// * `chapter_entry` is the archive entry name under which the fixed chapter
///   document is stored; it is independent of `manifest_href` so a test can
///   make the two differ.
///
/// All entries are stored uncompressed, with `mimetype` written first.
/// Panics if any file or archive operation fails.
fn write_epub(dest: &Path, opf_metadata: &str, manifest_href: &str, chapter_entry: &str) {
    use std::io::Write;

    let container_xml = "<?xml version=\"1.0\"?>\
        <container version=\"1.0\" xmlns=\"urn:oasis:names:tc:opendocument:xmlns:container\">\
        <rootfiles><rootfile full-path=\"OEBPS/content.opf\" \
        media-type=\"application/oebps-package+xml\"/></rootfiles></container>";
    let package_doc = format!(
        "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
        <package xmlns=\"http://www.idpf.org/2007/opf\" version=\"3.0\" unique-identifier=\"id\">\
        <metadata xmlns:dc=\"http://purl.org/dc/elements/1.1/\">{opf_metadata}</metadata>\
        <manifest><item id=\"c1\" href=\"{manifest_href}\" media-type=\"application/xhtml+xml\"/></manifest>\
        <spine><itemref idref=\"c1\"/></spine></package>"
    );
    let chapter_xhtml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\
        <html xmlns=\"http://www.w3.org/1999/xhtml\"><body>\
        <p>Hello world body text.</p></body></html>";

    let mut archive = zip::ZipWriter::new(std::fs::File::create(dest).unwrap());
    let uncompressed = zip::write::SimpleFileOptions::default()
        .compression_method(zip::CompressionMethod::Stored);

    // Entries are written in exactly this order.
    let entries: [(&str, &[u8]); 4] = [
        ("mimetype", b"application/epub+zip".as_slice()),
        ("META-INF/container.xml", container_xml.as_bytes()),
        ("OEBPS/content.opf", package_doc.as_bytes()),
        (chapter_entry, chapter_xhtml.as_bytes()),
    ];
    for (entry_name, payload) in entries {
        archive.start_file(entry_name, uncompressed).unwrap();
        archive.write_all(payload).unwrap();
    }
    archive.finish().unwrap();
}
/// Regression: the `title` metadata value must be assembled from every
/// text-bearing piece inside `dc:title`, not just the first one.
///
/// Two fixtures are checked:
/// * `entity.epub` — the title contains entity references, which must be
///   resolved and joined with the surrounding text.
/// * `comment.epub` — a comment splits the title text in two; both halves
///   must be kept and the comment itself dropped.
#[test]
fn regression_epub_title_accumulates_entities_and_nested_events() {
    let tmp = tempfile::TempDir::new().unwrap();

    // `&`, `<` and `>` are written as entity references so the OPF stays
    // well-formed; a bare `&` or a literal `<Tale>` tag would be invalid XML
    // rather than title text, and no entity would be exercised.
    let f1 = tmp.path().join("entity.epub");
    write_epub(
        &f1,
        "<dc:title>Smith &amp; Jones: A &lt;Tale&gt;</dc:title>",
        "chapter.xhtml",
        "OEBPS/chapter.xhtml",
    );
    let got = extract(&f1).unwrap();
    assert_eq!(
        got.metadata["title"],
        MetaValue::Str("Smith & Jones: A <Tale>".into())
    );

    // The comment sits between two text segments of the same title.
    let f2 = tmp.path().join("comment.epub");
    write_epub(
        &f2,
        "<dc:title>Part One<!-- editorial --> and Part Two</dc:title>",
        "chapter.xhtml",
        "OEBPS/chapter.xhtml",
    );
    let got = extract(&f2).unwrap();
    assert_eq!(
        got.metadata["title"],
        MetaValue::Str("Part One and Part Two".into())
    );
}
/// Regression: an empty, self-closing `<dc:title/>` followed by a
/// `<dc:creator>` must not leave the creator's text stored under `title`.
/// The single chapter must still be counted.
#[test]
fn regression_epub_self_closing_title_does_not_capture_author() {
    let opf_metadata = "<dc:title/><dc:creator>John Doe</dc:creator>";

    let dir = tempfile::TempDir::new().unwrap();
    let epub_path = dir.path().join("empty-title.epub");
    write_epub(
        &epub_path,
        opf_metadata,
        "chapter.xhtml",
        "OEBPS/chapter.xhtml",
    );

    let extracted = extract(&epub_path).unwrap();
    let title = extracted.metadata.get("title");
    assert!(
        title.is_none(),
        "self-closing title must not capture the author, got {:?}",
        title
    );
    assert_eq!(extracted.metadata["chapters"], MetaValue::Num(1));
}
/// Regression: the manifest refers to the chapter as `my%20chapter.xhtml`
/// while the archive entry is stored as `OEBPS/my chapter.xhtml`. The chapter
/// must still be counted and its body text extracted.
#[test]
fn regression_epub_percent_encoded_href_resolves() {
    let dir = tempfile::TempDir::new().unwrap();
    let epub_path = dir.path().join("spaced.epub");
    write_epub(
        &epub_path,
        "<dc:title>Spaced</dc:title>",
        "my%20chapter.xhtml",
        "OEBPS/my chapter.xhtml",
    );

    let extracted = extract(&epub_path).unwrap();
    assert_eq!(extracted.metadata["chapters"], MetaValue::Num(1));

    let has_body_text = extracted.text.contains("Hello world body text.");
    assert!(
        has_body_text,
        "percent-encoded-href chapter must extract, got {:?}",
        extracted.text
    );
}
/// Table-driven check of `percent_decode`: an encoded space, a two-byte
/// UTF-8 sequence, a `%` that does not start a valid escape (kept as-is),
/// and an input with nothing to decode.
#[test]
fn percent_decode_handles_spaces_and_unicode_and_stray_percent() {
    // (input, expected output)
    let cases = [
        ("my%20chapter.xhtml", "my chapter.xhtml"),
        ("caf%C3%A9.xhtml", "café.xhtml"),
        ("100%done", "100%done"),
        ("plain.xhtml", "plain.xhtml"),
    ];
    for (encoded, expected) in cases {
        assert_eq!(percent_decode(encoded), expected);
    }
}
/// Table-driven check of `normalize_zip_path`: a `..` segment removes the
/// preceding component, a `.` segment is dropped, and a path without dot
/// segments is returned unchanged.
#[test]
fn normalize_zip_path_resolves_dot_segments() {
    // (input path, expected normalized path)
    let cases = [
        ("OEBPS/../text/ch1.xhtml", "text/ch1.xhtml"),
        ("OEBPS/./ch1.xhtml", "OEBPS/ch1.xhtml"),
        ("OEBPS/ch1.xhtml", "OEBPS/ch1.xhtml"),
    ];
    for (raw, expected) in cases {
        assert_eq!(normalize_zip_path(raw), expected);
    }
}
/// Checks `render_excel_datetime` on three values: a whole-number serial
/// renders as a date only, a serial with a fractional day renders as date
/// plus time, and a `TimeDelta` value renders as the plain number.
#[test]
fn render_excel_datetime_renders_iso_not_serial() {
    use calamine::{ExcelDateTime, ExcelDateTimeType};

    // (serial value, value kind, expected rendering)
    let cases = [
        (46188.0, ExcelDateTimeType::DateTime, "2026-06-15"),
        (46143.5, ExcelDateTimeType::DateTime, "2026-05-01 12:00:00"),
        (1.5, ExcelDateTimeType::TimeDelta, "1.5"),
    ];
    for (serial, kind, expected) in cases {
        // The third argument stays `false` for every case, as in the fixtures.
        assert_eq!(
            render_excel_datetime(&ExcelDateTime::new(serial, kind, false)),
            expected
        );
    }
}
/// Checks `render_cell` on a date-time cell (rendered as an ISO date), a
/// float with no fractional part (rendered without a decimal point), and an
/// integer cell.
#[test]
fn render_cell_dates_are_iso() {
    use calamine::{Data, ExcelDateTime, ExcelDateTimeType};

    let date_cell = Data::DateTime(ExcelDateTime::new(
        46188.0,
        ExcelDateTimeType::DateTime,
        false,
    ));

    // (cell, expected rendering)
    let cases = [
        (date_cell, "2026-06-15"),
        (Data::Float(3450.0), "3450"),
        (Data::Int(7), "7"),
    ];
    for (cell, expected) in cases {
        assert_eq!(render_cell(&cell), expected);
    }
}
/// Test helper: wraps `body` in `<html><body>…</body></html>`, writes it to
/// `doc.html` in a fresh temporary directory, runs `extract` on that file and
/// returns the extracted text. Panics if writing or extraction fails.
fn html_text(body: &str) -> String {
    let dir = tempfile::TempDir::new().unwrap();
    let page_path = dir.path().join("doc.html");
    let page = format!("<html><body>{body}</body></html>");
    std::fs::write(&page_path, page).unwrap();
    let extracted = extract(&page_path).unwrap();
    extracted.text
}
/// Regression: literal `#` and `[...]` characters in HTML body text must
/// appear unchanged in the extracted text.
#[test]
fn regression_html_keeps_literal_brackets_and_hashes() {
    // Each case pairs an HTML body with the fragments its extracted text
    // must contain verbatim.
    let cases: [(&str, [&str; 2]); 2] = [
        (
            "<p>#1 in sales this quarter</p>\
            <p>see chart[3] for data, array[0] = total[net]</p>",
            [
                "#1 in sales this quarter",
                "see chart[3] for data, array[0] = total[net]",
            ],
        ),
        (
            "<p>See note [1] and [sic] here.</p><p>x[i] + y[j]</p>",
            ["See note [1] and [sic] here.", "x[i] + y[j]"],
        ),
    ];

    for (body, fragments) in cases {
        let out = html_text(body);
        for fragment in fragments {
            assert!(out.contains(fragment), "got {out:?}");
        }
    }
}
/// An `<h1>` heading must contribute its text to the output without any `#`
/// marker being added.
#[test]
fn html_headings_render_as_plain_prose_no_hash() {
    let out = html_text("<h1>Launch Plan</h1><p>Body prose.</p>");

    let has_heading_text = out.contains("Launch Plan");
    assert!(has_heading_text, "got {out:?}");

    // The input contains no '#', so any occurrence would be an added marker.
    let has_hash = out.chars().any(|ch| ch == '#');
    assert!(!has_hash, "no heading marker expected, got {out:?}");
}
/// An `<a>` element must contribute only its link text: the sentence around
/// it has to read continuously, with nothing inserted between the words.
#[test]
fn html_links_render_as_bare_text_no_brackets() {
    let markup = "<p>See the <a href=\"https://x.example\">handbook</a>.</p>";
    let out = html_text(markup);
    let reads_continuously = out.contains("See the handbook.");
    assert!(reads_continuously, "got {out:?}");
}
}