use std::collections::BTreeMap;
use std::io::Read;
use std::path::Path;
use serde::Serialize;
/// Result of a successful extraction: plain text plus per-document metadata.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub struct Extracted {
/// Extracted text; `Extracted::new` stores the output of `normalize_text` here.
pub text: String,
/// Metadata entries keyed by name; `Extracted::new` always inserts a `"format"` entry.
pub metadata: BTreeMap<String, MetaValue>,
}
impl Extracted {
    /// Builds an `Extracted` from raw extractor output.
    ///
    /// The text is passed through `normalize_text`, and the metadata map starts
    /// out with a single `"format"` entry holding `format.tag()`.
    fn new(raw_text: String, format: Format) -> Self {
        let format_tag = MetaValue::Str(format.tag().to_string());
        let metadata = BTreeMap::from([("format".to_string(), format_tag)]);
        Self {
            text: normalize_text(&raw_text),
            metadata,
        }
    }

    /// Stores a string metadata value under `key`.
    ///
    /// Values that are empty or whitespace-only are ignored; accepted values are
    /// stored as given (not trimmed).
    fn put_str(&mut self, key: &str, value: impl Into<String>) {
        let value = value.into();
        if value.trim().is_empty() {
            return;
        }
        self.metadata.insert(key.to_owned(), MetaValue::Str(value));
    }

    /// Stores a numeric metadata value under `key`, replacing any existing entry.
    fn put_num(&mut self, key: &str, value: u64) {
        self.metadata.insert(key.to_owned(), MetaValue::Num(value));
    }
}
/// A single metadata value: either a string or an unsigned number.
///
/// Marked `#[serde(untagged)]`, so a value serializes as its bare inner
/// string or number rather than as a tagged variant.
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
#[serde(untagged)]
pub enum MetaValue {
/// Textual metadata value.
Str(String),
/// Numeric metadata value.
Num(u64),
}
/// Document formats the extractor knows how to handle.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    Pdf,
    Docx,
    Spreadsheet,
    Epub,
    Html,
}

impl Format {
    /// Detects the format from the file extension of `path`, ignoring ASCII case.
    ///
    /// Returns `None` when the path has no extension, the extension is not
    /// valid UTF-8, or the extension is not one of the recognized ones.
    pub fn from_path(path: &Path) -> Option<Format> {
        // Lower-case extension -> format lookup table.
        const BY_EXTENSION: &[(&str, Format)] = &[
            ("pdf", Format::Pdf),
            ("docx", Format::Docx),
            ("xlsx", Format::Spreadsheet),
            ("xlsm", Format::Spreadsheet),
            ("xlsb", Format::Spreadsheet),
            ("ods", Format::Spreadsheet),
            ("epub", Format::Epub),
            ("html", Format::Html),
            ("htm", Format::Html),
            ("xhtml", Format::Html),
        ];

        let ext = path.extension()?.to_str()?;
        BY_EXTENSION
            .iter()
            .find(|(known, _)| ext.eq_ignore_ascii_case(known))
            .map(|&(_, format)| format)
    }

    /// Returns the short lower-case tag used for this format in metadata.
    pub fn tag(self) -> &'static str {
        match self {
            Self::Pdf => "pdf",
            Self::Docx => "docx",
            Self::Spreadsheet => "spreadsheet",
            Self::Epub => "epub",
            Self::Html => "html",
        }
    }
}
/// Errors that extraction can produce.
#[derive(Debug, thiserror::Error)]
pub enum ExtractError {
/// The path's extension does not map to a known `Format`; carries the extension
/// (empty when the path has none or it is not valid UTF-8).
#[error("unsupported document format: {0:?} (supported: pdf, docx, xlsx, epub, html)")]
UnsupportedFormat(String),
/// The document could not be read because it is encrypted; carries the
/// underlying error message.
#[error("document is encrypted or password-protected: {0}")]
Encrypted(String),
/// The document (or one of its parts) could not be parsed.
#[error("failed to parse {format} document: {message}")]
Parse {
/// Short tag naming the format being parsed (e.g. `"pdf"`, `"epub"`).
format: &'static str,
/// Human-readable description of the failure.
message: String,
},
/// An I/O error, converted automatically via `#[from]`.
#[error(transparent)]
Io(#[from] std::io::Error),
}
impl ExtractError {
pub fn code(&self) -> &'static str {
match self {
ExtractError::UnsupportedFormat(_) => "UNSUPPORTED_FORMAT",
ExtractError::Encrypted(_) => "DOCUMENT_ENCRYPTED",
ExtractError::Parse { .. } => "EXTRACT_PARSE_ERROR",
ExtractError::Io(_) => "IO_ERROR",
}
}
}
/// Result alias whose error type is `ExtractError`.
pub type Result<T> = std::result::Result<T, ExtractError>;
pub fn extract(path: &Path) -> Result<Extracted> {
let format = Format::from_path(path).ok_or_else(|| {
let ext = path
.extension()
.and_then(|e| e.to_str())
.unwrap_or("")
.to_string();
ExtractError::UnsupportedFormat(ext)
})?;
match format {
Format::Pdf => extract_pdf(path),
Format::Docx => extract_docx(path),
Format::Spreadsheet => extract_spreadsheet(path),
Format::Epub => extract_epub(path),
Format::Html => extract_html(path),
}
}
/// Normalizes extracted text into a canonical line-oriented form.
///
/// * `\r\n` and lone `\r` line endings become `\n`.
/// * Trailing whitespace is removed from every line (leading whitespace is kept).
/// * Leading and trailing blank lines are dropped.
/// * Runs of blank lines between content collapse to a single blank line.
/// * Non-empty output always ends with exactly one `\n`; input with no
///   non-blank line yields an empty string.
///
/// Runs in a single pass over the lines without building an intermediate
/// `Vec` (and without front-removals from one, which are O(n) each).
pub fn normalize_text(raw: &str) -> String {
    let unix = raw.replace("\r\n", "\n").replace('\r', "\n");
    let mut out = String::with_capacity(unix.len());
    // Set when at least one blank line was seen after some content; the blank
    // is only written once more content follows, which drops trailing blanks.
    let mut pending_blank = false;
    for line in unix.lines().map(str::trim_end) {
        if line.is_empty() {
            // Blank lines before any content are discarded outright.
            pending_blank = !out.is_empty();
            continue;
        }
        if pending_blank {
            out.push('\n');
            pending_blank = false;
        }
        out.push_str(line);
        out.push('\n');
    }
    out
}
fn extract_pdf(path: &Path) -> Result<Extracted> {
let bytes = std::fs::read(path)?;
let text = match pdf_extract::extract_text_from_mem(&bytes) {
Ok(t) => t,
Err(e) => return Err(classify_pdf_error(e)),
};
let mut out = Extracted::new(text, Format::Pdf);
if let Ok(doc) = pdf_extract::Document::load_mem(&bytes) {
let pages = doc.get_pages().len() as u64;
out.put_num("pages", pages);
}
Ok(out)
}
/// Maps a PDF extraction error to `ExtractError`.
///
/// The error's message is inspected case-insensitively: if it mentions
/// "password", "decrypt" or "encrypt" the result is `ExtractError::Encrypted`,
/// otherwise a `"pdf"` `ExtractError::Parse`. Either way the original message
/// is carried along.
fn classify_pdf_error(err: pdf_extract::OutputError) -> ExtractError {
    const ENCRYPTION_MARKERS: [&str; 3] = ["password", "decrypt", "encrypt"];

    let message = err.to_string();
    let lowered = message.to_ascii_lowercase();
    let looks_encrypted = ENCRYPTION_MARKERS
        .iter()
        .any(|marker| lowered.contains(*marker));

    if looks_encrypted {
        return ExtractError::Encrypted(message);
    }
    ExtractError::Parse {
        format: "pdf",
        message,
    }
}
/// Extracts text from a DOCX file by reading `word/document.xml` out of the
/// zip container and converting it with `wordprocessing_text`.
fn extract_docx(path: &Path) -> Result<Extracted> {
    const FORMAT_TAG: &str = "docx";

    let mut archive = open_zip(std::fs::File::open(path)?, FORMAT_TAG)?;
    let document_xml = read_zip_entry(&mut archive, "word/document.xml", FORMAT_TAG)?;
    wordprocessing_text(&document_xml, FORMAT_TAG)
        .map(|text| Extracted::new(text, Format::Docx))
}
/// Converts word-processing XML into plain text.
///
/// Elements are matched by local name (namespace prefix ignored):
/// * text nodes are emitted only while inside a `t` element;
/// * the end of a `p` element emits a newline;
/// * an empty `tab` element emits a tab, an empty `br` or `cr` a newline.
///
/// # Errors
///
/// Returns `ExtractError::Parse` tagged with `format` when the XML reader
/// reports an error.
fn wordprocessing_text(xml: &str, format: &'static str) -> Result<String> {
    use quick_xml::events::Event;
    use quick_xml::reader::Reader;

    let mut reader = Reader::from_str(xml);
    let mut scratch = Vec::new();
    let mut text = String::new();
    let mut inside_t = false;

    loop {
        match reader.read_event_into(&mut scratch) {
            Err(e) => {
                return Err(ExtractError::Parse {
                    format,
                    message: format!("malformed XML: {e}"),
                });
            }
            Ok(Event::Eof) => break,
            Ok(Event::Start(start)) => {
                if local_name(start.name().as_ref()) == b"t" {
                    inside_t = true;
                }
            }
            Ok(Event::End(end)) => match local_name(end.name().as_ref()) {
                b"t" => inside_t = false,
                b"p" => text.push('\n'),
                _ => {}
            },
            Ok(Event::Empty(empty)) => match local_name(empty.name().as_ref()) {
                b"tab" => text.push('\t'),
                b"br" | b"cr" => text.push('\n'),
                _ => {}
            },
            Ok(Event::Text(chunk)) => {
                // NOTE(review): the raw bytes of the text event are used as-is,
                // with no entity unescaping here — confirm how the quick_xml
                // version in use reports references such as `&amp;`.
                if inside_t {
                    text.push_str(&String::from_utf8_lossy(&chunk.into_inner()));
                }
            }
            _ => {}
        }
        scratch.clear();
    }
    Ok(text)
}
/// Returns the part of a qualified XML name after its last `:`; names without
/// a colon are returned unchanged.
fn local_name(qname: &[u8]) -> &[u8] {
    // `rsplit` always yields at least one piece, and the first piece it yields
    // is whatever follows the final separator.
    qname
        .rsplit(|&byte| byte == b':')
        .next()
        .unwrap_or(qname)
}
/// Extracts text from a spreadsheet workbook.
///
/// Every sheet is rendered row by row: cells are converted with `render_cell`
/// and joined with tabs, one row per line, with an extra newline between
/// sheets. Metadata records the sheet count (`"sheets"`) and, when there is at
/// least one sheet, the comma-separated sheet names (`"sheet_names"`).
///
/// # Errors
///
/// Returns a `"spreadsheet"` `ExtractError::Parse` when the workbook cannot be
/// opened or a sheet cannot be read.
fn extract_spreadsheet(path: &Path) -> Result<Extracted> {
    use calamine::{open_workbook_auto, Reader};

    let parse_error = |message: String| ExtractError::Parse {
        format: "spreadsheet",
        message,
    };

    let mut workbook = open_workbook_auto(path).map_err(|e| parse_error(e.to_string()))?;
    let sheet_names = workbook.sheet_names().to_vec();

    let mut text = String::new();
    for (idx, name) in sheet_names.iter().enumerate() {
        if idx != 0 {
            text.push('\n');
        }
        let range = workbook
            .worksheet_range(name)
            .map_err(|e| parse_error(format!("sheet {name:?}: {e}")))?;
        for row in range.rows() {
            let line = row
                .iter()
                .map(render_cell)
                .collect::<Vec<String>>()
                .join("\t");
            text.push_str(&line);
            text.push('\n');
        }
    }

    let mut extracted = Extracted::new(text, Format::Spreadsheet);
    extracted.put_num("sheets", sheet_names.len() as u64);
    if !sheet_names.is_empty() {
        extracted.put_str("sheet_names", sheet_names.join(", "));
    }
    Ok(extracted)
}
/// Renders a single spreadsheet cell as text.
///
/// Empty cells become an empty string, booleans become `TRUE`/`FALSE`, and
/// finite floats with no fractional part and magnitude below `1e15` are
/// printed as integers. Errors are rendered with their `Debug` form; all other
/// variants use their string/`Display` form.
fn render_cell(cell: &calamine::Data) -> String {
    use calamine::Data;

    match cell {
        Data::Empty => String::new(),
        Data::String(s) | Data::DateTimeIso(s) | Data::DurationIso(s) => s.clone(),
        Data::Int(i) => i.to_string(),
        Data::Float(f) => {
            let is_whole = f.is_finite() && f.fract() == 0.0 && f.abs() < 1e15;
            if is_whole {
                // Avoids a trailing ".0"-style rendering for whole numbers.
                (*f as i64).to_string()
            } else {
                f.to_string()
            }
        }
        Data::Bool(b) => {
            let word = if *b { "TRUE" } else { "FALSE" };
            word.to_string()
        }
        Data::DateTime(dt) => dt.to_string(),
        Data::Error(e) => format!("{e:?}"),
    }
}
fn extract_epub(path: &Path) -> Result<Extracted> {
let file = std::fs::File::open(path)?;
let mut archive = open_zip(file, "epub")?;
let container = read_zip_entry(&mut archive, "META-INF/container.xml", "epub")?;
let opf_path = epub_opf_path(&container)?;
let opf = read_zip_entry(&mut archive, &opf_path, "epub")?;
let parsed = parse_opf(&opf)?;
let base = opf_base_dir(&opf_path);
let mut text = String::new();
let mut chapters = 0u64;
for idref in &parsed.spine {
let Some(href) = parsed.manifest.get(idref) else {
continue; };
let entry = join_zip_path(&base, href);
let Ok(chapter_xhtml) = read_zip_entry(&mut archive, &entry, "epub") else {
continue;
};
let chapter_text = html_to_text(chapter_xhtml.as_bytes())?;
if !chapter_text.trim().is_empty() {
if chapters > 0 {
text.push('\n');
}
text.push_str(&chapter_text);
text.push('\n');
chapters += 1;
}
}
let mut out = Extracted::new(text, Format::Epub);
out.put_num("chapters", chapters);
if let Some(title) = parsed.title {
out.put_str("title", title);
}
Ok(out)
}
/// Finds the OPF package path declared in an EPUB `container.xml`.
///
/// Returns the `full-path` attribute of the first `rootfile` element that has
/// one.
///
/// # Errors
///
/// Returns an `"epub"` `ExtractError::Parse` when the XML is malformed or no
/// `rootfile` element carries a `full-path` attribute.
fn epub_opf_path(container_xml: &str) -> Result<String> {
    use quick_xml::events::Event;
    use quick_xml::reader::Reader;

    let mut reader = Reader::from_str(container_xml);
    let mut scratch = Vec::new();
    loop {
        match reader.read_event_into(&mut scratch) {
            Err(e) => {
                return Err(ExtractError::Parse {
                    format: "epub",
                    message: format!("container.xml: {e}"),
                });
            }
            // Reached the end without finding a usable rootfile.
            Ok(Event::Eof) => {
                return Err(ExtractError::Parse {
                    format: "epub",
                    message: "container.xml has no <rootfile full-path>".to_string(),
                });
            }
            Ok(Event::Start(elem) | Event::Empty(elem)) => {
                let is_rootfile = local_name(elem.name().as_ref()) == b"rootfile";
                if is_rootfile {
                    if let Some(full_path) = attr_value(&elem, b"full-path") {
                        return Ok(full_path);
                    }
                }
            }
            Ok(_) => {}
        }
        scratch.clear();
    }
}
/// The pieces of an EPUB OPF package document collected by `parse_opf`.
struct OpfParsed {
/// Manifest items: `id` attribute -> `href` attribute.
manifest: BTreeMap<String, String>,
/// `idref` attributes of the `itemref` elements, in document order.
spine: Vec<String>,
/// First non-empty text found inside a `title` element, if any.
title: Option<String>,
}
/// Parses an EPUB OPF package document.
///
/// Collects, matching elements by local name:
/// * `item` elements with both `id` and `href` into the manifest map;
/// * `idref` attributes of `itemref` elements, in order, into the spine;
/// * the first non-empty (trimmed) text inside a `title` element as the title.
///
/// Only an *opening* `title` tag starts title capture. A self-closing
/// `<title/>` has no matching end tag, so treating it as an opening tag would
/// leave capture switched on and pick up the next unrelated text node.
///
/// # Errors
///
/// Returns an `"epub"` `ExtractError::Parse` when the XML reader reports an
/// error.
fn parse_opf(opf_xml: &str) -> Result<OpfParsed> {
    use quick_xml::events::Event;
    use quick_xml::reader::Reader;

    let mut reader = Reader::from_str(opf_xml);
    let mut buf = Vec::new();
    let mut manifest = BTreeMap::new();
    let mut spine = Vec::new();
    let mut title: Option<String> = None;
    let mut in_title = false;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(e)) => {
                if local_name(e.name().as_ref()) == b"title" {
                    in_title = true;
                } else {
                    record_opf_reference(&e, &mut manifest, &mut spine);
                }
            }
            // Self-closing elements can carry manifest/spine attributes but
            // never enclose text, so they must not toggle `in_title`.
            Ok(Event::Empty(e)) => record_opf_reference(&e, &mut manifest, &mut spine),
            Ok(Event::End(e)) => {
                if local_name(e.name().as_ref()) == b"title" {
                    in_title = false;
                }
            }
            Ok(Event::Text(t)) => {
                if in_title && title.is_none() {
                    let s = String::from_utf8_lossy(&t.into_inner()).trim().to_string();
                    if !s.is_empty() {
                        title = Some(s);
                    }
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(ExtractError::Parse {
                    format: "epub",
                    message: format!("OPF: {e}"),
                })
            }
            _ => {}
        }
        buf.clear();
    }
    Ok(OpfParsed {
        manifest,
        spine,
        title,
    })
}

/// Records a manifest `item` (id -> href) or a spine `itemref` (idref) from
/// `elem`; elements with any other local name, or missing the needed
/// attributes, are ignored.
fn record_opf_reference(
    elem: &quick_xml::events::BytesStart<'_>,
    manifest: &mut BTreeMap<String, String>,
    spine: &mut Vec<String>,
) {
    match local_name(elem.name().as_ref()) {
        b"item" => {
            if let (Some(id), Some(href)) = (attr_value(elem, b"id"), attr_value(elem, b"href")) {
                manifest.insert(id, href);
            }
        }
        b"itemref" => {
            if let Some(idref) = attr_value(elem, b"idref") {
                spine.push(idref);
            }
        }
        _ => {}
    }
}
/// Returns the directory part of `opf_path` (everything before the last `/`),
/// or an empty string when the path contains no `/`.
fn opf_base_dir(opf_path: &str) -> String {
    opf_path
        .rsplit_once('/')
        .map(|(dir, _file)| dir.to_string())
        .unwrap_or_default()
}
/// Resolves `href` relative to the zip directory `base` and returns the
/// resulting archive entry name.
///
/// The combined path is normalized segment by segment: empty and `.` segments
/// are dropped and `..` removes the preceding segment (never climbing above
/// the archive root). This lets hrefs such as `../text/ch1.xhtml` or
/// `./ch1.xhtml` resolve to the name the entry is actually stored under,
/// instead of producing a name containing dot segments that no entry matches.
fn join_zip_path(base: &str, href: &str) -> String {
    let mut segments: Vec<&str> = Vec::new();
    for segment in base.split('/').chain(href.split('/')) {
        match segment {
            "" | "." => {}
            ".." => {
                // Popping an empty stack is a no-op, which clamps at the root.
                segments.pop();
            }
            name => segments.push(name),
        }
    }
    segments.join("/")
}
/// Extracts text from an HTML file by reading it whole and converting it with
/// `html_to_text`.
fn extract_html(path: &Path) -> Result<Extracted> {
    let raw_html = std::fs::read(path)?;
    html_to_text(&raw_html).map(|text| Extracted::new(text, Format::Html))
}
/// Renders HTML bytes to plain text with `html2text` and then strips leftover
/// markdown-style decorations via `strip_markdown_decorations`.
///
/// # Errors
///
/// Returns an `ExtractError::Parse` tagged `"html"` when rendering fails.
fn html_to_text(html: &[u8]) -> Result<String> {
    let renderer = html2text::config::plain_no_decorate();
    // NOTE(review): 10_000 is presumably the wrap width, chosen large so lines
    // are effectively not re-wrapped — confirm against the html2text API.
    match renderer.string_from_read(html, 10_000) {
        Ok(rendered) => Ok(strip_markdown_decorations(&rendered)),
        Err(e) => Err(ExtractError::Parse {
            format: "html",
            message: e.to_string(),
        }),
    }
}
/// Removes markdown-style decorations from rendered text, line by line.
///
/// A line whose first non-whitespace character is `#` loses its indentation,
/// the whole run of leading `#` characters, and the whitespace after them.
/// Other lines are kept as-is. Every line is then passed through
/// `unwrap_brackets` and terminated with `\n`.
fn strip_markdown_decorations(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for line in text.lines() {
        let cleaned = match line.trim_start().strip_prefix('#') {
            Some(after_first_hash) => after_first_hash.trim_start_matches('#').trim_start(),
            None => line,
        };
        out.push_str(&unwrap_brackets(cleaned));
        out.push('\n');
    }
    out
}
/// Removes `[` … `]` pairs from `line`, keeping the text between them.
///
/// Each `[` is paired with the next `]` after it. A `[` with no later `]` is
/// left in place together with the rest of the line.
fn unwrap_brackets(line: &str) -> String {
    let mut out = String::with_capacity(line.len());
    let mut rest = line;
    while let Some(open) = rest.find('[') {
        // Everything before the bracket is copied through unchanged.
        out.push_str(&rest[..open]);
        let after_open = &rest[open + 1..];
        match after_open.find(']') {
            Some(close) => {
                out.push_str(&after_open[..close]);
                rest = &after_open[close + 1..];
            }
            None => {
                // Unclosed: keep the bracket and the remainder verbatim.
                out.push_str(&rest[open..]);
                rest = "";
            }
        }
    }
    out.push_str(rest);
    out
}
/// Opens `reader` as a zip archive.
///
/// # Errors
///
/// Returns an `ExtractError::Parse` tagged with `format` when the data is not
/// a valid zip container.
fn open_zip<R>(reader: R, format: &'static str) -> Result<zip::ZipArchive<R>>
where
    R: Read + std::io::Seek,
{
    match zip::ZipArchive::new(reader) {
        Ok(archive) => Ok(archive),
        Err(e) => Err(ExtractError::Parse {
            format,
            message: format!("not a valid zip container: {e}"),
        }),
    }
}
/// Reads the zip entry called `name` fully and returns it as text, replacing
/// invalid UTF-8 sequences lossily.
///
/// # Errors
///
/// Returns an `ExtractError::Parse` tagged with `format` when the entry does
/// not exist or cannot be read.
fn read_zip_entry<R>(
    archive: &mut zip::ZipArchive<R>,
    name: &str,
    format: &'static str,
) -> Result<String>
where
    R: Read + std::io::Seek,
{
    let parse_error = |message: String| ExtractError::Parse { format, message };

    let mut entry = archive
        .by_name(name)
        .map_err(|e| parse_error(format!("missing zip entry {name:?}: {e}")))?;

    let mut raw = Vec::new();
    if let Err(e) = entry.read_to_end(&mut raw) {
        return Err(parse_error(format!("reading {name:?}: {e}")));
    }
    Ok(String::from_utf8_lossy(&raw).into_owned())
}
/// Returns the unescaped value of the first attribute on `elem` whose local
/// name (prefix ignored) equals `key` and whose value unescapes successfully.
///
/// Attributes that fail to parse are skipped.
fn attr_value(elem: &quick_xml::events::BytesStart<'_>, key: &[u8]) -> Option<String> {
    for attr in elem.attributes().flatten() {
        if local_name(attr.key.as_ref()) != key {
            continue;
        }
        #[allow(deprecated)]
        let unescaped = attr.unescape_value();
        if let Ok(value) = unescaped {
            return Some(value.into_owned());
        }
    }
    None
}
// Unit tests. The `extract_*` tests read fixture documents from the corpus
// directory resolved by `fixture`, and compare against `<fixture>.txt` files.
#[cfg(test)]
mod tests {
use super::*;
use std::path::PathBuf;
// Resolves a fixture file name inside the shared corpus directory, relative
// to this crate's manifest directory.
fn fixture(name: &str) -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("../../tests/corpora/corpus-c-formats/sources/docs")
.join(name)
}
// Reads the known-good text for a fixture, stored next to it as `<name>.txt`.
fn expected(name: &str) -> String {
std::fs::read_to_string(fixture(&format!("{name}.txt"))).unwrap()
}
// Collapses every run of whitespace to a single space so comparisons ignore layout.
fn tokens(s: &str) -> String {
s.split_whitespace().collect::<Vec<_>>().join(" ")
}
// Sorted list of non-empty, whitespace-normalized lines, for order-insensitive comparison.
fn line_set(s: &str) -> Vec<String> {
let mut v: Vec<String> = s.lines().map(tokens).filter(|l| !l.is_empty()).collect();
v.sort();
v
}
#[test]
fn detects_format_by_extension_case_insensitively() {
assert_eq!(Format::from_path(Path::new("a.pdf")), Some(Format::Pdf));
assert_eq!(Format::from_path(Path::new("a.PDF")), Some(Format::Pdf));
assert_eq!(Format::from_path(Path::new("a.docx")), Some(Format::Docx));
assert_eq!(
Format::from_path(Path::new("a.xlsx")),
Some(Format::Spreadsheet)
);
assert_eq!(
Format::from_path(Path::new("a.ods")),
Some(Format::Spreadsheet)
);
assert_eq!(Format::from_path(Path::new("a.epub")), Some(Format::Epub));
assert_eq!(Format::from_path(Path::new("a.html")), Some(Format::Html));
assert_eq!(Format::from_path(Path::new("a.htm")), Some(Format::Html));
assert_eq!(Format::from_path(Path::new("a.txt")), None);
assert_eq!(Format::from_path(Path::new("noext")), None);
}
// The extension check happens before any file access, so the path need not exist.
#[test]
fn unsupported_extension_is_typed_error() {
let err = extract(Path::new("/tmp/whatever.txt")).unwrap_err();
assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e == "txt"));
assert_eq!(err.code(), "UNSUPPORTED_FORMAT");
}
#[test]
fn missing_extension_is_unsupported() {
let err = extract(Path::new("/tmp/noext")).unwrap_err();
assert!(matches!(err, ExtractError::UnsupportedFormat(ref e) if e.is_empty()));
}
#[test]
fn normalize_collapses_blanks_and_trims() {
let raw = "\r\n\r\nHeading\r\n\r\n\r\n\r\nBody line \r\n\r\n";
assert_eq!(normalize_text(raw), "Heading\n\nBody line\n");
}
#[test]
fn normalize_empty_stays_empty() {
assert_eq!(normalize_text(""), "");
assert_eq!(normalize_text(" \n\n \n"), "");
}
#[test]
fn extract_text_pdf_matches_known_good() {
let got = extract(&fixture("text.pdf")).unwrap();
assert_eq!(got.metadata["format"], MetaValue::Str("pdf".into()));
assert_eq!(got.metadata["pages"], MetaValue::Num(1));
assert_eq!(tokens(&got.text), tokens(&expected("text.pdf")));
}
#[test]
fn extract_weird_fonts_pdf_matches_known_good() {
let got = extract(&fixture("weird-fonts.pdf")).unwrap();
assert_eq!(tokens(&got.text), tokens(&expected("weird-fonts.pdf")));
}
// Compared as a sorted set of lines, so the order in which lines are emitted does not matter.
#[test]
fn extract_multi_column_pdf_matches_content_order_agnostic() {
let got = extract(&fixture("multi-column.pdf")).unwrap();
assert_eq!(line_set(&got.text), line_set(&expected("multi-column.pdf")));
}
#[test]
fn extract_image_only_pdf_yields_empty() {
let got = extract(&fixture("image-only.pdf")).unwrap();
assert_eq!(got.text, "");
assert!(expected("image-only.pdf").trim().is_empty());
}
#[test]
fn extract_encrypted_pdf_without_password_refuses_cleanly() {
let err = extract(&fixture("encrypted.pdf")).unwrap_err();
assert!(
matches!(err, ExtractError::Encrypted(_)),
"expected Encrypted, got {err:?}"
);
assert_eq!(err.code(), "DOCUMENT_ENCRYPTED");
}
#[test]
fn extract_docx_matches_known_good() {
let got = extract(&fixture("sample.docx")).unwrap();
assert_eq!(got.metadata["format"], MetaValue::Str("docx".into()));
assert_eq!(tokens(&got.text), tokens(&expected("sample.docx")));
}
// Spreadsheet text is compared exactly (apart from trailing whitespace), not token-wise.
#[test]
fn extract_xlsx_matches_known_good() {
let got = extract(&fixture("sample.xlsx")).unwrap();
assert_eq!(got.metadata["format"], MetaValue::Str("spreadsheet".into()));
assert_eq!(got.metadata["sheets"], MetaValue::Num(1));
assert_eq!(
got.metadata["sheet_names"],
MetaValue::Str("Expenses".into())
);
assert_eq!(got.text.trim_end(), expected("sample.xlsx").trim_end());
}
#[test]
fn extract_epub_matches_known_good() {
let got = extract(&fixture("sample.epub")).unwrap();
assert_eq!(got.metadata["format"], MetaValue::Str("epub".into()));
assert_eq!(got.metadata["chapters"], MetaValue::Num(1));
assert_eq!(
got.metadata["title"],
MetaValue::Str("Operations Playbook".into())
);
assert_eq!(tokens(&got.text), tokens(&expected("sample.epub")));
}
#[test]
fn extract_html_matches_known_good() {
let got = extract(&fixture("sample.html")).unwrap();
assert_eq!(got.metadata["format"], MetaValue::Str("html".into()));
assert_eq!(tokens(&got.text), tokens(&expected("sample.html")));
}
// An unclosed `[` must be left untouched.
#[test]
fn unwrap_brackets_flattens_link_text() {
assert_eq!(
unwrap_brackets("contact [ops@acme.example] or the [handbook]."),
"contact ops@acme.example or the handbook."
);
assert_eq!(unwrap_brackets("a [b c"), "a [b c");
assert_eq!(unwrap_brackets("plain text"), "plain text");
}
// Only heading hashes are removed; list markers stay as they are.
#[test]
fn strip_markdown_decorations_drops_heading_hashes() {
let input = "# Title\n## Section\n* bullet\n1. ordered\nplain\n";
let out = strip_markdown_decorations(input);
assert_eq!(out, "Title\nSection\n* bullet\n1. ordered\nplain\n");
}
#[test]
fn local_name_strips_prefix() {
assert_eq!(local_name(b"w:t"), b"t");
assert_eq!(local_name(b"t"), b"t");
assert_eq!(local_name(b"dc:title"), b"title");
}
// Checks the JSON shape: untagged metadata values appear as bare strings and numbers.
#[test]
fn extracted_serializes_to_text_metadata_json() {
let got = extract(&fixture("sample.xlsx")).unwrap();
let json = serde_json::to_value(&got).unwrap();
assert!(json.get("text").is_some());
assert_eq!(json["metadata"]["format"], "spreadsheet");
assert_eq!(json["metadata"]["sheets"], 1);
assert!(json["metadata"]["sheets"].is_number());
assert!(json["metadata"]["format"].is_string());
}
}