mod common;
use std::path::PathBuf;
use common::{corpora_dir, dbmd};
/// How a fixture's extracted text is judged against its known-good `.txt`.
///
/// The exact semantics of each mode live in `account_for_fixture`.
#[derive(Clone, Copy, Debug)]
enum Compare {
    /// Both sides are collapsed to single-space-separated tokens and compared.
    Tokens,
    /// Both sides are reduced to sorted, token-normalized, non-empty lines,
    /// so line order does not matter.
    LineSet,
    /// Both sides are compared verbatim apart from trailing whitespace.
    Exact,
    /// Both the extracted output and the known-good file must be blank.
    Empty,
    /// Extraction must fail and report the `DOCUMENT_ENCRYPTED` error code.
    RefusesEncrypted,
}
/// One declared corpus-c document and the rule used to verify its extraction.
struct Fixture {
    /// File name of the binary document, relative to the docs directory.
    /// The known-good sibling is looked up as `<file>.txt` next to it.
    file: &'static str,
    /// Comparison mode applied to the extracted text.
    compare: Compare,
}
/// Every fixture this test knows about, paired with its comparison mode.
///
/// A fixture listed here whose binary is absent on disk is reported as
/// skipped by the test rather than failing it.
const FIXTURES: &[Fixture] = &[
    // HTML
    Fixture {
        file: "sample.html",
        compare: Compare::Tokens,
    },
    // PDF variants
    Fixture {
        file: "text.pdf",
        compare: Compare::Tokens,
    },
    Fixture {
        file: "multi-column.pdf",
        compare: Compare::LineSet,
    },
    Fixture {
        file: "weird-fonts.pdf",
        compare: Compare::Tokens,
    },
    Fixture {
        file: "image-only.pdf",
        compare: Compare::Empty,
    },
    Fixture {
        file: "encrypted.pdf",
        compare: Compare::RefusesEncrypted,
    },
    // Office / e-book containers
    Fixture {
        file: "sample.docx",
        compare: Compare::Tokens,
    },
    Fixture {
        file: "sample.xlsx",
        compare: Compare::Exact,
    },
    Fixture {
        file: "sample.epub",
        compare: Compare::Tokens,
    },
];
/// Returns `<corpora>/corpus-c-formats/sources/docs`, the directory holding
/// both the binary fixtures and their known-good `.txt` siblings.
fn docs_dir() -> PathBuf {
    let mut dir = corpora_dir().join("corpus-c-formats");
    dir.push("sources");
    dir.push("docs");
    dir
}
/// Collapses `s` to its whitespace-separated words joined by single spaces.
///
/// Leading/trailing whitespace disappears and every internal run of
/// whitespace (including newlines) becomes exactly one space.
fn tokens(s: &str) -> String {
    let mut normalized = String::new();
    for word in s.split_whitespace() {
        // Words from `split_whitespace` are never empty, so a non-empty
        // accumulator means at least one word precedes this one.
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        normalized.push_str(word);
    }
    normalized
}
fn line_set(s: &str) -> Vec<String> {
let mut v: Vec<String> = s.lines().map(tokens).filter(|l| !l.is_empty()).collect();
v.sort();
v
}
/// Result of accounting for a single declared fixture.
enum Outcome {
    /// The fixture was present and all of its assertions passed.
    Checked,
    /// The binary fixture was not on disk; carries a human-readable reason.
    SkippedMissing(String),
}
/// Runs extraction over every declared corpus-c fixture and compares the
/// result with its known-good text.
///
/// Fixtures whose binary is absent are logged and listed as skipped instead
/// of failing; the test only fails outright when not a single fixture could
/// be checked (or when a present fixture fails its own assertions).
#[test]
fn extract_end_to_end_over_corpus_c_fixtures() {
    let root = docs_dir();
    let mut skipped: Vec<String> = Vec::new();
    let mut checked = 0usize;

    for fx in FIXTURES.iter() {
        let source = root.join(fx.file);
        let known_good = root.join(format!("{}.txt", fx.file));

        match account_for_fixture(fx, &source, &known_good) {
            Outcome::SkippedMissing(reason) => {
                eprintln!("[corpus-c E2E] SKIP (not created): {} — {reason}", fx.file);
                skipped.push(format!("{} ({reason})", fx.file));
            }
            Outcome::Checked => {
                checked += 1;
                eprintln!("[corpus-c E2E] checked: {} ({:?})", fx.file, fx.compare);
            }
        }
    }

    let skipped_count = skipped.len();
    let declared_count = FIXTURES.len();
    eprintln!(
        "[corpus-c E2E] summary: {checked} checked, {} skipped (of {} declared)",
        skipped_count,
        declared_count
    );
    if !skipped.is_empty() {
        eprintln!("[corpus-c E2E] skipped fixtures: {}", skipped.join("; "));
    }

    // A run that silently checks nothing would be a false green.
    assert!(
        checked > 0,
        "no corpus-c fixtures were present to extract — corpus-c-formats/sources/docs is empty or missing"
    );
}
/// Verifies one fixture, or reports it as skipped when its binary is absent.
///
/// * `fx`  – the declared fixture (name and comparison mode).
/// * `bin` – path to the binary document to extract.
/// * `txt` – path to the known-good text sibling.
///
/// Returns [`Outcome::SkippedMissing`] if `bin` does not exist, otherwise
/// [`Outcome::Checked`] once every assertion for the fixture's mode passed.
///
/// # Panics
///
/// Panics when `bin` exists but `txt` does not, or when the extracted output
/// does not satisfy the fixture's comparison mode.
fn account_for_fixture(fx: &Fixture, bin: &std::path::Path, txt: &std::path::Path) -> Outcome {
    let name = fx.file;

    // An absent binary is a skip, not a failure.
    if !bin.exists() {
        let reason = format!("binary fixture absent at {}", bin.display());
        return Outcome::SkippedMissing(reason);
    }

    // Once the binary is present, its known-good sibling is mandatory — even
    // for the encrypted case, which never reads the sibling's contents.
    assert!(
        txt.exists(),
        "fixture {} exists but its known-good sibling {} is missing — a present fixture must have its .txt",
        name,
        txt.display()
    );

    match fx.compare {
        Compare::Tokens => {
            let actual = tokens(&run_extract_ok(bin));
            let wanted = tokens(&read_known_good(txt));
            assert_eq!(
                actual,
                wanted,
                "{}: token-normalized text differs from known-good",
                name
            );
        }
        Compare::LineSet => {
            let actual = line_set(&run_extract_ok(bin));
            let wanted = line_set(&read_known_good(txt));
            assert_eq!(
                actual,
                wanted,
                "{}: token-normalized line SET differs from known-good (order-agnostic)",
                name
            );
        }
        Compare::Exact => {
            let actual = run_extract_ok(bin);
            let wanted = read_known_good(txt);
            // Only trailing whitespace is forgiven.
            assert_eq!(
                actual.trim_end(),
                wanted.trim_end(),
                "{}: extracted text is not byte-exact with known-good",
                name
            );
        }
        Compare::Empty => {
            // Check the extractor first, then confirm the known-good file
            // itself records "nothing".
            let got = run_extract_ok(bin);
            assert!(
                got.trim().is_empty(),
                "{}: expected empty output (no text layer), got: {got:?}",
                name
            );
            let wanted = read_known_good(txt);
            assert!(
                wanted.trim().is_empty(),
                "{}: known-good .txt should be empty for an image-only PDF",
                name
            );
        }
        Compare::RefusesEncrypted => assert_refuses_encrypted(bin),
    }

    Outcome::Checked
}
/// Runs `dbmd extract <bin>`, requires it to succeed, and returns its stdout.
///
/// # Panics
///
/// Panics if the command does not succeed or if stdout is not valid UTF-8.
fn run_extract_ok(bin: &std::path::Path) -> String {
    let assertion = dbmd().arg("extract").arg(bin).assert().success();
    let stdout_bytes = assertion.get_output().stdout.clone();
    String::from_utf8(stdout_bytes).expect("utf-8 stdout")
}
fn assert_refuses_encrypted(bin: &std::path::Path) {
let out = dbmd().arg("extract").arg(bin).assert().failure().code(1);
let stdout = String::from_utf8(out.get_output().stdout.clone()).unwrap();
assert!(
stdout.is_empty(),
"encrypted doc must emit nothing to stdout, got: {stdout:?}"
);
let out = dbmd()
.arg("--json")
.arg("extract")
.arg(bin)
.assert()
.failure()
.code(1);
let stderr = String::from_utf8(out.get_output().stderr.clone()).unwrap();
let parsed: serde_json::Value =
serde_json::from_str(stderr.trim()).expect("JSON error object on stderr");
assert_eq!(
parsed["error"]["code"], "DOCUMENT_ENCRYPTED",
"encrypted doc must report the DOCUMENT_ENCRYPTED code"
);
}
/// Loads the known-good text at `txt` as a UTF-8 string.
///
/// # Panics
///
/// Panics if the file cannot be read as UTF-8 text.
fn read_known_good(txt: &std::path::Path) -> String {
    let contents = std::fs::read_to_string(txt);
    contents.expect("known-good .txt is readable")
}