#![cfg(feature = "ocr-tesseract")]
use std::path::{Path, PathBuf};
use gaze::Manifest;
use gaze_document::{
AgentBundleDir, BundleLayoutInvalidReason, BundleReport, DocumentError, OcrSource,
OwnerBundleDir, BUNDLE_VERSION,
};
use gaze_types::PiiClass;
/// Full email address that `assert_clean_bundle` requires to be absent from `clean.md`.
const ORIGINAL_EMAIL: &str = "jane.doe@example.invalid";
/// Domain part of the email, checked on its own; the assertion message in
/// `assert_clean_bundle` labels a hit here as "OCR-artifact survival".
const ORIGINAL_EMAIL_DOMAIN: &str = "@example.invalid";
/// Local part of the email, checked on its own so a partial leak is still caught.
const ORIGINAL_EMAIL_LOCAL: &str = "jane.doe";
/// Recipient name that must not appear in `clean.md`.
const ORIGINAL_NAME: &str = "Jane Doe";
/// Trailing digits of the phone number that must not appear in `clean.md`.
// NOTE(review): presumably these values are what the synthetic fixtures render — confirm against testdata.
const ORIGINAL_PHONE_TAIL: &str = "555-0142";
/// Returns the `testdata` directory located directly under the crate root,
/// where the crate root is the compile-time value of `CARGO_MANIFEST_DIR`.
fn testdata_dir() -> PathBuf {
    let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    dir.push("testdata");
    dir
}
/// Maps an error caused by a missing external tool to a human-readable skip
/// reason.
///
/// Returns `Some(reason)` for `TesseractNotFound` and `PdfiumNotFound`, and
/// `None` for every other error so callers can treat it as a real failure.
fn skip_reason(err: &DocumentError) -> Option<&'static str> {
    let why = match err {
        DocumentError::TesseractNotFound(_) => "tesseract not installed",
        DocumentError::PdfiumNotFound(_) => "pdfium dynamic library not installed",
        _ => return None,
    };
    Some(why)
}
/// Runs `gaze_document::clean` on `input` into fresh temporary `agent` and
/// `owner` directories and asserts that the emitted bundle is complete and
/// does not leak the original PII substrings.
///
/// `expect_pdf_fields` selects the report expectations: `true` requires the
/// PDF-specific fields and a `VectorPdf` page source, `false` requires a
/// `png`/`jpeg` input kind and an `Ocr` page source.
///
/// If `clean` fails with an error recognized by `skip_reason`, a `SKIP:` line
/// is printed to stderr and the function returns without asserting anything.
///
/// # Panics
///
/// Panics on any other `clean` failure and on any violated expectation.
fn assert_clean_bundle(input: &Path, expect_pdf_fields: bool) {
    let tmp = tempfile::tempdir().expect("tempdir");
    let agent_dir = tmp.path().join("agent");
    let owner_dir = tmp.path().join("owner");
    let bundle = match gaze_document::clean(
        input,
        AgentBundleDir::new(&agent_dir).expect("agent dir"),
        OwnerBundleDir::new(&owner_dir).expect("owner dir"),
    ) {
        Ok(bundle) => bundle,
        Err(err) => match skip_reason(&err) {
            Some(why) => {
                eprintln!("SKIP: {why}: {err}");
                return;
            }
            None => panic!("gaze_document::clean failed: {err}"),
        },
    };

    // Bundle layout: clean.md and report.json go to the agent, manifest.json
    // to the owner only.
    let clean_md_path = agent_dir.join("clean.md");
    let manifest_path = owner_dir.join("manifest.json");
    let report_path = agent_dir.join("report.json");
    assert!(clean_md_path.exists(), "clean.md missing");
    assert!(manifest_path.exists(), "manifest.json missing");
    assert!(report_path.exists(), "report.json missing");
    assert!(
        !agent_dir.join("manifest.json").exists(),
        "agent dir must not contain restorable manifest material"
    );

    let clean_text = std::fs::read_to_string(&clean_md_path).expect("clean.md readable");
    assert_no_original_pii(&clean_text);
    assert_tokens_present(&clean_text);

    let manifest_bytes = std::fs::read(&manifest_path).expect("manifest bytes");
    let manifest: Manifest =
        serde_json::from_slice(&manifest_bytes).expect("manifest deserializes");
    assert_manifest_spans(&manifest);

    let report_bytes = std::fs::read(&report_path).expect("report bytes");
    let report: BundleReport =
        serde_json::from_slice(&report_bytes).expect("report deserializes");
    assert_report_matches(&report, expect_pdf_fields);

    // The in-memory bundle returned by `clean` must agree with what was
    // written to disk.
    assert_eq!(bundle.report.bundle_version, BUNDLE_VERSION);
    assert_eq!(bundle.report.pii_token_count, report.pii_token_count);
    assert_eq!(
        bundle.agent_out_dir,
        std::fs::canonicalize(&agent_dir).expect("agent dir canonicalizes")
    );
    assert_eq!(
        bundle.owner_out_dir,
        std::fs::canonicalize(&owner_dir).expect("owner dir canonicalizes")
    );
}

/// Asserts that none of the original PII substrings survive in `clean_text`.
fn assert_no_original_pii(clean_text: &str) {
    let forbidden = [
        (
            ORIGINAL_EMAIL,
            "clean.md leaked the original email substring",
        ),
        (
            ORIGINAL_EMAIL_DOMAIN,
            "clean.md leaked the email domain substring (OCR-artifact survival)",
        ),
        (
            ORIGINAL_EMAIL_LOCAL,
            "clean.md leaked the email local-part substring",
        ),
        (
            ORIGINAL_PHONE_TAIL,
            "clean.md leaked the original phone tail substring",
        ),
        (
            ORIGINAL_NAME,
            "clean.md leaked the original recipient name substring",
        ),
    ];
    for (needle, complaint) in forbidden {
        assert!(!clean_text.contains(needle), "{complaint}:\n{clean_text}");
    }
}

/// Asserts that `clean_text` contains a token marker for each expected class.
fn assert_tokens_present(clean_text: &str) {
    let expected = [
        (":Email_", "Email"),
        (":Custom:phone_", "phone"),
        (":Name_", "Name"),
    ];
    for (marker, kind) in expected {
        assert!(
            clean_text.contains(marker),
            "expected a tokenized {kind} reference in clean.md, got:\n{clean_text}"
        );
    }
}

/// Asserts that the owner manifest records an Email, a Custom(phone) and a Name span.
fn assert_manifest_spans(manifest: &Manifest) {
    assert!(
        manifest
            .spans
            .iter()
            .any(|s| matches!(s.class, PiiClass::Email)),
        "manifest missing Email span"
    );
    assert!(
        manifest
            .spans
            .iter()
            .any(|s| matches!(&s.class, PiiClass::Custom(name) if name == "phone")),
        "manifest missing Custom(phone) span"
    );
    assert!(
        manifest
            .spans
            .iter()
            .any(|s| matches!(s.class, PiiClass::Name)),
        "manifest missing Name span"
    );
}

/// Asserts the on-disk report's general fields and the input-kind-specific ones.
fn assert_report_matches(report: &BundleReport, expect_pdf_fields: bool) {
    assert_eq!(report.bundle_version, BUNDLE_VERSION);
    assert!(
        report.pii_token_count >= 2,
        "expected at least 2 PII tokens, report counted {}",
        report.pii_token_count
    );
    assert!(
        report.clean_char_count > 0,
        "report.clean_char_count must be positive"
    );
    assert_eq!(report.low_confidence_threshold, 0.65);
    // Length is checked before indexing so a wrong page count fails with a
    // readable message instead of an out-of-bounds panic.
    assert_eq!(report.pages.len(), 1);
    let page = &report.pages[0];
    assert_eq!(page.page_index, 0);
    assert!(page.column_count >= 1);

    if page.ocr_source == OcrSource::Ocr {
        assert!(report.ocr_word_count > 0, "OCR returned zero words");
        assert_eq!(page.ocr_backend.as_deref(), Some("tesseract"));
    } else {
        assert_eq!(page.ocr_source, OcrSource::VectorPdf);
        assert!(page.confidence.is_none());
        assert!(!page.low_confidence);
    }

    if expect_pdf_fields {
        assert_eq!(report.input_kind, "pdf");
        assert_eq!(report.pdf_page_count, Some(1));
        assert_eq!(report.pdf_page_index, Some(0));
        assert_eq!(page.ocr_source, OcrSource::VectorPdf);
    } else {
        assert!(matches!(report.input_kind.as_str(), "png" | "jpeg"));
        assert!(report.pdf_page_count.is_none());
        assert_eq!(page.ocr_source, OcrSource::Ocr);
    }
}
/// Cleans the PNG fixture and applies the image (non-PDF) bundle expectations.
#[test]
fn synthetic_image_png_clean_emits_safe_bundle() {
    let fixture = testdata_dir().join("synthetic_image.png");
    assert!(fixture.exists(), "missing fixture: {}", fixture.display());
    let expect_pdf_fields = false;
    assert_clean_bundle(&fixture, expect_pdf_fields);
}
/// Cleans the PDF fixture and applies the PDF bundle expectations.
/// Only compiled when the `pdf-input` feature is enabled.
#[cfg(feature = "pdf-input")]
#[test]
fn synthetic_doc_pdf_clean_emits_safe_bundle() {
    let fixture = testdata_dir().join("synthetic_doc.pdf");
    assert!(fixture.exists(), "missing fixture: {}", fixture.display());
    let expect_pdf_fields = true;
    assert_clean_bundle(&fixture, expect_pdf_fields);
}
/// Using one directory as both the agent and the owner bundle location must
/// be rejected with `BundleLayoutInvalidReason::AgentEqualsOwner`.
#[test]
fn bundle_writer_rejects_equal_agent_and_owner_paths() {
    let fixture = testdata_dir().join("synthetic_image.png");
    assert!(fixture.exists(), "missing fixture: {}", fixture.display());

    let scratch = tempfile::tempdir().expect("tempdir");
    let shared = scratch.path();
    let agent = AgentBundleDir::new(shared).expect("agent dir");
    let owner = OwnerBundleDir::new(shared).expect("owner dir");

    let outcome = gaze_document::clean(&fixture, agent, owner);
    let err = outcome.expect_err("equal bundle paths must be rejected");
    assert!(matches!(
        err,
        DocumentError::BundleLayoutInvalid {
            reason: BundleLayoutInvalidReason::AgentEqualsOwner
        }
    ));
}
/// Placing the owner directory inside the agent directory must be rejected
/// with `BundleLayoutInvalidReason::OwnerNestedInAgent`.
#[test]
fn bundle_writer_rejects_nested_paths() {
    let fixture = testdata_dir().join("synthetic_image.png");
    assert!(fixture.exists(), "missing fixture: {}", fixture.display());

    let scratch = tempfile::tempdir().expect("tempdir");
    let agent_path = scratch.path().join("agent");
    // The owner path is deliberately a child of the agent path.
    let owner_path = agent_path.join("owner");
    let agent = AgentBundleDir::new(agent_path).expect("agent dir");
    let owner = OwnerBundleDir::new(owner_path).expect("owner dir");

    let outcome = gaze_document::clean(&fixture, agent, owner);
    let err = outcome.expect_err("nested bundle paths must be rejected");
    assert!(matches!(
        err,
        DocumentError::BundleLayoutInvalid {
            reason: BundleLayoutInvalidReason::OwnerNestedInAgent
        }
    ));
}
/// After a successful clean, the agent directory must hold exactly
/// `clean.md` and `report.json` and, in particular, no `manifest.json`.
///
/// Returns early with a `SKIP:` line on stderr when `skip_reason` recognizes
/// the failure from `gaze_document::clean`.
#[test]
fn agent_dir_contains_no_manifest_material() {
    let input = testdata_dir().join("synthetic_image.png");
    assert!(input.exists(), "missing fixture: {}", input.display());
    let tmp = tempfile::tempdir().expect("tempdir");
    let agent_dir = tmp.path().join("agent");
    let owner_dir = tmp.path().join("owner");
    if let Err(err) = gaze_document::clean(
        &input,
        AgentBundleDir::new(&agent_dir).expect("agent dir"),
        OwnerBundleDir::new(&owner_dir).expect("owner dir"),
    ) {
        match skip_reason(&err) {
            Some(why) => {
                eprintln!("SKIP: {why}: {err}");
                return;
            }
            None => panic!("gaze_document::clean failed: {err}"),
        }
    }
    let names = dir_entry_names(&agent_dir);
    // Check the manifest invariant first: placed after the exact-listing
    // comparison below it could never fire, so a leak would only be reported
    // as a generic listing mismatch.
    assert!(
        !names.iter().any(|name| name == "manifest.json"),
        "agent dir must not contain restorable manifest material, found: {names:?}"
    );
    assert_eq!(
        names,
        vec!["clean.md".to_string(), "report.json".to_string()]
    );
}
/// After a successful clean, the owner directory must hold `manifest.json`
/// and nothing else.
///
/// Returns early with a `SKIP:` line on stderr when `skip_reason` recognizes
/// the failure from `gaze_document::clean`.
#[test]
fn owner_dir_contains_only_manifest() {
    let fixture = testdata_dir().join("synthetic_image.png");
    assert!(fixture.exists(), "missing fixture: {}", fixture.display());

    let scratch = tempfile::tempdir().expect("tempdir");
    let agent_dir = scratch.path().join("agent");
    let owner_dir = scratch.path().join("owner");

    if let Err(err) = gaze_document::clean(
        &fixture,
        AgentBundleDir::new(&agent_dir).expect("agent dir"),
        OwnerBundleDir::new(&owner_dir).expect("owner dir"),
    ) {
        match skip_reason(&err) {
            Some(why) => {
                eprintln!("SKIP: {why}: {err}");
                return;
            }
            None => panic!("gaze_document::clean failed: {err}"),
        }
    }

    let expected = vec![String::from("manifest.json")];
    assert_eq!(dir_entry_names(&owner_dir), expected);
}
/// Lists the entry names directly inside `path`, sorted ascending.
///
/// Names are converted lossily to `String`, so non-UTF-8 bytes are replaced
/// rather than rejected.
///
/// # Panics
///
/// Panics if the directory cannot be read or an entry cannot be inspected.
fn dir_entry_names(path: &Path) -> Vec<String> {
    let listing = std::fs::read_dir(path).expect("read dir");
    let mut names = Vec::new();
    for entry in listing {
        let file_name = entry.expect("dir entry").file_name();
        names.push(file_name.to_string_lossy().into_owned());
    }
    // `read_dir` does not promise an order, so sort for stable comparisons.
    names.sort();
    names
}