use std::path::PathBuf;
use gaze::Manifest;
use serde::{Deserialize, Serialize};
use crate::ocr::OcrResult;
#[cfg(feature = "ocr-tesseract")]
use std::collections::BTreeMap;
#[cfg(feature = "ocr-tesseract")]
use std::fs;
#[cfg(feature = "ocr-tesseract")]
use std::path::Path;
#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
use gaze::{
Action, ClassRule, CleanDocument, DefaultRule, LocaleTag, Pipeline, RawDocument, Scope, Session,
};
#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
use gaze_recognizers::{
AnchoredBoundary, AnchoredMatchRecognizer, CuePosition, NameShape, RegexDetector,
};
#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
use gaze_types::{EmittedTokenSpan, PiiClass};
#[cfg(feature = "ocr-tesseract")]
use crate::extract::InputKind;
#[cfg(feature = "ocr-tesseract")]
use crate::DocumentError;
/// Bundle format version; stamped into every `BundleReport` as `bundle_version`.
pub const BUNDLE_VERSION: u32 = 1;
/// File name of the redacted markdown document written into the bundle directory.
pub const CLEAN_MARKDOWN_FILE: &str = "clean.md";
/// File name of the pretty-printed JSON manifest written into the bundle directory.
pub const MANIFEST_FILE: &str = "manifest.json";
/// File name of the pretty-printed JSON report written into the bundle directory.
pub const REPORT_FILE: &str = "report.json";
/// In-memory result of a document cleaning run.
///
/// `clean` returns one of these after it has written the bundle files to disk.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct SafeBundle {
/// Markdown document holding the redacted text (in `clean`: the output of
/// `format_clean_markdown`).
pub clean_markdown: String,
/// Manifest built from the token spans emitted by the redaction pipeline.
pub manifest: Manifest,
/// Page layout summary for the bundle.
pub layout: LayoutSummary,
/// Optional PNG preview bytes; `clean` currently always passes `None`.
pub preview_png: Option<Vec<u8>>,
/// Statistics describing the OCR and redaction run.
pub report: BundleReport,
/// Path of the source document; `clean` stores a best-effort absolute form of
/// its `input` argument.
pub source_path: PathBuf,
/// Directory the bundle was written to; `clean` stores a best-effort absolute
/// form of its `out_dir` argument.
pub out_dir: PathBuf,
}
impl SafeBundle {
pub fn new(
clean_markdown: String,
manifest: Manifest,
layout: LayoutSummary,
preview_png: Option<Vec<u8>>,
report: BundleReport,
source_path: PathBuf,
out_dir: PathBuf,
) -> Self {
Self {
clean_markdown,
manifest,
layout,
preview_png,
report,
source_path,
out_dir,
}
}
}
/// Number of PII tokens emitted for a single PII class.
#[non_exhaustive]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ClassCount {
/// Class label; `count_pii_by_class` fills this with the class's canonical
/// string (for example `"email"` or `"custom:phone"`).
pub class: String,
/// How many token spans of this class were counted.
pub count: u32,
}
impl ClassCount {
pub fn new(class: impl Into<String>, count: u32) -> Self {
Self {
class: class.into(),
count,
}
}
}
/// Serializable statistics about one cleaning run; written to `report.json`.
#[non_exhaustive]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BundleReport {
/// Bundle format version; `BundleReport::new` always sets this to `BUNDLE_VERSION`.
pub bundle_version: u32,
/// Label of the input kind; `clean` passes `"png"`, `"jpeg"` or `"pdf"`.
pub input_kind: String,
/// Mean OCR confidence copied from the `OcrResult`, when it provides one.
pub ocr_mean_confidence: Option<f32>,
/// Word count copied from the `OcrResult`.
pub ocr_word_count: usize,
/// Language identifier copied from the `OcrResult`.
pub ocr_lang: String,
/// Size of the cleaned text; `clean` passes the number of `char`s in the
/// redacted text, measured before the markdown header is added.
pub clean_char_count: usize,
/// Total number of PII tokens; `clean` passes the sum of all per-class counts.
pub pii_token_count: u32,
/// Per-class breakdown of the PII tokens.
pub pii_tokens_by_class: Vec<ClassCount>,
/// Page count of the source PDF; `None` for image inputs.
pub pdf_page_count: Option<i32>,
/// Index of the PDF page that was processed, as reported by the rasterizer;
/// `None` for image inputs.
pub pdf_page_index: Option<i32>,
}
impl BundleReport {
#[allow(clippy::too_many_arguments)]
pub fn new(
input_kind: impl Into<String>,
ocr: &OcrResult,
clean_char_count: usize,
pii_token_count: u32,
pii_tokens_by_class: Vec<ClassCount>,
pdf_page_count: Option<i32>,
pdf_page_index: Option<i32>,
) -> Self {
Self {
bundle_version: BUNDLE_VERSION,
input_kind: input_kind.into(),
ocr_mean_confidence: ocr.mean_confidence,
ocr_word_count: ocr.word_count,
ocr_lang: ocr.lang.clone(),
clean_char_count,
pii_token_count,
pii_tokens_by_class,
pdf_page_count,
pdf_page_index,
}
}
}
/// Minimal description of a bundle's page layout.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct LayoutSummary {
    /// Number of pages recorded for the bundle.
    pub page_count: u32,
}

impl LayoutSummary {
    /// Returns a summary describing exactly one page.
    pub fn single_page() -> Self {
        Self::new(1)
    }

    /// Returns a summary describing `page_count` pages; the value is stored as-is.
    pub fn new(page_count: u32) -> Self {
        LayoutSummary { page_count }
    }
}
/// Runs OCR on `input`, redacts detected PII, and writes a safe bundle to `out_dir`.
///
/// The steps are, in order:
/// 1. detect the input kind and create `out_dir` (including missing parents);
/// 2. run OCR via `run_ocr` and normalize the recognized text;
/// 3. run the pipeline from `build_document_pipeline` in an ephemeral-scope
///    session with the global locale;
/// 4. build the manifest and report, render the markdown, and write
///    `clean.md`, `manifest.json` and `report.json` into `out_dir`.
///
/// The returned [`SafeBundle`] carries best-effort absolute forms of `input`
/// and `out_dir`, a single-page layout, and no preview image.
///
/// # Errors
///
/// Propagates input-kind detection and OCR errors, returns
/// `DocumentError::OutputDir` when the output directory cannot be created,
/// `DocumentError::Pipeline` when the session, pipeline build or redaction
/// fails (or the pipeline yields a non-text document), and any error raised
/// while serializing or writing the bundle files.
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub fn clean(input: &Path, out_dir: &Path) -> Result<SafeBundle, DocumentError> {
    let kind = InputKind::detect(input)?;
    let absolute_input = absolutize(input);
    let absolute_out = absolutize(out_dir);
    fs::create_dir_all(out_dir)
        .map_err(|err| DocumentError::OutputDir(absolute_out.clone(), err))?;

    let (ocr_result, pdf_page_count, pdf_page_index) = run_ocr(input, kind)?;
    let normalized_text = crate::ocr::normalize_ocr_artifacts(&ocr_result.text);

    let pipeline = build_document_pipeline()?;
    let session = Session::new(Scope::Ephemeral).map_err(|err| pipeline_err("session", err))?;
    let locale_chain = [LocaleTag::Global];
    // NOTE(review): the safety-net leak report is discarded here; confirm that
    // ignoring it is intentional.
    let (clean_doc, spans, _leak_report) = pipeline
        .clean_with_safety_net(&session, RawDocument::Text(normalized_text), &locale_chain)
        .map_err(|err| pipeline_err("redact", err))?;
    let clean_text = match clean_doc {
        CleanDocument::Text(text) => text,
        _ => {
            return Err(DocumentError::Pipeline(
                "pipeline returned non-text variant for text input".to_string(),
            ));
        }
    };

    // Tally first (borrowing the spans), then hand ownership to the manifest:
    // this avoids cloning the whole span vector.
    let counts = count_pii_by_class(&spans);
    let pii_token_count: u32 = counts.iter().map(|c| c.count).sum();
    let manifest = Manifest::from_spans(spans);

    let report = BundleReport::new(
        kind_label(kind),
        &ocr_result,
        clean_text.chars().count(),
        pii_token_count,
        counts,
        pdf_page_count,
        pdf_page_index,
    );
    let clean_markdown = format_clean_markdown(&clean_text, kind);
    write_bundle(out_dir, &clean_markdown, &manifest, &report)?;

    Ok(SafeBundle::new(
        clean_markdown,
        manifest,
        LayoutSummary::single_page(),
        None,
        report,
        absolute_input,
        absolute_out,
    ))
}
/// Runs Tesseract OCR on `input` according to its detected `kind`.
///
/// Returns the OCR result together with the PDF page count and page index.
/// Both are `None` for PNG/JPEG inputs, which are read straight from the file.
/// PDF inputs are rasterized with `rasterize_first_page` and the resulting PNG
/// bytes are recognized instead.
///
/// # Errors
///
/// Propagates rasterization and OCR errors. When the crate is built without
/// the `pdf-input` feature, PDF inputs yield `DocumentError::UnsupportedInput`.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn run_ocr(
    input: &Path,
    kind: InputKind,
) -> Result<(OcrResult, Option<i32>, Option<i32>), DocumentError> {
    use crate::ocr::TesseractOcr;

    let engine = TesseractOcr::new();
    match kind {
        InputKind::Pdf => {
            #[cfg(feature = "pdf-input")]
            {
                use crate::extract::pdf::{rasterize_first_page, PdfRasterConfig};

                let page = rasterize_first_page(input, PdfRasterConfig::new())?;
                let recognized = engine.extract_from_bytes(&page.png_bytes, "png")?;
                Ok((recognized, Some(page.page_count), Some(page.page_index)))
            }
            #[cfg(not(feature = "pdf-input"))]
            {
                Err(DocumentError::UnsupportedInput {
                    path: input.to_path_buf(),
                    reason: "rebuild gaze-document with `--features pdf-input` for PDF support",
                })
            }
        }
        InputKind::Png | InputKind::Jpeg => {
            let recognized = engine.extract_from_file(input)?;
            Ok((recognized, None, None))
        }
    }
}
/// Builds the redaction pipeline applied to OCR'd document text.
///
/// The pipeline combines:
/// - the stock e-mail regex detector;
/// - a regex detector for phone-like digit groups, tagged with the custom
///   class `"phone"`;
/// - an anchored recognizer for person names that follow a recipient cue
///   ("Bill to", "Invoice to", "Ship to", "Attention", "Attn").
///
/// E-mail, phone and name matches are tokenized; every other class is preserved.
///
/// # Errors
///
/// Returns `DocumentError::Pipeline` (tagged `email-regex`, `phone-regex` or
/// `build`) when a detector or the pipeline itself cannot be constructed.
// NOTE(review): the two stacked `cfg` attributes are ANDed, so the `mcp` half of
// the second one never applies on its own — confirm which gate is intended.
#[cfg(feature = "ocr-tesseract")]
#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
pub(crate) fn build_document_pipeline() -> Result<Pipeline, DocumentError> {
    let email_detector =
        RegexDetector::emails().map_err(|err| pipeline_err("email-regex", err))?;

    let phone_pattern = r"\+?\d{1,3}[-.\s]\(?\d{3}\)?[-.\s]?\d{3,4}[-.\s]?\d{0,4}";
    let phone_detector = RegexDetector::new(phone_pattern, PiiClass::custom("phone"))
        .map_err(|err| pipeline_err("phone-regex", err))?;

    // Cue phrases that introduce a recipient name on invoices and letters.
    let recipient_cues: Vec<String> = ["Bill to", "Invoice to", "Ship to", "Attention", "Attn"]
        .iter()
        .copied()
        .map(String::from)
        .collect();
    // The bare numeric arguments are positional tuning values; see
    // `AnchoredMatchRecognizer::new` for what each one controls.
    let recipient_recognizer = AnchoredMatchRecognizer::new(
        String::from("gaze_document.name.recipient"),
        recipient_cues,
        AnchoredBoundary::LineEnd,
        48,
        NameShape::PersonName,
        CuePosition::Before,
        String::from("invoice_recipient"),
        2,
        0.88,
        110,
    );

    Pipeline::builder()
        .detector(email_detector)
        .detector(phone_detector)
        .recognizer(recipient_recognizer)
        .rule(ClassRule::new(PiiClass::Email, Action::Tokenize))
        .rule(ClassRule::new(PiiClass::custom("phone"), Action::Tokenize))
        .rule(ClassRule::new(PiiClass::Name, Action::Tokenize))
        .rule(DefaultRule::new(Action::Preserve))
        .build()
        .map_err(|err| pipeline_err("build", err))
}
/// Tallies emitted token spans per PII class.
///
/// Classes are keyed by their canonical string. Because the tally is kept in
/// a `BTreeMap`, the returned entries are sorted by that string. An empty
/// `spans` slice yields an empty vector.
#[cfg(feature = "ocr-tesseract")]
fn count_pii_by_class(spans: &[EmittedTokenSpan]) -> Vec<ClassCount> {
    let tally = spans
        .iter()
        .fold(BTreeMap::<String, u32>::new(), |mut acc, span| {
            *acc.entry(span.class.to_canonical_str()).or_default() += 1;
            acc
        });

    tally
        .into_iter()
        .map(|(class, count)| ClassCount { class, count })
        .collect()
}
/// Writes the three bundle files into `out_dir`.
///
/// `clean_markdown` is written verbatim to `clean.md`; `manifest` and `report`
/// are written as pretty-printed JSON to `manifest.json` and `report.json`.
/// `out_dir` must already exist.
///
/// Both JSON payloads are serialized before any file is touched, so a
/// serialization failure cannot leave a partially written bundle behind. An
/// I/O failure part-way through the writes can still do so.
///
/// # Errors
///
/// Propagates JSON serialization errors and file-write errors.
#[cfg(feature = "ocr-tesseract")]
fn write_bundle(
    out_dir: &Path,
    clean_markdown: &str,
    manifest: &Manifest,
    report: &BundleReport,
) -> Result<(), DocumentError> {
    // Do all fallible in-memory work first.
    let manifest_json = serde_json::to_vec_pretty(manifest)?;
    let report_json = serde_json::to_vec_pretty(report)?;

    fs::write(out_dir.join(CLEAN_MARKDOWN_FILE), clean_markdown)?;
    fs::write(out_dir.join(MANIFEST_FILE), manifest_json)?;
    fs::write(out_dir.join(REPORT_FILE), report_json)?;
    Ok(())
}
/// Wraps redacted `text` in the bundle's markdown envelope.
///
/// The output is a title heading, a `Source kind` line carrying the label of
/// `kind`, a horizontal rule, and then `text`. A trailing newline is appended
/// when `text` does not already end with one.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn format_clean_markdown(text: &str, kind: InputKind) -> String {
    let source_line = format!("Source kind: `{}`\n\n", kind_label(kind));
    let mut markdown = [
        "# gaze-document safe bundle\n\n",
        source_line.as_str(),
        "---\n\n",
        text,
    ]
    .concat();

    // The check is on `text` itself, so empty text still gets its own newline.
    let needs_newline = !text.ends_with('\n');
    if needs_newline {
        markdown.push('\n');
    }
    markdown
}
/// Maps an input kind to the lowercase label used in reports and in the
/// markdown header (`"pdf"`, `"jpeg"` or `"png"`).
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn kind_label(kind: InputKind) -> &'static str {
    // Exhaustive on purpose: a new `InputKind` variant must pick a label here.
    match kind {
        InputKind::Pdf => "pdf",
        InputKind::Jpeg => "jpeg",
        InputKind::Png => "png",
    }
}
#[cfg(feature = "ocr-tesseract")]
/// Returns an absolute form of `path` on a best-effort basis.
///
/// Absolute paths are returned unchanged. Relative paths are joined onto the
/// current working directory; if that directory cannot be determined, the
/// path is returned as given. No normalization or filesystem access is
/// performed on the path itself.
fn absolutize(path: &Path) -> PathBuf {
    if !path.is_absolute() {
        if let Ok(cwd) = std::env::current_dir() {
            return cwd.join(path);
        }
    }
    path.to_path_buf()
}
/// Wraps a displayable error into `DocumentError::Pipeline`, prefixing the
/// message with the pipeline `stage` that failed (`"<stage>: <error>"`).
#[cfg(feature = "ocr-tesseract")]
fn pipeline_err(stage: &'static str, err: impl std::fmt::Display) -> DocumentError {
    let message = format!("{stage}: {err}");
    DocumentError::Pipeline(message)
}
#[cfg(all(test, feature = "ocr-tesseract"))]
mod tests {
    use super::*;

    /// Two e-mail spans and one custom phone span collapse into two tallies.
    #[test]
    fn count_pii_by_class_groups_email_and_phone() {
        let email_a = EmittedTokenSpan::new(0..10, 0..10, PiiClass::Email);
        let email_b = EmittedTokenSpan::new(20..28, 20..28, PiiClass::Email);
        let phone = EmittedTokenSpan::new(40..50, 40..50, PiiClass::custom("phone"));

        let counts = count_pii_by_class(&[email_a, email_b, phone]);
        assert_eq!(counts.len(), 2);

        let tally: BTreeMap<&str, u32> = counts
            .iter()
            .map(|entry| (entry.class.as_str(), entry.count))
            .collect();
        assert_eq!(tally.get("email"), Some(&2));
        assert_eq!(tally.get("custom:phone"), Some(&1));
    }

    /// The serialized report carries the bundle version and the passed values.
    #[test]
    fn report_serializes_with_bundle_version() {
        let ocr = OcrResult::new("body".into(), Some(91.5), 2, "eng".into());
        let per_class = vec![
            ClassCount::new("email", 2),
            ClassCount::new("custom:phone", 1),
        ];
        let report = BundleReport::new("png", &ocr, 42, 3, per_class, None, None);

        let json = serde_json::to_value(&report).expect("serialize");
        assert_eq!(json["bundle_version"], BUNDLE_VERSION);
        assert_eq!(json["input_kind"], "png");
        assert_eq!(json["pii_token_count"], 3);
    }

    /// Text without a final newline gets one, and the header names the kind.
    #[test]
    fn format_clean_markdown_appends_trailing_newline() {
        let rendered = format_clean_markdown("hello", InputKind::Png);
        assert!(rendered.ends_with('\n'));
        assert!(rendered.contains("Source kind: `png`"));
        assert!(rendered.contains("hello"));
    }
}