1use std::path::PathBuf;
20
21use gaze::Manifest;
22use serde::{Deserialize, Serialize};
23
24use crate::ocr::OcrResult;
25
26#[cfg(feature = "ocr-tesseract")]
27use std::collections::BTreeMap;
28#[cfg(feature = "ocr-tesseract")]
29use std::fs;
30#[cfg(feature = "ocr-tesseract")]
31use std::path::Path;
32
33#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
34use gaze::{
35 Action, ClassRule, CleanDocument, DefaultRule, LocaleTag, Pipeline, RawDocument, Scope, Session,
36};
37#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
38use gaze_recognizers::{
39 AnchoredBoundary, AnchoredMatchRecognizer, CuePosition, NameShape, RegexDetector,
40};
41#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
42use gaze_types::{EmittedTokenSpan, PiiClass};
43
44#[cfg(feature = "ocr-tesseract")]
45use crate::extract::InputKind;
46#[cfg(feature = "ocr-tesseract")]
47use crate::DocumentError;
48
/// Bundle format version; `BundleReport::new` stamps it into `BundleReport::bundle_version`.
pub const BUNDLE_VERSION: u32 = 1;

/// File name, inside the output directory, that `write_bundle` uses for the redacted markdown.
pub const CLEAN_MARKDOWN_FILE: &str = "clean.md";
/// File name, inside the output directory, that `write_bundle` uses for the serialized manifest.
pub const MANIFEST_FILE: &str = "manifest.json";
/// File name, inside the output directory, that `write_bundle` uses for the serialized report.
pub const REPORT_FILE: &str = "report.json";
58
59#[non_exhaustive]
61#[derive(Debug, Clone)]
62pub struct SafeBundle {
63 pub clean_markdown: String,
65 pub manifest: Manifest,
67 pub layout: LayoutSummary,
69 pub preview_png: Option<Vec<u8>>,
71 pub report: BundleReport,
73 pub source_path: PathBuf,
75 pub out_dir: PathBuf,
77}
78
79impl SafeBundle {
80 pub fn new(
82 clean_markdown: String,
83 manifest: Manifest,
84 layout: LayoutSummary,
85 preview_png: Option<Vec<u8>>,
86 report: BundleReport,
87 source_path: PathBuf,
88 out_dir: PathBuf,
89 ) -> Self {
90 Self {
91 clean_markdown,
92 manifest,
93 layout,
94 preview_png,
95 report,
96 source_path,
97 out_dir,
98 }
99 }
100}
101
102#[non_exhaustive]
104#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
105pub struct ClassCount {
106 pub class: String,
108 pub count: u32,
110}
111
112impl ClassCount {
113 pub fn new(class: impl Into<String>, count: u32) -> Self {
115 Self {
116 class: class.into(),
117 count,
118 }
119 }
120}
121
122#[non_exhaustive]
128#[derive(Debug, Clone, Serialize, Deserialize)]
129pub struct BundleReport {
130 pub bundle_version: u32,
132 pub input_kind: String,
134 pub ocr_mean_confidence: Option<f32>,
136 pub ocr_word_count: usize,
138 pub ocr_lang: String,
140 pub clean_char_count: usize,
142 pub pii_token_count: u32,
144 pub pii_tokens_by_class: Vec<ClassCount>,
146 pub pdf_page_count: Option<i32>,
148 pub pdf_page_index: Option<i32>,
150}
151
152impl BundleReport {
153 #[allow(clippy::too_many_arguments)]
155 pub fn new(
156 input_kind: impl Into<String>,
157 ocr: &OcrResult,
158 clean_char_count: usize,
159 pii_token_count: u32,
160 pii_tokens_by_class: Vec<ClassCount>,
161 pdf_page_count: Option<i32>,
162 pdf_page_index: Option<i32>,
163 ) -> Self {
164 Self {
165 bundle_version: BUNDLE_VERSION,
166 input_kind: input_kind.into(),
167 ocr_mean_confidence: ocr.mean_confidence,
168 ocr_word_count: ocr.word_count,
169 ocr_lang: ocr.lang.clone(),
170 clean_char_count,
171 pii_token_count,
172 pii_tokens_by_class,
173 pdf_page_count,
174 pdf_page_index,
175 }
176 }
177}
178
/// Minimal description of a document's page layout.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct LayoutSummary {
    /// Number of pages covered by the summary.
    pub page_count: u32,
}

impl LayoutSummary {
    /// Returns a summary describing exactly one page.
    pub fn single_page() -> Self {
        Self::new(1)
    }

    /// Returns a summary with the given `page_count`, stored as-is.
    pub fn new(page_count: u32) -> Self {
        LayoutSummary { page_count }
    }
}
201
#[cfg(feature = "ocr-tesseract")]
/// OCRs `input`, redacts PII from the recognized text, and writes a safe
/// bundle into `out_dir`.
///
/// Steps, in order:
/// 1. Detect the input kind and create `out_dir` (including missing parents).
/// 2. Run OCR (for PDFs only the first page is rasterized) and normalize the
///    recognized text.
/// 3. Redact the text with the pipeline from `build_document_pipeline`, using
///    an ephemeral session and the global locale.
/// 4. Write `clean.md`, `manifest.json` and `report.json` into `out_dir`.
///
/// The returned [`SafeBundle`] carries the same markdown, manifest and report
/// that were written, a single-page layout, no preview image, and the input
/// and output paths made absolute where possible.
///
/// # Errors
///
/// Returns a [`DocumentError`] when the input kind cannot be detected, the
/// output directory cannot be created, OCR fails, the pipeline fails to build
/// or run (or yields a non-text document), or any bundle file cannot be
/// serialized or written.
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub fn clean(input: &Path, out_dir: &Path) -> Result<SafeBundle, DocumentError> {
    let kind = InputKind::detect(input)?;
    let absolute_input = absolutize(input);
    let absolute_out = absolutize(out_dir);

    // The absolute form is used only for the error payload; the directory is
    // created through the path exactly as the caller supplied it.
    fs::create_dir_all(out_dir)
        .map_err(|err| DocumentError::OutputDir(absolute_out.clone(), err))?;

    let (ocr_result, pdf_page_count, pdf_page_index) = run_ocr(input, kind)?;
    let normalized_text = crate::ocr::normalize_ocr_artifacts(&ocr_result.text);

    let pipeline = build_document_pipeline()?;
    let session = Session::new(Scope::Ephemeral).map_err(|err| pipeline_err("session", err))?;
    let locale_chain = [LocaleTag::Global];
    // NOTE(review): the third value returned by `clean_with_safety_net` is
    // discarded here — confirm that ignoring it is intentional.
    let (clean_doc, spans, _leak_report) = pipeline
        .clean_with_safety_net(&session, RawDocument::Text(normalized_text), &locale_chain)
        .map_err(|err| pipeline_err("redact", err))?;

    let clean_text = match clean_doc {
        CleanDocument::Text(text) => text,
        _ => {
            return Err(DocumentError::Pipeline(
                "pipeline returned non-text variant for text input".to_string(),
            ));
        }
    };

    // Tally while the spans are only borrowed, then move them into the
    // manifest: this avoids cloning the whole span vector.
    let counts = count_pii_by_class(&spans);
    let pii_token_count: u32 = counts.iter().map(|c| c.count).sum();
    let manifest = Manifest::from_spans(spans);

    let report = BundleReport::new(
        kind_label(kind),
        &ocr_result,
        // Counted on the redacted text itself, before the markdown header is added.
        clean_text.chars().count(),
        pii_token_count,
        counts,
        pdf_page_count,
        pdf_page_index,
    );

    let clean_markdown = format_clean_markdown(&clean_text, kind);
    write_bundle(out_dir, &clean_markdown, &manifest, &report)?;

    Ok(SafeBundle::new(
        clean_markdown,
        manifest,
        LayoutSummary::single_page(),
        None,
        report,
        absolute_input,
        absolute_out,
    ))
}
274
#[cfg(feature = "ocr-tesseract")]
/// Runs Tesseract OCR on `input` according to its detected `kind`.
///
/// Returns the OCR result together with `(page_count, page_index)` of the
/// rasterized PDF page; both are `None` for PNG and JPEG inputs.
///
/// PDF inputs are handled only when the `pdf-input` feature is enabled: the
/// first page is rasterized to PNG bytes and OCRed from memory. Without that
/// feature a PDF yields [`DocumentError::UnsupportedInput`].
pub(crate) fn run_ocr(
    input: &Path,
    kind: InputKind,
) -> Result<(OcrResult, Option<i32>, Option<i32>), DocumentError> {
    use crate::ocr::TesseractOcr;

    let engine = TesseractOcr::new();
    match kind {
        InputKind::Pdf => {
            #[cfg(feature = "pdf-input")]
            {
                use crate::extract::pdf::{rasterize_first_page, PdfRasterConfig};
                let page = rasterize_first_page(input, PdfRasterConfig::new())?;
                let recognized = engine.extract_from_bytes(&page.png_bytes, "png")?;
                Ok((recognized, Some(page.page_count), Some(page.page_index)))
            }
            #[cfg(not(feature = "pdf-input"))]
            {
                Err(DocumentError::UnsupportedInput {
                    path: input.to_path_buf(),
                    reason: "rebuild gaze-document with `--features pdf-input` for PDF support",
                })
            }
        }
        // Raster images are handed to the OCR engine straight from disk.
        InputKind::Png | InputKind::Jpeg => {
            let recognized = engine.extract_from_file(input)?;
            Ok((recognized, None, None))
        }
    }
}
305
// Gated on `ocr-tesseract` alone. The previously stacked
// `any(feature = "ocr-tesseract", feature = "mcp")` attribute was subsumed by
// this one (stacked `cfg`s are ANDed), and the body depends on `pipeline_err`
// and `DocumentError`, which this file only brings in under `ocr-tesseract`.
#[cfg(feature = "ocr-tesseract")]
/// Builds the redaction pipeline used for OCRed documents.
///
/// Detection:
/// - e-mail addresses via `RegexDetector::emails()`;
/// - phone numbers via a custom regex, reported as `PiiClass::custom("phone")`;
/// - recipient names anchored on the cues "Bill to", "Invoice to", "Ship to",
///   "Attention" and "Attn".
///
/// Rules: the e-mail, custom phone and name classes are tokenized; everything
/// else is preserved by the default rule.
///
/// # Errors
///
/// Returns [`DocumentError::Pipeline`] (tagged `email-regex`, `phone-regex` or
/// `build`) when a detector or the pipeline itself cannot be constructed.
pub(crate) fn build_document_pipeline() -> Result<Pipeline, DocumentError> {
    let email = RegexDetector::emails().map_err(|err| pipeline_err("email-regex", err))?;
    let phone = RegexDetector::new(
        // Optional `+`, 1–3 digits, one mandatory separator (`-`, `.` or
        // whitespace), three digits optionally wrapped in parentheses, then
        // 3–4 digits and up to four more digits, each after an optional
        // separator.
        r"\+?\d{1,3}[-.\s]\(?\d{3}\)?[-.\s]?\d{3,4}[-.\s]?\d{0,4}",
        PiiClass::custom("phone"),
    )
    .map_err(|err| pipeline_err("phone-regex", err))?;
    // NOTE(review): the meaning of the positional numeric arguments (48, 2,
    // 0.88, 110) is defined by `AnchoredMatchRecognizer::new`, which is not
    // visible in this file — confirm against its documentation before tuning.
    let recipient_name = AnchoredMatchRecognizer::new(
        "gaze_document.name.recipient".to_string(),
        vec![
            "Bill to".to_string(),
            "Invoice to".to_string(),
            "Ship to".to_string(),
            "Attention".to_string(),
            "Attn".to_string(),
        ],
        AnchoredBoundary::LineEnd,
        48,
        NameShape::PersonName,
        CuePosition::Before,
        "invoice_recipient".to_string(),
        2,
        0.88,
        110,
    );
    Pipeline::builder()
        .detector(email)
        .detector(phone)
        .recognizer(recipient_name)
        .rule(ClassRule::new(PiiClass::Email, Action::Tokenize))
        .rule(ClassRule::new(PiiClass::custom("phone"), Action::Tokenize))
        // Presumably the class emitted by the recipient-name recognizer — confirm.
        .rule(ClassRule::new(PiiClass::Name, Action::Tokenize))
        .rule(DefaultRule::new(Action::Preserve))
        .build()
        .map_err(|err| pipeline_err("build", err))
}
353
#[cfg(feature = "ocr-tesseract")]
/// Tallies emitted token spans per PII class.
///
/// Classes are keyed by their canonical string form. Because the tally is
/// kept in a `BTreeMap`, the returned entries are sorted by that string.
/// An empty `spans` slice yields an empty vector.
fn count_pii_by_class(spans: &[EmittedTokenSpan]) -> Vec<ClassCount> {
    let tally = spans
        .iter()
        .fold(BTreeMap::<String, u32>::new(), |mut acc, span| {
            *acc.entry(span.class.to_canonical_str()).or_default() += 1;
            acc
        });

    tally
        .into_iter()
        .map(|(class, count)| ClassCount::new(class, count))
        .collect()
}
365
#[cfg(feature = "ocr-tesseract")]
/// Writes the three bundle files into `out_dir`, which must already exist.
///
/// Files are produced in this order: the markdown (`CLEAN_MARKDOWN_FILE`),
/// the pretty-printed manifest JSON (`MANIFEST_FILE`), then the pretty-printed
/// report JSON (`REPORT_FILE`). The first failure aborts the sequence, so
/// files written before it remain on disk.
///
/// # Errors
///
/// Propagates serialization and file-system errors as [`DocumentError`].
fn write_bundle(
    out_dir: &Path,
    clean_markdown: &str,
    manifest: &Manifest,
    report: &BundleReport,
) -> Result<(), DocumentError> {
    let target = |file_name: &str| out_dir.join(file_name);

    fs::write(target(CLEAN_MARKDOWN_FILE), clean_markdown)?;

    let manifest_bytes = serde_json::to_vec_pretty(manifest)?;
    fs::write(target(MANIFEST_FILE), manifest_bytes)?;

    let report_bytes = serde_json::to_vec_pretty(report)?;
    fs::write(target(REPORT_FILE), report_bytes)?;

    Ok(())
}
380
#[cfg(feature = "ocr-tesseract")]
/// Wraps redacted `text` in the bundle's markdown envelope.
///
/// The output is a title line, a "Source kind" line carrying the label for
/// `kind`, a horizontal rule, and then `text`. A trailing newline is appended
/// only when `text` does not already end with one.
pub(crate) fn format_clean_markdown(text: &str, kind: InputKind) -> String {
    let source_line = format!("Source kind: `{}`\n\n", kind_label(kind));
    let terminator = if text.ends_with('\n') { "" } else { "\n" };

    [
        "# gaze-document safe bundle\n\n",
        source_line.as_str(),
        "---\n\n",
        text,
        terminator,
    ]
    .concat()
}
393
#[cfg(feature = "ocr-tesseract")]
/// Returns the lowercase label for an input kind: `"png"`, `"jpeg"` or `"pdf"`.
///
/// The label is what ends up in the report's `input_kind` field and in the
/// markdown header.
pub(crate) fn kind_label(kind: InputKind) -> &'static str {
    match kind {
        InputKind::Pdf => "pdf",
        InputKind::Jpeg => "jpeg",
        InputKind::Png => "png",
    }
}
402
403#[cfg(feature = "ocr-tesseract")]
/// Makes `path` absolute without touching the file system.
///
/// Absolute paths are returned unchanged. Relative paths are joined onto the
/// current working directory; if that directory cannot be determined, the
/// path is returned as given. No symlink resolution or `..` normalization is
/// performed.
fn absolutize(path: &Path) -> PathBuf {
    if path.is_absolute() {
        return path.to_path_buf();
    }
    match std::env::current_dir() {
        Ok(cwd) => cwd.join(path),
        // Best effort: fall back to the relative path when the cwd is unavailable.
        Err(_) => path.to_path_buf(),
    }
}
413
#[cfg(feature = "ocr-tesseract")]
/// Wraps `err` in [`DocumentError::Pipeline`], prefixing its display text
/// with the pipeline `stage` that failed (formatted as `"<stage>: <err>"`).
fn pipeline_err(stage: &'static str, err: impl std::fmt::Display) -> DocumentError {
    let message = format!("{stage}: {err}");
    DocumentError::Pipeline(message)
}
418
#[cfg(all(test, feature = "ocr-tesseract"))]
mod tests {
    use super::*;

    /// Two e-mail spans and one custom phone span collapse into two class
    /// entries carrying the right counts.
    #[test]
    fn count_pii_by_class_groups_email_and_phone() {
        let first_email = EmittedTokenSpan::new(0..10, 0..10, PiiClass::Email);
        let second_email = EmittedTokenSpan::new(20..28, 20..28, PiiClass::Email);
        let phone = EmittedTokenSpan::new(40..50, 40..50, PiiClass::custom("phone"));

        let counts = count_pii_by_class(&[first_email, second_email, phone]);
        let count_for = |label: &str| {
            counts
                .iter()
                .find(|entry| entry.class == label)
                .map(|entry| entry.count)
        };

        assert_eq!(counts.len(), 2);
        assert_eq!(count_for("email"), Some(2));
        assert_eq!(count_for("custom:phone"), Some(1));
    }

    /// The serialized report exposes the bundle version, the input kind and
    /// the total token count under their field names.
    #[test]
    fn report_serializes_with_bundle_version() {
        let ocr = OcrResult::new("body".into(), Some(91.5), 2, "eng".into());
        let per_class = vec![
            ClassCount::new("email", 2),
            ClassCount::new("custom:phone", 1),
        ];
        let report = BundleReport::new("png", &ocr, 42, 3, per_class, None, None);

        let json = serde_json::to_value(&report).expect("serialize");
        assert_eq!(json["bundle_version"], BUNDLE_VERSION);
        assert_eq!(json["input_kind"], "png");
        assert_eq!(json["pii_token_count"], 3);
    }

    /// Text without a trailing newline gets one, and the header names the
    /// source kind.
    #[test]
    fn format_clean_markdown_appends_trailing_newline() {
        let rendered = format_clean_markdown("hello", InputKind::Png);
        assert!(rendered.ends_with('\n'));
        assert!(rendered.contains("Source kind: `png`"));
        assert!(rendered.contains("hello"));
    }
}