1use std::path::{Path, PathBuf};
9
10use async_trait::async_trait;
11use gaze::{CleanDocument, RawDocument};
12use gaze_mcp_core::{
13 Tool, ToolCtx, ToolDescriptor, ToolError, ToolRegistry, ToolRegistryError, ToolResponse,
14};
15use serde::Serialize;
16use serde_json::json;
17
18#[cfg(feature = "ocr-tesseract")]
19use crate::extract::InputKind;
20#[cfg(feature = "ocr-tesseract")]
21use crate::DocumentError;
22
/// Default cap, in bytes, on the size of a file `gaze_read_file` accepts (25 MiB).
pub const DEFAULT_MAX_FILE_SIZE: u64 = 25 * 1024 * 1024;
25
/// Options consumed by [`register_tools`] when building the document tools.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub struct GazeReadOpts {
    /// Largest file size, in bytes, that `gaze_read_file` will process; larger
    /// files are rejected with `ToolError::LimitExceeded`.
    pub max_file_size: u64,
}
33
34impl Default for GazeReadOpts {
35 fn default() -> Self {
36 Self {
37 max_file_size: DEFAULT_MAX_FILE_SIZE,
38 }
39 }
40}
41
42pub fn register_tools(
44 registry: &mut ToolRegistry,
45 opts: GazeReadOpts,
46) -> Result<(), ToolRegistryError> {
47 registry.register(GazeReadText::new())?;
48 registry.register(GazeReadFile::with_max_file_size(opts.max_file_size))?;
49 Ok(())
50}
51
/// The `gaze_read_text` tool: pseudonymizes already-extracted text before it
/// is returned to an MCP client.
#[derive(Debug)]
#[non_exhaustive]
pub struct GazeReadText {
    /// Name, input schema, description, and output schema advertised for the tool.
    descriptor: ToolDescriptor,
}
58
59impl GazeReadText {
60 pub fn new() -> Self {
62 Self {
63 descriptor: ToolDescriptor::agent(
64 "gaze_read_text",
65 json!({
66 "type": "object",
67 "properties": {
68 "text": {
69 "type": "string",
70 "description": "Already-extracted text to pseudonymize before model use."
71 }
72 },
73 "required": ["text"]
74 }),
75 )
76 .with_description("Pseudonymize already-extracted text before returning it to an MCP client.")
77 .with_output_schema(response_schema()),
78 }
79 }
80}
81
82impl Default for GazeReadText {
83 fn default() -> Self {
84 Self::new()
85 }
86}
87
88#[async_trait]
89impl Tool for GazeReadText {
90 fn descriptor(&self) -> &ToolDescriptor {
91 &self.descriptor
92 }
93
94 async fn invoke(&self, ctx: &ToolCtx<'_>) -> Result<ToolResponse, ToolError> {
95 let text = required_string(ctx.redacted_args(), "text")?;
96 let clean_text = redact_document_text(text, ctx)?;
97 Ok(ToolResponse::json(json!(DocumentToolResponse {
98 clean_markdown: format_text_markdown(&clean_text),
99 manifest_id: ctx.call_id().to_string(),
100 file_metadata: FileMetadata {
101 source_kind: "text".to_string(),
102 ocr_mean_confidence: None,
103 bundle_version: crate::BUNDLE_VERSION,
104 page_count: None,
105 },
106 })))
107 }
108}
109
/// The `gaze_read_file` tool: reads an image or PDF through OCR and Gaze
/// pseudonymization before the result is returned over MCP.
#[derive(Debug)]
#[non_exhaustive]
pub struct GazeReadFile {
    /// Name, input schema, description, and output schema advertised for the tool.
    descriptor: ToolDescriptor,
    /// Largest file size, in bytes, accepted by `invoke`.
    max_file_size: u64,
}
117
118impl GazeReadFile {
119 pub fn new() -> Self {
121 Self::with_max_file_size(DEFAULT_MAX_FILE_SIZE)
122 }
123
124 pub fn with_max_file_size(max_file_size: u64) -> Self {
126 Self {
127 descriptor: ToolDescriptor::agent(
128 "gaze_read_file",
129 json!({
130 "type": "object",
131 "properties": {
132 "path": {
133 "type": "string",
134 "description": "Filesystem path to a PNG, JPG, or PDF document."
135 }
136 },
137 "required": ["path"]
138 }),
139 )
140 .with_description(
141 "Read an image or PDF through OCR and Gaze pseudonymization before MCP return.",
142 )
143 .with_output_schema(response_schema()),
144 max_file_size,
145 }
146 }
147}
148
149impl Default for GazeReadFile {
150 fn default() -> Self {
151 Self::new()
152 }
153}
154
155#[async_trait]
156impl Tool for GazeReadFile {
157 fn descriptor(&self) -> &ToolDescriptor {
158 &self.descriptor
159 }
160
161 async fn invoke(&self, ctx: &ToolCtx<'_>) -> Result<ToolResponse, ToolError> {
162 let path = PathBuf::from(required_string(ctx.redacted_args(), "path")?);
163 validate_file(&path, self.max_file_size)?;
164 read_file_response(&path, ctx).map(|response| ToolResponse::json(json!(response)))
165 }
166}
167
/// Payload serialized to JSON and returned by both document tools.
#[derive(Serialize)]
struct DocumentToolResponse {
    /// Redacted document text rendered as markdown.
    clean_markdown: String,
    /// String form of the call id of the invocation that produced this response.
    manifest_id: String,
    /// Details about the source the text came from.
    file_metadata: FileMetadata,
}
174
/// Source details attached to every [`DocumentToolResponse`].
#[derive(Serialize)]
struct FileMetadata {
    /// `"text"` for `gaze_read_text`; for files, the value chosen by `source_kind`.
    source_kind: String,
    /// Mean confidence reported by the OCR run; `None` for plain-text input.
    ocr_mean_confidence: Option<f32>,
    /// Value of `crate::BUNDLE_VERSION` at the time the response was built.
    bundle_version: u32,
    /// Page count reported by the OCR run, if any; `None` for plain-text input.
    /// NOTE(review): presumably only populated for PDF inputs — confirm against `run_ocr`.
    page_count: Option<u32>,
}
182
183fn required_string<'a>(args: &'a serde_json::Value, field: &str) -> Result<&'a str, ToolError> {
184 args.get(field)
185 .and_then(|value| value.as_str())
186 .ok_or_else(|| ToolError::InvalidArgs(format!("missing required string field `{field}`")))
187}
188
189fn redact_document_text(text: &str, ctx: &ToolCtx<'_>) -> Result<String, ToolError> {
190 let pipeline = crate::bundle::build_document_pipeline().map_err(map_document_error)?;
191 let clean = pipeline
192 .redact_with_context(
193 ctx.resources().session(),
194 RawDocument::Text(text.to_string()),
195 ctx.resources().locale_chain(),
196 )
197 .map_err(|err| ToolError::BackendFailure(format!("document pipeline failed: {err}")))?;
198 match clean {
199 CleanDocument::Text(text) => Ok(text),
200 _ => Err(ToolError::BackendFailure(
201 "document pipeline returned non-text output".to_string(),
202 )),
203 }
204}
205
206fn validate_file(path: &Path, max_file_size: u64) -> Result<(), ToolError> {
207 let metadata = std::fs::metadata(path).map_err(|err| map_file_metadata_error(path, err))?;
208 if !metadata.is_file() {
209 return Err(ToolError::InvalidArgs(format!(
210 "path `{}` is not a regular file",
211 path.display()
212 )));
213 }
214 if metadata.len() > max_file_size {
215 return Err(ToolError::LimitExceeded(format!(
216 "file `{}` is {} bytes; configured cap is {} bytes",
217 path.display(),
218 metadata.len(),
219 max_file_size
220 )));
221 }
222 Ok(())
223}
224
225fn map_file_metadata_error(path: &Path, err: std::io::Error) -> ToolError {
226 if err.kind() == std::io::ErrorKind::NotFound {
227 ToolError::NotFound(format!("file `{}` not found", path.display()))
228 } else {
229 ToolError::internal(err)
230 }
231}
232
/// Detects the input kind of `path`, runs OCR on it, redacts the normalized
/// OCR text, and assembles the tool response.
///
/// A page count that does not fit in `u32` is reported as `None`.
///
/// # Errors
///
/// Returns the mapped `DocumentError` when kind detection or OCR fails, and
/// forwards any error from `redact_document_text`.
#[cfg(feature = "ocr-tesseract")]
fn read_file_response(path: &Path, ctx: &ToolCtx<'_>) -> Result<DocumentToolResponse, ToolError> {
    let kind = InputKind::detect(path).map_err(map_document_error)?;
    let (ocr_result, pdf_page_count, _) =
        crate::bundle::run_ocr(path, kind).map_err(map_document_error)?;

    let ocr_text = crate::ocr::normalize_ocr_artifacts(&ocr_result.text);
    let redacted = redact_document_text(&ocr_text, ctx)?;

    let page_count = pdf_page_count.and_then(|pages| u32::try_from(pages).ok());
    let file_metadata = FileMetadata {
        source_kind: source_kind(kind).to_string(),
        ocr_mean_confidence: ocr_result.mean_confidence,
        bundle_version: crate::BUNDLE_VERSION,
        page_count,
    };

    Ok(DocumentToolResponse {
        clean_markdown: crate::bundle::format_clean_markdown(&redacted, kind),
        manifest_id: ctx.call_id().to_string(),
        file_metadata,
    })
}
251
252#[cfg(not(feature = "ocr-tesseract"))]
253fn read_file_response(
254 _path: &PathBuf,
255 _ctx: &ToolCtx<'_>,
256) -> Result<DocumentToolResponse, ToolError> {
257 Err(ToolError::BackendUnavailable(
258 "rebuild gaze-document with `--features ocr-tesseract` to enable `gaze_read_file`"
259 .to_string(),
260 ))
261}
262
/// Maps an input kind to the `source_kind` string reported in file metadata.
///
/// The label from `crate::bundle::kind_label` is collapsed to `"image"` for
/// `"png"` and `"jpeg"`; any other label, including `"pdf"`, passes through.
#[cfg(feature = "ocr-tesseract")]
fn source_kind(kind: InputKind) -> &'static str {
    let label = crate::bundle::kind_label(kind);
    match label {
        "pdf" => "pdf",
        "png" | "jpeg" => "image",
        _ => label,
    }
}
271
/// Translates a `DocumentError` into the `ToolError` category reported to callers.
///
/// - Unsupported input → `InvalidArgs`
/// - Missing Tesseract or Pdfium → `BackendUnavailable`, carrying the error's hint
/// - Tesseract or PDF rasterization failure → `BackendFailure`
/// - Anything else → `ToolError::internal`
#[cfg(feature = "ocr-tesseract")]
fn map_document_error(err: DocumentError) -> ToolError {
    match err {
        DocumentError::UnsupportedInput { path, reason } => {
            ToolError::InvalidArgs(format!("unsupported input `{}`: {reason}", path.display()))
        }
        DocumentError::TesseractNotFound(hint) | DocumentError::PdfiumNotFound(hint) => {
            ToolError::BackendUnavailable(hint)
        }
        DocumentError::PdfRasterFailed(detail) => ToolError::BackendFailure(detail),
        DocumentError::TesseractFailed { status, stderr } => {
            ToolError::BackendFailure(format!("tesseract exited with status {status}: {stderr}"))
        }
        unmapped => ToolError::internal(unmapped),
    }
}
288
/// Variant for builds without the `ocr-tesseract` feature: every
/// `DocumentError` is wrapped with `ToolError::internal`.
#[cfg(not(feature = "ocr-tesseract"))]
fn map_document_error(err: crate::DocumentError) -> ToolError {
    ToolError::internal(err)
}
293
/// Renders redacted text as the markdown document returned by `gaze_read_text`.
///
/// The output is a fixed title, a `Source kind` line, a horizontal rule, and
/// then `text`. A newline is appended only when `text` does not already end
/// with one, so the result always ends in `\n`.
fn format_text_markdown(text: &str) -> String {
    let terminator = if text.ends_with('\n') { "" } else { "\n" };
    [
        "# gaze-document safe text\n\n",
        "Source kind: `text`\n\n",
        "---\n\n",
        text,
        terminator,
    ]
    .concat()
}
305
306fn response_schema() -> serde_json::Value {
307 json!({
308 "type": "object",
309 "properties": {
310 "clean_markdown": { "type": "string" },
311 "manifest_id": { "type": "string" },
312 "file_metadata": {
313 "type": "object",
314 "properties": {
315 "source_kind": { "type": "string" },
316 "ocr_mean_confidence": { "type": ["number", "null"] },
317 "bundle_version": { "type": "integer" },
318 "page_count": { "type": ["integer", "null"] }
319 },
320 "required": [
321 "source_kind",
322 "ocr_mean_confidence",
323 "bundle_version",
324 "page_count"
325 ]
326 }
327 },
328 "required": ["clean_markdown", "manifest_id", "file_metadata"]
329 })
330}
331
#[cfg(test)]
mod tests {
    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::sync::Arc;

    use async_trait::async_trait;
    use gaze_mcp_core::{
        AuthError, AuthHook, DispatchError, ManifestStore, PiiEnvelope, Principal, SessionIdPolicy,
    };
    use gaze_mcp_core::{BeginCallContext, CallHandle, FailureReason, ManifestError, SnapshotRef};
    use serde_json::json;

    use super::*;

    /// Auth hook that allows every agent call and denies every operator call.
    struct AllowAllAuth;

    #[async_trait]
    impl AuthHook for AllowAllAuth {
        async fn authorize_agent(
            &self,
            _principal: &Principal,
            _tool_name: &str,
        ) -> Result<(), AuthError> {
            Ok(())
        }

        async fn authorize_operator(
            &self,
            _principal: &Principal,
            _tool_name: &str,
        ) -> Result<(), AuthError> {
            Err(AuthError::Denied("operator tier disabled in test".into()))
        }
    }

    /// Manifest store that only counts how often each lifecycle hook was called.
    struct RecordingManifest {
        begins: AtomicUsize,
        finishes: AtomicUsize,
        failures: AtomicUsize,
    }

    impl RecordingManifest {
        fn new() -> Self {
            Self {
                begins: AtomicUsize::new(0),
                finishes: AtomicUsize::new(0),
                failures: AtomicUsize::new(0),
            }
        }
    }

    #[async_trait]
    impl ManifestStore for RecordingManifest {
        async fn begin_call(&self, ctx: BeginCallContext<'_>) -> Result<CallHandle, ManifestError> {
            self.begins.fetch_add(1, Ordering::SeqCst);
            Ok(CallHandle::new(ctx.call_id))
        }

        async fn finish_call(
            &self,
            _handle: CallHandle,
            _snapshot: SnapshotRef,
        ) -> Result<(), ManifestError> {
            self.finishes.fetch_add(1, Ordering::SeqCst);
            Ok(())
        }

        async fn fail_call(
            &self,
            _handle: CallHandle,
            _reason: FailureReason,
        ) -> Result<(), ManifestError> {
            self.failures.fetch_add(1, Ordering::SeqCst);
            Ok(())
        }
    }

    /// Owns everything a `PiiEnvelope` borrows so tests can dispatch tool calls.
    struct Harness {
        registry: ToolRegistry,
        auth: AllowAllAuth,
        manifest: Arc<RecordingManifest>,
        pipeline: gaze::Pipeline,
        session: gaze::Session,
        session_id_policy: SessionIdPolicy,
    }

    impl Harness {
        /// Harness with both document tools registered under default options.
        fn new() -> Self {
            let mut registry = ToolRegistry::new();
            register_tools(&mut registry, GazeReadOpts::default()).expect("register tools");
            Self::with_registry(registry)
        }

        /// Harness around a caller-supplied registry; everything else uses the
        /// same wiring as `new`, so tests with custom tools do not repeat it.
        fn with_registry(registry: ToolRegistry) -> Self {
            Self {
                registry,
                auth: AllowAllAuth,
                manifest: Arc::new(RecordingManifest::new()),
                pipeline: crate::bundle::build_document_pipeline().expect("pipeline"),
                session: gaze::Session::new(gaze::Scope::Ephemeral).expect("session"),
                session_id_policy: SessionIdPolicy::default_strict(),
            }
        }

        /// Dispatches `tool_name` with `args` through a fresh envelope and
        /// returns the response payload.
        async fn dispatch(
            &self,
            tool_name: &str,
            args: serde_json::Value,
        ) -> Result<serde_json::Value, DispatchError> {
            let envelope = PiiEnvelope::new(
                &self.registry,
                &self.auth,
                self.manifest.as_ref(),
                &self.pipeline,
                &self.session,
                &[gaze::LocaleTag::Global],
                &self.session_id_policy,
            );
            envelope
                .dispatch(&Principal::new("unit-test"), tool_name, args, None)
                .await
                .map(|response| response.payload)
        }
    }

    /// Absolute path of the synthetic PNG fixture shipped in `testdata/`.
    fn fixture_image() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("testdata")
            .join("synthetic_image.png")
    }

    /// Fails if any raw PII value from the fixtures survives in the output.
    fn assert_no_raw_fixture_values(clean_markdown: &str) {
        assert!(!clean_markdown.contains("Jane Doe"), "{clean_markdown}");
        assert!(!clean_markdown.contains("@example.com"), "{clean_markdown}");
        assert!(!clean_markdown.contains("555-0142"), "{clean_markdown}");
    }

    #[tokio::test]
    async fn read_text_dispatch_returns_clean_markdown_and_manifest_id() {
        let harness = Harness::new();
        let payload = harness
            .dispatch(
                "gaze_read_text",
                json!({
                    "text": "Bill to: Jane Doe\nEmail: jane.doe@example.com\nPhone: +1-555-0142"
                }),
            )
            .await
            .expect("dispatch succeeds");

        let clean_markdown = payload["clean_markdown"].as_str().expect("clean markdown");
        assert!(clean_markdown.contains(":Email_"), "{clean_markdown}");
        assert!(clean_markdown.contains(":Name_"), "{clean_markdown}");
        assert!(
            clean_markdown.contains(":Custom:phone_"),
            "{clean_markdown}"
        );
        assert_no_raw_fixture_values(clean_markdown);
        assert!(!payload["manifest_id"].as_str().unwrap().is_empty());
        assert_eq!(payload["file_metadata"]["source_kind"], "text");
        assert_eq!(
            payload["file_metadata"]["ocr_mean_confidence"],
            serde_json::Value::Null
        );
        assert_eq!(harness.manifest.begins.load(Ordering::SeqCst), 1);
        assert_eq!(harness.manifest.finishes.load(Ordering::SeqCst), 1);
    }

    #[tokio::test]
    async fn read_file_missing_path_fails_closed_as_not_found() {
        let harness = Harness::new();
        let err = harness
            .dispatch(
                "gaze_read_file",
                json!({ "path": "testdata/does-not-exist.png" }),
            )
            .await
            .expect_err("missing file fails");

        match err {
            DispatchError::ToolError(ToolError::NotFound(message)) => {
                assert!(message.contains("not found"));
            }
            other => panic!("unexpected error: {other:?}"),
        }
        assert_eq!(harness.manifest.failures.load(Ordering::SeqCst), 1);
    }

    #[tokio::test]
    async fn read_file_limit_fails_closed_before_ocr() {
        // A one-byte cap guarantees the fixture is rejected by the size check.
        let mut registry = ToolRegistry::new();
        registry
            .register(GazeReadFile::with_max_file_size(1))
            .expect("register file tool");
        let harness = Harness::with_registry(registry);

        let err = harness
            .dispatch("gaze_read_file", json!({ "path": fixture_image() }))
            .await
            .expect_err("oversized file fails");

        match err {
            DispatchError::ToolError(ToolError::LimitExceeded(message)) => {
                assert!(message.contains("configured cap is 1 bytes"));
            }
            other => panic!("unexpected error: {other:?}"),
        }
        // Same fail-closed bookkeeping check as the not-found case.
        assert_eq!(harness.manifest.failures.load(Ordering::SeqCst), 1);
    }

    #[cfg(feature = "ocr-tesseract")]
    #[tokio::test]
    async fn read_file_dispatch_returns_clean_markdown_for_fixture_when_backend_available() {
        let harness = Harness::new();
        let payload = match harness
            .dispatch("gaze_read_file", json!({ "path": fixture_image() }))
            .await
        {
            Ok(payload) => payload,
            // A missing OCR backend is an environment issue, not a failure of this code.
            Err(DispatchError::ToolError(ToolError::BackendUnavailable(message))) => {
                eprintln!("SKIP: document backend unavailable: {message}");
                return;
            }
            Err(other) => panic!("unexpected dispatch error: {other:?}"),
        };

        let clean_markdown = payload["clean_markdown"].as_str().expect("clean markdown");
        assert!(clean_markdown.contains(":Email_"), "{clean_markdown}");
        assert!(clean_markdown.contains(":Name_"), "{clean_markdown}");
        assert!(
            clean_markdown.contains(":Custom:phone_"),
            "{clean_markdown}"
        );
        assert_no_raw_fixture_values(clean_markdown);
        assert_eq!(payload["file_metadata"]["source_kind"], "image");
        assert!(!payload["manifest_id"].as_str().unwrap().is_empty());
    }
}