1use std::path::{Path, PathBuf};
9
10use async_trait::async_trait;
11use gaze::{CleanDocument, RawDocument};
12use gaze_mcp_core::{
13 Tool, ToolCtx, ToolDescriptor, ToolError, ToolRegistry, ToolRegistryError, ToolResponse,
14};
15use serde::Serialize;
16use serde_json::json;
17
18#[cfg(feature = "ocr-tesseract")]
19use crate::extract::InputKind;
20#[cfg(feature = "ocr-tesseract")]
21use crate::DocumentError;
22
/// Default cap, in bytes, on the size of a file accepted by `gaze_read_file` (25 MiB).
pub const DEFAULT_MAX_FILE_SIZE: u64 = 25 << 20;
25
26#[derive(Debug, Clone, Copy, PartialEq, Eq)]
28#[non_exhaustive]
29pub struct GazeReadOpts {
30 pub max_file_size: u64,
32}
33
34impl Default for GazeReadOpts {
35 fn default() -> Self {
36 Self {
37 max_file_size: DEFAULT_MAX_FILE_SIZE,
38 }
39 }
40}
41
42pub fn register_tools(
44 registry: &mut ToolRegistry,
45 opts: GazeReadOpts,
46) -> Result<(), ToolRegistryError> {
47 registry.register(GazeReadText::new())?;
48 registry.register(GazeReadFile::with_max_file_size(opts.max_file_size))?;
49 Ok(())
50}
51
52#[derive(Debug)]
54#[non_exhaustive]
55pub struct GazeReadText {
56 descriptor: ToolDescriptor,
57}
58
59impl GazeReadText {
60 pub fn new() -> Self {
62 Self {
63 descriptor: ToolDescriptor::agent(
64 "gaze_read_text",
65 json!({
66 "type": "object",
67 "properties": {
68 "text": {
69 "type": "string",
70 "description": "Already-extracted text to pseudonymize before model use."
71 }
72 },
73 "required": ["text"]
74 }),
75 )
76 .with_description("Pseudonymize already-extracted text before returning it to an MCP client.")
77 .with_output_schema(response_schema()),
78 }
79 }
80}
81
82impl Default for GazeReadText {
83 fn default() -> Self {
84 Self::new()
85 }
86}
87
88#[async_trait]
89impl Tool for GazeReadText {
90 fn descriptor(&self) -> &ToolDescriptor {
91 &self.descriptor
92 }
93
94 async fn invoke(&self, ctx: &ToolCtx<'_>) -> Result<ToolResponse, ToolError> {
95 let text = required_string(ctx.redacted_args(), "text")?;
96 let clean_text = redact_document_text(text, ctx)?;
97 Ok(ToolResponse::json(json!(DocumentToolResponse {
98 clean_markdown: format_text_markdown(&clean_text),
99 manifest_id: ctx.call_id().to_string(),
100 file_metadata: FileMetadata {
101 source_kind: "text".to_string(),
102 ocr_mean_confidence: None,
103 bundle_version: crate::BUNDLE_VERSION,
104 page_count: None,
105 },
106 })))
107 }
108}
109
110#[derive(Debug)]
112#[non_exhaustive]
113pub struct GazeReadFile {
114 descriptor: ToolDescriptor,
115 max_file_size: u64,
116}
117
118impl GazeReadFile {
119 pub fn new() -> Self {
121 Self::with_max_file_size(DEFAULT_MAX_FILE_SIZE)
122 }
123
124 pub fn with_max_file_size(max_file_size: u64) -> Self {
126 Self {
127 descriptor: ToolDescriptor::agent(
128 "gaze_read_file",
129 json!({
130 "type": "object",
131 "properties": {
132 "path": {
133 "type": "string",
134 "description": "Filesystem path to a PNG, JPG, or PDF document."
135 }
136 },
137 "required": ["path"]
138 }),
139 )
140 .with_description(
141 "Read an image or PDF through OCR and Gaze pseudonymization before MCP return.",
142 )
143 .with_output_schema(response_schema()),
144 max_file_size,
145 }
146 }
147}
148
149impl Default for GazeReadFile {
150 fn default() -> Self {
151 Self::new()
152 }
153}
154
155#[async_trait]
156impl Tool for GazeReadFile {
157 fn descriptor(&self) -> &ToolDescriptor {
158 &self.descriptor
159 }
160
161 async fn invoke(&self, ctx: &ToolCtx<'_>) -> Result<ToolResponse, ToolError> {
162 let path = PathBuf::from(required_string(ctx.redacted_args(), "path")?);
163 validate_file(&path, self.max_file_size)?;
164 read_file_response(&path, ctx).map(|response| ToolResponse::json(json!(response)))
165 }
166}
167
168#[derive(Serialize)]
169struct DocumentToolResponse {
170 clean_markdown: String,
171 manifest_id: String,
172 file_metadata: FileMetadata,
173}
174
175#[derive(Serialize)]
176struct FileMetadata {
177 source_kind: String,
178 ocr_mean_confidence: Option<f32>,
179 bundle_version: u32,
180 page_count: Option<u32>,
181}
182
183fn required_string<'a>(args: &'a serde_json::Value, field: &str) -> Result<&'a str, ToolError> {
184 args.get(field)
185 .and_then(|value| value.as_str())
186 .ok_or_else(|| ToolError::InvalidArgs(format!("missing required string field `{field}`")))
187}
188
189fn redact_document_text(text: &str, ctx: &ToolCtx<'_>) -> Result<String, ToolError> {
190 let pipeline = crate::bundle::build_document_pipeline().map_err(map_document_error)?;
191 let clean = pipeline
192 .pseudonymize_with_context(
193 ctx.resources().session(),
194 RawDocument::Text(text.to_string()),
195 ctx.resources().locale_chain(),
196 )
197 .map_err(|err| ToolError::BackendFailure(format!("document pipeline failed: {err}")))?;
198 match clean {
199 CleanDocument::Text(text) => Ok(text),
200 _ => Err(ToolError::BackendFailure(
201 "document pipeline returned non-text output".to_string(),
202 )),
203 }
204}
205
206fn validate_file(path: &Path, max_file_size: u64) -> Result<(), ToolError> {
207 let metadata = std::fs::metadata(path).map_err(|err| map_file_metadata_error(path, err))?;
208 if !metadata.is_file() {
209 return Err(ToolError::InvalidArgs(format!(
210 "path `{}` is not a regular file",
211 path.display()
212 )));
213 }
214 if metadata.len() > max_file_size {
215 return Err(ToolError::LimitExceeded(format!(
216 "file `{}` is {} bytes; configured cap is {} bytes",
217 path.display(),
218 metadata.len(),
219 max_file_size
220 )));
221 }
222 Ok(())
223}
224
225fn map_file_metadata_error(path: &Path, err: std::io::Error) -> ToolError {
226 if err.kind() == std::io::ErrorKind::NotFound {
227 ToolError::NotFound(format!("file `{}` not found", path.display()))
228 } else {
229 ToolError::internal(err)
230 }
231}
232
/// Produces the document response for the file at `path`.
///
/// Steps: detect the input kind, run OCR with a `TesseractBackend`, normalize
/// the OCR text, pseudonymize it, and format the result as markdown. The third
/// element of the tuple returned by `run_ocr` is not used here.
///
/// # Errors
///
/// Detection and OCR failures are converted with `map_document_error`;
/// pseudonymization failures come from `redact_document_text`.
#[cfg(feature = "ocr-tesseract")]
fn read_file_response(path: &Path, ctx: &ToolCtx<'_>) -> Result<DocumentToolResponse, ToolError> {
    let kind = InputKind::detect(path).map_err(map_document_error)?;

    let backend = crate::ocr::TesseractBackend::new();
    let (ocr_result, pdf_page_count, _) =
        crate::bundle::run_ocr(path, kind, &backend).map_err(map_document_error)?;

    let normalized = crate::ocr::normalize_ocr_artifacts(&ocr_result.text);
    let clean_text = redact_document_text(&normalized, ctx)?;

    let file_metadata = FileMetadata {
        source_kind: source_kind(kind).to_string(),
        ocr_mean_confidence: ocr_result.mean_confidence,
        bundle_version: crate::BUNDLE_VERSION,
        // A count that does not fit in `u32` is reported as absent.
        page_count: pdf_page_count.and_then(|count| u32::try_from(count).ok()),
    };

    Ok(DocumentToolResponse {
        clean_markdown: crate::bundle::format_clean_markdown(&clean_text, kind),
        manifest_id: ctx.call_id().to_string(),
        file_metadata,
    })
}
252
253#[cfg(not(feature = "ocr-tesseract"))]
254fn read_file_response(
255 _path: &PathBuf,
256 _ctx: &ToolCtx<'_>,
257) -> Result<DocumentToolResponse, ToolError> {
258 Err(ToolError::BackendUnavailable(
259 "rebuild gaze-document with `--features ocr-tesseract` to enable `gaze_read_file`"
260 .to_string(),
261 ))
262}
263
/// Maps an input kind to the `source_kind` string reported in the response.
///
/// The `"png"` and `"jpeg"` labels from `crate::bundle::kind_label` are both
/// reported as `"image"`; every other label, including `"pdf"`, is returned
/// unchanged.
#[cfg(feature = "ocr-tesseract")]
fn source_kind(kind: InputKind) -> &'static str {
    let label = crate::bundle::kind_label(kind);
    if matches!(label, "png" | "jpeg") {
        "image"
    } else {
        label
    }
}
272
/// Translates a [`DocumentError`] into the [`ToolError`] returned to the caller.
///
/// * missing Tesseract or Pdfium → `BackendUnavailable`, carrying the hint;
/// * Tesseract failure or PDF raster failure → `BackendFailure`;
/// * unsupported input → `InvalidArgs`, naming the path and the reason;
/// * any other variant → `ToolError::internal`.
#[cfg(feature = "ocr-tesseract")]
fn map_document_error(err: DocumentError) -> ToolError {
    match err {
        DocumentError::TesseractNotFound(hint) => ToolError::BackendUnavailable(hint),
        DocumentError::PdfiumNotFound(hint) => ToolError::BackendUnavailable(hint),
        DocumentError::TesseractFailed { status, stderr } => {
            let message = format!("tesseract exited with status {status}: {stderr}");
            ToolError::BackendFailure(message)
        }
        DocumentError::PdfRasterFailed(detail) => ToolError::BackendFailure(detail),
        DocumentError::UnsupportedInput { path, reason } => {
            let message = format!("unsupported input `{}`: {reason}", path.display());
            ToolError::InvalidArgs(message)
        }
        unmapped => ToolError::internal(unmapped),
    }
}
289
290#[cfg(not(feature = "ocr-tesseract"))]
291fn map_document_error(err: crate::DocumentError) -> ToolError {
292 ToolError::internal(err)
293}
294
/// Wraps pseudonymized text in the markdown envelope used for `text` sources.
///
/// The output is a title line, a `Source kind` line and a horizontal rule,
/// followed by `text`. A newline is appended when `text` does not already end
/// with one, so the result always ends with `\n`.
fn format_text_markdown(text: &str) -> String {
    let terminator = if text.ends_with('\n') { "" } else { "\n" };
    [
        "# gaze-document safe text\n\n",
        "Source kind: `text`\n\n",
        "---\n\n",
        text,
        terminator,
    ]
    .concat()
}
306
307fn response_schema() -> serde_json::Value {
308 json!({
309 "type": "object",
310 "properties": {
311 "clean_markdown": { "type": "string" },
312 "manifest_id": { "type": "string" },
313 "file_metadata": {
314 "type": "object",
315 "properties": {
316 "source_kind": { "type": "string" },
317 "ocr_mean_confidence": { "type": ["number", "null"] },
318 "bundle_version": { "type": "integer" },
319 "page_count": { "type": ["integer", "null"] }
320 },
321 "required": [
322 "source_kind",
323 "ocr_mean_confidence",
324 "bundle_version",
325 "page_count"
326 ]
327 }
328 },
329 "required": ["clean_markdown", "manifest_id", "file_metadata"]
330 })
331}
332
#[cfg(test)]
mod tests {
    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::sync::Arc;

    use async_trait::async_trait;
    use gaze_mcp_core::{
        AuthError, AuthHook, BeginCallContext, CallHandle, DispatchError, FailureReason,
        ManifestError, ManifestStore, PiiEnvelope, Principal, SessionIdPolicy, SnapshotRef,
    };
    use serde_json::json;

    use super::*;

    /// Auth hook that admits every agent-tier call and denies the operator tier.
    struct AllowAllAuth;

    #[async_trait]
    impl AuthHook for AllowAllAuth {
        async fn authorize_agent(
            &self,
            _principal: &Principal,
            _tool_name: &str,
        ) -> Result<(), AuthError> {
            Ok(())
        }

        async fn authorize_operator(
            &self,
            _principal: &Principal,
            _tool_name: &str,
        ) -> Result<(), AuthError> {
            Err(AuthError::Denied("operator tier disabled in test".into()))
        }
    }

    /// Manifest store that only counts how often each hook is invoked.
    #[derive(Default)]
    struct RecordingManifest {
        begins: AtomicUsize,
        finishes: AtomicUsize,
        failures: AtomicUsize,
    }

    #[async_trait]
    impl ManifestStore for RecordingManifest {
        async fn begin_call(&self, ctx: BeginCallContext<'_>) -> Result<CallHandle, ManifestError> {
            self.begins.fetch_add(1, Ordering::SeqCst);
            Ok(CallHandle::new(ctx.call_id))
        }

        async fn finish_call(
            &self,
            _handle: CallHandle,
            _snapshot: SnapshotRef,
        ) -> Result<(), ManifestError> {
            self.finishes.fetch_add(1, Ordering::SeqCst);
            Ok(())
        }

        async fn fail_call(
            &self,
            _handle: CallHandle,
            _reason: FailureReason,
        ) -> Result<(), ManifestError> {
            self.failures.fetch_add(1, Ordering::SeqCst);
            Ok(())
        }
    }

    /// Owns everything a `PiiEnvelope` borrows, so tests can dispatch by tool name.
    struct Harness {
        registry: ToolRegistry,
        auth: AllowAllAuth,
        manifest: Arc<RecordingManifest>,
        pipeline: gaze::Pipeline,
        session: gaze::Session,
        session_id_policy: SessionIdPolicy,
    }

    impl Harness {
        /// Harness with both document tools registered under default options.
        fn new() -> Self {
            let mut registry = ToolRegistry::new();
            register_tools(&mut registry, GazeReadOpts::default()).expect("register tools");
            Self::with_registry(registry)
        }

        /// Harness around a caller-prepared registry.
        fn with_registry(registry: ToolRegistry) -> Self {
            Self {
                registry,
                auth: AllowAllAuth,
                manifest: Arc::new(RecordingManifest::default()),
                pipeline: crate::bundle::build_document_pipeline().expect("pipeline"),
                session: gaze::Session::new(gaze::Scope::Ephemeral).expect("session"),
                session_id_policy: SessionIdPolicy::default_strict(),
            }
        }

        /// Dispatches `tool_name` through a fresh envelope and returns the response payload.
        async fn dispatch(
            &self,
            tool_name: &str,
            args: serde_json::Value,
        ) -> Result<serde_json::Value, DispatchError> {
            let envelope = PiiEnvelope::new(
                &self.registry,
                &self.auth,
                self.manifest.as_ref(),
                &self.pipeline,
                &self.session,
                &[gaze::LocaleTag::Global],
                &self.session_id_policy,
            );
            let principal = Principal::new("unit-test");
            let response = envelope
                .dispatch(&principal, tool_name, args, None)
                .await?;
            Ok(response.payload)
        }
    }

    /// Path of the synthetic image fixture shipped with the crate.
    fn fixture_image() -> PathBuf {
        PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("testdata")
            .join("synthetic_image.png")
    }

    /// Asserts that the output carries a pseudonym token of each expected class.
    fn assert_pseudonym_tokens(clean_markdown: &str) {
        for token in [":Email_", ":Name_", ":Custom:phone_"] {
            assert!(clean_markdown.contains(token), "{clean_markdown}");
        }
    }

    /// Asserts that none of the raw fixture values survived pseudonymization.
    fn assert_no_raw_fixture_values(clean_markdown: &str) {
        for raw in ["Jane Doe", "@example.invalid", "555-0142"] {
            assert!(!clean_markdown.contains(raw), "{clean_markdown}");
        }
    }

    #[tokio::test]
    async fn read_text_dispatch_returns_clean_markdown_and_manifest_id() {
        let harness = Harness::new();
        let args = json!({
            "text": "Bill to: Jane Doe\nEmail: jane.doe@example.invalid\nPhone: +1-555-0142"
        });
        let payload = harness
            .dispatch("gaze_read_text", args)
            .await
            .expect("dispatch succeeds");

        let clean_markdown = payload["clean_markdown"].as_str().expect("clean markdown");
        assert_pseudonym_tokens(clean_markdown);
        assert_no_raw_fixture_values(clean_markdown);
        assert!(!payload["manifest_id"].as_str().unwrap().is_empty());

        let metadata = &payload["file_metadata"];
        assert_eq!(metadata["source_kind"], "text");
        assert_eq!(metadata["ocr_mean_confidence"], serde_json::Value::Null);

        assert_eq!(harness.manifest.begins.load(Ordering::SeqCst), 1);
        assert_eq!(harness.manifest.finishes.load(Ordering::SeqCst), 1);
    }

    #[tokio::test]
    async fn read_file_missing_path_fails_closed_as_not_found() {
        let harness = Harness::new();
        let args = json!({ "path": "testdata/does-not-exist.png" });
        let err = harness
            .dispatch("gaze_read_file", args)
            .await
            .expect_err("missing file fails");

        match err {
            DispatchError::ToolError(ToolError::NotFound(message)) => {
                assert!(message.contains("not found"));
            }
            other => panic!("unexpected error: {other:?}"),
        }
        assert_eq!(harness.manifest.failures.load(Ordering::SeqCst), 1);
    }

    #[tokio::test]
    async fn read_file_limit_fails_closed_before_ocr() {
        // A one-byte cap makes the fixture oversized.
        let mut registry = ToolRegistry::new();
        registry
            .register(GazeReadFile::with_max_file_size(1))
            .expect("register file tool");
        let harness = Harness::with_registry(registry);

        let err = harness
            .dispatch("gaze_read_file", json!({ "path": fixture_image() }))
            .await
            .expect_err("oversized file fails");

        match err {
            DispatchError::ToolError(ToolError::LimitExceeded(message)) => {
                assert!(message.contains("configured cap is 1 bytes"));
            }
            other => panic!("unexpected error: {other:?}"),
        }
    }

    #[cfg(feature = "ocr-tesseract")]
    #[tokio::test]
    async fn read_file_dispatch_returns_clean_markdown_for_fixture_when_backend_available() {
        let harness = Harness::new();
        let outcome = harness
            .dispatch("gaze_read_file", json!({ "path": fixture_image() }))
            .await;

        // An unavailable backend skips the test instead of failing it.
        let payload = match outcome {
            Ok(payload) => payload,
            Err(DispatchError::ToolError(ToolError::BackendUnavailable(message))) => {
                eprintln!("SKIP: document backend unavailable: {message}");
                return;
            }
            Err(other) => panic!("unexpected dispatch error: {other:?}"),
        };

        let clean_markdown = payload["clean_markdown"].as_str().expect("clean markdown");
        assert_pseudonym_tokens(clean_markdown);
        assert_no_raw_fixture_values(clean_markdown);
        assert_eq!(payload["file_metadata"]["source_kind"], "image");
        assert!(!payload["manifest_id"].as_str().unwrap().is_empty());
    }
}