use async_trait::async_trait;
use serde_json::{json, Value};
use std::path::PathBuf;
use crate::error::{Result, ZeptoError};
use crate::security::{revalidate_path, validate_path_in_workspace};
use super::{Tool, ToolContext, ToolOutput};
/// Largest file size, in bytes, that `execute` will hand to the PDF parser
/// (50 * 1024 * 1024); bigger files are rejected with a "PDF too large" error.
const MAX_PDF_BYTES: u64 = 50 * 1024 * 1024;
/// Character limit used by `execute` when the caller supplies no usable `max_chars`.
const DEFAULT_MAX_CHARS: usize = 50_000;
/// Ceiling that `execute` clamps any caller-supplied `max_chars` to.
const HARD_MAX_CHARS: usize = 200_000;
/// Tool that extracts plain text from PDF files located inside a workspace.
///
/// `Debug` and `Clone` are derived so the tool can be logged and duplicated
/// like any other plain-data value; it only holds the workspace path.
#[derive(Debug, Clone)]
pub struct PdfReadTool {
    /// Workspace root that every requested path is validated against.
    workspace: String,
}
impl PdfReadTool {
/// Creates a tool bound to `workspace`; the value is stored as-is and is
/// later passed to the path validation helpers by `resolve_path`.
pub fn new(workspace: String) -> Self {
Self { workspace }
}
/// Validates `path` against the workspace and returns the resolved PDF path.
///
/// Checks run in this order:
/// 1. `validate_path_in_workspace` must accept the path.
/// 2. The extension must be `pdf`, compared ASCII-case-insensitively so that
///    files such as `REPORT.PDF` are accepted as well.
/// 3. `revalidate_path` must accept the validated path.
///    NOTE(review): presumably a second containment check against the
///    workspace — confirm in `crate::security`.
/// 4. The file must exist on disk.
///
/// # Errors
///
/// Propagates any error from the two validation helpers, and returns
/// `ZeptoError::Tool` when the extension is not `pdf` or the file is missing.
pub fn resolve_path(&self, path: &str) -> Result<PathBuf> {
    let safe = validate_path_in_workspace(path, &self.workspace)?;

    // Extensions are frequently upper- or mixed-case; only non-UTF-8 or
    // genuinely different extensions are rejected.
    let extension = safe.as_path().extension().and_then(|ext| ext.to_str());
    let is_pdf = matches!(extension, Some(ext) if ext.eq_ignore_ascii_case("pdf"));
    if !is_pdf {
        return Err(ZeptoError::Tool(
            "Only .pdf files are supported".to_string(),
        ));
    }

    revalidate_path(safe.as_path(), &self.workspace)?;

    if !safe.as_path().exists() {
        return Err(ZeptoError::Tool(format!("File not found: {path}")));
    }
    Ok(safe.into_path_buf())
}
/// Limits `text` to at most `max_chars` Unicode scalar values.
///
/// Text that already fits is returned unchanged. Otherwise the first
/// `max_chars` characters are kept (never splitting a multi-byte character)
/// and a `[TRUNCATED]` marker line is appended.
pub fn truncate_output(text: String, max_chars: usize) -> String {
    // The byte length is an upper bound on the character count, so this cheap
    // comparison lets short text skip the character scan entirely.
    if text.len() <= max_chars {
        return text;
    }

    match text.char_indices().nth(max_chars) {
        // A character exists at index `max_chars`, so the text is too long:
        // cut at the byte offset where that character begins.
        Some((cut, _)) => {
            let mut clipped = String::from(&text[..cut]);
            clipped.push_str("\n[TRUNCATED] — output exceeded max_chars");
            clipped
        }
        // Multi-byte text whose character count is within the limit.
        None => text,
    }
}
/// Loads the PDF at `path` and concatenates the extracted text of each page,
/// one page per line-terminated chunk.
///
/// Extraction is best-effort: a page whose text cannot be extracted is
/// skipped rather than failing the whole document.
///
/// # Errors
///
/// Returns `ZeptoError::Tool` when the document itself cannot be loaded.
#[cfg(feature = "tool-pdf")]
fn extract_text(path: &std::path::Path) -> Result<String> {
    use lopdf::Document;

    let doc = Document::load(path)
        .map_err(|e| ZeptoError::Tool(format!("Failed to load PDF: {e}")))?;

    let mut text = String::new();
    // `Document::extract_text` is addressed by 1-based page *number*, which is
    // the key of `get_pages()`. The object ids produced by `page_iter()` are a
    // different numbering, so feeding them in selects the wrong page or fails.
    for (page_number, _object_id) in doc.get_pages() {
        if let Ok(page_text) = doc.extract_text(&[page_number]) {
            text.push_str(&page_text);
            text.push('\n');
        }
    }
    Ok(text)
}
/// Stand-in compiled when the `tool-pdf` feature is disabled.
///
/// Never touches the file; always returns a `ZeptoError::Tool` explaining how
/// to rebuild with PDF support.
#[cfg(not(feature = "tool-pdf"))]
fn extract_text(_path: &std::path::Path) -> Result<String> {
    let rebuild_hint = String::from(
        "PDF extraction requires the 'tool-pdf' build feature. \
         Rebuild with: cargo build --features tool-pdf",
    );
    Err(ZeptoError::Tool(rebuild_hint))
}
}
#[async_trait]
impl Tool for PdfReadTool {
/// Returns the tool's identifier, the fixed string `pdf_read`.
fn name(&self) -> &str {
"pdf_read"
}
/// Returns the full description of the tool, including the caveat that
/// image-only or encrypted PDFs may yield empty results.
fn description(&self) -> &str {
"Extract plain text from a PDF file in the workspace. \
Returns all readable text content. \
Image-only or encrypted PDFs may return empty results."
}
/// Returns a one-sentence variant of the description.
fn compact_description(&self) -> &str {
"Extract plain text from a workspace PDF file."
}
/// Returns the JSON Schema object describing the tool's arguments:
/// a required string `path` and an optional integer `max_chars` whose
/// advertised default is `DEFAULT_MAX_CHARS`.
fn parameters(&self) -> Value {
    let path_schema = json!({
        "type": "string",
        "description": "Relative path to the PDF file within the workspace"
    });
    let max_chars_schema = json!({
        "type": "integer",
        "description": "Maximum characters to return (default: 50000, max: 200000)",
        "default": DEFAULT_MAX_CHARS
    });

    json!({
        "type": "object",
        "required": ["path"],
        "properties": {
            "path": path_schema,
            "max_chars": max_chars_schema
        }
    })
}
/// Extracts text from the PDF named by `args["path"]`.
///
/// Arguments read from `args`:
/// - `path` (required, non-empty string): resolved through `resolve_path`.
/// - `max_chars` (optional, non-negative integer): character limit for the
///   returned text; falls back to `DEFAULT_MAX_CHARS` when absent or not a
///   non-negative integer, and is clamped to `HARD_MAX_CHARS`.
///
/// Returns the (possibly truncated) text, or a fixed notice when the
/// document contains no text after trimming whitespace.
///
/// # Errors
///
/// Returns `ZeptoError::Tool` when `path` is missing or empty, when the file
/// cannot be stat'ed, when it is larger than `MAX_PDF_BYTES`, or when the
/// blocking extraction task fails to complete. Errors from `resolve_path`
/// and `extract_text` are propagated unchanged.
async fn execute(&self, args: Value, _ctx: &ToolContext) -> Result<ToolOutput> {
    let path_str = match args["path"].as_str() {
        Some(p) if !p.is_empty() => p,
        _ => {
            return Err(ZeptoError::Tool(
                "Missing required argument: path".to_string(),
            ));
        }
    };

    // Convert with saturation instead of `as`: on targets where `usize` is
    // narrower than `u64`, a wrapping cast could turn a huge request into a
    // tiny limit before the clamp below is applied.
    let max_chars = args["max_chars"]
        .as_u64()
        .map_or(DEFAULT_MAX_CHARS, |requested| {
            usize::try_from(requested).unwrap_or(usize::MAX)
        })
        .min(HARD_MAX_CHARS);

    let resolved = self.resolve_path(path_str)?;

    // Reject oversized files before loading them into the parser.
    let meta = tokio::fs::metadata(&resolved)
        .await
        .map_err(|e| ZeptoError::Tool(format!("Cannot stat file: {e}")))?;
    if meta.len() > MAX_PDF_BYTES {
        return Err(ZeptoError::Tool(format!(
            "PDF too large: {} bytes (max {}MB)",
            meta.len(),
            MAX_PDF_BYTES / 1024 / 1024
        )));
    }

    // Parsing is synchronous, so it runs on the blocking thread pool. The
    // first `?` handles the join error, the second the extraction error.
    let text = tokio::task::spawn_blocking(move || Self::extract_text(&resolved))
        .await
        .map_err(|e| ZeptoError::Tool(format!("Task panicked: {e}")))??;

    if text.trim().is_empty() {
        return Ok(ToolOutput::llm_only(
            "No text content found. The PDF may be image-only or encrypted.",
        ));
    }
    Ok(ToolOutput::llm_only(Self::truncate_output(text, max_chars)))
}
}
// Unit tests for path resolution and output truncation. Text extraction
// itself is not exercised here.
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::TempDir;
// Builds a tool rooted at `workspace`.
fn tool(workspace: &str) -> PdfReadTool {
PdfReadTool::new(workspace.to_string())
}
// A `..` traversal out of the temporary workspace must be refused.
#[test]
fn test_rejects_path_outside_workspace() {
let tmp = TempDir::new().unwrap();
let t = tool(tmp.path().to_str().unwrap());
let result = t.resolve_path("../../../etc/passwd");
assert!(result.is_err(), "expected error for path traversal");
}
// An existing file inside the workspace is still refused when its
// extension is not `pdf`, and the error message names the expected type.
#[test]
fn test_rejects_non_pdf_extension() {
let tmp = TempDir::new().unwrap();
let txt_path = tmp.path().join("document.txt");
std::fs::File::create(&txt_path).unwrap();
let t = tool(tmp.path().to_str().unwrap());
let result = t.resolve_path("document.txt");
assert!(result.is_err(), "expected error for non-pdf extension");
let msg = format!("{}", result.unwrap_err());
assert!(msg.contains(".pdf"), "error should mention .pdf: {msg}");
}
// A `.pdf` path that does not exist on disk yields an error whose message
// contains "not found" or "missing".
#[test]
fn test_rejects_missing_file() {
let tmp = TempDir::new().unwrap();
let t = tool(tmp.path().to_str().unwrap());
let result = t.resolve_path("missing.pdf");
assert!(result.is_err(), "expected error for missing file");
let msg = format!("{}", result.unwrap_err());
assert!(
msg.contains("not found") || msg.contains("missing"),
"error should mention missing file: {msg}"
);
}
// An existing `.pdf` file inside the workspace resolves successfully; only
// the header bytes are written because resolution never parses the file.
#[test]
fn test_accepts_valid_pdf_path() {
let tmp = TempDir::new().unwrap();
let pdf_path = tmp.path().join("invoice.pdf");
std::fs::File::create(&pdf_path)
.unwrap()
.write_all(b"%PDF-1.4")
.unwrap();
let t = tool(tmp.path().to_str().unwrap());
let result = t.resolve_path("invoice.pdf");
assert!(
result.is_ok(),
"expected Ok for valid pdf path: {:?}",
result
);
}
// 200_000 ASCII characters cut to 50_000: the result may exceed the limit
// only by the appended marker (a 100-byte allowance) and must contain it.
#[test]
fn test_truncate_output() {
let long = "a".repeat(200_000);
let result = PdfReadTool::truncate_output(long, 50_000);
assert!(
result.len() <= 50_100,
"truncated output too long: {}",
result.len()
);
assert!(
result.contains("[TRUNCATED]"),
"truncated output missing marker"
);
}
// Text below the limit comes back unchanged, with no marker.
#[test]
fn test_truncate_output_short() {
let short = "hello world".to_string();
let result = PdfReadTool::truncate_output(short.clone(), 50_000);
assert_eq!(result, short, "short strings should be returned unchanged");
}
// The limit counts characters, not bytes: with a 3-byte character the body
// before the marker's newline must hold exactly 50_000 characters.
#[test]
fn test_truncate_output_multibyte() {
let long = "日".repeat(100_000);
let result = PdfReadTool::truncate_output(long, 50_000);
assert!(
result.contains("[TRUNCATED]"),
"should contain TRUNCATED marker"
);
let marker_pos = result
.find('\n')
.expect("should have newline before marker");
let body = &result[..marker_pos];
assert_eq!(
body.chars().count(),
50_000,
"body should be exactly max_chars wide"
);
}
}