agentzero_tools/
pdf_read.rs1use agentzero_core::{Tool, ToolContext, ToolResult};
2use anyhow::{anyhow, Context};
3use async_trait::async_trait;
4use serde::Deserialize;
5use std::path::{Component, Path, PathBuf};
6use std::process::Stdio;
7use tokio::io::AsyncReadExt;
8use tokio::process::Command;
9
/// Upper bound, in bytes, on how much of each child output stream
/// (`stdout` / `stderr`) is retained by `read_limited` (256 KiB).
const MAX_OUTPUT_BYTES: usize = 256 * 1024;
11
12#[derive(Debug, Deserialize)]
13struct PdfReadInput {
14 path: String,
15 #[serde(default)]
16 page_start: Option<usize>,
17 #[serde(default)]
18 page_end: Option<usize>,
19}
20
/// Stateless tool that extracts text from a PDF by invoking the external
/// `pdftotext` program.
#[derive(Clone, Copy, Debug, Default)]
pub struct PdfReadTool;
23
24impl PdfReadTool {
25 fn resolve_path(input_path: &str, workspace_root: &str) -> anyhow::Result<PathBuf> {
26 if input_path.trim().is_empty() {
27 return Err(anyhow!("path is required"));
28 }
29 let relative = Path::new(input_path);
30 if relative.is_absolute() {
31 return Err(anyhow!("absolute paths are not allowed"));
32 }
33 if relative
34 .components()
35 .any(|c| matches!(c, Component::ParentDir))
36 {
37 return Err(anyhow!("path traversal is not allowed"));
38 }
39 let joined = Path::new(workspace_root).join(relative);
40 let canonical_root = Path::new(workspace_root)
41 .canonicalize()
42 .context("unable to resolve workspace root")?;
43 let canonical = joined
44 .canonicalize()
45 .with_context(|| format!("file not found: {input_path}"))?;
46 if !canonical.starts_with(&canonical_root) {
47 return Err(anyhow!("path is outside workspace"));
48 }
49 Ok(canonical)
50 }
51}
52
53#[async_trait]
54impl Tool for PdfReadTool {
55 fn name(&self) -> &'static str {
56 "pdf_read"
57 }
58
59 fn description(&self) -> &'static str {
60 "Extract text content from a PDF file."
61 }
62
63 fn input_schema(&self) -> Option<serde_json::Value> {
64 Some(serde_json::json!({
65 "type": "object",
66 "properties": {
67 "path": { "type": "string", "description": "Path to the PDF file" }
68 },
69 "required": ["path"]
70 }))
71 }
72
73 async fn execute(&self, input: &str, ctx: &ToolContext) -> anyhow::Result<ToolResult> {
74 let req: PdfReadInput =
75 serde_json::from_str(input).context("pdf_read expects JSON: {\"path\": \"...\"}")?;
76
77 let file_path = Self::resolve_path(&req.path, &ctx.workspace_root)?;
78
79 let mut args = vec![file_path.to_string_lossy().to_string()];
81
82 if let Some(start) = req.page_start {
83 args.push("-f".to_string());
84 args.push(start.to_string());
85 }
86 if let Some(end) = req.page_end {
87 args.push("-l".to_string());
88 args.push(end.to_string());
89 }
90 args.push("-".to_string()); let mut child = Command::new("pdftotext")
93 .args(&args)
94 .stdout(Stdio::piped())
95 .stderr(Stdio::piped())
96 .spawn()
97 .context("failed to spawn pdftotext — is poppler-utils installed?")?;
98
99 let stdout_handle = child
100 .stdout
101 .take()
102 .context("stdout not piped on spawned child")?;
103 let stderr_handle = child
104 .stderr
105 .take()
106 .context("stderr not piped on spawned child")?;
107
108 let stdout_task = tokio::spawn(read_limited(stdout_handle));
109 let stderr_task = tokio::spawn(read_limited(stderr_handle));
110
111 let status = child.wait().await.context("pdftotext command failed")?;
112 let stdout = stdout_task.await.context("stdout join")??;
113 let stderr = stderr_task.await.context("stderr join")??;
114
115 if !status.success() {
116 let mut msg = format!("pdftotext exited with code {}", status.code().unwrap_or(-1));
117 if !stderr.is_empty() {
118 msg.push_str(": ");
119 msg.push_str(&stderr);
120 }
121 return Err(anyhow!(msg));
122 }
123
124 if stdout.is_empty() {
125 Ok(ToolResult {
126 output: "(no text content extracted)".to_string(),
127 })
128 } else {
129 Ok(ToolResult { output: stdout })
130 }
131 }
132}
133
134async fn read_limited<R: tokio::io::AsyncRead + Unpin>(mut reader: R) -> anyhow::Result<String> {
135 let mut buf = Vec::new();
136 let mut limited = (&mut reader).take((MAX_OUTPUT_BYTES + 1) as u64);
137 limited.read_to_end(&mut buf).await?;
138 let truncated = buf.len() > MAX_OUTPUT_BYTES;
139 if truncated {
140 buf.truncate(MAX_OUTPUT_BYTES);
141 }
142 let mut s = String::from_utf8_lossy(&buf).to_string();
143 if truncated {
144 s.push_str(&format!("\n<truncated at {} bytes>", MAX_OUTPUT_BYTES));
145 }
146 Ok(s)
147}
148
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Monotonic counter mixed into temp directory names so that directories
    /// created within the same nanosecond still get distinct names.
    static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0);

    /// Creates a fresh, uniquely named directory under the system temp dir.
    fn temp_dir() -> PathBuf {
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock")
            .as_nanos();
        let seq = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed);
        let name = format!(
            "agentzero-pdf-read-{}-{nanos}-{seq}",
            std::process::id()
        );
        let dir = std::env::temp_dir().join(name);
        fs::create_dir_all(&dir).expect("temp dir should be created");
        dir
    }

    /// Runs the tool with `input` inside `workspace` and returns the error it
    /// produced, panicking with `why` if the call unexpectedly succeeds.
    async fn run_expecting_error(workspace: &Path, input: &str, why: &str) -> anyhow::Error {
        PdfReadTool
            .execute(
                input,
                &ToolContext::new(workspace.to_string_lossy().to_string()),
            )
            .await
            .expect_err(why)
    }

    #[tokio::test]
    async fn pdf_read_rejects_path_traversal() {
        let dir = temp_dir();
        let err = run_expecting_error(
            &dir,
            r#"{"path": "../escape.pdf"}"#,
            "path traversal should fail",
        )
        .await;
        assert!(err.to_string().contains("path traversal"));
        fs::remove_dir_all(dir).ok();
    }

    #[tokio::test]
    async fn pdf_read_rejects_empty_path() {
        let dir = temp_dir();
        let err = run_expecting_error(&dir, r#"{"path": ""}"#, "empty path should fail").await;
        assert!(err.to_string().contains("path is required"));
        fs::remove_dir_all(dir).ok();
    }

    #[tokio::test]
    async fn pdf_read_rejects_nonexistent_file() {
        let dir = temp_dir();
        let err = run_expecting_error(
            &dir,
            r#"{"path": "nonexistent.pdf"}"#,
            "nonexistent file should fail",
        )
        .await;
        assert!(err.to_string().contains("not found"));
        fs::remove_dir_all(dir).ok();
    }
}