Skip to main content

agentzero_tools/
pdf_read.rs

1use agentzero_core::{Tool, ToolContext, ToolResult};
2use anyhow::{anyhow, Context};
3use async_trait::async_trait;
4use serde::Deserialize;
5use std::path::{Component, Path, PathBuf};
6use std::process::Stdio;
7use tokio::io::AsyncReadExt;
8use tokio::process::Command;
9
/// Upper bound, in bytes, on how much of a single output stream `read_limited`
/// keeps; anything beyond it is cut off and a truncation marker is appended.
const MAX_OUTPUT_BYTES: usize = 256 * 1024;
11
12#[derive(Debug, Deserialize)]
13struct PdfReadInput {
14    path: String,
15    #[serde(default)]
16    page_start: Option<usize>,
17    #[serde(default)]
18    page_end: Option<usize>,
19}
20
/// Stateless tool that extracts text from PDF files via the external
/// `pdftotext` program.
#[derive(Clone, Copy, Debug, Default)]
pub struct PdfReadTool;
23
24impl PdfReadTool {
25    fn resolve_path(input_path: &str, workspace_root: &str) -> anyhow::Result<PathBuf> {
26        if input_path.trim().is_empty() {
27            return Err(anyhow!("path is required"));
28        }
29        let relative = Path::new(input_path);
30        if relative.is_absolute() {
31            return Err(anyhow!("absolute paths are not allowed"));
32        }
33        if relative
34            .components()
35            .any(|c| matches!(c, Component::ParentDir))
36        {
37            return Err(anyhow!("path traversal is not allowed"));
38        }
39        let joined = Path::new(workspace_root).join(relative);
40        let canonical_root = Path::new(workspace_root)
41            .canonicalize()
42            .context("unable to resolve workspace root")?;
43        let canonical = joined
44            .canonicalize()
45            .with_context(|| format!("file not found: {input_path}"))?;
46        if !canonical.starts_with(&canonical_root) {
47            return Err(anyhow!("path is outside workspace"));
48        }
49        Ok(canonical)
50    }
51}
52
53#[async_trait]
54impl Tool for PdfReadTool {
55    fn name(&self) -> &'static str {
56        "pdf_read"
57    }
58
59    fn description(&self) -> &'static str {
60        "Extract text content from a PDF file."
61    }
62
63    fn input_schema(&self) -> Option<serde_json::Value> {
64        Some(serde_json::json!({
65            "type": "object",
66            "properties": {
67                "path": { "type": "string", "description": "Path to the PDF file" }
68            },
69            "required": ["path"]
70        }))
71    }
72
73    async fn execute(&self, input: &str, ctx: &ToolContext) -> anyhow::Result<ToolResult> {
74        let req: PdfReadInput =
75            serde_json::from_str(input).context("pdf_read expects JSON: {\"path\": \"...\"}")?;
76
77        let file_path = Self::resolve_path(&req.path, &ctx.workspace_root)?;
78
79        // Use pdftotext (from poppler-utils) which is commonly available
80        let mut args = vec![file_path.to_string_lossy().to_string()];
81
82        if let Some(start) = req.page_start {
83            args.push("-f".to_string());
84            args.push(start.to_string());
85        }
86        if let Some(end) = req.page_end {
87            args.push("-l".to_string());
88            args.push(end.to_string());
89        }
90        args.push("-".to_string()); // output to stdout
91
92        let mut child = Command::new("pdftotext")
93            .args(&args)
94            .stdout(Stdio::piped())
95            .stderr(Stdio::piped())
96            .spawn()
97            .context("failed to spawn pdftotext — is poppler-utils installed?")?;
98
99        let stdout_handle = child
100            .stdout
101            .take()
102            .context("stdout not piped on spawned child")?;
103        let stderr_handle = child
104            .stderr
105            .take()
106            .context("stderr not piped on spawned child")?;
107
108        let stdout_task = tokio::spawn(read_limited(stdout_handle));
109        let stderr_task = tokio::spawn(read_limited(stderr_handle));
110
111        let status = child.wait().await.context("pdftotext command failed")?;
112        let stdout = stdout_task.await.context("stdout join")??;
113        let stderr = stderr_task.await.context("stderr join")??;
114
115        if !status.success() {
116            let mut msg = format!("pdftotext exited with code {}", status.code().unwrap_or(-1));
117            if !stderr.is_empty() {
118                msg.push_str(": ");
119                msg.push_str(&stderr);
120            }
121            return Err(anyhow!(msg));
122        }
123
124        if stdout.is_empty() {
125            Ok(ToolResult {
126                output: "(no text content extracted)".to_string(),
127            })
128        } else {
129            Ok(ToolResult { output: stdout })
130        }
131    }
132}
133
134async fn read_limited<R: tokio::io::AsyncRead + Unpin>(mut reader: R) -> anyhow::Result<String> {
135    let mut buf = Vec::new();
136    let mut limited = (&mut reader).take((MAX_OUTPUT_BYTES + 1) as u64);
137    limited.read_to_end(&mut buf).await?;
138    let truncated = buf.len() > MAX_OUTPUT_BYTES;
139    if truncated {
140        buf.truncate(MAX_OUTPUT_BYTES);
141    }
142    let mut s = String::from_utf8_lossy(&buf).to_string();
143    if truncated {
144        s.push_str(&format!("\n<truncated at {} bytes>", MAX_OUTPUT_BYTES));
145    }
146    Ok(s)
147}
148
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Per-process sequence number that keeps directory names distinct even
    /// when two tests read the same clock value.
    static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0);

    /// Creates a new, uniquely named directory under the system temp dir.
    fn temp_dir() -> PathBuf {
        let since_epoch = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock");
        let nanos = since_epoch.as_nanos();
        let seq = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed);
        let name = format!(
            "agentzero-pdf-read-{}-{nanos}-{seq}",
            std::process::id()
        );

        let mut dir = std::env::temp_dir();
        dir.push(name);
        fs::create_dir_all(&dir).expect("temp dir should be created");
        dir
    }

    /// Runs the tool with `input` against `workspace` and returns the error it
    /// produced, panicking with `why` if the call unexpectedly succeeds.
    async fn run_expecting_error(workspace: &Path, input: &str, why: &str) -> anyhow::Error {
        let ctx = ToolContext::new(workspace.to_string_lossy().to_string());
        let tool = PdfReadTool;
        let outcome = tool.execute(input, &ctx).await;
        outcome.expect_err(why)
    }

    #[tokio::test]
    async fn pdf_read_rejects_path_traversal() {
        let workspace = temp_dir();
        let err = run_expecting_error(
            &workspace,
            r#"{"path": "../escape.pdf"}"#,
            "path traversal should fail",
        )
        .await;
        assert!(err.to_string().contains("path traversal"));
        fs::remove_dir_all(workspace).ok();
    }

    #[tokio::test]
    async fn pdf_read_rejects_empty_path() {
        let workspace = temp_dir();
        let err = run_expecting_error(&workspace, r#"{"path": ""}"#, "empty path should fail").await;
        assert!(err.to_string().contains("path is required"));
        fs::remove_dir_all(workspace).ok();
    }

    #[tokio::test]
    async fn pdf_read_rejects_nonexistent_file() {
        let workspace = temp_dir();
        let err = run_expecting_error(
            &workspace,
            r#"{"path": "nonexistent.pdf"}"#,
            "nonexistent file should fail",
        )
        .await;
        assert!(err.to_string().contains("not found"));
        fs::remove_dir_all(workspace).ok();
    }
}