1use super::traits::{Tool, ToolResult};
2use crate::security::SecurityPolicy;
3use async_trait::async_trait;
4use serde_json::json;
5use std::sync::Arc;
6
/// Largest on-disk PDF size, in bytes, that the tool will read (50 MiB).
const MAX_PDF_BYTES: u64 = 50 * 1024 * 1024;
/// Character cap applied to the returned text when the caller omits `max_chars`.
const DEFAULT_MAX_CHARS: usize = 50_000;
/// Upper bound for a caller-supplied `max_chars`; larger requests are clamped to this value.
const MAX_OUTPUT_CHARS: usize = 200_000;
13
/// Tool that extracts plain text from a PDF file, subject to a shared [`SecurityPolicy`].
pub struct PdfReadTool {
    /// Policy consulted for rate limiting and path checks before any file is read.
    security: Arc<SecurityPolicy>,
}
24
25impl PdfReadTool {
26 pub fn new(security: Arc<SecurityPolicy>) -> Self {
27 Self { security }
28 }
29}
30
31#[async_trait]
32impl Tool for PdfReadTool {
33 fn name(&self) -> &str {
34 "pdf_read"
35 }
36
37 fn description(&self) -> &str {
38 "Extract plain text from a PDF file in the workspace. \
39 Returns all readable text. Image-only or encrypted PDFs return an empty result. \
40 Requires the 'rag-pdf' build feature."
41 }
42
43 fn parameters_schema(&self) -> serde_json::Value {
44 json!({
45 "type": "object",
46 "properties": {
47 "path": {
48 "type": "string",
49 "description": "Path to the PDF file. Relative paths resolve from workspace; outside paths require policy allowlist."
50 },
51 "max_chars": {
52 "type": "integer",
53 "description": "Maximum characters to return (default: 50000, max: 200000)",
54 "minimum": 1,
55 "maximum": 200_000
56 }
57 },
58 "required": ["path"]
59 })
60 }
61
62 async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult> {
63 let path = args
64 .get("path")
65 .and_then(|v| v.as_str())
66 .ok_or_else(|| anyhow::anyhow!("Missing 'path' parameter"))?;
67
68 let max_chars = args
69 .get("max_chars")
70 .and_then(|v| v.as_u64())
71 .map(|n| {
72 usize::try_from(n)
73 .unwrap_or(MAX_OUTPUT_CHARS)
74 .min(MAX_OUTPUT_CHARS)
75 })
76 .unwrap_or(DEFAULT_MAX_CHARS);
77
78 if self.security.is_rate_limited() {
79 return Ok(ToolResult {
80 success: false,
81 output: String::new(),
82 error: Some("Rate limit exceeded: too many actions in the last hour".into()),
83 });
84 }
85
86 if !self.security.is_path_allowed(path) {
87 return Ok(ToolResult {
88 success: false,
89 output: String::new(),
90 error: Some(format!("Path not allowed by security policy: {path}")),
91 });
92 }
93
94 if !self.security.record_action() {
96 return Ok(ToolResult {
97 success: false,
98 output: String::new(),
99 error: Some("Rate limit exceeded: action budget exhausted".into()),
100 });
101 }
102
103 let full_path = self.security.resolve_tool_path(path);
104
105 let resolved_path = match tokio::fs::canonicalize(&full_path).await {
106 Ok(p) => p,
107 Err(e) => {
108 return Ok(ToolResult {
109 success: false,
110 output: String::new(),
111 error: Some(format!("Failed to resolve file path: {e}")),
112 });
113 }
114 };
115
116 if !self.security.is_resolved_path_allowed(&resolved_path) {
117 return Ok(ToolResult {
118 success: false,
119 output: String::new(),
120 error: Some(
121 self.security
122 .resolved_path_violation_message(&resolved_path),
123 ),
124 });
125 }
126
127 tracing::debug!("Reading PDF: {}", resolved_path.display());
128
129 match tokio::fs::metadata(&resolved_path).await {
130 Ok(meta) => {
131 if meta.len() > MAX_PDF_BYTES {
132 return Ok(ToolResult {
133 success: false,
134 output: String::new(),
135 error: Some(format!(
136 "PDF too large: {} bytes (limit: {MAX_PDF_BYTES} bytes)",
137 meta.len()
138 )),
139 });
140 }
141 }
142 Err(e) => {
143 return Ok(ToolResult {
144 success: false,
145 output: String::new(),
146 error: Some(format!("Failed to read file metadata: {e}")),
147 });
148 }
149 }
150
151 let bytes = match tokio::fs::read(&resolved_path).await {
152 Ok(b) => b,
153 Err(e) => {
154 return Ok(ToolResult {
155 success: false,
156 output: String::new(),
157 error: Some(format!("Failed to read PDF file: {e}")),
158 });
159 }
160 };
161
162 #[cfg(feature = "rag-pdf")]
164 {
165 let text = match tokio::task::spawn_blocking(move || {
166 pdf_extract::extract_text_from_mem(&bytes)
167 })
168 .await
169 {
170 Ok(Ok(t)) => t,
171 Ok(Err(e)) => {
172 return Ok(ToolResult {
173 success: false,
174 output: String::new(),
175 error: Some(format!("PDF extraction failed: {e}")),
176 });
177 }
178 Err(e) => {
179 return Ok(ToolResult {
180 success: false,
181 output: String::new(),
182 error: Some(format!("PDF extraction task panicked: {e}")),
183 });
184 }
185 };
186
187 if text.trim().is_empty() {
188 return Ok(ToolResult {
189 success: true,
190 output: "PDF contains no extractable text (may be image-only or encrypted)"
193 .into(),
194 error: None,
195 });
196 }
197
198 let output = if text.chars().count() > max_chars {
199 let mut truncated: String = text.chars().take(max_chars).collect();
200 use std::fmt::Write as _;
201 let _ = write!(truncated, "\n\n... [truncated at {max_chars} chars]");
202 truncated
203 } else {
204 text
205 };
206
207 return Ok(ToolResult {
208 success: true,
209 output,
210 error: None,
211 });
212 }
213
214 #[cfg(not(feature = "rag-pdf"))]
215 {
216 let _ = bytes;
217 let _ = max_chars;
218 Ok(ToolResult {
219 success: false,
220 output: String::new(),
221 error: Some(
222 "PDF extraction is not enabled. \
223 Rebuild with: cargo build --features rag-pdf"
224 .into(),
225 ),
226 })
227 }
228 }
229}
230
/// Tests for [`PdfReadTool`]: metadata, argument validation, security gating, rate limiting,
/// and (behind the `rag-pdf` feature) text extraction.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::security::{AutonomyLevel, SecurityPolicy};
    use tempfile::TempDir;

    /// Supervised policy rooted at `workspace`, with every other setting left at its default.
    fn test_security(workspace: std::path::PathBuf) -> Arc<SecurityPolicy> {
        Arc::new(SecurityPolicy {
            autonomy: AutonomyLevel::Supervised,
            workspace_dir: workspace,
            ..SecurityPolicy::default()
        })
    }

    /// Same as [`test_security`] but with an explicit hourly action budget.
    fn test_security_with_limit(
        workspace: std::path::PathBuf,
        max_actions: u32,
    ) -> Arc<SecurityPolicy> {
        Arc::new(SecurityPolicy {
            autonomy: AutonomyLevel::Supervised,
            workspace_dir: workspace,
            max_actions_per_hour: max_actions,
            ..SecurityPolicy::default()
        })
    }

    /// Error message carried by `result`, or `""` when there is none.
    fn error_text(result: &ToolResult) -> &str {
        result.error.as_deref().unwrap_or("")
    }

    #[test]
    fn name_is_pdf_read() {
        let tool = PdfReadTool::new(test_security(std::env::temp_dir()));
        assert_eq!(tool.name(), "pdf_read");
    }

    #[test]
    fn description_not_empty() {
        let tool = PdfReadTool::new(test_security(std::env::temp_dir()));
        assert!(!tool.description().is_empty());
    }

    #[test]
    fn schema_has_path_required() {
        let tool = PdfReadTool::new(test_security(std::env::temp_dir()));
        let schema = tool.parameters_schema();
        assert!(schema["properties"]["path"].is_object());
        assert!(schema["properties"]["max_chars"].is_object());
        let required = schema["required"].as_array().unwrap();
        assert!(required.contains(&json!("path")));
    }

    #[test]
    fn spec_matches_metadata() {
        let tool = PdfReadTool::new(test_security(std::env::temp_dir()));
        let spec = tool.spec();
        assert_eq!(spec.name, "pdf_read");
        assert!(spec.parameters.is_object());
    }

    #[tokio::test]
    async fn missing_path_param_returns_error() {
        let tool = PdfReadTool::new(test_security(std::env::temp_dir()));
        let result = tool.execute(json!({})).await;
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("path"));
    }

    #[tokio::test]
    async fn absolute_path_is_blocked() {
        let tool = PdfReadTool::new(test_security(std::env::temp_dir()));
        let result = tool.execute(json!({"path": "/etc/passwd"})).await.unwrap();
        assert!(!result.success);
        assert!(error_text(&result).contains("not allowed"));
    }

    #[tokio::test]
    async fn path_traversal_is_blocked() {
        let tmp = TempDir::new().unwrap();
        let tool = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
        let result = tool
            .execute(json!({"path": "../../../etc/passwd"}))
            .await
            .unwrap();
        assert!(!result.success);
        assert!(error_text(&result).contains("not allowed"));
    }

    #[tokio::test]
    async fn nonexistent_file_returns_error() {
        let tmp = TempDir::new().unwrap();
        let tool = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
        let result = tool
            .execute(json!({"path": "does_not_exist.pdf"}))
            .await
            .unwrap();
        assert!(!result.success);
        assert!(error_text(&result).contains("Failed to resolve"));
    }

    #[tokio::test]
    async fn rate_limit_blocks_request() {
        let tmp = TempDir::new().unwrap();
        let tool = PdfReadTool::new(test_security_with_limit(tmp.path().to_path_buf(), 0));
        let result = tool.execute(json!({"path": "any.pdf"})).await.unwrap();
        assert!(!result.success);
        assert!(error_text(&result).contains("Rate limit"));
    }

    #[tokio::test]
    async fn probing_nonexistent_consumes_rate_limit_budget() {
        let tmp = TempDir::new().unwrap();
        let tool = PdfReadTool::new(test_security_with_limit(tmp.path().to_path_buf(), 2));

        // The first two lookups fit the budget and fail only because the files are missing.
        for name in ["a.pdf", "b.pdf"] {
            let result = tool.execute(json!({"path": name})).await.unwrap();
            assert!(!result.success);
            assert!(error_text(&result).contains("Failed to resolve"));
        }

        // The third lookup must be rejected by the rate limiter.
        let r3 = tool.execute(json!({"path": "c.pdf"})).await.unwrap();
        assert!(!r3.success);
        assert!(
            error_text(&r3).contains("Rate limit"),
            "expected rate limit, got: {:?}",
            r3.error
        );
    }

    #[cfg(unix)]
    #[tokio::test]
    async fn symlink_escape_is_blocked() {
        use std::os::unix::fs::symlink;

        let root = TempDir::new().unwrap();
        let workspace = root.path().join("workspace");
        let outside = root.path().join("outside");
        tokio::fs::create_dir_all(&workspace).await.unwrap();
        tokio::fs::create_dir_all(&outside).await.unwrap();
        tokio::fs::write(outside.join("secret.pdf"), b"%PDF-1.4 secret")
            .await
            .unwrap();
        symlink(outside.join("secret.pdf"), workspace.join("link.pdf")).unwrap();

        let tool = PdfReadTool::new(test_security(workspace));
        let result = tool.execute(json!({"path": "link.pdf"})).await.unwrap();
        assert!(!result.success);
        assert!(error_text(&result).contains("escapes workspace"));
    }

    /// Extraction tests; compiled only when the `rag-pdf` feature is enabled.
    #[cfg(feature = "rag-pdf")]
    mod extraction {
        use super::*;

        /// Hand-written single-page PDF whose content stream draws the text "Hello PDF".
        fn minimal_pdf_bytes() -> Vec<u8> {
            let body = b"%PDF-1.4\n\
                1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n\
                2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n\
                3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R\
                /Contents 4 0 R/Resources<</Font<</F1 5 0 R>>>>>>endobj\n\
                4 0 obj<</Length 44>>\nstream\n\
                BT /F1 12 Tf 72 720 Td (Hello PDF) Tj ET\n\
                endstream\nendobj\n\
                5 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj\n";

            // The cross-reference table starts right after the body.
            let xref_offset = body.len();

            let xref = format!(
                "xref\n0 6\n\
                 0000000000 65535 f \n\
                 0000000009 00000 n \n\
                 0000000058 00000 n \n\
                 0000000115 00000 n \n\
                 0000000274 00000 n \n\
                 0000000370 00000 n \n\
                 trailer<</Size 6/Root 1 0 R>>\n\
                 startxref\n{xref_offset}\n%%EOF\n"
            );

            let mut pdf = body.to_vec();
            pdf.extend_from_slice(xref.as_bytes());
            pdf
        }

        #[tokio::test]
        async fn extracts_text_from_valid_pdf() {
            let tmp = TempDir::new().unwrap();
            let pdf_path = tmp.path().join("test.pdf");
            tokio::fs::write(&pdf_path, minimal_pdf_bytes())
                .await
                .unwrap();

            let tool = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
            let result = tool.execute(json!({"path": "test.pdf"})).await.unwrap();

            // Both extracted text and the "no extractable text" notice are reported with
            // `success == true`; the notice lives in `output`, never in `error`.
            assert!(
                result.success,
                "expected a successful result, got error: {:?}",
                result.error
            );
        }

        #[tokio::test]
        async fn max_chars_truncates_output() {
            let tmp = TempDir::new().unwrap();
            let pdf_path = tmp.path().join("trunc.pdf");
            tokio::fs::write(&pdf_path, minimal_pdf_bytes())
                .await
                .unwrap();

            let tool = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
            let result = tool
                .execute(json!({"path": "trunc.pdf", "max_chars": 5}))
                .await
                .unwrap();

            if result.success && !result.output.is_empty() {
                // Loose upper bound: the capped text plus room for the trailing notice.
                let produced = result.output.chars().count();
                assert!(
                    produced <= 5 + "[truncated".len() + 50,
                    "output longer than expected: {} chars",
                    produced
                );
            }
        }

        #[tokio::test]
        async fn image_only_pdf_returns_empty_text_warning() {
            let tmp = TempDir::new().unwrap();
            // One page whose content stream is empty, so there is no text to extract.
            let empty_content_pdf = b"%PDF-1.4\n\
                1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n\
                2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n\
                3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R\
                /Contents 4 0 R/Resources<<>>>>endobj\n\
                4 0 obj<</Length 0>>\nstream\n\nendstream\nendobj\n\
                xref\n0 5\n\
                0000000000 65535 f \n\
                0000000009 00000 n \n\
                0000000058 00000 n \n\
                0000000115 00000 n \n\
                0000000250 00000 n \n\
                trailer<</Size 5/Root 1 0 R>>\nstartxref\n300\n%%EOF\n";

            tokio::fs::write(tmp.path().join("empty.pdf"), empty_content_pdf)
                .await
                .unwrap();

            let tool = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
            let result = tool.execute(json!({"path": "empty.pdf"})).await.unwrap();

            // The fixture is hand-rolled, so a parser rejection is tolerated alongside the
            // expected "no extractable text" notice.
            let is_empty_warning = result.success && result.output.contains("no extractable text");
            let is_extraction_error = !result.success && error_text(&result).contains("extraction");
            let is_resolve_error = !result.success && error_text(&result).contains("Failed");
            assert!(
                is_empty_warning || is_extraction_error || is_resolve_error,
                "unexpected result: success={} error={:?}",
                result.success,
                result.error
            );
        }
    }

    #[cfg(not(feature = "rag-pdf"))]
    #[tokio::test]
    async fn without_feature_returns_clear_error() {
        let tmp = TempDir::new().unwrap();
        let pdf_path = tmp.path().join("doc.pdf");
        tokio::fs::write(&pdf_path, b"%PDF-1.4 fake").await.unwrap();

        let tool = PdfReadTool::new(test_security(tmp.path().to_path_buf()));
        let result = tool.execute(json!({"path": "doc.pdf"})).await.unwrap();
        assert!(!result.success);
        assert!(
            error_text(&result).contains("rag-pdf"),
            "expected feature hint in error, got: {:?}",
            result.error
        );
    }
}