spec_ai_core/tools/builtin/
file_extract.rs1use crate::tools::{Tool, ToolResult};
2use anyhow::{anyhow, Context, Result};
3use async_trait::async_trait;
4use extractous::Extractor;
5use serde::{Deserialize, Serialize};
6use serde_json::Value;
7use std::collections::HashMap;
8use std::fs;
9use std::path::PathBuf;
10
11#[derive(Debug, Deserialize)]
13struct FileExtractArgs {
14 path: String,
15 #[serde(default)]
16 include_metadata: bool,
17 #[serde(default)]
18 xml_output: bool,
19 #[serde(default)]
20 max_chars: Option<i32>,
21}
22
23#[derive(Debug, Serialize)]
25struct FileExtractOutput {
26 path: String,
27 content: String,
28 metadata: Option<HashMap<String, Vec<String>>>,
29}
30
/// Stateless tool that extracts content (and optionally metadata) from a file.
///
/// The type carries no fields; all inputs arrive through the JSON arguments
/// passed to its `Tool::execute` implementation.
#[derive(Debug)]
pub struct FileExtractTool;
33
34impl Default for FileExtractTool {
35 fn default() -> Self {
36 Self::new()
37 }
38}
39
40impl FileExtractTool {
41 pub fn new() -> Self {
42 Self
43 }
44
45 fn normalize_path(&self, input: &str) -> Result<PathBuf> {
46 let trimmed = input.trim();
47 if trimmed.is_empty() {
48 return Err(anyhow!("file_extract requires a valid path"));
49 }
50 Ok(PathBuf::from(trimmed))
51 }
52}
53
54#[async_trait]
55impl Tool for FileExtractTool {
56 fn name(&self) -> &str {
57 "file_extract"
58 }
59
60 fn description(&self) -> &str {
61 "Extracts text metadata from files regardless of format (PDF, Office, HTML, etc.)"
62 }
63
64 fn parameters(&self) -> Value {
65 serde_json::json!({
66 "type": "object",
67 "properties": {
68 "path": {
69 "type": "string",
70 "description": "Relative or absolute path to the file that should be extracted"
71 },
72 "include_metadata": {
73 "type": "boolean",
74 "description": "Include metadata returned by Extractous",
75 "default": false
76 },
77 "xml_output": {
78 "type": "boolean",
79 "description": "Request XML formatted result instead of plain text",
80 "default": false
81 },
82 "max_chars": {
83 "type": "integer",
84 "description": "Limit the number of characters returned (must be > 0 if provided)",
85 "minimum": 1
86 }
87 },
88 "required": ["path"]
89 })
90 }
91
92 async fn execute(&self, args: Value) -> Result<ToolResult> {
93 let args: FileExtractArgs =
94 serde_json::from_value(args).context("Failed to parse file_extract arguments")?;
95
96 let path = self.normalize_path(&args.path)?;
97 let metadata =
98 fs::metadata(&path).with_context(|| format!("File not found: {}", path.display()))?;
99
100 if !metadata.is_file() {
101 return Ok(ToolResult::failure(format!(
102 "{} is not a regular file",
103 path.display()
104 )));
105 }
106
107 let mut extractor = Extractor::new();
108 if let Some(max_chars) = args.max_chars {
109 if max_chars <= 0 {
110 return Ok(ToolResult::failure(
111 "max_chars must be greater than zero".to_string(),
112 ));
113 }
114 extractor = extractor.set_extract_string_max_length(max_chars);
115 }
116
117 if args.xml_output {
118 extractor = extractor.set_xml_output(true);
119 }
120
121 let display_path = path.to_string_lossy().into_owned();
122 let (content, extracted_metadata) = extractor
123 .extract_file_to_string(&display_path)
124 .map_err(|err| anyhow!("Failed to extract {}: {}", display_path, err))?;
125
126 let metadata = if args.include_metadata {
127 Some(extracted_metadata)
128 } else {
129 None
130 };
131
132 let output = FileExtractOutput {
133 path: display_path,
134 content,
135 metadata,
136 };
137
138 Ok(ToolResult::success(
139 serde_json::to_string(&output).context("Failed to serialize file_extract output")?,
140 ))
141 }
142}
143
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;

    /// The tool reports its fixed name and a description mentioning text extraction.
    #[tokio::test]
    async fn name_and_description() {
        let extract_tool = FileExtractTool::new();
        let (name, description) = (extract_tool.name(), extract_tool.description());
        assert_eq!(name, "file_extract");
        assert!(description.contains("Extracts text"));
    }

    /// The parameter schema lists `path` among its required properties.
    #[tokio::test]
    async fn parameters_require_path() {
        let extract_tool = FileExtractTool::new();
        let schema = extract_tool.parameters();
        let required_fields = schema["required"].as_array().unwrap();
        let lists_path = required_fields
            .iter()
            .any(|field| field.as_str() == Some("path"));
        assert!(lists_path);
    }

    /// A `max_chars` of zero yields a failure result rather than an error.
    #[tokio::test]
    async fn invalid_max_chars_returns_failure() {
        let extract_tool = FileExtractTool::new();
        // A real file is needed so the request gets past the regular-file check.
        let scratch = NamedTempFile::new().unwrap();
        let file_path = scratch.path().to_string_lossy().into_owned();
        let request = serde_json::json!({
            "path": file_path,
            "max_chars": 0
        });

        let outcome = extract_tool.execute(request).await.unwrap();
        assert!(!outcome.success);
        assert_eq!(outcome.error.unwrap(), "max_chars must be greater than zero");
    }
}