spec_ai_core/tools/builtin/
file_extract.rs

1use crate::tools::{Tool, ToolResult};
2use anyhow::{anyhow, Context, Result};
3use async_trait::async_trait;
4use extractous::Extractor;
5use serde::{Deserialize, Serialize};
6use serde_json::Value;
7use std::collections::HashMap;
8use std::fs;
9use std::path::PathBuf;
10
11/// Arguments accepted by the file_extract tool
12#[derive(Debug, Deserialize)]
13struct FileExtractArgs {
14    path: String,
15    #[serde(default)]
16    include_metadata: bool,
17    #[serde(default)]
18    xml_output: bool,
19    #[serde(default)]
20    max_chars: Option<i32>,
21}
22
23/// Output payload returned by the file_extract tool
24#[derive(Debug, Serialize)]
25struct FileExtractOutput {
26    path: String,
27    content: String,
28    metadata: Option<HashMap<String, Vec<String>>>,
29}
30
/// Tool that uses Extractous to read arbitrary files and return their textual content.
///
/// The type is a unit struct with no fields, so copying it is free and its
/// `Debug` output is simply the type name.
#[derive(Debug, Clone, Copy)]
pub struct FileExtractTool;
33
34impl Default for FileExtractTool {
35    fn default() -> Self {
36        Self::new()
37    }
38}
39
40impl FileExtractTool {
41    pub fn new() -> Self {
42        Self
43    }
44
45    fn normalize_path(&self, input: &str) -> Result<PathBuf> {
46        let trimmed = input.trim();
47        if trimmed.is_empty() {
48            return Err(anyhow!("file_extract requires a valid path"));
49        }
50        Ok(PathBuf::from(trimmed))
51    }
52}
53
54#[async_trait]
55impl Tool for FileExtractTool {
56    fn name(&self) -> &str {
57        "file_extract"
58    }
59
60    fn description(&self) -> &str {
61        "Extracts text metadata from files regardless of format (PDF, Office, HTML, etc.)"
62    }
63
64    fn parameters(&self) -> Value {
65        serde_json::json!({
66            "type": "object",
67            "properties": {
68                "path": {
69                    "type": "string",
70                    "description": "Relative or absolute path to the file that should be extracted"
71                },
72                "include_metadata": {
73                    "type": "boolean",
74                    "description": "Include metadata returned by Extractous",
75                    "default": false
76                },
77                "xml_output": {
78                    "type": "boolean",
79                    "description": "Request XML formatted result instead of plain text",
80                    "default": false
81                },
82                "max_chars": {
83                    "type": "integer",
84                    "description": "Limit the number of characters returned (must be > 0 if provided)",
85                    "minimum": 1
86                }
87            },
88            "required": ["path"]
89        })
90    }
91
92    async fn execute(&self, args: Value) -> Result<ToolResult> {
93        let args: FileExtractArgs =
94            serde_json::from_value(args).context("Failed to parse file_extract arguments")?;
95
96        let path = self.normalize_path(&args.path)?;
97        let metadata =
98            fs::metadata(&path).with_context(|| format!("File not found: {}", path.display()))?;
99
100        if !metadata.is_file() {
101            return Ok(ToolResult::failure(format!(
102                "{} is not a regular file",
103                path.display()
104            )));
105        }
106
107        let mut extractor = Extractor::new();
108        if let Some(max_chars) = args.max_chars {
109            if max_chars <= 0 {
110                return Ok(ToolResult::failure(
111                    "max_chars must be greater than zero".to_string(),
112                ));
113            }
114            extractor = extractor.set_extract_string_max_length(max_chars);
115        }
116
117        if args.xml_output {
118            extractor = extractor.set_xml_output(true);
119        }
120
121        let display_path = path.to_string_lossy().into_owned();
122        let (content, extracted_metadata) = extractor
123            .extract_file_to_string(&display_path)
124            .map_err(|err| anyhow!("Failed to extract {}: {}", display_path, err))?;
125
126        let metadata = if args.include_metadata {
127            Some(extracted_metadata)
128        } else {
129            None
130        };
131
132        let output = FileExtractOutput {
133            path: display_path,
134            content,
135            metadata,
136        };
137
138        Ok(ToolResult::success(
139            serde_json::to_string(&output).context("Failed to serialize file_extract output")?,
140        ))
141    }
142}
143
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::NamedTempFile;

    // Purely synchronous checks use plain `#[test]`; only the test that
    // awaits `execute` needs an async runtime.

    /// The tool reports its fixed name and a description starting with the expected phrase.
    #[test]
    fn name_and_description() {
        let tool = FileExtractTool::new();
        assert_eq!(tool.name(), "file_extract");
        assert!(tool.description().contains("Extracts text"));
    }

    /// The schema lists `path` among the required arguments.
    #[test]
    fn parameters_require_path() {
        let schema = FileExtractTool::new().parameters();
        let required = schema["required"].as_array().unwrap();
        assert!(required.iter().any(|entry| entry.as_str() == Some("path")));
    }

    /// A path consisting only of whitespace is rejected before any I/O happens.
    #[test]
    fn blank_path_is_rejected() {
        let tool = FileExtractTool::new();
        let err = tool.normalize_path("   ").unwrap_err();
        assert_eq!(err.to_string(), "file_extract requires a valid path");
    }

    /// `max_chars: 0` yields a failed result rather than an extraction attempt.
    #[tokio::test]
    async fn invalid_max_chars_returns_failure() {
        let tool = FileExtractTool::new();
        // A real (empty) file is needed so the regular-file check passes first.
        let tmp = NamedTempFile::new().unwrap();
        let args = serde_json::json!({
            "path": tmp.path().to_string_lossy(),
            "max_chars": 0
        });

        let result = tool.execute(args).await.unwrap();
        assert!(!result.success);
        let message = result.error.unwrap();
        assert_eq!(message, "max_chars must be greater than zero");
    }
}