Skip to main content

agentzero_tools/
docx_read.rs

1use agentzero_core::{Tool, ToolContext, ToolResult};
2use anyhow::{anyhow, Context};
3use async_trait::async_trait;
4use serde::Deserialize;
5use std::io::Read;
6use std::path::{Component, Path, PathBuf};
7
/// Upper bound, in bytes, on the text returned by the extractor (256 KiB).
/// Longer output is cut and a truncation marker is appended.
const MAX_OUTPUT_BYTES: usize = 1 << 18;
9
10#[derive(Debug, Deserialize)]
11struct DocxReadInput {
12    path: String,
13}
14
/// Stateless tool that extracts the plain text of a DOCX (Word) document.
#[derive(Clone, Copy, Debug, Default)]
pub struct DocxReadTool;
17
18impl DocxReadTool {
19    fn resolve_path(input_path: &str, workspace_root: &str) -> anyhow::Result<PathBuf> {
20        if input_path.trim().is_empty() {
21            return Err(anyhow!("path is required"));
22        }
23        let relative = Path::new(input_path);
24        if relative.is_absolute() {
25            return Err(anyhow!("absolute paths are not allowed"));
26        }
27        if relative
28            .components()
29            .any(|c| matches!(c, Component::ParentDir))
30        {
31            return Err(anyhow!("path traversal is not allowed"));
32        }
33        let joined = Path::new(workspace_root).join(relative);
34        let canonical_root = Path::new(workspace_root)
35            .canonicalize()
36            .context("unable to resolve workspace root")?;
37        let canonical = joined
38            .canonicalize()
39            .with_context(|| format!("file not found: {input_path}"))?;
40        if !canonical.starts_with(&canonical_root) {
41            return Err(anyhow!("path is outside workspace"));
42        }
43        Ok(canonical)
44    }
45
46    fn extract_text_from_docx(path: &Path) -> anyhow::Result<String> {
47        let file = std::fs::File::open(path)
48            .with_context(|| format!("failed to open file: {}", path.display()))?;
49        let mut archive =
50            zip::ZipArchive::new(file).context("file is not a valid DOCX (ZIP) archive")?;
51
52        let mut document_xml = match archive.by_name("word/document.xml") {
53            Ok(f) => f,
54            Err(_) => return Err(anyhow!("word/document.xml not found in DOCX")),
55        };
56
57        let mut xml = String::new();
58        document_xml
59            .read_to_string(&mut xml)
60            .context("failed to read document.xml")?;
61
62        // Extract text from XML by stripping tags and collecting <w:t> content.
63        let mut text = String::new();
64        let mut in_paragraph = false;
65        let mut paragraph_has_text = false;
66
67        for token in xml.split('<') {
68            if token.is_empty() {
69                continue;
70            }
71            let (tag_part, rest) = token.split_once('>').unwrap_or((token, ""));
72
73            if tag_part.starts_with("w:p ") || tag_part == "w:p" {
74                if in_paragraph && paragraph_has_text {
75                    text.push('\n');
76                }
77                in_paragraph = true;
78                paragraph_has_text = false;
79            } else if tag_part == "/w:p" {
80                if in_paragraph && paragraph_has_text {
81                    text.push('\n');
82                }
83                in_paragraph = false;
84                paragraph_has_text = false;
85            }
86
87            if tag_part.starts_with("w:t") && !rest.is_empty() {
88                text.push_str(rest);
89                paragraph_has_text = true;
90            }
91        }
92
93        if text.len() > MAX_OUTPUT_BYTES {
94            text.truncate(MAX_OUTPUT_BYTES);
95            text.push_str(&format!("\n<truncated at {} bytes>", MAX_OUTPUT_BYTES));
96        }
97
98        Ok(text)
99    }
100}
101
102#[async_trait]
103impl Tool for DocxReadTool {
104    fn name(&self) -> &'static str {
105        "docx_read"
106    }
107
108    fn description(&self) -> &'static str {
109        "Extract text content from a DOCX (Word) file."
110    }
111
112    fn input_schema(&self) -> Option<serde_json::Value> {
113        Some(serde_json::json!({
114            "type": "object",
115            "properties": {
116                "path": { "type": "string", "description": "Path to the DOCX file" }
117            },
118            "required": ["path"]
119        }))
120    }
121
122    async fn execute(&self, input: &str, ctx: &ToolContext) -> anyhow::Result<ToolResult> {
123        let req: DocxReadInput =
124            serde_json::from_str(input).context("docx_read expects JSON: {\"path\": \"...\"}")?;
125
126        let file_path = Self::resolve_path(&req.path, &ctx.workspace_root)?;
127
128        let output = tokio::task::spawn_blocking(move || Self::extract_text_from_docx(&file_path))
129            .await
130            .context("docx extraction task panicked")??;
131
132        if output.trim().is_empty() {
133            Ok(ToolResult {
134                output: "(no text content extracted)".to_string(),
135            })
136        } else {
137            Ok(ToolResult { output })
138        }
139    }
140}
141
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::io::Write;
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Sequence number that keeps directory names distinct within this
    /// process even when two tests read the same clock value.
    static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0);

    /// Creates a fresh scratch directory under the system temp dir, named
    /// from the process id, the current time in nanoseconds and a counter.
    fn temp_dir() -> PathBuf {
        let seq = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed);
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock")
            .as_nanos();
        let pid = std::process::id();
        let name = format!("agentzero-docx-read-{}-{nanos}-{seq}", pid);
        let dir = std::env::temp_dir().join(name);
        fs::create_dir_all(&dir).expect("temp dir should be created");
        dir
    }

    /// Builds a `ToolContext` whose workspace root is `dir`.
    fn context_for(dir: &Path) -> ToolContext {
        ToolContext::new(dir.to_string_lossy().into_owned())
    }

    /// Writes a DOCX containing only `word/document.xml`, with a single
    /// paragraph holding `text`, and returns the path of the new file.
    fn create_minimal_docx(dir: &Path, filename: &str, text: &str) -> PathBuf {
        let path = dir.join(filename);
        let mut writer = zip::ZipWriter::new(fs::File::create(&path).unwrap());
        let stored = zip::write::SimpleFileOptions::default()
            .compression_method(zip::CompressionMethod::Stored);
        writer.start_file("word/document.xml", stored).unwrap();
        let xml = format!(
            r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>{text}</w:t></w:r></w:p></w:body></w:document>"#
        );
        writer.write_all(xml.as_bytes()).unwrap();
        writer.finish().unwrap();
        path
    }

    #[tokio::test]
    async fn docx_read_extracts_text() {
        let dir = temp_dir();
        create_minimal_docx(&dir, "test.docx", "Hello from DOCX");

        let ctx = context_for(&dir);
        let result = DocxReadTool
            .execute(r#"{"path": "test.docx"}"#, &ctx)
            .await
            .expect("should extract text");

        assert!(result.output.contains("Hello from DOCX"));
        fs::remove_dir_all(dir).ok();
    }

    #[tokio::test]
    async fn docx_read_rejects_path_traversal() {
        let dir = temp_dir();

        let ctx = context_for(&dir);
        let err = DocxReadTool
            .execute(r#"{"path": "../escape.docx"}"#, &ctx)
            .await
            .expect_err("path traversal should fail");

        assert!(err.to_string().contains("path traversal"));
        fs::remove_dir_all(dir).ok();
    }

    #[tokio::test]
    async fn docx_read_rejects_non_docx_file() {
        let dir = temp_dir();
        fs::write(dir.join("test.txt"), "plain text").unwrap();

        let ctx = context_for(&dir);
        let err = DocxReadTool
            .execute(r#"{"path": "test.txt"}"#, &ctx)
            .await
            .expect_err("non-docx should fail");

        assert!(err.to_string().contains("not a valid DOCX"));
        fs::remove_dir_all(dir).ok();
    }
}