agentzero_tools/
docx_read.rs1use agentzero_core::{Tool, ToolContext, ToolResult};
2use anyhow::{anyhow, Context};
3use async_trait::async_trait;
4use serde::Deserialize;
5use std::io::Read;
6use std::path::{Component, Path, PathBuf};
7
8const MAX_OUTPUT_BYTES: usize = 256 * 1024;
9
10#[derive(Debug, Deserialize)]
11struct DocxReadInput {
12 path: String,
13}
14
/// Tool that extracts the text content of a DOCX (Word) file.
///
/// This is a field-less unit struct; all behaviour lives in its `impl` blocks.
#[derive(Clone, Copy, Debug, Default)]
pub struct DocxReadTool;
17
18impl DocxReadTool {
19 fn resolve_path(input_path: &str, workspace_root: &str) -> anyhow::Result<PathBuf> {
20 if input_path.trim().is_empty() {
21 return Err(anyhow!("path is required"));
22 }
23 let relative = Path::new(input_path);
24 if relative.is_absolute() {
25 return Err(anyhow!("absolute paths are not allowed"));
26 }
27 if relative
28 .components()
29 .any(|c| matches!(c, Component::ParentDir))
30 {
31 return Err(anyhow!("path traversal is not allowed"));
32 }
33 let joined = Path::new(workspace_root).join(relative);
34 let canonical_root = Path::new(workspace_root)
35 .canonicalize()
36 .context("unable to resolve workspace root")?;
37 let canonical = joined
38 .canonicalize()
39 .with_context(|| format!("file not found: {input_path}"))?;
40 if !canonical.starts_with(&canonical_root) {
41 return Err(anyhow!("path is outside workspace"));
42 }
43 Ok(canonical)
44 }
45
46 fn extract_text_from_docx(path: &Path) -> anyhow::Result<String> {
47 let file = std::fs::File::open(path)
48 .with_context(|| format!("failed to open file: {}", path.display()))?;
49 let mut archive =
50 zip::ZipArchive::new(file).context("file is not a valid DOCX (ZIP) archive")?;
51
52 let mut document_xml = match archive.by_name("word/document.xml") {
53 Ok(f) => f,
54 Err(_) => return Err(anyhow!("word/document.xml not found in DOCX")),
55 };
56
57 let mut xml = String::new();
58 document_xml
59 .read_to_string(&mut xml)
60 .context("failed to read document.xml")?;
61
62 let mut text = String::new();
64 let mut in_paragraph = false;
65 let mut paragraph_has_text = false;
66
67 for token in xml.split('<') {
68 if token.is_empty() {
69 continue;
70 }
71 let (tag_part, rest) = token.split_once('>').unwrap_or((token, ""));
72
73 if tag_part.starts_with("w:p ") || tag_part == "w:p" {
74 if in_paragraph && paragraph_has_text {
75 text.push('\n');
76 }
77 in_paragraph = true;
78 paragraph_has_text = false;
79 } else if tag_part == "/w:p" {
80 if in_paragraph && paragraph_has_text {
81 text.push('\n');
82 }
83 in_paragraph = false;
84 paragraph_has_text = false;
85 }
86
87 if tag_part.starts_with("w:t") && !rest.is_empty() {
88 text.push_str(rest);
89 paragraph_has_text = true;
90 }
91 }
92
93 if text.len() > MAX_OUTPUT_BYTES {
94 text.truncate(MAX_OUTPUT_BYTES);
95 text.push_str(&format!("\n<truncated at {} bytes>", MAX_OUTPUT_BYTES));
96 }
97
98 Ok(text)
99 }
100}
101
102#[async_trait]
103impl Tool for DocxReadTool {
104 fn name(&self) -> &'static str {
105 "docx_read"
106 }
107
108 fn description(&self) -> &'static str {
109 "Extract text content from a DOCX (Word) file."
110 }
111
112 fn input_schema(&self) -> Option<serde_json::Value> {
113 Some(serde_json::json!({
114 "type": "object",
115 "properties": {
116 "path": { "type": "string", "description": "Path to the DOCX file" }
117 },
118 "required": ["path"]
119 }))
120 }
121
122 async fn execute(&self, input: &str, ctx: &ToolContext) -> anyhow::Result<ToolResult> {
123 let req: DocxReadInput =
124 serde_json::from_str(input).context("docx_read expects JSON: {\"path\": \"...\"}")?;
125
126 let file_path = Self::resolve_path(&req.path, &ctx.workspace_root)?;
127
128 let output = tokio::task::spawn_blocking(move || Self::extract_text_from_docx(&file_path))
129 .await
130 .context("docx extraction task panicked")??;
131
132 if output.trim().is_empty() {
133 Ok(ToolResult {
134 output: "(no text content extracted)".to_string(),
135 })
136 } else {
137 Ok(ToolResult { output })
138 }
139 }
140}
141
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::io::Write;
    use std::sync::atomic::{AtomicU64, Ordering};
    use std::time::{SystemTime, UNIX_EPOCH};

    /// Per-process sequence number that keeps temp directory names distinct.
    static TEMP_COUNTER: AtomicU64 = AtomicU64::new(0);

    /// Creates a fresh directory under the system temp dir, named from the process
    /// id, the current time in nanoseconds, and a sequence number.
    fn temp_dir() -> PathBuf {
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("clock")
            .as_nanos();
        let seq = TEMP_COUNTER.fetch_add(1, Ordering::Relaxed);
        let dir_name = format!(
            "agentzero-docx-read-{}-{nanos}-{seq}",
            std::process::id()
        );
        let dir = std::env::temp_dir().join(dir_name);
        fs::create_dir_all(&dir).expect("temp dir should be created");
        dir
    }

    /// Writes a DOCX-shaped ZIP at `dir/filename` whose `word/document.xml` holds a
    /// single paragraph containing `text`, and returns the path of the new file.
    fn create_minimal_docx(dir: &Path, filename: &str, text: &str) -> PathBuf {
        let docx_path = dir.join(filename);
        let mut writer = zip::ZipWriter::new(fs::File::create(&docx_path).unwrap());
        let stored = zip::write::SimpleFileOptions::default()
            .compression_method(zip::CompressionMethod::Stored);
        writer.start_file("word/document.xml", stored).unwrap();
        let body = format!(
            r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>{text}</w:t></w:r></w:p></w:body></w:document>"#
        );
        writer.write_all(body.as_bytes()).unwrap();
        writer.finish().unwrap();
        docx_path
    }

    /// Builds a `ToolContext` whose workspace root is `dir`.
    fn context_for(dir: &Path) -> ToolContext {
        ToolContext::new(dir.to_string_lossy().to_string())
    }

    #[tokio::test]
    async fn docx_read_extracts_text() {
        let workspace = temp_dir();
        create_minimal_docx(&workspace, "test.docx", "Hello from DOCX");

        let outcome = DocxReadTool
            .execute(r#"{"path": "test.docx"}"#, &context_for(&workspace))
            .await
            .expect("should extract text");

        assert!(outcome.output.contains("Hello from DOCX"));
        fs::remove_dir_all(workspace).ok();
    }

    #[tokio::test]
    async fn docx_read_rejects_path_traversal() {
        let workspace = temp_dir();

        let failure = DocxReadTool
            .execute(r#"{"path": "../escape.docx"}"#, &context_for(&workspace))
            .await
            .expect_err("path traversal should fail");

        assert!(failure.to_string().contains("path traversal"));
        fs::remove_dir_all(workspace).ok();
    }

    #[tokio::test]
    async fn docx_read_rejects_non_docx_file() {
        let workspace = temp_dir();
        fs::write(workspace.join("test.txt"), "plain text").unwrap();

        let failure = DocxReadTool
            .execute(r#"{"path": "test.txt"}"#, &context_for(&workspace))
            .await
            .expect_err("non-docx should fail");

        assert!(failure.to_string().contains("not a valid DOCX"));
        fs::remove_dir_all(workspace).ok();
    }
}