use argentor_core::{ArgentorResult, ToolCall, ToolResult};
use argentor_skills::skill::{Skill, SkillDescriptor};
use async_trait::async_trait;
use base64::Engine;
use regex::Regex;
use serde_json::json;
#[cfg(test)]
use serde_json::Value;
use crate::zip_reader::{read_central_directory, read_entry_utf8};
pub struct DocxLoaderSkill {
descriptor: SkillDescriptor,
}
impl DocxLoaderSkill {
pub fn new() -> Self {
Self {
descriptor: SkillDescriptor {
name: "docx_loader".to_string(),
description: "DOCX loader: extract_text, extract_paragraphs, extract_tables, count_words. Accepts base64-encoded DOCX bytes or raw document.xml.".to_string(),
parameters_schema: json!({
"type": "object",
"properties": {
"operation": {
"type": "string",
"enum": ["extract_text", "extract_paragraphs", "extract_tables", "count_words"],
"description": "The DOCX operation to perform"
},
"data": {
"type": "string",
"description": "DOCX content: base64-encoded ZIP bytes, or raw document.xml if encoding='xml'"
},
"encoding": {
"type": "string",
"enum": ["base64", "xml"],
"description": "Encoding of 'data'. Default: base64."
}
},
"required": ["operation", "data"]
}),
required_capabilities: vec![],
requires_approval: false,
},
}
}
}
impl Default for DocxLoaderSkill {
fn default() -> Self {
Self::new()
}
}
fn load_document_xml(data: &str, encoding: Option<&str>) -> Result<String, String> {
match encoding {
Some("xml") => Ok(data.to_string()),
_ => {
let bytes = base64::engine::general_purpose::STANDARD
.decode(data)
.map_err(|e| format!("Invalid base64: {e}"))?;
let entries = read_central_directory(&bytes)?;
let entry = entries
.get("word/document.xml")
.ok_or_else(|| "Missing word/document.xml in archive".to_string())?;
read_entry_utf8(&bytes, entry)
}
}
}
pub fn extract_paragraphs_from_xml(xml: &str) -> Vec<String> {
let mut paragraphs = Vec::new();
let p_re = match Regex::new(r"(?is)<w:p\b[^>]*>(.*?)</w:p>") {
Ok(r) => r,
Err(_) => return paragraphs,
};
let t_re = match Regex::new(r"(?is)<w:t(?:\s[^>]*)?>(.*?)</w:t>") {
Ok(r) => r,
Err(_) => return paragraphs,
};
for p_cap in p_re.captures_iter(xml) {
if let Some(p_inner) = p_cap.get(1) {
let mut text = String::new();
for t_cap in t_re.captures_iter(p_inner.as_str()) {
if let Some(t) = t_cap.get(1) {
text.push_str(t.as_str());
}
}
if !text.is_empty() {
paragraphs.push(decode_xml_entities(&text));
}
}
}
paragraphs
}
pub fn extract_tables_from_xml(xml: &str) -> Vec<Vec<Vec<String>>> {
let mut tables = Vec::new();
let tbl_re = match Regex::new(r"(?is)<w:tbl\b[^>]*>(.*?)</w:tbl>") {
Ok(r) => r,
Err(_) => return tables,
};
let tr_re = match Regex::new(r"(?is)<w:tr\b[^>]*>(.*?)</w:tr>") {
Ok(r) => r,
Err(_) => return tables,
};
let tc_re = match Regex::new(r"(?is)<w:tc\b[^>]*>(.*?)</w:tc>") {
Ok(r) => r,
Err(_) => return tables,
};
let t_re = match Regex::new(r"(?is)<w:t(?:\s[^>]*)?>(.*?)</w:t>") {
Ok(r) => r,
Err(_) => return tables,
};
for tbl_cap in tbl_re.captures_iter(xml) {
if let Some(tbl_inner) = tbl_cap.get(1) {
let mut rows: Vec<Vec<String>> = Vec::new();
for tr_cap in tr_re.captures_iter(tbl_inner.as_str()) {
if let Some(tr_inner) = tr_cap.get(1) {
let mut cells: Vec<String> = Vec::new();
for tc_cap in tc_re.captures_iter(tr_inner.as_str()) {
if let Some(tc_inner) = tc_cap.get(1) {
let mut cell_text = String::new();
for t_cap in t_re.captures_iter(tc_inner.as_str()) {
if let Some(t) = t_cap.get(1) {
cell_text.push_str(t.as_str());
}
}
cells.push(decode_xml_entities(&cell_text));
}
}
if !cells.is_empty() {
rows.push(cells);
}
}
}
if !rows.is_empty() {
tables.push(rows);
}
}
}
tables
}
fn decode_xml_entities(s: &str) -> String {
s.replace("<", "<")
.replace(">", ">")
.replace(""", "\"")
.replace("'", "'")
.replace("&", "&")
}
#[async_trait]
impl Skill for DocxLoaderSkill {
fn descriptor(&self) -> &SkillDescriptor {
&self.descriptor
}
async fn execute(&self, call: ToolCall) -> ArgentorResult<ToolResult> {
let operation = match call.arguments["operation"].as_str() {
Some(op) => op,
None => {
return Ok(ToolResult::error(
&call.id,
"Missing required parameter: 'operation'",
))
}
};
let data = match call.arguments["data"].as_str() {
Some(d) => d,
None => {
return Ok(ToolResult::error(
&call.id,
"Missing required parameter: 'data'",
))
}
};
let encoding = call.arguments["encoding"].as_str();
let xml = match load_document_xml(data, encoding) {
Ok(x) => x,
Err(e) => return Ok(ToolResult::error(&call.id, e)),
};
match operation {
"extract_text" => {
let paragraphs = extract_paragraphs_from_xml(&xml);
let text = paragraphs.join("\n");
Ok(ToolResult::success(
&call.id,
json!({
"text": text,
"length": text.len(),
"paragraph_count": paragraphs.len(),
})
.to_string(),
))
}
"extract_paragraphs" => {
let paragraphs = extract_paragraphs_from_xml(&xml);
Ok(ToolResult::success(
&call.id,
json!({
"paragraphs": paragraphs,
"count": paragraphs.len(),
})
.to_string(),
))
}
"extract_tables" => {
let tables = extract_tables_from_xml(&xml);
Ok(ToolResult::success(
&call.id,
json!({
"tables": tables,
"count": tables.len(),
})
.to_string(),
))
}
"count_words" => {
let paragraphs = extract_paragraphs_from_xml(&xml);
let full = paragraphs.join(" ");
let words = full.split_whitespace().count();
Ok(ToolResult::success(
&call.id,
json!({
"words": words,
"characters": full.len(),
"paragraphs": paragraphs.len(),
})
.to_string(),
))
}
_ => Ok(ToolResult::error(
&call.id,
format!("Unknown operation: '{operation}'. Supported: extract_text, extract_paragraphs, extract_tables, count_words"),
)),
}
}
}
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::expect_used)]
mod tests {
use super::*;
use crate::zip_reader::build_stored_zip_for_tests;
const SAMPLE_XML: &str = r#"<?xml version="1.0"?>
<w:document xmlns:w="x">
<w:body>
<w:p><w:r><w:t>Hello World</w:t></w:r></w:p>
<w:p><w:r><w:t xml:space="preserve">Second paragraph</w:t></w:r></w:p>
<w:tbl>
<w:tr><w:tc><w:p><w:r><w:t>A1</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>B1</w:t></w:r></w:p></w:tc></w:tr>
<w:tr><w:tc><w:p><w:r><w:t>A2</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>B2</w:t></w:r></w:p></w:tc></w:tr>
</w:tbl>
</w:body>
</w:document>"#;
fn make_call(args: Value) -> ToolCall {
ToolCall {
id: "test".to_string(),
name: "docx_loader".to_string(),
arguments: args,
}
}
#[tokio::test]
async fn test_extract_text_xml_mode() {
let skill = DocxLoaderSkill::new();
let call =
make_call(json!({"operation": "extract_text", "data": SAMPLE_XML, "encoding": "xml"}));
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error, "Result: {}", result.content);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
let text = parsed["text"].as_str().unwrap();
assert!(text.contains("Hello World"));
assert!(text.contains("Second paragraph"));
}
#[tokio::test]
async fn test_extract_paragraphs() {
let skill = DocxLoaderSkill::new();
let call = make_call(
json!({"operation": "extract_paragraphs", "data": SAMPLE_XML, "encoding": "xml"}),
);
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
let count = parsed["count"].as_u64().unwrap();
assert!(count >= 2);
}
#[tokio::test]
async fn test_extract_tables() {
let skill = DocxLoaderSkill::new();
let call = make_call(
json!({"operation": "extract_tables", "data": SAMPLE_XML, "encoding": "xml"}),
);
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
assert_eq!(parsed["count"], 1);
let tables = parsed["tables"].as_array().unwrap();
let first_table = tables[0].as_array().unwrap();
assert_eq!(first_table.len(), 2); let row0 = first_table[0].as_array().unwrap();
assert_eq!(row0.len(), 2); assert_eq!(row0[0], "A1");
assert_eq!(row0[1], "B1");
}
#[tokio::test]
async fn test_count_words() {
let skill = DocxLoaderSkill::new();
let call =
make_call(json!({"operation": "count_words", "data": SAMPLE_XML, "encoding": "xml"}));
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
let words = parsed["words"].as_u64().unwrap();
assert!(words >= 4, "Expected at least 4 words, got {words}");
}
#[tokio::test]
async fn test_extract_from_zip() {
let skill = DocxLoaderSkill::new();
let zip_bytes = build_stored_zip_for_tests(&[
("word/document.xml", SAMPLE_XML.as_bytes()),
("[Content_Types].xml", b"<x/>"),
]);
let encoded = base64::engine::general_purpose::STANDARD.encode(&zip_bytes);
let call = make_call(json!({"operation": "extract_text", "data": encoded}));
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error, "Result: {}", result.content);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
let text = parsed["text"].as_str().unwrap();
assert!(text.contains("Hello World"));
}
#[tokio::test]
async fn test_missing_document_xml() {
let skill = DocxLoaderSkill::new();
let zip_bytes = build_stored_zip_for_tests(&[("other.xml", b"<x/>")]);
let encoded = base64::engine::general_purpose::STANDARD.encode(&zip_bytes);
let call = make_call(json!({"operation": "extract_text", "data": encoded}));
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
assert!(result.content.contains("document.xml"));
}
#[tokio::test]
async fn test_invalid_base64() {
let skill = DocxLoaderSkill::new();
let call = make_call(json!({"operation": "extract_text", "data": "!!!invalid!!!"}));
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
}
#[tokio::test]
async fn test_invalid_zip() {
let skill = DocxLoaderSkill::new();
let encoded = base64::engine::general_purpose::STANDARD.encode(b"not a zip archive at all");
let call = make_call(json!({"operation": "extract_text", "data": encoded}));
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
}
#[tokio::test]
async fn test_empty_document() {
let skill = DocxLoaderSkill::new();
let xml = "<w:document><w:body></w:body></w:document>";
let call = make_call(json!({"operation": "extract_text", "data": xml, "encoding": "xml"}));
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
assert_eq!(parsed["text"], "");
assert_eq!(parsed["paragraph_count"], 0);
}
#[tokio::test]
async fn test_decodes_entities() {
let skill = DocxLoaderSkill::new();
let xml = "<w:p><w:r><w:t>Tom & Jerry</w:t></w:r></w:p>";
let call = make_call(json!({"operation": "extract_text", "data": xml, "encoding": "xml"}));
let result = skill.execute(call).await.unwrap();
let parsed: Value = serde_json::from_str(&result.content).unwrap();
assert!(parsed["text"].as_str().unwrap().contains("Tom & Jerry"));
}
#[tokio::test]
async fn test_no_tables() {
let skill = DocxLoaderSkill::new();
let xml = "<w:p><w:r><w:t>No tables here</w:t></w:r></w:p>";
let call =
make_call(json!({"operation": "extract_tables", "data": xml, "encoding": "xml"}));
let result = skill.execute(call).await.unwrap();
assert!(!result.is_error);
let parsed: Value = serde_json::from_str(&result.content).unwrap();
assert_eq!(parsed["count"], 0);
}
#[tokio::test]
async fn test_missing_operation() {
let skill = DocxLoaderSkill::new();
let call = make_call(json!({"data": SAMPLE_XML, "encoding": "xml"}));
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
}
#[tokio::test]
async fn test_missing_data() {
let skill = DocxLoaderSkill::new();
let call = make_call(json!({"operation": "extract_text"}));
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
}
#[tokio::test]
async fn test_unknown_operation() {
let skill = DocxLoaderSkill::new();
let call = make_call(
json!({"operation": "convert_to_pdf", "data": SAMPLE_XML, "encoding": "xml"}),
);
let result = skill.execute(call).await.unwrap();
assert!(result.is_error);
assert!(result.content.contains("Unknown operation"));
}
#[test]
fn test_descriptor_name() {
let skill = DocxLoaderSkill::new();
assert_eq!(skill.descriptor().name, "docx_loader");
}
#[test]
fn test_paragraphs_order_preserved() {
let xml = r#"<w:p><w:r><w:t>First</w:t></w:r></w:p><w:p><w:r><w:t>Second</w:t></w:r></w:p><w:p><w:r><w:t>Third</w:t></w:r></w:p>"#;
let paragraphs = extract_paragraphs_from_xml(xml);
assert_eq!(paragraphs, vec!["First", "Second", "Third"]);
}
}