use super::error::{Result, ToolError};
use super::r#trait::{Tool, ToolCapability, ToolExecutionContext, ToolResult};
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::io::Read;
use std::path::Path;
/// Tool that extracts plain text (and, on request, basic metadata) from document files.
///
/// The type carries no state; all behaviour lives in its `Tool` implementation and in
/// the private format-specific parsers defined on it.
pub struct DocParserTool;
/// Largest file, in bytes, that `execute` will hand to a parser (50 MiB).
/// Bigger files are rejected with an error result before any parsing starts.
const MAX_FILE_SIZE: u64 = 50 * 1024 * 1024;
/// Deserialized form of the JSON arguments accepted by the tool (see `input_schema`).
#[derive(Debug, Deserialize, Serialize)]
struct DocParserInput {
/// Path of the document to parse. `execute` passes it to `resolve_tool_path`
/// together with the execution context's working directory.
path: String,
/// Optional cap on the extracted text. When the text is longer, `execute`
/// truncates it and appends a truncation notice.
#[serde(skip_serializing_if = "Option::is_none")]
max_chars: Option<usize>,
/// Optional page numbers to extract. Only the PDF parser reads this field,
/// and it treats the numbers as 1-indexed.
#[serde(skip_serializing_if = "Option::is_none")]
pages: Option<Vec<usize>>,
/// When `Some(true)`, `execute` prepends a metadata section to the output.
/// A missing value is treated as `false`.
#[serde(skip_serializing_if = "Option::is_none")]
include_metadata: Option<bool>,
}
/// Metadata section that `execute` serializes as pretty-printed JSON in front of the
/// extracted text when the caller asks for metadata. `None` fields are omitted.
#[derive(Debug, Serialize)]
struct DocumentMetadata {
/// Lower-cased file extension that selected the parser.
format: String,
/// File size in bytes, as reported by the filesystem metadata.
file_size: u64,
/// Page count reported by the parser (only the PDF parser sets it).
#[serde(skip_serializing_if = "Option::is_none")]
page_count: Option<usize>,
/// Document title reported by the parser, if it found one.
#[serde(skip_serializing_if = "Option::is_none")]
title: Option<String>,
/// Document author reported by the parser, if it found one.
#[serde(skip_serializing_if = "Option::is_none")]
author: Option<String>,
}
#[async_trait]
impl Tool for DocParserTool {
fn name(&self) -> &str {
"parse_document"
}
fn description(&self) -> &str {
"Parse and extract text content from documents (PDF, DOCX, TXT, MD, HTML). \
Useful for analyzing documents, extracting information, and converting document content to plain text."
}
fn input_schema(&self) -> Value {
serde_json::json!({
"type": "object",
"properties": {
"path": {
"type": "string",
"description": "Path to the document file (PDF, DOCX, TXT, MD, HTML)"
},
"max_chars": {
"type": "integer",
"description": "Optional: Maximum characters to extract (default: unlimited)",
"minimum": 1
},
"pages": {
"type": "array",
"items": {"type": "integer", "minimum": 1},
"description": "Optional: Specific page numbers to extract (PDF only, 1-indexed)"
},
"include_metadata": {
"type": "boolean",
"description": "Optional: Include document metadata in output (default: false)"
}
},
"required": ["path"]
})
}
fn capabilities(&self) -> Vec<ToolCapability> {
vec![ToolCapability::ReadFiles]
}
fn requires_approval(&self) -> bool {
false }
fn validate_input(&self, input: &Value) -> Result<()> {
let _: DocParserInput = serde_json::from_value(input.clone())
.map_err(|e| ToolError::InvalidInput(format!("Invalid input: {}", e)))?;
Ok(())
}
async fn execute(&self, input: Value, context: &ToolExecutionContext) -> Result<ToolResult> {
let input: DocParserInput = serde_json::from_value(input)?;
let path = super::error::resolve_tool_path(&input.path, &context.working_dir());
if !path.exists() {
return Ok(ToolResult::error(format!(
"File not found: {}",
path.display()
)));
}
if !path.is_file() {
return Ok(ToolResult::error(format!(
"Path is not a file: {}",
path.display()
)));
}
let file_size = std::fs::metadata(&path).map_err(ToolError::Io)?.len();
if file_size > MAX_FILE_SIZE {
return Ok(ToolResult::error(format!(
"File size ({} MB) exceeds maximum allowed size ({} MB). \
Consider splitting the document or using a different approach.",
file_size / (1024 * 1024),
MAX_FILE_SIZE / (1024 * 1024)
)));
}
let extension = path
.extension()
.and_then(|e| e.to_str())
.map(|e| e.to_lowercase())
.unwrap_or_default();
let (text, metadata) = match extension.as_str() {
"pdf" => self.parse_pdf(&path, &input).await?,
"docx" => self.parse_docx(&path).await?,
"txt" | "md" | "markdown" | "rst" | "text" => {
self.parse_text(&path, &extension).await?
}
"html" | "htm" => self.parse_html(&path).await?,
"json" => self.parse_json(&path).await?,
"xml" => self.parse_xml(&path).await?,
_ => {
return Ok(ToolResult::error(format!(
"Unsupported document format: .{}. Supported formats: PDF, DOCX, TXT, MD, HTML, JSON, XML",
extension
)));
}
};
let text = if let Some(max_chars) = input.max_chars {
if text.len() > max_chars {
format!(
"{}...\n\n[Truncated: {} of {} characters shown]",
crate::utils::truncate_str(&text, max_chars),
max_chars,
text.len()
)
} else {
text
}
} else {
text
};
let output = if input.include_metadata.unwrap_or(false) {
let meta = DocumentMetadata {
format: extension.clone(),
file_size,
page_count: metadata.page_count,
title: metadata.title,
author: metadata.author,
};
format!(
"=== Document Metadata ===\n{}\n\n=== Content ===\n{}",
serde_json::to_string_pretty(&meta).unwrap_or_default(),
text
)
} else {
text
};
let output_len = output.len();
Ok(ToolResult::success(output)
.with_metadata("path".to_string(), path.display().to_string())
.with_metadata("format".to_string(), extension)
.with_metadata("chars".to_string(), output_len.to_string()))
}
}
/// Metadata a format-specific parser managed to recover alongside the text.
/// Every field defaults to `None`.
#[derive(Default)]
struct ParsedMetadata {
/// Number of pages, when the parser can tell (set by the PDF parser).
page_count: Option<usize>,
/// Document title, when the parser found one.
title: Option<String>,
/// Document author, when the parser found one.
author: Option<String>,
}
impl DocParserTool {
/// Extracts text from a PDF on a blocking worker thread.
///
/// When `input.pages` is set, only those pages (1-indexed) are returned, each
/// under a `--- Page N ---` header. Out-of-range numbers are skipped, and if
/// none of the requested pages exist the whole document text is returned.
/// The reported `page_count` always describes the whole document.
///
/// # Errors
/// `ToolError::Io` if the file cannot be read, `ToolError::Execution` if the PDF
/// cannot be parsed or the worker task fails.
async fn parse_pdf(
    &self,
    path: &Path,
    input: &DocParserInput,
) -> Result<(String, ParsedMetadata)> {
    let path = path.to_path_buf();
    let requested_pages = input.pages.clone();
    tokio::task::spawn_blocking(move || {
        let bytes = std::fs::read(&path).map_err(ToolError::Io)?;
        let full_text = pdf_extract::extract_text_from_mem(&bytes)
            .map_err(|e| ToolError::Execution(format!("Failed to parse PDF: {}", e)))?;

        // NOTE(review): assumes the extractor separates pages with a form feed
        // (U+000C), as this code always has; confirm against pdf_extract.
        let pages: Vec<&str> = full_text.split('\u{000C}').collect();
        // Count pages on the full text, before any selection discards the
        // separators; otherwise a page-filtered request always reports 1.
        let page_count = pages.len();

        let mut selected_text = String::new();
        for page_num in requested_pages.into_iter().flatten() {
            // `checked_sub` rejects page 0; `get` rejects numbers past the end.
            if let Some(page) = page_num.checked_sub(1).and_then(|idx| pages.get(idx)) {
                selected_text.push_str(&format!("--- Page {} ---\n", page_num));
                selected_text.push_str(page.trim());
                selected_text.push_str("\n\n");
            }
        }

        // No (valid) selection falls back to the whole document.
        let text = if selected_text.is_empty() {
            full_text.trim().to_string()
        } else {
            selected_text.trim().to_string()
        };

        let metadata = ParsedMetadata {
            page_count: Some(page_count),
            title: None,
            author: None,
        };
        Ok((text, metadata))
    })
    .await
    .map_err(|e| ToolError::Execution(format!("PDF parsing task failed: {}", e)))?
}
/// Extracts text and core properties from a DOCX file on a blocking worker thread.
///
/// The body text comes from `word/document.xml`. Title and author come from
/// `docProps/core.xml` when that entry can be opened; it is optional.
///
/// # Errors
/// `ToolError::Io` if the file or an archive entry cannot be read.
/// `ToolError::Execution` if the archive cannot be opened, if
/// `word/document.xml` is missing or unreadable, or if the worker task fails.
async fn parse_docx(&self, path: &Path) -> Result<(String, ParsedMetadata)> {
    let path = path.to_path_buf();
    tokio::task::spawn_blocking(move || {
        let file = std::fs::File::open(&path).map_err(ToolError::Io)?;
        let mut archive = zip::ZipArchive::new(file)
            .map_err(|e| ToolError::Execution(format!("Failed to open DOCX: {}", e)))?;

        // The main document part is mandatory. Reporting its absence beats
        // returning empty text as if the document had been parsed.
        let text_content = {
            let mut document_xml = archive.by_name("word/document.xml").map_err(|e| {
                ToolError::Execution(format!(
                    "Failed to read DOCX content (word/document.xml): {}",
                    e
                ))
            })?;
            let mut xml_content = String::new();
            document_xml
                .read_to_string(&mut xml_content)
                .map_err(ToolError::Io)?;
            Self::extract_text_from_docx_xml(&xml_content)
        };

        // Core properties are optional: a missing entry just means no title/author.
        let mut title = None;
        let mut author = None;
        if let Ok(mut core_xml) = archive.by_name("docProps/core.xml") {
            let mut xml_content = String::new();
            core_xml
                .read_to_string(&mut xml_content)
                .map_err(ToolError::Io)?;
            let (t, a) = Self::extract_metadata_from_core_xml(&xml_content);
            title = t;
            author = a;
        }

        let metadata = ParsedMetadata {
            page_count: None,
            title,
            author,
        };
        Ok((text_content, metadata))
    })
    .await
    .map_err(|e| ToolError::Execution(format!("DOCX parsing task failed: {}", e)))?
}
/// Collects the text of every `w:t` element in a `word/document.xml` payload.
///
/// A newline is inserted when a `w:p` start tag is seen after some text has
/// already been collected. Reading stops quietly at the first XML error, so
/// malformed input yields whatever was gathered up to that point. The result
/// is trimmed.
fn extract_text_from_docx_xml(xml: &str) -> String {
    use quick_xml::events::Event;

    let mut reader = quick_xml::Reader::from_str(xml);
    let mut scratch = Vec::new();
    let mut collected = String::new();
    // True while the reader is between `<w:t>` and `</w:t>`.
    let mut inside_text_run = false;

    loop {
        match reader.read_event_into(&mut scratch) {
            Ok(Event::Eof) | Err(_) => break,
            Ok(Event::Start(ref tag)) => match tag.name().as_ref() {
                b"w:t" => inside_text_run = true,
                b"w:p" if !collected.is_empty() => collected.push('\n'),
                _ => {}
            },
            Ok(Event::End(ref tag)) if tag.name().as_ref() == b"w:t" => {
                inside_text_run = false;
            }
            Ok(Event::Text(raw)) if inside_text_run => {
                // Text that fails to unescape is skipped rather than aborting.
                if let Ok(decoded) = raw.unescape() {
                    collected.push_str(&decoded);
                }
            }
            _ => {}
        }
        scratch.clear();
    }

    collected.trim().to_string()
}
/// Pulls the title (`dc:title`) and author (`dc:creator`) out of a
/// `docProps/core.xml` payload.
///
/// Returns `(title, author)`; either is `None` when the element is absent.
/// Reading stops quietly at the first XML error, keeping what was found so far.
fn extract_metadata_from_core_xml(xml: &str) -> (Option<String>, Option<String>) {
    use quick_xml::events::Event;

    let mut title = None;
    let mut author = None;
    let mut reader = quick_xml::Reader::from_str(xml);
    let mut buf = Vec::new();
    // Name of the element whose start tag was seen most recently and has not
    // been closed yet; empty between elements.
    let mut current_element = String::new();
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                current_element = String::from_utf8_lossy(e.name().as_ref()).to_string();
            }
            // Forget the element once it closes. Otherwise the whitespace that
            // follows `</dc:title>` in pretty-printed XML arrives as a text
            // event and overwrites the value that was just captured.
            Ok(Event::End(_)) => current_element.clear(),
            Ok(Event::Text(e)) => {
                if let Ok(t) = e.unescape() {
                    match current_element.as_str() {
                        "dc:title" => title = Some(t.to_string()),
                        "dc:creator" => author = Some(t.to_string()),
                        _ => {}
                    }
                }
            }
            Ok(Event::Eof) => break,
            Err(_) => break,
            _ => {}
        }
        buf.clear();
    }
    (title, author)
}
async fn parse_text(&self, path: &Path, _format: &str) -> Result<(String, ParsedMetadata)> {
let text = tokio::fs::read_to_string(path)
.await
.map_err(ToolError::Io)?;
let metadata = ParsedMetadata {
page_count: None,
title: None,
author: None,
};
Ok((text, metadata))
}
async fn parse_html(&self, path: &Path) -> Result<(String, ParsedMetadata)> {
let html = tokio::fs::read_to_string(path)
.await
.map_err(ToolError::Io)?;
let text = Self::strip_html_tags(&html);
let metadata = ParsedMetadata {
page_count: None,
title: Self::extract_html_title(&html),
author: None,
};
Ok((text, metadata))
}
/// Reduces an HTML document to its visible text.
///
/// * Tags are removed. A `<` only opens a tag when it is followed by an ASCII
///   letter, `/`, `!` or `?`; any other `<`, and every `>` outside a tag, is
///   kept as literal text (so `1 < 2 > 0` survives).
/// * The contents of `<script>` and `<style>` elements are dropped; inside them
///   only the matching closing tag is recognised.
/// * Comments (`<!-- ... -->`) are dropped entirely; an unterminated comment
///   runs to the end of the input.
/// * Every line is trimmed and blank lines are removed; lines are joined with `\n`.
///
/// Character entities such as `&amp;` are not decoded.
fn strip_html_tags(html: &str) -> String {
    /// ASCII-case-insensitive check that `rest` begins with `prefix`.
    fn has_prefix(rest: &[char], prefix: &str) -> bool {
        let mut upcoming = rest.iter();
        prefix.chars().all(|expected| {
            upcoming
                .next()
                .map_or(false, |c| c.eq_ignore_ascii_case(&expected))
        })
    }

    /// Like `has_prefix`, but also requires the tag name to end right after the
    /// prefix, so `<scripted>` is not mistaken for `<script>`. `tag` is ASCII,
    /// so its byte length equals its length in characters.
    fn is_tag(rest: &[char], tag: &str) -> bool {
        has_prefix(rest, tag)
            && rest
                .get(tag.len())
                .map_or(true, |c| c.is_whitespace() || *c == '>' || *c == '/')
    }

    /// Whether the `<` at `rest[0]` can start markup at all.
    fn starts_markup(rest: &[char]) -> bool {
        rest.get(1).map_or(false, |next| {
            next.is_ascii_alphabetic() || *next == '/' || *next == '!' || *next == '?'
        })
    }

    let chars: Vec<char> = html.chars().collect();
    let mut text = String::with_capacity(html.len());
    let mut in_tag = false;
    // While inside <script>/<style>, the closing tag that ends the element.
    let mut raw_text_close: Option<&str> = None;

    let mut i = 0;
    while i < chars.len() {
        let c = chars[i];
        let rest = &chars[i..];

        if in_tag {
            if c == '>' {
                in_tag = false;
            }
        } else if let Some(close) = raw_text_close {
            // Script/style content: ignore everything except the real closing tag.
            if c == '<' && is_tag(rest, close) {
                raw_text_close = None;
                in_tag = true;
            }
        } else if c == '<' && has_prefix(rest, "<!--") {
            // Jump past the comment terminator. Searching from offset 2 also
            // accepts the degenerate empty comment `<!-->`.
            let end = rest[2..].windows(3).position(|w| w == ['-', '-', '>']);
            i = end.map_or(chars.len(), |pos| i + 2 + pos + 3);
            continue;
        } else if c == '<' && starts_markup(rest) {
            in_tag = true;
            if is_tag(rest, "<script") {
                raw_text_close = Some("</script");
            } else if is_tag(rest, "<style") {
                raw_text_close = Some("</style");
            }
        } else {
            text.push(c);
        }
        i += 1;
    }

    text.lines()
        .map(str::trim)
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join("\n")
}
/// Returns the trimmed contents of the first `<title>...</title>` pair, matched
/// case-insensitively, or `None` when there is no such pair.
fn extract_html_title(html: &str) -> Option<String> {
    const OPEN: &str = "<title>";
    const CLOSE: &str = "</title>";

    // ASCII-only folding keeps every byte offset identical to `html`. Full
    // Unicode lowercasing can change byte lengths, which would make offsets
    // found in the folded copy point at the wrong place in the original.
    let folded = html.to_ascii_lowercase();
    let start = folded.find(OPEN)? + OPEN.len();
    let len = folded[start..].find(CLOSE)?;
    html.get(start..start + len).map(|s| s.trim().to_string())
}
async fn parse_json(&self, path: &Path) -> Result<(String, ParsedMetadata)> {
let json_text = tokio::fs::read_to_string(path)
.await
.map_err(ToolError::Io)?;
let text = match serde_json::from_str::<Value>(&json_text) {
Ok(value) => serde_json::to_string_pretty(&value).unwrap_or(json_text),
Err(_) => json_text,
};
let metadata = ParsedMetadata {
page_count: None,
title: None,
author: None,
};
Ok((text, metadata))
}
async fn parse_xml(&self, path: &Path) -> Result<(String, ParsedMetadata)> {
let xml_text = tokio::fs::read_to_string(path)
.await
.map_err(ToolError::Io)?;
let mut text = String::new();
let mut reader = quick_xml::Reader::from_str(&xml_text);
let mut buf = Vec::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(quick_xml::events::Event::Text(e)) => {
if let Ok(t) = e.unescape() {
let trimmed = t.trim();
if !trimmed.is_empty() {
text.push_str(trimmed);
text.push('\n');
}
}
}
Ok(quick_xml::events::Event::Eof) => break,
Err(_) => break,
_ => {}
}
buf.clear();
}
let metadata = ParsedMetadata {
page_count: None,
title: None,
author: None,
};
Ok((text.trim().to_string(), metadata))
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;
    use uuid::Uuid;

    /// Creates a temporary file with the given suffix containing `body`
    /// followed by a newline.
    fn temp_doc(suffix: &str, body: &str) -> NamedTempFile {
        let mut file = NamedTempFile::with_suffix(suffix).unwrap();
        writeln!(file, "{}", body).unwrap();
        file.flush().unwrap();
        file
    }

    /// Path of a temporary file as a string slice.
    fn path_str(file: &NamedTempFile) -> &str {
        file.path().to_str().unwrap()
    }

    /// Executes the tool with a fresh session context and unwraps the outer `Result`.
    async fn run_tool(input: Value) -> ToolResult {
        let tool = DocParserTool;
        let context = ToolExecutionContext::new(Uuid::new_v4());
        let result = tool.execute(input, &context).await.unwrap();
        result
    }

    #[tokio::test]
    async fn test_parse_text_file() {
        let doc = temp_doc(".txt", "This is a test document.\nWith multiple lines.");

        let result = run_tool(serde_json::json!({ "path": path_str(&doc) })).await;

        assert!(result.success);
        assert!(result.output.contains("This is a test document"));
        assert!(result.output.contains("multiple lines"));
    }

    #[tokio::test]
    async fn test_parse_markdown_file() {
        let doc = temp_doc(".md", "# Header\n\nSome **bold** text.");

        let result = run_tool(serde_json::json!({ "path": path_str(&doc) })).await;

        assert!(result.success);
        assert!(result.output.contains("# Header"));
        assert!(result.output.contains("**bold**"));
    }

    #[tokio::test]
    async fn test_parse_json_file() {
        let doc = temp_doc(".json", r#"{"name": "test", "value": 42}"#);

        let result = run_tool(serde_json::json!({ "path": path_str(&doc) })).await;

        assert!(result.success);
        assert!(result.output.contains("\"name\""));
        assert!(result.output.contains("\"test\""));
    }

    #[tokio::test]
    async fn test_parse_html_file() {
        let doc = temp_doc(
            ".html",
            "<html><head><title>Test Page</title></head><body><p>Hello World</p></body></html>",
        );

        let result = run_tool(serde_json::json!({
            "path": path_str(&doc),
            "include_metadata": true
        }))
        .await;

        assert!(result.success);
        assert!(result.output.contains("Test Page"));
        assert!(result.output.contains("Hello World"));
    }

    #[tokio::test]
    async fn test_max_chars_truncation() {
        let doc = temp_doc(
            ".txt",
            "This is a very long document that should be truncated.",
        );

        let result = run_tool(serde_json::json!({
            "path": path_str(&doc),
            "max_chars": 10
        }))
        .await;

        assert!(result.success);
        assert!(result.output.contains("Truncated"));
    }

    #[tokio::test]
    async fn test_unsupported_format() {
        let doc = temp_doc(".xyz", "Some content");

        let result = run_tool(serde_json::json!({ "path": path_str(&doc) })).await;

        assert!(!result.success);
        assert!(result.error.unwrap().contains("Unsupported"));
    }

    #[tokio::test]
    async fn test_nonexistent_file() {
        let result = run_tool(serde_json::json!({
            "path": "/nonexistent/document.pdf"
        }))
        .await;

        assert!(!result.success);
        assert!(result.error.unwrap().contains("not found"));
    }

    #[test]
    fn test_tool_schema() {
        let tool = DocParserTool;
        assert_eq!(tool.name(), "parse_document");
        assert!(!tool.requires_approval());

        let schema = tool.input_schema();
        assert!(schema.is_object());
        assert!(schema["properties"]["path"].is_object());
    }

    #[test]
    fn test_strip_html_tags() {
        let stripped = DocParserTool::strip_html_tags(
            "<html><body><p>Hello</p><script>var x=1;</script><p>World</p></body></html>",
        );
        assert!(stripped.contains("Hello"));
        assert!(stripped.contains("World"));
        assert!(!stripped.contains("var x"));
    }

    #[test]
    fn test_extract_html_title() {
        let found = DocParserTool::extract_html_title(
            "<html><head><title>My Document</title></head><body></body></html>",
        );
        assert_eq!(found, Some("My Document".to_string()));
    }
}