use crate::core::error::{RustChainError, ToolError};
use crate::core::tools::{Tool, ToolCapability, ToolResult};
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::Path;
use tracing::{debug, info};
use csv::ReaderBuilder;
/// Tool that loads a PDF file from disk and returns its text plus basic metadata.
///
/// Extraction is a lightweight heuristic that scans the raw bytes for text
/// operators; it is not a full PDF parser and will not see compressed streams.
pub struct PdfDocumentLoader;

impl PdfDocumentLoader {
    /// Creates a new PDF loader. The type carries no state.
    pub fn new() -> Self {
        Self
    }

    /// Reads the PDF at `file_path` and builds a [`DocumentContent`] from it.
    ///
    /// # Errors
    ///
    /// * `ToolError::InvalidParameters` if the path does not exist or does not
    ///   have a `.pdf` extension (compared case-insensitively).
    /// * `ToolError::ExecutionFailed` if the file cannot be read.
    async fn load_pdf(&self, file_path: &str) -> Result<DocumentContent, RustChainError> {
        let path = Path::new(file_path);
        if !path.exists() {
            return Err(RustChainError::Tool(ToolError::InvalidParameters {
                tool_name: "pdf_loader".to_string(),
                details: format!("File does not exist: {}", file_path),
            }));
        }
        if !path
            .extension()
            .map_or(false, |ext| ext.eq_ignore_ascii_case("pdf"))
        {
            return Err(RustChainError::Tool(ToolError::InvalidParameters {
                tool_name: "pdf_loader".to_string(),
                details: format!("File is not a PDF: {}", file_path),
            }));
        }

        let file_content = fs::read(path).map_err(|e| {
            RustChainError::Tool(ToolError::ExecutionFailed {
                tool_name: "pdf_loader".to_string(),
                reason: format!("Failed to read file: {}", e),
            })
        })?;

        let text = self.extract_text_from_pdf(&file_content)?;
        let metadata = DocumentMetadata {
            file_path: file_path.to_string(),
            file_size: file_content.len(),
            content_type: "application/pdf".to_string(),
            pages: self.count_pdf_pages(&file_content)?,
            created_at: None,
            modified_at: None,
        };

        Ok(DocumentContent {
            text,
            metadata,
            source: "pdf_loader".to_string(),
        })
    }

    /// Heuristically extracts text shown with the `Tj` operator.
    ///
    /// The bytes are decoded lossily as UTF-8 and scanned line by line. A line
    /// carrying `BT` opens a text object, a line carrying `ET` closes it, and
    /// inside a text object the span between the first `(` and the last `)` of
    /// a line containing `Tj` is appended to the result.
    ///
    /// When nothing is found, a bracketed placeholder message that includes
    /// the byte length is returned instead of an empty string.
    fn extract_text_from_pdf(&self, pdf_data: &[u8]) -> Result<String, RustChainError> {
        let pdf_string = String::from_utf8_lossy(pdf_data);
        let mut extracted_text = String::new();
        let mut in_text_object = false;

        for line in pdf_string.lines() {
            // Locate the parenthesised literal, if any. Requiring `start < end`
            // avoids slicing with a reversed range (a panic) on lines such as `) Tj (`.
            let literal = match (line.find('('), line.rfind(')')) {
                (Some(start), Some(end)) if start < end => Some((start, end)),
                _ => None,
            };

            // Look for BT/ET only outside the literal, so that text such as
            // `(MEET) Tj` is not mistaken for an `ET` operator and dropped.
            // `(` and `)` are single-byte, so these slice bounds are valid.
            let has_operator = |operator: &str| match literal {
                Some((start, end)) => {
                    line[..start].contains(operator) || line[end + 1..].contains(operator)
                }
                None => line.contains(operator),
            };

            if has_operator("BT") {
                in_text_object = true;
                continue;
            }
            if has_operator("ET") {
                in_text_object = false;
                continue;
            }
            if in_text_object && line.contains("Tj") {
                if let Some((start, end)) = literal {
                    extracted_text.push_str(&line[start + 1..end]);
                    extracted_text.push(' ');
                }
            }
        }

        if extracted_text.trim().is_empty() {
            extracted_text = format!(
                "[PDF content detected - {} bytes. Basic text extraction did not find readable text. Consider using a specialized PDF processing library for better results.]",
                pdf_data.len()
            );
        }
        Ok(extracted_text.trim().to_string())
    }

    /// Estimates the page count by counting page-object markers in the raw bytes.
    ///
    /// Both `/Type /Page` and `/Type/Page` are counted. Matches that continue
    /// with an ASCII letter or digit (notably `/Type /Pages`, the page-tree
    /// node) are excluded so they do not inflate the count. The result is
    /// never less than 1.
    fn count_pdf_pages(&self, pdf_data: &[u8]) -> Result<usize, RustChainError> {
        let pdf_string = String::from_utf8_lossy(pdf_data);
        let count_marker = |marker: &str| {
            pdf_string
                .match_indices(marker)
                .filter(|(index, matched)| {
                    // The marker is ASCII, so `index + len` is a char boundary.
                    !pdf_string[index + matched.len()..]
                        .starts_with(|c: char| c.is_ascii_alphanumeric())
                })
                .count()
        };
        let page_count = count_marker("/Type /Page") + count_marker("/Type/Page");
        Ok(page_count.max(1))
    }
}
/// Exposes [`PdfDocumentLoader`] through the `Tool` interface under the name `pdf_loader`.
#[async_trait]
impl Tool for PdfDocumentLoader {
    /// Identifier of this tool.
    fn name(&self) -> &'static str {
        "pdf_loader"
    }

    /// Capabilities declared by this tool: basic operation plus system access.
    fn capabilities(&self) -> Vec<ToolCapability> {
        Vec::from([ToolCapability::Basic, ToolCapability::SystemAccess])
    }

    /// Parses `input` as JSON [`DocumentLoadParams`], loads the referenced PDF
    /// and returns the resulting document serialized as structured JSON.
    async fn invoke(&self, input: &str) -> Result<ToolResult, RustChainError> {
        let request = match serde_json::from_str::<DocumentLoadParams>(input) {
            Ok(parsed) => parsed,
            Err(parse_error) => {
                return Err(RustChainError::Tool(ToolError::InvalidParameters {
                    tool_name: "pdf_loader".to_string(),
                    details: format!("Invalid parameters: {}", parse_error),
                }));
            }
        };

        debug!("Loading PDF document: {}", request.file_path);
        let loaded = self.load_pdf(&request.file_path).await?;
        info!(
            "PDF loaded successfully: {} characters, {} pages",
            loaded.text.len(),
            loaded.metadata.pages
        );

        let payload = serde_json::to_value(loaded)?;
        Ok(ToolResult::StructuredJson(payload))
    }
}
/// Tool that loads a delimited text file and renders it as readable text.
pub struct CsvDocumentLoader;

impl CsvDocumentLoader {
    /// Creates a new CSV loader. The type carries no state.
    pub fn new() -> Self {
        Self
    }

    /// Reads the file at `file_path`, parses it as CSV and wraps the formatted
    /// rendering in a [`DocumentContent`].
    ///
    /// `delimiter` defaults to `,` when `None`. `has_headers` states whether
    /// the first record holds column names.
    ///
    /// # Errors
    ///
    /// * `ToolError::InvalidParameters` if the path does not exist or the
    ///   delimiter is not an ASCII character.
    /// * `ToolError::ExecutionFailed` if the file cannot be read as text.
    async fn load_csv(
        &self,
        file_path: &str,
        delimiter: Option<char>,
        has_headers: bool,
    ) -> Result<DocumentContent, RustChainError> {
        let path = Path::new(file_path);
        if !path.exists() {
            return Err(RustChainError::Tool(ToolError::InvalidParameters {
                tool_name: "csv_loader".to_string(),
                details: format!("File does not exist: {}", file_path),
            }));
        }

        let file_content = fs::read_to_string(path).map_err(|e| {
            RustChainError::Tool(ToolError::ExecutionFailed {
                tool_name: "csv_loader".to_string(),
                reason: format!("Failed to read file: {}", e),
            })
        })?;

        let delimiter = delimiter.unwrap_or(',');
        let csv_data = self.parse_csv(&file_content, delimiter, has_headers)?;

        let metadata = DocumentMetadata {
            file_path: file_path.to_string(),
            file_size: file_content.len(),
            content_type: "text/csv".to_string(),
            pages: 1,
            created_at: None,
            modified_at: None,
        };

        Ok(DocumentContent {
            text: csv_data.formatted_text,
            metadata,
            source: "csv_loader".to_string(),
        })
    }

    /// Parses `content` as CSV and produces headers, rows and a text rendering.
    ///
    /// With `has_headers` the first record supplies the column names. Without
    /// it, names `Column_1..Column_n` are synthesised from the width of the
    /// first record and every record is kept as data. Records whose width
    /// differs from the header count are rendered on a single line joined by
    /// ` | `. Per-record parse errors are logged and noted in the text but do
    /// not abort parsing.
    ///
    /// # Errors
    ///
    /// Returns `ToolError::InvalidParameters` when `delimiter` is not ASCII;
    /// the CSV reader takes a single-byte delimiter, so a wider character
    /// would otherwise be silently truncated.
    fn parse_csv(
        &self,
        content: &str,
        delimiter: char,
        has_headers: bool,
    ) -> Result<CsvData, RustChainError> {
        if !delimiter.is_ascii() {
            return Err(RustChainError::Tool(ToolError::InvalidParameters {
                tool_name: "csv_loader".to_string(),
                details: format!(
                    "Delimiter must be a single ASCII character, got: {:?}",
                    delimiter
                ),
            }));
        }
        // Lossless: the delimiter was just checked to be ASCII.
        let delimiter_byte = delimiter as u8;

        let build_reader = move |with_headers: bool| {
            ReaderBuilder::new()
                .delimiter(delimiter_byte)
                .has_headers(with_headers)
                // Tolerate records with differing field counts.
                .flexible(true)
                .from_reader(content.as_bytes())
        };

        let mut reader = build_reader(has_headers);
        let mut headers: Vec<String> = Vec::new();
        let mut rows = Vec::new();
        let mut formatted_text = String::new();

        if has_headers {
            if let Ok(header_record) = reader.headers() {
                headers = header_record.iter().map(String::from).collect();
                formatted_text.push_str(&format!("Headers: {}\n\n", headers.join(" | ")));
            }
        }

        if headers.is_empty() {
            // Peek at the first record only to learn the column count, then
            // restart from the top so that record is still emitted as data.
            if let Some(Ok(record)) = reader.records().next() {
                headers = (1..=record.len())
                    .map(|i| format!("Column_{}", i))
                    .collect();
                reader = build_reader(false);
            }
        }

        for (row_index, record_result) in reader.records().enumerate() {
            match record_result {
                Ok(record) => {
                    let row: Vec<String> = record.iter().map(String::from).collect();
                    if !headers.is_empty() && headers.len() == row.len() {
                        formatted_text.push_str(&format!("Row {}:\n", row_index + 1));
                        for (header, value) in headers.iter().zip(row.iter()) {
                            formatted_text.push_str(&format!(" {}: {}\n", header, value));
                        }
                        formatted_text.push('\n');
                    } else {
                        formatted_text.push_str(&format!(
                            "Row {}: {}\n",
                            row_index + 1,
                            row.join(" | ")
                        ));
                    }
                    rows.push(row);
                }
                Err(e) => {
                    tracing::warn!("CSV parsing error on row {}: {}", row_index + 1, e);
                    formatted_text.push_str(&format!(
                        "Row {} (parsing error): {}\n",
                        row_index + 1,
                        e
                    ));
                }
            }
        }

        formatted_text.push_str("\nSummary:\n");
        formatted_text.push_str(&format!(" Columns: {}\n", headers.len()));
        formatted_text.push_str(&format!(" Rows: {}\n", rows.len()));
        if !headers.is_empty() {
            formatted_text.push_str(&format!(" Column Names: {}\n", headers.join(", ")));
        }

        Ok(CsvData {
            headers,
            rows,
            formatted_text,
        })
    }
}
/// Exposes [`CsvDocumentLoader`] through the `Tool` interface under the name `csv_loader`.
#[async_trait]
impl Tool for CsvDocumentLoader {
    /// Identifier of this tool.
    fn name(&self) -> &'static str {
        "csv_loader"
    }

    /// Capabilities declared by this tool: basic operation plus system access.
    fn capabilities(&self) -> Vec<ToolCapability> {
        Vec::from([ToolCapability::Basic, ToolCapability::SystemAccess])
    }

    /// Parses `input` as JSON [`CsvLoadParams`], loads the referenced file and
    /// returns the resulting document serialized as structured JSON.
    ///
    /// Only the first character of the `delimiter` string is used; an empty
    /// string falls back to `,`. `has_headers` defaults to `true`.
    async fn invoke(&self, input: &str) -> Result<ToolResult, RustChainError> {
        let request = match serde_json::from_str::<CsvLoadParams>(input) {
            Ok(parsed) => parsed,
            Err(parse_error) => {
                return Err(RustChainError::Tool(ToolError::InvalidParameters {
                    tool_name: "csv_loader".to_string(),
                    details: format!("Invalid parameters: {}", parse_error),
                }));
            }
        };

        debug!("Loading CSV document: {}", request.file_path);

        let delimiter = request
            .delimiter
            .map(|raw| raw.chars().next().unwrap_or(','));
        let has_headers = request.has_headers.unwrap_or(true);
        let loaded = self
            .load_csv(&request.file_path, delimiter, has_headers)
            .await?;

        // NOTE(review): the value logged as "rows" is the number of lines in
        // the formatted text, not the number of CSV records.
        info!(
            "CSV loaded successfully: {} characters, {} rows",
            loaded.text.len(),
            loaded.text.lines().count()
        );

        let payload = serde_json::to_value(loaded)?;
        Ok(ToolResult::StructuredJson(payload))
    }
}
/// JSON parameters deserialized by `PdfDocumentLoader::invoke`.
#[derive(Debug, Serialize, Deserialize)]
pub struct DocumentLoadParams {
/// Path of the file to load.
pub file_path: String,
}
/// JSON parameters deserialized by `CsvDocumentLoader::invoke`.
#[derive(Debug, Serialize, Deserialize)]
pub struct CsvLoadParams {
/// Path of the CSV file to load.
pub file_path: String,
/// Field delimiter. `invoke` uses only the first character of this string
/// and falls back to `,` when it is absent or empty.
pub delimiter: Option<String>,
/// Whether the first record holds column names. `invoke` treats a missing
/// value as `true`.
pub has_headers: Option<bool>,
}
/// Result produced by every loader in this file: extracted text plus metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentContent {
/// Extracted or formatted textual content of the document.
pub text: String,
/// Descriptive information about the loaded file.
pub metadata: DocumentMetadata,
/// Name of the loader that produced this content (e.g. `"pdf_loader"`).
pub source: String,
}
/// Metadata attached to a loaded document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
/// Path the document was loaded from, as supplied by the caller.
pub file_path: String,
/// Length in bytes of the content that was read from the file.
pub file_size: usize,
/// MIME-style content type chosen by the loader (e.g. `"text/csv"`).
pub content_type: String,
/// Page count. The PDF loader estimates it; the other loaders in this file
/// always set it to 1.
pub pages: usize,
/// Creation timestamp. The loaders in this file always leave it `None`.
pub created_at: Option<String>,
/// Modification timestamp. The loaders in this file always leave it `None`.
pub modified_at: Option<String>,
}
/// Intermediate result of `CsvDocumentLoader::parse_csv`.
#[derive(Debug, Serialize, Deserialize)]
struct CsvData {
/// Column names, either read from the header record or synthesised as
/// `Column_1..Column_n`.
pub headers: Vec<String>,
/// Successfully parsed data records, one `Vec<String>` of fields per record.
pub rows: Vec<Vec<String>>,
/// Human-readable rendering of headers, rows and a trailing summary.
pub formatted_text: String,
}
pub struct JsonYamlDocumentLoader;
impl JsonYamlDocumentLoader {
pub fn new() -> Self {
Self
}
async fn load_json_yaml(&self, file_path: &str, format: Option<&str>) -> Result<DocumentContent, RustChainError> {
let path = Path::new(file_path);
if !path.exists() {
return Err(RustChainError::Tool(ToolError::InvalidParameters {
tool_name: "json_yaml_loader".to_string(),
details: format!("File does not exist: {}", file_path),
}));
}
let detected_format = if let Some(fmt) = format {
fmt.to_lowercase()
} else {
match path.extension().and_then(|ext| ext.to_str()) {
Some("json") => "json".to_string(),
Some("yaml") | Some("yml") => "yaml".to_string(),
_ => return Err(RustChainError::Tool(ToolError::InvalidParameters {
tool_name: "json_yaml_loader".to_string(),
details: format!("Cannot auto-detect format for file: {}. Supported extensions: .json, .yaml, .yml", file_path),
})),
}
};
let file_content = fs::read_to_string(file_path)
.map_err(|e| RustChainError::Tool(ToolError::ExecutionFailed {
tool_name: "json_yaml_loader".to_string(),
reason: format!("Failed to read file: {}", e),
}))?;
let structured_data = match detected_format.as_str() {
"json" => self.parse_json(&file_content)?,
"yaml" => self.parse_yaml(&file_content)?,
_ => return Err(RustChainError::Tool(ToolError::InvalidParameters {
tool_name: "json_yaml_loader".to_string(),
details: format!("Unsupported format: {}. Supported formats: json, yaml", detected_format),
})),
};
let file_size = file_content.len();
let content_type = match detected_format.as_str() {
"json" => "application/json",
"yaml" => "application/yaml",
_ => "text/plain",
};
let formatted_text = self.format_structured_data(&structured_data, &detected_format);
let metadata = DocumentMetadata {
file_path: file_path.to_string(),
file_size,
content_type: content_type.to_string(),
pages: 1, created_at: None,
modified_at: None,
};
info!("JSON/YAML loaded successfully: {} characters, format: {}",
formatted_text.len(), detected_format);
Ok(DocumentContent {
text: formatted_text,
metadata,
source: "json_yaml_loader".to_string(),
})
}
fn parse_json(&self, content: &str) -> Result<serde_json::Value, RustChainError> {
serde_json::from_str(content).map_err(|e| {
RustChainError::Tool(ToolError::ExecutionFailed {
tool_name: "json_yaml_loader".to_string(),
reason: format!("JSON parsing error: {}", e),
})
})
}
fn parse_yaml(&self, content: &str) -> Result<serde_json::Value, RustChainError> {
serde_yaml::from_str(content).map_err(|e| {
RustChainError::Tool(ToolError::ExecutionFailed {
tool_name: "json_yaml_loader".to_string(),
reason: format!("YAML parsing error: {}", e),
})
})
}
fn format_structured_data(&self, data: &serde_json::Value, format: &str) -> String {
let mut result = String::new();
result.push_str(&format!("Document Format: {}\n", format.to_uppercase()));
result.push_str(&format!("Structure Analysis:\n"));
match data {
serde_json::Value::Object(map) => {
result.push_str(&format!(" Type: Object\n"));
result.push_str(&format!(" Keys: {}\n", map.len()));
result.push_str(&format!(" Top-level keys: {}\n",
map.keys().take(10).cloned().collect::<Vec<_>>().join(", ")));
result.push_str("\nContent:\n");
result.push_str(&serde_json::to_string_pretty(data).unwrap_or_else(|_| "Invalid JSON".to_string()));
},
serde_json::Value::Array(arr) => {
result.push_str(&format!(" Type: Array\n"));
result.push_str(&format!(" Length: {}\n", arr.len()));
if !arr.is_empty() {
if let Some(first) = arr.first() {
result.push_str(&format!(" First element type: {}\n",
match first {
serde_json::Value::Object(_) => "Object",
serde_json::Value::Array(_) => "Array",
serde_json::Value::String(_) => "String",
serde_json::Value::Number(_) => "Number",
serde_json::Value::Bool(_) => "Boolean",
serde_json::Value::Null => "Null",
}
));
}
}
result.push_str("\nContent:\n");
result.push_str(&serde_json::to_string_pretty(data).unwrap_or_else(|_| "Invalid JSON".to_string()));
},
_ => {
result.push_str(&format!(" Type: {}\n",
match data {
serde_json::Value::String(_) => "String",
serde_json::Value::Number(_) => "Number",
serde_json::Value::Bool(_) => "Boolean",
serde_json::Value::Null => "Null",
_ => "Unknown",
}
));
result.push_str("\nContent:\n");
result.push_str(&serde_json::to_string_pretty(data).unwrap_or_else(|_| "Invalid JSON".to_string()));
}
}
result
}
}
/// Exposes [`JsonYamlDocumentLoader`] through the `Tool` interface under the
/// name `json_yaml_loader`.
#[async_trait]
impl Tool for JsonYamlDocumentLoader {
    /// Identifier of this tool.
    fn name(&self) -> &'static str {
        "json_yaml_loader"
    }

    /// Capabilities declared by this tool: basic operation plus system access.
    fn capabilities(&self) -> Vec<ToolCapability> {
        Vec::from([ToolCapability::Basic, ToolCapability::SystemAccess])
    }

    /// Parses `input` as JSON [`JsonYamlLoadParams`], loads the referenced file
    /// and returns the resulting document serialized as structured JSON.
    async fn invoke(&self, input: &str) -> Result<ToolResult, RustChainError> {
        let request = match serde_json::from_str::<JsonYamlLoadParams>(input) {
            Ok(parsed) => parsed,
            Err(parse_error) => {
                return Err(RustChainError::Tool(ToolError::InvalidParameters {
                    tool_name: "json_yaml_loader".to_string(),
                    details: format!("Invalid parameters: {}", parse_error),
                }));
            }
        };

        let requested_format = request.format.as_deref();
        let loaded = self
            .load_json_yaml(&request.file_path, requested_format)
            .await?;

        let payload = serde_json::to_value(loaded)?;
        Ok(ToolResult::StructuredJson(payload))
    }
}
/// JSON parameters deserialized by `JsonYamlDocumentLoader::invoke`.
#[derive(Debug, Serialize, Deserialize)]
struct JsonYamlLoadParams {
/// Path of the JSON or YAML file to load.
pub file_path: String,
/// Explicit format name; when absent the loader detects the format from the
/// file extension.
pub format: Option<String>, }
pub struct HtmlDocumentLoader;
impl HtmlDocumentLoader {
pub fn new() -> Self {
Self
}
async fn load_html(&self, file_path: &str, extract_mode: Option<&str>) -> Result<DocumentContent, RustChainError> {
let path = Path::new(file_path);
if !path.exists() {
return Err(RustChainError::Tool(ToolError::InvalidParameters {
tool_name: "html_loader".to_string(),
details: format!("File does not exist: {}", file_path),
}));
}
let is_html_file = match path.extension().and_then(|ext| ext.to_str()) {
Some("html") | Some("htm") => true,
_ => false,
};
let file_content = fs::read_to_string(file_path)
.map_err(|e| RustChainError::Tool(ToolError::ExecutionFailed {
tool_name: "html_loader".to_string(),
reason: format!("Failed to read file: {}", e),
}))?;
let parsed_content = self.parse_html(&file_content, extract_mode.unwrap_or("text"))?;
let file_size = file_content.len();
let content_type = if is_html_file {
"text/html"
} else {
"text/plain"
};
let metadata = DocumentMetadata {
file_path: file_path.to_string(),
file_size,
content_type: content_type.to_string(),
pages: 1, created_at: None,
modified_at: None,
};
info!("HTML loaded successfully: {} characters, mode: {}",
parsed_content.len(), extract_mode.unwrap_or("text"));
Ok(DocumentContent {
text: parsed_content,
metadata,
source: "html_loader".to_string(),
})
}
fn parse_html(&self, content: &str, extract_mode: &str) -> Result<String, RustChainError> {
use scraper::Html;
let document = Html::parse_document(content);
match extract_mode {
"text" => self.extract_text_content(&document),
"structure" => self.extract_structural_analysis(&document),
"links" => self.extract_links(&document),
"metadata" => self.extract_metadata(&document),
"all" => {
let text = self.extract_text_content(&document)?;
let structure = self.extract_structural_analysis(&document)?;
let links = self.extract_links(&document)?;
let metadata = self.extract_metadata(&document)?;
Ok(format!(
"=== HTML DOCUMENT ANALYSIS ===\n\n{}\n\n=== STRUCTURAL ANALYSIS ===\n\n{}\n\n=== EXTRACTED LINKS ===\n\n{}\n\n=== PAGE METADATA ===\n\n{}",
text, structure, links, metadata
))
},
_ => Err(RustChainError::Tool(ToolError::InvalidParameters {
tool_name: "html_loader".to_string(),
details: format!("Unsupported extract mode: {}. Supported modes: text, structure, links, metadata, all", extract_mode),
})),
}
}
fn extract_text_content(&self, document: &scraper::Html) -> Result<String, RustChainError> {
use scraper::Selector;
let _script_selector = scraper::Selector::parse("script, style").unwrap();
let _cleaned_html = document.html();
let body_selector = Selector::parse("body").map_err(|e| {
RustChainError::Tool(ToolError::ExecutionFailed {
tool_name: "html_loader".to_string(),
reason: format!("CSS selector error: {}", e),
})
})?;
#[allow(unused_assignments)]
let mut text_content = String::new();
if let Some(body) = document.select(&body_selector).next() {
text_content = self.extract_element_text(body);
} else {
text_content = document.root_element().text().collect::<Vec<_>>().join(" ");
}
let cleaned = text_content
.lines()
.map(|line| line.trim())
.filter(|line| !line.is_empty())
.collect::<Vec<_>>()
.join("\n");
Ok(cleaned)
}
fn extract_element_text(&self, element: scraper::ElementRef) -> String {
element.text().collect::<Vec<_>>().join(" ")
}
fn extract_structural_analysis(&self, document: &scraper::Html) -> Result<String, RustChainError> {
use scraper::Selector;
let mut analysis = String::new();
let selectors = [
("Headings (h1)", "h1"),
("Headings (h2)", "h2"),
("Headings (h3)", "h3"),
("Headings (h4-h6)", "h4, h5, h6"),
("Paragraphs", "p"),
("Lists", "ul, ol"),
("List Items", "li"),
("Links", "a"),
("Images", "img"),
("Tables", "table"),
("Forms", "form"),
("Divs", "div"),
("Spans", "span"),
];
analysis.push_str("HTML Structure Analysis:\n");
for (name, selector_str) in &selectors {
let selector = Selector::parse(selector_str).map_err(|e| {
RustChainError::Tool(ToolError::ExecutionFailed {
tool_name: "html_loader".to_string(),
reason: format!("CSS selector error: {}", e),
})
})?;
let count = document.select(&selector).count();
if count > 0 {
analysis.push_str(&format!(" • {}: {}\n", name, count));
}
}
let title_selector = Selector::parse("title").unwrap();
if let Some(title_element) = document.select(&title_selector).next() {
let title_text = title_element.text().collect::<String>();
if !title_text.trim().is_empty() {
analysis.push_str(&format!("\nPage Title: {}\n", title_text.trim()));
}
}
analysis.push_str("\nHeading Structure:\n");
for level in 1..=6 {
let heading_selector = Selector::parse(&format!("h{}", level)).unwrap();
let headings: Vec<String> = document.select(&heading_selector)
.map(|h| h.text().collect::<String>().trim().to_string())
.filter(|h| !h.is_empty())
.collect();
if !headings.is_empty() {
analysis.push_str(&format!(" H{}: {}\n", level, headings.join(", ")));
}
}
Ok(analysis)
}
fn extract_links(&self, document: &scraper::Html) -> Result<String, RustChainError> {
use scraper::Selector;
let link_selector = Selector::parse("a[href]").map_err(|e| {
RustChainError::Tool(ToolError::ExecutionFailed {
tool_name: "html_loader".to_string(),
reason: format!("CSS selector error: {}", e),
})
})?;
let mut links_text = String::new();
links_text.push_str("Extracted Links:\n");
let links: Vec<(String, String)> = document.select(&link_selector)
.filter_map(|link| {
let href = link.value().attr("href")?;
let text = link.text().collect::<String>().trim().to_string();
Some((href.to_string(), text))
})
.collect();
if links.is_empty() {
links_text.push_str(" No links found\n");
} else {
for (href, text) in links {
let display_text = if text.is_empty() { "(no text)" } else { &text };
links_text.push_str(&format!(" • {} → {}\n", display_text, href));
}
}
Ok(links_text)
}
fn extract_metadata(&self, document: &scraper::Html) -> Result<String, RustChainError> {
use scraper::Selector;
let mut metadata = String::new();
metadata.push_str("HTML Metadata:\n");
let meta_selector = Selector::parse("meta").map_err(|e| {
RustChainError::Tool(ToolError::ExecutionFailed {
tool_name: "html_loader".to_string(),
reason: format!("CSS selector error: {}", e),
})
})?;
let meta_tags: Vec<(String, String)> = document.select(&meta_selector)
.filter_map(|meta| {
let name = meta.value().attr("name")
.or_else(|| meta.value().attr("property"))
.or_else(|| meta.value().attr("http-equiv"))?;
let content = meta.value().attr("content")?;
Some((name.to_string(), content.to_string()))
})
.collect();
if meta_tags.is_empty() {
metadata.push_str(" No meta tags found\n");
} else {
for (name, content) in meta_tags {
metadata.push_str(&format!(" • {}: {}\n", name, content));
}
}
let title_selector = Selector::parse("title").unwrap();
if let Some(title_element) = document.select(&title_selector).next() {
let title_text = title_element.text().collect::<String>();
if !title_text.trim().is_empty() {
metadata.push_str(&format!("\nTitle: {}\n", title_text.trim()));
}
}
let html_selector = Selector::parse("html").unwrap();
if let Some(html_element) = document.select(&html_selector).next() {
if let Some(lang) = html_element.value().attr("lang") {
metadata.push_str(&format!("Language: {}\n", lang));
}
}
Ok(metadata)
}
}
/// Exposes [`HtmlDocumentLoader`] through the `Tool` interface under the name `html_loader`.
#[async_trait]
impl Tool for HtmlDocumentLoader {
    /// Identifier of this tool.
    fn name(&self) -> &'static str {
        "html_loader"
    }

    /// Capabilities declared by this tool: basic operation plus system access.
    fn capabilities(&self) -> Vec<ToolCapability> {
        Vec::from([ToolCapability::Basic, ToolCapability::SystemAccess])
    }

    /// Parses `input` as JSON [`HtmlLoadParams`], loads the referenced file and
    /// returns the resulting document serialized as structured JSON.
    async fn invoke(&self, input: &str) -> Result<ToolResult, RustChainError> {
        let request = match serde_json::from_str::<HtmlLoadParams>(input) {
            Ok(parsed) => parsed,
            Err(parse_error) => {
                return Err(RustChainError::Tool(ToolError::InvalidParameters {
                    tool_name: "html_loader".to_string(),
                    details: format!("Invalid parameters: {}", parse_error),
                }));
            }
        };

        let requested_mode = request.extract_mode.as_deref();
        let loaded = self.load_html(&request.file_path, requested_mode).await?;

        let payload = serde_json::to_value(loaded)?;
        Ok(ToolResult::StructuredJson(payload))
    }
}
/// JSON parameters deserialized by `HtmlDocumentLoader::invoke`.
#[derive(Debug, Serialize, Deserialize)]
struct HtmlLoadParams {
/// Path of the HTML file to load.
pub file_path: String,
/// Extraction mode: `text`, `structure`, `links`, `metadata` or `all`.
/// The loader uses `text` when this is absent.
pub extract_mode: Option<String>, }
/// Registers the PDF, CSV, JSON/YAML and HTML document loaders with
/// `registry`, in that order, logging each step.
pub fn register_document_loaders(registry: &mut crate::core::tools::ToolRegistry) {
    info!("Starting document loader registration...");

    registry.register(Box::new(PdfDocumentLoader::new()));
    info!("Registered PDF Document Loader");

    registry.register(Box::new(CsvDocumentLoader::new()));
    info!("Registered CSV Document Loader");

    registry.register(Box::new(JsonYamlDocumentLoader::new()));
    info!("Registered JSON/YAML Document Loader");

    registry.register(Box::new(HtmlDocumentLoader::new()));
    info!("Registered HTML Document Loader");

    info!("Document loader registration complete");
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use std::io::Write;
use tempfile::TempDir;
#[test]
fn test_document_load_params_serialization() {
let params = DocumentLoadParams {
file_path: "/path/to/document.pdf".to_string(),
};
let json = serde_json::to_string(¶ms).unwrap();
let deserialized: DocumentLoadParams = serde_json::from_str(&json).unwrap();
assert_eq!(params.file_path, deserialized.file_path);
}
#[test]
fn test_csv_load_params_serialization() {
let params = CsvLoadParams {
file_path: "/path/to/data.csv".to_string(),
delimiter: Some(";".to_string()),
has_headers: Some(true),
};
let json = serde_json::to_string(¶ms).unwrap();
let deserialized: CsvLoadParams = serde_json::from_str(&json).unwrap();
assert_eq!(params.file_path, deserialized.file_path);
assert_eq!(params.delimiter, deserialized.delimiter);
assert_eq!(params.has_headers, deserialized.has_headers);
}
#[test]
fn test_document_content_serialization() {
let content = DocumentContent {
text: "Sample document content".to_string(),
metadata: DocumentMetadata {
file_path: "/path/to/doc.pdf".to_string(),
file_size: 1024,
content_type: "application/pdf".to_string(),
pages: 5,
created_at: None,
modified_at: None,
},
source: "pdf_loader".to_string(),
};
let json = serde_json::to_value(&content).unwrap();
assert_eq!(json["text"], "Sample document content");
assert_eq!(json["source"], "pdf_loader");
assert_eq!(json["metadata"]["pages"], 5);
}
#[test]
fn test_pdf_loader_name() {
let loader = PdfDocumentLoader::new();
assert_eq!(loader.name(), "pdf_loader");
}
#[test]
fn test_pdf_loader_capabilities() {
let loader = PdfDocumentLoader::new();
let capabilities = loader.capabilities();
assert!(capabilities.contains(&ToolCapability::Basic));
assert!(capabilities.contains(&ToolCapability::SystemAccess));
}
#[test]
fn test_csv_loader_name() {
let loader = CsvDocumentLoader::new();
assert_eq!(loader.name(), "csv_loader");
}
#[test]
fn test_csv_loader_capabilities() {
let loader = CsvDocumentLoader::new();
let capabilities = loader.capabilities();
assert!(capabilities.contains(&ToolCapability::Basic));
assert!(capabilities.contains(&ToolCapability::SystemAccess));
}
#[test]
fn test_csv_parsing_with_headers() {
let loader = CsvDocumentLoader::new();
let csv_content = "name,age,city\nAlice,28,NYC\nBob,35,LA";
let result = loader.parse_csv(csv_content, ',', true);
assert!(result.is_ok());
let csv_data = result.unwrap();
assert_eq!(csv_data.headers, vec!["name", "age", "city"]);
assert_eq!(csv_data.rows.len(), 2);
assert_eq!(csv_data.rows[0], vec!["Alice", "28", "NYC"]);
assert_eq!(csv_data.rows[1], vec!["Bob", "35", "LA"]);
}
#[test]
fn test_csv_parsing_with_quotes() {
let loader = CsvDocumentLoader::new();
let csv_content = "name,location\n\"John Doe\",\"New York, NY\"\n\"Jane Smith\",\"Los Angeles, CA\"";
let result = loader.parse_csv(csv_content, ',', true);
assert!(result.is_ok());
let csv_data = result.unwrap();
assert_eq!(csv_data.headers, vec!["name", "location"]);
assert_eq!(csv_data.rows[0], vec!["John Doe", "New York, NY"]);
assert_eq!(csv_data.rows[1], vec!["Jane Smith", "Los Angeles, CA"]);
}
#[test]
fn test_csv_parsing_without_headers() {
let loader = CsvDocumentLoader::new();
let csv_content = "Alice,28,NYC\nBob,35,LA";
let result = loader.parse_csv(csv_content, ',', false);
assert!(result.is_ok());
let csv_data = result.unwrap();
assert_eq!(csv_data.headers, vec!["Column_1", "Column_2", "Column_3"]);
assert_eq!(csv_data.rows.len(), 2);
assert_eq!(csv_data.rows[0], vec!["Alice", "28", "NYC"]);
}
#[tokio::test]
async fn test_pdf_loader_invalid_file() {
let loader = PdfDocumentLoader::new();
let result = loader.invoke(r#"{"file_path": "/nonexistent/file.pdf"}"#).await;
assert!(result.is_err());
assert!(format!("{:?}", result.unwrap_err()).contains("does not exist"));
}
#[tokio::test]
async fn test_csv_loader_with_temp_file() {
let temp_dir = TempDir::new().unwrap();
let file_path = temp_dir.path().join("test.csv");
let csv_content = "name,age,city\nJohn,30,New York\nJane,25,Los Angeles";
let mut file = File::create(&file_path).unwrap();
file.write_all(csv_content.as_bytes()).unwrap();
let loader = CsvDocumentLoader::new();
let params = serde_json::json!({
"file_path": file_path.to_string_lossy(),
"delimiter": ",",
"has_headers": true
});
let result = loader.invoke(¶ms.to_string()).await;
assert!(result.is_ok());
if let Ok(ToolResult::StructuredJson(response)) = result {
let doc: DocumentContent = serde_json::from_value(response).unwrap();
assert!(doc.text.contains("John"));
assert!(doc.text.contains("Jane"));
assert_eq!(doc.metadata.content_type, "text/csv");
}
}
#[tokio::test]
async fn test_invalid_params() {
let loader = PdfDocumentLoader::new();
let result = loader.invoke("invalid json").await;
assert!(result.is_err());
assert!(format!("{:?}", result.unwrap_err()).contains("Invalid parameters"));
}
#[test]
fn test_json_yaml_loader_name() {
let loader = JsonYamlDocumentLoader::new();
assert_eq!(loader.name(), "json_yaml_loader");
}
#[test]
fn test_json_yaml_loader_capabilities() {
let loader = JsonYamlDocumentLoader::new();
let capabilities = loader.capabilities();
assert!(capabilities.contains(&ToolCapability::Basic));
assert!(capabilities.contains(&ToolCapability::SystemAccess));
}
#[test]
fn test_json_yaml_load_params_serialization() {
let params = JsonYamlLoadParams {
file_path: "/path/to/data.json".to_string(),
format: Some("json".to_string()),
};
let json = serde_json::to_string(¶ms).unwrap();
let deserialized: JsonYamlLoadParams = serde_json::from_str(&json).unwrap();
assert_eq!(params.file_path, deserialized.file_path);
assert_eq!(params.format, deserialized.format);
}
#[test]
fn test_json_parsing() {
let loader = JsonYamlDocumentLoader::new();
let json_content = r#"{"name": "Alice", "age": 30, "active": true}"#;
let result = loader.parse_json(json_content);
assert!(result.is_ok());
let parsed = result.unwrap();
assert_eq!(parsed["name"], "Alice");
assert_eq!(parsed["age"], 30);
assert_eq!(parsed["active"], true);
}
#[test]
fn test_json_parsing_invalid() {
let loader = JsonYamlDocumentLoader::new();
let invalid_json = r#"{"name": "Alice", "age": 30, "active": true"#; let result = loader.parse_json(invalid_json);
assert!(result.is_err());
assert!(format!("{:?}", result.unwrap_err()).contains("JSON parsing error"));
}
#[test]
fn test_yaml_parsing() {
let loader = JsonYamlDocumentLoader::new();
let yaml_content = r#"
name: Alice
age: 30
active: true
skills:
- rust
- python
- javascript
"#;
let result = loader.parse_yaml(yaml_content);
assert!(result.is_ok());
let parsed = result.unwrap();
assert_eq!(parsed["name"], "Alice");
assert_eq!(parsed["age"], 30);
assert_eq!(parsed["active"], true);
assert!(parsed["skills"].is_array());
}
#[test]
fn test_yaml_parsing_invalid() {
let loader = JsonYamlDocumentLoader::new();
let invalid_yaml = r#"
name: Alice
age: 30
active: true
"#;
let result = loader.parse_yaml(invalid_yaml);
assert!(result.is_err());
assert!(format!("{:?}", result.unwrap_err()).contains("YAML parsing error"));
}
#[test]
fn test_format_structured_data_object() {
let loader = JsonYamlDocumentLoader::new();
let data = serde_json::json!({
"name": "Alice",
"age": 30,
"location": "New York"
});
let formatted = loader.format_structured_data(&data, "json");
assert!(formatted.contains("Document Format: JSON"));
assert!(formatted.contains("Type: Object"));
assert!(formatted.contains("Keys: 3"));
assert!(formatted.contains("Top-level keys: "));
assert!(formatted.contains("Alice"));
}
#[test]
fn test_format_structured_data_array() {
let loader = JsonYamlDocumentLoader::new();
let data = serde_json::json!([
{"name": "Alice", "age": 30},
{"name": "Bob", "age": 35}
]);
let formatted = loader.format_structured_data(&data, "json");
assert!(formatted.contains("Document Format: JSON"));
assert!(formatted.contains("Type: Array"));
assert!(formatted.contains("Length: 2"));
assert!(formatted.contains("First element type: Object"));
assert!(formatted.contains("Alice"));
}
#[test]
fn test_format_structured_data_string() {
let loader = JsonYamlDocumentLoader::new();
let data = serde_json::json!("Hello World");
let formatted = loader.format_structured_data(&data, "yaml");
assert!(formatted.contains("Document Format: YAML"));
assert!(formatted.contains("Type: String"));
assert!(formatted.contains("Hello World"));
}
#[tokio::test]
async fn test_json_loader_with_temp_file() {
let temp_dir = TempDir::new().unwrap();
let file_path = temp_dir.path().join("test.json");
let json_content = r#"{
"users": [
{"name": "Alice", "age": 28, "department": "Engineering"},
{"name": "Bob", "age": 35, "department": "Marketing"}
],
"total": 2
}"#;
let mut file = File::create(&file_path).unwrap();
file.write_all(json_content.as_bytes()).unwrap();
let loader = JsonYamlDocumentLoader::new();
let params = serde_json::json!({
"file_path": file_path.to_string_lossy(),
"format": "json"
});
let result = loader.invoke(¶ms.to_string()).await;
assert!(result.is_ok());
if let Ok(ToolResult::StructuredJson(response)) = result {
let doc: DocumentContent = serde_json::from_value(response).unwrap();
assert!(doc.text.contains("Document Format: JSON"));
assert!(doc.text.contains("Type: Object"));
assert!(doc.text.contains("Alice"));
assert!(doc.text.contains("Bob"));
assert_eq!(doc.metadata.content_type, "application/json");
assert_eq!(doc.source, "json_yaml_loader");
}
}
/// Loads a `.yaml` file from a temporary directory without a `format` parameter and
/// checks the formatted text, content type and source of the result.
#[tokio::test]
async fn test_yaml_loader_with_temp_file() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("test.yaml");
    // The nesting below is significant: without indentation the document has
    // duplicate top-level keys and a sequence entry following a mapping key.
    let yaml_content = r#"
config:
  name: "RustChain Test"
  version: "1.0.0"
  features:
    - llm
    - tools
    - rag
users:
  - name: Alice
    role: admin
    active: true
  - name: Bob
    role: user
    active: false
"#;
    let mut file = File::create(&file_path).unwrap();
    file.write_all(yaml_content.as_bytes()).unwrap();

    let loader = JsonYamlDocumentLoader::new();
    let params = serde_json::json!({
        "file_path": file_path.to_string_lossy()
    });
    let result = loader.invoke(&params.to_string()).await;

    // Fail loudly on an error or an unexpected result variant; an `if let` here
    // would silently skip every assertion below and let the test pass vacuously.
    let response = match result {
        Ok(ToolResult::StructuredJson(value)) => value,
        other => panic!("expected Ok(ToolResult::StructuredJson(_)), got {:?}", other),
    };
    let doc: DocumentContent =
        serde_json::from_value(response).expect("response should deserialize into DocumentContent");

    for expected in &[
        "Document Format: YAML",
        "Type: Object",
        "RustChain Test",
        "Alice",
        "admin",
    ] {
        assert!(
            doc.text.contains(expected),
            "document text is missing {:?}",
            expected
        );
    }
    assert_eq!(doc.metadata.content_type, "application/yaml");
    assert_eq!(doc.source, "json_yaml_loader");
}
/// Loads a `.json` file without a `format` parameter and checks that the result is
/// reported as JSON.
#[tokio::test]
async fn test_json_loader_auto_detect() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("auto.json");
    let json_content = r#"{"message": "Auto-detection test", "success": true}"#;
    let mut file = File::create(&file_path).unwrap();
    file.write_all(json_content.as_bytes()).unwrap();

    let loader = JsonYamlDocumentLoader::new();
    let params = serde_json::json!({
        "file_path": file_path.to_string_lossy()
    });
    let result = loader.invoke(&params.to_string()).await;

    // Fail loudly on an error or an unexpected result variant; an `if let` here
    // would silently skip every assertion below and let the test pass vacuously.
    let response = match result {
        Ok(ToolResult::StructuredJson(value)) => value,
        other => panic!("expected Ok(ToolResult::StructuredJson(_)), got {:?}", other),
    };
    let doc: DocumentContent =
        serde_json::from_value(response).expect("response should deserialize into DocumentContent");

    assert!(doc.text.contains("Document Format: JSON"));
    assert!(doc.text.contains("Auto-detection test"));
    assert_eq!(doc.metadata.content_type, "application/json");
}
/// Invoking the JSON/YAML loader on a missing path must fail with an error whose
/// debug output mentions "does not exist".
#[tokio::test]
async fn test_json_yaml_loader_invalid_file() {
    let loader = JsonYamlDocumentLoader::new();
    let outcome = loader
        .invoke(r#"{"file_path": "/nonexistent/file.json"}"#)
        .await;

    let error = match outcome {
        Err(error) => error,
        Ok(_) => panic!("loading a nonexistent file should fail"),
    };
    let rendered = format!("{:?}", error);
    assert!(rendered.contains("does not exist"));
}
#[tokio::test]
async fn test_json_yaml_loader_unsupported_extension() {
let temp_dir = TempDir::new().unwrap();
let file_path = temp_dir.path().join("test.txt");
let mut file = File::create(&file_path).unwrap();
file.write_all(b"some text content").unwrap();
let loader = JsonYamlDocumentLoader::new();
let params = serde_json::json!({
"file_path": file_path.to_string_lossy()
});
let result = loader.invoke(¶ms.to_string()).await;
assert!(result.is_err());
assert!(format!("{:?}", result.unwrap_err()).contains("Cannot auto-detect format"));
}
/// A `.txt` file containing JSON is loaded as JSON when the `format` parameter is
/// explicitly set to `"json"`.
#[tokio::test]
async fn test_json_yaml_loader_format_override() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("data.txt");
    let json_content = r#"{"override": "test", "format": "json"}"#;
    let mut file = File::create(&file_path).unwrap();
    file.write_all(json_content.as_bytes()).unwrap();

    let loader = JsonYamlDocumentLoader::new();
    let params = serde_json::json!({
        "file_path": file_path.to_string_lossy(),
        "format": "json"
    });
    let result = loader.invoke(&params.to_string()).await;

    // Fail loudly on an error or an unexpected result variant; an `if let` here
    // would silently skip every assertion below and let the test pass vacuously.
    let response = match result {
        Ok(ToolResult::StructuredJson(value)) => value,
        other => panic!("expected Ok(ToolResult::StructuredJson(_)), got {:?}", other),
    };
    let doc: DocumentContent =
        serde_json::from_value(response).expect("response should deserialize into DocumentContent");

    assert!(doc.text.contains("Document Format: JSON"));
    assert!(doc.text.contains("override"));
    assert_eq!(doc.metadata.content_type, "application/json");
}
/// The HTML loader identifies itself as `html_loader`.
#[test]
fn test_html_loader_name() {
    let reported_name = HtmlDocumentLoader::new().name();
    assert_eq!(reported_name, "html_loader");
}
/// The HTML loader advertises both the `Basic` and `SystemAccess` capabilities.
#[test]
fn test_html_loader_capabilities() {
    let advertised = HtmlDocumentLoader::new().capabilities();
    for required in &[ToolCapability::Basic, ToolCapability::SystemAccess] {
        assert!(advertised.contains(required));
    }
}
/// Round-trips `HtmlLoadParams` through JSON and checks that both fields survive.
#[test]
fn test_html_load_params_serialization() {
    let params = HtmlLoadParams {
        file_path: "/path/to/page.html".to_string(),
        extract_mode: Some("all".to_string()),
    };

    let json = serde_json::to_string(&params).unwrap();
    let deserialized: HtmlLoadParams = serde_json::from_str(&json).unwrap();

    assert_eq!(params.file_path, deserialized.file_path);
    assert_eq!(params.extract_mode, deserialized.extract_mode);
}
/// In `"text"` mode, `parse_html` keeps heading, paragraph and list text while
/// leaving out the contents of `<script>` and `<style>` elements.
#[test]
fn test_html_text_extraction() {
    let html_content = r#"
<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<script>console.log("should be ignored");</script>
<style>body { color: red; }</style>
</head>
<body>
<h1>Main Heading</h1>
<p>This is a paragraph with <strong>bold text</strong>.</p>
<ul>
<li>List item 1</li>
<li>List item 2</li>
</ul>
</body>
</html>
"#;
    let parsed = HtmlDocumentLoader::new().parse_html(html_content, "text");
    assert!(parsed.is_ok());
    let extracted_text = parsed.unwrap();

    let must_appear = [
        "Main Heading",
        "This is a paragraph",
        "bold text",
        "List item 1",
        "List item 2",
    ];
    for fragment in must_appear.iter() {
        assert!(extracted_text.contains(fragment));
    }

    let must_not_appear = ["console.log", "color: red"];
    for fragment in must_not_appear.iter() {
        assert!(!extracted_text.contains(fragment));
    }
}
/// In `"structure"` mode, `parse_html` reports element counts, the page title and
/// the heading texts of the document.
#[test]
fn test_html_structure_analysis() {
    let html_content = r#"
<!DOCTYPE html>
<html lang="en">
<head>
<title>Structure Test</title>
</head>
<body>
<h1>Main Title</h1>
<h2>Section 1</h2>
<p>First paragraph</p>
<p>Second paragraph</p>
<h2>Section 2</h2>
<ul>
<li>Item 1</li>
<li>Item 2</li>
<li>Item 3</li>
</ul>
<a href="http://example.com">External Link</a>
<img src="image.jpg" alt="Test Image">
<div>
<span>Nested content</span>
</div>
</body>
</html>
"#;
    let parsed = HtmlDocumentLoader::new().parse_html(html_content, "structure");
    assert!(parsed.is_ok());
    let structure = parsed.unwrap();

    let expected_lines = [
        "HTML Structure Analysis:",
        "Headings (h1): 1",
        "Headings (h2): 2",
        "Paragraphs: 2",
        "List Items: 3",
        "Links: 1",
        "Images: 1",
        "Page Title: Structure Test",
        "H1: Main Title",
        "H2: Section 1, Section 2",
    ];
    for fragment in expected_lines.iter() {
        assert!(structure.contains(fragment));
    }
}
/// In `"links"` mode, `parse_html` lists each anchor as `text → href`, marks an
/// anchor without text as "(no text)", and leaves non-anchor text out.
#[test]
fn test_html_links_extraction() {
    let html_content = "<html><body><a href=\"https://example.com\">Example Website</a><a href=\"/internal/page\">Internal Page</a><a href=\"mailto:test@example.com\">Email Link</a><a href=\"#section1\">Anchor Link</a><a href=\"http://blank.com\"></a><span>Not a link</span></body></html>";
    let parsed = HtmlDocumentLoader::new().parse_html(html_content, "links");
    assert!(parsed.is_ok());
    let links = parsed.unwrap();

    let expected_entries = [
        "Extracted Links:",
        "Example Website → https://example.com",
        "Internal Page → /internal/page",
        "Email Link → mailto:test@example.com",
        "Anchor Link → #section1",
        "(no text)",
    ];
    for entry in expected_entries.iter() {
        assert!(links.contains(entry));
    }
    assert!(!links.contains("Not a link"));
}
/// In `"metadata"` mode, `parse_html` reports the `<meta>` entries (by `name`,
/// `property` and `http-equiv`), the page title and the document language.
#[test]
fn test_html_metadata_extraction() {
    let html_content = r#"
<!DOCTYPE html>
<html lang="en-US">
<head>
<title>Metadata Test Page</title>
<meta name="description" content="A test page for metadata extraction">
<meta name="keywords" content="test, html, metadata">
<meta name="author" content="Test Author">
<meta property="og:title" content="Open Graph Title">
<meta http-equiv="content-type" content="text/html; charset=utf-8">
</head>
<body>
<h1>Content</h1>
</body>
</html>
"#;
    let parsed = HtmlDocumentLoader::new().parse_html(html_content, "metadata");
    assert!(parsed.is_ok());
    let metadata = parsed.unwrap();

    let expected_entries = [
        "HTML Metadata:",
        "description: A test page for metadata extraction",
        "keywords: test, html, metadata",
        "author: Test Author",
        "og:title: Open Graph Title",
        "content-type: text/html; charset=utf-8",
        "Title: Metadata Test Page",
        "Language: en-US",
    ];
    for entry in expected_entries.iter() {
        assert!(metadata.contains(entry));
    }
}
/// In `"all"` mode, `parse_html` emits every section header together with the
/// text, link and metadata content of the document.
#[test]
fn test_html_all_extraction_mode() {
    let html_content = r#"
<!DOCTYPE html>
<html>
<head>
<title>Complete Test</title>
<meta name="description" content="Complete test page">
</head>
<body>
<h1>Main Content</h1>
<p>Some text content</p>
<a href="http://example.com">Test Link</a>
</body>
</html>
"#;
    let parsed = HtmlDocumentLoader::new().parse_html(html_content, "all");
    assert!(parsed.is_ok());
    let all_content = parsed.unwrap();

    let expected_fragments = [
        "=== HTML DOCUMENT ANALYSIS ===",
        "=== STRUCTURAL ANALYSIS ===",
        "=== EXTRACTED LINKS ===",
        "=== PAGE METADATA ===",
        "Main Content",
        "Test Link → http://example.com",
        "description: Complete test page",
    ];
    for fragment in expected_fragments.iter() {
        assert!(all_content.contains(fragment));
    }
}
/// An unknown extract mode makes `parse_html` fail with an error whose debug output
/// mentions "Unsupported extract mode".
#[test]
fn test_html_invalid_extract_mode() {
    let outcome =
        HtmlDocumentLoader::new().parse_html("<html><body>Test</body></html>", "invalid_mode");

    let error = match outcome {
        Err(error) => error,
        Ok(_) => panic!("an unknown extract mode should be rejected"),
    };
    let rendered = format!("{:?}", error);
    assert!(rendered.contains("Unsupported extract mode"));
}
/// Loads an HTML file from a temporary directory in `"text"` mode and checks the
/// extracted text, content type and source of the result.
#[tokio::test]
async fn test_html_loader_with_temp_file() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("test.html");
    let html_content = "<!DOCTYPE html>
<html lang=\"en\">
<head>
<title>Test HTML Document</title>
<meta name=\"description\" content=\"A test HTML document for RustChain\">
<meta name=\"keywords\" content=\"rust, html, parsing, test\">
</head>
<body>
<header>
<h1>Welcome to RustChain HTML Loader</h1>
<nav>
<a href=\"#about\">About</a>
<a href=\"#features\">Features</a>
<a href=\"https://github.com/rustchain/rustchain\">GitHub</a>
</nav>
</header>
<main>
<section id=\"about\">
<h2>About RustChain</h2>
<p>RustChain is an AI agent framework built in Rust.</p>
<p>It provides comprehensive document processing capabilities.</p>
</section>
<section id=\"features\">
<h2>Key Features</h2>
<ul>
<li>High performance document parsing</li>
<li>Multiple format support</li>
<li>Extensible architecture</li>
</ul>
</section>
</main>
<footer>
<p>© 2025 RustChain Project</p>
</footer>
</body>
</html>";
    let mut file = File::create(&file_path).unwrap();
    file.write_all(html_content.as_bytes()).unwrap();

    let loader = HtmlDocumentLoader::new();
    let params = serde_json::json!({
        "file_path": file_path.to_string_lossy(),
        "extract_mode": "text"
    });
    let result = loader.invoke(&params.to_string()).await;

    // Fail loudly on an error or an unexpected result variant; an `if let` here
    // would silently skip every assertion below and let the test pass vacuously.
    let response = match result {
        Ok(ToolResult::StructuredJson(value)) => value,
        other => panic!("expected Ok(ToolResult::StructuredJson(_)), got {:?}", other),
    };
    let doc: DocumentContent =
        serde_json::from_value(response).expect("response should deserialize into DocumentContent");

    for expected in &[
        "Welcome to RustChain HTML Loader",
        "About RustChain",
        "AI agent framework",
        "High performance document parsing",
    ] {
        assert!(
            doc.text.contains(expected),
            "document text is missing {:?}",
            expected
        );
    }
    // The doctype declaration is markup, not content, so it must not leak into the text.
    assert!(!doc.text.contains("DOCTYPE"));
    assert_eq!(doc.metadata.content_type, "text/html");
    assert_eq!(doc.source, "html_loader");
}
/// Loads an HTML file from a temporary directory in `"structure"` mode and checks
/// the reported element counts and page title.
#[tokio::test]
async fn test_html_loader_structure_mode() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("structure.html");
    let html_content = "<html><head><title>Structure Analysis Test</title></head><body><h1>Main Title</h1><h2>Section A</h2><h2>Section B</h2><h3>Subsection B.1</h3><p>Paragraph 1</p><p>Paragraph 2</p><p>Paragraph 3</p><ul><li>Item 1</li><li>Item 2</li></ul><ol><li>Ordered Item 1</li></ol><table><tr><td>Cell</td></tr></table><form><input type=\"text\"></form><img src=\"test.jpg\" alt=\"Test\"><a href=\"#\">Link</a></body></html>";
    let mut file = File::create(&file_path).unwrap();
    file.write_all(html_content.as_bytes()).unwrap();

    let loader = HtmlDocumentLoader::new();
    let params = serde_json::json!({
        "file_path": file_path.to_string_lossy(),
        "extract_mode": "structure"
    });
    let result = loader.invoke(&params.to_string()).await;

    // Fail loudly on an error or an unexpected result variant; an `if let` here
    // would silently skip every assertion below and let the test pass vacuously.
    let response = match result {
        Ok(ToolResult::StructuredJson(value)) => value,
        other => panic!("expected Ok(ToolResult::StructuredJson(_)), got {:?}", other),
    };
    let doc: DocumentContent =
        serde_json::from_value(response).expect("response should deserialize into DocumentContent");

    for expected in &[
        "HTML Structure Analysis:",
        "Headings (h1): 1",
        "Headings (h2): 2",
        "Headings (h3): 1",
        "Paragraphs: 3",
        "Lists: 2",
        "List Items: 3",
        "Tables: 1",
        "Forms: 1",
        "Images: 1",
        "Links: 1",
        "Page Title: Structure Analysis Test",
    ] {
        assert!(
            doc.text.contains(expected),
            "structure report is missing {:?}",
            expected
        );
    }
}
/// Invoking the HTML loader on a missing path must fail with an error whose debug
/// output mentions "does not exist".
#[tokio::test]
async fn test_html_loader_invalid_file() {
    let loader = HtmlDocumentLoader::new();
    let outcome = loader
        .invoke(r#"{"file_path": "/nonexistent/file.html"}"#)
        .await;

    let error = match outcome {
        Err(error) => error,
        Ok(_) => panic!("loading a nonexistent file should fail"),
    };
    let rendered = format!("{:?}", error);
    assert!(rendered.contains("does not exist"));
}
/// HTML markup stored in a `.txt` file is still parsed in `"text"` mode, and the
/// result's content type is reported as `text/plain`.
#[tokio::test]
async fn test_html_loader_non_html_file() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("test.txt");
    let html_content = "<html><body>HTML content in txt file</body></html>";
    let mut file = File::create(&file_path).unwrap();
    file.write_all(html_content.as_bytes()).unwrap();

    let loader = HtmlDocumentLoader::new();
    let params = serde_json::json!({
        "file_path": file_path.to_string_lossy(),
        "extract_mode": "text"
    });
    let result = loader.invoke(&params.to_string()).await;

    // Fail loudly on an error or an unexpected result variant; an `if let` here
    // would silently skip every assertion below and let the test pass vacuously.
    let response = match result {
        Ok(ToolResult::StructuredJson(value)) => value,
        other => panic!("expected Ok(ToolResult::StructuredJson(_)), got {:?}", other),
    };
    let doc: DocumentContent =
        serde_json::from_value(response).expect("response should deserialize into DocumentContent");

    assert!(doc.text.contains("HTML content in txt file"));
    assert_eq!(doc.metadata.content_type, "text/plain");
    assert_eq!(doc.source, "html_loader");
}
/// Loads an HTML file from a temporary directory in `"all"` mode and checks that
/// every section header and a sample of each section's content is present.
#[tokio::test]
async fn test_html_loader_all_mode() {
    let temp_dir = TempDir::new().unwrap();
    let file_path = temp_dir.path().join("complete.html");
    let html_content = "<html><head><title>Complete HTML Test</title><meta name=\"description\" content=\"Complete test for all extraction modes\"></head><body><h1>Main Title</h1><p>Content paragraph with <a href=\"https://example.com\">external link</a>.</p></body></html>";
    let mut file = File::create(&file_path).unwrap();
    file.write_all(html_content.as_bytes()).unwrap();

    let loader = HtmlDocumentLoader::new();
    let params = serde_json::json!({
        "file_path": file_path.to_string_lossy(),
        "extract_mode": "all"
    });
    let result = loader.invoke(&params.to_string()).await;

    // Fail loudly on an error or an unexpected result variant; an `if let` here
    // would silently skip every assertion below and let the test pass vacuously.
    let response = match result {
        Ok(ToolResult::StructuredJson(value)) => value,
        other => panic!("expected Ok(ToolResult::StructuredJson(_)), got {:?}", other),
    };
    let doc: DocumentContent =
        serde_json::from_value(response).expect("response should deserialize into DocumentContent");

    for expected in &[
        "=== HTML DOCUMENT ANALYSIS ===",
        "=== STRUCTURAL ANALYSIS ===",
        "=== EXTRACTED LINKS ===",
        "=== PAGE METADATA ===",
        "Main Title",
        "HTML Structure Analysis:",
        "external link",
        "description: Complete test",
    ] {
        assert!(
            doc.text.contains(expected),
            "document text is missing {:?}",
            expected
        );
    }
    assert_eq!(doc.metadata.content_type, "text/html");
    assert_eq!(doc.source, "html_loader");
}
}