use crate::core::{
UniversalOutput, DocumentType, ProcessingParams, DocumentMetadata,
DocumentChunk, ChunkPosition, ChunkMetadata, ProcessingInfo
};
use crate::error::DocLoaderError;
use crate::processors::DocumentProcessor;
use crate::utils::{chunk_text, clean_text, extract_text_metadata};
use std::path::Path;
use std::fs;
use chrono::Utc;
use serde_json::json;
pub struct DocxProcessor;
impl DocxProcessor {
pub fn new() -> Self {
Self
}
fn read_docx_file(&self, file_path: &Path) -> Result<(String, DocxMetadata), DocLoaderError> {
let file_content = fs::read(file_path)?;
self.extract_from_docx_content(&file_content)
}
fn extract_from_docx_content(&self, content: &[u8]) -> Result<(String, DocxMetadata), DocLoaderError> {
let text_content = format!(
"DOCX content extraction not fully implemented.\n\
File size: {} bytes\n\
This is a placeholder for DOCX text extraction.\n\
In a real implementation, you would:\n\
1. Extract the ZIP archive\n\
2. Parse document.xml\n\
3. Extract text from XML elements\n\
4. Handle formatting, tables, etc.",
content.len()
);
let metadata = DocxMetadata {
word_count: 0, paragraph_count: 0,
page_count: None,
title: None,
author: None,
subject: None,
company: None,
last_modified_by: None,
};
Ok((text_content, metadata))
}
}
#[derive(Debug, Clone)]
pub struct DocxMetadata {
pub word_count: usize,
pub paragraph_count: usize,
pub page_count: Option<usize>,
pub title: Option<String>,
pub author: Option<String>,
pub subject: Option<String>,
pub company: Option<String>,
pub last_modified_by: Option<String>,
}
impl DocumentProcessor for DocxProcessor {
fn supported_type(&self) -> DocumentType {
DocumentType::DOCX
}
fn process_file(&self, file_path: &Path, params: &ProcessingParams) -> Result<UniversalOutput, DocLoaderError> {
let start_time = std::time::Instant::now();
if !file_path.exists() {
return Err(DocLoaderError::FileNotFound(
format!("File not found: {}", file_path.display())
));
}
let (raw_text, docx_meta) = self.read_docx_file(file_path)?;
let text = if params.text_cleaning {
clean_text(&raw_text)
} else {
raw_text
};
let chunks_text = chunk_text(&text, params.max_chunk_size, params.chunk_overlap);
let mut chunks = Vec::new();
for (index, chunk_text) in chunks_text.iter().enumerate() {
let text_meta = extract_text_metadata(chunk_text);
let start_offset = if index == 0 {
0
} else {
text.find(chunk_text).unwrap_or(0)
};
let chunk = DocumentChunk {
id: format!("docx_chunk_{}", index),
content: chunk_text.clone(),
chunk_index: index,
position: ChunkPosition {
page: None, line: None,
start_offset: Some(start_offset),
end_offset: Some(start_offset + chunk_text.len()),
},
metadata: ChunkMetadata {
size: chunk_text.len(),
language: if params.language_detection { text_meta.detected_language } else { None },
confidence: Some(0.8), format_specific: Some(json!({
"word_count": text_meta.word_count,
"estimated_tokens": text_meta.estimated_tokens,
"paragraph_count": chunk_text.split("\n\n").count()
})),
},
};
chunks.push(chunk);
}
let file_metadata = fs::metadata(file_path)?;
let text_meta = extract_text_metadata(&text);
let document_metadata = DocumentMetadata {
filename: file_path.file_name()
.and_then(|name| name.to_str())
.unwrap_or("unknown.docx")
.to_string(),
filepath: file_path.to_string_lossy().to_string(),
document_type: DocumentType::DOCX,
file_size: file_metadata.len(),
created_at: file_metadata.created().ok().map(|t| t.into()),
modified_at: file_metadata.modified().ok().map(|t| t.into()),
title: docx_meta.title.clone(),
author: docx_meta.author.clone(),
format_metadata: json!({
"docx_metadata": {
"word_count": docx_meta.word_count,
"paragraph_count": docx_meta.paragraph_count,
"page_count": docx_meta.page_count,
"subject": docx_meta.subject,
"company": docx_meta.company,
"last_modified_by": docx_meta.last_modified_by
},
"extracted_text_metadata": {
"total_words": text_meta.word_count,
"total_characters": text_meta.character_count,
"total_lines": text_meta.line_count,
"estimated_tokens": text_meta.estimated_tokens,
"detected_language": text_meta.detected_language
}
}),
};
let processing_time = start_time.elapsed();
let processing_info = ProcessingInfo {
processor: "DocxProcessor".to_string(),
processor_version: self.version().to_string(),
processed_at: Utc::now(),
processing_time_ms: processing_time.as_millis() as u64,
total_chunks: chunks.len(),
total_content_size: text.len(),
processing_params: params.clone(),
};
Ok(UniversalOutput {
document_metadata,
chunks,
processing_info,
})
}
fn process_content(&self, content: &[u8], filename: &str, params: &ProcessingParams) -> Result<UniversalOutput, DocLoaderError> {
let start_time = std::time::Instant::now();
let (raw_text, docx_meta) = self.extract_from_docx_content(content)?;
let text = if params.text_cleaning {
clean_text(&raw_text)
} else {
raw_text
};
let chunks_text = chunk_text(&text, params.max_chunk_size, params.chunk_overlap);
let mut chunks = Vec::new();
for (index, chunk_text) in chunks_text.iter().enumerate() {
let text_meta = extract_text_metadata(chunk_text);
let start_offset = if index == 0 {
0
} else {
text.find(chunk_text).unwrap_or(0)
};
let chunk = DocumentChunk {
id: format!("docx_chunk_{}", index),
content: chunk_text.clone(),
chunk_index: index,
position: ChunkPosition {
page: None,
line: None,
start_offset: Some(start_offset),
end_offset: Some(start_offset + chunk_text.len()),
},
metadata: ChunkMetadata {
size: chunk_text.len(),
language: if params.language_detection { text_meta.detected_language } else { None },
confidence: Some(0.8),
format_specific: Some(json!({
"word_count": text_meta.word_count,
"estimated_tokens": text_meta.estimated_tokens,
"paragraph_count": chunk_text.split("\n\n").count()
})),
},
};
chunks.push(chunk);
}
let text_meta = extract_text_metadata(&text);
let document_metadata = DocumentMetadata {
filename: filename.to_string(),
filepath: format!("memory://{}", filename),
document_type: DocumentType::DOCX,
file_size: content.len() as u64,
created_at: Some(Utc::now()),
modified_at: Some(Utc::now()),
title: docx_meta.title.clone(),
author: docx_meta.author.clone(),
format_metadata: json!({
"docx_metadata": {
"word_count": docx_meta.word_count,
"paragraph_count": docx_meta.paragraph_count,
"page_count": docx_meta.page_count,
"subject": docx_meta.subject,
"company": docx_meta.company,
"last_modified_by": docx_meta.last_modified_by
},
"extracted_text_metadata": {
"total_words": text_meta.word_count,
"total_characters": text_meta.character_count,
"total_lines": text_meta.line_count,
"estimated_tokens": text_meta.estimated_tokens,
"detected_language": text_meta.detected_language
}
}),
};
let processing_time = start_time.elapsed();
let processing_info = ProcessingInfo {
processor: "DocxProcessor".to_string(),
processor_version: self.version().to_string(),
processed_at: Utc::now(),
processing_time_ms: processing_time.as_millis() as u64,
total_chunks: chunks.len(),
total_content_size: text.len(),
processing_params: params.clone(),
};
Ok(UniversalOutput {
document_metadata,
chunks,
processing_info,
})
}
}
impl Default for DocxProcessor {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_docx_processor_creation() {
let processor = DocxProcessor::new();
assert_eq!(processor.supported_type().to_string(), "DOCX");
}
#[test]
fn test_process_content() {
let processor = DocxProcessor::new();
let content = b"fake docx content"; let params = ProcessingParams::default();
let result = processor.process_content(content, "test.docx", ¶ms);
assert!(result.is_ok());
let output = result.unwrap();
assert_eq!(output.document_metadata.document_type.to_string(), "DOCX");
assert!(!output.chunks.is_empty());
}
}