doc_loader 0.3.2

A comprehensive toolkit for extracting and processing documentation from multiple file formats (PDF, TXT, JSON, CSV, DOCX) with Python bindings
Documentation
use crate::core::{
    UniversalOutput, DocumentType, ProcessingParams, DocumentMetadata, 
    DocumentChunk, ChunkPosition, ChunkMetadata, ProcessingInfo
};
use crate::error::DocLoaderError;
use crate::processors::DocumentProcessor;
use crate::utils::{chunk_text, clean_text, extract_text_metadata};

use std::path::Path;
use std::fs;
use chrono::Utc;
use serde_json::json;

pub struct DocxProcessor;

impl DocxProcessor {
    pub fn new() -> Self {
        Self
    }
    
    /// Lit et extrait le texte d'un fichier DOCX
    fn read_docx_file(&self, file_path: &Path) -> Result<(String, DocxMetadata), DocLoaderError> {
        let file_content = fs::read(file_path)?;
        self.extract_from_docx_content(&file_content)
    }
    
    /// Extrait le contenu d'un fichier DOCX depuis les bytes
    fn extract_from_docx_content(&self, content: &[u8]) -> Result<(String, DocxMetadata), DocLoaderError> {
        // Note: docx-rs est une bibliothèque pour créer des documents DOCX, pas pour les lire
        // Pour une vraie implémentation, vous devriez utiliser une bibliothèque comme `docx` ou `zip` + XML parsing
        // Ici, nous fournissons une implémentation simplifiée
        
        // Tentative d'extraction basique (cette partie devrait être remplacée par une vraie extraction)
        let text_content = format!(
            "DOCX content extraction not fully implemented.\n\
            File size: {} bytes\n\
            This is a placeholder for DOCX text extraction.\n\
            In a real implementation, you would:\n\
            1. Extract the ZIP archive\n\
            2. Parse document.xml\n\
            3. Extract text from XML elements\n\
            4. Handle formatting, tables, etc.",
            content.len()
        );
        
        let metadata = DocxMetadata {
            word_count: 0, // Devrait être extrait du document
            paragraph_count: 0,
            page_count: None,
            title: None,
            author: None,
            subject: None,
            company: None,
            last_modified_by: None,
        };
        
        Ok((text_content, metadata))
    }
}

#[derive(Debug, Clone)]
pub struct DocxMetadata {
    pub word_count: usize,
    pub paragraph_count: usize,
    pub page_count: Option<usize>,
    pub title: Option<String>,
    pub author: Option<String>,
    pub subject: Option<String>,
    pub company: Option<String>,
    pub last_modified_by: Option<String>,
}

impl DocumentProcessor for DocxProcessor {
    fn supported_type(&self) -> DocumentType {
        DocumentType::DOCX
    }
    
    fn process_file(&self, file_path: &Path, params: &ProcessingParams) -> Result<UniversalOutput, DocLoaderError> {
        let start_time = std::time::Instant::now();
        
        // Vérifier que le fichier existe
        if !file_path.exists() {
            return Err(DocLoaderError::FileNotFound(
                format!("File not found: {}", file_path.display())
            ));
        }
        
        // Lire et extraire le contenu du DOCX
        let (raw_text, docx_meta) = self.read_docx_file(file_path)?;
        
        // Nettoyer le texte si demandé
        let text = if params.text_cleaning {
            clean_text(&raw_text)
        } else {
            raw_text
        };
        
        // Découper en chunks
        let chunks_text = chunk_text(&text, params.max_chunk_size, params.chunk_overlap);
        
        // Créer les chunks avec métadonnées
        let mut chunks = Vec::new();
        
        for (index, chunk_text) in chunks_text.iter().enumerate() {
            let text_meta = extract_text_metadata(chunk_text);
            
            let start_offset = if index == 0 { 
                0 
            } else { 
                text.find(chunk_text).unwrap_or(0)
            };
            
            let chunk = DocumentChunk {
                id: format!("docx_chunk_{}", index),
                content: chunk_text.clone(),
                chunk_index: index,
                position: ChunkPosition {
                    page: None, // TODO: implémenter la détection de page
                    line: None,
                    start_offset: Some(start_offset),
                    end_offset: Some(start_offset + chunk_text.len()),
                },
                metadata: ChunkMetadata {
                    size: chunk_text.len(),
                    language: if params.language_detection { text_meta.detected_language } else { None },
                    confidence: Some(0.8), // Confiance moyenne car l'extraction DOCX peut être incomplète
                    format_specific: Some(json!({
                        "word_count": text_meta.word_count,
                        "estimated_tokens": text_meta.estimated_tokens,
                        "paragraph_count": chunk_text.split("\n\n").count()
                    })),
                },
            };
            
            chunks.push(chunk);
        }
        
        // Métadonnées du fichier
        let file_metadata = fs::metadata(file_path)?;
        let text_meta = extract_text_metadata(&text);
        
        let document_metadata = DocumentMetadata {
            filename: file_path.file_name()
                .and_then(|name| name.to_str())
                .unwrap_or("unknown.docx")
                .to_string(),
            filepath: file_path.to_string_lossy().to_string(),
            document_type: DocumentType::DOCX,
            file_size: file_metadata.len(),
            created_at: file_metadata.created().ok().map(|t| t.into()),
            modified_at: file_metadata.modified().ok().map(|t| t.into()),
            title: docx_meta.title.clone(),
            author: docx_meta.author.clone(),
            format_metadata: json!({
                "docx_metadata": {
                    "word_count": docx_meta.word_count,
                    "paragraph_count": docx_meta.paragraph_count,
                    "page_count": docx_meta.page_count,
                    "subject": docx_meta.subject,
                    "company": docx_meta.company,
                    "last_modified_by": docx_meta.last_modified_by
                },
                "extracted_text_metadata": {
                    "total_words": text_meta.word_count,
                    "total_characters": text_meta.character_count,
                    "total_lines": text_meta.line_count,
                    "estimated_tokens": text_meta.estimated_tokens,
                    "detected_language": text_meta.detected_language
                }
            }),
        };
        
        let processing_time = start_time.elapsed();
        let processing_info = ProcessingInfo {
            processor: "DocxProcessor".to_string(),
            processor_version: self.version().to_string(),
            processed_at: Utc::now(),
            processing_time_ms: processing_time.as_millis() as u64,
            total_chunks: chunks.len(),
            total_content_size: text.len(),
            processing_params: params.clone(),
        };
        
        Ok(UniversalOutput {
            document_metadata,
            chunks,
            processing_info,
        })
    }
    
    fn process_content(&self, content: &[u8], filename: &str, params: &ProcessingParams) -> Result<UniversalOutput, DocLoaderError> {
        let start_time = std::time::Instant::now();
        
        // Extraire le contenu du DOCX depuis les bytes
        let (raw_text, docx_meta) = self.extract_from_docx_content(content)?;
        
        // Nettoyer le texte si demandé
        let text = if params.text_cleaning {
            clean_text(&raw_text)
        } else {
            raw_text
        };
        
        // Découper en chunks
        let chunks_text = chunk_text(&text, params.max_chunk_size, params.chunk_overlap);
        
        // Créer les chunks avec métadonnées
        let mut chunks = Vec::new();
        
        for (index, chunk_text) in chunks_text.iter().enumerate() {
            let text_meta = extract_text_metadata(chunk_text);
            
            let start_offset = if index == 0 { 
                0 
            } else { 
                text.find(chunk_text).unwrap_or(0)
            };
            
            let chunk = DocumentChunk {
                id: format!("docx_chunk_{}", index),
                content: chunk_text.clone(),
                chunk_index: index,
                position: ChunkPosition {
                    page: None,
                    line: None,
                    start_offset: Some(start_offset),
                    end_offset: Some(start_offset + chunk_text.len()),
                },
                metadata: ChunkMetadata {
                    size: chunk_text.len(),
                    language: if params.language_detection { text_meta.detected_language } else { None },
                    confidence: Some(0.8),
                    format_specific: Some(json!({
                        "word_count": text_meta.word_count,
                        "estimated_tokens": text_meta.estimated_tokens,
                        "paragraph_count": chunk_text.split("\n\n").count()
                    })),
                },
            };
            
            chunks.push(chunk);
        }
        
        let text_meta = extract_text_metadata(&text);
        
        let document_metadata = DocumentMetadata {
            filename: filename.to_string(),
            filepath: format!("memory://{}", filename),
            document_type: DocumentType::DOCX,
            file_size: content.len() as u64,
            created_at: Some(Utc::now()),
            modified_at: Some(Utc::now()),
            title: docx_meta.title.clone(),
            author: docx_meta.author.clone(),
            format_metadata: json!({
                "docx_metadata": {
                    "word_count": docx_meta.word_count,
                    "paragraph_count": docx_meta.paragraph_count,
                    "page_count": docx_meta.page_count,
                    "subject": docx_meta.subject,
                    "company": docx_meta.company,
                    "last_modified_by": docx_meta.last_modified_by
                },
                "extracted_text_metadata": {
                    "total_words": text_meta.word_count,
                    "total_characters": text_meta.character_count,
                    "total_lines": text_meta.line_count,
                    "estimated_tokens": text_meta.estimated_tokens,
                    "detected_language": text_meta.detected_language
                }
            }),
        };
        
        let processing_time = start_time.elapsed();
        let processing_info = ProcessingInfo {
            processor: "DocxProcessor".to_string(),
            processor_version: self.version().to_string(),
            processed_at: Utc::now(),
            processing_time_ms: processing_time.as_millis() as u64,
            total_chunks: chunks.len(),
            total_content_size: text.len(),
            processing_params: params.clone(),
        };
        
        Ok(UniversalOutput {
            document_metadata,
            chunks,
            processing_info,
        })
    }
}

impl Default for DocxProcessor {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_docx_processor_creation() {
        let processor = DocxProcessor::new();
        assert_eq!(processor.supported_type().to_string(), "DOCX");
    }

    #[test]
    fn test_process_content() {
        let processor = DocxProcessor::new();
        let content = b"fake docx content"; // Dans un vrai cas, ce serait un fichier DOCX binaire
        let params = ProcessingParams::default();
        
        let result = processor.process_content(content, "test.docx", &params);
        assert!(result.is_ok());
        
        let output = result.unwrap();
        assert_eq!(output.document_metadata.document_type.to_string(), "DOCX");
        assert!(!output.chunks.is_empty());
    }
}