doc_loader/
lib.rs

//! # Doc Loader
//!
//! A comprehensive toolkit for extracting and processing document content from multiple file formats.
//!
//! This library provides unified processing for different document types:
//! - PDF documents
//! - Plain text files
//! - JSON documents
//! - CSV files
//! - DOCX documents
//!
//! Each processor extracts content and metadata, then formats everything into a universal JSON
//! structure ready for vector stores and RAG systems.
//!
//! ## Features
//!
//! - **Universal JSON Output**: Consistent format across all document types
//! - **Intelligent Text Processing**: Smart chunking, cleaning, and metadata extraction
//! - **Modular Architecture**: Each document type has its specialized processor
//! - **Vector Store Ready**: Optimized output for embedding and indexing
//!
//! ## Example
//!
//! ```rust
//! use doc_loader::{UniversalProcessor, ProcessingParams};
//!
//! // Create a processor instance
//! let processor = UniversalProcessor::new();
//! let params = ProcessingParams::default();
//!
//! // Get supported extensions
//! let extensions = UniversalProcessor::supported_extensions();
//! assert!(!extensions.is_empty());
//! assert!(extensions.contains(&"pdf"));
//!
//! // Example of processing (left commented out because it requires an actual file on disk)
//! // let result = processor.process_file(std::path::Path::new("document.pdf"), Some(params))?;
//! // println!("Extracted {} chunks", result.chunks.len());
//! ```
40
41pub mod error;
42pub mod utils;
43pub mod processors;
44pub mod core;
45
46pub use error::{DocLoaderError, Result};
47pub use core::{
48    UniversalOutput, DocumentChunk, ChunkMetadata, DocumentMetadata, 
49    ProcessingParams, DocumentType, ProcessingInfo
50};
51pub use processors::{UniversalProcessor, DocumentProcessor};
52
53// Re-export key utility functions
54pub use utils::{
55    clean_text, chunk_text, extract_text_metadata, detect_language
56};
57
58// Python bindings module
59#[cfg(feature = "python")]
60pub mod python;
61
62// Re-export Python bindings when feature is enabled
63#[cfg(feature = "python")]
64pub use python::*;
65
#[cfg(test)]
mod tests {
    //! Smoke tests for the items re-exported at the crate root.

    use super::*;

    /// Every extension the processor is expected to advertise.
    const EXPECTED_EXTENSIONS: [&str; 5] = ["pdf", "txt", "json", "csv", "docx"];

    /// Constructing a processor must not panic, and at least one extension is advertised.
    #[test]
    fn test_universal_processor_creation() {
        let _instance = UniversalProcessor::new();
        let advertised = UniversalProcessor::supported_extensions();
        assert!(!advertised.is_empty());
    }

    /// Each expected extension appears in the advertised list.
    #[test]
    fn test_supported_extensions() {
        let advertised = UniversalProcessor::supported_extensions();
        for ext in EXPECTED_EXTENSIONS.iter() {
            assert!(advertised.contains(ext), "missing supported extension: {}", ext);
        }
    }

    /// `ProcessingParams::default()` yields the expected default values.
    #[test]
    fn test_processing_params_default() {
        let defaults = ProcessingParams::default();

        // Chunking defaults.
        assert_eq!(defaults.max_chunk_size, 1000);
        assert_eq!(defaults.chunk_overlap, 100);

        // Feature toggles: cleaning on, language detection off.
        assert!(defaults.text_cleaning);
        assert!(!defaults.language_detection);
    }

    /// Known extensions map to their `DocumentType`; anything else maps to `None`.
    #[test]
    fn test_document_type_from_extension() {
        let known = [
            ("pdf", DocumentType::PDF),
            ("txt", DocumentType::TXT),
            ("json", DocumentType::JSON),
            ("csv", DocumentType::CSV),
            ("docx", DocumentType::DOCX),
        ];
        for (ext, expected) in known.iter() {
            // Compare by reference so the table entries need not be cloned.
            assert_eq!(
                DocumentType::from_extension(*ext).as_ref(),
                Some(expected),
                "extension: {}",
                ext
            );
        }

        assert_eq!(DocumentType::from_extension("unknown"), None);
    }

    /// Each `DocumentType` variant renders as its upper-case name.
    #[test]
    fn test_document_type_to_string() {
        let labelled = [
            (DocumentType::PDF, "PDF"),
            (DocumentType::TXT, "TXT"),
            (DocumentType::JSON, "JSON"),
            (DocumentType::CSV, "CSV"),
            (DocumentType::DOCX, "DOCX"),
        ];
        for (doc_type, label) in labelled.iter() {
            assert_eq!(doc_type.to_string(), *label);
        }
    }

    /// `clean_text` trims the ends and collapses runs of whitespace to single spaces.
    #[test]
    fn test_text_cleaning() {
        let cleaned = clean_text("  Hello\t\tWorld  \n\n  Test  ");
        assert_eq!(cleaned, "Hello World Test");
    }

    /// Text longer than the chunk size is split, and the first chunk fits the limit.
    #[test]
    fn test_text_chunking() {
        const SAMPLE: &str = "This is a test document with multiple sentences. It should be split into chunks properly.";

        let pieces = chunk_text(SAMPLE, 50, 10);
        assert!(pieces.len() > 1);

        let first = &pieces[0];
        assert!(first.len() <= 50);
    }

    /// Error variants render the expected human-readable messages.
    #[test]
    fn test_error_handling() {
        let unsupported = DocLoaderError::UnsupportedFormat(String::from("test"));
        let message = unsupported.to_string();
        assert!(message.contains("Unsupported file format"));

        let missing = DocLoaderError::FileNotFound(String::from("test.txt"));
        let message = missing.to_string();
        assert!(message.contains("File not found"));
    }
}