1pub mod error;
42pub mod utils;
43pub mod processors;
44pub mod core;
45
46pub use error::{DocLoaderError, Result};
47pub use core::{
48 UniversalOutput, DocumentChunk, ChunkMetadata, DocumentMetadata,
49 ProcessingParams, DocumentType, ProcessingInfo
50};
51pub use processors::{UniversalProcessor, DocumentProcessor};
52
53pub use utils::{
55 clean_text, chunk_text, extract_text_metadata, detect_language
56};
57
58#[cfg(feature = "python")]
60pub mod python;
61
62#[cfg(feature = "python")]
64pub use python::*;
65
#[cfg(test)]
mod tests {
    //! Smoke tests covering the items re-exported from the crate root.

    use super::*;

    /// A processor can be constructed and reports at least one extension.
    #[test]
    fn test_universal_processor_creation() {
        let _instance = UniversalProcessor::new();
        let advertised = UniversalProcessor::supported_extensions();
        assert!(!advertised.is_empty());
    }

    /// Each of the expected extensions is present in the supported list.
    #[test]
    fn test_supported_extensions() {
        let supported = UniversalProcessor::supported_extensions();
        for ext in ["pdf", "txt", "json", "csv", "docx"] {
            assert!(supported.contains(&ext), "missing extension: {}", ext);
        }
    }

    /// Default parameters: chunk size 1000, overlap 100, text cleaning on,
    /// language detection off.
    #[test]
    fn test_processing_params_default() {
        let defaults = ProcessingParams::default();
        assert_eq!(
            (defaults.max_chunk_size, defaults.chunk_overlap),
            (1000, 100)
        );
        assert_eq!(
            (defaults.text_cleaning, defaults.language_detection),
            (true, false)
        );
    }

    /// Known extensions map to their variant; an unrecognised one maps to `None`.
    #[test]
    fn test_document_type_from_extension() {
        let expectations = [
            ("pdf", Some(DocumentType::PDF)),
            ("txt", Some(DocumentType::TXT)),
            ("json", Some(DocumentType::JSON)),
            ("csv", Some(DocumentType::CSV)),
            ("docx", Some(DocumentType::DOCX)),
            ("unknown", None),
        ];
        for (ext, expected) in expectations {
            assert_eq!(DocumentType::from_extension(ext), expected);
        }
    }

    /// Every variant renders as its upper-case name.
    #[test]
    fn test_document_type_to_string() {
        let labelled = [
            (DocumentType::PDF, "PDF"),
            (DocumentType::TXT, "TXT"),
            (DocumentType::JSON, "JSON"),
            (DocumentType::CSV, "CSV"),
            (DocumentType::DOCX, "DOCX"),
        ];
        for (variant, label) in labelled {
            assert_eq!(variant.to_string(), label);
        }
    }

    /// Whitespace runs (spaces, tabs, newlines) collapse and the ends are trimmed.
    #[test]
    fn test_text_cleaning() {
        let raw = " Hello\t\tWorld \n\n Test ";
        assert_eq!(clean_text(raw), "Hello World Test");
    }

    /// A text longer than the limit yields several chunks, the first within the limit.
    #[test]
    fn test_text_chunking() {
        let sample = "This is a test document with multiple sentences. It should be split into chunks properly.";
        let pieces = chunk_text(sample, 50, 10);
        assert!(pieces.len() >= 2);
        let first = &pieces[0];
        assert!(first.len() <= 50);
    }

    /// Error variants render messages containing their human-readable prefix.
    #[test]
    fn test_error_handling() {
        let cases = [
            (
                DocLoaderError::UnsupportedFormat(String::from("test")),
                "Unsupported file format",
            ),
            (
                DocLoaderError::FileNotFound(String::from("test.txt")),
                "File not found",
            ),
        ];
        for (err, needle) in cases {
            let rendered = err.to_string();
            assert!(rendered.contains(needle), "unexpected message: {}", rendered);
        }
    }
}