// Crate root: declares the crate's modules and re-exports selected items from
// each of them so callers can import them directly from the crate root.

/// Home of `DocLoaderError` and the `Result` alias re-exported at the crate root.
pub mod error;
/// Home of the helper functions `clean_text`, `chunk_text`,
/// `extract_text_metadata` and `detect_language`.
pub mod utils;
/// Home of `UniversalProcessor` and `DocumentProcessor`.
pub mod processors;
/// Home of the data types re-exported below (`UniversalOutput`, `DocumentChunk`,
/// `ProcessingParams`, `DocumentType`, ...).
// NOTE(review): this module shares its name with the built-in `core` crate; the
// `pub use core::{...}` below refers to this local module.
pub mod core;
// Error type and result alias, available at the crate root.
pub use error::{DocLoaderError, Result};
// Data types from the local `core` module, available at the crate root.
pub use core::{
UniversalOutput, DocumentChunk, ChunkMetadata, DocumentMetadata,
ProcessingParams, DocumentType, ProcessingInfo
};
// Processor items, available at the crate root.
pub use processors::{UniversalProcessor, DocumentProcessor};
// Helper functions from `utils`, available at the crate root.
pub use utils::{
clean_text, chunk_text, extract_text_metadata, detect_language
};
// The `python` module is compiled only when the `python` Cargo feature is
// enabled; when it is, all of its public items are glob re-exported here.
// NOTE(review): presumably the Python bindings — confirm against `python.rs`.
#[cfg(feature = "python")]
pub mod python;
#[cfg(feature = "python")]
pub use python::*;
#[cfg(test)]
mod tests {
    //! Smoke tests for the items re-exported at the crate root.

    use super::*;

    /// A processor can be constructed, and at least one extension is advertised.
    #[test]
    fn test_universal_processor_creation() {
        let _processor = UniversalProcessor::new();
        let advertised = UniversalProcessor::supported_extensions();
        assert!(!advertised.is_empty());
    }

    /// Every extension listed here must appear in the supported set.
    #[test]
    fn test_supported_extensions() {
        let extensions = UniversalProcessor::supported_extensions();
        for wanted in ["pdf", "txt", "json", "csv", "docx"] {
            assert!(
                extensions.contains(&wanted),
                "missing supported extension: {}",
                wanted
            );
        }
    }

    /// `ProcessingParams::default()` yields the expected default values.
    #[test]
    fn test_processing_params_default() {
        let defaults = ProcessingParams::default();

        // Numeric defaults.
        assert_eq!(defaults.max_chunk_size, 1000);
        assert_eq!(defaults.chunk_overlap, 100);

        // Boolean defaults: cleaning on, language detection off.
        assert!(defaults.text_cleaning);
        assert!(!defaults.language_detection);
    }

    /// Known extensions map to their `DocumentType`; unknown ones map to `None`.
    #[test]
    fn test_document_type_from_extension() {
        let expectations = [
            ("pdf", Some(DocumentType::PDF)),
            ("txt", Some(DocumentType::TXT)),
            ("json", Some(DocumentType::JSON)),
            ("csv", Some(DocumentType::CSV)),
            ("docx", Some(DocumentType::DOCX)),
            ("unknown", None),
        ];
        for (extension, expected) in expectations {
            assert_eq!(DocumentType::from_extension(extension), expected);
        }
    }

    /// Each `DocumentType` variant renders as its upper-case name.
    #[test]
    fn test_document_type_to_string() {
        let expectations = [
            (DocumentType::PDF, "PDF"),
            (DocumentType::TXT, "TXT"),
            (DocumentType::JSON, "JSON"),
            (DocumentType::CSV, "CSV"),
            (DocumentType::DOCX, "DOCX"),
        ];
        for (variant, label) in expectations {
            assert_eq!(variant.to_string(), label);
        }
    }

    /// `clean_text` turns the mixed-whitespace input into the single-spaced form.
    #[test]
    fn test_text_cleaning() {
        assert_eq!(
            clean_text(" Hello\t\tWorld \n\n Test "),
            "Hello World Test"
        );
    }

    /// Text longer than the chunk size is split, and the first chunk fits the limit.
    #[test]
    fn test_text_chunking() {
        let sample = "This is a test document with multiple sentences. It should be split into chunks properly.";
        let pieces = chunk_text(sample, 50, 10);

        assert!(pieces.len() > 1);
        assert!(pieces[0].len() <= 50);
    }

    /// Error variants render with their expected human-readable prefixes.
    #[test]
    fn test_error_handling() {
        let unsupported = DocLoaderError::UnsupportedFormat(String::from("test"));
        let rendered = unsupported.to_string();
        assert!(rendered.contains("Unsupported file format"));

        let missing = DocLoaderError::FileNotFound(String::from("test.txt"));
        let rendered = missing.to_string();
        assert!(rendered.contains("File not found"));
    }
}