use crate::core::{UniversalOutput, DocumentType, ProcessingParams};
use crate::error::DocLoaderError;
use std::path::Path;
pub mod pdf;
pub mod txt;
pub mod json;
pub mod csv;
pub mod docx;
/// Common interface implemented by each format-specific document processor.
///
/// Implementors must provide the type they handle plus two entry points: one
/// that works from a path on disk and one that works from bytes already in
/// memory.
pub trait DocumentProcessor {
    /// Reports the [`DocumentType`] associated with this processor.
    fn supported_type(&self) -> DocumentType;

    /// Processes the document located at `file_path` with the given `params`.
    ///
    /// # Errors
    ///
    /// Returns a [`DocLoaderError`] when the implementor fails to process the
    /// file; the concrete failure conditions are defined by each implementor.
    fn process_file(
        &self,
        file_path: &Path,
        params: &ProcessingParams,
    ) -> Result<UniversalOutput, DocLoaderError>;

    /// Processes a document supplied as raw bytes in `content`.
    ///
    /// `filename` accompanies the bytes; how it is used is up to the
    /// implementor (NOTE(review): presumably for metadata or diagnostics —
    /// confirm against the concrete processors).
    ///
    /// # Errors
    ///
    /// Returns a [`DocLoaderError`] when the implementor fails to process the
    /// content.
    fn process_content(
        &self,
        content: &[u8],
        filename: &str,
        params: &ProcessingParams,
    ) -> Result<UniversalOutput, DocLoaderError>;

    /// Version string of the processor; the provided default is `"1.0.0"`.
    fn version(&self) -> &'static str {
        "1.0.0"
    }
}
/// Aggregates one processor instance per document format declared in this
/// module (`pdf`, `txt`, `json`, `csv`, `docx`).
pub struct UniversalProcessor {
    /// Processor from the `pdf` submodule.
    pdf_processor: pdf::PdfProcessor,
    /// Processor from the `txt` submodule.
    txt_processor: txt::TxtProcessor,
    /// Processor from the `json` submodule.
    json_processor: json::JsonProcessor,
    /// Processor from the `csv` submodule.
    csv_processor: csv::CsvProcessor,
    /// Processor from the `docx` submodule.
    docx_processor: docx::DocxProcessor,
}
impl UniversalProcessor {
pub fn new() -> Self {
Self {
pdf_processor: pdf::PdfProcessor::new(),
txt_processor: txt::TxtProcessor::new(),
json_processor: json::JsonProcessor::new(),
csv_processor: csv::CsvProcessor::new(),
docx_processor: docx::DocxProcessor::new(),
}
}
pub fn process_file(&self, file_path: &Path, params: Option<ProcessingParams>) -> Result<UniversalOutput, DocLoaderError> {
let params = params.unwrap_or_default();
let extension = file_path
.extension()
.and_then(|ext| ext.to_str())
.ok_or_else(|| DocLoaderError::UnsupportedFormat("No file extension".to_string()))?;
let doc_type = DocumentType::from_extension(extension)
.ok_or_else(|| DocLoaderError::UnsupportedFormat(format!("Unsupported extension: {}", extension)))?;
match doc_type {
DocumentType::PDF => self.pdf_processor.process_file(file_path, ¶ms),
DocumentType::TXT => self.txt_processor.process_file(file_path, ¶ms),
DocumentType::JSON => self.json_processor.process_file(file_path, ¶ms),
DocumentType::CSV => self.csv_processor.process_file(file_path, ¶ms),
DocumentType::DOCX => self.docx_processor.process_file(file_path, ¶ms),
}
}
pub fn get_processor(&self, doc_type: &DocumentType) -> Box<&dyn DocumentProcessor> {
match doc_type {
DocumentType::PDF => Box::new(&self.pdf_processor),
DocumentType::TXT => Box::new(&self.txt_processor),
DocumentType::JSON => Box::new(&self.json_processor),
DocumentType::CSV => Box::new(&self.csv_processor),
DocumentType::DOCX => Box::new(&self.docx_processor),
}
}
pub fn supported_extensions() -> &'static [&'static str] {
&["pdf", "txt", "json", "csv", "docx"]
}
}
impl Default for UniversalProcessor {
fn default() -> Self {
Self::new()
}
}