use crate::core::{ExtractionResult, Result};
use std::collections::HashMap;
use std::io::Read;
/// A format-specific extractor that turns a document's bytes into an
/// [`ExtractionResult`].
///
/// Implementations must be `Send + Sync`, as required by the trait bound.
pub trait Parser: Send + Sync {
    /// Returns the type strings this parser declares support for.
    fn supported_types(&self) -> &[&str];

    /// Parses the in-memory document `data`, whose type is given by
    /// `mime_type`.
    fn parse(&self, data: &[u8], mime_type: &str) -> Result<ExtractionResult>;

    /// Parses a document supplied as a byte stream.
    ///
    /// The default implementation reads `reader` to end-of-stream into a
    /// single in-memory buffer and then delegates to [`Parser::parse`].
    ///
    /// # Errors
    ///
    /// Returns the read error (converted by `?`) if draining `reader` fails,
    /// or whatever error [`Parser::parse`] produces.
    fn parse_stream(&self, reader: &mut dyn Read, mime_type: &str) -> Result<ExtractionResult> {
        let mut contents: Vec<u8> = Vec::new();
        let _bytes_read = reader.read_to_end(&mut contents)?;
        self.parse(&contents[..], mime_type)
    }

    /// Returns a name identifying this parser.
    fn name(&self) -> &str;
}
/// A collection of [`Parser`] implementations that can be looked up by the
/// type strings they report via [`Parser::supported_types`].
pub struct ParserRegistry {
/// Registered parsers, stored in registration order.
parsers: Vec<Box<dyn Parser>>,
/// Maps a supported type string to the index of its parser in `parsers`.
mime_to_parser: HashMap<String, usize>,
}
impl ParserRegistry {
    /// Creates an empty registry: no parsers and no type mappings.
    ///
    /// This is not the same as [`Default::default`], which returns a registry
    /// with parsers already registered.
    pub fn new() -> Self {
        Self {
            parsers: Vec::new(),
            mime_to_parser: HashMap::new(),
        }
    }

    /// Stores `parser` and maps every type string it reports through
    /// [`Parser::supported_types`] to it.
    ///
    /// If a type string is already mapped to an earlier parser, the mapping is
    /// overwritten, so the most recently registered parser wins. The earlier
    /// parser remains stored but is no longer reachable through that type.
    pub fn register(&mut self, parser: Box<dyn Parser>) {
        // The parser is pushed last, so its index is the current length.
        let index = self.parsers.len();
        self.mime_to_parser.extend(
            parser
                .supported_types()
                .iter()
                // Destructure the `&&str` so `to_owned` runs on `&str` directly.
                .map(|&mime_type| (mime_type.to_owned(), index)),
        );
        self.parsers.push(parser);
    }

    /// Returns the parser mapped to `mime_type`, or `None` if no registered
    /// parser reported that type.
    ///
    /// The lookup is an exact string match against the registered type
    /// strings.
    pub fn get_parser(&self, mime_type: &str) -> Option<&dyn Parser> {
        let &index = self.mime_to_parser.get(mime_type)?;
        self.parsers.get(index).map(|boxed| boxed.as_ref())
    }

    /// Returns every type string that currently maps to a parser, sorted in
    /// ascending lexicographic order.
    ///
    /// `HashMap` iteration order is arbitrary, so the keys are sorted to give
    /// callers a stable, reproducible listing.
    pub fn supported_types(&self) -> Vec<String> {
        let mut types: Vec<String> = self.mime_to_parser.keys().cloned().collect();
        // Keys are unique, so an unstable sort yields the same result as a stable one.
        types.sort_unstable();
        types
    }
}
impl Default for ParserRegistry {
fn default() -> Self {
let mut registry = Self::new();
registry.register(Box::new(text::PlainTextParser));
registry.register(Box::new(text::JsonParser));
registry.register(Box::new(text::CsvParser));
registry.register(Box::new(text::XmlParser));
registry.register(Box::new(text::HtmlParser));
registry.register(Box::new(text::CssParser));
registry.register(Box::new(text::RtfParser));
registry.register(Box::new(document::PdfParser));
registry.register(Box::new(document::DocxParser));
registry.register(Box::new(document::OdtParser));
registry.register(Box::new(document::XlsxParser));
registry.register(Box::new(document::PptxParser));
registry.register(Box::new(document::OdsParser));
registry.register(Box::new(document::OdpParser));
registry.register(Box::new(document::XlsParser));
registry.register(Box::new(document::DocParser));
registry.register(Box::new(document::PptParser));
registry.register(Box::new(image::JpegParser));
registry.register(Box::new(image::PngParser));
registry.register(Box::new(image::TiffParser));
registry.register(Box::new(archive::ZipParser));
registry.register(Box::new(archive::TarParser));
registry
}
}
/// Provides `PlainTextParser`, `JsonParser`, `CsvParser`, `XmlParser`,
/// `HtmlParser`, `CssParser` and `RtfParser`.
pub mod text;
/// Provides `PdfParser`, `DocxParser`, `OdtParser`, `XlsxParser`,
/// `PptxParser`, `OdsParser`, `OdpParser`, `XlsParser`, `DocParser` and
/// `PptParser`.
pub mod document;
/// Provides `JpegParser`, `PngParser` and `TiffParser`.
pub mod image;
/// Provides `ZipParser` and `TarParser`.
pub mod archive;