mod pdf;
pub use pdf::PdfConverter;
use crate::error::{Error, Result};
use crate::model::Metadata;
use crate::render::{ExtractionStats, RenderOptions};
use std::collections::HashMap;
use std::path::Path;
use std::sync::Arc;
/// Options accompanying a single conversion request.
///
/// Every field is public and the type derives `Default`, so a value can be
/// assembled with struct-update syntax or with the `with_*` builder methods.
#[derive(Clone, Debug, Default)]
pub struct ConvertOptions {
    /// Rendering options passed along with the request.
    pub render: RenderOptions,
    /// Password supplied by the caller; `None` when none was given.
    pub password: Option<String>,
    /// Flag requesting extraction statistics. How it is honoured is decided
    /// by the converter that receives these options.
    pub collect_stats: bool,
    /// Requested output format.
    pub output_format: OutputFormat,
}
impl ConvertOptions {
    /// Creates options with every field set to its `Default` value.
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns these options with `render` replaced by `options`.
    pub fn with_render_options(self, options: RenderOptions) -> Self {
        Self {
            render: options,
            ..self
        }
    }

    /// Returns these options with `password` set to `Some(password)`.
    ///
    /// Accepts anything convertible into a `String`, e.g. `&str` or `String`.
    pub fn with_password(self, password: impl Into<String>) -> Self {
        Self {
            password: Some(password.into()),
            ..self
        }
    }

    /// Returns these options with `collect_stats` set to `collect`.
    pub fn with_stats(self, collect: bool) -> Self {
        Self {
            collect_stats: collect,
            ..self
        }
    }

    /// Returns these options with `output_format` set to `format`.
    pub fn with_format(self, format: OutputFormat) -> Self {
        Self {
            output_format: format,
            ..self
        }
    }
}
/// Output representation a caller can request for converted content.
///
/// `Markdown` is the value produced by `OutputFormat::default()`.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum OutputFormat {
    /// Markdown output; this is the default variant.
    #[default]
    Markdown,
    /// Text output.
    Text,
    /// JSON output.
    Json,
}
/// Outcome of a conversion: the produced content plus data describing it.
#[derive(Clone, Debug)]
pub struct ConvertResult {
    /// The converted content.
    pub content: String,
    /// Metadata attached to the result.
    pub metadata: Metadata,
    /// Extraction statistics, when some were attached; `None` otherwise.
    pub stats: Option<ExtractionStats>,
    /// MIME type string associated with `content`.
    pub mime_type: &'static str,
}
impl ConvertResult {
    /// Builds a result from `content` and `metadata`.
    ///
    /// The new value carries no statistics and uses the `"text/markdown"`
    /// MIME type; use the `with_*` methods to change either.
    pub fn new(content: String, metadata: Metadata) -> Self {
        Self {
            mime_type: "text/markdown",
            stats: None,
            content,
            metadata,
        }
    }

    /// Returns this result with `stats` attached.
    pub fn with_stats(self, stats: ExtractionStats) -> Self {
        Self {
            stats: Some(stats),
            ..self
        }
    }

    /// Returns this result with its MIME type replaced by `mime_type`.
    pub fn with_mime_type(self, mime_type: &'static str) -> Self {
        Self { mime_type, ..self }
    }

    /// Length of `content` in bytes (not characters).
    pub fn content_len(&self) -> usize {
        String::len(&self.content)
    }
}
/// A converter for one family of document formats.
///
/// The `Send + Sync` supertraits allow implementations to be shared across
/// threads, e.g. behind an `Arc<dyn DocumentConverter>`.
pub trait DocumentConverter: Send + Sync {
    /// File extensions this converter handles.
    ///
    /// Matching in [`supports_extension`](Self::supports_extension) ignores
    /// case, so entries may be declared in any case.
    fn supported_extensions(&self) -> &[&str];

    /// Name identifying this converter.
    fn name(&self) -> &str;

    /// Converts the document at `path` using `options`.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementation reports.
    fn convert(&self, path: &Path, options: &ConvertOptions) -> Result<ConvertResult>;

    /// Converts a document held in memory as `bytes` using `options`.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementation reports.
    fn convert_bytes(&self, bytes: &[u8], options: &ConvertOptions) -> Result<ConvertResult>;

    /// Returns `true` when `ext` is one of the supported extensions,
    /// compared case-insensitively.
    ///
    /// Both the query and each declared extension are lowercased before the
    /// comparison, so a converter declaring `"PDF"` still matches `"pdf"`.
    fn supports_extension(&self, ext: &str) -> bool {
        let wanted = ext.to_lowercase();
        self.supported_extensions()
            .iter()
            .any(|declared| declared.to_lowercase() == wanted)
    }
}
/// Lookup table of document converters, indexed by file extension and by
/// converter name. Converters are held as shared `Arc<dyn DocumentConverter>`
/// handles, so one converter can appear under several keys.
pub struct ConverterRegistry {
// Lowercased file extension -> converter registered for it.
converters: HashMap<String, Arc<dyn DocumentConverter>>,
// Lowercased converter name -> converter.
by_name: HashMap<String, Arc<dyn DocumentConverter>>,
}
impl ConverterRegistry {
pub fn new() -> Self {
Self {
converters: HashMap::new(),
by_name: HashMap::new(),
}
}
pub fn with_defaults() -> Self {
let mut registry = Self::new();
registry.register(Arc::new(PdfConverter::new()));
registry
}
pub fn register(&mut self, converter: Arc<dyn DocumentConverter>) {
for ext in converter.supported_extensions() {
self.converters
.insert(ext.to_lowercase(), converter.clone());
}
self.by_name
.insert(converter.name().to_lowercase(), converter);
}
pub fn get_by_extension(&self, ext: &str) -> Option<Arc<dyn DocumentConverter>> {
self.converters.get(&ext.to_lowercase()).cloned()
}
pub fn get_by_name(&self, name: &str) -> Option<Arc<dyn DocumentConverter>> {
self.by_name.get(&name.to_lowercase()).cloned()
}
pub fn supports(&self, ext: &str) -> bool {
self.converters.contains_key(&ext.to_lowercase())
}
pub fn supported_extensions(&self) -> Vec<&str> {
self.converters.keys().map(|s| s.as_str()).collect()
}
pub fn convert(&self, path: &Path, options: &ConvertOptions) -> Result<ConvertResult> {
let ext = path
.extension()
.and_then(|e| e.to_str())
.ok_or_else(|| Error::Other("File has no extension".into()))?;
let converter = self
.get_by_extension(ext)
.ok_or_else(|| Error::Other(format!("No converter for extension: {}", ext)))?;
converter.convert(path, options)
}
pub fn convert_bytes(
&self,
bytes: &[u8],
ext: &str,
options: &ConvertOptions,
) -> Result<ConvertResult> {
let converter = self
.get_by_extension(ext)
.ok_or_else(|| Error::Other(format!("No converter for extension: {}", ext)))?;
converter.convert_bytes(bytes, options)
}
}
impl Default for ConverterRegistry {
fn default() -> Self {
Self::with_defaults()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Each builder call should land in its matching public field.
    #[test]
    fn test_convert_options_builder() {
        let built = ConvertOptions::new()
            .with_password("secret")
            .with_stats(true)
            .with_format(OutputFormat::Text);

        assert_eq!(built.password, Some(String::from("secret")));
        assert!(built.collect_stats);
        assert_eq!(built.output_format, OutputFormat::Text);
    }

    /// The default registry knows `pdf` in any case and nothing for `docx`.
    #[test]
    fn test_registry_with_defaults() {
        let registry = ConverterRegistry::with_defaults();
        for (ext, expected) in [("pdf", true), ("PDF", true), ("docx", false)] {
            assert_eq!(registry.supports(ext), expected);
        }
    }

    /// Looking up `pdf` by extension yields the converter named `pdf`.
    #[test]
    fn test_registry_get_by_extension() {
        let lookup = ConverterRegistry::with_defaults().get_by_extension("pdf");
        assert!(lookup.is_some());
        assert_eq!(lookup.unwrap().name(), "pdf");
    }

    /// Looking up `pdf` by name finds a converter.
    #[test]
    fn test_registry_get_by_name() {
        let lookup = ConverterRegistry::with_defaults().get_by_name("pdf");
        assert!(lookup.is_some());
    }
}