// oxirs_vec/content_processing/mod.rs

//! Advanced content processing for multiple document formats.
//!
//! This module provides comprehensive document parsing and content extraction
//! capabilities for PDF, HTML, XML, office documents, and multimedia content.
//!
//! This module is only available when the `content-processing` feature is enabled.
7
8#[cfg(feature = "content-processing")]
9use anyhow::Result;
10#[cfg(feature = "content-processing")]
11use std::collections::HashMap;
12
// Handler submodules; their public items are re-exported below when the
// `content-processing` feature is enabled.
14mod data_handlers;
15mod multimedia_handlers;
16mod office_handlers;
17mod pdf_handler;
18mod text_handlers;
19mod types;
20
21#[cfg(feature = "content-processing")]
22pub use data_handlers::*;
23#[cfg(feature = "content-processing")]
24pub use multimedia_handlers::*;
25#[cfg(feature = "content-processing")]
26pub use office_handlers::*;
27#[cfg(feature = "content-processing")]
28pub use pdf_handler::*;
29#[cfg(feature = "content-processing")]
30pub use text_handlers::*;
31#[cfg(feature = "content-processing")]
32pub use types::*;
33
/// Advanced content processor.
///
/// Bundles an extraction configuration with a registry of format-specific
/// handlers. The registry is keyed by [`DocumentFormat`], so it holds at most
/// one [`FormatHandler`] per format.
///
/// Only compiled when the `content-processing` feature is enabled.
#[cfg(feature = "content-processing")]
pub struct ContentProcessor {
    /// Extraction configuration.
    // NOTE(review): presumably forwarded to `FormatHandler::extract_content`;
    // confirm against the `impl` block, which is not visible here.
    config: ContentExtractionConfig,
    /// Registered handlers, stored as boxed trait objects keyed by format.
    format_handlers: HashMap<DocumentFormat, Box<dyn FormatHandler>>,
}
40
/// Trait for format-specific content handlers.
///
/// Implementors must be `Send + Sync`. `ContentProcessor` stores handlers as
/// `Box<dyn FormatHandler>`, so every method here has to stay usable through
/// a trait object.
///
/// Only compiled when the `content-processing` feature is enabled.
#[cfg(feature = "content-processing")]
pub trait FormatHandler: Send + Sync {
    /// Extract content from document bytes.
    ///
    /// `data` is the raw document and `config` is the extraction
    /// configuration supplied by the caller.
    ///
    /// # Errors
    ///
    /// Returns an error when extraction fails; the concrete failure
    /// conditions are defined by each implementor.
    fn extract_content(
        &self,
        data: &[u8],
        config: &ContentExtractionConfig,
    ) -> Result<ExtractedContent>;

    /// Check if this handler can process the given data.
    fn can_handle(&self, data: &[u8]) -> bool;

    /// Get supported file extensions.
    // NOTE(review): whether entries include a leading dot is not specified
    // here — confirm against the implementors.
    fn supported_extensions(&self) -> Vec<&'static str>;
}