use super::r#trait::DocumentExtractor;
use std::sync::Arc;
/// Adds `extractor` to the shared registry returned by
/// `get_document_extractor_registry`.
///
/// The registry's own `register` method decides whether the plugin is accepted;
/// its result is handed back to the caller unchanged.
///
/// # Errors
///
/// Propagates whatever error the registry's `register` call produces. The tests
/// in this module expect `KreuzbergError::Validation` for an empty plugin name
/// and for a name containing spaces.
///
/// # Panics
///
/// Panics if the registry's write lock cannot be acquired.
/// NOTE(review): the lock type is not visible here; for a `std::sync::RwLock`
/// that would mean the lock is poisoned — confirm.
pub fn register_extractor(extractor: Arc<dyn DocumentExtractor>) -> crate::Result<()> {
    let shared = crate::plugins::registry::get_document_extractor_registry();
    // Exclusive access is needed because registration mutates the registry.
    let mut guard = shared
        .write()
        .expect("~keep Failed to acquire write lock on extractor registry");
    guard.register(extractor)
}
/// Removes the extractor registered under `name` from the shared registry
/// returned by `get_document_extractor_registry`.
///
/// The outcome of the registry's `remove` call is returned unchanged. The tests
/// in this module expect `Ok(())` even when no extractor with that name exists.
///
/// # Errors
///
/// Propagates whatever error the registry's `remove` call produces.
///
/// # Panics
///
/// Panics if the registry's write lock cannot be acquired.
/// NOTE(review): the lock type is not visible here; for a `std::sync::RwLock`
/// that would mean the lock is poisoned — confirm.
pub fn unregister_extractor(name: &str) -> crate::Result<()> {
    let shared = crate::plugins::registry::get_document_extractor_registry();
    // Exclusive access is needed because removal mutates the registry.
    let mut guard = shared
        .write()
        .expect("~keep Failed to acquire write lock on extractor registry");
    guard.remove(name)
}
/// Returns the names reported by the shared registry's `list` method.
///
/// Only a read lock is taken, so this does not modify the registry.
///
/// # Errors
///
/// This function itself always wraps the listing in `Ok`; the `Result` return
/// type matches the other registry helpers in this file.
///
/// # Panics
///
/// Panics if the registry's read lock cannot be acquired.
/// NOTE(review): the lock type is not visible here; for a `std::sync::RwLock`
/// that would mean the lock is poisoned — confirm.
pub fn list_extractors() -> crate::Result<Vec<String>> {
    let shared = crate::plugins::registry::get_document_extractor_registry();
    let guard = shared
        .read()
        .expect("~keep Failed to acquire read lock on extractor registry");
    let names = guard.list();
    Ok(names)
}
/// Invokes `shutdown_all` on the shared registry returned by
/// `get_document_extractor_registry` and returns its result.
///
/// The tests in this module expect `list_extractors` to report an empty list
/// once this call has succeeded.
///
/// # Errors
///
/// Propagates whatever error the registry's `shutdown_all` call produces.
///
/// # Panics
///
/// Panics if the registry's write lock cannot be acquired.
/// NOTE(review): the lock type is not visible here; for a `std::sync::RwLock`
/// that would mean the lock is poisoned — confirm.
pub fn clear_extractors() -> crate::Result<()> {
    let shared = crate::plugins::registry::get_document_extractor_registry();
    // Exclusive access is needed because shutting down mutates the registry.
    let mut guard = shared
        .write()
        .expect("~keep Failed to acquire write lock on extractor registry");
    guard.shutdown_all()
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Result;
    use crate::core::config::ExtractionConfig;
    use crate::plugins::Plugin;
    use crate::types::ExtractionResult;
    use async_trait::async_trait;
    use serial_test::serial;

    // Every test is `#[serial]`: they all mutate the one registry behind
    // `get_document_extractor_registry`, so they must not interleave.

    /// Plugin name reported by every [`MockExtractor`].
    const MOCK_NAME: &str = "mock-extractor";

    /// Builds an `ExtractionResult` that carries only `content` and `mime_type`.
    ///
    /// Every optional field is `None` and every collection is empty. Keeping the
    /// literal in one place means a new `ExtractionResult` field only has to be
    /// added here rather than in each mock.
    fn bare_result(content: String, mime_type: String) -> ExtractionResult {
        ExtractionResult {
            content,
            mime_type: mime_type.into(),
            metadata: crate::types::Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
            djot_content: None,
            pages: None,
            elements: None,
            ocr_elements: None,
            document: None,
            #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
            extracted_keywords: None,
            quality_score: None,
            processing_warnings: Vec::new(),
            annotations: None,
        }
    }

    /// Well-formed extractor with configurable MIME types and priority.
    struct MockExtractor {
        mime_types: Vec<&'static str>,
        priority: i32,
    }

    impl Plugin for MockExtractor {
        fn name(&self) -> &str {
            MOCK_NAME
        }

        fn version(&self) -> String {
            "1.0.0".to_string()
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl DocumentExtractor for MockExtractor {
        /// Echoes the input bytes (lossily decoded as UTF-8) and the MIME type.
        async fn extract_bytes(
            &self,
            content: &[u8],
            mime_type: &str,
            _config: &ExtractionConfig,
        ) -> Result<ExtractionResult> {
            Ok(bare_result(
                String::from_utf8_lossy(content).to_string(),
                mime_type.to_string(),
            ))
        }

        fn supported_mime_types(&self) -> &[&str] {
            &self.mime_types
        }

        fn priority(&self) -> i32 {
            self.priority
        }
    }

    /// Creates a shareable [`MockExtractor`] handling a single MIME type.
    fn mock_extractor(mime_type: &'static str, priority: i32) -> Arc<MockExtractor> {
        Arc::new(MockExtractor {
            mime_types: vec![mime_type],
            priority,
        })
    }

    /// Extractor whose only interesting property is the plugin name it reports;
    /// used to exercise name validation during registration.
    struct FixedNameExtractor {
        name: &'static str,
    }

    impl Plugin for FixedNameExtractor {
        fn name(&self) -> &str {
            self.name
        }

        fn version(&self) -> String {
            "1.0.0".to_string()
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl DocumentExtractor for FixedNameExtractor {
        async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> {
            Ok(bare_result(String::new(), String::new()))
        }

        fn supported_mime_types(&self) -> &[&str] {
            &["text/plain"]
        }
    }

    /// Asserts that registering an extractor called `name` fails with
    /// `KreuzbergError::Validation`.
    fn assert_name_rejected(name: &'static str) {
        let outcome = register_extractor(Arc::new(FixedNameExtractor { name }));
        assert!(matches!(outcome, Err(crate::KreuzbergError::Validation { .. })));
    }

    #[test]
    #[serial]
    fn test_register_extractor() {
        let result = register_extractor(mock_extractor("text/test-register", 50));
        assert!(result.is_ok());

        // Best-effort cleanup so later tests do not see this extractor.
        let _ = unregister_extractor(MOCK_NAME);
    }

    #[test]
    #[serial]
    fn test_unregister_extractor() {
        register_extractor(mock_extractor("text/test-unregister", 50)).unwrap();

        let result = unregister_extractor(MOCK_NAME);
        assert!(result.is_ok());
    }

    #[test]
    #[serial]
    fn test_unregister_nonexistent_extractor() {
        let result = unregister_extractor("nonexistent-extractor-xyz");
        assert!(result.is_ok());
    }

    #[test]
    #[serial]
    fn test_list_extractors() {
        clear_extractors().unwrap();

        let list_before = list_extractors().unwrap();
        assert_eq!(list_before.len(), 0);

        // Both mocks report the same plugin name, so the listing is expected to
        // contain a single entry.
        register_extractor(mock_extractor("text/test-list-1", 50)).unwrap();
        register_extractor(mock_extractor("text/test-list-2", 51)).unwrap();

        let list = list_extractors().unwrap();
        assert_eq!(list.len(), 1);
        assert!(list.iter().any(|entry| entry.as_str() == MOCK_NAME));

        unregister_extractor(MOCK_NAME).unwrap();
    }

    #[test]
    #[serial]
    fn test_clear_extractors() {
        clear_extractors().unwrap();

        register_extractor(mock_extractor("text/test-clear-1", 50)).unwrap();
        register_extractor(mock_extractor("text/test-clear-2", 51)).unwrap();

        let result = clear_extractors();
        assert!(result.is_ok());

        let list = list_extractors().unwrap();
        assert_eq!(list.len(), 0);
    }

    #[test]
    #[serial]
    fn test_register_extractor_with_invalid_name() {
        assert_name_rejected("invalid name with spaces");
    }

    #[test]
    #[serial]
    fn test_register_extractor_with_empty_name() {
        assert_name_rejected("");
    }
}