use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
use crate::{ExtractionConfig, ExtractionResult, Result};
use async_trait::async_trait;
use std::borrow::Cow;
#[derive(Debug, Clone, Copy)]
pub struct QualityProcessor;
impl Plugin for QualityProcessor {
fn name(&self) -> &str {
"quality-processing"
}
fn version(&self) -> String {
env!("CARGO_PKG_VERSION").to_string()
}
fn initialize(&self) -> Result<()> {
Ok(())
}
fn shutdown(&self) -> Result<()> {
Ok(())
}
}
#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl PostProcessor for QualityProcessor {
async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
let quality_score = if should_use_metadata(&result.metadata) {
crate::text::quality::calculate_quality_score(&result.content, Some(&result.metadata.additional))
} else {
crate::text::quality::calculate_quality_score(&result.content, None)
};
result.quality_score = Some(quality_score);
result.metadata.additional.insert(
Cow::Borrowed("quality_score"),
serde_json::Value::Number(
serde_json::Number::from_f64(quality_score).unwrap_or(serde_json::Number::from(0)),
),
);
Ok(())
}
fn processing_stage(&self) -> ProcessingStage {
ProcessingStage::Early
}
fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
config.enable_quality_processing
}
fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
let text_length = result.content.len();
(text_length / 102400).max(1) as u64
}
}
fn should_use_metadata(metadata: &crate::types::Metadata) -> bool {
const IMPORTANT_FIELDS: &[&str] = &["title", "author", "subject", "description", "keywords"];
IMPORTANT_FIELDS
.iter()
.any(|field| metadata.additional.contains_key(*field))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::types::Metadata;
#[tokio::test]
async fn test_quality_processor() {
let processor = QualityProcessor;
let config = ExtractionConfig {
enable_quality_processing: true,
..Default::default()
};
let mut result = ExtractionResult {
content: "This is a well-written paragraph with proper structure. It contains multiple sentences. The quality should be good.".to_string(),
mime_type: Cow::Borrowed("text/plain"),
metadata: Metadata::default(),
tables: vec![],
detected_languages: None,
chunks: None,
images: None,
pages: None,
elements: None,
ocr_elements: None,
djot_content: None,
document: None,
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
extracted_keywords: None,
quality_score: None,
processing_warnings: Vec::new(),
annotations: None,
children: None,
};
processor.process(&mut result, &config).await.unwrap();
assert!(result.metadata.additional.contains_key("quality_score"));
let score = result.metadata.additional.get("quality_score").unwrap();
assert!(score.is_number());
}
#[tokio::test]
async fn test_quality_processor_disabled() {
let processor = QualityProcessor;
let config = ExtractionConfig {
enable_quality_processing: false,
..Default::default()
};
let mut result = ExtractionResult {
content: "Some text".to_string(),
mime_type: Cow::Borrowed("text/plain"),
metadata: Metadata::default(),
tables: vec![],
detected_languages: None,
chunks: None,
images: None,
djot_content: None,
pages: None,
elements: None,
ocr_elements: None,
document: None,
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
extracted_keywords: None,
quality_score: None,
processing_warnings: Vec::new(),
annotations: None,
children: None,
};
processor.process(&mut result, &config).await.unwrap();
}
#[test]
fn test_quality_processor_plugin_interface() {
let processor = QualityProcessor;
assert_eq!(processor.name(), "quality-processing");
assert!(!processor.version().is_empty());
assert!(processor.initialize().is_ok());
assert!(processor.shutdown().is_ok());
}
#[test]
fn test_quality_processor_stage() {
let processor = QualityProcessor;
assert_eq!(processor.processing_stage(), ProcessingStage::Early);
}
#[test]
fn test_quality_processor_should_process() {
let processor = QualityProcessor;
let result = ExtractionResult {
content: "Sample text".to_string(),
mime_type: Cow::Borrowed("text/plain"),
metadata: Metadata::default(),
tables: vec![],
detected_languages: None,
chunks: None,
images: None,
djot_content: None,
pages: None,
elements: None,
ocr_elements: None,
document: None,
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
extracted_keywords: None,
quality_score: None,
processing_warnings: Vec::new(),
annotations: None,
children: None,
};
let config_with_quality = ExtractionConfig {
enable_quality_processing: true,
..Default::default()
};
assert!(processor.should_process(&result, &config_with_quality));
let config_without_quality = ExtractionConfig {
enable_quality_processing: false,
..Default::default()
};
assert!(!processor.should_process(&result, &config_without_quality));
}
#[test]
fn test_quality_processor_estimated_duration() {
let processor = QualityProcessor;
let short_result = ExtractionResult {
content: "Short".to_string(),
mime_type: Cow::Borrowed("text/plain"),
metadata: Metadata::default(),
tables: vec![],
detected_languages: None,
chunks: None,
images: None,
djot_content: None,
pages: None,
elements: None,
ocr_elements: None,
document: None,
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
extracted_keywords: None,
quality_score: None,
processing_warnings: Vec::new(),
annotations: None,
children: None,
};
let long_result = ExtractionResult {
content: "a".repeat(1000000),
mime_type: Cow::Borrowed("text/plain"),
metadata: Metadata::default(),
tables: vec![],
detected_languages: None,
chunks: None,
images: None,
djot_content: None,
pages: None,
elements: None,
ocr_elements: None,
document: None,
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
extracted_keywords: None,
quality_score: None,
processing_warnings: Vec::new(),
annotations: None,
children: None,
};
let short_duration = processor.estimated_duration_ms(&short_result);
let long_duration = processor.estimated_duration_ms(&long_result);
assert!(long_duration > short_duration);
}
}