kreuzberg 4.8.3

High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 91+ formats and 248 programming languages via tree-sitter code intelligence with async/sync APIs.
Documentation
//! Quality processing post-processor.
//!
//! This module provides a PostProcessor plugin that performs quality assessment and
//! text cleaning on extraction results.
//!
//! # Performance
//!
//! This processor optimizes metadata handling by:
//! - Checking if important metadata fields exist before allocating
//! - Converting to HashMap only when beneficial metadata is present
//! - Skipping allocation entirely for documents without metadata
//!
//! This avoids unnecessary string cloning for sparse metadata scenarios.

use crate::plugins::{Plugin, PostProcessor, ProcessingStage};
use crate::{ExtractionConfig, ExtractionResult, Result};
use async_trait::async_trait;
#[cfg(test)]
use std::borrow::Cow;

/// Post-processor that calculates quality score and cleans text.
///
/// This processor:
/// - Runs in the Early processing stage
/// - Calculates quality score when `config.enable_quality_processing` is true
/// - Stores quality score in `metadata.additional["quality_score"]`
/// - Cleans and normalizes extracted text
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::plugins::{Plugin, PostProcessor};
/// use kreuzberg::text::QualityProcessor;
///
/// let processor = QualityProcessor;
/// assert_eq!(processor.name(), "quality-processing");
/// ```
#[derive(Debug, Clone, Copy)]
pub struct QualityProcessor;

impl Plugin for QualityProcessor {
    fn name(&self) -> &str {
        "quality-processing"
    }

    fn version(&self) -> String {
        env!("CARGO_PKG_VERSION").to_string()
    }

    fn initialize(&self) -> Result<()> {
        Ok(())
    }

    fn shutdown(&self) -> Result<()> {
        Ok(())
    }
}

#[cfg_attr(not(target_arch = "wasm32"), async_trait)]
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
impl PostProcessor for QualityProcessor {
    async fn process(&self, result: &mut ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
        let quality_score = if should_use_metadata(&result.metadata) {
            crate::text::quality::calculate_quality_score(&result.content, Some(&result.metadata.additional))
        } else {
            crate::text::quality::calculate_quality_score(&result.content, None)
        };

        result.quality_score = Some(quality_score);

        Ok(())
    }

    fn processing_stage(&self) -> ProcessingStage {
        ProcessingStage::Early
    }

    fn should_process(&self, _result: &ExtractionResult, config: &ExtractionConfig) -> bool {
        config.enable_quality_processing
    }

    fn estimated_duration_ms(&self, result: &ExtractionResult) -> u64 {
        let text_length = result.content.len();
        (text_length / 102400).max(1) as u64
    }
}

/// Check if metadata contains any important fields without allocation.
///
/// # Performance
///
/// O(1) check avoiding HashMap allocation when metadata is sparse.
/// Only allocates HashMap when important metadata fields are present.
fn should_use_metadata(metadata: &crate::types::Metadata) -> bool {
    const IMPORTANT_FIELDS: &[&str] = &["title", "author", "subject", "description", "keywords"];
    IMPORTANT_FIELDS
        .iter()
        .any(|field| metadata.additional.contains_key(*field))
}

#[cfg(test)]
mod tests {
    use super::*;

    #[tokio::test]
    async fn test_quality_processor() {
        let processor = QualityProcessor;
        let config = ExtractionConfig {
            enable_quality_processing: true,
            ..Default::default()
        };

        let mut result = ExtractionResult {
            content: "This is a well-written paragraph with proper structure. It contains multiple sentences. The quality should be good.".to_string(),
            mime_type: Cow::Borrowed("text/plain"),
            ..Default::default()
        };

        processor.process(&mut result, &config).await.unwrap();

        assert!(result.quality_score.is_some());
        let score = result.quality_score.unwrap();
        assert!((0.0..=1.0).contains(&score));
    }

    #[tokio::test]
    async fn test_quality_processor_disabled() {
        let processor = QualityProcessor;
        let config = ExtractionConfig {
            enable_quality_processing: false,
            ..Default::default()
        };

        let mut result = ExtractionResult {
            content: "Some text".to_string(),
            mime_type: Cow::Borrowed("text/plain"),
            ..Default::default()
        };

        processor.process(&mut result, &config).await.unwrap();
    }

    #[test]
    fn test_quality_processor_plugin_interface() {
        let processor = QualityProcessor;
        assert_eq!(processor.name(), "quality-processing");
        assert!(!processor.version().is_empty());
        assert!(processor.initialize().is_ok());
        assert!(processor.shutdown().is_ok());
    }

    #[test]
    fn test_quality_processor_stage() {
        let processor = QualityProcessor;
        assert_eq!(processor.processing_stage(), ProcessingStage::Early);
    }

    #[test]
    fn test_quality_processor_should_process() {
        let processor = QualityProcessor;

        let result = ExtractionResult {
            content: "Sample text".to_string(),
            mime_type: Cow::Borrowed("text/plain"),
            ..Default::default()
        };

        let config_with_quality = ExtractionConfig {
            enable_quality_processing: true,
            ..Default::default()
        };
        assert!(processor.should_process(&result, &config_with_quality));

        let config_without_quality = ExtractionConfig {
            enable_quality_processing: false,
            ..Default::default()
        };
        assert!(!processor.should_process(&result, &config_without_quality));
    }

    #[test]
    fn test_quality_processor_estimated_duration() {
        let processor = QualityProcessor;

        let short_result = ExtractionResult {
            content: "Short".to_string(),
            mime_type: Cow::Borrowed("text/plain"),
            ..Default::default()
        };

        let long_result = ExtractionResult {
            content: "a".repeat(1000000),
            mime_type: Cow::Borrowed("text/plain"),
            ..Default::default()
        };

        let short_duration = processor.estimated_duration_ms(&short_result);
        let long_duration = processor.estimated_duration_ms(&long_result);

        assert!(long_duration > short_duration);
    }
}