kreuzberg 4.9.7

High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 91+ formats and 248 programming languages via tree-sitter code intelligence with async/sync APIs.
Documentation
use crate::error::Result;
use crate::text::token_reduction::{
    config::{ReductionLevel, TokenReductionConfig},
    filters::FilterPipeline,
    semantic::SemanticAnalyzer,
    simd_text::{SimdTextProcessor, chunk_text_for_parallel},
};
#[cfg(not(target_arch = "wasm32"))]
use rayon::prelude::*;
use std::sync::Arc;
use unicode_normalization::UnicodeNormalization;

use super::punctuation::PunctuationCleaner;
use super::sentence_selection::SentenceSelector;
use super::word_filtering::WordFilter;

pub struct TokenReducer {
    config: Arc<TokenReductionConfig>,
    text_processor: SimdTextProcessor,
    filter_pipeline: FilterPipeline,
    semantic_analyzer: Option<SemanticAnalyzer>,
    word_filter: WordFilter,
    language: String,
}

impl TokenReducer {
    pub fn new(config: &TokenReductionConfig, language_hint: Option<&str>) -> Result<Self> {
        let config = Arc::new(config.clone());
        let language = language_hint
            .or(config.language_hint.as_deref())
            .unwrap_or("en")
            .to_string();

        let text_processor = SimdTextProcessor::new();
        let filter_pipeline = FilterPipeline::new(&config, &language)?;

        let semantic_analyzer = if matches!(config.level, ReductionLevel::Aggressive | ReductionLevel::Maximum) {
            Some(SemanticAnalyzer::new(&language))
        } else {
            None
        };

        Ok(Self {
            config,
            text_processor,
            filter_pipeline,
            semantic_analyzer,
            word_filter: WordFilter::new(),
            language,
        })
    }

    /// Get the language code being used for stopwords and semantic analysis.
    pub fn language(&self) -> &str {
        &self.language
    }

    pub fn reduce(&self, text: &str) -> String {
        if text.is_empty() || matches!(self.config.level, ReductionLevel::Off) {
            return text.to_string();
        }

        let nfc_string;
        let working_text = if text.is_ascii() {
            text
        } else {
            nfc_string = text.nfc().collect::<String>();
            &nfc_string
        };

        match self.config.level {
            ReductionLevel::Off => working_text.to_string(),
            ReductionLevel::Light => self.apply_light_reduction_optimized(working_text),
            ReductionLevel::Moderate => self.apply_moderate_reduction_optimized(working_text),
            ReductionLevel::Aggressive => self.apply_aggressive_reduction_optimized(working_text),
            ReductionLevel::Maximum => self.apply_maximum_reduction_optimized(working_text),
        }
    }

    pub fn batch_reduce(&self, texts: &[&str]) -> Vec<String> {
        if !self.config.enable_parallel || texts.len() < 2 {
            return texts.iter().map(|text| self.reduce(text)).collect();
        }

        #[cfg(not(target_arch = "wasm32"))]
        {
            texts.par_iter().map(|text| self.reduce(text)).collect()
        }
        #[cfg(target_arch = "wasm32")]
        {
            texts.iter().map(|text| self.reduce(text)).collect()
        }
    }

    fn apply_light_reduction_optimized(&self, text: &str) -> String {
        let mut result = if self.config.use_simd {
            self.text_processor.clean_punctuation(text)
        } else {
            PunctuationCleaner::clean_punctuation_optimized(text)
        };

        result = self.filter_pipeline.apply_light_filters(&result);
        result.trim().to_string()
    }

    fn apply_moderate_reduction_optimized(&self, text: &str) -> String {
        let mut result = self.apply_light_reduction_optimized(text);

        result = if self.config.enable_parallel && text.len() > 1000 {
            self.apply_parallel_moderate_reduction(&result)
        } else {
            self.filter_pipeline.apply_moderate_filters(&result)
        };

        result
    }

    fn apply_aggressive_reduction_optimized(&self, text: &str) -> String {
        let mut result = self.apply_moderate_reduction_optimized(text);

        result = self.word_filter.remove_additional_common_words(&result);
        result = SentenceSelector::apply_sentence_selection(&result);

        if let Some(ref analyzer) = self.semantic_analyzer {
            result = analyzer.apply_semantic_filtering(&result, self.config.semantic_threshold);
        }

        result
    }

    fn apply_maximum_reduction_optimized(&self, text: &str) -> String {
        let mut result = self.apply_aggressive_reduction_optimized(text);

        if let Some(ref analyzer) = self.semantic_analyzer
            && self.config.enable_semantic_clustering
        {
            result = analyzer.apply_hypernym_compression(&result, self.config.target_reduction);
        }

        result
    }

    fn apply_parallel_moderate_reduction(&self, text: &str) -> String {
        #[cfg(not(target_arch = "wasm32"))]
        {
            let num_threads = rayon::current_num_threads();
            let chunks = chunk_text_for_parallel(text, num_threads);
            let processed_chunks: Vec<String> = chunks
                .par_iter()
                .map(|chunk| self.filter_pipeline.apply_moderate_filters(chunk))
                .collect();
            processed_chunks.join(" ")
        }
        #[cfg(target_arch = "wasm32")]
        {
            let chunks = chunk_text_for_parallel(text, 1);
            let processed_chunks: Vec<String> = chunks
                .iter()
                .map(|chunk| self.filter_pipeline.apply_moderate_filters(chunk))
                .collect();
            processed_chunks.join(" ")
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_light_reduction() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Light,
            use_simd: false,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "Hello   world!!!   How are you???";
        let result = reducer.reduce(input);

        assert!(result.len() < input.len());
        assert!(!result.contains("   "));
    }

    #[test]
    fn test_moderate_reduction() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Moderate,
            use_simd: false,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, Some("en")).unwrap();
        let input = "The quick brown fox is jumping over the lazy dog";
        let result = reducer.reduce(input);

        assert!(result.len() < input.len());
        assert!(result.contains("quick"));
        assert!(result.contains("brown"));
        assert!(result.contains("fox"));
    }

    #[test]
    fn test_batch_processing() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Light,
            enable_parallel: false,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let inputs = vec!["Hello  world!", "How   are you?", "Fine,  thanks!"];
        let results = reducer.batch_reduce(&inputs);

        assert_eq!(results.len(), inputs.len());
        for result in &results {
            assert!(!result.contains("  "));
        }
    }

    #[test]
    fn test_aggressive_reduction() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Aggressive,
            use_simd: false,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, Some("en")).unwrap();
        let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
        let result = reducer.reduce(input);

        assert!(result.len() < input.len());
        assert!(!result.is_empty());
    }

    #[test]
    fn test_maximum_reduction() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Maximum,
            use_simd: false,
            enable_semantic_clustering: true,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, Some("en")).unwrap();
        let input = "The quick brown fox is jumping over the lazy dog and running through the forest";
        let result = reducer.reduce(input);

        assert!(result.len() < input.len());
        assert!(!result.is_empty());
    }

    #[test]
    fn test_empty_text_handling() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Moderate,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        assert_eq!(reducer.reduce(""), "");
        let result = reducer.reduce("   ");
        assert!(result == "   " || result.is_empty());
    }

    #[test]
    fn test_off_mode_preserves_text() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Off,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "Text   with    multiple   spaces!!!";
        assert_eq!(reducer.reduce(input), input);
    }

    #[test]
    fn test_parallel_batch_processing() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Light,
            enable_parallel: true,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let inputs = vec![
            "First text  with spaces",
            "Second  text with  spaces",
            "Third   text  with spaces",
        ];
        let results = reducer.batch_reduce(&inputs);

        assert_eq!(results.len(), inputs.len());
        for result in &results {
            assert!(!result.contains("  "));
        }
    }

    #[test]
    fn test_cjk_text_handling() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Moderate,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, Some("zh")).unwrap();
        let input = "这是中文文本测试";
        let result = reducer.reduce(input);

        assert!(!result.is_empty());
    }

    #[test]
    fn test_mixed_language_text() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Moderate,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "This is English text 这是中文 and some more English";
        let result = reducer.reduce(input);

        assert!(!result.is_empty());
        assert!(result.contains("English") || result.contains(""));
    }

    #[test]
    fn test_unicode_normalization_ascii() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Light,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "Pure ASCII text without special characters";
        let result = reducer.reduce(input);

        assert!(result.contains("ASCII"));
    }

    #[test]
    fn test_unicode_normalization_non_ascii() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Light,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "Café naïve résumé";
        let result = reducer.reduce(input);

        assert!(result.contains("Café") || result.contains("Cafe"));
    }

    #[test]
    fn test_single_text_vs_batch() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Moderate,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let text = "The quick brown fox jumps over the lazy dog";

        let single_result = reducer.reduce(text);
        let batch_results = reducer.batch_reduce(&[text]);

        assert_eq!(single_result, batch_results[0]);
    }

    #[test]
    fn test_important_word_preservation() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Aggressive,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "The IMPORTANT word COVID-19 and 12345 numbers should be preserved";
        let result = reducer.reduce(input);

        assert!(result.contains("IMPORTANT") || result.contains("COVID") || result.contains("12345"));
    }

    #[test]
    fn test_technical_terms_preservation() {
        let config = TokenReductionConfig {
            level: ReductionLevel::Aggressive,
            ..Default::default()
        };

        let reducer = TokenReducer::new(&config, None).unwrap();
        let input = "The implementation uses PyTorch and TensorFlow frameworks";
        let result = reducer.reduce(input);

        assert!(result.contains("PyTorch") || result.contains("TensorFlow"));
    }
}