compression_prompt/quality_metrics.rs

//! Quality metrics for compression evaluation (model-free)
//!
//! Measures how well compressed text preserves important information

use std::collections::HashSet;

/// Quality assessment for compressed text
#[derive(Debug, Clone)]
pub struct QualityMetrics {
    /// Fraction of important keywords preserved (0.0-1.0)
    pub keyword_retention: f64,

    /// Fraction of named entities preserved (0.0-1.0)
    pub entity_retention: f64,

    /// Vocabulary diversity ratio (compressed/original)
    pub vocabulary_ratio: f64,

    /// Information density (unique_words/total_words)
    pub information_density: f64,

    /// Overall quality score (0.0-1.0)
    pub overall_score: f64,
}

impl QualityMetrics {
    /// Calculate comprehensive quality metrics
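    ///
    /// A minimal usage sketch (marked `ignore` because the crate path
    /// is assumed rather than confirmed):
    ///
    /// ```ignore
    /// let m = QualityMetrics::calculate(
    ///     "Machine Learning is a subset of Artificial Intelligence",
    ///     "Machine Learning subset Artificial Intelligence",
    /// );
    /// assert!(m.keyword_retention > 0.7);
    /// println!("{}", m.format());
    /// ```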
    pub fn calculate(original: &str, compressed: &str) -> Self {
        let orig_words = Self::tokenize(original);
        let comp_words = Self::tokenize(compressed);

        // Extract important elements
        let orig_keywords = Self::extract_keywords(&orig_words);
        let comp_keywords = Self::extract_keywords(&comp_words);

        let orig_entities = Self::extract_entities(&orig_words);
        let comp_entities = Self::extract_entities(&comp_words);

        // Calculate retention rates
        let keyword_retention = Self::calculate_retention(&orig_keywords, &comp_keywords);
        let entity_retention = Self::calculate_retention(&orig_entities, &comp_entities);

        // Vocabulary analysis
        let orig_vocab: HashSet<_> = orig_words.iter().map(|s| s.to_lowercase()).collect();
        let comp_vocab: HashSet<_> = comp_words.iter().map(|s| s.to_lowercase()).collect();
        let vocabulary_ratio = comp_vocab.len() as f64 / orig_vocab.len().max(1) as f64;

        // Information density
        let information_density = if comp_words.is_empty() {
            0.0
        } else {
            comp_vocab.len() as f64 / comp_words.len() as f64
        };

        // Overall score (weighted average). Clamped so the documented
        // 0.0-1.0 range holds even if the vocabulary ratio exceeds 1.0.
        let overall_score = (keyword_retention * 0.4
            + entity_retention * 0.3
            + vocabulary_ratio * 0.2
            + information_density * 0.1)
            .min(1.0);

        Self {
            keyword_retention,
            entity_retention,
            vocabulary_ratio,
            information_density,
            overall_score,
        }
    }

    /// Tokenize text into words
    fn tokenize(text: &str) -> Vec<&str> {
        text.split_whitespace().collect()
    }

    /// Extract important keywords (long words, capitalized, technical terms)
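    ///
    /// For example, "The HTTP-based API is fast" yields "http-based"
    /// (hyphenated) and "api" (capitalized); stop words and short
    /// lowercase words are dropped.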
    fn extract_keywords(words: &[&str]) -> HashSet<String> {
        const STOP_WORDS: &[&str] = &[
            "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with",
            "by", "from", "as", "is", "was", "are", "were", "be", "been", "being", "have", "has",
            "had", "do", "does", "did", "will", "would", "should", "could", "may", "might", "must",
            "can", "this", "that", "these", "those", "we", "they", "it",
        ];

        words
            .iter()
            .filter_map(|word| {
                let lower = word.to_lowercase();
                // Keep if: not a stop word AND (long OR capitalized OR contains special chars)
                if !STOP_WORDS.contains(&lower.as_str())
                    && (word.len() > 5
                        || word.chars().next().is_some_and(|c| c.is_uppercase())
                        || word.contains('-')
                        || word.contains('_'))
                {
                    Some(lower)
                } else {
                    None
                }
            })
            .collect()
    }

    /// Extract named entities (capitalized sequences, emails, URLs, acronyms)
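    ///
    /// Illustratively, "Dr. John Smith works at IBM" yields "Dr.",
    /// "Dr. John", "John", "John Smith", "Smith", and "IBM".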
    fn extract_entities(words: &[&str]) -> HashSet<String> {
        let mut entities = HashSet::new();

        for (i, word) in words.iter().enumerate() {
            // Emails and URLs
            if word.contains('@') || word.starts_with("http") {
                entities.insert(word.to_lowercase());
            }

            // Acronyms (2+ chars containing a letter, all letters uppercase);
            // the letter check keeps digit-only tokens like "2024" out
            if word.len() > 1
                && word.chars().any(|c| c.is_alphabetic())
                && word.chars().all(|c| c.is_uppercase() || !c.is_alphabetic())
            {
                entities.insert(word.to_string());
            }

            // Capitalized words (potential proper nouns)
            if word.chars().next().is_some_and(|c| c.is_uppercase()) && word.len() > 2 {
                // Multi-word entities (e.g., "John Smith")
                if i + 1 < words.len()
                    && words[i + 1]
                        .chars()
                        .next()
                        .is_some_and(|c| c.is_uppercase())
                {
                    let entity = format!("{} {}", word, words[i + 1]);
                    entities.insert(entity);
                }
                entities.insert(word.to_string());
            }
        }

        entities
    }

    /// Calculate retention rate between two sets
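    ///
    /// Defined as |original ∩ compressed| / |original|; an empty
    /// original set counts as fully retained (1.0).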
    fn calculate_retention(original: &HashSet<String>, compressed: &HashSet<String>) -> f64 {
        if original.is_empty() {
            return 1.0;
        }

        let preserved = original.intersection(compressed).count();
        preserved as f64 / original.len() as f64
    }

    /// Format metrics as a human-readable string
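    ///
    /// Output shape (values illustrative):
    ///
    /// ```text
    /// Quality Metrics:
    /// - Keyword Retention: 83.3%
    /// - Entity Retention: 100.0%
    /// - Vocabulary Ratio: 55.6%
    /// - Info Density: 1.000
    /// - Overall Score: 84.4%
    /// ```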
    pub fn format(&self) -> String {
        format!(
            "Quality Metrics:\n\
             - Keyword Retention: {:.1}%\n\
             - Entity Retention: {:.1}%\n\
             - Vocabulary Ratio: {:.1}%\n\
             - Info Density: {:.3}\n\
             - Overall Score: {:.1}%",
            self.keyword_retention * 100.0,
            self.entity_retention * 100.0,
            self.vocabulary_ratio * 100.0,
            self.information_density,
            self.overall_score * 100.0
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_perfect_preservation() {
        let text = "Machine Learning is a subset of Artificial Intelligence";
        let metrics = QualityMetrics::calculate(text, text);

        assert_eq!(metrics.keyword_retention, 1.0);
        assert_eq!(metrics.entity_retention, 1.0);
        assert_eq!(metrics.vocabulary_ratio, 1.0);
    }

    #[test]
    fn test_lossy_compression() {
        let original = "Machine Learning is a powerful subset of Artificial Intelligence";
        let compressed = "Machine Learning subset Artificial Intelligence";
        let metrics = QualityMetrics::calculate(original, compressed);

        // Should retain important keywords
        assert!(metrics.keyword_retention > 0.7);
        assert!(metrics.entity_retention > 0.7);
        assert!(metrics.overall_score > 0.5);
    }

    #[test]
    fn test_entity_extraction() {
        let text = "Dr. John Smith works at IBM and uses john@example.com";
        let words: Vec<&str> = text.split_whitespace().collect();
        let entities = QualityMetrics::extract_entities(&words);

        assert!(entities.contains("IBM"));
        assert!(entities.contains("john@example.com"));
    }
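
    // A sketch of an extra edge case (behavior inferred from the division
    // guards in `calculate`): an empty compressed string should yield
    // zero scores rather than panic.
    #[test]
    fn test_empty_compressed() {
        let metrics = QualityMetrics::calculate("Machine Learning rocks", "");

        assert_eq!(metrics.keyword_retention, 0.0);
        assert_eq!(metrics.entity_retention, 0.0);
        assert_eq!(metrics.information_density, 0.0);
    }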
}