// compression_prompt/quality_metrics.rs
use std::collections::HashSet;
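
/// Heuristic quality metrics for judging how much signal a compressed prompt
/// retains relative to the original text.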
#[derive(Debug, Clone)]
pub struct QualityMetrics {
    /// Fraction of original keywords that survive compression (0.0..=1.0).
    pub keyword_retention: f64,

    /// Fraction of original named entities that survive compression (0.0..=1.0).
    pub entity_retention: f64,

    /// Unique words in the compressed text divided by unique words in the original.
    pub vocabulary_ratio: f64,

    /// Unique words per word of compressed text (lexical diversity).
    pub information_density: f64,

    /// Weighted combination of the metrics above.
    pub overall_score: f64,
}

impl QualityMetrics {
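    /// Computes all metrics by comparing `compressed` against `original`.
    ///
    /// The overall score is a weighted blend: 40% keyword retention, 30% entity
    /// retention, 20% vocabulary ratio and 10% information density.
    ///
    /// Illustrative usage (a minimal sketch; the crate/module path below is an
    /// assumption, so the example is marked `ignore`):
    ///
    /// ```ignore
    /// use compression_prompt::quality_metrics::QualityMetrics; // assumed path
    ///
    /// let metrics = QualityMetrics::calculate(
    ///     "Machine Learning is a subset of Artificial Intelligence",
    ///     "Machine Learning subset Artificial Intelligence",
    /// );
    /// println!("{}", metrics.format());
    /// ```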
    pub fn calculate(original: &str, compressed: &str) -> Self {
        let orig_words = Self::tokenize(original);
        let comp_words = Self::tokenize(compressed);

        let orig_keywords = Self::extract_keywords(&orig_words);
        let comp_keywords = Self::extract_keywords(&comp_words);

        let orig_entities = Self::extract_entities(&orig_words);
        let comp_entities = Self::extract_entities(&comp_words);

        let keyword_retention = Self::calculate_retention(&orig_keywords, &comp_keywords);
        let entity_retention = Self::calculate_retention(&orig_entities, &comp_entities);

        let orig_vocab: HashSet<_> = orig_words.iter().map(|s| s.to_lowercase()).collect();
        let comp_vocab: HashSet<_> = comp_words.iter().map(|s| s.to_lowercase()).collect();
        let vocabulary_ratio = comp_vocab.len() as f64 / orig_vocab.len().max(1) as f64;

        let information_density = if comp_words.is_empty() {
            0.0
        } else {
            comp_vocab.len() as f64 / comp_words.len() as f64
        };

        let overall_score = keyword_retention * 0.4
            + entity_retention * 0.3
            + vocabulary_ratio * 0.2
            + information_density * 0.1;

        Self {
            keyword_retention,
            entity_retention,
            vocabulary_ratio,
            information_density,
            overall_score,
        }
    }
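
    /// Splits on whitespace; punctuation stays attached to the tokens.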
    fn tokenize(text: &str) -> Vec<&str> {
        text.split_whitespace().collect()
    }
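
    /// Collects likely keywords: non-stop-words that are longer than five
    /// characters, capitalized, or contain a hyphen/underscore (lowercased).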
    fn extract_keywords(words: &[&str]) -> HashSet<String> {
        const STOP_WORDS: &[&str] = &[
            "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with",
            "by", "from", "as", "is", "was", "are", "were", "be", "been", "being", "have", "has",
            "had", "do", "does", "did", "will", "would", "should", "could", "may", "might", "must",
            "can", "this", "that", "these", "those", "we", "they", "it",
        ];

        words
            .iter()
            .filter_map(|word| {
                let lower = word.to_lowercase();
                if !STOP_WORDS.contains(&lower.as_str())
                    && (word.len() > 5
                        || word.chars().next().is_some_and(|c| c.is_uppercase())
                        || word.contains('-')
                        || word.contains('_'))
                {
                    Some(lower)
                } else {
                    None
                }
            })
            .collect()
    }
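
    /// Extracts rough "entities": email addresses and URLs, all-caps tokens
    /// (acronyms), capitalized words, and adjacent capitalized pairs.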
    fn extract_entities(words: &[&str]) -> HashSet<String> {
        let mut entities = HashSet::new();

        for (i, word) in words.iter().enumerate() {
            // Email addresses and URLs.
            if word.contains('@') || word.starts_with("http") {
                entities.insert(word.to_lowercase());
            }

            // Acronyms: all-caps tokens such as "IBM".
            if word.len() > 1 && word.chars().all(|c| c.is_uppercase() || !c.is_alphabetic()) {
                entities.insert(word.to_string());
            }

            // Capitalized words, plus two-word names such as "John Smith".
            if word.chars().next().is_some_and(|c| c.is_uppercase()) && word.len() > 2 {
                if i + 1 < words.len()
                    && words[i + 1]
                        .chars()
                        .next()
                        .is_some_and(|c| c.is_uppercase())
                {
                    let entity = format!("{} {}", word, words[i + 1]);
                    entities.insert(entity);
                }
                entities.insert(word.to_string());
            }
        }

        entities
    }
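
    /// Fraction of `original` items also present in `compressed`; an empty
    /// original set counts as fully retained.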
    fn calculate_retention(original: &HashSet<String>, compressed: &HashSet<String>) -> f64 {
        if original.is_empty() {
            return 1.0;
        }

        let preserved = original.intersection(compressed).count();
        preserved as f64 / original.len() as f64
    }
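
    /// Renders the metrics as a human-readable, multi-line summary.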
    pub fn format(&self) -> String {
        format!(
            "Quality Metrics:\n\
             - Keyword Retention: {:.1}%\n\
             - Entity Retention: {:.1}%\n\
             - Vocabulary Ratio: {:.1}%\n\
             - Info Density: {:.3}\n\
             - Overall Score: {:.1}%",
            self.keyword_retention * 100.0,
            self.entity_retention * 100.0,
            self.vocabulary_ratio * 100.0,
            self.information_density,
            self.overall_score * 100.0
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_perfect_preservation() {
        let text = "Machine Learning is a subset of Artificial Intelligence";
        let metrics = QualityMetrics::calculate(text, text);

        assert_eq!(metrics.keyword_retention, 1.0);
        assert_eq!(metrics.entity_retention, 1.0);
        assert_eq!(metrics.vocabulary_ratio, 1.0);
    }

    #[test]
    fn test_lossy_compression() {
        let original = "Machine Learning is a powerful subset of Artificial Intelligence";
        let compressed = "Machine Learning subset Artificial Intelligence";
        let metrics = QualityMetrics::calculate(original, compressed);

        assert!(metrics.keyword_retention > 0.7);
        assert!(metrics.entity_retention > 0.7);
        assert!(metrics.overall_score > 0.5);
    }
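
    // Not in the original file: a small additional sketch covering the
    // degenerate case that `calculate` guards against (empty compressed text).
    #[test]
    fn test_empty_compressed() {
        let original = "Machine Learning is a subset of Artificial Intelligence";
        let metrics = QualityMetrics::calculate(original, "");

        assert_eq!(metrics.keyword_retention, 0.0);
        assert_eq!(metrics.vocabulary_ratio, 0.0);
        assert_eq!(metrics.information_density, 0.0);
    }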

    #[test]
    fn test_entity_extraction() {
        let text = "Dr. John Smith works at IBM and uses john@example.com";
        let words: Vec<&str> = text.split_whitespace().collect();
        let entities = QualityMetrics::extract_entities(&words);

        assert!(entities.contains("IBM"));
        assert!(entities.contains("john@example.com"));
    }
}
200}