1use std::collections::HashSet;
21
22use once_cell::sync::Lazy;
23use regex::Regex;
24use serde::{Deserialize, Serialize};
25
/// Conversational filler phrases removed during compression.
///
/// Each phrase is matched case-insensitively as a whole word by
/// `strip_filler`, which also consumes any trailing comma/whitespace.
const FILLER_PHRASES: &[&str] = &[
    "i think",
    "basically",
    "you know",
    "kind of",
    "sort of",
    "i mean",
    "like",
    "actually",
    "to be honest",
    "in my opinion",
    "i believe",
    "i guess",
    "i suppose",
    "it seems like",
    "more or less",
    "pretty much",
    "at the end of the day",
    "as a matter of fact",
    "the thing is",
    "to be fair",
    "honestly",
    "literally",
    "obviously",
    "clearly",
    "just",
    "simply",
    "basically speaking",
    "needless to say",
    "as you know",
    "for what it's worth",
];
63
/// Hedging phrases removed during compression alongside `FILLER_PHRASES`.
///
/// Some entries ("sort of", "kind of", "more or less") also appear in
/// `FILLER_PHRASES`; `strip_filler` merges and deduplicates both lists.
const HEDGING_PHRASES: &[&str] = &[
    "maybe",
    "perhaps",
    "sort of",
    "kind of",
    "somewhat",
    "rather",
    "fairly",
    "quite",
    "a bit",
    "a little",
    "in a way",
    "in some ways",
    "to some extent",
    "to a degree",
    "more or less",
];
82
/// Candidate proper nouns: a capitalized word of 3+ letters, optionally
/// chained with further capitalized words (e.g. "San Francisco").
static PROPER_NOUN_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\b([A-Z][a-z]{2,}(?:\s+[A-Z][a-z]{2,})*)\b").expect("valid regex"));
91
/// Numbers and date-like tokens: slash/dash dates (e.g. "2024-01-15"),
/// four-digit years, decimals, and comma-grouped numbers.
static NUMBER_DATE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\b(\d{1,4}[/-]\d{1,2}[/-]\d{1,4}|\d{4}|\d+\.\d+|\d{1,3}(?:,\d{3})*(?:\.\d+)?)\b")
        .expect("valid regex")
});
97
/// Sentence boundary: terminator punctuation followed by whitespace. A
/// trailing terminator with nothing after it is handled by the caller's
/// tail logic in `split_sentences`.
static SENTENCE_SPLIT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[.!?]\s+").expect("valid regex"));
101
/// Common English verbs used to locate the verb of a sentence for
/// subject–verb–object reduction and to qualify sentences as "facts".
static COMMON_VERBS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\b(is|are|was|were|has|have|had|will|can|could|should|would|does|did|do|provides|uses|returns|creates|stores|contains|supports|requires|enables|implements|defines|allows|includes|handles|manages)\b")
        .expect("valid regex")
});
107
/// Tuning knobs for `SemanticCompressor`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressionConfig {
    /// Desired compressed/original token ratio.
    /// NOTE(review): not read anywhere in this file — `compress` produces
    /// whatever ratio the pipeline yields; confirm whether this field
    /// should actually drive compression aggressiveness.
    pub target_ratio: f32,
    /// Inputs shorter than this many characters (after trimming) are
    /// returned verbatim with ratio 1.0 instead of being compressed.
    pub min_content_length: usize,
    /// When true, proper nouns and numbers/dates are extracted into
    /// `CompressedMemory::key_entities`.
    pub preserve_entities: bool,
}
122
123impl Default for CompressionConfig {
124 fn default() -> Self {
125 Self {
126 target_ratio: 0.1,
127 min_content_length: 100,
128 preserve_entities: true,
129 }
130 }
131}
132
/// Result of compressing one piece of text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressedMemory {
    /// Estimated token count of the input (~4 chars/token).
    pub original_tokens: usize,
    /// Estimated token count of `structured_content`.
    pub compressed_tokens: usize,
    /// `compressed_tokens / original_tokens`; 1.0 for empty or
    /// below-minimum-length input.
    pub ratio: f32,
    /// Compressed text: deduplicated sentence cores joined by ". ".
    pub structured_content: String,
    /// Proper nouns and numbers/dates found in the input, first-seen order.
    pub key_entities: Vec<String>,
    /// Sentences judged to carry facts (contain a verb plus an entity or
    /// number).
    pub key_facts: Vec<String>,
}
149
/// Heuristic, lossy text compressor: strips filler/hedging phrases,
/// drops near-duplicate sentences, and reduces the survivors to
/// subject–verb–object cores.
pub struct SemanticCompressor {
    // Behavior knobs; see `CompressionConfig`.
    config: CompressionConfig,
}
158
159impl SemanticCompressor {
160 pub fn new(config: CompressionConfig) -> Self {
162 Self { config }
163 }
164
165 pub fn compress(&self, text: &str) -> CompressedMemory {
170 let original_tokens = estimate_tokens(text);
171
172 if text.trim().is_empty() {
173 return CompressedMemory {
174 original_tokens: 0,
175 compressed_tokens: 0,
176 ratio: 1.0,
177 structured_content: String::new(),
178 key_entities: Vec::new(),
179 key_facts: Vec::new(),
180 };
181 }
182
183 if text.trim().len() < self.config.min_content_length {
184 return CompressedMemory {
185 original_tokens,
186 compressed_tokens: original_tokens,
187 ratio: 1.0,
188 structured_content: text.trim().to_string(),
189 key_entities: Vec::new(),
190 key_facts: Vec::new(),
191 };
192 }
193
194 let sentences = split_sentences(text);
196
197 let cleaned: Vec<String> = sentences
199 .iter()
200 .map(|s| strip_filler(s))
201 .filter(|s| !s.trim().is_empty())
202 .collect();
203
204 let key_entities = if self.config.preserve_entities {
206 extract_entities(&sentences)
207 } else {
208 Vec::new()
209 };
210
211 let deduped = deduplicate_sentences(&cleaned);
213
214 let cores: Vec<String> = deduped.iter().map(|s| extract_svo_core(s)).collect();
216 let structured_content = cores.join(". ");
217
218 let key_facts = extract_key_facts(&deduped, &key_entities);
220
221 let compressed_tokens = estimate_tokens(&structured_content);
222 let ratio = if original_tokens == 0 {
223 1.0
224 } else {
225 compressed_tokens as f32 / original_tokens as f32
226 };
227
228 CompressedMemory {
229 original_tokens,
230 compressed_tokens,
231 ratio,
232 structured_content,
233 key_entities,
234 key_facts,
235 }
236 }
237
238 pub fn decompress(&self, compressed: &CompressedMemory) -> String {
243 if compressed.structured_content.is_empty() {
244 return String::new();
245 }
246
247 if compressed.key_facts.is_empty() {
248 return compressed.structured_content.clone();
249 }
250
251 let entity_context = if !compressed.key_entities.is_empty() {
252 format!(" (entities: {})", compressed.key_entities.join(", "))
253 } else {
254 String::new()
255 };
256
257 let mut parts: Vec<String> = compressed.key_facts.clone();
258 if let Some(last) = parts.last_mut() {
260 last.push_str(&entity_context);
261 }
262 parts.join(". ")
263 }
264
265 pub fn compress_batch(&self, texts: &[&str]) -> Vec<CompressedMemory> {
267 texts.iter().map(|t| self.compress(t)).collect()
268 }
269}
270
/// Rough token estimate: roughly four bytes per token, rounded up, so
/// any non-empty text counts as at least one token.
fn estimate_tokens(text: &str) -> usize {
    // Manual ceiling division: ceil(len / 4).
    (text.len() + 3) / 4
}
279
280fn split_sentences(text: &str) -> Vec<String> {
286 let terminators: Vec<(usize, usize, char)> = SENTENCE_SPLIT_RE
288 .find_iter(text)
289 .map(|m| {
290 let punct = text[m.start()..].chars().next().unwrap_or('.');
292 (m.start(), m.end(), punct)
293 })
294 .collect();
295
296 if terminators.is_empty() {
297 let trimmed = text.trim().to_string();
298 return if trimmed.is_empty() {
299 vec![]
300 } else {
301 vec![trimmed]
302 };
303 }
304
305 let mut sentences: Vec<String> = Vec::new();
306 let mut cursor = 0usize;
307
308 for (t_start, t_end, punct) in &terminators {
309 let fragment = text[cursor..*t_start].trim().to_string();
310 if !fragment.is_empty() {
311 sentences.push(format!("{fragment}{punct}"));
312 }
313 cursor = *t_end;
314 }
315 let tail = text[cursor..].trim().to_string();
317 if !tail.is_empty() {
318 sentences.push(tail);
319 }
320
321 sentences
322}
323
324fn strip_filler(text: &str) -> String {
326 let mut result = text.to_string();
327
328 let mut phrases: Vec<&str> = FILLER_PHRASES
330 .iter()
331 .chain(HEDGING_PHRASES.iter())
332 .copied()
333 .collect();
334 phrases.sort_by_key(|b| std::cmp::Reverse(b.len()));
335 phrases.dedup();
336
337 for phrase in phrases {
338 let escaped = regex::escape(phrase);
340 if let Ok(re) = Regex::new(&format!(r"(?i)\b{escaped}\b[,\s]*")) {
343 result = re.replace_all(&result, " ").to_string();
344 }
345 }
346
347 let collapsed = result.split_whitespace().collect::<Vec<_>>().join(" ");
349 collapsed
350}
351
352fn extract_entities(sentences: &[String]) -> Vec<String> {
358 let sentence_starters: HashSet<String> = sentences
361 .iter()
362 .filter_map(|s| s.split_whitespace().next())
363 .map(|w| w.to_lowercase())
364 .collect();
365
366 let full_text = sentences.join(" ");
367 let mut entities: Vec<String> = Vec::new();
368 let mut seen: HashSet<String> = HashSet::new();
369
370 for cap in PROPER_NOUN_RE.captures_iter(&full_text) {
373 let entity = cap[1].to_string();
374 let entity_lower = entity.to_lowercase();
375 let count = PROPER_NOUN_RE
378 .find_iter(&full_text)
379 .filter(|m| full_text[m.start()..m.end()].to_lowercase() == entity_lower)
380 .count();
381 if (!sentence_starters.contains(&entity_lower) || count > 1) && seen.insert(entity.clone())
382 {
383 entities.push(entity);
384 }
385 }
386
387 for cap in NUMBER_DATE_RE.captures_iter(&full_text) {
389 let token = cap[1].to_string();
390 if seen.insert(token.clone()) {
391 entities.push(token);
392 }
393 }
394
395 entities
396}
397
/// Jaccard similarity of the whitespace-token sets of `a` and `b`,
/// in [0.0, 1.0]. Tokens are compared case-sensitively.
///
/// Two empty strings are defined as identical (1.0).
fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let set_a: HashSet<&str> = a.split_whitespace().collect();
    let set_b: HashSet<&str> = b.split_whitespace().collect();

    // Both empty would make the union 0; define similarity as 1.0.
    if set_a.is_empty() && set_b.is_empty() {
        return 1.0;
    }

    let intersection = set_a.intersection(&set_b).count();
    let union = set_a.union(&set_b).count();

    // `union` is non-zero here (at least one set is non-empty), so the
    // previous `if union == 0` fallback was dead code and is removed.
    intersection as f64 / union as f64
}
416
417fn deduplicate_sentences(sentences: &[String]) -> Vec<String> {
420 let mut kept: Vec<String> = Vec::new();
421
422 'outer: for sentence in sentences {
423 for existing in &kept {
424 if jaccard_similarity(sentence, existing) > 0.6 {
425 continue 'outer;
426 }
427 }
428 kept.push(sentence.clone());
429 }
430
431 kept
432}
433
434fn extract_svo_core(sentence: &str) -> String {
440 let words: Vec<&str> = sentence.split_whitespace().collect();
441 if words.len() <= 6 {
442 return sentence.trim().to_string();
444 }
445
446 if let Some(verb_match) = COMMON_VERBS.find(sentence) {
447 let pre = &sentence[..verb_match.start()].trim();
450 let post = &sentence[verb_match.end()..].trim();
451 let object_words: Vec<&str> = post.split_whitespace().take(5).collect();
452 let object = object_words.join(" ");
453 let verb = verb_match.as_str();
454
455 let parts = [*pre, verb, &object]
456 .iter()
457 .filter(|p| !p.is_empty())
458 .copied()
459 .collect::<Vec<_>>();
460 return parts.join(" ");
461 }
462
463 words[..words.len().min(8)].join(" ")
465}
466
467fn extract_key_facts(sentences: &[String], entities: &[String]) -> Vec<String> {
469 sentences
470 .iter()
471 .filter(|s| {
472 let has_verb = COMMON_VERBS.is_match(s);
473 let s_lower = s.to_lowercase();
474 let has_entity = entities.iter().any(|e| s_lower.contains(&e.to_lowercase()))
475 || NUMBER_DATE_RE.is_match(s);
476 has_verb && has_entity
477 })
478 .cloned()
479 .collect()
480}
481
#[cfg(test)]
mod tests {
    use super::*;

    // Convenience constructor shared by most tests.
    fn default_compressor() -> SemanticCompressor {
        SemanticCompressor::new(CompressionConfig::default())
    }

    // Inputs under `min_content_length` (default 100 chars) must round-trip
    // unchanged with ratio exactly 1.0.
    #[test]
    fn test_short_text_returned_verbatim() {
        let compressor = default_compressor();
        let short = "Hello world.";
        assert!(short.len() < 100);
        let result = compressor.compress(short);
        assert_eq!(result.structured_content, short.trim());
        assert!((result.ratio - 1.0).abs() < f32::EPSILON);
    }

    // strip_filler should shorten filler-heavy text while keeping the
    // content words.
    #[test]
    fn test_filler_removal_reduces_content() {
        let original = "I think basically you know we should sort of consider the proposal. \
            Actually to be honest I believe we need to look at it more carefully. \
            Kind of like the previous plan but maybe with more flexibility and scope.";
        let stripped = strip_filler(original);
        assert!(
            stripped.len() < original.len(),
            "stripped ({}) should be shorter than original ({})",
            stripped.len(),
            original.len()
        );
        // At least one content word survives the stripping.
        assert!(
            stripped.to_lowercase().contains("proposal")
                || stripped.to_lowercase().contains("consider")
        );
    }

    // Proper nouns (names, companies, places) should be detected as entities.
    #[test]
    fn test_entity_extraction_proper_nouns() {
        let sentences = vec![
            "Alice works at Google in San Francisco.".to_string(),
            "Bob joined Microsoft last year.".to_string(),
        ];
        let entities = extract_entities(&sentences);
        assert!(
            !entities.is_empty(),
            "expected entities, got none from: {sentences:?}"
        );
    }

    // Dates and numeric amounts should also be surfaced as entities.
    #[test]
    fn test_number_date_extraction() {
        let sentences = vec![
            "The project started on 2024-01-15 and costs 1500.00 dollars.".to_string(),
            "There were 42 participants in 2023.".to_string(),
        ];
        let entities = extract_entities(&sentences);
        let has_number = entities
            .iter()
            .any(|e| e.chars().any(|c| c.is_ascii_digit()));
        assert!(has_number, "expected numeric entities; got {entities:?}");
    }

    // Exact and near-duplicate sentences collapse; a distinct sentence
    // ("Dogs...") must survive.
    #[test]
    fn test_deduplication_removes_near_duplicates() {
        let sentences = vec![
            "The cat sat on the mat.".to_string(),
            "The cat sat on the mat.".to_string(),
            "The cat is sitting on the mat.".to_string(),
            "Dogs love to play in the park every afternoon.".to_string(),
        ];
        let deduped = deduplicate_sentences(&sentences);
        assert!(
            deduped.len() < sentences.len(),
            "deduped len {} should be < original len {}",
            deduped.len(),
            sentences.len()
        );
        assert!(deduped.iter().any(|s| s.contains("Dogs")));
    }

    // The reported ratio must be in (0, 1] and equal the token quotient
    // exactly (same f32 computation as in `compress`).
    #[test]
    fn test_compression_ratio_computed() {
        let compressor = default_compressor();
        let text = "I think basically we need to understand that the system, \
            you know, is sort of designed to handle large amounts of data. \
            Actually to be honest the architecture was I believe chosen to \
            support scalability. At the end of the day the database stores \
            records and provides search functionality for the application. \
            The API layer handles authentication and rate limiting as well.";
        let result = compressor.compress(text);
        assert!(
            result.ratio > 0.0 && result.ratio <= 1.0,
            "ratio {} should be in (0, 1]",
            result.ratio
        );
        assert_eq!(
            result.ratio,
            result.compressed_tokens as f32 / result.original_tokens as f32
        );
    }

    // A fact-rich input should decompress back into non-empty text.
    #[test]
    fn test_decompress_produces_non_empty_text() {
        let compressor = default_compressor();
        let text = "Alice joined Google in 2022 as a senior engineer. \
            She works on distributed systems and handles large scale data pipelines. \
            The team uses Rust and Go for backend services in the cloud infrastructure.";
        let compressed = compressor.compress(text);
        let decompressed = compressor.decompress(&compressed);
        assert!(
            !decompressed.is_empty(),
            "decompress should produce non-empty text"
        );
    }

    // Batch compression returns one result per input, in order.
    #[test]
    fn test_batch_compression() {
        let compressor = default_compressor();
        let texts = &[
            "Short text.",
            "Alice works at Google as a software engineer and manages infrastructure projects in California.",
            "The system provides search and storage capabilities for large enterprise applications.",
        ];
        let results = compressor.compress_batch(texts);
        assert_eq!(results.len(), texts.len());
    }

    // Empty input produces an all-empty result with zero token counts.
    #[test]
    fn test_empty_input_handled() {
        let compressor = default_compressor();
        let result = compressor.compress("");
        assert_eq!(result.original_tokens, 0);
        assert_eq!(result.compressed_tokens, 0);
        assert!(result.structured_content.is_empty());
        assert!(result.key_entities.is_empty());
    }

    // Whitespace-only input is treated the same as empty input.
    #[test]
    fn test_whitespace_only_input_handled() {
        let compressor = default_compressor();
        let result = compressor.compress("   \n\t  ");
        assert!(result.structured_content.is_empty());
    }

    // A sentence compared with itself has similarity exactly 1.0.
    #[test]
    fn test_jaccard_identical_sentences() {
        let a = "the cat sat on the mat";
        assert!((jaccard_similarity(a, a) - 1.0).abs() < 1e-9);
    }

    // Sentences with no shared tokens have similarity exactly 0.0.
    #[test]
    fn test_jaccard_disjoint_sentences() {
        let a = "apple orange banana";
        let b = "car truck motorcycle";
        assert_eq!(jaccard_similarity(a, b), 0.0);
    }
}
679}