1use serde::{Deserialize, Serialize};
13
14const STOPWORDS: &[&str] = &[
19 "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
20 "from", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does",
21 "did", "will", "would",
22];
23
24const FILLER_PHRASES: &[&str] = &[
26 "basically",
27 "essentially",
28 "in fact",
29 "as a matter of fact",
30 "it is worth noting that",
31 "it should be noted that",
32 "needless to say",
33 "to be honest",
34 "honestly",
35 "actually",
36 "literally",
37 "obviously",
38 "clearly",
39 "simply",
40 "just",
41 "very",
42 "really",
43 "quite",
44 "rather",
45 "somewhat",
46 "kind of",
47 "sort of",
48];
49
50#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
56pub enum CompressionLevel {
57 None,
59 Light,
61 Medium,
64 Heavy,
66}
67
68#[derive(Debug, Clone, Serialize, Deserialize)]
70pub struct CompressedEntry {
71 pub original_id: i64,
73 pub original_tokens: usize,
75 pub compressed_content: String,
77 pub tokens_used: usize,
79 pub compression_level: CompressionLevel,
81}
82
83#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct TokenBudget {
86 pub total: usize,
88 pub used: usize,
90 pub remaining: usize,
92}
93
94#[derive(Debug, Clone)]
96pub struct MemoryInput {
97 pub id: i64,
99 pub content: String,
101 pub importance: f32,
103}
104
105pub struct ContextCompressor {
111 budget_tokens: usize,
112 used_tokens: usize,
113}
114
115impl ContextCompressor {
116 pub fn new(budget_tokens: usize) -> Self {
118 Self {
119 budget_tokens,
120 used_tokens: 0,
121 }
122 }
123
124 pub fn estimate_tokens(text: &str) -> usize {
130 text.len().div_ceil(4)
131 }
132
133 pub fn compress_light(text: &str) -> String {
140 let mut result = text.to_string();
141
142 for phrase in FILLER_PHRASES {
144 let lower = result.to_lowercase();
145 let mut positions: Vec<usize> = lower.match_indices(phrase).map(|(i, _)| i).collect();
148 positions.sort_unstable_by(|a, b| b.cmp(a)); for pos in positions {
150 let before_ok = pos == 0
153 || !result
154 .as_bytes()
155 .get(pos - 1)
156 .copied()
157 .map(|b| b.is_ascii_alphabetic())
158 .unwrap_or(false);
159 let after_pos = pos + phrase.len();
160 let after_ok = after_pos >= result.len()
161 || !result
162 .as_bytes()
163 .get(after_pos)
164 .copied()
165 .map(|b| b.is_ascii_alphabetic())
166 .unwrap_or(false);
167
168 if before_ok && after_ok {
169 result.drain(pos..after_pos);
170 }
171 }
172 }
173
174 let words: Vec<&str> = result.split_whitespace().collect();
176 let filtered: Vec<&str> = words
177 .into_iter()
178 .filter(|w| {
179 let lower = w.to_lowercase();
180 let stripped = lower.trim_matches(|c: char| !c.is_alphabetic());
181 !STOPWORDS.contains(&stripped)
182 })
183 .collect();
184
185 filtered.join(" ")
187 }
188
189 pub fn compress_medium(text: &str) -> String {
193 let paragraphs: Vec<&str> = text.split("\n\n").collect();
194 let mut kept: Vec<String> = Vec::new();
195
196 for paragraph in ¶graphs {
197 let sentences = split_sentences(paragraph);
198 let mut para_kept: Vec<&str> = Vec::new();
199
200 for (idx, sentence) in sentences.iter().enumerate() {
201 let trimmed = sentence.trim();
202 if trimmed.is_empty() {
203 continue;
204 }
205
206 if idx == 0 {
208 para_kept.push(trimmed);
209 continue;
210 }
211
212 if has_entity_word(trimmed) {
215 para_kept.push(trimmed);
216 }
217 }
218
219 if !para_kept.is_empty() {
220 kept.push(para_kept.join(" "));
221 }
222 }
223
224 kept.join("\n\n")
225 }
226
227 pub fn compress_heavy(text: &str) -> String {
230 let mut facts: Vec<String> = Vec::new();
231
232 for line in text.lines() {
233 let trimmed = line.trim();
234 if trimmed.is_empty() {
235 continue;
236 }
237
238 if looks_like_fact_line(trimmed) {
240 facts.push(trimmed.to_string());
241 continue;
242 }
243
244 if trimmed.chars().any(|c| c.is_ascii_digit()) {
246 facts.push(trimmed.to_string());
247 continue;
248 }
249
250 if contains_date(trimmed) {
252 facts.push(trimmed.to_string());
253 }
254 }
255
256 if facts.is_empty() {
257 let first = split_sentences(text).into_iter().next().unwrap_or_default();
259 first.trim().to_string()
260 } else {
261 facts.join("\n")
262 }
263 }
264
265 pub fn compress_single(content: &str, level: CompressionLevel) -> String {
271 match level {
272 CompressionLevel::None => content.to_string(),
273 CompressionLevel::Light => Self::compress_light(content),
274 CompressionLevel::Medium => Self::compress_medium(content),
275 CompressionLevel::Heavy => Self::compress_heavy(content),
276 }
277 }
278
279 pub fn compress_for_context(memories: &[MemoryInput], budget: usize) -> Vec<CompressedEntry> {
291 let mut indexed: Vec<usize> = (0..memories.len()).collect();
293 indexed.sort_unstable_by(|&a, &b| {
294 memories[b]
295 .importance
296 .partial_cmp(&memories[a].importance)
297 .unwrap_or(std::cmp::Ordering::Equal)
298 });
299
300 let mut entries: Vec<CompressedEntry> = Vec::new();
301 let mut used: usize = 0;
302
303 for idx in indexed {
304 let mem = &memories[idx];
305 let original_tokens = Self::estimate_tokens(&mem.content);
306
307 let levels = [
309 CompressionLevel::None,
310 CompressionLevel::Light,
311 CompressionLevel::Medium,
312 CompressionLevel::Heavy,
313 ];
314
315 let mut chosen: Option<(CompressionLevel, String, usize)> = None;
316
317 for &level in &levels {
318 let compressed = Self::compress_single(&mem.content, level);
319 let tokens = Self::estimate_tokens(&compressed);
320
321 if used + tokens <= budget {
322 chosen = Some((level, compressed, tokens));
323 break;
324 }
325 }
326
327 if let Some((level, compressed_content, tokens)) = chosen {
328 used += tokens;
329 entries.push(CompressedEntry {
330 original_id: mem.id,
331 original_tokens,
332 compressed_content,
333 tokens_used: tokens,
334 compression_level: level,
335 });
336 }
337 }
339
340 entries
341 }
342
343 pub fn budget(&self) -> TokenBudget {
349 let used = self.used_tokens;
350 let remaining = self.budget_tokens.saturating_sub(used);
351 TokenBudget {
352 total: self.budget_tokens,
353 used,
354 remaining,
355 }
356 }
357}
358
359fn split_sentences(text: &str) -> Vec<&str> {
365 let mut sentences: Vec<&str> = Vec::new();
366 let mut start = 0;
367
368 let bytes = text.as_bytes();
369 let len = bytes.len();
370
371 let mut i = 0;
372 while i < len {
373 let b = bytes[i];
374 if b == b'.' || b == b'!' || b == b'?' {
375 let end = (i + 1).min(len);
377 let s = text[start..end].trim();
378 if !s.is_empty() {
379 sentences.push(s);
380 }
381 i += 1;
383 while i < len && bytes[i] == b' ' {
384 i += 1;
385 }
386 start = i;
387 } else {
388 i += 1;
389 }
390 }
391
392 let tail = text[start..].trim();
394 if !tail.is_empty() {
395 sentences.push(tail);
396 }
397
398 sentences
399}
400
401fn has_entity_word(sentence: &str) -> bool {
404 sentence
405 .split_whitespace()
406 .skip(1) .any(|w| {
408 w.chars()
409 .next()
410 .map(|c| c.is_ascii_uppercase())
411 .unwrap_or(false)
412 })
413}
414
415fn looks_like_fact_line(line: &str) -> bool {
417 if let Some(colon_pos) = line.find(':') {
418 if colon_pos == 0 {
419 return false;
420 }
421 let key = &line[..colon_pos];
422 let trimmed_key = key.trim();
424 !trimmed_key.is_empty()
425 && trimmed_key.len() <= 40
426 && trimmed_key
427 .chars()
428 .all(|c| c.is_alphabetic() || c == ' ' || c == '_' || c == '-')
429 } else {
430 false
431 }
432}
433
434fn contains_date(text: &str) -> bool {
437 const MONTHS: &[&str] = &[
438 "january",
439 "february",
440 "march",
441 "april",
442 "may",
443 "june",
444 "july",
445 "august",
446 "september",
447 "october",
448 "november",
449 "december",
450 ];
451
452 let lower = text.to_lowercase();
453
454 let bytes = text.as_bytes();
456 for i in 0..bytes.len().saturating_sub(3) {
457 if bytes[i].is_ascii_digit()
458 && bytes[i + 1].is_ascii_digit()
459 && bytes[i + 2].is_ascii_digit()
460 && bytes[i + 3].is_ascii_digit()
461 {
462 let year_str = &text[i..i + 4];
463 if let Ok(year) = year_str.parse::<u32>() {
464 if (1900..=2099).contains(&year) {
465 return true;
466 }
467 }
468 }
469 }
470
471 MONTHS.iter().any(|m| lower.contains(m))
473}
474
475#[cfg(test)]
480mod tests {
481 use super::*;
482
483 #[test]
485 fn test_budget_enforcement() {
486 let memories = vec![
487 MemoryInput {
488 id: 1,
489 content: "A".repeat(400), importance: 0.9,
491 },
492 MemoryInput {
493 id: 2,
494 content: "B".repeat(400),
495 importance: 0.8,
496 },
497 MemoryInput {
498 id: 3,
499 content: "C".repeat(400),
500 importance: 0.7,
501 },
502 ];
503
504 let budget = 120; let entries = ContextCompressor::compress_for_context(&memories, budget);
506
507 let total_used: usize = entries.iter().map(|e| e.tokens_used).sum();
508 assert!(
509 total_used <= budget,
510 "total_used={} exceeded budget={}",
511 total_used,
512 budget
513 );
514 }
515
516 #[test]
518 fn test_adaptive_escalation() {
519 let long_content = "The project launched in January 2024. Alice and Bob led the team. \
522 The revenue grew by 40% year over year. Customer satisfaction reached 95%. \
523 The new platform handles 10 million requests per day."
524 .repeat(5);
525
526 let memories = vec![
527 MemoryInput {
528 id: 1,
529 content: long_content.clone(),
530 importance: 1.0,
531 },
532 MemoryInput {
533 id: 2,
534 content: long_content.clone(),
535 importance: 0.1,
536 },
537 ];
538
539 let one_token_count = ContextCompressor::estimate_tokens(&long_content);
541 let budget = one_token_count + one_token_count / 4; let entries = ContextCompressor::compress_for_context(&memories, budget);
544
545 if let Some(first) = entries.first() {
547 assert_eq!(first.original_id, 1, "highest importance should be first");
548 }
549
550 if entries.len() == 2 {
552 let first_level = entries[0].compression_level as u8;
553 let second_level = entries[1].compression_level as u8;
554 assert!(
555 second_level >= first_level,
556 "less important memory should have equal or heavier compression"
557 );
558 }
559 }
560
561 #[test]
563 fn test_empty_input() {
564 let entries = ContextCompressor::compress_for_context(&[], 1000);
565 assert!(entries.is_empty());
566 }
567
568 #[test]
570 fn test_single_memory_fits_at_none() {
571 let content = "This is a short note.";
572 let memories = vec![MemoryInput {
573 id: 42,
574 content: content.to_string(),
575 importance: 0.5,
576 }];
577
578 let budget = 1000; let entries = ContextCompressor::compress_for_context(&memories, budget);
580
581 assert_eq!(entries.len(), 1);
582 assert_eq!(entries[0].original_id, 42);
583 assert_eq!(entries[0].compression_level, CompressionLevel::None);
584 assert_eq!(entries[0].compressed_content, content);
585 }
586
587 #[test]
589 fn test_all_memories_exceed_budget_returns_partial() {
590 let memories = vec![
591 MemoryInput {
592 id: 1,
593 content: "A".repeat(1000), importance: 0.9,
595 },
596 MemoryInput {
597 id: 2,
598 content: "B".repeat(1000),
599 importance: 0.8,
600 },
601 MemoryInput {
602 id: 3,
603 content: "C".repeat(1000),
604 importance: 0.7,
605 },
606 ];
607
608 let budget = 1; let entries = ContextCompressor::compress_for_context(&memories, budget);
610
611 assert!(
612 entries.is_empty(),
613 "nothing should fit in a budget of 1 token"
614 );
615 }
616
617 #[test]
619 fn test_token_estimation() {
620 assert_eq!(ContextCompressor::estimate_tokens(""), 0);
622
623 assert_eq!(ContextCompressor::estimate_tokens("abcd"), 1);
625
626 assert_eq!(ContextCompressor::estimate_tokens("abcdefgh"), 2);
628
629 let s = "a".repeat(100);
631 assert_eq!(ContextCompressor::estimate_tokens(&s), 25);
632
633 assert_eq!(ContextCompressor::estimate_tokens("abcde"), 2);
635 }
636
637 #[test]
639 fn test_light_compression_removes_filler() {
640 let text = "This is basically a very simple test. It is, honestly, quite straightforward.";
641 let compressed = ContextCompressor::compress_light(text);
642
643 assert!(
645 compressed.len() < text.len(),
646 "light compression should shorten text"
647 );
648 assert!(
650 !compressed.to_lowercase().contains("basically"),
651 "filler 'basically' should be removed"
652 );
653 assert!(
655 !compressed.to_lowercase().contains("honestly"),
656 "filler 'honestly' should be removed"
657 );
658 }
659
660 #[test]
662 fn test_heavy_compression_extracts_facts() {
663 let text = "The meeting was uneventful.\n\
664 Revenue: 1.5 million dollars\n\
665 Founded: January 2020\n\
666 The weather was nice today.\n\
667 Headcount: 42 engineers";
668
669 let compressed = ContextCompressor::compress_heavy(text);
670
671 assert!(
673 compressed.contains("Revenue:") || compressed.contains("1.5"),
674 "should include revenue fact"
675 );
676 assert!(
677 compressed.contains("Headcount:") || compressed.contains("42"),
678 "should include headcount fact"
679 );
680 let lines: Vec<&str> = compressed.lines().collect();
682 assert!(
683 lines.len() <= 4,
684 "heavy compression should produce few lines, got {}",
685 lines.len()
686 );
687 }
688
689 #[test]
691 fn test_compress_single_none_returns_unchanged() {
692 let text = "Hello, world!";
693 let result = ContextCompressor::compress_single(text, CompressionLevel::None);
694 assert_eq!(result, text);
695 }
696
697 #[test]
699 fn test_budget_reflects_configured_total() {
700 let compressor = ContextCompressor::new(8192);
701 let b = compressor.budget();
702 assert_eq!(b.total, 8192);
703 assert_eq!(b.used, 0);
704 assert_eq!(b.remaining, 8192);
705 }
706
707 #[test]
709 fn test_medium_compression_keeps_first_sentence() {
710 let text = "First sentence here. Second sentence with Entity Name. Third unimportant one.";
711 let compressed = ContextCompressor::compress_medium(text);
712
713 assert!(
714 compressed.contains("First sentence here"),
715 "medium compression should keep first sentence"
716 );
717 }
718}