use crate::intelligence::token_counter::TiktokenCounter;
use serde::{Deserialize, Serialize};
use tracing::warn;
/// Words dropped by `ContextCompressor::compress_light` after filler removal.
///
/// Each candidate word is lowercased and has its leading/trailing
/// non-alphabetic characters trimmed before being looked up here, so every
/// entry must be a bare lowercase word.
const STOPWORDS: &[&str] = &[
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
"from", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does",
"did", "will", "would",
];
/// Phrases cut out of the text by `ContextCompressor::compress_light`.
///
/// They are searched for in a lowercased copy of the input, so entries must
/// be lowercase. A match is only removed when it is not directly adjacent to
/// a letter (see `compress_light` for the exact boundary rule). Phrases are
/// processed in the order listed here.
const FILLER_PHRASES: &[&str] = &[
"basically",
"essentially",
"in fact",
"as a matter of fact",
"it is worth noting that",
"it should be noted that",
"needless to say",
"to be honest",
"honestly",
"actually",
"literally",
"obviously",
"clearly",
"simply",
"just",
"very",
"really",
"quite",
"rather",
"somewhat",
"kind of",
"sort of",
];
/// How aggressively a memory's text is reduced before it enters the context.
///
/// Variants are declared from least to most aggressive. The unit tests
/// compare levels through `as u8`, which relies on this declaration order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum CompressionLevel {
/// Content is passed through unchanged.
None,
/// Filler phrases and stopwords are removed (`ContextCompressor::compress_light`).
Light,
/// Only selected sentences per paragraph are kept (`ContextCompressor::compress_medium`).
Medium,
/// Only fact-like lines are kept (`ContextCompressor::compress_heavy`).
Heavy,
}
/// One memory that was admitted into the context, together with its accounting.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressedEntry {
/// `MemoryInput::id` of the memory this entry was produced from.
pub original_id: i64,
/// Token count of the uncompressed content.
pub original_tokens: usize,
/// The text after applying `compression_level`.
pub compressed_content: String,
/// Token count of `compressed_content`; this is what was charged to the budget.
pub tokens_used: usize,
/// The level that was applied to make the memory fit.
pub compression_level: CompressionLevel,
/// Name of the counter that produced the token counts: the configured
/// counter's encoding name, or `"chars/4"` when the built-in estimate was used.
pub tokenizer_id: String,
}
/// Snapshot of a compressor's token accounting, as returned by `ContextCompressor::budget`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenBudget {
/// The configured token limit.
pub total: usize,
/// Tokens consumed by the entries admitted so far.
pub used: usize,
/// `total - used`, saturating at zero.
pub remaining: usize,
}
/// Full result of `ContextCompressor::compress_for_context_with_diagnostics`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressionPlan {
/// Memories that fit, in the order they were admitted.
pub entries: Vec<CompressedEntry>,
/// Ids of memories that did not fit at any compression level.
pub skipped_ids: Vec<i64>,
/// Budget state after the run.
pub budget: TokenBudget,
}
/// A candidate memory handed to the compressor.
#[derive(Debug, Clone)]
pub struct MemoryInput {
/// Caller-supplied identifier, echoed back in the results.
pub id: i64,
/// The text to be (possibly) compressed.
pub content: String,
/// Priority of the memory; memories with higher values are processed first.
pub importance: f32,
}
/// Fits a set of memories into a fixed token budget by compressing them as needed.
pub struct ContextCompressor {
// Upper limit on the total tokens of all admitted entries.
budget_tokens: usize,
// Tokens consumed so far; reset to 0 at the start of every
// `compress_for_context_with_diagnostics` run.
used_tokens: usize,
// Optional tokenizer-backed counter; when `None`, `estimate_tokens` is used instead.
token_counter: Option<TiktokenCounter>,
}
impl ContextCompressor {
/// Creates a compressor limited to `budget_tokens` tokens.
///
/// No tokenizer is attached, so token counts come from
/// [`ContextCompressor::estimate_tokens`].
pub fn new(budget_tokens: usize) -> Self {
    Self {
        token_counter: None,
        used_tokens: 0,
        budget_tokens,
    }
}
/// Creates a compressor limited to `budget_tokens` tokens that measures
/// text with the supplied `counter` instead of the built-in estimate.
pub fn with_token_counter(budget_tokens: usize, counter: TiktokenCounter) -> Self {
    let token_counter = Some(counter);
    Self {
        token_counter,
        used_tokens: 0,
        budget_tokens,
    }
}
/// Returns the label recorded in each entry's `tokenizer_id`: the attached
/// counter's encoding name, or `"chars/4"` when no counter is attached.
fn tokenizer_id(&self) -> String {
    self.token_counter.as_ref().map_or_else(
        || "chars/4".to_string(),
        |counter| counter.encoding_name().to_string(),
    )
}
/// Counts the tokens in `text` with the attached counter, falling back to
/// [`ContextCompressor::estimate_tokens`] when none is attached.
fn count(&self, text: &str) -> usize {
    // Trait import kept local to this function; `count_tokens` is called
    // through it (presumably a trait method of `TokenCounter`).
    use crate::intelligence::context_builder::TokenCounter;

    let Some(counter) = &self.token_counter else {
        return Self::estimate_tokens(text);
    };
    counter.count_tokens(text)
}
/// Estimates the token count of `text` as its UTF-8 byte length divided by
/// four, rounded up. Empty text yields 0.
pub fn estimate_tokens(text: &str) -> usize {
    let byte_len = text.len();
    let (whole, leftover) = (byte_len / 4, byte_len % 4);
    if leftover == 0 {
        whole
    } else {
        whole + 1
    }
}
/// Light compression: removes filler phrases, then drops stopwords.
///
/// 1. Every entry of `FILLER_PHRASES` is matched case-insensitively and
///    removed wherever it is not glued to a letter on either side (so
///    "just" is removed but "adjust" and "justice" are left alone).
/// 2. The remainder is split on whitespace; any word that, lowercased and
///    with surrounding non-alphabetic characters trimmed, appears in
///    `STOPWORDS` is dropped (together with punctuation attached to it).
///
/// The surviving words are joined with single spaces, so all original
/// whitespace, including newlines, is collapsed.
pub fn compress_light(text: &str) -> String {
    let mut result = text.to_string();
    // ASCII-only lowercasing never changes byte lengths, so an offset found
    // in `lower` addresses the same characters in `result`. Unicode
    // lowercasing can grow or shrink characters, which would make the
    // offsets drift and the `drain` below cut at the wrong place or panic.
    let mut lower = text.to_ascii_lowercase();

    for &phrase in FILLER_PHRASES {
        let starts: Vec<usize> = lower.match_indices(phrase).map(|(at, _)| at).collect();
        // Remove from the back so that earlier offsets stay valid.
        for &start in starts.iter().rev() {
            let end = start + phrase.len();
            let glued_before = result[..start]
                .chars()
                .next_back()
                .is_some_and(char::is_alphabetic);
            let glued_after = result[end..]
                .chars()
                .next()
                .is_some_and(char::is_alphabetic);
            if !glued_before && !glued_after {
                // Keep both copies in lockstep for the next phrase.
                result.drain(start..end);
                lower.drain(start..end);
            }
        }
    }

    result
        .split_whitespace()
        .filter(|word| {
            let folded = word.to_lowercase();
            let core = folded.trim_matches(|c: char| !c.is_alphabetic());
            !STOPWORDS.contains(&core)
        })
        .collect::<Vec<_>>()
        .join(" ")
}
/// Medium compression: keeps, for every paragraph, its first sentence plus
/// any later sentence for which `has_entity_word` is true.
///
/// Paragraphs are separated by a blank line (`"\n\n"`). Kept sentences are
/// joined with a single space, kept paragraphs with `"\n\n"`. Paragraphs in
/// which nothing survives are dropped.
pub fn compress_medium(text: &str) -> String {
    let mut kept: Vec<String> = Vec::new();
    for paragraph in text.split("\n\n") {
        let survivors: Vec<&str> = split_sentences(paragraph)
            .into_iter()
            .enumerate()
            .filter_map(|(idx, sentence)| {
                let sentence = sentence.trim();
                let keep = !sentence.is_empty() && (idx == 0 || has_entity_word(sentence));
                keep.then_some(sentence)
            })
            .collect();
        if !survivors.is_empty() {
            kept.push(survivors.join(" "));
        }
    }
    kept.join("\n\n")
}
/// Heavy compression: keeps only the lines that look like facts.
///
/// A non-empty (trimmed) line is kept when it looks like a `key: value`
/// line, contains an ASCII digit, or contains a date. Kept lines are joined
/// with `"\n"`. If no line qualifies, the first sentence of the whole text
/// is returned instead (or an empty string when there is none).
pub fn compress_heavy(text: &str) -> String {
    let facts: Vec<&str> = text
        .lines()
        .map(str::trim)
        .filter(|line| {
            !line.is_empty()
                && (looks_like_fact_line(line)
                    || line.chars().any(|c| c.is_ascii_digit())
                    || contains_date(line))
        })
        .collect();

    if !facts.is_empty() {
        return facts.join("\n");
    }

    // Nothing fact-like: fall back to the opening sentence.
    split_sentences(text)
        .first()
        .map(|sentence| sentence.trim().to_string())
        .unwrap_or_default()
}
/// Applies exactly one compression `level` to `content` and returns the
/// result; `CompressionLevel::None` returns an unchanged copy.
pub fn compress_single(content: &str, level: CompressionLevel) -> String {
    // Pick the strategy first, then run it once.
    let strategy: fn(&str) -> String = match level {
        CompressionLevel::None => str::to_owned,
        CompressionLevel::Light => Self::compress_light,
        CompressionLevel::Medium => Self::compress_medium,
        CompressionLevel::Heavy => Self::compress_heavy,
    };
    strategy(content)
}
pub fn compress_for_context(memories: &[MemoryInput], budget: usize) -> Vec<CompressedEntry> {
let mut compressor = Self::new(budget);
compressor
.compress_for_context_with_diagnostics(memories)
.entries
}
/// Fits `memories` into this compressor's budget and reports what happened.
///
/// Memories are visited from highest to lowest `importance`. Ties keep
/// their input order, and a NaN importance is treated as the lowest
/// possible priority. For each memory the levels None → Light → Medium →
/// Heavy are tried in that order and the first result that still fits in
/// the remaining budget is admitted. A memory that fits at no level is
/// logged, recorded in `skipped_ids`, and processing continues with the
/// next one.
///
/// The usage counter is reset at the start of every call, so the returned
/// budget describes this run only.
pub fn compress_for_context_with_diagnostics(
    &mut self,
    memories: &[MemoryInput],
) -> CompressionPlan {
    const LEVELS: [CompressionLevel; 4] = [
        CompressionLevel::None,
        CompressionLevel::Light,
        CompressionLevel::Medium,
        CompressionLevel::Heavy,
    ];

    // The sort needs a total order. Comparing with `partial_cmp` and mapping
    // NaN to `Equal` is not one, which leaves the result unspecified and can
    // panic on recent toolchains. Map NaN to the bottom and use `total_cmp`.
    let rank = |idx: usize| {
        let importance = memories[idx].importance;
        if importance.is_nan() {
            f32::NEG_INFINITY
        } else {
            importance
        }
    };
    let mut order: Vec<usize> = (0..memories.len()).collect();
    // Stable sort, descending: equally important memories keep input order.
    order.sort_by(|&a, &b| rank(b).total_cmp(&rank(a)));

    self.used_tokens = 0;
    let tokenizer_id = self.tokenizer_id();
    let mut entries: Vec<CompressedEntry> = Vec::new();
    let mut skipped_ids: Vec<i64> = Vec::new();

    for idx in order {
        let mem = &memories[idx];
        let original_tokens = self.count(&mem.content);

        // First (least aggressive) level whose output still fits.
        let fitted = LEVELS.iter().find_map(|&level| {
            let compressed = Self::compress_single(&mem.content, level);
            let tokens = self.count(&compressed);
            (self.used_tokens + tokens <= self.budget_tokens)
                .then_some((level, compressed, tokens))
        });

        match fitted {
            Some((compression_level, compressed_content, tokens_used)) => {
                self.used_tokens += tokens_used;
                entries.push(CompressedEntry {
                    original_id: mem.id,
                    original_tokens,
                    compressed_content,
                    tokens_used,
                    compression_level,
                    tokenizer_id: tokenizer_id.clone(),
                });
            }
            None => {
                warn!(
                    "ContextCompressor skipped memory {} ({} tokens): could not fit even at heavy compression",
                    mem.id, original_tokens
                );
                skipped_ids.push(mem.id);
            }
        }
    }

    CompressionPlan {
        entries,
        skipped_ids,
        budget: self.budget(),
    }
}
/// Returns the current token accounting: configured total, tokens used so
/// far, and what is left (never below zero).
pub fn budget(&self) -> TokenBudget {
    TokenBudget {
        total: self.budget_tokens,
        used: self.used_tokens,
        remaining: self.budget_tokens.saturating_sub(self.used_tokens),
    }
}
}
/// Splits `text` into trimmed, non-empty sentences.
///
/// A sentence ends at `.`, `!` or `?` only when that character is followed
/// by ASCII whitespace or the end of the text. Terminators in the middle of
/// a token ("1.5", "v2.0.1", "example.com") therefore do not split, and a
/// run such as "..." or "?!" stays attached to its sentence. Text after
/// the last terminator is returned as a final sentence. Each returned
/// slice borrows from `text` and includes its terminator.
fn split_sentences(text: &str) -> Vec<&str> {
    let bytes = text.as_bytes();
    let mut sentences: Vec<&str> = Vec::new();
    let mut start = 0;

    for (i, &byte) in bytes.iter().enumerate() {
        if !matches!(byte, b'.' | b'!' | b'?') {
            continue;
        }
        let at_boundary = bytes
            .get(i + 1)
            .map_or(true, |next| next.is_ascii_whitespace());
        if !at_boundary {
            continue;
        }
        // The terminator is a single ASCII byte, so `i + 1` is always a
        // valid char boundary for slicing.
        let sentence = text[start..=i].trim();
        if !sentence.is_empty() {
            sentences.push(sentence);
        }
        start = i + 1;
    }

    let tail = text[start..].trim();
    if !tail.is_empty() {
        sentences.push(tail);
    }
    sentences
}
/// Returns `true` when any word after the first starts with an ASCII
/// uppercase letter.
///
/// The first word is ignored because a sentence normally begins with a
/// capital regardless of whether it names anything.
fn has_entity_word(sentence: &str) -> bool {
    let mut words = sentence.split_whitespace();
    // Discard the sentence-initial word.
    words.next();
    words.any(|word| word.starts_with(|c: char| c.is_ascii_uppercase()))
}
/// Returns `true` when `line` has the shape `key: ...`.
///
/// The key is everything before the first `:`. After trimming it must be
/// non-empty, at most 40 bytes long, and consist only of letters, spaces,
/// `_` and `-`.
fn looks_like_fact_line(line: &str) -> bool {
    let Some((raw_key, _value)) = line.split_once(':') else {
        return false;
    };
    let key = raw_key.trim();
    if key.is_empty() || key.len() > 40 {
        return false;
    }
    key.chars()
        .all(|c| matches!(c, ' ' | '_' | '-') || c.is_alphabetic())
}
/// Heuristically reports whether `text` mentions a date.
///
/// It returns `true` when either
/// * four consecutive ASCII digits form a number in `1900..=2099`, or
/// * a full English month name appears as a whole word (case-insensitive).
///
/// Month names are matched on word boundaries, so "maybe", "dismay" or
/// "marching" do not count as dates. The standalone words "may" and
/// "march" still match even when they are not used as month names.
fn contains_date(text: &str) -> bool {
    const MONTHS: &[&str] = &[
        "january",
        "february",
        "march",
        "april",
        "may",
        "june",
        "july",
        "august",
        "september",
        "october",
        "november",
        "december",
    ];

    // A year in 1900..=2099 is exactly "19dd" or "20dd".
    let has_year = text.as_bytes().windows(4).any(|window| {
        matches!(
            window,
            [b'1', b'9', tens, ones] | [b'2', b'0', tens, ones]
                if tens.is_ascii_digit() && ones.is_ascii_digit()
        )
    });
    if has_year {
        return true;
    }

    // Break the text into runs of letters and compare each whole word.
    text.split(|c: char| !c.is_alphabetic())
        .any(|word| MONTHS.iter().any(|month| word.eq_ignore_ascii_case(month)))
}
#[cfg(test)]
mod tests {
use super::*;
/// The tokens charged for all admitted entries never exceed the budget.
#[test]
fn test_budget_enforcement() {
    // Each memory is 400 bytes, more than the 120-token budget can hold
    // for all three at once.
    let memories: Vec<MemoryInput> = [(1, "A", 0.9), (2, "B", 0.8), (3, "C", 0.7)]
        .iter()
        .map(|&(id, letter, importance)| MemoryInput {
            id,
            content: letter.repeat(400),
            importance,
        })
        .collect();
    let budget = 120;

    let entries = ContextCompressor::compress_for_context(&memories, budget);

    let spent = entries.iter().fold(0usize, |acc, entry| acc + entry.tokens_used);
    assert!(
        spent <= budget,
        "total_used={} exceeded budget={}",
        spent,
        budget
    );
}
/// The most important memory is admitted first, and a less important one
/// is never compressed more lightly than it.
#[test]
fn test_adaptive_escalation() {
    let long_content = "The project launched in January 2024. Alice and Bob led the team. \
        The revenue grew by 40% year over year. Customer satisfaction reached 95%. \
        The new platform handles 10 million requests per day."
        .repeat(5);
    let memories = vec![
        MemoryInput {
            id: 1,
            content: long_content.clone(),
            importance: 1.0,
        },
        MemoryInput {
            id: 2,
            content: long_content.clone(),
            importance: 0.1,
        },
    ];
    // Room for one uncompressed copy plus a quarter of another.
    let one_token_count = ContextCompressor::estimate_tokens(&long_content);
    let budget = one_token_count + one_token_count / 4;

    let entries = ContextCompressor::compress_for_context(&memories, budget);

    // The budget is at least one full copy, so the top-priority memory must
    // be present; asserting this keeps the test from passing vacuously.
    let first = entries
        .first()
        .expect("the most important memory fits uncompressed and must be admitted");
    assert_eq!(first.original_id, 1, "highest importance should be first");

    // Whether the second memory fits depends on how well it compresses.
    if let [first, second] = entries.as_slice() {
        let first_level = first.compression_level as u8;
        let second_level = second.compression_level as u8;
        assert!(
            second_level >= first_level,
            "less important memory should have equal or heavier compression"
        );
    }
}
/// No memories in, no entries out.
#[test]
fn test_empty_input() {
    let no_memories: [MemoryInput; 0] = [];
    let entries = ContextCompressor::compress_for_context(&no_memories, 1000);
    assert!(entries.is_empty());
}
/// A short memory with ample budget is admitted untouched.
#[test]
fn test_single_memory_fits_at_none() {
    let note = "This is a short note.";
    let input = [MemoryInput {
        id: 42,
        content: String::from(note),
        importance: 0.5,
    }];

    let entries = ContextCompressor::compress_for_context(&input, 1000);

    assert_eq!(entries.len(), 1);
    let only = &entries[0];
    assert_eq!(only.original_id, 42);
    assert_eq!(only.compression_level, CompressionLevel::None);
    assert_eq!(only.compressed_content, note);
}
/// With a one-token budget, none of the 1000-byte memories can be admitted.
#[test]
fn test_all_memories_exceed_budget_returns_partial() {
    let memories: Vec<MemoryInput> = [(1, "A", 0.9), (2, "B", 0.8), (3, "C", 0.7)]
        .iter()
        .map(|&(id, letter, importance)| MemoryInput {
            id,
            content: letter.repeat(1000),
            importance,
        })
        .collect();

    let entries = ContextCompressor::compress_for_context(&memories, 1);

    assert!(
        entries.is_empty(),
        "nothing should fit in a budget of 1 token"
    );
}
/// The diagnostics variant lists every skipped id, in priority order, and
/// reports an untouched budget when nothing fits.
#[test]
fn test_compress_for_context_with_diagnostics_reports_skips() {
    let memories: Vec<MemoryInput> = [(1, "A", 0.9), (2, "B", 0.8), (3, "C", 0.7)]
        .iter()
        .map(|&(id, letter, importance)| MemoryInput {
            id,
            content: letter.repeat(1000),
            importance,
        })
        .collect();
    let mut compressor = ContextCompressor::new(1);

    let plan = compressor.compress_for_context_with_diagnostics(&memories);

    assert!(
        plan.entries.is_empty(),
        "nothing should fit in a budget of 1 token"
    );
    assert_eq!(plan.skipped_ids, vec![1, 2, 3]);
    assert_eq!(plan.budget.used, 0);
    assert_eq!(plan.budget.total, 1);
    // The compressor's own accounting agrees with the returned plan.
    assert_eq!(compressor.budget().used, 0);
}
/// The estimate is the byte length divided by four, rounded up.
#[test]
fn test_token_estimation() {
    let hundred = "a".repeat(100);
    let cases: [(&str, usize); 5] = [
        ("", 0),
        ("abcd", 1),
        ("abcdefgh", 2),
        (hundred.as_str(), 25),
        ("abcde", 2),
    ];
    for (input, expected) in cases.iter() {
        assert_eq!(ContextCompressor::estimate_tokens(input), *expected);
    }
}
/// Light compression shortens the text and strips known filler words.
#[test]
fn test_light_compression_removes_filler() {
    let text = "This is basically a very simple test. It is, honestly, quite straightforward.";

    let compressed = ContextCompressor::compress_light(text);
    let lowered = compressed.to_lowercase();

    assert!(
        compressed.len() < text.len(),
        "light compression should shorten text"
    );
    assert!(
        !lowered.contains("basically"),
        "filler 'basically' should be removed"
    );
    assert!(
        !lowered.contains("honestly"),
        "filler 'honestly' should be removed"
    );
}
/// Heavy compression keeps the fact-like lines and drops the narrative ones.
#[test]
fn test_heavy_compression_extracts_facts() {
    let text = [
        "The meeting was uneventful.",
        "Revenue: 1.5 million dollars",
        "Founded: January 2020",
        "The weather was nice today.",
        "Headcount: 42 engineers",
    ]
    .join("\n");

    let compressed = ContextCompressor::compress_heavy(&text);

    assert!(
        compressed.contains("Revenue:") || compressed.contains("1.5"),
        "should include revenue fact"
    );
    assert!(
        compressed.contains("Headcount:") || compressed.contains("42"),
        "should include headcount fact"
    );
    let line_count = compressed.lines().count();
    assert!(
        line_count <= 4,
        "heavy compression should produce few lines, got {}",
        line_count
    );
}
/// `CompressionLevel::None` is the identity transformation.
#[test]
fn test_compress_single_none_returns_unchanged() {
    let original = "Hello, world!";
    let untouched = ContextCompressor::compress_single(original, CompressionLevel::None);
    assert_eq!(untouched, original);
}
/// A fresh compressor reports its full budget as remaining.
#[test]
fn test_budget_reflects_configured_total() {
    let TokenBudget {
        total,
        used,
        remaining,
    } = ContextCompressor::new(8192).budget();
    assert_eq!(total, 8192);
    assert_eq!(used, 0);
    assert_eq!(remaining, 8192);
}
/// Medium compression always retains a paragraph's opening sentence.
#[test]
fn test_medium_compression_keeps_first_sentence() {
    let input = "First sentence here. Second sentence with Entity Name. Third unimportant one.";
    let output = ContextCompressor::compress_medium(input);
    assert!(
        output.contains("First sentence here"),
        "medium compression should keep first sentence"
    );
}
}