// engram/intelligence/compression_semantic.rs

//! Semantic Structured Compression — RML-1208
//!
//! Compresses verbose memory content into structured summaries targeting ~30x
//! token reduction using rule-based NLP techniques. Pure computation — no
//! database access, no network I/O.
//!
//! ## Pipeline
//! 1. Split text into sentences
//! 2. Strip filler and hedging phrases
//! 3. Extract proper nouns and number/date entities
//! 4. Derive subject-verb-object cores
//! 5. Deduplicate near-identical sentences (Jaccard > 0.6)
//! 6. Reassemble structured_content and key_facts
//!
//! ## Invariants
//! - Never panics on any input (including empty strings)
//! - Token estimation uses `text.len().div_ceil(4)` (≈ len / 4, rounded up)
//! - Short content below `min_content_length` is returned verbatim
use std::collections::{HashMap, HashSet};

use once_cell::sync::Lazy;
use regex::Regex;
use serde::{Deserialize, Serialize};

// =============================================================================
// Constants — filler / hedging phrase lists
// =============================================================================

/// Filler phrases that add no information and should be stripped.
///
/// Matched case-insensitively as whole words by `strip_filler`; that function
/// sorts the combined phrase list longest-first so multi-word phrases (e.g.
/// "basically speaking") are removed before their sub-phrases ("basically").
/// NOTE(review): bare "like" and "just" are aggressive — they also occur as
/// content words ("looks like X", "just-in-time") — confirm this is intended.
const FILLER_PHRASES: &[&str] = &[
    "i think",
    "basically",
    "you know",
    "kind of",
    "sort of",
    "i mean",
    "like",
    "actually",
    "to be honest",
    "in my opinion",
    "i believe",
    "i guess",
    "i suppose",
    "it seems like",
    "more or less",
    "pretty much",
    "at the end of the day",
    "as a matter of fact",
    "the thing is",
    "to be fair",
    "honestly",
    "literally",
    "obviously",
    "clearly",
    "just",
    "simply",
    "basically speaking",
    "needless to say",
    "as you know",
    "for what it's worth",
];

/// Hedging phrases — uncertainty markers that inflate token count.
///
/// "sort of", "kind of" and "more or less" also appear in `FILLER_PHRASES`;
/// `strip_filler` is responsible for deduplicating the combined list.
const HEDGING_PHRASES: &[&str] = &[
    "maybe",
    "perhaps",
    "sort of",
    "kind of",
    "somewhat",
    "rather",
    "fairly",
    "quite",
    "a bit",
    "a little",
    "in a way",
    "in some ways",
    "to some extent",
    "to a degree",
    "more or less",
];

// =============================================================================
// Regex patterns (compiled once via Lazy)
// =============================================================================

/// Matches capitalized words that could be proper nouns.
/// We use a simple pattern and filter sentence-start words in post-processing.
/// NOTE: `[A-Z][a-z]{2,}` requires 3+ letters per word, so two-letter names
/// ("Al", "Bo") are never captured.
static PROPER_NOUN_RE: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\b([A-Z][a-z]{2,}(?:\s+[A-Z][a-z]{2,})*)\b").expect("valid regex"));

/// Matches numbers (integers, decimals) and common date-like patterns.
/// Alternatives are ordered most-specific first: date-like (1-4/1-2/1-4 digit
/// groups), 4-digit years, decimals, then comma-grouped integers.
static NUMBER_DATE_RE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\b(\d{1,4}[/-]\d{1,2}[/-]\d{1,4}|\d{4}|\d+\.\d+|\d{1,3}(?:,\d{3})*(?:\.\d+)?)\b")
        .expect("valid regex")
});

/// Matches sentence-terminating punctuation followed by whitespace.
/// Used to split sentences without requiring look-behind.
static SENTENCE_SPLIT_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"[.!?]\s+").expect("valid regex"));

/// Verb word list used to identify the predicate in an SVO triple.
/// A mix of auxiliaries and common technical verbs; pattern is lowercase-only,
/// so it matches mid-sentence predicates but not sentence-initial "Is"/"Does".
static COMMON_VERBS: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\b(is|are|was|were|has|have|had|will|can|could|should|would|does|did|do|provides|uses|returns|creates|stores|contains|supports|requires|enables|implements|defines|allows|includes|handles|manages)\b")
        .expect("valid regex")
});

// =============================================================================
// Public types
// =============================================================================

/// Configuration for the semantic compressor.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressionConfig {
    /// Target compression ratio (0.0–1.0). Default: 0.1 (keep 10% of tokens).
    /// NOTE(review): not currently read anywhere in `compress` — the pipeline
    /// is purely rule-driven. Kept for API compatibility; confirm intent.
    pub target_ratio: f32,
    /// Minimum content length (chars) to attempt compression. Default: 100.
    /// Content shorter than this is returned verbatim with ratio 1.0.
    pub min_content_length: usize,
    /// Preserve proper nouns and numbers in `key_entities`. Default: true.
    /// When false, `key_entities` is empty and `key_facts` only match on
    /// the number/date regex.
    pub preserve_entities: bool,
}

123impl Default for CompressionConfig {
124    fn default() -> Self {
125        Self {
126            target_ratio: 0.1,
127            min_content_length: 100,
128            preserve_entities: true,
129        }
130    }
131}
132
/// The result of compressing a single piece of text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressedMemory {
    /// Estimated original token count (`original_text.len().div_ceil(4)`).
    pub original_tokens: usize,
    /// Estimated compressed token count (`structured_content.len().div_ceil(4)`).
    pub compressed_tokens: usize,
    /// Actual ratio: `compressed_tokens as f32 / original_tokens as f32`.
    /// Equals 1.0 for empty or below-minimum-length input.
    pub ratio: f32,
    /// Stripped, deduplicated sentence cores joined by ". ".
    pub structured_content: String,
    /// Proper nouns and numbers/dates extracted from the text.
    pub key_entities: Vec<String>,
    /// Sentences that contain at least one entity and one verb.
    pub key_facts: Vec<String>,
}

// =============================================================================
// SemanticCompressor
// =============================================================================

/// Rule-based semantic compressor — no ML required.
/// All methods take `&self` and perform pure computation on their inputs.
pub struct SemanticCompressor {
    // Configuration captured at construction; never mutated afterwards.
    config: CompressionConfig,
}

159impl SemanticCompressor {
160    /// Create a new compressor with the given configuration.
161    pub fn new(config: CompressionConfig) -> Self {
162        Self { config }
163    }
164
165    /// Compress a single text string.
166    ///
167    /// If the text is shorter than `config.min_content_length`, returns the
168    /// text verbatim with a ratio of `1.0`.
169    pub fn compress(&self, text: &str) -> CompressedMemory {
170        let original_tokens = estimate_tokens(text);
171
172        if text.trim().is_empty() {
173            return CompressedMemory {
174                original_tokens: 0,
175                compressed_tokens: 0,
176                ratio: 1.0,
177                structured_content: String::new(),
178                key_entities: Vec::new(),
179                key_facts: Vec::new(),
180            };
181        }
182
183        if text.trim().len() < self.config.min_content_length {
184            return CompressedMemory {
185                original_tokens,
186                compressed_tokens: original_tokens,
187                ratio: 1.0,
188                structured_content: text.trim().to_string(),
189                key_entities: Vec::new(),
190                key_facts: Vec::new(),
191            };
192        }
193
194        // Step 1 — Split into sentences
195        let sentences = split_sentences(text);
196
197        // Step 2 — Strip filler and hedging phrases
198        let cleaned: Vec<String> = sentences
199            .iter()
200            .map(|s| strip_filler(s))
201            .filter(|s| !s.trim().is_empty())
202            .collect();
203
204        // Step 3 & 4 — Extract entities + SVO cores
205        let key_entities = if self.config.preserve_entities {
206            extract_entities(&sentences)
207        } else {
208            Vec::new()
209        };
210
211        // Step 5 — Deduplicate similar sentences (Jaccard > 0.6)
212        let deduped = deduplicate_sentences(&cleaned);
213
214        // Step 6 — Build structured content from SVO cores
215        let cores: Vec<String> = deduped.iter().map(|s| extract_svo_core(s)).collect();
216        let structured_content = cores.join(". ");
217
218        // Step 7 — Extract key facts (sentence with entity + verb)
219        let key_facts = extract_key_facts(&deduped, &key_entities);
220
221        let compressed_tokens = estimate_tokens(&structured_content);
222        let ratio = if original_tokens == 0 {
223            1.0
224        } else {
225            compressed_tokens as f32 / original_tokens as f32
226        };
227
228        CompressedMemory {
229            original_tokens,
230            compressed_tokens,
231            ratio,
232            structured_content,
233            key_entities,
234            key_facts,
235        }
236    }
237
238    /// Reconstruct an approximate text from a `CompressedMemory`.
239    ///
240    /// Expands `key_facts` by appending a parenthetical list of related
241    /// entities. If there are no key facts falls back to `structured_content`.
242    pub fn decompress(&self, compressed: &CompressedMemory) -> String {
243        if compressed.structured_content.is_empty() {
244            return String::new();
245        }
246
247        if compressed.key_facts.is_empty() {
248            return compressed.structured_content.clone();
249        }
250
251        let entity_context = if !compressed.key_entities.is_empty() {
252            format!(" (entities: {})", compressed.key_entities.join(", "))
253        } else {
254            String::new()
255        };
256
257        let mut parts: Vec<String> = compressed.key_facts.clone();
258        // Append entity context to the last fact
259        if let Some(last) = parts.last_mut() {
260            last.push_str(&entity_context);
261        }
262        parts.join(". ")
263    }
264
265    /// Compress a batch of texts.
266    pub fn compress_batch(&self, texts: &[&str]) -> Vec<CompressedMemory> {
267        texts.iter().map(|t| self.compress(t)).collect()
268    }
269}
270
// =============================================================================
// Internal helpers
// =============================================================================

/// Estimate token count from byte length: roughly one token per 4 bytes,
/// rounded up — so any non-empty text estimates to at least one token.
fn estimate_tokens(text: &str) -> usize {
    let bytes = text.len();
    if bytes == 0 {
        0
    } else {
        1 + (bytes - 1) / 4
    }
}

280/// Split text into sentences on `.`, `!`, `?` boundaries.
281///
282/// Because the Rust `regex` crate does not support look-behind we match the
283/// terminator + whitespace, then manually re-attach the terminator to the
284/// preceding fragment.
285fn split_sentences(text: &str) -> Vec<String> {
286    // Find all match ranges [start, end) of "<punct><whitespace>" sequences
287    let terminators: Vec<(usize, usize, char)> = SENTENCE_SPLIT_RE
288        .find_iter(text)
289        .map(|m| {
290            // The punctuation character is the first byte of the match
291            let punct = text[m.start()..].chars().next().unwrap_or('.');
292            (m.start(), m.end(), punct)
293        })
294        .collect();
295
296    if terminators.is_empty() {
297        let trimmed = text.trim().to_string();
298        return if trimmed.is_empty() {
299            vec![]
300        } else {
301            vec![trimmed]
302        };
303    }
304
305    let mut sentences: Vec<String> = Vec::new();
306    let mut cursor = 0usize;
307
308    for (t_start, t_end, punct) in &terminators {
309        let fragment = text[cursor..*t_start].trim().to_string();
310        if !fragment.is_empty() {
311            sentences.push(format!("{fragment}{punct}"));
312        }
313        cursor = *t_end;
314    }
315    // Remainder after last terminator
316    let tail = text[cursor..].trim().to_string();
317    if !tail.is_empty() {
318        sentences.push(tail);
319    }
320
321    sentences
322}
323
324/// Remove filler and hedging phrases (case-insensitive, whole-word).
325fn strip_filler(text: &str) -> String {
326    let mut result = text.to_string();
327
328    // Sort by descending length so multi-word phrases match before sub-phrases
329    let mut phrases: Vec<&str> = FILLER_PHRASES
330        .iter()
331        .chain(HEDGING_PHRASES.iter())
332        .copied()
333        .collect();
334    phrases.sort_by_key(|b| std::cmp::Reverse(b.len()));
335    phrases.dedup();
336
337    for phrase in phrases {
338        // Build a case-insensitive whole-word regex for this phrase
339        let escaped = regex::escape(phrase);
340        // Match phrase at word boundary (or start/end of string), possibly
341        // followed by a comma or space, and remove it.
342        if let Ok(re) = Regex::new(&format!(r"(?i)\b{escaped}\b[,\s]*")) {
343            result = re.replace_all(&result, " ").to_string();
344        }
345    }
346
347    // Collapse multiple spaces and trim
348    let collapsed = result.split_whitespace().collect::<Vec<_>>().join(" ");
349    collapsed
350}
351
352/// Extract proper nouns and number/date entities from the original sentences.
353///
354/// To approximate "not at sentence start" without look-behind we build a set of
355/// the first word of every sentence and exclude those from the proper-noun list
356/// unless they appear again inside a sentence.
357fn extract_entities(sentences: &[String]) -> Vec<String> {
358    // Collect the first (lowercased) word of every sentence so we can skip them
359    // when they appear at position 0 of a sentence.
360    let sentence_starters: HashSet<String> = sentences
361        .iter()
362        .filter_map(|s| s.split_whitespace().next())
363        .map(|w| w.to_lowercase())
364        .collect();
365
366    let full_text = sentences.join(" ");
367    let mut entities: Vec<String> = Vec::new();
368    let mut seen: HashSet<String> = HashSet::new();
369
370    // Proper nouns — skip tokens whose lowercase form is a sentence starter
371    // unless they appear mid-sentence at least once.
372    for cap in PROPER_NOUN_RE.captures_iter(&full_text) {
373        let entity = cap[1].to_string();
374        let entity_lower = entity.to_lowercase();
375        // Accept if it is NOT a plain sentence starter, OR if it appears
376        // more than once (mid-sentence occurrences will make count > 1).
377        let count = PROPER_NOUN_RE
378            .find_iter(&full_text)
379            .filter(|m| full_text[m.start()..m.end()].to_lowercase() == entity_lower)
380            .count();
381        if (!sentence_starters.contains(&entity_lower) || count > 1) && seen.insert(entity.clone())
382        {
383            entities.push(entity);
384        }
385    }
386
387    // Numbers and dates
388    for cap in NUMBER_DATE_RE.captures_iter(&full_text) {
389        let token = cap[1].to_string();
390        if seen.insert(token.clone()) {
391            entities.push(token);
392        }
393    }
394
395    entities
396}
397
/// Compute Jaccard similarity between two sentences (token sets).
///
/// Returns `1.0` when both token sets are empty (two blank strings count as
/// identical); otherwise `|A ∩ B| / |A ∪ B|`.
fn jaccard_similarity(a: &str, b: &str) -> f64 {
    let set_a: HashSet<&str> = a.split_whitespace().collect();
    let set_b: HashSet<&str> = b.split_whitespace().collect();

    // Both empty: the union would be 0; define similarity as 1.0.
    if set_a.is_empty() && set_b.is_empty() {
        return 1.0;
    }

    // At least one set is non-empty here, so the union count is never zero
    // (the previous `union == 0` re-check was unreachable).
    let intersection = set_a.intersection(&set_b).count() as f64;
    let union = set_a.union(&set_b).count() as f64;
    intersection / union
}

417/// Deduplicate sentences where Jaccard similarity > 0.6.
418/// Keeps the first of each near-duplicate group.
419fn deduplicate_sentences(sentences: &[String]) -> Vec<String> {
420    let mut kept: Vec<String> = Vec::new();
421
422    'outer: for sentence in sentences {
423        for existing in &kept {
424            if jaccard_similarity(sentence, existing) > 0.6 {
425                continue 'outer;
426            }
427        }
428        kept.push(sentence.clone());
429    }
430
431    kept
432}
433
434/// Extract a simplified SVO core from a sentence.
435///
436/// Finds the first verb match and returns the text up to and including a
437/// short object span. Falls back to returning the full trimmed sentence if
438/// no verb is found.
439fn extract_svo_core(sentence: &str) -> String {
440    let words: Vec<&str> = sentence.split_whitespace().collect();
441    if words.len() <= 6 {
442        // Already short enough — return as-is
443        return sentence.trim().to_string();
444    }
445
446    if let Some(verb_match) = COMMON_VERBS.find(sentence) {
447        // Take: everything before the verb (subject), the verb, and up to
448        // 5 words after the verb (object span)
449        let pre = &sentence[..verb_match.start()].trim();
450        let post = &sentence[verb_match.end()..].trim();
451        let object_words: Vec<&str> = post.split_whitespace().take(5).collect();
452        let object = object_words.join(" ");
453        let verb = verb_match.as_str();
454
455        let parts = [*pre, verb, &object]
456            .iter()
457            .filter(|p| !p.is_empty())
458            .copied()
459            .collect::<Vec<_>>();
460        return parts.join(" ");
461    }
462
463    // No verb found — truncate to first 8 words
464    words[..words.len().min(8)].join(" ")
465}
466
467/// Extract key facts: sentences that contain at least one entity and one verb.
468fn extract_key_facts(sentences: &[String], entities: &[String]) -> Vec<String> {
469    sentences
470        .iter()
471        .filter(|s| {
472            let has_verb = COMMON_VERBS.is_match(s);
473            let s_lower = s.to_lowercase();
474            let has_entity = entities.iter().any(|e| s_lower.contains(&e.to_lowercase()))
475                || NUMBER_DATE_RE.is_match(s);
476            has_verb && has_entity
477        })
478        .cloned()
479        .collect()
480}
481
// =============================================================================
// Tests
// =============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    /// Convenience: a compressor with the default configuration
    /// (ratio 0.1, min length 100, entities preserved).
    fn default_compressor() -> SemanticCompressor {
        SemanticCompressor::new(CompressionConfig::default())
    }

    // -------------------------------------------------------------------------
    // Test 1: Short text below min_content_length is returned verbatim
    // -------------------------------------------------------------------------
    #[test]
    fn test_short_text_returned_verbatim() {
        let compressor = default_compressor();
        let short = "Hello world.";
        assert!(short.len() < 100);
        let result = compressor.compress(short);
        assert_eq!(result.structured_content, short.trim());
        assert!((result.ratio - 1.0).abs() < f32::EPSILON);
    }

    // -------------------------------------------------------------------------
    // Test 2: Filler removal reduces content
    // -------------------------------------------------------------------------
    #[test]
    fn test_filler_removal_reduces_content() {
        let original = "I think basically you know we should sort of consider the proposal. \
                        Actually to be honest I believe we need to look at it more carefully. \
                        Kind of like the previous plan but maybe with more flexibility and scope.";
        let stripped = strip_filler(original);
        assert!(
            stripped.len() < original.len(),
            "stripped ({}) should be shorter than original ({})",
            stripped.len(),
            original.len()
        );
        // Key content words should still be present
        assert!(
            stripped.to_lowercase().contains("proposal")
                || stripped.to_lowercase().contains("consider")
        );
    }

    // -------------------------------------------------------------------------
    // Test 3: Entity extraction finds proper nouns
    // -------------------------------------------------------------------------
    #[test]
    fn test_entity_extraction_proper_nouns() {
        let sentences = vec![
            "Alice works at Google in San Francisco.".to_string(),
            "Bob joined Microsoft last year.".to_string(),
        ];
        let entities = extract_entities(&sentences);
        // Should find some of: Alice, Google, San, Francisco, Bob, Microsoft
        // (sentence-starting "Alice"/"Bob" may be filtered — see extract_entities)
        assert!(
            !entities.is_empty(),
            "expected entities, got none from: {sentences:?}"
        );
    }

    // -------------------------------------------------------------------------
    // Test 4: Number / date extraction
    // -------------------------------------------------------------------------
    #[test]
    fn test_number_date_extraction() {
        let sentences = vec![
            "The project started on 2024-01-15 and costs 1500.00 dollars.".to_string(),
            "There were 42 participants in 2023.".to_string(),
        ];
        let entities = extract_entities(&sentences);
        // Should find numeric tokens like "2024-01-15", "1500.00", "42", "2023"
        let has_number = entities
            .iter()
            .any(|e| e.chars().any(|c| c.is_ascii_digit()));
        assert!(has_number, "expected numeric entities; got {entities:?}");
    }

    // -------------------------------------------------------------------------
    // Test 5: Deduplication of similar sentences
    // -------------------------------------------------------------------------
    #[test]
    fn test_deduplication_removes_near_duplicates() {
        let sentences = vec![
            "The cat sat on the mat.".to_string(),
            "The cat sat on the mat.".to_string(), // exact duplicate
            "The cat is sitting on the mat.".to_string(), // near duplicate
            "Dogs love to play in the park every afternoon.".to_string(),
        ];
        let deduped = deduplicate_sentences(&sentences);
        // Exact duplicate must be removed; unique sentence kept
        assert!(
            deduped.len() < sentences.len(),
            "deduped len {} should be < original len {}",
            deduped.len(),
            sentences.len()
        );
        assert!(deduped.iter().any(|s| s.contains("Dogs")));
    }

    // -------------------------------------------------------------------------
    // Test 6: Compression ratio computed correctly
    // -------------------------------------------------------------------------
    #[test]
    fn test_compression_ratio_computed() {
        let compressor = default_compressor();
        let text = "I think basically we need to understand that the system, \
                    you know, is sort of designed to handle large amounts of data. \
                    Actually to be honest the architecture was I believe chosen to \
                    support scalability. At the end of the day the database stores \
                    records and provides search functionality for the application. \
                    The API layer handles authentication and rate limiting as well.";
        let result = compressor.compress(text);
        assert!(
            result.ratio > 0.0 && result.ratio <= 1.0,
            "ratio {} should be in (0, 1]",
            result.ratio
        );
        // Exact float equality is safe: this recomputes the identical division.
        assert_eq!(
            result.ratio,
            result.compressed_tokens as f32 / result.original_tokens as f32
        );
    }

    // -------------------------------------------------------------------------
    // Test 7: Decompress produces non-empty text
    // -------------------------------------------------------------------------
    #[test]
    fn test_decompress_produces_non_empty_text() {
        let compressor = default_compressor();
        let text = "Alice joined Google in 2022 as a senior engineer. \
                    She works on distributed systems and handles large scale data pipelines. \
                    The team uses Rust and Go for backend services in the cloud infrastructure.";
        let compressed = compressor.compress(text);
        let decompressed = compressor.decompress(&compressed);
        assert!(
            !decompressed.is_empty(),
            "decompress should produce non-empty text"
        );
    }

    // -------------------------------------------------------------------------
    // Test 8: Batch compression
    // -------------------------------------------------------------------------
    #[test]
    fn test_batch_compression() {
        let compressor = default_compressor();
        let texts = &[
            "Short text.",
            "Alice works at Google as a software engineer and manages infrastructure projects in California.",
            "The system provides search and storage capabilities for large enterprise applications.",
        ];
        let results = compressor.compress_batch(texts);
        assert_eq!(results.len(), texts.len());
    }

    // -------------------------------------------------------------------------
    // Test 9: Empty input handled gracefully
    // -------------------------------------------------------------------------
    #[test]
    fn test_empty_input_handled() {
        let compressor = default_compressor();
        let result = compressor.compress("");
        assert_eq!(result.original_tokens, 0);
        assert_eq!(result.compressed_tokens, 0);
        assert!(result.structured_content.is_empty());
        assert!(result.key_entities.is_empty());
    }

    // -------------------------------------------------------------------------
    // Test 10: Whitespace-only input handled gracefully
    // -------------------------------------------------------------------------
    #[test]
    fn test_whitespace_only_input_handled() {
        let compressor = default_compressor();
        let result = compressor.compress("   \n\t   ");
        assert!(result.structured_content.is_empty());
    }

    // -------------------------------------------------------------------------
    // Test 11: Jaccard similarity
    // -------------------------------------------------------------------------
    #[test]
    fn test_jaccard_identical_sentences() {
        let a = "the cat sat on the mat";
        assert!((jaccard_similarity(a, a) - 1.0).abs() < 1e-9);
    }

    #[test]
    fn test_jaccard_disjoint_sentences() {
        let a = "apple orange banana";
        let b = "car truck motorcycle";
        assert_eq!(jaccard_similarity(a, b), 0.0);
    }
}
679}