Skip to main content

engram/intelligence/
synthesis.rs

//! Online Semantic Synthesis (RML-1209)
//!
//! Detects when newly added memories overlap with recent ones in the session
//! buffer and synthesizes them into a single richer memory, reducing redundancy
//! and token overhead for downstream consumers.
//!
//! # Design
//! - Pure in-memory: no database access, no async I/O.
//! - Session-scoped: the buffer lives only as long as the [`SynthesisEngine`].
//! - Overlap detection uses Jaccard similarity on stopword-filtered tokens.
//! - Three synthesis strategies: [`SynthesisStrategy::Merge`],
//!   [`SynthesisStrategy::Replace`], and [`SynthesisStrategy::Append`].
13
use serde::{Deserialize, Serialize};
use std::collections::{HashSet, VecDeque};
16
// ---------------------------------------------------------------------------
// Stopwords
// ---------------------------------------------------------------------------
20
/// Basic English stopwords removed from token streams before Jaccard
/// similarity is computed.
///
/// Every entry is lowercase; the tokenizer lowercases its input before
/// consulting this list.
static STOPWORDS: &[&str] = &[
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "do", "does", "did", "will", "would", "could", "should", "may", "might", "shall", "can", "to",
    "of", "in", "for", "on", "with", "at", "by", "from",
];
27
// ---------------------------------------------------------------------------
// Public types
// ---------------------------------------------------------------------------
31
32/// Configuration for the synthesis engine.
33#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct SynthesisConfig {
35    /// Jaccard threshold above which two memories are considered overlapping.
36    /// Default: 0.4
37    pub overlap_threshold: f32,
38    /// Number of recent memories kept in the sliding buffer.
39    /// Default: 50
40    pub buffer_size: usize,
41    /// Minimum shared token count required (in addition to threshold).
42    /// Default: 5
43    pub min_overlap_tokens: usize,
44}
45
46impl Default for SynthesisConfig {
47    fn default() -> Self {
48        Self {
49            overlap_threshold: 0.4,
50            buffer_size: 50,
51            min_overlap_tokens: 5,
52        }
53    }
54}
55
56/// How the engine should combine overlapping memories.
57#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
58#[serde(rename_all = "snake_case")]
59pub enum SynthesisStrategy {
60    /// Combine both into richer content (unique sentences from each).
61    Merge,
62    /// Keep the newer content, discard the older.
63    Replace,
64    /// Concatenate with a separator, deduplicating identical lines.
65    Append,
66}
67
68/// The result of a synthesis operation.
69#[derive(Debug, Clone, Serialize, Deserialize)]
70pub struct SynthesizedMemory {
71    /// The synthesized content string.
72    pub content: String,
73    /// IDs of the source memories that were combined.
74    pub sources: Vec<i64>,
75    /// Jaccard overlap score that triggered synthesis.
76    pub overlap_score: f32,
77    /// Which strategy was applied.
78    pub strategy_used: SynthesisStrategy,
79    /// Approximate token reduction vs. keeping both originals separately.
80    pub tokens_saved: usize,
81}
82
83/// One overlap candidate found in the buffer.
84#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct OverlapResult {
86    /// Memory ID of the overlapping buffer entry.
87    pub memory_id: i64,
88    /// Jaccard overlap score.
89    pub overlap_score: f32,
90    /// Tokens shared between the new content and this buffer entry.
91    pub shared_tokens: Vec<String>,
92}
93
// ---------------------------------------------------------------------------
// Internal buffer entry
// ---------------------------------------------------------------------------
97
/// A slot in the recent-memory buffer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BufferEntry {
    /// Persistent memory ID (assigned by storage layer).
    pub id: i64,
    /// Raw content string.
    pub content: String,
    /// Pre-computed normalised tokens (lowercase, stopwords removed), kept in
    /// source order and possibly containing repeats.
    pub tokens: Vec<String>,
}
107
// ---------------------------------------------------------------------------
// SynthesisEngine
// ---------------------------------------------------------------------------
111
112/// Session-scoped online synthesis engine.
113///
114/// # Usage
115/// ```rust
116/// use engram::intelligence::synthesis::{SynthesisConfig, SynthesisEngine, SynthesisStrategy};
117///
118/// let mut engine = SynthesisEngine::new(SynthesisConfig::default());
119/// engine.add_to_buffer(1, "Rust ownership model uses borrow checker rules");
120///
121/// if let Some(synth) = engine.check_and_synthesize(
122///     "Rust borrow checker enforces ownership rules at compile time",
123///     SynthesisStrategy::Merge,
124/// ) {
125///     println!("Synthesized: {}", synth.content);
126/// }
127/// ```
128pub struct SynthesisEngine {
129    config: SynthesisConfig,
130    buffer: VecDeque<BufferEntry>,
131}
132
133impl SynthesisEngine {
134    /// Create a new engine with the given configuration.
135    pub fn new(config: SynthesisConfig) -> Self {
136        Self {
137            buffer: VecDeque::with_capacity(config.buffer_size),
138            config,
139        }
140    }
141
142    // -----------------------------------------------------------------------
143    // Tokenisation
144    // -----------------------------------------------------------------------
145
146    /// Tokenise `text`: lowercase, split on whitespace and punctuation, strip
147    /// leading/trailing non-alphanumeric characters, filter empty tokens and
148    /// common English stopwords.
149    pub fn tokenize(text: &str) -> Vec<String> {
150        let lower = text.to_lowercase();
151        lower
152            .split(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
153            .map(|tok| tok.trim_matches(|c: char| !c.is_alphanumeric()).to_string())
154            .filter(|tok| !tok.is_empty() && !STOPWORDS.contains(&tok.as_str()))
155            .collect()
156    }
157
158    // -----------------------------------------------------------------------
159    // Similarity
160    // -----------------------------------------------------------------------
161
162    /// Jaccard similarity: |intersection| / |union| over token sets.
163    ///
164    /// Returns 1.0 for two empty slices and 0.0 when the union is empty
165    /// (degenerate case handled gracefully).
166    pub fn jaccard_similarity(a: &[String], b: &[String]) -> f32 {
167        if a.is_empty() && b.is_empty() {
168            return 1.0;
169        }
170
171        // Build sets using sorted dedup (avoids HashSet allocation for small slices)
172        let mut set_a: Vec<&str> = a.iter().map(String::as_str).collect();
173        set_a.sort_unstable();
174        set_a.dedup();
175
176        let mut set_b: Vec<&str> = b.iter().map(String::as_str).collect();
177        set_b.sort_unstable();
178        set_b.dedup();
179
180        let mut intersection = 0usize;
181        let (mut i, mut j) = (0, 0);
182        while i < set_a.len() && j < set_b.len() {
183            match set_a[i].cmp(set_b[j]) {
184                std::cmp::Ordering::Equal => {
185                    intersection += 1;
186                    i += 1;
187                    j += 1;
188                }
189                std::cmp::Ordering::Less => i += 1,
190                std::cmp::Ordering::Greater => j += 1,
191            }
192        }
193
194        // |union| = |A| + |B| - |intersection|
195        let union = set_a.len() + set_b.len() - intersection;
196        if union == 0 {
197            0.0
198        } else {
199            intersection as f32 / union as f32
200        }
201    }
202
203    // -----------------------------------------------------------------------
204    // Overlap detection
205    // -----------------------------------------------------------------------
206
207    /// Check `new_content` against every entry in the buffer.
208    ///
209    /// Returns all entries whose Jaccard score meets both the
210    /// `overlap_threshold` and `min_overlap_tokens` criteria, sorted by score
211    /// descending.
212    pub fn detect_overlap(&self, new_content: &str) -> Vec<OverlapResult> {
213        let new_tokens = Self::tokenize(new_content);
214
215        let mut results: Vec<OverlapResult> = self
216            .buffer
217            .iter()
218            .filter_map(|entry| {
219                let score = Self::jaccard_similarity(&new_tokens, &entry.tokens);
220                if score < self.config.overlap_threshold {
221                    return None;
222                }
223
224                // Collect shared tokens (intersection)
225                let mut new_sorted: Vec<&str> = new_tokens.iter().map(String::as_str).collect();
226                new_sorted.sort_unstable();
227                new_sorted.dedup();
228
229                let mut buf_sorted: Vec<&str> = entry.tokens.iter().map(String::as_str).collect();
230                buf_sorted.sort_unstable();
231                buf_sorted.dedup();
232
233                let shared: Vec<String> = {
234                    let (mut i, mut j) = (0, 0);
235                    let mut shared = Vec::new();
236                    while i < new_sorted.len() && j < buf_sorted.len() {
237                        match new_sorted[i].cmp(buf_sorted[j]) {
238                            std::cmp::Ordering::Equal => {
239                                shared.push(new_sorted[i].to_string());
240                                i += 1;
241                                j += 1;
242                            }
243                            std::cmp::Ordering::Less => i += 1,
244                            std::cmp::Ordering::Greater => j += 1,
245                        }
246                    }
247                    shared
248                };
249
250                if shared.len() < self.config.min_overlap_tokens {
251                    return None;
252                }
253
254                Some(OverlapResult {
255                    memory_id: entry.id,
256                    overlap_score: score,
257                    shared_tokens: shared,
258                })
259            })
260            .collect();
261
262        results.sort_by(|a, b| {
263            b.overlap_score
264                .partial_cmp(&a.overlap_score)
265                .unwrap_or(std::cmp::Ordering::Equal)
266        });
267
268        results
269    }
270
271    // -----------------------------------------------------------------------
272    // Synthesis
273    // -----------------------------------------------------------------------
274
275    /// Synthesise a new memory from `existing_content` (id `existing_id`) and
276    /// `new_content` using the chosen `strategy`.
277    ///
278    /// The `overlap_score` is embedded in the result for caller transparency.
279    pub fn synthesize(
280        &self,
281        existing_content: &str,
282        existing_id: i64,
283        new_content: &str,
284        strategy: SynthesisStrategy,
285    ) -> SynthesizedMemory {
286        let existing_tokens = Self::tokenize(existing_content);
287        let new_tokens = Self::tokenize(new_content);
288        let overlap_score = Self::jaccard_similarity(&existing_tokens, &new_tokens);
289
290        let combined_raw_len = existing_content.len() + new_content.len();
291
292        let content = match strategy {
293            SynthesisStrategy::Merge => Self::merge_content(existing_content, new_content),
294            SynthesisStrategy::Replace => new_content.to_string(),
295            SynthesisStrategy::Append => Self::append_content(existing_content, new_content),
296        };
297
298        let tokens_saved = combined_raw_len.saturating_sub(content.len());
299
300        SynthesizedMemory {
301            content,
302            sources: vec![existing_id],
303            overlap_score,
304            strategy_used: strategy,
305            tokens_saved,
306        }
307    }
308
309    /// Merge strategy: interleave unique sentences from both inputs.
310    ///
311    /// Sentences are split on `.`, `!`, `?` and then deduplicated (normalised
312    /// lowercase comparison). Existing sentences appear first; unique new
313    /// sentences are appended.
314    fn merge_content(existing: &str, new: &str) -> String {
315        let existing_sentences = Self::split_sentences(existing);
316        let new_sentences = Self::split_sentences(new);
317
318        let existing_norm: Vec<String> = existing_sentences
319            .iter()
320            .map(|s| s.to_lowercase())
321            .collect();
322
323        let mut merged: Vec<&str> = existing_sentences.iter().map(String::as_str).collect();
324
325        for (raw, norm) in new_sentences.iter().zip(
326            new_sentences
327                .iter()
328                .map(|s| s.to_lowercase())
329                .collect::<Vec<_>>()
330                .iter(),
331        ) {
332            if !existing_norm.contains(norm) {
333                merged.push(raw.as_str());
334            }
335        }
336
337        merged.join(" ").trim().to_string()
338    }
339
340    /// Append strategy: concatenate with `\n---\n` separator and deduplicate
341    /// identical lines (case-insensitive).
342    fn append_content(existing: &str, new: &str) -> String {
343        let existing_lines: Vec<&str> = existing.lines().collect();
344        let existing_norm: Vec<String> = existing_lines
345            .iter()
346            .map(|l| l.trim().to_lowercase())
347            .collect();
348
349        let mut result_lines: Vec<&str> = existing_lines;
350
351        // Add separator only if existing is non-empty
352        let separator_added = !existing.trim().is_empty();
353
354        let mut new_lines: Vec<&str> = Vec::new();
355        for line in new.lines() {
356            let norm = line.trim().to_lowercase();
357            if !norm.is_empty() && !existing_norm.contains(&norm) {
358                new_lines.push(line);
359            }
360        }
361
362        if !new_lines.is_empty() {
363            if separator_added {
364                result_lines.push("---");
365            }
366            result_lines.extend_from_slice(&new_lines);
367        }
368
369        result_lines.join("\n")
370    }
371
372    /// Split text into non-empty trimmed sentences.
373    fn split_sentences(text: &str) -> Vec<String> {
374        text.split(['.', '!', '?'])
375            .map(|s| s.trim().to_string())
376            .filter(|s| !s.is_empty())
377            .collect()
378    }
379
380    // -----------------------------------------------------------------------
381    // Buffer management
382    // -----------------------------------------------------------------------
383
384    /// Add a memory to the sliding buffer, evicting the oldest entry when the
385    /// buffer is at capacity.
386    pub fn add_to_buffer(&mut self, id: i64, content: &str) {
387        if self.buffer.len() >= self.config.buffer_size {
388            self.buffer.pop_front();
389        }
390        let tokens = Self::tokenize(content);
391        self.buffer.push_back(BufferEntry {
392            id,
393            content: content.to_string(),
394            tokens,
395        });
396    }
397
398    /// Current number of entries in the buffer.
399    pub fn buffer_len(&self) -> usize {
400        self.buffer.len()
401    }
402
403    // -----------------------------------------------------------------------
404    // Convenience
405    // -----------------------------------------------------------------------
406
407    /// Convenience method: detect overlap and, if found, synthesise using the
408    /// best-scoring buffer entry.
409    ///
410    /// Returns `None` when no buffer entry meets the overlap criteria.
411    pub fn check_and_synthesize(
412        &self,
413        new_content: &str,
414        strategy: SynthesisStrategy,
415    ) -> Option<SynthesizedMemory> {
416        let overlaps = self.detect_overlap(new_content);
417        let best = overlaps.first()?;
418
419        // Find the buffer entry for the best overlap
420        let entry = self.buffer.iter().find(|e| e.id == best.memory_id)?;
421
422        Some(self.synthesize(&entry.content, entry.id, new_content, strategy))
423    }
424}
425
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
429
#[cfg(test)]
mod tests {
    use super::*;

    // 1. Jaccard similarity computation
    #[test]
    fn test_jaccard_similarity_basic() {
        // Identical token sets => 1.0
        let a = vec!["rust".to_string(), "ownership".to_string()];
        let b = vec!["rust".to_string(), "ownership".to_string()];
        let sim = SynthesisEngine::jaccard_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 1e-6, "identical sets should give 1.0");

        // Disjoint => 0.0
        let c = vec!["apple".to_string(), "banana".to_string()];
        let d = vec!["car".to_string(), "truck".to_string()];
        let sim2 = SynthesisEngine::jaccard_similarity(&c, &d);
        assert!((sim2 - 0.0).abs() < 1e-6, "disjoint sets should give 0.0");

        // Partial overlap: {rust, borrow} vs {rust, ownership} => 1/3
        let e = vec!["rust".to_string(), "borrow".to_string()];
        let f = vec!["rust".to_string(), "ownership".to_string()];
        let sim3 = SynthesisEngine::jaccard_similarity(&e, &f);
        let expected = 1.0f32 / 3.0;
        assert!(
            (sim3 - expected).abs() < 1e-5,
            "partial overlap sim={sim3} expected≈{expected}"
        );
    }

    // 2. Detect overlap above threshold
    #[test]
    fn test_detect_overlap_above_threshold() {
        let mut engine = SynthesisEngine::new(SynthesisConfig {
            overlap_threshold: 0.3,
            buffer_size: 10,
            min_overlap_tokens: 2,
        });
        engine.add_to_buffer(
            1,
            "Rust ownership model uses borrow checker rules compile time safety",
        );

        let results =
            engine.detect_overlap("Rust borrow checker enforces ownership rules compile time");
        assert!(
            !results.is_empty(),
            "should find overlap for highly similar content"
        );
        assert_eq!(results[0].memory_id, 1);
        assert!(results[0].overlap_score >= 0.3);
        // rust, ownership, borrow, checker, rules, compile, time
        assert_eq!(results[0].shared_tokens.len(), 7);
    }

    // 3. No overlap below threshold
    #[test]
    fn test_no_overlap_below_threshold() {
        let mut engine = SynthesisEngine::new(SynthesisConfig {
            overlap_threshold: 0.6,
            buffer_size: 10,
            min_overlap_tokens: 2,
        });
        engine.add_to_buffer(1, "SQLite WAL mode improves write concurrency");

        let results = engine.detect_overlap("Python async await event loop");
        assert!(
            results.is_empty(),
            "unrelated content should not trigger overlap"
        );
    }

    // 4. Merge strategy combines content
    #[test]
    fn test_merge_strategy_combines_content() {
        let engine = SynthesisEngine::new(SynthesisConfig::default());
        let existing = "Rust uses a borrow checker. Safety is guaranteed at compile time.";
        let new = "Rust uses a borrow checker. Memory leaks are prevented automatically.";

        let result = engine.synthesize(existing, 1, new, SynthesisStrategy::Merge);

        assert_eq!(result.strategy_used, SynthesisStrategy::Merge);
        // Should contain the sentence unique to the existing input
        assert!(
            result
                .content
                .contains("Safety is guaranteed at compile time"),
            "merged content should keep the unique existing sentence"
        );
        // Should contain unique sentence from the new input
        assert!(
            result
                .content
                .contains("Memory leaks are prevented automatically"),
            "merged content should include the unique new sentence"
        );
        // Should not duplicate the shared sentence
        let count = result.content.matches("Rust uses a borrow checker").count();
        assert_eq!(count, 1, "shared sentence should appear only once");
        assert_eq!(result.sources, vec![1]);
    }

    // 5. Replace strategy keeps newer content
    #[test]
    fn test_replace_strategy_keeps_newer() {
        let engine = SynthesisEngine::new(SynthesisConfig::default());
        let existing = "Old description of Rust ownership.";
        let new = "Updated and more detailed description of Rust ownership.";

        let result = engine.synthesize(existing, 2, new, SynthesisStrategy::Replace);

        assert_eq!(result.strategy_used, SynthesisStrategy::Replace);
        assert_eq!(result.content, new);
        assert_eq!(result.sources, vec![2]);
    }

    // 6. Append strategy concatenates with separator and deduplicates
    #[test]
    fn test_append_strategy_concatenates() {
        let engine = SynthesisEngine::new(SynthesisConfig::default());
        let existing = "Line one\nLine two";
        let new = "Line two\nLine three";

        let result = engine.synthesize(existing, 3, new, SynthesisStrategy::Append);

        assert_eq!(result.strategy_used, SynthesisStrategy::Append);
        // "Line two" appears once, the separator precedes the new unique line.
        assert_eq!(
            result.content, "Line one\nLine two\n---\nLine three",
            "existing lines, separator, then only the unique new line"
        );
    }

    // 7. Buffer eviction when full
    #[test]
    fn test_buffer_eviction_when_full() {
        let config = SynthesisConfig {
            overlap_threshold: 0.4,
            buffer_size: 3,
            min_overlap_tokens: 1,
        };
        let mut engine = SynthesisEngine::new(config);

        engine.add_to_buffer(1, "memory one");
        engine.add_to_buffer(2, "memory two");
        engine.add_to_buffer(3, "memory three");
        assert_eq!(engine.buffer_len(), 3);

        // Adding a 4th entry should evict the oldest (id=1)
        engine.add_to_buffer(4, "memory four");
        assert_eq!(engine.buffer_len(), 3, "buffer should not exceed capacity");

        // Entry id=1 should no longer be in the buffer
        let ids: Vec<i64> = engine.buffer.iter().map(|e| e.id).collect();
        assert!(
            !ids.contains(&1),
            "oldest entry should have been evicted, got: {ids:?}"
        );
        assert!(ids.contains(&4), "newest entry should be present");
    }

    // 8. Empty / very short content handled gracefully
    #[test]
    fn test_empty_content_handled() {
        let mut engine = SynthesisEngine::new(SynthesisConfig::default());

        // Tokenising empty string should return empty vec
        let tokens = SynthesisEngine::tokenize("");
        assert!(tokens.is_empty());

        // Two empty token sets => Jaccard 1.0
        let sim = SynthesisEngine::jaccard_similarity(&[], &[]);
        assert!((sim - 1.0).abs() < 1e-6);

        // Adding empty content to buffer should not panic
        engine.add_to_buffer(99, "");
        assert_eq!(engine.buffer_len(), 1);

        // Detecting overlap on empty new content should return empty results
        // (no entry will meet min_overlap_tokens=5 with zero tokens)
        let results = engine.detect_overlap("");
        assert!(results.is_empty(), "empty new content yields no overlaps");
    }

    // 9. check_and_synthesize returns None when buffer is empty
    #[test]
    fn test_check_and_synthesize_empty_buffer() {
        let engine = SynthesisEngine::new(SynthesisConfig::default());
        let result = engine.check_and_synthesize("some new content here", SynthesisStrategy::Merge);
        assert!(result.is_none(), "empty buffer should return None");
    }

    // 10. check_and_synthesize returns Some when overlap is found
    #[test]
    fn test_check_and_synthesize_returns_some() {
        let mut engine = SynthesisEngine::new(SynthesisConfig {
            overlap_threshold: 0.3,
            buffer_size: 10,
            min_overlap_tokens: 2,
        });
        engine.add_to_buffer(42, "async rust tokio runtime executor futures polling");

        let result = engine.check_and_synthesize(
            "tokio runtime executor drives async futures rust",
            SynthesisStrategy::Replace,
        );
        assert!(
            result.is_some(),
            "should find overlap and return synthesized memory"
        );
        let synth = result.unwrap();
        assert_eq!(synth.sources, vec![42]);
    }

    // 11. tokens_saved reflects the reduction in content length (bytes)
    #[test]
    fn test_tokens_saved_replace_strategy() {
        let engine = SynthesisEngine::new(SynthesisConfig::default());
        let existing = "A".repeat(100);
        let new = "B".repeat(20);

        // Replace keeps only new (20 bytes). combined_raw = 120. saved = 120-20 = 100
        let result = engine.synthesize(&existing, 5, &new, SynthesisStrategy::Replace);
        assert_eq!(result.tokens_saved, 100);
    }
}