Skip to main content

shodh_memory/memory/
compression.rs

1//! Compression pipeline for memory optimization
2
3use anyhow::{anyhow, Result};
4use base64::{engine::general_purpose, Engine as _};
5use lz4;
6use rust_stemmers::{Algorithm, Stemmer};
7use serde::{Deserialize, Serialize};
8use std::collections::{HashMap, HashSet};
9
10use super::types::*;
11use crate::constants::{
12    COMPRESSION_ACCESS_THRESHOLD, COMPRESSION_AGE_DAYS, COMPRESSION_IMPORTANCE_HIGH,
13    COMPRESSION_IMPORTANCE_LOW, CONSOLIDATION_JACCARD_THRESHOLD,
14    CONSOLIDATION_MAX_CANDIDATES_PER_MEMORY, CONSOLIDATION_MIN_AGE_DAYS, CONSOLIDATION_MIN_SUPPORT,
15    MAX_COMPRESSION_RATIO, MAX_DECOMPRESSED_SIZE,
16};
17
/// Compression strategy for memories
///
/// Chosen per-memory by `CompressionPipeline::select_strategy` based on
/// importance, access frequency, and age.
#[derive(Debug, Clone)]
pub enum CompressionStrategy {
    /// Leave the memory uncompressed.
    None,
    /// Fast, lossless LZ4 block compression.
    Lz4,
    /// Lossy semantic compression: keep only a summary plus keywords.
    Summarization,
    /// Semantic summarization followed by LZ4 on the result.
    Hybrid,
}
26
/// Compressed memory representation
///
/// NOTE(review): not constructed anywhere in this module — presumably a
/// serialization target used by a storage layer elsewhere; confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressedMemory {
    /// Identifier of the original memory.
    pub id: MemoryId,
    /// Extractive summary of the original content.
    pub summary: String,
    /// Top keywords extracted from the original content.
    pub keywords: Vec<String>,
    /// Importance score carried over from the original memory.
    pub importance: f32,
    /// Creation time of the original memory.
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// compressed size / original size (smaller is better).
    pub compression_ratio: f32,
    /// Size in bytes of the serialized original experience.
    pub original_size: usize,
    /// Opaque compressed payload.
    pub compressed_data: Vec<u8>,
    /// Name of the strategy used (e.g. "lz4", "semantic", "hybrid").
    pub strategy: String,
}
40
/// Compression pipeline for optimizing memory storage
///
/// All methods take `&self`; the only state is the extractor's stop-word set,
/// so one instance can be built once and reused.
pub struct CompressionPipeline {
    /// Used by semantic compression to pull representative keywords out of content.
    keyword_extractor: KeywordExtractor,
}
45
46impl Default for CompressionPipeline {
47    fn default() -> Self {
48        Self::new()
49    }
50}
51
52impl CompressionPipeline {
53    pub fn new() -> Self {
54        Self {
55            keyword_extractor: KeywordExtractor::new(),
56        }
57    }
58
59    /// Compress a memory based on its characteristics
60    pub fn compress(&self, memory: &Memory) -> Result<Memory> {
61        // Don't compress if already compressed or very recent
62        if memory.compressed {
63            return Ok(memory.clone());
64        }
65
66        let strategy = self.select_strategy(memory);
67
68        match strategy {
69            CompressionStrategy::None => Ok(memory.clone()),
70            CompressionStrategy::Lz4 => self.compress_lz4(memory),
71            CompressionStrategy::Summarization => self.compress_semantic(memory),
72            CompressionStrategy::Hybrid => self.compress_hybrid(memory),
73        }
74    }
75
76    /// Select compression strategy based on memory characteristics
77    fn select_strategy(&self, memory: &Memory) -> CompressionStrategy {
78        // High importance memories get lighter compression (lossless LZ4)
79        if memory.importance() > COMPRESSION_IMPORTANCE_HIGH {
80            return CompressionStrategy::Lz4;
81        }
82
83        // Frequently accessed memories stay uncompressed
84        if memory.access_count() > COMPRESSION_ACCESS_THRESHOLD {
85            return CompressionStrategy::None;
86        }
87
88        // Old, low-importance memories get aggressive compression (lossy semantic)
89        let age = chrono::Utc::now() - memory.created_at;
90        if age.num_days() > COMPRESSION_AGE_DAYS && memory.importance() < COMPRESSION_IMPORTANCE_LOW
91        {
92            return CompressionStrategy::Summarization;
93        }
94
95        // Default to hybrid approach
96        CompressionStrategy::Hybrid
97    }
98
99    /// LZ4 compression - preserves all data
100    fn compress_lz4(&self, memory: &Memory) -> Result<Memory> {
101        let original =
102            bincode::serde::encode_to_vec(&memory.experience, bincode::config::standard())?;
103        let compressed = lz4::block::compress(&original, None, false)?;
104
105        let compression_ratio = compressed.len() as f32 / original.len() as f32;
106
107        // Create compressed version
108        let mut compressed_memory = memory.clone();
109        compressed_memory.compressed = true;
110
111        // Store compressed data in metadata
112        let compressed_b64 = general_purpose::STANDARD.encode(&compressed);
113        compressed_memory
114            .experience
115            .metadata
116            .insert("compressed_data".to_string(), compressed_b64);
117        compressed_memory.experience.metadata.insert(
118            "compression_ratio".to_string(),
119            compression_ratio.to_string(),
120        );
121        compressed_memory
122            .experience
123            .metadata
124            .insert("compression_strategy".to_string(), "lz4".to_string());
125
126        Ok(compressed_memory)
127    }
128
129    /// Semantic compression - extract essence
130    fn compress_semantic(&self, memory: &Memory) -> Result<Memory> {
131        let mut compressed_memory = memory.clone();
132
133        // Extract keywords
134        let keywords = self.keyword_extractor.extract(&memory.experience.content);
135
136        // Create summary (simplified - in production would use LLM)
137        let summary = self.create_summary(&memory.experience.content, 50);
138
139        // Store only summary and keywords
140        compressed_memory.experience.content = summary;
141        compressed_memory
142            .experience
143            .metadata
144            .insert("keywords".to_string(), keywords.join(","));
145        compressed_memory
146            .experience
147            .metadata
148            .insert("compression_strategy".to_string(), "semantic".to_string());
149        compressed_memory.compressed = true;
150
151        Ok(compressed_memory)
152    }
153
154    /// Hybrid compression - combine strategies
155    fn compress_hybrid(&self, memory: &Memory) -> Result<Memory> {
156        // First apply semantic compression
157        let semantic = self.compress_semantic(memory)?;
158
159        // Then apply LZ4 on the result
160        self.compress_lz4(&semantic)
161    }
162
163    /// Decompress a memory
164    ///
165    /// # Returns
166    /// - `Ok(Memory)` - Decompressed memory with original content restored
167    /// - `Err` - If decompression fails or compression is lossy (semantic)
168    ///
169    /// # Errors
170    /// - Returns error for semantic compression (lossy - original data not recoverable)
171    /// - Returns error if compressed data is missing or corrupted
172    pub fn decompress(&self, memory: &Memory) -> Result<Memory> {
173        if !memory.compressed {
174            return Ok(memory.clone());
175        }
176
177        let strategy = memory
178            .experience
179            .metadata
180            .get("compression_strategy")
181            .map(|s| s.as_str())
182            .unwrap_or("unknown");
183
184        match strategy {
185            "lz4" => self.decompress_lz4(memory),
186            "semantic" => {
187                // Semantic compression is LOSSY - original content is NOT recoverable
188                // This is intentional: we extracted keywords and summary, discarded original
189                // Callers must handle this error appropriately
190                Err(anyhow!(
191                    "Cannot decompress semantically compressed memory '{}': \
192                     semantic compression is lossy. Original content was replaced with \
193                     summary and keywords. Use memory.experience.content for the summary \
194                     and metadata['keywords'] for extracted keywords.",
195                    memory.id.0
196                ))
197            }
198            "hybrid" => {
199                // Hybrid = semantic + lz4. The lz4 layer can be decompressed,
200                // but the underlying content is still the semantic summary
201                let lz4_decompressed = self.decompress_lz4(memory)?;
202                // Mark that this is still semantically compressed (lossy)
203                Err(anyhow!(
204                    "Cannot fully decompress hybrid-compressed memory '{}': \
205                     underlying semantic compression is lossy. LZ4 layer decompressed, \
206                     but original content is not recoverable.",
207                    lz4_decompressed.id.0
208                ))
209            }
210            unknown => Err(anyhow!(
211                "Unknown compression strategy '{}' for memory '{}'. \
212                 Cannot decompress.",
213                unknown,
214                memory.id.0
215            )),
216        }
217    }
218
219    /// Check if a memory's compression is lossless (can be fully decompressed)
220    pub fn is_lossless(&self, memory: &Memory) -> bool {
221        if !memory.compressed {
222            return true;
223        }
224        let strategy = memory
225            .experience
226            .metadata
227            .get("compression_strategy")
228            .map(|s| s.as_str())
229            .unwrap_or("unknown");
230        strategy == "lz4"
231    }
232
233    /// Get the compression strategy used for a memory
234    pub fn get_strategy<'a>(&self, memory: &'a Memory) -> Option<&'a str> {
235        if !memory.compressed {
236            return None;
237        }
238        memory
239            .experience
240            .metadata
241            .get("compression_strategy")
242            .map(|s| s.as_str())
243    }
244
245    /// Decompress LZ4 compressed memory
246    fn decompress_lz4(&self, memory: &Memory) -> Result<Memory> {
247        if let Some(compressed_b64) = memory.experience.metadata.get("compressed_data") {
248            let compressed = general_purpose::STANDARD.decode(compressed_b64)?;
249
250            // Zip bomb protection: Check compression ratio before decompressing
251            // A small payload claiming to decompress to MAX_DECOMPRESSED_SIZE is suspicious
252            let compressed_size = compressed.len();
253            let max_expected_decompressed = compressed_size.saturating_mul(MAX_COMPRESSION_RATIO);
254
255            if max_expected_decompressed > MAX_DECOMPRESSED_SIZE as usize {
256                // The compressed size is so small that even at MAX_COMPRESSION_RATIO
257                // it would exceed our limit - this is suspicious
258                return Err(anyhow!(
259                    "Suspicious compression ratio: compressed size {} bytes with max ratio {} \
260                     would allow {} bytes decompressed, which exceeds limit of {} bytes. \
261                     This may indicate a zip bomb attack.",
262                    compressed_size,
263                    MAX_COMPRESSION_RATIO,
264                    max_expected_decompressed,
265                    MAX_DECOMPRESSED_SIZE
266                ));
267            }
268
269            // Limit decompression size to prevent DoS attacks
270            let decompressed = lz4::block::decompress(&compressed, Some(MAX_DECOMPRESSED_SIZE))?;
271
272            // Post-decompression ratio check for additional safety
273            let actual_ratio = if compressed_size > 0 {
274                decompressed.len() / compressed_size
275            } else {
276                0
277            };
278            if actual_ratio > MAX_COMPRESSION_RATIO {
279                return Err(anyhow!(
280                    "Decompression ratio {} exceeds maximum allowed ratio of {}. \
281                     Compressed: {} bytes, Decompressed: {} bytes. \
282                     This may indicate a zip bomb attack.",
283                    actual_ratio,
284                    MAX_COMPRESSION_RATIO,
285                    compressed_size,
286                    decompressed.len()
287                ));
288            }
289
290            let (experience, _): (Experience, _) =
291                bincode::serde::decode_from_slice(&decompressed, bincode::config::standard())?;
292
293            // Restore the memory
294            let mut restored = memory.clone();
295            restored.experience = experience;
296            restored.compressed = false;
297            restored.experience.metadata.remove("compressed_data");
298            restored.experience.metadata.remove("compression_ratio");
299            restored.experience.metadata.remove("compression_strategy");
300
301            Ok(restored)
302        } else {
303            Err(anyhow!("No compressed data found"))
304        }
305    }
306
307    /// Create a summary of content (extractive - takes first N words)
308    fn create_summary(&self, content: &str, max_words: usize) -> String {
309        // Simple extractive summary - take first N words
310        // In production, this would use NLP/LLM
311        let words: Vec<&str> = content.split_whitespace().collect();
312        let summary_words = &words[..words.len().min(max_words)];
313        format!("{}...", summary_words.join(" "))
314    }
315}
316
/// Keyword extraction for semantic compression
///
/// Frequency-based extractor with a fixed English stop-word list; shared by
/// the compression pipeline and the semantic consolidator.
struct KeywordExtractor {
    /// Lowercased stop words excluded from keyword/token candidates.
    stop_words: HashSet<String>,
}
321
322impl KeywordExtractor {
323    fn new() -> Self {
324        let stop_words = Self::load_stop_words();
325        Self { stop_words }
326    }
327
328    fn extract(&self, text: &str) -> Vec<String> {
329        // Simple TF-IDF style extraction
330        let mut word_freq: HashMap<String, usize> = HashMap::new();
331
332        for word in text.split_whitespace() {
333            let clean_word = word
334                .to_lowercase()
335                .chars()
336                .filter(|c| c.is_alphanumeric())
337                .collect::<String>();
338
339            if clean_word.len() >= 2 && !self.stop_words.contains(&clean_word) {
340                *word_freq.entry(clean_word).or_insert(0) += 1;
341            }
342        }
343
344        // Sort by frequency and take top keywords
345        let mut keywords: Vec<(String, usize)> = word_freq.into_iter().collect();
346        keywords.sort_by(|a, b| b.1.cmp(&a.1));
347
348        keywords
349            .into_iter()
350            .take(10)
351            .map(|(word, _)| word)
352            .collect()
353    }
354
355    /// Check if a word is a stop word
356    fn is_stop_word(&self, word: &str) -> bool {
357        self.stop_words.contains(word)
358    }
359
360    fn load_stop_words() -> HashSet<String> {
361        let words = vec![
362            "the",
363            "is",
364            "at",
365            "which",
366            "on",
367            "and",
368            "a",
369            "an",
370            "as",
371            "are",
372            "was",
373            "were",
374            "been",
375            "be",
376            "have",
377            "has",
378            "had",
379            "do",
380            "does",
381            "did",
382            "will",
383            "would",
384            "could",
385            "should",
386            "may",
387            "might",
388            "must",
389            "shall",
390            "can",
391            "need",
392            "dare",
393            "ought",
394            "used",
395            "to",
396            "of",
397            "in",
398            "for",
399            "with",
400            "by",
401            "from",
402            "about",
403            "into",
404            "through",
405            "during",
406            "before",
407            "after",
408            "above",
409            "below",
410            "up",
411            "down",
412            "out",
413            "off",
414            "over",
415            "under",
416            "again",
417            "further",
418            "then",
419            "once",
420            "there",
421            "these",
422            "those",
423            "this",
424            "that",
425            "it",
426            "its",
427            "what",
428            "which",
429            "who",
430            "whom",
431            "whose",
432            "where",
433            "when",
434            "why",
435            "how",
436            "all",
437            "both",
438            "each",
439            "few",
440            "more",
441            "most",
442            "other",
443            "some",
444            "such",
445            "no",
446            "nor",
447            "not",
448            "only",
449            "own",
450            "same",
451            "so",
452            "than",
453            "too",
454            "very",
455            "just",
456            "but",
457            "or",
458            "if",
459            // Pronouns and possessives
460            "i",
461            "you",
462            "he",
463            "she",
464            "we",
465            "they",
466            "me",
467            "him",
468            "her",
469            "us",
470            "them",
471            "my",
472            "your",
473            "his",
474            "our",
475            "their",
476            // Common adverbs and adjectives that aren't meaningful entities
477            "also",
478            "always",
479            "never",
480            "often",
481            "sometimes",
482            "usually",
483            "really",
484            "actually",
485            "basically",
486            "currently",
487            "recently",
488            "already",
489            "still",
490            "yet",
491            "now",
492            "here",
493            "carefully",
494            "quickly",
495            "easily",
496            "well",
497            "much",
498            "many",
499            "new",
500            "old",
501            "good",
502            "great",
503            "best",
504            "like",
505            "even",
506            "also",
507            "get",
508            "got",
509            "set",
510            "use",
511            "using",
512            "used",
513            "make",
514            "made",
515            "being",
516        ];
517
518        words.into_iter().map(String::from).collect()
519    }
520}
521
/// Compression statistics
///
/// Aggregate counters for compression activity. NOTE(review): nothing in
/// this module updates these — presumably maintained by the storage layer
/// that drives the pipeline; confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressionStats {
    /// Number of memories that have been compressed.
    pub total_compressed: usize,
    /// Sum of pre-compression sizes in bytes.
    pub total_original_size: usize,
    /// Sum of post-compression sizes in bytes.
    pub total_compressed_size: usize,
    /// Mean compressed/original ratio; 1.0 means no savings yet.
    pub average_compression_ratio: f32,
    /// Count of compressions per strategy name (e.g. "lz4").
    pub strategies_used: HashMap<String, usize>,
}
531
532impl Default for CompressionStats {
533    fn default() -> Self {
534        Self {
535            total_compressed: 0,
536            total_original_size: 0,
537            total_compressed_size: 0,
538            average_compression_ratio: 1.0,
539            strategies_used: HashMap::new(),
540        }
541    }
542}
543
544// ============================================================================
545// SEMANTIC CONSOLIDATION - Extract durable facts from episodic memories
546// ============================================================================
547
/// A semantic fact extracted from episodic memories
///
/// As memories age, specific episodes ("yesterday I debugged the auth module")
/// consolidate into semantic knowledge ("the auth module uses JWT tokens").
/// This mimics how human memory transitions from episodic to semantic.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticFact {
    /// Unique identifier (UUID v4 string)
    pub id: String,
    /// The factual statement (verbatim representative text from a cluster)
    pub fact: String,
    /// Confidence in this fact (0.0 - 1.0; capped at 1.0 on creation)
    pub confidence: f32,
    /// How many episodic memories support this fact
    pub support_count: usize,
    /// Source memory IDs that contributed to this fact
    pub source_memories: Vec<MemoryId>,
    /// Keywords/entities this fact relates to
    pub related_entities: Vec<String>,
    /// When this fact was first extracted
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// When this fact was last reinforced
    pub last_reinforced: chrono::DateTime<chrono::Utc>,
    /// Category of fact (preference, capability, relationship, procedure)
    pub fact_type: FactType,
}
574
/// Types of semantic facts
///
/// `Pattern` is the fallback category (see the `Default` impl below).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum FactType {
    /// User preference: "prefers concise code"
    Preference,
    /// System capability: "can handle 10k requests/sec"
    Capability,
    /// Relationship: "auth module depends on JWT library"
    Relationship,
    /// Procedure: "to deploy, run cargo build --release"
    Procedure,
    /// Definition: "MemoryId is a UUID wrapper"
    Definition,
    /// Pattern: "errors often occur after deployment"
    Pattern,
}
591
592impl Default for FactType {
593    fn default() -> Self {
594        Self::Pattern
595    }
596}
597
/// Result of consolidation operation
#[derive(Debug, Clone, Default)]
pub struct ConsolidationResult {
    /// Number of memories processed (all inputs, including age-ineligible ones)
    pub memories_processed: usize,
    /// Number of new facts extracted (equals `new_facts.len()`)
    pub facts_extracted: usize,
    /// Number of existing facts reinforced
    /// NOTE(review): never incremented in this module — presumably updated by
    /// the caller that merges facts into storage; confirm.
    pub facts_reinforced: usize,
    /// IDs of newly created facts
    pub new_fact_ids: Vec<String>,
    /// Newly extracted semantic facts (ready for storage)
    pub new_facts: Vec<SemanticFact>,
}
612
/// Semantic consolidation engine
///
/// Extracts durable semantic facts from episodic memories using:
/// - Multi-extractor pipeline: all extractors run on every memory
/// - Stemmed-token Jaccard clustering: groups semantically similar patterns
/// - Sentence-level extraction: facts are real content, not synthetic strings
pub struct SemanticConsolidator {
    /// Shared keyword/stop-word logic (also used for entity filtering)
    keyword_extractor: KeywordExtractor,
    /// Minimum times a pattern must appear to become a fact
    min_support: usize,
    /// Minimum age in days before consolidation
    min_age_days: i64,
    /// English stemmer used to normalize tokens before Jaccard clustering
    stemmer: Stemmer,
}
627
/// A cluster of semantically similar pattern candidates
///
/// Built greedily by `group_candidates_by_similarity`; the stem set widens
/// each time a new member merges in.
struct PatternCluster {
    /// Stemmed token set representing this cluster (union of all members)
    stem_set: HashSet<String>,
    /// All candidate entries: (pattern_text, memory_id, confidence)
    members: Vec<(String, MemoryId, f32)>,
}
635
636impl Default for SemanticConsolidator {
637    fn default() -> Self {
638        Self::new()
639    }
640}
641
642impl SemanticConsolidator {
643    pub fn new() -> Self {
644        Self {
645            keyword_extractor: KeywordExtractor::new(),
646            min_support: CONSOLIDATION_MIN_SUPPORT,
647            min_age_days: CONSOLIDATION_MIN_AGE_DAYS,
648            stemmer: Stemmer::create(Algorithm::English),
649        }
650    }
651
652    /// Create with custom thresholds
653    pub fn with_thresholds(min_support: usize, min_age_days: i64) -> Self {
654        Self {
655            keyword_extractor: KeywordExtractor::new(),
656            min_support,
657            min_age_days,
658            stemmer: Stemmer::create(Algorithm::English),
659        }
660    }
661
    /// Extract semantic facts from a set of memories
    ///
    /// Pipeline:
    /// 1. Filter memories by age threshold
    /// 2. Run multi-extractor on each eligible memory
    /// 3. Cluster candidates by stemmed-token Jaccard similarity
    /// 4. Convert qualifying clusters (>= min_support) into facts
    ///
    /// `memories_processed` counts all inputs, including ones skipped by the
    /// age filter. `facts_reinforced` is never touched here.
    pub fn consolidate(&self, memories: &[Memory]) -> ConsolidationResult {
        let mut result = ConsolidationResult {
            memories_processed: memories.len(),
            ..Default::default()
        };

        if memories.is_empty() {
            return result;
        }

        // Only consolidate memories old enough to have "settled".
        let now = chrono::Utc::now();
        let eligible: Vec<&Memory> = memories
            .iter()
            .filter(|m| (now - m.created_at).num_days() >= self.min_age_days)
            .collect();

        if eligible.is_empty() {
            return result;
        }

        // Phase 1: Extract candidates using multi-extractor pipeline
        let mut all_candidates: Vec<(String, MemoryId, f32)> = Vec::new();
        for memory in &eligible {
            let extracted = self.extract_fact_candidates(memory);
            for (pattern, confidence) in extracted {
                all_candidates.push((pattern, memory.id.clone(), confidence));
            }
        }

        if all_candidates.is_empty() {
            return result;
        }

        // Phase 2: Group by stemmed-token Jaccard similarity
        let clusters =
            self.group_candidates_by_similarity(&all_candidates, CONSOLIDATION_JACCARD_THRESHOLD);

        // Phase 3: Convert qualifying clusters to facts
        for cluster in clusters {
            if cluster.members.len() >= self.min_support {
                // Representative = highest-confidence member; the fact's
                // confidence is the cluster average, capped at 1.0 (extractor
                // multipliers can push individual scores above 1.0).
                let representative = Self::select_representative(&cluster.members);
                let avg_confidence = cluster.members.iter().map(|(_, _, c)| c).sum::<f32>()
                    / cluster.members.len() as f32;

                let source_ids: Vec<MemoryId> = cluster
                    .members
                    .iter()
                    .map(|(_, id, _)| id.clone())
                    .collect();
                let entities = self.keyword_extractor.extract(representative);
                let fact_type = self.classify_fact(representative);

                let fact = SemanticFact {
                    id: uuid::Uuid::new_v4().to_string(),
                    fact: representative.to_string(),
                    confidence: avg_confidence.min(1.0),
                    support_count: cluster.members.len(),
                    source_memories: source_ids,
                    related_entities: entities,
                    created_at: now,
                    last_reinforced: now,
                    fact_type,
                };

                result.new_fact_ids.push(fact.id.clone());
                result.new_facts.push(fact);
                result.facts_extracted += 1;
            }
        }

        result
    }
741
742    // ── Clustering ──────────────────────────────────────────────────────────
743
744    /// Tokenize text into stemmed tokens, removing stop words and punctuation
745    fn stemmed_tokens(&self, text: &str) -> HashSet<String> {
746        text.split_whitespace()
747            .map(|w| {
748                w.to_lowercase()
749                    .chars()
750                    .filter(|c| c.is_alphanumeric())
751                    .collect::<String>()
752            })
753            .filter(|w| w.len() >= 2 && !self.keyword_extractor.is_stop_word(w))
754            .map(|w| self.stemmer.stem(&w).to_string())
755            .collect()
756    }
757
758    /// Jaccard similarity between two token sets: |A ∩ B| / |A ∪ B|
759    fn jaccard_similarity(a: &HashSet<String>, b: &HashSet<String>) -> f32 {
760        if a.is_empty() && b.is_empty() {
761            return 0.0;
762        }
763        let intersection = a.intersection(b).count();
764        let union = a.union(b).count();
765        if union == 0 {
766            0.0
767        } else {
768            intersection as f32 / union as f32
769        }
770    }
771
772    /// Group candidates into clusters using greedy single-pass Jaccard matching
773    fn group_candidates_by_similarity(
774        &self,
775        candidates: &[(String, MemoryId, f32)],
776        threshold: f32,
777    ) -> Vec<PatternCluster> {
778        let mut clusters: Vec<PatternCluster> = Vec::new();
779
780        for (pattern, memory_id, confidence) in candidates {
781            let tokens = self.stemmed_tokens(pattern);
782            if tokens.is_empty() {
783                continue;
784            }
785
786            // Find best matching cluster
787            let mut best_idx = None;
788            let mut best_sim = 0.0f32;
789            for (i, cluster) in clusters.iter().enumerate() {
790                let sim = Self::jaccard_similarity(&tokens, &cluster.stem_set);
791                if sim > best_sim {
792                    best_sim = sim;
793                    best_idx = Some(i);
794                }
795            }
796
797            if best_sim >= threshold {
798                // Merge into existing cluster — expand the stem set
799                let idx = best_idx.unwrap();
800                clusters[idx].stem_set = clusters[idx].stem_set.union(&tokens).cloned().collect();
801                clusters[idx]
802                    .members
803                    .push((pattern.clone(), memory_id.clone(), *confidence));
804            } else {
805                // Create new cluster
806                clusters.push(PatternCluster {
807                    stem_set: tokens,
808                    members: vec![(pattern.clone(), memory_id.clone(), *confidence)],
809                });
810            }
811        }
812
813        clusters
814    }
815
816    /// Select the representative fact text from a cluster (highest confidence)
817    fn select_representative(members: &[(String, MemoryId, f32)]) -> &str {
818        members
819            .iter()
820            .max_by(|a, b| a.2.total_cmp(&b.2))
821            .map(|(text, _, _)| text.as_str())
822            .unwrap_or("")
823    }
824
825    // ── Multi-Extractor Pipeline ────────────────────────────────────────────
826
    /// Extract fact candidates from a single memory using all applicable extractors
    ///
    /// Each extractor contributes at most one candidate scored as
    /// `importance * multiplier`, where the multiplier boosts extractors that
    /// match the memory's experience type. Entity-pair relationships are
    /// appended after per-memory dedup/capping.
    fn extract_fact_candidates(&self, memory: &Memory) -> Vec<(String, f32)> {
        let mut candidates = Vec::new();
        let content = &memory.experience.content;
        let importance = memory.importance();
        let exp_type = &memory.experience.experience_type;

        // Run all extractors with type-based confidence multipliers
        if let Some(fact) = self.extract_procedure(content) {
            let mult = if *exp_type == ExperienceType::Decision {
                1.0
            } else {
                0.7
            };
            candidates.push((fact, importance * mult));
        }

        if let Some(fact) = self.extract_definition(content) {
            // Definitions from learning/discovery memories get boosted above 1.0.
            let mult = if *exp_type == ExperienceType::Learning
                || *exp_type == ExperienceType::Discovery
            {
                1.2
            } else {
                0.8
            };
            candidates.push((fact, importance * mult));
        }

        if let Some(fact) = self.extract_pattern(content) {
            let mult = if *exp_type == ExperienceType::Error {
                1.1
            } else {
                0.7
            };
            candidates.push((fact, importance * mult));
        }

        if let Some(fact) = self.extract_preference(content) {
            let mult = if *exp_type == ExperienceType::Conversation {
                1.0
            } else {
                0.6
            };
            candidates.push((fact, importance * mult));
        }

        // Universal fallback: extract most salient sentence
        if let Some(fact) = self.extract_salient_statement(content, &memory.experience.entities) {
            candidates.push((fact, importance * 0.6));
        }

        // Deduplicate overlapping extractions from the same memory
        candidates = self.dedup_within_memory(candidates);

        // Cap per-memory candidates
        // NOTE(review): the cap is applied before entity-pair relationships
        // are appended below, so the final list can exceed
        // CONSOLIDATION_MAX_CANDIDATES_PER_MEMORY — confirm this is intended.
        candidates.truncate(CONSOLIDATION_MAX_CANDIDATES_PER_MEMORY);

        // Entity pair relationships (sorted for determinism)
        // Filter: min 3 chars, no stop words, remove substring-redundant entities
        if memory.experience.entities.len() >= 2 {
            let mut sorted_entities: Vec<String> = memory
                .experience
                .entities
                .iter()
                .map(|e| e.to_lowercase())
                .filter(|e| e.len() >= 3 && !self.keyword_extractor.is_stop_word(e))
                .collect();
            sorted_entities.sort();
            sorted_entities.dedup();

            // Remove entities that are substrings of another entity in the set
            // e.g. "chose" is redundant when "chose rust" exists
            let before_dedup = sorted_entities.clone();
            sorted_entities.retain(|entity| {
                !before_dedup
                    .iter()
                    .any(|other| other != entity && other.contains(entity.as_str()))
            });

            // Pair up at most the first 4 surviving entities (keeps the
            // quadratic pairing bounded).
            let max_entities = sorted_entities.len().min(4);
            for i in 0..max_entities {
                for j in (i + 1)..max_entities {
                    let relationship =
                        format!("{} relates to {}", sorted_entities[i], sorted_entities[j]);
                    candidates.push((relationship, importance * 0.8));
                }
            }
        }

        candidates
    }
918
919    /// Remove overlapping extractions from the same memory (keep highest confidence)
920    fn dedup_within_memory(&self, mut candidates: Vec<(String, f32)>) -> Vec<(String, f32)> {
921        if candidates.len() <= 1 {
922            return candidates;
923        }
924
925        // Sort by confidence descending so we keep higher-confidence versions
926        candidates.sort_by(|a, b| b.1.total_cmp(&a.1));
927
928        let mut kept: Vec<(String, f32, HashSet<String>)> = Vec::new();
929        for (text, conf) in candidates {
930            let tokens = self.stemmed_tokens(&text);
931            let overlaps = kept
932                .iter()
933                .any(|(_, _, t)| Self::jaccard_similarity(&tokens, t) > 0.8);
934            if !overlaps {
935                kept.push((text, conf, tokens));
936            }
937        }
938
939        kept.into_iter()
940            .map(|(text, conf, _)| (text, conf))
941            .collect()
942    }
943
944    // ── Individual Extractors ───────────────────────────────────────────────
945
946    /// Extract a procedure from content (looks for action words)
947    fn extract_procedure(&self, content: &str) -> Option<String> {
948        // Use to_ascii_lowercase() to preserve byte alignment with `content`.
949        // to_lowercase() can change byte lengths for non-ASCII chars (e.g. İ→i̇),
950        // making byte offsets from `lower.find()` invalid for indexing into `content`.
951        let lower = content.to_ascii_lowercase();
952        let action_markers = [
953            "to ",
954            "run ",
955            "execute ",
956            "use ",
957            "call ",
958            "invoke ",
959            "create ",
960            "build ",
961            "deploy ",
962            "install ",
963            "configure ",
964            "start ",
965            "stop ",
966            "restart ",
967            "set up ",
968            "update ",
969            "remove ",
970            "delete ",
971            "add ",
972            "import ",
973            "export ",
974            "migrate ",
975        ];
976
977        for marker in action_markers {
978            if let Some(pos) = lower.find(marker) {
979                if let Some(sentence) = Self::extract_sentence(content, pos) {
980                    return Some(sentence);
981                }
982            }
983        }
984        None
985    }
986
987    /// Extract a definition from content
988    fn extract_definition(&self, content: &str) -> Option<String> {
989        // Use to_ascii_lowercase() to preserve byte alignment with `content`.
990        let lower = content.to_ascii_lowercase();
991        let def_markers = [
992            " is ",
993            " are ",
994            " means ",
995            " refers to ",
996            " represents ",
997            " denotes ",
998            " describes ",
999            " consists of ",
1000            " defined as ",
1001            " known as ",
1002            " stands for ",
1003            " equivalent to ",
1004        ];
1005
1006        for marker in def_markers {
1007            if let Some(pos) = lower.find(marker) {
1008                // Extract subject and definition
1009                let subject_start =
1010                    content[..pos].rfind(|c: char| !c.is_alphanumeric() && c != '_');
1011                let subject_start = subject_start.map(|i| i + 1).unwrap_or(0);
1012                let subject = &content[subject_start..pos];
1013
1014                if subject.len() >= 2 {
1015                    let def_end = content[pos + marker.len()..]
1016                        .find(|c| c == '.' || c == '!' || c == '?' || c == ',');
1017                    let def_end = def_end
1018                        .map(|i| pos + marker.len() + i)
1019                        .unwrap_or(content.len().min(pos + marker.len() + 100));
1020
1021                    let definition = &content[pos + marker.len()..def_end];
1022                    if definition.len() > 5 {
1023                        return Some(format!("{}{}{}", subject, marker, definition.trim()));
1024                    }
1025                }
1026            }
1027        }
1028        None
1029    }
1030
1031    /// Extract a pattern from error content (returns the actual sentence)
1032    fn extract_pattern(&self, content: &str) -> Option<String> {
1033        // Use to_ascii_lowercase() to preserve byte alignment with `content`.
1034        let lower = content.to_ascii_lowercase();
1035        let pattern_markers = [
1036            "error",
1037            "failed",
1038            "crash",
1039            "bug",
1040            "issue",
1041            "problem",
1042            "exception",
1043            "warning",
1044            "panic",
1045            "timeout",
1046            "overflow",
1047            "deadlock",
1048            "leak",
1049            "corrupt",
1050        ];
1051
1052        for marker in pattern_markers {
1053            if let Some(pos) = lower.find(marker) {
1054                if let Some(sentence) = Self::extract_sentence(content, pos) {
1055                    return Some(sentence);
1056                }
1057            }
1058        }
1059        None
1060    }
1061
1062    /// Extract a preference from conversation content (returns the actual sentence)
1063    fn extract_preference(&self, content: &str) -> Option<String> {
1064        // Use to_ascii_lowercase() to preserve byte alignment with `content`.
1065        let lower = content.to_ascii_lowercase();
1066        let pref_markers = [
1067            "prefer",
1068            "like",
1069            "want",
1070            "better",
1071            "should",
1072            "always",
1073            "never",
1074            "dislike",
1075            "avoid",
1076            "recommend",
1077            "favorite",
1078            "rather",
1079            "instead of",
1080            "opt for",
1081        ];
1082
1083        for marker in pref_markers {
1084            if let Some(pos) = lower.find(marker) {
1085                if let Some(sentence) = Self::extract_sentence(content, pos) {
1086                    return Some(sentence);
1087                }
1088            }
1089        }
1090        None
1091    }
1092
1093    /// Extract the most information-dense sentence from content
1094    fn extract_salient_statement(&self, content: &str, entities: &[String]) -> Option<String> {
1095        let sentences = Self::split_sentences(content);
1096        let entity_lower: Vec<String> = entities.iter().map(|e| e.to_lowercase()).collect();
1097
1098        let mut best: Option<(String, f32)> = None;
1099
1100        for sentence in sentences {
1101            let trimmed = sentence.trim();
1102            if trimmed.len() < 20 || trimmed.len() > 200 {
1103                continue;
1104            }
1105
1106            let lower = trimmed.to_lowercase();
1107
1108            // Score: count of non-stop-words (content density)
1109            let content_words: usize = lower
1110                .split_whitespace()
1111                .map(|w| {
1112                    w.chars()
1113                        .filter(|c| c.is_alphanumeric())
1114                        .collect::<String>()
1115                })
1116                .filter(|w| !w.is_empty() && !self.keyword_extractor.is_stop_word(w))
1117                .count();
1118
1119            // Require at least 3 content words for a meaningful statement
1120            if content_words < 3 {
1121                continue;
1122            }
1123
1124            // Bonus for entity mentions
1125            let entity_bonus: f32 = entity_lower
1126                .iter()
1127                .filter(|e| lower.contains(e.as_str()))
1128                .count() as f32
1129                * 2.0;
1130
1131            let score = content_words as f32 + entity_bonus;
1132
1133            if best.as_ref().map_or(true, |(_, s)| score > *s) {
1134                best = Some((trimmed.to_string(), score));
1135            }
1136        }
1137
1138        best.map(|(s, _)| s)
1139    }
1140
1141    // ── Helpers ─────────────────────────────────────────────────────────────
1142
1143    /// Extract the sentence containing a character position
1144    fn extract_sentence(content: &str, pos: usize) -> Option<String> {
1145        let start = content[..pos].rfind(|c| c == '.' || c == '!' || c == '?');
1146        let start = start.map(|i| i + 1).unwrap_or(0);
1147
1148        let end = content[pos..].find(|c| c == '.' || c == '!' || c == '?');
1149        let end = end.map(|i| pos + i).unwrap_or(content.len());
1150
1151        let sentence = content[start..end].trim();
1152        if sentence.len() >= 20 && sentence.len() < 200 {
1153            Some(sentence.to_string())
1154        } else {
1155            None
1156        }
1157    }
1158
1159    /// Split content into sentences by sentence-ending punctuation
1160    fn split_sentences(content: &str) -> Vec<&str> {
1161        let mut sentences = Vec::new();
1162        let mut start = 0;
1163
1164        for (i, c) in content.char_indices() {
1165            if c == '.' || c == '!' || c == '?' {
1166                let sentence = &content[start..i];
1167                if !sentence.trim().is_empty() {
1168                    sentences.push(sentence.trim());
1169                }
1170                start = i + c.len_utf8();
1171            }
1172        }
1173
1174        // Trailing content without sentence-ending punctuation
1175        let remaining = content[start..].trim();
1176        if !remaining.is_empty() {
1177            sentences.push(remaining);
1178        }
1179
1180        sentences
1181    }
1182
1183    /// Classify what type of fact this is
1184    fn classify_fact(&self, pattern: &str) -> FactType {
1185        let lower = pattern.to_lowercase();
1186
1187        if lower.contains("prefer")
1188            || lower.contains("like")
1189            || lower.contains("always")
1190            || lower.contains("never")
1191            || lower.contains("favorite")
1192        {
1193            FactType::Preference
1194        } else if lower.contains("can ") || lower.contains("able to") || lower.contains("supports")
1195        {
1196            FactType::Capability
1197        } else if lower.contains("relates to")
1198            || lower.contains("depends on")
1199            || lower.contains("connects")
1200        {
1201            FactType::Relationship
1202        } else if lower.contains("to ")
1203            || lower.contains("run ")
1204            || lower.contains("execute")
1205            || lower.contains("deploy")
1206        {
1207            FactType::Procedure
1208        } else if lower.contains(" is ") || lower.contains(" are ") || lower.contains("means") {
1209            FactType::Definition
1210        } else {
1211            FactType::Pattern
1212        }
1213    }
1214
1215    /// Reinforce an existing fact with new evidence
1216    ///
1217    /// Called when a memory matches an existing fact, strengthening confidence.
1218    pub fn reinforce_fact(&self, fact: &mut SemanticFact, memory: &Memory) {
1219        fact.support_count += 1;
1220        fact.last_reinforced = chrono::Utc::now();
1221
1222        // Increase confidence with diminishing returns
1223        let boost = 0.1 * (1.0 - fact.confidence);
1224        fact.confidence = (fact.confidence + boost).min(1.0);
1225
1226        // Add source if not already present
1227        if !fact.source_memories.contains(&memory.id) {
1228            fact.source_memories.push(memory.id.clone());
1229        }
1230
1231        // Add any new entities (filter noise: short tokens, stop words)
1232        for entity in &memory.experience.entities {
1233            let lower = entity.to_lowercase();
1234            if lower.len() >= 3
1235                && !self.keyword_extractor.is_stop_word(&lower)
1236                && !fact.related_entities.contains(entity)
1237            {
1238                fact.related_entities.push(entity.clone());
1239            }
1240        }
1241    }
1242
1243    /// Check if a fact would decay below deletion threshold (0.1 confidence)
1244    ///
1245    /// Uses exponential half-life model matching `decay_facts_for_all_users()`:
1246    /// 90-day grace period, then half-life = 180 + (30 × support_count) days.
1247    pub fn should_decay_fact(&self, fact: &SemanticFact) -> bool {
1248        use crate::constants::{
1249            FACT_DECAY_GRACE_DAYS, FACT_DECAY_HALF_LIFE_BASE_DAYS,
1250            FACT_DECAY_HALF_LIFE_PER_SUPPORT_DAYS,
1251        };
1252        let days_since = (chrono::Utc::now() - fact.last_reinforced).num_days();
1253        if days_since <= FACT_DECAY_GRACE_DAYS {
1254            return false;
1255        }
1256        let elapsed = (days_since - FACT_DECAY_GRACE_DAYS) as f64;
1257        let half_life = FACT_DECAY_HALF_LIFE_BASE_DAYS
1258            + (fact.support_count as f64 * FACT_DECAY_HALF_LIFE_PER_SUPPORT_DAYS);
1259        let projected = fact.confidence * (0.5_f64).powf(elapsed / half_life) as f32;
1260        projected < 0.1
1261    }
1262}
1263
#[cfg(test)]
mod tests {
    use super::*;
    use uuid::Uuid;

    /// Build a 60-day-old `Observation` memory with a single "test" entity —
    /// old enough to be eligible for compression and consolidation.
    fn create_test_memory(content: &str, importance: f32) -> Memory {
        let experience = Experience {
            content: content.to_string(),
            experience_type: ExperienceType::Observation,
            entities: vec!["test".to_string()],
            ..Default::default()
        };

        let created_at = Some(chrono::Utc::now() - chrono::Duration::days(60));

        Memory::new(
            MemoryId(Uuid::new_v4()),
            experience,
            importance,
            None, // agent_id
            None, // run_id
            None, // actor_id
            created_at,
        )
    }

    #[test]
    fn test_compression_pipeline_default() {
        let pipeline = CompressionPipeline::default();
        assert!(pipeline.keyword_extractor.stop_words.contains("the"));
    }

    // Round trip: lz4 must be lossless and tag its strategy in metadata.
    #[test]
    fn test_lz4_compress_decompress() {
        let pipeline = CompressionPipeline::new();
        let memory = create_test_memory("This is a test memory content for compression", 0.9);

        let compressed = pipeline.compress(&memory).unwrap();
        assert!(compressed.compressed);
        assert_eq!(
            compressed
                .experience
                .metadata
                .get("compression_strategy")
                .unwrap(),
            "lz4"
        );

        let decompressed = pipeline.decompress(&compressed).unwrap();
        assert!(!decompressed.compressed);
        assert_eq!(decompressed.experience.content, memory.experience.content);
    }

    // Compressing an already-compressed memory must be a no-op, not an error.
    #[test]
    fn test_already_compressed_memory() {
        let pipeline = CompressionPipeline::new();
        let mut memory = create_test_memory("Test content", 0.9);
        memory.compressed = true;

        let result = pipeline.compress(&memory).unwrap();
        assert!(result.compressed);
    }

    // Semantic compression is lossy by design: decompress must refuse.
    #[test]
    fn test_semantic_compression_lossy() {
        let pipeline = CompressionPipeline::new();
        let mut memory = create_test_memory(
            "This is a long test memory with many words for semantic compression testing purposes",
            0.1,
        );
        memory.created_at = chrono::Utc::now() - chrono::Duration::days(100);

        let compressed = pipeline.compress_semantic(&memory).unwrap();
        assert!(compressed.compressed);
        assert!(compressed.experience.metadata.contains_key("keywords"));

        let result = pipeline.decompress(&compressed);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("lossy"));
    }

    #[test]
    fn test_is_lossless() {
        let pipeline = CompressionPipeline::new();
        let memory = create_test_memory("Test content", 0.9);

        assert!(pipeline.is_lossless(&memory));

        let compressed_lz4 = pipeline.compress_lz4(&memory).unwrap();
        assert!(pipeline.is_lossless(&compressed_lz4));

        let compressed_semantic = pipeline.compress_semantic(&memory).unwrap();
        assert!(!pipeline.is_lossless(&compressed_semantic));
    }

    #[test]
    fn test_get_strategy() {
        let pipeline = CompressionPipeline::new();
        let memory = create_test_memory("Test content", 0.9);

        // Uncompressed memories report no strategy.
        assert!(pipeline.get_strategy(&memory).is_none());

        let compressed = pipeline.compress_lz4(&memory).unwrap();
        assert_eq!(pipeline.get_strategy(&compressed), Some("lz4"));
    }

    #[test]
    fn test_keyword_extraction() {
        let extractor = KeywordExtractor::new();
        let text = "Rust programming language memory management ownership borrowing";
        let keywords = extractor.extract(text);

        assert!(!keywords.is_empty());
        assert!(keywords.contains(&"rust".to_string()));
        assert!(keywords.contains(&"memory".to_string()));
        assert!(!keywords.contains(&"the".to_string()));
    }

    // Input consisting only of stop words yields no keywords at all.
    #[test]
    fn test_stop_words_filtered() {
        let extractor = KeywordExtractor::new();
        let text = "the is at which on and a an as are was were";
        let keywords = extractor.extract(text);

        assert!(keywords.is_empty());
    }

    #[test]
    fn test_semantic_consolidator_empty() {
        let consolidator = SemanticConsolidator::new();
        let result = consolidator.consolidate(&[]);

        assert_eq!(result.memories_processed, 0);
        assert_eq!(result.facts_extracted, 0);
    }

    #[test]
    fn test_semantic_consolidator_with_thresholds() {
        let consolidator = SemanticConsolidator::with_thresholds(2, 7);
        assert_eq!(consolidator.min_support, 2);
        assert_eq!(consolidator.min_age_days, 7);
    }

    // One representative input per FactType branch of classify_fact.
    #[test]
    fn test_fact_type_classification() {
        let consolidator = SemanticConsolidator::new();

        assert_eq!(
            consolidator.classify_fact("preference: concise code"),
            FactType::Preference
        );
        assert_eq!(
            consolidator.classify_fact("system can handle 10k requests"),
            FactType::Capability
        );
        assert_eq!(
            consolidator.classify_fact("auth relates to jwt"),
            FactType::Relationship
        );
        assert_eq!(
            consolidator.classify_fact("to deploy, run cargo build"),
            FactType::Procedure
        );
        assert_eq!(
            consolidator.classify_fact("MemoryId is a UUID wrapper"),
            FactType::Definition
        );
    }

    // Reinforcement must bump confidence, support count, and provenance.
    #[test]
    fn test_reinforce_fact() {
        let consolidator = SemanticConsolidator::new();
        let mut fact = SemanticFact {
            id: "test-fact".to_string(),
            fact: "test fact content".to_string(),
            confidence: 0.5,
            support_count: 1,
            source_memories: vec![],
            related_entities: vec![],
            created_at: chrono::Utc::now(),
            last_reinforced: chrono::Utc::now() - chrono::Duration::days(10),
            fact_type: FactType::Pattern,
        };
        let memory = create_test_memory("reinforcing memory", 0.7);

        let old_confidence = fact.confidence;
        consolidator.reinforce_fact(&mut fact, &memory);

        assert!(fact.confidence > old_confidence);
        assert_eq!(fact.support_count, 2);
        assert!(fact.source_memories.contains(&memory.id));
    }

    // Recently reinforced facts survive; a low-confidence fact idle past the
    // grace period decays below the deletion threshold.
    #[test]
    fn test_fact_decay_threshold() {
        let consolidator = SemanticConsolidator::new();

        let recent_fact = SemanticFact {
            id: "recent".to_string(),
            fact: "recent fact".to_string(),
            confidence: 0.8,
            support_count: 5,
            source_memories: vec![],
            related_entities: vec![],
            created_at: chrono::Utc::now(),
            last_reinforced: chrono::Utc::now(),
            fact_type: FactType::Pattern,
        };
        assert!(!consolidator.should_decay_fact(&recent_fact));

        let old_fact = SemanticFact {
            id: "old".to_string(),
            fact: "old fact".to_string(),
            confidence: 0.1,
            support_count: 1,
            source_memories: vec![],
            related_entities: vec![],
            created_at: chrono::Utc::now() - chrono::Duration::days(365),
            last_reinforced: chrono::Utc::now() - chrono::Duration::days(100),
            fact_type: FactType::Pattern,
        };
        assert!(consolidator.should_decay_fact(&old_fact));
    }

    #[test]
    fn test_compression_stats_default() {
        let stats = CompressionStats::default();

        assert_eq!(stats.total_compressed, 0);
        assert_eq!(stats.average_compression_ratio, 1.0);
        assert!(stats.strategies_used.is_empty());
    }

    #[test]
    fn test_create_summary() {
        let pipeline = CompressionPipeline::new();
        let content = "This is a long piece of content that should be summarized into fewer words";
        let summary = pipeline.create_summary(content, 5);

        assert!(summary.ends_with("..."));
        assert!(summary.len() < content.len());
    }

    #[test]
    fn test_consolidation_result_default() {
        let result = ConsolidationResult::default();

        assert_eq!(result.memories_processed, 0);
        assert_eq!(result.facts_extracted, 0);
        assert!(result.new_facts.is_empty());
    }

    #[test]
    fn test_fact_type_default() {
        let fact_type = FactType::default();
        assert_eq!(fact_type, FactType::Pattern);
    }

    /// Like `create_test_memory`, but with caller-chosen experience type and
    /// entities, for exercising type-specific extraction multipliers.
    fn create_typed_memory(
        content: &str,
        importance: f32,
        exp_type: ExperienceType,
        entities: Vec<String>,
    ) -> Memory {
        let experience = Experience {
            content: content.to_string(),
            experience_type: exp_type,
            entities,
            ..Default::default()
        };

        let created_at = Some(chrono::Utc::now() - chrono::Duration::days(60));

        Memory::new(
            MemoryId(Uuid::new_v4()),
            experience,
            importance,
            None,
            None,
            None,
            created_at,
        )
    }

    #[test]
    fn test_jaccard_similarity_helper() {
        let a: HashSet<String> = ["rust", "memory", "safety"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let b: HashSet<String> = ["rust", "memory", "performance"]
            .iter()
            .map(|s| s.to_string())
            .collect();
        let c: HashSet<String> = ["python", "web", "django"]
            .iter()
            .map(|s| s.to_string())
            .collect();

        let ab = SemanticConsolidator::jaccard_similarity(&a, &b);
        assert!(ab > 0.4, "rust+memory overlap should give ~0.5, got {ab}");
        assert!(ab < 0.6);

        let ac = SemanticConsolidator::jaccard_similarity(&a, &c);
        assert!(ac < 0.01, "disjoint sets should give 0.0, got {ac}");

        let aa = SemanticConsolidator::jaccard_similarity(&a, &a);
        assert!((aa - 1.0).abs() < 0.001, "identical sets should give 1.0");

        // Degenerate case: two empty sets are defined as dissimilar, not NaN.
        let empty: HashSet<String> = HashSet::new();
        assert_eq!(
            SemanticConsolidator::jaccard_similarity(&empty, &empty),
            0.0
        );
    }

    #[test]
    fn test_stemmed_tokens_removes_stop_words() {
        let consolidator = SemanticConsolidator::new();
        let tokens = consolidator.stemmed_tokens("The Rust programming language is very fast");

        assert!(!tokens.is_empty());
        // "the", "is", "very" should be filtered as stop words
        assert!(!tokens.contains("the"));
        assert!(!tokens.contains("is"));
        assert!(!tokens.contains("very"));
        // "rust", "programming", "language", "fast" should survive (stemmed)
        assert!(tokens.contains("rust"));
        assert!(tokens.contains("fast"));
    }

    #[test]
    fn test_similarity_grouping_clusters_similar_patterns() {
        // min_support = 2, min_age_days = 0 so the fresh memories qualify.
        let consolidator = SemanticConsolidator::with_thresholds(2, 0);

        let m1 = create_test_memory(
            "Rust provides memory safety and performance guarantees",
            0.8,
        );
        let m2 = create_test_memory("Rust gives memory safety with great performance", 0.7);
        let m3 = create_test_memory("Python is great for data science and machine learning", 0.6);

        let result = consolidator.consolidate(&[m1, m2, m3]);

        // The two Rust/memory/safety memories should cluster and produce a fact
        assert!(
            result.facts_extracted >= 1,
            "Similar memories about Rust should cluster into at least 1 fact, got {}",
            result.facts_extracted
        );

        // Verify the fact contains Rust-related content
        let has_rust_fact = result
            .new_facts
            .iter()
            .any(|f| f.fact.to_lowercase().contains("rust"));
        assert!(has_rust_fact, "Should have a fact about Rust");
    }

    #[test]
    fn test_multi_extractor_produces_multiple_candidates() {
        let consolidator = SemanticConsolidator::new();

        // This memory has both a definition ("is") and an action word ("use")
        let memory = create_typed_memory(
            "RocksDB is a high-performance embedded database. We should use it for storage.",
            0.8,
            ExperienceType::Decision,
            vec!["RocksDB".to_string()],
        );

        let candidates = consolidator.extract_fact_candidates(&memory);

        // Should produce multiple candidates from different extractors
        assert!(
            candidates.len() >= 2,
            "Multi-extractor should produce >=2 candidates, got {}",
            candidates.len()
        );
    }

    #[test]
    fn test_generic_fallback_produces_real_sentence() {
        let consolidator = SemanticConsolidator::new();

        let memory = create_typed_memory(
            "The deployment pipeline uses Docker containers for isolation. Each service runs independently.",
            0.7,
            ExperienceType::Observation,
            vec!["Docker".to_string()],
        );

        let candidates = consolidator.extract_fact_candidates(&memory);

        // Should NOT produce "involves: ..." synthetic patterns
        let has_involves = candidates
            .iter()
            .any(|(text, _)| text.starts_with("involves:"));
        assert!(
            !has_involves,
            "Should not produce synthetic 'involves:' patterns"
        );

        // Should have at least one real sentence
        assert!(
            !candidates.is_empty(),
            "Should extract at least one candidate"
        );
    }

    #[test]
    fn test_entity_relationships_sorted_deterministic() {
        let consolidator = SemanticConsolidator::new();

        // Same entities, different insertion order.
        let m1 = create_typed_memory(
            "Testing entity ordering",
            0.7,
            ExperienceType::Observation,
            vec!["JWT".to_string(), "Auth".to_string(), "Token".to_string()],
        );

        let m2 = create_typed_memory(
            "Testing entity ordering",
            0.7,
            ExperienceType::Observation,
            vec!["Token".to_string(), "Auth".to_string(), "JWT".to_string()],
        );

        let c1 = consolidator.extract_fact_candidates(&m1);
        let c2 = consolidator.extract_fact_candidates(&m2);

        // Entity relationships should be identical regardless of input order
        let relations1: Vec<&str> = c1
            .iter()
            .filter(|(t, _)| t.contains("relates to"))
            .map(|(t, _)| t.as_str())
            .collect();
        let relations2: Vec<&str> = c2
            .iter()
            .filter(|(t, _)| t.contains("relates to"))
            .map(|(t, _)| t.as_str())
            .collect();

        assert_eq!(
            relations1, relations2,
            "Entity relationships should be deterministic regardless of input order"
        );
    }
}