1use anyhow::{anyhow, Result};
4use base64::{engine::general_purpose, Engine as _};
5use lz4;
6use rust_stemmers::{Algorithm, Stemmer};
7use serde::{Deserialize, Serialize};
8use std::collections::{HashMap, HashSet};
9
10use super::types::*;
11use crate::constants::{
12 COMPRESSION_ACCESS_THRESHOLD, COMPRESSION_AGE_DAYS, COMPRESSION_IMPORTANCE_HIGH,
13 COMPRESSION_IMPORTANCE_LOW, CONSOLIDATION_JACCARD_THRESHOLD,
14 CONSOLIDATION_MAX_CANDIDATES_PER_MEMORY, CONSOLIDATION_MIN_AGE_DAYS, CONSOLIDATION_MIN_SUPPORT,
15 MAX_COMPRESSION_RATIO, MAX_DECOMPRESSED_SIZE,
16};
17
/// How a memory's contents are (or are not) reduced before storage.
#[derive(Debug, Clone)]
pub enum CompressionStrategy {
    /// Leave the memory untouched.
    None,
    /// Lossless byte-level compression of the serialized experience.
    Lz4,
    /// Lossy reduction to a summary plus extracted keywords.
    Summarization,
    /// Summarization first, then LZ4 on the reduced memory.
    Hybrid,
}
26
/// Serialized snapshot of a compressed memory.
///
/// NOTE(review): not constructed anywhere in this file — presumably consumed
/// by a storage layer elsewhere; confirm before repurposing fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressedMemory {
    /// Identifier of the source memory.
    pub id: MemoryId,
    /// Lossy text summary of the original content.
    pub summary: String,
    /// Keywords extracted before compression.
    pub keywords: Vec<String>,
    /// Importance score carried over from the source memory.
    pub importance: f32,
    /// Creation time of the source memory.
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Compressed size divided by original size.
    pub compression_ratio: f32,
    /// Size in bytes before compression.
    pub original_size: usize,
    /// The compressed payload bytes.
    pub compressed_data: Vec<u8>,
    /// Name of the strategy that produced `compressed_data`.
    pub strategy: String,
}
40
/// Selects and applies a compression strategy per memory.
pub struct CompressionPipeline {
    // Used by the semantic (lossy) path to keep searchable keywords.
    keyword_extractor: KeywordExtractor,
}
45
46impl Default for CompressionPipeline {
47 fn default() -> Self {
48 Self::new()
49 }
50}
51
52impl CompressionPipeline {
53 pub fn new() -> Self {
54 Self {
55 keyword_extractor: KeywordExtractor::new(),
56 }
57 }
58
59 pub fn compress(&self, memory: &Memory) -> Result<Memory> {
61 if memory.compressed {
63 return Ok(memory.clone());
64 }
65
66 let strategy = self.select_strategy(memory);
67
68 match strategy {
69 CompressionStrategy::None => Ok(memory.clone()),
70 CompressionStrategy::Lz4 => self.compress_lz4(memory),
71 CompressionStrategy::Summarization => self.compress_semantic(memory),
72 CompressionStrategy::Hybrid => self.compress_hybrid(memory),
73 }
74 }
75
76 fn select_strategy(&self, memory: &Memory) -> CompressionStrategy {
78 if memory.importance() > COMPRESSION_IMPORTANCE_HIGH {
80 return CompressionStrategy::Lz4;
81 }
82
83 if memory.access_count() > COMPRESSION_ACCESS_THRESHOLD {
85 return CompressionStrategy::None;
86 }
87
88 let age = chrono::Utc::now() - memory.created_at;
90 if age.num_days() > COMPRESSION_AGE_DAYS && memory.importance() < COMPRESSION_IMPORTANCE_LOW
91 {
92 return CompressionStrategy::Summarization;
93 }
94
95 CompressionStrategy::Hybrid
97 }
98
99 fn compress_lz4(&self, memory: &Memory) -> Result<Memory> {
101 let original =
102 bincode::serde::encode_to_vec(&memory.experience, bincode::config::standard())?;
103 let compressed = lz4::block::compress(&original, None, false)?;
104
105 let compression_ratio = compressed.len() as f32 / original.len() as f32;
106
107 let mut compressed_memory = memory.clone();
109 compressed_memory.compressed = true;
110
111 let compressed_b64 = general_purpose::STANDARD.encode(&compressed);
113 compressed_memory
114 .experience
115 .metadata
116 .insert("compressed_data".to_string(), compressed_b64);
117 compressed_memory.experience.metadata.insert(
118 "compression_ratio".to_string(),
119 compression_ratio.to_string(),
120 );
121 compressed_memory
122 .experience
123 .metadata
124 .insert("compression_strategy".to_string(), "lz4".to_string());
125
126 Ok(compressed_memory)
127 }
128
129 fn compress_semantic(&self, memory: &Memory) -> Result<Memory> {
131 let mut compressed_memory = memory.clone();
132
133 let keywords = self.keyword_extractor.extract(&memory.experience.content);
135
136 let summary = self.create_summary(&memory.experience.content, 50);
138
139 compressed_memory.experience.content = summary;
141 compressed_memory
142 .experience
143 .metadata
144 .insert("keywords".to_string(), keywords.join(","));
145 compressed_memory
146 .experience
147 .metadata
148 .insert("compression_strategy".to_string(), "semantic".to_string());
149 compressed_memory.compressed = true;
150
151 Ok(compressed_memory)
152 }
153
154 fn compress_hybrid(&self, memory: &Memory) -> Result<Memory> {
156 let semantic = self.compress_semantic(memory)?;
158
159 self.compress_lz4(&semantic)
161 }
162
163 pub fn decompress(&self, memory: &Memory) -> Result<Memory> {
173 if !memory.compressed {
174 return Ok(memory.clone());
175 }
176
177 let strategy = memory
178 .experience
179 .metadata
180 .get("compression_strategy")
181 .map(|s| s.as_str())
182 .unwrap_or("unknown");
183
184 match strategy {
185 "lz4" => self.decompress_lz4(memory),
186 "semantic" => {
187 Err(anyhow!(
191 "Cannot decompress semantically compressed memory '{}': \
192 semantic compression is lossy. Original content was replaced with \
193 summary and keywords. Use memory.experience.content for the summary \
194 and metadata['keywords'] for extracted keywords.",
195 memory.id.0
196 ))
197 }
198 "hybrid" => {
199 let lz4_decompressed = self.decompress_lz4(memory)?;
202 Err(anyhow!(
204 "Cannot fully decompress hybrid-compressed memory '{}': \
205 underlying semantic compression is lossy. LZ4 layer decompressed, \
206 but original content is not recoverable.",
207 lz4_decompressed.id.0
208 ))
209 }
210 unknown => Err(anyhow!(
211 "Unknown compression strategy '{}' for memory '{}'. \
212 Cannot decompress.",
213 unknown,
214 memory.id.0
215 )),
216 }
217 }
218
219 pub fn is_lossless(&self, memory: &Memory) -> bool {
221 if !memory.compressed {
222 return true;
223 }
224 let strategy = memory
225 .experience
226 .metadata
227 .get("compression_strategy")
228 .map(|s| s.as_str())
229 .unwrap_or("unknown");
230 strategy == "lz4"
231 }
232
233 pub fn get_strategy<'a>(&self, memory: &'a Memory) -> Option<&'a str> {
235 if !memory.compressed {
236 return None;
237 }
238 memory
239 .experience
240 .metadata
241 .get("compression_strategy")
242 .map(|s| s.as_str())
243 }
244
245 fn decompress_lz4(&self, memory: &Memory) -> Result<Memory> {
247 if let Some(compressed_b64) = memory.experience.metadata.get("compressed_data") {
248 let compressed = general_purpose::STANDARD.decode(compressed_b64)?;
249
250 let compressed_size = compressed.len();
253 let max_expected_decompressed = compressed_size.saturating_mul(MAX_COMPRESSION_RATIO);
254
255 if max_expected_decompressed > MAX_DECOMPRESSED_SIZE as usize {
256 return Err(anyhow!(
259 "Suspicious compression ratio: compressed size {} bytes with max ratio {} \
260 would allow {} bytes decompressed, which exceeds limit of {} bytes. \
261 This may indicate a zip bomb attack.",
262 compressed_size,
263 MAX_COMPRESSION_RATIO,
264 max_expected_decompressed,
265 MAX_DECOMPRESSED_SIZE
266 ));
267 }
268
269 let decompressed = lz4::block::decompress(&compressed, Some(MAX_DECOMPRESSED_SIZE))?;
271
272 let actual_ratio = if compressed_size > 0 {
274 decompressed.len() / compressed_size
275 } else {
276 0
277 };
278 if actual_ratio > MAX_COMPRESSION_RATIO {
279 return Err(anyhow!(
280 "Decompression ratio {} exceeds maximum allowed ratio of {}. \
281 Compressed: {} bytes, Decompressed: {} bytes. \
282 This may indicate a zip bomb attack.",
283 actual_ratio,
284 MAX_COMPRESSION_RATIO,
285 compressed_size,
286 decompressed.len()
287 ));
288 }
289
290 let (experience, _): (Experience, _) =
291 bincode::serde::decode_from_slice(&decompressed, bincode::config::standard())?;
292
293 let mut restored = memory.clone();
295 restored.experience = experience;
296 restored.compressed = false;
297 restored.experience.metadata.remove("compressed_data");
298 restored.experience.metadata.remove("compression_ratio");
299 restored.experience.metadata.remove("compression_strategy");
300
301 Ok(restored)
302 } else {
303 Err(anyhow!("No compressed data found"))
304 }
305 }
306
307 fn create_summary(&self, content: &str, max_words: usize) -> String {
309 let words: Vec<&str> = content.split_whitespace().collect();
312 let summary_words = &words[..words.len().min(max_words)];
313 format!("{}...", summary_words.join(" "))
314 }
315}
316
/// Frequency-based keyword extractor with a static English stop-word list.
struct KeywordExtractor {
    // Lowercased words excluded from keyword candidates.
    stop_words: HashSet<String>,
}
321
322impl KeywordExtractor {
323 fn new() -> Self {
324 let stop_words = Self::load_stop_words();
325 Self { stop_words }
326 }
327
328 fn extract(&self, text: &str) -> Vec<String> {
329 let mut word_freq: HashMap<String, usize> = HashMap::new();
331
332 for word in text.split_whitespace() {
333 let clean_word = word
334 .to_lowercase()
335 .chars()
336 .filter(|c| c.is_alphanumeric())
337 .collect::<String>();
338
339 if clean_word.len() >= 2 && !self.stop_words.contains(&clean_word) {
340 *word_freq.entry(clean_word).or_insert(0) += 1;
341 }
342 }
343
344 let mut keywords: Vec<(String, usize)> = word_freq.into_iter().collect();
346 keywords.sort_by(|a, b| b.1.cmp(&a.1));
347
348 keywords
349 .into_iter()
350 .take(10)
351 .map(|(word, _)| word)
352 .collect()
353 }
354
355 fn is_stop_word(&self, word: &str) -> bool {
357 self.stop_words.contains(word)
358 }
359
360 fn load_stop_words() -> HashSet<String> {
361 let words = vec![
362 "the",
363 "is",
364 "at",
365 "which",
366 "on",
367 "and",
368 "a",
369 "an",
370 "as",
371 "are",
372 "was",
373 "were",
374 "been",
375 "be",
376 "have",
377 "has",
378 "had",
379 "do",
380 "does",
381 "did",
382 "will",
383 "would",
384 "could",
385 "should",
386 "may",
387 "might",
388 "must",
389 "shall",
390 "can",
391 "need",
392 "dare",
393 "ought",
394 "used",
395 "to",
396 "of",
397 "in",
398 "for",
399 "with",
400 "by",
401 "from",
402 "about",
403 "into",
404 "through",
405 "during",
406 "before",
407 "after",
408 "above",
409 "below",
410 "up",
411 "down",
412 "out",
413 "off",
414 "over",
415 "under",
416 "again",
417 "further",
418 "then",
419 "once",
420 "there",
421 "these",
422 "those",
423 "this",
424 "that",
425 "it",
426 "its",
427 "what",
428 "which",
429 "who",
430 "whom",
431 "whose",
432 "where",
433 "when",
434 "why",
435 "how",
436 "all",
437 "both",
438 "each",
439 "few",
440 "more",
441 "most",
442 "other",
443 "some",
444 "such",
445 "no",
446 "nor",
447 "not",
448 "only",
449 "own",
450 "same",
451 "so",
452 "than",
453 "too",
454 "very",
455 "just",
456 "but",
457 "or",
458 "if",
459 "i",
461 "you",
462 "he",
463 "she",
464 "we",
465 "they",
466 "me",
467 "him",
468 "her",
469 "us",
470 "them",
471 "my",
472 "your",
473 "his",
474 "our",
475 "their",
476 "also",
478 "always",
479 "never",
480 "often",
481 "sometimes",
482 "usually",
483 "really",
484 "actually",
485 "basically",
486 "currently",
487 "recently",
488 "already",
489 "still",
490 "yet",
491 "now",
492 "here",
493 "carefully",
494 "quickly",
495 "easily",
496 "well",
497 "much",
498 "many",
499 "new",
500 "old",
501 "good",
502 "great",
503 "best",
504 "like",
505 "even",
506 "also",
507 "get",
508 "got",
509 "set",
510 "use",
511 "using",
512 "used",
513 "make",
514 "made",
515 "being",
516 ];
517
518 words.into_iter().map(String::from).collect()
519 }
520}
521
/// Aggregate bookkeeping for compression activity.
///
/// NOTE(review): fields are not updated anywhere in this file — presumably
/// maintained by the code that drives the pipeline; confirm with callers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CompressionStats {
    /// Number of memories compressed so far.
    pub total_compressed: usize,
    /// Sum of pre-compression sizes in bytes.
    pub total_original_size: usize,
    /// Sum of post-compression sizes in bytes.
    pub total_compressed_size: usize,
    /// Running mean of compressed/original ratios; defaults to 1.0.
    pub average_compression_ratio: f32,
    /// Strategy name -> number of times that strategy was used.
    pub strategies_used: HashMap<String, usize>,
}
531
532impl Default for CompressionStats {
533 fn default() -> Self {
534 Self {
535 total_compressed: 0,
536 total_original_size: 0,
537 total_compressed_size: 0,
538 average_compression_ratio: 1.0,
539 strategies_used: HashMap::new(),
540 }
541 }
542}
543
/// A durable, generalized statement distilled from multiple memories.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticFact {
    /// UUID string assigned at extraction time.
    pub id: String,
    /// The fact text itself (a representative candidate pattern).
    pub fact: String,
    /// Belief strength, capped at 1.0; raised on reinforcement.
    pub confidence: f32,
    /// Number of supporting memories/reinforcements.
    pub support_count: usize,
    /// Memories this fact was extracted from or reinforced by.
    pub source_memories: Vec<MemoryId>,
    /// Entities mentioned alongside the fact.
    pub related_entities: Vec<String>,
    /// When the fact was first extracted.
    pub created_at: chrono::DateTime<chrono::Utc>,
    /// Last reinforcement time (drives the decay clock).
    pub last_reinforced: chrono::DateTime<chrono::Utc>,
    /// Coarse classification of what the fact describes.
    pub fact_type: FactType,
}
574
/// Coarse category of a consolidated semantic fact.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum FactType {
    /// A stated like/dislike or habitual choice.
    Preference,
    /// Something a system or person can do.
    Capability,
    /// A connection between two entities.
    Relationship,
    /// A how-to / sequence of actions.
    Procedure,
    /// A "subject is/means ..." statement.
    Definition,
    /// Catch-all for recurring observations (also the default).
    Pattern,
}
591
592impl Default for FactType {
593 fn default() -> Self {
594 Self::Pattern
595 }
596}
597
/// Summary of one consolidation pass.
#[derive(Debug, Clone, Default)]
pub struct ConsolidationResult {
    /// Total memories handed to `consolidate` (eligible or not).
    pub memories_processed: usize,
    /// Count of newly created facts (equals `new_facts.len()`).
    pub facts_extracted: usize,
    /// Count of existing facts reinforced. NOTE(review): never incremented
    /// in this file — presumably updated by a caller; confirm.
    pub facts_reinforced: usize,
    /// IDs of the newly created facts.
    pub new_fact_ids: Vec<String>,
    /// The newly created facts themselves.
    pub new_facts: Vec<SemanticFact>,
}
612
/// Extracts recurring semantic facts from batches of episodic memories.
pub struct SemanticConsolidator {
    // Shared keyword/stop-word machinery, also used for entity filtering.
    keyword_extractor: KeywordExtractor,
    // Minimum cluster size before a candidate pattern becomes a fact.
    min_support: usize,
    // Memories younger than this many days are ignored.
    min_age_days: i64,
    // Snowball stemmer (English) used to normalize tokens for similarity.
    stemmer: Stemmer,
}
627
/// Working cluster of similar candidate patterns during consolidation.
struct PatternCluster {
    // Union of stemmed tokens across all members.
    stem_set: HashSet<String>,
    // (pattern text, source memory, confidence) triples.
    members: Vec<(String, MemoryId, f32)>,
}
635
636impl Default for SemanticConsolidator {
637 fn default() -> Self {
638 Self::new()
639 }
640}
641
642impl SemanticConsolidator {
643 pub fn new() -> Self {
644 Self {
645 keyword_extractor: KeywordExtractor::new(),
646 min_support: CONSOLIDATION_MIN_SUPPORT,
647 min_age_days: CONSOLIDATION_MIN_AGE_DAYS,
648 stemmer: Stemmer::create(Algorithm::English),
649 }
650 }
651
652 pub fn with_thresholds(min_support: usize, min_age_days: i64) -> Self {
654 Self {
655 keyword_extractor: KeywordExtractor::new(),
656 min_support,
657 min_age_days,
658 stemmer: Stemmer::create(Algorithm::English),
659 }
660 }
661
    /// Distills recurring semantic facts from a batch of memories.
    ///
    /// Steps: (1) keep memories at least `min_age_days` old, (2) pull fact
    /// candidates from each, (3) cluster candidates by stemmed-token Jaccard
    /// similarity, (4) promote every cluster with at least `min_support`
    /// members to a `SemanticFact` whose confidence is the members' average
    /// (capped at 1.0). Returns the new facts plus bookkeeping counts.
    pub fn consolidate(&self, memories: &[Memory]) -> ConsolidationResult {
        let mut result = ConsolidationResult {
            memories_processed: memories.len(),
            ..Default::default()
        };

        if memories.is_empty() {
            return result;
        }

        // Only memories that have had time to "settle" are eligible.
        let now = chrono::Utc::now();
        let eligible: Vec<&Memory> = memories
            .iter()
            .filter(|m| (now - m.created_at).num_days() >= self.min_age_days)
            .collect();

        if eligible.is_empty() {
            return result;
        }

        // Flatten every (pattern, source, confidence) candidate into one list.
        let mut all_candidates: Vec<(String, MemoryId, f32)> = Vec::new();
        for memory in &eligible {
            let extracted = self.extract_fact_candidates(memory);
            for (pattern, confidence) in extracted {
                all_candidates.push((pattern, memory.id.clone(), confidence));
            }
        }

        if all_candidates.is_empty() {
            return result;
        }

        let clusters =
            self.group_candidates_by_similarity(&all_candidates, CONSOLIDATION_JACCARD_THRESHOLD);

        for cluster in clusters {
            // Only well-supported clusters become durable facts.
            if cluster.members.len() >= self.min_support {
                let representative = Self::select_representative(&cluster.members);
                let avg_confidence = cluster.members.iter().map(|(_, _, c)| c).sum::<f32>()
                    / cluster.members.len() as f32;

                let source_ids: Vec<MemoryId> = cluster
                    .members
                    .iter()
                    .map(|(_, id, _)| id.clone())
                    .collect();
                let entities = self.keyword_extractor.extract(representative);
                let fact_type = self.classify_fact(representative);

                let fact = SemanticFact {
                    id: uuid::Uuid::new_v4().to_string(),
                    fact: representative.to_string(),
                    confidence: avg_confidence.min(1.0),
                    support_count: cluster.members.len(),
                    source_memories: source_ids,
                    related_entities: entities,
                    created_at: now,
                    last_reinforced: now,
                    fact_type,
                };

                result.new_fact_ids.push(fact.id.clone());
                result.new_facts.push(fact);
                result.facts_extracted += 1;
            }
        }

        result
    }
741
742 fn stemmed_tokens(&self, text: &str) -> HashSet<String> {
746 text.split_whitespace()
747 .map(|w| {
748 w.to_lowercase()
749 .chars()
750 .filter(|c| c.is_alphanumeric())
751 .collect::<String>()
752 })
753 .filter(|w| w.len() >= 2 && !self.keyword_extractor.is_stop_word(w))
754 .map(|w| self.stemmer.stem(&w).to_string())
755 .collect()
756 }
757
758 fn jaccard_similarity(a: &HashSet<String>, b: &HashSet<String>) -> f32 {
760 if a.is_empty() && b.is_empty() {
761 return 0.0;
762 }
763 let intersection = a.intersection(b).count();
764 let union = a.union(b).count();
765 if union == 0 {
766 0.0
767 } else {
768 intersection as f32 / union as f32
769 }
770 }
771
772 fn group_candidates_by_similarity(
774 &self,
775 candidates: &[(String, MemoryId, f32)],
776 threshold: f32,
777 ) -> Vec<PatternCluster> {
778 let mut clusters: Vec<PatternCluster> = Vec::new();
779
780 for (pattern, memory_id, confidence) in candidates {
781 let tokens = self.stemmed_tokens(pattern);
782 if tokens.is_empty() {
783 continue;
784 }
785
786 let mut best_idx = None;
788 let mut best_sim = 0.0f32;
789 for (i, cluster) in clusters.iter().enumerate() {
790 let sim = Self::jaccard_similarity(&tokens, &cluster.stem_set);
791 if sim > best_sim {
792 best_sim = sim;
793 best_idx = Some(i);
794 }
795 }
796
797 if best_sim >= threshold {
798 let idx = best_idx.unwrap();
800 clusters[idx].stem_set = clusters[idx].stem_set.union(&tokens).cloned().collect();
801 clusters[idx]
802 .members
803 .push((pattern.clone(), memory_id.clone(), *confidence));
804 } else {
805 clusters.push(PatternCluster {
807 stem_set: tokens,
808 members: vec![(pattern.clone(), memory_id.clone(), *confidence)],
809 });
810 }
811 }
812
813 clusters
814 }
815
816 fn select_representative(members: &[(String, MemoryId, f32)]) -> &str {
818 members
819 .iter()
820 .max_by(|a, b| a.2.total_cmp(&b.2))
821 .map(|(text, _, _)| text.as_str())
822 .unwrap_or("")
823 }
824
    /// Pulls fact candidate strings (with confidence scores) from one memory.
    ///
    /// Each extractor (procedure / definition / pattern / preference /
    /// salient statement) contributes at most one candidate, weighted by the
    /// memory's importance times a multiplier reflecting how well the
    /// experience type matches the extractor. Near-duplicates are removed
    /// and the list is capped; entity-pair "relates to" candidates are
    /// appended after the cap, so they are not limited by it.
    fn extract_fact_candidates(&self, memory: &Memory) -> Vec<(String, f32)> {
        let mut candidates = Vec::new();
        let content = &memory.experience.content;
        let importance = memory.importance();
        let exp_type = &memory.experience.experience_type;

        // Procedures are weighted highest when recorded as decisions.
        if let Some(fact) = self.extract_procedure(content) {
            let mult = if *exp_type == ExperienceType::Decision {
                1.0
            } else {
                0.7
            };
            candidates.push((fact, importance * mult));
        }

        // Definitions from learning/discovery get a boost above 1.0.
        if let Some(fact) = self.extract_definition(content) {
            let mult = if *exp_type == ExperienceType::Learning
                || *exp_type == ExperienceType::Discovery
            {
                1.2
            } else {
                0.8
            };
            candidates.push((fact, importance * mult));
        }

        // Failure patterns are weighted highest in error experiences.
        if let Some(fact) = self.extract_pattern(content) {
            let mult = if *exp_type == ExperienceType::Error {
                1.1
            } else {
                0.7
            };
            candidates.push((fact, importance * mult));
        }

        // Preferences are weighted highest in conversations.
        if let Some(fact) = self.extract_preference(content) {
            let mult = if *exp_type == ExperienceType::Conversation {
                1.0
            } else {
                0.6
            };
            candidates.push((fact, importance * mult));
        }

        // Generic fallback: the most information-dense sentence.
        if let Some(fact) = self.extract_salient_statement(content, &memory.experience.entities) {
            candidates.push((fact, importance * 0.6));
        }

        candidates = self.dedup_within_memory(candidates);

        candidates.truncate(CONSOLIDATION_MAX_CANDIDATES_PER_MEMORY);

        // Pairwise entity relationships (only for multi-entity memories).
        if memory.experience.entities.len() >= 2 {
            let mut sorted_entities: Vec<String> = memory
                .experience
                .entities
                .iter()
                .map(|e| e.to_lowercase())
                .filter(|e| e.len() >= 3 && !self.keyword_extractor.is_stop_word(e))
                .collect();
            sorted_entities.sort();
            sorted_entities.dedup();

            // Drop entities that are substrings of another entity
            // (e.g. "auth" when "oauth" is also present).
            let before_dedup = sorted_entities.clone();
            sorted_entities.retain(|entity| {
                !before_dedup
                    .iter()
                    .any(|other| other != entity && other.contains(entity.as_str()))
            });

            // Cap at 4 entities, i.e. at most 6 pair candidates.
            let max_entities = sorted_entities.len().min(4);
            for i in 0..max_entities {
                for j in (i + 1)..max_entities {
                    let relationship =
                        format!("{} relates to {}", sorted_entities[i], sorted_entities[j]);
                    candidates.push((relationship, importance * 0.8));
                }
            }
        }

        candidates
    }
918
919 fn dedup_within_memory(&self, mut candidates: Vec<(String, f32)>) -> Vec<(String, f32)> {
921 if candidates.len() <= 1 {
922 return candidates;
923 }
924
925 candidates.sort_by(|a, b| b.1.total_cmp(&a.1));
927
928 let mut kept: Vec<(String, f32, HashSet<String>)> = Vec::new();
929 for (text, conf) in candidates {
930 let tokens = self.stemmed_tokens(&text);
931 let overlaps = kept
932 .iter()
933 .any(|(_, _, t)| Self::jaccard_similarity(&tokens, t) > 0.8);
934 if !overlaps {
935 kept.push((text, conf, tokens));
936 }
937 }
938
939 kept.into_iter()
940 .map(|(text, conf, _)| (text, conf))
941 .collect()
942 }
943
944 fn extract_procedure(&self, content: &str) -> Option<String> {
948 let lower = content.to_ascii_lowercase();
952 let action_markers = [
953 "to ",
954 "run ",
955 "execute ",
956 "use ",
957 "call ",
958 "invoke ",
959 "create ",
960 "build ",
961 "deploy ",
962 "install ",
963 "configure ",
964 "start ",
965 "stop ",
966 "restart ",
967 "set up ",
968 "update ",
969 "remove ",
970 "delete ",
971 "add ",
972 "import ",
973 "export ",
974 "migrate ",
975 ];
976
977 for marker in action_markers {
978 if let Some(pos) = lower.find(marker) {
979 if let Some(sentence) = Self::extract_sentence(content, pos) {
980 return Some(sentence);
981 }
982 }
983 }
984 None
985 }
986
987 fn extract_definition(&self, content: &str) -> Option<String> {
989 let lower = content.to_ascii_lowercase();
991 let def_markers = [
992 " is ",
993 " are ",
994 " means ",
995 " refers to ",
996 " represents ",
997 " denotes ",
998 " describes ",
999 " consists of ",
1000 " defined as ",
1001 " known as ",
1002 " stands for ",
1003 " equivalent to ",
1004 ];
1005
1006 for marker in def_markers {
1007 if let Some(pos) = lower.find(marker) {
1008 let subject_start =
1010 content[..pos].rfind(|c: char| !c.is_alphanumeric() && c != '_');
1011 let subject_start = subject_start.map(|i| i + 1).unwrap_or(0);
1012 let subject = &content[subject_start..pos];
1013
1014 if subject.len() >= 2 {
1015 let def_end = content[pos + marker.len()..]
1016 .find(|c| c == '.' || c == '!' || c == '?' || c == ',');
1017 let def_end = def_end
1018 .map(|i| pos + marker.len() + i)
1019 .unwrap_or(content.len().min(pos + marker.len() + 100));
1020
1021 let definition = &content[pos + marker.len()..def_end];
1022 if definition.len() > 5 {
1023 return Some(format!("{}{}{}", subject, marker, definition.trim()));
1024 }
1025 }
1026 }
1027 }
1028 None
1029 }
1030
1031 fn extract_pattern(&self, content: &str) -> Option<String> {
1033 let lower = content.to_ascii_lowercase();
1035 let pattern_markers = [
1036 "error",
1037 "failed",
1038 "crash",
1039 "bug",
1040 "issue",
1041 "problem",
1042 "exception",
1043 "warning",
1044 "panic",
1045 "timeout",
1046 "overflow",
1047 "deadlock",
1048 "leak",
1049 "corrupt",
1050 ];
1051
1052 for marker in pattern_markers {
1053 if let Some(pos) = lower.find(marker) {
1054 if let Some(sentence) = Self::extract_sentence(content, pos) {
1055 return Some(sentence);
1056 }
1057 }
1058 }
1059 None
1060 }
1061
1062 fn extract_preference(&self, content: &str) -> Option<String> {
1064 let lower = content.to_ascii_lowercase();
1066 let pref_markers = [
1067 "prefer",
1068 "like",
1069 "want",
1070 "better",
1071 "should",
1072 "always",
1073 "never",
1074 "dislike",
1075 "avoid",
1076 "recommend",
1077 "favorite",
1078 "rather",
1079 "instead of",
1080 "opt for",
1081 ];
1082
1083 for marker in pref_markers {
1084 if let Some(pos) = lower.find(marker) {
1085 if let Some(sentence) = Self::extract_sentence(content, pos) {
1086 return Some(sentence);
1087 }
1088 }
1089 }
1090 None
1091 }
1092
    /// Picks the most information-dense sentence as a fallback candidate.
    ///
    /// Sentences are scored by their count of non-stop-word tokens plus 2.0
    /// per known entity mentioned; sentences with trimmed length outside
    /// 20..=200 chars or with fewer than 3 content words are skipped.
    /// Returns the top scorer, if any.
    fn extract_salient_statement(&self, content: &str, entities: &[String]) -> Option<String> {
        let sentences = Self::split_sentences(content);
        let entity_lower: Vec<String> = entities.iter().map(|e| e.to_lowercase()).collect();

        let mut best: Option<(String, f32)> = None;

        for sentence in sentences {
            let trimmed = sentence.trim();
            if trimmed.len() < 20 || trimmed.len() > 200 {
                continue;
            }

            let lower = trimmed.to_lowercase();

            // Count words that carry meaning (alphanumeric, non-stop-word).
            let content_words: usize = lower
                .split_whitespace()
                .map(|w| {
                    w.chars()
                        .filter(|c| c.is_alphanumeric())
                        .collect::<String>()
                })
                .filter(|w| !w.is_empty() && !self.keyword_extractor.is_stop_word(w))
                .count();

            if content_words < 3 {
                continue;
            }

            // Mentioning known entities makes a sentence more salient.
            let entity_bonus: f32 = entity_lower
                .iter()
                .filter(|e| lower.contains(e.as_str()))
                .count() as f32
                * 2.0;

            let score = content_words as f32 + entity_bonus;

            if best.as_ref().map_or(true, |(_, s)| score > *s) {
                best = Some((trimmed.to_string(), score));
            }
        }

        best.map(|(s, _)| s)
    }
1140
1141 fn extract_sentence(content: &str, pos: usize) -> Option<String> {
1145 let start = content[..pos].rfind(|c| c == '.' || c == '!' || c == '?');
1146 let start = start.map(|i| i + 1).unwrap_or(0);
1147
1148 let end = content[pos..].find(|c| c == '.' || c == '!' || c == '?');
1149 let end = end.map(|i| pos + i).unwrap_or(content.len());
1150
1151 let sentence = content[start..end].trim();
1152 if sentence.len() >= 20 && sentence.len() < 200 {
1153 Some(sentence.to_string())
1154 } else {
1155 None
1156 }
1157 }
1158
1159 fn split_sentences(content: &str) -> Vec<&str> {
1161 let mut sentences = Vec::new();
1162 let mut start = 0;
1163
1164 for (i, c) in content.char_indices() {
1165 if c == '.' || c == '!' || c == '?' {
1166 let sentence = &content[start..i];
1167 if !sentence.trim().is_empty() {
1168 sentences.push(sentence.trim());
1169 }
1170 start = i + c.len_utf8();
1171 }
1172 }
1173
1174 let remaining = content[start..].trim();
1176 if !remaining.is_empty() {
1177 sentences.push(remaining);
1178 }
1179
1180 sentences
1181 }
1182
1183 fn classify_fact(&self, pattern: &str) -> FactType {
1185 let lower = pattern.to_lowercase();
1186
1187 if lower.contains("prefer")
1188 || lower.contains("like")
1189 || lower.contains("always")
1190 || lower.contains("never")
1191 || lower.contains("favorite")
1192 {
1193 FactType::Preference
1194 } else if lower.contains("can ") || lower.contains("able to") || lower.contains("supports")
1195 {
1196 FactType::Capability
1197 } else if lower.contains("relates to")
1198 || lower.contains("depends on")
1199 || lower.contains("connects")
1200 {
1201 FactType::Relationship
1202 } else if lower.contains("to ")
1203 || lower.contains("run ")
1204 || lower.contains("execute")
1205 || lower.contains("deploy")
1206 {
1207 FactType::Procedure
1208 } else if lower.contains(" is ") || lower.contains(" are ") || lower.contains("means") {
1209 FactType::Definition
1210 } else {
1211 FactType::Pattern
1212 }
1213 }
1214
1215 pub fn reinforce_fact(&self, fact: &mut SemanticFact, memory: &Memory) {
1219 fact.support_count += 1;
1220 fact.last_reinforced = chrono::Utc::now();
1221
1222 let boost = 0.1 * (1.0 - fact.confidence);
1224 fact.confidence = (fact.confidence + boost).min(1.0);
1225
1226 if !fact.source_memories.contains(&memory.id) {
1228 fact.source_memories.push(memory.id.clone());
1229 }
1230
1231 for entity in &memory.experience.entities {
1233 let lower = entity.to_lowercase();
1234 if lower.len() >= 3
1235 && !self.keyword_extractor.is_stop_word(&lower)
1236 && !fact.related_entities.contains(entity)
1237 {
1238 fact.related_entities.push(entity.clone());
1239 }
1240 }
1241 }
1242
1243 pub fn should_decay_fact(&self, fact: &SemanticFact) -> bool {
1248 use crate::constants::{
1249 FACT_DECAY_GRACE_DAYS, FACT_DECAY_HALF_LIFE_BASE_DAYS,
1250 FACT_DECAY_HALF_LIFE_PER_SUPPORT_DAYS,
1251 };
1252 let days_since = (chrono::Utc::now() - fact.last_reinforced).num_days();
1253 if days_since <= FACT_DECAY_GRACE_DAYS {
1254 return false;
1255 }
1256 let elapsed = (days_since - FACT_DECAY_GRACE_DAYS) as f64;
1257 let half_life = FACT_DECAY_HALF_LIFE_BASE_DAYS
1258 + (fact.support_count as f64 * FACT_DECAY_HALF_LIFE_PER_SUPPORT_DAYS);
1259 let projected = fact.confidence * (0.5_f64).powf(elapsed / half_life) as f32;
1260 projected < 0.1
1261 }
1262}
1263
1264#[cfg(test)]
1265mod tests {
1266 use super::*;
1267 use uuid::Uuid;
1268
    /// Builds a 60-day-old Observation memory with the given content and
    /// importance — old enough to be eligible for compression/consolidation.
    fn create_test_memory(content: &str, importance: f32) -> Memory {
        let experience = Experience {
            content: content.to_string(),
            experience_type: ExperienceType::Observation,
            entities: vec!["test".to_string()],
            ..Default::default()
        };

        let created_at = Some(chrono::Utc::now() - chrono::Duration::days(60));

        Memory::new(
            MemoryId(Uuid::new_v4()),
            experience,
            importance,
            None, None, None, created_at,
        )
    }
1289
    // Default construction wires up the keyword extractor's stop-word list.
    #[test]
    fn test_compression_pipeline_default() {
        let pipeline = CompressionPipeline::default();
        assert!(pipeline.keyword_extractor.stop_words.contains("the"));
    }
1295
    // LZ4 round trip: high importance selects the lossless strategy, and
    // decompression restores the original content and clears the flag.
    #[test]
    fn test_lz4_compress_decompress() {
        let pipeline = CompressionPipeline::new();
        let memory = create_test_memory("This is a test memory content for compression", 0.9);

        let compressed = pipeline.compress(&memory).unwrap();
        assert!(compressed.compressed);
        assert_eq!(
            compressed
                .experience
                .metadata
                .get("compression_strategy")
                .unwrap(),
            "lz4"
        );

        let decompressed = pipeline.decompress(&compressed).unwrap();
        assert!(!decompressed.compressed);
        assert_eq!(decompressed.experience.content, memory.experience.content);
    }
1316
    // compress() is a passthrough for already-compressed memories.
    #[test]
    fn test_already_compressed_memory() {
        let pipeline = CompressionPipeline::new();
        let mut memory = create_test_memory("Test content", 0.9);
        memory.compressed = true;

        let result = pipeline.compress(&memory).unwrap();
        assert!(result.compressed);
    }
1326
    // Semantic compression keeps keywords but cannot be reversed; decompress
    // must fail with an error that mentions lossiness.
    #[test]
    fn test_semantic_compression_lossy() {
        let pipeline = CompressionPipeline::new();
        let mut memory = create_test_memory(
            "This is a long test memory with many words for semantic compression testing purposes",
            0.1,
        );
        memory.created_at = chrono::Utc::now() - chrono::Duration::days(100);

        let compressed = pipeline.compress_semantic(&memory).unwrap();
        assert!(compressed.compressed);
        assert!(compressed.experience.metadata.contains_key("keywords"));

        let result = pipeline.decompress(&compressed);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("lossy"));
    }
1344
    // is_lossless: true for uncompressed and LZ4 memories, false for semantic.
    #[test]
    fn test_is_lossless() {
        let pipeline = CompressionPipeline::new();
        let memory = create_test_memory("Test content", 0.9);

        assert!(pipeline.is_lossless(&memory));

        let compressed_lz4 = pipeline.compress_lz4(&memory).unwrap();
        assert!(pipeline.is_lossless(&compressed_lz4));

        let compressed_semantic = pipeline.compress_semantic(&memory).unwrap();
        assert!(!pipeline.is_lossless(&compressed_semantic));
    }
1358
    // get_strategy: None for uncompressed memories, the tag ("lz4") otherwise.
    #[test]
    fn test_get_strategy() {
        let pipeline = CompressionPipeline::new();
        let memory = create_test_memory("Test content", 0.9);

        assert!(pipeline.get_strategy(&memory).is_none());

        let compressed = pipeline.compress_lz4(&memory).unwrap();
        assert_eq!(pipeline.get_strategy(&compressed), Some("lz4"));
    }
1369
    // Content words come out lowercased; stop words never appear.
    #[test]
    fn test_keyword_extraction() {
        let extractor = KeywordExtractor::new();
        let text = "Rust programming language memory management ownership borrowing";
        let keywords = extractor.extract(text);

        assert!(!keywords.is_empty());
        assert!(keywords.contains(&"rust".to_string()));
        assert!(keywords.contains(&"memory".to_string()));
        assert!(!keywords.contains(&"the".to_string()));
    }
1381
    // Input consisting entirely of stop words yields no keywords at all.
    #[test]
    fn test_stop_words_filtered() {
        let extractor = KeywordExtractor::new();
        let text = "the is at which on and a an as are was were";
        let keywords = extractor.extract(text);

        assert!(keywords.is_empty());
    }
1390
    // Consolidating an empty slice returns zeroed counters.
    #[test]
    fn test_semantic_consolidator_empty() {
        let consolidator = SemanticConsolidator::new();
        let result = consolidator.consolidate(&[]);

        assert_eq!(result.memories_processed, 0);
        assert_eq!(result.facts_extracted, 0);
    }
1399
    // with_thresholds stores the provided support/age thresholds verbatim.
    #[test]
    fn test_semantic_consolidator_with_thresholds() {
        let consolidator = SemanticConsolidator::with_thresholds(2, 7);
        assert_eq!(consolidator.min_support, 2);
        assert_eq!(consolidator.min_age_days, 7);
    }
1406
    // Spot-checks one input per marker-driven branch of classify_fact.
    #[test]
    fn test_fact_type_classification() {
        let consolidator = SemanticConsolidator::new();

        assert_eq!(
            consolidator.classify_fact("preference: concise code"),
            FactType::Preference
        );
        assert_eq!(
            consolidator.classify_fact("system can handle 10k requests"),
            FactType::Capability
        );
        assert_eq!(
            consolidator.classify_fact("auth relates to jwt"),
            FactType::Relationship
        );
        assert_eq!(
            consolidator.classify_fact("to deploy, run cargo build"),
            FactType::Procedure
        );
        assert_eq!(
            consolidator.classify_fact("MemoryId is a UUID wrapper"),
            FactType::Definition
        );
    }
1432
1433 #[test]
1434 fn test_reinforce_fact() {
1435 let consolidator = SemanticConsolidator::new();
1436 let mut fact = SemanticFact {
1437 id: "test-fact".to_string(),
1438 fact: "test fact content".to_string(),
1439 confidence: 0.5,
1440 support_count: 1,
1441 source_memories: vec![],
1442 related_entities: vec![],
1443 created_at: chrono::Utc::now(),
1444 last_reinforced: chrono::Utc::now() - chrono::Duration::days(10),
1445 fact_type: FactType::Pattern,
1446 };
1447 let memory = create_test_memory("reinforcing memory", 0.7);
1448
1449 let old_confidence = fact.confidence;
1450 consolidator.reinforce_fact(&mut fact, &memory);
1451
1452 assert!(fact.confidence > old_confidence);
1453 assert_eq!(fact.support_count, 2);
1454 assert!(fact.source_memories.contains(&memory.id));
1455 }
1456
1457 #[test]
1458 fn test_fact_decay_threshold() {
1459 let consolidator = SemanticConsolidator::new();
1460
1461 let recent_fact = SemanticFact {
1462 id: "recent".to_string(),
1463 fact: "recent fact".to_string(),
1464 confidence: 0.8,
1465 support_count: 5,
1466 source_memories: vec![],
1467 related_entities: vec![],
1468 created_at: chrono::Utc::now(),
1469 last_reinforced: chrono::Utc::now(),
1470 fact_type: FactType::Pattern,
1471 };
1472 assert!(!consolidator.should_decay_fact(&recent_fact));
1473
1474 let old_fact = SemanticFact {
1475 id: "old".to_string(),
1476 fact: "old fact".to_string(),
1477 confidence: 0.1,
1478 support_count: 1,
1479 source_memories: vec![],
1480 related_entities: vec![],
1481 created_at: chrono::Utc::now() - chrono::Duration::days(365),
1482 last_reinforced: chrono::Utc::now() - chrono::Duration::days(100),
1483 fact_type: FactType::Pattern,
1484 };
1485 assert!(consolidator.should_decay_fact(&old_fact));
1486 }
1487
1488 #[test]
1489 fn test_compression_stats_default() {
1490 let stats = CompressionStats::default();
1491
1492 assert_eq!(stats.total_compressed, 0);
1493 assert_eq!(stats.average_compression_ratio, 1.0);
1494 assert!(stats.strategies_used.is_empty());
1495 }
1496
1497 #[test]
1498 fn test_create_summary() {
1499 let pipeline = CompressionPipeline::new();
1500 let content = "This is a long piece of content that should be summarized into fewer words";
1501 let summary = pipeline.create_summary(content, 5);
1502
1503 assert!(summary.ends_with("..."));
1504 assert!(summary.len() < content.len());
1505 }
1506
1507 #[test]
1508 fn test_consolidation_result_default() {
1509 let result = ConsolidationResult::default();
1510
1511 assert_eq!(result.memories_processed, 0);
1512 assert_eq!(result.facts_extracted, 0);
1513 assert!(result.new_facts.is_empty());
1514 }
1515
1516 #[test]
1517 fn test_fact_type_default() {
1518 let fact_type = FactType::default();
1519 assert_eq!(fact_type, FactType::Pattern);
1520 }
1521
1522 fn create_typed_memory(
1523 content: &str,
1524 importance: f32,
1525 exp_type: ExperienceType,
1526 entities: Vec<String>,
1527 ) -> Memory {
1528 let experience = Experience {
1529 content: content.to_string(),
1530 experience_type: exp_type,
1531 entities,
1532 ..Default::default()
1533 };
1534
1535 let created_at = Some(chrono::Utc::now() - chrono::Duration::days(60));
1536
1537 Memory::new(
1538 MemoryId(Uuid::new_v4()),
1539 experience,
1540 importance,
1541 None,
1542 None,
1543 None,
1544 created_at,
1545 )
1546 }
1547
1548 #[test]
1549 fn test_jaccard_similarity_helper() {
1550 let a: HashSet<String> = ["rust", "memory", "safety"]
1551 .iter()
1552 .map(|s| s.to_string())
1553 .collect();
1554 let b: HashSet<String> = ["rust", "memory", "performance"]
1555 .iter()
1556 .map(|s| s.to_string())
1557 .collect();
1558 let c: HashSet<String> = ["python", "web", "django"]
1559 .iter()
1560 .map(|s| s.to_string())
1561 .collect();
1562
1563 let ab = SemanticConsolidator::jaccard_similarity(&a, &b);
1564 assert!(ab > 0.4, "rust+memory overlap should give ~0.5, got {ab}");
1565 assert!(ab < 0.6);
1566
1567 let ac = SemanticConsolidator::jaccard_similarity(&a, &c);
1568 assert!(ac < 0.01, "disjoint sets should give 0.0, got {ac}");
1569
1570 let aa = SemanticConsolidator::jaccard_similarity(&a, &a);
1571 assert!((aa - 1.0).abs() < 0.001, "identical sets should give 1.0");
1572
1573 let empty: HashSet<String> = HashSet::new();
1574 assert_eq!(
1575 SemanticConsolidator::jaccard_similarity(&empty, &empty),
1576 0.0
1577 );
1578 }
1579
1580 #[test]
1581 fn test_stemmed_tokens_removes_stop_words() {
1582 let consolidator = SemanticConsolidator::new();
1583 let tokens = consolidator.stemmed_tokens("The Rust programming language is very fast");
1584
1585 assert!(!tokens.is_empty());
1586 assert!(!tokens.contains("the"));
1588 assert!(!tokens.contains("is"));
1589 assert!(!tokens.contains("very"));
1590 assert!(tokens.contains("rust"));
1592 assert!(tokens.contains("fast"));
1593 }
1594
1595 #[test]
1596 fn test_similarity_grouping_clusters_similar_patterns() {
1597 let consolidator = SemanticConsolidator::with_thresholds(2, 0);
1598
1599 let m1 = create_test_memory(
1600 "Rust provides memory safety and performance guarantees",
1601 0.8,
1602 );
1603 let m2 = create_test_memory("Rust gives memory safety with great performance", 0.7);
1604 let m3 = create_test_memory("Python is great for data science and machine learning", 0.6);
1605
1606 let result = consolidator.consolidate(&[m1, m2, m3]);
1607
1608 assert!(
1610 result.facts_extracted >= 1,
1611 "Similar memories about Rust should cluster into at least 1 fact, got {}",
1612 result.facts_extracted
1613 );
1614
1615 let has_rust_fact = result
1617 .new_facts
1618 .iter()
1619 .any(|f| f.fact.to_lowercase().contains("rust"));
1620 assert!(has_rust_fact, "Should have a fact about Rust");
1621 }
1622
1623 #[test]
1624 fn test_multi_extractor_produces_multiple_candidates() {
1625 let consolidator = SemanticConsolidator::new();
1626
1627 let memory = create_typed_memory(
1629 "RocksDB is a high-performance embedded database. We should use it for storage.",
1630 0.8,
1631 ExperienceType::Decision,
1632 vec!["RocksDB".to_string()],
1633 );
1634
1635 let candidates = consolidator.extract_fact_candidates(&memory);
1636
1637 assert!(
1639 candidates.len() >= 2,
1640 "Multi-extractor should produce >=2 candidates, got {}",
1641 candidates.len()
1642 );
1643 }
1644
1645 #[test]
1646 fn test_generic_fallback_produces_real_sentence() {
1647 let consolidator = SemanticConsolidator::new();
1648
1649 let memory = create_typed_memory(
1650 "The deployment pipeline uses Docker containers for isolation. Each service runs independently.",
1651 0.7,
1652 ExperienceType::Observation,
1653 vec!["Docker".to_string()],
1654 );
1655
1656 let candidates = consolidator.extract_fact_candidates(&memory);
1657
1658 let has_involves = candidates
1660 .iter()
1661 .any(|(text, _)| text.starts_with("involves:"));
1662 assert!(
1663 !has_involves,
1664 "Should not produce synthetic 'involves:' patterns"
1665 );
1666
1667 assert!(
1669 !candidates.is_empty(),
1670 "Should extract at least one candidate"
1671 );
1672 }
1673
1674 #[test]
1675 fn test_entity_relationships_sorted_deterministic() {
1676 let consolidator = SemanticConsolidator::new();
1677
1678 let m1 = create_typed_memory(
1679 "Testing entity ordering",
1680 0.7,
1681 ExperienceType::Observation,
1682 vec!["JWT".to_string(), "Auth".to_string(), "Token".to_string()],
1683 );
1684
1685 let m2 = create_typed_memory(
1686 "Testing entity ordering",
1687 0.7,
1688 ExperienceType::Observation,
1689 vec!["Token".to_string(), "Auth".to_string(), "JWT".to_string()],
1690 );
1691
1692 let c1 = consolidator.extract_fact_candidates(&m1);
1693 let c2 = consolidator.extract_fact_candidates(&m2);
1694
1695 let relations1: Vec<&str> = c1
1697 .iter()
1698 .filter(|(t, _)| t.contains("relates to"))
1699 .map(|(t, _)| t.as_str())
1700 .collect();
1701 let relations2: Vec<&str> = c2
1702 .iter()
1703 .filter(|(t, _)| t.contains("relates to"))
1704 .map(|(t, _)| t.as_str())
1705 .collect();
1706
1707 assert_eq!(
1708 relations1, relations2,
1709 "Entity relationships should be deterministic regardless of input order"
1710 );
1711 }
1712}