1use serde::{Deserialize, Serialize};
15use std::collections::VecDeque;
16
/// Lowercase English function words that carry no topical signal.
///
/// `SynthesisEngine::tokenize` drops every token found in this list, so these
/// words never contribute to overlap scoring.
static STOPWORDS: &[&str] = &[
    // articles
    "the", "a", "an",
    // forms of "to be"
    "is", "are", "was", "were", "be", "been", "being",
    // forms of "to have"
    "have", "has", "had",
    // forms of "to do"
    "do", "does", "did",
    // modal verbs
    "will", "would", "could", "should", "may", "might", "shall", "can",
    // prepositions and particles
    "to", "of", "in", "for", "on", "with", "at", "by", "from",
];
27
28#[derive(Debug, Clone, Serialize, Deserialize)]
34pub struct SynthesisConfig {
35 pub overlap_threshold: f32,
38 pub buffer_size: usize,
41 pub min_overlap_tokens: usize,
44}
45
46impl Default for SynthesisConfig {
47 fn default() -> Self {
48 Self {
49 overlap_threshold: 0.4,
50 buffer_size: 50,
51 min_overlap_tokens: 5,
52 }
53 }
54}
55
56#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
58#[serde(rename_all = "snake_case")]
59pub enum SynthesisStrategy {
60 Merge,
62 Replace,
64 Append,
66}
67
68#[derive(Debug, Clone, Serialize, Deserialize)]
70pub struct SynthesizedMemory {
71 pub content: String,
73 pub sources: Vec<i64>,
75 pub overlap_score: f32,
77 pub strategy_used: SynthesisStrategy,
79 pub tokens_saved: usize,
81}
82
83#[derive(Debug, Clone, Serialize, Deserialize)]
85pub struct OverlapResult {
86 pub memory_id: i64,
88 pub overlap_score: f32,
90 pub shared_tokens: Vec<String>,
92}
93
/// A memory held in the engine's buffer together with its precomputed tokens.
///
/// Tokens are computed once when the entry is buffered so that overlap
/// detection does not have to re-tokenize `content` on every query.
#[derive(Debug, Clone)]
pub struct BufferEntry {
    /// Caller-supplied identifier of the memory.
    pub id: i64,
    /// The memory's original text.
    pub content: String,
    /// Tokens extracted from `content` when the entry was buffered.
    pub tokens: Vec<String>,
}
107
108pub struct SynthesisEngine {
129 config: SynthesisConfig,
130 buffer: VecDeque<BufferEntry>,
131}
132
133impl SynthesisEngine {
134 pub fn new(config: SynthesisConfig) -> Self {
136 Self {
137 buffer: VecDeque::with_capacity(config.buffer_size),
138 config,
139 }
140 }
141
142 pub fn tokenize(text: &str) -> Vec<String> {
150 let lower = text.to_lowercase();
151 lower
152 .split(|c: char| c.is_whitespace() || c.is_ascii_punctuation())
153 .map(|tok| tok.trim_matches(|c: char| !c.is_alphanumeric()).to_string())
154 .filter(|tok| !tok.is_empty() && !STOPWORDS.contains(&tok.as_str()))
155 .collect()
156 }
157
158 pub fn jaccard_similarity(a: &[String], b: &[String]) -> f32 {
167 if a.is_empty() && b.is_empty() {
168 return 1.0;
169 }
170
171 let mut set_a: Vec<&str> = a.iter().map(String::as_str).collect();
173 set_a.sort_unstable();
174 set_a.dedup();
175
176 let mut set_b: Vec<&str> = b.iter().map(String::as_str).collect();
177 set_b.sort_unstable();
178 set_b.dedup();
179
180 let mut intersection = 0usize;
181 let (mut i, mut j) = (0, 0);
182 while i < set_a.len() && j < set_b.len() {
183 match set_a[i].cmp(set_b[j]) {
184 std::cmp::Ordering::Equal => {
185 intersection += 1;
186 i += 1;
187 j += 1;
188 }
189 std::cmp::Ordering::Less => i += 1,
190 std::cmp::Ordering::Greater => j += 1,
191 }
192 }
193
194 let union = set_a.len() + set_b.len() - intersection;
196 if union == 0 {
197 0.0
198 } else {
199 intersection as f32 / union as f32
200 }
201 }
202
203 pub fn detect_overlap(&self, new_content: &str) -> Vec<OverlapResult> {
213 let new_tokens = Self::tokenize(new_content);
214
215 let mut results: Vec<OverlapResult> = self
216 .buffer
217 .iter()
218 .filter_map(|entry| {
219 let score = Self::jaccard_similarity(&new_tokens, &entry.tokens);
220 if score < self.config.overlap_threshold {
221 return None;
222 }
223
224 let mut new_sorted: Vec<&str> = new_tokens.iter().map(String::as_str).collect();
226 new_sorted.sort_unstable();
227 new_sorted.dedup();
228
229 let mut buf_sorted: Vec<&str> = entry.tokens.iter().map(String::as_str).collect();
230 buf_sorted.sort_unstable();
231 buf_sorted.dedup();
232
233 let shared: Vec<String> = {
234 let (mut i, mut j) = (0, 0);
235 let mut shared = Vec::new();
236 while i < new_sorted.len() && j < buf_sorted.len() {
237 match new_sorted[i].cmp(buf_sorted[j]) {
238 std::cmp::Ordering::Equal => {
239 shared.push(new_sorted[i].to_string());
240 i += 1;
241 j += 1;
242 }
243 std::cmp::Ordering::Less => i += 1,
244 std::cmp::Ordering::Greater => j += 1,
245 }
246 }
247 shared
248 };
249
250 if shared.len() < self.config.min_overlap_tokens {
251 return None;
252 }
253
254 Some(OverlapResult {
255 memory_id: entry.id,
256 overlap_score: score,
257 shared_tokens: shared,
258 })
259 })
260 .collect();
261
262 results.sort_by(|a, b| {
263 b.overlap_score
264 .partial_cmp(&a.overlap_score)
265 .unwrap_or(std::cmp::Ordering::Equal)
266 });
267
268 results
269 }
270
271 pub fn synthesize(
280 &self,
281 existing_content: &str,
282 existing_id: i64,
283 new_content: &str,
284 strategy: SynthesisStrategy,
285 ) -> SynthesizedMemory {
286 let existing_tokens = Self::tokenize(existing_content);
287 let new_tokens = Self::tokenize(new_content);
288 let overlap_score = Self::jaccard_similarity(&existing_tokens, &new_tokens);
289
290 let combined_raw_len = existing_content.len() + new_content.len();
291
292 let content = match strategy {
293 SynthesisStrategy::Merge => Self::merge_content(existing_content, new_content),
294 SynthesisStrategy::Replace => new_content.to_string(),
295 SynthesisStrategy::Append => Self::append_content(existing_content, new_content),
296 };
297
298 let tokens_saved = combined_raw_len.saturating_sub(content.len());
299
300 SynthesizedMemory {
301 content,
302 sources: vec![existing_id],
303 overlap_score,
304 strategy_used: strategy,
305 tokens_saved,
306 }
307 }
308
309 fn merge_content(existing: &str, new: &str) -> String {
315 let existing_sentences = Self::split_sentences(existing);
316 let new_sentences = Self::split_sentences(new);
317
318 let existing_norm: Vec<String> = existing_sentences
319 .iter()
320 .map(|s| s.to_lowercase())
321 .collect();
322
323 let mut merged: Vec<&str> = existing_sentences.iter().map(String::as_str).collect();
324
325 for (raw, norm) in new_sentences.iter().zip(
326 new_sentences
327 .iter()
328 .map(|s| s.to_lowercase())
329 .collect::<Vec<_>>()
330 .iter(),
331 ) {
332 if !existing_norm.contains(norm) {
333 merged.push(raw.as_str());
334 }
335 }
336
337 merged.join(" ").trim().to_string()
338 }
339
340 fn append_content(existing: &str, new: &str) -> String {
343 let existing_lines: Vec<&str> = existing.lines().collect();
344 let existing_norm: Vec<String> = existing_lines
345 .iter()
346 .map(|l| l.trim().to_lowercase())
347 .collect();
348
349 let mut result_lines: Vec<&str> = existing_lines;
350
351 let separator_added = !existing.trim().is_empty();
353
354 let mut new_lines: Vec<&str> = Vec::new();
355 for line in new.lines() {
356 let norm = line.trim().to_lowercase();
357 if !norm.is_empty() && !existing_norm.contains(&norm) {
358 new_lines.push(line);
359 }
360 }
361
362 if !new_lines.is_empty() {
363 if separator_added {
364 result_lines.push("---");
365 }
366 result_lines.extend_from_slice(&new_lines);
367 }
368
369 result_lines.join("\n")
370 }
371
372 fn split_sentences(text: &str) -> Vec<String> {
374 text.split(['.', '!', '?'])
375 .map(|s| s.trim().to_string())
376 .filter(|s| !s.is_empty())
377 .collect()
378 }
379
380 pub fn add_to_buffer(&mut self, id: i64, content: &str) {
387 if self.buffer.len() >= self.config.buffer_size {
388 self.buffer.pop_front();
389 }
390 let tokens = Self::tokenize(content);
391 self.buffer.push_back(BufferEntry {
392 id,
393 content: content.to_string(),
394 tokens,
395 });
396 }
397
398 pub fn buffer_len(&self) -> usize {
400 self.buffer.len()
401 }
402
403 pub fn check_and_synthesize(
412 &self,
413 new_content: &str,
414 strategy: SynthesisStrategy,
415 ) -> Option<SynthesizedMemory> {
416 let overlaps = self.detect_overlap(new_content);
417 let best = overlaps.first()?;
418
419 let entry = self.buffer.iter().find(|e| e.id == best.memory_id)?;
421
422 Some(self.synthesize(&entry.content, entry.id, new_content, strategy))
423 }
424}
425
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an owned token list from string literals.
    fn words(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// Builds an engine from explicit configuration values.
    fn engine_with(
        overlap_threshold: f32,
        buffer_size: usize,
        min_overlap_tokens: usize,
    ) -> SynthesisEngine {
        SynthesisEngine::new(SynthesisConfig {
            overlap_threshold,
            buffer_size,
            min_overlap_tokens,
        })
    }

    /// Builds an engine with the default configuration.
    fn default_engine() -> SynthesisEngine {
        SynthesisEngine::new(SynthesisConfig::default())
    }

    /// Identical, disjoint, and partially overlapping token sets.
    #[test]
    fn test_jaccard_similarity_basic() {
        let identical = SynthesisEngine::jaccard_similarity(
            &words(&["rust", "ownership"]),
            &words(&["rust", "ownership"]),
        );
        assert!(
            (identical - 1.0).abs() < 1e-6,
            "identical sets should give 1.0"
        );

        let disjoint = SynthesisEngine::jaccard_similarity(
            &words(&["apple", "banana"]),
            &words(&["car", "truck"]),
        );
        assert!(
            (disjoint - 0.0).abs() < 1e-6,
            "disjoint sets should give 0.0"
        );

        // One shared token out of three distinct tokens overall.
        let sim3 = SynthesisEngine::jaccard_similarity(
            &words(&["rust", "borrow"]),
            &words(&["rust", "ownership"]),
        );
        let expected = 1.0f32 / 3.0;
        assert!(
            (sim3 - expected).abs() < 1e-5,
            "partial overlap sim={sim3} expected≈{expected}"
        );
    }

    /// Highly similar content is reported with the buffered entry's id.
    #[test]
    fn test_detect_overlap_above_threshold() {
        let mut engine = engine_with(0.3, 10, 2);
        engine.add_to_buffer(
            1,
            "Rust ownership model uses borrow checker rules compile time safety",
        );

        let hits =
            engine.detect_overlap("Rust borrow checker enforces ownership rules compile time");

        assert!(
            !hits.is_empty(),
            "should find overlap for highly similar content"
        );
        let top = &hits[0];
        assert_eq!(top.memory_id, 1);
        assert!(top.overlap_score >= 0.3);
    }

    /// Unrelated content produces no overlap results.
    #[test]
    fn test_no_overlap_below_threshold() {
        let mut engine = engine_with(0.6, 10, 2);
        engine.add_to_buffer(1, "SQLite WAL mode improves write concurrency");

        let hits = engine.detect_overlap("Python async await event loop");

        assert!(
            hits.is_empty(),
            "unrelated content should not trigger overlap"
        );
    }

    /// Merge keeps unique sentences from both sides and drops the shared one.
    #[test]
    fn test_merge_strategy_combines_content() {
        let engine = default_engine();
        let old_text = "Rust uses a borrow checker. Safety is guaranteed at compile time.";
        let new_text = "Rust uses a borrow checker. Memory leaks are prevented automatically.";

        let merged = engine.synthesize(old_text, 1, new_text, SynthesisStrategy::Merge);

        assert_eq!(merged.strategy_used, SynthesisStrategy::Merge);
        assert!(
            merged
                .content
                .contains("Memory leaks are prevented automatically"),
            "merged content should include the unique new sentence"
        );
        let occurrences = merged.content.matches("Rust uses a borrow checker").count();
        assert_eq!(occurrences, 1, "shared sentence should appear only once");
        assert_eq!(merged.sources, vec![1]);
    }

    /// Replace returns the new content verbatim.
    #[test]
    fn test_replace_strategy_keeps_newer() {
        let engine = default_engine();
        let old_text = "Old description of Rust ownership.";
        let new_text = "Updated and more detailed description of Rust ownership.";

        let replaced = engine.synthesize(old_text, 2, new_text, SynthesisStrategy::Replace);

        assert_eq!(replaced.strategy_used, SynthesisStrategy::Replace);
        assert_eq!(replaced.content, new_text);
        assert_eq!(replaced.sources, vec![2]);
    }

    /// Append adds only unseen lines, after a separator.
    #[test]
    fn test_append_strategy_concatenates() {
        let engine = default_engine();

        let appended = engine.synthesize(
            "Line one\nLine two",
            3,
            "Line two\nLine three",
            SynthesisStrategy::Append,
        );

        assert_eq!(appended.strategy_used, SynthesisStrategy::Append);
        let occurrences = appended.content.matches("Line two").count();
        assert_eq!(occurrences, 1, "duplicate lines should be removed");
        assert!(
            appended.content.contains("Line three"),
            "unique new line should be present"
        );
        assert!(
            appended.content.contains("---"),
            "separator should be present"
        );
    }

    /// A full buffer evicts its oldest entry to admit a new one.
    #[test]
    fn test_buffer_eviction_when_full() {
        let mut engine = engine_with(0.4, 3, 1);

        for (id, text) in [(1, "memory one"), (2, "memory two"), (3, "memory three")] {
            engine.add_to_buffer(id, text);
        }
        assert_eq!(engine.buffer_len(), 3);

        engine.add_to_buffer(4, "memory four");
        assert_eq!(engine.buffer_len(), 3, "buffer should not exceed capacity");

        let ids: Vec<i64> = engine.buffer.iter().map(|e| e.id).collect();
        assert!(
            !ids.contains(&1),
            "oldest entry should have been evicted, got: {ids:?}"
        );
        assert!(ids.contains(&4), "newest entry should be present");
    }

    /// Empty strings tokenize to nothing and never count as an overlap.
    #[test]
    fn test_empty_content_handled() {
        let mut engine = default_engine();

        assert!(SynthesisEngine::tokenize("").is_empty());

        let empty_vs_empty = SynthesisEngine::jaccard_similarity(&[], &[]);
        assert!((empty_vs_empty - 1.0).abs() < 1e-6);

        engine.add_to_buffer(99, "");
        assert_eq!(engine.buffer_len(), 1);

        // Similarity is 1.0 here, but the shared-token minimum filters it out.
        let hits = engine.detect_overlap("");
        assert!(hits.is_empty(), "empty new content yields no overlaps");
    }

    /// With nothing buffered there is nothing to synthesize against.
    #[test]
    fn test_check_and_synthesize_empty_buffer() {
        let engine = default_engine();
        let outcome =
            engine.check_and_synthesize("some new content here", SynthesisStrategy::Merge);
        assert!(outcome.is_none(), "empty buffer should return None");
    }

    /// Overlapping content is synthesized against the matching entry.
    #[test]
    fn test_check_and_synthesize_returns_some() {
        let mut engine = engine_with(0.3, 10, 2);
        engine.add_to_buffer(42, "async rust tokio runtime executor futures polling");

        let outcome = engine.check_and_synthesize(
            "tokio runtime executor drives async futures rust",
            SynthesisStrategy::Replace,
        );

        assert!(
            outcome.is_some(),
            "should find overlap and return synthesized memory"
        );
        let memory = outcome.unwrap();
        assert_eq!(memory.sources, vec![42]);
    }

    /// Replacing 100 bytes with 20 saves the 100 bytes of the old text.
    #[test]
    fn test_tokens_saved_replace_strategy() {
        let engine = default_engine();
        let old_text = "A".repeat(100);
        let new_text = "B".repeat(20);

        let replaced = engine.synthesize(&old_text, 5, &new_text, SynthesisStrategy::Replace);

        assert_eq!(replaced.tokens_saved, 100);
    }
}