1use chrono::{NaiveDate, Utc};
8use std::collections::HashMap;
9use std::fs;
10use std::path::Path;
11
/// A contiguous slice of a memory file that is indexed and returned as one
/// search unit.
///
/// Chunks are produced by splitting a file at `# ` / `## ` headings and after
/// a fixed number of lines, so a chunk never spans more than one file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MemoryChunk {
    /// Workspace-relative path of the file the chunk came from
    /// (e.g. `MEMORY.md` or `memory/2026-02-12.md`).
    pub path: String,
    /// 1-based line number of the first line covered by the chunk.
    pub start_line: usize,
    /// 1-based line number of the last line covered by the chunk (inclusive).
    pub end_line: usize,
    /// Chunk text with leading and trailing whitespace trimmed.
    pub text: String,
}
24
25#[derive(Debug, Clone)]
27pub struct SearchResult {
28 pub chunk: MemoryChunk,
30 pub score: f64,
32}
33
34pub struct MemoryIndex {
36 chunks: Vec<MemoryChunk>,
38 term_index: HashMap<String, Vec<usize>>,
40 doc_freq: HashMap<String, usize>,
42 total_docs: usize,
44}
45
46impl MemoryIndex {
47 pub fn new() -> Self {
49 Self {
50 chunks: Vec::new(),
51 term_index: HashMap::new(),
52 doc_freq: HashMap::new(),
53 total_docs: 0,
54 }
55 }
56
57 pub fn index_workspace(workspace: &Path) -> Result<Self, String> {
59 let mut index = Self::new();
60
61 let memory_md = workspace.join("MEMORY.md");
63 if memory_md.exists() {
64 index.index_file(&memory_md, "MEMORY.md")?;
65 }
66
67 let memory_dir = workspace.join("memory");
69 if memory_dir.exists() && memory_dir.is_dir() {
70 index.index_directory(&memory_dir, "memory")?;
71 }
72
73 index.build_inverted_index();
75
76 Ok(index)
77 }
78
79 fn index_file(&mut self, path: &Path, relative_path: &str) -> Result<(), String> {
81 let content = fs::read_to_string(path)
82 .map_err(|e| format!("Failed to read {}: {}", relative_path, e))?;
83
84 let chunks = self.chunk_content(&content, relative_path);
87 self.chunks.extend(chunks);
88
89 Ok(())
90 }
91
92 fn index_directory(&mut self, dir: &Path, relative_prefix: &str) -> Result<(), String> {
94 let entries = fs::read_dir(dir)
95 .map_err(|e| format!("Failed to read directory {}: {}", relative_prefix, e))?;
96
97 for entry in entries.flatten() {
98 let path = entry.path();
99 let name = entry.file_name().to_string_lossy().to_string();
100 let relative = format!("{}/{}", relative_prefix, name);
101
102 if path.is_file() && name.ends_with(".md") {
103 self.index_file(&path, &relative)?;
104 } else if path.is_dir() && !name.starts_with('.') {
105 self.index_directory(&path, &relative)?;
106 }
107 }
108
109 Ok(())
110 }
111
112 fn chunk_content(&self, content: &str, path: &str) -> Vec<MemoryChunk> {
114 let mut chunks = Vec::new();
115 let lines: Vec<&str> = content.lines().collect();
116
117 if lines.is_empty() {
118 return chunks;
119 }
120
121 let mut current_chunk = String::new();
123 let mut chunk_start = 1;
124 let mut line_count = 0;
125
126 for (i, line) in lines.iter().enumerate() {
127 let line_num = i + 1;
128
129 let is_heading = line.starts_with("## ") || line.starts_with("# ");
131
132 if (is_heading || line_count >= 20) && !current_chunk.trim().is_empty() {
134 chunks.push(MemoryChunk {
135 path: path.to_string(),
136 start_line: chunk_start,
137 end_line: line_num - 1,
138 text: current_chunk.trim().to_string(),
139 });
140 current_chunk = String::new();
141 chunk_start = line_num;
142 line_count = 0;
143 }
144
145 current_chunk.push_str(line);
146 current_chunk.push('\n');
147 line_count += 1;
148 }
149
150 if !current_chunk.trim().is_empty() {
152 chunks.push(MemoryChunk {
153 path: path.to_string(),
154 start_line: chunk_start,
155 end_line: lines.len(),
156 text: current_chunk.trim().to_string(),
157 });
158 }
159
160 chunks
161 }
162
163 fn build_inverted_index(&mut self) {
165 self.term_index.clear();
166 self.doc_freq.clear();
167 self.total_docs = self.chunks.len();
168
169 for (idx, chunk) in self.chunks.iter().enumerate() {
170 let terms = tokenize(&chunk.text);
171 let unique_terms: std::collections::HashSet<_> = terms.iter().collect();
172
173 for term in unique_terms {
174 self.term_index
175 .entry(term.clone())
176 .or_default()
177 .push(idx);
178
179 *self.doc_freq.entry(term.clone()).or_insert(0) += 1;
180 }
181 }
182 }
183
184 pub fn search(&self, query: &str, max_results: usize) -> Vec<SearchResult> {
186 let query_terms = tokenize(query);
187
188 if query_terms.is_empty() || self.chunks.is_empty() {
189 return Vec::new();
190 }
191
192 let mut scores: Vec<(usize, f64)> = Vec::new();
194
195 for (idx, _chunk) in self.chunks.iter().enumerate() {
196 let score = self.bm25_score(idx, &query_terms);
197 if score > 0.0 {
198 scores.push((idx, score));
199 }
200 }
201
202 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
204
205 scores
207 .into_iter()
208 .take(max_results)
209 .map(|(idx, score)| SearchResult {
210 chunk: self.chunks[idx].clone(),
211 score,
212 })
213 .collect()
214 }
215
216 fn bm25_score(&self, chunk_idx: usize, query_terms: &[String]) -> f64 {
218 const K1: f64 = 1.2;
219 const B: f64 = 0.75;
220
221 let chunk = &self.chunks[chunk_idx];
222 let chunk_terms = tokenize(&chunk.text);
223 let doc_len = chunk_terms.len() as f64;
224
225 let avg_doc_len = self.chunks.iter()
227 .map(|c| tokenize(&c.text).len())
228 .sum::<usize>() as f64 / self.total_docs.max(1) as f64;
229
230 let mut score = 0.0;
231
232 for term in query_terms {
233 let tf = chunk_terms.iter().filter(|t| *t == term).count() as f64;
234 let df = *self.doc_freq.get(term).unwrap_or(&0) as f64;
235
236 if tf > 0.0 && df > 0.0 {
237 let idf = ((self.total_docs as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
239
240 let tf_norm = (tf * (K1 + 1.0)) / (tf + K1 * (1.0 - B + B * (doc_len / avg_doc_len)));
242
243 score += idf * tf_norm;
244 }
245 }
246
247 score
248 }
249
250 pub fn search_with_decay(
261 &self,
262 query: &str,
263 max_results: usize,
264 half_life_days: f64,
265 ) -> Vec<SearchResult> {
266 let query_terms = tokenize(query);
267
268 if query_terms.is_empty() || self.chunks.is_empty() {
269 return Vec::new();
270 }
271
272 let today = Utc::now().date_naive();
273 let decay_lambda = (2.0_f64).ln() / half_life_days;
274
275 let mut scores: Vec<(usize, f64)> = Vec::new();
276
277 for (idx, chunk) in self.chunks.iter().enumerate() {
278 let base_score = self.bm25_score(idx, &query_terms);
279
280 if base_score > 0.0 {
281 let decayed_score = if Self::is_evergreen(&chunk.path) {
282 base_score } else {
284 let age_days = Self::extract_age_days(&chunk.path, today);
285 let decay = (-decay_lambda * age_days as f64).exp();
286 base_score * decay
287 };
288
289 scores.push((idx, decayed_score));
290 }
291 }
292
293 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
294
295 scores
296 .into_iter()
297 .take(max_results)
298 .map(|(idx, score)| SearchResult {
299 chunk: self.chunks[idx].clone(),
300 score,
301 })
302 .collect()
303 }
304
305 fn is_evergreen(path: &str) -> bool {
309 path == "MEMORY.md" || !path.starts_with("memory/")
310 }
311
312 fn extract_age_days(path: &str, today: NaiveDate) -> i64 {
317 if let Some(filename) = path.strip_prefix("memory/") {
319 if let Some(date_str) = filename.strip_suffix(".md") {
320 let date_part = date_str.rsplit('/').next().unwrap_or(date_str);
322 if let Ok(date) = NaiveDate::parse_from_str(date_part, "%Y-%m-%d") {
323 return (today - date).num_days().max(0);
324 }
325 }
326 }
327 0 }
329}
330
// Workspace-relative files listed as never subject to recency decay.
// Nothing in this file reads the list, hence the lint exemption below.
#[allow(dead_code)]
const EVERGREEN_FILES: &[&str] = &["MEMORY.md"];
334
335impl Default for MemoryIndex {
336 fn default() -> Self {
337 Self::new()
338 }
339}
340
/// Lower-cases `text` and splits it into search terms.
///
/// A term is a maximal run of alphanumeric characters, `-` and `_`; every
/// other character acts as a separator. Terms shorter than two bytes are
/// dropped (the length is measured in UTF-8 bytes, not characters).
fn tokenize(text: &str) -> Vec<String> {
    let lowered = text.to_lowercase();
    let is_term_char = |ch: char| ch.is_alphanumeric() || ch == '-' || ch == '_';

    let mut terms = Vec::new();
    for piece in lowered.split(|ch: char| !is_term_char(ch)) {
        if piece.len() >= 2 {
            terms.push(piece.to_owned());
        }
    }
    terms
}
349
350pub fn read_memory_file(
352 workspace: &Path,
353 relative_path: &str,
354 from_line: Option<usize>,
355 num_lines: Option<usize>,
356) -> Result<String, String> {
357 if !is_valid_memory_path(relative_path) {
359 return Err(format!(
360 "Path '{}' is not a valid memory file. Must be MEMORY.md or memory/*.md",
361 relative_path
362 ));
363 }
364
365 let full_path = workspace.join(relative_path);
366
367 if !full_path.exists() {
368 return Err(format!("Memory file not found: {}", relative_path));
369 }
370
371 let content = fs::read_to_string(&full_path)
372 .map_err(|e| format!("Failed to read {}: {}", relative_path, e))?;
373
374 let lines: Vec<&str> = content.lines().collect();
375 let total_lines = lines.len();
376
377 let start = from_line.unwrap_or(1).saturating_sub(1); let count = num_lines.unwrap_or(total_lines);
380
381 if start >= total_lines {
382 return Ok(String::new());
383 }
384
385 let end = (start + count).min(total_lines);
386 let selected: Vec<&str> = lines[start..end].to_vec();
387
388 Ok(selected.join("\n"))
389}
390
/// Whether `path` names a file that may be read as memory.
///
/// Accepted are exactly `MEMORY.md`, and any path that starts with `memory/`,
/// ends with `.md`, and contains neither `..` nor `//`.
fn is_valid_memory_path(path: &str) -> bool {
    match path {
        "MEMORY.md" => true,
        other => {
            let is_memory_markdown = other.starts_with("memory/") && other.ends_with(".md");
            // Reject parent-directory hops and empty path components.
            let has_suspicious_segment = other.contains("..") || other.contains("//");
            is_memory_markdown && !has_suspicious_segment
        }
    }
}
405
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Creates a temporary workspace holding `MEMORY.md` and one dated note
    /// under `memory/`. The directory is removed when the returned guard is
    /// dropped.
    fn setup_test_workspace() -> TempDir {
        let dir = TempDir::new().unwrap();

        fs::write(
            dir.path().join("MEMORY.md"),
            "# Long-term Memory\n\n## Preferences\nUser prefers dark mode.\nFavorite color is blue.\n\n## Projects\nWorking on RustyClaw.\n"
        ).unwrap();

        fs::create_dir(dir.path().join("memory")).unwrap();

        fs::write(
            dir.path().join("memory/2026-02-12.md"),
            "# 2026-02-12\n\n## Morning\nStarted implementing memory tools.\n\n## Afternoon\nWorking on BM25 search.\n"
        ).unwrap();

        dir
    }

    #[test]
    fn test_index_workspace() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        assert!(!index.chunks.is_empty());
        assert!(index.total_docs > 0);
    }

    #[test]
    fn test_search_finds_relevant() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        let results = index.search("dark mode", 5);
        assert!(!results.is_empty());
        assert!(results[0].chunk.text.contains("dark mode"));
    }

    #[test]
    fn test_search_empty_query() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        let results = index.search("", 5);
        assert!(results.is_empty());
    }

    #[test]
    fn test_read_memory_file() {
        let workspace = setup_test_workspace();

        let content = read_memory_file(workspace.path(), "MEMORY.md", None, None).unwrap();
        assert!(content.contains("Long-term Memory"));
    }

    #[test]
    fn test_read_memory_file_with_range() {
        let workspace = setup_test_workspace();

        // Lines 3 and 4 of the fixture, joined without a trailing newline.
        let content = read_memory_file(workspace.path(), "MEMORY.md", Some(3), Some(2)).unwrap();
        assert_eq!(content, "## Preferences\nUser prefers dark mode.");
    }

    #[test]
    fn test_read_memory_file_past_end() {
        let workspace = setup_test_workspace();

        // Starting beyond the last line is not an error; it yields nothing.
        let content = read_memory_file(workspace.path(), "MEMORY.md", Some(100), None).unwrap();
        assert!(content.is_empty());
    }

    #[test]
    fn test_read_memory_file_invalid_path() {
        let workspace = setup_test_workspace();

        let result = read_memory_file(workspace.path(), "../etc/passwd", None, None);
        assert!(result.is_err());
    }

    #[test]
    fn test_valid_memory_paths() {
        assert!(is_valid_memory_path("MEMORY.md"));
        assert!(is_valid_memory_path("memory/2026-02-12.md"));
        assert!(is_valid_memory_path("memory/notes/work.md"));

        assert!(!is_valid_memory_path("../secret.md"));
        assert!(!is_valid_memory_path("memory/../../../etc/passwd"));
        assert!(!is_valid_memory_path("src/main.rs"));
        assert!(!is_valid_memory_path("memory/file.txt"));
    }

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("Hello, World! This is a TEST.");
        assert!(tokens.contains(&"hello".to_string()));
        assert!(tokens.contains(&"world".to_string()));
        assert!(tokens.contains(&"test".to_string()));
        // Single-character words are filtered out.
        assert!(!tokens.contains(&"a".to_string()));
    }

    #[test]
    fn test_search_with_decay() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        let results = index.search_with_decay("memory tools", 5, 30.0);
        assert!(!results.is_empty());
    }

    #[test]
    fn test_is_evergreen() {
        assert!(MemoryIndex::is_evergreen("MEMORY.md"));
        assert!(MemoryIndex::is_evergreen("SOUL.md"));
        assert!(!MemoryIndex::is_evergreen("memory/2026-02-20.md"));
        assert!(!MemoryIndex::is_evergreen("memory/notes/2026-02-20.md"));
    }

    #[test]
    fn test_extract_age_days() {
        use chrono::NaiveDate;

        let today = NaiveDate::from_ymd_opt(2026, 2, 20).unwrap();

        // Dated note, five days old.
        let age = MemoryIndex::extract_age_days("memory/2026-02-15.md", today);
        assert_eq!(age, 5);

        // Note dated today.
        let age = MemoryIndex::extract_age_days("memory/2026-02-20.md", today);
        assert_eq!(age, 0);

        // No date in the file name.
        let age = MemoryIndex::extract_age_days("memory/notes.md", today);
        assert_eq!(age, 0);

        // Dated note inside a sub-directory.
        let age = MemoryIndex::extract_age_days("memory/project/2026-02-10.md", today);
        assert_eq!(age, 10);
    }

    #[test]
    fn test_recency_affects_ranking() {
        let dir = TempDir::new().unwrap();
        fs::create_dir(dir.path().join("memory")).unwrap();

        fs::write(
            dir.path().join("memory/2026-01-01.md"),
            "# Old Note\nThis contains important search term.\n"
        ).unwrap();

        fs::write(
            dir.path().join("memory/2026-02-19.md"),
            "# Recent Note\nThis also contains important search term.\n"
        ).unwrap();

        let index = MemoryIndex::index_workspace(dir.path()).unwrap();

        // NOTE: ages are measured against the real current date, so this
        // relies on the test running after both note dates.
        let results = index.search_with_decay("important search term", 2, 30.0);
        assert_eq!(results.len(), 2);
        assert!(results[0].chunk.path.contains("2026-02"));
    }
}