1use chrono::{NaiveDate, Utc};
8use std::collections::HashMap;
9use std::fs;
10use std::path::Path;
11
/// A contiguous run of lines taken from one indexed file.
#[derive(Debug, Clone)]
pub struct MemoryChunk {
    /// Path label recorded for the source file (e.g. `MEMORY.md` or `memory/<name>.md`).
    pub path: String,
    /// 1-based number of the first line covered by this chunk.
    pub start_line: usize,
    /// 1-based number of the last line covered by this chunk (inclusive).
    pub end_line: usize,
    /// Text of the chunk with leading and trailing whitespace trimmed.
    pub text: String,
}
24
/// A chunk returned by a search together with the score it was ranked by.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Copy of the matching chunk.
    pub chunk: MemoryChunk,
    /// Ranking score; searches return results ordered from highest to lowest score.
    pub score: f64,
}
33
/// In-memory keyword index over the chunks of a workspace's memory files.
pub struct MemoryIndex {
    /// Every indexed chunk, in the order the files were read.
    chunks: Vec<MemoryChunk>,
    /// For each term, the positions in `chunks` of the chunks that contain it.
    term_index: HashMap<String, Vec<usize>>,
    /// For each term, the number of chunks that contain it.
    doc_freq: HashMap<String, usize>,
    /// Number of chunks at the time the inverted index was last built.
    total_docs: usize,
}
45
46impl MemoryIndex {
47 pub fn new() -> Self {
49 Self {
50 chunks: Vec::new(),
51 term_index: HashMap::new(),
52 doc_freq: HashMap::new(),
53 total_docs: 0,
54 }
55 }
56
57 pub fn index_workspace(workspace: &Path) -> Result<Self, String> {
59 let mut index = Self::new();
60
61 let memory_md = workspace.join("MEMORY.md");
63 if memory_md.exists() {
64 index.index_file(&memory_md, "MEMORY.md")?;
65 }
66
67 let memory_dir = workspace.join("memory");
69 if memory_dir.exists() && memory_dir.is_dir() {
70 index.index_directory(&memory_dir, "memory")?;
71 }
72
73 index.build_inverted_index();
75
76 Ok(index)
77 }
78
79 fn index_file(&mut self, path: &Path, relative_path: &str) -> Result<(), String> {
81 let content = fs::read_to_string(path)
82 .map_err(|e| format!("Failed to read {}: {}", relative_path, e))?;
83
84 let chunks = self.chunk_content(&content, relative_path);
87 self.chunks.extend(chunks);
88
89 Ok(())
90 }
91
92 fn index_directory(&mut self, dir: &Path, relative_prefix: &str) -> Result<(), String> {
94 let entries = fs::read_dir(dir)
95 .map_err(|e| format!("Failed to read directory {}: {}", relative_prefix, e))?;
96
97 for entry in entries.flatten() {
98 let path = entry.path();
99 let name = entry.file_name().to_string_lossy().to_string();
100 let relative = format!("{}/{}", relative_prefix, name);
101
102 if path.is_file() && name.ends_with(".md") {
103 self.index_file(&path, &relative)?;
104 } else if path.is_dir() && !name.starts_with('.') {
105 self.index_directory(&path, &relative)?;
106 }
107 }
108
109 Ok(())
110 }
111
112 fn chunk_content(&self, content: &str, path: &str) -> Vec<MemoryChunk> {
114 let mut chunks = Vec::new();
115 let lines: Vec<&str> = content.lines().collect();
116
117 if lines.is_empty() {
118 return chunks;
119 }
120
121 let mut current_chunk = String::new();
123 let mut chunk_start = 1;
124 let mut line_count = 0;
125
126 for (i, line) in lines.iter().enumerate() {
127 let line_num = i + 1;
128
129 let is_heading = line.starts_with("## ") || line.starts_with("# ");
131
132 if (is_heading || line_count >= 20) && !current_chunk.trim().is_empty() {
134 chunks.push(MemoryChunk {
135 path: path.to_string(),
136 start_line: chunk_start,
137 end_line: line_num - 1,
138 text: current_chunk.trim().to_string(),
139 });
140 current_chunk = String::new();
141 chunk_start = line_num;
142 line_count = 0;
143 }
144
145 current_chunk.push_str(line);
146 current_chunk.push('\n');
147 line_count += 1;
148 }
149
150 if !current_chunk.trim().is_empty() {
152 chunks.push(MemoryChunk {
153 path: path.to_string(),
154 start_line: chunk_start,
155 end_line: lines.len(),
156 text: current_chunk.trim().to_string(),
157 });
158 }
159
160 chunks
161 }
162
163 fn build_inverted_index(&mut self) {
165 self.term_index.clear();
166 self.doc_freq.clear();
167 self.total_docs = self.chunks.len();
168
169 for (idx, chunk) in self.chunks.iter().enumerate() {
170 let terms = tokenize(&chunk.text);
171 let unique_terms: std::collections::HashSet<_> = terms.iter().collect();
172
173 for term in unique_terms {
174 self.term_index.entry(term.clone()).or_default().push(idx);
175
176 *self.doc_freq.entry(term.clone()).or_insert(0) += 1;
177 }
178 }
179 }
180
181 pub fn search(&self, query: &str, max_results: usize) -> Vec<SearchResult> {
183 let query_terms = tokenize(query);
184
185 if query_terms.is_empty() || self.chunks.is_empty() {
186 return Vec::new();
187 }
188
189 let mut scores: Vec<(usize, f64)> = Vec::new();
191
192 for (idx, _chunk) in self.chunks.iter().enumerate() {
193 let score = self.bm25_score(idx, &query_terms);
194 if score > 0.0 {
195 scores.push((idx, score));
196 }
197 }
198
199 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
201
202 scores
204 .into_iter()
205 .take(max_results)
206 .map(|(idx, score)| SearchResult {
207 chunk: self.chunks[idx].clone(),
208 score,
209 })
210 .collect()
211 }
212
213 fn bm25_score(&self, chunk_idx: usize, query_terms: &[String]) -> f64 {
215 const K1: f64 = 1.2;
216 const B: f64 = 0.75;
217
218 let chunk = &self.chunks[chunk_idx];
219 let chunk_terms = tokenize(&chunk.text);
220 let doc_len = chunk_terms.len() as f64;
221
222 let avg_doc_len = self
224 .chunks
225 .iter()
226 .map(|c| tokenize(&c.text).len())
227 .sum::<usize>() as f64
228 / self.total_docs.max(1) as f64;
229
230 let mut score = 0.0;
231
232 for term in query_terms {
233 let tf = chunk_terms.iter().filter(|t| *t == term).count() as f64;
234 let df = *self.doc_freq.get(term).unwrap_or(&0) as f64;
235
236 if tf > 0.0 && df > 0.0 {
237 let idf = ((self.total_docs as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
239
240 let tf_norm =
242 (tf * (K1 + 1.0)) / (tf + K1 * (1.0 - B + B * (doc_len / avg_doc_len)));
243
244 score += idf * tf_norm;
245 }
246 }
247
248 score
249 }
250
251 pub fn search_with_decay(
262 &self,
263 query: &str,
264 max_results: usize,
265 half_life_days: f64,
266 ) -> Vec<SearchResult> {
267 let query_terms = tokenize(query);
268
269 if query_terms.is_empty() || self.chunks.is_empty() {
270 return Vec::new();
271 }
272
273 let today = Utc::now().date_naive();
274 let decay_lambda = (2.0_f64).ln() / half_life_days;
275
276 let mut scores: Vec<(usize, f64)> = Vec::new();
277
278 for (idx, chunk) in self.chunks.iter().enumerate() {
279 let base_score = self.bm25_score(idx, &query_terms);
280
281 if base_score > 0.0 {
282 let decayed_score = if Self::is_evergreen(&chunk.path) {
283 base_score } else {
285 let age_days = Self::extract_age_days(&chunk.path, today);
286 let decay = (-decay_lambda * age_days as f64).exp();
287 base_score * decay
288 };
289
290 scores.push((idx, decayed_score));
291 }
292 }
293
294 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
295
296 scores
297 .into_iter()
298 .take(max_results)
299 .map(|(idx, score)| SearchResult {
300 chunk: self.chunks[idx].clone(),
301 score,
302 })
303 .collect()
304 }
305
/// Reports whether `path` is exempt from recency decay.
///
/// Everything outside the `memory/` prefix counts as evergreen; this includes
/// `MEMORY.md`, which does not start with that (case-sensitive) prefix.
fn is_evergreen(path: &str) -> bool {
    path.strip_prefix("memory/").is_none()
}
312
313 fn extract_age_days(path: &str, today: NaiveDate) -> i64 {
318 if let Some(filename) = path.strip_prefix("memory/") {
320 if let Some(date_str) = filename.strip_suffix(".md") {
321 let date_part = date_str.rsplit('/').next().unwrap_or(date_str);
323 if let Ok(date) = NaiveDate::parse_from_str(date_part, "%Y-%m-%d") {
324 return (today - date).num_days().max(0);
325 }
326 }
327 }
328 0 }
330}
331
/// Names of files listed as evergreen.
// Not referenced by any code in this file, hence the dead-code allowance.
#[allow(dead_code)]
const EVERGREEN_FILES: &[&str] = &["MEMORY.md"];
335
336impl Default for MemoryIndex {
337 fn default() -> Self {
338 Self::new()
339 }
340}
341
/// Splits `text` into lowercase search terms.
///
/// Terms are maximal runs of alphanumeric characters, `-` and `_`. Runs
/// shorter than two bytes are dropped.
fn tokenize(text: &str) -> Vec<String> {
    let lowered = text.to_lowercase();
    let mut tokens = Vec::new();

    for piece in lowered.split(|ch: char| !(ch.is_alphanumeric() || ch == '-' || ch == '_')) {
        // Length is measured in bytes, so a single multi-byte character is kept.
        if piece.len() > 1 {
            tokens.push(piece.to_owned());
        }
    }

    tokens
}
350
351pub fn read_memory_file(
353 workspace: &Path,
354 relative_path: &str,
355 from_line: Option<usize>,
356 num_lines: Option<usize>,
357) -> Result<String, String> {
358 if !is_valid_memory_path(relative_path) {
360 return Err(format!(
361 "Path '{}' is not a valid memory file. Must be MEMORY.md or memory/*.md",
362 relative_path
363 ));
364 }
365
366 let full_path = workspace.join(relative_path);
367
368 if !full_path.exists() {
369 return Err(format!("Memory file not found: {}", relative_path));
370 }
371
372 let content = fs::read_to_string(&full_path)
373 .map_err(|e| format!("Failed to read {}: {}", relative_path, e))?;
374
375 let lines: Vec<&str> = content.lines().collect();
376 let total_lines = lines.len();
377
378 let start = from_line.unwrap_or(1).saturating_sub(1); let count = num_lines.unwrap_or(total_lines);
381
382 if start >= total_lines {
383 return Ok(String::new());
384 }
385
386 let end = (start + count).min(total_lines);
387 let selected: Vec<&str> = lines[start..end].to_vec();
388
389 Ok(selected.join("\n"))
390}
391
/// Reports whether `path` names a file the memory reader may open.
///
/// Accepted are exactly `MEMORY.md`, and any path that starts with `memory/`,
/// ends with `.md`, and contains neither `..` nor `//`.
fn is_valid_memory_path(path: &str) -> bool {
    if path == "MEMORY.md" {
        return true;
    }

    let in_memory_dir = path.starts_with("memory/");
    let is_markdown = path.ends_with(".md");
    // Reject parent-directory hops and empty path segments.
    let has_forbidden_part = ["..", "//"].iter().any(|needle| path.contains(*needle));

    in_memory_dir && is_markdown && !has_forbidden_part
}
406
// Unit tests for indexing, searching, recency decay and the memory file reader.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Creates a temporary workspace containing `MEMORY.md` and one dated
    /// note, `memory/2026-02-12.md`.
    fn setup_test_workspace() -> TempDir {
        let dir = TempDir::new().unwrap();

        fs::write(
            dir.path().join("MEMORY.md"),
            "# Long-term Memory\n\n## Preferences\nUser prefers dark mode.\nFavorite color is blue.\n\n## Projects\nWorking on RustyClaw.\n"
        ).unwrap();

        fs::create_dir(dir.path().join("memory")).unwrap();

        fs::write(
            dir.path().join("memory/2026-02-12.md"),
            "# 2026-02-12\n\n## Morning\nStarted implementing memory tools.\n\n## Afternoon\nWorking on BM25 search.\n"
        ).unwrap();

        dir
    }

    /// Indexing the sample workspace produces at least one chunk.
    #[test]
    fn test_index_workspace() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        assert!(!index.chunks.is_empty());
        assert!(index.total_docs > 0);
    }

    /// The top hit for "dark mode" is a chunk containing that phrase.
    #[test]
    fn test_search_finds_relevant() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        let results = index.search("dark mode", 5);
        assert!(!results.is_empty());
        assert!(results[0].chunk.text.contains("dark mode"));
    }

    /// An empty query returns no results.
    #[test]
    fn test_search_empty_query() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        let results = index.search("", 5);
        assert!(results.is_empty());
    }

    /// Reading without a range returns the file content.
    #[test]
    fn test_read_memory_file() {
        let workspace = setup_test_workspace();

        let content = read_memory_file(workspace.path(), "MEMORY.md", None, None).unwrap();
        assert!(content.contains("Long-term Memory"));
    }

    /// Requests two lines starting at line 3; only non-emptiness is checked.
    #[test]
    fn test_read_memory_file_with_range() {
        let workspace = setup_test_workspace();

        let content = read_memory_file(workspace.path(), "MEMORY.md", Some(3), Some(2)).unwrap();
        assert!(!content.is_empty());
    }

    /// A path that escapes the workspace is rejected.
    #[test]
    fn test_read_memory_file_invalid_path() {
        let workspace = setup_test_workspace();

        let result = read_memory_file(workspace.path(), "../etc/passwd", None, None);
        assert!(result.is_err());
    }

    /// Accepted and rejected shapes of memory paths.
    #[test]
    fn test_valid_memory_paths() {
        assert!(is_valid_memory_path("MEMORY.md"));
        assert!(is_valid_memory_path("memory/2026-02-12.md"));
        assert!(is_valid_memory_path("memory/notes/work.md"));

        assert!(!is_valid_memory_path("../secret.md"));
        assert!(!is_valid_memory_path("memory/../../../etc/passwd"));
        assert!(!is_valid_memory_path("src/main.rs"));
        assert!(!is_valid_memory_path("memory/file.txt"));
    }

    /// Tokens are lowercased and one-letter words are dropped.
    #[test]
    fn test_tokenize() {
        let tokens = tokenize("Hello, World! This is a TEST.");
        assert!(tokens.contains(&"hello".to_string()));
        assert!(tokens.contains(&"world".to_string()));
        assert!(tokens.contains(&"test".to_string()));
        assert!(!tokens.contains(&"a".to_string()));
    }

    /// A decayed search still finds matching chunks.
    #[test]
    fn test_search_with_decay() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        let results = index.search_with_decay("memory tools", 5, 30.0);
        assert!(!results.is_empty());
    }

    /// Only paths under `memory/` are subject to decay.
    #[test]
    fn test_is_evergreen() {
        assert!(MemoryIndex::is_evergreen("MEMORY.md"));
        assert!(MemoryIndex::is_evergreen("SOUL.md"));
        assert!(!MemoryIndex::is_evergreen("memory/2026-02-20.md"));
        assert!(!MemoryIndex::is_evergreen("memory/notes/2026-02-20.md"));
    }

    /// Ages are computed against a fixed `today`, so this test does not depend
    /// on the wall clock.
    #[test]
    fn test_extract_age_days() {
        use chrono::NaiveDate;

        let today = NaiveDate::from_ymd_opt(2026, 2, 20).unwrap();

        let age = MemoryIndex::extract_age_days("memory/2026-02-15.md", today);
        assert_eq!(age, 5);

        let age = MemoryIndex::extract_age_days("memory/2026-02-20.md", today);
        assert_eq!(age, 0);

        // No date in the file name: treated as age zero.
        let age = MemoryIndex::extract_age_days("memory/notes.md", today);
        assert_eq!(age, 0);

        // The date is taken from the last path segment.
        let age = MemoryIndex::extract_age_days("memory/project/2026-02-10.md", today);
        assert_eq!(age, 10);
    }

    /// The more recently dated note should outrank the older one.
    // NOTE(review): `search_with_decay` takes "today" from `Utc::now()`, and
    // `extract_age_days` clamps future-dated notes to age 0, so the expected
    // ordering presumably only holds when the system clock is past the older
    // file's date (2026-01-01) — confirm whether that is acceptable.
    #[test]
    fn test_recency_affects_ranking() {
        let dir = TempDir::new().unwrap();
        fs::create_dir(dir.path().join("memory")).unwrap();

        fs::write(
            dir.path().join("memory/2026-01-01.md"),
            "# Old Note\nThis contains important search term.\n",
        )
        .unwrap();

        fs::write(
            dir.path().join("memory/2026-02-19.md"),
            "# Recent Note\nThis also contains important search term.\n",
        )
        .unwrap();

        let index = MemoryIndex::index_workspace(dir.path()).unwrap();

        let results = index.search_with_decay("important search term", 2, 30.0);
        assert_eq!(results.len(), 2);
        assert!(results[0].chunk.path.contains("2026-02"));
    }
}