// rustyclaw_core/memory.rs
1//! Memory search and retrieval for RustyClaw.
2//!
3//! Provides semantic-like search over `MEMORY.md` and `memory/*.md` files.
4//! Current implementation uses keyword/BM25-style matching with temporal decay
5//! for recency weighting. Embeddings can be added later for true semantic search.
6
7use chrono::{NaiveDate, Utc};
8use std::collections::HashMap;
9use std::fs;
10use std::path::Path;
11
/// A chunk of text from a memory file with metadata.
///
/// Chunks are produced by splitting a file at headings (and roughly every
/// 20 lines), so each search result can point back to a precise line range.
#[derive(Debug, Clone)]
pub struct MemoryChunk {
    /// Source file path (relative to workspace), e.g. "memory/2026-02-12.md".
    pub path: String,
    /// Starting line number (1-indexed).
    pub start_line: usize,
    /// Ending line number (1-indexed, inclusive).
    pub end_line: usize,
    /// The text content of this chunk (whitespace-trimmed at both ends).
    pub text: String,
}
24
25/// A search result with relevance score.
26#[derive(Debug, Clone)]
27pub struct SearchResult {
28    /// The matching chunk.
29    pub chunk: MemoryChunk,
30    /// Relevance score (higher is better).
31    pub score: f64,
32}
33
34/// Memory search index.
35pub struct MemoryIndex {
36    /// All indexed chunks.
37    chunks: Vec<MemoryChunk>,
38    /// Inverted index: term -> chunk indices.
39    term_index: HashMap<String, Vec<usize>>,
40    /// Document frequency for each term.
41    doc_freq: HashMap<String, usize>,
42    /// Total number of chunks.
43    total_docs: usize,
44}
45
46impl MemoryIndex {
47    /// Create a new empty index.
48    pub fn new() -> Self {
49        Self {
50            chunks: Vec::new(),
51            term_index: HashMap::new(),
52            doc_freq: HashMap::new(),
53            total_docs: 0,
54        }
55    }
56
57    /// Index all memory files in a workspace.
58    pub fn index_workspace(workspace: &Path) -> Result<Self, String> {
59        let mut index = Self::new();
60
61        // Index MEMORY.md if it exists
62        let memory_md = workspace.join("MEMORY.md");
63        if memory_md.exists() {
64            index.index_file(&memory_md, "MEMORY.md")?;
65        }
66
67        // Index memory/*.md
68        let memory_dir = workspace.join("memory");
69        if memory_dir.exists() && memory_dir.is_dir() {
70            index.index_directory(&memory_dir, "memory")?;
71        }
72
73        // Build inverted index
74        index.build_inverted_index();
75
76        Ok(index)
77    }
78
79    /// Index a single file.
80    fn index_file(&mut self, path: &Path, relative_path: &str) -> Result<(), String> {
81        let content = fs::read_to_string(path)
82            .map_err(|e| format!("Failed to read {}: {}", relative_path, e))?;
83
84        // Split into chunks (~400 tokens target, roughly 300-400 words)
85        // For simplicity, we chunk by paragraphs or heading sections
86        let chunks = self.chunk_content(&content, relative_path);
87        self.chunks.extend(chunks);
88
89        Ok(())
90    }
91
92    /// Index a directory recursively.
93    fn index_directory(&mut self, dir: &Path, relative_prefix: &str) -> Result<(), String> {
94        let entries = fs::read_dir(dir)
95            .map_err(|e| format!("Failed to read directory {}: {}", relative_prefix, e))?;
96
97        for entry in entries.flatten() {
98            let path = entry.path();
99            let name = entry.file_name().to_string_lossy().to_string();
100            let relative = format!("{}/{}", relative_prefix, name);
101
102            if path.is_file() && name.ends_with(".md") {
103                self.index_file(&path, &relative)?;
104            } else if path.is_dir() && !name.starts_with('.') {
105                self.index_directory(&path, &relative)?;
106            }
107        }
108
109        Ok(())
110    }
111
112    /// Chunk content into searchable pieces.
113    fn chunk_content(&self, content: &str, path: &str) -> Vec<MemoryChunk> {
114        let mut chunks = Vec::new();
115        let lines: Vec<&str> = content.lines().collect();
116
117        if lines.is_empty() {
118            return chunks;
119        }
120
121        // Chunk by sections (## headings) or every ~20 lines
122        let mut current_chunk = String::new();
123        let mut chunk_start = 1;
124        let mut line_count = 0;
125
126        for (i, line) in lines.iter().enumerate() {
127            let line_num = i + 1;
128
129            // Check if this is a heading that should start a new chunk
130            let is_heading = line.starts_with("## ") || line.starts_with("# ");
131
132            // Start new chunk on heading or every ~20 lines (if we have content)
133            if (is_heading || line_count >= 20) && !current_chunk.trim().is_empty() {
134                chunks.push(MemoryChunk {
135                    path: path.to_string(),
136                    start_line: chunk_start,
137                    end_line: line_num - 1,
138                    text: current_chunk.trim().to_string(),
139                });
140                current_chunk = String::new();
141                chunk_start = line_num;
142                line_count = 0;
143            }
144
145            current_chunk.push_str(line);
146            current_chunk.push('\n');
147            line_count += 1;
148        }
149
150        // Don't forget the last chunk
151        if !current_chunk.trim().is_empty() {
152            chunks.push(MemoryChunk {
153                path: path.to_string(),
154                start_line: chunk_start,
155                end_line: lines.len(),
156                text: current_chunk.trim().to_string(),
157            });
158        }
159
160        chunks
161    }
162
163    /// Build the inverted index for BM25 search.
164    fn build_inverted_index(&mut self) {
165        self.term_index.clear();
166        self.doc_freq.clear();
167        self.total_docs = self.chunks.len();
168
169        for (idx, chunk) in self.chunks.iter().enumerate() {
170            let terms = tokenize(&chunk.text);
171            let unique_terms: std::collections::HashSet<_> = terms.iter().collect();
172
173            for term in unique_terms {
174                self.term_index.entry(term.clone()).or_default().push(idx);
175
176                *self.doc_freq.entry(term.clone()).or_insert(0) += 1;
177            }
178        }
179    }
180
181    /// Search the index using BM25-style scoring.
182    pub fn search(&self, query: &str, max_results: usize) -> Vec<SearchResult> {
183        let query_terms = tokenize(query);
184
185        if query_terms.is_empty() || self.chunks.is_empty() {
186            return Vec::new();
187        }
188
189        // Score each chunk
190        let mut scores: Vec<(usize, f64)> = Vec::new();
191
192        for (idx, _chunk) in self.chunks.iter().enumerate() {
193            let score = self.bm25_score(idx, &query_terms);
194            if score > 0.0 {
195                scores.push((idx, score));
196            }
197        }
198
199        // Sort by score descending
200        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
201
202        // Return top results
203        scores
204            .into_iter()
205            .take(max_results)
206            .map(|(idx, score)| SearchResult {
207                chunk: self.chunks[idx].clone(),
208                score,
209            })
210            .collect()
211    }
212
213    /// Calculate BM25 score for a chunk.
214    fn bm25_score(&self, chunk_idx: usize, query_terms: &[String]) -> f64 {
215        const K1: f64 = 1.2;
216        const B: f64 = 0.75;
217
218        let chunk = &self.chunks[chunk_idx];
219        let chunk_terms = tokenize(&chunk.text);
220        let doc_len = chunk_terms.len() as f64;
221
222        // Calculate average document length
223        let avg_doc_len = self
224            .chunks
225            .iter()
226            .map(|c| tokenize(&c.text).len())
227            .sum::<usize>() as f64
228            / self.total_docs.max(1) as f64;
229
230        let mut score = 0.0;
231
232        for term in query_terms {
233            let tf = chunk_terms.iter().filter(|t| *t == term).count() as f64;
234            let df = *self.doc_freq.get(term).unwrap_or(&0) as f64;
235
236            if tf > 0.0 && df > 0.0 {
237                // IDF component
238                let idf = ((self.total_docs as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
239
240                // TF component with length normalization
241                let tf_norm =
242                    (tf * (K1 + 1.0)) / (tf + K1 * (1.0 - B + B * (doc_len / avg_doc_len)));
243
244                score += idf * tf_norm;
245            }
246        }
247
248        score
249    }
250
251    /// Search with temporal decay (recency weighting).
252    ///
253    /// Recent memory files are boosted using exponential decay with configurable
254    /// half-life. Files that don't have a date in their path (like MEMORY.md)
255    /// are treated as "evergreen" and don't decay.
256    ///
257    /// # Arguments
258    /// * `query` - Search query string
259    /// * `max_results` - Maximum number of results to return
260    /// * `half_life_days` - Half-life for temporal decay in days (default: 30)
261    pub fn search_with_decay(
262        &self,
263        query: &str,
264        max_results: usize,
265        half_life_days: f64,
266    ) -> Vec<SearchResult> {
267        let query_terms = tokenize(query);
268
269        if query_terms.is_empty() || self.chunks.is_empty() {
270            return Vec::new();
271        }
272
273        let today = Utc::now().date_naive();
274        let decay_lambda = (2.0_f64).ln() / half_life_days;
275
276        let mut scores: Vec<(usize, f64)> = Vec::new();
277
278        for (idx, chunk) in self.chunks.iter().enumerate() {
279            let base_score = self.bm25_score(idx, &query_terms);
280
281            if base_score > 0.0 {
282                let decayed_score = if Self::is_evergreen(&chunk.path) {
283                    base_score // No decay for evergreen files
284                } else {
285                    let age_days = Self::extract_age_days(&chunk.path, today);
286                    let decay = (-decay_lambda * age_days as f64).exp();
287                    base_score * decay
288                };
289
290                scores.push((idx, decayed_score));
291            }
292        }
293
294        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
295
296        scores
297            .into_iter()
298            .take(max_results)
299            .map(|(idx, score)| SearchResult {
300                chunk: self.chunks[idx].clone(),
301                score,
302            })
303            .collect()
304    }
305
306    /// Check if a file path is "evergreen" (shouldn't decay).
307    ///
308    /// Evergreen files include MEMORY.md and any file not in the memory/ directory.
309    fn is_evergreen(path: &str) -> bool {
310        path == "MEMORY.md" || !path.starts_with("memory/")
311    }
312
313    /// Extract the age in days from a dated file path.
314    ///
315    /// Expects paths like "memory/2026-02-20.md" and returns days since that date.
316    /// Returns 0 for paths without a parseable date.
317    fn extract_age_days(path: &str, today: NaiveDate) -> i64 {
318        // Try to extract date from path like "memory/2026-02-20.md"
319        if let Some(filename) = path.strip_prefix("memory/") {
320            if let Some(date_str) = filename.strip_suffix(".md") {
321                // Handle nested paths like "memory/subfolder/2026-02-20.md"
322                let date_part = date_str.rsplit('/').next().unwrap_or(date_str);
323                if let Ok(date) = NaiveDate::parse_from_str(date_part, "%Y-%m-%d") {
324                    return (today - date).num_days().max(0);
325                }
326            }
327        }
328        0 // Unknown date = no decay
329    }
330}
331
/// Files that should never be decayed (evergreen).
///
/// NOTE(review): currently unreferenced — `is_evergreen` hard-codes the same
/// rule inline. Presumably kept (hence `allow(dead_code)`) for a future
/// configurable evergreen list; confirm before removing.
#[allow(dead_code)]
const EVERGREEN_FILES: &[&str] = &["MEMORY.md"];
335
336impl Default for MemoryIndex {
337    fn default() -> Self {
338        Self::new()
339    }
340}
341
/// Tokenize text into lowercase terms for indexing/searching.
///
/// Lowercases the input, splits on any character that is not alphanumeric,
/// '-' or '_', and drops tokens shorter than two *characters*. The length
/// filter counts chars rather than bytes, so a single multi-byte character
/// (e.g. "é", 2 bytes but 1 char) is filtered like its ASCII counterparts
/// instead of slipping through on byte length.
fn tokenize(text: &str) -> Vec<String> {
    text.to_lowercase()
        .split(|c: char| !c.is_alphanumeric() && c != '-' && c != '_')
        .filter(|s| s.chars().count() >= 2) // char count, not byte length
        .map(|s| s.to_string())
        .collect()
}
350
351/// Read specific lines from a memory file.
352pub fn read_memory_file(
353    workspace: &Path,
354    relative_path: &str,
355    from_line: Option<usize>,
356    num_lines: Option<usize>,
357) -> Result<String, String> {
358    // Validate path is within memory scope
359    if !is_valid_memory_path(relative_path) {
360        return Err(format!(
361            "Path '{}' is not a valid memory file. Must be MEMORY.md or memory/*.md",
362            relative_path
363        ));
364    }
365
366    let full_path = workspace.join(relative_path);
367
368    if !full_path.exists() {
369        return Err(format!("Memory file not found: {}", relative_path));
370    }
371
372    let content = fs::read_to_string(&full_path)
373        .map_err(|e| format!("Failed to read {}: {}", relative_path, e))?;
374
375    let lines: Vec<&str> = content.lines().collect();
376    let total_lines = lines.len();
377
378    // Handle line range
379    let start = from_line.unwrap_or(1).saturating_sub(1); // Convert to 0-indexed
380    let count = num_lines.unwrap_or(total_lines);
381
382    if start >= total_lines {
383        return Ok(String::new());
384    }
385
386    let end = (start + count).min(total_lines);
387    let selected: Vec<&str> = lines[start..end].to_vec();
388
389    Ok(selected.join("\n"))
390}
391
/// Check if a path is a valid memory file path.
///
/// Accepts exactly `MEMORY.md` or a `.md` file under `memory/`. The
/// traversal guard is intentionally conservative: any ".." or "//" anywhere
/// in the path is rejected, which also rejects odd-but-harmless names like
/// "memory/a..b.md" in exchange for simplicity.
fn is_valid_memory_path(path: &str) -> bool {
    if path == "MEMORY.md" {
        return true;
    }

    let in_memory_dir = path.starts_with("memory/") && path.ends_with(".md");
    in_memory_dir && !path.contains("..") && !path.contains("//")
}
406
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    // Builds a throwaway workspace with one MEMORY.md and one dated daily
    // note under memory/, mirroring the layout index_workspace expects.
    fn setup_test_workspace() -> TempDir {
        let dir = TempDir::new().unwrap();

        // Create MEMORY.md
        fs::write(
            dir.path().join("MEMORY.md"),
            "# Long-term Memory\n\n## Preferences\nUser prefers dark mode.\nFavorite color is blue.\n\n## Projects\nWorking on RustyClaw.\n"
        ).unwrap();

        // Create memory directory
        fs::create_dir(dir.path().join("memory")).unwrap();

        // Create daily note
        fs::write(
            dir.path().join("memory/2026-02-12.md"),
            "# 2026-02-12\n\n## Morning\nStarted implementing memory tools.\n\n## Afternoon\nWorking on BM25 search.\n"
        ).unwrap();

        dir
    }

    #[test]
    fn test_index_workspace() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        // Both fixture files should have produced chunks.
        assert!(!index.chunks.is_empty());
        assert!(index.total_docs > 0);
    }

    #[test]
    fn test_search_finds_relevant() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        let results = index.search("dark mode", 5);
        assert!(!results.is_empty());
        assert!(results[0].chunk.text.contains("dark mode"));
    }

    #[test]
    fn test_search_empty_query() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        // An empty query tokenizes to nothing and must return no results.
        let results = index.search("", 5);
        assert!(results.is_empty());
    }

    #[test]
    fn test_read_memory_file() {
        let workspace = setup_test_workspace();

        let content = read_memory_file(workspace.path(), "MEMORY.md", None, None).unwrap();
        assert!(content.contains("Long-term Memory"));
    }

    #[test]
    fn test_read_memory_file_with_range() {
        let workspace = setup_test_workspace();

        let content = read_memory_file(workspace.path(), "MEMORY.md", Some(3), Some(2)).unwrap();
        // Line 3-4 should be "## Preferences" and the next line
        assert!(!content.is_empty());
    }

    #[test]
    fn test_read_memory_file_invalid_path() {
        let workspace = setup_test_workspace();

        // Path traversal outside the memory scope must be rejected.
        let result = read_memory_file(workspace.path(), "../etc/passwd", None, None);
        assert!(result.is_err());
    }

    #[test]
    fn test_valid_memory_paths() {
        assert!(is_valid_memory_path("MEMORY.md"));
        assert!(is_valid_memory_path("memory/2026-02-12.md"));
        assert!(is_valid_memory_path("memory/notes/work.md"));

        assert!(!is_valid_memory_path("../secret.md"));
        assert!(!is_valid_memory_path("memory/../../../etc/passwd"));
        assert!(!is_valid_memory_path("src/main.rs"));
        assert!(!is_valid_memory_path("memory/file.txt"));
    }

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("Hello, World! This is a TEST.");
        assert!(tokens.contains(&"hello".to_string()));
        assert!(tokens.contains(&"world".to_string()));
        assert!(tokens.contains(&"test".to_string()));
        // Single-char tokens should be filtered
        assert!(!tokens.contains(&"a".to_string()));
    }

    #[test]
    fn test_search_with_decay() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        // Search with recency weighting (30 day half-life)
        let results = index.search_with_decay("memory tools", 5, 30.0);
        assert!(!results.is_empty());
        // Results from dated files should be ranked by recency
    }

    #[test]
    fn test_is_evergreen() {
        assert!(MemoryIndex::is_evergreen("MEMORY.md"));
        assert!(MemoryIndex::is_evergreen("SOUL.md"));
        assert!(!MemoryIndex::is_evergreen("memory/2026-02-20.md"));
        assert!(!MemoryIndex::is_evergreen("memory/notes/2026-02-20.md"));
    }

    #[test]
    fn test_extract_age_days() {
        use chrono::NaiveDate;

        let today = NaiveDate::from_ymd_opt(2026, 2, 20).unwrap();

        // File from 5 days ago
        let age = MemoryIndex::extract_age_days("memory/2026-02-15.md", today);
        assert_eq!(age, 5);

        // File from today
        let age = MemoryIndex::extract_age_days("memory/2026-02-20.md", today);
        assert_eq!(age, 0);

        // File with non-date name
        let age = MemoryIndex::extract_age_days("memory/notes.md", today);
        assert_eq!(age, 0);

        // Nested path with date
        let age = MemoryIndex::extract_age_days("memory/project/2026-02-10.md", today);
        assert_eq!(age, 10);
    }

    // NOTE(review): this test depends on the real system clock being past the
    // 2026 fixture dates — if "today" precedes both files, their ages clamp
    // to 0 and the ranking assertion may fail. Consider injecting the date.
    #[test]
    fn test_recency_affects_ranking() {
        // Create workspace with files from different dates
        let dir = TempDir::new().unwrap();
        fs::create_dir(dir.path().join("memory")).unwrap();

        // Old file with the search term
        fs::write(
            dir.path().join("memory/2026-01-01.md"),
            "# Old Note\nThis contains important search term.\n",
        )
        .unwrap();

        // Recent file with the search term
        fs::write(
            dir.path().join("memory/2026-02-19.md"),
            "# Recent Note\nThis also contains important search term.\n",
        )
        .unwrap();

        let index = MemoryIndex::index_workspace(dir.path()).unwrap();

        // With recency weighting, recent file should rank higher
        let results = index.search_with_decay("important search term", 2, 30.0);
        assert_eq!(results.len(), 2);
        // The more recent file should be first
        assert!(results[0].chunk.path.contains("2026-02"));
    }
}
579}