Skip to main content

rustyclaw_core/
memory.rs

1//! Memory search and retrieval for RustyClaw.
2//!
3//! Provides semantic-like search over `MEMORY.md` and `memory/*.md` files.
4//! Current implementation uses keyword/BM25-style matching with temporal decay
5//! for recency weighting. Embeddings can be added later for true semantic search.
6
7use chrono::{NaiveDate, Utc};
8use std::collections::HashMap;
9use std::fs;
10use std::path::Path;
11
/// A chunk of text from a memory file with metadata.
///
/// Chunks are produced by `MemoryIndex::chunk_content`, which splits files
/// at `#`/`##` headings or roughly every 20 lines.
#[derive(Debug, Clone)]
pub struct MemoryChunk {
    /// Source file path (relative to workspace).
    pub path: String,
    /// Starting line number (1-indexed).
    pub start_line: usize,
    /// Ending line number (1-indexed, inclusive).
    pub end_line: usize,
    /// The text content of this chunk (leading/trailing whitespace trimmed).
    pub text: String,
}
24
/// A search result with relevance score.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// The matching chunk (cloned out of the index).
    pub chunk: MemoryChunk,
    /// Relevance score (higher is better). BM25-based; `search_with_decay`
    /// additionally multiplies in a temporal decay factor.
    pub score: f64,
}
33
/// Memory search index.
///
/// Built by `index_workspace`; holds every chunk plus the statistics
/// needed for BM25 scoring.
pub struct MemoryIndex {
    /// All indexed chunks.
    chunks: Vec<MemoryChunk>,
    /// Inverted index: term -> indices into `chunks` that contain the term.
    term_index: HashMap<String, Vec<usize>>,
    /// Document frequency: number of chunks each term appears in.
    doc_freq: HashMap<String, usize>,
    /// Total number of chunks (`chunks.len()` cached at index-build time).
    total_docs: usize,
}
45
46impl MemoryIndex {
47    /// Create a new empty index.
48    pub fn new() -> Self {
49        Self {
50            chunks: Vec::new(),
51            term_index: HashMap::new(),
52            doc_freq: HashMap::new(),
53            total_docs: 0,
54        }
55    }
56
57    /// Index all memory files in a workspace.
58    pub fn index_workspace(workspace: &Path) -> Result<Self, String> {
59        let mut index = Self::new();
60
61        // Index MEMORY.md if it exists
62        let memory_md = workspace.join("MEMORY.md");
63        if memory_md.exists() {
64            index.index_file(&memory_md, "MEMORY.md")?;
65        }
66
67        // Index memory/*.md
68        let memory_dir = workspace.join("memory");
69        if memory_dir.exists() && memory_dir.is_dir() {
70            index.index_directory(&memory_dir, "memory")?;
71        }
72
73        // Build inverted index
74        index.build_inverted_index();
75
76        Ok(index)
77    }
78
79    /// Index a single file.
80    fn index_file(&mut self, path: &Path, relative_path: &str) -> Result<(), String> {
81        let content = fs::read_to_string(path)
82            .map_err(|e| format!("Failed to read {}: {}", relative_path, e))?;
83
84        // Split into chunks (~400 tokens target, roughly 300-400 words)
85        // For simplicity, we chunk by paragraphs or heading sections
86        let chunks = self.chunk_content(&content, relative_path);
87        self.chunks.extend(chunks);
88
89        Ok(())
90    }
91
92    /// Index a directory recursively.
93    fn index_directory(&mut self, dir: &Path, relative_prefix: &str) -> Result<(), String> {
94        let entries = fs::read_dir(dir)
95            .map_err(|e| format!("Failed to read directory {}: {}", relative_prefix, e))?;
96
97        for entry in entries.flatten() {
98            let path = entry.path();
99            let name = entry.file_name().to_string_lossy().to_string();
100            let relative = format!("{}/{}", relative_prefix, name);
101
102            if path.is_file() && name.ends_with(".md") {
103                self.index_file(&path, &relative)?;
104            } else if path.is_dir() && !name.starts_with('.') {
105                self.index_directory(&path, &relative)?;
106            }
107        }
108
109        Ok(())
110    }
111
112    /// Chunk content into searchable pieces.
113    fn chunk_content(&self, content: &str, path: &str) -> Vec<MemoryChunk> {
114        let mut chunks = Vec::new();
115        let lines: Vec<&str> = content.lines().collect();
116
117        if lines.is_empty() {
118            return chunks;
119        }
120
121        // Chunk by sections (## headings) or every ~20 lines
122        let mut current_chunk = String::new();
123        let mut chunk_start = 1;
124        let mut line_count = 0;
125
126        for (i, line) in lines.iter().enumerate() {
127            let line_num = i + 1;
128
129            // Check if this is a heading that should start a new chunk
130            let is_heading = line.starts_with("## ") || line.starts_with("# ");
131
132            // Start new chunk on heading or every ~20 lines (if we have content)
133            if (is_heading || line_count >= 20) && !current_chunk.trim().is_empty() {
134                chunks.push(MemoryChunk {
135                    path: path.to_string(),
136                    start_line: chunk_start,
137                    end_line: line_num - 1,
138                    text: current_chunk.trim().to_string(),
139                });
140                current_chunk = String::new();
141                chunk_start = line_num;
142                line_count = 0;
143            }
144
145            current_chunk.push_str(line);
146            current_chunk.push('\n');
147            line_count += 1;
148        }
149
150        // Don't forget the last chunk
151        if !current_chunk.trim().is_empty() {
152            chunks.push(MemoryChunk {
153                path: path.to_string(),
154                start_line: chunk_start,
155                end_line: lines.len(),
156                text: current_chunk.trim().to_string(),
157            });
158        }
159
160        chunks
161    }
162
163    /// Build the inverted index for BM25 search.
164    fn build_inverted_index(&mut self) {
165        self.term_index.clear();
166        self.doc_freq.clear();
167        self.total_docs = self.chunks.len();
168
169        for (idx, chunk) in self.chunks.iter().enumerate() {
170            let terms = tokenize(&chunk.text);
171            let unique_terms: std::collections::HashSet<_> = terms.iter().collect();
172
173            for term in unique_terms {
174                self.term_index
175                    .entry(term.clone())
176                    .or_default()
177                    .push(idx);
178
179                *self.doc_freq.entry(term.clone()).or_insert(0) += 1;
180            }
181        }
182    }
183
184    /// Search the index using BM25-style scoring.
185    pub fn search(&self, query: &str, max_results: usize) -> Vec<SearchResult> {
186        let query_terms = tokenize(query);
187
188        if query_terms.is_empty() || self.chunks.is_empty() {
189            return Vec::new();
190        }
191
192        // Score each chunk
193        let mut scores: Vec<(usize, f64)> = Vec::new();
194
195        for (idx, _chunk) in self.chunks.iter().enumerate() {
196            let score = self.bm25_score(idx, &query_terms);
197            if score > 0.0 {
198                scores.push((idx, score));
199            }
200        }
201
202        // Sort by score descending
203        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
204
205        // Return top results
206        scores
207            .into_iter()
208            .take(max_results)
209            .map(|(idx, score)| SearchResult {
210                chunk: self.chunks[idx].clone(),
211                score,
212            })
213            .collect()
214    }
215
216    /// Calculate BM25 score for a chunk.
217    fn bm25_score(&self, chunk_idx: usize, query_terms: &[String]) -> f64 {
218        const K1: f64 = 1.2;
219        const B: f64 = 0.75;
220
221        let chunk = &self.chunks[chunk_idx];
222        let chunk_terms = tokenize(&chunk.text);
223        let doc_len = chunk_terms.len() as f64;
224
225        // Calculate average document length
226        let avg_doc_len = self.chunks.iter()
227            .map(|c| tokenize(&c.text).len())
228            .sum::<usize>() as f64 / self.total_docs.max(1) as f64;
229
230        let mut score = 0.0;
231
232        for term in query_terms {
233            let tf = chunk_terms.iter().filter(|t| *t == term).count() as f64;
234            let df = *self.doc_freq.get(term).unwrap_or(&0) as f64;
235
236            if tf > 0.0 && df > 0.0 {
237                // IDF component
238                let idf = ((self.total_docs as f64 - df + 0.5) / (df + 0.5) + 1.0).ln();
239
240                // TF component with length normalization
241                let tf_norm = (tf * (K1 + 1.0)) / (tf + K1 * (1.0 - B + B * (doc_len / avg_doc_len)));
242
243                score += idf * tf_norm;
244            }
245        }
246
247        score
248    }
249
250    /// Search with temporal decay (recency weighting).
251    ///
252    /// Recent memory files are boosted using exponential decay with configurable
253    /// half-life. Files that don't have a date in their path (like MEMORY.md)
254    /// are treated as "evergreen" and don't decay.
255    ///
256    /// # Arguments
257    /// * `query` - Search query string
258    /// * `max_results` - Maximum number of results to return
259    /// * `half_life_days` - Half-life for temporal decay in days (default: 30)
260    pub fn search_with_decay(
261        &self,
262        query: &str,
263        max_results: usize,
264        half_life_days: f64,
265    ) -> Vec<SearchResult> {
266        let query_terms = tokenize(query);
267
268        if query_terms.is_empty() || self.chunks.is_empty() {
269            return Vec::new();
270        }
271
272        let today = Utc::now().date_naive();
273        let decay_lambda = (2.0_f64).ln() / half_life_days;
274
275        let mut scores: Vec<(usize, f64)> = Vec::new();
276
277        for (idx, chunk) in self.chunks.iter().enumerate() {
278            let base_score = self.bm25_score(idx, &query_terms);
279
280            if base_score > 0.0 {
281                let decayed_score = if Self::is_evergreen(&chunk.path) {
282                    base_score // No decay for evergreen files
283                } else {
284                    let age_days = Self::extract_age_days(&chunk.path, today);
285                    let decay = (-decay_lambda * age_days as f64).exp();
286                    base_score * decay
287                };
288
289                scores.push((idx, decayed_score));
290            }
291        }
292
293        scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
294
295        scores
296            .into_iter()
297            .take(max_results)
298            .map(|(idx, score)| SearchResult {
299                chunk: self.chunks[idx].clone(),
300                score,
301            })
302            .collect()
303    }
304
305    /// Check if a file path is "evergreen" (shouldn't decay).
306    ///
307    /// Evergreen files include MEMORY.md and any file not in the memory/ directory.
308    fn is_evergreen(path: &str) -> bool {
309        path == "MEMORY.md" || !path.starts_with("memory/")
310    }
311
312    /// Extract the age in days from a dated file path.
313    ///
314    /// Expects paths like "memory/2026-02-20.md" and returns days since that date.
315    /// Returns 0 for paths without a parseable date.
316    fn extract_age_days(path: &str, today: NaiveDate) -> i64 {
317        // Try to extract date from path like "memory/2026-02-20.md"
318        if let Some(filename) = path.strip_prefix("memory/") {
319            if let Some(date_str) = filename.strip_suffix(".md") {
320                // Handle nested paths like "memory/subfolder/2026-02-20.md"
321                let date_part = date_str.rsplit('/').next().unwrap_or(date_str);
322                if let Ok(date) = NaiveDate::parse_from_str(date_part, "%Y-%m-%d") {
323                    return (today - date).num_days().max(0);
324                }
325            }
326        }
327        0 // Unknown date = no decay
328    }
329}
330
/// Files that should never be decayed (evergreen).
// NOTE(review): currently unused — `MemoryIndex::is_evergreen` hard-codes the
// same "MEMORY.md" check inline. Either wire this const into that check or
// drop it.
#[allow(dead_code)]
const EVERGREEN_FILES: &[&str] = &["MEMORY.md"];
334
impl Default for MemoryIndex {
    /// An empty index; equivalent to `MemoryIndex::new()`.
    fn default() -> Self {
        Self::new()
    }
}
340
/// Split text into lowercase terms for indexing and searching.
///
/// A term is a maximal run of alphanumeric characters, hyphens, or
/// underscores; anything shorter than two bytes is dropped as noise.
fn tokenize(text: &str) -> Vec<String> {
    let lowered = text.to_lowercase();
    let mut terms = Vec::new();
    for token in lowered.split(|c: char| !(c.is_alphanumeric() || c == '-' || c == '_')) {
        if token.len() >= 2 {
            terms.push(token.to_owned());
        }
    }
    terms
}
349
350/// Read specific lines from a memory file.
351pub fn read_memory_file(
352    workspace: &Path,
353    relative_path: &str,
354    from_line: Option<usize>,
355    num_lines: Option<usize>,
356) -> Result<String, String> {
357    // Validate path is within memory scope
358    if !is_valid_memory_path(relative_path) {
359        return Err(format!(
360            "Path '{}' is not a valid memory file. Must be MEMORY.md or memory/*.md",
361            relative_path
362        ));
363    }
364
365    let full_path = workspace.join(relative_path);
366
367    if !full_path.exists() {
368        return Err(format!("Memory file not found: {}", relative_path));
369    }
370
371    let content = fs::read_to_string(&full_path)
372        .map_err(|e| format!("Failed to read {}: {}", relative_path, e))?;
373
374    let lines: Vec<&str> = content.lines().collect();
375    let total_lines = lines.len();
376
377    // Handle line range
378    let start = from_line.unwrap_or(1).saturating_sub(1); // Convert to 0-indexed
379    let count = num_lines.unwrap_or(total_lines);
380
381    if start >= total_lines {
382        return Ok(String::new());
383    }
384
385    let end = (start + count).min(total_lines);
386    let selected: Vec<&str> = lines[start..end].to_vec();
387
388    Ok(selected.join("\n"))
389}
390
/// Check whether `path` names a file the memory tools may read.
///
/// Only `MEMORY.md` itself or `.md` files under `memory/` qualify. Paths
/// containing `..` or `//` anywhere are rejected to block traversal; this is
/// deliberately stricter than component-wise checking.
fn is_valid_memory_path(path: &str) -> bool {
    if path == "MEMORY.md" {
        return true;
    }

    path.starts_with("memory/")
        && path.ends_with(".md")
        && !path.contains("..")
        && !path.contains("//")
}
405
#[cfg(test)]
mod tests {
    // Unit tests: workspace indexing, BM25 search, temporal decay ranking,
    // path validation, and tokenization.
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Build a temp workspace containing MEMORY.md and one dated daily note.
    fn setup_test_workspace() -> TempDir {
        let dir = TempDir::new().unwrap();

        // Create MEMORY.md
        fs::write(
            dir.path().join("MEMORY.md"),
            "# Long-term Memory\n\n## Preferences\nUser prefers dark mode.\nFavorite color is blue.\n\n## Projects\nWorking on RustyClaw.\n"
        ).unwrap();

        // Create memory directory
        fs::create_dir(dir.path().join("memory")).unwrap();

        // Create daily note
        fs::write(
            dir.path().join("memory/2026-02-12.md"),
            "# 2026-02-12\n\n## Morning\nStarted implementing memory tools.\n\n## Afternoon\nWorking on BM25 search.\n"
        ).unwrap();

        dir
    }

    #[test]
    fn test_index_workspace() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        assert!(!index.chunks.is_empty());
        assert!(index.total_docs > 0);
    }

    #[test]
    fn test_search_finds_relevant() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        let results = index.search("dark mode", 5);
        assert!(!results.is_empty());
        assert!(results[0].chunk.text.contains("dark mode"));
    }

    #[test]
    fn test_search_empty_query() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        let results = index.search("", 5);
        assert!(results.is_empty());
    }

    #[test]
    fn test_read_memory_file() {
        let workspace = setup_test_workspace();

        let content = read_memory_file(workspace.path(), "MEMORY.md", None, None).unwrap();
        assert!(content.contains("Long-term Memory"));
    }

    #[test]
    fn test_read_memory_file_with_range() {
        let workspace = setup_test_workspace();

        let content = read_memory_file(workspace.path(), "MEMORY.md", Some(3), Some(2)).unwrap();
        // Line 3-4 should be "## Preferences" and the next line
        assert!(!content.is_empty());
    }

    #[test]
    fn test_read_memory_file_invalid_path() {
        let workspace = setup_test_workspace();

        // Path traversal must be rejected before any filesystem access.
        let result = read_memory_file(workspace.path(), "../etc/passwd", None, None);
        assert!(result.is_err());
    }

    #[test]
    fn test_valid_memory_paths() {
        assert!(is_valid_memory_path("MEMORY.md"));
        assert!(is_valid_memory_path("memory/2026-02-12.md"));
        assert!(is_valid_memory_path("memory/notes/work.md"));

        assert!(!is_valid_memory_path("../secret.md"));
        assert!(!is_valid_memory_path("memory/../../../etc/passwd"));
        assert!(!is_valid_memory_path("src/main.rs"));
        assert!(!is_valid_memory_path("memory/file.txt"));
    }

    #[test]
    fn test_tokenize() {
        let tokens = tokenize("Hello, World! This is a TEST.");
        assert!(tokens.contains(&"hello".to_string()));
        assert!(tokens.contains(&"world".to_string()));
        assert!(tokens.contains(&"test".to_string()));
        // Single-char tokens should be filtered
        assert!(!tokens.contains(&"a".to_string()));
    }

    #[test]
    fn test_search_with_decay() {
        let workspace = setup_test_workspace();
        let index = MemoryIndex::index_workspace(workspace.path()).unwrap();

        // Search with recency weighting (30 day half-life)
        let results = index.search_with_decay("memory tools", 5, 30.0);
        assert!(!results.is_empty());
        // Results from dated files should be ranked by recency
    }

    #[test]
    fn test_is_evergreen() {
        // Anything outside memory/ never decays, not only MEMORY.md.
        assert!(MemoryIndex::is_evergreen("MEMORY.md"));
        assert!(MemoryIndex::is_evergreen("SOUL.md"));
        assert!(!MemoryIndex::is_evergreen("memory/2026-02-20.md"));
        assert!(!MemoryIndex::is_evergreen("memory/notes/2026-02-20.md"));
    }

    #[test]
    fn test_extract_age_days() {
        use chrono::NaiveDate;

        // Fixed "today" keeps these assertions deterministic.
        let today = NaiveDate::from_ymd_opt(2026, 2, 20).unwrap();

        // File from 5 days ago
        let age = MemoryIndex::extract_age_days("memory/2026-02-15.md", today);
        assert_eq!(age, 5);

        // File from today
        let age = MemoryIndex::extract_age_days("memory/2026-02-20.md", today);
        assert_eq!(age, 0);

        // File with non-date name
        let age = MemoryIndex::extract_age_days("memory/notes.md", today);
        assert_eq!(age, 0);

        // Nested path with date
        let age = MemoryIndex::extract_age_days("memory/project/2026-02-10.md", today);
        assert_eq!(age, 10);
    }

    #[test]
    fn test_recency_affects_ranking() {
        // Create workspace with files from different dates
        let dir = TempDir::new().unwrap();
        fs::create_dir(dir.path().join("memory")).unwrap();

        // Old file with the search term
        fs::write(
            dir.path().join("memory/2026-01-01.md"),
            "# Old Note\nThis contains important search term.\n"
        ).unwrap();

        // Recent file with the search term
        fs::write(
            dir.path().join("memory/2026-02-19.md"),
            "# Recent Note\nThis also contains important search term.\n"
        ).unwrap();

        let index = MemoryIndex::index_workspace(dir.path()).unwrap();

        // With recency weighting, recent file should rank higher
        let results = index.search_with_decay("important search term", 2, 30.0);
        assert_eq!(results.len(), 2);
        // The more recent file should be first
        assert!(results[0].chunk.path.contains("2026-02"));
    }
}