shodh_memory/memory/
files.rs

1//! File Memory Storage for Codebase Integration
2//!
3//! Stores learned knowledge about files in a codebase.
4//! Separate from regular memories to avoid search pollution.
5//!
6//! Features:
7//! - CRUD operations for FileMemory
8//! - Indexing by project, path, and file type
9//! - Semantic search via embeddings
10//! - Access tracking for heat maps
11
12use anyhow::{Context, Result};
13use glob::Pattern;
14use rocksdb::{ColumnFamily, ColumnFamilyDescriptor, Options, WriteBatch, DB};
15use sha2::{Digest, Sha256};
16use std::collections::HashMap;
17use std::fs;
18use std::path::Path;
19use std::sync::Arc;
20
21use super::types::{
22    CodebaseConfig, CodebaseScanResult, FileMemory, FileMemoryId, FileType, IndexingProgress,
23    LearnedFrom, ProjectId,
24};
25
26const CF_FILES: &str = "files";
27const CF_FILE_INDEX: &str = "file_index";
28
/// Storage and query engine for file memories.
///
/// All reads and writes go through the `files` and `file_index` column
/// families of the RocksDB handle held in `db`.
pub struct FileMemoryStore {
    /// RocksDB handle that contains the `files` and `file_index` column families
    db: Arc<DB>,
    /// Default configuration; `scan_codebase` falls back to it when no override is passed
    config: CodebaseConfig,
}
35
36impl FileMemoryStore {
37    /// CF accessor for the files column family
38    fn files_cf(&self) -> &ColumnFamily {
39        self.db.cf_handle(CF_FILES).expect("files CF must exist")
40    }
41
42    /// CF accessor for the file_index column family
43    fn file_index_cf(&self) -> &ColumnFamily {
44        self.db
45            .cf_handle(CF_FILE_INDEX)
46            .expect("file_index CF must exist")
47    }
48
49    /// Return CF descriptors needed by this store. The caller must include
50    /// these when opening the shared RocksDB instance.
51    pub fn cf_descriptors() -> Vec<ColumnFamilyDescriptor> {
52        let mut cf_opts = Options::default();
53        cf_opts.create_if_missing(true);
54        vec![
55            ColumnFamilyDescriptor::new(CF_FILES, cf_opts.clone()),
56            ColumnFamilyDescriptor::new(CF_FILE_INDEX, cf_opts),
57        ]
58    }
59
60    /// Create a new file memory store backed by a shared DB that already
61    /// contains the required column families (`files`, `file_index`).
62    pub fn new(db: Arc<DB>, storage_path: &Path) -> Result<Self> {
63        let files_path = storage_path.join("files");
64        std::fs::create_dir_all(&files_path)?;
65
66        Self::migrate_from_separate_dbs(&files_path, &db)?;
67
68        tracing::info!("File memory store initialized");
69        Ok(Self {
70            db,
71            config: CodebaseConfig::default(),
72        })
73    }
74
75    /// One-time migration: copy data from the old separate-DB layout
76    /// (`files/memories` and `files/index`) into the column families of the
77    /// shared DB, then rename the old directories so the migration is
78    /// not repeated.
79    fn migrate_from_separate_dbs(files_path: &Path, db: &DB) -> Result<()> {
80        let old_dirs: &[(&str, &str)] = &[("memories", CF_FILES), ("index", CF_FILE_INDEX)];
81
82        for (old_name, cf_name) in old_dirs {
83            let old_dir = files_path.join(old_name);
84            if !old_dir.is_dir() {
85                continue;
86            }
87
88            let cf = db
89                .cf_handle(cf_name)
90                .unwrap_or_else(|| panic!("{cf_name} CF must exist"));
91            let old_opts = Options::default();
92            match DB::open_for_read_only(&old_opts, &old_dir, false) {
93                Ok(old_db) => {
94                    let mut batch = WriteBatch::default();
95                    let mut count = 0usize;
96                    for item in old_db.iterator(rocksdb::IteratorMode::Start) {
97                        if let Ok((key, value)) = item {
98                            batch.put_cf(cf, &key, &value);
99                            count += 1;
100                            if count % 10_000 == 0 {
101                                db.write(std::mem::take(&mut batch))?;
102                            }
103                        }
104                    }
105                    if !batch.is_empty() {
106                        db.write(batch)?;
107                    }
108                    drop(old_db);
109                    tracing::info!("  files/{old_name}: migrated {count} entries to {cf_name} CF");
110
111                    let backup = files_path.join(format!("{old_name}.pre_cf_migration"));
112                    if backup.exists() {
113                        let _ = std::fs::remove_dir_all(&backup);
114                    }
115                    if let Err(e) = std::fs::rename(&old_dir, &backup) {
116                        tracing::warn!("Could not rename old {old_name} dir: {e}");
117                    }
118                }
119                Err(e) => {
120                    tracing::warn!("Could not open old {old_name} DB for migration: {e}");
121                }
122            }
123        }
124        Ok(())
125    }
126
127    /// Set custom configuration
128    pub fn with_config(mut self, config: CodebaseConfig) -> Self {
129        self.config = config;
130        self
131    }
132
133    /// Flush all column families to disk (critical for graceful shutdown)
134    pub fn flush(&self) -> Result<()> {
135        use rocksdb::FlushOptions;
136        let mut flush_opts = FlushOptions::default();
137        flush_opts.set_wait(true);
138        for cf_name in &[CF_FILES, CF_FILE_INDEX] {
139            if let Some(cf) = self.db.cf_handle(cf_name) {
140                self.db
141                    .flush_cf_opt(cf, &flush_opts)
142                    .map_err(|e| anyhow::anyhow!("Failed to flush {cf_name}: {e}"))?;
143            }
144        }
145        Ok(())
146    }
147
148    /// Get references to all RocksDB databases for backup
149    pub fn databases(&self) -> Vec<(&str, &Arc<DB>)> {
150        vec![("files_shared", &self.db)]
151    }
152
153    // =========================================================================
154    // CRUD OPERATIONS
155    // =========================================================================
156
157    /// Store a new file memory
158    pub fn store(&self, file_memory: &FileMemory) -> Result<()> {
159        let key = format!("{}:{}", file_memory.user_id, file_memory.id.0);
160        let value = serde_json::to_vec(file_memory).context("Failed to serialize file memory")?;
161
162        self.db
163            .put_cf(self.files_cf(), key.as_bytes(), &value)
164            .context("Failed to store file memory")?;
165
166        self.update_indices(file_memory)?;
167
168        tracing::debug!(
169            file_id = %file_memory.id,
170            path = %file_memory.path,
171            user_id = %file_memory.user_id,
172            "Stored file memory"
173        );
174
175        Ok(())
176    }
177
178    /// Get a file memory by ID
179    pub fn get(&self, user_id: &str, file_id: &FileMemoryId) -> Result<Option<FileMemory>> {
180        let key = format!("{}:{}", user_id, file_id.0);
181
182        match self.db.get_cf(self.files_cf(), key.as_bytes())? {
183            Some(value) => {
184                let file_memory: FileMemory =
185                    serde_json::from_slice(&value).context("Failed to deserialize file memory")?;
186                Ok(Some(file_memory))
187            }
188            None => Ok(None),
189        }
190    }
191
192    /// Get a file memory by path (relative path within project)
193    pub fn get_by_path(
194        &self,
195        user_id: &str,
196        project_id: &ProjectId,
197        path: &str,
198    ) -> Result<Option<FileMemory>> {
199        // Look up in path index
200        let path_key = format!(
201            "path:{}:{}:{}",
202            user_id,
203            project_id.0,
204            Self::hash_path(path)
205        );
206
207        match self.db.get_cf(self.file_index_cf(), path_key.as_bytes())? {
208            Some(file_id_bytes) => {
209                let file_id_str =
210                    String::from_utf8(file_id_bytes.to_vec()).context("Invalid file ID")?;
211                let file_id = FileMemoryId(
212                    uuid::Uuid::parse_str(&file_id_str).context("Invalid file ID UUID")?,
213                );
214                self.get(user_id, &file_id)
215            }
216            None => Ok(None),
217        }
218    }
219
220    /// Update a file memory
221    pub fn update(&self, file_memory: &FileMemory) -> Result<()> {
222        // Remove old indices first (in case path changed)
223        if let Some(existing) = self.get(&file_memory.user_id, &file_memory.id)? {
224            self.remove_indices(&existing)?;
225        }
226
227        // Store updated version
228        self.store(file_memory)
229    }
230
231    /// Delete a file memory
232    pub fn delete(&self, user_id: &str, file_id: &FileMemoryId) -> Result<bool> {
233        if let Some(file_memory) = self.get(user_id, file_id)? {
234            let key = format!("{}:{}", user_id, file_id.0);
235            self.db.delete_cf(self.files_cf(), key.as_bytes())?;
236            self.remove_indices(&file_memory)?;
237
238            tracing::debug!(
239                file_id = %file_id,
240                path = %file_memory.path,
241                "Deleted file memory"
242            );
243
244            Ok(true)
245        } else {
246            Ok(false)
247        }
248    }
249
250    /// Delete all file memories for a project
251    pub fn delete_project_files(&self, user_id: &str, project_id: &ProjectId) -> Result<usize> {
252        let files = self.list_by_project(user_id, project_id, None)?;
253        let count = files.len();
254
255        for file in files {
256            self.delete(user_id, &file.id)?;
257        }
258
259        tracing::info!(
260            project_id = %project_id.0,
261            count = count,
262            "Deleted all file memories for project"
263        );
264
265        Ok(count)
266    }
267
268    // =========================================================================
269    // LISTING & QUERYING
270    // =========================================================================
271
272    /// List all file memories for a user
273    pub fn list_by_user(&self, user_id: &str, limit: Option<usize>) -> Result<Vec<FileMemory>> {
274        let prefix = format!("user:{}:", user_id);
275        let mut files = Vec::new();
276
277        let iter = self
278            .db
279            .prefix_iterator_cf(self.file_index_cf(), prefix.as_bytes());
280        for item in iter {
281            let (key, file_id_bytes) = item?;
282            let key_str = String::from_utf8_lossy(&key);
283
284            // Stop if we've left our prefix
285            if !key_str.starts_with(&prefix) {
286                break;
287            }
288
289            let file_id_str = String::from_utf8(file_id_bytes.to_vec())?;
290            let file_id =
291                FileMemoryId(uuid::Uuid::parse_str(&file_id_str).context("Invalid file ID")?);
292
293            if let Some(file) = self.get(user_id, &file_id)? {
294                files.push(file);
295
296                if let Some(lim) = limit {
297                    if files.len() >= lim {
298                        break;
299                    }
300                }
301            }
302        }
303
304        // Sort by access count descending (most accessed first)
305        files.sort_by(|a, b| b.access_count.cmp(&a.access_count));
306
307        Ok(files)
308    }
309
310    /// List all file memories for a project
311    pub fn list_by_project(
312        &self,
313        user_id: &str,
314        project_id: &ProjectId,
315        limit: Option<usize>,
316    ) -> Result<Vec<FileMemory>> {
317        let prefix = format!("project:{}:{}:", user_id, project_id.0);
318        let mut files = Vec::new();
319
320        let iter = self
321            .db
322            .prefix_iterator_cf(self.file_index_cf(), prefix.as_bytes());
323        for item in iter {
324            let (key, file_id_bytes) = item?;
325            let key_str = String::from_utf8_lossy(&key);
326
327            if !key_str.starts_with(&prefix) {
328                break;
329            }
330
331            let file_id_str = String::from_utf8(file_id_bytes.to_vec())?;
332            let file_id =
333                FileMemoryId(uuid::Uuid::parse_str(&file_id_str).context("Invalid file ID")?);
334
335            if let Some(file) = self.get(user_id, &file_id)? {
336                files.push(file);
337
338                if let Some(lim) = limit {
339                    if files.len() >= lim {
340                        break;
341                    }
342                }
343            }
344        }
345
346        // Sort by path for consistent ordering
347        files.sort_by(|a, b| a.path.cmp(&b.path));
348
349        Ok(files)
350    }
351
352    /// List file memories by type
353    pub fn list_by_type(
354        &self,
355        user_id: &str,
356        project_id: &ProjectId,
357        file_type: &FileType,
358        limit: Option<usize>,
359    ) -> Result<Vec<FileMemory>> {
360        let type_str = format!("{:?}", file_type);
361        let prefix = format!("type:{}:{}:{}:", user_id, project_id.0, type_str);
362        let mut files = Vec::new();
363
364        let iter = self
365            .db
366            .prefix_iterator_cf(self.file_index_cf(), prefix.as_bytes());
367        for item in iter {
368            let (key, file_id_bytes) = item?;
369            let key_str = String::from_utf8_lossy(&key);
370
371            if !key_str.starts_with(&prefix) {
372                break;
373            }
374
375            let file_id_str = String::from_utf8(file_id_bytes.to_vec())?;
376            let file_id =
377                FileMemoryId(uuid::Uuid::parse_str(&file_id_str).context("Invalid file ID")?);
378
379            if let Some(file) = self.get(user_id, &file_id)? {
380                files.push(file);
381
382                if let Some(lim) = limit {
383                    if files.len() >= lim {
384                        break;
385                    }
386                }
387            }
388        }
389
390        Ok(files)
391    }
392
393    /// Get file count for a project
394    pub fn count_by_project(&self, user_id: &str, project_id: &ProjectId) -> Result<usize> {
395        let prefix = format!("project:{}:{}:", user_id, project_id.0);
396        let mut count = 0;
397
398        let iter = self
399            .db
400            .prefix_iterator_cf(self.file_index_cf(), prefix.as_bytes());
401        for item in iter {
402            let (key, _) = item?;
403            let key_str = String::from_utf8_lossy(&key);
404
405            if !key_str.starts_with(&prefix) {
406                break;
407            }
408
409            count += 1;
410        }
411
412        Ok(count)
413    }
414
415    // =========================================================================
416    // ACCESS TRACKING
417    // =========================================================================
418
419    /// Record an access to a file (increments counter, updates timestamp)
420    pub fn record_access(
421        &self,
422        user_id: &str,
423        project_id: &ProjectId,
424        path: &str,
425        learned_from: LearnedFrom,
426    ) -> Result<Option<FileMemory>> {
427        if let Some(mut file) = self.get_by_path(user_id, project_id, path)? {
428            file.record_access(learned_from);
429            self.update(&file)?;
430            Ok(Some(file))
431        } else {
432            Ok(None)
433        }
434    }
435
436    // =========================================================================
437    // CODEBASE SCANNING
438    // =========================================================================
439
440    /// Scan a directory and return eligible files for indexing
441    pub fn scan_codebase(
442        &self,
443        codebase_path: &Path,
444        config: Option<&CodebaseConfig>,
445    ) -> Result<CodebaseScanResult> {
446        let config = config.unwrap_or(&self.config);
447        let mut result = CodebaseScanResult {
448            total_files: 0,
449            eligible_files: 0,
450            skipped_files: 0,
451            skip_reasons: HashMap::new(),
452            limit_reached: false,
453            file_paths: Vec::new(),
454        };
455
456        // Compile exclude patterns
457        let exclude_patterns: Vec<Pattern> = config
458            .exclude_patterns
459            .iter()
460            .filter_map(|p| Pattern::new(p).ok())
461            .collect();
462
463        self.scan_directory_recursive(
464            codebase_path,
465            codebase_path,
466            &exclude_patterns,
467            config,
468            &mut result,
469        )?;
470
471        tracing::info!(
472            path = %codebase_path.display(),
473            total = result.total_files,
474            eligible = result.eligible_files,
475            skipped = result.skipped_files,
476            limit_reached = result.limit_reached,
477            "Scanned codebase"
478        );
479
480        Ok(result)
481    }
482
483    fn scan_directory_recursive(
484        &self,
485        root: &Path,
486        current: &Path,
487        exclude_patterns: &[Pattern],
488        config: &CodebaseConfig,
489        result: &mut CodebaseScanResult,
490    ) -> Result<()> {
491        if result.limit_reached {
492            return Ok(());
493        }
494
495        let entries = match fs::read_dir(current) {
496            Ok(e) => e,
497            Err(e) => {
498                tracing::warn!(path = %current.display(), error = %e, "Failed to read directory");
499                return Ok(());
500            }
501        };
502
503        // Commonly excluded directory names (checked explicitly for performance and reliability)
504        const EXCLUDED_DIR_NAMES: &[&str] = &[
505            ".git",
506            ".svn",
507            ".hg",
508            ".bzr", // VCS
509            "node_modules",
510            "__pycache__",
511            ".venv", // Dependencies
512            "venv",
513            "env",
514            ".env",
515            "virtualenv", // More Python venvs
516            "site-packages",
517            "Lib",
518            "Scripts", // Python internals
519            "target",
520            "dist",
521            "build",
522            "out",
523            "bin", // Build outputs
524            ".idea",
525            ".vscode", // IDE
526            ".cache",
527            ".tmp",
528            "tmp", // Temp
529            "data",
530            "logs",
531            "coverage", // Runtime data
532            "release-test",
533            "test-wheel", // Test artifacts
534        ];
535
536        // Directory name patterns to skip (suffix matching)
537        const EXCLUDED_DIR_SUFFIXES: &[&str] = &[
538            "_data",    // Any *_data directories (e.g., shodh_memory_data)
539            "_cache",   // Any *_cache directories
540            "_output",  // Any *_output directories
541            "_venv",    // Any *_venv directories
542            "_env",     // Any *_env directories
543            "_install", // Any *_install directories (test installs)
544        ];
545
546        for entry in entries {
547            let entry = match entry {
548                Ok(e) => e,
549                Err(_) => continue,
550            };
551
552            let path = entry.path();
553            let file_name = entry.file_name();
554            let file_name_str = file_name.to_string_lossy();
555
556            // Quick check: skip commonly excluded directories by name
557            if path.is_dir() {
558                // Exact match exclusion
559                if EXCLUDED_DIR_NAMES.iter().any(|&name| file_name_str == name) {
560                    *result
561                        .skip_reasons
562                        .entry(format!("{}/", file_name_str))
563                        .or_insert(0) += 1;
564                    result.skipped_files += 1;
565                    continue;
566                }
567                // Suffix pattern exclusion (e.g., *_data, *_cache)
568                if EXCLUDED_DIR_SUFFIXES
569                    .iter()
570                    .any(|&suffix| file_name_str.ends_with(suffix))
571                {
572                    *result
573                        .skip_reasons
574                        .entry(format!(
575                            "*{}/",
576                            file_name_str
577                                .rsplit_once('_')
578                                .map_or(&file_name_str[..], |(_, s)| s)
579                        ))
580                        .or_insert(0) += 1;
581                    result.skipped_files += 1;
582                    continue;
583                }
584            }
585
586            let relative_path = path
587                .strip_prefix(root)
588                .unwrap_or(&path)
589                .to_string_lossy()
590                .replace('\\', "/");
591
592            // Check exclude patterns (for custom patterns and file patterns like *.lock)
593            let mut excluded = false;
594            for pattern in exclude_patterns {
595                // For directory patterns (ending with /), check if relative path starts with it
596                let pattern_str = pattern.as_str();
597                if pattern_str.ends_with('/') {
598                    let dir_name = pattern_str.trim_end_matches('/');
599                    if relative_path == dir_name
600                        || relative_path.starts_with(&format!("{}/", dir_name))
601                    {
602                        *result.skip_reasons.entry(pattern.to_string()).or_insert(0) += 1;
603                        excluded = true;
604                        break;
605                    }
606                } else if pattern.matches(&relative_path) || pattern.matches(&file_name_str) {
607                    *result.skip_reasons.entry(pattern.to_string()).or_insert(0) += 1;
608                    excluded = true;
609                    break;
610                }
611            }
612
613            if excluded {
614                result.skipped_files += 1;
615                continue;
616            }
617
618            if path.is_dir() {
619                // Recurse into directory
620                self.scan_directory_recursive(root, &path, exclude_patterns, config, result)?;
621            } else if path.is_file() {
622                result.total_files += 1;
623
624                // Check if binary
625                if config.skip_binary && Self::is_likely_binary(&path) {
626                    *result.skip_reasons.entry("binary".to_string()).or_insert(0) += 1;
627                    result.skipped_files += 1;
628                    continue;
629                }
630
631                // Check file size
632                if let Ok(metadata) = path.metadata() {
633                    if metadata.len() > config.max_file_size_for_embedding as u64 {
634                        *result
635                            .skip_reasons
636                            .entry("too_large".to_string())
637                            .or_insert(0) += 1;
638                        result.skipped_files += 1;
639                        continue;
640                    }
641                }
642
643                // File is eligible
644                result.eligible_files += 1;
645                result.file_paths.push(relative_path);
646
647                // Check limit
648                if result.eligible_files >= config.max_files_per_project {
649                    result.limit_reached = true;
650                    return Ok(());
651                }
652            }
653        }
654
655        Ok(())
656    }
657
658    /// Check if a file is likely binary (non-text)
659    fn is_likely_binary(path: &Path) -> bool {
660        let binary_extensions = [
661            "exe", "dll", "so", "dylib", "bin", "obj", "o", "a", "lib", "png", "jpg", "jpeg",
662            "gif", "bmp", "ico", "webp", "mp3", "mp4", "avi", "mov", "mkv", "wav", "flac", "zip",
663            "tar", "gz", "rar", "7z", "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "woff",
664            "woff2", "ttf", "otf", "eot", "class", "pyc", "pyo", "wasm",
665        ];
666
667        path.extension()
668            .and_then(|e| e.to_str())
669            .map(|e| binary_extensions.contains(&e.to_lowercase().as_str()))
670            .unwrap_or(false)
671    }
672
673    // =========================================================================
674    // FILE HASHING
675    // =========================================================================
676
677    /// Compute SHA256 hash of file content
678    pub fn hash_file_content(content: &[u8]) -> String {
679        let mut hasher = Sha256::new();
680        hasher.update(content);
681        format!("{:x}", hasher.finalize())
682    }
683
684    /// Compute SHA256 hash of a path (for indexing)
685    fn hash_path(path: &str) -> String {
686        let mut hasher = Sha256::new();
687        hasher.update(path.as_bytes());
688        format!("{:x}", hasher.finalize())[..16].to_string()
689    }
690
691    // =========================================================================
692    // FILE INDEXING
693    // =========================================================================
694
695    /// Index a single file: read, hash, count lines, detect type, extract key items
696    pub fn index_file(
697        &self,
698        codebase_root: &Path,
699        relative_path: &str,
700        project_id: &ProjectId,
701        user_id: &str,
702    ) -> Result<FileMemory> {
703        let absolute_path = codebase_root.join(relative_path);
704        let content = fs::read(&absolute_path)
705            .with_context(|| format!("Failed to read file: {}", absolute_path.display()))?;
706
707        let file_hash = Self::hash_file_content(&content);
708        let size_bytes = content.len() as u64;
709
710        // Count lines (for text files)
711        let content_str = String::from_utf8_lossy(&content);
712        let line_count = content_str.lines().count();
713
714        // Detect file type from extension
715        let file_type = absolute_path
716            .extension()
717            .and_then(|e| e.to_str())
718            .map(FileType::from_extension)
719            .unwrap_or_default();
720
721        // Extract key items (functions, classes, etc.)
722        let key_items = Self::extract_key_items(&content_str, &file_type);
723
724        // Create FileMemory
725        let mut file_memory = FileMemory::new(
726            project_id.clone(),
727            user_id.to_string(),
728            relative_path.to_string(),
729            absolute_path.to_string_lossy().to_string(),
730            file_hash,
731            file_type,
732            line_count,
733            size_bytes,
734        );
735
736        file_memory.key_items = key_items;
737
738        // Store it
739        self.store(&file_memory)?;
740
741        Ok(file_memory)
742    }
743
744    /// Index a single file and generate embedding
745    pub fn index_file_with_embedding<E: crate::embeddings::Embedder>(
746        &self,
747        codebase_root: &Path,
748        relative_path: &str,
749        project_id: &ProjectId,
750        user_id: &str,
751        embedder: &E,
752    ) -> Result<FileMemory> {
753        let mut file_memory = self.index_file(codebase_root, relative_path, project_id, user_id)?;
754
755        // Generate embedding from summary content
756        let embed_content = Self::prepare_embed_content(&file_memory);
757        if !embed_content.is_empty() {
758            match embedder.encode(&embed_content) {
759                Ok(embedding) => {
760                    file_memory.embedding = Some(embedding);
761                    self.update(&file_memory)?;
762                }
763                Err(e) => {
764                    tracing::warn!(
765                        path = %file_memory.path,
766                        error = %e,
767                        "Failed to generate embedding for file"
768                    );
769                }
770            }
771        }
772
773        Ok(file_memory)
774    }
775
776    /// Prepare content for embedding (path + key items + summary)
777    fn prepare_embed_content(file: &FileMemory) -> String {
778        let mut parts = Vec::new();
779
780        // Include relative path (helps with file discovery)
781        parts.push(file.path.clone());
782
783        // Include key items
784        if !file.key_items.is_empty() {
785            parts.push(file.key_items.join(" "));
786        }
787
788        // Include summary if available
789        if !file.summary.is_empty() {
790            parts.push(file.summary.clone());
791        }
792
793        // Include purpose if available
794        if let Some(ref purpose) = file.purpose {
795            parts.push(purpose.clone());
796        }
797
798        parts.join(" | ")
799    }
800
801    /// Extract key items from file content based on file type
802    fn extract_key_items(content: &str, file_type: &FileType) -> Vec<String> {
803        let mut items = Vec::new();
804
805        match file_type {
806            FileType::Rust => {
807                // Extract pub fn, pub struct, pub enum, pub trait, impl
808                for line in content.lines() {
809                    let trimmed = line.trim();
810                    if trimmed.starts_with("pub fn ")
811                        || trimmed.starts_with("pub async fn ")
812                        || trimmed.starts_with("pub struct ")
813                        || trimmed.starts_with("pub enum ")
814                        || trimmed.starts_with("pub trait ")
815                        || trimmed.starts_with("impl ")
816                    {
817                        // Extract the name
818                        if let Some(name) = Self::extract_rust_name(trimmed) {
819                            if !items.contains(&name) {
820                                items.push(name);
821                            }
822                        }
823                    }
824                }
825            }
826            FileType::TypeScript | FileType::JavaScript => {
827                // Extract export function, export class, export const, export interface
828                for line in content.lines() {
829                    let trimmed = line.trim();
830                    if trimmed.starts_with("export ")
831                        || trimmed.starts_with("function ")
832                        || trimmed.starts_with("class ")
833                        || trimmed.starts_with("interface ")
834                        || trimmed.starts_with("const ")
835                    {
836                        if let Some(name) = Self::extract_js_name(trimmed) {
837                            if !items.contains(&name) {
838                                items.push(name);
839                            }
840                        }
841                    }
842                }
843            }
844            FileType::Python => {
845                // Extract def, class, async def
846                for line in content.lines() {
847                    let trimmed = line.trim();
848                    if trimmed.starts_with("def ")
849                        || trimmed.starts_with("async def ")
850                        || trimmed.starts_with("class ")
851                    {
852                        if let Some(name) = Self::extract_python_name(trimmed) {
853                            if !items.contains(&name) {
854                                items.push(name);
855                            }
856                        }
857                    }
858                }
859            }
860            FileType::Go => {
861                // Extract func, type struct, type interface
862                for line in content.lines() {
863                    let trimmed = line.trim();
864                    if trimmed.starts_with("func ") || trimmed.starts_with("type ") {
865                        if let Some(name) = Self::extract_go_name(trimmed) {
866                            if !items.contains(&name) {
867                                items.push(name);
868                            }
869                        }
870                    }
871                }
872            }
873            _ => {
874                // For other types, just count significant lines
875                // Could add more extractors later
876            }
877        }
878
879        // Limit to prevent bloat
880        items.truncate(50);
881        items
882    }
883
884    fn extract_rust_name(line: &str) -> Option<String> {
885        // "pub fn foo(" -> "foo"
886        // "pub struct Bar {" -> "Bar"
887        // "impl Foo for Bar {" -> "Foo for Bar"
888        let line = line.trim_start_matches("pub ").trim_start_matches("async ");
889
890        if line.starts_with("fn ") {
891            let rest = line.strip_prefix("fn ")?;
892            let name = rest.split(|c| c == '(' || c == '<').next()?;
893            Some(name.trim().to_string())
894        } else if line.starts_with("struct ") {
895            let rest = line.strip_prefix("struct ")?;
896            let name = rest.split(|c| c == '{' || c == '<' || c == '(').next()?;
897            Some(name.trim().to_string())
898        } else if line.starts_with("enum ") {
899            let rest = line.strip_prefix("enum ")?;
900            let name = rest.split(|c| c == '{' || c == '<').next()?;
901            Some(name.trim().to_string())
902        } else if line.starts_with("trait ") {
903            let rest = line.strip_prefix("trait ")?;
904            let name = rest.split(|c| c == '{' || c == '<' || c == ':').next()?;
905            Some(name.trim().to_string())
906        } else if line.starts_with("impl ") {
907            let rest = line.strip_prefix("impl ")?;
908            let sig = rest.split('{').next()?;
909            Some(sig.trim().to_string())
910        } else {
911            None
912        }
913    }
914
915    fn extract_js_name(line: &str) -> Option<String> {
916        // "export function foo(" -> "foo"
917        // "export class Bar {" -> "Bar"
918        let line = line
919            .trim_start_matches("export ")
920            .trim_start_matches("default ")
921            .trim_start_matches("async ");
922
923        if line.starts_with("function ") {
924            let rest = line.strip_prefix("function ")?;
925            let name = rest.split('(').next()?;
926            Some(name.trim().to_string())
927        } else if line.starts_with("class ") {
928            let rest = line.strip_prefix("class ")?;
929            let name = rest.split(|c| c == '{' || c == ' ').next()?;
930            Some(name.trim().to_string())
931        } else if line.starts_with("interface ") {
932            let rest = line.strip_prefix("interface ")?;
933            let name = rest.split(|c| c == '{' || c == ' ' || c == '<').next()?;
934            Some(name.trim().to_string())
935        } else if line.starts_with("const ") {
936            let rest = line.strip_prefix("const ")?;
937            let name = rest.split(|c| c == '=' || c == ':').next()?;
938            Some(name.trim().to_string())
939        } else {
940            None
941        }
942    }
943
944    fn extract_python_name(line: &str) -> Option<String> {
945        // "def foo(" -> "foo"
946        // "class Bar:" -> "Bar"
947        let line = line.trim_start_matches("async ");
948
949        if line.starts_with("def ") {
950            let rest = line.strip_prefix("def ")?;
951            let name = rest.split('(').next()?;
952            Some(name.trim().to_string())
953        } else if line.starts_with("class ") {
954            let rest = line.strip_prefix("class ")?;
955            let name = rest.split(|c| c == '(' || c == ':').next()?;
956            Some(name.trim().to_string())
957        } else {
958            None
959        }
960    }
961
962    fn extract_go_name(line: &str) -> Option<String> {
963        // "func Foo(" -> "Foo"
964        // "func (r *Receiver) Method(" -> "Method"
965        // "type Bar struct" -> "Bar"
966        if line.starts_with("func ") {
967            let rest = line.strip_prefix("func ")?;
968            // Handle method receivers: func (r *Type) Name(
969            if rest.starts_with('(') {
970                // Skip receiver, find method name
971                let after_receiver = rest.split(')').nth(1)?;
972                let name = after_receiver.trim().split('(').next()?;
973                Some(name.trim().to_string())
974            } else {
975                let name = rest.split('(').next()?;
976                Some(name.trim().to_string())
977            }
978        } else if line.starts_with("type ") {
979            let rest = line.strip_prefix("type ")?;
980            let name = rest.split_whitespace().next()?;
981            Some(name.trim().to_string())
982        } else {
983            None
984        }
985    }
986
987    /// Index all files in a codebase (blocking version)
988    pub fn index_codebase(
989        &self,
990        codebase_root: &Path,
991        project_id: &ProjectId,
992        user_id: &str,
993        config: Option<&CodebaseConfig>,
994    ) -> Result<IndexingResult> {
995        // First scan to get eligible files
996        let scan_result = self.scan_codebase(codebase_root, config)?;
997
998        let mut result = IndexingResult {
999            total_files: scan_result.eligible_files,
1000            indexed_files: 0,
1001            skipped_files: 0,
1002            errors: Vec::new(),
1003        };
1004
1005        for relative_path in &scan_result.file_paths {
1006            match self.index_file(codebase_root, relative_path, project_id, user_id) {
1007                Ok(_) => {
1008                    result.indexed_files += 1;
1009                }
1010                Err(e) => {
1011                    result.errors.push(format!("{}: {}", relative_path, e));
1012                    result.skipped_files += 1;
1013                }
1014            }
1015        }
1016
1017        tracing::info!(
1018            path = %codebase_root.display(),
1019            total = result.total_files,
1020            indexed = result.indexed_files,
1021            skipped = result.skipped_files,
1022            errors = result.errors.len(),
1023            "Indexed codebase"
1024        );
1025
1026        Ok(result)
1027    }
1028
1029    /// Index codebase with embeddings (requires embedder)
1030    pub fn index_codebase_with_embeddings<E: crate::embeddings::Embedder>(
1031        &self,
1032        codebase_root: &Path,
1033        project_id: &ProjectId,
1034        user_id: &str,
1035        embedder: &E,
1036        config: Option<&CodebaseConfig>,
1037        progress_callback: Option<&dyn Fn(IndexingProgress)>,
1038    ) -> Result<IndexingResult> {
1039        let scan_result = self.scan_codebase(codebase_root, config)?;
1040
1041        let mut result = IndexingResult {
1042            total_files: scan_result.eligible_files,
1043            indexed_files: 0,
1044            skipped_files: 0,
1045            errors: Vec::new(),
1046        };
1047
1048        let mut progress = IndexingProgress::new(scan_result.eligible_files);
1049
1050        for relative_path in &scan_result.file_paths {
1051            progress.current_file = Some(relative_path.clone());
1052
1053            match self.index_file_with_embedding(
1054                codebase_root,
1055                relative_path,
1056                project_id,
1057                user_id,
1058                embedder,
1059            ) {
1060                Ok(_) => {
1061                    result.indexed_files += 1;
1062                }
1063                Err(e) => {
1064                    let error_msg = format!("{}: {}", relative_path, e);
1065                    result.errors.push(error_msg.clone());
1066                    progress.errors.push(error_msg);
1067                    result.skipped_files += 1;
1068                }
1069            }
1070
1071            progress.processed += 1;
1072
1073            if let Some(cb) = progress_callback {
1074                cb(progress.clone());
1075            }
1076        }
1077
1078        progress.complete = true;
1079        if let Some(cb) = progress_callback {
1080            cb(progress);
1081        }
1082
1083        tracing::info!(
1084            path = %codebase_root.display(),
1085            total = result.total_files,
1086            indexed = result.indexed_files,
1087            skipped = result.skipped_files,
1088            errors = result.errors.len(),
1089            "Indexed codebase with embeddings"
1090        );
1091
1092        Ok(result)
1093    }
1094
1095    // =========================================================================
1096    // INDEX MANAGEMENT
1097    // =========================================================================
1098
1099    fn update_indices(&self, file: &FileMemory) -> Result<()> {
1100        let mut batch = WriteBatch::default();
1101        let id_str = file.id.0.to_string();
1102        let idx_cf = self.file_index_cf();
1103
1104        // Index by user
1105        let user_key = format!("user:{}:{}", file.user_id, id_str);
1106        batch.put_cf(idx_cf, user_key.as_bytes(), id_str.as_bytes());
1107
1108        // Index by project
1109        let project_key = format!("project:{}:{}:{}", file.user_id, file.project_id.0, id_str);
1110        batch.put_cf(idx_cf, project_key.as_bytes(), id_str.as_bytes());
1111
1112        // Index by path (for fast lookup)
1113        let path_key = format!(
1114            "path:{}:{}:{}",
1115            file.user_id,
1116            file.project_id.0,
1117            Self::hash_path(&file.path)
1118        );
1119        batch.put_cf(idx_cf, path_key.as_bytes(), id_str.as_bytes());
1120
1121        // Index by file type
1122        let type_str = format!("{:?}", file.file_type);
1123        let type_key = format!(
1124            "type:{}:{}:{}:{}",
1125            file.user_id, file.project_id.0, type_str, id_str
1126        );
1127        batch.put_cf(idx_cf, type_key.as_bytes(), id_str.as_bytes());
1128
1129        self.db
1130            .write(batch)
1131            .context("Failed to update file memory indices")?;
1132
1133        Ok(())
1134    }
1135
1136    fn remove_indices(&self, file: &FileMemory) -> Result<()> {
1137        let mut batch = WriteBatch::default();
1138        let id_str = file.id.0.to_string();
1139        let idx_cf = self.file_index_cf();
1140
1141        let user_key = format!("user:{}:{}", file.user_id, id_str);
1142        batch.delete_cf(idx_cf, user_key.as_bytes());
1143
1144        let project_key = format!("project:{}:{}:{}", file.user_id, file.project_id.0, id_str);
1145        batch.delete_cf(idx_cf, project_key.as_bytes());
1146
1147        let path_key = format!(
1148            "path:{}:{}:{}",
1149            file.user_id,
1150            file.project_id.0,
1151            Self::hash_path(&file.path)
1152        );
1153        batch.delete_cf(idx_cf, path_key.as_bytes());
1154
1155        let type_str = format!("{:?}", file.file_type);
1156        let type_key = format!(
1157            "type:{}:{}:{}:{}",
1158            file.user_id, file.project_id.0, type_str, id_str
1159        );
1160        batch.delete_cf(idx_cf, type_key.as_bytes());
1161
1162        self.db.write(batch)?;
1163        Ok(())
1164    }
1165
1166    // =========================================================================
1167    // STATS
1168    // =========================================================================
1169
1170    /// Get statistics for file memories
1171    pub fn stats(&self, user_id: &str) -> Result<FileMemoryStats> {
1172        let files = self.list_by_user(user_id, None)?;
1173
1174        let total_files = files.len();
1175        let total_size: u64 = files.iter().map(|f| f.size_bytes).sum();
1176        let total_lines: usize = files.iter().map(|f| f.line_count).sum();
1177        let total_accesses: u32 = files.iter().map(|f| f.access_count).sum();
1178
1179        // Count by type
1180        let mut by_type: HashMap<String, usize> = HashMap::new();
1181        for file in &files {
1182            let type_str = format!("{:?}", file.file_type);
1183            *by_type.entry(type_str).or_insert(0) += 1;
1184        }
1185
1186        // Count by learned_from
1187        let mut by_source: HashMap<String, usize> = HashMap::new();
1188        for file in &files {
1189            let source_str = format!("{:?}", file.learned_from);
1190            *by_source.entry(source_str).or_insert(0) += 1;
1191        }
1192
1193        Ok(FileMemoryStats {
1194            total_files,
1195            total_size_bytes: total_size,
1196            total_lines,
1197            total_accesses,
1198            by_type,
1199            by_source,
1200        })
1201    }
1202}
1203
/// Statistics about file memories
///
/// Produced by `FileMemoryStore::stats` from the file memories listed for a
/// single user.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FileMemoryStats {
    /// Number of file memories counted
    pub total_files: usize,
    /// Sum of `size_bytes` over the counted files
    pub total_size_bytes: u64,
    /// Sum of `line_count` over the counted files
    pub total_lines: usize,
    /// Sum of `access_count` over the counted files
    pub total_accesses: u32,
    /// File count keyed by the `Debug` rendering of each file's `file_type`
    pub by_type: HashMap<String, usize>,
    /// File count keyed by the `Debug` rendering of each file's `learned_from`
    pub by_source: HashMap<String, usize>,
}
1214
/// Result of indexing a codebase
///
/// Returned by `index_codebase` and `index_codebase_with_embeddings`.
/// Per-file failures do not abort indexing; they are reported here instead.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct IndexingResult {
    /// Number of eligible files reported by the codebase scan
    pub total_files: usize,
    /// Files successfully indexed
    pub indexed_files: usize,
    /// Files whose indexing returned an error (one entry in `errors` each)
    pub skipped_files: usize,
    /// One `"<relative path>: <error>"` message per failed file
    pub errors: Vec<String>,
}
1227
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Open a fresh RocksDB in a temporary directory with the `default`
    /// column family plus the ones the store requires, and wrap it in a store.
    ///
    /// The `TempDir` is returned so the directory outlives the test body.
    fn create_test_store() -> (FileMemoryStore, TempDir) {
        let temp_dir = TempDir::new().unwrap();
        let db_path = temp_dir.path().join("files_db");

        let mut db_opts = Options::default();
        db_opts.create_if_missing(true);
        db_opts.create_missing_column_families(true);

        let mut default_cf_opts = Options::default();
        default_cf_opts.create_if_missing(true);

        let mut descriptors = vec![ColumnFamilyDescriptor::new("default", default_cf_opts)];
        descriptors.extend(FileMemoryStore::cf_descriptors());

        let db = Arc::new(DB::open_cf_descriptors(&db_opts, &db_path, descriptors).unwrap());
        let store = FileMemoryStore::new(db, temp_dir.path()).unwrap();
        (store, temp_dir)
    }

    /// Build a Rust file memory for "test-user" rooted at `/home/user/project`.
    fn rust_file(project_id: &ProjectId, relative_path: &str, hash: &str) -> FileMemory {
        FileMemory::new(
            project_id.clone(),
            "test-user".to_string(),
            relative_path.to_string(),
            format!("/home/user/project/{}", relative_path),
            hash.to_string(),
            FileType::Rust,
            100,
            5000,
        )
    }

    #[test]
    fn test_store_and_retrieve() {
        let (store, _dir) = create_test_store();

        let project_id = ProjectId::new();
        let file = rust_file(&project_id, "src/main.rs", "abc123");

        store.store(&file).unwrap();

        // Lookup by ID returns the stored record.
        let by_id = store.get("test-user", &file.id).unwrap().unwrap();
        assert_eq!(by_id.path, "src/main.rs");
        assert_eq!(by_id.file_type, FileType::Rust);

        // Lookup by path resolves to the same record.
        let by_path = store
            .get_by_path("test-user", &project_id, "src/main.rs")
            .unwrap()
            .unwrap();
        assert_eq!(by_path.id, file.id);
    }

    #[test]
    fn test_list_by_project() {
        let (store, _dir) = create_test_store();

        let project_id = ProjectId::new();

        // Five distinct files in the same project.
        for i in 0..5 {
            let file = rust_file(
                &project_id,
                &format!("src/file{}.rs", i),
                &format!("hash{}", i),
            );
            store.store(&file).unwrap();
        }

        let listed = store
            .list_by_project("test-user", &project_id, None)
            .unwrap();
        assert_eq!(listed.len(), 5);
    }

    #[test]
    fn test_record_access() {
        let (store, _dir) = create_test_store();

        let project_id = ProjectId::new();
        let file = rust_file(&project_id, "src/main.rs", "abc123");

        store.store(&file).unwrap();

        let updated = store
            .record_access(
                "test-user",
                &project_id,
                "src/main.rs",
                LearnedFrom::ReadAccess,
            )
            .unwrap()
            .unwrap();

        assert_eq!(updated.access_count, 2); // 1 initial + 1 access
        assert_eq!(updated.learned_from, LearnedFrom::ReadAccess);
    }

    #[test]
    fn test_delete() {
        let (store, _dir) = create_test_store();

        let project_id = ProjectId::new();
        let file = rust_file(&project_id, "src/main.rs", "abc123");

        store.store(&file).unwrap();

        // Deleting an existing record reports success.
        let deleted = store.delete("test-user", &file.id).unwrap();
        assert!(deleted);

        // The record can no longer be fetched.
        let after_delete = store.get("test-user", &file.id).unwrap();
        assert!(after_delete.is_none());
    }

    #[test]
    fn test_file_type_detection() {
        let known = [
            ("rs", FileType::Rust),
            ("ts", FileType::TypeScript),
            ("tsx", FileType::TypeScript),
            ("py", FileType::Python),
            ("go", FileType::Go),
            ("md", FileType::Markdown),
        ];
        for (extension, expected) in known.iter() {
            assert_eq!(FileType::from_extension(extension), *expected);
        }

        assert!(matches!(
            FileType::from_extension("unknown"),
            FileType::Other(_)
        ));
    }
}
shodh_memory/memory/files.rs

shodh_memory/memory/
files.rs