1use anyhow::{Context, Result};
13use glob::Pattern;
14use rocksdb::{ColumnFamily, ColumnFamilyDescriptor, Options, WriteBatch, DB};
15use sha2::{Digest, Sha256};
16use std::collections::HashMap;
17use std::fs;
18use std::path::Path;
19use std::sync::Arc;
20
21use super::types::{
22 CodebaseConfig, CodebaseScanResult, FileMemory, FileMemoryId, FileType, IndexingProgress,
23 LearnedFrom, ProjectId,
24};
25
/// Column family holding serialized `FileMemory` records, keyed `"{user_id}:{file_uuid}"`.
const CF_FILES: &str = "files";
/// Column family holding secondary index entries (user/project/path/type -> file uuid).
const CF_FILE_INDEX: &str = "file_index";
28
/// RocksDB-backed store for per-file memories.
///
/// Records live in the `files` column family; secondary lookup keys
/// (by user, project, path hash, and file type) live in `file_index`.
pub struct FileMemoryStore {
    // Shared handle to the column-family-enabled RocksDB instance.
    db: Arc<DB>,
    // Scan/indexing configuration; `CodebaseConfig::default()` unless
    // overridden via `with_config`.
    config: CodebaseConfig,
}
35
36impl FileMemoryStore {
    /// Handle to the `files` column family.
    ///
    /// Panics if the CF is missing — the DB must have been opened with
    /// the descriptors from [`Self::cf_descriptors`].
    fn files_cf(&self) -> &ColumnFamily {
        self.db.cf_handle(CF_FILES).expect("files CF must exist")
    }
41
    /// Handle to the `file_index` column family.
    ///
    /// Panics if the CF is missing — the DB must have been opened with
    /// the descriptors from [`Self::cf_descriptors`].
    fn file_index_cf(&self) -> &ColumnFamily {
        self.db
            .cf_handle(CF_FILE_INDEX)
            .expect("file_index CF must exist")
    }
48
49 pub fn cf_descriptors() -> Vec<ColumnFamilyDescriptor> {
52 let mut cf_opts = Options::default();
53 cf_opts.create_if_missing(true);
54 vec![
55 ColumnFamilyDescriptor::new(CF_FILES, cf_opts.clone()),
56 ColumnFamilyDescriptor::new(CF_FILE_INDEX, cf_opts),
57 ]
58 }
59
60 pub fn new(db: Arc<DB>, storage_path: &Path) -> Result<Self> {
63 let files_path = storage_path.join("files");
64 std::fs::create_dir_all(&files_path)?;
65
66 Self::migrate_from_separate_dbs(&files_path, &db)?;
67
68 tracing::info!("File memory store initialized");
69 Ok(Self {
70 db,
71 config: CodebaseConfig::default(),
72 })
73 }
74
    /// One-time migration from the legacy layout (two standalone DBs under
    /// `files/memories` and `files/index`) into the shared DB's column families.
    ///
    /// Best-effort: unopenable old DBs and failed renames are logged and
    /// skipped, never fatal. Only write-back to the new DB can return an error.
    fn migrate_from_separate_dbs(files_path: &Path, db: &DB) -> Result<()> {
        // (old directory name, destination column family)
        let old_dirs: &[(&str, &str)] = &[("memories", CF_FILES), ("index", CF_FILE_INDEX)];

        for (old_name, cf_name) in old_dirs {
            let old_dir = files_path.join(old_name);
            if !old_dir.is_dir() {
                // Nothing to migrate for this pair.
                continue;
            }

            let cf = db
                .cf_handle(cf_name)
                .unwrap_or_else(|| panic!("{cf_name} CF must exist"));
            let old_opts = Options::default();
            // Read-only open: the old DB is never modified, only copied then renamed away.
            match DB::open_for_read_only(&old_opts, &old_dir, false) {
                Ok(old_db) => {
                    let mut batch = WriteBatch::default();
                    let mut count = 0usize;
                    for item in old_db.iterator(rocksdb::IteratorMode::Start) {
                        // Corrupt/unreadable entries are silently skipped.
                        if let Ok((key, value)) = item {
                            batch.put_cf(cf, &key, &value);
                            count += 1;
                            // Flush every 10k entries to bound batch memory;
                            // `take` leaves a fresh empty batch in place.
                            if count % 10_000 == 0 {
                                db.write(std::mem::take(&mut batch))?;
                            }
                        }
                    }
                    // Flush the final partial batch.
                    if !batch.is_empty() {
                        db.write(batch)?;
                    }
                    // Release the read handle before renaming the directory.
                    drop(old_db);
                    tracing::info!(" files/{old_name}: migrated {count} entries to {cf_name} CF");

                    // Keep the old data as a backup rather than deleting it;
                    // replace any backup left over from a previous run.
                    let backup = files_path.join(format!("{old_name}.pre_cf_migration"));
                    if backup.exists() {
                        let _ = std::fs::remove_dir_all(&backup);
                    }
                    if let Err(e) = std::fs::rename(&old_dir, &backup) {
                        // Non-fatal: worst case the migration re-runs (idempotent puts).
                        tracing::warn!("Could not rename old {old_name} dir: {e}");
                    }
                }
                Err(e) => {
                    tracing::warn!("Could not open old {old_name} DB for migration: {e}");
                }
            }
        }
        Ok(())
    }
126
127 pub fn with_config(mut self, config: CodebaseConfig) -> Self {
129 self.config = config;
130 self
131 }
132
133 pub fn flush(&self) -> Result<()> {
135 use rocksdb::FlushOptions;
136 let mut flush_opts = FlushOptions::default();
137 flush_opts.set_wait(true);
138 for cf_name in &[CF_FILES, CF_FILE_INDEX] {
139 if let Some(cf) = self.db.cf_handle(cf_name) {
140 self.db
141 .flush_cf_opt(cf, &flush_opts)
142 .map_err(|e| anyhow::anyhow!("Failed to flush {cf_name}: {e}"))?;
143 }
144 }
145 Ok(())
146 }
147
    /// Named handles to the underlying database(s).
    ///
    /// Both column families live in the single shared DB, hence one entry.
    pub fn databases(&self) -> Vec<(&str, &Arc<DB>)> {
        vec![("files_shared", &self.db)]
    }
152
153 pub fn store(&self, file_memory: &FileMemory) -> Result<()> {
159 let key = format!("{}:{}", file_memory.user_id, file_memory.id.0);
160 let value = serde_json::to_vec(file_memory).context("Failed to serialize file memory")?;
161
162 self.db
163 .put_cf(self.files_cf(), key.as_bytes(), &value)
164 .context("Failed to store file memory")?;
165
166 self.update_indices(file_memory)?;
167
168 tracing::debug!(
169 file_id = %file_memory.id,
170 path = %file_memory.path,
171 user_id = %file_memory.user_id,
172 "Stored file memory"
173 );
174
175 Ok(())
176 }
177
178 pub fn get(&self, user_id: &str, file_id: &FileMemoryId) -> Result<Option<FileMemory>> {
180 let key = format!("{}:{}", user_id, file_id.0);
181
182 match self.db.get_cf(self.files_cf(), key.as_bytes())? {
183 Some(value) => {
184 let file_memory: FileMemory =
185 serde_json::from_slice(&value).context("Failed to deserialize file memory")?;
186 Ok(Some(file_memory))
187 }
188 None => Ok(None),
189 }
190 }
191
192 pub fn get_by_path(
194 &self,
195 user_id: &str,
196 project_id: &ProjectId,
197 path: &str,
198 ) -> Result<Option<FileMemory>> {
199 let path_key = format!(
201 "path:{}:{}:{}",
202 user_id,
203 project_id.0,
204 Self::hash_path(path)
205 );
206
207 match self.db.get_cf(self.file_index_cf(), path_key.as_bytes())? {
208 Some(file_id_bytes) => {
209 let file_id_str =
210 String::from_utf8(file_id_bytes.to_vec()).context("Invalid file ID")?;
211 let file_id = FileMemoryId(
212 uuid::Uuid::parse_str(&file_id_str).context("Invalid file ID UUID")?,
213 );
214 self.get(user_id, &file_id)
215 }
216 None => Ok(None),
217 }
218 }
219
220 pub fn update(&self, file_memory: &FileMemory) -> Result<()> {
222 if let Some(existing) = self.get(&file_memory.user_id, &file_memory.id)? {
224 self.remove_indices(&existing)?;
225 }
226
227 self.store(file_memory)
229 }
230
231 pub fn delete(&self, user_id: &str, file_id: &FileMemoryId) -> Result<bool> {
233 if let Some(file_memory) = self.get(user_id, file_id)? {
234 let key = format!("{}:{}", user_id, file_id.0);
235 self.db.delete_cf(self.files_cf(), key.as_bytes())?;
236 self.remove_indices(&file_memory)?;
237
238 tracing::debug!(
239 file_id = %file_id,
240 path = %file_memory.path,
241 "Deleted file memory"
242 );
243
244 Ok(true)
245 } else {
246 Ok(false)
247 }
248 }
249
250 pub fn delete_project_files(&self, user_id: &str, project_id: &ProjectId) -> Result<usize> {
252 let files = self.list_by_project(user_id, project_id, None)?;
253 let count = files.len();
254
255 for file in files {
256 self.delete(user_id, &file.id)?;
257 }
258
259 tracing::info!(
260 project_id = %project_id.0,
261 count = count,
262 "Deleted all file memories for project"
263 );
264
265 Ok(count)
266 }
267
268 pub fn list_by_user(&self, user_id: &str, limit: Option<usize>) -> Result<Vec<FileMemory>> {
274 let prefix = format!("user:{}:", user_id);
275 let mut files = Vec::new();
276
277 let iter = self
278 .db
279 .prefix_iterator_cf(self.file_index_cf(), prefix.as_bytes());
280 for item in iter {
281 let (key, file_id_bytes) = item?;
282 let key_str = String::from_utf8_lossy(&key);
283
284 if !key_str.starts_with(&prefix) {
286 break;
287 }
288
289 let file_id_str = String::from_utf8(file_id_bytes.to_vec())?;
290 let file_id =
291 FileMemoryId(uuid::Uuid::parse_str(&file_id_str).context("Invalid file ID")?);
292
293 if let Some(file) = self.get(user_id, &file_id)? {
294 files.push(file);
295
296 if let Some(lim) = limit {
297 if files.len() >= lim {
298 break;
299 }
300 }
301 }
302 }
303
304 files.sort_by(|a, b| b.access_count.cmp(&a.access_count));
306
307 Ok(files)
308 }
309
310 pub fn list_by_project(
312 &self,
313 user_id: &str,
314 project_id: &ProjectId,
315 limit: Option<usize>,
316 ) -> Result<Vec<FileMemory>> {
317 let prefix = format!("project:{}:{}:", user_id, project_id.0);
318 let mut files = Vec::new();
319
320 let iter = self
321 .db
322 .prefix_iterator_cf(self.file_index_cf(), prefix.as_bytes());
323 for item in iter {
324 let (key, file_id_bytes) = item?;
325 let key_str = String::from_utf8_lossy(&key);
326
327 if !key_str.starts_with(&prefix) {
328 break;
329 }
330
331 let file_id_str = String::from_utf8(file_id_bytes.to_vec())?;
332 let file_id =
333 FileMemoryId(uuid::Uuid::parse_str(&file_id_str).context("Invalid file ID")?);
334
335 if let Some(file) = self.get(user_id, &file_id)? {
336 files.push(file);
337
338 if let Some(lim) = limit {
339 if files.len() >= lim {
340 break;
341 }
342 }
343 }
344 }
345
346 files.sort_by(|a, b| a.path.cmp(&b.path));
348
349 Ok(files)
350 }
351
352 pub fn list_by_type(
354 &self,
355 user_id: &str,
356 project_id: &ProjectId,
357 file_type: &FileType,
358 limit: Option<usize>,
359 ) -> Result<Vec<FileMemory>> {
360 let type_str = format!("{:?}", file_type);
361 let prefix = format!("type:{}:{}:{}:", user_id, project_id.0, type_str);
362 let mut files = Vec::new();
363
364 let iter = self
365 .db
366 .prefix_iterator_cf(self.file_index_cf(), prefix.as_bytes());
367 for item in iter {
368 let (key, file_id_bytes) = item?;
369 let key_str = String::from_utf8_lossy(&key);
370
371 if !key_str.starts_with(&prefix) {
372 break;
373 }
374
375 let file_id_str = String::from_utf8(file_id_bytes.to_vec())?;
376 let file_id =
377 FileMemoryId(uuid::Uuid::parse_str(&file_id_str).context("Invalid file ID")?);
378
379 if let Some(file) = self.get(user_id, &file_id)? {
380 files.push(file);
381
382 if let Some(lim) = limit {
383 if files.len() >= lim {
384 break;
385 }
386 }
387 }
388 }
389
390 Ok(files)
391 }
392
393 pub fn count_by_project(&self, user_id: &str, project_id: &ProjectId) -> Result<usize> {
395 let prefix = format!("project:{}:{}:", user_id, project_id.0);
396 let mut count = 0;
397
398 let iter = self
399 .db
400 .prefix_iterator_cf(self.file_index_cf(), prefix.as_bytes());
401 for item in iter {
402 let (key, _) = item?;
403 let key_str = String::from_utf8_lossy(&key);
404
405 if !key_str.starts_with(&prefix) {
406 break;
407 }
408
409 count += 1;
410 }
411
412 Ok(count)
413 }
414
415 pub fn record_access(
421 &self,
422 user_id: &str,
423 project_id: &ProjectId,
424 path: &str,
425 learned_from: LearnedFrom,
426 ) -> Result<Option<FileMemory>> {
427 if let Some(mut file) = self.get_by_path(user_id, project_id, path)? {
428 file.record_access(learned_from);
429 self.update(&file)?;
430 Ok(Some(file))
431 } else {
432 Ok(None)
433 }
434 }
435
436 pub fn scan_codebase(
442 &self,
443 codebase_path: &Path,
444 config: Option<&CodebaseConfig>,
445 ) -> Result<CodebaseScanResult> {
446 let config = config.unwrap_or(&self.config);
447 let mut result = CodebaseScanResult {
448 total_files: 0,
449 eligible_files: 0,
450 skipped_files: 0,
451 skip_reasons: HashMap::new(),
452 limit_reached: false,
453 file_paths: Vec::new(),
454 };
455
456 let exclude_patterns: Vec<Pattern> = config
458 .exclude_patterns
459 .iter()
460 .filter_map(|p| Pattern::new(p).ok())
461 .collect();
462
463 self.scan_directory_recursive(
464 codebase_path,
465 codebase_path,
466 &exclude_patterns,
467 config,
468 &mut result,
469 )?;
470
471 tracing::info!(
472 path = %codebase_path.display(),
473 total = result.total_files,
474 eligible = result.eligible_files,
475 skipped = result.skipped_files,
476 limit_reached = result.limit_reached,
477 "Scanned codebase"
478 );
479
480 Ok(result)
481 }
482
    /// Depth-first walk that fills `result` with eligible file paths
    /// (relative to `root`, `/`-separated) and per-reason skip counts.
    ///
    /// Skips hard-coded junk directories, user exclude globs, binary files,
    /// and oversized files; stops once `config.max_files_per_project`
    /// eligible files have been found.
    fn scan_directory_recursive(
        &self,
        root: &Path,
        current: &Path,
        exclude_patterns: &[Pattern],
        config: &CodebaseConfig,
        result: &mut CodebaseScanResult,
    ) -> Result<()> {
        // A sibling call may already have hit the cap; stop descending.
        if result.limit_reached {
            return Ok(());
        }

        let entries = match fs::read_dir(current) {
            Ok(e) => e,
            Err(e) => {
                // Unreadable directories are logged and skipped, not fatal.
                tracing::warn!(path = %current.display(), error = %e, "Failed to read directory");
                return Ok(());
            }
        };

        // Directory names always skipped regardless of user config:
        // VCS metadata, dependency/venv trees, build output, editor state, etc.
        const EXCLUDED_DIR_NAMES: &[&str] = &[
            ".git",
            ".svn",
            ".hg",
            ".bzr",
            "node_modules",
            "__pycache__",
            ".venv",
            "venv",
            "env",
            ".env",
            "virtualenv",
            "site-packages",
            "Lib",
            "Scripts",
            "target",
            "dist",
            "build",
            "out",
            "bin",
            ".idea",
            ".vscode",
            ".cache",
            ".tmp",
            "tmp",
            "data",
            "logs",
            "coverage",
            "release-test",
            "test-wheel",
        ];

        // Directory-name suffixes that also mark generated/derived trees.
        const EXCLUDED_DIR_SUFFIXES: &[&str] = &[
            "_data", "_cache", "_output", "_venv", "_env", "_install",
        ];

        for entry in entries {
            let entry = match entry {
                Ok(e) => e,
                Err(_) => continue,
            };

            let path = entry.path();
            let file_name = entry.file_name();
            let file_name_str = file_name.to_string_lossy();

            if path.is_dir() {
                // Exact-name junk directory: count under "name/" and prune.
                if EXCLUDED_DIR_NAMES.iter().any(|&name| file_name_str == name) {
                    *result
                        .skip_reasons
                        .entry(format!("{}/", file_name_str))
                        .or_insert(0) += 1;
                    result.skipped_files += 1;
                    continue;
                }
                // Suffix-matched junk directory. NOTE(review): the reason key
                // records the text after the last '_' (e.g. "my_cache" ->
                // "*cache/"), not the configured "_cache" suffix — confirm
                // this rendering is intentional.
                if EXCLUDED_DIR_SUFFIXES
                    .iter()
                    .any(|&suffix| file_name_str.ends_with(suffix))
                {
                    *result
                        .skip_reasons
                        .entry(format!(
                            "*{}/",
                            file_name_str
                                .rsplit_once('_')
                                .map_or(&file_name_str[..], |(_, s)| s)
                        ))
                        .or_insert(0) += 1;
                    result.skipped_files += 1;
                    continue;
                }
            }

            // Path relative to the scan root, normalized to forward slashes.
            let relative_path = path
                .strip_prefix(root)
                .unwrap_or(&path)
                .to_string_lossy()
                .replace('\\', "/");

            // User-configured exclude globs, applied to files AND directories.
            let mut excluded = false;
            for pattern in exclude_patterns {
                let pattern_str = pattern.as_str();
                if pattern_str.ends_with('/') {
                    // Trailing-slash patterns mean "this directory and below".
                    let dir_name = pattern_str.trim_end_matches('/');
                    if relative_path == dir_name
                        || relative_path.starts_with(&format!("{}/", dir_name))
                    {
                        *result.skip_reasons.entry(pattern.to_string()).or_insert(0) += 1;
                        excluded = true;
                        break;
                    }
                } else if pattern.matches(&relative_path) || pattern.matches(&file_name_str) {
                    *result.skip_reasons.entry(pattern.to_string()).or_insert(0) += 1;
                    excluded = true;
                    break;
                }
            }

            if excluded {
                // NOTE(review): pattern-excluded files bump skipped_files but
                // not total_files (which is only incremented below), so
                // total_files != eligible + skipped — confirm intended.
                result.skipped_files += 1;
                continue;
            }

            if path.is_dir() {
                self.scan_directory_recursive(root, &path, exclude_patterns, config, result)?;
            } else if path.is_file() {
                result.total_files += 1;

                if config.skip_binary && Self::is_likely_binary(&path) {
                    *result.skip_reasons.entry("binary".to_string()).or_insert(0) += 1;
                    result.skipped_files += 1;
                    continue;
                }

                // Files whose metadata can't be read fall through as eligible.
                if let Ok(metadata) = path.metadata() {
                    if metadata.len() > config.max_file_size_for_embedding as u64 {
                        *result
                            .skip_reasons
                            .entry("too_large".to_string())
                            .or_insert(0) += 1;
                        result.skipped_files += 1;
                        continue;
                    }
                }

                result.eligible_files += 1;
                result.file_paths.push(relative_path);

                // Hard cap on eligible files; flag and unwind immediately.
                if result.eligible_files >= config.max_files_per_project {
                    result.limit_reached = true;
                    return Ok(());
                }
            }
        }

        Ok(())
    }
657
658 fn is_likely_binary(path: &Path) -> bool {
660 let binary_extensions = [
661 "exe", "dll", "so", "dylib", "bin", "obj", "o", "a", "lib", "png", "jpg", "jpeg",
662 "gif", "bmp", "ico", "webp", "mp3", "mp4", "avi", "mov", "mkv", "wav", "flac", "zip",
663 "tar", "gz", "rar", "7z", "pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "woff",
664 "woff2", "ttf", "otf", "eot", "class", "pyc", "pyo", "wasm",
665 ];
666
667 path.extension()
668 .and_then(|e| e.to_str())
669 .map(|e| binary_extensions.contains(&e.to_lowercase().as_str()))
670 .unwrap_or(false)
671 }
672
673 pub fn hash_file_content(content: &[u8]) -> String {
679 let mut hasher = Sha256::new();
680 hasher.update(content);
681 format!("{:x}", hasher.finalize())
682 }
683
684 fn hash_path(path: &str) -> String {
686 let mut hasher = Sha256::new();
687 hasher.update(path.as_bytes());
688 format!("{:x}", hasher.finalize())[..16].to_string()
689 }
690
691 pub fn index_file(
697 &self,
698 codebase_root: &Path,
699 relative_path: &str,
700 project_id: &ProjectId,
701 user_id: &str,
702 ) -> Result<FileMemory> {
703 let absolute_path = codebase_root.join(relative_path);
704 let content = fs::read(&absolute_path)
705 .with_context(|| format!("Failed to read file: {}", absolute_path.display()))?;
706
707 let file_hash = Self::hash_file_content(&content);
708 let size_bytes = content.len() as u64;
709
710 let content_str = String::from_utf8_lossy(&content);
712 let line_count = content_str.lines().count();
713
714 let file_type = absolute_path
716 .extension()
717 .and_then(|e| e.to_str())
718 .map(FileType::from_extension)
719 .unwrap_or_default();
720
721 let key_items = Self::extract_key_items(&content_str, &file_type);
723
724 let mut file_memory = FileMemory::new(
726 project_id.clone(),
727 user_id.to_string(),
728 relative_path.to_string(),
729 absolute_path.to_string_lossy().to_string(),
730 file_hash,
731 file_type,
732 line_count,
733 size_bytes,
734 );
735
736 file_memory.key_items = key_items;
737
738 self.store(&file_memory)?;
740
741 Ok(file_memory)
742 }
743
744 pub fn index_file_with_embedding<E: crate::embeddings::Embedder>(
746 &self,
747 codebase_root: &Path,
748 relative_path: &str,
749 project_id: &ProjectId,
750 user_id: &str,
751 embedder: &E,
752 ) -> Result<FileMemory> {
753 let mut file_memory = self.index_file(codebase_root, relative_path, project_id, user_id)?;
754
755 let embed_content = Self::prepare_embed_content(&file_memory);
757 if !embed_content.is_empty() {
758 match embedder.encode(&embed_content) {
759 Ok(embedding) => {
760 file_memory.embedding = Some(embedding);
761 self.update(&file_memory)?;
762 }
763 Err(e) => {
764 tracing::warn!(
765 path = %file_memory.path,
766 error = %e,
767 "Failed to generate embedding for file"
768 );
769 }
770 }
771 }
772
773 Ok(file_memory)
774 }
775
776 fn prepare_embed_content(file: &FileMemory) -> String {
778 let mut parts = Vec::new();
779
780 parts.push(file.path.clone());
782
783 if !file.key_items.is_empty() {
785 parts.push(file.key_items.join(" "));
786 }
787
788 if !file.summary.is_empty() {
790 parts.push(file.summary.clone());
791 }
792
793 if let Some(ref purpose) = file.purpose {
795 parts.push(purpose.clone());
796 }
797
798 parts.join(" | ")
799 }
800
801 fn extract_key_items(content: &str, file_type: &FileType) -> Vec<String> {
803 let mut items = Vec::new();
804
805 match file_type {
806 FileType::Rust => {
807 for line in content.lines() {
809 let trimmed = line.trim();
810 if trimmed.starts_with("pub fn ")
811 || trimmed.starts_with("pub async fn ")
812 || trimmed.starts_with("pub struct ")
813 || trimmed.starts_with("pub enum ")
814 || trimmed.starts_with("pub trait ")
815 || trimmed.starts_with("impl ")
816 {
817 if let Some(name) = Self::extract_rust_name(trimmed) {
819 if !items.contains(&name) {
820 items.push(name);
821 }
822 }
823 }
824 }
825 }
826 FileType::TypeScript | FileType::JavaScript => {
827 for line in content.lines() {
829 let trimmed = line.trim();
830 if trimmed.starts_with("export ")
831 || trimmed.starts_with("function ")
832 || trimmed.starts_with("class ")
833 || trimmed.starts_with("interface ")
834 || trimmed.starts_with("const ")
835 {
836 if let Some(name) = Self::extract_js_name(trimmed) {
837 if !items.contains(&name) {
838 items.push(name);
839 }
840 }
841 }
842 }
843 }
844 FileType::Python => {
845 for line in content.lines() {
847 let trimmed = line.trim();
848 if trimmed.starts_with("def ")
849 || trimmed.starts_with("async def ")
850 || trimmed.starts_with("class ")
851 {
852 if let Some(name) = Self::extract_python_name(trimmed) {
853 if !items.contains(&name) {
854 items.push(name);
855 }
856 }
857 }
858 }
859 }
860 FileType::Go => {
861 for line in content.lines() {
863 let trimmed = line.trim();
864 if trimmed.starts_with("func ") || trimmed.starts_with("type ") {
865 if let Some(name) = Self::extract_go_name(trimmed) {
866 if !items.contains(&name) {
867 items.push(name);
868 }
869 }
870 }
871 }
872 }
873 _ => {
874 }
877 }
878
879 items.truncate(50);
881 items
882 }
883
884 fn extract_rust_name(line: &str) -> Option<String> {
885 let line = line.trim_start_matches("pub ").trim_start_matches("async ");
889
890 if line.starts_with("fn ") {
891 let rest = line.strip_prefix("fn ")?;
892 let name = rest.split(|c| c == '(' || c == '<').next()?;
893 Some(name.trim().to_string())
894 } else if line.starts_with("struct ") {
895 let rest = line.strip_prefix("struct ")?;
896 let name = rest.split(|c| c == '{' || c == '<' || c == '(').next()?;
897 Some(name.trim().to_string())
898 } else if line.starts_with("enum ") {
899 let rest = line.strip_prefix("enum ")?;
900 let name = rest.split(|c| c == '{' || c == '<').next()?;
901 Some(name.trim().to_string())
902 } else if line.starts_with("trait ") {
903 let rest = line.strip_prefix("trait ")?;
904 let name = rest.split(|c| c == '{' || c == '<' || c == ':').next()?;
905 Some(name.trim().to_string())
906 } else if line.starts_with("impl ") {
907 let rest = line.strip_prefix("impl ")?;
908 let sig = rest.split('{').next()?;
909 Some(sig.trim().to_string())
910 } else {
911 None
912 }
913 }
914
915 fn extract_js_name(line: &str) -> Option<String> {
916 let line = line
919 .trim_start_matches("export ")
920 .trim_start_matches("default ")
921 .trim_start_matches("async ");
922
923 if line.starts_with("function ") {
924 let rest = line.strip_prefix("function ")?;
925 let name = rest.split('(').next()?;
926 Some(name.trim().to_string())
927 } else if line.starts_with("class ") {
928 let rest = line.strip_prefix("class ")?;
929 let name = rest.split(|c| c == '{' || c == ' ').next()?;
930 Some(name.trim().to_string())
931 } else if line.starts_with("interface ") {
932 let rest = line.strip_prefix("interface ")?;
933 let name = rest.split(|c| c == '{' || c == ' ' || c == '<').next()?;
934 Some(name.trim().to_string())
935 } else if line.starts_with("const ") {
936 let rest = line.strip_prefix("const ")?;
937 let name = rest.split(|c| c == '=' || c == ':').next()?;
938 Some(name.trim().to_string())
939 } else {
940 None
941 }
942 }
943
944 fn extract_python_name(line: &str) -> Option<String> {
945 let line = line.trim_start_matches("async ");
948
949 if line.starts_with("def ") {
950 let rest = line.strip_prefix("def ")?;
951 let name = rest.split('(').next()?;
952 Some(name.trim().to_string())
953 } else if line.starts_with("class ") {
954 let rest = line.strip_prefix("class ")?;
955 let name = rest.split(|c| c == '(' || c == ':').next()?;
956 Some(name.trim().to_string())
957 } else {
958 None
959 }
960 }
961
962 fn extract_go_name(line: &str) -> Option<String> {
963 if line.starts_with("func ") {
967 let rest = line.strip_prefix("func ")?;
968 if rest.starts_with('(') {
970 let after_receiver = rest.split(')').nth(1)?;
972 let name = after_receiver.trim().split('(').next()?;
973 Some(name.trim().to_string())
974 } else {
975 let name = rest.split('(').next()?;
976 Some(name.trim().to_string())
977 }
978 } else if line.starts_with("type ") {
979 let rest = line.strip_prefix("type ")?;
980 let name = rest.split_whitespace().next()?;
981 Some(name.trim().to_string())
982 } else {
983 None
984 }
985 }
986
987 pub fn index_codebase(
989 &self,
990 codebase_root: &Path,
991 project_id: &ProjectId,
992 user_id: &str,
993 config: Option<&CodebaseConfig>,
994 ) -> Result<IndexingResult> {
995 let scan_result = self.scan_codebase(codebase_root, config)?;
997
998 let mut result = IndexingResult {
999 total_files: scan_result.eligible_files,
1000 indexed_files: 0,
1001 skipped_files: 0,
1002 errors: Vec::new(),
1003 };
1004
1005 for relative_path in &scan_result.file_paths {
1006 match self.index_file(codebase_root, relative_path, project_id, user_id) {
1007 Ok(_) => {
1008 result.indexed_files += 1;
1009 }
1010 Err(e) => {
1011 result.errors.push(format!("{}: {}", relative_path, e));
1012 result.skipped_files += 1;
1013 }
1014 }
1015 }
1016
1017 tracing::info!(
1018 path = %codebase_root.display(),
1019 total = result.total_files,
1020 indexed = result.indexed_files,
1021 skipped = result.skipped_files,
1022 errors = result.errors.len(),
1023 "Indexed codebase"
1024 );
1025
1026 Ok(result)
1027 }
1028
    /// Scans and indexes a codebase with embeddings, reporting progress.
    ///
    /// `progress_callback`, when provided, is invoked after every file and
    /// once more at the end with `complete = true`. Per-file failures are
    /// recorded (in both the result and the progress) and do not abort.
    pub fn index_codebase_with_embeddings<E: crate::embeddings::Embedder>(
        &self,
        codebase_root: &Path,
        project_id: &ProjectId,
        user_id: &str,
        embedder: &E,
        config: Option<&CodebaseConfig>,
        progress_callback: Option<&dyn Fn(IndexingProgress)>,
    ) -> Result<IndexingResult> {
        let scan_result = self.scan_codebase(codebase_root, config)?;

        let mut result = IndexingResult {
            total_files: scan_result.eligible_files,
            indexed_files: 0,
            skipped_files: 0,
            errors: Vec::new(),
        };

        let mut progress = IndexingProgress::new(scan_result.eligible_files);

        for relative_path in &scan_result.file_paths {
            // Expose the file being worked on before processing it.
            progress.current_file = Some(relative_path.clone());

            match self.index_file_with_embedding(
                codebase_root,
                relative_path,
                project_id,
                user_id,
                embedder,
            ) {
                Ok(_) => {
                    result.indexed_files += 1;
                }
                Err(e) => {
                    // Mirror the error into both the result and the progress.
                    let error_msg = format!("{}: {}", relative_path, e);
                    result.errors.push(error_msg.clone());
                    progress.errors.push(error_msg);
                    result.skipped_files += 1;
                }
            }

            progress.processed += 1;

            if let Some(cb) = progress_callback {
                cb(progress.clone());
            }
        }

        // Final callback with the completed progress (moved, not cloned).
        progress.complete = true;
        if let Some(cb) = progress_callback {
            cb(progress);
        }

        tracing::info!(
            path = %codebase_root.display(),
            total = result.total_files,
            indexed = result.indexed_files,
            skipped = result.skipped_files,
            errors = result.errors.len(),
            "Indexed codebase with embeddings"
        );

        Ok(result)
    }
1094
1095 fn update_indices(&self, file: &FileMemory) -> Result<()> {
1100 let mut batch = WriteBatch::default();
1101 let id_str = file.id.0.to_string();
1102 let idx_cf = self.file_index_cf();
1103
1104 let user_key = format!("user:{}:{}", file.user_id, id_str);
1106 batch.put_cf(idx_cf, user_key.as_bytes(), id_str.as_bytes());
1107
1108 let project_key = format!("project:{}:{}:{}", file.user_id, file.project_id.0, id_str);
1110 batch.put_cf(idx_cf, project_key.as_bytes(), id_str.as_bytes());
1111
1112 let path_key = format!(
1114 "path:{}:{}:{}",
1115 file.user_id,
1116 file.project_id.0,
1117 Self::hash_path(&file.path)
1118 );
1119 batch.put_cf(idx_cf, path_key.as_bytes(), id_str.as_bytes());
1120
1121 let type_str = format!("{:?}", file.file_type);
1123 let type_key = format!(
1124 "type:{}:{}:{}:{}",
1125 file.user_id, file.project_id.0, type_str, id_str
1126 );
1127 batch.put_cf(idx_cf, type_key.as_bytes(), id_str.as_bytes());
1128
1129 self.db
1130 .write(batch)
1131 .context("Failed to update file memory indices")?;
1132
1133 Ok(())
1134 }
1135
1136 fn remove_indices(&self, file: &FileMemory) -> Result<()> {
1137 let mut batch = WriteBatch::default();
1138 let id_str = file.id.0.to_string();
1139 let idx_cf = self.file_index_cf();
1140
1141 let user_key = format!("user:{}:{}", file.user_id, id_str);
1142 batch.delete_cf(idx_cf, user_key.as_bytes());
1143
1144 let project_key = format!("project:{}:{}:{}", file.user_id, file.project_id.0, id_str);
1145 batch.delete_cf(idx_cf, project_key.as_bytes());
1146
1147 let path_key = format!(
1148 "path:{}:{}:{}",
1149 file.user_id,
1150 file.project_id.0,
1151 Self::hash_path(&file.path)
1152 );
1153 batch.delete_cf(idx_cf, path_key.as_bytes());
1154
1155 let type_str = format!("{:?}", file.file_type);
1156 let type_key = format!(
1157 "type:{}:{}:{}:{}",
1158 file.user_id, file.project_id.0, type_str, id_str
1159 );
1160 batch.delete_cf(idx_cf, type_key.as_bytes());
1161
1162 self.db.write(batch)?;
1163 Ok(())
1164 }
1165
1166 pub fn stats(&self, user_id: &str) -> Result<FileMemoryStats> {
1172 let files = self.list_by_user(user_id, None)?;
1173
1174 let total_files = files.len();
1175 let total_size: u64 = files.iter().map(|f| f.size_bytes).sum();
1176 let total_lines: usize = files.iter().map(|f| f.line_count).sum();
1177 let total_accesses: u32 = files.iter().map(|f| f.access_count).sum();
1178
1179 let mut by_type: HashMap<String, usize> = HashMap::new();
1181 for file in &files {
1182 let type_str = format!("{:?}", file.file_type);
1183 *by_type.entry(type_str).or_insert(0) += 1;
1184 }
1185
1186 let mut by_source: HashMap<String, usize> = HashMap::new();
1188 for file in &files {
1189 let source_str = format!("{:?}", file.learned_from);
1190 *by_source.entry(source_str).or_insert(0) += 1;
1191 }
1192
1193 Ok(FileMemoryStats {
1194 total_files,
1195 total_size_bytes: total_size,
1196 total_lines,
1197 total_accesses,
1198 by_type,
1199 by_source,
1200 })
1201 }
1202}
1203
/// Aggregated per-user statistics returned by [`FileMemoryStore::stats`].
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct FileMemoryStats {
    /// Number of file memories the user has.
    pub total_files: usize,
    /// Sum of `size_bytes` over all files.
    pub total_size_bytes: u64,
    /// Sum of `line_count` over all files.
    pub total_lines: usize,
    /// Sum of `access_count` over all files.
    pub total_accesses: u32,
    /// File count keyed by the `Debug` rendering of the file type.
    pub by_type: HashMap<String, usize>,
    /// File count keyed by the `Debug` rendering of `learned_from`.
    pub by_source: HashMap<String, usize>,
}
1214
/// Outcome of a bulk indexing run over a scanned codebase.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct IndexingResult {
    /// Number of eligible files the scan produced (the indexing targets).
    pub total_files: usize,
    /// Files successfully indexed and stored.
    pub indexed_files: usize,
    /// Files that failed to index; one message per failure in `errors`.
    pub skipped_files: usize,
    /// `"path: error"` messages for each failed file.
    pub errors: Vec<String>,
}
1227
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Opens a fresh RocksDB in a temp dir with an explicit "default" CF
    /// plus the store's CFs, and wraps it in a `FileMemoryStore`.
    fn create_test_store() -> (FileMemoryStore, TempDir) {
        let temp_dir = TempDir::new().unwrap();
        let db_path = temp_dir.path().join("files_db");

        let mut opts = Options::default();
        opts.create_if_missing(true);
        opts.create_missing_column_families(true);

        let mut cfs = vec![ColumnFamilyDescriptor::new("default", {
            let mut o = Options::default();
            o.create_if_missing(true);
            o
        })];
        cfs.extend(FileMemoryStore::cf_descriptors());
        let db = Arc::new(DB::open_cf_descriptors(&opts, &db_path, cfs).unwrap());
        let store = FileMemoryStore::new(db, temp_dir.path()).unwrap();
        (store, temp_dir)
    }

    /// Round-trips a record through `store`, `get`, and the path index.
    #[test]
    fn test_store_and_retrieve() {
        let (store, _dir) = create_test_store();

        let project_id = ProjectId::new();
        let file = FileMemory::new(
            project_id.clone(),
            "test-user".to_string(),
            "src/main.rs".to_string(),
            "/home/user/project/src/main.rs".to_string(),
            "abc123".to_string(),
            FileType::Rust,
            100,
            5000,
        );

        store.store(&file).unwrap();

        let retrieved = store.get("test-user", &file.id).unwrap().unwrap();
        assert_eq!(retrieved.path, "src/main.rs");
        assert_eq!(retrieved.file_type, FileType::Rust);

        // Path lookup goes through the file_index CF.
        let by_path = store
            .get_by_path("test-user", &project_id, "src/main.rs")
            .unwrap()
            .unwrap();
        assert_eq!(by_path.id, file.id);
    }

    /// All files stored under a project come back from the project index.
    #[test]
    fn test_list_by_project() {
        let (store, _dir) = create_test_store();

        let project_id = ProjectId::new();

        for i in 0..5 {
            let file = FileMemory::new(
                project_id.clone(),
                "test-user".to_string(),
                format!("src/file{}.rs", i),
                format!("/home/user/project/src/file{}.rs", i),
                format!("hash{}", i),
                FileType::Rust,
                100,
                5000,
            );
            store.store(&file).unwrap();
        }

        let files = store
            .list_by_project("test-user", &project_id, None)
            .unwrap();
        assert_eq!(files.len(), 5);
    }

    /// `record_access` bumps the counter and updates `learned_from`.
    #[test]
    fn test_record_access() {
        let (store, _dir) = create_test_store();

        let project_id = ProjectId::new();
        let file = FileMemory::new(
            project_id.clone(),
            "test-user".to_string(),
            "src/main.rs".to_string(),
            "/home/user/project/src/main.rs".to_string(),
            "abc123".to_string(),
            FileType::Rust,
            100,
            5000,
        );

        store.store(&file).unwrap();

        let updated = store
            .record_access(
                "test-user",
                &project_id,
                "src/main.rs",
                LearnedFrom::ReadAccess,
            )
            .unwrap()
            .unwrap();

        // NOTE(review): expects a fresh record to start at access_count == 1
        // so one recorded access yields 2 — confirm against `FileMemory::new`.
        assert_eq!(updated.access_count, 2);
        assert_eq!(updated.learned_from, LearnedFrom::ReadAccess);
    }

    /// Deleting removes the record; a second `get` finds nothing.
    #[test]
    fn test_delete() {
        let (store, _dir) = create_test_store();

        let project_id = ProjectId::new();
        let file = FileMemory::new(
            project_id.clone(),
            "test-user".to_string(),
            "src/main.rs".to_string(),
            "/home/user/project/src/main.rs".to_string(),
            "abc123".to_string(),
            FileType::Rust,
            100,
            5000,
        );

        store.store(&file).unwrap();

        let deleted = store.delete("test-user", &file.id).unwrap();
        assert!(deleted);

        let retrieved = store.get("test-user", &file.id).unwrap();
        assert!(retrieved.is_none());
    }

    /// Spot-checks `FileType::from_extension` for the supported languages.
    #[test]
    fn test_file_type_detection() {
        assert_eq!(FileType::from_extension("rs"), FileType::Rust);
        assert_eq!(FileType::from_extension("ts"), FileType::TypeScript);
        assert_eq!(FileType::from_extension("tsx"), FileType::TypeScript);
        assert_eq!(FileType::from_extension("py"), FileType::Python);
        assert_eq!(FileType::from_extension("go"), FileType::Go);
        assert_eq!(FileType::from_extension("md"), FileType::Markdown);
        assert!(matches!(
            FileType::from_extension("unknown"),
            FileType::Other(_)
        ));
    }
}