aurora_semantic/storage/
disk.rs

1//! Disk-based storage implementation.
2
3use std::collections::HashMap;
4use std::fs::{self, File};
5use std::io::{BufReader, BufWriter, Write};
6use std::path::{Path, PathBuf};
7
8use parking_lot::RwLock;
9
10use crate::error::{Error, Result};
11use crate::storage::{Storage, StorageOptions, WorkspaceMetadata};
12use crate::types::{Chunk, Document, LanguageStats, WorkspaceId, WorkspaceStats};
13
14/// File-based storage implementation.
15pub struct DiskStorage {
16    /// Root directory for all storage.
17    root_dir: PathBuf,
18    /// Storage options.
19    options: StorageOptions,
20    /// Cached metadata.
21    metadata_cache: RwLock<HashMap<WorkspaceId, WorkspaceMetadata>>,
22}
23
24impl DiskStorage {
25    /// Create a new disk storage instance.
26    pub fn new(root_dir: PathBuf) -> Result<Self> {
27        Self::with_options(root_dir, StorageOptions::default())
28    }
29
30    /// Create a new disk storage with custom options.
31    pub fn with_options(root_dir: PathBuf, options: StorageOptions) -> Result<Self> {
32        // Create root directory if it doesn't exist
33        fs::create_dir_all(&root_dir)?;
34
35        Ok(Self {
36            root_dir,
37            options,
38            metadata_cache: RwLock::new(HashMap::new()),
39        })
40    }
41
42    /// Get the directory for a specific workspace.
43    fn workspace_dir(&self, workspace_id: &WorkspaceId) -> PathBuf {
44        self.root_dir.join(workspace_id.to_string())
45    }
46
47    /// Get the metadata file path for a workspace.
48    fn metadata_path(&self, workspace_id: &WorkspaceId) -> PathBuf {
49        self.workspace_dir(workspace_id).join("metadata.json")
50    }
51
52    /// Get the documents file path for a workspace.
53    fn documents_path(&self, workspace_id: &WorkspaceId) -> PathBuf {
54        self.workspace_dir(workspace_id).join("documents.bin")
55    }
56
57    /// Get the chunks file path for a workspace.
58    fn chunks_path(&self, workspace_id: &WorkspaceId) -> PathBuf {
59        self.workspace_dir(workspace_id).join("chunks.bin")
60    }
61
62    /// Get the embeddings file path for a workspace.
63    fn embeddings_path(&self, workspace_id: &WorkspaceId) -> PathBuf {
64        self.workspace_dir(workspace_id).join("embeddings.bin")
65    }
66
67    /// Serialize data to a file.
68    fn write_bincode<T: serde::Serialize>(&self, path: &Path, data: &T) -> Result<()> {
69        let file = File::create(path)?;
70        let mut writer = BufWriter::with_capacity(self.options.buffer_size, file);
71        bincode::serialize_into(&mut writer, data)?;
72        writer.flush()?;
73        Ok(())
74    }
75
76    /// Deserialize data from a file.
77    fn read_bincode<T: serde::de::DeserializeOwned>(&self, path: &Path) -> Result<T> {
78        let file = File::open(path)?;
79        let reader = BufReader::with_capacity(self.options.buffer_size, file);
80        let data = bincode::deserialize_from(reader)?;
81        Ok(data)
82    }
83
84    /// Write JSON data to a file.
85    fn write_json<T: serde::Serialize>(&self, path: &Path, data: &T) -> Result<()> {
86        let file = File::create(path)?;
87        let writer = BufWriter::with_capacity(self.options.buffer_size, file);
88        serde_json::to_writer_pretty(writer, data)?;
89        Ok(())
90    }
91
92    /// Read JSON data from a file.
93    fn read_json<T: serde::de::DeserializeOwned>(&self, path: &Path) -> Result<T> {
94        let file = File::open(path)?;
95        let reader = BufReader::with_capacity(self.options.buffer_size, file);
96        let data = serde_json::from_reader(reader)?;
97        Ok(data)
98    }
99
100    /// Calculate directory size recursively.
101    fn dir_size(&self, path: &Path) -> u64 {
102        if !path.exists() {
103            return 0;
104        }
105
106        walkdir::WalkDir::new(path)
107            .into_iter()
108            .filter_map(|e| e.ok())
109            .filter(|e| e.file_type().is_file())
110            .filter_map(|e| e.metadata().ok())
111            .map(|m| m.len())
112            .sum()
113    }
114}
115
116impl Storage for DiskStorage {
117    fn init_workspace(&self, workspace_id: &WorkspaceId, root_path: &PathBuf) -> Result<()> {
118        let ws_dir = self.workspace_dir(workspace_id);
119        fs::create_dir_all(&ws_dir)?;
120
121        // Create subdirectories
122        fs::create_dir_all(self.tantivy_index_path(workspace_id))?;
123
124        // Create initial metadata
125        let metadata = WorkspaceMetadata::new(
126            workspace_id.clone(),
127            root_path.clone(),
128            384, // Default embedding dimension
129        );
130
131        self.save_workspace_metadata(workspace_id, &metadata)?;
132
133        Ok(())
134    }
135
136    fn workspace_exists(&self, workspace_id: &WorkspaceId) -> bool {
137        self.metadata_path(workspace_id).exists()
138    }
139
140    fn load_workspace_metadata(&self, workspace_id: &WorkspaceId) -> Result<WorkspaceMetadata> {
141        // Check cache first
142        {
143            let cache = self.metadata_cache.read();
144            if let Some(meta) = cache.get(workspace_id) {
145                return Ok(meta.clone());
146            }
147        }
148
149        let path = self.metadata_path(workspace_id);
150        if !path.exists() {
151            return Err(Error::WorkspaceNotFound(workspace_id.clone()));
152        }
153
154        let metadata: WorkspaceMetadata = self.read_json(&path)?;
155
156        // Update cache
157        {
158            let mut cache = self.metadata_cache.write();
159            cache.insert(workspace_id.clone(), metadata.clone());
160        }
161
162        Ok(metadata)
163    }
164
165    fn save_workspace_metadata(
166        &self,
167        workspace_id: &WorkspaceId,
168        metadata: &WorkspaceMetadata,
169    ) -> Result<()> {
170        let path = self.metadata_path(workspace_id);
171        self.write_json(&path, metadata)?;
172
173        // Update cache
174        {
175            let mut cache = self.metadata_cache.write();
176            cache.insert(workspace_id.clone(), metadata.clone());
177        }
178
179        Ok(())
180    }
181
182    fn save_documents(&self, workspace_id: &WorkspaceId, documents: &[Document]) -> Result<()> {
183        let path = self.documents_path(workspace_id);
184        self.write_bincode(&path, &documents.to_vec())
185    }
186
187    fn load_documents(&self, workspace_id: &WorkspaceId) -> Result<Vec<Document>> {
188        let path = self.documents_path(workspace_id);
189        if !path.exists() {
190            return Ok(Vec::new());
191        }
192        self.read_bincode(&path)
193    }
194
195    fn save_chunks(&self, workspace_id: &WorkspaceId, chunks: &[Chunk]) -> Result<()> {
196        let path = self.chunks_path(workspace_id);
197        self.write_bincode(&path, &chunks.to_vec())
198    }
199
200    fn load_chunks(&self, workspace_id: &WorkspaceId) -> Result<Vec<Chunk>> {
201        let path = self.chunks_path(workspace_id);
202        if !path.exists() {
203            return Ok(Vec::new());
204        }
205        self.read_bincode(&path)
206    }
207
208    fn save_embeddings(
209        &self,
210        workspace_id: &WorkspaceId,
211        embeddings: &[(String, Vec<f32>)],
212    ) -> Result<()> {
213        let path = self.embeddings_path(workspace_id);
214        self.write_bincode(&path, &embeddings.to_vec())
215    }
216
217    fn load_embeddings(&self, workspace_id: &WorkspaceId) -> Result<Vec<(String, Vec<f32>)>> {
218        let path = self.embeddings_path(workspace_id);
219        if !path.exists() {
220            return Ok(Vec::new());
221        }
222        self.read_bincode(&path)
223    }
224
225    fn tantivy_index_path(&self, workspace_id: &WorkspaceId) -> PathBuf {
226        self.workspace_dir(workspace_id).join("tantivy")
227    }
228
229    fn vector_index_path(&self, workspace_id: &WorkspaceId) -> PathBuf {
230        self.workspace_dir(workspace_id).join("vectors.usearch")
231    }
232
233    fn delete_workspace(&self, workspace_id: &WorkspaceId) -> Result<()> {
234        let ws_dir = self.workspace_dir(workspace_id);
235        if ws_dir.exists() {
236            fs::remove_dir_all(&ws_dir)?;
237        }
238
239        // Remove from cache
240        {
241            let mut cache = self.metadata_cache.write();
242            cache.remove(workspace_id);
243        }
244
245        Ok(())
246    }
247
248    fn list_workspaces(&self) -> Result<Vec<WorkspaceId>> {
249        let mut workspaces = Vec::new();
250
251        if !self.root_dir.exists() {
252            return Ok(workspaces);
253        }
254
255        for entry in fs::read_dir(&self.root_dir)? {
256            let entry = entry?;
257            if entry.file_type()?.is_dir() {
258                if let Some(name) = entry.file_name().to_str() {
259                    if let Ok(uuid) = uuid::Uuid::parse_str(name) {
260                        workspaces.push(WorkspaceId::from_uuid(uuid));
261                    }
262                }
263            }
264        }
265
266        Ok(workspaces)
267    }
268
269    fn get_workspace_stats(&self, workspace_id: &WorkspaceId) -> Result<WorkspaceStats> {
270        let metadata = self.load_workspace_metadata(workspace_id)?;
271
272        let index_size_bytes = self.dir_size(&self.workspace_dir(workspace_id));
273
274        let language_stats: Vec<LanguageStats> = metadata
275            .language_stats
276            .iter()
277            .map(|(lang, info)| LanguageStats {
278                language: *lang,
279                file_count: info.file_count,
280                chunk_count: info.chunk_count,
281                total_bytes: info.total_bytes,
282            })
283            .collect();
284
285        Ok(WorkspaceStats {
286            workspace_id: workspace_id.clone(),
287            root_path: metadata.root_path,
288            document_count: metadata.document_count,
289            chunk_count: metadata.chunk_count,
290            total_bytes: metadata.total_bytes,
291            index_size_bytes,
292            created_at: metadata.created_at,
293            updated_at: metadata.updated_at,
294            language_stats,
295        })
296    }
297
298    fn flush(&self) -> Result<()> {
299        // Write all cached metadata to disk
300        let cache = self.metadata_cache.read();
301        for (workspace_id, metadata) in cache.iter() {
302            let path = self.metadata_path(workspace_id);
303            self.write_json(&path, metadata)?;
304        }
305        Ok(())
306    }
307}
308
#[cfg(test)]
mod tests {
    use super::*;
    // `Language` is not imported by the parent module, so `use super::*` does
    // not bring it into scope; import it explicitly.
    // NOTE(review): assumes `Language` is exported from `crate::types` next to
    // `LanguageStats` and `DocumentId` — confirm the path.
    use crate::types::{DocumentId, Language};
    use tempfile::TempDir;

    /// Exercises init → exists → load metadata → list → delete on a fresh store.
    #[test]
    fn test_disk_storage_lifecycle() {
        let temp_dir = TempDir::new().unwrap();
        let storage = DiskStorage::new(temp_dir.path().to_path_buf()).unwrap();

        let workspace_id = WorkspaceId::new();
        let root_path = PathBuf::from("/test/project");

        // Initialize workspace
        storage.init_workspace(&workspace_id, &root_path).unwrap();
        assert!(storage.workspace_exists(&workspace_id));

        // Load metadata
        let metadata = storage.load_workspace_metadata(&workspace_id).unwrap();
        assert_eq!(metadata.root_path, root_path);

        // List workspaces
        let workspaces = storage.list_workspaces().unwrap();
        assert!(workspaces.contains(&workspace_id));

        // Delete workspace
        storage.delete_workspace(&workspace_id).unwrap();
        assert!(!storage.workspace_exists(&workspace_id));
    }

    /// Documents written with `save_documents` come back from `load_documents`.
    #[test]
    fn test_document_persistence() {
        let temp_dir = TempDir::new().unwrap();
        let storage = DiskStorage::new(temp_dir.path().to_path_buf()).unwrap();

        let workspace_id = WorkspaceId::new();
        storage
            .init_workspace(&workspace_id, &PathBuf::from("/test"))
            .unwrap();

        let docs = vec![Document {
            id: DocumentId::new(),
            relative_path: PathBuf::from("test.rs"),
            absolute_path: PathBuf::from("/test/test.rs"),
            language: Language::Rust,
            content_hash: "abc123".to_string(),
            size_bytes: 1000,
            modified_at: chrono::Utc::now(),
        }];

        storage.save_documents(&workspace_id, &docs).unwrap();

        let loaded = storage.load_documents(&workspace_id).unwrap();
        assert_eq!(loaded.len(), 1);
        assert_eq!(loaded[0].relative_path, PathBuf::from("test.rs"));
    }
}
365}