aurora_semantic/storage/
metadata.rs

1//! Workspace metadata storage.
2
3use std::collections::HashMap;
4use std::path::PathBuf;
5
6use chrono::{DateTime, Utc};
7use serde::{Deserialize, Serialize};
8
9use crate::types::{DocumentId, Language, WorkspaceId};
10
11/// Metadata about a workspace index.
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct WorkspaceMetadata {
14    /// Workspace identifier.
15    pub workspace_id: WorkspaceId,
16    /// Root path of the workspace.
17    pub root_path: PathBuf,
18    /// When the workspace was first indexed.
19    pub created_at: DateTime<Utc>,
20    /// When the workspace was last updated.
21    pub updated_at: DateTime<Utc>,
22    /// Number of indexed documents.
23    pub document_count: usize,
24    /// Number of indexed chunks.
25    pub chunk_count: usize,
26    /// Total size of indexed files in bytes.
27    pub total_bytes: u64,
28    /// Embedding dimension used.
29    pub embedding_dimension: usize,
30    /// Version of the index format.
31    pub index_version: u32,
32    /// Document states for incremental updates.
33    pub document_states: HashMap<PathBuf, DocumentState>,
34    /// Language statistics.
35    pub language_stats: HashMap<Language, LanguageInfo>,
36}
37
38impl WorkspaceMetadata {
39    /// Create new metadata for a workspace.
40    pub fn new(workspace_id: WorkspaceId, root_path: PathBuf, embedding_dimension: usize) -> Self {
41        let now = Utc::now();
42        Self {
43            workspace_id,
44            root_path,
45            created_at: now,
46            updated_at: now,
47            document_count: 0,
48            chunk_count: 0,
49            total_bytes: 0,
50            embedding_dimension,
51            index_version: CURRENT_INDEX_VERSION,
52            document_states: HashMap::new(),
53            language_stats: HashMap::new(),
54        }
55    }
56
57    /// Update timestamp.
58    pub fn touch(&mut self) {
59        self.updated_at = Utc::now();
60    }
61
62    /// Check if a document needs reindexing.
63    pub fn needs_reindex(&self, path: &PathBuf, content_hash: &str) -> bool {
64        match self.document_states.get(path) {
65            Some(state) => state.content_hash != content_hash,
66            None => true,
67        }
68    }
69
70    /// Record a document as indexed.
71    pub fn record_document(
72        &mut self,
73        path: PathBuf,
74        document_id: DocumentId,
75        content_hash: String,
76        size_bytes: u64,
77        language: Language,
78        chunk_count: usize,
79    ) {
80        self.document_states.insert(
81            path,
82            DocumentState {
83                document_id,
84                content_hash,
85                size_bytes,
86                language,
87                chunk_count,
88                indexed_at: Utc::now(),
89            },
90        );
91
92        // Update language stats
93        let lang_info = self.language_stats.entry(language).or_insert(LanguageInfo {
94            file_count: 0,
95            chunk_count: 0,
96            total_bytes: 0,
97        });
98        lang_info.file_count += 1;
99        lang_info.chunk_count += chunk_count;
100        lang_info.total_bytes += size_bytes;
101
102        self.document_count = self.document_states.len();
103        self.chunk_count = self.document_states.values().map(|s| s.chunk_count).sum();
104        self.total_bytes = self.document_states.values().map(|s| s.size_bytes).sum();
105        self.touch();
106    }
107
108    /// Remove a document from the index.
109    pub fn remove_document(&mut self, path: &PathBuf) {
110        if let Some(state) = self.document_states.remove(path) {
111            // Update language stats
112            if let Some(lang_info) = self.language_stats.get_mut(&state.language) {
113                lang_info.file_count = lang_info.file_count.saturating_sub(1);
114                lang_info.chunk_count = lang_info.chunk_count.saturating_sub(state.chunk_count);
115                lang_info.total_bytes = lang_info.total_bytes.saturating_sub(state.size_bytes);
116
117                if lang_info.file_count == 0 {
118                    self.language_stats.remove(&state.language);
119                }
120            }
121
122            self.document_count = self.document_states.len();
123            self.chunk_count = self.document_states.values().map(|s| s.chunk_count).sum();
124            self.total_bytes = self.document_states.values().map(|s| s.size_bytes).sum();
125            self.touch();
126        }
127    }
128
129    /// Get paths that have been deleted from disk.
130    pub fn find_deleted_documents(&self, existing_paths: &[PathBuf]) -> Vec<PathBuf> {
131        let existing_set: std::collections::HashSet<_> = existing_paths.iter().collect();
132        self.document_states
133            .keys()
134            .filter(|path| !existing_set.contains(path))
135            .cloned()
136            .collect()
137    }
138
139    /// Check if the index version is compatible.
140    pub fn is_compatible(&self) -> bool {
141        self.index_version == CURRENT_INDEX_VERSION
142    }
143}
144
145/// State of a single document in the index.
146#[derive(Debug, Clone, Serialize, Deserialize)]
147pub struct DocumentState {
148    /// Document identifier.
149    pub document_id: DocumentId,
150    /// Content hash for change detection.
151    pub content_hash: String,
152    /// File size in bytes.
153    pub size_bytes: u64,
154    /// Programming language.
155    pub language: Language,
156    /// Number of chunks extracted.
157    pub chunk_count: usize,
158    /// When this document was indexed.
159    pub indexed_at: DateTime<Utc>,
160}
161
162/// Statistics for a single language.
163#[derive(Debug, Clone, Serialize, Deserialize)]
164pub struct LanguageInfo {
165    /// Number of files.
166    pub file_count: usize,
167    /// Number of chunks.
168    pub chunk_count: usize,
169    /// Total bytes.
170    pub total_bytes: u64,
171}
172
/// Index format version written into newly created metadata and checked by
/// `WorkspaceMetadata::is_compatible`.
pub const CURRENT_INDEX_VERSION: u32 = 1;
175
#[cfg(test)]
mod tests {
    use super::*;

    /// Records a single document in a fresh workspace and checks the aggregate
    /// counters plus the reindex decision for an unchanged hash, a changed
    /// hash and an unknown path.
    #[test]
    fn test_workspace_metadata() {
        let workspace = WorkspaceId::new();
        let root = PathBuf::from("/test");
        let mut metadata = WorkspaceMetadata::new(workspace, root, 384);

        // A fresh workspace is empty and uses the current index format.
        assert_eq!(metadata.document_count, 0);
        assert!(metadata.is_compatible());

        let indexed_path = PathBuf::from("test.rs");
        metadata.record_document(
            indexed_path.clone(),
            DocumentId::new(),
            String::from("hash123"),
            1000,
            Language::Rust,
            5,
        );

        // Totals reflect the one recorded document.
        assert_eq!(metadata.document_count, 1);
        assert_eq!(metadata.chunk_count, 5);
        assert_eq!(metadata.total_bytes, 1000);

        // Same hash: up to date. Different hash or unknown path: reindex.
        let unknown_path = PathBuf::from("other.rs");
        assert!(!metadata.needs_reindex(&indexed_path, "hash123"));
        assert!(metadata.needs_reindex(&indexed_path, "hash456"));
        assert!(metadata.needs_reindex(&unknown_path, "hash123"));
    }
}