aurora_semantic/engine/
mod.rs

//! Engine module providing the main public API.
//!
//! The `Engine` struct is the primary entry point for using aurora-semantic.

5use std::collections::HashMap;
6use std::path::Path;
7use std::sync::Arc;
8
9use dashmap::DashMap;
10
11use crate::chunker::{Chunker, DefaultChunker};
12use crate::config::{EngineConfig, WorkspaceConfig};
13use crate::embeddings::{create_embedding_text, Embedder, HashEmbedder};
14use crate::error::{Error, Result};
15use crate::ignore::FileWalker;
16use crate::search::{LexicalIndex, SearchQuery, SemanticIndex};
17use crate::storage::{DiskStorage, Storage, WorkspaceMetadata};
18use crate::types::{
19    Chunk, Document, DocumentId, IndexPhase, IndexProgress, Language, SearchResult, WorkspaceId,
20    WorkspaceStats,
21};
22
/// Progress callback type, invoked with an [`IndexProgress`] snapshot at each
/// indexing phase transition and per processed file / embedding batch.
///
/// Boxed trait object with `Send + Sync` so callers can hand it across
/// threads along with the engine.
pub type ProgressCallback = Box<dyn Fn(IndexProgress) + Send + Sync>;
25
/// The main semantic search engine.
///
/// Owns the storage backend, chunker, and embedder, and keeps every loaded
/// workspace's state in a concurrent map so the `&self` methods can be used
/// from multiple threads.
pub struct Engine {
    /// Engine-wide configuration (index dir, chunking, embedding, search, ignore rules).
    config: EngineConfig,
    /// On-disk persistence for documents, chunks, embeddings, and metadata.
    storage: Arc<DiskStorage>,
    /// Splits document contents into indexable chunks.
    chunker: Arc<DefaultChunker>,
    /// Produces embedding vectors; a trait object so models are swappable.
    // NOTE(review): `dyn Embedder` carries no `Send + Sync` bound here, so
    // `Engine`'s thread-safety rests on the `unsafe impl Send/Sync` at the
    // bottom of this file — confirm all embedder implementations are
    // actually thread-safe, or bound the trait object instead.
    embedder: Arc<dyn Embedder>,
    /// In-memory state for each indexed/loaded workspace, keyed by id.
    workspaces: DashMap<WorkspaceId, WorkspaceState>,
}
34
/// In-memory state of an indexed (or loaded) workspace.
struct WorkspaceState {
    /// Per-document bookkeeping; persisted at index time but currently not
    /// read back from this struct (hence the allow).
    #[allow(dead_code)]
    metadata: WorkspaceMetadata,
    /// All successfully parsed documents in the workspace.
    documents: Vec<Document>,
    /// All chunks extracted from `documents`.
    chunks: Vec<Chunk>,
    /// Full-text (lexical) index over the chunks, stored under the
    /// workspace's tantivy path.
    lexical_index: Arc<LexicalIndex>,
    /// Vector index over the chunk embeddings.
    semantic_index: Arc<SemanticIndex>,
}
44
impl Engine {
    /// Create a new engine with hash-based embeddings (for testing).
    ///
    /// The hash embedder is deterministic and model-free, sized to the
    /// configured embedding dimension — useful for tests, not for real
    /// semantic quality.
    ///
    /// # Errors
    ///
    /// Fails if the storage directory cannot be initialized or the chunker
    /// rejects the chunking configuration.
    pub fn new(config: EngineConfig) -> Result<Self> {
        let storage = Arc::new(DiskStorage::new(config.index_dir.clone())?);
        let chunker = Arc::new(DefaultChunker::new(config.chunking.clone())?);
        let embedder: Arc<dyn Embedder> = Arc::new(HashEmbedder::new(config.embedding.dimension));

        Ok(Self {
            config,
            storage,
            chunker,
            embedder,
            workspaces: DashMap::new(),
        })
    }

    /// Create an engine with a custom embedder (e.g., ONNX model).
    ///
    /// # Errors
    ///
    /// Same failure modes as [`Engine::new`]: storage or chunker
    /// initialization errors.
    pub fn with_embedder<E: Embedder + 'static>(config: EngineConfig, embedder: E) -> Result<Self> {
        let storage = Arc::new(DiskStorage::new(config.index_dir.clone())?);
        let chunker = Arc::new(DefaultChunker::new(config.chunking.clone())?);

        Ok(Self {
            config,
            storage,
            chunker,
            embedder: Arc::new(embedder),
            workspaces: DashMap::new(),
        })
    }

    /// Get the engine configuration.
    pub fn config(&self) -> &EngineConfig {
        &self.config
    }

    /// Index a workspace.
    ///
    /// Pipeline: scan files → parse into documents/chunks → embed chunks in
    /// batches → build the lexical and semantic indexes → persist everything
    /// to storage → cache the workspace state in memory. The optional
    /// `progress` callback is invoked at every phase transition and per
    /// file / per embedding batch.
    ///
    /// NOTE(review): declared `async` but contains no `.await`; all work here
    /// is blocking. Presumably async for API stability — confirm whether this
    /// should be offloaded to a blocking-task executor by callers.
    ///
    /// # Errors
    ///
    /// Fails if the workspace root does not exist, or if storage
    /// initialization, file walking, index construction, or persistence
    /// fails. Individual file parse errors and embedding-batch errors are
    /// silently skipped (see inline notes).
    pub async fn index_workspace(
        &self,
        workspace_config: WorkspaceConfig,
        progress: Option<ProgressCallback>,
    ) -> Result<WorkspaceId> {
        // Every indexing run gets a fresh workspace id; re-indexing the same
        // directory therefore produces a new workspace rather than updating
        // an existing one.
        let workspace_id = WorkspaceId::new();
        let root_path = workspace_config.root_path.clone();

        if !root_path.exists() {
            return Err(Error::DirectoryNotFound(root_path));
        }

        // Initialize storage
        self.storage.init_workspace(&workspace_id, &root_path)?;

        // Report scanning phase
        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Scanning,
                processed: 0,
                total: 0,
                current_file: None,
                eta_seconds: None,
            });
        }

        // Walk and collect files, honoring the engine's ignore configuration.
        let walker = FileWalker::new(root_path.clone(), self.config.ignore.clone())?;
        let files = walker.walk()?;
        let total_files = files.len();

        // Parse files into documents and chunks
        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Parsing,
                processed: 0,
                total: total_files,
                current_file: None,
                eta_seconds: None,
            });
        }

        let mut documents = Vec::new();
        let mut chunks = Vec::new();

        for (i, file) in files.iter().enumerate() {
            if let Some(ref cb) = progress {
                cb(IndexProgress {
                    phase: IndexPhase::Parsing,
                    processed: i,
                    total: total_files,
                    current_file: Some(file.clone()),
                    eta_seconds: None,
                });
            }

            // Best-effort: files that fail to read or chunk (e.g. non-UTF-8
            // content) are silently excluded from the index.
            // NOTE(review): consider counting/reporting skipped files.
            if let Ok((doc, file_chunks)) = self.process_file(file, &root_path) {
                documents.push(doc);
                chunks.extend(file_chunks);
            }
        }

        // Generate embeddings
        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Embedding,
                processed: 0,
                total: chunks.len(),
                current_file: None,
                eta_seconds: None,
            });
        }

        let mut embeddings = Vec::new();
        let batch_size = self.config.embedding.batch_size;

        for (batch_idx, batch) in chunks.chunks(batch_size).enumerate() {
            if let Some(ref cb) = progress {
                cb(IndexProgress {
                    phase: IndexPhase::Embedding,
                    // Approximate: counts whole batches, not chunks in the
                    // current partial batch.
                    processed: batch_idx * batch_size,
                    total: chunks.len(),
                    current_file: None,
                    eta_seconds: None,
                });
            }

            let texts: Vec<String> = batch.iter().map(create_embedding_text).collect();
            let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();

            // Best-effort: if a batch fails to embed, its chunks get no
            // embeddings — they remain lexically searchable (the lexical
            // index below covers all chunks) but invisible to semantic search.
            // NOTE(review): a failed batch is dropped without any signal.
            if let Ok(batch_embeddings) = self.embedder.embed_batch(&text_refs) {
                for (chunk, embedding) in batch.iter().zip(batch_embeddings) {
                    // Embeddings are keyed by the chunk id's string form.
                    embeddings.push((chunk.id.0.to_string(), embedding));
                }
            }
        }

        // Build indexes
        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Indexing,
                processed: 0,
                total: 1,
                current_file: None,
                eta_seconds: None,
            });
        }

        // Build lexical index
        let lexical_path = self.storage.tantivy_index_path(&workspace_id);
        let lexical_index = LexicalIndex::create(&lexical_path, self.config.search.clone())?;

        // Map document-id strings to documents so the index can resolve
        // each chunk's parent document.
        let doc_map: HashMap<String, &Document> =
            documents.iter().map(|d| (d.id.0.to_string(), d)).collect();
        lexical_index.index_chunks(&chunks, &doc_map)?;

        // Build semantic index
        let mut semantic_index =
            SemanticIndex::new(self.embedder.dimension(), self.config.search.clone())?;
        semantic_index.add_embeddings(&embeddings)?;

        // Save to storage
        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Persisting,
                processed: 0,
                total: 1,
                current_file: None,
                eta_seconds: None,
            });
        }

        self.storage.save_documents(&workspace_id, &documents)?;
        self.storage.save_chunks(&workspace_id, &chunks)?;
        self.storage.save_embeddings(&workspace_id, &embeddings)?;

        let vector_path = self.storage.vector_index_path(&workspace_id);
        semantic_index.save(&vector_path)?;

        // Update metadata with per-document bookkeeping.
        // NOTE(review): the chunk_count filter makes this loop
        // O(documents × chunks); fine for moderate workspaces, worth a
        // precomputed count map for very large ones.
        let mut metadata = self.storage.load_workspace_metadata(&workspace_id)?;
        for doc in &documents {
            let chunk_count = chunks.iter().filter(|c| c.document_id == doc.id).count();
            metadata.record_document(
                doc.relative_path.clone(),
                doc.id.clone(),
                doc.content_hash.clone(),
                doc.size_bytes,
                doc.language,
                chunk_count,
            );
        }
        self.storage
            .save_workspace_metadata(&workspace_id, &metadata)?;

        // Store in memory so search() can serve this workspace immediately.
        self.workspaces.insert(
            workspace_id.clone(),
            WorkspaceState {
                metadata,
                documents,
                chunks,
                lexical_index: Arc::new(lexical_index),
                semantic_index: Arc::new(semantic_index),
            },
        );

        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Complete,
                processed: total_files,
                total: total_files,
                current_file: None,
                eta_seconds: Some(0.0),
            });
        }

        Ok(workspace_id)
    }

    /// Load an existing workspace index from storage into memory.
    ///
    /// # Errors
    ///
    /// Returns [`Error::WorkspaceNotFound`] if the workspace does not exist
    /// on disk; otherwise propagates storage and index-loading errors.
    pub fn load_workspace(&self, workspace_id: &WorkspaceId) -> Result<()> {
        if !self.storage.workspace_exists(workspace_id) {
            return Err(Error::WorkspaceNotFound(workspace_id.clone()));
        }

        let metadata = self.storage.load_workspace_metadata(workspace_id)?;
        let documents = self.storage.load_documents(workspace_id)?;
        let chunks = self.storage.load_chunks(workspace_id)?;

        let lexical_path = self.storage.tantivy_index_path(workspace_id);
        let lexical_index = LexicalIndex::open(&lexical_path, self.config.search.clone())?;

        // The vector index is reloaded against the *current* embedder's
        // dimension.
        // NOTE(review): if the workspace was indexed with a different
        // embedder dimension than the one configured now, this presumably
        // fails or misbehaves — confirm SemanticIndex::load validates it.
        let vector_path = self.storage.vector_index_path(workspace_id);
        let semantic_index = SemanticIndex::load(
            &vector_path,
            self.embedder.dimension(),
            self.config.search.clone(),
        )?;

        self.workspaces.insert(
            workspace_id.clone(),
            WorkspaceState {
                metadata,
                documents,
                chunks,
                lexical_index: Arc::new(lexical_index),
                semantic_index: Arc::new(semantic_index),
            },
        );

        Ok(())
    }

    /// Search for code across a workspace.
    ///
    /// Dispatches on `query.mode`: lexical only, semantic only, or hybrid
    /// (both, merged with the configured lexical/semantic weights).
    ///
    /// # Errors
    ///
    /// Returns [`Error::WorkspaceNotFound`] if the workspace is not loaded
    /// in memory (index or load it first); otherwise propagates index errors.
    pub fn search(
        &self,
        workspace_id: &WorkspaceId,
        query: SearchQuery,
    ) -> Result<Vec<SearchResult>> {
        let state = self
            .workspaces
            .get(workspace_id)
            .ok_or_else(|| Error::WorkspaceNotFound(workspace_id.clone()))?;

        // Build lookup maps (id-string → item) so the indexes can resolve
        // hits back to chunks and their parent documents.
        let chunk_map: HashMap<String, &Chunk> = state
            .chunks
            .iter()
            .map(|c| (c.id.0.to_string(), c))
            .collect();

        let doc_map: HashMap<String, &Document> = state
            .documents
            .iter()
            .map(|d| (d.id.0.to_string(), d))
            .collect();

        match query.mode {
            crate::config::SearchMode::Lexical => {
                state.lexical_index.search_index(&query, &chunk_map, &doc_map)
            }
            crate::config::SearchMode::Semantic => {
                state.semantic_index.search_with_embedder(
                    &query,
                    self.embedder.as_ref(),
                    &chunk_map,
                    &doc_map,
                )
            }
            crate::config::SearchMode::Hybrid => {
                let lexical_results =
                    state.lexical_index.search_index(&query, &chunk_map, &doc_map)?;
                let semantic_results = state.semantic_index.search_with_embedder(
                    &query,
                    self.embedder.as_ref(),
                    &chunk_map,
                    &doc_map,
                )?;

                // Merge results; weight order must match the result-list
                // order passed to merge_results below.
                let weights = vec![
                    self.config.search.lexical_weight,
                    self.config.search.semantic_weight,
                ];
                Ok(crate::search::merge_results(
                    vec![lexical_results, semantic_results],
                    &weights,
                    query.limit,
                ))
            }
        }
    }

    /// Search with a simple text query, using the configured default limit
    /// and minimum score. Convenience wrapper around [`Engine::search`].
    pub fn search_text(
        &self,
        workspace_id: &WorkspaceId,
        text: &str,
    ) -> Result<Vec<SearchResult>> {
        let query = SearchQuery::new(text)
            .limit(self.config.search.default_limit)
            .min_score(self.config.search.min_score);

        self.search(workspace_id, query)
    }

    /// Delete a workspace and all its data (both the in-memory state and
    /// everything persisted on disk).
    pub fn delete_workspace(&self, workspace_id: &WorkspaceId) -> Result<()> {
        self.workspaces.remove(workspace_id);
        self.storage.delete_workspace(workspace_id)
    }

    /// List all indexed workspaces known to storage (loaded or not).
    pub fn list_workspaces(&self) -> Result<Vec<WorkspaceId>> {
        self.storage.list_workspaces()
    }

    /// Get statistics for a workspace, as recorded in storage.
    pub fn get_workspace_stats(&self, workspace_id: &WorkspaceId) -> Result<WorkspaceStats> {
        self.storage.get_workspace_stats(workspace_id)
    }

    /// Read one file and turn it into a [`Document`] plus its chunks.
    ///
    /// Hashes the content with blake3, infers the language from the file
    /// extension, and stores the path relative to `root` (falling back to the
    /// absolute path if `path` is not under `root`).
    ///
    /// # Errors
    ///
    /// Fails on unreadable or non-UTF-8 files (`read_to_string`), metadata
    /// errors, or chunker errors. Callers in this module treat any failure
    /// as "skip this file".
    fn process_file(&self, path: &Path, root: &Path) -> Result<(Document, Vec<Chunk>)> {
        let content = std::fs::read_to_string(path)?;
        let metadata = std::fs::metadata(path)?;
        let hash = blake3::hash(content.as_bytes()).to_hex().to_string();

        let extension = path.extension().and_then(|e| e.to_str()).unwrap_or("");
        let language = Language::from_extension(extension);
        let relative_path = path.strip_prefix(root).unwrap_or(path).to_path_buf();

        let document = Document {
            id: DocumentId::new(),
            relative_path,
            absolute_path: path.to_path_buf(),
            language,
            content_hash: hash,
            size_bytes: metadata.len(),
            // Fall back to "now" when the filesystem can't report mtime.
            modified_at: metadata
                .modified()
                .ok()
                .map(chrono::DateTime::from)
                .unwrap_or_else(chrono::Utc::now),
        };

        let chunks = self.chunker.chunk(&document, &content)?;
        Ok((document, chunks))
    }
}
411
// SAFETY: Asserts that `Engine` may be sent to and shared between threads.
// This holds only if every field is thread-safe — in particular the
// `Arc<dyn Embedder>` field, whose trait object carries no `Send + Sync`
// bound.
// NOTE(review): manually implementing these auto traits bypasses the
// compiler's field-by-field check. If all embedder implementations are
// thread-safe, the sounder fix is to store `Arc<dyn Embedder + Send + Sync>`
// (and bound `with_embedder` accordingly) and delete these impls — they would
// then be derived automatically. If any embedder is NOT thread-safe, these
// impls are unsound (data race via `&Engine` across threads). Confirm before
// relying on `Engine: Send + Sync`.
unsafe impl Send for Engine {}
unsafe impl Sync for Engine {}