1use std::collections::HashMap;
6use std::path::Path;
7use std::sync::Arc;
8
9use dashmap::DashMap;
10
11use crate::chunker::{Chunker, DefaultChunker};
12use crate::config::{EngineConfig, WorkspaceConfig};
13use crate::embeddings::{create_embedding_text, Embedder, HashEmbedder};
14use crate::error::{Error, Result};
15use crate::ignore::FileWalker;
16use crate::search::{LexicalIndex, SearchQuery, SemanticIndex};
17use crate::storage::{DiskStorage, Storage, WorkspaceMetadata};
18use crate::types::{
19 Chunk, Document, DocumentId, IndexPhase, IndexProgress, Language, SearchResult, WorkspaceId,
20 WorkspaceStats,
21};
22
/// Callback invoked with an [`IndexProgress`] snapshot as indexing advances.
/// `Send + Sync` so the same callback can be shared across threads.
pub type ProgressCallback = Box<dyn Fn(IndexProgress) + Send + Sync>;
25
/// Central facade for indexing and searching workspaces: owns the storage
/// backend, the chunker, the embedder, and all in-memory workspace state.
pub struct Engine {
    // Engine-wide settings (chunking, embedding, search, ignore rules).
    config: EngineConfig,
    // On-disk persistence for documents, chunks, embeddings, and metadata.
    storage: Arc<DiskStorage>,
    // Splits document content into indexable chunks.
    chunker: Arc<DefaultChunker>,
    // Produces embedding vectors; trait object so the backend is swappable
    // (see `with_embedder`).
    embedder: Arc<dyn Embedder>,
    // Workspaces currently loaded in memory, keyed by id; DashMap gives
    // concurrent access through `&self`.
    workspaces: DashMap<WorkspaceId, WorkspaceState>,
}
34
/// In-memory state for one loaded workspace: its documents, chunks, and the
/// two search indexes built over them.
struct WorkspaceState {
    // Persisted per-document bookkeeping; retained but not read at runtime yet.
    #[allow(dead_code)]
    metadata: WorkspaceMetadata,
    // Every document indexed in this workspace.
    documents: Vec<Document>,
    // Every chunk produced from `documents`.
    chunks: Vec<Chunk>,
    // Keyword index (stored under the storage's tantivy index path).
    lexical_index: Arc<LexicalIndex>,
    // Vector index over chunk embeddings.
    semantic_index: Arc<SemanticIndex>,
}
44
45impl Engine {
46 pub fn new(config: EngineConfig) -> Result<Self> {
48 let storage = Arc::new(DiskStorage::new(config.index_dir.clone())?);
49 let chunker = Arc::new(DefaultChunker::new(config.chunking.clone())?);
50 let embedder: Arc<dyn Embedder> = Arc::new(HashEmbedder::new(config.embedding.dimension));
51
52 Ok(Self {
53 config,
54 storage,
55 chunker,
56 embedder,
57 workspaces: DashMap::new(),
58 })
59 }
60
61 pub fn with_embedder<E: Embedder + 'static>(config: EngineConfig, embedder: E) -> Result<Self> {
63 let storage = Arc::new(DiskStorage::new(config.index_dir.clone())?);
64 let chunker = Arc::new(DefaultChunker::new(config.chunking.clone())?);
65
66 Ok(Self {
67 config,
68 storage,
69 chunker,
70 embedder: Arc::new(embedder),
71 workspaces: DashMap::new(),
72 })
73 }
74
    /// Returns the engine-wide configuration.
    pub fn config(&self) -> &EngineConfig {
        &self.config
    }
79
    /// Indexes every file under `workspace_config.root_path` into a brand-new
    /// workspace: scan -> chunk -> embed -> build indexes -> persist, then
    /// registers the workspace in memory and returns its id.
    ///
    /// `progress`, if provided, is invoked at each phase boundary and
    /// per-file / per-batch during the parsing and embedding phases.
    ///
    /// # Errors
    /// Returns `Error::DirectoryNotFound` if the root path does not exist;
    /// otherwise propagates storage, walker, and index-construction errors.
    /// Files that fail to read/chunk and embedding batches that fail are
    /// skipped silently (best-effort). NOTE(review): a skipped embedding
    /// batch leaves its chunks with no vector, so they become invisible to
    /// semantic search — confirm this policy is intentional.
    pub async fn index_workspace(
        &self,
        workspace_config: WorkspaceConfig,
        progress: Option<ProgressCallback>,
    ) -> Result<WorkspaceId> {
        let workspace_id = WorkspaceId::new();
        let root_path = workspace_config.root_path.clone();

        if !root_path.exists() {
            return Err(Error::DirectoryNotFound(root_path));
        }

        // Create the on-disk layout for this workspace before any heavy work.
        self.storage.init_workspace(&workspace_id, &root_path)?;

        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Scanning,
                processed: 0,
                total: 0,
                current_file: None,
                eta_seconds: None,
            });
        }

        // Phase 1: discover candidate files, honoring the ignore rules.
        let walker = FileWalker::new(root_path.clone(), self.config.ignore.clone())?;
        let files = walker.walk()?;
        let total_files = files.len();

        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Parsing,
                processed: 0,
                total: total_files,
                current_file: None,
                eta_seconds: None,
            });
        }

        // Phase 2: read and chunk each file. Errors from `process_file`
        // (e.g. non-UTF-8 content failing `read_to_string`) skip the file.
        let mut documents = Vec::new();
        let mut chunks = Vec::new();

        for (i, file) in files.iter().enumerate() {
            if let Some(ref cb) = progress {
                cb(IndexProgress {
                    phase: IndexPhase::Parsing,
                    processed: i,
                    total: total_files,
                    current_file: Some(file.clone()),
                    eta_seconds: None,
                });
            }

            if let Ok((doc, file_chunks)) = self.process_file(file, &root_path) {
                documents.push(doc);
                chunks.extend(file_chunks);
            }
        }

        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Embedding,
                processed: 0,
                total: chunks.len(),
                current_file: None,
                eta_seconds: None,
            });
        }

        // Phase 3: embed chunks in batches of `batch_size`. Each embedding is
        // keyed by the chunk id's string form for storage and the vector index.
        let mut embeddings = Vec::new();
        let batch_size = self.config.embedding.batch_size;

        for (batch_idx, batch) in chunks.chunks(batch_size).enumerate() {
            if let Some(ref cb) = progress {
                cb(IndexProgress {
                    phase: IndexPhase::Embedding,
                    // Approximate: counts whole batches, not individual chunks.
                    processed: batch_idx * batch_size,
                    total: chunks.len(),
                    current_file: None,
                    eta_seconds: None,
                });
            }

            let texts: Vec<String> = batch.iter().map(create_embedding_text).collect();
            let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();

            // A failed batch is dropped entirely (see # Errors note above).
            if let Ok(batch_embeddings) = self.embedder.embed_batch(&text_refs) {
                for (chunk, embedding) in batch.iter().zip(batch_embeddings) {
                    embeddings.push((chunk.id.0.to_string(), embedding));
                }
            }
        }

        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Indexing,
                processed: 0,
                total: 1,
                current_file: None,
                eta_seconds: None,
            });
        }

        // Phase 4: build the lexical (keyword) and semantic (vector) indexes.
        let lexical_path = self.storage.tantivy_index_path(&workspace_id);
        let lexical_index = LexicalIndex::create(&lexical_path, self.config.search.clone())?;

        let doc_map: HashMap<String, &Document> =
            documents.iter().map(|d| (d.id.0.to_string(), d)).collect();
        lexical_index.index_chunks(&chunks, &doc_map)?;

        let mut semantic_index =
            SemanticIndex::new(self.embedder.dimension(), self.config.search.clone())?;
        semantic_index.add_embeddings(&embeddings)?;

        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Persisting,
                processed: 0,
                total: 1,
                current_file: None,
                eta_seconds: None,
            });
        }

        // Phase 5: persist documents, chunks, embeddings, and the vector index.
        self.storage.save_documents(&workspace_id, &documents)?;
        self.storage.save_chunks(&workspace_id, &chunks)?;
        self.storage.save_embeddings(&workspace_id, &embeddings)?;

        let vector_path = self.storage.vector_index_path(&workspace_id);
        semantic_index.save(&vector_path)?;

        // Record per-document bookkeeping (hash, size, language, chunk count)
        // in the workspace metadata, then save it.
        let mut metadata = self.storage.load_workspace_metadata(&workspace_id)?;
        for doc in &documents {
            let chunk_count = chunks.iter().filter(|c| c.document_id == doc.id).count();
            metadata.record_document(
                doc.relative_path.clone(),
                doc.id.clone(),
                doc.content_hash.clone(),
                doc.size_bytes,
                doc.language,
                chunk_count,
            );
        }
        self.storage
            .save_workspace_metadata(&workspace_id, &metadata)?;

        // Make the workspace immediately searchable in this process.
        self.workspaces.insert(
            workspace_id.clone(),
            WorkspaceState {
                metadata,
                documents,
                chunks,
                lexical_index: Arc::new(lexical_index),
                semantic_index: Arc::new(semantic_index),
            },
        );

        if let Some(ref cb) = progress {
            cb(IndexProgress {
                phase: IndexPhase::Complete,
                processed: total_files,
                total: total_files,
                current_file: None,
                eta_seconds: Some(0.0),
            });
        }

        Ok(workspace_id)
    }
260
261 pub fn load_workspace(&self, workspace_id: &WorkspaceId) -> Result<()> {
263 if !self.storage.workspace_exists(workspace_id) {
264 return Err(Error::WorkspaceNotFound(workspace_id.clone()));
265 }
266
267 let metadata = self.storage.load_workspace_metadata(workspace_id)?;
268 let documents = self.storage.load_documents(workspace_id)?;
269 let chunks = self.storage.load_chunks(workspace_id)?;
270
271 let lexical_path = self.storage.tantivy_index_path(workspace_id);
272 let lexical_index = LexicalIndex::open(&lexical_path, self.config.search.clone())?;
273
274 let vector_path = self.storage.vector_index_path(workspace_id);
275 let semantic_index = SemanticIndex::load(
276 &vector_path,
277 self.embedder.dimension(),
278 self.config.search.clone(),
279 )?;
280
281 self.workspaces.insert(
282 workspace_id.clone(),
283 WorkspaceState {
284 metadata,
285 documents,
286 chunks,
287 lexical_index: Arc::new(lexical_index),
288 semantic_index: Arc::new(semantic_index),
289 },
290 );
291
292 Ok(())
293 }
294
295 pub fn search(
297 &self,
298 workspace_id: &WorkspaceId,
299 query: SearchQuery,
300 ) -> Result<Vec<SearchResult>> {
301 let state = self
302 .workspaces
303 .get(workspace_id)
304 .ok_or_else(|| Error::WorkspaceNotFound(workspace_id.clone()))?;
305
306 let chunk_map: HashMap<String, &Chunk> = state
308 .chunks
309 .iter()
310 .map(|c| (c.id.0.to_string(), c))
311 .collect();
312
313 let doc_map: HashMap<String, &Document> = state
314 .documents
315 .iter()
316 .map(|d| (d.id.0.to_string(), d))
317 .collect();
318
319 match query.mode {
320 crate::config::SearchMode::Lexical => {
321 state.lexical_index.search_index(&query, &chunk_map, &doc_map)
322 }
323 crate::config::SearchMode::Semantic => {
324 state.semantic_index.search_with_embedder(
325 &query,
326 self.embedder.as_ref(),
327 &chunk_map,
328 &doc_map,
329 )
330 }
331 crate::config::SearchMode::Hybrid => {
332 let lexical_results =
333 state.lexical_index.search_index(&query, &chunk_map, &doc_map)?;
334 let semantic_results = state.semantic_index.search_with_embedder(
335 &query,
336 self.embedder.as_ref(),
337 &chunk_map,
338 &doc_map,
339 )?;
340
341 let weights = vec![
343 self.config.search.lexical_weight,
344 self.config.search.semantic_weight,
345 ];
346 Ok(crate::search::merge_results(
347 vec![lexical_results, semantic_results],
348 &weights,
349 query.limit,
350 ))
351 }
352 }
353 }
354
355 pub fn search_text(
357 &self,
358 workspace_id: &WorkspaceId,
359 text: &str,
360 ) -> Result<Vec<SearchResult>> {
361 let query = SearchQuery::new(text)
362 .limit(self.config.search.default_limit)
363 .min_score(self.config.search.min_score);
364
365 self.search(workspace_id, query)
366 }
367
368 pub fn delete_workspace(&self, workspace_id: &WorkspaceId) -> Result<()> {
370 self.workspaces.remove(workspace_id);
371 self.storage.delete_workspace(workspace_id)
372 }
373
    /// Lists the ids of all workspaces known to on-disk storage (whether or
    /// not they are currently loaded in memory).
    pub fn list_workspaces(&self) -> Result<Vec<WorkspaceId>> {
        self.storage.list_workspaces()
    }
378
    /// Fetches statistics for a workspace; delegates entirely to storage, so
    /// the workspace does not need to be loaded in memory.
    pub fn get_workspace_stats(&self, workspace_id: &WorkspaceId) -> Result<WorkspaceStats> {
        self.storage.get_workspace_stats(workspace_id)
    }
383
384 fn process_file(&self, path: &Path, root: &Path) -> Result<(Document, Vec<Chunk>)> {
385 let content = std::fs::read_to_string(path)?;
386 let metadata = std::fs::metadata(path)?;
387 let hash = blake3::hash(content.as_bytes()).to_hex().to_string();
388
389 let extension = path.extension().and_then(|e| e.to_str()).unwrap_or("");
390 let language = Language::from_extension(extension);
391 let relative_path = path.strip_prefix(root).unwrap_or(path).to_path_buf();
392
393 let document = Document {
394 id: DocumentId::new(),
395 relative_path,
396 absolute_path: path.to_path_buf(),
397 language,
398 content_hash: hash,
399 size_bytes: metadata.len(),
400 modified_at: metadata
401 .modified()
402 .ok()
403 .map(chrono::DateTime::from)
404 .unwrap_or_else(chrono::Utc::now),
405 };
406
407 let chunks = self.chunker.chunk(&document, &content)?;
408 Ok((document, chunks))
409 }
410}
411
// SAFETY: NOTE(review) — these manual assertions claim Engine is thread-safe.
// They are only sound if EVERY field is Send + Sync. In particular,
// `Arc<dyn Embedder>` is not automatically Send/Sync unless the `Embedder`
// trait itself requires `Send + Sync`; if it does, these impls are redundant
// and should be deleted, and if it does not, they are unsound. TODO: confirm
// the `Embedder` trait bounds and remove or justify these impls.
unsafe impl Send for Engine {}
unsafe impl Sync for Engine {}