1use anyhow::{Result, anyhow};
2use rusqlite::{Connection, params};
3use sha2::{Digest, Sha256};
4use std::fs;
5use std::path::{Path, PathBuf};
6use std::sync::{Arc, Mutex};
7use std::time::Duration;
8use tracing::{debug, info, warn};
9use uuid::Uuid;
10
11use super::embeddings::{cosine_similarity, deserialize_embedding, serialize_embedding};
12use super::search::MemoryChunk;
13
/// Handle to the SQLite-backed memory index of a workspace.
///
/// The connection lives behind `Arc<Mutex<_>>` and the struct derives `Clone`,
/// so clones of a `MemoryIndex` share one underlying connection.
#[derive(Clone)]
pub struct MemoryIndex {
    /// Shared SQLite connection; methods lock it while they touch the database.
    conn: Arc<Mutex<Connection>>,
    /// Workspace root; stripped from file paths to form the stored relative path.
    workspace: PathBuf,
    /// Location of the SQLite database file.
    db_path: PathBuf,
    /// Whether the sqlite-vec extension was loaded when the index was opened.
    has_vec_extension: bool,
    /// Token target handed to `chunk_text` (set to 400 by the constructor).
    chunk_size: usize,
    /// Token overlap handed to `chunk_text` (set to 80 by the constructor).
    chunk_overlap: usize,
}
26
/// Summary counters describing one re-indexing run.
///
/// `Default` yields an all-zero value, which makes the struct convenient to
/// use as an accumulator; `Clone`/`PartialEq`/`Eq` allow copying and comparing
/// results (for example in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ReindexStats {
    /// Number of files processed during the run.
    pub files_processed: usize,
    /// Number of files updated during the run.
    pub files_updated: usize,
    /// Number of chunks indexed during the run.
    pub chunks_indexed: usize,
    /// Duration of the run.
    pub duration: Duration,
}
34
35impl MemoryIndex {
36 pub fn new_with_db_path(workspace: &Path, db_path: &Path) -> Result<Self> {
38 if let Some(parent) = db_path.parent() {
40 fs::create_dir_all(parent)?;
41 }
42
43 let conn = Connection::open(db_path)?;
44
45 let needs_migration = Self::needs_schema_migration(&conn)?;
47 if needs_migration {
48 info!("Migrating database schema to OpenClaw-compatible format");
49 Self::migrate_to_openclaw_schema(&conn)?;
50 }
51
52 conn.execute_batch(
54 r#"
55 -- Metadata key/value store
56 CREATE TABLE IF NOT EXISTS meta (
57 key TEXT PRIMARY KEY,
58 value TEXT NOT NULL
59 );
60
61 -- File tracking (OpenClaw-compatible)
62 CREATE TABLE IF NOT EXISTS files (
63 path TEXT PRIMARY KEY,
64 source TEXT NOT NULL DEFAULT 'memory',
65 hash TEXT NOT NULL,
66 mtime INTEGER NOT NULL,
67 size INTEGER NOT NULL
68 );
69
70 -- Chunked content (OpenClaw-compatible)
71 CREATE TABLE IF NOT EXISTS chunks (
72 id TEXT PRIMARY KEY,
73 path TEXT NOT NULL,
74 source TEXT NOT NULL DEFAULT 'memory',
75 start_line INTEGER NOT NULL,
76 end_line INTEGER NOT NULL,
77 hash TEXT NOT NULL,
78 model TEXT NOT NULL DEFAULT '',
79 text TEXT NOT NULL,
80 embedding TEXT NOT NULL DEFAULT '',
81 updated_at INTEGER NOT NULL
82 );
83
84 -- Embedding cache (OpenClaw-compatible)
85 CREATE TABLE IF NOT EXISTS embedding_cache (
86 provider TEXT NOT NULL,
87 model TEXT NOT NULL,
88 provider_key TEXT NOT NULL,
89 hash TEXT NOT NULL,
90 embedding TEXT NOT NULL,
91 dims INTEGER,
92 updated_at INTEGER NOT NULL,
93 PRIMARY KEY (provider, model, provider_key, hash)
94 );
95
96 -- Indexes
97 CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path);
98 CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source);
99 CREATE INDEX IF NOT EXISTS idx_embedding_cache_updated_at ON embedding_cache(updated_at);
100 "#,
101 )?;
102
103 Self::ensure_fts_table(&conn)?;
105
106 Self::ensure_column(&conn, "files", "source", "TEXT NOT NULL DEFAULT 'memory'")?;
108 Self::ensure_column(&conn, "chunks", "source", "TEXT NOT NULL DEFAULT 'memory'")?;
109
110 let has_vec_extension = Self::try_load_sqlite_vec(&conn);
112 if has_vec_extension {
113 debug!("sqlite-vec extension loaded successfully");
114 Self::ensure_vec_table(&conn)?;
115 } else {
116 debug!("sqlite-vec extension not available, using in-memory vector search");
117 }
118
119 Ok(Self {
120 conn: Arc::new(Mutex::new(conn)),
121 workspace: workspace.to_path_buf(),
122 db_path: db_path.to_path_buf(),
123 has_vec_extension,
124 chunk_size: 400,
125 chunk_overlap: 80,
126 })
127 }
128
129 pub fn with_chunk_config(mut self, chunk_size: usize, chunk_overlap: usize) -> Self {
131 self.chunk_size = chunk_size;
132 self.chunk_overlap = chunk_overlap;
133 self
134 }
135
136 #[cfg(feature = "sqlite-vec")]
138 #[allow(unsafe_code)]
139 fn try_load_sqlite_vec(conn: &Connection) -> bool {
140 if unsafe { conn.load_extension_enable() }.is_err() {
146 return false;
147 }
148
149 let ext_paths = [
151 "vec0", "./vec0",
153 "/usr/local/lib/vec0",
154 "/usr/lib/vec0",
155 ];
156
157 for path in ext_paths {
158 if unsafe { conn.load_extension(path, None::<&str>) }.is_ok() {
160 let _ = conn.load_extension_disable();
161 return true;
162 }
163 }
164
165 let _ = conn.load_extension_disable();
166 false
167 }
168
169 #[cfg(not(feature = "sqlite-vec"))]
170 fn try_load_sqlite_vec(_conn: &Connection) -> bool {
171 false
172 }
173
174 fn ensure_vec_table(conn: &Connection) -> Result<()> {
176 let result = conn.execute(
178 "CREATE VIRTUAL TABLE IF NOT EXISTS chunks_vec USING vec0(id TEXT PRIMARY KEY, embedding float[1536])",
179 [],
180 );
181 match result {
182 Ok(_) => debug!("chunks_vec table created/verified"),
183 Err(e) => debug!("chunks_vec table creation skipped: {}", e),
184 }
185 Ok(())
186 }
187
188 pub fn new(workspace: &Path) -> Result<Self> {
190 let db_path = workspace.join("memory.sqlite");
191 Self::new_with_db_path(workspace, &db_path)
192 }
193
194 pub fn index_file(&self, path: &Path, force: bool) -> Result<bool> {
196 let content = fs::read_to_string(path)?;
197 let file_hash = hash_content(&content);
198 let metadata = fs::metadata(path)?;
199 let mtime = metadata
200 .modified()?
201 .duration_since(std::time::UNIX_EPOCH)?
202 .as_secs() as i64;
203 let size = metadata.len() as i64;
204
205 let relative_path = path
206 .strip_prefix(&self.workspace)
207 .unwrap_or(path)
208 .to_string_lossy()
209 .to_string();
210
211 let conn = self
212 .conn
213 .lock()
214 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
215
216 if !force {
218 let existing: Option<String> = conn
219 .query_row(
220 "SELECT hash FROM files WHERE path = ?1",
221 params![&relative_path],
222 |row| row.get(0),
223 )
224 .ok();
225
226 if existing.as_deref() == Some(&file_hash) {
227 debug!("File unchanged, skipping: {}", relative_path);
228 return Ok(false);
229 }
230 }
231
232 debug!("Indexing file: {}", relative_path);
233
234 let now = std::time::SystemTime::now()
235 .duration_since(std::time::UNIX_EPOCH)?
236 .as_secs() as i64;
237
238 conn.execute(
240 "INSERT OR REPLACE INTO files (path, source, hash, mtime, size) VALUES (?1, 'memory', ?2, ?3, ?4)",
241 params![&relative_path, &file_hash, mtime, size],
242 )?;
243
244 Self::delete_chunks_for_path(&conn, &relative_path)?;
246
247 let chunks = chunk_text(&content, self.chunk_size, self.chunk_overlap);
249
250 for chunk in chunks.iter() {
251 let chunk_id = Uuid::new_v4().to_string();
252 let chunk_hash = hash_content(&chunk.content);
253
254 conn.execute(
255 r#"INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
256 VALUES (?1, ?2, 'memory', ?3, ?4, ?5, '', ?6, '', ?7)"#,
257 params![&chunk_id, &relative_path, chunk.line_start, chunk.line_end, &chunk_hash, &chunk.content, now],
258 )?;
259
260 Self::insert_fts(
262 &conn,
263 &chunk_id,
264 &relative_path,
265 "memory",
266 "",
267 chunk.line_start,
268 chunk.line_end,
269 &chunk.content,
270 )?;
271 }
272
273 Ok(true)
274 }
275
276 pub fn insert_chunk(
279 &self,
280 virtual_path: &str,
281 content: &str,
282 line_start_raw: usize,
283 line_end_raw: usize,
284 ) -> Result<()> {
285 let conn = self
286 .conn
287 .lock()
288 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
289
290 let line_start = line_start_raw as i64;
291 let line_end = line_end_raw as i64;
292 let chunk_id = Uuid::new_v4().to_string();
293 let chunk_hash = hash_content(content);
294 let now = std::time::SystemTime::now()
295 .duration_since(std::time::UNIX_EPOCH)?
296 .as_secs() as i64;
297
298 conn.execute(
299 r#"INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
300 VALUES (?1, ?2, 'session', ?3, ?4, ?5, '', ?6, '', ?7)"#,
301 params![&chunk_id, virtual_path, line_start, line_end, &chunk_hash, content, now],
302 )?;
303
304 Self::insert_fts(
305 &conn,
306 &chunk_id,
307 virtual_path,
308 "session",
309 "",
310 line_start as i32,
311 line_end as i32,
312 content,
313 )?;
314
315 Ok(())
316 }
317
318 fn delete_chunks_for_path(conn: &Connection, path: &str) -> Result<()> {
320 let mut stmt = conn.prepare("SELECT id FROM chunks WHERE path = ?1")?;
322 let chunk_ids: Vec<String> = stmt
323 .query_map(params![path], |row| row.get(0))?
324 .filter_map(|r| r.ok())
325 .collect();
326
327 for chunk_id in chunk_ids {
328 let _ = conn.execute("DELETE FROM chunks_fts WHERE id = ?1", params![&chunk_id]);
329 }
330
331 conn.execute("DELETE FROM chunks WHERE path = ?1", params![path])?;
333 Ok(())
334 }
335
336 pub fn remove_file(&self, relative_path: &str) -> Result<()> {
338 let conn = self
339 .conn
340 .lock()
341 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
342
343 Self::delete_chunks_for_path(&conn, relative_path)?;
344 conn.execute("DELETE FROM files WHERE path = ?1", params![relative_path])?;
345
346 debug!("Removed deleted file from index: {}", relative_path);
347 Ok(())
348 }
349
350 pub fn indexed_files(&self) -> Result<Vec<String>> {
352 let conn = self
353 .conn
354 .lock()
355 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
356
357 let mut stmt = conn.prepare("SELECT path FROM files")?;
358 let rows = stmt.query_map([], |row| row.get(0))?;
359
360 let mut paths = Vec::new();
361 for row in rows {
362 paths.push(row?);
363 }
364 Ok(paths)
365 }
366
367 #[allow(clippy::too_many_arguments)]
369 fn insert_fts(
370 conn: &Connection,
371 id: &str,
372 path: &str,
373 source: &str,
374 model: &str,
375 start_line: i32,
376 end_line: i32,
377 text: &str,
378 ) -> Result<()> {
379 let _ = conn.execute(
380 "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
381 params![text, id, path, source, model, start_line, end_line],
382 );
383 Ok(())
384 }
385
386 pub fn search(&self, query: &str, limit: usize) -> Result<Vec<MemoryChunk>> {
388 let fts_query = match build_fts_query(query) {
389 Some(q) => q,
390 None => return Ok(Vec::new()),
391 };
392
393 let conn = self
394 .conn
395 .lock()
396 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
397
398 let mut stmt = conn.prepare(
401 r#"
402 SELECT fts.path, fts.start_line, fts.end_line, fts.text, bm25(chunks_fts) as score, c.updated_at
403 FROM chunks_fts fts
404 JOIN chunks c ON c.id = fts.id
405 WHERE chunks_fts MATCH ?1
406 ORDER BY score
407 LIMIT ?2
408 "#,
409 )?;
410
411 let rows = stmt.query_map(params![&fts_query, limit as i64], |row| {
412 Ok(MemoryChunk {
413 file: row.get(0)?,
414 line_start: row.get(1)?,
415 line_end: row.get(2)?,
416 content: row.get(3)?,
417 score: row.get::<_, f64>(4)?.abs(), updated_at: row.get(5)?,
419 })
420 })?;
421
422 let mut results = Vec::new();
423 for row in rows {
424 results.push(row?);
425 }
426
427 Ok(results)
428 }
429
430 pub fn search_fts_raw(&self, fts_query: &str, limit: usize) -> Result<Vec<MemoryChunk>> {
433 if fts_query.is_empty() {
434 return Ok(Vec::new());
435 }
436
437 let conn = self
438 .conn
439 .lock()
440 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
441
442 let mut stmt = conn.prepare(
443 r#"
444 SELECT fts.path, fts.start_line, fts.end_line, fts.text, bm25(chunks_fts) as score, c.updated_at
445 FROM chunks_fts fts
446 JOIN chunks c ON c.id = fts.id
447 WHERE chunks_fts MATCH ?1
448 ORDER BY score
449 LIMIT ?2
450 "#,
451 )?;
452
453 let rows = stmt.query_map(params![fts_query, limit as i64], |row| {
454 Ok(MemoryChunk {
455 file: row.get(0)?,
456 line_start: row.get(1)?,
457 line_end: row.get(2)?,
458 content: row.get(3)?,
459 score: row.get::<_, f64>(4)?.abs(),
460 updated_at: row.get(5)?,
461 })
462 })?;
463
464 let mut results = Vec::new();
465 for row in rows {
466 results.push(row?);
467 }
468
469 Ok(results)
470 }
471
472 pub fn chunk_count(&self) -> Result<usize> {
474 let conn = self
475 .conn
476 .lock()
477 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
478 let count: i64 = conn.query_row("SELECT COUNT(*) FROM chunks", [], |row| row.get(0))?;
479 Ok(count as usize)
480 }
481
482 pub fn file_chunk_count(&self, path: &Path) -> Result<usize> {
484 let relative_path = path
485 .strip_prefix(&self.workspace)
486 .unwrap_or(path)
487 .to_string_lossy()
488 .to_string();
489
490 let conn = self
491 .conn
492 .lock()
493 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
494 let count: i64 = conn.query_row(
495 "SELECT COUNT(*) FROM chunks WHERE path = ?1",
496 params![&relative_path],
497 |row| row.get(0),
498 )?;
499 Ok(count as usize)
500 }
501
502 pub fn size_bytes(&self) -> Result<u64> {
504 if self.db_path.exists() {
505 Ok(fs::metadata(&self.db_path)?.len())
506 } else {
507 Ok(0)
508 }
509 }
510
511 pub fn db_path(&self) -> &Path {
513 &self.db_path
514 }
515
516 fn needs_schema_migration(conn: &Connection) -> Result<bool> {
518 let result: rusqlite::Result<String> =
523 conn.query_row("PRAGMA table_info(chunks)", [], |row| row.get(1));
524
525 if result.is_err() {
526 return Ok(false);
528 }
529
530 let has_file_path: bool = conn.prepare("SELECT file_path FROM chunks LIMIT 0").is_ok();
532 let has_content: bool = conn.prepare("SELECT content FROM chunks LIMIT 0").is_ok();
533
534 Ok(has_file_path || has_content)
536 }
537
538 fn migrate_to_openclaw_schema(conn: &Connection) -> Result<()> {
540 conn.execute("BEGIN TRANSACTION", [])?;
542
543 let _ = conn.execute("ALTER TABLE chunks RENAME TO chunks_old", []);
545 let _ = conn.execute("ALTER TABLE files RENAME TO files_old", []);
546
547 let _ = conn.execute("DROP TABLE IF EXISTS chunks_fts", []);
549 let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_ai", []);
550 let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_ad", []);
551 let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_au", []);
552
553 conn.execute(
555 r#"
556 CREATE TABLE IF NOT EXISTS files (
557 path TEXT PRIMARY KEY,
558 source TEXT NOT NULL DEFAULT 'memory',
559 hash TEXT NOT NULL,
560 mtime INTEGER NOT NULL,
561 size INTEGER NOT NULL
562 )
563 "#,
564 [],
565 )?;
566
567 conn.execute(
568 r#"
569 CREATE TABLE IF NOT EXISTS chunks (
570 id TEXT PRIMARY KEY,
571 path TEXT NOT NULL,
572 source TEXT NOT NULL DEFAULT 'memory',
573 start_line INTEGER NOT NULL,
574 end_line INTEGER NOT NULL,
575 hash TEXT NOT NULL,
576 model TEXT NOT NULL DEFAULT '',
577 text TEXT NOT NULL,
578 embedding TEXT NOT NULL DEFAULT '',
579 updated_at INTEGER NOT NULL
580 )
581 "#,
582 [],
583 )?;
584
585 let now = std::time::SystemTime::now()
587 .duration_since(std::time::UNIX_EPOCH)
588 .unwrap_or_default()
589 .as_secs() as i64;
590
591 let _ = conn.execute(
593 r#"
594 INSERT INTO files (path, source, hash, mtime, size)
595 SELECT path, 'memory', hash, mtime, size FROM files_old
596 "#,
597 [],
598 );
599
600 let has_embedding_cols = conn
603 .prepare("SELECT embedding FROM chunks_old LIMIT 0")
604 .is_ok();
605
606 if has_embedding_cols {
608 let mut stmt = conn.prepare(
610 "SELECT file_path, line_start, line_end, content, embedding, embedding_model FROM chunks_old",
611 )?;
612 let rows = stmt.query_map([], |row| {
613 Ok((
614 row.get::<_, String>(0)?,
615 row.get::<_, i32>(1)?,
616 row.get::<_, i32>(2)?,
617 row.get::<_, String>(3)?,
618 row.get::<_, Option<String>>(4)?,
619 row.get::<_, Option<String>>(5)?,
620 ))
621 })?;
622
623 for row in rows {
624 let (file_path, line_start, line_end, content, embedding, model) = row?;
625 let new_id = Uuid::new_v4().to_string();
626 let hash = hash_content(&content);
627 let model = model.unwrap_or_default();
628 let embedding = embedding.unwrap_or_default();
629
630 conn.execute(
631 r#"
632 INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
633 VALUES (?1, ?2, 'memory', ?3, ?4, ?5, ?6, ?7, ?8, ?9)
634 "#,
635 params![&new_id, &file_path, line_start, line_end, &hash, &model, &content, &embedding, now],
636 )?;
637 }
638 } else {
639 let mut stmt =
641 conn.prepare("SELECT file_path, line_start, line_end, content FROM chunks_old")?;
642 let rows = stmt.query_map([], |row| {
643 Ok((
644 row.get::<_, String>(0)?,
645 row.get::<_, i32>(1)?,
646 row.get::<_, i32>(2)?,
647 row.get::<_, String>(3)?,
648 ))
649 })?;
650
651 for row in rows {
652 let (file_path, line_start, line_end, content) = row?;
653 let new_id = Uuid::new_v4().to_string();
654 let hash = hash_content(&content);
655
656 conn.execute(
657 r#"
658 INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
659 VALUES (?1, ?2, 'memory', ?3, ?4, ?5, '', ?6, '', ?7)
660 "#,
661 params![&new_id, &file_path, line_start, line_end, &hash, &content, now],
662 )?;
663 }
664 }
665
666 let _ = conn.execute("DROP TABLE IF EXISTS chunks_old", []);
668 let _ = conn.execute("DROP TABLE IF EXISTS files_old", []);
669
670 conn.execute("COMMIT", [])?;
671 info!("Schema migration completed successfully");
672 Ok(())
673 }
674
675 fn ensure_fts_table(conn: &Connection) -> Result<()> {
677 let result = conn.execute(
679 r#"
680 CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
681 text,
682 id UNINDEXED,
683 path UNINDEXED,
684 source UNINDEXED,
685 model UNINDEXED,
686 start_line UNINDEXED,
687 end_line UNINDEXED
688 )
689 "#,
690 [],
691 );
692
693 match result {
694 Ok(_) => debug!("FTS5 table created/verified"),
695 Err(e) => debug!("FTS5 table creation skipped: {}", e),
696 }
697
698 Ok(())
699 }
700
701 fn ensure_column(conn: &Connection, table: &str, column: &str, definition: &str) -> Result<()> {
703 let sql = format!("SELECT {} FROM {} LIMIT 0", column, table);
704 if conn.prepare(&sql).is_err() {
705 let alter = format!("ALTER TABLE {} ADD COLUMN {} {}", table, column, definition);
706 conn.execute(&alter, [])?;
707 debug!("Added column {} to table {}", column, table);
708 }
709 Ok(())
710 }
711
712 pub fn chunks_without_embeddings(&self, limit: usize) -> Result<Vec<(String, String)>> {
714 let conn = self
715 .conn
716 .lock()
717 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
718
719 let mut stmt = conn.prepare(
720 "SELECT id, text FROM chunks WHERE embedding = '' OR embedding IS NULL LIMIT ?1",
721 )?;
722
723 let rows = stmt.query_map(params![limit as i64], |row| {
724 Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
725 })?;
726
727 let mut results = Vec::new();
728 for row in rows {
729 results.push(row?);
730 }
731
732 Ok(results)
733 }
734
735 pub fn store_embedding(&self, chunk_id: &str, embedding: &[f32], model: &str) -> Result<()> {
737 let conn = self
738 .conn
739 .lock()
740 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
741
742 let embedding_json = serialize_embedding(embedding);
743 let now = std::time::SystemTime::now()
744 .duration_since(std::time::UNIX_EPOCH)?
745 .as_secs() as i64;
746
747 conn.execute(
748 "UPDATE chunks SET embedding = ?1, model = ?2, updated_at = ?3 WHERE id = ?4",
749 params![&embedding_json, model, now, chunk_id],
750 )?;
751
752 if self.has_vec_extension {
754 let embedding_blob = embedding_to_blob(embedding);
755 let _ = conn.execute(
756 "INSERT OR REPLACE INTO chunks_vec (id, embedding) VALUES (?1, ?2)",
757 params![chunk_id, &embedding_blob],
758 );
759 }
760
761 Ok(())
762 }
763
764 pub fn get_cached_embedding(
770 &self,
771 provider: &str,
772 model: &str,
773 text_hash: &str,
774 ) -> Result<Option<Vec<f32>>> {
775 let conn = self
776 .conn
777 .lock()
778 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
779
780 let result: Option<String> = conn
781 .query_row(
782 "SELECT embedding FROM embedding_cache WHERE provider = ?1 AND model = ?2 AND hash = ?3",
783 params![provider, model, text_hash],
784 |row| row.get(0),
785 )
786 .ok();
787
788 Ok(result.map(|json| deserialize_embedding(&json)))
789 }
790
791 pub fn cache_embedding(
793 &self,
794 provider: &str,
795 model: &str,
796 provider_key: &str,
797 text_hash: &str,
798 embedding: &[f32],
799 ) -> Result<()> {
800 let conn = self
801 .conn
802 .lock()
803 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
804
805 let embedding_json = serialize_embedding(embedding);
806 let dims = embedding.len() as i32;
807 let now = std::time::SystemTime::now()
808 .duration_since(std::time::UNIX_EPOCH)?
809 .as_secs() as i64;
810
811 conn.execute(
812 "INSERT OR REPLACE INTO embedding_cache (provider, model, provider_key, hash, embedding, dims, updated_at)
813 VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
814 params![provider, model, provider_key, text_hash, &embedding_json, dims, now],
815 )?;
816
817 Ok(())
818 }
819
    /// Reports whether the sqlite-vec extension was loaded when this index was
    /// opened.
    pub fn has_vec_extension(&self) -> bool {
        self.has_vec_extension
    }
824
825 pub fn search_vector(
828 &self,
829 query_embedding: &[f32],
830 model: &str,
831 limit: usize,
832 ) -> Result<Vec<MemoryChunk>> {
833 let conn = self
834 .conn
835 .lock()
836 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
837
838 if self.has_vec_extension {
840 if let Ok(results) = self.search_vector_fast(&conn, query_embedding, model, limit) {
841 return Ok(results);
842 }
843 warn!("sqlite-vec search failed, falling back to in-memory scan");
844 }
845
846 self.search_vector_scan(&conn, query_embedding, model, limit)
848 }
849
850 fn search_vector_fast(
852 &self,
853 conn: &Connection,
854 query_embedding: &[f32],
855 model: &str,
856 limit: usize,
857 ) -> Result<Vec<MemoryChunk>> {
858 let query_blob = embedding_to_blob(query_embedding);
859
860 let mut stmt = conn.prepare(
862 r#"
863 SELECT c.path, c.start_line, c.end_line, c.text,
864 1.0 - vec_distance_cosine(v.embedding, ?1) AS score, c.updated_at
865 FROM chunks_vec v
866 JOIN chunks c ON c.id = v.id
867 WHERE c.model = ?2
868 ORDER BY score DESC
869 LIMIT ?3
870 "#,
871 )?;
872
873 let rows = stmt.query_map(params![&query_blob, model, limit as i64], |row| {
874 Ok(MemoryChunk {
875 file: row.get(0)?,
876 line_start: row.get(1)?,
877 line_end: row.get(2)?,
878 content: row.get(3)?,
879 score: row.get(4)?,
880 updated_at: row.get(5)?,
881 })
882 })?;
883
884 let mut results = Vec::new();
885 for row in rows {
886 results.push(row?);
887 }
888 Ok(results)
889 }
890
891 fn search_vector_scan(
893 &self,
894 conn: &Connection,
895 query_embedding: &[f32],
896 model: &str,
897 limit: usize,
898 ) -> Result<Vec<MemoryChunk>> {
899 let mut stmt = conn.prepare(
900 "SELECT id, path, start_line, end_line, text, embedding, updated_at
901 FROM chunks
902 WHERE embedding != '' AND embedding IS NOT NULL AND model = ?1",
903 )?;
904
905 let rows = stmt.query_map(params![model], |row| {
906 Ok((
907 row.get::<_, String>(0)?,
908 row.get::<_, String>(1)?,
909 row.get::<_, i32>(2)?,
910 row.get::<_, i32>(3)?,
911 row.get::<_, String>(4)?,
912 row.get::<_, String>(5)?,
913 row.get::<_, i64>(6)?,
914 ))
915 })?;
916
917 let mut scored: Vec<(f32, MemoryChunk)> = Vec::new();
919
920 for row in rows {
921 let (_, path, start_line, end_line, text, embedding_json, updated_at) = row?;
922 let embedding = deserialize_embedding(&embedding_json);
923
924 if embedding.len() == query_embedding.len() {
925 let similarity = cosine_similarity(query_embedding, &embedding);
926 scored.push((
927 similarity,
928 MemoryChunk {
929 file: path,
930 line_start: start_line,
931 line_end: end_line,
932 content: text,
933 score: similarity as f64,
934 updated_at,
935 },
936 ));
937 }
938 }
939
940 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
942
943 Ok(scored
945 .into_iter()
946 .take(limit)
947 .map(|(_, chunk)| chunk)
948 .collect())
949 }
950
951 pub fn search_hybrid(
953 &self,
954 fts_query: &str,
955 query_embedding: Option<&[f32]>,
956 model: &str,
957 limit: usize,
958 text_weight: f32,
959 vector_weight: f32,
960 ) -> Result<Vec<MemoryChunk>> {
961 let fts_results = self.search_fts_raw(fts_query, limit * 2)?;
963
964 let vector_results = if let Some(embedding) = query_embedding {
966 self.search_vector(embedding, model, limit * 2)?
967 } else {
968 Vec::new()
969 };
970
971 let mut merged: std::collections::HashMap<String, (f32, MemoryChunk)> =
973 std::collections::HashMap::new();
974
975 for (rank, result) in fts_results.into_iter().enumerate() {
978 let key = format!("{}:{}:{}", result.file, result.line_start, result.line_end);
979 let rank_score = 1.0 / (1.0 + rank as f32); let weighted_score = rank_score * text_weight;
981 merged.insert(key, (weighted_score, result));
982 }
983
984 for (rank, result) in vector_results.into_iter().enumerate() {
986 let key = format!("{}:{}:{}", result.file, result.line_start, result.line_end);
987 let rank_score = 1.0 / (1.0 + rank as f32);
988 let weighted_score = rank_score * vector_weight;
989
990 if let Some((existing_score, existing_chunk)) = merged.get_mut(&key) {
991 *existing_score += weighted_score;
992 existing_chunk.score = *existing_score as f64;
993 } else {
994 let mut chunk = result;
995 chunk.score = weighted_score as f64;
996 merged.insert(key, (weighted_score, chunk));
997 }
998 }
999
1000 let mut results: Vec<_> = merged.into_values().collect();
1002 results.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1003
1004 Ok(results
1005 .into_iter()
1006 .take(limit)
1007 .map(|(_, chunk)| chunk)
1008 .collect())
1009 }
1010
1011 pub fn embedded_chunk_count(&self, model: &str) -> Result<usize> {
1013 let conn = self
1014 .conn
1015 .lock()
1016 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
1017
1018 let count: i64 = conn.query_row(
1019 "SELECT COUNT(*) FROM chunks WHERE embedding != '' AND embedding IS NOT NULL AND model = ?1",
1020 params![model],
1021 |row| row.get(0),
1022 )?;
1023
1024 Ok(count as usize)
1025 }
1026}
1027
1028fn hash_content(content: &str) -> String {
1029 let mut hasher = Sha256::new();
1030 hasher.update(content.as_bytes());
1031 format!("{:x}", hasher.finalize())
1032}
1033
/// Encodes `embedding` as a byte blob: each `f32` becomes its four
/// little-endian bytes, in input order.
fn embedding_to_blob(embedding: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(std::mem::size_of_val(embedding));
    bytes.extend(embedding.iter().flat_map(|value| value.to_le_bytes()));
    bytes
}
1042
/// Turns free-form user input into an FTS5 query.
///
/// The input is split on every character that is neither alphanumeric nor
/// `_`; each resulting token is wrapped in double quotes and the tokens are
/// joined with ` AND `. Returns `None` when no token remains.
fn build_fts_query(raw: &str) -> Option<String> {
    // Tokens can contain neither whitespace nor `"`, since both are
    // separators, so quoting them needs no further clean-up.
    let terms: Vec<String> = raw
        .split(|c: char| !(c.is_alphanumeric() || c == '_'))
        .filter(|term| !term.is_empty())
        .map(|term| format!("\"{}\"", term))
        .collect();

    if terms.is_empty() {
        None
    } else {
        Some(terms.join(" AND "))
    }
}
1064
/// One chunk produced by [`chunk_text`].
#[derive(Debug, Clone, PartialEq, Eq)]
struct ChunkInfo {
    /// 1-based number of the first line in the chunk.
    line_start: i32,
    /// 1-based number of the last line in the chunk (inclusive).
    line_end: i32,
    /// The chunk's lines joined with `\n`.
    content: String,
}

/// Splits `text` into line-aligned chunks of roughly `target_tokens` tokens,
/// with about `overlap_tokens` tokens repeated between neighbouring chunks.
///
/// A token is estimated as 4 bytes. Lines are accumulated until the estimate
/// reaches the target (or the text ends), and a chunk is emitted. The next
/// chunk then starts with the shortest run of trailing lines that reaches the
/// overlap size. As before, nothing is carried when even the whole chunk is
/// shorter than the overlap.
///
/// Two guarantees hold:
/// - `overlap_tokens == 0` means no overlap: consecutive chunks are disjoint.
/// - The overlap never covers the whole previous chunk, so every chunk starts
///   on a later line than the one before it. Without this, one line longer
///   than the overlap makes every following line emit a larger chunk starting
///   at the same line.
///
/// Returns an empty vector for empty input.
fn chunk_text(text: &str, target_tokens: usize, overlap_tokens: usize) -> Vec<ChunkInfo> {
    let lines: Vec<&str> = text.lines().collect();
    let mut chunks = Vec::new();

    if lines.is_empty() {
        return chunks;
    }

    // Rough estimate: one token is about four bytes.
    let target_chars = target_tokens.saturating_mul(4);
    let overlap_chars = overlap_tokens.saturating_mul(4);
    let last_index = lines.len() - 1;

    // Invariant: `chunk_lines` holds lines `start_line..=i` (0-based).
    let mut start_line = 0;
    let mut current_chars = 0;
    let mut chunk_lines: Vec<&str> = Vec::new();

    for (i, &line) in lines.iter().enumerate() {
        chunk_lines.push(line);
        current_chars += line.len() + 1; // +1 for the newline

        if current_chars < target_chars && i != last_index {
            continue;
        }

        chunks.push(ChunkInfo {
            line_start: (start_line + 1) as i32,
            line_end: (i + 1) as i32,
            content: chunk_lines.join("\n"),
        });

        if i == last_index {
            break;
        }

        // Index of the first line to carry over; `len` means "carry nothing".
        let mut carry_from = chunk_lines.len();
        if overlap_chars > 0 {
            let mut tail_chars = 0;
            for (j, tail_line) in chunk_lines.iter().enumerate().rev() {
                tail_chars += tail_line.len() + 1;
                if tail_chars >= overlap_chars {
                    carry_from = j;
                    break;
                }
            }
            // Always drop at least the first line to guarantee progress.
            carry_from = carry_from.max(1);
        }

        start_line += carry_from;
        chunk_lines.drain(..carry_from);
        current_chars = chunk_lines.iter().map(|l| l.len() + 1).sum();
    }

    chunks
}
1126
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Chunking a short text yields at least one chunk, starting at line 1.
    #[test]
    fn test_chunk_text() {
        let sample = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5";

        let pieces = chunk_text(sample, 10, 2);

        assert!(!pieces.is_empty());
        assert_eq!(pieces[0].line_start, 1);
    }

    /// Indexing a file in a temporary workspace makes it countable and
    /// findable through full-text search.
    #[test]
    fn test_memory_index() -> Result<()> {
        let scratch = TempDir::new()?;
        let workspace = scratch.path();

        let document = workspace.join("test.md");
        fs::write(
            &document,
            "# Test\n\nThis is a test document.\n\nWith multiple lines.",
        )?;

        let index = MemoryIndex::new(workspace)?;
        index.index_file(&document, false)?;

        let stored_chunks = index.chunk_count()?;
        assert!(stored_chunks > 0);

        let hits = index.search("test document", 10)?;
        assert!(!hits.is_empty());

        Ok(())
    }
}