1use anyhow::{Result, anyhow};
2use rusqlite::{Connection, params};
3use sha2::{Digest, Sha256};
4use std::fs;
5use std::path::{Path, PathBuf};
6use std::sync::{Arc, Mutex};
7use std::time::Duration;
8use tracing::{debug, info, warn};
9use uuid::Uuid;
10
11use super::embeddings::{cosine_similarity, deserialize_embedding, serialize_embedding};
12use super::search::MemoryChunk;
13
/// SQLite-backed index over the text files of a workspace.
///
/// Clones share the same underlying connection through the `Arc<Mutex<_>>`.
#[derive(Clone)]
pub struct MemoryIndex {
    /// Shared database connection; each operation locks it while it runs.
    conn: Arc<Mutex<Connection>>,
    /// Workspace root; indexed paths are stored relative to it when they lie inside it.
    workspace: PathBuf,
    /// Location of the SQLite database file.
    db_path: PathBuf,
    /// Whether the sqlite-vec extension was loaded when the index was opened.
    has_vec_extension: bool,
    /// Target chunk size handed to `chunk_text` (which multiplies it by 4 to get characters).
    chunk_size: usize,
    /// Chunk overlap handed to `chunk_text` (same unit as `chunk_size`).
    chunk_overlap: usize,
}
26
/// Summary counters for a reindex run.
///
/// NOTE(review): nothing in this file populates this struct; the field
/// descriptions below are inferred from the names — confirm against the producer.
#[derive(Debug)]
pub struct ReindexStats {
    /// Presumably the number of files visited.
    pub files_processed: usize,
    /// Presumably the number of files whose index entries were rewritten.
    pub files_updated: usize,
    /// Presumably the number of chunks written.
    pub chunks_indexed: usize,
    /// Presumably the wall-clock time of the run.
    pub duration: Duration,
}
34
35impl MemoryIndex {
36 pub fn new_with_db_path(workspace: &Path, db_path: &Path) -> Result<Self> {
38 if let Some(parent) = db_path.parent() {
40 fs::create_dir_all(parent)?;
41 }
42
43 let conn = Connection::open(db_path)?;
44
45 let needs_migration = Self::needs_schema_migration(&conn)?;
47 if needs_migration {
48 info!("Migrating database schema to OpenClaw-compatible format");
49 Self::migrate_to_openclaw_schema(&conn)?;
50 }
51
52 conn.execute_batch(
54 r#"
55 -- Metadata key/value store
56 CREATE TABLE IF NOT EXISTS meta (
57 key TEXT PRIMARY KEY,
58 value TEXT NOT NULL
59 );
60
61 -- File tracking (OpenClaw-compatible)
62 CREATE TABLE IF NOT EXISTS files (
63 path TEXT PRIMARY KEY,
64 source TEXT NOT NULL DEFAULT 'memory',
65 hash TEXT NOT NULL,
66 mtime INTEGER NOT NULL,
67 size INTEGER NOT NULL
68 );
69
70 -- Chunked content (OpenClaw-compatible)
71 CREATE TABLE IF NOT EXISTS chunks (
72 id TEXT PRIMARY KEY,
73 path TEXT NOT NULL,
74 source TEXT NOT NULL DEFAULT 'memory',
75 start_line INTEGER NOT NULL,
76 end_line INTEGER NOT NULL,
77 hash TEXT NOT NULL,
78 model TEXT NOT NULL DEFAULT '',
79 text TEXT NOT NULL,
80 embedding TEXT NOT NULL DEFAULT '',
81 updated_at INTEGER NOT NULL
82 );
83
84 -- Embedding cache (OpenClaw-compatible)
85 CREATE TABLE IF NOT EXISTS embedding_cache (
86 provider TEXT NOT NULL,
87 model TEXT NOT NULL,
88 provider_key TEXT NOT NULL,
89 hash TEXT NOT NULL,
90 embedding TEXT NOT NULL,
91 dims INTEGER,
92 updated_at INTEGER NOT NULL,
93 PRIMARY KEY (provider, model, provider_key, hash)
94 );
95
96 -- Indexes
97 CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path);
98 CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source);
99 CREATE INDEX IF NOT EXISTS idx_embedding_cache_updated_at ON embedding_cache(updated_at);
100 "#,
101 )?;
102
103 Self::ensure_fts_table(&conn)?;
105
106 Self::ensure_column(&conn, "files", "source", "TEXT NOT NULL DEFAULT 'memory'")?;
108 Self::ensure_column(&conn, "chunks", "source", "TEXT NOT NULL DEFAULT 'memory'")?;
109
110 let has_vec_extension = Self::try_load_sqlite_vec(&conn);
112 if has_vec_extension {
113 debug!("sqlite-vec extension loaded successfully");
114 Self::ensure_vec_table(&conn)?;
115 } else {
116 debug!("sqlite-vec extension not available, using in-memory vector search");
117 }
118
119 Ok(Self {
120 conn: Arc::new(Mutex::new(conn)),
121 workspace: workspace.to_path_buf(),
122 db_path: db_path.to_path_buf(),
123 has_vec_extension,
124 chunk_size: 400,
125 chunk_overlap: 80,
126 })
127 }
128
129 pub fn with_chunk_config(mut self, chunk_size: usize, chunk_overlap: usize) -> Self {
131 self.chunk_size = chunk_size;
132 self.chunk_overlap = chunk_overlap;
133 self
134 }
135
/// Tries to load the sqlite-vec (`vec0`) loadable extension into `conn`.
///
/// Returns `true` as soon as one of the fixed candidate paths loads, `false`
/// when extension loading cannot be enabled or every candidate fails.
/// Extension loading is switched off again before returning in both cases.
#[cfg(feature = "sqlite-vec")]
#[allow(unsafe_code)]
fn try_load_sqlite_vec(conn: &Connection) -> bool {
    const CANDIDATES: [&str; 4] = ["vec0", "./vec0", "/usr/local/lib/vec0", "/usr/lib/vec0"];

    // SAFETY: NOTE(review) — enabling extension loading is only as safe as the
    // libraries loaded afterwards; it is disabled again below.
    if unsafe { conn.load_extension_enable() }.is_err() {
        return false;
    }

    // SAFETY: NOTE(review) — the candidates are fixed strings, never caller input;
    // the library found at those paths is trusted to be sqlite-vec.
    let loaded = CANDIDATES
        .iter()
        .copied()
        .any(|candidate| unsafe { conn.load_extension(candidate, None::<&str>) }.is_ok());

    let _ = conn.load_extension_disable();
    loaded
}
168
/// Fallback used when the crate is built without the `sqlite-vec` feature:
/// never attempts to load anything and always reports `false`.
#[cfg(not(feature = "sqlite-vec"))]
fn try_load_sqlite_vec(_conn: &Connection) -> bool {
    false
}
173
174 fn ensure_vec_table(conn: &Connection) -> Result<()> {
176 let result = conn.execute(
178 "CREATE VIRTUAL TABLE IF NOT EXISTS chunks_vec USING vec0(id TEXT PRIMARY KEY, embedding float[1536])",
179 [],
180 );
181 match result {
182 Ok(_) => debug!("chunks_vec table created/verified"),
183 Err(e) => debug!("chunks_vec table creation skipped: {}", e),
184 }
185 Ok(())
186 }
187
188 pub fn new(workspace: &Path) -> Result<Self> {
190 let db_path = workspace.join("memory.sqlite");
191 Self::new_with_db_path(workspace, &db_path)
192 }
193
194 pub fn index_file(&self, path: &Path, force: bool) -> Result<bool> {
196 let content = fs::read_to_string(path)?;
197 let file_hash = hash_content(&content);
198 let metadata = fs::metadata(path)?;
199 let mtime = metadata
200 .modified()?
201 .duration_since(std::time::UNIX_EPOCH)?
202 .as_secs() as i64;
203 let size = metadata.len() as i64;
204
205 let relative_path = path
206 .strip_prefix(&self.workspace)
207 .unwrap_or(path)
208 .to_string_lossy()
209 .to_string();
210
211 let conn = self
212 .conn
213 .lock()
214 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
215
216 if !force {
218 let existing: Option<String> = conn
219 .query_row(
220 "SELECT hash FROM files WHERE path = ?1",
221 params![&relative_path],
222 |row| row.get(0),
223 )
224 .ok();
225
226 if existing.as_deref() == Some(&file_hash) {
227 debug!("File unchanged, skipping: {}", relative_path);
228 return Ok(false);
229 }
230 }
231
232 debug!("Indexing file: {}", relative_path);
233
234 let now = std::time::SystemTime::now()
235 .duration_since(std::time::UNIX_EPOCH)?
236 .as_secs() as i64;
237
238 conn.execute(
240 "INSERT OR REPLACE INTO files (path, source, hash, mtime, size) VALUES (?1, 'memory', ?2, ?3, ?4)",
241 params![&relative_path, &file_hash, mtime, size],
242 )?;
243
244 Self::delete_chunks_for_path(&conn, &relative_path)?;
246
247 let chunks = chunk_text(&content, self.chunk_size, self.chunk_overlap);
249
250 for chunk in chunks.iter() {
251 let chunk_id = Uuid::new_v4().to_string();
252 let chunk_hash = hash_content(&chunk.content);
253
254 conn.execute(
255 r#"INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
256 VALUES (?1, ?2, 'memory', ?3, ?4, ?5, '', ?6, '', ?7)"#,
257 params![&chunk_id, &relative_path, chunk.line_start, chunk.line_end, &chunk_hash, &chunk.content, now],
258 )?;
259
260 Self::insert_fts(
262 &conn,
263 &chunk_id,
264 &relative_path,
265 "memory",
266 "",
267 chunk.line_start,
268 chunk.line_end,
269 &chunk.content,
270 )?;
271 }
272
273 Ok(true)
274 }
275
276 fn delete_chunks_for_path(conn: &Connection, path: &str) -> Result<()> {
278 let mut stmt = conn.prepare("SELECT id FROM chunks WHERE path = ?1")?;
280 let chunk_ids: Vec<String> = stmt
281 .query_map(params![path], |row| row.get(0))?
282 .filter_map(|r| r.ok())
283 .collect();
284
285 for chunk_id in chunk_ids {
286 let _ = conn.execute("DELETE FROM chunks_fts WHERE id = ?1", params![&chunk_id]);
287 }
288
289 conn.execute("DELETE FROM chunks WHERE path = ?1", params![path])?;
291 Ok(())
292 }
293
294 pub fn remove_file(&self, relative_path: &str) -> Result<()> {
296 let conn = self
297 .conn
298 .lock()
299 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
300
301 Self::delete_chunks_for_path(&conn, relative_path)?;
302 conn.execute("DELETE FROM files WHERE path = ?1", params![relative_path])?;
303
304 debug!("Removed deleted file from index: {}", relative_path);
305 Ok(())
306 }
307
308 pub fn indexed_files(&self) -> Result<Vec<String>> {
310 let conn = self
311 .conn
312 .lock()
313 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
314
315 let mut stmt = conn.prepare("SELECT path FROM files")?;
316 let rows = stmt.query_map([], |row| row.get(0))?;
317
318 let mut paths = Vec::new();
319 for row in rows {
320 paths.push(row?);
321 }
322 Ok(paths)
323 }
324
325 #[allow(clippy::too_many_arguments)]
327 fn insert_fts(
328 conn: &Connection,
329 id: &str,
330 path: &str,
331 source: &str,
332 model: &str,
333 start_line: i32,
334 end_line: i32,
335 text: &str,
336 ) -> Result<()> {
337 let _ = conn.execute(
338 "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
339 params![text, id, path, source, model, start_line, end_line],
340 );
341 Ok(())
342 }
343
344 pub fn search(&self, query: &str, limit: usize) -> Result<Vec<MemoryChunk>> {
346 let fts_query = match build_fts_query(query) {
347 Some(q) => q,
348 None => return Ok(Vec::new()),
349 };
350
351 let conn = self
352 .conn
353 .lock()
354 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
355
356 let mut stmt = conn.prepare(
359 r#"
360 SELECT fts.path, fts.start_line, fts.end_line, fts.text, bm25(chunks_fts) as score, c.updated_at
361 FROM chunks_fts fts
362 JOIN chunks c ON c.id = fts.id
363 WHERE chunks_fts MATCH ?1
364 ORDER BY score
365 LIMIT ?2
366 "#,
367 )?;
368
369 let rows = stmt.query_map(params![&fts_query, limit as i64], |row| {
370 Ok(MemoryChunk {
371 file: row.get(0)?,
372 line_start: row.get(1)?,
373 line_end: row.get(2)?,
374 content: row.get(3)?,
375 score: row.get::<_, f64>(4)?.abs(), updated_at: row.get(5)?,
377 })
378 })?;
379
380 let mut results = Vec::new();
381 for row in rows {
382 results.push(row?);
383 }
384
385 Ok(results)
386 }
387
388 pub fn chunk_count(&self) -> Result<usize> {
390 let conn = self
391 .conn
392 .lock()
393 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
394 let count: i64 = conn.query_row("SELECT COUNT(*) FROM chunks", [], |row| row.get(0))?;
395 Ok(count as usize)
396 }
397
398 pub fn file_chunk_count(&self, path: &Path) -> Result<usize> {
400 let relative_path = path
401 .strip_prefix(&self.workspace)
402 .unwrap_or(path)
403 .to_string_lossy()
404 .to_string();
405
406 let conn = self
407 .conn
408 .lock()
409 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
410 let count: i64 = conn.query_row(
411 "SELECT COUNT(*) FROM chunks WHERE path = ?1",
412 params![&relative_path],
413 |row| row.get(0),
414 )?;
415 Ok(count as usize)
416 }
417
418 pub fn size_bytes(&self) -> Result<u64> {
420 if self.db_path.exists() {
421 Ok(fs::metadata(&self.db_path)?.len())
422 } else {
423 Ok(0)
424 }
425 }
426
/// Path of the SQLite database file backing this index.
pub fn db_path(&self) -> &Path {
    &self.db_path
}
431
432 fn needs_schema_migration(conn: &Connection) -> Result<bool> {
434 let result: rusqlite::Result<String> =
439 conn.query_row("PRAGMA table_info(chunks)", [], |row| row.get(1));
440
441 if result.is_err() {
442 return Ok(false);
444 }
445
446 let has_file_path: bool = conn.prepare("SELECT file_path FROM chunks LIMIT 0").is_ok();
448 let has_content: bool = conn.prepare("SELECT content FROM chunks LIMIT 0").is_ok();
449
450 Ok(has_file_path || has_content)
452 }
453
454 fn migrate_to_openclaw_schema(conn: &Connection) -> Result<()> {
456 conn.execute("BEGIN TRANSACTION", [])?;
458
459 let _ = conn.execute("ALTER TABLE chunks RENAME TO chunks_old", []);
461 let _ = conn.execute("ALTER TABLE files RENAME TO files_old", []);
462
463 let _ = conn.execute("DROP TABLE IF EXISTS chunks_fts", []);
465 let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_ai", []);
466 let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_ad", []);
467 let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_au", []);
468
469 conn.execute(
471 r#"
472 CREATE TABLE IF NOT EXISTS files (
473 path TEXT PRIMARY KEY,
474 source TEXT NOT NULL DEFAULT 'memory',
475 hash TEXT NOT NULL,
476 mtime INTEGER NOT NULL,
477 size INTEGER NOT NULL
478 )
479 "#,
480 [],
481 )?;
482
483 conn.execute(
484 r#"
485 CREATE TABLE IF NOT EXISTS chunks (
486 id TEXT PRIMARY KEY,
487 path TEXT NOT NULL,
488 source TEXT NOT NULL DEFAULT 'memory',
489 start_line INTEGER NOT NULL,
490 end_line INTEGER NOT NULL,
491 hash TEXT NOT NULL,
492 model TEXT NOT NULL DEFAULT '',
493 text TEXT NOT NULL,
494 embedding TEXT NOT NULL DEFAULT '',
495 updated_at INTEGER NOT NULL
496 )
497 "#,
498 [],
499 )?;
500
501 let now = std::time::SystemTime::now()
503 .duration_since(std::time::UNIX_EPOCH)
504 .unwrap_or_default()
505 .as_secs() as i64;
506
507 let _ = conn.execute(
509 r#"
510 INSERT INTO files (path, source, hash, mtime, size)
511 SELECT path, 'memory', hash, mtime, size FROM files_old
512 "#,
513 [],
514 );
515
516 let has_embedding_cols = conn
519 .prepare("SELECT embedding FROM chunks_old LIMIT 0")
520 .is_ok();
521
522 if has_embedding_cols {
524 let mut stmt = conn.prepare(
526 "SELECT file_path, line_start, line_end, content, embedding, embedding_model FROM chunks_old",
527 )?;
528 let rows = stmt.query_map([], |row| {
529 Ok((
530 row.get::<_, String>(0)?,
531 row.get::<_, i32>(1)?,
532 row.get::<_, i32>(2)?,
533 row.get::<_, String>(3)?,
534 row.get::<_, Option<String>>(4)?,
535 row.get::<_, Option<String>>(5)?,
536 ))
537 })?;
538
539 for row in rows {
540 let (file_path, line_start, line_end, content, embedding, model) = row?;
541 let new_id = Uuid::new_v4().to_string();
542 let hash = hash_content(&content);
543 let model = model.unwrap_or_default();
544 let embedding = embedding.unwrap_or_default();
545
546 conn.execute(
547 r#"
548 INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
549 VALUES (?1, ?2, 'memory', ?3, ?4, ?5, ?6, ?7, ?8, ?9)
550 "#,
551 params![&new_id, &file_path, line_start, line_end, &hash, &model, &content, &embedding, now],
552 )?;
553 }
554 } else {
555 let mut stmt =
557 conn.prepare("SELECT file_path, line_start, line_end, content FROM chunks_old")?;
558 let rows = stmt.query_map([], |row| {
559 Ok((
560 row.get::<_, String>(0)?,
561 row.get::<_, i32>(1)?,
562 row.get::<_, i32>(2)?,
563 row.get::<_, String>(3)?,
564 ))
565 })?;
566
567 for row in rows {
568 let (file_path, line_start, line_end, content) = row?;
569 let new_id = Uuid::new_v4().to_string();
570 let hash = hash_content(&content);
571
572 conn.execute(
573 r#"
574 INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
575 VALUES (?1, ?2, 'memory', ?3, ?4, ?5, '', ?6, '', ?7)
576 "#,
577 params![&new_id, &file_path, line_start, line_end, &hash, &content, now],
578 )?;
579 }
580 }
581
582 let _ = conn.execute("DROP TABLE IF EXISTS chunks_old", []);
584 let _ = conn.execute("DROP TABLE IF EXISTS files_old", []);
585
586 conn.execute("COMMIT", [])?;
587 info!("Schema migration completed successfully");
588 Ok(())
589 }
590
591 fn ensure_fts_table(conn: &Connection) -> Result<()> {
593 let result = conn.execute(
595 r#"
596 CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
597 text,
598 id UNINDEXED,
599 path UNINDEXED,
600 source UNINDEXED,
601 model UNINDEXED,
602 start_line UNINDEXED,
603 end_line UNINDEXED
604 )
605 "#,
606 [],
607 );
608
609 match result {
610 Ok(_) => debug!("FTS5 table created/verified"),
611 Err(e) => debug!("FTS5 table creation skipped: {}", e),
612 }
613
614 Ok(())
615 }
616
617 fn ensure_column(conn: &Connection, table: &str, column: &str, definition: &str) -> Result<()> {
619 let sql = format!("SELECT {} FROM {} LIMIT 0", column, table);
620 if conn.prepare(&sql).is_err() {
621 let alter = format!("ALTER TABLE {} ADD COLUMN {} {}", table, column, definition);
622 conn.execute(&alter, [])?;
623 debug!("Added column {} to table {}", column, table);
624 }
625 Ok(())
626 }
627
628 pub fn chunks_without_embeddings(&self, limit: usize) -> Result<Vec<(String, String)>> {
630 let conn = self
631 .conn
632 .lock()
633 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
634
635 let mut stmt = conn.prepare(
636 "SELECT id, text FROM chunks WHERE embedding = '' OR embedding IS NULL LIMIT ?1",
637 )?;
638
639 let rows = stmt.query_map(params![limit as i64], |row| {
640 Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
641 })?;
642
643 let mut results = Vec::new();
644 for row in rows {
645 results.push(row?);
646 }
647
648 Ok(results)
649 }
650
651 pub fn store_embedding(&self, chunk_id: &str, embedding: &[f32], model: &str) -> Result<()> {
653 let conn = self
654 .conn
655 .lock()
656 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
657
658 let embedding_json = serialize_embedding(embedding);
659 let now = std::time::SystemTime::now()
660 .duration_since(std::time::UNIX_EPOCH)?
661 .as_secs() as i64;
662
663 conn.execute(
664 "UPDATE chunks SET embedding = ?1, model = ?2, updated_at = ?3 WHERE id = ?4",
665 params![&embedding_json, model, now, chunk_id],
666 )?;
667
668 if self.has_vec_extension {
670 let embedding_blob = embedding_to_blob(embedding);
671 let _ = conn.execute(
672 "INSERT OR REPLACE INTO chunks_vec (id, embedding) VALUES (?1, ?2)",
673 params![chunk_id, &embedding_blob],
674 );
675 }
676
677 Ok(())
678 }
679
680 pub fn get_cached_embedding(
686 &self,
687 provider: &str,
688 model: &str,
689 text_hash: &str,
690 ) -> Result<Option<Vec<f32>>> {
691 let conn = self
692 .conn
693 .lock()
694 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
695
696 let result: Option<String> = conn
697 .query_row(
698 "SELECT embedding FROM embedding_cache WHERE provider = ?1 AND model = ?2 AND hash = ?3",
699 params![provider, model, text_hash],
700 |row| row.get(0),
701 )
702 .ok();
703
704 Ok(result.map(|json| deserialize_embedding(&json)))
705 }
706
707 pub fn cache_embedding(
709 &self,
710 provider: &str,
711 model: &str,
712 provider_key: &str,
713 text_hash: &str,
714 embedding: &[f32],
715 ) -> Result<()> {
716 let conn = self
717 .conn
718 .lock()
719 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
720
721 let embedding_json = serialize_embedding(embedding);
722 let dims = embedding.len() as i32;
723 let now = std::time::SystemTime::now()
724 .duration_since(std::time::UNIX_EPOCH)?
725 .as_secs() as i64;
726
727 conn.execute(
728 "INSERT OR REPLACE INTO embedding_cache (provider, model, provider_key, hash, embedding, dims, updated_at)
729 VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
730 params![provider, model, provider_key, text_hash, &embedding_json, dims, now],
731 )?;
732
733 Ok(())
734 }
735
/// Whether the sqlite-vec extension was loaded when this index was opened.
pub fn has_vec_extension(&self) -> bool {
    self.has_vec_extension
}
740
741 pub fn search_vector(
744 &self,
745 query_embedding: &[f32],
746 model: &str,
747 limit: usize,
748 ) -> Result<Vec<MemoryChunk>> {
749 let conn = self
750 .conn
751 .lock()
752 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
753
754 if self.has_vec_extension {
756 if let Ok(results) = self.search_vector_fast(&conn, query_embedding, model, limit) {
757 return Ok(results);
758 }
759 warn!("sqlite-vec search failed, falling back to in-memory scan");
760 }
761
762 self.search_vector_scan(&conn, query_embedding, model, limit)
764 }
765
766 fn search_vector_fast(
768 &self,
769 conn: &Connection,
770 query_embedding: &[f32],
771 model: &str,
772 limit: usize,
773 ) -> Result<Vec<MemoryChunk>> {
774 let query_blob = embedding_to_blob(query_embedding);
775
776 let mut stmt = conn.prepare(
778 r#"
779 SELECT c.path, c.start_line, c.end_line, c.text,
780 1.0 - vec_distance_cosine(v.embedding, ?1) AS score, c.updated_at
781 FROM chunks_vec v
782 JOIN chunks c ON c.id = v.id
783 WHERE c.model = ?2
784 ORDER BY score DESC
785 LIMIT ?3
786 "#,
787 )?;
788
789 let rows = stmt.query_map(params![&query_blob, model, limit as i64], |row| {
790 Ok(MemoryChunk {
791 file: row.get(0)?,
792 line_start: row.get(1)?,
793 line_end: row.get(2)?,
794 content: row.get(3)?,
795 score: row.get(4)?,
796 updated_at: row.get(5)?,
797 })
798 })?;
799
800 let mut results = Vec::new();
801 for row in rows {
802 results.push(row?);
803 }
804 Ok(results)
805 }
806
807 fn search_vector_scan(
809 &self,
810 conn: &Connection,
811 query_embedding: &[f32],
812 model: &str,
813 limit: usize,
814 ) -> Result<Vec<MemoryChunk>> {
815 let mut stmt = conn.prepare(
816 "SELECT id, path, start_line, end_line, text, embedding, updated_at
817 FROM chunks
818 WHERE embedding != '' AND embedding IS NOT NULL AND model = ?1",
819 )?;
820
821 let rows = stmt.query_map(params![model], |row| {
822 Ok((
823 row.get::<_, String>(0)?,
824 row.get::<_, String>(1)?,
825 row.get::<_, i32>(2)?,
826 row.get::<_, i32>(3)?,
827 row.get::<_, String>(4)?,
828 row.get::<_, String>(5)?,
829 row.get::<_, i64>(6)?,
830 ))
831 })?;
832
833 let mut scored: Vec<(f32, MemoryChunk)> = Vec::new();
835
836 for row in rows {
837 let (_, path, start_line, end_line, text, embedding_json, updated_at) = row?;
838 let embedding = deserialize_embedding(&embedding_json);
839
840 if embedding.len() == query_embedding.len() {
841 let similarity = cosine_similarity(query_embedding, &embedding);
842 scored.push((
843 similarity,
844 MemoryChunk {
845 file: path,
846 line_start: start_line,
847 line_end: end_line,
848 content: text,
849 score: similarity as f64,
850 updated_at,
851 },
852 ));
853 }
854 }
855
856 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
858
859 Ok(scored
861 .into_iter()
862 .take(limit)
863 .map(|(_, chunk)| chunk)
864 .collect())
865 }
866
867 pub fn search_hybrid(
869 &self,
870 query: &str,
871 query_embedding: Option<&[f32]>,
872 model: &str,
873 limit: usize,
874 text_weight: f32,
875 vector_weight: f32,
876 ) -> Result<Vec<MemoryChunk>> {
877 let fts_results = self.search(query, limit * 2)?;
879
880 let vector_results = if let Some(embedding) = query_embedding {
882 self.search_vector(embedding, model, limit * 2)?
883 } else {
884 Vec::new()
885 };
886
887 let mut merged: std::collections::HashMap<String, (f32, MemoryChunk)> =
889 std::collections::HashMap::new();
890
891 for (rank, result) in fts_results.into_iter().enumerate() {
894 let key = format!("{}:{}:{}", result.file, result.line_start, result.line_end);
895 let rank_score = 1.0 / (1.0 + rank as f32); let weighted_score = rank_score * text_weight;
897 merged.insert(key, (weighted_score, result));
898 }
899
900 for (rank, result) in vector_results.into_iter().enumerate() {
902 let key = format!("{}:{}:{}", result.file, result.line_start, result.line_end);
903 let rank_score = 1.0 / (1.0 + rank as f32);
904 let weighted_score = rank_score * vector_weight;
905
906 if let Some((existing_score, existing_chunk)) = merged.get_mut(&key) {
907 *existing_score += weighted_score;
908 existing_chunk.score = *existing_score as f64;
909 } else {
910 let mut chunk = result;
911 chunk.score = weighted_score as f64;
912 merged.insert(key, (weighted_score, chunk));
913 }
914 }
915
916 let mut results: Vec<_> = merged.into_values().collect();
918 results.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
919
920 Ok(results
921 .into_iter()
922 .take(limit)
923 .map(|(_, chunk)| chunk)
924 .collect())
925 }
926
927 pub fn embedded_chunk_count(&self, model: &str) -> Result<usize> {
929 let conn = self
930 .conn
931 .lock()
932 .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
933
934 let count: i64 = conn.query_row(
935 "SELECT COUNT(*) FROM chunks WHERE embedding != '' AND embedding IS NOT NULL AND model = ?1",
936 params![model],
937 |row| row.get(0),
938 )?;
939
940 Ok(count as usize)
941 }
942}
943
944fn hash_content(content: &str) -> String {
945 let mut hasher = Sha256::new();
946 hasher.update(content.as_bytes());
947 format!("{:x}", hasher.finalize())
948}
949
/// Serializes `embedding` as a flat byte buffer: each `f32` becomes its four
/// little-endian bytes, in order.
fn embedding_to_blob(embedding: &[f32]) -> Vec<u8> {
    embedding
        .iter()
        .flat_map(|component| component.to_le_bytes())
        .collect()
}
958
/// Turns free-form user input into an FTS query.
///
/// The input is split on every character that is neither alphanumeric nor
/// `_`; each non-empty piece is wrapped in double quotes and the pieces are
/// joined with ` AND `. Returns `None` when no piece remains.
fn build_fts_query(raw: &str) -> Option<String> {
    let is_separator = |c: char| !(c.is_alphanumeric() || c == '_');

    let mut terms: Vec<String> = Vec::new();
    for piece in raw.split(is_separator) {
        let piece = piece.trim();
        if piece.is_empty() {
            continue;
        }
        // Quoting makes the piece a literal term instead of FTS syntax.
        terms.push(format!("\"{}\"", piece.replace('"', "")));
    }

    if terms.is_empty() {
        None
    } else {
        Some(terms.join(" AND "))
    }
}
980
/// One chunk produced by `chunk_text`.
#[derive(Debug, Clone, PartialEq, Eq)]
struct ChunkInfo {
    /// 1-based number of the first line in the chunk.
    line_start: i32,
    /// 1-based number of the last line in the chunk (inclusive).
    line_end: i32,
    /// The chunk's lines joined with `\n`.
    content: String,
}

/// Splits `text` into line-aligned chunks of roughly `target_tokens` tokens,
/// with about `overlap_tokens` tokens of trailing lines repeated at the start
/// of the next chunk. A token is approximated as 4 bytes.
///
/// A chunk is closed once its size (line lengths plus one per line) reaches
/// the target, or at the last line. The overlap is the shortest run of
/// trailing lines whose size reaches the overlap budget.
///
/// Fixes over the previous version:
/// * `overlap_tokens == 0` now means no overlap (the last line used to be
///   carried over regardless);
/// * the overlap never covers the whole chunk, so each chunk starts strictly
///   after the previous one. Previously a long line could be re-emitted in
///   several ever-growing chunks.
///
/// Returns an empty vector for text without lines.
fn chunk_text(text: &str, target_tokens: usize, overlap_tokens: usize) -> Vec<ChunkInfo> {
    let lines: Vec<&str> = text.lines().collect();
    let mut chunks = Vec::new();

    if lines.is_empty() {
        return chunks;
    }

    let target_chars = target_tokens.saturating_mul(4);
    let overlap_chars = overlap_tokens.saturating_mul(4);
    let last_index = lines.len() - 1;

    // 0-based index (into `lines`) of the first line held in `chunk_lines`.
    let mut start_line = 0usize;
    let mut current_chars = 0usize;
    let mut chunk_lines: Vec<&str> = Vec::new();

    for (i, line) in lines.iter().enumerate() {
        chunk_lines.push(*line);
        current_chars += line.len() + 1;

        if current_chars < target_chars && i != last_index {
            continue;
        }

        chunks.push(ChunkInfo {
            line_start: (start_line + 1) as i32,
            line_end: (i + 1) as i32,
            content: chunk_lines.join("\n"),
        });

        // Index of the first line to carry over; `len` means "carry nothing".
        let mut overlap_start = chunk_lines.len();
        if overlap_chars > 0 {
            let mut overlap_len = 0usize;
            for (j, carried) in chunk_lines.iter().enumerate().rev() {
                overlap_len += carried.len() + 1;
                if overlap_len >= overlap_chars {
                    // Always drop at least the first line to guarantee progress.
                    overlap_start = j.max(1);
                    break;
                }
            }
        }

        start_line += overlap_start;
        chunk_lines.drain(..overlap_start);
        current_chars = chunk_lines.iter().map(|l| l.len() + 1).sum();
    }

    chunks
}
1042
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Smoke test: chunking a short text yields at least one chunk, and the
    /// first chunk starts at line 1.
    #[test]
    fn test_chunk_text() {
        let text = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5";
        let chunks = chunk_text(text, 10, 2);
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].line_start, 1);
    }

    /// End-to-end check: index one markdown file in a temporary workspace,
    /// then verify that chunks were stored and that a full-text query finds them.
    #[test]
    fn test_memory_index() -> Result<()> {
        let temp_dir = TempDir::new()?;
        let workspace = temp_dir.path();

        // The indexed file lives inside the workspace, next to the database.
        let test_file = workspace.join("test.md");
        fs::write(
            &test_file,
            "# Test\n\nThis is a test document.\n\nWith multiple lines.",
        )?;

        let index = MemoryIndex::new(workspace)?;
        index.index_file(&test_file, false)?;

        assert!(index.chunk_count()? > 0);

        let results = index.search("test document", 10)?;
        assert!(!results.is_empty());

        Ok(())
    }
}
1079}