Skip to main content

localgpt_core/memory/
index.rs

1use anyhow::{Result, anyhow};
2use rusqlite::{Connection, params};
3use sha2::{Digest, Sha256};
4use std::fs;
5use std::path::{Path, PathBuf};
6use std::sync::{Arc, Mutex};
7use std::time::Duration;
8use tracing::{debug, info, warn};
9use uuid::Uuid;
10
11use super::embeddings::{cosine_similarity, deserialize_embedding, serialize_embedding};
12use super::search::MemoryChunk;
13
14#[derive(Clone)]
15pub struct MemoryIndex {
16    conn: Arc<Mutex<Connection>>,
17    workspace: PathBuf,
18    db_path: PathBuf,
19    /// Whether sqlite-vec extension is loaded for fast vector search
20    has_vec_extension: bool,
21    /// Token count per chunk (default: 400)
22    chunk_size: usize,
23    /// Token overlap between chunks (default: 80)
24    chunk_overlap: usize,
25}
26
/// Counters describing a reindex run.
///
/// `Default` yields an all-zero value; `Clone`/`PartialEq`/`Eq` make it easy to
/// snapshot and compare stats.
// NOTE(review): the code that fills these counters is outside this view, so the
// field descriptions below follow the field names — confirm against the producer.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ReindexStats {
    /// Number of files examined.
    pub files_processed: usize,
    /// Number of files whose index entries were rewritten.
    pub files_updated: usize,
    /// Number of chunks written to the index.
    pub chunks_indexed: usize,
    /// Time taken by the run.
    pub duration: Duration,
}
34
35impl MemoryIndex {
36    /// Create a new memory index with database at the specified path
37    pub fn new_with_db_path(workspace: &Path, db_path: &Path) -> Result<Self> {
38        // Ensure parent directory exists
39        if let Some(parent) = db_path.parent() {
40            fs::create_dir_all(parent)?;
41        }
42
43        let conn = Connection::open(db_path)?;
44
45        // Check if we need to migrate from old schema
46        let needs_migration = Self::needs_schema_migration(&conn)?;
47        if needs_migration {
48            info!("Migrating database schema to OpenClaw-compatible format");
49            Self::migrate_to_openclaw_schema(&conn)?;
50        }
51
52        // Initialize OpenClaw-compatible schema
53        conn.execute_batch(
54            r#"
55            -- Metadata key/value store
56            CREATE TABLE IF NOT EXISTS meta (
57                key TEXT PRIMARY KEY,
58                value TEXT NOT NULL
59            );
60
61            -- File tracking (OpenClaw-compatible)
62            CREATE TABLE IF NOT EXISTS files (
63                path TEXT PRIMARY KEY,
64                source TEXT NOT NULL DEFAULT 'memory',
65                hash TEXT NOT NULL,
66                mtime INTEGER NOT NULL,
67                size INTEGER NOT NULL
68            );
69
70            -- Chunked content (OpenClaw-compatible)
71            CREATE TABLE IF NOT EXISTS chunks (
72                id TEXT PRIMARY KEY,
73                path TEXT NOT NULL,
74                source TEXT NOT NULL DEFAULT 'memory',
75                start_line INTEGER NOT NULL,
76                end_line INTEGER NOT NULL,
77                hash TEXT NOT NULL,
78                model TEXT NOT NULL DEFAULT '',
79                text TEXT NOT NULL,
80                embedding TEXT NOT NULL DEFAULT '',
81                updated_at INTEGER NOT NULL
82            );
83
84            -- Embedding cache (OpenClaw-compatible)
85            CREATE TABLE IF NOT EXISTS embedding_cache (
86                provider TEXT NOT NULL,
87                model TEXT NOT NULL,
88                provider_key TEXT NOT NULL,
89                hash TEXT NOT NULL,
90                embedding TEXT NOT NULL,
91                dims INTEGER,
92                updated_at INTEGER NOT NULL,
93                PRIMARY KEY (provider, model, provider_key, hash)
94            );
95
96            -- Indexes
97            CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path);
98            CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source);
99            CREATE INDEX IF NOT EXISTS idx_embedding_cache_updated_at ON embedding_cache(updated_at);
100            "#,
101        )?;
102
103        // Create FTS5 table (OpenClaw-compatible with UNINDEXED columns)
104        Self::ensure_fts_table(&conn)?;
105
106        // Ensure source column exists on older tables
107        Self::ensure_column(&conn, "files", "source", "TEXT NOT NULL DEFAULT 'memory'")?;
108        Self::ensure_column(&conn, "chunks", "source", "TEXT NOT NULL DEFAULT 'memory'")?;
109
110        // Try to load sqlite-vec extension for fast vector search
111        let has_vec_extension = Self::try_load_sqlite_vec(&conn);
112        if has_vec_extension {
113            debug!("sqlite-vec extension loaded successfully");
114            Self::ensure_vec_table(&conn)?;
115        } else {
116            debug!("sqlite-vec extension not available, using in-memory vector search");
117        }
118
119        Ok(Self {
120            conn: Arc::new(Mutex::new(conn)),
121            workspace: workspace.to_path_buf(),
122            db_path: db_path.to_path_buf(),
123            has_vec_extension,
124            chunk_size: 400,
125            chunk_overlap: 80,
126        })
127    }
128
129    /// Set chunk size and overlap (builder pattern)
130    pub fn with_chunk_config(mut self, chunk_size: usize, chunk_overlap: usize) -> Self {
131        self.chunk_size = chunk_size;
132        self.chunk_overlap = chunk_overlap;
133        self
134    }
135
136    /// Try to load sqlite-vec extension
137    #[cfg(feature = "sqlite-vec")]
138    #[allow(unsafe_code)]
139    fn try_load_sqlite_vec(conn: &Connection) -> bool {
140        // sqlite-vec provides the extension as a loadable module
141        // Try to load it - this requires the extension to be installed on the system
142
143        // SAFETY: load_extension is unsafe because it loads arbitrary native code
144        // We only load from known, trusted paths (sqlite-vec)
145        if unsafe { conn.load_extension_enable() }.is_err() {
146            return false;
147        }
148
149        // Try loading from common locations
150        let ext_paths = [
151            "vec0", // If in LD_LIBRARY_PATH
152            "./vec0",
153            "/usr/local/lib/vec0",
154            "/usr/lib/vec0",
155        ];
156
157        for path in ext_paths {
158            // SAFETY: Loading sqlite-vec extension from trusted path
159            if unsafe { conn.load_extension(path, None::<&str>) }.is_ok() {
160                let _ = conn.load_extension_disable();
161                return true;
162            }
163        }
164
165        let _ = conn.load_extension_disable();
166        false
167    }
168
169    #[cfg(not(feature = "sqlite-vec"))]
170    fn try_load_sqlite_vec(_conn: &Connection) -> bool {
171        false
172    }
173
174    /// Create virtual table for vector search (requires sqlite-vec)
175    fn ensure_vec_table(conn: &Connection) -> Result<()> {
176        // Create chunks_vec virtual table if sqlite-vec is available
177        let result = conn.execute(
178            "CREATE VIRTUAL TABLE IF NOT EXISTS chunks_vec USING vec0(id TEXT PRIMARY KEY, embedding float[1536])",
179            [],
180        );
181        match result {
182            Ok(_) => debug!("chunks_vec table created/verified"),
183            Err(e) => debug!("chunks_vec table creation skipped: {}", e),
184        }
185        Ok(())
186    }
187
188    /// Create a new memory index with database in workspace (legacy path)
189    pub fn new(workspace: &Path) -> Result<Self> {
190        let db_path = workspace.join("memory.sqlite");
191        Self::new_with_db_path(workspace, &db_path)
192    }
193
194    /// Index a file, returning true if it was updated
195    pub fn index_file(&self, path: &Path, force: bool) -> Result<bool> {
196        let content = fs::read_to_string(path)?;
197        let file_hash = hash_content(&content);
198        let metadata = fs::metadata(path)?;
199        let mtime = metadata
200            .modified()?
201            .duration_since(std::time::UNIX_EPOCH)?
202            .as_secs() as i64;
203        let size = metadata.len() as i64;
204
205        let relative_path = path
206            .strip_prefix(&self.workspace)
207            .unwrap_or(path)
208            .to_string_lossy()
209            .to_string();
210
211        let conn = self
212            .conn
213            .lock()
214            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
215
216        // Check if file has changed
217        if !force {
218            let existing: Option<String> = conn
219                .query_row(
220                    "SELECT hash FROM files WHERE path = ?1",
221                    params![&relative_path],
222                    |row| row.get(0),
223                )
224                .ok();
225
226            if existing.as_deref() == Some(&file_hash) {
227                debug!("File unchanged, skipping: {}", relative_path);
228                return Ok(false);
229            }
230        }
231
232        debug!("Indexing file: {}", relative_path);
233
234        let now = std::time::SystemTime::now()
235            .duration_since(std::time::UNIX_EPOCH)?
236            .as_secs() as i64;
237
238        // Update file record (OpenClaw-compatible columns)
239        conn.execute(
240            "INSERT OR REPLACE INTO files (path, source, hash, mtime, size) VALUES (?1, 'memory', ?2, ?3, ?4)",
241            params![&relative_path, &file_hash, mtime, size],
242        )?;
243
244        // Delete existing chunks and their FTS entries
245        Self::delete_chunks_for_path(&conn, &relative_path)?;
246
247        // Create new chunks (OpenClaw-compatible)
248        let chunks = chunk_text(&content, self.chunk_size, self.chunk_overlap);
249
250        for chunk in chunks.iter() {
251            let chunk_id = Uuid::new_v4().to_string();
252            let chunk_hash = hash_content(&chunk.content);
253
254            conn.execute(
255                r#"INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
256                   VALUES (?1, ?2, 'memory', ?3, ?4, ?5, '', ?6, '', ?7)"#,
257                params![&chunk_id, &relative_path, chunk.line_start, chunk.line_end, &chunk_hash, &chunk.content, now],
258            )?;
259
260            // Insert into FTS
261            Self::insert_fts(
262                &conn,
263                &chunk_id,
264                &relative_path,
265                "memory",
266                "",
267                chunk.line_start,
268                chunk.line_end,
269                &chunk.content,
270            )?;
271        }
272
273        Ok(true)
274    }
275
276    /// Delete chunks for a path and their FTS entries
277    fn delete_chunks_for_path(conn: &Connection, path: &str) -> Result<()> {
278        // Delete from FTS first (get chunk IDs)
279        let mut stmt = conn.prepare("SELECT id FROM chunks WHERE path = ?1")?;
280        let chunk_ids: Vec<String> = stmt
281            .query_map(params![path], |row| row.get(0))?
282            .filter_map(|r| r.ok())
283            .collect();
284
285        for chunk_id in chunk_ids {
286            let _ = conn.execute("DELETE FROM chunks_fts WHERE id = ?1", params![&chunk_id]);
287        }
288
289        // Delete chunks
290        conn.execute("DELETE FROM chunks WHERE path = ?1", params![path])?;
291        Ok(())
292    }
293
294    /// Remove a file and its chunks from the index (for deleted files)
295    pub fn remove_file(&self, relative_path: &str) -> Result<()> {
296        let conn = self
297            .conn
298            .lock()
299            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
300
301        Self::delete_chunks_for_path(&conn, relative_path)?;
302        conn.execute("DELETE FROM files WHERE path = ?1", params![relative_path])?;
303
304        debug!("Removed deleted file from index: {}", relative_path);
305        Ok(())
306    }
307
308    /// Get all indexed file paths
309    pub fn indexed_files(&self) -> Result<Vec<String>> {
310        let conn = self
311            .conn
312            .lock()
313            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
314
315        let mut stmt = conn.prepare("SELECT path FROM files")?;
316        let rows = stmt.query_map([], |row| row.get(0))?;
317
318        let mut paths = Vec::new();
319        for row in rows {
320            paths.push(row?);
321        }
322        Ok(paths)
323    }
324
325    /// Insert into FTS table
326    #[allow(clippy::too_many_arguments)]
327    fn insert_fts(
328        conn: &Connection,
329        id: &str,
330        path: &str,
331        source: &str,
332        model: &str,
333        start_line: i32,
334        end_line: i32,
335        text: &str,
336    ) -> Result<()> {
337        let _ = conn.execute(
338            "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
339            params![text, id, path, source, model, start_line, end_line],
340        );
341        Ok(())
342    }
343
344    /// Search using FTS5
345    pub fn search(&self, query: &str, limit: usize) -> Result<Vec<MemoryChunk>> {
346        let fts_query = match build_fts_query(query) {
347            Some(q) => q,
348            None => return Ok(Vec::new()),
349        };
350
351        let conn = self
352            .conn
353            .lock()
354            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
355
356        // OpenClaw-compatible: use 'path', 'start_line', 'end_line', 'text' columns
357        // Join with chunks to get updated_at for temporal decay
358        let mut stmt = conn.prepare(
359            r#"
360            SELECT fts.path, fts.start_line, fts.end_line, fts.text, bm25(chunks_fts) as score, c.updated_at
361            FROM chunks_fts fts
362            JOIN chunks c ON c.id = fts.id
363            WHERE chunks_fts MATCH ?1
364            ORDER BY score
365            LIMIT ?2
366            "#,
367        )?;
368
369        let rows = stmt.query_map(params![&fts_query, limit as i64], |row| {
370            Ok(MemoryChunk {
371                file: row.get(0)?,
372                line_start: row.get(1)?,
373                line_end: row.get(2)?,
374                content: row.get(3)?,
375                score: row.get::<_, f64>(4)?.abs(), // BM25 returns negative scores
376                updated_at: row.get(5)?,
377            })
378        })?;
379
380        let mut results = Vec::new();
381        for row in rows {
382            results.push(row?);
383        }
384
385        Ok(results)
386    }
387
388    /// Get total chunk count
389    pub fn chunk_count(&self) -> Result<usize> {
390        let conn = self
391            .conn
392            .lock()
393            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
394        let count: i64 = conn.query_row("SELECT COUNT(*) FROM chunks", [], |row| row.get(0))?;
395        Ok(count as usize)
396    }
397
398    /// Get chunk count for a specific file
399    pub fn file_chunk_count(&self, path: &Path) -> Result<usize> {
400        let relative_path = path
401            .strip_prefix(&self.workspace)
402            .unwrap_or(path)
403            .to_string_lossy()
404            .to_string();
405
406        let conn = self
407            .conn
408            .lock()
409            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
410        let count: i64 = conn.query_row(
411            "SELECT COUNT(*) FROM chunks WHERE path = ?1",
412            params![&relative_path],
413            |row| row.get(0),
414        )?;
415        Ok(count as usize)
416    }
417
418    /// Get database size in bytes
419    pub fn size_bytes(&self) -> Result<u64> {
420        if self.db_path.exists() {
421            Ok(fs::metadata(&self.db_path)?.len())
422        } else {
423            Ok(0)
424        }
425    }
426
427    /// Get the database path
428    pub fn db_path(&self) -> &Path {
429        &self.db_path
430    }
431
432    /// Check if we need to migrate from old LocalGPT schema to OpenClaw schema
433    fn needs_schema_migration(conn: &Connection) -> Result<bool> {
434        // Check for old schema indicators:
435        // 1. chunks table has 'file_path' column instead of 'path'
436        // 2. chunks table has 'content' column instead of 'text'
437        // 3. chunks.id is INTEGER instead of TEXT
438        let result: rusqlite::Result<String> =
439            conn.query_row("PRAGMA table_info(chunks)", [], |row| row.get(1));
440
441        if result.is_err() {
442            // chunks table doesn't exist yet, no migration needed
443            return Ok(false);
444        }
445
446        // Check column names
447        let has_file_path: bool = conn.prepare("SELECT file_path FROM chunks LIMIT 0").is_ok();
448        let has_content: bool = conn.prepare("SELECT content FROM chunks LIMIT 0").is_ok();
449
450        // Old schema has file_path and content columns
451        Ok(has_file_path || has_content)
452    }
453
454    /// Migrate from old LocalGPT schema to OpenClaw-compatible schema
455    fn migrate_to_openclaw_schema(conn: &Connection) -> Result<()> {
456        // Start transaction
457        conn.execute("BEGIN TRANSACTION", [])?;
458
459        // 1. Rename old tables
460        let _ = conn.execute("ALTER TABLE chunks RENAME TO chunks_old", []);
461        let _ = conn.execute("ALTER TABLE files RENAME TO files_old", []);
462
463        // 2. Drop old FTS and triggers
464        let _ = conn.execute("DROP TABLE IF EXISTS chunks_fts", []);
465        let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_ai", []);
466        let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_ad", []);
467        let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_au", []);
468
469        // 3. Create new tables with OpenClaw schema
470        conn.execute(
471            r#"
472            CREATE TABLE IF NOT EXISTS files (
473                path TEXT PRIMARY KEY,
474                source TEXT NOT NULL DEFAULT 'memory',
475                hash TEXT NOT NULL,
476                mtime INTEGER NOT NULL,
477                size INTEGER NOT NULL
478            )
479            "#,
480            [],
481        )?;
482
483        conn.execute(
484            r#"
485            CREATE TABLE IF NOT EXISTS chunks (
486                id TEXT PRIMARY KEY,
487                path TEXT NOT NULL,
488                source TEXT NOT NULL DEFAULT 'memory',
489                start_line INTEGER NOT NULL,
490                end_line INTEGER NOT NULL,
491                hash TEXT NOT NULL,
492                model TEXT NOT NULL DEFAULT '',
493                text TEXT NOT NULL,
494                embedding TEXT NOT NULL DEFAULT '',
495                updated_at INTEGER NOT NULL
496            )
497            "#,
498            [],
499        )?;
500
501        // 4. Migrate data from old tables
502        let now = std::time::SystemTime::now()
503            .duration_since(std::time::UNIX_EPOCH)
504            .unwrap_or_default()
505            .as_secs() as i64;
506
507        // Migrate files
508        let _ = conn.execute(
509            r#"
510            INSERT INTO files (path, source, hash, mtime, size)
511            SELECT path, 'memory', hash, mtime, size FROM files_old
512            "#,
513            [],
514        );
515
516        // Migrate chunks - generate new TEXT UUIDs for each row
517        // Check if old schema has embedding columns
518        let has_embedding_cols = conn
519            .prepare("SELECT embedding FROM chunks_old LIMIT 0")
520            .is_ok();
521
522        // Read old data and insert with new UUIDs
523        if has_embedding_cols {
524            // Old schema has embedding columns
525            let mut stmt = conn.prepare(
526                "SELECT file_path, line_start, line_end, content, embedding, embedding_model FROM chunks_old",
527            )?;
528            let rows = stmt.query_map([], |row| {
529                Ok((
530                    row.get::<_, String>(0)?,
531                    row.get::<_, i32>(1)?,
532                    row.get::<_, i32>(2)?,
533                    row.get::<_, String>(3)?,
534                    row.get::<_, Option<String>>(4)?,
535                    row.get::<_, Option<String>>(5)?,
536                ))
537            })?;
538
539            for row in rows {
540                let (file_path, line_start, line_end, content, embedding, model) = row?;
541                let new_id = Uuid::new_v4().to_string();
542                let hash = hash_content(&content);
543                let model = model.unwrap_or_default();
544                let embedding = embedding.unwrap_or_default();
545
546                conn.execute(
547                    r#"
548                    INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
549                    VALUES (?1, ?2, 'memory', ?3, ?4, ?5, ?6, ?7, ?8, ?9)
550                    "#,
551                    params![&new_id, &file_path, line_start, line_end, &hash, &model, &content, &embedding, now],
552                )?;
553            }
554        } else {
555            // Old schema without embedding columns
556            let mut stmt =
557                conn.prepare("SELECT file_path, line_start, line_end, content FROM chunks_old")?;
558            let rows = stmt.query_map([], |row| {
559                Ok((
560                    row.get::<_, String>(0)?,
561                    row.get::<_, i32>(1)?,
562                    row.get::<_, i32>(2)?,
563                    row.get::<_, String>(3)?,
564                ))
565            })?;
566
567            for row in rows {
568                let (file_path, line_start, line_end, content) = row?;
569                let new_id = Uuid::new_v4().to_string();
570                let hash = hash_content(&content);
571
572                conn.execute(
573                    r#"
574                    INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
575                    VALUES (?1, ?2, 'memory', ?3, ?4, ?5, '', ?6, '', ?7)
576                    "#,
577                    params![&new_id, &file_path, line_start, line_end, &hash, &content, now],
578                )?;
579            }
580        }
581
582        // 5. Drop old tables
583        let _ = conn.execute("DROP TABLE IF EXISTS chunks_old", []);
584        let _ = conn.execute("DROP TABLE IF EXISTS files_old", []);
585
586        conn.execute("COMMIT", [])?;
587        info!("Schema migration completed successfully");
588        Ok(())
589    }
590
591    /// Create FTS5 table with OpenClaw-compatible structure
592    fn ensure_fts_table(conn: &Connection) -> Result<()> {
593        // OpenClaw uses UNINDEXED columns for metadata
594        let result = conn.execute(
595            r#"
596            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
597                text,
598                id UNINDEXED,
599                path UNINDEXED,
600                source UNINDEXED,
601                model UNINDEXED,
602                start_line UNINDEXED,
603                end_line UNINDEXED
604            )
605            "#,
606            [],
607        );
608
609        match result {
610            Ok(_) => debug!("FTS5 table created/verified"),
611            Err(e) => debug!("FTS5 table creation skipped: {}", e),
612        }
613
614        Ok(())
615    }
616
617    /// Ensure a column exists on a table (for migrations)
618    fn ensure_column(conn: &Connection, table: &str, column: &str, definition: &str) -> Result<()> {
619        let sql = format!("SELECT {} FROM {} LIMIT 0", column, table);
620        if conn.prepare(&sql).is_err() {
621            let alter = format!("ALTER TABLE {} ADD COLUMN {} {}", table, column, definition);
622            conn.execute(&alter, [])?;
623            debug!("Added column {} to table {}", column, table);
624        }
625        Ok(())
626    }
627
628    /// Get chunks that need embeddings (OpenClaw-compatible: id is TEXT, text column)
629    pub fn chunks_without_embeddings(&self, limit: usize) -> Result<Vec<(String, String)>> {
630        let conn = self
631            .conn
632            .lock()
633            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
634
635        let mut stmt = conn.prepare(
636            "SELECT id, text FROM chunks WHERE embedding = '' OR embedding IS NULL LIMIT ?1",
637        )?;
638
639        let rows = stmt.query_map(params![limit as i64], |row| {
640            Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
641        })?;
642
643        let mut results = Vec::new();
644        for row in rows {
645            results.push(row?);
646        }
647
648        Ok(results)
649    }
650
651    /// Store embedding for a chunk (OpenClaw-compatible: id is TEXT, model column)
652    pub fn store_embedding(&self, chunk_id: &str, embedding: &[f32], model: &str) -> Result<()> {
653        let conn = self
654            .conn
655            .lock()
656            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
657
658        let embedding_json = serialize_embedding(embedding);
659        let now = std::time::SystemTime::now()
660            .duration_since(std::time::UNIX_EPOCH)?
661            .as_secs() as i64;
662
663        conn.execute(
664            "UPDATE chunks SET embedding = ?1, model = ?2, updated_at = ?3 WHERE id = ?4",
665            params![&embedding_json, model, now, chunk_id],
666        )?;
667
668        // Also store in vec table if sqlite-vec is available
669        if self.has_vec_extension {
670            let embedding_blob = embedding_to_blob(embedding);
671            let _ = conn.execute(
672                "INSERT OR REPLACE INTO chunks_vec (id, embedding) VALUES (?1, ?2)",
673                params![chunk_id, &embedding_blob],
674            );
675        }
676
677        Ok(())
678    }
679
680    // ========================================================================
681    // Embedding Cache (OpenClaw-compatible)
682    // ========================================================================
683
684    /// Get cached embedding by content hash
685    pub fn get_cached_embedding(
686        &self,
687        provider: &str,
688        model: &str,
689        text_hash: &str,
690    ) -> Result<Option<Vec<f32>>> {
691        let conn = self
692            .conn
693            .lock()
694            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
695
696        let result: Option<String> = conn
697            .query_row(
698                "SELECT embedding FROM embedding_cache WHERE provider = ?1 AND model = ?2 AND hash = ?3",
699                params![provider, model, text_hash],
700                |row| row.get(0),
701            )
702            .ok();
703
704        Ok(result.map(|json| deserialize_embedding(&json)))
705    }
706
707    /// Store embedding in cache
708    pub fn cache_embedding(
709        &self,
710        provider: &str,
711        model: &str,
712        provider_key: &str,
713        text_hash: &str,
714        embedding: &[f32],
715    ) -> Result<()> {
716        let conn = self
717            .conn
718            .lock()
719            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
720
721        let embedding_json = serialize_embedding(embedding);
722        let dims = embedding.len() as i32;
723        let now = std::time::SystemTime::now()
724            .duration_since(std::time::UNIX_EPOCH)?
725            .as_secs() as i64;
726
727        conn.execute(
728            "INSERT OR REPLACE INTO embedding_cache (provider, model, provider_key, hash, embedding, dims, updated_at)
729             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
730            params![provider, model, provider_key, text_hash, &embedding_json, dims, now],
731        )?;
732
733        Ok(())
734    }
735
736    /// Check if sqlite-vec is available
737    pub fn has_vec_extension(&self) -> bool {
738        self.has_vec_extension
739    }
740
741    /// Vector search using embeddings (OpenClaw-compatible columns)
742    /// Uses sqlite-vec if available for fast search, otherwise falls back to in-memory scan
743    pub fn search_vector(
744        &self,
745        query_embedding: &[f32],
746        model: &str,
747        limit: usize,
748    ) -> Result<Vec<MemoryChunk>> {
749        let conn = self
750            .conn
751            .lock()
752            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
753
754        // Try sqlite-vec fast path if available
755        if self.has_vec_extension {
756            if let Ok(results) = self.search_vector_fast(&conn, query_embedding, model, limit) {
757                return Ok(results);
758            }
759            warn!("sqlite-vec search failed, falling back to in-memory scan");
760        }
761
762        // Fallback: in-memory scan (slower but always works)
763        self.search_vector_scan(&conn, query_embedding, model, limit)
764    }
765
766    /// Fast vector search using sqlite-vec extension
767    fn search_vector_fast(
768        &self,
769        conn: &Connection,
770        query_embedding: &[f32],
771        model: &str,
772        limit: usize,
773    ) -> Result<Vec<MemoryChunk>> {
774        let query_blob = embedding_to_blob(query_embedding);
775
776        // sqlite-vec uses vec_distance_cosine for cosine distance (1 - similarity)
777        let mut stmt = conn.prepare(
778            r#"
779            SELECT c.path, c.start_line, c.end_line, c.text,
780                   1.0 - vec_distance_cosine(v.embedding, ?1) AS score, c.updated_at
781            FROM chunks_vec v
782            JOIN chunks c ON c.id = v.id
783            WHERE c.model = ?2
784            ORDER BY score DESC
785            LIMIT ?3
786            "#,
787        )?;
788
789        let rows = stmt.query_map(params![&query_blob, model, limit as i64], |row| {
790            Ok(MemoryChunk {
791                file: row.get(0)?,
792                line_start: row.get(1)?,
793                line_end: row.get(2)?,
794                content: row.get(3)?,
795                score: row.get(4)?,
796                updated_at: row.get(5)?,
797            })
798        })?;
799
800        let mut results = Vec::new();
801        for row in rows {
802            results.push(row?);
803        }
804        Ok(results)
805    }
806
807    /// In-memory vector scan (fallback when sqlite-vec not available)
808    fn search_vector_scan(
809        &self,
810        conn: &Connection,
811        query_embedding: &[f32],
812        model: &str,
813        limit: usize,
814    ) -> Result<Vec<MemoryChunk>> {
815        let mut stmt = conn.prepare(
816            "SELECT id, path, start_line, end_line, text, embedding, updated_at
817             FROM chunks
818             WHERE embedding != '' AND embedding IS NOT NULL AND model = ?1",
819        )?;
820
821        let rows = stmt.query_map(params![model], |row| {
822            Ok((
823                row.get::<_, String>(0)?,
824                row.get::<_, String>(1)?,
825                row.get::<_, i32>(2)?,
826                row.get::<_, i32>(3)?,
827                row.get::<_, String>(4)?,
828                row.get::<_, String>(5)?,
829                row.get::<_, i64>(6)?,
830            ))
831        })?;
832
833        // Compute similarities and sort
834        let mut scored: Vec<(f32, MemoryChunk)> = Vec::new();
835
836        for row in rows {
837            let (_, path, start_line, end_line, text, embedding_json, updated_at) = row?;
838            let embedding = deserialize_embedding(&embedding_json);
839
840            if embedding.len() == query_embedding.len() {
841                let similarity = cosine_similarity(query_embedding, &embedding);
842                scored.push((
843                    similarity,
844                    MemoryChunk {
845                        file: path,
846                        line_start: start_line,
847                        line_end: end_line,
848                        content: text,
849                        score: similarity as f64,
850                        updated_at,
851                    },
852                ));
853            }
854        }
855
856        // Sort by similarity (descending)
857        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
858
859        // Take top results
860        Ok(scored
861            .into_iter()
862            .take(limit)
863            .map(|(_, chunk)| chunk)
864            .collect())
865    }
866
867    /// Hybrid search: combine FTS and vector results
868    pub fn search_hybrid(
869        &self,
870        query: &str,
871        query_embedding: Option<&[f32]>,
872        model: &str,
873        limit: usize,
874        text_weight: f32,
875        vector_weight: f32,
876    ) -> Result<Vec<MemoryChunk>> {
877        // Get FTS results
878        let fts_results = self.search(query, limit * 2)?;
879
880        // Get vector results if embedding provided
881        let vector_results = if let Some(embedding) = query_embedding {
882            self.search_vector(embedding, model, limit * 2)?
883        } else {
884            Vec::new()
885        };
886
887        // Merge results using rank-based scoring (OpenClaw-compatible)
888        let mut merged: std::collections::HashMap<String, (f32, MemoryChunk)> =
889            std::collections::HashMap::new();
890
891        // Add FTS results using rank-based scoring (OpenClaw-compatible)
892        // BM25 results are already ordered by relevance (best first)
893        for (rank, result) in fts_results.into_iter().enumerate() {
894            let key = format!("{}:{}:{}", result.file, result.line_start, result.line_end);
895            let rank_score = 1.0 / (1.0 + rank as f32); // rank 0 → 1.0, rank 1 → 0.5, rank 9 → 0.1
896            let weighted_score = rank_score * text_weight;
897            merged.insert(key, (weighted_score, result));
898        }
899
900        // Add/merge vector results using rank-based scoring
901        for (rank, result) in vector_results.into_iter().enumerate() {
902            let key = format!("{}:{}:{}", result.file, result.line_start, result.line_end);
903            let rank_score = 1.0 / (1.0 + rank as f32);
904            let weighted_score = rank_score * vector_weight;
905
906            if let Some((existing_score, existing_chunk)) = merged.get_mut(&key) {
907                *existing_score += weighted_score;
908                existing_chunk.score = *existing_score as f64;
909            } else {
910                let mut chunk = result;
911                chunk.score = weighted_score as f64;
912                merged.insert(key, (weighted_score, chunk));
913            }
914        }
915
916        // Sort by combined score and take top results
917        let mut results: Vec<_> = merged.into_values().collect();
918        results.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
919
920        Ok(results
921            .into_iter()
922            .take(limit)
923            .map(|(_, chunk)| chunk)
924            .collect())
925    }
926
927    /// Count chunks with embeddings (OpenClaw-compatible: model column)
928    pub fn embedded_chunk_count(&self, model: &str) -> Result<usize> {
929        let conn = self
930            .conn
931            .lock()
932            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
933
934        let count: i64 = conn.query_row(
935            "SELECT COUNT(*) FROM chunks WHERE embedding != '' AND embedding IS NOT NULL AND model = ?1",
936            params![model],
937            |row| row.get(0),
938        )?;
939
940        Ok(count as usize)
941    }
942}
943
944fn hash_content(content: &str) -> String {
945    let mut hasher = Sha256::new();
946    hasher.update(content.as_bytes());
947    format!("{:x}", hasher.finalize())
948}
949
/// Serialize an f32 embedding into the binary blob form used for sqlite-vec:
/// each value is written as its 4 little-endian bytes, in input order.
fn embedding_to_blob(embedding: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(embedding.len() * 4);
    bytes.extend(embedding.iter().flat_map(|value| value.to_le_bytes()));
    bytes
}
958
/// Build an FTS5 query from raw input (OpenClaw-compatible).
///
/// The input is split on every character that is neither alphanumeric nor `_`; each
/// resulting non-empty token is wrapped in double quotes and the tokens are joined with
/// ` AND `, so all terms must appear (in any order). Returns `None` when the input
/// contains no tokens.
fn build_fts_query(raw: &str) -> Option<String> {
    let is_separator = |c: char| !(c.is_alphanumeric() || c == '_');

    // Tokens consist solely of alphanumerics and underscores, so they can be quoted as-is:
    // they cannot contain whitespace or a double quote.
    let quoted_terms: Vec<String> = raw
        .split(is_separator)
        .filter(|term| !term.is_empty())
        .map(|term| format!("\"{}\"", term))
        .collect();

    if quoted_terms.is_empty() {
        None
    } else {
        Some(quoted_terms.join(" AND "))
    }
}
980
/// One chunk of a text produced by `chunk_text`.
struct ChunkInfo {
    /// 1-based number of the first line in the chunk.
    line_start: i32,
    /// 1-based number of the last line in the chunk (inclusive).
    line_end: i32,
    /// The chunk's lines joined with `\n` (no trailing newline).
    content: String,
}

/// Split `text` into line-aligned chunks of roughly `target_tokens` tokens, where
/// consecutive chunks share roughly `overlap_tokens` tokens of trailing context.
///
/// Token counts are estimated as 4 characters per token. Lines are never split: a chunk is
/// closed once its accumulated length (line bytes plus one per newline) reaches the target,
/// or at the last line. The next chunk then starts with the shortest run of trailing whole
/// lines of the closed chunk whose length reaches the overlap size.
///
/// Guarantees:
/// - every chunk starts strictly after the start of the previous chunk, so a single
///   over-long line is never carried into the following chunks;
/// - `overlap_tokens == 0` produces no overlap;
/// - a chunk that is shorter than the overlap size as a whole is not carried over at all.
///
/// Returns an empty vector when `text` has no lines.
fn chunk_text(text: &str, target_tokens: usize, overlap_tokens: usize) -> Vec<ChunkInfo> {
    let lines: Vec<&str> = text.lines().collect();
    let mut chunks = Vec::new();

    if lines.is_empty() {
        return chunks;
    }

    // Rough estimate: 4 chars per token
    let target_chars = target_tokens.saturating_mul(4);
    let overlap_chars = overlap_tokens.saturating_mul(4);
    let last_index = lines.len() - 1;

    // Invariant: `chunk_lines` always equals `lines[start_line..=i]`.
    let mut start_line = 0;
    let mut current_chars = 0;
    let mut chunk_lines: Vec<&str> = Vec::new();

    for (i, line) in lines.iter().enumerate() {
        chunk_lines.push(*line);
        current_chars += line.len() + 1; // +1 for newline

        if current_chars < target_chars && i != last_index {
            continue;
        }

        chunks.push(ChunkInfo {
            line_start: (start_line + 1) as i32,
            line_end: (i + 1) as i32,
            content: chunk_lines.join("\n"),
        });

        // Keep only the overlap lines as the seed of the next chunk. When nothing is
        // carried, `overlap_start == chunk_lines.len()` and `start_line` becomes `i + 1`.
        let overlap_start = overlap_start_index(&chunk_lines, overlap_chars);
        start_line += overlap_start;
        chunk_lines.drain(..overlap_start);
        current_chars = chunk_lines.iter().map(|l| l.len() + 1).sum();
    }

    chunks
}

/// Index of the first line of `chunk_lines` to carry into the next chunk, or
/// `chunk_lines.len()` when nothing should be carried.
fn overlap_start_index(chunk_lines: &[&str], overlap_chars: usize) -> usize {
    if overlap_chars == 0 {
        return chunk_lines.len();
    }

    let mut overlap_len = 0;
    for (j, line) in chunk_lines.iter().enumerate().rev() {
        overlap_len += line.len() + 1;
        if overlap_len >= overlap_chars {
            // Never carry the chunk's first line: carrying the whole chunk would make
            // every following line emit a growing superset of the chunk just produced.
            return j.max(1);
        }
    }

    chunk_lines.len()
}
1042
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// A short multi-line text yields at least one chunk, and the first chunk starts at line 1.
    #[test]
    fn test_chunk_text() {
        // Small chunks for testing
        let (target_tokens, overlap_tokens) = (10, 2);
        let sample = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5";

        let produced = chunk_text(sample, target_tokens, overlap_tokens);

        assert!(!produced.is_empty());
        assert_eq!(produced[0].line_start, 1);
    }

    /// Indexing a markdown file in a temporary workspace stores chunks that the text
    /// search can find.
    #[test]
    fn test_memory_index() -> Result<()> {
        let scratch = TempDir::new()?;
        let root = scratch.path();

        // Create a test file
        let doc_path = root.join("test.md");
        let doc_body = "# Test\n\nThis is a test document.\n\nWith multiple lines.";
        fs::write(&doc_path, doc_body)?;

        let index = MemoryIndex::new(root)?;
        index.index_file(&doc_path, false)?;

        let stored = index.chunk_count()?;
        assert!(stored > 0);

        let hits = index.search("test document", 10)?;
        assert!(!hits.is_empty());

        Ok(())
    }
}