Skip to main content

localgpt_core/memory/
index.rs

1use anyhow::{Result, anyhow};
2use rusqlite::{Connection, params};
3use sha2::{Digest, Sha256};
4use std::fs;
5use std::path::{Path, PathBuf};
6use std::sync::{Arc, Mutex};
7use std::time::Duration;
8use tracing::{debug, info, warn};
9use uuid::Uuid;
10
11use super::embeddings::{cosine_similarity, deserialize_embedding, serialize_embedding};
12use super::search::MemoryChunk;
13
14#[derive(Clone)]
15pub struct MemoryIndex {
16    conn: Arc<Mutex<Connection>>,
17    workspace: PathBuf,
18    db_path: PathBuf,
19    /// Whether sqlite-vec extension is loaded for fast vector search
20    has_vec_extension: bool,
21    /// Token count per chunk (default: 400)
22    chunk_size: usize,
23    /// Token overlap between chunks (default: 80)
24    chunk_overlap: usize,
25}
26
/// Counters and timing reported for a reindexing pass.
///
/// Derives `Clone`, `Default`, `PartialEq` and `Eq` in addition to `Debug`
/// so values can be copied, zero-initialised and compared (e.g. in tests).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ReindexStats {
    /// Number of files processed
    pub files_processed: usize,
    /// Number of files updated
    pub files_updated: usize,
    /// Number of chunks indexed
    pub chunks_indexed: usize,
    /// Time taken
    pub duration: Duration,
}
34
35impl MemoryIndex {
36    /// Create a new memory index with database at the specified path
37    pub fn new_with_db_path(workspace: &Path, db_path: &Path) -> Result<Self> {
38        // Ensure parent directory exists
39        if let Some(parent) = db_path.parent() {
40            fs::create_dir_all(parent)?;
41        }
42
43        let conn = Connection::open(db_path)?;
44
45        // Check if we need to migrate from old schema
46        let needs_migration = Self::needs_schema_migration(&conn)?;
47        if needs_migration {
48            info!("Migrating database schema to OpenClaw-compatible format");
49            Self::migrate_to_openclaw_schema(&conn)?;
50        }
51
52        // Initialize OpenClaw-compatible schema
53        conn.execute_batch(
54            r#"
55            -- Metadata key/value store
56            CREATE TABLE IF NOT EXISTS meta (
57                key TEXT PRIMARY KEY,
58                value TEXT NOT NULL
59            );
60
61            -- File tracking (OpenClaw-compatible)
62            CREATE TABLE IF NOT EXISTS files (
63                path TEXT PRIMARY KEY,
64                source TEXT NOT NULL DEFAULT 'memory',
65                hash TEXT NOT NULL,
66                mtime INTEGER NOT NULL,
67                size INTEGER NOT NULL
68            );
69
70            -- Chunked content (OpenClaw-compatible)
71            CREATE TABLE IF NOT EXISTS chunks (
72                id TEXT PRIMARY KEY,
73                path TEXT NOT NULL,
74                source TEXT NOT NULL DEFAULT 'memory',
75                start_line INTEGER NOT NULL,
76                end_line INTEGER NOT NULL,
77                hash TEXT NOT NULL,
78                model TEXT NOT NULL DEFAULT '',
79                text TEXT NOT NULL,
80                embedding TEXT NOT NULL DEFAULT '',
81                updated_at INTEGER NOT NULL
82            );
83
84            -- Embedding cache (OpenClaw-compatible)
85            CREATE TABLE IF NOT EXISTS embedding_cache (
86                provider TEXT NOT NULL,
87                model TEXT NOT NULL,
88                provider_key TEXT NOT NULL,
89                hash TEXT NOT NULL,
90                embedding TEXT NOT NULL,
91                dims INTEGER,
92                updated_at INTEGER NOT NULL,
93                PRIMARY KEY (provider, model, provider_key, hash)
94            );
95
96            -- Indexes
97            CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path);
98            CREATE INDEX IF NOT EXISTS idx_chunks_source ON chunks(source);
99            CREATE INDEX IF NOT EXISTS idx_embedding_cache_updated_at ON embedding_cache(updated_at);
100            "#,
101        )?;
102
103        // Create FTS5 table (OpenClaw-compatible with UNINDEXED columns)
104        Self::ensure_fts_table(&conn)?;
105
106        // Ensure source column exists on older tables
107        Self::ensure_column(&conn, "files", "source", "TEXT NOT NULL DEFAULT 'memory'")?;
108        Self::ensure_column(&conn, "chunks", "source", "TEXT NOT NULL DEFAULT 'memory'")?;
109
110        // Try to load sqlite-vec extension for fast vector search
111        let has_vec_extension = Self::try_load_sqlite_vec(&conn);
112        if has_vec_extension {
113            debug!("sqlite-vec extension loaded successfully");
114            Self::ensure_vec_table(&conn)?;
115        } else {
116            debug!("sqlite-vec extension not available, using in-memory vector search");
117        }
118
119        Ok(Self {
120            conn: Arc::new(Mutex::new(conn)),
121            workspace: workspace.to_path_buf(),
122            db_path: db_path.to_path_buf(),
123            has_vec_extension,
124            chunk_size: 400,
125            chunk_overlap: 80,
126        })
127    }
128
129    /// Set chunk size and overlap (builder pattern)
130    pub fn with_chunk_config(mut self, chunk_size: usize, chunk_overlap: usize) -> Self {
131        self.chunk_size = chunk_size;
132        self.chunk_overlap = chunk_overlap;
133        self
134    }
135
136    /// Try to load sqlite-vec extension
137    #[cfg(feature = "sqlite-vec")]
138    #[allow(unsafe_code)]
139    fn try_load_sqlite_vec(conn: &Connection) -> bool {
140        // sqlite-vec provides the extension as a loadable module
141        // Try to load it - this requires the extension to be installed on the system
142
143        // SAFETY: load_extension is unsafe because it loads arbitrary native code
144        // We only load from known, trusted paths (sqlite-vec)
145        if unsafe { conn.load_extension_enable() }.is_err() {
146            return false;
147        }
148
149        // Try loading from common locations
150        let ext_paths = [
151            "vec0", // If in LD_LIBRARY_PATH
152            "./vec0",
153            "/usr/local/lib/vec0",
154            "/usr/lib/vec0",
155        ];
156
157        for path in ext_paths {
158            // SAFETY: Loading sqlite-vec extension from trusted path
159            if unsafe { conn.load_extension(path, None::<&str>) }.is_ok() {
160                let _ = conn.load_extension_disable();
161                return true;
162            }
163        }
164
165        let _ = conn.load_extension_disable();
166        false
167    }
168
169    #[cfg(not(feature = "sqlite-vec"))]
170    fn try_load_sqlite_vec(_conn: &Connection) -> bool {
171        false
172    }
173
174    /// Create virtual table for vector search (requires sqlite-vec)
175    fn ensure_vec_table(conn: &Connection) -> Result<()> {
176        // Create chunks_vec virtual table if sqlite-vec is available
177        let result = conn.execute(
178            "CREATE VIRTUAL TABLE IF NOT EXISTS chunks_vec USING vec0(id TEXT PRIMARY KEY, embedding float[1536])",
179            [],
180        );
181        match result {
182            Ok(_) => debug!("chunks_vec table created/verified"),
183            Err(e) => debug!("chunks_vec table creation skipped: {}", e),
184        }
185        Ok(())
186    }
187
188    /// Create a new memory index with database in workspace (legacy path)
189    pub fn new(workspace: &Path) -> Result<Self> {
190        let db_path = workspace.join("memory.sqlite");
191        Self::new_with_db_path(workspace, &db_path)
192    }
193
194    /// Index a file, returning true if it was updated
195    pub fn index_file(&self, path: &Path, force: bool) -> Result<bool> {
196        let content = fs::read_to_string(path)?;
197        let file_hash = hash_content(&content);
198        let metadata = fs::metadata(path)?;
199        let mtime = metadata
200            .modified()?
201            .duration_since(std::time::UNIX_EPOCH)?
202            .as_secs() as i64;
203        let size = metadata.len() as i64;
204
205        let relative_path = path
206            .strip_prefix(&self.workspace)
207            .unwrap_or(path)
208            .to_string_lossy()
209            .to_string();
210
211        let conn = self
212            .conn
213            .lock()
214            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
215
216        // Check if file has changed
217        if !force {
218            let existing: Option<String> = conn
219                .query_row(
220                    "SELECT hash FROM files WHERE path = ?1",
221                    params![&relative_path],
222                    |row| row.get(0),
223                )
224                .ok();
225
226            if existing.as_deref() == Some(&file_hash) {
227                debug!("File unchanged, skipping: {}", relative_path);
228                return Ok(false);
229            }
230        }
231
232        debug!("Indexing file: {}", relative_path);
233
234        let now = std::time::SystemTime::now()
235            .duration_since(std::time::UNIX_EPOCH)?
236            .as_secs() as i64;
237
238        // Update file record (OpenClaw-compatible columns)
239        conn.execute(
240            "INSERT OR REPLACE INTO files (path, source, hash, mtime, size) VALUES (?1, 'memory', ?2, ?3, ?4)",
241            params![&relative_path, &file_hash, mtime, size],
242        )?;
243
244        // Delete existing chunks and their FTS entries
245        Self::delete_chunks_for_path(&conn, &relative_path)?;
246
247        // Create new chunks (OpenClaw-compatible)
248        let chunks = chunk_text(&content, self.chunk_size, self.chunk_overlap);
249
250        for chunk in chunks.iter() {
251            let chunk_id = Uuid::new_v4().to_string();
252            let chunk_hash = hash_content(&chunk.content);
253
254            conn.execute(
255                r#"INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
256                   VALUES (?1, ?2, 'memory', ?3, ?4, ?5, '', ?6, '', ?7)"#,
257                params![&chunk_id, &relative_path, chunk.line_start, chunk.line_end, &chunk_hash, &chunk.content, now],
258            )?;
259
260            // Insert into FTS
261            Self::insert_fts(
262                &conn,
263                &chunk_id,
264                &relative_path,
265                "memory",
266                "",
267                chunk.line_start,
268                chunk.line_end,
269                &chunk.content,
270            )?;
271        }
272
273        Ok(true)
274    }
275
276    /// Insert a single pre-built chunk into the index.
277    /// Used by session indexer to add session transcript chunks.
278    pub fn insert_chunk(
279        &self,
280        virtual_path: &str,
281        content: &str,
282        line_start_raw: usize,
283        line_end_raw: usize,
284    ) -> Result<()> {
285        let conn = self
286            .conn
287            .lock()
288            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
289
290        let line_start = line_start_raw as i64;
291        let line_end = line_end_raw as i64;
292        let chunk_id = Uuid::new_v4().to_string();
293        let chunk_hash = hash_content(content);
294        let now = std::time::SystemTime::now()
295            .duration_since(std::time::UNIX_EPOCH)?
296            .as_secs() as i64;
297
298        conn.execute(
299            r#"INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
300               VALUES (?1, ?2, 'session', ?3, ?4, ?5, '', ?6, '', ?7)"#,
301            params![&chunk_id, virtual_path, line_start, line_end, &chunk_hash, content, now],
302        )?;
303
304        Self::insert_fts(
305            &conn,
306            &chunk_id,
307            virtual_path,
308            "session",
309            "",
310            line_start as i32,
311            line_end as i32,
312            content,
313        )?;
314
315        Ok(())
316    }
317
318    /// Delete chunks for a path and their FTS entries
319    fn delete_chunks_for_path(conn: &Connection, path: &str) -> Result<()> {
320        // Delete from FTS first (get chunk IDs)
321        let mut stmt = conn.prepare("SELECT id FROM chunks WHERE path = ?1")?;
322        let chunk_ids: Vec<String> = stmt
323            .query_map(params![path], |row| row.get(0))?
324            .filter_map(|r| r.ok())
325            .collect();
326
327        for chunk_id in chunk_ids {
328            let _ = conn.execute("DELETE FROM chunks_fts WHERE id = ?1", params![&chunk_id]);
329        }
330
331        // Delete chunks
332        conn.execute("DELETE FROM chunks WHERE path = ?1", params![path])?;
333        Ok(())
334    }
335
336    /// Remove a file and its chunks from the index (for deleted files)
337    pub fn remove_file(&self, relative_path: &str) -> Result<()> {
338        let conn = self
339            .conn
340            .lock()
341            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
342
343        Self::delete_chunks_for_path(&conn, relative_path)?;
344        conn.execute("DELETE FROM files WHERE path = ?1", params![relative_path])?;
345
346        debug!("Removed deleted file from index: {}", relative_path);
347        Ok(())
348    }
349
350    /// Get all indexed file paths
351    pub fn indexed_files(&self) -> Result<Vec<String>> {
352        let conn = self
353            .conn
354            .lock()
355            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
356
357        let mut stmt = conn.prepare("SELECT path FROM files")?;
358        let rows = stmt.query_map([], |row| row.get(0))?;
359
360        let mut paths = Vec::new();
361        for row in rows {
362            paths.push(row?);
363        }
364        Ok(paths)
365    }
366
367    /// Insert into FTS table
368    #[allow(clippy::too_many_arguments)]
369    fn insert_fts(
370        conn: &Connection,
371        id: &str,
372        path: &str,
373        source: &str,
374        model: &str,
375        start_line: i32,
376        end_line: i32,
377        text: &str,
378    ) -> Result<()> {
379        let _ = conn.execute(
380            "INSERT INTO chunks_fts (text, id, path, source, model, start_line, end_line) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
381            params![text, id, path, source, model, start_line, end_line],
382        );
383        Ok(())
384    }
385
386    /// Search using FTS5
387    pub fn search(&self, query: &str, limit: usize) -> Result<Vec<MemoryChunk>> {
388        let fts_query = match build_fts_query(query) {
389            Some(q) => q,
390            None => return Ok(Vec::new()),
391        };
392
393        let conn = self
394            .conn
395            .lock()
396            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
397
398        // OpenClaw-compatible: use 'path', 'start_line', 'end_line', 'text' columns
399        // Join with chunks to get updated_at for temporal decay
400        let mut stmt = conn.prepare(
401            r#"
402            SELECT fts.path, fts.start_line, fts.end_line, fts.text, bm25(chunks_fts) as score, c.updated_at
403            FROM chunks_fts fts
404            JOIN chunks c ON c.id = fts.id
405            WHERE chunks_fts MATCH ?1
406            ORDER BY score
407            LIMIT ?2
408            "#,
409        )?;
410
411        let rows = stmt.query_map(params![&fts_query, limit as i64], |row| {
412            Ok(MemoryChunk {
413                file: row.get(0)?,
414                line_start: row.get(1)?,
415                line_end: row.get(2)?,
416                content: row.get(3)?,
417                score: row.get::<_, f64>(4)?.abs(), // BM25 returns negative scores
418                updated_at: row.get(5)?,
419            })
420        })?;
421
422        let mut results = Vec::new();
423        for row in rows {
424            results.push(row?);
425        }
426
427        Ok(results)
428    }
429
430    /// Search using a pre-built FTS5 query string (bypasses build_fts_query).
431    /// Used by query expansion which provides its own OR-joined keyword query.
432    pub fn search_fts_raw(&self, fts_query: &str, limit: usize) -> Result<Vec<MemoryChunk>> {
433        if fts_query.is_empty() {
434            return Ok(Vec::new());
435        }
436
437        let conn = self
438            .conn
439            .lock()
440            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
441
442        let mut stmt = conn.prepare(
443            r#"
444            SELECT fts.path, fts.start_line, fts.end_line, fts.text, bm25(chunks_fts) as score, c.updated_at
445            FROM chunks_fts fts
446            JOIN chunks c ON c.id = fts.id
447            WHERE chunks_fts MATCH ?1
448            ORDER BY score
449            LIMIT ?2
450            "#,
451        )?;
452
453        let rows = stmt.query_map(params![fts_query, limit as i64], |row| {
454            Ok(MemoryChunk {
455                file: row.get(0)?,
456                line_start: row.get(1)?,
457                line_end: row.get(2)?,
458                content: row.get(3)?,
459                score: row.get::<_, f64>(4)?.abs(),
460                updated_at: row.get(5)?,
461            })
462        })?;
463
464        let mut results = Vec::new();
465        for row in rows {
466            results.push(row?);
467        }
468
469        Ok(results)
470    }
471
472    /// Get total chunk count
473    pub fn chunk_count(&self) -> Result<usize> {
474        let conn = self
475            .conn
476            .lock()
477            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
478        let count: i64 = conn.query_row("SELECT COUNT(*) FROM chunks", [], |row| row.get(0))?;
479        Ok(count as usize)
480    }
481
482    /// Get chunk count for a specific file
483    pub fn file_chunk_count(&self, path: &Path) -> Result<usize> {
484        let relative_path = path
485            .strip_prefix(&self.workspace)
486            .unwrap_or(path)
487            .to_string_lossy()
488            .to_string();
489
490        let conn = self
491            .conn
492            .lock()
493            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
494        let count: i64 = conn.query_row(
495            "SELECT COUNT(*) FROM chunks WHERE path = ?1",
496            params![&relative_path],
497            |row| row.get(0),
498        )?;
499        Ok(count as usize)
500    }
501
502    /// Get database size in bytes
503    pub fn size_bytes(&self) -> Result<u64> {
504        if self.db_path.exists() {
505            Ok(fs::metadata(&self.db_path)?.len())
506        } else {
507            Ok(0)
508        }
509    }
510
511    /// Get the database path
512    pub fn db_path(&self) -> &Path {
513        &self.db_path
514    }
515
516    /// Check if we need to migrate from old LocalGPT schema to OpenClaw schema
517    fn needs_schema_migration(conn: &Connection) -> Result<bool> {
518        // Check for old schema indicators:
519        // 1. chunks table has 'file_path' column instead of 'path'
520        // 2. chunks table has 'content' column instead of 'text'
521        // 3. chunks.id is INTEGER instead of TEXT
522        let result: rusqlite::Result<String> =
523            conn.query_row("PRAGMA table_info(chunks)", [], |row| row.get(1));
524
525        if result.is_err() {
526            // chunks table doesn't exist yet, no migration needed
527            return Ok(false);
528        }
529
530        // Check column names
531        let has_file_path: bool = conn.prepare("SELECT file_path FROM chunks LIMIT 0").is_ok();
532        let has_content: bool = conn.prepare("SELECT content FROM chunks LIMIT 0").is_ok();
533
534        // Old schema has file_path and content columns
535        Ok(has_file_path || has_content)
536    }
537
538    /// Migrate from old LocalGPT schema to OpenClaw-compatible schema
539    fn migrate_to_openclaw_schema(conn: &Connection) -> Result<()> {
540        // Start transaction
541        conn.execute("BEGIN TRANSACTION", [])?;
542
543        // 1. Rename old tables
544        let _ = conn.execute("ALTER TABLE chunks RENAME TO chunks_old", []);
545        let _ = conn.execute("ALTER TABLE files RENAME TO files_old", []);
546
547        // 2. Drop old FTS and triggers
548        let _ = conn.execute("DROP TABLE IF EXISTS chunks_fts", []);
549        let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_ai", []);
550        let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_ad", []);
551        let _ = conn.execute("DROP TRIGGER IF EXISTS chunks_au", []);
552
553        // 3. Create new tables with OpenClaw schema
554        conn.execute(
555            r#"
556            CREATE TABLE IF NOT EXISTS files (
557                path TEXT PRIMARY KEY,
558                source TEXT NOT NULL DEFAULT 'memory',
559                hash TEXT NOT NULL,
560                mtime INTEGER NOT NULL,
561                size INTEGER NOT NULL
562            )
563            "#,
564            [],
565        )?;
566
567        conn.execute(
568            r#"
569            CREATE TABLE IF NOT EXISTS chunks (
570                id TEXT PRIMARY KEY,
571                path TEXT NOT NULL,
572                source TEXT NOT NULL DEFAULT 'memory',
573                start_line INTEGER NOT NULL,
574                end_line INTEGER NOT NULL,
575                hash TEXT NOT NULL,
576                model TEXT NOT NULL DEFAULT '',
577                text TEXT NOT NULL,
578                embedding TEXT NOT NULL DEFAULT '',
579                updated_at INTEGER NOT NULL
580            )
581            "#,
582            [],
583        )?;
584
585        // 4. Migrate data from old tables
586        let now = std::time::SystemTime::now()
587            .duration_since(std::time::UNIX_EPOCH)
588            .unwrap_or_default()
589            .as_secs() as i64;
590
591        // Migrate files
592        let _ = conn.execute(
593            r#"
594            INSERT INTO files (path, source, hash, mtime, size)
595            SELECT path, 'memory', hash, mtime, size FROM files_old
596            "#,
597            [],
598        );
599
600        // Migrate chunks - generate new TEXT UUIDs for each row
601        // Check if old schema has embedding columns
602        let has_embedding_cols = conn
603            .prepare("SELECT embedding FROM chunks_old LIMIT 0")
604            .is_ok();
605
606        // Read old data and insert with new UUIDs
607        if has_embedding_cols {
608            // Old schema has embedding columns
609            let mut stmt = conn.prepare(
610                "SELECT file_path, line_start, line_end, content, embedding, embedding_model FROM chunks_old",
611            )?;
612            let rows = stmt.query_map([], |row| {
613                Ok((
614                    row.get::<_, String>(0)?,
615                    row.get::<_, i32>(1)?,
616                    row.get::<_, i32>(2)?,
617                    row.get::<_, String>(3)?,
618                    row.get::<_, Option<String>>(4)?,
619                    row.get::<_, Option<String>>(5)?,
620                ))
621            })?;
622
623            for row in rows {
624                let (file_path, line_start, line_end, content, embedding, model) = row?;
625                let new_id = Uuid::new_v4().to_string();
626                let hash = hash_content(&content);
627                let model = model.unwrap_or_default();
628                let embedding = embedding.unwrap_or_default();
629
630                conn.execute(
631                    r#"
632                    INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
633                    VALUES (?1, ?2, 'memory', ?3, ?4, ?5, ?6, ?7, ?8, ?9)
634                    "#,
635                    params![&new_id, &file_path, line_start, line_end, &hash, &model, &content, &embedding, now],
636                )?;
637            }
638        } else {
639            // Old schema without embedding columns
640            let mut stmt =
641                conn.prepare("SELECT file_path, line_start, line_end, content FROM chunks_old")?;
642            let rows = stmt.query_map([], |row| {
643                Ok((
644                    row.get::<_, String>(0)?,
645                    row.get::<_, i32>(1)?,
646                    row.get::<_, i32>(2)?,
647                    row.get::<_, String>(3)?,
648                ))
649            })?;
650
651            for row in rows {
652                let (file_path, line_start, line_end, content) = row?;
653                let new_id = Uuid::new_v4().to_string();
654                let hash = hash_content(&content);
655
656                conn.execute(
657                    r#"
658                    INSERT INTO chunks (id, path, source, start_line, end_line, hash, model, text, embedding, updated_at)
659                    VALUES (?1, ?2, 'memory', ?3, ?4, ?5, '', ?6, '', ?7)
660                    "#,
661                    params![&new_id, &file_path, line_start, line_end, &hash, &content, now],
662                )?;
663            }
664        }
665
666        // 5. Drop old tables
667        let _ = conn.execute("DROP TABLE IF EXISTS chunks_old", []);
668        let _ = conn.execute("DROP TABLE IF EXISTS files_old", []);
669
670        conn.execute("COMMIT", [])?;
671        info!("Schema migration completed successfully");
672        Ok(())
673    }
674
675    /// Create FTS5 table with OpenClaw-compatible structure
676    fn ensure_fts_table(conn: &Connection) -> Result<()> {
677        // OpenClaw uses UNINDEXED columns for metadata
678        let result = conn.execute(
679            r#"
680            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
681                text,
682                id UNINDEXED,
683                path UNINDEXED,
684                source UNINDEXED,
685                model UNINDEXED,
686                start_line UNINDEXED,
687                end_line UNINDEXED
688            )
689            "#,
690            [],
691        );
692
693        match result {
694            Ok(_) => debug!("FTS5 table created/verified"),
695            Err(e) => debug!("FTS5 table creation skipped: {}", e),
696        }
697
698        Ok(())
699    }
700
701    /// Ensure a column exists on a table (for migrations)
702    fn ensure_column(conn: &Connection, table: &str, column: &str, definition: &str) -> Result<()> {
703        let sql = format!("SELECT {} FROM {} LIMIT 0", column, table);
704        if conn.prepare(&sql).is_err() {
705            let alter = format!("ALTER TABLE {} ADD COLUMN {} {}", table, column, definition);
706            conn.execute(&alter, [])?;
707            debug!("Added column {} to table {}", column, table);
708        }
709        Ok(())
710    }
711
712    /// Get chunks that need embeddings (OpenClaw-compatible: id is TEXT, text column)
713    pub fn chunks_without_embeddings(&self, limit: usize) -> Result<Vec<(String, String)>> {
714        let conn = self
715            .conn
716            .lock()
717            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
718
719        let mut stmt = conn.prepare(
720            "SELECT id, text FROM chunks WHERE embedding = '' OR embedding IS NULL LIMIT ?1",
721        )?;
722
723        let rows = stmt.query_map(params![limit as i64], |row| {
724            Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
725        })?;
726
727        let mut results = Vec::new();
728        for row in rows {
729            results.push(row?);
730        }
731
732        Ok(results)
733    }
734
735    /// Store embedding for a chunk (OpenClaw-compatible: id is TEXT, model column)
736    pub fn store_embedding(&self, chunk_id: &str, embedding: &[f32], model: &str) -> Result<()> {
737        let conn = self
738            .conn
739            .lock()
740            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
741
742        let embedding_json = serialize_embedding(embedding);
743        let now = std::time::SystemTime::now()
744            .duration_since(std::time::UNIX_EPOCH)?
745            .as_secs() as i64;
746
747        conn.execute(
748            "UPDATE chunks SET embedding = ?1, model = ?2, updated_at = ?3 WHERE id = ?4",
749            params![&embedding_json, model, now, chunk_id],
750        )?;
751
752        // Also store in vec table if sqlite-vec is available
753        if self.has_vec_extension {
754            let embedding_blob = embedding_to_blob(embedding);
755            let _ = conn.execute(
756                "INSERT OR REPLACE INTO chunks_vec (id, embedding) VALUES (?1, ?2)",
757                params![chunk_id, &embedding_blob],
758            );
759        }
760
761        Ok(())
762    }
763
764    // ========================================================================
765    // Embedding Cache (OpenClaw-compatible)
766    // ========================================================================
767
768    /// Get cached embedding by content hash
769    pub fn get_cached_embedding(
770        &self,
771        provider: &str,
772        model: &str,
773        text_hash: &str,
774    ) -> Result<Option<Vec<f32>>> {
775        let conn = self
776            .conn
777            .lock()
778            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
779
780        let result: Option<String> = conn
781            .query_row(
782                "SELECT embedding FROM embedding_cache WHERE provider = ?1 AND model = ?2 AND hash = ?3",
783                params![provider, model, text_hash],
784                |row| row.get(0),
785            )
786            .ok();
787
788        Ok(result.map(|json| deserialize_embedding(&json)))
789    }
790
791    /// Store embedding in cache
792    pub fn cache_embedding(
793        &self,
794        provider: &str,
795        model: &str,
796        provider_key: &str,
797        text_hash: &str,
798        embedding: &[f32],
799    ) -> Result<()> {
800        let conn = self
801            .conn
802            .lock()
803            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
804
805        let embedding_json = serialize_embedding(embedding);
806        let dims = embedding.len() as i32;
807        let now = std::time::SystemTime::now()
808            .duration_since(std::time::UNIX_EPOCH)?
809            .as_secs() as i64;
810
811        conn.execute(
812            "INSERT OR REPLACE INTO embedding_cache (provider, model, provider_key, hash, embedding, dims, updated_at)
813             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
814            params![provider, model, provider_key, text_hash, &embedding_json, dims, now],
815        )?;
816
817        Ok(())
818    }
819
820    /// Check if sqlite-vec is available
821    pub fn has_vec_extension(&self) -> bool {
822        self.has_vec_extension
823    }
824
825    /// Vector search using embeddings (OpenClaw-compatible columns)
826    /// Uses sqlite-vec if available for fast search, otherwise falls back to in-memory scan
827    pub fn search_vector(
828        &self,
829        query_embedding: &[f32],
830        model: &str,
831        limit: usize,
832    ) -> Result<Vec<MemoryChunk>> {
833        let conn = self
834            .conn
835            .lock()
836            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
837
838        // Try sqlite-vec fast path if available
839        if self.has_vec_extension {
840            if let Ok(results) = self.search_vector_fast(&conn, query_embedding, model, limit) {
841                return Ok(results);
842            }
843            warn!("sqlite-vec search failed, falling back to in-memory scan");
844        }
845
846        // Fallback: in-memory scan (slower but always works)
847        self.search_vector_scan(&conn, query_embedding, model, limit)
848    }
849
850    /// Fast vector search using sqlite-vec extension
851    fn search_vector_fast(
852        &self,
853        conn: &Connection,
854        query_embedding: &[f32],
855        model: &str,
856        limit: usize,
857    ) -> Result<Vec<MemoryChunk>> {
858        let query_blob = embedding_to_blob(query_embedding);
859
860        // sqlite-vec uses vec_distance_cosine for cosine distance (1 - similarity)
861        let mut stmt = conn.prepare(
862            r#"
863            SELECT c.path, c.start_line, c.end_line, c.text,
864                   1.0 - vec_distance_cosine(v.embedding, ?1) AS score, c.updated_at
865            FROM chunks_vec v
866            JOIN chunks c ON c.id = v.id
867            WHERE c.model = ?2
868            ORDER BY score DESC
869            LIMIT ?3
870            "#,
871        )?;
872
873        let rows = stmt.query_map(params![&query_blob, model, limit as i64], |row| {
874            Ok(MemoryChunk {
875                file: row.get(0)?,
876                line_start: row.get(1)?,
877                line_end: row.get(2)?,
878                content: row.get(3)?,
879                score: row.get(4)?,
880                updated_at: row.get(5)?,
881            })
882        })?;
883
884        let mut results = Vec::new();
885        for row in rows {
886            results.push(row?);
887        }
888        Ok(results)
889    }
890
891    /// In-memory vector scan (fallback when sqlite-vec not available)
892    fn search_vector_scan(
893        &self,
894        conn: &Connection,
895        query_embedding: &[f32],
896        model: &str,
897        limit: usize,
898    ) -> Result<Vec<MemoryChunk>> {
899        let mut stmt = conn.prepare(
900            "SELECT id, path, start_line, end_line, text, embedding, updated_at
901             FROM chunks
902             WHERE embedding != '' AND embedding IS NOT NULL AND model = ?1",
903        )?;
904
905        let rows = stmt.query_map(params![model], |row| {
906            Ok((
907                row.get::<_, String>(0)?,
908                row.get::<_, String>(1)?,
909                row.get::<_, i32>(2)?,
910                row.get::<_, i32>(3)?,
911                row.get::<_, String>(4)?,
912                row.get::<_, String>(5)?,
913                row.get::<_, i64>(6)?,
914            ))
915        })?;
916
917        // Compute similarities and sort
918        let mut scored: Vec<(f32, MemoryChunk)> = Vec::new();
919
920        for row in rows {
921            let (_, path, start_line, end_line, text, embedding_json, updated_at) = row?;
922            let embedding = deserialize_embedding(&embedding_json);
923
924            if embedding.len() == query_embedding.len() {
925                let similarity = cosine_similarity(query_embedding, &embedding);
926                scored.push((
927                    similarity,
928                    MemoryChunk {
929                        file: path,
930                        line_start: start_line,
931                        line_end: end_line,
932                        content: text,
933                        score: similarity as f64,
934                        updated_at,
935                    },
936                ));
937            }
938        }
939
940        // Sort by similarity (descending)
941        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
942
943        // Take top results
944        Ok(scored
945            .into_iter()
946            .take(limit)
947            .map(|(_, chunk)| chunk)
948            .collect())
949    }
950
951    /// Hybrid search: combine FTS and vector results
952    pub fn search_hybrid(
953        &self,
954        fts_query: &str,
955        query_embedding: Option<&[f32]>,
956        model: &str,
957        limit: usize,
958        text_weight: f32,
959        vector_weight: f32,
960    ) -> Result<Vec<MemoryChunk>> {
961        // Get FTS results using pre-built query (from query expansion or raw)
962        let fts_results = self.search_fts_raw(fts_query, limit * 2)?;
963
964        // Get vector results if embedding provided
965        let vector_results = if let Some(embedding) = query_embedding {
966            self.search_vector(embedding, model, limit * 2)?
967        } else {
968            Vec::new()
969        };
970
971        // Merge results using rank-based scoring (OpenClaw-compatible)
972        let mut merged: std::collections::HashMap<String, (f32, MemoryChunk)> =
973            std::collections::HashMap::new();
974
975        // Add FTS results using rank-based scoring (OpenClaw-compatible)
976        // BM25 results are already ordered by relevance (best first)
977        for (rank, result) in fts_results.into_iter().enumerate() {
978            let key = format!("{}:{}:{}", result.file, result.line_start, result.line_end);
979            let rank_score = 1.0 / (1.0 + rank as f32); // rank 0 → 1.0, rank 1 → 0.5, rank 9 → 0.1
980            let weighted_score = rank_score * text_weight;
981            merged.insert(key, (weighted_score, result));
982        }
983
984        // Add/merge vector results using rank-based scoring
985        for (rank, result) in vector_results.into_iter().enumerate() {
986            let key = format!("{}:{}:{}", result.file, result.line_start, result.line_end);
987            let rank_score = 1.0 / (1.0 + rank as f32);
988            let weighted_score = rank_score * vector_weight;
989
990            if let Some((existing_score, existing_chunk)) = merged.get_mut(&key) {
991                *existing_score += weighted_score;
992                existing_chunk.score = *existing_score as f64;
993            } else {
994                let mut chunk = result;
995                chunk.score = weighted_score as f64;
996                merged.insert(key, (weighted_score, chunk));
997            }
998        }
999
1000        // Sort by combined score and take top results
1001        let mut results: Vec<_> = merged.into_values().collect();
1002        results.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
1003
1004        Ok(results
1005            .into_iter()
1006            .take(limit)
1007            .map(|(_, chunk)| chunk)
1008            .collect())
1009    }
1010
1011    /// Count chunks with embeddings (OpenClaw-compatible: model column)
1012    pub fn embedded_chunk_count(&self, model: &str) -> Result<usize> {
1013        let conn = self
1014            .conn
1015            .lock()
1016            .map_err(|e| anyhow!("Lock poisoned: {}", e))?;
1017
1018        let count: i64 = conn.query_row(
1019            "SELECT COUNT(*) FROM chunks WHERE embedding != '' AND embedding IS NOT NULL AND model = ?1",
1020            params![model],
1021            |row| row.get(0),
1022        )?;
1023
1024        Ok(count as usize)
1025    }
1026}
1027
1028fn hash_content(content: &str) -> String {
1029    let mut hasher = Sha256::new();
1030    hasher.update(content.as_bytes());
1031    format!("{:x}", hasher.finalize())
1032}
1033
/// Serializes an `f32` embedding into the binary blob passed to sqlite-vec.
///
/// Each value is written as its 4 little-endian bytes, in order, so the result
/// is exactly `embedding.len() * 4` bytes long.
fn embedding_to_blob(embedding: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(embedding.len() * std::mem::size_of::<f32>());
    bytes.extend(embedding.iter().flat_map(|value| value.to_le_bytes()));
    bytes
}
1042
/// Builds an FTS5 query from raw user input (OpenClaw-compatible).
///
/// The input is split on every character that is neither alphanumeric nor `_`.
/// Each non-empty token is wrapped in double quotes and the tokens are joined
/// with ` AND `, so all terms must appear (in any order). Returns `None` when
/// the input contains no tokens.
fn build_fts_query(raw: &str) -> Option<String> {
    let is_separator = |c: char| !(c.is_alphanumeric() || c == '_');

    // Tokens contain only alphanumerics and underscores, so they can never hold
    // whitespace or a double quote and are safe to quote verbatim.
    let terms: Vec<String> = raw
        .split(is_separator)
        .filter(|term| !term.is_empty())
        .map(|term| format!("\"{}\"", term))
        .collect();

    if terms.is_empty() {
        None
    } else {
        Some(terms.join(" AND "))
    }
}
1064
/// A contiguous run of source lines produced by [`chunk_text`].
#[derive(Debug, Clone, PartialEq, Eq)]
struct ChunkInfo {
    /// 1-based number of the first line in the chunk.
    line_start: i32,
    /// 1-based number of the last line in the chunk (inclusive).
    line_end: i32,
    /// The chunk's lines joined with `\n`.
    content: String,
}

/// Splits `text` into line-aligned, optionally overlapping chunks.
///
/// Sizes are given in tokens and converted with a rough estimate of 4 bytes
/// per token. Lines are accumulated until the chunk reaches
/// `target_tokens * 4` bytes (each line counted with one extra byte for its
/// newline) or the input ends; lines are never split, so a chunk may exceed the
/// target. After a chunk is emitted, its trailing lines covering about
/// `overlap_tokens * 4` bytes are repeated at the start of the next chunk.
///
/// Guarantees:
/// * every input line appears in at least one chunk, and every chunk ends on a
///   line not covered by any earlier chunk;
/// * `overlap_tokens == 0` produces no overlap at all;
/// * the carried overlap is always a strict suffix of the chunk, so one
///   over-long line cannot be re-emitted together with each following line.
///
/// Returns an empty vector when `text` has no lines.
fn chunk_text(text: &str, target_tokens: usize, overlap_tokens: usize) -> Vec<ChunkInfo> {
    let lines: Vec<&str> = text.lines().collect();
    let mut chunks = Vec::new();

    if lines.is_empty() {
        return chunks;
    }

    // Rough estimate: 4 chars per token (saturating so huge settings cannot overflow)
    let target_chars = target_tokens.saturating_mul(4);
    let overlap_chars = overlap_tokens.saturating_mul(4);
    let last_index = lines.len() - 1;

    // Invariant: `chunk_lines` holds lines `start_line..=i` of the input (0-based).
    let mut start_line = 0usize;
    let mut current_chars = 0usize;
    let mut chunk_lines: Vec<&str> = Vec::new();

    for (i, line) in lines.iter().copied().enumerate() {
        chunk_lines.push(line);
        current_chars += line.len() + 1; // +1 for newline

        if current_chars < target_chars && i != last_index {
            continue;
        }

        chunks.push(ChunkInfo {
            line_start: (start_line + 1) as i32,
            line_end: (i + 1) as i32,
            content: chunk_lines.join("\n"),
        });

        // Drop everything before the overlap; what remains seeds the next chunk.
        let keep_from = chunk_overlap_start(&chunk_lines, overlap_chars);
        start_line += keep_from;
        chunk_lines.drain(..keep_from);
        current_chars = chunk_lines.iter().map(|l| l.len() + 1).sum();
    }

    chunks
}

/// Returns the index of the first line of `chunk_lines` to carry into the next
/// chunk, or `chunk_lines.len()` when nothing should be carried.
fn chunk_overlap_start(chunk_lines: &[&str], overlap_chars: usize) -> usize {
    // No overlap requested: start the next chunk from scratch.
    if overlap_chars == 0 {
        return chunk_lines.len();
    }

    let mut carried = 0usize;
    for (idx, line) in chunk_lines.iter().enumerate().rev() {
        carried += line.len() + 1;
        if carried >= overlap_chars {
            // Never carry the whole chunk: clamping to 1 drops at least the first
            // line (and carries nothing for a single-line chunk).
            return idx.max(1);
        }
    }

    // The whole chunk is shorter than the requested overlap: carry nothing.
    chunk_lines.len()
}
1126
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Input below the size budget must come back as one chunk spanning every line.
    #[test]
    fn test_chunk_text() {
        let text = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5";
        let chunks = chunk_text(text, 10, 2); // 40-byte budget, 8-byte overlap

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].line_start, 1);
        assert_eq!(chunks[0].line_end, 5);
        assert_eq!(chunks[0].content, text);
    }

    /// With a small budget each chunk repeats the previous chunk's last line.
    #[test]
    fn test_chunk_text_overlap() {
        let text = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5";
        let chunks = chunk_text(text, 2, 1); // 8-byte budget, 4-byte overlap

        let spans: Vec<(i32, i32)> = chunks
            .iter()
            .map(|chunk| (chunk.line_start, chunk.line_end))
            .collect();
        assert_eq!(spans, vec![(1, 2), (2, 3), (3, 4), (4, 5)]);
        assert_eq!(chunks[1].content, "Line 2\nLine 3");
    }

    /// Text without any lines yields no chunks.
    #[test]
    fn test_chunk_text_empty() {
        assert!(chunk_text("", 10, 2).is_empty());
    }

    /// End-to-end smoke test: index one file, then find it through search.
    #[test]
    fn test_memory_index() -> Result<()> {
        let temp_dir = TempDir::new()?;
        let workspace = temp_dir.path();

        // Create a test file
        let test_file = workspace.join("test.md");
        fs::write(
            &test_file,
            "# Test\n\nThis is a test document.\n\nWith multiple lines.",
        )?;

        let index = MemoryIndex::new(workspace)?;
        index.index_file(&test_file, false)?;

        assert!(index.chunk_count()? > 0);

        let results = index.search("test document", 10)?;
        assert!(!results.is_empty());

        Ok(())
    }
}