Skip to main content

sqlite_graphrag/commands/
vec.rs

1//! Handler for the `vec` CLI subcommand family.
2//!
3//! Provides three maintenance operations for the `vec_memories` virtual
4//! table that backs the embedding KNN search:
5//!
6//! - `orphan-list`: lists `vec_memories` rows whose `memory_id` no longer
7//!   references a live (non-soft-deleted) memory.
8//! - `purge-orphan`: deletes those orphan rows in a single transaction.
9//! - `stats`: surfaces total rows, orphan count, and coverage percentage.
10//!
11//! G39 (v1.0.69): before v1.0.69, the only way to detect a vec-orphan was
12//! `health --json` which reported `vec_memories_orphaned > 0` with no
13//! remediation path. This module closes the loop.
14
15use crate::errors::AppError;
16use crate::output;
17use crate::paths::AppPaths;
18use crate::storage::connection::{open_ro, open_rw};
19use serde::Serialize;
20
/// Arguments for the `vec` subcommand family.
// Thin clap wrapper: its only field selects which `VecSubcommand` runs.
// Plain `//` comments are used here on purpose, because clap derives help
// text from `///` doc comments and the visible help must not change.
#[derive(clap::Args)]
#[command(
    about = "Vector index maintenance (orphan detection, purge, stats)",
    after_long_help = "EXAMPLES:\n  \
        # List orphan vec_memories rows whose memory_id is gone\n  \
        sqlite-graphrag vec orphan-list\n\n  \
        # Dry-run the purge (does not delete)\n  \
        sqlite-graphrag vec purge-orphan --dry-run\n\n  \
        # Actually purge orphans\n  \
        sqlite-graphrag vec purge-orphan --yes\n\n  \
        # Show stats for all vec0 tables\n  \
        sqlite-graphrag vec stats --json"
)]
pub struct VecArgs {
    // The maintenance operation to perform; dispatched by `run`.
    #[command(subcommand)]
    pub command: VecSubcommand,
}
39
/// Subcommands nested under `vec`.
// Each variant carries its own argument struct and is routed by `run` to the
// matching `run_*` handler. The `///` lines on the variants are kept verbatim
// because clap uses them as the per-subcommand help text.
#[derive(clap::Subcommand)]
pub enum VecSubcommand {
    /// List orphan vec_memories rows.
    OrphanList(VecOrphanListArgs),
    /// Delete orphan vec_memories rows. Requires `--yes` to confirm.
    // The handler also purges `vec_entities` and `vec_chunks` when present.
    PurgeOrphan(VecPurgeOrphanArgs),
    /// Show statistics for vec_memories, vec_entities, vec_chunks.
    Stats(VecStatsArgs),
}
50
/// Arguments for `vec orphan-list`.
#[derive(clap::Args)]
pub struct VecOrphanListArgs {
    /// No-op; JSON is always emitted on stdout.
    // Hidden from `--help`; `run_orphan_list` never reads this field.
    #[arg(long, hide = true)]
    pub json: bool,
    /// Path to the SQLite database file.
    // May also come from the `SQLITE_GRAPHRAG_DB_PATH` environment variable.
    // The handler passes it (or `None`) straight to `AppPaths::resolve`.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
61
/// Bare `json`/`db` field pair with the same shape as [`VecOrphanListArgs`].
// NOTE(review): these are NOT the `vec purge-orphan` arguments (that role
// belongs to `VecPurgeOrphanArgs`), and nothing in this file references this
// struct. Its fields carry no `#[arg]` attributes, so how clap would expose
// them differs from `VecOrphanListArgs` — confirm whether the type is still
// needed before relying on it.
#[derive(clap::Args)]
pub struct VecOrphanListInner {
    // Same name and type as `VecOrphanListArgs::json`.
    pub json: bool,
    // Same name and type as `VecOrphanListArgs::db`.
    pub db: Option<String>,
}
68
/// Arguments for `vec purge-orphan`.
#[derive(clap::Args)]
pub struct VecPurgeOrphanArgs {
    /// No-op; JSON is always emitted on stdout.
    // Hidden from `--help`; `run_purge_orphan` never reads this field.
    #[arg(long, hide = true)]
    pub json: bool,
    /// Path to the SQLite database file.
    // May also come from the `SQLITE_GRAPHRAG_DB_PATH` environment variable.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
    /// Skip the interactive confirmation; required for automation.
    // NOTE: `run_purge_orphan` has no interactive prompt. Without this flag
    // (and without `--dry-run`) it returns a validation error instead.
    #[arg(long, default_value_t = false)]
    pub yes: bool,
    /// Report what would be purged without writing.
    // Checked before `yes`, so a dry run never deletes even when `--yes` is
    // also supplied.
    #[arg(long, default_value_t = false)]
    pub dry_run: bool,
}
85
/// Arguments for `vec stats`.
#[derive(clap::Args)]
pub struct VecStatsArgs {
    /// No-op; JSON is always emitted on stdout.
    // Hidden from `--help`; `run_stats` never reads this field.
    #[arg(long, hide = true)]
    pub json: bool,
    /// Path to the SQLite database file.
    // May also come from the `SQLITE_GRAPHRAG_DB_PATH` environment variable.
    #[arg(long, env = "SQLITE_GRAPHRAG_DB_PATH")]
    pub db: Option<String>,
}
96
/// One orphan row reported by `vec orphan-list`.
///
/// Built from a `vec_memories` row that has no matching `memories.id`.
#[derive(Serialize)]
struct VecOrphanListItem {
    /// The orphan `memory_id` value stored in `vec_memories`.
    memory_id: i64,
    /// Hex-encoded BLAKE3 hash of the row's `embedding` blob, for
    /// fingerprinting.
    vector_hash: String,
    /// Value of the row's `created_at` column (when the orphan row was
    /// originally inserted). The unit is not visible in this file.
    created_at: i64,
}
106
/// JSON payload emitted by `vec orphan-list`.
#[derive(Serialize)]
struct VecOrphanListResponse {
    /// Always `"orphan_list"` for this command.
    action: String,
    /// Number of entries in `items`.
    count: i64,
    /// The orphan rows, ordered by `memory_id`; empty when `vec_memories`
    /// does not exist.
    items: Vec<VecOrphanListItem>,
    /// Wall-clock milliseconds from handler start to response construction.
    elapsed_ms: u64,
}
114
/// JSON payload emitted by `vec purge-orphan`.
#[derive(Serialize)]
struct VecPurgeOrphanResponse {
    /// `"purge_orphan"` when `vec_memories` is missing,
    /// `"purge_orphan_dry_run"` for a dry run, `"purged_orphan"` after a real
    /// purge.
    action: String,
    /// Number of rows removed from `vec_memories`; always 0 on a dry run.
    deleted: i64,
    /// Number of orphan rows in `vec_entities` that were also removed (G39).
    deleted_entities: i64,
    /// Number of orphan rows in `vec_chunks` that were also removed (G39).
    deleted_chunks: i64,
    /// Whether the invocation was a dry run (nothing written).
    dry_run: bool,
    /// Wall-clock milliseconds from handler start to response construction.
    elapsed_ms: u64,
}
126
/// JSON payload emitted by `vec stats`.
#[derive(Serialize)]
struct VecStatsResponse {
    /// Row count of `vec_memories`; 0 when the table is missing or the count
    /// query fails.
    total_rows: i64,
    /// `vec_memories` rows with no matching `memories.id`.
    orphaned: i64,
    /// Percentage of `vec_memories` rows that are not orphaned; 100.0 when
    /// `total_rows` is 0.
    coverage_percent: f64,
    /// Row count of `vec_entities`; omitted from the JSON when the table is
    /// missing or the count query fails.
    #[serde(skip_serializing_if = "Option::is_none")]
    vec_entities_rows: Option<i64>,
    /// Row count of `vec_chunks`; omitted from the JSON when the table is
    /// missing or the count query fails.
    #[serde(skip_serializing_if = "Option::is_none")]
    vec_chunks_rows: Option<i64>,
    /// Row count of `fts_memories`; 0 when the count query fails.
    fts_memories_rows: i64,
    /// Wall-clock milliseconds from handler start to response construction.
    elapsed_ms: u64,
}
139
140/// Dispatch entry point called from `main`.
141///
142/// # Errors
143/// Propagates any [`AppError`] raised by the underlying subcommand.
144pub fn run(args: VecArgs) -> Result<(), AppError> {
145    match args.command {
146        VecSubcommand::OrphanList(a) => run_orphan_list(a),
147        VecSubcommand::PurgeOrphan(a) => run_purge_orphan(a),
148        VecSubcommand::Stats(a) => run_stats(a),
149    }
150}
151
152fn run_orphan_list(args: VecOrphanListArgs) -> Result<(), AppError> {
153    let start = std::time::Instant::now();
154    let paths = AppPaths::resolve(args.db.as_deref())?;
155    crate::storage::connection::ensure_db_ready(&paths)?;
156    let conn = open_ro(&paths.db)?;
157
158    // FTS5-style table existence gate so the command is a no-op on
159    // databases that were created before vec_memories existed.
160    let table_exists: bool = conn
161        .query_row(
162            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='vec_memories'",
163            [],
164            |r| r.get::<_, i64>(0).map(|v| v > 0),
165        )
166        .unwrap_or(false);
167    if !table_exists {
168        return output::emit_json(&VecOrphanListResponse {
169            action: "orphan_list".to_string(),
170            count: 0,
171            items: Vec::new(),
172            elapsed_ms: start.elapsed().as_millis() as u64,
173        });
174    }
175
176    // List vec_memories rows that have no corresponding live memory row.
177    // We use a hash of the float[] blob (BLAKE3) as a fingerprint so the
178    // operator can detect duplicate embeddings even after the parent
179    // memory has been re-embedded with new content.
180    let mut stmt = conn.prepare(
181        "SELECT v.memory_id, v.embedding, v.created_at
182         FROM vec_memories v
183         LEFT JOIN memories m ON m.id = v.memory_id
184         WHERE m.id IS NULL
185         ORDER BY v.memory_id",
186    )?;
187    let rows: Vec<VecOrphanListItem> = stmt
188        .query_map([], |r| {
189            let memory_id: i64 = r.get(0)?;
190            let blob: Vec<u8> = r.get(1)?;
191            let created_at: i64 = r.get(2)?;
192            let vector_hash = blake3::hash(&blob).to_hex().to_string();
193            Ok(VecOrphanListItem {
194                memory_id,
195                vector_hash,
196                created_at,
197            })
198        })?
199        .collect::<Result<Vec<_>, _>>()?;
200    let count = rows.len() as i64;
201
202    output::emit_json(&VecOrphanListResponse {
203        action: "orphan_list".to_string(),
204        count,
205        items: rows,
206        elapsed_ms: start.elapsed().as_millis() as u64,
207    })?;
208    Ok(())
209}
210
211fn run_purge_orphan(args: VecPurgeOrphanArgs) -> Result<(), AppError> {
212    let start = std::time::Instant::now();
213    let paths = AppPaths::resolve(args.db.as_deref())?;
214    crate::storage::connection::ensure_db_ready(&paths)?;
215    let conn = open_rw(&paths.db)?;
216
217    // Count first so we can return a deterministic response even on dry-run.
218    let table_exists: bool = conn
219        .query_row(
220            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='vec_memories'",
221            [],
222            |r| r.get::<_, i64>(0).map(|v| v > 0),
223        )
224        .unwrap_or(false);
225    if !table_exists {
226        return output::emit_json(&VecPurgeOrphanResponse {
227            action: "purge_orphan".to_string(),
228            deleted: 0,
229            deleted_entities: 0,
230            deleted_chunks: 0,
231            dry_run: args.dry_run,
232            elapsed_ms: start.elapsed().as_millis() as u64,
233        });
234    }
235
236    let orphan_count: i64 = conn
237        .query_row(
238            "SELECT COUNT(*) FROM vec_memories v
239             LEFT JOIN memories m ON m.id = v.memory_id
240             WHERE m.id IS NULL",
241            [],
242            |r| r.get(0),
243        )
244        .unwrap_or(0);
245
246    // G39: also count orphans in vec_entities and vec_chunks. These
247    // tables follow the same `memory_id` foreign key convention and
248    // accumulate orphans on the same paths as vec_memories.
249    let orphan_entities_count: i64 = if vec_table_exists(&conn, "vec_entities") {
250        conn.query_row(
251            "SELECT COUNT(*) FROM vec_entities v
252             LEFT JOIN memories m ON m.id = v.memory_id
253             WHERE m.id IS NULL",
254            [],
255            |r| r.get(0),
256        )
257        .unwrap_or(0)
258    } else {
259        0
260    };
261    let orphan_chunks_count: i64 = if vec_table_exists(&conn, "vec_chunks") {
262        conn.query_row(
263            "SELECT COUNT(*) FROM vec_chunks v
264             LEFT JOIN memories m ON m.id = v.memory_id
265             WHERE m.id IS NULL",
266            [],
267            |r| r.get(0),
268        )
269        .unwrap_or(0)
270    } else {
271        0
272    };
273
274    if args.dry_run {
275        tracing::info!(target: "vec", orphan_count, orphan_entities_count, orphan_chunks_count, "dry-run: would delete orphans");
276        return output::emit_json(&VecPurgeOrphanResponse {
277            action: "purge_orphan_dry_run".to_string(),
278            deleted: 0,
279            deleted_entities: 0,
280            deleted_chunks: 0,
281            dry_run: true,
282            elapsed_ms: start.elapsed().as_millis() as u64,
283        });
284    }
285
286    if !args.yes {
287        return Err(AppError::Validation(format!(
288            "refusing to delete {orphan_count} vec_memories + {orphan_entities_count} vec_entities + {orphan_chunks_count} vec_chunks orphan rows without --yes (use --dry-run to preview)"
289        )));
290    }
291
292    let deleted: i64 = conn.execute(
293        "DELETE FROM vec_memories
294         WHERE memory_id NOT IN (SELECT id FROM memories)",
295        [],
296    )? as i64;
297
298    let deleted_entities: i64 = if vec_table_exists(&conn, "vec_entities") {
299        conn.execute(
300            "DELETE FROM vec_entities
301             WHERE memory_id NOT IN (SELECT id FROM memories)",
302            [],
303        )
304        .unwrap_or(0) as i64
305    } else {
306        0
307    };
308    let deleted_chunks: i64 = if vec_table_exists(&conn, "vec_chunks") {
309        conn.execute(
310            "DELETE FROM vec_chunks
311             WHERE memory_id NOT IN (SELECT id FROM memories)",
312            [],
313        )
314        .unwrap_or(0) as i64
315    } else {
316        0
317    };
318
319    tracing::info!(target: "vec", deleted, deleted_entities, deleted_chunks, "purged orphan vec rows");
320
321    output::emit_json(&VecPurgeOrphanResponse {
322        action: "purged_orphan".to_string(),
323        deleted,
324        deleted_entities,
325        deleted_chunks,
326        dry_run: false,
327        elapsed_ms: start.elapsed().as_millis() as u64,
328    })?;
329    Ok(())
330}
331
332fn run_stats(args: VecStatsArgs) -> Result<(), AppError> {
333    let start = std::time::Instant::now();
334    let paths = AppPaths::resolve(args.db.as_deref())?;
335    crate::storage::connection::ensure_db_ready(&paths)?;
336    let conn = open_ro(&paths.db)?;
337
338    let vec_memories_exists: bool = conn
339        .query_row(
340            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='vec_memories'",
341            [],
342            |r| r.get::<_, i64>(0).map(|v| v > 0),
343        )
344        .unwrap_or(false);
345    let (total_rows, orphaned) = if vec_memories_exists {
346        let total: i64 = conn
347            .query_row("SELECT COUNT(*) FROM vec_memories", [], |r| r.get(0))
348            .unwrap_or(0);
349        let orph: i64 = conn
350            .query_row(
351                "SELECT COUNT(*) FROM vec_memories v
352                 LEFT JOIN memories m ON m.id = v.memory_id
353                 WHERE m.id IS NULL",
354                [],
355                |r| r.get(0),
356            )
357            .unwrap_or(0);
358        (total, orph)
359    } else {
360        (0, 0)
361    };
362    let coverage_percent = if total_rows > 0 {
363        ((total_rows - orphaned) as f64 / total_rows as f64) * 100.0
364    } else {
365        100.0
366    };
367
368    let vec_entities_rows = if vec_table_exists(&conn, "vec_entities") {
369        conn.query_row("SELECT COUNT(*) FROM vec_entities", [], |r| r.get(0))
370            .ok()
371    } else {
372        None
373    };
374    let vec_chunks_rows = if vec_table_exists(&conn, "vec_chunks") {
375        conn.query_row("SELECT COUNT(*) FROM vec_chunks", [], |r| r.get(0))
376            .ok()
377    } else {
378        None
379    };
380    let fts_memories_rows = conn
381        .query_row("SELECT COUNT(*) FROM fts_memories", [], |r| r.get(0))
382        .unwrap_or(0);
383
384    output::emit_json(&VecStatsResponse {
385        total_rows,
386        orphaned,
387        coverage_percent,
388        vec_entities_rows,
389        vec_chunks_rows,
390        fts_memories_rows,
391        elapsed_ms: start.elapsed().as_millis() as u64,
392    })?;
393    Ok(())
394}
395
396fn vec_table_exists(conn: &rusqlite::Connection, name: &str) -> bool {
397    conn.query_row(
398        "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name=?1",
399        rusqlite::params![name],
400        |r| r.get::<_, i64>(0).map(|v| v > 0),
401    )
402    .unwrap_or(false)
403}
404
// Serialization-shape tests for the three JSON response structs. They build
// the structs by hand and never open a database.
#[cfg(test)]
mod tests {
    use super::*;

    // Every field of the orphan-list response appears in the JSON, and
    // `items` is an array even when empty.
    #[test]
    fn vec_orphan_list_response_serializes_all_fields() {
        let resp = VecOrphanListResponse {
            action: "orphan_list".into(),
            count: 0,
            items: Vec::new(),
            elapsed_ms: 5,
        };
        let v = serde_json::to_value(&resp).unwrap();
        assert_eq!(v["action"], "orphan_list");
        assert_eq!(v["count"], 0i64);
        assert_eq!(v["elapsed_ms"], 5u64);
        assert!(v["items"].is_array());
    }

    // The dry-run flag and the zero delete count round-trip into the JSON.
    #[test]
    fn vec_purge_orphan_response_serializes_dry_run_flag() {
        let resp = VecPurgeOrphanResponse {
            action: "purge_orphan_dry_run".into(),
            deleted: 0,
            deleted_entities: 0,
            deleted_chunks: 0,
            dry_run: true,
            elapsed_ms: 1,
        };
        let v = serde_json::to_value(&resp).unwrap();
        assert_eq!(v["dry_run"], true);
        assert_eq!(v["deleted"], 0i64);
    }

    // NOTE: despite the name, this does not exercise the coverage arithmetic
    // in `run_stats`; `coverage_percent` is supplied directly. It checks that
    // the value is serialized as given and that a `None` row count is omitted
    // from the JSON while a `Some` one is present.
    #[test]
    fn vec_stats_response_computes_coverage() {
        let resp = VecStatsResponse {
            total_rows: 100,
            orphaned: 25,
            coverage_percent: 75.0,
            vec_entities_rows: Some(50),
            vec_chunks_rows: None,
            fts_memories_rows: 100,
            elapsed_ms: 10,
        };
        let v = serde_json::to_value(&resp).unwrap();
        assert_eq!(v["coverage_percent"], 75.0);
        assert_eq!(v["vec_entities_rows"], 50i64);
        assert!(v.get("vec_chunks_rows").is_none());
    }
}