Skip to main content

ckg_storage/store/
insert.rs

//! Bulk insert operations (`put_symbols`, `put_edges`) and the index GC sweep (`gc_symbols_not_in`).
2
3use std::collections::BTreeMap;
4
5use ckg_core::{Edge, Result, Symbol};
6use cozo::{DataValue, ScriptMutability};
7
8use crate::cozo_compat::CozoErrorKind;
9use super::map_err;
10use super::Storage;
11
12/// Serialize a `Symbol` into a Cozo row (list of DataValues).
13pub(super) fn symbol_to_row(s: &Symbol) -> DataValue {
14    DataValue::List(vec![
15        DataValue::from(s.id.as_str()),
16        DataValue::from(s.qname.as_str()),
17        DataValue::from(s.name.as_str()),
18        DataValue::from(s.kind.as_str()),
19        DataValue::from(s.file.as_str()),
20        DataValue::from(s.line as i64),
21        DataValue::from(s.col as i64),
22        DataValue::Bool(s.is_public),
23        DataValue::from(s.doc.as_str()),
24        DataValue::from(s.hash.as_str()),
25    ])
26}
27
28/// Serialize an `Edge` into a Cozo row, optionally including the confidence
29/// column (relations with confidence: `Calls`, `Imports`, `Extends`,
30/// `Implements`, `Awaits`).
31pub(super) fn edge_to_row(e: &Edge, with_conf: bool) -> DataValue {
32    if with_conf {
33        DataValue::List(vec![
34            DataValue::from(e.src.as_str()),
35            DataValue::from(e.dst.as_str()),
36            DataValue::from(e.confidence as f64),
37        ])
38    } else {
39        DataValue::List(vec![
40            DataValue::from(e.src.as_str()),
41            DataValue::from(e.dst.as_str()),
42        ])
43    }
44}
45
46impl Storage {
47    /// Bulk insert symbols using `:put` with a `$rows` parameter so user data
48    /// is bound, not pasted into the script (avoids quote-escape pitfalls).
49    pub fn put_symbols(&self, symbols: &[Symbol]) -> Result<()> {
50        const SCRIPT: &str = "
51?[id, qname, name, kind, file, line, col, is_public, doc, hash] <- $rows
52:put Symbol {id => qname, name, kind, file, line, col, is_public, doc, hash}
53";
54        for chunk in symbols.chunks(1000) {
55            let rows: Vec<DataValue> = chunk.iter().map(symbol_to_row).collect();
56            let mut params = BTreeMap::new();
57            params.insert("rows".into(), DataValue::List(rows));
58            self.db
59                .run_script(SCRIPT, params, ScriptMutability::Mutable)
60                .map_err(map_err)?;
61        }
62        Ok(())
63    }
64
65    /// Index GC: delete every symbol whose `file` is NOT in `live_files`.
66    /// Also reaps every inbound/outbound edge from those symbols.
67    ///
68    /// Use case: during `ckg index`, the indexer walks the working tree and
69    /// produces a fresh symbol set for the files it sees. Files that were
70    /// indexed previously but have since been renamed or deleted leave
71    /// phantom Symbol rows that pollute `dead-code`, `orphan-calls`, and
72    /// blast-radius results. Without this sweep, removing a source file
73    /// silently leaves its symbols behind forever.
74    ///
75    /// `live_files` is the set of file paths the current index pass
76    /// produced symbols for. Anything in the Symbol relation with a
77    /// `file` not in this set is treated as phantom and dropped.
78    ///
79    /// Returns the count of deleted Symbol rows (edges are deleted as a
80    /// side-effect via Cozo's referential rules — see comment below).
81    pub fn gc_symbols_not_in(&self, live_files: &std::collections::HashSet<String>) -> Result<usize> {
82        // 1. Find phantom symbol ids.
83        let rows = self
84            .db
85            .run_script(
86                "?[id, file] := *Symbol{id, file}",
87                BTreeMap::new(),
88                ScriptMutability::Immutable,
89            )
90            .map_err(map_err)?;
91        let phantom_ids: Vec<DataValue> = rows
92            .rows
93            .into_iter()
94            .filter_map(|r| {
95                let id = match r.first() {
96                    Some(DataValue::Str(s)) => s.to_string(),
97                    _ => return None,
98                };
99                let file = match r.get(1) {
100                    Some(DataValue::Str(s)) => s.to_string(),
101                    _ => return None,
102                };
103                if live_files.contains(&file) {
104                    None
105                } else {
106                    Some(DataValue::from(id.as_str()))
107                }
108            })
109            .collect();
110        if phantom_ids.is_empty() {
111            return Ok(0);
112        }
113        let count = phantom_ids.len();
114        // 2. Delete from Symbol and every edge relation that references
115        //    these ids by `src` or `dst`. Cozo doesn't cascade, so we
116        //    walk each relation explicitly.
117        const EDGE_RELS: &[&str] = &[
118            "Calls", "Imports", "Extends", "Implements", "Defines", "Documents", "Tests", "Awaits",
119        ];
120        for chunk in phantom_ids.chunks(1000) {
121            let mut params = BTreeMap::new();
122            params.insert("ids".into(), DataValue::List(chunk.to_vec()));
123            // Symbol :rm by id.
124            self.db
125                .run_script(
126                    "?[id] := id in $ids\n:rm Symbol {id}",
127                    params.clone(),
128                    ScriptMutability::Mutable,
129                )
130                .map_err(map_err)?;
131            // Edges: remove every row where src or dst matches. We do
132            // src and dst as two passes to keep the Datalog simple.
133            for rel in EDGE_RELS {
134                let script_src = format!(
135                    "?[src, dst] := *{rel}{{src, dst}}, src in $ids\n:rm {rel} {{src, dst}}\n"
136                );
137                let script_dst = format!(
138                    "?[src, dst] := *{rel}{{src, dst}}, dst in $ids\n:rm {rel} {{src, dst}}\n"
139                );
140                // Some relations have a confidence column — the :rm
141                // matches on the primary key so the schema variant
142                // doesn't matter. RelationMissing is swallowed
143                // (idempotent recovery from old schemas that lack this
144                // relation). All other error classes (I/O, corruption,
145                // lock contention) are propagated so they don't silently
146                // leave dangling edges in the graph.
147                for script in [&script_src, &script_dst] {
148                    if let Err(e) = self
149                        .db
150                        .run_script(script, params.clone(), ScriptMutability::Mutable)
151                    {
152                        let mapped = map_err(e);
153                        if !matches!(CozoErrorKind::of(&mapped), CozoErrorKind::RelationMissing) {
154                            return Err(mapped);
155                        }
156                    }
157                }
158            }
159        }
160        tracing::info!(deleted = count, "index GC: reaped phantom symbols");
161        Ok(count)
162    }
163
164    /// Bulk insert edges. Routes by `EdgeKind` to the right relation.
165    ///
166    /// `by_rel` groups edges by their Cozo relation name (`&'static str` from
167    /// `EdgeKind::as_relation()`). An enum-keyed array would be marginally
168    /// faster but `EdgeKind` variants are few and the BTreeMap overhead is
169    /// negligible — see L5 tracking comment. Deferring until `EdgeKind` gains
170    /// a stable `#[repr(u8)]` discriminant.
171    pub fn put_edges(&self, edges: &[Edge]) -> Result<()> {
172        let mut by_rel: BTreeMap<&'static str, Vec<&Edge>> = BTreeMap::new();
173        for e in edges {
174            by_rel.entry(e.kind.as_relation()).or_default().push(e);
175        }
176        for (rel, list) in by_rel {
177            let with_conf = matches!(
178                rel,
179                "Calls" | "Imports" | "Extends" | "Implements" | "Awaits"
180            );
181            let script = if with_conf {
182                format!("?[src, dst, confidence] <- $rows\n:put {rel} {{src, dst => confidence}}\n")
183            } else {
184                format!("?[src, dst] <- $rows\n:put {rel} {{src, dst}}\n")
185            };
186            for chunk in list.chunks(1000) {
187                let rows: Vec<DataValue> =
188                    chunk.iter().map(|e| edge_to_row(e, with_conf)).collect();
189                let mut params = BTreeMap::new();
190                params.insert("rows".into(), DataValue::List(rows));
191                self.db
192                    .run_script(&script, params, ScriptMutability::Mutable)
193                    .map_err(map_err)?;
194            }
195        }
196        Ok(())
197    }
198}