Skip to main content

mimir_graph/
store_graph.rs

1//! Persist extraction results as nodes/edges, incrementally.
2//!
3//! Symbol identity across re-extraction: blake3(project | path | qualified
4//! | kind) stored in meta.stable_id — the row (and its ULID, and every
5//! memory link pointing at it) survives line shifts and body edits.
6//! Line numbers live in span columns, never in identity.
7
8use std::collections::{HashMap, HashSet};
9use std::path::Path;
10
11use mimir_core::error::{Error, Result};
12use mimir_core::model::{now_unix, Kind, NewNode, Node, Rel};
13use mimir_core::store::{self, row_to_node, NODE_COLS};
14use rusqlite::{params, Connection, OptionalExtension};
15
16use crate::extract::{self, FileExtract};
17use crate::languages::Lang;
18
/// Counters describing what a single [`update`] pass did.
///
/// `Clone` is derived so callers can keep a snapshot of one pass (for
/// logging or aggregation) while continuing to use the original value.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct GraphStats {
    /// Files found by the walk whose language was recognised.
    pub files_seen: usize,
    /// Files that were re-extracted and persisted during this pass.
    pub files_indexed: usize,
    /// Files skipped because the stored mtime+size, or the stored content
    /// hash, still matched.
    pub unchanged: usize,
    /// Live file rows soft-deleted because their path was not seen on disk.
    pub removed: usize,
    /// Symbols upserted (counted once per extracted symbol).
    pub symbols: usize,
    /// Call edges written with `resolved = true`.
    pub calls_resolved: usize,
    /// Call edges written with `resolved = false` (a few ambiguous
    /// same-name candidates, weight split between them).
    pub calls_heuristic: usize,
    /// File→file import edges written.
    pub imports: usize,
}
30
31pub fn stable_id(project_id: i64, rel_path: &str, qualified: &str, kind: &str) -> String {
32    blake3::hash(format!("{project_id}|{rel_path}|{qualified}|{kind}").as_bytes())
33        .to_hex()
34        .to_string()
35}
36
37/// Build or incrementally update the code graph for a project rooted at
38/// `root`. mtime+size short-circuit, blake3 change detection, one
39/// transaction per file; calls re-resolved only for changed files.
40pub fn update(conn: &mut Connection, project: &Node, root: &Path) -> Result<GraphStats> {
41    let mut stats = GraphStats::default();
42    let mut seen: HashSet<String> = HashSet::new();
43    let mut changed_files: Vec<(i64, String, FileExtract)> = Vec::new();
44
45    for entry in ignore::WalkBuilder::new(root).build() {
46        let entry = match entry {
47            Ok(e) => e,
48            Err(err) => {
49                tracing::warn!(%err, "skipping unreadable entry");
50                continue;
51            }
52        };
53        if !entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
54            continue;
55        }
56        let path = entry.path();
57        let rel = path
58            .strip_prefix(root)
59            .unwrap_or(path)
60            .to_string_lossy()
61            .replace('\\', "/");
62        let Some(lang) = Lang::from_path(&rel) else {
63            continue;
64        };
65        seen.insert(rel.clone());
66        stats.files_seen += 1;
67
68        let meta = entry
69            .metadata()
70            .map_err(|e| Error::Invalid(format!("stat {rel}: {e}")))?;
71        let mtime = meta
72            .modified()
73            .ok()
74            .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
75            .map(|d| d.as_secs() as i64)
76            .unwrap_or(0);
77        let size = meta.len() as i64;
78
79        let existing = code_file(conn, project.id, &rel)?;
80        if let Some(f) = &existing {
81            if f.deleted_at.is_none()
82                && f.meta.get("mtime").and_then(|v| v.as_i64()) == Some(mtime)
83                && f.meta.get("size").and_then(|v| v.as_i64()) == Some(size)
84            {
85                stats.unchanged += 1;
86                continue;
87            }
88        }
89
90        let raw = std::fs::read(path).map_err(|e| Error::io(path, e))?;
91        let content = String::from_utf8_lossy(&raw);
92        let hash = blake3::hash(content.as_bytes()).as_bytes().to_vec();
93        if let Some(f) = &existing {
94            if f.deleted_at.is_none() && f.content_hash.as_deref() == Some(&hash[..]) {
95                conn.execute(
96                    "UPDATE node SET meta = json_set(meta, '$.mtime', ?2, '$.size', ?3),
97                                     updated_at = ?4 WHERE id = ?1",
98                    params![f.id, mtime, size, now_unix()],
99                )?;
100                stats.unchanged += 1;
101                continue;
102            }
103        }
104
105        let fx = extract::extract(lang, &content);
106        let file_id = persist_file(
107            conn,
108            project.id,
109            existing.as_ref(),
110            &rel,
111            lang,
112            &hash,
113            mtime,
114            size,
115            &fx,
116            &mut stats,
117        )?;
118        changed_files.push((file_id, rel, fx));
119        stats.files_indexed += 1;
120    }
121
122    // Files gone from disk: soft-delete file + its symbols.
123    let mut stmt = conn.prepare(
124        "SELECT id, path FROM node
125         WHERE kind = 'file' AND project_id = ?1 AND collection_id IS NULL
126           AND deleted_at IS NULL",
127    )?;
128    let live: Vec<(i64, String)> = stmt
129        .query_map([project.id], |r| Ok((r.get(0)?, r.get(1)?)))?
130        .collect::<rusqlite::Result<_>>()?;
131    drop(stmt);
132    for (id, path) in live {
133        if !seen.contains(&path) {
134            conn.execute(
135                "UPDATE node SET deleted_at = ?2
136                 WHERE deleted_at IS NULL AND (id = ?1 OR parent_id = ?1)",
137                params![id, now_unix()],
138            )?;
139            stats.removed += 1;
140        }
141    }
142
143    resolve_calls(conn, project.id, &changed_files, &mut stats)?;
144    Ok(stats)
145}
146
147fn code_file(conn: &Connection, project_id: i64, rel: &str) -> Result<Option<Node>> {
148    Ok(conn
149        .query_row(
150            &format!(
151                "SELECT {NODE_COLS} FROM node
152                 WHERE kind = 'file' AND project_id = ?1 AND path = ?2
153                   AND collection_id IS NULL"
154            ),
155            params![project_id, rel],
156            row_to_node,
157        )
158        .optional()?)
159}
160
161#[allow(clippy::too_many_arguments)]
162fn persist_file(
163    conn: &mut Connection,
164    project_id: i64,
165    existing: Option<&Node>,
166    rel: &str,
167    lang: Lang,
168    hash: &[u8],
169    mtime: i64,
170    size: i64,
171    fx: &FileExtract,
172    stats: &mut GraphStats,
173) -> Result<i64> {
174    let imports_json: Vec<serde_json::Value> = fx
175        .imports
176        .iter()
177        .map(|i| serde_json::json!({"local": i.local, "source": i.source}))
178        .collect();
179    let calls_json: Vec<serde_json::Value> = fx
180        .calls
181        .iter()
182        .filter(|c| !c.caller.is_empty())
183        .map(|c| serde_json::json!({"caller": c.caller, "callee": c.callee}))
184        .collect();
185    let file_meta = serde_json::json!({
186        "mtime": mtime, "size": size,
187        "imports": imports_json, "calls": calls_json,
188    });
189
190    let tx = conn.transaction()?;
191    let file_id = match existing {
192        Some(f) => {
193            tx.execute(
194                "UPDATE node SET content_hash = ?2, meta = ?3, lang = ?4,
195                                 updated_at = ?5, deleted_at = NULL WHERE id = ?1",
196                params![f.id, hash, file_meta.to_string(), lang.name(), now_unix()],
197            )?;
198            f.id
199        }
200        None => {
201            let mut new = NewNode::new(Kind::File);
202            new.title = Some(
203                Path::new(rel)
204                    .file_name()
205                    .map(|s| s.to_string_lossy().into_owned())
206                    .unwrap_or_else(|| rel.to_string()),
207            );
208            new.path = Some(rel.to_string());
209            new.lang = Some(lang.name().into());
210            new.project_id = Some(project_id);
211            new.content_hash = Some(hash.to_vec());
212            new.meta = Some(file_meta);
213            store::insert_node(&tx, new)?.id
214        }
215    };
216
217    // Upsert symbols by stable_id; collect survivors to prune the rest.
218    let mut kept: HashSet<i64> = HashSet::new();
219    for sym in &fx.symbols {
220        let sid = stable_id(project_id, rel, &sym.qualified, sym.kind);
221        let body = match &sym.doc {
222            Some(d) => format!("{}\n{d}", sym.signature),
223            None => sym.signature.clone(),
224        };
225        let meta = serde_json::json!({"stable_id": sid, "name": sym.name});
226        let existing_id: Option<i64> = tx
227            .query_row(
228                "SELECT id FROM node
229                 WHERE kind = 'symbol' AND json_extract(meta, '$.stable_id') = ?1",
230                [&sid],
231                |r| r.get(0),
232            )
233            .optional()?;
234        let id = match existing_id {
235            Some(id) => {
236                tx.execute(
237                    "UPDATE node SET title = ?2, body = ?3, subkind = ?4, path = ?5,
238                            span_start = ?6, span_end = ?7, content_hash = ?8, meta = ?9,
239                            lang = ?10, parent_id = ?11, updated_at = ?12, deleted_at = NULL
240                     WHERE id = ?1",
241                    params![
242                        id,
243                        sym.qualified,
244                        body,
245                        sym.kind,
246                        rel,
247                        sym.start_line as i64,
248                        sym.end_line as i64,
249                        blake3::hash(body.as_bytes()).as_bytes().to_vec(),
250                        meta.to_string(),
251                        lang.name(),
252                        file_id,
253                        now_unix()
254                    ],
255                )?;
256                id
257            }
258            None => {
259                let mut new = NewNode::new(Kind::Symbol);
260                new.subkind = Some(sym.kind.into());
261                new.title = Some(sym.qualified.clone());
262                new.body = Some(body.clone());
263                new.path = Some(rel.to_string());
264                new.lang = Some(lang.name().into());
265                new.project_id = Some(project_id);
266                new.parent_id = Some(file_id);
267                new.span_start = Some(sym.start_line as i64);
268                new.span_end = Some(sym.end_line as i64);
269                new.content_hash = Some(blake3::hash(body.as_bytes()).as_bytes().to_vec());
270                new.meta = Some(meta);
271                store::insert_node(&tx, new)?.id
272            }
273        };
274        kept.insert(id);
275        stats.symbols += 1;
276    }
277    // Symbols that vanished from this file: hard-delete (derived data;
278    // edges cascade, embeddings cascade).
279    {
280        let mut stmt =
281            tx.prepare("SELECT id FROM node WHERE kind = 'symbol' AND parent_id = ?1")?;
282        let all: Vec<i64> = stmt
283            .query_map([file_id], |r| r.get(0))?
284            .collect::<rusqlite::Result<_>>()?;
285        drop(stmt);
286        for id in all {
287            if !kept.contains(&id) {
288                tx.execute("DELETE FROM node WHERE id = ?1", [id])?;
289            }
290        }
291    }
292    tx.commit()?;
293    Ok(file_id)
294}
295
/// A project symbol as seen by the call resolver: just the columns needed
/// to match callers and callees.
struct SymRef {
    /// Row id of the symbol node.
    id: i64,
    /// Value of `meta.name` (empty when absent).
    name: String,
    /// The node's `title` column, which holds the qualified name (empty when NULL).
    qualified: String,
    /// The node's `path` column (empty when NULL).
    path: String,
}
303
304/// Re-resolve call/import edges for the changed files only. Unchanged
305/// symbols keep their ids, so inbound edges stay valid automatically.
306fn resolve_calls(
307    conn: &Connection,
308    project_id: i64,
309    changed: &[(i64, String, FileExtract)],
310    stats: &mut GraphStats,
311) -> Result<()> {
312    if changed.is_empty() {
313        return Ok(());
314    }
315    // Project-wide symbol table (one query, in-memory buckets).
316    let mut stmt = conn.prepare(
317        "SELECT id, json_extract(meta, '$.name'), title, path FROM node
318         WHERE kind = 'symbol' AND project_id = ?1 AND deleted_at IS NULL",
319    )?;
320    let symbols: Vec<SymRef> = stmt
321        .query_map([project_id], |r| {
322            Ok(SymRef {
323                id: r.get(0)?,
324                name: r.get::<_, Option<String>>(1)?.unwrap_or_default(),
325                qualified: r.get::<_, Option<String>>(2)?.unwrap_or_default(),
326                path: r.get::<_, Option<String>>(3)?.unwrap_or_default(),
327            })
328        })?
329        .collect::<rusqlite::Result<_>>()?;
330    drop(stmt);
331
332    let mut by_name: HashMap<&str, Vec<&SymRef>> = HashMap::new();
333    for s in &symbols {
334        by_name.entry(s.name.as_str()).or_default().push(s);
335    }
336    let mut by_file_qualified: HashMap<(&str, &str), i64> = HashMap::new();
337    for s in &symbols {
338        by_file_qualified.insert((s.path.as_str(), s.qualified.as_str()), s.id);
339    }
340    let file_paths: HashSet<&str> = {
341        let mut set = HashSet::new();
342        for s in &symbols {
343            set.insert(s.path.as_str());
344        }
345        set
346    };
347    let file_ids: HashMap<String, i64> = {
348        let mut stmt = conn.prepare(
349            "SELECT path, id FROM node
350             WHERE kind = 'file' AND project_id = ?1 AND collection_id IS NULL
351               AND deleted_at IS NULL",
352        )?;
353        let rows: Vec<(String, i64)> = stmt
354            .query_map([project_id], |r| Ok((r.get(0)?, r.get(1)?)))?
355            .collect::<rusqlite::Result<_>>()?;
356        rows.into_iter().collect()
357    };
358
359    let tx = conn.unchecked_transaction()?;
360    for (file_id, rel, fx) in changed {
361        // Wipe edges originating from this file's symbols + its imports.
362        tx.execute(
363            "DELETE FROM edge WHERE rel = 'calls' AND src IN
364               (SELECT id FROM node WHERE kind = 'symbol' AND parent_id = ?1)",
365            [file_id],
366        )?;
367        tx.execute(
368            "DELETE FROM edge WHERE rel = 'imports' AND src = ?1",
369            [file_id],
370        )?;
371
372        // Import map for tier-2 resolution + file→file import edges.
373        let mut import_target: HashMap<&str, String> = HashMap::new();
374        for imp in &fx.imports {
375            if let Some(target) = resolve_import(rel, &imp.source, &file_paths) {
376                import_target.insert(imp.local.as_str(), target.clone());
377                if let Some(dst) = file_ids.get(&target) {
378                    if *dst != *file_id {
379                        store::link(&tx, *file_id, *dst, Rel::Imports, 1.0)?;
380                        stats.imports += 1;
381                    }
382                }
383            }
384        }
385
386        for call in &fx.calls {
387            if call.caller.is_empty() {
388                continue; // top-level statements have no source symbol
389            }
390            let Some(&src) = by_file_qualified.get(&(rel.as_str(), call.caller.as_str())) else {
391                continue;
392            };
393            let candidates = by_name.get(call.callee.as_str());
394            let Some(candidates) = candidates else {
395                continue;
396            };
397            // Tier 1: same file.
398            if let Some(c) = candidates.iter().find(|c| c.path == *rel && c.id != src) {
399                link_call(&tx, src, c.id, 1.0, true)?;
400                stats.calls_resolved += 1;
401                continue;
402            }
403            // Tier 2: imported name → resolved file.
404            if let Some(target) = import_target.get(call.callee.as_str()) {
405                if let Some(c) = candidates.iter().find(|c| c.path == *target) {
406                    link_call(&tx, src, c.id, 1.0, true)?;
407                    stats.calls_resolved += 1;
408                    continue;
409                }
410            }
411            // Tier 3: global by name — honest about ambiguity.
412            let global: Vec<&&SymRef> = candidates.iter().filter(|c| c.id != src).collect();
413            match global.len() {
414                0 => {}
415                1 => {
416                    link_call(&tx, src, global[0].id, 0.8, true)?;
417                    stats.calls_resolved += 1;
418                }
419                n if n <= 3 => {
420                    for c in &global {
421                        link_call(&tx, src, c.id, 1.0 / n as f64, false)?;
422                        stats.calls_heuristic += 1;
423                    }
424                }
425                _ => {} // >3 candidates: too ambiguous to be useful
426            }
427        }
428    }
429    tx.commit()?;
430    Ok(())
431}
432
433fn link_call(conn: &Connection, src: i64, dst: i64, weight: f64, resolved: bool) -> Result<()> {
434    conn.execute(
435        "INSERT INTO edge (src, dst, rel, weight, meta, created_at)
436         VALUES (?1, ?2, 'calls', ?3, json_object('resolved', ?4), ?5)
437         ON CONFLICT(src, dst, rel) DO UPDATE SET
438           weight = excluded.weight, meta = excluded.meta",
439        params![src, dst, weight, resolved, now_unix()],
440    )?;
441    Ok(())
442}
443
/// Best-effort import-source → project-file resolution.
///
/// `importer` is the project-relative path of the importing file, `source`
/// the import specifier as written, and `files` the known project-relative
/// file paths (forward slashes). Returns the matched path, or `None`.
///
/// Shapes, tried in this order:
/// - JS/TS relative (`./x`, `../y/z`): joined onto the importer's directory
///   and probed as `.ts`, `.tsx`, `.js`, `.jsx`, `/index.ts`, `/index.js`,
///   then verbatim;
/// - Python relative (`.util`, `..pkg.mod`): one leading dot is the
///   importer's directory, each further dot climbs one level; probed as
///   `.py` then `/__init__.py`;
/// - Rust paths (anything containing `::`): `crate`/`super`/`self` segments
///   are dropped and the first three, two, then one remaining segments are
///   matched as a `.rs` path suffix;
/// - Python absolute (`a.b.c`, no `/`): exact `a/b/c.py` or
///   `a/b/c/__init__.py`, else `a/b/c.py` as a path suffix;
/// - anything else (Go package path, bare module): a file whose parent
///   directory is named like the last `/` segment, or exactly `<last>.py`.
///
/// Suffix matches only count on a path-component boundary (`util.rs` does
/// not match `myutil.rs`). When several files qualify, the shortest path
/// wins, ties broken lexicographically, so the result never depends on
/// `HashSet` iteration order.
fn resolve_import(importer: &str, source: &str, files: &HashSet<&str>) -> Option<String> {
    /// True when `path` is `suffix` itself or ends with `/suffix`.
    fn has_path_suffix(path: &str, suffix: &str) -> bool {
        path.strip_suffix(suffix)
            .map_or(false, |head| head.is_empty() || head.ends_with('/'))
    }

    /// Deterministic pick among the files accepted by `keep`: shortest
    /// path first, then lexicographic order.
    fn pick<'a>(files: &HashSet<&'a str>, keep: impl Fn(&str) -> bool) -> Option<String> {
        files
            .iter()
            .copied()
            .filter(|f| keep(*f))
            .min_by_key(|f| (f.len(), *f))
            .map(|f| f.to_string())
    }

    let dir = Path::new(importer).parent().unwrap_or(Path::new(""));
    // First candidate, in the given priority order, that is a known file.
    let try_paths = |bases: Vec<String>| -> Option<String> {
        bases.into_iter().find(|b| files.contains(b.as_str()))
    };

    if source.starts_with('.') {
        if source.contains("::") {
            // A leading dot combined with `::` is none of the shapes
            // handled here; give up rather than guess.
            return None;
        }
        // JS/TS relative ("./x", "../y/z").
        if source.starts_with("./") || source.starts_with("../") {
            let joined = normalize(&dir.join(source));
            return try_paths(vec![
                format!("{joined}.ts"),
                format!("{joined}.tsx"),
                format!("{joined}.js"),
                format!("{joined}.jsx"),
                format!("{joined}/index.ts"),
                format!("{joined}/index.js"),
                joined.clone(),
            ]);
        }
        // Python relative: ".util" / "..pkg.mod". The dots are ASCII, so the
        // char count is also the byte offset of the module part.
        let dots = source.chars().take_while(|c| *c == '.').count();
        let module = &source[dots..];
        let mut base = dir.to_path_buf();
        for _ in 1..dots {
            base = base.parent().map(Path::to_path_buf).unwrap_or_default();
        }
        let joined = normalize(&base.join(module.replace('.', "/")));
        return try_paths(vec![
            format!("{joined}.py"),
            format!("{joined}/__init__.py"),
        ]);
    }

    if source.contains("::") {
        // Rust: crate::a::b / super::x — match by path suffix on segments.
        let segs: Vec<&str> = source
            .split("::")
            .filter(|s| !matches!(*s, "crate" | "super" | "self"))
            .collect();
        if segs.is_empty() {
            return None;
        }
        // The trailing segment is usually an item rather than a module, so
        // try the longest segment prefix first and shorten from there.
        return (1..=segs.len().min(3)).rev().find_map(|take| {
            let suffix = format!("{}.rs", segs[..take].join("/"));
            pick(files, |f| has_path_suffix(f, &suffix))
        });
    }

    if source.contains('.') && !source.contains('/') {
        // Python absolute module: a.b.c
        let joined = source.replace('.', "/");
        let module_file = format!("{joined}.py");
        return try_paths(vec![
            module_file.clone(),
            format!("{joined}/__init__.py"),
        ])
        .or_else(|| pick(files, |f| has_path_suffix(f, &module_file)));
    }

    // Go package path / bare python module: match a file in a dir (or with
    // a name) equal to the last segment.
    let last = source.rsplit('/').next().unwrap_or(source);
    let bare_module = format!("{last}.py");
    pick(files, |f| {
        let in_named_dir = Path::new(f)
            .parent()
            .and_then(|p| p.file_name())
            .map_or(false, |d| d.to_string_lossy() == last);
        in_named_dir || f == bare_module
    })
}

/// Lexically collapse `.` and `..` components and render the path with `/`
/// separators.
///
/// `..` removes the previously kept component and is ignored when there is
/// none; root and prefix components are dropped. The filesystem is never
/// consulted.
fn normalize(p: &Path) -> String {
    use std::path::Component;

    let mut kept: Vec<&std::ffi::OsStr> = Vec::new();
    for component in p.components() {
        match component {
            Component::ParentDir => {
                kept.pop();
            }
            Component::Normal(name) => kept.push(name),
            Component::CurDir | Component::RootDir | Component::Prefix(_) => {}
        }
    }
    kept.iter()
        .map(|name| name.to_string_lossy())
        .collect::<Vec<_>>()
        .join("/")
}