Skip to main content

mimir_graph/
store_graph.rs

//! Persist extraction results as nodes/edges, incrementally.
//!
//! Symbol identity across re-extraction: blake3(project | path | qualified
//! | kind) stored in meta.stable_id — the row (and its ULID, and every
//! memory link pointing at it) survives line shifts and body edits.
//! Line numbers live in span columns, never in identity.
7
8use std::collections::{HashMap, HashSet};
9use std::path::Path;
10
11use mimir_core::error::{Error, Result};
12use mimir_core::model::{now_unix, Kind, NewNode, Node, Rel};
13use mimir_core::store::{self, row_to_node, NODE_COLS};
14use rusqlite::{params, Connection, OptionalExtension};
15
16use crate::extract::{self, FileExtract};
17use crate::languages::Lang;
18
/// Counters describing what a single graph update did.
///
/// All counters start at zero (`Default`) and are only ever incremented
/// while the update runs.
#[derive(Debug, Default, Eq, PartialEq)]
pub struct GraphStats {
    /// Walked files whose path maps to a recognised language.
    pub files_seen: usize,
    /// Files that were re-extracted and persisted during this update.
    pub files_indexed: usize,
    /// Files skipped because mtime+size or the content hash still matched.
    pub unchanged: usize,
    /// File rows soft-deleted because the file was not seen on disk.
    pub removed: usize,
    /// Symbols inserted or updated while persisting changed files.
    pub symbols: usize,
    /// Call edges written as resolved (same file, imported file, or a
    /// single global candidate).
    pub calls_resolved: usize,
    /// Call edges written as unresolved guesses when two or three global
    /// candidates share the callee name.
    pub calls_heuristic: usize,
    /// File-to-file import edges written.
    pub imports: usize,
}
30
31pub fn stable_id(project_id: i64, rel_path: &str, qualified: &str, kind: &str) -> String {
32    blake3::hash(format!("{project_id}|{rel_path}|{qualified}|{kind}").as_bytes())
33        .to_hex()
34        .to_string()
35}
36
37/// Build or incrementally update the code graph for a project rooted at
38/// `root`. mtime+size short-circuit, blake3 change detection, one
39/// transaction for the whole update; calls re-resolved only for changed
40/// files.
41pub fn update(conn: &mut Connection, project: &Node, root: &Path) -> Result<GraphStats> {
42    let mut stats = GraphStats::default();
43    let mut seen: HashSet<String> = HashSet::new();
44    let mut changed_files: Vec<(i64, String, FileExtract)> = Vec::new();
45
46    // One transaction for the whole update: file/symbol upserts and the
47    // call-edge rebuild commit together. A crash mid-update used to leave
48    // committed file hashes with missing call edges — and the hash
49    // short-circuit then skipped those files forever. IMMEDIATE so a
50    // concurrent writer waits (busy_timeout) rather than erroring on a
51    // DEFERRED read→write lock upgrade.
52    let tx = conn.transaction_with_behavior(rusqlite::TransactionBehavior::Immediate)?;
53
54    for entry in ignore::WalkBuilder::new(root).build() {
55        let entry = match entry {
56            Ok(e) => e,
57            Err(err) => {
58                tracing::warn!(%err, "skipping unreadable entry");
59                continue;
60            }
61        };
62        if !entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
63            continue;
64        }
65        let path = entry.path();
66        let rel = path
67            .strip_prefix(root)
68            .unwrap_or(path)
69            .to_string_lossy()
70            .replace('\\', "/");
71        let Some(lang) = Lang::from_path(&rel) else {
72            continue;
73        };
74        seen.insert(rel.clone());
75        stats.files_seen += 1;
76
77        let meta = entry
78            .metadata()
79            .map_err(|e| Error::Invalid(format!("stat {rel}: {e}")))?;
80        // -1 = mtime unavailable (some network/virtual filesystems). It
81        // must never satisfy the fast path: 0==0 would skip changed files
82        // forever; -1 falls through to the content-hash check instead.
83        let mtime = meta
84            .modified()
85            .ok()
86            .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
87            .map(|d| d.as_secs() as i64)
88            .unwrap_or(-1);
89        let size = meta.len() as i64;
90
91        let existing = code_file(&tx, project.id, &rel)?;
92        if let Some(f) = &existing {
93            if mtime >= 0
94                && f.deleted_at.is_none()
95                && f.meta.get("mtime").and_then(|v| v.as_i64()) == Some(mtime)
96                && f.meta.get("size").and_then(|v| v.as_i64()) == Some(size)
97            {
98                stats.unchanged += 1;
99                continue;
100            }
101        }
102
103        let raw = std::fs::read(path).map_err(|e| Error::io(path, e))?;
104        let content = String::from_utf8_lossy(&raw);
105        let hash = blake3::hash(content.as_bytes()).as_bytes().to_vec();
106        if let Some(f) = &existing {
107            if f.deleted_at.is_none() && f.content_hash.as_deref() == Some(&hash[..]) {
108                tx.execute(
109                    "UPDATE node SET meta = json_set(meta, '$.mtime', ?2, '$.size', ?3),
110                                     updated_at = ?4 WHERE id = ?1",
111                    params![f.id, mtime, size, now_unix()],
112                )?;
113                stats.unchanged += 1;
114                continue;
115            }
116        }
117
118        let fx = extract::extract(lang, &content);
119        let file_id = persist_file(
120            &tx,
121            project.id,
122            existing.as_ref(),
123            &rel,
124            lang,
125            &hash,
126            mtime,
127            size,
128            &fx,
129            &mut stats,
130        )?;
131        changed_files.push((file_id, rel, fx));
132        stats.files_indexed += 1;
133    }
134
135    // Files gone from disk: soft-delete file + its symbols.
136    let mut stmt = tx.prepare(
137        "SELECT id, path FROM node
138         WHERE kind = 'file' AND project_id = ?1 AND collection_id IS NULL
139           AND deleted_at IS NULL",
140    )?;
141    let live: Vec<(i64, String)> = stmt
142        .query_map([project.id], |r| Ok((r.get(0)?, r.get(1)?)))?
143        .collect::<rusqlite::Result<_>>()?;
144    drop(stmt);
145    for (id, path) in live {
146        if !seen.contains(&path) {
147            tx.execute(
148                "UPDATE node SET deleted_at = ?2
149                 WHERE deleted_at IS NULL AND (id = ?1 OR parent_id = ?1)",
150                params![id, now_unix()],
151            )?;
152            stats.removed += 1;
153        }
154    }
155
156    resolve_calls(&tx, project.id, &changed_files, &mut stats)?;
157    tx.commit()?;
158    Ok(stats)
159}
160
161fn code_file(conn: &Connection, project_id: i64, rel: &str) -> Result<Option<Node>> {
162    Ok(conn
163        .query_row(
164            &format!(
165                "SELECT {NODE_COLS} FROM node
166                 WHERE kind = 'file' AND project_id = ?1 AND path = ?2
167                   AND collection_id IS NULL"
168            ),
169            params![project_id, rel],
170            row_to_node,
171        )
172        .optional()?)
173}
174
175#[allow(clippy::too_many_arguments)]
176fn persist_file(
177    conn: &Connection,
178    project_id: i64,
179    existing: Option<&Node>,
180    rel: &str,
181    lang: Lang,
182    hash: &[u8],
183    mtime: i64,
184    size: i64,
185    fx: &FileExtract,
186    stats: &mut GraphStats,
187) -> Result<i64> {
188    let imports_json: Vec<serde_json::Value> = fx
189        .imports
190        .iter()
191        .map(|i| serde_json::json!({"local": i.local, "source": i.source}))
192        .collect();
193    let calls_json: Vec<serde_json::Value> = fx
194        .calls
195        .iter()
196        .filter(|c| !c.caller.is_empty())
197        .map(|c| serde_json::json!({"caller": c.caller, "callee": c.callee}))
198        .collect();
199    let file_meta = serde_json::json!({
200        "mtime": mtime, "size": size,
201        "imports": imports_json, "calls": calls_json,
202    });
203
204    let file_id = match existing {
205        Some(f) => {
206            conn.execute(
207                "UPDATE node SET content_hash = ?2, meta = ?3, lang = ?4,
208                                 updated_at = ?5, deleted_at = NULL WHERE id = ?1",
209                params![f.id, hash, file_meta.to_string(), lang.name(), now_unix()],
210            )?;
211            f.id
212        }
213        None => {
214            let mut new = NewNode::new(Kind::File);
215            new.title = Some(
216                Path::new(rel)
217                    .file_name()
218                    .map(|s| s.to_string_lossy().into_owned())
219                    .unwrap_or_else(|| rel.to_string()),
220            );
221            new.path = Some(rel.to_string());
222            new.lang = Some(lang.name().into());
223            new.project_id = Some(project_id);
224            new.content_hash = Some(hash.to_vec());
225            new.meta = Some(file_meta);
226            store::insert_node(conn, new)?.id
227        }
228    };
229
230    // Upsert symbols by stable_id; collect survivors to prune the rest.
231    let mut kept: HashSet<i64> = HashSet::new();
232    for sym in &fx.symbols {
233        let sid = stable_id(project_id, rel, &sym.qualified, sym.kind);
234        let body = match &sym.doc {
235            Some(d) => format!("{}\n{d}", sym.signature),
236            None => sym.signature.clone(),
237        };
238        let meta = serde_json::json!({"stable_id": sid, "name": sym.name});
239        let existing_id: Option<i64> = conn
240            .query_row(
241                "SELECT id FROM node
242                 WHERE kind = 'symbol' AND json_extract(meta, '$.stable_id') = ?1",
243                [&sid],
244                |r| r.get(0),
245            )
246            .optional()?;
247        let id = match existing_id {
248            Some(id) => {
249                conn.execute(
250                    "UPDATE node SET title = ?2, body = ?3, subkind = ?4, path = ?5,
251                            span_start = ?6, span_end = ?7, content_hash = ?8, meta = ?9,
252                            lang = ?10, parent_id = ?11, updated_at = ?12, deleted_at = NULL
253                     WHERE id = ?1",
254                    params![
255                        id,
256                        sym.qualified,
257                        body,
258                        sym.kind,
259                        rel,
260                        sym.start_line as i64,
261                        sym.end_line as i64,
262                        blake3::hash(body.as_bytes()).as_bytes().to_vec(),
263                        meta.to_string(),
264                        lang.name(),
265                        file_id,
266                        now_unix()
267                    ],
268                )?;
269                id
270            }
271            None => {
272                let mut new = NewNode::new(Kind::Symbol);
273                new.subkind = Some(sym.kind.into());
274                new.title = Some(sym.qualified.clone());
275                new.body = Some(body.clone());
276                new.path = Some(rel.to_string());
277                new.lang = Some(lang.name().into());
278                new.project_id = Some(project_id);
279                new.parent_id = Some(file_id);
280                new.span_start = Some(sym.start_line as i64);
281                new.span_end = Some(sym.end_line as i64);
282                new.content_hash = Some(blake3::hash(body.as_bytes()).as_bytes().to_vec());
283                new.meta = Some(meta);
284                store::insert_node(conn, new)?.id
285            }
286        };
287        kept.insert(id);
288        stats.symbols += 1;
289    }
290    // Symbols that vanished from this file: hard-delete (derived data;
291    // edges cascade, embeddings cascade).
292    {
293        let mut stmt =
294            conn.prepare("SELECT id FROM node WHERE kind = 'symbol' AND parent_id = ?1")?;
295        let all: Vec<i64> = stmt
296            .query_map([file_id], |r| r.get(0))?
297            .collect::<rusqlite::Result<_>>()?;
298        drop(stmt);
299        for id in all {
300            if !kept.contains(&id) {
301                conn.execute("DELETE FROM node WHERE id = ?1", [id])?;
302            }
303        }
304    }
305    Ok(file_id)
306}
307
/// One live project symbol, in the shape the call resolver needs.
struct SymRef {
    /// Row id of the symbol node.
    id: i64,
    /// Short name taken from `meta.name`; empty when the row has none.
    name: String,
    /// Qualified name, read from the row's `title` column.
    qualified: String,
    /// Project-relative path of the file the symbol belongs to.
    path: String,
}
315
316/// Re-resolve call/import edges for the changed files only. Unchanged
317/// symbols keep their ids, so inbound edges stay valid automatically.
318fn resolve_calls(
319    conn: &Connection,
320    project_id: i64,
321    changed: &[(i64, String, FileExtract)],
322    stats: &mut GraphStats,
323) -> Result<()> {
324    if changed.is_empty() {
325        return Ok(());
326    }
327    // Project-wide symbol table (one query, in-memory buckets).
328    let mut stmt = conn.prepare(
329        "SELECT id, json_extract(meta, '$.name'), title, path FROM node
330         WHERE kind = 'symbol' AND project_id = ?1 AND deleted_at IS NULL",
331    )?;
332    let symbols: Vec<SymRef> = stmt
333        .query_map([project_id], |r| {
334            Ok(SymRef {
335                id: r.get(0)?,
336                name: r.get::<_, Option<String>>(1)?.unwrap_or_default(),
337                qualified: r.get::<_, Option<String>>(2)?.unwrap_or_default(),
338                path: r.get::<_, Option<String>>(3)?.unwrap_or_default(),
339            })
340        })?
341        .collect::<rusqlite::Result<_>>()?;
342    drop(stmt);
343
344    let mut by_name: HashMap<&str, Vec<&SymRef>> = HashMap::new();
345    for s in &symbols {
346        // A NULL/empty name would bucket under "" and attract phantom
347        // call edges from any unresolvable callee.
348        if !s.name.is_empty() {
349            by_name.entry(s.name.as_str()).or_default().push(s);
350        }
351    }
352    let mut by_file_qualified: HashMap<(&str, &str), i64> = HashMap::new();
353    for s in &symbols {
354        by_file_qualified.insert((s.path.as_str(), s.qualified.as_str()), s.id);
355    }
356    let file_paths: HashSet<&str> = {
357        let mut set = HashSet::new();
358        for s in &symbols {
359            set.insert(s.path.as_str());
360        }
361        set
362    };
363    let file_ids: HashMap<String, i64> = {
364        let mut stmt = conn.prepare(
365            "SELECT path, id FROM node
366             WHERE kind = 'file' AND project_id = ?1 AND collection_id IS NULL
367               AND deleted_at IS NULL",
368        )?;
369        let rows: Vec<(String, i64)> = stmt
370            .query_map([project_id], |r| Ok((r.get(0)?, r.get(1)?)))?
371            .collect::<rusqlite::Result<_>>()?;
372        rows.into_iter().collect()
373    };
374
375    for (file_id, rel, fx) in changed {
376        // Wipe edges originating from this file's symbols + its imports.
377        conn.execute(
378            "DELETE FROM edge WHERE rel = 'calls' AND src IN
379               (SELECT id FROM node WHERE kind = 'symbol' AND parent_id = ?1)",
380            [file_id],
381        )?;
382        conn.execute(
383            "DELETE FROM edge WHERE rel = 'imports' AND src = ?1",
384            [file_id],
385        )?;
386
387        // Import map for tier-2 resolution + file→file import edges.
388        let mut import_target: HashMap<&str, String> = HashMap::new();
389        for imp in &fx.imports {
390            if let Some(target) = resolve_import(rel, &imp.source, &file_paths) {
391                import_target.insert(imp.local.as_str(), target.clone());
392                if let Some(dst) = file_ids.get(&target) {
393                    if *dst != *file_id {
394                        store::link(conn, *file_id, *dst, Rel::Imports, 1.0)?;
395                        stats.imports += 1;
396                    }
397                }
398            }
399        }
400
401        for call in &fx.calls {
402            if call.caller.is_empty() {
403                continue; // top-level statements have no source symbol
404            }
405            let Some(&src) = by_file_qualified.get(&(rel.as_str(), call.caller.as_str())) else {
406                continue;
407            };
408            let candidates = by_name.get(call.callee.as_str());
409            let Some(candidates) = candidates else {
410                continue;
411            };
412            // Tier 1: same file.
413            if let Some(c) = candidates.iter().find(|c| c.path == *rel && c.id != src) {
414                link_call(conn, src, c.id, 1.0, true)?;
415                stats.calls_resolved += 1;
416                continue;
417            }
418            // Tier 2: imported name → resolved file.
419            if let Some(target) = import_target.get(call.callee.as_str()) {
420                if let Some(c) = candidates.iter().find(|c| c.path == *target) {
421                    link_call(conn, src, c.id, 1.0, true)?;
422                    stats.calls_resolved += 1;
423                    continue;
424                }
425            }
426            // Tier 3: global by name — honest about ambiguity.
427            let global: Vec<&&SymRef> = candidates.iter().filter(|c| c.id != src).collect();
428            match global.len() {
429                0 => {}
430                1 => {
431                    link_call(conn, src, global[0].id, 0.8, true)?;
432                    stats.calls_resolved += 1;
433                }
434                n if n <= 3 => {
435                    for c in &global {
436                        link_call(conn, src, c.id, 1.0 / n as f64, false)?;
437                        stats.calls_heuristic += 1;
438                    }
439                }
440                _ => {} // >3 candidates: too ambiguous to be useful
441            }
442        }
443    }
444    Ok(())
445}
446
447fn link_call(conn: &Connection, src: i64, dst: i64, weight: f64, resolved: bool) -> Result<()> {
448    conn.execute(
449        "INSERT INTO edge (src, dst, rel, weight, meta, created_at)
450         VALUES (?1, ?2, 'calls', ?3, json_object('resolved', ?4), ?5)
451         ON CONFLICT(src, dst, rel) DO UPDATE SET
452           weight = excluded.weight, meta = excluded.meta",
453        params![src, dst, weight, resolved, now_unix()],
454    )?;
455    Ok(())
456}
457
/// True when `path` is exactly `suffix` or ends with `/suffix`, i.e. the
/// match starts on a path-segment boundary (`src/myutil.rs` does not end
/// with the segment `util.rs`).
fn ends_on_segment_boundary(path: &str, suffix: &str) -> bool {
    matches!(
        path.strip_suffix(suffix),
        Some(head) if head.is_empty() || head.ends_with('/')
    )
}

/// Pick one accepted file deterministically: shortest path first, then
/// lexicographic order. `files` is a hash set, so taking "the first match"
/// would depend on iteration order and could differ between runs.
fn pick_deterministic(files: &HashSet<&str>, accept: impl Fn(&str) -> bool) -> Option<String> {
    files
        .iter()
        .copied()
        .filter(|f| accept(*f))
        .min_by_key(|f| (f.len(), *f))
        .map(|f| f.to_string())
}

/// Join `name` under `dir`, treating an empty `dir` as the project root
/// (so the result is `name`, not `/name`).
fn child_of(dir: &str, name: &str) -> String {
    if dir.is_empty() {
        name.to_string()
    } else {
        format!("{dir}/{name}")
    }
}

/// Best-effort import-source → project-file resolution.
///
/// `importer` is the project-relative path of the importing file, `source`
/// the import string as written, and `files` the set of project-relative
/// paths that may be targets. Returns the matching path from `files`, or
/// `None` when nothing plausible is found.
///
/// Recognised shapes, in order:
/// * `./x`, `../y/z` — JS/TS relative path; tries `.ts`, `.tsx`, `.js`,
///   `.jsx`, `index.ts`, `index.js`, then the path itself;
/// * `.util`, `..pkg.mod` — Python relative module;
/// * `a::b::c` — Rust path, matched by path suffix on whole segments;
/// * `a.b.c` (no `/`) — Python absolute module, exact then suffix match;
/// * anything else — a file whose parent directory is named like the last
///   `/` segment, or a root-level `<last>.py`.
///
/// When several files match a suffix/directory rule the shortest path wins,
/// ties broken lexicographically, so the result is stable across runs.
fn resolve_import(importer: &str, source: &str, files: &HashSet<&str>) -> Option<String> {
    let dir = Path::new(importer).parent().unwrap_or(Path::new(""));
    let first_existing = |candidates: Vec<String>| -> Option<String> {
        candidates.into_iter().find(|c| files.contains(c.as_str()))
    };

    if source.starts_with('.') {
        if source.contains("::") {
            // A leading dot combined with `::` matches none of the
            // supported syntaxes; give up rather than guess.
            return None;
        }
        // JS/TS relative ("./x", "../y/z").
        if source.starts_with("./") || source.starts_with("../") {
            let joined = normalize(&dir.join(source));
            return first_existing(vec![
                format!("{joined}.ts"),
                format!("{joined}.tsx"),
                format!("{joined}.js"),
                format!("{joined}.jsx"),
                child_of(&joined, "index.ts"),
                child_of(&joined, "index.js"),
                joined,
            ]);
        }
        // Python relative: ".util" / "..pkg.mod". One dot means the
        // importer's own directory; each extra dot climbs one level.
        let dots = source.chars().take_while(|c| *c == '.').count();
        let module = &source[dots..];
        let mut base = dir.to_path_buf();
        for _ in 1..dots {
            base = base.parent().map(Path::to_path_buf).unwrap_or_default();
        }
        let joined = normalize(&base.join(module.replace('.', "/")));
        return first_existing(vec![
            format!("{joined}.py"),
            child_of(&joined, "__init__.py"),
        ]);
    }

    if source.contains("::") {
        // Rust: crate::a::b / super::x — match by path suffix on segments.
        // Path keywords and empty segments (leading `::`) carry no file
        // information.
        let segs: Vec<&str> = source
            .split("::")
            .filter(|s| !s.is_empty() && !matches!(*s, "crate" | "super" | "self"))
            .collect();
        if segs.is_empty() {
            return None;
        }
        // The import target is usually an item; its module file is the
        // second-to-last segment (or last for module imports). Try the
        // longest prefix first so the most specific file wins.
        for take in (1..=segs.len().min(3)).rev() {
            let suffix = format!("{}.rs", segs[..take].join("/"));
            let hit = pick_deterministic(files, |f| ends_on_segment_boundary(f, &suffix));
            if hit.is_some() {
                return hit;
            }
        }
        return None;
    }

    if source.contains('.') && !source.contains('/') {
        // Python absolute module: a.b.c — exact path from the project
        // root first, then any file ending in that module path.
        let joined = source.replace('.', "/");
        let module_file = format!("{joined}.py");
        return first_existing(vec![
            module_file.clone(),
            format!("{joined}/__init__.py"),
        ])
        .or_else(|| pick_deterministic(files, |f| ends_on_segment_boundary(f, &module_file)));
    }

    // Go package path / bare python module: match a file in a dir (or with
    // a name) equal to the last segment.
    let last = source.rsplit('/').next().unwrap_or(source);
    let bare_module = format!("{last}.py");
    pick_deterministic(files, |f| {
        let in_named_dir = Path::new(f)
            .parent()
            .and_then(|p| p.file_name())
            .map(|d| d.to_string_lossy() == last)
            .unwrap_or(false);
        in_named_dir || f == bare_module
    })
}

/// Collapse `.` and `..` components of a relative path and return it with
/// `/` separators.
///
/// `..` removes the previous component, and is ignored when there is none
/// left to remove. Root and prefix components are dropped, so the result is
/// always relative.
fn normalize(p: &Path) -> String {
    use std::path::Component;

    let mut parts: Vec<std::borrow::Cow<'_, str>> = Vec::new();
    for component in p.components() {
        match component {
            Component::Normal(segment) => parts.push(segment.to_string_lossy()),
            Component::ParentDir => {
                parts.pop();
            }
            Component::CurDir | Component::RootDir | Component::Prefix(_) => {}
        }
    }
    parts.join("/")
}