Skip to main content

mimir_graph/
store_graph.rs

1//! Persist extraction results as nodes/edges, incrementally.
2//!
3//! Symbol identity across re-extraction: blake3(project | path | qualified
4//! | kind) stored in meta.stable_id — the row (and its ULID, and every
5//! memory link pointing at it) survives line shifts and body edits.
6//! Line numbers live in span columns, never in identity.
7
8use std::collections::{HashMap, HashSet};
9use std::path::Path;
10
11use mimir_core::error::{Error, Result};
12use mimir_core::model::{now_unix, Kind, NewNode, Node, Rel};
13use mimir_core::store::{self, row_to_node, NODE_COLS};
14use rusqlite::{params, Connection, OptionalExtension};
15
16use crate::extract::{self, FileExtract};
17use crate::languages::Lang;
18
/// Counters describing what a single graph update pass did.
#[derive(Debug, Default, PartialEq, Eq)]
pub struct GraphStats {
    /// Walked files whose path maps to a supported language.
    pub files_seen: usize,
    /// Files that were (re-)extracted and persisted during this pass.
    pub files_indexed: usize,
    /// Files skipped because mtime+size or the content hash still matched.
    pub unchanged: usize,
    /// Previously live file rows that were soft-deleted because the file
    /// was not seen on disk.
    pub removed: usize,
    /// Symbols upserted (inserted or updated) across all indexed files.
    pub symbols: usize,
    /// Call edges written as resolved (same file, imported file, or a
    /// single project-wide candidate).
    pub calls_resolved: usize,
    /// Call edges written as unresolved guesses (2–3 candidates by name).
    pub calls_heuristic: usize,
    /// File→file import edges created.
    pub imports: usize,
}
30
31pub fn stable_id(project_id: i64, rel_path: &str, qualified: &str, kind: &str) -> String {
32    blake3::hash(format!("{project_id}|{rel_path}|{qualified}|{kind}").as_bytes())
33        .to_hex()
34        .to_string()
35}
36
37/// Build or incrementally update the code graph for a project rooted at
38/// `root`. mtime+size short-circuit, blake3 change detection, one
39/// transaction for the whole update; calls re-resolved only for changed
40/// files.
41pub fn update(conn: &mut Connection, project: &Node, root: &Path) -> Result<GraphStats> {
42    let mut stats = GraphStats::default();
43    let mut seen: HashSet<String> = HashSet::new();
44    let mut changed_files: Vec<(i64, String, FileExtract)> = Vec::new();
45
46    // One transaction for the whole update: file/symbol upserts and the
47    // call-edge rebuild commit together. A crash mid-update used to leave
48    // committed file hashes with missing call edges — and the hash
49    // short-circuit then skipped those files forever.
50    let tx = conn.transaction()?;
51
52    for entry in ignore::WalkBuilder::new(root).build() {
53        let entry = match entry {
54            Ok(e) => e,
55            Err(err) => {
56                tracing::warn!(%err, "skipping unreadable entry");
57                continue;
58            }
59        };
60        if !entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
61            continue;
62        }
63        let path = entry.path();
64        let rel = path
65            .strip_prefix(root)
66            .unwrap_or(path)
67            .to_string_lossy()
68            .replace('\\', "/");
69        let Some(lang) = Lang::from_path(&rel) else {
70            continue;
71        };
72        seen.insert(rel.clone());
73        stats.files_seen += 1;
74
75        let meta = entry
76            .metadata()
77            .map_err(|e| Error::Invalid(format!("stat {rel}: {e}")))?;
78        // -1 = mtime unavailable (some network/virtual filesystems). It
79        // must never satisfy the fast path: 0==0 would skip changed files
80        // forever; -1 falls through to the content-hash check instead.
81        let mtime = meta
82            .modified()
83            .ok()
84            .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
85            .map(|d| d.as_secs() as i64)
86            .unwrap_or(-1);
87        let size = meta.len() as i64;
88
89        let existing = code_file(&tx, project.id, &rel)?;
90        if let Some(f) = &existing {
91            if mtime >= 0
92                && f.deleted_at.is_none()
93                && f.meta.get("mtime").and_then(|v| v.as_i64()) == Some(mtime)
94                && f.meta.get("size").and_then(|v| v.as_i64()) == Some(size)
95            {
96                stats.unchanged += 1;
97                continue;
98            }
99        }
100
101        let raw = std::fs::read(path).map_err(|e| Error::io(path, e))?;
102        let content = String::from_utf8_lossy(&raw);
103        let hash = blake3::hash(content.as_bytes()).as_bytes().to_vec();
104        if let Some(f) = &existing {
105            if f.deleted_at.is_none() && f.content_hash.as_deref() == Some(&hash[..]) {
106                tx.execute(
107                    "UPDATE node SET meta = json_set(meta, '$.mtime', ?2, '$.size', ?3),
108                                     updated_at = ?4 WHERE id = ?1",
109                    params![f.id, mtime, size, now_unix()],
110                )?;
111                stats.unchanged += 1;
112                continue;
113            }
114        }
115
116        let fx = extract::extract(lang, &content);
117        let file_id = persist_file(
118            &tx,
119            project.id,
120            existing.as_ref(),
121            &rel,
122            lang,
123            &hash,
124            mtime,
125            size,
126            &fx,
127            &mut stats,
128        )?;
129        changed_files.push((file_id, rel, fx));
130        stats.files_indexed += 1;
131    }
132
133    // Files gone from disk: soft-delete file + its symbols.
134    let mut stmt = tx.prepare(
135        "SELECT id, path FROM node
136         WHERE kind = 'file' AND project_id = ?1 AND collection_id IS NULL
137           AND deleted_at IS NULL",
138    )?;
139    let live: Vec<(i64, String)> = stmt
140        .query_map([project.id], |r| Ok((r.get(0)?, r.get(1)?)))?
141        .collect::<rusqlite::Result<_>>()?;
142    drop(stmt);
143    for (id, path) in live {
144        if !seen.contains(&path) {
145            tx.execute(
146                "UPDATE node SET deleted_at = ?2
147                 WHERE deleted_at IS NULL AND (id = ?1 OR parent_id = ?1)",
148                params![id, now_unix()],
149            )?;
150            stats.removed += 1;
151        }
152    }
153
154    resolve_calls(&tx, project.id, &changed_files, &mut stats)?;
155    tx.commit()?;
156    Ok(stats)
157}
158
159fn code_file(conn: &Connection, project_id: i64, rel: &str) -> Result<Option<Node>> {
160    Ok(conn
161        .query_row(
162            &format!(
163                "SELECT {NODE_COLS} FROM node
164                 WHERE kind = 'file' AND project_id = ?1 AND path = ?2
165                   AND collection_id IS NULL"
166            ),
167            params![project_id, rel],
168            row_to_node,
169        )
170        .optional()?)
171}
172
173#[allow(clippy::too_many_arguments)]
174fn persist_file(
175    conn: &Connection,
176    project_id: i64,
177    existing: Option<&Node>,
178    rel: &str,
179    lang: Lang,
180    hash: &[u8],
181    mtime: i64,
182    size: i64,
183    fx: &FileExtract,
184    stats: &mut GraphStats,
185) -> Result<i64> {
186    let imports_json: Vec<serde_json::Value> = fx
187        .imports
188        .iter()
189        .map(|i| serde_json::json!({"local": i.local, "source": i.source}))
190        .collect();
191    let calls_json: Vec<serde_json::Value> = fx
192        .calls
193        .iter()
194        .filter(|c| !c.caller.is_empty())
195        .map(|c| serde_json::json!({"caller": c.caller, "callee": c.callee}))
196        .collect();
197    let file_meta = serde_json::json!({
198        "mtime": mtime, "size": size,
199        "imports": imports_json, "calls": calls_json,
200    });
201
202    let file_id = match existing {
203        Some(f) => {
204            conn.execute(
205                "UPDATE node SET content_hash = ?2, meta = ?3, lang = ?4,
206                                 updated_at = ?5, deleted_at = NULL WHERE id = ?1",
207                params![f.id, hash, file_meta.to_string(), lang.name(), now_unix()],
208            )?;
209            f.id
210        }
211        None => {
212            let mut new = NewNode::new(Kind::File);
213            new.title = Some(
214                Path::new(rel)
215                    .file_name()
216                    .map(|s| s.to_string_lossy().into_owned())
217                    .unwrap_or_else(|| rel.to_string()),
218            );
219            new.path = Some(rel.to_string());
220            new.lang = Some(lang.name().into());
221            new.project_id = Some(project_id);
222            new.content_hash = Some(hash.to_vec());
223            new.meta = Some(file_meta);
224            store::insert_node(conn, new)?.id
225        }
226    };
227
228    // Upsert symbols by stable_id; collect survivors to prune the rest.
229    let mut kept: HashSet<i64> = HashSet::new();
230    for sym in &fx.symbols {
231        let sid = stable_id(project_id, rel, &sym.qualified, sym.kind);
232        let body = match &sym.doc {
233            Some(d) => format!("{}\n{d}", sym.signature),
234            None => sym.signature.clone(),
235        };
236        let meta = serde_json::json!({"stable_id": sid, "name": sym.name});
237        let existing_id: Option<i64> = conn
238            .query_row(
239                "SELECT id FROM node
240                 WHERE kind = 'symbol' AND json_extract(meta, '$.stable_id') = ?1",
241                [&sid],
242                |r| r.get(0),
243            )
244            .optional()?;
245        let id = match existing_id {
246            Some(id) => {
247                conn.execute(
248                    "UPDATE node SET title = ?2, body = ?3, subkind = ?4, path = ?5,
249                            span_start = ?6, span_end = ?7, content_hash = ?8, meta = ?9,
250                            lang = ?10, parent_id = ?11, updated_at = ?12, deleted_at = NULL
251                     WHERE id = ?1",
252                    params![
253                        id,
254                        sym.qualified,
255                        body,
256                        sym.kind,
257                        rel,
258                        sym.start_line as i64,
259                        sym.end_line as i64,
260                        blake3::hash(body.as_bytes()).as_bytes().to_vec(),
261                        meta.to_string(),
262                        lang.name(),
263                        file_id,
264                        now_unix()
265                    ],
266                )?;
267                id
268            }
269            None => {
270                let mut new = NewNode::new(Kind::Symbol);
271                new.subkind = Some(sym.kind.into());
272                new.title = Some(sym.qualified.clone());
273                new.body = Some(body.clone());
274                new.path = Some(rel.to_string());
275                new.lang = Some(lang.name().into());
276                new.project_id = Some(project_id);
277                new.parent_id = Some(file_id);
278                new.span_start = Some(sym.start_line as i64);
279                new.span_end = Some(sym.end_line as i64);
280                new.content_hash = Some(blake3::hash(body.as_bytes()).as_bytes().to_vec());
281                new.meta = Some(meta);
282                store::insert_node(conn, new)?.id
283            }
284        };
285        kept.insert(id);
286        stats.symbols += 1;
287    }
288    // Symbols that vanished from this file: hard-delete (derived data;
289    // edges cascade, embeddings cascade).
290    {
291        let mut stmt =
292            conn.prepare("SELECT id FROM node WHERE kind = 'symbol' AND parent_id = ?1")?;
293        let all: Vec<i64> = stmt
294            .query_map([file_id], |r| r.get(0))?
295            .collect::<rusqlite::Result<_>>()?;
296        drop(stmt);
297        for id in all {
298            if !kept.contains(&id) {
299                conn.execute("DELETE FROM node WHERE id = ?1", [id])?;
300            }
301        }
302    }
303    Ok(file_id)
304}
305
/// One live symbol of the project, in the shape the call resolver needs.
struct SymRef {
    /// Row id of the symbol node.
    id: i64,
    /// Short name taken from `meta.name`; empty when the row has none.
    name: String,
    /// Qualified name, read from the node's `title` column.
    qualified: String,
    /// Project-relative path of the file that declares the symbol.
    path: String,
}
313
314/// Re-resolve call/import edges for the changed files only. Unchanged
315/// symbols keep their ids, so inbound edges stay valid automatically.
316fn resolve_calls(
317    conn: &Connection,
318    project_id: i64,
319    changed: &[(i64, String, FileExtract)],
320    stats: &mut GraphStats,
321) -> Result<()> {
322    if changed.is_empty() {
323        return Ok(());
324    }
325    // Project-wide symbol table (one query, in-memory buckets).
326    let mut stmt = conn.prepare(
327        "SELECT id, json_extract(meta, '$.name'), title, path FROM node
328         WHERE kind = 'symbol' AND project_id = ?1 AND deleted_at IS NULL",
329    )?;
330    let symbols: Vec<SymRef> = stmt
331        .query_map([project_id], |r| {
332            Ok(SymRef {
333                id: r.get(0)?,
334                name: r.get::<_, Option<String>>(1)?.unwrap_or_default(),
335                qualified: r.get::<_, Option<String>>(2)?.unwrap_or_default(),
336                path: r.get::<_, Option<String>>(3)?.unwrap_or_default(),
337            })
338        })?
339        .collect::<rusqlite::Result<_>>()?;
340    drop(stmt);
341
342    let mut by_name: HashMap<&str, Vec<&SymRef>> = HashMap::new();
343    for s in &symbols {
344        // A NULL/empty name would bucket under "" and attract phantom
345        // call edges from any unresolvable callee.
346        if !s.name.is_empty() {
347            by_name.entry(s.name.as_str()).or_default().push(s);
348        }
349    }
350    let mut by_file_qualified: HashMap<(&str, &str), i64> = HashMap::new();
351    for s in &symbols {
352        by_file_qualified.insert((s.path.as_str(), s.qualified.as_str()), s.id);
353    }
354    let file_paths: HashSet<&str> = {
355        let mut set = HashSet::new();
356        for s in &symbols {
357            set.insert(s.path.as_str());
358        }
359        set
360    };
361    let file_ids: HashMap<String, i64> = {
362        let mut stmt = conn.prepare(
363            "SELECT path, id FROM node
364             WHERE kind = 'file' AND project_id = ?1 AND collection_id IS NULL
365               AND deleted_at IS NULL",
366        )?;
367        let rows: Vec<(String, i64)> = stmt
368            .query_map([project_id], |r| Ok((r.get(0)?, r.get(1)?)))?
369            .collect::<rusqlite::Result<_>>()?;
370        rows.into_iter().collect()
371    };
372
373    for (file_id, rel, fx) in changed {
374        // Wipe edges originating from this file's symbols + its imports.
375        conn.execute(
376            "DELETE FROM edge WHERE rel = 'calls' AND src IN
377               (SELECT id FROM node WHERE kind = 'symbol' AND parent_id = ?1)",
378            [file_id],
379        )?;
380        conn.execute(
381            "DELETE FROM edge WHERE rel = 'imports' AND src = ?1",
382            [file_id],
383        )?;
384
385        // Import map for tier-2 resolution + file→file import edges.
386        let mut import_target: HashMap<&str, String> = HashMap::new();
387        for imp in &fx.imports {
388            if let Some(target) = resolve_import(rel, &imp.source, &file_paths) {
389                import_target.insert(imp.local.as_str(), target.clone());
390                if let Some(dst) = file_ids.get(&target) {
391                    if *dst != *file_id {
392                        store::link(conn, *file_id, *dst, Rel::Imports, 1.0)?;
393                        stats.imports += 1;
394                    }
395                }
396            }
397        }
398
399        for call in &fx.calls {
400            if call.caller.is_empty() {
401                continue; // top-level statements have no source symbol
402            }
403            let Some(&src) = by_file_qualified.get(&(rel.as_str(), call.caller.as_str())) else {
404                continue;
405            };
406            let candidates = by_name.get(call.callee.as_str());
407            let Some(candidates) = candidates else {
408                continue;
409            };
410            // Tier 1: same file.
411            if let Some(c) = candidates.iter().find(|c| c.path == *rel && c.id != src) {
412                link_call(conn, src, c.id, 1.0, true)?;
413                stats.calls_resolved += 1;
414                continue;
415            }
416            // Tier 2: imported name → resolved file.
417            if let Some(target) = import_target.get(call.callee.as_str()) {
418                if let Some(c) = candidates.iter().find(|c| c.path == *target) {
419                    link_call(conn, src, c.id, 1.0, true)?;
420                    stats.calls_resolved += 1;
421                    continue;
422                }
423            }
424            // Tier 3: global by name — honest about ambiguity.
425            let global: Vec<&&SymRef> = candidates.iter().filter(|c| c.id != src).collect();
426            match global.len() {
427                0 => {}
428                1 => {
429                    link_call(conn, src, global[0].id, 0.8, true)?;
430                    stats.calls_resolved += 1;
431                }
432                n if n <= 3 => {
433                    for c in &global {
434                        link_call(conn, src, c.id, 1.0 / n as f64, false)?;
435                        stats.calls_heuristic += 1;
436                    }
437                }
438                _ => {} // >3 candidates: too ambiguous to be useful
439            }
440        }
441    }
442    Ok(())
443}
444
445fn link_call(conn: &Connection, src: i64, dst: i64, weight: f64, resolved: bool) -> Result<()> {
446    conn.execute(
447        "INSERT INTO edge (src, dst, rel, weight, meta, created_at)
448         VALUES (?1, ?2, 'calls', ?3, json_object('resolved', ?4), ?5)
449         ON CONFLICT(src, dst, rel) DO UPDATE SET
450           weight = excluded.weight, meta = excluded.meta",
451        params![src, dst, weight, resolved, now_unix()],
452    )?;
453    Ok(())
454}
455
/// Best-effort import-source → project-file resolution.
///
/// `importer` is the project-relative path of the importing file, `source`
/// the import specifier as written, and `files` the set of known
/// project-relative paths. Returns the matching entry of `files`, or
/// `None` when nothing matches. Recognised shapes, checked in order:
///
/// * `./x`, `../y/z` — JS/TS relative specifier, tried with the usual
///   extensions, as a directory index, and verbatim;
/// * `.util`, `..pkg.mod` — Python relative module;
/// * `crate::a::b`, `super::x` — Rust path, matched by path suffix against
///   `<segments>.rs` and `<segments>/mod.rs`;
/// * `a.b.c` — Python absolute module, exact first, then by path suffix;
/// * anything else — Go package path / bare Python module, matched on the
///   last `/`-separated segment.
///
/// Suffix matches only count on a path-segment boundary, so `a.rs` matches
/// `src/a.rs` but never `src/schema.rs`.
fn resolve_import(importer: &str, source: &str, files: &HashSet<&str>) -> Option<String> {
    let dir = Path::new(importer).parent().unwrap_or(Path::new(""));
    // First candidate that is a known file, in the order given.
    let try_paths = |bases: Vec<String>| -> Option<String> {
        bases.into_iter().find(|b| files.contains(b.as_str()))
    };
    // Any known file that is `suffix` or ends in `/suffix`.
    let find_by_suffix = |suffix: &str| -> Option<String> {
        files
            .iter()
            .find(|f| has_path_suffix(**f, suffix))
            .map(|f| f.to_string())
    };

    if source.starts_with('.') {
        if source.contains("::") {
            // Neither a JS/TS nor a Python relative specifier; this shape
            // is not resolved.
            return None;
        }
        // JS/TS relative ("./x", "../y/z") or Python relative (".util").
        if source.starts_with("./") || source.starts_with("../") {
            let joined = normalize(&dir.join(source));
            return try_paths(vec![
                format!("{joined}.ts"),
                format!("{joined}.tsx"),
                format!("{joined}.js"),
                format!("{joined}.jsx"),
                join_rel(&joined, "index.ts"),
                join_rel(&joined, "index.js"),
                joined,
            ]);
        }
        // Python relative: ".util" / "..pkg.mod". One dot is the
        // importer's own package; each further dot climbs one directory.
        let dots = source.chars().take_while(|c| *c == '.').count();
        let module = &source[dots..];
        let mut base = dir.to_path_buf();
        for _ in 1..dots {
            base = base.parent().map(Path::to_path_buf).unwrap_or_default();
        }
        let joined = normalize(&base.join(module.replace('.', "/")));
        return try_paths(vec![
            format!("{joined}.py"),
            join_rel(&joined, "__init__.py"),
        ]);
    }

    if source.contains("::") {
        // Rust: crate::a::b / super::x — match by path suffix on segments.
        let segs: Vec<&str> = source
            .split("::")
            .filter(|s| !matches!(*s, "crate" | "super" | "self"))
            .collect();
        if segs.is_empty() {
            return None;
        }
        // The import target is usually an item, so its module is a proper
        // prefix of the path: try the longest prefix (at most three
        // segments) first and shorten from there. A module lives either
        // in `<path>.rs` or in `<path>/mod.rs`.
        for take in (1..=segs.len().min(3)).rev() {
            let module = segs[..take].join("/");
            let hit = find_by_suffix(&format!("{module}.rs"))
                .or_else(|| find_by_suffix(&format!("{module}/mod.rs")));
            if hit.is_some() {
                return hit;
            }
        }
        return None;
    }

    if source.contains('.') && !source.contains('/') {
        // Python absolute module: a.b.c
        let joined = source.replace('.', "/");
        return try_paths(vec![
            format!("{joined}.py"),
            format!("{joined}/__init__.py"),
        ])
        .or_else(|| find_by_suffix(&format!("{joined}.py")));
    }

    // Go package path / bare python module: match a file in a dir (or with
    // a name) equal to the last segment.
    let last = source.rsplit('/').next().unwrap_or(source);
    let bare_module = format!("{last}.py");
    files
        .iter()
        .find(|f| {
            let in_named_dir = Path::new(f)
                .parent()
                .and_then(|p| p.file_name())
                .map_or(false, |d| d.to_string_lossy() == last);
            in_named_dir || **f == bare_module
        })
        .map(|f| f.to_string())
}

/// True when `file` equals `suffix` or ends with `/` followed by `suffix`,
/// i.e. the match starts on a path-segment boundary.
fn has_path_suffix(file: &str, suffix: &str) -> bool {
    file.strip_suffix(suffix)
        .map_or(false, |head| head.is_empty() || head.ends_with('/'))
}

/// Join `name` under the already-normalized relative directory `base`,
/// without producing a leading `/` when `base` is empty (project root).
fn join_rel(base: &str, name: &str) -> String {
    if base.is_empty() {
        name.to_string()
    } else {
        format!("{base}/{name}")
    }
}

/// Lexically normalize a relative path into a `/`-joined string: `.` is
/// dropped, `..` removes the previous segment (and is ignored when there
/// is none), and root/prefix components are discarded. The filesystem is
/// never consulted.
fn normalize(p: &Path) -> String {
    let mut parts: Vec<&std::ffi::OsStr> = Vec::new();
    for c in p.components() {
        match c {
            std::path::Component::ParentDir => {
                parts.pop();
            }
            std::path::Component::CurDir => {}
            std::path::Component::Normal(s) => parts.push(s),
            _ => {}
        }
    }
    parts
        .iter()
        .map(|s| s.to_string_lossy())
        .collect::<Vec<_>>()
        .join("/")
}