//! cgx_engine/graph.rs — DuckDB-backed storage for the code knowledge graph.
1use std::path::{Path, PathBuf};
2
3use duckdb::params;
4use serde::{Deserialize, Serialize};
5use sha2::{Digest, Sha256};
6
7use crate::parser::{EdgeDef, NodeDef};
8
/// A node in the code knowledge graph as stored in (and read from) the DuckDB database.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Node {
    /// Stable unique identifier, e.g. `fn:src/lib.rs:parse` or `file:src/lib.rs`.
    pub id: String,
    /// String form of [`NodeKind`], e.g. `"Function"`, `"File"`.
    pub kind: String,
    /// Human-readable symbol or file name.
    pub name: String,
    /// Repo-relative file path.
    pub path: String,
    /// First source line of the definition (1-based — TODO confirm against parser).
    pub line_start: u32,
    /// Last source line of the definition.
    pub line_end: u32,
    #[serde(default)]
    pub language: String,
    /// Normalised commit-frequency score over the last 90 days (0.0–1.0).
    #[serde(default)]
    pub churn: f64,
    /// Normalised co-change coupling score (0.0–1.0).
    #[serde(default)]
    pub coupling: f64,
    /// Community ID assigned by the Louvain clustering pass (0 = unclustered).
    #[serde(default)]
    pub community: i64,
    /// Number of incoming edges (recomputed by `update_in_out_degrees`).
    #[serde(default)]
    pub in_degree: i64,
    /// Number of outgoing edges (recomputed by `update_in_out_degrees`).
    #[serde(default)]
    pub out_degree: i64,
    /// Whether the symbol is publicly exported from its file.
    #[serde(default)]
    pub exported: bool,
    /// Whether dead-code analysis flagged this node.
    #[serde(default)]
    pub is_dead_candidate: bool,
    /// Free-text explanation set when `is_dead_candidate` is true.
    #[serde(default)]
    pub dead_reason: Option<String>,
    /// Cyclomatic-complexity-derived score (higher = more complex).
    #[serde(default)]
    pub complexity: f64,
    #[serde(default)]
    pub is_test_file: bool,
    /// Number of test functions in this file (only set on `File` nodes).
    #[serde(default)]
    pub test_count: i64,
    /// Whether at least one test exercises this symbol.
    #[serde(default)]
    pub is_tested: bool,
}
56
/// A directed edge in the code knowledge graph.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Edge {
    /// Composite key: `"<src>|<kind>|<dst>"`.
    pub id: String,
    /// Source node id.
    pub src: String,
    /// Destination node id.
    pub dst: String,
    /// String form of [`EdgeKind`], e.g. `"CALLS"`, `"IMPORTS"`.
    pub kind: String,
    /// Edge weight; defaults to 1.0 when absent from serialized input.
    #[serde(default = "default_weight")]
    pub weight: f64,
    /// Extraction confidence (0.0–1.0); defaults to 1.0 when absent.
    #[serde(default = "default_weight")]
    pub confidence: f64,
}
71
/// Serde default for [`Edge::weight`] and [`Edge::confidence`]: an edge
/// counts fully unless the payload says otherwise.
fn default_weight() -> f64 {
    1.0_f64
}
75
/// High-level statistics for a fully indexed repository.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepoStats {
    /// Total rows in the `nodes` table.
    pub node_count: u64,
    /// Total rows in the `edges` table.
    pub edge_count: u64,
    /// Fraction of nodes per language, e.g. `{"typescript": 0.72}`.
    pub language_breakdown: std::collections::HashMap<String, f64>,
    /// Number of distinct Louvain communities.
    pub community_count: u32,
    pub function_count: u64,
    pub class_count: u64,
    pub file_count: u64,
}
88
/// `(community_id, label, node_count, top_node_ids)`
pub type CommunityRow = (i64, String, i64, Vec<String>);
/// `(overall_pct, per_community: Vec<(community_id, documented, total)>, undocumented_high_coupling_nodes)`
pub type DocsCoverage = (f64, Vec<(i64, i64, i64)>, Vec<Node>);
/// `(overall_pct, tested_count, untested_count, untested_high_coupling_nodes)`
pub type TestCoverageSummary = (f64, i64, i64, Vec<Node>);
// Inner Vec entries are (kind, in_degree, name); the trailing i64's meaning is
// not established here — NOTE(review): presumably a community id or total, confirm at use site.
type CommunityGroup = (Vec<(String, i64, String)>, i64); // (kind, in_degree, name)
96
/// A lightweight record of a single git commit used by the timeline view.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SnapshotEntry {
    /// Row primary key (format not established here — see the writer).
    pub id: String,
    pub commit_sha: String,
    /// `"YYYY-MM-DD"` formatted date of the commit.
    pub commit_date: String,
    /// First line / subject of the commit message — TODO confirm against the writer.
    pub commit_msg: String,
    pub node_count: i64,
    pub edge_count: i64,
    /// JSON blob: `{"file_count": N, "insertions": N, "deletions": N}`
    pub snapshot_data: Option<String>,
}
110
/// A comment annotation tag (e.g. `@todo`, `@fixme`) stored in the `tags` table.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TagRow {
    pub id: String,
    /// Repo-relative path of the file containing the comment.
    pub file_path: String,
    /// Line number of the annotation within `file_path`.
    pub line: u32,
    /// e.g. `"todo"`, `"fixme"`, `"hack"`.
    pub tag_type: String,
    /// The annotation's comment text.
    pub text: String,
    /// `"code"`, `"jsx"`, or `"jsx_commented_code"` — see [`CommentKind`](crate::parser::CommentKind).
    pub comment_type: String,
}
123
/// A detected code-clone pair stored in the `clones` table.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CloneRow {
    pub id: String,
    /// Node id of one side of the clone pair.
    pub node_a: String,
    /// Node id of the other side of the clone pair.
    pub node_b: String,
    /// Jaccard similarity score, 0.0–1.0.
    pub similarity: f64,
    /// `"exact"` or `"near"`.
    pub kind: String,
}
135
impl Default for Node {
    /// All-zero / empty node: empty strings, zero lines and scores, all flags off.
    ///
    /// NOTE(review): every field is its type's own default, so this manual impl
    /// is equivalent to `#[derive(Default)]` on `Node` and could be replaced by it.
    fn default() -> Self {
        Self {
            id: String::new(),
            kind: String::new(),
            name: String::new(),
            path: String::new(),
            line_start: 0,
            line_end: 0,
            language: String::new(),
            churn: 0.0,
            coupling: 0.0,
            community: 0,
            in_degree: 0,
            out_degree: 0,
            exported: false,
            is_dead_candidate: false,
            dead_reason: None,
            complexity: 0.0,
            is_test_file: false,
            test_count: 0,
            is_tested: false,
        }
    }
}
161
162impl Node {
163    pub fn from_def(d: &NodeDef, language: &str) -> Self {
164        let exported = d
165            .metadata
166            .get("exported")
167            .and_then(|v| v.as_bool())
168            .unwrap_or(false);
169        let complexity = d
170            .metadata
171            .get("complexity")
172            .and_then(|v| v.as_f64())
173            .unwrap_or(0.0);
174        Self {
175            id: d.id.clone(),
176            kind: d.kind.as_str().to_string(),
177            name: d.name.clone(),
178            path: d.path.clone(),
179            line_start: d.line_start,
180            line_end: d.line_end,
181            language: language.to_string(),
182            churn: 0.0,
183            coupling: 0.0,
184            community: 0,
185            in_degree: 0,
186            out_degree: 0,
187            exported,
188            is_dead_candidate: false,
189            dead_reason: None,
190            complexity,
191            is_test_file: false,
192            test_count: 0,
193            is_tested: false,
194        }
195    }
196}
197
198impl Edge {
199    pub fn from_def(d: &EdgeDef) -> Self {
200        let id = format!("{}|{}|{}", d.src, d.kind.as_str(), d.dst);
201        Self {
202            id,
203            src: d.src.clone(),
204            dst: d.dst.clone(),
205            kind: d.kind.as_str().to_string(),
206            weight: d.weight,
207            confidence: d.confidence,
208        }
209    }
210}
211
/// Handle to the DuckDB database that stores the code knowledge graph.
///
/// Each repository gets its own `.db` file under `~/.cgx/repos/<repo_id>.db`.
/// All methods operate on the embedded DuckDB connection and are synchronous.
pub struct GraphDb {
    /// Embedded DuckDB connection (exposed publicly; callers may run ad-hoc queries).
    pub conn: duckdb::Connection,
    /// SHA-256–derived stable identifier for the repository path.
    pub repo_id: String,
    /// Absolute path of the on-disk `.db` file backing `conn`.
    pub db_path: PathBuf,
}
222
223impl GraphDb {
    /// Open (or create) the graph database for the repository at `repo_path`.
    ///
    /// Creates all required tables and runs forward migrations on existing databases.
    ///
    /// # Errors
    /// Fails if the home directory cannot be determined, the `~/.cgx/repos`
    /// directory cannot be created, or any DuckDB open/DDL statement fails.
    pub fn open(repo_path: &Path) -> anyhow::Result<Self> {
        // Stable per-repo id derived from the path (see `repo_hash` in this file).
        let repo_id = repo_hash(repo_path);
        // All repo databases live under ~/.cgx/repos/.
        let dir = dirs::home_dir()
            .ok_or_else(|| anyhow::anyhow!("cannot determine home directory"))?
            .join(".cgx")
            .join("repos");
        std::fs::create_dir_all(&dir)?;

        let db_path = dir.join(format!("{}.db", repo_id));
        let conn = duckdb::Connection::open(&db_path)?;

        // Base schema. Columns added later in the project's history (complexity,
        // doc_comment, test fields) are deliberately absent here — the migration
        // batch below adds them for both fresh and pre-existing databases.
        conn.execute_batch(
            "CREATE TABLE IF NOT EXISTS nodes (
                id                 VARCHAR PRIMARY KEY,
                kind               VARCHAR NOT NULL,
                name               VARCHAR NOT NULL,
                path               VARCHAR NOT NULL,
                line_start         INTEGER,
                line_end           INTEGER,
                language           VARCHAR,
                churn              DOUBLE DEFAULT 0.0,
                coupling           DOUBLE DEFAULT 0.0,
                community          BIGINT DEFAULT 0,
                in_degree          BIGINT DEFAULT 0,
                out_degree         BIGINT DEFAULT 0,
                exported           TINYINT DEFAULT 0,
                is_dead_candidate  TINYINT DEFAULT 0,
                dead_reason        TEXT,
                metadata           JSON
            );
            CREATE TABLE IF NOT EXISTS edges (
                id         VARCHAR PRIMARY KEY,
                src        VARCHAR NOT NULL,
                dst        VARCHAR NOT NULL,
                kind       VARCHAR NOT NULL,
                weight     DOUBLE DEFAULT 1.0,
                confidence DOUBLE DEFAULT 1.0,
                metadata   JSON
            );
            CREATE TABLE IF NOT EXISTS communities (
                id         INTEGER PRIMARY KEY,
                label      VARCHAR,
                node_count INTEGER,
                top_nodes  JSON
            );
            CREATE TABLE IF NOT EXISTS repo_meta (
                key        VARCHAR PRIMARY KEY,
                value      JSON
            );
            CREATE TABLE IF NOT EXISTS file_hashes (
                path       VARCHAR PRIMARY KEY,
                hash       VARCHAR NOT NULL,
                indexed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            );
            CREATE TABLE IF NOT EXISTS tags (
                id           VARCHAR PRIMARY KEY,
                file_path    VARCHAR NOT NULL,
                line         INTEGER NOT NULL,
                tag_type     VARCHAR NOT NULL,
                text         VARCHAR NOT NULL,
                comment_type VARCHAR NOT NULL DEFAULT 'code'
            );
            CREATE TABLE IF NOT EXISTS clones (
                id         VARCHAR PRIMARY KEY,
                node_a     VARCHAR NOT NULL,
                node_b     VARCHAR NOT NULL,
                similarity FLOAT NOT NULL,
                kind       VARCHAR NOT NULL
            );
            CREATE INDEX IF NOT EXISTS idx_nodes_kind      ON nodes(kind);
            CREATE INDEX IF NOT EXISTS idx_nodes_path      ON nodes(path);
            CREATE INDEX IF NOT EXISTS idx_nodes_community ON nodes(community);
            CREATE INDEX IF NOT EXISTS idx_edges_src       ON edges(src);
            CREATE INDEX IF NOT EXISTS idx_edges_dst       ON edges(dst);
            CREATE INDEX IF NOT EXISTS idx_edges_kind      ON edges(kind);
            CREATE INDEX IF NOT EXISTS idx_tags_file       ON tags(file_path);
            CREATE INDEX IF NOT EXISTS idx_tags_type       ON tags(tag_type);
            CREATE INDEX IF NOT EXISTS idx_clones_a        ON clones(node_a);
            CREATE INDEX IF NOT EXISTS idx_clones_b        ON clones(node_b);",
        )?;

        // Migration: add new columns to existing DBs that pre-date this schema.
        // DuckDB 1.x supports "ADD COLUMN IF NOT EXISTS" which is a no-op when
        // the column is already present — no error, no transaction abort.
        conn.execute_batch(
            "ALTER TABLE nodes ADD COLUMN IF NOT EXISTS exported           TINYINT DEFAULT 0;
             ALTER TABLE nodes ADD COLUMN IF NOT EXISTS is_dead_candidate  TINYINT DEFAULT 0;
             ALTER TABLE nodes ADD COLUMN IF NOT EXISTS dead_reason        TEXT;
             ALTER TABLE nodes ADD COLUMN IF NOT EXISTS complexity         DOUBLE DEFAULT 0.0;
             ALTER TABLE nodes ADD COLUMN IF NOT EXISTS doc_comment        TEXT;
             ALTER TABLE nodes ADD COLUMN IF NOT EXISTS is_test_file       TINYINT DEFAULT 0;
             ALTER TABLE nodes ADD COLUMN IF NOT EXISTS test_count         INTEGER DEFAULT 0;
             ALTER TABLE nodes ADD COLUMN IF NOT EXISTS is_tested          TINYINT DEFAULT 0;
             CREATE INDEX IF NOT EXISTS idx_nodes_dead       ON nodes(is_dead_candidate);
             CREATE INDEX IF NOT EXISTS idx_nodes_complexity ON nodes(complexity);
             CREATE INDEX IF NOT EXISTS idx_nodes_is_tested  ON nodes(is_tested);",
        )?;

        // Snapshots table is newer than the base schema; created in its own batch.
        conn.execute_batch(
            "CREATE TABLE IF NOT EXISTS snapshots (
                id           VARCHAR PRIMARY KEY,
                commit_sha   VARCHAR NOT NULL,
                commit_date  TEXT NOT NULL,
                commit_msg   VARCHAR,
                node_count   INTEGER,
                edge_count   INTEGER,
                snapshot_data TEXT
            );
            CREATE INDEX IF NOT EXISTS idx_snapshots_date ON snapshots(commit_date);",
        )?;

        Ok(Self {
            conn,
            repo_id,
            db_path,
        })
    }
344
345    /// Insert or replace nodes in the `nodes` table, returning the count written.
346    pub fn upsert_nodes(&self, nodes: &[Node]) -> anyhow::Result<usize> {
347        if nodes.is_empty() {
348            return Ok(0);
349        }
350        let mut count = 0;
351        let mut stmt = self.conn.prepare(
352            "INSERT OR REPLACE INTO nodes (id, kind, name, path, line_start, line_end, language, churn, coupling, community, in_degree, out_degree, exported, complexity, is_test_file, test_count, is_tested)
353             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
354        )?;
355        for node in nodes {
356            stmt.execute(params![
357                node.id,
358                node.kind,
359                node.name,
360                node.path,
361                node.line_start,
362                node.line_end,
363                node.language,
364                node.churn,
365                node.coupling,
366                node.community,
367                node.in_degree,
368                node.out_degree,
369                node.exported as i32,
370                node.complexity,
371                node.is_test_file as i32,
372                node.test_count,
373                node.is_tested as i32,
374            ])?;
375            count += 1;
376        }
377        Ok(count)
378    }
379
380    /// Insert or replace edges in the `edges` table, returning the count written.
381    pub fn upsert_edges(&self, edges: &[Edge]) -> anyhow::Result<usize> {
382        if edges.is_empty() {
383            return Ok(0);
384        }
385        let mut count = 0;
386        let mut stmt = self.conn.prepare(
387            "INSERT OR REPLACE INTO edges (id, src, dst, kind, weight, confidence)
388             VALUES (?, ?, ?, ?, ?, ?)",
389        )?;
390        for edge in edges {
391            stmt.execute(params![
392                edge.id,
393                edge.src,
394                edge.dst,
395                edge.kind,
396                edge.weight,
397                edge.confidence,
398            ])?;
399            count += 1;
400        }
401        Ok(count)
402    }
403
404    /// Insert or replace comment annotation tags, returning the count written.
405    pub fn upsert_tags(&self, tags: &[TagRow]) -> anyhow::Result<usize> {
406        if tags.is_empty() {
407            return Ok(0);
408        }
409        let mut count = 0;
410        let mut stmt = self.conn.prepare(
411            "INSERT OR REPLACE INTO tags (id, file_path, line, tag_type, text, comment_type)
412             VALUES (?, ?, ?, ?, ?, ?)",
413        )?;
414        for tag in tags {
415            stmt.execute(params![
416                tag.id,
417                tag.file_path,
418                tag.line,
419                tag.tag_type,
420                tag.text,
421                tag.comment_type,
422            ])?;
423            count += 1;
424        }
425        Ok(count)
426    }
427
428    /// Query comment annotation tags with optional filters on tag type and comment kind.
429    pub fn get_tags(
430        &self,
431        tag_type_filter: Option<&str>,
432        comment_type_filter: Option<&str>,
433    ) -> anyhow::Result<Vec<TagRow>> {
434        let sql = match (tag_type_filter, comment_type_filter) {
435            (Some(_), Some(_)) => {
436                "SELECT id, file_path, line, tag_type, text, comment_type FROM tags \
437                 WHERE tag_type = ? AND comment_type = ? ORDER BY file_path, line"
438            }
439            (Some(_), None) => {
440                "SELECT id, file_path, line, tag_type, text, comment_type FROM tags \
441                 WHERE tag_type = ? ORDER BY file_path, line"
442            }
443            (None, Some(_)) => {
444                "SELECT id, file_path, line, tag_type, text, comment_type FROM tags \
445                 WHERE comment_type = ? ORDER BY file_path, line"
446            }
447            (None, None) => {
448                "SELECT id, file_path, line, tag_type, text, comment_type FROM tags \
449                 ORDER BY file_path, line"
450            }
451        };
452
453        let mut stmt = self.conn.prepare(sql)?;
454        let map_row = |row: &duckdb::Row| {
455            Ok(TagRow {
456                id: row.get(0)?,
457                file_path: row.get(1)?,
458                line: row.get::<_, u32>(2)?,
459                tag_type: row.get(3)?,
460                text: row.get(4)?,
461                comment_type: row.get(5)?,
462            })
463        };
464
465        let rows = match (tag_type_filter, comment_type_filter) {
466            (Some(t), Some(c)) => stmt.query_map(params![t, c], map_row)?,
467            (Some(t), None) => stmt.query_map(params![t], map_row)?,
468            (None, Some(c)) => stmt.query_map(params![c], map_row)?,
469            (None, None) => stmt.query_map([], map_row)?,
470        };
471
472        let mut results = Vec::new();
473        for row in rows {
474            results.push(row?);
475        }
476        Ok(results)
477    }
478
    /// Drop and recreate the `tags` table, removing all stored annotations.
    ///
    /// The recreated schema and indexes mirror the ones created in [`GraphDb::open`];
    /// keep the two definitions in sync when the schema changes.
    pub fn clear_all_tags(&self) -> anyhow::Result<()> {
        self.conn.execute_batch(
            "DROP TABLE IF EXISTS tags;
             CREATE TABLE IF NOT EXISTS tags (
                 id           VARCHAR PRIMARY KEY,
                 file_path    VARCHAR NOT NULL,
                 line         INTEGER NOT NULL,
                 tag_type     VARCHAR NOT NULL,
                 text         VARCHAR NOT NULL,
                 comment_type VARCHAR NOT NULL DEFAULT 'code'
             );
             CREATE INDEX IF NOT EXISTS idx_tags_file ON tags(file_path);
             CREATE INDEX IF NOT EXISTS idx_tags_type ON tags(tag_type);",
        )?;
        Ok(())
    }
496
497    /// Delete all tags whose `file_path` is in `paths` (used during incremental re-index).
498    pub fn delete_tags_for_paths(&self, paths: &[String]) -> anyhow::Result<()> {
499        if paths.is_empty() {
500            return Ok(());
501        }
502        let mut stmt = self.conn.prepare("DELETE FROM tags WHERE file_path = ?")?;
503        for path in paths {
504            stmt.execute(params![path])?;
505        }
506        Ok(())
507    }
508
509    /// Fetch a single node by its stable `id`, returning `None` if not found.
510    pub fn get_node(&self, id: &str) -> anyhow::Result<Option<Node>> {
511        let mut stmt = self
512            .conn
513            .prepare("SELECT id, kind, name, path, line_start, line_end, language, churn, coupling, community, in_degree, out_degree, COALESCE(exported, false) as exported, COALESCE(is_dead_candidate, false) as is_dead_candidate, dead_reason, COALESCE(complexity, 0.0), COALESCE(is_test_file, 0), COALESCE(test_count, 0), COALESCE(is_tested, 0) FROM nodes WHERE id = ?")?;
514        let mut rows = stmt.query_map(params![id], |row| {
515            Ok(Node {
516                id: row.get(0)?,
517                kind: row.get(1)?,
518                name: row.get(2)?,
519                path: row.get(3)?,
520                line_start: row.get(4)?,
521                line_end: row.get(5)?,
522                language: row.get(6)?,
523                churn: row.get(7)?,
524                coupling: row.get(8)?,
525                community: row.get(9)?,
526                in_degree: row.get(10)?,
527                out_degree: row.get(11)?,
528                exported: row.get::<_, i64>(12).map(|v| v != 0).unwrap_or(false),
529                is_dead_candidate: row.get::<_, bool>(13).unwrap_or(false),
530                dead_reason: row.get::<_, Option<String>>(14).unwrap_or(None),
531                complexity: row.get::<_, f64>(15).unwrap_or(0.0),
532                is_test_file: row.get::<_, i64>(16).map(|v| v != 0).unwrap_or(false),
533                test_count: row.get::<_, i64>(17).unwrap_or(0),
534                is_tested: row.get::<_, i64>(18).map(|v| v != 0).unwrap_or(false),
535            })
536        })?;
537
538        match rows.next() {
539            Some(Ok(node)) => Ok(Some(node)),
540            _ => Ok(None),
541        }
542    }
543
544    /// BFS outward from `id` following edges in both directions, up to `depth` hops (max 3).
545    pub fn get_neighbors(&self, id: &str, depth: u8) -> anyhow::Result<Vec<Node>> {
546        let mut seen = std::collections::HashSet::new();
547        seen.insert(id.to_string());
548        let mut current = vec![id.to_string()];
549        let mut result: Vec<Node> = Vec::new();
550        let max_depth = depth.min(3);
551
552        for _ in 0..max_depth {
553            if current.is_empty() {
554                break;
555            }
556            let mut next = Vec::new();
557
558            for cur_id in &current {
559                let mut stmt = self.conn.prepare(
560                    "SELECT DISTINCT n.id, n.kind, n.name, n.path, n.line_start, n.line_end, n.language, n.churn, n.coupling, n.community, n.in_degree, n.out_degree, COALESCE(n.exported, false), COALESCE(n.is_dead_candidate, false), n.dead_reason, COALESCE(n.complexity, 0.0), COALESCE(n.is_test_file, 0), COALESCE(n.test_count, 0), COALESCE(n.is_tested, 0)
561                     FROM nodes n
562                     INNER JOIN edges e ON (e.dst = n.id AND e.src = ?1) OR (e.src = n.id AND e.dst = ?2)
563                     LIMIT 100",
564                )?;
565                let rows = stmt.query_map(params![cur_id, cur_id], |row| {
566                    Ok(Node {
567                        id: row.get(0)?,
568                        kind: row.get(1)?,
569                        name: row.get(2)?,
570                        path: row.get(3)?,
571                        line_start: row.get(4)?,
572                        line_end: row.get(5)?,
573                        language: row.get(6)?,
574                        churn: row.get(7)?,
575                        coupling: row.get(8)?,
576                        community: row.get(9)?,
577                        in_degree: row.get(10)?,
578                        out_degree: row.get(11)?,
579                        exported: row.get::<_, i64>(12).map(|v| v != 0).unwrap_or(false),
580                        is_dead_candidate: row.get::<_, bool>(13).unwrap_or(false),
581                        dead_reason: row.get::<_, Option<String>>(14).unwrap_or(None),
582                        complexity: row.get::<_, f64>(15).unwrap_or(0.0),
583                        is_test_file: row.get::<_, i64>(16).map(|v| v != 0).unwrap_or(false),
584                        test_count: row.get::<_, i64>(17).unwrap_or(0),
585                        is_tested: row.get::<_, i64>(18).map(|v| v != 0).unwrap_or(false),
586                    })
587                })?;
588
589                for row in rows {
590                    let node = row?;
591                    if seen.insert(node.id.clone()) {
592                        next.push(node.id.clone());
593                        result.push(node);
594                    }
595                }
596            }
597            current = next;
598        }
599
600        Ok(result)
601    }
602
603    /// Return every node in the graph.
604    pub fn get_all_nodes(&self) -> anyhow::Result<Vec<Node>> {
605        let mut stmt = self.conn.prepare(
606            "SELECT id, kind, name, path, line_start, line_end, language, churn, coupling, community, in_degree, out_degree, COALESCE(exported, false), COALESCE(is_dead_candidate, false), dead_reason, COALESCE(complexity, 0.0), COALESCE(is_test_file, 0), COALESCE(test_count, 0), COALESCE(is_tested, 0) FROM nodes",
607        )?;
608        let rows = stmt.query_map([], |row| {
609            Ok(Node {
610                id: row.get(0)?,
611                kind: row.get(1)?,
612                name: row.get(2)?,
613                path: row.get(3)?,
614                line_start: row.get(4)?,
615                line_end: row.get(5)?,
616                language: row.get(6)?,
617                churn: row.get(7)?,
618                coupling: row.get(8)?,
619                community: row.get(9)?,
620                in_degree: row.get(10)?,
621                out_degree: row.get(11)?,
622                exported: row.get::<_, i64>(12).map(|v| v != 0).unwrap_or(false),
623                is_dead_candidate: row.get::<_, bool>(13).unwrap_or(false),
624                dead_reason: row.get::<_, Option<String>>(14).unwrap_or(None),
625                complexity: row.get::<_, f64>(15).unwrap_or(0.0),
626                is_test_file: row.get::<_, i64>(16).map(|v| v != 0).unwrap_or(false),
627                test_count: row.get::<_, i64>(17).unwrap_or(0),
628                is_tested: row.get::<_, i64>(18).map(|v| v != 0).unwrap_or(false),
629            })
630        })?;
631
632        let mut nodes = Vec::new();
633        for row in rows {
634            nodes.push(row?);
635        }
636        Ok(nodes)
637    }
638
639    /// Return every edge in the graph.
640    pub fn get_all_edges(&self) -> anyhow::Result<Vec<Edge>> {
641        let mut stmt = self
642            .conn
643            .prepare("SELECT id, src, dst, kind, weight, confidence FROM edges")?;
644        let rows = stmt.query_map([], |row| {
645            Ok(Edge {
646                id: row.get(0)?,
647                src: row.get(1)?,
648                dst: row.get(2)?,
649                kind: row.get(3)?,
650                weight: row.get(4)?,
651                confidence: row.get(5)?,
652            })
653        })?;
654
655        let mut edges = Vec::new();
656        for row in rows {
657            edges.push(row?);
658        }
659        Ok(edges)
660    }
661
662    /// Total number of nodes in the graph.
663    pub fn node_count(&self) -> anyhow::Result<u64> {
664        let count: i64 = self
665            .conn
666            .query_row("SELECT COUNT(*) FROM nodes", [], |row| row.get(0))?;
667        Ok(count as u64)
668    }
669
670    /// Total number of edges in the graph.
671    pub fn edge_count(&self) -> anyhow::Result<u64> {
672        let count: i64 = self
673            .conn
674            .query_row("SELECT COUNT(*) FROM edges", [], |row| row.get(0))?;
675        Ok(count as u64)
676    }
677
    /// Truncate nodes, edges, and communities tables (does not remove file hashes).
    ///
    /// Tags, clones, snapshots, repo_meta, and file_hashes are all preserved —
    /// only the three graph tables are emptied.
    pub fn clear(&self) -> anyhow::Result<()> {
        // TRUNCATE avoids DuckDB ART index bulk-delete failures on large datasets
        // and is more reliable than DROP+CREATE for data persistence across connections.
        // Edges are truncated first (they reference node ids).
        self.conn.execute_batch(
            "TRUNCATE TABLE edges;
             TRUNCATE TABLE nodes;
             TRUNCATE TABLE communities;",
        )?;
        Ok(())
    }
689
690    /// Fraction of nodes per language, normalised to sum to 1.0.
691    pub fn get_language_breakdown(&self) -> anyhow::Result<std::collections::HashMap<String, f64>> {
692        let mut stmt = self.conn.prepare(
693            "SELECT language, COUNT(*) as cnt FROM nodes WHERE language != '' GROUP BY language",
694        )?;
695        let rows = stmt.query_map([], |row| {
696            Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?))
697        })?;
698
699        let mut counts: std::collections::HashMap<String, i64> = std::collections::HashMap::new();
700        for row in rows {
701            let (lang, cnt) = row?;
702            *counts.entry(lang).or_default() += cnt;
703        }
704
705        let total: i64 = counts.values().sum();
706        if total == 0 {
707            return Ok(std::collections::HashMap::new());
708        }
709
710        let mut breakdown = std::collections::HashMap::new();
711        for (lang, cnt) in counts {
712            breakdown.insert(lang, cnt as f64 / total as f64);
713        }
714        Ok(breakdown)
715    }
716
717    /// Count of nodes per [`NodeKind`] string, e.g. `{"Function": 412, "File": 38}`.
718    pub fn get_node_counts_by_kind(
719        &self,
720    ) -> anyhow::Result<std::collections::HashMap<String, u64>> {
721        let mut stmt = self
722            .conn
723            .prepare("SELECT kind, COUNT(*) as cnt FROM nodes GROUP BY kind")?;
724        let rows = stmt.query_map([], |row| {
725            Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?))
726        })?;
727
728        let mut counts = std::collections::HashMap::new();
729        for row in rows {
730            let (kind, cnt) = row?;
731            counts.insert(kind, cnt as u64);
732        }
733        Ok(counts)
734    }
735
    /// Write normalised churn and coupling scores back to a single node.
    ///
    /// A no-op when `node_id` does not exist (UPDATE matches zero rows).
    /// NOTE(review): one UPDATE per node — presumably called in a loop by the
    /// scoring pass; batching could be considered if that shows up in profiles.
    pub fn upsert_node_scores(
        &self,
        node_id: &str,
        churn: f64,
        coupling: f64,
    ) -> anyhow::Result<()> {
        self.conn.execute(
            "UPDATE nodes SET churn = ?, coupling = ? WHERE id = ?",
            params![churn, coupling, node_id],
        )?;
        Ok(())
    }
749
    /// Recompute and persist `in_degree` / `out_degree` for every node.
    ///
    /// Each degree is a correlated COUNT over `edges` (covered by idx_edges_src /
    /// idx_edges_dst created in `open`).
    /// NOTE(review): the initial zero-reset appears redundant — the two UPDATEs
    /// that follow overwrite both columns on every row anyway.
    pub fn update_in_out_degrees(&self) -> anyhow::Result<()> {
        self.conn.execute_batch(
            "UPDATE nodes SET in_degree = 0, out_degree = 0;
             UPDATE nodes SET out_degree = (SELECT COUNT(*) FROM edges WHERE edges.src = nodes.id);
             UPDATE nodes SET in_degree = (SELECT COUNT(*) FROM edges WHERE edges.dst = nodes.id);",
        )?;
        Ok(())
    }
759
760    /// Return the top `limit` file hotspots ranked by `churn × coupling + in_degree`.
761    ///
762    /// Each tuple is `(path, churn, coupling, in_degree)`.
763    pub fn get_hotspots(&self, limit: usize) -> anyhow::Result<Vec<(String, f64, f64, i64)>> {
764        let mut stmt = self.conn.prepare(
765            "SELECT path, churn, coupling, in_degree
766             FROM nodes
767             WHERE kind = 'File' AND (churn > 0.0 OR in_degree > 0)
768             ORDER BY (churn * COALESCE(coupling, 0.0) + CAST(in_degree AS DOUBLE) * 0.01) DESC
769             LIMIT ?",
770        )?;
771        let rows = stmt.query_map(params![limit as i64], |row| {
772            Ok((
773                row.get::<_, String>(0)?,
774                row.get::<_, f64>(1)?,
775                row.get::<_, f64>(2)?,
776                row.get::<_, i64>(3)?,
777            ))
778        })?;
779        let mut results = Vec::new();
780        for row in rows {
781            results.push(row?);
782        }
783        Ok(results)
784    }
785
786    /// Return `(author_name, file_count)` pairs sorted by file count descending.
787    pub fn get_ownership(&self) -> anyhow::Result<Vec<(String, i64)>> {
788        let mut stmt = self.conn.prepare(
789            "SELECT n.name, COUNT(e.id) as file_count
790             FROM nodes n
791             INNER JOIN edges e ON e.src = n.id AND e.kind = 'OWNS'
792             WHERE n.kind = 'Author'
793             GROUP BY n.name
794             ORDER BY file_count DESC",
795        )?;
796        let rows = stmt.query_map([], |row| {
797            Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?))
798        })?;
799        let mut results = Vec::new();
800        for row in rows {
801            results.push(row?);
802        }
803        Ok(results)
804    }
805
    /// Recompute `coupling` for every `File` node as `in_degree / max_in_degree`.
    ///
    /// Every node is first reset to 0.0; `File` nodes are then rescaled into
    /// 0.0–1.0 relative to the most-depended-on file. The CASE guard keeps
    /// everything at 0.0 when no file has incoming edges, avoiding a
    /// divide-by-zero.
    pub fn compute_coupling(&self) -> anyhow::Result<()> {
        self.conn.execute_batch(
            "UPDATE nodes SET coupling = 0.0;
             UPDATE nodes SET coupling = 
                CASE 
                    WHEN (SELECT MAX(in_degree) FROM nodes WHERE kind = 'File') > 0
                    THEN CAST(in_degree AS DOUBLE) / CAST((SELECT MAX(in_degree) FROM nodes WHERE kind = 'File') AS DOUBLE)
                    ELSE 0.0
                END
             WHERE kind = 'File';",
        )?;
        Ok(())
    }
820
821    /// Write Louvain community assignments from `detect_communities` back to the DB.
822    pub fn update_node_communities(
823        &self,
824        communities: &std::collections::HashMap<String, i64>,
825    ) -> anyhow::Result<usize> {
826        if communities.is_empty() {
827            return Ok(0);
828        }
829        let mut count = 0;
830        let mut stmt = self
831            .conn
832            .prepare("UPDATE nodes SET community = ? WHERE id = ?")?;
833        for (node_id, community) in communities {
834            let affected = stmt.execute(params![*community, node_id.as_str()])?;
835            count += affected;
836        }
837        Ok(count)
838    }
839
840    /// Return a [`RepoStats`] summary for the currently indexed repository.
841    pub fn get_stats(&self) -> anyhow::Result<RepoStats> {
842        let node_count = self.node_count()?;
843        let edge_count = self.edge_count()?;
844        let lang_breakdown = self.get_language_breakdown()?;
845        let communities = self.get_communities()?;
846        let counts_by_kind = self.get_node_counts_by_kind()?;
847
848        Ok(RepoStats {
849            node_count,
850            edge_count,
851            language_breakdown: lang_breakdown,
852            community_count: communities.len() as u32,
853            function_count: counts_by_kind.get("Function").copied().unwrap_or(0),
854            class_count: counts_by_kind.get("Class").copied().unwrap_or(0),
855            file_count: counts_by_kind.get("File").copied().unwrap_or(0),
856        })
857    }
858
    /// Return up to `limit` nodes with `in_degree == 0` — likely entry points or roots.
    ///
    /// `File` and `Author` nodes are excluded; results are ordered by
    /// `out_degree` descending so the roots that fan out the most come first.
    pub fn get_entry_points(&self, limit: usize) -> anyhow::Result<Vec<Node>> {
        // COALESCE maps NULLs in the optional analysis columns (exported,
        // dead-code, complexity, test metadata) onto defaults so the mapper
        // below never sees NULL for them.
        let mut stmt = self.conn.prepare(
            "SELECT id, kind, name, path, line_start, line_end, language, churn, coupling, community, in_degree, out_degree, COALESCE(exported, false), COALESCE(is_dead_candidate, false), dead_reason, COALESCE(complexity, 0.0), COALESCE(is_test_file, 0), COALESCE(test_count, 0), COALESCE(is_tested, 0)
             FROM nodes
             WHERE in_degree = 0 AND kind != 'File' AND kind != 'Author'
             ORDER BY out_degree DESC
             LIMIT ?",
        )?;
        let rows = stmt.query_map(params![limit as i64], |row| {
            Ok(Node {
                id: row.get(0)?,
                kind: row.get(1)?,
                name: row.get(2)?,
                path: row.get(3)?,
                line_start: row.get(4)?,
                line_end: row.get(5)?,
                language: row.get(6)?,
                churn: row.get(7)?,
                coupling: row.get(8)?,
                community: row.get(9)?,
                in_degree: row.get(10)?,
                out_degree: row.get(11)?,
                // Flag columns are decoded with mixed types (i64 0/1 vs bool)
                // and fall back to defaults on read failure. NOTE(review):
                // this looks tuned to DuckDB's actual return types — keep the
                // coercions in sync across the other Node-mapping queries in
                // this file.
                exported: row.get::<_, i64>(12).map(|v| v != 0).unwrap_or(false),
                is_dead_candidate: row.get::<_, bool>(13).unwrap_or(false),
                dead_reason: row.get::<_, Option<String>>(14).unwrap_or(None),
                complexity: row.get::<_, f64>(15).unwrap_or(0.0),
                is_test_file: row.get::<_, i64>(16).map(|v| v != 0).unwrap_or(false),
                test_count: row.get::<_, i64>(17).unwrap_or(0),
                is_tested: row.get::<_, i64>(18).map(|v| v != 0).unwrap_or(false),
            })
        })?;
        let mut results = Vec::new();
        for row in rows {
            results.push(row?);
        }
        Ok(results)
    }
897
    /// Return up to `limit` nodes with the highest `in_degree` — the most-depended-on symbols.
    ///
    /// `File` and `Author` nodes are excluded; only nodes with at least one
    /// incoming edge qualify.
    pub fn get_god_nodes(&self, limit: usize) -> anyhow::Result<Vec<Node>> {
        // COALESCE supplies defaults for the nullable analysis columns so the
        // row mapper never sees NULL for them.
        let mut stmt = self.conn.prepare(
            "SELECT id, kind, name, path, line_start, line_end, language, churn, coupling, community, in_degree, out_degree, COALESCE(exported, false), COALESCE(is_dead_candidate, false), dead_reason, COALESCE(complexity, 0.0), COALESCE(is_test_file, 0), COALESCE(test_count, 0), COALESCE(is_tested, 0)
             FROM nodes
             WHERE in_degree > 0 AND kind != 'File' AND kind != 'Author'
             ORDER BY in_degree DESC
             LIMIT ?",
        )?;
        let rows = stmt.query_map(params![limit as i64], |row| {
            Ok(Node {
                id: row.get(0)?,
                kind: row.get(1)?,
                name: row.get(2)?,
                path: row.get(3)?,
                line_start: row.get(4)?,
                line_end: row.get(5)?,
                language: row.get(6)?,
                churn: row.get(7)?,
                coupling: row.get(8)?,
                community: row.get(9)?,
                in_degree: row.get(10)?,
                out_degree: row.get(11)?,
                // Mixed i64/bool decoding with defaults on failure — keep in
                // sync with the identical mappers elsewhere in this file.
                exported: row.get::<_, i64>(12).map(|v| v != 0).unwrap_or(false),
                is_dead_candidate: row.get::<_, bool>(13).unwrap_or(false),
                dead_reason: row.get::<_, Option<String>>(14).unwrap_or(None),
                complexity: row.get::<_, f64>(15).unwrap_or(0.0),
                is_test_file: row.get::<_, i64>(16).map(|v| v != 0).unwrap_or(false),
                test_count: row.get::<_, i64>(17).unwrap_or(0),
                is_tested: row.get::<_, i64>(18).map(|v| v != 0).unwrap_or(false),
            })
        })?;
        let mut results = Vec::new();
        for row in rows {
            results.push(row?);
        }
        Ok(results)
    }
936
937    /// Return all communities as [`CommunityRow`] tuples, sorted by size descending.
938    pub fn get_communities(&self) -> anyhow::Result<Vec<CommunityRow>> {
939        let mut stmt = self.conn.prepare(
940            "SELECT community, kind, name, path, in_degree
941             FROM nodes
942             WHERE community > 0
943             ORDER BY community",
944        )?;
945        let rows = stmt.query_map([], |row| {
946            Ok((
947                row.get::<_, i64>(0)?,
948                row.get::<_, String>(1)?,
949                row.get::<_, String>(2)?,
950                row.get::<_, String>(3)?,
951                row.get::<_, i64>(4)?,
952            ))
953        })?;
954
955        let mut community_map: std::collections::HashMap<i64, CommunityGroup> =
956            std::collections::HashMap::new();
957        for row in rows {
958            let (community, kind, name, _path, in_degree) = row?;
959            let entry = community_map
960                .entry(community)
961                .or_insert_with(|| (Vec::new(), 0));
962            entry.0.push((kind, in_degree, name));
963            entry.1 += 1;
964        }
965
966        let mut result: Vec<CommunityRow> = community_map
967            .into_iter()
968            .map(|(community, (mut items, count))| {
969                items.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.2.cmp(&b.2)));
970                let top_nodes: Vec<String> = items
971                    .iter()
972                    .take(5)
973                    .map(|(kind, _deg, name)| format!("{}:{}", kind, name))
974                    .collect();
975                let label = top_nodes
976                    .first()
977                    .cloned()
978                    .unwrap_or_else(|| format!("community-{}", community));
979                (community, label, count, top_nodes)
980            })
981            .collect();
982
983        result.sort_by_key(|row| std::cmp::Reverse(row.2));
984        Ok(result)
985    }
986
987    /// Reset all community assignments to 0 and clear the communities table.
988    pub fn clear_communities(&self) -> anyhow::Result<()> {
989        self.conn.execute("UPDATE nodes SET community = 0", [])?;
990        self.conn.execute("DELETE FROM communities", [])?;
991        Ok(())
992    }
993
994    /// BFS following only incoming edges — returns all nodes that depend on `id`.
995    /// Used for blast-radius analysis: if `id` changes, these nodes are affected.
996    pub fn get_dependents(&self, id: &str, depth: u8) -> anyhow::Result<Vec<Node>> {
997        let mut seen = std::collections::HashSet::new();
998        seen.insert(id.to_string());
999        let mut current = vec![id.to_string()];
1000        let mut result: Vec<Node> = Vec::new();
1001        let max_depth = depth.min(3);
1002
1003        for _ in 0..max_depth {
1004            if current.is_empty() {
1005                break;
1006            }
1007            let mut next = Vec::new();
1008            for cur_id in &current {
1009                let mut stmt = self.conn.prepare(
1010                    "SELECT DISTINCT n.id, n.kind, n.name, n.path, n.line_start, n.line_end, n.language, n.churn, n.coupling, n.community, n.in_degree, n.out_degree, COALESCE(n.exported, false), COALESCE(n.is_dead_candidate, false), n.dead_reason, COALESCE(n.complexity, 0.0), COALESCE(n.is_test_file, 0), COALESCE(n.test_count, 0), COALESCE(n.is_tested, 0)
1011                     FROM nodes n
1012                     INNER JOIN edges e ON e.src = n.id AND e.dst = ?
1013                     LIMIT 100",
1014                )?;
1015                let rows = stmt.query_map(params![cur_id], |row| {
1016                    Ok(Node {
1017                        id: row.get(0)?,
1018                        kind: row.get(1)?,
1019                        name: row.get(2)?,
1020                        path: row.get(3)?,
1021                        line_start: row.get(4)?,
1022                        line_end: row.get(5)?,
1023                        language: row.get(6)?,
1024                        churn: row.get(7)?,
1025                        coupling: row.get(8)?,
1026                        community: row.get(9)?,
1027                        in_degree: row.get(10)?,
1028                        out_degree: row.get(11)?,
1029                        exported: row.get::<_, i64>(12).map(|v| v != 0).unwrap_or(false),
1030                        is_dead_candidate: row.get::<_, bool>(13).unwrap_or(false),
1031                        dead_reason: row.get::<_, Option<String>>(14).unwrap_or(None),
1032                        complexity: row.get::<_, f64>(15).unwrap_or(0.0),
1033                        is_test_file: row.get::<_, i64>(16).map(|v| v != 0).unwrap_or(false),
1034                        test_count: row.get::<_, i64>(17).unwrap_or(0),
1035                        is_tested: row.get::<_, i64>(18).map(|v| v != 0).unwrap_or(false),
1036                    })
1037                })?;
1038                for row in rows {
1039                    let node = row?;
1040                    if seen.insert(node.id.clone()) {
1041                        next.push(node.id.clone());
1042                        result.push(node);
1043                    }
1044                }
1045            }
1046            current = next;
1047        }
1048
1049        Ok(result)
1050    }
1051
    /// Return all nodes that belong to the given community ID.
    ///
    /// No ordering is imposed; rows come back in database order.
    pub fn get_nodes_by_community(&self, community: i64) -> anyhow::Result<Vec<Node>> {
        // COALESCE supplies defaults for nullable analysis columns.
        let mut stmt = self.conn.prepare(
            "SELECT id, kind, name, path, line_start, line_end, language, churn, coupling, community, in_degree, out_degree, COALESCE(exported, false), COALESCE(is_dead_candidate, false), dead_reason, COALESCE(complexity, 0.0), COALESCE(is_test_file, 0), COALESCE(test_count, 0), COALESCE(is_tested, 0) FROM nodes WHERE community = ?",
        )?;
        let rows = stmt.query_map(params![community], |row| {
            Ok(Node {
                id: row.get(0)?,
                kind: row.get(1)?,
                name: row.get(2)?,
                path: row.get(3)?,
                line_start: row.get(4)?,
                line_end: row.get(5)?,
                language: row.get(6)?,
                churn: row.get(7)?,
                coupling: row.get(8)?,
                community: row.get(9)?,
                in_degree: row.get(10)?,
                out_degree: row.get(11)?,
                // Mixed i64/bool decoding with defaults — keep in sync with
                // the identical mappers elsewhere in this file.
                exported: row.get::<_, i64>(12).map(|v| v != 0).unwrap_or(false),
                is_dead_candidate: row.get::<_, bool>(13).unwrap_or(false),
                dead_reason: row.get::<_, Option<String>>(14).unwrap_or(None),
                complexity: row.get::<_, f64>(15).unwrap_or(0.0),
                is_test_file: row.get::<_, i64>(16).map(|v| v != 0).unwrap_or(false),
                test_count: row.get::<_, i64>(17).unwrap_or(0),
                is_tested: row.get::<_, i64>(18).map(|v| v != 0).unwrap_or(false),
            })
        })?;
        let mut nodes = Vec::new();
        for row in rows {
            nodes.push(row?);
        }
        Ok(nodes)
    }
1086
1087    /// Set `is_dead_candidate = true` and `dead_reason` for each `(node_id, reason)` pair.
1088    pub fn mark_dead_candidates(&self, items: &[(String, String)]) -> anyhow::Result<()> {
1089        // items = vec of (node_id, dead_reason)
1090        if items.is_empty() {
1091            return Ok(());
1092        }
1093        let mut stmt = self
1094            .conn
1095            .prepare("UPDATE nodes SET is_dead_candidate = 1, dead_reason = ? WHERE id = ?")?;
1096        for (id, reason) in items {
1097            stmt.execute(params![reason, id])?;
1098        }
1099        Ok(())
1100    }
1101
1102    /// Return `(total_dead_candidates, high_confidence_count)` from the DB.
1103    pub fn get_dead_code_stats(&self) -> anyhow::Result<(i64, i64)> {
1104        // Returns (total_candidates, high_confidence_count)
1105        let total: i64 = self
1106            .conn
1107            .query_row(
1108                "SELECT COUNT(*) FROM nodes WHERE is_dead_candidate = 1",
1109                [],
1110                |r| r.get(0),
1111            )
1112            .unwrap_or(0);
1113        // High confidence = unreachable or disconnected reasons
1114        let high: i64 = self.conn.query_row(
1115            "SELECT COUNT(*) FROM nodes WHERE is_dead_candidate = 1 AND dead_reason IN ('unreachable', 'disconnected')", [], |r| r.get(0)
1116        ).unwrap_or(0);
1117        Ok((total, high))
1118    }
1119
1120    /// Return all edges where both endpoints belong to the given community.
1121    pub fn get_edges_by_community(&self, community: i64) -> anyhow::Result<Vec<Edge>> {
1122        let mut stmt = self.conn.prepare(
1123            "SELECT DISTINCT e.id, e.src, e.dst, e.kind, e.weight, e.confidence
1124             FROM edges e
1125             INNER JOIN nodes n1 ON e.src = n1.id AND n1.community = ?
1126             INNER JOIN nodes n2 ON e.dst = n2.id AND n2.community = ?",
1127        )?;
1128        let rows = stmt.query_map(params![community, community], |row| {
1129            Ok(Edge {
1130                id: row.get(0)?,
1131                src: row.get(1)?,
1132                dst: row.get(2)?,
1133                kind: row.get(3)?,
1134                weight: row.get(4)?,
1135                confidence: row.get(5)?,
1136            })
1137        })?;
1138        let mut edges = Vec::new();
1139        for row in rows {
1140            edges.push(row?);
1141        }
1142        Ok(edges)
1143    }
1144
1145    // ── File hashes for incremental indexing ────────────────────────────────
1146
1147    /// Load the SHA-256 content hashes of all previously indexed files (used for incremental indexing).
1148    pub fn get_file_hashes(&self) -> anyhow::Result<std::collections::HashMap<String, String>> {
1149        let mut stmt = self.conn.prepare("SELECT path, hash FROM file_hashes")?;
1150        let rows = stmt.query_map([], |row| {
1151            Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
1152        })?;
1153        let mut result = std::collections::HashMap::new();
1154        for row in rows {
1155            let (path, hash) = row?;
1156            result.insert(path, hash);
1157        }
1158        Ok(result)
1159    }
1160
1161    /// Record or update the SHA-256 hash for a single file path.
1162    pub fn set_file_hash(&self, path: &str, hash: &str) -> anyhow::Result<()> {
1163        self.conn.execute(
1164            "INSERT OR REPLACE INTO file_hashes (path, hash) VALUES (?, ?)",
1165            params![path, hash],
1166        )?;
1167        Ok(())
1168    }
1169
1170    /// Remove stored file hashes for deleted or moved files.
1171    pub fn remove_file_hashes(&self, paths: &[String]) -> anyhow::Result<()> {
1172        if paths.is_empty() {
1173            return Ok(());
1174        }
1175        let placeholders = paths.iter().map(|_| "?").collect::<Vec<_>>().join(",");
1176        let sql = format!("DELETE FROM file_hashes WHERE path IN ({})", placeholders);
1177        let mut stmt = self.conn.prepare(&sql)?;
1178        let params: Vec<&dyn duckdb::ToSql> =
1179            paths.iter().map(|p| p as &dyn duckdb::ToSql).collect();
1180        stmt.execute(params.as_slice())?;
1181        Ok(())
1182    }
1183
1184    /// Delete all nodes (and their connected edges) whose path is in `paths`.
1185    ///
1186    /// Used during incremental re-indexing to remove stale data for changed files.
1187    pub fn delete_nodes_by_paths(&self, paths: &[String]) -> anyhow::Result<usize> {
1188        if paths.is_empty() {
1189            return Ok(0);
1190        }
1191        let placeholders = paths.iter().map(|_| "?").collect::<Vec<_>>().join(",");
1192        // Delete edges connected to nodes from these paths first
1193        let sql_edges = format!(
1194            "DELETE FROM edges WHERE src IN (SELECT id FROM nodes WHERE path IN ({})) OR dst IN (SELECT id FROM nodes WHERE path IN ({}))",
1195            placeholders, placeholders
1196        );
1197        let mut stmt_edges = self.conn.prepare(&sql_edges)?;
1198        let params_edges: Vec<&dyn duckdb::ToSql> = paths
1199            .iter()
1200            .chain(paths.iter())
1201            .map(|p| p as &dyn duckdb::ToSql)
1202            .collect();
1203        stmt_edges.execute(params_edges.as_slice())?;
1204
1205        // Delete nodes
1206        let sql_nodes = format!("DELETE FROM nodes WHERE path IN ({})", placeholders);
1207        let mut stmt_nodes = self.conn.prepare(&sql_nodes)?;
1208        let params_nodes: Vec<&dyn duckdb::ToSql> =
1209            paths.iter().map(|p| p as &dyn duckdb::ToSql).collect();
1210        let count = stmt_nodes.execute(params_nodes.as_slice())?;
1211        Ok(count)
1212    }
1213
1214    /// Store a doc comment string on a node (used by the docs-coverage analysis pass).
1215    pub fn update_node_doc_comment(&self, id: &str, doc: &str) -> anyhow::Result<()> {
1216        self.conn.execute(
1217            "UPDATE nodes SET doc_comment = ? WHERE id = ?",
1218            params![doc, id],
1219        )?;
1220        Ok(())
1221    }
1222
1223    /// Write a cyclomatic-complexity score back to a single node.
1224    pub fn update_node_complexity(&self, id: &str, complexity: f64) -> anyhow::Result<()> {
1225        self.conn.execute(
1226            "UPDATE nodes SET complexity = ? WHERE id = ?",
1227            params![complexity, id],
1228        )?;
1229        Ok(())
1230    }
1231
    /// Return up to `limit` `Function` nodes with `complexity >= min_score`, sorted descending.
    ///
    /// Nodes whose complexity was never computed count as 0.0 via COALESCE,
    /// so they only appear when `min_score <= 0.0`.
    pub fn get_nodes_by_complexity(
        &self,
        limit: usize,
        min_score: f64,
    ) -> anyhow::Result<Vec<Node>> {
        let mut stmt = self.conn.prepare(
            "SELECT id, kind, name, path, line_start, line_end, language, churn, coupling, community, in_degree, out_degree, COALESCE(exported, false), COALESCE(is_dead_candidate, false), dead_reason, COALESCE(complexity, 0.0), COALESCE(is_test_file, 0), COALESCE(test_count, 0), COALESCE(is_tested, 0)
             FROM nodes
             WHERE kind = 'Function' AND COALESCE(complexity, 0.0) >= ?
             ORDER BY complexity DESC
             LIMIT ?",
        )?;
        let rows = stmt.query_map(params![min_score, limit as i64], |row| {
            Ok(Node {
                id: row.get(0)?,
                kind: row.get(1)?,
                name: row.get(2)?,
                path: row.get(3)?,
                line_start: row.get(4)?,
                line_end: row.get(5)?,
                language: row.get(6)?,
                churn: row.get(7)?,
                coupling: row.get(8)?,
                community: row.get(9)?,
                in_degree: row.get(10)?,
                out_degree: row.get(11)?,
                // Mixed i64/bool decoding with defaults — keep in sync with
                // the identical mappers elsewhere in this file.
                exported: row.get::<_, i64>(12).map(|v| v != 0).unwrap_or(false),
                is_dead_candidate: row.get::<_, bool>(13).unwrap_or(false),
                dead_reason: row.get::<_, Option<String>>(14).unwrap_or(None),
                complexity: row.get::<_, f64>(15).unwrap_or(0.0),
                is_test_file: row.get::<_, i64>(16).map(|v| v != 0).unwrap_or(false),
                test_count: row.get::<_, i64>(17).unwrap_or(0),
                is_tested: row.get::<_, i64>(18).map(|v| v != 0).unwrap_or(false),
            })
        })?;
        let mut results = Vec::new();
        for row in rows {
            results.push(row?);
        }
        Ok(results)
    }
1274
    /// Returns (overall_pct, Vec<(community_id, documented, total)>, Vec<undocumented high-coupling nodes>)
    ///
    /// Coverage is computed over `Function` and `Class` nodes whose path does
    /// not contain "test"; a node counts as documented when `doc_comment` is
    /// non-NULL and non-empty. The third element lists the ten
    /// most-depended-on undocumented functions (by `in_degree`).
    pub fn get_docs_coverage(&self) -> anyhow::Result<DocsCoverage> {
        // Overall percentage. NULLIF guards the division when no node
        // matches; any query error degrades to 0.0 rather than propagating.
        let overall: f64 = self
            .conn
            .query_row(
                "SELECT COALESCE(
                    CAST(SUM(CASE WHEN doc_comment IS NOT NULL AND doc_comment != '' THEN 1 ELSE 0 END) AS DOUBLE)
                    / NULLIF(CAST(COUNT(*) AS DOUBLE), 0.0) * 100.0,
                    0.0)
                 FROM nodes WHERE kind IN ('Function', 'Class') AND path NOT LIKE '%test%'",
                [],
                |r| r.get(0),
            )
            .unwrap_or(0.0);

        // Per-community (documented, total) tallies, ordered by community id.
        let mut by_community = Vec::new();
        let mut stmt = self.conn.prepare(
            "SELECT community,
                    SUM(CASE WHEN doc_comment IS NOT NULL AND doc_comment != '' THEN 1 ELSE 0 END) as documented,
                    COUNT(*) as total
             FROM nodes
             WHERE kind IN ('Function', 'Class') AND path NOT LIKE '%test%'
             GROUP BY community
             ORDER BY community",
        )?;
        let comm_rows = stmt.query_map([], |row| {
            Ok((
                row.get::<_, i64>(0)?,
                row.get::<_, i64>(1)?,
                row.get::<_, i64>(2)?,
            ))
        })?;
        for row in comm_rows {
            by_community.push(row?);
        }

        // Top 10 undocumented functions by incoming-edge count.
        let mut undoc_stmt = self.conn.prepare(
            "SELECT id, kind, name, path, line_start, line_end, language, churn, coupling, community, in_degree, out_degree, COALESCE(exported, false), COALESCE(is_dead_candidate, false), dead_reason, COALESCE(complexity, 0.0), COALESCE(is_test_file, 0), COALESCE(test_count, 0), COALESCE(is_tested, 0)
             FROM nodes
             WHERE kind = 'Function' AND (doc_comment IS NULL OR doc_comment = '')
             ORDER BY in_degree DESC
             LIMIT 10",
        )?;
        let undoc_rows = undoc_stmt.query_map([], |row| {
            Ok(Node {
                id: row.get(0)?,
                kind: row.get(1)?,
                name: row.get(2)?,
                path: row.get(3)?,
                line_start: row.get(4)?,
                line_end: row.get(5)?,
                language: row.get(6)?,
                churn: row.get(7)?,
                coupling: row.get(8)?,
                community: row.get(9)?,
                in_degree: row.get(10)?,
                out_degree: row.get(11)?,
                // Mixed i64/bool decoding with defaults — keep in sync with
                // the identical mappers elsewhere in this file.
                exported: row.get::<_, i64>(12).map(|v| v != 0).unwrap_or(false),
                is_dead_candidate: row.get::<_, bool>(13).unwrap_or(false),
                dead_reason: row.get::<_, Option<String>>(14).unwrap_or(None),
                complexity: row.get::<_, f64>(15).unwrap_or(0.0),
                is_test_file: row.get::<_, i64>(16).map(|v| v != 0).unwrap_or(false),
                test_count: row.get::<_, i64>(17).unwrap_or(0),
                is_tested: row.get::<_, i64>(18).map(|v| v != 0).unwrap_or(false),
            })
        })?;
        let mut undocumented = Vec::new();
        for row in undoc_rows {
            undocumented.push(row?);
        }

        Ok((overall, by_community, undocumented))
    }
1348
1349    /// Insert or replace clone-pair records, returning the count written.
1350    pub fn upsert_clones(&self, clones: &[CloneRow]) -> anyhow::Result<usize> {
1351        if clones.is_empty() {
1352            return Ok(0);
1353        }
1354        let mut count = 0;
1355        let mut stmt = self.conn.prepare(
1356            "INSERT OR REPLACE INTO clones (id, node_a, node_b, similarity, kind) VALUES (?, ?, ?, ?, ?)",
1357        )?;
1358        for c in clones {
1359            stmt.execute(params![c.id, c.node_a, c.node_b, c.similarity, c.kind])?;
1360            count += 1;
1361        }
1362        Ok(count)
1363    }
1364
1365    /// Query clone pairs above `min_similarity`, optionally filtered by `kind` (`"exact"` or `"near"`).
1366    pub fn get_clones(
1367        &self,
1368        min_similarity: f64,
1369        kind_filter: Option<&str>,
1370    ) -> anyhow::Result<Vec<CloneRow>> {
1371        let (sql, use_kind) = if kind_filter.is_some() {
1372            (
1373                "SELECT id, node_a, node_b, similarity, kind FROM clones WHERE similarity >= ? AND kind = ? ORDER BY similarity DESC",
1374                true,
1375            )
1376        } else {
1377            (
1378                "SELECT id, node_a, node_b, similarity, kind FROM clones WHERE similarity >= ? ORDER BY similarity DESC",
1379                false,
1380            )
1381        };
1382
1383        let mut stmt = self.conn.prepare(sql)?;
1384        let map_row = |row: &duckdb::Row| {
1385            Ok(CloneRow {
1386                id: row.get(0)?,
1387                node_a: row.get(1)?,
1388                node_b: row.get(2)?,
1389                similarity: row.get::<_, f32>(3)? as f64,
1390                kind: row.get(4)?,
1391            })
1392        };
1393
1394        let rows = if use_kind {
1395            stmt.query_map(params![min_similarity, kind_filter.unwrap_or("")], map_row)?
1396        } else {
1397            stmt.query_map(params![min_similarity], map_row)?
1398        };
1399
1400        let mut results = Vec::new();
1401        for row in rows {
1402            results.push(row?);
1403        }
1404        Ok(results)
1405    }
1406
1407    /// Delete all clone-pair records from the database.
1408    pub fn clear_clones(&self) -> anyhow::Result<()> {
1409        self.conn.execute("DELETE FROM clones", [])?;
1410        Ok(())
1411    }
1412
1413    /// Flag every node whose path is in `paths` as a test file (`is_test_file = true`).
1414    pub fn mark_test_files(&self, paths: &[String]) -> anyhow::Result<()> {
1415        if paths.is_empty() {
1416            return Ok(());
1417        }
1418        let mut stmt = self
1419            .conn
1420            .prepare("UPDATE nodes SET is_test_file = 1 WHERE path = ?")?;
1421        for path in paths {
1422            stmt.execute(params![path])?;
1423        }
1424        Ok(())
1425    }
1426
    /// After inserting TESTS edges, compute test_count and is_tested for non-test nodes.
    ///
    /// `test_count` is recomputed for EVERY node as its number of incoming
    /// `TESTS` edges; `is_tested` is then derived from it, but only for rows
    /// with `is_test_file = 0` (test-file rows keep their previous flag).
    pub fn update_test_coverage(&self) -> anyhow::Result<()> {
        self.conn.execute_batch(
            "UPDATE nodes SET test_count = (
                SELECT COUNT(*) FROM edges
                WHERE edges.dst = nodes.id AND edges.kind = 'TESTS'
             );
             UPDATE nodes SET is_tested = (test_count > 0)
             WHERE is_test_file = 0;",
        )?;
        Ok(())
    }
1439
    /// Returns (overall_pct, tested_count, untested_count, gaps ranked by risk)
    ///
    /// Coverage is computed over non-test-file `Function`/`Class` nodes. The
    /// `gaps` list holds up to `top_n` untested symbols ranked by
    /// `churn * in_degree + in_degree * 0.5` — i.e. frequently changed,
    /// heavily depended-on code first.
    pub fn get_test_coverage_summary(
        &self,
        top_n: usize,
    ) -> anyhow::Result<(f64, i64, i64, Vec<Node>)> {
        // Both counts are best-effort: query errors degrade to 0 rather than
        // propagating.
        let tested: i64 = self
            .conn
            .query_row(
                "SELECT COUNT(*) FROM nodes WHERE kind IN ('Function','Class') AND is_test_file = 0 AND is_tested = 1",
                [],
                |r| r.get(0),
            )
            .unwrap_or(0);
        let total: i64 = self
            .conn
            .query_row(
                "SELECT COUNT(*) FROM nodes WHERE kind IN ('Function','Class') AND is_test_file = 0",
                [],
                |r| r.get(0),
            )
            .unwrap_or(0);

        // Guard the division when nothing matches.
        let overall_pct = if total > 0 {
            (tested as f64 / total as f64) * 100.0
        } else {
            0.0
        };

        let mut gap_stmt = self.conn.prepare(
            "SELECT id, kind, name, path, line_start, line_end, language, churn, coupling, community, in_degree, out_degree, COALESCE(exported, false), COALESCE(is_dead_candidate, false), dead_reason, COALESCE(complexity, 0.0), COALESCE(is_test_file, 0), COALESCE(test_count, 0), COALESCE(is_tested, 0)
             FROM nodes
             WHERE kind IN ('Function','Class') AND is_test_file = 0 AND COALESCE(is_tested, 0) = 0
             ORDER BY (churn * CAST(in_degree AS DOUBLE) + CAST(in_degree AS DOUBLE) * 0.5) DESC
             LIMIT ?",
        )?;
        let gap_rows = gap_stmt.query_map(params![top_n as i64], |row| {
            Ok(Node {
                id: row.get(0)?,
                kind: row.get(1)?,
                name: row.get(2)?,
                path: row.get(3)?,
                line_start: row.get(4)?,
                line_end: row.get(5)?,
                language: row.get(6)?,
                churn: row.get(7)?,
                coupling: row.get(8)?,
                community: row.get(9)?,
                in_degree: row.get(10)?,
                out_degree: row.get(11)?,
                // Mixed i64/bool decoding with defaults — keep in sync with
                // the identical mappers elsewhere in this file.
                exported: row.get::<_, i64>(12).map(|v| v != 0).unwrap_or(false),
                is_dead_candidate: row.get::<_, bool>(13).unwrap_or(false),
                dead_reason: row.get::<_, Option<String>>(14).unwrap_or(None),
                complexity: row.get::<_, f64>(15).unwrap_or(0.0),
                is_test_file: row.get::<_, i64>(16).map(|v| v != 0).unwrap_or(false),
                test_count: row.get::<_, i64>(17).unwrap_or(0),
                is_tested: row.get::<_, i64>(18).map(|v| v != 0).unwrap_or(false),
            })
        })?;
        let mut gaps = Vec::new();
        for row in gap_rows {
            gaps.push(row?);
        }

        Ok((overall_pct, tested, total - tested, gaps))
    }
1505
1506    /// Insert or replace a timeline snapshot entry.
1507    pub fn upsert_snapshot(&self, entry: &SnapshotEntry) -> anyhow::Result<()> {
1508        self.conn.execute(
1509            "INSERT OR REPLACE INTO snapshots (id, commit_sha, commit_date, commit_msg, node_count, edge_count, snapshot_data)
1510             VALUES (?, ?, ?, ?, ?, ?, ?)",
1511            params![
1512                entry.id,
1513                entry.commit_sha,
1514                entry.commit_date,
1515                entry.commit_msg,
1516                entry.node_count,
1517                entry.edge_count,
1518                entry.snapshot_data,
1519            ],
1520        )?;
1521        Ok(())
1522    }
1523
1524    /// Return up to `limit` timeline snapshots, most recent first.
1525    pub fn get_snapshots(&self, limit: usize) -> anyhow::Result<Vec<SnapshotEntry>> {
1526        let mut stmt = self.conn.prepare(
1527            "SELECT id, commit_sha, commit_date, commit_msg, COALESCE(node_count,0), COALESCE(edge_count,0), snapshot_data
1528             FROM snapshots ORDER BY commit_date DESC LIMIT ?",
1529        )?;
1530        let rows = stmt.query_map(params![limit as i64], |row| {
1531            Ok(SnapshotEntry {
1532                id: row.get(0)?,
1533                commit_sha: row.get(1)?,
1534                commit_date: row.get(2)?,
1535                commit_msg: row.get(3)?,
1536                node_count: row.get(4)?,
1537                edge_count: row.get(5)?,
1538                snapshot_data: row.get(6)?,
1539            })
1540        })?;
1541        let mut result = Vec::new();
1542        for row in rows {
1543            result.push(row?);
1544        }
1545        Ok(result)
1546    }
1547
1548    /// Look up a snapshot by full SHA or short prefix, returning `None` if not cached.
1549    pub fn get_snapshot_by_sha(&self, sha: &str) -> anyhow::Result<Option<SnapshotEntry>> {
1550        let mut stmt = self.conn.prepare(
1551            "SELECT id, commit_sha, commit_date, commit_msg, COALESCE(node_count,0), COALESCE(edge_count,0), snapshot_data
1552             FROM snapshots WHERE commit_sha = ? OR commit_sha LIKE ? LIMIT 1",
1553        )?;
1554        let prefix = format!("{}%", sha);
1555        let mut rows = stmt.query_map(params![sha, prefix], |row| {
1556            Ok(SnapshotEntry {
1557                id: row.get(0)?,
1558                commit_sha: row.get(1)?,
1559                commit_date: row.get(2)?,
1560                commit_msg: row.get(3)?,
1561                node_count: row.get(4)?,
1562                edge_count: row.get(5)?,
1563                snapshot_data: row.get(6)?,
1564            })
1565        })?;
1566        match rows.next() {
1567            Some(Ok(entry)) => Ok(Some(entry)),
1568            _ => Ok(None),
1569        }
1570    }
1571
1572    /// Total number of cached timeline snapshots.
1573    pub fn snapshot_count(&self) -> i64 {
1574        self.conn
1575            .query_row("SELECT COUNT(*) FROM snapshots", [], |r| r.get(0))
1576            .unwrap_or(0)
1577    }
1578}
1579
1580pub fn repo_hash(path: &Path) -> String {
1581    let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
1582    let path_str = canonical.to_string_lossy().to_string();
1583    let mut hasher = Sha256::new();
1584    hasher.update(path_str.as_bytes());
1585    format!("{:x}", hasher.finalize())[..16].to_string()
1586}