cartog-db 0.22.0

SQLite persistence layer for cartog code graph
Documentation
//! 6-tier name-based edge resolution, full and scoped (incremental).
//!
//! Part of the [`Database`](super::Database) impl, split out of `lib.rs` for navigability.

use super::*;

impl Database {
    // ── Edge Resolution ──

    /// Resolve target_name → target_id for all unresolved edges.
    ///
    /// Runs two passes so that import edges resolved in pass 1 enable
    /// import-path resolution (tier 2) for non-import edges in pass 2.
    ///
    /// 6-tier priority resolution (per pass):
    /// 1. Same file — symbol with matching name in the same file
    /// 2. Import-path — follow resolved imports to find the target in the imported file
    /// 3. Same directory — symbol in a file in the same directory
    /// 4. Parent scope preference — when multiple global matches, prefer same parent scope
    /// 5. Unique project-wide match — exactly one symbol with that name globally
    /// 6. Class over constructor — when exactly 2 matches and one is a class, prefer class
    pub fn resolve_edges(&self) -> Result<u32> {
        let tx = self.conn.unchecked_transaction()?;
        let total = self.resolve_edges_in_tx()?;
        tx.commit()?;
        Ok(total)
    }

    /// Like [`Self::resolve_edges`] but assumes the caller already holds an
    /// open transaction.
    pub fn resolve_edges_in_tx(&self) -> Result<u32> {
        let mut total_resolved = 0u32;
        for _pass in 0..2 {
            let resolved = self.resolve_edges_pass()?;
            if resolved == 0 {
                break;
            }
            total_resolved += resolved;
        }
        Ok(total_resolved)
    }

    fn resolve_edges_pass(&self) -> Result<u32> {
        // Skip state=2 (unresolvable) edges: the heuristic can't resolve what
        // LSP definitively gave up on, and re-walking them every dirty reindex
        // burns measurable time on large repos. They re-enter via
        // `reset_unresolvable_for_names` when a matching symbol appears.
        let mut unresolved_stmt = self.conn.prepare(
            "SELECT e.id, e.target_name, e.file_path, e.source_id
             FROM edges e WHERE e.target_id IS NULL AND e.resolution_state = 0",
        )?;

        let unresolved: Vec<(i64, String, String, String)> = unresolved_stmt
            .query_map([], |row| {
                Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?))
            })?
            .collect::<std::result::Result<Vec<_>, _>>()?;

        self.resolve_edge_batch(&unresolved)
    }

    /// 6-tier heuristic resolution for a batch of unresolved edges.
    ///
    /// Caller is responsible for transaction wrapping — see the public
    /// [`Self::resolve_edges`] / [`Self::resolve_edges_scoped`] helpers.
    fn resolve_edge_batch(&self, unresolved: &[(i64, String, String, String)]) -> Result<u32> {
        let mut resolved = 0u32;

        let mut same_file_stmt = self
            .conn
            .prepare("SELECT id FROM symbols WHERE name = ?1 AND file_path = ?2 LIMIT 1")?;

        let mut import_resolve_stmt = self.conn.prepare(
            "SELECT s.id FROM symbols s
             INNER JOIN edges ie ON ie.kind = 'imports' AND ie.target_name = ?1
                 AND ie.target_id IS NOT NULL
             INNER JOIN symbols is2 ON is2.id = ie.source_id AND is2.file_path = ?2
             INNER JOIN symbols resolved ON resolved.id = ie.target_id
             WHERE s.name = ?1 AND s.kind != 'import'
                 AND s.file_path = resolved.file_path
             LIMIT 1",
        )?;

        let mut same_dir_stmt = self
            .conn
            .prepare("SELECT id FROM symbols WHERE name = ?1 AND file_path LIKE ?2 LIMIT 1")?;

        let mut parent_scope_stmt = self.conn.prepare(
            "SELECT s.id FROM symbols s
             INNER JOIN symbols source ON source.id = ?2
             WHERE s.name = ?1 AND s.parent_id = source.parent_id AND s.id != source.id
             LIMIT 1",
        )?;

        let mut anywhere_stmt = self
            .conn
            .prepare("SELECT id, kind FROM symbols WHERE name = ?1 AND kind != 'import' LIMIT 3")?;

        // Heuristic resolve must flip state=1 alongside target_id; otherwise
        // unresolved_edges() (state=0 filter) would still surface the edge to
        // the next LSP pass, wasting a server roundtrip on already-known answers.
        let mut update_stmt = self.conn.prepare(
            "UPDATE edges SET target_id = ?1, resolution_state = 1, resolution_source = ?2
             WHERE id = ?3",
        )?;

        for (edge_id, target_name, edge_file, source_id) in unresolved {
            let simple_name = target_name.rsplit('.').next().unwrap_or(target_name);

            // 1) Same file
            let target_id: Option<String> = same_file_stmt
                .query_row(params![simple_name, edge_file], |row| row.get(0))
                .optional()?;

            if let Some(tid) = target_id {
                update_stmt.execute(params![tid, EdgeProvenance::SameFile.as_str(), edge_id])?;
                resolved += 1;
                continue;
            }

            // 2) Import-path resolution
            let target_id: Option<String> = import_resolve_stmt
                .query_row(params![simple_name, edge_file], |row| row.get(0))
                .optional()?;

            if let Some(tid) = target_id {
                update_stmt.execute(params![tid, EdgeProvenance::ImportPath.as_str(), edge_id])?;
                resolved += 1;
                continue;
            }

            // 3) Same directory
            let dir = edge_file
                .rsplit_once('/')
                .map(|(d, _)| format!("{d}/%"))
                .unwrap_or_default();

            if !dir.is_empty() {
                let target_id: Option<String> = same_dir_stmt
                    .query_row(params![simple_name, dir], |row| row.get(0))
                    .optional()?;

                if let Some(tid) = target_id {
                    update_stmt.execute(params![tid, EdgeProvenance::SameDir.as_str(), edge_id])?;
                    resolved += 1;
                    continue;
                }
            }

            // 4) Parent scope preference
            let target_id: Option<String> = parent_scope_stmt
                .query_row(params![simple_name, source_id], |row| row.get(0))
                .optional()?;

            if let Some(tid) = target_id {
                update_stmt.execute(params![tid, EdgeProvenance::ParentScope.as_str(), edge_id])?;
                resolved += 1;
                continue;
            }

            // 5+6) Project-wide: unique match, or class-over-constructor disambiguation
            let mut rows = anywhere_stmt.query(params![simple_name])?;
            let mut matches: Vec<(String, String)> = Vec::new();
            while let Some(row) = rows.next()? {
                matches.push((row.get(0)?, row.get(1)?));
                if matches.len() == 3 {
                    break;
                }
            }
            drop(rows);

            // Tier and target are decided together so the provenance label can
            // never drift from the branch that picked the id.
            let resolution = match matches.as_slice() {
                [(id, _)] => Some((id, EdgeProvenance::UniqueGlobal)),
                [a, b] => disambiguate_two(a, b).map(|id| (id, EdgeProvenance::KindDisambig)),
                _ => None,
            };

            if let Some((tid, prov)) = resolution {
                update_stmt.execute(params![tid, prov.as_str(), edge_id])?;
                resolved += 1;
            }
        }

        Ok(resolved)
    }

    /// Compute and store in-degree centrality for all symbols.
    ///
    /// In-degree = number of resolved incoming edges (calls, imports, inherits, etc.).
    /// Higher in-degree means the symbol is referenced more across the codebase.
    /// Resets all in-degree values to 0 first, then batch-updates from the edges table.
    ///
    /// tx-safe: two unconditional statements participate in any active outer
    /// transaction — see [`Self::begin_indexing_tx`].
    pub fn compute_in_degrees(&self) -> Result<u32> {
        self.conn.execute("UPDATE symbols SET in_degree = 0", [])?;

        // CTE computes counts once; the UPDATE applies them.
        // Avoids a correlated subquery per symbol (O(n*m) → O(n+m)).
        let updated = self.conn.execute(
            "WITH counts AS (
                SELECT target_id, COUNT(*) AS cnt
                FROM edges WHERE target_id IS NOT NULL
                GROUP BY target_id
            )
            UPDATE symbols SET in_degree = (
                SELECT cnt FROM counts WHERE counts.target_id = symbols.id
            )
            WHERE id IN (SELECT target_id FROM counts)",
            [],
        )?;

        Ok(updated as u32)
    }

    // ── Scoped resolution (incremental indexing) ──

    /// Invalidate resolved edges that point into any of the dirty files.
    ///
    /// When a file is re-indexed, its symbols may have been renamed/removed.
    /// Edges from *unchanged* files that previously resolved to those symbols
    /// must be cleared so they can be re-resolved against the new symbol set.
    ///
    /// tx-safe: single statement — see [`Self::begin_indexing_tx`].
    pub fn invalidate_edges_targeting(
        &self,
        dirty_files: &std::collections::HashSet<String>,
    ) -> Result<u32> {
        if dirty_files.is_empty() {
            return Ok(0);
        }
        // After file re-indexing, edges from unchanged files may point to
        // symbol IDs that no longer exist (removed or renamed symbols).
        // Set these dangling references to NULL AND reset resolution_state so
        // the heuristic + LSP passes get another shot. Without the state reset
        // the edge would stay at state=1 (resolved) but with target_id NULL —
        // permanently invisible to `unresolved_edges()`.
        let n = self.conn.execute(
            "UPDATE edges SET target_id = NULL, resolution_state = 0, resolution_source = NULL
             WHERE target_id IS NOT NULL
               AND NOT EXISTS (SELECT 1 FROM symbols WHERE symbols.id = edges.target_id)",
            [],
        )?;
        Ok(n as u32)
    }

    /// Resolve edges scoped to dirty files only.
    ///
    /// Processes: edges originating from dirty files (freshly extracted)
    /// and edges whose target was just invalidated (target_id set to NULL).
    /// Uses the same 6-tier heuristic as `resolve_edges`.
    /// Resolve edges after scoped invalidation.
    ///
    /// After `invalidate_edges_targeting` has cleared target_ids for edges
    /// pointing into dirty files, this re-resolves all currently unresolved edges.
    /// Fewer edges are unresolved compared to a first-time full resolve.
    pub fn resolve_edges_scoped(
        &self,
        dirty_files: &std::collections::HashSet<String>,
    ) -> Result<u32> {
        let tx = self.conn.unchecked_transaction()?;
        let total = self.resolve_edges_scoped_in_tx(dirty_files)?;
        tx.commit()?;
        Ok(total)
    }

    /// Like [`Self::resolve_edges_scoped`] but assumes the caller already
    /// holds an open transaction.
    pub fn resolve_edges_scoped_in_tx(
        &self,
        dirty_files: &std::collections::HashSet<String>,
    ) -> Result<u32> {
        if dirty_files.is_empty() {
            return Ok(0);
        }
        // After invalidation, the set of unresolved edges is naturally scoped:
        // only edges from dirty files (freshly extracted) or targeting dirty files
        // (just invalidated) have target_id = NULL.
        // Reuse the same 2-pass resolution.
        self.resolve_edges_in_tx()
    }

    /// Recompute in-degree centrality only for symbols in/around dirty files.
    ///
    /// tx-safe: every internal statement participates in any active outer
    /// transaction — see [`Self::begin_indexing_tx`]. Does NOT open one of
    /// its own, unlike the batched `*_in_tx` helpers; outside an outer
    /// transaction the per-file resets are not atomic with the recompute.
    pub fn compute_in_degrees_scoped(
        &self,
        dirty_files: &std::collections::HashSet<String>,
    ) -> Result<u32> {
        if dirty_files.is_empty() {
            return Ok(0);
        }

        // Reset in-degree for symbols in dirty files
        for file in dirty_files {
            self.conn.execute(
                "UPDATE symbols SET in_degree = 0 WHERE file_path = ?1",
                params![file],
            )?;
        }

        // Also reset symbols that are targets of edges from dirty files
        // (their in-degree may have changed)
        for file in dirty_files {
            self.conn.execute(
                "UPDATE symbols SET in_degree = 0
                 WHERE id IN (
                     SELECT DISTINCT e.target_id FROM edges e
                     WHERE e.file_path = ?1 AND e.target_id IS NOT NULL
                 )",
                params![file],
            )?;
        }

        // Recompute for all symbols with in_degree = 0 that have incoming edges
        let updated = self.conn.execute(
            "WITH counts AS (
                SELECT target_id, COUNT(*) AS cnt
                FROM edges WHERE target_id IS NOT NULL
                GROUP BY target_id
            )
            UPDATE symbols SET in_degree = (
                SELECT cnt FROM counts WHERE counts.target_id = symbols.id
            )
            WHERE in_degree = 0
              AND id IN (SELECT target_id FROM counts)",
            [],
        )?;

        Ok(updated as u32)
    }
}