ckg-storage 1.1.2

CozoDB-backed storage layer for ckg (per-repo + registry DBs).
Documentation
//! Cross-file call resolution and test-edge detection.
//!
//! `resolve_cross_file_calls`: rewrites unresolved `Calls.dst` bare-names to
//! canonical Symbol ids using a three-tier match (exact name → qname suffix →
//! leaf segment), all requiring a unique match to avoid ambiguous rewrites.
//!
//! `detect_test_edges`: emits `Tests` relation edges from `test_*` functions
//! to their stripped-prefix target symbol when the match is unique.

use std::collections::BTreeMap;

use ckg_core::Result;
use cozo::{DataValue, ScriptMutability};

use super::map_err;
use super::Storage;

/// Strip `<…>`-delimited generic / lifetime parameter lists from a qname.
///
/// Examples: `Ctx<'_>::push_symbol` → `Ctx::push_symbol`,
/// `HashMap<K, V>::insert` → `HashMap::insert`. Brackets are balanced, so
/// nested generics collapse too (`Vec<Option<T>>::len` → `Vec::len`).
///
/// A `->` return arrow inside a generic list is part of a function type,
/// not a closing bracket: `Box<dyn Fn(A) -> B>::call` → `Box::call`.
///
/// Edge cases:
/// * Input without any `<` is returned unchanged, so the function is a
///   no-op for qnames that do not use angle-bracket generics.
/// * A `>` with no open `<` is copied through verbatim.
/// * An unclosed `<` swallows the remainder of the input.
fn strip_generics(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut depth: usize = 0;
    // Previous character, needed to tell the `>` of `->` from a bracket.
    let mut prev: Option<char> = None;
    for c in s.chars() {
        match c {
            '<' => depth += 1,
            // `->` inside generics (e.g. `Fn(A) -> B`) must not close a
            // bracket, otherwise the rest of the generic list leaks out.
            '>' if depth > 0 && prev == Some('-') => {}
            '>' if depth > 0 => depth -= 1,
            _ if depth == 0 => out.push(c),
            _ => {}
        }
        prev = Some(c);
    }
    out
}

/// Last identifier segment of a `::`-scoped and/or `.`-dotted path.
///
/// `commands::index::run` → `run`, `pkg.module.fn` → `fn`. The text after
/// the final `::` is taken first, then the text after the final `.` within
/// that remainder. Input containing neither separator is returned as-is; a
/// trailing separator yields an empty string.
fn leaf_of(s: &str) -> &str {
    let unscoped = s.rsplit_once("::").map_or(s, |(_, tail)| tail);
    unscoped.rsplit_once('.').map_or(unscoped, |(_, tail)| tail)
}

impl Storage {
    /// Cross-file resolution pass: for every `Calls` edge whose `dst` is a
    /// bare name (not an existing symbol id) AND the name matches *exactly
    /// one* `Symbol.name` in the same repo, rewrite `dst` to that id.
    ///
    /// Returns the number of edges rewritten. Confidence on rewritten edges
    /// stays at 0.5 (still ambiguous compared to in-file resolution which
    /// gets 1.0 at extract time).
    pub fn resolve_cross_file_calls(&self) -> Result<usize> {
        // Step 1: load only unresolved Calls (dst not in Symbol.id) — server-side filter.
        let calls = self
            .db
            .run_script(
                "?[src, dst, confidence] := *Calls{src, dst, confidence}, \
                 not *Symbol{id: dst}",
                BTreeMap::new(),
                ScriptMutability::Immutable,
            )
            .map_err(map_err)?;

        // Step 2: derive distinct needles (full dst + leaf segment).
        let mut needles: std::collections::HashSet<String> = std::collections::HashSet::new();
        for r in &calls.rows {
            if let Some(DataValue::Str(dst)) = r.get(1) {
                let d = dst.as_str();
                needles.insert(d.to_string());
                let leaf = leaf_of(d);
                if leaf != d {
                    needles.insert(leaf.to_string());
                }
            }
        }

        // Step 3: targeted Symbol load via bound `needle_set[n]` relation.
        let mut by_name: std::collections::HashMap<String, Vec<String>> =
            std::collections::HashMap::new();
        let mut qnames: Vec<(String, String)> = Vec::new();
        if !needles.is_empty() {
            let needle_rows: Vec<DataValue> = needles
                .iter()
                .map(|n| DataValue::List(vec![DataValue::from(n.as_str())]))
                .collect();
            let mut params = BTreeMap::new();
            params.insert("needle_rows".into(), DataValue::List(needle_rows));
            let rows = self
                .db
                .run_script(
                    "needle_set[n] <- $needle_rows\n\
                     ?[name, qname, id] := *Symbol{name, qname, id}, needle_set[name]",
                    params,
                    ScriptMutability::Immutable,
                )
                .map_err(map_err)?;
            qnames.reserve(rows.rows.len());
            for r in rows.rows {
                if let (
                    Some(DataValue::Str(n)),
                    Some(DataValue::Str(q)),
                    Some(DataValue::Str(i)),
                ) = (r.first(), r.get(1), r.get(2))
                {
                    by_name
                        .entry(n.to_string())
                        .or_default()
                        .push(i.to_string());
                    qnames.push((strip_generics(q.as_str()), i.to_string()));
                }
            }
        }

        let mut rewrites: Vec<(String, String, String, f64)> = Vec::new();
        for r in calls.rows {
            let (Some(DataValue::Str(src)), Some(DataValue::Str(dst)), Some(DataValue::Num(c))) =
                (r.first(), r.get(1), r.get(2))
            else {
                continue;
            };
            let dst_s = dst.to_string();

            // Three-tier resolution, all requiring a UNIQUE match:
            //   1. Full path matches a Symbol.name verbatim.
            //   2. Some Symbol.qname ends with the dst path — covers
            //      `commands::index::run` even when leaf `run` collides.
            //   3. Leaf segment matches a Symbol.name uniquely.
            let candidate_id = by_name
                .get(&dst_s)
                .filter(|v| v.len() == 1)
                .map(|v| v[0].clone())
                .or_else(|| {
                    // Progressive-suffix qname match. For a path call
                    // `a::b::c::d`, try suffix matches starting from the
                    // longest. First suffix yielding a unique candidate wins.
                    let segments: Vec<&str> =
                        dst_s.split(['.', ':']).filter(|s| !s.is_empty()).collect();
                    if segments.len() < 2 {
                        return None;
                    }
                    let max_skip = segments.len().saturating_sub(2);
                    for skip in 0..=max_skip {
                        let suffix_segs = &segments[skip..];
                        let suffix_dot = suffix_segs.join(".");
                        let suffix_colon = suffix_segs.join("::");
                        // CR-M-2: pre-compute the four `endswith` needles OUTSIDE
                        // the qnames filter. Inside, original code allocated
                        // `format!("::{n}")` per qname — O(unresolved × qnames).
                        let dot_dot = format!(".{suffix_dot}");
                        let dot_colon = format!(".{suffix_colon}");
                        let colon_dot = format!("::{suffix_dot}");
                        let colon_colon = format!("::{suffix_colon}");
                        let mut hits: Vec<&String> = qnames
                            .iter()
                            .filter(|(q, _)| {
                                q.as_str() == suffix_dot.as_str()
                                    || q.as_str() == suffix_colon.as_str()
                                    || q.ends_with(&dot_dot)
                                    || q.ends_with(&dot_colon)
                                    || q.ends_with(&colon_dot)
                                    || q.ends_with(&colon_colon)
                            })
                            .map(|(_, id)| id)
                            .collect();
                        hits.sort();
                        hits.dedup();
                        if hits.len() == 1 {
                            return Some(hits[0].clone());
                        }
                        // hits.len() > 1 → ambiguous; try narrower suffix (skip+1).
                    }
                    None
                })
                .or_else(|| {
                    let leaf = leaf_of(&dst_s);
                    if leaf == dst_s {
                        return None;
                    }
                    by_name
                        .get(leaf)
                        .filter(|v| v.len() == 1)
                        .map(|v| v[0].clone())
                });

            if let Some(target) = candidate_id {
                let conf = match c {
                    cozo::Num::Float(f) => *f,
                    cozo::Num::Int(i) => *i as f64,
                };
                rewrites.push((src.to_string(), dst_s, target.clone(), conf));
            }
        }

        if rewrites.is_empty() {
            return Ok(0);
        }

        // Apply: delete old, insert rewritten. Cozo has no UPDATE so we use :rm + :put.
        // RESOLVE-C3: two separate `{...}` blocks in one imperative program so
        // both are committed in a single SessionTx.commit_tx() — atomic.
        // A single `{ stmt1 ; stmt2 }` block with two `?` heads is rejected by
        // Cozo 0.7.13 with "cannot have multiple definitions since it contains
        // non-Horn clauses". Two blocks avoid this restriction.
        let n = rewrites.len();
        for chunk in rewrites.chunks(500) {
            let rm_rows: Vec<DataValue> = chunk
                .iter()
                .map(|(s, d, _, _)| {
                    DataValue::List(vec![
                        DataValue::from(s.as_str()),
                        DataValue::from(d.as_str()),
                    ])
                })
                .collect();
            let put_rows: Vec<DataValue> = chunk
                .iter()
                .map(|(s, _, new_d, c)| {
                    DataValue::List(vec![
                        DataValue::from(s.as_str()),
                        DataValue::from(new_d.as_str()),
                        DataValue::from(*c),
                    ])
                })
                .collect();
            // Pack both row sets into a single params map and execute two
            // separate blocks so each `?` head is unambiguous.
            let mut p = BTreeMap::new();
            p.insert("rm_rows".into(), DataValue::List(rm_rows));
            p.insert("put_rows".into(), DataValue::List(put_rows));
            self.db
                .run_script(
                    "{ ?[src, dst] <- $rm_rows :rm Calls {src, dst} } \
                     { ?[src, dst, confidence] <- $put_rows :put Calls {src, dst => confidence} }",
                    p,
                    ScriptMutability::Mutable,
                )
                .map_err(map_err)?;
        }
        Ok(n)
    }

    /// Emit `Tests` edges from test functions to the symbol they exercise.
    ///
    /// Only `Symbol` rows of kind `function` or `method` are considered. A
    /// symbol whose `name` begins with `test_` is treated as a test; the
    /// remainder of the name is the target name. An edge `test → target` is
    /// written only when exactly one considered symbol carries that target
    /// name. Ambiguous and unmatched tests are skipped and logged at trace
    /// level.
    ///
    /// Returns the number of `Tests` edges written.
    pub fn detect_test_edges(&self) -> Result<usize> {
        let symbols = self
            .db
            .run_script(
                "?[id, name, kind] := *Symbol{id, name, kind}, \
                 (kind = \"function\" or kind = \"method\")",
                BTreeMap::new(),
                ScriptMutability::Immutable,
            )
            .map_err(map_err)?;

        // One pass over the rows: index every symbol by name and remember
        // each test together with its prefix-stripped target name.
        let mut ids_by_name: std::collections::HashMap<String, Vec<String>> =
            std::collections::HashMap::new();
        let mut pending: Vec<(String, String)> = Vec::new(); // (test_id, target_name)
        for row in &symbols.rows {
            if let (Some(DataValue::Str(id)), Some(DataValue::Str(name))) =
                (row.first(), row.get(1))
            {
                ids_by_name
                    .entry(name.to_string())
                    .or_default()
                    .push(id.to_string());
                if let Some(bare) = name.as_str().strip_prefix("test_") {
                    pending.push((id.to_string(), bare.to_string()));
                }
            }
        }

        // Keep only tests whose target name maps to a single symbol id and
        // build the `[src, dst]` rows for the write directly.
        let mut edge_rows: Vec<DataValue> = Vec::new();
        for (test_id, target) in pending {
            let Some(candidates) = ids_by_name.get(&target) else {
                tracing::trace!(
                    "test edge skipped (no target): {test_id} prefix-stripped to {target} \
                     but no Symbol of that name exists"
                );
                continue;
            };
            if let [only] = candidates.as_slice() {
                edge_rows.push(DataValue::List(vec![
                    DataValue::from(test_id.as_str()),
                    DataValue::from(only.as_str()),
                ]));
            } else {
                tracing::trace!(
                    "test edge skipped (ambiguous): {test_id} → {target} matched {} candidates",
                    candidates.len()
                );
            }
        }

        let edge_count = edge_rows.len();
        if edge_count == 0 {
            return Ok(0);
        }

        let mut params = BTreeMap::new();
        params.insert("rows".into(), DataValue::List(edge_rows));
        self.db
            .run_script(
                "?[src, dst] <- $rows :put Tests {src, dst}",
                params,
                ScriptMutability::Mutable,
            )
            .map_err(map_err)?;
        Ok(edge_count)
    }
}