use anyhow::{anyhow, Result};
use arrow::array::{Array, Int32Array, RecordBatch, StringArray};
use std::collections::{HashMap, HashSet};
use std::sync::OnceLock;
use znippy_zoomies::stree32::STree32;
use super::symbols::{CallEdgeRow, SymbolRow};
use crate::warehouse::iceberg::{
discover_latest_snapshot, read_snapshot_batches, IcebergWarehouse, TABLE_CALL_EDGES,
TABLE_SYMBOL_FACTS,
};
fn last_seg(s: &str) -> &str {
s.rsplit("::").next().unwrap_or(s)
}
#[inline]
fn fnv1a32(s: &str) -> u32 {
let mut h: u32 = 0x811c_9dc5;
for &b in s.as_bytes() {
h ^= b as u32;
h = h.wrapping_mul(0x0100_0193);
}
if h == u32::MAX { u32::MAX - 1 } else { h }
}
struct StreeIndex {
sorted_hashes: Vec<u32>,
bucket_off: Vec<u32>,
entry_keys: Vec<String>,
entry_post: Vec<Vec<usize>>,
tree: Option<STree32>,
}
impl std::fmt::Debug for StreeIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("StreeIndex")
.field("keys", &self.entry_keys.len())
.field("buckets", &self.sorted_hashes.len())
.field("built", &self.tree.is_some())
.finish()
}
}
impl StreeIndex {
fn build(keys: &HashMap<String, Vec<usize>>) -> Self {
let mut by_hash: HashMap<u32, Vec<&String>> = HashMap::new();
for k in keys.keys() {
by_hash.entry(fnv1a32(k)).or_default().push(k);
}
let mut sorted_hashes: Vec<u32> = by_hash.keys().copied().collect();
sorted_hashes.sort_unstable();
let mut bucket_off = Vec::with_capacity(sorted_hashes.len() + 1);
let mut entry_keys = Vec::with_capacity(keys.len());
let mut entry_post = Vec::with_capacity(keys.len());
bucket_off.push(0u32);
for h in &sorted_hashes {
let mut bucket = by_hash.remove(h).unwrap();
bucket.sort_unstable();
for k in bucket {
entry_keys.push(k.clone());
entry_post.push(keys[k].clone());
}
bucket_off.push(entry_keys.len() as u32);
}
let tree = (!sorted_hashes.is_empty()).then(|| STree32::new(&sorted_hashes));
StreeIndex { sorted_hashes, bucket_off, entry_keys, entry_post, tree }
}
#[inline]
fn resolve(&self, bucket: usize, name: &str) -> Option<&Vec<usize>> {
let lo = self.bucket_off[bucket] as usize;
let hi = self.bucket_off[bucket + 1] as usize;
(lo..hi).find(|&e| self.entry_keys[e] == name).map(|e| &self.entry_post[e])
}
#[cfg(test)]
#[inline]
fn get(&self, name: &str) -> Option<&Vec<usize>> {
let tree = self.tree.as_ref()?;
let bucket = tree.find_exact(fnv1a32(name))?;
self.resolve(bucket, name)
}
fn get_batch(&self, names: &[&str]) -> Vec<Option<&Vec<usize>>> {
if self.tree.is_none() {
return vec![None; names.len()];
}
let tree = self.tree.as_ref().unwrap();
let hashes: Vec<u32> = names.iter().map(|n| fnv1a32(n)).collect();
let routed = tree.batch_stream::<16>(&hashes);
names
.iter()
.zip(routed)
.map(|(name, hit)| hit.and_then(|bucket| self.resolve(bucket, name)))
.collect()
}
}
#[derive(Debug, Default)]
struct CallIndex {
callee_keys: HashMap<String, Vec<usize>>,
caller_keys: HashMap<String, Vec<usize>>,
adj: HashMap<String, Vec<String>>,
nodes: HashSet<String>,
callee_stree: OnceLock<StreeIndex>,
caller_stree: OnceLock<StreeIndex>,
}
impl CallIndex {
fn callee_stree(&self) -> &StreeIndex {
self.callee_stree.get_or_init(|| StreeIndex::build(&self.callee_keys))
}
fn caller_stree(&self) -> &StreeIndex {
self.caller_stree.get_or_init(|| StreeIndex::build(&self.caller_keys))
}
}
impl CallIndex {
fn index_suffixes(keys: &mut HashMap<String, Vec<usize>>, path: &str, i: usize) {
keys.entry(path.to_string()).or_default().push(i);
let mut rest = path;
while let Some(pos) = rest.find("::") {
rest = &rest[pos + 2..];
keys.entry(rest.to_string()).or_default().push(i);
}
}
fn build(calls: &[CallEdgeRow]) -> Self {
let mut idx = CallIndex::default();
for (i, e) in calls.iter().enumerate() {
Self::index_suffixes(&mut idx.callee_keys, &e.callee_ident, i);
Self::index_suffixes(&mut idx.caller_keys, &e.caller_path, i);
let f = last_seg(&e.caller_path);
let t = last_seg(&e.callee_ident);
idx.adj.entry(f.to_string()).or_default().push(t.to_string());
idx.nodes.insert(f.to_string());
idx.nodes.insert(t.to_string());
}
idx
}
}
pub struct KnowledgeView {
pub symbols: Vec<SymbolRow>,
pub calls: Vec<CallEdgeRow>,
index: OnceLock<CallIndex>,
}
impl KnowledgeView {
pub fn new(symbols: Vec<SymbolRow>, calls: Vec<CallEdgeRow>) -> Self {
KnowledgeView { symbols, calls, index: OnceLock::new() }
}
fn index(&self) -> &CallIndex {
self.index.get_or_init(|| CallIndex::build(&self.calls))
}
}
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub(crate) struct ScanAudit {
pub discovery_rows: usize,
pub data_rows: usize,
}
fn col<'a, T: 'static>(batch: &'a RecordBatch, name: &str) -> Result<&'a T> {
batch
.column_by_name(name)
.ok_or_else(|| anyhow!("missing column `{name}`"))?
.as_any()
.downcast_ref::<T>()
.ok_or_else(|| anyhow!("column `{name}` has unexpected type"))
}
pub fn load_latest(wh: &IcebergWarehouse, repo: &str) -> Result<KnowledgeView> {
Ok(load_latest_audited(wh, repo)?.0)
}
pub(crate) fn load_latest_audited(
wh: &IcebergWarehouse,
repo: &str,
) -> Result<(KnowledgeView, ScanAudit)> {
wh.block_on(async {
let Some((snap, discovery_rows)) =
discover_latest_snapshot(wh, TABLE_SYMBOL_FACTS, repo).await?
else {
return Ok((KnowledgeView::new(vec![], vec![]), ScanAudit::default()));
};
let mut data_rows = 0usize;
let s_batches = read_snapshot_batches(wh, TABLE_SYMBOL_FACTS, &snap).await?;
let mut symbols = Vec::new();
for b in &s_batches {
let snaps = col::<StringArray>(b, "snapshot_id")?;
let crate_name = col::<StringArray>(b, "crate_name")?;
let module_path = col::<StringArray>(b, "module_path")?;
let item_kind = col::<StringArray>(b, "item_kind")?;
let item_name = col::<StringArray>(b, "item_name")?;
let visibility = col::<StringArray>(b, "visibility")?;
let file = col::<StringArray>(b, "file")?;
let line = col::<Int32Array>(b, "line")?;
let doc_lines = col::<Int32Array>(b, "doc_lines")?;
let signature = col::<StringArray>(b, "signature")?;
for i in 0..b.num_rows() {
if snaps.value(i) != snap {
continue;
}
data_rows += 1;
let sig = signature.value(i);
symbols.push(SymbolRow {
crate_name: crate_name.value(i).to_string(),
module_path: module_path.value(i).to_string(),
item_kind: item_kind.value(i).to_string(),
item_name: item_name.value(i).to_string(),
visibility: visibility.value(i).to_string(),
file: file.value(i).to_string(),
line: line.value(i).max(0) as u32,
doc_lines: doc_lines.value(i).max(0) as u32,
signature: if sig.is_empty() { None } else { Some(sig.to_string()) },
});
}
}
let c_batches = read_snapshot_batches(wh, TABLE_CALL_EDGES, &snap).await?;
let mut calls = Vec::new();
for b in &c_batches {
let snaps = col::<StringArray>(b, "snapshot_id")?;
let crate_name = col::<StringArray>(b, "crate_name")?;
let caller = col::<StringArray>(b, "caller_path")?;
let callee = col::<StringArray>(b, "callee_ident")?;
let kind = col::<StringArray>(b, "call_kind")?;
let file = col::<StringArray>(b, "file")?;
let line = col::<Int32Array>(b, "line")?;
for i in 0..b.num_rows() {
if snaps.value(i) != snap {
continue;
}
data_rows += 1;
calls.push(CallEdgeRow {
crate_name: crate_name.value(i).to_string(),
caller_path: caller.value(i).to_string(),
callee_ident: callee.value(i).to_string(),
call_kind: kind.value(i).to_string(),
file: file.value(i).to_string(),
line: line.value(i).max(0) as u32,
});
}
}
Ok((KnowledgeView::new(symbols, calls), ScanAudit { discovery_rows, data_rows }))
})
}
#[cfg(feature = "scip")]
pub fn load_latest_scip(
wh: &IcebergWarehouse,
repo: &str,
) -> Result<Option<KnowledgeView>> {
let scan = wh.load_latest_scip(repo)?;
if scan.rows.is_empty() {
return Ok(None);
}
let calls = super::scip::scip_call_edges(&scan);
let symbols = scan.rows.iter().filter(|r| r.is_definition).map(scip_symbol_row).collect();
Ok(Some(KnowledgeView::new(symbols, calls)))
}
#[cfg(feature = "scip")]
fn scip_symbol_row(r: &super::scip::ScipRow) -> SymbolRow {
SymbolRow {
crate_name: String::new(),
module_path: r.symbol.clone(),
item_kind: r.kind.clone(),
item_name: if r.display_name.is_empty() { r.symbol.clone() } else { r.display_name.clone() },
visibility: String::new(),
file: r.file.clone(),
line: r.start_line,
doc_lines: 0,
signature: None,
}
}
pub fn load_preferred_merged(
wh: &IcebergWarehouse,
members: &[String],
) -> Result<(KnowledgeView, &'static str)> {
let mut symbols: Vec<SymbolRow> = Vec::new();
let mut calls: Vec<CallEdgeRow> = Vec::new();
#[allow(unused_mut)]
let mut any_resolved = false;
#[allow(unused_mut)]
let mut resolved_members: std::collections::HashSet<String> = std::collections::HashSet::new();
#[cfg(feature = "scip")]
{
let mut scans = Vec::new();
for m in members {
let scan = wh.load_latest_scip(m)?;
if !scan.rows.is_empty() {
resolved_members.insert(m.clone());
scans.push(scan);
}
}
if !scans.is_empty() {
any_resolved = true;
let refs: Vec<&super::scip::ScipScan> = scans.iter().collect();
let globals = super::scip::global_symbol_table(&refs);
for scan in &scans {
calls.extend(super::scip::scip_call_edges_with(scan, &globals));
symbols.extend(scan.rows.iter().filter(|r| r.is_definition).map(scip_symbol_row));
}
}
}
let mut any_syn = false;
for m in members {
if resolved_members.contains(m) {
continue;
}
let view = load_latest(wh, m)?;
if !view.symbols.is_empty() || !view.calls.is_empty() {
any_syn = true;
symbols.extend(view.symbols);
calls.extend(view.calls);
}
}
let source = if any_resolved {
"resolved/scip"
} else if any_syn {
"syn"
} else {
""
};
Ok((KnowledgeView::new(symbols, calls), source))
}
pub fn load_preferred(wh: &IcebergWarehouse, repo: &str) -> Result<(KnowledgeView, &'static str)> {
#[cfg(feature = "scip")]
{
if let Some(view) = load_latest_scip(wh, repo)? {
return Ok((view, "resolved/scip"));
}
}
Ok((load_latest(wh, repo)?, "syn"))
}
impl KnowledgeView {
pub fn symbol_lookup(&self, pattern: &str, limit: usize) -> Vec<&SymbolRow> {
let p = pattern.to_lowercase();
self.symbols
.iter()
.filter(|s| s.item_name.to_lowercase().contains(&p))
.take(limit)
.collect()
}
pub fn defined_in(&self, suffix: &str) -> Vec<&SymbolRow> {
self.symbols.iter().filter(|s| s.file.ends_with(suffix)).collect()
}
pub fn callers_of(&self, name: &str) -> Vec<&CallEdgeRow> {
match self.index().callee_keys.get(name) {
Some(hits) => hits.iter().map(|&i| &self.calls[i]).collect(),
None => Vec::new(),
}
}
pub fn callees_of(&self, name: &str) -> Vec<&CallEdgeRow> {
match self.index().caller_keys.get(name) {
Some(hits) => hits.iter().map(|&i| &self.calls[i]).collect(),
None => Vec::new(),
}
}
pub fn callers_of_batch(&self, names: &[&str]) -> Vec<Vec<&CallEdgeRow>> {
let stree = self.index().callee_stree();
if stree.tree.is_none() {
return names.iter().map(|n| self.callers_of(n)).collect();
}
stree
.get_batch(names)
.into_iter()
.map(|hit| match hit {
Some(post) => post.iter().map(|&i| &self.calls[i]).collect(),
None => Vec::new(),
})
.collect()
}
pub fn callees_of_batch(&self, names: &[&str]) -> Vec<Vec<&CallEdgeRow>> {
let stree = self.index().caller_stree();
if stree.tree.is_none() {
return names.iter().map(|n| self.callees_of(n)).collect();
}
stree
.get_batch(names)
.into_iter()
.map(|hit| match hit {
Some(post) => post.iter().map(|&i| &self.calls[i]).collect(),
None => Vec::new(),
})
.collect()
}
pub fn call_path(&self, from: &str, to: &str) -> Option<Vec<String>> {
use std::collections::VecDeque;
let from = last_seg(from).to_string();
let to = last_seg(to).to_string();
let idx = self.index();
let adj = &idx.adj;
let nodes = &idx.nodes;
if from == to {
return nodes.contains(from.as_str()).then(|| vec![from]);
}
if !nodes.contains(from.as_str()) {
return None;
}
let mut parent: HashMap<String, String> = HashMap::new();
let mut seen: HashSet<String> = HashSet::new();
let mut queue: VecDeque<String> = VecDeque::new();
seen.insert(from.clone());
queue.push_back(from.clone());
while let Some(cur) = queue.pop_front() {
let Some(callees) = adj.get(cur.as_str()) else { continue };
for c in callees {
let c = c.as_str();
if !seen.insert(c.to_string()) {
continue;
}
parent.insert(c.to_string(), cur.clone());
if c == to {
let mut path = vec![to.clone()];
let mut node = to.clone();
while let Some(p) = parent.get(&node) {
path.push(p.clone());
node = p.clone();
}
path.reverse();
return Some(path);
}
queue.push_back(c.to_string());
}
}
None
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::knowledge::symbols::CallEdgeRow;
fn edge(callee: &str) -> CallEdgeRow {
CallEdgeRow {
crate_name: "demo".into(),
caller_path: "demo::f".into(),
callee_ident: callee.into(),
call_kind: "call".into(),
file: "src/lib.rs".into(),
line: 1,
}
}
fn edge_from(caller: &str, callee: &str) -> CallEdgeRow {
CallEdgeRow {
crate_name: "demo".into(),
caller_path: caller.into(),
callee_ident: callee.into(),
call_kind: "call".into(),
file: "src/lib.rs".into(),
line: 1,
}
}
#[test]
fn load_latest_snapshot_pushdown_is_byte_identical_and_scans_fewer_rows() {
use arrow::array::TimestampMicrosecondArray;
use chrono::Duration;
use iceberg::Catalog;
use crate::knowledge::symbols::{SymbolRow, SymbolScan};
use crate::warehouse::iceberg::IcebergWarehouse;
fn sym(name: &str) -> SymbolRow {
SymbolRow {
crate_name: "demo".into(),
module_path: format!("demo::{name}"),
item_kind: "fn".into(),
item_name: name.into(),
visibility: "pub".into(),
file: "src/lib.rs".into(),
line: 1,
doc_lines: 0,
signature: Some(format!("fn {name}()")),
}
}
fn call(caller: &str, callee: &str) -> CallEdgeRow {
CallEdgeRow {
crate_name: "demo".into(),
caller_path: format!("demo::{caller}"),
callee_ident: callee.into(),
call_kind: "call".into(),
file: "src/lib.rs".into(),
line: 2,
}
}
let dir = tempfile::tempdir().unwrap();
let wh = IcebergWarehouse::open(dir.path()).unwrap();
let base = chrono::Utc::now();
let append = |repo: &str, secs: i64, symbols: Vec<SymbolRow>, calls: Vec<CallEdgeRow>| {
let scan = SymbolScan {
snapshot_id: uuid::Uuid::new_v4(),
ts: base + Duration::seconds(secs),
repo: repo.to_string(),
symbols,
calls,
features: vec![],
tests: vec![],
};
wh.append_symbol_scan(&scan).unwrap();
};
append("demo", 0, vec![sym("a1")], vec![call("a1", "new")]);
append("demo", 1, vec![sym("a2"), sym("b2")], vec![call("a2", "push")]);
append(
"demo",
2,
vec![sym("a3"), sym("b3"), sym("c3")],
vec![call("a3", "read"), call("b3", "write")],
);
append("other", 9, vec![sym("o1")], vec![call("o1", "leak")]);
let (old_view, old_rows, old_snap) = wh
.block_on(async {
let st = wh.catalog().load_table(&wh.table_ident(TABLE_SYMBOL_FACTS)).await?;
let s_batches =
skade::read_filtered(&st, &skade::ScanFilter::eq("repo", "demo"), &[]).await?;
let mut latest: Option<(String, i64)> = None;
for b in &s_batches {
let snaps = col::<StringArray>(b, "snapshot_id")?;
let ts = col::<TimestampMicrosecondArray>(b, "ts_micros")?;
for i in 0..b.num_rows() {
let t = ts.value(i);
if latest.as_ref().map(|(_, lt)| t > *lt).unwrap_or(true) {
latest = Some((snaps.value(i).to_string(), t));
}
}
}
let (snap, _) = latest.unwrap();
let mut rows = 0usize;
let mut symbols = Vec::new();
for b in &s_batches {
rows += b.num_rows();
let snaps = col::<StringArray>(b, "snapshot_id")?;
let ck = col::<StringArray>(b, "crate_name")?;
let mp = col::<StringArray>(b, "module_path")?;
let ik = col::<StringArray>(b, "item_kind")?;
let it = col::<StringArray>(b, "item_name")?;
let vis = col::<StringArray>(b, "visibility")?;
let file = col::<StringArray>(b, "file")?;
let line = col::<Int32Array>(b, "line")?;
let doc = col::<Int32Array>(b, "doc_lines")?;
let sig = col::<StringArray>(b, "signature")?;
for i in 0..b.num_rows() {
if snaps.value(i) != snap {
continue;
}
let s = sig.value(i);
symbols.push(SymbolRow {
crate_name: ck.value(i).to_string(),
module_path: mp.value(i).to_string(),
item_kind: ik.value(i).to_string(),
item_name: it.value(i).to_string(),
visibility: vis.value(i).to_string(),
file: file.value(i).to_string(),
line: line.value(i).max(0) as u32,
doc_lines: doc.value(i).max(0) as u32,
signature: if s.is_empty() { None } else { Some(s.to_string()) },
});
}
}
let ct = wh.catalog().load_table(&wh.table_ident(TABLE_CALL_EDGES)).await?;
let c_batches =
skade::read_filtered(&ct, &skade::ScanFilter::eq("repo", "demo"), &[]).await?;
let mut calls = Vec::new();
for b in &c_batches {
rows += b.num_rows();
let snaps = col::<StringArray>(b, "snapshot_id")?;
let ck = col::<StringArray>(b, "crate_name")?;
let caller = col::<StringArray>(b, "caller_path")?;
let callee = col::<StringArray>(b, "callee_ident")?;
let kind = col::<StringArray>(b, "call_kind")?;
let file = col::<StringArray>(b, "file")?;
let line = col::<Int32Array>(b, "line")?;
for i in 0..b.num_rows() {
if snaps.value(i) != snap {
continue;
}
calls.push(CallEdgeRow {
crate_name: ck.value(i).to_string(),
caller_path: caller.value(i).to_string(),
callee_ident: callee.value(i).to_string(),
call_kind: kind.value(i).to_string(),
file: file.value(i).to_string(),
line: line.value(i).max(0) as u32,
});
}
}
anyhow::Ok((KnowledgeView::new(symbols, calls), rows, snap))
})
.unwrap();
let (new_view, audit) = load_latest_audited(&wh, "demo").unwrap();
let ser = |v: &KnowledgeView| {
(serde_json::to_value(&v.symbols).unwrap(), serde_json::to_value(&v.calls).unwrap())
};
assert_eq!(ser(&new_view), ser(&old_view), "NEW must equal OLD byte-for-byte");
assert_eq!(new_view.symbols.len(), 3);
assert_eq!(new_view.calls.len(), 2);
assert_eq!(
new_view.symbols.iter().map(|s| s.item_name.as_str()).collect::<Vec<_>>(),
vec!["a3", "b3", "c3"],
);
assert_eq!(old_rows, 10, "old full-history scan touches every snapshot");
assert_eq!(audit.data_rows, 5, "new decodes only the latest snapshot");
assert!(
audit.data_rows < old_rows,
"snapshot pushdown must decode fewer full-width rows: new={} old={}",
audit.data_rows,
old_rows,
);
let (repo_rows, snap_rows) = wh
.block_on(async {
let st = wh.catalog().load_table(&wh.table_ident(TABLE_SYMBOL_FACTS)).await?;
let repo_plan =
skade::plan_stats(&st, Some(&skade::ScanFilter::eq("repo", "demo"))).await?;
let snap_plan = skade::plan_stats(
&st,
Some(&skade::ScanFilter::eq("snapshot_id", old_snap.clone())),
)
.await?;
anyhow::Ok((repo_plan.rows_planned, snap_plan.rows_planned))
})
.unwrap();
assert_eq!(repo_rows, 6, "repo predicate plans the whole demo history");
assert_eq!(snap_rows, 3, "snapshot predicate prunes to one snapshot's file");
assert!(snap_rows < repo_rows);
}
#[test]
fn callers_of_matches_last_segment_and_bare() {
let view = KnowledgeView::new(
vec![],
vec![
edge("new"), edge("Arc::new"), edge("Foo::new"), edge("renew"), edge("Foo::make"), ],
);
let hits: Vec<&str> = view.callers_of("new").iter().map(|c| c.callee_ident.as_str()).collect();
assert!(hits.contains(&"new"));
assert!(hits.contains(&"Arc::new"));
assert!(hits.contains(&"Foo::new"));
assert!(!hits.contains(&"renew"), "{hits:?}");
assert!(!hits.contains(&"Foo::make"));
assert_eq!(hits.len(), 3);
let exact: Vec<&str> = view.callers_of("Arc::new").iter().map(|c| c.callee_ident.as_str()).collect();
assert_eq!(exact, vec!["Arc::new"]);
}
#[test]
fn call_path_bfs_over_call_edges() {
let view = KnowledgeView::new(
vec![],
vec![
edge_from("a::run", "step"),
edge_from("b::step", "Repo::commit"),
edge_from("a::run", "noop"),
],
);
let p = view.call_path("run", "commit").expect("path exists");
assert_eq!(p, vec!["run", "step", "commit"]);
let p2 = view.call_path("a::run", "Repo::commit").expect("path exists");
assert_eq!(p2, vec!["run", "step", "commit"]);
assert_eq!(view.call_path("step", "step"), Some(vec!["step".to_string()]));
assert_eq!(view.call_path("commit", "run"), None);
assert_eq!(view.call_path("ghost", "run"), None);
}
#[cfg(feature = "scip")]
#[test]
fn load_latest_scip_builds_resolved_view() {
use crate::knowledge::scip::{ingest_index, ScipScan};
use crate::warehouse::iceberg::IcebergWarehouse;
use scip::types::{symbol_information, Document, Index, Occurrence, SymbolInformation, SymbolRole};
let mut idx = Index::new();
let mut doc = Document::new();
doc.relative_path = "src/lib.rs".into();
let mut outer_si = SymbolInformation::new();
outer_si.symbol = "rust-analyzer cargo demo 0.1.0 outer().".into();
outer_si.display_name = "outer".into();
outer_si.kind = symbol_information::Kind::Function.into();
doc.symbols.push(outer_si.clone());
let mut outer_def = Occurrence::new();
outer_def.range = vec![10, 3, 10, 8];
outer_def.enclosing_range = vec![10, 0, 20, 1];
outer_def.symbol = outer_si.symbol.clone();
outer_def.symbol_roles = SymbolRole::Definition as i32;
doc.occurrences.push(outer_def);
let mut inner_si = SymbolInformation::new();
inner_si.symbol = "rust-analyzer cargo demo 0.1.0 inner().".into();
inner_si.display_name = "inner".into();
inner_si.kind = symbol_information::Kind::Function.into();
doc.symbols.push(inner_si.clone());
let mut inner_def = Occurrence::new();
inner_def.range = vec![30, 3, 30, 8];
inner_def.enclosing_range = vec![30, 0, 34, 1];
inner_def.symbol = inner_si.symbol.clone();
inner_def.symbol_roles = SymbolRole::Definition as i32;
doc.occurrences.push(inner_def);
let mut ref_inner = Occurrence::new();
ref_inner.range = vec![13, 8, 13, 13];
ref_inner.symbol = inner_si.symbol.clone();
doc.occurrences.push(ref_inner);
idx.documents.push(doc);
let scan: ScipScan = ingest_index(idx, "demo", "deadbeefsha", uuid::Uuid::new_v4(), chrono::Utc::now());
let dir = tempfile::tempdir().unwrap();
let wh = IcebergWarehouse::open(dir.path()).unwrap();
wh.append_scip_scan(&scan).unwrap();
let view = load_latest_scip(&wh, "demo").unwrap().expect("scip rows present");
let callers: Vec<&str> = view.callers_of("inner").iter().map(|c| c.caller_path.as_str()).collect();
assert_eq!(callers, vec!["outer"], "resolved caller via containment");
assert_eq!(view.call_path("outer", "inner"), Some(vec!["outer".to_string(), "inner".to_string()]));
assert!(load_latest_scip(&wh, "other").unwrap().is_none());
}
#[cfg(feature = "scip")]
#[test]
fn load_preferred_merged_resolves_cross_binary() {
use crate::knowledge::scip::{ingest_index, ScipScan};
use crate::warehouse::iceberg::IcebergWarehouse;
use scip::types::{symbol_information, Document, Index, Occurrence, SymbolInformation, SymbolRole};
let mut bidx = Index::new();
let mut bdoc = Document::new();
bdoc.relative_path = "src/main.rs".into();
let mut main_si = SymbolInformation::new();
main_si.symbol = "rust-analyzer cargo demo_bin 0.1.0 main().".into();
main_si.display_name = "main".into();
main_si.kind = symbol_information::Kind::Function.into();
bdoc.symbols.push(main_si.clone());
let mut main_def = Occurrence::new();
main_def.range = vec![10, 3, 10, 7];
main_def.enclosing_range = vec![10, 0, 20, 1];
main_def.symbol = main_si.symbol.clone();
main_def.symbol_roles = SymbolRole::Definition as i32;
bdoc.occurrences.push(main_def);
let mut ref_helper = Occurrence::new();
ref_helper.range = vec![13, 8, 13, 14];
ref_helper.symbol = "rust-analyzer cargo demo_lib 0.1.0 helper().".into();
bdoc.occurrences.push(ref_helper);
bidx.documents.push(bdoc);
let bin: ScipScan = ingest_index(bidx, "demo_bin", "binsha", uuid::Uuid::new_v4(), chrono::Utc::now());
let mut lidx = Index::new();
let mut ldoc = Document::new();
ldoc.relative_path = "src/lib.rs".into();
let mut helper_si = SymbolInformation::new();
helper_si.symbol = "rust-analyzer cargo demo_lib 0.1.0 helper().".into();
helper_si.display_name = "helper".into();
helper_si.kind = symbol_information::Kind::Function.into();
ldoc.symbols.push(helper_si.clone());
let mut helper_def = Occurrence::new();
helper_def.range = vec![5, 7, 5, 13];
helper_def.enclosing_range = vec![5, 0, 9, 1];
helper_def.symbol = helper_si.symbol.clone();
helper_def.symbol_roles = SymbolRole::Definition as i32;
ldoc.occurrences.push(helper_def);
lidx.documents.push(ldoc);
let lib: ScipScan = ingest_index(lidx, "demo_lib", "libsha", uuid::Uuid::new_v4(), chrono::Utc::now());
let dir = tempfile::tempdir().unwrap();
let wh = IcebergWarehouse::open(dir.path()).unwrap();
wh.append_scip_scan(&bin).unwrap();
wh.append_scip_scan(&lib).unwrap();
let (solo, _src) = load_preferred(&wh, "demo_bin").unwrap();
assert!(
solo.callers_of("helper").is_empty(),
"single-member view must not resolve the cross-binary call: {:?}",
solo.calls
);
let members = vec!["demo_bin".to_string(), "demo_lib".to_string()];
let (merged, source) = load_preferred_merged(&wh, &members).unwrap();
assert_eq!(source, "resolved/scip");
let callers: Vec<&str> =
merged.callers_of("helper").iter().map(|c| c.caller_path.as_str()).collect();
assert_eq!(callers, vec!["main"], "cross-binary edge resolved via moniker join");
assert_eq!(
merged.call_path("main", "helper"),
Some(vec!["main".to_string(), "helper".to_string()]),
);
}
fn linear_callers_of<'a>(calls: &'a [CallEdgeRow], name: &str) -> Vec<&'a CallEdgeRow> {
let suffix = format!("::{name}");
calls
.iter()
.filter(|c| c.callee_ident == name || c.callee_ident.ends_with(&suffix))
.collect()
}
fn linear_callees_of<'a>(calls: &'a [CallEdgeRow], name: &str) -> Vec<&'a CallEdgeRow> {
calls
.iter()
.filter(|c| c.caller_path == name || c.caller_path.ends_with(&format!("::{name}")))
.collect()
}
fn linear_call_path(calls: &[CallEdgeRow], from: &str, to: &str) -> Option<Vec<String>> {
use std::collections::{BTreeMap, BTreeSet, VecDeque};
fn last_seg(s: &str) -> &str {
s.rsplit("::").next().unwrap_or(s)
}
let from = last_seg(from).to_string();
let to = last_seg(to).to_string();
let mut adj: BTreeMap<&str, Vec<&str>> = BTreeMap::new();
let mut nodes: BTreeSet<&str> = BTreeSet::new();
for e in calls {
let f = last_seg(&e.caller_path);
let t = last_seg(&e.callee_ident);
adj.entry(f).or_default().push(t);
nodes.insert(f);
nodes.insert(t);
}
if from == to {
return nodes.contains(from.as_str()).then(|| vec![from]);
}
if !nodes.contains(from.as_str()) {
return None;
}
let mut parent: BTreeMap<String, String> = BTreeMap::new();
let mut seen: BTreeSet<String> = BTreeSet::new();
let mut queue: VecDeque<String> = VecDeque::new();
seen.insert(from.clone());
queue.push_back(from.clone());
while let Some(cur) = queue.pop_front() {
let Some(callees) = adj.get(cur.as_str()) else { continue };
for &c in callees {
if !seen.insert(c.to_string()) {
continue;
}
parent.insert(c.to_string(), cur.clone());
if c == to {
let mut path = vec![to.clone()];
let mut node = to.clone();
while let Some(p) = parent.get(&node) {
path.push(p.clone());
node = p.clone();
}
path.reverse();
return Some(path);
}
queue.push_back(c.to_string());
}
}
None
}
fn rich_fixture() -> Vec<CallEdgeRow> {
vec![
edge_from("a::run", "step"), edge_from("mod::a::run", "Arc::new"), edge_from("b::step", "Repo::commit"), edge_from("x::y::run", "new"), edge_from("run", "Foo::new"), edge_from("c::renew", "renew"), edge_from("p::q::step", "a::b::new"), edge_from("k::Arc::new", "Arc::new"), edge_from("z::run", "noop"), edge_from("b::step", "x::y::new"), ]
}
fn ptrs(rows: Vec<&CallEdgeRow>) -> Vec<*const CallEdgeRow> {
rows.into_iter().map(|r| r as *const CallEdgeRow).collect()
}
fn ptrs_v(rows: &[&CallEdgeRow]) -> Vec<*const CallEdgeRow> {
rows.iter().map(|&r| r as *const CallEdgeRow).collect()
}
#[test]
fn indexed_lookups_match_linear_scan_exactly() {
let view = KnowledgeView::new(vec![], rich_fixture());
let names = [
"new", "run", "step", "Arc::new", "commit", "Repo::commit", "renew", "noop",
"a::b::new", "b::new", "y::new", "x::y::new", "Foo::new", "ghost", "",
];
for name in names {
assert_eq!(
ptrs(view.callers_of(name)),
ptrs(linear_callers_of(&view.calls, name)),
"callers_of({name:?}) must equal the linear scan (set + order)",
);
assert_eq!(
ptrs(view.callees_of(name)),
ptrs(linear_callees_of(&view.calls, name)),
"callees_of({name:?}) must equal the linear scan (set + order)",
);
}
let pairs = [
("run", "new"),
("a::run", "Arc::new"),
("run", "commit"),
("step", "commit"),
("run", "run"),
("step", "step"),
("ghost", "new"),
("commit", "run"),
("run", "noop"),
("step", "new"),
("run", "renew"),
];
for (from, to) in pairs {
assert_eq!(
view.call_path(from, to),
linear_call_path(&view.calls, from, to),
"call_path({from:?}, {to:?}) must equal the linear rebuild",
);
}
}
#[test]
fn callers_of_lookup_is_o_hits_not_o_n() {
fn build(noise: usize) -> KnowledgeView {
let mut calls = Vec::with_capacity(noise + 3);
for i in 0..noise {
calls.push(edge_from(&format!("noise::c{i}"), &format!("noise::g{i}")));
}
calls.push(edge_from("m::one", "target"));
calls.push(edge_from("m::two", "Wrap::target"));
calls.push(edge_from("m::three", "a::b::target"));
KnowledgeView::new(vec![], calls)
}
let small = build(1_000);
let big = build(200_000);
assert_eq!(small.callers_of("target").len(), 3);
assert_eq!(big.callers_of("target").len(), 3);
let small_work = small.index().callee_keys.get("target").map_or(0, Vec::len);
let big_work = big.index().callee_keys.get("target").map_or(0, Vec::len);
assert_eq!(small_work, 3, "posting list holds only the 3 hits at N=1_003");
assert_eq!(big_work, 3, "posting list STILL holds only the 3 hits at N=200_003");
assert_eq!(big_work, small_work, "lookup work independent of N ⇒ O(hits), not O(N)");
assert!(big.calls.len() >= 200_000, "the large fixture really is large");
assert!(big.index().callee_keys.get("absent_symbol").is_none());
assert!(big.callers_of("absent_symbol").is_empty());
}
#[test]
fn index_is_built_once_and_reused() {
let view = KnowledgeView::new(vec![], rich_fixture());
assert!(view.index.get().is_none(), "index is not built eagerly");
let _ = view.callers_of("new");
let built = view.index.get().expect("index materialised on first query");
let first = built as *const CallIndex;
let _ = view.callees_of("run");
let _ = view.call_path("run", "commit");
let again = view.index.get().expect("still present") as *const CallIndex;
assert_eq!(first, again, "same CallIndex reused — built once, not per call");
}
#[test]
fn stree_batched_lookups_match_inverted_index_and_linear_scan() {
let view = KnowledgeView::new(vec![], rich_fixture());
let names = [
"new", "run", "step", "Arc::new", "commit", "Repo::commit", "renew", "noop",
"a::b::new", "b::new", "y::new", "x::y::new", "Foo::new", "ghost", "",
];
let idx = view.index();
let callee_stree = idx.callee_stree();
let caller_stree = idx.caller_stree();
assert!(callee_stree.tree.is_some(), "the callee stree is actually built");
assert!(caller_stree.tree.is_some(), "the caller stree is actually built");
for name in names {
assert_eq!(
callee_stree.get(name),
idx.callee_keys.get(name),
"stree get({name:?}) must equal the callee_keys HashMap (cross-check)",
);
assert_eq!(
caller_stree.get(name),
idx.caller_keys.get(name),
"stree get({name:?}) must equal the caller_keys HashMap (cross-check)",
);
}
let batch_callers = view.callers_of_batch(&names);
let batch_callees = view.callees_of_batch(&names);
assert_eq!(batch_callers.len(), names.len());
assert_eq!(batch_callees.len(), names.len());
for (i, name) in names.iter().enumerate() {
assert_eq!(
ptrs_v(&batch_callers[i]),
ptrs(view.callers_of(name)),
"callers_of_batch[{name:?}] must equal per-name callers_of",
);
assert_eq!(
ptrs_v(&batch_callers[i]),
ptrs(linear_callers_of(&view.calls, name)),
"callers_of_batch[{name:?}] must equal the linear scan oracle",
);
assert_eq!(
ptrs_v(&batch_callees[i]),
ptrs(view.callees_of(name)),
"callees_of_batch[{name:?}] must equal per-name callees_of",
);
assert_eq!(
ptrs_v(&batch_callees[i]),
ptrs(linear_callees_of(&view.calls, name)),
"callees_of_batch[{name:?}] must equal the linear scan oracle",
);
}
assert_eq!(
callee_stree.entry_keys.len(),
idx.callee_keys.len(),
"every inverted-index key is present in the stree",
);
assert!(
callee_stree.sorted_hashes.len() <= callee_stree.entry_keys.len(),
"never more buckets than keys",
);
assert!(
callee_stree.sorted_hashes.windows(2).all(|w| w[0] < w[1]),
"stree keys are strictly ascending + unique (valid S-tree input)",
);
let mut calls = Vec::with_capacity(50_003);
for i in 0..50_000 {
calls.push(edge_from(&format!("noise{i}::caller"), &format!("noise{i}::callee{i}")));
}
calls.push(edge_from("m::one", "target"));
calls.push(edge_from("m::two", "Wrap::target"));
calls.push(edge_from("m::three", "a::b::target"));
let big = KnowledgeView::new(vec![], calls);
let mut qnames: Vec<String> = (0..50_000).map(|i| format!("callee{i}")).collect();
for s in ["target", "Wrap::target", "a::b::target", "absent_symbol", "new"] {
qnames.push(s.to_string());
}
let qrefs: Vec<&str> = qnames.iter().map(String::as_str).collect();
let big_batch = big.callers_of_batch(&qrefs);
assert_eq!(big_batch.len(), qrefs.len());
for (i, name) in qrefs.iter().enumerate() {
assert_eq!(
ptrs_v(&big_batch[i]),
ptrs(big.callers_of(name)),
"large-fixture callers_of_batch[{name:?}] diverged from per-name callers_of",
);
if i % 500 == 0 || i >= 50_000 {
assert_eq!(
ptrs_v(&big_batch[i]),
ptrs(linear_callers_of(&big.calls, name)),
"large-fixture callers_of_batch[{name:?}] diverged from the linear scan",
);
}
}
let ti = qrefs.iter().position(|&n| n == "target").unwrap();
assert_eq!(big_batch[ti].len(), 3, "all 3 `target` callers via the batched stree");
let ai = qrefs.iter().position(|&n| n == "absent_symbol").unwrap();
assert!(big_batch[ai].is_empty(), "an absent key resolves to no edges");
assert!(
big.index().callee_stree().sorted_hashes.len() > 10_000,
"large fixture builds a deep multi-level stree (the no-bench scale stand-in)",
);
}
}