//! Generates `CGX_SKILL.md` and `AGENTS.md` from the indexed code graph, and
//! installs git hooks that keep the index fresh after commits and checkouts.

use std::path::Path;

use crate::graph::{GraphDb, Node};
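/// Precomputed graph metrics consumed by the `CGX_SKILL.md` and `AGENTS.md`
/// templates below.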
pub struct SkillData {
pub indexed_at: String,
pub node_count: u64,
pub function_count: u64,
pub class_count: u64,
pub file_count: u64,
pub edge_count: u64,
pub language_breakdown: String,
pub community_count: u32,
pub top_communities: Vec<CommunityInfo>,
pub hotspots: Vec<Node>,
pub entry_points: Vec<Node>,
pub god_nodes: Vec<Node>,
pub dead_code_count: u64,
pub dead_code_high: u64,
}
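/// One detected community: its id, human-readable label, and member count.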
pub struct CommunityInfo {
pub id: i64,
pub label: String,
pub node_count: i64,
}
const SKILL_TEMPLATE: &str = r#"# cgx — Codebase Knowledge Graph
> Auto-generated by cgx on {{ indexed_at }}. Do not edit manually.
> Re-run `cgx analyze` to refresh.
## When to Use cgx
Use cgx commands INSTEAD of reading source files when you need to:
- Find where any function, class, or variable is defined
- Understand what depends on a specific piece of code
- Know what will break before making a change
- Understand the architecture of an unfamiliar area
- Find who owns a file or module
- Find dead code or unused exports
- Trace a full call chain from entry point to implementation
**Rule: Never open a file speculatively. Query first. Open only if you need
the implementation body — not to find where something lives.**
## Trigger Patterns — MANDATORY
You MUST run cgx (CLI or MCP) automatically — without waiting to be asked — whenever:
- "what calls X" / "who uses X" / "what depends on X"
- "show me the architecture" / "how does this work" / "explain X"
- "what breaks if I change X" / "blast radius of X"
- "find X in the codebase" / "where is X defined" / "where is X used"
- "who owns X" / "who wrote X" / "git blame X"
- "is X used anywhere" / "is X dead code" / "can I delete X"
- Starting ANY new task — run `cgx summary` before touching any file
- Before ANY edit — run `cgx query blast-radius` on the target function
- After any question about code structure, dependencies, or architecture
Never grep or open source files to answer structural questions. cgx is always faster and cheaper.
## Commands
If cgx is available as an MCP tool (preferred — no subprocess):
```
get_repo_summary # orient: nodes, communities, hotspots
find_symbol <name> # locate any function/class/type
get_blast_radius <node_id> # what breaks if this changes
get_neighbors <node_id> # direct dependencies
get_call_chain <from> <to> # trace a call path
get_hotspots # riskiest files to edit
get_file_owners <path> # git blame ownership
search_graph <query> # full-text search
run_query <sql> # raw SQL against the graph
```
If cgx is available as a CLI:
```bash
cgx summary # orient yourself
cgx query find <name> # locate a symbol
cgx query find <name> --kind=Function
cgx query blast-radius <function> # change impact (run BEFORE every edit)
cgx query deps <node-name> # what does this depend on
cgx query chain "<A> -> <B>" # trace call path
cgx hotspots # high-risk files
cgx query owners <path> # file ownership
cgx query search "<phrase>" # search by concept
cgx query community <id-or-name> # explore a cluster
cgx query dead-code # find unused exports
```
## Dead Code Commands
```bash
cgx query dead-code --summary
cgx query dead-code --safe-to-delete
cgx query dead-code --kind=exports --path=src/auth/
cgx query dead-code --kind=files
```
## Workflow: Starting a Task
1. `cgx summary` — orient yourself
2. `cgx query find <entry-point>` — locate the relevant node
3. `cgx query blast-radius <node>` — know the risk before touching it
4. Open only the specific files you need
## Workflow: Before Every Edit
1. `cgx query blast-radius <function>` — what breaks?
2. `cgx query deps <function>` — what does it depend on?
3. Make the change
4. `cgx query blast-radius <function>` — verify ripple is as expected
## Token Budget
| Action | Approx tokens |
|---------------------------|---------------|
| `cgx summary` | ~400 |
| `cgx query find X` | ~200 |
| `cgx query blast-radius X` | ~300-800 |
| Opening one source file | ~2,000-15,000 |
Prefer 3 cgx queries over opening 1 file speculatively.
## This Codebase
- **Indexed:** {{ indexed_at }}
- **Nodes:** {{ node_count }} ({{ function_count }} functions, {{ class_count }} classes, {{ file_count }} files)
- **Edges:** {{ edge_count }}
- **Languages:** {{ language_breakdown }}
- **Communities:** {{ community_count }}
- **Dead code candidates:** {{ dead_code_count }} ({{ dead_code_high }} high confidence · safe to investigate)
### Top Communities
{{ top_communities_list }}
### Hotspots (highest risk — review carefully before editing)
{{ hotspots_list }}
### Entry Points (nothing imports these — safe starting points)
{{ entry_points_list }}
### Most Depended-On Nodes (god nodes — change with extreme care)
{{ god_nodes_list }}
"#;
const AGENTS_TEMPLATE: &str = r#"# Codebase Architecture
> Auto-generated by cgx {{ indexed_at }}. Re-run `cgx analyze` to refresh.
> For the full skill context used by AI agents, see `CGX_SKILL.md`.
## Overview
- **Nodes:** {{ node_count }} across {{ file_count }} files
- **Languages:** {{ language_breakdown }}
- **Communities:** {{ community_count }} architectural clusters
## Module Map
The graph is partitioned into {{ community_count }} communities via Louvain clustering.
Each community is a cohesive module — edits inside one community rarely ripple outside it.
{{ community_descriptions }}
## Hotspots (High Risk — Review Before Editing)
Files ranked by churn × coupling score. Editing these is likely to break things.
{{ hotspots_table }}
## Entry Points
Files/functions with no inbound dependencies — safe places to start tracing.
{{ entry_points_list }}
## God Nodes (Most Depended-On)
These are used everywhere. Breaking them has maximum blast radius.
{{ god_nodes_list }}
> Query this graph before opening any file. See `CGX_SKILL.md` for full command reference.
"#;
/// Gather, in one pass over the graph, every metric the skill and AGENTS
/// templates need: counts by kind, language shares, top communities,
/// hotspots, entry points, god nodes, and dead-code stats.
pub fn build_skill_data(db: &GraphDb) -> anyhow::Result<SkillData> {
let node_count = db.node_count()?;
let edge_count = db.edge_count()?;
let lang_breakdown = db.get_language_breakdown()?;
let communities = db.get_communities()?;
let counts_by_kind = db.get_node_counts_by_kind()?;
let function_count = counts_by_kind.get("Function").copied().unwrap_or(0);
let class_count = counts_by_kind.get("Class").copied().unwrap_or(0);
let file_count = counts_by_kind.get("File").copied().unwrap_or(0);
let language_breakdown = if lang_breakdown.is_empty() {
"none".to_string()
} else {
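        // Shares from `get_language_breakdown` are assumed to be fractions in
        // [0, 1] (hence the `* 100.0` below); sort descending by share and
        // render as e.g. "Rust 62%, TypeScript 38%".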
let mut entries: Vec<_> = lang_breakdown.iter().collect();
entries.sort_by(|a, b| b.1.partial_cmp(a.1).unwrap_or(std::cmp::Ordering::Equal));
entries
.iter()
.map(|(lang, pct)| format!("{} {:.0}%", lang, *pct * 100.0))
.collect::<Vec<_>>()
.join(", ")
};
let top_communities: Vec<CommunityInfo> = communities
.iter()
.take(5)
.map(|(id, label, count, _top_nodes)| CommunityInfo {
id: *id,
label: label.clone(),
node_count: *count,
})
.collect();
let all_nodes = db.get_all_nodes()?;
let mut file_nodes: Vec<&Node> = all_nodes
.iter()
.filter(|n| n.kind == "File" && n.churn > 0.0)
.collect();
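    // Blended risk score: churn * coupling dominates, with a small in-degree
    // term (0.01 per inbound edge) as a tiebreaker for heavily-imported files.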
file_nodes.sort_by(|a, b| {
let sa = a.churn * a.coupling + a.in_degree as f64 * 0.01;
let sb = b.churn * b.coupling + b.in_degree as f64 * 0.01;
sb.partial_cmp(&sa).unwrap_or(std::cmp::Ordering::Equal)
});
let hotspots: Vec<Node> = file_nodes.iter().take(5).map(|&n| n.clone()).collect();
let mut entry_nodes: Vec<&Node> = all_nodes
.iter()
.filter(|n| n.in_degree == 0 && n.kind != "File" && n.kind != "Author")
.collect();
entry_nodes.sort_by_key(|node| std::cmp::Reverse(node.out_degree));
let entry_points: Vec<Node> = entry_nodes.iter().take(5).map(|&n| n.clone()).collect();
let mut god_nodes: Vec<&Node> = all_nodes
.iter()
.filter(|n| n.in_degree > 0 && n.kind != "File")
.collect();
god_nodes.sort_by_key(|node| std::cmp::Reverse(node.in_degree));
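    // Keep only the first occurrence of each name so the same symbol defined
    // in several files appears just once in the top-five list.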
let mut seen_names = std::collections::HashSet::new();
let top_god_nodes: Vec<Node> = god_nodes
.iter()
.filter(|n| seen_names.insert(n.name.clone()))
.take(5)
.map(|&n| n.clone())
.collect();
    let (dead_code_count, dead_code_high) = db
        .get_dead_code_stats()
        .map(|(total, high)| (total as u64, high as u64))
        .unwrap_or((0, 0));
Ok(SkillData {
indexed_at: chrono::Utc::now().to_rfc3339(),
node_count,
function_count,
class_count,
file_count,
edge_count,
language_breakdown,
community_count: communities.len() as u32,
top_communities,
hotspots,
entry_points,
god_nodes: top_god_nodes,
dead_code_count,
dead_code_high,
})
}
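/// Render `CGX_SKILL.md` by plain string substitution of `{{ placeholder }}`
/// tokens in `SKILL_TEMPLATE`; no templating engine is involved, so any token
/// that survives substitution triggers the warning below rather than an error.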
pub fn generate_skill(data: &SkillData) -> String {
let mut c = SKILL_TEMPLATE.to_string();
c = c.replace("{{ indexed_at }}", &data.indexed_at);
c = c.replace("{{ node_count }}", &data.node_count.to_string());
c = c.replace("{{ function_count }}", &data.function_count.to_string());
c = c.replace("{{ class_count }}", &data.class_count.to_string());
c = c.replace("{{ file_count }}", &data.file_count.to_string());
c = c.replace("{{ edge_count }}", &data.edge_count.to_string());
c = c.replace("{{ language_breakdown }}", &data.language_breakdown);
c = c.replace("{{ community_count }}", &data.community_count.to_string());
c = c.replace("{{ dead_code_count }}", &data.dead_code_count.to_string());
c = c.replace("{{ dead_code_high }}", &data.dead_code_high.to_string());
    let communities_list = if data.top_communities.is_empty() {
        "_(none detected)_".to_string()
    } else {
data.top_communities
.iter()
.map(|ci| format!("- **#{}** — {} ({} nodes)", ci.id, ci.label, ci.node_count))
.collect::<Vec<_>>()
.join("\n")
};
c = c.replace("{{ top_communities_list }}", &communities_list);
    let hotspots_list = if data.hotspots.is_empty() {
        "_(none — no git history or low churn)_".to_string()
    } else {
data.hotspots
.iter()
.map(|n| {
format!(
"- `{}` — churn {:.2}, {} callers",
n.path, n.churn, n.in_degree
)
})
.collect::<Vec<_>>()
.join("\n")
};
c = c.replace("{{ hotspots_list }}", &hotspots_list);
    let entry_list = if data.entry_points.is_empty() {
        "_(none detected)_".to_string()
    } else {
data.entry_points
.iter()
.map(|n| format!("- `{}` ({})", n.name, n.kind))
.collect::<Vec<_>>()
.join("\n")
};
c = c.replace("{{ entry_points_list }}", &entry_list);
    let god_list = if data.god_nodes.is_empty() {
        "_(none detected)_".to_string()
    } else {
data.god_nodes
.iter()
.map(|n| format!("- `{}` — {} callers", n.name, n.in_degree))
.collect::<Vec<_>>()
.join("\n")
};
c = c.replace("{{ god_nodes_list }}", &god_list);
if c.contains("{{") {
eprintln!(" Warning: CGX_SKILL.md contains unreplaced placeholder tokens");
}
c
}
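/// Render `AGENTS.md` from `AGENTS_TEMPLATE` using the same plain string
/// substitution as [`generate_skill`].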
pub fn generate_agents_md(data: &SkillData) -> String {
let mut c = AGENTS_TEMPLATE.to_string();
c = c.replace("{{ indexed_at }}", &data.indexed_at);
c = c.replace("{{ node_count }}", &data.node_count.to_string());
c = c.replace("{{ file_count }}", &data.file_count.to_string());
c = c.replace("{{ language_breakdown }}", &data.language_breakdown);
c = c.replace("{{ community_count }}", &data.community_count.to_string());
    let community_descriptions = if data.top_communities.is_empty() {
        "No architectural communities detected.".to_string()
    } else {
data.top_communities
.iter()
.map(|ci| format!("- **#{} — {}** ({} nodes)", ci.id, ci.label, ci.node_count))
.collect::<Vec<_>>()
.join("\n")
};
c = c.replace("{{ community_descriptions }}", &community_descriptions);
let hotspots_table = if data.hotspots.is_empty() {
"No hotspots detected (no git history or low churn).\n".to_string()
} else {
let mut t = String::from("| File | Churn | Callers |\n|------|-------|--------|\n");
for n in &data.hotspots {
t.push_str(&format!(
"| `{}` | {:.2} | {} |\n",
n.path, n.churn, n.in_degree
));
}
t
};
c = c.replace("{{ hotspots_table }}", &hotspots_table);
    let entry_list = if data.entry_points.is_empty() {
        "_(none detected)_".to_string()
    } else {
data.entry_points
.iter()
.map(|n| format!("- `{}` ({})", n.name, n.kind))
.collect::<Vec<_>>()
.join("\n")
};
c = c.replace("{{ entry_points_list }}", &entry_list);
    let god_list = if data.god_nodes.is_empty() {
        "_(none detected)_".to_string()
    } else {
data.god_nodes
.iter()
.map(|n| {
format!(
"- `{}` ({}) — {} callers, in `{}`",
n.name, n.kind, n.in_degree, n.path
)
})
.collect::<Vec<_>>()
.join("\n")
};
c = c.replace("{{ god_nodes_list }}", &god_list);
if c.contains("{{") {
eprintln!(" Warning: AGENTS.md contains unreplaced placeholder tokens");
}
c
}
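/// Write the generated skill file to `<repo_root>/CGX_SKILL.md`, overwriting
/// any existing copy.
///
/// A minimal end-to-end sketch (hedged: `GraphDb::open` and the index path
/// are assumptions about `crate::graph`, not its confirmed API):
///
/// ```ignore
/// let db = GraphDb::open(repo_root.join(".cgx/graph.db"))?; // hypothetical constructor
/// let data = build_skill_data(&db)?;
/// write_skill(repo_root, &data)?;     // CGX_SKILL.md
/// write_agents_md(repo_root, &data)?; // AGENTS.md
/// ```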
pub fn write_skill(repo_root: &Path, data: &SkillData) -> anyhow::Result<()> {
std::fs::write(repo_root.join("CGX_SKILL.md"), generate_skill(data))?;
Ok(())
}
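/// Write the generated architecture summary to `<repo_root>/AGENTS.md`,
/// overwriting any existing copy.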
pub fn write_agents_md(repo_root: &Path, data: &SkillData) -> anyhow::Result<()> {
std::fs::write(repo_root.join("AGENTS.md"), generate_agents_md(data))?;
Ok(())
}
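/// Install `post-commit` and `post-checkout` hooks that re-run an incremental
/// analyze after each commit or branch switch. Returns
/// `(post_commit_installed, post_checkout_installed)`; both are `false` when
/// `.git/hooks` is missing (not a repository, or a worktree where `.git` is a
/// file rather than a directory).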
pub fn install_git_hooks(repo_root: &Path) -> anyhow::Result<(bool, bool)> {
let hooks_dir = repo_root.join(".git").join("hooks");
if !hooks_dir.exists() {
return Ok((false, false));
}
Ok((
install_one_hook(&hooks_dir.join("post-commit")),
install_one_hook(&hooks_dir.join("post-checkout")),
))
}
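/// Write (or refresh) a single cgx-managed hook script. A pre-existing hook
/// whose second line lacks the `# cgx-managed` marker is treated as
/// user-authored: it is left untouched and a warning is printed. The script
/// embeds the absolute path of the current executable (falling back to plain
/// `cgx` on `$PATH`), so the generated file looks like this, with
/// `/path/to/cgx` standing in for the resolved binary path:
///
/// ```sh
/// #!/bin/sh
/// # cgx-managed
/// /path/to/cgx analyze --incremental --quiet
/// ```
///
/// Returns `true` only if the hook file was actually written.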
fn install_one_hook(path: &Path) -> bool {
if path.exists() {
if let Ok(existing) = std::fs::read_to_string(path) {
let lines: Vec<&str> = existing.lines().collect();
if lines.len() < 2 || !lines[1].contains("cgx-managed") {
eprintln!(
" Warning: {} exists but was not created by cgx. Skipping.",
path.display()
);
return false;
}
} else {
return false;
}
}
let bin = std::env::current_exe()
.ok()
.and_then(|p| p.to_str().map(|s| s.to_string()))
.unwrap_or_else(|| "cgx".to_string());
let content = format!(
"#!/bin/sh\n# cgx-managed\n{} analyze --incremental --quiet\n",
bin
);
if std::fs::write(path, content).is_err() {
return false;
}
#[cfg(unix)]
{
use std::os::unix::fs::PermissionsExt;
let _ = std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o755));
}
true
}