// cgx-engine 0.1.0
//
// Core engine for cgx — Tree-sitter parsing, DuckDB graph storage, git analysis, and clustering
// Documentation
use std::path::Path;

use crate::graph::{GraphDb, Node};

/// Aggregated repository statistics used to fill the `CGX_SKILL.md` and
/// `AGENTS.md` templates.
///
/// The field descriptions below reflect how [`build_skill_data`] populates the
/// struct; [`generate_skill`] and [`generate_agents_md`] only read it.
pub struct SkillData {
    /// Generation timestamp; `build_skill_data` stores the current UTC time in
    /// RFC 3339 form.
    pub indexed_at: String,
    /// Total node count as reported by the graph database.
    pub node_count: u64,
    /// Count of nodes of kind `"Function"` (0 when that kind is absent).
    pub function_count: u64,
    /// Count of nodes of kind `"Class"` (0 when that kind is absent).
    pub class_count: u64,
    /// Count of nodes of kind `"File"` (0 when that kind is absent).
    pub file_count: u64,
    /// Total edge count as reported by the graph database.
    pub edge_count: u64,
    /// Display string of `<language> <percent>%` entries joined by `", "`,
    /// largest share first, or `"none"` when no languages were reported.
    pub language_breakdown: String,
    /// Number of communities returned by the graph database.
    pub community_count: u32,
    /// At most the first five communities, in the order the database returned them.
    pub top_communities: Vec<CommunityInfo>,
    /// At most five `File` nodes with positive churn, ranked by
    /// `churn * coupling + in_degree * 0.01`, highest first.
    pub hotspots: Vec<Node>,
    /// At most five nodes with no inbound edges (excluding `File` and `Author`
    /// kinds), highest out-degree first.
    pub entry_points: Vec<Node>,
    /// At most five non-`File` nodes with inbound edges, highest in-degree
    /// first, with duplicate names collapsed to the first occurrence.
    pub god_nodes: Vec<Node>,
}

/// Summary of a single community (cluster) as listed in the generated documents.
pub struct CommunityInfo {
    /// Community identifier; rendered as `#<id>` in the generated markdown.
    pub id: i64,
    /// Label shown next to the identifier.
    pub label: String,
    /// Number of nodes in the community; rendered as `(<n> nodes)`.
    pub node_count: i64,
}

/// Markdown template for `CGX_SKILL.md`.
///
/// `generate_skill` fills it by plain string replacement of these
/// `{{ name }}` placeholders: `indexed_at`, `node_count`, `function_count`,
/// `class_count`, `file_count`, `edge_count`, `language_breakdown`,
/// `community_count`, `top_communities_list`, `hotspots_list`,
/// `entry_points_list` and `god_nodes_list`.
///
/// Everything else in the string is emitted verbatim, so edits here change the
/// generated file directly.
const SKILL_TEMPLATE: &str = r#"# cgx — Codebase Knowledge Graph

> Auto-generated by cgx on {{ indexed_at }}. Do not edit manually.
> Re-run `cgx analyze` to refresh.

## When to Use cgx

Use cgx commands INSTEAD of reading source files when you need to:
- Find where any function, class, or variable is defined
- Understand what depends on a specific piece of code
- Know what will break before making a change
- Understand the architecture of an unfamiliar area
- Find who owns a file or module
- Find dead code or unused exports
- Trace a full call chain from entry point to implementation

**Rule: Never open a file speculatively. Query first. Open only if you need
the implementation body — not to find where something lives.**

## Trigger Patterns

Run cgx automatically when the user says or implies any of:
- "what calls X" / "who uses X" / "what depends on X"
- "show me the architecture" / "how does this work"
- "what breaks if I change X" / "blast radius of X"
- "find X in the codebase" / "where is X defined"
- "who owns X" / "who wrote X"
- "is X used anywhere" / "is X dead code"
- Starting a new task in an unfamiliar part of the codebase
- Before making any edit to a function with many callers

## Commands

```bash
# Always run first in a new session
cgx summary

# Find any symbol
cgx query find <name>
cgx query find <name> --kind=Function

# Dependencies of a node
cgx query deps <node-name>

# Blast radius — run BEFORE every edit
cgx query blast-radius <function-name>

# Trace a call path
cgx query chain "<A> -> <B>"

# High-risk files
cgx hotspots

# Code ownership
cgx query owners <path>

# Search by concept
cgx query search "<phrase>"

# Community / cluster
cgx query community <id-or-name>

# Dead code
cgx query dead-code
```

## Workflow: Starting a Task

1. `cgx summary`                        — orient yourself
2. `cgx query find <entry-point>`       — locate the relevant node
3. `cgx query blast-radius <node>`      — know the risk before touching it
4. Open only the specific files you need

## Workflow: Before Every Edit

1. `cgx query blast-radius <function>`  — what breaks?
2. `cgx query deps <function>`          — what does it depend on?
3. Make the change
4. `cgx query blast-radius <function>`  — verify ripple is as expected

## Token Budget

| Action                    | Approx tokens |
|---------------------------|---------------|
| `cgx summary`             | ~400          |
| `cgx query find X`        | ~200          |
| `cgx query blast-radius X`| ~300-800      |
| Opening one source file   | ~2,000-15,000 |

Prefer 3 cgx queries over opening 1 file speculatively.

## This Codebase

- **Indexed:** {{ indexed_at }}
- **Nodes:** {{ node_count }} ({{ function_count }} functions,
  {{ class_count }} classes, {{ file_count }} files)
- **Edges:** {{ edge_count }}
- **Languages:** {{ language_breakdown }}
- **Communities:** {{ community_count }}

### Top Communities
{{ top_communities_list }}

### Hotspots (highest risk — review carefully before editing)
{{ hotspots_list }}

### Entry Points (nothing imports these — safe starting points)
{{ entry_points_list }}

### Most Depended-On Nodes (god nodes — change with extreme care)
{{ god_nodes_list }}
"#;

/// Markdown template for `AGENTS.md`.
///
/// `generate_agents_md` fills it by plain string replacement of these
/// `{{ name }}` placeholders: `indexed_at`, `node_count`, `file_count`,
/// `language_breakdown`, `community_count`, `community_descriptions`,
/// `hotspots_table` and `entry_points_list`.
///
/// Everything else in the string is emitted verbatim.
const AGENTS_TEMPLATE: &str = r#"# Codebase Architecture

> Auto-generated by cgx {{ indexed_at }}

## Overview
{{ node_count }} nodes across {{ file_count }} files.
Primary languages: {{ language_breakdown }}.
{{ community_count }} architectural communities detected.

## Module Map
{{ community_descriptions }}

## Hotspots
These files change frequently and have many dependents.
Review carefully before editing.
{{ hotspots_table }}

## Entry Points
These files have no inbound imports — they are roots.
{{ entry_points_list }}

## AI Integration
This repo is indexed by cgx. Two integration modes are available:

**Skills (zero config):** Read `CGX_SKILL.md` for command reference.

**MCP (structured):** Run `cgx setup` to configure your editor,
then `cgx mcp` to start the server.
"#;

/// Gathers the statistics and node rankings that the markdown generators need.
///
/// Reads aggregate counts, the language breakdown, communities and the full
/// node list from `db`, then derives three short rankings (at most five entries
/// each): hotspots, entry points and the most depended-on ("god") nodes.
///
/// # Errors
///
/// Returns the first error produced by any of the `GraphDb` queries.
pub fn build_skill_data(db: &GraphDb) -> anyhow::Result<SkillData> {
    use std::cmp::Ordering;

    /// Maximum number of entries kept in each ranking.
    const TOP_N: usize = 5;

    /// Clones the first `limit` nodes of an already-ranked slice.
    fn take_top(ranked: &[&Node], limit: usize) -> Vec<Node> {
        ranked.iter().take(limit).map(|n| (*n).clone()).collect()
    }

    // The queries run in this fixed order; the first failure aborts the build.
    let total_nodes = db.node_count()?;
    let total_edges = db.edge_count()?;
    let lang_shares = db.get_language_breakdown()?;
    let communities = db.get_communities()?;
    let kind_counts = db.get_node_counts_by_kind()?;

    // A kind that is missing from the map counts as zero.
    let count_of = |kind: &str| kind_counts.get(kind).copied().unwrap_or(0);
    let function_count = count_of("Function");
    let class_count = count_of("Class");
    let file_count = count_of("File");

    let language_breakdown = if lang_shares.is_empty() {
        "none".to_string()
    } else {
        let mut ranked: Vec<_> = lang_shares.iter().collect();
        // Largest share first; incomparable shares are treated as equal.
        ranked.sort_by(|lhs, rhs| rhs.1.partial_cmp(lhs.1).unwrap_or(Ordering::Equal));
        let mut parts = Vec::with_capacity(ranked.len());
        for (lang, share) in ranked.iter() {
            parts.push(format!("{} {:.0}%", lang, *share * 100.0));
        }
        parts.join(", ")
    };

    // Keep the leading communities in the order the database returned them.
    let mut top_communities: Vec<CommunityInfo> = Vec::with_capacity(TOP_N);
    for (id, label, size, _members) in communities.iter().take(TOP_N) {
        top_communities.push(CommunityInfo {
            id: *id,
            label: label.clone(),
            node_count: *size,
        });
    }

    let nodes = db.get_all_nodes()?;

    // Hotspots: files with recorded churn, ranked by churn weighted by coupling,
    // with a small in-degree term added.
    let risk = |n: &Node| n.churn * n.coupling + n.in_degree as f64 * 0.01;
    let mut churned_files: Vec<&Node> = nodes
        .iter()
        .filter(|n| n.kind == "File" && n.churn > 0.0)
        .collect();
    churned_files.sort_by(|a, b| {
        risk(*b)
            .partial_cmp(&risk(*a))
            .unwrap_or(Ordering::Equal)
    });
    let hotspots = take_top(&churned_files, TOP_N);

    // Entry points: nothing points at them; the widest fan-out comes first.
    let mut roots: Vec<&Node> = nodes
        .iter()
        .filter(|n| n.in_degree == 0 && n.kind != "File" && n.kind != "Author")
        .collect();
    roots.sort_by(|a, b| b.out_degree.cmp(&a.out_degree));
    let entry_points = take_top(&roots, TOP_N);

    // God nodes: most inbound edges first, keeping only the first node seen for
    // each name so a name shared across files is listed once.
    let mut depended_on: Vec<&Node> = nodes
        .iter()
        .filter(|n| n.in_degree > 0 && n.kind != "File")
        .collect();
    depended_on.sort_by(|a, b| b.in_degree.cmp(&a.in_degree));
    let mut seen_names = std::collections::HashSet::new();
    let mut god_nodes: Vec<Node> = Vec::new();
    for node in depended_on.iter() {
        if god_nodes.len() == TOP_N {
            break;
        }
        if seen_names.insert(node.name.clone()) {
            god_nodes.push((*node).clone());
        }
    }

    Ok(SkillData {
        indexed_at: chrono::Utc::now().to_rfc3339(),
        node_count: total_nodes,
        function_count,
        class_count,
        file_count,
        edge_count: total_edges,
        language_breakdown,
        community_count: communities.len() as u32,
        top_communities,
        hotspots,
        entry_points,
        god_nodes,
    })
}

/// Renders the text of `CGX_SKILL.md` by substituting `data` into `SKILL_TEMPLATE`.
///
/// Scalar fields are inserted as-is; the community, hotspot, entry-point and
/// god-node sections become markdown bullet lists, or a short italic
/// placeholder when the corresponding list is empty.
///
/// A warning is printed to stderr if the result still contains `{{`.
pub fn generate_skill(data: &SkillData) -> String {
    /// Joins pre-formatted bullet lines with newlines, or yields `placeholder`
    /// when there are none.
    fn bullet_block(lines: Vec<String>, placeholder: &str) -> String {
        if lines.is_empty() {
            placeholder.to_string()
        } else {
            lines.join("\n")
        }
    }

    let communities = bullet_block(
        data.top_communities
            .iter()
            .map(|ci| format!("- **#{}** — {} ({} nodes)", ci.id, ci.label, ci.node_count))
            .collect(),
        "_(none detected)_\n",
    );
    let hotspots = bullet_block(
        data.hotspots
            .iter()
            .map(|n| format!("- `{}` — churn {:.2}, {} callers", n.path, n.churn, n.in_degree))
            .collect(),
        "_(none — no git history or low churn)_\n",
    );
    let entry_points = bullet_block(
        data.entry_points
            .iter()
            .map(|n| format!("- `{}` ({})", n.name, n.kind))
            .collect(),
        "_(none detected)_\n",
    );
    let god_nodes = bullet_block(
        data.god_nodes
            .iter()
            .map(|n| format!("- `{}` — {} callers", n.name, n.in_degree))
            .collect(),
        "_(none detected)_\n",
    );

    // Applied strictly in this order, one placeholder at a time.
    let substitutions = [
        ("{{ indexed_at }}", data.indexed_at.clone()),
        ("{{ node_count }}", data.node_count.to_string()),
        ("{{ function_count }}", data.function_count.to_string()),
        ("{{ class_count }}", data.class_count.to_string()),
        ("{{ file_count }}", data.file_count.to_string()),
        ("{{ edge_count }}", data.edge_count.to_string()),
        ("{{ language_breakdown }}", data.language_breakdown.clone()),
        ("{{ community_count }}", data.community_count.to_string()),
        ("{{ top_communities_list }}", communities),
        ("{{ hotspots_list }}", hotspots),
        ("{{ entry_points_list }}", entry_points),
        ("{{ god_nodes_list }}", god_nodes),
    ];

    let mut rendered = SKILL_TEMPLATE.to_string();
    for (token, value) in substitutions.iter() {
        rendered = rendered.replace(*token, value);
    }

    // Any remaining `{{` means a placeholder in the template was not handled above
    // (or a substituted value itself contained one).
    if rendered.contains("{{") {
        eprintln!("  Warning: CGX_SKILL.md contains unreplaced placeholder tokens");
    }
    rendered
}

/// Renders the text of `AGENTS.md` by substituting `data` into `AGENTS_TEMPLATE`.
///
/// Scalar fields are inserted as-is. Communities and entry points become
/// markdown bullet lists and hotspots become a markdown table; each section
/// falls back to a short placeholder sentence when its list is empty.
///
/// A warning is printed to stderr if the result still contains `{{`.
pub fn generate_agents_md(data: &SkillData) -> String {
    let mut c = AGENTS_TEMPLATE.to_string();

    c = c.replace("{{ indexed_at }}", &data.indexed_at);
    c = c.replace("{{ node_count }}", &data.node_count.to_string());
    c = c.replace("{{ file_count }}", &data.file_count.to_string());
    c = c.replace("{{ language_breakdown }}", &data.language_breakdown);
    c = c.replace("{{ community_count }}", &data.community_count.to_string());

    let community_descriptions = if data.top_communities.is_empty() {
        "No architectural communities detected.\n".to_string()
    } else {
        data.top_communities
            .iter()
            // The space keeps the numeric id and the label from running together
            // (`#3 auth` rather than `#3auth`).
            .map(|ci| format!("- **#{} {}** ({} nodes)", ci.id, ci.label, ci.node_count))
            .collect::<Vec<_>>()
            .join("\n")
    };
    c = c.replace("{{ community_descriptions }}", &community_descriptions);

    let hotspots_table = if data.hotspots.is_empty() {
        "No hotspots detected (no git history or low churn).\n".to_string()
    } else {
        // Header row plus one row per hotspot; every row ends with a newline.
        let mut table = String::from("| File | Churn | Callers |\n|------|-------|--------|\n");
        for n in &data.hotspots {
            table.push_str(&format!("| `{}` | {:.2} | {} |\n", n.path, n.churn, n.in_degree));
        }
        table
    };
    c = c.replace("{{ hotspots_table }}", &hotspots_table);

    let entry_list = if data.entry_points.is_empty() {
        "_(none detected)_\n".to_string()
    } else {
        data.entry_points
            .iter()
            .map(|n| format!("- `{}` ({})", n.name, n.kind))
            .collect::<Vec<_>>()
            .join("\n")
    };
    c = c.replace("{{ entry_points_list }}", &entry_list);

    // Any remaining `{{` means a placeholder in the template was not handled above
    // (or a substituted value itself contained one).
    if c.contains("{{") {
        eprintln!("  Warning: AGENTS.md contains unreplaced placeholder tokens");
    }

    c
}

/// Renders the skill document for `data` and writes it to
/// `<repo_root>/CGX_SKILL.md`, replacing any existing file.
///
/// # Errors
///
/// Returns the I/O error from `std::fs::write` if the file cannot be written.
pub fn write_skill(repo_root: &Path, data: &SkillData) -> anyhow::Result<()> {
    let destination = repo_root.join("CGX_SKILL.md");
    let markdown = generate_skill(data);
    std::fs::write(destination, markdown)?;
    Ok(())
}

/// Renders the architecture overview for `data` and writes it to
/// `<repo_root>/AGENTS.md`, replacing any existing file.
///
/// # Errors
///
/// Returns the I/O error from `std::fs::write` if the file cannot be written.
pub fn write_agents_md(repo_root: &Path, data: &SkillData) -> anyhow::Result<()> {
    let destination = repo_root.join("AGENTS.md");
    let markdown = generate_agents_md(data);
    std::fs::write(destination, markdown)?;
    Ok(())
}

/// Installs the cgx `post-commit` and `post-checkout` hooks under
/// `<repo_root>/.git/hooks`.
///
/// Returns `(post_commit_installed, post_checkout_installed)`. When the hooks
/// directory does not exist nothing is written and both flags are `false`.
/// The function itself never returns `Err`; per-hook failures are reported
/// through the flags.
pub fn install_git_hooks(repo_root: &Path) -> anyhow::Result<(bool, bool)> {
    let hooks_dir = repo_root.join(".git").join("hooks");

    if hooks_dir.exists() {
        // post-commit is attempted first, then post-checkout.
        let post_commit = install_one_hook(&hooks_dir.join("post-commit"));
        let post_checkout = install_one_hook(&hooks_dir.join("post-checkout"));
        Ok((post_commit, post_checkout))
    } else {
        Ok((false, false))
    }
}

/// Wraps `raw` in single quotes for a POSIX shell so that spaces, backslashes
/// and other metacharacters are taken literally.
fn shell_single_quote(raw: &str) -> String {
    // Nothing is special inside single quotes, so an embedded `'` has to close
    // the quote, emit an escaped quote, and reopen it: ' -> '\''
    format!("'{}'", raw.replace('\'', "'\\''"))
}

/// Writes (or refreshes) a single cgx-managed git hook script at `path`.
///
/// An existing file is overwritten only when its second line contains the
/// `cgx-managed` marker; any other existing file is treated as a user-owned
/// hook, a warning is printed to stderr, and the file is left untouched.
///
/// Returns `true` when the hook script was written, and `false` when it was
/// skipped or when reading/writing the file failed.
fn install_one_hook(path: &Path) -> bool {
    if path.exists() {
        match std::fs::read_to_string(path) {
            Ok(existing) => {
                // The marker lives on line 2, right after the shebang.
                let managed = existing
                    .lines()
                    .nth(1)
                    .map_or(false, |line| line.contains("cgx-managed"));
                if !managed {
                    eprintln!("  Warning: {} exists but was not created by cgx. Skipping.", path.display());
                    return false;
                }
            }
            // Unreadable (or non-UTF-8) hook: do not risk clobbering it.
            Err(_) => return false,
        }
    }

    // Use the running binary's path so the hook works even if cgx isn't on PATH.
    let bin = std::env::current_exe()
        .ok()
        .and_then(|p| p.to_str().map(|s| s.to_string()))
        .unwrap_or_else(|| "cgx".to_string());

    // The path is quoted because the script is interpreted by /bin/sh: unquoted,
    // a path containing spaces would be split into several words and backslashes
    // would be consumed as escapes.
    let content = format!(
        "#!/bin/sh\n# cgx-managed\n{} analyze --incremental --quiet\n",
        shell_single_quote(&bin)
    );
    if std::fs::write(path, content).is_err() {
        return false;
    }

    #[cfg(unix)]
    {
        use std::os::unix::fs::PermissionsExt;
        // Best effort: a failure to set the mode is deliberately not reported.
        let _ = std::fs::set_permissions(path, std::fs::Permissions::from_mode(0o755));
    }
    true
}