bctx-weave 0.1.28

use forge::signal::compactor;
use once_cell::sync::Lazy;
use regex::Regex;

/// Matches whole progress lines such as "Scanning N files..." or
/// "Running N rules...", including the trailing newline when present.
/// The keyword must sit at the very start of the line.
static PROGRESS_RE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"(?m)^(Scanning|Running|Skipping|Loading|Fetching|Initializing) [^\n]+\n?";
    Regex::new(pattern).unwrap()
});
/// Matches code-context lines (the highlighting blocks in semgrep output):
/// optional leading whitespace followed by `>`, `|`, a `<digits> |` gutter,
/// or a run of `^` / `~` markers, through the end of that line.
static CONTEXT_RE: Lazy<Regex> = Lazy::new(|| {
    let pattern = r"(?m)^[\s]*(>|\||\d+ \||[\^~]+)[^\n]*\n?";
    Regex::new(pattern).unwrap()
});

/// Compresses semgrep output into a short, per-rule summary.
///
/// `raw` is first passed through `compactor::normalise`. If the result looks
/// like a JSON report (starts with `{` and contains a `"results"` key) it is
/// handed to `compress_semgrep_json`. Otherwise progress lines and
/// code-context lines are stripped and the remaining text is parsed
/// heuristically:
///
/// * if the output reports no findings, only its summary line is returned;
/// * if no finding can be parsed, up to 30 non-empty lines are returned
///   (followed by a "more lines" marker when truncated);
/// * otherwise each rule, sorted by ID, is listed with its finding count and
///   at most three entries, followed by a
///   `semgrep: N findings across M rules` total line.
///
/// `_subcmd` is currently unused.
pub fn compress_semgrep(_subcmd: &str, raw: &str) -> String {
    let cleaned = compactor::normalise(raw);

    // JSON output is summarised by a dedicated routine.
    if cleaned.trim_start().starts_with('{') && cleaned.contains("\"results\"") {
        return compress_semgrep_json(&cleaned);
    }

    let s = PROGRESS_RE.replace_all(&cleaned, "");
    let s = CONTEXT_RE.replace_all(&s, "");

    if reports_no_findings(&s) {
        // Keep only the summary line.
        let summary = s
            .lines()
            .find(|l| l.contains("findings") || l.contains("Findings"))
            .unwrap_or("semgrep: no findings");
        return summary.trim().to_string();
    }

    let by_rule = group_findings_by_rule(&s);

    if by_rule.is_empty() {
        // Fallback: nothing parsed, so keep the first non-empty lines verbatim.
        let kept: Vec<&str> = s.lines().filter(|l| !l.trim().is_empty()).collect();
        return if kept.len() > 30 {
            format!(
                "{}\n… [{} more lines]",
                kept[..30].join("\n"),
                kept.len() - 30
            )
        } else {
            kept.join("\n")
        };
    }

    let total: usize = by_rule.values().map(Vec::len).sum();

    // `BTreeMap` iterates in key order, so rules come out sorted by ID.
    let mut result: Vec<String> = Vec::new();
    for (rule, locs) in &by_rule {
        result.push(format!("{rule} — {} finding(s)", locs.len()));
        result.extend(locs.iter().take(3).map(|loc| format!("  {loc}")));
        if locs.len() > 3 {
            result.push(format!("  … {} more", locs.len() - 3));
        }
    }
    result.push(format!(
        "semgrep: {total} findings across {} rules",
        by_rule.len()
    ));
    result.join("\n")
}

/// Returns `true` when `text` reports zero findings.
///
/// A plain substring test for "0 findings" would also match "10 findings" or
/// "20 findings", so a match only counts when it is not preceded by a digit.
fn reports_no_findings(text: &str) -> bool {
    text.contains("No findings.")
        || text.match_indices("0 findings").any(|(start, _)| {
            text[..start]
                .chars()
                .next_back()
                .map_or(true, |prev| !prev.is_ascii_digit())
        })
}

/// Groups the findings in cleaned semgrep text output by rule ID.
///
/// Two layouts are recognised:
///
/// * block form: a rule ID line, a `path:line` location line, then message
///   lines;
/// * one-line form: `/path/to/file.py:10:5: rule-id` (the rule is whatever
///   follows the last `:`), optionally followed by message lines.
///
/// Each map value holds one entry per finding.
fn group_findings_by_rule(text: &str) -> std::collections::BTreeMap<String, Vec<String>> {
    let mut by_rule: std::collections::BTreeMap<String, Vec<String>> =
        std::collections::BTreeMap::new();

    let mut current_rule = String::new();
    let mut current_file = String::new();
    // `Some(sep)` while message lines still belong to the most recent one-line
    // finding; `sep` is the text to insert before the next message fragment.
    // Appending instead of pushing keeps one entry (and one count) per finding.
    let mut inline_sep: Option<&'static str> = None;

    for line in text.lines() {
        let t = line.trim();
        if t.is_empty() {
            continue;
        }
        // Separators and the trailing "Findings:" summary carry no finding.
        if t.starts_with("--") || t.starts_with("==") || t.starts_with("Findings:") {
            continue;
        }
        // Location line "path:line". Absolute paths are accepted too;
        // otherwise "/abs/file.py:10" would fall through to the one-line
        // branch and be grouped under a bogus rule named "10".
        if let Some((_, line_no)) = t.split_once(':') {
            if line_no.parse::<u32>().is_ok() {
                current_file = t.to_string();
                inline_sep = None;
                continue;
            }
        }
        // One-line finding: "/path/to/file.py:10:5: rule-id".
        if t.starts_with('/') || t.starts_with("./") {
            if let Some((location, rule)) = t.rsplit_once(':') {
                let rule = rule.trim();
                if !rule.is_empty() {
                    current_rule = rule.to_string();
                    by_rule
                        .entry(current_rule.clone())
                        .or_default()
                        .push(location.to_string());
                    inline_sep = Some(": ");
                    continue;
                }
            }
        }
        // Rule-like identifier: no spaces, contains a dot or slash, and is
        // not a directory-style path ending in '/'.
        if !t.contains(' ') && (t.contains('.') || t.contains('/')) && !t.ends_with('/') {
            current_rule = t.to_string();
            inline_sep = None;
            continue;
        }
        // Anything else is a message line; without a rule it cannot be filed.
        if current_rule.is_empty() {
            continue;
        }
        // A message following a one-line finding describes that finding, so
        // attach it to the existing entry rather than counting it again.
        if let Some(sep) = inline_sep {
            if let Some(last) = by_rule.get_mut(&current_rule).and_then(|v| v.last_mut()) {
                last.push_str(sep);
                last.push_str(t);
                inline_sep = Some(" ");
                continue;
            }
        }
        let entry = if current_file.is_empty() {
            t.to_string()
        } else {
            format!("{current_file}: {t}")
        };
        by_rule.entry(current_rule.clone()).or_default().push(entry);
        current_file.clear();
    }

    by_rule
}

/// Summarises a semgrep JSON report on a single line.
///
/// The text is scanned with regexes rather than parsed as JSON: every
/// `"check_id": "..."` value counts as one finding and every distinct
/// `"path": "..."` value counts as one file. Rules seen more than once are
/// suffixed with `(×N)`; the rule labels are sorted and comma-separated.
/// When no `check_id` is present the fixed "no findings" line is returned.
fn compress_semgrep_json(raw: &str) -> String {
    use once_cell::sync::Lazy;
    use regex::Regex;
    use std::collections::{HashMap, HashSet};

    static RULE_RE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r#""check_id"\s*:\s*"([^"]+)""#).unwrap());
    static PATH_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r#""path"\s*:\s*"([^"]+)""#).unwrap());

    // Tally occurrences per rule ID while counting the findings overall.
    let mut per_rule: HashMap<&str, usize> = HashMap::new();
    let mut total = 0usize;
    for caps in RULE_RE.captures_iter(raw) {
        if let Some(id) = caps.get(1) {
            *per_rule.entry(id.as_str()).or_insert(0) += 1;
            total += 1;
        }
    }

    if total == 0 {
        return "semgrep [json]: no findings".to_string();
    }

    // Distinct file paths mentioned anywhere in the report.
    let files: HashSet<&str> = PATH_RE
        .captures_iter(raw)
        .filter_map(|caps| caps.get(1))
        .map(|m| m.as_str())
        .collect();

    // Sort the finished labels (not the raw IDs) for a stable output order.
    let mut labels: Vec<String> = per_rule
        .into_iter()
        .map(|(id, hits)| {
            if hits <= 1 {
                id.to_string()
            } else {
                format!("{id}(×{hits})")
            }
        })
        .collect();
    labels.sort();

    let file_count = files.len();
    let joined = labels.join(", ");
    format!("semgrep [json]: {total} findings in {file_count} files — {joined}")
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Progress chatter ("Scanning ...", "Running ...") must not survive.
    #[test]
    fn strips_progress_lines() {
        let input = concat!(
            "Scanning 100 files...\n",
            "Running 42 rules...\n",
            "/app/main.py:10: python.lang.security.audit.eval-detected\n",
            "  Use of eval() detected\n",
            "Findings: 1\n",
        );
        let compressed = compress_semgrep("scan", input);
        for banned in ["Scanning", "Running 42"] {
            assert!(!compressed.contains(banned), "{compressed}");
        }
    }

    /// A clean run keeps a "no findings" summary and drops progress lines.
    #[test]
    fn no_findings_clean() {
        let compressed = compress_semgrep("scan", "Scanning 50 files...\nNo findings.\n");
        let mentions_none = ["no findings", "No findings"]
            .iter()
            .any(|needle| compressed.contains(*needle));
        assert!(mentions_none, "{compressed}");
        assert!(!compressed.contains("Scanning"), "{compressed}");
    }

    /// JSON reports are summarised with per-rule counts.
    #[test]
    fn json_mode_extracts_rule_counts() {
        let report = r#"{"results":[{"check_id":"python.lang.eval","path":"app/main.py"},{"check_id":"python.lang.eval","path":"app/utils.py"},{"check_id":"python.sqli","path":"app/db.py"}],"errors":[]}"#;
        let compressed = compress_semgrep("scan", report);
        let names_source = compressed.contains("json") || compressed.contains("python");
        assert!(names_source, "{compressed}");
        let shows_count = compressed.contains("×2") || compressed.contains("2");
        assert!(shows_count, "{compressed}");
    }

    /// Highlighted code-context lines are removed from text output.
    #[test]
    fn strips_code_context_lines() {
        let input = concat!(
            "/app/foo.py:10: python.eval\n",
            "> eval(user_input)\n",
            "  ^^^^^^^^^^^^^^^^\n",
            "| context line\n",
            "Findings: 1\n",
        );
        let compressed = compress_semgrep("scan", input);
        for banned in ["> eval", "^^^^^^^^"] {
            assert!(!compressed.contains(banned), "{compressed}");
        }
    }
}