repotoire 0.8.2

Graph-powered code analysis CLI. 110 detectors for security, architecture, bus factor, and code quality.
Documentation
//! Hierarchical Predictive Coding Detector
//!
//! Replaces the flat n-gram SurprisalDetector with a 5-level
//! hierarchical predictive coding engine. Each level independently
//! models "what's normal" and computes prediction errors (z-scores).
//! Concordance across levels drives severity — a function that is
//! surprising at multiple independent levels is a much stronger
//! signal than any single metric.

use crate::detectors::base::Detector;
use crate::detectors::function_context::FunctionContextMap;
use crate::graph::GraphQueryExt;
use crate::models::Finding;
use crate::predictive::PredictiveCodingEngine;
use anyhow::Result;
use std::path::PathBuf;
use std::sync::Arc;

/// Detector that reports functions the hierarchical predictive coding
/// engine considers surprising at several levels at once.
///
/// The detector itself only carries configuration; all modelling state
/// lives in the `PredictiveCodingEngine` created per `detect` run.
#[derive(Debug, Clone)]
pub struct HierarchicalSurprisalDetector {
    /// Upper bound on the number of findings emitted by one `detect` run.
    max_findings: usize,
}

impl HierarchicalSurprisalDetector {
    /// Cap on emitted findings used by [`Self::new`].
    const DEFAULT_MAX_FINDINGS: usize = 30;

    /// Creates a detector with the default finding cap (30).
    #[must_use]
    pub fn new() -> Self {
        Self {
            max_findings: Self::DEFAULT_MAX_FINDINGS,
        }
    }
}

impl Default for HierarchicalSurprisalDetector {
    /// Equivalent to [`HierarchicalSurprisalDetector::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Detector for HierarchicalSurprisalDetector {
    /// Kebab-case identifier of this detector.
    fn name(&self) -> &'static str {
        "hierarchical-surprisal"
    }

    /// One-line human-readable summary of what the detector looks for.
    fn description(&self) -> &'static str {
        "Detects unusual code using hierarchical predictive coding (5 levels)"
    }

    /// Category label; also stamped on every finding produced by `detect`.
    fn category(&self) -> &'static str {
        "predictive-coding"
    }

    /// Trains a fresh `PredictiveCodingEngine` on the analysis context and
    /// turns its surprising entities into findings.
    ///
    /// Only entities with a concordance of at least 2 are requested from the
    /// engine. Entities whose qualified name does not match any function
    /// node in the graph are skipped. At most `max_findings` findings are
    /// returned; skipped entities do not count against that cap.
    ///
    /// # Errors
    ///
    /// This implementation never constructs an error itself; it always
    /// returns `Ok`.
    fn detect(
        &self,
        ctx: &crate::detectors::analysis_context::AnalysisContext,
    ) -> Result<Vec<Finding>> {
        let graph = ctx.graph;
        let files = &ctx.as_file_provider();
        let contexts = &ctx.functions;
        let i = graph.interner();
        let mut engine = PredictiveCodingEngine::new();
        engine.train_and_score(
            graph,
            files,
            contexts,
            ctx.cached_embeddings.as_ref().map(|arc| arc.as_ref()),
        );

        let surprising = engine.get_surprising_entities(2); // min concordance 2 for findings

        let mut findings: Vec<Finding> = Vec::new();
        let functions = graph.get_functions_shared();

        for (qn, score) in surprising.iter() {
            // Cap on findings actually produced. Capping the input instead
            // (`take(max_findings)`) would let entities that are skipped
            // below use up slots and under-fill the result.
            if findings.len() >= self.max_findings {
                break;
            }

            // Find the function node for file/line info; entities without
            // one cannot be located in source and are skipped.
            let func = match functions.iter().find(|f| f.qn(i) == *qn) {
                Some(f) => f,
                None => continue,
            };
            let file_path = PathBuf::from(func.path(i));
            let line_start = Some(func.line_start);
            let line_end = Some(func.line_end);
            let func_name = func.node_name(i).to_string();

            // Build per-level detail string; surprising levels get a " *" marker.
            let mut level_detail = String::new();
            for ls in &score.level_scores {
                let marker = if ls.is_surprising { " *" } else { "" };
                level_detail.push_str(&format!(
                    "  {:<20} z={:.1}{}\n",
                    ls.level.label(),
                    ls.z_score,
                    marker
                ));
            }

            let description = format!(
                "Function `{}` is surprising at {} of 5 hierarchy levels:\n\n{}\n\
                 Compound surprise: {:.1} (precision-weighted)\n\
                 Concordance: {}/5 levels\n\n\
                 **Possible causes:**\n\
                 - AI-generated code with different style\n\
                 - Copy-pasted from a different codebase\n\
                 - Architectural misplacement\n\
                 - Unusual algorithm or potential bug",
                func_name,
                score.concordance,
                level_detail,
                score.compound_surprise,
                score.concordance,
            );

            // Build threshold_metadata with per-level info. Keys for the
            // per-level entries are the level label, lower-cased, with
            // spaces replaced by underscores, plus a `_z_score` suffix.
            let mut metadata = std::collections::BTreeMap::new();
            metadata.insert(
                "threshold_source".to_string(),
                "predictive-coding".to_string(),
            );
            metadata.insert("concordance".to_string(), score.concordance.to_string());
            metadata.insert(
                "compound_surprise".to_string(),
                format!("{:.2}", score.compound_surprise),
            );
            for ls in &score.level_scores {
                let key = format!(
                    "{}_z_score",
                    ls.level.label().replace(' ', "_").to_lowercase()
                );
                metadata.insert(key, format!("{:.2}", ls.z_score));
            }

            // Phase 1b dual-branch bridge: typed prediction reasons.
            // See `level_score_to_prediction_reason` for the bridge
            // logic and sign convention.
            let prediction_reasons: Vec<crate::dual_branch::PredictionReason> = score
                .level_scores
                .iter()
                .filter_map(level_score_to_prediction_reason)
                .collect();

            findings.push(Finding {
                id: String::new(),
                detector: "HierarchicalSurprisalDetector".to_string(),
                severity: score.severity,
                title: format!("Unusual code pattern in `{}`", func_name),
                description,
                affected_files: vec![file_path],
                line_start,
                line_end,
                suggested_fix: Some(
                    "Review this function for:\n\
                     1. Style consistency with the rest of the project\n\
                     2. Correctness — unusual patterns may indicate bugs\n\
                     3. Architectural fit — is this in the right module?"
                        .to_string(),
                ),
                estimated_effort: Some("15 minutes".to_string()),
                category: Some(self.category().to_string()),
                why_it_matters: Some(format!(
                    "This function's patterns are unusual at {} of 5 independent hierarchy levels \
                     (token, structural, dependency, relational, architectural). \
                     Multi-level concordance is a stronger signal than any single metric.",
                    score.concordance
                )),
                threshold_metadata: metadata,
                prediction_reasons,
                ..Default::default()
            });
        }

        Ok(findings)
    }
}

/// Converts one `predictive::LevelScore` into a typed
/// `dual_branch::PredictionReason` (Phase 1b dual-branch bridge).
///
/// Levels that are not flagged as surprising yield `None`, mirroring the
/// detector's own emission semantics: only surprising levels count as
/// evidence. Whether non-surprising levels should emit a reverse signal
/// (evidence *against* the finding) is left to Phase 1c.
///
/// # Sign convention
///
/// `PredictionReason::weight` is documented in `dual_branch.rs` as
/// "positive leans Benign, negative leans RealBug". Hierarchical surprisal
/// supports the finding being real, so the weight produced here is
/// **negative**.
///
/// # Magnitude
///
/// `weight = -clamp(z_score / 5.0, 0.0, 1.0)`:
///
/// - a z-score of 5.0 or more saturates at `-1.0`;
/// - a z-score at the usual surprisal threshold (~2.0) gives about `-0.4`.
///
/// The 5.0 normalizer follows the rough upper bound seen in
/// `predictive::compound`'s precision-weighted scores; evidence past 5σ is
/// treated as fully RealBug-leaning because it no longer changes the
/// prediction in a meaningful way.
fn level_score_to_prediction_reason(
    ls: &crate::predictive::LevelScore,
) -> Option<crate::dual_branch::PredictionReason> {
    match ls.is_surprising {
        false => None,
        true => {
            let label = ls.level.label();
            // Fraction of the saturation point reached, limited to [0, 1].
            let saturated = (ls.z_score / 5.0).clamp(0.0, 1.0);
            Some(crate::dual_branch::PredictionReason {
                kind: crate::dual_branch::PredictionReasonKind::HierarchicalLevel {
                    level_name: label.to_string(),
                    z_score: ls.z_score,
                },
                // Negated first, then narrowed: leans RealBug.
                weight: -saturated as f32,
                note: format!(
                    "{} z-score {:.2} exceeds surprisal threshold {:.2}.",
                    label, ls.z_score, ls.threshold,
                ),
            })
        }
    }
}

impl super::RegisteredDetector for HierarchicalSurprisalDetector {
    /// Factory used for detector registration. The init data is not
    /// consulted; the detector is always built with `Self::new()`.
    fn create(_init: &super::DetectorInit) -> std::sync::Arc<dyn Detector> {
        let detector = Self::new();
        std::sync::Arc::new(detector)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::detectors::file_provider::MockFileProvider;
    use crate::graph::builder::GraphBuilder;

    /// The identifier and category strings are part of the detector's contract.
    #[test]
    fn test_detector_name_and_category() {
        let under_test = HierarchicalSurprisalDetector::new();
        let (name, category) = (under_test.name(), under_test.category());
        assert_eq!(name, "hierarchical-surprisal");
        assert_eq!(category, "predictive-coding");
    }

    /// Running against an empty graph must succeed and report nothing.
    #[test]
    fn test_detector_empty_graph_no_crash() {
        let store = GraphBuilder::new().freeze();
        let _files = MockFileProvider::new(vec![]);
        let ctx = crate::detectors::analysis_context::AnalysisContext::test(&store);
        let findings = HierarchicalSurprisalDetector::new()
            .detect(&ctx)
            .expect("detection should succeed");
        assert!(findings.is_empty());
    }

    // ── Phase 1b dual-branch bridge tests ──

    /// A level that is not surprising carries no evidence for the
    /// finding, so the bridge emits nothing for it — the same emission
    /// semantics the detector already uses.
    #[test]
    fn bridge_skips_non_surprising_levels() {
        let calm = crate::predictive::LevelScore {
            level: crate::predictive::Level::Token,
            z_score: 1.5,
            threshold: 2.0,
            is_surprising: false,
        };
        let bridged = level_score_to_prediction_reason(&calm);
        assert!(bridged.is_none());
    }

    /// Surprisal supports the RealBug branch, which dual_branch.rs
    /// encodes as a NEGATIVE weight; the magnitude is z / 5 clamped
    /// to [0, 1].
    #[test]
    fn bridge_surprising_level_produces_negative_weight() {
        let flagged = crate::predictive::LevelScore {
            level: crate::predictive::Level::Architectural,
            z_score: 3.0,
            threshold: 2.0,
            is_surprising: true,
        };
        let reason =
            level_score_to_prediction_reason(&flagged).expect("surprising level should emit");
        assert!(
            reason.weight < 0.0,
            "surprisal evidence must lean RealBug (negative weight); got {}",
            reason.weight
        );
        // z = 3.0 → 3.0 / 5.0 = 0.6, so weight + 0.6 should vanish.
        let deviation = (reason.weight + 0.6).abs();
        assert!(
            deviation < 1e-6,
            "weight magnitude must be -clamp(z/5, 0, 1) = -0.6; got {}",
            reason.weight
        );
    }

    /// Z-scores beyond 5 must saturate at -1.0 instead of leaving the
    /// [-1, 1] range, as described in the bridge function's Magnitude
    /// section.
    #[test]
    fn bridge_weight_saturates_at_minus_one() {
        let extreme = crate::predictive::LevelScore {
            level: crate::predictive::Level::Token,
            z_score: 12.0,
            threshold: 2.0,
            is_surprising: true,
        };
        let reason = level_score_to_prediction_reason(&extreme).expect("surprising");
        let deviation = (reason.weight + 1.0).abs();
        assert!(
            deviation < 1e-6,
            "z=12 should saturate to weight=-1.0; got {}",
            reason.weight
        );
    }

    /// Phase 1a's `prediction_reason_hierarchical_level_matches_
    /// predictive_label` test fixed the convention `level_name ==
    /// Level::label()`. The bridge is the only producer of those values,
    /// so every level is checked against the convention here.
    #[test]
    fn bridge_kind_uses_predictive_level_label() {
        let every_level = [
            crate::predictive::Level::Token,
            crate::predictive::Level::Structural,
            crate::predictive::Level::DependencyChain,
            crate::predictive::Level::Relational,
            crate::predictive::Level::Architectural,
        ];
        for level in every_level {
            let flagged = crate::predictive::LevelScore {
                level,
                z_score: 3.0,
                threshold: 2.0,
                is_surprising: true,
            };
            let reason = level_score_to_prediction_reason(&flagged).expect("surprising");
            match reason.kind {
                crate::dual_branch::PredictionReasonKind::HierarchicalLevel {
                    level_name,
                    z_score,
                } => {
                    assert_eq!(
                        level_name,
                        level.label(),
                        "bridge must populate level_name from Level::label()"
                    );
                    let drift = (z_score - 3.0).abs();
                    assert!(drift < 1e-6, "z_score must roundtrip; got {z_score}");
                }
                other => panic!("expected HierarchicalLevel, got {other:?}"),
            }
        }
    }
}