// repotoire 0.9.0

//! Feedback collection for training data
//!
//! Collects user feedback on findings to build training data.
//! Stores labeled examples in JSONL format.

use crate::dual_branch::{BranchLabel, PredictionReason, PredictionReasonKind};
use crate::models::{Finding, Severity};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::io::{BufRead, BufReader, Write};
use std::path::{Path, PathBuf};

/// A labeled training example
///
/// One value corresponds to one line of the feedback JSONL file: a
/// snapshot of selected `Finding` fields plus the user's verdict.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LabeledFinding {
    /// Finding ID
    pub finding_id: String,
    /// Detector name
    pub detector: String,
    /// Severity level. On read, the stored string is routed through
    /// `deserialize_severity_compat`, which parses it with `Severity`'s
    /// `FromStr` implementation.
    #[serde(deserialize_with = "deserialize_severity_compat")]
    pub severity: Severity,
    /// Title
    pub title: String,
    /// Description (`from_finding` keeps at most the first 500 characters)
    pub description: String,
    /// Affected file path (`from_finding` stores the first affected file,
    /// or an empty string when the finding lists none)
    pub file_path: String,
    /// Line number
    pub line_start: Option<u32>,
    /// Whether user marked as true positive
    pub is_true_positive: bool,
    /// Optional reason from user
    pub reason: Option<String>,
    /// Timestamp (`from_finding` writes the current UTC time in RFC 3339
    /// form)
    pub timestamp: String,

    // ── Dual-branch context (Phase 3 prep) ──
    //
    // These five fields capture the predictor's state at labeling time so
    // Phase 3 (predictor tuning) can distinguish predictor mispredictions
    // from detector-level FPs. All five carry `#[serde(default)]` so
    // pre-Phase-3 JSONL entries still parse. The `Option` / `Vec` fields
    // additionally skip serialization when empty so single-branch findings
    // do not bloat the log; `had_alternative_branch` is always written.
    /// True iff finding had a dual-branch interpretation at the time of
    /// labeling (i.e. `Finding.alternative_branch.is_some()`). Phase 3
    /// uses this to partition mispredictions from single-branch FPs.
    #[serde(default)]
    pub had_alternative_branch: bool,
    /// The predictor's choice: which branch was carried as primary.
    /// `"real_bug"` / `"benign"` for dual-branch findings, `None` for
    /// single-branch.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub predicted_label: Option<String>,
    /// The alternative interpretation's severity (the side NOT picked).
    /// `None` for single-branch.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub alternative_severity: Option<Severity>,
    /// Typed evidence codes the predictor used (variant discriminant
    /// names, e.g. `"EnclosingScope"`, `"GraphFlow"`). Phase 3 will
    /// weight these against labels.
    ///
    /// Stored as `Vec<String>` rather than the full `PredictionReason`
    /// enum because:
    ///
    /// 1. The full enum carries free-form bound values (scope names,
    ///    file paths, etc.) that bloat the JSONL and complicate offline
    ///    aggregation; Phase 3 tuning needs aggregate counts per reason
    ///    kind, not the bound values.
    /// 2. Forward-compat: new reason kinds will be added in Phase 3+.
    ///    The string representation keeps old `repotoire feedback`
    ///    exports parseable.
    ///
    /// If Phase 3 later wants the bound values, they're recoverable
    /// from the cached `last_findings.json` keyed by `finding_id`.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub prediction_reason_kinds: Vec<String>,
    /// Severity before any collapse / remap; preserved so Phase 3 can
    /// detect cases where collapse made the wrong call.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub original_severity: Option<Severity>,
}

/// Return the bare variant name of a prediction reason's kind.
///
/// Any payload carried by a struct-like variant is ignored; only the
/// discriminant is reported. The `match` deliberately has no `_` arm,
/// so introducing a new `PredictionReasonKind` variant stops this
/// function from compiling until a name is assigned to it here. That
/// compile error is the intended reminder to keep the two in sync.
pub(crate) fn reason_kind(r: &PredictionReason) -> &'static str {
    // No arm binds a payload, so matching on the place itself neither
    // moves nor copies anything out of `r`.
    match r.kind {
        // Variants without a payload.
        PredictionReasonKind::BundledCode => "BundledCode",
        PredictionReasonKind::NonProductionPath => "NonProductionPath",
        PredictionReasonKind::TestFixtureFile => "TestFixtureFile",
        // Variants with a payload (contents are not inspected).
        PredictionReasonKind::MultiDetectorAgreement { .. } => "MultiDetectorAgreement",
        PredictionReasonKind::HierarchicalLevel { .. } => "HierarchicalLevel",
        PredictionReasonKind::KeywordArgument { .. } => "KeywordArgument",
        PredictionReasonKind::FirstArgIdentifier { .. } => "FirstArgIdentifier",
        PredictionReasonKind::EnclosingScope { .. } => "EnclosingScope",
        PredictionReasonKind::ImportPresence { .. } => "ImportPresence",
        PredictionReasonKind::FilePath { .. } => "FilePath",
        PredictionReasonKind::StructuralPattern { .. } => "StructuralPattern",
        PredictionReasonKind::Custom { .. } => "Custom",
    }
}

/// Derive the predicted-primary label string from the label stored on
/// the alternative branch.
///
/// A finding's alternative branch holds the interpretation the
/// predictor did NOT pick, so the primary label is obtained by
/// inverting it with `BranchLabel::opposite()`: an alternative of
/// `RealBug` yields `"benign"`, and an alternative of `Benign` yields
/// `"real_bug"`.
pub(crate) fn predicted_label_from_alt(alt_label: BranchLabel) -> &'static str {
    let primary = alt_label.opposite();
    match primary {
        BranchLabel::Benign => "benign",
        BranchLabel::RealBug => "real_bug",
    }
}

impl LabeledFinding {
    /// Snapshot `finding` together with the user's verdict.
    ///
    /// * `is_tp` - `true` when the user judged the finding a true positive.
    /// * `reason` - optional free-text justification, stored as given.
    ///
    /// The description is cut to its first 500 characters, the file path
    /// is the first entry of `affected_files` (empty string when there is
    /// none), and the timestamp is the current UTC time in RFC 3339 form.
    /// The dual-branch fields are filled from `alternative_branch`,
    /// `prediction_reasons` and `original_severity`; for a finding without
    /// an alternative branch the flag is `false` and both derived options
    /// are `None`.
    pub fn from_finding(finding: &Finding, is_tp: bool, reason: Option<String>) -> Self {
        // Borrow the alternative branch once; `Option<&_>` is `Copy`, so
        // it can feed each derived field below.
        let alt = finding.alternative_branch.as_ref();

        let file_path = match finding.affected_files.first() {
            Some(path) => path.to_string_lossy().into_owned(),
            None => String::new(),
        };

        // Only the variant names are kept, not the values they carry.
        let prediction_reason_kinds: Vec<String> = finding
            .prediction_reasons
            .iter()
            .map(|r| reason_kind(r).to_owned())
            .collect();

        Self {
            finding_id: finding.id.clone(),
            detector: finding.detector.clone(),
            severity: finding.severity,
            title: finding.title.clone(),
            // Counted in `char`s, so a multi-byte character is never split.
            description: finding.description.chars().take(500).collect(),
            file_path,
            line_start: finding.line_start,
            is_true_positive: is_tp,
            reason,
            timestamp: chrono::Utc::now().to_rfc3339(),
            had_alternative_branch: alt.is_some(),
            predicted_label: alt.map(|branch| predicted_label_from_alt(branch.label).to_owned()),
            alternative_severity: alt.map(|branch| branch.severity),
            prediction_reason_kinds,
            original_severity: finding.original_severity,
        }
    }
}

/// Feedback collector - stores labeled examples
///
/// Labeled findings are appended to, and read back from, a single JSONL
/// file (one JSON object per line) located at `data_path`.
pub struct FeedbackCollector {
    /// Location of the JSONL file that `record` appends to and
    /// `load_all` reads from.
    data_path: PathBuf,
}

impl FeedbackCollector {
    /// Create a collector that uses the default data file:
    /// `<data dir>/repotoire/training_data.jsonl`, where `<data dir>` is
    /// whatever `dirs::data_dir()` reports. When no data directory can be
    /// determined, the path is built relative to `.` instead.
    pub fn new() -> Self {
        let data_path = dirs::data_dir()
            .unwrap_or_else(|| PathBuf::from("."))
            .join("repotoire")
            .join("training_data.jsonl");

        Self { data_path }
    }

    /// Create a collector that stores its data at `path`.
    pub fn with_path(path: impl Into<PathBuf>) -> Self {
        Self {
            data_path: path.into(),
        }
    }

    /// Append one labeled finding to the data file.
    ///
    /// The parent directory and the file itself are created on demand.
    ///
    /// # Errors
    ///
    /// Returns an error when the directory cannot be created, the record
    /// cannot be serialized, or the file cannot be opened or written.
    pub fn record(
        &self,
        finding: &Finding,
        is_tp: bool,
        reason: Option<String>,
    ) -> std::io::Result<()> {
        // Ensure directory exists
        if let Some(parent) = self.data_path.parent() {
            std::fs::create_dir_all(parent)?;
        }

        let labeled = LabeledFinding::from_finding(finding, is_tp, reason);

        // Assemble the full line (payload plus newline) before touching the
        // file. `writeln!` on an unbuffered `File` emits the payload and the
        // newline as separate writes, which lets another appender slip in
        // between them and fuse two records into one unparseable line.
        let mut line = serde_json::to_string(&labeled)?;
        line.push('\n');

        let mut file = OpenOptions::new()
            .create(true)
            .append(true)
            .open(&self.data_path)?;

        file.write_all(line.as_bytes())
    }

    /// Record every finding in `findings` with the same label and no
    /// reason, returning how many were written.
    ///
    /// # Errors
    ///
    /// Stops at the first failing [`record`](Self::record) call and
    /// returns its error; findings recorded before the failure stay in
    /// the file.
    pub fn record_batch(&self, findings: &[Finding], is_tp: bool) -> std::io::Result<usize> {
        for finding in findings {
            self.record(finding, is_tp, None)?;
        }
        Ok(findings.len())
    }

    /// Load all labeled examples, in file order.
    ///
    /// A missing data file yields an empty list. Blank lines are ignored
    /// and lines that do not parse as a `LabeledFinding` are skipped
    /// without being reported.
    ///
    /// # Errors
    ///
    /// Returns an error when the file exists but cannot be opened, or
    /// when reading a line fails.
    pub fn load_all(&self) -> std::io::Result<Vec<LabeledFinding>> {
        // Open directly rather than probing with `exists()` first: that
        // avoids a check-then-open race, and only a genuinely absent file
        // is treated as "no data yet" (other open failures are surfaced).
        let file = match File::open(&self.data_path) {
            Ok(file) => file,
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()),
            Err(err) => return Err(err),
        };
        let reader = BufReader::new(file);

        let mut examples = Vec::new();
        for line in reader.lines() {
            let line = line?;
            if line.trim().is_empty() {
                continue;
            }
            // Deliberately best-effort: a line that fails to parse must not
            // make the rest of the data unusable.
            if let Ok(labeled) = serde_json::from_str::<LabeledFinding>(&line) {
                examples.push(labeled);
            }
        }

        Ok(examples)
    }

    /// Build a label map: finding_id → is_true_positive.
    /// Last entry wins (supports re-labeling). Unparseable lines are
    /// silently skipped (matching `load_all()` behavior).
    ///
    /// If loading fails, the error is logged as a warning and an empty
    /// map is returned.
    pub fn load_label_map(&self) -> HashMap<String, bool> {
        let entries = match self.load_all() {
            Ok(v) => v,
            Err(e) => {
                tracing::warn!("Failed to load feedback labels: {}", e);
                return HashMap::new();
            }
        };

        // Entries arrive in file order, so a later insert for the same id
        // overwrites the earlier label.
        let mut map = HashMap::new();
        for entry in entries {
            map.insert(entry.finding_id, entry.is_true_positive);
        }
        map
    }

    /// Get training statistics: overall totals plus per-detector
    /// `(true positive, false positive)` tallies.
    ///
    /// # Errors
    ///
    /// Propagates any error from [`load_all`](Self::load_all).
    pub fn stats(&self) -> std::io::Result<TrainingStats> {
        let examples = self.load_all()?;
        let total = examples.len();

        // Single pass; consuming the examples lets each detector name be
        // moved into the map instead of cloned.
        let mut true_positives = 0usize;
        let mut by_detector: HashMap<String, (usize, usize)> = HashMap::new();
        for example in examples {
            let is_tp = example.is_true_positive;
            let tally = by_detector.entry(example.detector).or_default();
            if is_tp {
                tally.0 += 1;
                true_positives += 1;
            } else {
                tally.1 += 1;
            }
        }

        Ok(TrainingStats {
            total,
            true_positives,
            // Every example is either a true or a false positive.
            false_positives: total - true_positives,
            by_detector,
        })
    }

    /// Path to the data file
    pub fn data_path(&self) -> &Path {
        &self.data_path
    }
}

impl Default for FeedbackCollector {
    /// Same as [`FeedbackCollector::new`]: a collector that uses the
    /// default data path.
    fn default() -> Self {
        Self::new()
    }
}

/// Training data statistics
///
/// Produced by `FeedbackCollector::stats`; the `Display` impl renders a
/// short human-readable report.
#[derive(Debug)]
pub struct TrainingStats {
    /// Number of labeled examples that were loaded.
    pub total: usize,
    /// Examples the user marked as true positives.
    pub true_positives: usize,
    /// Examples the user marked as false positives.
    pub false_positives: usize,
    /// Per-detector tallies keyed by detector name; each value is
    /// `(true positives, false positives)`.
    pub by_detector: std::collections::HashMap<String, (usize, usize)>,
}

impl std::fmt::Display for TrainingStats {
    /// Render a multi-line report: the overall totals with percentages,
    /// followed by at most ten detectors ordered by number of labeled
    /// examples (largest first).
    ///
    /// Detectors with the same number of examples are listed in
    /// ascending name order, so the output (including which detectors
    /// make the ten-line cut) is the same on every run regardless of the
    /// map's iteration order.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Share of `total`, as a percentage; 0.0 when there are no examples
        // so an empty data set never divides by zero.
        let percent = |count: usize| -> f64 {
            if self.total > 0 {
                count as f64 / self.total as f64 * 100.0
            } else {
                0.0
            }
        };

        writeln!(f, "Training Data Statistics:")?;
        writeln!(f, "  Total examples: {}", self.total)?;
        writeln!(
            f,
            "  True positives: {} ({:.1}%)",
            self.true_positives,
            percent(self.true_positives)
        )?;
        writeln!(
            f,
            "  False positives: {} ({:.1}%)",
            self.false_positives,
            percent(self.false_positives)
        )?;
        writeln!(f, "\n  By detector:")?;

        let mut detectors: Vec<(&String, &(usize, usize))> = self.by_detector.iter().collect();
        // Busiest detector first; the name tie-break makes the order total,
        // which is also why an unstable sort is safe here.
        detectors.sort_unstable_by(|a, b| {
            let total_a = a.1 .0 + a.1 .1;
            let total_b = b.1 .0 + b.1 .1;
            total_b.cmp(&total_a).then_with(|| a.0.cmp(b.0))
        });

        for (detector, (tp, fp)) in detectors.into_iter().take(10) {
            writeln!(f, "    {}: {} TP, {} FP", detector, tp, fp)?;
        }

        Ok(())
    }
}

/// Deserialize Severity from both old format ("High", Debug) and new format ("high", Display/serde).
fn deserialize_severity_compat<'de, D>(deserializer: D) -> Result<Severity, D::Error>
where
    D: serde::Deserializer<'de>,
{
    let s = String::deserialize(deserializer)?;
    s.parse::<Severity>().map_err(serde::de::Error::custom)
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Two records written for the same finding come back from
    /// `load_all` in write order with their respective labels.
    #[test]
    fn test_record_and_load() {
        let dir = TempDir::new().expect("create temp dir");
        let path = dir.path().join("test_feedback.jsonl");
        let collector = FeedbackCollector::with_path(&path);

        let finding = Finding {
            id: "test-123".into(),
            detector: "TestDetector".into(),
            severity: crate::models::Severity::High,
            title: "Test finding".into(),
            description: "A test finding for testing".into(),
            ..Default::default()
        };

        collector
            .record(&finding, true, Some("Real issue".into()))
            .expect("record true positive");
        collector
            .record(&finding, false, Some("Not a problem".into()))
            .expect("record false positive");

        let loaded = collector.load_all().expect("load feedback records");
        assert_eq!(loaded.len(), 2);
        assert!(loaded[0].is_true_positive);
        assert!(!loaded[1].is_true_positive);
    }

    /// Re-labeling the same finding id keeps a single map entry holding
    /// the most recent label.
    #[test]
    fn test_load_label_map_last_writer_wins() {
        let dir = TempDir::new().expect("create temp dir");
        let path = dir.path().join("test_labels.jsonl");
        let collector = FeedbackCollector::with_path(&path);

        let finding = Finding {
            id: "abc-123".into(),
            detector: "TestDetector".into(),
            severity: crate::models::Severity::High,
            title: "Test".into(),
            ..Default::default()
        };

        // Label as TP first, then re-label as FP
        collector.record(&finding, true, None).unwrap();
        collector
            .record(&finding, false, Some("Actually not a bug".into()))
            .unwrap();

        let map = collector.load_label_map();
        assert_eq!(map.len(), 1);
        assert_eq!(
            map.get("abc-123"),
            Some(&false),
            "Last entry (FP) should win"
        );
    }

    /// A data file that does not exist yields an empty label map. (Despite
    /// the test name, the file is never created, so this covers the
    /// "missing file" case rather than a zero-length file.)
    #[test]
    fn test_load_label_map_empty_file() {
        let dir = TempDir::new().expect("create temp dir");
        let path = dir.path().join("nonexistent.jsonl");
        let collector = FeedbackCollector::with_path(&path);

        let map = collector.load_label_map();
        assert!(map.is_empty());
    }

    // ─────────────────────────────────────────────────────────────────
    // Phase 3 prep: dual-branch context capture
    // ─────────────────────────────────────────────────────────────────

    use crate::dual_branch::{
        AlternativeBranch, BranchLabel, PredictionReason, PredictionReasonKind,
    };

    /// Wrap a reason kind in a `PredictionReason` with zero weight and an
    /// empty note; the tests below only look at the kind.
    fn make_reason(kind: PredictionReasonKind) -> PredictionReason {
        PredictionReason {
            kind,
            weight: 0.0,
            note: String::new(),
        }
    }

    /// All five dual-branch fields survive a write to and read from the
    /// JSONL file with the expected values.
    #[test]
    fn dual_branch_fields_roundtrip_through_jsonl() {
        let dir = TempDir::new().expect("create temp dir");
        let path = dir.path().join("dual.jsonl");
        let collector = FeedbackCollector::with_path(&path);

        let finding = Finding {
            id: "dual-1".into(),
            detector: "JwtWeakDetector".into(),
            severity: Severity::High,
            title: "Test dual-branch".into(),
            description: "Test".into(),
            alternative_branch: Some(AlternativeBranch {
                label: BranchLabel::Benign,
                severity: Severity::Info,
                title: "Hardened JWT".into(),
                description: "Algorithms allowlist present".into(),
                suggested_fix: None,
            }),
            prediction_reasons: vec![
                make_reason(PredictionReasonKind::EnclosingScope {
                    scope_kind: "function".into(),
                    name: "decode_token".into(),
                }),
                make_reason(PredictionReasonKind::KeywordArgument {
                    name: "algorithms".into(),
                    value: "[\"RS256\"]".into(),
                }),
            ],
            original_severity: Some(Severity::Critical),
            ..Default::default()
        };

        collector
            .record(&finding, false, Some("predictor mistake".into()))
            .expect("record");

        let loaded = collector.load_all().expect("load");
        assert_eq!(loaded.len(), 1);
        let row = &loaded[0];

        assert!(row.had_alternative_branch, "dual-branch flag set");
        // Alternative is Benign → predicted primary is RealBug.
        assert_eq!(row.predicted_label.as_deref(), Some("real_bug"));
        assert_eq!(row.alternative_severity, Some(Severity::Info));
        // Reason kinds keep the order of `prediction_reasons`.
        assert_eq!(
            row.prediction_reason_kinds,
            vec!["EnclosingScope".to_string(), "KeywordArgument".to_string()]
        );
        assert_eq!(row.original_severity, Some(Severity::Critical));
    }

    /// An alternative labelled `RealBug` means the predicted primary is
    /// recorded as `"benign"`.
    #[test]
    fn predicted_label_inverts_alt_realbug_to_benign() {
        let finding = Finding {
            id: "x".into(),
            detector: "D".into(),
            severity: Severity::Info,
            title: "t".into(),
            description: String::new(),
            alternative_branch: Some(AlternativeBranch {
                label: BranchLabel::RealBug,
                severity: Severity::Critical,
                title: "alt".into(),
                description: String::new(),
                suggested_fix: None,
            }),
            ..Default::default()
        };
        let labeled = LabeledFinding::from_finding(&finding, true, None);
        // When alt is RealBug, primary is Benign.
        assert_eq!(labeled.predicted_label.as_deref(), Some("benign"));
    }

    /// An alternative labelled `Benign` means the predicted primary is
    /// recorded as `"real_bug"`.
    #[test]
    fn predicted_label_inverts_alt_benign_to_realbug() {
        let finding = Finding {
            id: "x".into(),
            detector: "D".into(),
            severity: Severity::High,
            title: "t".into(),
            description: String::new(),
            alternative_branch: Some(AlternativeBranch {
                label: BranchLabel::Benign,
                severity: Severity::Info,
                title: "alt".into(),
                description: String::new(),
                suggested_fix: None,
            }),
            ..Default::default()
        };
        let labeled = LabeledFinding::from_finding(&finding, true, None);
        // When alt is Benign, primary is RealBug.
        assert_eq!(labeled.predicted_label.as_deref(), Some("real_bug"));
    }

    /// A finding built without an alternative branch, prediction reasons,
    /// or original severity leaves every dual-branch field at its empty
    /// value.
    #[test]
    fn single_branch_finding_has_no_predicted_label() {
        let finding = Finding {
            id: "x".into(),
            detector: "D".into(),
            severity: Severity::High,
            title: "t".into(),
            description: String::new(),
            ..Default::default()
        };
        let labeled = LabeledFinding::from_finding(&finding, false, None);
        assert!(!labeled.had_alternative_branch);
        assert!(labeled.predicted_label.is_none());
        assert!(labeled.alternative_severity.is_none());
        assert!(labeled.prediction_reason_kinds.is_empty());
        assert!(labeled.original_severity.is_none());
    }

    /// Coverage test for `reason_kind`: constructs ONE reason per
    /// `PredictionReasonKind` variant listed below and asserts that
    /// each maps to a discriminant string and that the count of
    /// distinct strings equals the count of variants. The `sort()` +
    /// `dedup()` step catches a copy-paste mismatch where two variants
    /// accidentally share a discriminant string.
    ///
    /// NOTE: the variant list here is maintained by hand, so this test
    /// does not by itself notice a newly added variant. That is
    /// enforced at compile time by the wildcard-free `match` in
    /// `reason_kind`. When adding a variant, extend this list too so
    /// the uniqueness check covers it.
    #[test]
    fn reason_kind_covers_every_variant_exhaustively() {
        let all_variants: Vec<PredictionReasonKind> = vec![
            PredictionReasonKind::BundledCode,
            PredictionReasonKind::NonProductionPath,
            PredictionReasonKind::MultiDetectorAgreement { count: 2 },
            PredictionReasonKind::TestFixtureFile,
            PredictionReasonKind::HierarchicalLevel {
                level_name: "L1 Token".into(),
                z_score: 0.0,
            },
            PredictionReasonKind::KeywordArgument {
                name: "verify".into(),
                value: "False".into(),
            },
            PredictionReasonKind::FirstArgIdentifier {
                name: "password".into(),
            },
            PredictionReasonKind::EnclosingScope {
                scope_kind: "function".into(),
                name: "f".into(),
            },
            PredictionReasonKind::ImportPresence {
                module: "jwt".into(),
            },
            PredictionReasonKind::FilePath {
                hint: "/scripts".into(),
            },
            PredictionReasonKind::StructuralPattern {
                description: "x[:N]".into(),
            },
            PredictionReasonKind::Custom {
                description: "legacy".into(),
            },
        ];

        let reasons: Vec<PredictionReason> = all_variants.into_iter().map(make_reason).collect();
        let n_variants = reasons.len();

        let finding = Finding {
            id: "x".into(),
            detector: "D".into(),
            severity: Severity::Info,
            title: "t".into(),
            description: String::new(),
            prediction_reasons: reasons,
            ..Default::default()
        };
        let labeled = LabeledFinding::from_finding(&finding, true, None);
        assert_eq!(
            labeled.prediction_reason_kinds.len(),
            n_variants,
            "every variant should map to a discriminant string"
        );

        // No two discriminant strings collide (catches copy-paste in
        // `reason_kind`'s `match` arms).
        let mut sorted = labeled.prediction_reason_kinds.clone();
        sorted.sort();
        sorted.dedup();
        assert_eq!(
            sorted.len(),
            n_variants,
            "discriminant strings should be unique per variant; got {:?}",
            labeled.prediction_reason_kinds
        );
    }

    /// Compatibility with existing data: old JSONL entries written
    /// before the dual-branch fields existed must deserialize cleanly
    /// (treating absent fields as defaults). Pin this so a future
    /// `#[serde(deny_unknown_fields)]` or similar regression is caught.
    #[test]
    fn legacy_jsonl_without_dual_branch_fields_deserializes() {
        let legacy_json = r#"{
            "finding_id": "old-1",
            "detector": "TestDetector",
            "severity": "high",
            "title": "Old finding",
            "description": "From before Phase 3 prep",
            "file_path": "/tmp/x.py",
            "line_start": 10,
            "is_true_positive": true,
            "reason": null,
            "timestamp": "2026-01-01T00:00:00Z"
        }"#;
        let parsed: LabeledFinding =
            serde_json::from_str(legacy_json).expect("legacy entry must parse");
        assert!(!parsed.had_alternative_branch);
        assert!(parsed.predicted_label.is_none());
        assert!(parsed.alternative_severity.is_none());
        assert!(parsed.prediction_reason_kinds.is_empty());
        assert!(parsed.original_severity.is_none());
    }
}