datasynth-eval 5.34.0

Evaluation framework for synthetic financial data quality and coherence
Documentation
//! Detectability profile of a planted-anomaly population (engine feature A6).
//!
//! Each planted anomaly carries an *observability class* (A4): which detection arm can, in
//! principle, surface it — per-JE **density**, relational **account-flow graph**, **temporal**, or
//! cross-period **memory-only**. This analyzer summarizes how the planted population distributes
//! across those arms, so a benchmark can report detection against the *per-signal ceiling* rather
//! than one pooled score, and can surface the residual-faint **memory-only** tail — the family that
//! defeats label-free residual detectors and motivates carry-forward memory (FINDINGS §12 / §40).
//!
//! Scope: this is the observability *ceiling* map computed from the ground-truth labels. The actual
//! per-class detection ROC requires running the inverse-audit detector (the AuditDetector library /
//! showcase); this report tells you what is *in principle* catchable by which arm.

use crate::error::EvalResult;
use datasynth_core::models::{LabeledAnomaly, ObservabilityClass};
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;

/// The observability profile of a planted-anomaly population.
///
/// Produced by [`DetectabilityAnalyzer::analyze`]. Map keys for observability classes are the
/// strings returned by `ObservabilityClass::as_str` (e.g. `"memory_only"`). Every fraction is a
/// count divided by `max(total, 1)`, so an empty population yields `0.0` rather than NaN.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DetectabilityReport {
    /// Total planted anomalies considered (the length of the analyzed label slice).
    pub total: usize,
    /// Count per observability class (all four classes always present; 0 when absent).
    pub by_class: BTreeMap<String, usize>,
    /// Fraction of the population per observability class; has the same keys as `by_class`.
    pub fraction_by_class: BTreeMap<String, f64>,
    /// Fraction observable only with cross-period / cross-entity memory — the residual-faint tail
    /// that label-free residual arms miss (FINDINGS §40-41).
    pub memory_only_fraction: f64,
    /// Fraction observable by a single-entry per-JE density residual.
    pub per_je_density_fraction: f64,
    /// Fraction observable in the relational account-flow graph.
    pub relational_graph_fraction: f64,
    /// Fraction observable only in the cross-period time series.
    pub temporal_fraction: f64,
    /// Counts cross-tabulated by observability class → anomaly category (Fraud/Error/…).
    ///
    /// Unlike `by_class`, this map is not pre-seeded: an observability class appears as an outer
    /// key only if at least one analyzed label belongs to it, so it is empty for an empty
    /// population.
    pub by_class_and_category: BTreeMap<String, BTreeMap<String, usize>>,
}

/// Computes the [`DetectabilityReport`] for a population of labeled anomalies.
///
/// This is a field-less unit struct: it carries no configuration or state, and can be obtained
/// through either [`DetectabilityAnalyzer::new`] or `Default`.
#[derive(Debug, Clone, Default)]
pub struct DetectabilityAnalyzer;

impl DetectabilityAnalyzer {
    /// Creates an analyzer. The type holds no state, so all instances behave the same.
    pub fn new() -> Self {
        Self
    }

    /// Builds the [`DetectabilityReport`] describing how `labels` spread across the
    /// observability classes.
    ///
    /// Each label is tallied once under the string form of its observability class and once in
    /// the class → category cross-tab. The per-class count map is pre-populated with all four
    /// classes at zero; the cross-tab only contains classes that actually occur in `labels`.
    /// Fractions divide by `max(labels.len(), 1)`, so an empty slice gives all-zero fractions.
    ///
    /// # Errors
    ///
    /// This implementation has no failing path and always returns `Ok`.
    pub fn analyze(&self, labels: &[LabeledAnomaly]) -> EvalResult<DetectabilityReport> {
        let total = labels.len();

        // Every class starts at zero so the count map has the same keys for any population.
        let mut by_class: BTreeMap<String, usize> = [
            ObservabilityClass::PerJeDensity,
            ObservabilityClass::RelationalGraph,
            ObservabilityClass::Temporal,
            ObservabilityClass::MemoryOnly,
        ]
        .iter()
        .map(|class| (class.as_str().to_string(), 0))
        .collect();
        let mut by_class_and_category: BTreeMap<String, BTreeMap<String, usize>> = BTreeMap::new();

        for anomaly in labels {
            let class_key = anomaly.observability.as_str();
            let category_key = anomaly.anomaly_type.category().to_string();

            *by_class.entry(class_key.to_string()).or_default() += 1;
            *by_class_and_category
                .entry(class_key.to_string())
                .or_default()
                .entry(category_key)
                .or_default() += 1;
        }

        // Guard the denominator: an empty population divides by 1 and reports 0.0 everywhere.
        let divisor = if total == 0 { 1.0 } else { total as f64 };

        let mut fraction_by_class: BTreeMap<String, f64> = BTreeMap::new();
        for (class, count) in &by_class {
            fraction_by_class.insert(class.clone(), *count as f64 / divisor);
        }

        let share_of = |class: ObservabilityClass| {
            let count = by_class.get(class.as_str()).copied().unwrap_or(0);
            count as f64 / divisor
        };
        let memory_only_fraction = share_of(ObservabilityClass::MemoryOnly);
        let per_je_density_fraction = share_of(ObservabilityClass::PerJeDensity);
        let relational_graph_fraction = share_of(ObservabilityClass::RelationalGraph);
        let temporal_fraction = share_of(ObservabilityClass::Temporal);

        Ok(DetectabilityReport {
            total,
            by_class,
            fraction_by_class,
            memory_only_fraction,
            per_je_density_fraction,
            relational_graph_fraction,
            temporal_fraction,
            by_class_and_category,
        })
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;
    use datasynth_core::models::{AnomalyType, ErrorType, FraudType, RelationalAnomalyType};

    /// Builds a minimal labeled anomaly of the given type; every other field is a fixed
    /// placeholder.
    fn planted(anomaly_type: AnomalyType) -> LabeledAnomaly {
        let date = NaiveDate::from_ymd_opt(2026, 1, 1).unwrap();
        LabeledAnomaly::new(
            "A".to_string(),
            anomaly_type,
            "JE".to_string(),
            "JE".to_string(),
            "1000".to_string(),
            date,
        )
    }

    #[test]
    fn profiles_population_across_observability_arms() {
        // One anomaly per observability arm, in the order:
        // per_je_density, memory_only, relational_graph, temporal.
        let labels: Vec<LabeledAnomaly> = vec![
            AnomalyType::Fraud(FraudType::RoundDollarManipulation),
            AnomalyType::Fraud(FraudType::DuplicatePayment),
            AnomalyType::Relational(RelationalAnomalyType::CircularTransaction),
            AnomalyType::Error(ErrorType::WrongPeriod),
        ]
        .into_iter()
        .map(planted)
        .collect();

        let report = DetectabilityAnalyzer::new().analyze(&labels).unwrap();

        assert_eq!(report.total, 4);
        for class in ["per_je_density", "memory_only", "relational_graph", "temporal"] {
            assert_eq!(report.by_class[class], 1, "unexpected count for class {class}");
        }
        assert!((report.memory_only_fraction - 0.25).abs() < 1e-9);
        assert!((report.per_je_density_fraction - 0.25).abs() < 1e-9);
        // The cross-tab files the memory-only entry under the Fraud category.
        assert_eq!(report.by_class_and_category["memory_only"]["Fraud"], 1);
    }

    #[test]
    fn empty_population_has_stable_zeroed_schema() {
        let analyzer = DetectabilityAnalyzer::new();
        let report = analyzer.analyze(&[]).unwrap();

        assert_eq!(report.total, 0);
        // The four classes are still listed, each at zero, and the fraction is 0.0 rather than
        // the result of a division by zero.
        assert_eq!(report.by_class.len(), 4);
        assert_eq!(report.memory_only_fraction, 0.0);
        assert_eq!(report.by_class["relational_graph"], 0);
    }
}