Skip to main content

dsfb_database/
metrics.rs

1//! Operational evaluation metrics.
2//!
3//! All metrics are computed against perturbation ground-truth windows; for
4//! real-data runs without ground truth, the harness reports only the
5//! ground-truth-free metrics (compression ratio, replay determinism,
6//! elasticity).
7//!
8//! Definitions follow §8 of the paper exactly so that the numbers in the
9//! results tables can be re-derived from this file alone.
10
11use crate::grammar::{Episode, MotifClass};
12use crate::perturbation::{PerturbationClass, PerturbationWindow};
13use serde::{Deserialize, Serialize};
14
15#[derive(Debug, Clone, Serialize, Deserialize, Default)]
16pub struct PerMotifMetrics {
17    pub motif: String,
18    pub tp: u64,
19    pub fp: u64,
20    pub fn_: u64,
21    pub precision: f64,
22    pub recall: f64,
23    pub f1: f64,
24    /// Median time-to-detection in seconds across true positives.
25    pub time_to_detection_median_s: f64,
26    /// 95th percentile time-to-detection.
27    pub time_to_detection_p95_s: f64,
28    /// False-alarm rate during stable windows (episodes per stable hour).
29    pub false_alarm_rate_per_hour: f64,
30    /// Episode compression ratio: residual samples in motif's class /
31    /// number of episodes emitted.
32    pub episode_compression_ratio: f64,
33}
34
35fn perturbation_to_motif(c: PerturbationClass) -> MotifClass {
36    match c {
37        PerturbationClass::LatencyInjection => MotifClass::PlanRegressionOnset,
38        PerturbationClass::StatisticsStaleness => MotifClass::CardinalityMismatchRegime,
39        PerturbationClass::LockHold => MotifClass::ContentionRamp,
40        PerturbationClass::CacheEviction => MotifClass::CacheCollapse,
41        PerturbationClass::WorkloadShift => MotifClass::WorkloadPhaseTransition,
42    }
43}
44
45/// Compute per-motif precision / recall / F1 / TTD against perturbation
46/// windows. An episode is a TP if its `[t_start, t_end]` overlaps the
47/// matching-class perturbation window for the same channel.
48pub fn evaluate(
49    episodes: &[Episode],
50    windows: &[PerturbationWindow],
51    total_residual_samples_per_motif: &std::collections::HashMap<MotifClass, usize>,
52    trace_duration_s: f64,
53) -> Vec<PerMotifMetrics> {
54    debug_assert!(
55        trace_duration_s.is_finite(),
56        "trace_duration_s must be finite"
57    );
58    debug_assert!(
59        trace_duration_s >= 0.0,
60        "trace_duration_s must be non-negative"
61    );
62
63    let mut out = Vec::with_capacity(MotifClass::ALL.len());
64    for motif in MotifClass::ALL {
65        let class_eps: Vec<&Episode> = episodes.iter().filter(|e| e.motif == motif).collect();
66        let class_wins: Vec<&PerturbationWindow> = windows
67            .iter()
68            .filter(|w| perturbation_to_motif(w.class) == motif)
69            .collect();
70
71        let (tp, fp, fn_, ttds) = count_confusion(&class_eps, &class_wins);
72        debug_assert_eq!(
73            fn_ as usize + tp as usize,
74            class_wins.len(),
75            "every window must be either matched (tp) or missed (fn)"
76        );
77
78        let (precision, recall, f1) = precision_recall_f1(tp, fp, fn_);
79        debug_assert!(
80            (0.0..=1.0).contains(&precision),
81            "precision must be in [0,1]"
82        );
83        debug_assert!((0.0..=1.0).contains(&recall), "recall must be in [0,1]");
84        debug_assert!((0.0..=1.0).contains(&f1), "f1 must be in [0,1]");
85
86        let (median, p95) = ttd_percentiles(ttds);
87        let far = false_alarm_rate_per_hour(&class_eps, windows, trace_duration_s);
88        let compression = compression_ratio(
89            &class_eps,
90            total_residual_samples_per_motif
91                .get(&motif)
92                .copied()
93                .unwrap_or(0),
94        );
95
96        out.push(PerMotifMetrics {
97            motif: motif.name().to_string(),
98            tp,
99            fp,
100            fn_,
101            precision,
102            recall,
103            f1,
104            time_to_detection_median_s: median,
105            time_to_detection_p95_s: p95,
106            false_alarm_rate_per_hour: far,
107            episode_compression_ratio: compression,
108        });
109    }
110    debug_assert_eq!(
111        out.len(),
112        MotifClass::ALL.len(),
113        "one row per motif is the invariant relied on by the report layer"
114    );
115    out
116}
117
118/// One episode overlaps a perturbation window iff their intervals
119/// overlap **and** the episode channel prefix/contains the window
120/// channel (operator channels may be coarser than motif channels).
121fn episode_matches_window(ep: &Episode, w: &PerturbationWindow) -> bool {
122    let overlap = ep.t_end >= w.t_start && ep.t_start <= w.t_end;
123    let chan_ok = ep
124        .channel
125        .as_deref()
126        .map(|c| c.starts_with(&w.channel) || c.contains(&w.channel))
127        .unwrap_or(true);
128    overlap && chan_ok
129}
130
131/// Walk the episode / window lists and compute TP / FP / FN counts plus
132/// the set of time-to-detection measurements (seconds from window open
133/// to episode open for matched windows). Implements the
134/// redundant-overlap clemency rule documented in §8 of the paper.
135fn count_confusion(
136    class_eps: &[&Episode],
137    class_wins: &[&PerturbationWindow],
138) -> (u64, u64, u64, Vec<f64>) {
139    let mut tp: u64 = 0;
140    let mut fp: u64 = 0;
141    let mut fn_: u64 = 0;
142    let mut ttds: Vec<f64> = Vec::new();
143    let mut matched_windows = vec![false; class_wins.len()];
144
145    for ep in class_eps {
146        let hit = class_wins
147            .iter()
148            .enumerate()
149            .find(|(wi, w)| !matched_windows[*wi] && episode_matches_window(ep, w))
150            .map(|(wi, _)| wi);
151        if let Some(wi) = hit {
152            matched_windows[wi] = true;
153            tp += 1;
154            let w = class_wins[wi];
155            ttds.push((ep.t_start - w.t_start).max(0.0));
156        } else if !class_wins.iter().any(|w| episode_matches_window(ep, w)) {
157            // Redundant-overlap clemency: count FP only when the
158            // episode overlaps *no* window of the right class on a
159            // related channel (redundant co-located detections are
160            // not false alarms per §8).
161            fp += 1;
162        }
163    }
164    for matched in &matched_windows {
165        if !matched {
166            fn_ += 1;
167        }
168    }
169    (tp, fp, fn_, ttds)
170}
171
172/// Canonical precision / recall / F1 definitions with zero-guards. At
173/// the boundary (tp+fp=0 or tp+fn=0) the score collapses to 0 — that
174/// is the §8 convention rather than NaN.
175fn precision_recall_f1(tp: u64, fp: u64, fn_: u64) -> (f64, f64, f64) {
176    let precision = if tp + fp == 0 {
177        0.0
178    } else {
179        tp as f64 / (tp + fp) as f64
180    };
181    let recall = if tp + fn_ == 0 {
182        0.0
183    } else {
184        tp as f64 / (tp + fn_) as f64
185    };
186    let f1 = if precision + recall == 0.0 {
187        0.0
188    } else {
189        2.0 * precision * recall / (precision + recall)
190    };
191    (precision, recall, f1)
192}
193
194/// Median and 95th-percentile TTD from a TTD list. Empty lists produce
195/// (0.0, 0.0) by §8 convention.
196fn ttd_percentiles(mut ttds: Vec<f64>) -> (f64, f64) {
197    if ttds.is_empty() {
198        return (0.0, 0.0);
199    }
200    ttds.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
201    let median = ttds[ttds.len() / 2];
202    let idx = ((ttds.len() as f64 - 1.0) * 0.95).round() as usize;
203    let p95 = ttds[idx.min(ttds.len() - 1)];
204    debug_assert!(
205        median.is_finite() && p95.is_finite(),
206        "percentiles finite on finite input"
207    );
208    (median, p95)
209}
210
211/// False-alarm rate in episodes-per-stable-hour. "Stable" = trace time
212/// outside *any* perturbation window of *any* class.
213fn false_alarm_rate_per_hour(
214    class_eps: &[&Episode],
215    windows: &[PerturbationWindow],
216    trace_duration_s: f64,
217) -> f64 {
218    let stable_eps: u64 = class_eps
219        .iter()
220        .filter(|ep| {
221            !windows
222                .iter()
223                .any(|w| ep.t_end >= w.t_start && ep.t_start <= w.t_end)
224        })
225        .count() as u64;
226    let stable_hours =
227        (trace_duration_s - windows.iter().map(|w| w.t_end - w.t_start).sum::<f64>()).max(1.0)
228            / 3600.0;
229    debug_assert!(stable_hours > 0.0, "stable_hours lower-clamped to 1s/3600");
230    stable_eps as f64 / stable_hours
231}
232
233/// Episode-compression ratio: input residual samples per emitted
234/// episode. Zero-episode streams report zero (matches §8's
235/// "no episodes ⇒ no compression report" convention).
236fn compression_ratio(class_eps: &[&Episode], total_samples: usize) -> f64 {
237    if class_eps.is_empty() {
238        0.0
239    } else {
240        total_samples as f64 / class_eps.len() as f64
241    }
242}
243
244/// Elasticity report: rerun the grammar with thresholds scaled by `factor`
245/// and return the per-motif F1 delta. Caller does the rerun; this just
246/// computes the deltas.
247pub fn f1_delta(baseline: &[PerMotifMetrics], scaled: &[PerMotifMetrics]) -> Vec<(String, f64)> {
248    baseline
249        .iter()
250        .zip(scaled.iter())
251        .map(|(a, b)| (a.motif.clone(), b.f1 - a.f1))
252        .collect()
253}
254
255/// Cross-signal agreement per episode and per-motif mean.
256///
257/// For each emitted episode `E` of motif class `c` on channel `ch` over
258/// `[t_s, t_e]`, define cross-signal agreement as the fraction of *other*
259/// motif classes `c' ≠ c` that also emit at least one episode overlapping
260/// `[t_s, t_e]` on any channel. Range `[0, 1]`; zero = isolated signal,
261/// one = full grammar coverage.
262///
263/// This is **not** a ground-truth correlation and **not** a causal score.
264/// High agreement can arise from a common exogenous stressor *or* from
265/// the grammar's overlapping sensitivity; low agreement is not evidence
266/// of isolation either. It is a structural co-occurrence measurement
267/// whose sole purpose is to quantify the multi-signal coupling that
268/// distinguishes DSFB episodes from per-series change-point reports.
269///
270/// See paper §7 for the matching limitation statement.
271pub fn cross_signal_agreement(episodes: &[Episode]) -> Vec<(MotifClass, f64)> {
272    let mut per_motif: Vec<(MotifClass, Vec<f64>)> =
273        MotifClass::ALL.iter().map(|m| (*m, Vec::new())).collect();
274    for ep in episodes {
275        let other_classes_with_overlap = MotifClass::ALL
276            .iter()
277            .filter(|m| **m != ep.motif)
278            .filter(|m| {
279                episodes
280                    .iter()
281                    .any(|e| e.motif == **m && e.t_end >= ep.t_start && e.t_start <= ep.t_end)
282            })
283            .count();
284        let agreement = other_classes_with_overlap as f64 / (MotifClass::ALL.len() - 1) as f64;
285        debug_assert!(
286            (0.0..=1.0).contains(&agreement),
287            "cross-signal agreement must be in [0,1]"
288        );
289        per_motif
290            .iter_mut()
291            .find(|(m, _)| *m == ep.motif)
292            .map(|(_, v)| v.push(agreement));
293    }
294    per_motif
295        .into_iter()
296        .map(|(m, vs)| {
297            let mean = if vs.is_empty() {
298                0.0
299            } else {
300                vs.iter().sum::<f64>() / vs.len() as f64
301            };
302            (m, mean)
303        })
304        .collect()
305}
306
307/// Stability-under-perturbation scalar metric.
308///
309/// Given per-(motif, scale) F1 rows from the stress sweep, compute the
310/// normalised area-under-the-F1 curve over scales in the operating
311/// envelope window `[0.5, 1.5]` (around the canonical scale of 1.0). The
312/// scalar collapses the F1-vs-scale curve to a single number in `[0, 1]`
313/// that answers "how robust is this motif's detection across a ±50%
314/// perturbation-magnitude window?" — higher is more stable.
315///
316/// Trapezoidal integration on the subset of provided scales that fall in
317/// the window; normalisation divides by the window width so the result
318/// is the mean F1 across the interval. Motifs with no samples in the
319/// window return 0.0.
320///
321/// This is a re-read of the existing `tpcds.stress.csv` data, not a new
322/// experiment. See paper §6 (operating envelope) for the companion
323/// narrative.
324pub fn stability_under_perturbation(
325    stress_rows: &[(f64, String, f64)],
326) -> std::collections::HashMap<String, f64> {
327    let (lo, hi) = (0.5_f64, 1.5_f64);
328    let mut by_motif: std::collections::BTreeMap<String, Vec<(f64, f64)>> =
329        std::collections::BTreeMap::new();
330    for (scale, motif, f1) in stress_rows {
331        if *scale >= lo && *scale <= hi && f1.is_finite() {
332            by_motif
333                .entry(motif.clone())
334                .or_default()
335                .push((*scale, *f1));
336        }
337    }
338    let mut out = std::collections::HashMap::new();
339    for (motif, mut pts) in by_motif {
340        pts.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal));
341        if pts.len() < 2 {
342            out.insert(motif, 0.0);
343            continue;
344        }
345        let mut auc = 0.0_f64;
346        for pair in pts.windows(2) {
347            let (x0, y0) = pair[0];
348            let (x1, y1) = pair[1];
349            auc += 0.5 * (y0 + y1) * (x1 - x0);
350        }
351        let width = pts.last().unwrap().0 - pts.first().unwrap().0;
352        let norm = if width > 0.0 { auc / width } else { 0.0 };
353        debug_assert!(norm.is_finite(), "stability AUC finite");
354        out.insert(motif, norm.clamp(0.0, 1.0));
355    }
356    out
357}