Skip to main content

dag_ml_core/
observability.rs

1//! ADR-12 observability hooks.
2//!
3//! All `tracing` emission for the control core is centralized here so the set of
4//! telemetry field names stays auditable in one place. Spans and events carry
5//! **identifiers and counts only** — never feature matrices, targets, sample
6//! values or metadata contents. This preserves, at the telemetry layer, the same
7//! boundary the data ABI enforces: the core never exposes raw data.
8//!
9//! The CI lint `scripts/lint_tracing_fields.py` enforces two invariants:
10//! 1. no `tracing` usage exists outside this module (so every event is vetted);
11//! 2. no field name in this module matches the forbidden
12//!    `data|features|targets|sample|metadata` pattern (singular `sample` also
13//!    rejects `sample_count`, `sample_ids`, etc.).
14//!
15//! The core only emits through the `tracing` facade; it never installs a
16//! subscriber. Binaries and hosts choose a sink (the CLI installs a
17//! `tracing_subscriber::fmt` layer driven by `RUST_LOG`; see
18//! `docs/OBSERVABILITY.md`).
19
20use tracing::{info_span, warn, Span};
21
/// The closed set of telemetry field names permitted by ADR-12.
///
/// Any field this module attaches to a span or event has to be listed here.
/// Entries name identifiers, flags, taxonomy labels or counts only; nothing in
/// the list may carry data. Extending the list requires an ADR-12 update and a
/// review under the privacy rule.
pub const OBSERVABILITY_FIELD_ALLOWLIST: &[&str] = &[
    // Run / plan correlation and scope identifiers.
    "run_id",
    "plan_id",
    "variant_id",
    "fold_id",
    "controller_id",
    "phase",
    "node_id",
    "partition_id",
    // Boolean outcome flags.
    "cache_hit",
    "oof_refused",
    // Stable error-taxonomy labels.
    "category",
    "code",
    // Counts.
    "violator_count",
];
40
41/// Build the per-phase-scope span (ADR-12). `run_id`/`plan_id` correlate
42/// concurrent or overlapping runs; empty `variant_id`/`fold_id` mean the field is
43/// not applicable to the current phase. Fields are identifiers only.
44pub fn phase_span(
45    run_id: &str,
46    plan_id: &str,
47    phase: &str,
48    variant_id: Option<&str>,
49    fold_id: Option<&str>,
50) -> Span {
51    info_span!(
52        "dag_ml.phase",
53        run_id = run_id,
54        plan_id = plan_id,
55        phase = phase,
56        variant_id = variant_id.unwrap_or_default(),
57        fold_id = fold_id.unwrap_or_default(),
58    )
59}
60
61/// Build the per-node span (ADR-12), nested under the current phase span so node
62/// telemetry is attributed to its run, plan and controller. Identifiers only.
63pub fn node_span(
64    run_id: &str,
65    plan_id: &str,
66    phase: &str,
67    node_id: &str,
68    controller_id: &str,
69) -> Span {
70    info_span!(
71        "dag_ml.node",
72        run_id = run_id,
73        plan_id = plan_id,
74        phase = phase,
75        node_id = node_id,
76        controller_id = controller_id,
77    )
78}
79
80/// Emit the ADR-12 out-of-fold leakage refusal event with stable taxonomy fields
81/// (`category`/`code` mirror [`crate::DagMlError::OofLeakage`]) so log consumers
82/// can alert on refusals without parsing messages.
83pub fn emit_oof_refusal(node_id: &str, violator_count: usize) {
84    warn!(
85        oof_refused = true,
86        category = "validation",
87        code = "oof_leakage",
88        node_id = node_id,
89        violator_count = violator_count,
90        "out-of-fold leakage refused"
91    );
92}
93
#[cfg(test)]
mod tests {
    use super::*;

    /// Substrings that must never occur in an allowlisted field name.
    const FORBIDDEN_FRAGMENTS: [&str; 5] = ["data", "features", "targets", "sample", "metadata"];

    /// Every allowlisted field name must be free of the data-bearing fragments.
    #[test]
    fn allowlist_contains_no_data_bearing_field() {
        for field in OBSERVABILITY_FIELD_ALLOWLIST.iter().copied() {
            // Report the first offending fragment, in list order.
            let leak = FORBIDDEN_FRAGMENTS
                .iter()
                .copied()
                .find(|&fragment| field.contains(fragment));
            if let Some(forbidden) = leak {
                panic!("allowlisted field `{field}` leaks `{forbidden}`");
            }
        }
    }

    /// The helpers must be safe to call when no subscriber is installed.
    #[test]
    fn helpers_emit_without_subscriber() {
        // Tests install no subscriber, so every facade call below has to be a
        // harmless no-op instead of a panic.
        let (run, plan, phase, node) = ("run:1", "plan:1", "FIT_CV", "node:model");

        // Keep both guards alive until the end of the test so the refusal
        // event is emitted while the phase and node spans are entered.
        let _phase_guard = phase_span(run, plan, phase, Some("variant:0"), Some("fold:0")).entered();
        let _node_guard = node_span(run, plan, phase, node, "controller:m").entered();

        emit_oof_refusal(node, 2);
    }
}