dag_ml_core/observability.rs
1//! ADR-12 observability hooks.
2//!
3//! All `tracing` emission for the control core is centralized here so the set of
4//! telemetry field names stays auditable in one place. Spans and events carry
5//! **identifiers and counts only** — never feature matrices, targets, sample
6//! values or metadata contents. This preserves, at the telemetry layer, the same
7//! boundary the data ABI enforces: the core never exposes raw data.
8//!
9//! The CI lint `scripts/lint_tracing_fields.py` enforces two invariants:
10//! 1. no `tracing` usage exists outside this module (so every event is vetted);
11//! 2. no field name in this module matches the forbidden
12//! `data|features|targets|sample|metadata` pattern (singular `sample` also
13//! rejects `sample_count`, `sample_ids`, etc.).
14//!
15//! The core only emits through the `tracing` facade; it never installs a
16//! subscriber. Binaries and hosts choose a sink (the CLI installs a
17//! `tracing_subscriber::fmt` layer driven by `RUST_LOG`; see
18//! `docs/OBSERVABILITY.md`).
19
20use tracing::{info_span, warn, Span};
21
/// The frozen set of telemetry field names permitted by ADR-12.
///
/// Any field recorded by a span or event in this module has to be listed
/// here. Entries name identifiers, flags, taxonomy labels or counts — never
/// payload contents. Extending the list requires an ADR-12 update and a review
/// per the privacy rule.
pub const OBSERVABILITY_FIELD_ALLOWLIST: &[&str] = &[
    // Correlation identifiers.
    "run_id",
    "plan_id",
    "variant_id",
    "fold_id",
    "controller_id",
    "phase",
    "node_id",
    "partition_id",
    // Outcome fields (`oof_refused` is recorded as a boolean by
    // `emit_oof_refusal`).
    "cache_hit",
    "oof_refused",
    // Stable error-taxonomy labels.
    "category",
    "code",
    // Counts.
    "violator_count",
];
40
41/// Build the per-phase-scope span (ADR-12). `run_id`/`plan_id` correlate
42/// concurrent or overlapping runs; empty `variant_id`/`fold_id` mean the field is
43/// not applicable to the current phase. Fields are identifiers only.
44pub fn phase_span(
45 run_id: &str,
46 plan_id: &str,
47 phase: &str,
48 variant_id: Option<&str>,
49 fold_id: Option<&str>,
50) -> Span {
51 info_span!(
52 "dag_ml.phase",
53 run_id = run_id,
54 plan_id = plan_id,
55 phase = phase,
56 variant_id = variant_id.unwrap_or_default(),
57 fold_id = fold_id.unwrap_or_default(),
58 )
59}
60
61/// Build the per-node span (ADR-12), nested under the current phase span so node
62/// telemetry is attributed to its run, plan and controller. Identifiers only.
63pub fn node_span(
64 run_id: &str,
65 plan_id: &str,
66 phase: &str,
67 node_id: &str,
68 controller_id: &str,
69) -> Span {
70 info_span!(
71 "dag_ml.node",
72 run_id = run_id,
73 plan_id = plan_id,
74 phase = phase,
75 node_id = node_id,
76 controller_id = controller_id,
77 )
78}
79
80/// Emit the ADR-12 out-of-fold leakage refusal event with stable taxonomy fields
81/// (`category`/`code` mirror [`crate::DagMlError::OofLeakage`]) so log consumers
82/// can alert on refusals without parsing messages.
83pub fn emit_oof_refusal(node_id: &str, violator_count: usize) {
84 warn!(
85 oof_refused = true,
86 category = "validation",
87 code = "oof_leakage",
88 node_id = node_id,
89 violator_count = violator_count,
90 "out-of-fold leakage refused"
91 );
92}
93
#[cfg(test)]
mod tests {
    use super::*;

    /// Substrings that must not occur anywhere in an allowlisted field name.
    const FORBIDDEN: [&str; 5] = ["data", "features", "targets", "sample", "metadata"];

    /// Every allowlisted field name must be free of the forbidden substrings.
    #[test]
    fn allowlist_contains_no_data_bearing_field() {
        for &field in OBSERVABILITY_FIELD_ALLOWLIST {
            // Report the first forbidden substring found in this field, if any.
            let offender = FORBIDDEN.iter().find(|needle| field.contains(**needle));
            if let Some(forbidden) = offender {
                panic!("allowlisted field `{field}` leaks `{forbidden}`");
            }
        }
    }

    /// With no subscriber installed, the helpers must run without panicking.
    #[test]
    fn helpers_emit_without_subscriber() {
        // Enter the phase span first so the node span is created inside it.
        let _phase_guard = phase_span(
            "run:1",
            "plan:1",
            "FIT_CV",
            Some("variant:0"),
            Some("fold:0"),
        )
        .entered();
        let _node_guard =
            node_span("run:1", "plan:1", "FIT_CV", "node:model", "controller:m").entered();

        emit_oof_refusal("node:model", 2);
    }
}
125}