datasynth_group/aggregate/coverage_report.rs
1//! IC matching coverage report — Task 5.7.
2//!
3//! After [`crate::aggregate::ic_matcher::match_ic_pairs`] has joined the
4//! seller-side and buyer-side journal entries of every IC pair, this
5//! module summarises the result into a sidecar diagnostic / observability
6//! artefact for auditors and dashboards.
7//!
8//! # Position in the v5.0 aggregate phase
9//!
//! ```text
//!   match_ic_pairs                     Task 5.3
//!         │
//!         ▼
//!   IcMatchResult ──────► build_coverage_report ──► CoverageReport
//!         │                                                │
//!         ▼                                                ▼
//!   generate_eliminations              Task 5.4    ic_eliminations/
//!         │                                        ic_matching_coverage.json
//!         ▼
//!   eliminations_to_journal_entries    Task 5.5
//!         │
//!         ▼
//!   apply_eliminations_to_tb           Task 5.6
//! ```
25//!
26//! The report is intentionally **not** part of the consolidated TB
27//! pipeline. The downstream elimination → JE → post-elim chain consumes
28//! [`crate::aggregate::ic_matcher::IcMatchResult`] directly; the coverage
29//! report is a sidecar artefact whose only readers are auditors,
30//! dashboards, and the "Good engagement" gate (`coverage ≥ 0.98` per spec
31//! §5.4).
32//!
33//! # On-disk contract
34//!
35//! Per spec §5.4, the report lands at
36//! `{out_dir}/ic_eliminations/ic_matching_coverage.json` with these fields:
37//!
38//! ```json
39//! {
40//! "total_pairs_planned": 250000,
41//! "matched": 248750,
42//! "coverage": 0.995,
43//! "unmatched_by_reason": {
44//! "missing_buyer_side": 780,
45//! "missing_seller_side": 350,
46//! "amount_drift_above_tolerance": 120
47//! },
48//! "unmatched_sample": [/* first 100 unmatched sides */]
49//! }
50//! ```
51//!
52//! # Sample cap
53//!
54//! `unmatched_sample` is capped at the first **100** entries from
55//! [`crate::aggregate::ic_matcher::IcMatchResult::unmatched`] (which is
56//! pre-sorted by `pair_id` for determinism). The cap matches the spec's
57//! "first 100 unmatched pairs for debugging" — the goal is diagnostic
58//! triage, not exhaustive replay. Auditors needing the full unmatched
59//! list can re-run [`crate::aggregate::ic_matcher::match_ic_pairs`] in a
60//! debugger.
61//!
62//! # Schema stability
63//!
64//! `unmatched_by_reason` is always emitted with **all three**
65//! [`UnmatchedReason`] variants present (zero-valued when no
66//! observations) so dashboards can rely on the keys existing. The map
67//! is a [`BTreeMap`] keyed by the reason enum so the JSON output is
68//! deterministically ordered.
69//!
70//! # Determinism
71//!
72//! Two calls with identical [`IcMatchResult`] inputs produce
73//! byte-identical JSON output. This is verified by the
74//! `deterministic_output` test in `tests/coverage_report.rs`.
75
76use std::collections::BTreeMap;
77use std::fs;
78use std::path::{Path, PathBuf};
79
80use serde::{Deserialize, Serialize};
81
82use crate::aggregate::ic_matcher::{IcMatchResult, UnmatchedReason, UnmatchedSide};
83use crate::errors::{GroupError, GroupResult};
84
/// Upper bound on the number of unmatched sides copied into
/// [`CoverageReport::unmatched_sample`].
///
/// Corresponds to the "\[:100\]" slice in spec §5.4.
pub const UNMATCHED_SAMPLE_CAP: usize = 100;

/// Name of the directory, relative to the group output root, that holds
/// the coverage report.
///
/// Spec §5.4 places the report next to the elimination JEs.
pub const COVERAGE_REPORT_SUBDIR: &str = "ic_eliminations";

/// File name of the coverage report inside [`COVERAGE_REPORT_SUBDIR`]
/// (spec §5.4).
pub const COVERAGE_REPORT_FILENAME: &str = "ic_matching_coverage.json";
96
97// ── Public types ──────────────────────────────────────────────────────────────
98
99/// Diagnostic / observability summary of [`IcMatchResult`].
100///
101/// See spec §5.4 for the field contract. All counts mirror the
102/// matcher's already-computed values — this report is a thin pass-through
103/// layer with the histogram + sample for auditors / dashboards.
104#[derive(Debug, Clone, Serialize, Deserialize)]
105pub struct CoverageReport {
106 /// Total number of distinct IC pairs the manifest planned (sum
107 /// across all entities of seller-side plans). Mirrors
108 /// [`IcMatchResult::total_planned`].
109 pub total_pairs_planned: usize,
110 /// Number of fully matched pairs (both sides observed). Equals
111 /// `IcMatchResult::matched.len()`.
112 pub matched: usize,
113 /// `matched / total_pairs_planned`, with `0/0` mapped to `0.0`
114 /// (mirrors [`IcMatchResult::coverage`] so the two stay
115 /// consistent — see the matcher's module docs for the rationale).
116 pub coverage: f64,
117 /// Histogram of unmatched sides keyed by reason.
118 ///
119 /// Always contains all three [`UnmatchedReason`] variants — even
120 /// when zero — so the schema is stable across runs and dashboards
121 /// can rely on the keys being present. Keyed by the enum itself
122 /// (with snake_case serde rename) and stored in a [`BTreeMap`] so
123 /// the JSON output's iteration order is deterministic.
124 pub unmatched_by_reason: BTreeMap<UnmatchedReason, usize>,
125 /// Up to [`UNMATCHED_SAMPLE_CAP`] unmatched sides for diagnostic
126 /// triage. The sample is the head of
127 /// [`IcMatchResult::unmatched`] (already sorted by `pair_id` for
128 /// determinism) so the same input always produces the same sample.
129 pub unmatched_sample: Vec<UnmatchedSide>,
130}
131
132// ── Public API ────────────────────────────────────────────────────────────────
133
134/// Build a [`CoverageReport`] from an [`IcMatchResult`].
135///
136/// Pure function: no I/O, no allocation beyond the report itself, no
137/// dependence on global state. Two calls with the same input produce
138/// equal reports.
139///
140/// # Behaviour
141///
142/// 1. **Counts.** `total_pairs_planned`, `matched`, and `coverage`
143/// are passed through verbatim from `result` — the matcher already
144/// did the math, we just relabel.
145/// 2. **Histogram.** Initialise `unmatched_by_reason` with all three
146/// [`UnmatchedReason`] variants set to zero (so the schema stays
147/// stable for downstream dashboards) and increment the relevant
148/// bucket for each entry in `result.unmatched`.
149/// 3. **Sample.** Take the first [`UNMATCHED_SAMPLE_CAP`] entries
150/// from `result.unmatched`. The matcher pre-sorts `unmatched` by
151/// `pair_id`, so the sample is deterministic.
152pub fn build_coverage_report(result: &IcMatchResult) -> CoverageReport {
153 // Initialise the histogram with all three reasons present so
154 // downstream dashboards can rely on the keys existing even when a
155 // run has zero observations of a given reason.
156 let mut unmatched_by_reason: BTreeMap<UnmatchedReason, usize> = BTreeMap::new();
157 unmatched_by_reason.insert(UnmatchedReason::MissingBuyerSide, 0);
158 unmatched_by_reason.insert(UnmatchedReason::MissingSellerSide, 0);
159 unmatched_by_reason.insert(UnmatchedReason::AmountDriftAboveTolerance, 0);
160
161 for side in &result.unmatched {
162 *unmatched_by_reason.entry(side.reason).or_insert(0) += 1;
163 }
164
165 let unmatched_sample: Vec<UnmatchedSide> = result
166 .unmatched
167 .iter()
168 .take(UNMATCHED_SAMPLE_CAP)
169 .cloned()
170 .collect();
171
172 CoverageReport {
173 total_pairs_planned: result.total_planned,
174 matched: result.matched.len(),
175 coverage: result.coverage,
176 unmatched_by_reason,
177 unmatched_sample,
178 }
179}
180
181/// Write the coverage report to
182/// `{out_dir}/ic_eliminations/ic_matching_coverage.json`.
183///
184/// Creates the `ic_eliminations/` subdirectory if it doesn't exist.
185/// Output is pretty-printed JSON with a trailing newline so the file is
186/// human-readable when opened in an editor.
187///
188/// Returns the absolute path of the written file so callers logging
189/// "wrote ic_matching_coverage.json to …" don't have to re-derive it.
190///
191/// # Errors
192///
193/// - [`GroupError::Io`] if the subdirectory creation, file write, or
194/// path resolution fails.
195/// - [`GroupError::Serde`] if the report fails to serialise (should be
196/// impossible — every field is `Serialize`-friendly).
197pub fn write_coverage_report(report: &CoverageReport, out_dir: &Path) -> GroupResult<PathBuf> {
198 let dir = out_dir.join(COVERAGE_REPORT_SUBDIR);
199 fs::create_dir_all(&dir).map_err(GroupError::Io)?;
200
201 let path = dir.join(COVERAGE_REPORT_FILENAME);
202
203 let mut json = serde_json::to_string_pretty(report)?;
204 json.push('\n');
205 fs::write(&path, json).map_err(GroupError::Io)?;
206
207 Ok(path)
208}
209
210// ── Unit tests ────────────────────────────────────────────────────────────────
211
#[cfg(test)]
mod tests {
    use super::*;
    use crate::aggregate::ic_matcher::{IcMatchResult, IcMatchedPair, UnmatchedSide};
    use crate::shard::ic_plan::IcRole;
    use chrono::NaiveDate;
    use datasynth_core::models::journal_entry::JournalEntryHeader;
    use datasynth_core::models::{IcPairId, JournalEntry};

    /// Matcher output with nothing planned, matched, or unmatched.
    fn empty_match_result() -> IcMatchResult {
        IcMatchResult {
            total_planned: 0,
            coverage: 0.0,
            matched: vec![],
            unmatched: vec![],
        }
    }

    /// Placeholder journal entry; the report never inspects its content.
    fn dummy_je() -> JournalEntry {
        let date = NaiveDate::from_ymd_opt(2024, 6, 15).unwrap();
        JournalEntry::new(JournalEntryHeader::new("ENT".to_string(), date))
    }

    /// Histogram count for `reason`, or `None` when the key is absent.
    fn bucket(report: &CoverageReport, reason: UnmatchedReason) -> Option<usize> {
        report.unmatched_by_reason.get(&reason).copied()
    }

    #[test]
    fn build_from_empty_matches_zero_coverage() {
        let report = build_coverage_report(&empty_match_result());

        assert_eq!(report.total_pairs_planned, 0);
        assert_eq!(report.matched, 0);
        assert_eq!(report.coverage, 0.0);
        assert!(report.unmatched_sample.is_empty());

        // All three reasons are present, each at zero.
        assert_eq!(report.unmatched_by_reason.len(), 3);
        for reason in [
            UnmatchedReason::MissingBuyerSide,
            UnmatchedReason::MissingSellerSide,
            UnmatchedReason::AmountDriftAboveTolerance,
        ] {
            assert_eq!(bucket(&report, reason), Some(0));
        }
    }

    #[test]
    fn histogram_keys_always_present_even_when_zero() {
        // A single unmatched side exercises exactly one reason; the two
        // remaining reasons must still be reported with a zero count so
        // the schema does not change from run to run.
        let lone_seller = UnmatchedSide {
            pair_id: IcPairId::from_bytes([1; 32]),
            present_role: IcRole::Seller,
            present_entity: "ENT".to_string(),
            present_je: dummy_je(),
            reason: UnmatchedReason::MissingBuyerSide,
        };
        let input = IcMatchResult {
            total_planned: 1,
            coverage: 0.0,
            matched: vec![],
            unmatched: vec![lone_seller],
        };

        let report = build_coverage_report(&input);

        assert_eq!(bucket(&report, UnmatchedReason::MissingBuyerSide), Some(1));
        assert_eq!(
            bucket(&report, UnmatchedReason::MissingSellerSide),
            Some(0),
            "MissingSellerSide must be present at 0 for schema stability"
        );
        assert_eq!(
            bucket(&report, UnmatchedReason::AmountDriftAboveTolerance),
            Some(0),
            "AmountDriftAboveTolerance must be present at 0 for schema stability"
        );
    }

    #[test]
    fn build_passes_through_matched_count_and_coverage() {
        let seller_je = dummy_je();
        let buyer_je = seller_je.clone();
        let input = IcMatchResult {
            total_planned: 2,
            coverage: 0.5,
            matched: vec![IcMatchedPair {
                pair_id: IcPairId::from_bytes([2; 32]),
                seller_entity: "S".to_string(),
                buyer_entity: "B".to_string(),
                seller_je,
                buyer_je,
            }],
            unmatched: vec![],
        };

        let report = build_coverage_report(&input);

        assert_eq!(report.total_pairs_planned, 2);
        assert_eq!(report.matched, 1);
        assert_eq!(report.coverage, 0.5);
    }
}
321}