Skip to main content

datasynth_group/aggregate/
coverage_report.rs

1//! IC matching coverage report — Task 5.7.
2//!
3//! After [`crate::aggregate::ic_matcher::match_ic_pairs`] has joined the
4//! seller-side and buyer-side journal entries of every IC pair, this
5//! module summarises the result into a sidecar diagnostic / observability
6//! artefact for auditors and dashboards.
7//!
8//! # Position in the v5.0 aggregate phase
9//!
10//! ```text
11//!   match_ic_pairs           Task 5.3
12//!         │
13//!         ▼
14//!   IcMatchResult ──────► build_coverage_report ──► CoverageReport
15//!         │                                              │
16//!         ▼                                              ▼
17//!   generate_eliminations   Task 5.4              ic_eliminations/
18//!         │                                       ic_matching_coverage.json
19//!         ▼
20//!   eliminations_to_journal_entries   Task 5.5
21//!         │
22//!         ▼
23//!   apply_eliminations_to_tb          Task 5.6
24//! ```
25//!
26//! The report is intentionally **not** part of the consolidated TB
27//! pipeline.  The downstream elimination → JE → post-elim chain consumes
28//! [`crate::aggregate::ic_matcher::IcMatchResult`] directly; the coverage
29//! report is a sidecar artefact whose only readers are auditors,
30//! dashboards, and the "Good engagement" gate (`coverage ≥ 0.98` per spec
31//! §5.4).
32//!
33//! # On-disk contract
34//!
35//! Per spec §5.4, the report lands at
36//! `{out_dir}/ic_eliminations/ic_matching_coverage.json` with these fields:
37//!
38//! ```json
39//! {
40//!   "total_pairs_planned": 250000,
41//!   "matched": 248750,
42//!   "coverage": 0.995,
43//!   "unmatched_by_reason": {
44//!     "missing_buyer_side": 780,
45//!     "missing_seller_side": 350,
46//!     "amount_drift_above_tolerance": 120
47//!   },
48//!   "unmatched_sample": [/* first 100 unmatched sides */]
49//! }
50//! ```
51//!
52//! # Sample cap
53//!
54//! `unmatched_sample` is capped at the first **100** entries from
55//! [`crate::aggregate::ic_matcher::IcMatchResult::unmatched`] (which is
56//! pre-sorted by `pair_id` for determinism).  The cap matches the spec's
57//! "first 100 unmatched pairs for debugging" — the goal is diagnostic
58//! triage, not exhaustive replay.  Auditors needing the full unmatched
59//! list can re-run [`crate::aggregate::ic_matcher::match_ic_pairs`] in a
60//! debugger.
61//!
62//! # Schema stability
63//!
64//! `unmatched_by_reason` is always emitted with **all three**
65//! [`UnmatchedReason`] variants present (zero-valued when no
66//! observations) so dashboards can rely on the keys existing.  The map
67//! is a [`BTreeMap`] keyed by the reason enum so the JSON output is
68//! deterministically ordered.
69//!
70//! # Determinism
71//!
72//! Two calls with identical [`IcMatchResult`] inputs produce
73//! byte-identical JSON output.  This is verified by the
74//! `deterministic_output` test in `tests/coverage_report.rs`.
75
76use std::collections::BTreeMap;
77use std::fs;
78use std::path::{Path, PathBuf};
79
80use serde::{Deserialize, Serialize};
81
82use crate::aggregate::ic_matcher::{IcMatchResult, UnmatchedReason, UnmatchedSide};
83use crate::errors::{GroupError, GroupResult};
84
/// Maximum number of unmatched sides included in
/// [`CoverageReport::unmatched_sample`].  Mirrors the spec §5.4
/// "\[:100\]" semantics.
///
/// [`build_coverage_report`] applies this cap with `.take(..)` on
/// `IcMatchResult::unmatched`.  Only the sample is truncated; the
/// `unmatched_by_reason` histogram still counts every unmatched side.
pub const UNMATCHED_SAMPLE_CAP: usize = 100;

/// Subdirectory within the group output root where the coverage report
/// lives.  Per spec §5.4, the report sits alongside the elimination JEs.
///
/// [`write_coverage_report`] joins this onto its `out_dir` argument and
/// creates the directory when it is missing.
pub const COVERAGE_REPORT_SUBDIR: &str = "ic_eliminations";

/// File name for the coverage report, per spec §5.4.
///
/// [`write_coverage_report`] joins this onto the
/// [`COVERAGE_REPORT_SUBDIR`] directory to form the output path.
pub const COVERAGE_REPORT_FILENAME: &str = "ic_matching_coverage.json";
96
97// ── Public types ──────────────────────────────────────────────────────────────
98
/// Diagnostic / observability summary of [`IcMatchResult`].
///
/// See spec §5.4 for the field contract.  All counts mirror the
/// matcher's already-computed values — this report is a thin pass-through
/// layer with the histogram + sample for auditors / dashboards.
///
/// Instances are produced by [`build_coverage_report`] and persisted as
/// JSON by [`write_coverage_report`].
// NOTE: serde's derive emits struct fields in declaration order, so the
// order of the fields below is the key order of the on-disk JSON.
// Reordering them changes the serialised bytes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CoverageReport {
    /// Total number of IC pairs the manifest planned.  Copied verbatim
    /// from [`IcMatchResult::total_planned`] (described upstream as the
    /// sum across all entities of seller-side plans).
    pub total_pairs_planned: usize,
    /// Number of fully matched pairs (both sides observed).  Computed
    /// as `IcMatchResult::matched.len()`.
    pub matched: usize,
    /// Coverage ratio, copied verbatim from [`IcMatchResult::coverage`]
    /// rather than recomputed here, so the two always stay consistent.
    /// Per the matcher's contract this is
    /// `matched / total_pairs_planned` with `0/0` mapped to `0.0` — see
    /// the matcher's module docs for the rationale.
    pub coverage: f64,
    /// Histogram of unmatched sides keyed by reason.
    ///
    /// Counts **every** entry of `IcMatchResult::unmatched`, not only
    /// the ones that made it into [`Self::unmatched_sample`].
    ///
    /// Always contains all three [`UnmatchedReason`] variants — even
    /// when zero — so the schema is stable across runs and dashboards
    /// can rely on the keys being present.  Keyed by the enum itself
    /// (the snake_case JSON keys come from the enum's own serde
    /// attributes, which live in the matcher module) and stored in a
    /// [`BTreeMap`] so the JSON output's iteration order is
    /// deterministic.
    pub unmatched_by_reason: BTreeMap<UnmatchedReason, usize>,
    /// Up to [`UNMATCHED_SAMPLE_CAP`] unmatched sides for diagnostic
    /// triage.  The sample is the head of
    /// [`IcMatchResult::unmatched`] (already sorted by `pair_id` for
    /// determinism) so the same input always produces the same sample.
    pub unmatched_sample: Vec<UnmatchedSide>,
}
131
132// ── Public API ────────────────────────────────────────────────────────────────
133
134/// Build a [`CoverageReport`] from an [`IcMatchResult`].
135///
136/// Pure function: no I/O, no allocation beyond the report itself, no
137/// dependence on global state.  Two calls with the same input produce
138/// equal reports.
139///
140/// # Behaviour
141///
142/// 1. **Counts.**  `total_pairs_planned`, `matched`, and `coverage`
143///    are passed through verbatim from `result` — the matcher already
144///    did the math, we just relabel.
145/// 2. **Histogram.**  Initialise `unmatched_by_reason` with all three
146///    [`UnmatchedReason`] variants set to zero (so the schema stays
147///    stable for downstream dashboards) and increment the relevant
148///    bucket for each entry in `result.unmatched`.
149/// 3. **Sample.**  Take the first [`UNMATCHED_SAMPLE_CAP`] entries
150///    from `result.unmatched`.  The matcher pre-sorts `unmatched` by
151///    `pair_id`, so the sample is deterministic.
152pub fn build_coverage_report(result: &IcMatchResult) -> CoverageReport {
153    // Initialise the histogram with all three reasons present so
154    // downstream dashboards can rely on the keys existing even when a
155    // run has zero observations of a given reason.
156    let mut unmatched_by_reason: BTreeMap<UnmatchedReason, usize> = BTreeMap::new();
157    unmatched_by_reason.insert(UnmatchedReason::MissingBuyerSide, 0);
158    unmatched_by_reason.insert(UnmatchedReason::MissingSellerSide, 0);
159    unmatched_by_reason.insert(UnmatchedReason::AmountDriftAboveTolerance, 0);
160
161    for side in &result.unmatched {
162        *unmatched_by_reason.entry(side.reason).or_insert(0) += 1;
163    }
164
165    let unmatched_sample: Vec<UnmatchedSide> = result
166        .unmatched
167        .iter()
168        .take(UNMATCHED_SAMPLE_CAP)
169        .cloned()
170        .collect();
171
172    CoverageReport {
173        total_pairs_planned: result.total_planned,
174        matched: result.matched.len(),
175        coverage: result.coverage,
176        unmatched_by_reason,
177        unmatched_sample,
178    }
179}
180
181/// Write the coverage report to
182/// `{out_dir}/ic_eliminations/ic_matching_coverage.json`.
183///
184/// Creates the `ic_eliminations/` subdirectory if it doesn't exist.
185/// Output is pretty-printed JSON with a trailing newline so the file is
186/// human-readable when opened in an editor.
187///
188/// Returns the absolute path of the written file so callers logging
189/// "wrote ic_matching_coverage.json to …" don't have to re-derive it.
190///
191/// # Errors
192///
193/// - [`GroupError::Io`] if the subdirectory creation, file write, or
194///   path resolution fails.
195/// - [`GroupError::Serde`] if the report fails to serialise (should be
196///   impossible — every field is `Serialize`-friendly).
197pub fn write_coverage_report(report: &CoverageReport, out_dir: &Path) -> GroupResult<PathBuf> {
198    let dir = out_dir.join(COVERAGE_REPORT_SUBDIR);
199    fs::create_dir_all(&dir).map_err(GroupError::Io)?;
200
201    let path = dir.join(COVERAGE_REPORT_FILENAME);
202
203    let mut json = serde_json::to_string_pretty(report)?;
204    json.push('\n');
205    fs::write(&path, json).map_err(GroupError::Io)?;
206
207    Ok(path)
208}
209
210// ── Unit tests ────────────────────────────────────────────────────────────────
211
#[cfg(test)]
mod tests {
    use super::*;
    use crate::aggregate::ic_matcher::{IcMatchResult, IcMatchedPair, UnmatchedSide};
    use crate::shard::ic_plan::IcRole;
    use chrono::NaiveDate;
    use datasynth_core::models::journal_entry::JournalEntryHeader;
    use datasynth_core::models::{IcPairId, JournalEntry};

    /// Matcher output with nothing planned, matched, or unmatched.
    fn empty_match_result() -> IcMatchResult {
        IcMatchResult {
            matched: Vec::new(),
            unmatched: Vec::new(),
            total_planned: 0,
            coverage: 0.0,
        }
    }

    /// Minimal journal entry; its contents are irrelevant to the report.
    fn dummy_je() -> JournalEntry {
        let header = JournalEntryHeader::new(
            "ENT".to_string(),
            NaiveDate::from_ymd_opt(2024, 6, 15).unwrap(),
        );
        JournalEntry::new(header)
    }

    /// Seller-side unmatched observation tagged with `reason`.
    ///
    /// `index` is embedded in `present_entity` (as `ENT{index}`) so tests
    /// can check ordering without requiring any trait impls on project
    /// types.  Pair-id bytes wrap at 256; uniqueness is not needed
    /// because `build_coverage_report` never inspects `pair_id`.
    fn unmatched_side(index: usize, reason: UnmatchedReason) -> UnmatchedSide {
        let seed = u8::try_from(index % 256).expect("index % 256 always fits in u8");
        UnmatchedSide {
            pair_id: IcPairId::from_bytes([seed; 32]),
            present_role: IcRole::Seller,
            present_entity: format!("ENT{}", index),
            present_je: dummy_je(),
            reason,
        }
    }

    /// Histogram bucket for `reason`, or `None` when the key is absent.
    fn count(report: &CoverageReport, reason: UnmatchedReason) -> Option<usize> {
        report.unmatched_by_reason.get(&reason).copied()
    }

    #[test]
    fn build_from_empty_matches_zero_coverage() {
        let r = build_coverage_report(&empty_match_result());
        assert_eq!(r.total_pairs_planned, 0);
        assert_eq!(r.matched, 0);
        assert_eq!(r.coverage, 0.0);
        assert!(r.unmatched_sample.is_empty());
        assert_eq!(r.unmatched_by_reason.len(), 3);
        assert_eq!(count(&r, UnmatchedReason::MissingBuyerSide), Some(0));
        assert_eq!(count(&r, UnmatchedReason::MissingSellerSide), Some(0));
        assert_eq!(
            count(&r, UnmatchedReason::AmountDriftAboveTolerance),
            Some(0)
        );
    }

    #[test]
    fn histogram_keys_always_present_even_when_zero() {
        // Result with one unmatched side using a single reason — the
        // other two reasons must still be present at zero so the
        // schema stays stable.
        let mr = IcMatchResult {
            matched: Vec::new(),
            unmatched: vec![unmatched_side(1, UnmatchedReason::MissingBuyerSide)],
            total_planned: 1,
            coverage: 0.0,
        };
        let r = build_coverage_report(&mr);
        assert_eq!(count(&r, UnmatchedReason::MissingBuyerSide), Some(1));
        assert_eq!(
            count(&r, UnmatchedReason::MissingSellerSide),
            Some(0),
            "MissingSellerSide must be present at 0 for schema stability"
        );
        assert_eq!(
            count(&r, UnmatchedReason::AmountDriftAboveTolerance),
            Some(0),
            "AmountDriftAboveTolerance must be present at 0 for schema stability"
        );
    }

    #[test]
    fn histogram_counts_each_reason_independently() {
        // 1 × buyer-missing, 2 × seller-missing, 3 × amount-drift.
        let reasons = [
            UnmatchedReason::MissingBuyerSide,
            UnmatchedReason::MissingSellerSide,
            UnmatchedReason::MissingSellerSide,
            UnmatchedReason::AmountDriftAboveTolerance,
            UnmatchedReason::AmountDriftAboveTolerance,
            UnmatchedReason::AmountDriftAboveTolerance,
        ];
        let unmatched: Vec<UnmatchedSide> = reasons
            .iter()
            .enumerate()
            .map(|(i, &reason)| unmatched_side(i, reason))
            .collect();
        let mr = IcMatchResult {
            matched: Vec::new(),
            unmatched,
            total_planned: reasons.len(),
            coverage: 0.0,
        };
        let r = build_coverage_report(&mr);
        assert_eq!(r.unmatched_by_reason.len(), 3);
        assert_eq!(count(&r, UnmatchedReason::MissingBuyerSide), Some(1));
        assert_eq!(count(&r, UnmatchedReason::MissingSellerSide), Some(2));
        assert_eq!(
            count(&r, UnmatchedReason::AmountDriftAboveTolerance),
            Some(3)
        );
        // Below the cap, every unmatched side appears in the sample.
        assert_eq!(r.unmatched_sample.len(), reasons.len());
    }

    #[test]
    fn sample_is_capped_while_histogram_counts_everything() {
        let total = UNMATCHED_SAMPLE_CAP + 50;
        let unmatched: Vec<UnmatchedSide> = (0..total)
            .map(|i| unmatched_side(i, UnmatchedReason::MissingBuyerSide))
            .collect();
        let expected_head: Vec<String> = unmatched
            .iter()
            .take(UNMATCHED_SAMPLE_CAP)
            .map(|side| side.present_entity.clone())
            .collect();
        let mr = IcMatchResult {
            matched: Vec::new(),
            unmatched,
            total_planned: total,
            coverage: 0.0,
        };

        let r = build_coverage_report(&mr);

        // The sample is truncated to the cap and keeps the input order…
        assert_eq!(r.unmatched_sample.len(), UNMATCHED_SAMPLE_CAP);
        let sampled: Vec<String> = r
            .unmatched_sample
            .iter()
            .map(|side| side.present_entity.clone())
            .collect();
        assert_eq!(sampled, expected_head);
        // …while the histogram still reflects every unmatched side.
        assert_eq!(count(&r, UnmatchedReason::MissingBuyerSide), Some(total));
    }

    #[test]
    fn build_passes_through_matched_count_and_coverage() {
        let je = dummy_je();
        let pair = IcMatchedPair {
            pair_id: IcPairId::from_bytes([2; 32]),
            seller_entity: "S".to_string(),
            buyer_entity: "B".to_string(),
            seller_je: je.clone(),
            buyer_je: je,
        };
        let mr = IcMatchResult {
            matched: vec![pair],
            unmatched: Vec::new(),
            total_planned: 2,
            coverage: 0.5,
        };
        let r = build_coverage_report(&mr);
        assert_eq!(r.total_pairs_planned, 2);
        assert_eq!(r.matched, 1);
        assert_eq!(r.coverage, 0.5);
    }
}