Skip to main content

devboy_format_pipeline/
round_trip.rs

1//! Round-trip correctness gate for L1 / L2 encoders.
2//!
3//! Paper 2 §Encoder Bug Postmortem (2026-04-25) describes a class of
4//! silent data-loss bugs where the chosen encoder produced shorter output
5//! at the cost of dropping wrapping fields or nested-object cells. The
6//! `mckp_v2` encoder (`deep_mckp_with_inner_table`) was the fix; this
7//! module is the regression test that ensures no future change re-introduces
8//! the bug.
9//!
10//! Each registered encoder declares its [`DataLoss`] guarantee:
11//!
12//! - [`DataLoss::None`] — every top-level and nested key in the input must
13//!   appear (textually) in the output. `mckp_v2` and `json_compact` must
14//!   meet this bar.
15//! - [`DataLoss::TopLevel`] — wrapping object's top-level fields are
16//!   intentionally dropped. Naive `csv` / `markdown_table` are in this
17//!   category and **may not be selected as the production default** (see
18//!   §Implementation Status migration warning).
19//! - [`DataLoss::Nested`] — nested values inside otherwise-preserved
20//!   wrappers may be flattened or dropped.
21//!
22//! The `round_trip_keys` API parses the encoded form back to a textual key
23//! set and compares it against the input. The shape of "decoder" is
24//! deliberately tolerant — we only check membership of key strings, not
25//! type fidelity, since the encoders are explicitly token-saving and not
26//! always type-lossless.
27
28use std::collections::BTreeSet;
29
/// Whether an encoder is allowed to drop input keys, and where.
///
/// Each encoder id is mapped to one of these variants by `declared_loss`.
/// Note that the variant named `None` is this enum's own "no loss allowed"
/// case and is always written as `DataLoss::None`; it is unrelated to
/// `Option::None`.
///
/// `KeyComparison::is_within_contract` only enforces `DataLoss::None`
/// strictly; `TopLevel` and `Nested` are both treated there as "any drop is
/// tolerated", so the finer distinctions described on those variants are
/// documentation of intent rather than something the gate checks.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataLoss {
    /// Encoder must preserve every input key (top-level and nested).
    /// Failing this is a bug.
    None,
    /// Encoder may drop the wrapping object's top-level keys (sibling
    /// fields of the chosen array). It must still preserve all keys
    /// inside the array elements.
    TopLevel,
    /// Encoder may drop nested-object keys inside array elements
    /// (e.g. older naive CSV that only kept primitive columns).
    Nested,
}
44
/// Result of a round-trip key comparison.
///
/// Built by `round_trip_keys`; all key sets are flat (depth is not
/// recorded), so a key name that occurs at several nesting levels appears
/// once.
#[derive(Debug, Clone)]
pub struct KeyComparison {
    /// Id of the encoder whose output was inspected.
    pub encoder_id: String,
    /// Every key found in the raw JSON input (empty if it did not parse).
    pub input_keys: BTreeSet<String>,
    /// Every key the encoder-specific decoder recovered from the output.
    pub output_keys: BTreeSet<String>,
    /// Keys present in `input_keys` but missing from `output_keys`.
    pub dropped: Vec<String>,
    /// Keys present in `output_keys` but not in `input_keys`.
    pub added: Vec<String>,
    /// Loss guarantee the encoder declared for itself.
    pub allowed_loss: DataLoss,
}
55
56impl KeyComparison {
57    /// True iff the comparison meets the encoder's declared guarantee.
58    pub fn is_within_contract(&self) -> bool {
59        match self.allowed_loss {
60            DataLoss::None => self.dropped.is_empty(),
61            // Tolerate any drop: encoder is documented as lossy. Production
62            // code paths must already gate on this (refusing to set a
63            // lossy encoder as the default).
64            DataLoss::TopLevel | DataLoss::Nested => true,
65        }
66    }
67
68    /// Compact summary for assertions / paper tables.
69    pub fn report(&self) -> String {
70        format!(
71            "encoder={} input_keys={} output_keys={} dropped={:?} added={:?} loss={:?}",
72            self.encoder_id,
73            self.input_keys.len(),
74            self.output_keys.len(),
75            self.dropped,
76            self.added,
77            self.allowed_loss,
78        )
79    }
80}
81
82/// Loss profile declared by each encoder.
83///
84/// The list is the source of truth — adding a new encoder requires adding
85/// a row here, which forces the author to think about which guarantee it
86/// holds (and forces the test below to cover it).
87pub fn declared_loss(encoder_id: &str) -> Option<DataLoss> {
88    Some(match encoder_id {
89        "json_compact" | "deep_mckp" | "deep_mckp_inner_table" | "mckp_v2" => DataLoss::None,
90        // `kv` and `mr_diff_fence` flatten / serialise structured values
91        // into a single line — we still expect every key to be visible
92        // textually, so they are `None` for *key* preservation.
93        "kv" | "mr_diff_fence" => DataLoss::None,
94        // `csv` (try_array_csv) and `csv_from_md` discard wrapping object
95        // context (see §Encoder Bug Postmortem). Keys *inside* array
96        // elements are preserved (union of keys), but a top-level
97        // wrapper's siblings are dropped.
98        "csv" | "csv_from_md" => DataLoss::TopLevel,
99        _ => return None,
100    })
101}
102
103/// Run a round-trip key comparison for a known encoder id and the body it
104/// produced from `raw_input`. Returns `None` if the encoder id is unknown.
105pub fn round_trip_keys(
106    encoder_id: &str,
107    raw_input: &str,
108    encoded_output: &str,
109) -> Option<KeyComparison> {
110    let allowed_loss = declared_loss(encoder_id)?;
111    let input_keys = collect_json_keys(raw_input).unwrap_or_default();
112    let output_keys = decode_keys(encoder_id, encoded_output);
113    let dropped: Vec<String> = input_keys.difference(&output_keys).cloned().collect();
114    let added: Vec<String> = output_keys.difference(&input_keys).cloned().collect();
115    Some(KeyComparison {
116        encoder_id: encoder_id.to_string(),
117        input_keys,
118        output_keys,
119        dropped,
120        added,
121        allowed_loss,
122    })
123}
124
125/// Walk a JSON value and return every `key` seen at any depth — including
126/// keys nested inside arrays and inside string-valued cells that happen to
127/// parse back as JSON. Free-form text and primitives contribute no keys.
128fn collect_json_keys(raw: &str) -> Option<BTreeSet<String>> {
129    let val: serde_json::Value = serde_json::from_str(raw.trim_start()).ok()?;
130    let mut out = BTreeSet::new();
131    walk_value(&val, &mut out);
132    Some(out)
133}
134
135fn walk_value(v: &serde_json::Value, out: &mut BTreeSet<String>) {
136    match v {
137        serde_json::Value::Object(map) => {
138            for (k, child) in map {
139                out.insert(k.clone());
140                walk_value(child, out);
141            }
142        }
143        serde_json::Value::Array(arr) => {
144            for child in arr {
145                walk_value(child, out);
146            }
147        }
148        _ => {}
149    }
150}
151
152/// Decode the textual key set out of an encoder's output. The decoders are
153/// permissive — they look for any token that *could* be a key, since the
154/// goal is to detect *missing* keys, not validate full syntactic recovery.
155fn decode_keys(encoder_id: &str, encoded: &str) -> BTreeSet<String> {
156    match encoder_id {
157        "json_compact" | "deep_mckp" => {
158            // Compact JSON parses cleanly back to a Value.
159            collect_json_keys(encoded).unwrap_or_default()
160        }
161        "deep_mckp_inner_table" | "mckp_v2" => decode_inner_table_keys(encoded),
162        "csv" | "csv_from_md" => decode_csv_header_keys(encoded),
163        "kv" => decode_kv_keys(encoded),
164        "mr_diff_fence" => decode_diff_fence_keys(encoded),
165        _ => BTreeSet::new(),
166    }
167}
168
169/// Parse the deep_mckp_with_inner_table output: top-level `key: value`
170/// lines (until the first blank line), then a `## <main_array_name>`
171/// section heading, then a markdown table. Returns the union of pre-table
172/// kv keys, the heading, the table headers, and any keys recovered from
173/// inline-JSON cells.
174fn decode_inner_table_keys(encoded: &str) -> BTreeSet<String> {
175    let mut out = BTreeSet::new();
176    let mut lines = encoded.lines().peekable();
177
178    // Phase 1: top-level kv lines.
179    while let Some(line) = lines.peek() {
180        if line.trim().is_empty() {
181            lines.next();
182            break;
183        }
184        if line.starts_with("## ") || line.starts_with("| ") || line.starts_with("|---") {
185            // Reached the section heading or table without a separating
186            // blank line — stop kv parsing.
187            break;
188        }
189        let line = lines.next().unwrap();
190        if let Some((k, v)) = line.split_once(": ") {
191            out.insert(k.trim().to_string());
192            // The value may itself be inline JSON — recover its keys too.
193            if let Ok(val) = serde_json::from_str::<serde_json::Value>(v.trim()) {
194                walk_value(&val, &mut out);
195            }
196        }
197    }
198
199    // Phase 2 (optional): `## <main_array_name>` section heading carries
200    // the wrapping object's array key.
201    while let Some(line) = lines.peek() {
202        if line.trim().is_empty() {
203            lines.next();
204            continue;
205        }
206        if let Some(rest) = line.strip_prefix("## ") {
207            out.insert(rest.trim().to_string());
208            lines.next();
209            // Consume the blank line between heading and table.
210            if matches!(lines.peek(), Some(l) if l.trim().is_empty()) {
211                lines.next();
212            }
213        }
214        break;
215    }
216
217    // Phase 3: markdown table header.
218    if let Some(header) = lines.next() {
219        for cell in split_md_row(header) {
220            if !cell.is_empty() {
221                out.insert(cell);
222            }
223        }
224        // Skip the `| --- | --- |` separator.
225        let _ = lines.next();
226    }
227
228    // Phase 3: inline-JSON cells inside data rows.
229    for row in lines {
230        for cell in split_md_row(row) {
231            if (cell.starts_with('{') && cell.ends_with('}'))
232                || (cell.starts_with('[') && cell.ends_with(']'))
233            {
234                let unescaped = cell.replace("\\|", "|");
235                if let Ok(val) = serde_json::from_str::<serde_json::Value>(&unescaped) {
236                    walk_value(&val, &mut out);
237                }
238            }
239        }
240    }
241
242    out
243}
244
/// Splits one markdown table row into its cells.
///
/// Surrounding whitespace and the outer `|` characters are removed, the
/// remainder is cut on the exact ` | ` delimiter, and each piece is
/// trimmed. Empty cells are omitted, so the result carries no column
/// positions. An escaped pipe (`\|`) never matches the delimiter and stays
/// inside its cell.
fn split_md_row(line: &str) -> Vec<String> {
    let inner = line.trim().trim_matches('|');
    let mut cells = Vec::new();
    for piece in inner.split(" | ") {
        let cell = piece.trim();
        if !cell.is_empty() {
            cells.push(cell.to_owned());
        }
    }
    cells
}
253
/// CSV (or csv_from_md) output — keys are the fields of the header record.
///
/// The header is read with a small quote-aware scanner rather than a plain
/// `split(',')`, so a quoted column name may contain commas, doubled quotes
/// (`""` decodes to `"`) and even line breaks without being torn into bogus
/// keys. Unquoted fields are trimmed; quoted fields keep their content
/// verbatim. Empty fields are ignored. A `"` in the middle of an unquoted
/// field is kept as a literal character.
///
/// Wrapping object's top-level keys are *not* recoverable from this
/// representation; that drop is the documented `DataLoss::TopLevel`.
fn decode_csv_header_keys(encoded: &str) -> BTreeSet<String> {
    // Store the finished field (if non-empty) and reset the per-field state.
    fn finish_field(field: &mut String, was_quoted: &mut bool, keys: &mut BTreeSet<String>) {
        let key = if *was_quoted { field.as_str() } else { field.trim() };
        if !key.is_empty() {
            keys.insert(key.to_string());
        }
        field.clear();
        *was_quoted = false;
    }

    let mut keys = BTreeSet::new();
    let mut field = String::new();
    let mut was_quoted = false; // current field contained a quoted section
    let mut in_quotes = false; // currently inside a quoted section

    let mut chars = encoded.chars().peekable();
    while let Some(c) = chars.next() {
        if in_quotes {
            match c {
                // `""` inside quotes is an escaped quote character.
                '"' if chars.peek() == Some(&'"') => {
                    chars.next();
                    field.push('"');
                }
                '"' => in_quotes = false,
                _ => field.push(c),
            }
            continue;
        }
        match c {
            // First unquoted newline ends the header record.
            '\n' => break,
            ',' => finish_field(&mut field, &mut was_quoted, &mut keys),
            // A quote opens a quoted section only at the start of a field
            // (ignoring padding) or right after a previous quoted section.
            '"' if was_quoted || field.trim().is_empty() => {
                if !was_quoted {
                    field.clear();
                    was_quoted = true;
                }
                in_quotes = true;
            }
            // Padding (and a trailing `\r`) after a closing quote is not
            // part of the key.
            _ if was_quoted && c.is_whitespace() => {}
            _ => field.push(c),
        }
    }
    finish_field(&mut field, &mut was_quoted, &mut keys);
    keys
}
265
266/// `key: value` lines, one per pair.
267fn decode_kv_keys(encoded: &str) -> BTreeSet<String> {
268    let mut out = BTreeSet::new();
269    for line in encoded.lines() {
270        if let Some((k, v)) = line.split_once(": ") {
271            out.insert(k.trim().to_string());
272            if let Ok(val) = serde_json::from_str::<serde_json::Value>(v.trim()) {
273                walk_value(&val, &mut out);
274            }
275        }
276    }
277    out
278}
279
/// Approximates the key set of `mr_diff_fence` output.
///
/// The output is a sequence of fenced blocks with a path header, so there
/// are no literal keys to parse. Instead each of the labels `diffs`,
/// `path`, `diff` and `content` is reported as present when it occurs
/// anywhere in the output, ignoring ASCII case. This is a plain substring
/// test: `diffs` implies `diff`, and a ```` ```diff ```` fence alone is
/// enough for `diff` to count.
fn decode_diff_fence_keys(encoded: &str) -> BTreeSet<String> {
    const LABELS: [&str; 4] = ["diffs", "path", "diff", "content"];

    let haystack = encoded.to_ascii_lowercase();
    LABELS
        .iter()
        .filter(|label| haystack.contains(**label))
        .map(|label| (*label).to_owned())
        .collect()
}
297
// Regression tests for the round-trip gate. Tests that exercise a real
// encoder go through `crate::templates` with a classification from
// `crate::shape::classify`; the remaining tests feed hand-written encoder
// output straight into `round_trip_keys`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::shape::classify;
    use crate::templates;

    // Key set of `raw`, or the empty set when `raw` is not valid JSON.
    fn keys_of(raw: &str) -> BTreeSet<String> {
        collect_json_keys(raw).unwrap_or_default()
    }

    // ─── mckp_v2 / deep_mckp_inner_table — must preserve EVERY key ─────

    #[test]
    fn mckp_v2_preserves_top_level_and_nested_keys() {
        // Wrapper fields (`company`, `year`) plus an array whose elements
        // carry a nested object (`address.city`).
        let raw = r#"{
            "company": "Acme",
            "year": 2026,
            "employees": [
                {"id": 1, "name": "Ada", "address": {"city": "Boston"}},
                {"id": 2, "name": "Lin", "address": {"city": "Tokyo"}, "phone": "555"}
            ]
        }"#;
        let cls = classify(raw);
        let body = templates::deep_mckp_with_inner_table(raw, &cls)
            .expect("mckp_v2 should engage on object-wrapping-array shape");
        let cmp = round_trip_keys("mckp_v2", raw, &body).expect("encoder is registered");
        assert!(
            cmp.is_within_contract(),
            "mckp_v2 dropped keys: {}",
            cmp.report()
        );
        // Spot-check the specific keys called out in the postmortem.
        for k in [
            "company",
            "year",
            "employees",
            "id",
            "name",
            "address",
            "city",
        ] {
            assert!(
                cmp.output_keys.contains(k),
                "expected key `{k}` in mckp_v2 output, got {:?}",
                cmp.output_keys
            );
        }
    }

    #[test]
    fn mckp_v2_preserves_keys_when_inner_objects_are_heterogeneous() {
        // Last record has an extra `phone` field — union-of-keys must
        // include it.
        let raw = r#"{
            "scope": "ops",
            "items": [
                {"id": 1, "ok": true},
                {"id": 2, "ok": false, "phone": "x"}
            ]
        }"#;
        let cls = classify(raw);
        let body = templates::deep_mckp_with_inner_table(raw, &cls).unwrap();
        let cmp = round_trip_keys("mckp_v2", raw, &body).unwrap();
        assert!(cmp.is_within_contract(), "{}", cmp.report());
        assert!(cmp.output_keys.contains("phone"));
        assert!(cmp.output_keys.contains("scope"));
    }

    #[test]
    fn mckp_v2_returns_none_when_no_inner_array() {
        // Without a homogeneous inner array, the encoder declines. The
        // gate doesn't apply (no encoded output to compare).
        let raw = r#"{"a": 1, "b": 2}"#;
        let cls = classify(raw);
        assert!(templates::deep_mckp_with_inner_table(raw, &cls).is_none());
    }

    // ─── pipeline_deep_mckp / json_compact — fully lossless ───────────

    #[test]
    fn pipeline_deep_mckp_is_lossless() {
        let raw = r#"{
            "url_a": "https://example.com",
            "log": "line1\nline2",
            "hash": "deadbeef",
            "nested": {"k": "v"}
        }"#;
        let cls = classify(raw);
        let body = templates::pipeline_deep_mckp(raw, &cls).unwrap_or_else(|| {
            // If the template doesn't engage on this shape, fall back to
            // serde_json compaction which is the L2 last resort.
            serde_json::to_string(&serde_json::from_str::<serde_json::Value>(raw).unwrap()).unwrap()
        });
        let cmp = round_trip_keys("deep_mckp", raw, &body).unwrap();
        assert!(cmp.is_within_contract(), "{}", cmp.report());
        assert_eq!(cmp.dropped.len(), 0);
    }

    #[test]
    fn json_compact_is_lossless() {
        // Re-serialising the parsed value stands in for the compact encoder.
        let raw = r#"{"id":1,"items":[{"a":2},{"b":3}]}"#;
        let body = serde_json::to_string(&serde_json::from_str::<serde_json::Value>(raw).unwrap())
            .unwrap();
        let cmp = round_trip_keys("json_compact", raw, &body).unwrap();
        assert!(cmp.is_within_contract());
        assert_eq!(cmp.dropped.len(), 0);
    }

    // ─── csv / csv_from_md — declared lossy (TopLevel) ─────────────────

    #[test]
    fn naive_csv_drops_top_level_wrapper_as_documented() {
        // Input has a wrapping object with `meta` and `rows`. Naive CSV
        // would emit only the rows table — `meta` *will* be dropped.
        // We assert the gate flags this as expected loss, not a regression.
        let raw = r#"{
            "meta": "report-2026-04-25",
            "rows": [
                {"id": 1, "v": "a"},
                {"id": 2, "v": "b"}
            ]
        }"#;
        // Hand-build the naive CSV that the bug used to emit (the lossy
        // historical path, kept here as the regression target).
        let body = "id,v\n1,a\n2,b\n";
        let cmp = round_trip_keys("csv", raw, body).unwrap();
        assert_eq!(cmp.allowed_loss, DataLoss::TopLevel);
        // Documented loss: `meta` and `rows` (the wrapper) are gone.
        assert!(cmp.dropped.iter().any(|k| k == "meta"));
        assert!(cmp.dropped.iter().any(|k| k == "rows"));
        // But keys inside the array are recoverable from the CSV header.
        assert!(cmp.output_keys.contains("id"));
        assert!(cmp.output_keys.contains("v"));
        // `is_within_contract` tolerates the documented loss — failing it
        // would mean we tightened the contract.
        assert!(cmp.is_within_contract());
    }

    #[test]
    fn csv_from_md_documents_the_same_loss() {
        // markdown_table → csv only carries the table itself; any
        // surrounding section/heading text is lost on this path.
        let md = "# Report 2026-04-25\n\n| id | v |\n|---|---|\n| 1 | a |\n| 2 | b |\n";
        let cls = classify(md);
        let body = templates::csv_from_md(md, &cls).unwrap();
        // No native JSON input here, so we synthesise the "logical" input
        // (what the markdown represents) for the comparison.
        let logical =
            r#"{"heading":"Report 2026-04-25","rows":[{"id":"1","v":"a"},{"id":"2","v":"b"}]}"#;
        let cmp = round_trip_keys("csv_from_md", logical, &body).unwrap();
        assert_eq!(cmp.allowed_loss, DataLoss::TopLevel);
        assert!(cmp.is_within_contract());
        assert!(cmp.output_keys.contains("id"));
    }

    // ─── kv format — keys preserved, values flattened ─────────────────

    #[test]
    fn kv_format_preserves_all_top_level_keys() {
        let raw = r#"{"alpha":1,"beta":"two","gamma":true,"delta":null,"epsilon":3.14}"#;
        // Hand-written kv body; note `delta: ` keeps its trailing space, which
        // the decoder relies on to recognise the `": "` delimiter.
        let body = "alpha: 1\nbeta: two\ngamma: true\ndelta: \nepsilon: 3.14\n";
        let cmp = round_trip_keys("kv", raw, body).unwrap();
        assert!(cmp.is_within_contract(), "{}", cmp.report());
        for k in ["alpha", "beta", "gamma", "delta", "epsilon"] {
            assert!(cmp.output_keys.contains(k));
        }
    }

    // ─── meta-test: every encoder id used by the pipeline has a row ───

    #[test]
    fn declared_loss_table_covers_known_encoders() {
        for id in [
            "json_compact",
            "deep_mckp",
            "deep_mckp_inner_table",
            "mckp_v2",
            "csv",
            "csv_from_md",
            "kv",
            "mr_diff_fence",
        ] {
            assert!(
                declared_loss(id).is_some(),
                "encoder id `{id}` missing from declared_loss table"
            );
        }
        // Defensive: an unknown id should still return None so the test
        // does not silently flag a typo as `None` loss.
        assert!(declared_loss("totally_made_up").is_none());
    }

    #[test]
    fn empty_input_collects_no_keys() {
        // Empty text, non-JSON text, and JSON without any object all yield
        // an empty key set.
        assert!(keys_of("").is_empty());
        assert!(keys_of("not json").is_empty());
        assert!(keys_of("[1,2,3]").is_empty());
    }
}