devboy-format-pipeline 0.28.1

Format pipeline for devboy-tools — TOON encoding, MCKP tree-budget trimming, cursor pagination, deduplication; the output stage shared by every devboy provider.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
//! Round-trip correctness gate for L1 / L2 encoders.
//!
//! Paper 2 §Encoder Bug Postmortem (2026-04-25) describes a class of
//! silent data-loss bugs where the chosen encoder produced shorter output
//! at the cost of dropping wrapping fields or nested-object cells. The
//! `mckp_v2` encoder (`deep_mckp_with_inner_table`) was the fix; this
//! module is the regression test that ensures no future change re-introduces
//! the bug.
//!
//! Each registered encoder declares its [`DataLoss`] guarantee:
//!
//! - [`DataLoss::None`] — every top-level and nested key in the input must
//!   appear (textually) in the output. `mckp_v2` and `json_compact` must
//!   meet this bar.
//! - [`DataLoss::TopLevel`] — wrapping object's top-level fields are
//!   intentionally dropped. Naive `csv` / `markdown_table` are in this
//!   category and **may not be selected as the production default** (see
//!   §Implementation Status migration warning).
//! - [`DataLoss::Nested`] — nested values inside otherwise-preserved
//!   wrappers may be flattened or dropped.
//!
//! The [`round_trip_keys`] API parses the encoded form back to a textual key
//! set and compares it against the keys of the input. The per-encoder
//! "decoders" are deliberately tolerant — we only check membership of key
//! strings, not type fidelity, since the encoders are explicitly
//! token-saving and not always type-lossless.

use std::collections::BTreeSet;

/// Whether an encoder is allowed to drop input keys, and where.
///
/// [`declared_loss`] maps each known encoder id to one of these variants,
/// and [`KeyComparison::is_within_contract`] interprets the variant when
/// judging a comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataLoss {
    /// Encoder must preserve every input key (top-level and nested).
    /// Failing this is a bug.
    None,
    /// Encoder may drop the wrapping object's top-level keys (sibling
    /// fields of the chosen array). It must still preserve all keys
    /// inside the array elements.
    ///
    /// Note: `is_within_contract` does not verify the "inside the array"
    /// part — it accepts any drop for this variant.
    TopLevel,
    /// Encoder may drop nested-object keys inside array elements
    /// (e.g. older naive CSV that only kept primitive columns).
    Nested,
}

/// Result of a round-trip key comparison.
///
/// Built by [`round_trip_keys`]; all key sets are plain key *names*, with
/// no path or nesting information attached.
#[derive(Debug, Clone)]
pub struct KeyComparison {
    /// Id of the encoder that produced the compared output.
    pub encoder_id: String,
    /// Every object key found in the raw JSON input (empty when the input
    /// did not parse as JSON).
    pub input_keys: BTreeSet<String>,
    /// Every key-like token recovered from the encoded output.
    pub output_keys: BTreeSet<String>,
    /// Keys present in `input_keys` but missing from `output_keys`.
    pub dropped: Vec<String>,
    /// Tokens present in `output_keys` but not in `input_keys`.
    pub added: Vec<String>,
    /// Loss guarantee the encoder declared for itself.
    pub allowed_loss: DataLoss,
}

impl KeyComparison {
    /// Whether the observed key loss stays inside what the encoder declared.
    ///
    /// A `DataLoss::None` encoder passes only when nothing was dropped.
    /// Encoders declared lossy (`TopLevel`, `Nested`) always pass: they
    /// are documented as lossy, and production code paths are expected to
    /// refuse them as the default encoder rather than rely on this check.
    pub fn is_within_contract(&self) -> bool {
        let must_be_lossless = match self.allowed_loss {
            DataLoss::None => true,
            DataLoss::TopLevel | DataLoss::Nested => false,
        };
        !must_be_lossless || self.dropped.is_empty()
    }

    /// One-line summary suitable for assertion messages and paper tables:
    /// encoder id, key-set sizes, the dropped / added lists and the
    /// declared loss.
    pub fn report(&self) -> String {
        let Self {
            encoder_id,
            input_keys,
            output_keys,
            dropped,
            added,
            allowed_loss,
        } = self;
        let input_count = input_keys.len();
        let output_count = output_keys.len();
        format!(
            "encoder={} input_keys={} output_keys={} dropped={:?} added={:?} loss={:?}",
            encoder_id, input_count, output_count, dropped, added, allowed_loss,
        )
    }
}

/// Look up the loss guarantee an encoder has declared for itself.
///
/// This table is the single source of truth: a new encoder needs a row
/// here, which makes its author pick a guarantee explicitly (and makes the
/// tests in this module cover it). Unknown ids yield `None` (the `Option`,
/// not the variant) so a typo is never mistaken for a lossless encoder.
pub fn declared_loss(encoder_id: &str) -> Option<DataLoss> {
    match encoder_id {
        // Structured encoders that keep the full key set.
        "json_compact" | "deep_mckp" | "deep_mckp_inner_table" | "mckp_v2" => {
            Some(DataLoss::None)
        }
        // `kv` and `mr_diff_fence` flatten / serialise structured values
        // onto single lines, yet every key is still expected to show up
        // textually — so for *key* preservation they count as lossless.
        "kv" | "mr_diff_fence" => Some(DataLoss::None),
        // `csv` (try_array_csv) and `csv_from_md` throw away the wrapping
        // object's context (see §Encoder Bug Postmortem): the union of
        // keys inside the array elements survives, the wrapper's sibling
        // fields do not.
        "csv" | "csv_from_md" => Some(DataLoss::TopLevel),
        _ => None,
    }
}

/// Compare the keys of `raw_input` with the keys recoverable from
/// `encoded_output`, the body that encoder `encoder_id` produced from it.
///
/// Returns `None` when the encoder id has no entry in [`declared_loss`].
/// An input that does not parse as JSON is treated as having no keys, so
/// the resulting comparison reports nothing as dropped.
pub fn round_trip_keys(
    encoder_id: &str,
    raw_input: &str,
    encoded_output: &str,
) -> Option<KeyComparison> {
    declared_loss(encoder_id).map(|allowed_loss| {
        let input_keys = collect_json_keys(raw_input).unwrap_or_default();
        let output_keys = decode_keys(encoder_id, encoded_output);

        // Set differences in both directions: what the encoder lost, and
        // which extra tokens the permissive decoder picked up.
        let dropped = input_keys.difference(&output_keys).cloned().collect();
        let added = output_keys.difference(&input_keys).cloned().collect();

        KeyComparison {
            encoder_id: encoder_id.to_owned(),
            input_keys,
            output_keys,
            dropped,
            added,
            allowed_loss,
        }
    })
}

/// Parse `raw` as JSON and return every object key seen at any depth,
/// including keys of objects nested inside arrays.
///
/// Returns `None` when `raw` (after leading whitespace is trimmed) is not
/// valid JSON. Primitives contribute no keys; string values are taken as
/// opaque text and are *not* re-parsed, even if they happen to contain
/// JSON.
fn collect_json_keys(raw: &str) -> Option<BTreeSet<String>> {
    match serde_json::from_str::<serde_json::Value>(raw.trim_start()) {
        Ok(root) => {
            let mut keys = BTreeSet::new();
            walk_value(&root, &mut keys);
            Some(keys)
        }
        Err(_) => None,
    }
}

/// Add every object key reachable from `v` to `out`.
///
/// Objects contribute their keys and are descended into; arrays are
/// descended into; every other value kind contributes nothing. The
/// traversal uses an explicit work list instead of recursion — the
/// visiting order differs, the resulting set does not.
fn walk_value(v: &serde_json::Value, out: &mut BTreeSet<String>) {
    let mut pending: Vec<&serde_json::Value> = vec![v];
    while let Some(node) = pending.pop() {
        match node {
            serde_json::Value::Object(fields) => {
                for (name, child) in fields {
                    out.insert(name.clone());
                    pending.push(child);
                }
            }
            serde_json::Value::Array(items) => pending.extend(items.iter()),
            _ => {}
        }
    }
}

/// Recover the textual key set from `encoded`, using the decoder that
/// matches `encoder_id`.
///
/// The decoders err on the side of accepting anything that *could* be a
/// key: the aim is to notice *missing* keys, not to validate that the
/// output can be parsed back in full. Unknown encoder ids yield an empty
/// set.
fn decode_keys(encoder_id: &str, encoded: &str) -> BTreeSet<String> {
    // Compact JSON parses cleanly back to a Value, so the input-side key
    // collector doubles as its decoder.
    if matches!(encoder_id, "json_compact" | "deep_mckp") {
        return collect_json_keys(encoded).unwrap_or_default();
    }

    match encoder_id {
        "mr_diff_fence" => decode_diff_fence_keys(encoded),
        "kv" => decode_kv_keys(encoded),
        "csv" | "csv_from_md" => decode_csv_header_keys(encoded),
        "deep_mckp_inner_table" | "mckp_v2" => decode_inner_table_keys(encoded),
        _ => BTreeSet::new(),
    }
}

/// Parse the deep_mckp_with_inner_table output: top-level `key: value`
/// lines (until the first blank line), then a `## <main_array_name>`
/// section heading, then a markdown table. Returns the union of pre-table
/// kv keys, the heading, the table headers, and any keys recovered from
/// inline-JSON cells.
///
/// The parse is a single forward pass over the lines in four phases; each
/// phase consumes lines from the shared iterator, so their order matters.
/// Lines that do not fit the expected layout are skipped rather than
/// reported — the result is a best-effort key set, never an error.
fn decode_inner_table_keys(encoded: &str) -> BTreeSet<String> {
    let mut out = BTreeSet::new();
    // Peekable so a phase can inspect the next line and leave it for the
    // following phase when it does not belong to the current one.
    let mut lines = encoded.lines().peekable();

    // Phase 1: top-level kv lines.
    while let Some(line) = lines.peek() {
        if line.trim().is_empty() {
            // The blank separator ends the kv section; consume it.
            lines.next();
            break;
        }
        if line.starts_with("## ") || line.starts_with("| ") || line.starts_with("|---") {
            // Reached the section heading or table without a separating
            // blank line — stop kv parsing.
            break;
        }
        let line = lines.next().unwrap();
        // Only lines containing `": "` count as pairs; anything else in
        // the kv section is consumed and ignored.
        if let Some((k, v)) = line.split_once(": ") {
            out.insert(k.trim().to_string());
            // The value may itself be inline JSON — recover its keys too.
            if let Ok(val) = serde_json::from_str::<serde_json::Value>(v.trim()) {
                walk_value(&val, &mut out);
            }
        }
    }

    // Phase 2 (optional): `## <main_array_name>` section heading carries
    // the wrapping object's array key.
    while let Some(line) = lines.peek() {
        if line.trim().is_empty() {
            // Skip any further blank lines before the heading / table.
            lines.next();
            continue;
        }
        if let Some(rest) = line.strip_prefix("## ") {
            out.insert(rest.trim().to_string());
            lines.next();
            // Consume the blank line between heading and table.
            if matches!(lines.peek(), Some(l) if l.trim().is_empty()) {
                lines.next();
            }
        }
        // The first non-blank line ends this phase whether or not it was
        // a heading; a non-heading line is left for phase 3.
        break;
    }

    // Phase 3: markdown table header.
    if let Some(header) = lines.next() {
        for cell in split_md_row(header) {
            // `split_md_row` already drops empty cells; this check is
            // purely defensive.
            if !cell.is_empty() {
                out.insert(cell);
            }
        }
        // Skip the `| --- | --- |` separator.
        let _ = lines.next();
    }

    // Phase 4: inline-JSON cells inside data rows.
    for row in lines {
        for cell in split_md_row(row) {
            // Only cells bracketed like a JSON object or array are
            // candidates; plain scalar cells carry no keys.
            if (cell.starts_with('{') && cell.ends_with('}'))
                || (cell.starts_with('[') && cell.ends_with(']'))
            {
                // Undo the `\|` escaping of pipes before parsing the cell
                // as JSON.
                let unescaped = cell.replace("\\|", "|");
                if let Ok(val) = serde_json::from_str::<serde_json::Value>(&unescaped) {
                    walk_value(&val, &mut out);
                }
            }
        }
    }

    out
}

/// Split one markdown table row into its trimmed, non-empty cells.
///
/// Surrounding whitespace and the outer `|` characters are stripped, then
/// the row is cut on the exact delimiter `" | "`. A pipe that is not
/// surrounded by spaces on both sides (such as an escaped `\|`, or the
/// pipes of a compact `|---|---|` separator) does not split a cell.
fn split_md_row(line: &str) -> Vec<String> {
    let inner = line.trim().trim_matches('|');
    let mut cells = Vec::new();
    for piece in inner.split(" | ") {
        let cell = piece.trim();
        if !cell.is_empty() {
            cells.push(cell.to_owned());
        }
    }
    cells
}

/// CSV (or csv_from_md) output — keys are the fields of the header row
/// (the first line of `encoded`).
/// Wrapping object's top-level keys are *not* recoverable from this
/// representation; that drop is the documented `DataLoss::TopLevel`.
///
/// The header is split with RFC 4180 quoting rules, so a quoted field may
/// contain commas (`"last, first"`) and doubled quotes (`""` decodes to
/// `"`) without being torn into bogus keys. If the quoting is malformed
/// (an unterminated quote, or text right after a closing quote) the
/// header falls back to a plain comma split with surrounding quotes
/// stripped. Empty fields are ignored.
fn decode_csv_header_keys(encoded: &str) -> BTreeSet<String> {
    let header = encoded.lines().next().unwrap_or("");
    let fields = split_quoted_csv_record(header).unwrap_or_else(|| {
        // Malformed quoting: stay permissive and treat every comma as a
        // separator rather than merge several keys into one.
        header
            .split(',')
            .map(|s| s.trim().trim_matches('"').to_string())
            .collect()
    });
    fields.into_iter().filter(|key| !key.is_empty()).collect()
}

/// Split a single CSV record into fields, honouring double-quoted fields.
/// Returns `None` when the quoting is malformed.
fn split_quoted_csv_record(record: &str) -> Option<Vec<String>> {
    let mut fields = Vec::new();
    let mut field = String::new();
    // `quoted`: the current field opened with a quote.
    // `in_quotes`: we are currently between its opening and closing quote.
    let mut quoted = false;
    let mut in_quotes = false;
    let mut chars = record.chars().peekable();

    while let Some(c) = chars.next() {
        if in_quotes {
            match c {
                // `""` inside a quoted field is an escaped quote.
                '"' if chars.peek() == Some(&'"') => {
                    chars.next();
                    field.push('"');
                }
                '"' => in_quotes = false,
                _ => field.push(c),
            }
            continue;
        }
        match c {
            ',' => {
                fields.push(finish_csv_field(&field, quoted));
                field.clear();
                quoted = false;
            }
            // A quote only opens a quoted field at the start of the field
            // (leading whitespace is tolerated and discarded).
            '"' if !quoted && field.trim().is_empty() => {
                field.clear();
                quoted = true;
                in_quotes = true;
            }
            // After the closing quote only padding may precede the comma.
            _ if quoted => {
                if !c.is_whitespace() {
                    return None;
                }
            }
            _ => field.push(c),
        }
    }

    if in_quotes {
        return None;
    }
    fields.push(finish_csv_field(&field, quoted));
    Some(fields)
}

/// Normalise one raw field: quoted content is kept verbatim, unquoted
/// content is trimmed of whitespace and of stray surrounding quotes.
fn finish_csv_field(field: &str, quoted: bool) -> String {
    if quoted {
        field.to_string()
    } else {
        field.trim().trim_matches('"').to_string()
    }
}

/// Keys of a `key: value` listing, one pair per line.
///
/// Each line is cut at its first `": "`; lines without that separator are
/// ignored. The text before the separator becomes a key, and if the text
/// after it parses as JSON, the keys nested inside are collected too.
fn decode_kv_keys(encoded: &str) -> BTreeSet<String> {
    let mut keys = BTreeSet::new();
    let pairs = encoded.lines().filter_map(|line| line.split_once(": "));
    for (name, value) in pairs {
        keys.insert(name.trim().to_owned());
        // Values that hold inline JSON contribute their nested keys.
        let parsed = serde_json::from_str::<serde_json::Value>(value.trim());
        if let Ok(json) = parsed {
            walk_value(&json, &mut keys);
        }
    }
    keys
}

/// Approximate key recovery for `mr_diff_fence` output, which is a
/// sequence of fenced blocks with a path header.
///
/// The encoder writes `path: <p>` lines and ```diff fences, while the
/// top-level JSON carried `diffs`, `path` and `content` / `diff` keys.
/// No structure is parsed here: each of those four labels counts as
/// present when it occurs anywhere in the output as a substring, compared
/// ASCII-case-insensitively.
fn decode_diff_fence_keys(encoded: &str) -> BTreeSet<String> {
    const LABELS: [&str; 4] = ["diffs", "path", "diff", "content"];

    let haystack = encoded.to_ascii_lowercase();
    LABELS
        .iter()
        .filter(|label| haystack.contains(**label))
        .map(|label| (*label).to_owned())
        .collect()
}

// Regression tests for the round-trip gate. Encoders with a template
// function in `crate::templates` are exercised on their real output; the
// `csv` and `kv` cases compare against hand-built bodies.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::shape::classify;
    use crate::templates;

    // Key set of `raw`, or the empty set when `raw` is not valid JSON.
    fn keys_of(raw: &str) -> BTreeSet<String> {
        collect_json_keys(raw).unwrap_or_default()
    }

    // ─── mckp_v2 / deep_mckp_inner_table — must preserve EVERY key ─────

    #[test]
    fn mckp_v2_preserves_top_level_and_nested_keys() {
        // Wrapper with sibling fields (`company`, `year`) around an
        // `employees` array whose elements carry a nested `address` object.
        let raw = r#"{
            "company": "Acme",
            "year": 2026,
            "employees": [
                {"id": 1, "name": "Ada", "address": {"city": "Boston"}},
                {"id": 2, "name": "Lin", "address": {"city": "Tokyo"}, "phone": "555"}
            ]
        }"#;
        let cls = classify(raw);
        let body = templates::deep_mckp_with_inner_table(raw, &cls)
            .expect("mckp_v2 should engage on object-wrapping-array shape");
        let cmp = round_trip_keys("mckp_v2", raw, &body).expect("encoder is registered");
        assert!(
            cmp.is_within_contract(),
            "mckp_v2 dropped keys: {}",
            cmp.report()
        );
        // Spot-check the specific keys called out in the postmortem.
        for k in [
            "company",
            "year",
            "employees",
            "id",
            "name",
            "address",
            "city",
        ] {
            assert!(
                cmp.output_keys.contains(k),
                "expected key `{k}` in mckp_v2 output, got {:?}",
                cmp.output_keys
            );
        }
    }

    #[test]
    fn mckp_v2_preserves_keys_when_inner_objects_are_heterogeneous() {
        // Last record has an extra `phone` field — union-of-keys must
        // include it.
        let raw = r#"{
            "scope": "ops",
            "items": [
                {"id": 1, "ok": true},
                {"id": 2, "ok": false, "phone": "x"}
            ]
        }"#;
        let cls = classify(raw);
        let body = templates::deep_mckp_with_inner_table(raw, &cls).unwrap();
        let cmp = round_trip_keys("mckp_v2", raw, &body).unwrap();
        assert!(cmp.is_within_contract(), "{}", cmp.report());
        assert!(cmp.output_keys.contains("phone"));
        assert!(cmp.output_keys.contains("scope"));
    }

    #[test]
    fn mckp_v2_returns_none_when_no_inner_array() {
        // Without a homogeneous inner array, the encoder declines. The
        // gate doesn't apply (no encoded output to compare).
        let raw = r#"{"a": 1, "b": 2}"#;
        let cls = classify(raw);
        assert!(templates::deep_mckp_with_inner_table(raw, &cls).is_none());
    }

    // ─── pipeline_deep_mckp / json_compact — fully lossless ───────────

    #[test]
    fn pipeline_deep_mckp_is_lossless() {
        let raw = r#"{
            "url_a": "https://example.com",
            "log": "line1\nline2",
            "hash": "deadbeef",
            "nested": {"k": "v"}
        }"#;
        let cls = classify(raw);
        let body = templates::pipeline_deep_mckp(raw, &cls).unwrap_or_else(|| {
            // If the template doesn't engage on this shape, fall back to
            // serde_json compaction which is the L2 last resort.
            serde_json::to_string(&serde_json::from_str::<serde_json::Value>(raw).unwrap()).unwrap()
        });
        let cmp = round_trip_keys("deep_mckp", raw, &body).unwrap();
        assert!(cmp.is_within_contract(), "{}", cmp.report());
        assert_eq!(cmp.dropped.len(), 0);
    }

    #[test]
    fn json_compact_is_lossless() {
        let raw = r#"{"id":1,"items":[{"a":2},{"b":3}]}"#;
        // Re-serialise through `serde_json::Value` to obtain the compact
        // form that is compared against the input.
        let body = serde_json::to_string(&serde_json::from_str::<serde_json::Value>(raw).unwrap())
            .unwrap();
        let cmp = round_trip_keys("json_compact", raw, &body).unwrap();
        assert!(cmp.is_within_contract());
        assert_eq!(cmp.dropped.len(), 0);
    }

    // ─── csv / csv_from_md — declared lossy (TopLevel) ─────────────────

    #[test]
    fn naive_csv_drops_top_level_wrapper_as_documented() {
        // Input has a wrapping object with `meta` and `rows`. Naive CSV
        // would emit only the rows table — `meta` *will* be dropped.
        // We assert the gate flags this as expected loss, not a regression.
        let raw = r#"{
            "meta": "report-2026-04-25",
            "rows": [
                {"id": 1, "v": "a"},
                {"id": 2, "v": "b"}
            ]
        }"#;
        // Hand-build the naive CSV that the bug used to emit (the lossy
        // historical path, kept here as the regression target).
        let body = "id,v\n1,a\n2,b\n";
        let cmp = round_trip_keys("csv", raw, body).unwrap();
        assert_eq!(cmp.allowed_loss, DataLoss::TopLevel);
        // Documented loss: `meta` and `rows` (the wrapper) are gone.
        assert!(cmp.dropped.iter().any(|k| k == "meta"));
        assert!(cmp.dropped.iter().any(|k| k == "rows"));
        // But keys inside the array are recoverable from the CSV header.
        assert!(cmp.output_keys.contains("id"));
        assert!(cmp.output_keys.contains("v"));
        // `is_within_contract` tolerates the documented loss — failing it
        // would mean we tightened the contract.
        assert!(cmp.is_within_contract());
    }

    #[test]
    fn csv_from_md_documents_the_same_loss() {
        // markdown_table → csv only carries the table itself; any
        // surrounding section/heading text is lost on this path.
        let md = "# Report 2026-04-25\n\n| id | v |\n|---|---|\n| 1 | a |\n| 2 | b |\n";
        let cls = classify(md);
        let body = templates::csv_from_md(md, &cls).unwrap();
        // No native JSON input here, so we synthesise the "logical" input
        // (what the markdown represents) for the comparison.
        let logical =
            r#"{"heading":"Report 2026-04-25","rows":[{"id":"1","v":"a"},{"id":"2","v":"b"}]}"#;
        let cmp = round_trip_keys("csv_from_md", logical, &body).unwrap();
        assert_eq!(cmp.allowed_loss, DataLoss::TopLevel);
        assert!(cmp.is_within_contract());
        assert!(cmp.output_keys.contains("id"));
    }

    // ─── kv format — keys preserved, values flattened ─────────────────

    #[test]
    fn kv_format_preserves_all_top_level_keys() {
        let raw = r#"{"alpha":1,"beta":"two","gamma":true,"delta":null,"epsilon":3.14}"#;
        // Hand-built kv body; the null `delta` is rendered as an empty
        // value after the `: ` separator.
        let body = "alpha: 1\nbeta: two\ngamma: true\ndelta: \nepsilon: 3.14\n";
        let cmp = round_trip_keys("kv", raw, body).unwrap();
        assert!(cmp.is_within_contract(), "{}", cmp.report());
        for k in ["alpha", "beta", "gamma", "delta", "epsilon"] {
            assert!(cmp.output_keys.contains(k));
        }
    }

    // ─── meta-test: every encoder id used by the pipeline has a row ───

    #[test]
    fn declared_loss_table_covers_known_encoders() {
        for id in [
            "json_compact",
            "deep_mckp",
            "deep_mckp_inner_table",
            "mckp_v2",
            "csv",
            "csv_from_md",
            "kv",
            "mr_diff_fence",
        ] {
            assert!(
                declared_loss(id).is_some(),
                "encoder id `{id}` missing from declared_loss table"
            );
        }
        // Defensive: an unknown id should still return None so the test
        // does not silently flag a typo as `None` loss.
        assert!(declared_loss("totally_made_up").is_none());
    }

    #[test]
    fn empty_input_collects_no_keys() {
        // Unparseable text and key-less JSON both yield an empty key set.
        assert!(keys_of("").is_empty());
        assert!(keys_of("not json").is_empty());
        assert!(keys_of("[1,2,3]").is_empty());
    }
}