use std::collections::BTreeSet;
/// Level of key loss an encoder is allowed to exhibit between its input and
/// its encoded output.
///
/// The level for each encoder id comes from `declared_loss`; it is enforced by
/// `KeyComparison::is_within_contract`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataLoss {
/// Lossless: `is_within_contract` fails if any input key is missing from the output.
None,
/// Declared for the CSV encoders in `declared_loss`. `is_within_contract`
/// currently accepts any set of dropped keys at this level.
/// NOTE(review): presumably means "only top-level wrapper keys may be lost" — confirm.
TopLevel,
/// Not assigned to any encoder id in `declared_loss`. `is_within_contract`
/// currently accepts any set of dropped keys at this level.
/// NOTE(review): presumably means "nested keys may be lost" — confirm.
Nested,
}
/// Result of comparing the JSON keys of an encoder's input with the keys
/// recovered from its encoded output, as built by `round_trip_keys`.
#[derive(Debug, Clone)]
pub struct KeyComparison {
/// Encoder id the comparison was run for.
pub encoder_id: String,
/// Every object key found at any depth of the input (empty when the input is not JSON).
pub input_keys: BTreeSet<String>,
/// Keys recovered from the encoded output by the decoder matching `encoder_id`.
pub output_keys: BTreeSet<String>,
/// Keys present in `input_keys` but missing from `output_keys`, in sorted order.
pub dropped: Vec<String>,
/// Keys present in `output_keys` but absent from `input_keys`, in sorted order.
pub added: Vec<String>,
/// Loss level declared for `encoder_id`.
pub allowed_loss: DataLoss,
}
impl KeyComparison {
pub fn is_within_contract(&self) -> bool {
match self.allowed_loss {
DataLoss::None => self.dropped.is_empty(),
DataLoss::TopLevel | DataLoss::Nested => true,
}
}
pub fn report(&self) -> String {
format!(
"encoder={} input_keys={} output_keys={} dropped={:?} added={:?} loss={:?}",
self.encoder_id,
self.input_keys.len(),
self.output_keys.len(),
self.dropped,
self.added,
self.allowed_loss,
)
}
}
/// Looks up the key-loss level declared for an encoder id.
///
/// Returns `None` for ids that are not in the table, which callers use to
/// detect unregistered encoders.
pub fn declared_loss(encoder_id: &str) -> Option<DataLoss> {
    // One row per registered encoder id; the match is exact and case-sensitive.
    const DECLARED: &[(&str, DataLoss)] = &[
        ("json_compact", DataLoss::None),
        ("deep_mckp", DataLoss::None),
        ("deep_mckp_inner_table", DataLoss::None),
        ("mckp_v2", DataLoss::None),
        ("kv", DataLoss::None),
        ("mr_diff_fence", DataLoss::None),
        ("csv", DataLoss::TopLevel),
        ("csv_from_md", DataLoss::TopLevel),
    ];

    DECLARED
        .iter()
        .find(|(id, _)| *id == encoder_id)
        .map(|&(_, loss)| loss)
}
/// Compares the keys of `raw_input` with the keys recoverable from
/// `encoded_output` for the given encoder.
///
/// Returns `None` only when `encoder_id` has no entry in `declared_loss`.
/// If `raw_input` is not valid JSON its key set is treated as empty, so
/// nothing can be reported as dropped for such input.
pub fn round_trip_keys(
    encoder_id: &str,
    raw_input: &str,
    encoded_output: &str,
) -> Option<KeyComparison> {
    declared_loss(encoder_id).map(|allowed_loss| {
        let input_keys = collect_json_keys(raw_input).unwrap_or_default();
        let output_keys = decode_keys(encoder_id, encoded_output);

        // Both sets iterate in sorted order, so the difference lists are sorted too.
        let dropped: Vec<String> = input_keys
            .iter()
            .filter(|key| !output_keys.contains(*key))
            .cloned()
            .collect();
        let added: Vec<String> = output_keys
            .iter()
            .filter(|key| !input_keys.contains(*key))
            .cloned()
            .collect();

        KeyComparison {
            encoder_id: encoder_id.to_string(),
            input_keys,
            output_keys,
            dropped,
            added,
            allowed_loss,
        }
    })
}
/// Parses `raw` as JSON and gathers every object key found at any depth.
///
/// Returns `None` when `raw` is not valid JSON. Valid JSON that contains no
/// objects (for example an array of numbers) yields `Some` of an empty set.
fn collect_json_keys(raw: &str) -> Option<BTreeSet<String>> {
    match serde_json::from_str::<serde_json::Value>(raw.trim_start()) {
        Ok(root) => {
            let mut keys = BTreeSet::new();
            walk_value(&root, &mut keys);
            Some(keys)
        }
        Err(_) => None,
    }
}
/// Adds every object key reachable from `v` to `out`.
///
/// Objects contribute their own keys and are searched further; arrays are
/// searched element by element; scalars contribute nothing. Keys from all
/// depths land in the same flat set, so no path information is kept.
fn walk_value(v: &serde_json::Value, out: &mut BTreeSet<String>) {
    // Explicit work list instead of recursion; visiting order does not matter
    // because the result is a set.
    let mut pending: Vec<&serde_json::Value> = vec![v];
    while let Some(node) = pending.pop() {
        match node {
            serde_json::Value::Object(map) => {
                for (key, child) in map {
                    out.insert(key.clone());
                    pending.push(child);
                }
            }
            serde_json::Value::Array(items) => pending.extend(items),
            _ => {}
        }
    }
}
/// Recovers the key set from `encoded` using the decoder that matches
/// `encoder_id`.
///
/// Unknown encoder ids, and JSON-based encoders whose output fails to parse,
/// yield an empty set.
fn decode_keys(encoder_id: &str, encoded: &str) -> BTreeSet<String> {
    match encoder_id {
        // Text formats, each with its own line-oriented decoder.
        "csv" | "csv_from_md" => decode_csv_header_keys(encoded),
        "kv" => decode_kv_keys(encoded),
        "mr_diff_fence" => decode_diff_fence_keys(encoded),
        "deep_mckp_inner_table" | "mckp_v2" => decode_inner_table_keys(encoded),
        // Output is itself JSON, so the input-side key collector is reused.
        "json_compact" | "deep_mckp" => match collect_json_keys(encoded) {
            Some(keys) => keys,
            None => BTreeSet::new(),
        },
        _ => BTreeSet::new(),
    }
}
fn decode_inner_table_keys(encoded: &str) -> BTreeSet<String> {
let mut out = BTreeSet::new();
let mut lines = encoded.lines().peekable();
while let Some(line) = lines.peek() {
if line.trim().is_empty() {
lines.next();
break;
}
if line.starts_with("## ") || line.starts_with("| ") || line.starts_with("|---") {
break;
}
let line = lines.next().unwrap();
if let Some((k, v)) = line.split_once(": ") {
out.insert(k.trim().to_string());
if let Ok(val) = serde_json::from_str::<serde_json::Value>(v.trim()) {
walk_value(&val, &mut out);
}
}
}
while let Some(line) = lines.peek() {
if line.trim().is_empty() {
lines.next();
continue;
}
if let Some(rest) = line.strip_prefix("## ") {
out.insert(rest.trim().to_string());
lines.next();
if matches!(lines.peek(), Some(l) if l.trim().is_empty()) {
lines.next();
}
}
break;
}
if let Some(header) = lines.next() {
for cell in split_md_row(header) {
if !cell.is_empty() {
out.insert(cell);
}
}
let _ = lines.next();
}
for row in lines {
for cell in split_md_row(row) {
if (cell.starts_with('{') && cell.ends_with('}'))
|| (cell.starts_with('[') && cell.ends_with(']'))
{
let unescaped = cell.replace("\\|", "|");
if let Ok(val) = serde_json::from_str::<serde_json::Value>(&unescaped) {
walk_value(&val, &mut out);
}
}
}
}
out
}
/// Splits one Markdown table row into its trimmed, non-empty cells.
///
/// Surrounding whitespace and outer `|` characters are removed first, then
/// the row is cut at every ` | ` (pipe with a space on each side). A pipe
/// without both spaces, such as an escaped `\|` or the pipes inside
/// `|---|---|`, does not split.
fn split_md_row(line: &str) -> Vec<String> {
    let inner = line.trim().trim_matches('|');
    let mut cells = Vec::new();
    for piece in inner.split(" | ") {
        let cell = piece.trim();
        if !cell.is_empty() {
            cells.push(cell.to_string());
        }
    }
    cells
}
/// Extracts the column names from the first line of CSV output.
///
/// Double-quoted cells are honoured: a comma inside quotes belongs to the
/// column name instead of separating two names, and `""` inside quotes stands
/// for one literal quote. A quote only opens a quoted cell when it is the
/// first non-whitespace character of that cell; elsewhere it is ordinary text.
///
/// Unquoted cells are trimmed and stripped of stray leading/trailing quotes.
/// Empty names are skipped. Only the first line is examined, so a quoted name
/// containing a line break is not supported, and an unterminated quote runs to
/// the end of that line.
fn decode_csv_header_keys(encoded: &str) -> BTreeSet<String> {
    /// Normalizes one finished cell and records it unless it is empty.
    fn record(keys: &mut BTreeSet<String>, cell: &str, quoted: bool) {
        // Quoted content is taken verbatim; unquoted content is cleaned up.
        let key = if quoted {
            cell
        } else {
            cell.trim().trim_matches('"')
        };
        if !key.is_empty() {
            keys.insert(key.to_string());
        }
    }

    let header = encoded.lines().next().unwrap_or("");
    let mut keys = BTreeSet::new();
    let mut cell = String::new();
    let mut in_quotes = false; // currently between an opening and closing quote
    let mut quoted = false; // the current cell started with a quote
    let mut chars = header.chars().peekable();

    while let Some(c) = chars.next() {
        match c {
            '"' if in_quotes => {
                if chars.peek() == Some(&'"') {
                    // `""` inside quotes is an escaped literal quote.
                    chars.next();
                    cell.push('"');
                } else {
                    in_quotes = false;
                }
            }
            '"' if !quoted && cell.trim().is_empty() => {
                // Opening quote: discard any whitespace that preceded it.
                cell.clear();
                in_quotes = true;
                quoted = true;
            }
            ',' if !in_quotes => {
                record(&mut keys, &cell, quoted);
                cell.clear();
                quoted = false;
            }
            // Padding between a closing quote and the next comma is not part of the name.
            _ if quoted && !in_quotes && c.is_whitespace() => {}
            _ => cell.push(c),
        }
    }
    record(&mut keys, &cell, quoted);
    keys
}
/// Recovers keys from `key: value` lines.
///
/// Each line containing `": "` contributes the text before the first
/// occurrence as a key. When the text after it parses as JSON, any object
/// keys nested in that value are added too. Lines without `": "` are ignored.
fn decode_kv_keys(encoded: &str) -> BTreeSet<String> {
    let mut keys = BTreeSet::new();
    let pairs = encoded.lines().filter_map(|line| line.split_once(": "));
    for (name, payload) in pairs {
        keys.insert(name.trim().to_string());
        let parsed = serde_json::from_str::<serde_json::Value>(payload.trim());
        if let Ok(value) = parsed {
            walk_value(&value, &mut keys);
        }
    }
    keys
}
/// Reports which of the fixed names `diffs`, `path`, `diff` and `content`
/// occur anywhere in the encoded text.
///
/// This is a case-insensitive (ASCII) substring check, not a parse: a name
/// counts as present wherever it appears, so text containing `diffs` also
/// reports `diff`.
fn decode_diff_fence_keys(encoded: &str) -> BTreeSet<String> {
    let haystack = encoded.to_ascii_lowercase();
    ["diffs", "path", "diff", "content"]
        .iter()
        .copied()
        .filter(|needle| haystack.contains(*needle))
        .map(|needle| needle.to_string())
        .collect()
}
// Unit tests for the key round-trip checks. Encoded bodies come either from
// the crate's `templates` module or from hand-written fixtures, and are then
// compared against the input with `round_trip_keys`.
#[cfg(test)]
mod tests {
use super::*;
use crate::shape::classify;
use crate::templates;
// Keys of `raw` parsed as JSON; an empty set when `raw` is not valid JSON.
fn keys_of(raw: &str) -> BTreeSet<String> {
collect_json_keys(raw).unwrap_or_default()
}
// `mckp_v2` is declared lossless, so `is_within_contract` fails on any dropped
// key (including `phone`, which only the second record has). The explicit key
// list additionally checks both wrapper-level and record-level names.
#[test]
fn mckp_v2_preserves_top_level_and_nested_keys() {
let raw = r#"{
"company": "Acme",
"year": 2026,
"employees": [
{"id": 1, "name": "Ada", "address": {"city": "Boston"}},
{"id": 2, "name": "Lin", "address": {"city": "Tokyo"}, "phone": "555"}
]
}"#;
let cls = classify(raw);
let body = templates::deep_mckp_with_inner_table(raw, &cls)
.expect("mckp_v2 should engage on object-wrapping-array shape");
let cmp = round_trip_keys("mckp_v2", raw, &body).expect("encoder is registered");
assert!(
cmp.is_within_contract(),
"mckp_v2 dropped keys: {}",
cmp.report()
);
for k in [
"company",
"year",
"employees",
"id",
"name",
"address",
"city",
] {
assert!(
cmp.output_keys.contains(k),
"expected key `{k}` in mckp_v2 output, got {:?}",
cmp.output_keys
);
}
}
// Records with differing key sets: `phone` exists only in the second record
// and must still be recoverable from the output.
#[test]
fn mckp_v2_preserves_keys_when_inner_objects_are_heterogeneous() {
let raw = r#"{
"scope": "ops",
"items": [
{"id": 1, "ok": true},
{"id": 2, "ok": false, "phone": "x"}
]
}"#;
let cls = classify(raw);
let body = templates::deep_mckp_with_inner_table(raw, &cls).unwrap();
let cmp = round_trip_keys("mckp_v2", raw, &body).unwrap();
assert!(cmp.is_within_contract(), "{}", cmp.report());
assert!(cmp.output_keys.contains("phone"));
assert!(cmp.output_keys.contains("scope"));
}
// A flat object with no array value: the template is expected to decline.
#[test]
fn mckp_v2_returns_none_when_no_inner_array() {
let raw = r#"{"a": 1, "b": 2}"#;
let cls = classify(raw);
assert!(templates::deep_mckp_with_inner_table(raw, &cls).is_none());
}
// If the template declines, the test falls back to compact `serde_json`
// serialization of the same input, so the key check runs either way.
#[test]
fn pipeline_deep_mckp_is_lossless() {
let raw = r#"{
"url_a": "https://example.com",
"log": "line1\nline2",
"hash": "deadbeef",
"nested": {"k": "v"}
}"#;
let cls = classify(raw);
let body = templates::pipeline_deep_mckp(raw, &cls).unwrap_or_else(|| {
serde_json::to_string(&serde_json::from_str::<serde_json::Value>(raw).unwrap()).unwrap()
});
let cmp = round_trip_keys("deep_mckp", raw, &body).unwrap();
assert!(cmp.is_within_contract(), "{}", cmp.report());
assert_eq!(cmp.dropped.len(), 0);
}
// The body is the input re-serialized by `serde_json`, so no key may be lost.
#[test]
fn json_compact_is_lossless() {
let raw = r#"{"id":1,"items":[{"a":2},{"b":3}]}"#;
let body = serde_json::to_string(&serde_json::from_str::<serde_json::Value>(raw).unwrap())
.unwrap();
let cmp = round_trip_keys("json_compact", raw, &body).unwrap();
assert!(cmp.is_within_contract());
assert_eq!(cmp.dropped.len(), 0);
}
// Hand-written CSV body: the wrapper keys `meta` and `rows` are reported as
// dropped, which the `TopLevel` declaration for `csv` tolerates.
#[test]
fn naive_csv_drops_top_level_wrapper_as_documented() {
let raw = r#"{
"meta": "report-2026-04-25",
"rows": [
{"id": 1, "v": "a"},
{"id": 2, "v": "b"}
]
}"#;
let body = "id,v\n1,a\n2,b\n";
let cmp = round_trip_keys("csv", raw, body).unwrap();
assert_eq!(cmp.allowed_loss, DataLoss::TopLevel);
assert!(cmp.dropped.iter().any(|k| k == "meta"));
assert!(cmp.dropped.iter().any(|k| k == "rows"));
assert!(cmp.output_keys.contains("id"));
assert!(cmp.output_keys.contains("v"));
assert!(cmp.is_within_contract());
}
// The real input is Markdown, but `round_trip_keys` collects input keys from
// JSON only, so `logical` is a hand-written JSON stand-in for the same data.
#[test]
fn csv_from_md_documents_the_same_loss() {
let md = "# Report 2026-04-25\n\n| id | v |\n|---|---|\n| 1 | a |\n| 2 | b |\n";
let cls = classify(md);
let body = templates::csv_from_md(md, &cls).unwrap();
let logical =
r#"{"heading":"Report 2026-04-25","rows":[{"id":"1","v":"a"},{"id":"2","v":"b"}]}"#;
let cmp = round_trip_keys("csv_from_md", logical, &body).unwrap();
assert_eq!(cmp.allowed_loss, DataLoss::TopLevel);
assert!(cmp.is_within_contract());
assert!(cmp.output_keys.contains("id"));
}
// Hand-written `key: value` body. The `delta: ` line keeps its trailing space
// because the decoder only recognizes lines containing ": ".
#[test]
fn kv_format_preserves_all_top_level_keys() {
let raw = r#"{"alpha":1,"beta":"two","gamma":true,"delta":null,"epsilon":3.14}"#;
let body = "alpha: 1\nbeta: two\ngamma: true\ndelta: \nepsilon: 3.14\n";
let cmp = round_trip_keys("kv", raw, body).unwrap();
assert!(cmp.is_within_contract(), "{}", cmp.report());
for k in ["alpha", "beta", "gamma", "delta", "epsilon"] {
assert!(cmp.output_keys.contains(k));
}
}
// Every encoder id handled by `decode_keys` must have a declared loss level,
// and an unknown id must have none.
#[test]
fn declared_loss_table_covers_known_encoders() {
for id in [
"json_compact",
"deep_mckp",
"deep_mckp_inner_table",
"mckp_v2",
"csv",
"csv_from_md",
"kv",
"mr_diff_fence",
] {
assert!(
declared_loss(id).is_some(),
"encoder id `{id}` missing from declared_loss table"
);
}
assert!(declared_loss("totally_made_up").is_none());
}
// Empty text and non-JSON text fail to parse; an array of numbers parses but
// contains no objects. All three produce an empty key set.
#[test]
fn empty_input_collects_no_keys() {
assert!(keys_of("").is_empty());
assert!(keys_of("not json").is_empty());
assert!(keys_of("[1,2,3]").is_empty());
}
}