1use std::collections::BTreeSet;
29
/// The amount of key loss an encoder is allowed when its output is compared
/// against its input (see `declared_loss` and
/// `KeyComparison::is_within_contract`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataLoss {
    /// No input key may be missing from the output; `is_within_contract`
    /// fails as soon as a single key is reported as dropped.
    None,
    /// Declared for the `csv` and `csv_from_md` encoders. Presumably means
    /// that keys wrapping the tabular rows may be lost -- TODO confirm against
    /// the encoder documentation. `is_within_contract` accepts any dropped
    /// keys under this variant.
    TopLevel,
    /// Not assigned to any encoder by `declared_loss`. Presumably means that
    /// nested keys may be lost -- TODO confirm. `is_within_contract` accepts
    /// any dropped keys under this variant.
    Nested,
}
44
/// Result of comparing the key set of an encoder's input with the key set
/// recovered from its encoded output (built by `round_trip_keys`).
#[derive(Debug, Clone)]
pub struct KeyComparison {
    /// Identifier of the encoder whose output was inspected.
    pub encoder_id: String,
    /// Every object key found in the JSON input, at any nesting depth.
    /// Empty when the input could not be parsed as JSON.
    pub input_keys: BTreeSet<String>,
    /// Every key recovered from the encoded output by the encoder-specific
    /// decoder.
    pub output_keys: BTreeSet<String>,
    /// Keys present in `input_keys` but absent from `output_keys`, in
    /// ascending order.
    pub dropped: Vec<String>,
    /// Keys present in `output_keys` but absent from `input_keys`, in
    /// ascending order.
    pub added: Vec<String>,
    /// The loss level the encoder declares for itself.
    pub allowed_loss: DataLoss,
}
55
56impl KeyComparison {
57 pub fn is_within_contract(&self) -> bool {
59 match self.allowed_loss {
60 DataLoss::None => self.dropped.is_empty(),
61 DataLoss::TopLevel | DataLoss::Nested => true,
65 }
66 }
67
68 pub fn report(&self) -> String {
70 format!(
71 "encoder={} input_keys={} output_keys={} dropped={:?} added={:?} loss={:?}",
72 self.encoder_id,
73 self.input_keys.len(),
74 self.output_keys.len(),
75 self.dropped,
76 self.added,
77 self.allowed_loss,
78 )
79 }
80}
81
82pub fn declared_loss(encoder_id: &str) -> Option<DataLoss> {
88 Some(match encoder_id {
89 "json_compact" | "deep_mckp" | "deep_mckp_inner_table" | "mckp_v2" => DataLoss::None,
90 "kv" | "mr_diff_fence" => DataLoss::None,
94 "csv" | "csv_from_md" => DataLoss::TopLevel,
99 _ => return None,
100 })
101}
102
103pub fn round_trip_keys(
106 encoder_id: &str,
107 raw_input: &str,
108 encoded_output: &str,
109) -> Option<KeyComparison> {
110 let allowed_loss = declared_loss(encoder_id)?;
111 let input_keys = collect_json_keys(raw_input).unwrap_or_default();
112 let output_keys = decode_keys(encoder_id, encoded_output);
113 let dropped: Vec<String> = input_keys.difference(&output_keys).cloned().collect();
114 let added: Vec<String> = output_keys.difference(&input_keys).cloned().collect();
115 Some(KeyComparison {
116 encoder_id: encoder_id.to_string(),
117 input_keys,
118 output_keys,
119 dropped,
120 added,
121 allowed_loss,
122 })
123}
124
125fn collect_json_keys(raw: &str) -> Option<BTreeSet<String>> {
129 let val: serde_json::Value = serde_json::from_str(raw.trim_start()).ok()?;
130 let mut out = BTreeSet::new();
131 walk_value(&val, &mut out);
132 Some(out)
133}
134
135fn walk_value(v: &serde_json::Value, out: &mut BTreeSet<String>) {
136 match v {
137 serde_json::Value::Object(map) => {
138 for (k, child) in map {
139 out.insert(k.clone());
140 walk_value(child, out);
141 }
142 }
143 serde_json::Value::Array(arr) => {
144 for child in arr {
145 walk_value(child, out);
146 }
147 }
148 _ => {}
149 }
150}
151
152fn decode_keys(encoder_id: &str, encoded: &str) -> BTreeSet<String> {
156 match encoder_id {
157 "json_compact" | "deep_mckp" => {
158 collect_json_keys(encoded).unwrap_or_default()
160 }
161 "deep_mckp_inner_table" | "mckp_v2" => decode_inner_table_keys(encoded),
162 "csv" | "csv_from_md" => decode_csv_header_keys(encoded),
163 "kv" => decode_kv_keys(encoded),
164 "mr_diff_fence" => decode_diff_fence_keys(encoded),
165 _ => BTreeSet::new(),
166 }
167}
168
169fn decode_inner_table_keys(encoded: &str) -> BTreeSet<String> {
175 let mut out = BTreeSet::new();
176 let mut lines = encoded.lines().peekable();
177
178 while let Some(line) = lines.peek() {
180 if line.trim().is_empty() {
181 lines.next();
182 break;
183 }
184 if line.starts_with("## ") || line.starts_with("| ") || line.starts_with("|---") {
185 break;
188 }
189 let line = lines.next().unwrap();
190 if let Some((k, v)) = line.split_once(": ") {
191 out.insert(k.trim().to_string());
192 if let Ok(val) = serde_json::from_str::<serde_json::Value>(v.trim()) {
194 walk_value(&val, &mut out);
195 }
196 }
197 }
198
199 while let Some(line) = lines.peek() {
202 if line.trim().is_empty() {
203 lines.next();
204 continue;
205 }
206 if let Some(rest) = line.strip_prefix("## ") {
207 out.insert(rest.trim().to_string());
208 lines.next();
209 if matches!(lines.peek(), Some(l) if l.trim().is_empty()) {
211 lines.next();
212 }
213 }
214 break;
215 }
216
217 if let Some(header) = lines.next() {
219 for cell in split_md_row(header) {
220 if !cell.is_empty() {
221 out.insert(cell);
222 }
223 }
224 let _ = lines.next();
226 }
227
228 for row in lines {
230 for cell in split_md_row(row) {
231 if (cell.starts_with('{') && cell.ends_with('}'))
232 || (cell.starts_with('[') && cell.ends_with(']'))
233 {
234 let unescaped = cell.replace("\\|", "|");
235 if let Ok(val) = serde_json::from_str::<serde_json::Value>(&unescaped) {
236 walk_value(&val, &mut out);
237 }
238 }
239 }
240 }
241
242 out
243}
244
/// Splits one markdown table row into its trimmed, non-empty cells.
///
/// Outer pipes are removed first; the remainder is cut at the exact sequence
/// space-pipe-space, so an escaped pipe (`\|`) inside a cell does not start a
/// new cell.
fn split_md_row(line: &str) -> Vec<String> {
    let inner = line
        .trim()
        .trim_start_matches('|')
        .trim_end_matches('|');

    let mut cells = Vec::new();
    for piece in inner.split(" | ") {
        let piece = piece.trim();
        if !piece.is_empty() {
            cells.push(piece.to_owned());
        }
    }
    cells
}
253
/// Recovers the column names from the first line of CSV text.
///
/// The header is cut at commas that sit outside double quotes, so a quoted
/// name such as `"last, first"` stays one key. A field wrapped in quotes has
/// the wrapping quotes removed and every doubled quote (`""`) turned back
/// into a single `"`. Any other field is trimmed of whitespace and of stray
/// leading/trailing quotes. Empty names are discarded.
///
/// When the quotes in the header are unbalanced the line is split on every
/// comma instead, because quote tracking cannot be trusted on such input.
fn decode_csv_header_keys(encoded: &str) -> BTreeSet<String> {
    let header = encoded.lines().next().unwrap_or("");

    // Quote-aware split: a `"` toggles the quoted state, which also copes
    // with the `""` escape because it toggles twice in a row.
    let mut fields: Vec<&str> = Vec::new();
    let mut in_quotes = false;
    let mut field_start = 0;
    for (idx, ch) in header.char_indices() {
        match ch {
            '"' => in_quotes = !in_quotes,
            ',' if !in_quotes => {
                fields.push(&header[field_start..idx]);
                // `,` is a single byte, so the next field starts right after it.
                field_start = idx + 1;
            }
            _ => {}
        }
    }
    fields.push(&header[field_start..]);

    if in_quotes {
        // Unbalanced quotes: fall back to the plain comma split.
        fields = header.split(',').collect();
    }

    fields
        .into_iter()
        .map(|raw| {
            let field = raw.trim();
            let quoted_body = field
                .strip_prefix('"')
                .and_then(|rest| rest.strip_suffix('"'));
            match quoted_body {
                Some(body) => body.replace("\"\"", "\""),
                None => field.trim_matches('"').to_string(),
            }
        })
        .filter(|name| !name.is_empty())
        .collect()
}
265
266fn decode_kv_keys(encoded: &str) -> BTreeSet<String> {
268 let mut out = BTreeSet::new();
269 for line in encoded.lines() {
270 if let Some((k, v)) = line.split_once(": ") {
271 out.insert(k.trim().to_string());
272 if let Ok(val) = serde_json::from_str::<serde_json::Value>(v.trim()) {
273 walk_value(&val, &mut out);
274 }
275 }
276 }
277 out
278}
279
/// Reports which of the names `diffs`, `path`, `diff` and `content` occur
/// anywhere in `encoded`.
///
/// The check is a case-insensitive (ASCII) substring search, so any text
/// containing `diffs` also reports `diff`.
fn decode_diff_fence_keys(encoded: &str) -> BTreeSet<String> {
    let haystack = encoded.to_ascii_lowercase();
    ["diffs", "path", "diff", "content"]
        .iter()
        .copied()
        .filter(|needle| haystack.contains(*needle))
        .map(String::from)
        .collect()
}
297
// Round-trip key checks for each registered encoder, plus sanity checks for
// the loss table and the JSON key collector.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::shape::classify;
    use crate::templates;

    // Keys of `raw`, or an empty set when `raw` is not valid JSON.
    fn keys_of(raw: &str) -> BTreeSet<String> {
        collect_json_keys(raw).unwrap_or_default()
    }

    // An object wrapping an array of objects must keep its top-level keys and
    // the keys nested inside the array elements after `mckp_v2` encoding.
    #[test]
    fn mckp_v2_preserves_top_level_and_nested_keys() {
        let raw = r#"{
            "company": "Acme",
            "year": 2026,
            "employees": [
                {"id": 1, "name": "Ada", "address": {"city": "Boston"}},
                {"id": 2, "name": "Lin", "address": {"city": "Tokyo"}, "phone": "555"}
            ]
        }"#;
        let cls = classify(raw);
        let body = templates::deep_mckp_with_inner_table(raw, &cls)
            .expect("mckp_v2 should engage on object-wrapping-array shape");
        let cmp = round_trip_keys("mckp_v2", raw, &body).expect("encoder is registered");
        assert!(
            cmp.is_within_contract(),
            "mckp_v2 dropped keys: {}",
            cmp.report()
        );
        // Spot-check individual keys so a failure names the missing one.
        for k in [
            "company",
            "year",
            "employees",
            "id",
            "name",
            "address",
            "city",
        ] {
            assert!(
                cmp.output_keys.contains(k),
                "expected key `{k}` in mckp_v2 output, got {:?}",
                cmp.output_keys
            );
        }
    }

    // A key that exists on only some array elements (`phone`) must survive.
    #[test]
    fn mckp_v2_preserves_keys_when_inner_objects_are_heterogeneous() {
        let raw = r#"{
            "scope": "ops",
            "items": [
                {"id": 1, "ok": true},
                {"id": 2, "ok": false, "phone": "x"}
            ]
        }"#;
        let cls = classify(raw);
        let body = templates::deep_mckp_with_inner_table(raw, &cls).unwrap();
        let cmp = round_trip_keys("mckp_v2", raw, &body).unwrap();
        assert!(cmp.is_within_contract(), "{}", cmp.report());
        assert!(cmp.output_keys.contains("phone"));
        assert!(cmp.output_keys.contains("scope"));
    }

    // The inner-table template declines a flat object with no array in it.
    #[test]
    fn mckp_v2_returns_none_when_no_inner_array() {
        let raw = r#"{"a": 1, "b": 2}"#;
        let cls = classify(raw);
        assert!(templates::deep_mckp_with_inner_table(raw, &cls).is_none());
    }

    // `deep_mckp` must not drop keys; when the template declines the input,
    // compact JSON is used as the encoded body instead.
    #[test]
    fn pipeline_deep_mckp_is_lossless() {
        let raw = r#"{
            "url_a": "https://example.com",
            "log": "line1\nline2",
            "hash": "deadbeef",
            "nested": {"k": "v"}
        }"#;
        let cls = classify(raw);
        let body = templates::pipeline_deep_mckp(raw, &cls).unwrap_or_else(|| {
            serde_json::to_string(&serde_json::from_str::<serde_json::Value>(raw).unwrap()).unwrap()
        });
        let cmp = round_trip_keys("deep_mckp", raw, &body).unwrap();
        assert!(cmp.is_within_contract(), "{}", cmp.report());
        assert_eq!(cmp.dropped.len(), 0);
    }

    // Re-serialising JSON compactly keeps every key.
    #[test]
    fn json_compact_is_lossless() {
        let raw = r#"{"id":1,"items":[{"a":2},{"b":3}]}"#;
        let body = serde_json::to_string(&serde_json::from_str::<serde_json::Value>(raw).unwrap())
            .unwrap();
        let cmp = round_trip_keys("json_compact", raw, &body).unwrap();
        assert!(cmp.is_within_contract());
        assert_eq!(cmp.dropped.len(), 0);
    }

    // A hand-written CSV body loses the wrapper keys (`meta`, `rows`) but
    // keeps the column names, and that loss is inside the declared contract.
    #[test]
    fn naive_csv_drops_top_level_wrapper_as_documented() {
        let raw = r#"{
            "meta": "report-2026-04-25",
            "rows": [
                {"id": 1, "v": "a"},
                {"id": 2, "v": "b"}
            ]
        }"#;
        let body = "id,v\n1,a\n2,b\n";
        let cmp = round_trip_keys("csv", raw, body).unwrap();
        assert_eq!(cmp.allowed_loss, DataLoss::TopLevel);
        assert!(cmp.dropped.iter().any(|k| k == "meta"));
        assert!(cmp.dropped.iter().any(|k| k == "rows"));
        assert!(cmp.output_keys.contains("id"));
        assert!(cmp.output_keys.contains("v"));
        assert!(cmp.is_within_contract());
    }

    // The markdown-to-CSV encoder is compared against a hand-written JSON
    // rendering of the same document (`logical`), since its real input is
    // markdown rather than JSON.
    #[test]
    fn csv_from_md_documents_the_same_loss() {
        let md = "# Report 2026-04-25\n\n| id | v |\n|---|---|\n| 1 | a |\n| 2 | b |\n";
        let cls = classify(md);
        let body = templates::csv_from_md(md, &cls).unwrap();
        let logical =
            r#"{"heading":"Report 2026-04-25","rows":[{"id":"1","v":"a"},{"id":"2","v":"b"}]}"#;
        let cmp = round_trip_keys("csv_from_md", logical, &body).unwrap();
        assert_eq!(cmp.allowed_loss, DataLoss::TopLevel);
        assert!(cmp.is_within_contract());
        assert!(cmp.output_keys.contains("id"));
    }

    // Every top-level key of a flat object shows up in `key: value` output,
    // including the one whose value is empty (`delta: `).
    #[test]
    fn kv_format_preserves_all_top_level_keys() {
        let raw = r#"{"alpha":1,"beta":"two","gamma":true,"delta":null,"epsilon":3.14}"#;
        let body = "alpha: 1\nbeta: two\ngamma: true\ndelta: \nepsilon: 3.14\n";
        let cmp = round_trip_keys("kv", raw, body).unwrap();
        assert!(cmp.is_within_contract(), "{}", cmp.report());
        for k in ["alpha", "beta", "gamma", "delta", "epsilon"] {
            assert!(cmp.output_keys.contains(k));
        }
    }

    // Every encoder id handled by `decode_keys` must also have a declared
    // loss level; an unknown id must not.
    #[test]
    fn declared_loss_table_covers_known_encoders() {
        for id in [
            "json_compact",
            "deep_mckp",
            "deep_mckp_inner_table",
            "mckp_v2",
            "csv",
            "csv_from_md",
            "kv",
            "mr_diff_fence",
        ] {
            assert!(
                declared_loss(id).is_some(),
                "encoder id `{id}` missing from declared_loss table"
            );
        }
        assert!(declared_loss("totally_made_up").is_none());
    }

    // Empty text, non-JSON text and JSON without objects all yield no keys.
    #[test]
    fn empty_input_collects_no_keys() {
        assert!(keys_of("").is_empty());
        assert!(keys_of("not json").is_empty());
        assert!(keys_of("[1,2,3]").is_empty());
    }
}