1use fionn_core::{TapeNodeKind, TapeSource, TapeValue};
32use serde_json::{Map, Value};
33use std::collections::{HashMap, HashSet};
34
/// Options controlling how two sets of CSV rows are compared.
#[derive(Debug, Clone)]
pub struct CsvDiffOptions {
    /// Strategy used to pair source rows with target rows; `csv_diff`
    /// dispatches on this value.
    pub row_identity: RowIdentityMode,
    /// Whether a difference in column order should count as a change.
    /// NOTE(review): no diff routine visible in this file reads this flag —
    /// confirm whether it is honoured elsewhere.
    pub column_order_significant: bool,
    /// Column names that are skipped during cell comparison and left out of
    /// row hashing.
    pub ignore_columns: HashSet<String>,
    /// When `true`, cell comparison treats an empty string the same as an
    /// absent cell.
    pub empty_is_null: bool,
    /// Column whose value identifies a row. Key-based diffing pairs rows on
    /// it (and falls back to positional diffing when it is `None`); the other
    /// modes use it only to fill the `key` field of emitted operations.
    pub key_column: Option<String>,
}
49
50impl Default for CsvDiffOptions {
51 fn default() -> Self {
52 Self {
53 row_identity: RowIdentityMode::Positional,
54 column_order_significant: false,
55 ignore_columns: HashSet::new(),
56 empty_is_null: false,
57 key_column: None,
58 }
59 }
60}
61
62impl CsvDiffOptions {
63 pub fn with_key_column(key: impl Into<String>) -> Self {
65 Self {
66 row_identity: RowIdentityMode::KeyBased,
67 key_column: Some(key.into()),
68 ..Default::default()
69 }
70 }
71
72 #[must_use]
74 pub fn content_addressed() -> Self {
75 Self {
76 row_identity: RowIdentityMode::ContentAddressed,
77 ..Default::default()
78 }
79 }
80}
81
/// Strategy for deciding which source row corresponds to which target row.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RowIdentityMode {
    /// Rows are paired by their index in the row list.
    Positional,
    /// Rows are paired by the value of the configured key column.
    KeyBased,
    /// Rows are paired by a hash of their cell contents, which lets a row
    /// that changed position be reported as a move.
    ContentAddressed,
}
92
/// A single edit describing how the target rows differ from the source rows.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum CsvDiffOp {
    /// A row that exists only in the target.
    AddRow {
        /// Index of the row in the target row list.
        position: usize,
        /// Full contents of the added row, keyed by column name.
        values: HashMap<String, String>,
    },
    /// A row that exists only in the source.
    RemoveRow {
        /// Index of the row in the source row list.
        position: usize,
        /// Value of the configured key column in the removed row, if any.
        key: Option<String>,
    },
    /// A row found in both inputs whose cells differ.
    ModifyRow {
        /// Row index: the shared index in positional mode, the index in the
        /// target row list in key-based mode.
        position: usize,
        /// Value of the configured key column for this row, if any.
        key: Option<String>,
        /// One entry per column whose value differs.
        changes: Vec<CellChange>,
    },
    /// A row with identical content found at a different index.
    MoveRow {
        /// Index of the row in the source row list.
        from_position: usize,
        /// Index of the row in the target row list.
        to_position: usize,
        /// Value of the configured key column in the source row, if any.
        key: Option<String>,
    },
    /// A column added in the target.
    /// NOTE(review): no diff routine visible in this file emits this variant —
    /// field meanings below are inferred from their names; confirm.
    AddColumn {
        /// Name of the added column.
        name: String,
        /// Presumably the column index in the target header — TODO confirm.
        position: usize,
    },
    /// A column removed from the source.
    /// NOTE(review): not emitted by any diff routine visible in this file.
    RemoveColumn {
        /// Name of the removed column.
        name: String,
        /// Presumably the column index in the source header — TODO confirm.
        position: usize,
    },
    /// A column whose name changed.
    /// NOTE(review): not emitted by any diff routine visible in this file.
    RenameColumn {
        /// Previous column name.
        old_name: String,
        /// New column name.
        new_name: String,
        /// Presumably the column index — TODO confirm.
        position: usize,
    },
}
152
/// The change to one cell of a row that exists in both inputs.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CellChange {
    /// Name of the column the cell belongs to.
    pub column: String,
    /// Cell text in the source row; `None` when the source row has no such
    /// column.
    pub old_value: Option<String>,
    /// Cell text in the target row; `None` when the target row has no such
    /// column.
    pub new_value: Option<String>,
}
163
/// Result of comparing two sets of CSV rows.
#[derive(Debug, Clone)]
pub struct CsvDiff {
    /// Edits that describe how the target differs from the source.
    pub operations: Vec<CsvDiffOp>,
    /// Counters summarising the comparison.
    pub stats: CsvDiffStats,
}
172
/// Counters accumulated while a diff is built; all start at zero.
#[derive(Debug, Clone, Default)]
pub struct CsvDiffStats {
    /// Rows reported as added (present only in the target).
    pub rows_added: usize,
    /// Rows reported as removed (present only in the source).
    pub rows_removed: usize,
    /// Rows found in both inputs with at least one differing cell.
    pub rows_modified: usize,
    /// Rows found in both inputs with no difference. In content-addressed
    /// mode only rows that also kept their position are counted.
    pub rows_unchanged: usize,
    /// Total number of differing cells across all modified rows.
    pub cells_changed: usize,
    /// Columns added.
    /// NOTE(review): never incremented by the diff routines visible in this file.
    pub columns_added: usize,
    /// Columns removed.
    /// NOTE(review): never incremented by the diff routines visible in this file.
    pub columns_removed: usize,
}
191
192impl CsvDiff {
193 #[must_use]
195 pub const fn is_empty(&self) -> bool {
196 self.operations.is_empty()
197 }
198
199 #[must_use]
201 pub const fn len(&self) -> usize {
202 self.operations.len()
203 }
204}
205
206#[must_use]
211pub fn csv_diff(source: &Value, target: &Value, options: &CsvDiffOptions) -> CsvDiff {
212 let source_rows = extract_csv_rows(source);
213 let target_rows = extract_csv_rows(target);
214
215 match options.row_identity {
216 RowIdentityMode::Positional => diff_positional(&source_rows, &target_rows, options),
217 RowIdentityMode::KeyBased => diff_key_based(&source_rows, &target_rows, options),
218 RowIdentityMode::ContentAddressed => {
219 diff_content_addressed(&source_rows, &target_rows, options)
220 }
221 }
222}
223
224pub fn csv_diff_tapes<S: TapeSource, T: TapeSource>(
226 source: &S,
227 target: &T,
228 options: &CsvDiffOptions,
229) -> CsvDiff {
230 let source_value = tape_to_csv_value(source);
233 let target_value = tape_to_csv_value(target);
234
235 csv_diff(&source_value, &target_value, options)
236}
237
238fn extract_csv_rows(value: &Value) -> Vec<HashMap<String, String>> {
243 let mut rows = Vec::new();
244
245 if let Some(csv) = value.get("csv") {
247 if let Some(rows_arr) = csv.get("rows").and_then(|r| r.as_array()) {
248 for row in rows_arr {
249 if let Some(obj) = row.as_object() {
250 let row_map: HashMap<String, String> = obj
251 .iter()
252 .map(|(k, v)| {
253 let val = match v {
254 Value::String(s) => s.clone(),
255 Value::Null => String::new(),
256 other => other.to_string(),
257 };
258 (k.clone(), val)
259 })
260 .collect();
261 rows.push(row_map);
262 }
263 }
264 }
265 }
266 else if let Some(arr) = value.as_array() {
268 for row in arr {
269 if let Some(obj) = row.as_object() {
270 let row_map: HashMap<String, String> = obj
271 .iter()
272 .map(|(k, v)| {
273 let val = match v {
274 Value::String(s) => s.clone(),
275 Value::Null => String::new(),
276 other => other.to_string(),
277 };
278 (k.clone(), val)
279 })
280 .collect();
281 rows.push(row_map);
282 }
283 }
284 }
285
286 rows
287}
288
289fn tape_to_csv_value<T: TapeSource>(tape: &T) -> Value {
290 let mut rows = Vec::new();
293
294 let mut i = 0;
296 while i < tape.len() {
297 if let Some(node) = tape.node_at(i) {
298 if let TapeNodeKind::ObjectStart { count } = node.kind {
299 let mut row = Map::new();
300 let obj_end = i + count * 2 + 1; let mut field_idx = i + 1;
302
303 while field_idx < obj_end && field_idx < tape.len() {
304 if let Some(key) = tape.key_at(field_idx) {
305 field_idx += 1; if let Some(value) = tape.value_at(field_idx) {
307 let json_val = match value {
308 TapeValue::String(s) => Value::String(s.to_string()),
309 TapeValue::Int(n) => Value::Number(n.into()),
310 TapeValue::Float(f) => serde_json::Number::from_f64(f)
311 .map_or(Value::Null, Value::Number),
312 TapeValue::Bool(b) => Value::Bool(b),
313 TapeValue::Null => Value::Null,
314 TapeValue::RawNumber(s) => s
315 .parse::<i64>()
316 .map(|n| Value::Number(n.into()))
317 .or_else(|_| {
318 s.parse::<f64>()
319 .ok()
320 .and_then(serde_json::Number::from_f64)
321 .map(Value::Number)
322 .ok_or(())
323 })
324 .unwrap_or_else(|()| Value::String(s.to_string())),
325 };
326 row.insert(key.to_string(), json_val);
327 }
328 if let Ok(skip) = tape.skip_value(field_idx) {
329 field_idx = skip;
330 } else {
331 field_idx += 1;
332 }
333 } else {
334 field_idx += 1;
335 }
336 }
337
338 if !row.is_empty() {
339 rows.push(Value::Object(row));
340 }
341 if let Ok(skip) = tape.skip_value(i) {
342 i = skip;
343 } else {
344 i += 1;
345 }
346 } else {
347 i += 1;
348 }
349 } else {
350 i += 1;
351 }
352 }
353
354 Value::Array(rows)
355}
356
357fn diff_positional(
358 source: &[HashMap<String, String>],
359 target: &[HashMap<String, String>],
360 options: &CsvDiffOptions,
361) -> CsvDiff {
362 let mut operations = Vec::new();
363 let mut stats = CsvDiffStats::default();
364
365 let max_len = source.len().max(target.len());
366
367 for i in 0..max_len {
368 match (source.get(i), target.get(i)) {
369 (Some(src_row), Some(tgt_row)) => {
370 let changes = diff_row(src_row, tgt_row, options);
371 if changes.is_empty() {
372 stats.rows_unchanged += 1;
373 } else {
374 stats.rows_modified += 1;
375 stats.cells_changed += changes.len();
376 operations.push(CsvDiffOp::ModifyRow {
377 position: i,
378 key: options
379 .key_column
380 .as_ref()
381 .and_then(|k| src_row.get(k).cloned()),
382 changes,
383 });
384 }
385 }
386 (Some(_), None) => {
387 stats.rows_removed += 1;
388 operations.push(CsvDiffOp::RemoveRow {
389 position: i,
390 key: options
391 .key_column
392 .as_ref()
393 .and_then(|k| source[i].get(k).cloned()),
394 });
395 }
396 (None, Some(tgt_row)) => {
397 stats.rows_added += 1;
398 operations.push(CsvDiffOp::AddRow {
399 position: i,
400 values: tgt_row.clone(),
401 });
402 }
403 (None, None) => unreachable!(),
404 }
405 }
406
407 CsvDiff { operations, stats }
408}
409
410fn diff_key_based(
411 source: &[HashMap<String, String>],
412 target: &[HashMap<String, String>],
413 options: &CsvDiffOptions,
414) -> CsvDiff {
415 let Some(key_col) = &options.key_column else {
416 return diff_positional(source, target, options);
417 };
418
419 let mut operations = Vec::new();
420 let mut stats = CsvDiffStats::default();
421
422 let source_by_key: HashMap<&str, (usize, &HashMap<String, String>)> = source
424 .iter()
425 .enumerate()
426 .filter_map(|(i, row)| row.get(key_col).map(|k| (k.as_str(), (i, row))))
427 .collect();
428
429 let target_by_key: HashMap<&str, (usize, &HashMap<String, String>)> = target
431 .iter()
432 .enumerate()
433 .filter_map(|(i, row)| row.get(key_col).map(|k| (k.as_str(), (i, row))))
434 .collect();
435
436 for (key, (pos, _)) in &source_by_key {
438 if !target_by_key.contains_key(*key) {
439 stats.rows_removed += 1;
440 operations.push(CsvDiffOp::RemoveRow {
441 position: *pos,
442 key: Some((*key).to_string()),
443 });
444 }
445 }
446
447 for (key, (pos, tgt_row)) in &target_by_key {
449 if let Some((_, src_row)) = source_by_key.get(*key) {
450 let changes = diff_row(src_row, tgt_row, options);
451 if changes.is_empty() {
452 stats.rows_unchanged += 1;
453 } else {
454 stats.rows_modified += 1;
455 stats.cells_changed += changes.len();
456 operations.push(CsvDiffOp::ModifyRow {
457 position: *pos,
458 key: Some((*key).to_string()),
459 changes,
460 });
461 }
462 } else {
463 stats.rows_added += 1;
464 operations.push(CsvDiffOp::AddRow {
465 position: *pos,
466 values: (*tgt_row).clone(),
467 });
468 }
469 }
470
471 CsvDiff { operations, stats }
472}
473
474fn diff_content_addressed(
475 source: &[HashMap<String, String>],
476 target: &[HashMap<String, String>],
477 options: &CsvDiffOptions,
478) -> CsvDiff {
479 let mut operations = Vec::new();
480 let mut stats = CsvDiffStats::default();
481
482 let source_hashes: HashMap<u64, (usize, &HashMap<String, String>)> = source
484 .iter()
485 .enumerate()
486 .map(|(i, row)| (hash_row(row, options), (i, row)))
487 .collect();
488
489 let target_hashes: HashMap<u64, (usize, &HashMap<String, String>)> = target
490 .iter()
491 .enumerate()
492 .map(|(i, row)| (hash_row(row, options), (i, row)))
493 .collect();
494
495 let mut matched_source: HashSet<usize> = HashSet::new();
497 let mut matched_target: HashSet<usize> = HashSet::new();
498
499 for (hash, (tgt_pos, _)) in &target_hashes {
500 if let Some((src_pos, _)) = source_hashes.get(hash) {
501 if *src_pos == *tgt_pos {
502 stats.rows_unchanged += 1;
503 } else {
504 operations.push(CsvDiffOp::MoveRow {
505 from_position: *src_pos,
506 to_position: *tgt_pos,
507 key: options
508 .key_column
509 .as_ref()
510 .and_then(|k| source[*src_pos].get(k).cloned()),
511 });
512 }
513 matched_source.insert(*src_pos);
514 matched_target.insert(*tgt_pos);
515 }
516 }
517
518 for (i, _) in source.iter().enumerate() {
520 if !matched_source.contains(&i) {
521 stats.rows_removed += 1;
522 operations.push(CsvDiffOp::RemoveRow {
523 position: i,
524 key: options
525 .key_column
526 .as_ref()
527 .and_then(|k| source[i].get(k).cloned()),
528 });
529 }
530 }
531
532 for (i, row) in target.iter().enumerate() {
534 if !matched_target.contains(&i) {
535 stats.rows_added += 1;
536 operations.push(CsvDiffOp::AddRow {
537 position: i,
538 values: row.clone(),
539 });
540 }
541 }
542
543 CsvDiff { operations, stats }
544}
545
546fn diff_row(
547 source: &HashMap<String, String>,
548 target: &HashMap<String, String>,
549 options: &CsvDiffOptions,
550) -> Vec<CellChange> {
551 let mut changes = Vec::new();
552
553 let all_cols: HashSet<&str> = source
555 .keys()
556 .chain(target.keys())
557 .map(String::as_str)
558 .collect();
559
560 for col in all_cols {
561 if options.ignore_columns.contains(col) {
562 continue;
563 }
564
565 let src_val = source.get(col).map(String::as_str);
566 let tgt_val = target.get(col).map(String::as_str);
567
568 let src_normalized = if options.empty_is_null {
570 src_val.filter(|s| !s.is_empty())
571 } else {
572 src_val
573 };
574 let tgt_normalized = if options.empty_is_null {
575 tgt_val.filter(|s| !s.is_empty())
576 } else {
577 tgt_val
578 };
579
580 if src_normalized != tgt_normalized {
581 changes.push(CellChange {
582 column: col.to_string(),
583 old_value: src_val.map(std::borrow::ToOwned::to_owned),
584 new_value: tgt_val.map(std::borrow::ToOwned::to_owned),
585 });
586 }
587 }
588
589 changes
590}
591
592fn hash_row(row: &HashMap<String, String>, options: &CsvDiffOptions) -> u64 {
593 use std::collections::hash_map::DefaultHasher;
594 use std::hash::{Hash, Hasher};
595
596 let mut hasher = DefaultHasher::new();
597
598 let mut keys: Vec<&str> = row
600 .keys()
601 .filter(|k| !options.ignore_columns.contains(*k))
602 .map(String::as_str)
603 .collect();
604 keys.sort_unstable();
605
606 for key in keys {
607 key.hash(&mut hasher);
608 if let Some(val) = row.get(key) {
609 let normalized = if options.empty_is_null && val.is_empty() {
610 ""
611 } else {
612 val.as_str()
613 };
614 normalized.hash(&mut hasher);
615 }
616 }
617
618 hasher.finish()
619}
620
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Wraps a JSON array of row objects in the `{"csv": {"rows": [...]}}` layout.
    fn csv_envelope(rows: serde_json::Value) -> serde_json::Value {
        json!({ "csv": { "rows": rows } })
    }

    /// Counts the `MoveRow` operations in a diff.
    fn count_moves(diff: &CsvDiff) -> usize {
        diff.operations
            .iter()
            .filter(|op| matches!(op, CsvDiffOp::MoveRow { .. }))
            .count()
    }

    // Diffing a document against itself yields no operations.
    #[test]
    fn test_csv_diff_positional_identical() {
        let document = csv_envelope(json!([
            {"id": "1", "name": "Alice"},
            {"id": "2", "name": "Bob"}
        ]));

        let result = csv_diff(&document, &document, &CsvDiffOptions::default());

        assert!(result.is_empty());
        assert_eq!(result.stats.rows_unchanged, 2);
    }

    // One changed cell in the second row is reported as one modified row.
    #[test]
    fn test_csv_diff_positional_modified() {
        let before = csv_envelope(json!([
            {"id": "1", "name": "Alice"},
            {"id": "2", "name": "Bob"}
        ]));
        let after = csv_envelope(json!([
            {"id": "1", "name": "Alice"},
            {"id": "2", "name": "Robert"}
        ]));

        let result = csv_diff(&before, &after, &CsvDiffOptions::default());

        assert_eq!(result.stats.rows_unchanged, 1);
        assert_eq!(result.stats.rows_modified, 1);
        assert_eq!(result.stats.cells_changed, 1);
    }

    // Key "1" disappears, key "3" appears, key "2" is untouched.
    #[test]
    fn test_csv_diff_key_based() {
        let before = json!([
            {"id": "1", "name": "Alice"},
            {"id": "2", "name": "Bob"}
        ]);
        let after = json!([
            {"id": "2", "name": "Bob"},
            {"id": "3", "name": "Charlie"}
        ]);

        let by_id = CsvDiffOptions::with_key_column("id");
        let result = csv_diff(&before, &after, &by_id);

        assert_eq!(result.stats.rows_removed, 1);
        assert_eq!(result.stats.rows_added, 1);
        assert_eq!(result.stats.rows_unchanged, 1);
    }

    // Swapping two rows is reported through at least one move.
    #[test]
    fn test_csv_diff_content_addressed_move() {
        let before = json!([
            {"id": "1", "name": "Alice"},
            {"id": "2", "name": "Bob"}
        ]);
        let after = json!([
            {"id": "2", "name": "Bob"},
            {"id": "1", "name": "Alice"}
        ]);

        let by_content = CsvDiffOptions::content_addressed();
        let result = csv_diff(&before, &after, &by_content);

        assert!(count_moves(&result) >= 1);
    }

    // An empty cell differs from a missing one unless `empty_is_null` is set.
    #[test]
    fn test_csv_diff_empty_is_null() {
        let before = json!([{"id": "1", "name": ""}]);
        let after = json!([{"id": "1"}]);

        let strict = csv_diff(&before, &after, &CsvDiffOptions::default());
        assert!(!strict.is_empty());

        let lenient_options = CsvDiffOptions {
            empty_is_null: true,
            ..Default::default()
        };
        let lenient = csv_diff(&before, &after, &lenient_options);
        assert!(lenient.is_empty());
    }

    // A change confined to an ignored column produces no operations.
    #[test]
    fn test_csv_diff_ignore_columns() {
        let before = json!([{"id": "1", "name": "Alice", "updated": "2024-01-01"}]);
        let after = json!([{"id": "1", "name": "Alice", "updated": "2024-01-02"}]);

        let mut skip_updated = CsvDiffOptions::default();
        skip_updated.ignore_columns.insert("updated".to_string());

        let result = csv_diff(&before, &after, &skip_updated);
        assert!(result.is_empty());
    }
}