Skip to main content

fionn_diff/
csv_diff.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2//! CSV-specific diff semantics
3//!
4//! CSV diff operations require special handling because:
5//! - Rows have implicit identity (by position or by key column)
6//! - Column order may or may not be significant
7//! - Missing values vs empty strings have semantic differences
8//!
9//! This module provides:
10//! - Row-based diff (compare by position)
11//! - Key-based diff (compare by identity column)
12//! - Content-addressed diff (compare by row hash)
13//!
14//! ## Diff Modes
15//!
16//! ### Positional Mode (default)
17//! Rows are compared by their position. Row 0 vs Row 0, etc.
18//! - Fast for ordered data
19//! - Produces large diffs for insertions/deletions
20//!
21//! ### Key-Based Mode
22//! Rows are matched by a key column (e.g., "id").
23//! - Handles insertions/deletions gracefully
24//! - Requires unique key column
25//!
26//! ### Content-Addressed Mode
27//! Rows are hashed and compared by content.
28//! - Detects moved rows
29//! - Higher memory usage
30
31use fionn_core::{TapeNodeKind, TapeSource, TapeValue};
32use serde_json::{Map, Value};
33use std::collections::{HashMap, HashSet};
34
35/// CSV diff options
36#[derive(Debug, Clone)]
37pub struct CsvDiffOptions {
38    /// How to identify rows for comparison
39    pub row_identity: RowIdentityMode,
40    /// Whether column order matters
41    pub column_order_significant: bool,
42    /// Columns to ignore in comparison
43    pub ignore_columns: HashSet<String>,
44    /// Treat empty string as null
45    pub empty_is_null: bool,
46    /// Key column for key-based identity
47    pub key_column: Option<String>,
48}
49
50impl Default for CsvDiffOptions {
51    fn default() -> Self {
52        Self {
53            row_identity: RowIdentityMode::Positional,
54            column_order_significant: false,
55            ignore_columns: HashSet::new(),
56            empty_is_null: false,
57            key_column: None,
58        }
59    }
60}
61
62impl CsvDiffOptions {
63    /// Create options for key-based row identity
64    pub fn with_key_column(key: impl Into<String>) -> Self {
65        Self {
66            row_identity: RowIdentityMode::KeyBased,
67            key_column: Some(key.into()),
68            ..Default::default()
69        }
70    }
71
72    /// Create options for content-addressed diff
73    #[must_use]
74    pub fn content_addressed() -> Self {
75        Self {
76            row_identity: RowIdentityMode::ContentAddressed,
77            ..Default::default()
78        }
79    }
80}
81
/// Strategy that decides which source row is compared with which target row.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RowIdentityMode {
    /// Pair rows that share the same index (row 0 with row 0, and so on).
    Positional,
    /// Pair rows that carry the same value in the configured key column.
    KeyBased,
    /// Pair rows whose content hashes are equal, regardless of position.
    ContentAddressed,
}
92
93/// A single CSV diff operation
94#[derive(Debug, Clone, PartialEq, Eq)]
95pub enum CsvDiffOp {
96    /// Row added at position
97    AddRow {
98        /// Row index where the row was added
99        position: usize,
100        /// Column values for the new row
101        values: HashMap<String, String>,
102    },
103    /// Row removed from position
104    RemoveRow {
105        /// Row index where the row was removed
106        position: usize,
107        /// Key column value (if key-based identity)
108        key: Option<String>,
109    },
110    /// Row modified
111    ModifyRow {
112        /// Row index of the modified row
113        position: usize,
114        /// Key column value (if key-based identity)
115        key: Option<String>,
116        /// List of cell changes in this row
117        changes: Vec<CellChange>,
118    },
119    /// Row moved (content-addressed only)
120    MoveRow {
121        /// Original row index
122        from_position: usize,
123        /// New row index
124        to_position: usize,
125        /// Key column value (if key-based identity)
126        key: Option<String>,
127    },
128    /// Column added
129    AddColumn {
130        /// Name of the new column
131        name: String,
132        /// Column index
133        position: usize,
134    },
135    /// Column removed
136    RemoveColumn {
137        /// Name of the removed column
138        name: String,
139        /// Column index
140        position: usize,
141    },
142    /// Column renamed
143    RenameColumn {
144        /// Original column name
145        old_name: String,
146        /// New column name
147        new_name: String,
148        /// Column index
149        position: usize,
150    },
151}
152
/// Difference found in one cell of a row.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CellChange {
    /// Name of the column the cell belongs to
    pub column: String,
    /// Value on the source side (`None` when the source row lacks the column)
    pub old_value: Option<String>,
    /// Value on the target side (`None` when the target row lacks the column)
    pub new_value: Option<String>,
}
163
164/// Result of CSV diff
165#[derive(Debug, Clone)]
166pub struct CsvDiff {
167    /// Operations to transform source into target
168    pub operations: Vec<CsvDiffOp>,
169    /// Statistics about the diff
170    pub stats: CsvDiffStats,
171}
172
/// Counters summarising a CSV diff; every counter starts at zero.
#[derive(Debug, Clone, Default)]
pub struct CsvDiffStats {
    /// Rows present only in the target
    pub rows_added: usize,
    /// Rows present only in the source
    pub rows_removed: usize,
    /// Rows present on both sides with at least one differing cell
    pub rows_modified: usize,
    /// Rows present on both sides with no differing cell
    pub rows_unchanged: usize,
    /// Total count of differing cells across all modified rows
    pub cells_changed: usize,
    /// Columns present only in the target
    pub columns_added: usize,
    /// Columns present only in the source
    pub columns_removed: usize,
}
191
192impl CsvDiff {
193    /// Check if the diff is empty (no changes)
194    #[must_use]
195    pub const fn is_empty(&self) -> bool {
196        self.operations.is_empty()
197    }
198
199    /// Get number of operations
200    #[must_use]
201    pub const fn len(&self) -> usize {
202        self.operations.len()
203    }
204}
205
206/// Compute diff between two CSV representations
207///
208/// Both source and target should be JSON values in the CSV gron format:
209/// `{"csv": {"rows": [{"col1": "val1", ...}, ...]}}`
210#[must_use]
211pub fn csv_diff(source: &Value, target: &Value, options: &CsvDiffOptions) -> CsvDiff {
212    let source_rows = extract_csv_rows(source);
213    let target_rows = extract_csv_rows(target);
214
215    match options.row_identity {
216        RowIdentityMode::Positional => diff_positional(&source_rows, &target_rows, options),
217        RowIdentityMode::KeyBased => diff_key_based(&source_rows, &target_rows, options),
218        RowIdentityMode::ContentAddressed => {
219            diff_content_addressed(&source_rows, &target_rows, options)
220        }
221    }
222}
223
224/// Compute diff between two `TapeSource` CSVs
225pub fn csv_diff_tapes<S: TapeSource, T: TapeSource>(
226    source: &S,
227    target: &T,
228    options: &CsvDiffOptions,
229) -> CsvDiff {
230    // Convert tapes to values for diff
231    // This is the bridge between tape and diff semantics
232    let source_value = tape_to_csv_value(source);
233    let target_value = tape_to_csv_value(target);
234
235    csv_diff(&source_value, &target_value, options)
236}
237
238// =============================================================================
239// Internal Implementation
240// =============================================================================
241
242fn extract_csv_rows(value: &Value) -> Vec<HashMap<String, String>> {
243    let mut rows = Vec::new();
244
245    // Handle csv.rows[N].col format
246    if let Some(csv) = value.get("csv") {
247        if let Some(rows_arr) = csv.get("rows").and_then(|r| r.as_array()) {
248            for row in rows_arr {
249                if let Some(obj) = row.as_object() {
250                    let row_map: HashMap<String, String> = obj
251                        .iter()
252                        .map(|(k, v)| {
253                            let val = match v {
254                                Value::String(s) => s.clone(),
255                                Value::Null => String::new(),
256                                other => other.to_string(),
257                            };
258                            (k.clone(), val)
259                        })
260                        .collect();
261                    rows.push(row_map);
262                }
263            }
264        }
265    }
266    // Handle direct array of objects
267    else if let Some(arr) = value.as_array() {
268        for row in arr {
269            if let Some(obj) = row.as_object() {
270                let row_map: HashMap<String, String> = obj
271                    .iter()
272                    .map(|(k, v)| {
273                        let val = match v {
274                            Value::String(s) => s.clone(),
275                            Value::Null => String::new(),
276                            other => other.to_string(),
277                        };
278                        (k.clone(), val)
279                    })
280                    .collect();
281                rows.push(row_map);
282            }
283        }
284    }
285
286    rows
287}
288
289fn tape_to_csv_value<T: TapeSource>(tape: &T) -> Value {
290    // Extract CSV structure from tape
291    // This handles the csv.rows[N].col format
292    let mut rows = Vec::new();
293
294    // Walk tape looking for row structures
295    let mut i = 0;
296    while i < tape.len() {
297        if let Some(node) = tape.node_at(i) {
298            if let TapeNodeKind::ObjectStart { count } = node.kind {
299                let mut row = Map::new();
300                let obj_end = i + count * 2 + 1; // count key-value pairs + object node
301                let mut field_idx = i + 1;
302
303                while field_idx < obj_end && field_idx < tape.len() {
304                    if let Some(key) = tape.key_at(field_idx) {
305                        field_idx += 1; // Move past key
306                        if let Some(value) = tape.value_at(field_idx) {
307                            let json_val = match value {
308                                TapeValue::String(s) => Value::String(s.to_string()),
309                                TapeValue::Int(n) => Value::Number(n.into()),
310                                TapeValue::Float(f) => serde_json::Number::from_f64(f)
311                                    .map_or(Value::Null, Value::Number),
312                                TapeValue::Bool(b) => Value::Bool(b),
313                                TapeValue::Null => Value::Null,
314                                TapeValue::RawNumber(s) => s
315                                    .parse::<i64>()
316                                    .map(|n| Value::Number(n.into()))
317                                    .or_else(|_| {
318                                        s.parse::<f64>()
319                                            .ok()
320                                            .and_then(serde_json::Number::from_f64)
321                                            .map(Value::Number)
322                                            .ok_or(())
323                                    })
324                                    .unwrap_or_else(|()| Value::String(s.to_string())),
325                            };
326                            row.insert(key.to_string(), json_val);
327                        }
328                        if let Ok(skip) = tape.skip_value(field_idx) {
329                            field_idx = skip;
330                        } else {
331                            field_idx += 1;
332                        }
333                    } else {
334                        field_idx += 1;
335                    }
336                }
337
338                if !row.is_empty() {
339                    rows.push(Value::Object(row));
340                }
341                if let Ok(skip) = tape.skip_value(i) {
342                    i = skip;
343                } else {
344                    i += 1;
345                }
346            } else {
347                i += 1;
348            }
349        } else {
350            i += 1;
351        }
352    }
353
354    Value::Array(rows)
355}
356
357fn diff_positional(
358    source: &[HashMap<String, String>],
359    target: &[HashMap<String, String>],
360    options: &CsvDiffOptions,
361) -> CsvDiff {
362    let mut operations = Vec::new();
363    let mut stats = CsvDiffStats::default();
364
365    let max_len = source.len().max(target.len());
366
367    for i in 0..max_len {
368        match (source.get(i), target.get(i)) {
369            (Some(src_row), Some(tgt_row)) => {
370                let changes = diff_row(src_row, tgt_row, options);
371                if changes.is_empty() {
372                    stats.rows_unchanged += 1;
373                } else {
374                    stats.rows_modified += 1;
375                    stats.cells_changed += changes.len();
376                    operations.push(CsvDiffOp::ModifyRow {
377                        position: i,
378                        key: options
379                            .key_column
380                            .as_ref()
381                            .and_then(|k| src_row.get(k).cloned()),
382                        changes,
383                    });
384                }
385            }
386            (Some(_), None) => {
387                stats.rows_removed += 1;
388                operations.push(CsvDiffOp::RemoveRow {
389                    position: i,
390                    key: options
391                        .key_column
392                        .as_ref()
393                        .and_then(|k| source[i].get(k).cloned()),
394                });
395            }
396            (None, Some(tgt_row)) => {
397                stats.rows_added += 1;
398                operations.push(CsvDiffOp::AddRow {
399                    position: i,
400                    values: tgt_row.clone(),
401                });
402            }
403            (None, None) => unreachable!(),
404        }
405    }
406
407    CsvDiff { operations, stats }
408}
409
410fn diff_key_based(
411    source: &[HashMap<String, String>],
412    target: &[HashMap<String, String>],
413    options: &CsvDiffOptions,
414) -> CsvDiff {
415    let Some(key_col) = &options.key_column else {
416        return diff_positional(source, target, options);
417    };
418
419    let mut operations = Vec::new();
420    let mut stats = CsvDiffStats::default();
421
422    // Index source rows by key
423    let source_by_key: HashMap<&str, (usize, &HashMap<String, String>)> = source
424        .iter()
425        .enumerate()
426        .filter_map(|(i, row)| row.get(key_col).map(|k| (k.as_str(), (i, row))))
427        .collect();
428
429    // Index target rows by key
430    let target_by_key: HashMap<&str, (usize, &HashMap<String, String>)> = target
431        .iter()
432        .enumerate()
433        .filter_map(|(i, row)| row.get(key_col).map(|k| (k.as_str(), (i, row))))
434        .collect();
435
436    // Find removed rows
437    for (key, (pos, _)) in &source_by_key {
438        if !target_by_key.contains_key(*key) {
439            stats.rows_removed += 1;
440            operations.push(CsvDiffOp::RemoveRow {
441                position: *pos,
442                key: Some((*key).to_string()),
443            });
444        }
445    }
446
447    // Find added and modified rows
448    for (key, (pos, tgt_row)) in &target_by_key {
449        if let Some((_, src_row)) = source_by_key.get(*key) {
450            let changes = diff_row(src_row, tgt_row, options);
451            if changes.is_empty() {
452                stats.rows_unchanged += 1;
453            } else {
454                stats.rows_modified += 1;
455                stats.cells_changed += changes.len();
456                operations.push(CsvDiffOp::ModifyRow {
457                    position: *pos,
458                    key: Some((*key).to_string()),
459                    changes,
460                });
461            }
462        } else {
463            stats.rows_added += 1;
464            operations.push(CsvDiffOp::AddRow {
465                position: *pos,
466                values: (*tgt_row).clone(),
467            });
468        }
469    }
470
471    CsvDiff { operations, stats }
472}
473
474fn diff_content_addressed(
475    source: &[HashMap<String, String>],
476    target: &[HashMap<String, String>],
477    options: &CsvDiffOptions,
478) -> CsvDiff {
479    let mut operations = Vec::new();
480    let mut stats = CsvDiffStats::default();
481
482    // Hash rows for content addressing
483    let source_hashes: HashMap<u64, (usize, &HashMap<String, String>)> = source
484        .iter()
485        .enumerate()
486        .map(|(i, row)| (hash_row(row, options), (i, row)))
487        .collect();
488
489    let target_hashes: HashMap<u64, (usize, &HashMap<String, String>)> = target
490        .iter()
491        .enumerate()
492        .map(|(i, row)| (hash_row(row, options), (i, row)))
493        .collect();
494
495    // Find moved rows (same content, different position)
496    let mut matched_source: HashSet<usize> = HashSet::new();
497    let mut matched_target: HashSet<usize> = HashSet::new();
498
499    for (hash, (tgt_pos, _)) in &target_hashes {
500        if let Some((src_pos, _)) = source_hashes.get(hash) {
501            if *src_pos == *tgt_pos {
502                stats.rows_unchanged += 1;
503            } else {
504                operations.push(CsvDiffOp::MoveRow {
505                    from_position: *src_pos,
506                    to_position: *tgt_pos,
507                    key: options
508                        .key_column
509                        .as_ref()
510                        .and_then(|k| source[*src_pos].get(k).cloned()),
511                });
512            }
513            matched_source.insert(*src_pos);
514            matched_target.insert(*tgt_pos);
515        }
516    }
517
518    // Find removed rows (in source but not in target)
519    for (i, _) in source.iter().enumerate() {
520        if !matched_source.contains(&i) {
521            stats.rows_removed += 1;
522            operations.push(CsvDiffOp::RemoveRow {
523                position: i,
524                key: options
525                    .key_column
526                    .as_ref()
527                    .and_then(|k| source[i].get(k).cloned()),
528            });
529        }
530    }
531
532    // Find added rows (in target but not in source)
533    for (i, row) in target.iter().enumerate() {
534        if !matched_target.contains(&i) {
535            stats.rows_added += 1;
536            operations.push(CsvDiffOp::AddRow {
537                position: i,
538                values: row.clone(),
539            });
540        }
541    }
542
543    CsvDiff { operations, stats }
544}
545
546fn diff_row(
547    source: &HashMap<String, String>,
548    target: &HashMap<String, String>,
549    options: &CsvDiffOptions,
550) -> Vec<CellChange> {
551    let mut changes = Vec::new();
552
553    // All columns from both rows
554    let all_cols: HashSet<&str> = source
555        .keys()
556        .chain(target.keys())
557        .map(String::as_str)
558        .collect();
559
560    for col in all_cols {
561        if options.ignore_columns.contains(col) {
562            continue;
563        }
564
565        let src_val = source.get(col).map(String::as_str);
566        let tgt_val = target.get(col).map(String::as_str);
567
568        // Normalize empty strings if option set
569        let src_normalized = if options.empty_is_null {
570            src_val.filter(|s| !s.is_empty())
571        } else {
572            src_val
573        };
574        let tgt_normalized = if options.empty_is_null {
575            tgt_val.filter(|s| !s.is_empty())
576        } else {
577            tgt_val
578        };
579
580        if src_normalized != tgt_normalized {
581            changes.push(CellChange {
582                column: col.to_string(),
583                old_value: src_val.map(std::borrow::ToOwned::to_owned),
584                new_value: tgt_val.map(std::borrow::ToOwned::to_owned),
585            });
586        }
587    }
588
589    changes
590}
591
592fn hash_row(row: &HashMap<String, String>, options: &CsvDiffOptions) -> u64 {
593    use std::collections::hash_map::DefaultHasher;
594    use std::hash::{Hash, Hasher};
595
596    let mut hasher = DefaultHasher::new();
597
598    // Sort keys for consistent hashing
599    let mut keys: Vec<&str> = row
600        .keys()
601        .filter(|k| !options.ignore_columns.contains(*k))
602        .map(String::as_str)
603        .collect();
604    keys.sort_unstable();
605
606    for key in keys {
607        key.hash(&mut hasher);
608        if let Some(val) = row.get(key) {
609            let normalized = if options.empty_is_null && val.is_empty() {
610                ""
611            } else {
612                val.as_str()
613            };
614            normalized.hash(&mut hasher);
615        }
616    }
617
618    hasher.finish()
619}
620
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Identical documents in the `csv.rows` shape yield an empty diff.
    #[test]
    fn test_csv_diff_positional_identical() {
        let data = json!({
            "csv": {
                "rows": [
                    {"id": "1", "name": "Alice"},
                    {"id": "2", "name": "Bob"}
                ]
            }
        });

        let diff = csv_diff(&data, &data, &CsvDiffOptions::default());
        assert!(diff.is_empty());
        assert_eq!(diff.len(), 0);
        assert_eq!(diff.stats.rows_unchanged, 2);
    }

    /// A single changed cell is reported as one `ModifyRow` with one change.
    #[test]
    fn test_csv_diff_positional_modified() {
        let source = json!({
            "csv": {
                "rows": [
                    {"id": "1", "name": "Alice"},
                    {"id": "2", "name": "Bob"}
                ]
            }
        });
        let target = json!({
            "csv": {
                "rows": [
                    {"id": "1", "name": "Alice"},
                    {"id": "2", "name": "Robert"}
                ]
            }
        });

        let diff = csv_diff(&source, &target, &CsvDiffOptions::default());
        assert_eq!(diff.stats.rows_unchanged, 1);
        assert_eq!(diff.stats.rows_modified, 1);
        assert_eq!(diff.stats.cells_changed, 1);
        assert_eq!(
            diff.operations,
            vec![CsvDiffOp::ModifyRow {
                position: 1,
                key: None,
                changes: vec![CellChange {
                    column: "name".to_string(),
                    old_value: Some("Bob".to_string()),
                    new_value: Some("Robert".to_string()),
                }],
            }]
        );
    }

    /// Trailing rows on only one side become `AddRow` / `RemoveRow`.
    #[test]
    fn test_csv_diff_positional_length_mismatch() {
        let shorter = json!([{"id": "1"}]);
        let longer = json!([{"id": "1"}, {"id": "2"}]);
        let options = CsvDiffOptions::default();

        let grown = csv_diff(&shorter, &longer, &options);
        assert_eq!(grown.stats.rows_unchanged, 1);
        assert_eq!(grown.stats.rows_added, 1);
        assert!(matches!(
            grown.operations.as_slice(),
            [CsvDiffOp::AddRow { position: 1, .. }]
        ));

        let shrunk = csv_diff(&longer, &shorter, &options);
        assert_eq!(shrunk.stats.rows_unchanged, 1);
        assert_eq!(shrunk.stats.rows_removed, 1);
        assert_eq!(
            shrunk.operations,
            vec![CsvDiffOp::RemoveRow {
                position: 1,
                key: None
            }]
        );
    }

    /// Key-based matching pairs rows by `id` regardless of their position.
    #[test]
    fn test_csv_diff_key_based() {
        let source = json!([
            {"id": "1", "name": "Alice"},
            {"id": "2", "name": "Bob"}
        ]);
        let target = json!([
            {"id": "2", "name": "Bob"},
            {"id": "3", "name": "Charlie"}
        ]);

        let options = CsvDiffOptions::with_key_column("id");
        let diff = csv_diff(&source, &target, &options);

        assert_eq!(diff.stats.rows_removed, 1); // id=1 removed
        assert_eq!(diff.stats.rows_added, 1); // id=3 added
        assert_eq!(diff.stats.rows_unchanged, 1); // id=2 unchanged
        assert_eq!(diff.len(), 2);
    }

    /// Swapping two rows is reported as two moves and nothing else.
    #[test]
    fn test_csv_diff_content_addressed_move() {
        let source = json!([
            {"id": "1", "name": "Alice"},
            {"id": "2", "name": "Bob"}
        ]);
        let target = json!([
            {"id": "2", "name": "Bob"},
            {"id": "1", "name": "Alice"}
        ]);

        let options = CsvDiffOptions::content_addressed();
        let diff = csv_diff(&source, &target, &options);

        // Both rows moved
        let move_count = diff
            .operations
            .iter()
            .filter(|op| matches!(op, CsvDiffOp::MoveRow { .. }))
            .count();
        assert_eq!(move_count, 2);
        assert_eq!(diff.len(), 2);
        assert_eq!(diff.stats.rows_added, 0);
        assert_eq!(diff.stats.rows_removed, 0);
    }

    /// `empty_is_null` makes an empty cell equal to a missing one.
    #[test]
    fn test_csv_diff_empty_is_null() {
        let source = json!([{"id": "1", "name": ""}]);
        let target = json!([{"id": "1"}]);

        // Without empty_is_null, these are different
        let diff1 = csv_diff(&source, &target, &CsvDiffOptions::default());
        assert!(!diff1.is_empty());

        // With empty_is_null, these are the same
        let options = CsvDiffOptions {
            empty_is_null: true,
            ..Default::default()
        };
        let diff2 = csv_diff(&source, &target, &options);
        assert!(diff2.is_empty());
    }

    /// Differences confined to ignored columns do not show up in the diff.
    #[test]
    fn test_csv_diff_ignore_columns() {
        let source = json!([{"id": "1", "name": "Alice", "updated": "2024-01-01"}]);
        let target = json!([{"id": "1", "name": "Alice", "updated": "2024-01-02"}]);

        let mut options = CsvDiffOptions::default();
        options.ignore_columns.insert("updated".to_string());

        let diff = csv_diff(&source, &target, &options);
        assert!(diff.is_empty());
        assert_eq!(diff.stats.rows_unchanged, 1);
    }
}
738}