// kobold_csv/diff.rs

//! `KOBOLD.CSV.DIFF.1` -- a row/field-wise diff between two parsed delimited tables.
//!
//! Where kobold-json diffs two JSON trees by path, the CSV analogue compares two TABLES: a source extract
//! against a target extract, cell by cell. Each table is a header row of column names plus data rows. The
//! diff aligns columns by NAME (not position, so a reordered target still compares correctly), walks rows in
//! order, and emits a [`DiffEntry`] per differing cell -- the exact `(row, field, source, target)` an analyst
//! needs to chase a reconciliation break.
//!
//! Rows present in only one table, and columns present in only one table, are reported as findings rather
//! than silently ignored. This module is independent of GnuCOBOL/libcob.

use crate::dialect::{parse_row, Dialect};
use crate::model::Finding;

/// A parsed delimited table: one header row of column names plus the data rows beneath it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Table {
    /// The column names, in the order they appear in the header row.
    pub header: Vec<String>,
    /// The data rows; each row is a vector of cell values positionally aligned to `header`.
    pub rows: Vec<Vec<String>>,
}

24/// Parse delimited `text` into a [`Table`] under dialect `d`, fail-closed on a malformed line or a data row
25/// whose column count differs from the header.
26pub fn parse_table(text: &[u8], d: &Dialect) -> Result<Table, Vec<Finding>> {
27    let lines = split_lines(text);
28    let mut findings = Vec::new();
29    if lines.is_empty() {
30        return Err(vec![Finding::new("CSV_EMPTY", "no rows (expected a header)".to_string())]);
31    }
32    let header = match parse_row(lines[0], d) {
33        Ok(h) => h,
34        Err(f) => return Err(vec![f]),
35    };
36    let mut rows = Vec::new();
37    for (i, line) in lines.iter().enumerate().skip(1) {
38        match parse_row(line, d) {
39            Ok(r) => {
40                if r.len() != header.len() {
41                    findings.push(Finding::new(
42                        "COLUMN_COUNT",
43                        format!("row {}: {} columns, header has {}", i, r.len(), header.len()),
44                    ));
45                } else {
46                    rows.push(r);
47                }
48            }
49            Err(mut f) => {
50                f.message = format!("row {}: {}", i, f.message);
51                findings.push(f);
52            }
53        }
54    }
55    if findings.is_empty() {
56        Ok(Table { header, rows })
57    } else {
58        Err(findings)
59    }
60}
61
/// Split `text` into lines on LF (`0x0a`), returning borrowed sub-slices without their terminators.
///
/// * A single CR (`0x0d`) immediately before the line end is dropped as well, so CRLF input yields
///   the same lines as LF input instead of leaving a stray `\r` on the last cell of each row.
/// * A final line without a trailing LF is still returned.
/// * A trailing LF does not produce an extra empty line; empty input produces no lines.
/// * Interior empty lines are preserved as empty slices.
fn split_lines(text: &[u8]) -> Vec<&[u8]> {
    if text.is_empty() {
        return Vec::new();
    }

    // Drop the one terminating LF (if any) so `split` does not report an empty piece after it.
    let body = match text.split_last() {
        Some((&b'\n', rest)) => rest,
        _ => text,
    };

    body.split(|&b| b == b'\n')
        .map(|line| match line.split_last() {
            // CRLF line ending: the LF is already gone, remove the CR too.
            Some((&b'\r', rest)) => rest,
            _ => line,
        })
        .collect()
}

/// One cell-level difference between a source table and a target table.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiffEntry {
    /// The zero-based data-row index (the header row is not counted).
    pub row: usize,
    /// The column (field) name the differing cell belongs to.
    pub field: String,
    /// The source value at this cell (empty string if the row/column is absent in source).
    pub source: String,
    /// The target value at this cell (empty string if the row/column is absent in target).
    pub target: String,
}

90/// The result of a [`diff`]: the differing cells plus structural findings (extra/missing rows or columns).
91#[derive(Debug, Clone, PartialEq, Eq)]
92pub struct DiffReport {
93    /// Per-cell differences, in (row, column) order.
94    pub entries: Vec<DiffEntry>,
95    /// Structural findings: a row or column present in only one table.
96    pub findings: Vec<Finding>,
97}
98
99impl DiffReport {
100    /// True iff the two tables are identical (no cell differences and no structural findings).
101    pub fn is_clean(&self) -> bool {
102        self.entries.is_empty() && self.findings.is_empty()
103    }
104}
105
106/// `KOBOLD.CSV.DIFF.1` -- compare two parsed tables `source` and `target` cell by cell, aligning columns by
107/// name. Columns present in only one table, and rows present in only one table, become findings.
108pub fn diff(source: &Table, target: &Table) -> DiffReport {
109    let mut entries = Vec::new();
110    let mut findings = Vec::new();
111
112    // Columns only in source.
113    for col in &source.header {
114        if !target.header.iter().any(|c| c == col) {
115            findings.push(Finding::new(
116                "COLUMN_ONLY_IN_SOURCE",
117                format!("column {} present in source but not target", col),
118            ));
119        }
120    }
121    // Columns only in target.
122    for col in &target.header {
123        if !source.header.iter().any(|c| c == col) {
124            findings.push(Finding::new(
125                "COLUMN_ONLY_IN_TARGET",
126                format!("column {} present in target but not source", col),
127            ));
128        }
129    }
130
131    // Row count mismatch.
132    if source.rows.len() != target.rows.len() {
133        findings.push(Finding::new(
134            "ROW_COUNT",
135            format!("source has {} rows, target has {}", source.rows.len(), target.rows.len()),
136        ));
137    }
138
139    // Common columns, in source header order.
140    let common: Vec<&String> =
141        source.header.iter().filter(|c| target.header.iter().any(|t| &t == c)).collect();
142
143    let nrows = source.rows.len().max(target.rows.len());
144    for r in 0..nrows {
145        let srow = source.rows.get(r);
146        let trow = target.rows.get(r);
147        for col in &common {
148            let sval = srow
149                .and_then(|row| column_value(&source.header, row, col))
150                .unwrap_or_default();
151            let tval = trow
152                .and_then(|row| column_value(&target.header, row, col))
153                .unwrap_or_default();
154            if sval != tval {
155                entries.push(DiffEntry {
156                    row: r,
157                    field: (*col).clone(),
158                    source: sval,
159                    target: tval,
160                });
161            }
162        }
163    }
164
165    DiffReport { entries, findings }
166}
167
168/// Look up the value of `col` in `row`, given the table `header` (column alignment by name).
169fn column_value(header: &[String], row: &[String], col: &str) -> Option<String> {
170    let idx = header.iter().position(|h| h == col)?;
171    row.get(idx).cloned()
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse `text` with the CSV dialect, panicking if the fixture itself is malformed.
    fn table(text: &[u8]) -> Table {
        parse_table(text, &Dialect::csv()).expect("parse table")
    }

    #[test]
    fn identical_tables_are_clean() {
        let a = table(b"ACCT,AMT\nA1,12.50\nA2,0.99\n");
        let report = diff(&a, &a);
        assert!(report.is_clean());
    }

    #[test]
    fn changed_cell_is_reported() {
        let s = table(b"ACCT,AMT\nA1,12.50\n");
        let t = table(b"ACCT,AMT\nA1,99.99\n");
        let report = diff(&s, &t);
        assert_eq!(report.entries.len(), 1);
        assert_eq!(report.entries[0].row, 0);
        assert_eq!(report.entries[0].field, "AMT");
        assert_eq!(report.entries[0].source, "12.50");
        assert_eq!(report.entries[0].target, "99.99");
    }

    #[test]
    fn column_alignment_by_name_not_position() {
        // Same data, columns reordered -> no cell differences.
        let s = table(b"ACCT,AMT\nA1,12.50\n");
        let t = table(b"AMT,ACCT\n12.50,A1\n");
        let report = diff(&s, &t);
        assert!(report.entries.is_empty(), "entries: {:?}", report.entries);
        assert!(report.findings.is_empty());
    }

    #[test]
    fn column_only_in_one_table_is_a_finding() {
        let s = table(b"ACCT,AMT,STATUS\nA1,12.50,OK\n");
        let t = table(b"ACCT,AMT\nA1,12.50\n");
        let report = diff(&s, &t);
        assert!(report.findings.iter().any(|f| f.code == "COLUMN_ONLY_IN_SOURCE"));
    }

    #[test]
    fn row_count_mismatch_is_a_finding() {
        let s = table(b"ACCT,AMT\nA1,12.50\nA2,1.00\n");
        let t = table(b"ACCT,AMT\nA1,12.50\n");
        let report = diff(&s, &t);
        assert!(report.findings.iter().any(|f| f.code == "ROW_COUNT"));
    }
}
225}