// kobold_csv/parse.rs

//! `KOBOLD.CSV.PARSE.1` + `KOBOLD.CSV.ROUNDTRIP.1` (FAIL-CLOSED) -- parse a COMPACT extract back into
//! record bytes, and prove the round trip.
//!
//! [`parse_into`] takes a `Compact` (header + value rows) CSV produced against `copybook` and re-encodes
//! each field value into its declared PIC bytes, returning one reconstructed record per row. It FAILS CLOSED
//! (a [`Finding`], never a truncation/coercion) on:
//!
//! * a value too long for an alphanumeric field, or with too many integer digits for a numeric field
//!   (`VALUE_OVERFLOW`),
//! * a non-numeric value into a numeric field (`NUMERIC_INVALID`), an empty numeric value
//!   (`NUMERIC_EMPTY`), too many fraction digits (`FRACTION_OVERFLOW`), a negative sign on an unsigned
//!   field (`SIGN_ON_UNSIGNED`),
//! * a header whose column names do not match the copybook's leaf field names (`HEADER_MISMATCH`),
//! * a data row whose column count differs from the number of leaf fields (`COLUMN_COUNT`),
//! * a malformed CSV line (the dialect reader's own findings bubble up).
//!
//! [`roundtrip`] proves records -> Compact CSV -> [`parse_into`] -> identical bytes, reporting per-record
//! honestly when a value-only compact extract legitimately does NOT round-trip (e.g. a numeric field whose
//! stored zoned form differs from its canonical render, so re-encoding yields the canonical -- not the
//! original -- bytes).
//!
//! This module is independent of GnuCOBOL/libcob.
21
22use crate::dialect::{parse_row, Dialect};
23use crate::export::{export, Mode};
24use crate::model::{Copybook, FieldDecl, FieldKind, Finding};
25
26/// `KOBOLD.CSV.PARSE.1` -- reconstruct record bytes from a Compact CSV `csv_text` against `copybook`.
27///
28/// Returns `Ok(records)` (one `Vec<u8>` of `copybook.record_length()` bytes per data row), or
29/// `Err(findings)` listing every problem across the whole input. Fail-closed: on ANY finding no bytes are
30/// returned for the offending row's class of error -- the result is all-or-nothing.
31pub fn parse_into(
32    copybook: &Copybook,
33    csv_text: &[u8],
34    dialect: &Dialect,
35) -> Result<Vec<Vec<u8>>, Vec<Finding>> {
36    let leaves = copybook.leaf_fields();
37    let mut findings: Vec<Finding> = Vec::new();
38
39    // Split into logical lines on LF; the dialect reader tolerates a trailing CR per line. NOTE: a Compact
40    // extract never embeds newlines inside a field value (compact values are decoded, trimmed display text),
41    // so a plain LF split is sound here. (Audit/Evidence are not re-parsed by this court.)
42    let lines: Vec<&[u8]> = split_lines(csv_text);
43    if lines.is_empty() {
44        return Err(vec![Finding::new("CSV_EMPTY", "no rows (expected a header row)".to_string())]);
45    }
46
47    // Header row.
48    let header = match parse_row(lines[0], dialect) {
49        Ok(h) => h,
50        Err(f) => return Err(vec![f]),
51    };
52    let expected: Vec<&str> = leaves.iter().map(|f| f.name.as_str()).collect();
53    if header.len() != expected.len() || header.iter().zip(&expected).any(|(h, e)| h != e) {
54        findings.push(Finding::new(
55            "HEADER_MISMATCH",
56            format!("header {:?} does not match copybook leaf fields {:?}", header, expected),
57        ));
58        // Header mismatch is fatal: we cannot trust column->field alignment.
59        return Err(findings);
60    }
61
62    let total = copybook.record_length();
63    let mut records: Vec<Vec<u8>> = Vec::new();
64
65    for (row_idx, line) in lines.iter().enumerate().skip(1) {
66        let row = match parse_row(line, dialect) {
67            Ok(r) => r,
68            Err(mut f) => {
69                f.message = format!("row {}: {}", row_idx, f.message);
70                findings.push(f);
71                continue;
72            }
73        };
74        if row.len() != leaves.len() {
75            findings.push(Finding::new(
76                "COLUMN_COUNT",
77                format!("row {}: {} columns, expected {}", row_idx, row.len(), leaves.len()),
78            ));
79            continue;
80        }
81        let mut out = vec![b' '; total];
82        let before = findings.len();
83        for (f, val) in leaves.iter().zip(&row) {
84            encode_leaf(f, val, &mut out, &mut findings, row_idx);
85        }
86        if findings.len() == before {
87            records.push(out);
88        }
89    }
90
91    if findings.is_empty() {
92        Ok(records)
93    } else {
94        Err(findings)
95    }
96}
97
/// Cut `text` into logical lines at every LF (`0x0a`).
///
/// The LF is never part of a returned slice; a CR in front of it is left in place for the dialect reader
/// to strip. Whatever follows the final LF becomes a last line only when it is non-empty, so the
/// terminator after the last row does not yield a phantom empty row. Empty lines anywhere else are kept.
fn split_lines(text: &[u8]) -> Vec<&[u8]> {
    let mut lines: Vec<&[u8]> = text.split(|&byte| byte == b'\n').collect();
    // `split` always yields a segment after the last separator (and one for empty input); drop it when
    // it is empty so a terminated final row is not followed by an extra blank line.
    if lines.last().map_or(false, |tail| tail.is_empty()) {
        lines.pop();
    }
    lines
}
114
115/// Encode one leaf field value into `out` at its offset, fail-closed.
116fn encode_leaf(d: &FieldDecl, value: &str, out: &mut [u8], findings: &mut Vec<Finding>, row: usize) {
117    match &d.kind {
118        FieldKind::Alphanumeric => encode_alnum(d, value, out, findings, row),
119        FieldKind::Numeric { scale, signed } => {
120            encode_numeric(d, value, *scale, *signed, out, findings, row)
121        }
122        FieldKind::Group(_) => {} // leaves never include groups
123    }
124}
125
126/// Place `bytes` at absolute `offset`, fail-closed if it would not fit.
127fn place(out: &mut [u8], offset: usize, bytes: &[u8], findings: &mut Vec<Finding>, name: &str, row: usize) {
128    let end = offset + bytes.len();
129    if end > out.len() {
130        findings.push(Finding::new(
131            "FIELD_OUT_OF_RANGE",
132            format!(
133                "row {}: field {}: writing [{}..{}] exceeds record length {}",
134                row, name, offset, end, out.len()
135            ),
136        ));
137        return;
138    }
139    out[offset..end].copy_from_slice(bytes);
140}
141
142/// Encode an alphanumeric value: pad with trailing spaces to the field length; fail closed if longer.
143fn encode_alnum(d: &FieldDecl, value: &str, out: &mut [u8], findings: &mut Vec<Finding>, row: usize) {
144    let mut bytes = Vec::with_capacity(value.len());
145    for ch in value.chars() {
146        let cp = ch as u32;
147        if cp > 0xff {
148            findings.push(Finding::new(
149                "ALNUM_NON_BYTE",
150                format!("row {}: field {}: char U+{:04X} not representable in one byte", row, d.name, cp),
151            ));
152            return;
153        }
154        bytes.push(cp as u8);
155    }
156    if bytes.len() > d.length {
157        findings.push(Finding::new(
158            "VALUE_OVERFLOW",
159            format!(
160                "row {}: field {}: value of {} bytes overflows field length {} (fail-closed, no truncation)",
161                row,
162                d.name,
163                bytes.len(),
164                d.length
165            ),
166        ));
167        return;
168    }
169    let mut buf = vec![b' '; d.length];
170    buf[..bytes.len()].copy_from_slice(&bytes);
171    place(out, d.offset, &buf, findings, &d.name, row);
172}
173
174/// Encode a numeric value into zoned-decimal display digits per `scale`/`signed`. Fail-closed on a
175/// non-numeric value, too many integer/fraction digits, or a sign on an unsigned field.
176fn encode_numeric(
177    d: &FieldDecl,
178    value: &str,
179    scale: usize,
180    signed: bool,
181    out: &mut [u8],
182    findings: &mut Vec<Finding>,
183    row: usize,
184) {
185    let mut s = value.trim();
186    let mut negative = false;
187    if let Some(rest) = s.strip_prefix('-') {
188        negative = true;
189        s = rest;
190    } else if let Some(rest) = s.strip_prefix('+') {
191        s = rest;
192    }
193    if negative && !signed {
194        findings.push(Finding::new(
195            "SIGN_ON_UNSIGNED",
196            format!("row {}: field {}: negative value into unsigned PIC {}", row, d.name, d.pic),
197        ));
198        return;
199    }
200
201    let (int_str, frac_str) = match s.split_once('.') {
202        Some((i, f)) => (i, f),
203        None => (s, ""),
204    };
205    if int_str.is_empty() && frac_str.is_empty() {
206        findings.push(Finding::new(
207            "NUMERIC_EMPTY",
208            format!("row {}: field {}: empty numeric value", row, d.name),
209        ));
210        return;
211    }
212    for (label, part) in [("integer", int_str), ("fraction", frac_str)] {
213        if !part.chars().all(|c| c.is_ascii_digit()) {
214            findings.push(Finding::new(
215                "NUMERIC_INVALID",
216                format!("row {}: field {}: non-numeric {} part {:?} (fail-closed)", row, d.name, label, part),
217            ));
218            return;
219        }
220    }
221    if frac_str.len() > scale {
222        findings.push(Finding::new(
223            "FRACTION_OVERFLOW",
224            format!(
225                "row {}: field {}: {} fraction digits exceed scale {} (fail-closed, no rounding)",
226                row,
227                d.name,
228                frac_str.len(),
229                scale
230            ),
231        ));
232        return;
233    }
234
235    let int_digits = d.length.saturating_sub(scale);
236    let int_trimmed = int_str.trim_start_matches('0');
237    if int_trimmed.len() > int_digits {
238        findings.push(Finding::new(
239            "VALUE_OVERFLOW",
240            format!(
241                "row {}: field {}: integer part {:?} needs {} digits, field has {} (fail-closed)",
242                row,
243                d.name,
244                int_str,
245                int_trimmed.len(),
246                int_digits
247            ),
248        ));
249        return;
250    }
251
252    let mut digits = String::with_capacity(d.length);
253    for _ in 0..(int_digits - int_trimmed.len()) {
254        digits.push('0');
255    }
256    digits.push_str(int_trimmed);
257    digits.push_str(frac_str);
258    for _ in 0..(scale - frac_str.len()) {
259        digits.push('0');
260    }
261
262    let mut bytes: Vec<u8> = digits.into_bytes();
263    if bytes.len() != d.length {
264        findings.push(Finding::new(
265            "NUMERIC_LENGTH",
266            format!("row {}: field {}: built {} digits, declared length {}", row, d.name, bytes.len(), d.length),
267        ));
268        return;
269    }
270    if signed {
271        if let Some(last) = bytes.last_mut() {
272            *last = overpunch_byte(*last, negative);
273        }
274    }
275    place(out, d.offset, &bytes, findings, &d.name, row);
276}
277
/// Translate an ASCII digit into the zoned overpunch byte carrying both the digit and the sign.
///
/// Positive `0`-`9` become `{`, `A`-`I`; negative `0`-`9` become `}`, `J`-`R`. A byte that is not an ASCII
/// digit is returned unchanged. Intended as the inverse of the export side's `overpunch`.
fn overpunch_byte(digit: u8, negative: bool) -> u8 {
    const POSITIVE: &[u8; 10] = b"{ABCDEFGHI";
    const NEGATIVE: &[u8; 10] = b"}JKLMNOPQR";

    if !digit.is_ascii_digit() {
        return digit;
    }
    let table = if negative { NEGATIVE } else { POSITIVE };
    table[usize::from(digit - b'0')]
}
291
292/// The per-record outcome of a [`roundtrip`] proof.
293#[derive(Debug, Clone, PartialEq, Eq)]
294pub struct RoundtripRecord {
295    /// The zero-based record index.
296    pub index: usize,
297    /// True iff `parse_into` reproduced this record's exact original bytes.
298    pub identical: bool,
299    /// The original record bytes.
300    pub original: Vec<u8>,
301    /// The bytes reconstructed from the Compact extract (empty if the row failed closed).
302    pub reconstructed: Vec<u8>,
303    /// Findings explaining a non-roundtrip (re-encode failures, or the canonical-form note below).
304    pub findings: Vec<Finding>,
305}
306
307/// The full result of a [`roundtrip`] proof over a record set.
308#[derive(Debug, Clone, PartialEq, Eq)]
309pub struct RoundtripReport {
310    /// Per-record outcomes, in order.
311    pub records: Vec<RoundtripRecord>,
312}
313
314impl RoundtripReport {
315    /// True iff EVERY record reproduced identically through the Compact extract.
316    pub fn all_identical(&self) -> bool {
317        self.records.iter().all(|r| r.identical)
318    }
319}
320
321/// `KOBOLD.CSV.ROUNDTRIP.1` -- prove records -> Compact CSV -> [`parse_into`] -> identical bytes.
322///
323/// A Compact extract carries only the DECODED value (not the raw zoned form). For alphanumeric fields and
324/// for numeric fields already in canonical form this round-trips exactly. When a numeric field's STORED form
325/// differs from its canonical render (e.g. an unsigned field whose bytes carried a stray sign overpunch, or
326/// leading-space padding), the re-encode yields the canonical bytes -- legitimately NOT the original -- and
327/// that record is reported `identical: false` with a `NON_CANONICAL_STORAGE` finding rather than pretending.
328pub fn roundtrip(copybook: &Copybook, records: &[&[u8]], dialect: &Dialect) -> RoundtripReport {
329    let csv = export(copybook, records, Mode::Compact, dialect);
330    let parsed = parse_into(copybook, csv.as_bytes(), dialect);
331
332    let mut out = Vec::with_capacity(records.len());
333    match parsed {
334        Ok(recs) => {
335            for (i, orig) in records.iter().enumerate() {
336                let recon = recs.get(i).cloned().unwrap_or_default();
337                let identical = recon.as_slice() == *orig;
338                let mut findings = Vec::new();
339                if !identical {
340                    findings.push(Finding::new(
341                        "NON_CANONICAL_STORAGE",
342                        format!(
343                            "record {}: compact extract round-trips to canonical bytes which differ from the \
344                             stored form (a value-only extract cannot preserve a non-canonical zoned/padded \
345                             representation)",
346                            i
347                        ),
348                    ));
349                }
350                out.push(RoundtripRecord {
351                    index: i,
352                    identical,
353                    original: orig.to_vec(),
354                    reconstructed: recon,
355                    findings,
356                });
357            }
358        }
359        Err(findings) => {
360            // A re-encode failed closed somewhere; attribute the whole batch's findings to each record so the
361            // caller sees the failure without us guessing per-row attribution beyond the messages themselves.
362            for (i, orig) in records.iter().enumerate() {
363                out.push(RoundtripRecord {
364                    index: i,
365                    identical: false,
366                    original: orig.to_vec(),
367                    reconstructed: Vec::new(),
368                    findings: findings.clone(),
369                });
370            }
371        }
372    }
373
374    RoundtripReport { records: out }
375}
376
#[cfg(test)]
mod tests {
    use super::*;

    /// `NAME X(4)` followed by a signed `AMT S9(3)V99`.
    fn signed_copybook() -> Copybook {
        Copybook {
            record_name: "CUST".into(),
            encoding: "ascii".into(),
            fields: vec![
                FieldDecl::alnum("NAME", "X(4)", 0, 4),
                FieldDecl::numeric("AMT", "S9(3)V99", 4, 5, 2, true),
            ],
        }
    }

    /// `NAME X(4)` followed by an unsigned `AMT 9(3)V99`.
    fn unsigned_copybook() -> Copybook {
        Copybook {
            record_name: "R".into(),
            encoding: "ascii".into(),
            fields: vec![
                FieldDecl::alnum("NAME", "X(4)", 0, 4),
                FieldDecl::numeric("AMT", "9(3)V99", 4, 5, 2, false),
            ],
        }
    }

    /// Parse `csv` against the unsigned copybook and return the findings it must produce.
    fn rejected(csv: &[u8]) -> Vec<Finding> {
        parse_into(&unsigned_copybook(), csv, &Dialect::csv()).expect_err("must fail closed")
    }

    /// Whether any finding carries `code`.
    fn has_code(findings: &[Finding], code: &str) -> bool {
        findings.iter().any(|f| f.code == code)
    }

    #[test]
    fn compact_roundtrip_identical_bytes() {
        // KOBOLD.CSV.ROUNDTRIP.1: canonical records survive bytes -> Compact CSV -> parse_into -> bytes.
        // Both AMT values end in a sign overpunch: '}' is a negative 0, 'A' a positive 1.
        let originals: Vec<&[u8]> = vec![b"JOHN0125}", b"JANE0007A"];
        let report = roundtrip(&signed_copybook(), &originals, &Dialect::csv());
        assert!(report.all_identical(), "report: {:?}", report);
    }

    #[test]
    fn parse_into_reconstructs() {
        let recs = parse_into(&unsigned_copybook(), b"NAME,AMT\nAL,12.50\n", &Dialect::csv()).expect("parse");
        assert_eq!(recs.len(), 1);
        assert_eq!(&recs[0], b"AL  01250");
    }

    #[test]
    fn fail_closed_overflow_value() {
        // NAME is too long for X(4).
        let findings = rejected(b"NAME,AMT\nTOOLONG,12.50\n");
        assert!(has_code(&findings, "VALUE_OVERFLOW"));
    }

    #[test]
    fn fail_closed_nonnumeric() {
        // The integer part of AMT contains a letter.
        let findings = rejected(b"NAME,AMT\nAL,1X.50\n");
        assert!(has_code(&findings, "NUMERIC_INVALID"));
    }

    #[test]
    fn fail_closed_header_mismatch() {
        // The second column is named BALANCE instead of AMT.
        let findings = rejected(b"NAME,BALANCE\nAL,12.50\n");
        assert_eq!(findings[0].code, "HEADER_MISMATCH");
    }

    #[test]
    fn fail_closed_wrong_column_count() {
        let findings = rejected(b"NAME,AMT\nAL,12.50,EXTRA\n");
        assert!(has_code(&findings, "COLUMN_COUNT"));
    }

    #[test]
    fn non_canonical_storage_reported_honestly() {
        // AMT stored as "  099" (leading spaces) renders to "0.99" but re-encodes to "00099": not identical.
        let originals: Vec<&[u8]> = vec![b"AL    099"];
        let report = roundtrip(&unsigned_copybook(), &originals, &Dialect::csv());
        assert!(!report.all_identical());
        assert_eq!(report.records[0].findings[0].code, "NON_CANONICAL_STORAGE");
    }
}