tabkit 0.4.3

Tabular files → schema + sample rows. The shared spreadsheet reader Tauri / Iced / native desktop apps reach for when they need to introspect XLSX / CSV / TSV without inventing the same calamine-plus-type-inference glue twice.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
//! CSV / TSV reader, backed by the [`csv`](https://crates.io/crates/csv)
//! crate. Tab vs comma is auto-selected by extension (`.tsv` →
//! tab, everything else → comma).

use crate::{infer_column_type, Column, Error, ReadOptions, Reader, Result, Row, Table, Value};
use std::path::Path;

/// CSV / TSV reader.
///
/// A stateless unit struct: build it with [`CsvReader::new`], via
/// [`Default::default`], or use the value `CsvReader` directly.
#[derive(Default)]
pub struct CsvReader;

impl CsvReader {
    /// Construct a reader. Infallible — there is no state to set up.
    #[must_use]
    pub fn new() -> Self {
        CsvReader
    }
}

impl Reader for CsvReader {
    fn extensions(&self) -> &[&'static str] {
        &["csv", "tsv"]
    }

    fn name(&self) -> &'static str {
        "csv"
    }

    /// Read `path` as CSV/TSV, honouring `options.has_header` and
    /// `options.max_sample_rows`. Returns the full schema plus up to
    /// `max_sample_rows` typed sample rows; `row_count` counts every
    /// data row, sampled or not.
    ///
    /// # Errors
    ///
    /// Returns [`Error::ParseError`] when the file can't be opened or
    /// a header/row fails to parse.
    fn read(&self, path: &Path, options: &ReadOptions) -> Result<Table> {
        // Tab vs comma based on extension. The csv crate doesn't
        // sniff content; for that we'd need a separate detection
        // pass. Extension is right ~99% of the time.
        let delimiter = if path
            .extension()
            .and_then(|os| os.to_str())
            .map(str::to_ascii_lowercase)
            .as_deref()
            == Some("tsv")
        {
            b'\t'
        } else {
            b','
        };

        let mut reader = ::csv::ReaderBuilder::new()
            .has_headers(options.has_header)
            .delimiter(delimiter)
            .flexible(true) // tolerate ragged rows; pad with nulls in pad_and_infer
            .from_path(path)
            .map_err(|e| Error::ParseError(format!("csv open failed: {e}")))?;

        // Column names. With headers, blank header cells fall back to
        // `column_<i>`. In headerless mode the names are generated
        // from the first record's width inside the loop below.
        let mut column_names: Vec<String> = if options.has_header {
            reader
                .headers()
                .map_err(|e| Error::ParseError(format!("csv headers read failed: {e}")))?
                .iter()
                .enumerate()
                .map(|(idx, h)| {
                    if h.trim().is_empty() {
                        format!("column_{idx}")
                    } else {
                        h.to_string()
                    }
                })
                .collect()
        } else {
            Vec::new()
        };

        let mut sample_rows: Vec<Row> = Vec::with_capacity(options.max_sample_rows);
        let mut row_count: u64 = 0;

        for record in reader.records() {
            let record = record.map_err(|e| {
                Error::ParseError(format!("csv row {} parse failed: {e}", row_count + 1))
            })?;
            row_count += 1;

            // Headerless: the very first record fixes the column
            // count. Sampling it inline like any other row (rather
            // than stashing it and inserting after the loop) means
            // the first data row can never be evicted from the
            // sample by the `max_sample_rows` cap.
            if !options.has_header && row_count == 1 {
                column_names = (0..record.len()).map(|i| format!("column_{i}")).collect();
            }

            if sample_rows.len() < options.max_sample_rows {
                sample_rows.push(record.iter().map(parse_cell).collect());
            }
        }

        let columns = pad_and_infer(&column_names, &mut sample_rows);

        let mut metadata = std::collections::HashMap::new();
        metadata.insert(
            "delimiter".into(),
            if delimiter == b'\t' {
                "tab".into()
            } else {
                ",".into()
            },
        );

        Ok(Table {
            columns,
            sample_rows,
            row_count: Some(row_count),
            metadata,
        })
    }
}

/// Normalise every sample row to exactly `column_names.len()` cells —
/// short (ragged) rows are padded with `Value::Null`, long rows are
/// truncated — then run per-column type inference over the samples.
///
/// Kept separate from [`CsvReader::read`] so that function stays
/// under clippy's 100-line ceiling and the logic is reusable by
/// future callers.
fn pad_and_infer(column_names: &[String], sample_rows: &mut [Row]) -> Vec<Column> {
    let width = column_names.len();
    for row in sample_rows.iter_mut() {
        row.truncate(width);
        while row.len() < width {
            row.push(Value::Null);
        }
    }
    let mut columns = Vec::with_capacity(width);
    for (idx, name) in column_names.iter().enumerate() {
        // Gather the idx-th cell of every sampled row; after padding
        // above the index is always in range, but stay defensive.
        let samples: Vec<Value> = sample_rows
            .iter()
            .map(|row| row.get(idx).cloned().unwrap_or(Value::Null))
            .collect();
        let (data_type, nullable) = infer_column_type(&samples);
        columns.push(Column {
            name: name.clone(),
            data_type,
            nullable,
        });
    }
    columns
}

/// Map one raw CSV cell onto a typed `Value`.
///
/// Rules, checked in order:
/// - `""` → `Null` (CSV has no real null; empty is the conventional read).
/// - Whitespace-only → `Text` (kept verbatim — it is data, not null).
/// - `true` / `false` (case-insensitive) → `Bool`.
/// - Optional `-` plus ASCII digits → `Integer` when it fits `i64`.
/// - Decimal / exponent form → `Float` when `parse::<f64>` accepts it.
/// - ISO-8601 `YYYY-MM-DD` → `Date`.
/// - ISO-8601 `YYYY-MM-DDTHH:MM:SS[.fff][±HH:MM|Z]` → `DateTime`.
/// - Anything else → `Text` (the original, untrimmed string).
fn parse_cell(raw: &str) -> Value {
    if raw.is_empty() {
        return Value::Null;
    }
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        return Value::Text(raw.to_string());
    }
    // Bool first — it's the narrowest match.
    if trimmed.eq_ignore_ascii_case("true") {
        return Value::Bool(true);
    }
    if trimmed.eq_ignore_ascii_case("false") {
        return Value::Bool(false);
    }
    // Integer: byte-pattern check before parse(), so inputs like `+1`
    // (which i64's parse() would accept) don't read as integers.
    if is_plain_integer(trimmed) {
        if let Ok(n) = trimmed.parse::<i64>() {
            return Value::Integer(n);
        }
    }
    // Float only when the text *looks* fractional or exponential —
    // a plain digit run must never downgrade from Integer to Float.
    if trimmed.contains(&['.', 'e', 'E'][..]) {
        if let Ok(f) = trimmed.parse::<f64>() {
            return Value::Float(f);
        }
    }
    // Dates: rigid ISO-8601 shape matching instead of pulling in
    // `chrono` or `regex`, to keep the library dep-light. Other
    // dialects (`MM/DD/YYYY`, locale forms) fall through to Text —
    // callers wanting them can post-process.
    if looks_like_iso_date(trimmed) {
        Value::Date(trimmed.to_string())
    } else if looks_like_iso_datetime(trimmed) {
        Value::DateTime(trimmed.to_string())
    } else {
        Value::Text(raw.to_string())
    }
}

/// True when `s` is exactly one optional leading `-` followed by at
/// least one ASCII digit — the "plain integer" shape a human reads as
/// a whole number. Rejects `+1`, hex, empty, and a bare `-`.
fn is_plain_integer(s: &str) -> bool {
    let digits = s.strip_prefix('-').unwrap_or(s);
    !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit())
}

/// Rigid `YYYY-MM-DD` shape test: exactly 10 bytes, `-` at offsets 4
/// and 7, ASCII digits everywhere else. Calendar validity is
/// deliberately NOT checked (Feb 30 still matches) — that's the
/// caller's responsibility.
fn looks_like_iso_date(s: &str) -> bool {
    let bytes = s.as_bytes();
    if bytes.len() != 10 || bytes[4] != b'-' || bytes[7] != b'-' {
        return false;
    }
    bytes
        .iter()
        .enumerate()
        .all(|(idx, &b)| idx == 4 || idx == 7 || b.is_ascii_digit())
}

/// Shape test for the 19-char prefix `YYYY-MM-DDTHH:MM:SS`, with a
/// space accepted in place of `T` (some emitters use it). Examples
/// that match:
///
/// - `2024-01-15T12:00:00`
/// - `2024-01-15T12:00:00Z`
/// - `2024-01-15T12:00:00.123`
/// - `2024-01-15T12:00:00+02:00`
/// - `2024-01-15 12:00:00`
///
/// NOTE(review): bytes past offset 19 (fraction / timezone) are left
/// unvalidated — this is a shape check, not a parser, so any suffix
/// after a well-formed prefix is accepted.
fn looks_like_iso_datetime(s: &str) -> bool {
    let bytes = s.as_bytes();
    if bytes.len() < 19 {
        return false;
    }
    // Date/time separator at offset 10: `T` per ISO-8601, or space.
    if bytes[10] != b'T' && bytes[10] != b' ' {
        return false;
    }
    // `YYYY-MM-DD` prefix — dashes at offsets 4 and 7, digits elsewhere.
    for (idx, &b) in bytes[..10].iter().enumerate() {
        let ok = match idx {
            4 | 7 => b == b'-',
            _ => b.is_ascii_digit(),
        };
        if !ok {
            return false;
        }
    }
    // `HH:MM:SS` at bytes 11..19 — colons at relative offsets 2 and 5.
    bytes[11..19].iter().enumerate().all(|(idx, &b)| match idx {
        2 | 5 => b == b':',
        _ => b.is_ascii_digit(),
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Write `content` into a fresh `.csv` tempfile and return the
    /// handle; the file is removed when the handle is dropped.
    fn write_csv(content: &str) -> tempfile::NamedTempFile {
        let mut f = tempfile::Builder::new().suffix(".csv").tempfile().unwrap();
        f.write_all(content.as_bytes()).unwrap();
        f.flush().unwrap();
        f
    }

    #[test]
    fn extensions_handles_csv_and_tsv() {
        assert_eq!(CsvReader.extensions(), &["csv", "tsv"]);
    }

    #[test]
    fn name_identifies_backend() {
        assert_eq!(CsvReader.name(), "csv");
    }

    #[test]
    fn reads_basic_csv_with_header() {
        let f = write_csv("name,age\nAlice,30\nBob,25\n");
        let table = CsvReader.read(f.path(), &ReadOptions::default()).unwrap();
        assert_eq!(table.columns.len(), 2);
        assert_eq!(table.columns[0].name, "name");
        assert_eq!(table.columns[1].name, "age");
        assert_eq!(table.sample_rows.len(), 2);
        assert_eq!(table.row_count, Some(2));
    }

    #[test]
    fn type_inference_picks_integer_for_age() {
        let f = write_csv("name,age\nAlice,30\nBob,25\n");
        let table = CsvReader.read(f.path(), &ReadOptions::default()).unwrap();
        assert_eq!(table.columns[1].data_type, crate::DataType::Integer);
    }

    #[test]
    fn type_inference_picks_float_for_mixed_int_and_float() {
        let f = write_csv("v\n1\n2.5\n3\n");
        let table = CsvReader.read(f.path(), &ReadOptions::default()).unwrap();
        assert_eq!(table.columns[0].data_type, crate::DataType::Float);
    }

    #[test]
    fn type_inference_falls_back_to_text_on_mixed() {
        let f = write_csv("v\n1\nhello\n");
        let table = CsvReader.read(f.path(), &ReadOptions::default()).unwrap();
        assert_eq!(table.columns[0].data_type, crate::DataType::Text);
    }

    #[test]
    fn empty_cells_become_null_and_mark_column_nullable() {
        // The csv crate treats a blank (zero-length) line as no
        // record at all rather than as a row with one empty field,
        // so we test empty-cell handling on a multi-column CSV where
        // one row has an explicitly empty first field.
        let f = write_csv("v,name\n1,a\n,b\n3,c\n");
        let table = CsvReader.read(f.path(), &ReadOptions::default()).unwrap();
        assert_eq!(table.columns[0].data_type, crate::DataType::Integer);
        assert!(table.columns[0].nullable);
    }

    #[test]
    fn ragged_rows_get_padded_with_nulls() {
        // Second row has only one cell; should be padded to 2.
        let f = write_csv("a,b\n1,2\n3\n");
        let table = CsvReader.read(f.path(), &ReadOptions::default()).unwrap();
        assert_eq!(table.sample_rows[1].len(), 2);
        assert_eq!(table.sample_rows[1][1], Value::Null);
    }

    #[test]
    fn sample_cap_limits_rows() {
        use std::fmt::Write as _;
        let mut content = String::from("v\n");
        for i in 0..200 {
            writeln!(content, "{i}").unwrap();
        }
        let f = write_csv(&content);
        let table = CsvReader
            .read(f.path(), &ReadOptions::default().max_sample_rows(10))
            .unwrap();
        assert_eq!(table.sample_rows.len(), 10);
        // row_count counts every row, not just sampled ones.
        assert_eq!(table.row_count, Some(200));
    }

    #[test]
    fn empty_header_cell_falls_back_to_column_index() {
        let f = write_csv(",b\n1,2\n");
        let table = CsvReader.read(f.path(), &ReadOptions::default()).unwrap();
        assert_eq!(table.columns[0].name, "column_0");
        assert_eq!(table.columns[1].name, "b");
    }

    #[test]
    fn headerless_mode_generates_column_names() {
        // With has_header(false) the first record is data, not a
        // header row, and names come from its width.
        let f = write_csv("1,2\n3,4\n");
        let table = CsvReader
            .read(f.path(), &ReadOptions::default().has_header(false))
            .unwrap();
        assert_eq!(table.columns[0].name, "column_0");
        assert_eq!(table.columns[1].name, "column_1");
        assert_eq!(table.sample_rows.len(), 2);
        assert_eq!(table.row_count, Some(2));
    }

    #[test]
    fn missing_file_returns_typed_error() {
        let result = CsvReader.read(Path::new("/nonexistent.csv"), &ReadOptions::default());
        assert!(matches!(result, Err(Error::ParseError(_))));
    }

    #[test]
    fn parse_cell_recognises_basic_types() {
        assert_eq!(parse_cell(""), Value::Null);
        assert_eq!(parse_cell("42"), Value::Integer(42));
        assert_eq!(parse_cell("-7"), Value::Integer(-7));
        assert_eq!(parse_cell("2.5"), Value::Float(2.5));
        assert_eq!(parse_cell("true"), Value::Bool(true));
        assert_eq!(parse_cell("FALSE"), Value::Bool(false));
        assert_eq!(parse_cell("hello"), Value::Text("hello".into()));
    }

    #[test]
    fn parse_cell_recognises_iso_dates() {
        assert_eq!(parse_cell("2024-01-15"), Value::Date("2024-01-15".into()));
        assert_eq!(parse_cell("1970-12-31"), Value::Date("1970-12-31".into()));
    }

    #[test]
    fn parse_cell_recognises_iso_datetimes() {
        assert_eq!(
            parse_cell("2024-01-15T12:00:00"),
            Value::DateTime("2024-01-15T12:00:00".into())
        );
        assert_eq!(
            parse_cell("2024-01-15T12:00:00Z"),
            Value::DateTime("2024-01-15T12:00:00Z".into())
        );
        assert_eq!(
            parse_cell("2024-01-15T12:00:00.123"),
            Value::DateTime("2024-01-15T12:00:00.123".into())
        );
        assert_eq!(
            parse_cell("2024-01-15T12:00:00+02:00"),
            Value::DateTime("2024-01-15T12:00:00+02:00".into())
        );
        // Space-separated form (some emitters use this instead of `T`).
        assert_eq!(
            parse_cell("2024-01-15 12:00:00"),
            Value::DateTime("2024-01-15 12:00:00".into())
        );
    }

    #[test]
    fn parse_cell_rejects_non_iso_date_dialects() {
        // MM/DD/YYYY and similar locale forms fall through to Text
        // — caller post-processes if they want them.
        assert_eq!(parse_cell("01/15/2024"), Value::Text("01/15/2024".into()));
        // Pseudo-ISO that doesn't fit the rigid pattern.
        assert_eq!(parse_cell("2024-1-15"), Value::Text("2024-1-15".into()));
    }

    #[test]
    fn date_column_inferred_correctly_in_csv() {
        let f = write_csv("created\n2024-01-15\n2024-02-20\n2024-03-31\n");
        let table = CsvReader.read(f.path(), &ReadOptions::default()).unwrap();
        assert_eq!(table.columns[0].data_type, crate::DataType::Date);
    }

    #[test]
    fn date_plus_datetime_widens_to_datetime() {
        // Mixed Date + DateTime samples should widen to DateTime.
        let f = write_csv("ts\n2024-01-15\n2024-02-20T12:00:00\n");
        let table = CsvReader.read(f.path(), &ReadOptions::default()).unwrap();
        assert_eq!(table.columns[0].data_type, crate::DataType::DateTime);
    }
}