use crate::{infer_column_type, Column, Error, ReadOptions, Reader, Result, Row, Table, Value};
use std::path::Path;
/// Stateless reader backend for comma- and tab-separated text files.
///
/// The type carries no configuration; every read is driven by the
/// `ReadOptions` passed to the `Reader::read` implementation.
#[derive(Debug, Clone, Default)]
pub struct CsvReader;

impl CsvReader {
    /// Creates a new reader. Equivalent to `CsvReader::default()`.
    #[must_use]
    pub fn new() -> Self {
        Self
    }
}
impl Reader for CsvReader {
    /// File extensions handled by this backend.
    fn extensions(&self) -> &[&'static str] {
        &["csv", "tsv"]
    }

    /// Short identifier of this backend.
    fn name(&self) -> &'static str {
        "csv"
    }

    /// Reads the delimited file at `path` into a [`Table`].
    ///
    /// The delimiter is a tab when the file extension is `tsv` (compared
    /// ASCII-case-insensitively) and a comma otherwise. Up to
    /// `options.max_sample_rows` records are kept, in file order, as sample
    /// rows; every record is counted in `row_count`. Column names come from
    /// the header row when `options.has_header` is set (blank header cells
    /// become `column_{idx}`); otherwise they are generated as `column_{i}`
    /// from the width of the first record.
    ///
    /// # Errors
    ///
    /// Returns [`Error::ParseError`] when the file cannot be opened, the
    /// header row cannot be read, or any record fails to parse.
    fn read(&self, path: &Path, options: &ReadOptions) -> Result<Table> {
        // Bound the up-front allocation: `max_sample_rows` is caller-supplied
        // and a "no limit" value such as `usize::MAX` would otherwise make
        // `Vec::with_capacity` panic. The vector still grows on demand.
        const SAMPLE_PREALLOC_LIMIT: usize = 1024;

        let is_tsv = path
            .extension()
            .and_then(|os| os.to_str())
            .map_or(false, |ext| ext.eq_ignore_ascii_case("tsv"));
        let delimiter = if is_tsv { b'\t' } else { b',' };

        let mut reader = ::csv::ReaderBuilder::new()
            .has_headers(options.has_header)
            .delimiter(delimiter)
            // Ragged records are tolerated here and normalised by `pad_and_infer`.
            .flexible(true)
            .from_path(path)
            .map_err(|e| Error::ParseError(format!("csv open failed: {e}")))?;

        let column_names: Vec<String> = if options.has_header {
            reader
                .headers()
                .map_err(|e| Error::ParseError(format!("csv headers read failed: {e}")))?
                .iter()
                .enumerate()
                .map(|(idx, h)| {
                    if h.trim().is_empty() {
                        format!("column_{idx}")
                    } else {
                        h.to_string()
                    }
                })
                .collect()
        } else {
            Vec::new()
        };

        let mut sample_rows: Vec<Row> =
            Vec::with_capacity(options.max_sample_rows.min(SAMPLE_PREALLOC_LIMIT));
        let mut row_count: u64 = 0;
        let mut first_record_width: Option<usize> = None;

        for record in reader.records() {
            let record = record.map_err(|e| {
                Error::ParseError(format!("csv row {} parse failed: {e}", row_count + 1))
            })?;
            row_count += 1;
            // Remember the first record's width for headerless column naming.
            // The record itself is sampled like any other, so it can never be
            // pushed out of the sample by later rows.
            if first_record_width.is_none() {
                first_record_width = Some(record.len());
            }
            if sample_rows.len() < options.max_sample_rows {
                sample_rows.push(record.iter().map(parse_cell).collect());
            }
        }

        let final_column_names: Vec<String> = if options.has_header {
            column_names
        } else {
            (0..first_record_width.unwrap_or(0))
                .map(|i| format!("column_{i}"))
                .collect()
        };

        let columns = pad_and_infer(&final_column_names, &mut sample_rows);

        let mut metadata = std::collections::HashMap::new();
        metadata.insert(
            "delimiter".into(),
            if delimiter == b'\t' {
                "tab".into()
            } else {
                ",".into()
            },
        );

        Ok(Table {
            columns,
            sample_rows,
            row_count: Some(row_count),
            metadata,
        })
    }
}
/// Forces every sample row to the width of `column_names` and builds one
/// [`Column`] per name.
///
/// Short rows are padded with `Value::Null`, long rows are truncated. Each
/// column's type and nullability come from `infer_column_type` applied to
/// that column's sampled values.
fn pad_and_infer(column_names: &[String], sample_rows: &mut [Row]) -> Vec<Column> {
    let expected = column_names.len();

    for row in sample_rows.iter_mut() {
        for _ in row.len()..expected {
            row.push(Value::Null);
        }
        row.truncate(expected);
    }

    let mut columns = Vec::with_capacity(expected);
    for (position, name) in column_names.iter().enumerate() {
        let samples: Vec<Value> = sample_rows
            .iter()
            .map(|row| row.get(position).map_or(Value::Null, Clone::clone))
            .collect();
        let (data_type, nullable) = infer_column_type(&samples);
        columns.push(Column {
            name: name.clone(),
            data_type,
            nullable,
        });
    }
    columns
}
/// Converts one raw cell into a typed [`Value`].
///
/// Rules are tried in this order on the whitespace-trimmed text:
/// empty input is `Null`; whitespace-only input stays `Text` (untrimmed);
/// `true`/`false` (ASCII case-insensitive) are `Bool`; an optionally negative
/// digit run that fits `i64` is `Integer`; text containing `.`, `e` or `E`
/// that parses as `f64` is `Float`; then ISO date and ISO datetime shapes.
/// Anything else is returned as `Text` holding the original, untrimmed cell.
fn parse_cell(raw: &str) -> Value {
    if raw.is_empty() {
        return Value::Null;
    }

    // Text fallbacks always keep the cell exactly as it appeared in the file.
    let as_text = || Value::Text(raw.to_string());

    let text = raw.trim();
    if text.is_empty() {
        return as_text();
    }

    for (literal, flag) in [("true", true), ("false", false)] {
        if text.eq_ignore_ascii_case(literal) {
            return Value::Bool(flag);
        }
    }

    if is_plain_integer(text) {
        if let Ok(whole) = text.parse::<i64>() {
            return Value::Integer(whole);
        }
    }

    // Only attempt a float parse when the text carries a float marker.
    let float_shaped = text.chars().any(|c| matches!(c, '.' | 'e' | 'E'));
    if float_shaped {
        if let Ok(real) = text.parse::<f64>() {
            return Value::Float(real);
        }
    }

    if looks_like_iso_date(text) {
        Value::Date(text.to_string())
    } else if looks_like_iso_datetime(text) {
        Value::DateTime(text.to_string())
    } else {
        as_text()
    }
}
/// Returns `true` when `s` is one or more ASCII digits, optionally preceded
/// by a single `-`. A leading `+`, embedded whitespace, or an empty digit run
/// all yield `false`.
fn is_plain_integer(s: &str) -> bool {
    let digits = s.strip_prefix('-').unwrap_or(s);
    !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit())
}
/// Returns `true` when `s` has exactly the shape `DDDD-DD-DD` (ASCII digits
/// separated by hyphens). Only the shape is checked; month and day values
/// are not range-validated.
fn looks_like_iso_date(s: &str) -> bool {
    match s.as_bytes() {
        [y1, y2, y3, y4, b'-', m1, m2, b'-', d1, d2] => [y1, y2, y3, y4, m1, m2, d1, d2]
            .iter()
            .all(|b| b.is_ascii_digit()),
        _ => false,
    }
}
/// Returns `true` when `s` has the shape of an ISO 8601 / RFC 3339 style
/// timestamp.
///
/// Accepted: `YYYY-MM-DD`, a `T` or single-space separator, `HH:MM:SS`, then
/// optionally fractional seconds (`.` or `,` followed by at least one digit)
/// and optionally a zone designator (`Z`, `±HH`, `±HHMM` or `±HH:MM`).
/// Any other trailing text is rejected, so free text that merely starts with
/// a timestamp is not classified as a datetime. Only the shape is checked;
/// field values are not range-validated.
fn looks_like_iso_datetime(s: &str) -> bool {
    const CORE_LEN: usize = 19; // "YYYY-MM-DDTHH:MM:SS"

    let bytes = s.as_bytes();
    if bytes.len() < CORE_LEN {
        return false;
    }
    let (core, suffix) = bytes.split_at(CORE_LEN);

    let core_ok = core.iter().enumerate().all(|(idx, &b)| match idx {
        4 | 7 => b == b'-',
        10 => matches!(b, b'T' | b' '),
        13 | 16 => b == b':',
        _ => b.is_ascii_digit(),
    });

    core_ok && is_valid_datetime_suffix(suffix)
}

/// Validates what follows the seconds field of a timestamp: optional
/// fractional seconds, then an optional zone designator, and nothing else.
fn is_valid_datetime_suffix(suffix: &[u8]) -> bool {
    // Optional fractional seconds; a bare separator with no digits is invalid.
    let zone = match suffix.split_first() {
        Some((&(b'.' | b','), after)) => {
            let digits = after.iter().take_while(|b| b.is_ascii_digit()).count();
            if digits == 0 {
                return false;
            }
            &after[digits..]
        }
        _ => suffix,
    };

    match zone {
        [] | [b'Z'] | [b'z'] => true,
        [b'+' | b'-', offset @ ..] => match offset {
            [h1, h2] => h1.is_ascii_digit() && h2.is_ascii_digit(),
            [h1, h2, m1, m2] | [h1, h2, b':', m1, m2] => {
                [h1, h2, m1, m2].iter().all(|b| b.is_ascii_digit())
            }
            _ => false,
        },
        _ => false,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Writes `content` to a fresh temporary `.csv` file and returns its handle.
    fn write_csv(content: &str) -> tempfile::NamedTempFile {
        let mut file = tempfile::Builder::new().suffix(".csv").tempfile().unwrap();
        file.write_all(content.as_bytes()).unwrap();
        file.flush().unwrap();
        file
    }

    /// Writes `content` to a temporary `.csv` file and reads it back with `options`.
    fn read_with(content: &str, options: &ReadOptions) -> Table {
        let file = write_csv(content);
        CsvReader.read(file.path(), options).unwrap()
    }

    /// Same as [`read_with`], using `ReadOptions::default()`.
    fn read_default(content: &str) -> Table {
        read_with(content, &ReadOptions::default())
    }

    #[test]
    fn extensions_handles_csv_and_tsv() {
        assert_eq!(CsvReader.extensions(), &["csv", "tsv"]);
    }

    #[test]
    fn name_identifies_backend() {
        assert_eq!(CsvReader.name(), "csv");
    }

    #[test]
    fn reads_basic_csv_with_header() {
        let table = read_default("name,age\nAlice,30\nBob,25\n");
        assert_eq!(table.columns.len(), 2);
        assert_eq!(table.columns[0].name, "name");
        assert_eq!(table.columns[1].name, "age");
        assert_eq!(table.sample_rows.len(), 2);
        assert_eq!(table.row_count, Some(2));
    }

    #[test]
    fn type_inference_picks_integer_for_age() {
        let table = read_default("name,age\nAlice,30\nBob,25\n");
        assert_eq!(table.columns[1].data_type, crate::DataType::Integer);
    }

    #[test]
    fn type_inference_picks_float_for_mixed_int_and_float() {
        let table = read_default("v\n1\n2.5\n3\n");
        assert_eq!(table.columns[0].data_type, crate::DataType::Float);
    }

    #[test]
    fn type_inference_falls_back_to_text_on_mixed() {
        let table = read_default("v\n1\nhello\n");
        assert_eq!(table.columns[0].data_type, crate::DataType::Text);
    }

    #[test]
    fn empty_cells_become_null_and_mark_column_nullable() {
        let table = read_default("v,name\n1,a\n,b\n3,c\n");
        let first = &table.columns[0];
        assert_eq!(first.data_type, crate::DataType::Integer);
        assert!(first.nullable);
    }

    #[test]
    fn ragged_rows_get_padded_with_nulls() {
        let table = read_default("a,b\n1,2\n3\n");
        let short_row = &table.sample_rows[1];
        assert_eq!(short_row.len(), 2);
        assert_eq!(short_row[1], Value::Null);
    }

    #[test]
    fn sample_cap_limits_rows() {
        // Header line followed by the numbers 0..200, one per line.
        let content: String = std::iter::once(String::from("v\n"))
            .chain((0..200).map(|i| format!("{i}\n")))
            .collect();
        let table = read_with(&content, &ReadOptions::default().max_sample_rows(10));
        assert_eq!(table.sample_rows.len(), 10);
        assert_eq!(table.row_count, Some(200));
    }

    #[test]
    fn empty_header_cell_falls_back_to_column_index() {
        let table = read_default(",b\n1,2\n");
        assert_eq!(table.columns[0].name, "column_0");
        assert_eq!(table.columns[1].name, "b");
    }

    #[test]
    fn headerless_mode_generates_column_names() {
        let table = read_with("1,2\n3,4\n", &ReadOptions::default().has_header(false));
        assert_eq!(table.columns[0].name, "column_0");
        assert_eq!(table.columns[1].name, "column_1");
        assert_eq!(table.sample_rows.len(), 2);
        assert_eq!(table.row_count, Some(2));
    }

    #[test]
    fn missing_file_returns_typed_error() {
        let outcome = CsvReader.read(Path::new("/nonexistent.csv"), &ReadOptions::default());
        assert!(matches!(outcome, Err(Error::ParseError(_))));
    }

    #[test]
    fn parse_cell_recognises_basic_types() {
        let cases = [
            ("", Value::Null),
            ("42", Value::Integer(42)),
            ("-7", Value::Integer(-7)),
            ("2.5", Value::Float(2.5)),
            ("true", Value::Bool(true)),
            ("FALSE", Value::Bool(false)),
            ("hello", Value::Text("hello".into())),
        ];
        for (raw, expected) in cases {
            assert_eq!(parse_cell(raw), expected);
        }
    }

    #[test]
    fn parse_cell_recognises_iso_dates() {
        for raw in ["2024-01-15", "1970-12-31"] {
            assert_eq!(parse_cell(raw), Value::Date(raw.into()));
        }
    }

    #[test]
    fn parse_cell_recognises_iso_datetimes() {
        let accepted = [
            "2024-01-15T12:00:00",
            "2024-01-15T12:00:00Z",
            "2024-01-15T12:00:00.123",
            "2024-01-15T12:00:00+02:00",
            "2024-01-15 12:00:00",
        ];
        for raw in accepted {
            assert_eq!(parse_cell(raw), Value::DateTime(raw.into()));
        }
    }

    #[test]
    fn parse_cell_rejects_non_iso_date_dialects() {
        for raw in ["01/15/2024", "2024-1-15"] {
            assert_eq!(parse_cell(raw), Value::Text(raw.into()));
        }
    }

    #[test]
    fn date_column_inferred_correctly_in_csv() {
        let table = read_default("created\n2024-01-15\n2024-02-20\n2024-03-31\n");
        assert_eq!(table.columns[0].data_type, crate::DataType::Date);
    }

    #[test]
    fn date_plus_datetime_widens_to_datetime() {
        let table = read_default("ts\n2024-01-15\n2024-02-20T12:00:00\n");
        assert_eq!(table.columns[0].data_type, crate::DataType::DateTime);
    }
}