#![allow(clippy::should_implement_trait)]
#[allow(unused_imports)]
use super::functions::*;
use crate::{Error, Result};
use std::fs::{File, OpenOptions};
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;
/// Output formatting options read by `ConfigurableCsvWriter`.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct CsvWriterConfig {
/// Separator inserted between adjacent fields of a row.
pub delimiter: char,
/// Text appended after every header or data row that is written.
pub line_ending: String,
/// When `true`, header and string cells are always wrapped in double quotes
/// (with embedded quotes doubled). `write_f64_row` does not consult this flag.
pub quote_all: bool,
/// Number of digits after the decimal point used by `write_f64_row`.
pub precision: usize,
}
/// In-memory CSV tokenizer over a borrowed string; the work happens in `parse_all`.
#[allow(dead_code)]
pub struct CsvParser<'a> {
/// Complete CSV text to parse.
pub(super) input: &'a str,
/// Field separator, recognized only outside quoted sections.
pub(super) delimiter: char,
/// When set, records whose first field (ignoring leading whitespace) starts
/// with this character are dropped as comments. `None` disables the check.
pub(super) comment_prefix: Option<char>,
}
#[allow(dead_code)]
impl<'a> CsvParser<'a> {
pub fn new(input: &'a str, delimiter: char) -> Self {
Self {
input,
delimiter,
comment_prefix: None,
}
}
pub fn with_comment_prefix(mut self, prefix: char) -> Self {
self.comment_prefix = Some(prefix);
self
}
pub fn parse_all(self) -> std::result::Result<Vec<CsvRecord>, Error> {
let mut records = Vec::new();
let mut chars = self.input.chars().peekable();
'outer: loop {
if chars.peek().is_none() {
break;
}
let mut fields: Vec<String> = Vec::new();
let mut field = String::new();
let mut in_quotes = false;
loop {
match chars.next() {
None => {
if in_quotes {
return Err(Error::Parse(
"unterminated quoted field at EOF".to_string(),
));
}
fields.push(field);
break;
}
Some('"') if !in_quotes => {
in_quotes = true;
}
Some('"') if in_quotes => {
if chars.peek() == Some(&'"') {
chars.next();
field.push('"');
} else {
in_quotes = false;
}
}
Some('\\') if in_quotes => match chars.next() {
Some('n') => field.push('\n'),
Some('t') => field.push('\t'),
Some('r') => field.push('\r'),
Some('"') => field.push('"'),
Some('\\') => field.push('\\'),
Some(c) => {
field.push('\\');
field.push(c);
}
None => {
return Err(Error::Parse("trailing backslash at EOF".to_string()));
}
},
Some('\r') if !in_quotes => {
if chars.peek() == Some(&'\n') {
chars.next();
}
fields.push(field);
break;
}
Some('\n') if !in_quotes => {
fields.push(field);
break;
}
Some(c) if c == self.delimiter && !in_quotes => {
fields.push(field.clone());
field = String::new();
}
Some(c) => {
field.push(c);
}
#[allow(unreachable_patterns)]
_ => {}
}
}
if let Some(prefix) = self.comment_prefix
&& fields
.first()
.map(|f| f.trim_start().starts_with(prefix))
.unwrap_or(false)
{
continue 'outer;
}
if fields.len() == 1 && fields[0].trim().is_empty() {
continue 'outer;
}
records.push(CsvRecord { fields });
}
Ok(records)
}
}
/// A single CSV record: the ordered field values of one row.
#[derive(Debug, Clone, PartialEq)]
pub struct CsvRecord {
    /// Field values in the order they appeared in the row.
    pub fields: Vec<String>,
}

impl CsvRecord {
    /// Number of fields held by this record.
    pub fn len(&self) -> usize {
        let Self { fields } = self;
        fields.len()
    }

    /// `true` when the record holds no fields at all.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the field at `index`, or an empty string when `index` is out
    /// of range (this accessor never panics).
    pub fn get(&self, index: usize) -> &str {
        match self.fields.get(index) {
            Some(value) => value.as_str(),
            None => "",
        }
    }
}
/// Aggregation selector.
///
/// NOTE(review): nothing in this part of the file consumes this enum; the
/// variant names suggest the reduction applied to grouped values when
/// pivoting — confirm against the caller.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PivotAgg {
Sum,
Mean,
Count,
Min,
Max,
}
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct CsvTable {
pub headers: Vec<String>,
pub rows: Vec<Vec<String>>,
}
#[allow(dead_code)]
impl CsvTable {
pub fn new(headers: Vec<String>) -> Self {
Self {
headers,
rows: Vec::new(),
}
}
pub fn from_str(data: &str, delimiter: char) -> std::result::Result<Self, Error> {
let parser = CsvParser::new(data, delimiter).with_comment_prefix('#');
let mut records = parser.parse_all()?;
if records.is_empty() {
return Err(Error::Parse("CSV table is empty".to_string()));
}
let header_rec = records.remove(0);
let headers: Vec<String> = header_rec
.fields
.iter()
.map(|s| s.trim().to_string())
.collect();
let ncols = headers.len();
let mut rows = Vec::new();
for rec in records {
let mut row: Vec<String> = rec
.fields
.into_iter()
.map(|s| s.trim().to_string())
.collect();
while row.len() < ncols {
row.push(String::new());
}
rows.push(row);
}
Ok(Self { headers, rows })
}
pub fn to_csv_string(&self, delimiter: char) -> String {
let mut out = String::new();
out.push_str(&self.headers.join(&delimiter.to_string()));
out.push('\n');
for row in &self.rows {
let line: Vec<String> = row.iter().map(|f| quote_field(f, delimiter)).collect();
out.push_str(&line.join(&delimiter.to_string()));
out.push('\n');
}
out
}
pub fn column_index(&self, name: &str) -> std::result::Result<usize, Error> {
self.headers
.iter()
.position(|h| h == name)
.ok_or_else(|| Error::Parse(format!("column '{}' not found", name)))
}
pub fn column_values(&self, name: &str) -> std::result::Result<Vec<&str>, Error> {
let idx = self.column_index(name)?;
Ok(self.rows.iter().map(|r| r[idx].as_str()).collect())
}
pub fn column_f64(&self, name: &str) -> std::result::Result<Vec<f64>, Error> {
let idx = self.column_index(name)?;
self.rows
.iter()
.enumerate()
.map(|(i, r)| {
let s = r[idx].trim();
if s.is_empty() {
Ok(f64::NAN)
} else {
s.parse::<f64>().map_err(|_| {
Error::Parse(format!("row {}: cannot parse '{}' as f64", i, s))
})
}
})
.collect()
}
pub fn row_count(&self) -> usize {
self.rows.len()
}
pub fn col_count(&self) -> usize {
self.headers.len()
}
}
/// Formats numeric rows as delimited text without touching the filesystem.
#[allow(dead_code)]
pub struct InMemoryCsvWriter {
/// Column names; their count is the required length of every written row.
pub(super) columns: Vec<String>,
/// Separator placed between values and between column names.
pub(super) delimiter: char,
/// Digits after the decimal point for each value (`new` sets this to 6).
pub(super) precision: usize,
/// Rows collected through `add_row`. None of the formatting methods in this
/// impl read them back.
pub(super) rows: Vec<Vec<f64>>,
}
#[allow(dead_code)]
impl InMemoryCsvWriter {
    /// Builds a writer for the given column names with a precision of six
    /// decimal places and no stored rows.
    pub fn new(columns: &[&str], delimiter: char) -> Self {
        let columns = columns.iter().map(|name| (*name).to_owned()).collect();
        Self {
            columns,
            delimiter,
            precision: 6,
            rows: Vec::new(),
        }
    }

    /// Returns the writer with its decimal precision replaced.
    pub fn with_precision(self, precision: usize) -> Self {
        Self { precision, ..self }
    }

    /// Formats one row (without a line terminator).
    ///
    /// # Errors
    ///
    /// Returns `Error::Parse` when `values` does not have exactly one entry
    /// per column.
    pub fn write_row(&self, values: &[f64]) -> std::result::Result<String, Error> {
        let expected = self.columns.len();
        if values.len() != expected {
            return Err(Error::Parse(format!(
                "expected {} values, got {}",
                expected,
                values.len()
            )));
        }
        let digits = self.precision;
        let mut line = String::new();
        for (position, value) in values.iter().enumerate() {
            if position > 0 {
                line.push(self.delimiter);
            }
            line.push_str(&format!("{:.*}", digits, value));
        }
        Ok(line)
    }

    /// Returns the column names joined by the delimiter (no line terminator).
    pub fn write_header(&self) -> String {
        let separator = self.delimiter.to_string();
        self.columns.join(separator.as_str())
    }

    /// Stores `values` in the writer's internal row list.
    pub fn add_row(&mut self, values: Vec<f64>) {
        self.rows.push(values);
    }

    /// Renders the header followed by every row of `rows`, each terminated by
    /// `\n`. Rows whose length does not match the column count are skipped
    /// silently (best effort; this method cannot report an error).
    pub fn write_all(&self, rows: &[Vec<f64>]) -> String {
        let mut text = self.write_header();
        text.push('\n');
        for line in rows.iter().filter_map(|row| self.write_row(row).ok()) {
            text.push_str(&line);
            text.push('\n');
        }
        text
    }
}
pub struct CsvReader;
impl CsvReader {
pub fn read(path: &str) -> Result<(Vec<String>, Vec<Vec<f64>>)> {
let file = File::open(Path::new(path))?;
let reader = BufReader::new(file);
let mut lines = reader.lines();
let header_line = lines
.next()
.ok_or_else(|| Error::Parse("empty CSV file".to_string()))??;
let headers: Vec<String> = header_line
.split(',')
.map(|s| s.trim().to_string())
.collect();
let mut rows = Vec::new();
for line in lines {
let line = line?;
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let row: std::result::Result<Vec<f64>, _> = trimmed
.split(',')
.map(|s| s.trim().parse::<f64>())
.collect();
let row = row.map_err(|e| Error::Parse(e.to_string()))?;
rows.push(row);
}
Ok((headers, rows))
}
}
/// Column classification returned by `TypedCsvReader::column_type`, which
/// obtains it from `infer_column_type`.
///
/// NOTE(review): the inference rules live in `infer_column_type`, which is
/// defined outside this part of the file; the variants are assumed to mean
/// what their names say — confirm there.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ColumnType {
Integer,
Float,
Boolean,
Text,
Empty,
}
/// Record-at-a-time CSV reader over a file; see `open` and `next_record`.
#[allow(dead_code)]
pub struct CsvStreamParser {
/// Buffered handle positioned after the header line once `open` returns.
pub(super) reader: BufReader<File>,
/// Field separator passed to `split_csv_line`.
pub(super) delimiter: char,
/// Trimmed column names taken from the first line of the file.
pub(super) headers: Vec<String>,
/// Raw text of the record currently being assembled; it spans several
/// physical lines while a quoted field is still open.
pub(super) pending: String,
/// `true` while an odd number of `"` characters has been seen in `pending`.
pub(super) open_quotes: bool,
}
#[allow(dead_code)]
impl CsvStreamParser {
pub fn open(path: &str, delimiter: char) -> std::result::Result<Self, Error> {
let file = File::open(Path::new(path))?;
let mut reader = BufReader::new(file);
let mut header_line = String::new();
reader.read_line(&mut header_line)?;
let headers = split_csv_line(
header_line.trim_end_matches('\n').trim_end_matches('\r'),
delimiter,
)
.into_iter()
.map(|s| s.trim().to_string())
.collect();
Ok(Self {
reader,
delimiter,
headers,
pending: String::new(),
open_quotes: false,
})
}
pub fn headers(&self) -> &[String] {
&self.headers
}
pub fn next_record(&mut self) -> std::result::Result<Option<CsvRecord>, Error> {
loop {
let mut line = String::new();
let bytes_read = self.reader.read_line(&mut line)?;
if bytes_read == 0 {
if self.pending.is_empty() {
return Ok(None);
}
if self.open_quotes {
return Err(Error::Parse("unterminated quoted field at EOF".to_string()));
}
let record = self.flush_pending()?;
return Ok(Some(record));
}
for ch in line.chars() {
if ch == '"' {
self.open_quotes = !self.open_quotes;
}
}
self.pending.push_str(&line);
if !self.open_quotes {
let record = self.flush_pending()?;
if record.fields.len() == 1 && record.fields[0].trim().is_empty() {
continue;
}
return Ok(Some(record));
}
}
}
fn flush_pending(&mut self) -> std::result::Result<CsvRecord, Error> {
let line = std::mem::take(&mut self.pending);
let trimmed = line.trim_end_matches('\n').trim_end_matches('\r');
let fields = split_csv_line(trimmed, self.delimiter);
Ok(CsvRecord {
fields: fields.into_iter().map(|s| s.trim().to_string()).collect(),
})
}
}
/// Numeric CSV data parsed from a string and held in memory.
#[allow(dead_code)]
pub struct InMemoryCsvReader {
/// Trimmed column names from the first non-blank, non-comment line.
pub(super) headers: Vec<String>,
/// Parsed cells, row by row; `None` marks a cell that was empty or did not
/// parse as `f64`.
pub(super) rows: Vec<Vec<Option<f64>>>,
}
#[allow(dead_code)]
impl InMemoryCsvReader {
pub fn from_str(data: &str) -> std::result::Result<Self, Error> {
Self::parse_with_delimiter(data, ',')
}
pub fn parse_with_delimiter(data: &str, delim: char) -> std::result::Result<Self, Error> {
let mut non_empty_lines: Vec<&str> = data
.lines()
.filter(|l| {
let t = l.trim();
!t.is_empty() && !t.starts_with('#')
})
.collect();
if non_empty_lines.is_empty() {
return Err(Error::Parse("CSV input is empty".to_string()));
}
let header_line = non_empty_lines.remove(0);
let headers: Vec<String> = split_csv_line(header_line, delim)
.into_iter()
.map(|s| s.trim().to_string())
.collect();
if headers.is_empty() {
return Err(Error::Parse("no headers found".to_string()));
}
let mut rows: Vec<Vec<Option<f64>>> = Vec::new();
for line in &non_empty_lines {
let fields = split_csv_line(line, delim);
let parsed: Vec<Option<f64>> = fields
.iter()
.map(|f| {
let t = f.trim();
if t.is_empty() {
None
} else {
t.parse::<f64>().ok()
}
})
.collect();
rows.push(parsed);
}
Ok(Self { headers, rows })
}
pub fn get_column_f64(&self, name: &str) -> std::result::Result<Vec<f64>, Error> {
let idx = self
.headers
.iter()
.position(|h| h == name)
.ok_or_else(|| Error::Parse(format!("column '{}' not found", name)))?;
let col: Vec<f64> = self
.rows
.iter()
.map(|row| row.get(idx).copied().flatten().unwrap_or(f64::NAN))
.collect();
Ok(col)
}
pub fn get_row_count(&self) -> usize {
self.rows.len()
}
pub fn headers(&self) -> &[String] {
&self.headers
}
pub fn column_stats(&self, name: &str) -> std::result::Result<(f64, f64, f64, f64), Error> {
let col = self.get_column_f64(name)?;
let valid: Vec<f64> = col.into_iter().filter(|v| !v.is_nan()).collect();
if valid.is_empty() {
return Err(Error::Parse(format!(
"column '{}' has no valid numeric data",
name
)));
}
let n = valid.len() as f64;
let min = valid.iter().cloned().fold(f64::INFINITY, f64::min);
let max = valid.iter().cloned().fold(f64::NEG_INFINITY, f64::max);
let mean = valid.iter().sum::<f64>() / n;
let variance = valid.iter().map(|v| (v - mean).powi(2)).sum::<f64>() / n;
let std = variance.sqrt();
Ok((min, max, mean, std))
}
}
/// Typed column access on top of a parsed `CsvTable`.
#[allow(dead_code)]
pub struct TypedCsvReader {
/// The underlying string table, produced by `CsvTable::from_str`.
pub(super) table: CsvTable,
}
#[allow(dead_code)]
impl TypedCsvReader {
pub fn from_str(data: &str) -> std::result::Result<Self, Error> {
let table = CsvTable::from_str(data, ',')?;
Ok(Self { table })
}
pub fn with_delimiter(data: &str, delimiter: char) -> std::result::Result<Self, Error> {
let table = CsvTable::from_str(data, delimiter)?;
Ok(Self { table })
}
pub fn column_type(&self, name: &str) -> std::result::Result<ColumnType, Error> {
let idx = self.table.column_index(name)?;
let values: Vec<&str> = self.table.rows.iter().map(|r| r[idx].as_str()).collect();
Ok(infer_column_type(&values))
}
pub fn column_as_i64(&self, name: &str) -> std::result::Result<Vec<i64>, Error> {
let idx = self.table.column_index(name)?;
self.table
.rows
.iter()
.enumerate()
.map(|(i, r)| {
let s = r[idx].trim();
s.parse::<i64>()
.map_err(|_| Error::Parse(format!("row {}: cannot parse '{}' as i64", i, s)))
})
.collect()
}
pub fn column_as_f64(&self, name: &str) -> std::result::Result<Vec<f64>, Error> {
self.table.column_f64(name)
}
pub fn column_as_bool(&self, name: &str) -> std::result::Result<Vec<bool>, Error> {
let idx = self.table.column_index(name)?;
self.table
.rows
.iter()
.enumerate()
.map(|(i, r)| {
let s = r[idx].trim().to_lowercase();
match s.as_str() {
"true" | "1" | "yes" => Ok(true),
"false" | "0" | "no" => Ok(false),
other => Err(Error::Parse(format!(
"row {}: cannot parse '{}' as bool",
i, other
))),
}
})
.collect()
}
pub fn table(&self) -> &CsvTable {
&self.table
}
pub fn headers(&self) -> &[String] {
&self.table.headers
}
pub fn row_count(&self) -> usize {
self.table.row_count()
}
}
/// Result container for a comparison between two CSV row sets.
///
/// NOTE(review): the code that fills this struct is not visible here; the
/// field meanings below are inferred from their names — confirm against the
/// producer.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct CsvDiff {
/// Presumably rows present only in the "before" input.
pub removed: Vec<Vec<String>>,
/// Presumably rows present only in the "after" input.
pub added: Vec<Vec<String>>,
/// Rows reported as changed, each with its before/after contents.
pub changed: Vec<CsvChangedRow>,
}
/// File-backed writer for comma-separated numeric rows.
pub struct CsvWriter {
/// Buffered handle to the output file, opened in append mode by `new`.
pub(super) writer: BufWriter<File>,
}
impl CsvWriter {
    /// Creates (or truncates) the file at `path`, writes `headers` joined by
    /// `,` as the first line, then reopens the file in append mode for the
    /// rows that follow.
    ///
    /// # Errors
    ///
    /// Propagates any I/O error from creating, writing, flushing or reopening
    /// the file.
    pub fn new(path: &str, headers: &[&str]) -> Result<Self> {
        let target = Path::new(path);
        {
            // Scoped so the truncating handle is closed before the reopen.
            let mut header_out = BufWriter::new(File::create(target)?);
            writeln!(header_out, "{}", headers.join(","))?;
            header_out.flush()?;
        }
        let appender = OpenOptions::new().append(true).open(target)?;
        Ok(Self {
            writer: BufWriter::new(appender),
        })
    }

    /// Appends one row: the values formatted with `to_string`, joined by `,`
    /// and followed by a newline. The buffer is flushed after every row.
    ///
    /// # Errors
    ///
    /// Propagates I/O errors from writing or flushing.
    pub fn write_row(&mut self, values: &[f64]) -> Result<()> {
        let mut line = String::new();
        for (position, value) in values.iter().enumerate() {
            if position > 0 {
                line.push(',');
            }
            line.push_str(&value.to_string());
        }
        writeln!(self.writer, "{}", line)?;
        self.writer.flush()?;
        Ok(())
    }
}
/// One entry of `CsvDiff::changed`.
///
/// NOTE(review): the producer of this struct is not visible here; the field
/// meanings are inferred from their names — confirm against the diff routine.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct CsvChangedRow {
/// Presumably the value that identifies the row in both inputs.
pub key: String,
/// Presumably the row's cells in the first ("before") input.
pub before: Vec<String>,
/// Presumably the row's cells in the second ("after") input.
pub after: Vec<String>,
}
/// CSV writer that accumulates its output in a `String` according to a
/// `CsvWriterConfig`.
#[allow(dead_code)]
pub struct ConfigurableCsvWriter {
/// Delimiter, line ending, quoting and precision settings.
pub(super) config: CsvWriterConfig,
/// Everything written so far; handed back by `finish`.
pub(super) buffer: String,
}
#[allow(dead_code)]
impl ConfigurableCsvWriter {
    /// Creates a writer with an empty output buffer.
    pub fn new(config: CsvWriterConfig) -> Self {
        Self {
            buffer: String::new(),
            config,
        }
    }

    /// Writes the header line. Header cells are formatted exactly like the
    /// cells of a string row, so this delegates to `write_str_row`.
    pub fn write_header(&mut self, headers: &[&str]) {
        self.write_str_row(headers);
    }

    /// Writes one numeric row, formatting each value with the configured
    /// number of decimal places. Values are never quoted, even when
    /// `quote_all` is set.
    pub fn write_f64_row(&mut self, values: &[f64]) {
        let digits = self.config.precision;
        let separator = self.config.delimiter;
        for (position, value) in values.iter().enumerate() {
            if position > 0 {
                self.buffer.push(separator);
            }
            self.buffer.push_str(&format!("{:.*}", digits, value));
        }
        self.buffer.push_str(&self.config.line_ending);
    }

    /// Writes one row of text cells.
    ///
    /// With `quote_all` every cell is wrapped in double quotes and embedded
    /// quotes are doubled; otherwise each cell is passed through
    /// `quote_field`.
    pub fn write_str_row(&mut self, values: &[&str]) {
        let separator = self.config.delimiter;
        let quote_all = self.config.quote_all;
        for (position, value) in values.iter().enumerate() {
            if position > 0 {
                self.buffer.push(separator);
            }
            if quote_all {
                self.buffer.push('"');
                self.buffer.push_str(&value.replace('"', "\"\""));
                self.buffer.push('"');
            } else {
                self.buffer.push_str(&quote_field(value, separator));
            }
        }
        self.buffer.push_str(&self.config.line_ending);
    }

    /// Consumes the writer and returns everything written so far.
    pub fn finish(self) -> String {
        self.buffer
    }
}