#![allow(clippy::should_implement_trait)]
use super::functions::detect_delimiter;
#[allow(unused_imports)]
use super::functions::*;
#[allow(unused_imports)]
use super::functions_2::*;
/// A single data row of a CSV table, stored as raw string fields.
#[allow(dead_code)]
pub struct CsvRecord {
/// Cell values of this row; a column is addressed by its position in this vector.
pub fields: Vec<String>,
}
/// An in-memory CSV table: one header row plus zero or more data records.
///
/// Nothing enforces that a record has as many fields as there are headers;
/// the accessors in this file look cells up with checked indexing instead.
#[allow(dead_code)]
pub struct CsvFile {
/// Column names; their count is what `column_count()` reports.
pub headers: Vec<String>,
/// Data rows, appended in insertion order.
pub records: Vec<CsvRecord>,
}
impl CsvFile {
pub fn new(headers: Vec<String>) -> Self {
CsvFile {
headers,
records: Vec::new(),
}
}
pub fn add_record(&mut self, fields: Vec<String>) {
self.records.push(CsvRecord { fields });
}
pub fn add_record_f64(&mut self, values: &[f64]) {
let fields = values.iter().map(|v| format!("{}", v)).collect();
self.records.push(CsvRecord { fields });
}
pub fn record_count(&self) -> usize {
self.records.len()
}
pub fn column_count(&self) -> usize {
self.headers.len()
}
pub fn get_column_f64(&self, col_idx: usize) -> Result<Vec<f64>, String> {
if col_idx >= self.headers.len() {
return Err(format!("Column index {} out of range", col_idx));
}
let mut out = Vec::with_capacity(self.records.len());
for (row, rec) in self.records.iter().enumerate() {
let s = rec
.fields
.get(col_idx)
.ok_or_else(|| format!("Row {} has no field at column {}", row, col_idx))?;
let v: f64 = s
.trim()
.parse()
.map_err(|e| format!("Row {}, col {}: parse error: {}", row, col_idx, e))?;
out.push(v);
}
Ok(out)
}
pub fn get_column_by_name(&self, name: &str) -> Option<usize> {
self.headers.iter().position(|h| h == name)
}
#[allow(clippy::inherent_to_string)]
pub fn to_string(&self) -> String {
self.to_string_with_delimiter(',')
}
#[allow(dead_code)]
pub fn to_string_with_delimiter(&self, delim: char) -> String {
let mut out = String::new();
let d = delim.to_string();
out.push_str(&self.headers.join(&d));
out.push('\n');
for rec in &self.records {
out.push_str(&rec.fields.join(&d));
out.push('\n');
}
out
}
pub fn from_str(s: &str) -> Result<Self, String> {
Self::from_str_with_delimiter(s, ',')
}
#[allow(dead_code)]
pub fn from_str_with_delimiter(s: &str, delim: char) -> Result<Self, String> {
let mut lines = s.lines();
let header_line = lines.next().ok_or("Empty CSV input")?;
let headers: Vec<String> = header_line
.split(delim)
.map(|f| f.trim().to_string())
.collect();
if headers.is_empty() || headers.iter().all(|h| h.is_empty()) {
return Err("No headers found".to_string());
}
let mut records = Vec::new();
for line in lines {
if line.trim().is_empty() {
continue;
}
let fields: Vec<String> = line.split(delim).map(|f| f.trim().to_string()).collect();
records.push(CsvRecord { fields });
}
Ok(CsvFile { headers, records })
}
pub fn filter_rows(&self, col_idx: usize, pred: impl Fn(f64) -> bool) -> CsvFile {
let mut out = CsvFile::new(self.headers.clone());
for rec in &self.records {
if let Some(s) = rec.fields.get(col_idx)
&& let Ok(v) = s.trim().parse::<f64>()
&& pred(v)
{
out.records.push(CsvRecord {
fields: rec.fields.clone(),
});
}
}
out
}
#[allow(dead_code)]
pub fn infer_column_type(&self, col_idx: usize) -> ColumnType {
if col_idx >= self.headers.len() {
return ColumnType::Text;
}
let mut all_int = true;
let mut all_float = true;
let mut any_value = false;
for rec in &self.records {
if let Some(s) = rec.fields.get(col_idx) {
let s = s.trim();
if s.is_empty() {
continue;
}
any_value = true;
if s.parse::<i64>().is_err() {
all_int = false;
}
if s.parse::<f64>().is_err() {
all_float = false;
}
}
}
if !any_value {
return ColumnType::Text;
}
if all_int {
ColumnType::Integer
} else if all_float {
ColumnType::Float
} else {
ColumnType::Text
}
}
#[allow(dead_code)]
pub fn select_columns(&self, col_indices: &[usize]) -> CsvFile {
let headers: Vec<String> = col_indices
.iter()
.filter_map(|&i| self.headers.get(i).cloned())
.collect();
let mut out = CsvFile::new(headers);
for rec in &self.records {
let fields: Vec<String> = col_indices
.iter()
.map(|&i| rec.fields.get(i).cloned().unwrap_or_default())
.collect();
out.records.push(CsvRecord { fields });
}
out
}
#[allow(dead_code)]
pub fn select_columns_by_name(&self, names: &[&str]) -> CsvFile {
let indices: Vec<usize> = names
.iter()
.filter_map(|n| self.get_column_by_name(n))
.collect();
self.select_columns(&indices)
}
#[allow(dead_code)]
pub fn normalize_headers(&mut self) {
for h in &mut self.headers {
let normalized: String = h
.trim()
.to_lowercase()
.chars()
.map(|c| {
if c.is_alphanumeric() || c == '_' {
c
} else {
'_'
}
})
.collect();
*h = normalized;
}
}
#[allow(dead_code)]
pub fn column_stats(&self, col_idx: usize) -> Option<ColumnStats> {
let values = self.get_column_f64(col_idx).ok()?;
if values.is_empty() {
return None;
}
let mut min = f64::INFINITY;
let mut max = f64::NEG_INFINITY;
let mut sum = 0.0;
for &v in &values {
if v < min {
min = v;
}
if v > max {
max = v;
}
sum += v;
}
let count = values.len();
Some(ColumnStats {
min,
max,
mean: sum / count as f64,
count,
sum,
})
}
#[allow(dead_code)]
pub fn all_column_stats(&self) -> Vec<(String, ColumnStats)> {
let mut result = Vec::new();
for i in 0..self.headers.len() {
if let Some(stats) = self.column_stats(i) {
result.push((self.headers[i].clone(), stats));
}
}
result
}
#[allow(dead_code)]
pub fn get_column_strings(&self, col_idx: usize) -> Result<Vec<String>, String> {
if col_idx >= self.headers.len() {
return Err(format!("Column index {} out of range", col_idx));
}
let mut out = Vec::with_capacity(self.records.len());
for (row, rec) in self.records.iter().enumerate() {
let s = rec
.fields
.get(col_idx)
.ok_or_else(|| format!("Row {} has no field at column {}", row, col_idx))?;
out.push(s.trim().to_string());
}
Ok(out)
}
#[allow(dead_code)]
pub fn get_column_i64(&self, col_idx: usize) -> Result<Vec<i64>, String> {
if col_idx >= self.headers.len() {
return Err(format!("Column index {} out of range", col_idx));
}
let mut out = Vec::with_capacity(self.records.len());
for (row, rec) in self.records.iter().enumerate() {
let s = rec
.fields
.get(col_idx)
.ok_or_else(|| format!("Row {} has no field at column {}", row, col_idx))?;
let v: i64 = s
.trim()
.parse()
.map_err(|e| format!("Row {}, col {}: parse error: {}", row, col_idx, e))?;
out.push(v);
}
Ok(out)
}
#[allow(dead_code)]
pub fn sort_by_column(&mut self, col_idx: usize) {
self.records.sort_by(|a, b| {
let va = a
.fields
.get(col_idx)
.and_then(|s| s.trim().parse::<f64>().ok())
.unwrap_or(f64::NAN);
let vb = b
.fields
.get(col_idx)
.and_then(|s| s.trim().parse::<f64>().ok())
.unwrap_or(f64::NAN);
va.partial_cmp(&vb).unwrap_or(std::cmp::Ordering::Equal)
});
}
}
/// Builds delimiter-separated text line by line, starting with a header line.
#[allow(dead_code)]
pub struct CsvWriter {
/// Column names; their count fixes how many cells `write_row` emits per row.
pub(super) headers: Vec<String>,
/// Character placed between cells on every line.
pub(super) delimiter: char,
/// Lines produced so far: the header line first, then one entry per written row.
pub(super) lines: Vec<String>,
}
impl CsvWriter {
    /// Starts a writer whose first line is `headers` joined by `delimiter`.
    pub fn new(headers: Vec<String>, delimiter: char) -> Self {
        let sep = delimiter.to_string();
        let first_line = headers.join(sep.as_str());
        Self {
            headers,
            delimiter,
            lines: vec![first_line],
        }
    }

    /// Appends one row with exactly as many cells as there are headers.
    ///
    /// Extra entries in `values` are ignored; missing ones are written as
    /// empty cells.
    pub fn write_row(&mut self, values: &[&str]) {
        let sep = self.delimiter.to_string();
        let cells: Vec<&str> = values
            .iter()
            .copied()
            // Pad with empty cells, then cut to the header width.
            .chain(std::iter::repeat(""))
            .take(self.headers.len())
            .collect();
        self.lines.push(cells.join(sep.as_str()));
    }

    /// Appends one row of numbers, each rendered with six decimal places.
    pub fn write_row_f64(&mut self, values: &[f64]) {
        let rendered: Vec<String> = values.iter().map(|v| format!("{v:.6}")).collect();
        let cells: Vec<&str> = rendered.iter().map(|s| s.as_str()).collect();
        self.write_row(&cells);
    }

    /// Consumes the writer and returns all lines joined by `\n`
    /// (no trailing newline is added).
    pub fn finish(self) -> String {
        self.lines.join("\n")
    }

    /// Returns the number of data rows written so far (header excluded).
    pub fn row_count(&self) -> usize {
        // The header line always occupies the first slot.
        self.lines.len().saturating_sub(1)
    }
}
/// Selector for an aggregation operation.
///
/// NOTE(review): nothing in the visible part of this file consumes `AggOp`,
/// so the precise semantics of each variant (for example whether `Std` is a
/// sample or a population standard deviation) must be confirmed against the
/// code that matches on it.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AggOp {
/// Presumably the sum of the values.
Sum,
/// Presumably the arithmetic mean of the values.
Mean,
/// Presumably the smallest value.
Min,
/// Presumably the largest value.
Max,
/// Presumably the standard deviation of the values — TODO confirm which estimator.
Std,
/// Presumably the number of values.
Count,
}
/// A CSV table paired with the name of the column that holds its time values.
#[allow(dead_code)]
pub struct TimeSeriesCsv {
/// The underlying table.
pub csv: CsvFile,
/// Header name of the time column; it is looked up by exact string match.
pub time_column: String,
}
impl TimeSeriesCsv {
    /// Wraps an existing table, remembering `time_column` as the time header.
    ///
    /// The header is not checked for existence here.
    pub fn new(csv: CsvFile, time_column: &str) -> Self {
        let time_column = time_column.to_owned();
        Self { csv, time_column }
    }

    /// Parses `s` with `CsvFile::from_str` and wraps the result.
    ///
    /// # Errors
    ///
    /// Propagates the parse error message unchanged.
    pub fn from_str(s: &str, time_column: &str) -> Result<Self, String> {
        CsvFile::from_str(s).map(|csv| Self::new(csv, time_column))
    }

    /// Returns the time column as `f64` values, or `None` if the header is
    /// absent or the column cannot be read as numbers.
    pub fn times(&self) -> Option<Vec<f64>> {
        self.column_f64(&self.time_column)
    }

    /// Returns the column named `name` as `f64` values, or `None` if the
    /// header is absent or the column cannot be read as numbers.
    pub fn column_f64(&self, name: &str) -> Option<Vec<f64>> {
        self.csv
            .headers
            .iter()
            .position(|h| h == name)
            .and_then(|idx| self.csv.get_column_f64(idx).ok())
    }

    /// Returns the number of records in the underlying table.
    pub fn n_steps(&self) -> usize {
        self.csv.records.len()
    }

    /// Returns the spread (largest minus smallest) of the time values.
    ///
    /// Yields `0.0` when the time column is unavailable or has fewer than
    /// two entries.
    pub fn duration(&self) -> f64 {
        match self.times() {
            Some(ts) if ts.len() >= 2 => {
                // Track both extremes in a single pass.
                let (earliest, latest) = ts.iter().fold(
                    (f64::INFINITY, f64::NEG_INFINITY),
                    |(lo, hi), &t| (lo.min(t), hi.max(t)),
                );
                latest - earliest
            }
            _ => 0.0,
        }
    }
}
/// Expected layout of a CSV table: an ordered list of column names and types.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct CsvSchema {
/// `(column name, expected type)` pairs, one per column, in column order.
pub columns: Vec<(String, ColumnType)>,
}
impl CsvSchema {
    /// Creates a schema from `(name, type)` pairs given in column order.
    pub fn new(columns: Vec<(String, ColumnType)>) -> Self {
        Self { columns }
    }

    /// Returns the number of columns the schema describes.
    pub fn len(&self) -> usize {
        self.columns.len()
    }

    /// Returns `true` when the schema describes no columns.
    pub fn is_empty(&self) -> bool {
        self.columns.is_empty()
    }

    /// Checks `csv` against this schema and returns one message per violation.
    ///
    /// An empty result means the table conforms. If the header count differs
    /// from the schema's column count, only that mismatch is reported.
    /// Otherwise the table is walked column by column; for each column the
    /// header name is compared first, then every record is checked:
    ///
    /// * a record with no cell at the column is reported as missing;
    /// * for `Integer` / `Float` columns the cell must parse as `i64` / `f64`
    ///   after trimming surrounding whitespace (the same rule the typed
    ///   column accessors of `CsvFile` apply);
    /// * `Text` columns accept any cell.
    pub fn validate(&self, csv: &CsvFile) -> Vec<String> {
        let mut errors = Vec::new();
        if csv.headers.len() != self.columns.len() {
            errors.push(format!(
                "column count mismatch: schema has {} columns, file has {}",
                self.columns.len(),
                csv.headers.len()
            ));
            return errors;
        }
        for (col_idx, (name, expected_type)) in self.columns.iter().enumerate() {
            if csv.headers[col_idx] != *name {
                errors.push(format!(
                    "column {} name mismatch: expected '{}', got '{}'",
                    col_idx, name, csv.headers[col_idx]
                ));
            }
            for (row_idx, record) in csv.records.iter().enumerate() {
                let raw = match record.fields.get(col_idx) {
                    Some(raw) => raw,
                    None => {
                        errors.push(format!("row {} column {} missing", row_idx, col_idx));
                        continue;
                    }
                };
                // Trim before parsing so padded numbers are judged the same
                // way here as by the typed accessors.
                let cell = raw.trim();
                let (conforms, type_name) = match expected_type {
                    ColumnType::Integer => (cell.parse::<i64>().is_ok(), "Integer"),
                    ColumnType::Float => (cell.parse::<f64>().is_ok(), "Float"),
                    ColumnType::Text => (true, "Text"),
                };
                if !conforms {
                    // Report the cell exactly as stored to ease debugging.
                    errors.push(format!(
                        "row {} column '{}': expected {}, got '{}'",
                        row_idx, name, type_name, raw
                    ));
                }
            }
        }
        errors
    }
}
/// State for reading CSV text lazily, one line at a time.
///
/// NOTE(review): only the constructor is visible in this part of the file;
/// the code that actually advances `lines` presumably lives elsewhere.
pub struct LazyCsvIter<'a> {
/// Lines of the input that follow the header line.
pub(super) lines: std::str::Lines<'a>,
/// Character that separates cells within a line.
pub(super) delimiter: char,
/// Trimmed header cells taken from the first input line (empty if the input has no lines).
pub headers: Vec<String>,
}
impl<'a> LazyCsvIter<'a> {
    /// Prepares lazy reading of `input`.
    ///
    /// The first line is consumed immediately, split on `delimiter` and
    /// trimmed cell by cell to form `headers`; if `input` has no lines the
    /// header list is empty. The remaining lines are left untouched.
    pub fn new(input: &'a str, delimiter: char) -> Self {
        let mut lines = input.lines();
        let headers = match lines.next() {
            Some(first) => first
                .split(delimiter)
                .map(|cell| cell.trim().to_string())
                .collect(),
            None => Vec::new(),
        };
        Self {
            lines,
            delimiter,
            headers,
        }
    }
}
/// Outcome of a validation pass, expressed as a list of error messages.
#[allow(dead_code)]
#[derive(Debug, Default)]
pub struct CsvValidationReport {
/// Collected error messages; the report counts as valid when this is empty.
pub errors: Vec<String>,
}
impl CsvValidationReport {
    /// Returns `true` when no errors were recorded.
    pub fn is_valid(&self) -> bool {
        self.error_count() == 0
    }

    /// Returns how many errors were recorded.
    pub fn error_count(&self) -> usize {
        let Self { errors } = self;
        errors.len()
    }
}
/// Data type attributed to a CSV column.
#[derive(Debug, Clone, PartialEq)]
#[allow(dead_code)]
pub enum ColumnType {
/// Cells parse as `i64`.
Integer,
/// Cells parse as `f64`.
Float,
/// Arbitrary text; also the fallback when no numeric type fits.
Text,
}
/// The values of one column, stored in a vector of the column's type.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub enum CsvColumnData {
/// A column of `i64` values.
Integer(Vec<i64>),
/// A column of `f64` values.
Float(Vec<f64>),
/// A column of string values.
Text(Vec<String>),
}
#[allow(dead_code)]
impl CsvColumnData {
    /// Returns the number of values stored in the column.
    pub fn len(&self) -> usize {
        match self {
            Self::Integer(values) => values.len(),
            Self::Float(values) => values.len(),
            Self::Text(values) => values.len(),
        }
    }

    /// Returns `true` when the column holds no values.
    pub fn is_empty(&self) -> bool {
        match self {
            Self::Integer(values) => values.is_empty(),
            Self::Float(values) => values.is_empty(),
            Self::Text(values) => values.is_empty(),
        }
    }

    /// Returns the [`ColumnType`] tag that corresponds to the stored variant.
    pub fn column_type(&self) -> ColumnType {
        match self {
            Self::Integer(_) => ColumnType::Integer,
            Self::Float(_) => ColumnType::Float,
            Self::Text(_) => ColumnType::Text,
        }
    }
}
/// A column-oriented view of a CSV table with one typed vector per column.
#[allow(dead_code)]
#[derive(Debug, Clone)]
pub struct CsvDataFrame {
/// Column names; entry `i` names `columns[i]`.
pub column_names: Vec<String>,
/// Column contents, parallel to `column_names`.
pub columns: Vec<CsvColumnData>,
}
#[allow(dead_code)]
impl CsvDataFrame {
    /// Converts a row-oriented [`CsvFile`] into typed columns.
    ///
    /// Each column's type comes from `CsvFile::infer_column_type`. Cells that
    /// are missing or do not parse are filled with `0` in integer columns,
    /// `NaN` in float columns and an empty string in text columns. Headers
    /// that are empty are replaced by `col_<index>`.
    pub fn from_csv(csv: &CsvFile) -> Self {
        let columns: Vec<CsvColumnData> = (0..csv.headers.len())
            .map(|col_idx| {
                // Optional raw cell of every record at this column.
                let cells = csv.records.iter().map(|rec| rec.fields.get(col_idx));
                match csv.infer_column_type(col_idx) {
                    ColumnType::Integer => CsvColumnData::Integer(
                        cells
                            .map(|cell| {
                                cell.and_then(|s| s.trim().parse::<i64>().ok())
                                    .unwrap_or(0)
                            })
                            .collect(),
                    ),
                    ColumnType::Float => CsvColumnData::Float(
                        cells
                            .map(|cell| {
                                cell.and_then(|s| s.trim().parse::<f64>().ok())
                                    .unwrap_or(f64::NAN)
                            })
                            .collect(),
                    ),
                    ColumnType::Text => CsvColumnData::Text(
                        cells
                            .map(|cell| cell.map(|s| s.trim().to_string()).unwrap_or_default())
                            .collect(),
                    ),
                }
            })
            .collect();

        let column_names = csv
            .headers
            .iter()
            .enumerate()
            .map(|(i, name)| {
                if name.is_empty() {
                    format!("col_{}", i)
                } else {
                    name.clone()
                }
            })
            .collect();

        CsvDataFrame {
            column_names,
            columns,
        }
    }

    /// Parses comma-separated text and converts it with [`CsvDataFrame::from_csv`].
    ///
    /// # Errors
    ///
    /// Returns the parser's message prefixed with `line 1: `.
    pub fn from_str(s: &str) -> std::result::Result<Self, String> {
        match CsvFile::from_str(s) {
            Ok(csv) => Ok(Self::from_csv(&csv)),
            Err(e) => Err(format!("line 1: {e}")),
        }
    }

    /// Same as [`CsvDataFrame::from_str`] but with a caller-chosen delimiter.
    ///
    /// # Errors
    ///
    /// Returns the parser's message prefixed with `line 1: `.
    pub fn from_str_with_delimiter(s: &str, delim: char) -> std::result::Result<Self, String> {
        match CsvFile::from_str_with_delimiter(s, delim) {
            Ok(csv) => Ok(Self::from_csv(&csv)),
            Err(e) => Err(format!("line 1: {e}")),
        }
    }

    /// Returns the length of the first column, or `0` if there are no columns.
    pub fn n_rows(&self) -> usize {
        match self.columns.first() {
            Some(first) => first.len(),
            None => 0,
        }
    }

    /// Returns the number of columns.
    pub fn n_cols(&self) -> usize {
        self.columns.len()
    }

    /// Returns the position of the first column called `name`, if any.
    pub fn column_index(&self, name: &str) -> Option<usize> {
        self.column_names
            .iter()
            .position(|candidate| candidate == name)
    }

    /// Returns the column at position `idx`, if it exists.
    pub fn column(&self, idx: usize) -> Option<&CsvColumnData> {
        self.columns.get(idx)
    }

    /// Returns the first column called `name`, if any.
    pub fn column_by_name(&self, name: &str) -> Option<&CsvColumnData> {
        self.column_index(name).and_then(|idx| self.column(idx))
    }

    /// Returns the values of column `name` if it exists and is a float column.
    pub fn float_column(&self, name: &str) -> Option<&Vec<f64>> {
        if let Some(CsvColumnData::Float(values)) = self.column_by_name(name) {
            Some(values)
        } else {
            None
        }
    }

    /// Returns the values of column `name` if it exists and is an integer column.
    pub fn integer_column(&self, name: &str) -> Option<&Vec<i64>> {
        if let Some(CsvColumnData::Integer(values)) = self.column_by_name(name) {
            Some(values)
        } else {
            None
        }
    }

    /// Returns the values of column `name` if it exists and is a text column.
    pub fn text_column(&self, name: &str) -> Option<&Vec<String>> {
        if let Some(CsvColumnData::Text(values)) = self.column_by_name(name) {
            Some(values)
        } else {
            None
        }
    }

    /// Serializes the frame as comma-separated text.
    ///
    /// The header line is followed by `n_rows()` data lines, each ending in
    /// `\n`. A column shorter than `n_rows()` contributes empty cells. No
    /// quoting or escaping is applied.
    pub fn to_csv_string(&self) -> String {
        // Renders one cell, or an empty string when the column is too short.
        let render = |col: &CsvColumnData, row: usize| -> String {
            match col {
                CsvColumnData::Integer(values) => {
                    values.get(row).map(|x| x.to_string()).unwrap_or_default()
                }
                CsvColumnData::Float(values) => {
                    values.get(row).map(|x| x.to_string()).unwrap_or_default()
                }
                CsvColumnData::Text(values) => values.get(row).cloned().unwrap_or_default(),
            }
        };

        let mut out = self.column_names.join(",");
        out.push('\n');
        for row in 0..self.n_rows() {
            let cells: Vec<String> = self.columns.iter().map(|col| render(col, row)).collect();
            out.push_str(&cells.join(","));
            out.push('\n');
        }
        out
    }
}
/// Reads CSV text row by row from a borrowed string.
#[allow(dead_code)]
pub struct StreamingCsvReader<'a> {
/// Character that separates cells within a line.
pub delimiter: char,
/// Trimmed header cells taken from the first input line.
pub headers: Vec<String>,
/// Input lines that have not been read yet (the header line is already consumed).
pub(super) lines: std::str::Lines<'a>,
/// Number of data rows returned by `next_row` so far; blank lines are not counted.
pub(super) row: usize,
}
#[allow(dead_code)]
impl<'a> StreamingCsvReader<'a> {
    /// Creates a reader over `input` using `delimiter`.
    ///
    /// The first line is consumed immediately and becomes `headers` (split
    /// on the delimiter, each cell trimmed). Input without any line yields
    /// an empty header list.
    pub fn new(input: &'a str, delimiter: char) -> Self {
        let mut lines = input.lines();
        let headers = match lines.next() {
            Some(first) => first
                .split(delimiter)
                .map(|cell| cell.trim().to_string())
                .collect(),
            None => Vec::new(),
        };
        Self {
            delimiter,
            headers,
            lines,
            row: 0,
        }
    }

    /// Creates a reader whose delimiter is chosen by `detect_delimiter(input)`.
    pub fn auto(input: &'a str) -> Self {
        Self::new(input, detect_delimiter(input))
    }

    /// Returns the number of header cells.
    pub fn n_cols(&self) -> usize {
        self.headers.len()
    }

    /// Returns how many data rows have been returned so far.
    pub fn current_row(&self) -> usize {
        self.row
    }

    /// Returns the next data row as trimmed cells, or `None` at end of input.
    ///
    /// Lines that are blank after trimming are skipped and do not advance
    /// the row counter.
    pub fn next_row(&mut self) -> Option<Vec<String>> {
        let line = self
            .lines
            .by_ref()
            .find(|candidate| !candidate.trim().is_empty())?;
        self.row += 1;
        let cells = line
            .split(self.delimiter)
            .map(|cell| cell.trim().to_string())
            .collect();
        Some(cells)
    }

    /// Consumes the reader and gathers all remaining rows into a [`CsvFile`]
    /// that carries this reader's headers.
    pub fn collect_all(mut self) -> CsvFile {
        let mut file = CsvFile::new(self.headers.clone());
        loop {
            match self.next_row() {
                Some(fields) => file.add_record(fields),
                None => return file,
            }
        }
    }
}
/// Summary statistics of one numeric column.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct ColumnStats {
/// Smallest value in the column.
pub min: f64,
/// Largest value in the column.
pub max: f64,
/// Arithmetic mean, i.e. `sum / count`.
pub mean: f64,
/// Number of values that were summarized.
pub count: usize,
/// Sum of all values.
pub sum: f64,
}
/// One frame of a trajectory: a title plus a list of 3-component positions.
#[allow(dead_code)]
#[derive(Debug, Clone, Default)]
pub struct TrajectoryFrame {
/// Free-form title of the frame.
pub title: String,
/// One `[f64; 3]` entry per atom (`n_atoms()` reports this vector's length).
/// NOTE(review): presumably x/y/z coordinates; units are not visible here — confirm.
pub positions: Vec<[f64; 3]>,
}
#[allow(dead_code)]
impl TrajectoryFrame {
    /// Creates a frame with an empty title and no positions.
    pub fn new() -> Self {
        Self {
            title: String::new(),
            positions: Vec::new(),
        }
    }

    /// Returns the number of stored positions.
    pub fn n_atoms(&self) -> usize {
        let Self { positions, .. } = self;
        positions.len()
    }
}