use super::{DataFrame, Series};
use crate::dataframe::core::SeriesType;
use csv::{ReaderBuilder, WriterBuilder};
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, BufWriter};
#[derive(Debug)]
pub struct BoolParseError;
impl DataFrame {
pub fn from_csv(path: &str) -> Result<Self, Box<dyn std::error::Error>> {
Self::from_csv_with_options(path, CsvReadOptions::default())
}
pub fn from_csv_with_options(
path: &str,
options: CsvReadOptions,
) -> Result<Self, Box<dyn std::error::Error>> {
let file = File::open(path)?;
let mut rdr = ReaderBuilder::new()
.delimiter(options.delimiter)
.has_headers(options.has_headers)
.from_reader(BufReader::new(file));
let headers = if options.has_headers {
rdr.headers()?.clone()
} else {
csv::StringRecord::from(
(0..rdr.headers()?.len())
.map(|i| format!("column_{}", i))
.collect::<Vec<_>>(),
)
};
let mut raw_data: Vec<Vec<String>> = vec![Vec::new(); headers.len()];
for result in rdr.records() {
let record = result?;
for (i, field) in record.iter().enumerate() {
if i < raw_data.len() {
raw_data[i].push(field.to_string());
}
}
}
let mut column_types = Vec::new();
for col_data in &raw_data {
column_types.push(Self::infer_column_type(col_data));
}
let mut series_data = Vec::new();
for (i, col_data) in raw_data.into_iter().enumerate() {
let series = match column_types[i] {
SeriesType::Int64 => {
let parsed: Result<Vec<i64>, _> =
col_data.iter().map(|s| s.trim().parse::<i64>()).collect();
match parsed {
Ok(values) => Series::Int64(values),
Err(_) => Series::Utf8(col_data), }
}
SeriesType::Float64 => {
let parsed: Result<Vec<f64>, _> =
col_data.iter().map(|s| s.trim().parse::<f64>()).collect();
match parsed {
Ok(values) => Series::Float64(values),
Err(_) => Series::Utf8(col_data), }
}
SeriesType::Bool => {
let parsed: Result<Vec<bool>, _> = col_data
.iter()
.map(|s| Self::parse_bool(s.trim()))
.collect();
match parsed {
Ok(values) => Series::Bool(values),
Err(_) => Series::Utf8(col_data), }
}
SeriesType::Utf8 => Series::Utf8(col_data),
};
series_data.push(series);
}
let column_names: Vec<String> = headers.iter().map(|h| h.to_string()).collect();
Ok(DataFrame::new(
column_names.into_iter().zip(series_data).collect(),
))
}
pub fn to_csv(&self, path: &str) -> Result<(), Box<dyn std::error::Error>> {
self.to_csv_with_options(path, CsvWriteOptions::default())
}
pub fn to_csv_with_options(
&self,
path: &str,
options: CsvWriteOptions,
) -> Result<(), Box<dyn std::error::Error>> {
let file = File::create(path)?;
let mut wtr = WriterBuilder::new()
.delimiter(options.delimiter)
.from_writer(BufWriter::new(file));
if options.write_headers {
wtr.write_record(&self.columns)?;
}
for row_idx in 0..self.len() {
let mut record = Vec::new();
for series in &self.data {
let value = match series {
Series::Int64(v) => v[row_idx].to_string(),
Series::Float64(v) => {
if options.float_precision > 0 {
format!("{:.prec$}", v[row_idx], prec = options.float_precision)
} else {
v[row_idx].to_string()
}
}
Series::Bool(v) => v[row_idx].to_string(),
Series::Utf8(v) => v[row_idx].clone(),
};
record.push(value);
}
wtr.write_record(&record)?;
}
wtr.flush()?;
Ok(())
}
pub fn from_jsonl(path: &str) -> Result<Self, Box<dyn std::error::Error>> {
use std::fs;
let content = fs::read_to_string(path)?;
let mut all_columns: std::collections::HashSet<String> = std::collections::HashSet::new();
let mut records: Vec<HashMap<String, serde_json::Value>> = Vec::new();
for line in content.lines() {
if line.trim().is_empty() {
continue;
}
let record: HashMap<String, serde_json::Value> = serde_json::from_str(line)?;
for key in record.keys() {
all_columns.insert(key.clone());
}
records.push(record);
}
let columns: Vec<String> = all_columns.into_iter().collect();
let mut column_data: HashMap<String, Vec<String>> = HashMap::new();
for col in &columns {
column_data.insert(col.clone(), Vec::new());
}
for record in records {
for col in &columns {
let value = match record.get(col) {
Some(serde_json::Value::String(s)) => s.clone(),
Some(serde_json::Value::Number(n)) => n.to_string(),
Some(serde_json::Value::Bool(b)) => b.to_string(),
Some(serde_json::Value::Null) => "".to_string(),
Some(_) => "".to_string(), None => "".to_string(), };
column_data.get_mut(col).unwrap().push(value);
}
}
let mut series_data = Vec::new();
let mut final_columns = Vec::new();
for col in columns {
let col_values = column_data.remove(&col).unwrap();
let col_type = Self::infer_column_type(&col_values);
let series = match col_type {
SeriesType::Int64 => {
let parsed: Vec<i64> = col_values
.iter()
.map(|s| s.trim().parse::<i64>().unwrap_or(0))
.collect();
Series::Int64(parsed)
}
SeriesType::Float64 => {
let parsed: Vec<f64> = col_values
.iter()
.map(|s| s.trim().parse::<f64>().unwrap_or(0.0))
.collect();
Series::Float64(parsed)
}
SeriesType::Bool => {
let parsed: Vec<bool> = col_values
.iter()
.map(|s| Self::parse_bool(s.trim()).unwrap_or(false))
.collect();
Series::Bool(parsed)
}
SeriesType::Utf8 => Series::Utf8(col_values),
};
final_columns.push(col);
series_data.push(series);
}
Ok(DataFrame::new(
final_columns.into_iter().zip(series_data).collect(),
))
}
pub fn to_jsonl(&self, path: &str) -> Result<(), Box<dyn std::error::Error>> {
use std::fs::File;
use std::io::Write;
let mut file = File::create(path)?;
for row_idx in 0..self.len() {
let mut record = serde_json::Map::new();
for (col_idx, col_name) in self.columns.iter().enumerate() {
let value = match &self.data[col_idx] {
Series::Int64(v) => {
serde_json::Value::Number(serde_json::Number::from(v[row_idx]))
}
Series::Float64(v) => {
if let Some(n) = serde_json::Number::from_f64(v[row_idx]) {
serde_json::Value::Number(n)
} else {
serde_json::Value::Null
}
}
Series::Bool(v) => serde_json::Value::Bool(v[row_idx]),
Series::Utf8(v) => serde_json::Value::String(v[row_idx].clone()),
};
record.insert(col_name.clone(), value);
}
let line = serde_json::to_string(&record)?;
writeln!(file, "{}", line)?;
}
Ok(())
}
pub fn to_json(&self, path: &str) -> Result<(), Box<dyn std::error::Error>> {
use std::fs::File;
use std::io::Write;
let mut records = Vec::new();
for row_idx in 0..self.len() {
let mut record = serde_json::Map::new();
for (col_idx, col_name) in self.columns.iter().enumerate() {
let value = match &self.data[col_idx] {
Series::Int64(v) => {
serde_json::Value::Number(serde_json::Number::from(v[row_idx]))
}
Series::Float64(v) => {
if let Some(n) = serde_json::Number::from_f64(v[row_idx]) {
serde_json::Value::Number(n)
} else {
serde_json::Value::Null
}
}
Series::Bool(v) => serde_json::Value::Bool(v[row_idx]),
Series::Utf8(v) => serde_json::Value::String(v[row_idx].clone()),
};
record.insert(col_name.clone(), value);
}
records.push(serde_json::Value::Object(record));
}
let json_array = serde_json::Value::Array(records);
let mut file = File::create(path)?;
writeln!(file, "{}", serde_json::to_string_pretty(&json_array)?)?;
Ok(())
}
pub fn infer_column_type(data: &[String]) -> SeriesType {
if data.is_empty() {
return SeriesType::Utf8;
}
let mut int_count = 0;
let mut float_count = 0;
let mut bool_count = 0;
let total = data.len();
for value in data {
let trimmed = value.trim();
if trimmed.is_empty() {
continue;
}
if trimmed.parse::<i64>().is_ok() {
int_count += 1;
} else if trimmed.parse::<f64>().is_ok() {
float_count += 1;
} else if Self::parse_bool(trimmed).is_ok() {
bool_count += 1;
}
}
let threshold = (total as f64 * 0.8).ceil() as usize;
if bool_count >= threshold {
SeriesType::Bool
} else if int_count >= threshold {
SeriesType::Int64
} else if (int_count + float_count) >= threshold {
SeriesType::Float64
} else {
SeriesType::Utf8
}
}
pub fn parse_bool(s: &str) -> Result<bool, BoolParseError> {
match s.to_lowercase().as_str() {
"true" | "t" | "yes" | "y" | "1" => Ok(true),
"false" | "f" | "no" | "n" | "0" => Ok(false),
_ => Err(BoolParseError),
}
}
}
#[derive(Debug, Clone)]
pub struct CsvReadOptions {
pub delimiter: u8,
pub has_headers: bool,
}
impl Default for CsvReadOptions {
fn default() -> Self {
CsvReadOptions {
delimiter: b',',
has_headers: true,
}
}
}
#[derive(Debug, Clone)]
pub struct CsvWriteOptions {
pub delimiter: u8,
pub write_headers: bool,
pub float_precision: usize,
}
impl Default for CsvWriteOptions {
fn default() -> Self {
CsvWriteOptions {
delimiter: b',',
write_headers: true,
float_precision: 0, }
}
}