use std::path::Path;
use std::fs;
use csv::Reader;
/// Per-column tallies collected while the CSV records are scanned.
struct ColumnProfiler {
    /// How many cells of this column were counted as numeric.
    numeric_count: usize,
    /// How many cells of this column were counted as missing (blank).
    missing_count: usize,
}
/// Running numeric summary of a single column.
///
/// The mean is not stored; callers derive it as `sum / count`.
#[derive(Debug, Clone)]
struct Statistics {
    /// Smallest numeric value seen so far.
    min: f64,
    /// Largest numeric value seen so far.
    max: f64,
    /// Sum of all numeric values seen so far.
    sum: f64,
    /// Number of numeric values folded into this summary.
    count: usize,
}
/// Result of profiling one CSV file, as produced by `csv_analyze`.
///
/// `columns` and `stats_col` are indexed by column position, in the same order as
/// the fields of `header`.
pub struct CsvAnalysis {
    /// Header record of the CSV file (one field per column name).
    header: csv::StringRecord,
    /// Missing / numeric tallies, one entry per column.
    columns: Vec<ColumnProfiler>,
    /// Numeric summary per column; `None` while no numeric value has been recorded.
    stats_col: Vec<Option<Statistics>>,
    /// Number of data records that were scanned.
    total_rows: usize,
}
pub fn csv_analyze(f: &str, report_gen: Option<i32>) -> CsvAnalysis {
let file_name = f;
if !file_name.ends_with(".csv") {
eprintln!("❌ Error: Not a CSV file (must end with .csv)");
std::process::exit(1);
}
if !Path::new(&file_name).exists() {
println!("❌ File not found!");
std::process::exit(1);
}
let mut data = Reader::from_path(&file_name).expect("Error opening csv file");
println!("File: {}", file_name);
let header = data.headers().expect("Error reading").clone();
let header_length = header.len();
let mut stats_col: Vec<Option<Statistics>> = vec![None; header_length];
let mut column: Vec<ColumnProfiler> = header
.iter()
.map(|_h| ColumnProfiler {
numeric_count: 0,
missing_count: 0,
})
.collect();
let mut total_rows = 0;
for record in data.records() {
let re = record.unwrap();
total_rows += 1;
for (i, val) in re.iter().enumerate() {
if val.trim().is_empty() {
column[i].missing_count += 1;
}
if let Ok(x) = val.parse::<f64>() {
column[i].numeric_count += 1;
if stats_col[i].is_none() {
stats_col[i] = Some(Statistics {
min: x,
max: x,
sum: x,
count: 1,
})
} else {
let stats = stats_col[i].as_mut().unwrap();
if x < stats.min {
stats.min = x;
}
if x > stats.max {
stats.max = x;
}
stats.count += 1;
stats.sum += x;
}
}
}
}
for (_i, stats_opt) in stats_col.iter().enumerate() {
if let Some(stats) = stats_opt {
let _mean = stats.sum / stats.count as f64;
}
}
if report_gen == Option::from(1) {
println!(
"{:<30} | {:>7} | {:<11} | {:>14} | {:>14} | {:>14}",
"Column","Missing","Type","Min","Max","Mean"
);
println!("{}", "-".repeat(105));
let mut report = String::new();
report.push_str(&format!("# RustSight Data Analysis Report\n"));
report.push_str(&format!("**Dataset:** `{}`\n\n", file_name));
report.push_str("| Column | Missing | Type | Min | Max | Mean |\n");
report.push_str("|--------|--------|------|-----|-----|------|\n");
for i in 0..header_length {
let col = &column[i];
let col_type = if col.numeric_count > total_rows / 2 {
"numeric"
} else {
"categorical"
};
match &stats_col[i] {
Some(stats) => {
let mean = stats.sum / stats.count as f64;
println!(
"{:<30} | {:>7} | {:<11} | {:>14.2} | {:>14.2} | {:>14.2}",
&header[i],
col.missing_count,
col_type,
stats.min,
stats.max,
mean
);
report.push_str(&format!(
"| {} | {} | {} | {:.2} | {:.2} | {:.2} |\n",
&header[i],
col.missing_count,
col_type,
stats.min,
stats.max,
mean
));
}
None => {
println!(
"{:<30} | {:>7} | {:<11} | {:>14} | {:>14} | {:>14}",
&header[i],
col.missing_count,
col_type,
"N/A",
"N/A",
"N/A"
);
report.push_str(&format!(
"| {} | {} | {} | N/A | N/A | N/A |\n",
&header[i],
col.missing_count,
col_type
));
}
}
}
report.push_str(&format!("\n**Total rows:** {} ", total_rows));
report.push_str(&format!("\n**Columns:** {}\n", header_length));
use std::path::{Path, PathBuf};
let input_path = Path::new(file_name);
let stem = input_path.file_stem().expect("Invalid file name").to_string_lossy();
let output_file: PathBuf = input_path.with_file_name(format!("{}_report.md", stem));
fs::write(&output_file, &report).expect("Error writing report");
println!("\n✔ Report saved → {}", output_file.display());
}
CsvAnalysis {
header: header,
columns: column,
stats_col: stats_col,
total_rows: total_rows,
}
}
/// Prints data-quality warnings for every column of `analysis` to stdout.
///
/// Per column, a warning is printed when:
/// * more than 1% of the rows are missing,
/// * the numeric summary holds more than one value and `min == max` (no variance),
/// * the extreme value on the mean's side of zero is more than ten times the mean
///   (possible outliers),
/// * strictly between 20% and 80% of the rows are numeric (mixed data types).
///
/// When no warning was raised, a single "data is clean" line is printed instead.
pub fn data_validation(analysis: &CsvAnalysis) {
    const MISSING_RATIO_LIMIT: f64 = 0.01;
    const OUTLIER_FACTOR: f64 = 10.0;
    const MIXED_LOWER: f64 = 0.2;
    const MIXED_UPPER: f64 = 0.8;

    let total_rows = analysis.total_rows;
    // With zero rows a plain division would be 0/0 = NaN; report a ratio of 0 instead.
    let ratio_of_rows = |count: usize| -> f64 {
        if total_rows == 0 {
            0.0
        } else {
            count as f64 / total_rows as f64
        }
    };

    let mut warnings = 0;
    let per_column = analysis
        .header
        .iter()
        .zip(&analysis.columns)
        .zip(&analysis.stats_col);

    for ((name, col), stats_opt) in per_column {
        let missing_ratio = ratio_of_rows(col.missing_count);
        if missing_ratio > MISSING_RATIO_LIMIT {
            warnings += 1;
            println!(
                "⚠ Column '{}' has {:.1}% missing values ({} rows)",
                name,
                missing_ratio * 100.0,
                col.missing_count
            );
        }

        if let Some(stats) = stats_opt {
            let mean = stats.sum / stats.count as f64;

            if (stats.max - stats.min) == 0.0 && stats.count > 1 {
                warnings += 1;
                println!("⚠ Column '{}' has no variance", name);
            }

            // `max > mean * 10` alone is always true for a negative mean (max >= mean
            // > 10 * mean), so the comparison is mirrored for negative means and
            // skipped when the mean is zero and gives no scale to compare against.
            let far_above = mean > 0.0 && stats.max > mean * OUTLIER_FACTOR;
            let far_below = mean < 0.0 && stats.min < mean * OUTLIER_FACTOR;
            if far_above || far_below {
                warnings += 1;
                println!("⚠ Column '{}' may contain outliers", name);
            }
        }

        let numeric_ratio = ratio_of_rows(col.numeric_count);
        if numeric_ratio > MIXED_LOWER && numeric_ratio < MIXED_UPPER {
            warnings += 1;
            println!("⚠ Column '{}' has mixed data types", name);
        }
    }

    if warnings == 0 {
        println!("Data is clean you are good to go!\n");
    }
}