use anyhow::{Context, Result};
use clap::{Parser, Subcommand};
use comfy_table::{Cell, ContentArrangement, Table};
use polars::prelude::*;
use std::path::PathBuf;
use std::fs::File;
use rand::Rng;
{{#if (eq visualization "yes")}}
use plotters::prelude::*;
{{/if}}
// Top-level command-line interface definition; parsed in `main` via `Cli::parse()`.
// NOTE: plain `//` comments are used here on purpose — clap's derive macros
// pick up `///` doc comments as help/about text, which would change the CLI output.
#[derive(Parser)]
#[command(author, version, about, long_about = None)]
struct Cli {
// The subcommand selected by the user; `main` matches on it to dispatch.
#[command(subcommand)]
command: Commands,
}
// Subcommands accepted by the CLI. `main` matches on these variants.
// The double-curly-brace directives in this enum look like project-template
// conditionals keyed on `data_source` and `visualization` (assumption: they are
// resolved when the project is generated — the templating tool is not shown here).
// Plain `//` comments are used because clap turns `///` doc comments into help text.
#[derive(Subcommand)]
enum Commands {
// Load a data file, optionally filter / group / aggregate it, optionally print
// per-column statistics, then print the result and save it next to the input.
Analyze {
// Path of the input data file.
#[arg(short, long)]
file: PathBuf,
// Input format name; the default is chosen by the template from `data_source`
// and falls back to "csv".
#[arg(short = 't', long, default_value = "{{#if (eq data_source "CSV files")}}csv{{else}}{{#if (eq data_source "Parquet files")}}parquet{{else}}{{#if (eq data_source "JSON data")}}json{{else}}csv{{/if}}{{/if}}{{/if}}")]
format: String,
{{#if (eq data_source "JSON data")}}
// JSON layout: `main` treats "lines" as JSON Lines and anything else as plain JSON.
#[arg(long, default_value = "records")]
json_format: String,
{{/if}}
// Column to filter on; `main` only filters when `filter_value` is given as well.
#[arg(short = 'c', long)]
filter_column: Option<String>,
// Value the filter column must equal (compared as a string literal in `main`).
#[arg(short = 'v', long)]
filter_value: Option<String>,
// Column to group by; when absent no aggregation is performed.
#[arg(short = 'g', long)]
group_by: Option<String>,
// Column to aggregate; `main` falls back to the group-by column when absent.
#[arg(short = 'a', long)]
aggregate: Option<String>,
// Aggregation function: `main` recognises "sum", "mean", "min" and "max";
// any other value is treated as a count.
#[arg(short = 'u', long, default_value = "count")]
agg_func: String,
// When set, per-column statistics are printed for numeric columns.
#[arg(short, long)]
stats: bool,
// Confidence level (as a fraction) used for the interval printed with the statistics.
#[arg(long, default_value = "0.95")]
confidence: f64,
},
{{#if (eq visualization "yes")}}
// Render a histogram image for one numeric column of a data file.
Visualize {
// Path of the input data file.
#[arg(short, long)]
file: PathBuf,
// Input format name; same template-selected default as in `Analyze`.
#[arg(short = 't', long, default_value = "{{#if (eq data_source "CSV files")}}csv{{else}}{{#if (eq data_source "Parquet files")}}parquet{{else}}{{#if (eq data_source "JSON data")}}json{{else}}csv{{/if}}{{/if}}{{/if}}")]
format: String,
{{#if (eq data_source "JSON data")}}
// JSON layout: "lines" selects JSON Lines, anything else plain JSON.
#[arg(long, default_value = "records")]
json_format: String,
{{/if}}
// Name of the column to plot; `main` requires it to exist and be numeric.
#[arg(short = 'c', long)]
column: String,
// Destination image path; when absent `main` derives one from the column name
// in the input file's directory.
#[arg(short, long)]
output: Option<PathBuf>,
},
{{/if}}
// Write a synthetic sample dataset to disk.
Generate {
// Number of rows to generate.
#[arg(short, long, default_value_t = 100)]
rows: usize,
// Destination file; `main` creates missing parent directories first.
#[arg(short, long)]
output: PathBuf,
// Output format name; same template-selected default as in `Analyze`.
#[arg(short = 't', long, default_value = "{{#if (eq data_source "CSV files")}}csv{{else}}{{#if (eq data_source "Parquet files")}}parquet{{else}}{{#if (eq data_source "JSON data")}}json{{else}}csv{{/if}}{{/if}}{{/if}}")]
format: String,
},
}
fn main() -> Result<()> {
let cli = Cli::parse();
match &cli.command {
Commands::Analyze {
file,
format,
{{#if (eq data_source "JSON data")}}
json_format,
{{/if}}
filter_column,
filter_value,
group_by,
aggregate,
agg_func,
stats,
confidence,
} => {
println!("📊 Loading data from {}: {}", format, file.display());
let df = match format.to_lowercase().as_str() {
{{#if (eq data_source "JSON data")}}
"json" => {
println!("Using JSON format: {}", json_format);
let json_fmt = match json_format.to_lowercase().as_str() {
"lines" => JsonFormat::JsonLines,
_ => JsonFormat::Json, };
let file = File::open(file)
.with_context(|| format!("Failed to open JSON file: {}", file.display()))?;
JsonReader::new(file)
.with_json_format(json_fmt)
.finish()
.with_context(|| format!("Failed to read JSON file"))?
},
{{/if}}
{{#if (eq data_source "Parquet files")}}
"parquet" => {
let file = File::open(file)
.with_context(|| format!("Failed to open Parquet file: {}", file.display()))?;
ParquetReader::new(file)
.finish()
.with_context(|| format!("Failed to read Parquet file"))?
},
{{/if}}
_ => {
let file = File::open(file)
.with_context(|| format!("Failed to open CSV file: {}", file.display()))?;
CsvReader::new(file)
.finish()
.with_context(|| "Failed to parse CSV data")?
}
};
println!("\n📋 Data Overview:");
println!("Rows: {}", df.height());
println!("Columns: {}", df.width());
println!("Column names: {:?}", df.get_column_names());
let filtered_df = if let (Some(col_name), Some(val)) = (filter_column, filter_value) {
println!("\n🔍 Filtering where {} = {}", col_name, val);
let filter_expr = col(col_name).eq(lit(val.as_str()));
df.lazy().filter(filter_expr).collect()?
} else {
df.clone()
};
let result_df = if let Some(group_col) = group_by {
println!("\n📊 Grouping by: {}", group_col);
let agg_col = aggregate.as_ref().unwrap_or(&group_col);
println!("📊 Aggregating: {} using {}", agg_col, agg_func);
let gb = filtered_df.lazy().group_by([col(group_col)]);
match agg_func.as_str() {
"sum" => gb.agg([col(agg_col).sum()]).collect()?,
"mean" => gb.agg([col(agg_col).mean()]).collect()?,
"min" => gb.agg([col(agg_col).min()]).collect()?,
"max" => gb.agg([col(agg_col).max()]).collect()?,
_ => gb.agg([col(agg_col).count()]).collect()?,
}
} else {
filtered_df
};
let options = SortMultipleOptions {
descending: vec![false],
nulls_last: vec![true],
maintain_order: false,
multithreaded: true,
limit: None,
};
let result_df = if let Some(group_col) = group_by {
result_df.sort([group_col.to_string()], options)?
} else {
result_df
};
if *stats {
println!("\n📈 Statistical Analysis (Confidence Level: {}%):", confidence * 100.0);
for col_name in result_df.get_column_names() {
if let Ok(col) = result_df.column(col_name) {
if !matches!(col.dtype(),
DataType::Int8 | DataType::Int16 | DataType::Int32 |
DataType::Int64 | DataType::UInt8 | DataType::UInt16 |
DataType::UInt32 | DataType::UInt64 | DataType::Float32 |
DataType::Float64) {
continue;
}
println!("\nColumn: {}", col_name);
let series = match col {
Column::Series(s) => s,
_ => continue,
};
let count = series.len() as f64;
println!(" Count: {}", count);
if let Ok(Some(min_val)) = series.min::<f64>() {
println!(" Min: {}", min_val);
}
if let Ok(Some(max_val)) = series.max::<f64>() {
println!(" Max: {}", max_val);
}
let mean = if let Some(mean_val) = series.mean() {
println!(" Mean: {:.4}", mean_val);
mean_val
} else {
0.0
};
let std_dev = if let Some(std_val) = series.std(1) {
println!(" Std Dev: {:.4}", std_val);
std_val
} else {
0.0
};
let z_score = match *confidence {
0.90 => 1.645,
0.99 => 2.576,
_ => 1.96, };
let margin_error = z_score * std_dev / count.sqrt();
println!(" {}% Confidence Interval: {:.4} ± {:.4}",
confidence * 100.0, mean, margin_error);
{{#if (eq visualization "yes")}}
if let Some(numeric_col) = series.cast(&DataType::Float64).ok() {
if let Ok(values) = numeric_col.f64() {
let output_path = file.with_file_name(format!(
"{}_histogram_{}.png",
file.file_stem().unwrap().to_string_lossy(),
col_name
));
if let Err(e) = plot_histogram(&values, col_name, &output_path) {
println!(" Warning: Could not generate histogram: {}", e);
} else {
println!(" Histogram saved to: {}", output_path.display());
}
}
}
{{/if}}
}
}
}
print_dataframe(&result_df)?;
let output_path = file.with_file_name(format!(
"{}_analyzed.{}",
file.file_stem().unwrap().to_string_lossy(),
match format.to_lowercase().as_str() {
{{#if (eq data_source "JSON data")}}
"json" => "json",
{{/if}}
{{#if (eq data_source "Parquet files")}}
"parquet" => "parquet",
{{/if}}
_ => "csv",
}
));
let mut output_file = File::create(&output_path)?;
let mut result_df_mut = result_df.clone();
match format.to_lowercase().as_str() {
{{#if (eq data_source "JSON data")}}
"json" => {
let json_fmt = match json_format.to_lowercase().as_str() {
"lines" => JsonFormat::JsonLines,
_ => JsonFormat::Json,
};
let mut json_writer = JsonWriter::new(&mut output_file)
.with_json_format(json_fmt);
json_writer.finish(&mut result_df_mut)?;
},
{{/if}}
{{#if (eq data_source "Parquet files")}}
"parquet" => {
let mut parquet_writer = ParquetWriter::new(&mut output_file);
parquet_writer.finish(&mut result_df_mut)?;
},
{{/if}}
_ => {
let mut csv_writer = CsvWriter::new(&mut output_file);
csv_writer.finish(&mut result_df_mut)?;
}
}
println!("\n💾 Results saved to: {}", output_path.display());
}
{{#if (eq visualization "yes")}}
Commands::Visualize { file, format, {{#if (eq data_source "JSON data")}} json_format, {{/if}} column, output } => {
println!("📊 Loading data from {}: {}", format, file.display());
let df = match format.to_lowercase().as_str() {
{{#if (eq data_source "JSON data")}}
"json" => {
println!("Using JSON format: {}", json_format);
let json_fmt = match json_format.to_lowercase().as_str() {
"lines" => JsonFormat::JsonLines,
_ => JsonFormat::Json, };
let file = File::open(file)
.with_context(|| format!("Failed to open JSON file: {}", file.display()))?;
JsonReader::new(file)
.with_json_format(json_fmt)
.finish()
.with_context(|| format!("Failed to read JSON file"))?
},
{{/if}}
{{#if (eq data_source "Parquet files")}}
"parquet" => {
let file = File::open(file)
.with_context(|| format!("Failed to open Parquet file: {}", file.display()))?;
ParquetReader::new(file)
.finish()
.with_context(|| format!("Failed to read Parquet file"))?
},
{{/if}}
_ => {
let file = File::open(file)
.with_context(|| format!("Failed to open CSV file: {}", file.display()))?;
CsvReader::new(file)
.finish()
.with_context(|| "Failed to parse CSV data")?
}
};
let column_exists = df.get_column_names().iter().any(|c| c.as_str() == column);
if !column_exists {
println!("Error: Column '{}' does not exist in the data", column);
return Ok(());
}
let col = df.column(column).unwrap();
if !matches!(col.dtype(),
DataType::Int8 | DataType::Int16 | DataType::Int32 |
DataType::Int64 | DataType::UInt8 | DataType::UInt16 |
DataType::UInt32 | DataType::UInt64 | DataType::Float32 |
DataType::Float64) {
println!("Error: Column '{}' is not numeric", column);
return Ok(());
}
let output_path = output.clone().unwrap_or_else(|| {
file.with_file_name(format!("{}_histogram.png", column))
});
if let Some(numeric_col) = col.cast(&DataType::Float64).ok() {
if let Ok(values) = numeric_col.f64() {
if let Err(e) = plot_histogram(&values, column, &output_path) {
println!("Error: Could not generate histogram: {}", e);
} else {
println!("Histogram saved to: {}", output_path.display());
}
}
}
},
{{/if}}
Commands::Generate { rows, output, format } => {
println!("🔄 Generating sample dataset with {} rows", rows);
let mut rng = rand::thread_rng();
let mut df = df! [
"id" => (0..*rows as i32).collect::<Vec<_>>(),
"name" => (0..*rows).map(|i| format!("Person_{}", i)).collect::<Vec<_>>(),
"age" => (0..*rows).map(|_| rng.gen_range(20..100)).collect::<Vec<_>>(),
"salary" => (0..*rows).map(|_| (rng.gen::<f32>() * 100000.0).round()).collect::<Vec<_>>(),
"department" => (0..*rows).map(|i| {
match i % 5 {
0 => "Engineering",
1 => "Marketing",
2 => "Sales",
3 => "HR",
_ => "Finance",
}.to_string()
}).collect::<Vec<_>>(),
"date" => (0..*rows).map(|i| {
let days = i as i64 % 365;
chrono::NaiveDate::from_ymd_opt(2023, 1, 1)
.unwrap()
.checked_add_days(chrono::Days::new(days as u64))
.unwrap()
.format("%Y-%m-%d")
.to_string()
}).collect::<Vec<_>>()
]?;
if let Some(parent) = output.parent() {
std::fs::create_dir_all(parent)?;
}
let mut file = File::create(output)?;
match format.to_lowercase().as_str() {
{{#if (eq data_source "JSON data")}}
"json" => {
let mut json_writer = JsonWriter::new(&mut file)
.with_json_format(JsonFormat::Json);
json_writer.finish(&mut df)?;
},
{{/if}}
{{#if (eq data_source "Parquet files")}}
"parquet" => {
let mut parquet_writer = ParquetWriter::new(&mut file);
parquet_writer.finish(&mut df)?;
},
{{/if}}
_ => {
let mut csv_writer = CsvWriter::new(&mut file);
csv_writer.finish(&mut df)?;
}
}
println!("✅ Sample data saved to: {}", output.display());
println!("\nTry analyzing it with:");
println!("cargo run -- analyze -f {}", output.display());
}
}
Ok(())
}
{{#if (eq visualization "yes")}}
/// Draws a 10-bin histogram of `values` and writes it as an 800x600 bitmap image.
///
/// * `values`      - the data to bin; null and non-finite entries are skipped.
/// * `column_name` - used in the chart caption.
/// * `output_path` - where the image is written.
///
/// Bins are equal-width between the minimum and maximum of `values`. The maximum
/// itself is counted in the last bin rather than being dropped.
///
/// # Errors
/// Returns an error when the drawing backend fails to render or save the chart.
fn plot_histogram(values: &ChunkedArray<Float64Type>, column_name: &str, output_path: &PathBuf) -> Result<()> {
    const BIN_COUNT: usize = 10;

    let root = BitMapBackend::new(output_path, (800, 600)).into_drawing_area();
    root.fill(&WHITE)?;

    let min_val = values.min().unwrap_or(0.0);
    let max_val = values.max().unwrap_or(100.0);
    let bin_width = (max_val - min_val) / BIN_COUNT as f64;

    let bin_labels: Vec<String> = (0..BIN_COUNT)
        .map(|i| {
            let lower = min_val + i as f64 * bin_width;
            let upper = min_val + (i + 1) as f64 * bin_width;
            format!("{:.1}-{:.1}", lower, upper)
        })
        .collect();

    let mut bins = vec![0u32; BIN_COUNT];
    for val in values.into_iter().flatten() {
        if !val.is_finite() {
            continue;
        }
        let bin_idx = if bin_width > 0.0 {
            // The maximum value computes to index BIN_COUNT; clamp it into the
            // last bin so it is counted instead of silently discarded.
            (((val - min_val) / bin_width).floor() as usize).min(BIN_COUNT - 1)
        } else {
            // All values are identical (or the range is undefined): single bin.
            0
        };
        bins[bin_idx] += 1;
    }

    // Keep the y-axis range non-empty even when nothing was counted.
    let max_count = bins.iter().copied().max().unwrap_or(0).max(1) as f64;

    let mut chart = ChartBuilder::on(&root)
        .caption(format!("Histogram of {}", column_name), ("sans-serif", 22))
        .margin(10)
        .x_label_area_size(40)
        .y_label_area_size(60)
        .build_cartesian_2d(
            0..BIN_COUNT,
            0.0..max_count * 1.1,
        )?;

    chart.configure_mesh()
        .x_labels(BIN_COUNT)
        .x_label_formatter(&|x| bin_labels.get(*x).cloned().unwrap_or_default())
        .x_desc("Value")
        .y_desc("Count")
        .draw()?;

    chart.draw_series(
        Histogram::vertical(&chart)
            .style(BLUE.filled())
            .margin(0)
            .data(bins.iter().enumerate().map(|(i, c)| (i, *c as f64))),
    )?;

    root.present()?;
    Ok(())
}
{{/if}}
fn print_dataframe(df: &DataFrame) -> Result<()> {
let mut table = Table::new();
table.set_content_arrangement(ContentArrangement::Dynamic);
let headers: Vec<Cell> = df.get_column_names()
.iter()
.map(|name| Cell::new(name))
.collect();
table.set_header(headers);
let max_rows = std::cmp::min(df.height(), 20);
for row_idx in 0..max_rows {
let row_cells: Vec<Cell> = df.get_column_names()
.iter()
.map(|col_name| {
let col = df.column(col_name).unwrap();
let cell_value = match col.dtype() {
DataType::Float32 | DataType::Float64 => {
format!("{:.2}", col.get(row_idx).unwrap())
},
_ => format!("{}", col.get(row_idx).unwrap()),
};
Cell::new(cell_value)
})
.collect();
table.add_row(row_cells);
}
println!("\n📊 Results:");
println!("{table}");
if df.height() > max_rows {
println!("(Showing {} of {} rows)", max_rows, df.height());
}
Ok(())
}