pub mod format {
use std::path::Path;
/// Maps the extension of `path` to a canonical format name.
///
/// The extension is lowercased before comparison, so `data.CSV` and
/// `data.csv` are treated alike. Both `xlsx` and `xls` map to `"excel"`.
///
/// Returns `"unknown"` when the path has no extension or the extension is
/// not one of the recognised ones.
pub fn from_extension(path: &str) -> &'static str {
    // (lowercase extension, canonical format name)
    const KNOWN: [(&str, &str); 7] = [
        ("csv", "csv"),
        ("xlsx", "excel"),
        ("xls", "excel"),
        ("ods", "ods"),
        ("parquet", "parquet"),
        ("avro", "avro"),
        ("json", "json"),
    ];

    let lowered = match Path::new(path).extension().and_then(|raw| raw.to_str()) {
        Some(raw) => raw.to_lowercase(),
        // No extension: the empty string matches nothing in the table.
        None => String::new(),
    };

    KNOWN
        .iter()
        .find(|(ext, _)| *ext == lowered)
        .map_or("unknown", |(_, name)| *name)
}
/// Reports whether `format` is one of the canonical format names
/// (`csv`, `excel`, `ods`, `parquet`, `avro`, `json`).
///
/// The comparison is exact and case-sensitive.
pub fn is_supported(format: &str) -> bool {
    const SUPPORTED: [&str; 6] = ["csv", "excel", "ods", "parquet", "avro", "json"];
    SUPPORTED.iter().any(|known| *known == format)
}
}
pub mod validation {
use anyhow::{Result, anyhow};
/// Checks that `col_idx` addresses a column of the first row of `data`.
///
/// Only the first row is consulted for the column count; later rows are not
/// inspected.
///
/// # Errors
///
/// Returns an error when `data` has no rows, when the first row has no
/// columns, or when `col_idx` is not smaller than the first row's length.
pub fn validate_column_index(data: &[Vec<String>], col_idx: usize) -> Result<()> {
    let num_cols = match data.first() {
        Some(first_row) => first_row.len(),
        None => return Err(anyhow!("Data is empty")),
    };
    if num_cols == 0 {
        // Handled separately: the "max" in the message below is `num_cols - 1`,
        // which would underflow for a row without any columns.
        return Err(anyhow!(
            "Column index {} out of range (data has no columns)",
            col_idx
        ));
    }
    if col_idx >= num_cols {
        return Err(anyhow!(
            "Column index {} out of range (max: {})",
            col_idx,
            num_cols - 1
        ));
    }
    Ok(())
}
/// Checks that every row of `data` has as many cells as the first row.
///
/// Empty input is considered consistent.
///
/// # Errors
///
/// Returns an error describing the first row (zero-based index) whose length
/// differs from the length of row 0.
pub fn validate_consistent_columns(data: &[Vec<String>]) -> Result<()> {
    let expected_cols = match data.first() {
        Some(first_row) => first_row.len(),
        None => return Ok(()),
    };

    let mismatch = data
        .iter()
        .enumerate()
        .find(|(_, row)| row.len() != expected_cols);

    match mismatch {
        Some((index, row)) => Err(anyhow!(
            "Row {} has {} columns, expected {}",
            index,
            row.len(),
            expected_cols
        )),
        None => Ok(()),
    }
}
pub fn validate_cell_range(range: &str) -> Result<()> {
let re = regex::Regex::new(r"^[A-Z]+[0-9]+(:[A-Z]+[0-9]+)?$")?;
if !re.is_match(range) {
return Err(anyhow!("Invalid cell range format: {}", range));
}
Ok(())
}
}
pub mod transform {
use super::validation::validate_column_index;
use anyhow::Result;
use rayon::prelude::*;
/// Replaces each cell in column `col_idx` with the result of `transform_fn`.
///
/// The first row is skipped (treated as a header). Rows that are too short to
/// contain `col_idx` are left unchanged. Rows are visited in order.
///
/// # Errors
///
/// Propagates the error from `validate_column_index` when `data` is empty or
/// `col_idx` is out of range for the first row.
pub fn apply_to_column<F>(
    data: &mut [Vec<String>],
    col_idx: usize,
    mut transform_fn: F,
) -> Result<()>
where
    F: FnMut(&str) -> String,
{
    validate_column_index(data, col_idx)?;

    data.iter_mut()
        .skip(1)
        .filter_map(|row| row.get_mut(col_idx))
        .for_each(|cell| {
            let replacement = transform_fn(cell.as_str());
            *cell = replacement;
        });

    Ok(())
}
/// Parallel counterpart of `apply_to_column`: replaces each cell in column
/// `col_idx` with the result of `transform_fn`, using a rayon parallel
/// iterator over the rows.
///
/// The first row is skipped (treated as a header). Rows that are too short to
/// contain `col_idx` are left unchanged. `transform_fn` may be invoked from
/// several threads, hence the `Sync + Send` bound.
///
/// # Errors
///
/// Propagates the error from `validate_column_index` when `data` is empty or
/// `col_idx` is out of range for the first row.
pub fn apply_to_column_parallel<F>(
    data: &mut [Vec<String>],
    col_idx: usize,
    transform_fn: F,
) -> Result<()>
where
    F: Fn(&str) -> String + Sync + Send,
{
    validate_column_index(data, col_idx)?;

    // Peel off the header row so only the body is transformed.
    if let Some((_header, body)) = data.split_first_mut() {
        body.par_iter_mut()
            .filter_map(|row| row.get_mut(col_idx))
            .for_each(|cell| {
                let replacement = transform_fn(cell.as_str());
                *cell = replacement;
            });
    }

    Ok(())
}
/// Returns a copy of `data` containing the first row plus every later row for
/// which `predicate` returns `true`.
///
/// The first row is always kept (treated as a header) and is never passed to
/// `predicate`. Row order is preserved. Empty input yields an empty vector.
pub fn filter_data<F>(data: &[Vec<String>], predicate: F) -> Vec<Vec<String>>
where
    F: Fn(&[String]) -> bool,
{
    match data.split_first() {
        None => Vec::new(),
        Some((header, body)) => std::iter::once(header)
            .chain(body.iter().filter(|row| predicate(row.as_slice())))
            .cloned()
            .collect(),
    }
}
/// Parallel counterpart of `filter_data`: returns a copy of `data` containing
/// the first row plus every later row for which `predicate` returns `true`,
/// evaluating the predicate through a rayon parallel iterator.
///
/// The first row is always kept (treated as a header) and is never passed to
/// `predicate`. Empty input yields an empty vector.
pub fn filter_data_parallel<F>(data: &[Vec<String>], predicate: F) -> Vec<Vec<String>>
where
    F: Fn(&[String]) -> bool + Sync + Send,
{
    let (header, body) = match data.split_first() {
        Some(parts) => parts,
        None => return Vec::new(),
    };

    let survivors: Vec<Vec<String>> = body
        .par_iter()
        .filter(|row| predicate(row.as_slice()))
        .cloned()
        .collect();

    let mut output = Vec::with_capacity(survivors.len() + 1);
    output.push(header.clone());
    output.extend(survivors);
    output
}
/// Sorts the rows of `data` in place by the cell in column `col_idx`, using
/// `compare_fn` to order two cell values.
///
/// The first row is treated as a header and stays in position 0. Rows that
/// are too short to contain `col_idx` are compared as if that cell were the
/// empty string. The sort is stable: rows that compare equal keep their
/// relative order.
///
/// # Errors
///
/// Propagates the error from `validate_column_index` when `data` is empty or
/// `col_idx` is out of range for the first row.
pub fn sort_by_column<F>(
    data: &mut [Vec<String>],
    col_idx: usize,
    mut compare_fn: F,
) -> Result<()>
where
    F: FnMut(&str, &str) -> std::cmp::Ordering,
{
    validate_column_index(data, col_idx)?;

    // Sort the body slice itself. Sorting a separate Vec of `&mut` row
    // references would only permute the references and leave `data` as it was.
    if let Some((_header, rows)) = data.split_first_mut() {
        rows.sort_by(|a, b| {
            let a_val = a.get(col_idx).map_or("", String::as_str);
            let b_val = b.get(col_idx).map_or("", String::as_str);
            compare_fn(a_val, b_val)
        });
    }

    Ok(())
}
pub fn sort_by_column_parallel<F>(
data: &mut [Vec<String>],
col_idx: usize,
compare_fn: F,
) -> Result<()>
where
F: Fn(&str, &str) -> std::cmp::Ordering + Sync + Send,
{
validate_column_index(data, col_idx)?;
if data.len() <= 1 {
return Ok(());
}
let _header = data[0].clone();
let mut data_rows: Vec<&mut Vec<String>> = data.iter_mut().skip(1).collect();
data_rows.par_sort_by(|a, b| {
let a_val = a.get(col_idx).map(|s| s.as_str()).unwrap_or("");
let b_val = b.get(col_idx).map(|s| s.as_str()).unwrap_or("");
compare_fn(a_val, b_val)
});
Ok(())
}
}
pub mod error {
use anyhow::anyhow;
use std::fmt;
/// Builds an [`anyhow::Error`] whose message names the file being processed.
///
/// `error` is rendered through its `Display` implementation into the message
/// `Error processing file '<file>': <error>`.
pub fn with_file_context(error: impl fmt::Display, file: &str) -> anyhow::Error {
    let message = format!("Error processing file '{}': {}", file, error);
    anyhow!(message)
}
pub fn with_cell_context(
error: impl fmt::Display,
file: &str,
row: usize,
col: usize,
) -> anyhow::Error {
anyhow!("Error at {}:{}:{}: {}", file, row + 1, col + 1, error)
}
/// Builds an [`anyhow::Error`] whose message names a column and its file.
///
/// `error` is rendered through its `Display` implementation into the message
/// `Error in column '<column>' of file '<file>': <error>`.
pub fn with_column_context(
    error: impl fmt::Display,
    file: &str,
    column: &str,
) -> anyhow::Error {
    let message = format!("Error in column '{}' of file '{}': {}", column, file, error);
    anyhow!(message)
}
}
pub mod string {
/// Collapses every run of whitespace in `s` into a single space and removes
/// leading and trailing whitespace.
///
/// `split_whitespace` already ignores leading and trailing whitespace, so no
/// separate `trim` is needed.
pub fn normalize_whitespace(s: &str) -> String {
    s.split_whitespace().collect::<Vec<_>>().join(" ")
}
/// Reports whether `s` parses as an `f64`.
///
/// The input is not trimmed, so surrounding whitespace makes the check fail.
/// Anything `str::parse::<f64>` accepts counts as numeric, which includes
/// exponent notation and the special values such as `inf` and `NaN`.
pub fn is_numeric(s: &str) -> bool {
    matches!(s.parse::<f64>(), Ok(_))
}
/// Reports whether `s` is empty or consists solely of whitespace characters.
pub fn is_empty_or_whitespace(s: &str) -> bool {
    // `all` is vacuously true for the empty string.
    s.chars().all(char::is_whitespace)
}
/// Parses `s` as an `f64` after stripping leading and trailing whitespace.
///
/// Returns `None` when the trimmed text is not a valid `f64`.
pub fn to_number(s: &str) -> Option<f64> {
    let trimmed = s.trim();
    trimmed.parse().ok()
}
}
pub mod collection {
/// Returns the distinct elements of `vec`, keeping the first occurrence of
/// each and preserving the order in which they first appear.
pub fn unique_preserve_order<T: Clone + Eq + std::hash::Hash>(vec: &[T]) -> Vec<T> {
    let mut seen = std::collections::HashSet::new();
    vec.iter()
        // `insert` returns false for values already present, dropping repeats.
        .filter(|candidate| seen.insert(*candidate))
        .cloned()
        .collect()
}
/// Splits `data` into consecutive chunks of at most `chunk_size` elements.
///
/// Every chunk except possibly the last holds exactly `chunk_size` elements.
/// A `chunk_size` of zero returns all of `data` as a single chunk. Empty
/// `data` with a non-zero `chunk_size` yields no chunks.
///
/// Elements are moved out of `data` into the chunks; they are not cloned.
pub fn chunk<T: std::clone::Clone>(data: Vec<T>, chunk_size: usize) -> Vec<Vec<T>> {
    if chunk_size == 0 {
        return vec![data];
    }

    // Ceiling division, written so it cannot overflow for a huge `chunk_size`.
    let chunk_count = data.len() / chunk_size + usize::from(data.len() % chunk_size != 0);
    let mut chunks = Vec::with_capacity(chunk_count);

    // `data` is owned, so drain it chunk by chunk instead of copying slices.
    let mut remaining = data.into_iter();
    for _ in 0..chunk_count {
        chunks.push(remaining.by_ref().take(chunk_size).collect());
    }
    chunks
}
/// Concatenates the inner vectors of `nested` into one vector, preserving
/// both the order of the inner vectors and the order of their elements.
pub fn flatten<T>(nested: Vec<Vec<T>>) -> Vec<T> {
    let mut flat = Vec::new();
    for inner in nested {
        flat.extend(inner);
    }
    flat
}
}