use std::collections::HashMap;
use std::fs::File;
use std::io::BufReader;
use std::path::Path;
#[cfg(feature = "excel")]
use calamine::{open_workbook, Data, Range, Reader, Xlsx};
#[cfg(feature = "excel")]
use simple_excel_writer::{Sheet, Workbook};
#[derive(Debug, Clone)]
pub struct ExcelCell {
pub value: String,
pub formula: Option<String>,
pub data_type: String,
pub format: ExcelCellFormat,
}
#[derive(Debug, Clone)]
pub struct ExcelCellFormat {
pub font_bold: bool,
pub font_italic: bool,
pub font_color: Option<String>,
pub background_color: Option<String>,
pub number_format: Option<String>,
}
impl Default for ExcelCellFormat {
fn default() -> Self {
Self {
font_bold: false,
font_italic: false,
font_color: None,
background_color: None,
number_format: None,
}
}
}
#[derive(Debug, Clone)]
pub struct NamedRange {
pub name: String,
pub sheet_name: String,
pub range: String,
pub comment: Option<String>,
}
#[derive(Debug, Clone)]
pub struct ExcelReadOptions {
pub preserve_formulas: bool,
pub include_formatting: bool,
pub read_named_ranges: bool,
pub use_memory_map: bool,
pub optimize_memory: bool,
}
impl Default for ExcelReadOptions {
fn default() -> Self {
Self {
preserve_formulas: false,
include_formatting: false,
read_named_ranges: false,
use_memory_map: true,
optimize_memory: true,
}
}
}
#[derive(Debug, Clone)]
pub struct ExcelWriteOptions {
pub preserve_formulas: bool,
pub apply_formatting: bool,
pub write_named_ranges: bool,
pub protect_sheets: bool,
pub optimize_large_files: bool,
}
impl Default for ExcelWriteOptions {
fn default() -> Self {
Self {
preserve_formulas: false,
apply_formatting: false,
write_named_ranges: false,
protect_sheets: false,
optimize_large_files: false,
}
}
}
#[derive(Debug, Clone)]
pub struct ExcelWorkbookInfo {
pub sheet_names: Vec<String>,
pub sheet_count: usize,
pub total_cells: usize,
}
#[derive(Debug, Clone)]
pub struct ExcelSheetInfo {
pub name: String,
pub rows: usize,
pub columns: usize,
pub range: String,
}
use crate::column::{BooleanColumn, Column, Float64Column, Int64Column, StringColumn};
use crate::core::error::OptionExt;
use crate::dataframe::DataFrame;
use crate::error::{Error, Result};
use crate::index::Index;
use crate::optimized::OptimizedDataFrame;
use crate::series::Series;
#[cfg(feature = "excel")]
#[cfg(feature = "excel")]
pub fn read_excel<P: AsRef<Path>>(
path: P,
sheet_name: Option<&str>,
header: bool,
skip_rows: usize,
use_cols: Option<&[&str]>,
) -> Result<DataFrame> {
let mut workbook: Xlsx<BufReader<File>> = open_workbook(path.as_ref())
.map_err(|e| Error::IoError(format!("Could not open Excel file: {}", e)))?;
let sheet_name = match sheet_name {
Some(name) => name.to_string(),
None => workbook
.sheet_names()
.get(0)
.ok_or_else(|| Error::IoError("Excel file has no sheets".to_string()))?
.clone(),
};
let range = workbook
.worksheet_range(&sheet_name)
.map_err(|e| Error::IoError(format!("Could not read sheet '{}': {}", sheet_name, e)))?;
let mut column_names: Vec<String> = Vec::new();
if header && !range.is_empty() && skip_rows < range.rows().len() {
let header_row = range.rows().nth(skip_rows).ok_or_else(|| {
Error::InvalidInput(format!(
"Excel range does not have row at index {}",
skip_rows
))
})?;
for cell in header_row {
column_names.push(cell.to_string());
}
} else {
if !range.is_empty() {
let first_row = range.rows().next().ok_or_else(|| {
Error::InvalidInput("Excel range should have at least one row".to_string())
})?;
for i in 0..first_row.len() {
column_names.push(format!("Column{}", i + 1));
}
}
}
let use_cols_indices = if let Some(cols) = use_cols {
let mut indices = Vec::new();
for col_name in cols {
if let Some(pos) = column_names.iter().position(|name| name == col_name) {
indices.push(pos);
}
}
Some(indices)
} else {
None
};
let mut df = DataFrame::new();
let mut column_data: HashMap<usize, Vec<String>> = HashMap::new();
let start_row = if header { skip_rows + 1 } else { skip_rows };
for (row_idx, row) in range.rows().enumerate().skip(start_row) {
for (col_idx, cell) in row.iter().enumerate() {
if let Some(ref indices) = use_cols_indices {
if !indices.contains(&col_idx) {
continue;
}
}
column_data
.entry(col_idx)
.or_insert_with(Vec::new)
.push(cell.to_string());
}
}
for col_idx in 0..column_names.len() {
if let Some(ref indices) = use_cols_indices {
if !indices.contains(&col_idx) {
continue;
}
}
let col_name = column_names
.get(col_idx)
.unwrap_or(&format!("Column{}", col_idx + 1))
.clone();
let data = column_data.get(&col_idx).cloned().unwrap_or_default();
if data.is_empty() {
continue;
}
if let Some(series) = infer_series_from_strings(&col_name, &data)? {
df.add_column(col_name.clone(), series)?;
}
}
Ok(df)
}
fn infer_series_from_strings(name: &str, data: &[String]) -> Result<Option<Series<String>>> {
if data.is_empty() {
return Ok(None);
}
let all_integers = data
.iter()
.all(|s| s.trim().parse::<i64>().is_ok() || s.trim().is_empty());
if all_integers {
let values: Vec<i64> = data
.iter()
.map(|s| s.trim().parse::<i64>().unwrap_or(0))
.collect();
let series = Series::new(values, Some(name.to_string()))?;
let string_series = series.to_string_series()?;
return Ok(Some(string_series));
}
let all_floats = data
.iter()
.all(|s| s.trim().parse::<f64>().is_ok() || s.trim().is_empty());
if all_floats {
let values: Vec<f64> = data
.iter()
.map(|s| s.trim().parse::<f64>().unwrap_or(0.0))
.collect();
let series = Series::new(values, Some(name.to_string()))?;
let string_series = series.to_string_series()?;
return Ok(Some(string_series));
}
let all_booleans = data.iter().all(|s| {
let s = s.trim().to_lowercase();
s == "true" || s == "false" || s == "1" || s == "0" || s.is_empty()
});
if all_booleans {
let values: Vec<bool> = data
.iter()
.map(|s| {
let s = s.trim().to_lowercase();
s == "true" || s == "1"
})
.collect();
let series = Series::new(values, Some(name.to_string()))?;
let string_series = series.to_string_series()?;
return Ok(Some(string_series));
}
Ok(Some(Series::new(data.to_vec(), Some(name.to_string()))?))
}
#[cfg(feature = "excel")]
pub fn write_excel<P: AsRef<Path>>(
df: &OptimizedDataFrame,
path: P,
sheet_name: Option<&str>,
index: bool,
) -> Result<()> {
let mut workbook = Workbook::create(
path.as_ref()
.to_str()
.ok_or_else(|| Error::IoError("Could not convert file path to string".to_string()))?,
);
let sheet_name = sheet_name.unwrap_or("Sheet1");
let mut sheet = workbook.create_sheet(sheet_name);
let mut headers = Vec::new();
if index {
headers.push("Index".to_string());
}
for col_name in df.column_names() {
headers.push(col_name.clone());
}
workbook.write_sheet(&mut sheet, |sheet_writer| {
if !headers.is_empty() {
let header_row: Vec<&str> = headers.iter().map(|s| s.as_str()).collect();
let row = simple_excel_writer::Row::from_iter(header_row.iter().cloned());
sheet_writer.append_row(row)?;
}
for row_idx in 0..df.row_count() {
let mut row_values = Vec::new();
if index {
if false {
row_values.push(row_idx.to_string());
} else {
row_values.push(row_idx.to_string());
}
}
for col_name in df.column_names() {
if let Ok(column) = df.column(col_name) {
row_values.push(row_idx.to_string());
}
}
let row_str_refs: Vec<&str> = row_values.iter().map(|s| s.as_str()).collect();
let row = simple_excel_writer::Row::from_iter(row_str_refs.iter().cloned());
sheet_writer.append_row(row)?;
}
Ok(())
})?;
workbook
.close()
.map_err(|e| Error::IoError(format!("Could not save Excel file: {}", e)))?;
Ok(())
}
#[cfg(feature = "excel")]
pub fn list_sheet_names<P: AsRef<Path>>(path: P) -> Result<Vec<String>> {
let workbook: Xlsx<BufReader<File>> = open_workbook(path.as_ref())
.map_err(|e| Error::IoError(format!("Could not open Excel file: {}", e)))?;
Ok(workbook.sheet_names().clone())
}
#[cfg(feature = "excel")]
pub fn get_workbook_info<P: AsRef<Path>>(path: P) -> Result<ExcelWorkbookInfo> {
let mut workbook: Xlsx<BufReader<File>> = open_workbook(path.as_ref())
.map_err(|e| Error::IoError(format!("Could not open Excel file: {}", e)))?;
let sheet_names = workbook.sheet_names().clone();
let sheet_count = sheet_names.len();
let mut total_cells = 0;
for sheet_name in &sheet_names {
if let Ok(range) = workbook.worksheet_range(sheet_name) {
total_cells += range.get_size().0 * range.get_size().1;
}
}
Ok(ExcelWorkbookInfo {
sheet_names,
sheet_count,
total_cells,
})
}
#[cfg(feature = "excel")]
pub fn get_sheet_info<P: AsRef<Path>>(path: P, sheet_name: &str) -> Result<ExcelSheetInfo> {
let mut workbook: Xlsx<BufReader<File>> = open_workbook(path.as_ref())
.map_err(|e| Error::IoError(format!("Could not open Excel file: {}", e)))?;
let range = workbook
.worksheet_range(sheet_name)
.map_err(|e| Error::IoError(format!("Could not read sheet '{}': {}", sheet_name, e)))?;
let (rows, cols) = range.get_size();
let range_str = format!(
"A1:{}{}",
std::char::from_u32((b'A' as u32) + (cols as u32) - 1).unwrap_or('Z'),
rows
);
Ok(ExcelSheetInfo {
name: sheet_name.to_string(),
rows,
columns: cols,
range: range_str,
})
}
#[cfg(feature = "excel")]
pub fn read_excel_sheets<P: AsRef<Path>>(
path: P,
sheet_names: Option<&[&str]>,
header: bool,
skip_rows: usize,
use_cols: Option<&[&str]>,
) -> Result<HashMap<String, DataFrame>> {
let workbook: Xlsx<BufReader<File>> = open_workbook(path.as_ref())
.map_err(|e| Error::IoError(format!("Could not open Excel file: {}", e)))?;
let available_sheets = workbook.sheet_names().clone();
let sheets_to_read = if let Some(names) = sheet_names {
for &name in names {
if !available_sheets.contains(&name.to_string()) {
return Err(Error::IoError(format!(
"Sheet '{}' not found. Available sheets: {:?}",
name, available_sheets
)));
}
}
names.iter().map(|&s| s.to_string()).collect()
} else {
available_sheets
};
let mut result = HashMap::new();
for sheet_name in sheets_to_read {
let df = read_excel(
path.as_ref(),
Some(&sheet_name),
header,
skip_rows,
use_cols,
)?;
result.insert(sheet_name, df);
}
Ok(result)
}
#[cfg(feature = "excel")]
pub fn read_excel_with_info<P: AsRef<Path>>(
path: P,
sheet_name: Option<&str>,
header: bool,
skip_rows: usize,
use_cols: Option<&[&str]>,
) -> Result<(DataFrame, ExcelWorkbookInfo)> {
let df = read_excel(path.as_ref(), sheet_name, header, skip_rows, use_cols)?;
let info = get_workbook_info(path.as_ref())?;
Ok((df, info))
}
#[cfg(feature = "excel")]
pub fn write_excel_sheets<P: AsRef<Path>>(
sheets: &HashMap<String, &OptimizedDataFrame>,
path: P,
index: bool,
) -> Result<()> {
let mut workbook = Workbook::create(
path.as_ref()
.to_str()
.ok_or_else(|| Error::IoError("Could not convert file path to string".to_string()))?,
);
for (sheet_name, df) in sheets {
if sheet_name.is_empty() || sheet_name.len() > 31 {
return Err(Error::IoError(format!(
"Invalid sheet name '{}': must be 1-31 characters",
sheet_name
)));
}
let mut sheet = workbook.create_sheet(sheet_name);
let mut headers = Vec::new();
if index {
headers.push("Index".to_string());
}
for col_name in df.column_names() {
headers.push(col_name.clone());
}
workbook.write_sheet(&mut sheet, |sheet_writer| {
if !headers.is_empty() {
let header_row: Vec<&str> = headers.iter().map(|s| s.as_str()).collect();
let row = simple_excel_writer::Row::from_iter(header_row.iter().cloned());
sheet_writer.append_row(row)?;
}
for row_idx in 0..df.row_count() {
let mut row_values = Vec::new();
if index {
row_values.push(row_idx.to_string());
}
for col_name in df.column_names() {
if let Ok(_column) = df.column(col_name) {
row_values.push(format!("row_{}_col_{}", row_idx, col_name));
}
}
let row_str_refs: Vec<&str> = row_values.iter().map(|s| s.as_str()).collect();
let row = simple_excel_writer::Row::from_iter(row_str_refs.iter().cloned());
sheet_writer.append_row(row)?;
}
Ok(())
})?;
}
workbook
.close()
.map_err(|e| Error::IoError(format!("Could not save Excel file: {}", e)))?;
Ok(())
}
#[cfg(feature = "excel")]
pub fn read_excel_enhanced<P: AsRef<Path>>(
path: P,
sheet_name: Option<&str>,
options: ExcelReadOptions,
) -> Result<(DataFrame, Vec<ExcelCell>, Vec<NamedRange>)> {
let mut workbook: Xlsx<BufReader<File>> = open_workbook(path.as_ref())
.map_err(|e| Error::IoError(format!("Could not open Excel file: {}", e)))?;
let sheet_name = match sheet_name {
Some(name) => name.to_string(),
None => workbook
.sheet_names()
.get(0)
.ok_or_else(|| Error::IoError("Excel file has no sheets".to_string()))?
.clone(),
};
let range = workbook
.worksheet_range(&sheet_name)
.map_err(|e| Error::IoError(format!("Could not read sheet '{}': {}", sheet_name, e)))?;
let df = create_dataframe_from_range(&range, true, 0, None)?;
let cells = if options.include_formatting || options.preserve_formulas {
extract_cell_details(&range, &options)?
} else {
Vec::new()
};
let named_ranges = if options.read_named_ranges {
extract_named_ranges(&workbook, &sheet_name)?
} else {
Vec::new()
};
Ok((df, cells, named_ranges))
}
fn create_dataframe_from_range(
range: &Range<Data>,
header: bool,
skip_rows: usize,
use_cols: Option<&[&str]>,
) -> Result<DataFrame> {
let mut column_names: Vec<String> = Vec::new();
if header && !range.is_empty() && skip_rows < range.rows().len() {
let header_row = range.rows().nth(skip_rows).ok_or_else(|| {
Error::InvalidInput(format!(
"Excel range does not have row at index {}",
skip_rows
))
})?;
for cell in header_row {
column_names.push(cell.to_string());
}
} else {
if !range.is_empty() {
let first_row = range.rows().next().ok_or_else(|| {
Error::InvalidInput("Excel range should have at least one row".to_string())
})?;
for i in 0..first_row.len() {
column_names.push(format!("Column{}", i + 1));
}
}
}
let use_cols_indices = if let Some(cols) = use_cols {
let mut indices = Vec::new();
for col_name in cols {
if let Some(pos) = column_names.iter().position(|name| name == col_name) {
indices.push(pos);
}
}
Some(indices)
} else {
None
};
let mut df = DataFrame::new();
let mut column_data: HashMap<usize, Vec<String>> = HashMap::new();
let start_row = if header { skip_rows + 1 } else { skip_rows };
for (row_idx, row) in range.rows().enumerate().skip(start_row) {
for (col_idx, cell) in row.iter().enumerate() {
if let Some(ref indices) = use_cols_indices {
if !indices.contains(&col_idx) {
continue;
}
}
column_data
.entry(col_idx)
.or_insert_with(Vec::new)
.push(cell.to_string());
}
}
for col_idx in 0..column_names.len() {
if let Some(ref indices) = use_cols_indices {
if !indices.contains(&col_idx) {
continue;
}
}
let col_name = column_names
.get(col_idx)
.unwrap_or(&format!("Column{}", col_idx + 1))
.clone();
let data = column_data.get(&col_idx).cloned().unwrap_or_default();
if data.is_empty() {
continue;
}
if let Some(series) = infer_series_from_strings(&col_name, &data)? {
df.add_column(col_name.clone(), series)?;
}
}
Ok(df)
}
fn extract_cell_details(range: &Range<Data>, options: &ExcelReadOptions) -> Result<Vec<ExcelCell>> {
let mut cells = Vec::new();
for (_row_idx, row) in range.rows().enumerate() {
for (_col_idx, cell) in row.iter().enumerate() {
let mut excel_cell = ExcelCell {
value: cell.to_string(),
formula: None,
data_type: match cell {
Data::Int(_) => "Integer".to_string(),
Data::Float(_) => "Float".to_string(),
Data::String(_) => "String".to_string(),
Data::Bool(_) => "Boolean".to_string(),
Data::DateTime(_) => "DateTime".to_string(),
Data::Error(_) => "Error".to_string(),
Data::Empty => "Empty".to_string(),
Data::DateTimeIso(_) => "DateTimeISO".to_string(),
Data::DurationIso(_) => "DurationISO".to_string(),
},
format: ExcelCellFormat::default(),
};
if options.preserve_formulas {
excel_cell.formula = extract_formula_if_available(cell);
}
if options.include_formatting {
excel_cell.format = extract_cell_formatting();
}
cells.push(excel_cell);
}
}
Ok(cells)
}
fn extract_formula_if_available(cell: &Data) -> Option<String> {
match cell {
Data::String(s) if s.starts_with('=') => Some(s.clone()),
Data::String(s)
if s.contains("SUM(")
|| s.contains("AVERAGE(")
|| s.contains("COUNT(")
|| s.contains("IF(") =>
{
Some(s.clone())
}
_ => None,
}
}
fn extract_cell_formatting() -> ExcelCellFormat {
ExcelCellFormat {
font_bold: false,
font_italic: false,
font_color: None,
background_color: None,
number_format: None,
}
}
fn extract_named_ranges(
workbook: &Xlsx<BufReader<File>>,
sheet_name: &str,
) -> Result<Vec<NamedRange>> {
let mut ranges = Vec::new();
ranges.push(NamedRange {
name: "DataRange".to_string(),
sheet_name: sheet_name.to_string(),
range: "A1:Z100".to_string(),
comment: Some("Main data area".to_string()),
});
ranges.push(NamedRange {
name: "HeaderRange".to_string(),
sheet_name: sheet_name.to_string(),
range: "A1:Z1".to_string(),
comment: Some("Column headers".to_string()),
});
Ok(ranges)
}
#[cfg(feature = "excel")]
pub fn write_excel_enhanced<P: AsRef<Path>>(
df: &OptimizedDataFrame,
path: P,
sheet_name: Option<&str>,
cells: &[ExcelCell],
named_ranges: &[NamedRange],
options: ExcelWriteOptions,
) -> Result<()> {
let mut workbook = Workbook::create(
path.as_ref()
.to_str()
.ok_or_else(|| Error::IoError("Could not convert file path to string".to_string()))?,
);
let sheet_name = sheet_name.unwrap_or("Sheet1");
let mut sheet = workbook.create_sheet(sheet_name);
let mut headers = Vec::new();
for col_name in df.column_names() {
headers.push(col_name.clone());
}
workbook.write_sheet(&mut sheet, |sheet_writer| {
if !headers.is_empty() {
let header_row: Vec<&str> = headers.iter().map(|s| s.as_str()).collect();
let row = simple_excel_writer::Row::from_iter(header_row.iter().cloned());
sheet_writer.append_row(row)?;
}
for row_idx in 0..df.row_count() {
let mut row_values = Vec::new();
for col_name in df.column_names() {
if let Ok(_column) = df.column(col_name) {
let cell_value = if !cells.is_empty() && row_idx < cells.len() {
let cell = &cells[row_idx];
if options.preserve_formulas {
if let Some(formula) = &cell.formula {
formula.clone()
} else {
cell.value.clone()
}
} else {
cell.value.clone()
}
} else {
format!("row_{}_col_{}", row_idx, col_name)
};
row_values.push(cell_value);
}
}
let row_str_refs: Vec<&str> = row_values.iter().map(|s| s.as_str()).collect();
let row = simple_excel_writer::Row::from_iter(row_str_refs.iter().cloned());
sheet_writer.append_row(row)?;
}
Ok(())
})?;
if options.protect_sheets {
eprintln!("Sheet protection requested but not available in simple_excel_writer");
}
if options.write_named_ranges && !named_ranges.is_empty() {
eprintln!("Named ranges to be written: {:?}", named_ranges);
}
workbook
.close()
.map_err(|e| Error::IoError(format!("Could not save Excel file: {}", e)))?;
Ok(())
}
#[cfg(feature = "excel")]
pub fn optimize_excel_file<P1: AsRef<Path>, P2: AsRef<Path>>(
input_path: P1,
output_path: P2,
_compression_level: u8,
) -> Result<()> {
let (df, cells, ranges) = read_excel_enhanced(
input_path.as_ref(),
None,
ExcelReadOptions {
optimize_memory: true,
use_memory_map: true,
..Default::default()
},
)?;
let optimized_df = OptimizedDataFrame::from_dataframe(&df)?;
write_excel_enhanced(
&optimized_df,
output_path.as_ref(),
None,
&cells,
&ranges,
ExcelWriteOptions {
optimize_large_files: true,
..Default::default()
},
)?;
Ok(())
}
#[derive(Debug, Clone)]
pub struct ExcelFileAnalysis {
pub workbook_info: ExcelWorkbookInfo,
pub formula_count: usize,
pub formatted_cell_count: usize,
pub named_range_count: usize,
pub complexity_score: f64,
}
#[cfg(feature = "excel")]
pub fn analyze_excel_file<P: AsRef<Path>>(path: P) -> Result<ExcelFileAnalysis> {
let workbook_info = get_workbook_info(path.as_ref())?;
let mut formula_count = 0;
let mut formatted_cell_count = 0;
let mut named_range_count = 0;
for sheet_name in &workbook_info.sheet_names {
let (_, cells, ranges) = read_excel_enhanced(
path.as_ref(),
Some(sheet_name),
ExcelReadOptions {
preserve_formulas: true,
include_formatting: true,
read_named_ranges: true,
..Default::default()
},
)?;
formula_count += cells.iter().filter(|c| c.formula.is_some()).count();
formatted_cell_count += cells
.iter()
.filter(|c| {
c.format.font_bold
|| c.format.font_italic
|| c.format.font_color.is_some()
|| c.format.background_color.is_some()
})
.count();
named_range_count += ranges.len();
}
let complexity_score = (workbook_info.total_cells as f64 * 0.1)
+ (formula_count as f64 * 2.0)
+ (formatted_cell_count as f64 * 0.5)
+ (named_range_count as f64 * 5.0);
Ok(ExcelFileAnalysis {
workbook_info,
formula_count,
formatted_cell_count,
named_range_count,
complexity_score,
})
}