use std::io::Cursor;
use calamine::{Data, Reader, Sheets, open_workbook_auto_from_rs};
use polars::prelude::*;
use crate::error::{DataLoadError, Result};
use crate::options::LoadOptions;
pub fn load_excel(content: &[u8], options: &LoadOptions) -> Result<DataFrame> {
let cursor = Cursor::new(content);
let mut workbook: Sheets<_> = open_workbook_auto_from_rs(cursor)?;
let sheet_name = get_sheet_name(&workbook, options)?;
let range = workbook.worksheet_range(&sheet_name)?;
if range.is_empty() {
return Err(DataLoadError::EmptyFile);
}
let height = range.height();
let width = range.width();
if width == 0 {
return Err(DataLoadError::EmptyFile);
}
let data_start_row = options.skip_rows;
if data_start_row >= height {
return Err(DataLoadError::EmptyFile);
}
let (headers, content_start_row) = if options.has_header {
let header_row = data_start_row;
let headers: Vec<String> = (0..width)
.map(|col| {
range
.get((header_row, col))
.map(cell_to_string)
.filter(|s| !s.is_empty())
.unwrap_or_else(|| format!("column_{col}"))
})
.collect();
(headers, data_start_row + 1)
} else {
let headers: Vec<String> = (0..width)
.map(|col| format!("column_{col}"))
.collect();
(headers, data_start_row)
};
let end_row = match options.max_rows {
Some(max) => (content_start_row + max).min(height),
None => height,
};
if content_start_row >= end_row {
return Ok(create_empty_dataframe(&headers));
}
let columns: Vec<Column> = (0..width)
.map(|col| build_column(&range, col, &headers[col], content_start_row, end_row))
.collect();
Ok(DataFrame::new(columns)?)
}
fn get_sheet_name(workbook: &Sheets<Cursor<&[u8]>>, options: &LoadOptions) -> Result<String> {
let sheet_names = workbook.sheet_names();
if sheet_names.is_empty() {
return Err(DataLoadError::NoSheetsFound);
}
if let Some(ref name) = options.sheet_name {
if sheet_names.contains(name) {
return Ok(name.clone());
}
return Err(DataLoadError::UnsupportedFileType(format!(
"Sheet '{}' not found. Available sheets: {:?}",
name, sheet_names
)));
}
if let Some(index) = options.sheet_index {
if index < sheet_names.len() {
return Ok(sheet_names[index].clone());
}
return Err(DataLoadError::UnsupportedFileType(format!(
"Sheet index {} out of range. File has {} sheets.",
index,
sheet_names.len()
)));
}
Ok(sheet_names[0].clone())
}
fn build_column(
range: &calamine::Range<Data>,
col: usize,
name: &str,
start_row: usize,
end_row: usize,
) -> Column {
let mut has_int = false;
let mut has_float = false;
let mut has_bool = false;
let mut has_string = false;
for row in start_row..end_row {
if let Some(cell) = range.get((row, col)) {
match cell {
Data::Int(_) => has_int = true,
Data::Float(_) => has_float = true,
Data::Bool(_) => has_bool = true,
Data::String(_) | Data::DateTimeIso(_) | Data::DurationIso(_) => {
has_string = true;
}
Data::DateTime(_) => has_float = true, Data::Empty | Data::Error(_) => {}
}
}
}
if has_string || (has_bool && (has_int || has_float)) {
build_string_column(range, col, name, start_row, end_row)
} else if has_float || (has_int && has_float) {
build_float_column(range, col, name, start_row, end_row)
} else if has_int {
build_int_column(range, col, name, start_row, end_row)
} else if has_bool {
build_bool_column(range, col, name, start_row, end_row)
} else {
build_string_column(range, col, name, start_row, end_row)
}
}
fn build_string_column(
range: &calamine::Range<Data>,
col: usize,
name: &str,
start_row: usize,
end_row: usize,
) -> Column {
let values: Vec<Option<String>> = (start_row..end_row)
.map(|row| {
range.get((row, col)).and_then(|cell| {
let s = cell_to_string(cell);
if s.is_empty() {
None
} else {
Some(s)
}
})
})
.collect();
Column::new(name.into(), values)
}
fn build_float_column(
range: &calamine::Range<Data>,
col: usize,
name: &str,
start_row: usize,
end_row: usize,
) -> Column {
let values: Vec<Option<f64>> = (start_row..end_row)
.map(|row| {
range.get((row, col)).and_then(|cell| match cell {
Data::Float(f) => Some(*f),
Data::Int(i) => Some(*i as f64),
Data::DateTime(dt) => Some(dt.as_f64()),
Data::Empty => None,
Data::String(s) => s.parse().ok(),
_ => None,
})
})
.collect();
Column::new(name.into(), values)
}
fn build_int_column(
range: &calamine::Range<Data>,
col: usize,
name: &str,
start_row: usize,
end_row: usize,
) -> Column {
let values: Vec<Option<i64>> = (start_row..end_row)
.map(|row| {
range.get((row, col)).and_then(|cell| match cell {
Data::Int(i) => Some(*i),
Data::Float(f) if f.fract() == 0.0 => Some(*f as i64),
Data::Empty => None,
Data::String(s) => s.parse().ok(),
_ => None,
})
})
.collect();
Column::new(name.into(), values)
}
fn build_bool_column(
range: &calamine::Range<Data>,
col: usize,
name: &str,
start_row: usize,
end_row: usize,
) -> Column {
let values: Vec<Option<bool>> = (start_row..end_row)
.map(|row| {
range.get((row, col)).and_then(|cell| match cell {
Data::Bool(b) => Some(*b),
Data::Empty => None,
_ => None,
})
})
.collect();
Column::new(name.into(), values)
}
fn cell_to_string(cell: &Data) -> String {
match cell {
Data::Empty => String::new(),
Data::String(s) => s.clone(),
Data::Int(i) => i.to_string(),
Data::Float(f) => {
if f.fract() == 0.0 && f.abs() < i64::MAX as f64 {
(*f as i64).to_string()
} else {
f.to_string()
}
}
Data::Bool(b) => b.to_string(),
Data::DateTime(dt) => format_excel_datetime(dt.as_f64()),
Data::DateTimeIso(s) => s.clone(),
Data::DurationIso(s) => s.clone(),
Data::Error(e) => format!("#ERROR:{e:?}"),
}
}
fn format_excel_datetime(serial: f64) -> String {
let days = serial.trunc() as i64;
let time_fraction = serial.fract();
let adjusted_days = if days >= 60 { days - 1 } else { days };
let hours = (time_fraction * 24.0).trunc() as u32;
let minutes = ((time_fraction * 24.0).fract() * 60.0).trunc() as u32;
if time_fraction > 0.0 {
format!("Day {adjusted_days} {hours:02}:{minutes:02}")
} else {
format!("Day {adjusted_days}")
}
}
fn create_empty_dataframe(headers: &[String]) -> DataFrame {
let columns: Vec<Column> = headers
.iter()
.map(|name| Column::new(name.as_str().into(), Vec::<String>::new()))
.collect();
DataFrame::new(columns).unwrap_or_default()
}
pub fn list_sheets(content: &[u8]) -> Result<Vec<String>> {
let cursor = Cursor::new(content);
let workbook: Sheets<_> = open_workbook_auto_from_rs(cursor)?;
Ok(workbook.sheet_names().to_vec())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_cell_to_string() {
assert_eq!(cell_to_string(&Data::Empty), "");
assert_eq!(cell_to_string(&Data::String("hello".to_string())), "hello");
assert_eq!(cell_to_string(&Data::Int(42)), "42");
assert_eq!(cell_to_string(&Data::Float(3.14)), "3.14");
assert_eq!(cell_to_string(&Data::Float(42.0)), "42");
assert_eq!(cell_to_string(&Data::Bool(true)), "true");
}
}