#[cfg(feature = "python")]
mod python;
use calamine::{Data, Range, Table};
#[cfg(feature = "polars")]
use polars_core::frame::DataFrame;
#[cfg(feature = "python")]
use pyo3::pyclass;
use crate::{
FastExcelColumn, FastExcelErrorKind, IdxOrName, LoadSheetOrTableOptions, SelectedColumns,
data::height_without_tail_whitespace,
error::{ErrorContext, FastExcelResult},
types::{
dtype::DTypes,
excelsheet::{
Header, Pagination,
column_info::{
AvailableColumns, ColumnInfo, build_available_columns_info, finalize_column_info,
},
deferred_selection_to_concrete,
},
},
utils::schema::get_schema_sample_rows,
};
/// An Excel table (a `calamine::Table`) bundled with the options it was
/// loaded with, its resolved header/pagination, and lazily computed column
/// and dimension metadata.
#[derive(Debug)]
#[cfg_attr(feature = "python", pyclass(name = "_ExcelTable"))]
pub struct ExcelTable {
/// Table name, copied from `Table::name()` at construction.
name: String,
/// Name of the sheet holding the table, copied from `Table::sheet_name()`.
sheet_name: String,
/// Finalized info for the columns picked by `opts.selected_columns`.
selected_columns: Vec<ColumnInfo>,
/// Info for every available column; starts as `Pending` and is filled in
/// on first request.
available_columns: AvailableColumns,
/// The underlying calamine table; its `data()` range backs all reads.
table: Table<Data>,
/// How column names are obtained (explicit names or a header row).
header: Header,
/// Row skipping / row count settings built from the load options.
pagination: Pagination,
/// Load options, possibly with the column selection rewritten to indices.
opts: LoadSheetOrTableOptions,
/// Cache for `height()`; `None` until first computed.
height: Option<usize>,
/// Cache for `total_height()`; `None` until first computed.
total_height: Option<usize>,
/// Cache for `width()`; `None` until first computed.
width: Option<usize>,
/// Row limit computed once by `compute_limit` during construction.
limit: usize,
}
impl ExcelTable {
/// Resolves `selected_columns` against the columns of `table`.
///
/// Index selectors are used exactly as given. Name selectors are looked up
/// in the table's column names and turned into an index by adding the
/// column at which the table's data range starts (0 if the range has no
/// start).
///
/// Returns the names of the table columns that ended up selected, in table
/// order, together with the selection rewritten as indices.
///
/// # Errors
///
/// Returns a `ColumnNotFound` error, with the available column names as
/// context, when a name selector matches no table column.
fn extract_selected_columns_and_table_columns(
    table: &Table<Data>,
    selected_columns: &[IdxOrName],
) -> FastExcelResult<(Vec<String>, Vec<IdxOrName>)> {
    let all_names: Vec<String> = table.columns().into();
    let first_col = match table.data().start() {
        Some((_row, col)) => col as usize,
        None => 0,
    };

    // Resolve every selector to an index, bailing out on the first unknown name.
    let mut wanted: Vec<usize> = Vec::with_capacity(selected_columns.len());
    for selector in selected_columns {
        let resolved: FastExcelResult<usize> = match selector {
            IdxOrName::Idx(idx) => Ok(*idx),
            IdxOrName::Name(name) => all_names
                .iter()
                .position(|candidate| candidate.as_str() == name.as_str())
                .map(|pos| pos + first_col)
                .ok_or_else(|| FastExcelErrorKind::ColumnNotFound(name.clone().into()).into())
                .with_context(|| format!("available columns are: {all_names:?}")),
        };
        wanted.push(resolved?);
    }

    // Keep only the names whose shifted position was selected (table order).
    let kept_names: Vec<String> = all_names
        .into_iter()
        .enumerate()
        .filter(|(pos, _)| wanted.contains(&(pos + first_col)))
        .map(|(_, name)| name)
        .collect();

    let reselected: Vec<IdxOrName> = wanted.into_iter().map(IdxOrName::from).collect();

    Ok((kept_names, reselected))
}
/// Chooses the `Header` for `table` and, when needed, rewrites the column
/// selection stored in `opts`.
///
/// * Explicit `opts.column_names` win: they become `Header::With`.
/// * Otherwise an explicit `opts.header_row` becomes `Header::At`.
/// * Otherwise the table's own column names are used. For `Selection` and
///   `DeferredSelection` the selection is resolved to indices and the header
///   is restricted to the selected table columns; any other selection keeps
///   all table columns and leaves `opts` untouched.
///
/// # Errors
///
/// Propagates errors from resolving the selection against the table columns.
fn build_header_and_update_selection(
    table: &Table<Data>,
    opts: LoadSheetOrTableOptions,
) -> FastExcelResult<(Header, LoadSheetOrTableOptions)> {
    if let Some(column_names) = &opts.column_names {
        let header = Header::With(column_names.clone());
        return Ok((header, opts));
    }
    if let Some(row) = opts.header_row {
        return Ok((Header::At(row), opts));
    }

    // Neither names nor header row given: fall back to the table's columns.
    let resolved = match &opts.selected_columns {
        SelectedColumns::Selection(explicit) => Some(
            Self::extract_selected_columns_and_table_columns(table, explicit)?,
        ),
        SelectedColumns::DeferredSelection(deferred) => {
            // The deferred selection is made concrete using the end column
            // of the data range (0 when the range has no end).
            let concrete = deferred_selection_to_concrete(
                deferred,
                table.data().end().map_or(0, |(_row, col)| col as usize),
            );
            Some(Self::extract_selected_columns_and_table_columns(
                table, &concrete,
            )?)
        }
        _ => None,
    };

    Ok(match resolved {
        Some((table_columns, selection)) => (
            Header::With(table_columns),
            opts.selected_columns(SelectedColumns::Selection(selection)),
        ),
        None => (Header::With(table.columns().into()), opts),
    })
}
pub(crate) fn try_new(
table: Table<Data>,
opts: LoadSheetOrTableOptions,
) -> FastExcelResult<Self> {
let pagination = Pagination::try_new(opts.skip_rows.clone(), opts.n_rows, table.data())?;
let (header, opts) = Self::build_header_and_update_selection(&table, opts)?;
let available_columns_info =
build_available_columns_info(table.data(), &opts.selected_columns, &header)?;
let selected_columns_info = opts
.selected_columns
.select_columns(available_columns_info)?;
let mut excel_table = ExcelTable {
name: table.name().to_owned(),
sheet_name: table.sheet_name().to_owned(),
available_columns: AvailableColumns::Pending,
selected_columns: Vec::with_capacity(0),
table,
header,
pagination,
opts,
height: None,
total_height: None,
width: None,
limit: 0,
};
excel_table.limit = excel_table.compute_limit();
let row_limit = get_schema_sample_rows(
excel_table.opts.schema_sample_rows,
excel_table.offset(),
excel_table.limit(),
);
let selected_columns = finalize_column_info(
selected_columns_info,
excel_table.data(),
excel_table.offset(),
row_limit,
excel_table.opts.dtypes.as_ref(),
&excel_table.opts.dtype_coercion,
excel_table.opts.whitespace_as_null,
)?;
excel_table.selected_columns = selected_columns;
Ok(excel_table)
}
/// Returns the cell range of the underlying table (`Table::data()`).
pub(crate) fn data(&self) -> &Range<Data> {
self.table.data()
}
/// Computes and stores the available-columns info if it is still pending.
///
/// Does nothing when the info is already loaded. Otherwise the info is
/// built from the table data, the current selection and the header, then
/// finalized over rows `offset()..limit()` (the full limit, not a schema
/// sample) and stored as `AvailableColumns::Loaded`.
///
/// # Errors
///
/// Propagates errors from building or finalizing the column info; in that
/// case the stored state is left unchanged.
fn ensure_available_columns_loaded(&mut self) -> FastExcelResult<()> {
    if matches!(self.available_columns, AvailableColumns::Loaded(_)) {
        return Ok(());
    }

    let pending_info = build_available_columns_info(
        self.table.data(),
        &self.opts.selected_columns,
        &self.header,
    )?;
    let loaded_info = finalize_column_info(
        pending_info,
        self.data(),
        self.offset(),
        self.limit(),
        self.opts.dtypes.as_ref(),
        &self.opts.dtype_coercion,
        self.opts.whitespace_as_null,
    )?;

    self.available_columns = AvailableColumns::Loaded(loaded_info);
    Ok(())
}
/// Ensures the available-columns info has been computed, then returns it
/// as a slice.
///
/// # Errors
///
/// Propagates errors from computing the info or from `as_loaded`.
fn load_available_columns(&mut self) -> FastExcelResult<&[ColumnInfo]> {
self.ensure_available_columns_loaded()?;
self.available_columns.as_loaded()
}
/// Returns the row offset used when reading data: the header's offset plus
/// the pagination's offset.
pub fn offset(&self) -> usize {
self.header.offset() + self.pagination.offset()
}
/// Computes the exclusive row limit for reading data.
///
/// The upper bound is the height of the data range, or, when
/// `skip_whitespace_tail_rows` is set, the height without trailing
/// whitespace rows (falling back to the full height when that helper
/// returns `None`). When pagination specifies a row count, the limit is
/// `offset() + n_rows`, capped at the upper bound.
///
/// The addition saturates, so a very large `n_rows` simply results in the
/// upper bound instead of overflowing.
fn compute_limit(&self) -> usize {
    let upper_bound = if self.opts.skip_whitespace_tail_rows {
        height_without_tail_whitespace(self.data()).unwrap_or_else(|| self.data().height())
    } else {
        self.data().height()
    };

    match self.pagination.n_rows() {
        // Saturate rather than wrap: a wrapped sum would be a bogus, tiny limit.
        Some(n_rows) => self.offset().saturating_add(n_rows).min(upper_bound),
        None => upper_bound,
    }
}
/// Returns the row limit stored in this table (set from `compute_limit`
/// during construction).
pub fn limit(&self) -> usize {
self.limit
}
/// Returns a clone of the info for the selected columns.
pub fn selected_columns(&self) -> Vec<ColumnInfo> {
self.selected_columns.clone()
}
/// Returns a copy of the info for all available columns, computing it
/// first if it has not been loaded yet.
///
/// # Errors
///
/// Propagates errors from loading the available-columns info.
pub fn available_columns(&mut self) -> FastExcelResult<Vec<ColumnInfo>> {
self.load_available_columns().map(|cols| cols.to_vec())
}
/// Returns the dtypes given in the load options, if any.
pub fn specified_dtypes(&self) -> Option<&DTypes> {
self.opts.dtypes.as_ref()
}
/// Returns the width of the table's data range, caching it after the
/// first call.
pub fn width(&mut self) -> usize {
    if let Some(cached) = self.width {
        return cached;
    }
    let computed = self.data().width();
    self.width = Some(computed);
    computed
}
/// Returns the number of rows between `offset()` and `limit()`, caching
/// the value after the first call.
///
/// The subtraction saturates at zero: if the offset is past the limit the
/// height is 0 rather than an underflow (a panic in debug builds, a huge
/// wrapped value in release builds).
// NOTE(review): whether offset can exceed limit depends on validation done
// in `Pagination::try_new` and on whitespace-tail trimming; neither is
// visible here, so this guards defensively.
pub fn height(&mut self) -> usize {
    if let Some(cached) = self.height {
        return cached;
    }
    let height = self.limit().saturating_sub(self.offset());
    self.height = Some(height);
    height
}
/// Returns the height of the data range minus the header's offset, caching
/// the value after the first call.
///
/// The subtraction saturates at zero, so a header offset larger than the
/// data height yields 0 instead of underflowing.
pub fn total_height(&mut self) -> usize {
    if let Some(cached) = self.total_height {
        return cached;
    }
    let total_height = self.data().height().saturating_sub(self.header.offset());
    self.total_height = Some(total_height);
    total_height
}
/// Returns the table's name.
pub fn name(&self) -> &str {
&self.name
}
/// Returns the name of the sheet that contains the table.
pub fn sheet_name(&self) -> &str {
&self.sheet_name
}
/// Builds one `FastExcelColumn` per selected column, reading rows
/// `offset()..limit()` of the table data.
///
/// # Errors
///
/// Returns the first error produced while building a column.
pub fn to_columns(&self) -> FastExcelResult<Vec<FastExcelColumn>> {
    let data = self.table.data();
    let (offset, limit) = (self.offset(), self.limit());
    let whitespace_as_null = self.opts.whitespace_as_null;

    let mut columns = Vec::with_capacity(self.selected_columns.len());
    for column_info in &self.selected_columns {
        let column = FastExcelColumn::try_from_column_info(
            column_info,
            data,
            offset,
            limit,
            whitespace_as_null,
        )?;
        columns.push(column);
    }
    Ok(columns)
}
/// Converts the selected columns into a polars `DataFrame`, letting polars
/// infer the frame height from the columns.
///
/// # Errors
///
/// Propagates errors from building the columns, and returns an `Internal`
/// error if polars cannot create the `DataFrame`.
#[cfg(feature = "polars")]
pub fn to_polars(&self) -> FastExcelResult<DataFrame> {
    let columns = self.to_columns()?;
    let pl_columns = columns.into_iter().map(Into::into).collect();

    match DataFrame::new_infer_height(pl_columns) {
        Ok(frame) => Ok(frame),
        Err(err) => Err(crate::error::FastExcelErrorKind::Internal(format!(
            "could not create DataFrame: {err:?}"
        ))
        .into()),
    }
}
}