fastexcel 0.20.1

A fast Excel reader for Rust and Python
Documentation
use arrow_array::RecordBatch;
use pyo3::{Bound, IntoPyObjectExt, PyAny, PyResult, Python, pymethods, types::PyString};

use super::{DefinedName, ExcelReader};

use crate::{
    ExcelSheet,
    data::{ExcelSheetData, record_batch_from_data_and_columns},
    error::{ErrorContext, FastExcelErrorKind, FastExcelResult, py_errors::IntoPyResult},
    types::{
        dtype::{DTypeCoercion, DTypes},
        excelreader::LoadSheetOrTableOptions,
        excelsheet::{
            Header, Pagination, SelectedColumns, SkipRows,
            column_info::{build_available_columns_info, finalize_column_info},
        },
        idx_or_name::IdxOrName,
    },
    utils::schema::get_schema_sample_rows,
};

impl ExcelReader {
    /// Converts the optional Python `use_columns` argument into a [`SelectedColumns`].
    ///
    /// # Errors
    ///
    /// A failed conversion is passed through `with_context` with a message listing the
    /// accepted Python types and the debug representation of the value that was received.
    fn build_selected_columns(
        use_columns: Option<&Bound<'_, PyAny>>,
    ) -> FastExcelResult<SelectedColumns> {
        let converted: FastExcelResult<SelectedColumns> = use_columns.try_into();
        converted.with_context(|| {
            format!("expected selected columns to be list[str] | list[int] | str | Callable[[ColumnInfo], bool] | None, got {use_columns:?}")
        })
    }

    /// Eagerly builds an arrow [`RecordBatch`] from a sheet's cell data.
    ///
    /// The row window (`offset..limit`) is derived from the header and pagination settings
    /// found in `opts`. Column information is then built, filtered through the selected
    /// columns and finalized over that window, and the batch is created from the result.
    ///
    /// # Errors
    ///
    /// Propagates any error returned while building the pagination, building, selecting or
    /// finalizing the column information, or creating the record batch.
    fn load_sheet_eager(
        data: &ExcelSheetData,
        opts: LoadSheetOrTableOptions,
    ) -> FastExcelResult<RecordBatch> {
        // Must be read before fields are moved out of `opts` below.
        let data_header_row = opts.data_header_row();
        let pagination = match data {
            ExcelSheetData::Owned(range) => {
                Pagination::try_new(opts.skip_rows, opts.n_rows, range)?
            }
            ExcelSheetData::Ref(range) => Pagination::try_new(opts.skip_rows, opts.n_rows, range)?,
        };

        let header = Header::new(data_header_row, opts.column_names);

        // Combined offset contributed by the header and by the pagination.
        let offset = header.offset() + pagination.offset();
        let limit = {
            let upper_bound = data.height();
            if let Some(n_rows) = pagination.n_rows() {
                // `n_rows` is caller-provided and may be arbitrarily large: saturate instead of
                // overflowing, so that an oversized value simply means "up to the data's height".
                offset.saturating_add(n_rows).min(upper_bound)
            } else {
                upper_bound
            }
        };

        let sample_rows_limit = get_schema_sample_rows(opts.schema_sample_rows, offset, limit);
        let available_columns_info =
            build_available_columns_info(data, &opts.selected_columns, &header)?;
        let final_columns_info = opts
            .selected_columns
            .select_columns(available_columns_info)?;

        let available_columns = finalize_column_info(
            final_columns_info,
            data,
            offset,
            sample_rows_limit,
            opts.dtypes.as_ref(),
            &opts.dtype_coercion,
            opts.whitespace_as_null,
        )?;

        // Both variants are handled the same way; the match only unwraps the concrete range.
        match data {
            ExcelSheetData::Owned(data) => record_batch_from_data_and_columns(
                &available_columns,
                data,
                offset,
                limit,
                opts.whitespace_as_null,
            ),
            ExcelSheetData::Ref(data) => record_batch_from_data_and_columns(
                &available_columns,
                data,
                offset,
                limit,
                opts.whitespace_as_null,
            ),
        }
    }

    /// Loads the sheet designated by `idx_or_name` and converts it into a Python object.
    ///
    /// * When `eager` is set and the underlying reader reports `supports_by_ref()`, the
    ///   range is read by reference, turned into a record batch by `load_sheet_eager` and
    ///   exported with `to_pyarrow`.
    /// * Otherwise an [`ExcelSheet`] is built from the worksheet range; it is exported with
    ///   `to_arrow` when `eager` is set, and converted into a Python object as-is when not.
    ///
    /// # Errors
    ///
    /// Returns the Python-converted error of any failing step. Without the `pyarrow`
    /// feature, eager loading ends with a `PyRuntimeError`.
    fn build_sheet<'py>(
        &mut self,
        idx_or_name: IdxOrName,
        opts: LoadSheetOrTableOptions,
        eager: bool,
        py: Python<'py>,
    ) -> PyResult<Bound<'py, PyAny>> {
        let header_row = opts.calamine_header_row();
        let meta = self.find_sheet_meta(idx_or_name).into_pyresult()?.to_owned();

        if eager && self.sheets.supports_by_ref() {
            // By-ref path: no `ExcelSheet` is built, the range goes straight to a record batch.
            let range_ref = py
                .detach(|| {
                    self.sheets
                        .with_header_row(header_row)
                        .worksheet_range_ref(&meta.name)
                })
                .into_pyresult()?;
            let batch = py
                .detach(|| Self::load_sheet_eager(&range_ref.into(), opts))
                .into_pyresult()?;

            #[cfg(feature = "pyarrow")]
            {
                use arrow_pyarrow::ToPyArrow;
                return batch.to_pyarrow(py);
            }
            #[cfg(not(feature = "pyarrow"))]
            {
                return Err(pyo3::exceptions::PyRuntimeError::new_err(
                    "Eager loading requires pyarrow feature. Use eager=False for PyCapsule interface.",
                ));
            }
        }

        let owned_range = py
            .detach(|| {
                self.sheets
                    .with_header_row(header_row)
                    .worksheet_range(&meta.name)
            })
            .into_pyresult()?;
        let sheet = ExcelSheet::try_new(meta, owned_range.into(), opts).into_pyresult()?;

        if !eager {
            return sheet.into_bound_py_any(py);
        }

        #[cfg(feature = "pyarrow")]
        {
            sheet.to_arrow(py)
        }
        #[cfg(not(feature = "pyarrow"))]
        {
            Err(pyo3::exceptions::PyRuntimeError::new_err(
                "Eager loading requires pyarrow feature. Use eager=False for PyCapsule interface.",
            ))
        }
    }

    /// Loads the table called `name` and converts it into a Python object.
    ///
    /// The table is loaded through `load_table` inside `py.detach`. When `eager` is set,
    /// the table is exported with `to_arrow`; otherwise the table object itself is converted
    /// into a Python object.
    ///
    /// # Errors
    ///
    /// Returns the Python-converted loading error, any error raised by the conversion, or a
    /// `PyRuntimeError` when `eager` is requested without the `pyarrow` feature.
    fn build_table<'py>(
        &mut self,
        name: &str,
        opts: LoadSheetOrTableOptions,
        eager: bool,
        py: Python<'py>,
    ) -> PyResult<Bound<'py, PyAny>> {
        let excel_table = py.detach(|| self.load_table(name, opts)).into_pyresult()?;

        if eager {
            #[cfg(feature = "pyarrow")]
            {
                Ok(excel_table.to_arrow(py)?)
            }
            #[cfg(not(feature = "pyarrow"))]
            {
                Err(pyo3::exceptions::PyRuntimeError::new_err(
                    "Eager loading requires pyarrow feature. Use eager=False for PyCapsule interface.",
                ))
            }
        } else {
            excel_table.into_bound_py_any(py)
        }
    }
}

#[pymethods]
impl ExcelReader {
    /// Python `repr()` of the reader: `ExcelReader<source>`.
    pub fn __repr__(&self) -> String {
        let source = &self.source;
        format!("ExcelReader<{source}>")
    }

    /// Returns the table names reported by the underlying sheets reader, optionally
    /// restricted to `sheet_name`.
    #[pyo3(name = "table_names", signature = (sheet_name = None))]
    pub(crate) fn py_table_names(&mut self, sheet_name: Option<&str>) -> PyResult<Vec<&str>> {
        let names = self.sheets.table_names(sheet_name);
        names.into_pyresult()
    }

    /// Returns the defined names of the workbook, converting any error into a Python error.
    #[pyo3(name = "defined_names")]
    pub(crate) fn py_defined_names(&mut self) -> PyResult<Vec<DefinedName>> {
        let defined_names = self.defined_names();
        defined_names.into_pyresult()
    }

    #[pyo3(name = "load_sheet", signature = (
        idx_or_name,
        *,
        header_row = 0,
        column_names = None,
        skip_rows = SkipRows::SkipEmptyRowsAtBeginning,
        n_rows = None,
        schema_sample_rows = 1_000,
        dtype_coercion = DTypeCoercion::Coerce,
        use_columns = None,
        dtypes = None,
        eager = false,
        skip_whitespace_tail_rows = false,
        whitespace_as_null = false,
    ))]
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn py_load_sheet<'py>(
        &mut self,
        idx_or_name: &Bound<'py, PyAny>,
        header_row: Option<usize>,
        column_names: Option<Vec<String>>,
        skip_rows: SkipRows,
        n_rows: Option<usize>,
        schema_sample_rows: Option<usize>,
        dtype_coercion: DTypeCoercion,
        use_columns: Option<&Bound<'py, PyAny>>,
        dtypes: Option<DTypes>,
        eager: bool,
        skip_whitespace_tail_rows: bool,
        whitespace_as_null: bool,
        py: Python<'py>,
    ) -> PyResult<Bound<'py, PyAny>> {
        // Cannot use NonZeroUsize in the parameters, as it is not supported by pyo3
        if let Some(0) = schema_sample_rows {
            return Err(FastExcelErrorKind::InvalidParameters(
                "schema_sample_rows cannot be 0, as it would prevent dtype inferring".to_string(),
            )
            .into())
            .into_pyresult();
        }
        let idx_or_name = idx_or_name.try_into().into_pyresult()?;
        let selected_columns = Self::build_selected_columns(use_columns).into_pyresult()?;
        let opts = LoadSheetOrTableOptions {
            header_row,
            column_names,
            skip_rows,
            n_rows,
            schema_sample_rows,
            dtype_coercion,
            selected_columns,
            dtypes,
            skip_whitespace_tail_rows,
            whitespace_as_null,
        };

        self.build_sheet(idx_or_name, opts, eager, py)
    }

    /// Loads the table called `name` with the given loading options.
    ///
    /// `schema_sample_rows = 0` is rejected with an invalid-parameters error. `use_columns`
    /// is then converted, all options are gathered into a `LoadSheetOrTableOptions` and the
    /// work is delegated to `build_table`.
    #[pyo3(name = "load_table", signature = (
        name,
        *,
        header_row = 0,
        column_names = None,
        skip_rows = SkipRows::SkipEmptyRowsAtBeginning,
        n_rows = None,
        schema_sample_rows = 1_000,
        dtype_coercion = DTypeCoercion::Coerce,
        use_columns = None,
        dtypes = None,
        eager = false,
        skip_whitespace_tail_rows = false,
        whitespace_as_null = false,
    ))]
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn py_load_table<'py>(
        &mut self,
        name: &Bound<'py, PyString>,
        header_row: Option<usize>,
        column_names: Option<Vec<String>>,
        skip_rows: SkipRows,
        n_rows: Option<usize>,
        schema_sample_rows: Option<usize>,
        dtype_coercion: DTypeCoercion,
        use_columns: Option<&Bound<'py, PyAny>>,
        dtypes: Option<DTypes>,
        eager: bool,
        skip_whitespace_tail_rows: bool,
        whitespace_as_null: bool,
        py: Python<'py>,
    ) -> PyResult<Bound<'py, PyAny>> {
        // pyo3 does not support `NonZeroUsize` parameters, hence the manual zero check
        if matches!(schema_sample_rows, Some(0)) {
            let invalid = FastExcelErrorKind::InvalidParameters(
                "schema_sample_rows cannot be 0, as it would prevent dtype inferring".to_owned(),
            );
            return Err(invalid.into()).into_pyresult();
        }

        let selected_columns = Self::build_selected_columns(use_columns).into_pyresult()?;
        let table_opts = LoadSheetOrTableOptions {
            header_row,
            column_names,
            skip_rows,
            n_rows,
            schema_sample_rows,
            dtype_coercion,
            selected_columns,
            dtypes,
            skip_whitespace_tail_rows,
            whitespace_as_null,
        };

        let table_name = name.to_string();
        self.build_table(&table_name, table_opts, eager, py)
    }

    #[getter("sheet_names")]
    pub(crate) fn py_sheet_names(&self) -> Vec<&str> {
        self.sheet_names()
    }
}

#[pymethods]
impl DefinedName {
    /// Creates a new `DefinedName` object.
    #[new]
    pub fn py_new(name: String, formula: String) -> Self {
        DefinedName { name, formula }
    }

    /// The name of this defined name.
    #[getter("name")]
    pub fn py_name(&self) -> &str {
        &self.name
    }

    /// The formula this defined name refers to.
    #[getter("formula")]
    pub fn py_formula(&self) -> &str {
        &self.formula
    }

    /// Python `repr()`: `DefinedName<name (formula)>`.
    ///
    /// Only the first 10 characters of the formula are shown; `...` is appended when, and
    /// only when, the formula was actually shortened.
    pub fn __repr__(&self) -> String {
        // Maximum number of formula characters displayed in the representation.
        const MAX_FORMULA_CHARS: usize = 10;

        let name = self.name.as_str();
        let formula = self.formula.as_str();

        // The byte index of the character following the displayed prefix, if there is one.
        // Cutting there always lands on a character boundary, so multi-byte formulas are
        // truncated too, and a formula of exactly `MAX_FORMULA_CHARS` characters is left as-is.
        match formula.char_indices().nth(MAX_FORMULA_CHARS) {
            Some((cut, _)) => format!("DefinedName<{name} ({}...)>", &formula[..cut]),
            None => format!("DefinedName<{name} ({formula})>"),
        }
    }

    /// Python `==`: compares two defined names with the type's `PartialEq` implementation.
    pub fn __eq__(&self, other: &Self) -> bool {
        self == other
    }
}