#[cfg(feature = "python")]
mod python;
use std::{
collections::{HashMap, HashSet},
fmt::{Debug, Display},
str::FromStr,
sync::OnceLock,
};
use calamine::{CellErrorType, CellType, DataType, Range};
use log::warn;
#[cfg(feature = "python")]
use pyo3::{IntoPyObject, IntoPyObjectRef};
use crate::error::{FastExcelError, FastExcelErrorKind, FastExcelResult};
use super::idx_or_name::IdxOrName;
#[derive(Debug, Clone, PartialEq, Eq, Hash, Copy)]
pub enum DType {
Null,
Int,
Float,
String,
Bool,
DateTime,
Date,
Duration,
}
impl FromStr for DType {
type Err = FastExcelError;
fn from_str(raw_dtype: &str) -> FastExcelResult<Self> {
match raw_dtype {
"null" => Ok(Self::Null),
"int" => Ok(Self::Int),
"float" => Ok(Self::Float),
"string" => Ok(Self::String),
"boolean" => Ok(Self::Bool),
"datetime" => Ok(Self::DateTime),
"date" => Ok(Self::Date),
"duration" => Ok(Self::Duration),
_ => Err(FastExcelErrorKind::InvalidParameters(format!(
"unsupported dtype: \"{raw_dtype}\""
))
.into()),
}
}
}
impl Display for DType {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(match self {
DType::Null => "null",
DType::Int => "int",
DType::Float => "float",
DType::String => "string",
DType::Bool => "boolean",
DType::DateTime => "datetime",
DType::Date => "date",
DType::Duration => "duration",
})
}
}
pub type DTypeMap = HashMap<IdxOrName, DType>;
#[derive(Debug, Clone)]
#[cfg_attr(feature = "python", derive(IntoPyObject, IntoPyObjectRef))]
pub enum DTypes {
All(DType),
Map(DTypeMap),
}
impl FromStr for DTypes {
type Err = FastExcelError;
fn from_str(dtypes: &str) -> FastExcelResult<Self> {
Ok(DTypes::All(DType::from_str(dtypes)?))
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash, Copy, Default)]
pub enum DTypeCoercion {
#[default]
Coerce,
Strict,
}
impl FromStr for DTypeCoercion {
type Err = FastExcelError;
fn from_str(raw_dtype_coercion: &str) -> FastExcelResult<Self> {
match raw_dtype_coercion {
"coerce" => Ok(Self::Coerce),
"strict" => Ok(Self::Strict),
_ => Err(FastExcelErrorKind::InvalidParameters(format!(
"unsupported dtype_coercion: \"{raw_dtype_coercion}\""
))
.into()),
}
}
}
const NULL_STRING_VALUES: [&str; 19] = [
"", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN",
"<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null",
];
fn get_cell_dtype<DT: CellType + Debug + DataType>(
data: &Range<DT>,
row: usize,
col: usize,
whitespace_as_null: bool,
) -> FastExcelResult<DType> {
let cell = data
.get((row, col))
.ok_or(FastExcelErrorKind::CannotRetrieveCellData(row, col))?;
if cell.is_int() {
Ok(DType::Int)
} else if cell.is_float() {
Ok(DType::Float)
} else if cell.is_string() {
if NULL_STRING_VALUES.contains(&cell.get_string().unwrap())
|| (whitespace_as_null
&& cell
.get_string()
.is_none_or(|s| s.trim().is_empty()))
{
Ok(DType::Null)
} else {
Ok(DType::String)
}
} else if cell.is_bool() {
Ok(DType::Bool)
} else if cell.is_datetime() {
let excel_datetime = cell
.get_datetime()
.expect("calamine indicated that cell is a datetime but get_datetime returned None");
Ok(if excel_datetime.is_datetime() {
DType::DateTime
} else {
DType::Duration
})
}
else if cell.is_datetime_iso() {
match cell.as_datetime() {
Some(_) => Ok(DType::DateTime),
None => Ok(DType::Date),
}
}
else if cell.is_duration_iso() {
Ok(DType::Duration)
}
else if cell.is_empty() {
Ok(DType::Null)
} else if cell.is_error() {
match cell.get_error() {
Some(
CellErrorType::NA
| CellErrorType::Value
| CellErrorType::Null
| CellErrorType::Ref
| CellErrorType::Num
| CellErrorType::Div0,
) => Ok(DType::Null),
Some(err) => Err(FastExcelErrorKind::CalamineCellError(err.to_owned()).into()),
None => Err(FastExcelErrorKind::Internal(format!(
"cell is an error but get_error returned None: {cell:?}"
))
.into()),
}
} else {
Err(FastExcelErrorKind::Internal(format!("unsupported cell type: {cell:?}")).into())
}
}
static FLOAT_TYPES_CELL: OnceLock<HashSet<DType>> = OnceLock::new();
static INT_TYPES_CELL: OnceLock<HashSet<DType>> = OnceLock::new();
static STRING_TYPES_CELL: OnceLock<HashSet<DType>> = OnceLock::new();
fn float_types() -> &'static HashSet<DType> {
FLOAT_TYPES_CELL.get_or_init(|| HashSet::from([DType::Int, DType::Float, DType::Bool]))
}
fn int_types() -> &'static HashSet<DType> {
INT_TYPES_CELL.get_or_init(|| HashSet::from([DType::Int, DType::Bool]))
}
fn string_types() -> &'static HashSet<DType> {
STRING_TYPES_CELL.get_or_init(|| {
HashSet::from([
DType::Bool,
DType::Int,
DType::Float,
DType::String,
DType::DateTime,
DType::Date,
])
})
}
pub(crate) fn get_dtype_for_column<DT: CellType + Debug + DataType>(
data: &Range<DT>,
start_row: usize,
end_row: usize,
col: usize,
dtype_coercion: &DTypeCoercion,
whitespace_as_null: bool,
) -> FastExcelResult<DType> {
let mut column_types = (start_row..end_row)
.map(|row| get_cell_dtype(data, row, col, whitespace_as_null))
.collect::<FastExcelResult<HashSet<_>>>()?;
column_types.remove(&DType::Null);
if column_types.is_empty() {
if start_row == end_row {
Ok(DType::Null)
} else {
warn!("Could not determine dtype for column {col}, falling back to string");
Ok(DType::String)
}
} else if matches!(dtype_coercion, &DTypeCoercion::Strict) && column_types.len() != 1 {
Err(
FastExcelErrorKind::UnsupportedColumnTypeCombination(format!(
"type coercion is strict and column contains {column_types:?}"
))
.into(),
)
} else if column_types.len() == 1 {
Ok(column_types.into_iter().next().unwrap())
} else if column_types.is_subset(int_types()) {
Ok(DType::Int)
} else if column_types.is_subset(float_types()) {
Ok(DType::Float)
} else if column_types.is_subset(string_types()) {
Ok(DType::String)
} else {
Err(
FastExcelErrorKind::UnsupportedColumnTypeCombination(format!("{column_types:?}"))
.into(),
)
}
}
pub(crate) fn excel_float_to_string(x: f64) -> String {
format!("{x:.9}")
.trim_end_matches('0')
.trim_end_matches('.')
.to_string()
}
#[cfg(feature = "__pyo3-tests")]
#[cfg(test)]
mod tests {
use calamine::{Cell, Data as CalData};
use pretty_assertions::assert_eq;
use rstest::{fixture, rstest};
use super::*;
#[fixture]
fn range() -> Range<CalData> {
Range::from_sparse(vec![
Cell::new((0, 0), CalData::Bool(true)),
Cell::new((1, 0), CalData::Bool(false)),
Cell::new((2, 0), CalData::String("NULL".to_string())),
Cell::new((3, 0), CalData::Int(42)),
Cell::new((4, 0), CalData::Float(13.37)),
Cell::new((5, 0), CalData::String("hello".to_string())),
Cell::new((6, 0), CalData::Empty),
Cell::new((7, 0), CalData::String("#N/A".to_string())),
Cell::new((8, 0), CalData::Int(12)),
Cell::new((9, 0), CalData::Float(12.21)),
Cell::new((10, 0), CalData::Bool(true)),
Cell::new((11, 0), CalData::Int(1337)),
])
}
#[rstest]
#[case(0, 2, DType::Bool)]
#[case(3, 4, DType::Int)]
#[case(4, 5, DType::Float)]
#[case(5, 6, DType::String)]
#[case(3, 5, DType::Float)]
#[case(2, 5, DType::Float)]
#[case(4, 6, DType::String)]
#[case(3, 6, DType::String)]
#[case(2, 8, DType::String)]
#[case(6, 9, DType::Int)]
#[case(7, 10, DType::Float)]
#[case(7, 11, DType::Float)]
#[case(10, 12, DType::Int)]
fn get_arrow_column_type_multi_dtype_ok_coerce(
range: Range<CalData>,
#[case] start_row: usize,
#[case] end_row: usize,
#[case] expected: DType,
) {
assert_eq!(
get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Coerce, false)
.unwrap(),
expected
);
}
#[rstest]
#[case(0, 2, DType::Bool)]
#[case(3, 4, DType::Int)]
#[case(4, 5, DType::Float)]
#[case(5, 6, DType::String)]
#[case(6, 9, DType::Int)]
fn get_arrow_column_type_multi_dtype_ok_strict(
range: Range<CalData>,
#[case] start_row: usize,
#[case] end_row: usize,
#[case] expected: DType,
) {
assert_eq!(
get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Strict, false)
.unwrap(),
expected
);
}
#[rstest]
#[case(3, 5)]
#[case(4, 6)]
#[case(3, 6)]
#[case(2, 8)]
#[case(7, 10)]
#[case(7, 11)]
#[case(10, 12)]
fn get_arrow_column_type_multi_dtype_ko_strict(
range: Range<CalData>,
#[case] start_row: usize,
#[case] end_row: usize,
) {
let result =
get_dtype_for_column(&range, start_row, end_row, 0, &DTypeCoercion::Strict, false);
assert!(matches!(
result.unwrap_err().kind,
FastExcelErrorKind::UnsupportedColumnTypeCombination(_)
));
}
#[rstest]
#[case(29.020000000000003, "29.02")]
#[case(10000_f64, "10000")]
#[case(23.0, "23")]
fn test_excel_float_to_string(#[case] x: f64, #[case] expected: &str) {
assert_eq!(excel_float_to_string(x), expected.to_string());
}
}