use std::io::Cursor;
use calamine::{Data, Reader, open_workbook_auto_from_rs};
use serde_json::{Map, Value as JsonValue};
use crate::error::{TransformError, TransformErrorKind};
use crate::model::{ExcelFormulaPolicy, ExcelInput, ExcelSheetRef, RuleFile};
use super::{InputData, NormalizationOptions, enforce_records_limit};
mod cell;
mod columns;
mod range;
mod workbook;
mod worksheet;
mod xlsx;
mod xml;
use self::cell::{cell_at, excel_cell_to_json, formula_at};
use self::columns::{
read_explicit_column_names, read_header_names, selected_column_indexes, validate_header_names,
};
use self::range::parse_cell_window;
use self::xlsx::{preflight_xlsx_package, xlsx_bytes_for_calamine};
pub fn normalize_excel_records(
rule: &RuleFile,
input: InputData<'_>,
options: &NormalizationOptions,
) -> Result<Vec<JsonValue>, TransformError> {
let excel = rule.input.excel.as_ref().ok_or_else(|| {
TransformError::new(
TransformErrorKind::InvalidInput,
"input.excel is required when format=excel",
)
})?;
let bytes = excel_input_bytes(input, options)?;
preflight_xlsx_package(bytes, excel, options)?;
let calamine_bytes = xlsx_bytes_for_calamine(bytes)?;
let cursor = Cursor::new(calamine_bytes.as_ref());
let mut workbook = open_workbook_auto_from_rs(cursor).map_err(|err| {
TransformError::new(
TransformErrorKind::InvalidInput,
format!("failed to open Excel workbook: {}", err),
)
})?;
let sheet_names = workbook.sheet_names().to_owned();
if sheet_names.is_empty() {
return Err(invalid("Excel workbook has no sheets"));
}
if sheet_names.len() > options.max_excel_sheets {
return Err(invalid("input exceeds max_excel_sheets"));
}
let sheet_name = select_sheet_name(excel, &sheet_names)?;
let range = workbook.worksheet_range(&sheet_name).map_err(|err| {
TransformError::new(
TransformErrorKind::InvalidInput,
format!("failed to read Excel sheet: {}", err),
)
})?;
let formulas = if excel.formula == ExcelFormulaPolicy::Cached {
None
} else {
Some(workbook.worksheet_formula(&sheet_name).map_err(|err| {
TransformError::new(
TransformErrorKind::InvalidInput,
format!("failed to read Excel formulas: {}", err),
)
})?)
};
let rows: Vec<&[Data]> = range.rows().collect();
let formula_end = formulas
.as_ref()
.and_then(|formulas| formulas.end())
.map(|(row, col)| (row as usize + 1, col as usize + 1))
.unwrap_or((0, 0));
let row_count = rows.len().max(formula_end.0);
let window = parse_cell_window(excel.range.as_deref())?;
let max_width = rows.iter().map(|row| row.len()).max().unwrap_or(0);
let max_width = max_width.max(formula_end.1);
if row_count > options.max_excel_rows {
return Err(invalid("input exceeds max_excel_rows"));
}
let selected_columns = selected_column_indexes(excel, &rows, window, max_width)?;
let cell_count = row_count
.checked_mul(selected_columns.len())
.ok_or_else(|| invalid("input exceeds max_excel_cells"))?;
if cell_count > options.max_excel_cells {
return Err(invalid("input exceeds max_excel_cells"));
}
let headers = if excel.has_header {
read_header_names(&rows, window, &selected_columns, excel.header_row)?
} else {
read_explicit_column_names(excel)?
};
validate_header_names(&headers)?;
let data_start_row = if excel.has_header {
excel
.data_start_row
.map(|row| row.saturating_sub(1))
.unwrap_or_else(|| excel.header_row)
} else {
excel
.data_start_row
.map(|row| row.saturating_sub(1))
.unwrap_or(window.start_row)
}
.max(window.start_row);
let data_end_row = window
.end_row
.unwrap_or_else(|| row_count.saturating_sub(1));
let mut records = Vec::new();
if data_start_row >= row_count || data_start_row > data_end_row {
return Ok(records);
}
for row_index in data_start_row..=data_end_row.min(row_count.saturating_sub(1)) {
let mut obj = Map::new();
for (field_index, column_index) in selected_columns.iter().enumerate() {
let cell = cell_at(&rows, row_index, *column_index);
let formula = formula_at(formulas.as_ref(), row_index, *column_index);
if let Some(value) = excel_cell_to_json(cell, formula, excel, options)? {
obj.insert(headers[field_index].clone(), value);
}
}
if obj.is_empty() {
continue;
}
records.push(JsonValue::Object(obj));
enforce_records_limit(records.len(), options)?;
}
Ok(records)
}
fn excel_input_bytes<'a>(
input: InputData<'a>,
options: &NormalizationOptions,
) -> Result<&'a [u8], TransformError> {
let bytes = match input {
InputData::Text(_) => {
return Err(invalid("Excel input must be provided as bytes"));
}
InputData::Bytes(bytes) => bytes,
};
if bytes.len() > options.max_input_bytes {
return Err(invalid("input exceeds max_input_bytes"));
}
Ok(bytes)
}
fn select_sheet_name(excel: &ExcelInput, sheet_names: &[String]) -> Result<String, TransformError> {
match &excel.sheet {
Some(ExcelSheetRef::Name(name)) => {
if sheet_names.iter().any(|sheet_name| sheet_name == name) {
Ok(name.clone())
} else {
Err(invalid("Excel sheet was not found"))
}
}
Some(ExcelSheetRef::Index(index)) => sheet_names
.get(*index)
.cloned()
.ok_or_else(|| invalid("Excel sheet index is out of range")),
None => sheet_names
.first()
.cloned()
.ok_or_else(|| invalid("Excel workbook has no sheets")),
}
}
pub(super) fn invalid(message: impl Into<String>) -> TransformError {
TransformError::new(TransformErrorKind::InvalidInput, message)
}