dataload 0.1.1

A flexible data loading library for CSV and Excel files with automatic delimiter detection
Documentation
//! Excel file loading implementation.
//!
//! This module is only available when the `excel` feature is enabled.

use std::io::Cursor;

use calamine::{Data, Reader, Sheets, open_workbook_auto_from_rs};
use polars::prelude::*;

use crate::error::{DataLoadError, Result};
use crate::options::LoadOptions;

/// Loads Excel content into a DataFrame.
///
/// Supports xlsx, xls, xlsm, xlsb, and ods formats.
///
/// # Arguments
///
/// * `content` - The raw bytes of the Excel file.
/// * `options` - Loading options including sheet selection, header settings, etc.
///
/// # Errors
///
/// Returns an error if:
/// - The Excel file cannot be parsed
/// - The specified sheet does not exist
/// - The workbook contains no sheets
/// - The sheet is empty
pub fn load_excel(content: &[u8], options: &LoadOptions) -> Result<DataFrame> {
    let cursor = Cursor::new(content);
    let mut workbook: Sheets<_> = open_workbook_auto_from_rs(cursor)?;

    // Determine which sheet to read
    let sheet_name = get_sheet_name(&workbook, options)?;

    let range = workbook.worksheet_range(&sheet_name)?;

    if range.is_empty() {
        return Err(DataLoadError::EmptyFile);
    }

    let height = range.height();
    let width = range.width();

    if width == 0 {
        return Err(DataLoadError::EmptyFile);
    }

    // Apply skip_rows
    let data_start_row = options.skip_rows;

    if data_start_row >= height {
        return Err(DataLoadError::EmptyFile);
    }

    // Determine headers
    let (headers, content_start_row) = if options.has_header {
        let header_row = data_start_row;
        let headers: Vec<String> = (0..width)
            .map(|col| {
                range
                    .get((header_row, col))
                    .map(cell_to_string)
                    .filter(|s| !s.is_empty())
                    .unwrap_or_else(|| format!("column_{col}"))
            })
            .collect();
        (headers, data_start_row + 1)
    } else {
        let headers: Vec<String> = (0..width)
            .map(|col| format!("column_{col}"))
            .collect();
        (headers, data_start_row)
    };

    // Calculate end row
    let end_row = match options.max_rows {
        Some(max) => (content_start_row + max).min(height),
        None => height,
    };

    if content_start_row >= end_row {
        // No data rows after header
        return Ok(create_empty_dataframe(&headers));
    }

    // Build columns with type inference
    let columns: Vec<Column> = (0..width)
        .map(|col| build_column(&range, col, &headers[col], content_start_row, end_row))
        .collect();

    Ok(DataFrame::new(columns)?)
}

/// Gets the sheet name to read based on options.
fn get_sheet_name(workbook: &Sheets<Cursor<&[u8]>>, options: &LoadOptions) -> Result<String> {
    let sheet_names = workbook.sheet_names();

    if sheet_names.is_empty() {
        return Err(DataLoadError::NoSheetsFound);
    }

    // Sheet name takes precedence
    if let Some(ref name) = options.sheet_name {
        if sheet_names.contains(name) {
            return Ok(name.clone());
        }
        return Err(DataLoadError::UnsupportedFileType(format!(
            "Sheet '{}' not found. Available sheets: {:?}",
            name, sheet_names
        )));
    }

    // Use sheet index
    if let Some(index) = options.sheet_index {
        if index < sheet_names.len() {
            return Ok(sheet_names[index].clone());
        }
        return Err(DataLoadError::UnsupportedFileType(format!(
            "Sheet index {} out of range. File has {} sheets.",
            index,
            sheet_names.len()
        )));
    }

    // Default to first sheet
    Ok(sheet_names[0].clone())
}

/// Builds a column (Column) from Excel data with basic type inference.
fn build_column(
    range: &calamine::Range<Data>,
    col: usize,
    name: &str,
    start_row: usize,
    end_row: usize,
) -> Column {
    // First pass: determine the predominant type
    let mut has_int = false;
    let mut has_float = false;
    let mut has_bool = false;
    let mut has_string = false;

    for row in start_row..end_row {
        if let Some(cell) = range.get((row, col)) {
            match cell {
                Data::Int(_) => has_int = true,
                Data::Float(_) => has_float = true,
                Data::Bool(_) => has_bool = true,
                Data::String(_) | Data::DateTimeIso(_) | Data::DurationIso(_) => {
                    has_string = true;
                }
                Data::DateTime(_) => has_float = true, // Excel stores dates as floats
                Data::Empty | Data::Error(_) => {}
            }
        }
    }

    // Decide column type based on what we found
    if has_string || (has_bool && (has_int || has_float)) {
        // Mixed types or strings present: use String
        build_string_column(range, col, name, start_row, end_row)
    } else if has_float || (has_int && has_float) {
        // Any floats: use f64
        build_float_column(range, col, name, start_row, end_row)
    } else if has_int {
        // Only integers: use i64
        build_int_column(range, col, name, start_row, end_row)
    } else if has_bool {
        // Only booleans: use bool
        build_bool_column(range, col, name, start_row, end_row)
    } else {
        // All empty/error: use String with nulls
        build_string_column(range, col, name, start_row, end_row)
    }
}

fn build_string_column(
    range: &calamine::Range<Data>,
    col: usize,
    name: &str,
    start_row: usize,
    end_row: usize,
) -> Column {
    let values: Vec<Option<String>> = (start_row..end_row)
        .map(|row| {
            range.get((row, col)).and_then(|cell| {
                let s = cell_to_string(cell);
                if s.is_empty() {
                    None
                } else {
                    Some(s)
                }
            })
        })
        .collect();
    Column::new(name.into(), values)
}

fn build_float_column(
    range: &calamine::Range<Data>,
    col: usize,
    name: &str,
    start_row: usize,
    end_row: usize,
) -> Column {
    let values: Vec<Option<f64>> = (start_row..end_row)
        .map(|row| {
            range.get((row, col)).and_then(|cell| match cell {
                Data::Float(f) => Some(*f),
                Data::Int(i) => Some(*i as f64),
                Data::DateTime(dt) => Some(dt.as_f64()),
                Data::Empty => None,
                Data::String(s) => s.parse().ok(),
                _ => None,
            })
        })
        .collect();
    Column::new(name.into(), values)
}

fn build_int_column(
    range: &calamine::Range<Data>,
    col: usize,
    name: &str,
    start_row: usize,
    end_row: usize,
) -> Column {
    let values: Vec<Option<i64>> = (start_row..end_row)
        .map(|row| {
            range.get((row, col)).and_then(|cell| match cell {
                Data::Int(i) => Some(*i),
                Data::Float(f) if f.fract() == 0.0 => Some(*f as i64),
                Data::Empty => None,
                Data::String(s) => s.parse().ok(),
                _ => None,
            })
        })
        .collect();
    Column::new(name.into(), values)
}

fn build_bool_column(
    range: &calamine::Range<Data>,
    col: usize,
    name: &str,
    start_row: usize,
    end_row: usize,
) -> Column {
    let values: Vec<Option<bool>> = (start_row..end_row)
        .map(|row| {
            range.get((row, col)).and_then(|cell| match cell {
                Data::Bool(b) => Some(*b),
                Data::Empty => None,
                _ => None,
            })
        })
        .collect();
    Column::new(name.into(), values)
}

/// Converts an Excel cell to a string representation.
fn cell_to_string(cell: &Data) -> String {
    match cell {
        Data::Empty => String::new(),
        Data::String(s) => s.clone(),
        Data::Int(i) => i.to_string(),
        Data::Float(f) => {
            // Avoid unnecessary decimal places for whole numbers
            if f.fract() == 0.0 && f.abs() < i64::MAX as f64 {
                (*f as i64).to_string()
            } else {
                f.to_string()
            }
        }
        Data::Bool(b) => b.to_string(),
        Data::DateTime(dt) => format_excel_datetime(dt.as_f64()),
        Data::DateTimeIso(s) => s.clone(),
        Data::DurationIso(s) => s.clone(),
        Data::Error(e) => format!("#ERROR:{e:?}"),
    }
}

/// Formats an Excel datetime serial number as a string.
///
/// Excel stores dates as days since 1900-01-01 (with a bug for 1900-02-29).
fn format_excel_datetime(serial: f64) -> String {
    // This is a simplified conversion - for full accuracy,
    // consider using the chrono crate
    let days = serial.trunc() as i64;
    let time_fraction = serial.fract();

    // Excel's epoch is 1900-01-01, but it incorrectly treats 1900 as a leap year
    // Day 1 = 1900-01-01, Day 60 = 1900-02-29 (doesn't exist), Day 61 = 1900-03-01
    let adjusted_days = if days >= 60 { days - 1 } else { days };

    // Convert to a basic date representation
    // This is approximate - for production use, use chrono
    let hours = (time_fraction * 24.0).trunc() as u32;
    let minutes = ((time_fraction * 24.0).fract() * 60.0).trunc() as u32;

    if time_fraction > 0.0 {
        format!("Day {adjusted_days} {hours:02}:{minutes:02}")
    } else {
        format!("Day {adjusted_days}")
    }
}

/// Creates an empty DataFrame with the given column names.
fn create_empty_dataframe(headers: &[String]) -> DataFrame {
    let columns: Vec<Column> = headers
        .iter()
        .map(|name| Column::new(name.as_str().into(), Vec::<String>::new()))
        .collect();

    DataFrame::new(columns).unwrap_or_default()
}

/// Returns the list of sheet names in an Excel file.
///
/// # Arguments
///
/// * `content` - The raw bytes of the Excel file.
///
/// # Errors
///
/// Returns an error if the Excel file cannot be parsed.
pub fn list_sheets(content: &[u8]) -> Result<Vec<String>> {
    let cursor = Cursor::new(content);
    let workbook: Sheets<_> = open_workbook_auto_from_rs(cursor)?;
    Ok(workbook.sheet_names().to_vec())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cell_to_string() {
        assert_eq!(cell_to_string(&Data::Empty), "");
        assert_eq!(cell_to_string(&Data::String("hello".to_string())), "hello");
        assert_eq!(cell_to_string(&Data::Int(42)), "42");
        assert_eq!(cell_to_string(&Data::Float(3.14)), "3.14");
        assert_eq!(cell_to_string(&Data::Float(42.0)), "42");
        assert_eq!(cell_to_string(&Data::Bool(true)), "true");
    }
}