// dataload 0.1.1
//
// A flexible data loading library for CSV and Excel files with automatic delimiter detection.
// Documentation
//! Main data loader implementation.

use std::path::Path;

use polars::prelude::DataFrame;

use crate::error::{DataLoadError, Result};
use crate::file_type::{detect_file_type, FileType};
use crate::options::LoadOptions;

/// Loads CSV and Excel data into Polars [`DataFrame`]s.
///
/// A `DataLoader` owns a set of [`LoadOptions`] and is configured through
/// chained `with_*` calls, each of which consumes the loader and returns the
/// updated one. The configured loader can then be used to load any number of
/// files or byte buffers.
///
/// # Examples
///
/// ## Basic usage
///
/// ```no_run
/// use dataload::DataLoader;
/// use std::path::Path;
///
/// let df = DataLoader::new()
///     .load_file(Path::new("data.csv"))?;
/// # Ok::<(), dataload::DataLoadError>(())
/// ```
///
/// ## With custom options
///
/// ```no_run
/// use dataload::{DataLoader, Delimiter};
/// use std::path::Path;
///
/// let df = DataLoader::new()
///     .with_delimiter(Delimiter::Tab)
///     .with_header(false)
///     .with_max_rows(Some(1000))
///     .load_file(Path::new("data.tsv"))?;
/// # Ok::<(), dataload::DataLoadError>(())
/// ```
///
/// ## Loading from bytes
///
/// ```
/// use dataload::DataLoader;
///
/// let csv_data = b"name,age\nAlice,30\nBob,25";
/// let df = DataLoader::new()
///     .load_bytes(csv_data, "data.csv")?;
/// assert_eq!(df.shape(), (2, 2));
/// # Ok::<(), dataload::DataLoadError>(())
/// ```
#[derive(Clone, Debug, Default)]
pub struct DataLoader {
    /// Settings handed to the format-specific parser on every load.
    options: LoadOptions,
}

impl DataLoader {
    /// Creates a new `DataLoader` with default options.
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a `DataLoader` that uses the given, already-built options.
    #[must_use]
    pub const fn with_options(options: LoadOptions) -> Self {
        Self { options }
    }

    /// Sets the delimiter for CSV parsing.
    ///
    /// Use `Delimiter::Auto` (default) for automatic detection.
    #[must_use]
    pub fn with_delimiter(mut self, delimiter: crate::delimiter::Delimiter) -> Self {
        self.options.delimiter = delimiter;
        self
    }

    /// Sets whether the first row contains headers.
    #[must_use]
    pub const fn with_header(mut self, has_header: bool) -> Self {
        self.options.has_header = has_header;
        self
    }

    /// Sets the number of rows to skip from the start.
    #[must_use]
    pub const fn with_skip_rows(mut self, skip_rows: usize) -> Self {
        self.options.skip_rows = skip_rows;
        self
    }

    /// Sets the maximum number of rows to read.
    ///
    /// Pass `None` to clear a previously configured limit.
    #[must_use]
    pub const fn with_max_rows(mut self, max_rows: Option<usize>) -> Self {
        self.options.max_rows = max_rows;
        self
    }

    /// Sets the sheet index to read from Excel files (0-indexed).
    #[must_use]
    pub const fn with_sheet_index(mut self, index: usize) -> Self {
        self.options.sheet_index = Some(index);
        self
    }

    /// Sets the sheet name to read from Excel files.
    #[must_use]
    pub fn with_sheet_name(mut self, name: impl Into<String>) -> Self {
        self.options.sheet_name = Some(name.into());
        self
    }

    /// Sets whether to infer data types.
    #[must_use]
    pub const fn with_infer_schema(mut self, infer: bool) -> Self {
        self.options.infer_schema = infer;
        self
    }

    /// Returns a reference to the current options.
    #[must_use]
    pub const fn options(&self) -> &LoadOptions {
        &self.options
    }

    /// Loads a file from the filesystem.
    ///
    /// The whole file is read into memory and then handed to
    /// [`DataLoader::load_bytes`] together with the final component of
    /// `path`, which serves as the file name for type detection.
    ///
    /// File names that are not valid UTF-8 are accepted: the name is converted
    /// lossily, since it is only used as a hint for type detection.
    ///
    /// # Arguments
    ///
    /// * `path` - Path to the file to load.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The file cannot be read
    /// - The path has no final file name component (for example, it ends in `..`)
    /// - The file type is not supported
    /// - The file content cannot be parsed
    pub fn load_file(&self, path: &Path) -> Result<DataFrame> {
        // Read first so that I/O failures keep taking precedence over name problems.
        let content = std::fs::read(path)?;

        // A lossy conversion is enough here: rejecting a non-UTF-8 name would
        // make an otherwise readable file impossible to load.
        let filename = path
            .file_name()
            .map(|name| name.to_string_lossy())
            .ok_or_else(|| DataLoadError::UnsupportedFileType("invalid path".into()))?;

        self.load_bytes(&content, &filename)
    }

    /// Loads data from raw bytes.
    ///
    /// The detected file type selects the parser; the loader's current options
    /// are passed to it unchanged.
    ///
    /// # Arguments
    ///
    /// * `content` - The raw bytes of the file.
    /// * `filename` - The filename (used for type detection via extension).
    ///
    /// # Errors
    ///
    /// Returns an error if:
    /// - The file type cannot be detected
    /// - The data is an Excel file but the crate was built without the
    ///   `excel` feature
    /// - The file content cannot be parsed
    pub fn load_bytes(&self, content: &[u8], filename: &str) -> Result<DataFrame> {
        let file_type = detect_file_type(content, filename)
            .ok_or_else(|| DataLoadError::UnsupportedFileType(filename.to_string()))?;

        match file_type {
            FileType::Csv => crate::csv::load_csv(content, &self.options),
            #[cfg(feature = "excel")]
            FileType::Excel => crate::excel::load_excel(content, &self.options),
            // Without the feature the Excel parser is not compiled in, so
            // report that instead of failing to build.
            #[cfg(not(feature = "excel"))]
            FileType::Excel => Err(DataLoadError::UnsupportedFileType(
                "Excel support requires the 'excel' feature".into(),
            )),
        }
    }
}

/// Loads the file at `path` using a [`DataLoader`] with default options.
///
/// Shorthand for building a default loader and calling
/// [`DataLoader::load_file`] on it.
///
/// # Errors
///
/// Returns whatever error [`DataLoader::load_file`] produces.
///
/// # Examples
///
/// ```no_run
/// use dataload::load_file;
/// use std::path::Path;
///
/// let df = load_file(Path::new("data.csv"))?;
/// # Ok::<(), dataload::DataLoadError>(())
/// ```
pub fn load_file(path: &Path) -> Result<DataFrame> {
    let loader = DataLoader::new();
    loader.load_file(path)
}

/// Loads in-memory data using a [`DataLoader`] with default options.
///
/// Shorthand for building a default loader and calling
/// [`DataLoader::load_bytes`] on it; `filename` is forwarded unchanged.
///
/// # Errors
///
/// Returns whatever error [`DataLoader::load_bytes`] produces.
///
/// # Examples
///
/// ```
/// use dataload::load_bytes;
///
/// let csv_data = b"a,b,c\n1,2,3";
/// let df = load_bytes(csv_data, "data.csv")?;
/// # Ok::<(), dataload::DataLoadError>(())
/// ```
pub fn load_bytes(content: &[u8], filename: &str) -> Result<DataFrame> {
    let loader = DataLoader::new();
    loader.load_bytes(content, filename)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::Delimiter;

    /// A freshly created loader auto-detects the delimiter and expects a header row.
    #[test]
    fn test_dataloader_default() {
        let defaults = DataLoader::new();
        let opts = defaults.options();

        assert_eq!(opts.delimiter, Delimiter::Auto);
        assert!(opts.has_header);
    }

    /// Every builder call is reflected in the stored options.
    #[test]
    fn test_dataloader_builder() {
        let configured = DataLoader::new()
            .with_delimiter(Delimiter::Semicolon)
            .with_header(false)
            .with_skip_rows(2)
            .with_max_rows(Some(100));
        let opts = configured.options();

        assert_eq!(opts.delimiter, Delimiter::Semicolon);
        assert!(!opts.has_header);
        assert_eq!(opts.skip_rows, 2);
        assert_eq!(opts.max_rows, Some(100));
    }

    /// A header line plus two records yields a 2x2 frame.
    #[test]
    fn test_load_bytes_csv() {
        let loader = DataLoader::new();
        let frame = loader
            .load_bytes(b"name,value\nfoo,1\nbar,2", "test.csv")
            .unwrap();
        assert_eq!(frame.shape(), (2, 2));
    }

    /// JSON content under a `.json` name is rejected as an unsupported type.
    #[test]
    fn test_load_bytes_unsupported() {
        let outcome = DataLoader::new().load_bytes(b"{}", "test.json");
        assert!(matches!(outcome, Err(DataLoadError::UnsupportedFileType(_))));
    }

    /// The free `load_bytes` function parses CSV with default options.
    #[test]
    fn test_convenience_function() {
        let frame = load_bytes(b"x,y\n1,2", "data.csv").unwrap();
        assert_eq!(frame.shape(), (1, 2));
    }
}