// dataload 0.1.1
//
// A flexible data loading library for CSV and Excel files with automatic delimiter detection.
// Documentation
//! File type detection based on magic bytes and file extensions.

/// The kinds of file that data loading supports.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FileType {
    /// Delimited text such as CSV.
    Csv,
    /// Spreadsheet workbooks (xlsx, xls, xlsm, xlsb, ods).
    Excel,
}

impl FileType {
    /// Gives the display label for this file type: `"CSV"` or `"Excel"`.
    #[must_use]
    pub const fn name(&self) -> &'static str {
        match *self {
            FileType::Csv => "CSV",
            FileType::Excel => "Excel",
        }
    }
}

/// Leading byte signatures used to recognise file formats.
mod magic {
    /// Signature at the start of ZIP files (used by xlsx, xlsm, xlsb, ods):
    /// the ASCII letters `PK` followed by the bytes `0x03 0x04`.
    pub const ZIP: &[u8] = b"PK\x03\x04";

    /// Eight-byte signature at the start of OLE2 compound documents (used by xls).
    pub const OLE2: &[u8] = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1";
}

/// Determines the [`FileType`] of a file from its bytes and its name.
///
/// The leading bytes of `content` are examined first, because a recognised
/// signature is more reliable than a name. The extension of `filename` is
/// consulted only when no signature is recognised.
///
/// # Arguments
///
/// * `content` - The raw bytes of the file.
/// * `filename` - The file's name; its extension is the fallback signal.
///
/// # Returns
///
/// `Some(FileType)` when either check recognises the file, otherwise `None`.
#[must_use]
pub fn detect_file_type(content: &[u8], filename: &str) -> Option<FileType> {
    // `or_else` is lazy: the extension lookup runs only if the signature
    // lookup produced `None`.
    detect_from_magic_bytes(content).or_else(|| detect_from_extension(filename))
}

/// Recognises a file type from the signature at the start of `content`.
///
/// Returns `Some(FileType::Excel)` when `content` begins with either the ZIP
/// signature (xlsx, xlsm, xlsb, ods) or the OLE2 signature (xls), and `None`
/// otherwise. Only the leading bytes are compared, so anything that starts
/// with the ZIP signature is reported as Excel.
fn detect_from_magic_bytes(content: &[u8]) -> Option<FileType> {
    // `starts_with` is already false for content shorter than a signature,
    // so inputs that are too short simply fall through to `None`.
    let has_spreadsheet_signature = [magic::ZIP, magic::OLE2]
        .iter()
        .any(|&signature| content.starts_with(signature));

    if has_spreadsheet_signature {
        Some(FileType::Excel)
    } else {
        None
    }
}

/// Detects file type from file extension.
///
/// The extension is the text after the last `.` in `filename`, compared
/// ASCII case-insensitively. A name that contains no `.` has no extension and
/// yields `None`, so a bare file called `csv` or `xlsx` is not mistaken for a
/// supported type.
///
/// Returns `Some(FileType::Csv)` for `csv`, `tsv`, `txt`, `dat` and `tab`,
/// `Some(FileType::Excel)` for `xlsx`, `xls`, `xlsm`, `xlsb` and `ods`, and
/// `None` for anything else.
fn detect_from_extension(filename: &str) -> Option<FileType> {
    // `rsplit_once` gives `None` when there is no dot. Plain `rsplit` would
    // hand back the whole name as though it were the extension.
    let (_, extension) = filename.rsplit_once('.')?;

    match extension.to_ascii_lowercase().as_str() {
        // CSV and delimited text files
        "csv" | "tsv" | "txt" | "dat" | "tab" => Some(FileType::Csv),

        // Excel and spreadsheet files
        "xlsx" | "xls" | "xlsm" | "xlsb" | "ods" => Some(FileType::Excel),

        _ => None,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// ZIP signature followed by two padding bytes.
    const ZIP_PREFIXED: [u8; 6] = [0x50, 0x4B, 0x03, 0x04, 0x00, 0x00];

    /// Asserts that `content` paired with each of `filenames` is detected as `expected`.
    fn assert_detected(content: &[u8], filenames: &[&str], expected: Option<FileType>) {
        for &filename in filenames {
            assert_eq!(
                detect_file_type(content, filename),
                expected,
                "filename: {}",
                filename
            );
        }
    }

    #[test]
    fn test_detect_xlsx_from_magic() {
        // The name carries no usable extension, so only the ZIP signature can match.
        assert_detected(&ZIP_PREFIXED, &["unknown"], Some(FileType::Excel));
    }

    #[test]
    fn test_detect_xls_from_magic() {
        // Exactly the eight OLE2 signature bytes, nothing more.
        let ole2_only = [0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1];
        assert_detected(&ole2_only, &["unknown"], Some(FileType::Excel));
    }

    #[test]
    fn test_detect_csv_from_extension() {
        assert_detected(
            b"a,b,c\n1,2,3",
            &["data.csv", "data.CSV", "data.tsv", "data.txt"],
            Some(FileType::Csv),
        );
    }

    #[test]
    fn test_detect_excel_from_extension() {
        // The content has no known signature, so the extension decides.
        assert_detected(
            b"not actually excel content",
            &["data.xlsx", "data.xls", "data.xlsm", "data.ods"],
            Some(FileType::Excel),
        );
    }

    #[test]
    fn test_unsupported_extension() {
        assert_detected(
            b"some content",
            &["data.json", "data.xml", "data.parquet"],
            None,
        );
    }

    #[test]
    fn test_magic_bytes_override_extension() {
        // A ZIP signature under a .csv name: the signature wins.
        assert_detected(&ZIP_PREFIXED, &["data.csv"], Some(FileType::Excel));
    }
}