// dataload 0.1.1 - Docs.rs

//! Delimiter detection and configuration for CSV parsing.

/// Supported delimiters for CSV files.
///
/// Every variant except [`Delimiter::Auto`] stands for exactly one byte,
/// which [`Delimiter::as_byte`] returns. `Auto` is the [`Default`] value.
///
/// Equality is derived, so it compares variants structurally: a hand-built
/// `Custom(b',')` is *not* equal to `Comma` even though both yield the same
/// byte. Use [`Delimiter::from_byte`] to obtain the named variant for a byte.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Delimiter {
    /// Automatically detect the delimiter from file content.
    ///
    /// Carries no byte of its own: [`Delimiter::as_byte`] returns `None`.
    #[default]
    Auto,
    /// Comma separator (`,`).
    Comma,
    /// Tab separator (`\t`).
    Tab,
    /// Semicolon separator (`;`).
    Semicolon,
    /// Pipe separator (`|`).
    Pipe,
    /// Custom single-byte delimiter.
    ///
    /// [`Delimiter::from_byte`] only produces this variant for bytes that do
    /// not match one of the named separators above.
    Custom(u8),
}

impl Delimiter {
    /// Yields the single byte this delimiter stands for.
    ///
    /// [`Delimiter::Auto`] has no fixed byte and produces `None`; every other
    /// variant produces `Some` holding its separator byte.
    #[must_use]
    pub const fn as_byte(self) -> Option<u8> {
        let byte = match self {
            // Nothing concrete to report until detection has run.
            Self::Auto => return None,
            Self::Custom(raw) => raw,
            Self::Pipe => b'|',
            Self::Semicolon => b';',
            Self::Tab => b'\t',
            Self::Comma => b',',
        };
        Some(byte)
    }

    /// Builds the delimiter that corresponds to the byte `b`.
    ///
    /// The four well-known separators map to their named variants; any other
    /// byte is wrapped in [`Delimiter::Custom`]. The result is never
    /// [`Delimiter::Auto`].
    #[must_use]
    pub const fn from_byte(b: u8) -> Self {
        if b == b',' {
            Self::Comma
        } else if b == b'\t' {
            Self::Tab
        } else if b == b';' {
            Self::Semicolon
        } else if b == b'|' {
            Self::Pipe
        } else {
            Self::Custom(b)
        }
    }
}

impl From<u8> for Delimiter {
    fn from(b: u8) -> Self {
        Self::from_byte(b)
    }
}

/// Converts a character into a [`Delimiter`].
///
/// ASCII characters are mapped through [`Delimiter::from_byte`]. A delimiter
/// is a single byte, so a non-ASCII character cannot be represented; such
/// input falls back to the comma delimiter.
impl From<char> for Delimiter {
    fn from(c: char) -> Self {
        if c.is_ascii() {
            // Lossless cast: an ASCII `char` always fits in one byte.
            Self::from_byte(c as u8)
        } else {
            // Fall back to the canonical `Comma` variant rather than
            // `Custom(b',')`: both carry the same byte, but they compare
            // unequal and `from_byte` never yields `Custom` for a comma.
            Self::Comma
        }
    }
}

/// Detects the most likely delimiter in CSV content.
///
/// The first 20 non-blank lines are sampled. For each candidate (`,`, tab,
/// `;`, `|`) the number of occurrences outside double quotes is counted per
/// line, and the candidate is scored by how consistent that count is:
///
/// * identical count on every sampled line: `count * 100`;
/// * counts differing by at most one (e.g. optional trailing field):
///   `min_count * 50`;
/// * otherwise: `min_count`.
///
/// The highest score wins. Ties go to the earlier candidate in the order
/// listed above, and a candidate must score above zero to be selected.
///
/// # Arguments
///
/// * `content` - The CSV content as a string slice.
///
/// # Returns
///
/// The detected delimiter as a byte, defaulting to comma (`b','`) when the
/// content has no non-blank lines or no candidate scores above zero.
#[must_use]
pub fn detect_delimiter(content: &str) -> u8 {
    const CANDIDATES: [u8; 4] = [b',', b'\t', b';', b'|'];
    const SAMPLE_LINES: usize = 20;
    const CONSISTENT_WEIGHT: usize = 100;
    const NEAR_CONSISTENT_WEIGHT: usize = 50;

    // Sample the first N non-blank lines.
    let lines: Vec<&str> = content
        .lines()
        .filter(|line| !line.trim().is_empty())
        .take(SAMPLE_LINES)
        .collect();

    if lines.is_empty() {
        return b',';
    }

    let mut best_delimiter = b',';
    let mut best_score = 0usize;

    for &candidate in &CANDIDATES {
        // Smallest and largest per-line count in a single pass. `lines` is
        // non-empty here, so the `usize::MAX` seed is always replaced.
        let (min_count, max_count) = lines
            .iter()
            .map(|line| count_unquoted_occurrences(line, candidate))
            .fold((usize::MAX, 0usize), |(lo, hi), n| (lo.min(n), hi.max(n)));

        // Skip candidates that never appear.
        if max_count == 0 {
            continue;
        }

        // Scores stay in `usize` and saturate instead of being cast to a
        // narrower type, so extremely long lines can neither truncate the
        // count nor overflow the multiplication.
        let score = if min_count == max_count {
            // Perfect consistency: every line has the same (non-zero) count.
            min_count.saturating_mul(CONSISTENT_WEIGHT)
        } else if max_count - min_count <= 1 {
            // Off-by-one is common in files with optional trailing fields.
            min_count.saturating_mul(NEAR_CONSISTENT_WEIGHT)
        } else {
            // Inconsistent, but still considered if it appears on every line.
            min_count
        };

        // Strict comparison keeps the earliest candidate on ties.
        if score > best_score {
            best_score = score;
            best_delimiter = candidate;
        }
    }

    best_delimiter
}

/// Counts occurrences of `delimiter` in `line` that are not inside a
/// double-quoted section.
///
/// A `"` toggles the quoted state unless it is escaped by a backslash. A
/// backslash escapes the quote only when it is not itself escaped, so in
/// `\\"` (escaped backslash followed by a quote) the quote still opens or
/// closes a quoted section. Doubled quotes (`""`) toggle twice and therefore
/// leave the quoted state unchanged.
///
/// The scan is byte-wise; bytes of multi-byte UTF-8 sequences are never
/// ASCII and so cannot be mistaken for a quote, backslash or ASCII delimiter.
fn count_unquoted_occurrences(line: &str, delimiter: u8) -> usize {
    let mut count = 0;
    let mut in_quotes = false;
    // True when the previous byte was a backslash that is not itself escaped.
    let mut escaped = false;

    for &byte in line.as_bytes() {
        if byte == b'"' && !escaped {
            in_quotes = !in_quotes;
        } else if byte == delimiter && !in_quotes {
            count += 1;
        }
        // Track escape parity: a backslash following an unescaped backslash
        // is consumed as the escaped character and escapes nothing itself.
        escaped = byte == b'\\' && !escaped;
    }

    count
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs detection on `content` and checks the reported delimiter byte.
    fn assert_detects(content: &str, expected: u8) {
        assert_eq!(detect_delimiter(content), expected);
    }

    #[test]
    fn test_detect_comma() {
        assert_detects("a,b,c\n1,2,3\n4,5,6", b',');
    }

    #[test]
    fn test_detect_tab() {
        assert_detects("a\tb\tc\n1\t2\t3\n4\t5\t6", b'\t');
    }

    #[test]
    fn test_detect_semicolon() {
        assert_detects("a;b;c\n1;2;3\n4;5;6", b';');
    }

    #[test]
    fn test_detect_pipe() {
        assert_detects("a|b|c\n1|2|3\n4|5|6", b'|');
    }

    #[test]
    fn test_quoted_fields_ignored() {
        // The commas embedded in the quoted description column must not
        // count towards the per-line totals.
        let content = r#"name,description,value
"John","Hello, world",100
"Jane","Goodbye, world",200"#;
        assert_detects(content, b',');
    }

    #[test]
    fn test_delimiter_from_byte() {
        // Each known byte maps to its named variant; anything else is Custom.
        let cases = [
            (b',', Delimiter::Comma),
            (b'\t', Delimiter::Tab),
            (b';', Delimiter::Semicolon),
            (b'|', Delimiter::Pipe),
            (b':', Delimiter::Custom(b':')),
        ];
        for &(byte, expected) in &cases {
            assert_eq!(Delimiter::from_byte(byte), expected);
        }
    }

    #[test]
    fn test_delimiter_as_byte() {
        // `Auto` has no byte; concrete variants report their separator.
        let cases = [
            (Delimiter::Auto, None),
            (Delimiter::Comma, Some(b',')),
            (Delimiter::Tab, Some(b'\t')),
        ];
        for &(delimiter, expected) in &cases {
            assert_eq!(delimiter.as_byte(), expected);
        }
    }
}