/// A single-byte field delimiter choice.
///
/// Every variant except [`Delimiter::Auto`] corresponds to exactly one byte,
/// retrievable through [`Delimiter::as_byte`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Delimiter {
    /// No fixed byte; [`Delimiter::as_byte`] returns `None`. This is the default.
    #[default]
    Auto,
    /// The `,` byte.
    Comma,
    /// The horizontal tab byte (`\t`).
    Tab,
    /// The `;` byte.
    Semicolon,
    /// The `|` byte.
    Pipe,
    /// Any other byte, stored verbatim.
    Custom(u8),
}

impl Delimiter {
    /// Returns the byte this delimiter stands for, or `None` for [`Delimiter::Auto`].
    #[must_use]
    pub const fn as_byte(self) -> Option<u8> {
        match self {
            Self::Auto => None,
            Self::Comma => Some(b','),
            Self::Tab => Some(b'\t'),
            Self::Semicolon => Some(b';'),
            Self::Pipe => Some(b'|'),
            Self::Custom(b) => Some(b),
        }
    }

    /// Maps a byte to its canonical variant.
    ///
    /// `,`, tab, `;` and `|` map to their named variants; every other byte is
    /// wrapped in [`Delimiter::Custom`]. This never returns [`Delimiter::Auto`].
    #[must_use]
    pub const fn from_byte(b: u8) -> Self {
        match b {
            b',' => Self::Comma,
            b'\t' => Self::Tab,
            b';' => Self::Semicolon,
            b'|' => Self::Pipe,
            other => Self::Custom(other),
        }
    }
}

impl From<u8> for Delimiter {
    /// Equivalent to [`Delimiter::from_byte`].
    fn from(b: u8) -> Self {
        Self::from_byte(b)
    }
}

impl From<char> for Delimiter {
    /// Converts an ASCII character through [`Delimiter::from_byte`].
    ///
    /// A non-ASCII character does not fit in a single byte, so it falls back
    /// to [`Delimiter::Comma`].
    fn from(c: char) -> Self {
        if c.is_ascii() {
            // Lossless: ASCII code points are always below 0x80.
            Self::from_byte(c as u8)
        } else {
            // Use the canonical variant so the fallback compares equal to
            // `Delimiter::Comma` (and to `from_byte(b',')`) rather than a
            // look-alike `Custom(b',')`.
            Self::Comma
        }
    }
}
/// Guesses the field delimiter of `content` from its leading lines.
///
/// Up to 20 non-blank lines are sampled. For each candidate byte (`,`, tab,
/// `;`, `|`, tried in that order) the per-line occurrence count reported by
/// `count_unquoted_occurrences` is reduced to a minimum and a maximum, and the
/// candidate is scored:
///
/// * the same non-zero count on every sampled line: `count * 100`;
/// * counts that differ by at most one: `min * 50`;
/// * anything else: `min`.
///
/// The candidate with the strictly highest score wins, so ties go to the
/// earlier candidate. Returns `b','` when there are no non-blank lines or no
/// candidate scores above zero.
#[must_use]
pub fn detect_delimiter(content: &str) -> u8 {
    const CANDIDATES: [u8; 4] = [b',', b'\t', b';', b'|'];
    const SAMPLE_LINES: usize = 20;

    let lines: Vec<&str> = content
        .lines()
        .filter(|line| !line.trim().is_empty())
        .take(SAMPLE_LINES)
        .collect();
    if lines.is_empty() {
        return b',';
    }

    let mut best_delimiter = b',';
    // Scores stay in `usize` and use saturating arithmetic so that very long
    // lines can neither truncate the count nor overflow the multiplication.
    let mut best_score = 0usize;

    for &candidate in &CANDIDATES {
        // `lines` is non-empty here, so the fold always sees at least one
        // count and `min_count` never stays at its `usize::MAX` seed.
        let (min_count, max_count) = lines
            .iter()
            .map(|line| count_unquoted_occurrences(line, candidate))
            .fold((usize::MAX, 0usize), |(lo, hi), n| (lo.min(n), hi.max(n)));

        // The candidate does not occur on any sampled line.
        if max_count == 0 {
            continue;
        }

        let score = if min_count == max_count {
            min_count.saturating_mul(100)
        } else if max_count - min_count <= 1 {
            min_count.saturating_mul(50)
        } else {
            min_count
        };

        if score > best_score {
            best_score = score;
            best_delimiter = candidate;
        }
    }

    best_delimiter
}
/// Counts the occurrences of `delimiter` in `line` that lie outside
/// double-quoted sections.
///
/// An unescaped `"` toggles the quoted state, so a doubled quote (`""`) inside
/// a quoted field toggles twice and leaves the state unchanged. A backslash
/// escapes the following byte only when it is not itself escaped: `\"` is an
/// escaped quote, whereas `\\"` is a literal backslash followed by a real
/// quote.
///
/// The scan is byte-wise over the UTF-8 encoding of `line`.
fn count_unquoted_occurrences(line: &str, delimiter: u8) -> usize {
    let mut count = 0;
    let mut in_quotes = false;
    // True when the previous byte was a backslash that was not itself escaped.
    let mut escaped = false;
    for &byte in line.as_bytes() {
        if byte == b'"' && !escaped {
            in_quotes = !in_quotes;
        } else if byte == delimiter && !in_quotes {
            count += 1;
        }
        // Track escape parity: a backslash preceded by an escaping backslash
        // is a literal and must not escape whatever follows it.
        escaped = byte == b'\\' && !escaped;
    }
    count
}
#[cfg(test)]
mod tests {
    use super::*;

    // --- detect_delimiter: one uniform three-row sample per candidate ---

    #[test]
    fn test_detect_comma() {
        assert_eq!(detect_delimiter("a,b,c\n1,2,3\n4,5,6"), b',');
    }

    #[test]
    fn test_detect_tab() {
        assert_eq!(detect_delimiter("a\tb\tc\n1\t2\t3\n4\t5\t6"), b'\t');
    }

    #[test]
    fn test_detect_semicolon() {
        assert_eq!(detect_delimiter("a;b;c\n1;2;3\n4;5;6"), b';');
    }

    #[test]
    fn test_detect_pipe() {
        assert_eq!(detect_delimiter("a|b|c\n1|2|3\n4|5|6"), b'|');
    }

    /// Commas inside quoted fields must not disturb the per-line counts.
    #[test]
    fn test_quoted_fields_ignored() {
        // The raw literal's continuation lines start at column 0 on purpose:
        // any indentation would become part of the sample text.
        let detected = detect_delimiter(
            r#"name,description,value
"John","Hello, world",100
"Jane","Goodbye, world",200"#,
        );
        assert_eq!(detected, b',');
    }

    // --- Delimiter conversions ---

    /// Well-known bytes map to named variants; anything else becomes `Custom`.
    #[test]
    fn test_delimiter_from_byte() {
        let expectations = [
            (b',', Delimiter::Comma),
            (b'\t', Delimiter::Tab),
            (b';', Delimiter::Semicolon),
            (b'|', Delimiter::Pipe),
            (b':', Delimiter::Custom(b':')),
        ];
        for &(byte, variant) in &expectations {
            assert_eq!(Delimiter::from_byte(byte), variant);
        }
    }

    /// `Auto` has no byte; named variants return their byte.
    #[test]
    fn test_delimiter_as_byte() {
        assert_eq!(Delimiter::Auto.as_byte(), None);
        assert_eq!(Delimiter::Comma.as_byte(), Some(b','));
        assert_eq!(Delimiter::Tab.as_byte(), Some(b'\t'));
    }
}