use std::borrow::Cow;
use std::fs::File;
use std::io::{Read, Seek};
use std::path::Path;
use crate::encoding::{detect_and_transcode, detect_encoding, skip_bom};
use crate::error::{Result, SnifferError};
use crate::field_type::Type;
use crate::metadata::{Dialect, Header, Metadata, Quote};
use crate::sample::{DatePreference, SampleSize};
use crate::tum::potential_dialects::{
PotentialDialect, detect_line_terminator, generate_dialects_with_terminator,
};
use crate::tum::score::{DialectScore, find_best_dialect, score_all_dialects_with_best_table};
use crate::tum::table::{Table, parse_table};
use crate::tum::type_detection::infer_column_types;
/// Upper bound (100 MiB) on the number of bytes read from a reader when the
/// sample size is expressed as a record count.
const MAX_RECORDS_BYTES: usize = 100 * 1024 * 1024;
/// Configurable CSV sniffer.
///
/// Holds the sampling configuration and any caller-forced dialect settings;
/// the `sniff_*` methods use it to produce a `Metadata` description of the input.
#[derive(Debug, Clone)]
pub struct Sniffer {
// How much of the input `read_sample` pulls from a reader before sniffing.
sample_size: SampleSize,
// Date-format preference supplied by the caller.
// NOTE(review): stored by the builder but not read by any code visible in this file.
date_preference: DatePreference,
// Delimiter forced by the caller, if any; `None` means candidates are generated.
forced_delimiter: Option<u8>,
// Quote setting forced by the caller, if any (see `sniff_bytes` for how it is applied).
forced_quote: Option<Quote>,
}
impl Default for Sniffer {
fn default() -> Self {
Self::new()
}
}
impl Sniffer {
/// Creates a sniffer with the stock configuration: a 100-record sample,
/// month-day-year date preference, and no forced delimiter or quote.
pub const fn new() -> Self {
    Self {
        forced_quote: None,
        forced_delimiter: None,
        date_preference: DatePreference::MdyFormat,
        sample_size: SampleSize::Records(100),
    }
}
/// Sets how much of the input is sampled when sniffing from a reader.
///
/// Returns `self` so calls can be chained.
pub fn sample_size(&mut self, sample_size: SampleSize) -> &mut Self {
self.sample_size = sample_size;
self
}
/// Stores the caller's date-format preference.
///
/// NOTE(review): the stored value is not read by any code visible in this file.
///
/// Returns `self` so calls can be chained.
pub fn date_preference(&mut self, date_preference: DatePreference) -> &mut Self {
self.date_preference = date_preference;
self
}
/// Forces the field delimiter, so `sniff_bytes` builds its candidate dialects
/// around this byte instead of generating the full candidate set.
///
/// Returns `self` so calls can be chained.
pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
self.forced_delimiter = Some(delimiter);
self
}
/// Records a quote setting to force during detection; `sniff_bytes` decides
/// how it is combined with the other forced settings.
///
/// Returns `self` so calls can be chained.
pub fn quote(&mut self, quote: Quote) -> &mut Self {
self.forced_quote = Some(quote);
self
}
/// Opens the file at `path` and sniffs it through a buffered reader.
///
/// # Errors
///
/// Fails if the file cannot be opened, and otherwise forwards whatever
/// `sniff_reader` returns.
pub fn sniff_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Metadata> {
    let buffered = File::open(path.as_ref()).map(std::io::BufReader::new)?;
    self.sniff_reader(buffered)
}
/// Reads a sample from `reader` according to the configured sample size and
/// sniffs that sample.
///
/// # Errors
///
/// Returns `SnifferError::EmptyData` when nothing could be read, and otherwise
/// forwards errors from reading the sample or from `sniff_bytes`.
pub fn sniff_reader<R: Read + Seek>(&mut self, reader: R) -> Result<Metadata> {
    let sample = self.read_sample(reader)?;
    match sample.as_slice() {
        [] => Err(SnifferError::EmptyData),
        bytes => self.sniff_bytes(bytes),
    }
}
/// Sniffs the dialect, header, and column types of an in-memory sample.
///
/// Steps, in order: transcode the input if needed, strip a byte-order mark,
/// skip leading `#` comment lines, detect the line terminator, build the
/// candidate dialects, score them, pick the best one, detect any structural
/// preamble rows, and finally assemble the `Metadata`.
///
/// Forced settings are honoured independently of each other:
/// * delimiter and quote forced: exactly one candidate dialect is tried;
/// * only the delimiter forced: it is tried with `"`, `'` and no quoting;
/// * only the quote forced: every generated candidate delimiter is tried with
///   that quote;
/// * nothing forced: the full generated candidate set is tried.
///
/// # Errors
///
/// * `SnifferError::EmptyData` if `data` is empty or the parsed table is empty.
/// * `SnifferError::NoDialectDetected` if scoring yields no best dialect.
pub fn sniff_bytes(&self, data: &[u8]) -> Result<Metadata> {
    if data.is_empty() {
        return Err(SnifferError::EmptyData);
    }

    let (transcoded_data, was_transcoded) = detect_and_transcode(data);
    let data = &transcoded_data[..];
    let encoding_info = detect_encoding(data);
    // Data that needed no transcoding is treated as UTF-8 already.
    let is_utf8 = !was_transcoded || encoding_info.is_utf8;

    let data = skip_bom(data);
    let (comment_preamble_rows, data) = skip_preamble(data);

    let line_terminator = detect_line_terminator(data);
    let dialects = match (self.forced_delimiter, self.forced_quote) {
        (None, None) => generate_dialects_with_terminator(line_terminator),
        (None, Some(quote)) => {
            // Previously a forced quote was ignored unless a delimiter was
            // forced too. Keep each generated delimiter once (in generation
            // order) and pair it with the forced quote.
            let mut delimiters: Vec<u8> = Vec::new();
            for candidate in generate_dialects_with_terminator(line_terminator) {
                if !delimiters.contains(&candidate.delimiter) {
                    delimiters.push(candidate.delimiter);
                }
            }
            delimiters
                .into_iter()
                .map(|delim| PotentialDialect::new(delim, quote, line_terminator))
                .collect()
        }
        (Some(delim), forced_quote) => {
            let quotes = forced_quote.map_or_else(
                || vec![Quote::Some(b'"'), Quote::Some(b'\''), Quote::None],
                |q| vec![q],
            );
            quotes
                .into_iter()
                .map(|q| PotentialDialect::new(delim, q, line_terminator))
                .collect()
        }
    };

    // Byte-based and whole-input sampling pass 0 as the row limit.
    // NOTE(review): presumably 0 means "no row limit" to the scoring and
    // parsing helpers; confirm against their definitions.
    let max_rows = match self.sample_size {
        SampleSize::Records(n) => n,
        SampleSize::Bytes(_) | SampleSize::All => 0,
    };

    let (scores, best_table) = score_all_dialects_with_best_table(data, &dialects, max_rows);
    let best = find_best_dialect(&scores)
        .ok_or_else(|| SnifferError::NoDialectDetected("No valid dialect found".to_string()))?;

    // Reuse the table produced during scoring when it belongs to the winning
    // dialect; otherwise parse the data again with the winner.
    let table_for_preamble = match best_table {
        Some((dialect, table)) if dialect == best.dialect => table,
        _ => parse_table(data, &best.dialect, max_rows),
    };

    let structural_preamble = detect_structural_preamble(&table_for_preamble);
    let total_preamble_rows = comment_preamble_rows + structural_preamble;

    self.build_metadata(
        best,
        is_utf8,
        structural_preamble,
        total_preamble_rows,
        &table_for_preamble,
        data,
    )
}
/// Reads the portion of `reader` that the configured sample size asks for.
///
/// * `SampleSize::Bytes(n)`: at most `n` bytes.
/// * `SampleSize::All`: at most 1 GiB; a warning is printed to stderr when the
///   input is longer than that.
/// * `SampleSize::Records(n)`: an initial estimate of 1 KiB per record
///   (clamped to 8 KiB..=`MAX_RECORDS_BYTES`); if that chunk was filled but
///   holds fewer than `n` newlines, up to 2 KiB more per missing record is
///   read, never exceeding `MAX_RECORDS_BYTES` in total. A warning is printed
///   to stderr when the cap is reached and the input continues.
///
/// The buffer grows with the data actually read, so a large requested sample
/// costs no more memory than the input provides.
///
/// # Errors
///
/// Propagates I/O errors from the reader (interrupted reads are retried).
fn read_sample<R: Read + Seek>(&self, mut reader: R) -> Result<Vec<u8>> {
    /// Appends at most `limit` bytes from `reader` to `buffer`, stopping early
    /// at end of input. Returns the number of bytes appended.
    fn read_up_to<R: Read>(
        reader: &mut R,
        limit: usize,
        buffer: &mut Vec<u8>,
    ) -> std::io::Result<usize> {
        // usize is never wider than 64 bits, so this cast cannot truncate.
        reader.by_ref().take(limit as u64).read_to_end(buffer)
    }

    /// Reports whether `reader` still has data, consuming at most one byte.
    /// Interrupted reads are retried rather than surfaced as errors.
    fn has_more<R: Read>(reader: &mut R) -> std::io::Result<bool> {
        let mut probe = [0u8; 1];
        loop {
            match reader.read(&mut probe) {
                Ok(n) => return Ok(n > 0),
                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                Err(e) => return Err(e),
            }
        }
    }

    match self.sample_size {
        SampleSize::Bytes(n) => {
            let mut buffer = Vec::new();
            read_up_to(&mut reader, n, &mut buffer)?;
            Ok(buffer)
        }
        SampleSize::All => {
            const MAX_BYTES: u64 = 1024 * 1024 * 1024;
            let mut buffer = Vec::new();
            (&mut reader).take(MAX_BYTES).read_to_end(&mut buffer)?;
            // Only warn when the cap was hit AND more input actually follows.
            if buffer.len() as u64 == MAX_BYTES && has_more(&mut reader)? {
                eprintln!(
                    "warning: input exceeds 1 GB; sniffing on truncated sample — results may be inaccurate"
                );
            }
            Ok(buffer)
        }
        SampleSize::Records(n) => {
            let estimated_size = n.saturating_mul(1024).clamp(8192, MAX_RECORDS_BYTES);
            let mut buffer = Vec::new();
            read_up_to(&mut reader, estimated_size, &mut buffer)?;

            // A completely filled first chunk means the input may continue;
            // top up if it does not yet contain enough line breaks.
            if buffer.len() == estimated_size {
                let newlines = bytecount::count(&buffer, b'\n');
                if newlines < n {
                    let remaining = MAX_RECORDS_BYTES.saturating_sub(buffer.len());
                    let additional = (n - newlines).saturating_mul(2048).min(remaining);
                    read_up_to(&mut reader, additional, &mut buffer)?;
                }
            }

            if buffer.len() >= MAX_RECORDS_BYTES && has_more(&mut reader)? {
                eprintln!(
                    "warning: Records sample capped at 100 MB; \
                     sniff result may be approximate for very large inputs"
                );
            }
            Ok(buffer)
        }
    }
}
/// Assembles the final `Metadata` from the winning dialect score and its table.
///
/// * `score`: the winning dialect and its statistics.
/// * `is_utf8`: encoding flag copied into the resulting dialect.
/// * `structural_preamble`: number of leading rows of `table` to drop before
///   header detection.
/// * `total_preamble_rows`: preamble row count recorded in the header
///   (comment rows plus structural rows, as computed by the caller).
/// * `table`: the parsed table for the winning dialect.
/// * `data`: the bytes the table was parsed from, used for the average record
///   length.
///
/// Field names come from the first effective row when a header is detected,
/// otherwise they are synthesised as `field_1`, `field_2`, ….
///
/// # Errors
///
/// Returns `SnifferError::EmptyData` when `table` is empty.
fn build_metadata(
    &self,
    score: &DialectScore,
    is_utf8: bool,
    structural_preamble: usize,
    total_preamble_rows: usize,
    table: &Table,
    data: &[u8],
) -> Result<Metadata> {
    /// Copies the rows and field counts of `source` from index `start` onward
    /// into a fresh table and recomputes its modal field count.
    fn tail_from(source: &Table, start: usize) -> Table {
        let mut tail = Table::new();
        tail.rows = source.rows[start..].to_vec();
        tail.field_counts = source.field_counts[start..].to_vec();
        tail.update_modal_field_count();
        tail
    }

    if table.is_empty() {
        return Err(SnifferError::EmptyData);
    }

    // Drop structural preamble rows, but only if something would remain.
    let effective_table: Cow<'_, Table> =
        if structural_preamble > 0 && table.rows.len() > structural_preamble {
            Cow::Owned(tail_from(table, structural_preamble))
        } else {
            Cow::Borrowed(table)
        };

    let header = detect_header(&effective_table, total_preamble_rows);

    let fields = if header.has_header_row && !effective_table.rows.is_empty() {
        effective_table.rows[0].clone()
    } else {
        (0..score.num_fields)
            .map(|i| format!("field_{}", i + 1))
            .collect()
    };

    // Type inference must not see the header row. When there is no header to
    // strip, keep borrowing the table instead of cloning it: only a reference
    // is needed below.
    let data_table: Cow<'_, Table> = if header.has_header_row && effective_table.rows.len() > 1 {
        Cow::Owned(tail_from(&effective_table, 1))
    } else {
        effective_table
    };
    let types = infer_column_types(&*data_table);

    let dialect = Dialect {
        delimiter: score.dialect.delimiter,
        header,
        quote: score.dialect.quote,
        flexible: !score.is_uniform,
        is_utf8,
    };

    // The average is taken over all parsed rows, preamble rows included.
    let avg_record_len = calculate_avg_record_len(data, table.num_rows());

    Ok(Metadata {
        dialect,
        avg_record_len,
        num_fields: score.num_fields,
        fields,
        types,
    })
}
}
/// Decides heuristically whether the first row of `table` is a header row.
///
/// Four weighted signals are evaluated on the first two rows:
///
/// | signal                                                   | weight |
/// |----------------------------------------------------------|--------|
/// | first row has more text cells than the second row        | 1.0    |
/// | first row has more text cells than numeric cells         | 0.5    |
/// | all cells of the first row are distinct                  | 0.5    |
/// | mean cell length of row one is <= that of row two        | 0.3    |
///
/// A header is reported when the summed weight divided by the number of
/// signals exceeds 0.4. Tables with fewer than two rows never have a header.
/// `preamble_rows` is passed through unchanged into the returned `Header`.
fn detect_header(table: &crate::tum::table::Table, preamble_rows: usize) -> Header {
    use crate::tum::type_detection::detect_cell_type;

    if table.rows.len() < 2 {
        return Header::new(false, preamble_rows);
    }
    let (candidate, next) = (&table.rows[0], &table.rows[1]);

    // Classify every cell of the candidate row exactly once.
    let mut candidate_text = 0usize;
    let mut candidate_numeric = 0usize;
    for cell in candidate.iter() {
        let kind = detect_cell_type(cell);
        if kind == Type::Text {
            candidate_text += 1;
        }
        if kind.is_numeric() {
            candidate_numeric += 1;
        }
    }

    let next_text = next
        .iter()
        .map(|cell| detect_cell_type(cell))
        .filter(|kind| *kind == Type::Text)
        .count();

    // `insert` returns false on the first duplicate, so `all` is true only
    // when every cell is distinct.
    let mut seen = std::collections::HashSet::new();
    let all_unique = candidate.iter().all(|cell| seen.insert(cell.as_str()));

    // Mean cell length in bytes; `max(1)` guards against an empty row.
    let candidate_bytes: usize = candidate.iter().map(|cell| cell.len()).sum();
    let next_bytes: usize = next.iter().map(|cell| cell.len()).sum();
    let candidate_mean = candidate_bytes as f64 / candidate.len().max(1) as f64;
    let next_mean = next_bytes as f64 / next.len().max(1) as f64;

    let signals = [
        (candidate_text > next_text, 1.0),
        (candidate_text > candidate_numeric, 0.5),
        (all_unique, 0.5),
        (candidate_mean <= next_mean, 0.3),
    ];

    let mut score = 0.0_f64;
    for &(fired, weight) in signals.iter() {
        if fired {
            score += weight;
        }
    }

    let has_header = score / signals.len() as f64 > 0.4;
    Header::new(has_header, preamble_rows)
}
/// Estimates the average record length in bytes.
///
/// The span measured runs from the start of `data` up to and including the
/// `num_rows`-th `\n`; when `data` contains fewer line feeds than that, the
/// whole buffer is used. The span length is divided (integer division) by
/// `num_rows`. Returns 0 when `data` is empty or `num_rows` is 0.
fn calculate_avg_record_len(data: &[u8], num_rows: usize) -> usize {
    if data.is_empty() || num_rows == 0 {
        return 0;
    }

    // `num_rows >= 1` here, so `num_rows - 1` cannot underflow.
    let span = data
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'\n')
        .nth(num_rows - 1)
        .map_or(data.len(), |(idx, _)| idx + 1);

    span / num_rows
}
/// Skips leading comment lines and reports how many were skipped.
///
/// A comment line is one whose first byte after optional spaces/tabs is `#`.
/// It extends to the next `\n` or `\r`; a following `\r`, then a following
/// `\n`, are consumed with it. Scanning stops at the first line that is not a
/// comment, and that line is returned untouched (its leading whitespace is
/// kept).
///
/// Returns `(number_of_comment_lines, remaining_data)`.
fn skip_preamble(data: &[u8]) -> (usize, &[u8]) {
    let mut rest = data;
    let mut comment_lines = 0;

    loop {
        let indent = rest
            .iter()
            .take_while(|&&b| b == b' ' || b == b'\t')
            .count();
        let body = &rest[indent..];
        if body.first() != Some(&b'#') {
            break;
        }

        let eol = body
            .iter()
            .position(|&b| b == b'\n' || b == b'\r')
            .unwrap_or(body.len());
        let mut after = &body[eol..];
        if let [b'\r', tail @ ..] = after {
            after = tail;
        }
        if let [b'\n', tail @ ..] = after {
            after = tail;
        }

        comment_lines += 1;
        rest = after;
    }

    (comment_lines, rest)
}
/// Counts the leading rows of `table` that look like non-tabular preamble.
///
/// The result is the index of the first row whose field count equals the
/// table's modal field count and from which at least 80% of the remaining
/// rows (that row included) also have the modal field count. Returns 0 when
/// the table has fewer than three rows or no such row exists.
fn detect_structural_preamble(table: &crate::tum::table::Table) -> usize {
    let counts = &table.field_counts;
    let total = counts.len();
    if total < 3 {
        return 0;
    }

    let modal = table.modal_field_count();

    // Number of modal-width rows at or after the current position; starts as
    // the count over the whole table and shrinks as modal rows are passed.
    let mut modal_remaining = counts.iter().filter(|&&c| c == modal).count();

    for (idx, &field_count) in counts.iter().enumerate() {
        if field_count != modal {
            continue;
        }
        let ratio = modal_remaining as f64 / (total - idx) as f64;
        if ratio >= 0.8 {
            return idx;
        }
        modal_remaining -= 1;
    }

    0
}
// Unit tests for the sniffer: builder configuration, dialect detection on
// small in-memory samples, preamble handling, average record length, and the
// record-sampling size cap.
#[cfg(test)]
mod tests {
use super::*;
// The builder methods store the values they are given and can be chained.
#[test]
fn test_sniffer_builder() {
let mut sniffer = Sniffer::new();
sniffer
.sample_size(SampleSize::Records(50))
.date_preference(DatePreference::DmyFormat)
.delimiter(b',');
assert_eq!(sniffer.sample_size, SampleSize::Records(50));
assert_eq!(sniffer.date_preference, DatePreference::DmyFormat);
assert_eq!(sniffer.forced_delimiter, Some(b','));
}
// Comma-separated data with a header row: delimiter, header flag, field
// count and field names are all detected.
#[test]
fn test_sniff_bytes() {
let data = b"name,age,city\nAlice,30,NYC\nBob,25,LA\n";
let sniffer = Sniffer::new();
let metadata = sniffer.sniff_bytes(data).unwrap();
assert_eq!(metadata.dialect.delimiter, b',');
assert!(metadata.dialect.header.has_header_row);
assert_eq!(metadata.num_fields, 3);
assert_eq!(metadata.fields, vec!["name", "age", "city"]);
}
// Tab-separated data with a header row.
#[test]
fn test_sniff_tsv() {
let data = b"name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA\n";
let sniffer = Sniffer::new();
let metadata = sniffer.sniff_bytes(data).unwrap();
assert_eq!(metadata.dialect.delimiter, b'\t');
assert!(metadata.dialect.header.has_header_row);
}
// Semicolon-separated data.
#[test]
fn test_sniff_semicolon() {
let data = b"name;age;city\nAlice;30;NYC\nBob;25;LA\n";
let sniffer = Sniffer::new();
let metadata = sniffer.sniff_bytes(data).unwrap();
assert_eq!(metadata.dialect.delimiter, b';');
}
// Purely numeric rows must not be mistaken for a header.
#[test]
fn test_sniff_no_header() {
let data = b"1,2,3\n4,5,6\n7,8,9\n";
let sniffer = Sniffer::new();
let metadata = sniffer.sniff_bytes(data).unwrap();
assert_eq!(metadata.dialect.delimiter, b',');
assert!(!metadata.dialect.header.has_header_row);
}
// Double-quoted fields, one of which contains the delimiter.
#[test]
fn test_sniff_with_quotes() {
let data = b"\"name\",\"value\"\n\"hello, world\",123\n\"test\",456\n";
let sniffer = Sniffer::new();
let metadata = sniffer.sniff_bytes(data).unwrap();
assert_eq!(metadata.dialect.delimiter, b',');
assert_eq!(metadata.dialect.quote, Quote::Some(b'"'));
}
// Empty input is rejected with an error.
#[test]
fn test_sniff_empty() {
let data = b"";
let sniffer = Sniffer::new();
let result = sniffer.sniff_bytes(data);
assert!(result.is_err());
}
// `skip_preamble` on: two comment lines, no comment lines, and an indented
// comment line.
#[test]
fn test_skip_preamble() {
let data = b"# This is a comment\n# Another comment\nname,age\nAlice,30\n";
let (preamble_rows, remaining) = skip_preamble(data);
assert_eq!(preamble_rows, 2);
assert_eq!(remaining, b"name,age\nAlice,30\n");
let data = b"name,age\nAlice,30\n";
let (preamble_rows, remaining) = skip_preamble(data);
assert_eq!(preamble_rows, 0);
assert_eq!(remaining, b"name,age\nAlice,30\n");
let data = b"  # Indented comment\nname,age\n";
let (preamble_rows, remaining) = skip_preamble(data);
assert_eq!(preamble_rows, 1);
assert_eq!(remaining, b"name,age\n");
}
// Comment lines ahead of the data do not disturb dialect or header detection.
#[test]
fn test_sniff_with_preamble() {
let data = b"# LimeSurvey export\n# Generated 2024-01-01\nname,age,city\nAlice,30,NYC\nBob,25,LA\n";
let sniffer = Sniffer::new();
let metadata = sniffer.sniff_bytes(data).unwrap();
assert_eq!(metadata.dialect.delimiter, b',');
assert!(metadata.dialect.header.has_header_row);
assert_eq!(metadata.num_fields, 3);
}
// The number of skipped comment lines is reported in the header metadata.
#[test]
fn test_comment_preamble_propagated() {
let data = b"# Comment 1\n# Comment 2\nname,age\nAlice,30\nBob,25\n";
let metadata = Sniffer::new().sniff_bytes(data).unwrap();
assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
assert!(metadata.dialect.header.has_header_row);
assert_eq!(metadata.fields, vec!["name", "age"]);
}
// Two leading rows with fewer fields than the rest are counted as
// structural preamble; the header is the first full-width row.
#[test]
fn test_structural_preamble_detection() {
let data = b"TITLE\nSUB,TITLE\nA,B,C,D,E\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n";
let metadata = Sniffer::new().sniff_bytes(data).unwrap();
assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
assert!(metadata.dialect.header.has_header_row);
assert_eq!(metadata.fields, vec!["A", "B", "C", "D", "E"]);
}
// One comment line plus one structural row add up to two preamble rows.
#[test]
fn test_mixed_preamble_detection() {
let data =
b"# File header\nMETADATA\nname,age,city\nAlice,30,NYC\nBob,25,LA\nCharlie,35,CHI\n";
let metadata = Sniffer::new().sniff_bytes(data).unwrap();
assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
assert!(metadata.dialect.header.has_header_row);
assert_eq!(metadata.fields, vec!["name", "age", "city"]);
}
// Data without any preamble reports zero preamble rows.
#[test]
fn test_no_preamble() {
let data = b"a,b,c\n1,2,3\n4,5,6\n";
let metadata = Sniffer::new().sniff_bytes(data).unwrap();
assert_eq!(metadata.dialect.header.num_preamble_rows, 0);
}
// Direct checks of `detect_structural_preamble` on hand-built tables:
// two narrow leading rows, a uniform two-row table, and a single-row table.
#[test]
fn test_detect_structural_preamble_function() {
use crate::tum::table::Table;
let mut table = Table::new();
table.rows = vec![
vec!["TITLE".to_string()],
vec!["".to_string(), "".to_string()],
vec!["A".to_string(), "B".to_string(), "C".to_string()],
vec!["1".to_string(), "2".to_string(), "3".to_string()],
vec!["4".to_string(), "5".to_string(), "6".to_string()],
];
table.field_counts = vec![1, 2, 3, 3, 3];
table.update_modal_field_count();
assert_eq!(detect_structural_preamble(&table), 2);
// Fewer than three rows: the function returns 0 without scanning.
let mut table = Table::new();
table.rows = vec![
vec!["A".to_string(), "B".to_string(), "C".to_string()],
vec!["1".to_string(), "2".to_string(), "3".to_string()],
];
table.field_counts = vec![3, 3];
table.update_modal_field_count();
assert_eq!(detect_structural_preamble(&table), 0);
let mut table = Table::new();
table.rows = vec![vec!["A".to_string()]];
table.field_counts = vec![1];
table.update_modal_field_count();
assert_eq!(detect_structural_preamble(&table), 0);
}
// 12 bytes spread over 3 rows gives an average record length of 4.
#[test]
fn test_avg_record_len_calculated_from_data() {
let short_data = b"a,b\n1,2\n3,4\n";
let sniffer = Sniffer::new();
let metadata = sniffer.sniff_bytes(short_data).unwrap();
assert_eq!(metadata.avg_record_len, 4);
}
// Quote characters count towards the record length: 28 bytes over 2 rows.
#[test]
fn test_avg_record_len_with_quoted_fields() {
let quoted_data = b"\"hello\",\"world\"\n\"foo\",\"bar\"\n";
let sniffer = Sniffer::new();
let metadata = sniffer.sniff_bytes(quoted_data).unwrap();
assert_eq!(metadata.avg_record_len, 14);
}
// Input longer than MAX_RECORDS_BYTES with a record count large enough to
// hit the cap: sniffing through a reader must still succeed.
#[test]
fn test_records_mode_cap_boundary_ok() {
let row = b"col1,col2,col3\n1,2,3\n"; let total = MAX_RECORDS_BYTES + row.len();
let data: Vec<u8> = row.iter().copied().cycle().take(total).collect();
assert!(
data.len() > MAX_RECORDS_BYTES,
"test data must exceed MAX_RECORDS_BYTES to exercise probe-read path"
);
let cursor = std::io::Cursor::new(data);
let mut sniffer = Sniffer::new();
sniffer.sample_size(SampleSize::Records(200_000));
let result = sniffer.sniff_reader(cursor);
assert!(
result.is_ok(),
"sniff should succeed at cap boundary: {result:?}"
);
}
}