use std::cell::RefCell;
use rayon::prelude::*;
use super::potential_dialects::PotentialDialect;
use super::table::{Table, parse_table, parse_table_normalized};
use super::type_detection::{TypeScoreBuffers, calculate_pattern_score, calculate_type_score};
use super::uniformity::{calculate_tau_0, calculate_tau_1, is_uniform};
// Per-thread `TypeScoreBuffers` scratch space. `score_all_dialects_with_best_table`
// borrows it mutably inside its parallel map and hands it to
// `score_dialect_with_normalized_data`, so each worker thread has its own instance.
// NOTE(review): presumably this exists to avoid re-creating the buffers per dialect — confirm.
thread_local! {
static BUFFERS: RefCell<TypeScoreBuffers> = RefCell::new(TypeScoreBuffers::new());
}
/// Byte-level quote statistics for one input buffer.
///
/// Built once by [`QuoteCounts::new`] and then read by the quote-evidence
/// scoring functions in this file.
#[derive(Debug, Clone, Copy)]
struct QuoteCounts {
    /// Number of `"` bytes in the input.
    double: usize,
    /// Number of `'` bytes in the input.
    single: usize,
    /// Number of `\` bytes immediately followed by `'`.
    backslash_single: usize,
    /// Number of `\` bytes immediately followed by `"`.
    backslash_double: usize,
    /// Length of the scanned input in bytes.
    data_len: usize,
}

impl QuoteCounts {
    /// Scans `data` once for backslash-escaped quotes and counts the plain
    /// quote bytes with `bytecount`.
    ///
    /// An escaped quote is counted purely by adjacency: any `\` directly
    /// followed by a quote byte qualifies, even if that backslash is itself
    /// preceded by another backslash.
    fn new(data: &[u8]) -> Self {
        let (backslash_single, backslash_double) = data
            .windows(2)
            .filter(|pair| pair[0] == b'\\')
            .fold(
                (0usize, 0usize),
                |(escaped_single, escaped_double), pair| match pair[1] {
                    b'\'' => (escaped_single + 1, escaped_double),
                    b'"' => (escaped_single, escaped_double + 1),
                    _ => (escaped_single, escaped_double),
                },
            );

        Self {
            double: bytecount::count(data, b'"'),
            single: bytecount::count(data, b'\''),
            backslash_single,
            backslash_double,
            data_len: data.len(),
        }
    }
}
/// Counts of quote characters that sit directly next to a field edge
/// (a delimiter, a line break, or the very start of the input).
///
/// The tallies are gathered for a whole set of candidate delimiters in one
/// pass, so they can be looked up per dialect afterwards.
#[derive(Debug, Clone)]
struct QuoteBoundaryCounts {
    /// Per delimiter: adjacent pairs where `"` touches that delimiter on either side.
    double_boundaries: Vec<(u8, usize)>,
    /// Per delimiter: adjacent pairs where `'` touches that delimiter on either side.
    single_boundaries: Vec<(u8, usize)>,
    /// Per delimiter: occurrences of `'` immediately preceded by that delimiter.
    single_opening_boundaries: Vec<(u8, usize)>,
    /// Adjacent pairs where `"` touches `\n` or `\r` on either side.
    double_newline_boundaries: usize,
    /// Adjacent pairs where `'` touches `\n` or `\r` on either side.
    single_newline_boundaries: usize,
    /// Occurrences of `'` immediately preceded by `\n` or `\r`.
    single_opening_newline_boundaries: usize,
    /// Whether the first input byte is `"`.
    starts_with_double: bool,
    /// Whether the first input byte is `'`.
    starts_with_single: bool,
}

impl QuoteBoundaryCounts {
    /// Walks every adjacent byte pair of `data` once and tallies quote/edge
    /// adjacency for each byte in `delimiters`.
    ///
    /// A byte that is `\n` or `\r` is always treated as a line break, even if
    /// it is also listed as a delimiter.
    fn new(data: &[u8], delimiters: &[u8]) -> Self {
        const UNTRACKED: usize = usize::MAX;
        // Slots `0..delimiters.len()` belong to the delimiters (in input order);
        // one extra trailing slot collects the line-break tallies.
        let newline_slot = delimiters.len();

        let mut slot_of = [UNTRACKED; 256];
        for (slot, &delimiter) in delimiters.iter().enumerate() {
            slot_of[usize::from(delimiter)] = slot;
        }
        // Assigned last so line breaks win over a delimiter with the same byte value.
        slot_of[usize::from(b'\n')] = newline_slot;
        slot_of[usize::from(b'\r')] = newline_slot;

        let mut double = vec![0usize; newline_slot + 1];
        let mut single = vec![0usize; newline_slot + 1];
        let mut single_opening = vec![0usize; newline_slot + 1];

        for pair in data.windows(2) {
            let (left, right) = (pair[0], pair[1]);

            // Edge followed by a quote: an "opening" position.
            let left_slot = slot_of[usize::from(left)];
            if left_slot != UNTRACKED {
                match right {
                    b'"' => double[left_slot] += 1,
                    b'\'' => {
                        single[left_slot] += 1;
                        single_opening[left_slot] += 1;
                    }
                    _ => {}
                }
            }

            // Quote followed by an edge: a "closing" position.
            let right_slot = slot_of[usize::from(right)];
            if right_slot != UNTRACKED {
                match left {
                    b'"' => double[right_slot] += 1,
                    b'\'' => single[right_slot] += 1,
                    _ => {}
                }
            }
        }

        // `zip` stops at the last delimiter, so the trailing line-break slot is left out.
        let pair_up = |tallies: &[usize]| -> Vec<(u8, usize)> {
            delimiters
                .iter()
                .copied()
                .zip(tallies.iter().copied())
                .collect()
        };
        let first_byte = data.first().copied();

        Self {
            double_boundaries: pair_up(&double),
            single_boundaries: pair_up(&single),
            single_opening_boundaries: pair_up(&single_opening),
            double_newline_boundaries: double[newline_slot],
            single_newline_boundaries: single[newline_slot],
            single_opening_newline_boundaries: single_opening[newline_slot],
            starts_with_double: first_byte == Some(b'"'),
            starts_with_single: first_byte == Some(b'\''),
        }
    }

    /// Returns the tally stored for `delimiter`, or 0 when it was not tracked.
    fn count_for(tallies: &[(u8, usize)], delimiter: u8) -> usize {
        tallies
            .iter()
            .find(|entry| entry.0 == delimiter)
            .map_or(0, |entry| entry.1)
    }

    /// Total number of positions where `quote_char` touches `delimiter`, a
    /// line break, or the start of the input.
    fn get_boundary_count(&self, quote_char: u8, delimiter: u8) -> usize {
        let (per_delimiter, at_line_breaks, at_start) = match quote_char {
            b'"' => (
                &self.double_boundaries,
                self.double_newline_boundaries,
                self.starts_with_double,
            ),
            b'\'' => (
                &self.single_boundaries,
                self.single_newline_boundaries,
                self.starts_with_single,
            ),
            // Any other quote byte reads the single-quote tallies but never
            // receives the start-of-input bonus.
            _ => (
                &self.single_boundaries,
                self.single_newline_boundaries,
                false,
            ),
        };
        Self::count_for(per_delimiter, delimiter) + at_line_breaks + usize::from(at_start)
    }

    /// Number of `'` characters that directly follow `delimiter`, a line
    /// break, or the start of the input.
    fn get_single_opening_boundary_count(&self, delimiter: u8) -> usize {
        Self::count_for(&self.single_opening_boundaries, delimiter)
            + self.single_opening_newline_boundaries
            + usize::from(self.starts_with_single)
    }
}
/// Score of one candidate dialect together with the measurements it was
/// derived from.
#[derive(Debug, Clone)]
pub struct DialectScore {
    /// The candidate dialect that was scored.
    pub dialect: PotentialDialect,
    /// Overall score. Initialised from `compute_gamma`; the scoring functions
    /// in this file scale it afterwards by a quote-evidence multiplier.
    pub gamma: f64,
    /// Value returned by `calculate_tau_0` for the parsed table.
    #[allow(dead_code)]
    pub tau_0: f64,
    /// Value returned by `calculate_tau_1` for the parsed table.
    #[allow(dead_code)]
    pub tau_1: f64,
    /// Type score supplied by the caller of [`DialectScore::new`].
    #[allow(dead_code)]
    pub type_score: f64,
    /// Value returned by `calculate_pattern_score` for the parsed table.
    #[allow(dead_code)]
    pub pattern_score: f64,
    /// `table.num_rows()` at scoring time.
    #[allow(dead_code)]
    pub num_rows: usize,
    /// `table.modal_field_count()` at scoring time.
    pub num_fields: usize,
    /// Result of `is_uniform` for the parsed table.
    pub is_uniform: bool,
}

impl DialectScore {
    /// Measures `table` (as parsed under `dialect`) and combines the
    /// measurements with the caller-provided `type_score` into `gamma`.
    pub fn new(dialect: PotentialDialect, table: &Table, type_score: f64) -> Self {
        let (tau_0, tau_1) = (calculate_tau_0(table), calculate_tau_1(table));
        let pattern_score = calculate_pattern_score(table);
        let table_is_uniform = is_uniform(table);

        Self {
            // Computed before `dialect` is moved into the struct below.
            gamma: compute_gamma(
                tau_0,
                tau_1,
                type_score,
                pattern_score,
                table,
                dialect.delimiter,
            ),
            dialect,
            tau_0,
            tau_1,
            type_score,
            pattern_score,
            num_rows: table.num_rows(),
            num_fields: table.modal_field_count(),
            is_uniform: table_is_uniform,
        }
    }

    /// All-zero, non-uniform score for `dialect`; used in this file when the
    /// parsed table is empty.
    pub const fn zero(dialect: PotentialDialect) -> Self {
        Self {
            dialect,
            is_uniform: false,
            num_rows: 0,
            num_fields: 0,
            gamma: 0.0,
            tau_0: 0.0,
            tau_1: 0.0,
            type_score: 0.0,
            pattern_score: 0.0,
        }
    }
}
/// Combines the table measurements into a single score.
///
/// The score is a weighted sum (uniformity, type score, pattern score, a row
/// bonus capped at 20 rows and a field bonus capped at 10 fields) multiplied
/// by penalty factors for single-column tables, very wide tables, small
/// samples and less common delimiters. An empty table scores `0.0`.
fn compute_gamma(
    tau_0: f64,
    tau_1: f64,
    type_score: f64,
    pattern_score: f64,
    table: &Table,
    delimiter: u8,
) -> f64 {
    if table.is_empty() {
        return 0.0;
    }

    let num_rows = table.num_rows();
    let field_count = table.modal_field_count();

    // Additive terms.
    let uniformity = (tau_0 * tau_1).sqrt();
    let type_term = type_score * 0.3;
    let pattern_term = pattern_score * 0.1;
    let row_term = (num_rows.min(20) as f64 / 20.0) * 0.1;

    // The field bonus only applies from two columns up; exactly one column halves the score.
    let (field_term, single_field_factor) = match field_count {
        0 => (0.0, 1.0),
        1 => (0.0, 0.5),
        wide => ((wide.min(10) as f64 / 10.0) * 0.2, 1.0),
    };

    // Multiplicative penalties.
    let wide_table_factor = match field_count {
        0..=50 => 1.0,
        51..=100 => 0.8,
        _ => 0.5,
    };
    let small_sample_factor = match num_rows {
        0..=2 => 0.80,
        3..=4 => 0.90,
        _ => 1.0,
    };
    let delimiter_factor = match delimiter {
        b',' | b';' | b'\t' => 1.0,
        b'|' => 0.98,
        b':' => 0.90,
        b'#' if field_count >= 3 && num_rows >= 50 => 0.85,
        b'^' | b'~' => 0.80,
        0xA7 => 0.78,
        b' ' => 0.75,
        b'/' => 0.65,
        b'#' | b'&' => 0.60,
        _ => 0.70,
    };

    let weighted_sum =
        uniformity.mul_add(0.5, type_term) + pattern_term + row_term + field_term;
    weighted_sum * single_field_factor * wide_table_factor * delimiter_factor * small_sample_factor
}
#[allow(dead_code)]
pub fn score_dialect(data: &[u8], dialect: &PotentialDialect, max_rows: usize) -> DialectScore {
let quote_counts = QuoteCounts::new(data);
let mut buffers = TypeScoreBuffers::new();
let (score, _table) =
score_dialect_with_counts(data, dialect, max_rows, "e_counts, &mut buffers);
score
}
/// Parses `data` under `dialect` and scores the resulting table, reusing
/// precomputed `quote_counts` and caller-owned `buffers`.
///
/// Returns the score together with the parsed table. An empty table yields
/// `DialectScore::zero`. Otherwise `gamma` is scaled by the quote-evidence
/// multiplier computed directly from `data`.
fn score_dialect_with_counts(
    data: &[u8],
    dialect: &PotentialDialect,
    max_rows: usize,
    quote_counts: &QuoteCounts,
    buffers: &mut TypeScoreBuffers,
) -> (DialectScore, Table) {
    let table = parse_table(data, dialect, max_rows);

    let score = if table.is_empty() {
        DialectScore::zero(dialect.clone())
    } else {
        let type_score = calculate_type_score(&table, buffers);
        let mut scored = DialectScore::new(dialect.clone(), &table, type_score);
        scored.gamma *= quote_evidence_score_with_data(data, quote_counts, dialect);
        scored
    };

    (score, table)
}
/// Parses already line-ending-normalized input under `dialect` and scores it,
/// using cached quote and boundary counts.
///
/// On top of the base score, the quote-evidence multiplier is adjusted in
/// three situations before/after it is applied to `gamma`:
///
/// 1. A strong quote boost (> 1.5) on a non-uniform table with a modal field
///    count of at least 5 is damped when the first row has at most one field
///    and at least three distinct non-modal field counts occur.
/// 2. For the space delimiter, the multiplier is capped and reduced when more
///    than half of the rows start with an empty (or missing) first field.
/// 3. For the comma delimiter with two modal fields, `gamma` is reduced when
///    more than 90% of rows contain `" # "` in their first field.
fn score_dialect_with_normalized_data(
    normalized_data: &[u8],
    dialect: &PotentialDialect,
    max_rows: usize,
    quote_counts: &QuoteCounts,
    boundary_counts: &QuoteBoundaryCounts,
    buffers: &mut TypeScoreBuffers,
) -> (DialectScore, Table) {
    let table = parse_table_normalized(normalized_data, dialect, max_rows);
    if table.is_empty() {
        return (DialectScore::zero(dialect.clone()), table);
    }

    let type_score = calculate_type_score(&table, buffers);
    let mut score = DialectScore::new(dialect.clone(), &table, type_score);
    let mut multiplier =
        quote_evidence_score_with_cached_boundaries(quote_counts, boundary_counts, dialect);

    // (1) Damp a strong quote boost on ragged tables.
    let modal = score.num_fields;
    if multiplier > 1.5 && modal >= 5 && !score.is_uniform {
        let first_row_fields = table.field_counts.first().copied().unwrap_or(modal);
        if first_row_fields <= 1 {
            let distinct_non_modal = table
                .field_counts
                .iter()
                .filter(|&&count| count != modal)
                .collect::<std::collections::BTreeSet<_>>()
                .len();
            if distinct_non_modal >= 3 {
                // Keep only 30% of the boost above 1.0.
                multiplier = 1.0 + (multiplier - 1.0) * 0.3;
            }
        }
    }

    // (2) Space delimiter with mostly empty leading fields.
    let row_count = table.rows.len();
    if dialect.delimiter == b' ' && row_count > 0 {
        let blank_leading = table
            .rows
            .iter()
            .filter(|row| match row.first() {
                Some(field) => field.is_empty(),
                None => true,
            })
            .count();
        if blank_leading * 2 > row_count {
            multiplier = multiplier.min(1.05) * 0.55;
        }
    }

    score.gamma *= multiplier;

    // (3) Comma delimiter where the first field looks " # "-separated.
    if dialect.delimiter == b',' && score.num_fields == 2 && row_count > 0 {
        let hash_separated = table
            .rows
            .iter()
            .filter(|row| matches!(row.first(), Some(field) if field.trim_start().contains(" # ")))
            .count();
        if hash_separated * 10 > row_count * 9 {
            score.gamma *= 0.82;
        }
    }

    (score, table)
}
#[allow(dead_code)]
fn quote_evidence_score(data: &[u8], dialect: &PotentialDialect) -> f64 {
let quote_counts = QuoteCounts::new(data);
quote_evidence_score_with_counts("e_counts, dialect)
}
/// Multiplier reflecting how well the quote densities of the input agree with
/// the quote character of `dialect`.
///
/// Densities are quote bytes per 1000 input bytes (integer division). A
/// density of at least 5 counts as "dense". Empty input and any quote
/// character other than `"` or `'` yield the neutral multiplier `1.0`.
fn quote_evidence_score_with_counts(quote_counts: &QuoteCounts, dialect: &PotentialDialect) -> f64 {
    use crate::metadata::Quote;

    const MIN_DENSITY: usize = 5;

    let total_bytes = quote_counts.data_len;
    if total_bytes == 0 {
        return 1.0;
    }
    let double_density = (quote_counts.double * 1000) / total_bytes;
    let single_density = (quote_counts.single * 1000) / total_bytes;
    let doubles_are_dense = double_density >= MIN_DENSITY;

    match dialect.quote {
        Quote::Some(b'"') if doubles_are_dense => 1.06,
        // Single-quote dialect: arms are tried top to bottom.
        Quote::Some(b'\'') if double_density == 0 && single_density >= MIN_DENSITY => 1.10,
        Quote::Some(b'\'') if single_density >= MIN_DENSITY * 2 && !doubles_are_dense => 1.05,
        Quote::Some(b'\'') if doubles_are_dense => 0.92,
        // Claiming "no quoting" while double quotes are dense is penalised.
        Quote::None if doubles_are_dense => 0.90,
        Quote::Some(_) | Quote::None => 1.0,
    }
}
/// Counts positions where `quote_char` touches a field edge in `data`.
///
/// An edge is `delimiter`, `\n` or `\r`. Each adjacent byte pair contributes
/// one for "edge then quote" and one for "quote then edge". A quote at the
/// very first byte adds one more.
#[allow(dead_code)]
fn quote_boundary_count(data: &[u8], quote_char: u8, delimiter: u8) -> usize {
    let is_edge = |byte: u8| byte == delimiter || byte == b'\n' || byte == b'\r';

    let touching_pairs: usize = data
        .windows(2)
        .map(|pair| {
            let opens = is_edge(pair[0]) && pair[1] == quote_char;
            let closes = pair[0] == quote_char && is_edge(pair[1]);
            usize::from(opens) + usize::from(closes)
        })
        .sum();

    let leading_quote = usize::from(data.first().copied() == Some(quote_char));
    touching_pairs + leading_quote
}
/// Multiplier for a single-quote dialect, derived from how the `'` characters
/// are positioned relative to field edges.
///
/// The rules are tried in order and the first match wins:
///
/// 1. no `"` at all, ≥ 2 opening positions, ≥ 4 boundaries, density ≥ 2× threshold → `2.2`
/// 2. no `"` at all, ≥ 1 opening position, ≥ 2 boundaries, density ≥ threshold → `1.20`
/// 3. double-quote density ≥ threshold → `0.90`
/// 4. `\'` present, no `\"`, no boundaries → `1.10`
/// 5. no `"` at all, no opening positions, ≥ 20 boundaries, density ≥ 50 → `1.10`
/// 6. no boundaries but some single quotes → `0.95`
/// 7. otherwise → `1.0`
fn compute_single_quote_multiplier(
    quote_counts: &QuoteCounts,
    boundary_count: usize,
    opening_count: usize,
    single_density: usize,
    double_density: usize,
    min_density_threshold: usize,
) -> f64 {
    let no_double_quotes = quote_counts.double == 0;

    if no_double_quotes {
        if opening_count >= 2
            && boundary_count >= 4
            && single_density >= min_density_threshold * 2
        {
            return 2.2;
        }
        if opening_count >= 1 && boundary_count >= 2 && single_density >= min_density_threshold {
            return 1.20;
        }
    }

    if double_density >= min_density_threshold {
        return 0.90;
    }

    let only_single_escapes =
        quote_counts.backslash_single > 0 && quote_counts.backslash_double == 0;
    if only_single_escapes && boundary_count == 0 {
        return 1.10;
    }

    // Closing-only pattern: many quotes before an edge, none after one.
    if no_double_quotes && opening_count == 0 && boundary_count >= 20 && single_density >= 50 {
        return 1.10;
    }

    if boundary_count == 0 && single_density > 0 {
        return 0.95;
    }

    1.0
}
/// Quote-evidence multiplier for `dialect`, using precomputed quote counts
/// and boundary tallies instead of rescanning the input.
///
/// Densities are quote bytes per 1000 input bytes (integer division). A
/// density of at least 5 counts as "dense". Empty input yields `1.0`.
fn quote_evidence_score_with_cached_boundaries(
    quote_counts: &QuoteCounts,
    boundary_counts: &QuoteBoundaryCounts,
    dialect: &PotentialDialect,
) -> f64 {
    use crate::metadata::Quote;

    const MIN_DENSITY: usize = 5;

    let total_bytes = quote_counts.data_len;
    if total_bytes == 0 {
        return 1.0;
    }
    let double_density = (quote_counts.double * 1000) / total_bytes;
    let single_density = (quote_counts.single * 1000) / total_bytes;
    let doubles_are_dense = double_density >= MIN_DENSITY;

    match dialect.quote {
        Quote::Some(b'"') => {
            let touching_edges = boundary_counts.get_boundary_count(b'"', dialect.delimiter);
            if !doubles_are_dense {
                1.0
            } else if touching_edges < 2 {
                1.08
            } else if quote_counts.single == 0 {
                // Dense, edge-aligned double quotes and no competing single quotes.
                2.2
            } else {
                1.15
            }
        }
        Quote::Some(b'\'') => compute_single_quote_multiplier(
            quote_counts,
            boundary_counts.get_boundary_count(b'\'', dialect.delimiter),
            boundary_counts.get_single_opening_boundary_count(dialect.delimiter),
            single_density,
            double_density,
            MIN_DENSITY,
        ),
        // Claiming "no quoting" while double quotes are dense is penalised.
        Quote::None if doubles_are_dense => 0.90,
        Quote::None | Quote::Some(_) => 1.0,
    }
}
/// Counts occurrences of `quote_char` in an opening position: directly after
/// `delimiter`, `\n` or `\r`, or as the very first byte of `data`.
fn quote_opening_boundary_count(data: &[u8], quote_char: u8, delimiter: u8) -> usize {
    let after_edge = data
        .windows(2)
        .filter(|pair| {
            pair[1] == quote_char
                && (pair[0] == delimiter || pair[0] == b'\n' || pair[0] == b'\r')
        })
        .count();

    let at_start = usize::from(data.first().copied() == Some(quote_char));
    after_edge + at_start
}
/// Quote-evidence multiplier for `dialect`, scanning `data` directly for
/// quote/edge adjacency.
///
/// Densities are quote bytes per 1000 input bytes (integer division). A
/// density of at least 5 counts as "dense". Empty input yields `1.0`.
fn quote_evidence_score_with_data(
    data: &[u8],
    quote_counts: &QuoteCounts,
    dialect: &PotentialDialect,
) -> f64 {
    use crate::metadata::Quote;

    const MIN_DENSITY: usize = 5;

    let total_bytes = quote_counts.data_len;
    if total_bytes == 0 {
        return 1.0;
    }
    let double_density = (quote_counts.double * 1000) / total_bytes;
    let single_density = (quote_counts.single * 1000) / total_bytes;
    let doubles_are_dense = double_density >= MIN_DENSITY;

    match dialect.quote {
        Quote::Some(b'"') => {
            let touching_edges = quote_boundary_count(data, b'"', dialect.delimiter);
            if !doubles_are_dense {
                1.0
            } else if touching_edges < 2 {
                1.08
            } else if quote_counts.single == 0 {
                // Dense, edge-aligned double quotes and no competing single quotes.
                2.2
            } else {
                1.15
            }
        }
        Quote::Some(b'\'') => compute_single_quote_multiplier(
            quote_counts,
            quote_boundary_count(data, b'\'', dialect.delimiter),
            quote_opening_boundary_count(data, b'\'', dialect.delimiter),
            single_density,
            double_density,
            MIN_DENSITY,
        ),
        // Claiming "no quoting" while double quotes are dense is penalised.
        Quote::None if doubles_are_dense => 0.90,
        Quote::None | Quote::Some(_) => 1.0,
    }
}
/// Picks the winning score among those with a strictly positive `gamma`.
///
/// * If every positive score has at most one field, the choice is made purely
///   by delimiter priority, then quote priority; `gamma` is ignored.
/// * Otherwise two scores whose gammas are within 5% of each other (ratio
///   above 0.95) are ranked by delimiter priority, then quote priority, then
///   `gamma`; scores further apart are ranked by `gamma` alone.
///
/// Returns `None` when no score has a positive `gamma`. Candidates that
/// compare equal are resolved by `Iterator::max_by`, which keeps the last one.
pub fn find_best_dialect(scores: &[DialectScore]) -> Option<&DialectScore> {
    /// Delimiter priority first, quote priority as the tie-breaker.
    fn by_priority(a: &DialectScore, b: &DialectScore) -> std::cmp::Ordering {
        delimiter_priority(a.dialect.delimiter)
            .cmp(&delimiter_priority(b.dialect.delimiter))
            .then_with(|| quote_priority(a.dialect.quote).cmp(&quote_priority(b.dialect.quote)))
    }

    /// Plain gamma comparison; incomparable values are treated as equal.
    fn by_gamma(a: &DialectScore, b: &DialectScore) -> std::cmp::Ordering {
        a.gamma
            .partial_cmp(&b.gamma)
            .unwrap_or(std::cmp::Ordering::Equal)
    }

    let all_single_field = scores
        .iter()
        .filter(|candidate| candidate.gamma > 0.0)
        .all(|candidate| candidate.num_fields <= 1);

    scores
        .iter()
        .filter(|candidate| candidate.gamma > 0.0)
        .max_by(|&a, &b| {
            if all_single_field {
                return by_priority(a, b);
            }

            let (smaller, larger) = if a.gamma > b.gamma {
                (b.gamma, a.gamma)
            } else {
                (a.gamma, b.gamma)
            };
            if smaller / larger > 0.95 {
                // Near-tie: prefer the more conventional dialect.
                by_priority(a, b).then_with(|| by_gamma(a, b))
            } else {
                by_gamma(a, b)
            }
        })
}
/// Tie-break rank of a delimiter byte; a higher value is preferred.
/// Bytes not listed here rank lowest (0).
const fn delimiter_priority(delimiter: u8) -> u8 {
    match delimiter {
        b',' => 10,
        b';' => 9,
        b'\t' | b'|' => 8,
        b':' => 4,
        b'^' | b'~' => 3,
        0xA7 | b'/' | b' ' => 2,
        b'#' | b'&' => 1,
        _ => 0,
    }
}
/// Tie-break rank of a quote setting; a higher value is preferred.
/// Double quote ranks above single quote, which ranks above no quoting;
/// any other quote byte ranks lowest.
const fn quote_priority(quote: crate::metadata::Quote) -> u8 {
    use crate::metadata::Quote;
    match quote {
        Quote::None => 1,
        Quote::Some(byte) => match byte {
            b'"' => 3,
            b'\'' => 2,
            _ => 0,
        },
    }
}
/// Scores every dialect in `dialects` against `data` and returns the scores
/// as produced by `score_all_dialects_with_best_table`, dropping the best
/// parsed table that function also returns.
// Only the unit tests in this file call this function, hence the allow.
#[allow(dead_code)]
pub fn score_all_dialects(
    data: &[u8],
    dialects: &[PotentialDialect],
    max_rows: usize,
) -> Vec<DialectScore> {
    score_all_dialects_with_best_table(data, dialects, max_rows).0
}
pub fn score_all_dialects_with_best_table(
data: &[u8],
dialects: &[PotentialDialect],
max_rows: usize,
) -> (Vec<DialectScore>, Option<(PotentialDialect, Table)>) {
let quote_counts = QuoteCounts::new(data);
let delimiters: Vec<u8> = dialects
.iter()
.map(|d| d.delimiter)
.collect::<std::collections::HashSet<_>>()
.into_iter()
.collect();
let line_terminator = dialects
.first()
.map_or(super::potential_dialects::LineTerminator::LF, |d| {
d.line_terminator
});
let normalized_data = super::potential_dialects::normalize_line_endings(data, line_terminator);
let normalized_bytes: &[u8] = normalized_data.as_ref();
let boundary_counts = QuoteBoundaryCounts::new(normalized_bytes, &delimiters);
let pairs: Vec<(DialectScore, Table)> = dialects
.par_iter()
.map(|d| {
BUFFERS.with(|b| {
score_dialect_with_normalized_data(
normalized_bytes,
d,
max_rows,
"e_counts,
&boundary_counts,
&mut b.borrow_mut(),
)
})
})
.collect();
let best_table = pairs
.iter()
.enumerate()
.max_by(|(i, a), (j, b)| {
a.0.gamma
.partial_cmp(&b.0.gamma)
.unwrap_or(std::cmp::Ordering::Equal)
.then_with(|| j.cmp(i)) })
.map(|(_, (s, t))| (s.dialect.clone(), t.clone()));
let mut scores: Vec<DialectScore> = pairs.into_iter().map(|(s, _)| s).collect();
scores.sort_by(|a, b| {
b.gamma
.partial_cmp(&a.gamma)
.unwrap_or(std::cmp::Ordering::Equal)
});
(scores, best_table)
}
// Unit tests for dialect scoring. Two entry points are exercised:
// `score_dialect` (raw-data path via `score_dialect_with_counts`) and
// `score_all_dialects` (normalized/cached path via
// `score_dialect_with_normalized_data`). Only the latter path contains the
// space-dampening and comma/" # " adjustments.
#[cfg(test)]
mod tests {
use super::*;
use crate::metadata::Quote;
use crate::tum::potential_dialects::LineTerminator;
// A well-formed three-column CSV scores positively, with 3 modal fields and a uniform table.
#[test]
fn test_score_simple_csv() {
let data = b"a,b,c\n1,2,3\n4,5,6\n";
let dialect = PotentialDialect::new(b',', Quote::Some(b'"'), LineTerminator::LF);
let score = score_dialect(data, &dialect, 100);
assert!(score.gamma > 0.0);
assert_eq!(score.num_fields, 3);
assert!(score.is_uniform);
}
// The delimiter actually present in the data must outscore one that is absent.
#[test]
fn test_wrong_delimiter_lower_score() {
let data = b"a,b,c\n1,2,3\n4,5,6\n";
let correct_dialect = PotentialDialect::new(b',', Quote::Some(b'"'), LineTerminator::LF);
let wrong_dialect = PotentialDialect::new(b';', Quote::Some(b'"'), LineTerminator::LF);
let correct_score = score_dialect(data, &correct_dialect, 100);
let wrong_score = score_dialect(data, &wrong_dialect, 100);
assert!(correct_score.gamma > wrong_score.gamma);
}
// End-to-end: scoring three candidate delimiters and selecting the best picks the comma.
#[test]
fn test_find_best_dialect() {
let data = b"a,b,c\n1,2,3\n4,5,6\n";
let dialects = vec![
PotentialDialect::new(b',', Quote::Some(b'"'), LineTerminator::LF),
PotentialDialect::new(b';', Quote::Some(b'"'), LineTerminator::LF),
PotentialDialect::new(b'\t', Quote::Some(b'"'), LineTerminator::LF),
];
let scores = score_all_dialects(data, &dialects, 100);
let best = find_best_dialect(&scores).unwrap();
assert_eq!(best.dialect.delimiter, b',');
}
// Apostrophes inside words never directly follow a delimiter or line break.
#[test]
fn test_quote_opening_boundary_count_apostrophes_only() {
let data = b"value's, other's, thing's\n";
let count = quote_opening_boundary_count(data, b'\'', b',');
assert_eq!(count, 0);
}
// Only the quote directly after the leading comma counts; the one after ", " follows a space.
#[test]
fn test_quote_opening_boundary_count_genuine_quoting() {
let data = b",'field', 'next'\n";
let count = quote_opening_boundary_count(data, b'\'', b',');
assert!(
count >= 1,
"expected at least 1 opening boundary, got {count}"
);
}
// One opening at the start of the input plus one after the comma.
#[test]
fn test_quote_opening_boundary_count_leading_quote() {
let data = b"'field','next'\n";
let count = quote_opening_boundary_count(data, b'\'', b',');
assert_eq!(count, 2);
}
// Empty input has no boundaries.
#[test]
fn test_quote_opening_boundary_count_empty() {
let count = quote_opening_boundary_count(b"", b'\'', b',');
assert_eq!(count, 0);
}
// Cached counterpart of the apostrophes-only case above.
#[test]
fn test_get_single_opening_boundary_count_apostrophes_only() {
let data = b"it's, we're, they've\n";
let delimiters = vec![b','];
let counts = QuoteBoundaryCounts::new(data, &delimiters);
let opening = counts.get_single_opening_boundary_count(b',');
assert_eq!(
opening, 0,
"apostrophes should produce zero opening boundaries"
);
}
// Cached counterpart for genuinely quoted fields.
#[test]
fn test_get_single_opening_boundary_count_genuine_quoting() {
let data = b"'first','second','third'\n";
let delimiters = vec![b','];
let counts = QuoteBoundaryCounts::new(data, &delimiters);
let opening = counts.get_single_opening_boundary_count(b',');
assert!(
opening >= 2,
"expected ≥2 opening boundaries for genuinely quoted fields, got {opening}"
);
}
// `#` delimiter: 10 rows get the 0.60 penalty, 60 rows (>= 50, 3 fields) get 0.85.
#[test]
fn test_hash_penalty_strict_for_small_table() {
let mut small_data = String::new();
for _ in 0..10 {
small_data.push_str("a#b#c\n");
}
let mut large_data = String::new();
for _ in 0..60 {
large_data.push_str("a#b#c\n");
}
let dialects = vec![PotentialDialect::new(
b'#',
Quote::Some(b'"'),
LineTerminator::LF,
)];
let small_scores = score_all_dialects(small_data.as_bytes(), &dialects, 200);
let large_scores = score_all_dialects(large_data.as_bytes(), &dialects, 200);
let small_score = small_scores
.iter()
.find(|s| s.dialect.delimiter == b'#')
.unwrap();
let large_score = large_scores
.iter()
.find(|s| s.dialect.delimiter == b'#')
.unwrap();
assert!(
large_score.gamma > small_score.gamma * 1.3,
"large hash table (0.85 penalty) should outscore small hash table (0.60 penalty) \
by factor ≥ 1.3; small={} large={}",
small_score.gamma,
large_score.gamma
);
}
// A 60-row, 3-field `#` table should still reach a meaningful score.
#[test]
fn test_hash_penalty_relaxed_for_large_table() {
let mut data = String::new();
for i in 0..60 {
data.push_str(&format!("val{i}#val{i}b#val{i}c\n"));
}
let bytes = data.as_bytes();
let hash_dialect = PotentialDialect::new(b'#', Quote::Some(b'"'), LineTerminator::LF);
let hash_score = score_dialect(bytes, &hash_dialect, 200);
assert!(
hash_score.gamma > 0.3,
"large hash-delimited table should have a meaningful gamma; got {}",
hash_score.gamma
);
}
// Space delimiter: rows that all start with a space produce empty first fields,
// which should trigger the dampening on the cached path.
#[test]
fn test_space_dampening_fires_when_majority_empty_first() {
let leading_space_data = b" a b\n c d\n e f\n";
let no_leading_space_data = b"a b c\nd e f\ng h i\n";
let dialects = vec![PotentialDialect::new(
b' ',
Quote::Some(b'"'),
LineTerminator::LF,
)];
let dampened_scores = score_all_dialects(leading_space_data, &dialects, 100);
let undampened_scores = score_all_dialects(no_leading_space_data, &dialects, 100);
let dampened_score = dampened_scores
.iter()
.find(|s| s.dialect.delimiter == b' ')
.unwrap();
let undampened_score = undampened_scores
.iter()
.find(|s| s.dialect.delimiter == b' ')
.unwrap();
assert!(
dampened_score.gamma < undampened_score.gamma,
"dampening should reduce score when majority rows have empty first field; \
dampened={} undampened={}",
dampened_score.gamma,
undampened_score.gamma
);
}
// NOTE(review): this goes through `score_dialect`, whose path
// (`score_dialect_with_counts`) has no space-dampening step in this file, so
// the assertion cannot observe whether dampening fired. Consider using
// `score_all_dialects` here.
#[test]
fn test_space_dampening_does_not_fire_when_minority_empty_first() {
let data = b" x y\na b\nc d\n";
let space_dialect = PotentialDialect::new(b' ', Quote::Some(b'"'), LineTerminator::LF);
let score = score_dialect(data, &space_dialect, 100);
assert!(
score.gamma > 0.1,
"dampening should not fire for minority empty-first; gamma={}",
score.gamma
);
}
// Comma delimiter, two fields, every first field containing " # ": the 0.82
// reduction on the cached path should lower the score relative to clean data.
#[test]
fn test_comma_hash_penalty_fires_on_hash_delimited_data() {
let penalized_data = b"foo # baz,bar\nfoo # baz,bar\nfoo # baz,bar\n\
foo # baz,bar\nfoo # baz,bar\nfoo # baz,bar\n\
foo # baz,bar\nfoo # baz,bar\nfoo # baz,bar\n\
foo # baz,bar\n";
let clean_data = b"foo bar baz,bar\nfoo bar baz,bar\nfoo bar baz,bar\n\
foo bar baz,bar\nfoo bar baz,bar\nfoo bar baz,bar\n\
foo bar baz,bar\nfoo bar baz,bar\nfoo bar baz,bar\n\
foo bar baz,bar\n";
let dialects = vec![PotentialDialect::new(
b',',
Quote::Some(b'"'),
LineTerminator::LF,
)];
let penalized_scores = score_all_dialects(penalized_data, &dialects, 100);
let clean_scores = score_all_dialects(clean_data, &dialects, 100);
let penalized_score = penalized_scores
.iter()
.find(|s| s.dialect.delimiter == b',')
.unwrap();
let clean_score = clean_scores
.iter()
.find(|s| s.dialect.delimiter == b',')
.unwrap();
assert!(
penalized_score.gamma >= 0.0,
"comma gamma must be non-negative"
);
assert!(
penalized_score.gamma < clean_score.gamma,
"comma penalty (0.82×) should reduce score when ' # ' dominates field-0; \
penalized={} clean={}",
penalized_score.gamma,
clean_score.gamma
);
}
// NOTE(review): this uses `score_dialect`, whose path never applies the
// comma/" # " reduction, and it only asserts non-negativity, so it does not
// verify the 90% threshold named in the test.
#[test]
fn test_comma_hash_penalty_does_not_fire_below_90pct() {
let data = b"a # b,c\na # b,c\na # b,c\na # b,c\na # b,c\n\
x,y\nx,y\nx,y\nx,y\nx,y\n";
let comma_dialect = PotentialDialect::new(b',', Quote::Some(b'"'), LineTerminator::LF);
let score = score_dialect(data, &comma_dialect, 100);
assert!(score.gamma >= 0.0);
}
// Single-quote dialect: `\'` escapes with no `\"` and no boundary quotes earn the 1.10 boost.
#[test]
fn test_backslash_single_boost_applied() {
let data_with_backslash = b"it\\'s fine,next\ndon\\'t stop,go\nwe\\'re here,now\n";
let data_no_apostrophe = b"its fine,next\ndont stop,go\nwere here,now\n";
let sq_dialect = PotentialDialect::new(b',', Quote::Some(b'\''), LineTerminator::LF);
let boosted_score = score_dialect(data_with_backslash, &sq_dialect, 100);
let baseline_score = score_dialect(data_no_apostrophe, &sq_dialect, 100);
assert!(
boosted_score.gamma > 0.0,
"single-quote dialect must score positively; gamma={}",
boosted_score.gamma
);
assert!(
boosted_score.gamma > baseline_score.gamma,
"backslash-escape boost (1.10×) should raise sq score above no-apostrophe baseline; \
boosted={} baseline={}",
boosted_score.gamma,
baseline_score.gamma
);
}
// NOTE(review): only asserts non-negativity; it does not check that the boost was withheld.
#[test]
fn test_backslash_boost_does_not_fire_when_double_quotes_present() {
let data = b"it\\'s,\"quoted\"\ndon\\'t,\"also\"\n";
let sq_dialect = PotentialDialect::new(b',', Quote::Some(b'\''), LineTerminator::LF);
let score = score_dialect(data, &sq_dialect, 100);
assert!(score.gamma >= 0.0);
}
// Closing-only single quotes (`x'` followed by a tab): the 1.10 boost needs at
// least 20 boundaries. Checked on both scoring paths, which must also agree
// with each other to within 1e-9.
#[test]
fn test_closing_only_boost_below_threshold_no_boost() {
let tab_sq_dialect = PotentialDialect::new(b'\t', Quote::Some(b'\''), LineTerminator::LF);
// 19 closing boundaries, padded to 25 rows.
let mut data_19 = Vec::new();
for _ in 0..19 {
data_19.extend_from_slice(b"x'\trest\n");
}
for _ in 0..6 {
data_19.extend_from_slice(b"x\trest\n");
}
// 20 closing boundaries, also 25 rows.
let mut data_20 = Vec::new();
for _ in 0..20 {
data_20.extend_from_slice(b"x'\trest\n");
}
for _ in 0..5 {
data_20.extend_from_slice(b"x\trest\n");
}
let score_19 = score_dialect(&data_19, &tab_sq_dialect, 200);
let score_20 = score_dialect(&data_20, &tab_sq_dialect, 200);
assert!(
score_20.gamma > score_19.gamma,
"closing-only boost (1.10×) should fire at boundary_count=20 but not at 19; \
score_19={} score_20={}",
score_19.gamma,
score_20.gamma
);
// Same comparison through the cached-boundaries path.
let dialects = vec![tab_sq_dialect];
let cached_19 = score_all_dialects(&data_19, &dialects, 200);
let cached_20 = score_all_dialects(&data_20, &dialects, 200);
let cached_score_19 = cached_19
.iter()
.find(|s| s.dialect.delimiter == b'\t')
.unwrap();
let cached_score_20 = cached_20
.iter()
.find(|s| s.dialect.delimiter == b'\t')
.unwrap();
assert!(
cached_score_20.gamma > cached_score_19.gamma,
"closing-only boost (1.10×) should fire on cached path at boundary_count=20 but not at 19; \
cached_19={} cached_20={}",
cached_score_19.gamma,
cached_score_20.gamma
);
// Both paths must produce the same gamma for the same input.
let tolerance = 1e-9_f64;
assert!(
(cached_score_19.gamma - score_19.gamma).abs() < tolerance,
"cached and non-cached paths disagree on 19-boundary score: \
non_cached={} cached={}",
score_19.gamma,
cached_score_19.gamma
);
assert!(
(cached_score_20.gamma - score_20.gamma).abs() < tolerance,
"cached and non-cached paths disagree on 20-boundary score: \
non_cached={} cached={}",
score_20.gamma,
cached_score_20.gamma
);
}
}