use super::potential_dialects::PotentialDialect;
use crate::metadata::Quote;
use foldhash::{HashMap, HashMapExt};
use std::borrow::Cow;
use std::io::{BufRead, Cursor};
/// Rows of parsed fields together with per-row field counts and a cached
/// summary of the most common (modal) field count.
///
/// The cached modal values are only refreshed by
/// [`Table::update_modal_field_count`]; code that mutates `field_counts`
/// directly must call it before reading the modal accessors.
#[derive(Debug, Clone)]
pub struct Table {
    /// Parsed records, one `Vec<String>` of field values per row.
    pub rows: Vec<Vec<String>>,
    /// Field count recorded for each row (the parsers in this file push one
    /// entry per row pushed to `rows`).
    pub field_counts: Vec<usize>,
    /// Modal field count as of the last `update_modal_field_count` call.
    cached_modal_field_count: usize,
    /// How many entries of `field_counts` had the modal value at that time.
    cached_modal_field_count_freq: usize,
}

impl Table {
    /// Creates an empty table whose cached modal count and frequency are zero.
    pub const fn new() -> Self {
        Self {
            rows: Vec::new(),
            field_counts: Vec::new(),
            cached_modal_field_count: 0,
            cached_modal_field_count_freq: 0,
        }
    }

    /// Returns `true` when no rows have been stored.
    #[inline]
    pub const fn is_empty(&self) -> bool {
        self.rows.is_empty()
    }

    /// Returns the number of stored rows.
    #[inline]
    pub const fn num_rows(&self) -> usize {
        self.rows.len()
    }

    /// Returns the cached modal field count (0 until the cache is updated).
    #[inline]
    pub const fn modal_field_count(&self) -> usize {
        self.cached_modal_field_count
    }

    /// Computes `(modal field count, its frequency)` for `field_counts`.
    ///
    /// Returns `(0, 0)` for an empty slice. When several field counts share
    /// the highest frequency, the largest field count wins.
    fn compute_modal_field_count(field_counts: &[usize]) -> (usize, usize) {
        // Largest field count that is still tallied in a fixed-size array.
        const DENSE_LIMIT: usize = 256;

        let Some(widest) = field_counts.iter().copied().max() else {
            return (0, 0);
        };

        if widest > DENSE_LIMIT {
            // Sparse path: tally in a hash map keyed by field count.
            let mut tally: HashMap<usize, usize> = HashMap::with_capacity(field_counts.len());
            for &width in field_counts {
                *tally.entry(width).or_insert(0) += 1;
            }
            // Keys are unique, so ordering by (frequency, width) has a single
            // maximum: highest frequency, then largest width.
            return tally
                .into_iter()
                .max_by_key(|&(width, hits)| (hits, width))
                .unwrap_or((0, 0));
        }

        // Dense path: every value is <= DENSE_LIMIT, so direct indexing is in bounds.
        let mut histogram = [0usize; DENSE_LIMIT + 1];
        for &width in field_counts {
            histogram[width] += 1;
        }

        // Scan from widest to narrowest; the strict `>` keeps the widest entry
        // among equally frequent ones.
        let mut best = (0, 0);
        for (width, &hits) in histogram.iter().enumerate().rev() {
            if hits > best.1 {
                best = (width, hits);
            }
        }
        best
    }

    /// Recomputes the cached modal field count and frequency from `field_counts`.
    pub fn update_modal_field_count(&mut self) {
        let (mode, occurrences) = Self::compute_modal_field_count(&self.field_counts);
        self.cached_modal_field_count_freq = occurrences;
        self.cached_modal_field_count = mode;
    }

    /// Returns the cached frequency of the modal field count.
    #[inline]
    pub const fn modal_field_count_freq(&self) -> usize {
        self.cached_modal_field_count_freq
    }

    /// Returns the smallest recorded field count, or 0 when there are none.
    #[inline]
    pub fn min_field_count(&self) -> usize {
        self.field_counts.iter().min().copied().unwrap_or_default()
    }

    /// Returns the largest recorded field count, or 0 when there are none.
    #[inline]
    pub fn max_field_count(&self) -> usize {
        self.field_counts.iter().max().copied().unwrap_or_default()
    }
}

impl Default for Table {
    /// Equivalent to [`Table::new`].
    fn default() -> Self {
        Table::new()
    }
}
/// Parses `data` into a [`Table`] using `dialect`.
///
/// The input is first passed through `normalize_line_endings` for the
/// dialect's line terminator, then handed to the shared parser.
/// A `max_rows` of 0 means no row limit.
pub fn parse_table(data: &[u8], dialect: &PotentialDialect, max_rows: usize) -> Table {
    parse_table_impl(normalize_line_endings(data, dialect), dialect, max_rows)
}
/// Parses `data` into a [`Table`] without normalizing line endings first.
///
/// Unlike [`parse_table`], the bytes are passed to the shared parser as-is.
/// NOTE(review): presumably intended for callers that have already
/// normalized the input — confirm against call sites.
/// A `max_rows` of 0 means no row limit.
pub(crate) fn parse_table_normalized(
data: &[u8],
dialect: &PotentialDialect,
max_rows: usize,
) -> Table {
parse_table_impl(data, dialect, max_rows)
}
/// Shared parser behind [`parse_table`] and [`parse_table_normalized`].
///
/// Reads records from `data` with a `csv` reader configured from `dialect`
/// (no header row, flexible record lengths, quoting enabled only when the
/// dialect has a quote byte). Reading stops at the row cap, at end of input,
/// or at the first record the reader fails to produce; rows read before that
/// point are kept. A `max_rows` of 0 means no row limit.
fn parse_table_impl<D: AsRef<[u8]>>(data: D, dialect: &PotentialDialect, max_rows: usize) -> Table {
    let mut builder = csv::ReaderBuilder::new();
    builder
        .delimiter(dialect.delimiter)
        .has_headers(false)
        .flexible(true)
        .buffer_capacity(32768);
    match dialect.quote {
        Quote::Some(quote_byte) => {
            builder.quoting(true).quote(quote_byte);
        }
        Quote::None => {
            builder.quoting(false);
        }
    }
    let mut reader = builder.from_reader(Cursor::new(data));

    let row_cap = match max_rows {
        0 => usize::MAX,
        cap => cap,
    };

    let mut table = Table::new();
    let mut record = csv::StringRecord::new();
    // Best effort: anything other than `Ok(true)` (end of input or a read
    // error) ends the loop without discarding what was already parsed.
    while table.rows.len() < row_cap && matches!(reader.read_record(&mut record), Ok(true)) {
        let fields: Vec<String> = record.iter().map(str::to_owned).collect();
        table.field_counts.push(fields.len());
        table.rows.push(fields);
    }

    table.update_modal_field_count();
    table
}
/// Rewrites line endings in `data` to `\n` according to the dialect's terminator.
///
/// * `LF`: the input is returned borrowed and unchanged.
/// * `CRLF`: every `\r\n` pair becomes a single `\n`; a `\r` not followed by
///   `\n` is kept.
/// * `CR`: every `\r` byte becomes `\n`.
fn normalize_line_endings<'a>(data: &'a [u8], dialect: &PotentialDialect) -> Cow<'a, [u8]> {
    use super::potential_dialects::LineTerminator;

    match dialect.line_terminator {
        LineTerminator::LF => Cow::Borrowed(data),
        LineTerminator::CRLF => {
            let mut out = Vec::with_capacity(data.len());
            let mut bytes = data.iter().copied().peekable();
            while let Some(byte) = bytes.next() {
                // Drop a CR that is immediately followed by LF; the LF itself
                // is emitted on the next iteration.
                if byte == b'\r' && bytes.peek() == Some(&b'\n') {
                    continue;
                }
                out.push(byte);
            }
            Cow::Owned(out)
        }
        LineTerminator::CR => {
            let mut out = data.to_vec();
            for byte in &mut out {
                if *byte == b'\r' {
                    *byte = b'\n';
                }
            }
            Cow::Owned(out)
        }
    }
}
/// Line-based parser that splits each line with [`split_line`] instead of
/// using the `csv` reader.
///
/// Line endings are normalized first. Empty lines and lines that fail to
/// decode are skipped. `max_rows` caps the number of rows stored in the
/// table (0 means no limit); skipped lines do not count toward the cap,
/// matching the row-based limit applied by `parse_table_impl`.
// Not referenced by the main parsing path visible in this file.
#[allow(dead_code)]
pub fn parse_table_simple(data: &[u8], dialect: &PotentialDialect, max_rows: usize) -> Table {
    let mut table = Table::new();
    let normalized = normalize_line_endings(data, dialect);
    let cursor = Cursor::new(normalized.as_ref());
    let limit = if max_rows == 0 { usize::MAX } else { max_rows };

    for line in cursor.lines() {
        // Cap on rows actually kept, not on raw lines read: otherwise blank
        // or undecodable lines would eat into the caller's row budget.
        if table.rows.len() >= limit {
            break;
        }
        let Ok(line) = line else { continue };
        if line.is_empty() {
            continue;
        }
        let fields = split_line(&line, dialect);
        table.field_counts.push(fields.len());
        table.rows.push(fields);
    }

    table.update_modal_field_count();
    table
}
/// Splits one line into trimmed fields using the dialect's delimiter and
/// optional quote character.
///
/// Inside a quoted section the delimiter is literal and a doubled quote
/// yields one quote character; quote characters themselves are otherwise
/// dropped. Every field, quoted or not, is whitespace-trimmed. The result
/// always contains at least one field.
// Only used by `parse_table_simple`, which is itself allowed to be dead code.
#[allow(dead_code)]
fn split_line(line: &str, dialect: &PotentialDialect) -> Vec<String> {
    let separator = char::from(dialect.delimiter);
    let quote = match dialect.quote {
        Quote::Some(q) => Some(char::from(q)),
        Quote::None => None,
    };

    let mut out = Vec::new();
    let mut buf = String::new();
    let mut quoted = false;
    let mut stream = line.chars().peekable();

    while let Some(ch) = stream.next() {
        if quote == Some(ch) {
            if !quoted {
                quoted = true;
            } else if stream.next_if_eq(&ch).is_some() {
                // Doubled quote inside a quoted section: emit one literal quote.
                buf.push(ch);
            } else {
                quoted = false;
            }
        } else if ch == separator && !quoted {
            out.push(buf.trim().to_string());
            buf.clear();
        } else {
            buf.push(ch);
        }
    }

    // The final field has no trailing delimiter, so flush it explicitly.
    out.push(buf.trim().to_string());
    out
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tum::potential_dialects::LineTerminator;

    /// Comma-delimited, double-quote, LF-terminated dialect shared by the parsing tests.
    fn comma_lf_dialect() -> PotentialDialect {
        PotentialDialect::new(b',', Quote::Some(b'"'), LineTerminator::LF)
    }

    // Three uniform rows parse into three records of three fields each.
    #[test]
    fn test_parse_simple_csv() {
        let dialect = comma_lf_dialect();
        let table = parse_table(b"a,b,c\n1,2,3\n4,5,6\n", &dialect, 0);
        assert_eq!(table.num_rows(), 3);
        assert_eq!(table.field_counts, vec![3, 3, 3]);
        assert_eq!(table.rows[0], vec!["a", "b", "c"]);
    }

    // A delimiter inside quotes stays part of the field.
    #[test]
    fn test_parse_quoted_csv() {
        let dialect = comma_lf_dialect();
        let table = parse_table(b"\"a,b\",c,d\n1,2,3\n", &dialect, 0);
        assert_eq!(table.num_rows(), 2);
        assert_eq!(table.rows[0], vec!["a,b", "c", "d"]);
    }

    // The modal count is the most frequent entry of `field_counts`.
    #[test]
    fn test_modal_field_count() {
        let mut table = Table::new();
        table.field_counts = vec![3, 3, 3, 4, 3];
        table.update_modal_field_count();
        assert_eq!(table.modal_field_count(), 3);
    }
}