use alloc::borrow::Cow;
use alloc::str::from_utf8;
use alloc::vec;
use alloc::vec::Vec;
use bytecount::count;
use memchr::memchr;
use crate::error::EtError;
use crate::parsers::common::NewLine;
use crate::parsers::extract;
use crate::parsers::tsv::TsvParams;
use crate::record::Value;
/// Single-pass accumulator for the count, mean, variance and extrema of a
/// stream of `f64` values (Welford's online algorithm).
#[derive(Clone, Copy, Debug)]
pub struct StreamingStats {
    /// Number of values recorded so far.
    pub n: usize,
    /// Running arithmetic mean of the recorded values (`0.0` while empty).
    pub mean: f64,
    /// Running sum of squared deviations from the mean.
    m2: f64,
    /// Smallest value recorded so far (`f64::MAX` while empty).
    pub min: f64,
    /// Largest value recorded so far (`f64::MIN` while empty).
    pub max: f64,
}

impl Default for StreamingStats {
    /// Same as [`StreamingStats::new`].
    ///
    /// A derived `Default` would zero `min`/`max`, which would make an empty
    /// accumulator look as if it had already seen `0.0`.
    fn default() -> Self {
        Self::new()
    }
}

impl StreamingStats {
    /// Creates an empty accumulator.
    ///
    /// `min` and `max` start at the opposite ends of the finite `f64` range so
    /// the first recorded value always replaces them.
    #[must_use]
    pub fn new() -> Self {
        StreamingStats {
            n: 0,
            mean: 0.,
            m2: 0.,
            min: f64::MAX,
            max: f64::MIN,
        }
    }

    /// Records one more value, updating the count, mean, spread and extrema.
    #[allow(clippy::cast_precision_loss)]
    pub fn update(&mut self, val: f64) {
        self.n += 1;
        // `delta` is measured against the old mean and `delta2` against the
        // new one; their product is the increment of the squared deviations.
        let delta = val - self.mean;
        self.mean += delta / self.n as f64;
        let delta2 = val - self.mean;
        self.m2 += delta * delta2;
        self.min = self.min.min(val);
        self.max = self.max.max(val);
    }

    /// Returns the population variance (sum of squared deviations divided by `n`).
    ///
    /// While no value has been recorded this evaluates `0.0 / 0.0` and
    /// therefore returns NaN.
    #[allow(clippy::cast_precision_loss)]
    #[must_use]
    pub fn variance(&self) -> f64 {
        self.m2 / self.n as f64
    }
}
/// Splits one line into fields on `delim`, honouring `quote`-wrapped fields.
///
/// Fields are written into `buffer` starting at index 0; existing slots are
/// overwritten and the buffer grows when needed. On success the buffer is
/// truncated to the number of fields found, and that number is returned.
/// A quoted field has its surrounding quotes removed and every doubled quote
/// inside it collapsed into a single one. A line ending in `delim` yields a
/// final empty field.
///
/// # Errors
/// Fails when a quoted field is never closed, when a quote inside a quoted
/// field is followed by anything other than a delimiter, another quote or the
/// end of the line, or when a field is not valid UTF-8. The buffer is not
/// truncated in that case.
#[inline]
pub(crate) fn split<'a>(
    buffer: &mut Vec<Cow<'a, str>>,
    line: &'a [u8],
    delim: u8,
    quote: u8,
) -> Result<usize, EtError> {
    let line_len = line.len();
    let mut pos = 0;
    let mut n_fields = 0;
    while pos < line_len {
        // Make sure a slot exists for this field; it is overwritten below.
        if buffer.len() <= n_fields {
            buffer.push(Cow::Borrowed(""));
        }
        if line[pos] != quote {
            // Unquoted field: runs to the next delimiter or the end of the line.
            let end = match memchr(delim, &line[pos..]) {
                Some(offset) => pos + offset,
                None => line_len,
            };
            buffer[n_fields] = Cow::Borrowed(from_utf8(&line[pos..end])?);
            pos = end;
        } else {
            // Quoted field. On entry `pos` is the opening quote; after an
            // escaped pair it is the second quote of that pair.
            let mut has_escaped_quote = false;
            loop {
                let start = pos + 1;
                let close = match memchr(quote, &line[start..]) {
                    Some(offset) => start + offset,
                    None => return Err("unclosed delimiter".into()),
                };
                let after = close + 1;
                let is_closing = after == line_len || line[after] == delim;
                if !is_closing && line[after] != quote {
                    return Err("quotes must start and end next to delimiters".into());
                }
                // A closing quote is dropped; an escaped pair keeps its first quote.
                let piece_end = if is_closing { close } else { after };
                let piece = from_utf8(&line[start..piece_end])?;
                if has_escaped_quote {
                    buffer[n_fields] += piece;
                } else {
                    buffer[n_fields] = Cow::Borrowed(piece);
                }
                pos = after;
                if is_closing {
                    break;
                }
                has_escaped_quote = true;
            }
        }
        // Step over the delimiter (or past the end of the line).
        pos += 1;
        n_fields += 1;
    }
    // A trailing delimiter means there is one more, empty, field.
    if line.last() == Some(&delim) {
        if n_fields < buffer.len() {
            buffer[n_fields] = Cow::Borrowed("");
        } else {
            buffer.push(Cow::Borrowed(""));
        }
        n_fields += 1;
    }
    buffer.truncate(n_fields);
    Ok(n_fields)
}
/// Guesses the quote character, delimiter and number of leading lines to skip
/// from a sample of the file, filling in only the `params` fields that are
/// still `None`.
///
/// * Quote: `'` when single quotes outnumber double quotes in the sample,
///   otherwise `"`.
/// * Delimiter: among the candidate bytes that occur on average at least once
///   per line (three times for a space), the one whose per-line count has the
///   lowest variance; `,` when no candidate qualifies.
/// * Skip lines: the index of the line that starts the most recent run of
///   lines whose delimiter count is within one of the average; scanning stops
///   once such a run has reached six lines.
pub fn sniff_params_from_data(params: &mut TsvParams, data: &[u8]) {
    // Pass 1: per-line byte statistics for every candidate delimiter.
    let mut stats = [StreamingStats::new(); N_DELIMS];
    let mut quote_balance = 0;
    let consumed = &mut 0;
    while let Ok(NewLine(line)) = extract(data, consumed, &mut 0) {
        count_bytes(line, &mut stats, &mut quote_balance);
    }

    if params.quote_char.is_none() {
        let quote = if quote_balance < 0 { b'\'' } else { b'"' };
        params.quote_char = Some(quote);
    }

    // The last stats bucket is the catch-all for non-delimiter bytes, so it is
    // left out. On equal variance the earliest candidate in `DELIMS` wins.
    let best = stats
        .iter()
        .take(N_DELIMS - 1)
        .zip(DELIMS.iter())
        .filter_map(|(stat, &delim)| {
            let required = if delim == b' ' { 3. } else { 1. };
            if stat.mean >= required {
                Some((stat.variance(), stat.mean, delim))
            } else {
                None
            }
        })
        .min_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(core::cmp::Ordering::Equal));
    let (delim_char, avg_delims) = match best {
        Some((_, mean, delim)) => (delim, mean),
        None => (b',', 0.),
    };
    if params.delim_char.is_none() {
        params.delim_char = Some(delim_char);
    }

    // Pass 2: find where the run of data-like lines begins.
    let consumed = &mut 0;
    let mut skip_lines = 0;
    let mut run_len = 0;
    let mut line_no = 0;
    while let Ok(NewLine(line)) = extract(data, consumed, &mut 0) {
        #[allow(clippy::cast_precision_loss)]
        let looks_like_data = (count(line, delim_char) as f64 - avg_delims).abs() < 1.;
        if looks_like_data {
            match run_len {
                0 => skip_lines = line_no,
                5 => break,
                _ => {}
            }
            run_len += 1;
        } else {
            run_len = 0;
        }
        line_no += 1;
    }
    if params.skip_lines.is_none() {
        params.skip_lines = Some(skip_lines);
    }
}
pub fn sniff_types_from_data(params: &mut TsvParams, data: &[u8]) {
let delim_char = params.delim_char.unwrap_or(DEFAULT_DELIM);
let mut fields = vec![Cow::Borrowed(""); 32];
let mut types: Vec<TsvFieldType> = Vec::new();
let mut line_ix = 0;
let con = &mut 0;
while let Ok(NewLine(line)) = extract(data, con, &mut 0) {
if line_ix < params.skip_lines.unwrap_or(0) + 1 {
line_ix += 1;
continue;
}
drop(split(
&mut fields,
line,
delim_char,
params.quote_char.unwrap_or(b'"'),
));
for (field_ix, field) in fields.iter().enumerate() {
if field_ix >= types.len() {
let mut ty = TsvFieldType::default();
ty.infer(field);
types.push(ty);
} else {
types[field_ix].infer(field);
}
}
line_ix += 1;
}
params.types = types;
}
/// Candidate delimiter bytes, in the same order as the per-delimiter buckets
/// filled by `count_bytes`.
const DELIMS: &[u8] = &[b'\t', b';', b':', b'|', b'~', b',', b'^', b' '];
/// Number of statistics buckets: one per candidate delimiter plus a final
/// catch-all bucket for every other byte.
const N_DELIMS: usize = DELIMS.len() + 1;
/// Delimiter assumed when none was configured or sniffed.
pub const DEFAULT_DELIM: u8 = b'\t';
/// Quote character assumed when none was configured or sniffed.
pub const DEFAULT_QUOTE: u8 = b'"';
/// Tallies the candidate delimiter bytes of one line and feeds each tally into
/// the matching entry of `stats`.
///
/// Buckets 0 to 7 follow the order of `DELIMS`; the last bucket collects every
/// other byte. `quote_diff` is incremented for each `"` and decremented for
/// each `'`, so its sign tells which quote character dominates.
fn count_bytes(line: &[u8], stats: &mut [StreamingStats; N_DELIMS], quote_diff: &mut i32) {
    const OTHER: usize = N_DELIMS - 1;
    // `u32` with saturating adds: the catch-all bucket counts nearly every
    // byte of the line, so a 16-bit counter overflows on lines of 64 KiB or
    // more (panic in debug builds, wrapped and therefore wrong in release).
    let mut counts = [0u32; N_DELIMS];
    for &byte in line {
        let bucket = match byte {
            b'\t' => 0,
            b';' => 1,
            b':' => 2,
            b'|' => 3,
            b'~' => 4,
            b',' => 5,
            b'^' => 6,
            b' ' => 7,
            b'\'' => {
                *quote_diff = quote_diff.saturating_sub(1);
                OTHER
            }
            b'"' => {
                *quote_diff = quote_diff.saturating_add(1);
                OTHER
            }
            _ => OTHER,
        };
        counts[bucket] = counts[bucket].saturating_add(1);
    }
    for (tally, stat) in counts.iter().zip(stats.iter_mut()) {
        stat.update(f64::from(*tally));
    }
}
// Bit flags for the value types a column may hold. `TsvFieldType::coerce`
// dispatches on the highest flag that is still set.
/// The column can be kept as text.
const TSV_STR: u8 = 0b0000_0001;
/// The column can be read as booleans.
const TSV_BOOL: u8 = 0b0000_0010;
/// The column can be read as floating point numbers.
const TSV_FLOAT: u8 = 0b0000_0100;
/// The column can be read as integers.
const TSV_INT: u8 = 0b0000_1000;
/// The column can be read as dates.
const TSV_DATE: u8 = 0b0001_0000;
/// The set of value types still considered possible for one TSV column.
#[derive(Clone, Copy, Debug)]
pub struct TsvFieldType {
/// Bitwise OR of the `TSV_*` flags that have not been ruled out yet.
ty: u8,
}
impl Default for TsvFieldType {
fn default() -> Self {
TsvFieldType { ty: u8::MAX }
}
}
impl TsvFieldType {
    /// Narrows the set of possible column types using one more observed value.
    ///
    /// The value is trimmed first. Every value is compatible with text. The
    /// literals `T`/`F`, `t`/`f`, `TRUE`/`FALSE`, `True`/`False` and
    /// `true`/`false` are compatible with booleans. A value made only of
    /// digits, spaces, signs, periods and commas (with at least one digit) is
    /// compatible with floats, and additionally with integers when it contains
    /// neither a period nor a comma.
    ///
    /// Integer-looking values deliberately keep the float flag too: otherwise
    /// a column mixing `1` and `2.5` would lose both numeric flags and fall
    /// back to text. `coerce` prefers the integer flag over the float flag, so
    /// all-integer columns are still read as integers.
    pub fn infer(&mut self, field: &str) {
        let field = field.trim();
        let mut possible_type = TSV_STR;
        if matches!(
            field,
            "F" | "f" | "FALSE" | "false" | "False" | "T" | "t" | "TRUE" | "true" | "True"
        ) {
            possible_type |= TSV_BOOL;
        }

        let mut has_digit = false;
        let mut has_other = false;
        let mut has_decimal_mark = false;
        for chr in field.chars() {
            match chr {
                '0'..='9' => has_digit = true,
                '.' | ',' => has_decimal_mark = true,
                ' ' | '+' | '-' => {}
                _ => has_other = true,
            }
        }
        if has_digit && !has_other {
            possible_type |= TSV_FLOAT;
            if !has_decimal_mark {
                possible_type |= TSV_INT;
            }
        }

        // Keep only the types every value seen so far is compatible with.
        self.ty &= possible_type;
    }

    /// Converts one field of this column into a `Value` of the inferred type.
    ///
    /// The highest flag still set decides the target type. Numeric fields that
    /// fail to parse are returned as the original, untrimmed text. Boolean
    /// columns map the "true" literals to `true` and everything else to
    /// `false`. Text, date and not-yet-inferred columns return the field
    /// unchanged.
    #[allow(clippy::match_same_arms)]
    #[must_use]
    pub fn coerce<'a>(&self, field: Cow<'a, str>) -> Value<'a> {
        let f = field.trim();
        // `128 >> leading_zeros` isolates the highest set bit of `ty`.
        match 128 >> self.ty.leading_zeros() {
            TSV_STR => Value::from(field),
            TSV_BOOL => Value::Boolean(matches!(f, "T" | "t" | "TRUE" | "True" | "true")),
            TSV_FLOAT => f
                .parse::<f64>()
                .map_or_else(|_| Value::from(field), Value::from),
            TSV_INT => f
                .parse::<i64>()
                .map_or_else(|_| Value::from(field), Value::from),
            TSV_DATE => Value::from(field),
            _ => Value::from(field),
        }
    }
}
// Unit tests for field splitting and parameter sniffing.
#[cfg(test)]
mod test {
use super::*;
use crate::error::EtError;
// `split` on plain fields, quoted fields, doubled (escaped) quotes and
// malformed quoting. The same buffer is reused across all calls.
#[test]
fn test_split() -> Result<(), EtError> {
let mut buffer = Vec::new();
assert_eq!(split(&mut buffer, b"1,2,3,4", b',', b'"')?, 4);
assert_eq!(&buffer, &["1", "2", "3", "4"]);
// a delimiter inside quotes does not split the field
assert_eq!(split(&mut buffer, b"1,\"2,3\",4", b',', b'"')?, 3);
assert_eq!(&buffer, &["1", "2,3", "4"]);
// doubled quotes inside a quoted field collapse to one quote
assert_eq!(split(&mut buffer, b"1,\"2,\"\"3\"\"\",4", b',', b'"')?, 3);
assert_eq!(&buffer, &["1", "2,\"3\"", "4"]);
assert_eq!(
split(&mut buffer, b"1,\"\"\"2\"\",\"\"3\"\"\",4", b',', b'"')?,
3
);
assert_eq!(&buffer, &["1", "\"2\",\"3\"", "4"]);
assert_eq!(split(&mut buffer, b"\"\"\"\"\"2\"\"\"\"\"", b',', b'"')?, 1);
assert_eq!(&buffer, &["\"\"2\"\""]);
// an unclosed quote is an error
assert!(split(&mut buffer, b"\"", b',', b'"').is_err());
// a closing quote must be followed directly by a delimiter or the line end
assert!(split(&mut buffer, b"\"\" ,2", b',', b'"').is_err());
Ok(())
}
// `sniff_params_from_data` on tab-delimited, pipe-delimited and
// comment-prefixed single-quoted samples.
#[test]
fn test_sniff_params() -> Result<(), EtError> {
let mut params = TsvParams::default();
sniff_params_from_data(&mut params, b"test\tthis\theader\n1\t2\t3");
assert_eq!(params.delim_char, Some(b'\t'));
assert_eq!(params.quote_char, Some(b'"'));
assert_eq!(params.skip_lines, Some(0));
// both `,` and `|` occur on every line; `|` is expected to be chosen
let mut params = TsvParams::default();
sniff_params_from_data(&mut params, b"1,0|2,0|3,0\n4,0|5,0|6,0");
assert_eq!(params.delim_char, Some(b'|'));
assert_eq!(params.quote_char, Some(b'"'));
assert_eq!(params.skip_lines, Some(0));
// a leading comment line is skipped and single quotes are detected
let mut params = TsvParams::default();
sniff_params_from_data(&mut params, b"this is a comment\n1,2,'a'\n4,5,'b'\n6,7,'c'");
assert_eq!(params.delim_char, Some(b','));
assert_eq!(params.quote_char, Some(b'\''));
assert_eq!(params.skip_lines, Some(1));
Ok(())
}
}