use super::*;
/// Ordered list of column header names.
///
/// When built through its `FromIterator` impl, every name is stored with
/// leading and trailing whitespace trimmed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Headers(Vec<String>);
/// A table: one set of column headers plus rows of cells.
///
/// `Data::new` rejects any row whose cell count differs from the number of
/// headers, so every stored row has exactly one cell per column.
pub struct Data {
/// Column headers shared by every row.
cols: Headers,
/// Cell storage, one inner `Vec` per row.
rows: Vec<Vec<Cell>>,
}
/// A single table cell: either a number or free text.
///
/// Equality is derived and therefore follows `f64` semantics: a `Num`
/// holding NaN does not compare equal to itself.
#[derive(Debug, Clone, PartialEq)]
pub enum Cell {
    /// A numeric value.
    Num(f64),
    /// A textual value.
    Txt(String),
}
/// Generates `From<T> for Cell` impls from a list of
/// `[SourceType : Variant conversion_path]` entries.
///
/// For each entry the source value is passed through `conversion_path` and the
/// result is wrapped in `Cell::Variant`.
macro_rules! cell_impl {
    ($([$src:ty : $variant:ident $conv:path])*) => {
        $(
            impl From<$src> for Cell {
                fn from(value: $src) -> Self {
                    Cell::$variant($conv(value))
                }
            }
        )*
    };
}
// Conversions into `Cell`:
// - `f64` and `String` are wrapped unchanged (`identity`),
// - `f32` goes through `From::from` into the `f64` held by `Cell::Num`,
// - `&str` is copied into an owned `String` via `to_string`.
cell_impl! {
[f64:Num std::convert::identity]
[f32:Num From::from]
[String:Txt std::convert::identity]
[&str:Txt ToString::to_string]
}
impl Headers {
    /// Returns the number of header names.
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Returns `true` when there are no header names.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Returns the index of the first header exactly equal to `s`.
    pub fn find(&self, s: &str) -> Option<usize> {
        self.find_match(|name| name == s)
    }

    /// Returns the index of the first header equal to `s` when ASCII case is
    /// ignored.
    pub fn find_ignore_case(&self, s: &str) -> Option<usize> {
        self.find_match(|name| name.eq_ignore_ascii_case(s))
    }

    /// Returns the index of the first header equal to `s` when both ASCII case
    /// and all whitespace are ignored.
    pub fn find_ignore_case_and_ws(&self, s: &str) -> Option<usize> {
        self.find_match(|name| str_eq_ignore_case_and_ws(name, s))
    }

    /// Returns the index of the first header for which `predicate` is true,
    /// or `None` when no header satisfies it.
    pub fn find_match<P>(&self, predicate: P) -> Option<usize>
    where
        P: Fn(&str) -> bool,
    {
        self.0.iter().position(|name| predicate(name.as_str()))
    }

    /// Searches the headers for names similar to `s`.
    ///
    /// Every header is indexed in a fresh `simsearch` engine keyed by its
    /// position; hits are yielded in the order the search returns them, each
    /// with all whitespace characters removed.
    pub fn fuzzy_match(&self, s: &str) -> impl Iterator<Item = String> + '_ {
        let mut engine = simsearch::SimSearch::new();
        for (idx, name) in self.0.iter().enumerate() {
            engine.insert(idx, name);
        }
        let hits = engine.search(s);
        hits.into_iter()
            .filter_map(|idx| self.0.get(idx))
            .map(|name| {
                name.chars()
                    .filter(|c| !c.is_whitespace())
                    .collect::<String>()
            })
    }
}
/// Collects string-like items into `Headers`, trimming surrounding whitespace
/// from each name.
impl<T: AsRef<str>> FromIterator<T> for Headers {
    fn from_iter<I: IntoIterator<Item = T>>(i: I) -> Self {
        let names: Vec<String> = i
            .into_iter()
            .map(|raw| String::from(raw.as_ref().trim()))
            .collect();
        Headers(names)
    }
}
impl Data {
    /// Builds a table from `headers` and an iterator of rows.
    ///
    /// Each cell is converted with `Into<Cell>`.
    ///
    /// # Errors
    /// Fails on the first row whose cell count differs from `headers.len()`;
    /// the message carries that row's zero-based index.
    pub fn new<D, R, C>(headers: Headers, data: D) -> Result<Self>
    where
        D: IntoIterator<Item = R>,
        R: IntoIterator<Item = C>,
        C: Into<Cell>,
    {
        let width = headers.len();
        let data = data.into_iter();
        // Reserve from the lower bound only: the upper bound of `size_hint` may
        // be far larger than the real row count (e.g. a filtered iterator) and
        // can make `with_capacity` over-allocate or panic on capacity overflow.
        let (lower, _) = data.size_hint();
        let mut rows = Vec::with_capacity(lower);
        for (i, row) in data.enumerate() {
            let row: Vec<Cell> = row.into_iter().map(Into::into).collect();
            ensure!(
                width == row.len(),
                "row index {} does not have the same length as the headers",
                i
            );
            rows.push(row);
        }
        Ok(Self {
            cols: headers,
            rows,
        })
    }

    /// Returns the number of rows.
    pub fn len(&self) -> usize {
        self.rows.len()
    }

    /// Returns `true` when the table has no rows.
    pub fn is_empty(&self) -> bool {
        self.rows.is_empty()
    }

    /// Returns the column headers.
    pub fn headers(&self) -> &Headers {
        &self.cols
    }

    /// Iterates over the rows as borrowed `DataRow` views, each carrying its
    /// zero-based position and a reference to the headers.
    pub fn rows(&self) -> impl ExactSizeIterator<Item = DataRow<'_>> {
        let hdrs = &self.cols;
        self.rows
            .iter()
            .enumerate()
            .map(move |(idx, vals)| DataRow { idx, vals, hdrs })
    }
}
/// A borrowed view of one row of a `Data` table.
#[derive(Copy, Clone)]
pub struct DataRow<'a> {
/// Zero-based position of this row within the table.
idx: usize,
/// The cells of this row.
vals: &'a [Cell],
/// The headers of the table this row belongs to.
hdrs: &'a Headers,
}
impl<'a> DataRow<'a> {
    /// Reads the cell at `colidx` as a number.
    ///
    /// Returns `None` when `colidx` is out of range, `Some(Ok(_))` for a
    /// numeric cell, and `Some(Err(_))` for a text cell. The error is wrapped
    /// with the column index and with this row's index plus one.
    pub fn get_num(&self, colidx: usize) -> Option<Result<f64>> {
        let cell = self.vals.get(colidx)?;
        let value: Result<f64> = match cell {
            Cell::Num(n) => Ok(*n),
            Cell::Txt(raw) => Err(miette!("failed to parse '{}' as number", raw))
                .wrap_err_with(|| format!("in column index {colidx}"))
                .wrap_err_with(|| format!("in row index {}", self.idx + 1)),
        };
        Some(value)
    }

    /// Returns the zero-based position of this row.
    pub fn idx(&self) -> usize {
        self.idx
    }

    /// Returns the headers of the table this row belongs to.
    pub fn headers(&self) -> &Headers {
        self.hdrs
    }
}
/// A CSV reader over a boxed `std::io::Read` source that caches the header
/// row once it has been read.
pub struct CsvReader {
/// Underlying CSV reader.
rdr: csv::Reader<Box<dyn std::io::Read>>,
/// Cached headers; `None` until `read_headers` has succeeded.
cols: Option<Headers>,
}
impl CsvReader {
    /// Wraps `rdr` in a CSV reader. No input is read and no headers are
    /// cached until they are requested.
    pub fn new<R: std::io::Read + 'static>(rdr: R) -> Self {
        let source: Box<dyn std::io::Read> = Box::new(rdr);
        Self {
            rdr: csv::Reader::from_reader(source),
            cols: None,
        }
    }

    /// Reads the header row from the underlying reader and caches it.
    ///
    /// Fails when the header row cannot be read or contains no fields.
    fn read_headers(&mut self) -> Result<()> {
        let record = self
            .rdr
            .headers()
            .into_diagnostic()
            .wrap_err("failed to read CSV header row")?;
        ensure!(!record.is_empty(), "headers row is empty");
        let parsed: Headers = record.iter().collect();
        self.cols = Some(parsed);
        Ok(())
    }

    /// Returns the headers, reading them from the input on first use.
    ///
    /// # Errors
    /// Propagates any failure from reading the header row.
    pub fn headers(&mut self) -> Result<&Headers> {
        if self.cols.is_none() {
            self.read_headers()?;
        }
        let cached = self
            .cols
            .as_ref()
            .expect("should be some if read_headers succeeds");
        Ok(cached)
    }

    /// Consumes the reader and parses all records into a `Data` table.
    pub fn into_data(self) -> Result<Data> {
        Data::try_from(self)
    }
}
impl TryFrom<CsvReader> for Data {
    type Error = miette::Report;

    /// Reads the headers and every record from `rdr` into a `Data` table.
    ///
    /// A field becomes `Cell::Num` when it parses as `f64` and `Cell::Txt`
    /// otherwise. A record that fails to read aborts the conversion with an
    /// error naming its one-based position.
    fn try_from(mut rdr: CsvReader) -> Result<Data> {
        // Numeric when the field parses as `f64`, text otherwise.
        fn to_cell(field: &str) -> Cell {
            match field.parse::<f64>() {
                Ok(n) => Cell::Num(n),
                Err(_) => Cell::Txt(field.to_string()),
            }
        }

        // Make sure `rdr.cols` is populated before the records are consumed.
        rdr.headers()?;

        let rows = rdr
            .rdr
            .records()
            .enumerate()
            .map(|(i, record)| -> Result<Vec<Cell>> {
                let record = record
                    .into_diagnostic()
                    .wrap_err_with(|| format!("failed to read row {} in CSV", i + 1))?;
                Ok(record.iter().map(to_cell).collect())
            })
            .collect::<Result<Vec<_>>>()?;

        let headers = rdr.cols.expect("headers should be initialised");
        Data::new(headers, rows)
    }
}
/// Compares `a` and `b` while ignoring every whitespace character and ASCII
/// letter case. Non-ASCII characters must match exactly.
fn str_eq_ignore_case_and_ws(a: &str, b: &str) -> bool {
    // Drops whitespace and lowercases ASCII letters so plain equality suffices.
    fn normalised<'s>(s: &'s str) -> impl Iterator<Item = char> + 's {
        s.chars()
            .filter(|c| !c.is_whitespace())
            .map(|c| c.to_ascii_lowercase())
    }

    normalised(a).eq(normalised(b))
}
pub(crate) fn match_hdr_help(hdrs: &Headers, col: &str) -> String {
let mut s = String::from("help - these headers are similar:");
let l = s.len();
for h in hdrs.fuzzy_match(col) {
s.push(' ');
s.push_str(&h);
}
if s.len() == l {
s.clear();
s.push_str("help - no columns match, use `cat <file> | head -n1` for inspect headers");
}
s
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::iter::repeat_with;

    /// Random string of fewer than 100 arbitrary chars.
    fn random_string() -> String {
        let len = fastrand::usize(..100);
        repeat_with(|| fastrand::char(..)).take(len).collect()
    }

    /// Copy of `s` with every whitespace character removed.
    fn without_ws(s: &str) -> String {
        s.chars().filter(|c| !c.is_whitespace()).collect()
    }

    #[test]
    fn eq_testing() {
        let f = str_eq_ignore_case_and_ws;
        assert!(f("", ""));
        assert!(f(" ", " "));
        assert!(f(" a ", " a"));
    }

    /// Inserting spaces at random positions must never break equality.
    #[test]
    fn fuzz_eq_testing() {
        for _ in 0..10_000 {
            let a = random_string();
            let mut b = String::new();
            for c in a.chars() {
                // Zero or one space before each char.
                if fastrand::usize(..2) == 1 {
                    b.push(' ');
                }
                b.push(c);
            }
            assert!(str_eq_ignore_case_and_ws(&a, &b));
        }
    }

    /// The result must agree with stripping whitespace first and then
    /// comparing with `str::eq_ignore_ascii_case`.
    #[test]
    fn fuzz_eq_testing2() {
        for _ in 0..10_000 {
            let a = random_string();
            let b = random_string();
            let x = str_eq_ignore_case_and_ws(&a, &b);
            let y = without_ws(&a).eq_ignore_ascii_case(&without_ws(&b));
            assert_eq!(x, y);
        }
    }
}