use std::{
borrow::Cow,
collections::{BTreeMap, HashMap},
fmt::Debug,
fs::File,
io::{BufRead, BufReader, Write},
ops::Range,
str::FromStr,
sync::Arc,
};
use context_error::*;
use flate2::bufread::GzDecoder;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use crate::helper_functions::check_extension;
/// A single line of a CSV file together with the location of every field in it.
///
/// The field values are not copied out of the line: each field is stored as a
/// byte range into `line`, paired with the name of the column it belongs to.
#[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
pub struct CsvLine {
/// The index of this line as counted by the reader that produced it.
pub line_index: usize,
/// The full, unmodified text of the line.
pub line: String,
/// For every column: the column name and the byte range of its value in `line`.
pub fields: Vec<(Arc<String>, Range<usize>)>,
/// The name of the source this line came from (if known), used as the source in error contexts.
pub file: Option<Arc<Box<str>>>,
}
#[allow(dead_code)]
impl CsvLine {
    /// The index of this line in the source it was read from.
    pub const fn line_index(&self) -> usize {
        self.line_index
    }

    /// The full text of this line.
    pub fn line(&self) -> &str {
        self.line.as_str()
    }

    /// Iterate over the column names of this line, in column order.
    pub fn headers(&self) -> impl Iterator<Item = &str> {
        self.fields.iter().map(|(name, _)| name.as_str())
    }

    /// Iterate over all `(column name, value)` pairs of this line, in column order.
    pub fn values(&self) -> impl Iterator<Item = (Arc<String>, &str)> {
        self.fields
            .iter()
            .map(|(name, span)| (Arc::clone(name), &self.line[span.clone()]))
    }

    /// The number of columns on this line.
    pub const fn number_of_columns(&self) -> usize {
        self.fields.len()
    }

    /// Build an error context showing this line with the given column highlighted,
    /// using the column name as the highlight comment. The source file is attached
    /// when it is known.
    ///
    /// # Panics
    /// If `column` is not a valid column index.
    pub fn column_context(&self, column: usize) -> Context<'_> {
        let (name, span) = &self.fields[column];
        let context = Context::none()
            .line_index(self.line_index as u32)
            .lines(0, &self.line)
            .add_highlight((0, span.clone(), name.as_str()));
        match &self.file {
            Some(source) => context.source(source.as_ref().as_ref()),
            None => context,
        }
    }

    /// Build an error context showing this line with the given byte range
    /// highlighted, optionally annotated with a comment. The source file is
    /// attached when it is known.
    pub fn range_context<'a>(
        &'a self,
        range: Range<usize>,
        comment: Option<Cow<'a, str>>,
    ) -> Context<'a> {
        let plain = Context::none()
            .line_index(self.line_index as u32)
            .lines(0, &self.line);
        let highlighted = match comment {
            Some(comment) => plain.add_highlight((0, range, comment)),
            None => plain.add_highlight((0, range)),
        };
        match &self.file {
            Some(source) => highlighted.source(source.as_ref().as_ref()),
            None => highlighted,
        }
    }

    /// Build an error context showing this whole line without any highlight.
    /// The source file is attached when it is known.
    pub fn full_context(&self) -> Context<'_> {
        let context = Context::none()
            .line_index(self.line_index as u32)
            .lines(0, &self.line);
        match &self.file {
            Some(source) => context.source(source.as_ref().as_ref()),
            None => context,
        }
    }

    /// The byte range in the line of the column with the given index.
    ///
    /// # Panics
    /// If `index` is not a valid column index.
    pub fn range(&self, index: usize) -> &Range<usize> {
        let (_, span) = &self.fields[index];
        span
    }

    /// Look up a column by name (ASCII case insensitive) and return its value
    /// together with its byte range in the line. The first matching column wins.
    ///
    /// # Errors
    /// If no column with the given name exists on this line.
    pub fn index_column<'a>(
        &'a self,
        name: &str,
    ) -> Result<(&'a str, &'a Range<usize>), BoxedError<'a, BasicKind>> {
        for (header, span) in &self.fields {
            if header.eq_ignore_ascii_case(name) {
                return Ok((&self.line[span.clone()], span));
            }
        }
        Err(BoxedError::new(
            BasicKind::Error,
            "Could not find given column",
            format!("This CSV file does not contain the needed column '{name}'"),
            self.full_context(),
        ))
    }

    /// Parse the value of the given column with its [`FromStr`] implementation.
    ///
    /// # Errors
    /// If parsing fails, `base_error` is returned with its context replaced by
    /// the context of the offending column. The parse error itself is discarded.
    ///
    /// # Panics
    /// If `column` is not a valid column index.
    pub fn parse_column<'a, F: FromStr>(
        &'a self,
        column: usize,
        base_error: BoxedError<'a, BasicKind>,
    ) -> Result<F, BoxedError<'a, BasicKind>> {
        match self[column].parse::<F>() {
            Ok(value) => Ok(value),
            Err(_) => Err(base_error.replace_context(self.column_context(column))),
        }
    }

    /// Parse the value of the given column, treating an empty value or a single
    /// `-` as "no value" (`None`).
    ///
    /// # Errors
    /// If a non-empty value fails to parse, `base_error` is returned with its
    /// context replaced by the context of the offending column.
    ///
    /// # Panics
    /// If `column` is not a valid column index.
    pub fn parse_column_or_empty<'a, F: FromStr>(
        &'a self,
        column: usize,
        base_error: BoxedError<'a, BasicKind>,
    ) -> Result<Option<F>, BoxedError<'a, BasicKind>> {
        let text = &self[column];
        if text.is_empty() || text == "-" {
            return Ok(None);
        }
        match text.parse::<F>() {
            Ok(value) => Ok(Some(value)),
            Err(_) => Err(base_error.replace_context(self.column_context(column))),
        }
    }
}
impl<Hasher: ::std::hash::BuildHasher + Default> From<&CsvLine>
    for HashMap<String, String, Hasher>
{
    /// Copy every `(column name, value)` pair of the line into a map keyed by
    /// column name. When a name occurs more than once the last column wins.
    fn from(value: &CsvLine) -> Self {
        let mut map = Self::with_hasher(Hasher::default());
        map.reserve(value.fields.len());
        for (name, span) in &value.fields {
            map.insert(name.as_str().to_owned(), value.line[span.clone()].to_owned());
        }
        map
    }
}
impl From<&CsvLine> for BTreeMap<String, String> {
    /// Copy every `(column name, value)` pair of the line into an ordered map
    /// keyed by column name. When a name occurs more than once the last column wins.
    fn from(value: &CsvLine) -> Self {
        let mut map = Self::new();
        for (name, span) in &value.fields {
            map.insert(name.as_str().to_owned(), value.line[span.clone()].to_owned());
        }
        map
    }
}
impl std::ops::Index<usize> for CsvLine {
    type Output = str;

    /// Get the text of the column with the given index.
    ///
    /// # Panics
    /// If `index` is not a valid column index.
    fn index(&self, index: usize) -> &str {
        let (_, span) = &self.fields[index];
        &self.line[span.clone()]
    }
}
/// Open the CSV file at `path` and return an iterator over its lines.
///
/// Files for which `check_extension(path, "gz")` holds are decompressed on the
/// fly with a gzip decoder. The separator and optional header are passed on to
/// [`parse_csv_raw`], and the path is recorded as the source of every line.
///
/// # Errors
/// If the file cannot be opened, or if [`parse_csv_raw`] fails to set up the
/// iterator.
pub fn parse_csv(
    path: impl AsRef<std::path::Path>,
    separator: u8,
    provided_header: Option<Vec<String>>,
) -> Result<
    Box<dyn Iterator<Item = Result<CsvLine, BoxedError<'static, BasicKind>>>>,
    BoxedError<'static, BasicKind>,
> {
    let path = path.as_ref();
    let file = File::open(path).map_err(|e| {
        BoxedError::new(
            BasicKind::Error,
            "Could not open file",
            e.to_string(),
            Context::default().source(path.to_string_lossy()).to_owned(),
        )
    })?;
    let source = Some(path.to_string_lossy().to_string().into_boxed_str());
    // The two branches produce differently typed iterators, so both are boxed
    // behind the same trait object.
    let lines: Box<dyn Iterator<Item = Result<CsvLine, BoxedError<'static, BasicKind>>>> =
        if check_extension(path, "gz") {
            let decoder = GzDecoder::new(BufReader::new(file));
            Box::new(parse_csv_raw(decoder, separator, provided_header, source)?)
        } else {
            Box::new(parse_csv_raw(file, separator, provided_header, source)?)
        };
    Ok(lines)
}
/// Set up an iterator over the lines of CSV data coming from any reader.
///
/// * A first line of the form `sep=<char>` (optionally preceded by a byte order
///   mark) overrides `separator` and is skipped. A bare `sep=` line is skipped
///   without changing the separator.
/// * Without `provided_header` the next line is consumed as the header; the
///   column names are lowercased.
/// * With `provided_header` those names are used as given. The next line is
///   only skipped when its fields are equal to the provided names, ignoring
///   case; otherwise it is kept and will be yielded as the first data line.
///
/// `path` is only stored so it can be attached as the source of each line.
///
/// Note that the byte order mark is only stripped for the `sep=` check and for
/// the header line, not for a line that is yielded as data.
///
/// # Errors
/// * If the `sep=` line names a character that needs more than one byte in UTF-8.
/// * If there is no header line to read, or reading it fails.
/// * If the header line cannot be split into fields by `csv_separate`.
pub fn parse_csv_raw<T: std::io::Read>(
    reader: T,
    mut separator: u8,
    provided_header: Option<Vec<String>>,
    path: Option<Box<str>>,
) -> Result<CsvLineIter<T>, BoxedError<'static, BasicKind>> {
    let reader = BufReader::new(reader);
    let mut lines = reader.lines().enumerate().peekable();
    // The `sep=` text borrows from the peeked line, so remember that the line
    // has to be skipped and only advance the iterator after the borrow ended.
    let mut skip = false;
    if let Some(sep) = lines
        .peek()
        .and_then(|(_, l)| l.as_ref().ok())
        .map(|l| l.trim_start_matches("\u{feff}"))
        .and_then(|l| l.strip_prefix("sep="))
    {
        skip = true;
        if let Some(c) = sep.chars().next() {
            if c.len_utf8() == 1 {
                separator = c as u8;
            } else {
                return Err(BoxedError::new(
                    BasicKind::Error,
                    "Unicode value separators not supported",
                    "This is a character that takes more than 1 byte to represent in Unicode, this is not supported in parsing CSV files.",
                    Context::line_with_comment(Some(0), format!("sep={sep}"), 4, sep.len(), None),
                ));
            }
        }
    }
    if skip {
        let _unused = lines.next();
    }
    let column_headers: Vec<Arc<String>> = if let Some(header) = provided_header {
        // Only peek: the first line might already be data.
        let (_, first) = lines.peek().ok_or_else(|| {
            BoxedError::new(
                BasicKind::Error,
                "Could not parse csv file",
                "The file is empty",
                Context::default(),
            )
        })?;
        let header_line = first
            .as_ref()
            .map_err(|err| {
                BoxedError::new(
                    BasicKind::Error,
                    "Could not read header line",
                    err.to_string(),
                    Context::default(),
                )
            })?
            .trim_start_matches("\u{feff}");
        let first_line = csv_separate(header_line, separator)
            .map_err(BoxedError::to_owned)?
            .into_iter()
            .map(|r| header_line[r].to_lowercase())
            .collect::<Vec<String>>();
        // Lowercase both sides, otherwise a provided header containing
        // uppercase letters could never match the lowercased file header and
        // the header line would be handed out as data.
        let expected = header
            .iter()
            .map(|name| name.to_lowercase())
            .collect::<Vec<String>>();
        if first_line == expected {
            // Skip the header line, it repeats the provided header
            drop(lines.next());
        }
        header.into_iter().map(Arc::new).collect()
    } else {
        let (_, first) = lines.next().ok_or_else(|| {
            BoxedError::new(
                BasicKind::Error,
                "Could not parse csv file",
                "The file is empty",
                Context::none(),
            )
        })?;
        let header_line = first.map_err(|err| {
            BoxedError::new(
                BasicKind::Error,
                "Could not read header line",
                err.to_string(),
                Context::none(),
            )
        })?;
        let header_line = header_line.trim_start_matches("\u{feff}");
        csv_separate(header_line, separator)
            .map_err(BoxedError::to_owned)?
            .into_iter()
            .map(|r| Arc::new(header_line[r].to_lowercase()))
            .collect()
    };
    Ok(CsvLineIter {
        lines,
        header: column_headers,
        separator,
        file: path.map(Arc::new),
    })
}
/// An iterator over the lines of CSV data read from `T`, yielding one
/// [`CsvLine`] (or an error) per remaining line.
#[derive(Debug)]
pub struct CsvLineIter<T: std::io::Read> {
/// The remaining lines, each paired with its index in the underlying reader.
lines: std::iter::Peekable<std::iter::Enumerate<std::io::Lines<BufReader<T>>>>,
/// The column names, shared with every line that is produced.
header: Vec<Arc<String>>,
/// The byte that separates fields on a line.
separator: u8,
/// The name of the source (if known), copied into every line that is produced.
file: Option<Arc<Box<str>>>,
}
impl<T: std::io::Read> Iterator for CsvLineIter<T> {
type Item = Result<CsvLine, BoxedError<'static, BasicKind>>;
fn next(&mut self) -> Option<Self::Item> {
self.lines.next().map(|(line_index, line)| {
let line = line.map_err(|err|BoxedError::new(BasicKind::Error,
"Could not read line",
err.to_string(),
Context::default().line_index(line_index as u32),
))?;
csv_separate(&line, self.separator).map_err(BoxedError::to_owned).and_then(|row| {
if self.header.len() == row.len() {
Ok(CsvLine {
line_index,
line,
fields: self.header.iter().cloned().zip(row).collect(),
file: self.file.clone(),
})
} else {
Err(BoxedError::new(BasicKind::Error,
"Incorrect number of columns",
format!("It does not have the correct number of columns. {} columns were expected but {} were found.", self.header.len(), row.len()),
Context::full_line(line_index as u32, line),
))
}
})
})
}
}
/// Split a single line into fields and return the byte range of every field in `line`.
///
/// The line is scanned byte by byte. A field that starts with `"` or `'` is
/// enclosed: it runs until the matching closing quote and may contain the
/// separator. A doubled quote inside an enclosed field does not close it; both
/// quote bytes stay part of the returned range (nothing is unescaped).
/// Whitespace before a field is skipped. Trailing whitespace is trimmed from
/// the last field, and from other unenclosed fields only when the separator
/// itself is ASCII whitespace.
///
/// # Errors
/// If the line is empty.
pub fn csv_separate(
line: &str,
separator: u8,
) -> Result<Vec<Range<usize>>, BoxedError<'_, BasicKind>> {
if line.is_empty() {
return Err(BoxedError::new(
BasicKind::Error,
"Empty line",
"The line is empty",
Context::none(),
));
}
// The quote byte that opened the current enclosed field, if any
let mut enclosed = None;
// Whether the field that was finished last was an enclosed field
let mut was_enclosed = false;
// Whether the previous byte was the first quote of a doubled quote
let mut was_double = false;
let mut row = Vec::new();
// Byte index where the current field starts, `None` while between fields
let mut start = None;
// One past the last non-whitespace byte seen in the current field
let mut last_non_whitespace = None;
// The order of the match arms matters: the separator arms come before the
// whitespace arm so that a whitespace separator (for example a tab) is
// still recognised as a separator.
for (index, ch) in line.bytes().enumerate() {
match (ch, enclosed, start) {
// A quote while no field is open starts an enclosed field after the quote
(b'\"' | b'\'', None, None) => {
enclosed = Some(ch);
start = Some(index + 1);
was_double = false;
}
// The enclosing quote byte seen inside an enclosed field
(c, Some(e), Some(s)) if c == e => {
if line.as_bytes().get(index + 1).copied() == Some(e) && !was_double {
// First quote of a doubled quote: stay inside the field
was_double = true;
} else if was_double {
// Second quote of a doubled quote
was_double = false;
} else {
// A single quote closes the field
enclosed = None;
// `c` equals the enclosing byte, which is always a quote, so the
// whitespace branch is not taken
if c.is_ascii_whitespace() {
row.push(s..last_non_whitespace.unwrap_or(index));
} else {
row.push(s..index);
}
start = None;
last_non_whitespace = None;
was_enclosed = true;
}
}
// A separator ends the current unenclosed field
(sep, None, Some(s)) if sep == separator => {
if sep.is_ascii_whitespace() {
// Trim trailing whitespace when the separator is whitespace itself
row.push(s..last_non_whitespace.unwrap_or(index));
} else {
row.push(s..index);
}
start = None;
last_non_whitespace = None;
was_enclosed = false;
}
// A separator while no field is open: either it directly follows an
// enclosed field (which was already pushed) or it marks an empty field
(sep, None, None) if sep == separator => {
if !was_enclosed {
row.push(index..index);
start = None;
last_non_whitespace = None;
}
was_enclosed = false;
}
// Whitespace never starts a field and does not move `last_non_whitespace`;
// any other byte while no field is open starts an unenclosed field
(c, _, _) if c.is_ascii_whitespace() => (), (_, _, None) => {
start = Some(index);
last_non_whitespace = Some(index + 1);
}
// Any other byte inside a field
_ => last_non_whitespace = Some(index + 1),
}
}
if let Some(s) = start {
// The line ended inside a field (this includes an unclosed enclosed field)
row.push(s..last_non_whitespace.unwrap_or(line.len()));
} else if !was_enclosed {
// The line ended while no field was open and not directly after an
// enclosed field, for example right after a separator: add an empty field
row.push(line.len()..line.len());
}
Ok(row)
}
impl std::fmt::Display for CsvLine {
    /// Display the line as a context with every field highlighted, followed by
    /// a newline.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let context = Context::default()
            .line_index(self.line_index as u32)
            .lines(0, &self.line)
            .add_highlights(self.fields.iter().map(|(_, span)| (0, span.clone())));
        writeln!(f, "{}", context)
    }
}
/// Write rows of `(column name, value)` pairs as CSV to `f`.
///
/// The columns are written in the order in which their names are first seen.
/// A row that lacks a column gets an empty value for it. Names and values that
/// contain the separator are wrapped in double quotes, with any double quote
/// inside them replaced by a single quote.
///
/// Column names are compared in their raw (unquoted) form, so a name that
/// contains the separator still identifies a single column across all rows.
///
/// # Errors
/// If writing to `f` fails.
#[allow(dead_code)]
pub fn write_csv(
    mut f: impl Write,
    data: impl IntoIterator<Item = impl IntoIterator<Item = (String, String)>>,
    separator: char,
) -> Result<(), std::io::Error> {
    /// Quote `text` if it contains the separator, otherwise return it unchanged.
    fn quote_if_needed(text: String, separator: char) -> String {
        if text.contains(separator) {
            format!("\"{}\"", text.replace('\"', "\'"))
        } else {
            text
        }
    }

    // Raw column names in output order. Quoting is postponed until the header
    // is written so the lookup below keeps matching names with a separator.
    let mut order: Vec<String> = Vec::new();
    let mut rows: Vec<Vec<String>> = Vec::new();
    for row in data {
        let mut cells = vec![String::new(); order.len()];
        for (column, value) in row {
            let value = quote_if_needed(value, separator);
            if let Some(index) = order.iter().position(|known| *known == column) {
                cells[index] = value;
            } else {
                order.push(column);
                cells.push(value);
            }
        }
        rows.push(cells);
    }

    let width = order.len();
    let separator_text = separator.to_string();
    let header: Vec<String> = order
        .into_iter()
        .map(|name| quote_if_needed(name, separator))
        .collect();
    writeln!(f, "{}", header.join(separator_text.as_str()))?;
    for mut cells in rows {
        // Earlier rows do not know about columns that were introduced later
        cells.resize(width, String::new());
        writeln!(f, "{}", cells.join(separator_text.as_str()))?;
    }
    Ok(())
}