use std::io::Read;
use super::CsvwError;
/// One parsed CSV record: the values of its fields, in the order they were read.
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)]
pub struct CsvRecord {
    /// Field values in column order.
    pub fields: Vec<String>,
}
/// Parser state for reading CSV records out of an input source.
///
/// The struct itself places no bound on `R`; the `Read` requirement is stated on
/// the `impl` block that actually reads from `inner`.
pub struct CsvReader<R> {
    /// Source the raw bytes are read from.
    inner: R,
    /// Byte that separates fields within a record.
    delimiter: u8,
    /// Byte that opens and closes a quoted field.
    quote: u8,
    /// Bytes read from `inner`.
    buf: Vec<u8>,
    /// Index into `buf` of the next unread byte.
    pos: usize,
    /// Current line number, starting at 1; used in error reports.
    line: usize,
    /// Whether `inner` has already been read into `buf`.
    loaded: bool,
}
impl<R: Read> CsvReader<R> {
    /// Creates a reader over `inner` that splits fields on commas.
    pub fn new(inner: R) -> Self {
        Self::with_delimiter(inner, b',')
    }

    /// Creates a reader over `inner` that splits fields on `delimiter`.
    ///
    /// The quote byte is always `"`. Nothing is read from `inner` until the first
    /// call to [`read_record`](Self::read_record).
    pub fn with_delimiter(inner: R, delimiter: u8) -> Self {
        Self {
            inner,
            delimiter,
            quote: b'"',
            buf: Vec::new(),
            pos: 0,
            line: 1,
            loaded: false,
        }
    }

    /// Reads all of `inner` into `buf` on first use; later calls do nothing.
    fn ensure_loaded(&mut self) -> Result<(), CsvwError> {
        if !self.loaded {
            self.inner
                .read_to_end(&mut self.buf)
                .map_err(|e| CsvwError::IoError(e.to_string()))?;
            self.loaded = true;
        }
        Ok(())
    }

    /// Returns the next unread byte without consuming it.
    fn peek(&self) -> Option<u8> {
        self.buf.get(self.pos).copied()
    }

    /// Consumes and returns the next unread byte, or `None` at end of input.
    fn next_byte(&mut self) -> Option<u8> {
        let byte = self.peek()?;
        self.pos += 1;
        Some(byte)
    }

    /// Consumes one line terminator (CR, LF or CRLF) if one is next.
    ///
    /// Returns `true` and bumps the line counter when a terminator was consumed,
    /// `false` (consuming nothing) otherwise.
    fn consume_line_ending(&mut self) -> bool {
        match self.peek() {
            Some(b'\r') => {
                self.pos += 1;
                // CRLF counts as a single terminator.
                if self.peek() == Some(b'\n') {
                    self.pos += 1;
                }
            }
            Some(b'\n') => self.pos += 1,
            _ => return false,
        }
        self.line += 1;
        true
    }

    /// Parses the body of a quoted field; the opening quote is already consumed.
    ///
    /// A doubled quote yields one literal quote byte, and CR, LF and CRLF inside
    /// the field are each stored as a single `\n`.
    fn parse_quoted_field(&mut self) -> Result<String, CsvwError> {
        let start_line = self.line;
        let mut value = Vec::new();
        loop {
            match self.next_byte() {
                None => {
                    // Report where the field began, not where the input ran out.
                    return Err(CsvwError::CsvError {
                        line: start_line,
                        msg: "unterminated quoted field".into(),
                    });
                }
                Some(byte) if byte == self.quote => {
                    if self.peek() != Some(self.quote) {
                        break;
                    }
                    self.pos += 1;
                    value.push(byte);
                }
                Some(b'\r') => {
                    if self.peek() == Some(b'\n') {
                        self.pos += 1;
                    }
                    self.line += 1;
                    value.push(b'\n');
                }
                Some(b'\n') => {
                    self.line += 1;
                    value.push(b'\n');
                }
                Some(byte) => value.push(byte),
            }
        }
        String::from_utf8(value).map_err(|e| CsvwError::CsvError {
            line: self.line,
            msg: format!("invalid UTF-8 in quoted field: {e}"),
        })
    }

    /// Parses an unquoted field starting at the current position.
    ///
    /// The field runs up to (not including) the next delimiter, CR, LF or end of
    /// input, so it is empty when one of those is the very next byte.
    fn parse_unquoted_field(&mut self) -> Result<String, CsvwError> {
        let delimiter = self.delimiter;
        let rest = &self.buf[self.pos..];
        let len = rest
            .iter()
            .position(|&byte| byte == delimiter || byte == b'\r' || byte == b'\n')
            .unwrap_or(rest.len());
        let value = rest[..len].to_vec();
        self.pos += len;
        String::from_utf8(value).map_err(|e| CsvwError::CsvError {
            line: self.line,
            msg: format!("invalid UTF-8 in field: {e}"),
        })
    }

    /// Consumes whatever separates the field just parsed from what follows.
    ///
    /// Returns `Ok(false)` after consuming a delimiter (another field follows, even
    /// if it is empty) and `Ok(true)` when the record ended at a line terminator or
    /// at end of input. Any other byte can only appear directly after a closing
    /// quote and is reported as an error instead of being split off as a new field.
    fn consume_post_field(&mut self) -> Result<bool, CsvwError> {
        match self.peek() {
            None => Ok(true),
            Some(byte) if byte == self.delimiter => {
                self.pos += 1;
                Ok(false)
            }
            Some(b'\r') | Some(b'\n') => {
                self.consume_line_ending();
                Ok(true)
            }
            Some(byte) => Err(CsvwError::CsvError {
                line: self.line,
                msg: format!("unexpected byte 0x{byte:02x} after closing quote"),
            }),
        }
    }

    /// Reads the next record, or returns `Ok(None)` once the input is exhausted.
    ///
    /// Blank lines are skipped rather than reported, so `Ok(None)` always means end
    /// of input. A delimiter always introduces another field, so `a,b,` has three
    /// fields, the last one empty.
    ///
    /// # Errors
    /// Returns [`CsvwError::IoError`] if the source cannot be read, and
    /// [`CsvwError::CsvError`] for an unterminated quoted field, for bytes that
    /// follow a closing quote without a delimiter, or for a field that is not
    /// valid UTF-8.
    pub fn read_record(&mut self) -> Result<Option<CsvRecord>, CsvwError> {
        self.ensure_loaded()?;
        while self.consume_line_ending() {}
        if self.peek().is_none() {
            return Ok(None);
        }

        let mut fields = Vec::new();
        loop {
            let field = match self.peek() {
                // If the delimiter was configured to be the quote byte, the
                // delimiter meaning wins and nothing is treated as quoted.
                Some(byte) if byte == self.quote && byte != self.delimiter => {
                    self.pos += 1;
                    self.parse_quoted_field()?
                }
                _ => self.parse_unquoted_field()?,
            };
            fields.push(field);
            if self.consume_post_field()? {
                break;
            }
        }
        Ok(Some(CsvRecord { fields }))
    }

    /// Reads every remaining record until end of input.
    ///
    /// # Errors
    /// Stops at, and returns, the first error from
    /// [`read_record`](Self::read_record).
    pub fn read_all(&mut self) -> Result<Vec<CsvRecord>, CsvwError> {
        let mut records = Vec::new();
        while let Some(record) = self.read_record()? {
            records.push(record);
        }
        Ok(records)
    }
}
/// Parses `input` as comma-delimited CSV, treating the first record as the header.
///
/// Returns the header's field values together with every record that follows it.
///
/// # Errors
/// Returns [`CsvwError::CsvError`] for line 1 when the reader yields no first
/// record, and forwards any error the reader reports while parsing.
pub fn parse_csv(input: &str) -> Result<(Vec<String>, Vec<CsvRecord>), CsvwError> {
    let mut reader = CsvReader::new(std::io::Cursor::new(input.as_bytes()));
    let headers = match reader.read_record()? {
        Some(record) => record.fields,
        None => {
            return Err(CsvwError::CsvError {
                line: 1,
                msg: "CSV input has no rows".into(),
            });
        }
    };
    reader.read_all().map(|rows| (headers, rows))
}