#![deny(missing_docs)]
extern crate rustc_serialize;
pub mod columns;
pub mod error;
use self::columns::{Columns, BytesColumns};
use std::fs::File;
use std::io::{self, BufRead, BufReader};
use std::iter::Iterator;
use std::path::Path;
use error::{Error, Result};
use rustc_serialize::Decodable;
#[cfg(test)] mod test;
// The three bytes of the UTF-8 byte order mark (EF BB BF).
// `try_consume_utf8_bom` skips them when a reader's buffered data starts with
// this sequence.
const UTF8_BOM: &'static [u8] = b"\xef\xbb\xbf";
/// CSV reader over any `BufRead` source.
///
/// Configured with the builder-style methods (`delimiter`, `flexible`,
/// `has_header`) and then consumed as an `Iterator` that yields one
/// `Result<Row>` per line.
pub struct Csv<B: BufRead> {
/// Byte that separates columns; `from_reader` initialises it to `b','`.
delimiter: u8,
/// Buffered source the lines are read from.
reader: B,
/// Whether the first row is to be treated as a header row.
has_header: bool,
/// Header names, cached by `headers()` once the header row has been read.
headers: Option<Vec<String>>,
/// When `true`, rows whose column count differs from the first row are
/// accepted instead of producing `Error::ColumnMismatch`.
flexible: bool,
/// Column count of the first row read; `None` until a row has been read.
len: Option<usize>,
/// Once `true`, the iterator yields `None` forever. Set after an error
/// (including a failed BOM probe in `from_reader`).
exit: bool,
/// Number of rows returned successfully so far. A header row read through
/// `headers()` goes through the same iterator and is counted too.
current_line: usize,
}
impl<B: BufRead> Csv<B> {
    /// Creates a reader over `reader` with the default settings: `,` as the
    /// delimiter, no header row, and strict (non-flexible) column counts.
    ///
    /// A leading UTF-8 byte order mark is skipped. If probing for it fails,
    /// the error is not reported here; the returned `Csv` is simply marked
    /// as finished and will yield no rows.
    pub fn from_reader(mut reader: B) -> Csv<B> {
        let bom_probe_failed = try_consume_utf8_bom(&mut reader).is_err();
        Csv {
            delimiter: b',',
            reader: reader,
            has_header: false,
            headers: None,
            flexible: false,
            len: None,
            exit: bom_probe_failed,
            current_line: 0,
        }
    }

    /// Sets the byte used to separate columns and returns the reader.
    pub fn delimiter(mut self, delimiter: u8) -> Csv<B> {
        self.delimiter = delimiter;
        self
    }

    /// Sets whether rows may have a different number of columns than the
    /// first row. When `false`, such a row yields `Error::ColumnMismatch`.
    pub fn flexible(mut self, flexible: bool) -> Csv<B> {
        self.flexible = flexible;
        self
    }

    /// Declares whether the first row is a header row.
    ///
    /// When `has_header` is `true` the header row is read immediately, using
    /// whatever delimiter is configured at that moment, so call
    /// `delimiter` before this method if a non-default delimiter is needed.
    pub fn has_header(mut self, has_header: bool) -> Csv<B> {
        self.has_header = has_header;
        // Eagerly read and cache the header; the returned copy is not needed.
        drop(self.headers());
        self
    }

    /// Returns the header names.
    ///
    /// The header row is read from the underlying iterator on the first call
    /// and cached afterwards. An empty vector is returned when no header was
    /// declared, when no row could be read, or when the row could not be
    /// decoded as strings.
    pub fn headers(&mut self) -> Vec<String> {
        if let Some(ref cached) = self.headers {
            return cached.clone();
        }
        if !self.has_header {
            return Vec::new();
        }
        match self.next() {
            Some(Ok(row)) => {
                // A row that fails to decode is cached as an empty header.
                let names: Vec<String> = row.decode().ok().unwrap_or_else(Vec::new);
                self.headers = Some(names.clone());
                names
            }
            _ => Vec::new(),
        }
    }

    /// Returns the column count of the first row read, or `None` if no row
    /// has been read yet.
    pub fn column_count(&self) -> Option<usize> {
        self.len
    }

    /// Returns the number of rows successfully read so far (a header row read
    /// via `headers` is included).
    pub fn current_line(&self) -> usize {
        self.current_line
    }
}
impl Csv<BufReader<File>> {
    /// Opens the file at `path` and wraps it in a buffered CSV reader with
    /// the default settings of `from_reader`.
    ///
    /// # Errors
    ///
    /// Returns the I/O error, converted into the crate's `Error`, if the file
    /// cannot be opened.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Csv<BufReader<File>>> {
        File::open(path)
            .map(|file| Csv::from_reader(BufReader::new(file)))
            .map_err(From::from)
    }
}
impl<'a> Csv<&'a [u8]> {
    /// Creates a CSV reader over the bytes of an in-memory string, with the
    /// default settings of `from_reader`.
    pub fn from_string(s: &'a str) -> Csv<&'a [u8]> {
        let bytes: &'a [u8] = s.as_bytes();
        Csv::from_reader(bytes)
    }
}
impl<B: BufRead> Iterator for Csv<B> {
type Item = Result<Row>;
fn next(&mut self) -> Option<Result<Row>> {
if self.exit { return None; }
let mut buf = Vec::new();
let mut cols = self.len.map_or_else(Vec::new, Vec::with_capacity);
match read_line(&mut self.reader, &mut buf, self.delimiter, &mut cols) {
Ok(0) => None,
Ok(_n) => {
if buf.ends_with(&[b'\r']) {
buf.pop();
}
cols.push(buf.len());
let c = cols.len();
if let Some(n) = self.len {
if n != c && !self.flexible {
self.exit = true;
return Some(Err(Error::ColumnMismatch(n, c)));
}
} else {
self.len = Some(c);
}
self.current_line += 1;
Some(Ok(Row {
line: buf,
cols: cols,
}))
}
Err(e) => {
self.exit = true;
Some(Err(e))
},
}
}
}
/// A single row of a CSV source, as produced by iterating over a `Csv`.
///
/// The row owns its raw bytes; use `columns`, `bytes_columns` or `decode` to
/// access the individual fields.
pub struct Row {
/// Raw bytes of the line, without the trailing `\n` (and `\r`, if any).
/// For a doubled quote inside a quoted field only one of the two quote bytes
/// is kept.
line: Vec<u8>,
/// End offset of each column within `line`: the position of every delimiter
/// found outside quotes, followed by `line.len()` for the last column.
cols: Vec<usize>,
}
impl Row {
pub fn columns(&self) -> Result<Columns> {
match ::std::str::from_utf8(&self.line) {
Err(_) => Err(Error::Io(io::Error::new(io::ErrorKind::InvalidData,
"stream did not contain valid UTF-8"))),
Ok(s) => Ok(Columns::new(s, &self.cols)),
}
}
pub fn bytes_columns(&self) -> BytesColumns {
BytesColumns::new(&self.line, &self.cols)
}
pub fn decode<T: Decodable>(&self) -> Result<T> {
let mut columns = try!(self.columns());
Decodable::decode(&mut columns)
}
pub fn len(&self) -> usize {
self.cols.len()
}
pub fn is_empty(&self) -> bool {
self.cols.is_empty()
}
}
// Consumes the remainder of a quoted field from `$bytes`, an enumerated
// iterator over the current buffer `$available`. It is invoked by `read_line`
// either right after an opening quote or at the start of a refilled buffer
// when the previous buffer ended inside a quoted field.
//
// Effects on the caller's state:
// - `$in_quote` is reset to `false` and set back to `true` only if the buffer
//   is exhausted before a closing quote is found.
// - For a doubled quote, the bytes up to (but not including) the second quote
//   are copied into `$buf`, `$start` is moved past the second quote, and
//   `$quote_count` is incremented to account for the dropped byte.
// - A quote followed by anything other than another quote, the delimiter,
//   `\r`, `\n` or the end of the buffer makes the *enclosing function* return
//   `Err(Error::UnescapedQuote)`.
//
// Delimiters and newlines seen while inside the quotes are skipped without
// being recorded.
macro_rules! consume_quote {
($bytes: expr, $delimiter: expr, $in_quote: expr,
$start: expr, $buf: expr, $available: expr, $quote_count: expr) => {
$in_quote = false;
loop {
match $bytes.next() {
Some((_, &b'\"')) => {
// Peek at the byte after the quote without consuming it.
match $bytes.clone().next() {
Some((i, &b'\"')) => {
// Doubled quote: keep the first quote, drop the second (at index `i`).
$bytes.next(); $buf.extend_from_slice(&$available[$start..i]);
$start = i + 1;
$quote_count += 1;
},
// Closing quote: the field ends at end of buffer, end of line or delimiter.
// The following byte is left for the caller's loop to handle.
None | Some((_, &b'\r')) | Some((_, &b'\n')) => break,
Some((_, d)) if *d == $delimiter => break,
Some((_, _)) => return Err(Error::UnescapedQuote),
}
},
None => {
// Buffer exhausted while still inside the quotes; resume after the refill.
$in_quote = true;
break;
},
// Any other byte is part of the quoted content.
_ => (),
}
}
}
}
// Reads one CSV line from `r`.
//
// The line's bytes, without the terminating `\n`, are appended to `buf`
// (a trailing `\r` is NOT removed here). For every `delimiter` found outside
// a quoted field, its offset within `buf` is pushed onto `cols`.
//
// Returns the number of bytes consumed from `r`, including the `\n`. A return
// value of `0` means the reader was already at end of input.
//
// Errors:
// - I/O errors from `fill_buf` (other than `Interrupted`, which is retried);
// - `Error::UnexpextedQuote` when a quote appears at a buffer index other than
//   0 and is not directly preceded by the delimiter;
// - `Error::UnescapedQuote`, returned from inside `consume_quote!`.
fn read_line<R: BufRead>(r: &mut R, buf: &mut Vec<u8>,
delimiter: u8, cols: &mut Vec<usize>) -> Result<usize>
{
// Bytes consumed from `r` for this line so far (across buffer refills).
let mut read = 0;
// True when the previous buffer ended inside a quoted field.
let mut in_quote = false;
let mut done = false;
// Number of quote bytes dropped from doubled quotes, i.e. not copied to `buf`.
let mut quote_count = 0;
while !done {
let used = {
let available = match r.fill_buf() {
// End of input: return whatever was consumed so far.
// NOTE(review): this is also reached when `in_quote` is still true, so an
// unterminated quoted field at end of input is not reported — confirm intended.
Ok(n) if n.is_empty() => return Ok(read),
Ok(n) => n,
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => return Err(Error::from(e)),
};
let mut bytes = available.iter().enumerate();
// Start of the not-yet-copied part of `available`.
let mut start = 0;
// The previous buffer ran out inside a quoted field: finish it first.
if in_quote {
consume_quote!(bytes, delimiter, in_quote, start, buf, available, quote_count);
}
let used: usize;
// A plain `loop` (rather than `for`) because `consume_quote!` advances
// the same iterator from inside the loop body.
loop {
match bytes.next() {
Some((i, &b'\"')) => {
// NOTE(review): `i` indexes the current buffer, so `i == 0` also holds for
// the first byte after a refill, not only at the start of the line —
// confirm this is intended.
if i == 0 || available[i - 1] == delimiter {
consume_quote!(bytes, delimiter, in_quote, start, buf, available, quote_count);
} else {
return Err(Error::UnexpextedQuote);
}
},
Some((i, &b'\n')) => {
// End of line: consume the newline but do not copy it.
done = true;
used = i + 1;
buf.extend_from_slice(&available[start..i]);
break;
},
Some((i, &d)) => {
// Offset within `buf`: bytes consumed before this buffer, plus the index
// here, minus the quote bytes that were dropped.
if d == delimiter { cols.push(read + i - quote_count); }
},
None => {
// Buffer exhausted without a newline: copy the rest and refill.
used = available.len();
buf.extend_from_slice(&available[start..used]);
break;
},
}
}
used
};
r.consume(used);
read += used;
}
Ok(read)
}
fn try_consume_utf8_bom<B: BufRead>(reader: &mut B) -> Result<()> {
if try!(reader.fill_buf()).starts_with(UTF8_BOM) {
reader.consume(UTF8_BOM.len());
}
Ok(())
}