extern crate test;
use std::borrow::Cow;
use std::vec::Vec;
use std::mem::replace;
use std::io::{IoResult,IoErrorKind};
/// Capacity (in bytes) reserved up front for every fresh column buffer, so
/// short columns can be accumulated without reallocating.
static STRING_INITIAL_CAPACITY: uint = 64u;
/// State of the character-level CSV state machine while a row is being parsed.
enum CurrentParseState {
/// At the start of a column: nothing of the column has been consumed yet.
Neutral,
/// Inside a column whose text is being collected outside of quotes.
InField,
/// Inside a quoted column; delimiters and newlines are taken literally.
InQuotedField,
/// A `"` was just seen inside a quoted column; the next character decides
/// whether it was an escaped quote (`""`) or the end of the quoted section.
EncounteredQuoteInQuotedField,
/// A `\n` outside of a quoted column terminated the current row.
EndOfRow
}
/// Streaming CSV parser that reads rows from a buffered reader `B`.
pub struct SimpleCsv<B: Buffer> {
// Current state of the character-level state machine.
state: CurrentParseState,
// Columns completed so far for the row currently being parsed.
row_data: Vec<String>,
// Text collected for the column currently being parsed.
column_buffer: String,
// Source the CSV bytes are read from, one `\n`-terminated chunk at a time.
input_reader: B,
// Character that separates columns (`,` when built with `new`).
delimiter : char
}
impl<B: Buffer> SimpleCsv<B> {
    /// Creates a parser that reads from `buffer` and splits columns on `,`.
    pub fn new(buffer: B) -> SimpleCsv<B> {
        SimpleCsv::with_delimiter(buffer, ',')
    }

    /// Creates a parser that reads from `buffer` and splits columns on
    /// `delimiter`.
    pub fn with_delimiter(buffer: B, delimiter: char) -> SimpleCsv<B> {
        SimpleCsv {
            state: CurrentParseState::Neutral,
            row_data: Vec::new(),
            column_buffer: String::with_capacity(STRING_INITIAL_CAPACITY),
            input_reader: buffer,
            delimiter: delimiter
        }
    }

    /// Finishes the current column: moves the collected text into the row,
    /// installs a fresh buffer and returns the state machine to `Neutral`.
    #[inline]
    fn new_column(&mut self) {
        let fresh_buffer = String::with_capacity(STRING_INITIAL_CAPACITY);
        let column_data = replace(&mut self.column_buffer, fresh_buffer);
        self.row_data.push(column_data);
        self.state = CurrentParseState::Neutral;
    }

    /// Feeds every character of `line` through the state machine, appending
    /// completed columns to the current row.
    ///
    /// `line` is expected to contain at most one `\n`, as its final character
    /// (which is what `next_row` passes in).
    ///
    /// # Panics
    ///
    /// Panics if a character is processed after the row has already ended.
    #[inline]
    fn process_line<'b>(&mut self, line : &Cow<'b, String, str>) {
        let delimiter = self.delimiter;
        for c in line.chars() {
            match self.state {
                CurrentParseState::Neutral => {
                    match c {
                        '"' => {
                            self.state = CurrentParseState::InQuotedField;
                        },
                        // Nothing was collected for this column: it is empty.
                        _ if c == delimiter => {
                            self.row_data.push(String::new());
                        },
                        '\n' => {
                            self.new_column();
                            self.state = CurrentParseState::EndOfRow;
                        },
                        // Carriage returns outside of quotes are dropped.
                        '\r' => { },
                        _ => {
                            self.column_buffer.push(c);
                            self.state = CurrentParseState::InField;
                        }
                    }
                },
                CurrentParseState::InQuotedField => {
                    match c {
                        '"' => {
                            self.state = CurrentParseState::EncounteredQuoteInQuotedField;
                        },
                        // Everything else, including delimiters and line
                        // breaks, is literal inside quotes.
                        _ => {
                            self.column_buffer.push(c);
                        }
                    }
                },
                CurrentParseState::InField => {
                    match c {
                        _ if c == delimiter => {
                            self.new_column();
                        },
                        '\n' => {
                            self.new_column();
                            self.state = CurrentParseState::EndOfRow;
                        },
                        // Carriage returns outside of quotes are dropped.
                        '\r' => { },
                        _ => {
                            self.column_buffer.push(c);
                        }
                    }
                },
                CurrentParseState::EncounteredQuoteInQuotedField => {
                    match c {
                        // `""` inside a quoted column is an escaped quote.
                        '"' => {
                            self.column_buffer.push(c);
                            self.state = CurrentParseState::InQuotedField;
                        },
                        _ if c == delimiter => {
                            self.new_column();
                        },
                        '\n' => {
                            self.new_column();
                            self.state = CurrentParseState::EndOfRow;
                        },
                        // The previous quote closed the column, so we are
                        // outside of quotes again: drop the carriage return
                        // exactly like the other unquoted states do, so that
                        // `"x"\r\n` yields `x` rather than `x\r`.
                        '\r' => { },
                        // Stray text after the closing quote is appended to
                        // the column as unquoted data.
                        _ => {
                            self.column_buffer.push(c);
                            self.state = CurrentParseState::InField;
                        }
                    }
                },
                CurrentParseState::EndOfRow => {
                    panic!("Should never reach match for EndOfRow");
                },
            }
        }
    }

    /// Parses the next row and returns its columns.
    ///
    /// The returned slice borrows the parser's internal row storage, which is
    /// overwritten by the next call. A quoted column may span several input
    /// lines. Invalid UTF-8 is replaced with U+FFFD.
    ///
    /// If the input ends in the middle of a row (no final `\n`), the columns
    /// read so far are returned, including an empty final column after a
    /// trailing delimiter or an empty quoted column.
    ///
    /// # Errors
    ///
    /// Returns the reader's error if reading fails; this includes
    /// `EndOfFile` when no input at all was left for this row.
    pub fn next_row<'b>(&'b mut self) -> IoResult<&'b [String]> {
        self.row_data.clear();
        self.state = CurrentParseState::Neutral;
        let mut line_count = 0u;
        loop {
            let line_result = self.input_reader.read_until('\n' as u8);
            match line_result {
                Ok(ref line_bytes) => {
                    line_count += 1;
                    let line = String::from_utf8_lossy(line_bytes.as_slice());
                    self.process_line(&line);
                    if let CurrentParseState::EndOfRow = self.state {
                        break;
                    }
                },
                Err(e) => {
                    match e.kind {
                        // EOF after some input was consumed: the row simply
                        // lacks its terminating newline.
                        IoErrorKind::EndOfFile if line_count > 0 => {
                            let column_open = match self.state {
                                // `Neutral` with columns already in the row
                                // means the last thing seen was a delimiter,
                                // so an empty column follows it.
                                CurrentParseState::Neutral => !self.row_data.is_empty(),
                                CurrentParseState::EndOfRow => false,
                                // A column was started (quoted or not), even
                                // if no text was collected for it.
                                _ => true
                            };
                            if column_open || !self.column_buffer.is_empty() {
                                self.new_column();
                            }
                            break;
                        },
                        _ => {
                            return Err(e);
                        }
                    }
                }
            }
        }
        Ok(self.row_data.as_slice())
    }
}
impl<B: Buffer> Iterator<Vec<String>> for SimpleCsv<B> {
    /// Parses the next row and hands it out as an owned vector.
    ///
    /// Iteration stops (`None`) as soon as `next_row` reports any error,
    /// which includes reaching the end of the input.
    fn next(&mut self) -> Option<Vec<String>> {
        // Only the success flag is kept so that the borrow returned by
        // `next_row` has ended before `row_data` is taken below.
        let row_parsed = self.next_row().is_ok();
        if !row_parsed {
            return None;
        }
        // Give the row away and leave an equally sized, empty vector behind
        // for the next row.
        let capacity = self.row_data.capacity();
        let finished_row = replace(&mut self.row_data, Vec::with_capacity(capacity));
        Some(finished_row)
    }

    /// The number of remaining rows is unknown without reading the input.
    fn size_hint(&self) -> (uint, Option<uint>) {
        (0, None)
    }
}
#[test]
fn simple_csv_test() {
    // Two plain rows: the first ends in CRLF, the second ends at EOF.
    let input = "1,2,3\r\n4,5,6".to_string().into_bytes();
    let mut csv = SimpleCsv::new(input.as_slice());
    let first_row = vec!["1".to_string(), "2".to_string(), "3".to_string()];
    let second_row = vec!["4".to_string(), "5".to_string(), "6".to_string()];
    assert_eq!(csv.next_row(), Ok(first_row.as_slice()));
    assert_eq!(csv.next_row(), Ok(second_row.as_slice()));
    // Nothing is left to read.
    assert!(csv.next_row().is_err());
}
#[test]
fn quoted_csv_test() {
    // Quotes around a column are stripped from the parsed value.
    let input = "1,\"2\",3\r\n4,\"5\",6".to_string().into_bytes();
    let mut csv = SimpleCsv::new(input.as_slice());
    let first_row = vec!["1".to_string(), "2".to_string(), "3".to_string()];
    let second_row = vec!["4".to_string(), "5".to_string(), "6".to_string()];
    assert_eq!(csv.next_row(), Ok(first_row.as_slice()));
    assert_eq!(csv.next_row(), Ok(second_row.as_slice()));
    assert!(csv.next_row().is_err());
}
#[test]
fn quote_in_quoted_csv_test() {
    // A doubled quote inside a quoted column stands for one literal quote.
    let input = r#"1,"""2",3"#.to_string().into_bytes();
    let mut csv = SimpleCsv::new(input.as_slice());
    let only_row = vec!["1".to_string(), r#""2"#.to_string(), "3".to_string()];
    assert_eq!(csv.next_row(), Ok(only_row.as_slice()));
    assert!(csv.next_row().is_err());
}
#[test]
fn newline_in_quoted_csv_test() {
    // A CRLF inside quotes belongs to the column and does not end the row.
    let input = "1,\"2\",3\r\n4,\"5\r\n\",6".to_string().into_bytes();
    let mut csv = SimpleCsv::new(input.as_slice());
    let first_row = vec!["1".to_string(), "2".to_string(), "3".to_string()];
    let second_row = vec!["4".to_string(), "5\r\n".to_string(), "6".to_string()];
    assert_eq!(csv.next_row(), Ok(first_row.as_slice()));
    assert_eq!(csv.next_row(), Ok(second_row.as_slice()));
    assert!(csv.next_row().is_err());
}
#[test]
fn eof_in_quoted_csv_test() {
    // The input ends before the closing quote; the text read so far is kept.
    let input = "1,2,3\r\n4,5,\"6".to_string().into_bytes();
    let mut csv = SimpleCsv::new(input.as_slice());
    let first_row = vec!["1".to_string(), "2".to_string(), "3".to_string()];
    let second_row = vec!["4".to_string(), "5".to_string(), "6".to_string()];
    assert_eq!(csv.next_row(), Ok(first_row.as_slice()));
    assert_eq!(csv.next_row(), Ok(second_row.as_slice()));
    assert!(csv.next_row().is_err());
}
#[test]
fn data_after_quoted_csv_test() {
    // Text following a closing quote is appended to the same column.
    let input = "1,2,3\r\n4,5,\"6\"data_after_quoted_field".to_string().into_bytes();
    let mut csv = SimpleCsv::new(input.as_slice());
    let first_row = vec!["1".to_string(), "2".to_string(), "3".to_string()];
    let second_row = vec![
        "4".to_string(),
        "5".to_string(),
        "6data_after_quoted_field".to_string()
    ];
    assert_eq!(csv.next_row(), Ok(first_row.as_slice()));
    assert_eq!(csv.next_row(), Ok(second_row.as_slice()));
    assert!(csv.next_row().is_err());
}
#[test]
fn newline_only_on_last_column() {
    // A delimiter directly before the line break yields an empty last column.
    let input = "1,2,3\r\n4,5,\r\n".to_string().into_bytes();
    let mut csv = SimpleCsv::new(input.as_slice());
    let first_row = vec!["1".to_string(), "2".to_string(), "3".to_string()];
    let second_row = vec!["4".to_string(), "5".to_string(), "".to_string()];
    assert_eq!(csv.next_row(), Ok(first_row.as_slice()));
    assert_eq!(csv.next_row(), Ok(second_row.as_slice()));
    assert!(csv.next_row().is_err());
}
#[test]
fn empty_line_in_file() {
    // A blank line is reported as a row holding a single empty column.
    let input = "1,2,3\r\n\r\n4,5,6".to_string().into_bytes();
    let mut csv = SimpleCsv::new(input.as_slice());
    let first_row = vec!["1".to_string(), "2".to_string(), "3".to_string()];
    let blank_row = vec!["".to_string()];
    let last_row = vec!["4".to_string(), "5".to_string(), "6".to_string()];
    assert_eq!(csv.next_row(), Ok(first_row.as_slice()));
    assert_eq!(csv.next_row(), Ok(blank_row.as_slice()));
    assert_eq!(csv.next_row(), Ok(last_row.as_slice()));
    assert!(csv.next_row().is_err());
}
#[test]
fn bad_utf8() {
    // An invalid byte at the end of the input is decoded as U+FFFD.
    let mut input = "1,2,3\r\n4,5,6".to_string().into_bytes();
    input.push(0xff);
    let mut csv = SimpleCsv::new(input.as_slice());
    let first_row = vec!["1".to_string(), "2".to_string(), "3".to_string()];
    let second_row = vec!["4".to_string(), "5".to_string(), "6\u{FFFD}".to_string()];
    assert_eq!(csv.next_row(), Ok(first_row.as_slice()));
    assert_eq!(csv.next_row(), Ok(second_row.as_slice()));
}
#[test]
fn different_delimiter() {
    // Columns are split on the delimiter passed to `with_delimiter`.
    let input = "1|2|3\r\n4|5|6".to_string().into_bytes();
    let mut csv = SimpleCsv::with_delimiter(input.as_slice(), '|');
    let first_row = vec!["1".to_string(), "2".to_string(), "3".to_string()];
    let second_row = vec!["4".to_string(), "5".to_string(), "6".to_string()];
    assert_eq!(csv.next_row(), Ok(first_row.as_slice()));
    assert_eq!(csv.next_row(), Ok(second_row.as_slice()));
    assert!(csv.next_row().is_err());
}
#[bench]
fn bench_throughput(b: &mut test::Bencher) {
    // Build an input of `row_count` identical short rows.
    let row_count = 10000;
    let row_text = "1,\"2\",3,4,\"5\",6\r\n";
    let input_len = row_text.len() * row_count;
    let mut input_text = String::with_capacity(input_len);
    for _ in range(0, row_count) {
        input_text.push_str(row_text);
    }
    let input = input_text.into_bytes();
    // Lets the harness report throughput in bytes per second.
    b.bytes = input_len as u64;
    b.iter(|| {
        let mut csv = SimpleCsv::new(input.as_slice());
        let mut rows_parsed = 0;
        while let Ok(_) = csv.next_row() {
            rows_parsed += 1;
        }
        assert_eq!(rows_parsed, row_count);
    });
}
#[bench]
fn bench_throughput_long_columns(b: &mut test::Bencher) {
    // Build an input of `row_count` identical rows with long columns.
    let row_count = 10000;
    let row_text = "1222222211112,\"231231231231\",3312312312312312312,4312312312312312323123132312312313,\"53123123123123123123123213213\",6233123123123123132\r\n";
    let input_len = row_text.len() * row_count;
    let mut input_text = String::with_capacity(input_len);
    for _ in range(0, row_count) {
        input_text.push_str(row_text);
    }
    let input = input_text.into_bytes();
    // Lets the harness report throughput in bytes per second.
    b.bytes = input_len as u64;
    b.iter(|| {
        let mut csv = SimpleCsv::new(input.as_slice());
        let mut rows_parsed = 0;
        while let Ok(_) = csv.next_row() {
            rows_parsed += 1;
        }
        assert_eq!(rows_parsed, row_count);
    });
}
#[bench]
fn bench_throughput_iter(b: &mut test::Bencher) {
    // Same input as the short-row benchmark, consumed through the iterator
    // interface (which hands out an owned vector per row).
    let row_count = 10000;
    let row_text = "1,\"2\",3,4,\"5\",6\r\n";
    let input_len = row_text.len() * row_count;
    let mut input_text = String::with_capacity(input_len);
    for _ in range(0, row_count) {
        input_text.push_str(row_text);
    }
    let input = input_text.into_bytes();
    // Lets the harness report throughput in bytes per second.
    b.bytes = input_len as u64;
    b.iter(|| {
        let mut csv = SimpleCsv::new(input.as_slice());
        let mut rows_parsed = 0;
        for _ in csv {
            rows_parsed += 1;
        }
        assert_eq!(rows_parsed, row_count);
    });
}