use crate::classifier::CsvClassifier;
use crate::classifier::{COMMA_CLASS, NEW_LINE_CLASS, QUOTATION_CLASS};
use crate::grammar::Row;
use crate::u8x16::u8x16;
use std::ops::Range;
/// Pre-computed bitsets marking where the structural CSV characters occur.
///
/// Every `u64` is produced by `build_u64` from up to four classified vectors,
/// so entry `i` of each vector describes the same stretch of input.
/// NOTE(review): presumably one bit per input byte with the earliest byte in the
/// most significant bit (the reader scans with `leading_zeros`) — confirm against
/// `u8x16::bitset`.
#[derive(Debug)]
pub struct CsvReader {
/// Positions classified as `QUOTATION_CLASS`.
quotation_bitsets: Vec<u64>,
/// Positions classified as `COMMA_CLASS`.
comma_bitsets: Vec<u64>,
/// Positions classified as `NEW_LINE_CLASS`.
new_line_bitsets: Vec<u64>,
}
impl CsvReader {
    /// Builds the structural-character bitsets for `data`.
    ///
    /// `data` is run through [`CsvClassifier`]; every group of up to four of the
    /// resulting vectors is compared against the comma, new-line and quotation
    /// classes and packed into one `u64` per class by `build_u64`.
    pub fn new(data: &[u8]) -> Self {
        let vectors = CsvClassifier::new(data).classify();
        // Four vectors fill one 64-bit word; round up for a partial last word.
        let capacity = vectors.len() / 4 + (vectors.len() % 4 != 0) as usize;

        let comma_broadcast = u8x16::broadcast(COMMA_CLASS);
        let new_line_broadcast = u8x16::broadcast(NEW_LINE_CLASS);
        let quotation_broadcast = u8x16::broadcast(QUOTATION_CLASS);

        let mut comma_bitsets = Vec::with_capacity(capacity);
        let mut new_line_bitsets = Vec::with_capacity(capacity);
        let mut quotation_bitsets = Vec::with_capacity(capacity);

        for chunk in vectors.chunks(4) {
            comma_bitsets.push(build_u64(chunk, comma_broadcast));
            new_line_bitsets.push(build_u64(chunk, new_line_broadcast));
            quotation_bitsets.push(build_u64(chunk, quotation_broadcast));
        }

        Self {
            comma_bitsets,
            new_line_bitsets,
            quotation_bitsets,
        }
    }

    /// Splits the classified input into rows of field ranges.
    ///
    /// Every field is reported as a byte `Range` whose `end` is the position of the
    /// delimiter that closed it (the delimiter itself is excluded). Commas and new
    /// lines inside a quoted section are not treated as delimiters, including when
    /// the quoted section continues across a 64-position word boundary.
    ///
    /// Empty fields (two adjacent delimiters) are not reported. A row is emitted
    /// when an unquoted new line is reached and at least one field has been
    /// collected for it, so consecutive new-line positions do not create empty rows.
    ///
    /// NOTE(review): bytes that follow the last delimiter are never emitted, and
    /// fields collected after the last new line are dropped — the reader does not
    /// know the input length. Confirm that the classifier pads the input so that the
    /// final record is always terminated.
    pub fn read(&mut self) -> Vec<Row> {
        let mut rows = Vec::new();
        let mut current_row = Vec::new();
        // Position at which the field currently being scanned begins.
        let mut start = 0;
        // Whether the first position of the current word lies inside a quoted section.
        let mut inside_quotes = false;

        let words = self
            .quotation_bitsets
            .iter()
            .zip(&self.comma_bitsets)
            .zip(&self.new_line_bitsets)
            .enumerate();

        for (i, ((&raw_quotations, &commas), &new_lines)) in words {
            let valid_quotations = remove_escaped_quotations(raw_quotations);

            // `mark_inside_quotations` derives the state of a position from the
            // quotes that come *after* it in the word. That equals the real state
            // only when the word starts outside quotes and holds an even number of
            // quotes (or starts inside and holds an odd number); otherwise the mask
            // is exactly inverted.
            let mut inside = mark_inside_quotations(valid_quotations);
            let odd_valid_quotations = valid_quotations.count_ones() % 2 == 1;
            if inside_quotes != odd_valid_quotations {
                inside = !inside;
            }
            // Escaped quotes come in pairs, so the raw count has the true parity and
            // is what decides the state at the start of the next word.
            inside_quotes ^= raw_quotations.count_ones() % 2 == 1;

            let outside_quotations = !inside;
            let valid_new_line = new_lines & outside_quotations;
            let mut delimiters = (commas & outside_quotations) | valid_new_line;

            let word_start = i * 64;
            while delimiters != 0 {
                // Earliest remaining delimiter; always in 0..=63 because the word is
                // non-zero. Clearing the bit instead of shifting the word avoids a
                // shift by 64 when the delimiter is the last position of the word.
                let offset = delimiters.leading_zeros() as usize;
                let bit = 1u64 << (63 - offset);
                delimiters &= !bit;

                let end = word_start + offset;
                if start < end {
                    current_row.push(Range { start, end });
                }
                // Close the row on a new line even when the last field was empty,
                // otherwise the collected fields would leak into the next row.
                if valid_new_line & bit != 0 && !current_row.is_empty() {
                    rows.push(Row::from(std::mem::take(&mut current_row)));
                }
                start = end + 1;
            }
        }
        rows
    }
}
/// Removes escaped quotations from a quotation bitset, keeping structural ones.
///
/// Inside a quoted field a quote is escaped by doubling it, so within a run of
/// consecutive quote bits the quotes pair up. A run of even length consists of
/// escaped pairs only and is cleared completely. A run of odd length contains
/// exactly one structural quote (for example the closing quote in `a"""`), so one
/// bit of it — the most significant one — is kept. Clearing the whole run, as a
/// plain "adjacent quotes" test would, loses that structural quote and with it the
/// open/closed parity of everything that follows.
///
/// Runs are only recognised within the given word; a run continuing in a
/// neighbouring word is treated as two separate runs.
const fn remove_escaped_quotations(q: u64) -> u64 {
    // Lowest and highest bit of every run of consecutive quote bits.
    let run_starts = q & !(q << 1);
    let run_ends = q & !(q >> 1);

    // Mark every bit that sits an even distance above the start of its run by
    // spreading the starts upwards in doubling steps. `reachable` has bit `i` set
    // when the `step` bits ending at `i` all belong to `q`, so a spread never
    // crosses a gap between two runs.
    let mut even_offsets = run_starts;
    let mut reachable = q & (q << 1);
    even_offsets |= (even_offsets << 2) & reachable;
    reachable &= reachable << 2;
    even_offsets |= (even_offsets << 4) & reachable;
    reachable &= reachable << 4;
    even_offsets |= (even_offsets << 8) & reachable;
    reachable &= reachable << 8;
    even_offsets |= (even_offsets << 16) & reachable;
    reachable &= reachable << 16;
    even_offsets |= (even_offsets << 32) & reachable;

    // The top bit of a run is an even distance from its start exactly when the
    // run length is odd.
    run_ends & even_offsets
}
/// Turns a bitset of quotation positions into a mask of "quoted" positions.
///
/// Bit `j` of the result is set when an odd number of bits below `j` are set in
/// `x`. For a pair of quotes this marks the more significant quote and every
/// position between the two, but not the less significant quote.
#[inline]
const fn mark_inside_quotations(mut x: u64) -> u64 {
    // Prefix XOR by doubling: after the loop bit `i` holds the parity of bits
    // `0..=i` of the input.
    let mut shift: u32 = 1;
    while shift < 64 {
        x ^= x << shift;
        shift *= 2;
    }
    // Move the parity up one position so a bit does not count itself.
    x << 1
}
/// Packs the comparison of up to four vectors against `broadcast` into one `u64`.
///
/// The bitset of the vector at index `i` is shifted left by `48 - 16 * i`, so the
/// first vector lands in the top bits and a fourth vector in the bottom bits.
/// When fewer than four vectors are passed the remaining low bits stay zero.
/// NOTE(review): assumes `bitset()` yields at most 16 significant bits — confirm.
#[inline]
fn build_u64(chunks: &[u8x16], broadcast: u8x16) -> u64 {
    chunks
        .iter()
        .enumerate()
        .fold(0u64, |acc, (slot, &vector)| {
            let mask = vector.eq(broadcast).bitset() as u64;
            acc | (mask << (48 - slot * 16))
        })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Reads `test` and asserts that the bytes covered by each reported field
    /// range, decoded as UTF-8 and grouped by row, equal `expected`.
    fn parse_rows(test: &[u8], expected: &[&[&str]]) {
        let mut reader = CsvReader::new(test);
        let parsed = reader.read();

        let mut actual: Vec<Vec<String>> = Vec::with_capacity(parsed.len());
        for row in parsed.iter() {
            let mut texts = Vec::new();
            for range in row.fields().iter() {
                texts.push(String::from_utf8(test[range.clone()].to_vec()).unwrap());
            }
            actual.push(texts);
        }

        let wanted: Vec<Vec<String>> = expected
            .iter()
            .map(|row| row.iter().map(|field| field.to_string()).collect())
            .collect();
        assert_eq!(actual, wanted);
    }

    #[test]
    fn read_basic() {
        parse_rows(b"aaa,bbb,ccc", &[&["aaa", "bbb", "ccc"]]);
    }

    #[test]
    fn read_basic2() {
        parse_rows(b"aaa,\"bbb\",ccc", &[&["aaa", "\"bbb\"", "ccc"]]);
    }

    #[test]
    fn read_nested() {
        parse_rows(
            b"\"aaa,howdy\",\"b\"\"bb\",\"ccc\"",
            &[&["\"aaa,howdy\"", "\"b\"\"bb\"", "\"ccc\""]],
        );
    }

    #[test]
    fn read_newline_field() {
        parse_rows(
            b"\"aaa,ho\nwdy\",\"b\"\"bb\",\"ccc\"",
            &[&["\"aaa,ho\nwdy\"", "\"b\"\"bb\"", "\"ccc\""]],
        );
    }

    #[test]
    fn read_crlf_field() {
        parse_rows(
            b"\"aaa,ho\r\nwdy\",\"b\"\"bb\",\"ccc\"",
            &[&["\"aaa,ho\r\nwdy\"", "\"b\"\"bb\"", "\"ccc\""]],
        );
    }

    #[test]
    fn read_taxi_zone_lookup_header() {
        parse_rows(
            b"\"LocationID\",\"Borough\",\"Zone\",\"service_zone\"\n",
            &[&[
                r#""LocationID""#,
                r#""Borough""#,
                r#""Zone""#,
                r#""service_zone""#,
            ]],
        );
    }

    #[test]
    fn test_taxi_zone_should_split_two_rows() {
        parse_rows(
            b"\"LocationID\",\"Borough\",\"Zone\",\"service_zone\"\r\n1,\"EWR\"",
            &[
                &[
                    r#""LocationID""#,
                    r#""Borough""#,
                    r#""Zone""#,
                    r#""service_zone""#,
                ],
                &["1", r#""EWR""#],
            ],
        );
    }

    #[test]
    fn test_mark_inside_quotations() {
        // (quotation bits, expected "inside" mask)
        let cases: [(u64, u64); 3] = [
            (0b10001000, 0b11110000),
            (0b1001_1001, 0b1110_1110),
            (0b1100_1001, 0b1000_1110),
        ];
        for &(quotations, inside) in cases.iter() {
            assert_eq!(mark_inside_quotations(quotations), inside);
        }
    }
}