quick_csv/
lib.rs

1//! Quick Csv reader which performs **very** well.
2//! 
3//! ## Example
4//! 
5//! First, create a `Csv` from a `BufRead` reader, a file or a string
6//! 
7//! ```rust
8//! extern crate quick_csv;
9//! 
10//! fn main() {
11//!     let data = "a,b\r\nc,d\r\ne,f";
12//!     let csv = quick_csv::Csv::from_string(data);
13//!     for row in csv.into_iter() {
14//!         // work on csv row ...
15//!         if let Ok(_) = row {
16//!             println!("new row!");
17//!         } else {
18//!             println!("cannot read next line");
19//!         }
20//!     }
21//! }
22//! ```
23//! 
//! `Row`, on the other hand, provides 3 methods to access csv columns:
25//! - `columns`: 
26//!   - iterator over columns.
27//!   - Iterator item is a `&str`, which means you only have to `parse()` it to the needed type and you're done
28//! 
29//!   ```rust
30//!   # let row = quick_csv::Csv::from_string("a,b,c,d,e,38,f").next().unwrap().unwrap();
31//!   let mut cols = row.columns().expect("cannot convert to utf8");
32//!   let fifth = cols.nth(5).unwrap().parse::<f64>().unwrap();
33//!   println!("Doubled fifth column: {}", fifth * 2.0);
34//!   ```
35//! 
36//! - `decode`:
//!   - deserialize into your `Decodable` struct, a-la rust-csv.
38//!   - most convenient way to deal with your csv data
39//! 
40//!   ```rust
41//!   let row = quick_csv::Csv::from_string("a,b,54").next().unwrap().unwrap();
42//!   if let Ok((col1, col2, col3)) = row.decode::<(String, u64, f64)>() {
43//!       println!("col1: '{}', col2: {}, col3: {}", col1, col2, col3);
44//!   }
45//!   ``` 
46//! 
47//! - `bytes_columns`:
48//!   - similar to `columns` but columns are of type `&[u8]`, which means you may want to convert it to &str first
49//!   - performance gain compared to `columns` is minimal, use it only if you *really* need to as it is less convenient
50
51#![deny(missing_docs)]
52
53extern crate rustc_serialize;
54
55pub mod columns;
56pub mod error;
57
58use self::columns::{Columns, BytesColumns};
59use std::fs::File;
60use std::io::{self, BufRead, BufReader};
61use std::iter::Iterator;
62use std::path::Path;
63
64use error::{Error, Result};
65use rustc_serialize::Decodable;
66
67#[cfg(test)] mod test;
68
/// Byte sequence of the UTF-8 byte-order mark; `try_consume_utf8_bom` skips it
/// when the input starts with these bytes.
const UTF8_BOM: &'static [u8] = b"\xef\xbb\xbf";
70
/// Csv reader
/// 
/// Iterates over the rows of the csv
///
/// # Example
///
/// ```rust
/// let csv = quick_csv::Csv::from_file("./examples/data/bench.csv").unwrap();
/// for row in csv.into_iter() {
///     let row = row.unwrap(); // unwrap result, panic if the row could not be read
///     {
///         // either use columns iterator directly (Item = &str)
///         if let Ok(mut columns) = row.columns() {
///             println!("Column 1: '{:?}', Column 2: '{:?}'", columns.next(), columns.next());
///         }
///     }
///
///     {
///         // or decode it directly into something simpler
///         if let Ok((col1, col2)) = row.decode::<(String, u64)>() {
///             println!("Column 1: '{:?}', Column 2: '{:?}'", &col1, &col2);
///         }
///     }
///
/// }
/// ```
pub struct Csv<B: BufRead> {
    /// byte separating two columns (`b','` unless changed with `delimiter`)
    delimiter: u8,
    /// buffered reader the rows are read from
    reader: B,
    /// whether the first row is to be treated as a header row
    has_header: bool,
    /// cached header names, filled the first time `headers` decodes the first row
    headers: Option<Vec<String>>,
    /// flexible column count: when `false`, a row whose column count differs
    /// from the recorded one yields `Error::ColumnMismatch`
    flexible: bool,
    /// column count, recorded from the first row read (`None` before that)
    len: Option<usize>,
    /// if was error, exit next: once set, the iterator only returns `None`
    exit: bool,
    /// line count: number of rows successfully returned so far
    current_line: usize,
}
115
116impl<B: BufRead> Csv<B> {
117
118    /// Creates a Csv from a generic BufReader
119    /// 
120    /// Note: default delimiter = ','
121    pub fn from_reader(mut reader: B) -> Csv<B> {
122        let result = try_consume_utf8_bom(&mut reader);
123
124        Csv {
125            reader: reader,
126            delimiter: b',',
127            has_header: false,
128            headers: None,
129            flexible: false,
130            len: None,
131            exit: result.is_err(),
132            current_line: 0,
133        }
134    }
135
136    /// Sets a new delimiter
137    pub fn delimiter(mut self, delimiter: u8) -> Csv<B> {
138        self.delimiter = delimiter;
139        self
140    }
141
142    /// Sets flexible columns
143    pub fn flexible(mut self, flexible: bool) -> Csv<B> {
144        self.flexible = flexible;
145        self
146    }
147
148    /// Defines whether there is a header or not
149    pub fn has_header(mut self, has_header: bool) -> Csv<B> {
150        self.has_header = has_header;
151        let _ = self.headers();
152        self
153    }
154
155   /// gets first row as Vec<String>
156    pub fn headers(&mut self) -> Vec<String> {
157        if let Some(ref h) = self.headers {
158            return h.clone();
159        }
160        if self.has_header {            
161            if let Some(r) = self.next() {
162                if let Ok(r) = r {
163                    let h = r.decode().ok().unwrap_or_else(Vec::new);
164                    self.headers = Some(h.clone());
165                    return h;
166                }
167            }
168        }
169        Vec::new()
170    }
171
172    /// Get column count
173    pub fn column_count(&self) -> Option<usize> {
174        self.len
175    }
176
177    /// Gets the current line number
178    ///
179    /// Useful if you get an error and want to investigate the source
180    pub fn current_line(&self) -> usize {
181        self.current_line
182    }
183
184}
185
186impl Csv<BufReader<File>> {
187    /// Creates a csv from a file path
188    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Csv<BufReader<File>>>
189    {
190        let reader = BufReader::new(try!(File::open(path)));
191        Ok(Csv::from_reader(reader))
192    }
193}
194
195impl<'a> Csv<&'a [u8]> {
196    /// Creates a CSV reader for an in memory string buffer.
197    pub fn from_string(s: &'a str) -> Csv<&'a [u8]> {
198        Csv::from_reader(s.as_bytes())
199    }
200}
201
202/// Iterator on csv `Row`s
203impl<B: BufRead> Iterator for Csv<B> {
204    type Item = Result<Row>;
205    fn next(&mut self) -> Option<Result<Row>> {
206        if self.exit { return None; }
207        let mut buf = Vec::new();
208        let mut cols = self.len.map_or_else(Vec::new, Vec::with_capacity);
209        match read_line(&mut self.reader, &mut buf, self.delimiter, &mut cols) {
210            Ok(0) => None,
211            Ok(_n) => {
212                if buf.ends_with(&[b'\r']) {
213                    buf.pop();
214                }
215                cols.push(buf.len());
216                let c = cols.len();
217                if let Some(n) = self.len {
218                    if n != c && !self.flexible {
219                        self.exit = true;
220                        return Some(Err(Error::ColumnMismatch(n, c)));
221                    }
222                } else {
223                    self.len = Some(c);
224                }
225                self.current_line += 1;
226                Some(Ok(Row {
227                    line: buf,
228                    cols: cols,
229                }))
230            }
231            Err(e) => {
232                self.exit = true;
233                Some(Err(e))
234            },
235        }
236    }
237}
238
/// Row struct used as Csv iterator Item
///
/// Row can be decoded into a Result<T: Decodable>
pub struct Row {
    // Raw bytes of the line as filled by `read_line`; the `Csv` iterator
    // removes one trailing `\r` before building the row.
    line: Vec<u8>,
    // Byte offsets into `line`: one entry per delimiter recorded by
    // `read_line`, followed by `line.len()` pushed by the `Csv` iterator.
    cols: Vec<usize>,
}
246
247impl Row {
248
249    /// Gets an iterator over columns
250    pub fn columns(&self) -> Result<Columns> {
251        match ::std::str::from_utf8(&self.line) {
252            Err(_) => Err(Error::Io(io::Error::new(io::ErrorKind::InvalidData,
253                                    "stream did not contain valid UTF-8"))),
254            Ok(s) => Ok(Columns::new(s, &self.cols)),
255        }
256    }
257
258    ///  Creates a new BytesColumns iterator over &[u8]
259    pub fn bytes_columns(&self) -> BytesColumns {
260        BytesColumns::new(&self.line, &self.cols)
261    }
262
263    /// Decode row into custom decodable type
264    pub fn decode<T: Decodable>(&self) -> Result<T> {
265        let mut columns = try!(self.columns());
266        Decodable::decode(&mut columns)
267    }
268
269    /// Gets columns count
270    pub fn len(&self) -> usize {
271        self.cols.len()
272    }
273
274    /// `Row` is empty if there is no columns
275    pub fn is_empty(&self) -> bool {
276        self.cols.is_empty()
277    }
278
279}
280
/// Consumes bytes as long as they are within quotes
/// manages "" as quote escape
///
/// The macro is expanded inline (it is invoked from `read_line`) and does not
/// evaluate to a value; it reports its outcome through its arguments:
/// - `$in_quote` is `false` afterwards if the closing quote was found
/// - `$in_quote` is `true` afterwards if the end of the buffer was reached
///   while still inside the quotes
/// - the enclosing function returns `Err(Error::UnescapedQuote)` if a quote is
///   followed by a byte that is neither a quote, `\r`, `\n` nor the delimiter
///
/// For every escaped quote (`""`), the bytes `$available[$start..i]` (up to
/// and including the first quote) are appended to `$buf`, `$start` is moved
/// past the second quote and `$quote_count` is incremented.
macro_rules! consume_quote {
    ($bytes: expr, $delimiter: expr, $in_quote: expr,
     $start: expr, $buf: expr, $available: expr, $quote_count: expr) => {
        // assume the quoted section will be closed within this buffer
        $in_quote = false;
        loop {
            match $bytes.next() {
                Some((_, &b'\"')) => {
                    // peek at the following byte without advancing the iterator
                    match $bytes.clone().next() {
                        Some((i, &b'\"')) => {
                            $bytes.next(); // escaping quote
                            // copy everything up to the first quote, drop the second one
                            $buf.extend_from_slice(&$available[$start..i]);
                            $start = i + 1;
                            $quote_count += 1;
                        },
                        // closing quote: end of buffer, end of line or end of column
                        None | Some((_, &b'\r')) | Some((_, &b'\n')) => break,
                        Some((_, d)) if *d == $delimiter => break,
                        Some((_, _)) => return Err(Error::UnescapedQuote),
                    }
                },
                None => {
                    // buffer exhausted while still inside the quotes
                    $in_quote = true;
                    break;
                },
                // any other byte inside the quotes is skipped here
                _ => (),
            }
        }
    }
}
315
/// Reads an entire line into memory
///
/// Bytes are pulled from `r` buffer by buffer until a `\n` is found outside
/// of quotes or the reader has no more data.
///
/// - `r`: reader the line is read from
/// - `buf`: receives the bytes of the line, without the terminating `\n`
///   (a preceding `\r` is kept) and without the second quote of each escaped
///   `""` pair
/// - `delimiter`: byte separating two columns
/// - `cols`: receives one offset per delimiter found outside of quotes,
///   computed as `read + i - quote_count`
///
/// Returns the number of bytes consumed from `r` (`Ok(0)` if the reader had
/// no data left on the first attempt).
///
/// Errors:
/// - `Error::UnexpextedQuote` if a quote is met that is neither at index 0 of
///   the current buffer nor right after a delimiter
/// - `Error::UnescapedQuote` from `consume_quote!`
/// - any I/O error other than `Interrupted`, converted with `Error::from`
fn read_line<R: BufRead>(r: &mut R, buf: &mut Vec<u8>,
                         delimiter: u8, cols: &mut Vec<usize>) -> Result<usize>
{
    // total number of bytes consumed from the reader so far
    let mut read = 0;
    // true when a buffer ended while still inside a quoted section
    let mut in_quote = false;
    let mut done = false;
    // number of escaped-quote bytes dropped from `buf`, used to correct offsets
    let mut quote_count = 0;
    while !done {
        let used = {
            let available = match r.fill_buf() {
                // no more data: return what has been read so far
                Ok(n) if n.is_empty() => return Ok(read),
                Ok(n) => n,
                // interrupted reads are simply retried
                Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                Err(e) => return Err(Error::from(e)),
            };

            let mut bytes = available.iter().enumerate();
            // index in `available` of the first byte not yet copied into `buf`
            let mut start = 0;

            // previous buffer was exhausted without exiting from quotes
            if in_quote {
                consume_quote!(bytes, delimiter, in_quote, start, buf, available, quote_count);
            }

            // use a simple loop instead of for loop to allow nested loop
            let used: usize;
            loop {
                match bytes.next() {
                    Some((i, &b'\"')) => {
                        // NOTE(review): `i == 0` also accepts a quote at the start of a
                        // refilled buffer, whatever byte ended the previous buffer.
                        if i == 0 || available[i - 1] == delimiter {
                            consume_quote!(bytes, delimiter, in_quote, start, buf, available, quote_count);
                        } else {
                            return Err(Error::UnexpextedQuote);
                        }
                    },
                    Some((i, &b'\n')) => {
                        // end of line: consume the `\n` but do not copy it
                        done = true;
                        used = i + 1;
                        buf.extend_from_slice(&available[start..i]);
                        break;
                    },
                    Some((i, &d)) => {
                        if d == delimiter { cols.push(read + i - quote_count); }
                    },
                    None => {
                        // buffer exhausted before the end of the line: keep it all
                        used = available.len();
                        buf.extend_from_slice(&available[start..used]);
                        break;
                    },
                }
            }
            used
        };
        r.consume(used);
        read += used;
    }
    Ok(read)
}
375
376fn try_consume_utf8_bom<B: BufRead>(reader: &mut B) -> Result<()> {
377    if try!(reader.fill_buf()).starts_with(UTF8_BOM) {
378        reader.consume(UTF8_BOM.len());
379    }
380
381    Ok(())
382}