Skip to main content

csv_sniffer/
sniffer.rs

1use std::collections::HashMap;
2use std::fs::File;
3use std::io::{Read, Seek, SeekFrom};
4use std::path::Path;
5
6use csv::{self, Reader, StringRecord};
7use csv_core as csvc;
8use regex::Regex;
9
10use chain::*;
11use error::*;
12use field_type::{get_best_types, infer_record_types, infer_types, Type, TypeGuesses};
13use metadata::*;
14use sample::{take_sample_from_start, SampleIter, SampleSize};
15use snip::snip_preamble;
16
/// A CSV sniffer.
///
/// The sniffer examines a CSV file, passed in either through a file or a reader.
///
/// Each dialect field is an `Option`: `None` means "not known yet", `Some` holds a value that was
/// either supplied through one of the setter methods or filled in by a sniffing pass.
#[derive(Debug, Default)]
pub struct Sniffer {
    // CSV file dialect guesses
    // Field delimiter byte.
    delimiter: Option<u8>,
    // Number of rows to skip before the CSV data starts.
    num_preamble_rows: Option<usize>,
    // Whether the first row after the preamble is a header row.
    has_header_row: Option<bool>,
    // Quote character, or `Quote::None` for unquoted data.
    quote: Option<Quote>,
    // Whether the file is read with a flexible (non-strict) reader.
    flexible: Option<bool>,

    // Metadata guesses
    // Number of delimiters per record; the reported field count is this value plus one.
    delimiter_freq: Option<usize>,
    // Inferred field types, copied into the returned `Metadata`.
    types: Vec<Type>,

    // sample size to sniff
    sample_size: Option<SampleSize>,
}
36impl Sniffer {
37    /// Create a new CSV sniffer.
38    pub fn new() -> Sniffer {
39        Sniffer::default()
40    }
41    /// Specify the delimiter character.
42    pub fn delimiter(&mut self, delimiter: u8) -> &mut Sniffer {
43        self.delimiter = Some(delimiter);
44        self
45    }
46    /// Specify the header type (whether the CSV file has a header row, and where the data starts).
47    pub fn header(&mut self, header: Header) -> &mut Sniffer {
48        self.num_preamble_rows = Some(header.num_preamble_rows);
49        self.has_header_row = Some(header.has_header_row);
50        self
51    }
52    /// Specify the quote character (if any), and whether two quotes in a row as to be interepreted
53    /// as an escaped quote.
54    pub fn quote(&mut self, quote: Quote) -> &mut Sniffer {
55        self.quote = Some(quote);
56        self
57    }
58
59    /// The size of the sample to examine while sniffing. If using `SampleSize::Records`, the
60    /// sniffer will use the `Terminator::CRLF` as record separator.
61    ///
62    /// The sample size defaults to `SampleSize::Bytes(4096)`.
63    pub fn sample_size(&mut self, sample_size: SampleSize) -> &mut Sniffer {
64        self.sample_size = Some(sample_size);
65        self
66    }
67
68    fn get_sample_size(&self) -> SampleSize {
69        self.sample_size.unwrap_or(SampleSize::Bytes(1 << 14))
70    }
71
72    /// Sniff the CSV file located at the provided path, and return a `Reader` (from the
73    /// [`csv`](https://docs.rs/csv) crate) ready to ready the file.
74    ///
75    /// Fails on file opening or readering errors, or on an error examining the file.
76    pub fn open_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Reader<File>> {
77        self.open_reader(File::open(path)?)
78    }
79    /// Sniff the CSV file provided by the reader, and return a [`csv`](https://docs.rs/csv)
80    /// `Reader` object.
81    ///
82    /// Fails on file opening or readering errors, or on an error examining the file.
83    pub fn open_reader<R: Read + Seek>(&mut self, mut reader: R) -> Result<Reader<R>> {
84        let metadata = self.sniff_reader(&mut reader)?;
85        reader.seek(SeekFrom::Start(0))?;
86        metadata.dialect.open_reader(reader)
87    }
88
89    /// Sniff the CSV file located at the provided path, and return a
90    /// [`Metadata`](struct.Metadata.html) object containing information about the CSV file.
91    ///
92    /// Fails on file opening or readering errors, or on an error examining the file.
93    pub fn sniff_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Metadata> {
94        let file = File::open(path)?;
95        self.sniff_reader(&file)
96    }
97    /// Sniff the CSV file provider by the reader, and return a
98    /// [`Metadata`](struct.Metadata.html) object containing information about the CSV file.
99    ///
100    /// Fails on file opening or readering errors, or on an error examining the file.
101    pub fn sniff_reader<R: Read + Seek>(&mut self, mut reader: R) -> Result<Metadata> {
102        // guess quotes & delim
103        self.infer_quotes_delim(&mut reader)?;
104
105        // if we have a delimiter, we just need to search for num_preamble_rows and check for
106        // flexible. Otherwise, we need to guess a delimiter as well.
107        if self.delimiter.is_some() {
108            self.infer_preamble_known_delim(&mut reader)?;
109        } else {
110            self.infer_delim_preamble(&mut reader)?;
111        }
112
113        self.infer_types(&mut reader)?;
114
115        // as this point of the process, we should have all these filled in.
116        assert!(
117            self.delimiter.is_some()
118                && self.num_preamble_rows.is_some()
119                && self.quote.is_some()
120                && self.flexible.is_some()
121                && self.delimiter_freq.is_some()
122                && self.has_header_row.is_some()
123        );
124        Ok(Metadata {
125            dialect: Dialect {
126                delimiter: self.delimiter.unwrap(),
127                header: Header {
128                    num_preamble_rows: self.num_preamble_rows.unwrap(),
129                    has_header_row: self.has_header_row.unwrap(),
130                },
131                quote: self.quote.clone().unwrap(),
132                flexible: self.flexible.unwrap(),
133            },
134            num_fields: self.delimiter_freq.unwrap() + 1,
135            types: self.types.clone(),
136        })
137    }
138
139    // Infers quotes and delimiter from quoted (or possibly quoted) files. If quotes detected,
140    // updates self.quote and self.delimiter. If quotes not detected, updates self.quote to
141    // Quote::None. Only valid quote characters: " (double-quote), ' (single-quote), ` (back-tick).
142    fn infer_quotes_delim<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
143        if let (&Some(_), &Some(_)) = (&self.quote, &self.delimiter) {
144            // nothing let to infer!
145            return Ok(());
146        }
147        let quote_guesses = match self.quote {
148            Some(Quote::Some(chr)) => vec![chr],
149            Some(Quote::None) => {
150                // this function only checks quoted (or possibly quoted) files, nothing left to
151                // do if we know there are no quotes
152                return Ok(());
153            }
154            None => vec![b'\'', b'"', b'`'],
155        };
156        // TODO: this can probably be replaced with a try_fold whenever that leaves nightly
157        let (quote_chr, (quote_cnt, delim_guess)) = quote_guesses.iter().fold(
158            Ok((b'"', (0, b'\0'))),
159            |acc: Result<(u8, (usize, u8))>, &chr| {
160                if let Ok(acc) = acc {
161                    let mut sample_reader = take_sample_from_start(reader, self.get_sample_size())?;
162                    if let Some((cnt, delim_chr)) =
163                        quote_count(&mut sample_reader, char::from(chr), &self.delimiter)?
164                    {
165                        Ok(if cnt > (acc.1).0 {
166                            (chr, (cnt, delim_chr))
167                        } else {
168                            acc
169                        })
170                    } else {
171                        Ok(acc)
172                    }
173                } else {
174                    acc
175                }
176            },
177        )?;
178        if quote_cnt == 0 {
179            self.quote = Some(Quote::None);
180        } else {
181            self.quote = Some(Quote::Some(quote_chr));
182            self.delimiter = Some(delim_guess);
183        };
184        Ok(())
185    }
186
187    // Updates delimiter frequency, number of preamble rows, and flexible boolean.
188    fn infer_preamble_known_delim<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
189        // prerequisites for calling this function:
190        assert!(self.delimiter.is_some() && self.quote.is_some());
191        // unwraps for delimiter and quote are safe
192        let (quote, delim) = (self.quote.clone().unwrap(), self.delimiter.unwrap());
193
194        let sample_iter = take_sample_from_start(reader, self.get_sample_size())?;
195
196        let mut chain = Chain::default();
197
198        if let Quote::Some(character) = quote {
199            // since we have a quote, we need to run this data through the csv_core::Reader (which
200            // properly escapes quoted fields
201            let mut csv_reader = csvc::ReaderBuilder::new()
202                .delimiter(delim)
203                .quote(character)
204                .build();
205
206            let mut output = vec![];
207            let mut ends = vec![];
208            for line in sample_iter {
209                let line = line?;
210                if line.len() > output.len() {
211                    output.resize(line.len(), 0);
212                }
213                if line.len() > ends.len() {
214                    ends.resize(line.len(), 0);
215                }
216                let (result, _, _, n_ends) =
217                    csv_reader.read_record(line.as_bytes(), &mut output, &mut ends);
218                // check to make sure record was read correctly
219                match result {
220                    csvc::ReadRecordResult::OutputFull | csvc::ReadRecordResult::OutputEndsFull => {
221                        return Err(SnifferError::SniffingFailed(format!(
222                            "failure to read quoted CSV record: {:?}",
223                            result
224                        )));
225                    }
226                    _ => {} // non-error results, do nothing
227                }
228                // n_ends is the number of barries between fields, so it's the same as the number
229                // of delimiters
230                chain.add_observation(n_ends);
231            }
232        } else {
233            for line in sample_iter {
234                let line = line?;
235                let freq = line.as_bytes().iter().filter(|&&c| c == delim).count();
236                chain.add_observation(freq);
237            }
238        }
239        self.run_chains(vec![chain])
240    }
241
242    // Updates delimiter, delimiter frequency, number of preamble rows, and flexible boolean.
243    fn infer_delim_preamble<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
244        let sample_iter = take_sample_from_start(reader, self.get_sample_size())?;
245
246        const NUM_ASCII_CHARS: usize = 128;
247        let mut chains = vec![Chain::default(); NUM_ASCII_CHARS];
248        for line in sample_iter {
249            let line = line?;
250            let mut freqs = [0; NUM_ASCII_CHARS];
251            for &chr in line.as_bytes() {
252                if chr < NUM_ASCII_CHARS as u8 {
253                    freqs[chr as usize] += 1;
254                }
255            }
256            for (chr, &freq) in freqs.iter().enumerate() {
257                chains[chr as usize].add_observation(freq);
258            }
259        }
260
261        self.run_chains(chains)
262    }
263
264    // Updates delimiter (if not already known), delimiter frequency, number of preamble rows, and
265    // flexible boolean.
266    fn run_chains(&mut self, mut chains: Vec<Chain>) -> Result<()> {
267        // Find the 'best' delimiter: choose strict (non-flexible) delimiters over flexible ones,
268        // and choose the one that had the highest probability markov chain in the end.
269        //
270        // In the case where delim is already known, 'best_delim' will be incorrect (since it won't
271        // correspond with position in a vector of Chains), but we'll just ignore it when
272        // constructing our return value later. 'best_state' and 'path' are necessary, though, to
273        // compute the preamble rows.
274        let (best_delim, delim_freq, best_state, path, _) = chains.iter_mut().enumerate().fold(
275            (b',', 0, STATE_UNSTEADY, vec![], 0.0),
276            |acc, (i, ref mut chain)| {
277                let (_, _, best_state, _, best_state_prob) = acc;
278                let ViterbiResults {
279                    max_delim_freq,
280                    path,
281                } = chain.viterbi();
282                let (final_state, final_viter) = path[path.len() - 1];
283                if final_state < best_state
284                    || (final_state == best_state && final_viter.prob > best_state_prob)
285                {
286                    (i as u8, max_delim_freq, final_state, path, final_viter.prob)
287                } else {
288                    acc
289                }
290            },
291        );
292        self.flexible = Some(match best_state {
293            STATE_STEADYSTRICT => false,
294            STATE_STEADYFLEX => true,
295            _ => {
296                return Err(SnifferError::SniffingFailed(
297                    "unable to find valid delimiter".to_string(),
298                ));
299            }
300        });
301
302        // Find the number of preamble rows (the number of rows during which the state fluctuated
303        // before getting to the final state).
304        let mut num_preamble_rows = 0;
305        // since path has an extra state as the beginning, skip one
306        for &(state, _) in path.iter().skip(1) {
307            if state == best_state {
308                break;
309            }
310            num_preamble_rows += 1;
311        }
312        if self.delimiter.is_none() {
313            self.delimiter = Some(best_delim);
314        }
315        self.delimiter_freq = Some(delim_freq);
316        self.num_preamble_rows = Some(num_preamble_rows);
317        Ok(())
318    }
319
320    fn infer_types<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
321        // prerequisites for calling this function:
322        assert!(self.delimiter_freq.is_some());
323        // unwrap is safe
324        let field_count = self.delimiter_freq.unwrap() + 1;
325
326        let mut csv_reader = self.create_csv_reader(reader)?;
327        let mut records_iter = csv_reader.records();
328        let mut n_bytes = 0;
329        let mut n_records = 0;
330        let sample_size = self.get_sample_size();
331
332        // Infer types for the top row. We'll save this set of types to check against the types
333        // of the remaining rows to see if this is part of the data or a separate header row.
334        let header_row_types = match records_iter.next() {
335            Some(record) => {
336                let record = record?;
337                n_records += 1;
338                n_bytes += count_bytes(&record);
339                infer_record_types(&record)
340            }
341            None => {
342                return Err(SnifferError::SniffingFailed(
343                    "CSV empty (after preamble)".into(),
344                ));
345            }
346        };
347        let mut row_types = vec![TypeGuesses::all(); field_count];
348
349        for record in records_iter {
350            let record = record?;
351            for (i, field) in record.iter().enumerate() {
352                row_types[i] &= infer_types(field);
353            }
354            n_records += 1;
355            n_bytes += count_bytes(&record);
356            // break if we pass sample size limits
357            match sample_size {
358                SampleSize::Records(recs) => {
359                    if n_records > recs {
360                        break;
361                    }
362                }
363                SampleSize::Bytes(bytes) => {
364                    if n_bytes > bytes {
365                        break;
366                    }
367                }
368                SampleSize::All => {}
369            }
370        }
371        if n_records == 1 {
372            // there's only one row in the whole data file (the top row already parsed),
373            // so we're going to assume it's a data row, not a header row.
374            self.has_header_row = Some(false);
375            self.types = get_best_types(header_row_types);
376            return Ok(());
377        }
378
379        if header_row_types
380            .iter()
381            .zip(&row_types)
382            .any(|(header, data)| !data.allows(header))
383        {
384            self.has_header_row = Some(true);
385        } else {
386            self.has_header_row = Some(false);
387        }
388
389        self.types = get_best_types(row_types);
390        Ok(())
391    }
392
393    fn create_csv_reader<'a, R: Read + Seek>(
394        &self,
395        mut reader: &'a mut R,
396    ) -> Result<Reader<&'a mut R>> {
397        reader.seek(SeekFrom::Start(0))?;
398        if let Some(num_preamble_rows) = self.num_preamble_rows {
399            snip_preamble(&mut reader, num_preamble_rows)?;
400        }
401
402        let mut builder = csv::ReaderBuilder::new();
403        if let Some(delim) = self.delimiter {
404            builder.delimiter(delim);
405        }
406        if let Some(has_header_row) = self.has_header_row {
407            builder.has_headers(has_header_row);
408        }
409        match self.quote {
410            Some(Quote::Some(chr)) => {
411                builder.quoting(true);
412                builder.quote(chr);
413            }
414            Some(Quote::None) => {
415                builder.quoting(false);
416            }
417            _ => {}
418        }
419        if let Some(flexible) = self.flexible {
420            builder.flexible(flexible);
421        }
422
423        Ok(builder.from_reader(reader))
424    }
425}
426
427fn quote_count<R: Read>(
428    sample_iter: &mut SampleIter<R>,
429    character: char,
430    delim: &Option<u8>,
431) -> Result<Option<(usize, u8)>> {
432    let pattern = match *delim {
433        Some(delim) => format!(r#"{}\s*?{}\s*{}"#, character, delim, character),
434        None => format!(r#"{}\s*?(?P<delim>[^\w\n'"`])\s*{}"#, character, character),
435    };
436    let re = Regex::new(&pattern).unwrap();
437
438    // TODO: a hashmap isn't an ideal choice for this, I believe (since it requires a linear
439    // search of the values at the end). Consider other options
440    let mut delim_count_map: HashMap<String, usize> = HashMap::new();
441    let mut count = 0;
442    for line in sample_iter {
443        let line = line?;
444        for cap in re.captures_iter(&line) {
445            count += 1;
446            // if we already know delimiter, we don't need to count
447            if delim.is_some() {
448            } else {
449                *delim_count_map.entry(cap["delim"].to_string()).or_insert(0) += 1;
450            }
451        }
452    }
453    if count == 0 {
454        return Ok(None);
455    }
456
457    // if we already know delimiter, no need to go through map
458    if let Some(delim) = *delim {
459        return Ok(Some((count, delim)));
460    }
461
462    // find the highest-count delimiter in the map
463    let (delim_count, delim) =
464        delim_count_map
465            .iter()
466            .fold((0, b'\0'), |acc, (delim, &delim_count)| {
467                assert!(delim.len() == 1);
468                if delim_count > acc.0 {
469                    (delim_count, (delim.as_ref() as &[u8])[0])
470                } else {
471                    acc
472                }
473            });
474
475    // delim_count should be nonzero; delim should always match at least something
476    assert_ne!(delim_count, 0, "invalid regex match: no delimiter found");
477    Ok(Some((count, delim)))
478}
479
480fn count_bytes(record: &StringRecord) -> usize {
481    record.iter().fold(0, |acc, field| acc + field.len())
482}