1use std::collections::HashMap;
2use std::fs::File;
3use std::io::{Read, Seek, SeekFrom};
4use std::path::Path;
5
6use csv::{self, Reader, StringRecord};
7use csv_core as csvc;
8use regex::Regex;
9
10use chain::*;
11use error::*;
12use field_type::{get_best_types, infer_record_types, infer_types, Type, TypeGuesses};
13use metadata::*;
14use sample::{take_sample_from_start, SampleIter, SampleSize};
15use snip::snip_preamble;
16
/// CSV dialect sniffer.
///
/// Holds both caller-supplied hints (set through the builder-style setters) and the values
/// inferred while sniffing. Every `Option` field starts out as `None` via the derived `Default`
/// and is filled in either by a setter or by one of the inference passes.
#[derive(Debug, Default)]
pub struct Sniffer {
    /// Field delimiter byte; set by `delimiter()` or inferred during sniffing.
    delimiter: Option<u8>,
    /// Number of rows to skip before the CSV proper; set by `header()` or by `run_chains`.
    num_preamble_rows: Option<usize>,
    /// Whether the first post-preamble row is a header; set by `header()` and (re)assigned by
    /// `infer_types`.
    has_header_row: Option<bool>,
    /// Quoting behaviour; set by `quote()` or by `infer_quotes_delim`.
    quote: Option<Quote>,
    /// Whether records may have differing field counts; assigned by `run_chains`.
    flexible: Option<bool>,

    /// Delimiter frequency reported by the winning chain in `run_chains`; the field count is
    /// derived from it as `delimiter_freq + 1`.
    delimiter_freq: Option<usize>,
    /// Column types chosen by `infer_types`; empty until that pass has run.
    types: Vec<Type>,

    /// How much of the input to sample; when `None`, `get_sample_size` falls back to
    /// `SampleSize::Bytes(1 << 14)`.
    sample_size: Option<SampleSize>,
}
36impl Sniffer {
37 pub fn new() -> Sniffer {
39 Sniffer::default()
40 }
41 pub fn delimiter(&mut self, delimiter: u8) -> &mut Sniffer {
43 self.delimiter = Some(delimiter);
44 self
45 }
46 pub fn header(&mut self, header: Header) -> &mut Sniffer {
48 self.num_preamble_rows = Some(header.num_preamble_rows);
49 self.has_header_row = Some(header.has_header_row);
50 self
51 }
52 pub fn quote(&mut self, quote: Quote) -> &mut Sniffer {
55 self.quote = Some(quote);
56 self
57 }
58
59 pub fn sample_size(&mut self, sample_size: SampleSize) -> &mut Sniffer {
64 self.sample_size = Some(sample_size);
65 self
66 }
67
68 fn get_sample_size(&self) -> SampleSize {
69 self.sample_size.unwrap_or(SampleSize::Bytes(1 << 14))
70 }
71
72 pub fn open_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Reader<File>> {
77 self.open_reader(File::open(path)?)
78 }
79 pub fn open_reader<R: Read + Seek>(&mut self, mut reader: R) -> Result<Reader<R>> {
84 let metadata = self.sniff_reader(&mut reader)?;
85 reader.seek(SeekFrom::Start(0))?;
86 metadata.dialect.open_reader(reader)
87 }
88
89 pub fn sniff_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Metadata> {
94 let file = File::open(path)?;
95 self.sniff_reader(&file)
96 }
97 pub fn sniff_reader<R: Read + Seek>(&mut self, mut reader: R) -> Result<Metadata> {
102 self.infer_quotes_delim(&mut reader)?;
104
105 if self.delimiter.is_some() {
108 self.infer_preamble_known_delim(&mut reader)?;
109 } else {
110 self.infer_delim_preamble(&mut reader)?;
111 }
112
113 self.infer_types(&mut reader)?;
114
115 assert!(
117 self.delimiter.is_some()
118 && self.num_preamble_rows.is_some()
119 && self.quote.is_some()
120 && self.flexible.is_some()
121 && self.delimiter_freq.is_some()
122 && self.has_header_row.is_some()
123 );
124 Ok(Metadata {
125 dialect: Dialect {
126 delimiter: self.delimiter.unwrap(),
127 header: Header {
128 num_preamble_rows: self.num_preamble_rows.unwrap(),
129 has_header_row: self.has_header_row.unwrap(),
130 },
131 quote: self.quote.clone().unwrap(),
132 flexible: self.flexible.unwrap(),
133 },
134 num_fields: self.delimiter_freq.unwrap() + 1,
135 types: self.types.clone(),
136 })
137 }
138
139 fn infer_quotes_delim<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
143 if let (&Some(_), &Some(_)) = (&self.quote, &self.delimiter) {
144 return Ok(());
146 }
147 let quote_guesses = match self.quote {
148 Some(Quote::Some(chr)) => vec![chr],
149 Some(Quote::None) => {
150 return Ok(());
153 }
154 None => vec![b'\'', b'"', b'`'],
155 };
156 let (quote_chr, (quote_cnt, delim_guess)) = quote_guesses.iter().fold(
158 Ok((b'"', (0, b'\0'))),
159 |acc: Result<(u8, (usize, u8))>, &chr| {
160 if let Ok(acc) = acc {
161 let mut sample_reader = take_sample_from_start(reader, self.get_sample_size())?;
162 if let Some((cnt, delim_chr)) =
163 quote_count(&mut sample_reader, char::from(chr), &self.delimiter)?
164 {
165 Ok(if cnt > (acc.1).0 {
166 (chr, (cnt, delim_chr))
167 } else {
168 acc
169 })
170 } else {
171 Ok(acc)
172 }
173 } else {
174 acc
175 }
176 },
177 )?;
178 if quote_cnt == 0 {
179 self.quote = Some(Quote::None);
180 } else {
181 self.quote = Some(Quote::Some(quote_chr));
182 self.delimiter = Some(delim_guess);
183 };
184 Ok(())
185 }
186
187 fn infer_preamble_known_delim<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
189 assert!(self.delimiter.is_some() && self.quote.is_some());
191 let (quote, delim) = (self.quote.clone().unwrap(), self.delimiter.unwrap());
193
194 let sample_iter = take_sample_from_start(reader, self.get_sample_size())?;
195
196 let mut chain = Chain::default();
197
198 if let Quote::Some(character) = quote {
199 let mut csv_reader = csvc::ReaderBuilder::new()
202 .delimiter(delim)
203 .quote(character)
204 .build();
205
206 let mut output = vec![];
207 let mut ends = vec![];
208 for line in sample_iter {
209 let line = line?;
210 if line.len() > output.len() {
211 output.resize(line.len(), 0);
212 }
213 if line.len() > ends.len() {
214 ends.resize(line.len(), 0);
215 }
216 let (result, _, _, n_ends) =
217 csv_reader.read_record(line.as_bytes(), &mut output, &mut ends);
218 match result {
220 csvc::ReadRecordResult::OutputFull | csvc::ReadRecordResult::OutputEndsFull => {
221 return Err(SnifferError::SniffingFailed(format!(
222 "failure to read quoted CSV record: {:?}",
223 result
224 )));
225 }
226 _ => {} }
228 chain.add_observation(n_ends);
231 }
232 } else {
233 for line in sample_iter {
234 let line = line?;
235 let freq = line.as_bytes().iter().filter(|&&c| c == delim).count();
236 chain.add_observation(freq);
237 }
238 }
239 self.run_chains(vec![chain])
240 }
241
242 fn infer_delim_preamble<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
244 let sample_iter = take_sample_from_start(reader, self.get_sample_size())?;
245
246 const NUM_ASCII_CHARS: usize = 128;
247 let mut chains = vec![Chain::default(); NUM_ASCII_CHARS];
248 for line in sample_iter {
249 let line = line?;
250 let mut freqs = [0; NUM_ASCII_CHARS];
251 for &chr in line.as_bytes() {
252 if chr < NUM_ASCII_CHARS as u8 {
253 freqs[chr as usize] += 1;
254 }
255 }
256 for (chr, &freq) in freqs.iter().enumerate() {
257 chains[chr as usize].add_observation(freq);
258 }
259 }
260
261 self.run_chains(chains)
262 }
263
264 fn run_chains(&mut self, mut chains: Vec<Chain>) -> Result<()> {
267 let (best_delim, delim_freq, best_state, path, _) = chains.iter_mut().enumerate().fold(
275 (b',', 0, STATE_UNSTEADY, vec![], 0.0),
276 |acc, (i, ref mut chain)| {
277 let (_, _, best_state, _, best_state_prob) = acc;
278 let ViterbiResults {
279 max_delim_freq,
280 path,
281 } = chain.viterbi();
282 let (final_state, final_viter) = path[path.len() - 1];
283 if final_state < best_state
284 || (final_state == best_state && final_viter.prob > best_state_prob)
285 {
286 (i as u8, max_delim_freq, final_state, path, final_viter.prob)
287 } else {
288 acc
289 }
290 },
291 );
292 self.flexible = Some(match best_state {
293 STATE_STEADYSTRICT => false,
294 STATE_STEADYFLEX => true,
295 _ => {
296 return Err(SnifferError::SniffingFailed(
297 "unable to find valid delimiter".to_string(),
298 ));
299 }
300 });
301
302 let mut num_preamble_rows = 0;
305 for &(state, _) in path.iter().skip(1) {
307 if state == best_state {
308 break;
309 }
310 num_preamble_rows += 1;
311 }
312 if self.delimiter.is_none() {
313 self.delimiter = Some(best_delim);
314 }
315 self.delimiter_freq = Some(delim_freq);
316 self.num_preamble_rows = Some(num_preamble_rows);
317 Ok(())
318 }
319
320 fn infer_types<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
321 assert!(self.delimiter_freq.is_some());
323 let field_count = self.delimiter_freq.unwrap() + 1;
325
326 let mut csv_reader = self.create_csv_reader(reader)?;
327 let mut records_iter = csv_reader.records();
328 let mut n_bytes = 0;
329 let mut n_records = 0;
330 let sample_size = self.get_sample_size();
331
332 let header_row_types = match records_iter.next() {
335 Some(record) => {
336 let record = record?;
337 n_records += 1;
338 n_bytes += count_bytes(&record);
339 infer_record_types(&record)
340 }
341 None => {
342 return Err(SnifferError::SniffingFailed(
343 "CSV empty (after preamble)".into(),
344 ));
345 }
346 };
347 let mut row_types = vec![TypeGuesses::all(); field_count];
348
349 for record in records_iter {
350 let record = record?;
351 for (i, field) in record.iter().enumerate() {
352 row_types[i] &= infer_types(field);
353 }
354 n_records += 1;
355 n_bytes += count_bytes(&record);
356 match sample_size {
358 SampleSize::Records(recs) => {
359 if n_records > recs {
360 break;
361 }
362 }
363 SampleSize::Bytes(bytes) => {
364 if n_bytes > bytes {
365 break;
366 }
367 }
368 SampleSize::All => {}
369 }
370 }
371 if n_records == 1 {
372 self.has_header_row = Some(false);
375 self.types = get_best_types(header_row_types);
376 return Ok(());
377 }
378
379 if header_row_types
380 .iter()
381 .zip(&row_types)
382 .any(|(header, data)| !data.allows(header))
383 {
384 self.has_header_row = Some(true);
385 } else {
386 self.has_header_row = Some(false);
387 }
388
389 self.types = get_best_types(row_types);
390 Ok(())
391 }
392
393 fn create_csv_reader<'a, R: Read + Seek>(
394 &self,
395 mut reader: &'a mut R,
396 ) -> Result<Reader<&'a mut R>> {
397 reader.seek(SeekFrom::Start(0))?;
398 if let Some(num_preamble_rows) = self.num_preamble_rows {
399 snip_preamble(&mut reader, num_preamble_rows)?;
400 }
401
402 let mut builder = csv::ReaderBuilder::new();
403 if let Some(delim) = self.delimiter {
404 builder.delimiter(delim);
405 }
406 if let Some(has_header_row) = self.has_header_row {
407 builder.has_headers(has_header_row);
408 }
409 match self.quote {
410 Some(Quote::Some(chr)) => {
411 builder.quoting(true);
412 builder.quote(chr);
413 }
414 Some(Quote::None) => {
415 builder.quoting(false);
416 }
417 _ => {}
418 }
419 if let Some(flexible) = self.flexible {
420 builder.flexible(flexible);
421 }
422
423 Ok(builder.from_reader(reader))
424 }
425}
426
427fn quote_count<R: Read>(
428 sample_iter: &mut SampleIter<R>,
429 character: char,
430 delim: &Option<u8>,
431) -> Result<Option<(usize, u8)>> {
432 let pattern = match *delim {
433 Some(delim) => format!(r#"{}\s*?{}\s*{}"#, character, delim, character),
434 None => format!(r#"{}\s*?(?P<delim>[^\w\n'"`])\s*{}"#, character, character),
435 };
436 let re = Regex::new(&pattern).unwrap();
437
438 let mut delim_count_map: HashMap<String, usize> = HashMap::new();
441 let mut count = 0;
442 for line in sample_iter {
443 let line = line?;
444 for cap in re.captures_iter(&line) {
445 count += 1;
446 if delim.is_some() {
448 } else {
449 *delim_count_map.entry(cap["delim"].to_string()).or_insert(0) += 1;
450 }
451 }
452 }
453 if count == 0 {
454 return Ok(None);
455 }
456
457 if let Some(delim) = *delim {
459 return Ok(Some((count, delim)));
460 }
461
462 let (delim_count, delim) =
464 delim_count_map
465 .iter()
466 .fold((0, b'\0'), |acc, (delim, &delim_count)| {
467 assert!(delim.len() == 1);
468 if delim_count > acc.0 {
469 (delim_count, (delim.as_ref() as &[u8])[0])
470 } else {
471 acc
472 }
473 });
474
475 assert_ne!(delim_count, 0, "invalid regex match: no delimiter found");
477 Ok(Some((count, delim)))
478}
479
480fn count_bytes(record: &StringRecord) -> usize {
481 record.iter().fold(0, |acc, field| acc + field.len())
482}