csv_scout/sniffer.rs
1use hashbrown::HashMap;
2use itertools::Itertools;
3use std::cell::RefCell;
4use std::fs::File;
5use std::io::{Read, Seek, SeekFrom};
6use std::path::Path;
7
8use csv::Reader;
9use csv_core as csvc;
10use regex::{Captures, Regex};
11
12use crate::{
13 chain::{Chain, STATE_STEADYFLEX, STATE_STEADYSTRICT, STATE_UNSTEADY, ViterbiResults},
14 error::{Result, SnifferError},
15 // field_type::DatePreference,
16 metadata::{Dialect, Metadata, Quote},
17 sample::{SampleIter, SampleSize, take_sample_from_start},
18};
19
/// Number of times a candidate delimiter occurs on one sampled line.
type NumberOfOccurrences = u32;
/// Number of sampled lines that share a given per-line occurrence count.
type NumberOfLines = u32;
/// Number of lines whose occurrence count lies within `TOLERANCE` of the modal count.
type AdjacentFrequency = u32;

/// Maximum distance from the modal occurrence count that still contributes to a
/// candidate delimiter's score.
const TOLERANCE: u32 = 1;
/// Size of the ASCII range; used to size the per-byte tables (one slot per ASCII byte).
const NUM_ASCII_CHARS: usize = 128;
/// Bytes tried as delimiters when none was supplied: tab, comma, semicolon, pipe, colon.
const CANDIDATES: &[u8] = b"\t,;|:";

// Per-thread flag: reset to `true` at the start of every sniff and copied into
// `Sniffer::is_utf8` at the end. Nothing in this file sets it to `false`;
// presumably the sampling code does when it meets invalid UTF-8 (TODO confirm).
thread_local! (pub static IS_UTF8: RefCell<bool> = const { RefCell::new(true) });
29// thread_local! (pub static DATE_PREFERENCE: RefCell<DatePreference> = const { RefCell::new(DatePreference::MdyFormat) });
30
31/// A CSV sniffer.
32///
33/// The sniffer examines a CSV file, passed in either through a file or a reader.
34#[derive(Debug, Default)]
35pub struct Sniffer {
36 // CSV file dialect guesses
37 delimiter: Option<u8>,
38 // num_preamble_rows: Option<usize>,
39 // has_header_row: Option<bool>,
40 quote: Option<Quote>,
41 flexible: Option<bool>,
42 is_utf8: Option<bool>,
43
44 // Metadata guesses
45 // delimiter_freq: Option<usize>,
46 // fields: Vec<String>,
47 // types: Vec<Type>,
48 // avg_record_len: Option<usize>,
49
50 // sample size to sniff
51 sample_size: Option<SampleSize>,
52 // date format preference
53 // date_preference: Option<DatePreference>,
54}
55impl Sniffer {
56 /// Create a new CSV sniffer.
57 pub fn new() -> Self {
58 Self::default()
59 }
60 /// Specify the delimiter character.
61 pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
62 self.delimiter = Some(delimiter);
63 self
64 }
65 /// Specify the header type (whether the CSV file has a header row, and where the data starts).
66 // pub fn header(&mut self, header: &Header) -> &mut Self {
67 // self.num_preamble_rows = Some(header.num_preamble_rows);
68 // self.has_header_row = Some(header.has_header_row);
69 // self
70 // }
71 /// Specify the quote character (if any), and whether two quotes in a row as to be interpreted
72 /// as an escaped quote.
73 pub fn quote(&mut self, quote: Quote) -> &mut Self {
74 self.quote = Some(quote);
75 self
76 }
77
78 /// The size of the sample to examine while sniffing. If using `SampleSize::Records`, the
79 /// sniffer will use the `Terminator::CRLF` as record separator.
80 ///
81 /// The sample size defaults to `SampleSize::Bytes(4096)`.
82 pub fn sample_size(&mut self, sample_size: SampleSize) -> &mut Self {
83 self.sample_size = Some(sample_size);
84 self
85 }
86
87 fn get_sample_size(&self) -> SampleSize {
88 self.sample_size.unwrap_or(SampleSize::Bytes(1 << 14))
89 }
90
91 // The date format preference when sniffing.
92 //
93 // The date format preference defaults to `DatePreference::MDY`.
94 // pub fn date_preference(&mut self, date_preference: DatePreference) -> &mut Self {
95 // DATE_PREFERENCE.with(|preference| {
96 // *preference.borrow_mut() = date_preference;
97 // });
98 // self.date_preference = Some(date_preference);
99 // self
100 // }
101
102 /// Sniff the CSV file located at the provided path, and return a `Reader` (from the
103 /// [`csv`](https://docs.rs/csv) crate) ready to ready the file.
104 ///
105 /// Fails on file opening or rendering errors, or on an error examining the file.
106 pub fn open_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Reader<File>> {
107 self.open_reader(File::open(path)?)
108 }
109 /// Sniff the CSV file provided by the reader, and return a [`csv`](https://docs.rs/csv)
110 /// `Reader` object.
111 ///
112 /// Fails on file opening or rendering errors, or on an error examining the file.
113 pub fn open_reader<R: Read + Seek>(&mut self, mut reader: R) -> Result<Reader<R>> {
114 let metadata = self.sniff_reader(&mut reader)?;
115 reader.seek(SeekFrom::Start(0))?;
116 metadata.dialect.open_reader(reader)
117 }
118
119 /// Sniff the CSV file located at the provided path, and return a
120 /// [`Metadata`](struct.Metadata.html) object containing information about the CSV file.
121 ///
122 /// Fails on file opening or rendering errors, or on an error examining the file.
123 pub fn sniff_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Metadata> {
124 let file = File::open(path)?;
125 self.sniff_reader(&file)
126 }
127 /// Sniff the CSV file provider by the reader, and return a
128 /// [`Metadata`](struct.Metadata.html) object containing information about the CSV file.
129 ///
130 /// Fails on file opening or readering errors, or on an error examining the file.
131 pub fn sniff_reader<R: Read + Seek>(&mut self, mut reader: R) -> Result<Metadata> {
132 // init IS_UTF8 global var to true
133 IS_UTF8.with(|flag| {
134 *flag.borrow_mut() = true;
135 });
136 // guess quotes & delim
137 self.infer_quotes_delim(&mut reader)?;
138
139 // if we have a delimiter, we just need to search for num_preamble_rows and check for
140 // flexible. Otherwise, we need to guess a delimiter as well.
141 if self.delimiter.is_some() {
142 self.infer_preamble_known_delim(&mut reader)?;
143 } else {
144 self.infer_delim_preamble(&mut reader)?;
145 }
146
147 // self.infer_types(&mut reader)?;
148 self.is_utf8 = Some(IS_UTF8.with(|flag| *flag.borrow()));
149
150 // as this point of the process, we should have all these filled in.
151 // assert!(
152 // self.delimiter.is_some()
153 // && self.num_preamble_rows.is_some()
154 // && self.quote.is_some()
155 // && self.flexible.is_some()
156 // && self.is_utf8.is_some()
157 // && self.delimiter_freq.is_some()
158 // && self.has_header_row.is_some()
159 // && self.avg_record_len.is_some()
160 // && self.delimiter_freq.is_some()
161 // );
162 if !(
163 self.delimiter.is_some()
164 // && self.num_preamble_rows.is_some()
165 && self.quote.is_some()
166 && self.flexible.is_some()
167 && self.is_utf8.is_some()
168 // && self.has_header_row.is_some()
169 // && self.avg_record_len.is_some()
170 ) {
171 return Err(SnifferError::SniffingFailed(format!(
172 "Failed to infer all metadata: {self:?}"
173 )));
174 }
175 // safety: we just checked that all these are Some, so it's safe to unwrap
176 Ok(Metadata {
177 dialect: Dialect {
178 delimiter: self.delimiter.unwrap(),
179 // header: Header {
180 // num_preamble_rows: self.num_preamble_rows.unwrap(),
181 // has_header_row: self.has_header_row.unwrap(),
182 // },
183 quote: self.quote.clone().unwrap(),
184 // flexible: self.flexible.unwrap(),
185 // is_utf8: self.is_utf8.unwrap(),
186 },
187 // avg_record_len: self.avg_record_len.unwrap(),
188 // num_fields: self.delimiter_freq.unwrap() + 1,
189 // fields: self.fields.clone(),
190 // types: self.types.clone(),
191 })
192 }
193
194 // Infers quotes and delimiter from quoted (or possibly quoted) files. If quotes detected,
195 // updates self.quote and self.delimiter. If quotes not detected, updates self.quote to
196 // Quote::None. Only valid quote characters: " (double-quote), ' (single-quote), ` (back-tick).
197 fn infer_quotes_delim<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
198 if let (&Some(_), &Some(_)) = (&self.quote, &self.delimiter) {
199 // nothing left to infer!
200 return Ok(());
201 }
202 let quote_guesses = match self.quote {
203 Some(Quote::Some(chr)) => vec![chr],
204 Some(Quote::None) => {
205 // this function only checks quoted (or possibly quoted) files, nothing left to
206 // do if we know there are no quotes
207 return Ok(());
208 }
209 None => vec![b'\'', b'"', b'`'],
210 };
211 let (quote_chr, (quote_cnt, delim_guess)) = quote_guesses.iter().try_fold(
212 (b'"', (0, b'\0')),
213 |acc, &chr| -> Result<(u8, (usize, u8))> {
214 let mut sample_reader = take_sample_from_start(reader, self.get_sample_size())?;
215 if let Some((cnt, delim_chr)) =
216 quote_count(&mut sample_reader, char::from(chr), self.delimiter)?
217 {
218 Ok(if cnt > acc.1.0 {
219 (chr, (cnt, delim_chr))
220 } else {
221 acc
222 })
223 } else {
224 Ok(acc)
225 }
226 },
227 )?;
228 if quote_cnt == 0 {
229 self.quote = Some(Quote::None);
230 } else {
231 self.quote = Some(Quote::Some(quote_chr));
232 self.delimiter = Some(delim_guess);
233 };
234 Ok(())
235 }
236
237 // Updates delimiter frequency, number of preamble rows, and flexible boolean.
238 fn infer_preamble_known_delim<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
239 // prerequisites for calling this function:
240 if !(self.delimiter.is_some() && self.quote.is_some()) {
241 // instead of assert, return an error
242 // assert!(self.delimiter.is_some() && self.quote.is_some());
243 return Err(SnifferError::SniffingFailed(
244 "infer_preamble_known_delim called without delimiter and quote".into(),
245 ));
246 }
247 // safety: unwraps for delimiter and quote are safe since we just checked above
248 let (quote, delim) = (self.quote.clone().unwrap(), self.delimiter.unwrap());
249
250 let sample_iter = take_sample_from_start(reader, self.get_sample_size())?;
251
252 let mut chain = Chain::default();
253
254 if let Quote::Some(character) = quote {
255 // since we have a quote, we need to run this data through the csv_core::Reader (which
256 // properly escapes quoted fields
257 let mut csv_reader = csvc::ReaderBuilder::new()
258 .delimiter(delim)
259 .quote(character)
260 .build();
261
262 let mut output = vec![];
263 let mut ends = vec![];
264 for line in sample_iter {
265 let line = line?;
266 if line.len() > output.len() {
267 output.resize(line.len(), 0);
268 }
269 if line.len() > ends.len() {
270 ends.resize(line.len(), 0);
271 }
272 let (result, _, _, n_ends) =
273 csv_reader.read_record(line.as_bytes(), &mut output, &mut ends);
274 // check to make sure record was read correctly
275 match result {
276 csvc::ReadRecordResult::OutputFull | csvc::ReadRecordResult::OutputEndsFull => {
277 return Err(SnifferError::SniffingFailed(format!(
278 "failure to read quoted CSV record: {result:?}"
279 )));
280 }
281 _ => {} // non-error results, do nothing
282 }
283 // n_ends is the number of barries between fields, so it's the same as the number
284 // of delimiters
285 chain.add_observation(n_ends);
286 }
287 } else {
288 for line in sample_iter {
289 let line = line?;
290 let freq = bytecount::count(line.as_bytes(), delim);
291 chain.add_observation(freq);
292 }
293 }
294 self.run_chains(vec![chain])
295 }
296
297 // Updates delimiter, delimiter frequency, number of preamble rows, and flexible boolean.
298 fn infer_delim_preamble<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
299 let sample_iter =
300 take_sample_from_start(reader, self.get_sample_size())?.collect::<Result<Vec<_>>>()?;
301
302 let mut chars_frequency: HashMap<u8, HashMap<NumberOfOccurrences, NumberOfLines>> =
303 HashMap::with_capacity(NUM_ASCII_CHARS);
304
305 let mut modes: HashMap<u8, (NumberOfOccurrences, AdjacentFrequency)> =
306 HashMap::with_capacity(NUM_ASCII_CHARS);
307
308 for line in &sample_iter {
309 let mut line_frequency = HashMap::with_capacity(128);
310 for character in line.chars() {
311 let Ok(ascii_char) = u8::try_from(character) else {
312 continue;
313 };
314 if !CANDIDATES.contains(&ascii_char) {
315 continue;
316 }
317 *line_frequency.entry(ascii_char).or_default() += 1;
318 }
319 for (ascii_char, freq) in line_frequency {
320 let char_frequency = chars_frequency.entry(ascii_char).or_default();
321 *char_frequency.entry(freq).or_default() += 1;
322 }
323 }
324 for (&ascii_char, line_count_map) in &chars_frequency {
325 let Some((&mode_value, _)) = line_count_map
326 .iter()
327 .max_by_key(|&(_count, num_lines)| num_lines)
328 else {
329 continue; // skip empty maps, just in case
330 };
331
332 let mut adjusted_count = 0;
333 for delta in 0..=TOLERANCE {
334 for count in [mode_value.saturating_sub(delta), mode_value + delta] {
335 if let Some(&lines) = line_count_map.get(&count) {
336 adjusted_count += lines;
337 }
338 }
339 }
340 if TOLERANCE > 0 {
341 if let Some(&lines) = line_count_map.get(&mode_value) {
342 adjusted_count -= lines;
343 }
344 }
345
346 modes.insert(ascii_char, (mode_value, adjusted_count));
347 }
348 let top_candidates: Vec<u8> = modes
349 .iter()
350 .filter(|(_, (_, score))| *score > 0)
351 .sorted_by_key(|&(_, &(_, score))| std::cmp::Reverse(score)) // needs itertools or just sort
352 .take(6)
353 .map(|(&ch, _)| ch)
354 .collect();
355 dbg!(
356 &top_candidates
357 .iter()
358 .map(|c| char::from(*c))
359 .collect::<Vec<_>>()
360 );
361
362 let mut chains = vec![Chain::default(); NUM_ASCII_CHARS];
363
364 for line in sample_iter {
365 let mut freqs = [0; NUM_ASCII_CHARS];
366 for &chr in line.as_bytes() {
367 if chr < NUM_ASCII_CHARS as u8 {
368 freqs[chr as usize] += 1;
369 }
370 }
371 for &ch in &top_candidates {
372 chains[ch as usize].add_observation(freqs[ch as usize]);
373 }
374 }
375
376 self.run_chains(chains)
377 }
378
379 // Updates delimiter (if not already known), delimiter frequency, number of preamble rows, and
380 // flexible boolean.
381 fn run_chains(&mut self, mut chains: Vec<Chain>) -> Result<()> {
382 // Find the 'best' delimiter: choose strict (non-flexible) delimiters over flexible ones,
383 // and choose the one that had the highest probability markov chain in the end.
384 //
385 // In the case where delim is already known, 'best_delim' will be incorrect (since it won't
386 // correspond with position in a vector of Chains), but we'll just ignore it when
387 // constructing our return value later. 'best_state' and 'path' are necessary, though, to
388 // compute the preamble rows.
389 let (best_delim, _, best_state, _, _) = chains.iter_mut().enumerate().fold(
390 (b',', 0, STATE_UNSTEADY, vec![], 0.0),
391 |acc, (i, ref mut chain)| {
392 let (_, _, best_state, _, best_state_prob) = acc;
393 let ViterbiResults {
394 max_delim_freq,
395 path,
396 } = chain.viterbi();
397 if path.is_empty() {
398 return acc;
399 }
400 let (final_state, final_viter) = path[path.len() - 1];
401 if final_state < best_state
402 || (final_state == best_state && final_viter.prob > best_state_prob)
403 {
404 (i as u8, max_delim_freq, final_state, path, final_viter.prob)
405 } else {
406 acc
407 }
408 },
409 );
410 self.flexible = Some(match best_state {
411 STATE_STEADYSTRICT => false,
412 STATE_STEADYFLEX => true,
413 _ => {
414 return Err(SnifferError::SniffingFailed(
415 "unable to find valid delimiter".to_string(),
416 ));
417 }
418 });
419
420 // Find the number of preamble rows (the number of rows during which the state fluctuated
421 // before getting to the final state).
422 // let mut num_preamble_rows = 0;
423 // since path has an extra state as the beginning, skip one
424 // for &(state, _) in path.iter().skip(2) {
425 // if state == best_state {
426 // break;
427 // }
428 // num_preamble_rows += 1;
429 // }
430 // if num_preamble_rows > 0 {
431 // num_preamble_rows += 1;
432 // }
433 if self.delimiter.is_none() {
434 self.delimiter = Some(best_delim);
435 }
436 // self.num_preamble_rows = Some(num_preamble_rows);
437 Ok(())
438 }
439
440 // fn infer_types<R: Read + Seek>(&mut self, reader: &mut R) -> Result<()> {
441 // // prerequisites for calling this function:
442 // if self.delimiter_freq.is_none() {
443 // // instead of assert, return error
444 // // assert!(self.delimiter_freq.is_some());
445 // return Err(SnifferError::SniffingFailed(
446 // "delimiter frequency not known".to_string(),
447 // ));
448 // }
449 // // safety: unwrap is safe as we just checked that delimiter_freq is Some
450 // let field_count = self.delimiter_freq.unwrap() + 1;
451
452 // let mut csv_reader = self.create_csv_reader(reader)?;
453 // let mut records_iter = csv_reader.byte_records();
454 // let mut n_bytes = 0;
455 // let mut n_records = 0;
456 // let sample_size = self.get_sample_size();
457
458 // // Infer types for the top row. We'll save this set of types to check against the types
459 // // of the remaining rows to see if this is part of the data or a separate header row.
460 // let header_row_types = match records_iter.next() {
461 // Some(record) => {
462 // let byte_record = record?;
463 // let str_record = StringRecord::from_byte_record_lossy(byte_record);
464 // n_records += 1;
465 // n_bytes += count_bytes(&str_record);
466 // infer_record_types(&str_record)
467 // }
468 // None => {
469 // return Err(SnifferError::SniffingFailed(
470 // "CSV empty (after preamble)".into(),
471 // ));
472 // }
473 // };
474 // let mut row_types = vec![TypeGuesses::all(); field_count];
475
476 // for record in records_iter {
477 // let record = record?;
478 // for (i, field) in record.iter().enumerate() {
479 // let str_field = String::from_utf8_lossy(field).to_string();
480 // row_types[i] &= infer_types(&str_field);
481 // }
482 // n_records += 1;
483 // n_bytes += record.as_slice().len();
484 // // break if we pass sample size limits
485 // match sample_size {
486 // SampleSize::Records(recs) => {
487 // if n_records > recs {
488 // break;
489 // }
490 // }
491 // SampleSize::Bytes(bytes) => {
492 // if n_bytes > bytes {
493 // break;
494 // }
495 // }
496 // SampleSize::All => {}
497 // }
498 // }
499 // if n_records == 1 {
500 // // there's only one row in the whole data file (the top row already parsed),
501 // // so we're going to assume it's a data row, not a header row.
502 // self.has_header_row = Some(false);
503 // self.types = get_best_types(&header_row_types);
504 // self.avg_record_len = Some(n_bytes);
505 // return Ok(());
506 // }
507
508 // if header_row_types
509 // .iter()
510 // .zip(&row_types)
511 // .any(|(header, data)| !data.allows(*header))
512 // {
513 // self.has_header_row = Some(true);
514 // // get field names in header
515 // for field in csv_reader.byte_headers()? {
516 // self.fields.push(String::from_utf8_lossy(field).to_string());
517 // }
518 // } else {
519 // self.has_header_row = Some(false);
520 // }
521
522 // self.types = get_best_types(&row_types);
523 // self.avg_record_len = Some(n_bytes / n_records);
524 // Ok(())
525 // }
526
527 // fn create_csv_reader<'a, R: Read + Seek>(
528 // &self,
529 // mut reader: &'a mut R,
530 // ) -> Result<Reader<&'a mut R>> {
531 // reader.seek(SeekFrom::Start(0))?;
532 // if let Some(num_preamble_rows) = self.num_preamble_rows {
533 // snip_preamble(&mut reader, num_preamble_rows)?;
534 // }
535
536 // let mut builder = csv::ReaderBuilder::new();
537 // if let Some(delim) = self.delimiter {
538 // builder.delimiter(delim);
539 // }
540 // if let Some(has_header_row) = self.has_header_row {
541 // builder.has_headers(has_header_row);
542 // }
543 // match self.quote {
544 // Some(Quote::Some(chr)) => {
545 // builder.quoting(true);
546 // builder.quote(chr);
547 // }
548 // Some(Quote::None) => {
549 // builder.quoting(false);
550 // }
551 // _ => {}
552 // }
553 // if let Some(flexible) = self.flexible {
554 // builder.flexible(flexible);
555 // }
556
557 // Ok(builder.from_reader(reader))
558 // }
559}
560
561fn quote_count<R: Read>(
562 sample_iter: &mut SampleIter<R>,
563 character: char,
564 delim: Option<u8>,
565) -> Result<Option<(usize, u8)>> {
566 // Build a regex that matches a quoted CSV cell,
567 // optionally followed by a delimiter.
568 // If delim is None, we try to capture a candidate delimiter.
569 let pattern = delim.map_or_else(
570 || {
571 // When delim is not provided, capture candidate delimiters in a group.
572 format!(
573 r#"(?<delim1>[^\w\n\"ֿ\'])(?: ?)(?:{character}).*?(?:{character})(?<delim2>[^\w\n\"\'])|
574 (?:^|\n)(?:{character}).*?(?:{character})(?<delim3>[^\w\n\"\'])(?: ?)|
575 (?<delim4>[^\w\n\"\'])(?: ?)(?:{character}).*?(?:{character})(?:$|\n)|
576 (?:^|\n)(?:{character}).*?(?:{character})(?:$|\n)"#
577 )
578 },
579 |delim| {
580 // When a delimiter is provided, enforce its presence if it appears.
581 format!(
582 r"{q}(?P<field>(?:[^{q}]|{q}{q})*){q}(?:\s*{d}\s*)?",
583 q = character,
584 d = delim as char
585 )
586 },
587 );
588 // Safety: unwrap is safe here because we control the regex pattern.
589 let re = Regex::new(&pattern).unwrap();
590
591 let mut delim_count_map: HashMap<u8, usize> = HashMap::new();
592 let mut count = 0;
593 for line in sample_iter {
594 let line = line?;
595 // Iterate through all quoted cell matches in the line.
596 for cap in re.captures_iter(&line) {
597 count += 1;
598
599 if let Some(delim) = get_delimiter(&cap) {
600 *delim_count_map.entry(delim).or_insert(0) += 1;
601 }
602 }
603 }
604 if count == 0 {
605 return Ok(None);
606 }
607
608 // If a delimiter was provided, just return it.
609 if let Some(delim) = delim {
610 return Ok(Some((count, delim)));
611 }
612
613 // Otherwise, select the candidate delimiter that was matched most frequently.
614 let (delim_count, delim) =
615 delim_count_map
616 .into_iter()
617 .fold((0, b'\0'), |acc, (delim, d_count)| {
618 if d_count > acc.0 {
619 (d_count, delim)
620 } else {
621 acc
622 }
623 });
624
625 if delim_count == 0 {
626 return Err(SnifferError::SniffingFailed(
627 "invalid regex match: no delimiter found".into(),
628 ));
629 }
630 Ok(Some((count, delim)))
631}
632
633fn get_delimiter(captures: &Captures<'_>) -> Option<u8> {
634 let mut counts: HashMap<char, usize> = HashMap::new();
635 // Check groups delim1 through delim4.
636 for i in 1..=4 {
637 let group_name = format!("delim{i}");
638 if let Some(matched) = captures.name(&group_name) {
639 if let Some(ch) = matched.as_str().chars().next() {
640 *counts.entry(ch).or_insert(0) += 1;
641 }
642 }
643 }
644
645 // If no candidates were found, return None.
646 if counts.is_empty() {
647 return None;
648 }
649
650 // Select the candidate with the highest frequency.
651 let (candidate, _) = counts.into_iter().max_by_key(|&(_, count)| count)?;
652 u8::try_from(candidate).ok()
653}