csv_nose/
sniffer.rs

1//! Main Sniffer builder and sniff methods.
2//!
3//! This module provides the qsv-sniffer compatible API.
4
5use std::fs::File;
6use std::io::{Read, Seek};
7use std::path::Path;
8
9use crate::encoding::{detect_and_transcode, detect_encoding, skip_bom};
10use crate::error::{Result, SnifferError};
11use crate::field_type::Type;
12use crate::metadata::{Dialect, Header, Metadata, Quote};
13use crate::sample::{DatePreference, SampleSize};
14use crate::tum::potential_dialects::{
15    PotentialDialect, detect_line_terminator, generate_dialects_with_terminator,
16};
17use crate::tum::score::{DialectScore, find_best_dialect, score_all_dialects};
18use crate::tum::table::parse_table;
19use crate::tum::type_detection::infer_column_types;
20
21/// CSV dialect sniffer using the Table Uniformity Method.
22///
23/// # Example
24///
25/// ```no_run
26/// use csv_nose::{Sniffer, SampleSize};
27///
28/// let mut sniffer = Sniffer::new();
29/// sniffer.sample_size(SampleSize::Records(100));
30///
31/// let metadata = sniffer.sniff_path("data.csv").unwrap();
32/// println!("Delimiter: {}", metadata.dialect.delimiter as char);
33/// println!("Has header: {}", metadata.dialect.header.has_header_row);
34/// ```
35#[derive(Debug, Clone)]
36pub struct Sniffer {
37    /// Sample size for sniffing.
38    sample_size: SampleSize,
39    /// Date format preference for ambiguous dates.
40    date_preference: DatePreference,
41    /// Optional forced delimiter.
42    forced_delimiter: Option<u8>,
43    /// Optional forced quote character.
44    forced_quote: Option<Quote>,
45}
46
47impl Default for Sniffer {
48    fn default() -> Self {
49        Self::new()
50    }
51}
52
53impl Sniffer {
54    /// Create a new Sniffer with default settings.
55    pub fn new() -> Self {
56        Self {
57            sample_size: SampleSize::Records(100),
58            date_preference: DatePreference::MdyFormat,
59            forced_delimiter: None,
60            forced_quote: None,
61        }
62    }
63
64    /// Set the sample size for sniffing.
65    pub fn sample_size(&mut self, sample_size: SampleSize) -> &mut Self {
66        self.sample_size = sample_size;
67        self
68    }
69
70    /// Set the date preference for ambiguous date parsing.
71    pub fn date_preference(&mut self, date_preference: DatePreference) -> &mut Self {
72        self.date_preference = date_preference;
73        self
74    }
75
76    /// Force a specific delimiter (skip delimiter detection).
77    pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
78        self.forced_delimiter = Some(delimiter);
79        self
80    }
81
82    /// Force a specific quote character.
83    pub fn quote(&mut self, quote: Quote) -> &mut Self {
84        self.forced_quote = Some(quote);
85        self
86    }
87
88    /// Sniff a CSV file at the given path.
89    pub fn sniff_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Metadata> {
90        let file = File::open(path.as_ref())?;
91        let mut reader = std::io::BufReader::new(file);
92        self.sniff_reader(&mut reader)
93    }
94
95    /// Sniff CSV data from a reader.
96    pub fn sniff_reader<R: Read + Seek>(&mut self, reader: R) -> Result<Metadata> {
97        let data = self.read_sample(reader)?;
98
99        if data.is_empty() {
100            return Err(SnifferError::EmptyData);
101        }
102
103        self.sniff_bytes(&data)
104    }
105
106    /// Sniff CSV data from bytes.
107    pub fn sniff_bytes(&self, data: &[u8]) -> Result<Metadata> {
108        if data.is_empty() {
109            return Err(SnifferError::EmptyData);
110        }
111
112        // Detect encoding and transcode to UTF-8 if necessary
113        let (transcoded_data, was_transcoded) = detect_and_transcode(data);
114        let data = &transcoded_data[..];
115
116        // Detect encoding info (for metadata)
117        let encoding_info = detect_encoding(data);
118        let is_utf8 = !was_transcoded || encoding_info.is_utf8;
119
120        // Skip BOM
121        let data = skip_bom(data);
122
123        // Skip comment/preamble lines (lines starting with #)
124        let (comment_preamble_rows, data) = skip_preamble(data);
125
126        // Detect line terminator first to reduce search space
127        let line_terminator = detect_line_terminator(data);
128
129        // Generate potential dialects
130        let dialects = if let Some(delim) = self.forced_delimiter {
131            // If delimiter is forced, only test that delimiter with different quotes
132            let quotes = if let Some(q) = self.forced_quote {
133                vec![q]
134            } else {
135                vec![Quote::Some(b'"'), Quote::Some(b'\''), Quote::None]
136            };
137
138            quotes
139                .into_iter()
140                .map(|q| PotentialDialect::new(delim, q, line_terminator))
141                .collect()
142        } else {
143            generate_dialects_with_terminator(line_terminator)
144        };
145
146        // Determine max rows for scoring
147        let max_rows = match self.sample_size {
148            SampleSize::Records(n) => n,
149            SampleSize::Bytes(_) | SampleSize::All => 0, // Already limited by read_sample
150        };
151
152        // Score all dialects
153        let scores = score_all_dialects(data, &dialects, max_rows);
154
155        // Find the best dialect
156        let best = find_best_dialect(&scores)
157            .ok_or_else(|| SnifferError::NoDialectDetected("No valid dialect found".to_string()))?;
158
159        // Detect structural preamble (rows with inconsistent field counts)
160        let max_rows = match self.sample_size {
161            SampleSize::Records(n) => n,
162            SampleSize::Bytes(_) | SampleSize::All => 0,
163        };
164        let table_for_preamble = parse_table(data, &best.dialect, max_rows);
165        let structural_preamble = detect_structural_preamble(&table_for_preamble);
166
167        // Total preamble = comment rows + structural rows
168        let total_preamble_rows = comment_preamble_rows + structural_preamble;
169
170        // Build metadata from the best dialect
171        // Pass structural_preamble for table row indexing (since comment rows are already skipped from data)
172        // Pass total_preamble_rows for Header metadata (to report true preamble count in original file)
173        self.build_metadata(
174            data,
175            best,
176            is_utf8,
177            structural_preamble,
178            total_preamble_rows,
179        )
180    }
181
182    /// Read a sample of data from the reader based on sample_size settings.
183    fn read_sample<R: Read + Seek>(&self, mut reader: R) -> Result<Vec<u8>> {
184        match self.sample_size {
185            SampleSize::Bytes(n) => {
186                let mut buffer = vec![0u8; n];
187                let bytes_read = reader.read(&mut buffer)?;
188                buffer.truncate(bytes_read);
189                Ok(buffer)
190            }
191            SampleSize::All => {
192                let mut buffer = Vec::new();
193                reader.read_to_end(&mut buffer)?;
194                Ok(buffer)
195            }
196            SampleSize::Records(n) => {
197                // For records, we read enough to capture n records
198                // Estimate ~1KB per record as a starting point, with a minimum
199                let estimated_size = (n * 1024).max(8192);
200                let mut buffer = vec![0u8; estimated_size];
201                let bytes_read = reader.read(&mut buffer)?;
202                buffer.truncate(bytes_read);
203
204                // If we need more data, keep reading
205                if bytes_read == estimated_size {
206                    // Count newlines to see if we have enough records
207                    let newlines = bytecount::count(&buffer, b'\n');
208                    if newlines < n {
209                        // Read more data
210                        let additional = (n - newlines) * 2048;
211                        let mut more = vec![0u8; additional];
212                        let more_read = reader.read(&mut more)?;
213                        more.truncate(more_read);
214                        buffer.extend(more);
215                    }
216                }
217
218                Ok(buffer)
219            }
220        }
221    }
222
223    /// Build Metadata from the best scoring dialect.
224    ///
225    /// # Arguments
226    /// * `structural_preamble` - Number of structural preamble rows in the table (for row indexing)
227    /// * `total_preamble_rows` - Total preamble rows including comments (for Header metadata)
228    fn build_metadata(
229        &self,
230        data: &[u8],
231        score: &DialectScore,
232        is_utf8: bool,
233        structural_preamble: usize,
234        total_preamble_rows: usize,
235    ) -> Result<Metadata> {
236        // Parse the table with the best dialect
237        let max_rows = match self.sample_size {
238            SampleSize::Records(n) => n,
239            _ => 0,
240        };
241
242        let table = parse_table(data, &score.dialect, max_rows);
243
244        if table.is_empty() {
245            return Err(SnifferError::EmptyData);
246        }
247
248        // Create a view of the table without structural preamble
249        // (comment preamble rows are already stripped from data)
250        let effective_table = if structural_preamble > 0 && table.rows.len() > structural_preamble {
251            let mut et = crate::tum::table::Table::new();
252            et.rows = table.rows[structural_preamble..].to_vec();
253            et.field_counts = table.field_counts[structural_preamble..].to_vec();
254            et
255        } else {
256            table.clone()
257        };
258
259        // Detect header on the effective table (pass total_preamble_rows for Header metadata)
260        let header = detect_header(&effective_table, &score.dialect, total_preamble_rows);
261
262        // Get field names from the effective table (first row after structural preamble)
263        let fields = if header.has_header_row && !effective_table.rows.is_empty() {
264            effective_table.rows[0].clone()
265        } else {
266            // Generate field names
267            (0..score.num_fields)
268                .map(|i| format!("field_{}", i + 1))
269                .collect()
270        };
271
272        // Skip header row for type inference if present
273        let data_table = if header.has_header_row && effective_table.rows.len() > 1 {
274            let mut dt = crate::tum::table::Table::new();
275            dt.rows = effective_table.rows[1..].to_vec();
276            dt.field_counts = effective_table.field_counts[1..].to_vec();
277            dt
278        } else {
279            effective_table.clone()
280        };
281
282        // Infer types for each column
283        let types = infer_column_types(&data_table);
284
285        // Build dialect
286        let dialect = Dialect {
287            delimiter: score.dialect.delimiter,
288            header,
289            quote: score.dialect.quote,
290            flexible: !score.is_uniform,
291            is_utf8,
292        };
293
294        // Calculate average record length
295        let avg_record_len = calculate_avg_record_len(data, table.num_rows());
296
297        Ok(Metadata {
298            dialect,
299            avg_record_len,
300            num_fields: score.num_fields,
301            fields,
302            types,
303        })
304    }
305}
306
307/// Detect if the first row (after preamble) is likely a header row.
308fn detect_header(
309    table: &crate::tum::table::Table,
310    _dialect: &PotentialDialect,
311    preamble_rows: usize,
312) -> Header {
313    if table.rows.is_empty() {
314        return Header::new(false, preamble_rows);
315    }
316
317    if table.rows.len() < 2 {
318        // Can't determine header with only one row
319        return Header::new(false, preamble_rows);
320    }
321
322    let first_row = &table.rows[0];
323    let second_row = &table.rows[1];
324
325    // Heuristics for header detection:
326    // 1. First row has different types than subsequent rows
327    // 2. First row values look like labels (text when data is numeric)
328    // 3. First row has no duplicates (header columns should be unique)
329
330    let mut header_score = 0.0;
331    let mut checks = 0;
332
333    // Check 1: First row is all text, second row has typed data
334    let first_types: Vec<Type> = first_row
335        .iter()
336        .map(|s| crate::tum::type_detection::detect_cell_type(s))
337        .collect();
338    let second_types: Vec<Type> = second_row
339        .iter()
340        .map(|s| crate::tum::type_detection::detect_cell_type(s))
341        .collect();
342
343    let first_text_count = first_types.iter().filter(|&&t| t == Type::Text).count();
344    let second_text_count = second_types.iter().filter(|&&t| t == Type::Text).count();
345
346    if first_text_count > second_text_count {
347        header_score += 1.0;
348    }
349    checks += 1;
350
351    // Check 2: First row has more text than numeric
352    let first_numeric_count = first_types.iter().filter(|&&t| t.is_numeric()).count();
353    if first_text_count > first_numeric_count {
354        header_score += 0.5;
355    }
356    checks += 1;
357
358    // Check 3: No duplicates in first row
359    let unique_count = {
360        let mut seen = std::collections::HashSet::new();
361        first_row.iter().filter(|s| seen.insert(s.as_str())).count()
362    };
363    if unique_count == first_row.len() {
364        header_score += 0.5;
365    }
366    checks += 1;
367
368    // Check 4: First row values are shorter (headers tend to be concise)
369    let avg_first_len: f64 = first_row
370        .iter()
371        .map(std::string::String::len)
372        .sum::<usize>() as f64
373        / first_row.len().max(1) as f64;
374    let avg_second_len: f64 = second_row
375        .iter()
376        .map(std::string::String::len)
377        .sum::<usize>() as f64
378        / second_row.len().max(1) as f64;
379
380    if avg_first_len <= avg_second_len {
381        header_score += 0.3;
382    }
383    checks += 1;
384
385    // Threshold for header detection
386    let has_header = (header_score / checks as f64) > 0.4;
387
388    Header::new(has_header, preamble_rows)
389}
390
/// Calculate average record length.
///
/// Returns the byte length of `data` divided by `num_rows` (integer
/// division), or 0 when `num_rows` is 0.
fn calculate_avg_record_len(data: &[u8], num_rows: usize) -> usize {
    data.len().checked_div(num_rows).unwrap_or(0)
}
398
/// Skip preamble/comment lines at the start of data.
///
/// A comment line is a line whose first character after optional leading
/// spaces or tabs is `#`. Consecutive comment lines at the very start are
/// consumed together with their terminator (`\n`, `\r` or `\r\n`).
///
/// Returns the number of skipped lines and the slice that starts at the
/// first non-comment line. That line is returned untouched, including any
/// leading whitespace.
fn skip_preamble(data: &[u8]) -> (usize, &[u8]) {
    let mut rest = data;
    let mut comment_lines = 0;

    loop {
        // Leading blanks are only skipped to look for the '#'.
        let indent = rest
            .iter()
            .take_while(|&&byte| byte == b' ' || byte == b'\t')
            .count();
        if rest.get(indent) != Some(&b'#') {
            // Not a comment line (or end of data): stop here.
            break;
        }

        let line = &rest[indent..];
        let mut consumed = line
            .iter()
            .position(|&byte| byte == b'\n' || byte == b'\r')
            .unwrap_or(line.len());

        // Swallow the terminator: an optional '\r' followed by an optional '\n'.
        if line.get(consumed) == Some(&b'\r') {
            consumed += 1;
        }
        if line.get(consumed) == Some(&b'\n') {
            consumed += 1;
        }

        comment_lines += 1;
        rest = &line[consumed..];
    }

    (comment_lines, rest)
}
440
441/// Detect structural preamble rows using field count consistency analysis.
442///
443/// Identifies rows at the start that don't match the predominant field count
444/// pattern (metadata rows, empty rows, title rows with different structure).
445fn detect_structural_preamble(table: &crate::tum::table::Table) -> usize {
446    if table.field_counts.len() < 3 {
447        return 0;
448    }
449
450    let modal_count = table.modal_field_count();
451
452    // Find first row where remaining data is 80%+ consistent with modal field count
453    for (i, &field_count) in table.field_counts.iter().enumerate() {
454        if field_count == modal_count {
455            let remaining = &table.field_counts[i..];
456            let matching = remaining.iter().filter(|&&fc| fc == modal_count).count();
457            let consistency = matching as f64 / remaining.len() as f64;
458
459            if consistency >= 0.8 {
460                return i;
461            }
462        }
463    }
464
465    0
466}
467
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tum::table::Table;

    /// Sniff `data` with a default-configured sniffer, panicking on failure.
    fn sniff(data: &[u8]) -> Metadata {
        Sniffer::new().sniff_bytes(data).unwrap()
    }

    /// Build a table from literal rows; field counts are the row lengths.
    fn table_of(rows: &[&[&str]]) -> Table {
        let mut table = Table::new();
        table.rows = rows
            .iter()
            .map(|cells| cells.iter().map(|cell| cell.to_string()).collect())
            .collect();
        table.field_counts = table.rows.iter().map(Vec::len).collect();
        table
    }

    #[test]
    fn test_sniffer_builder() {
        let mut sniffer = Sniffer::new();
        sniffer
            .sample_size(SampleSize::Records(50))
            .date_preference(DatePreference::DmyFormat)
            .delimiter(b',');

        assert_eq!(sniffer.sample_size, SampleSize::Records(50));
        assert_eq!(sniffer.date_preference, DatePreference::DmyFormat);
        assert_eq!(sniffer.forced_delimiter, Some(b','));
    }

    #[test]
    fn test_sniff_bytes() {
        let metadata = sniff(b"name,age,city\nAlice,30,NYC\nBob,25,LA\n");

        assert_eq!(metadata.dialect.delimiter, b',');
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.num_fields, 3);
        assert_eq!(metadata.fields, vec!["name", "age", "city"]);
    }

    #[test]
    fn test_sniff_tsv() {
        let metadata = sniff(b"name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA\n");

        assert_eq!(metadata.dialect.delimiter, b'\t');
        assert!(metadata.dialect.header.has_header_row);
    }

    #[test]
    fn test_sniff_semicolon() {
        let metadata = sniff(b"name;age;city\nAlice;30;NYC\nBob;25;LA\n");

        assert_eq!(metadata.dialect.delimiter, b';');
    }

    #[test]
    fn test_sniff_no_header() {
        let metadata = sniff(b"1,2,3\n4,5,6\n7,8,9\n");

        assert_eq!(metadata.dialect.delimiter, b',');
        // Purely numeric rows must not be mistaken for a header.
        assert!(!metadata.dialect.header.has_header_row);
    }

    #[test]
    fn test_sniff_with_quotes() {
        let metadata = sniff(b"\"name\",\"value\"\n\"hello, world\",123\n\"test\",456\n");

        assert_eq!(metadata.dialect.delimiter, b',');
        assert_eq!(metadata.dialect.quote, Quote::Some(b'"'));
    }

    #[test]
    fn test_sniff_empty() {
        let outcome = Sniffer::new().sniff_bytes(b"");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_skip_preamble() {
        // Two comment lines in front of the data.
        let (skipped, rest) =
            skip_preamble(b"# This is a comment\n# Another comment\nname,age\nAlice,30\n");
        assert_eq!(skipped, 2);
        assert_eq!(rest, b"name,age\nAlice,30\n");

        // No comment lines at all.
        let (skipped, rest) = skip_preamble(b"name,age\nAlice,30\n");
        assert_eq!(skipped, 0);
        assert_eq!(rest, b"name,age\nAlice,30\n");

        // Whitespace in front of the '#'.
        let (skipped, rest) = skip_preamble(b"  # Indented comment\nname,age\n");
        assert_eq!(skipped, 1);
        assert_eq!(rest, b"name,age\n");
    }

    #[test]
    fn test_sniff_with_preamble() {
        let metadata = sniff(
            b"# LimeSurvey export\n# Generated 2024-01-01\nname,age,city\nAlice,30,NYC\nBob,25,LA\n",
        );

        assert_eq!(metadata.dialect.delimiter, b',');
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.num_fields, 3);
    }

    #[test]
    fn test_comment_preamble_propagated() {
        let metadata = sniff(b"# Comment 1\n# Comment 2\nname,age\nAlice,30\nBob,25\n");

        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["name", "age"]);
    }

    #[test]
    fn test_structural_preamble_detection() {
        // TITLE row has 1 field, SUBTITLE has 2 fields, data has 5 fields
        let metadata = sniff(b"TITLE\nSUB,TITLE\nA,B,C,D,E\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n");

        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["A", "B", "C", "D", "E"]);
    }

    #[test]
    fn test_mixed_preamble_detection() {
        // One comment line plus one structural row (METADATA has 1 field,
        // the data has 3 fields) => 2 preamble rows in total.
        let metadata = sniff(
            b"# File header\nMETADATA\nname,age,city\nAlice,30,NYC\nBob,25,LA\nCharlie,35,CHI\n",
        );

        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["name", "age", "city"]);
    }

    #[test]
    fn test_no_preamble() {
        let metadata = sniff(b"a,b,c\n1,2,3\n4,5,6\n");
        assert_eq!(metadata.dialect.header.num_preamble_rows, 0);
    }

    #[test]
    fn test_detect_structural_preamble_function() {
        // Two preamble rows with deviating field counts (1 and 2 vs. 3).
        let with_preamble = table_of(&[
            &["TITLE"],
            &["", ""],
            &["A", "B", "C"],
            &["1", "2", "3"],
            &["4", "5", "6"],
        ]);
        assert_eq!(with_preamble.field_counts, vec![1, 2, 3, 3, 3]);
        assert_eq!(detect_structural_preamble(&with_preamble), 2);

        // Uniform field counts: no preamble.
        let uniform = table_of(&[&["A", "B", "C"], &["1", "2", "3"]]);
        assert_eq!(uniform.field_counts, vec![3, 3]);
        assert_eq!(detect_structural_preamble(&uniform), 0);

        // Too few rows to decide anything.
        let tiny = table_of(&[&["A"]]);
        assert_eq!(tiny.field_counts, vec![1]);
        assert_eq!(detect_structural_preamble(&tiny), 0);
    }
}