Skip to main content

csv_nose/
sniffer.rs

1//! Main Sniffer builder and sniff methods.
2//!
3//! This module provides the qsv-sniffer compatible API.
4
5use std::fs::File;
6use std::io::{Read, Seek};
7use std::path::Path;
8
9use crate::encoding::{detect_and_transcode, detect_encoding, skip_bom};
10use crate::error::{Result, SnifferError};
11use crate::field_type::Type;
12use crate::metadata::{Dialect, Header, Metadata, Quote};
13use crate::sample::{DatePreference, SampleSize};
14use crate::tum::potential_dialects::{
15    PotentialDialect, detect_line_terminator, generate_dialects_with_terminator,
16};
17use crate::tum::score::{DialectScore, find_best_dialect, score_all_dialects_with_best_table};
18use crate::tum::table::{Table, parse_table};
19use crate::tum::type_detection::infer_column_types;
20
21/// CSV dialect sniffer using the Table Uniformity Method.
22///
23/// # Example
24///
25/// ```no_run
26/// use csv_nose::{Sniffer, SampleSize};
27///
28/// let mut sniffer = Sniffer::new();
29/// sniffer.sample_size(SampleSize::Records(100));
30///
31/// let metadata = sniffer.sniff_path("data.csv").unwrap();
32/// println!("Delimiter: {}", metadata.dialect.delimiter as char);
33/// println!("Has header: {}", metadata.dialect.header.has_header_row);
34/// ```
35#[derive(Debug, Clone)]
36pub struct Sniffer {
37    /// Sample size for sniffing.
38    sample_size: SampleSize,
39    /// Date format preference for ambiguous dates.
40    date_preference: DatePreference,
41    /// Optional forced delimiter.
42    forced_delimiter: Option<u8>,
43    /// Optional forced quote character.
44    forced_quote: Option<Quote>,
45}
46
47impl Default for Sniffer {
48    fn default() -> Self {
49        Self::new()
50    }
51}
52
53impl Sniffer {
54    /// Create a new Sniffer with default settings.
55    pub const fn new() -> Self {
56        Self {
57            sample_size: SampleSize::Records(100),
58            date_preference: DatePreference::MdyFormat,
59            forced_delimiter: None,
60            forced_quote: None,
61        }
62    }
63
64    /// Set the sample size for sniffing.
65    pub fn sample_size(&mut self, sample_size: SampleSize) -> &mut Self {
66        self.sample_size = sample_size;
67        self
68    }
69
70    /// Set the date preference for ambiguous date parsing.
71    pub fn date_preference(&mut self, date_preference: DatePreference) -> &mut Self {
72        self.date_preference = date_preference;
73        self
74    }
75
76    /// Force a specific delimiter (skip delimiter detection).
77    pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
78        self.forced_delimiter = Some(delimiter);
79        self
80    }
81
82    /// Force a specific quote character.
83    pub fn quote(&mut self, quote: Quote) -> &mut Self {
84        self.forced_quote = Some(quote);
85        self
86    }
87
88    /// Sniff a CSV file at the given path.
89    pub fn sniff_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Metadata> {
90        let file = File::open(path.as_ref())?;
91        let mut reader = std::io::BufReader::new(file);
92        self.sniff_reader(&mut reader)
93    }
94
95    /// Sniff CSV data from a reader.
96    pub fn sniff_reader<R: Read + Seek>(&mut self, reader: R) -> Result<Metadata> {
97        let data = self.read_sample(reader)?;
98
99        if data.is_empty() {
100            return Err(SnifferError::EmptyData);
101        }
102
103        self.sniff_bytes(&data)
104    }
105
106    /// Sniff CSV data from bytes.
107    pub fn sniff_bytes(&self, data: &[u8]) -> Result<Metadata> {
108        if data.is_empty() {
109            return Err(SnifferError::EmptyData);
110        }
111
112        // Detect encoding and transcode to UTF-8 if necessary
113        let (transcoded_data, was_transcoded) = detect_and_transcode(data);
114        let data = &transcoded_data[..];
115
116        // Detect encoding info (for metadata)
117        let encoding_info = detect_encoding(data);
118        let is_utf8 = !was_transcoded || encoding_info.is_utf8;
119
120        // Skip BOM
121        let data = skip_bom(data);
122
123        // Skip comment/preamble lines (lines starting with #)
124        let (comment_preamble_rows, data) = skip_preamble(data);
125
126        // Detect line terminator first to reduce search space
127        let line_terminator = detect_line_terminator(data);
128
129        // Generate potential dialects
130        let dialects = self.forced_delimiter.map_or_else(
131            || generate_dialects_with_terminator(line_terminator),
132            |delim| {
133                // If delimiter is forced, only test that delimiter with different quotes
134                let quotes = if let Some(q) = self.forced_quote {
135                    vec![q]
136                } else {
137                    vec![Quote::Some(b'"'), Quote::Some(b'\''), Quote::None]
138                };
139
140                quotes
141                    .into_iter()
142                    .map(|q| PotentialDialect::new(delim, q, line_terminator))
143                    .collect()
144            },
145        );
146        // Determine max rows for scoring
147        let max_rows = match self.sample_size {
148            SampleSize::Records(n) => n,
149            SampleSize::Bytes(_) | SampleSize::All => 0, // Already limited by read_sample
150        };
151
152        // Score all dialects and get the best table (avoids re-parsing)
153        let (scores, best_table) = score_all_dialects_with_best_table(data, &dialects, max_rows);
154
155        // Find the best dialect
156        let best = find_best_dialect(&scores)
157            .ok_or_else(|| SnifferError::NoDialectDetected("No valid dialect found".to_string()))?;
158
159        // Detect structural preamble using the already-parsed table
160        let table_for_preamble =
161            best_table.unwrap_or_else(|| parse_table(data, &best.dialect, max_rows));
162        let structural_preamble = detect_structural_preamble(&table_for_preamble);
163
164        // Total preamble = comment rows + structural rows
165        let total_preamble_rows = comment_preamble_rows + structural_preamble;
166
167        // Build metadata from the best dialect, reusing the already-parsed table
168        // Pass structural_preamble for table row indexing (since comment rows are already skipped from data)
169        // Pass total_preamble_rows for Header metadata (to report true preamble count in original file)
170        self.build_metadata(
171            best,
172            is_utf8,
173            structural_preamble,
174            total_preamble_rows,
175            &table_for_preamble,
176            data,
177        )
178    }
179
180    /// Read a sample of data from the reader based on `sample_size` settings.
181    fn read_sample<R: Read + Seek>(&self, mut reader: R) -> Result<Vec<u8>> {
182        match self.sample_size {
183            SampleSize::Bytes(n) => {
184                let mut buffer = vec![0u8; n];
185                let bytes_read = reader.read(&mut buffer)?;
186                buffer.truncate(bytes_read);
187                Ok(buffer)
188            }
189            SampleSize::All => {
190                let mut buffer = Vec::new();
191                reader.read_to_end(&mut buffer)?;
192                Ok(buffer)
193            }
194            SampleSize::Records(n) => {
195                // For records, we read enough to capture n records
196                // Estimate ~1KB per record as a starting point, with a minimum
197                let estimated_size = (n * 1024).max(8192);
198                let mut buffer = vec![0u8; estimated_size];
199                let bytes_read = reader.read(&mut buffer)?;
200                buffer.truncate(bytes_read);
201
202                // If we need more data, keep reading
203                if bytes_read == estimated_size {
204                    // Count newlines to see if we have enough records
205                    let newlines = bytecount::count(&buffer, b'\n');
206                    if newlines < n {
207                        // Read more data
208                        let additional = (n - newlines) * 2048;
209                        let mut more = vec![0u8; additional];
210                        let more_read = reader.read(&mut more)?;
211                        more.truncate(more_read);
212                        buffer.extend(more);
213                    }
214                }
215
216                Ok(buffer)
217            }
218        }
219    }
220
221    /// Build Metadata from the best scoring dialect.
222    ///
223    /// # Arguments
224    /// * `structural_preamble` - Number of structural preamble rows in the table (for row indexing)
225    /// * `total_preamble_rows` - Total preamble rows including comments (for Header metadata)
226    /// * `table` - Pre-parsed table to avoid redundant parsing
227    /// * `data` - Raw data bytes for accurate avg_record_len calculation
228    fn build_metadata(
229        &self,
230        score: &DialectScore,
231        is_utf8: bool,
232        structural_preamble: usize,
233        total_preamble_rows: usize,
234        table: &Table,
235        data: &[u8],
236    ) -> Result<Metadata> {
237        if table.is_empty() {
238            return Err(SnifferError::EmptyData);
239        }
240
241        // Create a view of the table without structural preamble
242        // (comment preamble rows are already stripped from data)
243        let effective_table = if structural_preamble > 0 && table.rows.len() > structural_preamble {
244            let mut et = crate::tum::table::Table::new();
245            et.rows = table.rows[structural_preamble..].to_vec();
246            et.field_counts = table.field_counts[structural_preamble..].to_vec();
247            et.update_modal_field_count();
248            et
249        } else {
250            table.clone()
251        };
252
253        // Detect header on the effective table (pass total_preamble_rows for Header metadata)
254        let header = detect_header(&effective_table, &score.dialect, total_preamble_rows);
255
256        // Get field names from the effective table (first row after structural preamble)
257        let fields = if header.has_header_row && !effective_table.rows.is_empty() {
258            effective_table.rows[0].clone()
259        } else {
260            // Generate field names
261            (0..score.num_fields)
262                .map(|i| format!("field_{}", i + 1))
263                .collect()
264        };
265
266        // Skip header row for type inference if present
267        let data_table = if header.has_header_row && effective_table.rows.len() > 1 {
268            let mut dt = crate::tum::table::Table::new();
269            dt.rows = effective_table.rows[1..].to_vec();
270            dt.field_counts = effective_table.field_counts[1..].to_vec();
271            dt.update_modal_field_count();
272            dt
273        } else {
274            effective_table
275        };
276
277        // Infer types for each column
278        let types = infer_column_types(&data_table);
279
280        // Build dialect
281        let dialect = Dialect {
282            delimiter: score.dialect.delimiter,
283            header,
284            quote: score.dialect.quote,
285            flexible: !score.is_uniform,
286            is_utf8,
287        };
288
289        // Calculate average record length from the raw data
290        let avg_record_len = calculate_avg_record_len(data, table.num_rows());
291
292        Ok(Metadata {
293            dialect,
294            avg_record_len,
295            num_fields: score.num_fields,
296            fields,
297            types,
298        })
299    }
300}
301
302/// Detect if the first row (after preamble) is likely a header row.
303///
304/// Optimized: Computes type counts in a single pass without allocating Vecs.
305fn detect_header(
306    table: &crate::tum::table::Table,
307    _dialect: &PotentialDialect,
308    preamble_rows: usize,
309) -> Header {
310    if table.rows.is_empty() {
311        return Header::new(false, preamble_rows);
312    }
313
314    if table.rows.len() < 2 {
315        // Can't determine header with only one row
316        return Header::new(false, preamble_rows);
317    }
318
319    let first_row = &table.rows[0];
320    let second_row = &table.rows[1];
321
322    // Heuristics for header detection:
323    // 1. First row has different types than subsequent rows
324    // 2. First row values look like labels (text when data is numeric)
325    // 3. First row has no duplicates (header columns should be unique)
326
327    let mut header_score = 0.0;
328    let mut checks = 0;
329
330    // Check 1 & 2: Count types in a single pass for first row
331    let (first_text_count, first_numeric_count) =
332        first_row.iter().fold((0, 0), |(text, num), s| {
333            let t = crate::tum::type_detection::detect_cell_type(s);
334            (
335                text + usize::from(t == Type::Text),
336                num + usize::from(t.is_numeric()),
337            )
338        });
339
340    // Count types in a single pass for second row
341    let second_text_count = second_row
342        .iter()
343        .filter(|s| crate::tum::type_detection::detect_cell_type(s) == Type::Text)
344        .count();
345
346    if first_text_count > second_text_count {
347        header_score += 1.0;
348    }
349    checks += 1;
350
351    // Check 2: First row has more text than numeric
352    if first_text_count > first_numeric_count {
353        header_score += 0.5;
354    }
355    checks += 1;
356
357    // Check 3: No duplicates in first row
358    let unique_count = {
359        let mut seen = std::collections::HashSet::new();
360        first_row.iter().filter(|s| seen.insert(s.as_str())).count()
361    };
362    if unique_count == first_row.len() {
363        header_score += 0.5;
364    }
365    checks += 1;
366
367    // Check 4: First row values are shorter (headers tend to be concise)
368    let avg_first_len: f64 = first_row
369        .iter()
370        .map(std::string::String::len)
371        .sum::<usize>() as f64
372        / first_row.len().max(1) as f64;
373    let avg_second_len: f64 = second_row
374        .iter()
375        .map(std::string::String::len)
376        .sum::<usize>() as f64
377        / second_row.len().max(1) as f64;
378
379    if avg_first_len <= avg_second_len {
380        header_score += 0.3;
381    }
382    checks += 1;
383
384    // Threshold for header detection
385    let has_header = (header_score / checks as f64) > 0.4;
386
387    Header::new(has_header, preamble_rows)
388}
389
/// Average number of raw bytes per record over the first `num_rows` rows.
///
/// Rows are delimited by `\n` bytes (which also terminates `\r\n` lines), so
/// quote characters and line terminators are included in the measurement.
/// When `data` holds fewer than `num_rows` newlines, the whole buffer is
/// measured instead. Returns 0 when `num_rows` is 0 or `data` is empty.
fn calculate_avg_record_len(data: &[u8], num_rows: usize) -> usize {
    if num_rows == 0 || data.is_empty() {
        return 0;
    }

    // One past the newline that closes row `num_rows`, or the full length
    // when the buffer does not contain that many newlines.
    let measured_len = data
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'\n')
        .nth(num_rows - 1)
        .map_or(data.len(), |(newline_idx, _)| newline_idx + 1);

    measured_len / num_rows
}
424
/// Strip leading `#` comment lines from `data`.
///
/// A line counts as a comment when its first byte after optional spaces and
/// tabs is `#`. Scanning stops at the first line that is not a comment.
/// Returns the number of comment lines removed and the remaining bytes; a
/// non-comment line keeps its own leading whitespace.
fn skip_preamble(data: &[u8]) -> (usize, &[u8]) {
    let mut remaining = data;
    let mut comment_lines = 0;

    loop {
        // Width of the indentation in front of a potential `#`.
        let indent = remaining
            .iter()
            .take_while(|&&byte| byte == b' ' || byte == b'\t')
            .count();

        if remaining.get(indent) != Some(&b'#') {
            // Not a comment line (or end of input): stop here.
            break;
        }

        // Run to the end of the comment text.
        let comment = &remaining[indent..];
        let text_len = comment
            .iter()
            .position(|&byte| byte == b'\n' || byte == b'\r')
            .unwrap_or(comment.len());

        // Consume the terminator: `\r`, `\n`, or `\r\n`.
        let mut after = &comment[text_len..];
        if let [b'\r', tail @ ..] = after {
            after = tail;
        }
        if let [b'\n', tail @ ..] = after {
            after = tail;
        }

        comment_lines += 1;
        remaining = after;
    }

    (comment_lines, remaining)
}
466
467/// Detect structural preamble rows using field count consistency analysis.
468///
469/// Identifies rows at the start that don't match the predominant field count
470/// pattern (metadata rows, empty rows, title rows with different structure).
471fn detect_structural_preamble(table: &crate::tum::table::Table) -> usize {
472    let n = table.field_counts.len();
473    if n < 3 {
474        return 0;
475    }
476
477    let modal_count = table.modal_field_count();
478
479    // Pre-compute suffix counts: for each position i, how many rows from i to end match modal_count
480    // This converts O(n²) scanning to O(n) preprocessing + O(1) lookups
481    let mut matching_suffix = vec![0usize; n];
482    let mut count = 0;
483    for i in (0..n).rev() {
484        if table.field_counts[i] == modal_count {
485            count += 1;
486        }
487        matching_suffix[i] = count;
488    }
489
490    // Find first row where remaining data is 80%+ consistent with modal field count
491    for (i, &field_count) in table.field_counts.iter().enumerate() {
492        if field_count == modal_count {
493            let remaining_len = n - i;
494            let matching = matching_suffix[i];
495            let consistency = matching as f64 / remaining_len as f64;
496
497            if consistency >= 0.8 {
498                return i;
499            }
500        }
501    }
502
503    0
504}
505
#[cfg(test)]
mod tests {
    use super::*;

    /// Sniff `input` with a default-configured sniffer, panicking on failure.
    fn sniff(input: &[u8]) -> Metadata {
        Sniffer::new().sniff_bytes(input).unwrap()
    }

    /// Build an owned row from string literals.
    fn row(cells: &[&str]) -> Vec<String> {
        cells.iter().map(|cell| cell.to_string()).collect()
    }

    #[test]
    fn test_sniffer_builder() {
        let mut configured = Sniffer::new();
        configured
            .sample_size(SampleSize::Records(50))
            .date_preference(DatePreference::DmyFormat)
            .delimiter(b',');

        assert_eq!(configured.sample_size, SampleSize::Records(50));
        assert_eq!(configured.date_preference, DatePreference::DmyFormat);
        assert_eq!(configured.forced_delimiter, Some(b','));
    }

    #[test]
    fn test_sniff_bytes() {
        let result = sniff(b"name,age,city\nAlice,30,NYC\nBob,25,LA\n");

        assert_eq!(result.dialect.delimiter, b',');
        assert!(result.dialect.header.has_header_row);
        assert_eq!(result.num_fields, 3);
        assert_eq!(result.fields, vec!["name", "age", "city"]);
    }

    #[test]
    fn test_sniff_tsv() {
        let result = sniff(b"name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA\n");

        assert_eq!(result.dialect.delimiter, b'\t');
        assert!(result.dialect.header.has_header_row);
    }

    #[test]
    fn test_sniff_semicolon() {
        let result = sniff(b"name;age;city\nAlice;30;NYC\nBob;25;LA\n");

        assert_eq!(result.dialect.delimiter, b';');
    }

    #[test]
    fn test_sniff_no_header() {
        let result = sniff(b"1,2,3\n4,5,6\n7,8,9\n");

        assert_eq!(result.dialect.delimiter, b',');
        // Purely numeric rows must not be mistaken for a header.
        assert!(!result.dialect.header.has_header_row);
    }

    #[test]
    fn test_sniff_with_quotes() {
        let result = sniff(b"\"name\",\"value\"\n\"hello, world\",123\n\"test\",456\n");

        assert_eq!(result.dialect.delimiter, b',');
        assert_eq!(result.dialect.quote, Quote::Some(b'"'));
    }

    #[test]
    fn test_sniff_empty() {
        let outcome = Sniffer::new().sniff_bytes(b"");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_skip_preamble() {
        // Two comment lines in front of the data.
        let (skipped, rest) =
            skip_preamble(b"# This is a comment\n# Another comment\nname,age\nAlice,30\n");
        assert_eq!(skipped, 2);
        assert_eq!(rest, b"name,age\nAlice,30\n");

        // No comment lines at all.
        let (skipped, rest) = skip_preamble(b"name,age\nAlice,30\n");
        assert_eq!(skipped, 0);
        assert_eq!(rest, b"name,age\nAlice,30\n");

        // Whitespace in front of the `#`.
        let (skipped, rest) = skip_preamble(b"  # Indented comment\nname,age\n");
        assert_eq!(skipped, 1);
        assert_eq!(rest, b"name,age\n");
    }

    #[test]
    fn test_sniff_with_preamble() {
        let result = sniff(
            b"# LimeSurvey export\n# Generated 2024-01-01\nname,age,city\nAlice,30,NYC\nBob,25,LA\n",
        );

        assert_eq!(result.dialect.delimiter, b',');
        assert!(result.dialect.header.has_header_row);
        assert_eq!(result.num_fields, 3);
    }

    #[test]
    fn test_comment_preamble_propagated() {
        let result = sniff(b"# Comment 1\n# Comment 2\nname,age\nAlice,30\nBob,25\n");

        assert_eq!(result.dialect.header.num_preamble_rows, 2);
        assert!(result.dialect.header.has_header_row);
        assert_eq!(result.fields, vec!["name", "age"]);
    }

    #[test]
    fn test_structural_preamble_detection() {
        // TITLE has 1 field, SUB,TITLE has 2 fields, the data rows have 5.
        let result = sniff(b"TITLE\nSUB,TITLE\nA,B,C,D,E\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n");

        assert_eq!(result.dialect.header.num_preamble_rows, 2);
        assert!(result.dialect.header.has_header_row);
        assert_eq!(result.fields, vec!["A", "B", "C", "D", "E"]);
    }

    #[test]
    fn test_mixed_preamble_detection() {
        // One comment line plus one structural row (METADATA, 1 field) in
        // front of 3-field data.
        let result = sniff(
            b"# File header\nMETADATA\nname,age,city\nAlice,30,NYC\nBob,25,LA\nCharlie,35,CHI\n",
        );

        // 1 comment + 1 structural = 2 total
        assert_eq!(result.dialect.header.num_preamble_rows, 2);
        assert!(result.dialect.header.has_header_row);
        assert_eq!(result.fields, vec!["name", "age", "city"]);
    }

    #[test]
    fn test_no_preamble() {
        let result = sniff(b"a,b,c\n1,2,3\n4,5,6\n");
        assert_eq!(result.dialect.header.num_preamble_rows, 0);
    }

    #[test]
    fn test_detect_structural_preamble_function() {
        use crate::tum::table::Table;

        // Two leading rows whose field counts differ from the data rows.
        let mut with_preamble = Table::new();
        with_preamble.rows = vec![
            row(&["TITLE"]),
            row(&["", ""]),
            row(&["A", "B", "C"]),
            row(&["1", "2", "3"]),
            row(&["4", "5", "6"]),
        ];
        with_preamble.field_counts = vec![1, 2, 3, 3, 3];
        with_preamble.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&with_preamble), 2);

        // Uniform field counts: nothing to strip.
        let mut uniform = Table::new();
        uniform.rows = vec![row(&["A", "B", "C"]), row(&["1", "2", "3"])];
        uniform.field_counts = vec![3, 3];
        uniform.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&uniform), 0);

        // Too few rows to decide anything.
        let mut tiny = Table::new();
        tiny.rows = vec![row(&["A"])];
        tiny.field_counts = vec![1];
        tiny.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&tiny), 0);
    }

    #[test]
    fn test_avg_record_len_calculated_from_data() {
        // avg_record_len is measured on raw bytes, not on parsed content.
        let result = sniff(b"a,b\n1,2\n3,4\n");

        // Three rows of 4 bytes each ("a,b\n", "1,2\n", "3,4\n"): 12 / 3 = 4.
        assert_eq!(result.avg_record_len, 4);
    }

    #[test]
    fn test_avg_record_len_with_quoted_fields() {
        let result = sniff(b"\"hello\",\"world\"\n\"foo\",\"bar\"\n");

        // Raw: 16 + 12 = 28 bytes for 2 rows = 14 bytes avg
        assert_eq!(result.avg_record_len, 14);
    }
}