1use std::borrow::Cow;
6use std::fs::File;
7use std::io::{Read, Seek};
8use std::path::Path;
9
10use crate::encoding::{detect_and_transcode, detect_encoding, skip_bom};
11use crate::error::{Result, SnifferError};
12use crate::field_type::Type;
13use crate::metadata::{Dialect, Header, Metadata, Quote};
14use crate::sample::{DatePreference, SampleSize};
15use crate::tum::potential_dialects::{
16 PotentialDialect, detect_line_terminator, generate_dialects_with_terminator,
17};
18use crate::tum::score::{DialectScore, find_best_dialect, score_all_dialects_with_best_table};
19use crate::tum::table::{Table, parse_table};
20use crate::tum::type_detection::infer_column_types;
21
/// Upper bound, in bytes, on the sample buffered in `SampleSize::Records` mode (100 MiB).
const MAX_RECORDS_BYTES: usize = 100 << 20;
24
25#[derive(Debug, Clone)]
40pub struct Sniffer {
41 sample_size: SampleSize,
43 date_preference: DatePreference,
45 forced_delimiter: Option<u8>,
47 forced_quote: Option<Quote>,
49}
50
51impl Default for Sniffer {
52 fn default() -> Self {
53 Self::new()
54 }
55}
56
57impl Sniffer {
58 pub const fn new() -> Self {
60 Self {
61 sample_size: SampleSize::Records(100),
62 date_preference: DatePreference::MdyFormat,
63 forced_delimiter: None,
64 forced_quote: None,
65 }
66 }
67
68 pub fn sample_size(&mut self, sample_size: SampleSize) -> &mut Self {
70 self.sample_size = sample_size;
71 self
72 }
73
74 pub fn date_preference(&mut self, date_preference: DatePreference) -> &mut Self {
76 self.date_preference = date_preference;
77 self
78 }
79
80 pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
82 self.forced_delimiter = Some(delimiter);
83 self
84 }
85
86 pub fn quote(&mut self, quote: Quote) -> &mut Self {
88 self.forced_quote = Some(quote);
89 self
90 }
91
92 pub fn sniff_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Metadata> {
94 let file = File::open(path.as_ref())?;
95 let mut reader = std::io::BufReader::new(file);
96 self.sniff_reader(&mut reader)
97 }
98
99 pub fn sniff_reader<R: Read + Seek>(&mut self, reader: R) -> Result<Metadata> {
101 let data = self.read_sample(reader)?;
102
103 if data.is_empty() {
104 return Err(SnifferError::EmptyData);
105 }
106
107 self.sniff_bytes(&data)
108 }
109
110 pub fn sniff_bytes(&self, data: &[u8]) -> Result<Metadata> {
112 if data.is_empty() {
113 return Err(SnifferError::EmptyData);
114 }
115
116 let (transcoded_data, was_transcoded) = detect_and_transcode(data);
118 let data = &transcoded_data[..];
119
120 let encoding_info = detect_encoding(data);
122 let is_utf8 = !was_transcoded || encoding_info.is_utf8;
123
124 let data = skip_bom(data);
126
127 let (comment_preamble_rows, data) = skip_preamble(data);
129
130 let line_terminator = detect_line_terminator(data);
132
133 let dialects = self.forced_delimiter.map_or_else(
135 || generate_dialects_with_terminator(line_terminator),
136 |delim| {
137 let quotes = if let Some(q) = self.forced_quote {
139 vec![q]
140 } else {
141 vec![Quote::Some(b'"'), Quote::Some(b'\''), Quote::None]
142 };
143
144 quotes
145 .into_iter()
146 .map(|q| PotentialDialect::new(delim, q, line_terminator))
147 .collect()
148 },
149 );
150 let max_rows = match self.sample_size {
152 SampleSize::Records(n) => n,
153 SampleSize::Bytes(_) | SampleSize::All => 0, };
155
156 let (scores, best_table) = score_all_dialects_with_best_table(data, &dialects, max_rows);
158
159 let best = find_best_dialect(&scores)
161 .ok_or_else(|| SnifferError::NoDialectDetected("No valid dialect found".to_string()))?;
162
163 let table_for_preamble = match best_table {
172 Some((dialect, table)) if dialect == best.dialect => table,
173 _ => parse_table(data, &best.dialect, max_rows),
174 };
175 let structural_preamble = detect_structural_preamble(&table_for_preamble);
176
177 let total_preamble_rows = comment_preamble_rows + structural_preamble;
179
180 self.build_metadata(
184 best,
185 is_utf8,
186 structural_preamble,
187 total_preamble_rows,
188 &table_for_preamble,
189 data,
190 )
191 }
192
193 fn read_sample<R: Read + Seek>(&self, mut reader: R) -> Result<Vec<u8>> {
195 fn fill<R: Read>(reader: &mut R, buf: &mut [u8]) -> std::io::Result<usize> {
200 let mut filled = 0;
201 while filled < buf.len() {
202 match reader.read(&mut buf[filled..]) {
203 Ok(0) => break,
204 Ok(n) => filled += n,
205 Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
206 Err(e) => return Err(e),
207 }
208 }
209 Ok(filled)
210 }
211
212 match self.sample_size {
213 SampleSize::Bytes(n) => {
214 let mut buffer = vec![0u8; n];
215 let bytes_read = fill(&mut reader, &mut buffer)?;
216 buffer.truncate(bytes_read);
217 Ok(buffer)
218 }
219 SampleSize::All => {
220 const MAX_BYTES: u64 = 1024 * 1024 * 1024; let mut buffer = Vec::new();
222 (&mut reader).take(MAX_BYTES).read_to_end(&mut buffer)?;
223 if buffer.len() as u64 == MAX_BYTES {
224 let mut probe = [0u8; 1];
225 if reader.read(&mut probe)? > 0 {
226 eprintln!(
227 "warning: input exceeds 1 GB; sniffing on truncated sample — results may be inaccurate"
228 );
229 }
230 }
231 Ok(buffer)
232 }
233 SampleSize::Records(n) => {
234 let estimated_size = n.saturating_mul(1024).clamp(8192, MAX_RECORDS_BYTES);
237 let mut buffer = vec![0u8; estimated_size];
238 let bytes_read = fill(&mut reader, &mut buffer)?;
239 buffer.truncate(bytes_read);
240
241 if bytes_read == estimated_size {
243 let newlines = bytecount::count(&buffer, b'\n');
245 if newlines < n {
246 let remaining = MAX_RECORDS_BYTES.saturating_sub(buffer.len());
250 let additional = (n - newlines).saturating_mul(2048).min(remaining);
251 let mut more = vec![0u8; additional];
252 let more_read = fill(&mut reader, &mut more)?;
253 more.truncate(more_read);
254 buffer.extend(more);
255 }
256 }
257
258 if buffer.len() >= MAX_RECORDS_BYTES {
259 let mut probe = [0u8; 1];
260 if reader.read(&mut probe)? > 0 {
261 eprintln!(
262 "warning: Records sample capped at 100 MB; \
263 sniff result may be approximate for very large inputs"
264 );
265 }
266 }
267
268 Ok(buffer)
269 }
270 }
271 }
272
273 fn build_metadata(
281 &self,
282 score: &DialectScore,
283 is_utf8: bool,
284 structural_preamble: usize,
285 total_preamble_rows: usize,
286 table: &Table,
287 data: &[u8],
288 ) -> Result<Metadata> {
289 if table.is_empty() {
290 return Err(SnifferError::EmptyData);
291 }
292
293 let effective_table: Cow<'_, Table> =
297 if structural_preamble > 0 && table.rows.len() > structural_preamble {
298 let mut et = Table::new();
299 et.rows = table.rows[structural_preamble..].to_vec();
300 et.field_counts = table.field_counts[structural_preamble..].to_vec();
301 et.update_modal_field_count();
302 Cow::Owned(et)
303 } else {
304 Cow::Borrowed(table)
305 };
306
307 let header = detect_header(&effective_table, total_preamble_rows);
309
310 let fields = if header.has_header_row && !effective_table.rows.is_empty() {
312 effective_table.rows[0].clone()
313 } else {
314 (0..score.num_fields)
316 .map(|i| format!("field_{}", i + 1))
317 .collect()
318 };
319
320 let data_table = if header.has_header_row && effective_table.rows.len() > 1 {
322 let mut dt = crate::tum::table::Table::new();
323 dt.rows = effective_table.rows[1..].to_vec();
324 dt.field_counts = effective_table.field_counts[1..].to_vec();
325 dt.update_modal_field_count();
326 dt
327 } else {
328 effective_table.into_owned()
329 };
330
331 let types = infer_column_types(&data_table);
333
334 let dialect = Dialect {
336 delimiter: score.dialect.delimiter,
337 header,
338 quote: score.dialect.quote,
339 flexible: !score.is_uniform,
340 is_utf8,
341 };
342
343 let avg_record_len = calculate_avg_record_len(data, table.num_rows());
345
346 Ok(Metadata {
347 dialect,
348 avg_record_len,
349 num_fields: score.num_fields,
350 fields,
351 types,
352 })
353 }
354}
355
356fn detect_header(table: &crate::tum::table::Table, preamble_rows: usize) -> Header {
360 if table.rows.is_empty() {
361 return Header::new(false, preamble_rows);
362 }
363
364 if table.rows.len() < 2 {
365 return Header::new(false, preamble_rows);
367 }
368
369 let first_row = &table.rows[0];
370 let second_row = &table.rows[1];
371
372 let mut header_score = 0.0;
378 let mut checks = 0;
379
380 let (first_text_count, first_numeric_count) =
382 first_row.iter().fold((0, 0), |(text, num), s| {
383 let t = crate::tum::type_detection::detect_cell_type(s);
384 (
385 text + usize::from(t == Type::Text),
386 num + usize::from(t.is_numeric()),
387 )
388 });
389
390 let second_text_count = second_row
392 .iter()
393 .filter(|s| crate::tum::type_detection::detect_cell_type(s) == Type::Text)
394 .count();
395
396 if first_text_count > second_text_count {
397 header_score += 1.0;
398 }
399 checks += 1;
400
401 if first_text_count > first_numeric_count {
403 header_score += 0.5;
404 }
405 checks += 1;
406
407 let unique_count = {
409 let mut seen = std::collections::HashSet::new();
410 first_row.iter().filter(|s| seen.insert(s.as_str())).count()
411 };
412 if unique_count == first_row.len() {
413 header_score += 0.5;
414 }
415 checks += 1;
416
417 let avg_first_len: f64 = first_row
419 .iter()
420 .map(std::string::String::len)
421 .sum::<usize>() as f64
422 / first_row.len().max(1) as f64;
423 let avg_second_len: f64 = second_row
424 .iter()
425 .map(std::string::String::len)
426 .sum::<usize>() as f64
427 / second_row.len().max(1) as f64;
428
429 if avg_first_len <= avg_second_len {
430 header_score += 0.3;
431 }
432 checks += 1;
433
434 let has_header = (header_score / checks as f64) > 0.4;
436
437 Header::new(has_header, preamble_rows)
438}
439
/// Estimates the average record length in bytes.
///
/// Measures the span from the start of `data` up to and including the `num_rows`-th `\n`
/// (or the whole of `data` when it contains fewer newlines) and divides it by `num_rows`
/// using integer division. Returns 0 when `data` is empty or `num_rows` is 0.
fn calculate_avg_record_len(data: &[u8], num_rows: usize) -> usize {
    if data.is_empty() || num_rows == 0 {
        return 0;
    }

    // One past the newline that ends the last counted row.
    let sample_end = data
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'\n')
        .nth(num_rows - 1)
        .map_or(data.len(), |(pos, _)| pos + 1);

    sample_end / num_rows
}
474
/// Strips leading comment lines from `data`.
///
/// A comment line is optional leading spaces or tabs followed by `#`; it runs up to and
/// including its terminator (`\n`, `\r` or `\r\n`), or to the end of the input. Scanning
/// stops at the first line that is not a comment, and that line is returned untouched,
/// leading whitespace included.
///
/// Returns the number of comment lines removed and the remaining bytes.
fn skip_preamble(data: &[u8]) -> (usize, &[u8]) {
    let mut comment_lines = 0;
    let mut rest = data;

    loop {
        let indent = rest
            .iter()
            .take_while(|&&byte| byte == b' ' || byte == b'\t')
            .count();
        if rest.get(indent) != Some(&b'#') {
            break;
        }

        let line = &rest[indent..];
        let mut consumed = line
            .iter()
            .position(|&byte| byte == b'\n' || byte == b'\r')
            .unwrap_or(line.len());

        // Swallow the terminator: an optional `\r` followed by an optional `\n`.
        if line.get(consumed) == Some(&b'\r') {
            consumed += 1;
        }
        if line.get(consumed) == Some(&b'\n') {
            consumed += 1;
        }

        comment_lines += 1;
        rest = &line[consumed..];
    }

    (comment_lines, rest)
}
516
517fn detect_structural_preamble(table: &crate::tum::table::Table) -> usize {
522 let n = table.field_counts.len();
523 if n < 3 {
524 return 0;
525 }
526
527 let modal_count = table.modal_field_count();
528
529 let mut matching_suffix = vec![0usize; n];
532 let mut count = 0;
533 for i in (0..n).rev() {
534 if table.field_counts[i] == modal_count {
535 count += 1;
536 }
537 matching_suffix[i] = count;
538 }
539
540 for (i, &field_count) in table.field_counts.iter().enumerate() {
542 if field_count == modal_count {
543 let remaining_len = n - i;
544 let matching = matching_suffix[i];
545 let consistency = matching as f64 / remaining_len as f64;
546
547 if consistency >= 0.8 {
548 return i;
549 }
550 }
551 }
552
553 0
554}
555
#[cfg(test)]
mod tests {
    use super::*;

    /// Sniffs `input` with a default-configured sniffer and unwraps the result.
    fn sniff(input: &[u8]) -> Metadata {
        Sniffer::new().sniff_bytes(input).unwrap()
    }

    /// Builds owned table rows from string-slice rows.
    fn rows_of(rows: &[&[&str]]) -> Vec<Vec<String>> {
        rows.iter()
            .map(|row| row.iter().map(|cell| cell.to_string()).collect())
            .collect()
    }

    // The chained setters store their values on the sniffer.
    #[test]
    fn test_sniffer_builder() {
        let mut sniffer = Sniffer::new();
        sniffer
            .sample_size(SampleSize::Records(50))
            .date_preference(DatePreference::DmyFormat)
            .delimiter(b',');

        assert_eq!(sniffer.sample_size, SampleSize::Records(50));
        assert_eq!(sniffer.date_preference, DatePreference::DmyFormat);
        assert_eq!(sniffer.forced_delimiter, Some(b','));
    }

    #[test]
    fn test_sniff_bytes() {
        let metadata = sniff(b"name,age,city\nAlice,30,NYC\nBob,25,LA\n");

        assert_eq!(metadata.dialect.delimiter, b',');
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.num_fields, 3);
        assert_eq!(metadata.fields, vec!["name", "age", "city"]);
    }

    #[test]
    fn test_sniff_tsv() {
        let metadata = sniff(b"name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA\n");

        assert_eq!(metadata.dialect.delimiter, b'\t');
        assert!(metadata.dialect.header.has_header_row);
    }

    #[test]
    fn test_sniff_semicolon() {
        let metadata = sniff(b"name;age;city\nAlice;30;NYC\nBob;25;LA\n");

        assert_eq!(metadata.dialect.delimiter, b';');
    }

    #[test]
    fn test_sniff_no_header() {
        let metadata = sniff(b"1,2,3\n4,5,6\n7,8,9\n");

        assert_eq!(metadata.dialect.delimiter, b',');
        assert!(!metadata.dialect.header.has_header_row);
    }

    #[test]
    fn test_sniff_with_quotes() {
        let metadata = sniff(b"\"name\",\"value\"\n\"hello, world\",123\n\"test\",456\n");

        assert_eq!(metadata.dialect.delimiter, b',');
        assert_eq!(metadata.dialect.quote, Quote::Some(b'"'));
    }

    #[test]
    fn test_sniff_empty() {
        let outcome = Sniffer::new().sniff_bytes(b"");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_skip_preamble() {
        // Two comment lines ahead of the data.
        let (skipped, rest) =
            skip_preamble(b"# This is a comment\n# Another comment\nname,age\nAlice,30\n");
        assert_eq!(skipped, 2);
        assert_eq!(rest, b"name,age\nAlice,30\n");

        // No comment lines at all.
        let (skipped, rest) = skip_preamble(b"name,age\nAlice,30\n");
        assert_eq!(skipped, 0);
        assert_eq!(rest, b"name,age\nAlice,30\n");

        // Leading whitespace before `#` still counts as a comment line.
        let (skipped, rest) = skip_preamble(b" # Indented comment\nname,age\n");
        assert_eq!(skipped, 1);
        assert_eq!(rest, b"name,age\n");
    }

    #[test]
    fn test_sniff_with_preamble() {
        let metadata = sniff(
            b"# LimeSurvey export\n# Generated 2024-01-01\nname,age,city\nAlice,30,NYC\nBob,25,LA\n",
        );

        assert_eq!(metadata.dialect.delimiter, b',');
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.num_fields, 3);
    }

    #[test]
    fn test_comment_preamble_propagated() {
        let metadata = sniff(b"# Comment 1\n# Comment 2\nname,age\nAlice,30\nBob,25\n");

        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["name", "age"]);
    }

    #[test]
    fn test_structural_preamble_detection() {
        let metadata = sniff(b"TITLE\nSUB,TITLE\nA,B,C,D,E\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n");

        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["A", "B", "C", "D", "E"]);
    }

    #[test]
    fn test_mixed_preamble_detection() {
        // One comment line plus one structural preamble row.
        let metadata = sniff(
            b"# File header\nMETADATA\nname,age,city\nAlice,30,NYC\nBob,25,LA\nCharlie,35,CHI\n",
        );

        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["name", "age", "city"]);
    }

    #[test]
    fn test_no_preamble() {
        let metadata = sniff(b"a,b,c\n1,2,3\n4,5,6\n");

        assert_eq!(metadata.dialect.header.num_preamble_rows, 0);
    }

    #[test]
    fn test_detect_structural_preamble_function() {
        use crate::tum::table::Table;

        // Two narrow rows ahead of three 3-column rows.
        let mut with_preamble = Table::new();
        with_preamble.rows = rows_of(&[
            &["TITLE"],
            &["", ""],
            &["A", "B", "C"],
            &["1", "2", "3"],
            &["4", "5", "6"],
        ]);
        with_preamble.field_counts = vec![1, 2, 3, 3, 3];
        with_preamble.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&with_preamble), 2);

        // Fewer than three rows: detection is skipped.
        let mut two_rows = Table::new();
        two_rows.rows = rows_of(&[&["A", "B", "C"], &["1", "2", "3"]]);
        two_rows.field_counts = vec![3, 3];
        two_rows.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&two_rows), 0);

        // A single row.
        let mut one_row = Table::new();
        one_row.rows = rows_of(&[&["A"]]);
        one_row.field_counts = vec![1];
        one_row.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&one_row), 0);
    }

    #[test]
    fn test_avg_record_len_calculated_from_data() {
        let metadata = sniff(b"a,b\n1,2\n3,4\n");

        assert_eq!(metadata.avg_record_len, 4);
    }

    #[test]
    fn test_avg_record_len_with_quoted_fields() {
        let metadata = sniff(b"\"hello\",\"world\"\n\"foo\",\"bar\"\n");

        assert_eq!(metadata.avg_record_len, 14);
    }

    #[test]
    fn test_records_mode_cap_boundary_ok() {
        // Input slightly larger than the Records cap, so the cap check has more to read.
        let row = b"col1,col2,col3\n1,2,3\n";
        let total = MAX_RECORDS_BYTES + row.len();
        let data: Vec<u8> = row.iter().copied().cycle().take(total).collect();
        assert!(
            data.len() > MAX_RECORDS_BYTES,
            "test data must exceed MAX_RECORDS_BYTES to exercise probe-read path"
        );

        let mut sniffer = Sniffer::new();
        sniffer.sample_size(SampleSize::Records(200_000));
        let result = sniffer.sniff_reader(std::io::Cursor::new(data));
        assert!(
            result.is_ok(),
            "sniff should succeed at cap boundary: {result:?}"
        );
    }
}