1use std::borrow::Cow;
6use std::fs::File;
7use std::io::{Read, Seek};
8use std::path::Path;
9
10use crate::encoding::{detect_and_transcode, detect_encoding, skip_bom};
11use crate::error::{Result, SnifferError};
12use crate::field_type::Type;
13use crate::metadata::{Dialect, Header, Metadata, Quote};
14use crate::sample::{DatePreference, SampleSize};
15use crate::tum::potential_dialects::{
16 PotentialDialect, detect_line_terminator, generate_dialects_with_terminator,
17};
18use crate::tum::score::{DialectScore, find_best_dialect, score_all_dialects_with_best_table};
19use crate::tum::table::{Table, parse_table};
20use crate::tum::type_detection::infer_column_types;
21
/// Byte budget (100 MiB) used when sampling by record count: it bounds both
/// the initial size estimate and the size of the follow-up read.
const MAX_RECORDS_BYTES: usize = 100 * (1 << 20);
24
/// Configurable CSV sniffer.
///
/// Holds the sampling strategy plus optional caller-supplied overrides; the
/// `sniff_*` methods use this configuration to produce a `Metadata` value.
#[derive(Debug, Clone)]
pub struct Sniffer {
    /// How much of the input is sampled (record count, byte count, or all).
    sample_size: SampleSize,
    /// Preferred date interpretation.
    /// NOTE(review): stored by the builder but not read by any code visible
    /// in this file — confirm where it is consumed.
    date_preference: DatePreference,
    /// Delimiter byte supplied through `delimiter`, if any.
    forced_delimiter: Option<u8>,
    /// Quote setting supplied through `quote`, if any.
    forced_quote: Option<Quote>,
}
50
/// `Default` simply delegates to [`Sniffer::new`], so both construction paths
/// yield the same configuration.
impl Default for Sniffer {
    fn default() -> Self {
        Self::new()
    }
}
56
57impl Sniffer {
    /// Creates a sniffer with the default configuration: a 100-record sample,
    /// `DatePreference::MdyFormat`, and no forced delimiter or quote.
    pub const fn new() -> Self {
        Self {
            sample_size: SampleSize::Records(100),
            date_preference: DatePreference::MdyFormat,
            forced_delimiter: None,
            forced_quote: None,
        }
    }
67
    /// Sets the sampling strategy and returns `self` so calls can be chained.
    pub fn sample_size(&mut self, sample_size: SampleSize) -> &mut Self {
        self.sample_size = sample_size;
        self
    }
73
    /// Sets the date-format preference and returns `self` so calls can be chained.
    pub fn date_preference(&mut self, date_preference: DatePreference) -> &mut Self {
        self.date_preference = date_preference;
        self
    }
79
    /// Forces the delimiter byte used when building candidate dialects and
    /// returns `self` so calls can be chained.
    pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
        self.forced_delimiter = Some(delimiter);
        self
    }
85
    /// Records a caller-chosen quote setting and returns `self` so calls can
    /// be chained.
    pub fn quote(&mut self, quote: Quote) -> &mut Self {
        self.forced_quote = Some(quote);
        self
    }
91
92 pub fn sniff_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Metadata> {
94 let file = File::open(path.as_ref())?;
95 let mut reader = std::io::BufReader::new(file);
96 self.sniff_reader(&mut reader)
97 }
98
99 pub fn sniff_reader<R: Read + Seek>(&mut self, reader: R) -> Result<Metadata> {
101 let data = self.read_sample(reader)?;
102
103 if data.is_empty() {
104 return Err(SnifferError::EmptyData);
105 }
106
107 self.sniff_bytes(&data)
108 }
109
110 pub fn sniff_bytes(&self, data: &[u8]) -> Result<Metadata> {
112 if data.is_empty() {
113 return Err(SnifferError::EmptyData);
114 }
115
116 let (transcoded_data, was_transcoded) = detect_and_transcode(data);
118 let data = &transcoded_data[..];
119
120 let encoding_info = detect_encoding(data);
122 let is_utf8 = !was_transcoded || encoding_info.is_utf8;
123
124 let data = skip_bom(data);
126
127 let (comment_preamble_rows, data) = skip_preamble(data);
129
130 let line_terminator = detect_line_terminator(data);
132
133 let dialects = self.forced_delimiter.map_or_else(
135 || generate_dialects_with_terminator(line_terminator),
136 |delim| {
137 let quotes = if let Some(q) = self.forced_quote {
139 vec![q]
140 } else {
141 vec![Quote::Some(b'"'), Quote::Some(b'\''), Quote::None]
142 };
143
144 quotes
145 .into_iter()
146 .map(|q| PotentialDialect::new(delim, q, line_terminator))
147 .collect()
148 },
149 );
150 let max_rows = match self.sample_size {
152 SampleSize::Records(n) => n,
153 SampleSize::Bytes(_) | SampleSize::All => 0, };
155
156 let (scores, best_table) = score_all_dialects_with_best_table(data, &dialects, max_rows);
158
159 let best = find_best_dialect(&scores)
161 .ok_or_else(|| SnifferError::NoDialectDetected("No valid dialect found".to_string()))?;
162
163 let table_for_preamble =
165 best_table.unwrap_or_else(|| parse_table(data, &best.dialect, max_rows));
166 let structural_preamble = detect_structural_preamble(&table_for_preamble);
167
168 let total_preamble_rows = comment_preamble_rows + structural_preamble;
170
171 self.build_metadata(
175 best,
176 is_utf8,
177 structural_preamble,
178 total_preamble_rows,
179 &table_for_preamble,
180 data,
181 )
182 }
183
184 fn read_sample<R: Read + Seek>(&self, mut reader: R) -> Result<Vec<u8>> {
186 match self.sample_size {
187 SampleSize::Bytes(n) => {
188 let mut buffer = vec![0u8; n];
189 let bytes_read = reader.read(&mut buffer)?;
190 buffer.truncate(bytes_read);
191 Ok(buffer)
192 }
193 SampleSize::All => {
194 const MAX_BYTES: u64 = 1024 * 1024 * 1024; let mut buffer = Vec::new();
196 (&mut reader).take(MAX_BYTES).read_to_end(&mut buffer)?;
197 if buffer.len() as u64 == MAX_BYTES {
198 let mut probe = [0u8; 1];
199 if reader.read(&mut probe)? > 0 {
200 eprintln!(
201 "warning: input exceeds 1 GB; sniffing on truncated sample — results may be inaccurate"
202 );
203 }
204 }
205 Ok(buffer)
206 }
207 SampleSize::Records(n) => {
208 let estimated_size = n.saturating_mul(1024).clamp(8192, MAX_RECORDS_BYTES);
211 let mut buffer = vec![0u8; estimated_size];
212 let bytes_read = reader.read(&mut buffer)?;
213 buffer.truncate(bytes_read);
214
215 if bytes_read == estimated_size {
217 let newlines = bytecount::count(&buffer, b'\n');
219 if newlines < n {
220 let additional = (n - newlines).saturating_mul(2048).min(MAX_RECORDS_BYTES);
222 let mut more = vec![0u8; additional];
223 let more_read = reader.read(&mut more)?;
224 more.truncate(more_read);
225 buffer.extend(more);
226 }
227 }
228
229 if buffer.len() >= MAX_RECORDS_BYTES {
230 let mut probe = [0u8; 1];
231 if reader.read(&mut probe)? > 0 {
232 eprintln!(
233 "warning: Records sample capped at 100 MB; \
234 sniff result may be approximate for very large inputs"
235 );
236 }
237 }
238
239 Ok(buffer)
240 }
241 }
242 }
243
244 fn build_metadata(
252 &self,
253 score: &DialectScore,
254 is_utf8: bool,
255 structural_preamble: usize,
256 total_preamble_rows: usize,
257 table: &Table,
258 data: &[u8],
259 ) -> Result<Metadata> {
260 if table.is_empty() {
261 return Err(SnifferError::EmptyData);
262 }
263
264 let effective_table: Cow<'_, Table> =
268 if structural_preamble > 0 && table.rows.len() > structural_preamble {
269 let mut et = Table::new();
270 et.rows = table.rows[structural_preamble..].to_vec();
271 et.field_counts = table.field_counts[structural_preamble..].to_vec();
272 et.update_modal_field_count();
273 Cow::Owned(et)
274 } else {
275 Cow::Borrowed(table)
276 };
277
278 let header = detect_header(&effective_table, &score.dialect, total_preamble_rows);
280
281 let fields = if header.has_header_row && !effective_table.rows.is_empty() {
283 effective_table.rows[0].clone()
284 } else {
285 (0..score.num_fields)
287 .map(|i| format!("field_{}", i + 1))
288 .collect()
289 };
290
291 let data_table = if header.has_header_row && effective_table.rows.len() > 1 {
293 let mut dt = crate::tum::table::Table::new();
294 dt.rows = effective_table.rows[1..].to_vec();
295 dt.field_counts = effective_table.field_counts[1..].to_vec();
296 dt.update_modal_field_count();
297 dt
298 } else {
299 effective_table.into_owned()
300 };
301
302 let types = infer_column_types(&data_table);
304
305 let dialect = Dialect {
307 delimiter: score.dialect.delimiter,
308 header,
309 quote: score.dialect.quote,
310 flexible: !score.is_uniform,
311 is_utf8,
312 };
313
314 let avg_record_len = calculate_avg_record_len(data, table.num_rows());
316
317 Ok(Metadata {
318 dialect,
319 avg_record_len,
320 num_fields: score.num_fields,
321 fields,
322 types,
323 })
324 }
325}
326
327fn detect_header(
331 table: &crate::tum::table::Table,
332 _dialect: &PotentialDialect,
333 preamble_rows: usize,
334) -> Header {
335 if table.rows.is_empty() {
336 return Header::new(false, preamble_rows);
337 }
338
339 if table.rows.len() < 2 {
340 return Header::new(false, preamble_rows);
342 }
343
344 let first_row = &table.rows[0];
345 let second_row = &table.rows[1];
346
347 let mut header_score = 0.0;
353 let mut checks = 0;
354
355 let (first_text_count, first_numeric_count) =
357 first_row.iter().fold((0, 0), |(text, num), s| {
358 let t = crate::tum::type_detection::detect_cell_type(s);
359 (
360 text + usize::from(t == Type::Text),
361 num + usize::from(t.is_numeric()),
362 )
363 });
364
365 let second_text_count = second_row
367 .iter()
368 .filter(|s| crate::tum::type_detection::detect_cell_type(s) == Type::Text)
369 .count();
370
371 if first_text_count > second_text_count {
372 header_score += 1.0;
373 }
374 checks += 1;
375
376 if first_text_count > first_numeric_count {
378 header_score += 0.5;
379 }
380 checks += 1;
381
382 let unique_count = {
384 let mut seen = std::collections::HashSet::new();
385 first_row.iter().filter(|s| seen.insert(s.as_str())).count()
386 };
387 if unique_count == first_row.len() {
388 header_score += 0.5;
389 }
390 checks += 1;
391
392 let avg_first_len: f64 = first_row
394 .iter()
395 .map(std::string::String::len)
396 .sum::<usize>() as f64
397 / first_row.len().max(1) as f64;
398 let avg_second_len: f64 = second_row
399 .iter()
400 .map(std::string::String::len)
401 .sum::<usize>() as f64
402 / second_row.len().max(1) as f64;
403
404 if avg_first_len <= avg_second_len {
405 header_score += 0.3;
406 }
407 checks += 1;
408
409 let has_header = (header_score / checks as f64) > 0.4;
411
412 Header::new(has_header, preamble_rows)
413}
414
/// Estimates the average record length in bytes.
///
/// Measures how many bytes of `data` are covered by the first `num_rows`
/// newline-terminated lines (the whole buffer when it holds fewer newlines
/// than that) and divides by `num_rows` using integer division.
///
/// Returns 0 when `num_rows` is 0 or `data` is empty.
fn calculate_avg_record_len(data: &[u8], num_rows: usize) -> usize {
    if num_rows == 0 || data.is_empty() {
        return 0;
    }

    // One past the `num_rows`-th newline, or the full length if there are
    // not that many newlines.
    let covered_bytes = data
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'\n')
        .nth(num_rows - 1)
        .map_or(data.len(), |(newline_pos, _)| newline_pos + 1);

    covered_bytes / num_rows
}
449
/// Strips leading comment lines from `data`.
///
/// A comment line is a line whose first character after optional spaces or
/// tabs is `#`. Such lines are consumed together with their terminator
/// (`\n`, `\r\n` or a lone `\r`) until the first line that is not a comment.
///
/// Returns the number of comment lines removed and the remaining bytes,
/// which start at the beginning of the first non-comment line (its leading
/// whitespace is preserved).
fn skip_preamble(data: &[u8]) -> (usize, &[u8]) {
    let mut remaining = data;
    let mut comment_lines = 0;

    loop {
        let indent = remaining
            .iter()
            .take_while(|&&byte| byte == b' ' || byte == b'\t')
            .count();
        if remaining.get(indent) != Some(&b'#') {
            break;
        }

        let comment = &remaining[indent..];
        let text_len = comment
            .iter()
            .position(|&byte| byte == b'\n' || byte == b'\r')
            .unwrap_or(comment.len());
        let after_text = &comment[text_len..];
        let terminator_len = match after_text {
            [b'\r', b'\n', ..] => 2,
            [b'\r', ..] | [b'\n', ..] => 1,
            _ => 0,
        };

        remaining = &after_text[terminator_len..];
        comment_lines += 1;
    }

    (comment_lines, remaining)
}
491
492fn detect_structural_preamble(table: &crate::tum::table::Table) -> usize {
497 let n = table.field_counts.len();
498 if n < 3 {
499 return 0;
500 }
501
502 let modal_count = table.modal_field_count();
503
504 let mut matching_suffix = vec![0usize; n];
507 let mut count = 0;
508 for i in (0..n).rev() {
509 if table.field_counts[i] == modal_count {
510 count += 1;
511 }
512 matching_suffix[i] = count;
513 }
514
515 for (i, &field_count) in table.field_counts.iter().enumerate() {
517 if field_count == modal_count {
518 let remaining_len = n - i;
519 let matching = matching_suffix[i];
520 let consistency = matching as f64 / remaining_len as f64;
521
522 if consistency >= 0.8 {
523 return i;
524 }
525 }
526 }
527
528 0
529}
530
/// Unit tests for the sniffer: builder configuration, dialect and header
/// detection on small inline samples, preamble handling, average record
/// length, and the record-sample byte cap.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builder setters store the values they are given.
    #[test]
    fn test_sniffer_builder() {
        let mut sniffer = Sniffer::new();
        sniffer
            .sample_size(SampleSize::Records(50))
            .date_preference(DatePreference::DmyFormat)
            .delimiter(b',');

        assert_eq!(sniffer.sample_size, SampleSize::Records(50));
        assert_eq!(sniffer.date_preference, DatePreference::DmyFormat);
        assert_eq!(sniffer.forced_delimiter, Some(b','));
    }

    /// Comma-separated data with a header row: delimiter, header flag,
    /// field count and field names are all reported.
    #[test]
    fn test_sniff_bytes() {
        let data = b"name,age,city\nAlice,30,NYC\nBob,25,LA\n";
        let sniffer = Sniffer::new();

        let metadata = sniffer.sniff_bytes(data).unwrap();

        assert_eq!(metadata.dialect.delimiter, b',');
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.num_fields, 3);
        assert_eq!(metadata.fields, vec!["name", "age", "city"]);
    }

    /// Tab-separated data is detected with a tab delimiter and a header.
    #[test]
    fn test_sniff_tsv() {
        let data = b"name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA\n";
        let sniffer = Sniffer::new();

        let metadata = sniffer.sniff_bytes(data).unwrap();

        assert_eq!(metadata.dialect.delimiter, b'\t');
        assert!(metadata.dialect.header.has_header_row);
    }

    /// Semicolon-separated data is detected with a `;` delimiter.
    #[test]
    fn test_sniff_semicolon() {
        let data = b"name;age;city\nAlice;30;NYC\nBob;25;LA\n";
        let sniffer = Sniffer::new();

        let metadata = sniffer.sniff_bytes(data).unwrap();

        assert_eq!(metadata.dialect.delimiter, b';');
    }

    /// Purely numeric rows must not be mistaken for a header row.
    #[test]
    fn test_sniff_no_header() {
        let data = b"1,2,3\n4,5,6\n7,8,9\n";
        let sniffer = Sniffer::new();

        let metadata = sniffer.sniff_bytes(data).unwrap();

        assert_eq!(metadata.dialect.delimiter, b',');
        assert!(!metadata.dialect.header.has_header_row);
    }

    /// Double-quoted fields (one containing the delimiter) yield a `"` quote.
    #[test]
    fn test_sniff_with_quotes() {
        let data = b"\"name\",\"value\"\n\"hello, world\",123\n\"test\",456\n";
        let sniffer = Sniffer::new();

        let metadata = sniffer.sniff_bytes(data).unwrap();

        assert_eq!(metadata.dialect.delimiter, b',');
        assert_eq!(metadata.dialect.quote, Quote::Some(b'"'));
    }

    /// Empty input is rejected with an error.
    #[test]
    fn test_sniff_empty() {
        let data = b"";
        let sniffer = Sniffer::new();

        let result = sniffer.sniff_bytes(data);
        assert!(result.is_err());
    }

    /// `skip_preamble` strips leading `#` lines, leaves other input
    /// untouched, and accepts indented comment lines.
    #[test]
    fn test_skip_preamble() {
        // Two comment lines followed by data.
        let data = b"# This is a comment\n# Another comment\nname,age\nAlice,30\n";
        let (preamble_rows, remaining) = skip_preamble(data);
        assert_eq!(preamble_rows, 2);
        assert_eq!(remaining, b"name,age\nAlice,30\n");

        // No comment lines at all.
        let data = b"name,age\nAlice,30\n";
        let (preamble_rows, remaining) = skip_preamble(data);
        assert_eq!(preamble_rows, 0);
        assert_eq!(remaining, b"name,age\nAlice,30\n");

        // Comment line preceded by whitespace.
        let data = b"  # Indented comment\nname,age\n";
        let (preamble_rows, remaining) = skip_preamble(data);
        assert_eq!(preamble_rows, 1);
        assert_eq!(remaining, b"name,age\n");
    }

    /// Comment preamble does not disturb delimiter/header/field detection.
    #[test]
    fn test_sniff_with_preamble() {
        let data = b"# LimeSurvey export\n# Generated 2024-01-01\nname,age,city\nAlice,30,NYC\nBob,25,LA\n";
        let sniffer = Sniffer::new();

        let metadata = sniffer.sniff_bytes(data).unwrap();

        assert_eq!(metadata.dialect.delimiter, b',');
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.num_fields, 3);
    }

    /// The number of skipped comment lines is reported in the header info.
    #[test]
    fn test_comment_preamble_propagated() {
        let data = b"# Comment 1\n# Comment 2\nname,age\nAlice,30\nBob,25\n";
        let metadata = Sniffer::new().sniff_bytes(data).unwrap();
        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["name", "age"]);
    }

    /// Leading rows with fewer fields than the table body count as
    /// structural preamble; the header is taken from the row after them.
    #[test]
    fn test_structural_preamble_detection() {
        let data = b"TITLE\nSUB,TITLE\nA,B,C,D,E\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n";
        let metadata = Sniffer::new().sniff_bytes(data).unwrap();
        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["A", "B", "C", "D", "E"]);
    }

    /// One comment line plus one structural line add up to two preamble rows.
    #[test]
    fn test_mixed_preamble_detection() {
        let data =
            b"# File header\nMETADATA\nname,age,city\nAlice,30,NYC\nBob,25,LA\nCharlie,35,CHI\n";
        let metadata = Sniffer::new().sniff_bytes(data).unwrap();
        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["name", "age", "city"]);
    }

    /// Plain data without any preamble reports zero preamble rows.
    #[test]
    fn test_no_preamble() {
        let data = b"a,b,c\n1,2,3\n4,5,6\n";
        let metadata = Sniffer::new().sniff_bytes(data).unwrap();
        assert_eq!(metadata.dialect.header.num_preamble_rows, 0);
    }

    /// Direct checks of `detect_structural_preamble` on hand-built tables.
    #[test]
    fn test_detect_structural_preamble_function() {
        use crate::tum::table::Table;

        // Field counts 1, 2, 3, 3, 3: the first two rows are preamble.
        let mut table = Table::new();
        table.rows = vec![
            vec!["TITLE".to_string()],
            vec!["".to_string(), "".to_string()],
            vec!["A".to_string(), "B".to_string(), "C".to_string()],
            vec!["1".to_string(), "2".to_string(), "3".to_string()],
            vec!["4".to_string(), "5".to_string(), "6".to_string()],
        ];
        table.field_counts = vec![1, 2, 3, 3, 3];
        table.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&table), 2);

        // Fewer than three rows: always 0.
        let mut table = Table::new();
        table.rows = vec![
            vec!["A".to_string(), "B".to_string(), "C".to_string()],
            vec!["1".to_string(), "2".to_string(), "3".to_string()],
        ];
        table.field_counts = vec![3, 3];
        table.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&table), 0);

        // Single row: always 0.
        let mut table = Table::new();
        table.rows = vec![vec!["A".to_string()]];
        table.field_counts = vec![1];
        table.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&table), 0);
    }

    /// 12 bytes across 3 rows gives an average record length of 4.
    #[test]
    fn test_avg_record_len_calculated_from_data() {
        let short_data = b"a,b\n1,2\n3,4\n";
        let sniffer = Sniffer::new();
        let metadata = sniffer.sniff_bytes(short_data).unwrap();

        assert_eq!(metadata.avg_record_len, 4);
    }

    /// Quote characters count towards the length: 28 bytes / 2 rows = 14.
    #[test]
    fn test_avg_record_len_with_quoted_fields() {
        let quoted_data = b"\"hello\",\"world\"\n\"foo\",\"bar\"\n";
        let sniffer = Sniffer::new();
        let metadata = sniffer.sniff_bytes(quoted_data).unwrap();

        assert_eq!(metadata.avg_record_len, 14);
    }

    /// Input slightly larger than `MAX_RECORDS_BYTES` with a record count
    /// whose size estimate hits the cap: sniffing must still succeed.
    #[test]
    fn test_records_mode_cap_boundary_ok() {
        let row = b"col1,col2,col3\n1,2,3\n";
        let total = MAX_RECORDS_BYTES + row.len();
        // Repeat the two-line pattern until `total` bytes are produced.
        let data: Vec<u8> = row.iter().copied().cycle().take(total).collect();
        assert!(
            data.len() > MAX_RECORDS_BYTES,
            "test data must exceed MAX_RECORDS_BYTES to exercise probe-read path"
        );
        let cursor = std::io::Cursor::new(data);
        let mut sniffer = Sniffer::new();
        sniffer.sample_size(SampleSize::Records(200_000));
        let result = sniffer.sniff_reader(cursor);
        assert!(
            result.is_ok(),
            "sniff should succeed at cap boundary: {result:?}"
        );
    }
}