1use std::fs::File;
6use std::io::{Read, Seek};
7use std::path::Path;
8
9use crate::encoding::{detect_and_transcode, detect_encoding, skip_bom};
10use crate::error::{Result, SnifferError};
11use crate::field_type::Type;
12use crate::metadata::{Dialect, Header, Metadata, Quote};
13use crate::sample::{DatePreference, SampleSize};
14use crate::tum::potential_dialects::{
15 PotentialDialect, detect_line_terminator, generate_dialects_with_terminator,
16};
17use crate::tum::score::{DialectScore, find_best_dialect, score_all_dialects_with_best_table};
18use crate::tum::table::{Table, parse_table};
19use crate::tum::type_detection::infer_column_types;
20
/// Configurable detector for the dialect (delimiter, quoting, header and
/// preamble rows) and column types of delimited text data.
///
/// The fields are private; they are set through the builder-style methods on
/// the inherent `impl` and read when a sample is taken and analysed.
#[derive(Debug, Clone)]
pub struct Sniffer {
    /// How much of the input is read when sampling from a reader; for the
    /// `Records` variant the count is also passed to dialect scoring as the
    /// row limit.
    sample_size: SampleSize,
    /// Date-format preference set through `date_preference()`.
    /// NOTE(review): no code visible in this file reads this value — confirm
    /// where (or whether) it is consumed.
    date_preference: DatePreference,
    /// Delimiter byte set through `delimiter()`; `None` means candidate
    /// delimiters are generated automatically.
    forced_delimiter: Option<u8>,
    /// Quote style set through `quote()`; consulted when the candidate
    /// dialects are built in `sniff_bytes`.
    forced_quote: Option<Quote>,
}
46
47impl Default for Sniffer {
48 fn default() -> Self {
49 Self::new()
50 }
51}
52
53impl Sniffer {
54 pub const fn new() -> Self {
56 Self {
57 sample_size: SampleSize::Records(100),
58 date_preference: DatePreference::MdyFormat,
59 forced_delimiter: None,
60 forced_quote: None,
61 }
62 }
63
64 pub fn sample_size(&mut self, sample_size: SampleSize) -> &mut Self {
66 self.sample_size = sample_size;
67 self
68 }
69
70 pub fn date_preference(&mut self, date_preference: DatePreference) -> &mut Self {
72 self.date_preference = date_preference;
73 self
74 }
75
76 pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
78 self.forced_delimiter = Some(delimiter);
79 self
80 }
81
82 pub fn quote(&mut self, quote: Quote) -> &mut Self {
84 self.forced_quote = Some(quote);
85 self
86 }
87
88 pub fn sniff_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Metadata> {
90 let file = File::open(path.as_ref())?;
91 let mut reader = std::io::BufReader::new(file);
92 self.sniff_reader(&mut reader)
93 }
94
95 pub fn sniff_reader<R: Read + Seek>(&mut self, reader: R) -> Result<Metadata> {
97 let data = self.read_sample(reader)?;
98
99 if data.is_empty() {
100 return Err(SnifferError::EmptyData);
101 }
102
103 self.sniff_bytes(&data)
104 }
105
106 pub fn sniff_bytes(&self, data: &[u8]) -> Result<Metadata> {
108 if data.is_empty() {
109 return Err(SnifferError::EmptyData);
110 }
111
112 let (transcoded_data, was_transcoded) = detect_and_transcode(data);
114 let data = &transcoded_data[..];
115
116 let encoding_info = detect_encoding(data);
118 let is_utf8 = !was_transcoded || encoding_info.is_utf8;
119
120 let data = skip_bom(data);
122
123 let (comment_preamble_rows, data) = skip_preamble(data);
125
126 let line_terminator = detect_line_terminator(data);
128
129 let dialects = self.forced_delimiter.map_or_else(
131 || generate_dialects_with_terminator(line_terminator),
132 |delim| {
133 let quotes = if let Some(q) = self.forced_quote {
135 vec![q]
136 } else {
137 vec![Quote::Some(b'"'), Quote::Some(b'\''), Quote::None]
138 };
139
140 quotes
141 .into_iter()
142 .map(|q| PotentialDialect::new(delim, q, line_terminator))
143 .collect()
144 },
145 );
146 let max_rows = match self.sample_size {
148 SampleSize::Records(n) => n,
149 SampleSize::Bytes(_) | SampleSize::All => 0, };
151
152 let (scores, best_table) = score_all_dialects_with_best_table(data, &dialects, max_rows);
154
155 let best = find_best_dialect(&scores)
157 .ok_or_else(|| SnifferError::NoDialectDetected("No valid dialect found".to_string()))?;
158
159 let table_for_preamble =
161 best_table.unwrap_or_else(|| parse_table(data, &best.dialect, max_rows));
162 let structural_preamble = detect_structural_preamble(&table_for_preamble);
163
164 let total_preamble_rows = comment_preamble_rows + structural_preamble;
166
167 self.build_metadata(
171 best,
172 is_utf8,
173 structural_preamble,
174 total_preamble_rows,
175 &table_for_preamble,
176 data,
177 )
178 }
179
180 fn read_sample<R: Read + Seek>(&self, mut reader: R) -> Result<Vec<u8>> {
182 match self.sample_size {
183 SampleSize::Bytes(n) => {
184 let mut buffer = vec![0u8; n];
185 let bytes_read = reader.read(&mut buffer)?;
186 buffer.truncate(bytes_read);
187 Ok(buffer)
188 }
189 SampleSize::All => {
190 let mut buffer = Vec::new();
191 reader.read_to_end(&mut buffer)?;
192 Ok(buffer)
193 }
194 SampleSize::Records(n) => {
195 let estimated_size = (n * 1024).max(8192);
198 let mut buffer = vec![0u8; estimated_size];
199 let bytes_read = reader.read(&mut buffer)?;
200 buffer.truncate(bytes_read);
201
202 if bytes_read == estimated_size {
204 let newlines = bytecount::count(&buffer, b'\n');
206 if newlines < n {
207 let additional = (n - newlines) * 2048;
209 let mut more = vec![0u8; additional];
210 let more_read = reader.read(&mut more)?;
211 more.truncate(more_read);
212 buffer.extend(more);
213 }
214 }
215
216 Ok(buffer)
217 }
218 }
219 }
220
221 fn build_metadata(
229 &self,
230 score: &DialectScore,
231 is_utf8: bool,
232 structural_preamble: usize,
233 total_preamble_rows: usize,
234 table: &Table,
235 data: &[u8],
236 ) -> Result<Metadata> {
237 if table.is_empty() {
238 return Err(SnifferError::EmptyData);
239 }
240
241 let effective_table = if structural_preamble > 0 && table.rows.len() > structural_preamble {
244 let mut et = crate::tum::table::Table::new();
245 et.rows = table.rows[structural_preamble..].to_vec();
246 et.field_counts = table.field_counts[structural_preamble..].to_vec();
247 et.update_modal_field_count();
248 et
249 } else {
250 table.clone()
251 };
252
253 let header = detect_header(&effective_table, &score.dialect, total_preamble_rows);
255
256 let fields = if header.has_header_row && !effective_table.rows.is_empty() {
258 effective_table.rows[0].clone()
259 } else {
260 (0..score.num_fields)
262 .map(|i| format!("field_{}", i + 1))
263 .collect()
264 };
265
266 let data_table = if header.has_header_row && effective_table.rows.len() > 1 {
268 let mut dt = crate::tum::table::Table::new();
269 dt.rows = effective_table.rows[1..].to_vec();
270 dt.field_counts = effective_table.field_counts[1..].to_vec();
271 dt.update_modal_field_count();
272 dt
273 } else {
274 effective_table
275 };
276
277 let types = infer_column_types(&data_table);
279
280 let dialect = Dialect {
282 delimiter: score.dialect.delimiter,
283 header,
284 quote: score.dialect.quote,
285 flexible: !score.is_uniform,
286 is_utf8,
287 };
288
289 let avg_record_len = calculate_avg_record_len(data, table.num_rows());
291
292 Ok(Metadata {
293 dialect,
294 avg_record_len,
295 num_fields: score.num_fields,
296 fields,
297 types,
298 })
299 }
300}
301
302fn detect_header(
306 table: &crate::tum::table::Table,
307 _dialect: &PotentialDialect,
308 preamble_rows: usize,
309) -> Header {
310 if table.rows.is_empty() {
311 return Header::new(false, preamble_rows);
312 }
313
314 if table.rows.len() < 2 {
315 return Header::new(false, preamble_rows);
317 }
318
319 let first_row = &table.rows[0];
320 let second_row = &table.rows[1];
321
322 let mut header_score = 0.0;
328 let mut checks = 0;
329
330 let (first_text_count, first_numeric_count) =
332 first_row.iter().fold((0, 0), |(text, num), s| {
333 let t = crate::tum::type_detection::detect_cell_type(s);
334 (
335 text + usize::from(t == Type::Text),
336 num + usize::from(t.is_numeric()),
337 )
338 });
339
340 let second_text_count = second_row
342 .iter()
343 .filter(|s| crate::tum::type_detection::detect_cell_type(s) == Type::Text)
344 .count();
345
346 if first_text_count > second_text_count {
347 header_score += 1.0;
348 }
349 checks += 1;
350
351 if first_text_count > first_numeric_count {
353 header_score += 0.5;
354 }
355 checks += 1;
356
357 let unique_count = {
359 let mut seen = std::collections::HashSet::new();
360 first_row.iter().filter(|s| seen.insert(s.as_str())).count()
361 };
362 if unique_count == first_row.len() {
363 header_score += 0.5;
364 }
365 checks += 1;
366
367 let avg_first_len: f64 = first_row
369 .iter()
370 .map(std::string::String::len)
371 .sum::<usize>() as f64
372 / first_row.len().max(1) as f64;
373 let avg_second_len: f64 = second_row
374 .iter()
375 .map(std::string::String::len)
376 .sum::<usize>() as f64
377 / second_row.len().max(1) as f64;
378
379 if avg_first_len <= avg_second_len {
380 header_score += 0.3;
381 }
382 checks += 1;
383
384 let has_header = (header_score / checks as f64) > 0.4;
386
387 Header::new(has_header, preamble_rows)
388}
389
/// Estimates the average record length in bytes.
///
/// The byte span covered by the first `num_rows` newline-terminated lines of
/// `data` is divided by `num_rows`. When `data` holds fewer than `num_rows`
/// newline bytes, the whole length of `data` is used as the span. Returns 0
/// when `data` is empty or `num_rows` is 0.
fn calculate_avg_record_len(data: &[u8], num_rows: usize) -> usize {
    if data.is_empty() || num_rows == 0 {
        return 0;
    }

    // One past the position of the `num_rows`-th newline, or the full length
    // when there are not that many newlines.
    let span = data
        .iter()
        .enumerate()
        .filter(|&(_, &byte)| byte == b'\n')
        .nth(num_rows - 1)
        .map_or(data.len(), |(index, _)| index + 1);

    span / num_rows
}
424
/// Strips leading comment lines from `data`.
///
/// A comment line is one whose first byte after optional spaces/tabs is `#`.
/// Each such line is consumed together with its terminator (`\r`, `\n` or
/// `\r\n`). Scanning stops at the first line that is not a comment; that
/// line is returned untouched, including any leading whitespace.
///
/// Returns the number of comment lines skipped and the remaining bytes.
fn skip_preamble(data: &[u8]) -> (usize, &[u8]) {
    let mut remaining = data;
    let mut comment_lines = 0;

    loop {
        let indent = remaining
            .iter()
            .take_while(|&&byte| byte == b' ' || byte == b'\t')
            .count();
        let line = &remaining[indent..];

        if line.first() != Some(&b'#') {
            break;
        }

        let body_len = line
            .iter()
            .position(|&byte| byte == b'\n' || byte == b'\r')
            .unwrap_or(line.len());

        // Consume the terminator: an optional `\r` followed by an optional `\n`.
        let mut tail = &line[body_len..];
        if let [b'\r', after @ ..] = tail {
            tail = after;
        }
        if let [b'\n', after @ ..] = tail {
            tail = after;
        }

        comment_lines += 1;
        remaining = tail;
    }

    (comment_lines, remaining)
}
466
467fn detect_structural_preamble(table: &crate::tum::table::Table) -> usize {
472 let n = table.field_counts.len();
473 if n < 3 {
474 return 0;
475 }
476
477 let modal_count = table.modal_field_count();
478
479 let mut matching_suffix = vec![0usize; n];
482 let mut count = 0;
483 for i in (0..n).rev() {
484 if table.field_counts[i] == modal_count {
485 count += 1;
486 }
487 matching_suffix[i] = count;
488 }
489
490 for (i, &field_count) in table.field_counts.iter().enumerate() {
492 if field_count == modal_count {
493 let remaining_len = n - i;
494 let matching = matching_suffix[i];
495 let consistency = matching as f64 / remaining_len as f64;
496
497 if consistency >= 0.8 {
498 return i;
499 }
500 }
501 }
502
503 0
504}
505
#[cfg(test)]
mod tests {
    use super::*;

    /// Sniffs `bytes` with a default-configured sniffer, panicking on failure.
    fn sniff(bytes: &[u8]) -> Metadata {
        Sniffer::new().sniff_bytes(bytes).unwrap()
    }

    /// Builds an owned table row from string literals.
    fn row(cells: &[&str]) -> Vec<String> {
        cells.iter().map(|cell| (*cell).to_string()).collect()
    }

    // Builder setters store their arguments and can be chained.
    #[test]
    fn test_sniffer_builder() {
        let mut configured = Sniffer::new();
        configured
            .sample_size(SampleSize::Records(50))
            .date_preference(DatePreference::DmyFormat)
            .delimiter(b',');

        assert_eq!(configured.sample_size, SampleSize::Records(50));
        assert_eq!(configured.date_preference, DatePreference::DmyFormat);
        assert_eq!(configured.forced_delimiter, Some(b','));
    }

    // Comma-separated input with a header row.
    #[test]
    fn test_sniff_bytes() {
        let detected = sniff(b"name,age,city\nAlice,30,NYC\nBob,25,LA\n");

        assert_eq!(detected.dialect.delimiter, b',');
        assert!(detected.dialect.header.has_header_row);
        assert_eq!(detected.num_fields, 3);
        assert_eq!(detected.fields, vec!["name", "age", "city"]);
    }

    // Tab-separated input.
    #[test]
    fn test_sniff_tsv() {
        let detected = sniff(b"name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA\n");

        assert_eq!(detected.dialect.delimiter, b'\t');
        assert!(detected.dialect.header.has_header_row);
    }

    // Semicolon-separated input.
    #[test]
    fn test_sniff_semicolon() {
        let detected = sniff(b"name;age;city\nAlice;30;NYC\nBob;25;LA\n");

        assert_eq!(detected.dialect.delimiter, b';');
    }

    // Purely numeric input has no header row.
    #[test]
    fn test_sniff_no_header() {
        let detected = sniff(b"1,2,3\n4,5,6\n7,8,9\n");

        assert_eq!(detected.dialect.delimiter, b',');
        assert!(!detected.dialect.header.has_header_row);
    }

    // Double-quoted fields containing the delimiter.
    #[test]
    fn test_sniff_with_quotes() {
        let detected = sniff(b"\"name\",\"value\"\n\"hello, world\",123\n\"test\",456\n");

        assert_eq!(detected.dialect.delimiter, b',');
        assert_eq!(detected.dialect.quote, Quote::Some(b'"'));
    }

    // Empty input is an error.
    #[test]
    fn test_sniff_empty() {
        let outcome = Sniffer::new().sniff_bytes(b"");
        assert!(outcome.is_err());
    }

    // Comment lines are counted and removed; everything else is untouched.
    #[test]
    fn test_skip_preamble() {
        let cases: [(&[u8], usize, &[u8]); 3] = [
            (
                b"# This is a comment\n# Another comment\nname,age\nAlice,30\n",
                2,
                b"name,age\nAlice,30\n",
            ),
            (b"name,age\nAlice,30\n", 0, b"name,age\nAlice,30\n"),
            (b"  # Indented comment\nname,age\n", 1, b"name,age\n"),
        ];

        for (input, expected_rows, expected_rest) in cases {
            let (skipped, rest) = skip_preamble(input);
            assert_eq!(skipped, expected_rows);
            assert_eq!(rest, expected_rest);
        }
    }

    // A comment preamble does not disturb dialect or header detection.
    #[test]
    fn test_sniff_with_preamble() {
        let detected = sniff(
            b"# LimeSurvey export\n# Generated 2024-01-01\nname,age,city\nAlice,30,NYC\nBob,25,LA\n",
        );

        assert_eq!(detected.dialect.delimiter, b',');
        assert!(detected.dialect.header.has_header_row);
        assert_eq!(detected.num_fields, 3);
    }

    // The number of skipped comment lines is reported in the header.
    #[test]
    fn test_comment_preamble_propagated() {
        let detected = sniff(b"# Comment 1\n# Comment 2\nname,age\nAlice,30\nBob,25\n");

        assert_eq!(detected.dialect.header.num_preamble_rows, 2);
        assert!(detected.dialect.header.has_header_row);
        assert_eq!(detected.fields, vec!["name", "age"]);
    }

    // Rows with an off-modal field count before the table are preamble.
    #[test]
    fn test_structural_preamble_detection() {
        let detected = sniff(b"TITLE\nSUB,TITLE\nA,B,C,D,E\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n");

        assert_eq!(detected.dialect.header.num_preamble_rows, 2);
        assert!(detected.dialect.header.has_header_row);
        assert_eq!(detected.fields, vec!["A", "B", "C", "D", "E"]);
    }

    // One comment line plus one structural line add up to two preamble rows.
    #[test]
    fn test_mixed_preamble_detection() {
        let detected = sniff(
            b"# File header\nMETADATA\nname,age,city\nAlice,30,NYC\nBob,25,LA\nCharlie,35,CHI\n",
        );

        assert_eq!(detected.dialect.header.num_preamble_rows, 2);
        assert!(detected.dialect.header.has_header_row);
        assert_eq!(detected.fields, vec!["name", "age", "city"]);
    }

    // Plain input reports zero preamble rows.
    #[test]
    fn test_no_preamble() {
        let detected = sniff(b"a,b,c\n1,2,3\n4,5,6\n");

        assert_eq!(detected.dialect.header.num_preamble_rows, 0);
    }

    // Direct checks of `detect_structural_preamble` on hand-built tables.
    #[test]
    fn test_detect_structural_preamble_function() {
        // Two narrow rows precede three rows of modal width.
        let mut with_preamble = Table::new();
        with_preamble.rows = vec![
            row(&["TITLE"]),
            row(&["", ""]),
            row(&["A", "B", "C"]),
            row(&["1", "2", "3"]),
            row(&["4", "5", "6"]),
        ];
        with_preamble.field_counts = vec![1, 2, 3, 3, 3];
        with_preamble.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&with_preamble), 2);

        // Fewer than three rows: nothing is reported.
        let mut two_rows = Table::new();
        two_rows.rows = vec![row(&["A", "B", "C"]), row(&["1", "2", "3"])];
        two_rows.field_counts = vec![3, 3];
        two_rows.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&two_rows), 0);

        // A single row: nothing is reported.
        let mut single_row = Table::new();
        single_row.rows = vec![row(&["A"])];
        single_row.field_counts = vec![1];
        single_row.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&single_row), 0);
    }

    // Three 4-byte records average to 4 bytes.
    #[test]
    fn test_avg_record_len_calculated_from_data() {
        let detected = sniff(b"a,b\n1,2\n3,4\n");

        assert_eq!(detected.avg_record_len, 4);
    }

    // Quote characters count towards the record length: (16 + 12) / 2.
    #[test]
    fn test_avg_record_len_with_quoted_fields() {
        let detected = sniff(b"\"hello\",\"world\"\n\"foo\",\"bar\"\n");

        assert_eq!(detected.avg_record_len, 14);
    }
}