use std::collections::HashSet;
use std::fs::File;
use std::io::{Read, Seek};
use std::path::Path;

use crate::encoding::{detect_and_transcode, detect_encoding, skip_bom};
use crate::error::{Result, SnifferError};
use crate::field_type::Type;
use crate::metadata::{Dialect, Header, Metadata, Quote};
use crate::sample::{DatePreference, SampleSize};
use crate::tum::potential_dialects::{
    PotentialDialect, detect_line_terminator, generate_dialects_with_terminator,
};
use crate::tum::score::{DialectScore, find_best_dialect, score_all_dialects_with_best_table};
use crate::tum::table::{Table, parse_table};
use crate::tum::type_detection::infer_column_types;
20
/// Configurable detector for the dialect and column layout of delimited text.
///
/// A `Sniffer` only holds settings; the builder-style setters in `impl Sniffer`
/// adjust them and the `sniff_*` methods run detection with them.
#[derive(Debug, Clone)]
pub struct Sniffer {
    /// How much input to sample: `read_sample` sizes its reads from this, and a
    /// `SampleSize::Records(n)` value is also passed to dialect scoring as the row cap.
    sample_size: SampleSize,
    /// Preferred interpretation for ambiguous dates, set via `date_preference`.
    // NOTE(review): no code visible in this file reads this field — confirm that it
    // is consumed elsewhere (or still needs wiring into type inference).
    date_preference: DatePreference,
    /// Delimiter byte forced via `delimiter`; `None` lets detection choose.
    forced_delimiter: Option<u8>,
    /// Quote style forced via `quote`; `None` lets detection choose.
    forced_quote: Option<Quote>,
}
46
47impl Default for Sniffer {
48 fn default() -> Self {
49 Self::new()
50 }
51}
52
53impl Sniffer {
54 pub const fn new() -> Self {
56 Self {
57 sample_size: SampleSize::Records(100),
58 date_preference: DatePreference::MdyFormat,
59 forced_delimiter: None,
60 forced_quote: None,
61 }
62 }
63
64 pub fn sample_size(&mut self, sample_size: SampleSize) -> &mut Self {
66 self.sample_size = sample_size;
67 self
68 }
69
70 pub fn date_preference(&mut self, date_preference: DatePreference) -> &mut Self {
72 self.date_preference = date_preference;
73 self
74 }
75
76 pub fn delimiter(&mut self, delimiter: u8) -> &mut Self {
78 self.forced_delimiter = Some(delimiter);
79 self
80 }
81
82 pub fn quote(&mut self, quote: Quote) -> &mut Self {
84 self.forced_quote = Some(quote);
85 self
86 }
87
88 pub fn sniff_path<P: AsRef<Path>>(&mut self, path: P) -> Result<Metadata> {
90 let file = File::open(path.as_ref())?;
91 let mut reader = std::io::BufReader::new(file);
92 self.sniff_reader(&mut reader)
93 }
94
95 pub fn sniff_reader<R: Read + Seek>(&mut self, reader: R) -> Result<Metadata> {
97 let data = self.read_sample(reader)?;
98
99 if data.is_empty() {
100 return Err(SnifferError::EmptyData);
101 }
102
103 self.sniff_bytes(&data)
104 }
105
106 pub fn sniff_bytes(&self, data: &[u8]) -> Result<Metadata> {
108 if data.is_empty() {
109 return Err(SnifferError::EmptyData);
110 }
111
112 let (transcoded_data, was_transcoded) = detect_and_transcode(data);
114 let data = &transcoded_data[..];
115
116 let encoding_info = detect_encoding(data);
118 let is_utf8 = !was_transcoded || encoding_info.is_utf8;
119
120 let data = skip_bom(data);
122
123 let (comment_preamble_rows, data) = skip_preamble(data);
125
126 let line_terminator = detect_line_terminator(data);
128
129 let dialects = if let Some(delim) = self.forced_delimiter {
131 let quotes = if let Some(q) = self.forced_quote {
133 vec![q]
134 } else {
135 vec![Quote::Some(b'"'), Quote::Some(b'\''), Quote::None]
136 };
137
138 quotes
139 .into_iter()
140 .map(|q| PotentialDialect::new(delim, q, line_terminator))
141 .collect()
142 } else {
143 generate_dialects_with_terminator(line_terminator)
144 };
145
146 let max_rows = match self.sample_size {
148 SampleSize::Records(n) => n,
149 SampleSize::Bytes(_) | SampleSize::All => 0, };
151
152 let (scores, best_table) = score_all_dialects_with_best_table(data, &dialects, max_rows);
154
155 let best = find_best_dialect(&scores)
157 .ok_or_else(|| SnifferError::NoDialectDetected("No valid dialect found".to_string()))?;
158
159 let table_for_preamble =
161 best_table.unwrap_or_else(|| parse_table(data, &best.dialect, max_rows));
162 let structural_preamble = detect_structural_preamble(&table_for_preamble);
163
164 let total_preamble_rows = comment_preamble_rows + structural_preamble;
166
167 self.build_metadata(
171 best,
172 is_utf8,
173 structural_preamble,
174 total_preamble_rows,
175 table_for_preamble,
176 )
177 }
178
179 fn read_sample<R: Read + Seek>(&self, mut reader: R) -> Result<Vec<u8>> {
181 match self.sample_size {
182 SampleSize::Bytes(n) => {
183 let mut buffer = vec![0u8; n];
184 let bytes_read = reader.read(&mut buffer)?;
185 buffer.truncate(bytes_read);
186 Ok(buffer)
187 }
188 SampleSize::All => {
189 let mut buffer = Vec::new();
190 reader.read_to_end(&mut buffer)?;
191 Ok(buffer)
192 }
193 SampleSize::Records(n) => {
194 let estimated_size = (n * 1024).max(8192);
197 let mut buffer = vec![0u8; estimated_size];
198 let bytes_read = reader.read(&mut buffer)?;
199 buffer.truncate(bytes_read);
200
201 if bytes_read == estimated_size {
203 let newlines = bytecount::count(&buffer, b'\n');
205 if newlines < n {
206 let additional = (n - newlines) * 2048;
208 let mut more = vec![0u8; additional];
209 let more_read = reader.read(&mut more)?;
210 more.truncate(more_read);
211 buffer.extend(more);
212 }
213 }
214
215 Ok(buffer)
216 }
217 }
218 }
219
220 fn build_metadata(
227 &self,
228 score: &DialectScore,
229 is_utf8: bool,
230 structural_preamble: usize,
231 total_preamble_rows: usize,
232 table: Table,
233 ) -> Result<Metadata> {
234 if table.is_empty() {
235 return Err(SnifferError::EmptyData);
236 }
237
238 let effective_table = if structural_preamble > 0 && table.rows.len() > structural_preamble {
241 let mut et = crate::tum::table::Table::new();
242 et.rows = table.rows[structural_preamble..].to_vec();
243 et.field_counts = table.field_counts[structural_preamble..].to_vec();
244 et.update_modal_field_count();
245 et
246 } else {
247 table.clone()
248 };
249
250 let header = detect_header(&effective_table, &score.dialect, total_preamble_rows);
252
253 let fields = if header.has_header_row && !effective_table.rows.is_empty() {
255 effective_table.rows[0].clone()
256 } else {
257 (0..score.num_fields)
259 .map(|i| format!("field_{}", i + 1))
260 .collect()
261 };
262
263 let data_table = if header.has_header_row && effective_table.rows.len() > 1 {
265 let mut dt = crate::tum::table::Table::new();
266 dt.rows = effective_table.rows[1..].to_vec();
267 dt.field_counts = effective_table.field_counts[1..].to_vec();
268 dt.update_modal_field_count();
269 dt
270 } else {
271 effective_table
272 };
273
274 let types = infer_column_types(&data_table);
276
277 let dialect = Dialect {
279 delimiter: score.dialect.delimiter,
280 header,
281 quote: score.dialect.quote,
282 flexible: !score.is_uniform,
283 is_utf8,
284 };
285
286 let avg_record_len = calculate_avg_record_len(&table);
288
289 Ok(Metadata {
290 dialect,
291 avg_record_len,
292 num_fields: score.num_fields,
293 fields,
294 types,
295 })
296 }
297}
298
299fn detect_header(
301 table: &crate::tum::table::Table,
302 _dialect: &PotentialDialect,
303 preamble_rows: usize,
304) -> Header {
305 if table.rows.is_empty() {
306 return Header::new(false, preamble_rows);
307 }
308
309 if table.rows.len() < 2 {
310 return Header::new(false, preamble_rows);
312 }
313
314 let first_row = &table.rows[0];
315 let second_row = &table.rows[1];
316
317 let mut header_score = 0.0;
323 let mut checks = 0;
324
325 let first_types: Vec<Type> = first_row
327 .iter()
328 .map(|s| crate::tum::type_detection::detect_cell_type(s))
329 .collect();
330 let second_types: Vec<Type> = second_row
331 .iter()
332 .map(|s| crate::tum::type_detection::detect_cell_type(s))
333 .collect();
334
335 let first_text_count = first_types.iter().filter(|&&t| t == Type::Text).count();
336 let second_text_count = second_types.iter().filter(|&&t| t == Type::Text).count();
337
338 if first_text_count > second_text_count {
339 header_score += 1.0;
340 }
341 checks += 1;
342
343 let first_numeric_count = first_types.iter().filter(|&&t| t.is_numeric()).count();
345 if first_text_count > first_numeric_count {
346 header_score += 0.5;
347 }
348 checks += 1;
349
350 let unique_count = {
352 let mut seen = std::collections::HashSet::new();
353 first_row.iter().filter(|s| seen.insert(s.as_str())).count()
354 };
355 if unique_count == first_row.len() {
356 header_score += 0.5;
357 }
358 checks += 1;
359
360 let avg_first_len: f64 = first_row
362 .iter()
363 .map(std::string::String::len)
364 .sum::<usize>() as f64
365 / first_row.len().max(1) as f64;
366 let avg_second_len: f64 = second_row
367 .iter()
368 .map(std::string::String::len)
369 .sum::<usize>() as f64
370 / second_row.len().max(1) as f64;
371
372 if avg_first_len <= avg_second_len {
373 header_score += 0.3;
374 }
375 checks += 1;
376
377 let has_header = (header_score / checks as f64) > 0.4;
379
380 Header::new(has_header, preamble_rows)
381}
382
383fn calculate_avg_record_len(table: &crate::tum::table::Table) -> usize {
388 if table.num_rows() == 0 {
389 return 0;
390 }
391
392 let total_len: usize = table
393 .rows
394 .iter()
395 .map(|row| {
396 let field_len: usize = row.iter().map(String::len).sum();
398 let delimiter_overhead = row.len().saturating_sub(1);
400 field_len + delimiter_overhead + 2
402 })
403 .sum();
404
405 total_len / table.num_rows()
406}
407
/// Strips leading `#` comment lines from `data`.
///
/// A comment line is optional leading spaces or tabs followed by `#`. It runs
/// up to and including its terminator (`\n`, `\r`, or `\r\n`), or to the end of
/// the data. Scanning stops at the first line that is not a comment.
///
/// Returns the number of comment lines skipped and the remaining bytes. The
/// remainder starts at the beginning of the first non-comment line, with any
/// leading whitespace on that line left intact.
fn skip_preamble(data: &[u8]) -> (usize, &[u8]) {
    let mut rest = data;
    let mut skipped = 0;

    loop {
        let indent = rest
            .iter()
            .take_while(|&&b| b == b' ' || b == b'\t')
            .count();

        // Also covers running out of data: `get` then returns `None`.
        if rest.get(indent) != Some(&b'#') {
            break;
        }

        let comment = &rest[indent..];
        let mut end = comment
            .iter()
            .position(|&b| b == b'\n' || b == b'\r')
            .unwrap_or(comment.len());

        // Consume the terminator: a lone `\r`, a lone `\n`, or a `\r\n` pair.
        if comment.get(end) == Some(&b'\r') {
            end += 1;
        }
        if comment.get(end) == Some(&b'\n') {
            end += 1;
        }

        rest = &comment[end..];
        skipped += 1;
    }

    (skipped, rest)
}
449
450fn detect_structural_preamble(table: &crate::tum::table::Table) -> usize {
455 let n = table.field_counts.len();
456 if n < 3 {
457 return 0;
458 }
459
460 let modal_count = table.modal_field_count();
461
462 let mut matching_suffix = vec![0usize; n];
465 let mut count = 0;
466 for i in (0..n).rev() {
467 if table.field_counts[i] == modal_count {
468 count += 1;
469 }
470 matching_suffix[i] = count;
471 }
472
473 for (i, &field_count) in table.field_counts.iter().enumerate() {
475 if field_count == modal_count {
476 let remaining_len = n - i;
477 let matching = matching_suffix[i];
478 let consistency = matching as f64 / remaining_len as f64;
479
480 if consistency >= 0.8 {
481 return i;
482 }
483 }
484 }
485
486 0
487}
488
#[cfg(test)]
mod tests {
    use super::*;

    /// Sniffs `data` with a default-configured sniffer, panicking on failure.
    fn sniff(data: &[u8]) -> Metadata {
        Sniffer::new().sniff_bytes(data).unwrap()
    }

    /// Builds one owned table row from string slices.
    fn row(cells: &[&str]) -> Vec<String> {
        cells.iter().map(|cell| cell.to_string()).collect()
    }

    // Builder setters store exactly what they are given.
    #[test]
    fn test_sniffer_builder() {
        let mut sniffer = Sniffer::new();
        sniffer
            .sample_size(SampleSize::Records(50))
            .date_preference(DatePreference::DmyFormat)
            .delimiter(b',');

        assert_eq!(sniffer.sample_size, SampleSize::Records(50));
        assert_eq!(sniffer.date_preference, DatePreference::DmyFormat);
        assert_eq!(sniffer.forced_delimiter, Some(b','));
    }

    // Comma-separated data with a textual header row.
    #[test]
    fn test_sniff_bytes() {
        let metadata = sniff(b"name,age,city\nAlice,30,NYC\nBob,25,LA\n");

        assert_eq!(metadata.dialect.delimiter, b',');
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.num_fields, 3);
        assert_eq!(metadata.fields, vec!["name", "age", "city"]);
    }

    // Tab-separated data.
    #[test]
    fn test_sniff_tsv() {
        let metadata = sniff(b"name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA\n");

        assert_eq!(metadata.dialect.delimiter, b'\t');
        assert!(metadata.dialect.header.has_header_row);
    }

    // Semicolon-separated data.
    #[test]
    fn test_sniff_semicolon() {
        let metadata = sniff(b"name;age;city\nAlice;30;NYC\nBob;25;LA\n");

        assert_eq!(metadata.dialect.delimiter, b';');
    }

    // All-numeric rows must not be mistaken for a header.
    #[test]
    fn test_sniff_no_header() {
        let metadata = sniff(b"1,2,3\n4,5,6\n7,8,9\n");

        assert_eq!(metadata.dialect.delimiter, b',');
        assert!(!metadata.dialect.header.has_header_row);
    }

    // A quoted field containing the delimiter.
    #[test]
    fn test_sniff_with_quotes() {
        let metadata = sniff(b"\"name\",\"value\"\n\"hello, world\",123\n\"test\",456\n");

        assert_eq!(metadata.dialect.delimiter, b',');
        assert_eq!(metadata.dialect.quote, Quote::Some(b'"'));
    }

    // Empty input is an error rather than a panic.
    #[test]
    fn test_sniff_empty() {
        let outcome = Sniffer::new().sniff_bytes(b"");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_skip_preamble() {
        // Two comment lines in front of the data.
        let (preamble_rows, remaining) =
            skip_preamble(b"# This is a comment\n# Another comment\nname,age\nAlice,30\n");
        assert_eq!(preamble_rows, 2);
        assert_eq!(remaining, b"name,age\nAlice,30\n");

        // No comment lines at all.
        let (preamble_rows, remaining) = skip_preamble(b"name,age\nAlice,30\n");
        assert_eq!(preamble_rows, 0);
        assert_eq!(remaining, b"name,age\nAlice,30\n");

        // Leading whitespace before the `#` still counts as a comment.
        let (preamble_rows, remaining) = skip_preamble(b" # Indented comment\nname,age\n");
        assert_eq!(preamble_rows, 1);
        assert_eq!(remaining, b"name,age\n");
    }

    // Comment preamble must not disturb dialect or header detection.
    #[test]
    fn test_sniff_with_preamble() {
        let metadata = sniff(
            b"# LimeSurvey export\n# Generated 2024-01-01\nname,age,city\nAlice,30,NYC\nBob,25,LA\n",
        );

        assert_eq!(metadata.dialect.delimiter, b',');
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.num_fields, 3);
    }

    // The number of skipped comment lines is reported in the header.
    #[test]
    fn test_comment_preamble_propagated() {
        let metadata = sniff(b"# Comment 1\n# Comment 2\nname,age\nAlice,30\nBob,25\n");

        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["name", "age"]);
    }

    // Rows with the wrong field count ahead of the real table are preamble.
    #[test]
    fn test_structural_preamble_detection() {
        let metadata = sniff(b"TITLE\nSUB,TITLE\nA,B,C,D,E\n1,2,3,4,5\n2,3,4,5,6\n3,4,5,6,7\n");

        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["A", "B", "C", "D", "E"]);
    }

    // One comment line plus one structural row gives two preamble rows in total.
    #[test]
    fn test_mixed_preamble_detection() {
        let metadata = sniff(
            b"# File header\nMETADATA\nname,age,city\nAlice,30,NYC\nBob,25,LA\nCharlie,35,CHI\n",
        );

        assert_eq!(metadata.dialect.header.num_preamble_rows, 2);
        assert!(metadata.dialect.header.has_header_row);
        assert_eq!(metadata.fields, vec!["name", "age", "city"]);
    }

    #[test]
    fn test_no_preamble() {
        let metadata = sniff(b"a,b,c\n1,2,3\n4,5,6\n");

        assert_eq!(metadata.dialect.header.num_preamble_rows, 0);
    }

    #[test]
    fn test_detect_structural_preamble_function() {
        // Two short rows ahead of a consistent three-column table.
        let mut with_preamble = Table::new();
        with_preamble.rows = vec![
            row(&["TITLE"]),
            row(&["", ""]),
            row(&["A", "B", "C"]),
            row(&["1", "2", "3"]),
            row(&["4", "5", "6"]),
        ];
        with_preamble.field_counts = vec![1, 2, 3, 3, 3];
        with_preamble.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&with_preamble), 2);

        // Fewer than three rows: never any preamble.
        let mut two_rows = Table::new();
        two_rows.rows = vec![row(&["A", "B", "C"]), row(&["1", "2", "3"])];
        two_rows.field_counts = vec![3, 3];
        two_rows.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&two_rows), 0);

        // Single-row table.
        let mut single_row = Table::new();
        single_row.rows = vec![row(&["A"])];
        single_row.field_counts = vec![1];
        single_row.update_modal_field_count();
        assert_eq!(detect_structural_preamble(&single_row), 0);
    }

    // The average record length tracks the actual data.
    #[test]
    fn test_avg_record_len_calculated_from_data() {
        let metadata = sniff(b"a,b\n1,2\n3,4\n");

        assert!(
            metadata.avg_record_len < 100,
            "avg_record_len should be small for short records, got {}",
            metadata.avg_record_len
        );

        let metadata_long =
            sniff(b"very_long_field_name,another_long_field_name\nvalue1,value2\nval3,val4\n");

        assert!(
            metadata_long.avg_record_len > metadata.avg_record_len,
            "longer fields should have larger avg_record_len: short={}, long={}",
            metadata.avg_record_len,
            metadata_long.avg_record_len
        );
    }
}