csv_nose/
lib.rs

1//! csv-nose: CSV dialect sniffer using the Table Uniformity Method
2//!
3//! A drop-in replacement for qsv-sniffer with improved dialect detection accuracy
4//! using the Table Uniformity Method from the CSVsniffer paper.
5//!
6//! # Quick Start
7//!
8//! ```no_run
9//! use csv_nose::{Sniffer, SampleSize};
10//!
11//! // Create a sniffer with default settings
12//! let mut sniffer = Sniffer::new();
13//!
14//! // Optionally configure sampling
15//! sniffer.sample_size(SampleSize::Records(100));
16//!
17//! // Sniff a file
18//! let metadata = sniffer.sniff_path("data.csv").unwrap();
19//!
20//! println!("Delimiter: {}", metadata.dialect.delimiter as char);
21//! println!("Has header: {}", metadata.dialect.header.has_header_row);
22//! println!("Fields: {:?}", metadata.fields);
23//! println!("Types: {:?}", metadata.types);
24//! ```
25//!
26//! # API Compatibility
27//!
28//! This crate provides API compatibility with qsv-sniffer, making it easy to
29//! switch between implementations:
30//!
31//! ```no_run
32//! use csv_nose::{Sniffer, Metadata, Dialect, Header, Quote, Type, SampleSize, DatePreference};
33//!
34//! let mut sniffer = Sniffer::new();
35//! sniffer
36//!     .sample_size(SampleSize::Records(50))
37//!     .date_preference(DatePreference::MdyFormat)
38//!     .delimiter(b',')
39//!     .quote(Quote::Some(b'"'));
40//! ```
41//!
42//! # The Table Uniformity Method
43//!
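//! This library implements the Table Uniformity Method from the CSVsniffer paper:
//! "Detecting CSV File Dialects by Table Uniformity Measurement and Data Type Inference"
//! by W. García (2024).
//!
//! Related prior work on dialect detection:
//! "Wrangling Messy CSV Files by Detecting Row and Type Patterns"
//! by van den Burg, Nazábal, and Sutton (2019).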
47//!
48//! The algorithm achieves ~93% accuracy on real-world messy CSV files by:
49//! 1. Testing multiple potential dialects (delimiter, quote, line terminator combinations)
50//! 2. Scoring each dialect based on table uniformity (consistent field counts)
51//! 3. Scoring based on type detection (consistent data types within columns)
52//! 4. Selecting the dialect with the highest combined score
53
54mod encoding;
55mod error;
56mod field_type;
57pub mod metadata;
58mod sample;
59mod sniffer;
60mod tum;
61
62// Re-export public API (qsv-sniffer compatible)
63pub use error::{Result, SnifferError};
64pub use field_type::Type;
65pub use metadata::{Dialect, Header, Metadata, Quote};
66pub use sample::{DatePreference, SampleSize};
67pub use sniffer::Sniffer;
68
69// Re-export for advanced usage
70pub use encoding::{EncodingInfo, detect_encoding, is_utf8};
71
#[cfg(test)]
mod tests {
    // Smoke tests for the items re-exported from the crate root.

    use super::*;

    /// Each re-exported public type can be named and constructed via the crate root.
    #[test]
    fn test_public_api() {
        let _sniffer = Sniffer::new();
        let _sample = SampleSize::Records(100);
        let _date_pref = DatePreference::MdyFormat;
        let _quote = Quote::Some(b'"');
        let _type = Type::Text;
    }

    /// Sniffing a small comma-separated buffer reports `,` as the delimiter
    /// and three fields.
    #[test]
    fn test_sniff_simple_csv() {
        let data = b"a,b,c\n1,2,3\n4,5,6\n";
        let sniffer = Sniffer::new();

        let metadata = sniffer
            .sniff_bytes(data)
            .expect("sniffing a well-formed three-column CSV buffer should succeed");

        assert_eq!(metadata.dialect.delimiter, b',');
        assert_eq!(metadata.num_fields, 3);
    }

    /// The configuration setters can be chained, and the chain yields a
    /// mutable reference to the sniffer.
    #[test]
    fn test_builder_pattern() {
        let mut sniffer = Sniffer::new();

        // The explicit type annotation turns the "returns &mut Self" contract
        // into a compile-time check: if any setter in the chain stopped
        // returning a mutable reference to the sniffer, this test would no
        // longer build.
        let _chained: &mut Sniffer = sniffer
            .sample_size(SampleSize::Bytes(4096))
            .date_preference(DatePreference::DmyFormat)
            .delimiter(b';')
            .quote(Quote::None);
    }
}