reovim_module_codec_csv/
classifier.rs1use reovim_driver_codec::{ContentClassifier, ContentType};
7
/// Content-type identifier reported for comma-separated data.
pub const CSV: &str = "text/csv";

/// Content-type identifier reported for tab-separated data.
pub const TSV: &str = "text/tsv";

/// Content-type identifier reported for pipe-separated data.
pub const PSV: &str = "text/psv";

/// Content-type identifier reported for semicolon-separated data.
pub const SCSV: &str = "text/scsv";

/// Fewest lines a sample must contain before content-based detection is attempted.
const MIN_ROWS: usize = 2;

/// Fewest fields every non-empty line must split into for a delimiter to qualify.
const MIN_COLUMNS: usize = 2;

/// Upper bound, in bytes, on how much of the input content-based detection inspects.
const SAMPLE_SIZE: usize = 8 * 1024;

/// Lower-case file extensions paired with the content type they select.
const CSV_EXTENSIONS: &[(&str, &str)] = &[
    ("csv", CSV),
    ("tsv", TSV),
    ("tab", TSV),
    ("psv", PSV),
];

/// Candidate delimiter bytes paired with the content type they indicate.
///
/// Order matters: content-based detection walks this table front to back and stops at
/// the first delimiter that fits.
const DELIMITERS: &[(u8, &str)] = &[
    (b',', CSV),
    (b'\t', TSV),
    (b'|', PSV),
    (b';', SCSV),
];
39
/// Stateless classifier for delimiter-separated text (comma, tab, pipe, semicolon).
///
/// The type is a unit struct, so every instance is interchangeable.
pub struct CsvClassifier;

impl CsvClassifier {
    /// Builds a classifier. Usable in `const` contexts.
    #[must_use]
    pub const fn new() -> Self {
        CsvClassifier
    }
}

#[cfg_attr(coverage_nightly, coverage(off))]
impl Default for CsvClassifier {
    /// Same value as [`CsvClassifier::new`].
    fn default() -> Self {
        CsvClassifier::new()
    }
}
62
63impl ContentClassifier for CsvClassifier {
64 #[cfg_attr(coverage_nightly, coverage(off))]
65 fn classify(&self, raw: &[u8], path: &str) -> Option<ContentType> {
66 if let Some(ct) = extension_content_type(path) {
68 return Some(ContentType::new(ct));
69 }
70
71 let sample = if raw.len() > SAMPLE_SIZE {
73 &raw[..SAMPLE_SIZE]
74 } else {
75 raw
76 };
77 let text = std::str::from_utf8(sample).ok()?;
78
79 let lines: Vec<&str> = text.lines().collect();
81 if lines.len() < MIN_ROWS {
82 return None;
83 }
84
85 for &(delim, content_type) in DELIMITERS {
87 if is_consistent_delimiter(&lines, delim) {
88 return Some(ContentType::new(content_type));
89 }
90 }
91
92 None
93 }
94
95 fn priority(&self) -> u8 {
96 15
97 }
98
99 fn name(&self) -> &'static str {
100 "csv"
101 }
102}
103
104fn is_consistent_delimiter(lines: &[&str], delim: u8) -> bool {
109 let delim_char = delim as char;
110 let mut expected_count = 0;
111 let mut valid_lines = 0;
112
113 for line in lines {
114 if line.is_empty() {
115 continue;
116 }
117
118 let count = line.matches(delim_char).count() + 1;
119 if count < MIN_COLUMNS {
120 return false;
121 }
122
123 if valid_lines == 0 {
124 expected_count = count;
125 } else if count != expected_count {
126 return false;
127 }
128
129 valid_lines += 1;
130 }
131
132 valid_lines >= MIN_ROWS
133}
134
135fn extension_content_type(path: &str) -> Option<&'static str> {
137 let ext = std::path::Path::new(path).extension()?;
138 let ext_str = ext.to_str()?;
139 let lower = ext_str.to_ascii_lowercase();
140 CSV_EXTENSIONS
141 .iter()
142 .find(|(e, _)| *e == lower.as_str())
143 .map(|(_, ct)| *ct)
144}
145
146#[cfg(test)]
147#[path = "classifier_tests.rs"]
148mod tests;