Skip to main content

reovim_module_codec_csv/
classifier.rs

1//! CSV/TSV/PSV content classifier.
2//!
3//! Detects delimiter-separated value files by analyzing delimiter
4//! consistency across lines. Supports comma, tab, pipe, and semicolon.
5
6use reovim_driver_codec::{ContentClassifier, ContentType};
7
/// Content-type identifier reported for comma-delimited data.
pub const CSV: &str = "text/csv";

/// Content-type identifier reported for tab-delimited data.
pub const TSV: &str = "text/tsv";

/// Content-type identifier reported for pipe-delimited data.
pub const PSV: &str = "text/psv";

/// Content-type identifier reported for semicolon-delimited data
/// (the "European CSV" flavour).
pub const SCSV: &str = "text/scsv";

/// Fewest non-empty rows (header included) a sample needs before it can be
/// treated as delimited data.
const MIN_ROWS: usize = 2;

/// Fewest columns a row must have, where columns = delimiters on the line + 1.
const MIN_COLUMNS: usize = 2;

/// Upper bound, in bytes, on how much content is inspected when sniffing
/// the delimiter.
const SAMPLE_SIZE: usize = 8192;

/// Extension fast-path table: lowercase file extension paired with the
/// content type it maps to.
const CSV_EXTENSIONS: &[(&str, &str)] = &[
    ("csv", CSV),
    ("tsv", TSV),
    ("tab", TSV),
    ("psv", PSV),
];

/// Candidate delimiter bytes and the content type each one implies.
///
/// Order matters: entries are tried top to bottom and the first consistent
/// delimiter wins. The semicolon comes last; it is common in European CSV.
const DELIMITERS: &[(u8, &str)] = &[
    (b',', CSV),
    (b'\t', TSV),
    (b'|', PSV),
    (b';', SCSV),
];
39
/// CSV/TSV/PSV content classifier (priority 15).
///
/// Runs after binary detection (20) but before UTF-8 (10). CSV files
/// are always valid text, so the binary classifier won't claim them.
/// This classifier intercepts tabular data that would otherwise be
/// treated as plain text.
///
/// The type is a stateless unit struct, so it derives `Debug`, `Clone`
/// and `Copy`: it can be logged and passed around freely.
#[derive(Debug, Clone, Copy)]
pub struct CsvClassifier;

impl CsvClassifier {
    /// Create a new CSV classifier.
    ///
    /// The classifier holds no state, so this is a `const fn` and every
    /// instance is interchangeable.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}

// Kept as a manual impl (rather than a derive) so the coverage attribute
// can be attached to it.
#[cfg_attr(coverage_nightly, coverage(off))]
impl Default for CsvClassifier {
    /// Equivalent to [`CsvClassifier::new`].
    fn default() -> Self {
        Self::new()
    }
}
62
63impl ContentClassifier for CsvClassifier {
64    #[cfg_attr(coverage_nightly, coverage(off))]
65    fn classify(&self, raw: &[u8], path: &str) -> Option<ContentType> {
66        // Fast-path: known extensions
67        if let Some(ct) = extension_content_type(path) {
68            return Some(ContentType::new(ct));
69        }
70
71        // Only classify valid UTF-8 text
72        let sample = if raw.len() > SAMPLE_SIZE {
73            &raw[..SAMPLE_SIZE]
74        } else {
75            raw
76        };
77        let text = std::str::from_utf8(sample).ok()?;
78
79        // Must have enough lines
80        let lines: Vec<&str> = text.lines().collect();
81        if lines.len() < MIN_ROWS {
82            return None;
83        }
84
85        // Try each delimiter
86        for &(delim, content_type) in DELIMITERS {
87            if is_consistent_delimiter(&lines, delim) {
88                return Some(ContentType::new(content_type));
89            }
90        }
91
92        None
93    }
94
95    fn priority(&self) -> u8 {
96        15
97    }
98
99    fn name(&self) -> &'static str {
100        "csv"
101    }
102}
103
104/// Check if a delimiter produces consistent column counts across lines.
105///
106/// Returns true if every non-empty line has at least `MIN_COLUMNS`
107/// columns and the column count is identical on every line.
108fn is_consistent_delimiter(lines: &[&str], delim: u8) -> bool {
109    let delim_char = delim as char;
110    let mut expected_count = 0;
111    let mut valid_lines = 0;
112
113    for line in lines {
114        if line.is_empty() {
115            continue;
116        }
117
118        let count = line.matches(delim_char).count() + 1;
119        if count < MIN_COLUMNS {
120            return false;
121        }
122
123        if valid_lines == 0 {
124            expected_count = count;
125        } else if count != expected_count {
126            return false;
127        }
128
129        valid_lines += 1;
130    }
131
132    valid_lines >= MIN_ROWS
133}
134
135/// Look up content type from file extension.
136fn extension_content_type(path: &str) -> Option<&'static str> {
137    let ext = std::path::Path::new(path).extension()?;
138    let ext_str = ext.to_str()?;
139    let lower = ext_str.to_ascii_lowercase();
140    CSV_EXTENSIONS
141        .iter()
142        .find(|(e, _)| *e == lower.as_str())
143        .map(|(_, ct)| *ct)
144}
145
146#[cfg(test)]
147#[path = "classifier_tests.rs"]
148mod tests;