Skip to main content

reovim_module_codec_csv/
codec.rs

//! CSV/TSV/PSV codec.
//!
//! Decodes delimiter-separated value files into column-aligned tabular
//! text with annotations. This is a BIDIRECTIONAL codec — edited text
//! can be re-encoded back to the original delimiter format.
6
7use std::fmt::Write;
8
9use {
10    reovim_driver_annotation::{Annotation, AnnotationKind, AnnotationPayload, AnnotationTarget},
11    reovim_driver_codec::{CodecError, CodecMetadata, ContentType, DecodeResult},
12};
13
/// Annotation kind for the header row.
///
/// Attached to line 0 of the decoded table when the first row is detected
/// as a header.
pub const CSV_HEADER_KIND: &str = "content.csv.header";
16
/// Annotation kind describing the columns of a table line.
///
/// One annotation of this kind is attached to every decoded line; its
/// payload is the number of fields in that row.
pub const CSV_COLUMN_KIND: &str = "content.csv.column";
19
/// CSV/TSV/PSV codec.
///
/// Decodes raw bytes into a column-aligned table view. On encode,
/// reconstructs the original delimiter format from metadata.
///
/// The codec is a small plain-data value (one byte plus a static string),
/// so it is `Copy` and can be printed with `{:?}` for diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct CsvCodec {
    /// The delimiter byte separating fields (e.g. `b','`, `b'\t'`, `b'|'`).
    delimiter: u8,
    /// The content type string reported in the decode metadata.
    content_type: &'static str,
}

impl CsvCodec {
    /// Create a new codec for the given delimiter byte and content type.
    #[must_use]
    pub const fn new(delimiter: u8, content_type: &'static str) -> Self {
        Self {
            delimiter,
            content_type,
        }
    }

    /// Get the delimiter byte used by this codec.
    #[must_use]
    pub const fn delimiter(&self) -> u8 {
        self.delimiter
    }
}
47
48impl reovim_driver_codec::ContentCodec for CsvCodec {
49    #[cfg_attr(coverage_nightly, coverage(off))]
50    fn decode(&self, raw: &[u8]) -> Result<DecodeResult, CodecError> {
51        let text = std::str::from_utf8(raw)
52            .map_err(|e| CodecError::Other(format!("CSV decode: invalid UTF-8: {e}")))?;
53
54        let (rows, has_header) = parse_csv(text, self.delimiter);
55        let (content, annotations) = format_table(&rows, has_header);
56
57        let mut metadata = CodecMetadata::new(ContentType::new(self.content_type));
58        metadata.set("delimiter", String::from(self.delimiter as char));
59        metadata.set("has_header", has_header.to_string());
60
61        // Detect line ending style
62        let line_ending = if text.contains("\r\n") { "crlf" } else { "lf" };
63        metadata.set("line_ending", line_ending);
64
65        Ok(DecodeResult {
66            content,
67            annotations,
68            metadata,
69            lossy: false,
70            readonly: false,
71        })
72    }
73
74    #[cfg_attr(coverage_nightly, coverage(off))]
75    fn encode(
76        &self,
77        content: &str,
78        metadata: &CodecMetadata,
79    ) -> Option<Result<Vec<u8>, CodecError>> {
80        let delimiter = metadata
81            .get("delimiter")
82            .and_then(|s| s.chars().next())
83            .unwrap_or(self.delimiter as char);
84
85        let line_ending = match metadata.get("line_ending") {
86            Some("crlf") => "\r\n",
87            _ => "\n",
88        };
89
90        Some(Ok(encode_csv(content, delimiter, line_ending)))
91    }
92}
93
94/// Parse CSV text into rows of fields.
95///
96/// Returns the parsed rows and whether the first row looks like a header
97/// (contains non-numeric values while other rows contain numbers).
98fn parse_csv(text: &str, delimiter: u8) -> (Vec<Vec<String>>, bool) {
99    let mut reader = csv::ReaderBuilder::new()
100        .delimiter(delimiter)
101        .has_headers(false)
102        .flexible(true)
103        .from_reader(text.as_bytes());
104
105    let rows: Vec<Vec<String>> = reader
106        .records()
107        .filter_map(Result::ok)
108        .map(|record| record.iter().map(String::from).collect())
109        .collect();
110
111    let has_header = detect_header(&rows);
112    (rows, has_header)
113}
114
/// Guess whether the first row is a header.
///
/// A field counts as numeric when it parses as an `f64`. The first row is
/// considered a header when it has strictly fewer numeric fields than the
/// second row. Inputs with fewer than two rows never have a header.
#[cfg_attr(coverage_nightly, coverage(off))]
fn detect_header(rows: &[Vec<String>]) -> bool {
    /// Number of fields in `row` that parse as a floating-point number.
    fn numeric_fields(row: &[String]) -> usize {
        row.iter()
            .filter(|cell| cell.parse::<f64>().is_ok())
            .count()
    }

    match rows {
        [first, second, ..] => numeric_fields(first) < numeric_fields(second),
        _ => false,
    }
}
140
141/// Format parsed rows into a column-aligned table.
142#[cfg_attr(coverage_nightly, coverage(off))]
143fn format_table(rows: &[Vec<String>], has_header: bool) -> (String, Vec<Annotation>) {
144    if rows.is_empty() {
145        return (String::new(), Vec::new());
146    }
147
148    // Calculate column widths
149    let col_count = rows.iter().map(Vec::len).max().unwrap_or(0);
150    let mut widths = vec![0_usize; col_count];
151
152    for row in rows {
153        for (i, field) in row.iter().enumerate() {
154            if i < col_count {
155                widths[i] = widths[i].max(field.len());
156            }
157        }
158    }
159
160    // Minimum column width of 3
161    for w in &mut widths {
162        *w = (*w).max(3);
163    }
164
165    let mut output = String::with_capacity(rows.len() * col_count * 10);
166    let mut annotations = Vec::new();
167
168    let header_kind = AnnotationKind::new(CSV_HEADER_KIND);
169    let column_kind = AnnotationKind::new(CSV_COLUMN_KIND);
170
171    for (line_idx, row) in rows.iter().enumerate() {
172        // Format each field with padding
173        for (col_idx, field) in row.iter().enumerate() {
174            if col_idx > 0 {
175                output.push_str("  "); // Column separator
176            }
177            let width = widths.get(col_idx).copied().unwrap_or(3);
178            let _ = write!(output, "{field:<width$}");
179        }
180        output.push('\n');
181
182        // Annotate header row
183        if line_idx == 0 && has_header {
184            annotations.push(Annotation {
185                kind: header_kind.clone(),
186                target: AnnotationTarget::Line(line_idx),
187                priority: 0,
188                payload: AnnotationPayload::None,
189            });
190        }
191
192        // Annotate each column
193        annotations.push(Annotation {
194            kind: column_kind.clone(),
195            target: AnnotationTarget::Line(line_idx),
196            priority: 0,
197            payload: AnnotationPayload::Number(row.len()),
198        });
199    }
200
201    (output, annotations)
202}
203
204/// Encode column-aligned text back to delimited format.
205#[cfg_attr(coverage_nightly, coverage(off))]
206fn encode_csv(content: &str, delimiter: char, line_ending: &str) -> Vec<u8> {
207    let mut result = Vec::with_capacity(content.len());
208
209    for line in content.lines() {
210        // Split on multiple spaces (column separator in aligned view)
211        let fields: Vec<&str> = split_aligned_fields(line);
212
213        for (i, field) in fields.iter().enumerate() {
214            if i > 0 {
215                result.push(delimiter as u8);
216            }
217            let trimmed = field.trim();
218
219            // Quote if field contains delimiter, quote, or newline (RFC 4180 ยง2.6)
220            if trimmed.contains(delimiter)
221                || trimmed.contains('"')
222                || trimmed.contains('\n')
223                || trimmed.contains('\r')
224            {
225                result.push(b'"');
226                for ch in trimmed.bytes() {
227                    if ch == b'"' {
228                        result.push(b'"');
229                    }
230                    result.push(ch);
231                }
232                result.push(b'"');
233            } else {
234                result.extend_from_slice(trimmed.as_bytes());
235            }
236        }
237
238        result.extend_from_slice(line_ending.as_bytes());
239    }
240
241    result
242}
243
/// Split an aligned table line back into fields.
///
/// A run of two or more spaces is a field separator; a single space stays
/// part of the field it appears in. Each field has trailing whitespace
/// removed (leading whitespace is left untouched). A line that ends in a
/// separator run yields a final empty field, and an empty line yields no
/// fields at all.
#[cfg_attr(coverage_nightly, coverage(off))]
fn split_aligned_fields(line: &str) -> Vec<&str> {
    let mut cells = Vec::new();
    if line.is_empty() {
        return cells;
    }

    let mut rest = line;
    // `find("  ")` locates the start of the next separator run.
    while let Some(gap) = rest.find("  ") {
        cells.push(rest[..gap].trim_end());

        // Consume the whole run, however long it is.
        rest = rest[gap..].trim_start_matches(' ');
        if rest.is_empty() {
            // Trailing separator — add empty field
            cells.push("");
            return cells;
        }
    }

    // Whatever follows the last separator is the final field.
    cells.push(rest.trim_end());
    cells
}
285
// Unit tests live in the sibling file `codec_tests.rs` and are only
// compiled for test builds.
#[cfg(test)]
#[path = "codec_tests.rs"]
mod tests;