Skip to main content

big_code_analysis/output/
csv.rs

1//! CSV writer for [`FuncSpace`] trees.
2//!
3//! Emits one row per space (function, class, struct, unit, etc.),
4//! flattened depth-first from the root. Each row carries the source
5//! path, space name and kind, line range, and every leaf metric
6//! value. The header order is fixed by [`CSV_HEADER`] so downstream
7//! tools (Pandas, Excel, awk) can rely on positional access.
8//!
9//! Empty / non-finite metric values render as empty CSV cells (not
10//! `0`, not `NaN`) — `f64::NAN` and `f64::INFINITY` mean "not
11//! applicable for this space" in the underlying metric structs, and
12//! we keep that signal across the format boundary.
13//!
14//! RFC 4180 quoting (commas, double-quotes, newlines in values) is
15//! handled by the [`csv`] crate; nothing in this module hand-rolls
16//! escaping.
17//!
18//! If the source path is not valid UTF-8, the writer emits the
19//! header row only (no data rows) and warns to stderr. There is no
20//! useful fallback for the CSV `path` column, mirroring the
21//! convention established by the Checkstyle writer.
22
23use std::io::{self, Write};
24use std::path::Path;
25
26use crate::output::funcspace_row::{IDENTITY_COLUMNS, METRIC_COUNT, metric_values};
27use crate::output::numfmt::CellMetric;
28use crate::spaces::FuncSpace;
29
30// Compile-time guarantee that the metric tuple matches CSV_HEADER —
31// catches drift the moment a metric is added to one without the other.
32const _: () = assert!(IDENTITY_COLUMNS + METRIC_COUNT == CSV_HEADER.len());
33
/// Extension for CSV output files. The value includes the leading dot.
pub const CSV_EXTENSION: &str = ".csv";
36
/// Column names of the [`write_csv`] header row, in output order.
///
/// The order is part of the format: consumers may address cells by
/// position, and the tests in this module check that the emitted header
/// equals this slice. Metric columns are named with dotted JSON-style
/// paths (`loc.lloc`, `halstead.volume`) so one name identifies a
/// metric in both the JSON and the CSV output.
pub const CSV_HEADER: &[&str] = &[
    // --- identity ---
    "path",
    "space_name",
    "space_kind",
    "start_line",
    "end_line",
    // --- cognitive ---
    "cognitive.sum",
    "cognitive.average",
    "cognitive.min",
    "cognitive.max",
    // --- cyclomatic ---
    "cyclomatic.sum",
    "cyclomatic.average",
    "cyclomatic.min",
    "cyclomatic.max",
    "cyclomatic.modified.sum",
    "cyclomatic.modified.average",
    "cyclomatic.modified.min",
    "cyclomatic.modified.max",
    // --- halstead ---
    "halstead.n1",
    "halstead.N1",
    "halstead.n2",
    "halstead.N2",
    "halstead.length",
    "halstead.estimated_program_length",
    "halstead.purity_ratio",
    "halstead.vocabulary",
    "halstead.volume",
    "halstead.difficulty",
    "halstead.level",
    "halstead.effort",
    "halstead.time",
    "halstead.bugs",
    // --- loc ---
    "loc.sloc",
    "loc.ploc",
    "loc.lloc",
    "loc.cloc",
    "loc.blank",
    "loc.sloc_average",
    "loc.ploc_average",
    "loc.lloc_average",
    "loc.cloc_average",
    "loc.blank_average",
    "loc.sloc_min",
    "loc.sloc_max",
    "loc.cloc_min",
    "loc.cloc_max",
    "loc.ploc_min",
    "loc.ploc_max",
    "loc.lloc_min",
    "loc.lloc_max",
    "loc.blank_min",
    "loc.blank_max",
    // --- nom ---
    "nom.functions",
    "nom.closures",
    "nom.functions_average",
    "nom.closures_average",
    "nom.total",
    "nom.average",
    "nom.functions_min",
    "nom.functions_max",
    "nom.closures_min",
    "nom.closures_max",
    // --- nargs ---
    "nargs.total_functions",
    "nargs.total_closures",
    "nargs.average_functions",
    "nargs.average_closures",
    "nargs.total",
    "nargs.average",
    "nargs.functions_min",
    "nargs.functions_max",
    "nargs.closures_min",
    "nargs.closures_max",
    // --- nexits (serialized as "nexits" in JSON) ---
    "nexits.sum",
    "nexits.average",
    "nexits.min",
    "nexits.max",
    // --- tokens ---
    "tokens.sum",
    "tokens.average",
    "tokens.min",
    "tokens.max",
    // --- abc ---
    "abc.assignments",
    "abc.branches",
    "abc.conditions",
    "abc.magnitude",
    "abc.assignments_average",
    "abc.branches_average",
    "abc.conditions_average",
    "abc.assignments_min",
    "abc.assignments_max",
    "abc.branches_min",
    "abc.branches_max",
    "abc.conditions_min",
    "abc.conditions_max",
    // --- wmc ---
    "wmc.classes",
    "wmc.interfaces",
    "wmc.total",
    // --- npm ---
    "npm.classes",
    "npm.interfaces",
    "npm.class_methods",
    "npm.interface_methods",
    "npm.classes_average",
    "npm.interfaces_average",
    "npm.total",
    "npm.total_methods",
    "npm.average",
    // --- npa ---
    "npa.classes",
    "npa.interfaces",
    "npa.class_attributes",
    "npa.interface_attributes",
    "npa.classes_average",
    "npa.interfaces_average",
    "npa.total",
    "npa.total_attributes",
    "npa.average",
    // --- mi ---
    "mi.mi_original",
    "mi.mi_sei",
    "mi.mi_visual_studio",
];
173
174/// Write a CSV document for the metric tree rooted at `space`. The
175/// `source_path` is recorded in the `path` column of every row; if it
176/// is not valid UTF-8 the entire document is skipped (header + zero
177/// rows) and a warning is emitted to stderr — there is no useful
178/// fallback for a CSV identifier.
179///
180/// # Errors
181///
182/// Returns any [`io::Error`] surfaced by the underlying [`csv::Writer`]
183/// while emitting the header row or any of the per-`FuncSpace` data
184/// rows. The error preserves the cause from the wrapped `writer`.
185pub fn write_csv<W: Write>(space: &FuncSpace, source_path: &Path, writer: W) -> io::Result<()> {
186    let mut wtr = csv::WriterBuilder::new()
187        .has_headers(false) // we drive the header manually so it stays in lock-step with CSV_HEADER
188        .from_writer(writer);
189
190    wtr.write_record(CSV_HEADER).map_err(csv_err)?;
191
192    let Some(path_str) = source_path.to_str() else {
193        eprintln!(
194            "Warning: skipping non-UTF-8 source path in CSV output: {}",
195            source_path.display()
196        );
197        return wtr.flush();
198    };
199
200    write_space_rows(&mut wtr, path_str, space)?;
201    wtr.flush()
202}
203
204fn write_space_rows<W: Write>(
205    wtr: &mut csv::Writer<W>,
206    path_str: &str,
207    space: &FuncSpace,
208) -> io::Result<()> {
209    write_one_row(wtr, path_str, space)?;
210    for child in &space.spaces {
211        write_space_rows(wtr, path_str, child)?;
212    }
213    Ok(())
214}
215
216fn write_one_row<W: Write>(
217    wtr: &mut csv::Writer<W>,
218    path_str: &str,
219    space: &FuncSpace,
220) -> io::Result<()> {
221    let metrics = metric_values(space);
222
223    let mut row: Vec<String> = Vec::with_capacity(CSV_HEADER.len());
224    row.push(path_str.to_owned());
225    row.push(space.name.as_deref().unwrap_or("").to_owned());
226    row.push(space.kind.to_string());
227    row.push(space.start_line.to_string());
228    row.push(space.end_line.to_string());
229
230    for v in metrics {
231        row.push(CellMetric(v).to_string());
232    }
233
234    wtr.write_record(&row).map_err(csv_err)
235}
236
237fn csv_err(e: csv::Error) -> io::Error {
238    // csv::Error wraps an io::Error for I/O failures; propagate
239    // unchanged so callers see the original errno. Other variants
240    // collapse into InvalidData since they are protocol-level
241    // problems, not I/O. csv::Error has no public From<ErrorKind>
242    // constructor, so format the kind via Debug to retain diagnostic
243    // detail.
244    match e.into_kind() {
245        csv::ErrorKind::Io(io_err) => io_err,
246        other => io::Error::new(io::ErrorKind::InvalidData, format!("{other:?}")),
247    }
248}
249
#[cfg(test)]
#[allow(
    clippy::float_cmp,
    clippy::cast_precision_loss,
    clippy::cast_possible_truncation,
    clippy::cast_sign_loss,
    clippy::similar_names,
    clippy::doc_markdown,
    clippy::needless_raw_string_hashes,
    clippy::too_many_lines
)]
mod tests {
    use super::*;
    use crate::spaces::{CodeMetrics, SpaceKind};

    /// Build a named, childless space with default metrics.
    fn empty_space(name: &str, kind: SpaceKind, start: usize, end: usize) -> FuncSpace {
        FuncSpace {
            name: Some(name.into()),
            start_line: start,
            end_line: end,
            kind,
            spaces: Vec::new(),
            metrics: CodeMetrics::default(),
            suppressed: crate::SuppressionScope::default(),
        }
    }

    /// Run `write_csv` into an in-memory buffer and return the text.
    fn render(space: &FuncSpace, path: &Path) -> String {
        let mut buf = Vec::new();
        write_csv(space, path, &mut buf).expect("writing to Vec is infallible");
        String::from_utf8(buf).expect("output is UTF-8")
    }

    #[test]
    fn header_constant_matches_first_row() {
        let space = empty_space("root", SpaceKind::Unit, 1, 1);
        let out = render(&space, Path::new("a.rs"));
        let first = out.lines().next().expect("at least the header row");
        let expected: Vec<&str> = CSV_HEADER.to_vec();
        let got: Vec<&str> = first.split(',').collect();
        assert_eq!(got, expected);
    }

    #[test]
    fn empty_metric_values_render_as_empty_cells() {
        // A bare unit space has NaN for every average/min/max — those
        // must come out as empty cells, never `NaN`, never `0`.
        let space = empty_space("root", SpaceKind::Unit, 1, 1);
        let out = render(&space, Path::new("a.rs"));
        assert!(
            !out.contains("NaN"),
            "NaN must not leak into CSV output:\n{out}"
        );
        assert!(
            !out.contains("inf"),
            "infinity must not leak into CSV output:\n{out}"
        );
        // Two adjacent commas indicate an empty field — there must be
        // at least one such pair given the empty space's NaN columns.
        assert!(out.contains(",,"), "expected empty cells in:\n{out}");
    }

    #[test]
    fn nested_spaces_flatten_depth_first() {
        let mut root = empty_space("root", SpaceKind::Unit, 1, 100);
        let mut outer = empty_space("outer", SpaceKind::Function, 10, 50);
        let inner = empty_space("inner", SpaceKind::Function, 20, 30);
        outer.spaces.push(inner);
        let sibling = empty_space("sibling", SpaceKind::Function, 60, 80);
        root.spaces.push(outer);
        root.spaces.push(sibling);

        let out = render(&root, Path::new("a.rs"));
        let names: Vec<&str> = out
            .lines()
            .skip(1) // header
            .map(|line| line.split(',').nth(1).unwrap_or(""))
            .collect();
        assert_eq!(names, vec!["root", "outer", "inner", "sibling"]);
    }

    #[test]
    fn rfc_4180_quoting_handled_by_csv_crate() {
        // Names with commas, double-quotes and newlines must round-trip
        // through the csv crate's quoting; we never hand-roll escapes.
        let space = empty_space("a,b\"c\nd", SpaceKind::Function, 1, 1);
        let out = render(&space, Path::new("p.rs"));
        // The `csv` crate doubles embedded `"` and wraps the field in `"`s.
        // The expectation is spelled with escapes (not a multi-line raw
        // string) so it does not depend on this file's line endings.
        assert!(
            out.contains("\"a,b\"\"c\nd\""),
            "expected RFC 4180 quoting in:\n{out}"
        );
    }

    // Constructing a non-UTF-8 path needs the Unix `OsStrExt` API, so the
    // whole test is gated: on other targets it is absent rather than
    // passing without asserting anything.
    #[test]
    #[cfg(unix)]
    fn non_utf8_path_skips_data_rows() {
        use std::ffi::OsStr;
        use std::os::unix::ffi::OsStrExt;
        use std::path::PathBuf;

        let bad = PathBuf::from(OsStr::from_bytes(b"\xff\xfe.rs"));
        let space = empty_space("root", SpaceKind::Unit, 1, 1);
        let out = render(&space, &bad);
        assert_eq!(
            out.lines().count(),
            1,
            "header should be the only line, got:\n{out}"
        );
    }

    #[test]
    fn integral_values_have_no_trailing_dot_zero() {
        // Match the JSON serializer convention: integer-valued f64s
        // render as `42`, not `42.0`.
        let mut space = empty_space("root", SpaceKind::Unit, 1, 1);
        // Force a known LOC value via the public API. With `unit=true`
        // sloc = end - start, so (0, 42) yields 42.
        space.metrics.loc.init_unit_span(0, 42);
        let out = render(&space, Path::new("a.rs"));
        let row = out.lines().nth(1).expect("data row");
        let cells: Vec<&str> = row.split(',').collect();
        // Find the sloc column by header position.
        let sloc_idx = CSV_HEADER
            .iter()
            .position(|h| *h == "loc.sloc")
            .expect("loc.sloc in header");
        assert_eq!(cells[sloc_idx], "42", "row was: {row}");
    }
}