provenant-cli 0.0.9

Provenant is a high-performance Rust scanner for licenses, packages, and source provenance.
Documentation
use std::collections::BTreeMap;
use std::io::{self, Write};

use serde_json::{Map, Value};

use crate::models::{FileType, Output};

use super::shared::{io_other, sorted_files};

type CsvRow = BTreeMap<String, String>;
type CsvRows = Vec<CsvRow>;
type CsvHeadersByGroup = BTreeMap<String, Vec<String>>;

/// Writes `output` to `writer` as CSV.
///
/// The header row starts with `kind` and `path`, followed by the columns
/// collected by `flatten_rows` for each row group, taken in the fixed order
/// `info`, `license`, `copyright`, `email`, `url`, `package`. A column name
/// used by more than one group (for example `start_line` and `end_line`) is
/// emitted once, at the position of its first occurrence, so the header never
/// contains repeated column names. Every row is written with an empty cell
/// for each column it has no value for.
///
/// # Errors
///
/// Returns an error if a record cannot be written (mapped through
/// `io_other`) or if flushing the underlying writer fails.
pub(crate) fn write_csv(output: &Output, writer: &mut dyn Write) -> io::Result<()> {
    let (rows, headers_by_group) = flatten_rows(output);

    let mut headers = vec!["kind".to_string(), "path".to_string()];
    for group in ["info", "license", "copyright", "email", "url", "package"] {
        if let Some(group_headers) = headers_by_group.get(group) {
            for header in group_headers {
                // Groups only de-duplicate their own columns, so the same name
                // can show up in several groups; keep the first occurrence.
                if !headers.contains(header) {
                    headers.push(header.clone());
                }
            }
        }
    }

    let mut csv_writer = csv::Writer::from_writer(writer);
    csv_writer.write_record(&headers).map_err(io_other)?;

    for row in &rows {
        // Borrow each cell; a column missing from this row becomes an empty cell.
        let record = headers
            .iter()
            .map(|header| row.get(header).map_or("", String::as_str));
        csv_writer.write_record(record).map_err(io_other)?;
    }

    // `flush` already yields an `io::Result`; returning it directly avoids
    // re-wrapping the error and so keeps its original `ErrorKind`.
    csv_writer.flush()
}

fn flatten_rows(output: &Output) -> (CsvRows, CsvHeadersByGroup) {
    let mut rows = Vec::new();
    let mut headers_by_group: CsvHeadersByGroup = BTreeMap::new();

    for file in sorted_files(&output.files) {
        let normalized_path = normalize_csv_path(&file.path, file.file_type == FileType::Directory);

        let mut info = BTreeMap::new();
        info.insert("kind".to_string(), "info".to_string());
        info.insert("path".to_string(), normalized_path.clone());
        info.insert("name".to_string(), file.name.clone());
        info.insert(
            "type".to_string(),
            match file.file_type {
                FileType::File => "file",
                FileType::Directory => "directory",
            }
            .to_string(),
        );
        info.insert("size".to_string(), file.size.to_string());
        if let Some(mime_type) = &file.mime_type {
            info.insert("mime_type".to_string(), mime_type.clone());
        }
        if let Some(sha1) = &file.sha1 {
            info.insert("sha1".to_string(), sha1.clone());
        }
        info.insert("scan_errors".to_string(), file.scan_errors.join("\n"));
        push_csv_row("info", info, &mut rows, &mut headers_by_group);

        for detection in &file.license_detections {
            for m in &detection.matches {
                let mut lic = BTreeMap::new();
                lic.insert("kind".to_string(), "license".to_string());
                lic.insert("path".to_string(), normalized_path.clone());
                lic.insert(
                    "license_expression".to_string(),
                    detection.license_expression.clone(),
                );
                lic.insert("start_line".to_string(), m.start_line.to_string());
                lic.insert("end_line".to_string(), m.end_line.to_string());
                lic.insert(
                    "license_match__license_expression".to_string(),
                    m.license_expression.clone(),
                );
                lic.insert(
                    "license_match__license_expression_spdx".to_string(),
                    m.license_expression_spdx.clone(),
                );
                lic.insert(
                    "license_match__score".to_string(),
                    format!("{:.2}", m.score),
                );
                if let Some(rule_identifier) = &m.rule_identifier {
                    lic.insert(
                        "license_match__rule_identifier".to_string(),
                        rule_identifier.clone(),
                    );
                }
                push_csv_row("license", lic, &mut rows, &mut headers_by_group);
            }
        }

        for c in &file.copyrights {
            let mut row = BTreeMap::new();
            row.insert("kind".to_string(), "copyright".to_string());
            row.insert("path".to_string(), normalized_path.clone());
            row.insert("copyright".to_string(), c.copyright.clone());
            row.insert("start_line".to_string(), c.start_line.to_string());
            row.insert("end_line".to_string(), c.end_line.to_string());
            push_csv_row("copyright", row, &mut rows, &mut headers_by_group);
        }

        for h in &file.holders {
            let mut row = BTreeMap::new();
            row.insert("kind".to_string(), "holder".to_string());
            row.insert("path".to_string(), normalized_path.clone());
            row.insert("holder".to_string(), h.holder.clone());
            row.insert("start_line".to_string(), h.start_line.to_string());
            row.insert("end_line".to_string(), h.end_line.to_string());
            push_csv_row("copyright", row, &mut rows, &mut headers_by_group);
        }

        for a in &file.authors {
            let mut row = BTreeMap::new();
            row.insert("kind".to_string(), "author".to_string());
            row.insert("path".to_string(), normalized_path.clone());
            row.insert("author".to_string(), a.author.clone());
            row.insert("start_line".to_string(), a.start_line.to_string());
            row.insert("end_line".to_string(), a.end_line.to_string());
            push_csv_row("copyright", row, &mut rows, &mut headers_by_group);
        }

        for e in &file.emails {
            let mut row = BTreeMap::new();
            row.insert("kind".to_string(), "email".to_string());
            row.insert("path".to_string(), normalized_path.clone());
            row.insert("email".to_string(), e.email.clone());
            row.insert("start_line".to_string(), e.start_line.to_string());
            row.insert("end_line".to_string(), e.end_line.to_string());
            push_csv_row("email", row, &mut rows, &mut headers_by_group);
        }

        for u in &file.urls {
            let mut row = BTreeMap::new();
            row.insert("kind".to_string(), "url".to_string());
            row.insert("path".to_string(), normalized_path.clone());
            row.insert("url".to_string(), u.url.clone());
            row.insert("start_line".to_string(), u.start_line.to_string());
            row.insert("end_line".to_string(), u.end_line.to_string());
            push_csv_row("url", row, &mut rows, &mut headers_by_group);
        }

        for package in &file.package_data {
            let mut row = BTreeMap::new();
            row.insert("kind".to_string(), "package_data".to_string());
            row.insert("path".to_string(), normalized_path.clone());

            if let Ok(Value::Object(map)) = serde_json::to_value(package) {
                flatten_json_object_to_row(&map, "package__", &mut row);
            }
            push_csv_row("package", row, &mut rows, &mut headers_by_group);
        }
    }

    (rows, headers_by_group)
}

/// Appends `row` to `rows` and registers its column names under `group`.
///
/// The `kind` and `path` columns are never registered. Any other column name
/// is appended to the group's header list the first time it is seen, in the
/// iteration order of the row's keys. The group entry is created even when
/// the row contributes no column.
fn push_csv_row(
    group: &str,
    row: CsvRow,
    rows: &mut CsvRows,
    headers_by_group: &mut CsvHeadersByGroup,
) {
    let known = headers_by_group.entry(group.to_string()).or_default();

    let candidates = row
        .keys()
        .filter(|name| !matches!(name.as_str(), "kind" | "path"));
    for name in candidates {
        let already_known = known.iter().any(|existing| existing == name);
        if !already_known {
            known.push(name.clone());
        }
    }

    rows.push(row);
}

/// Returns `path` in the form used for the CSV `path` column.
///
/// All leading `/` characters are removed. When `is_directory` is true the
/// result is guaranteed to end with exactly the trailing `/` it already had,
/// or with one appended if it had none.
fn normalize_csv_path(path: &str, is_directory: bool) -> String {
    let relative = path.trim_start_matches('/');
    let needs_trailing_slash = is_directory && !relative.ends_with('/');
    if needs_trailing_slash {
        format!("{}/", relative)
    } else {
        relative.to_owned()
    }
}

/// Copies every entry of the JSON object `map` into `row`, using
/// `prefix` + key as the column name.
///
/// A string value stored under the key `version` is written as `v <version>`
/// unless it is empty or already starts with `v` / `V`, in which case it is
/// copied unchanged. Every other value is rendered with `value_to_string`.
fn flatten_json_object_to_row(map: &Map<String, Value>, prefix: &str, row: &mut CsvRow) {
    for (key, value) in map {
        let cell = match value {
            Value::String(version) if key == "version" => {
                if version.is_empty() || version.starts_with(['v', 'V']) {
                    version.clone()
                } else {
                    format!("v {}", version)
                }
            }
            other => value_to_string(other),
        };
        row.insert(format!("{}{}", prefix, key), cell);
    }
}

/// Renders a JSON value as the text of a single CSV cell.
///
/// `null` becomes the empty string, booleans and numbers use their textual
/// form, and strings are copied as-is. Array elements are rendered
/// recursively and joined with newlines. Objects are serialized as JSON text,
/// falling back to the empty string if serialization fails.
fn value_to_string(value: &Value) -> String {
    match value {
        Value::Null => String::new(),
        Value::String(text) => text.clone(),
        Value::Bool(flag) => flag.to_string(),
        Value::Number(number) => number.to_string(),
        Value::Array(items) => {
            let mut joined = String::new();
            for (index, item) in items.iter().enumerate() {
                if index > 0 {
                    joined.push('\n');
                }
                joined.push_str(&value_to_string(item));
            }
            joined
        }
        Value::Object(_) => serde_json::to_string(value).unwrap_or_default(),
    }
}