feco3/writers/
csv.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
use super::base::{
    FileRecordWriterFactory, MultiFileRecordWriterFactory, MultiRecordWriter, RecordWriter,
};
use crate::{
    record::{Record, RecordSchema},
    schemas::{CoercingLineParser, LineParser},
    Error, FecFile,
};
use std::{fs::File, path::PathBuf};

/// A [RecordWriter] that serializes records as rows of a single CSV stream.
///
/// The header row is taken from `schema` and is emitted lazily, right
/// before the first record is written.
struct CSVFormWriter<W>
where
    W: std::io::Write,
{
    /// CSV encoder wrapping the raw output sink.
    csv_writer: csv::Writer<W>,
    /// Schema whose field names make up the header row.
    schema: RecordSchema,
    /// Set once the header row has been attempted, so it is emitted at most once.
    has_written_header: bool,
}

impl<W: std::io::Write> CSVFormWriter<W> {
    /// Build a writer that emits CSV rows for `schema` into `raw_writer`.
    ///
    /// The schema is cloned so the writer owns its own copy.
    fn new(raw_writer: W, schema: &RecordSchema) -> Self {
        let mut builder = csv::WriterBuilder::new();
        // Automatic header handling is switched off because the header row is
        // produced by `maybe_write_header`; flexible mode is enabled so rows
        // are not rejected for differing field counts.
        builder.has_headers(false).flexible(true);
        Self {
            csv_writer: builder.from_writer(raw_writer),
            schema: schema.clone(),
            has_written_header: false,
        }
    }

    /// Emit the header row (the schema's field names) on the first call only.
    ///
    /// The flag is flipped before the write happens, so a header write that
    /// fails is not attempted again on later calls.
    fn maybe_write_header(&mut self) -> std::io::Result<()> {
        if !self.has_written_header {
            self.has_written_header = true;
            let column_names = self.schema.fields.iter().map(|field| field.name.as_str());
            self.csv_writer.write_record(column_names)?;
        }
        Ok(())
    }
}

impl<W: std::io::Write + Send> RecordWriter for CSVFormWriter<W> {
    /// Append `record` as one CSV row, emitting the header row first if it
    /// has not been written yet.
    ///
    /// Every value is rendered through its `to_string()`.
    fn write_record(&mut self, record: &Record) -> std::io::Result<()> {
        self.maybe_write_header()?;
        // TODO: Check the length of values vs the schema
        let cells = record.values.iter().map(|value| value.to_string());
        self.csv_writer
            .write_record(cells)
            .map_err(std::io::Error::from)
    }
}

/// Stateless factory that creates file-backed [CSVFormWriter]s.
///
/// It carries no data; all behavior lives in its
/// [FileRecordWriterFactory] implementation.
struct CSVFileWriterFactory;

impl FileRecordWriterFactory for CSVFileWriterFactory {
    type Writer = CSVFormWriter<File>;

    /// Name of the output file for `form_name`: the form name followed by ".csv".
    fn file_name(&self, form_name: String) -> String {
        form_name + ".csv"
    }

    /// Create the file at `path` (truncating it if it already exists) and wrap
    /// it in a CSV writer for `schema`.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if the file cannot be created.
    fn make(&mut self, path: &PathBuf, schema: &RecordSchema) -> std::io::Result<Self::Writer> {
        File::create(path).map(|file| CSVFormWriter::new(file, schema))
    }
}

/// Writes forms to a directory of CSV files.
///
/// Each form type gets its own file. If the form type contains a "/"
/// (which would result in a subdirectory), it is replaced with a "-".
/// For example, "SC/10" would be written to "SC-10.csv".
///
/// NOTE(review): the "/" -> "-" replacement is not performed in this file;
/// presumably it happens in `MultiFileRecordWriterFactory` — confirm.
pub struct CSVProcessor {
    /// Routes each record to a writer; the writers are created through
    /// `CSVFileWriterFactory`.
    multi_writer: MultiRecordWriter<MultiFileRecordWriterFactory<CSVFileWriterFactory>>,
}

impl CSVProcessor {
    /// Build a processor that writes its CSV output to `out_dir`.
    pub fn new(out_dir: PathBuf) -> Self {
        let file_factory = MultiFileRecordWriterFactory::new(out_dir, CSVFileWriterFactory);
        Self {
            multi_writer: MultiRecordWriter::new(file_factory),
        }
    }

    // TODO: factor this out with ParquetProcessor.process()
    /// Parse every line of `fec` and write the resulting records as CSV.
    ///
    /// The FEC version is read from the file header once and used to parse
    /// each line.
    ///
    /// # Errors
    ///
    /// Returns the first error raised while reading the header, reading or
    /// parsing a line, writing a record, or finishing the writers. When an
    /// error occurs mid-file, processing stops and `finish` is not called.
    pub fn process(&mut self, fec: &mut FecFile) -> Result<(), Error> {
        let version = fec.get_header()?.fec_version.clone();
        let mut line_parser = CoercingLineParser;
        for raw_line in fec.lines() {
            let current = raw_line?;
            let parsed = line_parser.parse_line(&version, &mut current.iter())?;
            self.multi_writer.write_record(&parsed)?;
        }
        self.multi_writer.finish()?;
        Ok(())
    }
}