csv_tools/
lib.rs

1//! # CSV Tools
2//!
3//! This crate is a collection of utilities to make reading, manipulating and creating CSV files
4//! easier. You can open a CSV file, read the text in it and find text, define a delimiter, change
5//! it and save the file with the new delimiter, merge CSV files, map the rows with a custom data
6//! structure, etc.
7//!
8//! Note that quotes are supported with this crate. Meaning that if a cell is surrounded by double
9//! quotes, it will count as one unique value instead of multiple values if it were to contain the
10//! delimiter.
11//!
12//! For example, assuming the delimiter is a comma:
13//!
14//! ```csv
15//! name,pseudo,age
16//! Thomas,"The Svelter",20
17//! Yoshiip,"The best, and only, Godoter",99
18//! ```
19//!
//! The second row contains 3 values. However, without the quotes it would have been parsed as 5
//! different values: `"Yoshiip"`, `"The best"`, `" and only"`, `" Godoter"` and `"99"`, since the
//! quoted text contains the delimiter.
23//!
//! By default, a row without double quotes will be parsed using a simple built-in method
//! (`split`), which is slightly more performant since fewer computations are needed than when
//! the end of each quoted string has to be located.
27//!
28//! Escape characters are allowed, meaning that a string can contain `\"`.
29
30use std::collections::{HashMap, HashSet};
31use std::fmt;
32use std::fs::File;
33use std::io::Error;
34use std::io::ErrorKind;
35use std::io::Write;
36use std::io::{BufRead, BufReader};
37
/// A simple data structure for holding the raw string data of a CSV file.
///
/// The header and the data rows are stored separately, and every cell is kept as a `String`.
pub struct CSVFile {
    /// Character separating two fields, used both when reading and when formatting the file.
    pub delimiter: char,
    /// Names of the columns, i.e. the fields of the header line.
    pub columns: Vec<String>,
    /// Data rows (the header is not included); each inner vector holds the fields of one row.
    pub rows: Vec<Vec<String>>,
}
44
/// A simple data structure for identifying the position of a cell within a CSV file.
///
/// Both indices are zero-based and address the data rows only (the header is not a row).
#[derive(PartialEq)]
pub struct CSVCoords {
    /// Index of the row in `CSVFile::rows`.
    pub row: usize,
    /// Index of the field within that row.
    pub column: usize,
}
51
52impl fmt::Display for CSVCoords {
53    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
54        write!(f, "({}, {})", self.row, self.column)
55    }
56}
57
58impl fmt::Debug for CSVCoords {
59    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
60        write!(
61            f,
62            "CSVCoords {{ row: {}, column: {} }}",
63            self.row, self.column
64        )
65    }
66}
67
68impl fmt::Display for CSVFile {
69    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
70        let mut result = String::new();
71        for column in &self.columns {
72            result.push_str(column);
73            result.push(self.delimiter);
74        }
75        result.pop(); // removes the trailing delimiter
76        result.push('\n');
77
78        for row in &self.rows {
79            for field in row {
80                result.push_str(field);
81                result.push(self.delimiter);
82            }
83            result.pop();
84            result.push('\n');
85        }
86
87        write!(f, "{}", result)
88    }
89}
90
91impl fmt::Debug for CSVFile {
92    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
93        write!(
94            f,
95            "CSVFile {{ delimiter: {}, columns: {:?}, rows: {:?} }}",
96            self.delimiter, self.columns, self.rows
97        )
98    }
99}
100
101impl CSVFile {
102    /// Creates a new CSVFile from a file name and an optional delimiter (a comma by default).
103    /// It reads the first line of the file to get the columns and the rest of the file to get the data.
104    /// It may return an error if the file doesn't exist or if it can't be read properly.
105    pub fn new(file_name: &String, delimiter: &char) -> Result<Self, Error> {
106        let file = File::open(&file_name)?;
107        let mut lines = BufReader::new(&file).lines();
108        let first_line = lines.next().unwrap()?;
109        let columns = read_columns(&first_line, delimiter)?;
110        let rows = read_rows(&mut lines, delimiter, columns.len())?;
111
112        Ok(Self {
113            delimiter: *delimiter,
114            columns,
115            rows,
116        })
117    }
118
119    /// Creates a new CSVFile from the columns and the rows.
120    ///
121    /// # Example
122    ///
123    /// ```
124    /// # use csv_tools::CSVFile;
125    ///
126    /// let columns = vec!["a".to_string(), "b".to_string(), "c".to_string()];
127    /// let rows = vec![
128    ///    vec!["1".to_string(), "2".to_string(), "3".to_string()],
129    ///    vec!["4".to_string(), "5".to_string(), "6".to_string()],
130    ///    vec!["7".to_string(), "8".to_string(), "9".to_string()],
131    /// ];
132    ///
133    /// let file = CSVFile::build(&columns, &rows, &',').unwrap();
134    /// assert_eq!(file.columns, columns);
135    /// assert_eq!(file.rows, rows);
136    /// ```
137    pub fn build(
138        columns: &Vec<String>,
139        rows: &Vec<Vec<String>>,
140        delimiter: &char,
141    ) -> Result<Self, Error> {
142        for (index, row) in rows.iter().enumerate() {
143            if columns.len() != row.len() {
144                return Err(Error::new(
145          ErrorKind::InvalidData,
146          format!("Invalid number of fields for row of index {}, {} were given, but expected {}", index, row.len(), columns.len()))
147        );
148            }
149        }
150
151        Ok(Self {
152            delimiter: *delimiter,
153            columns: columns.clone(),
154            rows: rows.clone(),
155        })
156    }
157
158    /// Maps the rows of the CSV file to a type `T` using a callback function `F` called on each row.
159    ///
160    /// # Example
161    ///
162    /// ```
163    /// # use csv_tools::CSVFile;
164    ///
165    /// #[derive(Debug, PartialEq)]
166    /// struct MyData {
167    ///   a: u32,
168    ///   b: u32,
169    ///   c: u32
170    /// }
171    ///
172    /// let columns = vec!["a".to_string(), "b".to_string(), "c".to_string()];
173    /// let rows = vec![
174    ///    vec!["1".to_string(), "2".to_string(), "3".to_string()],
175    ///    vec!["4".to_string(), "5".to_string(), "6".to_string()],
176    ///    vec!["7".to_string(), "8".to_string(), "9".to_string()],
177    /// ];
178    ///
179    /// let csv_file = CSVFile::build(&columns, &rows, &',').unwrap();
180    /// let result = csv_file.map_rows(|row: &Vec<String>| {
181    ///   MyData {
182    ///     a: row[0].parse().unwrap(),
183    ///     b: row[1].parse().unwrap(),
184    ///     c: row[2].parse().unwrap()
185    ///   }
186    /// });
187    ///
188    /// assert_eq!(result.len(), 3);
189    /// assert_eq!(result[0], MyData { a: 1, b: 2, c: 3 });
190    /// assert_eq!(result[1], MyData { a: 4, b: 5, c: 6 });
191    /// assert_eq!(result[2], MyData { a: 7, b: 8, c: 9 });
192    /// ```
193    pub fn map_rows<F, T>(&self, f: F) -> Vec<T>
194    where
195        F: Fn(&Vec<String>) -> T,
196    {
197        self.rows.iter().map(f).collect()
198    }
199
200    /// Maps the columns of the CSV file to a HashMap.
201    fn map_columns<T>(&self) -> HashMap<String, Vec<T>> {
202        let mut map = HashMap::new();
203        for column in &self.columns {
204            map.insert(column.clone(), Vec::new());
205        }
206
207        map
208    }
209
210    /// Maps the columns of the CSV file. The keys are column names and the associated value is a vector of type `T`.
211    ///
212    /// # Example
213    ///
214    /// ```
215    /// # use csv_tools::CSVFile;
216    ///
217    /// let columns = vec!["a".to_string(), "b".to_string(), "c".to_string()];
218    /// let rows = vec![
219    ///    vec!["1".to_string(), "2".to_string(), "3".to_string()],
220    ///    vec!["4".to_string(), "5".to_string(), "6".to_string()],
221    ///    vec!["7".to_string(), "8".to_string(), "9".to_string()],
222    /// ];
223    ///  
224    /// let csv_file = CSVFile::build(&columns, &rows, &',').unwrap();
225    /// let result = csv_file.to_map(|row: &String| row.parse::<u32>().unwrap());
226    ///
227    /// assert_eq!(result.len(), 3);
228    /// assert_eq!(result.get(&String::from("a")).unwrap(), &vec![1, 4, 7]);
229    /// assert_eq!(result.get(&String::from("b")).unwrap(), &vec![2, 5, 8]);
230    /// assert_eq!(result.get(&String::from("c")).unwrap(), &vec![3, 6, 9]);
231    /// ```
232    pub fn to_map<F, T>(&self, f: F) -> HashMap<String, Vec<T>>
233    where
234        F: Fn(&String) -> T,
235    {
236        let mut map: HashMap<String, Vec<T>> = self.map_columns();
237        for row in &self.rows {
238            for (i, field) in row.iter().enumerate() {
239                map.get_mut(&self.columns[i]).unwrap().push(f(field));
240            }
241        }
242
243        map
244    }
245
246    /// Writes the CSV file to a file.
247    pub fn write(&self, filename: &String) -> Result<(), Error> {
248        let mut file = File::create(filename)?;
249        file.write_all(self.to_string().as_bytes())?;
250        Ok(())
251    }
252
253    /// Returns the number of columns in the CSV file.
254    pub fn len(&self) -> usize {
255        self.columns.len()
256    }
257
258    /// Returns the number of rows in the CSV file.
259    /// It doesn't count the header.
260    pub fn count_rows(&self) -> usize {
261        self.rows.len()
262    }
263
264    /// Returns `true` if the CSV file has the given column.
265    pub fn has_column(&self, column_name: &String) -> bool {
266        self.columns.contains(column_name)
267    }
268
269    /// Returns `true` if the CSV file has no row.
270    pub fn has_no_rows(&self) -> bool {
271        self.rows.is_empty()
272    }
273
274    /// Returns `true` if the CSV file has no column.
275    pub fn has_no_columns(&self) -> bool {
276        self.columns.is_empty()
277    }
278
279    /// Returns `true` if the CSV file is empty,
280    /// meaning it doesn't have any column and any row.
281    pub fn empty(&self) -> bool {
282        self.has_no_rows() && self.has_no_columns()
283    }
284
285    /// Sets the delimiter of the CSV file.
286    pub fn set_delimiter(&mut self, new_delimiter: &char) {
287        self.delimiter = *new_delimiter;
288    }
289
290    /// Gets the index of a column by its name.
291    pub fn get_column_idx(&self, column_name: &String) -> Option<usize> {
292        self.columns.iter().position(|c| c == column_name)
293    }
294
295    /// Gets a cell at given coordinates.
296    /// It returns `None` if the coordinates are out of range.
297    ///
298    /// # Example
299    ///
300    /// ```
301    /// # use csv_tools::{CSVFile, CSVCoords};
302    /// let columns = vec!["a".to_string(), "b".to_string(), "c".to_string()];
303    /// let rows = vec![
304    ///    vec!["1".to_string(), "2".to_string(), "3".to_string()],
305    ///    vec!["4".to_string(), "5".to_string(), "6".to_string()],
306    ///    vec!["7".to_string(), "8".to_string(), "9".to_string()],
307    /// ];
308    ///
309    /// let file = CSVFile::build(&columns, &rows, &',').unwrap();
310    ///
311    /// assert_eq!(file.get_cell(&CSVCoords { row: 0, column: 0 }), Some(&"1".to_string()));
312    /// assert_eq!(file.get_cell(&CSVCoords { row: 1, column: 1 }), Some(&"5".to_string()));
313    /// assert_eq!(file.get_cell(&CSVCoords { row: 2, column: 2 }), Some(&"9".to_string()));
314    /// ```
315    pub fn get_cell(&self, coordinates: &CSVCoords) -> Option<&String> {
316        self.rows.get(coordinates.row)?.get(coordinates.column)
317    }
318
319    /// Finds text in the CSV file and returns the coordinates of the cells.
320    pub fn find_text(&self, text: &String) -> Vec<CSVCoords> {
321        let mut coords: Vec<CSVCoords> = Vec::new();
322        for (i, row) in self.rows.iter().enumerate() {
323            for (j, cell) in row.iter().enumerate() {
324                if cell.contains(text) {
325                    coords.push(CSVCoords { row: i, column: j });
326                }
327            }
328        }
329
330        coords
331    }
332
333    /// Checks if the CSV file is valid.
334    /// It checks for duplicates in the columns and if the rows have the right length.
335    pub fn check_validity(&self) -> bool {
336        // Check for duplicates in the columns
337        let mut column_names: HashSet<&str> = HashSet::new();
338        for column in &self.columns {
339            if column_names.contains(column.as_str()) {
340                return false;
341            }
342            column_names.insert(column);
343        }
344
345        // Make sure the rows have the right length
346        let number_of_columns = self.len();
347        for row in &self.rows {
348            if row.len() != number_of_columns {
349                return false;
350            }
351        }
352
353        true
354    }
355
356    /// Fills a column with the given data.
357    /// It may return an error if the column doesn't exist
358    /// or if the length of the data is different from the number of rows.
359    pub fn fill_column(&mut self, column_name: &String, data: &Vec<String>) -> Result<(), Error> {
360        let column_idx = self.columns.iter().position(|c| c == column_name);
361
362        if column_idx.is_none() {
363            Err(Error::new(
364                ErrorKind::InvalidData,
365                format!("The column {} doesn't exist", column_name),
366            ))
367        } else {
368            if data.len() != self.count_rows() {
369                Err(Error::new(
370                    ErrorKind::InvalidData,
371                    format!(
372                        "Invalid number of fields, {} were given, but expected {}",
373                        data.len(),
374                        self.count_rows()
375                    ),
376                ))
377            } else {
378                let column_idx = column_idx.unwrap();
379                for (i, row) in self.rows.iter_mut().enumerate() {
380                    row[column_idx] = data[i].clone();
381                }
382
383                Ok(())
384            }
385        }
386    }
387
388    /// Merges two CSV files together.
389    /// It may return an error if a duplicated column is found.
390    /// If the number of rows are different, then the rows are extended with empty strings.
391    ///
392    /// The other CSVFile instance is supposed to be valid.
393    pub fn merge(&mut self, other: &CSVFile) -> Result<(), Error> {
394        for column in &other.columns {
395            if self.columns.contains(column) {
396                return Err(Error::new(
397                    ErrorKind::InvalidData,
398                    format!("The column {} already exists", column),
399                ));
400            }
401        }
402
403        // If self has less rows than other
404        //   -> add rows composed of empty strings to self until the lengths match
405        // If self has more rows than other
406        //   -> extend the rows of self with empty strings (from the point where the lengths dismatch to the end of the file).
407        //      Add as many empty strings as the number of columns in other.
408        // Finally:
409        //   -> extend the rows of self with the data from other
410
411        let initial_self_len = self.len();
412        let self_rows = self.count_rows();
413        let other_rows = other.count_rows();
414
415        // Add the columns of other to self
416        self.columns.extend(other.columns.iter().cloned());
417
418        if self_rows < other_rows {
419            for _ in self_rows..other_rows {
420                self.rows.push(vec![String::new(); initial_self_len]);
421            }
422        } else if self_rows > other_rows {
423            for i in other_rows..self_rows {
424                self.rows[i].extend(vec![String::new(); other.len()].iter().cloned());
425            }
426        }
427
428        for i in 0..other_rows {
429            self.rows[i].extend(other.rows[i].iter().cloned());
430        }
431
432        Ok(())
433    }
434
435    /// Adds a row to the CSV file.
436    /// It may return an error if the number of fields
437    /// in the row is different from the number of columns.
438    pub fn add_row(&mut self, data: &Vec<String>) -> Result<(), Error> {
439        if data.len() != self.len() {
440            return Err(Error::new(
441                ErrorKind::InvalidData,
442                format!(
443                    "Invalid number of fields, {} were given, but expected {}",
444                    data.len(),
445                    self.len()
446                ),
447            ));
448        }
449
450        self.rows.push(data.clone());
451
452        Ok(())
453    }
454
455    /// Adds a column to the CSV file.
456    /// It may return an error if the column already exists.
457    /// It appends an empty string to each row.
458    pub fn add_column(&mut self, name: &String) -> Result<(), Error> {
459        if self.columns.contains(&name) {
460            return Err(Error::new(
461                ErrorKind::InvalidData,
462                format!("The column {} already exists", name),
463            ));
464        }
465
466        self.columns.push(name.clone());
467        for row in &mut self.rows {
468            row.push(String::new());
469        }
470
471        Ok(())
472    }
473
474    /// Inserts a column to the CSV file at a specific index.
475    /// It may return an error if the column already exists or if the index is out of range.
476    /// It also inserts an empty string to each row.
477    pub fn insert_column(&mut self, name: &String, column_idx: usize) -> Result<(), Error> {
478        if column_idx > self.len() {
479            return Err(Error::new(
480                ErrorKind::InvalidData,
481                format!("The column index {} is out of range", column_idx),
482            ));
483        }
484
485        if self.columns.contains(&name) {
486            return Err(Error::new(
487                ErrorKind::InvalidData,
488                format!("The column {} already exists", name),
489            ));
490        }
491
492        self.columns.insert(column_idx, name.clone());
493        for row in &mut self.rows {
494            row.insert(column_idx, String::new());
495        }
496
497        Ok(())
498    }
499
500    /// Removes a column from the CSV file.
501    /// It may return an error if the column index is out of range.
502    pub fn remove_column(&mut self, column_idx: usize) -> Result<(), Error> {
503        if column_idx >= self.len() {
504            return Err(Error::new(
505                ErrorKind::InvalidData,
506                format!("The column index {} is out of range", column_idx),
507            ));
508        }
509
510        self.columns.remove(column_idx);
511        for row in &mut self.rows {
512            row.remove(column_idx);
513        }
514
515        Ok(())
516    }
517
518    /// Removes a row from the CSV file.
519    /// It may return an error if the row index is out of range.
520    pub fn remove_row(&mut self, row_idx: usize) -> Result<(), Error> {
521        if row_idx >= self.rows.len() {
522            return Err(Error::new(
523                ErrorKind::InvalidData,
524                format!("The row index {} is out of range", row_idx),
525            ));
526        }
527
528        self.rows.remove(row_idx);
529
530        Ok(())
531    }
532
533    /// Removes all the rows that are composed of empty strings only,
534    /// starting at the very end and stopping as soon as a non-empty row is found.
535    ///
536    /// If no empty row is found, then nothing happens.
537    pub fn trim_end(&mut self) {
538        let mut i = self.rows.len() - 1;
539        loop {
540            if self.rows[i].iter().all(|s| s.is_empty()) {
541                self.rows.remove(i);
542                if i == 0 {
543                    break;
544                } else {
545                    i -= 1;
546                }
547            } else {
548                break;
549            }
550        }
551    }
552
553    /// Removes all the rows that are composed of empty strings only,
554    /// starting at the very beginning and stopping as soon as a non-empty row is found.
555    ///
556    /// If no empty row is found, then nothing happens.
557    pub fn trim_start(&mut self) {
558        let mut to_remove: Vec<usize> = Vec::new();
559        let mut i = 0;
560        while i < self.rows.len() {
561            if self.rows[i].iter().all(|s| s.is_empty()) {
562                to_remove.push(i);
563                i += 1;
564            } else {
565                break;
566            }
567        }
568        for i in to_remove.into_iter().rev() {
569            self.rows.remove(i);
570        }
571    }
572
    /// Removes all the rows that are composed of empty strings only at the beginning and at the end.
    ///
    /// Rows composed of empty strings that sit between two non-empty rows are kept.
    pub fn trim(&mut self) {
        // Leading rows first, then trailing rows.
        self.trim_start();
        self.trim_end();
    }
578
579    /// Removes all the empty lines from the CSV file.
580    pub fn remove_empty_lines(&mut self) {
581        self.rows.retain(|row| !row.iter().all(|s| s.is_empty()));
582    }
583}
584
/// Parses the line into a vector of strings, honouring double quotes and backslash escapes.
///
/// The line is read character by character:
///
/// - a backslash escapes the next character (`\"` yields `"` and `\\` yields `\`),
/// - an unescaped double quote opens or closes a quoted section,
/// - the delimiter ends the current field, unless it appears inside a quoted section,
/// - any other character is appended to the current field.
///
/// The point of this function is to avoid using the split method, as it would ignore quotes.
/// Indeed, if a cell is a string we want to ignore the delimiters inside it.
///
/// The "number_of_fields" parameter is used to pre-allocate the vector.
/// This is useful when we know the number of fields in advance.
///
/// # Errors
///
/// Returns an `InvalidData` error if the line ends with a pending backslash
/// or inside a quoted section that was never closed.
pub(crate) fn parse_line(
    line: &String,
    delimiter: &char,
    number_of_fields: Option<u32>,
) -> Result<Vec<String>, Error> {
    let mut fields: Vec<String> = match number_of_fields {
        Some(n) => Vec::with_capacity(n as usize),
        None => Vec::new(),
    };

    let mut chars = line.chars();
    let mut current_field = String::new();
    let mut is_in_quote = false;
    let mut is_escaped = false;

    // The iterator is also advanced inside the loop body, hence `while let` instead of `for`.
    while let Some(c) = chars.next() {
        if c == '\\' {
            // The second backslash of a pair is a literal backslash.
            if is_escaped {
                current_field.push(c);
            }
            is_escaped = !is_escaped;
            continue;
        }

        if c == '"' {
            if is_escaped {
                current_field.push(c);
            } else {
                if is_in_quote {
                    // A closing quote ends the field.
                    fields.push(std::mem::take(&mut current_field));
                    // Skip the next character because it should be the delimiter. When there
                    // is none, the line ends on this quoted field, which is already stored:
                    // return now so that no extra empty field gets appended.
                    if chars.next().is_none() {
                        return Ok(fields);
                    }
                }
                is_in_quote = !is_in_quote;
            }
        } else if c == *delimiter && !is_in_quote {
            fields.push(std::mem::take(&mut current_field));
        } else {
            current_field.push(c);
        }

        // If the character immediately following a backslash
        // isn't another backslash, then make sure to be unescaped.
        is_escaped = false;
    }

    if is_escaped || is_in_quote {
        return Err(Error::new(
            ErrorKind::InvalidData,
            "Invalid escape sequence",
        ));
    }

    // Push the last field
    fields.push(current_field);

    Ok(fields)
}
656
/// Cuts the line at every occurrence of the delimiter and returns the pieces as owned strings.
/// Contrary to [`parse_line`], quotes and escape characters get no special treatment.
pub(crate) fn split_line(line: &String, delimiter: &char) -> Vec<String> {
    let mut pieces = Vec::new();
    for piece in line.split(*delimiter) {
        pieces.push(String::from(piece));
    }
    pieces
}
662
663/// Reads the columns of the CSV file.
664/// If the line contains quotes (double quotes), it uses the [parse_line](`#parse_line`) function.
665/// Otherwise, it uses the [split_line](`#split_line`) function.
666///
667/// It returns a Result because it can fail if the line,
668/// contains an invalid escape sequence or an unclosed quote.
669pub(crate) fn read_columns(line: &String, delimiter: &char) -> Result<Vec<String>, Error> {
670    if line.contains('"') {
671        parse_line(line, delimiter, None)
672    } else {
673        Ok(split_line(line, delimiter))
674    }
675}
676
677/// Reads the data of the CSV file.
678/// It reads the lines of the file and uses the [parse_line](`#parse_line`) function if the line contains double quotes.
679/// Otherwise, it uses the [split_line](`#split_line`) function.
680///
681/// It returns a Result because it can fail if the line,
682/// contains an invalid escape sequence or an unclosed quote.
683///
684/// The "number_of_fields" parameter is used to pre-allocate the vectors.
685/// This is useful when we know the number of fields in advance.
686pub(crate) fn read_rows(
687    lines: &mut std::io::Lines<BufReader<&File>>,
688    delimiter: &char,
689    number_of_fields: usize,
690) -> Result<Vec<Vec<String>>, Error> {
691    let mut data: Vec<Vec<String>> = Vec::new();
692
693    for line in lines {
694        let line = line?;
695        let fields: Vec<String>;
696        if line.contains('"') {
697            fields = parse_line(&line, delimiter, Some(number_of_fields as u32))?;
698        } else {
699            fields = split_line(&line, delimiter);
700        }
701        data.push(fields);
702    }
703
704    Ok(data)
705}
706
707mod tests;