csv_tools/lib.rs
1//! # CSV Tools
2//!
3//! This crate is a collection of utilities to make reading, manipulating and creating CSV files
4//! easier. You can open a CSV file, read the text in it and find text, define a delimiter, change
5//! it and save the file with the new delimiter, merge CSV files, map the rows with a custom data
6//! structure, etc.
7//!
8//! Note that quotes are supported with this crate. Meaning that if a cell is surrounded by double
9//! quotes, it will count as one unique value instead of multiple values if it were to contain the
10//! delimiter.
11//!
12//! For example, assuming the delimiter is a comma:
13//!
14//! ```csv
15//! name,pseudo,age
16//! Thomas,"The Svelter",20
17//! Yoshiip,"The best, and only, Godoter",99
18//! ```
19//!
//! The second row contains 3 values. However, without the quotes it would have been parsed as 5
//! different values: `"Yoshiip"`, `"The best"`, `" and only"`, `" Godoter"` and `"99"`, since the
//! quoted cell contains the delimiter.
23//!
//! By default, a row without double quotes will be parsed using a simple built-in method
//! (`split`), which is slightly more performant since fewer calculations are needed to find and
//! locate the right ending of a string.
27//!
28//! Escape characters are allowed, meaning that a string can contain `\"`.
29
30use std::collections::{HashMap, HashSet};
31use std::fmt;
32use std::fs::File;
33use std::io::Error;
34use std::io::ErrorKind;
35use std::io::Write;
36use std::io::{BufRead, BufReader};
37
/// In-memory representation of a CSV file, with every value kept as a raw string.
///
/// The header line and the data lines are stored separately: `columns` holds the header,
/// `rows` holds everything that follows it.
pub struct CSVFile {
    /// Character that separates two fields on a line.
    pub delimiter: char,
    /// Column names, in the order they appear in the header.
    pub columns: Vec<String>,
    /// Data rows (header excluded); each row lists its fields in column order.
    pub rows: Vec<Vec<String>>,
}
44
/// Position of a cell within a CSV file.
///
/// `row` is an index into the data rows (the header is not counted) and `column` is an index
/// into the fields of that row. Both are zero-based.
///
/// `Debug` is derived: for `{:?}` it prints `CSVCoords { row: R, column: C }`, and it also
/// supports the pretty-printing `{:#?}` form.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct CSVCoords {
    /// Zero-based index of the row.
    pub row: usize,
    /// Zero-based index of the column.
    pub column: usize,
}

impl fmt::Display for CSVCoords {
    /// Formats the coordinates as `(row, column)`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "({}, {})", self.row, self.column)
    }
}
67
68impl fmt::Display for CSVFile {
69 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
70 let mut result = String::new();
71 for column in &self.columns {
72 result.push_str(column);
73 result.push(self.delimiter);
74 }
75 result.pop(); // removes the trailing delimiter
76 result.push('\n');
77
78 for row in &self.rows {
79 for field in row {
80 result.push_str(field);
81 result.push(self.delimiter);
82 }
83 result.pop();
84 result.push('\n');
85 }
86
87 write!(f, "{}", result)
88 }
89}
90
91impl fmt::Debug for CSVFile {
92 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
93 write!(
94 f,
95 "CSVFile {{ delimiter: {}, columns: {:?}, rows: {:?} }}",
96 self.delimiter, self.columns, self.rows
97 )
98 }
99}
100
101impl CSVFile {
102 /// Creates a new CSVFile from a file name and an optional delimiter (a comma by default).
103 /// It reads the first line of the file to get the columns and the rest of the file to get the data.
104 /// It may return an error if the file doesn't exist or if it can't be read properly.
105 pub fn new(file_name: &String, delimiter: &char) -> Result<Self, Error> {
106 let file = File::open(&file_name)?;
107 let mut lines = BufReader::new(&file).lines();
108 let first_line = lines.next().unwrap()?;
109 let columns = read_columns(&first_line, delimiter)?;
110 let rows = read_rows(&mut lines, delimiter, columns.len())?;
111
112 Ok(Self {
113 delimiter: *delimiter,
114 columns,
115 rows,
116 })
117 }
118
119 /// Creates a new CSVFile from the columns and the rows.
120 ///
121 /// # Example
122 ///
123 /// ```
124 /// # use csv_tools::CSVFile;
125 ///
126 /// let columns = vec!["a".to_string(), "b".to_string(), "c".to_string()];
127 /// let rows = vec![
128 /// vec!["1".to_string(), "2".to_string(), "3".to_string()],
129 /// vec!["4".to_string(), "5".to_string(), "6".to_string()],
130 /// vec!["7".to_string(), "8".to_string(), "9".to_string()],
131 /// ];
132 ///
133 /// let file = CSVFile::build(&columns, &rows, &',').unwrap();
134 /// assert_eq!(file.columns, columns);
135 /// assert_eq!(file.rows, rows);
136 /// ```
137 pub fn build(
138 columns: &Vec<String>,
139 rows: &Vec<Vec<String>>,
140 delimiter: &char,
141 ) -> Result<Self, Error> {
142 for (index, row) in rows.iter().enumerate() {
143 if columns.len() != row.len() {
144 return Err(Error::new(
145 ErrorKind::InvalidData,
146 format!("Invalid number of fields for row of index {}, {} were given, but expected {}", index, row.len(), columns.len()))
147 );
148 }
149 }
150
151 Ok(Self {
152 delimiter: *delimiter,
153 columns: columns.clone(),
154 rows: rows.clone(),
155 })
156 }
157
158 /// Maps the rows of the CSV file to a type `T` using a callback function `F` called on each row.
159 ///
160 /// # Example
161 ///
162 /// ```
163 /// # use csv_tools::CSVFile;
164 ///
165 /// #[derive(Debug, PartialEq)]
166 /// struct MyData {
167 /// a: u32,
168 /// b: u32,
169 /// c: u32
170 /// }
171 ///
172 /// let columns = vec!["a".to_string(), "b".to_string(), "c".to_string()];
173 /// let rows = vec![
174 /// vec!["1".to_string(), "2".to_string(), "3".to_string()],
175 /// vec!["4".to_string(), "5".to_string(), "6".to_string()],
176 /// vec!["7".to_string(), "8".to_string(), "9".to_string()],
177 /// ];
178 ///
179 /// let csv_file = CSVFile::build(&columns, &rows, &',').unwrap();
180 /// let result = csv_file.map_rows(|row: &Vec<String>| {
181 /// MyData {
182 /// a: row[0].parse().unwrap(),
183 /// b: row[1].parse().unwrap(),
184 /// c: row[2].parse().unwrap()
185 /// }
186 /// });
187 ///
188 /// assert_eq!(result.len(), 3);
189 /// assert_eq!(result[0], MyData { a: 1, b: 2, c: 3 });
190 /// assert_eq!(result[1], MyData { a: 4, b: 5, c: 6 });
191 /// assert_eq!(result[2], MyData { a: 7, b: 8, c: 9 });
192 /// ```
193 pub fn map_rows<F, T>(&self, f: F) -> Vec<T>
194 where
195 F: Fn(&Vec<String>) -> T,
196 {
197 self.rows.iter().map(f).collect()
198 }
199
200 /// Maps the columns of the CSV file to a HashMap.
201 fn map_columns<T>(&self) -> HashMap<String, Vec<T>> {
202 let mut map = HashMap::new();
203 for column in &self.columns {
204 map.insert(column.clone(), Vec::new());
205 }
206
207 map
208 }
209
210 /// Maps the columns of the CSV file. The keys are column names and the associated value is a vector of type `T`.
211 ///
212 /// # Example
213 ///
214 /// ```
215 /// # use csv_tools::CSVFile;
216 ///
217 /// let columns = vec!["a".to_string(), "b".to_string(), "c".to_string()];
218 /// let rows = vec![
219 /// vec!["1".to_string(), "2".to_string(), "3".to_string()],
220 /// vec!["4".to_string(), "5".to_string(), "6".to_string()],
221 /// vec!["7".to_string(), "8".to_string(), "9".to_string()],
222 /// ];
223 ///
224 /// let csv_file = CSVFile::build(&columns, &rows, &',').unwrap();
225 /// let result = csv_file.to_map(|row: &String| row.parse::<u32>().unwrap());
226 ///
227 /// assert_eq!(result.len(), 3);
228 /// assert_eq!(result.get(&String::from("a")).unwrap(), &vec![1, 4, 7]);
229 /// assert_eq!(result.get(&String::from("b")).unwrap(), &vec![2, 5, 8]);
230 /// assert_eq!(result.get(&String::from("c")).unwrap(), &vec![3, 6, 9]);
231 /// ```
232 pub fn to_map<F, T>(&self, f: F) -> HashMap<String, Vec<T>>
233 where
234 F: Fn(&String) -> T,
235 {
236 let mut map: HashMap<String, Vec<T>> = self.map_columns();
237 for row in &self.rows {
238 for (i, field) in row.iter().enumerate() {
239 map.get_mut(&self.columns[i]).unwrap().push(f(field));
240 }
241 }
242
243 map
244 }
245
246 /// Writes the CSV file to a file.
247 pub fn write(&self, filename: &String) -> Result<(), Error> {
248 let mut file = File::create(filename)?;
249 file.write_all(self.to_string().as_bytes())?;
250 Ok(())
251 }
252
253 /// Returns the number of columns in the CSV file.
254 pub fn len(&self) -> usize {
255 self.columns.len()
256 }
257
258 /// Returns the number of rows in the CSV file.
259 /// It doesn't count the header.
260 pub fn count_rows(&self) -> usize {
261 self.rows.len()
262 }
263
264 /// Returns `true` if the CSV file has the given column.
265 pub fn has_column(&self, column_name: &String) -> bool {
266 self.columns.contains(column_name)
267 }
268
269 /// Returns `true` if the CSV file has no row.
270 pub fn has_no_rows(&self) -> bool {
271 self.rows.is_empty()
272 }
273
274 /// Returns `true` if the CSV file has no column.
275 pub fn has_no_columns(&self) -> bool {
276 self.columns.is_empty()
277 }
278
279 /// Returns `true` if the CSV file is empty,
280 /// meaning it doesn't have any column and any row.
281 pub fn empty(&self) -> bool {
282 self.has_no_rows() && self.has_no_columns()
283 }
284
285 /// Sets the delimiter of the CSV file.
286 pub fn set_delimiter(&mut self, new_delimiter: &char) {
287 self.delimiter = *new_delimiter;
288 }
289
290 /// Gets the index of a column by its name.
291 pub fn get_column_idx(&self, column_name: &String) -> Option<usize> {
292 self.columns.iter().position(|c| c == column_name)
293 }
294
295 /// Gets a cell at given coordinates.
296 /// It returns `None` if the coordinates are out of range.
297 ///
298 /// # Example
299 ///
300 /// ```
301 /// # use csv_tools::{CSVFile, CSVCoords};
302 /// let columns = vec!["a".to_string(), "b".to_string(), "c".to_string()];
303 /// let rows = vec![
304 /// vec!["1".to_string(), "2".to_string(), "3".to_string()],
305 /// vec!["4".to_string(), "5".to_string(), "6".to_string()],
306 /// vec!["7".to_string(), "8".to_string(), "9".to_string()],
307 /// ];
308 ///
309 /// let file = CSVFile::build(&columns, &rows, &',').unwrap();
310 ///
311 /// assert_eq!(file.get_cell(&CSVCoords { row: 0, column: 0 }), Some(&"1".to_string()));
312 /// assert_eq!(file.get_cell(&CSVCoords { row: 1, column: 1 }), Some(&"5".to_string()));
313 /// assert_eq!(file.get_cell(&CSVCoords { row: 2, column: 2 }), Some(&"9".to_string()));
314 /// ```
315 pub fn get_cell(&self, coordinates: &CSVCoords) -> Option<&String> {
316 self.rows.get(coordinates.row)?.get(coordinates.column)
317 }
318
319 /// Finds text in the CSV file and returns the coordinates of the cells.
320 pub fn find_text(&self, text: &String) -> Vec<CSVCoords> {
321 let mut coords: Vec<CSVCoords> = Vec::new();
322 for (i, row) in self.rows.iter().enumerate() {
323 for (j, cell) in row.iter().enumerate() {
324 if cell.contains(text) {
325 coords.push(CSVCoords { row: i, column: j });
326 }
327 }
328 }
329
330 coords
331 }
332
333 /// Checks if the CSV file is valid.
334 /// It checks for duplicates in the columns and if the rows have the right length.
335 pub fn check_validity(&self) -> bool {
336 // Check for duplicates in the columns
337 let mut column_names: HashSet<&str> = HashSet::new();
338 for column in &self.columns {
339 if column_names.contains(column.as_str()) {
340 return false;
341 }
342 column_names.insert(column);
343 }
344
345 // Make sure the rows have the right length
346 let number_of_columns = self.len();
347 for row in &self.rows {
348 if row.len() != number_of_columns {
349 return false;
350 }
351 }
352
353 true
354 }
355
356 /// Fills a column with the given data.
357 /// It may return an error if the column doesn't exist
358 /// or if the length of the data is different from the number of rows.
359 pub fn fill_column(&mut self, column_name: &String, data: &Vec<String>) -> Result<(), Error> {
360 let column_idx = self.columns.iter().position(|c| c == column_name);
361
362 if column_idx.is_none() {
363 Err(Error::new(
364 ErrorKind::InvalidData,
365 format!("The column {} doesn't exist", column_name),
366 ))
367 } else {
368 if data.len() != self.count_rows() {
369 Err(Error::new(
370 ErrorKind::InvalidData,
371 format!(
372 "Invalid number of fields, {} were given, but expected {}",
373 data.len(),
374 self.count_rows()
375 ),
376 ))
377 } else {
378 let column_idx = column_idx.unwrap();
379 for (i, row) in self.rows.iter_mut().enumerate() {
380 row[column_idx] = data[i].clone();
381 }
382
383 Ok(())
384 }
385 }
386 }
387
388 /// Merges two CSV files together.
389 /// It may return an error if a duplicated column is found.
390 /// If the number of rows are different, then the rows are extended with empty strings.
391 ///
392 /// The other CSVFile instance is supposed to be valid.
393 pub fn merge(&mut self, other: &CSVFile) -> Result<(), Error> {
394 for column in &other.columns {
395 if self.columns.contains(column) {
396 return Err(Error::new(
397 ErrorKind::InvalidData,
398 format!("The column {} already exists", column),
399 ));
400 }
401 }
402
403 // If self has less rows than other
404 // -> add rows composed of empty strings to self until the lengths match
405 // If self has more rows than other
406 // -> extend the rows of self with empty strings (from the point where the lengths dismatch to the end of the file).
407 // Add as many empty strings as the number of columns in other.
408 // Finally:
409 // -> extend the rows of self with the data from other
410
411 let initial_self_len = self.len();
412 let self_rows = self.count_rows();
413 let other_rows = other.count_rows();
414
415 // Add the columns of other to self
416 self.columns.extend(other.columns.iter().cloned());
417
418 if self_rows < other_rows {
419 for _ in self_rows..other_rows {
420 self.rows.push(vec![String::new(); initial_self_len]);
421 }
422 } else if self_rows > other_rows {
423 for i in other_rows..self_rows {
424 self.rows[i].extend(vec![String::new(); other.len()].iter().cloned());
425 }
426 }
427
428 for i in 0..other_rows {
429 self.rows[i].extend(other.rows[i].iter().cloned());
430 }
431
432 Ok(())
433 }
434
435 /// Adds a row to the CSV file.
436 /// It may return an error if the number of fields
437 /// in the row is different from the number of columns.
438 pub fn add_row(&mut self, data: &Vec<String>) -> Result<(), Error> {
439 if data.len() != self.len() {
440 return Err(Error::new(
441 ErrorKind::InvalidData,
442 format!(
443 "Invalid number of fields, {} were given, but expected {}",
444 data.len(),
445 self.len()
446 ),
447 ));
448 }
449
450 self.rows.push(data.clone());
451
452 Ok(())
453 }
454
455 /// Adds a column to the CSV file.
456 /// It may return an error if the column already exists.
457 /// It appends an empty string to each row.
458 pub fn add_column(&mut self, name: &String) -> Result<(), Error> {
459 if self.columns.contains(&name) {
460 return Err(Error::new(
461 ErrorKind::InvalidData,
462 format!("The column {} already exists", name),
463 ));
464 }
465
466 self.columns.push(name.clone());
467 for row in &mut self.rows {
468 row.push(String::new());
469 }
470
471 Ok(())
472 }
473
474 /// Inserts a column to the CSV file at a specific index.
475 /// It may return an error if the column already exists or if the index is out of range.
476 /// It also inserts an empty string to each row.
477 pub fn insert_column(&mut self, name: &String, column_idx: usize) -> Result<(), Error> {
478 if column_idx > self.len() {
479 return Err(Error::new(
480 ErrorKind::InvalidData,
481 format!("The column index {} is out of range", column_idx),
482 ));
483 }
484
485 if self.columns.contains(&name) {
486 return Err(Error::new(
487 ErrorKind::InvalidData,
488 format!("The column {} already exists", name),
489 ));
490 }
491
492 self.columns.insert(column_idx, name.clone());
493 for row in &mut self.rows {
494 row.insert(column_idx, String::new());
495 }
496
497 Ok(())
498 }
499
500 /// Removes a column from the CSV file.
501 /// It may return an error if the column index is out of range.
502 pub fn remove_column(&mut self, column_idx: usize) -> Result<(), Error> {
503 if column_idx >= self.len() {
504 return Err(Error::new(
505 ErrorKind::InvalidData,
506 format!("The column index {} is out of range", column_idx),
507 ));
508 }
509
510 self.columns.remove(column_idx);
511 for row in &mut self.rows {
512 row.remove(column_idx);
513 }
514
515 Ok(())
516 }
517
518 /// Removes a row from the CSV file.
519 /// It may return an error if the row index is out of range.
520 pub fn remove_row(&mut self, row_idx: usize) -> Result<(), Error> {
521 if row_idx >= self.rows.len() {
522 return Err(Error::new(
523 ErrorKind::InvalidData,
524 format!("The row index {} is out of range", row_idx),
525 ));
526 }
527
528 self.rows.remove(row_idx);
529
530 Ok(())
531 }
532
533 /// Removes all the rows that are composed of empty strings only,
534 /// starting at the very end and stopping as soon as a non-empty row is found.
535 ///
536 /// If no empty row is found, then nothing happens.
537 pub fn trim_end(&mut self) {
538 let mut i = self.rows.len() - 1;
539 loop {
540 if self.rows[i].iter().all(|s| s.is_empty()) {
541 self.rows.remove(i);
542 if i == 0 {
543 break;
544 } else {
545 i -= 1;
546 }
547 } else {
548 break;
549 }
550 }
551 }
552
553 /// Removes all the rows that are composed of empty strings only,
554 /// starting at the very beginning and stopping as soon as a non-empty row is found.
555 ///
556 /// If no empty row is found, then nothing happens.
557 pub fn trim_start(&mut self) {
558 let mut to_remove: Vec<usize> = Vec::new();
559 let mut i = 0;
560 while i < self.rows.len() {
561 if self.rows[i].iter().all(|s| s.is_empty()) {
562 to_remove.push(i);
563 i += 1;
564 } else {
565 break;
566 }
567 }
568 for i in to_remove.into_iter().rev() {
569 self.rows.remove(i);
570 }
571 }
572
573 /// Removes all the rows that are composed of empty strings only at the beginning and at the end.
574 pub fn trim(&mut self) {
575 self.trim_start();
576 self.trim_end();
577 }
578
579 /// Removes all the empty lines from the CSV file.
580 pub fn remove_empty_lines(&mut self) {
581 self.rows.retain(|row| !row.iter().all(|s| s.is_empty()));
582 }
583}
584
/// Parses a line into a vector of fields, reading it character by character.
///
/// Contrary to a plain `split`, this function understands double quotes and backslashes:
/// - a delimiter found outside of quotes ends the current field,
/// - a delimiter found inside quotes is part of the field,
/// - an unescaped double quote opens or closes a quoted section and is not part of the field,
/// - a backslash escapes the next character: `\"` yields a literal double quote and `\\`
///   yields a literal backslash. The backslash itself is dropped.
///
/// The character that immediately follows a closing quote is skipped, as it is expected to be
/// the delimiter. When the closing quote is the last character of the line, the line is
/// complete and no extra empty field is appended.
///
/// The `number_of_fields` parameter is only used to pre-allocate the resulting vector,
/// which is useful when the number of fields is known in advance.
///
/// # Errors
///
/// Returns an `InvalidData` error if the line ends with a dangling backslash
/// or inside a quoted section.
pub(crate) fn parse_line(
    line: &String,
    delimiter: &char,
    number_of_fields: Option<u32>,
) -> Result<Vec<String>, Error> {
    let mut fields: Vec<String> = match number_of_fields {
        Some(n) => Vec::with_capacity(n as usize),
        None => Vec::new(),
    };

    let mut chars = line.chars();
    let mut current_field = String::new();
    let mut is_in_quote = false;
    let mut is_escaped = false;
    // Set when the line stops right after a closing quote: the last field has already been
    // pushed at that point, so there is no pending field left to append.
    let mut ends_with_closing_quote = false;

    // `while let` rather than `for`: the iterator is also advanced inside the loop.
    while let Some(c) = chars.next() {
        if c == '\\' {
            // The second backslash of a pair is a literal one.
            if is_escaped {
                current_field.push(c);
            }
            is_escaped = !is_escaped;
            continue;
        }

        match c {
            '"' if !is_escaped => {
                if is_in_quote {
                    fields.push(std::mem::take(&mut current_field));
                    // Skip the next character because it should be the delimiter,
                    // unless the line is over.
                    ends_with_closing_quote = chars.next().is_none();
                }
                is_in_quote = !is_in_quote;
            }
            '"' => current_field.push(c),
            _ if c == *delimiter && !is_in_quote => {
                fields.push(std::mem::take(&mut current_field));
            }
            _ => current_field.push(c),
        }

        // An escape only applies to the character immediately following the backslash.
        is_escaped = false;
    }

    if is_escaped || is_in_quote {
        return Err(Error::new(
            ErrorKind::InvalidData,
            "Invalid escape sequence",
        ));
    }

    // Push the last field, unless the closing quote that ended the line already did.
    if !ends_with_closing_quote {
        fields.push(current_field);
    }

    Ok(fields)
}
656
/// Cuts the line at every occurrence of the delimiter and returns the pieces as owned strings.
/// Contrary to [`parse_line`], quotes and backslashes have no special meaning here.
pub(crate) fn split_line(line: &String, delimiter: &char) -> Vec<String> {
    let mut fields = Vec::new();
    for piece in line.split(*delimiter) {
        fields.push(String::from(piece));
    }

    fields
}
662
663/// Reads the columns of the CSV file.
664/// If the line contains quotes (double quotes), it uses the [parse_line](`#parse_line`) function.
665/// Otherwise, it uses the [split_line](`#split_line`) function.
666///
667/// It returns a Result because it can fail if the line,
668/// contains an invalid escape sequence or an unclosed quote.
669pub(crate) fn read_columns(line: &String, delimiter: &char) -> Result<Vec<String>, Error> {
670 if line.contains('"') {
671 parse_line(line, delimiter, None)
672 } else {
673 Ok(split_line(line, delimiter))
674 }
675}
676
677/// Reads the data of the CSV file.
678/// It reads the lines of the file and uses the [parse_line](`#parse_line`) function if the line contains double quotes.
679/// Otherwise, it uses the [split_line](`#split_line`) function.
680///
681/// It returns a Result because it can fail if the line,
682/// contains an invalid escape sequence or an unclosed quote.
683///
684/// The "number_of_fields" parameter is used to pre-allocate the vectors.
685/// This is useful when we know the number of fields in advance.
686pub(crate) fn read_rows(
687 lines: &mut std::io::Lines<BufReader<&File>>,
688 delimiter: &char,
689 number_of_fields: usize,
690) -> Result<Vec<Vec<String>>, Error> {
691 let mut data: Vec<Vec<String>> = Vec::new();
692
693 for line in lines {
694 let line = line?;
695 let fields: Vec<String>;
696 if line.contains('"') {
697 fields = parse_line(&line, delimiter, Some(number_of_fields as u32))?;
698 } else {
699 fields = split_line(&line, delimiter);
700 }
701 data.push(fields);
702 }
703
704 Ok(data)
705}
706
707mod tests;