Skip to main content

plotlars_core/io/
csv.rs

1use std::path::{Path, PathBuf};
2
3use polars::frame::DataFrame;
4use polars::prelude::*;
5
6use super::PlotlarsError;
7
/// A CSV file reader with configurable parsing options.
///
/// Uses a fluent builder pattern: construct with [`CsvReader::new`], chain
/// optional configuration methods, then call [`CsvReader::finish`] to load
/// the data into a [`DataFrame`].
///
/// Every option is stored as an `Option`; `None` means the caller never set
/// it, in which case [`CsvReader::finish`] leaves that setting untouched
/// (or, for `has_header`, falls back to `true`).
///
/// # Example
///
/// ```rust,no_run
/// use plotlars_core::io::CsvReader;
///
/// let df = CsvReader::new("data/penguins.csv")
///     .has_header(true)
///     .try_parse_dates(true)
///     .finish()
///     .unwrap();
/// ```
#[derive(Clone, Debug)]
pub struct CsvReader {
    /// Location of the CSV file to read.
    path: PathBuf,
    /// Column separator byte, if explicitly configured.
    delimiter: Option<u8>,
    /// Whether the first row is a header, if explicitly configured.
    has_header: Option<bool>,
    /// Number of rows to skip, if explicitly configured.
    skip_rows: Option<usize>,
    /// Strings to interpret as null, if explicitly configured.
    null_values: Option<Vec<String>>,
    /// Whether to attempt date/datetime parsing, if explicitly configured.
    try_parse_dates: Option<bool>,
}
34
35impl CsvReader {
36    /// Create a new CSV reader for the given file path.
37    pub fn new(path: impl AsRef<Path>) -> Self {
38        Self {
39            path: path.as_ref().to_path_buf(),
40            delimiter: None,
41            has_header: None,
42            skip_rows: None,
43            null_values: None,
44            try_parse_dates: None,
45        }
46    }
47
48    /// Set the column delimiter byte. Defaults to `b','`.
49    pub fn delimiter(mut self, delimiter: u8) -> Self {
50        self.delimiter = Some(delimiter);
51        self
52    }
53
54    /// Set whether the first row is a header row. Defaults to `true`.
55    pub fn has_header(mut self, has_header: bool) -> Self {
56        self.has_header = Some(has_header);
57        self
58    }
59
60    /// Set the number of rows to skip before reading data.
61    pub fn skip_rows(mut self, skip_rows: usize) -> Self {
62        self.skip_rows = Some(skip_rows);
63        self
64    }
65
66    /// Set strings that should be interpreted as null values.
67    pub fn null_values(mut self, null_values: Vec<&str>) -> Self {
68        self.null_values = Some(null_values.into_iter().map(|s| s.to_string()).collect());
69        self
70    }
71
72    /// Attempt to automatically parse date and datetime columns.
73    pub fn try_parse_dates(mut self, try_parse_dates: bool) -> Self {
74        self.try_parse_dates = Some(try_parse_dates);
75        self
76    }
77
78    /// Execute the read and return a [`DataFrame`].
79    ///
80    /// # Errors
81    ///
82    /// Returns [`PlotlarsError::Io`] if the file cannot be opened, or
83    /// [`PlotlarsError::CsvParse`] if the CSV data cannot be parsed.
84    pub fn finish(self) -> Result<DataFrame, PlotlarsError> {
85        let path_str = self.path.display().to_string();
86
87        let mut options =
88            CsvReadOptions::default().with_has_header(self.has_header.unwrap_or(true));
89
90        if let Some(skip) = self.skip_rows {
91            options = options.with_skip_rows(skip);
92        }
93
94        let mut parse_options = CsvParseOptions::default();
95
96        if let Some(delim) = self.delimiter {
97            parse_options = parse_options.with_separator(delim);
98        }
99
100        if let Some(nulls) = self.null_values {
101            let nulls: Vec<PlSmallStr> = nulls.into_iter().map(PlSmallStr::from).collect();
102            parse_options = parse_options.with_null_values(Some(NullValues::AllColumns(nulls)));
103        }
104
105        if let Some(try_dates) = self.try_parse_dates {
106            parse_options = parse_options.with_try_parse_dates(try_dates);
107        }
108
109        options = options.with_parse_options(parse_options);
110
111        options
112            .try_into_reader_with_file_path(Some(self.path))
113            .map_err(|e| PlotlarsError::CsvParse {
114                path: path_str.clone(),
115                source: Box::new(e),
116            })?
117            .finish()
118            .map_err(|e| PlotlarsError::CsvParse {
119                path: path_str,
120                source: Box::new(e),
121            })
122    }
123}
124
#[cfg(test)]
mod tests {
    use super::*;

    /// Build the path of a fixture file in the `data` directory located two
    /// levels above this crate's manifest directory.
    fn data_path(name: &str) -> String {
        let manifest_dir = env!("CARGO_MANIFEST_DIR");
        format!("{}/../../data/{}", manifest_dir, name)
    }

    /// Reading with no options set yields a non-empty frame.
    #[test]
    fn read_csv_default() {
        let reader = CsvReader::new(data_path("penguins.csv"));
        let frame = reader.finish().unwrap();
        assert!(frame.height() > 0);
        assert!(frame.width() > 0);
    }

    /// Explicit header and date-parsing flags still produce rows.
    #[test]
    fn read_csv_with_options() {
        let reader = CsvReader::new(data_path("penguins.csv"))
            .has_header(true)
            .try_parse_dates(false);
        let frame = reader.finish().unwrap();
        assert!(frame.height() > 0);
    }

    /// A path that does not exist surfaces as an error rather than a panic.
    #[test]
    fn read_csv_file_not_found() {
        let outcome = CsvReader::new("nonexistent.csv").finish();
        assert!(outcome.is_err());
    }

    /// Passing the comma delimiter explicitly reads the fixture.
    #[test]
    fn read_csv_custom_delimiter() {
        let reader = CsvReader::new(data_path("penguins.csv")).delimiter(b',');
        let frame = reader.finish().unwrap();
        assert!(frame.height() > 0);
    }

    /// Skipping two rows shortens the frame by exactly two rows.
    #[test]
    fn read_csv_skip_rows() {
        let fixture = data_path("animal_statistics.csv");

        let df_full = CsvReader::new(&fixture).finish().unwrap();
        let df_skip = CsvReader::new(&fixture).skip_rows(2).finish().unwrap();

        assert_eq!(df_full.height() - 2, df_skip.height());
    }

    /// Custom null markers do not prevent the fixture from loading.
    #[test]
    fn read_csv_null_values() {
        let null_markers = vec!["NA", ""];
        let frame = CsvReader::new(data_path("penguins.csv"))
            .null_values(null_markers)
            .finish()
            .unwrap();
        assert!(frame.height() > 0);
    }
}