Skip to main content

ferray_io/text/
parser.rs

1// ferray-io: Delimited text parser with missing value handling
2
3use ferray_core::error::{FerrayError, FerrayResult};
4
/// Options controlling how delimited text is split into rows and cells.
///
/// The parsers in this module apply the options in a fixed order: blank and
/// comment lines are removed first, then `skiprows` is applied to what is
/// left, and finally `max_rows` caps the number of rows that are kept.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextParseOptions {
    /// Column delimiter character (default: ',').
    pub delimiter: char,
    /// Number of leading rows to skip (default: 0).
    ///
    /// Rows are counted *after* blank lines and comment lines have been
    /// removed, so a comment line at the top of the input does not consume
    /// one of the skipped rows.
    pub skiprows: usize,
    /// Comment character (default: `Some('#')`): lines whose first
    /// non-whitespace character is this one are skipped entirely.
    /// `None` disables comment handling.
    pub comments: Option<char>,
    /// Maximum number of data rows to read (default: `None` = all).
    pub max_rows: Option<usize>,
}

impl Default for TextParseOptions {
    /// Comma-delimited input, `#` comments, no skipped rows, no row limit.
    fn default() -> Self {
        Self {
            delimiter: ',',
            skiprows: 0,
            comments: Some('#'),
            max_rows: None,
        }
    }
}
28
29/// Parse a delimited text into a 2D grid of string cells.
30///
31/// Returns `(rows, ncols)` where `rows` is a flat vector of cells in row-major order.
32pub fn parse_text_grid(
33    content: &str,
34    opts: &TextParseOptions,
35) -> FerrayResult<(Vec<String>, usize, usize)> {
36    let all_lines: Vec<&str> = content.lines().collect();
37
38    // Filter comments and empty lines first, then skip rows.
39    // This matches NumPy behavior: comments are stripped, then skiprows
40    // counts over the remaining data lines.
41    let non_comment_lines: Vec<&str> = all_lines
42        .iter()
43        .filter(|line| {
44            let trimmed = line.trim();
45            if trimmed.is_empty() {
46                return false;
47            }
48            if let Some(comment_char) = opts.comments {
49                if trimmed.starts_with(comment_char) {
50                    return false;
51                }
52            }
53            true
54        })
55        .copied()
56        .collect();
57
58    // Skip initial rows
59    let data_lines: &[&str] = if opts.skiprows < non_comment_lines.len() {
60        &non_comment_lines[opts.skiprows..]
61    } else {
62        &[]
63    };
64
65    // Apply max_rows
66    let data_lines = if let Some(max) = opts.max_rows {
67        &data_lines[..data_lines.len().min(max)]
68    } else {
69        data_lines
70    };
71
72    if data_lines.is_empty() {
73        return Ok((vec![], 0, 0));
74    }
75
76    let delim = opts.delimiter;
77    let nrows = data_lines.len();
78
79    // Parse first line to determine number of columns
80    let ncols = data_lines[0].split(delim).count();
81
82    let mut cells = Vec::with_capacity(nrows * ncols);
83
84    for (row_idx, line) in data_lines.iter().enumerate() {
85        let fields: Vec<&str> = line.split(delim).collect();
86        if fields.len() != ncols {
87            return Err(FerrayError::io_error(format!(
88                "row {} has {} columns, expected {} (line: '{}')",
89                row_idx + opts.skiprows,
90                fields.len(),
91                ncols,
92                line,
93            )));
94        }
95        for field in fields {
96            cells.push(field.trim().to_string());
97        }
98    }
99
100    Ok((cells, nrows, ncols))
101}
102
103/// Parse a delimited text into a 2D grid, allowing missing values.
104///
105/// Missing values (empty cells or cells matching `missing_values`) are returned
106/// as `None`. Present values are returned as `Some(String)`.
107pub fn parse_text_grid_with_missing(
108    content: &str,
109    opts: &TextParseOptions,
110    missing_values: &[&str],
111) -> FerrayResult<(Vec<Option<String>>, usize, usize)> {
112    let all_lines: Vec<&str> = content.lines().collect();
113
114    // Filter comments and empty lines first, then skip rows.
115    let non_comment_lines: Vec<&str> = all_lines
116        .iter()
117        .filter(|line| {
118            let trimmed = line.trim();
119            if trimmed.is_empty() {
120                return false;
121            }
122            if let Some(comment_char) = opts.comments {
123                if trimmed.starts_with(comment_char) {
124                    return false;
125                }
126            }
127            true
128        })
129        .copied()
130        .collect();
131
132    let data_lines: &[&str] = if opts.skiprows < non_comment_lines.len() {
133        &non_comment_lines[opts.skiprows..]
134    } else {
135        &[]
136    };
137
138    let data_lines = if let Some(max) = opts.max_rows {
139        &data_lines[..data_lines.len().min(max)]
140    } else {
141        data_lines
142    };
143
144    if data_lines.is_empty() {
145        return Ok((vec![], 0, 0));
146    }
147
148    let delim = opts.delimiter;
149    let nrows = data_lines.len();
150
151    let ncols = data_lines[0].split(delim).count();
152
153    let mut cells = Vec::with_capacity(nrows * ncols);
154
155    for (row_idx, line) in data_lines.iter().enumerate() {
156        let fields: Vec<&str> = line.split(delim).collect();
157        // Allow rows with fewer columns (missing trailing values)
158        for col_idx in 0..ncols {
159            if col_idx >= fields.len() {
160                cells.push(None);
161            } else {
162                let field = fields[col_idx].trim();
163                if field.is_empty() || missing_values.contains(&field) {
164                    cells.push(None);
165                } else {
166                    cells.push(Some(field.to_string()));
167                }
168            }
169        }
170        // Extra columns beyond ncols generate an error
171        if fields.len() > ncols {
172            return Err(FerrayError::io_error(format!(
173                "row {} has {} columns, expected {} (line: '{}')",
174                row_idx + opts.skiprows,
175                fields.len(),
176                ncols,
177                line,
178            )));
179        }
180    }
181
182    Ok((cells, nrows, ncols))
183}
184
#[cfg(test)]
mod tests {
    //! Unit tests for the strict and missing-value-aware grid parsers.

    use super::*;

    /// Two comma-separated rows parse into a 2x3 row-major grid.
    #[test]
    fn parse_simple_csv() {
        let content = "1,2,3\n4,5,6\n";
        let opts = TextParseOptions {
            delimiter: ',',
            ..Default::default()
        };
        let (cells, nrows, ncols) = parse_text_grid(content, &opts).unwrap();
        assert_eq!(nrows, 2);
        assert_eq!(ncols, 3);
        assert_eq!(cells, vec!["1", "2", "3", "4", "5", "6"]);
    }

    /// `skiprows` is counted after comment lines are removed, so the comment
    /// does not consume the skipped row; the `name,value` header does.
    #[test]
    fn parse_with_skiprows() {
        let content = "# header\nname,value\n1,10\n2,20\n";
        let opts = TextParseOptions {
            delimiter: ',',
            skiprows: 1,
            comments: Some('#'),
            ..Default::default()
        };
        let (cells, nrows, ncols) = parse_text_grid(content, &opts).unwrap();
        assert_eq!(nrows, 2);
        assert_eq!(ncols, 2);
        assert_eq!(cells[0], "1");
    }

    /// Comment lines between data rows are ignored.
    #[test]
    fn parse_with_comments() {
        let content = "1,2\n# comment\n3,4\n";
        let opts = TextParseOptions {
            delimiter: ',',
            comments: Some('#'),
            ..Default::default()
        };
        let (cells, nrows, _) = parse_text_grid(content, &opts).unwrap();
        assert_eq!(nrows, 2);
        assert_eq!(cells, vec!["1", "2", "3", "4"]);
    }

    /// A non-default delimiter is honoured.
    #[test]
    fn parse_tab_delimited() {
        let content = "1\t2\t3\n4\t5\t6\n";
        let opts = TextParseOptions {
            delimiter: '\t',
            ..Default::default()
        };
        let (cells, nrows, ncols) = parse_text_grid(content, &opts).unwrap();
        assert_eq!(nrows, 2);
        assert_eq!(ncols, 3);
        assert_eq!(cells[0], "1");
    }

    /// The strict parser rejects rows whose width differs from the first row.
    #[test]
    fn parse_inconsistent_columns_error() {
        let content = "1,2,3\n4,5\n";
        let opts = TextParseOptions::default();
        assert!(parse_text_grid(content, &opts).is_err());
    }

    /// `max_rows` caps the number of data rows that are returned.
    #[test]
    fn parse_with_max_rows() {
        let content = "1,2\n3,4\n5,6\n";
        let opts = TextParseOptions {
            max_rows: Some(2),
            ..Default::default()
        };
        let (cells, nrows, ncols) = parse_text_grid(content, &opts).unwrap();
        assert_eq!(nrows, 2);
        assert_eq!(ncols, 2);
        assert_eq!(cells, vec!["1", "2", "3", "4"]);
    }

    /// Skipping more rows than exist yields an empty grid, not an error.
    #[test]
    fn parse_skiprows_past_end() {
        let content = "1,2\n3,4\n";
        let opts = TextParseOptions {
            skiprows: 5,
            ..Default::default()
        };
        let (cells, nrows, ncols) = parse_text_grid(content, &opts).unwrap();
        assert!(cells.is_empty());
        assert_eq!(nrows, 0);
        assert_eq!(ncols, 0);
    }

    /// Whitespace around cells is trimmed.
    #[test]
    fn parse_trims_cell_whitespace() {
        let content = " 1 , 2 \n3 ,4\n";
        let opts = TextParseOptions::default();
        let (cells, _, _) = parse_text_grid(content, &opts).unwrap();
        assert_eq!(cells, vec!["1", "2", "3", "4"]);
    }

    /// Empty fields (interior or trailing) become `None`.
    #[test]
    fn parse_missing_values() {
        let content = "1,2,3\n4,,6\n7,8,\n";
        let opts = TextParseOptions::default();
        let (cells, nrows, ncols) = parse_text_grid_with_missing(content, &opts, &[]).unwrap();
        assert_eq!(nrows, 3);
        assert_eq!(ncols, 3);
        assert_eq!(cells[0], Some("1".to_string()));
        assert_eq!(cells[4], None); // empty field
        assert_eq!(cells[8], None); // trailing empty
    }

    /// Cells equal to a caller-supplied marker become `None`.
    #[test]
    fn parse_custom_missing_marker() {
        let content = "1,NA,3\n4,5,NA\n";
        let opts = TextParseOptions::default();
        let (cells, _, _) = parse_text_grid_with_missing(content, &opts, &["NA"]).unwrap();
        assert_eq!(cells[1], None);
        assert_eq!(cells[5], None);
        assert_eq!(cells[0], Some("1".to_string()));
    }

    /// Rows shorter than the first row are padded on the right with `None`.
    #[test]
    fn parse_missing_short_row_is_padded() {
        let content = "1,2,3\n4,5\n";
        let opts = TextParseOptions::default();
        let (cells, nrows, ncols) = parse_text_grid_with_missing(content, &opts, &[]).unwrap();
        assert_eq!(nrows, 2);
        assert_eq!(ncols, 3);
        assert_eq!(cells.len(), 6);
        assert_eq!(cells[4], Some("5".to_string()));
        assert_eq!(cells[5], None);
    }

    /// Rows longer than the first row are still an error in the lenient parser.
    #[test]
    fn parse_missing_extra_columns_error() {
        let content = "1,2\n3,4,5\n";
        let opts = TextParseOptions::default();
        assert!(parse_text_grid_with_missing(content, &opts, &[]).is_err());
    }

    /// Empty input yields an empty grid with zero rows and columns.
    #[test]
    fn parse_empty_content() {
        let content = "";
        let opts = TextParseOptions::default();
        let (cells, nrows, ncols) = parse_text_grid(content, &opts).unwrap();
        assert_eq!(nrows, 0);
        assert_eq!(ncols, 0);
        assert!(cells.is_empty());
    }
}