Skip to main content

ferray_io/text/
parser.rs

1// ferray-io: Delimited text parser with missing value handling
2
3use ferray_core::error::{FerrayError, FerrayResult};
4
/// Options controlling how delimited text is split into rows and cells.
///
/// Derives `PartialEq`/`Eq` so option sets can be compared directly, e.g. in
/// tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextParseOptions {
    /// Column delimiter character (default: `','`).
    pub delimiter: char,
    /// Number of leading rows to skip (default: `0`).
    ///
    /// Rows are counted after blank lines and comment lines have been removed.
    pub skiprows: usize,
    /// Comment character (default: `Some('#')`): lines starting with it
    /// (after trimming) are skipped. `None` disables comment handling.
    pub comments: Option<char>,
    /// Maximum number of rows to read (default: `None` = all).
    pub max_rows: Option<usize>,
}
17
18impl Default for TextParseOptions {
19    fn default() -> Self {
20        Self {
21            delimiter: ',',
22            skiprows: 0,
23            comments: Some('#'),
24            max_rows: None,
25        }
26    }
27}
28
29/// Parse a delimited text into a 2D grid of string cells.
30///
31/// Returns `(rows, ncols)` where `rows` is a flat vector of cells in row-major order.
32pub fn parse_text_grid(
33    content: &str,
34    opts: &TextParseOptions,
35) -> FerrayResult<(Vec<String>, usize, usize)> {
36    let all_lines: Vec<&str> = content.lines().collect();
37
38    // Filter comments and empty lines first, then skip rows.
39    // This matches NumPy behavior: comments are stripped, then skiprows
40    // counts over the remaining data lines.
41    let non_comment_lines: Vec<&str> = all_lines
42        .iter()
43        .filter(|line| {
44            let trimmed = line.trim();
45            if trimmed.is_empty() {
46                return false;
47            }
48            if let Some(comment_char) = opts.comments {
49                if trimmed.starts_with(comment_char) {
50                    return false;
51                }
52            }
53            true
54        })
55        .copied()
56        .collect();
57
58    // Skip initial rows
59    let data_lines: &[&str] = if opts.skiprows < non_comment_lines.len() {
60        &non_comment_lines[opts.skiprows..]
61    } else {
62        &[]
63    };
64
65    // Apply max_rows
66    let data_lines = if let Some(max) = opts.max_rows {
67        &data_lines[..data_lines.len().min(max)]
68    } else {
69        data_lines
70    };
71
72    if data_lines.is_empty() {
73        return Ok((vec![], 0, 0));
74    }
75
76    let delim = opts.delimiter;
77    let nrows = data_lines.len();
78
79    // Parse first line to determine number of columns
80    let first_fields: Vec<&str> = data_lines[0].split(delim).collect();
81    let ncols = first_fields.len();
82
83    let mut cells = Vec::with_capacity(nrows * ncols);
84
85    for (row_idx, line) in data_lines.iter().enumerate() {
86        let fields: Vec<&str> = line.split(delim).collect();
87        if fields.len() != ncols {
88            return Err(FerrayError::io_error(format!(
89                "row {} has {} columns, expected {} (line: '{}')",
90                row_idx + opts.skiprows,
91                fields.len(),
92                ncols,
93                line,
94            )));
95        }
96        for field in fields {
97            cells.push(field.trim().to_string());
98        }
99    }
100
101    Ok((cells, nrows, ncols))
102}
103
104/// Parse a delimited text into a 2D grid, allowing missing values.
105///
106/// Missing values (empty cells or cells matching `missing_values`) are returned
107/// as `None`. Present values are returned as `Some(String)`.
108pub fn parse_text_grid_with_missing(
109    content: &str,
110    opts: &TextParseOptions,
111    missing_values: &[&str],
112) -> FerrayResult<(Vec<Option<String>>, usize, usize)> {
113    let all_lines: Vec<&str> = content.lines().collect();
114
115    // Filter comments and empty lines first, then skip rows.
116    let non_comment_lines: Vec<&str> = all_lines
117        .iter()
118        .filter(|line| {
119            let trimmed = line.trim();
120            if trimmed.is_empty() {
121                return false;
122            }
123            if let Some(comment_char) = opts.comments {
124                if trimmed.starts_with(comment_char) {
125                    return false;
126                }
127            }
128            true
129        })
130        .copied()
131        .collect();
132
133    let data_lines: &[&str] = if opts.skiprows < non_comment_lines.len() {
134        &non_comment_lines[opts.skiprows..]
135    } else {
136        &[]
137    };
138
139    let data_lines = if let Some(max) = opts.max_rows {
140        &data_lines[..data_lines.len().min(max)]
141    } else {
142        data_lines
143    };
144
145    if data_lines.is_empty() {
146        return Ok((vec![], 0, 0));
147    }
148
149    let delim = opts.delimiter;
150    let nrows = data_lines.len();
151
152    let first_fields: Vec<&str> = data_lines[0].split(delim).collect();
153    let ncols = first_fields.len();
154
155    let mut cells = Vec::with_capacity(nrows * ncols);
156
157    for (row_idx, line) in data_lines.iter().enumerate() {
158        let fields: Vec<&str> = line.split(delim).collect();
159        // Allow rows with fewer columns (missing trailing values)
160        for col_idx in 0..ncols {
161            if col_idx >= fields.len() {
162                cells.push(None);
163            } else {
164                let field = fields[col_idx].trim();
165                if field.is_empty() || missing_values.contains(&field) {
166                    cells.push(None);
167                } else {
168                    cells.push(Some(field.to_string()));
169                }
170            }
171        }
172        // Extra columns beyond ncols generate an error
173        if fields.len() > ncols {
174            return Err(FerrayError::io_error(format!(
175                "row {} has {} columns, expected {} (line: '{}')",
176                row_idx + opts.skiprows,
177                fields.len(),
178                ncols,
179                line,
180            )));
181        }
182    }
183
184    Ok((cells, nrows, ncols))
185}
186
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain comma-separated grid is returned as row-major cells.
    #[test]
    fn parse_simple_csv() {
        let options = TextParseOptions {
            delimiter: ',',
            ..TextParseOptions::default()
        };
        let (grid, row_count, col_count) = parse_text_grid("1,2,3\n4,5,6\n", &options).unwrap();
        assert_eq!(row_count, 2);
        assert_eq!(col_count, 3);
        assert_eq!(grid, vec!["1", "2", "3", "4", "5", "6"]);
    }

    /// The comment line is dropped first, so `skiprows: 1` removes the
    /// `name,value` header row.
    #[test]
    fn parse_with_skiprows() {
        let text = "# header\nname,value\n1,10\n2,20\n";
        let options = TextParseOptions {
            delimiter: ',',
            skiprows: 1,
            comments: Some('#'),
            ..TextParseOptions::default()
        };
        let (grid, row_count, col_count) = parse_text_grid(text, &options).unwrap();
        assert_eq!(row_count, 2);
        assert_eq!(col_count, 2);
        assert_eq!(grid[0], "1");
    }

    /// A comment line between data rows does not count as a row.
    #[test]
    fn parse_with_comments() {
        let text = "1,2\n# comment\n3,4\n";
        let options = TextParseOptions {
            delimiter: ',',
            comments: Some('#'),
            ..TextParseOptions::default()
        };
        let (grid, row_count, _) = parse_text_grid(text, &options).unwrap();
        assert_eq!(row_count, 2);
        assert_eq!(grid, vec!["1", "2", "3", "4"]);
    }

    /// Tabs work as a delimiter.
    #[test]
    fn parse_tab_delimited() {
        let options = TextParseOptions {
            delimiter: '\t',
            ..TextParseOptions::default()
        };
        let (grid, row_count, col_count) =
            parse_text_grid("1\t2\t3\n4\t5\t6\n", &options).unwrap();
        assert_eq!(row_count, 2);
        assert_eq!(col_count, 3);
        assert_eq!(grid[0], "1");
    }

    /// The strict parser rejects a row whose width differs from the first row.
    #[test]
    fn parse_inconsistent_columns_error() {
        let outcome = parse_text_grid("1,2,3\n4,5\n", &TextParseOptions::default());
        assert!(outcome.is_err());
    }

    /// Empty cells, including a trailing one, come back as `None`.
    #[test]
    fn parse_missing_values() {
        let text = "1,2,3\n4,,6\n7,8,\n";
        let options = TextParseOptions::default();
        let (grid, row_count, col_count) =
            parse_text_grid_with_missing(text, &options, &[]).unwrap();
        assert_eq!(row_count, 3);
        assert_eq!(col_count, 3);
        assert_eq!(grid[0], Some("1".to_string()));
        assert_eq!(grid[4], None); // empty field
        assert_eq!(grid[8], None); // trailing empty
    }

    /// Cells equal to a caller-supplied marker are treated as missing.
    #[test]
    fn parse_custom_missing_marker() {
        let text = "1,NA,3\n4,5,NA\n";
        let options = TextParseOptions::default();
        let (grid, _, _) = parse_text_grid_with_missing(text, &options, &["NA"]).unwrap();
        assert_eq!(grid[1], None);
        assert_eq!(grid[5], None);
        assert_eq!(grid[0], Some("1".to_string()));
    }

    /// Empty input yields an empty grid with zero rows and columns.
    #[test]
    fn parse_empty_content() {
        let options = TextParseOptions::default();
        let (grid, row_count, col_count) = parse_text_grid("", &options).unwrap();
        assert_eq!(row_count, 0);
        assert_eq!(col_count, 0);
        assert!(grid.is_empty());
    }
}
283}