Skip to main content

ferray_io/text/
mod.rs

1// ferray-io: Text I/O
2//
3// REQ-7: savetxt(path, &array, delimiter, fmt) writes 2D array as delimited text
4// REQ-8: loadtxt::<T>(path, delimiter, skiprows) reads delimited text into 2D array
5// REQ-9: genfromtxt(path, delimiter, filling_values) reads text with missing value handling
6
7pub mod parser;
8
9use std::fmt::Display;
10use std::fs;
11use std::io::Write;
12use std::path::Path;
13use std::str::FromStr;
14
15use ferray_core::Array;
16use ferray_core::dimension::Ix2;
17use ferray_core::dtype::Element;
18use ferray_core::error::{FerrayError, FerrayResult};
19
20use self::parser::{TextParseOptions, parse_text_grid, parse_text_grid_with_missing};
21
/// Options for saving text files.
///
/// Consumed by [`savetxt`] and [`savetxt_to_writer`]. The `Default`
/// implementation yields comma-delimited output with `"\n"` line endings and
/// no header, footer, or custom element format.
#[derive(Debug, Clone)]
pub struct SaveTxtOptions {
    /// Column delimiter written between adjacent values of a row (default: ',').
    pub delimiter: char,
    /// Format string applied to each element individually. Accepts `NumPy`
    /// printf-style patterns (e.g. `"%.6e"`) as well as Rust-style patterns
    /// (e.g. `"{:.6}"`).
    /// If `None`, the default `Display` formatting is used.
    pub fmt: Option<String>,
    /// Optional header line written before data (followed by `newline`).
    pub header: Option<String>,
    /// Optional footer line written after data (followed by `newline`).
    pub footer: Option<String>,
    /// Line ending written after every row (default: "\n").
    pub newline: String,
}
37
38impl Default for SaveTxtOptions {
39    fn default() -> Self {
40        Self {
41            delimiter: ',',
42            fmt: None,
43            header: None,
44            footer: None,
45            newline: "\n".to_string(),
46        }
47    }
48}
49
/// Pad `text` with spaces to at least `width` characters.
///
/// `align` is `'<'` (left), `'^'` (center); any other value right-aligns.
fn pad_to_width(text: &str, width: usize, align: char) -> String {
    match align {
        '<' => format!("{text:<width$}"),
        '^' => format!("{text:^width$}"),
        _ => format!("{text:>width$}"),
    }
}

/// Split a `[width][.precision]` layout into its numeric parts.
///
/// A missing width is reported as `0` (no padding). Returns `None` when
/// either part is present but is not a valid unsigned number.
fn parse_width_precision(layout: &str) -> Option<(usize, Option<usize>)> {
    let (width_part, precision_part) = match layout.split_once('.') {
        Some((w, p)) => (w, Some(p)),
        None => (layout, None),
    };
    let width = if width_part.is_empty() {
        0
    } else {
        width_part.parse::<usize>().ok()?
    };
    let precision = match precision_part {
        Some(p) => Some(p.parse::<usize>().ok()?),
        None => None,
    };
    Some((width, precision))
}

/// Render `v` in the requested notation: `'e'` / `'E'` select scientific
/// notation (lower / upper case exponent marker); anything else is fixed
/// notation when a precision is given and plain `Display` otherwise.
fn format_number(v: f64, precision: Option<usize>, notation: char) -> String {
    match (notation, precision) {
        ('e', Some(p)) => format!("{v:.p$e}"),
        ('E', Some(p)) => format!("{v:.p$E}"),
        (_, Some(p)) => format!("{v:.p$}"),
        ('e', None) => format!("{v:e}"),
        ('E', None) => format!("{v:E}"),
        (_, None) => format!("{v}"),
    }
}

/// Apply the inside of a Rust-style `{:spec}` placeholder, where `spec` is
/// `[<|>|^][width][.precision][e|E]`.
///
/// `plain` is the value's `Display` text and `numeric` its `f64` reading, if
/// it has one. Returns `None` when the spec is not understood, or when it asks
/// for numeric formatting of a value that is not numeric.
fn format_rust_spec(plain: &str, numeric: Option<f64>, spec: &str) -> Option<String> {
    let (spec, notation) = if let Some(rest) = spec.strip_suffix('e') {
        (rest, 'e')
    } else if let Some(rest) = spec.strip_suffix('E') {
        (rest, 'E')
    } else {
        (spec, 'f')
    };
    let (align, layout) = match spec.chars().next() {
        // The alignment markers are ASCII, so skipping one byte is safe.
        Some(c @ ('<' | '>' | '^')) => (Some(c), &spec[1..]),
        _ => (None, spec),
    };
    let (width, precision) = parse_width_precision(layout)?;

    let text = if precision.is_some() || notation != 'f' {
        format_number(numeric?, precision, notation)
    } else {
        // Width/alignment only: keep the value's own `Display` text so that
        // no precision is lost by going through `f64`.
        plain.to_owned()
    };
    // Same defaults as `format!`: numbers right-aligned, other text left-aligned.
    let default_align = if numeric.is_some() { '>' } else { '<' };
    Some(pad_to_width(&text, width, align.unwrap_or(default_align)))
}

/// Apply a printf-style conversion (the text after `%`), where `spec` is
/// `[flags][width][.precision](e|E|f|g)`.
///
/// Only the `-` flag (left-justify) is honored; `+`, space, `0` and `#` are
/// accepted but have no effect. `g` is approximated: fixed notation when a
/// precision is given, plain `Display` otherwise. Returns `None` for any
/// other conversion (e.g. `%d`), for a malformed spec, or for a non-numeric
/// value.
fn format_printf_spec(numeric: Option<f64>, spec: &str) -> Option<String> {
    let notation = spec
        .chars()
        .next_back()
        .filter(|c| matches!(*c, 'e' | 'E' | 'f' | 'g'))?;
    // `notation` is a single ASCII byte, so this slice ends on a char boundary.
    let body = &spec[..spec.len() - 1];
    let v = numeric?;

    let flag_len = body
        .bytes()
        .take_while(|b| matches!(*b, b'-' | b'+' | b' ' | b'0' | b'#'))
        .count();
    let (flags, layout) = body.split_at(flag_len);
    let (width, precision) = parse_width_precision(layout)?;
    let align = if flags.contains('-') { '<' } else { '>' };
    Some(pad_to_width(&format_number(v, precision, notation), width, align))
}

/// Format a single value using a format string.
///
/// Supports:
/// - `NumPy` printf-style: `"%.6e"`, `"%.18E"`, `"%10.5f"`, `"%-10.4f"`, `"%d"`
///   (the field width is honored; `%d` / `%i` use default Display)
/// - Rust-style with braces: `"{:.6}"`, `"{:.6e}"`, `"{:.6E}"`, `"{:>10.5}"`
/// - Templates containing `"{}"` (e.g. `"{}"`, `"x={}"`): every `{}` is
///   replaced by the default Display text
///
/// Numeric patterns are applied to the value's `Display` text re-read as an
/// `f64`. Unrecognized patterns, and numeric patterns applied to values that
/// are not numeric, fall back to default Display formatting; the format
/// string itself is never emitted in place of the value.
fn format_value<T: Display>(val: &T, fmt_str: &str) -> String {
    let plain = val.to_string();
    let numeric = plain.parse::<f64>().ok();

    // Rust-style: anything containing a brace.
    if fmt_str.contains('{') {
        let spec = fmt_str
            .strip_prefix("{:")
            .and_then(|rest| rest.strip_suffix('}'));
        if let Some(text) = spec.and_then(|s| format_rust_spec(&plain, numeric, s)) {
            return text;
        }
        // Simple template substitution, but only when there is actually a
        // placeholder to substitute.
        if fmt_str.contains("{}") {
            return fmt_str.replace("{}", &plain);
        }
        return plain;
    }

    // NumPy printf-style: starts with "%".
    if let Some(text) = fmt_str
        .strip_prefix('%')
        .and_then(|spec| format_printf_spec(numeric, spec))
    {
        return text;
    }

    // Unrecognized format: default Display.
    plain
}
124
125/// Save a 2D array as delimited text.
126///
127/// The `fmt` field in [`SaveTxtOptions`] supports:
128/// - `NumPy` printf-style: `"%.6e"`, `"%.18e"`, `"%10.5f"`, `"%d"`
129/// - Rust-style: `"{:.6}"`, `"{:.6e}"`
130/// - Default: `None` uses standard `Display` formatting.
131///
132/// # Errors
133/// Returns `FerrayError::IoError` on file write failures.
134/// Returns `FerrayError::IoError` if the array is not contiguous.
135pub fn savetxt<T: Element + Display, P: AsRef<Path>>(
136    path: P,
137    array: &Array<T, Ix2>,
138    opts: &SaveTxtOptions,
139) -> FerrayResult<()> {
140    let mut file = std::fs::File::create(path.as_ref()).map_err(|e| {
141        FerrayError::io_error(format!(
142            "failed to create file '{}': {e}",
143            path.as_ref().display()
144        ))
145    })?;
146
147    savetxt_to_writer(&mut file, array, opts)
148}
149
150/// Save a 2D array as delimited text to a writer.
151pub fn savetxt_to_writer<T: Element + Display, W: Write>(
152    writer: &mut W,
153    array: &Array<T, Ix2>,
154    opts: &SaveTxtOptions,
155) -> FerrayResult<()> {
156    let shape = array.shape();
157    let nrows = shape[0];
158    let ncols = shape[1];
159
160    if let Some(ref header) = opts.header {
161        write!(writer, "{header}").map_err(|e| FerrayError::io_error(e.to_string()))?;
162        writer
163            .write_all(opts.newline.as_bytes())
164            .map_err(|e| FerrayError::io_error(e.to_string()))?;
165    }
166
167    let slice = array
168        .as_slice()
169        .ok_or_else(|| FerrayError::io_error("cannot save non-contiguous array as text"))?;
170
171    for row in 0..nrows {
172        for col in 0..ncols {
173            if col > 0 {
174                write!(writer, "{}", opts.delimiter)
175                    .map_err(|e| FerrayError::io_error(e.to_string()))?;
176            }
177            let val = &slice[row * ncols + col];
178            if let Some(ref fmt_str) = opts.fmt {
179                // Format string support:
180                // - NumPy printf-style: "%.6e", "%.18e", "%10.5f", "%d"
181                // - Rust-style: "{:.6}", "{:.6e}", "{:>10.5}"
182                // We convert common printf patterns to Rust format, then
183                // fall back to string substitution.
184                let formatted = format_value(val, fmt_str);
185                write!(writer, "{formatted}").map_err(|e| FerrayError::io_error(e.to_string()))?;
186            } else {
187                write!(writer, "{val}").map_err(|e| FerrayError::io_error(e.to_string()))?;
188            }
189        }
190        writer
191            .write_all(opts.newline.as_bytes())
192            .map_err(|e| FerrayError::io_error(e.to_string()))?;
193    }
194
195    if let Some(ref footer) = opts.footer {
196        write!(writer, "{footer}").map_err(|e| FerrayError::io_error(e.to_string()))?;
197        writer
198            .write_all(opts.newline.as_bytes())
199            .map_err(|e| FerrayError::io_error(e.to_string()))?;
200    }
201
202    writer
203        .flush()
204        .map_err(|e| FerrayError::io_error(e.to_string()))?;
205    Ok(())
206}
207
208/// Load a delimited text file into a 2D array.
209///
210/// Each row of the text file becomes a row in the array. All rows must
211/// have the same number of columns.
212///
213/// # Type Parameters
214/// - `T`: Element type to parse each cell into. Must implement `FromStr`.
215///
216/// # Errors
217/// - Returns `FerrayError::IoError` on file read or parse failures.
218pub fn loadtxt<T, P>(path: P, delimiter: char, skiprows: usize) -> FerrayResult<Array<T, Ix2>>
219where
220    T: Element + FromStr,
221    T::Err: Display,
222    P: AsRef<Path>,
223{
224    let content = fs::read_to_string(path.as_ref()).map_err(|e| {
225        FerrayError::io_error(format!(
226            "failed to read file '{}': {e}",
227            path.as_ref().display()
228        ))
229    })?;
230
231    loadtxt_from_str(&content, delimiter, skiprows)
232}
233
234/// Load delimited text from a string into a 2D array.
235pub fn loadtxt_from_str<T>(
236    content: &str,
237    delimiter: char,
238    skiprows: usize,
239) -> FerrayResult<Array<T, Ix2>>
240where
241    T: Element + FromStr,
242    T::Err: Display,
243{
244    let opts = TextParseOptions {
245        delimiter,
246        skiprows,
247        ..Default::default()
248    };
249
250    let (cells, nrows, ncols) = parse_text_grid(content, &opts)?;
251
252    if nrows == 0 {
253        return Array::from_vec(Ix2::new([0, 0]), vec![]);
254    }
255
256    let data: FerrayResult<Vec<T>> = cells
257        .iter()
258        .enumerate()
259        .map(|(i, cell)| {
260            cell.parse::<T>().map_err(|e| {
261                let row = i / ncols;
262                let col = i % ncols;
263                FerrayError::io_error(format!(
264                    "failed to parse value '{cell}' at row {row}, col {col}: {e}"
265                ))
266            })
267        })
268        .collect();
269
270    let data = data?;
271    Array::from_vec(Ix2::new([nrows, ncols]), data)
272}
273
274/// Load a delimited text file with missing value handling.
275///
276/// Missing values (empty cells or cells matching common missing indicators)
277/// are replaced with `filling_values`. This is analogous to `NumPy`'s `genfromtxt`.
278///
279/// Returns a 2D `f64` array where missing values are replaced with `filling_value`
280/// (typically `f64::NAN`).
281///
282/// # Errors
283/// Returns `FerrayError::IoError` on file read or parse failures.
284pub fn genfromtxt<P: AsRef<Path>>(
285    path: P,
286    delimiter: char,
287    filling_value: f64,
288    skiprows: usize,
289    missing_values: &[&str],
290) -> FerrayResult<Array<f64, Ix2>> {
291    let content = fs::read_to_string(path.as_ref()).map_err(|e| {
292        FerrayError::io_error(format!(
293            "failed to read file '{}': {e}",
294            path.as_ref().display()
295        ))
296    })?;
297
298    genfromtxt_from_str(&content, delimiter, filling_value, skiprows, missing_values)
299}
300
301/// Load delimited text from a string with missing value handling.
302pub fn genfromtxt_from_str(
303    content: &str,
304    delimiter: char,
305    filling_value: f64,
306    skiprows: usize,
307    missing_values: &[&str],
308) -> FerrayResult<Array<f64, Ix2>> {
309    let opts = TextParseOptions {
310        delimiter,
311        skiprows,
312        ..Default::default()
313    };
314
315    // Default missing markers
316    let mut all_missing: Vec<&str> = vec!["", "NA", "N/A", "nan", "NaN", "NAN", "--", "null"];
317    for mv in missing_values {
318        if !all_missing.contains(mv) {
319            all_missing.push(mv);
320        }
321    }
322
323    let (cells, nrows, ncols) = parse_text_grid_with_missing(content, &opts, &all_missing)?;
324
325    if nrows == 0 {
326        return Array::from_vec(Ix2::new([0, 0]), vec![]);
327    }
328
329    let data: FerrayResult<Vec<f64>> = cells
330        .iter()
331        .enumerate()
332        .map(|(i, cell)| match cell {
333            None => Ok(filling_value),
334            Some(s) => s.parse::<f64>().map_err(|e| {
335                let row = i / ncols;
336                let col = i % ncols;
337                FerrayError::io_error(format!(
338                    "failed to parse value '{s}' at row {row}, col {col}: {e}"
339                ))
340            }),
341        })
342        .collect();
343
344    let data = data?;
345    Array::from_vec(Ix2::new([nrows, ncols]), data)
346}
347
348// ---------------------------------------------------------------------------
349// fromregex
350// ---------------------------------------------------------------------------
351
352/// Read text using a regular expression to extract structured groups.
353///
354/// `regex` must contain at least one capturing group. For every line in
355/// `content`, the regex is matched against the full line; matches where every
356/// capture is parsed successfully via `T::from_str` produce one row of the
357/// output. Lines that do not match (or that contain unparseable captures)
358/// are skipped.
359///
360/// The result is a 2-D `Array<T, Ix2>` of shape `(rows, captures)`.
361///
362/// Analogous to `numpy.fromregex`. NumPy's structured-dtype support is not
363/// modeled here — every capture group must parse to the same `T`; for mixed
364/// dtypes use one call per column or the structured-record API in
365/// `ferray-core::record`.
366///
367/// # Errors
368/// - `FerrayError::InvalidValue` if the regex cannot be compiled or contains
369///   no capture groups.
370pub fn fromregex<T>(content: &str, regex: &str) -> FerrayResult<Array<T, Ix2>>
371where
372    T: Element + FromStr,
373    T::Err: Display,
374{
375    let re = regex::Regex::new(regex)
376        .map_err(|e| FerrayError::invalid_value(format!("fromregex: invalid regex: {e}")))?;
377    let n_groups = re.captures_len().saturating_sub(1);
378    if n_groups == 0 {
379        return Err(FerrayError::invalid_value(
380            "fromregex: regex must contain at least one capture group",
381        ));
382    }
383    let mut data: Vec<T> = Vec::new();
384    let mut nrows = 0usize;
385    'lines: for line in content.lines() {
386        if let Some(caps) = re.captures(line) {
387            // Try to parse every capture group into T. If any fails, skip this row.
388            let start = data.len();
389            for g in 1..=n_groups {
390                let m = caps.get(g).map_or("", |m| m.as_str());
391                match m.parse::<T>() {
392                    Ok(v) => data.push(v),
393                    Err(_) => {
394                        // Roll back this row's pushes, then continue with next line.
395                        data.truncate(start);
396                        continue 'lines;
397                    }
398                }
399            }
400            nrows += 1;
401        }
402    }
403    Array::from_vec(Ix2::new([nrows, n_groups]), data)
404}
405
406/// Read regex-extracted rows from a file, parsing every capture group as `T`.
407///
408/// Convenience wrapper that reads `path` to a string and calls [`fromregex`].
409///
410/// # Errors
411/// - `FerrayError::IoError` if the file cannot be read.
412/// - Errors from [`fromregex`] (regex compile / no groups).
413pub fn fromregex_from_file<T, P>(path: P, regex: &str) -> FerrayResult<Array<T, Ix2>>
414where
415    T: Element + FromStr,
416    T::Err: Display,
417    P: AsRef<Path>,
418{
419    let content = fs::read_to_string(path.as_ref()).map_err(|e| {
420        FerrayError::io_error(format!(
421            "fromregex: failed to read file '{}': {e}",
422            path.as_ref().display()
423        ))
424    })?;
425    fromregex::<T>(&content, regex)
426}
427
// Unit tests for the text I/O entry points in this module. Most tests go
// through the `*_from_str` / `*_to_writer` variants so they need no files;
// the file-based tests write into a per-process directory under the system
// temp dir and remove the file they created afterwards.
#[cfg(test)]
#[allow(clippy::float_cmp)] // Roundtrip tests assert exact equality on hand-picked text values.
mod tests {
    use super::*;

    // Two rows of three comma-separated floats load as a 2x3 array in row order.
    #[test]
    fn loadtxt_simple_csv() {
        let content = "1.0,2.0,3.0\n4.0,5.0,6.0\n";
        let arr: Array<f64, Ix2> = loadtxt_from_str(content, ',', 0).unwrap();
        assert_eq!(arr.shape(), &[2, 3]);
        assert_eq!(arr.as_slice().unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);
    }

    // With `skiprows = 1` only the two numeric rows are expected to remain.
    // NOTE(review): the input has both a `#` line and a column-name line ahead
    // of the data; how each is dropped depends on `parser` behavior that is
    // not visible in this file.
    #[test]
    fn loadtxt_with_skiprows() {
        let content = "# header\nname,value\n1.0,10.0\n2.0,20.0\n";
        let arr: Array<f64, Ix2> = loadtxt_from_str(content, ',', 1).unwrap();
        assert_eq!(arr.shape(), &[2, 2]);
        assert_eq!(arr.as_slice().unwrap()[0], 1.0);
    }

    // A tab works as the delimiter, and cells can be parsed as `i32`.
    #[test]
    fn loadtxt_tab_delimited() {
        let content = "1\t2\t3\n4\t5\t6\n";
        let arr: Array<i32, Ix2> = loadtxt_from_str(content, '\t', 0).unwrap();
        assert_eq!(arr.shape(), &[2, 3]);
        assert_eq!(arr.as_slice().unwrap(), &[1, 2, 3, 4, 5, 6]);
    }

    // Cells can be parsed as `i64`.
    #[test]
    fn loadtxt_integers() {
        let content = "10,20\n30,40\n";
        let arr: Array<i64, Ix2> = loadtxt_from_str(content, ',', 0).unwrap();
        assert_eq!(arr.as_slice().unwrap(), &[10i64, 20, 30, 40]);
    }

    // `savetxt` followed by `loadtxt` with default options reproduces the
    // original shape and values.
    #[test]
    fn loadtxt_file_roundtrip() {
        let data = vec![1.0f64, 2.0, 3.0, 4.0, 5.0, 6.0];
        let arr = Array::<f64, Ix2>::from_vec(Ix2::new([2, 3]), data.clone()).unwrap();

        // Directory name includes the process id to keep concurrent test
        // processes apart; creation errors are deliberately ignored.
        let dir = std::env::temp_dir().join(format!("ferray_io_text_{}", std::process::id()));
        let _ = std::fs::create_dir_all(&dir);
        let path = dir.join("test.csv");

        savetxt(&path, &arr, &SaveTxtOptions::default()).unwrap();
        let loaded: Array<f64, Ix2> = loadtxt(&path, ',', 0).unwrap();

        assert_eq!(loaded.shape(), &[2, 3]);
        assert_eq!(loaded.as_slice().unwrap(), &data[..]);
        // Best-effort cleanup of the file (the directory is left in place).
        let _ = std::fs::remove_file(&path);
    }

    // Overriding the delimiter replaces the default comma entirely.
    #[test]
    fn savetxt_custom_delimiter() {
        let data = vec![1.0f64, 2.0, 3.0, 4.0];
        let arr = Array::<f64, Ix2>::from_vec(Ix2::new([2, 2]), data).unwrap();

        let mut buf = Vec::new();
        let opts = SaveTxtOptions {
            delimiter: '\t',
            ..Default::default()
        };
        savetxt_to_writer(&mut buf, &arr, &opts).unwrap();
        let output = String::from_utf8(buf).unwrap();
        assert!(output.contains('\t'));
        assert!(!output.contains(','));
    }

    // Header and footer are written as the first and last line, each
    // terminated by the newline sequence.
    #[test]
    fn savetxt_with_header_footer() {
        let data = vec![1.0f64, 2.0];
        let arr = Array::<f64, Ix2>::from_vec(Ix2::new([1, 2]), data).unwrap();

        let mut buf = Vec::new();
        let opts = SaveTxtOptions {
            header: Some("# my header".to_string()),
            footer: Some("# end".to_string()),
            ..Default::default()
        };
        savetxt_to_writer(&mut buf, &arr, &opts).unwrap();
        let output = String::from_utf8(buf).unwrap();
        assert!(output.starts_with("# my header\n"));
        assert!(output.ends_with("# end\n"));
    }

    // Empty cells (in the middle and at the end of a row) become the filling value.
    #[test]
    fn genfromtxt_missing_nan() {
        let content = "1.0,2.0,3.0\n4.0,,6.0\n7.0,8.0,\n";
        let arr = genfromtxt_from_str(content, ',', f64::NAN, 0, &[]).unwrap();
        assert_eq!(arr.shape(), &[3, 3]);
        let slice = arr.as_slice().unwrap();
        assert_eq!(slice[0], 1.0);
        assert!(slice[4].is_nan()); // missing value replaced with NaN
        assert!(slice[8].is_nan()); // trailing empty
    }

    // Cells equal to a missing marker are replaced by a non-NaN filling value.
    #[test]
    fn genfromtxt_na_marker() {
        let content = "1.0,NA,3.0\n4.0,5.0,NA\n";
        let arr = genfromtxt_from_str(content, ',', -999.0, 0, &["NA"]).unwrap();
        assert_eq!(arr.shape(), &[2, 3]);
        let slice = arr.as_slice().unwrap();
        assert_eq!(slice[1], -999.0);
        assert_eq!(slice[5], -999.0);
    }

    // `skiprows = 1` drops the column-name line before parsing.
    #[test]
    fn genfromtxt_with_skiprows() {
        let content = "col1,col2\n1.0,2.0\n3.0,4.0\n";
        let arr = genfromtxt_from_str(content, ',', f64::NAN, 1, &[]).unwrap();
        assert_eq!(arr.shape(), &[2, 2]);
        assert_eq!(arr.as_slice().unwrap()[0], 1.0);
    }

    // The path-based `genfromtxt` handles a leading empty cell read from a real file.
    #[test]
    fn genfromtxt_file() {
        let content = "1.0,2.0\n,4.0\n";
        let dir = std::env::temp_dir().join(format!("ferray_io_text_{}", std::process::id()));
        let _ = std::fs::create_dir_all(&dir);
        let path = dir.join("genfromtxt_test.csv");
        std::fs::write(&path, content).unwrap();

        let arr = genfromtxt(&path, ',', f64::NAN, 0, &[]).unwrap();
        assert_eq!(arr.shape(), &[2, 2]);
        assert!(arr.as_slice().unwrap()[2].is_nan());
        // Best-effort cleanup of the file (the directory is left in place).
        let _ = std::fs::remove_file(&path);
    }

    // Empty input yields an empty `[0, 0]` array rather than an error.
    #[test]
    fn loadtxt_empty() {
        let content = "";
        let arr: Array<f64, Ix2> = loadtxt_from_str(content, ',', 0).unwrap();
        assert_eq!(arr.shape(), &[0, 0]);
    }

    // -- fromregex --

    #[test]
    fn fromregex_basic_one_group() {
        // Pull integers out of "value=NN" lines, ignore other lines.
        let s = "value=10\nvalue=20\nirrelevant\nvalue=30\n";
        let arr: Array<i32, Ix2> = fromregex(s, r"^value=(\d+)$").unwrap();
        assert_eq!(arr.shape(), &[3, 1]);
        assert_eq!(arr.as_slice().unwrap(), &[10, 20, 30]);
    }

    #[test]
    fn fromregex_multiple_groups() {
        // Two captures per row → shape (n, 2).
        let s = "1,2\n3,4\n5,6\n";
        let arr: Array<f64, Ix2> = fromregex(s, r"^([\d.]+),([\d.]+)$").unwrap();
        assert_eq!(arr.shape(), &[3, 2]);
        assert_eq!(arr.as_slice().unwrap(), &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]);
    }

    // A pattern without any capture group is rejected.
    #[test]
    fn fromregex_no_groups_errs() {
        let r: FerrayResult<Array<i32, Ix2>> = fromregex("a\nb\n", r"^[ab]$");
        assert!(r.is_err());
    }

    // A pattern that does not compile is rejected.
    #[test]
    fn fromregex_invalid_regex_errs() {
        let r: FerrayResult<Array<i32, Ix2>> = fromregex("", r"(unclosed");
        assert!(r.is_err());
    }

    #[test]
    fn fromregex_skips_unparseable_rows() {
        // Second row has a non-numeric capture; it should be skipped silently.
        let s = "v=10\nv=foo\nv=20\n";
        let arr: Array<i32, Ix2> = fromregex(s, r"^v=(\S+)$").unwrap();
        assert_eq!(arr.shape(), &[2, 1]);
        assert_eq!(arr.as_slice().unwrap(), &[10, 20]);
    }

    // The file-based wrapper returns the rows extracted from the file's contents.
    #[test]
    fn fromregex_from_file_roundtrip() {
        let dir = std::env::temp_dir().join(format!("ferray_io_fromregex_{}", std::process::id()));
        let _ = std::fs::create_dir_all(&dir);
        let path = dir.join("regex_test.txt");
        std::fs::write(&path, "x=1\nx=2\nx=3\n").unwrap();
        let arr: Array<i32, Ix2> = fromregex_from_file(&path, r"^x=(\d+)$").unwrap();
        assert_eq!(arr.shape(), &[3, 1]);
        assert_eq!(arr.as_slice().unwrap(), &[1, 2, 3]);
        // Best-effort cleanup of the file (the directory is left in place).
        let _ = std::fs::remove_file(&path);
    }
}