data_matrix/
datamatrix_builder.rs

1use flate2::read;
2use std::collections::HashMap;
3use std::ffi::OsStr;
4use std::fs::File;
5use std::io::{self, BufRead, BufReader, ErrorKind};
6use std::path::Path;
7
8use crate::{DataMatrix, Error};
9
/// A builder for loading labeled matrices from plain text, CSV, or TSV files.
///
/// [`DataMatrixBuilder`] provides flexible configuration for how files are parsed:
/// - specify which columns contain row labels, column labels, and values,
/// - optionally specify explicit row and column indices (for 5-column formats),
/// - control the separator (space, comma, tab, etc.),
/// - skip header lines,
/// - control whether the matrix should be symmetric.
///
/// # Supported formats
/// - **Three-column format**: row label, column label, value
/// - **Five-column format**: row label, column label, row index, column index, value
/// - **Single-column format**: raw values for a square matrix (handled separately); requires
///   labels provided by the user with `DataMatrixBuilder::labels()`.
///
/// Blank lines and lines starting with `#` are ignored.
///
/// # Examples
///
/// ## Reading a 5-column file (e.g., `five_columns_short.txt`)
/// ```text
/// # Comment lines are allowed
/// Alice Bob 0 1 1.5
/// Bob John 1 2 2.2
/// ```
///
/// ```rust
/// use data_matrix::{DataMatrixBuilder, Error};
/// # fn main() -> Result<(), Error> {
/// # let input_fname = "./tests/test_files/five_columns_short.txt";
/// let matrix = DataMatrixBuilder::new()
///     .label_columns(0, 1)    // columns 0 and 1: row and column labels
///     .index_columns(2, 3)    // columns 2 and 3: row and column indices
///     .data_column(4)         // column 4: value
///     .separator(' ')         // whitespace separator
///     .symmetric(true)        // make symmetric
///     .from_file(input_fname)?;
/// # assert_eq!(matrix.ncols(), 3);
/// # assert_eq!(matrix.nrows(), 3);
/// # Ok(())
/// # }
/// ```
///
/// ## Reading a 3-column file (e.g., `three_columns_short.txt`)
/// ```text
/// # Comment lines are allowed
/// Alice Bob 1.2
/// Bob John 2.4
/// ```
///
/// ```rust
/// use data_matrix::{DataMatrixBuilder, Error};
/// # fn main() -> Result<(), Error> {
/// # let input_fname = "./tests/test_files/three_columns_short.txt";
///
/// let matrix = DataMatrixBuilder::new()
///     .label_columns(0, 1)    // columns 0 and 1: row and column labels
///     .data_column(2)         // column 2: value
///     .separator(' ')         // whitespace separator
///     .symmetric(true)        // make symmetric
///     .skip_header(false)     // this is the default behaviour
///     .from_file(input_fname)?;
/// # assert_eq!(matrix.ncols(), 3);
/// # assert_eq!(matrix.nrows(), 3);
/// # Ok(())
/// # }
/// ```
///
/// # Notes
/// - Columns are indexed starting **from 0**.
/// - The field separator must be a single character (with an exception for `' '`, see below);
///   if not given, it is inferred from the file extension, e.g. `'\t'` for `.tsv`.
/// - When `' '` (a space) is used as the separator, the builder splits on any run of
///   whitespace, i.e. `str::split_whitespace()` is used.
/// - `.symmetric(true)` ensures that if (i,j) is set, (j,i) will also be set automatically.
#[derive(Debug, Clone)]
pub struct DataMatrixBuilder {
    /// 0-based column holding the row label (`new()` sets 0).
    row_label_col: usize,
    /// 0-based column holding the column label (`new()` sets 1).
    col_label_col: usize,
    /// 0-based column holding the numeric value (`new()` sets 2).
    data_col: usize,
    /// 0-based column holding an explicit row index; `None` unless `index_columns()` was called.
    row_idx_col: Option<usize>,
    /// 0-based column holding an explicit column index; `None` unless `index_columns()` was called.
    col_idx_col: Option<usize>,
    /// Field separator; when `None`, `from_file()` guesses it from the file extension.
    separator: Option<char>,
    /// Whether every `(row, col)` entry is mirrored to `(col, row)`.
    symmetric: bool,
    /// Whether the first non-blank, non-comment line is skipped as a header.
    skip_header: bool,
    /// User-supplied labels; when set, `from_file()` reads the file as a flat list of values.
    labels: Option<Vec<String>>,
}
97
98#[allow(clippy::new_without_default)]
99impl DataMatrixBuilder {
100    /// Creates just a new builder.
101    ///
102    /// Now use its methods to set up column indexes (e.g. [`label_columns()`](DataMatrixBuilder::label_columns)), then provide some data (e.g. [`from_file()`](DataMatrixBuilder::from_file))
103    pub fn new() -> Self {
104        Self {
105            row_label_col: 0,
106            col_label_col: 1,
107            data_col: 2,
108            row_idx_col: None,
109            col_idx_col: None,
110            separator: None,
111            symmetric: false,
112            skip_header: false,
113            labels: None,
114        }
115    }
116
117    /// Specifies which columns contain the row and column labels.
118    ///
119    /// Column indices are **0-based** (i.e., the first column is 0).
120    ///
121    /// # Arguments
122    /// * `row` — Column number for row labels.
123    /// * `col` — Column number for column labels.
124    ///
125    /// # Example
126    /// ```rust
127    /// use data_matrix::DataMatrixBuilder;
128    /// let mut builder = DataMatrixBuilder::new();
129    /// builder.label_columns(1, 2);
130    /// ```
131    pub fn label_columns(mut self, row: usize, col: usize) -> Self {
132        self.row_label_col = row;
133        self.col_label_col = col;
134        self
135    }
136
137    /// Provides labels for the case when the input data is a single column.
138    pub fn labels<I, S>(mut self, labels: I) -> Self
139    where
140        I: IntoIterator<Item = S>,
141        S: Into<String>,
142    {
143        self.labels = Some(labels.into_iter().map(Into::into).collect());
144        self
145    }
146
147    /// Specifies which column contains the numeric value.
148    ///
149    /// Column index is **0-based**.
150    pub fn data_column(mut self, val: usize) -> Self {
151        self.data_col = val;
152        self
153    }
154
155    /// Specifies which columns provide explicit row and column indices.
156    ///
157    /// Column indices are **0-based**.
158    ///
159    /// # Arguments
160    /// * `row_idx` — Column number for the row index.
161    /// * `col_idx` — Column number for the column index.
162    ///
163    /// # Example
164    /// ```rust
165    /// use data_matrix::DataMatrixBuilder;
166    /// let mut builder = DataMatrixBuilder::new();
167    /// builder.index_columns(3, 4);
168    /// ```
169    pub fn index_columns(mut self, row_idx: usize, col_idx: usize) -> Self {
170        self.row_idx_col = Some(row_idx);
171        self.col_idx_col = Some(col_idx);
172        self
173    }
174
175    /// Sets the character used to separate fields in the input file.
176    ///
177    /// Common choices: `' '`, `','`, `'\t'`.
178    pub fn separator(mut self, sep: char) -> Self {
179        self.separator = Some(sep);
180        self
181    }
182
183    /// If set to `true`, the first line of the file should be skipped as a header.
184    pub fn skip_header(mut self, if_header: bool) -> Self {
185        self.skip_header = if_header;
186        self
187    }
188
189    /// Sets whether the matrix should be treated as symmetric.
190    ///
191    /// If enabled, for every entry `(row, col, value)`, the symmetric entry `(col, row, value)`
192    /// is automatically added.
193    pub fn symmetric(mut self, if_symmetric: bool) -> Self {
194        self.symmetric = if_symmetric;
195        self
196    }
197
198    /// Creates a new [`DataMatrix`] from a given 1D vector of data.
199    ///
200    /// This method is devised to turn a 1D column of numbers into a **square** (usually symmetrix)
201    /// 2D [`DataMatrix`] object.
202    /// Labels should be provided with [`labels()`](DataMatrixBuilder::labels) method,
203    /// otherwise they will be automatically generated as `"row-{}", i + 1` and `col-{}", i + 1`
204    /// for rows and columns, respectively.
205    ///
206    /// # Examples
207    /// Creates a square matrix with automatically generated labels:
208    ///
209    /// ```rust
210    /// use data_matrix::{DataMatrixBuilder, Error};
211    /// # fn main() -> Result<(), Error> {
212    /// let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
213    /// let matrix = DataMatrixBuilder::new().from_data(&data).unwrap();
214    /// assert_eq!(matrix.ncols(), 3);
215    /// assert_eq!(matrix.get(0,0).unwrap(), 1.0);
216    /// assert_eq!(matrix.row_label(0), "row-1");
217    /// # Ok(())
218    /// # }
219    /// ```
220    ///
221    /// Creates a square symmetric matrix with user-defined labels:
222    ///
223    /// ```rust
224    /// use data_matrix::{DataMatrixBuilder, Error};
225    /// # fn main() -> Result<(), Error> {
226    /// let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
227    /// let labels = ["data-1", "data-2", "data-3"];
228    /// let matrix = DataMatrixBuilder::new().labels(labels).from_data(&data).unwrap();
229    /// assert_eq!(matrix.ncols(), 3);
230    /// assert_eq!(matrix.get(0,0).unwrap(), 1.0);
231    /// assert_eq!(matrix.row_label(0), "data-1");
232    /// # Ok(())
233    /// # }
234    /// ```
235    ///
236    pub fn from_data(self, data: &[f64]) -> Result<DataMatrix, Error> {
237        let len = data.len();
238        let n = (len as f64).sqrt() as usize;
239        if n * n != len {
240            return Err(Error::WrongNumberOfData { n_data: len });
241        }
242
243        let (row_labels, col_labels) = match &self.labels {
244            Some(given) => (given.clone(), given.clone()),
245            None => {
246                let rows = (0..n).map(|i| format!("row-{}", i + 1)).collect();
247                let cols = (0..n).map(|i| format!("col-{}", i + 1)).collect();
248                (rows, cols)
249            }
250        };
251
252        let mut matrix = Vec::with_capacity(n);
253        for i in 0..n {
254            let start = i * n;
255            let end = start + n;
256            matrix.push(data[start..end].to_vec());
257        }
258
259        DataMatrix::new(matrix, row_labels, col_labels)
260    }
261
262    /// Loads the matrix from the given file path according to the current builder settings.
263    pub fn from_file<P: AsRef<Path>>(self, filename: P) -> Result<DataMatrix, Error> {
264        if let Some(ref labels) = self.labels {
265            return self.read_one_column(filename, self.data_col, labels.clone());
266        }
267
268        let mut row_indexer = Indexer::new();
269        let mut col_indexer = Indexer::new();
270
271        let separator = match self.separator {
272            None => guess_separator(&filename),
273            Some(c) => c,
274        };
275
276        let lines = parse_plain(filename, separator, self.skip_header)?;
277        // ---------- Build the label_to_index map if we have explicit entry indexing
278        if let (Some(r_idx), Some(c_idx)) = (self.row_idx_col, self.col_idx_col) {
279            for (line_no, parts) in lines.iter().enumerate() {
280                let row_idx: usize = parts[r_idx].parse().map_err(|_| Error::ParseError {
281                    line: line_no,
282                    content: parts[r_idx].to_string(),
283                })?;
284                let col_idx: usize = parts[c_idx].parse().map_err(|_| Error::ParseError {
285                    line: line_no,
286                    content: parts[c_idx].to_string(),
287                })?;
288                row_indexer.add_explicit(&parts[self.row_label_col], row_idx);
289                if self.symmetric {
290                    row_indexer.add_explicit(&parts[self.col_label_col], col_idx);
291                } else {
292                    col_indexer.add_explicit(&parts[self.col_label_col], col_idx);
293                }
294            }
295        } else {
296            // ---------- Build the label_to_index map if we don't have explicit entry indexing
297            for parts in &lines {
298                row_indexer.add(&parts[self.row_label_col]);
299                if self.symmetric {
300                    row_indexer.add(&parts[self.col_label_col]);
301                } else {
302                    col_indexer.add(&parts[self.col_label_col]);
303                }
304            }
305        }
306
307        if self.symmetric {
308            col_indexer = row_indexer.clone();
309        }
310        let mut data = vec![vec![0.0; col_indexer.max_index()]; row_indexer.max_index()];
311        let row_labels = row_indexer.to_vec();
312        let col_labels = col_indexer.to_vec();
313
314        for (line_no, parts) in lines.into_iter().enumerate() {
315            let i_row = row_indexer.index(&parts[self.row_label_col]);
316            let j_col = col_indexer.index(&parts[self.col_label_col]);
317            let value: f64 = parts[self.data_col]
318                .parse()
319                .map_err(|_| Error::ParseError {
320                    line: line_no,
321                    content: parts[self.data_col].to_string()
322                })?;
323            data[i_row][j_col] = value;
324            if self.symmetric {
325                data[j_col][i_row] = value;
326            }
327        }
328
329        DataMatrix::new(data, row_labels, col_labels)
330    }
331
332    fn read_one_column<P: AsRef<Path>>(
333        &self,
334        filename: P,
335        column: usize,
336        labels: Vec<String>,
337    ) -> Result<DataMatrix, Error> {
338        let rows = parse_plain(filename, ' ', self.skip_header)?;
339        let col_idx = column;
340
341        let mut values = Vec::new();
342
343        for (line_num, parts) in rows.into_iter().enumerate() {
344            if col_idx >= parts.len() {
345                return Err(Error::NotEnoughColumns {
346                    line: line_num + 1,
347                    needed: col_idx + 1,
348                    content: format!("{:?}", parts),
349                });
350            }
351
352            let value: f64 = parts[col_idx].parse().map_err(|_| Error::ParseError {
353                line: line_num + 1,
354                content: parts[col_idx].clone(),
355            })?;
356
357            values.push(value);
358        }
359
360        let n = labels.len();
361        if n * n != values.len() {
362            return Err(Error::ParseError {
363                line: 0,
364                content: format!(
365                    "Expected {}² = {} values, but found {}",
366                    n,
367                    n * n,
368                    values.len()
369                ),
370            });
371        }
372
373        let mut data = Vec::with_capacity(n);
374        for i in 0..n {
375            let start = i * n;
376            let end = start + n;
377            data.push(values[start..end].to_vec());
378        }
379
380        DataMatrix::new(data, labels.clone(), labels)
381    }
382}
383
384fn parse_plain<P: AsRef<Path>>(
385    filename: P,
386    separator: char,
387    skip_header: bool,
388) -> std::io::Result<Vec<Vec<String>>> {
389    // --- read the file, possibly gzipped
390    let reader = open_file(filename)?;
391
392    let mut first_passed = false;
393    let mut lines = Vec::new();
394    for line in reader.lines() {
395        let line = line?;
396        if line.trim().is_empty() || line.starts_with('#') {
397            continue;
398        }
399        // skip the first line if this is a header
400        if !first_passed && skip_header {
401            first_passed = true;
402            continue;
403        }
404        let parts: Vec<String> = if separator == ' ' {
405            line.split_whitespace().map(|s| s.to_string()).collect()
406        } else {
407            line.split(separator).map(|s| s.to_string()).collect()
408        };
409        lines.push(parts);
410    }
411    Ok(lines)
412}
413
/// Maps string labels to matrix indices.
///
/// Indices are either assigned sequentially ([`Indexer::add`]) or supplied by the caller
/// ([`Indexer::add_explicit`]). Explicit indices need not be contiguous, so the indexer tracks
/// the extent (largest index + 1) separately from the number of labels.
#[derive(Clone)]
struct Indexer {
    /// Index assigned to each known label.
    label_to_index: HashMap<String, usize>,
    /// One past the largest index stored so far; 0 while the indexer is empty.
    extent: usize,
}

impl Indexer {
    /// Creates an empty indexer.
    fn new() -> Self {
        Self {
            label_to_index: HashMap::new(),
            extent: 0,
        }
    }

    /// Returns the index of `label`, assigning the next free index if the label is new.
    fn add(&mut self, label: &str) -> usize {
        if let Some(&idx) = self.label_to_index.get(label) {
            return idx;
        }
        let idx = self.extent;
        self.label_to_index.insert(label.to_string(), idx);
        self.extent += 1;
        idx
    }

    /// Registers `label` at the caller-supplied index `idx`.
    ///
    /// If the label is already known, its first index is kept and `idx` is ignored.
    fn add_explicit(&mut self, label: &str, idx: usize) {
        use std::collections::hash_map::Entry;

        if let Entry::Vacant(slot) = self.label_to_index.entry(label.to_string()) {
            slot.insert(idx);
            // Grow the extent so that `idx` is always a valid position in `to_vec()`.
            self.extent = self.extent.max(idx.saturating_add(1));
        }
    }

    /// Returns the index of a label that was registered before.
    ///
    /// # Panics
    /// Panics if `label` was never added.
    fn index(&self, label: &str) -> usize {
        *self
            .label_to_index
            .get(label)
            .expect("Label not found in indexer")
    }

    /// Returns the number of index slots, i.e. the largest stored index plus one (0 if empty).
    fn max_index(&self) -> usize {
        self.extent
    }

    /// Returns the labels ordered by index; slots without a label hold an empty string.
    fn to_vec(&self) -> Vec<String> {
        let mut result = vec![String::new(); self.extent];
        for (label, &idx) in &self.label_to_index {
            result[idx] = label.clone();
        }
        result
    }
}
459
/// Infers the field separator from a file name's extension.
///
/// Recognised extensions (matched case-insensitively):
/// - `csv` → `,`
/// - `tsv`, `tab` → `\t`
/// - `psv` (pipe-separated) → `|`
/// - `ssv` (semicolon-separated) → `;`
/// - `dat` → ` `
///
/// A single trailing compression suffix (`gz`, `bz2`, `xz`, `zst`, `zip`) is peeled off first,
/// so `data.csv.gz` is treated like `data.csv`.
///
/// Anything else, including a missing extension, yields ` ` (a space character).
///
/// # Examples
/// ```rust,ignore
/// use std::path::Path;
///
/// assert_eq!(guess_separator("data.csv"), ',');
/// assert_eq!(guess_separator("data.TSV"), '\t');
/// assert_eq!(guess_separator("table.tab"), '\t');
/// assert_eq!(guess_separator("log.psv"), '|');
/// assert_eq!(guess_separator("semi.ssv"), ';');
/// assert_eq!(guess_separator("archive.csv.gz"), ','); // compressed
/// ```
fn guess_separator<P: AsRef<Path>>(path: P) -> char {
    const COMPRESSION_SUFFIXES: [&str; 5] = ["gz", "bz2", "xz", "zst", "zip"];

    // Lower-cased UTF-8 extension of `p`, if it has one.
    fn lowercase_ext(p: &Path) -> Option<String> {
        p.extension()
            .and_then(|e| e.to_str())
            .map(|e| e.to_ascii_lowercase())
    }

    let path = path.as_ref();
    let mut ext = lowercase_ext(path).unwrap_or_default();

    if COMPRESSION_SUFFIXES.contains(&ext.as_str()) {
        // file_stem() of "...csv.gz" is "...csv": look at the extension underneath.
        ext = path
            .file_stem()
            .and_then(|stem| lowercase_ext(Path::new(stem)))
            .unwrap_or_default();
    }

    match ext.as_str() {
        "csv" => ',',
        "tsv" | "tab" => '\t',
        "psv" => '|',
        "ssv" => ';',
        "dat" => ' ',
        _ => ' ',
    }
}
515
516/// This function can open a regular file or a gzipped one, as determined by the extension
517/// of the input file name. A boxed reader to the content is returned.
518///
519/// The code has been copied from bioshell-io::utils
520fn open_file<P: AsRef<Path>>(file_path: P) -> io::Result<Box<dyn BufRead>> {
521    let path = file_path.as_ref();
522
523    if path.as_os_str().is_empty() {
524        return Err(io::Error::new(
525            ErrorKind::InvalidInput,
526            "Couldn't open file: empty path",
527        ));
528    }
529    let file = File::open(path)?;
530
531    if file_path.as_ref().extension() == Some(OsStr::new("gz")) {
532        Ok(Box::new(BufReader::with_capacity(
533            128 * 1024,
534            read::GzDecoder::new(file),
535        )))
536    } else {
537        Ok(Box::new(BufReader::with_capacity(128 * 1024, file)))
538    }
539}