data_matrix/datamatrix_builder.rs
1use flate2::read;
2use std::collections::HashMap;
3use std::ffi::OsStr;
4use std::fs::File;
5use std::io::{self, BufRead, BufReader, ErrorKind};
6use std::path::Path;
7
8use crate::{DataMatrix, Error};
9
/// Configurable loader that turns plain-text, CSV or TSV files into a labeled [`DataMatrix`].
///
/// A [`DataMatrixBuilder`] lets the caller decide how the input is interpreted:
/// - which columns hold the row label, the column label and the value,
/// - which columns (if any) hold explicit row and column indices (five-column files),
/// - which character separates the fields (space, comma, tab, ...),
/// - whether the first line is a header that must be skipped,
/// - whether the resulting matrix should be symmetric.
///
/// # Supported formats
/// - **Three-column format**: row label, column label, value
/// - **Five-column format**: row label, column label, row index, column index, value
/// - **Single-column format**: raw values of a square matrix (handled separately); the labels
///   must be supplied by the user with `DataMatrixBuilder::labels()`.
///
/// Lines starting with `#` are ignored as comments.
///
/// # Examples
///
/// ## Reading a 5-column file (e.g., `five_columns_short.txt`)
/// ```text
/// # Comment lines are allowed
/// Alice Bob 0 1 1.5
/// Bob John 1 2 2.2
/// ```
///
/// ```rust
/// use data_matrix::{DataMatrixBuilder, Error};
/// # fn main() -> Result<(), Error> {
/// # let input_fname = "./tests/test_files/five_columns_short.txt";
/// let matrix = DataMatrixBuilder::new()
///     .label_columns(0, 1) // columns 0 and 1: row and column labels
///     .index_columns(2, 3) // columns 2 and 3: row and column indices
///     .data_column(4) // column 4: value
///     .separator(' ') // whitespace separator
///     .symmetric(true) // make symmetric
///     .from_file(input_fname)?;
/// # assert_eq!(matrix.ncols(), 3);
/// # assert_eq!(matrix.nrows(), 3);
/// # Ok(())
/// # }
/// ```
///
/// ## Reading a 3-column file (e.g., `three_columns_short.txt`)
/// ```text
/// # Comment lines are allowed
/// Alice Bob 1.2
/// Bob John 2.4
/// ```
///
/// ```rust
/// use data_matrix::{DataMatrixBuilder, Error};
/// # fn main() -> Result<(), Error> {
/// # let input_fname = "./tests/test_files/three_columns_short.txt";
///
/// let matrix = DataMatrixBuilder::new()
///     .label_columns(0, 1) // columns 0 and 1: row and column labels
///     .data_column(2) // column 2: value
///     .separator(' ') // whitespace separator
///     .symmetric(true) // make symmetric
///     .skip_header(false) // this is the default behaviour
///     .from_file(input_fname)?;
/// # assert_eq!(matrix.ncols(), 3);
/// # assert_eq!(matrix.nrows(), 3);
/// # Ok(())
/// # }
/// ```
///
/// # Notes
/// - Columns are indexed starting **from 0**.
/// - The field separator must be a single character (with an exception for `' '`, see below);
///   when it is not given, it is inferred from the file extension, e.g. `'\t'` for `.tsv`.
/// - When `' '` (a space) is used as a separator, the builder splits on any run of whitespace,
///   i.e. `str::split_whitespace()` is used.
/// - `.symmetric(true)` ensures that if (i,j) is set, (j,i) will also be set automatically.
#[derive(Debug, Clone)]
pub struct DataMatrixBuilder {
    /// 0-based column holding the row label.
    row_label_col: usize,
    /// 0-based column holding the column label.
    col_label_col: usize,
    /// 0-based column holding the numeric value.
    data_col: usize,
    /// 0-based column holding an explicit row index, if any.
    row_idx_col: Option<usize>,
    /// 0-based column holding an explicit column index, if any.
    col_idx_col: Option<usize>,
    /// Field separator; `None` means "infer it from the file extension".
    separator: Option<char>,
    /// Whether every `(row, col)` entry is mirrored to `(col, row)`.
    symmetric: bool,
    /// Whether the first data line is a header to be skipped.
    skip_header: bool,
    /// User-supplied labels for the single-column format.
    labels: Option<Vec<String>>,
}
97
98#[allow(clippy::new_without_default)]
99impl DataMatrixBuilder {
100 /// Creates just a new builder.
101 ///
102 /// Now use its methods to set up column indexes (e.g. [`label_columns()`](DataMatrixBuilder::label_columns)), then provide some data (e.g. [`from_file()`](DataMatrixBuilder::from_file))
103 pub fn new() -> Self {
104 Self {
105 row_label_col: 0,
106 col_label_col: 1,
107 data_col: 2,
108 row_idx_col: None,
109 col_idx_col: None,
110 separator: None,
111 symmetric: false,
112 skip_header: false,
113 labels: None,
114 }
115 }
116
117 /// Specifies which columns contain the row and column labels.
118 ///
119 /// Column indices are **0-based** (i.e., the first column is 0).
120 ///
121 /// # Arguments
122 /// * `row` — Column number for row labels.
123 /// * `col` — Column number for column labels.
124 ///
125 /// # Example
126 /// ```rust
127 /// use data_matrix::DataMatrixBuilder;
128 /// let mut builder = DataMatrixBuilder::new();
129 /// builder.label_columns(1, 2);
130 /// ```
131 pub fn label_columns(mut self, row: usize, col: usize) -> Self {
132 self.row_label_col = row;
133 self.col_label_col = col;
134 self
135 }
136
137 /// Provides labels for the case when the input data is a single column.
138 pub fn labels<I, S>(mut self, labels: I) -> Self
139 where
140 I: IntoIterator<Item = S>,
141 S: Into<String>,
142 {
143 self.labels = Some(labels.into_iter().map(Into::into).collect());
144 self
145 }
146
147 /// Specifies which column contains the numeric value.
148 ///
149 /// Column index is **0-based**.
150 pub fn data_column(mut self, val: usize) -> Self {
151 self.data_col = val;
152 self
153 }
154
155 /// Specifies which columns provide explicit row and column indices.
156 ///
157 /// Column indices are **0-based**.
158 ///
159 /// # Arguments
160 /// * `row_idx` — Column number for the row index.
161 /// * `col_idx` — Column number for the column index.
162 ///
163 /// # Example
164 /// ```rust
165 /// use data_matrix::DataMatrixBuilder;
166 /// let mut builder = DataMatrixBuilder::new();
167 /// builder.index_columns(3, 4);
168 /// ```
169 pub fn index_columns(mut self, row_idx: usize, col_idx: usize) -> Self {
170 self.row_idx_col = Some(row_idx);
171 self.col_idx_col = Some(col_idx);
172 self
173 }
174
175 /// Sets the character used to separate fields in the input file.
176 ///
177 /// Common choices: `' '`, `','`, `'\t'`.
178 pub fn separator(mut self, sep: char) -> Self {
179 self.separator = Some(sep);
180 self
181 }
182
183 /// If set to `true`, the first line of the file should be skipped as a header.
184 pub fn skip_header(mut self, if_header: bool) -> Self {
185 self.skip_header = if_header;
186 self
187 }
188
189 /// Sets whether the matrix should be treated as symmetric.
190 ///
191 /// If enabled, for every entry `(row, col, value)`, the symmetric entry `(col, row, value)`
192 /// is automatically added.
193 pub fn symmetric(mut self, if_symmetric: bool) -> Self {
194 self.symmetric = if_symmetric;
195 self
196 }
197
198 /// Creates a new [`DataMatrix`] from a given 1D vector of data.
199 ///
200 /// This method is devised to turn a 1D column of numbers into a **square** (usually symmetrix)
201 /// 2D [`DataMatrix`] object.
202 /// Labels should be provided with [`labels()`](DataMatrixBuilder::labels) method,
203 /// otherwise they will be automatically generated as `"row-{}", i + 1` and `col-{}", i + 1`
204 /// for rows and columns, respectively.
205 ///
206 /// # Examples
207 /// Creates a square matrix with automatically generated labels:
208 ///
209 /// ```rust
210 /// use data_matrix::{DataMatrixBuilder, Error};
211 /// # fn main() -> Result<(), Error> {
212 /// let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
213 /// let matrix = DataMatrixBuilder::new().from_data(&data).unwrap();
214 /// assert_eq!(matrix.ncols(), 3);
215 /// assert_eq!(matrix.get(0,0).unwrap(), 1.0);
216 /// assert_eq!(matrix.row_label(0), "row-1");
217 /// # Ok(())
218 /// # }
219 /// ```
220 ///
221 /// Creates a square symmetric matrix with user-defined labels:
222 ///
223 /// ```rust
224 /// use data_matrix::{DataMatrixBuilder, Error};
225 /// # fn main() -> Result<(), Error> {
226 /// let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
227 /// let labels = ["data-1", "data-2", "data-3"];
228 /// let matrix = DataMatrixBuilder::new().labels(labels).from_data(&data).unwrap();
229 /// assert_eq!(matrix.ncols(), 3);
230 /// assert_eq!(matrix.get(0,0).unwrap(), 1.0);
231 /// assert_eq!(matrix.row_label(0), "data-1");
232 /// # Ok(())
233 /// # }
234 /// ```
235 ///
236 pub fn from_data(self, data: &[f64]) -> Result<DataMatrix, Error> {
237 let len = data.len();
238 let n = (len as f64).sqrt() as usize;
239 if n * n != len {
240 return Err(Error::WrongNumberOfData { n_data: len });
241 }
242
243 let (row_labels, col_labels) = match &self.labels {
244 Some(given) => (given.clone(), given.clone()),
245 None => {
246 let rows = (0..n).map(|i| format!("row-{}", i + 1)).collect();
247 let cols = (0..n).map(|i| format!("col-{}", i + 1)).collect();
248 (rows, cols)
249 }
250 };
251
252 let mut matrix = Vec::with_capacity(n);
253 for i in 0..n {
254 let start = i * n;
255 let end = start + n;
256 matrix.push(data[start..end].to_vec());
257 }
258
259 DataMatrix::new(matrix, row_labels, col_labels)
260 }
261
262 /// Loads the matrix from the given file path according to the current builder settings.
263 pub fn from_file<P: AsRef<Path>>(self, filename: P) -> Result<DataMatrix, Error> {
264 if let Some(ref labels) = self.labels {
265 return self.read_one_column(filename, self.data_col, labels.clone());
266 }
267
268 let mut row_indexer = Indexer::new();
269 let mut col_indexer = Indexer::new();
270
271 let separator = match self.separator {
272 None => guess_separator(&filename),
273 Some(c) => c,
274 };
275
276 let lines = parse_plain(filename, separator, self.skip_header)?;
277 // ---------- Build the label_to_index map if we have explicit entry indexing
278 if let (Some(r_idx), Some(c_idx)) = (self.row_idx_col, self.col_idx_col) {
279 for (line_no, parts) in lines.iter().enumerate() {
280 let row_idx: usize = parts[r_idx].parse().map_err(|_| Error::ParseError {
281 line: line_no,
282 content: parts[r_idx].to_string(),
283 })?;
284 let col_idx: usize = parts[c_idx].parse().map_err(|_| Error::ParseError {
285 line: line_no,
286 content: parts[c_idx].to_string(),
287 })?;
288 row_indexer.add_explicit(&parts[self.row_label_col], row_idx);
289 if self.symmetric {
290 row_indexer.add_explicit(&parts[self.col_label_col], col_idx);
291 } else {
292 col_indexer.add_explicit(&parts[self.col_label_col], col_idx);
293 }
294 }
295 } else {
296 // ---------- Build the label_to_index map if we don't have explicit entry indexing
297 for parts in &lines {
298 row_indexer.add(&parts[self.row_label_col]);
299 if self.symmetric {
300 row_indexer.add(&parts[self.col_label_col]);
301 } else {
302 col_indexer.add(&parts[self.col_label_col]);
303 }
304 }
305 }
306
307 if self.symmetric {
308 col_indexer = row_indexer.clone();
309 }
310 let mut data = vec![vec![0.0; col_indexer.max_index()]; row_indexer.max_index()];
311 let row_labels = row_indexer.to_vec();
312 let col_labels = col_indexer.to_vec();
313
314 for (line_no, parts) in lines.into_iter().enumerate() {
315 let i_row = row_indexer.index(&parts[self.row_label_col]);
316 let j_col = col_indexer.index(&parts[self.col_label_col]);
317 let value: f64 = parts[self.data_col]
318 .parse()
319 .map_err(|_| Error::ParseError {
320 line: line_no,
321 content: parts[self.data_col].to_string()
322 })?;
323 data[i_row][j_col] = value;
324 if self.symmetric {
325 data[j_col][i_row] = value;
326 }
327 }
328
329 DataMatrix::new(data, row_labels, col_labels)
330 }
331
332 fn read_one_column<P: AsRef<Path>>(
333 &self,
334 filename: P,
335 column: usize,
336 labels: Vec<String>,
337 ) -> Result<DataMatrix, Error> {
338 let rows = parse_plain(filename, ' ', self.skip_header)?;
339 let col_idx = column;
340
341 let mut values = Vec::new();
342
343 for (line_num, parts) in rows.into_iter().enumerate() {
344 if col_idx >= parts.len() {
345 return Err(Error::NotEnoughColumns {
346 line: line_num + 1,
347 needed: col_idx + 1,
348 content: format!("{:?}", parts),
349 });
350 }
351
352 let value: f64 = parts[col_idx].parse().map_err(|_| Error::ParseError {
353 line: line_num + 1,
354 content: parts[col_idx].clone(),
355 })?;
356
357 values.push(value);
358 }
359
360 let n = labels.len();
361 if n * n != values.len() {
362 return Err(Error::ParseError {
363 line: 0,
364 content: format!(
365 "Expected {}² = {} values, but found {}",
366 n,
367 n * n,
368 values.len()
369 ),
370 });
371 }
372
373 let mut data = Vec::with_capacity(n);
374 for i in 0..n {
375 let start = i * n;
376 let end = start + n;
377 data.push(values[start..end].to_vec());
378 }
379
380 DataMatrix::new(data, labels.clone(), labels)
381 }
382}
383
384fn parse_plain<P: AsRef<Path>>(
385 filename: P,
386 separator: char,
387 skip_header: bool,
388) -> std::io::Result<Vec<Vec<String>>> {
389 // --- read the file, possibly gzipped
390 let reader = open_file(filename)?;
391
392 let mut first_passed = false;
393 let mut lines = Vec::new();
394 for line in reader.lines() {
395 let line = line?;
396 if line.trim().is_empty() || line.starts_with('#') {
397 continue;
398 }
399 // skip the first line if this is a header
400 if !first_passed && skip_header {
401 first_passed = true;
402 continue;
403 }
404 let parts: Vec<String> = if separator == ' ' {
405 line.split_whitespace().map(|s| s.to_string()).collect()
406 } else {
407 line.split(separator).map(|s| s.to_string()).collect()
408 };
409 lines.push(parts);
410 }
411 Ok(lines)
412}
413
/// Maps string labels to matrix indices.
///
/// Indices are either assigned sequentially in the order of first appearance
/// ([`Indexer::add`]) or taken verbatim from the caller ([`Indexer::add_explicit`]).
#[derive(Clone)]
struct Indexer {
    label_to_index: HashMap<String, usize>,
}

impl Indexer {
    /// Creates an empty indexer.
    fn new() -> Self {
        Self {
            label_to_index: HashMap::new(),
        }
    }

    /// Registers `label` under the next sequential index and returns that index;
    /// a label that is already known keeps (and returns) its existing index.
    fn add(&mut self, label: &str) -> usize {
        if let Some(&idx) = self.label_to_index.get(label) {
            idx
        } else {
            let idx = self.label_to_index.len();
            self.label_to_index.insert(label.to_string(), idx);
            idx
        }
    }

    /// Registers `label` under the caller-provided index `idx`;
    /// a label that is already known keeps its first index.
    fn add_explicit(&mut self, label: &str, idx: usize) {
        self.label_to_index.entry(label.to_string()).or_insert(idx);
    }

    /// Returns the index assigned to `label`.
    ///
    /// # Panics
    /// Panics if `label` has never been registered.
    fn index(&self, label: &str) -> usize {
        *self
            .label_to_index
            .get(label)
            .expect("Label not found in indexer")
    }

    /// Returns the number of index slots needed to hold every registered index,
    /// i.e. the largest index plus one (`0` for an empty indexer).
    ///
    /// Explicit indices may be sparse or may not start at 0, so the number of labels alone
    /// would be too small; for sequentially added labels both quantities coincide.
    fn max_index(&self) -> usize {
        self.label_to_index
            .values()
            .max()
            .map_or(0, |&largest| largest + 1)
    }

    /// Returns the labels ordered by their index.
    ///
    /// The result has [`Indexer::max_index`] entries; slots that no label maps to
    /// are left as empty strings.
    fn to_vec(&self) -> Vec<String> {
        let mut result = vec![String::new(); self.max_index()];
        for (label, &idx) in &self.label_to_index {
            result[idx] = label.clone();
        }
        result
    }
}
459
/// Infers the field separator from the extension of a file name.
///
/// Recognized extensions (case-insensitive):
/// - `csv` → `,`
/// - `tsv`, `tab` → `\t`
/// - `psv` (pipe-separated) → `|`
/// - `ssv` (semicolon-separated) → `;`
/// - `dat` → ` `
///
/// A single compression suffix (`gz`, `bz2`, `xz`, `zst`, `zip`) is peeled off first,
/// so `data.csv.gz` is treated like `data.csv`.
///
/// Falls back to ` ` (a space character) whenever the separator cannot be determined.
///
/// # Examples
/// ```rust,ignore
/// use std::path::Path;
///
/// assert_eq!(guess_separator("data.csv"), ',');
/// assert_eq!(guess_separator("data.TSV"), '\t');
/// assert_eq!(guess_separator("table.tab"), '\t');
/// assert_eq!(guess_separator("log.psv"), '|');
/// assert_eq!(guess_separator("semi.ssv"), ';');
/// assert_eq!(guess_separator("archive.csv.gz"), ','); // compressed
/// ```
fn guess_separator<P: AsRef<Path>>(path: P) -> char {
    /// Lower-cased extension of `p`, if it has one that is valid UTF-8.
    fn lowercase_extension(p: &Path) -> Option<String> {
        p.extension()
            .and_then(|ext| ext.to_str())
            .map(|ext| ext.to_ascii_lowercase())
    }

    let path = path.as_ref();
    let mut extension = lowercase_extension(path).unwrap_or_default();

    // For a compressed file the data format is given by the extension of the stem:
    // the stem of "table.csv.gz" is "table.csv".
    if matches!(extension.as_str(), "gz" | "bz2" | "xz" | "zst" | "zip") {
        extension = path
            .file_stem()
            .and_then(|stem| lowercase_extension(Path::new(stem)))
            .unwrap_or_default();
    }

    match extension.as_str() {
        "csv" => ',',
        "tsv" | "tab" => '\t',
        "psv" => '|',
        "ssv" => ';',
        "dat" => ' ',
        _ => ' ',
    }
}
515
516/// This function can open a regular file or a gzipped one, as determined by the extension
517/// of the input file name. A boxed reader to the content is returned.
518///
519/// The code has been copied from bioshell-io::utils
520fn open_file<P: AsRef<Path>>(file_path: P) -> io::Result<Box<dyn BufRead>> {
521 let path = file_path.as_ref();
522
523 if path.as_os_str().is_empty() {
524 return Err(io::Error::new(
525 ErrorKind::InvalidInput,
526 "Couldn't open file: empty path",
527 ));
528 }
529 let file = File::open(path)?;
530
531 if file_path.as_ref().extension() == Some(OsStr::new("gz")) {
532 Ok(Box::new(BufReader::with_capacity(
533 128 * 1024,
534 read::GzDecoder::new(file),
535 )))
536 } else {
537 Ok(Box::new(BufReader::with_capacity(128 * 1024, file)))
538 }
539}