distmat/formats/
tabular.rs

1use std::io::Read;
2use std::io::{BufRead, BufReader};
3use thiserror::Error;
4
5use crate::builder::{DataError, DistBuilder};
6use crate::symmetric::flip_order;
7use crate::{DistMatrix, SquareMatrix};
8
/// How the fields on one line of a tabular file are delimited.
///
/// The type is small and `Copy`; it also implements `PartialEq`/`Eq` so that
/// callers can compare a configured separator against a known value.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Separator {
    /// Values are separated by a single character.
    Char(char),

    /// Values are separated by any amount of ASCII whitespace.
    Whitespace,
}
17
/// The layout of a tabular distance file.
///
/// Like [`Separator`], this is a plain option type, so it derives the common
/// traits (`Debug`, `Clone`, `Copy`, equality) to be usable in logs, tests and
/// configuration structs.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TabularShape {
    /// Wide tabular data has rows and columns corresponding to the matrix entries.
    /// The first row and the first column should contain the taxa labels in the
    /// same order. The top leftmost cell is ignored.
    ///
    /// This file uses `Separator::Char(',')` and has 3 taxa:
    ///
    /// ```text
    /// ,A,B,C
    /// A,0,1,2
    /// B,1,0,1
    /// C,2,1,0
    /// ```
    Wide,

    /// Long tabular data has exactly 3 columns: the two taxa and the distance.
    /// A header row is optional and ignored. This shape can either represent a
    /// complete square matrix with `N * N` rows, or the lower triangle with
    /// `N * (N - 1) / 2` rows.
    ///
    /// This file uses `Separator::Whitespace` and represents only the lower
    /// triangle for a matrix with 3 taxa:
    ///
    /// ```text
    /// from    to   dist
    /// A       B    1
    /// A       C    2
    /// B       C    1
    /// ```
    ///
    /// This file represents a complete matrix:
    ///
    /// ```text
    /// from,to,d
    /// A,A,0
    /// A,B,1
    /// A,C,2
    /// B,A,1
    /// B,B,0
    /// B,C,1
    /// C,A,2
    /// C,B,1
    /// C,C,0
    /// ```
    ///
    /// Note that [SquareMatrix::from_labelled_distances] and [DistMatrix::from_labelled_distances]
    /// construct matrix types from an iterator of this shape if your data is not stored in
    /// a tabular file.
    Long,
}
68
69#[derive(Error, Debug)]
70pub enum TabularError {
71    /// An underlying I/O error occurred.
72    #[error("unable to read distance matrix file")]
73    Io(#[from] std::io::Error),
74
75    #[error("unable to read header row with taxa labels")]
76    Header,
77
78    #[error("the file contained no data (empty or header had no delimeters)")]
79    NoData,
80
81    #[error("matrix row {0} (label '{1}') had {2} entries when {3} were expected")]
82    RowWidth(usize, String, usize, usize),
83
84    #[error("expected 3 columns: {0}")]
85    ColsTruncated(String),
86
87    #[error("matrix row {0} had label '{1}' but '{2}' was expected")]
88    RowOrder(usize, String, String),
89
90    #[error("row did not start with a label: {0}")]
91    Label(String),
92
93    #[error("reached end of file while expecting {0} more matrix rows")]
94    RowsTruncated(usize),
95
96    #[error("data has incorrect shape")]
97    Data(#[from] DataError),
98
99    /// Unable to parse a numeric value.
100    #[error("expected integer found `{0}': {1}")]
101    Numeric(String, std::num::ParseIntError),
102}
103
104/// Parse a distance matrix in a square format.
105pub fn parse<R: Read>(
106    reader: R,
107    separator: Separator,
108    shape: TabularShape,
109) -> Result<SquareMatrix<u32>, TabularError> {
110    let (labels, data, size) = match shape {
111        TabularShape::Wide => parse_wide(reader, separator)?,
112        TabularShape::Long => parse_long(reader, separator, false)?,
113    };
114    let labels = Some(labels);
115    let matrix = SquareMatrix { data, size, labels };
116    Ok(matrix)
117}
118
119/// Parse a distance matrix where only the lower triangle is represented.
120pub fn parse_lt<R: Read>(reader: R, separator: Separator) -> Result<DistMatrix<u32>, TabularError> {
121    let (labels, data, size) = parse_long(reader, separator, true)?;
122    let labels = Some(labels);
123    let data = flip_order(&data, size);
124    let matrix = DistMatrix { data, size, labels };
125    Ok(matrix)
126}
127
128fn parse_wide<R: Read>(
129    reader: R,
130    separator: Separator,
131) -> Result<(Vec<String>, Vec<u32>, usize), TabularError> {
132    let labels;
133    let mut data;
134
135    {
136        let mut br = BufReader::new(reader);
137        let mut buf = String::new();
138
139        //read the header row
140        br.read_line(&mut buf).map_err(|_| TabularError::Header)?;
141        let (_, rest) = separator.split_label(&buf)?;
142        labels = separator.split_str(rest.trim_end());
143        if labels.is_empty() {
144            return Err(TabularError::NoData);
145        }
146        data = Vec::with_capacity(labels.len() * labels.len());
147
148        let mut row = 0;
149
150        loop {
151            row += 1;
152            buf.clear();
153            let n = br.read_line(&mut buf)?;
154            if n > 0 {
155                let (label, rest) = separator.split_label(&buf)?;
156                if label != labels[row - 1] {
157                    return Err(TabularError::RowOrder(
158                        row,
159                        label.to_owned(),
160                        labels[row - 1].clone(),
161                    ));
162                }
163
164                let n_read = separator.split_u32(rest.trim_end(), &mut data)?;
165                if n_read != labels.len() {
166                    return Err(TabularError::RowWidth(
167                        row,
168                        label.to_owned(),
169                        n_read,
170                        labels.len(),
171                    ));
172                }
173            } else {
174                break; // EOF
175            }
176        }
177
178        if row < labels.len() {
179            return Err(TabularError::RowsTruncated(labels.len() - row));
180        }
181    }
182
183    let size = labels.len();
184    Ok((labels, data, size))
185}
186
187fn parse_long<R: Read>(
188    reader: R,
189    separator: Separator,
190    lower_triangle: bool,
191) -> Result<(Vec<String>, Vec<u32>, usize), TabularError> {
192    let builder = parse_long_impl(reader, separator)?;
193    let labels = builder.labels.clone();
194    let size = labels.len();
195
196    if lower_triangle {
197        let matrix: DistMatrix<u32> = builder.try_into()?;
198        Ok((labels, matrix.data, size))
199    } else {
200        let matrix: SquareMatrix<u32> = builder.try_into()?;
201        Ok((labels, matrix.data, size))
202    }
203}
204
205fn parse_long_impl<R: Read>(
206    reader: R,
207    separator: Separator,
208) -> Result<DistBuilder<u32>, TabularError> {
209    let mut builder = DistBuilder::<u32>::new();
210
211    let mut br = BufReader::new(reader);
212    let mut buf = String::new();
213
214    let mut row = 0;
215    let mut header_seen = false;
216
217    loop {
218        row += 1;
219        buf.clear();
220        let n = br.read_line(&mut buf)?;
221        if n > 0 {
222            let parts = separator.split_3(buf.trim_end());
223            if row == 1 && !header_seen {
224                if let Err(TabularError::Numeric(_, _)) = parts {
225                    row = 0;
226                    header_seen = true;
227                    continue;
228                }
229            }
230
231            let (name1, name2, distance) = parts?;
232            builder.add(name1, name2, distance)?;
233        } else {
234            break; // EOF
235        }
236    }
237
238    Ok(builder)
239}
240
241impl Separator {
242    fn split_str(&self, line: &str) -> Vec<String> {
243        match self {
244            Separator::Char(c) => line.split(*c).map(str::to_owned).collect(),
245            Separator::Whitespace => line.split_ascii_whitespace().map(str::to_owned).collect(),
246        }
247    }
248
249    fn split_label<'a>(&self, line: &'a str) -> Result<(&'a str, &'a str), TabularError> {
250        match self {
251            Separator::Char(c) => line
252                .split_once(*c)
253                .ok_or_else(|| TabularError::Label(line.to_owned())),
254            Separator::Whitespace => {
255                let (label, rest) = line
256                    .split_once(|x| char::is_ascii_whitespace(&x))
257                    .ok_or_else(|| TabularError::Label(line.to_owned()))?;
258                Ok((label, rest.trim_start()))
259            }
260        }
261    }
262
263    fn split_u32(&self, line: &str, data: &mut Vec<u32>) -> Result<usize, TabularError> {
264        let orig_size = data.len();
265
266        match self {
267            Separator::Char(c) => {
268                for number in line.trim_end().split(*c) {
269                    data.push(
270                        number
271                            .parse()
272                            .map_err(|e| TabularError::Numeric(number.to_owned(), e))?,
273                    );
274                }
275            }
276            Separator::Whitespace => {
277                for number in line.trim_end().split_ascii_whitespace() {
278                    data.push(
279                        number
280                            .parse()
281                            .map_err(|e| TabularError::Numeric(number.to_owned(), e))?,
282                    );
283                }
284            }
285        }
286
287        Ok(data.len() - orig_size)
288    }
289
290    fn split_3<'a>(&self, line: &'a str) -> Result<(&'a str, &'a str, u32), TabularError> {
291        let (p1, p2, p3) = match self {
292            Separator::Char(c) => extract_3(line, line.split(*c))?,
293            Separator::Whitespace => extract_3(line, line.split_ascii_whitespace())?,
294        };
295
296        let p3 = p3
297            .parse()
298            .map_err(|e| TabularError::Numeric(p3.to_owned(), e))?;
299        Ok((p1, p2, p3))
300    }
301}
302
303fn extract_3<'a>(
304    line: &'a str,
305    mut splitter: impl Iterator<Item = &'a str>,
306) -> Result<(&'a str, &'a str, &'a str), TabularError> {
307    let p1 = splitter
308        .next()
309        .ok_or_else(|| TabularError::ColsTruncated(line.to_owned()))?;
310    let p2 = splitter
311        .next()
312        .ok_or_else(|| TabularError::ColsTruncated(line.to_owned()))?;
313    let p3 = splitter
314        .next()
315        .ok_or_else(|| TabularError::ColsTruncated(line.to_owned()))?;
316    if splitter.next().is_some() {
317        return Err(TabularError::ColsTruncated(line.to_owned()));
318    }
319    Ok((p1, p2, p3))
320}
321
#[cfg(test)]
mod tests {
    use super::*;

    /// The tuple returned by the internal parsers: labels, flat data, size.
    type Parsed = (Vec<String>, Vec<u32>, usize);

    /// Labels shared by every fixture file.
    fn expected_labels() -> Vec<String> {
        ["seq1", "seq2", "seq3", "seq4"]
            .iter()
            .map(|name| name.to_string())
            .collect()
    }

    /// The full 4x4 matrix, flattened row by row.
    fn expected_data() -> Vec<u32> {
        let rows: [[u32; 4]; 4] = [
            // seq1 seq2 seq3 seq4
            [0, 1, 2, 3], // seq1
            [1, 0, 3, 4], // seq2
            [2, 3, 0, 4], // seq3
            [3, 4, 4, 0], // seq4
        ];
        rows.concat()
    }

    /// Unwrap a parser result and compare it with the full expected matrix.
    fn assert_full_matrix(result: Result<Parsed, TabularError>) {
        let (labels, data, _size) = result.unwrap();
        assert_eq!(labels, expected_labels());
        assert_eq!(data, expected_data());
    }

    #[test]
    fn test_wide() {
        let bytes: &[u8] = include_bytes!("../../tests/snp-dists/default.dat");
        assert_full_matrix(parse_wide(bytes, Separator::Char('\t')));
    }

    #[test]
    fn test_version() {
        let bytes: &[u8] = include_bytes!("../../tests/snp-dists/version.dat");
        assert_full_matrix(parse_wide(bytes, Separator::Char('\t')));
    }

    #[test]
    fn test_comma() {
        let bytes: &[u8] = include_bytes!("../../tests/snp-dists/comma.dat");
        assert_full_matrix(parse_wide(bytes, Separator::Char(',')));
    }

    #[test]
    fn test_melt() {
        let bytes: &[u8] = include_bytes!("../../tests/snp-dists/melt.dat");
        assert_full_matrix(parse_long(bytes, Separator::Char('\t'), false));
    }

    #[test]
    fn test_melt_comma() {
        let bytes: &[u8] = include_bytes!("../../tests/snp-dists/melt-comma.dat");
        assert_full_matrix(parse_long(bytes, Separator::Char(','), false));
    }

    #[test]
    fn test_melt_lt() {
        let bytes: &[u8] = include_bytes!("../../tests/long_lt.dat");
        let (labels, data, _size) = parse_long(bytes, Separator::Char('\t'), true).unwrap();
        assert_eq!(labels, expected_labels());
        assert_eq!(data, vec![1, 2, 3, 3, 4, 4]);
    }
}
392}