distmat/formats/
phylip.rs

1use std::io::Read;
2use std::io::{BufRead, BufReader};
3use thiserror::Error;
4
5use crate::symmetric::flip_order;
6use crate::{DistMatrix, SquareMatrix};
7
8/// Dialect used for taxon labels.
9///
10/// The known variants of the PHYLIP format differ in how they separate the
11/// taxon labels from the distance data. In either format the matrix entries
12/// are separated from each other by any amount of whitespace (including new
13/// lines).
14#[derive(Clone, Copy, Debug)]
15pub enum PhylipDialect {
16    /// The traditional format where taxon labels occupy the first 10
17    /// characters of each line. Longer labels are not possible, and shorter
18    /// labels must be padded with spaces to reach at least 10 characters.
19    /// Taxon labels may contain spaces.
20    ///
21    /// The following file has 3 taxa: `"taxon 1"`, `"taxon 2"`, and `"taxon 3"`:
22    /// ```{txt}
23    /// 3
24    /// taxon 1    0 1 2
25    /// taxon 2    1 0 1
26    /// taxon 3    2 1 0
27    /// ```
28    Strict,
29
30    /// The relaxed dialect. Taxon labels may not contain spaces, but may be of
31    /// any length. They are separated from the distace data by any number of
32    /// spaces.
33    ///
34    /// The following file has 3 taxa `"taxon"`, `"taxon_very_long"`, and
35    /// `"taxon-other-delim"`:
36    /// ```{txt}
37    /// 3
38    /// taxon 0 1 2
39    /// taxon_very_long    1 0 1
40    /// taxon-other-delim 2 1 0
41    /// ```
42    Relaxed,
43}
44
45/// An error in parsing the distance matrix file.
46#[derive(Error, Debug)]
47pub enum PhylipError {
48    /// An underlying I/O error occurred.
49    #[error("unable to read distance matrix file")]
50    Io(#[from] std::io::Error),
51
52    #[error("unable to read header row with taxa count")]
53    Header,
54
55    #[error("row missing label")]
56    Label,
57
58    #[error("matrix row {0} had at least {1} entries when {2} were expected")]
59    Extra(usize, usize, usize),
60
61    #[error("reached end of file while expecting {0} more matrix rows")]
62    RowsTruncated(usize),
63
64    /// Unable to parse a numeric value.
65    #[error("expected floating point number found `{0}': {1}")]
66    Numeric(String, std::num::ParseFloatError),
67}
68
69/// Parse a distance matrix in a square format.
70pub fn parse<R: Read>(reader: R, dialect: PhylipDialect) -> Result<SquareMatrix<f32>, PhylipError> {
71    let (labels, data, size) = parse_impl(reader, dialect, false)?;
72    let labels = Some(labels);
73    let matrix = SquareMatrix { data, size, labels };
74    Ok(matrix)
75}
76
77/// Parse a distance matrix where only the lower triangle is represented.
78pub fn parse_lt<R: Read>(
79    reader: R,
80    dialect: PhylipDialect,
81) -> Result<DistMatrix<f32>, PhylipError> {
82    let (labels, data, size) = parse_impl(reader, dialect, true)?;
83    let labels = Some(labels);
84    let matrix = DistMatrix { data, size, labels };
85    Ok(matrix)
86}
87
88fn parse_impl<R: Read>(
89    reader: R,
90    dialect: PhylipDialect,
91    lower_triangle: bool,
92) -> Result<(Vec<String>, Vec<f32>, usize), PhylipError> {
93    let mut br = BufReader::new(reader);
94    let mut buf = String::new();
95
96    br.read_line(&mut buf)?;
97    let size: usize = buf.trim().parse().map_err(|_| PhylipError::Header)?;
98
99    let mut labels = Vec::<String>::with_capacity(size);
100    let mut data = Vec::<f32>::with_capacity(size * size);
101    let mut expected_entries = 0;
102
103    loop {
104        buf.clear();
105        let n = br.read_line(&mut buf)?;
106        if n > 0 {
107            let n_read;
108
109            if expected_entries == 0 {
110                let (label, new_data) = parse_label(&buf, dialect)?;
111                expected_entries = if lower_triangle { labels.len() } else { size };
112                labels.push(label.to_owned());
113                n_read = parse_data(new_data, &mut data)?;
114            } else {
115                n_read = parse_data(&buf, &mut data)?;
116            }
117
118            if n_read > expected_entries {
119                return Err(PhylipError::Extra(
120                    labels.len(),
121                    expected_entries + n_read,
122                    size,
123                ));
124            } else {
125                expected_entries -= n_read;
126            }
127        } else {
128            // EOF
129            let remaining = size - labels.len();
130            if remaining > 0 {
131                return Err(PhylipError::RowsTruncated(remaining));
132            }
133
134            break;
135        }
136    }
137
138    if lower_triangle {
139        data = flip_order(&data, size);
140    }
141
142    Ok((labels, data, size))
143}
144
145fn parse_label(line: &str, dialect: PhylipDialect) -> Result<(&str, &str), PhylipError> {
146    match dialect {
147        PhylipDialect::Strict => {
148            if line.len() >= 10 {
149                Ok((line[0..10].trim_end(), line[10..].trim_start()))
150            } else {
151                Err(PhylipError::Label)
152            }
153        }
154
155        PhylipDialect::Relaxed => match line.split_once(|x| char::is_ascii_whitespace(&x)) {
156            Some((label, rest)) => Ok((label, rest.trim_start())),
157            None => Err(PhylipError::Label),
158        },
159    }
160}
161
162fn parse_data(line: &str, data: &mut Vec<f32>) -> Result<usize, PhylipError> {
163    let orig_size = data.len();
164
165    for number in line.trim_end().split_ascii_whitespace() {
166        data.push(
167            number
168                .parse()
169                .map_err(|e| PhylipError::Numeric(number.to_owned(), e))?,
170        );
171    }
172
173    Ok(data.len() - orig_size)
174}
175
176#[cfg(test)]
177mod tests {
178    use super::*;
179
180    fn expected_matrix() -> SquareMatrix<f32> {
181        let mut matrix: SquareMatrix<f32> = [
182            0.0000, 1.6866, 1.7198, 1.6606, 1.5243, 1.6043, 1.5905, 1.6866, 0.0000, 1.5232, 1.4841,
183            1.4465, 1.4389, 1.4629, 1.7198, 1.5232, 0.0000, 0.7115, 0.5958, 0.6179, 0.5583, 1.6606,
184            1.4841, 0.7115, 0.0000, 0.4631, 0.5061, 0.4710, 1.5243, 1.4465, 0.5958, 0.4631, 0.0000,
185            0.3484, 0.3083, 1.6043, 1.4389, 0.6179, 0.5061, 0.3484, 0.0000, 0.2692, 1.5905, 1.4629,
186            0.5583, 0.4710, 0.3083, 0.2692, 0.0000,
187        ]
188        .into_iter()
189        .collect();
190        matrix.set_labels(vec![
191            "Bovine".to_owned(),
192            "Mouse".to_owned(),
193            "Gibbon".to_owned(),
194            "Orang".to_owned(),
195            "Gorilla".to_owned(),
196            "Chimp".to_owned(),
197            "Human".to_owned(),
198        ]);
199        matrix
200    }
201
202    #[test]
203    fn test_square() {
204        let f = include_bytes!("../../tests/phylip/square.dist");
205
206        let matrix = parse(f.as_slice(), PhylipDialect::Strict).unwrap();
207        assert_eq!(matrix, expected_matrix());
208
209        let matrix = parse(f.as_slice(), PhylipDialect::Relaxed).unwrap();
210        assert_eq!(matrix, expected_matrix());
211    }
212
213    #[test]
214    fn test_square_multiline() {
215        let f = include_bytes!("../../tests/phylip/square_multiline.dist");
216        let matrix = parse(f.as_slice(), PhylipDialect::Relaxed).unwrap();
217        assert_eq!(matrix, expected_matrix());
218    }
219
220    #[test]
221    fn test_lower_triangle() {
222        let f = include_bytes!("../../tests/phylip/lower.dist");
223        let matrix = parse_lt(f.as_slice(), PhylipDialect::Relaxed).unwrap();
224        assert_eq!(matrix, expected_matrix().lower_triangle());
225    }
226
227    #[test]
228    fn test_lower_triangle_multiline() {
229        let f = include_bytes!("../../tests/phylip/lower_multiline.dist");
230        let matrix = parse_lt(f.as_slice(), PhylipDialect::Strict).unwrap();
231
232        let mut m_exp: DistMatrix<f32> = [
233            1.7043, 2.0235, 2.1378, 1.5232, 1.8261, 1.9182, 2.0039, 1.9431, 1.9663, 2.0593, 1.6664,
234            1.732, 1.7101, 1.1901, 1.3287, 1.2423, 1.2508, 1.2536, 1.3066, 1.2827, 1.3296, 1.2005,
235            1.346, 1.3757, 1.3956, 1.2905, 1.3199, 1.3887, 1.4658, 1.4826, 1.4502, 1.8708, 1.5356,
236            1.4577, 1.7803, 1.6661, 1.7878, 1.3137, 1.3788, 1.3826, 1.4543, 1.6683, 1.6606, 1.5935,
237            1.7119, 1.7599, 1.0642, 1.1124, 0.9832, 1.0629, 0.9228, 1.0681, 0.9127, 1.0635, 1.0557,
238            0.1022, 0.2061, 0.3895, 0.8035, 0.7239, 0.7278, 0.7899, 0.6933, 0.2681, 0.393, 0.7109,
239            0.729, 0.7412, 0.8742, 0.7118, 0.3665, 0.8132, 0.7894, 0.8763, 0.8868, 0.7589, 0.7858,
240            0.714, 0.7966, 0.8288, 0.8542, 0.7095, 0.5959, 0.6213, 0.5612, 0.4604, 0.5065, 0.47,
241            0.3502, 0.3097, 0.2712,
242        ]
243        .into_iter()
244        .collect();
245        m_exp.set_labels(vec![
246            "Mouse".to_owned(),
247            "Bovine".to_owned(),
248            "Lemur".to_owned(),
249            "Tarsier".to_owned(),
250            "Squir Monk".to_owned(),
251            "Jpn Macaq".to_owned(),
252            "Rhesus Mac".to_owned(),
253            "Crab-E.Mac".to_owned(),
254            "BarbMacaq".to_owned(),
255            "Gibbon".to_owned(),
256            "Orang".to_owned(),
257            "Gorilla".to_owned(),
258            "Chimp".to_owned(),
259            "Human".to_owned(),
260        ]);
261
262        assert_eq!(matrix, m_exp);
263    }
264
265    #[test]
266    fn test_label_parsing() {
267        assert_eq!(
268            parse_label("1234 67890 0.12345", PhylipDialect::Strict).unwrap(),
269            ("1234 67890", "0.12345")
270        );
271        assert_eq!(
272            parse_label("1234 67890 0.12345", PhylipDialect::Relaxed).unwrap(),
273            ("1234", "67890 0.12345")
274        );
275        assert_eq!(
276            parse_label("1234       0.12345", PhylipDialect::Strict).unwrap(),
277            ("1234", "0.12345")
278        );
279        assert_eq!(
280            parse_label("1234567 0.12345", PhylipDialect::Strict).unwrap(),
281            ("1234567 0.", "12345")
282        );
283        assert_eq!(
284            parse_label("1234567 0.12345", PhylipDialect::Relaxed).unwrap(),
285            ("1234567", "0.12345")
286        );
287        assert!(parse_label("12345 0.1", PhylipDialect::Strict).is_err());
288    }
289}