1use std::io::Read;
2use std::io::{BufRead, BufReader};
3use thiserror::Error;
4
5use crate::symmetric::flip_order;
6use crate::{DistMatrix, SquareMatrix};
7
8#[derive(Clone, Copy, Debug)]
15pub enum PhylipDialect {
16 Strict,
29
30 Relaxed,
43}
44
45#[derive(Error, Debug)]
47pub enum PhylipError {
48 #[error("unable to read distance matrix file")]
50 Io(#[from] std::io::Error),
51
52 #[error("unable to read header row with taxa count")]
53 Header,
54
55 #[error("row missing label")]
56 Label,
57
58 #[error("matrix row {0} had at least {1} entries when {2} were expected")]
59 Extra(usize, usize, usize),
60
61 #[error("reached end of file while expecting {0} more matrix rows")]
62 RowsTruncated(usize),
63
64 #[error("expected floating point number found `{0}': {1}")]
66 Numeric(String, std::num::ParseFloatError),
67}
68
69pub fn parse<R: Read>(reader: R, dialect: PhylipDialect) -> Result<SquareMatrix<f32>, PhylipError> {
71 let (labels, data, size) = parse_impl(reader, dialect, false)?;
72 let labels = Some(labels);
73 let matrix = SquareMatrix { data, size, labels };
74 Ok(matrix)
75}
76
77pub fn parse_lt<R: Read>(
79 reader: R,
80 dialect: PhylipDialect,
81) -> Result<DistMatrix<f32>, PhylipError> {
82 let (labels, data, size) = parse_impl(reader, dialect, true)?;
83 let labels = Some(labels);
84 let matrix = DistMatrix { data, size, labels };
85 Ok(matrix)
86}
87
88fn parse_impl<R: Read>(
89 reader: R,
90 dialect: PhylipDialect,
91 lower_triangle: bool,
92) -> Result<(Vec<String>, Vec<f32>, usize), PhylipError> {
93 let mut br = BufReader::new(reader);
94 let mut buf = String::new();
95
96 br.read_line(&mut buf)?;
97 let size: usize = buf.trim().parse().map_err(|_| PhylipError::Header)?;
98
99 let mut labels = Vec::<String>::with_capacity(size);
100 let mut data = Vec::<f32>::with_capacity(size * size);
101 let mut expected_entries = 0;
102
103 loop {
104 buf.clear();
105 let n = br.read_line(&mut buf)?;
106 if n > 0 {
107 let n_read;
108
109 if expected_entries == 0 {
110 let (label, new_data) = parse_label(&buf, dialect)?;
111 expected_entries = if lower_triangle { labels.len() } else { size };
112 labels.push(label.to_owned());
113 n_read = parse_data(new_data, &mut data)?;
114 } else {
115 n_read = parse_data(&buf, &mut data)?;
116 }
117
118 if n_read > expected_entries {
119 return Err(PhylipError::Extra(
120 labels.len(),
121 expected_entries + n_read,
122 size,
123 ));
124 } else {
125 expected_entries -= n_read;
126 }
127 } else {
128 let remaining = size - labels.len();
130 if remaining > 0 {
131 return Err(PhylipError::RowsTruncated(remaining));
132 }
133
134 break;
135 }
136 }
137
138 if lower_triangle {
139 data = flip_order(&data, size);
140 }
141
142 Ok((labels, data, size))
143}
144
145fn parse_label(line: &str, dialect: PhylipDialect) -> Result<(&str, &str), PhylipError> {
146 match dialect {
147 PhylipDialect::Strict => {
148 if line.len() >= 10 {
149 Ok((line[0..10].trim_end(), line[10..].trim_start()))
150 } else {
151 Err(PhylipError::Label)
152 }
153 }
154
155 PhylipDialect::Relaxed => match line.split_once(|x| char::is_ascii_whitespace(&x)) {
156 Some((label, rest)) => Ok((label, rest.trim_start())),
157 None => Err(PhylipError::Label),
158 },
159 }
160}
161
162fn parse_data(line: &str, data: &mut Vec<f32>) -> Result<usize, PhylipError> {
163 let orig_size = data.len();
164
165 for number in line.trim_end().split_ascii_whitespace() {
166 data.push(
167 number
168 .parse()
169 .map_err(|e| PhylipError::Numeric(number.to_owned(), e))?,
170 );
171 }
172
173 Ok(data.len() - orig_size)
174}
175
176#[cfg(test)]
177mod tests {
178 use super::*;
179
180 fn expected_matrix() -> SquareMatrix<f32> {
181 let mut matrix: SquareMatrix<f32> = [
182 0.0000, 1.6866, 1.7198, 1.6606, 1.5243, 1.6043, 1.5905, 1.6866, 0.0000, 1.5232, 1.4841,
183 1.4465, 1.4389, 1.4629, 1.7198, 1.5232, 0.0000, 0.7115, 0.5958, 0.6179, 0.5583, 1.6606,
184 1.4841, 0.7115, 0.0000, 0.4631, 0.5061, 0.4710, 1.5243, 1.4465, 0.5958, 0.4631, 0.0000,
185 0.3484, 0.3083, 1.6043, 1.4389, 0.6179, 0.5061, 0.3484, 0.0000, 0.2692, 1.5905, 1.4629,
186 0.5583, 0.4710, 0.3083, 0.2692, 0.0000,
187 ]
188 .into_iter()
189 .collect();
190 matrix.set_labels(vec![
191 "Bovine".to_owned(),
192 "Mouse".to_owned(),
193 "Gibbon".to_owned(),
194 "Orang".to_owned(),
195 "Gorilla".to_owned(),
196 "Chimp".to_owned(),
197 "Human".to_owned(),
198 ]);
199 matrix
200 }
201
202 #[test]
203 fn test_square() {
204 let f = include_bytes!("../../tests/phylip/square.dist");
205
206 let matrix = parse(f.as_slice(), PhylipDialect::Strict).unwrap();
207 assert_eq!(matrix, expected_matrix());
208
209 let matrix = parse(f.as_slice(), PhylipDialect::Relaxed).unwrap();
210 assert_eq!(matrix, expected_matrix());
211 }
212
213 #[test]
214 fn test_square_multiline() {
215 let f = include_bytes!("../../tests/phylip/square_multiline.dist");
216 let matrix = parse(f.as_slice(), PhylipDialect::Relaxed).unwrap();
217 assert_eq!(matrix, expected_matrix());
218 }
219
220 #[test]
221 fn test_lower_triangle() {
222 let f = include_bytes!("../../tests/phylip/lower.dist");
223 let matrix = parse_lt(f.as_slice(), PhylipDialect::Relaxed).unwrap();
224 assert_eq!(matrix, expected_matrix().lower_triangle());
225 }
226
227 #[test]
228 fn test_lower_triangle_multiline() {
229 let f = include_bytes!("../../tests/phylip/lower_multiline.dist");
230 let matrix = parse_lt(f.as_slice(), PhylipDialect::Strict).unwrap();
231
232 let mut m_exp: DistMatrix<f32> = [
233 1.7043, 2.0235, 2.1378, 1.5232, 1.8261, 1.9182, 2.0039, 1.9431, 1.9663, 2.0593, 1.6664,
234 1.732, 1.7101, 1.1901, 1.3287, 1.2423, 1.2508, 1.2536, 1.3066, 1.2827, 1.3296, 1.2005,
235 1.346, 1.3757, 1.3956, 1.2905, 1.3199, 1.3887, 1.4658, 1.4826, 1.4502, 1.8708, 1.5356,
236 1.4577, 1.7803, 1.6661, 1.7878, 1.3137, 1.3788, 1.3826, 1.4543, 1.6683, 1.6606, 1.5935,
237 1.7119, 1.7599, 1.0642, 1.1124, 0.9832, 1.0629, 0.9228, 1.0681, 0.9127, 1.0635, 1.0557,
238 0.1022, 0.2061, 0.3895, 0.8035, 0.7239, 0.7278, 0.7899, 0.6933, 0.2681, 0.393, 0.7109,
239 0.729, 0.7412, 0.8742, 0.7118, 0.3665, 0.8132, 0.7894, 0.8763, 0.8868, 0.7589, 0.7858,
240 0.714, 0.7966, 0.8288, 0.8542, 0.7095, 0.5959, 0.6213, 0.5612, 0.4604, 0.5065, 0.47,
241 0.3502, 0.3097, 0.2712,
242 ]
243 .into_iter()
244 .collect();
245 m_exp.set_labels(vec![
246 "Mouse".to_owned(),
247 "Bovine".to_owned(),
248 "Lemur".to_owned(),
249 "Tarsier".to_owned(),
250 "Squir Monk".to_owned(),
251 "Jpn Macaq".to_owned(),
252 "Rhesus Mac".to_owned(),
253 "Crab-E.Mac".to_owned(),
254 "BarbMacaq".to_owned(),
255 "Gibbon".to_owned(),
256 "Orang".to_owned(),
257 "Gorilla".to_owned(),
258 "Chimp".to_owned(),
259 "Human".to_owned(),
260 ]);
261
262 assert_eq!(matrix, m_exp);
263 }
264
265 #[test]
266 fn test_label_parsing() {
267 assert_eq!(
268 parse_label("1234 67890 0.12345", PhylipDialect::Strict).unwrap(),
269 ("1234 67890", "0.12345")
270 );
271 assert_eq!(
272 parse_label("1234 67890 0.12345", PhylipDialect::Relaxed).unwrap(),
273 ("1234", "67890 0.12345")
274 );
275 assert_eq!(
276 parse_label("1234 0.12345", PhylipDialect::Strict).unwrap(),
277 ("1234", "0.12345")
278 );
279 assert_eq!(
280 parse_label("1234567 0.12345", PhylipDialect::Strict).unwrap(),
281 ("1234567 0.", "12345")
282 );
283 assert_eq!(
284 parse_label("1234567 0.12345", PhylipDialect::Relaxed).unwrap(),
285 ("1234567", "0.12345")
286 );
287 assert!(parse_label("12345 0.1", PhylipDialect::Strict).is_err());
288 }
289}