1use std::io::Read;
2use std::io::{BufRead, BufReader};
3use thiserror::Error;
4
5use crate::builder::{DataError, DistBuilder};
6use crate::symmetric::flip_order;
7use crate::{DistMatrix, SquareMatrix};
8
/// Column delimiter used when splitting the lines of a tabular distance file.
#[derive(Clone, Copy, Debug)]
pub enum Separator {
    /// Columns are delimited by every occurrence of this single character
    /// (for example `'\t'` or `','`); adjacent delimiters yield empty fields.
    Char(char),

    /// Columns are delimited by runs of ASCII whitespace.
    Whitespace,
}
17
/// Layout of a tabular distance file.
///
/// Derives `Clone`, `Copy` and `Debug` so the shape can be passed around and
/// logged just like [`Separator`].
#[derive(Clone, Copy, Debug)]
pub enum TabularShape {
    /// A header row of labels (preceded by one leading column that is ignored)
    /// followed by one row per label; each row starts with its label and then
    /// carries one integer distance per header label.
    Wide,

    /// One record per line with exactly three columns: first label, second
    /// label, integer distance. A first line whose third column is not an
    /// integer is treated as a header and skipped.
    Long,
}
68
69#[derive(Error, Debug)]
70pub enum TabularError {
71 #[error("unable to read distance matrix file")]
73 Io(#[from] std::io::Error),
74
75 #[error("unable to read header row with taxa labels")]
76 Header,
77
78 #[error("the file contained no data (empty or header had no delimeters)")]
79 NoData,
80
81 #[error("matrix row {0} (label '{1}') had {2} entries when {3} were expected")]
82 RowWidth(usize, String, usize, usize),
83
84 #[error("expected 3 columns: {0}")]
85 ColsTruncated(String),
86
87 #[error("matrix row {0} had label '{1}' but '{2}' was expected")]
88 RowOrder(usize, String, String),
89
90 #[error("row did not start with a label: {0}")]
91 Label(String),
92
93 #[error("reached end of file while expecting {0} more matrix rows")]
94 RowsTruncated(usize),
95
96 #[error("data has incorrect shape")]
97 Data(#[from] DataError),
98
99 #[error("expected integer found `{0}': {1}")]
101 Numeric(String, std::num::ParseIntError),
102}
103
104pub fn parse<R: Read>(
106 reader: R,
107 separator: Separator,
108 shape: TabularShape,
109) -> Result<SquareMatrix<u32>, TabularError> {
110 let (labels, data, size) = match shape {
111 TabularShape::Wide => parse_wide(reader, separator)?,
112 TabularShape::Long => parse_long(reader, separator, false)?,
113 };
114 let labels = Some(labels);
115 let matrix = SquareMatrix { data, size, labels };
116 Ok(matrix)
117}
118
119pub fn parse_lt<R: Read>(reader: R, separator: Separator) -> Result<DistMatrix<u32>, TabularError> {
121 let (labels, data, size) = parse_long(reader, separator, true)?;
122 let labels = Some(labels);
123 let data = flip_order(&data, size);
124 let matrix = DistMatrix { data, size, labels };
125 Ok(matrix)
126}
127
128fn parse_wide<R: Read>(
129 reader: R,
130 separator: Separator,
131) -> Result<(Vec<String>, Vec<u32>, usize), TabularError> {
132 let labels;
133 let mut data;
134
135 {
136 let mut br = BufReader::new(reader);
137 let mut buf = String::new();
138
139 br.read_line(&mut buf).map_err(|_| TabularError::Header)?;
141 let (_, rest) = separator.split_label(&buf)?;
142 labels = separator.split_str(rest.trim_end());
143 if labels.is_empty() {
144 return Err(TabularError::NoData);
145 }
146 data = Vec::with_capacity(labels.len() * labels.len());
147
148 let mut row = 0;
149
150 loop {
151 row += 1;
152 buf.clear();
153 let n = br.read_line(&mut buf)?;
154 if n > 0 {
155 let (label, rest) = separator.split_label(&buf)?;
156 if label != labels[row - 1] {
157 return Err(TabularError::RowOrder(
158 row,
159 label.to_owned(),
160 labels[row - 1].clone(),
161 ));
162 }
163
164 let n_read = separator.split_u32(rest.trim_end(), &mut data)?;
165 if n_read != labels.len() {
166 return Err(TabularError::RowWidth(
167 row,
168 label.to_owned(),
169 n_read,
170 labels.len(),
171 ));
172 }
173 } else {
174 break; }
176 }
177
178 if row < labels.len() {
179 return Err(TabularError::RowsTruncated(labels.len() - row));
180 }
181 }
182
183 let size = labels.len();
184 Ok((labels, data, size))
185}
186
187fn parse_long<R: Read>(
188 reader: R,
189 separator: Separator,
190 lower_triangle: bool,
191) -> Result<(Vec<String>, Vec<u32>, usize), TabularError> {
192 let builder = parse_long_impl(reader, separator)?;
193 let labels = builder.labels.clone();
194 let size = labels.len();
195
196 if lower_triangle {
197 let matrix: DistMatrix<u32> = builder.try_into()?;
198 Ok((labels, matrix.data, size))
199 } else {
200 let matrix: SquareMatrix<u32> = builder.try_into()?;
201 Ok((labels, matrix.data, size))
202 }
203}
204
205fn parse_long_impl<R: Read>(
206 reader: R,
207 separator: Separator,
208) -> Result<DistBuilder<u32>, TabularError> {
209 let mut builder = DistBuilder::<u32>::new();
210
211 let mut br = BufReader::new(reader);
212 let mut buf = String::new();
213
214 let mut row = 0;
215 let mut header_seen = false;
216
217 loop {
218 row += 1;
219 buf.clear();
220 let n = br.read_line(&mut buf)?;
221 if n > 0 {
222 let parts = separator.split_3(buf.trim_end());
223 if row == 1 && !header_seen {
224 if let Err(TabularError::Numeric(_, _)) = parts {
225 row = 0;
226 header_seen = true;
227 continue;
228 }
229 }
230
231 let (name1, name2, distance) = parts?;
232 builder.add(name1, name2, distance)?;
233 } else {
234 break; }
236 }
237
238 Ok(builder)
239}
240
241impl Separator {
242 fn split_str(&self, line: &str) -> Vec<String> {
243 match self {
244 Separator::Char(c) => line.split(*c).map(str::to_owned).collect(),
245 Separator::Whitespace => line.split_ascii_whitespace().map(str::to_owned).collect(),
246 }
247 }
248
249 fn split_label<'a>(&self, line: &'a str) -> Result<(&'a str, &'a str), TabularError> {
250 match self {
251 Separator::Char(c) => line
252 .split_once(*c)
253 .ok_or_else(|| TabularError::Label(line.to_owned())),
254 Separator::Whitespace => {
255 let (label, rest) = line
256 .split_once(|x| char::is_ascii_whitespace(&x))
257 .ok_or_else(|| TabularError::Label(line.to_owned()))?;
258 Ok((label, rest.trim_start()))
259 }
260 }
261 }
262
263 fn split_u32(&self, line: &str, data: &mut Vec<u32>) -> Result<usize, TabularError> {
264 let orig_size = data.len();
265
266 match self {
267 Separator::Char(c) => {
268 for number in line.trim_end().split(*c) {
269 data.push(
270 number
271 .parse()
272 .map_err(|e| TabularError::Numeric(number.to_owned(), e))?,
273 );
274 }
275 }
276 Separator::Whitespace => {
277 for number in line.trim_end().split_ascii_whitespace() {
278 data.push(
279 number
280 .parse()
281 .map_err(|e| TabularError::Numeric(number.to_owned(), e))?,
282 );
283 }
284 }
285 }
286
287 Ok(data.len() - orig_size)
288 }
289
290 fn split_3<'a>(&self, line: &'a str) -> Result<(&'a str, &'a str, u32), TabularError> {
291 let (p1, p2, p3) = match self {
292 Separator::Char(c) => extract_3(line, line.split(*c))?,
293 Separator::Whitespace => extract_3(line, line.split_ascii_whitespace())?,
294 };
295
296 let p3 = p3
297 .parse()
298 .map_err(|e| TabularError::Numeric(p3.to_owned(), e))?;
299 Ok((p1, p2, p3))
300 }
301}
302
303fn extract_3<'a>(
304 line: &'a str,
305 mut splitter: impl Iterator<Item = &'a str>,
306) -> Result<(&'a str, &'a str, &'a str), TabularError> {
307 let p1 = splitter
308 .next()
309 .ok_or_else(|| TabularError::ColsTruncated(line.to_owned()))?;
310 let p2 = splitter
311 .next()
312 .ok_or_else(|| TabularError::ColsTruncated(line.to_owned()))?;
313 let p3 = splitter
314 .next()
315 .ok_or_else(|| TabularError::ColsTruncated(line.to_owned()))?;
316 if splitter.next().is_some() {
317 return Err(TabularError::ColsTruncated(line.to_owned()));
318 }
319 Ok((p1, p2, p3))
320}
321
#[cfg(test)]
mod tests {
    use super::*;

    /// Shape of the tuple returned by the internal parsers.
    type Parsed = (Vec<String>, Vec<u32>, usize);

    /// Labels shared by every fixture file.
    fn expected_labels() -> Vec<String> {
        ["seq1", "seq2", "seq3", "seq4"]
            .iter()
            .map(|&name| name.to_owned())
            .collect()
    }

    /// Full 4x4 matrix shared by the square fixtures, one matrix row per line.
    #[rustfmt::skip]
    fn expected_data() -> Vec<u32> {
        vec![
            0, 1, 2, 3,
            1, 0, 3, 4,
            2, 3, 0, 4,
            3, 4, 4, 0,
        ]
    }

    /// Unwraps a parse result and checks it against the shared square fixture.
    fn assert_square(result: Result<Parsed, TabularError>) {
        let (labels, data, _size) = result.unwrap();
        assert_eq!(labels, expected_labels());
        assert_eq!(data, expected_data());
    }

    #[test]
    fn test_wide() {
        let bytes = include_bytes!("../../tests/snp-dists/default.dat");
        assert_square(parse_wide(&bytes[..], Separator::Char('\t')));
    }

    #[test]
    fn test_version() {
        let bytes = include_bytes!("../../tests/snp-dists/version.dat");
        assert_square(parse_wide(&bytes[..], Separator::Char('\t')));
    }

    #[test]
    fn test_comma() {
        let bytes = include_bytes!("../../tests/snp-dists/comma.dat");
        assert_square(parse_wide(&bytes[..], Separator::Char(',')));
    }

    #[test]
    fn test_melt() {
        let bytes = include_bytes!("../../tests/snp-dists/melt.dat");
        assert_square(parse_long(&bytes[..], Separator::Char('\t'), false));
    }

    #[test]
    fn test_melt_comma() {
        let bytes = include_bytes!("../../tests/snp-dists/melt-comma.dat");
        assert_square(parse_long(&bytes[..], Separator::Char(','), false));
    }

    #[test]
    fn test_melt_lt() {
        let bytes = include_bytes!("../../tests/long_lt.dat");
        let (labels, data, _size) =
            parse_long(&bytes[..], Separator::Char('\t'), true).unwrap();
        assert_eq!(labels, expected_labels());
        assert_eq!(data, vec![1, 2, 3, 3, 4, 4]);
    }
}