use thiserror::Error;
use crate::model::{Alignment, Sequence};
/// Errors reported while parsing PHYLIP-formatted alignment text.
///
/// The `#[error(...)]` strings are the user-facing messages produced through
/// the derived `Display` implementation.
#[derive(Error, Debug)]
pub enum PhylipError {
/// The input contains no lines, or every line is blank.
#[error("Empty PHYLIP file")]
EmptyFile,
/// The header line has fewer than two whitespace-separated fields; carries
/// the trimmed header text.
#[error("Invalid header: expected 'ntax nchar' (two integers), got '{0}'")]
InvalidHeader(String),
/// The first header field could not be parsed as an unsigned integer, or it
/// was zero; carries the offending text.
#[error("Invalid sequence count in header: '{0}' is not a valid number")]
InvalidSequenceCount(String),
/// The second header field could not be parsed as an unsigned integer;
/// carries the offending text.
#[error("Invalid sequence length in header: '{0}' is not a valid number")]
InvalidSequenceLength(String),
/// Declared versus parsed sequence count disagree.
// NOTE(review): not constructed anywhere in the visible part of this file.
#[error("Expected {expected} sequences but found {found}")]
SequenceCountMismatch { expected: usize, found: usize },
/// A sequence's length differs from the expected length.
// NOTE(review): not constructed anywhere in the visible part of this file.
#[error("Sequence '{name}' has length {found}, expected {expected}")]
SequenceLengthMismatch {
name: String,
expected: usize,
found: usize,
},
/// Everything after the header is blank, or no sequence record could be
/// collected from it.
#[error("No sequence data found after header")]
NoSequenceData,
/// An interleaved block lacks data for the named sequence.
// NOTE(review): not constructed anywhere in the visible part of this file.
#[error("Missing sequence data for '{0}' in interleaved block")]
MissingInterleavedData(String),
/// Generic parse failure tied to a line number.
// NOTE(review): not constructed anywhere in the visible part of this file.
#[error("Line {line}: {message}")]
ParseError { line: usize, message: String },
}
/// Convenience alias for results whose error type is [`PhylipError`].
pub type PhylipResult<T> = Result<T, PhylipError>;
/// Parses PHYLIP-formatted text into an [`Alignment`].
///
/// The first non-blank line is taken as the header and must start with two
/// whitespace-separated unsigned integers, `ntax nchar`; any further header
/// fields are ignored. Every line after the header is handed to
/// `parse_phylip_data`.
///
/// # Errors
///
/// * [`PhylipError::EmptyFile`] – the input has no non-blank line.
/// * [`PhylipError::InvalidHeader`] – the header has fewer than two fields.
/// * [`PhylipError::InvalidSequenceCount`] – `ntax` is not a number, or is `0`.
/// * [`PhylipError::InvalidSequenceLength`] – `nchar` is not a number.
/// * [`PhylipError::NoSequenceData`] – nothing but blank lines follows the
///   header.
/// * Any error returned by `parse_phylip_data`.
pub fn parse_phylip_str(content: &str) -> PhylipResult<Alignment> {
    let mut remaining = content.lines();

    // `find` on `by_ref()` consumes lines up to and including the header, so
    // `remaining` afterwards yields exactly the lines that follow it.
    let header = remaining
        .by_ref()
        .map(str::trim)
        .find(|text| !text.is_empty())
        .ok_or(PhylipError::EmptyFile)?;

    let mut fields = header.split_whitespace();
    let (ntax_field, nchar_field) = match (fields.next(), fields.next()) {
        (Some(first), Some(second)) => (first, second),
        _ => return Err(PhylipError::InvalidHeader(header.to_string())),
    };

    // Both fields are parsed before the zero check so that the order in which
    // errors are reported matches the order of the header fields.
    let ntax = ntax_field
        .parse::<usize>()
        .map_err(|_| PhylipError::InvalidSequenceCount(ntax_field.to_string()))?;
    let nchar = nchar_field
        .parse::<usize>()
        .map_err(|_| PhylipError::InvalidSequenceLength(nchar_field.to_string()))?;
    if ntax == 0 {
        return Err(PhylipError::InvalidSequenceCount("0".to_string()));
    }

    let data_lines: Vec<&str> = remaining.collect();
    let has_data = data_lines.iter().any(|text| !text.trim().is_empty());
    if !has_data {
        return Err(PhylipError::NoSequenceData);
    }

    let sequences = parse_phylip_data(&data_lines, ntax, nchar)?;
    Ok(Alignment::new(sequences))
}
/// Splits one data line into an optional sequence name and its sequence text.
///
/// The line is trimmed, then three interpretations are tried in order:
///
/// 1. **Strict layout** – the first 10 *characters* are the name field and the
///    remainder is sequence data. Accepted only when the trimmed name is a
///    single word and the remainder is non-empty, valid sequence data.
/// 2. **Relaxed layout** – the name is everything before the first whitespace
///    and the remainder is sequence data (same validity requirement).
/// 3. **Continuation** – the whole line is sequence data with no name.
///
/// Whitespace inside sequence data is removed. If no interpretation fits, the
/// whole trimmed line is returned as the name with an empty sequence; a blank
/// line yields `(None, "")`.
///
/// Because any ASCII letter counts as a sequence symbol, an unnamed line longer
/// than 10 characters is indistinguishable from a strict `name + data` line and
/// is reported as such; callers that know a line is a continuation must account
/// for that.
fn split_name_and_sequence(line: &str) -> (Option<String>, String) {
    /// Width of the name field in the strict layout, counted in characters.
    const STRICT_NAME_WIDTH: usize = 10;

    let line = line.trim();
    if line.is_empty() {
        return (None, String::new());
    }

    // Locate the byte offset of the character that follows the name field.
    // Counting characters rather than bytes keeps the split on a UTF-8
    // boundary: slicing at a fixed byte offset panics when that offset falls
    // inside a multi-byte character, and otherwise lands on the wrong column.
    // A line of exactly `STRICT_NAME_WIDTH` characters has no data after the
    // name field, so it is correctly skipped here.
    if let Some((data_start, _)) = line.char_indices().nth(STRICT_NAME_WIDTH) {
        let (name_field, data_field) = line.split_at(data_start);
        let name = name_field.trim();
        if name.split_whitespace().count() == 1 {
            if let Some(seq) = compact_sequence(data_field) {
                return (Some(name.to_string()), seq);
            }
        }
    }

    if let Some(space_idx) = line.find(char::is_whitespace) {
        // `line` is trimmed, so the text before the first whitespace is a
        // non-empty, whitespace-free token.
        let (name, rest) = line.split_at(space_idx);
        if let Some(seq) = compact_sequence(rest) {
            return (Some(name.to_string()), seq);
        }
    }

    match compact_sequence(line) {
        Some(seq) => (None, seq),
        None => (Some(line.to_string()), String::new()),
    }
}

/// Removes all whitespace from `text` and returns the result only if it is
/// non-empty and consists solely of characters accepted by
/// [`is_sequence_char`].
fn compact_sequence(text: &str) -> Option<String> {
    let compact: String = text.chars().filter(|c| !c.is_whitespace()).collect();
    if !compact.is_empty() && compact.chars().all(is_sequence_char) {
        Some(compact)
    } else {
        None
    }
}

/// Returns `true` for characters accepted as sequence data: any ASCII letter,
/// or one of `-`, `.`, `*`, `?`.
fn is_sequence_char(c: char) -> bool {
    c.is_ascii_alphabetic() || matches!(c, '-' | '.' | '*' | '?')
}
fn parse_phylip_data(
lines: &[&str],
ntax: usize,
nchar: usize,
) -> PhylipResult<Vec<Sequence>> {
let mut sequences: Vec<(String, Vec<u8>)> = Vec::with_capacity(ntax);
let mut in_interleaved_continuation = false;
let mut interleaved_idx = 0;
for line in lines {
let trimmed = line.trim();
if trimmed.is_empty() {
if sequences.len() == ntax {
in_interleaved_continuation = true;
interleaved_idx = 0;
}
continue;
}
let (name, seq) = split_name_and_sequence(trimmed);
if in_interleaved_continuation {
if interleaved_idx < sequences.len() {
sequences[interleaved_idx].1.extend(seq.into_bytes());
interleaved_idx += 1;
}
} else if let Some(n) = name {
if sequences.len() < ntax {
sequences.push((n, seq.into_bytes()));
} else {
if let Some(pos) = sequences.iter().position(|(sn, _)| sn == &n) {
sequences[pos].1.extend(seq.into_bytes());
}
}
} else if !seq.is_empty() {
if !sequences.is_empty() {
sequences.last_mut().unwrap().1.extend(seq.into_bytes());
}
}
if sequences.len() == ntax && nchar > 0 {
let all_complete = sequences.iter().all(|(_, data)| data.len() >= nchar);
if all_complete {
break;
}
}
}
if sequences.is_empty() {
return Err(PhylipError::NoSequenceData);
}
for (_, data) in &mut sequences {
data.shrink_to_fit();
}
let mut result: Vec<Sequence> = sequences
.into_iter()
.map(|(name, data)| Sequence::from_bytes(name, data))
.collect();
result.shrink_to_fit();
Ok(result)
}
// Fixtures are assembled with `concat!` and explicit `\n` so that their exact
// content (including blank separator lines) does not depend on how this source
// file is indented or reflowed.
#[cfg(test)]
mod tests {
    use super::*;

    /// One line per sequence, all data on the name line.
    #[test]
    fn test_parse_sequential_simple() {
        let content = concat!(
            " 3 10\n",
            "Seq1 ACGTACGTAC\n",
            "Seq2 TGCATGCATG\n",
            "Seq3 AAAACCCCGG\n",
        );
        let alignment = parse_phylip_str(content).unwrap();
        assert_eq!(alignment.sequence_count(), 3);
        assert_eq!(alignment.get(0).unwrap().id, "Seq1");
        assert_eq!(alignment.get(0).unwrap().as_str(), "ACGTACGTAC");
        assert_eq!(alignment.get(1).unwrap().id, "Seq2");
        assert_eq!(alignment.get(2).unwrap().id, "Seq3");
    }

    /// Sequential layout where each sequence wraps onto an unnamed line.
    #[test]
    fn test_parse_sequential_multiline() {
        let content = concat!(
            " 2 20\n",
            "Seq1 ACGTACGTAC\n",
            "GGGGGGGGGG\n",
            "Seq2 TGCATGCATG\n",
            "CCCCCCCCCC\n",
        );
        let alignment = parse_phylip_str(content).unwrap();
        assert_eq!(alignment.sequence_count(), 2);
        assert_eq!(alignment.get(0).unwrap().as_str(), "ACGTACGTACGGGGGGGGGG");
        assert_eq!(alignment.get(1).unwrap().as_str(), "TGCATGCATGCCCCCCCCCC");
    }

    /// Interleaved layout: a named first block, then an unnamed second block.
    #[test]
    fn test_parse_interleaved() {
        // The blank line between the blocks is required: the parser only
        // starts distributing unnamed lines across the sequences in order
        // after it has seen a blank line. Without it, every unnamed line
        // would be appended to the last sequence.
        let content = concat!(
            " 3 20\n",
            "Seq1 ACGTACGTAC\n",
            "Seq2 TGCATGCATG\n",
            "Seq3 AAAACCCCGG\n",
            "\n",
            "GGGGGGGGGG\n",
            "CCCCCCCCCC\n",
            "TTTTTTTTTT\n",
        );
        let alignment = parse_phylip_str(content).unwrap();
        assert_eq!(alignment.sequence_count(), 3);
        assert_eq!(alignment.get(0).unwrap().as_str(), "ACGTACGTACGGGGGGGGGG");
        assert_eq!(alignment.get(1).unwrap().as_str(), "TGCATGCATGCCCCCCCCCC");
        assert_eq!(alignment.get(2).unwrap().as_str(), "AAAACCCCGGTTTTTTTTTT");
    }

    /// Names separated from the data by a single space (relaxed layout).
    #[test]
    fn test_parse_relaxed_names() {
        let content = concat!(
            "3 10\n",
            "seq1 ACGTACGTAC\n",
            "seq2 TGCATGCATG\n",
            "seq3 AAAACCCCGG\n",
        );
        let alignment = parse_phylip_str(content).unwrap();
        assert_eq!(alignment.sequence_count(), 3);
        assert_eq!(alignment.get(0).unwrap().id, "seq1");
    }

    /// `-` characters are kept as part of the sequence data.
    #[test]
    fn test_parse_with_gaps() {
        let content = concat!(
            " 2 10\n",
            "Seq1 ACGT--GTAC\n",
            "Seq2 TG--TGCATG\n",
        );
        let alignment = parse_phylip_str(content).unwrap();
        assert_eq!(alignment.get(0).unwrap().as_str(), "ACGT--GTAC");
        assert_eq!(alignment.get(1).unwrap().as_str(), "TG--TGCATG");
    }

    #[test]
    fn test_empty_file() {
        let content = "";
        assert!(matches!(
            parse_phylip_str(content),
            Err(PhylipError::EmptyFile)
        ));
    }

    /// Non-numeric and single-field headers are both rejected.
    #[test]
    fn test_invalid_header() {
        let content = concat!("not a valid header\n", "Seq1 ACGT\n");
        let result = parse_phylip_str(content);
        assert!(result.is_err());

        let content2 = concat!("invalid\n", "Seq1 ACGT\n");
        assert!(matches!(
            parse_phylip_str(content2),
            Err(PhylipError::InvalidHeader(_))
        ));
    }

    /// Fewer sequences than the header declares is tolerated: the parser
    /// returns what it found instead of failing.
    #[test]
    fn test_too_few_sequences() {
        let content = concat!(
            " 3 10\n",
            "Seq1 ACGTACGTAC\n",
            "Seq2 TGCATGCATG\n",
        );
        let result = parse_phylip_str(content);
        assert!(result.is_ok());
        assert_eq!(result.unwrap().sequence_count(), 2);
    }
}