use thiserror::Error;
use crate::model::{Alignment, Sequence};
/// Errors that can be reported while parsing NEXUS-formatted text.
///
/// The human-readable message of every variant is supplied by its
/// `#[error(...)]` attribute.
#[derive(Error, Debug)]
pub enum NexusError {
/// The first non-blank line does not start with the `#NEXUS` marker.
#[error("Not a NEXUS file (must start with #NEXUS)")]
NotNexus,
/// The input has no lines, or only blank lines.
#[error("Empty NEXUS file")]
EmptyFile,
/// No `DATA`/`CHARACTERS` block was found, or the block yielded no sequences.
#[error("No DATA or CHARACTERS block found")]
NoDataBlock,
/// The named block has no `DIMENSIONS` command.
#[error("Missing DIMENSIONS command in {block} block")]
MissingDimensions { block: String },
/// The named block has no `MATRIX` command.
#[error("Missing MATRIX command in {block} block")]
MissingMatrix { block: String },
/// The `DIMENSIONS` command is malformed; the payload describes the problem.
#[error("Invalid DIMENSIONS: {0}")]
InvalidDimensions(String),
/// `DIMENSIONS` does not specify `NTAX`.
#[error("NTAX not specified in DIMENSIONS")]
MissingNtax,
/// The number of sequences read differs from the declared `NTAX`.
#[error("Expected {expected} sequences (NTAX), found {found}")]
SequenceCountMismatch { expected: usize, found: usize },
/// A sequence's length differs from the declared `NCHAR`.
#[error("Sequence '{name}' has length {found}, expected {expected} (NCHAR)")]
SequenceLengthMismatch {
// Name of the offending sequence.
name: String,
// Length declared via NCHAR.
expected: usize,
// Length actually read.
found: usize,
},
/// The `MATRIX` command is not closed by a `;`.
#[error("Unterminated MATRIX (missing ';')")]
UnterminatedMatrix,
/// The same sequence name occurs more than once.
#[error("Duplicate sequence name: '{0}'")]
DuplicateName(String),
/// Generic parse failure carrying a line number and a free-form message.
#[error("Parse error at line {line}: {message}")]
ParseError { line: usize, message: String },
}
/// Shorthand for a `Result` whose error type is [`NexusError`].
pub type NexusResult<T> = Result<T, NexusError>;
/// Parses the text of a NEXUS file into an [`Alignment`].
///
/// The first non-blank line must start with `#NEXUS` (ASCII case-insensitive).
/// The first `DATA` or `CHARACTERS` block is then located and parsed.
///
/// # Errors
///
/// * [`NexusError::EmptyFile`] if the input has no non-blank line.
/// * [`NexusError::NotNexus`] if the first non-blank line lacks the `#NEXUS` marker.
/// * Any error returned by `find_data_block` or `parse_data_block`.
pub fn parse_nexus_str(content: &str) -> NexusResult<Alignment> {
    // A UTF-8 byte-order mark is not whitespace as far as `trim()` is
    // concerned, so without this a BOM-prefixed file fails the header check.
    let content = content.strip_prefix('\u{feff}').unwrap_or(content);

    let lines: Vec<&str> = content.lines().collect();
    let header = lines
        .iter()
        .map(|line| line.trim())
        .find(|line| !line.is_empty())
        .ok_or(NexusError::EmptyFile)?;

    // `get(..6)` is `None` when the header is shorter than the marker or the
    // cut would fall inside a multi-byte character; neither can be "#NEXUS".
    let has_marker = header
        .get(.."#NEXUS".len())
        .map_or(false, |prefix| prefix.eq_ignore_ascii_case("#NEXUS"));
    if !has_marker {
        return Err(NexusError::NotNexus);
    }

    let block_content = find_data_block(&lines)?;
    parse_data_block(&block_content)
}
/// Returns the body of the first `DATA` or `CHARACTERS` block.
///
/// The body is every line strictly between the `BEGIN <name>` line and the
/// closing `END`/`ENDBLOCK` command, each trimmed and joined with `\n`.
/// Anything following `BEGIN <name>;` on the same line is discarded. If the
/// block is never closed, everything up to the end of input is returned.
///
/// # Errors
///
/// Returns [`NexusError::NoDataBlock`] when no such block is opened.
fn find_data_block(lines: &[&str]) -> NexusResult<String> {
    /// True when `upper` is a `BEGIN DATA` / `BEGIN CHARACTERS` command.
    /// The block name is compared exactly, so `BEGIN METADATA;` is rejected.
    fn opens_data_block(upper: &str) -> bool {
        let rest = match upper.strip_prefix("BEGIN") {
            Some(rest) if rest.starts_with(char::is_whitespace) => rest.trim_start(),
            _ => return false,
        };
        let name = rest
            .split(|c: char| c.is_whitespace() || c == ';' || c == '[')
            .next()
            .unwrap_or("");
        name == "DATA" || name == "CHARACTERS"
    }

    /// True when `upper` is an `END`/`ENDBLOCK` command, as opposed to a
    /// matrix row whose taxon name merely starts with "END".
    fn closes_block(upper: &str) -> bool {
        let rest = upper
            .strip_prefix("ENDBLOCK")
            .or_else(|| upper.strip_prefix("END"));
        match rest {
            Some(rest) => {
                let rest = rest.trim_start();
                rest.is_empty() || rest.starts_with(';') || rest.starts_with('[')
            }
            None => false,
        }
    }

    let mut in_block = false;
    let mut block_lines: Vec<&str> = Vec::new();

    for line in lines {
        let trimmed = line.trim();
        // NEXUS keywords are ASCII, so ASCII upper-casing is sufficient.
        let upper = trimmed.to_ascii_uppercase();
        if !in_block {
            in_block = opens_data_block(&upper);
        } else if closes_block(&upper) {
            break;
        } else {
            block_lines.push(trimmed);
        }
    }

    if !in_block {
        return Err(NexusError::NoDataBlock);
    }
    Ok(block_lines.join("\n"))
}
/// Parses the body of a `DATA`/`CHARACTERS` block into an [`Alignment`].
///
/// The body is first passed through `normalize_nexus_commands`. The
/// `DIMENSIONS` (`NTAX`, `NCHAR`) and `FORMAT` (`INTERLEAVE`, `MATCHCHAR`)
/// commands are then interpreted, and everything belonging to `MATRIX` is
/// collected and handed to `parse_matrix`.
///
/// Once inside `MATRIX`, lines are always treated as matrix rows, so a taxon
/// whose name starts with `FORMAT`, `DIMENSIONS` or `MATRIX` is not mistaken
/// for a command.
///
/// # Errors
///
/// * [`NexusError::InvalidDimensions`] if `NTAX` or `NCHAR` is present but is
///   not a non-negative integer.
/// * [`NexusError::NoDataBlock`] if no sequences were read.
/// * Any error returned by `parse_matrix`.
fn parse_data_block(content: &str) -> NexusResult<Alignment> {
    /// Parses one `DIMENSIONS` value, reporting garbage instead of ignoring it.
    fn parse_dimension(key: &str, raw: &str) -> NexusResult<usize> {
        raw.parse().map_err(|_| {
            NexusError::InvalidDimensions(format!(
                "{}={} is not a non-negative integer",
                key, raw
            ))
        })
    }

    let normalized = normalize_nexus_commands(content);
    let mut ntax: Option<usize> = None;
    let mut nchar: Option<usize> = None;
    let mut interleave = false;
    let mut matchchar: Option<char> = None;
    let mut matrix_content = String::new();
    let mut in_matrix = false;

    for line in normalized.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue;
        }

        if in_matrix {
            // Matrix rows are data: never interpret them as commands.
            let clean = trimmed.trim_end_matches(';');
            if !clean.is_empty() {
                matrix_content.push_str(clean);
                matrix_content.push(' ');
            }
            if trimmed.ends_with(';') {
                in_matrix = false;
            }
            continue;
        }

        if trimmed.starts_with('[') {
            continue;
        }

        // ASCII upper-casing keeps byte offsets aligned with `trimmed`, which
        // lets offsets found in `upper` be used to slice the original text.
        let upper = trimmed.to_ascii_uppercase();

        if upper.starts_with("DIMENSIONS") {
            if let Some(raw) = extract_param(&upper, "NTAX") {
                ntax = Some(parse_dimension("NTAX", raw)?);
            }
            if let Some(raw) = extract_param(&upper, "NCHAR") {
                nchar = Some(parse_dimension("NCHAR", raw)?);
            }
        } else if upper.starts_with("FORMAT") {
            // `INTERLEAVE=NO` explicitly switches interleaving off.
            interleave = upper.contains("INTERLEAVE")
                && extract_param(&upper, "INTERLEAVE") != Some("NO");
            if let Some(pos) = upper.find("MATCHCHAR") {
                // Read the value from the original line so that a lower-case
                // match character is not silently upper-cased.
                let original = &trimmed[pos..];
                let keyword = &original[.."MATCHCHAR".len()];
                if let Some(mc) = extract_param(original, keyword) {
                    matchchar = mc.chars().next();
                }
            }
        } else if upper.starts_with("MATRIX") {
            let after = trimmed["MATRIX".len()..].trim();
            if !after.is_empty() && after != ";" {
                matrix_content.push_str(after.trim_end_matches(';'));
                matrix_content.push(' ');
            }
            // A matrix written entirely on the MATRIX line is already closed.
            in_matrix = !trimmed.ends_with(';');
        }
    }

    // A value of 0 means "not specified" to the matrix parsers.
    let expected_ntax = ntax.unwrap_or(0);
    let expected_nchar = nchar.unwrap_or(0);
    let sequences = parse_matrix(
        &matrix_content,
        expected_ntax,
        expected_nchar,
        interleave,
        matchchar,
    )?;
    if sequences.is_empty() {
        return Err(NexusError::NoDataBlock);
    }
    Ok(Alignment::new(sequences))
}
/// Rewrites a block body so that every non-matrix command sits on one line.
///
/// Outside `MATRIX`:
/// * bracketed comments are removed and blank lines are dropped;
/// * consecutive lines are joined with single spaces until a line ends with
///   `;`, at which point the joined command is emitted on its own line.
///
/// From the `MATRIX` line onwards, rows are copied line by line (blank lines
/// are preserved as empty lines). The row that ends with `;` closes the
/// matrix; it is emitted without the `;` and without comments, or dropped if
/// nothing else is on it. A trailing command that never reaches a `;` is
/// still emitted at the end.
fn normalize_nexus_commands(content: &str) -> String {
    let mut result = String::new();
    let mut current_command = String::new();
    let mut in_matrix = false;

    for line in content.lines() {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            if in_matrix {
                result.push('\n');
            }
            continue;
        }

        if in_matrix {
            let line_no_comments = remove_nexus_comments(trimmed);
            if line_no_comments.trim().ends_with(';') {
                let clean = line_no_comments.trim().trim_end_matches(';');
                if !clean.is_empty() {
                    result.push_str(clean);
                    result.push('\n');
                }
                in_matrix = false;
            } else {
                // Comments in ordinary rows are left in place here.
                result.push_str(trimmed);
                result.push('\n');
            }
            continue;
        }

        let line_no_comments = remove_nexus_comments(trimmed);
        let line_no_comments = line_no_comments.trim();
        if line_no_comments.is_empty() {
            continue;
        }

        let upper = line_no_comments.to_uppercase();
        if upper.starts_with("MATRIX") {
            // Flush whatever unterminated command preceded the matrix.
            if !current_command.is_empty() {
                result.push_str(&current_command);
                result.push('\n');
                current_command.clear();
            }
            in_matrix = true;
            result.push_str(line_no_comments);
            result.push('\n');
        } else {
            if !current_command.is_empty() {
                current_command.push(' ');
            }
            current_command.push_str(line_no_comments);
            if line_no_comments.ends_with(';') {
                result.push_str(&current_command);
                result.push('\n');
                current_command.clear();
            }
        }
    }

    if !current_command.is_empty() {
        result.push_str(&current_command);
        result.push('\n');
    }
    result
}
/// Returns `line` with every `[...]` comment removed.
///
/// Brackets themselves are never copied to the output. A `[` that is not
/// closed swallows the rest of the line, and comments do not nest: the first
/// `]` ends the comment.
fn remove_nexus_comments(line: &str) -> String {
    let mut inside = false;
    line.chars()
        .filter(|&ch| match ch {
            '[' => {
                inside = true;
                false
            }
            ']' => {
                inside = false;
                false
            }
            _ => !inside,
        })
        .collect()
}
/// Extracts the value of a `PARAM=value` assignment from a command line.
///
/// `param` is matched case-sensitively. Whitespace is allowed on either side
/// of the `=`. The value runs up to the next whitespace or `;`.
///
/// An occurrence of `param` that is not directly followed by `=` (optionally
/// preceded by whitespace) is skipped, so a bare keyword does not pick up the
/// value of a later parameter.
///
/// Returns `None` when no assignment with a non-empty value is found.
fn extract_param<'a>(line: &'a str, param: &str) -> Option<&'a str> {
    line.match_indices(param).find_map(|(idx, _)| {
        let after_key = line[idx + param.len()..].trim_start();
        let after_eq = after_key.strip_prefix('=')?.trim_start();
        let end = after_eq
            .find(|c: char| c.is_whitespace() || c == ';')
            .unwrap_or(after_eq.len());
        let value = &after_eq[..end];
        if value.is_empty() {
            None
        } else {
            Some(value)
        }
    })
}
/// Turns the text of a `MATRIX` command into sequences.
///
/// The text is tokenized, then read as interleaved rows when `interleave` is
/// set and `ntax` is non-zero, and as sequential rows otherwise. When
/// `matchchar` is given, it is expanded via `apply_matchchar_raw` before the
/// rows are converted with `Sequence::from_bytes`.
///
/// Returns an empty vector when the matrix contains no tokens.
fn parse_matrix(
    content: &str,
    ntax: usize,
    nchar: usize,
    interleave: bool,
    matchchar: Option<char>,
) -> NexusResult<Vec<Sequence>> {
    let tokens = tokenize_matrix(content);
    if tokens.is_empty() {
        return Ok(Vec::new());
    }

    let use_interleaved = interleave && ntax > 0;
    let mut rows = match use_interleaved {
        true => parse_interleaved_tokens_raw(&tokens, ntax, nchar)?,
        false => parse_sequential_tokens_raw(&tokens, ntax, nchar)?,
    };

    if let Some(mc) = matchchar {
        apply_matchchar_raw(&mut rows, mc);
    }

    // Release spare capacity left over from incremental growth.
    let mut sequences: Vec<Sequence> = Vec::new();
    for (name, mut data) in rows {
        data.shrink_to_fit();
        sequences.push(Sequence::from_bytes(name, data));
    }
    sequences.shrink_to_fit();
    Ok(sequences)
}
/// Expands the match character in every sequence after the first.
///
/// Each byte equal to `matchchar` is replaced by the byte at the same
/// position in the first sequence. Positions beyond the end of the first
/// sequence are left untouched, and the first sequence itself is never
/// modified.
///
/// Sequence data is handled byte-wise, so a non-ASCII `matchchar` can never
/// correspond to a single byte. In that case nothing is changed, rather than
/// truncating the character to an unrelated byte.
fn apply_matchchar_raw(sequences: &mut [(String, Vec<u8>)], matchchar: char) {
    if !matchchar.is_ascii() {
        return;
    }
    let matchchar_byte = matchchar as u8;

    // Splitting borrows the reference row and the remaining rows disjointly,
    // which avoids cloning the reference sequence.
    if let Some(((_, reference), others)) = sequences.split_first_mut() {
        let reference: &[u8] = reference;
        for (_, seq_data) in others {
            for (byte, &ref_byte) in seq_data.iter_mut().zip(reference) {
                if *byte == matchchar_byte {
                    *byte = ref_byte;
                }
            }
        }
    }
}
/// Splits matrix text into whitespace-separated tokens.
///
/// * `[...]` comments outside quotes are skipped and end the current token.
/// * Single- and double-quoted runs may contain whitespace and brackets; the
///   quote characters stay part of the token.
/// * `;` is dropped wherever it appears outside a comment.
fn tokenize_matrix(content: &str) -> Vec<String> {
    /// Moves a non-empty pending token into the output list.
    fn flush(pending: &mut String, out: &mut Vec<String>) {
        if !pending.is_empty() {
            out.push(std::mem::take(pending));
        }
    }

    let mut out: Vec<String> = Vec::new();
    let mut pending = String::new();
    let mut commented = false;
    let mut single = false;
    let mut double = false;

    for ch in content.chars() {
        let quoted = single || double;
        match ch {
            _ if commented => {
                if ch == ']' {
                    commented = false;
                }
            }
            '[' if !quoted => {
                flush(&mut pending, &mut out);
                commented = true;
            }
            '\'' if !double => {
                single = !single;
                pending.push(ch);
            }
            '"' if !single => {
                double = !double;
                pending.push(ch);
            }
            _ if ch.is_whitespace() && !quoted => flush(&mut pending, &mut out),
            ';' => {}
            _ => pending.push(ch),
        }
    }

    flush(&mut pending, &mut out);
    out
}
/// Reads non-interleaved rows: a name token followed by its data tokens.
///
/// * `ntax == 0` means "read until the tokens run out"; otherwise reading
///   stops after `ntax` rows.
/// * With `nchar > 0`, data tokens are appended until the row holds at least
///   `nchar` bytes.
/// * With `nchar == 0`, a row ends at the first token (after at least one
///   data token) that `looks_like_name`.
///
/// Names are passed through `unquote`. This function always returns `Ok`.
fn parse_sequential_tokens_raw(
    tokens: &[String],
    ntax: usize,
    nchar: usize,
) -> NexusResult<Vec<(String, Vec<u8>)>> {
    let mut rows: Vec<(String, Vec<u8>)> = Vec::new();
    let mut cursor = 0;

    loop {
        let taxa_complete = ntax != 0 && rows.len() >= ntax;
        if cursor >= tokens.len() || taxa_complete {
            break;
        }

        let label = unquote(&tokens[cursor]);
        cursor += 1;

        let mut residues: Vec<u8> = Vec::new();
        while let Some(chunk) = tokens.get(cursor) {
            let row_full = nchar > 0 && residues.len() >= nchar;
            let next_taxon = nchar == 0 && !residues.is_empty() && looks_like_name(chunk);
            if row_full || next_taxon {
                break;
            }
            residues.extend_from_slice(chunk.as_bytes());
            cursor += 1;
        }

        rows.push((label, residues));
    }

    Ok(rows)
}
/// Reads interleaved rows, where each taxon's data is spread over several
/// blocks of `name data` pairs.
///
/// * The first `ntax` distinct unknown tokens are registered as taxon names,
///   each followed by its first data token.
/// * A token equal to a registered name selects that taxon; the token after
///   it is appended to the taxon's data.
/// * Any other token seen after all `ntax` names are known is a continuation
///   chunk (data written with embedded spaces) and is appended to the most
///   recently named taxon.
/// * With `nchar > 0`, reading stops once all `ntax` taxa hold at least
///   `nchar` bytes.
///
/// Names are passed through `unquote`. With `ntax == 0` no taxon can be
/// registered and an empty list is returned. This function always returns
/// `Ok`.
fn parse_interleaved_tokens_raw(
    tokens: &[String],
    ntax: usize,
    nchar: usize,
) -> NexusResult<Vec<(String, Vec<u8>)>> {
    let mut sequences: Vec<(String, Vec<u8>)> = Vec::with_capacity(ntax);
    let mut name_to_idx: std::collections::HashMap<String, usize> =
        std::collections::HashMap::new();
    // Index of the taxon named most recently; continuation chunks go here.
    let mut current: Option<usize> = None;
    let mut i = 0;

    while i < tokens.len() {
        if nchar > 0
            && sequences.len() == ntax
            && sequences.iter().all(|(_, data)| data.len() >= nchar)
        {
            break;
        }

        let token = unquote(&tokens[i]);
        i += 1;

        // Resolve the token to a taxon index if it is (or becomes) a name.
        let named = if let Some(&idx) = name_to_idx.get(&token) {
            Some(idx)
        } else if sequences.len() < ntax {
            let idx = sequences.len();
            name_to_idx.insert(token.clone(), idx);
            sequences.push((token.clone(), Vec::new()));
            Some(idx)
        } else {
            None
        };

        match named {
            Some(idx) => {
                current = Some(idx);
                if let Some(chunk) = tokens.get(i) {
                    sequences[idx].1.extend_from_slice(chunk.as_bytes());
                    i += 1;
                }
            }
            None => {
                if let Some(idx) = current {
                    sequences[idx].1.extend_from_slice(token.as_bytes());
                }
            }
        }
    }

    Ok(sequences)
}
/// Trims `s` and removes one pair of matching surrounding quotes.
///
/// Both `'...'` and `"..."` are recognised. The opening and closing quote
/// must be two distinct characters, so a token consisting of a single quote
/// character is returned unchanged instead of causing an out-of-range slice.
/// Text without a matching pair of quotes is returned trimmed.
fn unquote(s: &str) -> String {
    let s = s.trim();
    let inner = s
        .strip_prefix('\'')
        .and_then(|rest| rest.strip_suffix('\''))
        .or_else(|| s.strip_prefix('"').and_then(|rest| rest.strip_suffix('"')));
    inner.unwrap_or(s).to_string()
}
/// Heuristic used to tell a taxon name from sequence data.
///
/// A token counts as a name when it starts with a quote character, contains
/// an underscore, or mixes ASCII letters with ASCII digits.
fn looks_like_name(token: &str) -> bool {
    let quoted = matches!(token.chars().next(), Some('\'') | Some('"'));
    if quoted || token.contains('_') {
        return true;
    }

    let mut letters = false;
    let mut digits = false;
    for byte in token.bytes() {
        letters |= byte.is_ascii_alphabetic();
        digits |= byte.is_ascii_digit();
    }
    letters && digits
}
// Unit tests that drive the parser through `parse_nexus_str` with small
// inline NEXUS documents.
#[cfg(test)]
mod tests {
use super::*;
// Plain sequential matrix: three rows, name and data on one line each.
#[test]
fn test_parse_simple_nexus() {
let content = r#"#NEXUS
BEGIN DATA;
DIMENSIONS NTAX=3 NCHAR=10;
FORMAT DATATYPE=DNA GAP=- MISSING=?;
MATRIX
seq1 ACGTACGTAC
seq2 TGCATGCATG
seq3 AAAACCCCGG
;
END;
"#;
let alignment = parse_nexus_str(content).unwrap();
assert_eq!(alignment.sequence_count(), 3);
assert_eq!(alignment.get(0).unwrap().id, "seq1");
assert_eq!(alignment.get(0).unwrap().as_str(), "ACGTACGTAC");
}
// INTERLEAVE: the two blocks of each taxon must be concatenated in order.
#[test]
fn test_parse_interleaved_nexus() {
let content = r#"#NEXUS
BEGIN DATA;
DIMENSIONS NTAX=2 NCHAR=20;
FORMAT DATATYPE=DNA INTERLEAVE;
MATRIX
seq1 ACGTACGTAC
seq2 TGCATGCATG
seq1 GGGGGGGGGG
seq2 CCCCCCCCCC
;
END;
"#;
let alignment = parse_nexus_str(content).unwrap();
assert_eq!(alignment.sequence_count(), 2);
assert_eq!(alignment.get(0).unwrap().as_str(), "ACGTACGTACGGGGGGGGGG");
assert_eq!(alignment.get(1).unwrap().as_str(), "TGCATGCATGCCCCCCCCCC");
}
// Single-quoted names may contain spaces; the quotes are stripped from the id.
#[test]
fn test_parse_quoted_names() {
let content = r#"#NEXUS
BEGIN DATA;
DIMENSIONS NTAX=2 NCHAR=10;
FORMAT DATATYPE=DNA;
MATRIX
'seq 1' ACGTACGTAC
'seq 2' TGCATGCATG
;
END;
"#;
let alignment = parse_nexus_str(content).unwrap();
assert_eq!(alignment.get(0).unwrap().id, "seq 1");
assert_eq!(alignment.get(1).unwrap().id, "seq 2");
}
// A CHARACTERS block is accepted like DATA; NTAX is absent here.
#[test]
fn test_parse_characters_block() {
let content = r#"#NEXUS
BEGIN CHARACTERS;
DIMENSIONS NCHAR=10;
FORMAT DATATYPE=PROTEIN;
MATRIX
seq1 ACDEFGHIKL
seq2 MNPQRSTVWY
;
END;
"#;
let alignment = parse_nexus_str(content).unwrap();
assert_eq!(alignment.sequence_count(), 2);
}
// Input without the #NEXUS marker (here: FASTA) is rejected.
#[test]
fn test_not_nexus() {
let content = ">seq1\nACGT\n";
assert!(matches!(parse_nexus_str(content), Err(NexusError::NotNexus)));
}
// A file containing only a TAXA block has no DATA/CHARACTERS block.
#[test]
fn test_no_data_block() {
let content = r#"#NEXUS
BEGIN TAXA;
DIMENSIONS NTAX=3;
END;
"#;
assert!(matches!(
parse_nexus_str(content),
Err(NexusError::NoDataBlock)
));
}
// Keywords are recognised regardless of case.
#[test]
fn test_case_insensitive() {
let content = r#"#nexus
begin data;
dimensions ntax=2 nchar=5;
format datatype=dna;
matrix
seq1 ACGTA
seq2 TGCAT
;
end;
"#;
let alignment = parse_nexus_str(content).unwrap();
assert_eq!(alignment.sequence_count(), 2);
}
// Gap characters are kept verbatim in the sequence data.
#[test]
fn test_with_gaps() {
let content = r#"#NEXUS
BEGIN DATA;
DIMENSIONS NTAX=2 NCHAR=10;
FORMAT DATATYPE=DNA GAP=-;
MATRIX
seq1 ACGT--GTAC
seq2 TG--TGCATG
;
END;
"#;
let alignment = parse_nexus_str(content).unwrap();
assert_eq!(alignment.get(0).unwrap().as_str(), "ACGT--GTAC");
}
// A FORMAT command split over several lines, plus a comment before the block.
#[test]
fn test_multiline_format() {
let content = r#"#NEXUS
[saved by seaview on Tue Dec 15 15:49:06 2009]
BEGIN DATA;
DIMENSIONS NTAX=2 NCHAR=10;
FORMAT DATATYPE=DNA
GAP=-
;
MATRIX
seq1 ACGT--GTAC
seq2 TG--TGCATG
;
END;
"#;
let alignment = parse_nexus_str(content).unwrap();
assert_eq!(alignment.sequence_count(), 2);
assert_eq!(alignment.get(0).unwrap().id, "seq1");
assert_eq!(alignment.get(0).unwrap().as_str(), "ACGT--GTAC");
}
// Each name sits on its own line after a [n] comment; its data spans two lines.
#[test]
fn test_multiline_sequences() {
let content = r#"#NEXUS
BEGIN DATA;
DIMENSIONS NTAX=2 NCHAR=20;
FORMAT DATATYPE=DNA GAP=-;
MATRIX
[1] seq_1
ACGTACGTAC
GGGGGGGGGG
[2] seq_2
TGCATGCATG
CCCCCCCCCC
;
END;
"#;
let alignment = parse_nexus_str(content).unwrap();
assert_eq!(alignment.sequence_count(), 2);
assert_eq!(alignment.get(0).unwrap().id, "seq_1");
assert_eq!(alignment.get(0).unwrap().as_str(), "ACGTACGTACGGGGGGGGGG");
assert_eq!(alignment.get(1).unwrap().id, "seq_2");
assert_eq!(alignment.get(1).unwrap().as_str(), "TGCATGCATGCCCCCCCCCC");
}
// MATCHCHAR: '.' in later rows is replaced by the first row's character.
#[test]
fn test_matchchar() {
let content = r#"#NEXUS
BEGIN DATA;
DIMENSIONS NTAX=3 NCHAR=10;
FORMAT DATATYPE=DNA GAP=- MATCHCHAR=.;
MATRIX
seq1 ACGTACGTAC
seq2 ....TG....
seq3 T.T.T.T.T.
;
END;
"#;
let alignment = parse_nexus_str(content).unwrap();
assert_eq!(alignment.sequence_count(), 3);
assert_eq!(alignment.get(0).unwrap().as_str(), "ACGTACGTAC");
assert_eq!(alignment.get(1).unwrap().as_str(), "ACGTTGGTAC");
assert_eq!(alignment.get(2).unwrap().as_str(), "TCTTTCTTTC");
}
}