use cyanea_core::{CyaneaError, Result};
#[derive(Debug, Clone)]
pub struct ClustalAlignment {
pub sequences: Vec<(String, String)>,
pub conservation: Option<String>,
}
pub fn parse_clustal(input: &str) -> Result<ClustalAlignment> {
let mut lines = input.lines().peekable();
let mut found_header = false;
while let Some(line) = lines.peek() {
let trimmed = line.trim();
if trimmed.is_empty() {
lines.next();
continue;
}
if trimmed.starts_with("CLUSTAL") {
found_header = true;
lines.next();
break;
} else {
break;
}
}
if !found_header {
return Err(CyaneaError::Parse(
"missing CLUSTAL header line".to_string(),
));
}
let mut seq_order: Vec<String> = Vec::new();
let mut seq_data: std::collections::HashMap<String, String> =
std::collections::HashMap::new();
let mut conservation = String::new();
let mut block_seq_count = 0usize;
let mut expected_seqs_per_block: Option<usize> = None;
for line in lines {
let line = line.trim_end();
if line.trim().is_empty() {
if block_seq_count > 0 {
if expected_seqs_per_block.is_none() {
expected_seqs_per_block = Some(block_seq_count);
}
block_seq_count = 0;
}
continue;
}
let trimmed = line.trim();
if !trimmed.is_empty() && trimmed.chars().all(|c| c == '*' || c == ':' || c == '.' || c == ' ') {
conservation.push_str(trimmed);
continue;
}
let parts: Vec<&str> = line.split_whitespace().collect();
if parts.is_empty() {
continue;
}
if parts.len() >= 2 {
let name = parts[0];
let seq = parts[1];
if !seq_data.contains_key(name) {
seq_order.push(name.to_string());
}
seq_data
.entry(name.to_string())
.and_modify(|v| v.push_str(seq))
.or_insert_with(|| seq.to_string());
block_seq_count += 1;
}
}
let sequences: Vec<(String, String)> = seq_order
.into_iter()
.map(|name| {
let seq = seq_data.remove(&name).unwrap_or_default();
(name, seq)
})
.collect();
let conservation = if conservation.is_empty() {
None
} else {
Some(conservation)
};
Ok(ClustalAlignment {
sequences,
conservation,
})
}
pub fn write_clustal(aln: &ClustalAlignment) -> String {
const BLOCK_SIZE: usize = 60;
let mut out = String::new();
out.push_str("CLUSTAL W multiple sequence alignment\n\n");
if aln.sequences.is_empty() {
return out;
}
let max_name_len = aln.sequences.iter().map(|(n, _)| n.len()).max().unwrap_or(0);
let pad = max_name_len + 4;
let total_len = aln.sequences.iter().map(|(_, s)| s.len()).max().unwrap_or(0);
let conservation = aln.conservation.as_deref().unwrap_or("");
let mut offset = 0;
while offset < total_len {
let end = std::cmp::min(offset + BLOCK_SIZE, total_len);
for (name, seq) in &aln.sequences {
let fragment = if offset < seq.len() {
&seq[offset..std::cmp::min(end, seq.len())]
} else {
""
};
out.push_str(&format!("{:<width$}{}\n", name, fragment, width = pad));
}
if !conservation.is_empty() && offset < conservation.len() {
let cons_fragment = &conservation[offset..std::cmp::min(end, conservation.len())];
out.push_str(&format!("{:width$}{}\n", "", cons_fragment, width = pad));
}
out.push('\n');
offset = end;
}
out
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn clustal_round_trip() {
let aln = ClustalAlignment {
sequences: vec![
("seq1".to_string(), "ACGT--AACCU".to_string()),
("seq2".to_string(), "ACGU--AACCU".to_string()),
],
conservation: Some("**** *****".to_string()),
};
let written = write_clustal(&aln);
let parsed = parse_clustal(&written).unwrap();
assert_eq!(parsed.sequences.len(), 2);
assert_eq!(parsed.sequences[0].0, "seq1");
assert_eq!(parsed.sequences[0].1, "ACGT--AACCU");
assert_eq!(parsed.sequences[1].0, "seq2");
assert_eq!(parsed.sequences[1].1, "ACGU--AACCU");
assert!(parsed.conservation.is_some());
assert_eq!(parsed.conservation.unwrap(), "**** *****");
}
#[test]
fn clustal_multi_block_interleaved() {
let input = "\
CLUSTAL W (1.83) multiple sequence alignment
seq1 ACGT
seq2 TGCA
*..*
seq1 AAAA
seq2 CCCC
";
let aln = parse_clustal(input).unwrap();
assert_eq!(aln.sequences.len(), 2);
assert_eq!(aln.sequences[0].1, "ACGTAAAA");
assert_eq!(aln.sequences[1].1, "TGCACCCC");
}
#[test]
fn clustal_conservation_preserved() {
let input = "\
CLUSTAL W (1.83) multiple sequence alignment
seq1 ACGT
seq2 ACGT
****
";
let aln = parse_clustal(input).unwrap();
assert_eq!(aln.conservation, Some("****".to_string()));
}
#[test]
fn clustal_missing_header_error() {
let input = "seq1 ACGT\nseq2 ACGT\n";
let result = parse_clustal(input);
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(err.contains("CLUSTAL"));
}
#[test]
fn clustal_empty_sequences() {
let input = "CLUSTAL W (1.83) multiple sequence alignment\n\n";
let aln = parse_clustal(input).unwrap();
assert!(aln.sequences.is_empty());
assert!(aln.conservation.is_none());
}
}