use std::collections::HashMap;
use crate::error::AsrError;
pub struct Alignment {
pub seqs: HashMap<String, Vec<u8>>,
pub width: usize,
}
impl Alignment {
pub fn from_fasta_bytes(data: &[u8]) -> Result<Self, AsrError> {
let mut seqs = HashMap::new();
let mut current_id = String::new();
let mut current_seq = Vec::new();
let lines = data.split(|&b| b == b'\n');
for line in lines {
let line = line.trim_ascii();
if line.is_empty() {
continue;
}
if line[0] == b'>' {
if !current_id.is_empty() {
if seqs.contains_key(¤t_id) {
return Err(AsrError::InvalidAlignment(format!("Duplicate taxon ID: {}", current_id)));
}
seqs.insert(current_id.clone(), current_seq);
}
let header = &line[1..];
let end = header.iter().position(|&b| b.is_ascii_whitespace()).unwrap_or(header.len());
let raw_name = String::from_utf8_lossy(&header[..end]);
if raw_name.is_empty() {
return Err(AsrError::InvalidAlignment("Empty taxon ID in FASTA header".to_string()));
}
current_id = raw_name.into_owned();
current_seq = Vec::new();
} else {
let normalized: Vec<u8> = line.iter().map(|&b| b.to_ascii_uppercase()).collect();
current_seq.extend(normalized);
}
}
if !current_id.is_empty() {
if seqs.contains_key(¤t_id) {
return Err(AsrError::InvalidAlignment(format!("Duplicate taxon ID: {}", current_id)));
}
seqs.insert(current_id, current_seq);
}
if seqs.is_empty() {
return Err(AsrError::InvalidAlignment("Empty FASTA file".to_string()));
}
let width = seqs.values().next().unwrap().len();
if !seqs.values().all(|s| s.len() == width) {
return Err(AsrError::InvalidAlignment("Ragged alignment: sequences have different lengths".to_string()));
}
Ok(Self { seqs, width })
}
pub fn from_fasta_file(path: &std::path::Path) -> std::io::Result<Self> {
let bytes = std::fs::read(path)?;
Self::from_fasta_bytes(&bytes).map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
}
pub fn compress_columns(&self) -> CompressedColumns {
let mut pattern_to_idx = HashMap::new();
let mut patterns = Vec::new();
let mut multiplicity = Vec::new();
let mut site_to_pattern = Vec::with_capacity(self.width);
let leaf_order: Vec<String> = self.seqs.keys().cloned().collect();
for i in 0..self.width {
let mut col = Vec::with_capacity(leaf_order.len());
for name in &leaf_order {
col.push(self.seqs[name][i]);
}
let idx = *pattern_to_idx.entry(col.clone()).or_insert_with(|| {
let p_idx = patterns.len();
patterns.push(col);
multiplicity.push(0);
p_idx
});
multiplicity[idx] += 1;
site_to_pattern.push(idx);
}
CompressedColumns {
patterns,
site_to_pattern,
multiplicity,
leaf_order,
}
}
}
pub struct CompressedColumns {
pub patterns: Vec<Vec<u8>>,
pub site_to_pattern: Vec<usize>,
pub multiplicity: Vec<usize>,
pub leaf_order: Vec<String>,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_fasta_parse() {
let data = b">Seq1\nACGT\n>Seq2\nAGGT\n";
let aln = Alignment::from_fasta_bytes(data).unwrap();
assert_eq!(aln.width, 4);
assert_eq!(aln.seqs["Seq1"], b"ACGT");
assert_eq!(aln.seqs["Seq2"], b"AGGT");
}
#[test]
fn test_compression() {
let mut seqs = HashMap::new();
seqs.insert("S1".to_string(), b"AAT".to_vec());
seqs.insert("S2".to_string(), b"ATT".to_vec());
let aln = Alignment { seqs, width: 3 };
let comp = aln.compress_columns();
assert_eq!(comp.patterns.len(), 3);
assert_eq!(comp.multiplicity, vec![1, 1, 1]);
}
}