use std::collections::HashMap;
use cyanea_core::{CyaneaError, Result};
#[derive(Debug, Clone)]
pub struct StockholmAlignment {
pub sequences: Vec<(String, String)>,
pub gc_annotations: HashMap<String, String>,
pub gs_annotations: HashMap<String, HashMap<String, String>>,
pub gr_annotations: HashMap<String, HashMap<String, String>>,
}
pub fn parse_stockholm(input: &str) -> Result<Vec<StockholmAlignment>> {
if input.trim().is_empty() {
return Ok(Vec::new());
}
let mut alignments = Vec::new();
let mut in_block = false;
let mut seq_order: Vec<String> = Vec::new();
let mut seq_data: HashMap<String, String> = HashMap::new();
let mut gc_annotations: HashMap<String, String> = HashMap::new();
let mut gs_annotations: HashMap<String, HashMap<String, String>> = HashMap::new();
let mut gr_annotations: HashMap<String, HashMap<String, String>> = HashMap::new();
let mut block_started = false;
for (line_num, line) in input.lines().enumerate() {
let line = line.trim_end();
if line.starts_with("# STOCKHOLM 1.0") {
if in_block {
return Err(CyaneaError::Parse(format!(
"line {}: nested STOCKHOLM header without preceding //",
line_num + 1
)));
}
in_block = true;
block_started = true;
seq_order.clear();
seq_data.clear();
gc_annotations.clear();
gs_annotations.clear();
gr_annotations.clear();
continue;
}
if !in_block {
continue;
}
if line == "//" {
let sequences: Vec<(String, String)> = seq_order
.iter()
.map(|name| {
let seq = seq_data.get(name).cloned().unwrap_or_default();
(name.clone(), seq)
})
.collect();
alignments.push(StockholmAlignment {
sequences,
gc_annotations: gc_annotations.clone(),
gs_annotations: gs_annotations.clone(),
gr_annotations: gr_annotations.clone(),
});
in_block = false;
continue;
}
if line.trim().is_empty() {
continue;
}
if let Some(rest) = line.strip_prefix("#=GC ") {
let rest = rest.trim_start();
if let Some((tag, value)) = rest.split_once(char::is_whitespace) {
let tag = tag.trim();
let value = value.trim();
gc_annotations
.entry(tag.to_string())
.and_modify(|v| v.push_str(value))
.or_insert_with(|| value.to_string());
}
continue;
}
if let Some(rest) = line.strip_prefix("#=GS ") {
let rest = rest.trim_start();
let parts: Vec<&str> = rest.splitn(3, char::is_whitespace).collect();
if parts.len() >= 3 {
let name = parts[0].trim();
let tag = parts[1].trim();
let value = parts[2].trim();
gs_annotations
.entry(name.to_string())
.or_default()
.insert(tag.to_string(), value.to_string());
}
continue;
}
if let Some(rest) = line.strip_prefix("#=GR ") {
let rest = rest.trim_start();
let parts: Vec<&str> = rest.splitn(3, char::is_whitespace).collect();
if parts.len() >= 3 {
let name = parts[0].trim();
let tag = parts[1].trim();
let value = parts[2].trim();
gr_annotations
.entry(name.to_string())
.or_default()
.entry(tag.to_string())
.and_modify(|v| v.push_str(value))
.or_insert_with(|| value.to_string());
}
continue;
}
if line.starts_with('#') {
continue;
}
let parts: Vec<&str> = line.splitn(2, char::is_whitespace).collect();
if parts.len() == 2 {
let name = parts[0].trim();
let seq = parts[1].trim();
if !seq_data.contains_key(name) {
seq_order.push(name.to_string());
}
seq_data
.entry(name.to_string())
.and_modify(|v| v.push_str(seq))
.or_insert_with(|| seq.to_string());
}
}
if in_block && block_started {
return Err(CyaneaError::Parse(
"unterminated Stockholm block (missing //)".to_string(),
));
}
Ok(alignments)
}
pub fn write_stockholm(aln: &StockholmAlignment) -> String {
let mut out = String::new();
out.push_str("# STOCKHOLM 1.0\n");
for (name, tags) in &aln.gs_annotations {
for (tag, value) in tags {
out.push_str(&format!("#=GS {} {} {}\n", name, tag, value));
}
}
for (name, seq) in &aln.sequences {
out.push_str(&format!("{} {}\n", name, seq));
}
for (name, tags) in &aln.gr_annotations {
for (tag, value) in tags {
out.push_str(&format!("#=GR {} {} {}\n", name, tag, value));
}
}
for (tag, value) in &aln.gc_annotations {
out.push_str(&format!("#=GC {} {}\n", tag, value));
}
out.push_str("//\n");
out
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn stockholm_round_trip() {
let aln = StockholmAlignment {
sequences: vec![
("seq1".to_string(), "ACGU--AACCU".to_string()),
("seq2".to_string(), "ACGU--AACCU".to_string()),
],
gc_annotations: HashMap::new(),
gs_annotations: HashMap::new(),
gr_annotations: HashMap::new(),
};
let written = write_stockholm(&aln);
let parsed = parse_stockholm(&written).unwrap();
assert_eq!(parsed.len(), 1);
assert_eq!(parsed[0].sequences.len(), 2);
assert_eq!(parsed[0].sequences[0].0, "seq1");
assert_eq!(parsed[0].sequences[0].1, "ACGU--AACCU");
assert_eq!(parsed[0].sequences[1].0, "seq2");
assert_eq!(parsed[0].sequences[1].1, "ACGU--AACCU");
}
#[test]
fn stockholm_multi_block() {
let input = "\
# STOCKHOLM 1.0
seq1 ACGT
seq2 TGCA
//
# STOCKHOLM 1.0
seqA AAAA
seqB CCCC
seqC GGGG
//
";
let alns = parse_stockholm(input).unwrap();
assert_eq!(alns.len(), 2);
assert_eq!(alns[0].sequences.len(), 2);
assert_eq!(alns[1].sequences.len(), 3);
assert_eq!(alns[1].sequences[2].0, "seqC");
assert_eq!(alns[1].sequences[2].1, "GGGG");
}
#[test]
fn stockholm_annotations_preserved() {
let input = "\
# STOCKHOLM 1.0
#=GF AU Eddy SR
#=GS seq1 AC PF00001
#=GS seq2 AC PF00002
seq1 ACGU..AACCU
seq2 ACGU..AACCU
#=GR seq1 PP 99998877665
#=GC SS_cons ..<<..>>...
//
";
let alns = parse_stockholm(input).unwrap();
assert_eq!(alns.len(), 1);
let aln = &alns[0];
assert_eq!(aln.sequences.len(), 2);
assert_eq!(
aln.gs_annotations.get("seq1").unwrap().get("AC").unwrap(),
"PF00001"
);
assert_eq!(
aln.gs_annotations.get("seq2").unwrap().get("AC").unwrap(),
"PF00002"
);
assert_eq!(
aln.gr_annotations.get("seq1").unwrap().get("PP").unwrap(),
"99998877665"
);
assert_eq!(
aln.gc_annotations.get("SS_cons").unwrap(),
"..<<..>>..."
);
}
#[test]
fn stockholm_empty_input() {
let alns = parse_stockholm("").unwrap();
assert!(alns.is_empty());
let alns = parse_stockholm(" \n \n").unwrap();
assert!(alns.is_empty());
}
#[test]
fn stockholm_unterminated_error() {
let input = "# STOCKHOLM 1.0\nseq1 ACGT\nseq2 TGCA\n";
let result = parse_stockholm(input);
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(err.contains("unterminated"));
}
#[test]
fn stockholm_interleaved_sequences() {
let input = "\
# STOCKHOLM 1.0
seq1 ACGT
seq2 TGCA
seq1 AAAA
seq2 CCCC
//
";
let alns = parse_stockholm(input).unwrap();
assert_eq!(alns.len(), 1);
assert_eq!(alns[0].sequences[0].1, "ACGTAAAA");
assert_eq!(alns[0].sequences[1].1, "TGCACCCC");
}
}