use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use cyanea_core::{CyaneaError, Result};
#[derive(Debug, Clone)]
pub struct MafBlock {
pub score: Option<f64>,
pub sequences: Vec<MafSequence>,
}
#[derive(Debug, Clone)]
pub struct MafSequence {
pub src: String,
pub start: u64,
pub size: u64,
pub strand: char,
pub src_size: u64,
pub text: String,
}
#[derive(Debug, Clone)]
pub struct MafStats {
pub block_count: u64,
pub total_aligned_bases: u64,
pub species: Vec<String>,
}
pub fn parse_maf(path: impl AsRef<Path>) -> Result<Vec<MafBlock>> {
let path = path.as_ref();
let file = File::open(path).map_err(|e| {
CyaneaError::Io(std::io::Error::new(
e.kind(),
format!("{}: {}", path.display(), e),
))
})?;
let reader = BufReader::new(file);
let mut blocks = Vec::new();
let mut current_block: Option<MafBlock> = None;
for (line_num, line_result) in reader.lines().enumerate() {
let line = line_result.map_err(|e| {
CyaneaError::Io(std::io::Error::new(
e.kind(),
format!("{}: line {}: {}", path.display(), line_num + 1, e),
))
})?;
let line = line.trim();
if line.is_empty() {
if let Some(block) = current_block.take() {
if !block.sequences.is_empty() {
blocks.push(block);
}
}
continue;
}
if line.starts_with('#') {
continue;
}
if line.starts_with("a ") || line == "a" {
if let Some(block) = current_block.take() {
if !block.sequences.is_empty() {
blocks.push(block);
}
}
let score = parse_score(line);
current_block = Some(MafBlock {
score,
sequences: Vec::new(),
});
} else if line.starts_with("s ") {
let seq = parse_s_line(line, line_num + 1, path)?;
if let Some(ref mut block) = current_block {
block.sequences.push(seq);
}
}
}
if let Some(block) = current_block.take() {
if !block.sequences.is_empty() {
blocks.push(block);
}
}
Ok(blocks)
}
pub fn maf_stats(path: impl AsRef<Path>) -> Result<MafStats> {
let blocks = parse_maf(path)?;
let mut total_aligned_bases: u64 = 0;
let mut species_set = std::collections::HashSet::new();
let mut species_order = Vec::new();
for block in &blocks {
for seq in &block.sequences {
total_aligned_bases += seq.size;
if species_set.insert(seq.src.clone()) {
species_order.push(seq.src.clone());
}
}
}
Ok(MafStats {
block_count: blocks.len() as u64,
total_aligned_bases,
species: species_order,
})
}
fn parse_score(line: &str) -> Option<f64> {
for part in line.split_whitespace() {
if let Some(val) = part.strip_prefix("score=") {
return val.parse().ok();
}
}
None
}
fn parse_s_line(line: &str, line_num: usize, path: &Path) -> Result<MafSequence> {
let fields: Vec<&str> = line.split_whitespace().collect();
if fields.len() < 7 {
return Err(CyaneaError::Parse(format!(
"{}: line {}: expected 7 fields in 's' line, found {}",
path.display(),
line_num,
fields.len()
)));
}
let src = fields[1].to_string();
let start: u64 = fields[2].parse().map_err(|_| {
CyaneaError::Parse(format!(
"{}: line {}: invalid start '{}'",
path.display(),
line_num,
fields[2]
))
})?;
let size: u64 = fields[3].parse().map_err(|_| {
CyaneaError::Parse(format!(
"{}: line {}: invalid size '{}'",
path.display(),
line_num,
fields[3]
))
})?;
let strand = fields[4].chars().next().unwrap_or('+');
if strand != '+' && strand != '-' {
return Err(CyaneaError::Parse(format!(
"{}: line {}: invalid strand '{}'",
path.display(),
line_num,
fields[4]
)));
}
let src_size: u64 = fields[5].parse().map_err(|_| {
CyaneaError::Parse(format!(
"{}: line {}: invalid srcSize '{}'",
path.display(),
line_num,
fields[5]
))
})?;
let text = fields[6].to_string();
Ok(MafSequence {
src,
start,
size,
strand,
src_size,
text,
})
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
use tempfile::NamedTempFile;
fn write_maf(content: &str) -> NamedTempFile {
let mut file = NamedTempFile::with_suffix(".maf").unwrap();
write!(file, "{}", content).unwrap();
file.flush().unwrap();
file
}
#[test]
fn maf_parse_two_blocks() {
let file = write_maf(
"##maf version=1 scoring=tba.v8\n\
\n\
a score=23262.0\n\
s hg38.chr7 27578828 38 + 159345973 AAA-GGGAATGTTAACCAAATGA---ATTGTCTCTTACGGTG\n\
s panTro4.chr6 28741140 38 + 161576975 AAA-GGGAATGTTAACCAAATGA---ATTGTCTCTTACGGTG\n\
s ponAbe2.chr6 28594507 38 + 174210431 AAA-GGGAATGTTAACCAAATGA---ATTGTCTCTTACGGTG\n\
\n\
a score=5000.0\n\
s hg38.chr7 27699739 6 + 159345973 TAAAGA\n\
s panTro4.chr6 28862317 6 + 161576975 TAAAGA\n\
\n",
);
let blocks = parse_maf(file.path()).unwrap();
assert_eq!(blocks.len(), 2);
assert!((blocks[0].score.unwrap() - 23262.0).abs() < f64::EPSILON);
assert_eq!(blocks[0].sequences.len(), 3);
assert_eq!(blocks[0].sequences[0].src, "hg38.chr7");
assert_eq!(blocks[0].sequences[0].start, 27578828);
assert_eq!(blocks[0].sequences[0].size, 38);
assert_eq!(blocks[0].sequences[0].strand, '+');
assert_eq!(blocks[0].sequences[0].src_size, 159345973);
assert_eq!(blocks[1].sequences.len(), 2);
assert!((blocks[1].score.unwrap() - 5000.0).abs() < f64::EPSILON);
}
#[test]
fn maf_pairwise() {
let file = write_maf(
"a score=100.0\n\
s ref.chr1 100 50 + 1000 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\n\
s query.ctg1 0 50 + 500 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\n\
\n",
);
let blocks = parse_maf(file.path()).unwrap();
assert_eq!(blocks.len(), 1);
assert_eq!(blocks[0].sequences.len(), 2);
assert_eq!(blocks[0].sequences[0].src, "ref.chr1");
assert_eq!(blocks[0].sequences[1].src, "query.ctg1");
}
#[test]
fn maf_minus_strand() {
let file = write_maf(
"a score=200.0\n\
s hg38.chr1 1000 50 + 248956422 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\n\
s mm10.chr5 5000 50 - 151834684 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC\n\
\n",
);
let blocks = parse_maf(file.path()).unwrap();
assert_eq!(blocks[0].sequences[0].strand, '+');
assert_eq!(blocks[0].sequences[1].strand, '-');
assert_eq!(blocks[0].sequences[1].src_size, 151834684);
}
#[test]
fn maf_stats_computed() {
let file = write_maf(
"a score=100.0\n\
s species1.chr1 0 30 + 1000 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n\
s species2.chr2 0 30 + 2000 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n\
\n\
a score=200.0\n\
s species1.chr1 50 20 + 1000 AAAAAAAAAAAAAAAAAAAA\n\
s species3.chr3 10 20 + 3000 AAAAAAAAAAAAAAAAAAAA\n\
\n",
);
let stats = maf_stats(file.path()).unwrap();
assert_eq!(stats.block_count, 2);
assert_eq!(stats.total_aligned_bases, 100); assert_eq!(stats.species.len(), 3);
assert!(stats.species.contains(&"species1.chr1".to_string()));
assert!(stats.species.contains(&"species2.chr2".to_string()));
assert!(stats.species.contains(&"species3.chr3".to_string()));
}
#[test]
fn maf_empty_block_skipped() {
let file = write_maf(
"a score=0\n\
\n\
a score=100.0\n\
s src1 0 10 + 100 AAAAAAAAAA\n\
\n",
);
let blocks = parse_maf(file.path()).unwrap();
assert_eq!(blocks.len(), 1);
assert_eq!(blocks[0].sequences.len(), 1);
}
}