use anyhow::{Context, Result};
use flate2::read::MultiGzDecoder;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Genome {
pub chr_names: Vec<String>,
pub seqs: Vec<Vec<u8>>,
pub name_to_id: HashMap<String, usize>,
}
impl Genome {
pub fn from_fasta<P: AsRef<Path>>(path: P) -> Result<Self> {
let path_ref = path.as_ref();
let reader = Self::open_fasta_reader(path_ref)?;
let mut records: Vec<(String, Vec<u8>)> = Vec::new();
let mut current_name: Option<String> = None;
let mut current_seq: Vec<u8> = Vec::new();
for line_result in reader.lines() {
let line = line_result.with_context(|| {
format!("failed to read FASTA line from {}", path_ref.display())
})?;
if let Some(rest) = line.strip_prefix('>') {
if let Some(name) = current_name.take() {
records.push((name, std::mem::take(&mut current_seq)));
}
let name = rest.split_whitespace().next().unwrap_or("").to_string();
if name.is_empty() {
anyhow::bail!("empty FASTA record name");
}
current_name = Some(name);
} else if !line.trim().is_empty() {
current_seq.extend(line.trim().as_bytes());
}
}
if let Some(name) = current_name.take() {
records.push((name, current_seq));
}
if records.is_empty() {
anyhow::bail!("FASTA file contains no records: {}", path_ref.display());
}
Self::new(records)
}
fn open_fasta_reader<P: AsRef<Path>>(path: P) -> Result<Box<dyn BufRead>> {
let path_ref = path.as_ref();
let file = File::open(path_ref)
.with_context(|| format!("failed to open FASTA file {}", path_ref.display()))?;
let is_gz = path_ref.extension().map(|ext| ext == "gz").unwrap_or(false);
if is_gz {
let decoder = MultiGzDecoder::new(file);
Ok(Box::new(BufReader::new(decoder)))
} else {
Ok(Box::new(BufReader::new(file)))
}
}
pub fn new(records: Vec<(String, Vec<u8>)>) -> Result<Self> {
let mut chr_names = Vec::with_capacity(records.len());
let mut seqs = Vec::with_capacity(records.len());
let mut name_to_id = HashMap::with_capacity(records.len());
for (name, seq) in records {
if name.is_empty() {
anyhow::bail!("empty chromosome name");
}
if name_to_id.contains_key(&name) {
anyhow::bail!("duplicate chromosome name: {name}");
}
let chr_id = chr_names.len();
name_to_id.insert(name.clone(), chr_id);
chr_names.push(name);
seqs.push(Self::uppercase_sequence(&seq));
}
Ok(Self {
chr_names,
seqs,
name_to_id,
})
}
pub fn chr_id(&self, name: &str) -> Option<usize> {
self.name_to_id.get(name).copied()
}
pub fn chr_name(&self, chr_id: usize) -> Option<&str> {
self.chr_names.get(chr_id).map(|s| s.as_str())
}
pub fn chr_len(&self, chr_id: usize) -> Option<u32> {
self.seqs.get(chr_id).map(|seq| seq.len() as u32)
}
pub fn chr_lengths(&self) -> Vec<u32> {
self.seqs.iter().map(|seq| seq.len() as u32).collect()
}
pub fn chr_names(&self) -> Vec<String> {
self.chr_names.clone()
}
pub fn base(&self, chr_id: usize, pos0: u32) -> Option<u8> {
self.seqs
.get(chr_id)
.and_then(|seq| seq.get(pos0 as usize))
.copied()
.map(|b| b.to_ascii_uppercase())
}
pub fn slice(&self, chr_id: usize, start0: u32, end0: u32) -> Option<&[u8]> {
if start0 > end0 {
return None;
}
self.seqs
.get(chr_id)
.and_then(|seq| seq.get(start0 as usize..end0 as usize))
}
pub fn base_matches(&self, chr_id: usize, pos0: u32, base: u8) -> bool {
self.base(chr_id, pos0)
.map(|ref_base| ref_base == base.to_ascii_uppercase())
.unwrap_or(false)
}
pub fn record_name(id: &[u8]) -> Result<String> {
let id_str = std::str::from_utf8(id).context("FASTA record id is not valid UTF-8")?;
let name = id_str.split_whitespace().next().unwrap_or("").to_string();
if name.is_empty() {
anyhow::bail!("empty FASTA record id");
}
Ok(name)
}
pub fn uppercase_sequence(seq: &[u8]) -> Vec<u8> {
seq.iter().map(|b| b.to_ascii_uppercase()).collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::AlignedRead;
use crate::ReadOpKind;
use crate::Strand;
#[test]
fn new_builds_genome_and_uppercases_sequences() {
let genome = Genome::new(vec![
("chr1".to_string(), b"acgtN".to_vec()),
("chr2".to_string(), b"ttcc".to_vec()),
])
.unwrap();
assert_eq!(genome.chr_id("chr1"), Some(0));
assert_eq!(genome.chr_id("chr2"), Some(1));
assert_eq!(genome.chr_name(0), Some("chr1"));
assert_eq!(genome.chr_len(0), Some(5));
assert_eq!(genome.chr_lengths(), vec![5, 4]);
assert_eq!(genome.base(0, 0), Some(b'A'));
assert_eq!(genome.base(0, 1), Some(b'C'));
assert_eq!(genome.base(0, 4), Some(b'N'));
}
#[test]
fn new_rejects_duplicate_names() {
let result = Genome::new(vec![
("chr1".to_string(), b"AAAA".to_vec()),
("chr1".to_string(), b"CCCC".to_vec()),
]);
assert!(result.is_err());
}
#[test]
fn base_returns_none_outside_reference() {
let genome = Genome::new(vec![("chr1".to_string(), b"ACGT".to_vec())]).unwrap();
assert_eq!(genome.base(0, 4), None);
assert_eq!(genome.base(1, 0), None);
}
#[test]
fn slice_returns_half_open_reference_interval() {
let genome = Genome::new(vec![("chr1".to_string(), b"ACGTACGT".to_vec())]).unwrap();
assert_eq!(genome.slice(0, 2, 6), Some(&b"GTAC"[..]));
assert_eq!(genome.slice(0, 6, 2), None);
assert_eq!(genome.slice(0, 0, 99), None);
}
#[test]
fn base_matches_is_case_insensitive_for_query_base() {
let genome = Genome::new(vec![("chr1".to_string(), b"ACGT".to_vec())]).unwrap();
assert!(genome.base_matches(0, 1, b'C'));
assert!(genome.base_matches(0, 1, b'c'));
assert!(!genome.base_matches(0, 1, b'A'));
assert!(!genome.base_matches(0, 99, b'A'));
}
#[test]
fn record_name_uses_first_whitespace_token() {
let name = Genome::record_name(b"chr1 some description here").unwrap();
assert_eq!(name, "chr1");
}
#[test]
fn record_name_rejects_empty_id() {
assert!(Genome::record_name(b"").is_err());
assert!(Genome::record_name(b" ").is_err());
}
fn make_read(
seq: &[u8],
pairs: Vec<(ReadOpKind, u32)>,
) -> AlignedRead {
AlignedRead::new(
0,
Strand::Plus,
0,
seq.to_vec(),
Some(vec![30; seq.len()]),
pairs,
)
}
#[test]
fn base_at_ref_pos_simple() {
let read = make_read(
b"ACGT",
vec![(ReadOpKind::Match, 4)],
);
assert_eq!(read.base_at_ref_pos(0).unwrap().base, b'A');
assert_eq!(read.base_at_ref_pos(1).unwrap().base, b'C');
}
#[test]
fn base_at_ref_pos_softclip() {
let read = make_read(
b"NNNNACGT",
vec![
(ReadOpKind::SoftClip, 4),
(ReadOpKind::Match, 4),
],
);
assert_eq!(read.base_at_ref_pos(0).unwrap().base, b'A');
}
#[test]
fn base_at_ref_pos_splice() {
let read = make_read(
b"AAAACCCC",
vec![
(ReadOpKind::Match, 4),
(ReadOpKind::RefSkip, 100),
(ReadOpKind::Match, 4),
],
);
assert_eq!(read.base_at_ref_pos(0).unwrap().base, b'A');
assert_eq!(read.base_at_ref_pos(3).unwrap().base, b'A');
assert!(read.base_at_ref_pos(50).is_none());
assert_eq!(read.base_at_ref_pos(104).unwrap().base, b'C');
}
#[test]
fn base_at_ref_pos_insertion() {
let read = make_read(
b"AACGT",
vec![
(ReadOpKind::Match, 2),
(ReadOpKind::Ins, 1),
(ReadOpKind::Match, 2),
],
);
assert_eq!(read.base_at_ref_pos(0).unwrap().base, b'A');
assert_eq!(read.base_at_ref_pos(1).unwrap().base, b'A');
assert_eq!(read.base_at_ref_pos(2).unwrap().base, b'G');
}
}