use bytes::Bytes;
use genomap::GenomeMap;
use indexmap::IndexMap;
use noodles::core::Region;
use noodles::fasta::indexed_reader;
use noodles::fasta::{io::BufReadSeek, reader, record::Sequence, IndexedReader};
use std::ops::Deref;
use std::ops::Index;
use std::path::PathBuf;
use std::str;
use std::{cell::Ref, collections::HashSet, fmt};
use super::lazy::LazyLoader;
use crate::prelude::GRangesError;
use crate::ranges::try_range;
use crate::traits::Sequences;
use crate::{Position, INTERNAL_ERROR_MESSAGE};
#[derive(Clone, Debug, PartialEq)]
pub struct Nucleotides(Bytes);
impl fmt::Display for Nucleotides {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match str::from_utf8(&self.0) {
Ok(s) => write!(f, "{}", s),
Err(_) => Err(fmt::Error),
}
}
}
impl Deref for Nucleotides {
type Target = Bytes;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl From<&Sequence> for Nucleotides {
fn from(sequence: &Sequence) -> Self {
let seq = Bytes::from(sequence.as_ref().to_vec());
Nucleotides(seq)
}
}
impl From<String> for Nucleotides {
fn from(s: String) -> Self {
let bytes = Bytes::from(s.into_bytes());
Nucleotides(bytes)
}
}
impl<'a> From<&'a str> for Nucleotides {
fn from(s: &'a str) -> Self {
let bytes = Bytes::from(s.as_bytes().to_vec());
Nucleotides(bytes)
}
}
impl Nucleotides {
pub fn len(&self) -> usize {
self.0.len()
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn iter(&self) -> NucleotideIterator<'_> {
let inner = self.0.iter();
NucleotideIterator { inner }
}
}
pub struct NucleotideIterator<'a> {
inner: std::slice::Iter<'a, u8>,
}
impl<'a> Iterator for NucleotideIterator<'a> {
type Item = &'a u8;
fn next(&mut self) -> Option<Self::Item> {
self.inner.next()
}
}
pub struct NucleotideSequences {
data: GenomeMap<Nucleotides>,
}
impl NucleotideSequences {
pub fn from_fasta(
filepath: impl Into<PathBuf>,
seqnames: Option<Vec<String>>,
) -> Result<Self, GRangesError> {
let data = parse_fasta(filepath, seqnames)?;
Ok(Self { data })
}
pub fn seqlens(&self) -> IndexMap<String, Position> {
self.data
.iter()
.map(|(k, v)| (k.clone(), v.len().try_into().unwrap()))
.collect()
}
}
impl Index<&str> for NucleotideSequences {
type Output = Nucleotides;
fn index(&self, index: &str) -> &Self::Output {
self.get_sequence(index).unwrap()
}
}
impl Sequences for NucleotideSequences {
type Container<'a> = &'a Nucleotides;
type Slice<'a> = &'a [u8];
fn seqnames(&self) -> Vec<String> {
self.seqlens().keys().cloned().collect()
}
fn get_sequence(&self, seqname: &str) -> Result<Self::Container<'_>, GRangesError> {
self.data
.get(seqname)
.ok_or(GRangesError::MissingSequenceName(seqname.to_string()))
}
fn region_map<V, F>(
&self,
func: &F,
seqname: &str,
start: Position,
end: Position,
) -> Result<V, GRangesError>
where
F: Fn(Self::Slice<'_>) -> V,
{
let seq = self.get_sequence(seqname)?;
let range = try_range(start, end, seq.len().try_into().unwrap())?;
let data = &seq[range];
Ok(func(data))
}
fn get_sequence_length(&self, seqname: &str) -> Result<Position, GRangesError> {
let len: Position = self.get_sequence(seqname)?.len().try_into().unwrap();
Ok(len)
}
}
#[derive(Debug)]
pub struct LazyNucleotideSequences {
seqlens: IndexMap<String, Position>,
lazy: LazyLoader<IndexedReader<Box<dyn BufReadSeek>>, Nucleotides, String>,
}
impl LazyNucleotideSequences {
pub fn new(
filepath: impl Into<PathBuf>,
seqnames: Option<Vec<String>>,
) -> Result<Self, GRangesError> {
let filepath = filepath.into();
let reader = indexed_reader::Builder::default().build_from_path(filepath)?;
let allowed_seqnames = seqnames.map(|c| c.into_iter().collect::<HashSet<_>>());
let seqlens: IndexMap<String, Position> = reader
.index()
.iter()
.filter_map(|r| {
let name = String::from_utf8(r.name().to_vec()).unwrap_or_else(|_| {
panic!(
"{}\nError in converting FASTA name to a UTF8 String.",
&INTERNAL_ERROR_MESSAGE
)
});
if allowed_seqnames
.as_ref()
.map_or(true, |seqnames| seqnames.contains(&name))
{
#[allow(clippy::useless_conversion)] Some((name, r.length().try_into().unwrap()))
} else {
None
}
})
.collect();
let lazy = LazyLoader::new(reader, move |reader, seqname: &String| {
if allowed_seqnames
.as_ref()
.map_or(true, |seqnames| seqnames.contains(seqname))
{
let seqname_u8 = seqname.clone().into_bytes();
let region = Region::new(seqname_u8, ..);
let record = reader.query(®ion)?;
let seq = record.sequence();
let nucs: Nucleotides = seq.into();
Ok(nucs)
} else {
Err(GRangesError::MissingSequenceName(seqname.to_string()))
}
});
Ok(Self { seqlens, lazy })
}
pub fn is_empty(&self) -> bool {
self.lazy.is_empty()
}
pub fn clear(&self) {
self.lazy.clear()
}
pub fn seqlens(&self) -> IndexMap<String, Position> {
self.seqlens.clone()
}
pub fn is_loaded(&self, key: &str) -> bool {
self.lazy.is_loaded(&key.to_string())
}
}
impl Sequences for LazyNucleotideSequences {
type Container<'a> = Ref<'a, Nucleotides>;
type Slice<'a> = &'a [u8];
fn seqnames(&self) -> Vec<String> {
self.seqlens.keys().cloned().collect()
}
fn get_sequence(&self, seqname: &str) -> Result<Self::Container<'_>, GRangesError> {
self.lazy.get_data(&seqname.to_string())
}
fn region_map<V, F>(
&self,
func: &F,
seqname: &str,
start: Position,
end: Position,
) -> Result<V, GRangesError>
where
F: for<'b> Fn(&'b [u8]) -> V,
{
let seq = self.get_sequence(seqname)?;
let range = try_range(start, end, seq.len().try_into().unwrap())?;
Ok(func(&seq[range]))
}
fn get_sequence_length(&self, seqname: &str) -> Result<Position, GRangesError> {
self.seqlens
.get(seqname)
.ok_or(GRangesError::MissingSequenceName(seqname.to_string()))
.copied()
}
}
fn option_vec_to_hashset(x: Option<Vec<String>>) -> Option<HashSet<String>> {
x.map(HashSet::from_iter)
}
pub fn parse_fasta(
filepath: impl Into<PathBuf>,
seqnames: Option<Vec<String>>,
) -> Result<GenomeMap<Nucleotides>, GRangesError> {
let seqnames_set = option_vec_to_hashset(seqnames);
let filepath = filepath.into();
let mut reader = reader::Builder.build_from_path(filepath)?;
let mut sequences = GenomeMap::new();
for result in reader.records() {
let record = result?;
let name = String::from_utf8(record.definition().name().to_vec())?;
if seqnames_set
.as_ref()
.map_or(true, |keep_seqnames| keep_seqnames.contains(&name))
{
let seq = Bytes::from(record.sequence().as_ref().to_vec());
sequences.insert(&name, Nucleotides(seq))?;
}
}
Ok(sequences)
}
pub fn gc_content(seq: &[u8]) -> f64 {
let gc_count = seq
.iter()
.filter(|&&base| {
matches!(
base.to_ascii_uppercase(),
b'G' | b'C' | b'S' | b'V' | b'H' | b'D' | b'B' | b'N'
)
})
.count() as f64;
let total_count = seq.len() as f64;
if total_count == 0.0 {
0.0
} else {
gc_count / total_count
}
}
pub fn gc_content_strict(seq: &[u8]) -> f64 {
let gc_count = seq
.iter()
.filter(|&&base| matches!(base.to_ascii_uppercase(), b'G' | b'C'))
.count() as f64;
let total_count = seq
.iter()
.filter(|&&base| matches!(base.to_ascii_uppercase(), b'A' | b'T' | b'G' | b'C'))
.count() as f64;
if total_count == 0.0 {
0.0
} else {
gc_count / total_count
}
}
#[cfg(test)]
mod tests {
use super::{gc_content_strict, LazyNucleotideSequences, NucleotideSequences};
use crate::{granges::GRangesEmpty, sequences::nucleotide::Nucleotides, traits::Sequences};
#[test]
fn test_nucleotide_sequences() {
let ref_file = "tests_data/sequences/test_case_01.fa.gz";
let reference =
NucleotideSequences::from_fasta(ref_file, None).expect("could not load reference");
assert_eq!(
*reference.get_sequence("chr1").unwrap(),
Nucleotides::from("TTCACTACTATTAGTACTCACGGCGCAATA")
);
assert_eq!(reference.seqnames().len(), 2);
assert_eq!(*reference.seqlens().get("chr1").unwrap(), 30);
}
#[test]
fn test_lazyload() {
let ref_file = "tests_data/sequences/test_case_01.fa.gz";
let reference =
LazyNucleotideSequences::new(ref_file, None).expect("could not load reference");
assert_eq!(*reference.seqlens.get("chr1").unwrap(), 30);
assert_eq!(*reference.seqlens.get("chr2").unwrap(), 100);
let seqlens = reference.seqlens();
assert_eq!(reference.is_loaded("chr1"), false);
assert_eq!(reference.is_loaded("chr2"), false);
let seq1_len = seqlens.get("chr1").unwrap();
fn get_len(seq: &[u8]) -> usize {
seq.len()
}
let total_len = reference
.region_map(&get_len, "chr1", 0, *seq1_len)
.unwrap();
assert_eq!(total_len, *seq1_len as usize);
assert_eq!(reference.is_loaded("chr1"), true);
assert_eq!(reference.is_loaded("chr2"), false);
let chr2_len = seqlens.get("chr2").unwrap();
let total_len = reference
.region_map(&get_len, "chr2", 0, *chr2_len)
.unwrap();
assert_eq!(total_len, *chr2_len as usize);
assert!(reference.is_loaded("chr2"));
}
#[test]
fn lazy_region_apply_slice() {
let ref_file = "tests_data/sequences/test_case_01.fa.gz";
let reference =
LazyNucleotideSequences::new(ref_file, None).expect("could not load reference");
#[allow(non_snake_case)]
let total_Cs = reference
.region_map(
&|seq| seq.iter().filter(|c| **c == b'C').count(),
"chr1",
3,
10,
)
.unwrap();
assert_eq!(total_Cs, 2);
#[allow(non_snake_case)]
let total_As = reference
.region_map(
&|seq| seq.iter().filter(|c| **c == b'A').count(),
"chr1",
3,
10,
)
.unwrap();
assert_eq!(total_As, 3);
}
#[test]
fn test_map_into_granges() {
let ref_file = "tests_data/sequences/test_case_01.fa.gz";
let reference =
LazyNucleotideSequences::new(ref_file, None).expect("could not load reference");
let seqlens = reference.seqlens();
let windows = GRangesEmpty::from_windows(&seqlens, 10, None, false).unwrap();
let make_string = |seq: &[u8]| String::from_utf8(seq.to_vec()).unwrap();
let seqs_windows = reference
.region_map_into_granges(&windows, &make_string)
.unwrap();
let subseqs = seqs_windows.data_refs_by_seqname().unwrap();
let subseqs_chr1 = subseqs.get("chr1").unwrap();
let chr1_reconstructed: Nucleotides = subseqs_chr1
.iter()
.map(AsRef::as_ref)
.collect::<Vec<&str>>()
.join("")
.into();
let chr1_seq = reference.get_sequence("chr1").unwrap();
assert_eq!(chr1_reconstructed, *chr1_seq);
}
#[test]
fn test_map_into_granges_gc() {
let ref_file = "tests_data/sequences/test_case_01.fa.gz";
let reference =
LazyNucleotideSequences::new(ref_file, None).expect("could not load reference");
let seqlens = reference.seqlens();
let windows = GRangesEmpty::from_windows(&seqlens, 10, Some(5), false).unwrap();
let gc_windows = reference
.region_map_into_granges(&windows, &gc_content_strict)
.unwrap();
let gc = gc_windows.data_by_seqname().unwrap();
assert_eq!(
gc.get("chr1").unwrap().to_vec(),
vec![0.3, 0.2, 0.3, 0.7, 0.6, 0.2]
);
}
#[cfg(test)]
#[cfg(feature = "human_tests")]
fn test_lazyload_human() {
use crate::{
misc::assert_float_eq,
prelude::*,
sequences::nucleotide::{gc_content_strict, LazyNucleotideSequences},
};
let ref_file = "tests_data/human/hg38.analysisSet.fa.bgz";
let file_exists = std::fs::metadata(ref_file).is_ok();
let reference;
if file_exists {
let chroms: Vec<String> = (1..=22).map(|v| format!("chr{}", v)).collect();
assert_eq!(chroms.len(), 22);
reference = LazyNucleotideSequences::new(ref_file, Some(chroms))
.expect("could not load reference");
let expected_numseqs = reference.seqlens().unwrap().len();
assert_eq!(expected_numseqs, 22);
} else {
println!(
"human reference file '{}' not found, tests skipped.",
ref_file
);
return;
}
fn get_len(seq: &[u8]) -> usize {
seq.len()
}
let seqlens = reference.seqlens().unwrap();
assert_eq!(reference.is_loaded("chr1"), false);
let chr1_len = seqlens.get("chr1").unwrap();
let total_len = reference
.region_apply(get_len, "chr1", 0, *chr1_len - 1)
.unwrap();
assert_eq!(total_len, *chr1_len as usize);
assert_eq!(reference.is_loaded("chr1"), true);
assert_eq!(reference.is_loaded("chr2"), false);
let chr2_len = seqlens.get("chr2").unwrap();
let total_len = reference
.region_apply(get_len, "chr2", 0, *chr2_len - 1)
.unwrap();
assert_eq!(total_len, *chr2_len as usize);
let gc = reference
.region_apply(gc_content_strict, "chr1", 0, *chr1_len - 1)
.unwrap();
let expected_gc = 0.4172;
assert_float_eq(gc, expected_gc, 0.0001);
}
}