#![warn(clippy::all, clippy::pedantic, clippy::cargo)]
#![allow(
clippy::missing_errors_doc,
clippy::module_name_repetitions,
clippy::cast_precision_loss,
clippy::cast_possible_truncation
)]
mod block;
pub mod convert;
mod counts;
mod error;
pub mod nucleotide;
mod quad;
mod reader;
use std::cmp::min;
use std::default::Default;
use std::fs::File;
use std::io::{BufReader, Cursor, Read, Seek, SeekFrom};
use std::ops::{Bound, Deref, RangeBounds};
use std::path::Path;
use crate::block::{Block, Blocks};
use crate::quad::NUC_QUAD_LOOKUP;
use crate::reader::{Reader, ValueReader};
pub use crate::counts::BaseCounts;
pub use crate::error::{Error, Result};
pub struct TwoBitFile<R: Read + Seek> {
reader: ValueReader<R>,
sequences: SequenceRecords,
softmask_enabled: bool,
}
pub type BoxTwoBitFile = TwoBitFile<Box<dyn Reader>>;
pub type TwoBitPhysicalFile = TwoBitFile<BufReader<File>>;
pub type TwoBitMemoryFile = TwoBitFile<Cursor<Vec<u8>>>;
#[derive(Debug, Clone)]
pub(crate) struct SequenceRecord {
chr: String,
offset: u64,
length: usize,
blocks_n: Blocks,
blocks_soft_mask: Blocks,
}
#[derive(Debug, Clone)]
pub(crate) struct SequenceRecords(Vec<SequenceRecord>);
impl SequenceRecords {
#[inline]
pub fn query(&self, chr: impl AsRef<str>) -> Result<&SequenceRecord> {
let chr = chr.as_ref();
self.0
.iter()
.find(|seq| seq.chr == chr)
.ok_or_else(|| Error::MissingName(chr.to_string()))
}
}
impl Deref for SequenceRecords {
type Target = [SequenceRecord];
#[inline]
fn deref(&self) -> &Self::Target {
&self.0
}
}
#[derive(Debug, Clone, PartialEq)]
pub struct SequenceInfo {
pub chr: String,
pub length: usize,
pub hard_masks_total_length: usize,
pub soft_masks_total_length: usize,
}
impl SequenceRecord {
pub fn info(&self) -> SequenceInfo {
SequenceInfo {
chr: self.chr.clone(),
length: self.length,
hard_masks_total_length: self.blocks_n.count(),
soft_masks_total_length: self.blocks_soft_mask.count(),
}
}
}
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct TwoBitFileInfo {
pub file_size: u64,
pub num_chromosomes: usize,
pub total_sequence_length: usize,
pub hard_masks_total_length: usize,
pub soft_masks_total_length: usize,
}
impl TwoBitFileInfo {
pub(crate) const fn new(file_size: u64) -> Self {
Self {
file_size,
num_chromosomes: 0,
total_sequence_length: 0,
hard_masks_total_length: 0,
soft_masks_total_length: 0,
}
}
pub(crate) fn update(&mut self, seq: &SequenceRecord) {
self.num_chromosomes += 1;
self.total_sequence_length += seq.length;
self.hard_masks_total_length += seq.blocks_n.count();
self.soft_masks_total_length += seq.blocks_soft_mask.count();
}
}
impl TwoBitPhysicalFile {
pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
Self::from_value_reader(ValueReader::open(path)?)
}
}
impl<T> TwoBitFile<Cursor<T>>
where
Cursor<T>: Reader,
{
pub fn from_buf(buf: T) -> Result<Self> {
Self::from_value_reader(ValueReader::from_buf(buf)?)
}
}
impl TwoBitMemoryFile {
pub fn open_and_read<P: AsRef<Path>>(path: P) -> Result<Self> {
Self::from_value_reader(ValueReader::open_and_read(path)?)
}
}
impl<R: Read + Seek> TwoBitFile<R> {
pub fn new(reader: R) -> Result<Self> {
Self::from_value_reader(ValueReader::new(reader)?)
}
pub fn boxed(self) -> BoxTwoBitFile
where
R: 'static,
{
TwoBitFile {
reader: self.reader.boxed(),
sequences: self.sequences,
softmask_enabled: self.softmask_enabled,
}
}
#[must_use]
pub fn enable_softmask(self, softmask_enabled: bool) -> Self {
Self {
softmask_enabled,
..self
}
}
fn from_value_reader(mut reader: ValueReader<R>) -> Result<Self> {
let sequences = reader.sequence_records()?;
Ok(Self {
reader,
sequences,
softmask_enabled: false,
})
}
pub fn read_sequence(
&mut self,
chr: impl AsRef<str>,
range: impl RangeBounds<usize>,
) -> Result<String> {
const NUC: &[u8; 4] = b"TCAG";
let seq = self.sequences.query(&chr)?;
let reader = &mut self.reader;
let start = {
let pos = match range.start_bound() {
Bound::Included(&v) => v,
Bound::Excluded(&v) => v + 1,
Bound::Unbounded => 0,
};
min(pos, seq.length)
};
let end = {
let pos = match range.end_bound() {
Bound::Included(&v) => v + 1,
Bound::Excluded(&v) => v,
Bound::Unbounded => seq.length,
};
min(pos, seq.length)
};
if start >= end {
return Ok(String::new()); }
let first_byte = start / 4;
reader.seek(SeekFrom::Start(seq.offset))?; reader.seek(SeekFrom::Current(first_byte as _))?;
let length = end - start;
let mut out = vec![0_u8; length];
let last_byte = (end - 1) / 4; let skip_start = start % 4;
let parse_byte = |buf: &mut [u8], mut byte: u8| {
for v in buf {
*v = NUC[((byte & 192) >> 6) as usize];
byte <<= 2;
}
};
if first_byte == last_byte {
parse_byte(&mut out, reader.byte()? << (skip_start * 2)); } else {
if skip_start != 0 {
parse_byte(
&mut out[..4 - skip_start],
reader.byte()? << (skip_start * 2),
);
}
for chunk in out[(4 - skip_start) % 4..].chunks_exact_mut(4) {
chunk.copy_from_slice(&NUC_QUAD_LOOKUP[reader.byte()? as usize]);
}
let include_end = ((end - 1) % 4) + 1; if include_end != 4 {
parse_byte(&mut out[length - include_end..], reader.byte()?);
}
}
seq.blocks_n.apply_masks::<true>(&mut out, start);
if self.softmask_enabled {
seq.blocks_soft_mask.apply_masks::<false>(&mut out, start);
}
Ok(unsafe { String::from_utf8_unchecked(out) }) }
pub fn info(&mut self) -> Result<TwoBitFileInfo> {
let mut info = TwoBitFileInfo::new(self.reader.stream_len()?);
self.sequences.iter().for_each(|seq| info.update(seq));
Ok(info)
}
pub fn sequence_info(&self) -> Vec<SequenceInfo> {
self.sequences.iter().map(SequenceRecord::info).collect()
}
pub fn chrom_names(&self) -> Vec<String> {
self.sequences.iter().map(|seq| seq.chr.clone()).collect()
}
pub fn chrom_sizes(&self) -> Vec<usize> {
self.sequences.iter().map(|seq| seq.length).collect()
}
pub fn base_counts(
&mut self,
chr: impl AsRef<str>,
range: impl RangeBounds<usize>,
) -> Result<BaseCounts<usize>> {
let mut counts = BaseCounts::default();
for &nuc in self.read_sequence(chr, range)?.as_bytes() {
counts.update(nuc)?;
}
Ok(counts)
}
pub fn hard_masked_blocks(
&mut self,
chr: impl AsRef<str>,
range: impl RangeBounds<usize>,
) -> Result<Vec<Block>> {
Ok(self
.sequences
.query(chr)?
.blocks_n
.overlaps(range)
.cloned()
.collect())
}
pub fn soft_masked_blocks(
&mut self,
chr: impl AsRef<str>,
range: impl RangeBounds<usize>,
) -> Result<Vec<Block>> {
Ok(self
.sequences
.query(chr)?
.blocks_soft_mask
.overlaps(range)
.cloned()
.collect())
}
}
#[cfg(test)]
mod tests {
use super::error::Result;
use super::reader::Reader;
use super::{BaseCounts, SequenceInfo, TwoBitFile, TwoBitFileInfo};
const TESTFILE: &str = "assets/foo.2bit";
fn run_test(softmask_enabled: bool, func: impl Fn(TwoBitFile<Box<dyn Reader>>) -> Result<()>) {
let mut files = vec![
TwoBitFile::open(TESTFILE).expect("unit-test").boxed(),
TwoBitFile::open_and_read(TESTFILE)
.expect("unit-test")
.boxed(),
];
for mut tb in files.drain(..) {
if softmask_enabled {
tb = tb.enable_softmask(true);
}
func(tb).expect("unit-test");
}
}
#[test]
fn test_chrom() {
run_test(true, |bit| {
assert_eq!(bit.chrom_names(), vec!["chr1", "chr2"]);
assert_eq!(bit.chrom_sizes(), vec![150, 100]);
Ok(())
});
}
#[test]
fn test_sequence_info() {
run_test(true, |bit| {
let expected_info = vec![
SequenceInfo {
chr: "chr1".to_owned(),
length: 150,
hard_masks_total_length: 100,
soft_masks_total_length: 8,
},
SequenceInfo {
chr: "chr2".to_owned(),
length: 100,
hard_masks_total_length: 50,
soft_masks_total_length: 0,
},
];
assert_eq!(bit.sequence_info(), expected_info);
Ok(())
});
}
#[test]
fn test_read_sequence() {
run_test(true, |mut bit| {
let seq = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNACGTACGTACGTagctagctGATCGATCGTAGCTAGCTAGCTAGCTGATCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN";
assert_eq!(bit.read_sequence("chr1", ..)?, seq);
let n = 12;
for start in 0..n {
for end_range in &[start..(start + n), (seq.len() - n)..seq.len()] {
for end in end_range.clone() {
if end > start {
assert_eq!(bit.read_sequence("chr1", start..end)?, &seq[start..end]);
}
if end > 0 {
assert_eq!(bit.read_sequence("chr1", ..end)?, &seq[..end]);
}
assert_eq!(bit.read_sequence("chr1", start..)?, &seq[start..]);
if end < seq.len() {
assert_eq!(bit.read_sequence("chr1", start..=end)?, &seq[start..=end]);
assert_eq!(bit.read_sequence("chr1", ..=end)?, &seq[..=end]);
}
}
}
}
Ok(())
});
}
#[test]
fn test_bases() {
run_test(true, |mut bit| {
let full_counts = BaseCounts::new(12, 12, 13, 13, 100);
let full_percentages = BaseCounts::new(0.08, 0.08, 13. / 150., 13. / 150., 100. / 150.);
let c = bit.base_counts("chr1", ..)?;
assert_eq!(c, full_counts);
assert_eq!(c.percentages(), full_percentages);
let partial_counts = BaseCounts::new(6, 6, 6, 6, 26);
let partial_percentages = BaseCounts::new(0.12, 0.12, 0.12, 0.12, 26. / 50.);
let c = bit.base_counts("chr1", 24..74)?;
assert_eq!(c, partial_counts);
assert_eq!(c.percentages(), partial_percentages);
Ok(())
});
}
#[test]
fn test_info() {
run_test(true, |mut bit| {
let info = TwoBitFileInfo {
file_size: 161,
num_chromosomes: 2,
total_sequence_length: 250,
hard_masks_total_length: 150,
soft_masks_total_length: 8,
};
assert_eq!(bit.info()?, info);
Ok(())
});
}
#[test]
fn test_hard_masked_blocks() {
run_test(true, |mut bit| {
assert_eq!(bit.hard_masked_blocks("chr1", ..)?, vec![0..50, 100..150]);
Ok(())
});
}
#[test]
fn test_soft_masked_blocks() {
run_test(true, |mut bit| {
assert_eq!(bit.soft_masked_blocks("chr1", ..)?, vec![62..70]);
Ok(())
});
}
}