use std::{
ops::Range,
path::{Path, PathBuf},
};
use rust_htslib::faidx;
use tokio::sync::Mutex;
use crate::common::{
ImmutableSequence, MutableSequence,
coords::{GenomePosition, GenomeRegion},
};
pub struct ReferenceReader {
name: String,
inner: Mutex<faidx::Reader>,
soft_mask: bool,
}
#[derive(Debug)]
pub struct ReferenceQueryResult {
pub region: GenomeRegion,
pub sequence: ImmutableSequence,
pub range_in_sequence: Range<usize>,
}
impl ReferenceReader {
pub fn get_name(&self) -> &str {
&self.name
}
pub async fn get_seq_exact_unmasked(
&self,
query: GenomeRegion,
) -> anyhow::Result<ImmutableSequence> {
let contig = query.contig();
let reader = self.inner.lock().await;
let seq_len =
tokio::task::block_in_place(|| reader.fetch_seq_len(contig.as_str().unwrap()));
let seq_len = usize::try_from(seq_len)?;
let this_contig =
GenomeRegion::new_bounded(GenomePosition::new_0(contig.clone(), 0), seq_len);
if !this_contig.contains(&query) {
anyhow::bail!("Coordinates out of bounds: {query}");
}
let mut seq = tokio::task::block_in_place(|| {
reader.fetch_seq(
query.contig().as_str().unwrap(),
query.start().position_0(),
query.end_excl().unwrap().position_0() - 1,
)
})?;
seq.make_ascii_uppercase();
Ok(seq.into())
}
pub async fn get_seq(
&self,
mut query: GenomeRegion,
padding_left: usize,
padding_right: usize,
) -> anyhow::Result<Option<ReferenceQueryResult>> {
let contig = query.contig();
let reader = self.inner.lock().await;
let seq_len =
tokio::task::block_in_place(|| reader.fetch_seq_len(contig.as_str().unwrap()));
let seq_len = usize::try_from(seq_len)?;
let this_contig =
GenomeRegion::new_bounded(GenomePosition::new_0(contig.clone(), 0), seq_len);
if !this_contig.contains(&query) {
anyhow::bail!("Coordinates out of bounds: {query}");
}
query = query.intersection(&this_contig).unwrap();
let reader_query = {
let reader_query_start =
query.start().clone() - padding_left.min(query.start().position_0());
let reader_query_end = query.end_excl().map(|end| end + padding_right);
let mut reader_query =
GenomeRegion::from_incl_excl(reader_query_start, reader_query_end)?;
reader_query = reader_query.intersection(&this_contig).unwrap(); reader_query
};
let mut seq = tokio::task::block_in_place(|| {
reader.fetch_seq(
reader_query.contig().as_str().unwrap(),
reader_query.start().position_0(),
reader_query.end_excl().unwrap().position_0() - 1,
)
})?;
if self.soft_mask {
let result = Self::exclude_lowercase(seq, &reader_query, &query);
Ok(result)
} else {
seq.make_ascii_uppercase();
let rs = reader_query.start().abs_diff(query.start())
..reader_query.start().abs_diff(&query.end_excl().unwrap());
Ok(Some(ReferenceQueryResult {
region: reader_query,
sequence: seq.into(),
range_in_sequence: rs,
}))
}
}
fn exclude_lowercase(
mut sequence: MutableSequence,
sequence_coordinates: &GenomeRegion,
query_coordinates: &GenomeRegion,
) -> Option<ReferenceQueryResult> {
let to_offset =
|pos: &GenomePosition| pos.position_0() - sequence_coordinates.start().position_0();
let (query_start_offset, query_end_excl_offset) = (
to_offset(query_coordinates.start()),
to_offset(&query_coordinates.end_excl().unwrap()),
);
let ok = sequence[query_start_offset..query_end_excl_offset]
.iter()
.all(u8::is_ascii_uppercase);
if ok {
sequence.make_ascii_uppercase();
Some(ReferenceQueryResult {
region: sequence_coordinates.clone(),
sequence: sequence.into(),
range_in_sequence: to_offset(query_coordinates.start())
..to_offset(&query_coordinates.end_excl().unwrap()),
})
} else {
None
}
}
}
impl TryFrom<&CliReferenceArg> for ReferenceReader {
type Error = rust_htslib::errors::Error;
fn try_from(value: &CliReferenceArg) -> Result<Self, Self::Error> {
let path = PathBuf::from((&value.file).as_ref());
let inner = faidx::Reader::from_path(&path)?;
let name = path
.file_name()
.and_then(|str| str.to_str())
.unwrap_or("<unknown>")
.to_string();
Ok(Self {
name,
inner: inner.into(),
soft_mask: value.soft_mask,
})
}
}
#[derive(clap::Args, Clone, Debug)]
pub struct CliReferenceArg {
#[command(flatten)]
pub file: CliReferenceFileArg,
#[arg(short = 's', long = "soft-mask")]
pub soft_mask: bool,
}
#[derive(clap::Args, Clone, Debug)]
#[group(multiple = false, required = true)]
pub struct CliReferenceFileArg {
pub reference: Option<String>,
#[arg(long = "reference", value_name = "FILE")]
pub reference_arg: Option<String>,
}
impl AsRef<Path> for &CliReferenceFileArg {
fn as_ref(&self) -> &Path {
let path = match (&self.reference_arg, &self.reference) {
(None, None) | (Some(_), Some(_)) => unreachable!(),
(None, Some(path)) | (Some(path), None) => path,
};
path.as_ref()
}
}
impl<S: AsRef<str>> From<S> for CliReferenceArg {
fn from(value: S) -> Self {
Self {
file: CliReferenceFileArg {
reference: Some(value.as_ref().to_string()),
reference_arg: None,
},
soft_mask: false,
}
}
}
impl Default for CliReferenceArg {
fn default() -> Self {
Self {
file: CliReferenceFileArg {
reference: Some("reference.fa".to_string()),
reference_arg: None,
},
soft_mask: false,
}
}
}