twitcher 0.1.8

Find template switch mutations in genomic data
use std::{
    ops::Range,
    path::{Path, PathBuf},
};

use rust_htslib::faidx;
use tokio::sync::Mutex;

use crate::common::{
    ImmutableSequence, MutableSequence,
    coords::{GenomePosition, GenomeRegion},
};

pub struct ReferenceReader {
    name: String,
    inner: Mutex<faidx::Reader>,
    soft_mask: bool,
}

#[derive(Debug)]
pub struct ReferenceQueryResult {
    pub region: GenomeRegion,
    pub sequence: ImmutableSequence,
    pub range_in_sequence: Range<usize>,
}

impl ReferenceReader {
    pub fn get_name(&self) -> &str {
        &self.name
    }

    pub async fn get_seq_exact_unmasked(
        &self,
        query: GenomeRegion,
    ) -> anyhow::Result<ImmutableSequence> {
        let contig = query.contig();
        let reader = self.inner.lock().await;

        let seq_len =
            tokio::task::block_in_place(|| reader.fetch_seq_len(contig.as_str().unwrap()));
        let seq_len = usize::try_from(seq_len)?;
        let this_contig =
            GenomeRegion::new_bounded(GenomePosition::new_0(contig.clone(), 0), seq_len);

        if !this_contig.contains(&query) {
            anyhow::bail!("Coordinates out of bounds: {query}");
        }

        let mut seq = tokio::task::block_in_place(|| {
            reader.fetch_seq(
                query.contig().as_str().unwrap(),
                query.start().position_0(),
                query.end_excl().unwrap().position_0() - 1,
            )
        })?;

        seq.make_ascii_uppercase();
        Ok(seq.into())
    }

    pub async fn get_seq(
        &self,
        mut query: GenomeRegion,
        padding_left: usize,
        padding_right: usize,
    ) -> anyhow::Result<Option<ReferenceQueryResult>> {
        let contig = query.contig();
        let reader = self.inner.lock().await;
        let seq_len =
            tokio::task::block_in_place(|| reader.fetch_seq_len(contig.as_str().unwrap()));
        let seq_len = usize::try_from(seq_len)?;
        let this_contig =
            GenomeRegion::new_bounded(GenomePosition::new_0(contig.clone(), 0), seq_len);

        if !this_contig.contains(&query) {
            anyhow::bail!("Coordinates out of bounds: {query}");
        }

        query = query.intersection(&this_contig).unwrap();

        let reader_query = {
            let reader_query_start =
                query.start().clone() - padding_left.min(query.start().position_0());
            let reader_query_end = query.end_excl().map(|end| end + padding_right);
            let mut reader_query =
                GenomeRegion::from_incl_excl(reader_query_start, reader_query_end)?;
            reader_query = reader_query.intersection(&this_contig).unwrap(); // might cut off at the end
            reader_query
        };

        let mut seq = tokio::task::block_in_place(|| {
            reader.fetch_seq(
                reader_query.contig().as_str().unwrap(),
                reader_query.start().position_0(),
                reader_query.end_excl().unwrap().position_0() - 1,
            )
        })?;

        // Now we have query region and sequence, only need to check for soft_mask
        if self.soft_mask {
            let result = Self::exclude_lowercase(seq, &reader_query, &query);
            Ok(result)
        } else {
            seq.make_ascii_uppercase();
            let rs = reader_query.start().abs_diff(query.start())
                ..reader_query.start().abs_diff(&query.end_excl().unwrap());
            Ok(Some(ReferenceQueryResult {
                region: reader_query,
                sequence: seq.into(),
                range_in_sequence: rs,
            }))
        }
    }

    fn exclude_lowercase(
        mut sequence: MutableSequence,
        sequence_coordinates: &GenomeRegion,
        query_coordinates: &GenomeRegion,
    ) -> Option<ReferenceQueryResult> {
        let to_offset =
            |pos: &GenomePosition| pos.position_0() - sequence_coordinates.start().position_0();

        let (query_start_offset, query_end_excl_offset) = (
            to_offset(query_coordinates.start()),
            to_offset(&query_coordinates.end_excl().unwrap()),
        );

        let ok = sequence[query_start_offset..query_end_excl_offset]
            .iter()
            .all(u8::is_ascii_uppercase);

        if ok {
            sequence.make_ascii_uppercase();
            Some(ReferenceQueryResult {
                region: sequence_coordinates.clone(),
                sequence: sequence.into(),
                range_in_sequence: to_offset(query_coordinates.start())
                    ..to_offset(&query_coordinates.end_excl().unwrap()),
            })
        } else {
            None
        }
    }
}

impl TryFrom<&CliReferenceArg> for ReferenceReader {
    type Error = rust_htslib::errors::Error;

    fn try_from(value: &CliReferenceArg) -> Result<Self, Self::Error> {
        let path = PathBuf::from((&value.file).as_ref());
        let inner = faidx::Reader::from_path(&path)?;
        let name = path
            .file_name()
            .and_then(|str| str.to_str())
            .unwrap_or("<unknown>")
            .to_string();
        Ok(Self {
            name,
            inner: inner.into(),
            soft_mask: value.soft_mask,
        })
    }
}

#[derive(clap::Args, Clone, Debug)]
pub struct CliReferenceArg {
    #[command(flatten)]
    pub file: CliReferenceFileArg,

    /// When enabled, lowercase letters in the reference sequence will be excluded from any alignments.
    #[arg(short = 's', long = "soft-mask")]
    pub soft_mask: bool,
}

#[derive(clap::Args, Clone, Debug)]
#[group(multiple = false, required = true)]
pub struct CliReferenceFileArg {
    /// The FASTA file containing the reference as the second positional argument.
    pub reference: Option<String>,

    /// The reference may also be provided as an argument with --reference at any position.
    #[arg(long = "reference", value_name = "FILE")]
    pub reference_arg: Option<String>,
}

impl AsRef<Path> for &CliReferenceFileArg {
    fn as_ref(&self) -> &Path {
        let path = match (&self.reference_arg, &self.reference) {
            (None, None) | (Some(_), Some(_)) => unreachable!(),
            (None, Some(path)) | (Some(path), None) => path,
        };
        path.as_ref()
    }
}

impl<S: AsRef<str>> From<S> for CliReferenceArg {
    fn from(value: S) -> Self {
        Self {
            file: CliReferenceFileArg {
                reference: Some(value.as_ref().to_string()),
                reference_arg: None,
            },
            soft_mask: false,
        }
    }
}

impl Default for CliReferenceArg {
    fn default() -> Self {
        Self {
            file: CliReferenceFileArg {
                reference: Some("reference.fa".to_string()),
                reference_arg: None,
            },
            soft_mask: false,
        }
    }
}