use std::path::PathBuf;
use anyhow::{Context, Result};
use clap::Args;
use log::info;
use crate::data_dir;
use crate::hmmer::{self, DomainAnnotator, PyHMMER};
use crate::io::genbank;
use crate::io::tables::{FeatureTable, GeneTable};
use crate::orf::{ORFFinder, ProdigalFinder};
#[derive(Args)]
pub struct AnnotateArgs {
#[arg(short, long)]
pub genome: PathBuf,
#[arg(short, long, default_value = ".")]
pub output_dir: PathBuf,
#[arg(long)]
pub data_dir: Option<PathBuf>,
#[arg(short, long, default_value = "0")]
pub jobs: usize,
#[arg(short = 'M', long)]
pub mask: bool,
#[arg(long)]
pub cds_feature: Option<String>,
#[arg(long, default_value = "locus_tag")]
pub locus_tag: String,
#[arg(long)]
pub hmm: Vec<PathBuf>,
#[arg(short, long)]
pub e_filter: Option<f64>,
#[arg(short, long, default_value = "1e-9")]
pub p_filter: f64,
#[arg(long)]
pub disentangle: bool,
#[arg(long)]
pub force_tsv: bool,
}
impl AnnotateArgs {
pub fn execute(&self) -> Result<()> {
let base = self
.genome
.file_stem()
.unwrap_or_default()
.to_string_lossy()
.to_string();
std::fs::create_dir_all(&self.output_dir)?;
info!("Loading sequences from {:?}", self.genome);
let records = genbank::read_sequences(&self.genome)
.with_context(|| format!("loading sequences from {:?}", self.genome))?;
info!("Loaded {} sequence(s)", records.len());
info!("Finding genes");
let finder = ProdigalFinder {
metagenome: true,
mask: self.mask,
cpus: self.jobs,
..Default::default()
};
let mut genes = finder.find_genes(&records)?;
info!("Found {} genes", genes.len());
if genes.is_empty() {
log::warn!("No genes found");
if self.force_tsv {
GeneTable::write_from_genes(
std::fs::File::create(self.output_dir.join(format!("{}.genes.tsv", base)))?,
&[],
)?;
FeatureTable::write_from_genes(
std::fs::File::create(self.output_dir.join(format!("{}.features.tsv", base)))?,
&[],
)?;
}
return Ok(());
}
info!("Annotating protein domains");
let data_dir = data_dir::resolve(self.data_dir.as_ref());
let interpro = super::run::load_interpro(&data_dir)?;
let hmms = super::run::load_hmm_configs(&self.hmm, &data_dir)?;
for hmm_config in &hmms {
let annotator = PyHMMER::new(hmm_config.clone()).with_cpus(self.jobs);
annotator.run(&mut genes, &interpro, None)?;
}
let domain_count: usize = genes.iter().map(|g| g.protein.domains.len()).sum();
info!("Found {} domains", domain_count);
if self.disentangle {
for gene in &mut genes {
hmmer::disentangle(gene);
}
}
if let Some(e) = self.e_filter {
hmmer::filter_by_evalue(&mut genes, e);
}
hmmer::filter_by_pvalue(&mut genes, self.p_filter);
genes.sort_by(|a, b| a.source_id.cmp(&b.source_id).then(a.start.cmp(&b.start)));
info!("Writing results to {:?}", self.output_dir);
GeneTable::write_from_genes(
std::fs::File::create(self.output_dir.join(format!("{}.genes.tsv", base)))?,
&genes,
)?;
FeatureTable::write_from_genes(
std::fs::File::create(self.output_dir.join(format!("{}.features.tsv", base)))?,
&genes,
)?;
info!("Done");
Ok(())
}
}