use std::path::PathBuf;
use anyhow::{Context, Result};
use log::info;
use crate::crf::backend::CrfSuiteModel;
use crate::crf::ClusterCRF;
use crate::data_dir;
use crate::hmmer::{self, PyHMMER, HMM, LoadedHMM};
use crate::interpro::InterPro;
use crate::model::{Cluster, Gene};
use crate::orf::{ProdigalFinder, SeqRecord, ORFFinder};
use crate::refine::ClusterRefiner;
pub struct GeccoBuilder {
data_dir: Option<PathBuf>,
crf_model_path: Option<PathBuf>,
hmm_configs: Vec<HMM>,
hmm_paths: Vec<PathBuf>,
interpro_path: Option<PathBuf>,
metagenome: bool,
mask: bool,
jobs: usize,
p_filter: f64,
e_filter: Option<f64>,
disentangle: bool,
threshold: f64,
n_cds: usize,
edge_distance: usize,
trim: bool,
pad: bool,
window_size: usize,
feature_type: String,
}
impl Default for GeccoBuilder {
fn default() -> Self {
Self {
data_dir: None,
crf_model_path: None,
hmm_configs: Vec::new(),
hmm_paths: Vec::new(),
interpro_path: None,
metagenome: true,
mask: false,
jobs: 0,
p_filter: 1e-9,
e_filter: None,
disentangle: false,
threshold: 0.8,
n_cds: 3,
edge_distance: 0,
trim: true,
pad: true,
window_size: 5,
feature_type: "protein".to_string(),
}
}
}
impl GeccoBuilder {
pub fn data_dir(mut self, path: impl Into<PathBuf>) -> Self {
self.data_dir = Some(path.into());
self
}
pub fn crf_model(mut self, path: impl Into<PathBuf>) -> Self {
self.crf_model_path = Some(path.into());
self
}
pub fn hmm(mut self, path: impl Into<PathBuf>) -> Self {
self.hmm_paths.push(path.into());
self
}
pub fn hmm_config(mut self, hmm: HMM) -> Self {
self.hmm_configs.push(hmm);
self
}
pub fn interpro(mut self, path: impl Into<PathBuf>) -> Self {
self.interpro_path = Some(path.into());
self
}
pub fn metagenome(mut self, v: bool) -> Self {
self.metagenome = v;
self
}
pub fn mask(mut self, v: bool) -> Self {
self.mask = v;
self
}
pub fn jobs(mut self, v: usize) -> Self {
self.jobs = v;
self
}
pub fn p_filter(mut self, v: f64) -> Self {
self.p_filter = v;
self
}
pub fn e_filter(mut self, v: f64) -> Self {
self.e_filter = Some(v);
self
}
pub fn disentangle(mut self, v: bool) -> Self {
self.disentangle = v;
self
}
pub fn threshold(mut self, v: f64) -> Self {
self.threshold = v;
self
}
pub fn n_cds(mut self, v: usize) -> Self {
self.n_cds = v;
self
}
pub fn edge_distance(mut self, v: usize) -> Self {
self.edge_distance = v;
self
}
pub fn trim(mut self, v: bool) -> Self {
self.trim = v;
self
}
pub fn pad(mut self, v: bool) -> Self {
self.pad = v;
self
}
pub fn window_size(mut self, v: usize) -> Self {
self.window_size = v;
self
}
pub fn feature_type(mut self, v: impl Into<String>) -> Self {
self.feature_type = v.into();
self
}
pub fn build(self) -> Result<Gecco> {
let resolved_data_dir = data_dir::resolve(self.data_dir.as_ref());
let crf_model_path = self.crf_model_path.unwrap_or_else(|| {
data_dir::crf_model_path(&resolved_data_dir)
});
let crf_model = CrfSuiteModel::from_file(&crf_model_path)
.with_context(|| format!("loading CRF model from {:?}", crf_model_path))?;
let mut hmms = self.hmm_configs;
for (i, path) in self.hmm_paths.iter().enumerate() {
hmms.push(HMM {
id: format!("HMM{}", i),
version: String::new(),
url: String::new(),
path: path.clone(),
size: None,
relabel_with: None,
md5: None,
});
}
if hmms.is_empty() {
let default_hmm = data_dir::hmm_path(&resolved_data_dir);
if default_hmm.exists() {
hmms.push(HMM {
id: "Pfam".to_string(),
version: String::new(),
url: String::new(),
path: default_hmm,
size: None,
relabel_with: Some("s/(PF\\d+).\\d+/\\1/".to_string()),
md5: None,
});
}
}
let interpro = match &self.interpro_path {
Some(path) => {
let data = std::fs::read(path)
.with_context(|| format!("reading InterPro JSON from {:?}", path))?;
InterPro::from_json(&data)?
}
None => {
let default_path = data_dir::interpro_path(&resolved_data_dir);
if default_path.exists() {
let data = std::fs::read(&default_path)
.with_context(|| format!("reading InterPro JSON from {:?}", default_path))?;
InterPro::from_json(&data)?
} else {
InterPro::from_json(b"[]")?
}
}
};
let mut loaded_hmms = Vec::with_capacity(hmms.len());
for hmm_config in hmms {
let annotator = PyHMMER::new(hmm_config.clone());
let profiles = annotator.load_hmms()
.with_context(|| format!("loading HMM profiles from {:?}", hmm_config.path))?;
loaded_hmms.push(LoadedHMM { meta: hmm_config, profiles });
}
Ok(Gecco {
crf_model,
hmms: loaded_hmms,
interpro,
metagenome: self.metagenome,
mask: self.mask,
jobs: self.jobs,
p_filter: self.p_filter,
e_filter: self.e_filter,
disentangle: self.disentangle,
threshold: self.threshold,
n_cds: self.n_cds,
edge_distance: self.edge_distance,
trim: self.trim,
pad: self.pad,
window_size: self.window_size,
feature_type: self.feature_type,
})
}
}
pub struct Gecco {
crf_model: CrfSuiteModel,
hmms: Vec<LoadedHMM>,
interpro: InterPro,
metagenome: bool,
mask: bool,
jobs: usize,
p_filter: f64,
e_filter: Option<f64>,
disentangle: bool,
threshold: f64,
n_cds: usize,
edge_distance: usize,
trim: bool,
pad: bool,
window_size: usize,
feature_type: String,
}
impl Gecco {
pub fn builder() -> GeccoBuilder {
GeccoBuilder::default()
}
pub fn scan(&self, records: &[SeqRecord]) -> Result<Vec<Cluster>> {
let mut genes = self.find_genes(records)?;
if genes.is_empty() {
return Ok(Vec::new());
}
self.annotate_domains(&mut genes)?;
let genes = self.predict_probabilities(&genes)?;
Ok(self.extract_clusters(&genes))
}
pub fn scan_detailed(&self, records: &[SeqRecord]) -> Result<(Vec<Gene>, Vec<Cluster>)> {
let mut genes = self.find_genes(records)?;
if genes.is_empty() {
return Ok((Vec::new(), Vec::new()));
}
self.annotate_domains(&mut genes)?;
let genes = self.predict_probabilities(&genes)?;
let clusters = self.extract_clusters(&genes);
Ok((genes, clusters))
}
pub fn predict_from_genes(&self, genes: &[Gene]) -> Result<(Vec<Gene>, Vec<Cluster>)> {
let genes = self.predict_probabilities(genes)?;
let clusters = self.extract_clusters(&genes);
Ok((genes, clusters))
}
pub fn find_genes(&self, records: &[SeqRecord]) -> Result<Vec<Gene>> {
info!("Finding genes with Prodigal ({} sequences)", records.len());
let finder = ProdigalFinder {
metagenome: self.metagenome,
mask: self.mask,
cpus: self.jobs,
..Default::default()
};
let genes = finder.find_genes(records)?;
info!("Found {} genes", genes.len());
Ok(genes)
}
pub fn annotate_domains(&self, genes: &mut Vec<Gene>) -> Result<()> {
info!("Annotating protein domains ({} HMM databases)", self.hmms.len());
for loaded in &self.hmms {
let annotator = PyHMMER::new(loaded.meta.clone());
annotator.run_with_profiles(genes, &self.interpro, &loaded.profiles, None)?;
}
if self.disentangle {
for gene in genes.iter_mut() {
hmmer::disentangle(gene);
}
}
if let Some(e) = self.e_filter {
hmmer::filter_by_evalue(genes, e);
}
hmmer::filter_by_pvalue(genes, self.p_filter);
genes.sort_by(|a, b| {
a.source_id
.cmp(&b.source_id)
.then(a.start.cmp(&b.start))
});
let domain_count: usize = genes.iter().map(|g| g.protein.domains.len()).sum();
info!("Found {} domains across all proteins", domain_count);
Ok(())
}
pub fn predict_probabilities(&self, genes: &[Gene]) -> Result<Vec<Gene>> {
info!("Predicting cluster probabilities");
let mut crf = ClusterCRF::new(&self.feature_type, self.window_size, 1);
crf.set_model(Box::new(self.crf_model.clone()));
crf.predict_probabilities(genes, self.pad, None)
}
pub fn extract_clusters(&self, genes: &[Gene]) -> Vec<Cluster> {
info!("Extracting clusters");
let refiner = ClusterRefiner {
threshold: self.threshold,
n_cds: self.n_cds,
edge_distance: self.edge_distance,
trim: self.trim,
..Default::default()
};
let clusters = refiner.iter_clusters(genes);
info!("Found {} cluster(s)", clusters.len());
clusters
}
}