use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use crate::structs::Profile;
use anyhow::{Context, Result};
use lazy_static::lazy_static;
use regex::Regex;
use thiserror::Error;
/// Indices into a model node's 7-element state-transition row
/// (the inner `Vec<f32>` of `Model::transition_probabilities`).
pub mod constants {
    /// Match state at node i to match state at node i+1.
    pub const HMM_MATCH_TO_MATCH: usize = 0;
    /// Match state to insert state at the same node.
    pub const HMM_MATCH_TO_INSERT: usize = 1;
    /// Match state at node i to delete state at node i+1.
    pub const HMM_MATCH_TO_DELETE: usize = 2;
    /// Insert state at node i to match state at node i+1.
    pub const HMM_INSERT_TO_MATCH: usize = 3;
    /// Insert state self-loop.
    pub const HMM_INSERT_TO_INSERT: usize = 4;
    /// Delete state at node i to match state at node i+1.
    pub const HMM_DELETE_TO_MATCH: usize = 5;
    /// Delete state at node i to delete state at node i+1.
    pub const HMM_DELETE_TO_DELETE: usize = 6;
}
// Line-leading flag tokens of the HMMER3/f ".hmm" file format.
// A model starts with the format line, then header key/value lines,
// then the "HMM" line that opens the model body, and ends with "//".

// Format/version line that begins every model.
const P7_HEADER_FORMAT_FLAG: &str = "HMMER3/f";
const P7_HEADER_NAME_FLAG: &str = "NAME";
const P7_HEADER_ACCESSION_FLAG: &str = "ACC";
const P7_HEADER_DESCRIPTION_FLAG: &str = "DESC";
const P7_HEADER_LENGTH_FLAG: &str = "LENG";
const P7_HEADER_MAXL_FLAG: &str = "MAXL";
const P7_HEADER_ALPHABET_FLAG: &str = "ALPH";
// yes/no annotation-presence flags.
const P7_HEADER_REFERENCE_FLAG: &str = "RF";
const P7_HEADER_MASK_FLAG: &str = "MM";
const P7_HEADER_CONSENSUS_RESIDUE_FLAG: &str = "CONS";
const P7_HEADER_CONSENSUS_STRUCTURE_FLAG: &str = "CS";
const P7_HEADER_MAP_FLAG: &str = "MAP";
const P7_HEADER_DATE_FLAG: &str = "DATE";
const P7_HEADER_COMMAND_FLAG: &str = "COM";
const P7_HEADER_NSEQ_FLAG: &str = "NSEQ";
const P7_HEADER_EFFN_FLAG: &str = "EFFN";
const P7_HEADER_CHECKSUM_FLAG: &str = "CKSUM";
// Pfam-style score cutoff pairs (two floats each).
const P7_HEADER_GATHERING_FLAG: &str = "GA";
const P7_HEADER_TRUSTED_FLAG: &str = "TC";
const P7_HEADER_NOISE_FLAG: &str = "NC";
// "STATS LOCAL <MSV|VITERBI|FORWARD> <mu/tau> <lambda>" lines.
const P7_HEADER_STATS_FLAG: &str = "STATS";
const P7_HEADER_STATS_MSV_FLAG: &str = "MSV";
const P7_HEADER_STATS_VITERBI_FLAG: &str = "VITERBI";
const P7_HEADER_STATS_FORWARD_FLAG: &str = "FORWARD";
// Body delimiters: "HMM" opens the model section, "//" closes a model.
const P7_BODY_HMM_MODEL_START_FLAG: &str = "HMM";
const P7_BODY_COMPO_FLAG: &str = "COMPO";
const P7_BODY_END_FLAG: &str = "//";
lazy_static! {
    /// Matches a float-like substring of a token: an optional minus sign,
    /// an integer part, and an optional fractional part.
    ///
    /// The previous pattern (`-*\d\.*\d*`) matched only a single digit
    /// before the decimal point, so a token like "12.34" matched as "12"
    /// and silently lost its fraction; it also accepted repeated minus
    /// signs and repeated dots, which then failed the `parse::<f32>` step.
    static ref FLOAT_RE: Regex = Regex::new(r"-?\d+\.?\d*").unwrap();
}
/// Top-level state of the file parser.
enum ParserState {
    /// Before a model (or between models): waiting for a "HMMER3/f" line.
    Idle,
    /// Inside a model's header key/value lines, until the "HMM" line.
    Header,
    /// The node-0 section of the body: the COMPO line, node-0 insert
    /// emissions, and node-0 state transitions.
    ModelHead,
    /// The per-node body lines, until the terminating "//".
    ModelBody,
}
/// Which of the three lines of a model node the body parser expects next.
/// Each node is written as a match-emission line, an insert-emission line,
/// and a state-transition line, in that order.
enum ModelParserState {
    MatchEmissions,
    InsertEmissions,
    StateTransitions,
}
/// Raised when a header line begins with an unrecognized flag token.
#[derive(Error, Debug)]
#[error("unknown header flag")]
struct UnknownHeaderFlagError;
/// Raised when a line has fewer whitespace-separated tokens than expected.
#[derive(Error, Debug)]
#[error("token index out of bounds")]
struct TokenIndexError;
/// Raised when `FLOAT_RE` finds no float-like substring in a token.
#[derive(Error, Debug)]
#[error("unable to find a float-like substring")]
struct FloatRegexError;
/// The residue alphabet declared by a model's "ALPH" header line.
#[derive(Default, Clone)]
pub enum Alphabet {
    Amino,
    Dna,
    Rna,
    /// No "ALPH" line has been parsed yet.
    #[default]
    AlphabetNotSet,
}
/// Key/value metadata parsed from a model's header section
/// (the lines between "HMMER3/f" and "HMM").
#[derive(Default)]
pub struct Header {
    /// NAME line.
    pub name: String,
    /// The remainder of the "HMMER3/f" format line.
    pub version: String,
    /// ACC line.
    pub accession_number: String,
    /// DESC line.
    pub description: String,
    /// DATE line, joined verbatim.
    pub date: String,
    /// COM line(s), joined verbatim.
    pub command_line_history: String,
    /// RF line: "yes"/"no".
    pub has_reference_annotation: bool,
    /// MM line: "yes"/"no".
    pub has_model_mask: bool,
    /// CONS line: "yes"/"no".
    pub has_consensus_residue: bool,
    /// CS line: "yes"/"no".
    pub has_consensus_structure: bool,
    /// MAP line: "yes"/"no".
    pub has_map_annotation: bool,
    /// LENG line: number of model nodes.
    pub model_length: usize,
    /// MAXL line.
    pub max_length: usize,
    /// CKSUM line.
    pub checksum: usize,
    /// NSEQ line.
    pub num_sequences: usize,
    /// EFFN line.
    pub effective_num_sequences: f32,
    /// GA line (two values).
    pub gathering_thresholds: [f32; 2],
    /// TC line (two values).
    pub trusted_cutoffs: [f32; 2],
    /// NC line (two values).
    pub noise_cutoffs: [f32; 2],
    /// ALPH line.
    pub alphabet: Alphabet,
}
/// E-value statistics parsed from the "STATS LOCAL ..." header lines.
/// NOTE(review): "gumble" is a misspelling of "Gumbel" retained because
/// these field names are part of the public interface.
#[derive(Default)]
pub struct Stats {
    pub msv_gumble_mu: f32,
    pub msv_gumble_lambda: f32,
    pub viterbi_gumble_mu: f32,
    pub viterbi_gumble_lambda: f32,
    pub forward_tau: f32,
    pub forward_lambda: f32,
}
/// The model body: per-node probabilities and annotations.
///
/// The probability vectors are indexed by node number; index 0 is a
/// zero-filled placeholder row pushed by `Hmm::new`, so after parsing
/// each vector has `model_length + 1` entries.
#[derive(Default)]
pub struct Model {
    /// Background residue composition from the COMPO line.
    pub composition: Vec<f32>,
    /// Per-node match-state emission probabilities.
    pub match_probabilities: Vec<Vec<f32>>,
    /// Per-node insert-state emission probabilities.
    pub insert_probabilities: Vec<Vec<f32>>,
    /// Per-node 7-element transition rows; indexed via `constants::HMM_*`.
    pub transition_probabilities: Vec<Vec<f32>>,
    pub map_annotations: Vec<usize>,
    pub consensus_residues: String,
    pub reference_annotation: String,
    pub model_mask: Vec<bool>,
    pub consensus_structure: String,
}
/// One profile HMM parsed from a HMMER3/f file: header metadata,
/// E-value statistics, and the model body.
#[derive(Default)]
pub struct Hmm {
    pub header: Header,
    pub stats: Stats,
    pub model: Model,
}
impl Hmm {
    /// Builds an empty `Hmm` whose model vectors are seeded with a zeroed
    /// node-0 row: one zero emission row each for match and insert states,
    /// and one zeroed 7-element transition row. Parsed nodes are then
    /// pushed starting at index 1, keeping vectors node-indexed.
    pub fn new() -> Self {
        let mut hmm = Hmm::default();
        let zero_emissions = vec![0.0; Profile::MAX_DEGENERATE_ALPHABET_SIZE];
        let model = &mut hmm.model;
        model.match_probabilities.push(zero_emissions.clone());
        model.insert_probabilities.push(zero_emissions);
        model.transition_probabilities.push(vec![0.0f32; 7]);
        hmm
    }
}
pub fn parse_hmms_from_p7hmm_file<P: AsRef<Path>>(path: P) -> Result<Vec<Hmm>> {
let phmm_file = File::open(&path)?;
let mut phmm_lines = BufReader::new(phmm_file).lines();
let mut line_number: usize = 0;
let mut current_model_line_number: usize = 0;
let mut parser_state = ParserState::Idle;
let mut body_parser_state = ModelParserState::MatchEmissions;
let mut hmm_list: Vec<Hmm> = vec![Hmm::new()];
let mut current_hmm = hmm_list.get_mut(0).unwrap();
let mut hmm_count: usize = 0;
while let Some(Ok(line)) = phmm_lines.next() {
let tokens: Vec<&str> = line.split_whitespace().collect();
let flag: &str = get_token_as_str(&tokens, 0)?;
line_number += 1;
if line.trim().is_empty() {
continue;
}
match parser_state {
ParserState::Idle => match flag {
P7_HEADER_FORMAT_FLAG => {
if hmm_count > 0 {
hmm_list.push(Hmm::new());
current_hmm = hmm_list.last_mut().unwrap();
}
current_hmm.header.version =
get_joined_tokens(&tokens, 1).with_context(|| "")?;
parser_state = ParserState::Header;
}
_ => panic!("unknown flag: {}", flag),
},
ParserState::Header => {
let error_context = || {
format!(
"failed to parse p7hmm file header: {}\n on line: {}\n with flag: {}",
&path.as_ref().to_string_lossy(),
line_number,
flag
)
};
match flag {
P7_HEADER_NAME_FLAG => {
current_hmm.header.name =
get_token_as_string(&tokens, 1).with_context(error_context)?;
}
P7_HEADER_ACCESSION_FLAG => {
current_hmm.header.accession_number =
get_token_as_string(&tokens, 1).with_context(error_context)?;
}
P7_HEADER_DESCRIPTION_FLAG => {
current_hmm.header.description =
get_token_as_string(&tokens, 1).with_context(error_context)?;
}
P7_HEADER_LENGTH_FLAG => {
current_hmm.header.model_length =
get_token_as_usize(&tokens, 1).with_context(error_context)?;
}
P7_HEADER_MAXL_FLAG => {
current_hmm.header.max_length =
get_token_as_usize(&tokens, 1).with_context(error_context)?;
}
P7_HEADER_ALPHABET_FLAG => {
let value = get_token_as_str(&tokens, 1).with_context(error_context)?;
current_hmm.header.alphabet = match value {
"amino" => Alphabet::Amino,
"dna" => Alphabet::Dna,
"rna" => Alphabet::Rna,
_ => panic!("unknown alphabet"),
}
}
P7_HEADER_REFERENCE_FLAG => {
let value = get_token_as_str(&tokens, 1).with_context(error_context)?;
current_hmm.header.has_reference_annotation = match value {
"yes" => true,
"no" => false,
_ => panic!("unknown reference annotation value"),
}
}
P7_HEADER_MASK_FLAG => {
let value = get_token_as_str(&tokens, 1).with_context(error_context)?;
current_hmm.header.has_model_mask = match value {
"yes" => true,
"no" => false,
_ => panic!("unknown model mask value"),
}
}
P7_HEADER_CONSENSUS_RESIDUE_FLAG => {
let value = get_token_as_str(&tokens, 1).with_context(error_context)?;
current_hmm.header.has_consensus_residue = match value {
"yes" => true,
"no" => false,
_ => panic!("unknown consensus residue value"),
}
}
P7_HEADER_CONSENSUS_STRUCTURE_FLAG => {
let value = get_token_as_str(&tokens, 1).with_context(error_context)?;
current_hmm.header.has_consensus_structure = match value {
"yes" => true,
"no" => false,
_ => panic!("unknown consensus structure value"),
}
}
P7_HEADER_MAP_FLAG => {
let value = get_token_as_str(&tokens, 1).with_context(error_context)?;
current_hmm.header.has_map_annotation = match value {
"yes" => true,
"no" => false,
_ => panic!("unknown map annotation value"),
}
}
P7_HEADER_DATE_FLAG => {
current_hmm.header.date =
get_joined_tokens(&tokens, 1).with_context(error_context)?;
}
P7_HEADER_COMMAND_FLAG => {
current_hmm.header.command_line_history =
get_joined_tokens(&tokens, 1).with_context(error_context)?;
}
P7_HEADER_NSEQ_FLAG => {
current_hmm.header.num_sequences =
get_token_as_usize(&tokens, 1).with_context(error_context)?;
}
P7_HEADER_EFFN_FLAG => {
current_hmm.header.effective_num_sequences =
get_token_as_f32(&tokens, 1).with_context(error_context)?;
}
P7_HEADER_CHECKSUM_FLAG => {
current_hmm.header.checksum =
get_token_as_usize(&tokens, 1).with_context(error_context)?;
}
P7_HEADER_GATHERING_FLAG => {
current_hmm.header.gathering_thresholds = [
get_token_as_f32(&tokens, 1).with_context(error_context)?,
get_token_as_f32(&tokens, 2).with_context(error_context)?,
]
}
P7_HEADER_TRUSTED_FLAG => {
current_hmm.header.trusted_cutoffs = [
get_token_as_f32(&tokens, 1).with_context(error_context)?,
get_token_as_f32(&tokens, 2).with_context(error_context)?,
]
}
P7_HEADER_NOISE_FLAG => {
current_hmm.header.noise_cutoffs = [
get_token_as_f32(&tokens, 1).with_context(error_context)?,
get_token_as_f32(&tokens, 2).with_context(error_context)?,
]
}
P7_HEADER_STATS_FLAG => {
let mu_or_tau = get_token_as_f32(&tokens, 3).with_context(error_context)?;
let lambda = get_token_as_f32(&tokens, 4).with_context(error_context)?;
let sub_flag = get_token_as_str(&tokens, 1).with_context(error_context)?;
match sub_flag {
"LOCAL" => {
let sub_sub_flag =
get_token_as_str(&tokens, 2).with_context(error_context)?;
match sub_sub_flag {
P7_HEADER_STATS_MSV_FLAG => {
current_hmm.stats.msv_gumble_mu = mu_or_tau;
current_hmm.stats.msv_gumble_lambda = lambda;
}
P7_HEADER_STATS_VITERBI_FLAG => {
current_hmm.stats.viterbi_gumble_mu = mu_or_tau;
current_hmm.stats.viterbi_gumble_lambda = lambda;
}
P7_HEADER_STATS_FORWARD_FLAG => {
current_hmm.stats.forward_tau = mu_or_tau;
current_hmm.stats.forward_lambda = lambda;
}
_ => panic!("unknown header stats value (2)"),
}
}
_ => panic!("unknown stats value (1)"),
};
}
P7_BODY_HMM_MODEL_START_FLAG => {
parser_state = ParserState::ModelHead;
}
_ => {
}
}
}
ParserState::ModelHead => {
let error_context = || {
format!(
"failed to parse p7hmm file: {}\n on line: {}\n",
&path.as_ref().to_string_lossy(),
line_number
)
};
match body_parser_state {
ModelParserState::MatchEmissions => match flag {
P7_BODY_COMPO_FLAG => {
current_hmm.model.composition =
get_tokens_as_probability_vec(&tokens, 1, 21)
.with_context(error_context)?;
body_parser_state = ModelParserState::InsertEmissions;
}
_ => {
}
},
ModelParserState::InsertEmissions => {
current_hmm.model.insert_probabilities[0] =
get_tokens_as_probability_vec(&tokens, 0, 20)
.with_context(error_context)?;
body_parser_state = ModelParserState::StateTransitions;
}
ModelParserState::StateTransitions => {
current_hmm.model.transition_probabilities[0] =
get_tokens_as_probability_vec(&tokens, 0, 7)
.with_context(error_context)?;
current_model_line_number = 1;
parser_state = ParserState::ModelBody;
body_parser_state = ModelParserState::MatchEmissions;
}
}
}
ParserState::ModelBody => {
let error_context = || {
format!(
"failed to parse p7hmm file body: {}\n on line: {}\n",
&path.as_ref().to_string_lossy(),
line_number,
)
};
match flag {
P7_BODY_END_FLAG => {
assert_eq!(
current_hmm.model.match_probabilities.len(),
current_hmm.header.model_length + 1
);
assert_eq!(
current_hmm.model.insert_probabilities.len(),
current_hmm.header.model_length + 1
);
hmm_count += 1;
parser_state = ParserState::Idle;
}
_ => match body_parser_state {
ModelParserState::MatchEmissions => {
let line_number_flag = current_model_line_number.to_string();
if flag == line_number_flag {
current_hmm.model.match_probabilities.push(
get_tokens_as_probability_vec(&tokens, 1, 21)
.with_context(error_context)?,
);
body_parser_state = ModelParserState::InsertEmissions;
} else {
println!("{}", line);
panic!("mismatched line number");
}
}
ModelParserState::InsertEmissions => {
current_hmm.model.insert_probabilities.push(
get_tokens_as_probability_vec(&tokens, 0, 20)
.with_context(error_context)?,
);
body_parser_state = ModelParserState::StateTransitions;
}
ModelParserState::StateTransitions => {
current_hmm.model.transition_probabilities.push(
get_tokens_as_probability_vec(&tokens, 0, 7)
.with_context(error_context)?,
);
current_model_line_number += 1;
body_parser_state = ModelParserState::MatchEmissions;
}
},
}
}
}
}
Ok(hmm_list)
}
/// Returns `TokenIndexError` unless `idx` is a valid index into `tokens`.
///
/// The previous condition (`tokens.len() + 1 < idx`) was wrong: e.g. with
/// one token and `idx == 1` it passed (`2 < 1` is false), letting callers
/// index out of bounds and panic instead of reporting a parse error.
fn token_index_check(tokens: &Vec<&str>, idx: usize) -> Result<()> {
    if idx >= tokens.len() {
        return Err(TokenIndexError.into());
    }
    Ok(())
}
fn get_token_as_str<'a>(tokens: &'a Vec<&str>, idx: usize) -> Result<&'a str> {
token_index_check(tokens, idx)?;
Ok(tokens[idx])
}
fn get_token_as_string(tokens: &Vec<&str>, idx: usize) -> Result<String> {
token_index_check(tokens, idx)?;
Ok(String::from(tokens[idx]))
}
fn get_joined_tokens(tokens: &Vec<&str>, idx: usize) -> Result<String> {
token_index_check(tokens, idx)?;
Ok(tokens[idx..].join(" "))
}
fn get_token_as_f32(tokens: &Vec<&str>, idx: usize) -> Result<f32> {
token_index_check(tokens, idx)?;
if tokens[idx] == "*" {
return Ok(0.0);
}
let float_str = match FLOAT_RE.find(tokens[idx]) {
Some(str) => str,
None => {
return Err(FloatRegexError)
.with_context(|| format!("failed to parse token \"{}\" as f32", tokens[idx]));
}
};
float_str
.as_str()
.parse::<f32>()
.with_context(|| format!("failed to parse token \"{}\" as f32", tokens[idx]))
}
fn get_token_as_usize(tokens: &Vec<&str>, idx: usize) -> Result<usize> {
token_index_check(tokens, idx)?;
tokens[idx]
.parse::<usize>()
.with_context(|| format!("failed to parse token \"{}\" as f32", tokens[idx]))
}
/// Reads the token at `idx` as a negative natural-log probability and
/// converts it to a probability via exp(-x).
///
/// HMMER3 writes "*" for an infinite negative-log value, i.e. a
/// probability of exactly zero. The previous code routed "*" through
/// `get_token_as_f32` (which maps it to 0.0) and so returned
/// exp(-0.0) == 1.0 — the opposite end of the probability range.
fn get_token_as_probability(tokens: &Vec<&str>, idx: usize) -> Result<f32> {
    if let Some(&"*") = tokens.get(idx) {
        return Ok(0.0);
    }
    let neg_log_prob = get_token_as_f32(tokens, idx)?;
    Ok((-neg_log_prob).exp())
}
/// Converts the tokens in the half-open index range `start..end` into a
/// vector of probabilities (each token parsed as a negative log
/// probability and exponentiated). Fails on the first bad token.
fn get_tokens_as_probability_vec(tokens: &Vec<&str>, start: usize, end: usize) -> Result<Vec<f32>> {
    (start..end)
        .map(|idx| get_token_as_probability(tokens, idx))
        .collect()
}