use crate::error::{Error, Result};
use std::path::Path;
/// Identifiers for the HMM-related text formats this module deals with.
///
/// NOTE(review): nothing in the visible part of this file reads this enum;
/// format selection is done through `HmmParser::detect`. Confirm external use.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HmmFormat {
/// HMMER3 profile text (presumably the format handled by `Hmmer3Parser`).
Hmmer3,
/// Pfam ASCII / Stockholm-style text (presumably handled by `PfamParser`).
PfamAscii,
/// `hmmsearch` report text (presumably handled by `HmmSearchParser`).
HmmSearch,
/// InterPro-style text (presumably handled by `InterProParser`).
InterPro,
}
/// Format-independent description of an HMM profile; the common return type
/// of every `HmmParser::parse` implementation in this file.
#[derive(Debug, Clone)]
pub struct UniversalHmmProfile {
/// Profile name; the parsers in this file start from an empty string.
pub name: String,
/// Accession string, when the input carried an accession line.
pub accession: Option<String>,
/// Free-text description, when the input carried a description line.
pub description: Option<String>,
/// Profile length as declared by the input (`LENG`, `#=GF LEN` or `[M=..]`);
/// 0 when no such declaration was parsed.
pub length: usize,
/// Alphabet label. The parsers in this file default to "amino" or "protein";
/// only the HMMER3 parser overrides it (from an `ALPH` line).
pub alphabet: String,
/// Emission score table.
/// NOTE(review): no parser visible in this file fills it, so the row/column
/// layout is not established here.
pub emission_scores: Vec<Vec<f32>>,
/// Transition score table.
/// NOTE(review): no parser visible in this file fills it, so the row/column
/// layout is not established here.
pub transition_scores: Vec<Vec<f32>>,
/// Optional header metadata (thresholds, provenance).
pub meta: HmmMetadata,
}
/// Optional header metadata attached to a `UniversalHmmProfile`.
///
/// Every field defaults to `None`. In the visible part of this file only the
/// three thresholds are ever set, and only by `Hmmer3Parser`.
#[derive(Debug, Clone, Default)]
pub struct HmmMetadata {
/// First number on the `GA` header line (presumably the gathering threshold).
pub ga_threshold: Option<f32>,
/// First number on the `TC` header line (presumably the trusted cutoff).
pub tc_threshold: Option<f32>,
/// First number on the `NC` header line (presumably the noise cutoff).
pub nc_threshold: Option<f32>,
/// Author of the profile; not populated by any parser visible in this file.
pub author: Option<String>,
/// Source of the profile; not populated by any parser visible in this file.
pub source: Option<String>,
/// Date string; not populated by any parser visible in this file.
pub date: Option<String>,
/// Command line that produced the profile; not populated by any parser
/// visible in this file.
pub command_line: Option<String>,
}
/// Interface implemented by each format-specific parser.
///
/// `MultiFormatHmmParser` stores implementations as `Box<dyn HmmParser>` and
/// asks them in order whether they recognise the input.
pub trait HmmParser: Send + Sync {
/// Parses `content` into a format-independent profile.
fn parse(&self, content: &str) -> Result<UniversalHmmProfile>;
/// Returns `true` when `content` looks like this parser's format.
fn detect(&self, content: &str) -> bool;
/// Short, human-readable name of the format (used in log output and in
/// `MultiFormatHmmParser::supported_formats`).
fn format_name(&self) -> &'static str;
}
/// Parser for HMMER3 profile text.
///
/// Only the tagged header lines (`NAME`, `ACC`, `DESC`, `LENG`, `ALPH`, `GA`,
/// `TC`, `NC`) are read; the emission and transition tables are left empty.
pub struct Hmmer3Parser;

impl HmmParser for Hmmer3Parser {
    /// Builds a profile from the header lines of `content`.
    ///
    /// Lines are matched by prefix and, when a tag occurs more than once, the
    /// last occurrence wins. A tag without a value yields an empty string (or
    /// zero for `LENG`); an unparsable `LENG` leaves the length untouched and
    /// an unparsable threshold becomes `None`. This function never panics on
    /// malformed input and always returns `Ok`.
    fn parse(&self, content: &str) -> Result<UniversalHmmProfile> {
        let mut profile = UniversalHmmProfile {
            name: String::new(),
            accession: None,
            description: None,
            length: 0,
            alphabet: "amino".to_string(),
            emission_scores: Vec::new(),
            transition_scores: Vec::new(),
            meta: HmmMetadata::default(),
        };

        for line in content.lines() {
            // Second whitespace-separated token, i.e. the value after the tag.
            let value = || line.split_whitespace().nth(1);

            if line.starts_with("NAME") {
                profile.name = value().unwrap_or("").to_string();
            } else if line.starts_with("ACC") {
                profile.accession = Some(value().unwrap_or("").to_string());
            } else if line.starts_with("DESC") {
                // Checked slicing: a bare `DESC` line, or a multi-byte
                // character straddling byte 5, must not panic the parser.
                let text = line.get(5..).unwrap_or("");
                profile.description = Some(text.trim().to_string());
            } else if line.starts_with("LENG") {
                if let Ok(len) = value().unwrap_or("0").parse::<usize>() {
                    profile.length = len;
                }
            } else if line.starts_with("ALPH") {
                profile.alphabet = value().unwrap_or("amino").to_string();
            } else if line.starts_with("GA") {
                profile.meta.ga_threshold = value().and_then(|s| s.parse().ok());
            } else if line.starts_with("TC") {
                profile.meta.tc_threshold = value().and_then(|s| s.parse().ok());
            } else if line.starts_with("NC") {
                profile.meta.nc_threshold = value().and_then(|s| s.parse().ok());
            }
        }

        Ok(profile)
    }

    /// Returns `true` when `content` contains the substrings `HMMER`, `NAME`
    /// and `LENG` (anywhere in the text).
    fn detect(&self, content: &str) -> bool {
        content.contains("HMMER") && content.contains("NAME") && content.contains("LENG")
    }

    /// Returns the label `"HMMER3"`.
    fn format_name(&self) -> &'static str {
        "HMMER3"
    }
}
/// Parser for Pfam / Stockholm-style annotation text.
///
/// Reads the `#=GF ID`, `#=GF AC`, `#=GF DE` and `#=GF LEN` lines; score
/// tables and metadata are left at their defaults.
pub struct PfamParser;

impl HmmParser for PfamParser {
    /// Builds a profile from the `#=GF` annotation lines of `content`.
    ///
    /// Lines are matched by prefix; the last occurrence of a tag wins. A tag
    /// without a value yields an empty string (or zero for `LEN`), and an
    /// unparsable `LEN` leaves the length untouched. This function never
    /// panics on malformed input and always returns `Ok`.
    fn parse(&self, content: &str) -> Result<UniversalHmmProfile> {
        let mut profile = UniversalHmmProfile {
            name: String::new(),
            accession: None,
            description: None,
            length: 0,
            alphabet: "amino".to_string(),
            emission_scores: Vec::new(),
            transition_scores: Vec::new(),
            meta: HmmMetadata::default(),
        };

        for line in content.lines() {
            // Third whitespace-separated token: `#=GF`, the tag, then the value.
            let value = || line.split_whitespace().nth(2);

            if line.starts_with("#=GF ID") {
                profile.name = value().unwrap_or("").to_string();
            } else if line.starts_with("#=GF AC") {
                profile.accession = Some(value().unwrap_or("").to_string());
            } else if line.starts_with("#=GF DE") {
                // Checked slicing: the prefix is only 7 bytes long, so a bare
                // `#=GF DE` line (or a multi-byte character straddling byte 8)
                // would otherwise panic.
                let text = line.get(8..).unwrap_or("");
                profile.description = Some(text.trim().to_string());
            } else if line.starts_with("#=GF LEN") {
                if let Ok(len) = value().unwrap_or("0").parse::<usize>() {
                    profile.length = len;
                }
            }
        }

        Ok(profile)
    }

    /// Returns `true` when `content` contains `#=GF ID` or `# STOCKHOLM`.
    fn detect(&self, content: &str) -> bool {
        content.contains("#=GF ID") || content.contains("# STOCKHOLM")
    }

    /// Returns the label `"PFAM"`.
    fn format_name(&self) -> &'static str {
        "PFAM"
    }
}
/// Parser for `hmmsearch`-style report text.
///
/// Reads the `Query:`, `Accession:` and `Description:` lines; score tables and
/// metadata are left at their defaults.
pub struct HmmSearchParser;

impl HmmParser for HmmSearchParser {
    /// Builds a profile from the report header lines of `content`.
    ///
    /// The profile name is the second whitespace-separated token of the
    /// `Query:` line and the length is the number inside `[M=..]` on that same
    /// line. Always returns `Ok`.
    fn parse(&self, content: &str) -> Result<UniversalHmmProfile> {
        // NOTE(review): the other parsers in this file default the alphabet to
        // "amino"; confirm that "protein" is intended here.
        let mut profile = UniversalHmmProfile {
            name: String::new(),
            accession: None,
            description: None,
            length: 0,
            alphabet: "protein".to_string(),
            emission_scores: Vec::new(),
            transition_scores: Vec::new(),
            meta: HmmMetadata::default(),
        };

        for line in content.lines() {
            if line.starts_with("Query:") {
                if let Some(token) = line.split_whitespace().nth(1) {
                    profile.name = token.to_string();
                }

                // Text between the first `[M=` and the next `]`, as a number.
                let model_len = line
                    .split_once("[M=")
                    .and_then(|(_, tail)| tail.split_once(']'))
                    .and_then(|(digits, _)| digits.trim().parse::<usize>().ok());
                if let Some(n) = model_len {
                    profile.length = n;
                }
            } else if line.starts_with("Accession:") {
                if let Some(token) = line.split_whitespace().nth(1) {
                    profile.accession = Some(token.to_string());
                }
            } else if let Some(rest) = line.strip_prefix("Description:") {
                profile.description = Some(rest.trim().to_string());
            }
        }

        Ok(profile)
    }

    /// Returns `true` when `content` contains `Query:` together with either
    /// `HMM` or `hmmsearch`.
    fn detect(&self, content: &str) -> bool {
        if !content.contains("Query:") {
            return false;
        }
        ["HMM", "hmmsearch"].iter().any(|&marker| content.contains(marker))
    }

    /// Returns the label `"HMMSearch"`.
    fn format_name(&self) -> &'static str {
        "HMMSearch"
    }
}
/// Parser for InterPro-style flat text with `ID`, `AC` and `DE` lines.
///
/// Score tables and metadata are left at their defaults.
pub struct InterProParser;

impl HmmParser for InterProParser {
    /// Builds a profile from the `ID`, `AC` and `DE` lines of `content`.
    ///
    /// Lines are matched by prefix; the last occurrence of a tag wins. A tag
    /// without a value yields an empty string. This function never panics on
    /// malformed input and always returns `Ok`.
    fn parse(&self, content: &str) -> Result<UniversalHmmProfile> {
        let mut profile = UniversalHmmProfile {
            name: String::new(),
            accession: None,
            description: None,
            length: 0,
            alphabet: "amino".to_string(),
            emission_scores: Vec::new(),
            transition_scores: Vec::new(),
            meta: HmmMetadata::default(),
        };

        for line in content.lines() {
            // Second whitespace-separated token, i.e. the value after the tag.
            let value = || line.split_whitespace().nth(1);

            if line.starts_with("ID") {
                profile.name = value().unwrap_or("").to_string();
            } else if line.starts_with("AC") {
                profile.accession = Some(value().unwrap_or("").to_string());
            } else if line.starts_with("DE") {
                // Checked slicing: a bare `DE` line (or a multi-byte character
                // straddling byte 3) would otherwise panic.
                let text = line.get(3..).unwrap_or("");
                profile.description = Some(text.trim().to_string());
            }
        }

        Ok(profile)
    }

    /// Returns `true` when `content` has a line starting with `ID ` and a
    /// line starting with `AC `.
    ///
    /// The previous expression parsed as `((A || B) && C) || D`, so any text
    /// containing `"AC "` anywhere was accepted, and it searched for a literal
    /// `^` character instead of anchoring at the start of a line.
    fn detect(&self, content: &str) -> bool {
        let has_line = |tag: &str| content.lines().any(|l| l.starts_with(tag));
        has_line("ID ") && has_line("AC ")
    }

    /// Returns the label `"InterPro"`.
    fn format_name(&self) -> &'static str {
        "InterPro"
    }
}
pub struct MultiFormatHmmParser {
parsers: Vec<Box<dyn HmmParser>>,
}
impl MultiFormatHmmParser {
pub fn new() -> Self {
let parsers: Vec<Box<dyn HmmParser>> = vec![
Box::new(Hmmer3Parser),
Box::new(PfamParser),
Box::new(HmmSearchParser),
Box::new(InterProParser),
];
MultiFormatHmmParser { parsers }
}
pub fn parse_file<P: AsRef<Path>>(&self, path: P) -> Result<UniversalHmmProfile> {
let content = std::fs::read_to_string(path)
.map_err(|e| Error::AlignmentError(format!("Failed to read HMM file: {}", e)))?;
self.parse_string(&content)
}
pub fn parse_string(&self, content: &str) -> Result<UniversalHmmProfile> {
for parser in &self.parsers {
if parser.detect(content) {
eprintln!("Detected format: {}", parser.format_name());
return parser.parse(content);
}
}
Err(Error::AlignmentError(
"Unable to detect HMM format. Supported: HMMER3, PFAM, HMMSearch, InterPro".to_string()
))
}
pub fn supported_formats(&self) -> Vec<&'static str> {
self.parsers.iter().map(|p| p.format_name()).collect()
}
}
impl Default for MultiFormatHmmParser {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal HMMER3 header used as a detection fixture.
    const HMMER3_FIXTURE: &str = r#"HMMER3/f [3.3 | Nov 2019]
NAME TestProfile
ACC PF00001
DESC Test HMM
LENG 100
ALPH amino
"#;

    /// Minimal Stockholm/Pfam annotation block used as a detection fixture.
    const PFAM_FIXTURE: &str = r#"# STOCKHOLM 1.0
#=GF ID TestProfile
#=GF AC PF00001
#=GF LEN 100
"#;

    /// The HMMER3 parser recognises a HMMER3 header.
    #[test]
    fn test_hmmer3_detection() {
        assert!(Hmmer3Parser.detect(HMMER3_FIXTURE));
    }

    /// The Pfam parser recognises Stockholm-style annotations.
    #[test]
    fn test_pfam_detection() {
        assert!(PfamParser.detect(PFAM_FIXTURE));
    }

    /// The multi-format front end registers four parsers.
    #[test]
    fn test_multi_format_parser() {
        let formats = MultiFormatHmmParser::new().supported_formats();
        assert_eq!(formats.len(), 4);
    }
}