use crate::error::{Error, Result};
use crate::protein::{AminoAcid, Protein};
use crate::futures::cli_file_io::SeqRecord;
use crate::alignment::Cigar;
use crate::scoring::{ScoringMatrix, MatrixType, AffinePenalty};
use crate::alignment::hmmer3_parser::KarlinParameters;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct StJudeAminoAcid(pub u8);
impl StJudeAminoAcid {
pub const CANONICAL_COUNT: usize = 20;
pub const TOTAL_COUNT: usize = 24;
pub fn from_code(code: char) -> Result<Self> {
let idx = match code.to_ascii_uppercase() {
'A' => 0, 'R' => 1, 'N' => 2, 'D' => 3, 'C' => 4, 'E' => 5, 'Q' => 6, 'G' => 7, 'H' => 8, 'I' => 9, 'L' => 10, 'K' => 11, 'M' => 12, 'F' => 13, 'P' => 14, 'S' => 15, 'T' => 16, 'W' => 17, 'Y' => 18, 'V' => 19, 'B' => 20, 'Z' => 21, 'X' => 22, '*' => 23, _ => return Err(Error::InvalidAminoAcid(code)),
};
Ok(StJudeAminoAcid(idx as u8))
}
pub fn to_code(&self) -> Result<char> {
let codes = [
'A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V',
'B', 'Z', 'X', '*',
];
codes.get(self.0 as usize).copied()
.ok_or_else(|| Error::AlignmentError(format!("Invalid amino acid index: {}", self.0)))
}
pub fn to_three_letter(&self) -> Result<&'static str> {
let names = [
"Ala", "Arg", "Asn", "Asp", "Cys", "Glu", "Gln", "Gly", "His", "Ile",
"Leu", "Lys", "Met", "Phe", "Pro", "Ser", "Thr", "Trp", "Tyr", "Val",
"Asx", "Glx", "Xaa", "***",
];
names.get(self.0 as usize).copied()
.ok_or_else(|| Error::AlignmentError(format!("Invalid amino acid index: {}", self.0)))
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StJudeSequence {
pub id: String,
pub accession: Option<String>,
pub sequence: Vec<u8>,
pub sequence_type: SequenceType,
pub description: Option<String>,
pub genomic_location: Option<String>,
pub source_db: Option<String>,
pub taxonomy_id: Option<u32>,
pub clinical_flags: Vec<String>,
pub metadata: HashMap<String, String>,
}
impl StJudeSequence {
pub fn new(id: String, sequence: Vec<u8>, sequence_type: SequenceType) -> Self {
StJudeSequence {
id,
accession: None,
sequence,
sequence_type,
description: None,
genomic_location: None,
source_db: None,
taxonomy_id: None,
clinical_flags: Vec::new(),
metadata: HashMap::new(),
}
}
pub fn len(&self) -> usize {
self.sequence.len()
}
pub fn is_empty(&self) -> bool {
self.sequence.is_empty()
}
pub fn add_clinical_flag(&mut self, flag: String) {
self.clinical_flags.push(flag);
}
pub fn is_clinically_significant(&self) -> bool {
!self.clinical_flags.is_empty()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SequenceType {
Protein,
Dna,
Rna,
Codon,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StJudeAlignment {
pub query_id: String,
pub subject_id: String,
pub score: i32,
pub evalue: f64,
pub bit_score: f64,
pub identity: f64,
pub query_start: u32,
pub query_end: u32,
pub subject_start: u32,
pub subject_end: u32,
pub matches: u32,
pub alignment_length: u32,
pub gap_opens: u32,
pub gaps: u32,
pub cigar: String,
pub query_string: String,
pub subject_string: String,
pub interpretation: Option<String>,
pub databases: Vec<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StJudeScoringMatrix {
pub name: String,
pub scores: Vec<Vec<i32>>,
pub size: usize,
pub gap_open: i32,
pub gap_extend: i32,
pub reference: Option<String>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ParsimonyState(pub u8);
impl ParsimonyState {
pub fn from_code(code: char) -> Result<Self> {
let idx = match code.to_ascii_uppercase() {
'A' => 0,
'R' => 1,
'N' => 2,
'D' => 3,
'C' => 4,
'E' => 5,
'Q' => 6,
'G' => 7,
'H' => 8,
'I' => 9,
'L' => 10,
'K' => 11,
'M' => 12,
'F' => 13,
'P' => 14,
'S' => 15,
'T' => 16,
'W' => 17,
'Y' => 18,
'V' => 19,
_ => return Err(Error::InvalidAminoAcid(code)),
};
Ok(ParsimonyState(idx as u8))
}
pub fn transition_cost(&self, other: ParsimonyState) -> u32 {
if self.0 != other.0 { 1 } else { 0 }
}
}
#[derive(Debug, Clone)]
pub struct BridgeConfig {
pub include_coordinates: bool,
pub include_clinical: bool,
pub default_source_db: Option<String>,
pub default_taxonomy_id: Option<u32>,
pub validate_sequences: bool,
}
impl Default for BridgeConfig {
fn default() -> Self {
BridgeConfig {
include_coordinates: true,
include_clinical: true,
default_source_db: Some("Ensembl".to_string()),
default_taxonomy_id: Some(9606), validate_sequences: true,
}
}
}
pub struct StJudeBridge {
config: BridgeConfig,
}
impl StJudeBridge {
pub fn new(config: BridgeConfig) -> Self {
StJudeBridge { config }
}
pub fn to_st_jude_amino_acid(&self, aa: AminoAcid) -> Result<StJudeAminoAcid> {
StJudeAminoAcid::from_code(aa.to_code())
}
pub fn from_st_jude_amino_acid(&self, st_jude_aa: StJudeAminoAcid) -> Result<AminoAcid> {
let code = st_jude_aa.to_code()?;
AminoAcid::from_code(code)
}
pub fn to_st_jude_sequence(&self, protein: &Protein) -> Result<StJudeSequence> {
let mut sequence = Vec::new();
for &aa in protein.sequence() {
let st_jude_aa = self.to_st_jude_amino_acid(aa)?;
sequence.push(st_jude_aa.0);
}
let mut st_jude_seq = StJudeSequence::new(
protein.id().unwrap_or("unknown").to_string(),
sequence,
SequenceType::Protein,
);
if let Some(desc) = protein.description() {
st_jude_seq.description = Some(desc.to_string());
}
if let Some(db) = &self.config.default_source_db {
st_jude_seq.source_db = Some(db.clone());
}
st_jude_seq.taxonomy_id = self.config.default_taxonomy_id;
Ok(st_jude_seq)
}
pub fn from_st_jude_sequence(&self, st_jude_seq: &StJudeSequence) -> Result<Protein> {
if st_jude_seq.sequence_type != SequenceType::Protein {
return Err(Error::AlignmentError(
"St. Jude sequence must be Protein type for conversion".to_string(),
));
}
if self.config.validate_sequences && st_jude_seq.is_empty() {
return Err(Error::EmptySequence);
}
let mut sequence = Vec::new();
for &idx in &st_jude_seq.sequence {
let st_jude_aa = StJudeAminoAcid(idx);
let aa = self.from_st_jude_amino_acid(st_jude_aa)?;
sequence.push(aa);
}
let mut protein = Protein::new(sequence)?;
protein = protein.with_id(st_jude_seq.id.clone());
if let Some(desc) = &st_jude_seq.description {
protein = protein.with_description(desc.clone());
}
Ok(protein)
}
pub fn seq_record_to_st_jude(&self, record: &SeqRecord) -> Result<StJudeSequence> {
let sequence_type = match record.quality {
Some(_) => SequenceType::Dna, None => SequenceType::Dna, };
let sequence = if sequence_type == SequenceType::Dna {
let mut seq = Vec::new();
for c in record.sequence.to_uppercase().chars() {
let idx = match c {
'A' => 0,
'C' => 1,
'G' => 2,
'T' => 3,
'N' | 'X' => 4, _ => 4,
};
seq.push(idx);
}
seq
} else {
vec![]
};
let mut st_jude_seq = StJudeSequence::new(record.id.clone(), sequence, sequence_type);
st_jude_seq.description = record.description.clone();
if let Some(db) = &self.config.default_source_db {
st_jude_seq.source_db = Some(db.clone());
}
st_jude_seq.taxonomy_id = self.config.default_taxonomy_id;
Ok(st_jude_seq)
}
pub fn st_jude_to_seq_record(&self, st_jude_seq: &StJudeSequence) -> Result<SeqRecord> {
let sequence = if st_jude_seq.sequence_type == SequenceType::Dna {
let mut seq = String::new();
for &idx in &st_jude_seq.sequence {
let c = match idx {
0 => 'A',
1 => 'C',
2 => 'G',
3 => 'T',
4 => 'N',
_ => 'N',
};
seq.push(c);
}
seq
} else {
String::new()
};
Ok(SeqRecord {
id: st_jude_seq.id.clone(),
description: st_jude_seq.description.clone(),
sequence,
quality: None,
})
}
pub fn to_st_jude_alignment(
&self,
query_id: &str,
subject_id: &str,
score: i32,
cigar: &Cigar,
query_string: &str,
subject_string: &str,
karlin_params: &KarlinParameters,
database_size: u64, ) -> Result<StJudeAlignment> {
let alignment_length = cigar.query_length();
let query_start = 1; let query_end = query_start + cigar.query_length();
let subject_start = 1;
let subject_end = subject_start + cigar.reference_length();
let mut matches = 0;
let mut gaps = 0;
for &c in query_string.as_bytes() {
if c == b'-' {
gaps += 1;
} else {
matches += 1;
}
}
let identity = if alignment_length > 0 {
matches as f64 / alignment_length as f64
} else {
0.0
};
let bit_score = karlin_params.bit_score(score as f64);
let evalue = karlin_params.evalue(bit_score, database_size);
Ok(StJudeAlignment {
query_id: query_id.to_string(),
subject_id: subject_id.to_string(),
score,
evalue,
bit_score: bit_score.max(0.0),
identity,
query_start,
query_end,
subject_start,
subject_end,
matches,
alignment_length,
gap_opens: 1, gaps,
cigar: cigar.to_string(),
query_string: query_string.to_string(),
subject_string: subject_string.to_string(),
interpretation: None,
databases: Vec::new(),
})
}
pub fn to_st_jude_matrix(
&self,
matrix: &ScoringMatrix,
gap_penalty: &AffinePenalty,
) -> Result<StJudeScoringMatrix> {
let name = matrix.matrix_type().to_string();
let scores = matrix.raw_scores();
Ok(StJudeScoringMatrix {
name,
scores: scores.clone(),
size: scores.len(),
gap_open: gap_penalty.open,
gap_extend: gap_penalty.extend,
reference: Some("NCBI Standard".to_string()),
})
}
pub fn from_st_jude_matrix(&self, st_jude_matrix: &StJudeScoringMatrix) -> Result<ScoringMatrix> {
let matrix_type = match st_jude_matrix.name.to_uppercase().as_str() {
"BLOSUM62" => MatrixType::Blosum62,
"BLOSUM45" => MatrixType::Blosum45,
"BLOSUM80" => MatrixType::Blosum80,
"PAM30" => MatrixType::Pam30,
"PAM70" => MatrixType::Pam70,
_ => MatrixType::Blosum62, };
ScoringMatrix::new(matrix_type)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_st_jude_amino_acid_conversion() -> Result<()> {
let st_jude_aa = StJudeAminoAcid::from_code('A')?;
assert_eq!(st_jude_aa.0, 0);
assert_eq!(st_jude_aa.to_code()?, 'A');
let st_jude_aa = StJudeAminoAcid::from_code('W')?;
assert_eq!(st_jude_aa.0, 17);
for c in "ACDEFGHIKLMNPQRSTVWY".chars() {
let st_jude_aa = StJudeAminoAcid::from_code(c)?;
assert_eq!(st_jude_aa.to_code()?, c);
}
Ok(())
}
#[test]
fn test_parsimony_state_creation() -> Result<()> {
let state = ParsimonyState::from_code('M')?;
assert_eq!(state.0, 12);
let state2 = ParsimonyState::from_code('M')?;
assert_eq!(state.transition_cost(state2), 0);
let state3 = ParsimonyState::from_code('L')?;
assert_eq!(state.transition_cost(state3), 1);
Ok(())
}
#[test]
fn test_bridge_protein_to_st_jude() -> Result<()> {
let bridge = StJudeBridge::new(BridgeConfig::default());
let protein = Protein::from_string("MVHLTPEEKS")?
.with_id("HBB".to_string())
.with_description("Beta-globin".to_string());
let st_jude_seq = bridge.to_st_jude_sequence(&protein)?;
assert_eq!(st_jude_seq.id, "HBB");
assert_eq!(st_jude_seq.description, Some("Beta-globin".to_string()));
assert_eq!(st_jude_seq.sequence_type, SequenceType::Protein);
assert_eq!(st_jude_seq.len(), 10);
assert_eq!(st_jude_seq.source_db, Some("Ensembl".to_string()));
Ok(())
}
#[test]
fn test_bridge_st_jude_to_protein() -> Result<()> {
let bridge = StJudeBridge::new(BridgeConfig::default());
let mut st_jude_seq =
StJudeSequence::new("TP53".to_string(), vec![3, 14, 19], SequenceType::Protein);
st_jude_seq.description = Some("Tumor suppressor p53".to_string());
let protein = bridge.from_st_jude_sequence(&st_jude_seq)?;
assert_eq!(protein.id(), Some("TP53"));
assert_eq!(protein.description(), Some("Tumor suppressor p53"));
assert_eq!(protein.len(), 3);
Ok(())
}
#[test]
fn test_bridge_roundtrip_conversion() -> Result<()> {
let bridge = StJudeBridge::new(BridgeConfig::default());
let original_protein = Protein::from_string("ARNDCQEGHILKMFPSTWYV")?
.with_id("ALL_AAS".to_string());
let st_jude_seq = bridge.to_st_jude_sequence(&original_protein)?;
let recovered_protein = bridge.from_st_jude_sequence(&st_jude_seq)?;
assert_eq!(original_protein.sequence(), recovered_protein.sequence());
assert_eq!(original_protein.id(), recovered_protein.id());
Ok(())
}
#[test]
fn test_seq_record_to_st_jude() -> Result<()> {
let bridge = StJudeBridge::new(BridgeConfig::default());
let record = SeqRecord {
id: "chr1:1000-1100".to_string(),
description: Some("Test region".to_string()),
sequence: "ACGTACGTACGT".to_string(),
quality: None,
};
let st_jude_seq = bridge.seq_record_to_st_jude(&record)?;
assert_eq!(st_jude_seq.id, "chr1:1000-1100");
assert_eq!(st_jude_seq.sequence_type, SequenceType::Dna);
assert_eq!(st_jude_seq.len(), 12);
Ok(())
}
#[test]
fn test_alignment_conversion() -> Result<()> {
let bridge = StJudeBridge::new(BridgeConfig::default());
let karlin = KarlinParameters::default_protein();
let cigar = crate::alignment::Cigar::new();
let alignment = bridge.to_st_jude_alignment(
"query1",
"subject1",
85,
&cigar,
"ACDEF",
"ACGEF",
&karlin,
1_000_000_000, )?;
assert_eq!(alignment.query_id, "query1");
assert_eq!(alignment.subject_id, "subject1");
assert_eq!(alignment.score, 85);
assert!(alignment.identity >= 0.0 && alignment.identity <= 1.0);
Ok(())
}
#[test]
fn test_clinical_flags() -> Result<()> {
let mut st_jude_seq = StJudeSequence::new("BRCA1".to_string(), vec![], SequenceType::Protein);
assert!(!st_jude_seq.is_clinically_significant());
st_jude_seq.add_clinical_flag("pathogenic".to_string());
st_jude_seq.add_clinical_flag("loss-of-function".to_string());
assert!(st_jude_seq.is_clinically_significant());
assert_eq!(st_jude_seq.clinical_flags.len(), 2);
Ok(())
}
#[test]
fn test_bridge_empty_sequence_validation() -> Result<()> {
let bridge = StJudeBridge::new(BridgeConfig {
validate_sequences: true,
..Default::default()
});
let empty_seq = StJudeSequence::new("empty".to_string(), vec![], SequenceType::Protein);
let result = bridge.from_st_jude_sequence(&empty_seq);
assert!(result.is_err());
Ok(())
}
#[test]
fn test_metadata_preservation() -> Result<()> {
let mut st_jude_seq =
StJudeSequence::new("TEST".to_string(), vec![0], SequenceType::Protein);
st_jude_seq.metadata.insert("gene_name".to_string(), "PTEN".to_string());
st_jude_seq.metadata.insert("chr".to_string(), "10".to_string());
assert_eq!(st_jude_seq.metadata.get("gene_name"), Some(&"PTEN".to_string()));
assert_eq!(st_jude_seq.metadata.get("chr"), Some(&"10".to_string()));
Ok(())
}
#[test]
fn test_taxonomy_id_defaults() {
let bridge = StJudeBridge::new(BridgeConfig::default());
assert_eq!(bridge.config.default_taxonomy_id, Some(9606)); }
#[test]
fn test_three_letter_codes() -> Result<()> {
let st_jude_aa = StJudeAminoAcid::from_code('G')?;
assert_eq!(st_jude_aa.to_three_letter()?, "Gly");
let st_jude_aa = StJudeAminoAcid::from_code('P')?;
assert_eq!(st_jude_aa.to_three_letter()?, "Pro");
Ok(())
}
}