use std::f32;
#[derive(Debug, Clone)]
pub struct Pssm {
pub scores: Vec<Vec<f32>>,
pub conservation: Vec<f32>,
pub weights: Vec<f32>,
pub bg_freq: Vec<f32>,
}
pub struct PssmKernel;
impl PssmKernel {
pub fn construct_pssm(
alignment: &[Vec<u8>],
background: &[f32],
) -> Result<Pssm, String> {
if alignment.is_empty() || alignment[0].is_empty() {
return Err("Empty alignment".to_string());
}
if background.len() != 24 {
return Err("Background frequencies must have 24 elements".to_string());
}
let num_seqs = alignment.len();
let num_pos = alignment[0].len();
let weights = Self::compute_henikoff_weights(alignment)?;
let mut scores = vec![vec![0.0f32; 24]; num_pos];
let mut conservation = vec![0.0f32; num_pos];
for pos in 0..num_pos {
let mut weighted_counts = vec![0.0f32; 24];
let mut total_weight = 0.0f32;
for seq_idx in 0..num_seqs {
if let Some(&aa) = alignment[seq_idx].get(pos) {
let aa_idx = (aa as usize) % 24;
let weight = weights.get(seq_idx).copied().unwrap_or(1.0 / num_seqs as f32);
weighted_counts[aa_idx] += weight;
total_weight += weight;
}
}
let mut entropy = 0.0f32;
for aa in 0..24 {
let pseudocount = 1.0f32;
let count = (weighted_counts[aa] + pseudocount) / (total_weight + 24.0);
let bg = background.get(aa).copied().unwrap_or(1.0 / 24.0);
let score = (count / bg).max(0.001).ln();
scores[pos][aa] = score;
if count > 0.0 {
entropy -= count * count.ln();
}
}
conservation[pos] = entropy; }
Ok(Pssm {
scores,
conservation,
weights,
bg_freq: background.to_vec(),
})
}
fn compute_henikoff_weights(alignment: &[Vec<u8>]) -> Result<Vec<f32>, String> {
let num_seqs = alignment.len();
let num_pos = alignment.get(0).map(|v| v.len()).unwrap_or(0);
if num_seqs == 0 || num_pos == 0 {
return Err("Empty alignment for weight computation".to_string());
}
let mut weights = vec![0.0f32; num_seqs];
for pos in 0..num_pos {
let mut aa_counts = vec![0.0f32; 24];
for seq_idx in 0..num_seqs {
if let Some(&aa) = alignment[seq_idx].get(pos) {
let aa_idx = (aa as usize) % 24;
aa_counts[aa_idx] += 1.0;
}
}
for seq_idx in 0..num_seqs {
if let Some(&aa) = alignment[seq_idx].get(pos) {
let aa_idx = (aa as usize) % 24;
let count = (aa_counts[aa_idx]).max(1.0);
let num_distinct = aa_counts.iter().filter(|&&c| c > 0.0).count() as f32;
weights[seq_idx] += 1.0 / (num_distinct * count);
}
}
}
let sum: f32 = weights.iter().sum();
if sum > 0.0 {
for w in &mut weights {
*w = (*w / sum) * num_seqs as f32;
}
}
Ok(weights)
}
pub fn apply_dirichlet_prior(
pssm: &mut Pssm,
alpha: f32,
) {
for pos in 0..pssm.scores.len() {
for aa in 0..24 {
let uniform: f32 = 1.0 / 24.0;
pssm.scores[pos][aa] =
pssm.scores[pos][aa] * (1.0 - alpha) + uniform.ln() * alpha;
}
}
}
}
pub struct ProfileAlignmentKernel;
impl ProfileAlignmentKernel {
pub fn score_profile(
sequence: &[u8],
pssm: &Pssm,
) -> Result<f32, String> {
if sequence.len() != pssm.scores.len() {
return Err(format!(
"Sequence length {} doesn't match PSSM length {}",
sequence.len(),
pssm.scores.len()
));
}
let mut total_score = 0.0f32;
for (pos, &aa) in sequence.iter().enumerate() {
let aa_idx = (aa as usize) % 24;
if let Some(score) = pssm.scores[pos].get(aa_idx) {
total_score += score;
}
}
Ok(total_score)
}
pub fn score_profile_alignment(
sequences: &[Vec<u8>],
pssm: &Pssm,
_gap_penalty: f32,
) -> Result<Vec<f32>, String> {
let mut scores = Vec::with_capacity(sequences.len());
for sequence in sequences {
let score = Self::score_profile(sequence, pssm)?;
scores.push(score);
}
Ok(scores)
}
}
pub struct ConservationKernel;
impl ConservationKernel {
pub fn compute_entropy(pssm: &Pssm) -> Vec<f32> {
pssm.conservation.clone()
}
pub fn compute_kl_divergence(pssm: &Pssm) -> Result<Vec<f32>, String> {
let mut divergences = Vec::with_capacity(pssm.scores.len());
for pos in 0..pssm.scores.len() {
let mut kl = 0.0f32;
for aa in 0..24 {
let score = pssm.scores[pos].get(aa).copied().unwrap_or(0.0);
let prob_ratio = score.exp();
let bg = pssm.bg_freq.get(aa).copied().unwrap_or(1.0 / 24.0);
let prob = prob_ratio * bg;
if prob > 0.0 {
kl += prob * prob_ratio.ln();
}
}
divergences.push(kl);
}
Ok(divergences)
}
pub fn compute_score_frequency(pssm: &Pssm, threshold: f32) -> Vec<usize> {
pssm.scores
.iter()
.map(|position| {
position
.iter()
.filter(|&&score| score >= threshold)
.count()
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_pssm_construction() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
vec![1u8, 2u8, 3u8],
];
let bg = vec![1.0 / 24.0; 24];
let result = PssmKernel::construct_pssm(&alignment, &bg);
assert!(result.is_ok());
let pssm = result.unwrap();
assert_eq!(pssm.scores.len(), 3);
assert_eq!(pssm.scores[0].len(), 24);
}
#[test]
fn test_henikoff_weights() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
vec![1u8, 2u8, 3u8],
];
let weights = PssmKernel::compute_henikoff_weights(&alignment).unwrap();
assert_eq!(weights.len(), 3);
let sum: f32 = weights.iter().sum();
assert!((sum - 3.0).abs() < 0.01);
}
#[test]
fn test_profile_scoring() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
];
let bg = vec![1.0 / 24.0; 24];
let pssm = PssmKernel::construct_pssm(&alignment, &bg).unwrap();
let sequence = vec![0u8, 1u8, 2u8];
let score = ProfileAlignmentKernel::score_profile(&sequence, &pssm);
assert!(score.is_ok());
assert!(score.unwrap().is_finite());
}
#[test]
fn test_entropy_computation() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
];
let bg = vec![1.0 / 24.0; 24];
let pssm = PssmKernel::construct_pssm(&alignment, &bg).unwrap();
let entropy = ConservationKernel::compute_entropy(&pssm);
assert_eq!(entropy.len(), 3);
for e in entropy {
assert!(e >= 0.0);
}
}
#[test]
fn test_kl_divergence() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
];
let bg = vec![1.0 / 24.0; 24];
let pssm = PssmKernel::construct_pssm(&alignment, &bg).unwrap();
let kl = ConservationKernel::compute_kl_divergence(&pssm);
assert!(kl.is_ok());
assert_eq!(kl.unwrap().len(), 3);
}
#[test]
fn test_pssm_single_sequence() {
let alignment = vec![vec![0u8, 1u8, 2u8, 3u8, 4u8]];
let bg = vec![1.0 / 24.0; 24];
let result = PssmKernel::construct_pssm(&alignment, &bg);
assert!(result.is_ok());
let pssm = result.unwrap();
assert_eq!(pssm.scores.len(), 5);
}
#[test]
fn test_pssm_identical_sequences() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
];
let bg = vec![1.0 / 24.0; 24];
let result = PssmKernel::construct_pssm(&alignment, &bg);
assert!(result.is_ok());
let pssm = result.unwrap();
for (i, w1) in pssm.weights.iter().enumerate() {
for j in 0..i {
assert!((w1 - pssm.weights[j]).abs() < 0.01);
}
}
}
#[test]
fn test_pssm_diverse_alignment() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![3u8, 4u8, 5u8],
vec![6u8, 7u8, 8u8],
vec![9u8, 10u8, 11u8],
];
let bg = vec![1.0 / 24.0; 24];
let result = PssmKernel::construct_pssm(&alignment, &bg);
assert!(result.is_ok());
let pssm = result.unwrap();
assert_eq!(pssm.weights.len(), 4);
}
#[test]
fn test_dirichlet_prior_application() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
];
let bg = vec![1.0 / 24.0; 24];
let mut pssm = PssmKernel::construct_pssm(&alignment, &bg).unwrap();
let original_score = pssm.scores[0][0];
PssmKernel::apply_dirichlet_prior(&mut pssm, 0.5);
assert!(pssm.scores[0][0] != original_score);
assert!(pssm.scores[0][0].is_finite());
}
#[test]
fn test_profile_alignment_multiple_sequences() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
];
let bg = vec![1.0 / 24.0; 24];
let pssm = PssmKernel::construct_pssm(&alignment, &bg).unwrap();
let sequences = vec![
vec![0u8, 1u8, 2u8],
vec![1u8, 2u8, 3u8],
vec![0u8, 0u8, 0u8],
];
let result = ProfileAlignmentKernel::score_profile_alignment(&sequences, &pssm, -1.0);
assert!(result.is_ok());
let scores = result.unwrap();
assert_eq!(scores.len(), 3);
for score in scores {
assert!(score.is_finite());
}
}
#[test]
fn test_entropy_conservation() {
let alignment = vec![
vec![0u8, 0u8, 0u8],
vec![0u8, 0u8, 0u8], ];
let bg = vec![1.0 / 24.0; 24];
let pssm = PssmKernel::construct_pssm(&alignment, &bg).unwrap();
let entropy = ConservationKernel::compute_entropy(&pssm);
assert!(entropy.len() == 3);
assert!(entropy[0] >= 0.0 && entropy[0] < 3.18); }
#[test]
fn test_entropy_divergence() {
let alignment = vec![
vec![0u8, 1u8, 2u8, 3u8],
vec![1u8, 0u8, 3u8, 2u8],
vec![2u8, 3u8, 0u8, 1u8],
vec![3u8, 2u8, 1u8, 0u8],
];
let bg = vec![1.0 / 24.0; 24];
let pssm = PssmKernel::construct_pssm(&alignment, &bg).unwrap();
let entropy = ConservationKernel::compute_entropy(&pssm);
for e in entropy {
assert!(e >= 0.0 && e <= 10.0);
}
}
#[test]
fn test_score_frequency() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
];
let bg = vec![1.0 / 24.0; 24];
let pssm = PssmKernel::construct_pssm(&alignment, &bg).unwrap();
let frequency = ConservationKernel::compute_score_frequency(&pssm, 0.0);
assert_eq!(frequency.len(), 3);
for freq in frequency {
assert!(freq > 0);
assert!(freq <= 24);
}
}
#[test]
fn test_long_alignment() {
let seq: Vec<u8> = (0..50).map(|i| (i % 20) as u8).collect();
let alignment = vec![seq.clone(), seq.clone(), seq];
let bg = vec![1.0 / 24.0; 24];
let result = PssmKernel::construct_pssm(&alignment, &bg);
assert!(result.is_ok());
let pssm = result.unwrap();
assert_eq!(pssm.scores.len(), 50);
}
#[test]
fn test_pssm_score_range() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
];
let bg = vec![1.0 / 24.0; 24];
let pssm = PssmKernel::construct_pssm(&alignment, &bg).unwrap();
for pos in &pssm.scores {
for score in pos {
assert!(score.is_finite());
assert!(*score > -100.0 && *score < 100.0);
}
}
}
#[test]
fn test_edge_case_invalid_alignment() {
let alignment = vec![];
let bg = vec![1.0 / 24.0; 24];
let result = PssmKernel::construct_pssm(&alignment, &bg);
assert!(result.is_err());
}
#[test]
fn test_edge_case_invalid_background() {
let alignment = vec![vec![0u8, 1u8, 2u8]];
let bg = vec![1.0 / 20.0; 20];
let result = PssmKernel::construct_pssm(&alignment, &bg);
assert!(result.is_err());
}
#[test]
fn test_dirichlet_prior() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
];
let bg = vec![1.0 / 24.0; 24];
let mut pssm = PssmKernel::construct_pssm(&alignment, &bg).unwrap();
let original_score = pssm.scores[0][0];
PssmKernel::apply_dirichlet_prior(&mut pssm, 0.1);
let smoothed_score = pssm.scores[0][0];
assert_ne!(original_score, smoothed_score);
}
#[test]
fn test_empty_alignment_error() {
let alignment: Vec<Vec<u8>> = vec![];
let bg = vec![1.0 / 24.0; 24];
let result = PssmKernel::construct_pssm(&alignment, &bg);
assert!(result.is_err());
}
#[test]
fn test_batch_profile_scoring() {
let alignment = vec![
vec![0u8, 1u8, 2u8],
vec![0u8, 1u8, 2u8],
];
let bg = vec![1.0 / 24.0; 24];
let pssm = PssmKernel::construct_pssm(&alignment, &bg).unwrap();
let sequences = vec![
vec![0u8, 1u8, 2u8],
vec![1u8, 2u8, 3u8],
];
let scores = ProfileAlignmentKernel::score_profile_alignment(&sequences, &pssm, -1.0);
assert!(scores.is_ok());
assert_eq!(scores.unwrap().len(), 2);
}
}