use std::sync::LazyLock;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use crate::sequence::{AminoAcid, SequenceElement};
/// A protease, described by the residue pattern that has to surround a cut site.
///
/// Both sides of the cut site are lists of pattern positions. A position is either `None`,
/// which accepts any residue, or `Some(options)`, which accepts a residue only if it matches
/// at least one of the listed options (so an empty list of options accepts nothing).
#[derive(Clone, Debug, Deserialize, Eq, Hash, PartialEq, Serialize)]
pub struct Protease {
/// The pattern positions directly preceding the cut site, in sequence order: the last
/// entry describes the residue immediately before the cut.
pub before: Vec<Option<Vec<AminoAcid>>>,
/// The pattern positions directly following the cut site, in sequence order: the first
/// entry describes the residue immediately after the cut.
pub after: Vec<Option<Vec<AminoAcid>>>,
}
impl Protease {
    /// Define a protease that cuts between two fixed stretches of residues.
    ///
    /// Every residue of `before` and `after` becomes its own pattern position with that
    /// residue as the only accepted option, so the complete stretches have to be present
    /// around the cut site.
    pub fn between_stretches(before: &[AminoAcid], after: &[AminoAcid]) -> Self {
        Self {
            before: before.iter().map(|aa| Some(vec![*aa])).collect_vec(),
            after: after.iter().map(|aa| Some(vec![*aa])).collect_vec(),
        }
    }

    /// Define a protease that cuts between a single residue out of `before` and a single
    /// residue out of `after`.
    ///
    /// Both sides consist of exactly one pattern position, the given lists are the accepted
    /// options for that position.
    pub fn between_options(before: Vec<AminoAcid>, after: Vec<AminoAcid>) -> Self {
        Self {
            before: vec![Some(before)],
            after: vec![Some(after)],
        }
    }

    /// Define a protease that cuts directly after any of the given residues, without any
    /// requirement on the residue that follows the cut.
    pub fn c_terminal_of(residues: Vec<AminoAcid>) -> Self {
        Self {
            before: vec![Some(residues)],
            after: Vec::new(),
        }
    }

    /// Define a protease that cuts directly before any of the given residues, without any
    /// requirement on the residue that precedes the cut.
    pub fn n_terminal_of(residues: Vec<AminoAcid>) -> Self {
        Self {
            before: Vec::new(),
            after: vec![Some(residues)],
        }
    }

    /// Get every amino acid of [`AminoAcid::ALL_AMINO_ACIDS`] that is not listed in `exclude`.
    ///
    /// This is a helper to write "anything but these" pattern positions.
    pub fn get_exclusive(exclude: &[AminoAcid]) -> Vec<AminoAcid> {
        AminoAcid::ALL_AMINO_ACIDS
            .iter()
            .copied()
            .filter(|aa| !exclude.contains(aa))
            .collect_vec()
    }

    /// Find all locations in the given sequence where this protease matches.
    ///
    /// A returned location `i` means the cut lies directly before `sequence[i]`: the
    /// `before` pattern is matched against the residues ending at `i - 1` and the `after`
    /// pattern against the residues starting at `i`. Locations are returned in ascending
    /// order and never exceed the index of the last residue, so a cut behind the final
    /// residue is never reported.
    ///
    /// A sequence that is shorter than the full pattern yields no locations.
    pub fn match_locations<T>(&self, sequence: &[SequenceElement<T>]) -> Vec<usize> {
        let n_before = self.before.len();
        let n_after = self.after.len();

        // A sequence shorter than the pattern cannot contain a match. Without this guard a
        // pattern with an empty `before` side would still try location 0 and slice past the
        // end of a too short (for example empty) sequence, which panics.
        if sequence.len() < n_before + n_after {
            return Vec::new();
        }

        // The last candidate has to leave room for the `after` pattern and may not lie
        // behind the last residue. The subtraction cannot underflow because of the guard.
        let upper = (sequence.len() - n_after).min(sequence.len().saturating_sub(1));

        (n_before..=upper)
            .filter(|&site| self.matches_at(&sequence[site - n_before..site + n_after]))
            .collect_vec()
    }

    /// Check if the given window matches the full pattern of this protease.
    ///
    /// The window has to contain exactly the residues for the `before` pattern followed by
    /// the residues for the `after` pattern (checked with a debug assertion).
    fn matches_at<T>(&self, slice: &[SequenceElement<T>]) -> bool {
        debug_assert!(slice.len() == self.before.len() + self.after.len());
        slice
            .iter()
            .zip(self.before.iter().chain(self.after.iter()))
            .all(|(actual, pattern)| match pattern {
                // A position without options accepts any residue
                None => true,
                // Otherwise at least one of the options has to match the actual residue
                Some(options) => options
                    .iter()
                    .any(|option| option.canonical_identical(actual.aminoacid.aminoacid())),
            })
    }
}
pub mod known_proteases {
use super::*;
pub static TRYPSIN: LazyLock<Protease> =
LazyLock::new(|| Protease::c_terminal_of(vec![AminoAcid::Lysine, AminoAcid::Arginine]));
pub static TRYPSIN_P: LazyLock<Protease> = LazyLock::new(|| {
Protease::between_options(
vec![AminoAcid::Lysine, AminoAcid::Arginine],
Protease::get_exclusive(&[AminoAcid::Proline]),
)
});
pub static CHYMOTRYPSIN: LazyLock<Protease> = LazyLock::new(|| {
Protease::between_options(
vec![
AminoAcid::Phenylalanine,
AminoAcid::Tryptophan,
AminoAcid::Tyrosine,
],
Protease::get_exclusive(&[AminoAcid::Proline]),
)
});
pub static PEPSIN: LazyLock<Protease> = LazyLock::new(|| {
Protease::c_terminal_of(vec![
AminoAcid::Phenylalanine,
AminoAcid::Tryptophan,
AminoAcid::Tyrosine,
AminoAcid::Leucine,
])
});
pub static ASPN: LazyLock<Protease> =
LazyLock::new(|| Protease::n_terminal_of(vec![AminoAcid::AsparticAcid]));
pub static GLUC: LazyLock<Protease> =
LazyLock::new(|| Protease::c_terminal_of(vec![AminoAcid::GlutamicAcid]));
pub static LYSC: LazyLock<Protease> =
LazyLock::new(|| Protease::c_terminal_of(vec![AminoAcid::Lysine]));
pub static ARGC: LazyLock<Protease> =
LazyLock::new(|| Protease::c_terminal_of(vec![AminoAcid::Arginine]));
}
#[cfg(test)]
#[allow(clippy::missing_panics_doc)]
mod tests {
    use crate::sequence::{Linear, Peptidoform};

    use super::*;

    /// One digestion scenario: an input sequence together with the expected outcome.
    pub(super) struct ProteaseTestCase {
        /// The peptidoform that is searched for cut sites and then digested.
        pub sequence: Peptidoform<Linear>,
        /// The locations `Protease::match_locations` is expected to report.
        pub expected_cut_sites: Vec<usize>,
        /// The peptides the digestion is expected to produce, in order.
        pub expected_peptides: Vec<Peptidoform<Linear>>,
    }

    /// Check a protease against a single test case.
    ///
    /// First the cut sites are compared, then the sequence is digested with no missed
    /// cleavages and a `4..40` length range and the resulting peptides are compared one by
    /// one. When the number of peptides is off, the found peptides are printed before
    /// panicking to ease debugging.
    pub(super) fn test_protease(protease: &Protease, test_case: &ProteaseTestCase) {
        let cut_sites = protease.match_locations(test_case.sequence.sequence());
        assert_eq!(
            cut_sites, test_case.expected_cut_sites,
            "Incorrect cut sites: found '{cut_sites:?}' expected '{:?}'",
            test_case.expected_cut_sites
        );

        let expected = &test_case.expected_peptides;
        let peptides = test_case.sequence.digest(protease, 0, 4..40);

        if peptides.len() != expected.len() {
            for peptide in &peptides {
                println!("{peptide}");
            }
            panic!("Incorrect number of peptides")
        }

        // The lengths are known to be equal here, so zipping visits every pair
        for (peptide, wanted) in peptides.iter().zip(expected) {
            assert_eq!(
                peptide, wanted,
                "Peptides don't match: found '{peptide}' expected '{}'",
                wanted
            );
        }
    }

    /// Parse a ProForma string into a linear peptidoform, panicking on invalid input.
    fn str_to_peptidoform(str_peptide: &str) -> Peptidoform<Linear> {
        let (peptidoform, _) =
            Peptidoform::pro_forma(str_peptide, &crate::ontology::STATIC_ONTOLOGIES).unwrap();
        peptidoform.into_linear().unwrap()
    }

    /// Build a test case from plain string and index literals.
    fn case(sequence: &str, cut_sites: &[usize], peptides: &[&str]) -> ProteaseTestCase {
        ProteaseTestCase {
            sequence: str_to_peptidoform(sequence),
            expected_cut_sites: cut_sites.to_vec(),
            expected_peptides: peptides.iter().copied().map(str_to_peptidoform).collect(),
        }
    }

    /// Run every given test case against the protease, in order.
    fn check_all(protease: &Protease, test_cases: &[ProteaseTestCase]) {
        for test_case in test_cases {
            test_protease(protease, test_case);
        }
    }

    // Note: this exercises `TRYPSIN_P`, the trypsin definition with the proline rule.
    #[test]
    fn trypsin() {
        let test_cases = [
            case("AKRPGKR", &[2, 6], &["RPGK"]),
            case("ARAKGCVLRPKDGR", &[2, 4, 11], &["GCVLRPK"]),
        ];
        check_all(&known_proteases::TRYPSIN_P, &test_cases);
    }

    #[test]
    fn chymotrypsin() {
        let test_cases = [
            case("AFWYPLGF", &[2, 3], &["YPLGF"]),
            case("AVFUDGWTYPMSR", &[3, 7], &["UDGW", "TYPMSR"]),
        ];
        check_all(&known_proteases::CHYMOTRYPSIN, &test_cases);
    }

    #[test]
    fn pepsin() {
        let test_cases = [
            case("AACVFLPAKLURF", &[5, 6, 10], &["AACVF", "PAKL"]),
            case("GFLPKDLVMSRG", &[2, 3, 7], &["PKDL", "VMSRG"]),
        ];
        check_all(&known_proteases::PEPSIN, &test_cases);
    }

    #[test]
    fn aspn() {
        let test_cases = [
            case("FARDKPGLFD", &[3, 9], &["DKPGLF"]),
            case("PFKDLTMSR", &[3], &["DLTMSR"]),
        ];
        check_all(&known_proteases::ASPN, &test_cases);
    }

    #[test]
    fn gluc() {
        let test_cases = [
            case("FAREDKPGLF", &[4], &["FARE", "DKPGLF"]),
            case("PFKELGTMSR", &[4], &["PFKE", "LGTMSR"]),
        ];
        check_all(&known_proteases::GLUC, &test_cases);
    }

    #[test]
    fn lysc() {
        let test_cases = [
            case("FARKDPGLF", &[4], &["FARK", "DPGLF"]),
            case("PFKDLTKMSR", &[3, 7], &["DLTK"]),
        ];
        check_all(&known_proteases::LYSC, &test_cases);
    }

    #[test]
    fn argc() {
        let test_cases = [
            case("FARKDPGLF", &[3], &["KDPGLF"]),
            case("PFKDLRTMSR", &[6], &["PFKDLR", "TMSR"]),
        ];
        check_all(&known_proteases::ARGC, &test_cases);
    }
}