blastdb_sequence_util/
protein.rs

/// Forward lookup table: the array index is the NCBIstdaa code and the entry
/// is the matching IUPAC amino acid letter as an uppercase ASCII byte.
/// Codes that have no letter in this table hold `0xff`.
///
/// The indices here should match the enum values in AminoAcid
/// (NOTE(review): `AminoAcid` is declared outside this file — confirm the
/// two stay in sync).
const NCBISTDAA_TO_IUPACAA: [u8; 32] = [
    // codes 0..=7
    0xff, b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    // codes 8..=15
    b'H', b'I', b'K', b'L', b'M', b'N', b'P', b'Q',
    // codes 16..=23
    b'R', b'S', b'T', b'V', b'W', b'X', b'Y', b'Z',
    // codes 24..=31
    b'U', 0xff, b'O', b'J', 0xff, 0xff, 0xff, 0xff,
];
6
7const fn generate_iupacaa_to_ncbistdaa() -> [u8; 256] {
8    let mut r = [0xffu8; 256];
9    let mut i = 0;
10    while i < NCBISTDAA_TO_IUPACAA.len() {
11        if NCBISTDAA_TO_IUPACAA[i] != 0xff {
12            r[NCBISTDAA_TO_IUPACAA[i] as usize] = i as u8;
13            r[NCBISTDAA_TO_IUPACAA[i].to_ascii_lowercase() as usize] = i as u8;
14        }
15        i += 1;
16    }
17    r
18}
19const IUPACAA_TO_NCBISTDAA: [u8; 256] = generate_iupacaa_to_ncbistdaa();
20
/// Error returned when a byte cannot be converted into a protein base.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProteinConversionError {
    /// The input byte is not a recognised base.
    InvalidBase,
}

/// Formats the error as the name of its variant.
impl std::fmt::Display for ProteinConversionError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Exhaustive match: adding a variant forces this text to be revisited.
        let name = match self {
            ProteinConversionError::InvalidBase => "InvalidBase",
        };
        f.write_str(name)
    }
}

impl std::error::Error for ProteinConversionError {}
33
/// One protein base stored as a single-byte code.
///
/// Single byte base representation that uses 5 bits per base.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[repr(transparent)]
pub struct NcbiStdaaBase(u8);
38
39impl TryFrom<u8> for NcbiStdaaBase {
40    type Error = ProteinConversionError;
41
42    #[inline]
43    fn try_from(value: u8) -> Result<Self, Self::Error> {
44        let base = IUPACAA_TO_NCBISTDAA[value as usize];
45        if base != u8::MAX {
46            Ok(NcbiStdaaBase(base))
47        } else {
48            Err(ProteinConversionError::InvalidBase)
49        }
50    }
51}
52
53impl From<NcbiStdaaBase> for u8 {
54    #[inline]
55    fn from(value: NcbiStdaaBase) -> Self {
56        NCBISTDAA_TO_IUPACAA[value.0 as usize]
57    }
58}
59
60impl From<NcbiStdaaBase> for char {
61    #[inline]
62    fn from(value: NcbiStdaaBase) -> Self {
63        u8::from(value).into()
64    }
65}
66
/// BLAST database representation of a protein sequence.
///
/// Comparison and ordering are derived from the stored bytes.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct ProteinSequence {
    /// One raw code byte per base, in sequence order.
    seq: Vec<u8>,
}
72
73impl ProteinSequence {
74    /// Returns an iterator over the IUPAC amino acid representation fo the sequence.
75    pub fn iter(&self) -> impl Iterator<Item = NcbiStdaaBase> + '_ {
76        self.seq.iter().copied().map(NcbiStdaaBase)
77    }
78
79    /// Returns the number of bases in the sequence
80    pub fn len(&self) -> usize {
81        self.seq.len()
82    }
83
84    pub fn is_empty(&self) -> bool {
85        self.len() == 0
86    }
87
88    /// Returns the raw sequence as bytes.
89    pub fn sequence_bytes(&self) -> &[u8] {
90        &self.seq[..]
91    }
92}
93
94impl FromIterator<NcbiStdaaBase> for ProteinSequence {
95    fn from_iter<T: IntoIterator<Item = NcbiStdaaBase>>(iter: T) -> Self {
96        ProteinSequence {
97            seq: iter.into_iter().map(|b| b.0).collect(),
98        }
99    }
100}
101
102/// Create a `ProteinSequence` from an IUPAC amino acid sequence.
103/// Accepts both upper and lowercase versions of the bases.
104impl std::str::FromStr for ProteinSequence {
105    type Err = ProteinConversionError;
106
107    fn from_str(s: &str) -> Result<Self, Self::Err> {
108        s.as_bytes()
109            .iter()
110            .map(|b| NcbiStdaaBase::try_from(*b))
111            .collect::<Result<_, _>>()
112    }
113}
114
115/// Render the sequence as an IUPAC amino acid sequence.
116impl std::fmt::Display for ProteinSequence {
117    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
118        write!(f, "{}", self.iter().map(char::from).collect::<String>())
119    }
120}
121
#[cfg(test)]
mod test {
    use super::{ProteinConversionError, ProteinSequence};

    /// A valid IUPAC string parses, renders back to the same text, and is
    /// stored as the expected raw codes.
    #[test]
    fn from_iupac() {
        let parsed: ProteinSequence = "AARDVARK".parse().unwrap();
        assert_eq!(parsed.len(), 8);
        assert_eq!(parsed.to_string(), "AARDVARK");
        assert_eq!(parsed.sequence_bytes(), [1, 1, 16, 4, 19, 1, 16, 10]);
    }

    /// A string containing a byte that is not a base is rejected.
    #[test]
    fn from_iupac_invalid() {
        let outcome = "AARDVARK1".parse::<ProteinSequence>();
        assert_eq!(outcome, Err(ProteinConversionError::InvalidBase));
    }
}