// blastdb_sequence_util/protein.rs

/// Forward lookup table: the index is an NCBIstdaa residue code and the value
/// is the matching upper-case IUPAC one-letter byte.
///
/// Slots holding `0xff` have no letter assigned in this table.
const NCBISTDAA_TO_IUPACAA: [u8; 32] = [
    // codes 0..=7
    0xff, b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    // codes 8..=15
    b'H', b'I', b'K', b'L', b'M', b'N', b'P', b'Q',
    // codes 16..=23
    b'R', b'S', b'T', b'V', b'W', b'X', b'Y', b'Z',
    // codes 24..=31
    b'U', 0xff, b'O', b'J', 0xff, 0xff, 0xff, 0xff,
];

/// Builds the reverse lookup table at compile time.
///
/// The result is indexed by an input byte and yields the NCBIstdaa code of the
/// letter that byte spells, accepting both the upper-case and the lower-case
/// ASCII form. Every byte that is not a letter of [`NCBISTDAA_TO_IUPACAA`]
/// maps to `0xff`.
const fn generate_iupacaa_to_ncbistdaa() -> [u8; 256] {
    let mut table = [u8::MAX; 256];
    let mut code = 0;
    // Iterator-based loops are unavailable in `const fn`, hence the manual counter.
    loop {
        if code == NCBISTDAA_TO_IUPACAA.len() {
            break;
        }
        let letter = NCBISTDAA_TO_IUPACAA[code];
        if letter != u8::MAX {
            // `code` is below 32 here, so narrowing it to `u8` is lossless.
            let narrow = code as u8;
            table[letter as usize] = narrow;
            table[letter.to_ascii_lowercase() as usize] = narrow;
        }
        code += 1;
    }
    table
}

/// Reverse lookup table (input byte -> NCBIstdaa code, `0xff` when the byte is
/// not a recognised letter), computed once at compile time.
const IUPACAA_TO_NCBISTDAA: [u8; 256] = generate_iupacaa_to_ncbistdaa();
20
/// Error reported when input cannot be turned into protein residues.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProteinConversionError {
    /// The input byte is not a letter known to the residue lookup table.
    InvalidBase,
}

impl std::fmt::Display for ProteinConversionError {
    /// Writes the variant name, i.e. the same text the `Debug` form produces.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Self::InvalidBase => "InvalidBase",
        };
        f.write_str(name)
    }
}

impl std::error::Error for ProteinConversionError {}
33
34#[repr(transparent)]
36#[derive(PartialEq, Eq, Copy, Clone, Debug)]
37pub struct NcbiStdaaBase(u8);
38
39impl TryFrom<u8> for NcbiStdaaBase {
40 type Error = ProteinConversionError;
41
42 #[inline]
43 fn try_from(value: u8) -> Result<Self, Self::Error> {
44 let base = IUPACAA_TO_NCBISTDAA[value as usize];
45 if base != u8::MAX {
46 Ok(NcbiStdaaBase(base))
47 } else {
48 Err(ProteinConversionError::InvalidBase)
49 }
50 }
51}
52
53impl From<NcbiStdaaBase> for u8 {
54 #[inline]
55 fn from(value: NcbiStdaaBase) -> Self {
56 NCBISTDAA_TO_IUPACAA[value.0 as usize]
57 }
58}
59
60impl From<NcbiStdaaBase> for char {
61 #[inline]
62 fn from(value: NcbiStdaaBase) -> Self {
63 u8::from(value).into()
64 }
65}
66
67#[derive(Clone, PartialEq, PartialOrd, Eq, Ord, Debug)]
69pub struct ProteinSequence {
70 seq: Vec<u8>,
71}
72
73impl ProteinSequence {
74 pub fn iter(&self) -> impl Iterator<Item = NcbiStdaaBase> + '_ {
76 self.seq.iter().copied().map(NcbiStdaaBase)
77 }
78
79 pub fn len(&self) -> usize {
81 self.seq.len()
82 }
83
84 pub fn is_empty(&self) -> bool {
85 self.len() == 0
86 }
87
88 pub fn sequence_bytes(&self) -> &[u8] {
90 &self.seq[..]
91 }
92}
93
94impl FromIterator<NcbiStdaaBase> for ProteinSequence {
95 fn from_iter<T: IntoIterator<Item = NcbiStdaaBase>>(iter: T) -> Self {
96 ProteinSequence {
97 seq: iter.into_iter().map(|b| b.0).collect(),
98 }
99 }
100}
101
102impl std::str::FromStr for ProteinSequence {
105 type Err = ProteinConversionError;
106
107 fn from_str(s: &str) -> Result<Self, Self::Err> {
108 s.as_bytes()
109 .iter()
110 .map(|b| NcbiStdaaBase::try_from(*b))
111 .collect::<Result<_, _>>()
112 }
113}
114
115impl std::fmt::Display for ProteinSequence {
117 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
118 write!(f, "{}", self.iter().map(char::from).collect::<String>())
119 }
120}
121
#[cfg(test)]
mod test {
    // Both names come from the parent module; using `super` for each keeps the
    // tests independent of where this module sits in the crate.
    use super::{ProteinConversionError, ProteinSequence};
    use std::str::FromStr;

    /// Upper-case IUPAC text parses, round-trips, and stores the expected codes.
    #[test]
    fn from_iupac() {
        let seq = ProteinSequence::from_str("AARDVARK").unwrap();
        assert_eq!(seq.len(), 8);
        assert_eq!(seq.to_string(), "AARDVARK");
        assert_eq!(seq.sequence_bytes(), [1, 1, 16, 4, 19, 1, 16, 10]);
    }

    /// Lower-case letters map to the same codes and print back in upper case.
    #[test]
    fn from_iupac_lowercase() {
        let seq = ProteinSequence::from_str("aardvark").unwrap();
        assert_eq!(seq.sequence_bytes(), [1, 1, 16, 4, 19, 1, 16, 10]);
        assert_eq!(seq.to_string(), "AARDVARK");
    }

    /// The empty string is a valid, empty sequence.
    #[test]
    fn from_iupac_empty() {
        let seq = ProteinSequence::from_str("").unwrap();
        assert!(seq.is_empty());
        assert_eq!(seq.len(), 0);
        assert_eq!(seq.to_string(), "");
    }

    /// A byte that is not a residue letter is rejected.
    #[test]
    fn from_iupac_invalid() {
        assert_eq!(
            ProteinSequence::from_str("AARDVARK1"),
            Err(ProteinConversionError::InvalidBase)
        );
    }
}