na_seq/
lib.rs

1use std::{io, io::ErrorKind};
2
3use bincode::{Decode, Encode};
4
5use crate::Nucleotide::*;
6pub use crate::{
7    amino_acids::{AaIdent, AminoAcid, CodingResult},
8    element::{AtomTypeInRes, Element},
9    nucleotide::{Nucleotide, NucleotideGeneral},
10    restriction_enzyme::RestrictionEnzyme,
11};
12
13pub mod amino_acids;
14pub mod element;
15pub mod ligation;
16pub mod nucleotide;
17pub mod re_lib;
18pub mod restriction_enzyme;
19
20// Index 0: 5' end.
21pub type Seq = Vec<Nucleotide>;
22
23pub struct IndexError {}
24
25/// Reverse direction, and swap C for G, A for T.
26pub fn seq_complement(seq: &[Nucleotide]) -> Seq {
27    let mut result = seq.to_vec();
28    result.reverse();
29
30    for nt in &mut result {
31        *nt = nt.complement();
32    }
33
34    result
35}
36
37/// Create a nucleotide sequence from a string. (Case insensitive)
38pub fn seq_from_str(str: &str) -> Seq {
39    let mut result = Vec::new();
40
41    for char in str.to_lowercase().chars() {
42        match char {
43            'a' => result.push(A),
44            't' => result.push(T),
45            'c' => result.push(C),
46            'g' => result.push(G),
47            _ => (),
48        };
49    }
50
51    result
52}
53
54/// Create an amino-acid sequence from a string of single-letter identifiers. (Case insensitive)
55pub fn seq_aa_from_str(str: &str) -> Vec<AminoAcid> {
56    let mut result = Vec::new();
57
58    for char in str.chars() {
59        let letter = char.to_string(); // Convert `char` to `String`
60        if let Ok(aa) = letter.parse::<AminoAcid>() {
61            result.push(aa);
62        }
63    }
64
65    result
66}
67
68/// Convert a nucleotide sequence to string.
69pub fn seq_to_str_lower(seq: &[Nucleotide]) -> String {
70    let mut result = String::new();
71
72    for nt in seq {
73        result.push_str(&nt.to_str_lower());
74    }
75
76    result
77}
78
79/// Convert a nucleotide sequence to string.
80pub fn seq_to_str_upper(seq: &[Nucleotide]) -> String {
81    let mut result = String::new();
82
83    for nt in seq {
84        result.push_str(&nt.to_str_upper());
85    }
86
87    result
88}
89
90/// Convert an amino acid sequence to string of single-letter idents.
91pub fn seq_aa_to_str(seq: &[AminoAcid]) -> String {
92    let mut result = String::new();
93
94    for aa in seq {
95        result.push_str(&aa.to_str(AaIdent::OneLetter));
96    }
97
98    result
99}
100
101/// Convert a sequence to bytes associated with UTF-8 letters. For compatibility with external libraries.
102pub fn seq_to_u8_upper(seq: &[Nucleotide]) -> Vec<u8> {
103    seq.iter().map(|nt| nt.to_u8_upper()).collect()
104}
105
106/// Convert a sequence of amino acids to bytes associated with UTF-8 letters. For compatibility with external libraries.
107pub fn seq_to_u8_lower(seq: &[Nucleotide]) -> Vec<u8> {
108    seq.iter().map(|nt| nt.to_u8_lower()).collect()
109}
110
111/// Convert a sequence of amino acids to bytes associated with UTF-8 letters. For compatibility with external libraries.
112pub fn seq_aa_to_u8_upper(seq: &[AminoAcid]) -> Vec<u8> {
113    seq.iter().map(|aa| aa.to_u8_upper()).collect()
114}
115
116/// Convert a string to bytes associated with UTF-8 letters. For compatibility with external libraries.
117pub fn seq_aa_to_u8_lower(seq: &[AminoAcid]) -> Vec<u8> {
118    seq.iter().map(|aa| aa.to_u8_lower()).collect()
119}
120
121/// Sequence weight, in Daltons. Assumes single-stranded.
122pub fn seq_weight(seq: &[Nucleotide]) -> f32 {
123    let mut result = 0.;
124
125    for nt in seq {
126        result += nt.weight();
127    }
128
129    result -= 61.96;
130
131    result
132}
133
134/// Calculate portion of a sequence that is either the G or C nucleotide, on a scale of 0 to 1.
135pub fn calc_gc(seq: &[Nucleotide]) -> f32 {
136    let num_gc = seq.iter().filter(|&&nt| nt == C || nt == G).count();
137    num_gc as f32 / seq.len() as f32
138}
139
140/// A compact binary serialization of our sequence. Useful for file storage.
141/// The first four bytes is sequence length, big endian; we need this, since one of our nucleotides necessarily serializes
142/// to 0b00.
143///
144/// MSB. Nucleotides are right-to-left in a given byte. Example: A byte containing
145/// nucleotides TCAG is `0b1110_0100`.
146pub fn serialize_seq_bin(seq: &[Nucleotide]) -> Vec<u8> {
147    let mut result = Vec::new();
148    result.extend(&(seq.len() as u32).to_be_bytes());
149
150    for i in 0..seq.len() / 4 + 1 {
151        let mut val = 0;
152        for j in 0..4 {
153            let ind = i * 4 + j;
154            if ind + 1 > seq.len() {
155                break;
156            }
157            let nt = seq[ind];
158            val |= (nt as u8) << (j * 2);
159        }
160        result.push(val);
161    }
162    result
163}
164
165/// A compact binary deserialization of our sequence. Useful for file storage.
166/// The first four bytes is sequence length, big endian; we need this, since one of our nucleotides necessarily serializes
167/// to 0b00.
168pub fn deser_seq_bin(data: &[u8]) -> io::Result<Seq> {
169    let mut result = Vec::new();
170
171    if data.len() < 4 {
172        return Err(io::Error::new(
173            ErrorKind::InvalidData,
174            "Bin nucleotide sequence is too short.",
175        ));
176    }
177
178    let seq_len = u32::from_be_bytes(data[0..4].try_into().unwrap()) as usize;
179
180    for byte in &data[4..] {
181        for i in 0..4 {
182            // This trimming removes extra 00-serialized nucleotides.
183            if result.len() >= seq_len {
184                break;
185            }
186
187            let bits = (byte >> (2 * i)) & 0b11;
188            result.push(Nucleotide::try_from(bits).map_err(|_| {
189                io::Error::new(
190                    ErrorKind::InvalidData,
191                    format!("Invalid NT serialization: {}, {}", byte, bits),
192                )
193            })?);
194        }
195    }
196
197    Ok(result)
198}
199
200#[derive(Clone, Copy, PartialEq, Debug, Encode, Decode)]
201pub enum SeqTopology {
202    Linear,
203    Circular,
204}
205
206impl Default for SeqTopology {
207    fn default() -> Self {
208        Self::Circular
209    }
210}
211
212/// Insert a segment of one sequence into another. For example, for cloning.
213/// Note that `insert_loc` uses 1-based indexing.
214pub fn insert_into_seq(
215    seq_vector: &mut Seq,
216    insert: &[Nucleotide],
217    insert_loc: usize,
218) -> Result<(), IndexError> {
219    if insert_loc == 0 || insert_loc > seq_vector.len() {
220        eprintln!("Error: Insert location out of bounds: {insert_loc}");
221        return Err(IndexError {});
222    }
223
224    let insert_i = insert_loc - 1; // 1-based indexing.
225    seq_vector.splice(insert_i..insert_i, insert.iter().cloned());
226
227    Ok(())
228}