na_seq/
lib.rs

1use std::{io, io::ErrorKind};
2
3use bincode::{Decode, Encode};
4
5use crate::Nucleotide::*;
6pub use crate::{
7    amino_acids::{
8        AaIdent, AminoAcid, AminoAcidGeneral, AminoAcidProtenationVariant, CodingResult,
9    },
10    element::{AtomTypeInRes, Element},
11    nucleotide::{Nucleotide, NucleotideGeneral},
12    restriction_enzyme::RestrictionEnzyme,
13};
14
15pub mod amino_acids;
16pub mod element;
17pub mod ligation;
18pub mod nucleotide;
19pub mod re_lib;
20pub mod restriction_enzyme;
21
22// Index 0: 5' end.
23pub type Seq = Vec<Nucleotide>;
24
25pub struct IndexError {}
26
27/// Reverse direction, and swap C for G, A for T.
28pub fn seq_complement(seq: &[Nucleotide]) -> Seq {
29    let mut result = seq.to_vec();
30    result.reverse();
31
32    for nt in &mut result {
33        *nt = nt.complement();
34    }
35
36    result
37}
38
39/// Create a nucleotide sequence from a string. (Case insensitive)
40pub fn seq_from_str(str: &str) -> Seq {
41    let mut result = Vec::new();
42
43    for char in str.to_lowercase().chars() {
44        match char {
45            'a' => result.push(A),
46            't' => result.push(T),
47            'c' => result.push(C),
48            'g' => result.push(G),
49            _ => (),
50        };
51    }
52
53    result
54}
55
56/// Create an amino-acid sequence from a string of single-letter identifiers. (Case insensitive)
57pub fn seq_aa_from_str(str: &str) -> Vec<AminoAcid> {
58    let mut result = Vec::new();
59
60    for char in str.chars() {
61        let letter = char.to_string(); // Convert `char` to `String`
62        if let Ok(aa) = letter.parse::<AminoAcid>() {
63            result.push(aa);
64        }
65    }
66
67    result
68}
69
70/// Convert a nucleotide sequence to string.
71pub fn seq_to_str_lower(seq: &[Nucleotide]) -> String {
72    let mut result = String::new();
73
74    for nt in seq {
75        result.push_str(&nt.to_str_lower());
76    }
77
78    result
79}
80
81/// Convert a nucleotide sequence to string.
82pub fn seq_to_str_upper(seq: &[Nucleotide]) -> String {
83    let mut result = String::new();
84
85    for nt in seq {
86        result.push_str(&nt.to_str_upper());
87    }
88
89    result
90}
91
92/// Convert an amino acid sequence to string of single-letter idents.
93pub fn seq_aa_to_str(seq: &[AminoAcid]) -> String {
94    let mut result = String::new();
95
96    for aa in seq {
97        result.push_str(&aa.to_str(AaIdent::OneLetter));
98    }
99
100    result
101}
102
103/// Convert a sequence to bytes associated with UTF-8 letters. For compatibility with external libraries.
104pub fn seq_to_u8_upper(seq: &[Nucleotide]) -> Vec<u8> {
105    seq.iter().map(|nt| nt.to_u8_upper()).collect()
106}
107
108/// Convert a sequence of amino acids to bytes associated with UTF-8 letters. For compatibility with external libraries.
109pub fn seq_to_u8_lower(seq: &[Nucleotide]) -> Vec<u8> {
110    seq.iter().map(|nt| nt.to_u8_lower()).collect()
111}
112
113/// Convert a sequence of amino acids to bytes associated with UTF-8 letters. For compatibility with external libraries.
114pub fn seq_aa_to_u8_upper(seq: &[AminoAcid]) -> Vec<u8> {
115    seq.iter().map(|aa| aa.to_u8_upper()).collect()
116}
117
118/// Convert a string to bytes associated with UTF-8 letters. For compatibility with external libraries.
119pub fn seq_aa_to_u8_lower(seq: &[AminoAcid]) -> Vec<u8> {
120    seq.iter().map(|aa| aa.to_u8_lower()).collect()
121}
122
123/// Sequence weight, in Daltons. Assumes single-stranded.
124pub fn seq_weight(seq: &[Nucleotide]) -> f32 {
125    let mut result = 0.;
126
127    for nt in seq {
128        result += nt.weight();
129    }
130
131    result -= 61.96;
132
133    result
134}
135
136/// Calculate portion of a sequence that is either the G or C nucleotide, on a scale of 0 to 1.
137pub fn calc_gc(seq: &[Nucleotide]) -> f32 {
138    let num_gc = seq.iter().filter(|&&nt| nt == C || nt == G).count();
139    num_gc as f32 / seq.len() as f32
140}
141
142/// A compact binary serialization of our sequence. Useful for file storage.
143/// The first four bytes is sequence length, big endian; we need this, since one of our nucleotides necessarily serializes
144/// to 0b00.
145///
146/// MSB. Nucleotides are right-to-left in a given byte. Example: A byte containing
147/// nucleotides TCAG is `0b1110_0100`.
148pub fn serialize_seq_bin(seq: &[Nucleotide]) -> Vec<u8> {
149    let mut result = Vec::new();
150    result.extend(&(seq.len() as u32).to_be_bytes());
151
152    for i in 0..seq.len() / 4 + 1 {
153        let mut val = 0;
154        for j in 0..4 {
155            let ind = i * 4 + j;
156            if ind + 1 > seq.len() {
157                break;
158            }
159            let nt = seq[ind];
160            val |= (nt as u8) << (j * 2);
161        }
162        result.push(val);
163    }
164    result
165}
166
167/// A compact binary deserialization of our sequence. Useful for file storage.
168/// The first four bytes is sequence length, big endian; we need this, since one of our nucleotides necessarily serializes
169/// to 0b00.
170pub fn deser_seq_bin(data: &[u8]) -> io::Result<Seq> {
171    let mut result = Vec::new();
172
173    if data.len() < 4 {
174        return Err(io::Error::new(
175            ErrorKind::InvalidData,
176            "Bin nucleotide sequence is too short.",
177        ));
178    }
179
180    let seq_len = u32::from_be_bytes(data[0..4].try_into().unwrap()) as usize;
181
182    for byte in &data[4..] {
183        for i in 0..4 {
184            // This trimming removes extra 00-serialized nucleotides.
185            if result.len() >= seq_len {
186                break;
187            }
188
189            let bits = (byte >> (2 * i)) & 0b11;
190            result.push(Nucleotide::try_from(bits).map_err(|_| {
191                io::Error::new(
192                    ErrorKind::InvalidData,
193                    format!("Invalid NT serialization: {}, {}", byte, bits),
194                )
195            })?);
196        }
197    }
198
199    Ok(result)
200}
201
202#[derive(Clone, Copy, PartialEq, Debug, Encode, Decode)]
203pub enum SeqTopology {
204    Linear,
205    Circular,
206}
207
208impl Default for SeqTopology {
209    fn default() -> Self {
210        Self::Circular
211    }
212}
213
214/// Insert a segment of one sequence into another. For example, for cloning.
215/// Note that `insert_loc` uses 1-based indexing.
216pub fn insert_into_seq(
217    seq_vector: &mut Seq,
218    insert: &[Nucleotide],
219    insert_loc: usize,
220) -> Result<(), IndexError> {
221    if insert_loc == 0 || insert_loc > seq_vector.len() {
222        eprintln!("Error: Insert location out of bounds: {insert_loc}");
223        return Err(IndexError {});
224    }
225
226    let insert_i = insert_loc - 1; // 1-based indexing.
227    seq_vector.splice(insert_i..insert_i, insert.iter().cloned());
228
229    Ok(())
230}