nale/structs/
sequence.rs

1use seq_io::fasta::{Reader, Record};
2use std::fmt::{Debug, Formatter};
3use std::path::Path;
4
5use crate::alphabet::{AMINO_INVERSE_MAP, UTF8_TO_DIGITAL_AMINO};
6use anyhow::Result;
7use thiserror::Error;
8
9#[derive(Error, Debug)]
10#[error("unknown UTF8 sequence byte: {byte}")]
11pub struct UnknownUtf8SequenceByteError {
12    byte: u8,
13}
14
15#[derive(Error, Debug)]
16#[error("unknown digital sequence byte: {byte}")]
17pub struct UnknownDigitalSequenceByteError {
18    byte: u8,
19}
20
/// A biological sequence stored in two parallel encodings: the "digital"
/// bytes and the UTF8 string bytes.
///
/// The constructors in this module place a padding byte (`255`) at index 0 of
/// both byte vectors, so that sequence position 1 is found at index 1.
pub struct Sequence {
    /// The name of the sequence.
    pub name: String,
    /// The number of residues in the sequence (the padding byte at index 0 is
    /// not counted).
    pub length: usize,
    /// The "digital" encoding of the sequence: the string bytes mapped into
    /// the range `0u8..25u8`.
    pub digital_bytes: Vec<u8>,
    /// The UTF8 bytes that spell the sequence in the "normal" alphabet.
    pub utf8_bytes: Vec<u8>,
}
32
33impl Sequence {
34    pub fn amino_from_fasta<P: AsRef<Path>>(path: P) -> Result<Vec<Self>> {
35        let mut seqs: Vec<Self> = vec![];
36
37        let mut reader = Reader::from_path(path).unwrap();
38
39        while let Some(record) = reader.next() {
40            let record = record.expect("Error reading record");
41            let record_header = String::from_utf8(record.head().to_vec())?;
42            let record_name = record_header.split_whitespace().next().unwrap().to_string();
43            // We want position 1 of the sequence to be at index 1, so we'll buffer with 255
44            let mut utf8_bytes: Vec<u8> = vec![255];
45            let mut digital_bytes: Vec<u8> = vec![255];
46
47            for line in record.seq_lines() {
48                for utf8_byte in line {
49                    utf8_bytes.push(*utf8_byte);
50
51                    let digital_byte = match UTF8_TO_DIGITAL_AMINO.get(utf8_byte) {
52                        Some(b) => b,
53                        None => {
54                            return Err(UnknownUtf8SequenceByteError { byte: *utf8_byte }.into())
55                        }
56                    };
57                    digital_bytes.push(*digital_byte)
58                }
59            }
60
61            seqs.push(Sequence {
62                name: record_name,
63                length: digital_bytes.len() - 1,
64                digital_bytes,
65                utf8_bytes,
66            });
67        }
68        Ok(seqs)
69    }
70
71    pub fn from_digital(bytes: &[u8]) -> Result<Self> {
72        let mut digital_bytes: Vec<u8> = vec![255; bytes.len() + 1];
73        digital_bytes[1..].copy_from_slice(bytes);
74        let mut utf8_bytes: Vec<u8> = vec![255; digital_bytes.len()];
75
76        for (idx, digital_byte) in digital_bytes[1..].iter().enumerate() {
77            let utf8_byte = match AMINO_INVERSE_MAP.get(digital_byte) {
78                Some(b) => *b,
79                None => {
80                    return Err(UnknownDigitalSequenceByteError {
81                        byte: *digital_byte,
82                    }
83                    .into())
84                }
85            };
86            utf8_bytes[idx + 1] = utf8_byte;
87        }
88
89        Ok(Sequence {
90            name: "".to_string(),
91            length: utf8_bytes.len() - 1,
92            digital_bytes,
93            utf8_bytes,
94        })
95    }
96
97    pub fn from_utf8(bytes: &[u8]) -> Result<Self> {
98        let mut utf8_bytes: Vec<u8> = vec![255; bytes.len() + 1];
99        utf8_bytes[1..].copy_from_slice(bytes);
100        let mut digital_bytes: Vec<u8> = vec![255; utf8_bytes.len()];
101
102        for (idx, utf8_byte) in utf8_bytes[1..].iter().enumerate() {
103            let digital_byte = match UTF8_TO_DIGITAL_AMINO.get(utf8_byte) {
104                Some(b) => *b,
105                None => return Err(UnknownUtf8SequenceByteError { byte: *utf8_byte }.into()),
106            };
107            digital_bytes[idx + 1] = digital_byte;
108        }
109
110        Ok(Sequence {
111            name: "".to_string(),
112            length: digital_bytes.len() - 1,
113            digital_bytes,
114            utf8_bytes,
115        })
116    }
117}
118
119impl Debug for Sequence {
120    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
121        write!(f, "{}", std::str::from_utf8(&self.utf8_bytes[1..]).unwrap())?;
122        Ok(())
123    }
124}