1use seq_io::fasta::{Reader, Record};
2use std::fmt::{Debug, Formatter};
3use std::path::Path;
4
5use crate::alphabet::{AMINO_INVERSE_MAP, UTF8_TO_DIGITAL_AMINO};
6use anyhow::Result;
7use thiserror::Error;
8
/// Error produced when a text byte has no entry in `UTF8_TO_DIGITAL_AMINO`.
///
/// The rendered message includes the offending byte as a decimal number
/// (it is a `u8` formatted with `{byte}`), not as a character.
#[derive(Error, Debug)]
#[error("unknown UTF8 sequence byte: {byte}")]
pub struct UnknownUtf8SequenceByteError {
    /// The text byte that could not be translated.
    byte: u8,
}
14
/// Error produced when a digital byte has no entry in `AMINO_INVERSE_MAP`.
///
/// The rendered message includes the offending byte as a decimal number.
#[derive(Error, Debug)]
#[error("unknown digital sequence byte: {byte}")]
pub struct UnknownDigitalSequenceByteError {
    /// The digital byte that could not be translated back to text.
    byte: u8,
}
20
/// A sequence held in two parallel encodings: text bytes and "digital" bytes.
///
/// The constructors in this file store the placeholder value 255 at index 0
/// of both byte vectors, so the sequence itself occupies indices `1..=length`.
/// All fields are public, so code that builds a `Sequence` by hand is
/// responsible for keeping that layout.
pub struct Sequence {
    /// Sequence name. When read from FASTA this is the first
    /// whitespace-delimited token of the record header; `from_digital` and
    /// `from_utf8` leave it empty.
    pub name: String,
    /// Number of sequence bytes, not counting the index-0 placeholder.
    pub length: usize,
    /// Digital encoding: the placeholder followed by one byte per residue,
    /// as mapped by `UTF8_TO_DIGITAL_AMINO`.
    pub digital_bytes: Vec<u8>,
    /// Text encoding: the placeholder followed by the sequence's text bytes.
    pub utf8_bytes: Vec<u8>,
}
32
33impl Sequence {
34 pub fn amino_from_fasta<P: AsRef<Path>>(path: P) -> Result<Vec<Self>> {
35 let mut seqs: Vec<Self> = vec![];
36
37 let mut reader = Reader::from_path(path).unwrap();
38
39 while let Some(record) = reader.next() {
40 let record = record.expect("Error reading record");
41 let record_header = String::from_utf8(record.head().to_vec())?;
42 let record_name = record_header.split_whitespace().next().unwrap().to_string();
43 let mut utf8_bytes: Vec<u8> = vec![255];
45 let mut digital_bytes: Vec<u8> = vec![255];
46
47 for line in record.seq_lines() {
48 for utf8_byte in line {
49 utf8_bytes.push(*utf8_byte);
50
51 let digital_byte = match UTF8_TO_DIGITAL_AMINO.get(utf8_byte) {
52 Some(b) => b,
53 None => {
54 return Err(UnknownUtf8SequenceByteError { byte: *utf8_byte }.into())
55 }
56 };
57 digital_bytes.push(*digital_byte)
58 }
59 }
60
61 seqs.push(Sequence {
62 name: record_name,
63 length: digital_bytes.len() - 1,
64 digital_bytes,
65 utf8_bytes,
66 });
67 }
68 Ok(seqs)
69 }
70
71 pub fn from_digital(bytes: &[u8]) -> Result<Self> {
72 let mut digital_bytes: Vec<u8> = vec![255; bytes.len() + 1];
73 digital_bytes[1..].copy_from_slice(bytes);
74 let mut utf8_bytes: Vec<u8> = vec![255; digital_bytes.len()];
75
76 for (idx, digital_byte) in digital_bytes[1..].iter().enumerate() {
77 let utf8_byte = match AMINO_INVERSE_MAP.get(digital_byte) {
78 Some(b) => *b,
79 None => {
80 return Err(UnknownDigitalSequenceByteError {
81 byte: *digital_byte,
82 }
83 .into())
84 }
85 };
86 utf8_bytes[idx + 1] = utf8_byte;
87 }
88
89 Ok(Sequence {
90 name: "".to_string(),
91 length: utf8_bytes.len() - 1,
92 digital_bytes,
93 utf8_bytes,
94 })
95 }
96
97 pub fn from_utf8(bytes: &[u8]) -> Result<Self> {
98 let mut utf8_bytes: Vec<u8> = vec![255; bytes.len() + 1];
99 utf8_bytes[1..].copy_from_slice(bytes);
100 let mut digital_bytes: Vec<u8> = vec![255; utf8_bytes.len()];
101
102 for (idx, utf8_byte) in utf8_bytes[1..].iter().enumerate() {
103 let digital_byte = match UTF8_TO_DIGITAL_AMINO.get(utf8_byte) {
104 Some(b) => *b,
105 None => return Err(UnknownUtf8SequenceByteError { byte: *utf8_byte }.into()),
106 };
107 digital_bytes[idx + 1] = digital_byte;
108 }
109
110 Ok(Sequence {
111 name: "".to_string(),
112 length: digital_bytes.len() - 1,
113 digital_bytes,
114 utf8_bytes,
115 })
116 }
117}
118
119impl Debug for Sequence {
120 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
121 write!(f, "{}", std::str::from_utf8(&self.utf8_bytes[1..]).unwrap())?;
122 Ok(())
123 }
124}