use serde::{Deserialize, Serialize};
use std::fmt::Display;
use std::str::FromStr;
/// Descriptor for one symbol alphabet: its type tag, the two 256-entry
/// lookup tables used to translate symbols, and the width of one code.
pub struct Alphabet {
/// Which alphabet this descriptor represents.
pub alphabet_type: AlphabetType,
/// Table indexed by an input byte; the entry is that byte's code.
pub encoding_array: &'static [u8; 256],
/// Table indexed by a code; the entry is the byte the code stands for.
pub decoding_array: &'static [u8; 256],
/// Width of one encoded symbol in bits (2, 3, 4, 5 or 8 for the alphabets
/// defined in this file).
pub bits_per_symbol: usize,
}
/// Incremental alphabet detector: feed it chunks of a sequence with
/// `update` and read the current answer with `guess`.
pub struct AlphabetGuesser {
/// Most general alphabet required by any byte seen so far.
alphabet_type: AlphabetType,
}
impl AlphabetGuesser {
    /// Creates a guesser that starts from the narrowest alphabet (`Dna2bit`).
    #[allow(clippy::new_without_default)]
    pub fn new() -> Self {
        Self {
            alphabet_type: AlphabetType::Dna2bit,
        }
    }

    /// Folds another chunk of the sequence into the running guess.
    ///
    /// Bytes are compared case-insensitively. The guess only ever widens, and
    /// once it has reached `Ascii` the remaining input is skipped because no
    /// later byte could change the outcome.
    pub fn update(&mut self, sequence: &[u8]) {
        for symbol in sequence.iter().map(u8::to_ascii_uppercase) {
            if self.alphabet_type == AlphabetType::Ascii {
                return;
            }
            let needed = get_minimum_alphabet_for_char(symbol);
            if is_more_general_alphabet(needed, self.alphabet_type) {
                self.alphabet_type = needed;
            }
        }
    }

    /// Returns the narrowest alphabet that covers everything seen so far.
    pub fn guess(&self) -> AlphabetType {
        self.alphabet_type
    }
}
/// Identifies a symbol alphabet. The first five variants are listed from
/// narrowest to widest; `Unknown` is handled like `Ascii` by
/// `bits_per_symbol` and `lookup_alphabet`.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub enum AlphabetType {
/// A, C, G, T at 2 bits per symbol.
Dna2bit,
/// A, C, G, T plus N, R, Y at 3 bits per symbol.
Dna3bit,
/// Nucleotide ambiguity codes at 4 bits per symbol (displayed as "dnaio").
DnaIupac,
/// Amino-acid letters plus `*`, `X`, `-` and `.` at 5 bits per symbol.
Protein,
/// Arbitrary bytes at 8 bits per symbol.
Ascii,
/// Alphabet not determined.
Unknown,
}
impl Display for AlphabetType {
    /// Writes the textual name of the alphabet. Every name is accepted back
    /// by the `FromStr` implementation, which compares case-insensitively.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            AlphabetType::Dna2bit => "dna2bit",
            AlphabetType::Dna3bit => "dna3bit",
            AlphabetType::DnaIupac => "dnaio",
            AlphabetType::Protein => "protein",
            AlphabetType::Ascii => "ASCII",
            AlphabetType::Unknown => "Unknown",
        };
        f.write_str(label)
    }
}
impl FromStr for AlphabetType {
    type Err = ();

    /// Parses an alphabet name, ignoring letter case.
    ///
    /// # Errors
    /// Returns `Err(())` when the lowercased input is not one of the known
    /// names.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let normalized = s.to_lowercase();
        let parsed = match normalized.as_str() {
            "dna2bit" => AlphabetType::Dna2bit,
            "dna3bit" => AlphabetType::Dna3bit,
            "dnaio" => AlphabetType::DnaIupac,
            "protein" => AlphabetType::Protein,
            "ascii" => AlphabetType::Ascii,
            "unknown" => AlphabetType::Unknown,
            _ => return Err(()),
        };
        Ok(parsed)
    }
}
impl AlphabetType {
    /// Number of bits one encoded symbol of this alphabet occupies.
    ///
    /// `Unknown` is given the same width as `Ascii`.
    pub fn bits_per_symbol(&self) -> usize {
        match *self {
            AlphabetType::Dna2bit => 2,
            AlphabetType::Dna3bit => 3,
            AlphabetType::DnaIupac => 4,
            AlphabetType::Protein => 5,
            AlphabetType::Ascii | AlphabetType::Unknown => 8,
        }
    }
}
/// Byte -> 2-bit code table for T, C, A, G in either letter case.
/// Every other byte maps to 0, which is also the code of `T`.
const DNA_2BIT_ENCODING_ARRAY: [u8; 256] = {
    const CODES: [(u8, u8); 4] = [(b'T', 0b00), (b'C', 0b01), (b'A', 0b10), (b'G', 0b11)];
    let mut table = [0u8; 256];
    let mut idx = 0;
    while idx < CODES.len() {
        let (symbol, code) = CODES[idx];
        table[symbol as usize] = code;
        table[symbol.to_ascii_lowercase() as usize] = code;
        idx += 1;
    }
    table
};
/// 2-bit code -> nucleotide table. Indices above 3 decode to `N`.
const DNA_2BIT_DECODING_ARRAY: [u8; 256] = {
    // The position of a letter in this string is its code.
    let symbols: [u8; 4] = *b"TCAG";
    let mut table = [b'N'; 256];
    let mut code = 0;
    while code < symbols.len() {
        table[code] = symbols[code];
        code += 1;
    }
    table
};
/// Byte -> 3-bit code table for A, C, G, T, N, R, Y in either letter case.
/// Every other byte maps to the filler code `0b111`.
const DNA_3BIT_ENCODING_ARRAY: [u8; 256] = {
    // The position of a letter in this string is its code.
    let symbols: [u8; 7] = *b"ACGTNRY";
    let mut table = [0b111u8; 256];
    let mut code = 0;
    while code < symbols.len() {
        let upper = symbols[code];
        table[upper as usize] = code as u8;
        table[upper.to_ascii_lowercase() as usize] = code as u8;
        code += 1;
    }
    table
};
/// 3-bit code -> nucleotide table. Codes 0..=6 decode to A, C, G, T, N, R, Y;
/// every other index decodes to `X`.
const DNA_3BIT_DECODING_ARRAY: [u8; 256] = {
    // The position of a letter in this string is its code.
    let symbols: [u8; 7] = *b"ACGTNRY";
    let mut table = [b'X'; 256];
    let mut code = 0;
    while code < symbols.len() {
        table[code] = symbols[code];
        code += 1;
    }
    table
};
/// Byte -> 4-bit code table for the nucleotide ambiguity letters, in either
/// letter case. `U` shares the code of `T`. `N` is assigned 0, which is also
/// what every unmapped byte yields, so a zero entry alone does not tell
/// whether the byte belongs to the alphabet.
const DNA_IUPAC_ENCODING_ARRAY: [u8; 256] = {
    const CODES: [(u8, u8); 16] = [
        (b'A', 0b0001),
        (b'C', 0b0010),
        (b'G', 0b0100),
        (b'T', 0b1000),
        (b'U', 0b1000),
        (b'R', 0b0101),
        (b'Y', 0b1010),
        (b'S', 0b0110),
        (b'W', 0b1001),
        (b'K', 0b0111),
        (b'M', 0b0011),
        (b'B', 0b1100),
        (b'D', 0b1101),
        (b'H', 0b1110),
        (b'V', 0b1111),
        (b'N', 0b0000),
    ];
    let mut table = [0u8; 256];
    let mut idx = 0;
    while idx < CODES.len() {
        let (symbol, code) = CODES[idx];
        table[symbol as usize] = code;
        table[symbol.to_ascii_lowercase() as usize] = code;
        idx += 1;
    }
    table
};
/// 4-bit code -> nucleotide letter table, the inverse of
/// `DNA_IUPAC_ENCODING_ARRAY`.
///
/// Each code the encoder emits decodes back to the uppercase letter it came
/// from, except that `U` shares the code of `T` and therefore decodes as `T`.
/// Any index the encoder never produces decodes to `N`.
const DNA_IUPAC_DECODING_ARRAY: [u8; 256] = {
    let mut arr = [b'N'; 256];
    arr[0b0000] = b'N';
    arr[0b0001] = b'A';
    arr[0b0010] = b'C';
    arr[0b0011] = b'M';
    arr[0b0100] = b'G';
    arr[0b0101] = b'R';
    arr[0b0110] = b'S';
    arr[0b0111] = b'K';
    arr[0b1000] = b'T';
    arr[0b1001] = b'W';
    arr[0b1010] = b'Y';
    // 0b1011 is not emitted by the encoder, so it keeps the `N` default.
    arr[0b1100] = b'B';
    // D, H and V must sit at the codes the encoder assigns them
    // (0b1101, 0b1110, 0b1111); otherwise D decodes as H and H as V.
    arr[0b1101] = b'D';
    arr[0b1110] = b'H';
    arr[0b1111] = b'V';
    arr
};
/// Byte -> 5-bit code table for amino-acid letters (either letter case) plus
/// `*`, `X`, `-` and `.`. `A` is assigned 0, which is also what every
/// unmapped byte yields, so a zero entry alone does not tell whether the byte
/// belongs to the alphabet.
const PROTEIN_ENCODING_ARRAY: [u8; 256] = {
    // The position of a symbol in this string is its code.
    let symbols: [u8; 24] = *b"ACDEFGHIKLMNPQRSTVWY*X-.";
    let mut table = [0u8; 256];
    let mut code = 0;
    while code < symbols.len() {
        let symbol = symbols[code];
        table[symbol as usize] = code as u8;
        // Lowercasing leaves the punctuation symbols unchanged, so they are
        // simply written a second time with the same code.
        table[symbol.to_ascii_lowercase() as usize] = code as u8;
        code += 1;
    }
    table
};
/// 5-bit code -> protein symbol table. Codes 0..=23 decode to the symbols of
/// the protein alphabet; every other index decodes to `X`.
const PROTEIN_DECODING_ARRAY: [u8; 256] = {
    // The position of a symbol in this string is its code.
    let symbols: [u8; 24] = *b"ACDEFGHIKLMNPQRSTVWY*X-.";
    let mut table = [b'X'; 256];
    let mut code = 0;
    while code < symbols.len() {
        table[code] = symbols[code];
        code += 1;
    }
    table
};
/// Builds the identity table in which entry `i` holds the value `i`.
const fn const_u8_array() -> [u8; 256] {
    let mut identity = [0u8; 256];
    // Walk the byte range downwards so the counter itself can stay a `u8`.
    let mut value = u8::MAX;
    loop {
        identity[value as usize] = value;
        if value == 0 {
            break;
        }
        value -= 1;
    }
    identity
}
// Identity table (entry `i` is `i`). `ASCII_ALPHABET` uses it for both
// encoding and decoding.
const ASCII_ENCODING_ARRAY: [u8; 256] = const_u8_array();
/// Descriptor for the 3-bit DNA alphabet (A, C, G, T, N, R, Y).
pub const DNA_3BIT_ALPHABET: Alphabet = Alphabet {
alphabet_type: AlphabetType::Dna3bit,
bits_per_symbol: 3,
encoding_array: &DNA_3BIT_ENCODING_ARRAY,
decoding_array: &DNA_3BIT_DECODING_ARRAY,
};
/// Descriptor for the 2-bit DNA alphabet (A, C, G, T).
pub const DNA_2BIT_ALPHABET: Alphabet = Alphabet {
alphabet_type: AlphabetType::Dna2bit,
bits_per_symbol: 2,
encoding_array: &DNA_2BIT_ENCODING_ARRAY,
decoding_array: &DNA_2BIT_DECODING_ARRAY,
};
/// Descriptor for the 4-bit nucleotide ambiguity-code alphabet.
pub const DNA_IUPAC_ALPHABET: Alphabet = Alphabet {
alphabet_type: AlphabetType::DnaIupac,
bits_per_symbol: 4,
encoding_array: &DNA_IUPAC_ENCODING_ARRAY,
decoding_array: &DNA_IUPAC_DECODING_ARRAY,
};
/// Descriptor for the 5-bit protein alphabet.
pub const PROTEIN_ALPHABET: Alphabet = Alphabet {
alphabet_type: AlphabetType::Protein,
bits_per_symbol: 5,
encoding_array: &PROTEIN_ENCODING_ARRAY,
decoding_array: &PROTEIN_DECODING_ARRAY,
};
/// Descriptor for raw 8-bit symbols. Encoding and decoding share the same
/// identity table, so bytes pass through unchanged.
pub const ASCII_ALPHABET: Alphabet = Alphabet {
alphabet_type: AlphabetType::Ascii,
bits_per_symbol: 8,
encoding_array: &ASCII_ENCODING_ARRAY,
decoding_array: &ASCII_ENCODING_ARRAY,
};
/// Returns the static descriptor for `alphabet_type`.
///
/// `Unknown` has no tables of its own and resolves to the ASCII descriptor.
pub fn lookup_alphabet(alphabet_type: &AlphabetType) -> &'static Alphabet {
    match *alphabet_type {
        AlphabetType::Dna2bit => &DNA_2BIT_ALPHABET,
        AlphabetType::Dna3bit => &DNA_3BIT_ALPHABET,
        AlphabetType::DnaIupac => &DNA_IUPAC_ALPHABET,
        AlphabetType::Protein => &PROTEIN_ALPHABET,
        AlphabetType::Ascii | AlphabetType::Unknown => &ASCII_ALPHABET,
    }
}
/// Approximate, single-pass guess of the alphabet needed for `sequence`.
///
/// The candidate starts at `Dna2bit`. A byte that does not fit the current
/// candidate promotes it by exactly one level, and that byte is not
/// re-checked against the new level. The result can therefore be narrower
/// than what `guess_alphabet` reports: `b"ACTGM"` yields `Dna3bit` here but
/// `DnaIupac` there. Letter case is ignored.
///
/// Scanning stops as soon as the candidate reaches `Ascii`.
pub fn guess_alphabet_fast(sequence: &[u8]) -> AlphabetType {
    let mut smallest_alphabet = AlphabetType::Dna2bit;
    for &byte in sequence {
        let byte_upper = byte.to_ascii_uppercase();
        match smallest_alphabet {
            AlphabetType::Dna2bit => {
                if !matches!(byte_upper, b'A' | b'C' | b'G' | b'T') {
                    smallest_alphabet = AlphabetType::Dna3bit;
                }
            }
            AlphabetType::Dna3bit => {
                if !matches!(byte_upper, b'A' | b'C' | b'G' | b'T' | b'N' | b'R' | b'Y') {
                    smallest_alphabet = AlphabetType::DnaIupac;
                }
            }
            AlphabetType::DnaIupac => {
                // `N` is a member whose code is 0, the same value the table
                // holds for unmapped bytes, so it needs an explicit exemption.
                if DNA_IUPAC_ENCODING_ARRAY[byte as usize] == 0 && byte_upper != b'N' {
                    smallest_alphabet = AlphabetType::Protein;
                }
            }
            AlphabetType::Protein => {
                // `A` is the protein symbol whose code is 0, the same value
                // the table holds for unmapped bytes. Without this exemption
                // any alanine seen in the protein state would be reported as
                // ASCII. `-` and `*` have non-zero codes and need none.
                if PROTEIN_ENCODING_ARRAY[byte as usize] == 0 && byte_upper != b'A' {
                    smallest_alphabet = AlphabetType::Ascii;
                    break;
                }
            }
            // Nothing is wider than these, so there is nothing left to learn.
            AlphabetType::Ascii | AlphabetType::Unknown => break,
        }
    }
    smallest_alphabet
}
/// Returns the narrowest alphabet that can represent every byte of
/// `sequence`, ignoring letter case.
///
/// An empty sequence yields `Dna2bit`. Scanning ends early once a byte that
/// requires `Ascii` is found, since nothing wider exists.
pub fn guess_alphabet(sequence: &[u8]) -> AlphabetType {
    let mut widest = AlphabetType::Dna2bit;
    for symbol in sequence.iter().map(u8::to_ascii_uppercase) {
        let needed = get_minimum_alphabet_for_char(symbol);
        if is_more_general_alphabet(needed, widest) {
            if needed == AlphabetType::Ascii {
                return needed;
            }
            widest = needed;
        }
    }
    widest
}
/// Returns the narrowest alphabet that contains `byte`.
///
/// The alphabets are tried from narrowest to widest and the first one that
/// contains the byte wins. Callers in this file pass an uppercased byte.
fn get_minimum_alphabet_for_char(byte: u8) -> AlphabetType {
    let index = usize::from(byte);
    match byte {
        b'A' | b'C' | b'G' | b'T' => AlphabetType::Dna2bit,
        b'N' | b'R' | b'Y' => AlphabetType::Dna3bit,
        // A non-zero entry means the byte has a code in that table.
        _ if DNA_IUPAC_ENCODING_ARRAY[index] != 0 => AlphabetType::DnaIupac,
        b'-' | b'*' => AlphabetType::Protein,
        _ if PROTEIN_ENCODING_ARRAY[index] != 0 => AlphabetType::Protein,
        _ => AlphabetType::Ascii,
    }
}
/// Returns `true` when alphabet `a` is strictly more general than `b`.
///
/// Generality grows in the order `Dna2bit`, `Dna3bit`, `DnaIupac`, `Protein`,
/// `Ascii`; `Unknown` is ranked together with `Ascii`.
fn is_more_general_alphabet(a: AlphabetType, b: AlphabetType) -> bool {
    fn rank(alphabet: AlphabetType) -> u8 {
        match alphabet {
            AlphabetType::Dna2bit => 0,
            AlphabetType::Dna3bit => 1,
            AlphabetType::DnaIupac => 2,
            AlphabetType::Protein => 3,
            AlphabetType::Ascii | AlphabetType::Unknown => 4,
        }
    }
    rank(a) > rank(b)
}
#[cfg(test)]
mod tests {
    use super::{AlphabetGuesser, AlphabetType, guess_alphabet, guess_alphabet_fast};

    /// Asserts that `guess_alphabet` reports `expected` for `sequence`,
    /// attributing a failure to the calling line.
    #[track_caller]
    fn assert_guess(sequence: &[u8], expected: AlphabetType) {
        assert_eq!(guess_alphabet(sequence), expected);
    }

    #[test]
    fn test_guess_alphabet() {
        assert_guess(b"ACGT", AlphabetType::Dna2bit);
        assert_guess(b"ACGTNRY", AlphabetType::Dna3bit);
        assert_guess(b"ACGTRYMK", AlphabetType::DnaIupac);
        assert_guess(b"EFILPQ", AlphabetType::Protein);
        assert_guess(b"Hello, World!", AlphabetType::Ascii);
        assert_guess(b"ACTGEG", AlphabetType::Protein);
        assert_guess(b"ACTGM", AlphabetType::DnaIupac);
        assert_guess(b"ACGTSKWV", AlphabetType::DnaIupac);
        assert_guess(b"ACGTE", AlphabetType::Protein);
        assert_guess(b"ACGT*", AlphabetType::Protein);
        assert_guess(b"ACGT-", AlphabetType::Protein);
        assert_guess(b"actgEFIL", AlphabetType::Protein);
        assert_guess(b"ACGT123", AlphabetType::Ascii);
        assert_guess(b"ACGT@#$", AlphabetType::Ascii);
        assert_guess(b"A", AlphabetType::Dna2bit);
        assert_guess(b"N", AlphabetType::Dna3bit);
        assert_guess(b"M", AlphabetType::DnaIupac);
        assert_guess(b"E", AlphabetType::Protein);
        assert_guess(b"1", AlphabetType::Ascii);

        // The fast variant promotes by a single level per mismatching byte,
        // so the same input gives a narrower answer than the exact variant.
        assert_eq!(guess_alphabet_fast(b"ACTGM"), AlphabetType::Dna3bit);
        assert_guess(b"ACTGM", AlphabetType::DnaIupac);
    }

    #[test]
    fn test_alphabet_guesser_matches_guess_alphabet() {
        let sequences: &[&[u8]] = &[
            b"ACGT",
            b"ACGTNRY",
            b"ACGTRYMK",
            b"EFILPQ",
            b"Hello, World!",
            b"ACTGEG",
            b"ACTGM",
            b"ACGTSKWV",
            b"ACGTE",
            b"ACGT*",
            b"ACGT-",
            b"actgEFIL",
            b"ACGT123",
            b"ACGT@#$",
            b"A",
            b"N",
            b"M",
            b"E",
            b"1",
        ];
        for &sequence in sequences {
            let mut guesser = AlphabetGuesser::new();
            guesser.update(sequence);
            assert_eq!(
                guesser.guess(),
                guess_alphabet(sequence),
                "AlphabetGuesser and guess_alphabet disagree on sequence: {:?}",
                std::str::from_utf8(sequence).unwrap_or("(invalid UTF-8)")
            );
        }
    }

    #[test]
    fn test_alphabet_guesser_streaming() {
        // Each chunk widens the running guess by one level.
        let steps: [(&[u8], AlphabetType); 5] = [
            (b"ACGT", AlphabetType::Dna2bit),
            (b"NRY", AlphabetType::Dna3bit),
            (b"MKS", AlphabetType::DnaIupac),
            (b"EFIL", AlphabetType::Protein),
            (b"123", AlphabetType::Ascii),
        ];
        let mut guesser = AlphabetGuesser::new();
        for (chunk, expected) in steps {
            guesser.update(chunk);
            assert_eq!(guesser.guess(), expected);
        }

        let full_sequence = b"ACGTNRYMSKEFILP123";
        assert_eq!(guesser.guess(), guess_alphabet(full_sequence));
    }
}