Module bio_seq::codec::iupac

source ·
Expand description

4-bit IUPAC nucleotide ambiguity codes

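IUPAC nucleotide ambiguity codes are represented with 4 bits, one bit for each of the bases A, C, G and T. A bit is set when the corresponding base is a member of the set that the code stands for: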

      ACGT
A     1000
C     0100
G     0010
T     0001
Y     0101
R     1010
W     1001
S     0110
K     0011
M     1100
D     1011
V     1110
H     1101
B     0111
N     1111
X/-   0000

This naturally supports set membership operations:

use bio_seq::prelude::*;

// Set union:
assert_eq!(iupac!("AS-GYTNA") | iupac!("ANTGCAT-"), iupac!("ANTGYWNA"));

// Set intersection:
assert_eq!(iupac!("ACGTSWKM") & iupac!("WKMSTNNA"), iupac!("A----WKA"));

These operations can be used to implement pattern matching:

use bio_seq::prelude::*;

let seq = iupac!("AGCTNNCAGTCGACGTATGTA");
let pattern = iupac!("AYG");

for slice in seq.windows(pattern.len()) {
   if pattern.contains(slice) {
       println!("{slice} matches pattern");
   }
}

// ACG matches pattern
// ATG matches pattern

Enums§