use crate::bitmap;
use crate::constants::MASK_SIZE;
use crate::types::*;
pub mod encoded;
pub mod io;
pub mod processing;
use crate::bitmap::{set_bit, test_bit, toggle_bit};
use rayon::prelude::*;
pub use io::*;
pub use processing::*;
use wide::CmpEq;
use wide::u8x32;
#[must_use]
pub const fn char_to_nuc(c: u8) -> u8 {
match c.to_ascii_uppercase() {
b'A' => 0,
b'C' => 1,
b'G' => 2,
b'T' | b'U' => 3,
_ => 4,
}
}
#[must_use]
pub fn is_a(encoded_sequence: &[u8], n: usize) -> bool {
let bit_index = n * 2;
!(test_bit(encoded_sequence, bit_index) || test_bit(encoded_sequence, bit_index + 1))
}
#[must_use]
pub fn is_c(encoded_sequence: &[u8], n: usize) -> bool {
let bit_index = n * 2;
!test_bit(encoded_sequence, bit_index) && test_bit(encoded_sequence, bit_index + 1)
}
#[must_use]
pub fn is_g(encoded_sequence: &[u8], n: usize) -> bool {
let bit_index = n * 2;
test_bit(encoded_sequence, bit_index) && !test_bit(encoded_sequence, bit_index + 1)
}
#[must_use]
pub fn is_t(encoded_sequence: &[u8], n: usize) -> bool {
let bit_index = n * 2;
test_bit(encoded_sequence, bit_index) && test_bit(encoded_sequence, bit_index + 1)
}
pub fn is_n(unknown_sequence: &[u8], n: usize) -> bool {
if n >= unknown_sequence.len() * 8 {
return false;
}
test_bit(unknown_sequence, n)
}
pub fn is_gc(encoded_sequence: &[u8], n: usize) -> bool {
let bit_index = n * 2;
test_bit(encoded_sequence, bit_index) != test_bit(encoded_sequence, bit_index + 1)
}
const fn uses_only_atg(trans_table: i32) -> bool {
matches!(trans_table, 6 | 10 | 14 | 15 | 16 | 22)
}
const fn gtg_not_start(trans_table: i32) -> bool {
matches!(trans_table, 1 | 3 | 12 | 22)
}
fn ttg_not_start(trans_table: i32) -> bool {
trans_table < 4 || trans_table == 9 || (21..25).contains(&trans_table)
}
pub fn is_start(encoded_sequence: &[u8], pos: usize, training: &Training) -> bool {
if is_atg(encoded_sequence, pos) {
return true;
}
if uses_only_atg(training.translation_table) {
return false;
}
if is_gtg(encoded_sequence, pos) && !gtg_not_start(training.translation_table) {
return true;
}
if is_ttg(encoded_sequence, pos) && !ttg_not_start(training.translation_table) {
return true;
}
false
}
pub fn is_atg(encoded_sequence: &[u8], pos: usize) -> bool {
is_a(encoded_sequence, pos)
&& is_t(encoded_sequence, pos + 1)
&& is_g(encoded_sequence, pos + 2)
}
pub fn is_gtg(encoded_sequence: &[u8], pos: usize) -> bool {
is_g(encoded_sequence, pos)
&& is_t(encoded_sequence, pos + 1)
&& is_g(encoded_sequence, pos + 2)
}
pub fn is_ttg(encoded_sequence: &[u8], pos: usize) -> bool {
is_t(encoded_sequence, pos)
&& is_t(encoded_sequence, pos + 1)
&& is_g(encoded_sequence, pos + 2)
}
const fn is_tag_stop(trans_table: i32) -> bool {
!matches!(trans_table, 6 | 15 | 16 | 22)
}
const fn is_tga_stop(trans_table: i32) -> bool {
!matches!(trans_table, 2..=5 | 9 | 10 | 13 | 14 | 21 | 25)
}
const fn is_taa_stop(trans_table: i32) -> bool {
!matches!(trans_table, 6 | 14)
}
#[inline]
pub fn is_stop(encoded_sequence: &[u8], pos: usize, training: &Training) -> bool {
if is_t(encoded_sequence, pos) {
if is_a(encoded_sequence, pos + 1) {
if is_g(encoded_sequence, pos + 2) {
return is_tag_stop(training.translation_table);
}
if is_a(encoded_sequence, pos + 2) {
return is_taa_stop(training.translation_table);
}
} else if is_g(encoded_sequence, pos + 1) && is_a(encoded_sequence, pos + 2) {
return is_tga_stop(training.translation_table);
}
}
match training.translation_table {
2 => {
is_a(encoded_sequence, pos)
&& is_g(encoded_sequence, pos + 1)
&& (is_a(encoded_sequence, pos + 2) || is_g(encoded_sequence, pos + 2))
}
22 => {
is_t(encoded_sequence, pos)
&& is_c(encoded_sequence, pos + 1)
&& is_a(encoded_sequence, pos + 2)
}
23 => {
is_t(encoded_sequence, pos)
&& is_t(encoded_sequence, pos + 1)
&& is_a(encoded_sequence, pos + 2)
}
_ => false,
}
}
pub fn gc_content(encoded_sequence: &[u8], start: usize, end: usize) -> f64 {
if start > end {
return 0.0;
}
let (gc_count, total) = (start..=end)
.map(|i| {
if is_g(encoded_sequence, i) || is_c(encoded_sequence, i) {
(1, 1)
} else {
(0, 1)
}
})
.fold((0, 0), |(gc, tot), (g, t)| (gc + g, tot + t));
if total == 0 {
0.0
} else {
f64::from(gc_count) / f64::from(total)
}
}
pub const fn reverse_strand_reading_frame(forward_frame: usize, sequence_length: usize) -> usize {
let frame_modulus = if sequence_length.is_multiple_of(3) {
3
} else {
sequence_length % 3
};
(frame_modulus - 1 - forward_frame) % 3
}
pub const fn find_max_reading_frame(
frame_0_value: i32,
frame_1_value: i32,
frame_2_value: i32,
) -> usize {
if frame_0_value > frame_1_value {
if frame_0_value > frame_2_value { 0 } else { 2 }
} else if frame_1_value > frame_2_value {
1
} else {
2
}
}
pub fn create_reverse_complement_sequence(
forward_sequence: &[u8],
unknown_sequence: &[u8],
nucleotide_length: usize,
) -> Vec<u8> {
let mut reverse_complement_encoded_sequence = vec![0; forward_sequence.len()];
let sequence_length = nucleotide_length * 2;
for i in 0..sequence_length {
if !test_bit(forward_sequence, i) {
let target_pos = if i % 2 == 0 {
sequence_length - i - 2
} else {
sequence_length - i
};
if target_pos < sequence_length {
set_bit(&mut reverse_complement_encoded_sequence, target_pos);
}
}
}
for i in 0..nucleotide_length {
if test_bit(unknown_sequence, i) && sequence_length >= 2 + i * 2 {
toggle_bit(
&mut reverse_complement_encoded_sequence,
sequence_length - 1 - i * 2,
);
toggle_bit(
&mut reverse_complement_encoded_sequence,
sequence_length - 2 - i * 2,
);
}
}
reverse_complement_encoded_sequence
}
#[inline]
pub fn min_of_two_integers(first_value: i32, second_value: i32) -> i32 {
first_value.min(second_value)
}
#[must_use]
pub fn calculate_kmer_index(kmer_length: usize, encoded_sequence: &[u8], position: usize) -> usize {
let mut kmer_index = 0;
for i in 0..(2 * kmer_length) {
let bit_pos = position * 2 + i;
kmer_index |= usize::from(test_bit(encoded_sequence, bit_pos)) << i;
}
kmer_index
}
pub fn calculate_background_mer_frequencies(
length: usize,
encoded_sequence: &[u8],
reverse_complement_encoded_sequence: &[u8],
sequence_length: usize,
bg: &mut [f64],
) {
let mut size = 1usize;
for _i in 1..=length {
size *= 4;
}
let chunk_size = std::cmp::max(
1000,
(sequence_length - length + 1) / rayon::current_num_threads(),
);
let total_counts: Vec<i32> = (0..(sequence_length - length + 1))
.into_par_iter()
.chunks(chunk_size)
.map(|chunk| {
let mut local_counts = vec![0i32; size];
for i in chunk {
let seq_idx = calculate_kmer_index(length, encoded_sequence, i);
if seq_idx < size {
local_counts[seq_idx] += 1;
}
let rseq_idx = calculate_kmer_index(length, reverse_complement_encoded_sequence, i);
if rseq_idx < size {
local_counts[rseq_idx] += 1;
}
}
local_counts
})
.reduce(
|| vec![0i32; size],
|mut acc, local_counts| {
for (i, &count) in local_counts.iter().enumerate() {
acc[i] += count;
}
acc
},
);
let glob = (sequence_length - length + 1) * 2;
bg.par_iter_mut()
.enumerate()
.take(size)
.for_each(|(i, bg_val)| {
*bg_val = f64::from(total_counts[i]) / (glob as f64);
});
}
pub fn mer_text(len: usize, bit_index: usize) -> String {
use crate::constants::NUCLEOTIDE_LETTERS;
if len == 0 {
return "None".to_string();
}
let mut result = String::with_capacity(len);
let index = bit_index;
for i in 0..len {
let val = (index & (1 << (2 * i))) | (index & (1 << (2 * i + 1)));
let val = val >> (i * 2);
let base_idx = val & 0b11;
if base_idx < 4 {
result.push(NUCLEOTIDE_LETTERS[base_idx]);
} else {
result.push('N'); }
}
result
}
pub fn encode_sequence(
sequence: &[u8],
encoded_sequence: &mut [u8],
unknown_sequence: &mut [u8],
masks: &mut Vec<Mask>,
do_mask: bool,
) -> Result<f64, OrphosError> {
let mut gc_count = 0;
let mut total_count = 0;
let mut mask_start: Option<usize> = None;
for (i, &byte) in sequence.iter().enumerate() {
if i * 2 + 1 >= encoded_sequence.len() * 8 {
break;
}
if do_mask {
if let Some(start) = mask_start {
if byte != b'N' && byte != b'n' {
if i - start >= MASK_SIZE {
masks.push(Mask {
begin: start,
end: i - 1,
});
}
mask_start = None;
}
} else if byte == b'N' || byte == b'n' {
mask_start = Some(i);
}
}
let bctr = i * 2;
match byte.to_ascii_uppercase() {
b'A' => {
total_count += 1;
}
b'C' => {
bitmap::set_bit(encoded_sequence, bctr + 1);
gc_count += 1;
total_count += 1;
}
b'G' => {
bitmap::set_bit(encoded_sequence, bctr);
gc_count += 1;
total_count += 1;
}
b'T' | b'U' => {
bitmap::set_bit(encoded_sequence, bctr);
bitmap::set_bit(encoded_sequence, bctr + 1);
total_count += 1;
}
_ => {
bitmap::set_bit(encoded_sequence, bctr + 1);
bitmap::set_bit(unknown_sequence, i);
total_count += 1;
}
}
}
if do_mask
&& let Some(start) = mask_start
&& sequence.len() - start >= MASK_SIZE
{
masks.push(Mask {
begin: start,
end: sequence.len() - 1,
});
}
let gc_content = if total_count > 0 {
f64::from(gc_count) / f64::from(total_count)
} else {
0.0
};
Ok(gc_content)
}
pub fn encode_sequence_simd_wide(
sequence: &[u8],
encoded_sequence: &mut [u8],
unknown_sequence: &mut [u8],
) -> Result<f64, OrphosError> {
let mut gc_count = 0u32;
let mut total_count = 0u32;
use crate::constants::CHUNK_SIZE;
let chunks = sequence.len() / CHUNK_SIZE;
let a_upper = u8x32::splat(b'A');
let c_upper = u8x32::splat(b'C');
let g_upper = u8x32::splat(b'G');
let t_upper = u8x32::splat(b'T');
let u_upper = u8x32::splat(b'U');
let a_lower = u8x32::splat(b'a');
let c_lower = u8x32::splat(b'c');
let g_lower = u8x32::splat(b'g');
let t_lower = u8x32::splat(b't');
let u_lower = u8x32::splat(b'u');
for chunk_idx in 0..chunks {
let chunk_start = chunk_idx * CHUNK_SIZE;
let input_slice = &sequence[chunk_start..chunk_start + CHUNK_SIZE];
let mut input_array = [0u8; 32];
input_array.copy_from_slice(input_slice);
let input = u8x32::from(input_array);
let is_a = input.cmp_eq(a_upper) | input.cmp_eq(a_lower);
let is_c = input.cmp_eq(c_upper) | input.cmp_eq(c_lower);
let is_g = input.cmp_eq(g_upper) | input.cmp_eq(g_lower);
let is_t = input.cmp_eq(t_upper)
| input.cmp_eq(t_lower)
| input.cmp_eq(u_upper)
| input.cmp_eq(u_lower);
let gc_mask = is_g | is_c;
let valid_mask = is_a | is_c | is_g | is_t;
gc_count += gc_mask.move_mask().count_ones();
total_count += valid_mask.move_mask().count_ones();
let is_a_array: [u8; 32] = is_a.into();
let is_c_array: [u8; 32] = is_c.into();
let is_g_array: [u8; 32] = is_g.into();
let is_t_array: [u8; 32] = is_t.into();
for i in 0..CHUNK_SIZE {
let pos = chunk_start + i;
if pos >= sequence.len() || pos * 2 + 1 >= encoded_sequence.len() * 8 {
break;
}
let bit_pos = pos * 2;
if is_a_array[i] != 0 {
} else if is_c_array[i] != 0 {
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
} else if is_g_array[i] != 0 {
crate::bitmap::set_bit(encoded_sequence, bit_pos);
} else if is_t_array[i] != 0 {
crate::bitmap::set_bit(encoded_sequence, bit_pos);
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
} else {
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
crate::bitmap::set_bit(unknown_sequence, pos);
}
}
}
for (pos, byte) in sequence.iter().enumerate().skip(chunks * CHUNK_SIZE) {
if pos * 2 + 1 >= encoded_sequence.len() * 8 {
break;
}
let bit_pos = pos * 2;
match byte.to_ascii_uppercase() {
b'A' => {
total_count += 1;
}
b'C' => {
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
gc_count += 1;
total_count += 1;
}
b'G' => {
crate::bitmap::set_bit(encoded_sequence, bit_pos);
gc_count += 1;
total_count += 1;
}
b'T' | b'U' => {
crate::bitmap::set_bit(encoded_sequence, bit_pos);
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
total_count += 1;
}
_ => {
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
crate::bitmap::set_bit(unknown_sequence, pos);
total_count += 1;
}
}
}
let gc_content = if total_count > 0 {
gc_count as f64 / total_count as f64
} else {
0.0
};
Ok(gc_content)
}
pub fn encode_sequence_simd_wide_packed(
sequence: &[u8],
encoded_sequence: &mut [u8],
unknown_sequence: &mut [u8],
) -> Result<f64, OrphosError> {
let mut gc_count = 0u32;
let mut total_count = 0u32;
use crate::constants::CHUNK_SIZE;
let chunks = sequence.len() / CHUNK_SIZE;
for chunk_idx in 0..chunks {
let chunk_start = chunk_idx * CHUNK_SIZE;
let input_slice = &sequence[chunk_start..chunk_start + CHUNK_SIZE];
let mut input_array = [0u8; 32];
input_array.copy_from_slice(input_slice);
let input = u8x32::from(input_array);
let a_upper = u8x32::splat(b'A');
let c_upper = u8x32::splat(b'C');
let g_upper = u8x32::splat(b'G');
let t_upper = u8x32::splat(b'T');
let u_upper = u8x32::splat(b'U');
let a_lower = u8x32::splat(b'a');
let c_lower = u8x32::splat(b'c');
let g_lower = u8x32::splat(b'g');
let t_lower = u8x32::splat(b't');
let u_lower = u8x32::splat(b'u');
let is_a = input.cmp_eq(a_upper) | input.cmp_eq(a_lower);
let is_c = input.cmp_eq(c_upper) | input.cmp_eq(c_lower);
let is_g = input.cmp_eq(g_upper) | input.cmp_eq(g_lower);
let is_t = input.cmp_eq(t_upper)
| input.cmp_eq(t_lower)
| input.cmp_eq(u_upper)
| input.cmp_eq(u_lower);
let gc_mask = is_g | is_c;
let valid_mask = is_a | is_c | is_g | is_t;
gc_count += gc_mask.move_mask().count_ones();
total_count += valid_mask.move_mask().count_ones();
let is_c_mask: i32 = is_c.move_mask();
let is_g_mask: i32 = is_g.move_mask();
let is_t_mask: i32 = is_t.move_mask();
let unknown_mask: i32 = !valid_mask.move_mask();
for i in 0..CHUNK_SIZE {
let pos = chunk_start + i;
if pos >= sequence.len() || pos * 2 + 1 >= encoded_sequence.len() * 8 {
break;
}
let bit_pos = pos * 2;
let bit_flag = 1i32 << i;
if (is_c_mask & bit_flag) != 0 {
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
} else if (is_g_mask & bit_flag) != 0 {
crate::bitmap::set_bit(encoded_sequence, bit_pos);
} else if (is_t_mask & bit_flag) != 0 {
crate::bitmap::set_bit(encoded_sequence, bit_pos);
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
} else if (unknown_mask & bit_flag) != 0 {
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
crate::bitmap::set_bit(unknown_sequence, pos);
}
}
}
for (pos, byte) in sequence.iter().enumerate().skip(chunks * CHUNK_SIZE) {
if pos * 2 + 1 >= encoded_sequence.len() * 8 {
break;
}
let bit_pos = pos * 2;
match byte.to_ascii_uppercase() {
b'A' => {
total_count += 1;
}
b'C' => {
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
gc_count += 1;
total_count += 1;
}
b'G' => {
crate::bitmap::set_bit(encoded_sequence, bit_pos);
gc_count += 1;
total_count += 1;
}
b'T' | b'U' => {
crate::bitmap::set_bit(encoded_sequence, bit_pos);
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
total_count += 1;
}
_ => {
crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
crate::bitmap::set_bit(unknown_sequence, pos);
total_count += 1;
}
}
}
let gc_content = if total_count > 0 {
gc_count as f64 / total_count as f64
} else {
0.0
};
Ok(gc_content)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::types::Training;
#[test]
fn test_char_to_nuc_valid_bases() {
assert_eq!(char_to_nuc(b'A'), 0);
assert_eq!(char_to_nuc(b'a'), 0);
assert_eq!(char_to_nuc(b'C'), 1);
assert_eq!(char_to_nuc(b'c'), 1);
assert_eq!(char_to_nuc(b'G'), 2);
assert_eq!(char_to_nuc(b'g'), 2);
assert_eq!(char_to_nuc(b'T'), 3);
assert_eq!(char_to_nuc(b't'), 3);
assert_eq!(char_to_nuc(b'U'), 3);
assert_eq!(char_to_nuc(b'u'), 3);
}
#[test]
fn test_char_to_nuc_invalid_bases() {
assert_eq!(char_to_nuc(b'N'), 4);
assert_eq!(char_to_nuc(b'n'), 4);
assert_eq!(char_to_nuc(b'X'), 4);
assert_eq!(char_to_nuc(b'-'), 4);
assert_eq!(char_to_nuc(b' '), 4);
}
#[test]
fn test_nucleotide_check_functions() {
let mut seq = vec![0u8; 10];
crate::bitmap::set_bit(&mut seq, 2); crate::bitmap::set_bit(&mut seq, 3);
crate::bitmap::set_bit(&mut seq, 5); crate::bitmap::set_bit(&mut seq, 6);
assert!(is_a(&seq, 0));
assert!(!is_a(&seq, 1));
assert!(!is_a(&seq, 2));
assert!(!is_a(&seq, 3));
assert!(!is_t(&seq, 0));
assert!(is_t(&seq, 1));
assert!(!is_t(&seq, 2));
assert!(!is_t(&seq, 3));
assert!(!is_c(&seq, 0));
assert!(!is_c(&seq, 1));
assert!(is_c(&seq, 2));
assert!(!is_c(&seq, 3));
assert!(!is_g(&seq, 0));
assert!(!is_g(&seq, 1));
assert!(!is_g(&seq, 2));
assert!(is_g(&seq, 3));
}
#[test]
fn test_start_codon_functions() {
let mut seq = vec![0u8; 20];
crate::bitmap::set_bit(&mut seq, 2); crate::bitmap::set_bit(&mut seq, 3);
crate::bitmap::set_bit(&mut seq, 4);
assert!(is_atg(&seq, 0));
assert!(!is_gtg(&seq, 0));
assert!(!is_ttg(&seq, 0));
crate::bitmap::set_bit(&mut seq, 6); crate::bitmap::set_bit(&mut seq, 8); crate::bitmap::set_bit(&mut seq, 9);
crate::bitmap::set_bit(&mut seq, 10);
assert!(!is_atg(&seq, 3));
assert!(is_gtg(&seq, 3));
assert!(!is_ttg(&seq, 3));
}
#[test]
fn test_stop_codon_functions() {
let mut seq = vec![0u8; 20];
let training = Training::default();
crate::bitmap::set_bit(&mut seq, 0); crate::bitmap::set_bit(&mut seq, 1);
assert!(is_stop(&seq, 0, &training));
crate::bitmap::set_bit(&mut seq, 6); crate::bitmap::set_bit(&mut seq, 7);
crate::bitmap::set_bit(&mut seq, 10);
assert!(is_stop(&seq, 3, &training));
}
#[test]
fn test_gc_content_calculation() {
let mut seq = vec![0u8; 20];
crate::bitmap::set_bit(&mut seq, 2); crate::bitmap::set_bit(&mut seq, 3);
crate::bitmap::set_bit(&mut seq, 5); crate::bitmap::set_bit(&mut seq, 6);
let gc = gc_content(&seq, 0, 3);
assert!((gc - 0.5).abs() < 0.001); }
#[test]
fn test_reverse_strand_reading_frame() {
assert_eq!(reverse_strand_reading_frame(0, 9), 2); assert_eq!(reverse_strand_reading_frame(1, 9), 1); assert_eq!(reverse_strand_reading_frame(2, 9), 0);
assert_eq!(reverse_strand_reading_frame(0, 10), 0);
assert_eq!(reverse_strand_reading_frame(0, 8), 1); assert_eq!(reverse_strand_reading_frame(1, 8), 0); }
#[test]
fn test_find_max_reading_frame() {
assert_eq!(find_max_reading_frame(10, 5, 3), 0);
assert_eq!(find_max_reading_frame(5, 10, 3), 1);
assert_eq!(find_max_reading_frame(5, 3, 10), 2);
assert_eq!(find_max_reading_frame(5, 5, 3), 1); }
#[test]
fn test_min_of_two_integers() {
assert_eq!(min_of_two_integers(5, 3), 3);
assert_eq!(min_of_two_integers(3, 5), 3);
assert_eq!(min_of_two_integers(-1, 5), -1);
assert_eq!(min_of_two_integers(5, 5), 5);
}
#[test]
fn test_calculate_kmer_index() {
let mut seq = vec![0u8; 20];
crate::bitmap::set_bit(&mut seq, 1);
let idx = calculate_kmer_index(2, &seq, 0);
assert_eq!(idx, 2); }
#[test]
fn test_mer_text() {
assert_eq!(mer_text(0, 0), "None");
assert_eq!(mer_text(2, 0), "AA");
assert_eq!(mer_text(2, 1), "GA");
assert_eq!(mer_text(2, 2), "CA");
assert_eq!(mer_text(2, 3), "TA");
}
#[test]
fn test_encode_sequence_basic() {
let sequence = b"ATCG";
let mut encoded = vec![0u8; 10];
let mut unknown_sequence = vec![0u8; 10];
let mut masks = Vec::new();
let gc = encode_sequence(
sequence,
&mut encoded,
&mut unknown_sequence,
&mut masks,
false,
)
.unwrap();
assert!((gc - 0.5).abs() < 0.001); }
#[test]
fn test_encode_sequence_with_n() {
let sequence = b"ATNG";
let mut encoded = vec![0u8; 10];
let mut unknown_sequence = vec![0u8; 10];
let mut masks = Vec::new();
let gc = encode_sequence(
sequence,
&mut encoded,
&mut unknown_sequence,
&mut masks,
false,
)
.unwrap();
assert!((gc - 0.25).abs() < 0.001); assert!(crate::bitmap::test_bit(&unknown_sequence, 2)); }
#[test]
fn test_encode_sequence_masking() {
let mut sequence = b"ATC".to_vec();
sequence.extend(vec![b'N'; 52]); sequence.extend(b"GCG");
let mut encoded = vec![0u8; 60];
let mut unknown_sequence = vec![0u8; 60];
let mut masks = Vec::new();
let _gc = encode_sequence(
&sequence,
&mut encoded,
&mut unknown_sequence,
&mut masks,
true,
)
.unwrap();
assert!(!masks.is_empty()); assert_eq!(masks.len(), 1);
assert_eq!(masks[0].begin, 3); assert_eq!(masks[0].end, 54); }
#[test]
fn test_is_gc() {
let mut seq = vec![0u8; 10];
assert!(!is_gc(&seq, 0));
crate::bitmap::set_bit(&mut seq, 1);
assert!(is_gc(&seq, 0));
let mut seq2 = vec![0u8; 10];
crate::bitmap::set_bit(&mut seq2, 0);
assert!(is_gc(&seq2, 0));
let mut seq3 = vec![0u8; 10];
crate::bitmap::set_bit(&mut seq3, 0);
crate::bitmap::set_bit(&mut seq3, 1);
assert!(!is_gc(&seq3, 0));
}
#[test]
fn test_is_n() {
let mut unknown_sequence = vec![0u8; 10];
assert!(!is_n(&unknown_sequence, 0));
assert!(!is_n(&unknown_sequence, 100));
crate::bitmap::set_bit(&mut unknown_sequence, 5);
assert!(is_n(&unknown_sequence, 5));
}
#[test]
fn test_calculate_background_mer_frequencies() {
let seq = vec![0u8; 20]; let rseq = vec![0u8; 20]; let mut bg = vec![0.0; 16];
calculate_background_mer_frequencies(2, &seq, &rseq, 10, &mut bg);
assert!(bg[0] > 0.5); }
#[test]
fn test_rcom_seq() {
let seq = vec![0u8; 10];
let unknown_sequence = vec![0u8; 10];
let rseq = create_reverse_complement_sequence(&seq, &unknown_sequence, 2);
assert!(is_t(&rseq, 1));
}
#[test]
fn test_translation_table_functions() {
assert!(uses_only_atg(6));
assert!(uses_only_atg(10));
assert!(!uses_only_atg(11));
assert!(gtg_not_start(1));
assert!(gtg_not_start(22));
assert!(!gtg_not_start(11));
assert!(ttg_not_start(1));
assert!(ttg_not_start(9));
assert!(!ttg_not_start(11));
}
#[test]
fn test_start_codon_with_training() {
let mut training = Training {
translation_table: 11,
..Training::default()
};
let mut seq = vec![0u8; 20];
crate::bitmap::set_bit(&mut seq, 2); crate::bitmap::set_bit(&mut seq, 3);
crate::bitmap::set_bit(&mut seq, 4);
assert!(is_start(&seq, 0, &training));
training.translation_table = 6;
assert!(is_start(&seq, 0, &training));
let mut seq2 = vec![0u8; 20];
crate::bitmap::set_bit(&mut seq2, 0); crate::bitmap::set_bit(&mut seq2, 2); crate::bitmap::set_bit(&mut seq2, 3);
crate::bitmap::set_bit(&mut seq2, 4);
assert!(!is_start(&seq2, 0, &training)); }
#[test]
fn test_stop_codon_special_tables() {
let mut training = Training::default();
let mut seq = vec![0u8; 20];
training.translation_table = 2;
crate::bitmap::set_bit(&mut seq, 2);
assert!(is_stop(&seq, 0, &training));
training.translation_table = 22;
let mut seq2 = vec![0u8; 20];
crate::bitmap::set_bit(&mut seq2, 0); crate::bitmap::set_bit(&mut seq2, 1);
crate::bitmap::set_bit(&mut seq2, 3);
assert!(is_stop(&seq2, 0, &training));
}
}