orphos_core/sequence/
mod.rs

1//! Sequence encoding and manipulation utilities.
2//!
3//! This module provides functions for encoding DNA sequences into compact bitmap
4//! representations and performing sequence analysis operations.
5//!
6//! ## Overview
7//!
8//! DNA sequences are encoded using a 2-bit representation where:
9//! - A (adenine): 00
10//! - C (cytosine): 01
11//! - G (guanine): 10
12//! - T/U (thymine/uracil): 11
13//!
14//! This encoding reduces memory usage by 75% compared to ASCII representation
15//! and enables fast bitwise operations for sequence analysis.
16//!
17//! ## Modules
18//!
19//! - [`encoded`]: Encoded sequence structures with forward and reverse-complement
20//! - [`io`]: FASTA file reading and parsing
21//! - [`processing`]: Sequence analysis functions (GC content, codon detection)
22//!
23//! ## Examples
24//!
25//! ### Encode a sequence
26//!
27//! ```rust
28//! use orphos_core::sequence::encoded::EncodedSequence;
29//!
30//! let sequence = b"ATGAAACGCATTAGCACCACCATT";
31//! let encoded = EncodedSequence::without_masking(sequence);
32//!
33//! println!("Length: {} bp", encoded.sequence_length);
34//! println!("GC content: {:.2}%", encoded.gc_content * 100.0);
35//! ```
36//!
37//! ### Test for specific nucleotides
38//!
39//! ```rust
40//! use orphos_core::sequence::{is_a, is_gc};
41//! use orphos_core::sequence::encoded::EncodedSequence;
42//!
43//! let sequence = b"ATGC";
44//! let encoded = EncodedSequence::without_masking(sequence);
45//!
46//! assert!(is_a(&encoded.forward_sequence, 0)); // Position 0 is 'A'
47//! assert!(is_gc(&encoded.forward_sequence, 3)); // Position 3 is 'C'
48//! ```
49
50use crate::bitmap;
51use crate::constants::MASK_SIZE;
52use crate::types::*;
53
54pub mod encoded;
55pub mod io;
56pub mod processing;
57
58use crate::bitmap::{set_bit, test_bit, toggle_bit};
59use rayon::prelude::*;
60
61pub use io::*;
62pub use processing::*;
63use wide::CmpEq;
64use wide::u8x32;
65
66/// Converts nucleotide character to 2-bit encoding for bitmap storage.
67///
68/// Maps nucleotides to compact 2-bit representation for efficient storage
69/// and fast sequence operations.
70///
71/// # Encoding
72///
73/// - A: 00 (0)
74/// - C: 01 (1)
75/// - G: 10 (2)
76/// - T/U: 11 (3)
77/// - Other: 4 (invalid marker)
78///
79/// # Arguments
80///
81/// * `c` - ASCII nucleotide character (case-insensitive)
82///
83/// # Returns
84///
85/// A value 0-3 for valid nucleotides, 4 for invalid characters.
86///
87/// # Examples
88///
89/// ```rust
90/// use orphos_core::sequence::char_to_nuc;
91///
92/// assert_eq!(char_to_nuc(b'A'), 0);
93/// assert_eq!(char_to_nuc(b'a'), 0);
94/// assert_eq!(char_to_nuc(b'C'), 1);
95/// assert_eq!(char_to_nuc(b'G'), 2);
96/// assert_eq!(char_to_nuc(b'T'), 3);
97/// assert_eq!(char_to_nuc(b'N'), 4); // Invalid
98/// ```
99#[must_use]
100pub const fn char_to_nuc(c: u8) -> u8 {
101    match c.to_ascii_uppercase() {
102        b'A' => 0,
103        b'C' => 1,
104        b'G' => 2,
105        b'T' | b'U' => 3,
106        _ => 4,
107    }
108}
109
110/// Test if nucleotide at given position is adenine (A)
111#[must_use]
112pub fn is_a(encoded_sequence: &[u8], n: usize) -> bool {
113    let bit_index = n * 2;
114    !(test_bit(encoded_sequence, bit_index) || test_bit(encoded_sequence, bit_index + 1))
115}
116
117/// Test if nucleotide at given position is cytosine (C)
118#[must_use]
119pub fn is_c(encoded_sequence: &[u8], n: usize) -> bool {
120    let bit_index = n * 2;
121    !test_bit(encoded_sequence, bit_index) && test_bit(encoded_sequence, bit_index + 1)
122}
123
124/// Test if nucleotide at given position is guanine (G)
125#[must_use]
126pub fn is_g(encoded_sequence: &[u8], n: usize) -> bool {
127    let bit_index = n * 2;
128    test_bit(encoded_sequence, bit_index) && !test_bit(encoded_sequence, bit_index + 1)
129}
130
131/// Test if nucleotide at given position is thymine (T)
132#[must_use]
133pub fn is_t(encoded_sequence: &[u8], n: usize) -> bool {
134    let bit_index = n * 2;
135    test_bit(encoded_sequence, bit_index) && test_bit(encoded_sequence, bit_index + 1)
136}
137
138/// Test if position contains an unknown nucleotide (N)
139pub fn is_n(unknown_sequence: &[u8], n: usize) -> bool {
140    if n >= unknown_sequence.len() * 8 {
141        return false;
142    }
143    test_bit(unknown_sequence, n)
144}
145
146/// Test if nucleotide at given position is G or C (high GC content indicator)
147pub fn is_gc(encoded_sequence: &[u8], n: usize) -> bool {
148    let bit_index = n * 2;
149    test_bit(encoded_sequence, bit_index) != test_bit(encoded_sequence, bit_index + 1)
150}
151
152/// Check if genetic code table uses only ATG as start codon
153const fn uses_only_atg(trans_table: i32) -> bool {
154    matches!(trans_table, 6 | 10 | 14 | 15 | 16 | 22)
155}
156
157/// Check if GTG is not used as start codon in given translation table
158const fn gtg_not_start(trans_table: i32) -> bool {
159    matches!(trans_table, 1 | 3 | 12 | 22)
160}
161
162/// Check if TTG is not used as start codon in given translation table
163fn ttg_not_start(trans_table: i32) -> bool {
164    trans_table < 4 || trans_table == 9 || (21..25).contains(&trans_table)
165}
166
167/// Test if codon at given position is a valid start codon
168///
169/// Checks ATG, GTG, and TTG based on the genetic code table rules.
170/// ATG is universally accepted as a start codon across all tables.
171pub fn is_start(encoded_sequence: &[u8], pos: usize, training: &Training) -> bool {
172    // ATG is always a start
173    if is_atg(encoded_sequence, pos) {
174        return true;
175    }
176
177    // Tables that only use ATG
178    if uses_only_atg(training.translation_table) {
179        return false;
180    }
181
182    // GTG
183    if is_gtg(encoded_sequence, pos) && !gtg_not_start(training.translation_table) {
184        return true;
185    }
186
187    // TTG
188    if is_ttg(encoded_sequence, pos) && !ttg_not_start(training.translation_table) {
189        return true;
190    }
191
192    false
193}
194
195/// Test if codon at position is ATG (methionine start codon)
196pub fn is_atg(encoded_sequence: &[u8], pos: usize) -> bool {
197    is_a(encoded_sequence, pos)
198        && is_t(encoded_sequence, pos + 1)
199        && is_g(encoded_sequence, pos + 2)
200}
201
202/// Test if codon at position is GTG (valine start codon)
203pub fn is_gtg(encoded_sequence: &[u8], pos: usize) -> bool {
204    is_g(encoded_sequence, pos)
205        && is_t(encoded_sequence, pos + 1)
206        && is_g(encoded_sequence, pos + 2)
207}
208
209/// Test if codon at position is TTG (leucine start codon)
210pub fn is_ttg(encoded_sequence: &[u8], pos: usize) -> bool {
211    is_t(encoded_sequence, pos)
212        && is_t(encoded_sequence, pos + 1)
213        && is_g(encoded_sequence, pos + 2)
214}
215
216/// Check if TAG is recognized as stop codon in the given translation table
217const fn is_tag_stop(trans_table: i32) -> bool {
218    !matches!(trans_table, 6 | 15 | 16 | 22)
219}
220
221/// Check if TGA is recognized as stop codon in the given translation table
222const fn is_tga_stop(trans_table: i32) -> bool {
223    !matches!(trans_table, 2..=5 | 9 | 10 | 13 | 14 | 21 | 25)
224}
225
226/// Check if TAA is recognized as stop codon in the given translation table
227const fn is_taa_stop(trans_table: i32) -> bool {
228    !matches!(trans_table, 6 | 14)
229}
230
231/// Test if codon at given position is a stop codon
232///
233/// Checks for standard stop codons (TAA, TAG, TGA) and special cases
234/// based on the genetic code translation table being used.
235#[inline]
236pub fn is_stop(encoded_sequence: &[u8], pos: usize, training: &Training) -> bool {
237    if is_t(encoded_sequence, pos) {
238        if is_a(encoded_sequence, pos + 1) {
239            if is_g(encoded_sequence, pos + 2) {
240                return is_tag_stop(training.translation_table);
241            }
242            if is_a(encoded_sequence, pos + 2) {
243                return is_taa_stop(training.translation_table);
244            }
245        } else if is_g(encoded_sequence, pos + 1) && is_a(encoded_sequence, pos + 2) {
246            return is_tga_stop(training.translation_table);
247        }
248    }
249
250    // Special cases for different translation tables
251    match training.translation_table {
252        2 => {
253            // AGA or AGG are stop codons in translation table 2
254            is_a(encoded_sequence, pos)
255                && is_g(encoded_sequence, pos + 1)
256                && (is_a(encoded_sequence, pos + 2) || is_g(encoded_sequence, pos + 2))
257        }
258        22 => {
259            // TCA is a stop codon in translation table 22
260            is_t(encoded_sequence, pos)
261                && is_c(encoded_sequence, pos + 1)
262                && is_a(encoded_sequence, pos + 2)
263        }
264        23 => {
265            // TTA is a stop codon in translation table 23
266            is_t(encoded_sequence, pos)
267                && is_t(encoded_sequence, pos + 1)
268                && is_a(encoded_sequence, pos + 2)
269        }
270        _ => false,
271    }
272}
273
274/// Calculate the GC content of a sequence region
275///
276/// Returns the fraction of nucleotides that are G or C within
277/// the specified range (inclusive).
278pub fn gc_content(encoded_sequence: &[u8], start: usize, end: usize) -> f64 {
279    if start > end {
280        return 0.0;
281    }
282
283    let (gc_count, total) = (start..=end)
284        .map(|i| {
285            if is_g(encoded_sequence, i) || is_c(encoded_sequence, i) {
286                (1, 1)
287            } else {
288                (0, 1)
289            }
290        })
291        .fold((0, 0), |(gc, tot), (g, t)| (gc + g, tot + t));
292
293    if total == 0 {
294        0.0
295    } else {
296        f64::from(gc_count) / f64::from(total)
297    }
298}
299
300/// Convert a forward strand reading frame to its corresponding reverse strand frame
301///
302/// Maps reading frames between forward and reverse strands accounting for
303/// sequence length and frame relationships.
304pub const fn reverse_strand_reading_frame(forward_frame: usize, sequence_length: usize) -> usize {
305    let frame_modulus = if sequence_length.is_multiple_of(3) {
306        3
307    } else {
308        sequence_length % 3
309    };
310    (frame_modulus - 1 - forward_frame) % 3
311}
312
313/// Determine which of three reading frames has the highest score
314///
315/// Returns the index (0, 1, or 2) of the frame with maximum value.
316pub const fn find_max_reading_frame(
317    frame_0_value: i32,
318    frame_1_value: i32,
319    frame_2_value: i32,
320) -> usize {
321    if frame_0_value > frame_1_value {
322        if frame_0_value > frame_2_value { 0 } else { 2 }
323    } else if frame_1_value > frame_2_value {
324        1
325    } else {
326        2
327    }
328}
329
330/// Generate the reverse complement of an encoded DNA sequence
331///
332/// Creates a reverse complement sequence using 2-bit encoding,
333/// handling both known and unknown nucleotides properly.
334pub fn create_reverse_complement_sequence(
335    forward_sequence: &[u8],
336    unknown_sequence: &[u8],
337    nucleotide_length: usize,
338) -> Vec<u8> {
339    let mut reverse_complement_encoded_sequence = vec![0; forward_sequence.len()];
340    let sequence_length = nucleotide_length * 2;
341
342    for i in 0..sequence_length {
343        if !test_bit(forward_sequence, i) {
344            let target_pos = if i % 2 == 0 {
345                sequence_length - i - 2
346            } else {
347                sequence_length - i
348            };
349            if target_pos < sequence_length {
350                set_bit(&mut reverse_complement_encoded_sequence, target_pos);
351            }
352        }
353    }
354
355    for i in 0..nucleotide_length {
356        if test_bit(unknown_sequence, i) && sequence_length >= 2 + i * 2 {
357            toggle_bit(
358                &mut reverse_complement_encoded_sequence,
359                sequence_length - 1 - i * 2,
360            );
361            toggle_bit(
362                &mut reverse_complement_encoded_sequence,
363                sequence_length - 2 - i * 2,
364            );
365        }
366    }
367    reverse_complement_encoded_sequence
368}
369
370/// Return the minimum of two integers (utility function)
371#[inline]
372pub fn min_of_two_integers(first_value: i32, second_value: i32) -> i32 {
373    first_value.min(second_value)
374}
375
376/// Calculate k-mer index from sequence position for frequency analysis
377///
378/// Converts a sequence position to a numeric index representing the
379/// k-mer pattern, used for codon usage and frequency calculations.
380#[must_use]
381pub fn calculate_kmer_index(kmer_length: usize, encoded_sequence: &[u8], position: usize) -> usize {
382    let mut kmer_index = 0;
383    for i in 0..(2 * kmer_length) {
384        let bit_pos = position * 2 + i;
385        kmer_index |= usize::from(test_bit(encoded_sequence, bit_pos)) << i;
386    }
387    kmer_index
388}
389
390/// Calculate background k-mer frequencies for both strands
391///
392/// Computes frequency distributions of k-mers across the entire sequence,
393/// used for statistical modeling of codon usage patterns.
394pub fn calculate_background_mer_frequencies(
395    length: usize,
396    encoded_sequence: &[u8],
397    reverse_complement_encoded_sequence: &[u8],
398    sequence_length: usize,
399    bg: &mut [f64],
400) {
401    let mut size = 1usize;
402
403    for _i in 1..=length {
404        size *= 4;
405    }
406
407    // Use parallel processing to count k-mers
408    let chunk_size = std::cmp::max(
409        1000,
410        (sequence_length - length + 1) / rayon::current_num_threads(),
411    );
412    let total_counts: Vec<i32> = (0..(sequence_length - length + 1))
413        .into_par_iter()
414        .chunks(chunk_size)
415        .map(|chunk| {
416            let mut local_counts = vec![0i32; size];
417
418            for i in chunk {
419                let seq_idx = calculate_kmer_index(length, encoded_sequence, i);
420                if seq_idx < size {
421                    local_counts[seq_idx] += 1;
422                }
423
424                let rseq_idx = calculate_kmer_index(length, reverse_complement_encoded_sequence, i);
425                if rseq_idx < size {
426                    local_counts[rseq_idx] += 1;
427                }
428            }
429
430            local_counts
431        })
432        .reduce(
433            || vec![0i32; size],
434            |mut acc, local_counts| {
435                for (i, &count) in local_counts.iter().enumerate() {
436                    acc[i] += count;
437                }
438                acc
439            },
440        );
441
442    let glob = (sequence_length - length + 1) * 2;
443
444    bg.par_iter_mut()
445        .enumerate()
446        .take(size)
447        .for_each(|(i, bg_val)| {
448            *bg_val = f64::from(total_counts[i]) / (glob as f64);
449        });
450}
451
452/// Convert k-mer index back to nucleotide sequence representation
453///
454/// Decodes a numeric k-mer index back to its original DNA sequence
455/// for display and debugging purposes.
456pub fn mer_text(len: usize, bit_index: usize) -> String {
457    use crate::constants::NUCLEOTIDE_LETTERS;
458
459    if len == 0 {
460        return "None".to_string();
461    }
462
463    let mut result = String::with_capacity(len);
464    let index = bit_index;
465
466    for i in 0..len {
467        // Extract 2 bits for position i
468        let val = (index & (1 << (2 * i))) | (index & (1 << (2 * i + 1)));
469        let val = val >> (i * 2);
470        let base_idx = val & 0b11; // Ensure we only get 2 bits
471
472        if base_idx < 4 {
473            result.push(NUCLEOTIDE_LETTERS[base_idx]);
474        } else {
475            result.push('N'); // Fallback for invalid values
476        }
477    }
478
479    result
480}
481
482/// Encode a DNA sequence into compact 2-bit representation
483///
484/// Converts raw DNA sequence to bitmap format for efficient storage and processing.
485/// Also handles masking of low-complexity regions and tracks unknown nucleotides.
486/// Returns the GC content of the encoded sequence.
487pub fn encode_sequence(
488    sequence: &[u8],
489    encoded_sequence: &mut [u8],
490    unknown_sequence: &mut [u8],
491    masks: &mut Vec<Mask>,
492    do_mask: bool,
493) -> Result<f64, OrphosError> {
494    let mut gc_count = 0;
495    let mut total_count = 0;
496    let mut mask_start: Option<usize> = None;
497
498    for (i, &byte) in sequence.iter().enumerate() {
499        if i * 2 + 1 >= encoded_sequence.len() * 8 {
500            break;
501        }
502
503        // Handle masking for runs of N's
504        if do_mask {
505            if let Some(start) = mask_start {
506                if byte != b'N' && byte != b'n' {
507                    if i - start >= MASK_SIZE {
508                        masks.push(Mask {
509                            begin: start,
510                            end: i - 1,
511                        });
512                    }
513                    mask_start = None;
514                }
515            } else if byte == b'N' || byte == b'n' {
516                mask_start = Some(i);
517            }
518        }
519
520        // Encode nucleotide in bitmap format
521        let bctr = i * 2;
522        match byte.to_ascii_uppercase() {
523            b'A' => {
524                total_count += 1;
525            }
526            b'C' => {
527                bitmap::set_bit(encoded_sequence, bctr + 1);
528                gc_count += 1;
529                total_count += 1;
530            }
531            b'G' => {
532                bitmap::set_bit(encoded_sequence, bctr);
533                gc_count += 1;
534                total_count += 1;
535            }
536            b'T' | b'U' => {
537                bitmap::set_bit(encoded_sequence, bctr);
538                bitmap::set_bit(encoded_sequence, bctr + 1);
539                total_count += 1;
540            }
541            _ => {
542                bitmap::set_bit(encoded_sequence, bctr + 1);
543                bitmap::set_bit(unknown_sequence, i);
544                total_count += 1;
545            }
546        }
547    }
548
549    // Handle final mask if sequence ends with N's
550    if do_mask
551        && let Some(start) = mask_start
552        && sequence.len() - start >= MASK_SIZE
553    {
554        masks.push(Mask {
555            begin: start,
556            end: sequence.len() - 1,
557        });
558    }
559
560    let gc_content = if total_count > 0 {
561        f64::from(gc_count) / f64::from(total_count)
562    } else {
563        0.0
564    };
565
566    Ok(gc_content)
567}
568
569/// SIMD-accelerated encoding using the `wide` crate with u8x32
570///
571/// Uses portable SIMD operations to process 32 nucleotides at once.
572/// Returns the GC content of the encoded sequence.
573pub fn encode_sequence_simd_wide(
574    sequence: &[u8],
575    encoded_sequence: &mut [u8],
576    unknown_sequence: &mut [u8],
577) -> Result<f64, OrphosError> {
578    let mut gc_count = 0u32;
579    let mut total_count = 0u32;
580
581    // Process 32 bytes at a time with u8x32
582    use crate::constants::CHUNK_SIZE;
583    let chunks = sequence.len() / CHUNK_SIZE;
584
585    // SIMD constants for nucleotide detection
586    let a_upper = u8x32::splat(b'A');
587    let c_upper = u8x32::splat(b'C');
588    let g_upper = u8x32::splat(b'G');
589    let t_upper = u8x32::splat(b'T');
590    let u_upper = u8x32::splat(b'U');
591    // let n_upper = u8x32::splat(b'N');
592
593    let a_lower = u8x32::splat(b'a');
594    let c_lower = u8x32::splat(b'c');
595    let g_lower = u8x32::splat(b'g');
596    let t_lower = u8x32::splat(b't');
597    let u_lower = u8x32::splat(b'u');
598    // let n_lower = u8x32::splat(b'n');
599
600    for chunk_idx in 0..chunks {
601        let chunk_start = chunk_idx * CHUNK_SIZE;
602
603        // Safely load 32 bytes into SIMD vector
604        let input_slice = &sequence[chunk_start..chunk_start + CHUNK_SIZE];
605
606        // Convert slice to array for u8x32::from()
607        let mut input_array = [0u8; 32];
608        input_array.copy_from_slice(input_slice);
609        let input = u8x32::from(input_array);
610
611        // Create masks for each nucleotide type (case-insensitive)
612        let is_a = input.cmp_eq(a_upper) | input.cmp_eq(a_lower);
613        let is_c = input.cmp_eq(c_upper) | input.cmp_eq(c_lower);
614        let is_g = input.cmp_eq(g_upper) | input.cmp_eq(g_lower);
615        let is_t = input.cmp_eq(t_upper)
616            | input.cmp_eq(t_lower)
617            | input.cmp_eq(u_upper)
618            | input.cmp_eq(u_lower);
619        // let is_n = input.cmp_eq(n_upper) | input.cmp_eq(n_lower);
620
621        let gc_mask = is_g | is_c;
622        let valid_mask = is_a | is_c | is_g | is_t;
623
624        // Convert to bitmask and count set bits
625        gc_count += gc_mask.move_mask().count_ones();
626        total_count += valid_mask.move_mask().count_ones();
627
628        // Process each nucleotide for 2-bit encoding
629        // let input_array: [u8; 32] = input.into();
630        let is_a_array: [u8; 32] = is_a.into();
631        let is_c_array: [u8; 32] = is_c.into();
632        let is_g_array: [u8; 32] = is_g.into();
633        let is_t_array: [u8; 32] = is_t.into();
634        // let is_n_array: [u8; 32] = is_n.into();
635
636        for i in 0..CHUNK_SIZE {
637            let pos = chunk_start + i;
638            if pos >= sequence.len() || pos * 2 + 1 >= encoded_sequence.len() * 8 {
639                break;
640            }
641
642            let bit_pos = pos * 2;
643
644            // Use SIMD results to determine nucleotide type
645            if is_a_array[i] != 0 {
646                // A = 00 (default, no bits to set)
647            } else if is_c_array[i] != 0 {
648                // C = 01
649                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
650            } else if is_g_array[i] != 0 {
651                // G = 10
652                crate::bitmap::set_bit(encoded_sequence, bit_pos);
653            } else if is_t_array[i] != 0 {
654                // T/U = 11
655                crate::bitmap::set_bit(encoded_sequence, bit_pos);
656                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
657            } else {
658                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
659                crate::bitmap::set_bit(unknown_sequence, pos);
660            }
661        }
662    }
663
664    // Handle remaining bytes (scalar fallback)
665    for (pos, byte) in sequence.iter().enumerate().skip(chunks * CHUNK_SIZE) {
666        if pos * 2 + 1 >= encoded_sequence.len() * 8 {
667            break;
668        }
669
670        // let byte = sequence[pos];
671        let bit_pos = pos * 2;
672
673        match byte.to_ascii_uppercase() {
674            b'A' => {
675                total_count += 1;
676            }
677            b'C' => {
678                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
679                gc_count += 1;
680                total_count += 1;
681            }
682            b'G' => {
683                crate::bitmap::set_bit(encoded_sequence, bit_pos);
684                gc_count += 1;
685                total_count += 1;
686            }
687            b'T' | b'U' => {
688                crate::bitmap::set_bit(encoded_sequence, bit_pos);
689                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
690                total_count += 1;
691            }
692            _ => {
693                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
694                crate::bitmap::set_bit(unknown_sequence, pos);
695                total_count += 1;
696            }
697        }
698    }
699
700    let gc_content = if total_count > 0 {
701        gc_count as f64 / total_count as f64
702    } else {
703        0.0
704    };
705
706    Ok(gc_content)
707}
708
709/// Optimized packed encoding version with u8x32 and batch bit operations
710pub fn encode_sequence_simd_wide_packed(
711    sequence: &[u8],
712    encoded_sequence: &mut [u8],
713    unknown_sequence: &mut [u8],
714) -> Result<f64, OrphosError> {
715    let mut gc_count = 0u32;
716    let mut total_count = 0u32;
717
718    // Process 32 bytes at a time
719    use crate::constants::CHUNK_SIZE;
720    let chunks = sequence.len() / CHUNK_SIZE;
721
722    for chunk_idx in 0..chunks {
723        let chunk_start = chunk_idx * CHUNK_SIZE;
724
725        // Load 32 bytes
726        let input_slice = &sequence[chunk_start..chunk_start + CHUNK_SIZE];
727        let mut input_array = [0u8; 32];
728        input_array.copy_from_slice(input_slice);
729        let input = u8x32::from(input_array);
730
731        // SIMD nucleotide detection
732        let a_upper = u8x32::splat(b'A');
733        let c_upper = u8x32::splat(b'C');
734        let g_upper = u8x32::splat(b'G');
735        let t_upper = u8x32::splat(b'T');
736        let u_upper = u8x32::splat(b'U');
737
738        let a_lower = u8x32::splat(b'a');
739        let c_lower = u8x32::splat(b'c');
740        let g_lower = u8x32::splat(b'g');
741        let t_lower = u8x32::splat(b't');
742        let u_lower = u8x32::splat(b'u');
743
744        let is_a = input.cmp_eq(a_upper) | input.cmp_eq(a_lower);
745        let is_c = input.cmp_eq(c_upper) | input.cmp_eq(c_lower);
746        let is_g = input.cmp_eq(g_upper) | input.cmp_eq(g_lower);
747        let is_t = input.cmp_eq(t_upper)
748            | input.cmp_eq(t_lower)
749            | input.cmp_eq(u_upper)
750            | input.cmp_eq(u_lower);
751
752        let gc_mask = is_g | is_c;
753        let valid_mask = is_a | is_c | is_g | is_t;
754
755        gc_count += gc_mask.move_mask().count_ones();
756        total_count += valid_mask.move_mask().count_ones();
757
758        // Extract SIMD results for bit setting - move_mask() returns i32
759        // let is_a_mask: i32 = is_a.move_mask();
760        let is_c_mask: i32 = is_c.move_mask();
761        let is_g_mask: i32 = is_g.move_mask();
762        let is_t_mask: i32 = is_t.move_mask();
763        let unknown_mask: i32 = !valid_mask.move_mask();
764
765        // Batch process nucleotides using bit operations
766        for i in 0..CHUNK_SIZE {
767            let pos = chunk_start + i;
768            if pos >= sequence.len() || pos * 2 + 1 >= encoded_sequence.len() * 8 {
769                break;
770            }
771
772            let bit_pos = pos * 2;
773            let bit_flag = 1i32 << i;
774
775            // Use bit operations to check SIMD results
776            if (is_c_mask & bit_flag) != 0 {
777                // C = 01
778                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
779            } else if (is_g_mask & bit_flag) != 0 {
780                // G = 10
781                crate::bitmap::set_bit(encoded_sequence, bit_pos);
782            } else if (is_t_mask & bit_flag) != 0 {
783                // T/U = 11
784                crate::bitmap::set_bit(encoded_sequence, bit_pos);
785                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
786            } else if (unknown_mask & bit_flag) != 0 {
787                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
788                crate::bitmap::set_bit(unknown_sequence, pos);
789            }
790            // A = 00 (default, no bits to set)
791        }
792    }
793
794    // Handle remaining bytes with scalar fallback
795    // for pos in (chunks * CHUNK_SIZE)..sequence.len() {
796    for (pos, byte) in sequence.iter().enumerate().skip(chunks * CHUNK_SIZE) {
797        if pos * 2 + 1 >= encoded_sequence.len() * 8 {
798            break;
799        }
800
801        // let byte = sequence[pos];
802        let bit_pos = pos * 2;
803
804        match byte.to_ascii_uppercase() {
805            b'A' => {
806                total_count += 1;
807            }
808            b'C' => {
809                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
810                gc_count += 1;
811                total_count += 1;
812            }
813            b'G' => {
814                crate::bitmap::set_bit(encoded_sequence, bit_pos);
815                gc_count += 1;
816                total_count += 1;
817            }
818            b'T' | b'U' => {
819                crate::bitmap::set_bit(encoded_sequence, bit_pos);
820                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
821                total_count += 1;
822            }
823            _ => {
824                crate::bitmap::set_bit(encoded_sequence, bit_pos + 1);
825                crate::bitmap::set_bit(unknown_sequence, pos);
826                total_count += 1;
827            }
828        }
829    }
830
831    let gc_content = if total_count > 0 {
832        gc_count as f64 / total_count as f64
833    } else {
834        0.0
835    };
836
837    Ok(gc_content)
838}
839
840#[cfg(test)]
841mod tests {
842    use super::*;
843    use crate::types::Training;
844
845    #[test]
846    fn test_char_to_nuc_valid_bases() {
847        assert_eq!(char_to_nuc(b'A'), 0);
848        assert_eq!(char_to_nuc(b'a'), 0);
849        assert_eq!(char_to_nuc(b'C'), 1);
850        assert_eq!(char_to_nuc(b'c'), 1);
851        assert_eq!(char_to_nuc(b'G'), 2);
852        assert_eq!(char_to_nuc(b'g'), 2);
853        assert_eq!(char_to_nuc(b'T'), 3);
854        assert_eq!(char_to_nuc(b't'), 3);
855        assert_eq!(char_to_nuc(b'U'), 3);
856        assert_eq!(char_to_nuc(b'u'), 3);
857    }
858
859    #[test]
860    fn test_char_to_nuc_invalid_bases() {
861        assert_eq!(char_to_nuc(b'N'), 4);
862        assert_eq!(char_to_nuc(b'n'), 4);
863        assert_eq!(char_to_nuc(b'X'), 4);
864        assert_eq!(char_to_nuc(b'-'), 4);
865        assert_eq!(char_to_nuc(b' '), 4);
866    }
867
868    #[test]
869    fn test_nucleotide_check_functions() {
870        let mut seq = vec![0u8; 10];
871
872        // Encode ATCG at positions 0,1,2,3
873        // A = 00 (default)
874        crate::bitmap::set_bit(&mut seq, 2); // T = 11
875        crate::bitmap::set_bit(&mut seq, 3);
876        crate::bitmap::set_bit(&mut seq, 5); // C = 01
877        crate::bitmap::set_bit(&mut seq, 6); // G = 10
878
879        assert!(is_a(&seq, 0));
880        assert!(!is_a(&seq, 1));
881        assert!(!is_a(&seq, 2));
882        assert!(!is_a(&seq, 3));
883
884        assert!(!is_t(&seq, 0));
885        assert!(is_t(&seq, 1));
886        assert!(!is_t(&seq, 2));
887        assert!(!is_t(&seq, 3));
888
889        assert!(!is_c(&seq, 0));
890        assert!(!is_c(&seq, 1));
891        assert!(is_c(&seq, 2));
892        assert!(!is_c(&seq, 3));
893
894        assert!(!is_g(&seq, 0));
895        assert!(!is_g(&seq, 1));
896        assert!(!is_g(&seq, 2));
897        assert!(is_g(&seq, 3));
898    }
899
900    #[test]
901    fn test_start_codon_functions() {
902        let mut seq = vec![0u8; 20];
903
904        // Encode ATG at position 0: A(00) T(11) G(10)
905        crate::bitmap::set_bit(&mut seq, 2); // T
906        crate::bitmap::set_bit(&mut seq, 3);
907        crate::bitmap::set_bit(&mut seq, 4); // G
908
909        assert!(is_atg(&seq, 0));
910        assert!(!is_gtg(&seq, 0));
911        assert!(!is_ttg(&seq, 0));
912
913        // Encode GTG at position 3: G(10) T(11) G(10)
914        crate::bitmap::set_bit(&mut seq, 6); // G
915        crate::bitmap::set_bit(&mut seq, 8); // T
916        crate::bitmap::set_bit(&mut seq, 9);
917        crate::bitmap::set_bit(&mut seq, 10); // G
918
919        assert!(!is_atg(&seq, 3));
920        assert!(is_gtg(&seq, 3));
921        assert!(!is_ttg(&seq, 3));
922    }
923
924    #[test]
925    fn test_stop_codon_functions() {
926        let mut seq = vec![0u8; 20];
927        let training = Training::default();
928
929        // Encode TAA at position 0: T(11) A(00) A(00)
930        crate::bitmap::set_bit(&mut seq, 0); // T
931        crate::bitmap::set_bit(&mut seq, 1);
932        // A and A are default (00)
933
934        assert!(is_stop(&seq, 0, &training));
935
936        // Encode TAG at position 3: T(11) A(00) G(10)
937        crate::bitmap::set_bit(&mut seq, 6); // T
938        crate::bitmap::set_bit(&mut seq, 7);
939        crate::bitmap::set_bit(&mut seq, 10); // G
940
941        assert!(is_stop(&seq, 3, &training));
942    }
943
944    #[test]
945    fn test_gc_content_calculation() {
946        let mut seq = vec![0u8; 20];
947
948        // Encode ATCG: A(00) T(11) C(01) G(10)
949        crate::bitmap::set_bit(&mut seq, 2); // T
950        crate::bitmap::set_bit(&mut seq, 3);
951        crate::bitmap::set_bit(&mut seq, 5); // C
952        crate::bitmap::set_bit(&mut seq, 6); // G
953
954        let gc = gc_content(&seq, 0, 3);
955        assert!((gc - 0.5).abs() < 0.001); // 2 GC out of 4 = 50%
956    }
957
958    #[test]
959    fn test_reverse_strand_reading_frame() {
960        // Test with sequence length 9 (frame_modulus = 0, since 9 % 3 == 0, so frame_modulus = 3)
961        assert_eq!(reverse_strand_reading_frame(0, 9), 2); // (3 - 1 - 0) % 3 = 2
962        assert_eq!(reverse_strand_reading_frame(1, 9), 1); // (3 - 1 - 1) % 3 = 1
963        assert_eq!(reverse_strand_reading_frame(2, 9), 0); // (3 - 1 - 2) % 3 = 0
964
965        // Test with sequence length 10 (frame_modulus = 1, since 10 % 3 == 1)
966        assert_eq!(reverse_strand_reading_frame(0, 10), 0); // (1 - 1 - 0) % 3 = 0
967
968        // Test with sequence length 8 (frame_modulus = 2, since 8 % 3 == 2)
969        assert_eq!(reverse_strand_reading_frame(0, 8), 1); // (2 - 1 - 0) % 3 = 1
970        assert_eq!(reverse_strand_reading_frame(1, 8), 0); // (2 - 1 - 1) % 3 = 0
971    }
972
973    #[test]
974    fn test_find_max_reading_frame() {
975        assert_eq!(find_max_reading_frame(10, 5, 3), 0);
976        assert_eq!(find_max_reading_frame(5, 10, 3), 1);
977        assert_eq!(find_max_reading_frame(5, 3, 10), 2);
978        assert_eq!(find_max_reading_frame(5, 5, 3), 1); // tie goes to second
979    }
980
981    #[test]
982    fn test_min_of_two_integers() {
983        assert_eq!(min_of_two_integers(5, 3), 3);
984        assert_eq!(min_of_two_integers(3, 5), 3);
985        assert_eq!(min_of_two_integers(-1, 5), -1);
986        assert_eq!(min_of_two_integers(5, 5), 5);
987    }
988
989    #[test]
990    fn test_calculate_kmer_index() {
991        let mut seq = vec![0u8; 20];
992
993        // Encode AC at position 0: A(00) C(01)
994        crate::bitmap::set_bit(&mut seq, 1); // C
995
996        let idx = calculate_kmer_index(2, &seq, 0);
997        assert_eq!(idx, 2); // AC should give index 2
998    }
999
1000    #[test]
1001    fn test_mer_text() {
1002        assert_eq!(mer_text(0, 0), "None");
1003        assert_eq!(mer_text(2, 0), "AA");
1004        assert_eq!(mer_text(2, 1), "GA");
1005        assert_eq!(mer_text(2, 2), "CA");
1006        assert_eq!(mer_text(2, 3), "TA");
1007    }
1008
1009    #[test]
1010    fn test_encode_sequence_basic() {
1011        let sequence = b"ATCG";
1012        let mut encoded = vec![0u8; 10];
1013        let mut unknown_sequence = vec![0u8; 10];
1014        let mut masks = Vec::new();
1015
1016        let gc = encode_sequence(
1017            sequence,
1018            &mut encoded,
1019            &mut unknown_sequence,
1020            &mut masks,
1021            false,
1022        )
1023        .unwrap();
1024        assert!((gc - 0.5).abs() < 0.001); // 2 GC out of 4 = 50%
1025    }
1026
1027    #[test]
1028    fn test_encode_sequence_with_n() {
1029        let sequence = b"ATNG";
1030        let mut encoded = vec![0u8; 10];
1031        let mut unknown_sequence = vec![0u8; 10];
1032        let mut masks = Vec::new();
1033
1034        let gc = encode_sequence(
1035            sequence,
1036            &mut encoded,
1037            &mut unknown_sequence,
1038            &mut masks,
1039            false,
1040        )
1041        .unwrap();
1042        assert!((gc - 0.25).abs() < 0.001); // 1 GC out of 4 = 25%
1043        assert!(crate::bitmap::test_bit(&unknown_sequence, 2)); // N should be marked in unknown_sequence
1044    }
1045
1046    #[test]
1047    fn test_encode_sequence_masking() {
1048        // Create a sequence with 50+ N's to trigger masking (MASK_SIZE = 50)
1049        let mut sequence = b"ATC".to_vec();
1050        sequence.extend(vec![b'N'; 52]); // 52 N's should create a mask
1051        sequence.extend(b"GCG");
1052
1053        let mut encoded = vec![0u8; 60];
1054        let mut unknown_sequence = vec![0u8; 60];
1055        let mut masks = Vec::new();
1056
1057        let _gc = encode_sequence(
1058            &sequence,
1059            &mut encoded,
1060            &mut unknown_sequence,
1061            &mut masks,
1062            true,
1063        )
1064        .unwrap();
1065        assert!(!masks.is_empty()); // Should create at least one mask since we have 52 N's (> MASK_SIZE)
1066        assert_eq!(masks.len(), 1);
1067        assert_eq!(masks[0].begin, 3); // Start after "ATC"
1068        assert_eq!(masks[0].end, 54); // End at last N (3 + 52 - 1)
1069    }
1070
1071    #[test]
1072    fn test_is_gc() {
1073        let mut seq = vec![0u8; 10];
1074
1075        assert!(!is_gc(&seq, 0));
1076
1077        crate::bitmap::set_bit(&mut seq, 1);
1078        assert!(is_gc(&seq, 0));
1079
1080        let mut seq2 = vec![0u8; 10];
1081        crate::bitmap::set_bit(&mut seq2, 0);
1082        assert!(is_gc(&seq2, 0));
1083
1084        let mut seq3 = vec![0u8; 10];
1085        crate::bitmap::set_bit(&mut seq3, 0);
1086        crate::bitmap::set_bit(&mut seq3, 1);
1087        assert!(!is_gc(&seq3, 0));
1088    }
1089
1090    #[test]
1091    fn test_is_n() {
1092        let mut unknown_sequence = vec![0u8; 10];
1093
1094        assert!(!is_n(&unknown_sequence, 0));
1095        assert!(!is_n(&unknown_sequence, 100));
1096
1097        crate::bitmap::set_bit(&mut unknown_sequence, 5);
1098        assert!(is_n(&unknown_sequence, 5));
1099    }
1100
1101    #[test]
1102    fn test_calculate_background_mer_frequencies() {
1103        let seq = vec![0u8; 20]; // All A's
1104        let rseq = vec![0u8; 20]; // All A's
1105        let mut bg = vec![0.0; 16]; // 4^2 = 16 possible 2-mers
1106
1107        calculate_background_mer_frequencies(2, &seq, &rseq, 10, &mut bg);
1108
1109        // Should have high frequency for AA (index 0) and low for others
1110        assert!(bg[0] > 0.5); // AA should be common
1111    }
1112
1113    #[test]
1114    fn test_rcom_seq() {
1115        let seq = vec![0u8; 10];
1116        let unknown_sequence = vec![0u8; 10];
1117
1118        // Encode A at position 0
1119        // A = 00, complement = T = 11
1120
1121        let rseq = create_reverse_complement_sequence(&seq, &unknown_sequence, 2);
1122
1123        assert!(is_t(&rseq, 1));
1124    }
1125
1126    #[test]
1127    fn test_translation_table_functions() {
1128        assert!(uses_only_atg(6));
1129        assert!(uses_only_atg(10));
1130        assert!(!uses_only_atg(11));
1131
1132        assert!(gtg_not_start(1));
1133        assert!(gtg_not_start(22));
1134        assert!(!gtg_not_start(11));
1135
1136        assert!(ttg_not_start(1));
1137        assert!(ttg_not_start(9));
1138        assert!(!ttg_not_start(11));
1139    }
1140
1141    #[test]
1142    fn test_start_codon_with_training() {
1143        let mut training = Training {
1144            translation_table: 11,
1145            ..Training::default()
1146        };
1147
1148        let mut seq = vec![0u8; 20];
1149
1150        // Encode ATG at position 0
1151        crate::bitmap::set_bit(&mut seq, 2); // T
1152        crate::bitmap::set_bit(&mut seq, 3);
1153        crate::bitmap::set_bit(&mut seq, 4); // G
1154
1155        assert!(is_start(&seq, 0, &training));
1156
1157        // Test with table that only uses ATG
1158        training.translation_table = 6;
1159        assert!(is_start(&seq, 0, &training)); // ATG still works
1160
1161        // Encode GTG and test
1162        let mut seq2 = vec![0u8; 20];
1163        crate::bitmap::set_bit(&mut seq2, 0); // G
1164        crate::bitmap::set_bit(&mut seq2, 2); // T
1165        crate::bitmap::set_bit(&mut seq2, 3);
1166        crate::bitmap::set_bit(&mut seq2, 4); // G
1167
1168        assert!(!is_start(&seq2, 0, &training)); // GTG not allowed in table 6
1169    }
1170
1171    #[test]
1172    fn test_stop_codon_special_tables() {
1173        let mut training = Training::default();
1174        let mut seq = vec![0u8; 20];
1175
1176        // Test AGA stop in table 2
1177        training.translation_table = 2;
1178        // Encode AGA: A(00) G(10) A(00)
1179        crate::bitmap::set_bit(&mut seq, 2); // G
1180
1181        assert!(is_stop(&seq, 0, &training));
1182
1183        // Test TCA stop in table 22
1184        training.translation_table = 22;
1185        let mut seq2 = vec![0u8; 20];
1186        // Encode TCA: T(11) C(01) A(00)
1187        crate::bitmap::set_bit(&mut seq2, 0); // T
1188        crate::bitmap::set_bit(&mut seq2, 1);
1189        crate::bitmap::set_bit(&mut seq2, 3); // C
1190
1191        assert!(is_stop(&seq2, 0, &training));
1192    }
1193}