orphos_core/node/
motifs.rs

1use bio::bio_types::strand::Strand;
2use rayon::prelude::*;
3
4use crate::{
5    constants::{
6        INITIAL_MAX_SCORE, MIN_MOTIF_SCORE, MOTIF_THRESHOLD_OFFSET, RBS_DOWNSTREAM_DISTANCE,
7        RBS_UPSTREAM_DISTANCE,
8    },
9    sequence::{calculate_kmer_index, shine_dalgarno_exact, shine_dalgarno_mm},
10    types::{CodonType, Motif, Node, Training},
11};
12
13/// Calculate both exact and mismatch Shine-Dalgarno scores for a position
14fn calculate_rbs_scores(
15    seq: &[u8],
16    pos: usize,
17    target_pos: usize,
18    rbs_weights: &[f64],
19) -> [usize; 2] {
20    [
21        shine_dalgarno_exact(seq, pos, target_pos, rbs_weights),
22        shine_dalgarno_mm(seq, pos, target_pos, rbs_weights),
23    ]
24}
25
26/// RBS Scoring Function: Calculate the RBS motif and then multiply it by the
27/// appropriate weight for that motif (determined in the start training function).
28///
29/// # Parameters
30/// * `seq` - Forward strand sequence
31/// * `rseq` - Reverse strand sequence
32/// * `sequence_length` - Sequence length
33/// * `nodes` - Nodes to score
34/// * `training` - Training data with RBS weights
35///
36/// # Effect
37/// Updates the ribosome_binding_sites scores in each node's motif_info
38pub fn rbs_score(
39    seq: &[u8],
40    reverse_complement_encoded_sequence: &[u8],
41    sequence_length: usize,
42    nodes: &mut [Node],
43    training: &Training,
44) {
45    // Parallelize RBS scoring across all start nodes
46    nodes
47        .par_iter_mut()
48        .enumerate()
49        .filter(|(_, node)| node.position.codon_type != CodonType::Stop && !node.position.is_edge)
50        .for_each(|(_, node)| {
51            node.motif_info.ribosome_binding_sites[0] = 0;
52            node.motif_info.ribosome_binding_sites[1] = 0;
53
54            match node.position.strand {
55                Strand::Forward => {
56                    let search_start = node.position.index.saturating_sub(RBS_UPSTREAM_DISTANCE);
57                    let search_end = node.position.index.saturating_sub(RBS_DOWNSTREAM_DISTANCE);
58
59                    for j in search_start..=search_end {
60                        let cur_sc = calculate_rbs_scores(
61                            seq,
62                            j,
63                            node.position.index,
64                            &*training.rbs_weights,
65                        );
66
67                        if cur_sc[0] > node.motif_info.ribosome_binding_sites[0] {
68                            node.motif_info.ribosome_binding_sites[0] = cur_sc[0];
69                        }
70                        if cur_sc[1] > node.motif_info.ribosome_binding_sites[1] {
71                            node.motif_info.ribosome_binding_sites[1] = cur_sc[1];
72                        }
73                    }
74                }
75                Strand::Reverse => {
76                    let upstream_offset = RBS_UPSTREAM_DISTANCE + 1;
77                    let downstream_offset = RBS_DOWNSTREAM_DISTANCE + 1;
78                    if node.position.index >= sequence_length {
79                        return; // Skip invalid node positions
80                    }
81                    let start_pos = if sequence_length >= node.position.index + upstream_offset {
82                        sequence_length - node.position.index - upstream_offset
83                    } else {
84                        0 // If calculation would underflow, start from beginning
85                    };
86
87                    let end_pos = if sequence_length >= node.position.index + downstream_offset {
88                        sequence_length - node.position.index - downstream_offset
89                    } else {
90                        0 // If calculation would underflow, set to 0
91                    };
92                    let target_pos = sequence_length - 1 - node.position.index;
93
94                    for j in start_pos..=end_pos {
95                        if !(0..sequence_length).contains(&j) {
96                            continue;
97                        }
98
99                        let cur_sc = calculate_rbs_scores(
100                            reverse_complement_encoded_sequence,
101                            j,
102                            target_pos,
103                            &*training.rbs_weights,
104                        );
105
106                        if cur_sc[0] > node.motif_info.ribosome_binding_sites[0] {
107                            node.motif_info.ribosome_binding_sites[0] = cur_sc[0];
108                        }
109                        if cur_sc[1] > node.motif_info.ribosome_binding_sites[1] {
110                            node.motif_info.ribosome_binding_sites[1] = cur_sc[1];
111                        }
112                    }
113                }
114                Strand::Unknown => unreachable!(),
115            }
116        });
117}
118
119/// Find the highest scoring motif/spacer combination for a node.
120///
121/// Given the weights for various motifs/distances from the training file,
122/// return the highest scoring mer/spacer combination of 3-6bp motifs with a
123/// spacer ranging from 3bp to 15bp. In the final stage of start training, only
124/// good scoring motifs are returned.
125///
126/// # Parameters
127/// * `training` - Training data with motif weights
128/// * `seq` - Forward strand sequence
129/// * `rseq` - Reverse strand sequence
130/// * `sequence_length` - Sequence length
131/// * `node` - Node to find motif for
132/// * `stage` - Training stage (affects whether poor motifs are filtered)
133///
134/// # Effect
135/// Updates the best_motif in the node's motif_info
136pub fn find_best_upstream_motif(
137    training: &Training,
138    seq: &[u8],
139    reverse_complement_encoded_sequence: &[u8],
140    sequence_length: usize,
141    node: &mut Node,
142    stage: usize,
143) {
144    if node.position.codon_type == CodonType::Stop || node.position.is_edge {
145        return;
146    }
147
148    let (wseq, start) = match node.position.strand {
149        Strand::Forward => (seq, node.position.index),
150        Strand::Reverse => (
151            reverse_complement_encoded_sequence,
152            sequence_length - 1 - node.position.index,
153        ),
154        Strand::Unknown => unreachable!(),
155    };
156
157    let mut max_sc = INITIAL_MAX_SCORE;
158    let mut max_spacendx = 0;
159    let mut max_spacer = 0;
160    let mut max_ndx = 0;
161    let mut max_len = 0;
162
163    // Search through motif lengths 3-6 (i goes from 3 down to 0, representing lengths 6 down to 3)
164    for i in (0..=3).rev() {
165        let motif_len = i + 3;
166
167        // Search positions from start-18-i to start-6-i
168        let search_start: isize = start as isize - 18 - i;
169        let search_end: isize = start as isize - 6 - i;
170
171        for j in search_start..=search_end {
172            if j < 0 {
173                continue;
174            }
175            // Ensure the k-mer window [j, j+motif_len) is within the nucleotide sequence.
176            // the nucleotide length (sequence_length), not the byte length of the buffer.
177            if (j as usize) + (motif_len as usize) > sequence_length {
178                continue;
179            }
180            let spacer = start as isize - j - i - 3;
181            let spacendx = if j <= start as isize - 16 - i {
182                3
183            } else if j <= start as isize - 14 - i {
184                2
185            } else if j >= start as isize - 7 - i {
186                1
187            } else {
188                0
189            };
190
191            let index = calculate_kmer_index(motif_len as usize, wseq, j as usize);
192            let score = training.motif_weights[i as usize][spacendx][index];
193
194            if score > max_sc {
195                max_sc = score;
196                max_spacendx = spacendx;
197                max_spacer = spacer;
198                max_ndx = index;
199                max_len = motif_len;
200            }
201        }
202    }
203
204    // In stage 2, only accept good scoring motifs
205    let is_stage_two = stage == 2;
206    let is_poor_motif =
207        max_sc == MIN_MOTIF_SCORE || max_sc < training.no_motif_weight + MOTIF_THRESHOLD_OFFSET;
208
209    // Do NOT neutralize the score in early stages when no window is found.
210    // C keeps max_sc at -100.0 in this case, strongly disfavoring edge starts.
211    let effective_max_sc = max_sc;
212
213    node.motif_info.best_motif = if is_stage_two && is_poor_motif {
214        Motif {
215            index: 0,
216            length: 0,
217            space_index: 0,
218            spacer: 0,
219            score: training.no_motif_weight,
220        }
221    } else {
222        Motif {
223            index: max_ndx,
224            length: max_len as usize,
225            space_index: max_spacendx,
226            spacer: max_spacer as usize,
227            score: effective_max_sc,
228        }
229    };
230}
231
232#[cfg(test)]
233mod tests {
234    use super::*;
235    use crate::types::*;
236    use bio::bio_types::strand::Strand;
237
238    fn create_test_node(strand: Strand, index: usize, codon_type: CodonType) -> Node {
239        Node {
240            position: NodePosition {
241                index,
242                strand,
243                codon_type,
244                stop_value: (index + 100) as isize,
245                is_edge: false,
246            },
247            scores: NodeScores::default(),
248            state: NodeState::default(),
249            motif_info: NodeMotifInfo {
250                ribosome_binding_sites: [0; 2],
251                best_motif: Motif::default(),
252            },
253        }
254    }
255
256    fn create_test_training() -> Training {
257        Training {
258            gc_content: 0.5,
259            translation_table: 11,
260            uses_shine_dalgarno: true,
261            start_type_weights: [1.0, 2.0, 3.0],
262            rbs_weights: Box::new([1.0; 28]),
263            upstream_composition: Box::new([[0.25; 4]; 32]),
264            motif_weights: Box::new([[[1.0; 4096]; 4]; 4]),
265            no_motif_weight: 0.5,
266            start_weight_factor: 4.35,
267            gc_bias_factors: [1.0; 3],
268            gene_dicodon_table: Box::new([0.0; 4096]),
269            total_dicodons: 0,
270        }
271    }
272
273    #[test]
274    fn test_calculate_rbs_scores() {
275        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3]; // 8-base sequence
276        let rbs_weights = [
277            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
278        ];
279        let pos = 2;
280        let target_pos = 6;
281
282        let scores = calculate_rbs_scores(&seq, pos, target_pos, &rbs_weights);
283
284        assert_eq!(scores.len(), 2);
285    }
286
287    #[test]
288    fn test_calculate_rbs_scores_boundary_conditions() {
289        let seq = vec![0, 1, 2, 3];
290        let rbs_weights = [1.0; 16];
291
292        // Test at start of sequence
293        let scores = calculate_rbs_scores(&seq, 0, 3, &rbs_weights);
294        assert_eq!(scores.len(), 2);
295
296        // Test near end of sequence
297        let scores = calculate_rbs_scores(&seq, 2, 3, &rbs_weights);
298        assert_eq!(scores.len(), 2);
299    }
300
301    #[test]
302    fn test_rbs_score_forward_strand() {
303        let seq = vec![
304            0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0,
305            1, 2, 3,
306        ];
307        let reverse_seq = vec![
308            3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3,
309            2, 1, 0,
310        ];
311        let training = create_test_training();
312
313        let mut nodes = vec![create_test_node(Strand::Forward, 25, CodonType::Atg)];
314
315        rbs_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
316
317        assert_eq!(nodes[0].motif_info.ribosome_binding_sites.len(), 2);
318    }
319
320    #[test]
321    fn test_rbs_score_reverse_strand() {
322        // Create a longer sequence to handle reverse strand calculations
323        let seq = vec![0; 60]; // 60-base sequence
324        let reverse_seq = vec![3; 60];
325        let training = create_test_training();
326
327        let mut nodes = vec![
328            create_test_node(Strand::Reverse, 35, CodonType::Atg), // Position far from edges
329        ];
330
331        rbs_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
332
333        assert_eq!(nodes[0].motif_info.ribosome_binding_sites.len(), 2);
334    }
335
336    #[test]
337    fn test_rbs_score_stop_codon_skipped() {
338        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
339        let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0];
340        let training = create_test_training();
341
342        let mut nodes = vec![create_test_node(Strand::Forward, 6, CodonType::Stop)];
343
344        let original_rbs = nodes[0].motif_info.ribosome_binding_sites;
345        rbs_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
346
347        assert_eq!(nodes[0].motif_info.ribosome_binding_sites, original_rbs);
348    }
349
350    #[test]
351    fn test_rbs_score_edge_node_skipped() {
352        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3];
353        let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0];
354        let training = create_test_training();
355
356        let mut node = create_test_node(Strand::Forward, 6, CodonType::Atg);
357        node.position.is_edge = true;
358        let mut nodes = vec![node];
359
360        let original_rbs = nodes[0].motif_info.ribosome_binding_sites;
361        rbs_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
362
363        assert_eq!(nodes[0].motif_info.ribosome_binding_sites, original_rbs);
364    }
365
366    #[test]
367    fn test_rbs_score_parallel_processing() {
368        // Create a long sequence to handle all positions safely
369        let seq = vec![0; 80];
370        let reverse_seq = vec![3; 80];
371        let training = create_test_training();
372
373        let mut nodes = vec![
374            create_test_node(Strand::Forward, 30, CodonType::Atg),
375            create_test_node(Strand::Reverse, 50, CodonType::Atg),
376            create_test_node(Strand::Forward, 70, CodonType::Atg),
377        ];
378
379        rbs_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
380
381        for node in &nodes {
382            assert_eq!(node.motif_info.ribosome_binding_sites.len(), 2);
383        }
384    }
385
386    #[test]
387    fn test_find_best_upstream_motif_stop_codon() {
388        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3];
389        let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0];
390        let training = create_test_training();
391
392        let mut node = create_test_node(Strand::Forward, 6, CodonType::Stop);
393        let original_motif_index = node.motif_info.best_motif.index;
394        let original_motif_length = node.motif_info.best_motif.length;
395
396        find_best_upstream_motif(&training, &seq, &reverse_seq, seq.len(), &mut node, 1);
397
398        assert_eq!(node.motif_info.best_motif.index, original_motif_index);
399        assert_eq!(node.motif_info.best_motif.length, original_motif_length);
400    }
401
402    #[test]
403    fn test_find_best_upstream_motif_edge_node() {
404        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3];
405        let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0];
406        let training = create_test_training();
407
408        let mut node = create_test_node(Strand::Forward, 6, CodonType::Atg);
409        node.position.is_edge = true;
410        let original_motif_index = node.motif_info.best_motif.index;
411        let original_motif_length = node.motif_info.best_motif.length;
412
413        find_best_upstream_motif(&training, &seq, &reverse_seq, seq.len(), &mut node, 1);
414
415        assert_eq!(node.motif_info.best_motif.index, original_motif_index);
416        assert_eq!(node.motif_info.best_motif.length, original_motif_length);
417    }
418
419    #[test]
420    fn test_find_best_upstream_motif_forward_strand() {
421        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
422        let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0];
423        let training = create_test_training();
424
425        let mut node = create_test_node(Strand::Forward, 15, CodonType::Atg);
426
427        find_best_upstream_motif(&training, &seq, &reverse_seq, seq.len(), &mut node, 1);
428
429        // length and spacer are usize, so always >= 0
430    }
431
432    #[test]
433    fn test_find_best_upstream_motif_reverse_strand() {
434        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
435        let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0];
436        let training = create_test_training();
437
438        let mut node = create_test_node(Strand::Reverse, 15, CodonType::Atg);
439
440        find_best_upstream_motif(&training, &seq, &reverse_seq, seq.len(), &mut node, 1);
441
442        // length and spacer are usize, so always >= 0
443    }
444
445    #[test]
446    fn test_find_best_upstream_motif_stage_two_filtering() {
447        let mut training = create_test_training();
448        training.no_motif_weight = 10.0; // High no-motif weight
449
450        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
451        let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0];
452
453        let mut node = create_test_node(Strand::Forward, 15, CodonType::Atg);
454
455        find_best_upstream_motif(&training, &seq, &reverse_seq, seq.len(), &mut node, 2);
456
457        if node.motif_info.best_motif.score < training.no_motif_weight + MOTIF_THRESHOLD_OFFSET {
458            assert_eq!(node.motif_info.best_motif.length, 0);
459            assert_eq!(node.motif_info.best_motif.score, training.no_motif_weight);
460        }
461    }
462
463    #[test]
464    fn test_find_best_upstream_motif_stage_one_accepts_all() {
465        let training = create_test_training();
466        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
467        let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0];
468
469        let mut node = create_test_node(Strand::Forward, 15, CodonType::Atg);
470
471        find_best_upstream_motif(&training, &seq, &reverse_seq, seq.len(), &mut node, 1);
472
473        // length is usize, so always >= 0
474    }
475
476    #[test]
477    fn test_find_best_upstream_motif_empty_sequence() {
478        let seq = vec![];
479        let reverse_seq = vec![];
480        let training = create_test_training();
481
482        let mut node = create_test_node(Strand::Forward, 0, CodonType::Atg);
483
484        find_best_upstream_motif(&training, &seq, &reverse_seq, 0, &mut node, 1);
485
486        // Should handle empty sequence gracefully
487        // length is usize, so always >= 0
488    }
489
490    #[test]
491    fn test_find_best_upstream_motif_short_sequence() {
492        let seq = vec![0, 1, 2];
493        let reverse_seq = vec![2, 1, 0];
494        let training = create_test_training();
495
496        let mut node = create_test_node(Strand::Forward, 2, CodonType::Atg);
497
498        find_best_upstream_motif(&training, &seq, &reverse_seq, seq.len(), &mut node, 1);
499
500        // Should handle short sequences without panicking
501        // length is usize, so always >= 0
502    }
503
504    #[test]
505    fn test_motif_spacer_classification() {
506        let training = create_test_training();
507        let seq = vec![
508            0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
509        ];
510        let reverse_seq = vec![
511            3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0,
512        ];
513
514        let mut node = create_test_node(Strand::Forward, 20, CodonType::Atg);
515
516        find_best_upstream_motif(&training, &seq, &reverse_seq, seq.len(), &mut node, 1);
517
518        assert!(node.motif_info.best_motif.space_index <= 3);
519    }
520
521    #[test]
522    fn test_motif_length_range() {
523        let training = create_test_training();
524        let seq = vec![
525            0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
526        ];
527        let reverse_seq = vec![
528            3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0,
529        ];
530
531        let mut node = create_test_node(Strand::Forward, 20, CodonType::Atg);
532
533        find_best_upstream_motif(&training, &seq, &reverse_seq, seq.len(), &mut node, 1);
534
535        assert!(node.motif_info.best_motif.length <= 6);
536    }
537}