1use crate::{
2 constants::{
3 CODING_SCORE_SENTINEL, CODING_SCORE_THRESHOLD, DICODON_SIZE, EDGE_BONUS,
4 EDGE_POSITION_OFFSET, EDGE_POSITION_THRESHOLD, EDGE_UPSTREAM, GENE_SIZE_SCALING_FACTOR,
5 LENGTH_FACTOR_MULTIPLIER, LENGTH_FACTOR_THRESHOLD, MAX_GENE_SIZE_CODONS,
6 METAGENOMIC_LENGTH_THRESHOLD, METAGENOMIC_MIN_LENGTH_FACTOR, METAGENOMIC_PENALTY,
7 METAGENOMIC_PENALTY_DIVISOR, MIN_GENE_SIZE_CODONS, MIN_META_GENE_LENGTH,
8 NEGATIVE_SCORE_PENALTY, NO_MOTIF_THRESHOLD, NODE_SEARCH_WINDOW, SHORT_GENE_THRESHOLD,
9 UPSTREAM_COMPOSITION_WEIGHT, UPSTREAM_SCAN_RANGE, UPSTREAM_SKIP_END, UPSTREAM_SKIP_START,
10 },
11 node::{calc_orf_gc, find_best_upstream_motif},
12 rbs_score,
13 sequence::encoded::EncodedSequence,
14 sequence::{calculate_kmer_index, is_stop},
15 types::{CodonType, Node, OrphosError, Training},
16};
17
18use bio::bio_types::strand::Strand;
19
20#[derive(Debug)]
22pub struct ScoringContext<'a> {
23 pub encoded_sequence: &'a EncodedSequence,
25 pub training: &'a Training,
27 pub closed: bool,
29 pub is_meta: bool,
31 pub upstream_scan_indices: Vec<usize>,
33}
34
35impl<'a> ScoringContext<'a> {
36 pub fn new(
38 encoded_sequence: &'a EncodedSequence,
39 training: &'a Training,
40 closed: bool,
41 is_meta: bool,
42 ) -> Self {
43 let upstream_scan_indices: Vec<usize> = (1..UPSTREAM_SCAN_RANGE)
45 .filter(|&i| i <= UPSTREAM_SKIP_START || i >= UPSTREAM_SKIP_END)
46 .collect();
47
48 Self {
49 encoded_sequence,
50 training,
51 closed,
52 is_meta,
53 upstream_scan_indices,
54 }
55 }
56}
57
58#[derive(Debug, Clone)]
60pub struct ScoringParams {
61 pub edge_upstream_bonus: f64,
63 pub edge_bonus_factor: f64,
65 pub penalty_factor: f64,
67 pub start_weight_factor: f64,
69 pub metagenomic_penalty_coeff: f64,
71 pub no_stop_probability: f64,
73}
74
75impl ScoringParams {
76 pub fn from_training(training: &Training) -> Self {
78 let start_weight_factor = training.start_weight_factor;
79 let edge_upstream_bonus = EDGE_UPSTREAM * start_weight_factor;
80 let edge_bonus_factor = EDGE_BONUS * start_weight_factor;
81 let penalty_factor = NEGATIVE_SCORE_PENALTY * edge_bonus_factor;
82 let metagenomic_penalty_coeff = METAGENOMIC_PENALTY / METAGENOMIC_PENALTY_DIVISOR;
83 let no_stop_probability =
84 calculate_no_stop_probability(training.gc_content, training.translation_table);
85
86 Self {
87 edge_upstream_bonus,
88 edge_bonus_factor,
89 penalty_factor,
90 start_weight_factor,
91 metagenomic_penalty_coeff,
92 no_stop_probability,
93 }
94 }
95}
96
97#[derive(Debug, Clone)]
99pub struct NodeCache {
100 pub gene_length: usize,
102 pub is_boundary_node: bool,
104 pub short_gene_factors: Option<(f64, f64)>,
106 pub needs_metagenomic_penalty: bool,
108}
109
110impl NodeCache {
111 pub fn new(node: &Node, context: &ScoringContext) -> Self {
113 let gene_length = calculate_gene_length(node);
114
115 let is_boundary_node = is_node_at_boundary(node, context);
116
117 let short_gene_factors = if gene_length < SHORT_GENE_THRESHOLD {
118 Some(calculate_short_gene_factors(gene_length))
119 } else {
120 None
121 };
122
123 let needs_metagenomic_penalty = context.is_meta
124 && context.encoded_sequence.sequence_length < METAGENOMIC_LENGTH_THRESHOLD
125 && (node.scores.coding_score < CODING_SCORE_THRESHOLD
126 || gene_length < MIN_META_GENE_LENGTH);
127
128 Self {
129 gene_length,
130 is_boundary_node,
131 short_gene_factors,
132 needs_metagenomic_penalty,
133 }
134 }
135}
136
137fn is_node_at_boundary(node: &Node, context: &ScoringContext) -> bool {
139 (node.position.index <= EDGE_POSITION_THRESHOLD && node.position.strand == Strand::Forward)
140 || (node.position.index >= context.encoded_sequence.sequence_length - EDGE_POSITION_OFFSET
141 && node.position.strand == Strand::Reverse)
142}
143
144fn calculate_edge_gene_status(node: &Node, context: &ScoringContext) -> usize {
150 let mut edge_gene = 0;
151
152 if node.position.is_edge {
153 edge_gene += 1;
154 }
155
156 if (node.position.strand == Strand::Forward
158 && (node.position.stop_value < 0
159 || !is_stop(
160 &context.encoded_sequence.forward_sequence,
161 node.position.stop_value as usize,
162 context.training,
163 )))
164 || (node.position.strand == Strand::Reverse
165 && (node.position.stop_value < 0
166 || !is_stop(
167 &context.encoded_sequence.reverse_complement_sequence,
168 context.encoded_sequence.sequence_length
169 - 1
170 - node.position.stop_value as usize,
171 context.training,
172 )))
173 {
174 edge_gene += 1;
175 }
176
177 edge_gene
178}
179
180fn calculate_type_scores(
185 node: &mut Node,
186 edge_gene: usize,
187 params: &ScoringParams,
188 training: &Training,
189) {
190 if node.position.is_edge {
191 node.scores.type_score = params.edge_bonus_factor / edge_gene as f64;
192 node.scores.upstream_score = 0.0;
193 node.scores.ribosome_binding_score = 0.0;
194 } else {
195 node.scores.type_score = training.start_type_weights[node.position.codon_type.to_index()]
197 * params.start_weight_factor;
198 }
199}
200
201fn calculate_rbs_scores(node: &mut Node, training: &Training, params: &ScoringParams) {
205 if node.position.is_edge {
206 return; }
208
209 let rbs1 = training.rbs_weights[node.motif_info.ribosome_binding_sites[0]];
211 let rbs2 = training.rbs_weights[node.motif_info.ribosome_binding_sites[1]];
212 let sd_score = rbs1.max(rbs2) * params.start_weight_factor;
213
214 if training.uses_shine_dalgarno {
215 node.scores.ribosome_binding_score = sd_score;
216 } else {
217 node.scores.ribosome_binding_score =
218 params.start_weight_factor * node.motif_info.best_motif.score;
219 if node.scores.ribosome_binding_score < sd_score
220 && training.no_motif_weight > NO_MOTIF_THRESHOLD
221 {
222 node.scores.ribosome_binding_score = sd_score;
223 }
224 }
225}
226
227fn calculate_upstream_scores(
232 node: &mut Node,
233 context: &ScoringContext,
234 params: &ScoringParams,
235 cache: &NodeCache,
236) {
237 if node.position.is_edge {
238 return; }
240
241 if node.position.strand == Strand::Forward {
242 node.scores.upstream_score =
243 score_upstream_composition(&context.encoded_sequence.forward_sequence, node, context);
244 } else {
245 node.scores.upstream_score = score_upstream_composition(
246 &context.encoded_sequence.reverse_complement_sequence,
247 node,
248 context,
249 );
250 }
251
252 if !context.closed && cache.is_boundary_node {
254 node.scores.upstream_score += params.edge_upstream_bonus;
255 }
256}
257
258fn apply_edge_upstream_bonus_for_neighbors(
263 nodes: &mut [Node],
264 current_index: usize,
265 node_count: usize,
266 params: &ScoringParams,
267) {
268 let current_node = &nodes[current_index];
269
270 if current_node.position.is_edge {
271 return;
272 }
273
274 if current_index < NODE_SEARCH_WINDOW && current_node.position.strand == Strand::Forward {
275 for j in (0..current_index).rev() {
277 if nodes[j].position.is_edge
278 && current_node.position.stop_value == nodes[j].position.stop_value
279 {
280 nodes[current_index].scores.upstream_score += params.edge_upstream_bonus;
281 break;
282 }
283 }
284 } else if current_index >= node_count.saturating_sub(NODE_SEARCH_WINDOW)
285 && current_node.position.strand == Strand::Reverse
286 {
287 for j in (current_index + 1)..node_count {
289 if nodes[j].position.is_edge
290 && current_node.position.stop_value == nodes[j].position.stop_value
291 {
292 nodes[current_index].scores.upstream_score += params.edge_upstream_bonus;
293 break;
294 }
295 }
296 }
297}
298
299fn convert_boundary_nodes_to_edge_genes(
304 node: &mut Node,
305 context: &ScoringContext,
306 edge_gene: &mut usize,
307 params: &ScoringParams,
308 cache: &NodeCache,
309) {
310 if cache.is_boundary_node && !node.position.is_edge && !context.closed {
312 *edge_gene += 1;
313 node.position.is_edge = true;
314 node.scores.type_score = 0.0;
315 let new_upstream_score = params.edge_bonus_factor / *edge_gene as f64;
316 node.scores.upstream_score = new_upstream_score;
317 node.scores.ribosome_binding_score = 0.0;
318 }
319}
320
321fn apply_penalties_and_bonuses(
329 node: &mut Node,
330 edge_gene: usize,
331 context: &ScoringContext,
332 params: &ScoringParams,
333 cache: &NodeCache,
334) {
335 if !node.position.is_edge && edge_gene == 1 {
337 node.scores.upstream_score -= params.penalty_factor;
338 }
339
340 if edge_gene == 0
342 && let Some((negative_factor, positive_factor)) = cache.short_gene_factors
343 {
344 if node.scores.ribosome_binding_score < 0.0 {
345 node.scores.ribosome_binding_score *= negative_factor;
346 }
347 if node.scores.upstream_score < 0.0 {
348 node.scores.upstream_score *= negative_factor;
349 }
350 if node.scores.type_score < 0.0 {
351 node.scores.type_score *= negative_factor;
352 }
353
354 if node.scores.ribosome_binding_score > 0.0 {
355 node.scores.ribosome_binding_score *= positive_factor;
356 }
357 if node.scores.upstream_score > 0.0 {
358 node.scores.upstream_score *= positive_factor;
359 }
360 if node.scores.type_score > 0.0 {
361 node.scores.type_score *= positive_factor;
362 }
363 }
364
365 if cache.needs_metagenomic_penalty && edge_gene == 0 {
367 let penalty = params.metagenomic_penalty_coeff
368 * (METAGENOMIC_LENGTH_THRESHOLD - context.encoded_sequence.sequence_length) as f64;
369 node.scores.coding_score -= penalty;
370 }
371
372 node.scores.start_score =
374 node.scores.type_score + node.scores.ribosome_binding_score + node.scores.upstream_score;
375
376 if node.scores.coding_score < 0.0 {
378 if edge_gene > 0 && !node.position.is_edge {
379 if !context.is_meta
380 || context.encoded_sequence.sequence_length > METAGENOMIC_MIN_LENGTH_FACTOR
381 {
382 node.scores.start_score -= params.start_weight_factor;
383 } else {
384 let penalty =
385 10.31f64.mul_add(-(context.encoded_sequence.sequence_length as f64), 0.004);
386 node.scores.start_score -= penalty;
387 }
388 } else if context.is_meta
389 && context.encoded_sequence.sequence_length < METAGENOMIC_LENGTH_THRESHOLD
390 && node.position.is_edge
391 {
392 let min_meta_length =
393 calculate_min_metagenomic_length(context.encoded_sequence.sequence_length);
394 if cache.gene_length as f64 >= min_meta_length {
395 if node.scores.coding_score >= 0.0 {
396 node.scores.coding_score = -1.0;
397 }
398 node.scores.start_score = 0.0;
399 node.scores.upstream_score = 0.0;
400 }
401 } else {
402 node.scores.start_score -= NEGATIVE_SCORE_PENALTY;
403 }
404 } else if node.scores.coding_score < CODING_SCORE_THRESHOLD
405 && context.is_meta
406 && cache.gene_length < MIN_META_GENE_LENGTH
407 && node.scores.start_score < 0.0
408 {
409 node.scores.start_score -= params.start_weight_factor;
410 }
411}
412
413fn calculate_final_scores(node: &mut Node) {
418 node.scores.total_score = node.scores.coding_score + node.scores.start_score;
419}
420
421pub fn score_nodes(
423 encoded_sequence: &EncodedSequence,
424 nodes: &mut [Node],
425 training: &Training,
426 closed: bool,
427 is_meta: bool,
428) -> Result<(), OrphosError> {
429 let node_count = nodes.len();
430
431 if node_count == 0 {
433 return Ok(());
434 }
435
436 let context = ScoringContext::new(encoded_sequence, training, closed, is_meta);
438 let params = ScoringParams::from_training(training);
439
440 calc_orf_gc(
441 &encoded_sequence.forward_sequence,
442 encoded_sequence.sequence_length,
443 nodes,
444 );
445 raw_coding_score_optimized(
446 &encoded_sequence.forward_sequence,
447 &encoded_sequence.reverse_complement_sequence,
448 encoded_sequence.sequence_length,
449 nodes,
450 training,
451 params.no_stop_probability,
452 );
453
454 if training.uses_shine_dalgarno {
456 rbs_score(
457 &encoded_sequence.forward_sequence,
458 &encoded_sequence.reverse_complement_sequence,
459 encoded_sequence.sequence_length,
460 nodes,
461 training,
462 );
463 } else {
464 for node in nodes.iter_mut() {
466 if node.position.codon_type != CodonType::Stop && !node.position.is_edge {
467 find_best_upstream_motif(
468 training,
469 &encoded_sequence.forward_sequence,
470 &encoded_sequence.reverse_complement_sequence,
471 encoded_sequence.sequence_length,
472 node,
473 2,
474 );
475 }
476 }
477 }
478
479 for i in 0..node_count {
481 if nodes[i].position.codon_type == CodonType::Stop {
482 continue;
483 }
484
485 let cache = NodeCache::new(&nodes[i], &context);
487
488 let mut edge_gene = calculate_edge_gene_status(&nodes[i], &context);
490
491 calculate_type_scores(&mut nodes[i], edge_gene, ¶ms, training);
493
494 calculate_rbs_scores(&mut nodes[i], training, ¶ms);
495
496 calculate_upstream_scores(&mut nodes[i], &context, ¶ms, &cache);
497
498 apply_edge_upstream_bonus_for_neighbors(nodes, i, node_count, ¶ms);
500
501 convert_boundary_nodes_to_edge_genes(
503 &mut nodes[i],
504 &context,
505 &mut edge_gene,
506 ¶ms,
507 &cache,
508 );
509
510 calculate_final_scores(&mut nodes[i]);
512
513 apply_penalties_and_bonuses(&mut nodes[i], edge_gene, &context, ¶ms, &cache);
515
516 nodes[i].scores.total_score = nodes[i].scores.coding_score + nodes[i].scores.start_score;
518 }
519
520 Ok(())
521}
522
523fn score_upstream_composition(seq: &[u8], node: &Node, context: &ScoringContext) -> f64 {
525 let start = if node.position.strand == Strand::Forward {
526 node.position.index
527 } else {
528 context.encoded_sequence.sequence_length - 1 - node.position.index
529 };
530
531 let mut upstream_score = 0.0;
532 let mut count = 0;
533
534 for &i in &context.upstream_scan_indices {
536 if start < i {
537 continue;
538 }
539
540 let pos = start - i;
541
542 let mer_idx = calculate_kmer_index(1, seq, pos);
544
545 upstream_score += UPSTREAM_COMPOSITION_WEIGHT
547 * context.training.start_weight_factor
548 * context.training.upstream_composition[count][mer_idx];
549 count += 1;
550 }
551 upstream_score
552}
553
554const fn calculate_gene_length(node: &Node) -> usize {
558 if node.position.stop_value < 0 {
559 (node.position.index as isize - node.position.stop_value) as usize
560 } else {
561 (node.position.stop_value as usize).abs_diff(node.position.index)
562 }
563}
564
565fn calculate_gene_size_codons(node: &Node) -> f64 {
569 ((calculate_gene_length(node) + 3) as f64) / 3.0
570}
571
572fn calculate_short_gene_factors(gene_length: usize) -> (f64, f64) {
576 let negative_factor = SHORT_GENE_THRESHOLD as f64 / gene_length as f64;
577 let positive_factor = gene_length as f64 / SHORT_GENE_THRESHOLD as f64;
578 (negative_factor, positive_factor)
579}
580
581fn calculate_min_metagenomic_length(sequence_length: usize) -> f64 {
585 (sequence_length as f64).sqrt() * 5.0
586}
587
588fn calculate_no_stop_probability(gc_content: f64, translation_table: i32) -> f64 {
594 let at_content = 1.0 - gc_content;
595 let at_squared = at_content * at_content;
596
597 if translation_table != 11 {
598 let at_stop_prob = (at_squared * gc_content) / 8.0;
602 let mixed_stop_prob = (at_squared * at_content) / 8.0;
604
605 1.0 - (at_stop_prob + mixed_stop_prob)
606 } else {
607 let tag_taa_prob = (at_squared * gc_content) / 4.0;
611 let tga_prob = (at_squared * at_content) / 8.0;
613
614 1.0 - (tag_taa_prob + tga_prob)
615 }
616}
617
618fn process_strand_first_pass(
622 nodes: &mut [Node],
623 encoded_sequence: &[u8],
624 sequence_length: usize,
625 strand: Strand,
626 training: &Training,
627 frame_scores: &mut [f64; 3],
628 last_positions: &mut [usize; 3],
629) {
630 frame_scores.fill(0.0);
631
632 if strand == Strand::Forward {
633 for i in (0..nodes.len()).rev() {
635 if nodes[i].position.strand != Strand::Forward {
636 continue;
637 }
638
639 let frame_index = nodes[i].position.index % 3;
640
641 if nodes[i].position.codon_type == CodonType::Stop {
642 last_positions[frame_index] = nodes[i].position.index;
643 frame_scores[frame_index] = 0.0;
644 } else {
645 if last_positions[frame_index] >= 3 {
647 let mut j = last_positions[frame_index] - 3;
648 loop {
649 if j < nodes[i].position.index {
650 break;
651 }
652 let mer_idx = calculate_kmer_index(DICODON_SIZE, encoded_sequence, j);
653 let dicodon_score = training.gene_dicodon_table[mer_idx];
654 frame_scores[frame_index] += dicodon_score;
655
656 if j < 3 {
657 break;
658 }
659 j -= 3;
660 }
661 }
662 nodes[i].scores.coding_score = frame_scores[frame_index];
663 last_positions[frame_index] = nodes[i].position.index;
664 }
665 }
666 } else {
667 for node in nodes.iter_mut() {
669 if node.position.strand != Strand::Reverse {
670 continue;
671 }
672
673 let frame_index = node.position.index % 3;
674
675 if node.position.codon_type == CodonType::Stop {
676 last_positions[frame_index] = node.position.index;
677 frame_scores[frame_index] = 0.0;
678 } else {
679 let mut j = last_positions[frame_index] + 3;
681 while j <= node.position.index {
682 let reverse_seq_pos = sequence_length - j - 1;
683 let mer_idx =
684 calculate_kmer_index(DICODON_SIZE, encoded_sequence, reverse_seq_pos);
685 let dicodon_score = training.gene_dicodon_table[mer_idx];
686 frame_scores[frame_index] += dicodon_score;
687 j += 3;
688 }
689 node.scores.coding_score = frame_scores[frame_index];
690 last_positions[frame_index] = node.position.index;
691 }
692 }
693 }
694}
695
696fn process_strand_second_pass(nodes: &mut [Node], strand: Strand, frame_scores: &mut [f64; 3]) {
700 frame_scores.fill(CODING_SCORE_SENTINEL);
701
702 if strand == Strand::Forward {
703 for node in nodes.iter_mut() {
705 if node.position.strand != Strand::Forward {
706 continue;
707 }
708
709 let frame_index = node.position.index % 3;
710
711 if node.position.codon_type == CodonType::Stop {
712 frame_scores[frame_index] = CODING_SCORE_SENTINEL;
713 } else if node.scores.coding_score > frame_scores[frame_index] {
714 frame_scores[frame_index] = node.scores.coding_score;
715 } else {
716 node.scores.coding_score -= frame_scores[frame_index] - node.scores.coding_score;
717 }
718 }
719 } else {
720 for i in (0..nodes.len()).rev() {
722 if nodes[i].position.strand != Strand::Reverse {
723 continue;
724 }
725
726 let frame_index = nodes[i].position.index % 3;
727
728 if nodes[i].position.codon_type == CodonType::Stop {
729 frame_scores[frame_index] = CODING_SCORE_SENTINEL;
730 } else if nodes[i].scores.coding_score > frame_scores[frame_index] {
731 frame_scores[frame_index] = nodes[i].scores.coding_score;
732 } else {
733 nodes[i].scores.coding_score -=
734 frame_scores[frame_index] - nodes[i].scores.coding_score;
735 }
736 }
737 }
738}
739
740fn calculate_length_factor(gene_size_codons: f64, no_stop_probability: f64) -> f64 {
745 if gene_size_codons > MAX_GENE_SIZE_CODONS as f64 {
746 let mut factor = ((1.0 - no_stop_probability.powi(MAX_GENE_SIZE_CODONS))
748 / no_stop_probability.powi(MAX_GENE_SIZE_CODONS))
749 .ln();
750 factor -= ((1.0 - no_stop_probability.powi(MIN_GENE_SIZE_CODONS))
751 / no_stop_probability.powi(MIN_GENE_SIZE_CODONS))
752 .ln();
753 factor *= (gene_size_codons - MIN_GENE_SIZE_CODONS as f64) / GENE_SIZE_SCALING_FACTOR;
754 factor
755 } else {
756 let mut factor = ((1.0 - no_stop_probability.powf(gene_size_codons))
758 / no_stop_probability.powf(gene_size_codons))
759 .ln();
760 factor -= ((1.0 - no_stop_probability.powi(MIN_GENE_SIZE_CODONS))
761 / no_stop_probability.powi(MIN_GENE_SIZE_CODONS))
762 .ln();
763 factor
764 }
765}
766
767fn process_strand_third_pass(
771 nodes: &mut [Node],
772 strand: Strand,
773 no_stop_probability: f64,
774 frame_scores: &mut [f64; 3],
775) {
776 frame_scores.fill(CODING_SCORE_SENTINEL);
777
778 let process_node = |node: &mut Node, frame_scores: &mut [f64; 3]| {
779 if node.position.strand != strand {
780 return;
781 }
782
783 let frame_index = node.position.index % 3;
784
785 if node.position.codon_type == CodonType::Stop {
786 frame_scores[frame_index] = CODING_SCORE_SENTINEL;
787 } else {
788 let gene_size_codons = calculate_gene_size_codons(node);
789 let length_factor = calculate_length_factor(gene_size_codons, no_stop_probability);
790 let mut adjusted_length_factor = length_factor;
791
792 if adjusted_length_factor > frame_scores[frame_index] {
794 frame_scores[frame_index] = adjusted_length_factor;
795 } else {
796 let temp = (frame_scores[frame_index] - adjusted_length_factor)
797 .min(adjusted_length_factor)
798 .max(0.0);
799 adjusted_length_factor -= temp;
800 }
801
802 if adjusted_length_factor > LENGTH_FACTOR_THRESHOLD
804 && node.scores.coding_score < LENGTH_FACTOR_MULTIPLIER * adjusted_length_factor
805 {
806 node.scores.coding_score = LENGTH_FACTOR_MULTIPLIER * adjusted_length_factor;
807 }
808 node.scores.coding_score += adjusted_length_factor;
809 }
810 };
811
812 if strand == Strand::Forward {
813 for node in nodes.iter_mut() {
815 process_node(node, frame_scores);
816 }
817 } else {
818 for i in (0..nodes.len()).rev() {
820 process_node(&mut nodes[i], frame_scores);
821 }
822 }
823}
824
825pub fn raw_coding_score_optimized(
832 encoded_sequence: &[u8],
833 reverse_complement_encoded_sequence: &[u8],
834 sequence_length: usize,
835 nodes: &mut [Node],
836 training: &Training,
837 no_stop_probability: f64,
838) {
839 let mut frame_scores = [0.0; 3];
840 let mut last_positions = [0usize; 3];
841
842 process_strand_first_pass(
844 nodes,
845 encoded_sequence,
846 sequence_length,
847 Strand::Forward,
848 training,
849 &mut frame_scores,
850 &mut last_positions,
851 );
852
853 process_strand_first_pass(
854 nodes,
855 reverse_complement_encoded_sequence,
856 sequence_length,
857 Strand::Reverse,
858 training,
859 &mut frame_scores,
860 &mut last_positions,
861 );
862
863 process_strand_second_pass(nodes, Strand::Forward, &mut frame_scores);
865 process_strand_second_pass(nodes, Strand::Reverse, &mut frame_scores);
866
867 process_strand_third_pass(
869 nodes,
870 Strand::Forward,
871 no_stop_probability,
872 &mut frame_scores,
873 );
874 process_strand_third_pass(
875 nodes,
876 Strand::Reverse,
877 no_stop_probability,
878 &mut frame_scores,
879 );
880}
881
882pub fn raw_coding_score(
889 encoded_sequence: &[u8],
890 reverse_complement_encoded_sequence: &[u8],
891 sequence_length: usize,
892 nodes: &mut [Node],
893 training: &Training,
894) {
895 let no_stop_probability =
896 calculate_no_stop_probability(training.gc_content, training.translation_table);
897 let mut frame_scores = [0.0; 3];
898 let mut last_positions = [0usize; 3];
899
900 process_strand_first_pass(
902 nodes,
903 encoded_sequence,
904 sequence_length,
905 Strand::Forward,
906 training,
907 &mut frame_scores,
908 &mut last_positions,
909 );
910
911 process_strand_first_pass(
912 nodes,
913 reverse_complement_encoded_sequence,
914 sequence_length,
915 Strand::Reverse,
916 training,
917 &mut frame_scores,
918 &mut last_positions,
919 );
920
921 process_strand_second_pass(nodes, Strand::Forward, &mut frame_scores);
923 process_strand_second_pass(nodes, Strand::Reverse, &mut frame_scores);
924
925 process_strand_third_pass(
927 nodes,
928 Strand::Forward,
929 no_stop_probability,
930 &mut frame_scores,
931 );
932 process_strand_third_pass(
933 nodes,
934 Strand::Reverse,
935 no_stop_probability,
936 &mut frame_scores,
937 );
938}
939
940#[cfg(test)]
941mod tests {
942 use super::*;
943 use crate::{constants::EXPECTED_NO_STOP_PROB, types::*};
944 use bio::bio_types::strand::Strand;
945
946 fn create_test_node(
947 strand: Strand,
948 index: usize,
949 codon_type: CodonType,
950 stop_value: isize,
951 ) -> Node {
952 Node {
953 position: NodePosition {
954 index,
955 strand,
956 codon_type,
957 stop_value,
958 is_edge: false,
959 },
960 scores: NodeScores::default(),
961 state: NodeState::default(),
962 motif_info: NodeMotifInfo {
963 ribosome_binding_sites: [0; 2],
964 best_motif: Motif::default(),
965 },
966 }
967 }
968
969 fn create_test_training() -> Training {
970 Training {
971 gc_content: 0.5,
972 translation_table: 11,
973 uses_shine_dalgarno: true,
974 start_type_weights: [2.0, 1.5, 1.0],
975 rbs_weights: Box::new([1.0; 28]),
976 upstream_composition: Box::new([[0.25; 4]; 32]),
977 motif_weights: Box::new([[[1.0; 4096]; 4]; 4]),
978 no_motif_weight: 0.5,
979 start_weight_factor: 4.35,
980 gc_bias_factors: [1.0; 3],
981 gene_dicodon_table: Box::new([1.0; 4096]),
982 total_dicodons: 0,
983 }
984 }
985
986 #[test]
987 fn test_score_nodes_basic() {
988 let seq = vec![0; 60];
989 let reverse_seq = vec![3; 60];
990 let training = create_test_training();
991
992 let mut nodes = vec![create_test_node(Strand::Forward, 30, CodonType::Atg, 45)];
993
994 let encoded_sequence = EncodedSequence {
995 forward_sequence: seq,
996 reverse_complement_sequence: reverse_seq,
997 unknown_sequence: vec![0; 60],
998 masks: vec![],
999 gc_content: 0.5,
1000 sequence_length: 60,
1001 };
1002
1003 let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1004
1005 assert!(result.is_ok());
1006 assert!(nodes[0].scores.total_score.is_finite());
1007 }
1008
1009 #[test]
1010 fn test_score_nodes_edge_gene() {
1011 let seq = vec![0; 60];
1012 let reverse_seq = vec![3; 60];
1013 let training = create_test_training();
1014
1015 let mut nodes = vec![Node {
1017 position: NodePosition {
1018 index: 2,
1019 stop_value: 30,
1020 strand: Strand::Forward,
1021 codon_type: CodonType::Atg,
1022 is_edge: true, },
1024 scores: NodeScores::default(),
1025 state: NodeState::default(),
1026 motif_info: NodeMotifInfo {
1027 ribosome_binding_sites: [0; 2],
1028 best_motif: Motif::default(),
1029 },
1030 }];
1031
1032 let encoded_sequence = EncodedSequence {
1033 forward_sequence: seq,
1034 reverse_complement_sequence: reverse_seq,
1035 unknown_sequence: vec![0; 60],
1036 masks: vec![],
1037 gc_content: 0.5,
1038 sequence_length: 60,
1039 };
1040
1041 let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1042
1043 assert!(result.is_ok());
1044 assert!(nodes[0].position.is_edge);
1045 }
1046
1047 #[test]
1048 fn test_score_nodes_closed_mode() {
1049 let seq = vec![0; 40];
1050 let reverse_seq = vec![3; 40];
1051 let training = create_test_training();
1052
1053 let mut nodes = vec![create_test_node(Strand::Forward, 25, CodonType::Atg, 35)];
1054
1055 let encoded_sequence = EncodedSequence {
1056 forward_sequence: seq,
1057 reverse_complement_sequence: reverse_seq,
1058 unknown_sequence: vec![0; 40],
1059 masks: vec![],
1060 gc_content: 0.5,
1061 sequence_length: 40,
1062 };
1063
1064 let result = score_nodes(&encoded_sequence, &mut nodes, &training, true, false);
1065
1066 assert!(result.is_ok());
1067 assert!(!nodes[0].position.is_edge);
1068 }
1069
1070 #[test]
1071 fn test_score_nodes_metagenomic() {
1072 let seq = vec![0; 100]; let reverse_seq = vec![3; 100];
1074 let training = create_test_training();
1075
1076 let mut nodes = vec![create_test_node(Strand::Forward, 40, CodonType::Atg, 60)];
1077
1078 let encoded_sequence = EncodedSequence {
1079 forward_sequence: seq,
1080 reverse_complement_sequence: reverse_seq,
1081 unknown_sequence: vec![0; 100],
1082 masks: vec![],
1083 gc_content: 0.5,
1084 sequence_length: 100,
1085 };
1086
1087 let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, true);
1088
1089 assert!(result.is_ok());
1090 assert!(nodes[0].scores.total_score.is_finite());
1091 }
1092
1093 #[test]
1094 fn test_score_nodes_non_shine_dalgarno() {
1095 let seq = vec![0; 60];
1096 let reverse_seq = vec![3; 60];
1097 let mut training = create_test_training();
1098 training.uses_shine_dalgarno = false;
1099
1100 let mut nodes = vec![create_test_node(Strand::Forward, 30, CodonType::Atg, 45)];
1101
1102 let encoded_sequence = EncodedSequence {
1103 forward_sequence: seq,
1104 reverse_complement_sequence: reverse_seq,
1105 unknown_sequence: vec![0; 60],
1106 masks: vec![],
1107 gc_content: 0.5,
1108 sequence_length: 60,
1109 };
1110
1111 let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1112
1113 assert!(result.is_ok());
1114 assert!(nodes[0].scores.ribosome_binding_score.is_finite());
1115 }
1116
1117 #[test]
1118 fn test_score_nodes_stop_codon_skip() {
1119 let seq = vec![0; 40];
1120 let reverse_seq = vec![3; 40];
1121 let training = create_test_training();
1122
1123 let mut nodes = vec![create_test_node(Strand::Forward, 30, CodonType::Stop, 30)];
1124
1125 let encoded_sequence = EncodedSequence {
1126 forward_sequence: seq,
1127 reverse_complement_sequence: reverse_seq,
1128 unknown_sequence: vec![0; 40],
1129 masks: vec![],
1130 gc_content: 0.5,
1131 sequence_length: 40,
1132 };
1133
1134 let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1135
1136 assert!(result.is_ok());
1137 assert_eq!(nodes[0].scores.total_score, 0.0);
1138 }
1139
1140 #[test]
1141 fn test_score_upstream_composition() {
1142 let seq = vec![
1143 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0,
1144 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
1145 ];
1146 let training = create_test_training();
1147 let encoded_sequence = EncodedSequence {
1148 forward_sequence: seq.clone(),
1149 reverse_complement_sequence: seq.clone(),
1150 unknown_sequence: vec![0; seq.len()],
1151 masks: vec![],
1152 gc_content: 0.5,
1153 sequence_length: seq.len(),
1154 };
1155 let context = ScoringContext::new(&encoded_sequence, &training, false, false);
1156 let node = create_test_node(Strand::Forward, 45, CodonType::Atg, 60);
1157
1158 let score = score_upstream_composition(&seq, &node, &context);
1159
1160 assert!(score.is_finite());
1161 }
1162
1163 #[test]
1164 fn test_score_upstream_composition_reverse() {
1165 let seq = vec![
1166 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0,
1167 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
1168 ];
1169 let training = create_test_training();
1170 let encoded_sequence = EncodedSequence {
1171 forward_sequence: seq.clone(),
1172 reverse_complement_sequence: seq.clone(),
1173 unknown_sequence: vec![0; seq.len()],
1174 masks: vec![],
1175 gc_content: 0.5,
1176 sequence_length: seq.len(),
1177 };
1178 let context = ScoringContext::new(&encoded_sequence, &training, false, false);
1179 let node = create_test_node(Strand::Reverse, 45, CodonType::Atg, 30);
1180
1181 let score = score_upstream_composition(&seq, &node, &context);
1182
1183 assert!(score.is_finite());
1184 }
1185
1186 #[test]
1187 fn test_score_upstream_composition_near_start() {
1188 let seq = vec![0, 1, 2, 3, 0, 1, 2, 3];
1189 let training = create_test_training();
1190 let encoded_sequence = EncodedSequence {
1191 forward_sequence: seq.clone(),
1192 reverse_complement_sequence: seq.clone(),
1193 unknown_sequence: vec![0; seq.len()],
1194 masks: vec![],
1195 gc_content: 0.5,
1196 sequence_length: seq.len(),
1197 };
1198 let context = ScoringContext::new(&encoded_sequence, &training, false, false);
1199 let node = create_test_node(Strand::Forward, 3, CodonType::Atg, 6);
1200
1201 let score = score_upstream_composition(&seq, &node, &context);
1202
1203 assert!(score.is_finite());
1205 }
1206
1207 #[test]
1208 fn test_raw_coding_score_forward() {
1209 let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
1210 let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0];
1211 let training = create_test_training();
1212
1213 let mut nodes = vec![
1214 create_test_node(Strand::Forward, 3, CodonType::Atg, 12),
1215 create_test_node(Strand::Forward, 12, CodonType::Stop, 12),
1216 ];
1217
1218 raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1219
1220 assert!(nodes[0].scores.coding_score.is_finite());
1221 }
1222
1223 #[test]
1224 fn test_raw_coding_score_reverse() {
1225 let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
1226 let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0];
1227 let training = create_test_training();
1228
1229 let mut nodes = vec![
1230 create_test_node(Strand::Reverse, 12, CodonType::Atg, 3),
1231 create_test_node(Strand::Reverse, 3, CodonType::Stop, 3),
1232 ];
1233
1234 raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1235
1236 assert!(nodes[0].scores.coding_score.is_finite());
1237 }
1238
1239 #[test]
1240 fn test_raw_coding_score_mixed_strands() {
1241 let seq = vec![
1242 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
1243 ];
1244 let reverse_seq = vec![
1245 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0,
1246 ];
1247 let training = create_test_training();
1248
1249 let mut nodes = vec![
1250 create_test_node(Strand::Forward, 3, CodonType::Atg, 15),
1251 create_test_node(Strand::Reverse, 18, CodonType::Gtg, 6),
1252 create_test_node(Strand::Forward, 15, CodonType::Stop, 15),
1253 create_test_node(Strand::Reverse, 6, CodonType::Stop, 6),
1254 ];
1255
1256 raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1257
1258 assert!(nodes[0].scores.coding_score.is_finite());
1259 assert!(nodes[1].scores.coding_score.is_finite());
1260 }
1261
1262 #[test]
1263 fn test_raw_coding_score_different_translation_table() {
1264 let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
1265 let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0];
1266 let mut training = create_test_training();
1267 training.translation_table = 4; let mut nodes = vec![
1270 create_test_node(Strand::Forward, 3, CodonType::Atg, 9),
1271 create_test_node(Strand::Forward, 9, CodonType::Stop, 9),
1272 ];
1273
1274 raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1275
1276 assert!(nodes[0].scores.coding_score.is_finite());
1278 }
1279
1280 #[test]
1281 fn test_raw_coding_score_long_gene() {
1282 let seq = vec![0; 3000]; let reverse_seq = vec![3; 3000];
1284 let training = create_test_training();
1285
1286 let mut nodes = vec![
1287 create_test_node(Strand::Forward, 100, CodonType::Atg, 2900),
1288 create_test_node(Strand::Forward, 2900, CodonType::Stop, 2900),
1289 ];
1290
1291 raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1292
1293 assert!(nodes[0].scores.coding_score.is_finite());
1294 }
1295
1296 #[test]
1297 fn test_raw_coding_score_empty_nodes() {
1298 let seq = vec![0, 1, 2, 3];
1299 let reverse_seq = vec![3, 2, 1, 0];
1300 let training = create_test_training();
1301
1302 let mut nodes = vec![];
1303
1304 raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1305
1306 assert_eq!(nodes.len(), 0);
1308 }
1309
1310 #[test]
1311 fn test_calculate_gene_length() {
1312 let node = create_test_node(Strand::Forward, 100, CodonType::Atg, 400);
1313 let length = calculate_gene_length(&node);
1314 assert_eq!(length, 300);
1315
1316 let node_reverse = create_test_node(Strand::Reverse, 400, CodonType::Atg, 100);
1317 let length_reverse = calculate_gene_length(&node_reverse);
1318 assert_eq!(length_reverse, 300);
1319 }
1320
1321 #[test]
1322 fn test_calculate_gene_size_codons() {
1323 let node = create_test_node(Strand::Forward, 100, CodonType::Atg, 403); let size_codons = calculate_gene_size_codons(&node);
1325 assert_eq!(size_codons, 102.0); let node_partial = create_test_node(Strand::Forward, 100, CodonType::Atg, 401); let size_codons_partial = calculate_gene_size_codons(&node_partial);
1330 assert_eq!(size_codons_partial, 101.33333333333333); }
1332
1333 #[test]
1334 fn test_calculate_short_gene_factors() {
1335 let gene_length = 100;
1336 let (negative_factor, positive_factor) = calculate_short_gene_factors(gene_length);
1337
1338 assert_eq!(negative_factor, 2.5); assert_eq!(positive_factor, 0.4); let (neg_edge, pos_edge) = calculate_short_gene_factors(SHORT_GENE_THRESHOLD);
1343 assert_eq!(neg_edge, 1.0);
1344 assert_eq!(pos_edge, 1.0);
1345 }
1346
1347 #[test]
1348 fn test_calculate_min_metagenomic_length() {
1349 let sequence_length = 1000;
1350 let min_length = calculate_min_metagenomic_length(sequence_length);
1351 assert_eq!(min_length, (1000.0_f64).sqrt() * 5.0); let min_length_small = calculate_min_metagenomic_length(100);
1355 assert_eq!(min_length_small, 50.0); }
1357
1358 #[test]
1359 fn test_score_nodes_short_gene_penalty() {
1360 let seq = vec![0; 60];
1361 let reverse_seq = vec![3; 60];
1362 let training = create_test_training();
1363
1364 let mut nodes = vec![
1365 create_test_node(Strand::Forward, 30, CodonType::Atg, 35), ];
1367
1368 let encoded_sequence = EncodedSequence {
1369 forward_sequence: seq,
1370 reverse_complement_sequence: reverse_seq,
1371 unknown_sequence: vec![0; 60],
1372 masks: vec![],
1373 gc_content: 0.5,
1374 sequence_length: 60,
1375 };
1376
1377 let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1378
1379 assert!(result.is_ok());
1380 assert!(nodes[0].scores.total_score.is_finite());
1381 }
1382
1383 #[test]
1384 fn test_score_nodes_negative_coding_score() {
1385 let seq = vec![0; 40];
1386 let reverse_seq = vec![3; 40];
1387 let mut training = create_test_training();
1388
1389 for score in training.gene_dicodon_table.iter_mut() {
1391 *score = -1.0;
1392 }
1393
1394 let mut nodes = vec![create_test_node(Strand::Forward, 25, CodonType::Atg, 35)];
1395
1396 let encoded_sequence = EncodedSequence {
1397 forward_sequence: seq,
1398 reverse_complement_sequence: reverse_seq,
1399 unknown_sequence: vec![0; 40],
1400 masks: vec![],
1401 gc_content: 0.5,
1402 sequence_length: 40,
1403 };
1404
1405 let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1406
1407 assert!(result.is_ok());
1408 assert!(nodes[0].scores.total_score.is_finite());
1409 }
1410
1411 #[test]
1412 fn test_calculate_no_stop_probability_standard_genetic_code() {
1413 let gc_content = 0.5;
1414 let translation_table = 11;
1415
1416 let prob = calculate_no_stop_probability(gc_content, translation_table);
1417
1418 assert!(prob > 0.0 && prob < 1.0);
1420 assert!((prob - EXPECTED_NO_STOP_PROB).abs() < 0.01);
1421 }
1422
1423 #[test]
1424 fn test_calculate_no_stop_probability_non_standard_genetic_code() {
1425 let gc_content = 0.3;
1426 let translation_table = 4; let prob = calculate_no_stop_probability(gc_content, translation_table);
1429
1430 assert!(prob > 0.0 && prob < 1.0);
1432 }
1433
1434 #[test]
1435 fn test_calculate_length_factor_normal_gene() {
1436 let gene_size_codons = 200.0;
1437 let no_stop_probability = 0.95;
1438
1439 let factor = calculate_length_factor(gene_size_codons, no_stop_probability);
1440
1441 assert!(factor.is_finite());
1443 }
1444
1445 #[test]
1446 fn test_calculate_length_factor_long_gene() {
1447 let gene_size_codons = 1500.0; let no_stop_probability = 0.95;
1449
1450 let factor = calculate_length_factor(gene_size_codons, no_stop_probability);
1451
1452 assert!(factor.is_finite());
1454 }
1455
1456 #[test]
1457 fn test_process_strand_first_pass_forward() {
1458 let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
1459 let training = create_test_training();
1460 let mut frame_scores = [0.0; 3];
1461 let mut last_positions = [0usize; 3];
1462
1463 let mut nodes = vec![
1464 create_test_node(Strand::Forward, 3, CodonType::Atg, 12),
1465 create_test_node(Strand::Forward, 12, CodonType::Stop, 12),
1466 ];
1467
1468 process_strand_first_pass(
1469 &mut nodes,
1470 &seq,
1471 seq.len(),
1472 Strand::Forward,
1473 &training,
1474 &mut frame_scores,
1475 &mut last_positions,
1476 );
1477
1478 assert!(nodes[0].scores.coding_score.is_finite());
1480 }
1481
1482 #[test]
1483 fn test_process_strand_second_pass_forward() {
1484 let mut nodes = vec![
1485 create_test_node(Strand::Forward, 3, CodonType::Atg, 12),
1486 create_test_node(Strand::Forward, 6, CodonType::Gtg, 15),
1487 ];
1488
1489 nodes[0].scores.coding_score = 10.0;
1491 nodes[1].scores.coding_score = 5.0;
1492
1493 let mut frame_scores = [0.0; 3];
1494
1495 process_strand_second_pass(&mut nodes, Strand::Forward, &mut frame_scores);
1496
1497 assert!(nodes[0].scores.coding_score.is_finite());
1499 assert!(nodes[1].scores.coding_score.is_finite());
1500 }
1501
1502 #[test]
1503 fn test_process_strand_third_pass_forward() {
1504 let mut nodes = vec![create_test_node(Strand::Forward, 3, CodonType::Atg, 300)];
1505
1506 nodes[0].scores.coding_score = 5.0;
1508
1509 let no_stop_probability = 0.95;
1510 let mut frame_scores = [0.0; 3];
1511
1512 process_strand_third_pass(
1513 &mut nodes,
1514 Strand::Forward,
1515 no_stop_probability,
1516 &mut frame_scores,
1517 );
1518
1519 assert!(nodes[0].scores.coding_score > 5.0);
1521 assert!(nodes[0].scores.coding_score.is_finite());
1522 }
1523}