orphos_core/node/
scoring.rs

1use crate::{
2    constants::{
3        CODING_SCORE_SENTINEL, CODING_SCORE_THRESHOLD, DICODON_SIZE, EDGE_BONUS,
4        EDGE_POSITION_OFFSET, EDGE_POSITION_THRESHOLD, EDGE_UPSTREAM, GENE_SIZE_SCALING_FACTOR,
5        LENGTH_FACTOR_MULTIPLIER, LENGTH_FACTOR_THRESHOLD, MAX_GENE_SIZE_CODONS,
6        METAGENOMIC_LENGTH_THRESHOLD, METAGENOMIC_MIN_LENGTH_FACTOR, METAGENOMIC_PENALTY,
7        METAGENOMIC_PENALTY_DIVISOR, MIN_GENE_SIZE_CODONS, MIN_META_GENE_LENGTH,
8        NEGATIVE_SCORE_PENALTY, NO_MOTIF_THRESHOLD, NODE_SEARCH_WINDOW, SHORT_GENE_THRESHOLD,
9        UPSTREAM_COMPOSITION_WEIGHT, UPSTREAM_SCAN_RANGE, UPSTREAM_SKIP_END, UPSTREAM_SKIP_START,
10    },
11    node::{calc_orf_gc, find_best_upstream_motif},
12    rbs_score,
13    sequence::encoded::EncodedSequence,
14    sequence::{calculate_kmer_index, is_stop},
15    types::{CodonType, Node, OrphosError, Training},
16};
17
18use bio::bio_types::strand::Strand;
19
20/// Context struct containing sequence data and metadata
21#[derive(Debug)]
22pub struct ScoringContext<'a> {
23    /// Encoded sequence containing forward and reverse strands
24    pub encoded_sequence: &'a EncodedSequence,
25    /// Training data for scoring
26    pub training: &'a Training,
27    /// Whether the sequence is in closed reading frame mode
28    pub closed: bool,
29    /// Whether this is metagenomic mode
30    pub is_meta: bool,
31    /// Cached upstream composition scan indices (for performance)
32    pub upstream_scan_indices: Vec<usize>,
33}
34
35impl<'a> ScoringContext<'a> {
36    /// Create a new scoring context
37    pub fn new(
38        encoded_sequence: &'a EncodedSequence,
39        training: &'a Training,
40        closed: bool,
41        is_meta: bool,
42    ) -> Self {
43        // Pre-calculate upstream scan indices to avoid repeated computation
44        let upstream_scan_indices: Vec<usize> = (1..UPSTREAM_SCAN_RANGE)
45            .filter(|&i| i <= UPSTREAM_SKIP_START || i >= UPSTREAM_SKIP_END)
46            .collect();
47
48        Self {
49            encoded_sequence,
50            training,
51            closed,
52            is_meta,
53            upstream_scan_indices,
54        }
55    }
56}
57
58/// Context struct containing scoring parameters and factors
59#[derive(Debug, Clone)]
60pub struct ScoringParams {
61    /// Edge upstream bonus factor
62    pub edge_upstream_bonus: f64,
63    /// Edge bonus factor
64    pub edge_bonus_factor: f64,
65    /// Penalty factor for various score adjustments
66    pub penalty_factor: f64,
67    /// Cached start weight factor
68    pub start_weight_factor: f64,
69    /// Cached metagenomic penalty coefficient
70    pub metagenomic_penalty_coeff: f64,
71    /// Cached no-stop probability for length calculations
72    pub no_stop_probability: f64,
73}
74
75impl ScoringParams {
76    /// Create scoring parameters from training data
77    pub fn from_training(training: &Training) -> Self {
78        let start_weight_factor = training.start_weight_factor;
79        let edge_upstream_bonus = EDGE_UPSTREAM * start_weight_factor;
80        let edge_bonus_factor = EDGE_BONUS * start_weight_factor;
81        let penalty_factor = NEGATIVE_SCORE_PENALTY * edge_bonus_factor;
82        let metagenomic_penalty_coeff = METAGENOMIC_PENALTY / METAGENOMIC_PENALTY_DIVISOR;
83        let no_stop_probability =
84            calculate_no_stop_probability(training.gc_content, training.translation_table);
85
86        Self {
87            edge_upstream_bonus,
88            edge_bonus_factor,
89            penalty_factor,
90            start_weight_factor,
91            metagenomic_penalty_coeff,
92            no_stop_probability,
93        }
94    }
95}
96
97/// Cached calculations for a node to avoid repetitive computations
98#[derive(Debug, Clone)]
99pub struct NodeCache {
100    /// Gene length in base pairs
101    pub gene_length: usize,
102    /// Whether the node is at sequence boundaries
103    pub is_boundary_node: bool,
104    /// Calculated short gene factors (negative, positive)
105    pub short_gene_factors: Option<(f64, f64)>,
106    /// Whether metagenomic penalties apply
107    pub needs_metagenomic_penalty: bool,
108}
109
110impl NodeCache {
111    /// Create a new cache for a node
112    pub fn new(node: &Node, context: &ScoringContext) -> Self {
113        let gene_length = calculate_gene_length(node);
114
115        let is_boundary_node = is_node_at_boundary(node, context);
116
117        let short_gene_factors = if gene_length < SHORT_GENE_THRESHOLD {
118            Some(calculate_short_gene_factors(gene_length))
119        } else {
120            None
121        };
122
123        let needs_metagenomic_penalty = context.is_meta
124            && context.encoded_sequence.sequence_length < METAGENOMIC_LENGTH_THRESHOLD
125            && (node.scores.coding_score < CODING_SCORE_THRESHOLD
126                || gene_length < MIN_META_GENE_LENGTH);
127
128        Self {
129            gene_length,
130            is_boundary_node,
131            short_gene_factors,
132            needs_metagenomic_penalty,
133        }
134    }
135}
136
137/// Check if a node is at sequence boundaries
138fn is_node_at_boundary(node: &Node, context: &ScoringContext) -> bool {
139    (node.position.index <= EDGE_POSITION_THRESHOLD && node.position.strand == Strand::Forward)
140        || (node.position.index >= context.encoded_sequence.sequence_length - EDGE_POSITION_OFFSET
141            && node.position.strand == Strand::Reverse)
142}
143
144/// Calculate edge gene status for a node
145///
146/// Returns the number of edge gene indicators:
147/// - 1 if the node is already marked as edge
148/// - +1 if the stop codon is invalid (suggesting edge condition)
149fn calculate_edge_gene_status(node: &Node, context: &ScoringContext) -> usize {
150    let mut edge_gene = 0;
151
152    if node.position.is_edge {
153        edge_gene += 1;
154    }
155
156    // Check if stop codon is valid
157    if (node.position.strand == Strand::Forward
158        && (node.position.stop_value < 0
159            || !is_stop(
160                &context.encoded_sequence.forward_sequence,
161                node.position.stop_value as usize,
162                context.training,
163            )))
164        || (node.position.strand == Strand::Reverse
165            && (node.position.stop_value < 0
166                || !is_stop(
167                    &context.encoded_sequence.reverse_complement_sequence,
168                    context.encoded_sequence.sequence_length
169                        - 1
170                        - node.position.stop_value as usize,
171                    context.training,
172                )))
173    {
174        edge_gene += 1;
175    }
176
177    edge_gene
178}
179
180/// Calculate type scores for a node
181///
182/// For edge genes: Uses edge bonus factor divided by edge gene count
183/// For regular genes: Uses training weights based on codon type
184fn calculate_type_scores(
185    node: &mut Node,
186    edge_gene: usize,
187    params: &ScoringParams,
188    training: &Training,
189) {
190    if node.position.is_edge {
191        node.scores.type_score = params.edge_bonus_factor / edge_gene as f64;
192        node.scores.upstream_score = 0.0;
193        node.scores.ribosome_binding_score = 0.0;
194    } else {
195        // Type score using cached weight factor
196        node.scores.type_score = training.start_type_weights[node.position.codon_type.to_index()]
197            * params.start_weight_factor;
198    }
199}
200
201/// Calculate RBS (ribosome binding site) scores for a node
202///
203/// Uses either Shine-Dalgarno detection or motif scoring based on training parameters
204fn calculate_rbs_scores(node: &mut Node, training: &Training, params: &ScoringParams) {
205    if node.position.is_edge {
206        return; // Already handled in calculate_type_scores
207    }
208
209    // RBS score using cached weight factor
210    let rbs1 = training.rbs_weights[node.motif_info.ribosome_binding_sites[0]];
211    let rbs2 = training.rbs_weights[node.motif_info.ribosome_binding_sites[1]];
212    let sd_score = rbs1.max(rbs2) * params.start_weight_factor;
213
214    if training.uses_shine_dalgarno {
215        node.scores.ribosome_binding_score = sd_score;
216    } else {
217        node.scores.ribosome_binding_score =
218            params.start_weight_factor * node.motif_info.best_motif.score;
219        if node.scores.ribosome_binding_score < sd_score
220            && training.no_motif_weight > NO_MOTIF_THRESHOLD
221        {
222            node.scores.ribosome_binding_score = sd_score;
223        }
224    }
225}
226
227/// Calculate upstream scores for a node
228///
229/// Analyzes nucleotide composition upstream of the start codon and applies
230/// edge bonuses for nodes near sequence boundaries
231fn calculate_upstream_scores(
232    node: &mut Node,
233    context: &ScoringContext,
234    params: &ScoringParams,
235    cache: &NodeCache,
236) {
237    if node.position.is_edge {
238        return; // Already handled in calculate_type_scores
239    }
240
241    if node.position.strand == Strand::Forward {
242        node.scores.upstream_score =
243            score_upstream_composition(&context.encoded_sequence.forward_sequence, node, context);
244    } else {
245        node.scores.upstream_score = score_upstream_composition(
246            &context.encoded_sequence.reverse_complement_sequence,
247            node,
248            context,
249        );
250    }
251
252    // Apply edge upstream bonus using cached boundary check
253    if !context.closed && cache.is_boundary_node {
254        node.scores.upstream_score += params.edge_upstream_bonus;
255    }
256}
257
258/// Apply edge upstream bonus based on neighboring nodes
259///
260/// Searches for edge nodes with the same stop position within a limited window
261/// and applies bonus to the current node if found
262fn apply_edge_upstream_bonus_for_neighbors(
263    nodes: &mut [Node],
264    current_index: usize,
265    node_count: usize,
266    params: &ScoringParams,
267) {
268    let current_node = &nodes[current_index];
269
270    if current_node.position.is_edge {
271        return;
272    }
273
274    if current_index < NODE_SEARCH_WINDOW && current_node.position.strand == Strand::Forward {
275        // Forward strand: check previous nodes
276        for j in (0..current_index).rev() {
277            if nodes[j].position.is_edge
278                && current_node.position.stop_value == nodes[j].position.stop_value
279            {
280                nodes[current_index].scores.upstream_score += params.edge_upstream_bonus;
281                break;
282            }
283        }
284    } else if current_index >= node_count.saturating_sub(NODE_SEARCH_WINDOW)
285        && current_node.position.strand == Strand::Reverse
286    {
287        // Reverse strand: check following nodes
288        for j in (current_index + 1)..node_count {
289            if nodes[j].position.is_edge
290                && current_node.position.stop_value == nodes[j].position.stop_value
291            {
292                nodes[current_index].scores.upstream_score += params.edge_upstream_bonus;
293                break;
294            }
295        }
296    }
297}
298
299/// Convert nodes to edge genes if they are at sequence boundaries
300///
301/// In open reading frame mode (!closed), nodes near sequence start/end
302/// are converted to edge genes with special scoring
303fn convert_boundary_nodes_to_edge_genes(
304    node: &mut Node,
305    context: &ScoringContext,
306    edge_gene: &mut usize,
307    params: &ScoringParams,
308    cache: &NodeCache,
309) {
310    // Convert starts at base 1 and sequence_length to edge genes if closed = 0
311    if cache.is_boundary_node && !node.position.is_edge && !context.closed {
312        *edge_gene += 1;
313        node.position.is_edge = true;
314        node.scores.type_score = 0.0;
315        let new_upstream_score = params.edge_bonus_factor / *edge_gene as f64;
316        node.scores.upstream_score = new_upstream_score;
317        node.scores.ribosome_binding_score = 0.0;
318    }
319}
320
321/// Apply penalties and bonuses to node scores
322///
323/// Handles various scoring adjustments including:
324/// - Penalties for genes without proper stop codons
325/// - Short gene penalties/bonuses
326/// - Metagenomic-specific penalties
327/// - Negative coding score penalties
328fn apply_penalties_and_bonuses(
329    node: &mut Node,
330    edge_gene: usize,
331    context: &ScoringContext,
332    params: &ScoringParams,
333    cache: &NodeCache,
334) {
335    // Apply penalty for starts with no stop codon - this happens AFTER boundary conversion
336    if !node.position.is_edge && edge_gene == 1 {
337        node.scores.upstream_score -= params.penalty_factor;
338    }
339
340    // Apply short gene penalties/bonuses using cached factors
341    if edge_gene == 0
342        && let Some((negative_factor, positive_factor)) = cache.short_gene_factors
343    {
344        if node.scores.ribosome_binding_score < 0.0 {
345            node.scores.ribosome_binding_score *= negative_factor;
346        }
347        if node.scores.upstream_score < 0.0 {
348            node.scores.upstream_score *= negative_factor;
349        }
350        if node.scores.type_score < 0.0 {
351            node.scores.type_score *= negative_factor;
352        }
353
354        if node.scores.ribosome_binding_score > 0.0 {
355            node.scores.ribosome_binding_score *= positive_factor;
356        }
357        if node.scores.upstream_score > 0.0 {
358            node.scores.upstream_score *= positive_factor;
359        }
360        if node.scores.type_score > 0.0 {
361            node.scores.type_score *= positive_factor;
362        }
363    }
364
365    // Apply metagenomic penalties using cached calculation
366    if cache.needs_metagenomic_penalty && edge_gene == 0 {
367        let penalty = params.metagenomic_penalty_coeff
368            * (METAGENOMIC_LENGTH_THRESHOLD - context.encoded_sequence.sequence_length) as f64;
369        node.scores.coding_score -= penalty;
370    }
371
372    // Base start score calculation - this happens in C code at line 498
373    node.scores.start_score =
374        node.scores.type_score + node.scores.ribosome_binding_score + node.scores.upstream_score;
375
376    // Penalize negative coding scores
377    if node.scores.coding_score < 0.0 {
378        if edge_gene > 0 && !node.position.is_edge {
379            if !context.is_meta
380                || context.encoded_sequence.sequence_length > METAGENOMIC_MIN_LENGTH_FACTOR
381            {
382                node.scores.start_score -= params.start_weight_factor;
383            } else {
384                let penalty =
385                    10.31f64.mul_add(-(context.encoded_sequence.sequence_length as f64), 0.004);
386                node.scores.start_score -= penalty;
387            }
388        } else if context.is_meta
389            && context.encoded_sequence.sequence_length < METAGENOMIC_LENGTH_THRESHOLD
390            && node.position.is_edge
391        {
392            let min_meta_length =
393                calculate_min_metagenomic_length(context.encoded_sequence.sequence_length);
394            if cache.gene_length as f64 >= min_meta_length {
395                if node.scores.coding_score >= 0.0 {
396                    node.scores.coding_score = -1.0;
397                }
398                node.scores.start_score = 0.0;
399                node.scores.upstream_score = 0.0;
400            }
401        } else {
402            node.scores.start_score -= NEGATIVE_SCORE_PENALTY;
403        }
404    } else if node.scores.coding_score < CODING_SCORE_THRESHOLD
405        && context.is_meta
406        && cache.gene_length < MIN_META_GENE_LENGTH
407        && node.scores.start_score < 0.0
408    {
409        node.scores.start_score -= params.start_weight_factor;
410    }
411}
412
413/// Calculate final combined scores for a node
414///
415/// Computes the total score from coding + start scores
416/// Start score is already calculated in apply_penalties_and_bonuses
417fn calculate_final_scores(node: &mut Node) {
418    node.scores.total_score = node.scores.coding_score + node.scores.start_score;
419}
420
421/// Score nodes for gene finding
422pub fn score_nodes(
423    encoded_sequence: &EncodedSequence,
424    nodes: &mut [Node],
425    training: &Training,
426    closed: bool,
427    is_meta: bool,
428) -> Result<(), OrphosError> {
429    let node_count = nodes.len();
430
431    // Early exit for empty nodes
432    if node_count == 0 {
433        return Ok(());
434    }
435
436    // Create context structs to reduce parameter passing
437    let context = ScoringContext::new(encoded_sequence, training, closed, is_meta);
438    let params = ScoringParams::from_training(training);
439
440    calc_orf_gc(
441        &encoded_sequence.forward_sequence,
442        encoded_sequence.sequence_length,
443        nodes,
444    );
445    raw_coding_score_optimized(
446        &encoded_sequence.forward_sequence,
447        &encoded_sequence.reverse_complement_sequence,
448        encoded_sequence.sequence_length,
449        nodes,
450        training,
451        params.no_stop_probability,
452    );
453
454    // Calculate RBS scores using optimized approach
455    if training.uses_shine_dalgarno {
456        rbs_score(
457            &encoded_sequence.forward_sequence,
458            &encoded_sequence.reverse_complement_sequence,
459            encoded_sequence.sequence_length,
460            nodes,
461            training,
462        );
463    } else {
464        // For non-SD mode, process motif finding with bounds checking
465        for node in nodes.iter_mut() {
466            if node.position.codon_type != CodonType::Stop && !node.position.is_edge {
467                find_best_upstream_motif(
468                    training,
469                    &encoded_sequence.forward_sequence,
470                    &encoded_sequence.reverse_complement_sequence,
471                    encoded_sequence.sequence_length,
472                    node,
473                    2,
474                );
475            }
476        }
477    }
478
479    // Score each start node using extracted helper functions with cached calculations
480    for i in 0..node_count {
481        if nodes[i].position.codon_type == CodonType::Stop {
482            continue;
483        }
484
485        // Create cache for repetitive calculations
486        let cache = NodeCache::new(&nodes[i], &context);
487
488        // Calculate edge gene status
489        let mut edge_gene = calculate_edge_gene_status(&nodes[i], &context);
490
491        // Calculate various score components
492        calculate_type_scores(&mut nodes[i], edge_gene, &params, training);
493
494        calculate_rbs_scores(&mut nodes[i], training, &params);
495
496        calculate_upstream_scores(&mut nodes[i], &context, &params, &cache);
497
498        // Apply edge upstream bonus based on neighboring nodes
499        apply_edge_upstream_bonus_for_neighbors(nodes, i, node_count, &params);
500
501        // Convert boundary nodes to edge genes if necessary
502        convert_boundary_nodes_to_edge_genes(
503            &mut nodes[i],
504            &context,
505            &mut edge_gene,
506            &params,
507            &cache,
508        );
509
510        // Calculate base start score before applying penalties
511        calculate_final_scores(&mut nodes[i]);
512
513        // Apply penalties and bonuses (includes recalculating start score)
514        apply_penalties_and_bonuses(&mut nodes[i], edge_gene, &context, &params, &cache);
515
516        // Calculate final total score
517        nodes[i].scores.total_score = nodes[i].scores.coding_score + nodes[i].scores.start_score;
518    }
519
520    Ok(())
521}
522
523/// Score the upstream composition for a given node with optimized scanning
524fn score_upstream_composition(seq: &[u8], node: &Node, context: &ScoringContext) -> f64 {
525    let start = if node.position.strand == Strand::Forward {
526        node.position.index
527    } else {
528        context.encoded_sequence.sequence_length - 1 - node.position.index
529    };
530
531    let mut upstream_score = 0.0;
532    let mut count = 0;
533
534    // Use pre-calculated scan indices to avoid repeated filtering
535    for &i in &context.upstream_scan_indices {
536        if start < i {
537            continue;
538        }
539
540        let pos = start - i;
541
542        // Get the nucleotide index at position
543        let mer_idx = calculate_kmer_index(1, seq, pos);
544
545        // Add to upstream score
546        upstream_score += UPSTREAM_COMPOSITION_WEIGHT
547            * context.training.start_weight_factor
548            * context.training.upstream_composition[count][mer_idx];
549        count += 1;
550    }
551    upstream_score
552}
553
554/// Calculate the length of a gene in base pairs
555///
556/// Returns the absolute difference between start and stop positions
557const fn calculate_gene_length(node: &Node) -> usize {
558    if node.position.stop_value < 0 {
559        (node.position.index as isize - node.position.stop_value) as usize
560    } else {
561        (node.position.stop_value as usize).abs_diff(node.position.index)
562    }
563}
564
565/// Calculate the length of a gene in codons
566///
567/// Converts gene length in base pairs to codons, adding 3 to account for partial codons
568fn calculate_gene_size_codons(node: &Node) -> f64 {
569    ((calculate_gene_length(node) + 3) as f64) / 3.0
570}
571
572/// Calculate short gene penalty factors
573///
574/// Returns (negative_factor, positive_factor) for penalizing/rewarding short genes
575fn calculate_short_gene_factors(gene_length: usize) -> (f64, f64) {
576    let negative_factor = SHORT_GENE_THRESHOLD as f64 / gene_length as f64;
577    let positive_factor = gene_length as f64 / SHORT_GENE_THRESHOLD as f64;
578    (negative_factor, positive_factor)
579}
580
581/// Calculate minimum metagenomic gene length threshold
582///
583/// Uses sequence length to determine minimum acceptable gene length for metagenomic mode
584fn calculate_min_metagenomic_length(sequence_length: usize) -> f64 {
585    (sequence_length as f64).sqrt() * 5.0
586}
587
588/// Calculate the probability that a random codon is not a stop codon
589///
590/// This depends on the genetic code and GC content of the organism.
591/// For the standard genetic code (11), there are 3 stop codons out of 64.
592/// For other genetic codes, the number varies.
593fn calculate_no_stop_probability(gc_content: f64, translation_table: i32) -> f64 {
594    let at_content = 1.0 - gc_content;
595    let at_squared = at_content * at_content;
596
597    if translation_table != 11 {
598        // For non-standard genetic codes, calculate stop codon probability
599
600        // TAA and TAG (AT-rich stop codons)
601        let at_stop_prob = (at_squared * gc_content) / 8.0;
602        // TGA (mixed stop codon)
603        let mixed_stop_prob = (at_squared * at_content) / 8.0;
604
605        1.0 - (at_stop_prob + mixed_stop_prob)
606    } else {
607        // Standard genetic code has 3 stop codons: TAA, TAG, TGA
608
609        // TAA and TAG probability
610        let tag_taa_prob = (at_squared * gc_content) / 4.0;
611        // TGA probability
612        let tga_prob = (at_squared * at_content) / 8.0;
613
614        1.0 - (tag_taa_prob + tga_prob)
615    }
616}
617
618/// Process nodes for a specific strand in the first pass (dicodon scoring)
619///
620/// This calculates cumulative dicodon scores from stop codons to start codons
621fn process_strand_first_pass(
622    nodes: &mut [Node],
623    encoded_sequence: &[u8],
624    sequence_length: usize,
625    strand: Strand,
626    training: &Training,
627    frame_scores: &mut [f64; 3],
628    last_positions: &mut [usize; 3],
629) {
630    frame_scores.fill(0.0);
631
632    if strand == Strand::Forward {
633        // Process forward strand in reverse order (from end to start)
634        for i in (0..nodes.len()).rev() {
635            if nodes[i].position.strand != Strand::Forward {
636                continue;
637            }
638
639            let frame_index = nodes[i].position.index % 3;
640
641            if nodes[i].position.codon_type == CodonType::Stop {
642                last_positions[frame_index] = nodes[i].position.index;
643                frame_scores[frame_index] = 0.0;
644            } else {
645                // Calculate dicodon scores from last stop to current start
646                if last_positions[frame_index] >= 3 {
647                    let mut j = last_positions[frame_index] - 3;
648                    loop {
649                        if j < nodes[i].position.index {
650                            break;
651                        }
652                        let mer_idx = calculate_kmer_index(DICODON_SIZE, encoded_sequence, j);
653                        let dicodon_score = training.gene_dicodon_table[mer_idx];
654                        frame_scores[frame_index] += dicodon_score;
655
656                        if j < 3 {
657                            break;
658                        }
659                        j -= 3;
660                    }
661                }
662                nodes[i].scores.coding_score = frame_scores[frame_index];
663                last_positions[frame_index] = nodes[i].position.index;
664            }
665        }
666    } else {
667        // Process reverse strand in forward order
668        for node in nodes.iter_mut() {
669            if node.position.strand != Strand::Reverse {
670                continue;
671            }
672
673            let frame_index = node.position.index % 3;
674
675            if node.position.codon_type == CodonType::Stop {
676                last_positions[frame_index] = node.position.index;
677                frame_scores[frame_index] = 0.0;
678            } else {
679                // Calculate dicodon scores from last stop to current start
680                let mut j = last_positions[frame_index] + 3;
681                while j <= node.position.index {
682                    let reverse_seq_pos = sequence_length - j - 1;
683                    let mer_idx =
684                        calculate_kmer_index(DICODON_SIZE, encoded_sequence, reverse_seq_pos);
685                    let dicodon_score = training.gene_dicodon_table[mer_idx];
686                    frame_scores[frame_index] += dicodon_score;
687                    j += 3;
688                }
689                node.scores.coding_score = frame_scores[frame_index];
690                last_positions[frame_index] = node.position.index;
691            }
692        }
693    }
694}
695
696/// Process nodes for a specific strand in the second pass (score normalization)
697///
698/// This normalizes scores relative to the maximum score in each frame
699fn process_strand_second_pass(nodes: &mut [Node], strand: Strand, frame_scores: &mut [f64; 3]) {
700    frame_scores.fill(CODING_SCORE_SENTINEL);
701
702    if strand == Strand::Forward {
703        // Process forward strand in forward order
704        for node in nodes.iter_mut() {
705            if node.position.strand != Strand::Forward {
706                continue;
707            }
708
709            let frame_index = node.position.index % 3;
710
711            if node.position.codon_type == CodonType::Stop {
712                frame_scores[frame_index] = CODING_SCORE_SENTINEL;
713            } else if node.scores.coding_score > frame_scores[frame_index] {
714                frame_scores[frame_index] = node.scores.coding_score;
715            } else {
716                node.scores.coding_score -= frame_scores[frame_index] - node.scores.coding_score;
717            }
718        }
719    } else {
720        // Process reverse strand in reverse order
721        for i in (0..nodes.len()).rev() {
722            if nodes[i].position.strand != Strand::Reverse {
723                continue;
724            }
725
726            let frame_index = nodes[i].position.index % 3;
727
728            if nodes[i].position.codon_type == CodonType::Stop {
729                frame_scores[frame_index] = CODING_SCORE_SENTINEL;
730            } else if nodes[i].scores.coding_score > frame_scores[frame_index] {
731                frame_scores[frame_index] = nodes[i].scores.coding_score;
732            } else {
733                nodes[i].scores.coding_score -=
734                    frame_scores[frame_index] - nodes[i].scores.coding_score;
735            }
736        }
737    }
738}
739
740/// Calculate the length-based scoring factor for a gene
741///
742/// This implements the biological principle that very short or very long genes
743/// are less likely to be real genes
744fn calculate_length_factor(gene_size_codons: f64, no_stop_probability: f64) -> f64 {
745    if gene_size_codons > MAX_GENE_SIZE_CODONS as f64 {
746        // For very long genes, use a scaled length factor
747        let mut factor = ((1.0 - no_stop_probability.powi(MAX_GENE_SIZE_CODONS))
748            / no_stop_probability.powi(MAX_GENE_SIZE_CODONS))
749        .ln();
750        factor -= ((1.0 - no_stop_probability.powi(MIN_GENE_SIZE_CODONS))
751            / no_stop_probability.powi(MIN_GENE_SIZE_CODONS))
752        .ln();
753        factor *= (gene_size_codons - MIN_GENE_SIZE_CODONS as f64) / GENE_SIZE_SCALING_FACTOR;
754        factor
755    } else {
756        // For normal-sized genes, use direct calculation
757        let mut factor = ((1.0 - no_stop_probability.powf(gene_size_codons))
758            / no_stop_probability.powf(gene_size_codons))
759        .ln();
760        factor -= ((1.0 - no_stop_probability.powi(MIN_GENE_SIZE_CODONS))
761            / no_stop_probability.powi(MIN_GENE_SIZE_CODONS))
762        .ln();
763        factor
764    }
765}
766
767/// Apply length-based adjustments to coding scores for a specific strand
768///
769/// This adds length factors and applies score thresholds based on gene size
770fn process_strand_third_pass(
771    nodes: &mut [Node],
772    strand: Strand,
773    no_stop_probability: f64,
774    frame_scores: &mut [f64; 3],
775) {
776    frame_scores.fill(CODING_SCORE_SENTINEL);
777
778    let process_node = |node: &mut Node, frame_scores: &mut [f64; 3]| {
779        if node.position.strand != strand {
780            return;
781        }
782
783        let frame_index = node.position.index % 3;
784
785        if node.position.codon_type == CodonType::Stop {
786            frame_scores[frame_index] = CODING_SCORE_SENTINEL;
787        } else {
788            let gene_size_codons = calculate_gene_size_codons(node);
789            let length_factor = calculate_length_factor(gene_size_codons, no_stop_probability);
790            let mut adjusted_length_factor = length_factor;
791
792            // Apply frame-based adjustments
793            if adjusted_length_factor > frame_scores[frame_index] {
794                frame_scores[frame_index] = adjusted_length_factor;
795            } else {
796                let temp = (frame_scores[frame_index] - adjusted_length_factor)
797                    .min(adjusted_length_factor)
798                    .max(0.0);
799                adjusted_length_factor -= temp;
800            }
801
802            // Apply length factor threshold
803            if adjusted_length_factor > LENGTH_FACTOR_THRESHOLD
804                && node.scores.coding_score < LENGTH_FACTOR_MULTIPLIER * adjusted_length_factor
805            {
806                node.scores.coding_score = LENGTH_FACTOR_MULTIPLIER * adjusted_length_factor;
807            }
808            node.scores.coding_score += adjusted_length_factor;
809        }
810    };
811
812    if strand == Strand::Forward {
813        // Process forward strand in forward order
814        for node in nodes.iter_mut() {
815            process_node(node, frame_scores);
816        }
817    } else {
818        // Process reverse strand in reverse order
819        for i in (0..nodes.len()).rev() {
820            process_node(&mut nodes[i], frame_scores);
821        }
822    }
823}
824
825/// Calculate raw coding scores with cached no_stop_probability
826///
827/// This is a three-pass algorithm:
828/// 1. First pass: Calculate cumulative dicodon scores from stop to start
829/// 2. Second pass: Normalize scores relative to maximum in each frame
830/// 3. Third pass: Add length-based factors and apply thresholds
831pub fn raw_coding_score_optimized(
832    encoded_sequence: &[u8],
833    reverse_complement_encoded_sequence: &[u8],
834    sequence_length: usize,
835    nodes: &mut [Node],
836    training: &Training,
837    no_stop_probability: f64,
838) {
839    let mut frame_scores = [0.0; 3];
840    let mut last_positions = [0usize; 3];
841
842    // FIRST PASS: Score coding potential (start->stop) for both strands
843    process_strand_first_pass(
844        nodes,
845        encoded_sequence,
846        sequence_length,
847        Strand::Forward,
848        training,
849        &mut frame_scores,
850        &mut last_positions,
851    );
852
853    process_strand_first_pass(
854        nodes,
855        reverse_complement_encoded_sequence,
856        sequence_length,
857        Strand::Reverse,
858        training,
859        &mut frame_scores,
860        &mut last_positions,
861    );
862
863    // SECOND PASS: Normalize scores relative to maximum in each frame
864    process_strand_second_pass(nodes, Strand::Forward, &mut frame_scores);
865    process_strand_second_pass(nodes, Strand::Reverse, &mut frame_scores);
866
867    // THIRD PASS: Add length-based factors and apply thresholds
868    process_strand_third_pass(
869        nodes,
870        Strand::Forward,
871        no_stop_probability,
872        &mut frame_scores,
873    );
874    process_strand_third_pass(
875        nodes,
876        Strand::Reverse,
877        no_stop_probability,
878        &mut frame_scores,
879    );
880}
881
882/// Calculate raw coding scores
883///
884/// This is a three-pass algorithm:
885/// 1. First pass: Calculate cumulative dicodon scores from stop to start
886/// 2. Second pass: Normalize scores relative to maximum in each frame
887/// 3. Third pass: Add length-based factors and apply thresholds
888pub fn raw_coding_score(
889    encoded_sequence: &[u8],
890    reverse_complement_encoded_sequence: &[u8],
891    sequence_length: usize,
892    nodes: &mut [Node],
893    training: &Training,
894) {
895    let no_stop_probability =
896        calculate_no_stop_probability(training.gc_content, training.translation_table);
897    let mut frame_scores = [0.0; 3];
898    let mut last_positions = [0usize; 3];
899
900    // FIRST PASS: Score coding potential (start->stop) for both strands
901    process_strand_first_pass(
902        nodes,
903        encoded_sequence,
904        sequence_length,
905        Strand::Forward,
906        training,
907        &mut frame_scores,
908        &mut last_positions,
909    );
910
911    process_strand_first_pass(
912        nodes,
913        reverse_complement_encoded_sequence,
914        sequence_length,
915        Strand::Reverse,
916        training,
917        &mut frame_scores,
918        &mut last_positions,
919    );
920
921    // SECOND PASS: Normalize scores relative to maximum in each frame
922    process_strand_second_pass(nodes, Strand::Forward, &mut frame_scores);
923    process_strand_second_pass(nodes, Strand::Reverse, &mut frame_scores);
924
925    // THIRD PASS: Add length-based factors and apply thresholds
926    process_strand_third_pass(
927        nodes,
928        Strand::Forward,
929        no_stop_probability,
930        &mut frame_scores,
931    );
932    process_strand_third_pass(
933        nodes,
934        Strand::Reverse,
935        no_stop_probability,
936        &mut frame_scores,
937    );
938}
939
940#[cfg(test)]
941mod tests {
942    use super::*;
943    use crate::{constants::EXPECTED_NO_STOP_PROB, types::*};
944    use bio::bio_types::strand::Strand;
945
946    fn create_test_node(
947        strand: Strand,
948        index: usize,
949        codon_type: CodonType,
950        stop_value: isize,
951    ) -> Node {
952        Node {
953            position: NodePosition {
954                index,
955                strand,
956                codon_type,
957                stop_value,
958                is_edge: false,
959            },
960            scores: NodeScores::default(),
961            state: NodeState::default(),
962            motif_info: NodeMotifInfo {
963                ribosome_binding_sites: [0; 2],
964                best_motif: Motif::default(),
965            },
966        }
967    }
968
969    fn create_test_training() -> Training {
970        Training {
971            gc_content: 0.5,
972            translation_table: 11,
973            uses_shine_dalgarno: true,
974            start_type_weights: [2.0, 1.5, 1.0],
975            rbs_weights: Box::new([1.0; 28]),
976            upstream_composition: Box::new([[0.25; 4]; 32]),
977            motif_weights: Box::new([[[1.0; 4096]; 4]; 4]),
978            no_motif_weight: 0.5,
979            start_weight_factor: 4.35,
980            gc_bias_factors: [1.0; 3],
981            gene_dicodon_table: Box::new([1.0; 4096]),
982            total_dicodons: 0,
983        }
984    }
985
986    #[test]
987    fn test_score_nodes_basic() {
988        let seq = vec![0; 60];
989        let reverse_seq = vec![3; 60];
990        let training = create_test_training();
991
992        let mut nodes = vec![create_test_node(Strand::Forward, 30, CodonType::Atg, 45)];
993
994        let encoded_sequence = EncodedSequence {
995            forward_sequence: seq,
996            reverse_complement_sequence: reverse_seq,
997            unknown_sequence: vec![0; 60],
998            masks: vec![],
999            gc_content: 0.5,
1000            sequence_length: 60,
1001        };
1002
1003        let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1004
1005        assert!(result.is_ok());
1006        assert!(nodes[0].scores.total_score.is_finite());
1007    }
1008
1009    #[test]
1010    fn test_score_nodes_edge_gene() {
1011        let seq = vec![0; 60];
1012        let reverse_seq = vec![3; 60];
1013        let training = create_test_training();
1014
1015        // Create a node that will become an edge gene
1016        let mut nodes = vec![Node {
1017            position: NodePosition {
1018                index: 2,
1019                stop_value: 30,
1020                strand: Strand::Forward,
1021                codon_type: CodonType::Atg,
1022                is_edge: true, // Mark as edge to skip RBS scoring
1023            },
1024            scores: NodeScores::default(),
1025            state: NodeState::default(),
1026            motif_info: NodeMotifInfo {
1027                ribosome_binding_sites: [0; 2],
1028                best_motif: Motif::default(),
1029            },
1030        }];
1031
1032        let encoded_sequence = EncodedSequence {
1033            forward_sequence: seq,
1034            reverse_complement_sequence: reverse_seq,
1035            unknown_sequence: vec![0; 60],
1036            masks: vec![],
1037            gc_content: 0.5,
1038            sequence_length: 60,
1039        };
1040
1041        let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1042
1043        assert!(result.is_ok());
1044        assert!(nodes[0].position.is_edge);
1045    }
1046
1047    #[test]
1048    fn test_score_nodes_closed_mode() {
1049        let seq = vec![0; 40];
1050        let reverse_seq = vec![3; 40];
1051        let training = create_test_training();
1052
1053        let mut nodes = vec![create_test_node(Strand::Forward, 25, CodonType::Atg, 35)];
1054
1055        let encoded_sequence = EncodedSequence {
1056            forward_sequence: seq,
1057            reverse_complement_sequence: reverse_seq,
1058            unknown_sequence: vec![0; 40],
1059            masks: vec![],
1060            gc_content: 0.5,
1061            sequence_length: 40,
1062        };
1063
1064        let result = score_nodes(&encoded_sequence, &mut nodes, &training, true, false);
1065
1066        assert!(result.is_ok());
1067        assert!(!nodes[0].position.is_edge);
1068    }
1069
1070    #[test]
1071    fn test_score_nodes_metagenomic() {
1072        let seq = vec![0; 100]; // Short sequence for metagenomic
1073        let reverse_seq = vec![3; 100];
1074        let training = create_test_training();
1075
1076        let mut nodes = vec![create_test_node(Strand::Forward, 40, CodonType::Atg, 60)];
1077
1078        let encoded_sequence = EncodedSequence {
1079            forward_sequence: seq,
1080            reverse_complement_sequence: reverse_seq,
1081            unknown_sequence: vec![0; 100],
1082            masks: vec![],
1083            gc_content: 0.5,
1084            sequence_length: 100,
1085        };
1086
1087        let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, true);
1088
1089        assert!(result.is_ok());
1090        assert!(nodes[0].scores.total_score.is_finite());
1091    }
1092
1093    #[test]
1094    fn test_score_nodes_non_shine_dalgarno() {
1095        let seq = vec![0; 60];
1096        let reverse_seq = vec![3; 60];
1097        let mut training = create_test_training();
1098        training.uses_shine_dalgarno = false;
1099
1100        let mut nodes = vec![create_test_node(Strand::Forward, 30, CodonType::Atg, 45)];
1101
1102        let encoded_sequence = EncodedSequence {
1103            forward_sequence: seq,
1104            reverse_complement_sequence: reverse_seq,
1105            unknown_sequence: vec![0; 60],
1106            masks: vec![],
1107            gc_content: 0.5,
1108            sequence_length: 60,
1109        };
1110
1111        let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1112
1113        assert!(result.is_ok());
1114        assert!(nodes[0].scores.ribosome_binding_score.is_finite());
1115    }
1116
1117    #[test]
1118    fn test_score_nodes_stop_codon_skip() {
1119        let seq = vec![0; 40];
1120        let reverse_seq = vec![3; 40];
1121        let training = create_test_training();
1122
1123        let mut nodes = vec![create_test_node(Strand::Forward, 30, CodonType::Stop, 30)];
1124
1125        let encoded_sequence = EncodedSequence {
1126            forward_sequence: seq,
1127            reverse_complement_sequence: reverse_seq,
1128            unknown_sequence: vec![0; 40],
1129            masks: vec![],
1130            gc_content: 0.5,
1131            sequence_length: 40,
1132        };
1133
1134        let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1135
1136        assert!(result.is_ok());
1137        assert_eq!(nodes[0].scores.total_score, 0.0);
1138    }
1139
1140    #[test]
1141    fn test_score_upstream_composition() {
1142        let seq = vec![
1143            0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0,
1144            1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
1145        ];
1146        let training = create_test_training();
1147        let encoded_sequence = EncodedSequence {
1148            forward_sequence: seq.clone(),
1149            reverse_complement_sequence: seq.clone(),
1150            unknown_sequence: vec![0; seq.len()],
1151            masks: vec![],
1152            gc_content: 0.5,
1153            sequence_length: seq.len(),
1154        };
1155        let context = ScoringContext::new(&encoded_sequence, &training, false, false);
1156        let node = create_test_node(Strand::Forward, 45, CodonType::Atg, 60);
1157
1158        let score = score_upstream_composition(&seq, &node, &context);
1159
1160        assert!(score.is_finite());
1161    }
1162
1163    #[test]
1164    fn test_score_upstream_composition_reverse() {
1165        let seq = vec![
1166            0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0,
1167            1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
1168        ];
1169        let training = create_test_training();
1170        let encoded_sequence = EncodedSequence {
1171            forward_sequence: seq.clone(),
1172            reverse_complement_sequence: seq.clone(),
1173            unknown_sequence: vec![0; seq.len()],
1174            masks: vec![],
1175            gc_content: 0.5,
1176            sequence_length: seq.len(),
1177        };
1178        let context = ScoringContext::new(&encoded_sequence, &training, false, false);
1179        let node = create_test_node(Strand::Reverse, 45, CodonType::Atg, 30);
1180
1181        let score = score_upstream_composition(&seq, &node, &context);
1182
1183        assert!(score.is_finite());
1184    }
1185
1186    #[test]
1187    fn test_score_upstream_composition_near_start() {
1188        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3];
1189        let training = create_test_training();
1190        let encoded_sequence = EncodedSequence {
1191            forward_sequence: seq.clone(),
1192            reverse_complement_sequence: seq.clone(),
1193            unknown_sequence: vec![0; seq.len()],
1194            masks: vec![],
1195            gc_content: 0.5,
1196            sequence_length: seq.len(),
1197        };
1198        let context = ScoringContext::new(&encoded_sequence, &training, false, false);
1199        let node = create_test_node(Strand::Forward, 3, CodonType::Atg, 6);
1200
1201        let score = score_upstream_composition(&seq, &node, &context);
1202
1203        // Should handle nodes near sequence start
1204        assert!(score.is_finite());
1205    }
1206
1207    #[test]
1208    fn test_raw_coding_score_forward() {
1209        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
1210        let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0];
1211        let training = create_test_training();
1212
1213        let mut nodes = vec![
1214            create_test_node(Strand::Forward, 3, CodonType::Atg, 12),
1215            create_test_node(Strand::Forward, 12, CodonType::Stop, 12),
1216        ];
1217
1218        raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1219
1220        assert!(nodes[0].scores.coding_score.is_finite());
1221    }
1222
1223    #[test]
1224    fn test_raw_coding_score_reverse() {
1225        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
1226        let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0];
1227        let training = create_test_training();
1228
1229        let mut nodes = vec![
1230            create_test_node(Strand::Reverse, 12, CodonType::Atg, 3),
1231            create_test_node(Strand::Reverse, 3, CodonType::Stop, 3),
1232        ];
1233
1234        raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1235
1236        assert!(nodes[0].scores.coding_score.is_finite());
1237    }
1238
1239    #[test]
1240    fn test_raw_coding_score_mixed_strands() {
1241        let seq = vec![
1242            0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
1243        ];
1244        let reverse_seq = vec![
1245            3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0,
1246        ];
1247        let training = create_test_training();
1248
1249        let mut nodes = vec![
1250            create_test_node(Strand::Forward, 3, CodonType::Atg, 15),
1251            create_test_node(Strand::Reverse, 18, CodonType::Gtg, 6),
1252            create_test_node(Strand::Forward, 15, CodonType::Stop, 15),
1253            create_test_node(Strand::Reverse, 6, CodonType::Stop, 6),
1254        ];
1255
1256        raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1257
1258        assert!(nodes[0].scores.coding_score.is_finite());
1259        assert!(nodes[1].scores.coding_score.is_finite());
1260    }
1261
1262    #[test]
1263    fn test_raw_coding_score_different_translation_table() {
1264        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
1265        let reverse_seq = vec![3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0];
1266        let mut training = create_test_training();
1267        training.translation_table = 4; // Non-standard table
1268
1269        let mut nodes = vec![
1270            create_test_node(Strand::Forward, 3, CodonType::Atg, 9),
1271            create_test_node(Strand::Forward, 9, CodonType::Stop, 9),
1272        ];
1273
1274        raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1275
1276        // Should handle different translation tables
1277        assert!(nodes[0].scores.coding_score.is_finite());
1278    }
1279
1280    #[test]
1281    fn test_raw_coding_score_long_gene() {
1282        let seq = vec![0; 3000]; // Long sequence for large gene
1283        let reverse_seq = vec![3; 3000];
1284        let training = create_test_training();
1285
1286        let mut nodes = vec![
1287            create_test_node(Strand::Forward, 100, CodonType::Atg, 2900),
1288            create_test_node(Strand::Forward, 2900, CodonType::Stop, 2900),
1289        ];
1290
1291        raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1292
1293        assert!(nodes[0].scores.coding_score.is_finite());
1294    }
1295
1296    #[test]
1297    fn test_raw_coding_score_empty_nodes() {
1298        let seq = vec![0, 1, 2, 3];
1299        let reverse_seq = vec![3, 2, 1, 0];
1300        let training = create_test_training();
1301
1302        let mut nodes = vec![];
1303
1304        raw_coding_score(&seq, &reverse_seq, seq.len(), &mut nodes, &training);
1305
1306        // Should handle empty nodes gracefully
1307        assert_eq!(nodes.len(), 0);
1308    }
1309
1310    #[test]
1311    fn test_calculate_gene_length() {
1312        let node = create_test_node(Strand::Forward, 100, CodonType::Atg, 400);
1313        let length = calculate_gene_length(&node);
1314        assert_eq!(length, 300);
1315
1316        let node_reverse = create_test_node(Strand::Reverse, 400, CodonType::Atg, 100);
1317        let length_reverse = calculate_gene_length(&node_reverse);
1318        assert_eq!(length_reverse, 300);
1319    }
1320
1321    #[test]
1322    fn test_calculate_gene_size_codons() {
1323        let node = create_test_node(Strand::Forward, 100, CodonType::Atg, 403); // 303 bp = 102 codons
1324        let size_codons = calculate_gene_size_codons(&node);
1325        assert_eq!(size_codons, 102.0); // (303 + 3) / 3 = 102
1326
1327        // Test with partial codon
1328        let node_partial = create_test_node(Strand::Forward, 100, CodonType::Atg, 401); // 301 bp
1329        let size_codons_partial = calculate_gene_size_codons(&node_partial);
1330        assert_eq!(size_codons_partial, 101.33333333333333); // (301 + 3) / 3 = 101.33
1331    }
1332
1333    #[test]
1334    fn test_calculate_short_gene_factors() {
1335        let gene_length = 100;
1336        let (negative_factor, positive_factor) = calculate_short_gene_factors(gene_length);
1337
1338        // For 100 bp gene with SHORT_GENE_THRESHOLD = 250
1339        assert_eq!(negative_factor, 2.5); // 250 / 100
1340        assert_eq!(positive_factor, 0.4); // 100 / 250
1341
1342        let (neg_edge, pos_edge) = calculate_short_gene_factors(SHORT_GENE_THRESHOLD);
1343        assert_eq!(neg_edge, 1.0);
1344        assert_eq!(pos_edge, 1.0);
1345    }
1346
1347    #[test]
1348    fn test_calculate_min_metagenomic_length() {
1349        let sequence_length = 1000;
1350        let min_length = calculate_min_metagenomic_length(sequence_length);
1351        assert_eq!(min_length, (1000.0_f64).sqrt() * 5.0); // sqrt(1000) * 5 ≈ 158.11
1352
1353        // Test different sequence length
1354        let min_length_small = calculate_min_metagenomic_length(100);
1355        assert_eq!(min_length_small, 50.0); // sqrt(100) * 5 = 50
1356    }
1357
1358    #[test]
1359    fn test_score_nodes_short_gene_penalty() {
1360        let seq = vec![0; 60];
1361        let reverse_seq = vec![3; 60];
1362        let training = create_test_training();
1363
1364        let mut nodes = vec![
1365            create_test_node(Strand::Forward, 30, CodonType::Atg, 35), // Short gene (length 5)
1366        ];
1367
1368        let encoded_sequence = EncodedSequence {
1369            forward_sequence: seq,
1370            reverse_complement_sequence: reverse_seq,
1371            unknown_sequence: vec![0; 60],
1372            masks: vec![],
1373            gc_content: 0.5,
1374            sequence_length: 60,
1375        };
1376
1377        let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1378
1379        assert!(result.is_ok());
1380        assert!(nodes[0].scores.total_score.is_finite());
1381    }
1382
1383    #[test]
1384    fn test_score_nodes_negative_coding_score() {
1385        let seq = vec![0; 40];
1386        let reverse_seq = vec![3; 40];
1387        let mut training = create_test_training();
1388
1389        // Set negative dicodon scores to force negative coding scores
1390        for score in training.gene_dicodon_table.iter_mut() {
1391            *score = -1.0;
1392        }
1393
1394        let mut nodes = vec![create_test_node(Strand::Forward, 25, CodonType::Atg, 35)];
1395
1396        let encoded_sequence = EncodedSequence {
1397            forward_sequence: seq,
1398            reverse_complement_sequence: reverse_seq,
1399            unknown_sequence: vec![0; 40],
1400            masks: vec![],
1401            gc_content: 0.5,
1402            sequence_length: 40,
1403        };
1404
1405        let result = score_nodes(&encoded_sequence, &mut nodes, &training, false, false);
1406
1407        assert!(result.is_ok());
1408        assert!(nodes[0].scores.total_score.is_finite());
1409    }
1410
1411    #[test]
1412    fn test_calculate_no_stop_probability_standard_genetic_code() {
1413        let gc_content = 0.5;
1414        let translation_table = 11;
1415
1416        let prob = calculate_no_stop_probability(gc_content, translation_table);
1417
1418        // Should be a valid probability between 0 and 1
1419        assert!(prob > 0.0 && prob < 1.0);
1420        assert!((prob - EXPECTED_NO_STOP_PROB).abs() < 0.01);
1421    }
1422
1423    #[test]
1424    fn test_calculate_no_stop_probability_non_standard_genetic_code() {
1425        let gc_content = 0.3;
1426        let translation_table = 4; // Non-standard
1427
1428        let prob = calculate_no_stop_probability(gc_content, translation_table);
1429
1430        // Should be a valid probability between 0 and 1
1431        assert!(prob > 0.0 && prob < 1.0);
1432    }
1433
1434    #[test]
1435    fn test_calculate_length_factor_normal_gene() {
1436        let gene_size_codons = 200.0;
1437        let no_stop_probability = 0.95;
1438
1439        let factor = calculate_length_factor(gene_size_codons, no_stop_probability);
1440
1441        // Should return a finite value
1442        assert!(factor.is_finite());
1443    }
1444
1445    #[test]
1446    fn test_calculate_length_factor_long_gene() {
1447        let gene_size_codons = 1500.0; // Longer than MAX_GENE_SIZE_CODONS
1448        let no_stop_probability = 0.95;
1449
1450        let factor = calculate_length_factor(gene_size_codons, no_stop_probability);
1451
1452        // Should return a finite value with scaling applied
1453        assert!(factor.is_finite());
1454    }
1455
1456    #[test]
1457    fn test_process_strand_first_pass_forward() {
1458        let seq = vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
1459        let training = create_test_training();
1460        let mut frame_scores = [0.0; 3];
1461        let mut last_positions = [0usize; 3];
1462
1463        let mut nodes = vec![
1464            create_test_node(Strand::Forward, 3, CodonType::Atg, 12),
1465            create_test_node(Strand::Forward, 12, CodonType::Stop, 12),
1466        ];
1467
1468        process_strand_first_pass(
1469            &mut nodes,
1470            &seq,
1471            seq.len(),
1472            Strand::Forward,
1473            &training,
1474            &mut frame_scores,
1475            &mut last_positions,
1476        );
1477
1478        // Should have calculated coding scores
1479        assert!(nodes[0].scores.coding_score.is_finite());
1480    }
1481
1482    #[test]
1483    fn test_process_strand_second_pass_forward() {
1484        let mut nodes = vec![
1485            create_test_node(Strand::Forward, 3, CodonType::Atg, 12),
1486            create_test_node(Strand::Forward, 6, CodonType::Gtg, 15),
1487        ];
1488
1489        // Set initial coding scores
1490        nodes[0].scores.coding_score = 10.0;
1491        nodes[1].scores.coding_score = 5.0;
1492
1493        let mut frame_scores = [0.0; 3];
1494
1495        process_strand_second_pass(&mut nodes, Strand::Forward, &mut frame_scores);
1496
1497        // Should have adjusted scores based on maximums
1498        assert!(nodes[0].scores.coding_score.is_finite());
1499        assert!(nodes[1].scores.coding_score.is_finite());
1500    }
1501
1502    #[test]
1503    fn test_process_strand_third_pass_forward() {
1504        let mut nodes = vec![create_test_node(Strand::Forward, 3, CodonType::Atg, 300)];
1505
1506        // Set initial coding score
1507        nodes[0].scores.coding_score = 5.0;
1508
1509        let no_stop_probability = 0.95;
1510        let mut frame_scores = [0.0; 3];
1511
1512        process_strand_third_pass(
1513            &mut nodes,
1514            Strand::Forward,
1515            no_stop_probability,
1516            &mut frame_scores,
1517        );
1518
1519        // Should have added length factor to coding score
1520        assert!(nodes[0].scores.coding_score > 5.0);
1521        assert!(nodes[0].scores.coding_score.is_finite());
1522    }
1523}