orphos_core/sequence/
processing.rs

1use super::*;
2use crate::constants::{
3    MAX_MOTIF_LENGTH, MAX_RIBOSOME_DISTANCE, MIN_CUMULATIVE_SCORE, MIN_DISTANCE_FROM_START,
4    MIN_MOTIF_LENGTH, READING_FRAMES, SLIDING_WINDOW_SIZE,
5};
6
7/// Calculate most GC-rich frame for each position
8pub fn calc_most_gc_frame(sequence: &[u8], sequence_length: usize) -> Vec<i32> {
9    if sequence_length < READING_FRAMES {
10        return vec![-1; sequence_length];
11    }
12
13    let forward_gc_counts = calculate_forward_gc_counts(sequence, sequence_length);
14    let backward_gc_counts = calculate_backward_gc_counts(sequence, sequence_length);
15    let total_gc_counts = calculate_total_gc_counts(
16        &forward_gc_counts,
17        &backward_gc_counts,
18        sequence,
19        sequence_length,
20    );
21
22    assign_gc_rich_frames(&total_gc_counts, sequence_length)
23}
24
25fn calculate_forward_gc_counts(sequence: &[u8], sequence_length: usize) -> Vec<i32> {
26    let mut counts = vec![0; sequence_length];
27
28    for reading_frame in 0..READING_FRAMES {
29        for position in (reading_frame..sequence_length).step_by(READING_FRAMES) {
30            counts[position] = if position < READING_FRAMES {
31                i32::from(is_gc(sequence, position))
32            } else {
33                counts[position - READING_FRAMES] + i32::from(is_gc(sequence, position))
34            };
35        }
36    }
37
38    counts
39}
40
41fn calculate_backward_gc_counts(sequence: &[u8], sequence_length: usize) -> Vec<i32> {
42    let mut counts = vec![0; sequence_length];
43
44    for reading_frame in 0..READING_FRAMES {
45        for position in (reading_frame..sequence_length).step_by(READING_FRAMES) {
46            let reverse_position = sequence_length - position - 1;
47            counts[reverse_position] = if position < READING_FRAMES {
48                i32::from(is_gc(sequence, reverse_position))
49            } else {
50                counts[reverse_position + READING_FRAMES]
51                    + i32::from(is_gc(sequence, reverse_position))
52            };
53        }
54    }
55
56    counts
57}
58
59fn calculate_total_gc_counts(
60    forward_counts: &[i32],
61    backward_counts: &[i32],
62    sequence: &[u8],
63    sequence_length: usize,
64) -> Vec<i32> {
65    (0..sequence_length)
66        .map(|position| {
67            let mut total = forward_counts[position] + backward_counts[position]
68                - i32::from(is_gc(sequence, position));
69
70            if position >= SLIDING_WINDOW_SIZE / 2 {
71                total -= forward_counts[position - SLIDING_WINDOW_SIZE / 2];
72            }
73
74            if position + SLIDING_WINDOW_SIZE / 2 < sequence_length {
75                total -= backward_counts[position + SLIDING_WINDOW_SIZE / 2];
76            }
77
78            total
79        })
80        .collect()
81}
82
83fn assign_gc_rich_frames(total_gc_counts: &[i32], sequence_length: usize) -> Vec<i32> {
84    let mut gc_rich_frames = vec![-1; sequence_length];
85
86    for triplet_start in (0..sequence_length.saturating_sub(2)).step_by(READING_FRAMES) {
87        let counts = [
88            total_gc_counts[triplet_start],
89            total_gc_counts.get(triplet_start + 1).copied().unwrap_or(0),
90            total_gc_counts.get(triplet_start + 2).copied().unwrap_or(0),
91        ];
92
93        let max_gc_frame = find_max_reading_frame(counts[0], counts[1], counts[2]) as i32;
94
95        for frame_offset in 0..READING_FRAMES.min(sequence_length - triplet_start) {
96            gc_rich_frames[triplet_start + frame_offset] = max_gc_frame;
97        }
98    }
99
100    gc_rich_frames
101}
102
103/// Find exact Shine-Dalgarno motifs
104#[must_use]
105pub fn shine_dalgarno_exact(
106    sequence: &[u8],
107    search_position: usize,
108    start_codon_position: usize,
109    ribosome_weights: &[f64],
110) -> usize {
111    if start_codon_position <= search_position + MIN_DISTANCE_FROM_START {
112        return 0;
113    }
114
115    let search_limit =
116        MAX_MOTIF_LENGTH.min(start_codon_position - MIN_DISTANCE_FROM_START - search_position);
117    let base_scores = calculate_exact_base_scores(sequence, search_position, search_limit);
118
119    find_best_exact_motif(
120        &base_scores,
121        search_limit,
122        search_position,
123        start_codon_position,
124        ribosome_weights,
125    )
126}
127
128fn calculate_exact_base_scores(
129    sequence: &[u8],
130    search_position: usize,
131    search_limit: usize,
132) -> Vec<f64> {
133    (0..search_limit)
134        .map(|pattern_index| {
135            let sequence_position = search_position + pattern_index;
136            match pattern_index % 3 {
137                0 if is_a(sequence, sequence_position) => 2.0,
138                1 | 2 if is_g(sequence, sequence_position) => 3.0,
139                _ => -10.0,
140            }
141        })
142        .collect()
143}
144
145fn find_best_exact_motif(
146    base_scores: &[f64],
147    search_limit: usize,
148    search_position: usize,
149    start_codon_position: usize,
150    ribosome_weights: &[f64],
151) -> usize {
152    let mut best_motif_index = 0;
153
154    for motif_length in (MIN_MOTIF_LENGTH..=search_limit).rev() {
155        for motif_start_offset in 0..=(search_limit - motif_length) {
156            if let Some(motif_index) = evaluate_exact_motif(
157                base_scores,
158                motif_start_offset,
159                motif_length,
160                search_position,
161                start_codon_position,
162            ) && is_better_motif(motif_index, best_motif_index, ribosome_weights)
163            {
164                best_motif_index = motif_index;
165            }
166        }
167    }
168
169    best_motif_index
170}
171
172fn evaluate_exact_motif(
173    base_scores: &[f64],
174    motif_start_offset: usize,
175    motif_length: usize,
176    search_position: usize,
177    start_codon_position: usize,
178) -> Option<usize> {
179    let start = motif_start_offset;
180    let end = motif_start_offset + motif_length;
181    let window = &base_scores[start..end];
182
183    if window.iter().any(|&score| score < 0.0) {
184        return None;
185    }
186
187    let cumulative_score: f64 = window.iter().copied().sum::<f64>() - 2.0;
188    let ribosome_distance =
189        start_codon_position - (search_position + motif_start_offset + motif_length);
190
191    if ribosome_distance > MAX_RIBOSOME_DISTANCE || cumulative_score < MIN_CUMULATIVE_SCORE {
192        return None;
193    }
194
195    let distance_category = categorize_distance(ribosome_distance, motif_length);
196    Some(map_score_to_motif_index(
197        cumulative_score as i32,
198        distance_category,
199    ))
200}
201
202/// Find Shine-Dalgarno motifs with single mismatch
203#[must_use]
204pub fn shine_dalgarno_mm(
205    sequence: &[u8],
206    search_position: usize,
207    start_codon_position: usize,
208    ribosome_weights: &[f64],
209) -> usize {
210    if start_codon_position <= search_position + MIN_DISTANCE_FROM_START {
211        return 0;
212    }
213
214    let search_limit =
215        MAX_MOTIF_LENGTH.min(start_codon_position - MIN_DISTANCE_FROM_START - search_position);
216    let base_scores = calculate_mismatch_base_scores(sequence, search_position, search_limit);
217
218    find_best_mismatch_motif(
219        &base_scores,
220        search_limit,
221        search_position,
222        start_codon_position,
223        ribosome_weights,
224    )
225}
226
227fn calculate_mismatch_base_scores(
228    sequence: &[u8],
229    search_position: usize,
230    search_limit: usize,
231) -> Vec<f64> {
232    (0..search_limit)
233        .map(|pattern_index| {
234            let sequence_position = search_position + pattern_index;
235            match pattern_index % 3 {
236                0 => {
237                    if is_a(sequence, sequence_position) {
238                        2.0
239                    } else {
240                        -3.0
241                    }
242                }
243                _ => {
244                    if is_g(sequence, sequence_position) {
245                        3.0
246                    } else {
247                        -2.0
248                    }
249                }
250            }
251        })
252        .collect()
253}
254
255fn find_best_mismatch_motif(
256    base_scores: &[f64],
257    search_limit: usize,
258    search_position: usize,
259    start_codon_position: usize,
260    ribosome_weights: &[f64],
261) -> usize {
262    let mut best_motif_index = 0;
263
264    for motif_length in (5..=search_limit).rev() {
265        for motif_start_offset in 0..=(search_limit - motif_length) {
266            if let Some(motif_index) = evaluate_mismatch_motif(
267                base_scores,
268                motif_start_offset,
269                motif_length,
270                search_position,
271                start_codon_position,
272            ) && is_better_motif(motif_index, best_motif_index, ribosome_weights)
273            {
274                best_motif_index = motif_index;
275            }
276        }
277    }
278
279    best_motif_index
280}
281
282fn evaluate_mismatch_motif(
283    base_scores: &[f64],
284    motif_start_offset: usize,
285    motif_length: usize,
286    search_position: usize,
287    start_codon_position: usize,
288) -> Option<usize> {
289    let mut cumulative_score = -2.0;
290    let mut mismatch_count = 0;
291
292    let start = motif_start_offset;
293    let end = motif_start_offset + motif_length;
294    for (pos_in_motif, &score) in base_scores[start..end].iter().enumerate() {
295        cumulative_score += score;
296        if score < 0.0 {
297            mismatch_count += 1;
298            if pos_in_motif <= 1 || pos_in_motif >= motif_length - 2 {
299                cumulative_score -= 10.0;
300            }
301        }
302    }
303
304    if mismatch_count != 1 {
305        return None;
306    }
307
308    let ribosome_distance =
309        start_codon_position - (search_position + motif_start_offset + motif_length);
310
311    if ribosome_distance > MAX_RIBOSOME_DISTANCE || cumulative_score < MIN_CUMULATIVE_SCORE {
312        return None;
313    }
314
315    let distance_category = categorize_mismatch_distance(ribosome_distance);
316    Some(map_mismatch_score_to_motif_index(
317        cumulative_score as i32,
318        distance_category,
319    ))
320}
321
322const fn categorize_distance(ribosome_distance: usize, motif_length: usize) -> usize {
323    match ribosome_distance {
324        0..=4 => {
325            if motif_length < 5 {
326                2
327            } else {
328                1
329            }
330        }
331        5..=10 => 0,
332        11..=12 => {
333            if motif_length < 5 {
334                1
335            } else {
336                2
337            }
338        }
339        _ => 3,
340    }
341}
342
343const fn categorize_mismatch_distance(ribosome_distance: usize) -> usize {
344    match ribosome_distance {
345        0..=4 => 1,
346        5..=10 => 0,
347        11..=12 => 2,
348        _ => 3,
349    }
350}
351
352const fn map_score_to_motif_index(score: i32, distance_category: usize) -> usize {
353    match (score, distance_category) {
354        (6, 2) => 1,
355        (6, 3) => 2,
356        (8 | 9, 3) => 3,
357        (6, 1) => 6,
358        (11 | 12 | 14, 3) => 10,
359        (8 | 9, 2) => 11,
360        (8 | 9, 1) => 12,
361        (6, 0) => 13,
362        (8, 0) => 15,
363        (9, 0) => 16,
364        (11 | 12, 2) => 20,
365        (11, 1) => 21,
366        (11, 0) => 22,
367        (12, 1) => 23,
368        (12, 0) => 24,
369        (14, 2) => 25,
370        (14, 1) => 26,
371        (14, 0) => 27,
372        _ => 0,
373    }
374}
375
376const fn map_mismatch_score_to_motif_index(score: i32, distance_category: usize) -> usize {
377    match (score, distance_category) {
378        (6 | 7, 3) => 2,
379        (9, 3) => 3,
380        (6, 2) => 4,
381        (6, 1) => 5,
382        (6, 0) => 9,
383        (7, 2) => 7,
384        (7, 1) => 8,
385        (7, 0) => 14,
386        (9, 2) => 17,
387        (9, 1) => 18,
388        (9, 0) => 19,
389        _ => 0,
390    }
391}
392
393fn is_better_motif(current_index: usize, best_index: usize, ribosome_weights: &[f64]) -> bool {
394    ribosome_weights[current_index] > ribosome_weights[best_index]
395        || (ribosome_weights[current_index] == ribosome_weights[best_index]
396            && current_index > best_index)
397}
398
399#[cfg(test)]
400mod tests {
401    use super::*;
402
403    #[test]
404    fn test_calc_most_gc_frame_basic() {
405        let sequence = b"ATCGGCGCGCTAATCGGCGC";
406        let result = calc_most_gc_frame(sequence, sequence.len());
407        assert_eq!(result.len(), sequence.len());
408        for &frame in &result {
409            assert!((-1..=2).contains(&frame));
410        }
411    }
412
413    #[test]
414    fn test_calc_most_gc_frame_empty() {
415        let sequence = b"";
416        let result = calc_most_gc_frame(sequence, 0);
417        assert_eq!(result.len(), 0);
418    }
419
420    #[test]
421    fn test_shine_dalgarno_exact_basic() {
422        let ribosome_weights = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
423        let sequence = b"AGGAGGTATGATCGGC";
424
425        let result = shine_dalgarno_exact(sequence, 5, 12, &ribosome_weights);
426        assert!(result < ribosome_weights.len());
427    }
428
429    #[test]
430    fn test_shine_dalgarno_mm_basic() {
431        let ribosome_weights = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
432        let sequence = b"AGGAGGTATGATCGGC";
433
434        let result = shine_dalgarno_mm(sequence, 0, 8, &ribosome_weights);
435        assert!(result < ribosome_weights.len());
436    }
437
438    #[test]
439    fn test_assign_gc_rich_frames_basic() {
440        let total_gc_counts = vec![5, 3, 8, 2, 7, 1, 9, 4];
441        let result = assign_gc_rich_frames(&total_gc_counts, 8);
442        assert_eq!(result.len(), 8);
443
444        for &frame in &result {
445            assert!((-1..=2).contains(&frame));
446        }
447    }
448}