1use super::*;
2use crate::constants::{
3 MAX_MOTIF_LENGTH, MAX_RIBOSOME_DISTANCE, MIN_CUMULATIVE_SCORE, MIN_DISTANCE_FROM_START,
4 MIN_MOTIF_LENGTH, READING_FRAMES, SLIDING_WINDOW_SIZE,
5};
6
7pub fn calc_most_gc_frame(sequence: &[u8], sequence_length: usize) -> Vec<i32> {
9 if sequence_length < READING_FRAMES {
10 return vec![-1; sequence_length];
11 }
12
13 let forward_gc_counts = calculate_forward_gc_counts(sequence, sequence_length);
14 let backward_gc_counts = calculate_backward_gc_counts(sequence, sequence_length);
15 let total_gc_counts = calculate_total_gc_counts(
16 &forward_gc_counts,
17 &backward_gc_counts,
18 sequence,
19 sequence_length,
20 );
21
22 assign_gc_rich_frames(&total_gc_counts, sequence_length)
23}
24
25fn calculate_forward_gc_counts(sequence: &[u8], sequence_length: usize) -> Vec<i32> {
26 let mut counts = vec![0; sequence_length];
27
28 for reading_frame in 0..READING_FRAMES {
29 for position in (reading_frame..sequence_length).step_by(READING_FRAMES) {
30 counts[position] = if position < READING_FRAMES {
31 i32::from(is_gc(sequence, position))
32 } else {
33 counts[position - READING_FRAMES] + i32::from(is_gc(sequence, position))
34 };
35 }
36 }
37
38 counts
39}
40
41fn calculate_backward_gc_counts(sequence: &[u8], sequence_length: usize) -> Vec<i32> {
42 let mut counts = vec![0; sequence_length];
43
44 for reading_frame in 0..READING_FRAMES {
45 for position in (reading_frame..sequence_length).step_by(READING_FRAMES) {
46 let reverse_position = sequence_length - position - 1;
47 counts[reverse_position] = if position < READING_FRAMES {
48 i32::from(is_gc(sequence, reverse_position))
49 } else {
50 counts[reverse_position + READING_FRAMES]
51 + i32::from(is_gc(sequence, reverse_position))
52 };
53 }
54 }
55
56 counts
57}
58
59fn calculate_total_gc_counts(
60 forward_counts: &[i32],
61 backward_counts: &[i32],
62 sequence: &[u8],
63 sequence_length: usize,
64) -> Vec<i32> {
65 (0..sequence_length)
66 .map(|position| {
67 let mut total = forward_counts[position] + backward_counts[position]
68 - i32::from(is_gc(sequence, position));
69
70 if position >= SLIDING_WINDOW_SIZE / 2 {
71 total -= forward_counts[position - SLIDING_WINDOW_SIZE / 2];
72 }
73
74 if position + SLIDING_WINDOW_SIZE / 2 < sequence_length {
75 total -= backward_counts[position + SLIDING_WINDOW_SIZE / 2];
76 }
77
78 total
79 })
80 .collect()
81}
82
83fn assign_gc_rich_frames(total_gc_counts: &[i32], sequence_length: usize) -> Vec<i32> {
84 let mut gc_rich_frames = vec![-1; sequence_length];
85
86 for triplet_start in (0..sequence_length.saturating_sub(2)).step_by(READING_FRAMES) {
87 let counts = [
88 total_gc_counts[triplet_start],
89 total_gc_counts.get(triplet_start + 1).copied().unwrap_or(0),
90 total_gc_counts.get(triplet_start + 2).copied().unwrap_or(0),
91 ];
92
93 let max_gc_frame = find_max_reading_frame(counts[0], counts[1], counts[2]) as i32;
94
95 for frame_offset in 0..READING_FRAMES.min(sequence_length - triplet_start) {
96 gc_rich_frames[triplet_start + frame_offset] = max_gc_frame;
97 }
98 }
99
100 gc_rich_frames
101}
102
103#[must_use]
105pub fn shine_dalgarno_exact(
106 sequence: &[u8],
107 search_position: usize,
108 start_codon_position: usize,
109 ribosome_weights: &[f64],
110) -> usize {
111 if start_codon_position <= search_position + MIN_DISTANCE_FROM_START {
112 return 0;
113 }
114
115 let search_limit =
116 MAX_MOTIF_LENGTH.min(start_codon_position - MIN_DISTANCE_FROM_START - search_position);
117 let base_scores = calculate_exact_base_scores(sequence, search_position, search_limit);
118
119 find_best_exact_motif(
120 &base_scores,
121 search_limit,
122 search_position,
123 start_codon_position,
124 ribosome_weights,
125 )
126}
127
128fn calculate_exact_base_scores(
129 sequence: &[u8],
130 search_position: usize,
131 search_limit: usize,
132) -> Vec<f64> {
133 (0..search_limit)
134 .map(|pattern_index| {
135 let sequence_position = search_position + pattern_index;
136 match pattern_index % 3 {
137 0 if is_a(sequence, sequence_position) => 2.0,
138 1 | 2 if is_g(sequence, sequence_position) => 3.0,
139 _ => -10.0,
140 }
141 })
142 .collect()
143}
144
145fn find_best_exact_motif(
146 base_scores: &[f64],
147 search_limit: usize,
148 search_position: usize,
149 start_codon_position: usize,
150 ribosome_weights: &[f64],
151) -> usize {
152 let mut best_motif_index = 0;
153
154 for motif_length in (MIN_MOTIF_LENGTH..=search_limit).rev() {
155 for motif_start_offset in 0..=(search_limit - motif_length) {
156 if let Some(motif_index) = evaluate_exact_motif(
157 base_scores,
158 motif_start_offset,
159 motif_length,
160 search_position,
161 start_codon_position,
162 ) && is_better_motif(motif_index, best_motif_index, ribosome_weights)
163 {
164 best_motif_index = motif_index;
165 }
166 }
167 }
168
169 best_motif_index
170}
171
172fn evaluate_exact_motif(
173 base_scores: &[f64],
174 motif_start_offset: usize,
175 motif_length: usize,
176 search_position: usize,
177 start_codon_position: usize,
178) -> Option<usize> {
179 let start = motif_start_offset;
180 let end = motif_start_offset + motif_length;
181 let window = &base_scores[start..end];
182
183 if window.iter().any(|&score| score < 0.0) {
184 return None;
185 }
186
187 let cumulative_score: f64 = window.iter().copied().sum::<f64>() - 2.0;
188 let ribosome_distance =
189 start_codon_position - (search_position + motif_start_offset + motif_length);
190
191 if ribosome_distance > MAX_RIBOSOME_DISTANCE || cumulative_score < MIN_CUMULATIVE_SCORE {
192 return None;
193 }
194
195 let distance_category = categorize_distance(ribosome_distance, motif_length);
196 Some(map_score_to_motif_index(
197 cumulative_score as i32,
198 distance_category,
199 ))
200}
201
202#[must_use]
204pub fn shine_dalgarno_mm(
205 sequence: &[u8],
206 search_position: usize,
207 start_codon_position: usize,
208 ribosome_weights: &[f64],
209) -> usize {
210 if start_codon_position <= search_position + MIN_DISTANCE_FROM_START {
211 return 0;
212 }
213
214 let search_limit =
215 MAX_MOTIF_LENGTH.min(start_codon_position - MIN_DISTANCE_FROM_START - search_position);
216 let base_scores = calculate_mismatch_base_scores(sequence, search_position, search_limit);
217
218 find_best_mismatch_motif(
219 &base_scores,
220 search_limit,
221 search_position,
222 start_codon_position,
223 ribosome_weights,
224 )
225}
226
227fn calculate_mismatch_base_scores(
228 sequence: &[u8],
229 search_position: usize,
230 search_limit: usize,
231) -> Vec<f64> {
232 (0..search_limit)
233 .map(|pattern_index| {
234 let sequence_position = search_position + pattern_index;
235 match pattern_index % 3 {
236 0 => {
237 if is_a(sequence, sequence_position) {
238 2.0
239 } else {
240 -3.0
241 }
242 }
243 _ => {
244 if is_g(sequence, sequence_position) {
245 3.0
246 } else {
247 -2.0
248 }
249 }
250 }
251 })
252 .collect()
253}
254
255fn find_best_mismatch_motif(
256 base_scores: &[f64],
257 search_limit: usize,
258 search_position: usize,
259 start_codon_position: usize,
260 ribosome_weights: &[f64],
261) -> usize {
262 let mut best_motif_index = 0;
263
264 for motif_length in (5..=search_limit).rev() {
265 for motif_start_offset in 0..=(search_limit - motif_length) {
266 if let Some(motif_index) = evaluate_mismatch_motif(
267 base_scores,
268 motif_start_offset,
269 motif_length,
270 search_position,
271 start_codon_position,
272 ) && is_better_motif(motif_index, best_motif_index, ribosome_weights)
273 {
274 best_motif_index = motif_index;
275 }
276 }
277 }
278
279 best_motif_index
280}
281
282fn evaluate_mismatch_motif(
283 base_scores: &[f64],
284 motif_start_offset: usize,
285 motif_length: usize,
286 search_position: usize,
287 start_codon_position: usize,
288) -> Option<usize> {
289 let mut cumulative_score = -2.0;
290 let mut mismatch_count = 0;
291
292 let start = motif_start_offset;
293 let end = motif_start_offset + motif_length;
294 for (pos_in_motif, &score) in base_scores[start..end].iter().enumerate() {
295 cumulative_score += score;
296 if score < 0.0 {
297 mismatch_count += 1;
298 if pos_in_motif <= 1 || pos_in_motif >= motif_length - 2 {
299 cumulative_score -= 10.0;
300 }
301 }
302 }
303
304 if mismatch_count != 1 {
305 return None;
306 }
307
308 let ribosome_distance =
309 start_codon_position - (search_position + motif_start_offset + motif_length);
310
311 if ribosome_distance > MAX_RIBOSOME_DISTANCE || cumulative_score < MIN_CUMULATIVE_SCORE {
312 return None;
313 }
314
315 let distance_category = categorize_mismatch_distance(ribosome_distance);
316 Some(map_mismatch_score_to_motif_index(
317 cumulative_score as i32,
318 distance_category,
319 ))
320}
321
322const fn categorize_distance(ribosome_distance: usize, motif_length: usize) -> usize {
323 match ribosome_distance {
324 0..=4 => {
325 if motif_length < 5 {
326 2
327 } else {
328 1
329 }
330 }
331 5..=10 => 0,
332 11..=12 => {
333 if motif_length < 5 {
334 1
335 } else {
336 2
337 }
338 }
339 _ => 3,
340 }
341}
342
343const fn categorize_mismatch_distance(ribosome_distance: usize) -> usize {
344 match ribosome_distance {
345 0..=4 => 1,
346 5..=10 => 0,
347 11..=12 => 2,
348 _ => 3,
349 }
350}
351
352const fn map_score_to_motif_index(score: i32, distance_category: usize) -> usize {
353 match (score, distance_category) {
354 (6, 2) => 1,
355 (6, 3) => 2,
356 (8 | 9, 3) => 3,
357 (6, 1) => 6,
358 (11 | 12 | 14, 3) => 10,
359 (8 | 9, 2) => 11,
360 (8 | 9, 1) => 12,
361 (6, 0) => 13,
362 (8, 0) => 15,
363 (9, 0) => 16,
364 (11 | 12, 2) => 20,
365 (11, 1) => 21,
366 (11, 0) => 22,
367 (12, 1) => 23,
368 (12, 0) => 24,
369 (14, 2) => 25,
370 (14, 1) => 26,
371 (14, 0) => 27,
372 _ => 0,
373 }
374}
375
376const fn map_mismatch_score_to_motif_index(score: i32, distance_category: usize) -> usize {
377 match (score, distance_category) {
378 (6 | 7, 3) => 2,
379 (9, 3) => 3,
380 (6, 2) => 4,
381 (6, 1) => 5,
382 (6, 0) => 9,
383 (7, 2) => 7,
384 (7, 1) => 8,
385 (7, 0) => 14,
386 (9, 2) => 17,
387 (9, 1) => 18,
388 (9, 0) => 19,
389 _ => 0,
390 }
391}
392
393fn is_better_motif(current_index: usize, best_index: usize, ribosome_weights: &[f64]) -> bool {
394 ribosome_weights[current_index] > ribosome_weights[best_index]
395 || (ribosome_weights[current_index] == ribosome_weights[best_index]
396 && current_index > best_index)
397}
398
399#[cfg(test)]
400mod tests {
401 use super::*;
402
403 #[test]
404 fn test_calc_most_gc_frame_basic() {
405 let sequence = b"ATCGGCGCGCTAATCGGCGC";
406 let result = calc_most_gc_frame(sequence, sequence.len());
407 assert_eq!(result.len(), sequence.len());
408 for &frame in &result {
409 assert!((-1..=2).contains(&frame));
410 }
411 }
412
413 #[test]
414 fn test_calc_most_gc_frame_empty() {
415 let sequence = b"";
416 let result = calc_most_gc_frame(sequence, 0);
417 assert_eq!(result.len(), 0);
418 }
419
420 #[test]
421 fn test_shine_dalgarno_exact_basic() {
422 let ribosome_weights = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
423 let sequence = b"AGGAGGTATGATCGGC";
424
425 let result = shine_dalgarno_exact(sequence, 5, 12, &ribosome_weights);
426 assert!(result < ribosome_weights.len());
427 }
428
429 #[test]
430 fn test_shine_dalgarno_mm_basic() {
431 let ribosome_weights = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
432 let sequence = b"AGGAGGTATGATCGGC";
433
434 let result = shine_dalgarno_mm(sequence, 0, 8, &ribosome_weights);
435 assert!(result < ribosome_weights.len());
436 }
437
438 #[test]
439 fn test_assign_gc_rich_frames_basic() {
440 let total_gc_counts = vec![5, 3, 8, 2, 7, 1, 9, 4];
441 let result = assign_gc_rich_frames(&total_gc_counts, 8);
442 assert_eq!(result.len(), 8);
443
444 for &frame in &result {
445 assert!((-1..=2).contains(&frame));
446 }
447 }
448}