primaldimer_rs/
lib.rs

1mod scores;
2use scores::{MATCH_ARRAY, NN_SCORES, SEQ1_OVERHANG_ARRAY, SEQ2_OVERHANG_ARRAY};
3
4use itertools::Itertools;
5
6static BONUS_ARRAY: [f64; 10] = [
7    1.11217618,
8    0.55187469,
9    1.01582516,
10    1.03180592,
11    -2.76687727,
12    -0.81903133,
13    0.93596145,
14    2.32758405,
15    3.24507248,
16    0.80416919,
17];
18// 0 = PENALTY_DOUBLE_MISMATCH
19// 1 = PENALTY_LEFT_OVERHANG_MISMATCH
20// 2 = PENALTY_RIGHT_OVERHANG_MISMATCH
21// 3 = BONUS_ALL_MATCH
22// 4 = BONUS_3P_MATCH_GC
23// 5 = BONUS_3P_MATCH_AT
24// 6 = SCORE_3P_MISMATCH
25// 7 = LONGEST_MATCH_COEF
26// 8 = MATCH_PROP_COEF
27// 9 = BUBBLE_COEF
28
29//base_to_u8 = {"A": 65, "T": 84, "C": 67, "G": 71}
30
31// base_to_encode = {"A": 0, "T": 3, "C": 1, "G": 2}
32pub fn encode_base(sequence: &str) -> Vec<usize> {
33    let encoded_base = sequence
34        .as_bytes()
35        .iter()
36        .map(|base| match *base {
37            65 => 0,
38            84 => 3,
39            67 => 1,
40            71 => 2,
41            _ => panic!("NON STANDRD BASE found in {}", sequence),
42        })
43        .collect();
44    return encoded_base;
45}
46
47fn calc_dangling_ends_stabilty(
48    seq1: &[usize],
49    seq2: &[usize],
50    mapping: &Vec<(usize, usize)>,
51) -> f64 {
52    let mut dg_score = 0.;
53
54    // Look for overhang on the right side
55    let (seq2_i, seq1_i) = mapping[mapping.len() - 1];
56
57    match SEQ2_OVERHANG_ARRAY[seq1[seq1_i]][seq2[seq2_i]][seq2[seq2_i + 1]] {
58        Some(score) => dg_score += score,
59        None => dg_score += BONUS_ARRAY[2],
60    }
61
62    // Look for overhang on the leftside
63    let (seq2_i, seq1_i) = mapping[0];
64
65    if seq1_i > 0 {
66        match SEQ1_OVERHANG_ARRAY[seq1[seq1_i]][seq2[seq2_i]][seq1[seq1_i - 1]] {
67            Some(score) => dg_score += score,
68            None => dg_score += BONUS_ARRAY[1],
69        }
70    } else if seq2_i > 0 {
71        match SEQ2_OVERHANG_ARRAY[seq1[seq1_i]][seq2[seq2_i]][seq2[seq2_i - 1]] {
72            Some(score) => dg_score += score,
73            None => dg_score += BONUS_ARRAY[1],
74        }
75    }
76
77    return dg_score;
78}
79
80fn calc_nn_thermo(seq1: &[usize], seq2: &[usize], mapping: &Vec<(usize, usize)>) -> f64 {
81    let mut dg_score: f64 = 0.;
82    for (seq2_i, seq1_i) in mapping.iter() {
83        match NN_SCORES[seq1[*seq1_i]][seq1[*seq1_i + 1]][seq2[*seq2_i]][seq2[*seq2_i + 1]] {
84            Some(score) => dg_score += score,   // If match or single mismatch
85            None => dg_score += BONUS_ARRAY[0], // If Double mismatch
86        }
87    }
88    return dg_score;
89}
90
91fn calc_extention(seq1: &[usize], match_bool: &Vec<bool>) -> Option<f64> {
92    // Guard for no matches in final two 3' bases
93    if !match_bool[match_bool.len() - 2..].iter().any(|f| *f) {
94        return None;
95    }
96
97    let mut score: f64 = 0.;
98
99    let kmer_3p_bool: Vec<(usize, &bool)> = match_bool
100        .iter()
101        .rev()
102        .enumerate()
103        .filter(|(index, _bool)| *index < 4)
104        .collect();
105
106    // Look at the last 4 bases in the match
107
108    for (index, match_bool) in kmer_3p_bool.iter() {
109        let seq1_index = seq1.len() - 1 - index;
110        // Only count matches
111        if **match_bool {
112            // Add match score
113            match seq1[seq1_index] {
114                1 | 2 => score += 3. * (1. / (index + 1) as f64), // CG match
115                0 | 3 => score += 2. * (1. / (index + 1) as f64), // AT match
116                _ => continue,
117            }
118        }
119    }
120
121    if kmer_3p_bool.iter().all(|(_index, bool)| **bool) {
122        score += 2.;
123    }
124
125    return Some(-score);
126}
127
128fn apply_bonus(match_bool: &Vec<bool>) -> f64 {
129    // Find the longest continous match
130    let mut current_match = 0;
131    let mut longest_match = 0;
132
133    for m in match_bool.iter() {
134        match m {
135            true => current_match += 1,
136            false => current_match = 0,
137        }
138        if current_match > longest_match {
139            longest_match = current_match
140        }
141    }
142    let mut score = 0.;
143
144    // Find proportion of matches
145    score += -((0.8
146        - (match_bool.iter().filter(|b| **b).count() as f64 / match_bool.len() as f64))
147        * BONUS_ARRAY[8]);
148
149    // Group the match bool
150    let grouped_match_bool: Vec<(bool, usize)> = match_bool
151        .iter()
152        .group_by(|bool| **bool)
153        .into_iter()
154        .map(|(bool, iter)| (bool, iter.count()))
155        .collect();
156
157    // Work out the longest match
158    let longest_match = grouped_match_bool
159        .iter()
160        .filter(|(bool, _count)| *bool)
161        .map(|(_bool, count)| count)
162        .max();
163
164    match longest_match {
165        Some(max) => score += -(*max as f64 * BONUS_ARRAY[7]),
166        None => (),
167    }
168
169    // Resolve bubbles
170    for (match_bool, count) in grouped_match_bool.iter() {
171        if !*match_bool && count > &2 {
172            score += -((*count as f64 - 2.) * BONUS_ARRAY[0]) * BONUS_ARRAY[9]
173        }
174    }
175
176    return score;
177}
178
179pub fn calc_at_offset(seq1: &[usize], seq2: &[usize], offset: i32) -> Option<f64> {
180    // Create the mapping
181    let mut mapping: Vec<(usize, usize)> = Vec::new();
182    for x in 0..seq1.len() {
183        let seq2_index = x as i32 + offset;
184        if seq2_index >= 0 {
185            mapping.push((seq2_index as usize, x))
186        }
187    }
188
189    // Create the match_bool
190    let match_bool = mapping
191        .iter()
192        .map(|(seq2i, seq1i)| MATCH_ARRAY[seq1[*seq1i] as usize][seq2[*seq2i] as usize])
193        .collect();
194
195    let mut dg_score = calc_dangling_ends_stabilty(&seq1, &seq2, &mapping);
196
197    match calc_extention(seq1, &match_bool) {
198        Some(score) => dg_score += score,
199        None => return None,
200    };
201
202    // Apply longest match, and match proportion
203    dg_score += apply_bonus(&match_bool);
204
205    // Remove the end element of mapping before giving to NN
206    mapping.pop();
207    dg_score += calc_nn_thermo(seq1, seq2, &mapping);
208
209    return Some(dg_score);
210}
211
212fn does_seq1_extend(seq1: &[usize], seq2: &[usize], t: f64) -> bool {
213    let mut seq2_rev = seq2.to_owned();
214    seq2_rev.reverse();
215
216    for offset in -(seq1.len() as i32 - 2)..(seq2.len() as i32) - (seq1.len() as i32) {
217        match calc_at_offset(&seq1, &seq2_rev, offset) {
218            Some(score) => {
219                //println!("{}", score);
220                if score <= t {
221                    return true;
222                }
223            }
224            None => (),
225        }
226    }
227    return false;
228}
229
230pub fn do_seqs_interact(seq1: &str, seq2: &str, t: f64) -> bool {
231    let s1 = encode_base(seq1);
232    let s2 = encode_base(seq2);
233
234    return does_seq1_extend(&s1, &s2, t) | does_seq1_extend(&s2, &s1, t);
235}
236
237pub fn do_pools_interact(pool1: Vec<&str>, pool2: Vec<&str>, t: f64) -> bool {
238    // Encode the pools
239    let pool1_encoded: Vec<Vec<usize>> = pool1.iter().map(|s| encode_base(s)).collect();
240    let pool2_encoded: Vec<Vec<usize>> = pool2.iter().map(|s| encode_base(s)).collect();
241
242    // Will look for interactions between every seq in pool1 and pool2
243    for (s1, s2) in pool1_encoded.iter().cartesian_product(pool2_encoded.iter()) {
244        if does_seq1_extend(&s1, &s2, t) | does_seq1_extend(&s2, &s1, t) {
245            return true;
246        }
247    }
248    return false;
249}
250
251pub fn do_pools_interact_adv(pool1: &Vec<String>, pool2: &Vec<String>, t: f64) -> bool {
252    // Encode the pools
253    let pool1_encoded: Vec<Vec<usize>> = pool1.iter().map(|s| encode_base(s)).collect();
254    let pool2_encoded: Vec<Vec<usize>> = pool2.iter().map(|s| encode_base(s)).collect();
255
256    // Will look for interactions between every seq in pool1 and pool2
257    for (s1, s2) in pool1_encoded.iter().cartesian_product(pool2_encoded.iter()) {
258        if does_seq1_extend(&s1, &s2, t) | does_seq1_extend(&s2, &s1, t) {
259            return true;
260        }
261    }
262    return false;
263}
264
265#[cfg(test)]
266mod tests {
267    use crate::{calc_nn_thermo, encode_base, NN_SCORES};
268    #[test]
269    fn test_valid_encode_base() {
270        let seq = "ATCG";
271
272        // base_to_encode = {"A": 0, "T": 3, "C": 1, "G": 2}
273        assert_eq!(encode_base(seq), vec![0, 3, 1, 2])
274    }
275    #[test]
276    #[should_panic]
277    fn test_invalid_encode_base() {
278        encode_base("z");
279    }
280    #[test]
281    fn test_all_match() {
282        // Set up values
283        let seq1 = "ACGAT";
284        let seq2 = "TGCTA";
285        let offset = 0;
286
287        let a = encode_base("A")[0];
288        let t = encode_base("T")[0];
289        let c = encode_base("C")[0];
290        let g = encode_base("G")[0];
291
292        let mut mapping: Vec<(usize, usize)> = Vec::new();
293        for x in 0..seq1.len() {
294            let seq2_index = x as i32 + offset;
295            if seq2_index >= 0 {
296                mapping.push((seq2_index as usize, x))
297            }
298        }
299        // Remove the last element
300        mapping.pop();
301        let mut pred_score: f64 = 0.;
302
303        // AC / TG match
304        pred_score += NN_SCORES[a][c][t][g].unwrap_or(0.);
305        // CG / GC match
306        pred_score += NN_SCORES[c][g][g][c].unwrap_or(0.);
307        // GA / CT match
308        pred_score += NN_SCORES[g][a][c][t].unwrap_or(0.);
309        // AT / TA match
310        pred_score += NN_SCORES[a][t][t][a].unwrap_or(0.);
311
312        // base_to_encode = {"A": 0, "T": 3, "C": 1, "G": 2}
313        assert_eq!(
314            calc_nn_thermo(&encode_base(seq1), &encode_base(seq2), &mapping),
315            pred_score
316        )
317    }
318    #[test]
319    fn test_mismatch_with_offset() {
320        // Set up values
321        //   ACCTC
322        //   |||.|
323        // ACTGGTGCTAC
324        let seq1 = "ACCTC";
325        let seq2 = "ACTGGTGCTAC";
326        let offset = 2;
327
328        let a = encode_base("A")[0];
329        let t = encode_base("T")[0];
330        let c = encode_base("C")[0];
331        let g = encode_base("G")[0];
332
333        let mut mapping: Vec<(usize, usize)> = Vec::new();
334        for x in 0..seq1.len() {
335            let seq2_index = x as i32 + offset;
336            if seq2_index >= 0 {
337                mapping.push((seq2_index as usize, x))
338            }
339        }
340        // Remove the last element
341        mapping.pop();
342
343        let mut pred_score = 0.;
344
345        // AC / TG match
346        pred_score += NN_SCORES[a][c][t][g].unwrap_or(0.);
347        // CC / GG  match
348        pred_score += NN_SCORES[c][c][g][g].unwrap_or(0.);
349        // CT / GT mismatch
350        pred_score += NN_SCORES[c][t][g][t].unwrap_or(0.);
351        // TC / TG mismatch
352        pred_score += NN_SCORES[t][c][t][g].unwrap_or(0.);
353
354        // base_to_encode = {"A": 0, "T": 1, "C": 2, "G": 3}
355        assert_eq!(
356            calc_nn_thermo(&encode_base(seq1), &encode_base(seq2), &mapping),
357            pred_score
358        )
359    }
360    #[test]
361    fn test_match_array() {
362        let a = encode_base("A")[0];
363        let t = encode_base("T")[0];
364        let c = encode_base("C")[0];
365        let g = encode_base("G")[0];
366
367        // MATCHES
368        // A / T
369        assert!(super::MATCH_ARRAY[a][t]);
370        // T / A
371        assert!(super::MATCH_ARRAY[t][a]);
372        // C / G
373        assert!(super::MATCH_ARRAY[c][g]);
374        // G / C
375        assert!(super::MATCH_ARRAY[g][c]);
376        // MISMATCHES
377        // A / A
378        assert_eq!(super::MATCH_ARRAY[a][a], false);
379        // A / C
380        assert_eq!(super::MATCH_ARRAY[a][c], false);
381        // A / G
382        assert_eq!(super::MATCH_ARRAY[a][g], false);
383
384        // T / T
385        assert_eq!(super::MATCH_ARRAY[t][t], false);
386        // T / C
387        assert_eq!(super::MATCH_ARRAY[t][c], false);
388        // T / G
389        assert_eq!(super::MATCH_ARRAY[t][g], false);
390
391        // C / C
392        assert_eq!(super::MATCH_ARRAY[c][c], false);
393        // C / A
394        assert_eq!(super::MATCH_ARRAY[c][a], false);
395        // C / T
396        assert_eq!(super::MATCH_ARRAY[c][t], false);
397
398        // G / G
399        assert_eq!(super::MATCH_ARRAY[g][g], false);
400        // G / A
401        assert_eq!(super::MATCH_ARRAY[g][a], false);
402        // G / T
403        assert_eq!(super::MATCH_ARRAY[g][t], false);
404    }
405    #[test]
406    fn test_ensure_consistant_result() {
407        // nCoV-2019_76_RIGHT_0 nCoV-2019_18_LEFT_0
408        // score: -40.74 (-40.736826004)
409        // 5'-ACACCTGTGCCTGTTAAACCAT-3' >
410        //                ||||||||||
411        //             3'-CAATTTGGTAATTGAACACCCATAAAGGT-5'
412
413        let s1 = "ACACCTGTGCCTGTTAAACCAT"; //5'-3'
414        let s2 = "CAATTTGGTAATTGAACACCCATAAAGGT"; //3'-5'
415        let offset = -12;
416
417        assert_eq!(
418            super::calc_at_offset(&encode_base(s1), &encode_base(s2), offset),
419            Some(-40.736826004)
420        );
421    }
422    #[test]
423    fn test_ensure_detection() {
424        // nCoV-2019_76_RIGHT_0 nCoV-2019_18_LEFT_0
425        // score: -40.74 (-40.736826004)
426        // 5'-ACACCTGTGCCTGTTAAACCAT-3' >
427        //                ||||||||||
428        //             3'-CAATTTGGTAATTGAACACCCATAAAGGT-5'
429
430        let s1 = "ACACCTGTGCCTGTTAAACCAT"; //5'-3'
431        let s2 = "TGGAAATACCCACAAGTTAATGGTTTAAC"; //5'-3'
432        let threshold = -27.0;
433
434        assert!(super::does_seq1_extend(
435            &encode_base(s1),
436            &encode_base(s2),
437            threshold,
438        ));
439    }
440}