1mod scores;
2use scores::{MATCH_ARRAY, NN_SCORES, SEQ1_OVERHANG_ARRAY, SEQ2_OVERHANG_ARRAY};
3
4use itertools::Itertools;
5
6static BONUS_ARRAY: [f64; 10] = [
7 1.11217618,
8 0.55187469,
9 1.01582516,
10 1.03180592,
11 -2.76687727,
12 -0.81903133,
13 0.93596145,
14 2.32758405,
15 3.24507248,
16 0.80416919,
17];
18pub fn encode_base(sequence: &str) -> Vec<usize> {
33 let encoded_base = sequence
34 .as_bytes()
35 .iter()
36 .map(|base| match *base {
37 65 => 0,
38 84 => 3,
39 67 => 1,
40 71 => 2,
41 _ => panic!("NON STANDRD BASE found in {}", sequence),
42 })
43 .collect();
44 return encoded_base;
45}
46
47fn calc_dangling_ends_stabilty(
48 seq1: &[usize],
49 seq2: &[usize],
50 mapping: &Vec<(usize, usize)>,
51) -> f64 {
52 let mut dg_score = 0.;
53
54 let (seq2_i, seq1_i) = mapping[mapping.len() - 1];
56
57 match SEQ2_OVERHANG_ARRAY[seq1[seq1_i]][seq2[seq2_i]][seq2[seq2_i + 1]] {
58 Some(score) => dg_score += score,
59 None => dg_score += BONUS_ARRAY[2],
60 }
61
62 let (seq2_i, seq1_i) = mapping[0];
64
65 if seq1_i > 0 {
66 match SEQ1_OVERHANG_ARRAY[seq1[seq1_i]][seq2[seq2_i]][seq1[seq1_i - 1]] {
67 Some(score) => dg_score += score,
68 None => dg_score += BONUS_ARRAY[1],
69 }
70 } else if seq2_i > 0 {
71 match SEQ2_OVERHANG_ARRAY[seq1[seq1_i]][seq2[seq2_i]][seq2[seq2_i - 1]] {
72 Some(score) => dg_score += score,
73 None => dg_score += BONUS_ARRAY[1],
74 }
75 }
76
77 return dg_score;
78}
79
80fn calc_nn_thermo(seq1: &[usize], seq2: &[usize], mapping: &Vec<(usize, usize)>) -> f64 {
81 let mut dg_score: f64 = 0.;
82 for (seq2_i, seq1_i) in mapping.iter() {
83 match NN_SCORES[seq1[*seq1_i]][seq1[*seq1_i + 1]][seq2[*seq2_i]][seq2[*seq2_i + 1]] {
84 Some(score) => dg_score += score, None => dg_score += BONUS_ARRAY[0], }
87 }
88 return dg_score;
89}
90
91fn calc_extention(seq1: &[usize], match_bool: &Vec<bool>) -> Option<f64> {
92 if !match_bool[match_bool.len() - 2..].iter().any(|f| *f) {
94 return None;
95 }
96
97 let mut score: f64 = 0.;
98
99 let kmer_3p_bool: Vec<(usize, &bool)> = match_bool
100 .iter()
101 .rev()
102 .enumerate()
103 .filter(|(index, _bool)| *index < 4)
104 .collect();
105
106 for (index, match_bool) in kmer_3p_bool.iter() {
109 let seq1_index = seq1.len() - 1 - index;
110 if **match_bool {
112 match seq1[seq1_index] {
114 1 | 2 => score += 3. * (1. / (index + 1) as f64), 0 | 3 => score += 2. * (1. / (index + 1) as f64), _ => continue,
117 }
118 }
119 }
120
121 if kmer_3p_bool.iter().all(|(_index, bool)| **bool) {
122 score += 2.;
123 }
124
125 return Some(-score);
126}
127
128fn apply_bonus(match_bool: &Vec<bool>) -> f64 {
129 let mut current_match = 0;
131 let mut longest_match = 0;
132
133 for m in match_bool.iter() {
134 match m {
135 true => current_match += 1,
136 false => current_match = 0,
137 }
138 if current_match > longest_match {
139 longest_match = current_match
140 }
141 }
142 let mut score = 0.;
143
144 score += -((0.8
146 - (match_bool.iter().filter(|b| **b).count() as f64 / match_bool.len() as f64))
147 * BONUS_ARRAY[8]);
148
149 let grouped_match_bool: Vec<(bool, usize)> = match_bool
151 .iter()
152 .group_by(|bool| **bool)
153 .into_iter()
154 .map(|(bool, iter)| (bool, iter.count()))
155 .collect();
156
157 let longest_match = grouped_match_bool
159 .iter()
160 .filter(|(bool, _count)| *bool)
161 .map(|(_bool, count)| count)
162 .max();
163
164 match longest_match {
165 Some(max) => score += -(*max as f64 * BONUS_ARRAY[7]),
166 None => (),
167 }
168
169 for (match_bool, count) in grouped_match_bool.iter() {
171 if !*match_bool && count > &2 {
172 score += -((*count as f64 - 2.) * BONUS_ARRAY[0]) * BONUS_ARRAY[9]
173 }
174 }
175
176 return score;
177}
178
179pub fn calc_at_offset(seq1: &[usize], seq2: &[usize], offset: i32) -> Option<f64> {
180 let mut mapping: Vec<(usize, usize)> = Vec::new();
182 for x in 0..seq1.len() {
183 let seq2_index = x as i32 + offset;
184 if seq2_index >= 0 {
185 mapping.push((seq2_index as usize, x))
186 }
187 }
188
189 let match_bool = mapping
191 .iter()
192 .map(|(seq2i, seq1i)| MATCH_ARRAY[seq1[*seq1i] as usize][seq2[*seq2i] as usize])
193 .collect();
194
195 let mut dg_score = calc_dangling_ends_stabilty(&seq1, &seq2, &mapping);
196
197 match calc_extention(seq1, &match_bool) {
198 Some(score) => dg_score += score,
199 None => return None,
200 };
201
202 dg_score += apply_bonus(&match_bool);
204
205 mapping.pop();
207 dg_score += calc_nn_thermo(seq1, seq2, &mapping);
208
209 return Some(dg_score);
210}
211
212fn does_seq1_extend(seq1: &[usize], seq2: &[usize], t: f64) -> bool {
213 let mut seq2_rev = seq2.to_owned();
214 seq2_rev.reverse();
215
216 for offset in -(seq1.len() as i32 - 2)..(seq2.len() as i32) - (seq1.len() as i32) {
217 match calc_at_offset(&seq1, &seq2_rev, offset) {
218 Some(score) => {
219 if score <= t {
221 return true;
222 }
223 }
224 None => (),
225 }
226 }
227 return false;
228}
229
230pub fn do_seqs_interact(seq1: &str, seq2: &str, t: f64) -> bool {
231 let s1 = encode_base(seq1);
232 let s2 = encode_base(seq2);
233
234 return does_seq1_extend(&s1, &s2, t) | does_seq1_extend(&s2, &s1, t);
235}
236
237pub fn do_pools_interact(pool1: Vec<&str>, pool2: Vec<&str>, t: f64) -> bool {
238 let pool1_encoded: Vec<Vec<usize>> = pool1.iter().map(|s| encode_base(s)).collect();
240 let pool2_encoded: Vec<Vec<usize>> = pool2.iter().map(|s| encode_base(s)).collect();
241
242 for (s1, s2) in pool1_encoded.iter().cartesian_product(pool2_encoded.iter()) {
244 if does_seq1_extend(&s1, &s2, t) | does_seq1_extend(&s2, &s1, t) {
245 return true;
246 }
247 }
248 return false;
249}
250
251pub fn do_pools_interact_adv(pool1: &Vec<String>, pool2: &Vec<String>, t: f64) -> bool {
252 let pool1_encoded: Vec<Vec<usize>> = pool1.iter().map(|s| encode_base(s)).collect();
254 let pool2_encoded: Vec<Vec<usize>> = pool2.iter().map(|s| encode_base(s)).collect();
255
256 for (s1, s2) in pool1_encoded.iter().cartesian_product(pool2_encoded.iter()) {
258 if does_seq1_extend(&s1, &s2, t) | does_seq1_extend(&s2, &s1, t) {
259 return true;
260 }
261 }
262 return false;
263}
264
265#[cfg(test)]
266mod tests {
267 use crate::{calc_nn_thermo, encode_base, NN_SCORES};
268 #[test]
269 fn test_valid_encode_base() {
270 let seq = "ATCG";
271
272 assert_eq!(encode_base(seq), vec![0, 3, 1, 2])
274 }
275 #[test]
276 #[should_panic]
277 fn test_invalid_encode_base() {
278 encode_base("z");
279 }
280 #[test]
281 fn test_all_match() {
282 let seq1 = "ACGAT";
284 let seq2 = "TGCTA";
285 let offset = 0;
286
287 let a = encode_base("A")[0];
288 let t = encode_base("T")[0];
289 let c = encode_base("C")[0];
290 let g = encode_base("G")[0];
291
292 let mut mapping: Vec<(usize, usize)> = Vec::new();
293 for x in 0..seq1.len() {
294 let seq2_index = x as i32 + offset;
295 if seq2_index >= 0 {
296 mapping.push((seq2_index as usize, x))
297 }
298 }
299 mapping.pop();
301 let mut pred_score: f64 = 0.;
302
303 pred_score += NN_SCORES[a][c][t][g].unwrap_or(0.);
305 pred_score += NN_SCORES[c][g][g][c].unwrap_or(0.);
307 pred_score += NN_SCORES[g][a][c][t].unwrap_or(0.);
309 pred_score += NN_SCORES[a][t][t][a].unwrap_or(0.);
311
312 assert_eq!(
314 calc_nn_thermo(&encode_base(seq1), &encode_base(seq2), &mapping),
315 pred_score
316 )
317 }
318 #[test]
319 fn test_mismatch_with_offset() {
320 let seq1 = "ACCTC";
325 let seq2 = "ACTGGTGCTAC";
326 let offset = 2;
327
328 let a = encode_base("A")[0];
329 let t = encode_base("T")[0];
330 let c = encode_base("C")[0];
331 let g = encode_base("G")[0];
332
333 let mut mapping: Vec<(usize, usize)> = Vec::new();
334 for x in 0..seq1.len() {
335 let seq2_index = x as i32 + offset;
336 if seq2_index >= 0 {
337 mapping.push((seq2_index as usize, x))
338 }
339 }
340 mapping.pop();
342
343 let mut pred_score = 0.;
344
345 pred_score += NN_SCORES[a][c][t][g].unwrap_or(0.);
347 pred_score += NN_SCORES[c][c][g][g].unwrap_or(0.);
349 pred_score += NN_SCORES[c][t][g][t].unwrap_or(0.);
351 pred_score += NN_SCORES[t][c][t][g].unwrap_or(0.);
353
354 assert_eq!(
356 calc_nn_thermo(&encode_base(seq1), &encode_base(seq2), &mapping),
357 pred_score
358 )
359 }
360 #[test]
361 fn test_match_array() {
362 let a = encode_base("A")[0];
363 let t = encode_base("T")[0];
364 let c = encode_base("C")[0];
365 let g = encode_base("G")[0];
366
367 assert!(super::MATCH_ARRAY[a][t]);
370 assert!(super::MATCH_ARRAY[t][a]);
372 assert!(super::MATCH_ARRAY[c][g]);
374 assert!(super::MATCH_ARRAY[g][c]);
376 assert_eq!(super::MATCH_ARRAY[a][a], false);
379 assert_eq!(super::MATCH_ARRAY[a][c], false);
381 assert_eq!(super::MATCH_ARRAY[a][g], false);
383
384 assert_eq!(super::MATCH_ARRAY[t][t], false);
386 assert_eq!(super::MATCH_ARRAY[t][c], false);
388 assert_eq!(super::MATCH_ARRAY[t][g], false);
390
391 assert_eq!(super::MATCH_ARRAY[c][c], false);
393 assert_eq!(super::MATCH_ARRAY[c][a], false);
395 assert_eq!(super::MATCH_ARRAY[c][t], false);
397
398 assert_eq!(super::MATCH_ARRAY[g][g], false);
400 assert_eq!(super::MATCH_ARRAY[g][a], false);
402 assert_eq!(super::MATCH_ARRAY[g][t], false);
404 }
405 #[test]
406 fn test_ensure_consistant_result() {
407 let s1 = "ACACCTGTGCCTGTTAAACCAT"; let s2 = "CAATTTGGTAATTGAACACCCATAAAGGT"; let offset = -12;
416
417 assert_eq!(
418 super::calc_at_offset(&encode_base(s1), &encode_base(s2), offset),
419 Some(-40.736826004)
420 );
421 }
422 #[test]
423 fn test_ensure_detection() {
424 let s1 = "ACACCTGTGCCTGTTAAACCAT"; let s2 = "TGGAAATACCCACAAGTTAATGGTTTAAC"; let threshold = -27.0;
433
434 assert!(super::does_seq1_extend(
435 &encode_base(s1),
436 &encode_base(s2),
437 threshold,
438 ));
439 }
440}