1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
//! This module handles assessing various primer metrics, such as GC concentration, and repeats.
use bincode::{Decode, Encode};
use lin_alg::map_linear;
use na_seq::{
Nucleotide,
Nucleotide::{C, G},
calc_gc,
};
use crate::{
TM_TARGET, melting_temp_calcs,
primer::{IonConcentrations, MIN_PRIMER_LEN, Primer, TuneSetting},
util::remove_duplicates,
};
/// Metrics related to primer quality.
#[derive(Clone, Debug, Default, Encode, Decode)]
pub struct PrimerMetrics {
/// C
pub melting_temp: f32,
/// 0. to 1.
pub gc_portion: f32,
/// How many G and C nts are in the last 5 (3' end) nts of the sequence.
pub gc_3p_count: u8,
pub self_end_dimer: u8,
pub seq_len: usize,
pub repeats: u8,
pub tm_score: f32,
pub gc_score: f32,
pub gc_3p_score: f32,
pub len_score: f32,
pub dimer_score: f32,
/// https://www.benchling.com/primer-design-for-pcr
/// "Avoid runs of four or more of a single base (e.g., ACCCCC), or four or more dinucleotide
/// repeats (e.g., ATATATATAT) as they will cause mispriming."
pub repeats_score: f32,
/// This is a weighted overall score.
pub quality_score: f32,
}
impl PrimerMetrics {
/// Return a quality score, on a scale from 0 to 1.
/// `dual_end` indicates if this is a double-end-tunable primer, which generally means a cloning
/// insert primer. This affects the len-based score.
pub fn update_scores(&mut self, dual_end: bool) {
const GC_TARGET: f32 = 0.5;
// todo: Do these weights have to add up to 1/total?
const WEIGHT_TM: f32 = 1.;
const WEIGHT_GC: f32 = 1.;
const WEIGHT_STAB: f32 = 1.;
// const WEIGHT_COMPLEXITY: f32 = 1.;
const WEIGHT_DIMER: f32 = 1.;
const WEIGHT_LEN: f32 = 1.5;
const WEIGHT_REPEATS: f32 = 0.5;
// todo: Instead of closeness to 59, should it be >54??
// Also: 50-60C. And within 5C of the complement primer.
self.tm_score = map_linear((self.melting_temp - TM_TARGET).abs(), (0., 18.), (1., 0.));
self.tm_score = self.tm_score.clamp(0., 1.);
// This is currently a linear map, between 0 and 1.
self.gc_score = 1. - (self.gc_portion - GC_TARGET).abs() * 2.;
// todo: This is not sophisticated enough, but is a start; we need to assess the length on both
// todo sides of the anchor individually.
self.len_score = if dual_end {
let max_falloff_dist = 16.;
let ideal = 42.;
match self.seq_len {
36..=48 => 1.,
_ => {
// More gentle penalty for long primers.
let max_falloff = if self.seq_len > ideal as usize {
ideal + max_falloff_dist
} else {
ideal - max_falloff_dist
};
map_linear(
(ideal - self.seq_len as f32).abs(),
(0., max_falloff),
(1., 0.),
)
}
}
} else {
let max_falloff_dist = 8.;
let ideal = 21.;
// todo: DRy with above.
match self.seq_len {
18..=24 => 1.,
_ => {
// More gentle penalty for long primers.
let max_falloff = if self.seq_len > ideal as usize {
ideal + max_falloff_dist
} else {
ideal - max_falloff_dist
};
map_linear(
(ideal - self.seq_len as f32).abs(),
(0., max_falloff),
(1., 0.),
)
}
}
};
// Sources differ on if 4 is an ok value. AmplifX calls it "good"; [the DNA universe](https://the-dna-universe.com/2022/09/05/primer-design-guide-the-top-5-factors-to-consider-for-optimum-performance/)
// considers it to be bad.
self.gc_3p_score = match self.gc_3p_count {
0 | 1 => 0.,
2 => 1.,
3 => 1.,
4 => 0.5,
5 => 0.,
_ => unreachable!(),
};
let repeats = if dual_end {
self.repeats / 2 // todo: odd-number rounding
} else {
self.repeats
};
self.repeats_score = match repeats {
0 => 1.,
1 => 0.8,
2 => 0.6,
3 => 0.3,
4 => 0.2,
_ => 0.,
};
self.quality_score = (WEIGHT_TM * self.tm_score
+ WEIGHT_GC * self.gc_score
+ WEIGHT_STAB * self.gc_3p_score
// + WEIGHT_COMPLEXITY * self.complexity_score
+ WEIGHT_DIMER * self.dimer_score
+ WEIGHT_LEN * self.len_score
+ WEIGHT_REPEATS * self.repeats_score)
/ 6.
}
}
impl Primer {
/// Calculate melting temperature (TM), in C.
///
/// [Calc data from AmplifX](https://inp.univ-amu.fr/en/amplifx-manage-test-and-design-your-primers-for-pcr):
/// "TM = 81.5 +16.6 X log10([Na+]+[K+])+0.41 x(%GC) - 675/N (with default values: [ Na+]+[K+]=0.05 (50mM))
///
/// "the most precise... "bases stacking method” : TM=dH/(dS+0.368xNxln[Na+]+ R
/// x ln[Primer]/4) with R=1.987 and the different dH and dS taken in [Santalucia J PNAS 95 pp1460-1465 (1998)]"
///
/// https://primerexplorer.jp/e/v4_manual/03.html (Different formula)
///
/// https://www.rosalind.bio/en/knowledge/what-formula-is-used-to-calculate-tm
/// Come up with something better A/R
/// "ASSUMPTIONS:
/// Equations above assume that the annealing occurs under the standard conditions of 50 nM
/// primer, 50 mM Na+, and pH 7.0."
///
/// We use the "bases stacking method" as defined hte Amplifx guide above.
///
/// See [The BioPython MeltingTemp module](https://github.com/biopython/biopython/blob/master/Bio/SeqUtils/MeltingTemp.py)
pub fn calc_tm(&self, ion_concentrations: &IonConcentrations) -> f32 {
// const NAP_K_P: f32 = 0.05;
//
// // TM = 81.5 +16.6 X log10([Na+]+[K+])+0.41 x(%GC) - 675/N
// // 81.5 + 16.6 * NAP_K_P.log10() + 0.41 * self.calc_gc() * 100.
// // - 675. / (self.sequence.len() as f32)
//
// const dH: f32 = 1. // todo
// const dS: f32 = 1. // todo
// const R: f32 = 1.987; // Universal gas constant (Cal/C * Mol)
// let N = self.sequence.len();
//
// dH / (dS + 0.368 * N * NAP.ln() + R * primer.ln() / 4.)
melting_temp_calcs::calc_tm(&self.sequence, ion_concentrations).unwrap_or(0.)
}
/// This is a metric known as 3' end stability. Return the number of Gs and Cs in the last 5 bases.
/// 2-4 is ideal. (Other sources, 2-3, with 4 being too many). 1 or 5 is considered suboptimal.
pub fn count_3p_g_c(&self) -> u8 {
let mut result = 0;
let len = self.sequence.len();
let last_5 = if len > 5 {
self.sequence.split_at(len - 5).1
} else {
&self.sequence[..]
};
for nt in last_5 {
if *nt == C || *nt == G {
result += 1
}
}
result
}
/// Calculate self end dimer score. (Details on what this means here)
///
/// http://www.premierbiosoft.com/tech_notes/PCR_Primer_Design.html
/// "A primer self-dimer is formed by intermolecular interactions between the two (same sense)
/// primers, where the primer is homologous to itself. Generally a large amount of primers are
/// used in PCR compared to the amount of target gene. When primers form intermolecular dimers
/// much more readily than hybridizing to target DNA, they reduce the product yield. Optimally a 3'
/// end self dimer with a ΔG of -5 kcal/mol and an internal self dimer with a ΔG of -6 kcal/mol is
/// tolerated generally."
pub fn calc_self_end_dimer(&self) -> u8 {
0
}
/// Calculate how many single or double nucleotide sequences exist that are of len 4 or more of the
/// same nt or nt pair respectively. It currently doesn't differentiate between 4, and more.
/// Also includes repeats of a set of 3 nucleotides anywhere in the seq.
pub fn calc_repeats(&self) -> u8 {
if self.sequence.len() < 4 {
return 0;
}
let mut result = 0;
result += single_nt_repeats(&self.sequence) as u8;
result += double_nt_repeats(&self.sequence) as u8;
result += triplet_repeats(&self.sequence) as u8;
result
}
/// Calculate all primer metrics.
/// todo: methods on Metrics instead?
pub fn calc_metrics(&self, ion_concentrations: &IonConcentrations) -> Option<PrimerMetrics> {
if self.sequence.len() < MIN_PRIMER_LEN {
return None;
}
let mut result = PrimerMetrics {
melting_temp: self.calc_tm(ion_concentrations),
gc_portion: calc_gc(&self.sequence),
seq_len: self.sequence.len(),
gc_3p_count: self.count_3p_g_c(),
// complexity: self.calc_complexity(),
self_end_dimer: self.calc_self_end_dimer(),
repeats: self.calc_repeats(),
..Default::default()
};
let dual_ended = matches!(self.volatile.tune_setting, TuneSetting::Both(_));
result.update_scores(dual_ended);
Some(result)
}
}
/// Count the number of single-nucleotide repeats in a sequence. Counts when it's > 4.
fn single_nt_repeats(seq: &[Nucleotide]) -> u16 {
let mut result = 0;
let mut prev_nt = seq[0];
let mut repeat_len = 1; // Counts the char.
for nt in seq {
if *nt == prev_nt {
repeat_len += 1;
if repeat_len >= 4 {
result += 1;
repeat_len = 1;
}
} else {
repeat_len = 1;
}
prev_nt = *nt;
}
result
}
/// Count the number of double-nucleotide repeats in a sequence. Counts when it's > 4. eg `atatatat`
fn double_nt_repeats(seq: &[Nucleotide]) -> u16 {
let mut result = 0;
let mut prev_nt = (seq[0], seq[1]);
let mut repeat_len = 1; // Counts the char.
for i in 0..seq.len() / 2 - 1 {
// todo: Incomplete: Need to do the same offset by one.
let nts = (seq[i * 2], seq[(i * 2) + 1]);
if nts == prev_nt {
repeat_len += 1;
if repeat_len >= 4 {
result += 1;
repeat_len = 1;
}
} else {
repeat_len = 1;
}
prev_nt = nts;
}
result
}
/// Count the number of times a set of three nucleotides is repeated in a sequence.
/// This does not have to be adjacent.
fn triplet_repeats(seq: &[Nucleotide]) -> u16 {
let mut triplets = Vec::new();
for (i, nt) in seq.iter().enumerate() {
if i == seq.len() - 2 {
break;
}
triplets.push((i, nt, seq[i + 1], seq[i + 2]));
}
let mut triplet_repeat_seqs = Vec::new();
for (i, nt) in seq.iter().enumerate() {
if i == seq.len() - 2 {
break;
}
let triplet_this = (nt, seq[i + 1], seq[i + 2]);
for triplet_other in &triplets {
if triplet_this == (triplet_other.1, triplet_other.2, triplet_other.3)
&& i != triplet_other.0
{
// Dount count each additional nt match beyond 3 as a new repeat
if i >= 3 && triplet_other.0 >= 3 && seq[i - 1] == seq[triplet_other.0 - 1] {
continue;
}
triplet_repeat_seqs.push(triplet_this);
}
}
}
triplet_repeat_seqs = remove_duplicates(triplet_repeat_seqs);
triplet_repeat_seqs.len() as u16
}