dna/
lib.rs

1// Copyright (c) 2020 10X Genomics, Inc. All rights reserved.
2
3// This file provides a function tm_nearest_neighbor.  All the code in this file is a verbatim
4// translation to rust of C++ code in the BroadCRD codebase (copyright 2006), as
5// dna/DNAHybridization.{cc,h}.  It is conceivable that mistakes were introduced in translation.
6// That code in turn was a verbatim translation of calculations in published work as cited below.
7
8// tm_nearest_neighbor: compute melting temperature of a given DNA sequence s
9// at molarity s_mol (default 0.25 microM), where Na+ concentration is na_mol
10// (default 50 mM), using a nearest-neighbor model, as used by
11//
12// http://www.idtdna.com/ANALYZER/Applications/OligoAnalyzer
13//
14// and described in
15//
16// [1] Allawi, H. T. and SantaLucia J. Jr.  Thermodynamics and NMR of internal
17// G.T mismatches in DNA.  Biochemistry 36 (1997), 10581-10594.
18//
19// [2] Owczarzy, et al.  Effects of sodium ions on DNA duplex oligomers: improved
20// predictions of melting temperatures.  Biochemistry 43 (2004), 3537-3554.
21//
22// [3] Patricia M. McTigue, Raymond J. Peterson, Jason D. Kuhn.  Sequence-dependent
23// thermodynamic parameters for locked nucleic acid (LNA)-DNA duplex formation.
24// Biochemistry 43 (2004), 5388-5405.
25// (See also www.celadonlabs.com/Software/ModChem/default.aspx.)
26//
27// This method should be quite good for short sequences.
28//
29// The results are close to (but not identical to) the results given by the IDT
30// OligoAnalyzer.  I don't know what the discrepancy is due to.
31//
32// We allow some nucleotides to be locked, as specified by the vector "locked".
33// The following assumptions are enforced:
34// (a) no two locked bases in a row;
35// (b) no locked base at the beginning or end, or adjacent to those positions.
36// These are consistent with the assumptions in [3].
37//
38// Alternatively, if S contains + symbols, they will be interpreted as instructions
39// to "lock the following nucleotide", and the vector "locked" will be generated
40// accordingly.
41
42pub fn tm_nearest_neighbor(s: &str) -> f64 {
43    let locked = Vec::<bool>::new();
44    tm_nearest_neighbor_full(s, 0.00000025, 0.05, &locked)
45}
46
47pub fn tm_nearest_neighbor_full(s: &str, s_mol: f64, na_mol: f64, locked: &[bool]) -> f64 {
48    // Allow for + symbols.
49
50    if s.contains('+') {
51        assert_eq!(locked.len(), 0);
52        assert!(!s.contains("++"));
53        assert!(!s.ends_with('+'));
54        let mut sx = String::new();
55        let schars: Vec<char> = s.chars().collect();
56        let mut lockedx = Vec::<bool>::with_capacity(schars.len());
57        let mut i = 0;
58        while i < schars.len() {
59            if schars[i] != '+' {
60                sx.push(schars[i]);
61                lockedx.push(false);
62            } else {
63                sx.push(schars[i + 1]);
64                lockedx.push(true);
65                i += 1;
66            }
67        }
68        return tm_nearest_neighbor_full(&sx, s_mol, na_mol, &lockedx);
69    }
70
71    // Validate sequence.
72
73    verify_dna(s);
74
75    // Compute thermodynamic sums.
76
77    let mut dh_sum = 0.0;
78    let mut ds_sum = 0.0;
79    let mut dg_sum = 0.0;
80    thermodynamic_sums_dna(s, &mut dh_sum, &mut ds_sum, &mut dg_sum, true, true, locked);
81
82    // Compute melting temperature based on nearest-neighbor model.
83
84    let ideal_gas_const = 1.987; // calories per Kelvin per mole
85    let kelvin_to_celsius = 273.15;
86    let mut temp = 1000.0 * dh_sum / (ds_sum + ideal_gas_const * s_mol.ln()) - kelvin_to_celsius;
87
88    // Correct for Na concentration, following [2].
89
90    let mut sx = Vec::<char>::new();
91    for c in s.chars() {
92        sx.push(c);
93    }
94    let mut gc = 0;
95    for i in 0..sx.len() {
96        if sx[i] == 'G' || sx[i] == 'C' {
97            gc += 1;
98        }
99    }
100    let gc_fract = gc as f64 / sx.len() as f64;
101    let ln_na_mol = na_mol.ln();
102
103    temp = -kelvin_to_celsius
104        + 1.0
105            / (1.0 / (temp + kelvin_to_celsius)
106                + (4.29 * gc_fract - 3.95) * 0.00001 * ln_na_mol
107                + 9.40 * 0.000001 * ln_na_mol * ln_na_mol);
108    temp
109}
110
111pub fn verify_dna(s: &str) {
112    for c in s.chars() {
113        assert!(c == 'A' || c == 'C' || c == 'G' || c == 'T');
114    }
115}
116
117// get_thermodynamic_parameters_dna.
118//
119// Return nearest neighbor thermodynamic parameters, from table 1 in reference [1],
120// Allawi and SantaLucia (1997).
121//
122// dH = enthalpy (kcal/mole) = dH^\circ
123// dS = entropy = dS^\circ
124// dG = Gibb's free energy at 37 degrees = dG^\circ_37
125// dH, dS, dG are all {A,C,G,T} x {A,C,G,T} matrices.
126// (d means delta)
127//
128// Although there are dG entries in the table, I've computed them here directly
129// from dH and dS.  I checked a few entries and they agree with what's in the table.
130// dG = dH - T dS (theoretical)
131// dG = dH - (37 + 273.15) dS / 1000 (used).
132// I'm not sure quite why the factor of 1000 is needed.
133
134pub fn get_thermodynamic_parameters_dna(
135    dh: &mut Vec<Vec<f64>>,
136    ds: &mut Vec<Vec<f64>>,
137    dg: &mut Vec<Vec<f64>>,
138    dh_g_or_c_init: &mut f64,
139    dh_a_or_t_init: &mut f64,
140    ds_g_or_c_init: &mut f64,
141    ds_a_or_t_init: &mut f64,
142    dg_g_or_c_init: &mut f64,
143    dg_a_or_t_init: &mut f64,
144    dh_symmetry_correction: &mut f64,
145    ds_symmetry_correction: &mut f64,
146    dg_symmetry_correction: &mut f64,
147) {
148    let a = 0;
149    let c = 1;
150    let g = 2;
151    let t = 3;
152
153    *dh = vec![vec![0.0; 4]; 4];
154    *ds = vec![vec![0.0; 4]; 4];
155    *dg = vec![vec![0.0; 4]; 4];
156
157    dh[a][a] = -7.9;
158    dh[t][t] = -7.9;
159    dh[a][t] = -7.2;
160    dh[t][a] = -7.2;
161    dh[c][a] = -8.5;
162    dh[t][g] = -8.5;
163    dh[g][t] = -8.4;
164    dh[a][c] = -8.4;
165    dh[c][t] = -7.8;
166    dh[a][g] = -7.8;
167    dh[g][a] = -8.2;
168    dh[t][c] = -8.2;
169    dh[c][g] = -10.6;
170    dh[g][c] = -9.8;
171    dh[g][g] = -8.0;
172    dh[c][c] = -8.0;
173    ds[a][a] = -22.2;
174    ds[t][t] = -22.2;
175    ds[a][t] = -20.4;
176    ds[t][a] = -21.3;
177    ds[c][a] = -22.7;
178    ds[t][g] = -22.7;
179    ds[g][t] = -22.4;
180    ds[a][c] = -22.4;
181    ds[c][t] = -21.0;
182    ds[a][g] = -21.0;
183    ds[g][a] = -22.2;
184    ds[t][c] = -22.2;
185    ds[c][g] = -27.2;
186    ds[g][c] = -24.4;
187    ds[g][g] = -19.9;
188    ds[c][c] = -19.9;
189
190    /*
191    // here are the parameters from santaLucia and hicks (2004).  they are almost the same.
192    dh[a][a] = dh[t][t] = -7.6;
193    dh[a][t] = -7.2;
194    dh[t][a] = -7.2;
195    dh[c][a] = dh[t][g] = -8.5;
196    dh[g][t] = dh[a][c] = -8.4;
197    dh[c][t] = dh[a][g] = -7.8;
198    dh[g][a] = dh[t][c] = -8.2;
199    dh[c][g] = -10.6;
200    dh[g][c] = -9.8;
201    dh[g][g] = dh[c][c] = -8.0;
202    ds[a][a] = ds[t][t] = -21.3;
203    ds[a][t] = -20.4;
204    ds[t][a] = -21.3;
205    ds[c][a] = ds[t][g] = -22.7;
206    ds[g][t] = ds[a][c] = -22.4;
207    ds[c][t] = ds[a][g] = -21.0;
208    ds[g][a] = ds[t][c] = -22.2;
209    ds[c][g] = -27.2;
210    ds[g][c] = -24.4;
211    ds[g][g] = ds[c][c] = -19.9;
212    */
213
214    let temp = 37.0 + 273.15;
215    for i in 0..4 {
216        for j in 0..4 {
217            dg[i][j] = dh[i][j] - temp * ds[i][j] / 1000.0;
218        }
219    }
220    *dh_g_or_c_init = 0.1;
221    *dh_a_or_t_init = 2.3;
222    *ds_g_or_c_init = -2.8;
223    *ds_a_or_t_init = 4.1;
224    *dg_g_or_c_init = *dh_g_or_c_init - temp * *ds_g_or_c_init / 1000.0;
225    *dg_a_or_t_init = *dh_a_or_t_init - temp * *ds_a_or_t_init / 1000.0;
226    *dh_symmetry_correction = 0.0;
227    *ds_symmetry_correction = -1.4;
228    *dg_symmetry_correction = *dh_symmetry_correction - temp * *ds_symmetry_correction / 1000.0;
229}
230
231// get_locked_thermodynamic_parameters_dna.  Return the 32 nearest-neighbor
232// thermodynamic parameter set for LNA incorporation, from the left half of Table 4
233// in reference [3].
234
235pub fn get_locked_thermodynamic_parameters_dna(
236    ddh_left: &mut Vec<Vec<f64>>,
237    dds_left: &mut Vec<Vec<f64>>,
238    ddg_left: &mut Vec<Vec<f64>>,
239    ddh_right: &mut Vec<Vec<f64>>,
240    dds_right: &mut Vec<Vec<f64>>,
241    ddg_right: &mut Vec<Vec<f64>>,
242) {
243    let a = 0;
244    let c = 1;
245    let g = 2;
246    let t = 3;
247    *ddh_left = vec![vec![0.0; 4]; 4];
248    *dds_left = vec![vec![0.0; 4]; 4];
249    *ddg_left = vec![vec![0.0; 4]; 4];
250    *ddh_right = vec![vec![0.0; 4]; 4];
251    *dds_right = vec![vec![0.0; 4]; 4];
252    *ddg_right = vec![vec![0.0; 4]; 4];
253    ddh_left[a][a] = 0.707;
254    ddh_left[a][c] = 1.131;
255    ddh_left[a][g] = 0.264;
256    ddh_left[a][t] = 2.282;
257    ddh_left[c][a] = 1.049;
258    ddh_left[c][c] = 2.096;
259    ddh_left[c][g] = 0.785;
260    ddh_left[c][t] = 0.708;
261    ddh_left[g][a] = 3.162;
262    ddh_left[g][c] = -0.360;
263    ddh_left[g][g] = -2.844;
264    ddh_left[g][t] = -0.212;
265    ddh_left[t][a] = -0.046;
266    ddh_left[t][c] = 1.893;
267    ddh_left[t][g] = -1.540;
268    ddh_left[t][t] = 1.528;
269    dds_left[a][a] = 2.477;
270    dds_left[a][c] = 4.064;
271    dds_left[a][g] = 2.613;
272    dds_left[a][t] = 7.457;
273    dds_left[c][a] = 4.320;
274    dds_left[c][c] = 7.996;
275    dds_left[c][g] = 3.709;
276    dds_left[c][t] = 4.175;
277    dds_left[g][a] = 10.544;
278    dds_left[g][c] = -0.251;
279    dds_left[g][g] = -6.680;
280    dds_left[g][t] = 0.073;
281    dds_left[t][a] = 1.562;
282    dds_left[t][c] = 6.685;
283    dds_left[t][g] = -3.044;
284    dds_left[t][t] = 5.298;
285    ddg_left[a][a] = -0.092;
286    ddg_left[a][c] = -0.122;
287    ddg_left[a][g] = -0.561;
288    ddg_left[a][t] = -0.007;
289    ddg_left[c][a] = -0.270;
290    ddg_left[c][c] = -0.457;
291    ddg_left[c][g] = -0.332;
292    ddg_left[c][t] = -0.666;
293    ddg_left[g][a] = -0.072;
294    ddg_left[g][c] = -0.414;
295    ddg_left[g][g] = -0.700;
296    ddg_left[g][t] = -0.194;
297    ddg_left[t][a] = -0.563;
298    ddg_left[t][c] = -0.208;
299    ddg_left[t][g] = -0.548;
300    ddg_left[t][t] = -0.130;
301    ddh_right[a][a] = 0.992;
302    ddh_right[a][c] = 2.890;
303    ddh_right[a][g] = -1.200;
304    ddh_right[a][t] = 1.816;
305    ddh_right[c][a] = 1.358;
306    ddh_right[c][c] = 2.063;
307    ddh_right[c][g] = -0.276;
308    ddh_right[c][t] = -1.671;
309    ddh_right[g][a] = 0.444;
310    ddh_right[g][c] = -0.925;
311    ddh_right[g][g] = -0.943;
312    ddh_right[g][t] = -0.635;
313    ddh_right[t][a] = 1.591;
314    ddh_right[t][c] = 0.609;
315    ddh_right[t][g] = 2.165;
316    ddh_right[t][t] = 2.326;
317    dds_right[a][a] = 4.065;
318    dds_right[a][c] = 10.576;
319    dds_right[a][g] = -1.826;
320    dds_right[a][t] = 6.863;
321    dds_right[c][a] = 4.367;
322    dds_right[c][c] = 7.565;
323    dds_right[c][g] = -0.718;
324    dds_right[c][t] = -4.070;
325    dds_right[g][a] = 2.898;
326    dds_right[g][c] = -1.111;
327    dds_right[g][g] = -0.933;
328    dds_right[g][t] = -0.342;
329    dds_right[t][a] = 5.281;
330    dds_right[t][c] = 3.169;
331    dds_right[t][g] = 7.163;
332    dds_right[t][t] = 8.051;
333    ddg_right[a][a] = -0.396;
334    ddg_right[a][c] = -0.390;
335    ddg_right[a][g] = -0.603;
336    ddg_right[a][t] = -0.309;
337    ddg_right[c][a] = 0.046;
338    ddg_right[c][c] = -0.404;
339    ddg_right[c][g] = -0.003;
340    ddg_right[c][t] = -0.409;
341    ddg_right[g][a] = -0.437;
342    ddg_right[g][c] = -0.535;
343    ddg_right[g][g] = -0.666;
344    ddg_right[g][t] = -0.520;
345    ddg_right[t][a] = 0.004;
346    ddg_right[t][c] = -0.396;
347    ddg_right[t][g] = -0.106;
348    ddg_right[t][t] = -0.212;
349}
350
351// thermodynamic_sums_dna.  Compute dh_sum, ds_sum, dg_sum.  We allow some nucleotides
352// to be locked, as specified by the variable "locked".  The following assumptions
353// are enforced:
354// (a) no two locked bases in a row;
355// (b) no locked base at the beginning or end, or adjacent to those positions.
356// These are consistent with the assumptions in [3].
357
358pub fn thermodynamic_sums_dna(
359    s: &str,
360    dh_sum: &mut f64,
361    ds_sum: &mut f64,
362    dg_sum: &mut f64,
363    include_symmetry_correction: bool,
364    include_initiation_terms: bool,
365    locked: &[bool],
366) {
367    // defaults for last: true, true, empty
368
369    let mut dh = Vec::<Vec<f64>>::new();
370    let mut ds = Vec::<Vec<f64>>::new();
371    let mut dg = Vec::<Vec<f64>>::new();
372
373    let mut dh_g_or_c_init = 0.0;
374    let mut dh_a_or_t_init = 0.0;
375    let mut ds_g_or_c_init = 0.0;
376    let mut ds_a_or_t_init = 0.0;
377    let mut dg_g_or_c_init = 0.0;
378    let mut dg_a_or_t_init = 0.0;
379    let mut dh_symmetry_correction = 0.0;
380    let mut ds_symmetry_correction = 0.0;
381    let mut dg_symmetry_correction = 0.0;
382
383    get_thermodynamic_parameters_dna(
384        &mut dh,
385        &mut ds,
386        &mut dg,
387        &mut dh_g_or_c_init,
388        &mut dh_a_or_t_init,
389        &mut ds_g_or_c_init,
390        &mut ds_a_or_t_init,
391        &mut dg_g_or_c_init,
392        &mut dg_a_or_t_init,
393        &mut dh_symmetry_correction,
394        &mut ds_symmetry_correction,
395        &mut dg_symmetry_correction,
396    );
397    *dh_sum = 0.0;
398    *ds_sum = 0.0;
399    *dg_sum = 0.0;
400    if include_symmetry_correction {
401        *dh_sum += dh_symmetry_correction;
402        *ds_sum += ds_symmetry_correction;
403        *dg_sum += dg_symmetry_correction;
404    }
405    let mut sx = Vec::<char>::new();
406    for c in s.chars() {
407        sx.push(c);
408    }
409    if include_initiation_terms {
410        if sx[0] == 'A' || sx[0] == 'T' {
411            *dh_sum += dh_a_or_t_init;
412            *ds_sum += ds_a_or_t_init;
413            *dg_sum += dg_a_or_t_init;
414        } else {
415            *dh_sum += dh_g_or_c_init;
416            *ds_sum += ds_g_or_c_init;
417            *dg_sum += dg_g_or_c_init;
418        }
419        if sx[sx.len() - 1] == 'A' || sx[sx.len() - 1] == 'T' {
420            *dh_sum += dh_a_or_t_init;
421            *ds_sum += ds_a_or_t_init;
422            *dg_sum += dg_a_or_t_init;
423        } else {
424            *dh_sum += dh_g_or_c_init;
425            *ds_sum += ds_g_or_c_init;
426            *dg_sum += dg_g_or_c_init;
427        }
428    }
429    let mut sx = Vec::<char>::new();
430    for c in s.chars() {
431        sx.push(c);
432    }
433    let mut b = Vec::<usize>::new();
434    for i in 0..sx.len() {
435        if sx[i] == 'A' {
436            b.push(0);
437        } else if sx[i] == 'C' {
438            b.push(1);
439        } else if sx[i] == 'G' {
440            b.push(2);
441        } else {
442            b.push(3);
443        }
444    }
445    for i in 0..sx.len() - 1 {
446        *dh_sum += dh[b[i]][b[i + 1]];
447        *ds_sum += ds[b[i]][b[i + 1]];
448        *dg_sum += dg[b[i]][b[i + 1]];
449    }
450
451    // Handle locked bases.
452
453    let mut have_lock = false;
454    for i in 0..locked.len() {
455        if locked[i] {
456            have_lock = true;
457        }
458    }
459    if have_lock {
460        for i in 1..locked.len() {
461            if locked[i] {
462                assert!(!locked[i - 1]);
463            }
464        }
465        assert!(locked.len() >= 5);
466        assert!(!locked[0] && !locked[1]);
467        assert!(!locked[locked.len() - 1]);
468        assert!(!locked[locked.len() - 2]);
469        let mut ddh_left = Vec::<Vec<f64>>::new();
470        let mut dds_left = Vec::<Vec<f64>>::new();
471        let mut ddg_left = Vec::<Vec<f64>>::new();
472        let mut ddh_right = Vec::<Vec<f64>>::new();
473        let mut dds_right = Vec::<Vec<f64>>::new();
474        let mut ddg_right = Vec::<Vec<f64>>::new();
475        get_locked_thermodynamic_parameters_dna(
476            &mut ddh_left,
477            &mut dds_left,
478            &mut ddg_left,
479            &mut ddh_right,
480            &mut dds_right,
481            &mut ddg_right,
482        );
483        for i in 0..locked.len() {
484            if locked[i] {
485                *dh_sum += ddh_left[b[i]][b[i + 1]] + ddh_right[b[i - 1]][b[i]];
486                *ds_sum += dds_left[b[i]][b[i + 1]] + dds_right[b[i - 1]][b[i]];
487                *dg_sum += ddg_left[b[i]][b[i + 1]] + ddg_right[b[i - 1]][b[i]];
488            }
489        }
490    }
491}