1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
use super::bamlift::*;
use bio::alphabets::dna::revcomp;
use itertools::{izip, multiunzip};
use lazy_static::lazy_static;
use regex::Regex;
use rust_htslib::{
bam,
bam::record::{Aux, AuxArray},
};
use std::convert::TryFrom;
#[derive(Eq, PartialEq, Debug)]
/// One base-modification track of a BAM record: a single `base / strand / type`
/// entry of the MM tag together with its positions and ML values.
pub struct BaseMod {
/// Canonical base the modification is called on, stored as an ASCII byte.
/// `BaseMods::new` takes it from the `[ACGTUN]` group of the MM entry.
pub modified_base: u8,
/// Strand character of the MM entry (`+` or `-` when built by `BaseMods::new`).
pub strand: char,
/// Modification code. `BaseMods::new` stores only the first character of the
/// MM code; `a` is treated as m6A and `m` as CpG by `is_m6a` / `is_cpg`.
pub modification_type: char,
/// Cached `record.is_reverse()` of the record this track was built from.
record_is_reverse: bool,
/// Output of `positions_on_complimented_sequence(record, &modified_bases_forward)`.
/// NOTE(review): presumably the positions in the orientation the sequence is
/// stored in the BAM record — confirm against `bamlift`.
modified_bases: Vec<i64>,
/// Positions exactly as handed to `BaseMod::new`; `BaseMods::new` supplies
/// 0-based indices into the forward-strand (reverse-complemented when the
/// record is reverse) read sequence.
modified_bases_forward: Vec<i64>,
/// ML values for the calls. Stored in reversed order relative to the input of
/// `BaseMod::new` when the record is reverse, unchanged otherwise.
modified_probabilities: Vec<u8>,
/// Output of `get_exact_reference_positions(record, &modified_bases)`.
/// NOTE(review): presumably one reference coordinate per entry of
/// `modified_bases` — confirm against `bamlift`.
reference_positions: Vec<i64>,
}
impl BaseMod {
    /// Build a track from calls given in forward-strand read orientation.
    ///
    /// * `record` - BAM record the calls belong to; used for its strand flag and
    ///   passed on to the `bamlift` position helpers.
    /// * `modified_base`, `strand`, `modification_type` - stored verbatim.
    /// * `modified_bases_forward` - call positions in forward orientation.
    /// * `modified_probabilities_forward` - one ML value per position, in the
    ///   same order as `modified_bases_forward`.
    ///
    /// The probabilities are stored reversed when the record is on the reverse
    /// strand, so `get_modified_probabilities_forward` has to undo that.
    pub fn new(
        record: &bam::Record,
        modified_base: u8,
        strand: char,
        modification_type: char,
        modified_bases_forward: Vec<i64>,
        modified_probabilities_forward: Vec<u8>,
    ) -> Self {
        let record_is_reverse = record.is_reverse();
        let modified_bases = positions_on_complimented_sequence(record, &modified_bases_forward);
        let reference_positions = get_exact_reference_positions(record, &modified_bases);

        // Flip the ML values in place for reverse-strand records.
        let mut modified_probabilities = modified_probabilities_forward;
        if record_is_reverse {
            modified_probabilities.reverse();
        }

        Self {
            modified_base,
            strand,
            modification_type,
            record_is_reverse,
            modified_bases,
            modified_bases_forward,
            modified_probabilities,
            reference_positions,
        }
    }

    /// Copy of the reference positions computed when the track was built.
    pub fn get_reference_positions(&self) -> Vec<i64> {
        self.reference_positions.clone()
    }

    /// Copy of the positions returned by `positions_on_complimented_sequence`.
    pub fn get_modified_bases(&self) -> Vec<i64> {
        self.modified_bases.clone()
    }

    /// Copy of the forward-orientation positions passed to `new`.
    pub fn get_modified_bases_forward(&self) -> Vec<i64> {
        self.modified_bases_forward.clone()
    }

    /// Copy of the ML values in their stored order (reversed for reverse-strand records).
    pub fn get_modified_probabilities(&self) -> Vec<u8> {
        self.modified_probabilities.clone()
    }

    /// Copy of the ML values in the order they were passed to `new`.
    pub fn get_modified_probabilities_forward(&self) -> Vec<u8> {
        let mut probabilities = self.modified_probabilities.clone();
        if self.record_is_reverse {
            // Undo the reversal applied in `new`.
            probabilities.reverse();
        }
        probabilities
    }

    /// `true` when the modification code is `a`.
    pub fn is_m6a(&self) -> bool {
        matches!(self.modification_type, 'a')
    }

    /// `true` when the modification code is `m`.
    pub fn is_cpg(&self) -> bool {
        matches!(self.modification_type, 'm')
    }
}
#[derive(Eq, PartialEq, Debug)]
/// All base-modification tracks of one BAM record.
pub struct BaseMods {
/// One `BaseMod` per MM-tag entry, in the order `BaseMods::new` encountered
/// them in the tag.
pub base_mods: Vec<BaseMod>,
}
impl BaseMods {
/// Parse the `MM` and `ML` aux tags of `record` into one [`BaseMod`] per MM entry.
///
/// For every `base strand code , d1 , d2 ... ;` entry of the MM tag the skip
/// counts are resolved into 0-based positions on the forward-strand read
/// sequence (the stored sequence is reverse-complemented first when the record
/// is reverse). The matching slice of the ML array supplies one probability
/// per call, and calls whose ML value is below `min_ml_score` are dropped.
///
/// * `record` - BAM record to read the tags and sequence from.
/// * `min_ml_score` - minimum ML value (inclusive) a call needs to be kept.
///
/// Returns an empty collection when the record has no string-typed `MM` tag.
/// Only the first character of the strand and of the modification code is kept.
/// The canonical base is compared literally against the read sequence, so an
/// `N` entry only matches `N` bases in the read.
///
/// When the ML array is shorter than the number of MM calls, the missing
/// values are taken to be 0 and a warning is logged.
///
/// # Panics
///
/// Panics if a skip count does not fit into `i64`, or if the read sequence
/// does not contain enough occurrences of the canonical base to place every
/// skip count of an entry.
pub fn new(record: &bam::Record, min_ml_score: u8) -> BaseMods {
    lazy_static! {
        // Capture groups used below: 3 = canonical base, 4 = strand,
        // 5 = modification code, 6 = the `,d1,d2,...;` skip-count list.
        static ref MM_RE: Regex =
            Regex::new(r"((([ACGTUN])([-+])([a-z]+|[0-9]+))[.?]?((,[0-9]+)*;)*)").unwrap();
    }
    let mut rtn = vec![];
    let ml_tag = get_u8_tag(record, b"ML");
    let ml_len = ml_tag.len();
    // Number of ML slots consumed by the MM entries processed so far.
    let mut num_mods_seen: usize = 0;

    if let Ok(Aux::String(mm_text)) = record.aux(b"MM") {
        // MM skip counts refer to the read as sequenced, so reverse-strand
        // records need their stored sequence reverse-complemented. This does
        // not depend on the MM entry, so compute it once for all entries.
        let forward_bases = if record.is_reverse() {
            revcomp(record.seq().as_bytes())
        } else {
            record.seq().as_bytes()
        };

        for cap in MM_RE.captures_iter(mm_text) {
            let mod_base = cap.get(3).map(|m| m.as_str().as_bytes()[0]).unwrap();
            let mod_strand = cap.get(4).map_or("", |m| m.as_str());
            let modification_type = cap.get(5).map_or("", |m| m.as_str());
            let mod_dists_str = cap.get(6).map_or("", |m| m.as_str());
            // Each value is the number of unmodified occurrences of `mod_base`
            // to skip before the next modified one.
            let mod_dists: Vec<i64> = mod_dists_str
                .trim_end_matches(';')
                .split(',')
                .map(|s| s.trim())
                .filter(|s| !s.is_empty())
                .map(|s| s.parse().unwrap())
                .collect();

            // Walk the forward sequence, counting occurrences of `mod_base`
            // since the last call, and record an index whenever that count
            // equals the next skip value.
            let mut cur_mod_idx = 0;
            let mut cur_seq_idx = 0;
            let mut dist_from_last_mod_base = 0;
            let mut unfiltered_modified_positions: Vec<i64> = vec![0; mod_dists.len()];
            while cur_seq_idx < forward_bases.len() && cur_mod_idx < mod_dists.len() {
                let cur_base = forward_bases[cur_seq_idx];
                if cur_base == mod_base && dist_from_last_mod_base == mod_dists[cur_mod_idx] {
                    unfiltered_modified_positions[cur_mod_idx] =
                        i64::try_from(cur_seq_idx).unwrap();
                    dist_from_last_mod_base = 0;
                    cur_mod_idx += 1;
                } else if cur_base == mod_base {
                    dist_from_last_mod_base += 1
                }
                cur_seq_idx += 1;
            }
            // Every skip count must have been placed on the sequence.
            assert_eq!(cur_mod_idx, mod_dists.len());

            // Slice out this entry's share of the ML array.
            let num_mods_cur_end = num_mods_seen + unfiltered_modified_positions.len();
            let unfiltered_modified_probabilities = if num_mods_cur_end > ml_len {
                // ML is too short. Once an earlier entry has already run past
                // the end of ML, `num_mods_seen` exceeds `ml_len`; clamp the
                // start so the slice is empty instead of panicking, then pad
                // with zeros up to the number of calls of this entry.
                let start = std::cmp::min(num_mods_seen, ml_len);
                let mut has = ml_tag[start..].to_vec();
                has.resize(unfiltered_modified_positions.len(), 0);
                log::warn!(
                    "ML tag is too short for the number of modifications found in the MM tag. Assuming an ML value of 0 after the first {ml_len} modifications."
                );
                has
            } else {
                ml_tag[num_mods_seen..num_mods_cur_end].to_vec()
            };
            num_mods_seen = num_mods_cur_end;
            assert_eq!(
                unfiltered_modified_positions.len(),
                unfiltered_modified_probabilities.len()
            );

            // Keep only the calls that reach the requested ML threshold.
            let (modified_probabilities, modified_positions): (Vec<u8>, Vec<i64>) =
                unfiltered_modified_probabilities
                    .into_iter()
                    .zip(unfiltered_modified_positions)
                    .filter(|&(ml, _pos)| ml >= min_ml_score)
                    .unzip();

            let mods = BaseMod::new(
                record,
                mod_base,
                mod_strand.chars().next().unwrap(),
                modification_type.chars().next().unwrap(),
                modified_positions,
                modified_probabilities,
            );
            rtn.push(mods);
        }
    } else {
        log::debug!("No MM tag found");
    }

    if ml_len != num_mods_seen {
        log::warn!(
            "ML tag ({}) different number than MM tag ({}).",
            ml_len,
            num_mods_seen
        );
    }
    BaseMods { base_mods: rtn }
}
pub fn drop_m6a(&mut self) {
self.base_mods.retain(|bm| !bm.is_m6a());
}
/// Positions of the m6A calls of this record.
///
/// * `reference` - when `true` the reference positions of each track are
///   used, otherwise its `get_modified_bases` positions.
///
/// With no m6A track the result is empty; with exactly one track its
/// positions are returned as stored; with two or more, the positions of the
/// first two tracks are combined with `merge_two_lists` (further tracks are
/// not consulted).
pub fn m6a_positions(&self, reference: bool) -> Vec<i64> {
    // Select the coordinate system once instead of branching per case.
    let positions_of = |bm: &BaseMod| {
        if reference {
            bm.get_reference_positions()
        } else {
            bm.get_modified_bases()
        }
    };

    let mut m6a_tracks = self.base_mods.iter().filter(|bm| bm.is_m6a());
    match (m6a_tracks.next(), m6a_tracks.next()) {
        (None, _) => vec![],
        (Some(only), None) => positions_of(only),
        (Some(first), Some(second)) => {
            merge_two_lists(&positions_of(first), &positions_of(second))
        }
    }
}
pub fn m6a_full_probabilities(&self, record: &bam::Record) -> Vec<(i64, f32)> {
let mp = get_f32_tag(record, b"mp");
let m6a: Vec<&BaseMod> = self.base_mods.iter().filter(|x| x.is_m6a()).collect();
if m6a.is_empty() || mp.is_empty() {
return vec![];
}
let m6a: Vec<i64> = m6a.iter().flat_map(|x| x.get_modified_bases()).collect();
if m6a.len() != mp.len() {
log::warn!(
"In {} m6A mods ({}) not equal to number of predictions ({}), returning nothing for this read.",
String::from_utf8_lossy(record.qname()),
m6a.len(),
mp.len()
);
return vec![];
}
m6a.into_iter().zip(mp.into_iter()).collect()
}
/// Collect all m6A calls as three parallel vectors sorted by read position.
///
/// Returns `(positions, reference_positions, ml_values)`, gathered from every
/// m6A track and ordered by `positions` with a stable sort, so calls sharing
/// a position keep their track order. All three vectors are empty when the
/// record has no m6A track.
///
/// # Panics
///
/// Panics if the total numbers of positions, reference positions and ML
/// values across the m6A tracks are not all equal.
pub fn m6a(&self) -> (Vec<i64>, Vec<i64>, Vec<u8>) {
    let tracks: Vec<&BaseMod> = self.base_mods.iter().filter(|bm| bm.is_m6a()).collect();
    if tracks.is_empty() {
        return (Vec::new(), Vec::new(), Vec::new());
    }

    // Concatenate each column over all tracks before checking the totals.
    let mut read_pos: Vec<i64> = Vec::new();
    let mut ref_pos: Vec<i64> = Vec::new();
    let mut quals: Vec<u8> = Vec::new();
    for bm in &tracks {
        read_pos.extend(bm.get_modified_bases());
        ref_pos.extend(bm.get_reference_positions());
        quals.extend(bm.get_modified_probabilities());
    }
    assert_eq!(read_pos.len(), ref_pos.len());
    assert_eq!(ref_pos.len(), quals.len());

    // Sort the rows together so the three columns stay aligned.
    let mut rows: Vec<(i64, i64, u8)> = izip!(read_pos, ref_pos, quals).collect();
    rows.sort_by_key(|&(pos, _, _)| pos);
    multiunzip(rows)
}
pub fn cpg_positions(&self, reference: bool) -> Vec<i64> {
let cpg: Vec<&BaseMod> = self.base_mods.iter().filter(|x| x.is_cpg()).collect();
if cpg.is_empty() {
return vec![];
}
if reference {
cpg[0].get_reference_positions()
} else {
cpg[0].get_modified_bases()
}
}
/// Replace the `MM` and `ML` aux tags of `record` with the contents of `self`.
///
/// For each track an MM entry `<base><strand><type>,d1,d2,...;` is written,
/// where every `d` is the number of occurrences of the track's base between
/// the previous call and the current one on the forward-strand read sequence
/// (the stored sequence is reverse-complemented for reverse records). The ML
/// array is the concatenation of each track's forward-order probabilities.
///
/// Existing `MM` / `ML` tags are removed first; a failure to remove them
/// (e.g. because they are absent) is ignored.
///
/// # Panics
///
/// Panics if a position lies beyond the end of the read sequence or if
/// pushing either tag onto the record fails.
pub fn add_mm_and_ml_tags(&self, record: &mut bam::Record) {
    let forward_seq = if record.is_reverse() {
        revcomp(record.seq().as_bytes())
    } else {
        record.seq().as_bytes()
    };

    let mut mm_text = String::new();
    let mut ml_values: Vec<u8> = Vec::new();
    for track in &self.base_mods {
        ml_values.extend(track.get_modified_probabilities_forward());

        mm_text.push(track.modified_base as char);
        mm_text.push(track.strand);
        mm_text.push(track.modification_type);

        // Index just past the previous call; counting resumes from here.
        let mut scan_from = 0usize;
        for pos in track.get_modified_bases_forward() {
            let target = pos as usize;
            // Occurrences of the track's base skipped since the previous call.
            let skipped = if scan_from < target {
                forward_seq[scan_from..target]
                    .iter()
                    .filter(|&&base| base == track.modified_base)
                    .count()
            } else {
                0
            };
            scan_from = target + 1;
            mm_text.push_str(&format!(",{}", skipped));
        }
        mm_text.push(';');
    }

    log::trace!(
        "{}\n{}\n{}\n",
        record.is_reverse(),
        mm_text,
        String::from_utf8_lossy(&forward_seq)
    );

    // Drop any stale tags before writing the fresh ones.
    record.remove_aux(b"MM").unwrap_or(());
    record.remove_aux(b"ML").unwrap_or(());

    record.push_aux(b"MM", Aux::String(&mm_text)).unwrap();
    let ml_array: AuxArray<u8> = (&ml_values).into();
    record.push_aux(b"ML", Aux::ArrayU8(ml_array)).unwrap();
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use env_logger::{Builder, Target};
    use log;
    use rust_htslib::{bam, bam::Read};

    /// Round-trip check: parsing the MM/ML tags, writing them back with
    /// `add_mm_and_ml_tags` and parsing again must yield identical `BaseMods`
    /// for every record of the test BAM.
    #[test]
    fn test_mods_do_not_change() {
        // `try_init` rather than `init`: `init` panics when a global logger is
        // already installed, which happens as soon as another test in the same
        // binary sets one up. A failed initialisation is harmless here.
        let _ = Builder::new()
            .target(Target::Stderr)
            .filter(None, log::LevelFilter::Debug)
            .try_init();

        let mut bam = bam::Reader::from_path(".test/all.bam").unwrap();
        for rec in bam.records() {
            let mut rec = rec.unwrap();
            // A threshold of 0 keeps every call, so nothing is filtered away.
            let mods = BaseMods::new(&rec, 0);
            mods.add_mm_and_ml_tags(&mut rec);
            let mods_2 = BaseMods::new(&rec, 0);
            assert_eq!(mods, mods_2);
        }
    }
}