1#![deny(warnings, missing_docs)]
2use nom::{
30 bytes::complete::tag,
31 character::complete::{
32 alpha0, char, digit0, digit1, line_ending, multispace0, multispace1, not_line_ending,
33 },
34 error::Error,
35 multi::{many0, many1},
36 number::complete::float,
37 sequence::{pair, tuple},
38 Err, IResult, InputTakeAtPosition,
39};
40
41#[derive(Debug, PartialEq)]
42struct RawRepeatSpacer<'a> {
46 start: usize,
48 end: usize,
50 repeat_diff: &'a str,
52 spacer: &'a str,
54}
55
56#[derive(Debug, PartialEq)]
57pub struct RepeatSpacer<'a> {
59 pub start: usize,
61 pub end: usize,
63 pub spacer_start: usize,
65 pub spacer_end: usize,
67 pub repeat_start: usize,
69 pub repeat_end: usize,
71 pub repeat: String,
73 pub spacer: &'a str,
75}
76
77#[derive(Debug, PartialEq)]
78pub struct Array<'a> {
80 pub accession: &'a str,
82 pub order: usize,
84 pub start: usize,
86 pub end: usize,
88 pub consensus_repeat_sequence: &'a str,
90 pub repeat_spacers: Vec<RepeatSpacer<'a>>,
92}
93
94pub fn parse(input: &str) -> Result<Vec<Array>, Err<Error<&str>>> {
96 let result = tuple((skip_header, many0(parse_array)))(input);
97 match result {
98 Ok((_, (_, arrays))) => Ok(arrays),
99 Err(e) => Err(e),
100 }
101}
102
103fn not_space(input: &str) -> IResult<&str, &str> {
105 input.split_at_position_complete(char::is_whitespace)
106}
107
108fn skip_header(input: &str) -> IResult<&str, ()> {
110 let result = tuple((
111 skip_one_line,
112 skip_one_line,
113 skip_empty_line,
114 skip_one_line,
115 skip_empty_line,
116 skip_empty_line,
117 skip_empty_line,
118 skip_one_line,
119 skip_empty_line,
120 skip_empty_line,
121 skip_empty_line,
122 ))(input);
123 match result {
124 Ok((remainder, _)) => Ok((remainder, ())),
125 Err(e) => Err(e),
126 }
127}
128
129fn parse_raw_repeat_spacer(input: &str) -> IResult<&str, RawRepeatSpacer> {
132 let result = tuple((
133 multispace0,
134 digit1,
135 multispace1,
136 digit1,
137 multispace1,
138 float,
139 multispace1,
140 digit0,
141 multispace0,
142 alpha0,
143 multispace1,
144 not_space,
145 multispace1,
146 alpha0,
147 ))(input);
148 match result {
149 Ok((remainder, data)) => {
150 let repeat_diff = data.11;
151 let spacer = data.13;
152 let start = data.1.parse::<usize>().unwrap() - 1;
153 let end = start + repeat_diff.len() + spacer.len();
154 let raw_repeat_spacer = RawRepeatSpacer {
155 start,
156 end,
157 repeat_diff,
158 spacer,
159 };
160 Ok((remainder, raw_repeat_spacer))
161 }
162 Err(e) => Err(e),
163 }
164}
165
166fn skip_one_line(input: &str) -> IResult<&str, ()> {
168 let result = pair(not_line_ending, line_ending)(input);
169 match result {
170 Ok((remaining, _)) => Ok((remaining, ())),
171 Err(e) => Err(e),
172 }
173}
174
175fn skip_empty_line(input: &str) -> IResult<&str, ()> {
177 let result = line_ending(input);
178 match result {
179 Ok((remaining, _)) => Ok((remaining, ())),
180 Err(e) => Err(e),
181 }
182}
183
184fn parse_array_summary_line(input: &str) -> IResult<&str, &str> {
186 let result = tuple((
187 multispace0,
188 digit1,
189 multispace1,
190 digit1,
191 multispace1,
192 digit1,
193 multispace1,
194 not_space,
195 line_ending,
196 ))(input);
197 match result {
198 Ok((remainder, data)) => Ok((remainder, data.7)),
199 Err(e) => Err(e),
200 }
201}
202
203fn parse_array(input: &str) -> IResult<&str, Array> {
205 let result = tuple((
206 tag("Array "),
207 digit1,
208 line_ending,
209 char('>'),
210 not_space,
211 line_ending,
212 skip_empty_line,
213 skip_one_line,
214 skip_one_line,
215 many1(parse_raw_repeat_spacer),
216 skip_one_line,
217 skip_one_line,
218 parse_array_summary_line,
219 skip_empty_line,
220 skip_empty_line,
221 ))(input);
222 match result {
223 Err(e) => Err(e),
224 Ok((remainder, data)) => {
225 let order = data.1.parse::<usize>().unwrap() - 1;
226 let accession = data.4;
227 let raw_repeat_spacers = data.9;
228 let consensus_repeat_sequence = data.12;
229 let repeat_spacers =
230 convert_raw_rs_to_final_rs(consensus_repeat_sequence, &raw_repeat_spacers);
231 let start = repeat_spacers.first().unwrap().start;
232 let end = repeat_spacers.last().unwrap().end;
233 Ok((
234 remainder,
235 Array {
236 start,
237 end,
238 order,
239 accession,
240 consensus_repeat_sequence,
241 repeat_spacers,
242 },
243 ))
244 }
245 }
246}
247
248fn convert_raw_rs_to_final_rs<'a>(
252 consensus_repeat: &'a str,
253 raw_repeat_spacers: &[RawRepeatSpacer<'a>],
254) -> Vec<RepeatSpacer<'a>> {
255 let mut output = vec![];
256 let mut total_gap_count = 0usize;
257 for raw in raw_repeat_spacers {
258 assert_eq!(raw.repeat_diff.len(), consensus_repeat.len());
259 let repeat = raw
260 .repeat_diff
261 .chars()
262 .zip(consensus_repeat.chars())
263 .filter(|(r, _)| *r != '-')
264 .map(|(r, c)| if r == '.' { c } else { r })
265 .collect::<String>();
266 let gap_count = raw.repeat_diff.matches('-').count();
267 let rs = RepeatSpacer {
268 start: raw.start - total_gap_count,
269 end: raw.end - total_gap_count - gap_count,
270 repeat_start: raw.start - total_gap_count,
271 repeat_end: raw.start - total_gap_count + repeat.len(),
272 spacer_start: raw.start - total_gap_count + repeat.len(),
273 spacer_end: raw.end - total_gap_count - gap_count,
274 repeat,
275 spacer: raw.spacer,
276 };
277 output.push(rs);
278 total_gap_count += gap_count;
279 }
280 output
281}
282
283#[cfg(test)]
284mod tests {
285 use super::*;
286
287 #[test]
288 fn test_parse_raw_repeat_spacer() {
289 let input = " 462 36 100.0 29 CTTTCTGAAG .................................... CGTGCTCGCTTTGAATTTGTAGAACCCGA";
290 let expected = RawRepeatSpacer {
291 start: 461,
292 end: 461 + 36 + 29,
293 repeat_diff: "....................................",
294 spacer: "CGTGCTCGCTTTGAATTTGTAGAACCCGA",
295 };
296 let (_, actual) = parse_raw_repeat_spacer(input).unwrap();
297 assert_eq!(expected, actual);
298 }
299
300 #[test]
301 fn test_parse_array_summary_line() {
302 let input = " 22 36 29 GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC\n";
303 let expected = "GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC";
304 let (_, actual) = parse_array_summary_line(input).unwrap();
305 assert_eq!(expected, actual);
306 }
307
308 #[test]
309 fn test_convert_raw_rs_to_final_rs() {
310 let consensus = "AAGTTTCCGTCCCCTTTCGGGGAATCATTTAGAAAAT--A";
311 let raws = vec![
312 RawRepeatSpacer {
313 start: 3831,
314 end: 3906,
315 repeat_diff: "..A..................................CC.",
316 spacer: "GAATTACATCGTATGCCAATACGCAGTTGCTTTT",
317 },
318 RawRepeatSpacer {
319 start: 3831,
320 end: 3906,
321 repeat_diff: "GG............-......................--.",
322 spacer: "ATCACATTCA",
323 },
324 ];
325 let rs = convert_raw_rs_to_final_rs(consensus, &raws);
326 assert_eq!(rs.len(), 2);
327 assert_eq!(rs[0].repeat, "AAATTTCCGTCCCCTTTCGGGGAATCATTTAGAAAATCCA");
328 assert_eq!(rs[1].repeat, "GGGTTTCCGTCCCCTTCGGGGAATCATTTAGAAAATA");
329 }
330
331 #[test]
332 fn test_parse_array() {
333 let input = "Array 5
334>MGYG000273829_14
335
336 Pos Repeat %id Spacer Left flank Repeat Spacer
337========== ====== ====== ====== ========== ==================================== ======
338 16576 36 100.0 30 AAACAGTTCT .................................... ACGAACTTAGTACCCTTTTCTGGGCGGCAT
339 16642 36 100.0 30 TGGGCGGCAT .................................... CCGCAGGTGCTACCGCTGTTATACTCTGTT
340 16708 36 100.0 30 ATACTCTGTT .................................... CGTAAATCGTTGGCGAAACGCTACCAACTG
341 16774 36 100.0 30 CTACCAACTG .................................... CCTCGGTCTGCTCTAACAGATCCCCCAAGT
342 16840 36 100.0 30 TCCCCCAAGT .................................... ACAGAGAAAGAAAGAGAGATTAACGACTAC
343 16906 36 100.0 30 TAACGACTAC .................................... TGAAACGGAGTGGACAGGTAAAGGAATGGG
344 16972 36 100.0 30 AAGGAATGGG .................................... TGCGGTCCCTTGGTTCCGTCAACAACATCA
345 17038 36 100.0 30 AACAACATCA .................................... TGTCCTATTCCCTTTTATGCTGCGTGTATA
346 17104 36 100.0 30 TGCGTGTATA .................................... AATACAAGCATAAAGAACGAACCGCAACGG
347 17170 36 100.0 ACCGCAACGG .................................... AGGGAA
348========== ====== ====== ====== ========== ====================================
349 10 36 30 GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT
350
351
352";
353 let (_, actual) = parse_array(input).unwrap();
354 assert_eq!(actual.repeat_spacers.len(), 10);
355 assert_eq!(actual.accession, "MGYG000273829_14");
356 assert_eq!(
357 actual.consensus_repeat_sequence,
358 "GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT"
359 );
360 assert_eq!(actual.repeat_spacers[0].start, 16575);
361 assert_eq!(actual.repeat_spacers[9].start, 17169);
362 }
363
364 #[test]
365 fn test_parse_array_with_gaps() {
366 let input = "Array 18
367>MGYG000232241_150
368
369 Pos Repeat %id Spacer Left flank Repeat Spacer
370========== ====== ====== ====== ========== ======================================== ======
371 3832 40 92.5 34 CATATAGCAA ..A..................................CC. GAATTACATCGTATGCCAATACGCAGTTGCTTTT
372 3906 40 97.5 41 AGTTGCTTTT .....................................--- TGTACTACTATGCGGTATTCCATCTGAAGGATGGCGGCTAC
373 3987 40 92.5 TGGCGGCTAC GG............-......................--. ATCACATTCA
374========== ====== ====== ====== ========== ========================================
375 3 40 37 AAGTTTCCGTCCCCTTTCGGGGAATCATTTAGAAAAT--A
376
377
378";
379 let expected = Array {
380 accession: "MGYG000232241_150",
381 consensus_repeat_sequence: "AAGTTTCCGTCCCCTTTCGGGGAATCATTTAGAAAAT--A",
382 start: 3831,
383 end: 4030,
384 order: 17,
385 repeat_spacers: vec![
386 RepeatSpacer {
387 start: 3831,
388 end: 3905,
389 repeat_start: 3831,
390 repeat_end: 3871,
391 spacer_start: 3871,
392 spacer_end: 3905,
393 spacer: "GAATTACATCGTATGCCAATACGCAGTTGCTTTT",
394 repeat: "AAATTTCCGTCCCCTTTCGGGGAATCATTTAGAAAATCCA".to_string(),
395 },
396 RepeatSpacer {
397 start: 3905,
398 end: 3983,
399 repeat_start: 3905,
400 repeat_end: 3942,
401 spacer_start: 3942,
402 spacer_end: 3983,
403 spacer: "TGTACTACTATGCGGTATTCCATCTGAAGGATGGCGGCTAC",
404 repeat: "AAGTTTCCGTCCCCTTTCGGGGAATCATTTAGAAAAT".to_string(),
405 },
406 RepeatSpacer {
407 start: 3983,
408 end: 4030,
409 repeat_start: 3983,
410 repeat_end: 4020,
411 spacer_start: 4020,
412 spacer_end: 4030,
413 spacer: "ATCACATTCA",
414 repeat: "GGGTTTCCGTCCCCTTCGGGGAATCATTTAGAAAATA".to_string(),
415 },
416 ],
417 };
418 let (_, actual) = parse_array(input).unwrap();
419 assert_eq!(expected, actual);
420 }
421}