minced_parser/
lib.rs

1#![deny(warnings, missing_docs)]
2//! Parses the output produced by MinCED (<https://github.com/ctSkennerton/minced>), a CRISPR array
3//! annotation tool.
4//!
5//! ## Example
6//!
7//! ```rust
8//! use minced_parser::parse;
9//! use std::fs::File;
10//! use std::io::{BufReader, Read};
11//!
12//! let file = File::open("examples/minced.txt").unwrap();
13//! let mut reader = BufReader::new(file);
14//! let mut input = String::new();
15//! reader.read_to_string(&mut input).unwrap();
16//! let contigs = parse(&input).unwrap();
17//! for contig in contigs {
18//!     println!("{} has {} arrays", contig.accession, contig.arrays.len());
19//! }
20//! ```
21
22use nom::{
23    branch::alt,
24    bytes::complete::{tag, take_until},
25    character::complete::{alpha1, char, digit1, line_ending, multispace1, not_line_ending},
26    error::Error,
27    multi::{many0, many1},
28    sequence::{pair, tuple},
29    Err, IResult,
30};
31
/// A single repeat together with the spacer that follows it.
///
/// All coordinates are zero-indexed and half-open: start coordinates are inclusive and end
/// coordinates are exclusive.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RepeatSpacer<'a> {
    /// Sequence of the repeat.
    pub repeat: &'a str,
    /// Sequence of the spacer.
    pub spacer: &'a str,
    /// Zero-indexed inclusive start coordinate of the whole repeat-spacer unit.
    pub start: usize,
    /// Zero-indexed exclusive end coordinate of the whole repeat-spacer unit.
    pub end: usize,
    /// Zero-indexed inclusive start coordinate of the spacer.
    pub spacer_start: usize,
    /// Zero-indexed exclusive end coordinate of the spacer.
    pub spacer_end: usize,
    /// Zero-indexed inclusive start coordinate of the repeat.
    pub repeat_start: usize,
    /// Zero-indexed exclusive end coordinate of the repeat.
    pub repeat_end: usize,
}
52
/// A single repeat, without a spacer. This is the last repeat in the CRISPR array.
///
/// Coordinates are zero-indexed and half-open (`start` inclusive, `end` exclusive).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RepeatOnly<'a> {
    /// Sequence of the repeat.
    pub repeat: &'a str,
    /// Zero-indexed inclusive start coordinate.
    pub start: usize,
    /// Zero-indexed exclusive end coordinate.
    pub end: usize,
}
63
64/// Represents one component of a CRISPR array.
65#[derive(Debug, PartialEq)]
66pub enum Repeat<'a> {
67    /// A repeat with a spacer
68    WithSpacer(RepeatSpacer<'a>),
69    /// A repeat without a spacer (the last repeat in the array)
70    WithoutSpacer(RepeatOnly<'a>),
71}
72
73#[derive(Debug, PartialEq)]
74/// A single CRISPR array.
75pub struct Array<'a> {
76    /// The nth CRISPR array in this genome/contig.
77    pub order: usize,
78    /// Zero-indexed inclusive start coordinate.
79    pub start: usize,
80    /// Zero-indexed exclusive end coordinate.
81    pub end: usize,
82    /// All of the repeat-spacer pairs in this CRISPR array.
83    pub repeat_spacers: Vec<Repeat<'a>>,
84}
85
86#[derive(Debug)]
87/// Represents all of the CRISPR arrays in a single contig or genome.
88pub struct Contig<'a> {
89    /// Accession of the contig/genome.
90    pub accession: &'a str,
91    /// Length of the contig/genome in base pairs.
92    pub bp: usize,
93    /// The CRISPR arrays in this contig/genome.
94    pub arrays: Vec<Array<'a>>,
95}
96
97/// Parses the output of minCED for a single contig/genome.
98pub fn parse(input: &str) -> Result<Vec<Contig>, Err<Error<&str>>> {
99    let result = many0(parse_contig_arrays)(input);
100    match result {
101        Ok((_, contigs)) => Ok(contigs),
102        Err(e) => Err(e),
103    }
104}
105
106/// Parses the accession and arrays for a single contig/genome
107fn parse_contig_arrays(input: &str) -> IResult<&str, Contig> {
108    let result = tuple((
109        parse_accession_line,
110        skip_empty_line,
111        many1(parse_array),
112        parse_footer,
113    ))(input);
114    match result {
115        Ok((remainder, ((accession, bp), _, arrays, _))) => Ok((
116            remainder,
117            Contig {
118                accession,
119                bp,
120                arrays,
121            },
122        )),
123        Err(e) => Err(e),
124    }
125}
126
127/// Parses a single CRISPR array.
128fn parse_array(input: &str) -> IResult<&str, Array> {
129    let result = tuple((
130        skip_empty_line,
131        parse_crispr_order_and_coordinates,
132        skip_empty_line,
133        skip_one_line,
134        skip_one_line,
135        many1(parse_repeat_spacer_line),
136        skip_one_line,
137        skip_one_line,
138    ))(input);
139    match result {
140        Ok((remainder, (_, (order, start, end), _, _, _, repeat_spacers, _, _))) => Ok((
141            remainder,
142            Array {
143                order,
144                start,
145                end,
146                repeat_spacers,
147            },
148        )),
149        Err(e) => Err(e),
150    }
151}
152
153/// Skips a line with text.
154fn skip_one_line(input: &str) -> IResult<&str, ()> {
155    let result = pair(not_line_ending, line_ending)(input);
156    match result {
157        Ok((remaining, _)) => Ok((remaining, ())),
158        Err(e) => Err(e),
159    }
160}
161
162/// Skips an empty line.
163fn skip_empty_line(input: &str) -> IResult<&str, ()> {
164    let result = line_ending(input);
165    match result {
166        Ok((remaining, _)) => Ok((remaining, ())),
167        Err(e) => Err(e),
168    }
169}
170
171/// Skips the four lines at the end of each contig.
172fn parse_footer(input: &str) -> IResult<&str, ()> {
173    let result = tuple((
174        skip_empty_line,
175        skip_one_line,
176        skip_empty_line,
177        skip_empty_line,
178    ))(input);
179    match result {
180        Ok((remainder, _)) => Ok((remainder, ())),
181        Err(e) => Err(e),
182    }
183}
184
185/// Parses the order (i.e. the nth CRISPR array found for a given run of minCED) and start/end
186/// coordinates of the array.
187fn parse_crispr_order_and_coordinates(input: &str) -> IResult<&str, (usize, usize, usize)> {
188    let result = tuple((
189        tag("CRISPR"),
190        char(' '),
191        digit1,
192        multispace1,
193        tag("Range:"),
194        char(' '),
195        digit1,
196        tag(" - "),
197        digit1,
198    ))(input);
199    match result {
200        Ok((remaining, (_, _, raw_order, _, _, _, start, _, end))) => Ok((
201            remaining,
202            (
203                raw_order.parse::<usize>().unwrap() - 1,
204                start.parse::<usize>().unwrap() - 1,
205                end.parse::<usize>().unwrap(),
206            ),
207        )),
208        Err(e) => Err(e),
209    }
210}
211
212/// Parses the contig/genome accession and length
213fn parse_accession_line(input: &str) -> IResult<&str, (&str, usize)> {
214    let result = tuple((
215        tag("Sequence '"),
216        take_until("'"),
217        tag("'"),
218        char(' '),
219        tag("("),
220        take_until(" "),
221        tag(" bp)"),
222    ))(input);
223    match result {
224        Ok((remainder, (_, accession, _, _, _, bp, _))) => {
225            Ok((remainder, (accession, bp.parse::<usize>().unwrap())))
226        }
227        Err(e) => Err(e),
228    }
229}
230
231/// Parses a single repeat/spacer line
232fn parse_repeat_spacer_line(input: &str) -> IResult<&str, Repeat> {
233    alt((parse_repeat_with_spacer, parse_repeat_only))(input)
234}
235
236/// Parses a repeat entry that has no spacer. This is always the final repeat in the array.
237fn parse_repeat_only(input: &str) -> IResult<&str, Repeat> {
238    let result = tuple((digit1, multispace1, alpha1, multispace1))(input);
239    match result {
240        Ok((remaining, (raw_start, _, repeat, _))) => {
241            let start = raw_start.parse::<usize>().unwrap() - 1;
242            Ok((
243                remaining,
244                Repeat::WithoutSpacer(RepeatOnly {
245                    repeat,
246                    start,
247                    end: start + repeat.len(),
248                }),
249            ))
250        }
251        Err(e) => Err(e),
252    }
253}
254
255/// Parses a repeat and spacer entry.
256fn parse_repeat_with_spacer(input: &str) -> IResult<&str, Repeat> {
257    let result = tuple((
258        digit1,
259        multispace1,
260        alpha1,
261        multispace1,
262        alpha1,
263        not_line_ending,
264        line_ending,
265    ))(input);
266    match result {
267        Ok((remaining, (raw_start, _, repeat, _, spacer, _, _))) => {
268            let start = raw_start.parse::<usize>().unwrap() - 1;
269            Ok((
270                remaining,
271                Repeat::WithSpacer(RepeatSpacer {
272                    repeat,
273                    spacer,
274                    start,
275                    end: start + repeat.len() + spacer.len(),
276                    repeat_start: start,
277                    repeat_end: start + repeat.len(),
278                    spacer_start: start + repeat.len(),
279                    spacer_end: start + repeat.len() + spacer.len(),
280                }),
281            ))
282        }
283        Err(e) => Err(e),
284    }
285}
286
287#[cfg(test)]
288mod tests {
289    use super::*;
290
291    #[test]
292    fn test_parse_array() {
293        let input = "\nCRISPR 1   Range: 10648 - 10814
294POSITION        REPEAT                          SPACER
295--------        -----------------------------   ----------------------------------------
29610648           CAAGTGCACCAACCAATCTCACCACCTCA   GGGGGTGCACTTAAAGGGGGTGCACTTGTCTCAAGTGCACCAAGAA  [ 29, 46 ]
29710723           CAAGTGCACCAACCAATCTCACCACCTCA   CCATCTCACCACCTCTCAGGGGGTGCAGTTGTCT      [ 29, 34 ]
29810786           CAAGTGCACCAACCAATCTCACCACCTCA
299--------        -----------------------------   ----------------------------------------
300Repeats: 3      Average Length: 29              Average Length: 40\n";
301        let expected = Array {
302            order: 0,
303            start: 10647,
304            end: 10814,
305            repeat_spacers: vec![
306                Repeat::WithSpacer(RepeatSpacer {
307                    start: 10647,
308                    end: 10722,
309                    repeat_start: 10647,
310                    repeat_end: 10676,
311                    spacer_start: 10676,
312                    spacer_end: 10722,
313                    repeat: "CAAGTGCACCAACCAATCTCACCACCTCA",
314                    spacer: "GGGGGTGCACTTAAAGGGGGTGCACTTGTCTCAAGTGCACCAAGAA",
315                }),
316                Repeat::WithSpacer(RepeatSpacer {
317                    start: 10722,
318                    end: 10785,
319                    repeat_start: 10722,
320                    repeat_end: 10751,
321                    spacer_start: 10751,
322                    spacer_end: 10785,
323                    repeat: "CAAGTGCACCAACCAATCTCACCACCTCA",
324                    spacer: "CCATCTCACCACCTCTCAGGGGGTGCAGTTGTCT",
325                }),
326                Repeat::WithoutSpacer(RepeatOnly {
327                    start: 10785,
328                    end: 10814,
329                    repeat: "CAAGTGCACCAACCAATCTCACCACCTCA",
330                }),
331            ],
332        };
333        let (_, actual) = parse_array(input).unwrap();
334        assert_eq!(expected, actual);
335    }
336
337    #[test]
338    fn test_parse_crispr_order_and_coordinates() {
339        let input = r"CRISPR 1   Range: 1214 - 1776";
340        let expected = (0, 1213, 1776);
341        let (_, actual) = parse_crispr_order_and_coordinates(input).unwrap();
342        assert_eq!(expected, actual);
343    }
344
345    #[test]
346    fn test_parse_accession_line() {
347        let input = "Sequence 'MGYG000166779_38' (12280 bp)";
348        let expected = ("MGYG000166779_38", 12280);
349        let (_, actual) = parse_accession_line(input).unwrap();
350        assert_eq!(expected, actual);
351    }
352
353    #[test]
354    fn test_parse_repeat_spacer() {
355        let input = "10723           CAAGTGCACCAACCAATCTCACCACCTCA   CCATCTCACCACCTCTCAGGGGGTGCAGTTGTCT      [ 29, 34 ]\n";
356        let expected = RepeatSpacer {
357            repeat: "CAAGTGCACCAACCAATCTCACCACCTCA",
358            spacer: "CCATCTCACCACCTCTCAGGGGGTGCAGTTGTCT",
359            start: 10722,
360            end: 10785,
361            repeat_start: 10722,
362            repeat_end: 10751,
363            spacer_start: 10751,
364            spacer_end: 10785,
365        };
366        let (_, actual) = parse_repeat_spacer_line(input).unwrap();
367        match actual {
368            Repeat::WithSpacer(act) => {
369                assert_eq!(expected, act);
370            }
371            _ => {
372                unreachable!()
373            }
374        }
375    }
376
377    #[test]
378    fn test_parse_repeat_only_line() {
379        let input = "10786		CAAGTGCACCAACCAATCTCACCACCTCA\n";
380        let expected = RepeatOnly {
381            repeat: "CAAGTGCACCAACCAATCTCACCACCTCA",
382            start: 10785,
383            end: 10814,
384        };
385        let (_, actual) = parse_repeat_spacer_line(input).unwrap();
386        match actual {
387            Repeat::WithoutSpacer(act) => {
388                assert_eq!(expected, act);
389            }
390            _ => {
391                unreachable!()
392            }
393        }
394    }
395
396    #[test]
397    fn test_parse_contig_arrays() {
398        let input = "Sequence 'MGYG000242676_4' (164254 bp)
399
400CRISPR 3   Range: 60487 - 61025
401POSITION	REPEAT				SPACER
402--------	------------------------------------	--------------------------
40360487		TTTAATAACCCTATATAATTTCTACTATTGTAGATA	TCTCCTTTGTAACTTCTTTGATTCGG	[ 36, 26 ]
40460549		TTTAATAACCCTATATAATTTCTACTGTCGTAGATA	TTGTTCTTTTATATGTGTACATAGCTAGA	[ 36, 29 ]
40560990		TTTAATAACCCTATATAATTTCTACTTTTTTGATTA
406--------	------------------------------------	--------------------------
407Repeats: 9	Average Length: 36		Average Length: 26
408
409CRISPR 4   Range: 157550 - 157915
410POSITION	REPEAT				SPACER
411--------	------------------------------------	------------------------------
412157550		GTTTTACTACCTTATAGATTTACACTATTCTCAAAC	GAGGGGTTGTCCTTCATGTACTCTTTACCT	[ 36, 30 ]
413157748		GTTTTACTACCTTATAGATTTACACTATTCTCAAAC	GGGCTTATACTCTGACTTTCAACAAGTTAG	[ 36, 30 ]
414157814		GTTTTACTACCTTATAGATTTACACTATTCTCAAAC	CCGATTTTTTCATTGCCAAAACGATATTTT	[ 36, 30 ]
415157880		GTTTTACTACCTTATAGATTTACACTATTCTCAAAC
416--------	------------------------------------	------------------------------
417Repeats: 6	Average Length: 36		Average Length: 30
418
419Time to find repeats: 22 ms
420
421
422";
423        let (_, contig) = parse_contig_arrays(input).unwrap();
424        assert_eq!(contig.accession, "MGYG000242676_4");
425        assert_eq!(contig.bp, 164254);
426        assert_eq!(contig.arrays.len(), 2);
427    }
428
429    #[test]
430    fn test_parse() {
431        let input = "Sequence 'MGYG000166779_38' (12280 bp)
432
433CRISPR 1   Range: 10648 - 10814
434POSITION	REPEAT				SPACER
435--------	-----------------------------	----------------------------------------
43610648		CAAGTGCACCAACCAATCTCACCACCTCA	GGGGGTGCACTTAAAGGGGGTGCACTTGTCTCAAGTGCACCAAGAA	[ 29, 46 ]
43710723		CAAGTGCACCAACCAATCTCACCACCTCA	CCATCTCACCACCTCTCAGGGGGTGCAGTTGTCT	[ 29, 34 ]
43810786		CAAGTGCACCAACCAATCTCACCACCTCA	
439--------	-----------------------------	----------------------------------------
440Repeats: 3	Average Length: 29		Average Length: 40
441
442Time to find repeats: 3 ms
443
444
445Sequence 'MGYG000166779_43' (11302 bp)
446
447CRISPR 2   Range: 4 - 1413
448POSITION	REPEAT				SPACER
449--------	------------------------------------	-----------------------------
4504		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	ACGGGTGCACTTTCGATGTCGCACTTTTTG	[ 36, 30 ]
45170		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	TATACATCATCGTACATATAAGCATACAG	[ 36, 29 ]
452135		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	GAAAAATCAGAGCCCAAAGTACGAGTAAC	[ 36, 29 ]
453200		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	CCAGTTCCCGAATTTGATGCTCTTGGCAT	[ 36, 29 ]
454265		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	ACTTACAACAACAACAATAACAATAAATG	[ 36, 29 ]
455330		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	ATACGTGTGCTCTATATACGCACCCATTGG	[ 36, 30 ]
456396		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	GGAGCTCTTTCGATGTCGCACTTTCTGAAG	[ 36, 30 ]
457462		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	CGTGCTCGCTTTGAATTTGTAGAACCCGA	[ 36, 29 ]
458527		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	TCTCGACACTATTTCTAACGAGGAAATTAA	[ 36, 30 ]
459593		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	GCGCTGAGAAGTTACCACCGACCGCTTGA	[ 36, 29 ]
460658		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	AATACTAAACCAAGATTGCCAAAGGTCCA	[ 36, 29 ]
461723		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	AGATGATCTACGCTCAATATTAGAAAAAC	[ 36, 29 ]
462788		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	GTATCTGCGGAACAAGTACAGAGAACATGA	[ 36, 30 ]
463854		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	CGAACCTAATACGGCTTTAGCCTTTTTGCA	[ 36, 30 ]
464920		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	AATGAGTACCAAAAGCAAAGAACAAATCGA	[ 36, 30 ]
465986		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	TATATTTTTGTGCGTTACCCGTCCGTGAGG	[ 36, 30 ]
4661052		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	TTACTTACGACTATTACGACCAGGTGAAC	[ 36, 29 ]
4671117		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	ATAATTATAATCGGAAATCAAGCGGATAA	[ 36, 29 ]
4681182		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	TTTGAATTAGATTCGGCAACCTTAGCATT	[ 36, 29 ]
4691247		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	TTTTCATCATATTCATAAGAATAGCGACC	[ 36, 29 ]
4701312		GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC	CCATACGCTCCTTGGTGGTCTTGGTAAGGA	[ 36, 30 ]
4711378		GTTGTGGTTTGATGTAGAAATCAAAAGACATACAAC	
472--------	------------------------------------	-----------------------------
473Repeats: 22	Average Length: 36		Average Length: 29
474
475Time to find repeats: 3 ms
476
477
478Sequence 'MGYG000242676_4' (164254 bp)
479
480CRISPR 3   Range: 60487 - 61025
481POSITION	REPEAT				SPACER
482--------	------------------------------------	--------------------------
48360487		TTTAATAACCCTATATAATTTCTACTATTGTAGATA	TCTCCTTTGTAACTTCTTTGATTCGG	[ 36, 26 ]
48460549		TTTAATAACCCTATATAATTTCTACTGTCGTAGATA	TTGTTCTTTTATATGTGTACATAGCTAGA	[ 36, 29 ]
48560614		TTTAATAACCCTATATAATTTCTACTATTGTAGATA	ACCTCCTTTGGATTTTCAGCAAATCAGG	[ 36, 28 ]
48660678		TTTAATAACCCTATATAATTTCTACTATTTTAGATA	ATACTGCTTGTTCTGTAAAAATTTTG	[ 36, 26 ]
48760740		TTTAATAACTCTATATAATTTCTACTATTGTAGATG	GAGTTCTCCAACCGTTTGCGGCAATA	[ 36, 26 ]
48860802		TTTAATAACCCTATATAATTTCTACTATTGTAGATA	ACGGTTGAATCAATGAGAAATGTTGTG	[ 36, 27 ]
48960865		TTTAATAACCCTATATAATTTCTACTATTGTAGATA	TGATATTGACGGTGACCTGATTAACCG	[ 36, 27 ]
49060928		TTTAATAACCCTATATAATTTCTACTATTGTAGATA	TGTCAATCACATCTGTGACCGCAAGG	[ 36, 26 ]
49160990		TTTAATAACCCTATATAATTTCTACTTTTTTGATTA	
492--------	------------------------------------	--------------------------
493Repeats: 9	Average Length: 36		Average Length: 26
494
495CRISPR 4   Range: 157550 - 157915
496POSITION	REPEAT				SPACER
497--------	------------------------------------	------------------------------
498157550		GTTTTACTACCTTATAGATTTACACTATTCTCAAAC	GAGGGGTTGTCCTTCATGTACTCTTTACCT	[ 36, 30 ]
499157616		GTTTTACTACCTTATAGATTTACACTATTCTCAAAC	ATACAAATGCATTGCCGAGGACAGTGTTTT	[ 36, 30 ]
500157682		GTTTTACTACCTTATAGATTTACACTATTCTCAAAC	TATACGGTTTGCCCGTGCAGTCTTGTACAA	[ 36, 30 ]
501157748		GTTTTACTACCTTATAGATTTACACTATTCTCAAAC	GGGCTTATACTCTGACTTTCAACAAGTTAG	[ 36, 30 ]
502157814		GTTTTACTACCTTATAGATTTACACTATTCTCAAAC	CCGATTTTTTCATTGCCAAAACGATATTTT	[ 36, 30 ]
503157880		GTTTTACTACCTTATAGATTTACACTATTCTCAAAC	
504--------	------------------------------------	------------------------------
505Repeats: 6	Average Length: 36		Average Length: 30
506
507Time to find repeats: 22 ms
508
509
510Sequence 'MGYG000273829_14' (62198 bp)
511
512CRISPR 5   Range: 15191 - 17205
513POSITION	REPEAT				SPACER
514--------	------------------------------------	-----------------------------
51515191		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	ATCGCTGAACCTACAACAGACGCAAGAACA	[ 36, 30 ]
51615257		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	GATATTGTCATACCTAAGTAAATAGGTGCG	[ 36, 30 ]
51715323		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	TACAATCAGTCTATAACATTTGCAACTACG	[ 36, 30 ]
51815389		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	TATTATAGACAGCAAGCAACTTGATGTAT	[ 36, 29 ]
51915454		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	CTTGAATTTGGGGAGATGTTCTCAGCTGGT	[ 36, 30 ]
52015520		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	AAAGTTTGCTGACAGGGACATTCAAAGCCG	[ 36, 30 ]
52115586		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	AAACCTGTCTGTCCGATCTGCACCATATAT	[ 36, 30 ]
52215652		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	ACCGCTATTGCGCTGCAGCATCCACAAGGA	[ 36, 30 ]
52315718		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	GCATCTTCCTGCGCTCTCTCTGAAAACATG	[ 36, 30 ]
52415784		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	CGAAGCCTAAAGCTCATTTCGCTTAGGCTT	[ 36, 30 ]
52515850		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	AAACATGGTGTTGATGTCAAAGAGCTGTAT	[ 36, 30 ]
52615916		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	TGGGCGAATATAAATTCCATCGGTGGCAAG	[ 36, 30 ]
52715982		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	AATATTGGATGATGTGTATGGCATTTTACT	[ 36, 30 ]
52816048		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	AGTGTATATGTGAACCCTGCTCCCAGTGCT	[ 36, 30 ]
52916114		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	AAAGACCGGAGCAAAGATGTCCGGGAGCCG	[ 36, 30 ]
53016180		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	TGAAAGTGGTGTAATTGTTATAACTCATTG	[ 36, 30 ]
53116246		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	AGACAACAGGTGTGGAAGCATATGTCTTTA	[ 36, 30 ]
53216312		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	TGCTGCATAGGTGTGTATTTTCTCATGTCG	[ 36, 30 ]
53316378		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	GGTAATGGTGGTGGCGGTTATACCGCAACT	[ 36, 30 ]
53416444		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	ATGGTCGGGGCTACATATTACGCCGCAGTA	[ 36, 30 ]
53516510		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	CGTGAGGTCTCCGACCGTGAAAACAGTTCT	[ 36, 30 ]
53616576		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	ACGAACTTAGTACCCTTTTCTGGGCGGCAT	[ 36, 30 ]
53716642		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	CCGCAGGTGCTACCGCTGTTATACTCTGTT	[ 36, 30 ]
53816708		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	CGTAAATCGTTGGCGAAACGCTACCAACTG	[ 36, 30 ]
53916774		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	CCTCGGTCTGCTCTAACAGATCCCCCAAGT	[ 36, 30 ]
54016840		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	ACAGAGAAAGAAAGAGAGATTAACGACTAC	[ 36, 30 ]
54116906		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	TGAAACGGAGTGGACAGGTAAAGGAATGGG	[ 36, 30 ]
54216972		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	TGCGGTCCCTTGGTTCCGTCAACAACATCA	[ 36, 30 ]
54317038		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	TGTCCTATTCCCTTTTATGCTGCGTGTATA	[ 36, 30 ]
54417104		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	AATACAAGCATAAAGAACGAACCGCAACGG	[ 36, 30 ]
54517170		GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT	
546--------	------------------------------------	-----------------------------
547Repeats: 31	Average Length: 36		Average Length: 29
548
549Time to find repeats: 9 ms
550
551
552";
553        let contigs = parse(input).unwrap();
554        assert_eq!(contigs.len(), 4);
555        let array_count: usize = contigs.iter().map(|c| c.arrays.len()).sum();
556        assert_eq!(array_count, 5);
557    }
558}