pilercr_parser/
lib.rs

1#![deny(warnings, missing_docs)]
2//! Parses the output produced by PILER-CR (<https://www.drive5.com/pilercr/>), a CRISPR array
3//! annotation tool.
4//!
5//! PILER-CR v1.06 (at least) reports incorrect coordinates if any of the repeat sequences contains gaps.
6//! This parser will correct those errors, and also provides the repeat sequence of each repeat-spacer
7//! (which is given only as a difference pattern to the consensus in the PILER-CR output).
8//!
9//! ## Example
10//!
11//! ```rust
12//! use std::fs::File;
13//! use std::io::{BufReader, Read};
14//!
15//! let file = File::open("examples/example.txt").unwrap();
16//! let mut reader = BufReader::new(file);
17//! let mut input = String::new();
18//! reader.read_to_string(&mut input).unwrap();
19//! let arrays = pilercr_parser::parse(&input).unwrap();
20//! for array in arrays {
21//!     println!(
22//!         "{} has {} arrays",
23//!         array.accession,
24//!         array.repeat_spacers.len()
25//!     );
26//! }
27//! ```
28
29use nom::{
30    bytes::complete::tag,
31    character::complete::{
32        alpha0, char, digit0, digit1, line_ending, multispace0, multispace1, not_line_ending,
33    },
34    error::Error,
35    multi::{many0, many1},
36    number::complete::float,
37    sequence::{pair, tuple},
38    Err, IResult, InputTakeAtPosition,
39};
40
41#[derive(Debug, PartialEq)]
42/// Represents the information of a repeat-spacer as reflected in the PILER-CR output.
43/// The coordinates here can be incorrect (we will correct them later) and the
44/// repeat sequence has not yet been constructed.
45struct RawRepeatSpacer<'a> {
46    /// Zero-indexed, inclusive start coordinate.
47    start: usize,
48    /// Zero-indexed, exclusive end coordinate.
49    end: usize,
50    /// A pattern representing the difference between this repeat and the consensus repeat.
51    repeat_diff: &'a str,
52    /// Sequence of the spacer.
53    spacer: &'a str,
54}
55
56#[derive(Debug, PartialEq)]
57/// A single repeat-spacer.
58pub struct RepeatSpacer<'a> {
59    /// Zero-indexed, inclusive start coordinate.
60    pub start: usize,
61    /// Zero-indexed, exclusive end coordinate.
62    pub end: usize,
63    /// Zero-indexed, inclusive start coordinate of the spacer.
64    pub spacer_start: usize,
65    /// Zero-indexed, exclusive end coordinate of the spacer.
66    pub spacer_end: usize,
67    /// Zero-indexed, inclusive start coordinate of the repeat.
68    pub repeat_start: usize,
69    /// Zero-indexed, exclusive end coordinate of the repeat.
70    pub repeat_end: usize,
71    /// Sequence of the repeat.
72    pub repeat: String,
73    /// Sequence of the spacer.
74    pub spacer: &'a str,
75}
76
77#[derive(Debug, PartialEq)]
78/// A single CRISPR array.
79pub struct Array<'a> {
80    /// Accession of the contig/genome.
81    pub accession: &'a str,
82    /// The Nth CRISPR array in the PILER-CR output.
83    pub order: usize,
84    /// Zero-indexed, inclusive start coordinate.
85    pub start: usize,
86    /// Zero-indexed, exclusive end coordinate.
87    pub end: usize,
88    /// The consensus repeat sequence for this array. May include gaps.
89    pub consensus_repeat_sequence: &'a str,
90    /// The repeat-spacers in this array.
91    pub repeat_spacers: Vec<RepeatSpacer<'a>>,
92}
93
94/// Parses the output of PILER-CR for a single contig/genome.
95pub fn parse(input: &str) -> Result<Vec<Array>, Err<Error<&str>>> {
96    let result = tuple((skip_header, many0(parse_array)))(input);
97    match result {
98        Ok((_, (_, arrays))) => Ok(arrays),
99        Err(e) => Err(e),
100    }
101}
102
103/// Gets space-delimited text.
104fn not_space(input: &str) -> IResult<&str, &str> {
105    input.split_at_position_complete(char::is_whitespace)
106}
107
108/// Skips the lines at the beginning of the PILER-CR output.
109fn skip_header(input: &str) -> IResult<&str, ()> {
110    let result = tuple((
111        skip_one_line,
112        skip_one_line,
113        skip_empty_line,
114        skip_one_line,
115        skip_empty_line,
116        skip_empty_line,
117        skip_empty_line,
118        skip_one_line,
119        skip_empty_line,
120        skip_empty_line,
121        skip_empty_line,
122    ))(input);
123    match result {
124        Ok((remainder, _)) => Ok((remainder, ())),
125        Err(e) => Err(e),
126    }
127}
128
129/// Parses a single repeat spacer. These may have incorrect coordinates, and the repeat sequence
130/// has not yet been determined.
131fn parse_raw_repeat_spacer(input: &str) -> IResult<&str, RawRepeatSpacer> {
132    let result = tuple((
133        multispace0,
134        digit1,
135        multispace1,
136        digit1,
137        multispace1,
138        float,
139        multispace1,
140        digit0,
141        multispace0,
142        alpha0,
143        multispace1,
144        not_space,
145        multispace1,
146        alpha0,
147    ))(input);
148    match result {
149        Ok((remainder, data)) => {
150            let repeat_diff = data.11;
151            let spacer = data.13;
152            let start = data.1.parse::<usize>().unwrap() - 1;
153            let end = start + repeat_diff.len() + spacer.len();
154            let raw_repeat_spacer = RawRepeatSpacer {
155                start,
156                end,
157                repeat_diff,
158                spacer,
159            };
160            Ok((remainder, raw_repeat_spacer))
161        }
162        Err(e) => Err(e),
163    }
164}
165
166/// Skips a line with text.
167fn skip_one_line(input: &str) -> IResult<&str, ()> {
168    let result = pair(not_line_ending, line_ending)(input);
169    match result {
170        Ok((remaining, _)) => Ok((remaining, ())),
171        Err(e) => Err(e),
172    }
173}
174
175/// Skips an empty line.
176fn skip_empty_line(input: &str) -> IResult<&str, ()> {
177    let result = line_ending(input);
178    match result {
179        Ok((remaining, _)) => Ok((remaining, ())),
180        Err(e) => Err(e),
181    }
182}
183
184/// Gets the consensus sequence from the last line of an array and discards everything else.
185fn parse_array_summary_line(input: &str) -> IResult<&str, &str> {
186    let result = tuple((
187        multispace0,
188        digit1,
189        multispace1,
190        digit1,
191        multispace1,
192        digit1,
193        multispace1,
194        not_space,
195        line_ending,
196    ))(input);
197    match result {
198        Ok((remainder, data)) => Ok((remainder, data.7)),
199        Err(e) => Err(e),
200    }
201}
202
203/// Parses a single CRISPR array.
204fn parse_array(input: &str) -> IResult<&str, Array> {
205    let result = tuple((
206        tag("Array "),
207        digit1,
208        line_ending,
209        char('>'),
210        not_space,
211        line_ending,
212        skip_empty_line,
213        skip_one_line,
214        skip_one_line,
215        many1(parse_raw_repeat_spacer),
216        skip_one_line,
217        skip_one_line,
218        parse_array_summary_line,
219        skip_empty_line,
220        skip_empty_line,
221    ))(input);
222    match result {
223        Err(e) => Err(e),
224        Ok((remainder, data)) => {
225            let order = data.1.parse::<usize>().unwrap() - 1;
226            let accession = data.4;
227            let raw_repeat_spacers = data.9;
228            let consensus_repeat_sequence = data.12;
229            let repeat_spacers =
230                convert_raw_rs_to_final_rs(consensus_repeat_sequence, &raw_repeat_spacers);
231            let start = repeat_spacers.first().unwrap().start;
232            let end = repeat_spacers.last().unwrap().end;
233            Ok((
234                remainder,
235                Array {
236                    start,
237                    end,
238                    order,
239                    accession,
240                    consensus_repeat_sequence,
241                    repeat_spacers,
242                },
243            ))
244        }
245    }
246}
247
248/// Due to a bug in PILER-CR, coordinates don't take gaps in repeat sequences into account.
249/// We correct those coordinates here. Additionally, each repeat has a difference pattern instead
250/// of an actual sequence, so we determine what the true repeat sequence is.
251fn convert_raw_rs_to_final_rs<'a>(
252    consensus_repeat: &'a str,
253    raw_repeat_spacers: &[RawRepeatSpacer<'a>],
254) -> Vec<RepeatSpacer<'a>> {
255    let mut output = vec![];
256    let mut total_gap_count = 0usize;
257    for raw in raw_repeat_spacers {
258        assert_eq!(raw.repeat_diff.len(), consensus_repeat.len());
259        let repeat = raw
260            .repeat_diff
261            .chars()
262            .zip(consensus_repeat.chars())
263            .filter(|(r, _)| *r != '-')
264            .map(|(r, c)| if r == '.' { c } else { r })
265            .collect::<String>();
266        let gap_count = raw.repeat_diff.matches('-').count();
267        let rs = RepeatSpacer {
268            start: raw.start - total_gap_count,
269            end: raw.end - total_gap_count - gap_count,
270            repeat_start: raw.start - total_gap_count,
271            repeat_end: raw.start - total_gap_count + repeat.len(),
272            spacer_start: raw.start - total_gap_count + repeat.len(),
273            spacer_end: raw.end - total_gap_count - gap_count,
274            repeat,
275            spacer: raw.spacer,
276        };
277        output.push(rs);
278        total_gap_count += gap_count;
279    }
280    output
281}
282
283#[cfg(test)]
284mod tests {
285    use super::*;
286
287    #[test]
288    fn test_parse_raw_repeat_spacer() {
289        let input = "       462      36   100.0      29  CTTTCTGAAG    ....................................    CGTGCTCGCTTTGAATTTGTAGAACCCGA";
290        let expected = RawRepeatSpacer {
291            start: 461,
292            end: 461 + 36 + 29,
293            repeat_diff: "....................................",
294            spacer: "CGTGCTCGCTTTGAATTTGTAGAACCCGA",
295        };
296        let (_, actual) = parse_raw_repeat_spacer(input).unwrap();
297        assert_eq!(expected, actual);
298    }
299
300    #[test]
301    fn test_parse_array_summary_line() {
302        let input = "        22      36              29                GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC\n";
303        let expected = "GTTGTGGTTTGATGTAGGAATCAAAAGATATACAAC";
304        let (_, actual) = parse_array_summary_line(input).unwrap();
305        assert_eq!(expected, actual);
306    }
307
308    #[test]
309    fn test_convert_raw_rs_to_final_rs() {
310        let consensus = "AAGTTTCCGTCCCCTTTCGGGGAATCATTTAGAAAAT--A";
311        let raws = vec![
312            RawRepeatSpacer {
313                start: 3831,
314                end: 3906,
315                repeat_diff: "..A..................................CC.",
316                spacer: "GAATTACATCGTATGCCAATACGCAGTTGCTTTT",
317            },
318            RawRepeatSpacer {
319                start: 3831,
320                end: 3906,
321                repeat_diff: "GG............-......................--.",
322                spacer: "ATCACATTCA",
323            },
324        ];
325        let rs = convert_raw_rs_to_final_rs(consensus, &raws);
326        assert_eq!(rs.len(), 2);
327        assert_eq!(rs[0].repeat, "AAATTTCCGTCCCCTTTCGGGGAATCATTTAGAAAATCCA");
328        assert_eq!(rs[1].repeat, "GGGTTTCCGTCCCCTTCGGGGAATCATTTAGAAAATA");
329    }
330
331    #[test]
332    fn test_parse_array() {
333        let input = "Array 5
334>MGYG000273829_14
335
336       Pos  Repeat     %id  Spacer  Left flank    Repeat                                  Spacer
337==========  ======  ======  ======  ==========    ====================================    ======
338     16576      36   100.0      30  AAACAGTTCT    ....................................    ACGAACTTAGTACCCTTTTCTGGGCGGCAT
339     16642      36   100.0      30  TGGGCGGCAT    ....................................    CCGCAGGTGCTACCGCTGTTATACTCTGTT
340     16708      36   100.0      30  ATACTCTGTT    ....................................    CGTAAATCGTTGGCGAAACGCTACCAACTG
341     16774      36   100.0      30  CTACCAACTG    ....................................    CCTCGGTCTGCTCTAACAGATCCCCCAAGT
342     16840      36   100.0      30  TCCCCCAAGT    ....................................    ACAGAGAAAGAAAGAGAGATTAACGACTAC
343     16906      36   100.0      30  TAACGACTAC    ....................................    TGAAACGGAGTGGACAGGTAAAGGAATGGG
344     16972      36   100.0      30  AAGGAATGGG    ....................................    TGCGGTCCCTTGGTTCCGTCAACAACATCA
345     17038      36   100.0      30  AACAACATCA    ....................................    TGTCCTATTCCCTTTTATGCTGCGTGTATA
346     17104      36   100.0      30  TGCGTGTATA    ....................................    AATACAAGCATAAAGAACGAACCGCAACGG
347     17170      36   100.0          ACCGCAACGG    ....................................    AGGGAA
348==========  ======  ======  ======  ==========    ====================================
349        10      36              30                GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT
350
351
352";
353        let (_, actual) = parse_array(input).unwrap();
354        assert_eq!(actual.repeat_spacers.len(), 10);
355        assert_eq!(actual.accession, "MGYG000273829_14");
356        assert_eq!(
357            actual.consensus_repeat_sequence,
358            "GCTGTAGTTCCCGGTTATTACTTGGTATGTTATAAT"
359        );
360        assert_eq!(actual.repeat_spacers[0].start, 16575);
361        assert_eq!(actual.repeat_spacers[9].start, 17169);
362    }
363
364    #[test]
365    fn test_parse_array_with_gaps() {
366        let input = "Array 18
367>MGYG000232241_150
368
369       Pos  Repeat     %id  Spacer  Left flank    Repeat                                      Spacer
370==========  ======  ======  ======  ==========    ========================================    ======
371      3832      40    92.5      34  CATATAGCAA    ..A..................................CC.    GAATTACATCGTATGCCAATACGCAGTTGCTTTT
372      3906      40    97.5      41  AGTTGCTTTT    .....................................---    TGTACTACTATGCGGTATTCCATCTGAAGGATGGCGGCTAC
373      3987      40    92.5          TGGCGGCTAC    GG............-......................--.    ATCACATTCA
374==========  ======  ======  ======  ==========    ========================================
375         3      40              37                AAGTTTCCGTCCCCTTTCGGGGAATCATTTAGAAAAT--A
376
377
378";
379        let expected = Array {
380            accession: "MGYG000232241_150",
381            consensus_repeat_sequence: "AAGTTTCCGTCCCCTTTCGGGGAATCATTTAGAAAAT--A",
382            start: 3831,
383            end: 4030,
384            order: 17,
385            repeat_spacers: vec![
386                RepeatSpacer {
387                    start: 3831,
388                    end: 3905,
389                    repeat_start: 3831,
390                    repeat_end: 3871,
391                    spacer_start: 3871,
392                    spacer_end: 3905,
393                    spacer: "GAATTACATCGTATGCCAATACGCAGTTGCTTTT",
394                    repeat: "AAATTTCCGTCCCCTTTCGGGGAATCATTTAGAAAATCCA".to_string(),
395                },
396                RepeatSpacer {
397                    start: 3905,
398                    end: 3983,
399                    repeat_start: 3905,
400                    repeat_end: 3942,
401                    spacer_start: 3942,
402                    spacer_end: 3983,
403                    spacer: "TGTACTACTATGCGGTATTCCATCTGAAGGATGGCGGCTAC",
404                    repeat: "AAGTTTCCGTCCCCTTTCGGGGAATCATTTAGAAAAT".to_string(),
405                },
406                RepeatSpacer {
407                    start: 3983,
408                    end: 4030,
409                    repeat_start: 3983,
410                    repeat_end: 4020,
411                    spacer_start: 4020,
412                    spacer_end: 4030,
413                    spacer: "ATCACATTCA",
414                    repeat: "GGGTTTCCGTCCCCTTCGGGGAATCATTTAGAAAATA".to_string(),
415                },
416            ],
417        };
418        let (_, actual) = parse_array(input).unwrap();
419        assert_eq!(expected, actual);
420    }
421}