vcf/record/
parser.rs

1use super::VCFRecord;
2use crate::U8Vec;
3use nom::{
4    self, branch::alt, bytes::complete::is_not, bytes::complete::tag, bytes::complete::take_while1,
5    character::is_digit, combinator::opt, combinator::recognize, sequence::tuple,
6};
7use once_cell::sync::Lazy;
8use std::str;
9
10pub fn parse_separated_values<'a, U, F, G, E>(
11    result: &mut Vec<U8Vec>,
12    input: &'a [u8],
13    data: F,
14    separator: G,
15    require_one_entry: bool,
16) -> nom::IResult<&'a [u8], (), E>
17where
18    F: Fn(&'a [u8]) -> nom::IResult<&'a [u8], &'a [u8], E>,
19    G: Fn(&'a [u8]) -> nom::IResult<&'a [u8], U, E>,
20    E: nom::error::ParseError<&'a [u8]>,
21{
22    let mut index = 0;
23    let mut rest = input;
24
25    loop {
26        if let Ok((r, d)) = data(rest) {
27            if index < result.len() {
28                result[index].clear();
29                result[index].extend_from_slice(d);
30            } else {
31                result.push(d.to_vec());
32            }
33            index += 1;
34            rest = r;
35        }
36        if let Ok((r, _)) = separator(rest) {
37            rest = r;
38            continue;
39        }
40        if index == 0 && require_one_entry {
41            return Err(nom::Err::Error(nom::error::make_error(
42                input,
43                nom::error::ErrorKind::SeparatedNonEmptyList,
44            )));
45        }
46        if index <= result.len() {
47            result.drain(index..);
48            // remove overflowed content
49        }
50        return Ok((rest, ()));
51    }
52}
53
54pub fn parse_nested_separated_values<'a, U, V, F, G, H, E>(
55    result: &mut Vec<Vec<U8Vec>>,
56    input: &'a [u8],
57    data: F,
58    separator_inside: H,
59    separator_outside: G,
60    require_one_entry: bool,
61) -> nom::IResult<&'a [u8], (), E>
62where
63    F: Fn(&'a [u8]) -> nom::IResult<&'a [u8], &'a [u8], E>,
64    G: Fn(&'a [u8]) -> nom::IResult<&'a [u8], U, E>,
65    H: Fn(&'a [u8]) -> nom::IResult<&'a [u8], V, E>,
66    E: nom::error::ParseError<&'a [u8]>,
67{
68    let mut index = 0;
69    let mut rest = input;
70
71    loop {
72        while result.len() <= index {
73            result.push(Vec::new());
74        }
75        if let Ok((r, _)) =
76            parse_separated_values(&mut result[index], rest, &data, &separator_inside, true)
77        {
78            index += 1;
79            rest = r;
80        }
81        if let Ok((r, _)) = separator_outside(rest) {
82            rest = r;
83            continue;
84        }
85        if index == 0 && require_one_entry {
86            return Err(nom::Err::Error(nom::error::make_error(
87                input,
88                nom::error::ErrorKind::SeparatedNonEmptyList,
89            )));
90        }
91        if index <= result.len() {
92            result.drain(index..);
93            // remove overflowed content
94        }
95        return Ok((rest, ()));
96    }
97}
98
99pub fn parse_double_nested_separated_values<'a, U, V, W, F, G, H, I, E>(
100    result: &mut Vec<Vec<Vec<U8Vec>>>,
101    input: &'a [u8],
102    data: F,
103    separator_inside: H,
104    separator_outside: G,
105    separator_outside2: I,
106) -> nom::IResult<&'a [u8], (), E>
107where
108    F: Fn(&'a [u8]) -> nom::IResult<&'a [u8], &'a [u8], E>,
109    G: Fn(&'a [u8]) -> nom::IResult<&'a [u8], U, E>,
110    H: Fn(&'a [u8]) -> nom::IResult<&'a [u8], V, E>,
111    I: Fn(&'a [u8]) -> nom::IResult<&'a [u8], W, E>,
112    E: nom::error::ParseError<&'a [u8]>,
113{
114    let mut index = 0;
115    let mut rest = input;
116
117    loop {
118        while result.len() <= index {
119            result.push(Vec::new());
120        }
121        if let Ok((r, _)) = parse_nested_separated_values(
122            &mut result[index],
123            rest,
124            &data,
125            &separator_inside,
126            &separator_outside,
127            false,
128        ) {
129            index += 1;
130            rest = r;
131        }
132        if let Ok((r, _)) = separator_outside2(rest) {
133            rest = r;
134            continue;
135        }
136        if index <= result.len() {
137            result.drain(index..);
138            // remove overflowed content
139        }
140        return Ok((rest, ()));
141    }
142}
143
144static EMPTY_INFO: Lazy<Vec<(U8Vec, Vec<U8Vec>)>> = Lazy::new(|| vec![(b".".to_vec(), vec![])]);
145
146pub fn parse_info<'a, E>(
147    input: &'a [u8],
148    info: &mut Vec<(U8Vec, Vec<U8Vec>)>,
149) -> nom::IResult<&'a [u8], (), E>
150where
151    E: nom::error::ParseError<&'a [u8]>,
152{
153    let mut index = 0;
154    let mut rest = input;
155
156    while let Ok((r, key)) = is_not::<_, _, E>(&b"\t\r\n=;"[..])(rest) {
157        if info.len() <= index {
158            info.push((key.to_vec(), Vec::new()));
159        } else {
160            info[index].0.clear();
161            info[index].0.extend_from_slice(key);
162        }
163
164        if let Ok((r, _)) = tag::<_, _, E>(b"=")(r) {
165            let (r, _) = parse_separated_values(
166                &mut info[index].1,
167                r,
168                is_not(&b"\t\r\n,;"[..]),
169                tag(b","),
170                false,
171            )?;
172            rest = r;
173        } else {
174            info[index].1.clear();
175            rest = r;
176        }
177        index += 1;
178
179        if let Ok((r, _)) = tag::<_, _, E>(b";")(rest) {
180            rest = r;
181        } else {
182            //eprintln!("No colon: {:?}", rest);
183            break;
184        }
185    }
186
187    if index <= info.len() {
188        info.drain(index..);
189        // remove overflowed content
190    }
191
192    if info == &*EMPTY_INFO {
193        info.clear();
194    }
195
196    Ok((rest, ()))
197}
198
199fn parse_float<'a, E>(data: &'a [u8]) -> nom::IResult<&'a [u8], &'a [u8], E>
200where
201    E: nom::error::ParseError<&'a [u8]>,
202{
203    alt((
204        tag(b"."),
205        recognize(tuple((
206            take_while1(is_digit),
207            opt(tuple((tag(b"."), opt(take_while1(is_digit))))),
208            opt(tuple((
209                alt((tag(b"e"), tag(b"E"))),
210                opt(alt((tag("+"), tag("-")))),
211                take_while1(is_digit),
212            ))),
213        ))),
214    ))(data)
215}
216
217fn parse_record_optional_columns<'a, E>(
218    rest: &'a [u8],
219    record: &mut VCFRecord,
220) -> nom::IResult<&'a [u8], (), E>
221where
222    E: nom::error::ParseError<&'a [u8]>,
223{
224    let rest = match tag::<_, _, E>(b"\t")(rest) {
225        Ok((rest, _)) => rest,
226        Err(_) => {
227            record.qual = None;
228            record.filter.clear();
229            record.info.clear();
230            record.format.clear();
231            record.genotype.clear();
232            return Ok((rest, ()));
233        }
234    };
235    let (rest, qual) = parse_float(rest)?;
236    if qual == b"." {
237        record.qual = None;
238    } else {
239        record.qual = Some(str::from_utf8(qual).unwrap().parse().unwrap());
240    }
241    let rest = match tag::<_, _, E>(b"\t")(rest) {
242        Ok((rest, _)) => rest,
243        Err(_) => {
244            record.filter.clear();
245            record.info.clear();
246            record.format.clear();
247            record.genotype.clear();
248            return Ok((rest, ()));
249        }
250    };
251    let (rest, _) = parse_separated_values(
252        &mut record.filter,
253        rest,
254        is_not(&b"\t\r\n,"[..]),
255        tag(b","),
256        false,
257    )?;
258    if record.filter == [b"."] {
259        record.filter.clear();
260    }
261    let rest = match tag::<_, _, E>(b"\t")(rest) {
262        Ok((rest, _)) => rest,
263        Err(_) => {
264            record.info.clear();
265            record.format.clear();
266            record.genotype.clear();
267            return Ok((rest, ()));
268        }
269    };
270    let (rest, _) = parse_info(rest, &mut record.info)?;
271    let rest = match tag::<_, _, E>(b"\t")(rest) {
272        Ok((rest, _)) => rest,
273        Err(_) => {
274            record.format.clear();
275            record.genotype.clear();
276            return Ok((rest, ()));
277        }
278    };
279    let (rest, _) = parse_separated_values(
280        &mut record.format,
281        rest,
282        is_not(&b"\t\r\n:"[..]),
283        tag(b":"),
284        false,
285    )?;
286    if record.format == [b"."] {
287        record.format.clear();
288    }
289    let rest = match tag::<_, _, E>(b"\t")(rest) {
290        Ok((rest, _)) => rest,
291        Err(_) => {
292            record.genotype.clear();
293            return Ok((rest, ()));
294        }
295    };
296    let (rest, _) = parse_double_nested_separated_values(
297        &mut record.genotype,
298        rest,
299        is_not(&b"\t\r\n:,"[..]),
300        tag(b","),
301        tag(b":"),
302        tag(b"\t"),
303    )?;
304    if record.genotype == [[[b"."]]] {
305        record.genotype.clear();
306    }
307
308    Ok((rest, ()))
309}
310
311fn eof<I, E>(data: I) -> nom::IResult<I, I, E>
312where
313    E: nom::error::ParseError<I>,
314    I: nom::InputLength + nom::InputTake,
315{
316    if data.input_len() == 0 {
317        Ok(data.take_split(0))
318    } else {
319        Err(nom::Err::Failure(nom::error::make_error(
320            data,
321            nom::error::ErrorKind::Eof,
322        )))
323    }
324}
325
326pub fn parse_record<'a, E>(line: &'a [u8], record: &mut VCFRecord) -> nom::IResult<&'a [u8], ()>
327where
328    E: nom::error::ParseError<&'a [u8]>,
329{
330    let (rest, chromosome) = is_not(&b"\t\r\n"[..])(line)?;
331    record.chromosome.clear();
332    record.chromosome.extend_from_slice(chromosome);
333    let (rest, _) = tag(b"\t")(rest)?;
334
335    let (rest, position) = take_while1(is_digit)(rest)?;
336    record.position = str::from_utf8(position).unwrap().parse().unwrap();
337    let (rest, _) = tag(b"\t")(rest)?;
338
339    let (rest, _) = parse_separated_values(
340        &mut record.id,
341        rest,
342        is_not(&b"\t\r\n,"[..]),
343        tag(b","),
344        false,
345    )?;
346    if record.id == [b"."] {
347        record.id.clear();
348    }
349    let (rest, _) = tag(b"\t")(rest)?;
350    let (rest, reference) = is_not(&b"\t\r\n"[..])(rest)?;
351    record.reference.clear();
352    record.reference.extend_from_slice(reference);
353
354    let (rest, _) = tag(b"\t")(rest)?;
355    let (rest, _) = parse_separated_values(
356        &mut record.alternative,
357        rest,
358        is_not(&b"\t\r\n,"[..]),
359        tag(b","),
360        false,
361    )?;
362    if record.alternative == [b"."] {
363        record.alternative.clear();
364    }
365    let (rest, _) = parse_record_optional_columns(rest, record)?;
366    let (rest, _) = alt((tag("\r\n"), tag("\n"), eof))(rest)?;
367
368    record.recreate_info_and_genotype_index();
369
370    Ok((rest, ()))
371}