1use super::VCFRecord;
2use crate::U8Vec;
3use nom::{
4 self, branch::alt, bytes::complete::is_not, bytes::complete::tag, bytes::complete::take_while1,
5 character::is_digit, combinator::opt, combinator::recognize, sequence::tuple,
6};
7use once_cell::sync::Lazy;
8use std::str;
9
10pub fn parse_separated_values<'a, U, F, G, E>(
11 result: &mut Vec<U8Vec>,
12 input: &'a [u8],
13 data: F,
14 separator: G,
15 require_one_entry: bool,
16) -> nom::IResult<&'a [u8], (), E>
17where
18 F: Fn(&'a [u8]) -> nom::IResult<&'a [u8], &'a [u8], E>,
19 G: Fn(&'a [u8]) -> nom::IResult<&'a [u8], U, E>,
20 E: nom::error::ParseError<&'a [u8]>,
21{
22 let mut index = 0;
23 let mut rest = input;
24
25 loop {
26 if let Ok((r, d)) = data(rest) {
27 if index < result.len() {
28 result[index].clear();
29 result[index].extend_from_slice(d);
30 } else {
31 result.push(d.to_vec());
32 }
33 index += 1;
34 rest = r;
35 }
36 if let Ok((r, _)) = separator(rest) {
37 rest = r;
38 continue;
39 }
40 if index == 0 && require_one_entry {
41 return Err(nom::Err::Error(nom::error::make_error(
42 input,
43 nom::error::ErrorKind::SeparatedNonEmptyList,
44 )));
45 }
46 if index <= result.len() {
47 result.drain(index..);
48 }
50 return Ok((rest, ()));
51 }
52}
53
54pub fn parse_nested_separated_values<'a, U, V, F, G, H, E>(
55 result: &mut Vec<Vec<U8Vec>>,
56 input: &'a [u8],
57 data: F,
58 separator_inside: H,
59 separator_outside: G,
60 require_one_entry: bool,
61) -> nom::IResult<&'a [u8], (), E>
62where
63 F: Fn(&'a [u8]) -> nom::IResult<&'a [u8], &'a [u8], E>,
64 G: Fn(&'a [u8]) -> nom::IResult<&'a [u8], U, E>,
65 H: Fn(&'a [u8]) -> nom::IResult<&'a [u8], V, E>,
66 E: nom::error::ParseError<&'a [u8]>,
67{
68 let mut index = 0;
69 let mut rest = input;
70
71 loop {
72 while result.len() <= index {
73 result.push(Vec::new());
74 }
75 if let Ok((r, _)) =
76 parse_separated_values(&mut result[index], rest, &data, &separator_inside, true)
77 {
78 index += 1;
79 rest = r;
80 }
81 if let Ok((r, _)) = separator_outside(rest) {
82 rest = r;
83 continue;
84 }
85 if index == 0 && require_one_entry {
86 return Err(nom::Err::Error(nom::error::make_error(
87 input,
88 nom::error::ErrorKind::SeparatedNonEmptyList,
89 )));
90 }
91 if index <= result.len() {
92 result.drain(index..);
93 }
95 return Ok((rest, ()));
96 }
97}
98
99pub fn parse_double_nested_separated_values<'a, U, V, W, F, G, H, I, E>(
100 result: &mut Vec<Vec<Vec<U8Vec>>>,
101 input: &'a [u8],
102 data: F,
103 separator_inside: H,
104 separator_outside: G,
105 separator_outside2: I,
106) -> nom::IResult<&'a [u8], (), E>
107where
108 F: Fn(&'a [u8]) -> nom::IResult<&'a [u8], &'a [u8], E>,
109 G: Fn(&'a [u8]) -> nom::IResult<&'a [u8], U, E>,
110 H: Fn(&'a [u8]) -> nom::IResult<&'a [u8], V, E>,
111 I: Fn(&'a [u8]) -> nom::IResult<&'a [u8], W, E>,
112 E: nom::error::ParseError<&'a [u8]>,
113{
114 let mut index = 0;
115 let mut rest = input;
116
117 loop {
118 while result.len() <= index {
119 result.push(Vec::new());
120 }
121 if let Ok((r, _)) = parse_nested_separated_values(
122 &mut result[index],
123 rest,
124 &data,
125 &separator_inside,
126 &separator_outside,
127 false,
128 ) {
129 index += 1;
130 rest = r;
131 }
132 if let Ok((r, _)) = separator_outside2(rest) {
133 rest = r;
134 continue;
135 }
136 if index <= result.len() {
137 result.drain(index..);
138 }
140 return Ok((rest, ()));
141 }
142}
143
144static EMPTY_INFO: Lazy<Vec<(U8Vec, Vec<U8Vec>)>> = Lazy::new(|| vec![(b".".to_vec(), vec![])]);
145
146pub fn parse_info<'a, E>(
147 input: &'a [u8],
148 info: &mut Vec<(U8Vec, Vec<U8Vec>)>,
149) -> nom::IResult<&'a [u8], (), E>
150where
151 E: nom::error::ParseError<&'a [u8]>,
152{
153 let mut index = 0;
154 let mut rest = input;
155
156 while let Ok((r, key)) = is_not::<_, _, E>(&b"\t\r\n=;"[..])(rest) {
157 if info.len() <= index {
158 info.push((key.to_vec(), Vec::new()));
159 } else {
160 info[index].0.clear();
161 info[index].0.extend_from_slice(key);
162 }
163
164 if let Ok((r, _)) = tag::<_, _, E>(b"=")(r) {
165 let (r, _) = parse_separated_values(
166 &mut info[index].1,
167 r,
168 is_not(&b"\t\r\n,;"[..]),
169 tag(b","),
170 false,
171 )?;
172 rest = r;
173 } else {
174 info[index].1.clear();
175 rest = r;
176 }
177 index += 1;
178
179 if let Ok((r, _)) = tag::<_, _, E>(b";")(rest) {
180 rest = r;
181 } else {
182 break;
184 }
185 }
186
187 if index <= info.len() {
188 info.drain(index..);
189 }
191
192 if info == &*EMPTY_INFO {
193 info.clear();
194 }
195
196 Ok((rest, ()))
197}
198
199fn parse_float<'a, E>(data: &'a [u8]) -> nom::IResult<&'a [u8], &'a [u8], E>
200where
201 E: nom::error::ParseError<&'a [u8]>,
202{
203 alt((
204 tag(b"."),
205 recognize(tuple((
206 take_while1(is_digit),
207 opt(tuple((tag(b"."), opt(take_while1(is_digit))))),
208 opt(tuple((
209 alt((tag(b"e"), tag(b"E"))),
210 opt(alt((tag("+"), tag("-")))),
211 take_while1(is_digit),
212 ))),
213 ))),
214 ))(data)
215}
216
217fn parse_record_optional_columns<'a, E>(
218 rest: &'a [u8],
219 record: &mut VCFRecord,
220) -> nom::IResult<&'a [u8], (), E>
221where
222 E: nom::error::ParseError<&'a [u8]>,
223{
224 let rest = match tag::<_, _, E>(b"\t")(rest) {
225 Ok((rest, _)) => rest,
226 Err(_) => {
227 record.qual = None;
228 record.filter.clear();
229 record.info.clear();
230 record.format.clear();
231 record.genotype.clear();
232 return Ok((rest, ()));
233 }
234 };
235 let (rest, qual) = parse_float(rest)?;
236 if qual == b"." {
237 record.qual = None;
238 } else {
239 record.qual = Some(str::from_utf8(qual).unwrap().parse().unwrap());
240 }
241 let rest = match tag::<_, _, E>(b"\t")(rest) {
242 Ok((rest, _)) => rest,
243 Err(_) => {
244 record.filter.clear();
245 record.info.clear();
246 record.format.clear();
247 record.genotype.clear();
248 return Ok((rest, ()));
249 }
250 };
251 let (rest, _) = parse_separated_values(
252 &mut record.filter,
253 rest,
254 is_not(&b"\t\r\n,"[..]),
255 tag(b","),
256 false,
257 )?;
258 if record.filter == [b"."] {
259 record.filter.clear();
260 }
261 let rest = match tag::<_, _, E>(b"\t")(rest) {
262 Ok((rest, _)) => rest,
263 Err(_) => {
264 record.info.clear();
265 record.format.clear();
266 record.genotype.clear();
267 return Ok((rest, ()));
268 }
269 };
270 let (rest, _) = parse_info(rest, &mut record.info)?;
271 let rest = match tag::<_, _, E>(b"\t")(rest) {
272 Ok((rest, _)) => rest,
273 Err(_) => {
274 record.format.clear();
275 record.genotype.clear();
276 return Ok((rest, ()));
277 }
278 };
279 let (rest, _) = parse_separated_values(
280 &mut record.format,
281 rest,
282 is_not(&b"\t\r\n:"[..]),
283 tag(b":"),
284 false,
285 )?;
286 if record.format == [b"."] {
287 record.format.clear();
288 }
289 let rest = match tag::<_, _, E>(b"\t")(rest) {
290 Ok((rest, _)) => rest,
291 Err(_) => {
292 record.genotype.clear();
293 return Ok((rest, ()));
294 }
295 };
296 let (rest, _) = parse_double_nested_separated_values(
297 &mut record.genotype,
298 rest,
299 is_not(&b"\t\r\n:,"[..]),
300 tag(b","),
301 tag(b":"),
302 tag(b"\t"),
303 )?;
304 if record.genotype == [[[b"."]]] {
305 record.genotype.clear();
306 }
307
308 Ok((rest, ()))
309}
310
311fn eof<I, E>(data: I) -> nom::IResult<I, I, E>
312where
313 E: nom::error::ParseError<I>,
314 I: nom::InputLength + nom::InputTake,
315{
316 if data.input_len() == 0 {
317 Ok(data.take_split(0))
318 } else {
319 Err(nom::Err::Failure(nom::error::make_error(
320 data,
321 nom::error::ErrorKind::Eof,
322 )))
323 }
324}
325
326pub fn parse_record<'a, E>(line: &'a [u8], record: &mut VCFRecord) -> nom::IResult<&'a [u8], ()>
327where
328 E: nom::error::ParseError<&'a [u8]>,
329{
330 let (rest, chromosome) = is_not(&b"\t\r\n"[..])(line)?;
331 record.chromosome.clear();
332 record.chromosome.extend_from_slice(chromosome);
333 let (rest, _) = tag(b"\t")(rest)?;
334
335 let (rest, position) = take_while1(is_digit)(rest)?;
336 record.position = str::from_utf8(position).unwrap().parse().unwrap();
337 let (rest, _) = tag(b"\t")(rest)?;
338
339 let (rest, _) = parse_separated_values(
340 &mut record.id,
341 rest,
342 is_not(&b"\t\r\n,"[..]),
343 tag(b","),
344 false,
345 )?;
346 if record.id == [b"."] {
347 record.id.clear();
348 }
349 let (rest, _) = tag(b"\t")(rest)?;
350 let (rest, reference) = is_not(&b"\t\r\n"[..])(rest)?;
351 record.reference.clear();
352 record.reference.extend_from_slice(reference);
353
354 let (rest, _) = tag(b"\t")(rest)?;
355 let (rest, _) = parse_separated_values(
356 &mut record.alternative,
357 rest,
358 is_not(&b"\t\r\n,"[..]),
359 tag(b","),
360 false,
361 )?;
362 if record.alternative == [b"."] {
363 record.alternative.clear();
364 }
365 let (rest, _) = parse_record_optional_columns(rest, record)?;
366 let (rest, _) = alt((tag("\r\n"), tag("\n"), eof))(rest)?;
367
368 record.recreate_info_and_genotype_index();
369
370 Ok((rest, ()))
371}