1use super::*;
2use nom::{
3 self, branch::alt, bytes::complete::is_not, bytes::complete::tag, bytes::complete::take_while,
4 character::is_digit, combinator::eof, combinator::map, combinator::opt, multi::separated_list0,
5 sequence::separated_pair, sequence::tuple,
6};
7use std::str;
8
/// A borrowed `(key, value)` pair of byte slices, as produced for each
/// `key=value` entry by [`parse_header_entries`].
pub type EntryPair<'a> = (&'a [u8], &'a [u8]);
10
11pub fn parse_header_item(header_line: &[u8]) -> VResult<&[u8], VCFHeaderLine> {
12 let line = header_line.to_vec();
13 let (rest, _) = tag("##")(header_line)?;
14 let (rest, contents) = parse_header_content(rest)?;
15 let (rest, _) = alt((tag("\r\n"), tag("\n"), eof))(rest)?;
16 Ok((rest, VCFHeaderLine { line, contents }))
17}
18
19pub fn parse_header_content(header_line_without_sharp: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
20 alt((
21 parse_vcf_file_format_header,
22 parse_vcf_contig_header,
23 parse_vcf_info_header,
24 parse_vcf_format_header,
25 parse_vcf_alt_header,
26 parse_vcf_filter_header,
27 parse_other_header_item,
28 ))(header_line_without_sharp)
29}
30
31pub fn parse_header_entries(value: &[u8]) -> VResult<&[u8], Vec<EntryPair>> {
32 separated_list0(
33 tag(b","),
34 separated_pair(
35 is_not(&b">,= \r\n\t"[..]),
36 tag(b"="),
37 alt((
38 map(tuple((tag(b"\""), is_not(&b"\""[..]), tag(b"\""))), |x| x.1),
39 is_not(&b">, \r\n\t"[..]),
40 )),
41 ),
42 )(value)
43}
44
45pub fn find_key<'a>(entry_pair: &[EntryPair<'a>], key: &[u8]) -> Option<&'a [u8]> {
46 entry_pair.iter().find(|(k, _)| *k == key).map(|(_, v)| *v)
47}
48
49pub fn find_key_or_error<'a>(
50 entry_pair: &[EntryPair<'a>],
51 key: &[u8],
52 error_content: &'a [u8],
53 error_message: &'static str,
54) -> Result<&'a [u8], nom::Err<nom::error::VerboseError<&'a [u8]>>> {
55 find_key(entry_pair, key).ok_or_else(|| {
56 nom::Err::Error(nom::error::VerboseError {
57 errors: vec![(
58 error_content,
59 nom::error::VerboseErrorKind::Context(error_message),
60 )],
61 })
62 })
63}
64
65pub fn parse_number(value: &[u8]) -> Number {
66 match value {
67 b"R" => Number::Reference,
68 b"A" => Number::Allele,
69 b"G" => Number::Genotype,
70 b"0" => Number::Zero,
71 b"." => Number::Unknown,
72 x if x.iter().copied().all(is_digit) => {
73 Number::Number(str::from_utf8(x).unwrap().parse().unwrap())
74 }
75 _ => Number::Other(value.to_vec()),
76 }
77}
78
79pub fn parse_value_type(value: &[u8]) -> ValueType {
80 match value {
81 b"String" => ValueType::String,
82 b"Integer" => ValueType::Integer,
83 b"Flag" => ValueType::Flag,
84 b"Character" => ValueType::Character,
85 b"Float" => ValueType::Float,
86 _ => ValueType::Other(value.to_vec()),
87 }
88}
89
90pub fn parse_vcf_file_format_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
91 let (rest, _) = tag(b"fileformat=")(header_line)?;
92 let (rest, version) = take_while(|x: u8| x != b'\n' && x != b'\r')(rest)?;
93 let parsed_version = match version {
94 b"VCFv4.3" => VCFVersion::Vcf4_3,
95 b"VCFv4.2" => VCFVersion::Vcf4_2,
96 b"VCFv4.1" => VCFVersion::Vcf4_1,
97 b"VCFv4.0" => VCFVersion::Vcf4_0,
98 _ => VCFVersion::Other(version.to_vec()),
99 };
100 Ok((rest, VCFHeaderContent::FileFormat(parsed_version)))
101}
102
103pub fn parse_vcf_contig_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
104 let (rest, _) = tag(b"contig=<")(header_line)?;
105 let (rest, entries) = parse_header_entries(rest)?;
106 let (rest, _) = tag(b">")(rest)?;
107 let id = find_key_or_error(&entries, b"ID", header_line, "No ID tag")?.to_vec();
108 let length = entries
109 .iter()
110 .find(|(k, _)| k == b"length")
111 .map(|(_, v)| str::from_utf8(v).ok())
112 .unwrap_or(Option::None)
113 .map(|x| x.parse::<u64>().ok())
114 .unwrap_or(Option::None);
115 Ok((rest, VCFHeaderContent::Contig { id, length }))
116}
117
118pub fn parse_vcf_info_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
119 let (rest, _) = tag(b"INFO=<")(header_line)?;
120 let (rest, entries) = parse_header_entries(rest)?;
121 let (rest, _) = tag(b">")(rest)?;
122 let id = find_key_or_error(&entries, b"ID", header_line, "No ID tag")?.to_vec();
123 let number = parse_number(find_key_or_error(
124 &entries,
125 b"Number",
126 header_line,
127 "No Number tag",
128 )?);
129 let value_type = parse_value_type(find_key_or_error(
130 &entries,
131 b"Type",
132 header_line,
133 "No Type tag",
134 )?);
135 let description =
136 find_key_or_error(&entries, b"Description", header_line, "No Description tag")?.to_vec();
137 let source = find_key(&entries, b"Source").map(|x| x.to_vec());
138 let version = find_key(&entries, b"Version").map(|x| x.to_vec());
139
140 Ok((
141 rest,
142 VCFHeaderContent::INFO {
143 id,
144 number,
145 value_type,
146 description,
147 source,
148 version,
149 },
150 ))
151}
152
153pub fn parse_vcf_format_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
154 let (rest, _) = tag(b"FORMAT=<")(header_line)?;
155 let (rest, entries) = parse_header_entries(rest)?;
156 let (rest, _) = tag(b">")(rest)?;
157 let id = find_key_or_error(&entries, b"ID", header_line, "No ID tag")?.to_vec();
158 let number = parse_number(find_key_or_error(
159 &entries,
160 b"Number",
161 header_line,
162 "No Number tag",
163 )?);
164 let value_type = parse_value_type(find_key_or_error(
165 &entries,
166 b"Type",
167 header_line,
168 "No Type tag",
169 )?);
170 let description =
171 find_key_or_error(&entries, b"Description", header_line, "No Description tag")?.to_vec();
172 let source = find_key(&entries, b"Source").map(|x| x.to_vec());
173 let version = find_key(&entries, b"Version").map(|x| x.to_vec());
174
175 Ok((
176 rest,
177 VCFHeaderContent::FORMAT {
178 id,
179 number,
180 value_type,
181 description,
182 source,
183 version,
184 },
185 ))
186}
187
188pub fn parse_vcf_filter_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
189 let (rest, _) = tag(b"FILTER=<")(header_line)?;
190 let (rest, entries) = parse_header_entries(rest)?;
191 let (rest, _) = tag(b">")(rest)?;
192 let id = find_key_or_error(&entries, b"ID", header_line, "No ID tag")?.to_vec();
193 let description =
194 find_key_or_error(&entries, b"Description", header_line, "No Description tag")?.to_vec();
195
196 Ok((rest, VCFHeaderContent::FILTER { id, description }))
197}
198
199pub fn parse_vcf_alt_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
200 let (rest, _) = tag(b"ALT=<")(header_line)?;
201 let (rest, entries) = parse_header_entries(rest)?;
202 let (rest, _) = tag(b">")(rest)?;
203 let id = find_key_or_error(&entries, b"ID", header_line, "No ID tag")?.to_vec();
204 let description =
205 find_key_or_error(&entries, b"Description", header_line, "No Description tag")?.to_vec();
206
207 Ok((rest, VCFHeaderContent::ALT { id, description }))
208}
209
210pub fn parse_other_header_item(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
211 let (rest, _) = is_not(&b"\r\n"[..])(header_line)?;
212 Ok((rest, VCFHeaderContent::Other))
213}
214
215pub fn parse_samples(header_line: &[u8]) -> VResult<&[u8], Vec<U8Vec>> {
216 let (rest, data) = tuple((
217 tag(b"#CHROM\tPOS\tID\tREF\tALT"),
218 opt(tuple((
219 tag("\tQUAL"),
220 opt(tuple((
221 tag("\tFILTER"),
222 opt(tuple((
223 tag("\tINFO"),
224 opt(tuple((
225 tag("\tFORMAT"),
226 opt(tuple((
227 tag("\t"),
228 separated_list0(tag("\t"), is_not(&b"\t\r\n"[..])),
229 ))),
230 ))),
231 ))),
232 ))),
233 ))),
234 alt((tag("\r\n"), tag("\n"))),
235 ))(header_line)?;
236
237 let samples: Vec<U8Vec> = data
238 .1
239 .map(|(_, v)| v)
240 .flatten()
241 .map(|(_, v)| v)
242 .flatten()
243 .map(|(_, v)| v)
244 .flatten()
245 .map(|(_, v)| v)
246 .flatten()
247 .map(|(_, v)| v.iter().map(|x| x.to_vec()).collect())
248 .unwrap_or_else(Vec::new);
249
250 Ok((rest, samples))
251}