vcf/header/
parser.rs

1use super::*;
2use nom::{
3    self, branch::alt, bytes::complete::is_not, bytes::complete::tag, bytes::complete::take_while,
4    character::is_digit, combinator::eof, combinator::map, combinator::opt, multi::separated_list0,
5    sequence::separated_pair, sequence::tuple,
6};
7use std::str;
8
/// One `key=value` entry of a structured header line, as `(key, value)` byte slices borrowed
/// from the input that was parsed.
pub type EntryPair<'a> = (&'a [u8], &'a [u8]);
10
11pub fn parse_header_item(header_line: &[u8]) -> VResult<&[u8], VCFHeaderLine> {
12    let line = header_line.to_vec();
13    let (rest, _) = tag("##")(header_line)?;
14    let (rest, contents) = parse_header_content(rest)?;
15    let (rest, _) = alt((tag("\r\n"), tag("\n"), eof))(rest)?;
16    Ok((rest, VCFHeaderLine { line, contents }))
17}
18
19pub fn parse_header_content(header_line_without_sharp: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
20    alt((
21        parse_vcf_file_format_header,
22        parse_vcf_contig_header,
23        parse_vcf_info_header,
24        parse_vcf_format_header,
25        parse_vcf_alt_header,
26        parse_vcf_filter_header,
27        parse_other_header_item,
28    ))(header_line_without_sharp)
29}
30
31pub fn parse_header_entries(value: &[u8]) -> VResult<&[u8], Vec<EntryPair>> {
32    separated_list0(
33        tag(b","),
34        separated_pair(
35            is_not(&b">,= \r\n\t"[..]),
36            tag(b"="),
37            alt((
38                map(tuple((tag(b"\""), is_not(&b"\""[..]), tag(b"\""))), |x| x.1),
39                is_not(&b">, \r\n\t"[..]),
40            )),
41        ),
42    )(value)
43}
44
45pub fn find_key<'a>(entry_pair: &[EntryPair<'a>], key: &[u8]) -> Option<&'a [u8]> {
46    entry_pair.iter().find(|(k, _)| *k == key).map(|(_, v)| *v)
47}
48
49pub fn find_key_or_error<'a>(
50    entry_pair: &[EntryPair<'a>],
51    key: &[u8],
52    error_content: &'a [u8],
53    error_message: &'static str,
54) -> Result<&'a [u8], nom::Err<nom::error::VerboseError<&'a [u8]>>> {
55    find_key(entry_pair, key).ok_or_else(|| {
56        nom::Err::Error(nom::error::VerboseError {
57            errors: vec![(
58                error_content,
59                nom::error::VerboseErrorKind::Context(error_message),
60            )],
61        })
62    })
63}
64
65pub fn parse_number(value: &[u8]) -> Number {
66    match value {
67        b"R" => Number::Reference,
68        b"A" => Number::Allele,
69        b"G" => Number::Genotype,
70        b"0" => Number::Zero,
71        b"." => Number::Unknown,
72        x if x.iter().copied().all(is_digit) => {
73            Number::Number(str::from_utf8(x).unwrap().parse().unwrap())
74        }
75        _ => Number::Other(value.to_vec()),
76    }
77}
78
79pub fn parse_value_type(value: &[u8]) -> ValueType {
80    match value {
81        b"String" => ValueType::String,
82        b"Integer" => ValueType::Integer,
83        b"Flag" => ValueType::Flag,
84        b"Character" => ValueType::Character,
85        b"Float" => ValueType::Float,
86        _ => ValueType::Other(value.to_vec()),
87    }
88}
89
90pub fn parse_vcf_file_format_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
91    let (rest, _) = tag(b"fileformat=")(header_line)?;
92    let (rest, version) = take_while(|x: u8| x != b'\n' && x != b'\r')(rest)?;
93    let parsed_version = match version {
94        b"VCFv4.3" => VCFVersion::Vcf4_3,
95        b"VCFv4.2" => VCFVersion::Vcf4_2,
96        b"VCFv4.1" => VCFVersion::Vcf4_1,
97        b"VCFv4.0" => VCFVersion::Vcf4_0,
98        _ => VCFVersion::Other(version.to_vec()),
99    };
100    Ok((rest, VCFHeaderContent::FileFormat(parsed_version)))
101}
102
103pub fn parse_vcf_contig_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
104    let (rest, _) = tag(b"contig=<")(header_line)?;
105    let (rest, entries) = parse_header_entries(rest)?;
106    let (rest, _) = tag(b">")(rest)?;
107    let id = find_key_or_error(&entries, b"ID", header_line, "No ID tag")?.to_vec();
108    let length = entries
109        .iter()
110        .find(|(k, _)| k == b"length")
111        .map(|(_, v)| str::from_utf8(v).ok())
112        .unwrap_or(Option::None)
113        .map(|x| x.parse::<u64>().ok())
114        .unwrap_or(Option::None);
115    Ok((rest, VCFHeaderContent::Contig { id, length }))
116}
117
118pub fn parse_vcf_info_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
119    let (rest, _) = tag(b"INFO=<")(header_line)?;
120    let (rest, entries) = parse_header_entries(rest)?;
121    let (rest, _) = tag(b">")(rest)?;
122    let id = find_key_or_error(&entries, b"ID", header_line, "No ID tag")?.to_vec();
123    let number = parse_number(find_key_or_error(
124        &entries,
125        b"Number",
126        header_line,
127        "No Number tag",
128    )?);
129    let value_type = parse_value_type(find_key_or_error(
130        &entries,
131        b"Type",
132        header_line,
133        "No Type tag",
134    )?);
135    let description =
136        find_key_or_error(&entries, b"Description", header_line, "No Description tag")?.to_vec();
137    let source = find_key(&entries, b"Source").map(|x| x.to_vec());
138    let version = find_key(&entries, b"Version").map(|x| x.to_vec());
139
140    Ok((
141        rest,
142        VCFHeaderContent::INFO {
143            id,
144            number,
145            value_type,
146            description,
147            source,
148            version,
149        },
150    ))
151}
152
153pub fn parse_vcf_format_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
154    let (rest, _) = tag(b"FORMAT=<")(header_line)?;
155    let (rest, entries) = parse_header_entries(rest)?;
156    let (rest, _) = tag(b">")(rest)?;
157    let id = find_key_or_error(&entries, b"ID", header_line, "No ID tag")?.to_vec();
158    let number = parse_number(find_key_or_error(
159        &entries,
160        b"Number",
161        header_line,
162        "No Number tag",
163    )?);
164    let value_type = parse_value_type(find_key_or_error(
165        &entries,
166        b"Type",
167        header_line,
168        "No Type tag",
169    )?);
170    let description =
171        find_key_or_error(&entries, b"Description", header_line, "No Description tag")?.to_vec();
172    let source = find_key(&entries, b"Source").map(|x| x.to_vec());
173    let version = find_key(&entries, b"Version").map(|x| x.to_vec());
174
175    Ok((
176        rest,
177        VCFHeaderContent::FORMAT {
178            id,
179            number,
180            value_type,
181            description,
182            source,
183            version,
184        },
185    ))
186}
187
188pub fn parse_vcf_filter_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
189    let (rest, _) = tag(b"FILTER=<")(header_line)?;
190    let (rest, entries) = parse_header_entries(rest)?;
191    let (rest, _) = tag(b">")(rest)?;
192    let id = find_key_or_error(&entries, b"ID", header_line, "No ID tag")?.to_vec();
193    let description =
194        find_key_or_error(&entries, b"Description", header_line, "No Description tag")?.to_vec();
195
196    Ok((rest, VCFHeaderContent::FILTER { id, description }))
197}
198
199pub fn parse_vcf_alt_header(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
200    let (rest, _) = tag(b"ALT=<")(header_line)?;
201    let (rest, entries) = parse_header_entries(rest)?;
202    let (rest, _) = tag(b">")(rest)?;
203    let id = find_key_or_error(&entries, b"ID", header_line, "No ID tag")?.to_vec();
204    let description =
205        find_key_or_error(&entries, b"Description", header_line, "No Description tag")?.to_vec();
206
207    Ok((rest, VCFHeaderContent::ALT { id, description }))
208}
209
210pub fn parse_other_header_item(header_line: &[u8]) -> VResult<&[u8], VCFHeaderContent> {
211    let (rest, _) = is_not(&b"\r\n"[..])(header_line)?;
212    Ok((rest, VCFHeaderContent::Other))
213}
214
215pub fn parse_samples(header_line: &[u8]) -> VResult<&[u8], Vec<U8Vec>> {
216    let (rest, data) = tuple((
217        tag(b"#CHROM\tPOS\tID\tREF\tALT"),
218        opt(tuple((
219            tag("\tQUAL"),
220            opt(tuple((
221                tag("\tFILTER"),
222                opt(tuple((
223                    tag("\tINFO"),
224                    opt(tuple((
225                        tag("\tFORMAT"),
226                        opt(tuple((
227                            tag("\t"),
228                            separated_list0(tag("\t"), is_not(&b"\t\r\n"[..])),
229                        ))),
230                    ))),
231                ))),
232            ))),
233        ))),
234        alt((tag("\r\n"), tag("\n"))),
235    ))(header_line)?;
236
237    let samples: Vec<U8Vec> = data
238        .1
239        .map(|(_, v)| v)
240        .flatten()
241        .map(|(_, v)| v)
242        .flatten()
243        .map(|(_, v)| v)
244        .flatten()
245        .map(|(_, v)| v)
246        .flatten()
247        .map(|(_, v)| v.iter().map(|x| x.to_vec()).collect())
248        .unwrap_or_else(Vec::new);
249
250    Ok((rest, samples))
251}