noodles_vcf/header/
parser.rs

1//! VCF header parser.
2
3mod builder;
4mod entry;
5mod file_format_option;
6pub(super) mod record;
7
8use std::{error, str};
9
10use indexmap::IndexMap;
11
12pub(super) use self::record::parse_record;
13pub use self::{builder::Builder, entry::Entry, file_format_option::FileFormatOption};
14use super::{
15    file_format::FileFormat,
16    record::value::{
17        map::{AlternativeAllele, Contig, Filter, Format, Info},
18        Map,
19    },
20    AlternativeAlleles, Contigs, Filters, Formats, Header, Infos, OtherRecords, Record,
21    SampleNames, StringMaps,
22};
23
/// Tracks how far through the header the parser has progressed.
#[derive(Debug, Default, PartialEq, Eq)]
enum State {
    /// Nothing has been consumed yet; the next line is parsed as the file format.
    #[default]
    Empty,
    /// The file format has been read; records or the `#CHROM` line may follow.
    Ready,
    /// The `#CHROM` line has been read; any further input is rejected.
    Done,
}
31
32/// A VCF header parser.
33#[derive(Debug, Default, Eq, PartialEq)]
34pub struct Parser {
35    file_format_option: FileFormatOption,
36    state: State,
37    file_format: FileFormat,
38    infos: Infos,
39    filters: Filters,
40    formats: Formats,
41    alternative_alleles: AlternativeAlleles,
42    contigs: Contigs,
43    sample_names: SampleNames,
44    other_records: OtherRecords,
45}
46
47impl Parser {
48    /// Creates a VCF header parser builder.
49    pub fn builder() -> Builder {
50        Builder::default()
51    }
52
53    /// Parses a raw VCF header.
54    pub fn parse(&self, s: &str) -> Result<Header, ParseError> {
55        let mut parser = Self::default();
56
57        for line in s.lines() {
58            parser.parse_partial(line.as_bytes())?;
59        }
60
61        parser.finish()
62    }
63
64    /// Parses and adds a raw record to the header.
65    pub fn parse_partial(&mut self, src: &[u8]) -> Result<Entry<'_>, ParseError> {
66        if self.state == State::Done {
67            return Err(ParseError::ExpectedEof);
68        }
69
70        if self.state == State::Empty {
71            let file_format = match parse_file_format(src) {
72                Ok(f) => match self.file_format_option {
73                    FileFormatOption::Auto => f,
74                    FileFormatOption::FileFormat(g) => g,
75                },
76                Err(e) => return Err(e),
77            };
78
79            self.file_format = file_format;
80            self.state = State::Ready;
81
82            return Ok(Entry::FileFormat(file_format));
83        }
84
85        if src.starts_with(b"#CHROM") {
86            parse_header(src, &mut self.sample_names)?;
87            self.state = State::Done;
88            return Ok(Entry::Header);
89        }
90
91        let record = parse_record(src, self.file_format).map_err(ParseError::InvalidRecord)?;
92
93        match record {
94            Record::FileFormat(_) => Err(ParseError::UnexpectedFileFormat),
95            Record::Info(id, info) => try_insert_info(&mut self.infos, id, info),
96            Record::Filter(id, filter) => try_insert_filter(&mut self.filters, id, filter),
97            Record::Format(id, format) => try_insert_format(&mut self.formats, id, format),
98            Record::AlternativeAllele(id, alternative_allele) => {
99                try_insert_alternative_allele(&mut self.alternative_alleles, id, alternative_allele)
100            }
101            Record::Contig(id, contig) => try_insert_contig(&mut self.contigs, id, contig),
102            Record::Other(key, value) => insert_other_record(&mut self.other_records, key, value),
103        }
104    }
105
106    /// Builds the VCF header.
107    pub fn finish(self) -> Result<Header, ParseError> {
108        match self.state {
109            State::Empty => Err(ParseError::Empty),
110            State::Ready => Err(ParseError::MissingHeader),
111            State::Done => Ok(Header {
112                file_format: self.file_format,
113                infos: self.infos,
114                filters: self.filters,
115                formats: self.formats,
116                alternative_alleles: self.alternative_alleles,
117                contigs: self.contigs,
118                sample_names: self.sample_names,
119                other_records: self.other_records,
120                string_maps: StringMaps::default(),
121            }),
122        }
123    }
124}
125
126/// An error returned when a raw VCF header fails to parse.
127#[derive(Clone, Debug, Eq, PartialEq)]
128pub enum ParseError {
129    /// The input is empty.
130    Empty,
131    /// The input contains invalid UTF-8.
132    InvalidUtf8(str::Utf8Error),
133    /// The file format (`fileformat`) is missing.
134    MissingFileFormat,
135    /// The file format (`fileformat`) appears other than the first line.
136    UnexpectedFileFormat,
137    /// A record is invalid.
138    InvalidRecord(record::ParseError),
139    /// An info ID is duplicated.
140    DuplicateInfoId(String),
141    /// A filter ID is duplicated.
142    DuplicateFilterId(String),
143    /// A format ID is duplicated.
144    DuplicateFormatId(String),
145    /// An alternative allele ID is duplicated.
146    DuplicateAlternativeAlleleId(String),
147    /// A contig ID is duplicated.
148    DuplicateContigId(String),
149    /// A record has an invalid value.
150    InvalidRecordValue(super::record::value::collection::AddError),
151    /// The header is missing.
152    MissingHeader,
153    /// The header is invalid.
154    InvalidHeader(String, String),
155    /// A sample name is duplicated.
156    ///
157    /// ยง 1.5 Header line syntax (2021-01-13): "Duplicate sample IDs are not allowed."
158    DuplicateSampleName(String),
159    /// More data unexpectedly appears after the header header (`#CHROM`...).
160    ExpectedEof,
161    /// The position of the entry in the string match does not match the absolute position defined
162    /// by the `IDX` field of a record.
163    StringMapPositionMismatch((usize, String), (usize, String)),
164}
165
166impl error::Error for ParseError {
167    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
168        match self {
169            Self::InvalidUtf8(e) => Some(e),
170            Self::InvalidRecord(e) => Some(e),
171            Self::InvalidRecordValue(e) => Some(e),
172            _ => None,
173        }
174    }
175}
176
177impl std::fmt::Display for ParseError {
178    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
179        match self {
180            Self::Empty => f.write_str("empty input"),
181            Self::InvalidUtf8(_) => f.write_str("invalid UTF-8"),
182            Self::MissingFileFormat => f.write_str("missing fileformat"),
183            Self::UnexpectedFileFormat => f.write_str("unexpected file format"),
184            Self::InvalidRecord(_) => f.write_str("invalid record"),
185            Self::DuplicateInfoId(id) => write!(f, "duplicate INFO ID: {id}"),
186            Self::DuplicateFilterId(id) => write!(f, "duplicate FILTER ID: {id}"),
187            Self::DuplicateFormatId(id) => write!(f, "duplicate FORMAT ID: {id}"),
188            Self::DuplicateAlternativeAlleleId(id) => write!(f, "duplicate ALT ID: {id}"),
189            Self::DuplicateContigId(id) => write!(f, "duplicate contig ID: {id}"),
190            Self::InvalidRecordValue(_) => f.write_str("invalid record value"),
191            Self::MissingHeader => f.write_str("missing header"),
192            Self::InvalidHeader(actual, expected) => {
193                write!(f, "invalid header: expected {expected}, got {actual}")
194            }
195            Self::DuplicateSampleName(sample_name) => {
196                write!(f, "duplicate sample name: {sample_name}")
197            }
198            Self::ExpectedEof => f.write_str("expected EOF"),
199            Self::StringMapPositionMismatch(actual, expected) => write!(
200                f,
201                "string map position mismatch: expected {} (IDX={}), got {} (IDX={})",
202                expected.1, expected.0, actual.1, actual.0,
203            ),
204        }
205    }
206}
207
208fn parse_file_format(src: &[u8]) -> Result<FileFormat, ParseError> {
209    let record = parse_record(src, FileFormat::default()).map_err(ParseError::InvalidRecord)?;
210
211    match record {
212        Record::FileFormat(file_format) => Ok(file_format),
213        _ => Err(ParseError::MissingFileFormat),
214    }
215}
216
217fn try_insert_info(
218    infos: &mut Infos,
219    id: String,
220    info: Map<Info>,
221) -> Result<Entry<'_>, ParseError> {
222    use indexmap::map::Entry;
223
224    match infos.entry(id) {
225        Entry::Vacant(entry) => {
226            let i = entry.index();
227
228            entry.insert(info);
229
230            // SAFETY: The entry was inserted at `i`.
231            Ok(infos
232                .get_index(i)
233                .map(|(k, v)| self::Entry::Info(k, v))
234                .unwrap())
235        }
236        Entry::Occupied(entry) => Err(ParseError::DuplicateInfoId(entry.key().into())),
237    }
238}
239
240fn try_insert_filter(
241    filters: &mut Filters,
242    id: String,
243    filter: Map<Filter>,
244) -> Result<Entry<'_>, ParseError> {
245    use indexmap::map::Entry;
246
247    match filters.entry(id) {
248        Entry::Vacant(entry) => {
249            let i = entry.index();
250
251            entry.insert(filter);
252
253            // SAFETY: The entry was inserted at `i`.
254            Ok(filters
255                .get_index(i)
256                .map(|(k, v)| self::Entry::Filter(k, v))
257                .unwrap())
258        }
259        Entry::Occupied(entry) => Err(ParseError::DuplicateFilterId(entry.key().into())),
260    }
261}
262
263fn try_insert_format(
264    formats: &mut Formats,
265    id: String,
266    format: Map<Format>,
267) -> Result<Entry<'_>, ParseError> {
268    use indexmap::map::Entry;
269
270    match formats.entry(id) {
271        Entry::Vacant(entry) => {
272            let i = entry.index();
273
274            entry.insert(format);
275
276            // SAFETY: The entry was inserted at `i`.
277            Ok(formats
278                .get_index(i)
279                .map(|(k, v)| self::Entry::Format(k, v))
280                .unwrap())
281        }
282        Entry::Occupied(entry) => Err(ParseError::DuplicateFormatId(entry.key().into())),
283    }
284}
285
286fn try_insert_alternative_allele(
287    alternative_alleles: &mut AlternativeAlleles,
288    id: String,
289    alternative_allele: Map<AlternativeAllele>,
290) -> Result<Entry<'_>, ParseError> {
291    use indexmap::map::Entry;
292
293    match alternative_alleles.entry(id) {
294        Entry::Vacant(entry) => {
295            let i = entry.index();
296
297            entry.insert(alternative_allele);
298
299            // SAFETY: The entry was inserted at `i`.
300            Ok(alternative_alleles
301                .get_index(i)
302                .map(|(k, v)| self::Entry::AlternativeAllele(k, v))
303                .unwrap())
304        }
305        Entry::Occupied(entry) => Err(ParseError::DuplicateAlternativeAlleleId(entry.key().into())),
306    }
307}
308
309fn try_insert_contig(
310    contigs: &mut Contigs,
311    id: String,
312    contig: Map<Contig>,
313) -> Result<Entry<'_>, ParseError> {
314    use indexmap::map::Entry;
315
316    match contigs.entry(id) {
317        Entry::Vacant(entry) => {
318            let i = entry.index();
319
320            entry.insert(contig);
321
322            // SAFETY: The entry was inserted at `i`.
323            Ok(contigs
324                .get_index(i)
325                .map(|(k, v)| self::Entry::Contig(k, v))
326                .unwrap())
327        }
328        Entry::Occupied(entry) => Err(ParseError::DuplicateContigId(entry.key().into())),
329    }
330}
331
332fn insert_other_record(
333    other_records: &mut OtherRecords,
334    key: super::record::key::Other,
335    value: super::record::Value,
336) -> Result<Entry<'_>, ParseError> {
337    let collection = other_records.entry(key).or_insert_with(|| match value {
338        super::record::Value::String(_) => {
339            super::record::value::Collection::Unstructured(Vec::new())
340        }
341        super::record::Value::Map(..) => {
342            super::record::value::Collection::Structured(IndexMap::new())
343        }
344    });
345
346    collection
347        .add(value)
348        .map_err(ParseError::InvalidRecordValue)?;
349
350    Ok(Entry::Other)
351}
352
353fn parse_header(src: &[u8], sample_names: &mut SampleNames) -> Result<(), ParseError> {
354    static HEADERS: &[&str] = &[
355        "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO",
356    ];
357    static FORMAT_HEADER: &str = "FORMAT";
358
359    const DELIMITER: char = '\t';
360
361    let line = str::from_utf8(src).map_err(ParseError::InvalidUtf8)?;
362    let mut fields = line.split(DELIMITER);
363
364    for &expected in HEADERS.iter() {
365        if let Some(actual) = fields.next() {
366            if actual != expected {
367                return Err(ParseError::InvalidHeader(actual.into(), expected.into()));
368            }
369        } else {
370            return Err(ParseError::InvalidHeader(String::from(""), expected.into()));
371        }
372    }
373
374    if let Some(field) = fields.next() {
375        if field != FORMAT_HEADER {
376            return Err(ParseError::InvalidHeader(
377                field.into(),
378                FORMAT_HEADER.into(),
379            ));
380        }
381
382        for sample_name in fields {
383            if !sample_names.insert(sample_name.into()) {
384                return Err(ParseError::DuplicateSampleName(sample_name.into()));
385            }
386        }
387    }
388
389    Ok(())
390}
391
392#[cfg(test)]
393mod tests {
394    use super::*;
395
396    #[test]
397    fn test_from_str() -> Result<(), Box<dyn std::error::Error>> {
398        use crate::{
399            header::record::{value::map::Other, Value},
400            variant::record::{info, samples},
401        };
402
403        let s = r#"##fileformat=VCFv4.3
404##fileDate=20200506
405##source=noodles-vcf
406##contig=<ID=sq0,length=8>
407##contig=<ID=sq1,length=13>
408##contig=<ID=sq2,length=21>
409##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
410##FILTER=<ID=q10,Description="Quality below 10">
411##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
412##ALT=<ID=DEL,Description="Deletion">
413##META=<ID=Assay,Type=String,Number=.,Values=[WholeGenome, Exome]>
414##SAMPLE=<ID=sample0,Assay=WholeGenome>
415##PEDIGREE=<ID=cid,Father=fid,Mother=mid>
416#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample0
417"#;
418
419        let actual = Parser::default().parse(s)?;
420
421        let expected = Header::builder()
422            .set_file_format(FileFormat::new(4, 3))
423            .insert("fileDate".parse()?, Value::String(String::from("20200506")))?
424            .insert(
425                "source".parse()?,
426                Value::String(String::from("noodles-vcf")),
427            )?
428            .add_contig("sq0", Map::<Contig>::builder().set_length(8).build()?)
429            .add_contig("sq1", Map::<Contig>::builder().set_length(13).build()?)
430            .add_contig("sq2", Map::<Contig>::builder().set_length(21).build()?)
431            .add_info(
432                info::field::key::SAMPLES_WITH_DATA_COUNT,
433                Map::<Info>::from(info::field::key::SAMPLES_WITH_DATA_COUNT),
434            )
435            .add_filter("q10", Map::<Filter>::new("Quality below 10"))
436            .add_format(
437                samples::keys::key::GENOTYPE,
438                Map::<Format>::from(samples::keys::key::GENOTYPE),
439            )
440            .add_alternative_allele("DEL", Map::<AlternativeAllele>::new("Deletion"))
441            .insert(
442                "META".parse()?,
443                Value::Map(
444                    String::from("Assay"),
445                    Map::<Other>::builder()
446                        .insert("Type".parse()?, "String")
447                        .insert("Number".parse()?, ".")
448                        .insert("Values".parse()?, "[WholeGenome, Exome]")
449                        .build()?,
450                ),
451            )?
452            .insert(
453                "SAMPLE".parse()?,
454                Value::Map(
455                    String::from("sample0"),
456                    Map::<Other>::builder()
457                        .insert("Assay".parse()?, "WholeGenome")
458                        .build()?,
459                ),
460            )?
461            .insert(
462                "PEDIGREE".parse()?,
463                Value::Map(
464                    String::from("cid"),
465                    Map::<Other>::builder()
466                        .insert("Father".parse()?, "fid")
467                        .insert("Mother".parse()?, "mid")
468                        .build()?,
469                ),
470            )?
471            .add_sample_name("sample0")
472            .build();
473
474        assert_eq!(actual, expected);
475
476        Ok(())
477    }
478
479    #[test]
480    fn test_from_str_without_file_format() {
481        let s = r#"##ALT=<ID=DEL,Description="Deletion">
482"#;
483
484        assert_eq!(
485            Parser::default().parse(s),
486            Err(ParseError::MissingFileFormat)
487        );
488    }
489
490    #[test]
491    fn test_from_str_with_data_after_header() {
492        let s = r#"##fileformat=VCFv4.3
493#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
494##contig=<ID=sq0,length=8>
495"#;
496
497        assert_eq!(Parser::default().parse(s), Err(ParseError::ExpectedEof));
498    }
499
500    #[test]
501    fn test_from_str_with_multiple_fileformats() {
502        let s = "\
503##fileformat=VCFv4.3
504##fileformat=VCFv4.3
505";
506
507        assert_eq!(
508            Parser::default().parse(s),
509            Err(ParseError::UnexpectedFileFormat)
510        );
511    }
512
513    #[test]
514    fn test_from_str_with_missing_headers() {
515        let s = "##fileformat=VCFv4.3
516";
517        assert_eq!(Parser::default().parse(s), Err(ParseError::MissingHeader));
518    }
519
520    #[test]
521    fn test_from_str_with_invalid_headers() {
522        let s = "##fileformat=VCFv4.3
523#CHROM	POS	ID	REF	ALT	QUALITY	FILTER	INFO
524";
525
526        assert_eq!(
527            Parser::default().parse(s),
528            Err(ParseError::InvalidHeader(
529                String::from("QUALITY"),
530                String::from("QUAL")
531            ))
532        );
533
534        let s = "##fileformat=VCFv4.3
535#CHROM	POS	ID
536";
537
538        assert_eq!(
539            Parser::default().parse(s),
540            Err(ParseError::InvalidHeader(
541                String::from(""),
542                String::from("REF")
543            ))
544        );
545
546        let s = "##fileformat=VCFv4.3
547#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	sample0
548";
549
550        assert_eq!(
551            Parser::default().parse(s),
552            Err(ParseError::InvalidHeader(
553                String::from("sample0"),
554                String::from("FORMAT")
555            ))
556        );
557    }
558
559    #[test]
560    fn test_from_str_with_duplicate_map_id() {
561        let s = r#"##fileformat=VCFv4.3
562##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
563##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
564#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
565"#;
566
567        assert!(matches!(
568            Parser::default().parse(s),
569            Err(ParseError::DuplicateInfoId(_))
570        ));
571
572        let s = r#"##fileformat=VCFv4.3
573##FILTER=<ID=q10,Description="Quality below 10">
574##FILTER=<ID=q10,Description="Quality below 10">
575#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
576"#;
577
578        assert_eq!(
579            Parser::default().parse(s),
580            Err(ParseError::DuplicateFilterId(String::from("q10")))
581        );
582
583        let s = r#"##fileformat=VCFv4.3
584##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
585##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
586#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
587"#;
588
589        assert_eq!(
590            Parser::default().parse(s),
591            Err(ParseError::DuplicateFormatId(String::from(
592                crate::variant::record::samples::keys::key::GENOTYPE
593            )))
594        );
595
596        let s = r#"##fileformat=VCFv4.3
597##ALT=<ID=DEL,Description="Deletion">
598##ALT=<ID=DEL,Description="Deletion">
599#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
600"#;
601
602        assert!(matches!(
603            Parser::default().parse(s),
604            Err(ParseError::DuplicateAlternativeAlleleId(_))
605        ));
606
607        let s = r#"##fileformat=VCFv4.3
608##contig=<ID=sq0,length=8>
609##contig=<ID=sq0,length=8>
610#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
611"#;
612
613        assert!(matches!(
614            Parser::default().parse(s),
615            Err(ParseError::DuplicateContigId(_))
616        ));
617
618        let s = r#"##fileformat=VCFv4.3
619##contig=<ID=sq0,length=8>
620##contig=<ID=sq0,length=8>
621#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
622"#;
623
624        assert!(matches!(
625            Parser::default().parse(s),
626            Err(ParseError::DuplicateContigId(_))
627        ));
628
629        let s = r#"##fileformat=VCFv4.3
630##SAMPLE=<ID=sample0,Assay=WholeGenome>
631##SAMPLE=<ID=sample0,Assay=WholeGenome>
632#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
633"#;
634
635        assert!(matches!(
636            Parser::default().parse(s),
637            Err(ParseError::InvalidRecordValue(_))
638        ));
639    }
640
641    #[test]
642    fn test_from_str_with_duplicate_sample_names() {
643        let s = "##fileformat=VCFv4.3
644#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	sample0	sample0
645";
646
647        assert_eq!(
648            Parser::default().parse(s),
649            Err(ParseError::DuplicateSampleName(String::from("sample0")))
650        );
651    }
652}