1mod builder;
4mod entry;
5mod file_format_option;
6pub(super) mod record;
7
8use std::{error, str};
9
10use indexmap::IndexMap;
11
12pub(super) use self::record::parse_record;
13pub use self::{builder::Builder, entry::Entry, file_format_option::FileFormatOption};
14use super::{
15 file_format::FileFormat,
16 record::value::{
17 map::{AlternativeAllele, Contig, Filter, Format, Info},
18 Map,
19 },
20 AlternativeAlleles, Contigs, Filters, Formats, Header, Infos, OtherRecords, Record,
21 SampleNames, StringMaps,
22};
23
/// The phase a [`Parser`] is in while consuming header lines.
#[derive(Debug, Default, Eq, PartialEq)]
enum State {
    /// No line has been accepted yet; the next line must be the file format record.
    #[default]
    Empty,
    /// The file format was read; records and the column header line are accepted.
    Ready,
    /// The column header line was read; any further input is rejected.
    Done,
}
31
32#[derive(Debug, Default, Eq, PartialEq)]
34pub struct Parser {
35 file_format_option: FileFormatOption,
36 state: State,
37 file_format: FileFormat,
38 infos: Infos,
39 filters: Filters,
40 formats: Formats,
41 alternative_alleles: AlternativeAlleles,
42 contigs: Contigs,
43 sample_names: SampleNames,
44 other_records: OtherRecords,
45}
46
47impl Parser {
48 pub fn builder() -> Builder {
50 Builder::default()
51 }
52
53 pub fn parse(&self, s: &str) -> Result<Header, ParseError> {
55 let mut parser = Self::default();
56
57 for line in s.lines() {
58 parser.parse_partial(line.as_bytes())?;
59 }
60
61 parser.finish()
62 }
63
64 pub fn parse_partial(&mut self, src: &[u8]) -> Result<Entry<'_>, ParseError> {
66 if self.state == State::Done {
67 return Err(ParseError::ExpectedEof);
68 }
69
70 if self.state == State::Empty {
71 let file_format = match parse_file_format(src) {
72 Ok(f) => match self.file_format_option {
73 FileFormatOption::Auto => f,
74 FileFormatOption::FileFormat(g) => g,
75 },
76 Err(e) => return Err(e),
77 };
78
79 self.file_format = file_format;
80 self.state = State::Ready;
81
82 return Ok(Entry::FileFormat(file_format));
83 }
84
85 if src.starts_with(b"#CHROM") {
86 parse_header(src, &mut self.sample_names)?;
87 self.state = State::Done;
88 return Ok(Entry::Header);
89 }
90
91 let record = parse_record(src, self.file_format).map_err(ParseError::InvalidRecord)?;
92
93 match record {
94 Record::FileFormat(_) => Err(ParseError::UnexpectedFileFormat),
95 Record::Info(id, info) => try_insert_info(&mut self.infos, id, info),
96 Record::Filter(id, filter) => try_insert_filter(&mut self.filters, id, filter),
97 Record::Format(id, format) => try_insert_format(&mut self.formats, id, format),
98 Record::AlternativeAllele(id, alternative_allele) => {
99 try_insert_alternative_allele(&mut self.alternative_alleles, id, alternative_allele)
100 }
101 Record::Contig(id, contig) => try_insert_contig(&mut self.contigs, id, contig),
102 Record::Other(key, value) => insert_other_record(&mut self.other_records, key, value),
103 }
104 }
105
106 pub fn finish(self) -> Result<Header, ParseError> {
108 match self.state {
109 State::Empty => Err(ParseError::Empty),
110 State::Ready => Err(ParseError::MissingHeader),
111 State::Done => Ok(Header {
112 file_format: self.file_format,
113 infos: self.infos,
114 filters: self.filters,
115 formats: self.formats,
116 alternative_alleles: self.alternative_alleles,
117 contigs: self.contigs,
118 sample_names: self.sample_names,
119 other_records: self.other_records,
120 string_maps: StringMaps::default(),
121 }),
122 }
123 }
124}
125
126#[derive(Clone, Debug, Eq, PartialEq)]
128pub enum ParseError {
129 Empty,
131 InvalidUtf8(str::Utf8Error),
133 MissingFileFormat,
135 UnexpectedFileFormat,
137 InvalidRecord(record::ParseError),
139 DuplicateInfoId(String),
141 DuplicateFilterId(String),
143 DuplicateFormatId(String),
145 DuplicateAlternativeAlleleId(String),
147 DuplicateContigId(String),
149 InvalidRecordValue(super::record::value::collection::AddError),
151 MissingHeader,
153 InvalidHeader(String, String),
155 DuplicateSampleName(String),
159 ExpectedEof,
161 StringMapPositionMismatch((usize, String), (usize, String)),
164}
165
166impl error::Error for ParseError {
167 fn source(&self) -> Option<&(dyn error::Error + 'static)> {
168 match self {
169 Self::InvalidUtf8(e) => Some(e),
170 Self::InvalidRecord(e) => Some(e),
171 Self::InvalidRecordValue(e) => Some(e),
172 _ => None,
173 }
174 }
175}
176
177impl std::fmt::Display for ParseError {
178 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
179 match self {
180 Self::Empty => f.write_str("empty input"),
181 Self::InvalidUtf8(_) => f.write_str("invalid UTF-8"),
182 Self::MissingFileFormat => f.write_str("missing fileformat"),
183 Self::UnexpectedFileFormat => f.write_str("unexpected file format"),
184 Self::InvalidRecord(_) => f.write_str("invalid record"),
185 Self::DuplicateInfoId(id) => write!(f, "duplicate INFO ID: {id}"),
186 Self::DuplicateFilterId(id) => write!(f, "duplicate FILTER ID: {id}"),
187 Self::DuplicateFormatId(id) => write!(f, "duplicate FORMAT ID: {id}"),
188 Self::DuplicateAlternativeAlleleId(id) => write!(f, "duplicate ALT ID: {id}"),
189 Self::DuplicateContigId(id) => write!(f, "duplicate contig ID: {id}"),
190 Self::InvalidRecordValue(_) => f.write_str("invalid record value"),
191 Self::MissingHeader => f.write_str("missing header"),
192 Self::InvalidHeader(actual, expected) => {
193 write!(f, "invalid header: expected {expected}, got {actual}")
194 }
195 Self::DuplicateSampleName(sample_name) => {
196 write!(f, "duplicate sample name: {sample_name}")
197 }
198 Self::ExpectedEof => f.write_str("expected EOF"),
199 Self::StringMapPositionMismatch(actual, expected) => write!(
200 f,
201 "string map position mismatch: expected {} (IDX={}), got {} (IDX={})",
202 expected.1, expected.0, actual.1, actual.0,
203 ),
204 }
205 }
206}
207
208fn parse_file_format(src: &[u8]) -> Result<FileFormat, ParseError> {
209 let record = parse_record(src, FileFormat::default()).map_err(ParseError::InvalidRecord)?;
210
211 match record {
212 Record::FileFormat(file_format) => Ok(file_format),
213 _ => Err(ParseError::MissingFileFormat),
214 }
215}
216
217fn try_insert_info(
218 infos: &mut Infos,
219 id: String,
220 info: Map<Info>,
221) -> Result<Entry<'_>, ParseError> {
222 use indexmap::map::Entry;
223
224 match infos.entry(id) {
225 Entry::Vacant(entry) => {
226 let i = entry.index();
227
228 entry.insert(info);
229
230 Ok(infos
232 .get_index(i)
233 .map(|(k, v)| self::Entry::Info(k, v))
234 .unwrap())
235 }
236 Entry::Occupied(entry) => Err(ParseError::DuplicateInfoId(entry.key().into())),
237 }
238}
239
240fn try_insert_filter(
241 filters: &mut Filters,
242 id: String,
243 filter: Map<Filter>,
244) -> Result<Entry<'_>, ParseError> {
245 use indexmap::map::Entry;
246
247 match filters.entry(id) {
248 Entry::Vacant(entry) => {
249 let i = entry.index();
250
251 entry.insert(filter);
252
253 Ok(filters
255 .get_index(i)
256 .map(|(k, v)| self::Entry::Filter(k, v))
257 .unwrap())
258 }
259 Entry::Occupied(entry) => Err(ParseError::DuplicateFilterId(entry.key().into())),
260 }
261}
262
263fn try_insert_format(
264 formats: &mut Formats,
265 id: String,
266 format: Map<Format>,
267) -> Result<Entry<'_>, ParseError> {
268 use indexmap::map::Entry;
269
270 match formats.entry(id) {
271 Entry::Vacant(entry) => {
272 let i = entry.index();
273
274 entry.insert(format);
275
276 Ok(formats
278 .get_index(i)
279 .map(|(k, v)| self::Entry::Format(k, v))
280 .unwrap())
281 }
282 Entry::Occupied(entry) => Err(ParseError::DuplicateFormatId(entry.key().into())),
283 }
284}
285
286fn try_insert_alternative_allele(
287 alternative_alleles: &mut AlternativeAlleles,
288 id: String,
289 alternative_allele: Map<AlternativeAllele>,
290) -> Result<Entry<'_>, ParseError> {
291 use indexmap::map::Entry;
292
293 match alternative_alleles.entry(id) {
294 Entry::Vacant(entry) => {
295 let i = entry.index();
296
297 entry.insert(alternative_allele);
298
299 Ok(alternative_alleles
301 .get_index(i)
302 .map(|(k, v)| self::Entry::AlternativeAllele(k, v))
303 .unwrap())
304 }
305 Entry::Occupied(entry) => Err(ParseError::DuplicateAlternativeAlleleId(entry.key().into())),
306 }
307}
308
309fn try_insert_contig(
310 contigs: &mut Contigs,
311 id: String,
312 contig: Map<Contig>,
313) -> Result<Entry<'_>, ParseError> {
314 use indexmap::map::Entry;
315
316 match contigs.entry(id) {
317 Entry::Vacant(entry) => {
318 let i = entry.index();
319
320 entry.insert(contig);
321
322 Ok(contigs
324 .get_index(i)
325 .map(|(k, v)| self::Entry::Contig(k, v))
326 .unwrap())
327 }
328 Entry::Occupied(entry) => Err(ParseError::DuplicateContigId(entry.key().into())),
329 }
330}
331
332fn insert_other_record(
333 other_records: &mut OtherRecords,
334 key: super::record::key::Other,
335 value: super::record::Value,
336) -> Result<Entry<'_>, ParseError> {
337 let collection = other_records.entry(key).or_insert_with(|| match value {
338 super::record::Value::String(_) => {
339 super::record::value::Collection::Unstructured(Vec::new())
340 }
341 super::record::Value::Map(..) => {
342 super::record::value::Collection::Structured(IndexMap::new())
343 }
344 });
345
346 collection
347 .add(value)
348 .map_err(ParseError::InvalidRecordValue)?;
349
350 Ok(Entry::Other)
351}
352
353fn parse_header(src: &[u8], sample_names: &mut SampleNames) -> Result<(), ParseError> {
354 static HEADERS: &[&str] = &[
355 "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO",
356 ];
357 static FORMAT_HEADER: &str = "FORMAT";
358
359 const DELIMITER: char = '\t';
360
361 let line = str::from_utf8(src).map_err(ParseError::InvalidUtf8)?;
362 let mut fields = line.split(DELIMITER);
363
364 for &expected in HEADERS.iter() {
365 if let Some(actual) = fields.next() {
366 if actual != expected {
367 return Err(ParseError::InvalidHeader(actual.into(), expected.into()));
368 }
369 } else {
370 return Err(ParseError::InvalidHeader(String::from(""), expected.into()));
371 }
372 }
373
374 if let Some(field) = fields.next() {
375 if field != FORMAT_HEADER {
376 return Err(ParseError::InvalidHeader(
377 field.into(),
378 FORMAT_HEADER.into(),
379 ));
380 }
381
382 for sample_name in fields {
383 if !sample_names.insert(sample_name.into()) {
384 return Err(ParseError::DuplicateSampleName(sample_name.into()));
385 }
386 }
387 }
388
389 Ok(())
390}
391
#[cfg(test)]
mod tests {
    use super::*;

    /// The eight mandatory column names of the column header line.
    ///
    /// The delimiters are written as `\t` escapes rather than typed as literal
    /// tabs, so editors and formatters cannot silently turn them into spaces.
    const COLUMNS: &str = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";

    /// Builds a raw header in which `line` appears twice after the file format
    /// line, followed by the column header line.
    fn header_with_repeated_line(line: &str) -> String {
        format!("##fileformat=VCFv4.3\n{line}\n{line}\n{COLUMNS}\n")
    }

    #[test]
    fn test_from_str() -> Result<(), Box<dyn std::error::Error>> {
        use crate::{
            header::record::{value::map::Other, Value},
            variant::record::{info, samples},
        };

        let meta = r#"##fileformat=VCFv4.3
##fileDate=20200506
##source=noodles-vcf
##contig=<ID=sq0,length=8>
##contig=<ID=sq1,length=13>
##contig=<ID=sq2,length=21>
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
##FILTER=<ID=q10,Description="Quality below 10">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##ALT=<ID=DEL,Description="Deletion">
##META=<ID=Assay,Type=String,Number=.,Values=[WholeGenome, Exome]>
##SAMPLE=<ID=sample0,Assay=WholeGenome>
##PEDIGREE=<ID=cid,Father=fid,Mother=mid>
"#;
        let s = format!("{meta}{COLUMNS}\tFORMAT\tsample0\n");

        let actual = Parser::default().parse(&s)?;

        let expected = Header::builder()
            .set_file_format(FileFormat::new(4, 3))
            .insert("fileDate".parse()?, Value::String(String::from("20200506")))?
            .insert(
                "source".parse()?,
                Value::String(String::from("noodles-vcf")),
            )?
            .add_contig("sq0", Map::<Contig>::builder().set_length(8).build()?)
            .add_contig("sq1", Map::<Contig>::builder().set_length(13).build()?)
            .add_contig("sq2", Map::<Contig>::builder().set_length(21).build()?)
            .add_info(
                info::field::key::SAMPLES_WITH_DATA_COUNT,
                Map::<Info>::from(info::field::key::SAMPLES_WITH_DATA_COUNT),
            )
            .add_filter("q10", Map::<Filter>::new("Quality below 10"))
            .add_format(
                samples::keys::key::GENOTYPE,
                Map::<Format>::from(samples::keys::key::GENOTYPE),
            )
            .add_alternative_allele("DEL", Map::<AlternativeAllele>::new("Deletion"))
            .insert(
                "META".parse()?,
                Value::Map(
                    String::from("Assay"),
                    Map::<Other>::builder()
                        .insert("Type".parse()?, "String")
                        .insert("Number".parse()?, ".")
                        .insert("Values".parse()?, "[WholeGenome, Exome]")
                        .build()?,
                ),
            )?
            .insert(
                "SAMPLE".parse()?,
                Value::Map(
                    String::from("sample0"),
                    Map::<Other>::builder()
                        .insert("Assay".parse()?, "WholeGenome")
                        .build()?,
                ),
            )?
            .insert(
                "PEDIGREE".parse()?,
                Value::Map(
                    String::from("cid"),
                    Map::<Other>::builder()
                        .insert("Father".parse()?, "fid")
                        .insert("Mother".parse()?, "mid")
                        .build()?,
                ),
            )?
            .add_sample_name("sample0")
            .build();

        assert_eq!(actual, expected);

        Ok(())
    }

    #[test]
    fn test_from_str_without_file_format() {
        let s = r#"##ALT=<ID=DEL,Description="Deletion">
"#;

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::MissingFileFormat)
        );
    }

    #[test]
    fn test_from_str_with_data_after_header() {
        let s = format!("##fileformat=VCFv4.3\n{COLUMNS}\n##contig=<ID=sq0,length=8>\n");

        assert_eq!(Parser::default().parse(&s), Err(ParseError::ExpectedEof));
    }

    #[test]
    fn test_from_str_with_multiple_fileformats() {
        let s = "##fileformat=VCFv4.3\n##fileformat=VCFv4.3\n";

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::UnexpectedFileFormat)
        );
    }

    #[test]
    fn test_from_str_with_missing_headers() {
        let s = "##fileformat=VCFv4.3\n";

        assert_eq!(Parser::default().parse(s), Err(ParseError::MissingHeader));
    }

    #[test]
    fn test_from_str_with_invalid_headers() {
        // A misspelled column name.
        let s = "##fileformat=VCFv4.3\n#CHROM\tPOS\tID\tREF\tALT\tQUALITY\tFILTER\tINFO\n";

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::InvalidHeader(
                String::from("QUALITY"),
                String::from("QUAL")
            ))
        );

        // A truncated line: the missing column is reported as an empty name.
        let s = "##fileformat=VCFv4.3\n#CHROM\tPOS\tID\n";

        assert_eq!(
            Parser::default().parse(s),
            Err(ParseError::InvalidHeader(
                String::from(""),
                String::from("REF")
            ))
        );

        // A sample name without the preceding FORMAT column.
        let s = format!("##fileformat=VCFv4.3\n{COLUMNS}\tsample0\n");

        assert_eq!(
            Parser::default().parse(&s),
            Err(ParseError::InvalidHeader(
                String::from("sample0"),
                String::from("FORMAT")
            ))
        );
    }

    #[test]
    fn test_from_str_with_duplicate_map_id() {
        let s = header_with_repeated_line(
            r#"##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">"#,
        );

        assert_eq!(
            Parser::default().parse(&s),
            Err(ParseError::DuplicateInfoId(String::from("NS")))
        );

        let s = header_with_repeated_line(r#"##FILTER=<ID=q10,Description="Quality below 10">"#);

        assert_eq!(
            Parser::default().parse(&s),
            Err(ParseError::DuplicateFilterId(String::from("q10")))
        );

        let s = header_with_repeated_line(
            r#"##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">"#,
        );

        assert_eq!(
            Parser::default().parse(&s),
            Err(ParseError::DuplicateFormatId(String::from(
                crate::variant::record::samples::keys::key::GENOTYPE
            )))
        );

        let s = header_with_repeated_line(r#"##ALT=<ID=DEL,Description="Deletion">"#);

        assert_eq!(
            Parser::default().parse(&s),
            Err(ParseError::DuplicateAlternativeAlleleId(String::from(
                "DEL"
            )))
        );

        let s = header_with_repeated_line("##contig=<ID=sq0,length=8>");

        assert_eq!(
            Parser::default().parse(&s),
            Err(ParseError::DuplicateContigId(String::from("sq0")))
        );

        // The payload type of this variant is opaque here, so only the variant
        // itself is checked.
        let s = header_with_repeated_line("##SAMPLE=<ID=sample0,Assay=WholeGenome>");

        assert!(matches!(
            Parser::default().parse(&s),
            Err(ParseError::InvalidRecordValue(_))
        ));
    }

    #[test]
    fn test_from_str_with_duplicate_sample_names() {
        let s = format!("##fileformat=VCFv4.3\n{COLUMNS}\tFORMAT\tsample0\tsample0\n");

        assert_eq!(
            Parser::default().parse(&s),
            Err(ParseError::DuplicateSampleName(String::from("sample0")))
        );
    }
}