1mod context;
2mod record;
3
4use std::{error, fmt, hash::Hash, str};
5
6use bstr::BString;
7use indexmap::IndexMap;
8
9pub(crate) use self::context::Context;
10use self::record::parse_record;
11use super::{
12 Header, Programs, ReadGroups, Record, ReferenceSequences,
13 record::value::{
14 Map,
15 map::{self, header::Version},
16 },
17};
18
19#[derive(Clone, Debug, Eq, PartialEq)]
21pub enum ParseError {
22 UnexpectedHeader,
24 InvalidRecord(record::ParseError),
26 DuplicateReferenceSequenceName(BString),
28 DuplicateReadGroupId(BString),
30 DuplicateProgramId(BString),
32 InvalidComment,
34}
35
36impl error::Error for ParseError {
37 fn source(&self) -> Option<&(dyn error::Error + 'static)> {
38 match self {
39 Self::InvalidRecord(e) => Some(e),
40 _ => None,
41 }
42 }
43}
44
45impl fmt::Display for ParseError {
46 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47 match self {
48 Self::UnexpectedHeader => write!(f, "unexpected header (HD) record"),
49 Self::InvalidRecord(_) => f.write_str("invalid record"),
50 Self::DuplicateReferenceSequenceName(name) => {
51 write!(f, "duplicate reference sequence name: {name}")
52 }
53 Self::DuplicateReadGroupId(id) => write!(f, "duplicate read group ID: {id}"),
54 Self::DuplicateProgramId(id) => write!(f, "duplicate program ID: {id}"),
55 Self::InvalidComment => f.write_str("invalid comment record"),
56 }
57 }
58}
59
60#[derive(Default)]
62pub struct Parser {
63 ctx: Context,
64 header: Option<Map<map::Header>>,
65 reference_sequences: ReferenceSequences,
66 read_groups: ReadGroups,
67 programs: Programs,
68 comments: Vec<BString>,
69}
70
71impl Parser {
72 fn is_empty(&self) -> bool {
73 self.header.is_none()
74 && self.reference_sequences.is_empty()
75 && self.read_groups.is_empty()
76 && self.programs.as_ref().is_empty()
77 && self.comments.is_empty()
78 }
79
80 pub fn parse_partial(&mut self, src: &[u8]) -> Result<(), ParseError> {
91 if self.is_empty() {
92 if let Some(version) = extract_version(src) {
93 self.ctx = Context::from(version);
94 }
95 }
96
97 let record = parse_record(src, &self.ctx).map_err(ParseError::InvalidRecord)?;
98
99 match record {
100 Record::Header(header) => {
101 if self.is_empty() {
102 self.header = Some(header);
103 } else {
104 return Err(ParseError::UnexpectedHeader);
105 }
106 }
107 Record::ReferenceSequence(name, reference_sequence) => try_insert(
108 &mut self.reference_sequences,
109 name,
110 reference_sequence,
111 ParseError::DuplicateReferenceSequenceName,
112 )?,
113 Record::ReadGroup(id, read_group) => try_insert(
114 &mut self.read_groups,
115 id,
116 read_group,
117 ParseError::DuplicateReadGroupId,
118 )?,
119 Record::Program(id, program) => try_insert(
120 self.programs.as_mut(),
121 id,
122 program,
123 ParseError::DuplicateProgramId,
124 )?,
125 Record::Comment(comment) => self.comments.push(comment),
126 }
127
128 Ok(())
129 }
130
131 pub fn finish(self) -> Header {
143 Header {
144 header: self.header,
145 reference_sequences: self.reference_sequences,
146 read_groups: self.read_groups,
147 programs: self.programs,
148 comments: self.comments,
149 }
150 }
151}
152
153fn extract_version(src: &[u8]) -> Option<Version> {
154 use self::record::value::map::header::parse_version;
155
156 const RECORD_PREFIX: &[u8] = b"@HD\t";
157 const DELIMITER: u8 = b'\t';
158 const FIELD_PREFIX: &[u8] = b"VN:";
159
160 if let Some(raw_value) = src.strip_prefix(RECORD_PREFIX) {
161 for raw_field in raw_value.split(|&b| b == DELIMITER) {
162 if let Some(s) = raw_field.strip_prefix(FIELD_PREFIX) {
163 return parse_version(s).ok();
164 }
165 }
166 }
167
168 None
169}
170
171fn try_insert<K, V, F, E>(map: &mut IndexMap<K, V>, key: K, value: V, f: F) -> Result<(), E>
172where
173 K: Hash + Eq + Clone,
174 F: FnOnce(K) -> E,
175{
176 use indexmap::map::Entry;
177
178 match map.entry(key) {
179 Entry::Vacant(e) => {
180 e.insert(value);
181 Ok(())
182 }
183 Entry::Occupied(e) => Err(f(e.key().clone())),
184 }
185}
186
187pub(super) fn parse(s: &str) -> Result<Header, ParseError> {
210 let mut parser = Parser::default();
211
212 for line in s.lines() {
213 parser.parse_partial(line.as_bytes())?;
214 }
215
216 Ok(parser.finish())
217}
218
#[cfg(test)]
mod tests {
    use super::*;

    // A header containing one of every record kind parses into the equivalent
    // builder-constructed header.
    #[test]
    fn test_parse() -> Result<(), Box<dyn std::error::Error>> {
        use std::num::NonZero;

        use crate::header::record::value::map::{
            self, Map, Program, ReadGroup, ReferenceSequence,
            header::{self, Version},
            program,
        };

        let s = "\
@HD\tVN:1.6\tSO:coordinate
@SQ\tSN:sq0\tLN:8
@SQ\tSN:sq1\tLN:13
@RG\tID:rg0
@PG\tID:pg0\tPN:noodles
@CO\tndls
";

        let actual = parse(s)?;

        let hd = Map::<map::Header>::builder()
            .set_version(Version::new(1, 6))
            .insert(header::tag::SORT_ORDER, "coordinate")
            .build()?;

        let sq0 = Map::<ReferenceSequence>::new(const { NonZero::new(8).unwrap() });
        let sq1 = Map::<ReferenceSequence>::new(const { NonZero::new(13).unwrap() });

        let pg0 = Map::<Program>::builder()
            .insert(program::tag::NAME, "noodles")
            .build()?;

        let expected = Header::builder()
            .set_header(hd)
            .add_reference_sequence("sq0", sq0)
            .add_reference_sequence("sq1", sq1)
            .add_read_group("rg0", Map::<ReadGroup>::default())
            .add_program("pg0", pg0)
            .add_comment("ndls")
            .build();

        assert_eq!(actual, expected);

        Ok(())
    }

    // Empty input is valid and produces a header with nothing in it.
    #[test]
    fn test_parse_with_empty_input() -> Result<(), ParseError> {
        let parsed = parse("")?;

        assert!(parsed.header().is_none());
        assert!(parsed.reference_sequences().is_empty());
        assert!(parsed.read_groups().is_empty());
        assert!(parsed.programs().as_ref().is_empty());
        assert!(parsed.comments().is_empty());

        Ok(())
    }

    // The header (`@HD`) record is optional.
    #[test]
    fn test_parse_without_hd() -> Result<(), ParseError> {
        let parsed = parse("@SQ\tSN:sq0\tLN:8\n")?;

        assert!(parsed.header().is_none());
        assert_eq!(parsed.reference_sequences().len(), 1);

        Ok(())
    }

    // A second header record is rejected.
    #[test]
    fn test_parse_with_multiple_hd() {
        let s = "\
@HD\tVN:1.6\tSO:coordinate
@HD\tVN:1.6\tSO:coordinate
";

        let expected: Result<Header, ParseError> = Err(ParseError::UnexpectedHeader);

        assert_eq!(parse(s), expected);
    }

    // A repeated reference sequence name is reported together with the name.
    #[test]
    fn test_parse_with_duplicate_reference_sequence_names() {
        let s = "\
@SQ\tSN:sq0\tLN:8
@SQ\tSN:sq0\tLN:8
";

        let expected: Result<Header, ParseError> = Err(
            ParseError::DuplicateReferenceSequenceName(BString::from("sq0")),
        );

        assert_eq!(parse(s), expected);
    }

    // A repeated read group ID is reported together with the ID.
    #[test]
    fn test_parse_with_duplicate_read_group_ids() {
        let s = "\
@RG\tID:rg0
@RG\tID:rg0
";

        let expected: Result<Header, ParseError> =
            Err(ParseError::DuplicateReadGroupId(BString::from("rg0")));

        assert_eq!(parse(s), expected);
    }

    // A repeated program ID is reported together with the ID.
    #[test]
    fn test_parse_with_duplicate_program_ids() {
        let s = "\
@PG\tID:pg0
@PG\tID:pg0
";

        let expected: Result<Header, ParseError> =
            Err(ParseError::DuplicateProgramId(BString::from("pg0")));

        assert_eq!(parse(s), expected);
    }

    // The version is only extracted from `@HD` records, wherever the `VN` field sits.
    #[test]
    fn test_extract_version() {
        let v1_6 = Some(Version::new(1, 6));

        assert_eq!(extract_version(b"@HD\tVN:1.6"), v1_6);
        assert_eq!(extract_version(b"@HD\tSO:coordinate\tVN:1.6"), v1_6);

        assert_eq!(extract_version(b"@HD\tVN:NA"), None);
        assert_eq!(extract_version(b"@SQ\tSN:sq0\tLN:8\tVN:1.6"), None);
        assert_eq!(extract_version(b"@CO\tVN:1.6"), None);
    }
}