noodles_sam/header/
parser.rs

1mod context;
2mod record;
3
4use std::{error, fmt, hash::Hash, str};
5
6use bstr::BString;
7use indexmap::IndexMap;
8
9pub(crate) use self::context::Context;
10use self::record::parse_record;
11use super::{
12    Header, Programs, ReadGroups, Record, ReferenceSequences,
13    record::value::{
14        Map,
15        map::{self, header::Version},
16    },
17};
18
19/// An error returned when a raw SAM header fails to parse.
20#[derive(Clone, Debug, Eq, PartialEq)]
21pub enum ParseError {
22    /// A header record is not on the first line.
23    UnexpectedHeader,
24    /// The record is invalid.
25    InvalidRecord(record::ParseError),
26    /// A reference sequence name is duplicated.
27    DuplicateReferenceSequenceName(BString),
28    /// A read group ID is duplicated.
29    DuplicateReadGroupId(BString),
30    /// A program ID is duplicated.
31    DuplicateProgramId(BString),
32    /// A comment record is invalid.
33    InvalidComment,
34}
35
36impl error::Error for ParseError {
37    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
38        match self {
39            Self::InvalidRecord(e) => Some(e),
40            _ => None,
41        }
42    }
43}
44
45impl fmt::Display for ParseError {
46    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47        match self {
48            Self::UnexpectedHeader => write!(f, "unexpected header (HD) record"),
49            Self::InvalidRecord(_) => f.write_str("invalid record"),
50            Self::DuplicateReferenceSequenceName(name) => {
51                write!(f, "duplicate reference sequence name: {name}")
52            }
53            Self::DuplicateReadGroupId(id) => write!(f, "duplicate read group ID: {id}"),
54            Self::DuplicateProgramId(id) => write!(f, "duplicate program ID: {id}"),
55            Self::InvalidComment => f.write_str("invalid comment record"),
56        }
57    }
58}
59
60/// A SAM header parser.
61#[derive(Default)]
62pub struct Parser {
63    ctx: Context,
64    header: Option<Map<map::Header>>,
65    reference_sequences: ReferenceSequences,
66    read_groups: ReadGroups,
67    programs: Programs,
68    comments: Vec<BString>,
69}
70
71impl Parser {
72    fn is_empty(&self) -> bool {
73        self.header.is_none()
74            && self.reference_sequences.is_empty()
75            && self.read_groups.is_empty()
76            && self.programs.as_ref().is_empty()
77            && self.comments.is_empty()
78    }
79
80    /// Parses and adds a raw record to the header.
81    ///
82    /// # Examples
83    ///
84    /// ```
85    /// use noodles_sam as sam;
86    /// let mut parser = sam::header::Parser::default();
87    /// parser.parse_partial(b"@HD\tVN:1.6")?;
88    /// # Ok::<_, sam::header::ParseError>(())
89    /// ```
90    pub fn parse_partial(&mut self, src: &[u8]) -> Result<(), ParseError> {
91        if self.is_empty() {
92            if let Some(version) = extract_version(src) {
93                self.ctx = Context::from(version);
94            }
95        }
96
97        let record = parse_record(src, &self.ctx).map_err(ParseError::InvalidRecord)?;
98
99        match record {
100            Record::Header(header) => {
101                if self.is_empty() {
102                    self.header = Some(header);
103                } else {
104                    return Err(ParseError::UnexpectedHeader);
105                }
106            }
107            Record::ReferenceSequence(name, reference_sequence) => try_insert(
108                &mut self.reference_sequences,
109                name,
110                reference_sequence,
111                ParseError::DuplicateReferenceSequenceName,
112            )?,
113            Record::ReadGroup(id, read_group) => try_insert(
114                &mut self.read_groups,
115                id,
116                read_group,
117                ParseError::DuplicateReadGroupId,
118            )?,
119            Record::Program(id, program) => try_insert(
120                self.programs.as_mut(),
121                id,
122                program,
123                ParseError::DuplicateProgramId,
124            )?,
125            Record::Comment(comment) => self.comments.push(comment),
126        }
127
128        Ok(())
129    }
130
131    /// Builds the SAM header.
132    ///
133    /// # Examples
134    ///
135    /// ```
136    /// use noodles_sam as sam;
137    /// let parser = sam::header::Parser::default();
138    /// let header = parser.finish();
139    /// assert!(header.is_empty());
140    /// # Ok::<_, sam::header::ParseError>(())
141    /// ```
142    pub fn finish(self) -> Header {
143        Header {
144            header: self.header,
145            reference_sequences: self.reference_sequences,
146            read_groups: self.read_groups,
147            programs: self.programs,
148            comments: self.comments,
149        }
150    }
151}
152
153fn extract_version(src: &[u8]) -> Option<Version> {
154    use self::record::value::map::header::parse_version;
155
156    const RECORD_PREFIX: &[u8] = b"@HD\t";
157    const DELIMITER: u8 = b'\t';
158    const FIELD_PREFIX: &[u8] = b"VN:";
159
160    if let Some(raw_value) = src.strip_prefix(RECORD_PREFIX) {
161        for raw_field in raw_value.split(|&b| b == DELIMITER) {
162            if let Some(s) = raw_field.strip_prefix(FIELD_PREFIX) {
163                return parse_version(s).ok();
164            }
165        }
166    }
167
168    None
169}
170
171fn try_insert<K, V, F, E>(map: &mut IndexMap<K, V>, key: K, value: V, f: F) -> Result<(), E>
172where
173    K: Hash + Eq + Clone,
174    F: FnOnce(K) -> E,
175{
176    use indexmap::map::Entry;
177
178    match map.entry(key) {
179        Entry::Vacant(e) => {
180            e.insert(value);
181            Ok(())
182        }
183        Entry::Occupied(e) => Err(f(e.key().clone())),
184    }
185}
186
187/// Parses a raw SAM header.
188///
189/// # Examples
190///
191/// ```
192/// use noodles_sam as sam;
193///
194/// let s = "\
195/// @HD\tVN:1.6\tSO:coordinate
196/// @SQ\tSN:sq0\tLN:8
197/// @SQ\tSN:sq1\tLN:13
198/// ";
199///
200/// let header: sam::Header = s.parse()?;
201///
202/// assert!(header.header().is_some());
203/// assert_eq!(header.reference_sequences().len(), 2);
204/// assert!(header.read_groups().is_empty());
205/// assert!(header.programs().as_ref().is_empty());
206/// assert!(header.comments().is_empty());
207/// # Ok::<(), sam::header::ParseError>(())
208/// ```
209pub(super) fn parse(s: &str) -> Result<Header, ParseError> {
210    let mut parser = Parser::default();
211
212    for line in s.lines() {
213        parser.parse_partial(line.as_bytes())?;
214    }
215
216    Ok(parser.finish())
217}
218
#[cfg(test)]
mod tests {
    use super::*;

    // A full header with one record of every kind round-trips into the expected `Header`.
    #[test]
    fn test_parse() -> Result<(), Box<dyn std::error::Error>> {
        use std::num::NonZero;

        use crate::header::record::value::map::{
            self, Map, Program, ReadGroup, ReferenceSequence,
            header::{self, Version},
            program,
        };

        let raw = concat!(
            "@HD\tVN:1.6\tSO:coordinate\n",
            "@SQ\tSN:sq0\tLN:8\n",
            "@SQ\tSN:sq1\tLN:13\n",
            "@RG\tID:rg0\n",
            "@PG\tID:pg0\tPN:noodles\n",
            "@CO\tndls\n",
        );

        let hd = Map::<map::Header>::builder()
            .set_version(Version::new(1, 6))
            .insert(header::tag::SORT_ORDER, "coordinate")
            .build()?;

        let sq0 = Map::<ReferenceSequence>::new(const { NonZero::new(8).unwrap() });
        let sq1 = Map::<ReferenceSequence>::new(const { NonZero::new(13).unwrap() });

        let pg0 = Map::<Program>::builder()
            .insert(program::tag::NAME, "noodles")
            .build()?;

        let expected = Header::builder()
            .set_header(hd)
            .add_reference_sequence("sq0", sq0)
            .add_reference_sequence("sq1", sq1)
            .add_read_group("rg0", Map::<ReadGroup>::default())
            .add_program("pg0", pg0)
            .add_comment("ndls")
            .build();

        let actual = parse(raw)?;

        assert_eq!(actual, expected);

        Ok(())
    }

    // Empty input yields a header with nothing in it.
    #[test]
    fn test_parse_with_empty_input() -> Result<(), ParseError> {
        let parsed = parse("")?;

        assert!(parsed.header().is_none());
        assert!(parsed.reference_sequences().is_empty());
        assert!(parsed.read_groups().is_empty());
        assert!(parsed.programs().as_ref().is_empty());
        assert!(parsed.comments().is_empty());

        Ok(())
    }

    // The header (`@HD`) record is optional.
    #[test]
    fn test_parse_without_hd() -> Result<(), ParseError> {
        let parsed = parse("@SQ\tSN:sq0\tLN:8\n")?;

        assert!(parsed.header().is_none());
        assert_eq!(parsed.reference_sequences().len(), 1);

        Ok(())
    }

    // A second header (`@HD`) record is rejected.
    #[test]
    fn test_parse_with_multiple_hd() {
        let raw = concat!(
            "@HD\tVN:1.6\tSO:coordinate\n",
            "@HD\tVN:1.6\tSO:coordinate\n",
        );

        assert_eq!(parse(raw), Err(ParseError::UnexpectedHeader));
    }

    #[test]
    fn test_parse_with_duplicate_reference_sequence_names() {
        let raw = concat!("@SQ\tSN:sq0\tLN:8\n", "@SQ\tSN:sq0\tLN:8\n");
        let expected_error = ParseError::DuplicateReferenceSequenceName(BString::from("sq0"));

        assert_eq!(parse(raw), Err(expected_error));
    }

    #[test]
    fn test_parse_with_duplicate_read_group_ids() {
        let raw = concat!("@RG\tID:rg0\n", "@RG\tID:rg0\n");
        let expected_error = ParseError::DuplicateReadGroupId(BString::from("rg0"));

        assert_eq!(parse(raw), Err(expected_error));
    }

    #[test]
    fn test_parse_with_duplicate_program_ids() {
        let raw = concat!("@PG\tID:pg0\n", "@PG\tID:pg0\n");
        let expected_error = ParseError::DuplicateProgramId(BString::from("pg0"));

        assert_eq!(parse(raw), Err(expected_error));
    }

    #[test]
    fn test_extract_version() {
        // The version field is found regardless of its position in the `@HD` record.
        let accepted: [&[u8]; 2] = [b"@HD\tVN:1.6", b"@HD\tSO:coordinate\tVN:1.6"];

        for src in accepted {
            assert_eq!(extract_version(src), Some(Version::new(1, 6)));
        }

        // An unparsable version, or a `VN:` field outside an `@HD` record, yields nothing.
        let rejected: [&[u8]; 3] = [
            b"@HD\tVN:NA",
            b"@SQ\tSN:sq0\tLN:8\tVN:1.6",
            b"@CO\tVN:1.6",
        ];

        for src in rejected {
            assert!(extract_version(src).is_none());
        }
    }
}