chainfile/alignment/section/
header.rs

1//! A header record.
2
3pub mod sequence;
4
5use std::num::ParseIntError;
6use std::str::FromStr;
7
8use omics::coordinate::position::Number;
9pub use sequence::Sequence;
10
/// The prefix for a header record.
///
/// A line is only accepted as a header record when its first field is exactly
/// this keyword.
pub const HEADER_PREFIX: &str = "chain";

/// The delimiter for a header record.
///
/// A header line is split on this character when it is parsed, and the same
/// character is written between the top-level components of a record when it
/// is displayed.
pub const DELIMITER: char = ' ';

/// The number of expected fields in a header record.
///
/// The fields are, in order: the prefix, the score, five fields describing the
/// reference sequence, five fields describing the query sequence, and the id.
pub const NUM_HEADER_FIELDS: usize = 13;
19
20////////////////////////////////////////////////////////////////////////////////////////
21// Errors
22////////////////////////////////////////////////////////////////////////////////////////
23
24/// An error associated with parsing a header record.
25#[derive(Debug)]
26pub enum ParseError {
27    /// An incorrect number of fields in the header line.
28    IncorrectNumberOfFields(usize),
29
30    /// An invalid prefix.
31    InvalidPrefix(String),
32
33    /// An invalid score.
34    InvalidScore(ParseIntError),
35
36    /// An invalid reference sequence.
37    InvalidReferenceSequence(sequence::Error),
38
39    /// An invalid query sequence.
40    InvalidQuerySequence(sequence::Error),
41
42    /// An invalid id.
43    InvalidId(ParseIntError),
44
45    /// The end position exceeds the size of the chromosome.
46    EndPositionExceedsSize(String, Number, Number),
47}
48
49impl std::fmt::Display for ParseError {
50    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
51        match self {
52            ParseError::IncorrectNumberOfFields(fields) => write!(
53                f,
54                "invalid number of fields in header: expected {} fields, found {} fields",
55                NUM_HEADER_FIELDS, fields
56            ),
57            ParseError::InvalidPrefix(prefix) => {
58                write!(
59                    f,
60                    "invalid prefix: expected \"{}\", found \"{}\"",
61                    HEADER_PREFIX, prefix
62                )
63            }
64            ParseError::InvalidScore(err) => write!(f, "invalid score: {}", err),
65            ParseError::InvalidReferenceSequence(err) => {
66                write!(f, "invalid reference sequence: {}", err)
67            }
68            ParseError::InvalidQuerySequence(err) => write!(f, "invalid query sequence: {}", err),
69            ParseError::InvalidId(err) => write!(f, "invalid id: {}", err),
70            ParseError::EndPositionExceedsSize(chrom, pos, size) => write!(
71                f,
72                "the end position ({}) exceeds the size of the chromosome `{}` ({})",
73                pos, chrom, size
74            ),
75        }
76    }
77}
78
79impl std::error::Error for ParseError {}
80
81/// An error related to a [`Record`].
82#[derive(Debug)]
83pub enum Error {
84    /// A parse error.
85    Parse(ParseError),
86}
87
88impl std::fmt::Display for Error {
89    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
90        match self {
91            Error::Parse(err) => write!(f, "parse error: {err}"),
92        }
93    }
94}
95
96impl std::error::Error for Error {}
97
98/// A [`Result`](std::result::Result) with an [`Error`].
99type Result<T> = std::result::Result<T, Error>;
100
101////////////////////////////////////////////////////////////////////////////////////////
102// Record
103////////////////////////////////////////////////////////////////////////////////////////
104
105/// A header record within a chain file.
106#[derive(Clone, Debug, Eq, PartialEq)]
107pub struct Record {
108    /// The chain score.
109    score: usize,
110
111    /// The reference sequence.
112    reference_sequence: Sequence,
113
114    /// The query sequence.
115    query_sequence: Sequence,
116
117    /// The chain id.
118    id: usize,
119}
120
121impl Record {
122    /// Gets the score.
123    ///
124    /// # Examples
125    ///
126    /// ```
127    /// use chainfile::alignment::section::header;
128    ///
129    /// let header = "chain 0 seq0 2 + 0 2 seq0 2 - 0 2 1".parse::<header::Record>()?;
130    ///
131    /// assert_eq!(header.score(), 0);
132    /// # Ok::<(), Box<dyn std::error::Error>>(())
133    /// ```
134    pub fn score(&self) -> usize {
135        self.score
136    }
137
138    /// Gets the reference sequence.
139    ///
140    /// # Examples
141    ///
142    /// ```
143    /// use chainfile::alignment::section::header;
144    /// use omics::coordinate::Strand;
145    ///
146    /// let header = "chain 0 seq0 2 + 0 2 seq0 2 - 0 2 1".parse::<header::Record>()?;
147    ///
148    /// assert_eq!(header.reference_sequence().chromosome_name(), "seq0");
149    /// assert_eq!(header.reference_sequence().chromosome_size(), 2);
150    /// assert_eq!(header.reference_sequence().strand(), Strand::Positive);
151    /// assert_eq!(header.reference_sequence().alignment_start(), 0);
152    /// assert_eq!(header.reference_sequence().alignment_end(), 2);
153    ///
154    /// # Ok::<(), Box<dyn std::error::Error>>(())
155    /// ```
156    pub fn reference_sequence(&self) -> &Sequence {
157        &self.reference_sequence
158    }
159
160    /// Gets the query sequence.
161    ///
162    /// # Examples
163    ///
164    /// ```
165    /// use chainfile::alignment::section::header;
166    /// use omics::coordinate::Strand;
167    ///
168    /// let header = "chain 0 seq0 2 + 0 2 seq0 2 - 0 2 1".parse::<header::Record>()?;
169    ///
170    /// assert_eq!(header.query_sequence().chromosome_name(), "seq0");
171    /// assert_eq!(header.query_sequence().chromosome_size(), 2);
172    /// assert_eq!(header.query_sequence().strand(), Strand::Negative);
173    /// assert_eq!(header.query_sequence().alignment_start(), 0);
174    /// assert_eq!(header.query_sequence().alignment_end(), 2);
175    ///
176    /// # Ok::<(), Box<dyn std::error::Error>>(())
177    /// ```
178    pub fn query_sequence(&self) -> &Sequence {
179        &self.query_sequence
180    }
181
182    /// Gets the id.
183    ///
184    /// # Examples
185    ///
186    /// ```
187    /// use chainfile::alignment::section::header;
188    /// use omics::coordinate::Strand;
189    ///
190    /// let header = "chain 0 seq0 2 + 0 2 seq0 2 - 0 2 1".parse::<header::Record>()?;
191    ///
192    /// assert_eq!(header.id(), 1);
193    /// # Ok::<(), Box<dyn std::error::Error>>(())
194    /// ```
195    pub fn id(&self) -> usize {
196        self.id
197    }
198}
199
200impl FromStr for Record {
201    type Err = Error;
202
203    fn from_str(s: &str) -> Result<Self> {
204        let parts = s.split(DELIMITER).collect::<Vec<_>>();
205        if parts.len() != NUM_HEADER_FIELDS {
206            return Err(Error::Parse(ParseError::IncorrectNumberOfFields(
207                parts.len(),
208            )));
209        }
210
211        let chain = parts[0];
212        if chain != HEADER_PREFIX {
213            return Err(Error::Parse(ParseError::InvalidPrefix(chain.into())));
214        }
215
216        let score = parts[1]
217            .parse()
218            .map_err(|err| Error::Parse(ParseError::InvalidScore(err)))?;
219        let reference_sequence =
220            Sequence::try_from_str_parts(parts[2], parts[3], parts[4], parts[5], parts[6])
221                .map_err(|err| Error::Parse(ParseError::InvalidReferenceSequence(err)))?;
222        let query_sequence =
223            Sequence::try_from_str_parts(parts[7], parts[8], parts[9], parts[10], parts[11])
224                .map_err(|err| Error::Parse(ParseError::InvalidQuerySequence(err)))?;
225        let id = parts[12]
226            .parse()
227            .map_err(|err| Error::Parse(ParseError::InvalidId(err)))?;
228
229        if reference_sequence.chromosome_size() < reference_sequence.alignment_end() {
230            return Err(Error::Parse(ParseError::EndPositionExceedsSize(
231                reference_sequence.chromosome_name().to_string(),
232                reference_sequence.alignment_end(),
233                reference_sequence.chromosome_size(),
234            )));
235        }
236
237        if query_sequence.chromosome_size() < query_sequence.alignment_end() {
238            return Err(Error::Parse(ParseError::EndPositionExceedsSize(
239                query_sequence.chromosome_name().to_string(),
240                query_sequence.alignment_end(),
241                query_sequence.chromosome_size(),
242            )));
243        }
244
245        Ok(Record {
246            score,
247            reference_sequence,
248            query_sequence,
249            id,
250        })
251    }
252}
253
254impl std::fmt::Display for Record {
255    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
256        write!(
257            f,
258            "{}{}{}{}{}{}{}{}{}",
259            HEADER_PREFIX,
260            DELIMITER,
261            self.score,
262            DELIMITER,
263            self.reference_sequence,
264            DELIMITER,
265            self.query_sequence,
266            DELIMITER,
267            self.id
268        )
269    }
270}
271
#[cfg(test)]
mod tests {
    use omics::coordinate::Strand;

    use super::*;

    /// A well-formed header line used by the tests that exercise the success
    /// path.
    const VALID_HEADER: &str = "chain 0 seq0 2 + 0 2 seq0 2 - 0 2 1";

    #[test]
    fn parse() {
        let header = VALID_HEADER.parse::<Record>().unwrap();

        assert_eq!(header.score(), 0);

        assert_eq!(header.reference_sequence().chromosome_name(), "seq0");
        assert_eq!(header.reference_sequence().chromosome_size(), 2);
        assert_eq!(header.reference_sequence().strand(), Strand::Positive);
        assert_eq!(header.reference_sequence().alignment_start(), 0);
        assert_eq!(header.reference_sequence().alignment_end(), 2);

        assert_eq!(header.query_sequence().chromosome_name(), "seq0");
        assert_eq!(header.query_sequence().chromosome_size(), 2);
        assert_eq!(header.query_sequence().strand(), Strand::Negative);
        assert_eq!(header.query_sequence().alignment_start(), 0);
        assert_eq!(header.query_sequence().alignment_end(), 2);

        assert_eq!(header.id(), 1);
    }

    #[test]
    fn incorrect_number_of_fields() {
        // The prefix is valid and only the trailing id is missing, so the
        // field count is the sole fault in this line.
        let err = "chain 0 seq0 2 + 0 2 seq0 2 - 0 2"
            .parse::<Record>()
            .unwrap_err();

        assert!(matches!(
            err,
            Error::Parse(ParseError::IncorrectNumberOfFields(12))
        ));

        assert_eq!(
            err.to_string(),
            "parse error: invalid number of fields in header: expected 13 fields, found 12 fields"
        );
    }

    #[test]
    fn invalid_prefix() {
        let err = "foo 0 seq0 2 + 0 2 seq0 2 - 0 2 1"
            .parse::<Record>()
            .unwrap_err();

        assert!(matches!(err, Error::Parse(ParseError::InvalidPrefix(_))));
        assert_eq!(
            err.to_string(),
            "parse error: invalid prefix: expected \"chain\", found \"foo\""
        );
    }

    #[test]
    fn invalid_score() {
        let err = "chain ? seq0 2 + 0 2 seq0 2 - 0 2 1"
            .parse::<Record>()
            .unwrap_err();

        assert!(matches!(err, Error::Parse(ParseError::InvalidScore(_))));
        assert_eq!(
            err.to_string(),
            "parse error: invalid score: invalid digit found in string"
        );
    }

    #[test]
    fn invalid_reference_sequence() {
        let err = "chain 0 seq0 ? + 0 2 seq0 2 - 0 2 1"
            .parse::<Record>()
            .unwrap_err();

        assert!(matches!(
            err,
            Error::Parse(ParseError::InvalidReferenceSequence(_))
        ));

        assert_eq!(
            err.to_string(),
            "parse error: invalid reference sequence: parse error: invalid chromosome size: \
             invalid digit found in string"
        );
    }

    #[test]
    fn invalid_query_sequence() {
        let err = "chain 0 seq0 2 + 0 2 seq0 ? - 0 2 1"
            .parse::<Record>()
            .unwrap_err();

        assert!(matches!(
            err,
            Error::Parse(ParseError::InvalidQuerySequence(_))
        ));

        assert_eq!(
            err.to_string(),
            "parse error: invalid query sequence: parse error: invalid chromosome size: invalid \
             digit found in string"
        );
    }

    #[test]
    fn invalid_id() {
        let err = "chain 0 seq0 2 + 0 2 seq0 2 - 0 2 ?"
            .parse::<Record>()
            .unwrap_err();

        assert!(matches!(err, Error::Parse(ParseError::InvalidId(_))));
        assert_eq!(
            err.to_string(),
            "parse error: invalid id: invalid digit found in string"
        );
    }

    #[test]
    fn end_is_greater_than_size_reference() {
        let err = "chain 0 seq0 2 + 0 3 seq0 2 - 0 1 1"
            .parse::<Record>()
            .unwrap_err();

        assert!(matches!(
            err,
            Error::Parse(ParseError::EndPositionExceedsSize(_, _, _))
        ));

        assert_eq!(
            err.to_string(),
            "parse error: the end position (3) exceeds the size of the chromosome `seq0` (2)"
        );
    }

    #[test]
    fn end_is_greater_than_size_query() {
        let err = "chain 0 seq0 2 + 0 1 seq0 2 - 0 3 1"
            .parse::<Record>()
            .unwrap_err();

        assert!(matches!(
            err,
            Error::Parse(ParseError::EndPositionExceedsSize(_, _, _))
        ));

        assert_eq!(
            err.to_string(),
            "parse error: the end position (3) exceeds the size of the chromosome `seq0` (2)"
        );
    }

    #[test]
    fn display() {
        let header = VALID_HEADER.parse::<Record>().unwrap();

        // Displaying a parsed record reproduces the line it was parsed from.
        assert_eq!(header.to_string(), VALID_HEADER);
    }
}