atglib/refgene/
reader.rs

1use std::convert::TryFrom;
2use std::fs::File;
3use std::io::{BufRead, BufReader};
4use std::path::Path;
5use std::str::FromStr;
6
7use crate::models::{CdsStat, Exon, Frame, Strand, TranscriptBuilder};
8use crate::models::{Transcript, TranscriptRead, Transcripts};
9use crate::refgene::constants::*;
10use crate::utils::errors::{ParseRefGeneError, ReadWriteError};
11use crate::utils::exon_cds_overlap;
12
13/// Parses RefGene data and creates [`Transcript`]s.
14///
15/// RefGene data can be read from a file, stdin or remote sources
16/// All sources are supported that provide a `std::io::Read` implementation.
17///
18/// # Examples
19///
20/// ```rust
21/// use atglib::refgene::Reader;
22/// use atglib::models::TranscriptRead;
23///
24/// // create a reader from the tests RefGene file
25/// let reader = Reader::from_file("tests/data/example.refgene");
26/// assert_eq!(reader.is_ok(), true);
27///
28/// // parse the RefGene file
29/// let transcripts = reader
30///     .unwrap()
31///     .transcripts()
32///     .unwrap();
33///
34/// assert_eq!(transcripts.len(), 27);
35/// ```
36pub struct Reader<R> {
37    inner: std::io::BufReader<R>,
38}
39
40impl Reader<File> {
41    /// Creates a Reader instance that reads from a File
42    ///
43    /// Use this method when you want to read from a RefGene file
44    /// on your local file system
45    ///
46    /// # Examples
47    ///
48    /// ```rust
49    /// use atglib::refgene::Reader;
50    /// use atglib::models::TranscriptRead;
51    ///
52    /// // create a reader from the tests RefGene file
53    /// let reader = Reader::from_file("tests/data/example.refgene");
54    /// assert_eq!(reader.is_ok(), true);
55    ///
56    /// // parse the RefGene file
57    /// let transcripts = reader
58    ///     .unwrap()
59    ///     .transcripts()
60    ///     .unwrap();
61    ///
62    /// assert_eq!(transcripts.len(), 27);
63    /// ```
64    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, ReadWriteError> {
65        match File::open(path.as_ref()) {
66            Ok(file) => Ok(Self::new(file)),
67            Err(err) => Err(ReadWriteError::new(err)),
68        }
69    }
70}
71
72impl<R: std::io::Read> Reader<R> {
73    /// creates a new Reader instance from any `std::io::Read` object
74    ///
75    /// Use this method when you want to read from stdin or from
76    /// a remote source, e.g. via HTTP
77    pub fn new(reader: R) -> Self {
78        Reader {
79            inner: BufReader::new(reader),
80        }
81    }
82
83    /// Creates a new Reader instance with a known capcity
84    ///
85    /// Use this when you know the size of your RefGene source
86    pub fn with_capacity(capacity: usize, reader: R) -> Self {
87        Reader {
88            inner: BufReader::with_capacity(capacity, reader),
89        }
90    }
91
92    /// Returns one line of a RefGene file as `Transcript`
93    ///
94    /// This method should rarely be used. GTF files can contain unordered
95    /// records and handling lines individually is rarely desired.
96    pub fn line(&mut self) -> Option<Result<Transcript, ParseRefGeneError>> {
97        let mut line = String::new();
98        match self.inner.read_line(&mut line) {
99            Ok(_) => {}
100            Err(x) => {
101                return Some(Err(ParseRefGeneError {
102                    message: x.to_string(),
103                }))
104            }
105        }
106
107        if line.starts_with('#') {
108            return self.line();
109        }
110
111        if line.is_empty() {
112            None
113        } else {
114            let cols: Vec<&str> = line.trim().split('\t').collect();
115            Some(Transcript::try_from(cols))
116        }
117    }
118}
119
120impl<R: std::io::Read> TranscriptRead for Reader<R> {
121    /// Reads in RefGene data and returns the final list of `Transcripts`
122    ///
123    /// # Examples
124    ///
125    /// ```rust
126    /// use atglib::refgene::Reader;
127    /// use atglib::models::TranscriptRead;
128    ///
129    /// // create a reader from the tests RefGene file
130    /// let reader = Reader::from_file("tests/data/example.refgene");
131    /// assert_eq!(reader.is_ok(), true);
132    ///
133    /// // parse the RefGene file
134    /// let transcripts = reader
135    ///     .unwrap()
136    ///     .transcripts()
137    ///     .unwrap();
138    ///
139    /// assert_eq!(transcripts.len(), 27);
140    /// ```
141    fn transcripts(&mut self) -> Result<Transcripts, ReadWriteError> {
142        let mut res = Transcripts::new();
143        while let Some(line) = self.line() {
144            match line {
145                Ok(t) => res.push(t),
146                Err(x) => return Err(ReadWriteError::from(x)),
147            }
148        }
149        Ok(res)
150    }
151}
152
153impl TryFrom<Vec<&str>> for Transcript {
154    type Error = ParseRefGeneError;
155
156    /// Returns a ```Transcript``` based on features of the GTF file,
157    /// belonging to one transcript
158    fn try_from(cols: Vec<&str>) -> Result<Self, ParseRefGeneError> {
159        if cols.len() != N_REFGENE_COLUMNS {
160            return Err(ParseRefGeneError {
161                message: format!(
162                    "Invalid number of columns in line\nvv\n{}\n^^",
163                    cols.join("\t")
164                ),
165            });
166        }
167
168        let bin = match cols[BIN_COL].parse::<u16>() {
169            Ok(x) => Some(x),
170            _ => None,
171        };
172        let strand = match Strand::from_str(cols[STRAND_COL]) {
173            Ok(x) => x,
174            Err(message) => return Err(ParseRefGeneError { message }),
175        };
176        let mut exons = instantiate_exons(&cols)?;
177        let cds_start_stat = match CdsStat::from_str(cols[CDS_START_STAT_COL]) {
178            Ok(x) => x,
179            Err(message) => return Err(ParseRefGeneError { message }),
180        };
181
182        let cds_end_stat = match CdsStat::from_str(cols[CDS_END_STAT_COL]) {
183            Ok(x) => x,
184            Err(message) => return Err(ParseRefGeneError { message }),
185        };
186        let score = match cols[SCORE_COL].parse::<f32>() {
187            Ok(x) => Some(x),
188            _ => None,
189        };
190
191        let mut transcript = TranscriptBuilder::new()
192            .bin(bin)
193            .name(cols[TRANSCRIPT_COL])
194            .chrom(cols[CHROMOSOME_COL])
195            .strand(strand)
196            .gene(cols[GENE_SYMBOL_COL])
197            .cds_start_stat(cds_start_stat)
198            .cds_end_stat(cds_end_stat)
199            .score(score)
200            .build()
201            .unwrap();
202        transcript.append_exons(&mut exons);
203        Ok(transcript)
204    }
205}
206
207/// Returns a Vector of `Exon`s
208///
209/// It parses the columns that specify the start and end positions
210/// of an exon, as well as frame etc.
211/// It also check for CDS overlap.
212fn instantiate_exons(cols: &[&str]) -> Result<Vec<Exon>, ParseRefGeneError> {
213    // RefGene specifies the number of exons.
214    // This assumes this number to be correct.
215    let exon_count = cols[EXON_COUNT_COL].parse::<usize>().unwrap();
216
217    // Create an empty vector to hold all exons
218    let mut exons: Vec<Exon> = Vec::with_capacity(exon_count);
219
220    // create temporary vectors for the start, end coordinates
221    // and the frame-offsets for every exon
222    let starts: Vec<&str> = cols[EXON_STARTS_COL]
223        .trim_end_matches(',')
224        .split(',')
225        .collect();
226    let ends: Vec<&str> = cols[EXON_ENDS_COL]
227        .trim_end_matches(',')
228        .split(',')
229        .collect();
230    let frame_offsets: Vec<&str> = cols[EXON_FRAMES_COL]
231        .trim_end_matches(',')
232        .split(',')
233        .collect();
234
235    // Parse the cds-start and end positions
236    // Also check if the transcript is coding at all
237    let (coding, cds_start, cds_end) = match (
238        cols[CDS_START_COL].parse::<u32>(),
239        cols[CDS_END_COL].parse::<u32>(),
240    ) {
241        (Ok(start), Ok(end)) => (true, Some(start), Some(end)),
242        _ => (false, None, None),
243    };
244
245    // Iterate through all exons and create `Exon` instances
246    for i in 0..exon_count {
247        let start = starts
248            .get(i)
249            .ok_or("Too few exon starts in input")?
250            // RefGene start positions are excluded
251            // but atg includes them. So we change the start coordinate
252            .parse::<u32>()?
253            + 1;
254        let end = ends
255            .get(i)
256            .ok_or("Too few exon ends in input")?
257            .parse::<u32>()?;
258
259        // Check of the exon is coding
260        let exon_cds = if coding {
261            exon_cds_overlap(
262                &start,
263                &end,
264                // RefGene start positions are excluded
265                // but atg includes them. So we change the start coordinate
266                // unwrapping is safe here, this is checked previously
267                &(cds_start.unwrap() + 1),
268                &cds_end.unwrap(),
269            )
270        } else {
271            (None, None)
272        };
273
274        exons.push(Exon::new(
275            start,
276            end,
277            exon_cds.0,
278            exon_cds.1,
279            Frame::from_refgene(frame_offsets.get(i).ok_or("Too few exon Frame offsets")?)?,
280        ));
281    }
282    Ok(exons)
283}
284
285#[cfg(test)]
286mod tests {
287    use super::*;
288    use crate::tests::transcripts;
289
290    #[test]
291    fn test_parse_exons_no_cds() {
292        let cols = vec![
293            "585",
294            "NR_046018.2",
295            "chr1",
296            "+",
297            "11873",
298            "14409",
299            "14409",
300            "14409",
301            "3",
302            "11873,12612,13220,",
303            "12227,12721,14409,",
304            "0",
305            "DDX11L1",
306            "unk",
307            "unk",
308            "-1,-1,-1,",
309        ];
310        let exons = instantiate_exons(&cols).unwrap();
311
312        assert_eq!(exons.len(), 3);
313
314        assert_eq!(exons[0].start(), 11874);
315        assert_eq!(exons[0].end(), 12227);
316        assert_eq!(*exons[0].cds_start(), None);
317        assert_eq!(*exons[0].cds_end(), None);
318
319        assert_eq!(exons[1].start(), 12613);
320        assert_eq!(exons[1].end(), 12721);
321        assert_eq!(*exons[1].cds_start(), None);
322        assert_eq!(*exons[1].cds_end(), None);
323
324        assert_eq!(exons[2].start(), 13221);
325        assert_eq!(exons[2].end(), 14409);
326        assert_eq!(*exons[2].cds_start(), None);
327        assert_eq!(*exons[2].cds_end(), None);
328    }
329
330    #[test]
331    fn test_missing_exon_stop() {
332        let cols = vec![
333            "585",
334            "NR_046018.2",
335            "chr1",
336            "+",
337            "11873",
338            "14409",
339            "14409",
340            "14409",
341            "3",
342            "11873,12612,13220",
343            "12227,12721,",
344            "0",
345            "DDX11L1",
346            "unk",
347            "unk",
348            "-1,-1,-1,",
349        ];
350        let exons = instantiate_exons(&cols);
351
352        assert!(exons.is_err());
353        assert_eq!(
354            exons.unwrap_err().message,
355            "Too few exon ends in input".to_string()
356        );
357    }
358
359    #[test]
360    fn test_missing_exon_start() {
361        let cols = vec![
362            "585",
363            "NR_046018.2",
364            "chr1",
365            "+",
366            "11873",
367            "14409",
368            "14409",
369            "14409",
370            "3",
371            "11873,12612,",
372            "12227,12721,14409,",
373            "0",
374            "DDX11L1",
375            "unk",
376            "unk",
377            "-1,-1,-1,",
378        ];
379        let exons = instantiate_exons(&cols);
380
381        assert!(exons.is_err());
382        assert_eq!(
383            exons.unwrap_err().message,
384            "Too few exon starts in input".to_string()
385        );
386    }
387
388    #[test]
389    fn test_missing_exon_frame() {
390        let cols = vec![
391            "585",
392            "NR_046018.2",
393            "chr1",
394            "+",
395            "11873",
396            "14409",
397            "14409",
398            "14409",
399            "3",
400            "11873,12612,13220",
401            "12227,12721,14409,",
402            "0",
403            "DDX11L1",
404            "unk",
405            "unk",
406            "-1,-1,",
407        ];
408        let exons = instantiate_exons(&cols);
409
410        assert!(exons.is_err());
411        assert_eq!(
412            exons.unwrap_err().message,
413            "Too few exon Frame offsets".to_string()
414        );
415    }
416
417    #[test]
418    fn test_wrong_exon_frame() {
419        let cols = vec![
420            "585",
421            "NR_046018.2",
422            "chr1",
423            "+",
424            "11873",
425            "14409",
426            "14409",
427            "14409",
428            "3",
429            "11873,12612,13220",
430            "12227,12721,14409,",
431            "0",
432            "DDX11L1",
433            "unk",
434            "unk",
435            "-1,/,-1",
436        ];
437        let exons = instantiate_exons(&cols);
438
439        assert!(exons.is_err());
440        assert_eq!(
441            exons.unwrap_err().message,
442            "invalid frame indicator /".to_string()
443        );
444    }
445
446    #[test]
447    fn test_nm_001365057() {
448        let transcripts = Reader::from_file("tests/data/NM_001365057.2.refgene")
449            .unwrap()
450            .transcripts()
451            .unwrap();
452
453        assert_eq!(
454            transcripts.by_name("NM_001365057.2")[0],
455            &transcripts::nm_001365057()
456        )
457    }
458
459    #[test]
460    fn test_nm_001365408() {
461        let transcripts = Reader::from_file("tests/data/NM_001365408.1.refgene")
462            .unwrap()
463            .transcripts()
464            .unwrap();
465
466        assert_eq!(
467            transcripts.by_name("NM_001365408.1")[0],
468            &transcripts::nm_001365408()
469        )
470    }
471
472    #[test]
473    fn test_nm_001371720() {
474        let transcripts = Reader::from_file("tests/data/NM_001371720.1.refgene")
475            .unwrap()
476            .transcripts()
477            .unwrap();
478
479        assert_eq!(
480            transcripts.by_name("NM_001371720.1")[0],
481            &transcripts::nm_001371720(false)
482        )
483    }
484
485    #[test]
486    fn test_nm_201550() {
487        let transcripts = Reader::from_file("tests/data/NM_201550.4.refgene")
488            .unwrap()
489            .transcripts()
490            .unwrap();
491
492        assert_eq!(
493            transcripts.by_name("NM_201550.4")[0],
494            &transcripts::nm_201550()
495        )
496    }
497}