atglib/gtf/
reader.rs

1use std::collections::HashMap;
2use std::convert::TryFrom;
3use std::fs::File;
4use std::io::{BufRead, BufReader};
5use std::path::Path;
6use std::str::FromStr;
7
8use crate::gtf::{GtfFeature, GtfRecord, GtfRecordsGroup};
9use crate::models::{Transcript, TranscriptRead, Transcripts};
10use crate::utils::errors::ParseGtfError;
11use crate::utils::errors::ReadWriteError;
12
13use super::record::UncheckedGtfRecord;
14
15/// Parses GTF data and creates [`Transcript`]s.
16///
17/// GTF data can be read from a file, stdin or remote sources
18/// All sources are supported that provide a `std::io::Read` implementation.
19///
20/// # Examples
21///
22/// ```rust
23/// use atglib::gtf::Reader;
24/// use atglib::models::TranscriptRead;
25///
26/// // create a reader from the tests GTF file
27/// let reader = Reader::from_file("tests/data/example.gtf");
28/// assert_eq!(reader.is_ok(), true);
29///
30/// // parse the GTF file
31/// let transcripts = reader
32///     .unwrap()
33///     .transcripts()
34///     .unwrap();
35///
36/// assert_eq!(transcripts.len(), 27);
37/// ```
38pub struct Reader<R> {
39    inner: std::io::BufReader<R>,
40    line_content: String,
41}
42
43impl Reader<File> {
44    /// Creates a Reader instance that reads from a File
45    ///
46    /// Use this method when you want to read from a GTF file
47    /// on your local file system
48    ///
49    /// # Examples
50    ///
51    /// ```rust
52    /// use atglib::gtf::Reader;
53    /// use atglib::models::TranscriptRead;
54    ///
55    /// // create a reader from the tests GTF file
56    /// let reader = Reader::from_file("tests/data/example.gtf");
57    /// assert_eq!(reader.is_ok(), true);
58    ///
59    /// // parse the GTF file
60    /// let transcripts = reader
61    ///     .unwrap()
62    ///     .transcripts()
63    ///     .unwrap();
64    ///
65    /// assert_eq!(transcripts.len(), 27);
66    /// ```
67    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, ReadWriteError> {
68        match File::open(path.as_ref()) {
69            Ok(file) => Ok(Self::new(file)),
70            Err(err) => Err(ReadWriteError::new(err)),
71        }
72    }
73}
74
75impl<R: std::io::Read> Reader<R> {
76    /// creates a new Reader instance from any `std::io::Read` object
77    ///
78    /// Use this method when you want to read from stdin or from
79    /// a remote source, e.g. via HTTP
80    pub fn new(reader: R) -> Self {
81        Reader {
82            inner: BufReader::new(reader),
83            line_content: String::with_capacity(200),
84        }
85    }
86
87    /// Creates a new Reader instance with a known capcity
88    ///
89    /// Use this when you know the size of your GTF source
90    pub fn with_capacity(capacity: usize, reader: R) -> Self {
91        Reader {
92            inner: BufReader::with_capacity(capacity, reader),
93            line_content: String::with_capacity(200),
94        }
95    }
96
97    /// Returns one line of a GTF file as `GtfRecord`
98    ///
99    /// This method should rarely be used. GTF files can contain unordered
100    /// records and handling lines individually is rarely desired.
101    fn line(&mut self) -> Option<Result<GtfRecord, ParseGtfError>> {
102        self.line_content.clear();
103        match self.inner.read_line(&mut self.line_content) {
104            Ok(_) => {}
105            Err(x) => {
106                return Some(Err(ParseGtfError {
107                    message: x.to_string(),
108                }))
109            }
110        }
111
112        if self.line_content.starts_with('#') {
113            return self.line();
114        }
115
116        if self.line_content.is_empty() {
117            return None;
118        }
119
120        UncheckedGtfRecord::from_str(&self.line_content)
121            .map(|record| {
122                if record.standard_feature() {
123                    Some(GtfRecord::try_from(record))
124                } else {
125                    self.line()
126                }
127            })
128            .unwrap_or_else(|err| Some(Err(err)))
129    }
130}
131
132impl<R: std::io::Read> TranscriptRead for Reader<R> {
133    /// Reads in GTF data and returns the final list of `Transcripts`
134    ///
135    /// # Examples
136    ///
137    /// ```rust
138    /// use atglib::gtf::Reader;
139    /// use atglib::models::TranscriptRead;
140    ///
141    /// // create a reader from the tests GTF file
142    /// let reader = Reader::from_file("tests/data/example.gtf");
143    ///
144    /// // parse the GTF file
145    /// let transcripts = reader
146    ///     .unwrap()
147    ///     .transcripts()
148    ///     .unwrap();
149    ///
150    /// assert_eq!(transcripts.len(), 27);
151    /// ```
152    fn transcripts(&mut self) -> Result<Transcripts, ReadWriteError> {
153        let mut transcript_hashmap: HashMap<String, GtfRecordsGroup> = HashMap::new();
154        while let Some(line) = self.line() {
155            let gtf_record = line?;
156            let transcript = transcript_hashmap
157                .entry(gtf_record.transcript().to_string())
158                .or_insert_with(|| GtfRecordsGroup::new(gtf_record.transcript()));
159
160            match gtf_record.feature() {
161                GtfFeature::Exon => transcript.add_exon(gtf_record),
162                GtfFeature::CDS => transcript.add_exon(gtf_record),
163                GtfFeature::UTR | GtfFeature::UTR3 | GtfFeature::UTR5 => {
164                    transcript.add_exon(gtf_record)
165                }
166                GtfFeature::StartCodon => {
167                    transcript.add_exon(gtf_record);
168                }
169                GtfFeature::StopCodon => transcript.add_exon(gtf_record),
170                _ => {}
171            }
172        }
173
174        let mut res = Transcripts::with_capacity(transcript_hashmap.len());
175
176        for (_, gtf_transcript) in transcript_hashmap.drain() {
177            match Transcript::try_from(gtf_transcript) {
178                Ok(transcript) => res.push(transcript),
179                Err(err) => {
180                    return Err(ReadWriteError::new(format!("Error parsing {err}")));
181                }
182            }
183        }
184        Ok(res)
185    }
186}
187
188#[cfg(test)]
189mod test_nm_001385228 {
190    use super::*;
191
192    #[test]
193    fn test_read() {
194        let mut reader = Reader::from_file("tests/data/NM_001385228.1_2.gtf").unwrap();
195        let tr = match reader.transcripts() {
196            Ok(res) => res,
197            _ => panic!("No transcripts could be read"),
198        };
199
200        assert_eq!(tr.len(), 1);
201        let t = tr.by_name("NM_001385228.1_2")[0];
202        assert_eq!(t.exons().len(), 9);
203        assert_eq!(t.cds_start().unwrap(), 206105119);
204        assert_eq!(t.cds_end().unwrap(), 206135359);
205
206        let start = t.start_codon();
207        assert_eq!(start.len(), 1);
208
209        let stop = t.stop_codon();
210        assert_eq!(stop.len(), 2);
211
212        assert!(!t.exons()[0].is_coding());
213        assert!(!t.exons()[1].is_coding());
214        assert!(!t.exons()[2].is_coding());
215        assert!(!t.exons()[3].is_coding());
216        assert!(!t.exons()[4].is_coding());
217        assert!(t.exons()[5].is_coding());
218        assert!(t.exons()[6].is_coding());
219        assert!(t.exons()[7].is_coding());
220        assert!(!t.exons()[8].is_coding());
221
222        assert_eq!(t.exons()[5].cds_start().unwrap(), 206105119);
223        assert_eq!(t.exons()[6].cds_start().unwrap(), 206105123);
224        assert_eq!(t.exons()[7].cds_start().unwrap(), 206135293);
225
226        assert_eq!(t.exons()[5].cds_end().unwrap(), 206105120);
227        assert_eq!(t.exons()[6].cds_end().unwrap(), 206105206);
228        assert_eq!(t.exons()[7].cds_end().unwrap(), 206135359);
229    }
230}
231
232#[cfg(test)]
233mod test_nm_201550 {
234    use super::*;
235
236    #[test]
237    fn test_read() {
238        let mut reader = Reader::from_file("tests/data/NM_201550.4.gtf")
239            .expect("Something failed reading the GTF file NM_201550.4.gtf");
240        let tr = match reader.transcripts() {
241            Ok(res) => res,
242            _ => panic!("No transcripts could be read"),
243        };
244        assert_eq!(tr.len(), 1);
245        let t = tr.by_name("NM_201550.4")[0];
246        assert_eq!(t.exons().len(), 1);
247        assert_eq!(t.cds_start().unwrap(), 70003785);
248        assert_eq!(t.cds_end().unwrap(), 70004618);
249
250        let start = t.start_codon();
251        assert_eq!(start.len(), 1);
252
253        let stop = t.stop_codon();
254        assert_eq!(stop.len(), 1);
255
256        assert!(t.exons()[0].is_coding());
257    }
258}
259
260/*
261This one fails due to book-ended exons in the input file
262#[cfg(test)]
263mod test_nm_001371720 {
264    use super::*;
265
266    #[test]
267    fn test_read() {
268        let mut reader = Reader::from_file("tests/data/NM_001371720.1.gtf")
269            .expect("Something failed reading the GTF file NM_001371720.1.gtf");
270        let tr = match reader.transcripts() {
271            Ok(res) => res,
272            _ => panic!("No transcripts could be read")
273        };
274        assert_eq!(tr.len(), 1);
275        let t = tr.by_name("NM_001371720.1")[0];
276        assert_eq!(t.exons().len(), 8);
277        assert_eq!(t.cds_start().unwrap(), 155158611);
278        assert_eq!(t.cds_end().unwrap(), 155162634);
279
280        let start = t.start_codon();
281        assert_eq!(start.len(), 1);
282
283        let stop = t.stop_codon();
284        assert_eq!(stop.len(), 1);
285
286        assert_eq!(t.exons()[0].is_coding(), true);
287        assert_eq!(t.exons()[1].is_coding(), true);
288        assert_eq!(t.exons()[2].is_coding(), true);
289        assert_eq!(t.exons()[3].is_coding(), true);
290        assert_eq!(t.exons()[4].is_coding(), true);
291        assert_eq!(t.exons()[5].is_coding(), true);
292        assert_eq!(t.exons()[6].is_coding(), true);
293        assert_eq!(t.exons()[7].is_coding(), true);
294    }
295}
296*/
297
298#[cfg(test)]
299mod test_ighm {
300    use super::*;
301    use crate::models::CdsStat;
302
303    #[test]
304    fn test_read() {
305        let mut reader = Reader::from_file("tests/data/id-IGHM.gtf")
306            .expect("Something failed reading the GTF file id-IGHM.gtf");
307        let tr = match reader.transcripts() {
308            Ok(res) => res,
309            _ => panic!("No transcripts could be read"),
310        };
311        assert_eq!(tr.len(), 1);
312        let t = tr.by_name("id-IGHM")[0];
313        assert_eq!(t.exons().len(), 6);
314        assert_eq!(t.cds_start().unwrap(), 106318298);
315        assert_eq!(t.cds_end().unwrap(), 106322322);
316
317        let start = t.start_codon();
318        assert_eq!(start.len(), 1);
319
320        let stop = t.stop_codon();
321        assert_eq!(stop.len(), 1);
322
323        assert!(t.exons()[0].is_coding());
324        assert!(t.exons()[1].is_coding());
325        assert!(t.exons()[2].is_coding());
326        assert!(t.exons()[3].is_coding());
327        assert!(t.exons()[4].is_coding());
328
329        assert_eq!(t.cds_start_codon_stat(), CdsStat::Complete);
330        assert_eq!(t.cds_stop_codon_stat(), CdsStat::Incomplete);
331    }
332}
333
334#[cfg(test)]
335mod tests {
336    use super::*;
337    use crate::tests::transcripts;
338
339    #[test]
340    fn test_nm_001365057() {
341        let transcripts = Reader::from_file("tests/data/NM_001365057.2.gtf")
342            .unwrap()
343            .transcripts()
344            .unwrap();
345
346        assert_eq!(
347            transcripts.by_name("NM_001365057.2")[0],
348            &transcripts::nm_001365057()
349        )
350    }
351
352    #[test]
353    fn test_nm_001365408() {
354        let transcripts = Reader::from_file("tests/data/NM_001365408.1.gtf")
355            .unwrap()
356            .transcripts()
357            .unwrap();
358
359        assert_eq!(
360            transcripts.by_name("NM_001365408.1")[0],
361            &transcripts::nm_001365408()
362        )
363    }
364
365    #[test]
366    fn test_nm_001371720() {
367        let transcripts = Reader::from_file("tests/data/NM_001371720.1.gtf")
368            .unwrap()
369            .transcripts()
370            .unwrap();
371
372        assert_eq!(
373            transcripts.by_name("NM_001371720.1")[0],
374            &transcripts::nm_001371720(true)
375        )
376    }
377
378    #[test]
379    fn test_nm_201550() {
380        let transcripts = Reader::from_file("tests/data/NM_201550.4.gtf")
381            .unwrap()
382            .transcripts()
383            .unwrap();
384
385        assert_eq!(
386            transcripts.by_name("NM_201550.4")[0],
387            &transcripts::nm_201550()
388        )
389    }
390
391    #[test]
392    fn test_gene_line_returns_none() {
393        // no newline at the end
394        let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
395        let mut reader = Reader::new(transcript);
396        assert!(reader.line().is_none());
397
398        // with newline at the end
399        let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";\n".as_bytes();
400        let mut reader = Reader::new(transcript);
401        assert!(reader.line().is_none());
402
403        // no transcript and no gene tag
404        let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\tgene_name \"CES2\";\n".as_bytes();
405        let mut reader = Reader::new(transcript);
406        assert!(reader.line().is_none());
407
408        // no transcript tag
409        let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; gene_name \"CES2\";\n".as_bytes();
410        let mut reader = Reader::new(transcript);
411        assert!(reader.line().is_none());
412
413        // no gene tag
414        let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\ttranscript_id \"NM_001365408.1\"; gene_name \"CES2\";\n".as_bytes();
415        let mut reader = Reader::new(transcript);
416        assert!(reader.line().is_none());
417    }
418
419    #[test]
420    fn test_gene_line_is_skipped() {
421        let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";\n\
422        chr1\tncbiRefSeq.2021-05-17\texon\t206100298\t206100445\t.\t-\t.\tgene_id \"SRGAP2B\"; transcript_id \"NM_001385228.1_2\"; exon_number \"9\"; exon_id \"NM_001385228.1_2.9\"; gene_name \"SRGAP2B\";".as_bytes();
423        let mut reader = Reader::new(transcript);
424        assert!(reader
425            .line()
426            .expect("The record contains one proper line")
427            .is_ok());
428        assert!(reader.line().is_none());
429    }
430
431    #[test]
432    fn test_gene_line_contains_invalid_data() {
433        let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\tINVALID\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";\n".as_bytes();
434        let mut reader = Reader::new(transcript);
435        assert!(reader
436            .line()
437            .expect("The record contains one proper line")
438            .is_err());
439        assert!(reader.line().is_none());
440    }
441
442    #[test]
443    fn test_without_gene_or_transcript_errors() {
444        let transcript = "chr16\tncbiRefSeq.2021-05-17\texon\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";\n".as_bytes();
445        let mut reader = Reader::new(transcript);
446        assert!(reader
447            .line()
448            .expect("The record contains one proper line")
449            .is_ok());
450        assert!(reader.line().is_none());
451
452        let transcript = "chr16\tncbiRefSeq.2021-05-17\texon\t66969419\t66978999\t.\t+\t.\ttranscript_id \"NM_001365408.1\"; gene_name \"CES2\";\n".as_bytes();
453        let mut reader = Reader::new(transcript);
454        assert!(reader
455            .line()
456            .expect("The record contains one proper line")
457            .is_err());
458        assert!(reader.line().is_none());
459
460        let transcript = "chr16\tncbiRefSeq.2021-05-17\texon\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; gene_name \"CES2\";\n".as_bytes();
461        let mut reader = Reader::new(transcript);
462        assert!(reader
463            .line()
464            .expect("The record contains one proper line")
465            .is_err());
466        assert!(reader.line().is_none());
467    }
468
469    #[test]
470    fn test_5_utr() {
471        let transcript = "chr16\tncbiRefSeq.2021-05-17\t5UTR\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
472        let mut reader = Reader::new(transcript);
473        assert_eq!(
474            reader
475                .line()
476                .expect("The record contains one proper line")
477                .expect("The data is correct")
478                .feature(),
479            &GtfFeature::UTR5
480        );
481
482        let transcript = "chr16\tncbiRefSeq.2021-05-17\tfive_prime_utr\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
483        let mut reader = Reader::new(transcript);
484        assert_eq!(
485            reader
486                .line()
487                .expect("The record contains one proper line")
488                .expect("The data is correct")
489                .feature(),
490            &GtfFeature::UTR5
491        );
492    }
493
494    #[test]
495    fn test_3_utr() {
496        let transcript = "chr16\tncbiRefSeq.2021-05-17\t3UTR\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
497        let mut reader = Reader::new(transcript);
498        assert_eq!(
499            reader
500                .line()
501                .expect("The record contains one proper line")
502                .expect("The data is correct")
503                .feature(),
504            &GtfFeature::UTR3
505        );
506
507        let transcript = "chr16\tncbiRefSeq.2021-05-17\tthree_prime_utr\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
508        let mut reader = Reader::new(transcript);
509        assert_eq!(
510            reader
511                .line()
512                .expect("The record contains one proper line")
513                .expect("The data is correct")
514                .feature(),
515            &GtfFeature::UTR3
516        );
517    }
518
519    #[test]
520    fn test_utr() {
521        let transcript = "chr16\tncbiRefSeq.2021-05-17\tUTR\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
522        let mut reader = Reader::new(transcript);
523        assert_eq!(
524            reader
525                .line()
526                .expect("The record contains one proper line")
527                .expect("The data is correct")
528                .feature(),
529            &GtfFeature::UTR
530        );
531    }
532}