1use std::collections::HashMap;
2use std::convert::TryFrom;
3use std::fs::File;
4use std::io::{BufRead, BufReader};
5use std::path::Path;
6use std::str::FromStr;
7
8use crate::gtf::{GtfFeature, GtfRecord, GtfRecordsGroup};
9use crate::models::{Transcript, TranscriptRead, Transcripts};
10use crate::utils::errors::ParseGtfError;
11use crate::utils::errors::ReadWriteError;
12
13use super::record::UncheckedGtfRecord;
14
15pub struct Reader<R> {
39 inner: std::io::BufReader<R>,
40 line_content: String,
41}
42
43impl Reader<File> {
44 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, ReadWriteError> {
68 match File::open(path.as_ref()) {
69 Ok(file) => Ok(Self::new(file)),
70 Err(err) => Err(ReadWriteError::new(err)),
71 }
72 }
73}
74
75impl<R: std::io::Read> Reader<R> {
76 pub fn new(reader: R) -> Self {
81 Reader {
82 inner: BufReader::new(reader),
83 line_content: String::with_capacity(200),
84 }
85 }
86
87 pub fn with_capacity(capacity: usize, reader: R) -> Self {
91 Reader {
92 inner: BufReader::with_capacity(capacity, reader),
93 line_content: String::with_capacity(200),
94 }
95 }
96
97 fn line(&mut self) -> Option<Result<GtfRecord, ParseGtfError>> {
102 self.line_content.clear();
103 match self.inner.read_line(&mut self.line_content) {
104 Ok(_) => {}
105 Err(x) => {
106 return Some(Err(ParseGtfError {
107 message: x.to_string(),
108 }))
109 }
110 }
111
112 if self.line_content.starts_with('#') {
113 return self.line();
114 }
115
116 if self.line_content.is_empty() {
117 return None;
118 }
119
120 UncheckedGtfRecord::from_str(&self.line_content)
121 .map(|record| {
122 if record.standard_feature() {
123 Some(GtfRecord::try_from(record))
124 } else {
125 self.line()
126 }
127 })
128 .unwrap_or_else(|err| Some(Err(err)))
129 }
130}
131
132impl<R: std::io::Read> TranscriptRead for Reader<R> {
133 fn transcripts(&mut self) -> Result<Transcripts, ReadWriteError> {
153 let mut transcript_hashmap: HashMap<String, GtfRecordsGroup> = HashMap::new();
154 while let Some(line) = self.line() {
155 let gtf_record = line?;
156 let transcript = transcript_hashmap
157 .entry(gtf_record.transcript().to_string())
158 .or_insert_with(|| GtfRecordsGroup::new(gtf_record.transcript()));
159
160 match gtf_record.feature() {
161 GtfFeature::Exon => transcript.add_exon(gtf_record),
162 GtfFeature::CDS => transcript.add_exon(gtf_record),
163 GtfFeature::UTR | GtfFeature::UTR3 | GtfFeature::UTR5 => {
164 transcript.add_exon(gtf_record)
165 }
166 GtfFeature::StartCodon => {
167 transcript.add_exon(gtf_record);
168 }
169 GtfFeature::StopCodon => transcript.add_exon(gtf_record),
170 _ => {}
171 }
172 }
173
174 let mut res = Transcripts::with_capacity(transcript_hashmap.len());
175
176 for (_, gtf_transcript) in transcript_hashmap.drain() {
177 match Transcript::try_from(gtf_transcript) {
178 Ok(transcript) => res.push(transcript),
179 Err(err) => {
180 return Err(ReadWriteError::new(format!("Error parsing {err}")));
181 }
182 }
183 }
184 Ok(res)
185 }
186}
187
188#[cfg(test)]
189mod test_nm_001385228 {
190 use super::*;
191
192 #[test]
193 fn test_read() {
194 let mut reader = Reader::from_file("tests/data/NM_001385228.1_2.gtf").unwrap();
195 let tr = match reader.transcripts() {
196 Ok(res) => res,
197 _ => panic!("No transcripts could be read"),
198 };
199
200 assert_eq!(tr.len(), 1);
201 let t = tr.by_name("NM_001385228.1_2")[0];
202 assert_eq!(t.exons().len(), 9);
203 assert_eq!(t.cds_start().unwrap(), 206105119);
204 assert_eq!(t.cds_end().unwrap(), 206135359);
205
206 let start = t.start_codon();
207 assert_eq!(start.len(), 1);
208
209 let stop = t.stop_codon();
210 assert_eq!(stop.len(), 2);
211
212 assert!(!t.exons()[0].is_coding());
213 assert!(!t.exons()[1].is_coding());
214 assert!(!t.exons()[2].is_coding());
215 assert!(!t.exons()[3].is_coding());
216 assert!(!t.exons()[4].is_coding());
217 assert!(t.exons()[5].is_coding());
218 assert!(t.exons()[6].is_coding());
219 assert!(t.exons()[7].is_coding());
220 assert!(!t.exons()[8].is_coding());
221
222 assert_eq!(t.exons()[5].cds_start().unwrap(), 206105119);
223 assert_eq!(t.exons()[6].cds_start().unwrap(), 206105123);
224 assert_eq!(t.exons()[7].cds_start().unwrap(), 206135293);
225
226 assert_eq!(t.exons()[5].cds_end().unwrap(), 206105120);
227 assert_eq!(t.exons()[6].cds_end().unwrap(), 206105206);
228 assert_eq!(t.exons()[7].cds_end().unwrap(), 206135359);
229 }
230}
231
232#[cfg(test)]
233mod test_nm_201550 {
234 use super::*;
235
236 #[test]
237 fn test_read() {
238 let mut reader = Reader::from_file("tests/data/NM_201550.4.gtf")
239 .expect("Something failed reading the GTF file NM_201550.4.gtf");
240 let tr = match reader.transcripts() {
241 Ok(res) => res,
242 _ => panic!("No transcripts could be read"),
243 };
244 assert_eq!(tr.len(), 1);
245 let t = tr.by_name("NM_201550.4")[0];
246 assert_eq!(t.exons().len(), 1);
247 assert_eq!(t.cds_start().unwrap(), 70003785);
248 assert_eq!(t.cds_end().unwrap(), 70004618);
249
250 let start = t.start_codon();
251 assert_eq!(start.len(), 1);
252
253 let stop = t.stop_codon();
254 assert_eq!(stop.len(), 1);
255
256 assert!(t.exons()[0].is_coding());
257 }
258}
259
260#[cfg(test)]
299mod test_ighm {
300 use super::*;
301 use crate::models::CdsStat;
302
303 #[test]
304 fn test_read() {
305 let mut reader = Reader::from_file("tests/data/id-IGHM.gtf")
306 .expect("Something failed reading the GTF file id-IGHM.gtf");
307 let tr = match reader.transcripts() {
308 Ok(res) => res,
309 _ => panic!("No transcripts could be read"),
310 };
311 assert_eq!(tr.len(), 1);
312 let t = tr.by_name("id-IGHM")[0];
313 assert_eq!(t.exons().len(), 6);
314 assert_eq!(t.cds_start().unwrap(), 106318298);
315 assert_eq!(t.cds_end().unwrap(), 106322322);
316
317 let start = t.start_codon();
318 assert_eq!(start.len(), 1);
319
320 let stop = t.stop_codon();
321 assert_eq!(stop.len(), 1);
322
323 assert!(t.exons()[0].is_coding());
324 assert!(t.exons()[1].is_coding());
325 assert!(t.exons()[2].is_coding());
326 assert!(t.exons()[3].is_coding());
327 assert!(t.exons()[4].is_coding());
328
329 assert_eq!(t.cds_start_codon_stat(), CdsStat::Complete);
330 assert_eq!(t.cds_stop_codon_stat(), CdsStat::Incomplete);
331 }
332}
333
334#[cfg(test)]
335mod tests {
336 use super::*;
337 use crate::tests::transcripts;
338
339 #[test]
340 fn test_nm_001365057() {
341 let transcripts = Reader::from_file("tests/data/NM_001365057.2.gtf")
342 .unwrap()
343 .transcripts()
344 .unwrap();
345
346 assert_eq!(
347 transcripts.by_name("NM_001365057.2")[0],
348 &transcripts::nm_001365057()
349 )
350 }
351
352 #[test]
353 fn test_nm_001365408() {
354 let transcripts = Reader::from_file("tests/data/NM_001365408.1.gtf")
355 .unwrap()
356 .transcripts()
357 .unwrap();
358
359 assert_eq!(
360 transcripts.by_name("NM_001365408.1")[0],
361 &transcripts::nm_001365408()
362 )
363 }
364
365 #[test]
366 fn test_nm_001371720() {
367 let transcripts = Reader::from_file("tests/data/NM_001371720.1.gtf")
368 .unwrap()
369 .transcripts()
370 .unwrap();
371
372 assert_eq!(
373 transcripts.by_name("NM_001371720.1")[0],
374 &transcripts::nm_001371720(true)
375 )
376 }
377
378 #[test]
379 fn test_nm_201550() {
380 let transcripts = Reader::from_file("tests/data/NM_201550.4.gtf")
381 .unwrap()
382 .transcripts()
383 .unwrap();
384
385 assert_eq!(
386 transcripts.by_name("NM_201550.4")[0],
387 &transcripts::nm_201550()
388 )
389 }
390
391 #[test]
392 fn test_gene_line_returns_none() {
393 let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
395 let mut reader = Reader::new(transcript);
396 assert!(reader.line().is_none());
397
398 let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";\n".as_bytes();
400 let mut reader = Reader::new(transcript);
401 assert!(reader.line().is_none());
402
403 let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\tgene_name \"CES2\";\n".as_bytes();
405 let mut reader = Reader::new(transcript);
406 assert!(reader.line().is_none());
407
408 let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; gene_name \"CES2\";\n".as_bytes();
410 let mut reader = Reader::new(transcript);
411 assert!(reader.line().is_none());
412
413 let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\ttranscript_id \"NM_001365408.1\"; gene_name \"CES2\";\n".as_bytes();
415 let mut reader = Reader::new(transcript);
416 assert!(reader.line().is_none());
417 }
418
419 #[test]
420 fn test_gene_line_is_skipped() {
421 let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";\n\
422 chr1\tncbiRefSeq.2021-05-17\texon\t206100298\t206100445\t.\t-\t.\tgene_id \"SRGAP2B\"; transcript_id \"NM_001385228.1_2\"; exon_number \"9\"; exon_id \"NM_001385228.1_2.9\"; gene_name \"SRGAP2B\";".as_bytes();
423 let mut reader = Reader::new(transcript);
424 assert!(reader
425 .line()
426 .expect("The record contains one proper line")
427 .is_ok());
428 assert!(reader.line().is_none());
429 }
430
431 #[test]
432 fn test_gene_line_contains_invalid_data() {
433 let transcript = "chr16\tncbiRefSeq.2021-05-17\tgene\t66969419\t66978999\t.\tINVALID\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";\n".as_bytes();
434 let mut reader = Reader::new(transcript);
435 assert!(reader
436 .line()
437 .expect("The record contains one proper line")
438 .is_err());
439 assert!(reader.line().is_none());
440 }
441
442 #[test]
443 fn test_without_gene_or_transcript_errors() {
444 let transcript = "chr16\tncbiRefSeq.2021-05-17\texon\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";\n".as_bytes();
445 let mut reader = Reader::new(transcript);
446 assert!(reader
447 .line()
448 .expect("The record contains one proper line")
449 .is_ok());
450 assert!(reader.line().is_none());
451
452 let transcript = "chr16\tncbiRefSeq.2021-05-17\texon\t66969419\t66978999\t.\t+\t.\ttranscript_id \"NM_001365408.1\"; gene_name \"CES2\";\n".as_bytes();
453 let mut reader = Reader::new(transcript);
454 assert!(reader
455 .line()
456 .expect("The record contains one proper line")
457 .is_err());
458 assert!(reader.line().is_none());
459
460 let transcript = "chr16\tncbiRefSeq.2021-05-17\texon\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; gene_name \"CES2\";\n".as_bytes();
461 let mut reader = Reader::new(transcript);
462 assert!(reader
463 .line()
464 .expect("The record contains one proper line")
465 .is_err());
466 assert!(reader.line().is_none());
467 }
468
469 #[test]
470 fn test_5_utr() {
471 let transcript = "chr16\tncbiRefSeq.2021-05-17\t5UTR\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
472 let mut reader = Reader::new(transcript);
473 assert_eq!(
474 reader
475 .line()
476 .expect("The record contains one proper line")
477 .expect("The data is correct")
478 .feature(),
479 &GtfFeature::UTR5
480 );
481
482 let transcript = "chr16\tncbiRefSeq.2021-05-17\tfive_prime_utr\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
483 let mut reader = Reader::new(transcript);
484 assert_eq!(
485 reader
486 .line()
487 .expect("The record contains one proper line")
488 .expect("The data is correct")
489 .feature(),
490 &GtfFeature::UTR5
491 );
492 }
493
494 #[test]
495 fn test_3_utr() {
496 let transcript = "chr16\tncbiRefSeq.2021-05-17\t3UTR\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
497 let mut reader = Reader::new(transcript);
498 assert_eq!(
499 reader
500 .line()
501 .expect("The record contains one proper line")
502 .expect("The data is correct")
503 .feature(),
504 &GtfFeature::UTR3
505 );
506
507 let transcript = "chr16\tncbiRefSeq.2021-05-17\tthree_prime_utr\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
508 let mut reader = Reader::new(transcript);
509 assert_eq!(
510 reader
511 .line()
512 .expect("The record contains one proper line")
513 .expect("The data is correct")
514 .feature(),
515 &GtfFeature::UTR3
516 );
517 }
518
519 #[test]
520 fn test_utr() {
521 let transcript = "chr16\tncbiRefSeq.2021-05-17\tUTR\t66969419\t66978999\t.\t+\t.\tgene_id \"CES2\"; transcript_id \"NM_001365408.1\"; gene_name \"CES2\";".as_bytes();
522 let mut reader = Reader::new(transcript);
523 assert_eq!(
524 reader
525 .line()
526 .expect("The record contains one proper line")
527 .expect("The data is correct")
528 .feature(),
529 &GtfFeature::UTR
530 );
531 }
532}