1use std::convert::TryFrom;
2use std::fs::File;
3use std::io::{BufRead, BufReader};
4use std::path::Path;
5use std::str::FromStr;
6
7use crate::models::{CdsStat, Exon, Frame, Strand, TranscriptBuilder};
8use crate::models::{Transcript, TranscriptRead, Transcripts};
9use crate::refgene::constants::*;
10use crate::utils::errors::{ParseRefGeneError, ReadWriteError};
11use crate::utils::exon_cds_overlap;
12
13pub struct Reader<R> {
37 inner: std::io::BufReader<R>,
38}
39
40impl Reader<File> {
41 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, ReadWriteError> {
65 match File::open(path.as_ref()) {
66 Ok(file) => Ok(Self::new(file)),
67 Err(err) => Err(ReadWriteError::new(err)),
68 }
69 }
70}
71
72impl<R: std::io::Read> Reader<R> {
73 pub fn new(reader: R) -> Self {
78 Reader {
79 inner: BufReader::new(reader),
80 }
81 }
82
83 pub fn with_capacity(capacity: usize, reader: R) -> Self {
87 Reader {
88 inner: BufReader::with_capacity(capacity, reader),
89 }
90 }
91
92 pub fn line(&mut self) -> Option<Result<Transcript, ParseRefGeneError>> {
97 let mut line = String::new();
98 match self.inner.read_line(&mut line) {
99 Ok(_) => {}
100 Err(x) => {
101 return Some(Err(ParseRefGeneError {
102 message: x.to_string(),
103 }))
104 }
105 }
106
107 if line.starts_with('#') {
108 return self.line();
109 }
110
111 if line.is_empty() {
112 None
113 } else {
114 let cols: Vec<&str> = line.trim().split('\t').collect();
115 Some(Transcript::try_from(cols))
116 }
117 }
118}
119
120impl<R: std::io::Read> TranscriptRead for Reader<R> {
121 fn transcripts(&mut self) -> Result<Transcripts, ReadWriteError> {
142 let mut res = Transcripts::new();
143 while let Some(line) = self.line() {
144 match line {
145 Ok(t) => res.push(t),
146 Err(x) => return Err(ReadWriteError::from(x)),
147 }
148 }
149 Ok(res)
150 }
151}
152
153impl TryFrom<Vec<&str>> for Transcript {
154 type Error = ParseRefGeneError;
155
156 fn try_from(cols: Vec<&str>) -> Result<Self, ParseRefGeneError> {
159 if cols.len() != N_REFGENE_COLUMNS {
160 return Err(ParseRefGeneError {
161 message: format!(
162 "Invalid number of columns in line\nvv\n{}\n^^",
163 cols.join("\t")
164 ),
165 });
166 }
167
168 let bin = match cols[BIN_COL].parse::<u16>() {
169 Ok(x) => Some(x),
170 _ => None,
171 };
172 let strand = match Strand::from_str(cols[STRAND_COL]) {
173 Ok(x) => x,
174 Err(message) => return Err(ParseRefGeneError { message }),
175 };
176 let mut exons = instantiate_exons(&cols)?;
177 let cds_start_stat = match CdsStat::from_str(cols[CDS_START_STAT_COL]) {
178 Ok(x) => x,
179 Err(message) => return Err(ParseRefGeneError { message }),
180 };
181
182 let cds_end_stat = match CdsStat::from_str(cols[CDS_END_STAT_COL]) {
183 Ok(x) => x,
184 Err(message) => return Err(ParseRefGeneError { message }),
185 };
186 let score = match cols[SCORE_COL].parse::<f32>() {
187 Ok(x) => Some(x),
188 _ => None,
189 };
190
191 let mut transcript = TranscriptBuilder::new()
192 .bin(bin)
193 .name(cols[TRANSCRIPT_COL])
194 .chrom(cols[CHROMOSOME_COL])
195 .strand(strand)
196 .gene(cols[GENE_SYMBOL_COL])
197 .cds_start_stat(cds_start_stat)
198 .cds_end_stat(cds_end_stat)
199 .score(score)
200 .build()
201 .unwrap();
202 transcript.append_exons(&mut exons);
203 Ok(transcript)
204 }
205}
206
207fn instantiate_exons(cols: &[&str]) -> Result<Vec<Exon>, ParseRefGeneError> {
213 let exon_count = cols[EXON_COUNT_COL].parse::<usize>().unwrap();
216
217 let mut exons: Vec<Exon> = Vec::with_capacity(exon_count);
219
220 let starts: Vec<&str> = cols[EXON_STARTS_COL]
223 .trim_end_matches(',')
224 .split(',')
225 .collect();
226 let ends: Vec<&str> = cols[EXON_ENDS_COL]
227 .trim_end_matches(',')
228 .split(',')
229 .collect();
230 let frame_offsets: Vec<&str> = cols[EXON_FRAMES_COL]
231 .trim_end_matches(',')
232 .split(',')
233 .collect();
234
235 let (coding, cds_start, cds_end) = match (
238 cols[CDS_START_COL].parse::<u32>(),
239 cols[CDS_END_COL].parse::<u32>(),
240 ) {
241 (Ok(start), Ok(end)) => (true, Some(start), Some(end)),
242 _ => (false, None, None),
243 };
244
245 for i in 0..exon_count {
247 let start = starts
248 .get(i)
249 .ok_or("Too few exon starts in input")?
250 .parse::<u32>()?
253 + 1;
254 let end = ends
255 .get(i)
256 .ok_or("Too few exon ends in input")?
257 .parse::<u32>()?;
258
259 let exon_cds = if coding {
261 exon_cds_overlap(
262 &start,
263 &end,
264 &(cds_start.unwrap() + 1),
268 &cds_end.unwrap(),
269 )
270 } else {
271 (None, None)
272 };
273
274 exons.push(Exon::new(
275 start,
276 end,
277 exon_cds.0,
278 exon_cds.1,
279 Frame::from_refgene(frame_offsets.get(i).ok_or("Too few exon Frame offsets")?)?,
280 ));
281 }
282 Ok(exons)
283}
284
285#[cfg(test)]
286mod tests {
287 use super::*;
288 use crate::tests::transcripts;
289
290 #[test]
291 fn test_parse_exons_no_cds() {
292 let cols = vec![
293 "585",
294 "NR_046018.2",
295 "chr1",
296 "+",
297 "11873",
298 "14409",
299 "14409",
300 "14409",
301 "3",
302 "11873,12612,13220,",
303 "12227,12721,14409,",
304 "0",
305 "DDX11L1",
306 "unk",
307 "unk",
308 "-1,-1,-1,",
309 ];
310 let exons = instantiate_exons(&cols).unwrap();
311
312 assert_eq!(exons.len(), 3);
313
314 assert_eq!(exons[0].start(), 11874);
315 assert_eq!(exons[0].end(), 12227);
316 assert_eq!(*exons[0].cds_start(), None);
317 assert_eq!(*exons[0].cds_end(), None);
318
319 assert_eq!(exons[1].start(), 12613);
320 assert_eq!(exons[1].end(), 12721);
321 assert_eq!(*exons[1].cds_start(), None);
322 assert_eq!(*exons[1].cds_end(), None);
323
324 assert_eq!(exons[2].start(), 13221);
325 assert_eq!(exons[2].end(), 14409);
326 assert_eq!(*exons[2].cds_start(), None);
327 assert_eq!(*exons[2].cds_end(), None);
328 }
329
330 #[test]
331 fn test_missing_exon_stop() {
332 let cols = vec![
333 "585",
334 "NR_046018.2",
335 "chr1",
336 "+",
337 "11873",
338 "14409",
339 "14409",
340 "14409",
341 "3",
342 "11873,12612,13220",
343 "12227,12721,",
344 "0",
345 "DDX11L1",
346 "unk",
347 "unk",
348 "-1,-1,-1,",
349 ];
350 let exons = instantiate_exons(&cols);
351
352 assert!(exons.is_err());
353 assert_eq!(
354 exons.unwrap_err().message,
355 "Too few exon ends in input".to_string()
356 );
357 }
358
359 #[test]
360 fn test_missing_exon_start() {
361 let cols = vec![
362 "585",
363 "NR_046018.2",
364 "chr1",
365 "+",
366 "11873",
367 "14409",
368 "14409",
369 "14409",
370 "3",
371 "11873,12612,",
372 "12227,12721,14409,",
373 "0",
374 "DDX11L1",
375 "unk",
376 "unk",
377 "-1,-1,-1,",
378 ];
379 let exons = instantiate_exons(&cols);
380
381 assert!(exons.is_err());
382 assert_eq!(
383 exons.unwrap_err().message,
384 "Too few exon starts in input".to_string()
385 );
386 }
387
388 #[test]
389 fn test_missing_exon_frame() {
390 let cols = vec![
391 "585",
392 "NR_046018.2",
393 "chr1",
394 "+",
395 "11873",
396 "14409",
397 "14409",
398 "14409",
399 "3",
400 "11873,12612,13220",
401 "12227,12721,14409,",
402 "0",
403 "DDX11L1",
404 "unk",
405 "unk",
406 "-1,-1,",
407 ];
408 let exons = instantiate_exons(&cols);
409
410 assert!(exons.is_err());
411 assert_eq!(
412 exons.unwrap_err().message,
413 "Too few exon Frame offsets".to_string()
414 );
415 }
416
417 #[test]
418 fn test_wrong_exon_frame() {
419 let cols = vec![
420 "585",
421 "NR_046018.2",
422 "chr1",
423 "+",
424 "11873",
425 "14409",
426 "14409",
427 "14409",
428 "3",
429 "11873,12612,13220",
430 "12227,12721,14409,",
431 "0",
432 "DDX11L1",
433 "unk",
434 "unk",
435 "-1,/,-1",
436 ];
437 let exons = instantiate_exons(&cols);
438
439 assert!(exons.is_err());
440 assert_eq!(
441 exons.unwrap_err().message,
442 "invalid frame indicator /".to_string()
443 );
444 }
445
446 #[test]
447 fn test_nm_001365057() {
448 let transcripts = Reader::from_file("tests/data/NM_001365057.2.refgene")
449 .unwrap()
450 .transcripts()
451 .unwrap();
452
453 assert_eq!(
454 transcripts.by_name("NM_001365057.2")[0],
455 &transcripts::nm_001365057()
456 )
457 }
458
459 #[test]
460 fn test_nm_001365408() {
461 let transcripts = Reader::from_file("tests/data/NM_001365408.1.refgene")
462 .unwrap()
463 .transcripts()
464 .unwrap();
465
466 assert_eq!(
467 transcripts.by_name("NM_001365408.1")[0],
468 &transcripts::nm_001365408()
469 )
470 }
471
472 #[test]
473 fn test_nm_001371720() {
474 let transcripts = Reader::from_file("tests/data/NM_001371720.1.refgene")
475 .unwrap()
476 .transcripts()
477 .unwrap();
478
479 assert_eq!(
480 transcripts.by_name("NM_001371720.1")[0],
481 &transcripts::nm_001371720(false)
482 )
483 }
484
485 #[test]
486 fn test_nm_201550() {
487 let transcripts = Reader::from_file("tests/data/NM_201550.4.refgene")
488 .unwrap()
489 .transcripts()
490 .unwrap();
491
492 assert_eq!(
493 transcripts.by_name("NM_201550.4")[0],
494 &transcripts::nm_201550()
495 )
496 }
497}