use std::convert::TryFrom;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use std::str::FromStr;
use crate::models::{CdsStat, Exon, Frame, Strand, TranscriptBuilder};
use crate::models::{Transcript, TranscriptRead, Transcripts};
use crate::refgene::constants::*;
use crate::utils::errors::{ParseRefGeneError, ReadWriteError};
use crate::utils::exon_cds_overlap;
/// Reader for tab-separated RefGene records.
///
/// Whatever source is supplied gets wrapped in a [`BufReader`], so records
/// can be pulled from it one line at a time.
pub struct Reader<R> {
    /// Buffered handle on the underlying input source.
    inner: BufReader<R>,
}
impl Reader<File> {
    /// Opens the file at `path` and wraps it in a `Reader`.
    ///
    /// # Errors
    ///
    /// Returns a `ReadWriteError` created from the underlying I/O error when
    /// the file cannot be opened.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self, ReadWriteError> {
        File::open(path.as_ref())
            .map(Self::new)
            .map_err(ReadWriteError::new)
    }
}
impl<R: std::io::Read> Reader<R> {
    /// Creates a `Reader` that reads from `reader` through a `BufReader`
    /// with the default buffer capacity.
    pub fn new(reader: R) -> Self {
        Reader {
            inner: BufReader::new(reader),
        }
    }

    /// Creates a `Reader` whose internal `BufReader` uses a buffer of
    /// `capacity` bytes.
    pub fn with_capacity(capacity: usize, reader: R) -> Self {
        Reader {
            inner: BufReader::with_capacity(capacity, reader),
        }
    }

    /// Reads the next record from the input and parses it into a `Transcript`.
    ///
    /// Lines starting with `#` are skipped. The remaining line is trimmed and
    /// split on tab characters before being handed to `Transcript::try_from`.
    ///
    /// Returns `None` once the input is exhausted, `Some(Ok(_))` for a
    /// successfully parsed record and `Some(Err(_))` if reading the line or
    /// parsing the record failed.
    pub fn line(&mut self) -> Option<Result<Transcript, ParseRefGeneError>> {
        let mut line = String::new();
        // Skip comment lines iteratively: recursing once per comment line would
        // grow the call stack with every consecutive `#` line in the input.
        loop {
            // `read_line` appends to the buffer, so it must be emptied first.
            line.clear();
            if let Err(err) = self.inner.read_line(&mut line) {
                return Some(Err(ParseRefGeneError {
                    message: err.to_string(),
                }));
            }
            if !line.starts_with('#') {
                break;
            }
        }

        // Nothing was read at all: the end of the input has been reached.
        if line.is_empty() {
            return None;
        }

        let cols: Vec<&str> = line.trim().split('\t').collect();
        Some(Transcript::try_from(cols))
    }
}
impl<R: std::io::Read> TranscriptRead for Reader<R> {
    /// Reads all remaining records and collects them into a `Transcripts`
    /// container.
    ///
    /// # Errors
    ///
    /// Stops at the first record that cannot be read or parsed and returns
    /// that error converted into a `ReadWriteError`.
    fn transcripts(&mut self) -> Result<Transcripts, ReadWriteError> {
        let mut collected = Transcripts::new();
        while let Some(record) = self.line() {
            // `?` converts the `ParseRefGeneError` into a `ReadWriteError`.
            collected.push(record?);
        }
        Ok(collected)
    }
}
impl TryFrom<Vec<&str>> for Transcript {
    type Error = ParseRefGeneError;

    /// Builds a `Transcript` from the columns of a single RefGene record.
    ///
    /// `cols` must contain exactly `N_REFGENE_COLUMNS` entries. The bin and
    /// score columns are optional in the sense that unparsable values are
    /// stored as `None`; strand, exons and both CDS status columns must parse.
    ///
    /// # Errors
    ///
    /// Returns a `ParseRefGeneError` if the number of columns is wrong or if
    /// the strand, the exon columns or a CDS status column cannot be parsed.
    ///
    /// # Panics
    ///
    /// Panics if `TranscriptBuilder::build` returns an error.
    fn try_from(cols: Vec<&str>) -> Result<Self, ParseRefGeneError> {
        if cols.len() != N_REFGENE_COLUMNS {
            let message = format!(
                "Invalid number of columns in line\nvv\n{}\n^^",
                cols.join("\t")
            );
            return Err(ParseRefGeneError { message });
        }

        // Unparsable bin values are tolerated and recorded as `None`.
        let bin = cols[BIN_COL].parse::<u16>().ok();
        let strand = Strand::from_str(cols[STRAND_COL])
            .map_err(|message| ParseRefGeneError { message })?;
        let mut exons = instantiate_exons(&cols)?;
        let cds_start_stat = CdsStat::from_str(cols[CDS_START_STAT_COL])
            .map_err(|message| ParseRefGeneError { message })?;
        let cds_end_stat = CdsStat::from_str(cols[CDS_END_STAT_COL])
            .map_err(|message| ParseRefGeneError { message })?;
        // Unparsable score values are tolerated and recorded as `None`.
        let score = cols[SCORE_COL].parse::<f32>().ok();

        // NOTE(review): presumably `build` cannot fail here because name, chrom,
        // strand and gene are always set — confirm against `TranscriptBuilder`.
        let mut transcript = TranscriptBuilder::new()
            .bin(bin)
            .name(cols[TRANSCRIPT_COL])
            .chrom(cols[CHROMOSOME_COL])
            .strand(strand)
            .gene(cols[GENE_SYMBOL_COL])
            .cds_start_stat(cds_start_stat)
            .cds_end_stat(cds_end_stat)
            .score(score)
            .build()
            .unwrap();
        transcript.append_exons(&mut exons);
        Ok(transcript)
    }
}
/// Splits a comma-separated RefGene list column into its entries, ignoring
/// trailing commas.
fn split_refgene_list(field: &str) -> Vec<&str> {
    field.trim_end_matches(',').split(',').collect()
}

/// Builds the exons of one RefGene record from its already split columns.
///
/// The exon count column determines how many entries are taken from the exon
/// starts, exon ends and exon frames columns. Every start is shifted by one
/// (presumably converting a 0-based start into a 1-based coordinate — confirm
/// against the RefGene specification); ends are used as given. If both CDS
/// columns parse as numbers, the coding part of every exon is derived through
/// `exon_cds_overlap`, otherwise exons carry no CDS coordinates.
///
/// # Errors
///
/// Returns a `ParseRefGeneError` if
/// * the exon count is not a valid number,
/// * one of the list columns holds fewer entries than the exon count,
/// * a start or end is not a valid number,
/// * a start coordinate (exon or CDS) cannot be shifted without overflowing,
/// * a frame entry is rejected by `Frame::from_refgene`.
///
/// # Panics
///
/// Panics if `cols` is too short to contain the indexed columns. Callers are
/// expected to validate the number of columns beforehand.
fn instantiate_exons(cols: &[&str]) -> Result<Vec<Exon>, ParseRefGeneError> {
    // The exon count is read from the input file, so a malformed value must be
    // reported as a parse error instead of aborting the program.
    let exon_count = cols[EXON_COUNT_COL].parse::<usize>()?;

    let starts = split_refgene_list(cols[EXON_STARTS_COL]);
    let ends = split_refgene_list(cols[EXON_ENDS_COL]);
    let frame_offsets = split_refgene_list(cols[EXON_FRAMES_COL]);

    // Never reserve more than the input can actually provide: the declared
    // exon count is untrusted and could otherwise request a huge allocation.
    let mut exons: Vec<Exon> = Vec::with_capacity(exon_count.min(starts.len()));

    // The transcript is only treated as coding if both CDS columns are numeric.
    let cds = match (
        cols[CDS_START_COL].parse::<u32>(),
        cols[CDS_END_COL].parse::<u32>(),
    ) {
        (Ok(start), Ok(end)) => Some((start, end)),
        _ => None,
    };

    for i in 0..exon_count {
        let start = starts
            .get(i)
            .ok_or("Too few exon starts in input")?
            .parse::<u32>()?
            .checked_add(1)
            .ok_or("Exon start exceeds the supported coordinate range")?;
        let end = ends
            .get(i)
            .ok_or("Too few exon ends in input")?
            .parse::<u32>()?;
        let exon_cds = match cds {
            Some((cds_start, cds_end)) => {
                // The CDS start is shifted the same way as the exon start.
                let cds_first = cds_start
                    .checked_add(1)
                    .ok_or("CDS start exceeds the supported coordinate range")?;
                exon_cds_overlap(&start, &end, &cds_first, &cds_end)
            }
            None => (None, None),
        };
        exons.push(Exon::new(
            start,
            end,
            exon_cds.0,
            exon_cds.1,
            Frame::from_refgene(frame_offsets.get(i).ok_or("Too few exon Frame offsets")?)?,
        ));
    }
    Ok(exons)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::transcripts;

    /// Columns of the `NR_046018.2` record used by the exon tests; only the
    /// exon starts, exon ends and exon frames columns vary between tests.
    fn record<'a>(starts: &'a str, ends: &'a str, frames: &'a str) -> Vec<&'a str> {
        vec![
            "585",
            "NR_046018.2",
            "chr1",
            "+",
            "11873",
            "14409",
            "14409",
            "14409",
            "3",
            starts,
            ends,
            "0",
            "DDX11L1",
            "unk",
            "unk",
            frames,
        ]
    }

    /// Asserts that exon instantiation fails for the given columns and returns
    /// the error message.
    fn exon_error(starts: &str, ends: &str, frames: &str) -> String {
        let exons = instantiate_exons(&record(starts, ends, frames));
        assert!(exons.is_err());
        exons.unwrap_err().message
    }

    /// Reads every transcript of the RefGene file at `path`.
    fn read_transcripts(path: &str) -> Transcripts {
        Reader::from_file(path).unwrap().transcripts().unwrap()
    }

    #[test]
    fn test_parse_exons_no_cds() {
        let cols = record("11873,12612,13220,", "12227,12721,14409,", "-1,-1,-1,");
        let exons = instantiate_exons(&cols).unwrap();

        let expected = [(11874, 12227), (12613, 12721), (13221, 14409)];
        assert_eq!(exons.len(), expected.len());
        for (exon, (start, end)) in exons.iter().zip(expected.iter()) {
            assert_eq!(exon.start(), *start);
            assert_eq!(exon.end(), *end);
            assert_eq!(*exon.cds_start(), None);
            assert_eq!(*exon.cds_end(), None);
        }
    }

    #[test]
    fn test_missing_exon_stop() {
        assert_eq!(
            exon_error("11873,12612,13220", "12227,12721,", "-1,-1,-1,"),
            "Too few exon ends in input"
        );
    }

    #[test]
    fn test_missing_exon_start() {
        assert_eq!(
            exon_error("11873,12612,", "12227,12721,14409,", "-1,-1,-1,"),
            "Too few exon starts in input"
        );
    }

    #[test]
    fn test_missing_exon_frame() {
        assert_eq!(
            exon_error("11873,12612,13220", "12227,12721,14409,", "-1,-1,"),
            "Too few exon Frame offsets"
        );
    }

    #[test]
    fn test_wrong_exon_frame() {
        assert_eq!(
            exon_error("11873,12612,13220", "12227,12721,14409,", "-1,/,-1"),
            "invalid frame indicator /"
        );
    }

    #[test]
    fn test_nm_001365057() {
        let parsed = read_transcripts("tests/data/NM_001365057.2.refgene");
        assert_eq!(
            parsed.by_name("NM_001365057.2")[0],
            &transcripts::nm_001365057()
        )
    }

    #[test]
    fn test_nm_001365408() {
        let parsed = read_transcripts("tests/data/NM_001365408.1.refgene");
        assert_eq!(
            parsed.by_name("NM_001365408.1")[0],
            &transcripts::nm_001365408()
        )
    }

    #[test]
    fn test_nm_001371720() {
        let parsed = read_transcripts("tests/data/NM_001371720.1.refgene");
        assert_eq!(
            parsed.by_name("NM_001371720.1")[0],
            &transcripts::nm_001371720(false)
        )
    }

    #[test]
    fn test_nm_201550() {
        let parsed = read_transcripts("tests/data/NM_201550.4.refgene");
        assert_eq!(
            parsed.by_name("NM_201550.4")[0],
            &transcripts::nm_201550()
        )
    }
}