use std::fs::File;
use std::io::{BufRead, BufReader};
use flate2::read::GzDecoder;
use crate::genes::Genes;
use crate::granges::GRanges;
use crate::meta::MetaData;
use crate::utility::is_gzip;
impl Genes {
fn set_expr_column(&mut self, expr: Vec<f64>) -> Result<(), Box<dyn std::error::Error>> {
self.granges.meta.delete_meta("expr");
self.granges.meta.add("expr", MetaData::FloatArray(expr))?;
Ok(())
}
pub fn read_gtf_expr<R: BufRead>(
&mut self,
reader: R,
gene_id_name: &str,
expr_id_name: &str,
) -> Result<(), Box<dyn std::error::Error>> {
let granges = GRanges::read_gtf(
reader,
vec![gene_id_name, expr_id_name],
vec!["str", "float"],
vec![],
)?;
let gene_ids = granges
.meta
.get_column_str(gene_id_name)
.ok_or_else(|| format!("invalid gene_id_name `{}`", gene_id_name))?;
let expr_values = granges
.meta
.get_column_float(expr_id_name)
.ok_or_else(|| format!("invalid expr_id_name `{}`", expr_id_name))?;
let mut expr = vec![0.0; self.granges.num_rows()];
for i in 0..granges.num_rows() {
if let Some(j) = self.find_gene(&gene_ids[i]) {
expr[j] += expr_values[i];
}
}
self.set_expr_column(expr)
}
pub fn import_gtf_expr(
&mut self,
filename: &str,
gene_id_name: &str,
expr_id_name: &str,
) -> Result<(), Box<dyn std::error::Error>> {
let file = File::open(filename)?;
let reader: Box<dyn BufRead> = if is_gzip(filename) {
Box::new(BufReader::new(GzDecoder::new(file)))
} else {
Box::new(BufReader::new(file))
};
self.read_gtf_expr(reader, gene_id_name, expr_id_name)
}
pub fn read_cufflinks_fpkm_tracking<R: BufRead>(
&mut self,
reader: R,
verbose: bool,
) -> Result<(), Box<dyn std::error::Error>> {
let mut expr = vec![0.0; self.granges.num_rows()];
let mut lines = reader.lines();
let header = lines.next().ok_or("invalid header")??;
let header_fields: Vec<&str> = header.split_whitespace().collect();
if header_fields.len() != 13
|| header_fields[0] != "tracking_id"
|| (header_fields[9] != "FPKM" && header_fields[9] != "RPKM")
{
return Err("invalid header".into());
}
for line in lines {
let line = line?;
let fields: Vec<&str> = line.split_whitespace().collect();
if fields.is_empty() {
continue;
}
if fields.len() != 13 {
return Err("file must have 13 columns".into());
}
if fields[12] != "OK" {
continue;
}
let gene_id = fields[0];
let expr_value = fields[9].parse::<f64>()?;
if let Some(i) = self.find_gene(gene_id) {
expr[i] += expr_value;
} else if verbose {
eprintln!("`{}` not present in gene list!", gene_id);
}
}
self.set_expr_column(expr)
}
pub fn import_cufflinks_fpkm_tracking(
&mut self,
filename: &str,
verbose: bool,
) -> Result<(), Box<dyn std::error::Error>> {
let file = File::open(filename)?;
let reader: Box<dyn BufRead> = if is_gzip(filename) {
Box::new(BufReader::new(GzDecoder::new(file)))
} else {
Box::new(BufReader::new(file))
};
self.read_cufflinks_fpkm_tracking(reader, verbose)
}
}
#[cfg(test)]
mod tests {
use std::io::{BufReader, Cursor};
use crate::genes::Genes;
fn test_genes() -> Genes {
Genes::new(
vec!["geneA".into(), "geneB".into(), "geneC".into()],
vec!["chr1".into(), "chr1".into(), "chr2".into()],
vec![100, 300, 500],
vec![200, 400, 600],
vec![120, 320, 520],
vec![180, 380, 580],
vec!['+', '-', '+'],
)
}
#[test]
fn read_gtf_expr_sums_expression_by_gene() {
let data = br#"chr1 src transcript 100 200 . + . transcript_id "geneA"; FPKM "1.5";
chr1 src exon 120 180 . + . transcript_id "geneA"; FPKM "2.5";
chr1 src transcript 300 400 . - . transcript_id "geneB"; FPKM "4.0";
"#;
let mut genes = test_genes();
genes
.read_gtf_expr(
BufReader::new(Cursor::new(&data[..])),
"transcript_id",
"FPKM",
)
.unwrap();
assert_eq!(
genes.granges.meta.get_column_float("expr").unwrap(),
&vec![4.0, 4.0, 0.0]
);
}
#[test]
fn read_gtf_expr_replaces_existing_expr_column() {
let data1 = br#"chr1 src transcript 100 200 . + . transcript_id "geneA"; FPKM "1.0";
"#;
let data2 = br#"chr1 src transcript 300 400 . - . transcript_id "geneB"; FPKM "3.0";
"#;
let mut genes = test_genes();
genes
.read_gtf_expr(
BufReader::new(Cursor::new(&data1[..])),
"transcript_id",
"FPKM",
)
.unwrap();
genes
.read_gtf_expr(
BufReader::new(Cursor::new(&data2[..])),
"transcript_id",
"FPKM",
)
.unwrap();
assert_eq!(
genes.granges.meta.get_column_float("expr").unwrap(),
&vec![0.0, 3.0, 0.0]
);
assert_eq!(
genes
.granges
.meta
.meta_name
.iter()
.filter(|name| name.as_str() == "expr")
.count(),
1
);
}
#[test]
fn read_cufflinks_fpkm_tracking_imports_ok_rows() {
let data = br#"tracking_id class_code nearest_ref_id gene_id gene_short_name tss_id locus length coverage FPKM FPKM_conf_lo FPKM_conf_hi status
geneA - - - - - chr1:100-200 100 10.0 1.5 0.0 2.0 OK
geneB - - - - - chr1:300-400 100 8.0 3.0 0.0 4.0 OK
geneX - - - - - chr1:500-600 100 5.0 7.0 0.0 8.0 OK
geneA - - - - - chr1:100-200 100 10.0 2.5 0.0 3.0 FAIL
"#;
let mut genes = test_genes();
genes
.read_cufflinks_fpkm_tracking(BufReader::new(Cursor::new(&data[..])), false)
.unwrap();
assert_eq!(
genes.granges.meta.get_column_float("expr").unwrap(),
&vec![1.5, 3.0, 0.0]
);
}
#[test]
fn read_cufflinks_fpkm_tracking_rejects_invalid_header() {
let data =
br#"tracking_id class_code nearest_ref_id gene_id gene_short_name tss_id locus length coverage TPM FPKM_conf_lo FPKM_conf_hi status
"#;
let mut genes = test_genes();
assert!(genes
.read_cufflinks_fpkm_tracking(BufReader::new(Cursor::new(&data[..])), false)
.is_err());
}
}