use anyhow::{Context, Result};
use indexmap::IndexMap;
use std::collections::HashMap;
use std::io::BufRead;
/// A single exon interval as read from one GTF `exon` row.
#[derive(Debug, Clone)]
pub struct Exon {
    /// Reference sequence (chromosome) name, column 1 of the GTF row.
    pub chrom: String,
    /// Start coordinate — GTF convention: 1-based, inclusive.
    pub start: u64,
    /// End coordinate — 1-based, inclusive (length is `end - start + 1`).
    pub end: u64,
    /// Strand character: '+', '-', or '.' when the strand column is empty.
    pub strand: char,
}
/// One transcript assembled from all GTF rows sharing a
/// (gene_id, transcript_id) pair.
#[derive(Debug, Clone)]
pub struct Transcript {
    pub transcript_id: String,
    pub chrom: String,
    /// Transcript span: the minimum exon start (1-based, inclusive).
    pub start: u64,
    /// Transcript span: the maximum exon end (1-based, inclusive).
    pub end: u64,
    pub strand: char,
    /// Exon (start, end) pairs, sorted ascending by start.
    pub exons: Vec<(u64, u64)>,
    /// Genomic lower bound of the coding ("thick") region, if any CDS rows
    /// were present for this transcript; `None` for non-coding transcripts.
    pub cds_start: Option<u64>,
    /// Genomic upper bound of the coding region; paired with `cds_start`.
    pub cds_end: Option<u64>,
}
/// A gene record aggregated from every `exon` row carrying its `gene_id`.
#[derive(Debug, Clone)]
pub struct Gene {
    pub gene_id: String,
    /// Chromosome and strand are taken from the first exon row seen.
    pub chrom: String,
    /// Minimum exon start across all exon rows (1-based, inclusive).
    pub start: u64,
    /// Maximum exon end across all exon rows (1-based, inclusive).
    pub end: u64,
    pub strand: char,
    /// Every exon row for this gene, in file order (may overlap/duplicate).
    pub exons: Vec<Exon>,
    /// Number of distinct genomic bases covered by the exons — overlapping
    /// and adjacent exons are counted once (union length).
    pub effective_length: u64,
    /// Values of the caller-requested extra attributes, captured from the
    /// first exon row that created this gene.
    pub attributes: HashMap<String, String>,
    /// Transcripts attached after parsing, in builder (file) order.
    pub transcripts: Vec<Transcript>,
}
/// Look up `key` in a GTF attribute column (`key "value"; key2 "value2";`)
/// and return its value with surrounding quotes stripped.
///
/// Entries are separated by ';'; the key and value within an entry are
/// separated by whitespace. Entries with no whitespace (e.g. GFF3-style
/// `key=value`) never match. The first matching entry wins.
fn get_attribute(attributes: &str, key: &str) -> Option<String> {
    attributes
        .split(';')
        .map(str::trim)
        .filter(|entry| !entry.is_empty())
        .find_map(|entry| {
            // Entries without a whitespace separator carry no value; skip.
            let sep = entry.find(|c: char| c.is_whitespace())?;
            let (name, raw_value) = entry.split_at(sep);
            if name.trim() == key {
                Some(raw_value.trim().trim_matches('"').to_string())
            } else {
                None
            }
        })
}
/// Total number of distinct genomic bases covered by `exons`.
///
/// Overlapping exons are counted once. Adjacent intervals (gap of zero,
/// e.g. [100,200] and [201,300]) are merged, which leaves the total
/// unchanged under inclusive coordinates. Returns 0 for an empty slice.
fn compute_non_overlapping_length(exons: &[Exon]) -> u64 {
    let mut spans: Vec<(u64, u64)> = exons.iter().map(|e| (e.start, e.end)).collect();
    spans.sort_unstable();
    // Fold sorted spans into a stack of merged, disjoint intervals.
    let mut merged: Vec<(u64, u64)> = Vec::new();
    for (s, e) in spans {
        match merged.last_mut() {
            // Overlapping or directly adjacent: extend the top interval.
            Some(top) if s <= top.1 + 1 => top.1 = top.1.max(e),
            _ => merged.push((s, e)),
        }
    }
    // Inclusive coordinates: each interval covers end - start + 1 bases.
    merged.into_iter().map(|(s, e)| e - s + 1).sum()
}
/// Accumulates the feature rows of one (gene_id, transcript_id) pair during
/// parsing; converted into a final `Transcript` once the file is consumed.
struct TranscriptBuilder {
    transcript_id: String,
    chrom: String,
    strand: char,
    /// (start, end) of each `exon` row, in file order until sorted.
    exons: Vec<(u64, u64)>,
    /// (start, end) of each `CDS` row.
    cds: Vec<(u64, u64)>,
    /// (start, end) of each `start_codon` row (may be split across introns).
    start_codons: Vec<(u64, u64)>,
    /// (start, end) of each `stop_codon` row (may be split across introns).
    stop_codons: Vec<(u64, u64)>,
}
/// Parse a GTF annotation file into per-gene records, keyed by `gene_id`
/// and preserving first-seen file order.
///
/// Only `exon`, `CDS`, `start_codon`, and `stop_codon` rows are consumed;
/// all other feature types, comment lines, and malformed rows are skipped.
/// Genes are created from exon rows only, so transcripts whose gene has no
/// exon row are silently dropped. `extra_attributes` names attribute keys
/// (e.g. "gene_name") whose values are copied into `Gene::attributes` from
/// the first exon row seen for each gene.
///
/// # Errors
/// Fails if the file cannot be opened or read, if a start/end column does
/// not parse as an integer, or if a row has start > end.
pub fn parse_gtf(path: &str, extra_attributes: &[String]) -> Result<IndexMap<String, Gene>> {
    let reader = crate::io::open_reader(path)
        .with_context(|| format!("Failed to open GTF file: {}", path))?;
    // IndexMap preserves insertion (file) order for genes and transcripts.
    let mut genes: IndexMap<String, Gene> = IndexMap::new();
    let mut tx_builders: IndexMap<(String, String), TranscriptBuilder> = IndexMap::new();
    for line in reader.lines() {
        let line = line.context("Failed to read line from GTF file")?;
        // Skip comment headers and blank lines.
        if line.starts_with('#') || line.is_empty() {
            continue;
        }
        // GTF is 9 tab-separated columns; anything shorter is ignored.
        let fields: Vec<&str> = line.split('\t').collect();
        if fields.len() < 9 {
            continue;
        }
        let feature_type = fields[2];
        // Only these four feature types feed the gene/transcript model.
        if feature_type != "exon"
            && feature_type != "CDS"
            && feature_type != "start_codon"
            && feature_type != "stop_codon"
        {
            continue;
        }
        let chrom = fields[0].to_string();
        // GTF coordinates are 1-based and inclusive on both ends.
        let start: u64 = fields[3]
            .parse()
            .with_context(|| format!("Invalid start position: {}", fields[3]))?;
        let end: u64 = fields[4]
            .parse()
            .with_context(|| format!("Invalid end position: {}", fields[4]))?;
        anyhow::ensure!(
            start <= end,
            "Malformed GTF: start ({}) > end ({}) for {} on {}",
            start,
            end,
            feature_type,
            fields[0]
        );
        // '.' stands in for a missing or empty strand column.
        let strand = fields[6].chars().next().unwrap_or('.');
        let attr_str = fields[8];
        // Rows without a gene_id cannot be assigned to any gene; skip them.
        let gene_id = match get_attribute(attr_str, "gene_id") {
            Some(id) => id,
            None => continue,
        };
        if feature_type == "exon" {
            let exon = Exon {
                chrom: chrom.clone(),
                start,
                end,
                strand,
            };
            // Exon rows define (and extend) the gene record. Chromosome,
            // strand, and the extra attributes are captured from the first
            // exon row only; later rows just widen the span.
            genes
                .entry(gene_id.clone())
                .and_modify(|gene| {
                    gene.start = gene.start.min(start);
                    gene.end = gene.end.max(end);
                    gene.exons.push(exon.clone());
                })
                .or_insert_with(|| {
                    let mut attrs = HashMap::new();
                    for attr_name in extra_attributes {
                        if let Some(val) = get_attribute(attr_str, attr_name) {
                            attrs.insert(attr_name.clone(), val);
                        }
                    }
                    Gene {
                        gene_id: gene_id.clone(),
                        chrom: chrom.clone(),
                        start,
                        end,
                        strand,
                        exons: vec![exon],
                        effective_length: 0, // computed after parsing
                        attributes: attrs,
                        transcripts: Vec::new(),
                    }
                });
        }
        // All four feature types — including exons, which fall through from
        // the branch above — are also accumulated per transcript. Rows with
        // no transcript_id share a synthetic "<gene_id>__no_tx" transcript.
        let transcript_id = get_attribute(attr_str, "transcript_id")
            .unwrap_or_else(|| format!("{}__no_tx", gene_id));
        let key = (gene_id.clone(), transcript_id.clone());
        let builder = tx_builders.entry(key).or_insert_with(|| TranscriptBuilder {
            transcript_id,
            chrom: chrom.clone(),
            strand,
            exons: Vec::new(),
            cds: Vec::new(),
            start_codons: Vec::new(),
            stop_codons: Vec::new(),
        });
        match feature_type {
            "exon" => builder.exons.push((start, end)),
            "CDS" => builder.cds.push((start, end)),
            "start_codon" => builder.start_codons.push((start, end)),
            "stop_codon" => builder.stop_codons.push((start, end)),
            // The feature-type filter above guarantees one arm matched.
            _ => unreachable!(),
        }
    }
    if genes.is_empty() {
        log::warn!(
            "No genes extracted from GTF file '{}'. Check that the file is in GTF (not GFF3) \
             format and contains exon features with gene_id attributes.",
            path
        );
    }
    // Assemble transcripts from the builders and attach them to their genes.
    for ((gene_id, _), mut builder) in tx_builders {
        // A transcript with no exon rows (e.g. orphan CDS lines) is dropped.
        if builder.exons.is_empty() {
            continue;
        }
        builder.exons.sort_unstable();
        let tx_start = builder.exons.first().map(|e| e.0).unwrap_or(0);
        let tx_end = builder.exons.last().map(|e| e.1).unwrap_or(0);
        let (cds_start, cds_end) = if builder.cds.is_empty() {
            // No CDS rows: non-coding transcript.
            (None, None)
        } else {
            let has_start_codon = !builder.start_codons.is_empty();
            let has_stop_codon = !builder.stop_codons.is_empty();
            let thick_start;
            let thick_end;
            if has_start_codon || has_stop_codon {
                // NOTE(review): this uses the LAST codon row in file order.
                // For codons split across introns that relies on the file
                // listing the parts in a particular order — confirm against
                // the GTF sources this is run on.
                let last_start_codon_start = builder.start_codons.last().map(|c| c.0);
                let last_stop_codon_end = builder.stop_codons.last().map(|c| c.1);
                match builder.strand {
                    '+' => {
                        // Forward strand: thick range runs from the start
                        // codon's first base through the stop codon's last
                        // base (the stop codon is included in the range).
                        // A missing codon falls back to the transcript edge.
                        thick_start = last_start_codon_start.unwrap_or(tx_start);
                        thick_end = last_stop_codon_end.unwrap_or(tx_end);
                    }
                    '-' => {
                        // Reverse strand: codons read right-to-left, so the
                        // stop codon bounds the low end and the start codon
                        // the high end. The +/-2 offsets assume a contiguous
                        // 3 bp codon; for split codons they can overshoot
                        // slightly (behavior pinned by the unit tests below).
                        thick_start = last_stop_codon_end
                            .map(|e| e.saturating_sub(2))
                            .unwrap_or(tx_start);
                        thick_end = last_start_codon_start.map(|s| s + 2).unwrap_or(tx_end);
                    }
                    _ => {
                        // Unknown strand: fall back to the transcript span.
                        thick_start = tx_start;
                        thick_end = tx_end;
                    }
                }
            } else {
                // CDS rows present but no codon rows: the thick range falls
                // back to the full transcript span rather than the CDS
                // extremes (behavior pinned by the unit tests below).
                thick_start = tx_start;
                thick_end = tx_end;
            }
            (Some(thick_start), Some(thick_end))
        };
        let transcript = Transcript {
            transcript_id: builder.transcript_id,
            chrom: builder.chrom,
            start: tx_start,
            end: tx_end,
            strand: builder.strand,
            exons: builder.exons,
            cds_start,
            cds_end,
        };
        // Transcripts whose gene never produced an exon row (so no Gene
        // entry exists) are silently discarded here.
        if let Some(gene) = genes.get_mut(&gene_id) {
            gene.transcripts.push(transcript);
        }
    }
    // Effective length = union of exonic bases, overlaps counted once.
    for gene in genes.values_mut() {
        gene.effective_length = compute_non_overlapping_length(&gene.exons);
    }
    Ok(genes)
}
/// Cheap probe: does `attribute_name` appear on any of the first
/// `max_lines` well-formed exon rows of the GTF at `path`?
///
/// Best-effort by design: an unopenable file returns `false`, and a read
/// error mid-file simply stops the scan. Comment lines, blank lines, short
/// rows, and non-exon features do not count toward the `max_lines` budget.
pub fn attribute_exists_in_gtf(path: &str, attribute_name: &str, max_lines: usize) -> bool {
    let reader = match crate::io::open_reader(path) {
        Ok(r) => r,
        Err(_) => return false,
    };
    let mut inspected: usize = 0;
    for maybe_line in reader.lines() {
        // Treat a read error like a truncated file: stop scanning.
        let row = match maybe_line {
            Ok(l) => l,
            Err(_) => break,
        };
        if row.is_empty() || row.starts_with('#') {
            continue;
        }
        let columns: Vec<&str> = row.split('\t').collect();
        if columns.len() < 9 || columns[2] != "exon" {
            continue;
        }
        if get_attribute(columns[8], attribute_name).is_some() {
            return true;
        }
        inspected += 1;
        if inspected >= max_lines {
            break;
        }
    }
    false
}
#[cfg(test)]
mod tests {
    use super::*;

    // Attribute extraction: quoted values, multiple keys, and a missing key.
    #[test]
    fn test_get_attribute() {
        let attrs =
            r#"gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1";"#;
        assert_eq!(
            get_attribute(attrs, "gene_id"),
            Some("ENSG00000223972".to_string())
        );
        assert_eq!(
            get_attribute(attrs, "gene_name"),
            Some("DDX11L1".to_string())
        );
        assert_eq!(get_attribute(attrs, "missing"), None);
    }

    // Union length: single exon, disjoint exons, overlapping exons, empty.
    // Inclusive coordinates, so [100,200] covers 101 bases.
    #[test]
    fn test_non_overlapping_length() {
        let exons = vec![Exon {
            chrom: "chr1".to_string(),
            start: 100,
            end: 200,
            strand: '+',
        }];
        assert_eq!(compute_non_overlapping_length(&exons), 101);
        let exons = vec![
            Exon {
                chrom: "chr1".to_string(),
                start: 100,
                end: 200,
                strand: '+',
            },
            Exon {
                chrom: "chr1".to_string(),
                start: 300,
                end: 400,
                strand: '+',
            },
        ];
        assert_eq!(compute_non_overlapping_length(&exons), 202);
        let exons = vec![
            Exon {
                chrom: "chr1".to_string(),
                start: 100,
                end: 200,
                strand: '+',
            },
            Exon {
                chrom: "chr1".to_string(),
                start: 150,
                end: 250,
                strand: '+',
            },
        ];
        assert_eq!(compute_non_overlapping_length(&exons), 151);
        let exons: Vec<Exon> = vec![];
        assert_eq!(compute_non_overlapping_length(&exons), 0);
    }

    // Write `content` to a unique temp file; the returned File handle keeps
    // the fixture alive for the duration of the test. The counter plus the
    // thread id makes names unique across parallel test threads.
    fn write_temp_gtf(content: &str) -> (std::path::PathBuf, std::fs::File) {
        use std::io::Write;
        use std::sync::atomic::{AtomicU64, Ordering};
        static COUNTER: AtomicU64 = AtomicU64::new(0);
        let dir = std::env::temp_dir();
        let id = COUNTER.fetch_add(1, Ordering::Relaxed);
        let path = dir.join(format!(
            "rustqc_test_{:?}_{}.gtf",
            std::thread::current().id(),
            id
        ));
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(content.as_bytes()).unwrap();
        f.flush().unwrap();
        (path, f)
    }

    // End-to-end parse: gene span/exons, two transcripts, and the forward-
    // strand thick range (start codon start through stop codon end).
    #[test]
    fn test_parse_gtf_transcripts_basic() {
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t300\t400\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t100\t250\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T2\";\n\
chr1\ttest\tCDS\t120\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t300\t380\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t120\t122\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t381\t383\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );
        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        assert_eq!(genes.len(), 1);
        let gene = &genes["G1"];
        assert_eq!(gene.exons.len(), 3);
        assert_eq!(gene.start, 100);
        assert_eq!(gene.end, 400);
        assert_eq!(gene.strand, '+');
        assert_eq!(gene.transcripts.len(), 2);
        let t1 = gene
            .transcripts
            .iter()
            .find(|t| t.transcript_id == "T1")
            .unwrap();
        assert_eq!(t1.exons, vec![(100, 200), (300, 400)]);
        assert_eq!(t1.start, 100);
        assert_eq!(t1.end, 400);
        assert_eq!(t1.cds_start, Some(120));
        assert_eq!(t1.cds_end, Some(383));
        let t2 = gene
            .transcripts
            .iter()
            .find(|t| t.transcript_id == "T2")
            .unwrap();
        assert_eq!(t2.exons, vec![(100, 250)]);
        assert_eq!(t2.start, 100);
        assert_eq!(t2.end, 250);
        // T2 has no CDS rows, so it is non-coding.
        assert_eq!(t2.cds_start, None);
        assert_eq!(t2.cds_end, None);
    }

    // Rows lacking transcript_id collapse into one synthetic transcript.
    #[test]
    fn test_parse_gtf_no_transcript_id() {
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t+\t.\tgene_id \"G1\";\n\
chr1\ttest\texon\t300\t400\t.\t+\t.\tgene_id \"G1\";\n",
        );
        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let gene = &genes["G1"];
        assert_eq!(gene.transcripts.len(), 1);
        assert_eq!(gene.transcripts[0].transcript_id, "G1__no_tx");
        assert_eq!(gene.transcripts[0].exons, vec![(100, 200), (300, 400)]);
    }

    // A transcript consisting only of CDS rows (no exon rows) is dropped.
    #[test]
    fn test_parse_gtf_cds_only_no_exon() {
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t500\t600\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T_orphan\";\n",
        );
        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let gene = &genes["G1"];
        assert_eq!(gene.transcripts.len(), 1);
        assert_eq!(gene.transcripts[0].transcript_id, "T1");
    }

    // Two genes on different chromosomes/strands. G1's CDS has no codon
    // rows, so its thick range falls back to the full transcript span.
    #[test]
    fn test_parse_gtf_multi_gene_transcripts() {
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t120\t180\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr2\ttest\texon\t500\t600\t.\t-\t.\tgene_id \"G2\"; transcript_id \"T2\";\n\
chr2\ttest\texon\t700\t800\t.\t-\t.\tgene_id \"G2\"; transcript_id \"T2\";\n",
        );
        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        assert_eq!(genes.len(), 2);
        let g1 = &genes["G1"];
        assert_eq!(g1.transcripts.len(), 1);
        // Transcript span (100-200), not CDS extremes (120-180).
        assert_eq!(g1.transcripts[0].cds_start, Some(100));
        assert_eq!(g1.transcripts[0].cds_end, Some(200));
        let g2 = &genes["G2"];
        assert_eq!(g2.transcripts.len(), 1);
        assert_eq!(g2.transcripts[0].exons, vec![(500, 600), (700, 800)]);
        assert_eq!(g2.transcripts[0].strand, '-');
        assert_eq!(g2.transcripts[0].cds_start, None);
    }

    // '+' strand: the stop codon (excluded from CDS rows in GTF) extends
    // the thick range past the last CDS base.
    #[test]
    fn test_stop_codon_extends_cds_range_forward_strand() {
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t1000\t2000\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t3000\t4000\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t1200\t2000\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t3000\t3500\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t1200\t1202\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t3501\t3503\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );
        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let t1 = &genes["G1"].transcripts[0];
        assert_eq!(t1.cds_start, Some(1200));
        assert_eq!(t1.cds_end, Some(3503));
    }

    // '-' strand: the stop codon bounds the low end (1499 - 2 = 1497) and
    // the start codon the high end (3798 + 2 = 3800).
    #[test]
    fn test_stop_codon_extends_cds_range_reverse_strand() {
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t1000\t2000\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t3000\t4000\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t1500\t2000\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t3000\t3800\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t3798\t3800\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t1497\t1499\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );
        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let t1 = &genes["G1"].transcripts[0];
        assert_eq!(t1.cds_start, Some(1497));
        assert_eq!(t1.cds_end, Some(3800));
    }

    // Split stop codon on '-' strand: the last stop_codon row in file order
    // is (101,102), and the contiguous-codon assumption gives 102 - 2 = 100.
    #[test]
    fn test_split_stop_codon_reverse_strand() {
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t400\t600\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t800\t1000\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t110\t200\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t400\t600\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t800\t900\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t898\t900\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t524\t524\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t101\t102\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );
        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let t1 = &genes["G1"].transcripts[0];
        assert_eq!(t1.cds_start, Some(100));
        assert_eq!(t1.cds_end, Some(900));
    }

    // Split start codon on '+' strand: the last start_codon row (401,401)
    // wins, so the thick range starts at 401.
    #[test]
    fn test_split_start_codon_forward_strand() {
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t400\t600\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t198\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t400\t550\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t198\t199\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t401\t401\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t551\t553\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );
        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let t1 = &genes["G1"].transcripts[0];
        assert_eq!(t1.cds_start, Some(401));
        assert_eq!(t1.cds_end, Some(553));
    }

    // A stop codon with no CDS rows must not make the transcript coding.
    #[test]
    fn test_stop_codon_without_cds_does_not_create_coding() {
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t1000\t2000\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t1500\t1502\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );
        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let t1 = &genes["G1"].transcripts[0];
        assert_eq!(t1.cds_start, None);
        assert_eq!(t1.cds_end, None);
    }
}