use std::io::Cursor;
use crate::reference::transcript::Strand;
use noodles_gff::feature::record::{Phase, Strand as NoodlesStrand};
use smol_str::SmolStr;
use super::feature::{AttributeMap, FeatureType};
/// Common interface over a single parsed annotation line.
///
/// Implemented in this file by `Gff3Record` and `GtfRecord`, so downstream
/// code can be written once against either format.
pub trait AnnotationRecord: Sized {
/// Parses one line of annotation text.
///
/// `source_line` is stored on the record and handed back by
/// [`AnnotationRecord::source_line`]. The implementations in this file return
/// `Ok(None)` for lines that carry no record (empty lines and lines starting
/// with `#`) and `Err` when the line cannot be interpreted as a record.
fn parse(line: &str, source_line: u64) -> Result<Option<Self>, RecordParseError>;
/// Name of the reference sequence the feature is located on.
fn seqid(&self) -> &str;
/// Feature type of the record.
fn feature_type(&self) -> &FeatureType;
/// Start coordinate as reported by the underlying parser.
fn start(&self) -> u64;
/// End coordinate as reported by the underlying parser.
fn end(&self) -> u64;
/// Strand of the feature.
fn strand(&self) -> Strand;
/// Phase of the feature, or `None` when the record has no usable phase.
fn phase(&self) -> Option<u8>;
/// Identifier of this feature, if the record provides one.
fn id(&self) -> Option<&str>;
/// Identifiers of the parent feature(s); empty when there is none.
fn parents(&self) -> &[String];
/// Looks up the value of the attribute named `key`, if present.
// NOTE(review): `dead_code` is allowed here without explanation; presumably the
// method is only called from tests at the moment — confirm before removing.
#[allow(dead_code)]
fn attribute(&self, key: &str) -> Option<&str>;
/// Line number that was passed to `parse` when this record was created.
fn source_line(&self) -> u64;
/// Consumes the record and returns its attribute map.
fn into_attrs(self) -> AttributeMap;
}
/// Converts the strand reported by noodles into the crate's own [`Strand`].
///
/// `Forward` maps to `Plus`, `Reverse` to `Minus`, and both `None` and
/// `Unknown` collapse to `Strand::Unknown`.
///
/// # Errors
///
/// If noodles failed to decode the strand, the raw text of the 7th
/// tab-separated column of `trimmed` (empty if the column is missing) is
/// returned inside [`RecordParseError::BadStrand`].
fn parse_strand<E>(
    nstrand: Result<NoodlesStrand, E>,
    trimmed: &str,
) -> Result<Strand, RecordParseError> {
    nstrand
        .map(|decoded| match decoded {
            NoodlesStrand::Forward => Strand::Plus,
            NoodlesStrand::Reverse => Strand::Minus,
            NoodlesStrand::None | NoodlesStrand::Unknown => Strand::Unknown,
        })
        .map_err(|_| {
            // Report what was actually in the strand column, not the parser's message.
            let column = trimmed.split('\t').nth(6).unwrap_or("");
            RecordParseError::BadStrand(column.to_string())
        })
}
/// Converts the phase reported by noodles into a plain `u8`.
///
/// Returns `None` when the record has no phase. When noodles rejects the
/// phase value, this falls back leniently to parsing the 8th tab-separated
/// column of `trimmed` as a `u8`, yielding `None` if that fails as well.
fn parse_phase<E>(nphase: Option<Result<Phase, E>>, trimmed: &str) -> Option<u8> {
    match nphase? {
        Ok(Phase::Zero) => Some(0),
        Ok(Phase::One) => Some(1),
        Ok(Phase::Two) => Some(2),
        Err(_) => {
            // Lenient path: accept any integer that fits in a u8.
            let column = trimmed.split('\t').nth(7)?;
            column.parse().ok()
        }
    }
}
/// Errors produced while turning one line of annotation text into a record.
#[derive(Debug, thiserror::Error)]
pub enum RecordParseError {
/// A start or end coordinate could not be read.
///
/// The first field identifies the offending coordinate; the second is the
/// column number (the parsers in this file use 4 for start and 5 for end).
#[error("invalid coordinate '{0}' in column {1}")]
BadCoordinate(String, usize),
/// The strand column could not be decoded; carries the raw column text.
#[error("invalid strand '{0}'")]
BadStrand(String),
/// Any other failure; carries the underlying parser's message as text.
#[error("malformed record: {0}")]
Malformed(String),
}
/// One feature line of a GFF3 file, decoded into owned fields.
#[derive(Debug, Clone)]
pub struct Gff3Record {
// Reference sequence name of the record.
seqid: String,
// Feature type, derived from the record's type column via `FeatureType::from_so_term`.
feature_type: FeatureType,
// Start coordinate as reported by the noodles record.
start: u64,
// End coordinate as reported by the noodles record.
end: u64,
// Strand, mapped into the crate's own `Strand` type.
strand: Strand,
// Phase, if the record has a usable one.
phase: Option<u8>,
// Value of the `ID` attribute, if present.
id: Option<String>,
// Value(s) of the `Parent` attribute; empty when the attribute is absent.
parents: Vec<String>,
// Every attribute other than `ID` and `Parent`, as key/value pairs.
attrs: AttributeMap,
// Line number supplied by the caller of `parse`.
source_line: u64,
}
impl Gff3Record {
    /// Parses one line of GFF3 text.
    ///
    /// Trailing `\n` and `\r` characters are stripped before parsing.
    /// `source_line` is stored verbatim on the resulting record.
    ///
    /// Returns `Ok(None)` when the line is empty, starts with `#`, or is not
    /// classified as a record by the noodles reader.
    ///
    /// The `ID` attribute becomes the record id, `Parent` (single or
    /// comma-separated list) becomes the parent list, and every other
    /// attribute is collected into the attribute map.
    ///
    /// # Errors
    ///
    /// * [`RecordParseError::Malformed`] if noodles cannot read the line or one
    ///   of its attributes.
    /// * [`RecordParseError::BadCoordinate`] if the start (column 4) or end
    ///   (column 5) cannot be decoded; the error carries the raw column text.
    /// * [`RecordParseError::BadStrand`] if the strand column is invalid.
    pub fn parse(line: &str, source_line: u64) -> Result<Option<Self>, RecordParseError> {
        use noodles_gff::record::attributes::field::Value as GffValue;

        let trimmed = line.trim_end_matches('\n').trim_end_matches('\r');
        if trimmed.is_empty() || trimmed.starts_with('#') {
            return Ok(None);
        }

        // Raw text of a 0-based tab-separated column, used so error payloads show
        // what was actually in the file (same approach as `BadStrand`).
        let raw_column = |index: usize| trimmed.split('\t').nth(index).unwrap_or("").to_string();

        let cursor = Cursor::new(trimmed.as_bytes());
        let mut reader = noodles_gff::io::Reader::new(cursor);
        let mut nline = noodles_gff::Line::default();
        reader
            .read_line(&mut nline)
            .map_err(|e| RecordParseError::Malformed(e.to_string()))?;
        let nrecord = match nline.as_record() {
            Some(Ok(r)) => r,
            Some(Err(e)) => return Err(RecordParseError::Malformed(e.to_string())),
            None => return Ok(None),
        };

        let seqid = nrecord.reference_sequence_name().to_string();
        let feature_type = FeatureType::from_so_term(&nrecord.ty().to_string());
        let start = nrecord
            .start()
            .map(|p| usize::from(p) as u64)
            .map_err(|_| RecordParseError::BadCoordinate(raw_column(3), 4))?;
        let end = nrecord
            .end()
            .map(|p| usize::from(p) as u64)
            .map_err(|_| RecordParseError::BadCoordinate(raw_column(4), 5))?;
        let strand = parse_strand(nrecord.strand(), trimmed)?;
        let phase = parse_phase(nrecord.phase(), trimmed);

        let mut id: Option<String> = None;
        let mut parents: Vec<String> = Vec::new();
        let mut attrs: AttributeMap = Vec::new();
        for result in nrecord.attributes().iter() {
            let (tag, value) = result.map_err(|e| RecordParseError::Malformed(e.to_string()))?;
            let key_str = tag.to_string();
            match key_str.as_str() {
                "ID" => {
                    id = Some(value.as_ref().to_string());
                }
                "Parent" => match value {
                    GffValue::String(s) => {
                        parents = vec![s.to_string()];
                    }
                    GffValue::Array(arr) => {
                        parents = arr.iter().map(|cow| cow.to_string()).collect();
                    }
                },
                // Everything else is kept as a generic key/value attribute.
                _ => {
                    attrs.push((
                        SmolStr::new(&key_str),
                        SmolStr::new(value.as_ref().to_string()),
                    ));
                }
            }
        }

        Ok(Some(Self {
            seqid,
            feature_type,
            start,
            end,
            strand,
            phase,
            id,
            parents,
            attrs,
            source_line,
        }))
    }
}
impl AnnotationRecord for Gff3Record {
fn parse(line: &str, source_line: u64) -> Result<Option<Self>, RecordParseError> {
Self::parse(line, source_line)
}
fn seqid(&self) -> &str {
&self.seqid
}
fn feature_type(&self) -> &FeatureType {
&self.feature_type
}
fn start(&self) -> u64 {
self.start
}
fn end(&self) -> u64 {
self.end
}
fn strand(&self) -> Strand {
self.strand
}
fn phase(&self) -> Option<u8> {
self.phase
}
fn id(&self) -> Option<&str> {
self.id.as_deref()
}
fn parents(&self) -> &[String] {
&self.parents
}
fn attribute(&self, key: &str) -> Option<&str> {
super::feature::attr_get(&self.attrs, key)
}
fn source_line(&self) -> u64 {
self.source_line
}
fn into_attrs(self) -> AttributeMap {
self.attrs
}
}
/// One feature line of a GTF file, decoded into owned fields.
#[derive(Debug, Clone)]
pub struct GtfRecord {
// Reference sequence name of the record.
seqid: String,
// Feature type, derived from the record's type column via `FeatureType::from_so_term`.
feature_type: FeatureType,
// Start coordinate as reported by the noodles record.
start: u64,
// End coordinate as reported by the noodles record.
end: u64,
// Strand, mapped into the crate's own `Strand` type.
strand: Strand,
// Phase, if the record has a usable one.
phase: Option<u8>,
// Derived id: `gene_id` for gene rows, `transcript_id` for transcript-like rows, else `None`.
id: Option<String>,
// Derived parent list: `gene_id` for transcript-like rows, `transcript_id` for other
// non-gene rows, empty for gene rows.
parents: Vec<String>,
// All attributes of the line as key/value pairs (including `gene_id` / `transcript_id`).
attrs: AttributeMap,
// Line number supplied by the caller of `parse`.
source_line: u64,
}
impl GtfRecord {
    /// Parses one line of GTF text.
    ///
    /// Trailing `\n` and `\r` characters are stripped before parsing.
    /// `source_line` is stored verbatim on the resulting record.
    ///
    /// Returns `Ok(None)` when the line is empty, starts with `#`, or is not
    /// classified as a record by the noodles reader.
    ///
    /// GTF has no explicit `ID` / `Parent` attributes, so they are derived:
    ///
    /// * gene / pseudogene rows: id = `gene_id`, no parents;
    /// * transcript-like rows: id = `transcript_id`, parent = `gene_id`;
    /// * everything else: no id, parent = `transcript_id`.
    ///
    /// All attributes are kept in the attribute map. For an attribute whose
    /// value is an array, only the first element is retained.
    ///
    /// # Errors
    ///
    /// * [`RecordParseError::Malformed`] if noodles cannot read the line or its
    ///   attributes.
    /// * [`RecordParseError::BadCoordinate`] if the start (column 4) or end
    ///   (column 5) cannot be decoded; the error carries the raw column text.
    /// * [`RecordParseError::BadStrand`] if the strand column is invalid.
    pub fn parse(line: &str, source_line: u64) -> Result<Option<Self>, RecordParseError> {
        use noodles_gtf::record::attributes::field::Value as GtfValue;

        let trimmed = line.trim_end_matches('\n').trim_end_matches('\r');
        if trimmed.is_empty() || trimmed.starts_with('#') {
            return Ok(None);
        }

        // Raw text of a 0-based tab-separated column, used so error payloads show
        // what was actually in the file (same approach as `BadStrand`).
        let raw_column = |index: usize| trimmed.split('\t').nth(index).unwrap_or("").to_string();

        let cursor = Cursor::new(trimmed.as_bytes());
        let mut reader = noodles_gtf::io::Reader::new(cursor);
        let mut nline = noodles_gtf::Line::default();
        reader
            .read_line(&mut nline)
            .map_err(|e| RecordParseError::Malformed(e.to_string()))?;
        let nrecord = match nline.as_record() {
            Some(Ok(r)) => r,
            Some(Err(e)) => return Err(RecordParseError::Malformed(e.to_string())),
            None => return Ok(None),
        };

        let seqid = nrecord.reference_sequence_name().to_string();
        let feature_type = FeatureType::from_so_term(&nrecord.ty().to_string());
        let start = nrecord
            .start()
            .map(|p| usize::from(p) as u64)
            .map_err(|_| RecordParseError::BadCoordinate(raw_column(3), 4))?;
        let end = nrecord
            .end()
            .map(|p| usize::from(p) as u64)
            .map_err(|_| RecordParseError::BadCoordinate(raw_column(4), 5))?;
        let strand = parse_strand(nrecord.strand(), trimmed)?;
        let phase = parse_phase(nrecord.phase(), trimmed);

        let nattrs = nrecord
            .attributes()
            .map_err(|e| RecordParseError::Malformed(e.to_string()))?;
        let mut attrs: AttributeMap = Vec::new();
        for result in nattrs.iter() {
            let (key, value) = result.map_err(|e| RecordParseError::Malformed(e.to_string()))?;
            let val_str = match value {
                GtfValue::String(s) => s.to_string(),
                // Only the first element of an array value is kept.
                GtfValue::Array(parts) => parts
                    .iter()
                    .next()
                    .map(|c| c.to_string())
                    .unwrap_or_default(),
            };
            attrs.push((SmolStr::new(key.to_string()), SmolStr::new(val_str)));
        }

        let transcript_id = super::feature::attr_get(&attrs, "transcript_id").map(String::from);
        let gene_id = super::feature::attr_get(&attrs, "gene_id").map(String::from);
        // Each arm consumes the ids it needs; the arms are mutually exclusive, so
        // the values can be moved rather than cloned.
        let (id, parents) = match &feature_type {
            FeatureType::Gene | FeatureType::PseudoGene => (gene_id, Vec::new()),
            ft if ft.is_transcript_like() => (transcript_id, gene_id.into_iter().collect()),
            _ => (None, transcript_id.into_iter().collect()),
        };

        Ok(Some(Self {
            seqid,
            feature_type,
            start,
            end,
            strand,
            phase,
            id,
            parents,
            attrs,
            source_line,
        }))
    }
}
impl AnnotationRecord for GtfRecord {
fn parse(line: &str, source_line: u64) -> Result<Option<Self>, RecordParseError> {
Self::parse(line, source_line)
}
fn seqid(&self) -> &str {
&self.seqid
}
fn feature_type(&self) -> &FeatureType {
&self.feature_type
}
fn start(&self) -> u64 {
self.start
}
fn end(&self) -> u64 {
self.end
}
fn strand(&self) -> Strand {
self.strand
}
fn phase(&self) -> Option<u8> {
self.phase
}
fn id(&self) -> Option<&str> {
self.id.as_deref()
}
fn parents(&self) -> &[String] {
&self.parents
}
fn attribute(&self, key: &str) -> Option<&str> {
super::feature::attr_get(&self.attrs, key)
}
fn source_line(&self) -> u64 {
self.source_line
}
fn into_attrs(self) -> AttributeMap {
self.attrs
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::reference::transcript::Strand;

    /// Parses `line` as GFF3, panicking unless it yields a record.
    fn gff3(line: &str, source_line: u64) -> Gff3Record {
        Gff3Record::parse(line, source_line)
            .expect("parse")
            .expect("not comment")
    }

    /// Parses `line` as GTF (source line 1), panicking unless it yields a record.
    fn gtf(line: &str) -> GtfRecord {
        GtfRecord::parse(line, 1)
            .expect("parse")
            .expect("not comment")
    }

    // ---- GFF3 ----

    #[test]
    fn gff3_record_parses_basic_mrna_line() {
        let rec = gff3(
            "chr1\t.\tmRNA\t100\t200\t.\t+\t.\tID=tx1;Parent=gene01;gene=GENE1",
            5,
        );
        assert_eq!(rec.seqid(), "chr1");
        assert_eq!(rec.feature_type(), &FeatureType::Mrna);
        assert_eq!(rec.start(), 100);
        assert_eq!(rec.end(), 200);
        assert_eq!(rec.strand(), Strand::Plus);
        assert_eq!(rec.id(), Some("tx1"));
        assert_eq!(rec.parents(), &["gene01"]);
        assert_eq!(rec.attribute("gene"), Some("GENE1"));
        assert_eq!(rec.source_line(), 5);
    }

    #[test]
    fn gff3_record_handles_multi_parent() {
        let rec = gff3("chr1\t.\texon\t100\t200\t.\t+\t.\tID=ex1;Parent=tx1,tx2", 1);
        assert_eq!(rec.parents(), &["tx1", "tx2"]);
    }

    #[test]
    fn gff3_record_handles_unknown_strand() {
        let rec = gff3("chr1\t.\texon\t100\t200\t.\t.\t.\tID=ex1", 1);
        assert_eq!(rec.strand(), Strand::Unknown);
    }

    #[test]
    fn gff3_record_url_decodes_attributes() {
        let rec = gff3("chr1\t.\tgene\t100\t200\t.\t+\t.\tID=g1;Name=My%20Gene", 1);
        assert_eq!(rec.attribute("Name"), Some("My Gene"));
    }

    #[test]
    fn gff3_record_url_decodes_multibyte_utf8() {
        let rec = gff3("chr1\t.\tgene\t100\t200\t.\t+\t.\tID=g1;Name=caf%C3%A9", 1);
        assert_eq!(rec.attribute("Name"), Some("café"));
    }

    #[test]
    fn gff3_record_skips_comments() {
        for skipped in ["# header", ""] {
            assert!(Gff3Record::parse(skipped, 1).unwrap().is_none());
        }
    }

    #[test]
    fn gff3_record_rejects_malformed_coordinate() {
        let outcome = Gff3Record::parse("chr1\t.\texon\tabc\t200\t.\t+\t.\tID=ex1", 1);
        assert!(outcome.is_err());
    }

    #[test]
    fn gff3_record_bad_strand_reports_actual_value() {
        let outcome = Gff3Record::parse("chr1\t.\texon\t100\t200\t.\tx\t.\tID=ex1", 1);
        match outcome {
            Err(RecordParseError::BadStrand(v)) => assert_eq!(v, "x"),
            other => panic!("expected BadStrand(\"x\"), got {:?}", other),
        }
    }

    #[test]
    fn gff3_record_lenient_phase_fallback() {
        let rec = gff3("chr1\t.\tCDS\t100\t200\t.\t+\t5\tID=cds1", 1);
        assert_eq!(rec.phase(), Some(5));
    }

    // ---- GTF ----

    #[test]
    fn gtf_record_parses_exon_with_transcript_id() {
        let rec = gtf(
            "chr1\tHAVANA\texon\t100\t200\t.\t+\t.\tgene_id \"ENSG1\"; transcript_id \"ENST1\"; exon_number \"1\";",
        );
        assert_eq!(rec.seqid(), "chr1");
        assert_eq!(rec.feature_type(), &FeatureType::Exon);
        assert_eq!(rec.start(), 100);
        assert_eq!(rec.end(), 200);
        assert_eq!(rec.parents(), &["ENST1"]);
        assert_eq!(rec.attribute("gene_id"), Some("ENSG1"));
        assert_eq!(rec.attribute("exon_number"), Some("1"));
    }

    #[test]
    fn gtf_record_transcript_row_uses_self_id() {
        let rec = gtf(
            "chr1\tHAVANA\ttranscript\t100\t500\t.\t+\t.\tgene_id \"ENSG1\"; transcript_id \"ENST1\";",
        );
        assert_eq!(rec.id(), Some("ENST1"));
        assert_eq!(rec.parents(), &["ENSG1"]);
    }

    #[test]
    fn gtf_record_gene_row_uses_gene_id() {
        let rec = gtf("chr1\tHAVANA\tgene\t100\t500\t.\t+\t.\tgene_id \"ENSG1\";");
        assert_eq!(rec.id(), Some("ENSG1"));
        assert!(rec.parents().is_empty());
    }

    #[test]
    fn gtf_record_parses_minus_strand() {
        let rec = gtf(
            "chr1\tHAVANA\texon\t100\t200\t.\t-\t.\tgene_id \"g1\"; transcript_id \"tx1\";",
        );
        assert_eq!(rec.strand(), Strand::Minus);
    }

    #[test]
    fn gtf_record_parses_phase_for_cds() {
        let rec = gtf(
            "chr1\tHAVANA\tCDS\t100\t200\t.\t+\t2\tgene_id \"g1\"; transcript_id \"tx1\";",
        );
        assert_eq!(rec.phase(), Some(2));
    }

    #[test]
    fn gtf_record_bad_strand_reports_actual_value() {
        let outcome = GtfRecord::parse(
            "chr1\tHAVANA\texon\t100\t200\t.\tx\t.\tgene_id \"g1\"; transcript_id \"tx1\";",
            1,
        );
        match outcome {
            Err(RecordParseError::BadStrand(v)) => assert_eq!(v, "x"),
            other => panic!("expected BadStrand(\"x\"), got {:?}", other),
        }
    }

    #[test]
    fn gtf_record_lenient_phase_fallback() {
        let rec = gtf(
            "chr1\tHAVANA\tCDS\t100\t200\t.\t+\t5\tgene_id \"g1\"; transcript_id \"tx1\";",
        );
        assert_eq!(rec.phase(), Some(5));
    }
}