use crate::core::{GenomicInterval, GenomicReader, GenomicRecordIterator, Strand};
use crate::error::{Error, Result};
use crate::io::Compression;
use flate2::read::MultiGzDecoder;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
/// One feature line of a GFF/GTF file, split into its nine tab-separated columns.
///
/// Coordinates are kept exactly as written in the file; see
/// [`GffRecord::to_interval`] for the conversion applied when building an interval.
#[derive(Debug, Clone)]
pub struct GffRecord {
    /// Column 1: identifier of the sequence the feature lies on.
    pub seqid: String,
    /// Column 2: program or database that produced the feature.
    pub source: String,
    /// Column 3: feature type (for example `gene` or `exon`).
    pub feature_type: String,
    /// Column 4: start coordinate as written in the file.
    pub start: u64,
    /// Column 5: end coordinate as written in the file.
    pub end: u64,
    /// Column 6: score, or `None` when the column holds `.`.
    pub score: Option<f64>,
    /// Column 7: strand of the feature.
    pub strand: Strand,
    /// Column 8: coding phase (0, 1 or 2), or `None` when the column holds `.`.
    pub phase: Option<u8>,
    /// Column 9: raw, unparsed attribute string.
    pub attributes: String,
}
impl GffRecord {
pub fn to_interval(&self) -> Result<GenomicInterval> {
GenomicInterval::new(self.seqid.clone(), self.start - 1, self.end)
}
pub fn parse_attributes(&self) -> HashMap<String, String> {
let mut attrs = HashMap::new();
if self.attributes.contains('=') {
for pair in self.attributes.split(';') {
let pair = pair.trim();
if pair.is_empty() {
continue;
}
if let Some((key, value)) = pair.split_once('=') {
attrs.insert(key.trim().to_string(), value.trim().to_string());
}
}
} else {
for pair in self.attributes.split(';') {
let pair = pair.trim();
if pair.is_empty() {
continue;
}
let parts: Vec<&str> = pair.splitn(2, ' ').collect();
if parts.len() == 2 {
let key = parts[0].trim();
let value = parts[1].trim().trim_matches('"');
attrs.insert(key.to_string(), value.to_string());
}
}
}
attrs
}
pub fn get_attribute(&self, key: &str) -> Option<String> {
self.parse_attributes().get(key).cloned()
}
pub fn len(&self) -> u64 {
self.end.saturating_sub(self.start) + 1
}
pub fn is_empty(&self) -> bool {
self.start > self.end
}
}
/// Directives collected from the comment block at the top of a GFF file.
#[derive(Debug, Clone, Default)]
pub struct GffHeader {
    /// First token following `##gff-version`, if that directive was seen.
    pub version: Option<String>,
    /// Every `##sequence-region` directive, stored as the full trimmed line.
    pub sequence_regions: Vec<String>,
    /// Any other `##` directive, stored as the full trimmed line.
    pub directives: Vec<String>,
}
/// Streaming reader that yields [`GffRecord`]s from a line-oriented source.
pub struct GffReader {
    /// Underlying buffered line source.
    reader: Box<dyn BufRead>,
    /// Directives gathered while the reader was being constructed.
    header: GffHeader,
    /// Scratch buffer for the current line; after construction it holds the
    /// first data line that header parsing had to read ahead.
    line_buffer: String,
}
impl GffReader {
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
let path = path.as_ref();
let file = File::open(path)?;
let compression = Compression::from_path(path);
let reader: Box<dyn BufRead> = match compression {
Compression::Gzip | Compression::Bgzip => {
Box::new(BufReader::new(MultiGzDecoder::new(file)))
}
_ => Box::new(BufReader::new(file)),
};
Self::parse_header(reader)
}
pub fn header(&self) -> &GffHeader {
&self.header
}
fn parse_header(mut reader: Box<dyn BufRead>) -> Result<Self> {
let mut header = GffHeader::default();
let mut line = String::new();
let first_data_line: Option<String> = loop {
line.clear();
let bytes_read = reader.read_line(&mut line)?;
if bytes_read == 0 {
break None;
}
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
if !trimmed.starts_with('#') {
break Some(trimmed.to_string());
}
if trimmed.starts_with("##") {
if trimmed.starts_with("##gff-version") {
header.version = trimmed
.trim_start_matches("##gff-version")
.trim()
.split_whitespace()
.next()
.map(|s| s.to_string());
} else if trimmed.starts_with("##sequence-region") {
header.sequence_regions.push(trimmed.to_string());
} else {
header.directives.push(trimmed.to_string());
}
}
};
let line_buffer = first_data_line.unwrap_or_else(|| String::with_capacity(512));
Ok(Self {
reader,
header,
line_buffer,
})
}
fn parse_record(line: &str) -> Result<GffRecord> {
let parts: Vec<&str> = line.split('\t').collect();
if parts.len() != 9 {
return Err(Error::Parse(format!(
"Invalid GFF record: expected 9 fields, got {}",
parts.len()
)));
}
let start = parts[3]
.parse::<u64>()
.map_err(|e| Error::Parse(format!("Invalid start position '{}': {}", parts[3], e)))?;
let end = parts[4]
.parse::<u64>()
.map_err(|e| Error::Parse(format!("Invalid end position '{}': {}", parts[4], e)))?;
if start > end {
return Err(Error::Parse(format!(
"Invalid GFF record: start ({}) > end ({})",
start, end
)));
}
let score = if parts[5] == "." {
None
} else {
Some(
parts[5]
.parse::<f64>()
.map_err(|e| Error::Parse(format!("Invalid score '{}': {}", parts[5], e)))?,
)
};
let strand = match parts[6] {
"+" => Strand::Forward,
"-" => Strand::Reverse,
"." => Strand::Unknown,
other => {
return Err(Error::Parse(format!(
"Invalid strand '{}': must be +, -, or .",
other
)))
}
};
let phase = if parts[7] == "." {
None
} else {
let p = parts[7]
.parse::<u8>()
.map_err(|e| Error::Parse(format!("Invalid phase '{}': {}", parts[7], e)))?;
if p > 2 {
return Err(Error::Parse(format!(
"Invalid phase '{}': must be 0, 1, 2, or .",
p
)));
}
Some(p)
};
Ok(GffRecord {
seqid: parts[0].to_string(),
source: parts[1].to_string(),
feature_type: parts[2].to_string(),
start,
end,
score,
strand,
phase,
attributes: parts[8].to_string(),
})
}
}
impl GenomicRecordIterator for GffReader {
    type Record = GffRecord;

    /// Raw byte access is not provided by this reader; this always returns `Ok(None)`.
    fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
        Ok(None)
    }

    /// Returns the next feature record, skipping blank lines and `#` comments.
    ///
    /// Yields `Ok(None)` at end of input or once a `##FASTA` directive is
    /// reached, since the GFF3 format defines everything after that directive
    /// as sequence data rather than features.
    ///
    /// # Errors
    ///
    /// Propagates I/O errors from the underlying reader and parse errors from
    /// malformed lines. A malformed line is consumed, so the following call
    /// continues with the next line.
    fn next_record(&mut self) -> Result<Option<Self::Record>> {
        loop {
            // The buffer may already hold a line: the first data line read ahead
            // during header parsing. Only fetch a new line when it is empty.
            if self.line_buffer.is_empty() && self.reader.read_line(&mut self.line_buffer)? == 0 {
                return Ok(None);
            }

            let line = self.line_buffer.trim();
            if line == "##FASTA" {
                // Deliberately leave the directive in the buffer so that every
                // later call also reports the end of the annotations.
                return Ok(None);
            }

            let parsed = if line.is_empty() || line.starts_with('#') {
                None
            } else {
                Some(Self::parse_record(line))
            };

            // Clear before propagating the outcome so a failed line is reported
            // exactly once instead of being re-parsed on the next call.
            self.line_buffer.clear();
            if let Some(outcome) = parsed {
                return outcome.map(Some);
            }
        }
    }
}
impl GenomicReader for GffReader {
    /// The reader's metadata is the parsed GFF header.
    type Metadata = GffHeader;

    /// Borrows the header collected when the reader was created.
    fn metadata(&self) -> &Self::Metadata {
        &self.header
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Directives are collected into the header and the first record is still readable.
    #[test]
    fn test_gff_header_parsing() -> Result<()> {
        let gff_data = concat!(
            "##gff-version 3\n",
            "##sequence-region chr1 1 248956422\n",
            "chr1\tENSEMBL\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=BRCA1\n",
        );

        let mut temp_file = NamedTempFile::new()?;
        temp_file.write_all(gff_data.as_bytes())?;
        temp_file.flush()?;

        let mut reader = GffReader::from_path(temp_file.path())?;
        let header = reader.header();
        assert_eq!(header.version, Some("3".to_string()));
        assert_eq!(header.sequence_regions.len(), 1);

        let record = reader.next_record()?.unwrap();
        assert_eq!(record.seqid, "chr1");
        assert_eq!(record.source, "ENSEMBL");
        Ok(())
    }

    /// Every column of a well-formed line ends up in the matching field.
    #[test]
    fn test_gff_record_parsing() -> Result<()> {
        let record = GffReader::parse_record(
            "chr1\tENSEMBL\tgene\t1000\t2000\t100.5\t+\t.\tID=gene1;Name=BRCA1",
        )?;

        assert_eq!(record.seqid, "chr1");
        assert_eq!(record.source, "ENSEMBL");
        assert_eq!(record.feature_type, "gene");
        assert_eq!(record.start, 1000);
        assert_eq!(record.end, 2000);
        assert_eq!(record.score, Some(100.5));
        assert_eq!(record.strand, Strand::Forward);
        assert_eq!(record.phase, None);
        assert_eq!(record.len(), 1001);
        Ok(())
    }

    /// `key=value` attributes are split on `;` and `=`.
    #[test]
    fn test_gff3_attributes() -> Result<()> {
        let record = GffReader::parse_record(
            "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=BRCA1;Dbxref=GeneID:672",
        )?;

        let attrs = record.parse_attributes();
        assert_eq!(attrs.get("ID"), Some(&"gene1".to_string()));
        assert_eq!(attrs.get("Name"), Some(&"BRCA1".to_string()));
        assert_eq!(attrs.get("Dbxref"), Some(&"GeneID:672".to_string()));
        assert_eq!(record.get_attribute("ID"), Some("gene1".to_string()));
        Ok(())
    }

    /// `key "value"` attributes are split on the first space and unquoted.
    #[test]
    fn test_gtf_attributes() -> Result<()> {
        let record = GffReader::parse_record(
            "chr1\t.\texon\t1000\t2000\t.\t+\t.\tgene_id \"ENSG00000000001\"; transcript_id \"ENST00000000001\";",
        )?;

        let attrs = record.parse_attributes();
        assert_eq!(attrs.get("gene_id"), Some(&"ENSG00000000001".to_string()));
        assert_eq!(
            attrs.get("transcript_id"),
            Some(&"ENST00000000001".to_string())
        );
        Ok(())
    }

    /// A start greater than the end is rejected.
    #[test]
    fn test_invalid_positions() {
        let outcome = GffReader::parse_record("chr1\t.\tgene\t2000\t1000\t.\t+\t.\tID=gene1");
        assert!(outcome.is_err());
    }

    /// A phase outside 0..=2 is rejected.
    #[test]
    fn test_invalid_phase() {
        let outcome = GffReader::parse_record("chr1\t.\tCDS\t1000\t2000\t.\t+\t3\tID=cds1");
        assert!(outcome.is_err());
    }

    /// `+`, `-` and `.` map to the three strand variants.
    #[test]
    fn test_strand_parsing() -> Result<()> {
        let cases = [
            ("chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=g1", Strand::Forward),
            ("chr1\t.\tgene\t1000\t2000\t.\t-\t.\tID=g2", Strand::Reverse),
            ("chr1\t.\tgene\t1000\t2000\t.\t.\t.\tID=g3", Strand::Unknown),
        ];
        for (line, expected) in cases {
            assert_eq!(GffReader::parse_record(line)?.strand, expected);
        }
        Ok(())
    }
}