Skip to main content

genomicframe_core/formats/gff/
reader.rs

1//! GFF/GTF format support (gene annotations)
2//!
3//! GFF (General Feature Format) and GTF (Gene Transfer Format) are
4//! tab-delimited text formats for genomic annotations.
5//!
6//! # Design Philosophy
7//!
8//! - **Streaming by default**: Records are never buffered unless explicitly requested
9//! - **Zero-copy where possible**: Minimize allocations during parsing
10//! - **Automatic compression detection**: Handles `.gff`, `.gtf`, `.gff.gz`, `.gtf.gz` transparently
11//! - **Lazy evaluation**: Process millions of annotations with O(1) memory
12//!
13//! # Format Specification
14//!
15//! GFF/GTF files have 9 tab-delimited columns:
16//! 1. seqid - chromosome/scaffold name
17//! 2. source - annotation source (e.g., "ENSEMBL", "RefSeq")
18//! 3. type - feature type (e.g., "gene", "exon", "CDS")
19//! 4. start - 1-based start position (inclusive)
20//! 5. end - 1-based end position (inclusive)
21//! 6. score - floating point score (or ".")
22//! 7. strand - +, -, or .
23//! 8. phase - 0, 1, 2, or . (for CDS features)
24//! 9. attributes - semicolon-separated key-value pairs
25//!
26//! # Examples
27//!
28//! ```no_run
29//! use genomicframe_core::formats::gff::GffReader;
30//! use genomicframe_core::core::GenomicRecordIterator;
31//!
32//! // Streaming: O(1) memory, processes one record at a time
33//! let mut reader = GffReader::from_path("annotations.gff.gz")?;
34//!
35//! while let Some(record) = reader.next_record()? {
36//!     if record.feature_type == "gene" {
37//!         println!("Gene: {} at {}:{}-{}",
38//!             record.seqid, record.start, record.end,
39//!             record.strand);
40//!     }
41//! }
42//! # Ok::<(), genomicframe_core::error::Error>(())
43//! ```
44
45use crate::core::{GenomicInterval, GenomicReader, GenomicRecordIterator, Strand};
46use crate::error::{Error, Result};
47use crate::io::Compression;
48// use anyhow::Ok;
49use flate2::read::MultiGzDecoder;
50use std::collections::HashMap;
51use std::fs::File;
52use std::io::{BufRead, BufReader};
53use std::path::Path;
54
55/// A single GFF/GTF record (feature annotation)
56#[derive(Debug, Clone)]
57pub struct GffRecord {
58    /// Sequence/chromosome name
59    pub seqid: String,
60    /// Source of the annotation
61    pub source: String,
62    /// Feature type (e.g., "gene", "exon", "CDS")
63    pub feature_type: String,
64    /// Start position (1-based, inclusive)
65    pub start: u64,
66    /// End position (1-based, inclusive)
67    pub end: u64,
68    /// Score
69    pub score: Option<f64>,
70    /// Strand
71    pub strand: Strand,
72    /// Phase (for CDS features)
73    pub phase: Option<u8>,
74    /// Attributes (semicolon-separated key-value pairs, stored as raw string)
75    pub attributes: String,
76}
77
78impl GffRecord {
79    /// Convert to a genomic interval (0-based, half-open)
80    pub fn to_interval(&self) -> Result<GenomicInterval> {
81        GenomicInterval::new(self.seqid.clone(), self.start - 1, self.end)
82    }
83
84    /// Parse attributes into a HashMap (GTF format: key "value"; GFF3 format: key=value)
85    ///
86    /// This is lazy - only parse when needed. Most use cases just need the raw string.
87    pub fn parse_attributes(&self) -> HashMap<String, String> {
88        let mut attrs = HashMap::new();
89
90        // Detect format: GTF uses quotes, GFF3 uses equals
91        if self.attributes.contains('=') {
92            // GFF3 format: key1=value1;key2=value2
93            for pair in self.attributes.split(';') {
94                let pair = pair.trim();
95                if pair.is_empty() {
96                    continue;
97                }
98                if let Some((key, value)) = pair.split_once('=') {
99                    attrs.insert(key.trim().to_string(), value.trim().to_string());
100                }
101            }
102        } else {
103            // GTF format: key1 "value1"; key2 "value2";
104            for pair in self.attributes.split(';') {
105                let pair = pair.trim();
106                if pair.is_empty() {
107                    continue;
108                }
109                let parts: Vec<&str> = pair.splitn(2, ' ').collect();
110                if parts.len() == 2 {
111                    let key = parts[0].trim();
112                    let value = parts[1].trim().trim_matches('"');
113                    attrs.insert(key.to_string(), value.to_string());
114                }
115            }
116        }
117
118        attrs
119    }
120
121    /// Get a specific attribute value by key
122    pub fn get_attribute(&self, key: &str) -> Option<String> {
123        self.parse_attributes().get(key).cloned()
124    }
125
126    /// Get feature length
127    pub fn len(&self) -> u64 {
128        self.end.saturating_sub(self.start) + 1
129    }
130
131    /// Check if feature has zero length (shouldn't happen in valid GFF)
132    pub fn is_empty(&self) -> bool {
133        self.start > self.end
134    }
135}
136
137/// GFF/GTF header metadata
138///
139/// GFF files may have header lines starting with ##
140/// These provide metadata about the file
141#[derive(Debug, Clone, Default)]
142pub struct GffHeader {
143    /// GFF version (e.g., "3", "2")
144    pub version: Option<String>,
145    /// Sequence region definitions: ##sequence-region seqid start end
146    pub sequence_regions: Vec<String>,
147    /// Other header directives
148    pub directives: Vec<String>,
149}
150
151/// GFF/GTF file reader (streaming, memory-efficient)
152///
153/// **The only way to create a GffReader is `GffReader::from_path()`.**
154///
155/// This follows the same design as VcfReader - one obvious way to read a file.
156pub struct GffReader {
157    reader: Box<dyn BufRead>,
158    header: GffHeader,
159    line_buffer: String,
160}
161
162impl GffReader {
163    /// Open a GFF/GTF file - handles .gff, .gtf, .gff.gz, .gtf.gz automatically
164    ///
165    /// # Memory Behavior
166    ///
167    /// - Opens file handle (~8KB buffer)
168    /// - Parses header (~1-10KB typically)
169    /// - **Does NOT load annotations into memory**
170    /// - Records are parsed on-demand as you iterate
171    ///
172    /// # Example
173    ///
174    /// ```no_run
175    /// use genomicframe_core::formats::gff::GffReader;
176    /// use genomicframe_core::core::GenomicRecordIterator;
177    ///
178    /// // Works with multi-GB files using only ~10KB RAM
179    /// let mut reader = GffReader::from_path("gencode.v45.gff3.gz")?;
180    ///
181    /// while let Some(record) = reader.next_record()? {
182    ///     if record.feature_type == "exon" {
183    ///         println!("{:?}", record);
184    ///     }
185    /// }
186    /// # Ok::<(), genomicframe_core::error::Error>(())
187    /// ```
188    pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
189        let path = path.as_ref();
190        let file = File::open(path)?;
191        let compression = Compression::from_path(path);
192
193        let reader: Box<dyn BufRead> = match compression {
194            Compression::Gzip | Compression::Bgzip => {
195                Box::new(BufReader::new(MultiGzDecoder::new(file)))
196            }
197            _ => Box::new(BufReader::new(file)),
198        };
199
200        Self::parse_header(reader)
201    }
202
203    /// Get the GFF header metadata
204    pub fn header(&self) -> &GffHeader {
205        &self.header
206    }
207
208    /// Internal: Parse GFF header and return reader positioned at first record
209    fn parse_header(mut reader: Box<dyn BufRead>) -> Result<Self> {
210        let mut header = GffHeader::default();
211        let mut line = String::new();
212        let first_data_line: Option<String> = loop {
213            line.clear();
214            let bytes_read = reader.read_line(&mut line)?;
215
216            // Empty file
217            if bytes_read == 0 {
218                break None;
219            }
220
221            let trimmed = line.trim();
222
223            // Skip empty lines
224            if trimmed.is_empty() {
225                continue;
226            }
227
228            // Check if this is a header line
229            if !trimmed.starts_with('#') {
230                // Hit first data line - save it and done with header
231                break Some(trimmed.to_string());
232            }
233
234            // Parse header directives (lines starting with ##)
235            if trimmed.starts_with("##") {
236                if trimmed.starts_with("##gff-version") {
237                    header.version = trimmed
238                        .trim_start_matches("##gff-version")
239                        .trim()
240                        .split_whitespace()
241                        .next()
242                        .map(|s| s.to_string());
243                } else if trimmed.starts_with("##sequence-region") {
244                    header.sequence_regions.push(trimmed.to_string());
245                } else {
246                    header.directives.push(trimmed.to_string());
247                }
248            }
249            // Single # is a comment, just skip it
250        };
251
252        // Create reader with the first data line pre-loaded
253        let line_buffer = first_data_line.unwrap_or_else(|| String::with_capacity(512));
254
255        Ok(Self {
256            reader,
257            header,
258            line_buffer,
259        })
260    }
261
262    /// Parse a single GFF record line
263    fn parse_record(line: &str) -> Result<GffRecord> {
264        let parts: Vec<&str> = line.split('\t').collect();
265
266        if parts.len() != 9 {
267            return Err(Error::Parse(format!(
268                "Invalid GFF record: expected 9 fields, got {}",
269                parts.len()
270            )));
271        }
272
273        // Parse start and end positions
274        let start = parts[3]
275            .parse::<u64>()
276            .map_err(|e| Error::Parse(format!("Invalid start position '{}': {}", parts[3], e)))?;
277
278        let end = parts[4]
279            .parse::<u64>()
280            .map_err(|e| Error::Parse(format!("Invalid end position '{}': {}", parts[4], e)))?;
281
282        // Validate start <= end
283        if start > end {
284            return Err(Error::Parse(format!(
285                "Invalid GFF record: start ({}) > end ({})",
286                start, end
287            )));
288        }
289
290        // Parse score (optional)
291        let score = if parts[5] == "." {
292            None
293        } else {
294            Some(
295                parts[5]
296                    .parse::<f64>()
297                    .map_err(|e| Error::Parse(format!("Invalid score '{}': {}", parts[5], e)))?,
298            )
299        };
300
301        // Parse strand
302        let strand = match parts[6] {
303            "+" => Strand::Forward,
304            "-" => Strand::Reverse,
305            "." => Strand::Unknown,
306            other => {
307                return Err(Error::Parse(format!(
308                    "Invalid strand '{}': must be +, -, or .",
309                    other
310                )))
311            }
312        };
313
314        // Parse phase (optional, for CDS features)
315        let phase = if parts[7] == "." {
316            None
317        } else {
318            let p = parts[7]
319                .parse::<u8>()
320                .map_err(|e| Error::Parse(format!("Invalid phase '{}': {}", parts[7], e)))?;
321            if p > 2 {
322                return Err(Error::Parse(format!(
323                    "Invalid phase '{}': must be 0, 1, 2, or .",
324                    p
325                )));
326            }
327            Some(p)
328        };
329
330        Ok(GffRecord {
331            seqid: parts[0].to_string(),
332            source: parts[1].to_string(),
333            feature_type: parts[2].to_string(),
334            start,
335            end,
336            score,
337            strand,
338            phase,
339            attributes: parts[8].to_string(),
340        })
341    }
342}
343
344impl GenomicRecordIterator for GffReader {
345    type Record = GffRecord;
346
347    fn next_raw(&mut self) -> Result<Option<Vec<u8>>> { 
348        // TODO: Implement
349        Ok(None)
350    }
351
352    fn next_record(&mut self) -> Result<Option<Self::Record>> {
353        loop {
354            // If we have a pre-loaded line (from parse_header), use it first
355            if !self.line_buffer.is_empty() {
356                let line = self.line_buffer.clone();
357                self.line_buffer.clear();
358
359                let trimmed = line.trim();
360                if !trimmed.is_empty() && !trimmed.starts_with('#') {
361                    return Ok(Some(Self::parse_record(trimmed)?));
362                }
363                // If it was empty or a comment, fall through to read next line
364            }
365
366            // Read next line from file
367            let bytes_read = self.reader.read_line(&mut self.line_buffer)?;
368
369            // End of file
370            if bytes_read == 0 {
371                return Ok(None);
372            }
373
374            let line = self.line_buffer.trim();
375
376            // Skip empty lines and comments
377            if line.is_empty() || line.starts_with('#') {
378                self.line_buffer.clear();
379                continue;
380            }
381
382            let result = Self::parse_record(line)?;
383            self.line_buffer.clear();
384            return Ok(Some(result));
385        }
386    }
387}
388
389impl GenomicReader for GffReader {
390    type Metadata = GffHeader;
391
392    fn metadata(&self) -> &Self::Metadata {
393        &self.header
394    }
395}
396
397#[cfg(test)]
398mod tests {
399    use super::*;
400    use std::io::Write;
401    use tempfile::NamedTempFile;
402
403    #[test]
404    fn test_gff_header_parsing() -> Result<()> {
405        let gff_data = "##gff-version 3\n\
406                        ##sequence-region chr1 1 248956422\n\
407                        chr1\tENSEMBL\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=BRCA1\n";
408
409        let mut temp_file = NamedTempFile::new()?;
410        temp_file.write_all(gff_data.as_bytes())?;
411        temp_file.flush()?;
412
413        let mut reader = GffReader::from_path(temp_file.path())?;
414
415        assert_eq!(reader.header().version, Some("3".to_string()));
416        assert_eq!(reader.header().sequence_regions.len(), 1);
417
418        let record = reader.next_record()?.unwrap();
419        assert_eq!(record.seqid, "chr1");
420        assert_eq!(record.source, "ENSEMBL");
421
422        Ok(())
423    }
424
425    #[test]
426    fn test_gff_record_parsing() -> Result<()> {
427        let line = "chr1\tENSEMBL\tgene\t1000\t2000\t100.5\t+\t.\tID=gene1;Name=BRCA1";
428        let record = GffReader::parse_record(line)?;
429
430        assert_eq!(record.seqid, "chr1");
431        assert_eq!(record.source, "ENSEMBL");
432        assert_eq!(record.feature_type, "gene");
433        assert_eq!(record.start, 1000);
434        assert_eq!(record.end, 2000);
435        assert_eq!(record.score, Some(100.5));
436        assert_eq!(record.strand, Strand::Forward);
437        assert_eq!(record.phase, None);
438        assert_eq!(record.len(), 1001);
439
440        Ok(())
441    }
442
443    #[test]
444    fn test_gff3_attributes() -> Result<()> {
445        let line = "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=BRCA1;Dbxref=GeneID:672";
446        let record = GffReader::parse_record(line)?;
447
448        let attrs = record.parse_attributes();
449        assert_eq!(attrs.get("ID"), Some(&"gene1".to_string()));
450        assert_eq!(attrs.get("Name"), Some(&"BRCA1".to_string()));
451        assert_eq!(attrs.get("Dbxref"), Some(&"GeneID:672".to_string()));
452
453        assert_eq!(record.get_attribute("ID"), Some("gene1".to_string()));
454
455        Ok(())
456    }
457
458    #[test]
459    fn test_gtf_attributes() -> Result<()> {
460        let line = "chr1\t.\texon\t1000\t2000\t.\t+\t.\tgene_id \"ENSG00000000001\"; transcript_id \"ENST00000000001\";";
461        let record = GffReader::parse_record(line)?;
462
463        let attrs = record.parse_attributes();
464        assert_eq!(attrs.get("gene_id"), Some(&"ENSG00000000001".to_string()));
465        assert_eq!(
466            attrs.get("transcript_id"),
467            Some(&"ENST00000000001".to_string())
468        );
469
470        Ok(())
471    }
472
473    #[test]
474    fn test_invalid_positions() {
475        let line = "chr1\t.\tgene\t2000\t1000\t.\t+\t.\tID=gene1";
476        let result = GffReader::parse_record(line);
477        assert!(result.is_err());
478    }
479
480    #[test]
481    fn test_invalid_phase() {
482        let line = "chr1\t.\tCDS\t1000\t2000\t.\t+\t3\tID=cds1";
483        let result = GffReader::parse_record(line);
484        assert!(result.is_err());
485    }
486
487    #[test]
488    fn test_strand_parsing() -> Result<()> {
489        let forward = "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=g1";
490        let reverse = "chr1\t.\tgene\t1000\t2000\t.\t-\t.\tID=g2";
491        let unknown = "chr1\t.\tgene\t1000\t2000\t.\t.\t.\tID=g3";
492
493        assert_eq!(GffReader::parse_record(forward)?.strand, Strand::Forward);
494        assert_eq!(GffReader::parse_record(reverse)?.strand, Strand::Reverse);
495        assert_eq!(GffReader::parse_record(unknown)?.strand, Strand::Unknown);
496
497        Ok(())
498    }
499}