genomicframe_core/formats/gff/
reader.rs1use crate::core::{GenomicInterval, GenomicReader, GenomicRecordIterator, Strand};
46use crate::error::{Error, Result};
47use crate::io::Compression;
48use flate2::read::MultiGzDecoder;
50use std::collections::HashMap;
51use std::fs::File;
52use std::io::{BufRead, BufReader};
53use std::path::Path;
54
55#[derive(Debug, Clone)]
57pub struct GffRecord {
58 pub seqid: String,
60 pub source: String,
62 pub feature_type: String,
64 pub start: u64,
66 pub end: u64,
68 pub score: Option<f64>,
70 pub strand: Strand,
72 pub phase: Option<u8>,
74 pub attributes: String,
76}
77
78impl GffRecord {
79 pub fn to_interval(&self) -> Result<GenomicInterval> {
81 GenomicInterval::new(self.seqid.clone(), self.start - 1, self.end)
82 }
83
84 pub fn parse_attributes(&self) -> HashMap<String, String> {
88 let mut attrs = HashMap::new();
89
90 if self.attributes.contains('=') {
92 for pair in self.attributes.split(';') {
94 let pair = pair.trim();
95 if pair.is_empty() {
96 continue;
97 }
98 if let Some((key, value)) = pair.split_once('=') {
99 attrs.insert(key.trim().to_string(), value.trim().to_string());
100 }
101 }
102 } else {
103 for pair in self.attributes.split(';') {
105 let pair = pair.trim();
106 if pair.is_empty() {
107 continue;
108 }
109 let parts: Vec<&str> = pair.splitn(2, ' ').collect();
110 if parts.len() == 2 {
111 let key = parts[0].trim();
112 let value = parts[1].trim().trim_matches('"');
113 attrs.insert(key.to_string(), value.to_string());
114 }
115 }
116 }
117
118 attrs
119 }
120
121 pub fn get_attribute(&self, key: &str) -> Option<String> {
123 self.parse_attributes().get(key).cloned()
124 }
125
126 pub fn len(&self) -> u64 {
128 self.end.saturating_sub(self.start) + 1
129 }
130
131 pub fn is_empty(&self) -> bool {
133 self.start > self.end
134 }
135}
136
137#[derive(Debug, Clone, Default)]
142pub struct GffHeader {
143 pub version: Option<String>,
145 pub sequence_regions: Vec<String>,
147 pub directives: Vec<String>,
149}
150
151pub struct GffReader {
157 reader: Box<dyn BufRead>,
158 header: GffHeader,
159 line_buffer: String,
160}
161
162impl GffReader {
163 pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
189 let path = path.as_ref();
190 let file = File::open(path)?;
191 let compression = Compression::from_path(path);
192
193 let reader: Box<dyn BufRead> = match compression {
194 Compression::Gzip | Compression::Bgzip => {
195 Box::new(BufReader::new(MultiGzDecoder::new(file)))
196 }
197 _ => Box::new(BufReader::new(file)),
198 };
199
200 Self::parse_header(reader)
201 }
202
203 pub fn header(&self) -> &GffHeader {
205 &self.header
206 }
207
208 fn parse_header(mut reader: Box<dyn BufRead>) -> Result<Self> {
210 let mut header = GffHeader::default();
211 let mut line = String::new();
212 let first_data_line: Option<String> = loop {
213 line.clear();
214 let bytes_read = reader.read_line(&mut line)?;
215
216 if bytes_read == 0 {
218 break None;
219 }
220
221 let trimmed = line.trim();
222
223 if trimmed.is_empty() {
225 continue;
226 }
227
228 if !trimmed.starts_with('#') {
230 break Some(trimmed.to_string());
232 }
233
234 if trimmed.starts_with("##") {
236 if trimmed.starts_with("##gff-version") {
237 header.version = trimmed
238 .trim_start_matches("##gff-version")
239 .trim()
240 .split_whitespace()
241 .next()
242 .map(|s| s.to_string());
243 } else if trimmed.starts_with("##sequence-region") {
244 header.sequence_regions.push(trimmed.to_string());
245 } else {
246 header.directives.push(trimmed.to_string());
247 }
248 }
249 };
251
252 let line_buffer = first_data_line.unwrap_or_else(|| String::with_capacity(512));
254
255 Ok(Self {
256 reader,
257 header,
258 line_buffer,
259 })
260 }
261
262 fn parse_record(line: &str) -> Result<GffRecord> {
264 let parts: Vec<&str> = line.split('\t').collect();
265
266 if parts.len() != 9 {
267 return Err(Error::Parse(format!(
268 "Invalid GFF record: expected 9 fields, got {}",
269 parts.len()
270 )));
271 }
272
273 let start = parts[3]
275 .parse::<u64>()
276 .map_err(|e| Error::Parse(format!("Invalid start position '{}': {}", parts[3], e)))?;
277
278 let end = parts[4]
279 .parse::<u64>()
280 .map_err(|e| Error::Parse(format!("Invalid end position '{}': {}", parts[4], e)))?;
281
282 if start > end {
284 return Err(Error::Parse(format!(
285 "Invalid GFF record: start ({}) > end ({})",
286 start, end
287 )));
288 }
289
290 let score = if parts[5] == "." {
292 None
293 } else {
294 Some(
295 parts[5]
296 .parse::<f64>()
297 .map_err(|e| Error::Parse(format!("Invalid score '{}': {}", parts[5], e)))?,
298 )
299 };
300
301 let strand = match parts[6] {
303 "+" => Strand::Forward,
304 "-" => Strand::Reverse,
305 "." => Strand::Unknown,
306 other => {
307 return Err(Error::Parse(format!(
308 "Invalid strand '{}': must be +, -, or .",
309 other
310 )))
311 }
312 };
313
314 let phase = if parts[7] == "." {
316 None
317 } else {
318 let p = parts[7]
319 .parse::<u8>()
320 .map_err(|e| Error::Parse(format!("Invalid phase '{}': {}", parts[7], e)))?;
321 if p > 2 {
322 return Err(Error::Parse(format!(
323 "Invalid phase '{}': must be 0, 1, 2, or .",
324 p
325 )));
326 }
327 Some(p)
328 };
329
330 Ok(GffRecord {
331 seqid: parts[0].to_string(),
332 source: parts[1].to_string(),
333 feature_type: parts[2].to_string(),
334 start,
335 end,
336 score,
337 strand,
338 phase,
339 attributes: parts[8].to_string(),
340 })
341 }
342}
343
344impl GenomicRecordIterator for GffReader {
345 type Record = GffRecord;
346
347 fn next_raw(&mut self) -> Result<Option<Vec<u8>>> {
348 Ok(None)
350 }
351
352 fn next_record(&mut self) -> Result<Option<Self::Record>> {
353 loop {
354 if !self.line_buffer.is_empty() {
356 let line = self.line_buffer.clone();
357 self.line_buffer.clear();
358
359 let trimmed = line.trim();
360 if !trimmed.is_empty() && !trimmed.starts_with('#') {
361 return Ok(Some(Self::parse_record(trimmed)?));
362 }
363 }
365
366 let bytes_read = self.reader.read_line(&mut self.line_buffer)?;
368
369 if bytes_read == 0 {
371 return Ok(None);
372 }
373
374 let line = self.line_buffer.trim();
375
376 if line.is_empty() || line.starts_with('#') {
378 self.line_buffer.clear();
379 continue;
380 }
381
382 let result = Self::parse_record(line)?;
383 self.line_buffer.clear();
384 return Ok(Some(result));
385 }
386 }
387}
388
389impl GenomicReader for GffReader {
390 type Metadata = GffHeader;
391
392 fn metadata(&self) -> &Self::Metadata {
393 &self.header
394 }
395}
396
397#[cfg(test)]
398mod tests {
399 use super::*;
400 use std::io::Write;
401 use tempfile::NamedTempFile;
402
403 #[test]
404 fn test_gff_header_parsing() -> Result<()> {
405 let gff_data = "##gff-version 3\n\
406 ##sequence-region chr1 1 248956422\n\
407 chr1\tENSEMBL\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=BRCA1\n";
408
409 let mut temp_file = NamedTempFile::new()?;
410 temp_file.write_all(gff_data.as_bytes())?;
411 temp_file.flush()?;
412
413 let mut reader = GffReader::from_path(temp_file.path())?;
414
415 assert_eq!(reader.header().version, Some("3".to_string()));
416 assert_eq!(reader.header().sequence_regions.len(), 1);
417
418 let record = reader.next_record()?.unwrap();
419 assert_eq!(record.seqid, "chr1");
420 assert_eq!(record.source, "ENSEMBL");
421
422 Ok(())
423 }
424
425 #[test]
426 fn test_gff_record_parsing() -> Result<()> {
427 let line = "chr1\tENSEMBL\tgene\t1000\t2000\t100.5\t+\t.\tID=gene1;Name=BRCA1";
428 let record = GffReader::parse_record(line)?;
429
430 assert_eq!(record.seqid, "chr1");
431 assert_eq!(record.source, "ENSEMBL");
432 assert_eq!(record.feature_type, "gene");
433 assert_eq!(record.start, 1000);
434 assert_eq!(record.end, 2000);
435 assert_eq!(record.score, Some(100.5));
436 assert_eq!(record.strand, Strand::Forward);
437 assert_eq!(record.phase, None);
438 assert_eq!(record.len(), 1001);
439
440 Ok(())
441 }
442
443 #[test]
444 fn test_gff3_attributes() -> Result<()> {
445 let line = "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=gene1;Name=BRCA1;Dbxref=GeneID:672";
446 let record = GffReader::parse_record(line)?;
447
448 let attrs = record.parse_attributes();
449 assert_eq!(attrs.get("ID"), Some(&"gene1".to_string()));
450 assert_eq!(attrs.get("Name"), Some(&"BRCA1".to_string()));
451 assert_eq!(attrs.get("Dbxref"), Some(&"GeneID:672".to_string()));
452
453 assert_eq!(record.get_attribute("ID"), Some("gene1".to_string()));
454
455 Ok(())
456 }
457
458 #[test]
459 fn test_gtf_attributes() -> Result<()> {
460 let line = "chr1\t.\texon\t1000\t2000\t.\t+\t.\tgene_id \"ENSG00000000001\"; transcript_id \"ENST00000000001\";";
461 let record = GffReader::parse_record(line)?;
462
463 let attrs = record.parse_attributes();
464 assert_eq!(attrs.get("gene_id"), Some(&"ENSG00000000001".to_string()));
465 assert_eq!(
466 attrs.get("transcript_id"),
467 Some(&"ENST00000000001".to_string())
468 );
469
470 Ok(())
471 }
472
473 #[test]
474 fn test_invalid_positions() {
475 let line = "chr1\t.\tgene\t2000\t1000\t.\t+\t.\tID=gene1";
476 let result = GffReader::parse_record(line);
477 assert!(result.is_err());
478 }
479
480 #[test]
481 fn test_invalid_phase() {
482 let line = "chr1\t.\tCDS\t1000\t2000\t.\t+\t3\tID=cds1";
483 let result = GffReader::parse_record(line);
484 assert!(result.is_err());
485 }
486
487 #[test]
488 fn test_strand_parsing() -> Result<()> {
489 let forward = "chr1\t.\tgene\t1000\t2000\t.\t+\t.\tID=g1";
490 let reverse = "chr1\t.\tgene\t1000\t2000\t.\t-\t.\tID=g2";
491 let unknown = "chr1\t.\tgene\t1000\t2000\t.\t.\t.\tID=g3";
492
493 assert_eq!(GffReader::parse_record(forward)?.strand, Strand::Forward);
494 assert_eq!(GffReader::parse_record(reverse)?.strand, Strand::Reverse);
495 assert_eq!(GffReader::parse_record(unknown)?.strand, Strand::Unknown);
496
497 Ok(())
498 }
499}