1use anyhow::Context;
28use itertools::Itertools;
29use multimap::MultiMap;
30use regex::Regex;
31use std::convert::{AsRef, TryInto};
32use std::fs;
33use std::io;
34use std::path::Path;
35use std::str::FromStr;
36
37use bio_types::strand::Strand;
38use serde::{Deserialize, Deserializer, Serialize};
39
40#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug, Serialize, Deserialize)]
46pub enum GffType {
47 GFF3,
49 GFF2,
51 GTF2,
53 Any(u8, u8, u8),
57}
58
59impl FromStr for GffType {
60 type Err = String;
61
62 fn from_str(src_str: &str) -> Result<Self, Self::Err> {
68 match src_str {
69 "gff3" => Ok(GffType::GFF3),
70 "gff2" => Ok(GffType::GFF2),
71 "gtf2" => Ok(GffType::GTF2),
72 _ => Err(format!(
73 "String '{}' is not a valid GFFType (GFF/GTF format version).",
74 src_str
75 )),
76 }
77 }
78}
79
80impl GffType {
81 #[inline]
82 fn separator(self) -> (u8, u8, u8) {
86 match self {
87 GffType::GFF3 => (b'=', b';', b','),
88 GffType::GFF2 => (b' ', b';', 0u8),
89 GffType::GTF2 => (b' ', b';', 0u8),
90 GffType::Any(x, y, z) => (x, y, z),
91 }
92 }
93}
94
95#[derive(Debug)]
97pub struct Reader<R: io::Read> {
98 inner: csv::Reader<R>,
99 gff_type: GffType,
100}
101
102impl Reader<fs::File> {
103 pub fn from_file<P: AsRef<Path> + std::fmt::Debug>(
105 path: P,
106 fileformat: GffType,
107 ) -> anyhow::Result<Self> {
108 fs::File::open(&path)
109 .map(|f| Reader::new(f, fileformat))
110 .with_context(|| format!("Failed to read GFF from {:#?}", path))
111 }
112}
113
114impl<R: io::Read> Reader<R> {
115 pub fn new(reader: R, fileformat: GffType) -> Self {
117 Reader {
118 inner: csv::ReaderBuilder::new()
119 .delimiter(b'\t')
120 .has_headers(false)
121 .comment(Some(b'#'))
122 .from_reader(reader),
123 gff_type: fileformat,
124 }
125 }
126
127 pub fn records(&mut self) -> Records<'_, R> {
129 let (delim, term, vdelim) = self.gff_type.separator();
130 let r = format!(
131 r" *(?P<key>[^{delim}{term}\t]+){delim}(?P<value>[^{delim}{term}\t]+){term}?",
132 delim = delim as char,
133 term = term as char
134 );
135 let attribute_re = Regex::new(&r).unwrap();
136 Records {
137 inner: self.inner.deserialize(),
138 attribute_re,
139 value_delim: vdelim as char,
140 }
141 }
142}
143
144type GffRecordInner = (
145 String,
146 String,
147 String,
148 u64,
149 u64,
150 String,
151 String,
152 Phase,
153 String,
154);
155
/// Phase column of a record: either empty (`None`) or one of `0`, `1`, `2`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Phase(Option<u8>);

impl Phase {
    /// Converts `p` to a `u8` and keeps it only if it is below 3.
    fn validate<T: Into<u8>>(p: T) -> Option<u8> {
        let value: u8 = p.into();
        Some(value).filter(|candidate| *candidate < 3)
    }
}
169
170impl From<u8> for Phase {
171 fn from(p: u8) -> Self {
181 Phase(Self::validate(p))
182 }
183}
184
185impl From<Option<u8>> for Phase {
186 fn from(p: Option<u8>) -> Self {
197 Phase(p.and_then(Self::validate))
198 }
199}
200
201impl TryInto<u8> for Phase {
202 type Error = ();
203
204 fn try_into(self) -> Result<u8, Self::Error> {
216 match self.0 {
217 Some(p) => Ok(p),
218 None => Err(()),
219 }
220 }
221}
222
223impl TryInto<Option<u8>> for Phase {
224 type Error = ();
225
226 fn try_into(self) -> Result<Option<u8>, Self::Error> {
242 Ok(self.0)
243 }
244}
245
246impl<'de> Deserialize<'de> for Phase {
247 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
248 where
249 D: Deserializer<'de>,
250 {
251 let s = String::deserialize(deserializer)?;
252 match s.as_str() {
253 "." => Ok(Phase(None)),
254 _ => {
255 let p = u8::from_str(&s)
256 .map_err(|_| serde::de::Error::custom("Phase must be \".\", 0, 1, or 2"))?;
257 Ok(Phase(Self::validate(p)))
258 }
259 }
260 }
261}
262
263impl Serialize for Phase {
264 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
265 where
266 S: serde::Serializer,
267 {
268 match self.0 {
269 Some(p) => serializer.serialize_u8(p),
270 None => serializer.serialize_str("."),
271 }
272 }
273}
274
275pub struct Records<'a, R: io::Read> {
277 inner: csv::DeserializeRecordsIter<'a, R, GffRecordInner>,
278 attribute_re: Regex,
279 value_delim: char,
280}
281
282impl<'a, R: io::Read> Iterator for Records<'a, R> {
283 type Item = csv::Result<Record>;
284
285 fn next(&mut self) -> Option<csv::Result<Record>> {
286 self.inner.next().map(|res| {
287 res.map(
288 |(
289 seqname,
290 source,
291 feature_type,
292 start,
293 end,
294 score,
295 strand,
296 phase,
297 raw_attributes,
298 )| {
299 let trim_quotes = |s: &str| s.trim_matches('\'').trim_matches('"').to_owned();
300 let mut attributes = MultiMap::new();
301 for caps in self.attribute_re.captures_iter(&raw_attributes) {
302 for value in caps["value"].split(self.value_delim) {
303 attributes.insert(trim_quotes(&caps["key"]), trim_quotes(value));
304 }
305 }
306 Record {
307 seqname,
308 source,
309 feature_type,
310 start,
311 end,
312 score,
313 strand,
314 phase,
315 attributes,
316 }
317 },
318 )
319 })
320 }
321}
322
323#[derive(Debug)]
325pub struct Writer<W: io::Write> {
326 inner: csv::Writer<W>,
327 delimiter: char,
328 terminator: String,
329}
330
331impl Writer<fs::File> {
332 #[allow(clippy::wrong_self_convention)]
334 pub fn to_file<P: AsRef<Path>>(path: P, fileformat: GffType) -> io::Result<Self> {
335 fs::File::create(path).map(|f| Writer::new(f, fileformat))
336 }
337}
338
339impl<W: io::Write> Writer<W> {
340 pub fn new(writer: W, fileformat: GffType) -> Self {
342 let (delim, termi, _) = fileformat.separator();
343
344 Writer {
345 inner: csv::WriterBuilder::new()
346 .delimiter(b'\t')
347 .flexible(true)
348 .from_writer(writer),
349 delimiter: delim as char,
350 terminator: String::from_utf8(vec![termi]).unwrap(),
351 }
352 }
353
354 pub fn write(&mut self, record: &Record) -> csv::Result<()> {
356 let attributes = if !record.attributes.is_empty() {
357 record
358 .attributes
359 .iter()
360 .map(|(a, b)| format!("{}{}{}", a, self.delimiter, b))
361 .join(&self.terminator)
362 } else {
363 "".to_owned()
364 };
365
366 self.inner.serialize((
367 &record.seqname,
368 &record.source,
369 &record.feature_type,
370 record.start,
371 record.end,
372 &record.score,
373 &record.strand,
374 &record.phase,
375 attributes,
376 ))
377 }
378}
379
380#[derive(Default, Clone, Eq, PartialEq, Debug, Serialize, Deserialize)]
382pub struct Record {
383 seqname: String,
384 source: String,
385 feature_type: String,
386 start: u64,
387 end: u64,
388 score: String,
389 strand: String,
390 phase: Phase,
391 attributes: MultiMap<String, String>,
392}
393
394impl Record {
395 pub fn new() -> Self {
397 Record {
398 seqname: "".to_owned(),
399 source: "".to_owned(),
400 feature_type: "".to_owned(),
401 start: 0,
402 end: 0,
403 score: ".".to_owned(),
404 strand: ".".to_owned(),
405 phase: Phase(None),
406 attributes: MultiMap::<String, String>::new(),
407 }
408 }
409
410 pub fn seqname(&self) -> &str {
412 &self.seqname
413 }
414
415 pub fn source(&self) -> &str {
417 &self.source
418 }
419
420 pub fn feature_type(&self) -> &str {
422 &self.feature_type
423 }
424
425 pub fn start(&self) -> &u64 {
427 &self.start
428 }
429
430 pub fn end(&self) -> &u64 {
432 &self.end
433 }
434
435 pub fn score(&self) -> Option<u64> {
437 match self.score.as_ref() {
438 "." => None,
439 _ => self.score.parse::<u64>().ok(),
440 }
441 }
442
443 pub fn strand(&self) -> Option<Strand> {
445 match self.strand.as_ref() {
446 "+" => Some(Strand::Forward),
447 "-" => Some(Strand::Reverse),
448 _ => None,
449 }
450 }
451
452 pub fn phase(&self) -> &Phase {
454 &self.phase
455 }
456
457 pub fn attributes(&self) -> &MultiMap<String, String> {
459 &self.attributes
460 }
461
462 pub fn seqname_mut(&mut self) -> &mut String {
464 &mut self.seqname
465 }
466
467 pub fn source_mut(&mut self) -> &mut String {
469 &mut self.source
470 }
471
472 pub fn feature_type_mut(&mut self) -> &mut String {
474 &mut self.feature_type
475 }
476
477 pub fn start_mut(&mut self) -> &mut u64 {
479 &mut self.start
480 }
481
482 pub fn end_mut(&mut self) -> &mut u64 {
484 &mut self.end
485 }
486
487 pub fn score_mut(&mut self) -> &mut String {
489 &mut self.score
490 }
491
492 pub fn strand_mut(&mut self) -> &mut String {
494 &mut self.strand
495 }
496
497 pub fn phase_mut(&mut self) -> &mut Phase {
499 &mut self.phase
500 }
501
502 pub fn attributes_mut(&mut self) -> &mut MultiMap<String, String> {
504 &mut self.attributes
505 }
506}
507
#[cfg(test)]
mod tests {
    use super::*;
    use bio_types::strand::Strand;
    use multimap::MultiMap;

    const GFF_FILE: &[u8] = b"P0A7B8\tUniProtKB\tInitiator methionine\t1\t1\t.\t.\t.\t\
Note=Removed,Obsolete;ID=test
P0A7B8\tUniProtKB\tChain\t2\t176\t50\t+\t.\tNote=ATP-dependent protease subunit HslV;\
ID=PRO_0000148105";
    const GFF_FILE_WITH_COMMENT: &[u8] = b"#comment
P0A7B8\tUniProtKB\tInitiator methionine\t1\t1\t.\t.\t.\t\
Note=Removed,Obsolete;ID=test
#comment
P0A7B8\tUniProtKB\tChain\t2\t176\t50\t+\t.\tNote=ATP-dependent protease subunit HslV;\
ID=PRO_0000148105";
    // GFF input where every record carries exactly one attribute.
    const GFF_FILE_ONE_ATTRIB: &[u8] =
        b"P0A7B8\tUniProtKB\tInitiator methionine\t1\t1\t.\t.\t.\tNote=Removed
P0A7B8\tUniProtKB\tChain\t2\t176\t50\t+\t.\tID=PRO_0000148105
";

    const GTF_FILE: &[u8] =
        b"P0A7B8\tUniProtKB\tInitiator methionine\t1\t1\t.\t.\t.\tNote Removed;ID test
P0A7B8\tUniProtKB\tChain\t2\t176\t50\t+\t.\tNote ATP-dependent;ID PRO_0000148105
";

    // GTF input with quoted values and a space after each terminator.
    const GTF_FILE_2: &[u8] = b"chr1\tHAVANA\tgene\t11869\t14409\t.\t+\t.\t\
gene_id \"ENSG00000223972.5\"; gene_type \"transcribed_unprocessed_pseudogene\";
chr1\tHAVANA\ttranscript\t11869\t14409\t.\t+\t.\tgene_id \"ENSG00000223972.5\";\
transcript_id \"ENST00000456328.2\"; gene_type \"transcribed_unprocessed_pseudogene\"";

    // GTF input where the `tag` key occurs twice.
    const GTF_FILE_DUP_ATTR_KEYS: &[u8] = b"chr1\tENSEMBL\ttranscript\t182393\t\
184158\t.\t+\t.\tgene_id \"ENSG00000279928.1\"; transcript_id \"ENST00000624431.1\";\
gene_type \"protein_coding\"; gene_status \"KNOWN\"; gene_name \"FO538757.2\";\
transcript_type \"protein_coding\"; transcript_status \"KNOWN\";\
transcript_name \"FO538757.2-201\"; level 3; protein_id \"ENSP00000485457.1\";\
transcript_support_level \"1\"; tag \"basic\"; tag \"appris_principal_1\";";

    // GTF input where every record carries exactly one attribute.
    const GTF_FILE_ONE_ATTRIB: &[u8] =
        b"P0A7B8\tUniProtKB\tInitiator methionine\t1\t1\t.\t.\t.\tNote Removed
P0A7B8\tUniProtKB\tChain\t2\t176\t50\t+\t.\tID PRO_0000148105
";

    /// Values a parsed record is expected to expose through its accessors.
    struct Expected {
        seqname: &'static str,
        source: &'static str,
        feature_type: &'static str,
        start: u64,
        end: u64,
        score: Option<u64>,
        strand: Option<Strand>,
        phase: Phase,
        attributes: MultiMap<String, String>,
    }

    /// Builds an attribute map from `(key, value)` pairs, inserted in order.
    fn attrs(pairs: &[(&str, &str)]) -> MultiMap<String, String> {
        let mut map = MultiMap::new();
        for (key, value) in pairs {
            map.insert((*key).to_owned(), (*value).to_owned());
        }
        map
    }

    /// The two UniProtKB records shared by the GFF3 and GTF2 fixtures; only the
    /// attributes differ between the fixtures.
    fn uniprot_records(
        first: MultiMap<String, String>,
        second: MultiMap<String, String>,
    ) -> [Expected; 2] {
        [
            Expected {
                seqname: "P0A7B8",
                source: "UniProtKB",
                feature_type: "Initiator methionine",
                start: 1,
                end: 1,
                score: None,
                strand: None,
                phase: Phase(None),
                attributes: first,
            },
            Expected {
                seqname: "P0A7B8",
                source: "UniProtKB",
                feature_type: "Chain",
                start: 2,
                end: 176,
                score: Some(50),
                strand: Some(Strand::Forward),
                phase: Phase(None),
                attributes: second,
            },
        ]
    }

    /// Parses `data` as `gff_type` and compares every record with `expected`.
    ///
    /// The record count is checked first so that an input which yields no
    /// records cannot make a test pass vacuously.
    fn assert_reads(data: &[u8], gff_type: GffType, expected: &[Expected]) {
        let mut reader = Reader::new(data, gff_type);
        let records: Vec<Record> = reader
            .records()
            .map(|r| r.expect("Error reading record"))
            .collect();
        assert_eq!(records.len(), expected.len());
        for (record, want) in records.iter().zip(expected) {
            assert_eq!(record.seqname(), want.seqname);
            assert_eq!(record.source(), want.source);
            assert_eq!(record.feature_type(), want.feature_type);
            assert_eq!(*record.start(), want.start);
            assert_eq!(*record.end(), want.end);
            assert_eq!(record.score(), want.score);
            assert_eq!(record.strand(), want.strand);
            assert_eq!(*record.phase(), want.phase);
            assert_eq!(record.attributes(), &want.attributes);
        }
    }

    #[test]
    fn test_reader_gff3() {
        let expected = uniprot_records(
            attrs(&[("ID", "test"), ("Note", "Removed"), ("Note", "Obsolete")]),
            attrs(&[
                ("ID", "PRO_0000148105"),
                ("Note", "ATP-dependent protease subunit HslV"),
            ]),
        );

        assert_reads(GFF_FILE, GffType::GFF3, &expected);
        assert_reads(GFF_FILE_WITH_COMMENT, GffType::GFF3, &expected);
    }

    #[test]
    fn test_reader_from_file_path_doesnt_exist_returns_err() {
        let path = Path::new("/I/dont/exist.gff");
        let error = Reader::from_file(path, GffType::GFF3)
            .unwrap_err()
            .downcast::<String>()
            .unwrap();

        assert_eq!(&error, "Failed to read GFF from \"/I/dont/exist.gff\"")
    }

    #[test]
    fn test_gff_type_from_str() {
        let gff3 = GffType::from_str("gff3").expect("Error parsing");
        assert_eq!(gff3, GffType::GFF3);

        let gff2 = GffType::from_str("gff2").expect("Error parsing");
        assert_eq!(gff2, GffType::GFF2);

        let gtf2 = GffType::from_str("gtf2").expect("Error parsing");
        assert_eq!(gtf2, GffType::GTF2);

        let unk = GffType::from_str("unknown").unwrap_err();
        assert_eq!(
            unk,
            "String 'unknown' is not a valid GFFType (GFF/GTF format version)."
        )
    }

    #[test]
    fn test_reader_gtf2() {
        let expected = uniprot_records(
            attrs(&[("ID", "test"), ("Note", "Removed")]),
            attrs(&[("ID", "PRO_0000148105"), ("Note", "ATP-dependent")]),
        );

        assert_reads(GTF_FILE, GffType::GTF2, &expected);
    }

    #[test]
    fn test_reader_gtf2_2() {
        let expected = [
            Expected {
                seqname: "chr1",
                source: "HAVANA",
                feature_type: "gene",
                start: 11869,
                end: 14409,
                score: None,
                strand: Some(Strand::Forward),
                phase: Phase(None),
                attributes: attrs(&[
                    ("gene_id", "ENSG00000223972.5"),
                    ("gene_type", "transcribed_unprocessed_pseudogene"),
                ]),
            },
            Expected {
                seqname: "chr1",
                source: "HAVANA",
                feature_type: "transcript",
                start: 11869,
                end: 14409,
                score: None,
                strand: Some(Strand::Forward),
                phase: Phase(None),
                attributes: attrs(&[
                    ("gene_id", "ENSG00000223972.5"),
                    ("transcript_id", "ENST00000456328.2"),
                    ("gene_type", "transcribed_unprocessed_pseudogene"),
                ]),
            },
        ];

        assert_reads(GTF_FILE_2, GffType::GTF2, &expected);
    }

    #[test]
    fn test_reader_gtf2_dup_attr_keys() {
        let mut reader = Reader::new(GTF_FILE_DUP_ATTR_KEYS, GffType::GTF2);
        let mut records = reader.records().collect::<Vec<_>>();
        assert_eq!(records.len(), 1);
        let record = records.pop().unwrap().expect("expected one record");
        assert_eq!(record.attributes.get("tag"), Some(&"basic".to_owned()));
        assert_eq!(
            record.attributes.get_vec("tag"),
            Some(&vec!["basic".to_owned(), "appris_principal_1".to_owned()])
        );
    }

    #[test]
    fn test_writer_gff3() {
        let mut reader = Reader::new(GFF_FILE_ONE_ATTRIB, GffType::GFF3);
        let mut writer = Writer::new(vec![], GffType::GFF3);
        for r in reader.records() {
            writer
                .write(&r.expect("Error reading record"))
                .expect("Error writing record");
        }
        assert_eq!(writer.inner.into_inner().unwrap(), GFF_FILE_ONE_ATTRIB)
    }

    #[test]
    fn test_writer_gtf2() {
        let mut reader = Reader::new(GTF_FILE_ONE_ATTRIB, GffType::GTF2);
        let mut writer = Writer::new(vec![], GffType::GTF2);
        for r in reader.records() {
            writer
                .write(&r.expect("Error reading record"))
                .expect("Error writing record");
        }
        assert_eq!(writer.inner.into_inner().unwrap(), GTF_FILE_ONE_ATTRIB)
    }

    #[test]
    fn test_convert_gtf2_to_gff3() {
        let mut reader = Reader::new(GTF_FILE_ONE_ATTRIB, GffType::GTF2);
        let mut writer = Writer::new(vec![], GffType::GFF3);
        for r in reader.records() {
            writer
                .write(&r.expect("Error reading record"))
                .expect("Error writing record");
        }
        assert_eq!(writer.inner.into_inner().unwrap(), GFF_FILE_ONE_ATTRIB)
    }

    #[test]
    fn test_unknown_gff_type() {
        assert_eq!(
            GffType::from_str("xtf9"),
            Err("String 'xtf9' is not a valid GFFType (GFF/GTF format version).".to_string())
        )
    }

    #[test]
    fn test_from_u8_creates_phase_with_value() {
        let phase = Phase::from(1);
        assert_eq!(phase, Phase(Some(1)));
    }

    #[test]
    fn test_try_into_u8_returns_value_for_phase_with_value() {
        let phase = Phase(Some(2));
        let result: Result<u8, ()> = phase.try_into();
        assert_eq!(result, Ok(2));
    }

    #[test]
    fn test_try_into_u8_returns_error_for_phase_with_none() {
        let phase = Phase(None);
        let result: Result<u8, ()> = phase.try_into();
        assert_eq!(result, Err(()));
    }
}