//! Translation of GFF annotations from linear reference coordinates onto
//! graph nodes (gen_annotations/translate/gff.rs).
use std::{
    cmp::{max, min},
    collections::HashMap,
    io::{BufRead, Read, Write},
};

use gen_core::{HashId, Strand, is_terminal};
use gen_graph::{GraphNode, project_path};
use gen_models::{
    block_group::BlockGroup,
    db::GraphConnection,
    reference_alias::{ReferenceAlias, ReferenceAliasError},
    sample::Sample,
};
use interavl::IntervalTree;
use noodles::{core::Position, gff};
use thiserror::Error;
19#[derive(Debug, Error)]
20pub enum GffError {
21    #[error("Couldn't translate GFF entry: {0}")]
22    Io(#[from] std::io::Error),
23    #[error("Error loading reference aliases: {0}")]
24    ReferenceAliasError(#[from] ReferenceAliasError),
25}
26
27pub fn translate_gff<R, W>(
28    conn: &GraphConnection,
29    collection: &str,
30    sample: &str,
31    reader: R,
32    writer: &mut W,
33) -> Result<(), GffError>
34where
35    R: Read + BufRead,
36    W: Write,
37{
38    let mut gff_reader = gff::io::Reader::new(reader);
39    let mut gff_writer = gff::io::Writer::new(writer);
40
41    let bgs = Sample::get_block_groups(conn, collection, sample);
42    let sample_bgs: HashMap<String, &BlockGroup> = HashMap::from_iter(
43        bgs.iter()
44            .map(|bg| (bg.name.clone(), bg))
45            .collect::<Vec<(String, &BlockGroup)>>(),
46    );
47
48    // Load all reference aliases, to accommodate alternate reference names in the GFF file
49    let references = sample_bgs.keys().cloned().collect::<Vec<String>>();
50    let references_by_alias = ReferenceAlias::get_references_by_alias(conn, references)?;
51
52    let mut paths: HashMap<HashId, IntervalTree<i64, (GraphNode, Strand)>> = HashMap::new();
53
54    for result in gff_reader.record_bufs() {
55        let record = result?;
56        let ref_name = record.reference_sequence_name().to_string();
57        let ref_name = references_by_alias
58            .get(&ref_name)
59            .unwrap_or(&ref_name)
60            .to_string();
61        let start = record.start().get() as i64;
62        let end = record.end().get() as i64;
63        if let Some(bg) = sample_bgs.get(&ref_name) {
64            let projection = paths.entry(bg.id).or_insert_with(|| {
65                let path = BlockGroup::get_current_path(conn, &bg.id);
66                let graph = BlockGroup::get_graph(conn, &bg.id);
67                let mut tree = IntervalTree::default();
68                let mut position: i64 = 0;
69                for (node, strand) in project_path(&graph, &path.blocks(conn)) {
70                    if !is_terminal(node.node_id) {
71                        // GFF indexing is one based, inclusive, so we add 1 to the start.
72                        // Take a sequence that is 1-4 in our coordinates, this converts to:
73                        // 0123456
74                        // ATCGATC
75                        // 1234567
76                        // 1-4 in our zero-based half open interval would be 2-4 in GFF coordinates
77                        let end_position = position + node.length();
78                        tree.insert(position + 1..end_position, (node, strand));
79                        position = end_position;
80                    }
81                }
82                tree
83            });
84            let range = start..end;
85            for (overlap, (node, _overlap_strand)) in projection.iter_overlaps(&range) {
86                let overlap_start = max(start, overlap.start) as usize;
87                let overlap_end = min(end, overlap.end) as usize;
88
89                let mut updated_record_builder =
90                    gff::feature::RecordBuf::builder()
91                        .set_reference_sequence_name(format!("{nid}", nid = node.node_id))
92                        .set_source(record.source().to_string())
93                        .set_type(record.ty().to_string())
94                        .set_start(Position::try_from(overlap_start).expect(
95                            "Could not convert start ({overlap_start}) to usize for propagation",
96                        ))
97                        .set_end(Position::try_from(overlap_end).expect(
98                            "Could not convert end ({overlap_end}) to usize for propagation",
99                        ))
100                        .set_strand(record.strand())
101                        .set_attributes(record.attributes().clone());
102                if let Some(phase) = record.phase() {
103                    updated_record_builder = updated_record_builder.set_phase(phase);
104                }
105                gff_writer.write_record(&updated_record_builder.build())?;
106            }
107        }
108    }
109    Ok(())
110}
111
#[cfg(test)]
mod tests {
    use std::{fs::File, io::BufReader, path::PathBuf};

    use gen_models::{reference_alias::ReferenceAlias, sample::Sample};

    use super::translate_gff;
    use crate::test_helpers::{get_connection, setup_test_data};

    #[test]
    fn translates_coordinates_to_nodes() {
        let conn = get_connection();
        setup_test_data(&conn);

        let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("./fixtures/complex.gff3");
        let collection = "test".to_string();

        // Translate for sample "foo" and compare against the expected
        // node-coordinate records.
        let mut output = Vec::new();
        let reader = BufReader::new(File::open(fixture.clone()).expect("should open fixture gff"));
        translate_gff(&conn, &collection, "foo", reader, &mut output)
            .expect("should translate gff for sample foo");
        let translated =
            String::from_utf8(output).expect("translated output should be valid UTF-8");
        let expected = concat!(
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\tgene\t1\t3\t.\t-\t.\tID=ENSG00000294541.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\tgene\t4\t4\t.\t-\t.\tID=ENSG00000294541.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\tgene\t5\t17\t.\t-\t.\tID=ENSG00000294541.1\n",
            "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\tgene\t18\t20\t.\t-\t.\tID=ENSG00000294541.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\ttranscript\t1\t3\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\ttranscript\t4\t4\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\ttranscript\t5\t17\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
            "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\ttranscript\t18\t20\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t5\t8\t.\t-\t.\tID=exon:ENST00000724296.1:1;Parent=ENST00000724296.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t10\t14\t.\t-\t.\tID=exon:ENST00000724296.1:2;Parent=ENST00000724296.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t16\t17\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
            "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\texon\t18\t19\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\tgene\t4\t4\t.\t-\t.\tID=ENSG00000277248.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\tgene\t5\t15\t.\t-\t.\tID=ENSG00000277248.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\ttranscript\t4\t4\t.\t-\t.\tID=ENST00000615943.1;Parent=ENSG00000277248.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\ttranscript\t5\t15\t.\t-\t.\tID=ENST00000615943.1;Parent=ENSG00000277248.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\texon\t4\t4\t.\t-\t.\tID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\texon\t5\t15\t.\t-\t.\tID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1\n",
        );
        assert_eq!(translated, expected);

        // Translate again for the reference sample; intervals that were split
        // by sample-specific edits above remain merged here.
        let mut output = Vec::new();
        let reader = BufReader::new(File::open(fixture).expect("should open fixture gff"));
        translate_gff(&conn, &collection, Sample::DEFAULT_NAME, reader, &mut output)
            .expect("should translate gff for reference sample");
        let translated =
            String::from_utf8(output).expect("translated output should be valid UTF-8");
        let expected = concat!(
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\tgene\t1\t17\t.\t-\t.\tID=ENSG00000294541.1\n",
            "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\tgene\t18\t20\t.\t-\t.\tID=ENSG00000294541.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\ttranscript\t1\t17\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
            "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\ttranscript\t18\t20\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t4\t8\t.\t-\t.\tID=exon:ENST00000724296.1:1;Parent=ENST00000724296.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t10\t14\t.\t-\t.\tID=exon:ENST00000724296.1:2;Parent=ENST00000724296.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t16\t17\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
            "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\texon\t18\t19\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\tgene\t3\t15\t.\t-\t.\tID=ENSG00000277248.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\ttranscript\t3\t15\t.\t-\t.\tID=ENST00000615943.1;Parent=ENSG00000277248.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\texon\t3\t15\t.\t-\t.\tID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1\n",
        );
        assert_eq!(translated, expected);
    }

    #[test]
    fn translates_gff_using_reference_aliases() {
        let conn = get_connection();
        setup_test_data(&conn);

        // Register an alias so the fixture's alternate contig name resolves to
        // the block group's canonical reference name.
        ReferenceAlias::create(
            &conn,
            "alternate contig",
            Some("m456".to_string()),
            Some("m123".to_string()),
            Some("m789".to_string()),
            Some("chr1".to_string()),
            None,
            None,
        )
        .expect("should create reference alias");

        let fixture = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("./fixtures/complex-with-reference-alias.gff3");
        let collection = "test".to_string();

        let mut output = Vec::new();
        let reader = BufReader::new(File::open(fixture.clone()).expect("should open fixture gff"));
        translate_gff(&conn, &collection, Sample::DEFAULT_NAME, reader, &mut output)
            .expect("should translate gff for default sample");
        let translated =
            String::from_utf8(output).expect("translated output should be valid UTF-8");
        let expected = concat!(
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\tgene\t1\t17\t.\t-\t.\tID=ENSG00000294541.1\n",
            "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\tgene\t18\t20\t.\t-\t.\tID=ENSG00000294541.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\ttranscript\t1\t17\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
            "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\ttranscript\t18\t20\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t4\t8\t.\t-\t.\tID=exon:ENST00000724296.1:1;Parent=ENST00000724296.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t10\t14\t.\t-\t.\tID=exon:ENST00000724296.1:2;Parent=ENST00000724296.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t16\t17\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
            "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\texon\t18\t19\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\tgene\t3\t15\t.\t-\t.\tID=ENSG00000277248.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\ttranscript\t3\t15\t.\t-\t.\tID=ENST00000615943.1;Parent=ENSG00000277248.1\n",
            "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\texon\t3\t15\t.\t-\t.\tID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1\n",
        );
        assert_eq!(translated, expected);
    }
}