gen_annotations/translate/
gff.rs1use std::{
2 cmp::{max, min},
3 collections::HashMap,
4 io::{BufRead, Read, Write},
5};
6
7use gen_core::{HashId, Strand, is_terminal};
8use gen_graph::{GraphNode, project_path};
9use gen_models::{
10 block_group::BlockGroup,
11 db::GraphConnection,
12 reference_alias::{ReferenceAlias, ReferenceAliasError},
13 sample::Sample,
14};
15use interavl::IntervalTree;
16use noodles::{core::Position, gff};
17use thiserror::Error;
18
19#[derive(Debug, Error)]
20pub enum GffError {
21 #[error("Couldn't translate GFF entry: {0}")]
22 Io(#[from] std::io::Error),
23 #[error("Error loading reference aliases: {0}")]
24 ReferenceAliasError(#[from] ReferenceAliasError),
25}
26
27pub fn translate_gff<R, W>(
28 conn: &GraphConnection,
29 collection: &str,
30 sample: &str,
31 reader: R,
32 writer: &mut W,
33) -> Result<(), GffError>
34where
35 R: Read + BufRead,
36 W: Write,
37{
38 let mut gff_reader = gff::io::Reader::new(reader);
39 let mut gff_writer = gff::io::Writer::new(writer);
40
41 let bgs = Sample::get_block_groups(conn, collection, sample);
42 let sample_bgs: HashMap<String, &BlockGroup> = HashMap::from_iter(
43 bgs.iter()
44 .map(|bg| (bg.name.clone(), bg))
45 .collect::<Vec<(String, &BlockGroup)>>(),
46 );
47
48 let references = sample_bgs.keys().cloned().collect::<Vec<String>>();
50 let references_by_alias = ReferenceAlias::get_references_by_alias(conn, references)?;
51
52 let mut paths: HashMap<HashId, IntervalTree<i64, (GraphNode, Strand)>> = HashMap::new();
53
54 for result in gff_reader.record_bufs() {
55 let record = result?;
56 let ref_name = record.reference_sequence_name().to_string();
57 let ref_name = references_by_alias
58 .get(&ref_name)
59 .unwrap_or(&ref_name)
60 .to_string();
61 let start = record.start().get() as i64;
62 let end = record.end().get() as i64;
63 if let Some(bg) = sample_bgs.get(&ref_name) {
64 let projection = paths.entry(bg.id).or_insert_with(|| {
65 let path = BlockGroup::get_current_path(conn, &bg.id);
66 let graph = BlockGroup::get_graph(conn, &bg.id);
67 let mut tree = IntervalTree::default();
68 let mut position: i64 = 0;
69 for (node, strand) in project_path(&graph, &path.blocks(conn)) {
70 if !is_terminal(node.node_id) {
71 let end_position = position + node.length();
78 tree.insert(position + 1..end_position, (node, strand));
79 position = end_position;
80 }
81 }
82 tree
83 });
84 let range = start..end;
85 for (overlap, (node, _overlap_strand)) in projection.iter_overlaps(&range) {
86 let overlap_start = max(start, overlap.start) as usize;
87 let overlap_end = min(end, overlap.end) as usize;
88
89 let mut updated_record_builder =
90 gff::feature::RecordBuf::builder()
91 .set_reference_sequence_name(format!("{nid}", nid = node.node_id))
92 .set_source(record.source().to_string())
93 .set_type(record.ty().to_string())
94 .set_start(Position::try_from(overlap_start).expect(
95 "Could not convert start ({overlap_start}) to usize for propagation",
96 ))
97 .set_end(Position::try_from(overlap_end).expect(
98 "Could not convert end ({overlap_end}) to usize for propagation",
99 ))
100 .set_strand(record.strand())
101 .set_attributes(record.attributes().clone());
102 if let Some(phase) = record.phase() {
103 updated_record_builder = updated_record_builder.set_phase(phase);
104 }
105 gff_writer.write_record(&updated_record_builder.build())?;
106 }
107 }
108 }
109 Ok(())
110}
111
112#[cfg(test)]
113mod tests {
114 use std::{fs::File, io::BufReader, path::PathBuf};
115
116 use gen_models::{reference_alias::ReferenceAlias, sample::Sample};
117
118 use super::translate_gff;
119 use crate::test_helpers::{get_connection, setup_test_data};
120
121 #[test]
122 fn translates_coordinates_to_nodes() {
123 let conn = get_connection();
124 setup_test_data(&conn);
125
126 let gff_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("./fixtures/complex.gff3");
127 let collection = "test".to_string();
128
129 let mut buffer = Vec::new();
130 translate_gff(
131 &conn,
132 &collection,
133 "foo",
134 BufReader::new(File::open(gff_path.clone()).expect("should open fixture gff")),
135 &mut buffer,
136 )
137 .expect("should translate gff for sample foo");
138 let results = String::from_utf8(buffer).expect("translated output should be valid UTF-8");
139 assert_eq!(
140 results,
141 concat!(
142 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\tgene\t1\t3\t.\t-\t.\tID=ENSG00000294541.1\n",
143 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\tgene\t4\t4\t.\t-\t.\tID=ENSG00000294541.1\n",
144 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\tgene\t5\t17\t.\t-\t.\tID=ENSG00000294541.1\n",
145 "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\tgene\t18\t20\t.\t-\t.\tID=ENSG00000294541.1\n",
146 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\ttranscript\t1\t3\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
147 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\ttranscript\t4\t4\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
148 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\ttranscript\t5\t17\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
149 "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\ttranscript\t18\t20\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
150 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t5\t8\t.\t-\t.\tID=exon:ENST00000724296.1:1;Parent=ENST00000724296.1\n",
151 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t10\t14\t.\t-\t.\tID=exon:ENST00000724296.1:2;Parent=ENST00000724296.1\n",
152 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t16\t17\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
153 "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\texon\t18\t19\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
154 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\tgene\t4\t4\t.\t-\t.\tID=ENSG00000277248.1\n",
155 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\tgene\t5\t15\t.\t-\t.\tID=ENSG00000277248.1\n",
156 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\ttranscript\t4\t4\t.\t-\t.\tID=ENST00000615943.1;Parent=ENSG00000277248.1\n",
157 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\ttranscript\t5\t15\t.\t-\t.\tID=ENST00000615943.1;Parent=ENSG00000277248.1\n",
158 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\texon\t4\t4\t.\t-\t.\tID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1\n",
159 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\texon\t5\t15\t.\t-\t.\tID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1\n",
160 )
161 );
162
163 let mut buffer = Vec::new();
164 translate_gff(
165 &conn,
166 &collection,
167 Sample::DEFAULT_NAME,
168 BufReader::new(File::open(gff_path).expect("should open fixture gff")),
169 &mut buffer,
170 )
171 .expect("should translate gff for reference sample");
172 let results = String::from_utf8(buffer).expect("translated output should be valid UTF-8");
173 assert_eq!(
174 results,
175 concat!(
176 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\tgene\t1\t17\t.\t-\t.\tID=ENSG00000294541.1\n",
177 "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\tgene\t18\t20\t.\t-\t.\tID=ENSG00000294541.1\n",
178 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\ttranscript\t1\t17\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
179 "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\ttranscript\t18\t20\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
180 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t4\t8\t.\t-\t.\tID=exon:ENST00000724296.1:1;Parent=ENST00000724296.1\n",
181 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t10\t14\t.\t-\t.\tID=exon:ENST00000724296.1:2;Parent=ENST00000724296.1\n",
182 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t16\t17\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
183 "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\texon\t18\t19\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
184 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\tgene\t3\t15\t.\t-\t.\tID=ENSG00000277248.1\n",
185 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\ttranscript\t3\t15\t.\t-\t.\tID=ENST00000615943.1;Parent=ENSG00000277248.1\n",
186 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\texon\t3\t15\t.\t-\t.\tID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1\n",
187 )
188 );
189 }
190
191 #[test]
192 fn translates_gff_using_reference_aliases() {
193 let conn = get_connection();
194 setup_test_data(&conn);
195
196 ReferenceAlias::create(
198 &conn,
199 "alternate contig",
200 Some("m456".to_string()),
201 Some("m123".to_string()),
202 Some("m789".to_string()),
203 Some("chr1".to_string()),
204 None,
205 None,
206 )
207 .expect("should create reference alias");
208
209 let gff_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
210 .join("./fixtures/complex-with-reference-alias.gff3");
211 let collection = "test".to_string();
212
213 let mut buffer = Vec::new();
214 translate_gff(
215 &conn,
216 &collection,
217 Sample::DEFAULT_NAME,
218 BufReader::new(File::open(gff_path.clone()).expect("should open fixture gff")),
219 &mut buffer,
220 )
221 .expect("should translate gff for default sample");
222 let results = String::from_utf8(buffer).expect("translated output should be valid UTF-8");
223 assert_eq!(
224 results,
225 concat!(
226 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\tgene\t1\t17\t.\t-\t.\tID=ENSG00000294541.1\n",
227 "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\tgene\t18\t20\t.\t-\t.\tID=ENSG00000294541.1\n",
228 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\ttranscript\t1\t17\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
229 "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\ttranscript\t18\t20\t.\t-\t.\tID=ENST00000724296.1;Parent=ENSG00000294541.1\n",
230 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t4\t8\t.\t-\t.\tID=exon:ENST00000724296.1:1;Parent=ENST00000724296.1\n",
231 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t10\t14\t.\t-\t.\tID=exon:ENST00000724296.1:2;Parent=ENST00000724296.1\n",
232 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tHAVANA\texon\t16\t17\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
233 "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\tHAVANA\texon\t18\t19\t.\t-\t.\tID=exon:ENST00000724296.1:3;Parent=ENST00000724296.1\n",
234 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\tgene\t3\t15\t.\t-\t.\tID=ENSG00000277248.1\n",
235 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\ttranscript\t3\t15\t.\t-\t.\tID=ENST00000615943.1;Parent=ENSG00000277248.1\n",
236 "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\tENSEMBL\texon\t3\t15\t.\t-\t.\tID=exon:ENST00000615943.1:1;Parent=ENST00000615943.1\n",
237 )
238 );
239 }
240}