Skip to main content

gen_annotations/
gff.rs

1use std::{collections::HashMap, fs::File, io, io::BufReader};
2
3use gen_models::{
4    block_group::BlockGroup,
5    db::GraphConnection,
6    path::{Annotation, Path},
7    sample::Sample,
8};
9use noodles::{core::Position, gff};
10
11pub fn gff_attribute_value_to_string(
12    attrs: &gff::feature::record_buf::Attributes,
13    key: &str,
14) -> Option<String> {
15    let key_bytes = key.as_bytes();
16    attrs.as_ref().iter().find_map(|(tag, value)| {
17        let tag_bytes: &[u8] = tag.as_ref();
18        if !tag_bytes.eq_ignore_ascii_case(key_bytes) {
19            return None;
20        }
21        if let Some(value) = value.as_string() {
22            Some(String::from_utf8_lossy(value.as_ref()).to_string())
23        } else {
24            value
25                .iter()
26                .next()
27                .map(|item| String::from_utf8_lossy(item.as_ref()).to_string())
28        }
29    })
30}
31
32pub fn propagate_gff(
33    conn: &GraphConnection,
34    collection_name: &str,
35    from_sample_name: &str,
36    to_sample_name: &str,
37    gff_input_filename: &str,
38    gff_output_filename: &str,
39) -> io::Result<()> {
40    let mut reader = File::open(gff_input_filename)
41        .map(BufReader::new)
42        .map(gff::io::Reader::new)?;
43
44    let output_file = File::create(gff_output_filename).unwrap();
45    let mut writer = gff::io::Writer::new(output_file);
46
47    let source_block_groups = Sample::get_block_groups(conn, collection_name, from_sample_name);
48    let target_block_groups = Sample::get_block_groups(conn, collection_name, to_sample_name);
49    let source_paths_by_bg_name = source_block_groups
50        .iter()
51        .map(|bg| (bg.name.clone(), BlockGroup::get_current_path(conn, &bg.id)))
52        .collect::<HashMap<String, Path>>();
53    let target_paths_by_bg_name = target_block_groups
54        .iter()
55        .map(|bg| (bg.name.clone(), BlockGroup::get_current_path(conn, &bg.id)))
56        .collect::<HashMap<String, Path>>();
57
58    let mut path_mappings_by_bg_name = HashMap::new();
59    for (name, target_path) in &target_paths_by_bg_name {
60        let source_path = source_paths_by_bg_name.get(name).unwrap();
61        let mapping = source_path.get_mapping_tree(conn, target_path);
62        path_mappings_by_bg_name.insert(name, mapping);
63    }
64
65    let sequence_lengths_by_path_name = target_paths_by_bg_name
66        .iter()
67        .map(|(name, path)| (name.clone(), path.sequence(conn).len() as i64))
68        .collect::<HashMap<String, i64>>();
69
70    for result in reader.record_bufs() {
71        let record = result?;
72        let path_name = record.reference_sequence_name().to_string();
73        let annotation = Annotation {
74            name: "".to_string(),
75            start: record.start().get() as i64,
76            end: record.end().get() as i64,
77        };
78        let mapping_tree = path_mappings_by_bg_name.get(&path_name).unwrap();
79        let sequence_length = sequence_lengths_by_path_name.get(&path_name).unwrap();
80        let propagated_annotation =
81            Path::propagate_annotation(annotation, mapping_tree, *sequence_length).unwrap();
82
83        let score = record.score();
84        let phase = record.phase();
85        let mut updated_record_builder = gff::feature::RecordBuf::builder()
86            .set_reference_sequence_name(path_name)
87            .set_source(record.source().to_string())
88            .set_type(record.ty().to_string())
89            .set_start(
90                Position::new(propagated_annotation.start.try_into().unwrap())
91                    .expect("Could not convert start ({start}) to usize for propagation"),
92            )
93            .set_end(
94                Position::new(propagated_annotation.end.try_into().unwrap())
95                    .expect("Could not convert end ({end}) to usize for propagation"),
96            )
97            .set_strand(record.strand())
98            .set_attributes(record.attributes().clone());
99
100        if let Some(score) = score {
101            updated_record_builder = updated_record_builder.set_score(score);
102        }
103        if let Some(phase) = phase {
104            updated_record_builder = updated_record_builder.set_phase(phase);
105        }
106
107        writer.write_record(&updated_record_builder.build())?;
108    }
109
110    Ok(())
111}
112
113#[cfg(test)]
114mod tests {
115    use std::{fs::File, io::BufReader, path::PathBuf};
116
117    use gen_core::{
118        HashId, NO_CHROMOSOME_INDEX, PATH_END_NODE_ID, PATH_START_NODE_ID, PathBlock, Strand,
119    };
120    use gen_models::{
121        block_group::{BlockGroup, NewBlockGroup, PathChange},
122        block_group_edge::{BlockGroupEdge, BlockGroupEdgeData},
123        collection::Collection,
124        db::GraphConnection,
125        edge::Edge,
126        node::Node,
127        path::Path,
128        sample::Sample,
129        sequence::Sequence,
130        traits::Query,
131    };
132    use noodles::gff;
133    use tempfile::tempdir;
134
135    use super::propagate_gff;
136    use crate::test_helpers::get_connection;
137
138    fn create_block_group(conn: &GraphConnection) {
139        let collection = Collection::create(conn, "test");
140        Sample::get_or_create(conn, Sample::DEFAULT_NAME);
141        let sequence = "ATCGATCGATCGATCGATCGGGAACACACAGAGA";
142        let reference_sequence = Sequence::new()
143            .sequence_type("DNA")
144            .sequence(sequence)
145            .save(conn);
146        let node_id = Node::create(
147            conn,
148            &reference_sequence.hash,
149            &HashId::convert_str(&format!(
150                "{collection}.m123:{hash}",
151                collection = collection.name,
152                hash = reference_sequence.hash
153            )),
154        );
155        let block_group = BlockGroup::create(
156            conn,
157            NewBlockGroup {
158                collection_name: &collection.name,
159                sample_name: Sample::DEFAULT_NAME,
160                name: "m123",
161                parent_block_group_id: None,
162                is_default: false,
163            },
164        );
165
166        let edge_into = Edge::create(
167            conn,
168            PATH_START_NODE_ID,
169            0,
170            Strand::Forward,
171            node_id,
172            0,
173            Strand::Forward,
174        );
175        let edge_out_of = Edge::create(
176            conn,
177            node_id,
178            reference_sequence.length,
179            Strand::Forward,
180            PATH_END_NODE_ID,
181            0,
182            Strand::Forward,
183        );
184
185        let new_block_group_edges = vec![
186            BlockGroupEdgeData {
187                block_group_id: block_group.id,
188                edge_id: edge_into.id,
189                chromosome_index: 0,
190                phased: 0,
191            },
192            BlockGroupEdgeData {
193                block_group_id: block_group.id,
194                edge_id: edge_out_of.id,
195                chromosome_index: 0,
196                phased: 0,
197            },
198        ];
199
200        BlockGroupEdge::bulk_create(conn, &new_block_group_edges);
201        Path::create(
202            conn,
203            "m123",
204            &block_group.id,
205            &[edge_into.id, edge_out_of.id],
206        );
207    }
208
209    fn apply_child_sample_update_from_aa_fasta(conn: &GraphConnection) {
210        Sample::get_or_create(conn, "child sample");
211        let _ = Sample::get_or_create_child(
212            conn,
213            "test",
214            "child sample",
215            vec![Sample::DEFAULT_NAME.to_string()],
216        );
217
218        let sample_bg_id = BlockGroup::get_or_create_sample_block_groups(
219            conn,
220            "test",
221            "child sample",
222            "m123",
223            vec![Sample::DEFAULT_NAME.to_string()],
224        )
225        .expect("should create child block group")[0]
226            .id;
227        let sample_path = BlockGroup::get_current_path(conn, &sample_bg_id);
228        let tree = sample_path.intervaltree(conn);
229        let replacement_sequence = "AA";
230
231        let replacement = Sequence::new()
232            .sequence_type("DNA")
233            .sequence(replacement_sequence)
234            .save(conn);
235        let node_id = Node::create(
236            conn,
237            &replacement.hash,
238            &HashId::convert_str(&format!(
239                "{path_id}:15-25->{sequence_hash}",
240                path_id = sample_path.id,
241                sequence_hash = replacement.hash,
242            )),
243        );
244        let change = PathChange {
245            block_group_id: sample_bg_id,
246            path: sample_path.clone(),
247            path_accession: None,
248            start: 15,
249            end: 25,
250            block: PathBlock {
251                node_id,
252                block_sequence: replacement_sequence.to_string(),
253                sequence_start: 0,
254                sequence_end: replacement_sequence.len() as i64,
255                path_start: 15,
256                path_end: 25,
257                strand: Strand::Forward,
258            },
259            chromosome_index: NO_CHROMOSOME_INDEX,
260            phased: 0,
261            preserve_edge: true,
262        };
263
264        BlockGroup::insert_change(conn, &change, &tree)
265            .expect("should apply AA update to child sample");
266
267        let edge_to_insert = Edge::query(
268            conn,
269            "select * from edges where target_node_id = ?1",
270            rusqlite::params![node_id],
271        )[0]
272        .clone();
273        let edge_from_insert = Edge::query(
274            conn,
275            "select * from edges where source_node_id = ?1",
276            rusqlite::params![node_id],
277        )[0]
278        .clone();
279        sample_path.new_path_with(conn, 15, 25, &edge_to_insert, &edge_from_insert);
280    }
281
282    #[test]
283    fn simple_propagate() {
284        let conn = get_connection();
285        create_block_group(&conn);
286        apply_child_sample_update_from_aa_fasta(&conn);
287
288        let gff_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/simple.gff");
289        let temp_dir = tempdir().expect("should create temp directory");
290        let output_path = temp_dir.path().join("output.gff");
291
292        propagate_gff(
293            &conn,
294            "test",
295            Sample::DEFAULT_NAME,
296            "child sample",
297            gff_path.to_str().expect("should convert gff path to UTF-8"),
298            output_path
299                .to_str()
300                .expect("should convert output path to UTF-8"),
301        )
302        .expect("should propagate gff to child sample");
303
304        let mut reader = File::open(output_path)
305            .map(BufReader::new)
306            .map(gff::io::Reader::new)
307            .expect("should read output file");
308
309        for (index, result) in reader.record_bufs().enumerate() {
310            let record = result.expect("should parse output gff record");
311            assert_eq!(record.reference_sequence_name(), "m123");
312            if index == 0 {
313                assert_eq!(record.source(), "gen-test");
314                assert_eq!(record.ty(), "Region");
315                assert_eq!(record.start().get(), 1);
316                assert_eq!(record.end().get(), 26);
317            } else {
318                assert_eq!(record.source(), "gen-test");
319                assert_eq!(record.ty(), "Gene");
320                assert_eq!(record.start().get(), 5);
321                assert_eq!(record.end().get(), 15);
322            }
323        }
324    }
325}