Skip to main content

gen_annotations/translate/
bed.rs

1use std::{
2    cmp::{max, min},
3    collections::HashMap,
4    io::{Read, Write},
5};
6
7use gen_core::{HashId, Strand, is_terminal};
8use gen_graph::{GraphNode, project_path};
9use gen_models::{
10    block_group::BlockGroup,
11    db::GraphConnection,
12    reference_alias::{ReferenceAlias, ReferenceAliasError},
13    sample::Sample,
14};
15use interavl::IntervalTree;
16use noodles::{
17    bed,
18    bed::feature::record_buf::{OtherFields, other_fields::Value},
19    core::Position,
20};
21use thiserror::Error;
22
23#[derive(Debug, Error)]
24pub enum BedError {
25    #[error("Couldn't translate BED entry: {0}")]
26    Io(#[from] std::io::Error),
27    #[error("Error loading reference aliases: {0}")]
28    ReferenceAliasError(#[from] ReferenceAliasError),
29}
30
31pub fn translate_bed<R, W>(
32    conn: &GraphConnection,
33    collection: &str,
34    sample: &str,
35    reader: R,
36    writer: &mut W,
37) -> Result<(), BedError>
38where
39    R: Read,
40    W: Write,
41{
42    let mut record = bed::Record::default();
43    let mut bed_reader = bed::io::reader::Builder::<3>.build_from_reader(reader);
44    let mut bed_writer = bed::io::Writer::<3, _>::new(writer);
45
46    let bgs = Sample::get_block_groups(conn, collection, sample);
47    let sample_bgs: HashMap<String, &BlockGroup> = HashMap::from_iter(
48        bgs.iter()
49            .map(|bg| (bg.name.clone(), bg))
50            .collect::<Vec<(String, &BlockGroup)>>(),
51    );
52
53    // Load all reference aliases, to accommodate alternate reference names in the GFF file
54    let references = sample_bgs.keys().cloned().collect::<Vec<String>>();
55    let references_by_alias = ReferenceAlias::get_references_by_alias(conn, references)?;
56
57    let mut paths: HashMap<HashId, IntervalTree<i64, (GraphNode, Strand)>> = HashMap::new();
58
59    while bed_reader.read_record(&mut record)? != 0 {
60        let ref_name = record.reference_sequence_name().to_string();
61        let ref_name = references_by_alias
62            .get(&ref_name)
63            .unwrap_or(&ref_name)
64            .to_string();
65        // noodles converts to 1 index, keep it 0.
66        let start = record.feature_start().unwrap().get() as i64 - 1;
67        let end = record.feature_end().unwrap().unwrap().get() as i64;
68        if let Some(bg) = sample_bgs.get(&ref_name) {
69            let projection = paths.entry(bg.id).or_insert_with(|| {
70                let path = BlockGroup::get_current_path(conn, &bg.id);
71                let graph = BlockGroup::get_graph(conn, &bg.id);
72                let mut tree = IntervalTree::default();
73                let mut position: i64 = 0;
74                for (node, strand) in project_path(&graph, &path.blocks(conn)) {
75                    if !is_terminal(node.node_id) {
76                        let end_position = position + node.length();
77                        tree.insert(position..end_position, (node, strand));
78                        position = end_position;
79                    }
80                }
81                tree
82            });
83            let range = start..end;
84            let values: Vec<_> = record.other_fields().iter().map(Value::from).collect();
85            let other_fields = OtherFields::from(values);
86            for (overlap, (node, _strand)) in projection.iter_overlaps(&range) {
87                let overlap_start = max(start, overlap.start) as usize;
88                let overlap_end = min(end, overlap.end) as usize;
89                let out_record = bed::feature::RecordBuf::<3>::builder()
90                    .set_reference_sequence_name(format!("{nid}", nid = node.node_id))
91                    .set_feature_start(Position::try_from(overlap_start + 1).unwrap())
92                    .set_feature_end(Position::try_from(overlap_end).unwrap())
93                    .set_other_fields(other_fields.clone())
94                    .build();
95                bed_writer.write_feature_record(&out_record)?;
96            }
97        }
98    }
99    Ok(())
100}
101
102#[cfg(test)]
103mod tests {
104    use std::{fs::File, path::PathBuf};
105
106    use gen_models::{reference_alias::ReferenceAlias, sample::Sample};
107
108    use super::translate_bed;
109    use crate::test_helpers::{get_connection, setup_test_data};
110
111    #[test]
112    fn translates_coordinates_to_nodes() {
113        let conn = get_connection();
114        setup_test_data(&conn);
115
116        let bed_path = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("./fixtures/simple.bed");
117        let collection = "test".to_string();
118
119        let mut buffer = Vec::new();
120        translate_bed(
121            &conn,
122            &collection,
123            "foo",
124            File::open(bed_path.clone()).expect("should open fixture bed"),
125            &mut buffer,
126        )
127        .expect("should translate bed for sample foo");
128        let results = String::from_utf8(buffer).expect("translated output should be valid UTF-8");
129        assert_eq!(
130            results,
131            concat!(
132                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t1\t3\tabc123.1\t0\t-\t1\t10\t0,0,0\t3\t102,188,129,\t0,3508,4691,\n",
133                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t3\t4\tabc123.1\t0\t-\t1\t10\t0,0,0\t3\t102,188,129,\t0,3508,4691,\n",
134                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t4\t10\tabc123.1\t0\t-\t1\t10\t0,0,0\t3\t102,188,129,\t0,3508,4691,\n",
135                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t5\t8\txyz.1\t0\t-\t5\t8\t0,0,0\t1\t113,\t0,\n",
136                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t10\t16\txyz.2\t0\t+\t10\t16\t0,0,0\t2\t142,326,\t0,10710,\n",
137                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t14\t17\tfoo.1\t0\t+\t14\t23\t0,0,0\t2\t142,326,\t0,10710,\n",
138                "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\t17\t23\tfoo.1\t0\t+\t14\t23\t0,0,0\t2\t142,326,\t0,10710,\n",
139            )
140        );
141
142        let mut buffer = Vec::new();
143        translate_bed(
144            &conn,
145            &collection,
146            Sample::DEFAULT_NAME,
147            File::open(bed_path).expect("should open fixture bed"),
148            &mut buffer,
149        )
150        .expect("should translate bed for reference sample");
151        let results = String::from_utf8(buffer).expect("translated output should be valid UTF-8");
152        assert_eq!(
153            results,
154            concat!(
155                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t1\t10\tabc123.1\t0\t-\t1\t10\t0,0,0\t3\t102,188,129,\t0,3508,4691,\n",
156                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t5\t8\txyz.1\t0\t-\t5\t8\t0,0,0\t1\t113,\t0,\n",
157                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t10\t16\txyz.2\t0\t+\t10\t16\t0,0,0\t2\t142,326,\t0,10710,\n",
158                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t14\t17\tfoo.1\t0\t+\t14\t23\t0,0,0\t2\t142,326,\t0,10710,\n",
159                "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\t17\t23\tfoo.1\t0\t+\t14\t23\t0,0,0\t2\t142,326,\t0,10710,\n",
160            )
161        );
162    }
163
164    #[test]
165    fn translates_bed_using_reference_aliases() {
166        let conn = get_connection();
167        setup_test_data(&conn);
168
169        let bed_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
170            .join("./fixtures/simple-with-reference-alias.bed");
171        let collection = "test".to_string();
172
173        // Add a reference alias for the block group used in the fixture bed
174        ReferenceAlias::create(
175            &conn,
176            "alternate contig",
177            Some("m456".to_string()),
178            Some("m123".to_string()),
179            Some("m789".to_string()),
180            Some("chr1".to_string()),
181            None,
182            None,
183        )
184        .expect("should create reference alias");
185
186        let mut buffer = Vec::new();
187        translate_bed(
188            &conn,
189            &collection,
190            Sample::DEFAULT_NAME,
191            File::open(bed_path).expect("should open fixture bed"),
192            &mut buffer,
193        )
194        .expect("should translate bed for reference sample");
195
196        let results = String::from_utf8(buffer).expect("translated output should be valid UTF-8");
197        assert_eq!(
198            results,
199            concat!(
200                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t1\t10\tabc123.1\t0\t-\t1\t10\t0,0,0\t3\t102,188,129,\t0,3508,4691,\n",
201                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t5\t8\txyz.1\t0\t-\t5\t8\t0,0,0\t1\t113,\t0,\n",
202                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t10\t16\txyz.2\t0\t+\t10\t16\t0,0,0\t2\t142,326,\t0,10710,\n",
203                "0cbb0b7e8171228e0ea97287f369c0099f0ccabc6a9950320650bc27bd974c8a\t14\t17\tfoo.1\t0\t+\t14\t23\t0,0,0\t2\t142,326,\t0,10710,\n",
204                "086ae30894dda8efdc19d4dfadd5e6e24af8066e9ee63e56abe897993bebd112\t17\t23\tfoo.1\t0\t+\t14\t23\t0,0,0\t2\t142,326,\t0,10710,\n",
205            )
206        );
207    }
208}