Skip to main content

bidirected_adjacency_array/io/
gfa1.rs

1use std::{
2    borrow::Cow,
3    collections::HashMap,
4    fmt::Debug,
5    io::{BufRead, Write},
6};
7
8use log::warn;
9use tagged_vec::TaggedVec;
10
11use crate::{
12    graph::{BidirectedAdjacencyArray, BidirectedEdge},
13    index::{EdgeIndex, GraphIndexInteger, NodeIndex},
14};
15
16#[cfg(test)]
17mod tests;
18
19pub trait GfaNodeData {
20    fn name(&'_ self) -> Cow<'_, str>;
21    fn sequence(&'_ self) -> Cow<'_, str>;
22}
23
24pub trait GfaEdgeData {
25    fn overlap(&self) -> u16;
26}
27
28#[derive(thiserror::Error, Debug)]
29pub enum GfaReadError {
30    #[error("I/O error: {0}")]
31    IoError(#[from] std::io::Error),
32
33    #[error("a header line was found after other lines")]
34    WronglyPositionedHeader,
35
36    #[error("an S line is missing the sequence name")]
37    MissingSequenceNameInSLine,
38
39    #[error("an L line is missing the four fields specifying the edge endpoints")]
40    LLineTooShort,
41
42    #[error("unknown node name '{0}' in an L line")]
43    UnknownNodeName(String),
44
45    #[error("unknown sign '{0}' in an L line")]
46    UnknownGfaNodeSign(String),
47}
48
49struct UnresolvedBidirectedEdge {
50    from: String,
51    from_forward: bool,
52    to: String,
53    to_forward: bool,
54    data: PlainGfaEdgeData,
55}
56
57impl<
58    IndexType: GraphIndexInteger,
59    NodeData: From<PlainGfaNodeData>,
60    EdgeData: From<PlainGfaEdgeData>,
61> BidirectedAdjacencyArray<IndexType, NodeData, EdgeData>
62{
63    pub fn read_gfa1(
64        reader: impl BufRead,
65    ) -> Result<BidirectedAdjacencyArray<IndexType, NodeData, EdgeData>, GfaReadError> {
66        let mut node_name_to_node = HashMap::new();
67        let mut nodes = TaggedVec::<NodeIndex<IndexType>, _>::new();
68        let mut edges = TaggedVec::<EdgeIndex<IndexType>, _>::new();
69        let mut is_header_allowed = true;
70
71        for line in reader.lines() {
72            let line = line?;
73            let line = line.trim().split('\t').collect::<Vec<_>>();
74
75            match line[0] {
76                "H" => {
77                    if is_header_allowed {
78                        if line.get(1) != Some(&"VN:Z:1.0") {
79                            warn!("Unsupported GFA version");
80                        }
81                    } else {
82                        return Err(GfaReadError::WronglyPositionedHeader);
83                    }
84                }
85
86                "S" => {
87                    let name = line
88                        .get(1)
89                        .ok_or(GfaReadError::MissingSequenceNameInSLine)?
90                        .to_string();
91                    let sequence = line.get(2).unwrap_or(&"").to_string();
92                    let node = nodes.push(
93                        PlainGfaNodeData {
94                            name: name.clone(),
95                            sequence,
96                        }
97                        .into(),
98                    );
99                    node_name_to_node.insert(name.clone(), node);
100                }
101
102                "L" => {
103                    // Parse edge line.
104                    let from = line.get(1).ok_or(GfaReadError::LLineTooShort)?.to_string();
105                    let from_forward = match *line.get(2).ok_or(GfaReadError::LLineTooShort)? {
106                        "+" => true,
107                        "-" => false,
108                        other => return Err(GfaReadError::UnknownGfaNodeSign(other.to_string())),
109                    };
110                    let to = line.get(3).ok_or(GfaReadError::LLineTooShort)?.to_string();
111                    let to_forward = match *line.get(4).ok_or(GfaReadError::LLineTooShort)? {
112                        "+" => true,
113                        "-" => false,
114                        other => return Err(GfaReadError::UnknownGfaNodeSign(other.to_string())),
115                    };
116                    let overlap_str = line.get(5).unwrap_or(&"0M");
117                    let overlap = overlap_str
118                        .trim_end_matches('M')
119                        .parse::<u16>()
120                        .unwrap_or(0);
121
122                    edges.push(UnresolvedBidirectedEdge {
123                        from,
124                        from_forward,
125                        to,
126                        to_forward,
127                        data: PlainGfaEdgeData { overlap },
128                    });
129                }
130
131                other => {
132                    warn!("Unsupported GFA line type: {}", other);
133                }
134            }
135
136            is_header_allowed = false;
137        }
138
139        let edges = edges
140            .into_values_iter()
141            .map(|edge| {
142                let from = node_name_to_node
143                    .get(&edge.from)
144                    .copied()
145                    .ok_or(GfaReadError::UnknownNodeName(edge.from))?;
146                let to = node_name_to_node
147                    .get(&edge.to)
148                    .copied()
149                    .ok_or(GfaReadError::UnknownNodeName(edge.to))?;
150
151                let from_forward = edge.from_forward;
152                let to_forward = edge.to_forward;
153                let data = EdgeData::from(edge.data);
154
155                Result::<_, GfaReadError>::Ok(BidirectedEdge {
156                    from,
157                    from_forward,
158                    to,
159                    to_forward,
160                    data,
161                })
162            })
163            .collect::<Result<Vec<_>, GfaReadError>>()?;
164
165        // Drop name index before constructing graph to save RAM.
166        drop(node_name_to_node);
167        Ok(BidirectedAdjacencyArray::new(nodes, edges.into()))
168    }
169}
170
171impl<IndexType: GraphIndexInteger, NodeData: GfaNodeData, EdgeData: GfaEdgeData>
172    BidirectedAdjacencyArray<IndexType, NodeData, EdgeData>
173{
174    pub fn write_gfa1(&self, mut writer: impl Write) -> Result<(), std::io::Error> {
175        // Write header.
176        writeln!(writer, "H\tVN:Z:1.0")?;
177
178        // Write nodes.
179        for node in self.iter_nodes() {
180            let node_data = self.node_data(node);
181            writeln!(writer, "S\t{}\t{}", node_data.name(), node_data.sequence())?;
182        }
183
184        // Write edges.
185        for edge in self.iter_edges() {
186            let edge_data = self.edge(edge);
187
188            let from_node_name = self.node_data(edge_data.from().into_bidirected()).name();
189            let to_node_name = self.node_data(edge_data.to().into_bidirected()).name();
190
191            // In mathematical notation, traversing an edge from a to b means using edge (a, \hat{b}).
192            // But in GFA1, this means using edge (a, b), where both signs are unchanged.
193            let from_node_sign = if edge_data.from().is_forward() {
194                "+"
195            } else {
196                "-"
197            };
198            let to_node_sign = if edge_data.to().is_forward() {
199                "+"
200            } else {
201                "-"
202            };
203
204            let overlap = edge_data.data().overlap();
205
206            writeln!(
207                writer,
208                "L\t{from_node_name}\t{from_node_sign}\t{to_node_name}\t{to_node_sign}\t{overlap}M",
209            )?;
210        }
211
212        Ok(())
213    }
214}
215
216#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
217pub struct PlainGfaNodeData {
218    name: String,
219    sequence: String,
220}
221
222#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
223pub struct PlainGfaEdgeData {
224    overlap: u16,
225}
226
227impl PlainGfaNodeData {
228    pub fn new(name: impl ToString, sequence: impl ToString) -> Self {
229        Self {
230            name: name.to_string(),
231            sequence: sequence.to_string(),
232        }
233    }
234}
235
236impl GfaNodeData for PlainGfaNodeData {
237    fn name(&'_ self) -> Cow<'_, str> {
238        Cow::Borrowed(&self.name)
239    }
240
241    fn sequence(&'_ self) -> Cow<'_, str> {
242        Cow::Borrowed(&self.sequence)
243    }
244}
245
246impl PlainGfaEdgeData {
247    pub fn new(overlap: u16) -> Self {
248        Self { overlap }
249    }
250}
251
252impl GfaEdgeData for PlainGfaEdgeData {
253    fn overlap(&self) -> u16 {
254        self.overlap
255    }
256}