Skip to main content

bidirected_adjacency_array/io/
gfa1.rs

1use std::{
2    borrow::Cow,
3    collections::HashMap,
4    fmt::Debug,
5    io::{BufRead, Write},
6};
7
8use log::warn;
9use tagged_vec::TaggedVec;
10
11use crate::{
12    graph::{BidirectedAdjacencyArray, BidirectedEdge},
13    index::{EdgeIndex, GraphIndexInteger, NodeIndex},
14};
15
16#[cfg(test)]
17mod tests;
18
19pub trait GfaNodeData {
20    fn name(&'_ self) -> Cow<'_, str>;
21    fn sequence(&'_ self) -> Cow<'_, str>;
22}
23
24pub trait GfaEdgeData {
25    fn overlap(&self) -> u16;
26}
27
28#[derive(thiserror::Error, Debug)]
29pub enum GfaReadError {
30    #[error("I/O error: {0}")]
31    IoError(#[from] std::io::Error),
32
33    #[error("a header line was found after other lines")]
34    WronglyPositionedHeader,
35
36    #[error("an S line is missing the sequence name")]
37    MissingSequenceNameInSLine,
38
39    #[error("an L line is missing the four fields specifying the edge endpoints")]
40    LLineTooShort,
41
42    #[error("unknown node name '{0}' in an L line")]
43    UnknownNodeName(String),
44
45    #[error("unknown sign '{0}' in an L line")]
46    UnknownGfaNodeSign(String),
47}
48
49impl<
50    IndexType: GraphIndexInteger,
51    NodeData: From<PlainGfaNodeData>,
52    EdgeData: From<PlainGfaEdgeData>,
53> BidirectedAdjacencyArray<IndexType, NodeData, EdgeData>
54{
55    pub fn read_gfa1(
56        reader: impl BufRead,
57    ) -> Result<BidirectedAdjacencyArray<IndexType, NodeData, EdgeData>, GfaReadError> {
58        let mut node_name_to_node = HashMap::new();
59        let mut nodes = TaggedVec::<NodeIndex<IndexType>, _>::new();
60        let mut edges = TaggedVec::<EdgeIndex<IndexType>, _>::new();
61        let mut is_header_allowed = true;
62
63        for line in reader.lines() {
64            let line = line?;
65            let line = line.trim().split('\t').collect::<Vec<_>>();
66
67            match line[0] {
68                "H" => {
69                    if is_header_allowed {
70                        if line.get(1) != Some(&"VN:Z:1.0") {
71                            warn!("Unsupported GFA version");
72                        }
73                    } else {
74                        return Err(GfaReadError::WronglyPositionedHeader);
75                    }
76                }
77
78                "S" => {
79                    let name = line
80                        .get(1)
81                        .ok_or(GfaReadError::MissingSequenceNameInSLine)?
82                        .to_string();
83                    let sequence = line.get(2).unwrap_or(&"").to_string();
84                    let node = nodes.push(
85                        PlainGfaNodeData {
86                            name: name.clone(),
87                            sequence,
88                        }
89                        .into(),
90                    );
91                    node_name_to_node.insert(name.clone(), node);
92                }
93
94                "L" => {
95                    // Parse edge line.
96                    let from_name = line.get(1).ok_or(GfaReadError::LLineTooShort)?;
97                    let from = node_name_to_node
98                        .get(*from_name)
99                        .copied()
100                        .ok_or_else(|| GfaReadError::UnknownNodeName(from_name.to_string()))?;
101                    let from_forward = match *line.get(2).ok_or(GfaReadError::LLineTooShort)? {
102                        "+" => true,
103                        "-" => false,
104                        other => return Err(GfaReadError::UnknownGfaNodeSign(other.to_string())),
105                    };
106                    let to_name = line.get(3).ok_or(GfaReadError::LLineTooShort)?;
107                    let to = node_name_to_node
108                        .get(*to_name)
109                        .copied()
110                        .ok_or_else(|| GfaReadError::UnknownNodeName(to_name.to_string()))?;
111                    let to_forward = match *line.get(4).ok_or(GfaReadError::LLineTooShort)? {
112                        "+" => true,
113                        "-" => false,
114                        other => return Err(GfaReadError::UnknownGfaNodeSign(other.to_string())),
115                    };
116                    let overlap_str = line.get(5).unwrap_or(&"0M");
117                    let overlap = overlap_str
118                        .trim_end_matches('M')
119                        .parse::<u16>()
120                        .unwrap_or(0);
121
122                    edges.push(BidirectedEdge {
123                        from,
124                        from_forward,
125                        to,
126                        to_forward,
127                        data: PlainGfaEdgeData { overlap }.into(),
128                    });
129                }
130
131                other => {
132                    warn!("Unsupported GFA line type: {}", other);
133                }
134            }
135
136            is_header_allowed = false;
137        }
138
139        Ok(BidirectedAdjacencyArray::new(nodes, edges))
140    }
141}
142
143impl<IndexType: GraphIndexInteger, NodeData: GfaNodeData, EdgeData: GfaEdgeData>
144    BidirectedAdjacencyArray<IndexType, NodeData, EdgeData>
145{
146    pub fn write_gfa1(&self, mut writer: impl Write) -> Result<(), std::io::Error> {
147        // Write header.
148        writeln!(writer, "H\tVN:Z:1.0")?;
149
150        // Write nodes.
151        for node in self.iter_nodes() {
152            let node_data = self.node_data(node);
153            writeln!(writer, "S\t{}\t{}", node_data.name(), node_data.sequence())?;
154        }
155
156        // Write edges.
157        for edge in self.iter_edges() {
158            let edge_data = self.edge(edge);
159
160            let from_node_name = self.node_data(edge_data.from().into_bidirected()).name();
161            let to_node_name = self.node_data(edge_data.to().into_bidirected()).name();
162
163            // In mathematical notation, traversing an edge from a to b means using edge (a, \hat{b}).
164            // But in GFA1, this means using edge (a, b), where both signs are unchanged.
165            let from_node_sign = if edge_data.from().is_forward() {
166                "+"
167            } else {
168                "-"
169            };
170            let to_node_sign = if edge_data.to().is_forward() {
171                "+"
172            } else {
173                "-"
174            };
175
176            let overlap = edge_data.data().overlap();
177
178            writeln!(
179                writer,
180                "L\t{from_node_name}\t{from_node_sign}\t{to_node_name}\t{to_node_sign}\t{overlap}M",
181            )?;
182        }
183
184        Ok(())
185    }
186}
187
188#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
189pub struct PlainGfaNodeData {
190    name: String,
191    sequence: String,
192}
193
194#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
195pub struct PlainGfaEdgeData {
196    overlap: u16,
197}
198
199impl PlainGfaNodeData {
200    pub fn new(name: impl ToString, sequence: impl ToString) -> Self {
201        Self {
202            name: name.to_string(),
203            sequence: sequence.to_string(),
204        }
205    }
206}
207
208impl GfaNodeData for PlainGfaNodeData {
209    fn name(&'_ self) -> Cow<'_, str> {
210        Cow::Borrowed(&self.name)
211    }
212
213    fn sequence(&'_ self) -> Cow<'_, str> {
214        Cow::Borrowed(&self.sequence)
215    }
216}
217
218impl PlainGfaEdgeData {
219    pub fn new(overlap: u16) -> Self {
220        Self { overlap }
221    }
222}
223
224impl GfaEdgeData for PlainGfaEdgeData {
225    fn overlap(&self) -> u16 {
226        self.overlap
227    }
228}