Skip to main content

bidirected_adjacency_array/io/
gfa1.rs

1use std::{
2    borrow::Cow,
3    collections::HashMap,
4    fmt::Debug,
5    io::{BufRead, Write},
6};
7
8use log::warn;
9use tagged_vec::TaggedVec;
10
11use crate::{
12    graph::{BidirectedAdjacencyArray, BidirectedEdge},
13    index::{EdgeIndex, GraphIndexInteger, NodeIndex},
14};
15
16#[cfg(test)]
17mod tests;
18
19pub trait GfaNodeData {
20    fn name(&'_ self) -> Cow<'_, str>;
21    fn sequence(&'_ self) -> Cow<'_, str>;
22}
23
24pub trait GfaEdgeData {
25    fn overlap(&self) -> u16;
26}
27
28#[derive(thiserror::Error, Debug)]
29pub enum GfaReadError {
30    #[error("I/O error: {0}")]
31    IoError(#[from] std::io::Error),
32
33    #[error("a header line was found after other lines")]
34    WronglyPositionedHeader,
35
36    #[error("an S line is missing the sequence name")]
37    MissingSequenceNameInSLine,
38
39    #[error("an L line is missing the four fields specifying the edge endpoints")]
40    LLineTooShort,
41
42    #[error("unknown node name '{0}' in an L line")]
43    UnknownNodeName(String),
44
45    #[error("unknown sign '{0}' in an L line")]
46    UnknownGfaNodeSign(String),
47}
48
49pub fn read_gfa1<IndexType: GraphIndexInteger>(
50    reader: impl BufRead,
51) -> Result<BidirectedAdjacencyArray<IndexType, PlainGfaNodeData, PlainGfaEdgeData>, GfaReadError> {
52    let mut node_name_to_node = HashMap::new();
53    let mut nodes = TaggedVec::<NodeIndex<IndexType>, _>::new();
54    let mut edges = TaggedVec::<EdgeIndex<IndexType>, _>::new();
55    let mut is_header_allowed = true;
56
57    for line in reader.lines() {
58        let line = line?;
59        let line = line.trim().split('\t').collect::<Vec<_>>();
60
61        match line[0] {
62            "H" => {
63                if is_header_allowed {
64                    if line.get(1) != Some(&"VN:Z:1.0") {
65                        warn!("Unsupported GFA version");
66                    }
67                } else {
68                    return Err(GfaReadError::WronglyPositionedHeader);
69                }
70            }
71
72            "S" => {
73                let name = line
74                    .get(1)
75                    .ok_or(GfaReadError::MissingSequenceNameInSLine)?
76                    .to_string();
77                let sequence = line.get(2).unwrap_or(&"").to_string();
78                let node = nodes.push(PlainGfaNodeData {
79                    name: name.clone(),
80                    sequence,
81                });
82                node_name_to_node.insert(name.clone(), node);
83            }
84
85            "L" => {
86                // Parse edge line.
87                let from_name = line.get(1).ok_or(GfaReadError::LLineTooShort)?;
88                let from = node_name_to_node
89                    .get(*from_name)
90                    .copied()
91                    .ok_or_else(|| GfaReadError::UnknownNodeName(from_name.to_string()))?;
92                let from_forward = match *line.get(2).ok_or(GfaReadError::LLineTooShort)? {
93                    "+" => true,
94                    "-" => false,
95                    other => return Err(GfaReadError::UnknownGfaNodeSign(other.to_string())),
96                };
97                let to_name = line.get(3).ok_or(GfaReadError::LLineTooShort)?;
98                let to = node_name_to_node
99                    .get(*to_name)
100                    .copied()
101                    .ok_or_else(|| GfaReadError::UnknownNodeName(to_name.to_string()))?;
102                let to_forward = match *line.get(4).ok_or(GfaReadError::LLineTooShort)? {
103                    "+" => true,
104                    "-" => false,
105                    other => return Err(GfaReadError::UnknownGfaNodeSign(other.to_string())),
106                };
107                let overlap_str = line.get(5).unwrap_or(&"0M");
108                let overlap = overlap_str
109                    .trim_end_matches('M')
110                    .parse::<u16>()
111                    .unwrap_or(0);
112
113                edges.push(BidirectedEdge {
114                    from,
115                    from_forward,
116                    to,
117                    to_forward,
118                    data: PlainGfaEdgeData { overlap },
119                });
120            }
121
122            other => {
123                warn!("Unsupported GFA line type: {}", other);
124            }
125        }
126
127        is_header_allowed = false;
128    }
129
130    Ok(BidirectedAdjacencyArray::new(nodes, edges))
131}
132
133pub fn write_gfa1<IndexType: GraphIndexInteger, NodeData: GfaNodeData, EdgeData: GfaEdgeData>(
134    graph: &BidirectedAdjacencyArray<IndexType, NodeData, EdgeData>,
135    mut writer: impl Write,
136) -> Result<(), std::io::Error> {
137    // Write header.
138    writeln!(writer, "H\tVN:Z:1.0")?;
139
140    // Write nodes.
141    for node in graph.iter_nodes() {
142        let node_data = graph.node_data(node);
143        writeln!(writer, "S\t{}\t{}", node_data.name(), node_data.sequence())?;
144    }
145
146    // Write edges.
147    for edge in graph.iter_edges() {
148        let edge_data = graph.edge(edge);
149
150        let from_node_name = graph.node_data(edge_data.from().into_bidirected()).name();
151        let to_node_name = graph.node_data(edge_data.to().into_bidirected()).name();
152
153        // In mathematical notation, traversing an edge from a to b means using edge (a, \hat{b}).
154        // But in GFA1, this means using edge (a, b), where both signs are unchanged.
155        let from_node_sign = if edge_data.from().is_forward() {
156            "+"
157        } else {
158            "-"
159        };
160        let to_node_sign = if edge_data.to().is_forward() {
161            "+"
162        } else {
163            "-"
164        };
165
166        let overlap = edge_data.data().overlap();
167
168        writeln!(
169            writer,
170            "L\t{from_node_name}\t{from_node_sign}\t{to_node_name}\t{to_node_sign}\t{overlap}M",
171        )?;
172    }
173
174    Ok(())
175}
176
177#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
178pub struct PlainGfaNodeData {
179    name: String,
180    sequence: String,
181}
182
183#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
184pub struct PlainGfaEdgeData {
185    overlap: u16,
186}
187
188impl PlainGfaNodeData {
189    pub fn new(name: impl ToString, sequence: impl ToString) -> Self {
190        Self {
191            name: name.to_string(),
192            sequence: sequence.to_string(),
193        }
194    }
195}
196
197impl GfaNodeData for PlainGfaNodeData {
198    fn name(&'_ self) -> Cow<'_, str> {
199        Cow::Borrowed(&self.name)
200    }
201
202    fn sequence(&'_ self) -> Cow<'_, str> {
203        Cow::Borrowed(&self.sequence)
204    }
205}
206
207impl PlainGfaEdgeData {
208    pub fn new(overlap: u16) -> Self {
209        Self { overlap }
210    }
211}
212
213impl GfaEdgeData for PlainGfaEdgeData {
214    fn overlap(&self) -> u16 {
215        self.overlap
216    }
217}