bidirected_adjacency_array/io/
gfa1.rs1use std::{
2 borrow::Cow,
3 collections::HashMap,
4 fmt::Debug,
5 io::{BufRead, Write},
6};
7
8use log::warn;
9use tagged_vec::TaggedVec;
10
11use crate::{
12 graph::{BidirectedAdjacencyArray, BidirectedEdge},
13 index::{EdgeIndex, GraphIndexInteger, NodeIndex},
14};
15
16#[cfg(test)]
17mod tests;
18
19pub trait GfaNodeData {
20 fn name(&'_ self) -> Cow<'_, str>;
21 fn sequence(&'_ self) -> Cow<'_, str>;
22}
23
24pub trait GfaEdgeData {
25 fn overlap(&self) -> u16;
26}
27
28#[derive(thiserror::Error, Debug)]
29pub enum GfaReadError {
30 #[error("I/O error: {0}")]
31 IoError(#[from] std::io::Error),
32
33 #[error("a header line was found after other lines")]
34 WronglyPositionedHeader,
35
36 #[error("an S line is missing the sequence name")]
37 MissingSequenceNameInSLine,
38
39 #[error("an L line is missing the four fields specifying the edge endpoints")]
40 LLineTooShort,
41
42 #[error("unknown node name '{0}' in an L line")]
43 UnknownNodeName(String),
44
45 #[error("unknown sign '{0}' in an L line")]
46 UnknownGfaNodeSign(String),
47}
48
49struct UnresolvedBidirectedEdge {
50 from: String,
51 from_forward: bool,
52 to: String,
53 to_forward: bool,
54 data: PlainGfaEdgeData,
55}
56
57impl<
58 IndexType: GraphIndexInteger,
59 NodeData: From<PlainGfaNodeData>,
60 EdgeData: From<PlainGfaEdgeData>,
61> BidirectedAdjacencyArray<IndexType, NodeData, EdgeData>
62{
63 pub fn read_gfa1(
64 reader: impl BufRead,
65 ) -> Result<BidirectedAdjacencyArray<IndexType, NodeData, EdgeData>, GfaReadError> {
66 let mut node_name_to_node = HashMap::new();
67 let mut nodes = TaggedVec::<NodeIndex<IndexType>, _>::new();
68 let mut edges = TaggedVec::<EdgeIndex<IndexType>, _>::new();
69 let mut is_header_allowed = true;
70
71 for line in reader.lines() {
72 let line = line?;
73 let line = line.trim().split('\t').collect::<Vec<_>>();
74
75 match line[0] {
76 "H" => {
77 if is_header_allowed {
78 if line.get(1) != Some(&"VN:Z:1.0") {
79 warn!("Unsupported GFA version");
80 }
81 } else {
82 return Err(GfaReadError::WronglyPositionedHeader);
83 }
84 }
85
86 "S" => {
87 let name = line
88 .get(1)
89 .ok_or(GfaReadError::MissingSequenceNameInSLine)?
90 .to_string();
91 let sequence = line.get(2).unwrap_or(&"").to_string();
92 let node = nodes.push(
93 PlainGfaNodeData {
94 name: name.clone(),
95 sequence,
96 }
97 .into(),
98 );
99 node_name_to_node.insert(name.clone(), node);
100 }
101
102 "L" => {
103 let from = line.get(1).ok_or(GfaReadError::LLineTooShort)?.to_string();
105 let from_forward = match *line.get(2).ok_or(GfaReadError::LLineTooShort)? {
106 "+" => true,
107 "-" => false,
108 other => return Err(GfaReadError::UnknownGfaNodeSign(other.to_string())),
109 };
110 let to = line.get(3).ok_or(GfaReadError::LLineTooShort)?.to_string();
111 let to_forward = match *line.get(4).ok_or(GfaReadError::LLineTooShort)? {
112 "+" => true,
113 "-" => false,
114 other => return Err(GfaReadError::UnknownGfaNodeSign(other.to_string())),
115 };
116 let overlap_str = line.get(5).unwrap_or(&"0M");
117 let overlap = overlap_str
118 .trim_end_matches('M')
119 .parse::<u16>()
120 .unwrap_or(0);
121
122 edges.push(UnresolvedBidirectedEdge {
123 from,
124 from_forward,
125 to,
126 to_forward,
127 data: PlainGfaEdgeData { overlap },
128 });
129 }
130
131 other => {
132 warn!("Unsupported GFA line type: {}", other);
133 }
134 }
135
136 is_header_allowed = false;
137 }
138
139 let edges = edges
140 .into_values_iter()
141 .map(|edge| {
142 let from = node_name_to_node
143 .get(&edge.from)
144 .copied()
145 .ok_or(GfaReadError::UnknownNodeName(edge.from))?;
146 let to = node_name_to_node
147 .get(&edge.to)
148 .copied()
149 .ok_or(GfaReadError::UnknownNodeName(edge.to))?;
150
151 let from_forward = edge.from_forward;
152 let to_forward = edge.to_forward;
153 let data = EdgeData::from(edge.data);
154
155 Result::<_, GfaReadError>::Ok(BidirectedEdge {
156 from,
157 from_forward,
158 to,
159 to_forward,
160 data,
161 })
162 })
163 .collect::<Result<Vec<_>, GfaReadError>>()?;
164
165 drop(node_name_to_node);
167 Ok(BidirectedAdjacencyArray::new(nodes, edges.into()))
168 }
169}
170
171impl<IndexType: GraphIndexInteger, NodeData: GfaNodeData, EdgeData: GfaEdgeData>
172 BidirectedAdjacencyArray<IndexType, NodeData, EdgeData>
173{
174 pub fn write_gfa1(&self, mut writer: impl Write) -> Result<(), std::io::Error> {
175 writeln!(writer, "H\tVN:Z:1.0")?;
177
178 for node in self.iter_nodes() {
180 let node_data = self.node_data(node);
181 writeln!(writer, "S\t{}\t{}", node_data.name(), node_data.sequence())?;
182 }
183
184 for edge in self.iter_edges() {
186 let edge_data = self.edge(edge);
187
188 let from_node_name = self.node_data(edge_data.from().into_bidirected()).name();
189 let to_node_name = self.node_data(edge_data.to().into_bidirected()).name();
190
191 let from_node_sign = if edge_data.from().is_forward() {
194 "+"
195 } else {
196 "-"
197 };
198 let to_node_sign = if edge_data.to().is_forward() {
199 "+"
200 } else {
201 "-"
202 };
203
204 let overlap = edge_data.data().overlap();
205
206 writeln!(
207 writer,
208 "L\t{from_node_name}\t{from_node_sign}\t{to_node_name}\t{to_node_sign}\t{overlap}M",
209 )?;
210 }
211
212 Ok(())
213 }
214}
215
216#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
217pub struct PlainGfaNodeData {
218 name: String,
219 sequence: String,
220}
221
222#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
223pub struct PlainGfaEdgeData {
224 overlap: u16,
225}
226
227impl PlainGfaNodeData {
228 pub fn new(name: impl ToString, sequence: impl ToString) -> Self {
229 Self {
230 name: name.to_string(),
231 sequence: sequence.to_string(),
232 }
233 }
234}
235
236impl GfaNodeData for PlainGfaNodeData {
237 fn name(&'_ self) -> Cow<'_, str> {
238 Cow::Borrowed(&self.name)
239 }
240
241 fn sequence(&'_ self) -> Cow<'_, str> {
242 Cow::Borrowed(&self.sequence)
243 }
244}
245
246impl PlainGfaEdgeData {
247 pub fn new(overlap: u16) -> Self {
248 Self { overlap }
249 }
250}
251
252impl GfaEdgeData for PlainGfaEdgeData {
253 fn overlap(&self) -> u16 {
254 self.overlap
255 }
256}