bidirected_adjacency_array/io/
gfa1.rs1use std::{
2 borrow::Cow,
3 collections::HashMap,
4 fmt::Debug,
5 io::{BufRead, BufReader, BufWriter, Read, Write},
6};
7
8use log::warn;
9use tagged_vec::TaggedVec;
10
11use crate::{
12 graph::{BidirectedAdjacencyArray, BidirectedEdge},
13 index::{EdgeIndex, GraphIndexInteger, NodeIndex},
14};
15
16#[cfg(test)]
17mod tests;
18
19pub trait GfaNodeData {
20 fn name(&'_ self) -> Cow<'_, str>;
21 fn sequence(&'_ self) -> Cow<'_, str>;
22}
23
24pub trait GfaEdgeData {
25 fn overlap(&self) -> u16;
26}
27
28#[derive(thiserror::Error, Debug)]
29pub enum GfaReadError {
30 #[error("I/O error: {0}")]
31 IoError(#[from] std::io::Error),
32
33 #[error("a header line was found after other lines")]
34 WronglyPositionedHeader,
35
36 #[error("an S line is missing the sequence name")]
37 MissingSequenceNameInSLine,
38
39 #[error("an L line is missing the four fields specifying the edge endpoints")]
40 LLineTooShort,
41
42 #[error("unknown node name '{0}' in an L line")]
43 UnknownNodeName(String),
44
45 #[error("unknown sign '{0}' in an L line")]
46 UnknownGfaNodeSign(String),
47}
48
49pub fn read_gfa1<IndexType: GraphIndexInteger>(
50 reader: &mut impl Read,
51) -> Result<BidirectedAdjacencyArray<IndexType, PlainGfaNodeData, PlainGfaEdgeData>, GfaReadError> {
52 let reader = BufReader::new(reader);
53 let mut node_name_to_node = HashMap::new();
54 let mut nodes = TaggedVec::<NodeIndex<IndexType>, _>::new();
55 let mut edges = TaggedVec::<EdgeIndex<IndexType>, _>::new();
56 let mut is_header_allowed = true;
57
58 for line in reader.lines() {
59 let line = line?;
60 let line = line.trim().split('\t').collect::<Vec<_>>();
61
62 match line[0] {
63 "H" => {
64 if is_header_allowed {
65 if line.get(1) != Some(&"VN:Z:1.0") {
66 warn!("Unsupported GFA version");
67 }
68 } else {
69 return Err(GfaReadError::WronglyPositionedHeader);
70 }
71 }
72
73 "S" => {
74 let name = line
75 .get(1)
76 .ok_or(GfaReadError::MissingSequenceNameInSLine)?
77 .to_string();
78 let sequence = line.get(2).unwrap_or(&"").to_string();
79 let node = nodes.push(PlainGfaNodeData {
80 name: name.clone(),
81 sequence,
82 });
83 node_name_to_node.insert(name.clone(), node);
84 }
85
86 "L" => {
87 let from_name = line.get(1).ok_or(GfaReadError::LLineTooShort)?;
89 let from = node_name_to_node
90 .get(*from_name)
91 .copied()
92 .ok_or_else(|| GfaReadError::UnknownNodeName(from_name.to_string()))?;
93 let from_forward = match *line.get(2).ok_or(GfaReadError::LLineTooShort)? {
94 "+" => true,
95 "-" => false,
96 other => return Err(GfaReadError::UnknownGfaNodeSign(other.to_string())),
97 };
98 let to_name = line.get(3).ok_or(GfaReadError::LLineTooShort)?;
99 let to = node_name_to_node
100 .get(*to_name)
101 .copied()
102 .ok_or_else(|| GfaReadError::UnknownNodeName(to_name.to_string()))?;
103 let to_forward = match *line.get(4).ok_or(GfaReadError::LLineTooShort)? {
104 "+" => true,
105 "-" => false,
106 other => return Err(GfaReadError::UnknownGfaNodeSign(other.to_string())),
107 };
108 let overlap_str = line.get(5).unwrap_or(&"0M");
109 let overlap = overlap_str
110 .trim_end_matches('M')
111 .parse::<u16>()
112 .unwrap_or(0);
113
114 edges.push(BidirectedEdge {
115 from,
116 from_forward,
117 to,
118 to_forward,
119 data: PlainGfaEdgeData { overlap },
120 });
121 }
122
123 other => {
124 warn!("Unsupported GFA line type: {}", other);
125 }
126 }
127
128 is_header_allowed = false;
129 }
130
131 Ok(BidirectedAdjacencyArray::new(nodes, edges))
132}
133
134pub fn write_gfa1<IndexType: GraphIndexInteger, NodeData: GfaNodeData, EdgeData: GfaEdgeData>(
135 graph: &BidirectedAdjacencyArray<IndexType, NodeData, EdgeData>,
136 writer: &mut impl Write,
137) -> Result<(), std::io::Error> {
138 let mut writer = BufWriter::new(writer);
139
140 writeln!(writer, "H\tVN:Z:1.0")?;
142
143 for node in graph.iter_nodes() {
145 let node_data = graph.node_data(node);
146 writeln!(writer, "S\t{}\t{}", node_data.name(), node_data.sequence())?;
147 }
148
149 for edge in graph.iter_edges() {
151 let edge_data = graph.edge(edge);
152
153 let from_node_name = graph.node_data(edge_data.from().into_bidirected()).name();
154 let to_node_name = graph.node_data(edge_data.to().into_bidirected()).name();
155
156 let from_node_sign = if edge_data.from().is_forward() {
159 "+"
160 } else {
161 "-"
162 };
163 let to_node_sign = if edge_data.to().is_forward() {
164 "+"
165 } else {
166 "-"
167 };
168
169 let overlap = edge_data.data().overlap();
170
171 writeln!(
172 writer,
173 "L\t{from_node_name}\t{from_node_sign}\t{to_node_name}\t{to_node_sign}\t{overlap}M",
174 )?;
175 }
176
177 Ok(())
178}
179
180#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
181pub struct PlainGfaNodeData {
182 name: String,
183 sequence: String,
184}
185
186#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
187pub struct PlainGfaEdgeData {
188 overlap: u16,
189}
190
191impl PlainGfaNodeData {
192 pub fn new(name: impl ToString, sequence: impl ToString) -> Self {
193 Self {
194 name: name.to_string(),
195 sequence: sequence.to_string(),
196 }
197 }
198}
199
200impl GfaNodeData for PlainGfaNodeData {
201 fn name(&'_ self) -> Cow<'_, str> {
202 Cow::Borrowed(&self.name)
203 }
204
205 fn sequence(&'_ self) -> Cow<'_, str> {
206 Cow::Borrowed(&self.sequence)
207 }
208}
209
210impl PlainGfaEdgeData {
211 pub fn new(overlap: u16) -> Self {
212 Self { overlap }
213 }
214}
215
216impl GfaEdgeData for PlainGfaEdgeData {
217 fn overlap(&self) -> u16 {
218 self.overlap
219 }
220}