1use std::collections::BTreeMap;
7
8use cyanea_core::{CyaneaError, Result};
9
10#[derive(Debug, Clone)]
15pub struct DeBruijnGraph {
16 k: usize,
17 edges: BTreeMap<Vec<u8>, Vec<Vec<u8>>>,
19 kmer_counts: BTreeMap<Vec<u8>, usize>,
21}
22
23#[derive(Debug, Clone)]
25pub struct Unitig {
26 pub sequence: Vec<u8>,
28 pub coverage: f64,
30}
31
32impl DeBruijnGraph {
33 pub fn from_sequences(sequences: &[&[u8]], k: usize) -> Result<Self> {
43 if k < 2 {
44 return Err(CyaneaError::InvalidInput(
45 "k must be at least 2 for De Bruijn graph construction".into(),
46 ));
47 }
48 let mut graph = Self {
49 k,
50 edges: BTreeMap::new(),
51 kmer_counts: BTreeMap::new(),
52 };
53 for seq in sequences {
54 if seq.len() < k {
55 return Err(CyaneaError::InvalidInput(format!(
56 "sequence length {} is shorter than k={}",
57 seq.len(),
58 k
59 )));
60 }
61 let upper: Vec<u8> = seq.iter().map(|b| b.to_ascii_uppercase()).collect();
62 for b in &upper {
63 if !matches!(b, b'A' | b'C' | b'G' | b'T') {
64 return Err(CyaneaError::InvalidInput(format!(
65 "invalid base '{}' for De Bruijn graph",
66 *b as char
67 )));
68 }
69 }
70 for window in upper.windows(k) {
71 graph.add_kmer(window);
72 }
73 }
74 Ok(graph)
75 }
76
77 pub fn from_kmers(kmers: &[&[u8]]) -> Result<Self> {
85 if kmers.is_empty() {
86 return Err(CyaneaError::InvalidInput(
87 "at least one k-mer is required".into(),
88 ));
89 }
90 let k = kmers[0].len();
91 if k < 2 {
92 return Err(CyaneaError::InvalidInput(
93 "k-mers must have length at least 2".into(),
94 ));
95 }
96 for kmer in kmers {
97 if kmer.len() != k {
98 return Err(CyaneaError::InvalidInput(format!(
99 "all k-mers must have the same length; expected {} but got {}",
100 k,
101 kmer.len()
102 )));
103 }
104 }
105 let mut graph = Self {
106 k,
107 edges: BTreeMap::new(),
108 kmer_counts: BTreeMap::new(),
109 };
110 for kmer in kmers {
111 let upper: Vec<u8> = kmer.iter().map(|b| b.to_ascii_uppercase()).collect();
112 graph.add_kmer(&upper);
113 }
114 Ok(graph)
115 }
116
117 fn add_kmer(&mut self, kmer: &[u8]) {
118 let prefix = kmer[..self.k - 1].to_vec();
119 let suffix = kmer[1..].to_vec();
120 self.edges.entry(prefix).or_default().push(suffix.clone());
121 self.edges.entry(suffix).or_default();
123 *self.kmer_counts.entry(kmer.to_vec()).or_insert(0) += 1;
124 }
125
126 pub fn node_count(&self) -> usize {
128 self.edges.len()
129 }
130
131 pub fn edge_count(&self) -> usize {
133 self.kmer_counts.len()
134 }
135
136 pub fn contains_kmer(&self, kmer: &[u8]) -> bool {
138 let upper: Vec<u8> = kmer.iter().map(|b| b.to_ascii_uppercase()).collect();
139 self.kmer_counts.contains_key(&upper)
140 }
141
142 pub fn unitigs(&self) -> Vec<Unitig> {
147 let mut in_degree: BTreeMap<&Vec<u8>, usize> = BTreeMap::new();
149 for (_, successors) in &self.edges {
150 for s in successors {
151 *in_degree.entry(s).or_insert(0) += 1;
152 }
153 }
154
155 let out_degree = |node: &Vec<u8>| -> usize {
156 self.edges.get(node).map_or(0, |v| v.len())
157 };
158 let in_deg = |node: &Vec<u8>| -> usize { in_degree.get(node).copied().unwrap_or(0) };
159
160 let is_start = |node: &Vec<u8>| -> bool {
162 in_deg(node) != 1 || out_degree(node) != 1
163 };
164
165 let mut visited: BTreeMap<(Vec<u8>, Vec<u8>), bool> = BTreeMap::new();
166 let mut unitigs = Vec::new();
167
168 let nodes: Vec<Vec<u8>> = self.edges.keys().cloned().collect();
170 for node in &nodes {
171 if !is_start(node) {
172 continue;
173 }
174 let successors = match self.edges.get(node) {
175 Some(s) => s.clone(),
176 None => continue,
177 };
178 for succ in &successors {
179 let edge_key = (node.clone(), succ.clone());
180 if visited.contains_key(&edge_key) {
181 continue;
182 }
183 visited.insert(edge_key, true);
184
185 let mut path_nodes: Vec<Vec<u8>> = vec![node.clone(), succ.clone()];
187 let mut current = succ.clone();
188
189 while !is_start(¤t) {
191 let next_successors = match self.edges.get(¤t) {
192 Some(s) if s.len() == 1 => s,
193 _ => break,
194 };
195 let next = &next_successors[0];
196 let edge_key = (current.clone(), next.clone());
197 if visited.contains_key(&edge_key) {
198 break;
199 }
200 visited.insert(edge_key, true);
201 path_nodes.push(next.clone());
202 current = next.clone();
203 }
204
205 let mut sequence = path_nodes[0].clone();
207 for p in &path_nodes[1..] {
208 sequence.push(*p.last().unwrap());
209 }
210
211 let k = self.k;
213 let mut total_count = 0usize;
214 let mut n_kmers = 0usize;
215 for window in sequence.windows(k) {
216 if let Some(&c) = self.kmer_counts.get(window) {
217 total_count += c;
218 n_kmers += 1;
219 }
220 }
221 let coverage = if n_kmers > 0 {
222 total_count as f64 / n_kmers as f64
223 } else {
224 0.0
225 };
226
227 unitigs.push(Unitig { sequence, coverage });
228 }
229 }
230
231 for (node, successors) in &self.edges {
234 for succ in successors {
235 let edge_key = (node.clone(), succ.clone());
236 if visited.contains_key(&edge_key) {
237 continue;
238 }
239 visited.insert(edge_key.clone(), true);
240
241 let mut path_nodes: Vec<Vec<u8>> = vec![node.clone(), succ.clone()];
242 let mut current = succ.clone();
243
244 loop {
245 let next_successors = match self.edges.get(¤t) {
246 Some(s) if s.len() == 1 => s,
247 _ => break,
248 };
249 let next = &next_successors[0];
250 let ek = (current.clone(), next.clone());
251 if visited.contains_key(&ek) {
252 break;
253 }
254 visited.insert(ek, true);
255 path_nodes.push(next.clone());
256 current = next.clone();
257 }
258
259 let mut sequence = path_nodes[0].clone();
260 for p in &path_nodes[1..] {
261 sequence.push(*p.last().unwrap());
262 }
263
264 let k = self.k;
265 let mut total_count = 0usize;
266 let mut n_kmers = 0usize;
267 for window in sequence.windows(k) {
268 if let Some(&c) = self.kmer_counts.get(window) {
269 total_count += c;
270 n_kmers += 1;
271 }
272 }
273 let coverage = if n_kmers > 0 {
274 total_count as f64 / n_kmers as f64
275 } else {
276 0.0
277 };
278
279 unitigs.push(Unitig { sequence, coverage });
280 }
281 }
282
283 unitigs
284 }
285}
286
287#[cfg(test)]
288mod tests {
289 use super::*;
290
291 #[test]
292 fn debruijn_simple_sequence() {
293 let graph = DeBruijnGraph::from_sequences(&[b"ACGTACGT"], 3).unwrap();
295 assert!(graph.node_count() > 0);
296 assert!(graph.edge_count() > 0);
297 }
298
299 #[test]
300 fn debruijn_node_edge_counts() {
301 let graph = DeBruijnGraph::from_sequences(&[b"ACGT"], 3).unwrap();
304 assert_eq!(graph.edge_count(), 2);
305 assert_eq!(graph.node_count(), 3);
306 }
307
308 #[test]
309 fn debruijn_contains_kmer() {
310 let graph = DeBruijnGraph::from_sequences(&[b"ACGTACGT"], 3).unwrap();
311 assert!(graph.contains_kmer(b"ACG"));
312 assert!(graph.contains_kmer(b"CGT"));
313 assert!(!graph.contains_kmer(b"AAA"));
314 }
315
316 #[test]
317 fn unitig_extraction_simple() {
318 let graph = DeBruijnGraph::from_sequences(&[b"ACGTACGT"], 3).unwrap();
320 let unitigs = graph.unitigs();
321 assert!(!unitigs.is_empty());
322 let total_len: usize = unitigs.iter().map(|u| u.sequence.len()).sum();
324 assert!(total_len >= 3); }
326
327 #[test]
328 fn unitig_coverage_correct() {
329 let graph =
331 DeBruijnGraph::from_sequences(&[b"ACGT", b"ACGT"], 3).unwrap();
332 let unitigs = graph.unitigs();
333 assert!(!unitigs.is_empty());
334 for u in &unitigs {
335 assert!((u.coverage - 2.0).abs() < 1e-10);
336 }
337 }
338}