1#![allow(clippy::missing_errors_doc, clippy::missing_panics_doc)]
2
3use std::collections::{HashMap, HashSet};
4
5use rsomics_kmer::encode::{Kmer, canonical};
6use rsomics_kmer::iter::KmerIter;
7
8#[cfg(test)]
9use rsomics_kmer::encode::decode;
10
11#[derive(Debug, thiserror::Error)]
12#[non_exhaustive]
13pub enum DbgError {
14 #[error("k must be in 1..=32 (got {0})")]
15 KOutOfRange(usize),
16 #[error("k-mer iteration: {0}")]
17 KmerIter(#[from] rsomics_kmer::KmerError),
18}
19
20pub type Result<T> = std::result::Result<T, DbgError>;
21
22#[derive(Debug, Clone, Copy)]
23pub struct DbgNode {
24 pub kmer: Kmer,
25 pub in_degree: u8,
26 pub out_degree: u8,
27}
28
29#[derive(Debug, Clone, Default)]
34pub struct Dbg {
35 pub k: usize,
36 pub nodes: HashMap<Kmer, DbgNode>,
37 pub edges: HashSet<(Kmer, Kmer)>,
38}
39
40impl Dbg {
41 pub fn build(seqs: &[&[u8]], k: usize) -> Result<Self> {
42 if !(2..=32).contains(&k) {
43 return Err(DbgError::KOutOfRange(k));
44 }
45 let mut dbg = Self {
46 k,
47 ..Self::default()
48 };
49 for seq in seqs {
50 if seq.len() < k {
51 continue;
52 }
53 for kmer_res in KmerIter::new(seq, k, false)? {
54 let kmer = match kmer_res {
55 Ok(k) => k,
56 Err(rsomics_kmer::KmerError::NonAcgt { .. }) => continue,
57 Err(e) => return Err(DbgError::KmerIter(e)),
58 };
59 let prefix = prefix_kminus1(kmer, k);
60 let suffix = suffix_kminus1(kmer, k);
61 let prefix_c = canonical(prefix, k - 1);
62 let suffix_c = canonical(suffix, k - 1);
63 dbg.nodes.entry(prefix_c).or_insert_with(|| DbgNode {
64 kmer: prefix_c,
65 in_degree: 0,
66 out_degree: 0,
67 });
68 dbg.nodes.entry(suffix_c).or_insert_with(|| DbgNode {
69 kmer: suffix_c,
70 in_degree: 0,
71 out_degree: 0,
72 });
73 if dbg.edges.insert((prefix_c, suffix_c)) {
74 dbg.nodes.get_mut(&prefix_c).unwrap().out_degree += 1;
75 dbg.nodes.get_mut(&suffix_c).unwrap().in_degree += 1;
76 }
77 }
78 }
79 Ok(dbg)
80 }
81
82 #[must_use]
83 pub fn n_nodes(&self) -> usize {
84 self.nodes.len()
85 }
86 #[must_use]
87 pub fn n_edges(&self) -> usize {
88 self.edges.len()
89 }
90
91 #[must_use]
96 pub fn unitigs(&self) -> Vec<Vec<Kmer>> {
97 let mut succ: HashMap<Kmer, Vec<Kmer>> = HashMap::new();
99 let mut pred: HashMap<Kmer, Vec<Kmer>> = HashMap::new();
100 for &(a, b) in &self.edges {
101 succ.entry(a).or_default().push(b);
102 pred.entry(b).or_default().push(a);
103 }
104 let is_branching = |k: &Kmer| -> bool {
105 let s = succ.get(k).map_or(0, Vec::len);
106 let p = pred.get(k).map_or(0, Vec::len);
107 !(s == 1 && p == 1)
108 };
109
110 let mut visited: HashSet<Kmer> = HashSet::new();
111 let mut out: Vec<Vec<Kmer>> = Vec::new();
112 for &start in self.nodes.keys() {
114 if visited.contains(&start) || !is_branching(&start) {
115 continue;
116 }
117 let outs = succ.get(&start).cloned().unwrap_or_default();
118 for next_start in outs {
119 if visited.contains(&next_start) && is_branching(&next_start) {
120 continue;
121 }
122 let mut path = vec![start];
123 let mut cur = next_start;
124 loop {
125 path.push(cur);
126 visited.insert(cur);
127 if is_branching(&cur) {
128 break;
129 }
130 let n = succ.get(&cur).and_then(|v| v.first().copied());
131 match n {
132 Some(n) => cur = n,
133 None => break,
134 }
135 }
136 out.push(path);
137 }
138 }
139 for &n in self.nodes.keys() {
141 if visited.contains(&n) {
142 continue;
143 }
144 let mut path = vec![n];
145 let mut cur = n;
146 visited.insert(n);
147 while let Some(nx) = succ.get(&cur).and_then(|v| v.first().copied()) {
148 if visited.contains(&nx) {
149 break;
150 }
151 path.push(nx);
152 visited.insert(nx);
153 cur = nx;
154 }
155 out.push(path);
156 }
157 out
158 }
159}
160
161fn prefix_kminus1(kmer: Kmer, _k: usize) -> Kmer {
162 kmer >> 2
164}
165
166fn suffix_kminus1(kmer: Kmer, k: usize) -> Kmer {
167 let mask: u64 = if k == 1 {
169 0
170 } else {
171 (1u64 << (2 * (k - 1))) - 1
172 };
173 kmer & mask
174}
175
176#[cfg(test)]
177mod tests {
178 use super::*;
179
180 #[test]
181 fn build_linear_seq_yields_n_minus_k_plus_1_edges() {
182 let seqs: &[&[u8]] = &[b"ACGTACGT"];
184 let dbg = Dbg::build(seqs, 4).unwrap();
185 assert!(dbg.n_edges() >= 1);
188 assert!(dbg.n_nodes() >= 2);
189 }
190
191 #[test]
192 fn build_skips_n_kmers() {
193 let seqs: &[&[u8]] = &[b"ACGTNACGT"];
197 let dbg = Dbg::build(seqs, 4).unwrap();
198 assert!(dbg.n_edges() >= 1);
199 assert!(dbg.n_nodes() >= 1);
200 }
201
202 #[test]
203 fn build_rejects_k_out_of_range() {
204 assert!(matches!(
205 Dbg::build(&[b"ACGT"], 1),
206 Err(DbgError::KOutOfRange(1))
207 ));
208 assert!(matches!(
209 Dbg::build(&[b"ACGT"], 33),
210 Err(DbgError::KOutOfRange(33))
211 ));
212 }
213
214 #[test]
215 fn empty_seqs_yield_empty_dbg() {
216 let dbg = Dbg::build(&[], 4).unwrap();
217 assert_eq!(dbg.n_nodes(), 0);
218 assert_eq!(dbg.n_edges(), 0);
219 }
220
221 #[test]
222 fn unitigs_collapse_linear_paths() {
223 let seq: &[u8] = b"ACGTAGCTAGCTGATCGATCAGCT";
225 let dbg = Dbg::build(&[seq], 5).unwrap();
226 let unitigs = dbg.unitigs();
227 let total_visited: usize = unitigs.iter().map(Vec::len).sum();
229 assert!(
230 total_visited >= dbg.n_nodes(),
231 "{total_visited} ≥ {}",
232 dbg.n_nodes()
233 );
234 }
235
236 #[test]
237 fn prefix_and_suffix_helpers_work_for_4mer() {
238 let kmer = rsomics_kmer::encode::encode(b"ACGT").unwrap();
239 let pref = prefix_kminus1(kmer, 4);
240 let suff = suffix_kminus1(kmer, 4);
241 assert_eq!(decode(pref, 3), b"ACG".to_vec());
242 assert_eq!(decode(suff, 3), b"CGT".to_vec());
243 }
244}