Skip to main content

rsomics_debruijn/
lib.rs

1#![allow(clippy::missing_errors_doc, clippy::missing_panics_doc)]
2
3use std::collections::{HashMap, HashSet};
4
5use rsomics_kmer::encode::{Kmer, canonical};
6use rsomics_kmer::iter::KmerIter;
7
8#[cfg(test)]
9use rsomics_kmer::encode::decode;
10
11#[derive(Debug, thiserror::Error)]
12#[non_exhaustive]
13pub enum DbgError {
14    #[error("k must be in 1..=32 (got {0})")]
15    KOutOfRange(usize),
16    #[error("k-mer iteration: {0}")]
17    KmerIter(#[from] rsomics_kmer::KmerError),
18}
19
20pub type Result<T> = std::result::Result<T, DbgError>;
21
22#[derive(Debug, Clone, Copy)]
23pub struct DbgNode {
24    pub kmer: Kmer,
25    pub in_degree: u8,
26    pub out_degree: u8,
27}
28
29/// Canonical de Bruijn graph: nodes are canonical (k-1)-mers; each input
30/// k-mer contributes one edge `prefix(k-1) → suffix(k-1)`. Storing nodes
31/// in canonical form means a sequence and its reverse-complement share
32/// the same graph (matches megahit / spades defaults).
33#[derive(Debug, Clone, Default)]
34pub struct Dbg {
35    pub k: usize,
36    pub nodes: HashMap<Kmer, DbgNode>,
37    pub edges: HashSet<(Kmer, Kmer)>,
38}
39
40impl Dbg {
41    pub fn build(seqs: &[&[u8]], k: usize) -> Result<Self> {
42        if !(2..=32).contains(&k) {
43            return Err(DbgError::KOutOfRange(k));
44        }
45        let mut dbg = Self {
46            k,
47            ..Self::default()
48        };
49        for seq in seqs {
50            if seq.len() < k {
51                continue;
52            }
53            for kmer_res in KmerIter::new(seq, k, false)? {
54                let kmer = match kmer_res {
55                    Ok(k) => k,
56                    Err(rsomics_kmer::KmerError::NonAcgt { .. }) => continue,
57                    Err(e) => return Err(DbgError::KmerIter(e)),
58                };
59                let prefix = prefix_kminus1(kmer, k);
60                let suffix = suffix_kminus1(kmer, k);
61                let prefix_c = canonical(prefix, k - 1);
62                let suffix_c = canonical(suffix, k - 1);
63                dbg.nodes.entry(prefix_c).or_insert_with(|| DbgNode {
64                    kmer: prefix_c,
65                    in_degree: 0,
66                    out_degree: 0,
67                });
68                dbg.nodes.entry(suffix_c).or_insert_with(|| DbgNode {
69                    kmer: suffix_c,
70                    in_degree: 0,
71                    out_degree: 0,
72                });
73                if dbg.edges.insert((prefix_c, suffix_c)) {
74                    dbg.nodes.get_mut(&prefix_c).unwrap().out_degree += 1;
75                    dbg.nodes.get_mut(&suffix_c).unwrap().in_degree += 1;
76                }
77            }
78        }
79        Ok(dbg)
80    }
81
82    #[must_use]
83    pub fn n_nodes(&self) -> usize {
84        self.nodes.len()
85    }
86    #[must_use]
87    pub fn n_edges(&self) -> usize {
88        self.edges.len()
89    }
90
91    /// Extract unitigs: maximal non-branching paths. Each unitig is a
92    /// vector of (k-1)-mer keys; downstream tools materialise them as
93    /// sequences by joining via the shared (k-2)-suffix/prefix overlap.
94    /// (Same definition as megahit / spades.)
95    #[must_use]
96    pub fn unitigs(&self) -> Vec<Vec<Kmer>> {
97        // Map per-node successors / predecessors derived from edges.
98        let mut succ: HashMap<Kmer, Vec<Kmer>> = HashMap::new();
99        let mut pred: HashMap<Kmer, Vec<Kmer>> = HashMap::new();
100        for &(a, b) in &self.edges {
101            succ.entry(a).or_default().push(b);
102            pred.entry(b).or_default().push(a);
103        }
104        let is_branching = |k: &Kmer| -> bool {
105            let s = succ.get(k).map_or(0, Vec::len);
106            let p = pred.get(k).map_or(0, Vec::len);
107            !(s == 1 && p == 1)
108        };
109
110        let mut visited: HashSet<Kmer> = HashSet::new();
111        let mut out: Vec<Vec<Kmer>> = Vec::new();
112        // Branching / tip starts.
113        for &start in self.nodes.keys() {
114            if visited.contains(&start) || !is_branching(&start) {
115                continue;
116            }
117            let outs = succ.get(&start).cloned().unwrap_or_default();
118            for next_start in outs {
119                if visited.contains(&next_start) && is_branching(&next_start) {
120                    continue;
121                }
122                let mut path = vec![start];
123                let mut cur = next_start;
124                loop {
125                    path.push(cur);
126                    visited.insert(cur);
127                    if is_branching(&cur) {
128                        break;
129                    }
130                    let n = succ.get(&cur).and_then(|v| v.first().copied());
131                    match n {
132                        Some(n) => cur = n,
133                        None => break,
134                    }
135                }
136                out.push(path);
137            }
138        }
139        // Pure cycles (no branching node anywhere) — pick any start.
140        for &n in self.nodes.keys() {
141            if visited.contains(&n) {
142                continue;
143            }
144            let mut path = vec![n];
145            let mut cur = n;
146            visited.insert(n);
147            while let Some(nx) = succ.get(&cur).and_then(|v| v.first().copied()) {
148                if visited.contains(&nx) {
149                    break;
150                }
151                path.push(nx);
152                visited.insert(nx);
153                cur = nx;
154            }
155            out.push(path);
156        }
157        out
158    }
159}
160
161fn prefix_kminus1(kmer: Kmer, _k: usize) -> Kmer {
162    // Drop the last 2-bit base.
163    kmer >> 2
164}
165
166fn suffix_kminus1(kmer: Kmer, k: usize) -> Kmer {
167    // Keep only the bottom (k-1) bases.
168    let mask: u64 = if k == 1 {
169        0
170    } else {
171        (1u64 << (2 * (k - 1))) - 1
172    };
173    kmer & mask
174}
175
176#[cfg(test)]
177mod tests {
178    use super::*;
179
180    #[test]
181    fn build_linear_seq_yields_n_minus_k_plus_1_edges() {
182        // Seq "ACGTACGT" (8bp) with k=4 → 5 k-mers → 5 edges.
183        let seqs: &[&[u8]] = &[b"ACGTACGT"];
184        let dbg = Dbg::build(seqs, 4).unwrap();
185        // With canonical-folding, palindromic k-mers may coalesce; just
186        // check we got at least k-mer-count - 1 edges.
187        assert!(dbg.n_edges() >= 1);
188        assert!(dbg.n_nodes() >= 2);
189    }
190
191    #[test]
192    fn build_skips_n_kmers() {
193        // "ACGTNACGT" with k=4: only windows at offsets 0 and 5 are N-free,
194        // both `ACGT`. After canonical-folding their prefix/suffix collapse
195        // to a single (k-1)-mer node with a self-loop edge.
196        let seqs: &[&[u8]] = &[b"ACGTNACGT"];
197        let dbg = Dbg::build(seqs, 4).unwrap();
198        assert!(dbg.n_edges() >= 1);
199        assert!(dbg.n_nodes() >= 1);
200    }
201
202    #[test]
203    fn build_rejects_k_out_of_range() {
204        assert!(matches!(
205            Dbg::build(&[b"ACGT"], 1),
206            Err(DbgError::KOutOfRange(1))
207        ));
208        assert!(matches!(
209            Dbg::build(&[b"ACGT"], 33),
210            Err(DbgError::KOutOfRange(33))
211        ));
212    }
213
214    #[test]
215    fn empty_seqs_yield_empty_dbg() {
216        let dbg = Dbg::build(&[], 4).unwrap();
217        assert_eq!(dbg.n_nodes(), 0);
218        assert_eq!(dbg.n_edges(), 0);
219    }
220
221    #[test]
222    fn unitigs_collapse_linear_paths() {
223        // Long random-looking seq → 1+ linear paths from non-branching nodes.
224        let seq: &[u8] = b"ACGTAGCTAGCTGATCGATCAGCT";
225        let dbg = Dbg::build(&[seq], 5).unwrap();
226        let unitigs = dbg.unitigs();
227        // The unitig count should be ≥ 1 and total length should equal n_nodes.
228        let total_visited: usize = unitigs.iter().map(Vec::len).sum();
229        assert!(
230            total_visited >= dbg.n_nodes(),
231            "{total_visited} ≥ {}",
232            dbg.n_nodes()
233        );
234    }
235
236    #[test]
237    fn prefix_and_suffix_helpers_work_for_4mer() {
238        let kmer = rsomics_kmer::encode::encode(b"ACGT").unwrap();
239        let pref = prefix_kminus1(kmer, 4);
240        let suff = suffix_kminus1(kmer, 4);
241        assert_eq!(decode(pref, 3), b"ACG".to_vec());
242        assert_eq!(decode(suff, 3), b"CGT".to_vec());
243    }
244}