rsomics-debruijn 0.1.0

de Bruijn graph types + linear-path collapse + unitig extraction for the rsomics-* tool family. Layer A primitive.
Documentation
#![allow(clippy::missing_errors_doc, clippy::missing_panics_doc)]

use std::collections::{HashMap, HashSet};

use rsomics_kmer::encode::{Kmer, canonical};
use rsomics_kmer::iter::KmerIter;

#[cfg(test)]
use rsomics_kmer::encode::decode;

#[derive(Debug, thiserror::Error)]
#[non_exhaustive]
pub enum DbgError {
    #[error("k must be in 1..=32 (got {0})")]
    KOutOfRange(usize),
    #[error("k-mer iteration: {0}")]
    KmerIter(#[from] rsomics_kmer::KmerError),
}

pub type Result<T> = std::result::Result<T, DbgError>;

#[derive(Debug, Clone, Copy)]
pub struct DbgNode {
    pub kmer: Kmer,
    pub in_degree: u8,
    pub out_degree: u8,
}

/// Canonical de Bruijn graph: nodes are canonical (k-1)-mers; each input
/// k-mer contributes one edge `prefix(k-1) → suffix(k-1)`. Storing nodes
/// in canonical form means a sequence and its reverse-complement share
/// the same graph (matches megahit / spades defaults).
#[derive(Debug, Clone, Default)]
pub struct Dbg {
    pub k: usize,
    pub nodes: HashMap<Kmer, DbgNode>,
    pub edges: HashSet<(Kmer, Kmer)>,
}

impl Dbg {
    pub fn build(seqs: &[&[u8]], k: usize) -> Result<Self> {
        if !(2..=32).contains(&k) {
            return Err(DbgError::KOutOfRange(k));
        }
        let mut dbg = Self {
            k,
            ..Self::default()
        };
        for seq in seqs {
            if seq.len() < k {
                continue;
            }
            for kmer_res in KmerIter::new(seq, k, false)? {
                let kmer = match kmer_res {
                    Ok(k) => k,
                    Err(rsomics_kmer::KmerError::NonAcgt { .. }) => continue,
                    Err(e) => return Err(DbgError::KmerIter(e)),
                };
                let prefix = prefix_kminus1(kmer, k);
                let suffix = suffix_kminus1(kmer, k);
                let prefix_c = canonical(prefix, k - 1);
                let suffix_c = canonical(suffix, k - 1);
                dbg.nodes.entry(prefix_c).or_insert_with(|| DbgNode {
                    kmer: prefix_c,
                    in_degree: 0,
                    out_degree: 0,
                });
                dbg.nodes.entry(suffix_c).or_insert_with(|| DbgNode {
                    kmer: suffix_c,
                    in_degree: 0,
                    out_degree: 0,
                });
                if dbg.edges.insert((prefix_c, suffix_c)) {
                    dbg.nodes.get_mut(&prefix_c).unwrap().out_degree += 1;
                    dbg.nodes.get_mut(&suffix_c).unwrap().in_degree += 1;
                }
            }
        }
        Ok(dbg)
    }

    #[must_use]
    pub fn n_nodes(&self) -> usize {
        self.nodes.len()
    }
    #[must_use]
    pub fn n_edges(&self) -> usize {
        self.edges.len()
    }

    /// Extract unitigs: maximal non-branching paths. Each unitig is a
    /// vector of (k-1)-mer keys; downstream tools materialise them as
    /// sequences by joining via the shared (k-2)-suffix/prefix overlap.
    /// (Same definition as megahit / spades.)
    #[must_use]
    pub fn unitigs(&self) -> Vec<Vec<Kmer>> {
        // Map per-node successors / predecessors derived from edges.
        let mut succ: HashMap<Kmer, Vec<Kmer>> = HashMap::new();
        let mut pred: HashMap<Kmer, Vec<Kmer>> = HashMap::new();
        for &(a, b) in &self.edges {
            succ.entry(a).or_default().push(b);
            pred.entry(b).or_default().push(a);
        }
        let is_branching = |k: &Kmer| -> bool {
            let s = succ.get(k).map_or(0, Vec::len);
            let p = pred.get(k).map_or(0, Vec::len);
            !(s == 1 && p == 1)
        };

        let mut visited: HashSet<Kmer> = HashSet::new();
        let mut out: Vec<Vec<Kmer>> = Vec::new();
        // Branching / tip starts.
        for &start in self.nodes.keys() {
            if visited.contains(&start) || !is_branching(&start) {
                continue;
            }
            let outs = succ.get(&start).cloned().unwrap_or_default();
            for next_start in outs {
                if visited.contains(&next_start) && is_branching(&next_start) {
                    continue;
                }
                let mut path = vec![start];
                let mut cur = next_start;
                loop {
                    path.push(cur);
                    visited.insert(cur);
                    if is_branching(&cur) {
                        break;
                    }
                    let n = succ.get(&cur).and_then(|v| v.first().copied());
                    match n {
                        Some(n) => cur = n,
                        None => break,
                    }
                }
                out.push(path);
            }
        }
        // Pure cycles (no branching node anywhere) — pick any start.
        for &n in self.nodes.keys() {
            if visited.contains(&n) {
                continue;
            }
            let mut path = vec![n];
            let mut cur = n;
            visited.insert(n);
            while let Some(nx) = succ.get(&cur).and_then(|v| v.first().copied()) {
                if visited.contains(&nx) {
                    break;
                }
                path.push(nx);
                visited.insert(nx);
                cur = nx;
            }
            out.push(path);
        }
        out
    }
}

fn prefix_kminus1(kmer: Kmer, _k: usize) -> Kmer {
    // Drop the last 2-bit base.
    kmer >> 2
}

fn suffix_kminus1(kmer: Kmer, k: usize) -> Kmer {
    // Keep only the bottom (k-1) bases.
    let mask: u64 = if k == 1 {
        0
    } else {
        (1u64 << (2 * (k - 1))) - 1
    };
    kmer & mask
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn build_linear_seq_yields_n_minus_k_plus_1_edges() {
        // Seq "ACGTACGT" (8bp) with k=4 → 5 k-mers → 5 edges.
        let seqs: &[&[u8]] = &[b"ACGTACGT"];
        let dbg = Dbg::build(seqs, 4).unwrap();
        // With canonical-folding, palindromic k-mers may coalesce; just
        // check we got at least k-mer-count - 1 edges.
        assert!(dbg.n_edges() >= 1);
        assert!(dbg.n_nodes() >= 2);
    }

    #[test]
    fn build_skips_n_kmers() {
        // "ACGTNACGT" with k=4: only windows at offsets 0 and 5 are N-free,
        // both `ACGT`. After canonical-folding their prefix/suffix collapse
        // to a single (k-1)-mer node with a self-loop edge.
        let seqs: &[&[u8]] = &[b"ACGTNACGT"];
        let dbg = Dbg::build(seqs, 4).unwrap();
        assert!(dbg.n_edges() >= 1);
        assert!(dbg.n_nodes() >= 1);
    }

    #[test]
    fn build_rejects_k_out_of_range() {
        assert!(matches!(
            Dbg::build(&[b"ACGT"], 1),
            Err(DbgError::KOutOfRange(1))
        ));
        assert!(matches!(
            Dbg::build(&[b"ACGT"], 33),
            Err(DbgError::KOutOfRange(33))
        ));
    }

    #[test]
    fn empty_seqs_yield_empty_dbg() {
        let dbg = Dbg::build(&[], 4).unwrap();
        assert_eq!(dbg.n_nodes(), 0);
        assert_eq!(dbg.n_edges(), 0);
    }

    #[test]
    fn unitigs_collapse_linear_paths() {
        // Long random-looking seq → 1+ linear paths from non-branching nodes.
        let seq: &[u8] = b"ACGTAGCTAGCTGATCGATCAGCT";
        let dbg = Dbg::build(&[seq], 5).unwrap();
        let unitigs = dbg.unitigs();
        // The unitig count should be ≥ 1 and total length should equal n_nodes.
        let total_visited: usize = unitigs.iter().map(Vec::len).sum();
        assert!(
            total_visited >= dbg.n_nodes(),
            "{total_visited} ≥ {}",
            dbg.n_nodes()
        );
    }

    #[test]
    fn prefix_and_suffix_helpers_work_for_4mer() {
        let kmer = rsomics_kmer::encode::encode(b"ACGT").unwrap();
        let pref = prefix_kminus1(kmer, 4);
        let suff = suffix_kminus1(kmer, 4);
        assert_eq!(decode(pref, 3), b"ACG".to_vec());
        assert_eq!(decode(suff, 3), b"CGT".to_vec());
    }
}