algorithms_edu/problems/graph/
reconstruct_string_from_lmers.rs

1//! # Problem
2//!
3//! Reconstruct the original string from all substrings of length `l` of the string.
4//! For example, reconstruct `"ATGCAGGT"` from `["ATG", "TGC", "GCA", "CAG", "AGG", "GGT"]`.
5//! (note that the collection of *l*-mers is disordered)
6//!
7//! # Strategy
8//!
9//! This problem can be solved by finding a Eulerian path through the graph where vertices
10//! represent `l`-mers and edges represent `l-1` overlaps among the `l`-mers. For example,
11//! between two vertices `ATG` and `TGC`, there is a directed edge pointing from `ATG` to
12//! `TGC`, because they have an `l-1`-mer` overlap, i.e. `"ATG"[1..] == "TGC"[.."TGC".len() - 1]`
13
14use crate::algo::graph::eulerian_path::*;
15use crate::algo::graph::UnweightedAdjacencyList;
16
17pub fn reconstruct_string(lmers: &[&[u8]]) -> Result<Vec<u8>, EulerianPathError> {
18    let l = lmers[0].len();
19    let mut g = UnweightedAdjacencyList::with_size(lmers.len());
20    let suffixes: Vec<_> = lmers.iter().map(|lmer| &lmer[1..]).collect();
21    let prefixes: Vec<_> = lmers.iter().map(|lmer| &lmer[..l - 1]).collect();
22    for (i, suffix) in suffixes.iter().enumerate() {
23        for (j, prefix) in prefixes.iter().enumerate() {
24            if suffix == prefix {
25                g.add_directed_edge(i, j);
26            }
27        }
28    }
29
30    let path = g.eulerian_path()?;
31    let mut res = lmers[path[0]].to_owned();
32    for &i in &path[1..] {
33        res.push(lmers[i][l - 1]);
34    }
35    Ok(res)
36}
37
38#[cfg(test)]
39mod tests {
40    use super::*;
41    use rand::seq::SliceRandom;
42    use rand::thread_rng;
43
44    fn _test(s: &[u8]) {
45        let mut lmers: Vec<_> = s.windows(5).collect();
46        let mut rng = thread_rng();
47        lmers.shuffle(&mut rng);
48
49        let reconstructed = reconstruct_string(&lmers).unwrap();
50
51        assert_eq!(&reconstructed, &s);
52    }
53
54    #[test]
55    fn test_reconstruct_string_from_lmers() {
56        _test(b"The quick brown fox jumps over the lazy dog");
57        _test(b"ATGGCGTGCA")
58    }
59}