libjp/
chain_graggle.rs

1// Copyright 2018-2019 Joe Neeman.
2//
3// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6// option. This file may not be copied, modified, or distributed
7// except according to those terms.
8//
9// See the LICENSE-APACHE or LICENSE-MIT files at the top-level directory
10// of this distribution.
11
12use graph::Graph;
13use itertools::Itertools;
14use multimap::MMap;
15use std::collections::{BTreeMap, HashSet};
16
17use crate::NodeId;
18
19/// A version of a [`Graggle`](crate::Graggle) that has been decomposed into "chains" (for example, for
20/// prettier rendering).
21///
22/// Most graggles that you'll see in practice don't look like general directed graphs. Rather, they
23/// contain lots of long "chains," i.e., sequences of nodes, each of which has exactly one
24/// in-neighbor (the previous in the sequence) and one out-neighbor (the next). For example, a
25/// totally ordered graggle (a.k.a. a file) just consists of one long chain.
26///
27/// This struct represents the same graph as a graggle, except that every chain has been "collapsed"
28/// into a single node. That is, you can think of a `ChainGraggle` as a graph in which every node
29/// represents a chain (possibly of length 1) in the original graph.
30#[derive(Clone, Debug, Deserialize, Serialize)]
31pub struct ChainGraggle {
32    // TODO: allow retrieving liveness of NodeIds and type of edges.
33    chains: Vec<Vec<NodeId>>,
34    edges: MMap<usize, usize>,
35    clusters: Vec<HashSet<usize>>,
36}
37
38// Assumes that `node` is not part of a cycle. Therefore, it is on a chain if and only if it
39// has at most one in-neighbor and at most one out-neighbor.
40fn on_chain<G: Graph>(g: &G, node: &G::Node) -> bool {
41    g.out_edges(node).take(2).count() <= 1 && g.in_edges(node).take(2).count() <= 1
42}
43
44// Follows a chain backwards to find the first node on it.
45fn chain_first<G: Graph>(g: &G, node: &G::Node) -> G::Node {
46    if !on_chain(g, node) {
47        *node
48    } else {
49        let mut ret = *node;
50        while let Some(prev) = g.in_neighbors(&ret).next() {
51            if on_chain(g, &prev) {
52                ret = prev;
53            } else {
54                return ret;
55            }
56        }
57        ret
58    }
59}
60
61fn collect_chain<G: Graph>(g: &G, first: &G::Node) -> Vec<G::Node> {
62    let mut ret = Vec::new();
63    let mut cur = *first;
64    ret.push(cur);
65    if !on_chain(g, &cur) {
66        return ret;
67    }
68
69    while let Some(next) = g.out_neighbors(&cur).next() {
70        if on_chain(g, &next) {
71            ret.push(next);
72            cur = next;
73        } else {
74            break;
75        }
76    }
77    ret
78}
79
80impl ChainGraggle {
81    /// How many chains are there?
82    pub fn num_chains(&self) -> usize {
83        self.chains.len()
84    }
85
86    /// Returns the sequence of `NodeId`s making up the chain at index `i`.
87    pub fn chain(&self, i: usize) -> &[NodeId] {
88        &self.chains[i]
89    }
90
91    /// Returns an iterator over strongly connected components of the original graph.
92    pub fn clusters(&self) -> impl Iterator<Item = &HashSet<usize>> {
93        self.clusters.iter()
94    }
95
96    /// Given a graph, decompose it into a `ChainGraggle`.
97    pub fn from_graph<G: Graph<Node = NodeId>>(g: G) -> ChainGraggle
98    where
99        G::Edge: graph::Edge<NodeId>,
100    {
101        let sccs = g.tarjan();
102
103        // The collection of all nodes whose SCC has only one component. These are the ones that
104        // can potentially belong to chains.
105        let mut singles = sccs
106            .parts()
107            .filter(|part| part.len() == 1)
108            .flat_map(|part| part.iter())
109            .collect::<HashSet<_>>();
110
111        // An iterator over nodes in large SCCs.
112        let (others1, others2) = sccs
113            .parts()
114            .filter(|part| part.len() > 1)
115            .flat_map(|part| part.iter())
116            .cloned()
117            .tee();
118
119        // All nodes in larger SCCs get added to the final graph as length-1 chains.
120        let mut chains = others1.map(|node| vec![node]).collect::<Vec<_>>();
121
122        // Map going from NodeId to index in the Vec<Node>.
123        let mut node_part = others2
124            .enumerate()
125            .map(|(x, y)| (y, x))
126            .collect::<BTreeMap<NodeId, usize>>();
127
128        while !singles.is_empty() {
129            let u = chain_first(&g, singles.iter().next().unwrap());
130            let chain = collect_chain(&g, &u);
131            for v in &chain {
132                singles.remove(v);
133                node_part.insert(*v, chains.len());
134            }
135            chains.push(chain);
136        }
137
138        let mut edges = MMap::new();
139
140        for u in g.nodes() {
141            for v in g.out_neighbors(&u) {
142                let u_idx = node_part[&u];
143                let v_idx = node_part[&v];
144
145                // Nodes with the same index belong to the same chain, so we don't need to store an
146                // edge between them.
147                if u_idx != v_idx {
148                    edges.insert(u_idx, v_idx);
149                }
150            }
151        }
152
153        let clusters = sccs
154            .parts()
155            .filter(|part| part.len() > 1)
156            .map(|part| part.iter().map(|id| node_part[id]).collect::<HashSet<_>>())
157            .collect::<Vec<_>>();
158
159        ChainGraggle {
160            chains,
161            edges,
162            clusters,
163        }
164    }
165}
166
167/// In this implementation of [`graph::Graph`], the nodes are (semantically) chains in the original
168/// graggle. More precisely, they are `usize`s that you can use to get the chain with
169/// [`ChainGraggle::chain`].
170impl Graph for ChainGraggle {
171    type Node = usize;
172    type Edge = usize;
173
174    fn nodes(&'_ self) -> Box<dyn Iterator<Item = usize> + '_> {
175        Box::new(0..self.chains.len())
176    }
177
178    fn out_edges(&'_ self, u: &usize) -> Box<dyn Iterator<Item = usize> + '_> {
179        Box::new(self.edges.get(u).cloned())
180    }
181
182    // TODO: consider removing in_edges from the Graph trait and making it part of a different
183    // trait.
184    fn in_edges(&'_ self, _u: &usize) -> Box<dyn Iterator<Item = usize> + '_> {
185        panic!("in-edges not implemented for this graph");
186    }
187}
188
189#[cfg(test)]
190mod tests {
191    use std::collections::HashSet;
192
193    use super::*;
194    use crate::storage::graggle::tests::arb_live_graggle;
195
196    #[test]
197    fn diamond() {
198        let graggle = graggle!(
199            live: 0, 1, 2, 3
200            edges: 0-1, 0-2, 1-3, 2-3
201        );
202        let decomp = ChainGraggle::from_graph(graggle.as_graggle().as_live_graph());
203        assert_eq!(decomp.chains.len(), 4);
204        for ch in &decomp.chains {
205            assert_eq!(ch.len(), 1);
206        }
207    }
208
209    proptest! {
210        // Checks that the chains of the decomposition form a partition of the original node set.
211        #[test]
212        fn partition(ref d in arb_live_graggle(20)) {
213            let decomp = ChainGraggle::from_graph(d.as_graggle().as_live_graph());
214            let decomp_nodes = decomp.chains.iter().flat_map(|chain| chain.iter()).cloned().collect::<Vec<_>>();
215            let decomp_node_set = decomp_nodes.iter().cloned().collect::<HashSet<_>>();
216
217            // Check that there are no repeated nodes.
218            assert_eq!(decomp_nodes.len(), decomp_node_set.len());
219
220            // Check that we got all the nodes once.
221            assert_eq!(decomp_nodes.len(), d.as_graggle().nodes().count());
222        }
223    }
224}