swh_graph/views/contiguous_subgraph/mod.rs
1// Copyright (C) 2023-2025 The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::collections::HashMap;
7use std::path::Path;
8use std::sync::Arc;
9
10use anyhow::{bail, Result};
11use dsi_progress_logger::{concurrent_progress_logger, ProgressLog};
12use rayon::prelude::*;
13use sux::dict::elias_fano::{EfSeqDict, EliasFanoBuilder};
14use sux::prelude::{IndexedDict, IndexedSeq};
15
16use crate::graph::*;
17use crate::mph::SwhidMphf;
18use crate::properties;
19use crate::{NodeType, OutOfBoundError, SWHID};
20
21mod contents;
22mod iterators;
23mod label_names;
24mod maps;
25mod persons;
26mod strings;
27mod timestamps;
28
29/// Alias for [`IndexedSeq`] + [`IndexedDict`] mapping from [`NodeId`] to [`NodeId`].
30pub trait ContractionBackend:
31 IndexedSeq<Input = NodeId, Output = NodeId> + IndexedDict<Input = NodeId, Output = NodeId>
32{
33}
34
35impl<
36 B: IndexedSeq<Input = NodeId, Output = NodeId> + IndexedDict<Input = NodeId, Output = NodeId>,
37 > ContractionBackend for B
38{
39}
40
41/// See [`ContiguousSubgraph`]
42pub struct Contraction<N: IndexedSeq<Input = NodeId, Output = NodeId>>(pub N);
43
44impl<N: IndexedSeq<Input = NodeId, Output = NodeId>> Contraction<N> {
45 /// Given a node id in a [`ContiguousSubgraph`], returns the corresponding node id
46 /// in the [`ContiguousSubgraph`]'s underlying graph
47 #[inline(always)]
48 pub fn underlying_node_id(&self, self_node: NodeId) -> NodeId {
49 self.0.get(self_node)
50 }
51
52 /// Returns the number of node ids in the [`ContiguousSubgraph`].
53 ///
54 /// Note: this can be an overapproximation if the underlying graph is a subgraph
55 pub fn num_nodes(&self) -> usize {
56 self.0.len()
57 }
58}
59
60impl<N: ContractionBackend> Contraction<N> {
61 /// Given a node id in a [`ContiguousSubgraph`]'s underlying graph, returns the
62 /// corresponding node id in the [`ContiguousSubgraph`]
63 #[inline(always)]
64 pub fn node_id_from_underlying(&self, underlying_node: NodeId) -> Option<NodeId> {
65 self.0.index_of(underlying_node)
66 }
67}
68
69/// A view over [`SwhGraph`] and related traits, that filters out some nodes, and renumbers
70/// remaining nodes so that the set of valid nodes becomes a range.
71///
72/// Due to limitations in the Rust type system, properties available on the underlying
73/// [`SwhGraph`] are not automatically available on [`ContiguousSubgraph`].
74/// They need to be explicitly enabled in a builder-like pattern on `ContiguousSubgraph`
75/// using these methods:
76/// * [`with_maps`][`ContiguousSubgraph::with_maps`],
77/// * [`with_timestamps`][`ContiguousSubgraph::with_timestamps`],
78/// * [`with_persons`][`ContiguousSubgraph::with_persons`],
79/// * [`with_contents`][`ContiguousSubgraph::with_contents`],
80/// * [`with_strings`][`ContiguousSubgraph::with_strings`],
81/// * [`with_label_names`][`ContiguousSubgraph::with_label_names`],
82///
83/// # Examples
84///
85/// Build a `ContiguousSubgraph` made of only content and directory nodes:
86///
87/// ```
88/// use dsi_progress_logger::progress_logger;
89/// use swh_graph::properties;
90/// use swh_graph::NodeConstraint;
91/// use swh_graph::graph::SwhGraphWithProperties;
92/// use swh_graph::views::{ContiguousSubgraph, Contraction, Subgraph, ContractionBackend};
93///
94/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
95/// Subgraph<&'_ G, impl Fn(usize) -> bool + use<'_, G>, fn(usize, usize) -> bool>,
96/// impl ContractionBackend,
97/// properties::NoMaps,
98/// properties::NoTimestamps,
99/// properties::NoPersons,
100/// properties::NoContents,
101/// properties::NoStrings,
102/// properties::NoLabelNames,
103/// >
104/// where G: SwhGraphWithProperties<Maps: properties::Maps> + Sync {
105/// let sparse_subgraph = Subgraph::with_node_constraint(graph, "cnt,ori".parse().unwrap());
106/// ContiguousSubgraph::new_from_noncontiguous_graph(sparse_subgraph)
107/// }
108/// ```
109///
110/// The graph created above is slightly suboptimal, as `ContiguousSubgraph` wraps
111/// a `Subgraph`, causing `Subgraph` to add needless overhead by checking that nodes
112/// exist, even though `ContiguousSubgraph` does it too.
113/// This should not be noticeable, but if it is an issue, you can skip the Subgraph
114/// by manually building a [`Contraction`]:
115///
116/// ```
117/// use dsi_progress_logger::progress_logger;
118/// use sux::dict::elias_fano::{EfSeqDict, EliasFanoBuilder};
119/// use swh_graph::properties;
120/// use swh_graph::{NodeType};
121/// use swh_graph::graph::SwhGraphWithProperties;
122/// use swh_graph::views::{ContiguousSubgraph, Contraction, Subgraph, ContractionBackend};
123///
124/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
125/// &'_ G,
126/// EfSeqDict,
127/// properties::NoMaps,
128/// properties::NoTimestamps,
129/// properties::NoPersons,
130/// properties::NoContents,
131/// properties::NoStrings,
132/// properties::NoLabelNames,
133/// >
134/// where G: SwhGraphWithProperties<Maps: properties::Maps> {
135///
136/// // compute exact number of nodes in the subgraph, which is required
137/// // by EliasFanoBuilder
138/// let pl = progress_logger!(
139/// item_name = "node",
140/// expected_updates = Some(graph.num_nodes()),
141/// );
142/// let num_nodes = graph
143/// .iter_nodes(pl)
144/// .filter(|&node| match graph.properties().node_type(node) {
145/// NodeType::Content | NodeType::Directory => true,
146/// _ => false,
147/// })
148/// .count();
149///
150/// // compute set of nodes in the subgraph
151/// let mut nodes_efb = EliasFanoBuilder::new(num_nodes, graph.num_nodes());
152/// for node in 0..graph.num_nodes() {
153/// match graph.properties().node_type(node) {
154/// NodeType::Content | NodeType::Directory => nodes_efb.push(node),
155/// _ => (),
156/// }
157/// }
158/// let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
159///
160/// // assemble the subgraph
161/// ContiguousSubgraph::new_from_contraction(graph, contraction)
162/// }
163/// ```
164pub struct ContiguousSubgraph<
165 G: SwhGraph,
166 N: ContractionBackend,
167 MAPS: properties::MaybeMaps,
168 TIMESTAMPS: properties::MaybeTimestamps,
169 PERSONS: properties::MaybePersons,
170 CONTENTS: properties::MaybeContents,
171 STRINGS: properties::MaybeStrings,
172 LABELNAMES: properties::MaybeLabelNames,
173> {
174 inner: Arc<ContiguousSubgraphInner<G, N>>, // TODO: find a way to replace Arc with ouroboros
175 properties:
176 properties::SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>,
177}
178
179impl<G: SwhGraphWithProperties, N: ContractionBackend>
180 ContiguousSubgraph<
181 G,
182 N,
183 properties::NoMaps,
184 properties::NoTimestamps,
185 properties::NoPersons,
186 properties::NoContents,
187 properties::NoStrings,
188 properties::NoLabelNames,
189 >
190{
191 /// Creates a new [`ContiguousSubgraph`] by keeping only nodes in the [`Contraction`]
192 pub fn new_from_contraction(graph: G, contraction: Contraction<N>) -> Self {
193 let path = graph.properties().path.clone();
194 let num_nodes = contraction.num_nodes();
195 let inner = Arc::new(ContiguousSubgraphInner {
196 underlying_graph: graph,
197 contraction,
198 });
199 Self {
200 properties: properties::SwhGraphProperties {
201 path,
202 num_nodes,
203 maps: properties::NoMaps,
204 timestamps: properties::NoTimestamps,
205 persons: properties::NoPersons,
206 contents: properties::NoContents,
207 strings: properties::NoStrings,
208 label_names: properties::NoLabelNames,
209 label_names_are_in_base64_order: Default::default(),
210 },
211 inner,
212 }
213 }
214}
215
216impl<G: SwhGraph, N: ContractionBackend>
217 ContiguousSubgraph<
218 G,
219 N,
220 properties::NoMaps,
221 properties::NoTimestamps,
222 properties::NoPersons,
223 properties::NoContents,
224 properties::NoStrings,
225 properties::NoLabelNames,
226 >
227{
228 /// Returns the graph this graph is a subgraph of
229 ///
230 /// Use [`Self::contraction`] to get the mapping between both sets of node ids
231 pub fn underlying_graph(&self) -> &G {
232 &self.inner.underlying_graph
233 }
234
235 /// The structure used to match the underlying graph's node ids with this graph's node ids
236 pub fn contraction(&self) -> &Contraction<N> {
237 &self.inner.contraction
238 }
239}
240
241impl<G: SwhGraphWithProperties>
242 ContiguousSubgraph<
243 G,
244 EfSeqDict,
245 properties::NoMaps,
246 properties::NoTimestamps,
247 properties::NoPersons,
248 properties::NoContents,
249 properties::NoStrings,
250 properties::NoLabelNames,
251 >
252{
253 /// Creates a new [`ContiguousSubgraph`] from an existing graph with "holes".
254 pub fn new_from_noncontiguous_graph(graph: G) -> Self
255 where
256 G: Sync,
257 {
258 // compute exact number of nodes in the subgraph, which is required
259 // by EliasFanoBuilder
260 let actual_num_nodes = graph.actual_num_nodes().unwrap_or_else(|_| {
261 let mut pl = concurrent_progress_logger!(
262 item_name = "node",
263 expected_updates = Some(graph.num_nodes()),
264 );
265 pl.start("Computing number of nodes");
266 let actual_num_nodes = graph
267 .par_iter_nodes(pl.clone())
268 .filter(|&node| graph.has_node(node))
269 .count();
270 pl.done();
271 actual_num_nodes
272 });
273
274 // compute set of nodes in the subgraph
275 let mut nodes_efb = EliasFanoBuilder::new(actual_num_nodes, graph.num_nodes());
276 for node in 0..graph.num_nodes() {
277 if graph.has_node(node) {
278 nodes_efb.push(node);
279 }
280 }
281 let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
282
283 Self::new_from_contraction(graph, contraction)
284 }
285}
286
287// content of ContiguousSubgraph that must be wrapped in an Arc in order to make
288// ContiguousSubgraphMaps live as long as G (which is an accidental requirement of
289// SwhGraphWithProperties, which doesn't seem to be removable without making its
290// API painfully hard to use).
291struct ContiguousSubgraphInner<G: SwhGraph, N: ContractionBackend> {
292 underlying_graph: G,
293 contraction: Contraction<N>,
294}
295
296impl<
297 G: SwhGraph,
298 N: ContractionBackend,
299 MAPS: properties::MaybeMaps,
300 TIMESTAMPS: properties::MaybeTimestamps,
301 PERSONS: properties::MaybePersons,
302 CONTENTS: properties::MaybeContents,
303 STRINGS: properties::MaybeStrings,
304 LABELNAMES: properties::MaybeLabelNames,
305 > SwhGraph
306 for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
307{
308 #[inline(always)]
309 fn path(&self) -> &Path {
310 self.inner.underlying_graph.path()
311 }
312 #[inline(always)]
313 fn is_transposed(&self) -> bool {
314 self.inner.underlying_graph.is_transposed()
315 }
316 #[inline(always)]
317 // Note: this can be an overapproximation if the underlying graph is a subgraph
318 fn num_nodes(&self) -> usize {
319 self.inner.contraction.num_nodes()
320 }
321 fn has_node(&self, node_id: NodeId) -> bool {
322 node_id < self.num_nodes()
323 && self
324 .inner
325 .underlying_graph
326 .has_node(self.inner.contraction.underlying_node_id(node_id))
327 }
328 #[inline(always)]
329 // Note: this return the number or arcs in the original graph, before
330 // subgraph filtering.
331 fn num_arcs(&self) -> u64 {
332 self.inner.underlying_graph.num_arcs()
333 }
334 fn num_nodes_by_type(&self) -> Result<HashMap<NodeType, usize>> {
335 bail!("num_nodes_by_type is not supported by ContiguousSubgraph")
336 }
337 fn num_arcs_by_type(&self) -> Result<HashMap<(NodeType, NodeType), usize>> {
338 bail!("num_arcs_by_type is not supported by ContiguousSubgraph")
339 }
340 fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
341 self.inner.underlying_graph.has_arc(
342 self.inner.contraction.underlying_node_id(src_node_id),
343 self.inner.contraction.underlying_node_id(dst_node_id),
344 )
345 }
346}
347
348impl<
349 G: SwhGraphWithProperties,
350 N: ContractionBackend,
351 MAPS: properties::MaybeMaps,
352 TIMESTAMPS: properties::MaybeTimestamps,
353 PERSONS: properties::MaybePersons,
354 CONTENTS: properties::MaybeContents,
355 STRINGS: properties::MaybeStrings,
356 LABELNAMES: properties::MaybeLabelNames,
357 > SwhGraphWithProperties
358 for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
359{
360 type Maps = MAPS;
361 type Timestamps = TIMESTAMPS;
362 type Persons = PERSONS;
363 type Contents = CONTENTS;
364 type Strings = STRINGS;
365 type LabelNames = LABELNAMES;
366
367 #[inline(always)]
368 fn properties(
369 &self,
370 ) -> &properties::SwhGraphProperties<
371 Self::Maps,
372 Self::Timestamps,
373 Self::Persons,
374 Self::Contents,
375 Self::Strings,
376 Self::LabelNames,
377 > {
378 &self.properties
379 }
380}