swh_graph/views/contiguous_subgraph/mod.rs
1// Copyright (C) 2023-2025 The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::collections::HashMap;
7use std::path::Path;
8use std::sync::Arc;
9
10use anyhow::{bail, Result};
11use dsi_progress_logger::{concurrent_progress_logger, ProgressLog};
12use rayon::prelude::*;
13use sux::dict::elias_fano::{EfSeqDict, EliasFano, EliasFanoBuilder};
14use sux::traits::indexed_dict::{IndexedDict, IndexedSeq};
15
16use crate::graph::*;
17use crate::mph::SwhidMphf;
18use crate::properties;
19use crate::{NodeType, OutOfBoundError, SWHID};
20
21mod contents;
22mod iterators;
23mod label_names;
24mod maps;
25mod persons;
26mod strings;
27mod timestamps;
28
29/// Alias for [`IndexedSeq`] + [`IndexedDict`] mapping from [`NodeId`] to [`NodeId`].
30pub trait ContractionBackend
31where
32 for<'a> Self: IndexedSeq<Input = NodeId, Output<'a> = NodeId>
33 + IndexedDict<Input = NodeId, Output<'a> = NodeId>,
34{
35}
36
37impl<B> ContractionBackend for B where
38 for<'a> Self: IndexedSeq<Input = NodeId, Output<'a> = NodeId>
39 + IndexedDict<Input = NodeId, Output<'a> = NodeId>
40{
41}
42
43/// Marker trait for [`ContractionBackend`]s that guarantee that it represents a monotonic function
44///
45/// # Safety
46///
47/// Implementing this trait guarantees that for all `x` and `y`:
48///
49/// * `x < y` iff `self.get(x) < self.get(y)`, and
50/// * if `self.index_of(x) != None` and `self.index_of(y) != None`, then
51/// `x < y` iff `self.index_of(x) < self.index_of(y)`.
52pub unsafe trait MonotoneContractionBackend: ContractionBackend {}
53
54// SAFETY: EliasFano is monotone by definition
55unsafe impl<H, L> MonotoneContractionBackend for EliasFano<H, L> where Self: ContractionBackend {}
56
57unsafe impl<N: MonotoneContractionBackend> MonotoneContractionBackend for &N {}
58
59#[derive(Copy, Clone, Debug)]
60/// See [`ContiguousSubgraph`]
61pub struct Contraction<N>(pub N)
62where
63 for<'a> N: IndexedSeq<Input = NodeId, Output<'a> = NodeId>;
64
65impl<N> Contraction<N>
66where
67 for<'a> N: IndexedSeq<Input = NodeId, Output<'a> = NodeId>,
68{
69 /// Given a node id in a [`ContiguousSubgraph`], returns the corresponding node id
70 /// in the [`ContiguousSubgraph`]'s underlying graph
71 #[inline(always)]
72 pub fn underlying_node_id(&self, self_node: NodeId) -> NodeId {
73 self.0.get(self_node)
74 }
75
76 /// Returns the number of node ids in the [`ContiguousSubgraph`].
77 ///
78 /// Note: this can be an overapproximation if the underlying graph is a subgraph
79 pub fn num_nodes(&self) -> usize {
80 self.0.len()
81 }
82}
83
84impl<N: ContractionBackend> Contraction<N> {
85 /// Given a node id in a [`ContiguousSubgraph`]'s underlying graph, returns the
86 /// corresponding node id in the [`ContiguousSubgraph`]
87 #[inline(always)]
88 pub fn node_id_from_underlying(&self, underlying_node: NodeId) -> Option<NodeId> {
89 self.0.index_of(underlying_node)
90 }
91}
92
93/// A view over [`SwhGraph`] and related traits, that filters out some nodes, and renumbers
94/// remaining nodes so that the set of valid nodes becomes a range.
95///
96/// Due to limitations in the Rust type system, properties available on the underlying
97/// [`SwhGraph`] are not automatically available on [`ContiguousSubgraph`].
98/// They need to be explicitly enabled in a builder-like pattern on `ContiguousSubgraph`
99/// using these methods:
100/// * [`with_maps`][`ContiguousSubgraph::with_maps`],
101/// * [`with_timestamps`][`ContiguousSubgraph::with_timestamps`],
102/// * [`with_persons`][`ContiguousSubgraph::with_persons`],
103/// * [`with_contents`][`ContiguousSubgraph::with_contents`],
104/// * [`with_strings`][`ContiguousSubgraph::with_strings`],
105/// * [`with_label_names`][`ContiguousSubgraph::with_label_names`],
106///
107/// # Examples
108///
109/// Build a `ContiguousSubgraph` made of only content and directory nodes:
110///
111/// ```
112/// use dsi_progress_logger::progress_logger;
113/// use swh_graph::properties;
114/// use swh_graph::NodeConstraint;
115/// use swh_graph::graph::SwhGraphWithProperties;
116/// use swh_graph::views::Subgraph;
117/// use swh_graph::views::contiguous_subgraph::{ContiguousSubgraph, Contraction, ContractionBackend};
118///
119/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
120/// Subgraph<&'_ G, impl Fn(usize) -> bool + use<'_, G>, fn(usize, usize) -> bool>,
121/// impl ContractionBackend,
122/// properties::NoMaps,
123/// properties::NoTimestamps,
124/// properties::NoPersons,
125/// properties::NoContents,
126/// properties::NoStrings,
127/// properties::NoLabelNames,
128/// >
129/// where G: SwhGraphWithProperties<Maps: properties::Maps> + Sync {
130/// let sparse_subgraph = Subgraph::with_node_constraint(graph, "cnt,ori".parse().unwrap());
131/// ContiguousSubgraph::new_from_noncontiguous_graph(sparse_subgraph)
132/// }
133/// ```
134///
135/// The graph created above is slightly suboptimal, as `ContiguousSubgraph` wraps
136/// a `Subgraph`, causing `Subgraph` to add needless overhead by checking that nodes
137/// exist, even though `ContiguousSubgraph` does it too.
138/// This should not be noticeable, but if it is an issue, you can skip the Subgraph
139/// by manually building a [`Contraction`]:
140///
141/// ```
142/// use dsi_progress_logger::progress_logger;
143/// use sux::dict::elias_fano::{EfSeqDict, EliasFanoBuilder};
144/// use swh_graph::properties;
145/// use swh_graph::{NodeType};
146/// use swh_graph::graph::SwhGraphWithProperties;
147/// use swh_graph::views::contiguous_subgraph::{ContiguousSubgraph, Contraction, ContractionBackend};
148///
149/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
150/// &'_ G,
151/// EfSeqDict,
152/// properties::NoMaps,
153/// properties::NoTimestamps,
154/// properties::NoPersons,
155/// properties::NoContents,
156/// properties::NoStrings,
157/// properties::NoLabelNames,
158/// >
159/// where G: SwhGraphWithProperties<Maps: properties::Maps> {
160///
161/// // compute exact number of nodes in the subgraph, which is required
162/// // by EliasFanoBuilder
163/// let pl = progress_logger!(
164/// item_name = "node",
165/// expected_updates = Some(graph.num_nodes()),
166/// );
167/// let num_nodes = graph
168/// .iter_nodes(pl)
169/// .filter(|&node| match graph.properties().node_type(node) {
170/// NodeType::Content | NodeType::Directory => true,
171/// _ => false,
172/// })
173/// .count();
174///
175/// // compute set of nodes in the subgraph
176/// let mut nodes_efb = EliasFanoBuilder::new(num_nodes, graph.num_nodes());
177/// for node in 0..graph.num_nodes() {
178/// match graph.properties().node_type(node) {
179/// NodeType::Content | NodeType::Directory => nodes_efb.push(node),
180/// _ => (),
181/// }
182/// }
183/// let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
184///
185/// // assemble the subgraph
186/// ContiguousSubgraph::new_from_contraction(graph, contraction)
187/// }
188/// ```
189pub struct ContiguousSubgraph<
190 G: SwhGraph,
191 N: ContractionBackend,
192 MAPS: properties::MaybeMaps,
193 TIMESTAMPS: properties::MaybeTimestamps,
194 PERSONS: properties::MaybePersons,
195 CONTENTS: properties::MaybeContents,
196 STRINGS: properties::MaybeStrings,
197 LABELNAMES: properties::MaybeLabelNames,
198> {
199 // Note: ContiguousSubgraph::into_raw_parts assumes there are no references
200 // to `inner` outside of `properties`. This means that `ContiguousSubgraph`
201 // cannot implement `Clone` (at least not without copying ContiguousSubgraphInner
202 // itself).
203 //
204 // TODO: find a way to replace Arc with ouroboros
205 inner: Arc<ContiguousSubgraphInner<G, N>>,
206 properties:
207 properties::SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>,
208}
209
210impl<G: SwhGraphWithProperties, N: ContractionBackend>
211 ContiguousSubgraph<
212 G,
213 N,
214 properties::NoMaps,
215 properties::NoTimestamps,
216 properties::NoPersons,
217 properties::NoContents,
218 properties::NoStrings,
219 properties::NoLabelNames,
220 >
221{
222 /// Creates a new [`ContiguousSubgraph`] by keeping only nodes in the [`Contraction`]
223 pub fn new_from_contraction(graph: G, contraction: Contraction<N>) -> Self {
224 let path = graph.properties().path.clone();
225 let num_nodes = contraction.num_nodes();
226 let inner = Arc::new(ContiguousSubgraphInner {
227 underlying_graph: graph,
228 contraction,
229 });
230 Self {
231 properties: properties::SwhGraphProperties {
232 path,
233 num_nodes,
234 maps: properties::NoMaps,
235 timestamps: properties::NoTimestamps,
236 persons: properties::NoPersons,
237 contents: properties::NoContents,
238 strings: properties::NoStrings,
239 label_names: properties::NoLabelNames,
240 label_names_are_in_base64_order: Default::default(),
241 },
242 inner,
243 }
244 }
245}
246
247impl<G: SwhGraph, N: ContractionBackend>
248 ContiguousSubgraph<
249 G,
250 N,
251 properties::NoMaps,
252 properties::NoTimestamps,
253 properties::NoPersons,
254 properties::NoContents,
255 properties::NoStrings,
256 properties::NoLabelNames,
257 >
258{
259 /// Returns the graph this graph is a subgraph of
260 ///
261 /// Use [`Self::contraction`] to get the mapping between both sets of node ids
262 pub fn underlying_graph(&self) -> &G {
263 &self.inner.underlying_graph
264 }
265
266 /// The structure used to match the underlying graph's node ids with this graph's node ids
267 pub fn contraction(&self) -> &Contraction<N> {
268 &self.inner.contraction
269 }
270
271 pub fn into_parts(self) -> (G, Contraction<N>) {
272 let inner = Arc::into_inner(self.inner).expect("Dangling references to ContiguousSubgraph::inner for ContiguousSubgraph with no properties");
273 (inner.underlying_graph, inner.contraction)
274 }
275}
276
277impl<G: SwhGraphWithProperties>
278 ContiguousSubgraph<
279 G,
280 EfSeqDict,
281 properties::NoMaps,
282 properties::NoTimestamps,
283 properties::NoPersons,
284 properties::NoContents,
285 properties::NoStrings,
286 properties::NoLabelNames,
287 >
288{
289 /// Creates a new [`ContiguousSubgraph`] from an existing graph with "holes".
290 pub fn new_from_noncontiguous_graph(graph: G) -> Self
291 where
292 G: Sync,
293 {
294 // compute exact number of nodes in the subgraph, which is required
295 // by EliasFanoBuilder
296 let actual_num_nodes = graph.actual_num_nodes().unwrap_or_else(|_| {
297 let mut pl = concurrent_progress_logger!(
298 item_name = "node",
299 expected_updates = Some(graph.num_nodes()),
300 );
301 pl.start("Computing number of nodes");
302 let actual_num_nodes = graph
303 .par_iter_nodes(pl.clone())
304 .filter(|&node| graph.has_node(node))
305 .count();
306 pl.done();
307 actual_num_nodes
308 });
309
310 // compute set of nodes in the subgraph
311 let mut nodes_efb = EliasFanoBuilder::new(actual_num_nodes, graph.num_nodes());
312 for node in 0..graph.num_nodes() {
313 if graph.has_node(node) {
314 nodes_efb.push(node);
315 }
316 }
317 let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
318
319 Self::new_from_contraction(graph, contraction)
320 }
321}
322
323// content of ContiguousSubgraph that must be wrapped in an Arc in order to make
324// ContiguousSubgraphMaps live as long as G (which is an accidental requirement of
325// SwhGraphWithProperties, which doesn't seem to be removable without making its
326// API painfully hard to use).
327struct ContiguousSubgraphInner<G: SwhGraph, N: ContractionBackend> {
328 underlying_graph: G,
329 contraction: Contraction<N>,
330}
331
332impl<
333 G: SwhGraph,
334 N: ContractionBackend,
335 MAPS: properties::MaybeMaps,
336 TIMESTAMPS: properties::MaybeTimestamps,
337 PERSONS: properties::MaybePersons,
338 CONTENTS: properties::MaybeContents,
339 STRINGS: properties::MaybeStrings,
340 LABELNAMES: properties::MaybeLabelNames,
341 > SwhGraph
342 for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
343{
344 #[inline(always)]
345 fn path(&self) -> &Path {
346 self.inner.underlying_graph.path()
347 }
348 #[inline(always)]
349 fn is_transposed(&self) -> bool {
350 self.inner.underlying_graph.is_transposed()
351 }
352 #[inline(always)]
353 // Note: this can be an overapproximation if the underlying graph is a subgraph
354 fn num_nodes(&self) -> usize {
355 self.inner.contraction.num_nodes()
356 }
357 fn has_node(&self, node_id: NodeId) -> bool {
358 node_id < self.num_nodes()
359 && self
360 .inner
361 .underlying_graph
362 .has_node(self.inner.contraction.underlying_node_id(node_id))
363 }
364 #[inline(always)]
365 // Note: this return the number or arcs in the original graph, before
366 // subgraph filtering.
367 fn num_arcs(&self) -> u64 {
368 self.inner.underlying_graph.num_arcs()
369 }
370 fn num_nodes_by_type(&self) -> Result<HashMap<NodeType, usize>> {
371 bail!("num_nodes_by_type is not supported by ContiguousSubgraph")
372 }
373 fn num_arcs_by_type(&self) -> Result<HashMap<(NodeType, NodeType), usize>> {
374 bail!("num_arcs_by_type is not supported by ContiguousSubgraph")
375 }
376 fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
377 self.inner.underlying_graph.has_arc(
378 self.inner.contraction.underlying_node_id(src_node_id),
379 self.inner.contraction.underlying_node_id(dst_node_id),
380 )
381 }
382}
383
384impl<
385 G: SwhGraphWithProperties,
386 N: ContractionBackend,
387 MAPS: properties::MaybeMaps,
388 TIMESTAMPS: properties::MaybeTimestamps,
389 PERSONS: properties::MaybePersons,
390 CONTENTS: properties::MaybeContents,
391 STRINGS: properties::MaybeStrings,
392 LABELNAMES: properties::MaybeLabelNames,
393 > SwhGraphWithProperties
394 for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
395{
396 type Maps = MAPS;
397 type Timestamps = TIMESTAMPS;
398 type Persons = PERSONS;
399 type Contents = CONTENTS;
400 type Strings = STRINGS;
401 type LabelNames = LABELNAMES;
402
403 #[inline(always)]
404 fn properties(
405 &self,
406 ) -> &properties::SwhGraphProperties<
407 Self::Maps,
408 Self::Timestamps,
409 Self::Persons,
410 Self::Contents,
411 Self::Strings,
412 Self::LabelNames,
413 > {
414 &self.properties
415 }
416}