swh_graph/views/contiguous_subgraph/mod.rs
1// Copyright (C) 2023-2025 The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::collections::HashMap;
7use std::path::Path;
8use std::sync::Arc;
9
10use anyhow::{bail, Result};
11use dsi_progress_logger::{concurrent_progress_logger, ProgressLog};
12use rayon::prelude::*;
13use sux::dict::elias_fano::{EfSeqDict, EliasFano, EliasFanoBuilder};
14use sux::traits::{IndexedDict, IndexedSeq};
15
16use crate::graph::*;
17use crate::mph::SwhidMphf;
18use crate::properties;
19use crate::{NodeType, OutOfBoundError, SWHID};
20
21mod contents;
22mod iterators;
23mod label_names;
24mod maps;
25mod persons;
26mod strings;
27mod timestamps;
28
29/// Alias for [`IndexedSeq`] + [`IndexedDict`] mapping from [`NodeId`] to [`NodeId`].
30pub trait ContractionBackend
31where
32 for<'a> Self: IndexedSeq<Input = NodeId, Output<'a> = NodeId>
33 + IndexedDict<Input = NodeId, Output<'a> = NodeId>,
34{
35}
36
37impl<B> ContractionBackend for B where
38 for<'a> Self: IndexedSeq<Input = NodeId, Output<'a> = NodeId>
39 + IndexedDict<Input = NodeId, Output<'a> = NodeId>
40{
41}
42
43/// Marker trait for [`ContractionBackend`]s that guarantee that it represents a monotonic function
44///
45/// # Safety
46///
47/// Implementing this trait guarantees that for all `x` and `y`:
48///
49/// * `x < y` iff `self.get(x) < self.get(y)`, and
50/// * if `self.index_of(x) != None` and `self.index_of(y) != None`, then
51/// `x < y` iff `self.index_of(x) < self.index_of(y)`.
52pub unsafe trait MonotoneContractionBackend: ContractionBackend {}
53
54// SAFETY: EliasFano is monotone by definition
55unsafe impl<H, L> MonotoneContractionBackend for EliasFano<H, L> where Self: ContractionBackend {}
56
57/// See [`ContiguousSubgraph`]
58pub struct Contraction<N>(pub N)
59where
60 for<'a> N: IndexedSeq<Input = NodeId, Output<'a> = NodeId>;
61
62impl<N> Contraction<N>
63where
64 for<'a> N: IndexedSeq<Input = NodeId, Output<'a> = NodeId>,
65{
66 /// Given a node id in a [`ContiguousSubgraph`], returns the corresponding node id
67 /// in the [`ContiguousSubgraph`]'s underlying graph
68 #[inline(always)]
69 pub fn underlying_node_id(&self, self_node: NodeId) -> NodeId {
70 self.0.get(self_node)
71 }
72
73 /// Returns the number of node ids in the [`ContiguousSubgraph`].
74 ///
75 /// Note: this can be an overapproximation if the underlying graph is a subgraph
76 pub fn num_nodes(&self) -> usize {
77 self.0.len()
78 }
79}
80
81impl<N: ContractionBackend> Contraction<N> {
82 /// Given a node id in a [`ContiguousSubgraph`]'s underlying graph, returns the
83 /// corresponding node id in the [`ContiguousSubgraph`]
84 #[inline(always)]
85 pub fn node_id_from_underlying(&self, underlying_node: NodeId) -> Option<NodeId> {
86 self.0.index_of(underlying_node)
87 }
88}
89
90/// A view over [`SwhGraph`] and related traits, that filters out some nodes, and renumbers
91/// remaining nodes so that the set of valid nodes becomes a range.
92///
93/// Due to limitations in the Rust type system, properties available on the underlying
94/// [`SwhGraph`] are not automatically available on [`ContiguousSubgraph`].
95/// They need to be explicitly enabled in a builder-like pattern on `ContiguousSubgraph`
96/// using these methods:
97/// * [`with_maps`][`ContiguousSubgraph::with_maps`],
98/// * [`with_timestamps`][`ContiguousSubgraph::with_timestamps`],
99/// * [`with_persons`][`ContiguousSubgraph::with_persons`],
100/// * [`with_contents`][`ContiguousSubgraph::with_contents`],
101/// * [`with_strings`][`ContiguousSubgraph::with_strings`],
102/// * [`with_label_names`][`ContiguousSubgraph::with_label_names`],
103///
104/// # Examples
105///
106/// Build a `ContiguousSubgraph` made of only content and directory nodes:
107///
108/// ```
109/// use dsi_progress_logger::progress_logger;
110/// use swh_graph::properties;
111/// use swh_graph::NodeConstraint;
112/// use swh_graph::graph::SwhGraphWithProperties;
113/// use swh_graph::views::Subgraph;
114/// use swh_graph::views::contiguous_subgraph::{ContiguousSubgraph, Contraction, ContractionBackend};
115///
116/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
117/// Subgraph<&'_ G, impl Fn(usize) -> bool + use<'_, G>, fn(usize, usize) -> bool>,
118/// impl ContractionBackend,
119/// properties::NoMaps,
120/// properties::NoTimestamps,
121/// properties::NoPersons,
122/// properties::NoContents,
123/// properties::NoStrings,
124/// properties::NoLabelNames,
125/// >
126/// where G: SwhGraphWithProperties<Maps: properties::Maps> + Sync {
127/// let sparse_subgraph = Subgraph::with_node_constraint(graph, "cnt,ori".parse().unwrap());
128/// ContiguousSubgraph::new_from_noncontiguous_graph(sparse_subgraph)
129/// }
130/// ```
131///
132/// The graph created above is slightly suboptimal, as `ContiguousSubgraph` wraps
133/// a `Subgraph`, causing `Subgraph` to add needless overhead by checking that nodes
134/// exist, even though `ContiguousSubgraph` does it too.
135/// This should not be noticeable, but if it is an issue, you can skip the Subgraph
136/// by manually building a [`Contraction`]:
137///
138/// ```
139/// use dsi_progress_logger::progress_logger;
140/// use sux::dict::elias_fano::{EfSeqDict, EliasFanoBuilder};
141/// use swh_graph::properties;
142/// use swh_graph::{NodeType};
143/// use swh_graph::graph::SwhGraphWithProperties;
144/// use swh_graph::views::Subgraph;
145/// use swh_graph::views::contiguous_subgraph::{ContiguousSubgraph, Contraction, ContractionBackend};
146///
147/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
148/// &'_ G,
149/// EfSeqDict,
150/// properties::NoMaps,
151/// properties::NoTimestamps,
152/// properties::NoPersons,
153/// properties::NoContents,
154/// properties::NoStrings,
155/// properties::NoLabelNames,
156/// >
157/// where G: SwhGraphWithProperties<Maps: properties::Maps> {
158///
159/// // compute exact number of nodes in the subgraph, which is required
160/// // by EliasFanoBuilder
161/// let pl = progress_logger!(
162/// item_name = "node",
163/// expected_updates = Some(graph.num_nodes()),
164/// );
165/// let num_nodes = graph
166/// .iter_nodes(pl)
167/// .filter(|&node| match graph.properties().node_type(node) {
168/// NodeType::Content | NodeType::Directory => true,
169/// _ => false,
170/// })
171/// .count();
172///
173/// // compute set of nodes in the subgraph
174/// let mut nodes_efb = EliasFanoBuilder::new(num_nodes, graph.num_nodes());
175/// for node in 0..graph.num_nodes() {
176/// match graph.properties().node_type(node) {
177/// NodeType::Content | NodeType::Directory => nodes_efb.push(node),
178/// _ => (),
179/// }
180/// }
181/// let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
182///
183/// // assemble the subgraph
184/// ContiguousSubgraph::new_from_contraction(graph, contraction)
185/// }
186/// ```
187pub struct ContiguousSubgraph<
188 G: SwhGraph,
189 N: ContractionBackend,
190 MAPS: properties::MaybeMaps,
191 TIMESTAMPS: properties::MaybeTimestamps,
192 PERSONS: properties::MaybePersons,
193 CONTENTS: properties::MaybeContents,
194 STRINGS: properties::MaybeStrings,
195 LABELNAMES: properties::MaybeLabelNames,
196> {
197 inner: Arc<ContiguousSubgraphInner<G, N>>, // TODO: find a way to replace Arc with ouroboros
198 properties:
199 properties::SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>,
200}
201
202impl<G: SwhGraphWithProperties, N: ContractionBackend>
203 ContiguousSubgraph<
204 G,
205 N,
206 properties::NoMaps,
207 properties::NoTimestamps,
208 properties::NoPersons,
209 properties::NoContents,
210 properties::NoStrings,
211 properties::NoLabelNames,
212 >
213{
214 /// Creates a new [`ContiguousSubgraph`] by keeping only nodes in the [`Contraction`]
215 pub fn new_from_contraction(graph: G, contraction: Contraction<N>) -> Self {
216 let path = graph.properties().path.clone();
217 let num_nodes = contraction.num_nodes();
218 let inner = Arc::new(ContiguousSubgraphInner {
219 underlying_graph: graph,
220 contraction,
221 });
222 Self {
223 properties: properties::SwhGraphProperties {
224 path,
225 num_nodes,
226 maps: properties::NoMaps,
227 timestamps: properties::NoTimestamps,
228 persons: properties::NoPersons,
229 contents: properties::NoContents,
230 strings: properties::NoStrings,
231 label_names: properties::NoLabelNames,
232 label_names_are_in_base64_order: Default::default(),
233 },
234 inner,
235 }
236 }
237}
238
239impl<G: SwhGraph, N: ContractionBackend>
240 ContiguousSubgraph<
241 G,
242 N,
243 properties::NoMaps,
244 properties::NoTimestamps,
245 properties::NoPersons,
246 properties::NoContents,
247 properties::NoStrings,
248 properties::NoLabelNames,
249 >
250{
251 /// Returns the graph this graph is a subgraph of
252 ///
253 /// Use [`Self::contraction`] to get the mapping between both sets of node ids
254 pub fn underlying_graph(&self) -> &G {
255 &self.inner.underlying_graph
256 }
257
258 /// The structure used to match the underlying graph's node ids with this graph's node ids
259 pub fn contraction(&self) -> &Contraction<N> {
260 &self.inner.contraction
261 }
262}
263
264impl<G: SwhGraphWithProperties>
265 ContiguousSubgraph<
266 G,
267 EfSeqDict,
268 properties::NoMaps,
269 properties::NoTimestamps,
270 properties::NoPersons,
271 properties::NoContents,
272 properties::NoStrings,
273 properties::NoLabelNames,
274 >
275{
276 /// Creates a new [`ContiguousSubgraph`] from an existing graph with "holes".
277 pub fn new_from_noncontiguous_graph(graph: G) -> Self
278 where
279 G: Sync,
280 {
281 // compute exact number of nodes in the subgraph, which is required
282 // by EliasFanoBuilder
283 let actual_num_nodes = graph.actual_num_nodes().unwrap_or_else(|_| {
284 let mut pl = concurrent_progress_logger!(
285 item_name = "node",
286 expected_updates = Some(graph.num_nodes()),
287 );
288 pl.start("Computing number of nodes");
289 let actual_num_nodes = graph
290 .par_iter_nodes(pl.clone())
291 .filter(|&node| graph.has_node(node))
292 .count();
293 pl.done();
294 actual_num_nodes
295 });
296
297 // compute set of nodes in the subgraph
298 let mut nodes_efb = EliasFanoBuilder::new(actual_num_nodes, graph.num_nodes());
299 for node in 0..graph.num_nodes() {
300 if graph.has_node(node) {
301 nodes_efb.push(node);
302 }
303 }
304 let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
305
306 Self::new_from_contraction(graph, contraction)
307 }
308}
309
310// content of ContiguousSubgraph that must be wrapped in an Arc in order to make
311// ContiguousSubgraphMaps live as long as G (which is an accidental requirement of
312// SwhGraphWithProperties, which doesn't seem to be removable without making its
313// API painfully hard to use).
314struct ContiguousSubgraphInner<G: SwhGraph, N: ContractionBackend> {
315 underlying_graph: G,
316 contraction: Contraction<N>,
317}
318
319impl<
320 G: SwhGraph,
321 N: ContractionBackend,
322 MAPS: properties::MaybeMaps,
323 TIMESTAMPS: properties::MaybeTimestamps,
324 PERSONS: properties::MaybePersons,
325 CONTENTS: properties::MaybeContents,
326 STRINGS: properties::MaybeStrings,
327 LABELNAMES: properties::MaybeLabelNames,
328 > SwhGraph
329 for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
330{
331 #[inline(always)]
332 fn path(&self) -> &Path {
333 self.inner.underlying_graph.path()
334 }
335 #[inline(always)]
336 fn is_transposed(&self) -> bool {
337 self.inner.underlying_graph.is_transposed()
338 }
339 #[inline(always)]
340 // Note: this can be an overapproximation if the underlying graph is a subgraph
341 fn num_nodes(&self) -> usize {
342 self.inner.contraction.num_nodes()
343 }
344 fn has_node(&self, node_id: NodeId) -> bool {
345 node_id < self.num_nodes()
346 && self
347 .inner
348 .underlying_graph
349 .has_node(self.inner.contraction.underlying_node_id(node_id))
350 }
351 #[inline(always)]
352 // Note: this return the number or arcs in the original graph, before
353 // subgraph filtering.
354 fn num_arcs(&self) -> u64 {
355 self.inner.underlying_graph.num_arcs()
356 }
357 fn num_nodes_by_type(&self) -> Result<HashMap<NodeType, usize>> {
358 bail!("num_nodes_by_type is not supported by ContiguousSubgraph")
359 }
360 fn num_arcs_by_type(&self) -> Result<HashMap<(NodeType, NodeType), usize>> {
361 bail!("num_arcs_by_type is not supported by ContiguousSubgraph")
362 }
363 fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
364 self.inner.underlying_graph.has_arc(
365 self.inner.contraction.underlying_node_id(src_node_id),
366 self.inner.contraction.underlying_node_id(dst_node_id),
367 )
368 }
369}
370
371impl<
372 G: SwhGraphWithProperties,
373 N: ContractionBackend,
374 MAPS: properties::MaybeMaps,
375 TIMESTAMPS: properties::MaybeTimestamps,
376 PERSONS: properties::MaybePersons,
377 CONTENTS: properties::MaybeContents,
378 STRINGS: properties::MaybeStrings,
379 LABELNAMES: properties::MaybeLabelNames,
380 > SwhGraphWithProperties
381 for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
382{
383 type Maps = MAPS;
384 type Timestamps = TIMESTAMPS;
385 type Persons = PERSONS;
386 type Contents = CONTENTS;
387 type Strings = STRINGS;
388 type LabelNames = LABELNAMES;
389
390 #[inline(always)]
391 fn properties(
392 &self,
393 ) -> &properties::SwhGraphProperties<
394 Self::Maps,
395 Self::Timestamps,
396 Self::Persons,
397 Self::Contents,
398 Self::Strings,
399 Self::LabelNames,
400 > {
401 &self.properties
402 }
403}