swh_graph/views/contiguous_subgraph/
mod.rs

1// Copyright (C) 2023-2025  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::collections::HashMap;
7use std::path::Path;
8use std::sync::Arc;
9
10use anyhow::{bail, Result};
11use dsi_progress_logger::{concurrent_progress_logger, ProgressLog};
12use rayon::prelude::*;
13use sux::dict::elias_fano::{EfSeqDict, EliasFano, EliasFanoBuilder};
14use sux::traits::indexed_dict::{IndexedDict, IndexedSeq};
15
16use crate::graph::*;
17use crate::mph::SwhidMphf;
18use crate::properties;
19use crate::{NodeType, OutOfBoundError, SWHID};
20
21mod contents;
22mod iterators;
23mod label_names;
24mod maps;
25mod persons;
26mod strings;
27mod timestamps;
28
29/// Alias for [`IndexedSeq`] + [`IndexedDict`] mapping from [`NodeId`] to [`NodeId`].
30pub trait ContractionBackend
31where
32    for<'a> Self: IndexedSeq<Input = NodeId, Output<'a> = NodeId>
33        + IndexedDict<Input = NodeId, Output<'a> = NodeId>,
34{
35}
36
37impl<B> ContractionBackend for B where
38    for<'a> Self: IndexedSeq<Input = NodeId, Output<'a> = NodeId>
39        + IndexedDict<Input = NodeId, Output<'a> = NodeId>
40{
41}
42
43/// Marker trait for [`ContractionBackend`]s that guarantee that it represents a monotonic function
44///
45/// # Safety
46///
47/// Implementing this trait guarantees that for all `x` and `y`:
48///
49/// * `x < y` iff `self.get(x) < self.get(y)`, and
50/// * if `self.index_of(x) != None` and `self.index_of(y) != None`, then
51///   `x < y` iff `self.index_of(x) < self.index_of(y)`.
52pub unsafe trait MonotoneContractionBackend: ContractionBackend {}
53
54// SAFETY: EliasFano is monotone by definition
55unsafe impl<H, L> MonotoneContractionBackend for EliasFano<H, L> where Self: ContractionBackend {}
56
57unsafe impl<N: MonotoneContractionBackend> MonotoneContractionBackend for &N {}
58
59#[derive(Copy, Clone, Debug)]
60/// See [`ContiguousSubgraph`]
61pub struct Contraction<N>(pub N)
62where
63    for<'a> N: IndexedSeq<Input = NodeId, Output<'a> = NodeId>;
64
65impl<N> Contraction<N>
66where
67    for<'a> N: IndexedSeq<Input = NodeId, Output<'a> = NodeId>,
68{
69    /// Given a node id in a [`ContiguousSubgraph`], returns the corresponding node id
70    /// in the [`ContiguousSubgraph`]'s underlying graph
71    #[inline(always)]
72    pub fn underlying_node_id(&self, self_node: NodeId) -> NodeId {
73        self.0.get(self_node)
74    }
75
76    /// Returns the number of node ids in the [`ContiguousSubgraph`].
77    ///
78    /// Note: this can be an overapproximation if the underlying graph is a subgraph
79    pub fn num_nodes(&self) -> usize {
80        self.0.len()
81    }
82}
83
84impl<N: ContractionBackend> Contraction<N> {
85    /// Given a node id in a [`ContiguousSubgraph`]'s underlying graph, returns the
86    /// corresponding node id in the [`ContiguousSubgraph`]
87    #[inline(always)]
88    pub fn node_id_from_underlying(&self, underlying_node: NodeId) -> Option<NodeId> {
89        self.0.index_of(underlying_node)
90    }
91}
92
93/// A view over [`SwhGraph`] and related traits, that filters out some nodes, and renumbers
94/// remaining nodes so that the set of valid nodes becomes a range.
95///
96/// Due to limitations in the Rust type system, properties available on the underlying
97/// [`SwhGraph`] are not automatically available on [`ContiguousSubgraph`].
98/// They need to be explicitly enabled in a builder-like pattern on `ContiguousSubgraph`
99/// using these methods:
100/// * [`with_maps`][`ContiguousSubgraph::with_maps`],
101/// * [`with_timestamps`][`ContiguousSubgraph::with_timestamps`],
102/// * [`with_persons`][`ContiguousSubgraph::with_persons`],
103/// * [`with_contents`][`ContiguousSubgraph::with_contents`],
104/// * [`with_strings`][`ContiguousSubgraph::with_strings`],
105/// * [`with_label_names`][`ContiguousSubgraph::with_label_names`],
106///
107/// # Examples
108///
109/// Build a `ContiguousSubgraph` made of only content and directory nodes:
110///
111/// ```
112/// use dsi_progress_logger::progress_logger;
113/// use swh_graph::properties;
114/// use swh_graph::NodeConstraint;
115/// use swh_graph::graph::SwhGraphWithProperties;
116/// use swh_graph::views::Subgraph;
117/// use swh_graph::views::contiguous_subgraph::{ContiguousSubgraph, Contraction, ContractionBackend};
118///
119/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
120///         Subgraph<&'_ G, impl Fn(usize) -> bool + use<'_, G>, fn(usize, usize) -> bool>,
121///         impl ContractionBackend,
122///         properties::NoMaps,
123///         properties::NoTimestamps,
124///         properties::NoPersons,
125///         properties::NoContents,
126///         properties::NoStrings,
127///         properties::NoLabelNames,
128///     >
129///     where G: SwhGraphWithProperties<Maps: properties::Maps> + Sync {
130///     let sparse_subgraph = Subgraph::with_node_constraint(graph, "cnt,ori".parse().unwrap());
131///     ContiguousSubgraph::new_from_noncontiguous_graph(sparse_subgraph)
132/// }
133/// ```
134///
135/// The graph created above is slightly suboptimal, as `ContiguousSubgraph` wraps
136/// a `Subgraph`, causing `Subgraph` to add needless overhead by checking that nodes
137/// exist, even though `ContiguousSubgraph` does it too.
138/// This should not be noticeable, but if it is an issue, you can skip the Subgraph
139/// by manually building a [`Contraction`]:
140///
141/// ```
142/// use dsi_progress_logger::progress_logger;
143/// use sux::dict::elias_fano::{EfSeqDict, EliasFanoBuilder};
144/// use swh_graph::properties;
145/// use swh_graph::{NodeType};
146/// use swh_graph::graph::SwhGraphWithProperties;
147/// use swh_graph::views::contiguous_subgraph::{ContiguousSubgraph, Contraction, ContractionBackend};
148///
149/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
150///         &'_ G,
151///         EfSeqDict,
152///         properties::NoMaps,
153///         properties::NoTimestamps,
154///         properties::NoPersons,
155///         properties::NoContents,
156///         properties::NoStrings,
157///         properties::NoLabelNames,
158///     >
159///     where G: SwhGraphWithProperties<Maps: properties::Maps> {
160///
161///     // compute exact number of nodes in the subgraph, which is required
162///     // by EliasFanoBuilder
163///     let pl = progress_logger!(
164///         item_name = "node",
165///         expected_updates = Some(graph.num_nodes()),
166///     );
167///     let num_nodes = graph
168///         .iter_nodes(pl)
169///         .filter(|&node| match graph.properties().node_type(node) {
170///             NodeType::Content | NodeType::Directory => true,
171///             _ => false,
172///         })
173///         .count();
174///
175///     // compute set of nodes in the subgraph
176///     let mut nodes_efb = EliasFanoBuilder::new(num_nodes, graph.num_nodes());
177///     for node in 0..graph.num_nodes() {
178///         match graph.properties().node_type(node) {
179///             NodeType::Content | NodeType::Directory => nodes_efb.push(node),
180///             _ => (),
181///         }
182///     }
183///     let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
184///
185///     // assemble the subgraph
186///     ContiguousSubgraph::new_from_contraction(graph, contraction)
187/// }
188/// ```
189pub struct ContiguousSubgraph<
190    G: SwhGraph,
191    N: ContractionBackend,
192    MAPS: properties::MaybeMaps,
193    TIMESTAMPS: properties::MaybeTimestamps,
194    PERSONS: properties::MaybePersons,
195    CONTENTS: properties::MaybeContents,
196    STRINGS: properties::MaybeStrings,
197    LABELNAMES: properties::MaybeLabelNames,
198> {
199    // Note: ContiguousSubgraph::into_raw_parts assumes there are no references
200    // to `inner` outside of `properties`. This means that `ContiguousSubgraph`
201    // cannot implement `Clone` (at least not without copying ContiguousSubgraphInner
202    // itself).
203    //
204    // TODO: find a way to replace Arc with ouroboros
205    inner: Arc<ContiguousSubgraphInner<G, N>>,
206    properties:
207        properties::SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>,
208}
209
210impl<G: SwhGraphWithProperties, N: ContractionBackend>
211    ContiguousSubgraph<
212        G,
213        N,
214        properties::NoMaps,
215        properties::NoTimestamps,
216        properties::NoPersons,
217        properties::NoContents,
218        properties::NoStrings,
219        properties::NoLabelNames,
220    >
221{
222    /// Creates a new [`ContiguousSubgraph`] by keeping only nodes in the [`Contraction`]
223    pub fn new_from_contraction(graph: G, contraction: Contraction<N>) -> Self {
224        let path = graph.properties().path.clone();
225        let num_nodes = contraction.num_nodes();
226        let inner = Arc::new(ContiguousSubgraphInner {
227            underlying_graph: graph,
228            contraction,
229        });
230        Self {
231            properties: properties::SwhGraphProperties {
232                path,
233                num_nodes,
234                maps: properties::NoMaps,
235                timestamps: properties::NoTimestamps,
236                persons: properties::NoPersons,
237                contents: properties::NoContents,
238                strings: properties::NoStrings,
239                label_names: properties::NoLabelNames,
240                label_names_are_in_base64_order: Default::default(),
241            },
242            inner,
243        }
244    }
245}
246
247impl<G: SwhGraph, N: ContractionBackend>
248    ContiguousSubgraph<
249        G,
250        N,
251        properties::NoMaps,
252        properties::NoTimestamps,
253        properties::NoPersons,
254        properties::NoContents,
255        properties::NoStrings,
256        properties::NoLabelNames,
257    >
258{
259    /// Returns the graph this graph is a subgraph of
260    ///
261    /// Use [`Self::contraction`] to get the mapping between both sets of node ids
262    pub fn underlying_graph(&self) -> &G {
263        &self.inner.underlying_graph
264    }
265
266    /// The structure used to match the underlying graph's node ids with this graph's node ids
267    pub fn contraction(&self) -> &Contraction<N> {
268        &self.inner.contraction
269    }
270
271    pub fn into_parts(self) -> (G, Contraction<N>) {
272        let inner = Arc::into_inner(self.inner).expect("Dangling references to ContiguousSubgraph::inner for ContiguousSubgraph with no properties");
273        (inner.underlying_graph, inner.contraction)
274    }
275}
276
277impl<G: SwhGraphWithProperties>
278    ContiguousSubgraph<
279        G,
280        EfSeqDict,
281        properties::NoMaps,
282        properties::NoTimestamps,
283        properties::NoPersons,
284        properties::NoContents,
285        properties::NoStrings,
286        properties::NoLabelNames,
287    >
288{
289    /// Creates a new [`ContiguousSubgraph`] from an existing graph with "holes".
290    pub fn new_from_noncontiguous_graph(graph: G) -> Self
291    where
292        G: Sync,
293    {
294        // compute exact number of nodes in the subgraph, which is required
295        // by EliasFanoBuilder
296        let actual_num_nodes = graph.actual_num_nodes().unwrap_or_else(|_| {
297            let mut pl = concurrent_progress_logger!(
298                item_name = "node",
299                expected_updates = Some(graph.num_nodes()),
300            );
301            pl.start("Computing number of nodes");
302            let actual_num_nodes = graph
303                .par_iter_nodes(pl.clone())
304                .filter(|&node| graph.has_node(node))
305                .count();
306            pl.done();
307            actual_num_nodes
308        });
309
310        // compute set of nodes in the subgraph
311        let mut nodes_efb = EliasFanoBuilder::new(actual_num_nodes, graph.num_nodes());
312        for node in 0..graph.num_nodes() {
313            if graph.has_node(node) {
314                nodes_efb.push(node);
315            }
316        }
317        let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
318
319        Self::new_from_contraction(graph, contraction)
320    }
321}
322
323// content of ContiguousSubgraph that must be wrapped in an Arc in order to make
324// ContiguousSubgraphMaps live as long as G (which is an accidental requirement of
325// SwhGraphWithProperties, which doesn't seem to be removable without making its
326// API painfully hard to use).
327struct ContiguousSubgraphInner<G: SwhGraph, N: ContractionBackend> {
328    underlying_graph: G,
329    contraction: Contraction<N>,
330}
331
332impl<
333        G: SwhGraph,
334        N: ContractionBackend,
335        MAPS: properties::MaybeMaps,
336        TIMESTAMPS: properties::MaybeTimestamps,
337        PERSONS: properties::MaybePersons,
338        CONTENTS: properties::MaybeContents,
339        STRINGS: properties::MaybeStrings,
340        LABELNAMES: properties::MaybeLabelNames,
341    > SwhGraph
342    for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
343{
344    #[inline(always)]
345    fn path(&self) -> &Path {
346        self.inner.underlying_graph.path()
347    }
348    #[inline(always)]
349    fn is_transposed(&self) -> bool {
350        self.inner.underlying_graph.is_transposed()
351    }
352    #[inline(always)]
353    // Note: this can be an overapproximation if the underlying graph is a subgraph
354    fn num_nodes(&self) -> usize {
355        self.inner.contraction.num_nodes()
356    }
357    fn has_node(&self, node_id: NodeId) -> bool {
358        node_id < self.num_nodes()
359            && self
360                .inner
361                .underlying_graph
362                .has_node(self.inner.contraction.underlying_node_id(node_id))
363    }
364    #[inline(always)]
365    // Note: this return the number or arcs in the original graph, before
366    // subgraph filtering.
367    fn num_arcs(&self) -> u64 {
368        self.inner.underlying_graph.num_arcs()
369    }
370    fn num_nodes_by_type(&self) -> Result<HashMap<NodeType, usize>> {
371        bail!("num_nodes_by_type is not supported by ContiguousSubgraph")
372    }
373    fn num_arcs_by_type(&self) -> Result<HashMap<(NodeType, NodeType), usize>> {
374        bail!("num_arcs_by_type is not supported by ContiguousSubgraph")
375    }
376    fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
377        self.inner.underlying_graph.has_arc(
378            self.inner.contraction.underlying_node_id(src_node_id),
379            self.inner.contraction.underlying_node_id(dst_node_id),
380        )
381    }
382}
383
384impl<
385        G: SwhGraphWithProperties,
386        N: ContractionBackend,
387        MAPS: properties::MaybeMaps,
388        TIMESTAMPS: properties::MaybeTimestamps,
389        PERSONS: properties::MaybePersons,
390        CONTENTS: properties::MaybeContents,
391        STRINGS: properties::MaybeStrings,
392        LABELNAMES: properties::MaybeLabelNames,
393    > SwhGraphWithProperties
394    for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
395{
396    type Maps = MAPS;
397    type Timestamps = TIMESTAMPS;
398    type Persons = PERSONS;
399    type Contents = CONTENTS;
400    type Strings = STRINGS;
401    type LabelNames = LABELNAMES;
402
403    #[inline(always)]
404    fn properties(
405        &self,
406    ) -> &properties::SwhGraphProperties<
407        Self::Maps,
408        Self::Timestamps,
409        Self::Persons,
410        Self::Contents,
411        Self::Strings,
412        Self::LabelNames,
413    > {
414        &self.properties
415    }
416}