swh_graph/views/contiguous_subgraph/
mod.rs

1// Copyright (C) 2023-2025  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::collections::HashMap;
7use std::path::Path;
8use std::sync::Arc;
9
10use anyhow::{bail, Result};
11use dsi_progress_logger::{concurrent_progress_logger, ProgressLog};
12use rayon::prelude::*;
13use sux::dict::elias_fano::{EfSeqDict, EliasFano, EliasFanoBuilder};
14use sux::traits::{IndexedDict, IndexedSeq};
15
16use crate::graph::*;
17use crate::mph::SwhidMphf;
18use crate::properties;
19use crate::{NodeType, OutOfBoundError, SWHID};
20
21mod contents;
22mod iterators;
23mod label_names;
24mod maps;
25mod persons;
26mod strings;
27mod timestamps;
28
29/// Alias for [`IndexedSeq`] + [`IndexedDict`] mapping from [`NodeId`] to [`NodeId`].
30pub trait ContractionBackend
31where
32    for<'a> Self: IndexedSeq<Input = NodeId, Output<'a> = NodeId>
33        + IndexedDict<Input = NodeId, Output<'a> = NodeId>,
34{
35}
36
37impl<B> ContractionBackend for B where
38    for<'a> Self: IndexedSeq<Input = NodeId, Output<'a> = NodeId>
39        + IndexedDict<Input = NodeId, Output<'a> = NodeId>
40{
41}
42
43/// Marker trait for [`ContractionBackend`]s that guarantee that it represents a monotonic function
44///
45/// # Safety
46///
47/// Implementing this trait guarantees that for all `x` and `y`:
48///
49/// * `x < y` iff `self.get(x) < self.get(y)`, and
50/// * if `self.index_of(x) != None` and `self.index_of(y) != None`, then
51///   `x < y` iff `self.index_of(x) < self.index_of(y)`.
52pub unsafe trait MonotoneContractionBackend: ContractionBackend {}
53
54// SAFETY: EliasFano is monotone by definition
55unsafe impl<H, L> MonotoneContractionBackend for EliasFano<H, L> where Self: ContractionBackend {}
56
57/// See [`ContiguousSubgraph`]
58pub struct Contraction<N>(pub N)
59where
60    for<'a> N: IndexedSeq<Input = NodeId, Output<'a> = NodeId>;
61
62impl<N> Contraction<N>
63where
64    for<'a> N: IndexedSeq<Input = NodeId, Output<'a> = NodeId>,
65{
66    /// Given a node id in a [`ContiguousSubgraph`], returns the corresponding node id
67    /// in the [`ContiguousSubgraph`]'s underlying graph
68    #[inline(always)]
69    pub fn underlying_node_id(&self, self_node: NodeId) -> NodeId {
70        self.0.get(self_node)
71    }
72
73    /// Returns the number of node ids in the [`ContiguousSubgraph`].
74    ///
75    /// Note: this can be an overapproximation if the underlying graph is a subgraph
76    pub fn num_nodes(&self) -> usize {
77        self.0.len()
78    }
79}
80
81impl<N: ContractionBackend> Contraction<N> {
82    /// Given a node id in a [`ContiguousSubgraph`]'s underlying graph, returns the
83    /// corresponding node id in the [`ContiguousSubgraph`]
84    #[inline(always)]
85    pub fn node_id_from_underlying(&self, underlying_node: NodeId) -> Option<NodeId> {
86        self.0.index_of(underlying_node)
87    }
88}
89
90/// A view over [`SwhGraph`] and related traits, that filters out some nodes, and renumbers
91/// remaining nodes so that the set of valid nodes becomes a range.
92///
93/// Due to limitations in the Rust type system, properties available on the underlying
94/// [`SwhGraph`] are not automatically available on [`ContiguousSubgraph`].
95/// They need to be explicitly enabled in a builder-like pattern on `ContiguousSubgraph`
96/// using these methods:
97/// * [`with_maps`][`ContiguousSubgraph::with_maps`],
98/// * [`with_timestamps`][`ContiguousSubgraph::with_timestamps`],
99/// * [`with_persons`][`ContiguousSubgraph::with_persons`],
100/// * [`with_contents`][`ContiguousSubgraph::with_contents`],
101/// * [`with_strings`][`ContiguousSubgraph::with_strings`],
102/// * [`with_label_names`][`ContiguousSubgraph::with_label_names`],
103///
104/// # Examples
105///
106/// Build a `ContiguousSubgraph` made of only content and directory nodes:
107///
108/// ```
109/// use dsi_progress_logger::progress_logger;
110/// use swh_graph::properties;
111/// use swh_graph::NodeConstraint;
112/// use swh_graph::graph::SwhGraphWithProperties;
113/// use swh_graph::views::Subgraph;
114/// use swh_graph::views::contiguous_subgraph::{ContiguousSubgraph, Contraction, ContractionBackend};
115///
116/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
117///         Subgraph<&'_ G, impl Fn(usize) -> bool + use<'_, G>, fn(usize, usize) -> bool>,
118///         impl ContractionBackend,
119///         properties::NoMaps,
120///         properties::NoTimestamps,
121///         properties::NoPersons,
122///         properties::NoContents,
123///         properties::NoStrings,
124///         properties::NoLabelNames,
125///     >
126///     where G: SwhGraphWithProperties<Maps: properties::Maps> + Sync {
127///     let sparse_subgraph = Subgraph::with_node_constraint(graph, "cnt,ori".parse().unwrap());
128///     ContiguousSubgraph::new_from_noncontiguous_graph(sparse_subgraph)
129/// }
130/// ```
131///
132/// The graph created above is slightly suboptimal, as `ContiguousSubgraph` wraps
133/// a `Subgraph`, causing `Subgraph` to add needless overhead by checking that nodes
134/// exist, even though `ContiguousSubgraph` does it too.
135/// This should not be noticeable, but if it is an issue, you can skip the Subgraph
136/// by manually building a [`Contraction`]:
137///
138/// ```
139/// use dsi_progress_logger::progress_logger;
140/// use sux::dict::elias_fano::{EfSeqDict, EliasFanoBuilder};
141/// use swh_graph::properties;
142/// use swh_graph::{NodeType};
143/// use swh_graph::graph::SwhGraphWithProperties;
144/// use swh_graph::views::Subgraph;
145/// use swh_graph::views::contiguous_subgraph::{ContiguousSubgraph, Contraction, ContractionBackend};
146///
147/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
148///         &'_ G,
149///         EfSeqDict,
150///         properties::NoMaps,
151///         properties::NoTimestamps,
152///         properties::NoPersons,
153///         properties::NoContents,
154///         properties::NoStrings,
155///         properties::NoLabelNames,
156///     >
157///     where G: SwhGraphWithProperties<Maps: properties::Maps> {
158///
159///     // compute exact number of nodes in the subgraph, which is required
160///     // by EliasFanoBuilder
161///     let pl = progress_logger!(
162///         item_name = "node",
163///         expected_updates = Some(graph.num_nodes()),
164///     );
165///     let num_nodes = graph
166///         .iter_nodes(pl)
167///         .filter(|&node| match graph.properties().node_type(node) {
168///             NodeType::Content | NodeType::Directory => true,
169///             _ => false,
170///         })
171///         .count();
172///
173///     // compute set of nodes in the subgraph
174///     let mut nodes_efb = EliasFanoBuilder::new(num_nodes, graph.num_nodes());
175///     for node in 0..graph.num_nodes() {
176///         match graph.properties().node_type(node) {
177///             NodeType::Content | NodeType::Directory => nodes_efb.push(node),
178///             _ => (),
179///         }
180///     }
181///     let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
182///
183///     // assemble the subgraph
184///     ContiguousSubgraph::new_from_contraction(graph, contraction)
185/// }
186/// ```
187pub struct ContiguousSubgraph<
188    G: SwhGraph,
189    N: ContractionBackend,
190    MAPS: properties::MaybeMaps,
191    TIMESTAMPS: properties::MaybeTimestamps,
192    PERSONS: properties::MaybePersons,
193    CONTENTS: properties::MaybeContents,
194    STRINGS: properties::MaybeStrings,
195    LABELNAMES: properties::MaybeLabelNames,
196> {
197    inner: Arc<ContiguousSubgraphInner<G, N>>, // TODO: find a way to replace Arc with ouroboros
198    properties:
199        properties::SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>,
200}
201
202impl<G: SwhGraphWithProperties, N: ContractionBackend>
203    ContiguousSubgraph<
204        G,
205        N,
206        properties::NoMaps,
207        properties::NoTimestamps,
208        properties::NoPersons,
209        properties::NoContents,
210        properties::NoStrings,
211        properties::NoLabelNames,
212    >
213{
214    /// Creates a new [`ContiguousSubgraph`] by keeping only nodes in the [`Contraction`]
215    pub fn new_from_contraction(graph: G, contraction: Contraction<N>) -> Self {
216        let path = graph.properties().path.clone();
217        let num_nodes = contraction.num_nodes();
218        let inner = Arc::new(ContiguousSubgraphInner {
219            underlying_graph: graph,
220            contraction,
221        });
222        Self {
223            properties: properties::SwhGraphProperties {
224                path,
225                num_nodes,
226                maps: properties::NoMaps,
227                timestamps: properties::NoTimestamps,
228                persons: properties::NoPersons,
229                contents: properties::NoContents,
230                strings: properties::NoStrings,
231                label_names: properties::NoLabelNames,
232                label_names_are_in_base64_order: Default::default(),
233            },
234            inner,
235        }
236    }
237}
238
239impl<G: SwhGraph, N: ContractionBackend>
240    ContiguousSubgraph<
241        G,
242        N,
243        properties::NoMaps,
244        properties::NoTimestamps,
245        properties::NoPersons,
246        properties::NoContents,
247        properties::NoStrings,
248        properties::NoLabelNames,
249    >
250{
251    /// Returns the graph this graph is a subgraph of
252    ///
253    /// Use [`Self::contraction`] to get the mapping between both sets of node ids
254    pub fn underlying_graph(&self) -> &G {
255        &self.inner.underlying_graph
256    }
257
258    /// The structure used to match the underlying graph's node ids with this graph's node ids
259    pub fn contraction(&self) -> &Contraction<N> {
260        &self.inner.contraction
261    }
262}
263
264impl<G: SwhGraphWithProperties>
265    ContiguousSubgraph<
266        G,
267        EfSeqDict,
268        properties::NoMaps,
269        properties::NoTimestamps,
270        properties::NoPersons,
271        properties::NoContents,
272        properties::NoStrings,
273        properties::NoLabelNames,
274    >
275{
276    /// Creates a new [`ContiguousSubgraph`] from an existing graph with "holes".
277    pub fn new_from_noncontiguous_graph(graph: G) -> Self
278    where
279        G: Sync,
280    {
281        // compute exact number of nodes in the subgraph, which is required
282        // by EliasFanoBuilder
283        let actual_num_nodes = graph.actual_num_nodes().unwrap_or_else(|_| {
284            let mut pl = concurrent_progress_logger!(
285                item_name = "node",
286                expected_updates = Some(graph.num_nodes()),
287            );
288            pl.start("Computing number of nodes");
289            let actual_num_nodes = graph
290                .par_iter_nodes(pl.clone())
291                .filter(|&node| graph.has_node(node))
292                .count();
293            pl.done();
294            actual_num_nodes
295        });
296
297        // compute set of nodes in the subgraph
298        let mut nodes_efb = EliasFanoBuilder::new(actual_num_nodes, graph.num_nodes());
299        for node in 0..graph.num_nodes() {
300            if graph.has_node(node) {
301                nodes_efb.push(node);
302            }
303        }
304        let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
305
306        Self::new_from_contraction(graph, contraction)
307    }
308}
309
310// content of ContiguousSubgraph that must be wrapped in an Arc in order to make
311// ContiguousSubgraphMaps live as long as G (which is an accidental requirement of
312// SwhGraphWithProperties, which doesn't seem to be removable without making its
313// API painfully hard to use).
314struct ContiguousSubgraphInner<G: SwhGraph, N: ContractionBackend> {
315    underlying_graph: G,
316    contraction: Contraction<N>,
317}
318
319impl<
320        G: SwhGraph,
321        N: ContractionBackend,
322        MAPS: properties::MaybeMaps,
323        TIMESTAMPS: properties::MaybeTimestamps,
324        PERSONS: properties::MaybePersons,
325        CONTENTS: properties::MaybeContents,
326        STRINGS: properties::MaybeStrings,
327        LABELNAMES: properties::MaybeLabelNames,
328    > SwhGraph
329    for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
330{
331    #[inline(always)]
332    fn path(&self) -> &Path {
333        self.inner.underlying_graph.path()
334    }
335    #[inline(always)]
336    fn is_transposed(&self) -> bool {
337        self.inner.underlying_graph.is_transposed()
338    }
339    #[inline(always)]
340    // Note: this can be an overapproximation if the underlying graph is a subgraph
341    fn num_nodes(&self) -> usize {
342        self.inner.contraction.num_nodes()
343    }
344    fn has_node(&self, node_id: NodeId) -> bool {
345        node_id < self.num_nodes()
346            && self
347                .inner
348                .underlying_graph
349                .has_node(self.inner.contraction.underlying_node_id(node_id))
350    }
351    #[inline(always)]
352    // Note: this return the number or arcs in the original graph, before
353    // subgraph filtering.
354    fn num_arcs(&self) -> u64 {
355        self.inner.underlying_graph.num_arcs()
356    }
357    fn num_nodes_by_type(&self) -> Result<HashMap<NodeType, usize>> {
358        bail!("num_nodes_by_type is not supported by ContiguousSubgraph")
359    }
360    fn num_arcs_by_type(&self) -> Result<HashMap<(NodeType, NodeType), usize>> {
361        bail!("num_arcs_by_type is not supported by ContiguousSubgraph")
362    }
363    fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
364        self.inner.underlying_graph.has_arc(
365            self.inner.contraction.underlying_node_id(src_node_id),
366            self.inner.contraction.underlying_node_id(dst_node_id),
367        )
368    }
369}
370
371impl<
372        G: SwhGraphWithProperties,
373        N: ContractionBackend,
374        MAPS: properties::MaybeMaps,
375        TIMESTAMPS: properties::MaybeTimestamps,
376        PERSONS: properties::MaybePersons,
377        CONTENTS: properties::MaybeContents,
378        STRINGS: properties::MaybeStrings,
379        LABELNAMES: properties::MaybeLabelNames,
380    > SwhGraphWithProperties
381    for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
382{
383    type Maps = MAPS;
384    type Timestamps = TIMESTAMPS;
385    type Persons = PERSONS;
386    type Contents = CONTENTS;
387    type Strings = STRINGS;
388    type LabelNames = LABELNAMES;
389
390    #[inline(always)]
391    fn properties(
392        &self,
393    ) -> &properties::SwhGraphProperties<
394        Self::Maps,
395        Self::Timestamps,
396        Self::Persons,
397        Self::Contents,
398        Self::Strings,
399        Self::LabelNames,
400    > {
401        &self.properties
402    }
403}