swh_graph/views/contiguous_subgraph/
mod.rs

1// Copyright (C) 2023-2025  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::collections::HashMap;
7use std::path::Path;
8use std::sync::Arc;
9
10use anyhow::{bail, Result};
11use dsi_progress_logger::{concurrent_progress_logger, ProgressLog};
12use rayon::prelude::*;
13use sux::dict::elias_fano::{EfSeqDict, EliasFanoBuilder};
14use sux::prelude::{IndexedDict, IndexedSeq};
15
16use crate::graph::*;
17use crate::mph::SwhidMphf;
18use crate::properties;
19use crate::{NodeType, OutOfBoundError, SWHID};
20
21mod contents;
22mod iterators;
23mod label_names;
24mod maps;
25mod persons;
26mod strings;
27mod timestamps;
28
29/// Alias for [`IndexedSeq`] + [`IndexedDict`] mapping from [`NodeId`] to [`NodeId`].
30pub trait ContractionBackend:
31    IndexedSeq<Input = NodeId, Output = NodeId> + IndexedDict<Input = NodeId, Output = NodeId>
32{
33}
34
35impl<
36        B: IndexedSeq<Input = NodeId, Output = NodeId> + IndexedDict<Input = NodeId, Output = NodeId>,
37    > ContractionBackend for B
38{
39}
40
41/// See [`ContiguousSubgraph`]
42pub struct Contraction<N: IndexedSeq<Input = NodeId, Output = NodeId>>(pub N);
43
44impl<N: IndexedSeq<Input = NodeId, Output = NodeId>> Contraction<N> {
45    /// Given a node id in a [`ContiguousSubgraph`], returns the corresponding node id
46    /// in the [`ContiguousSubgraph`]'s underlying graph
47    #[inline(always)]
48    pub fn underlying_node_id(&self, self_node: NodeId) -> NodeId {
49        self.0.get(self_node)
50    }
51
52    /// Returns the number of node ids in the [`ContiguousSubgraph`].
53    ///
54    /// Note: this can be an overapproximation if the underlying graph is a subgraph
55    pub fn num_nodes(&self) -> usize {
56        self.0.len()
57    }
58}
59
60impl<N: ContractionBackend> Contraction<N> {
61    /// Given a node id in a [`ContiguousSubgraph`]'s underlying graph, returns the
62    /// corresponding node id in the [`ContiguousSubgraph`]
63    #[inline(always)]
64    pub fn node_id_from_underlying(&self, underlying_node: NodeId) -> Option<NodeId> {
65        self.0.index_of(underlying_node)
66    }
67}
68
69/// A view over [`SwhGraph`] and related traits, that filters out some nodes, and renumbers
70/// remaining nodes so that the set of valid nodes becomes a range.
71///
72/// Due to limitations in the Rust type system, properties available on the underlying
73/// [`SwhGraph`] are not automatically available on [`ContiguousSubgraph`].
74/// They need to be explicitly enabled in a builder-like pattern on `ContiguousSubgraph`
75/// using these methods:
76/// * [`with_maps`][`ContiguousSubgraph::with_maps`],
77/// * [`with_timestamps`][`ContiguousSubgraph::with_timestamps`],
78/// * [`with_persons`][`ContiguousSubgraph::with_persons`],
79/// * [`with_contents`][`ContiguousSubgraph::with_contents`],
80/// * [`with_strings`][`ContiguousSubgraph::with_strings`],
81/// * [`with_label_names`][`ContiguousSubgraph::with_label_names`],
82///
83/// # Examples
84///
85/// Build a `ContiguousSubgraph` made of only content and directory nodes:
86///
87/// ```
88/// use dsi_progress_logger::progress_logger;
89/// use swh_graph::properties;
90/// use swh_graph::NodeConstraint;
91/// use swh_graph::graph::SwhGraphWithProperties;
92/// use swh_graph::views::{ContiguousSubgraph, Contraction, Subgraph, ContractionBackend};
93///
94/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
95///         Subgraph<&'_ G, impl Fn(usize) -> bool + use<'_, G>, fn(usize, usize) -> bool>,
96///         impl ContractionBackend,
97///         properties::NoMaps,
98///         properties::NoTimestamps,
99///         properties::NoPersons,
100///         properties::NoContents,
101///         properties::NoStrings,
102///         properties::NoLabelNames,
103///     >
104///     where G: SwhGraphWithProperties<Maps: properties::Maps> + Sync {
105///     let sparse_subgraph = Subgraph::with_node_constraint(graph, "cnt,ori".parse().unwrap());
106///     ContiguousSubgraph::new_from_noncontiguous_graph(sparse_subgraph)
107/// }
108/// ```
109///
110/// The graph created above is slightly suboptimal, as `ContiguousSubgraph` wraps
111/// a `Subgraph`, causing `Subgraph` to add needless overhead by checking that nodes
112/// exist, even though `ContiguousSubgraph` does it too.
113/// This should not be noticeable, but if it is an issue, you can skip the Subgraph
114/// by manually building a [`Contraction`]:
115///
116/// ```
117/// use dsi_progress_logger::progress_logger;
118/// use sux::dict::elias_fano::{EfSeqDict, EliasFanoBuilder};
119/// use swh_graph::properties;
120/// use swh_graph::{NodeType};
121/// use swh_graph::graph::SwhGraphWithProperties;
122/// use swh_graph::views::{ContiguousSubgraph, Contraction, Subgraph, ContractionBackend};
123///
124/// fn filesystem_subgraph<G>(graph: &G) -> ContiguousSubgraph<
125///         &'_ G,
126///         EfSeqDict,
127///         properties::NoMaps,
128///         properties::NoTimestamps,
129///         properties::NoPersons,
130///         properties::NoContents,
131///         properties::NoStrings,
132///         properties::NoLabelNames,
133///     >
134///     where G: SwhGraphWithProperties<Maps: properties::Maps> {
135///
136///     // compute exact number of nodes in the subgraph, which is required
137///     // by EliasFanoBuilder
138///     let pl = progress_logger!(
139///         item_name = "node",
140///         expected_updates = Some(graph.num_nodes()),
141///     );
142///     let num_nodes = graph
143///         .iter_nodes(pl)
144///         .filter(|&node| match graph.properties().node_type(node) {
145///             NodeType::Content | NodeType::Directory => true,
146///             _ => false,
147///         })
148///         .count();
149///
150///     // compute set of nodes in the subgraph
151///     let mut nodes_efb = EliasFanoBuilder::new(num_nodes, graph.num_nodes());
152///     for node in 0..graph.num_nodes() {
153///         match graph.properties().node_type(node) {
154///             NodeType::Content | NodeType::Directory => nodes_efb.push(node),
155///             _ => (),
156///         }
157///     }
158///     let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
159///
160///     // assemble the subgraph
161///     ContiguousSubgraph::new_from_contraction(graph, contraction)
162/// }
163/// ```
164pub struct ContiguousSubgraph<
165    G: SwhGraph,
166    N: ContractionBackend,
167    MAPS: properties::MaybeMaps,
168    TIMESTAMPS: properties::MaybeTimestamps,
169    PERSONS: properties::MaybePersons,
170    CONTENTS: properties::MaybeContents,
171    STRINGS: properties::MaybeStrings,
172    LABELNAMES: properties::MaybeLabelNames,
173> {
174    inner: Arc<ContiguousSubgraphInner<G, N>>, // TODO: find a way to replace Arc with ouroboros
175    properties:
176        properties::SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>,
177}
178
179impl<G: SwhGraphWithProperties, N: ContractionBackend>
180    ContiguousSubgraph<
181        G,
182        N,
183        properties::NoMaps,
184        properties::NoTimestamps,
185        properties::NoPersons,
186        properties::NoContents,
187        properties::NoStrings,
188        properties::NoLabelNames,
189    >
190{
191    /// Creates a new [`ContiguousSubgraph`] by keeping only nodes in the [`Contraction`]
192    pub fn new_from_contraction(graph: G, contraction: Contraction<N>) -> Self {
193        let path = graph.properties().path.clone();
194        let num_nodes = contraction.num_nodes();
195        let inner = Arc::new(ContiguousSubgraphInner {
196            underlying_graph: graph,
197            contraction,
198        });
199        Self {
200            properties: properties::SwhGraphProperties {
201                path,
202                num_nodes,
203                maps: properties::NoMaps,
204                timestamps: properties::NoTimestamps,
205                persons: properties::NoPersons,
206                contents: properties::NoContents,
207                strings: properties::NoStrings,
208                label_names: properties::NoLabelNames,
209                label_names_are_in_base64_order: Default::default(),
210            },
211            inner,
212        }
213    }
214}
215
216impl<G: SwhGraph, N: ContractionBackend>
217    ContiguousSubgraph<
218        G,
219        N,
220        properties::NoMaps,
221        properties::NoTimestamps,
222        properties::NoPersons,
223        properties::NoContents,
224        properties::NoStrings,
225        properties::NoLabelNames,
226    >
227{
228    /// Returns the graph this graph is a subgraph of
229    ///
230    /// Use [`Self::contraction`] to get the mapping between both sets of node ids
231    pub fn underlying_graph(&self) -> &G {
232        &self.inner.underlying_graph
233    }
234
235    /// The structure used to match the underlying graph's node ids with this graph's node ids
236    pub fn contraction(&self) -> &Contraction<N> {
237        &self.inner.contraction
238    }
239}
240
241impl<G: SwhGraphWithProperties>
242    ContiguousSubgraph<
243        G,
244        EfSeqDict,
245        properties::NoMaps,
246        properties::NoTimestamps,
247        properties::NoPersons,
248        properties::NoContents,
249        properties::NoStrings,
250        properties::NoLabelNames,
251    >
252{
253    /// Creates a new [`ContiguousSubgraph`] from an existing graph with "holes".
254    pub fn new_from_noncontiguous_graph(graph: G) -> Self
255    where
256        G: Sync,
257    {
258        // compute exact number of nodes in the subgraph, which is required
259        // by EliasFanoBuilder
260        let actual_num_nodes = graph.actual_num_nodes().unwrap_or_else(|_| {
261            let mut pl = concurrent_progress_logger!(
262                item_name = "node",
263                expected_updates = Some(graph.num_nodes()),
264            );
265            pl.start("Computing number of nodes");
266            let actual_num_nodes = graph
267                .par_iter_nodes(pl.clone())
268                .filter(|&node| graph.has_node(node))
269                .count();
270            pl.done();
271            actual_num_nodes
272        });
273
274        // compute set of nodes in the subgraph
275        let mut nodes_efb = EliasFanoBuilder::new(actual_num_nodes, graph.num_nodes());
276        for node in 0..graph.num_nodes() {
277            if graph.has_node(node) {
278                nodes_efb.push(node);
279            }
280        }
281        let contraction = Contraction(nodes_efb.build_with_seq_and_dict());
282
283        Self::new_from_contraction(graph, contraction)
284    }
285}
286
287// content of ContiguousSubgraph that must be wrapped in an Arc in order to make
288// ContiguousSubgraphMaps live as long as G (which is an accidental requirement of
289// SwhGraphWithProperties, which doesn't seem to be removable without making its
290// API painfully hard to use).
291struct ContiguousSubgraphInner<G: SwhGraph, N: ContractionBackend> {
292    underlying_graph: G,
293    contraction: Contraction<N>,
294}
295
296impl<
297        G: SwhGraph,
298        N: ContractionBackend,
299        MAPS: properties::MaybeMaps,
300        TIMESTAMPS: properties::MaybeTimestamps,
301        PERSONS: properties::MaybePersons,
302        CONTENTS: properties::MaybeContents,
303        STRINGS: properties::MaybeStrings,
304        LABELNAMES: properties::MaybeLabelNames,
305    > SwhGraph
306    for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
307{
308    #[inline(always)]
309    fn path(&self) -> &Path {
310        self.inner.underlying_graph.path()
311    }
312    #[inline(always)]
313    fn is_transposed(&self) -> bool {
314        self.inner.underlying_graph.is_transposed()
315    }
316    #[inline(always)]
317    // Note: this can be an overapproximation if the underlying graph is a subgraph
318    fn num_nodes(&self) -> usize {
319        self.inner.contraction.num_nodes()
320    }
321    fn has_node(&self, node_id: NodeId) -> bool {
322        node_id < self.num_nodes()
323            && self
324                .inner
325                .underlying_graph
326                .has_node(self.inner.contraction.underlying_node_id(node_id))
327    }
328    #[inline(always)]
329    // Note: this return the number or arcs in the original graph, before
330    // subgraph filtering.
331    fn num_arcs(&self) -> u64 {
332        self.inner.underlying_graph.num_arcs()
333    }
334    fn num_nodes_by_type(&self) -> Result<HashMap<NodeType, usize>> {
335        bail!("num_nodes_by_type is not supported by ContiguousSubgraph")
336    }
337    fn num_arcs_by_type(&self) -> Result<HashMap<(NodeType, NodeType), usize>> {
338        bail!("num_arcs_by_type is not supported by ContiguousSubgraph")
339    }
340    fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
341        self.inner.underlying_graph.has_arc(
342            self.inner.contraction.underlying_node_id(src_node_id),
343            self.inner.contraction.underlying_node_id(dst_node_id),
344        )
345    }
346}
347
348impl<
349        G: SwhGraphWithProperties,
350        N: ContractionBackend,
351        MAPS: properties::MaybeMaps,
352        TIMESTAMPS: properties::MaybeTimestamps,
353        PERSONS: properties::MaybePersons,
354        CONTENTS: properties::MaybeContents,
355        STRINGS: properties::MaybeStrings,
356        LABELNAMES: properties::MaybeLabelNames,
357    > SwhGraphWithProperties
358    for ContiguousSubgraph<G, N, MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
359{
360    type Maps = MAPS;
361    type Timestamps = TIMESTAMPS;
362    type Persons = PERSONS;
363    type Contents = CONTENTS;
364    type Strings = STRINGS;
365    type LabelNames = LABELNAMES;
366
367    #[inline(always)]
368    fn properties(
369        &self,
370    ) -> &properties::SwhGraphProperties<
371        Self::Maps,
372        Self::Timestamps,
373        Self::Persons,
374        Self::Contents,
375        Self::Strings,
376        Self::LabelNames,
377    > {
378        &self.properties
379    }
380}