swh_graph/
graph.rs

1// Copyright (C) 2023-2025  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Structures to manipulate the Software Heritage graph
7//!
8//! In order to load only what is necessary, these structures are initially created
9//! by calling [`SwhUnidirectionalGraph::new`] or [`SwhBidirectionalGraph::new`], then calling methods
10//! on them to progressively load additional data (`load_properties`, `load_all_properties`,
11//! `load_labels`)
12
13#![allow(clippy::type_complexity)]
14
15use std::borrow::Borrow;
16use std::collections::HashMap;
17use std::iter::Iterator;
18use std::path::{Path, PathBuf};
19
20use anyhow::{anyhow, bail, Context, Result};
21use dsi_bitstream::prelude::BE;
22use dsi_progress_logger::{ConcurrentProgressLog, ProgressLog};
23use rayon::prelude::*;
24use webgraph::graphs::vec_graph::LabeledVecGraph;
25use webgraph::prelude::*;
26
27use crate::arc_iterators::{
28    DelabelingIterator, LabelTypingSuccessorIterator, LabeledArcIterator, LabeledSuccessorIterator,
29};
30// Useful trait to import as part of `use swh_graph::graph::*;`
31pub use crate::arc_iterators::IntoFlattenedLabeledArcsIterator;
32use crate::labeling::SwhLabeling;
33use crate::labels::{EdgeLabel, UntypedEdgeLabel};
34use crate::mph::LoadableSwhidMphf;
35use crate::properties;
36pub use crate::underlying_graph::DefaultUnderlyingGraph;
37use crate::utils::shuffle::par_iter_shuffled_range;
38use crate::utils::suffix_path;
39use crate::NodeType;
40
41/// Alias for [`usize`], which may become a newtype in a future version.
42pub type NodeId = usize;
43
44/// Supertrait of [`RandomAccessLabeling`] with methods to act like a [`RandomAccessGraph`].
45///
46/// If `Self` implements [`RandomAccessGraph`], then this is implemented as no-ops.
47/// Otherwise, it "unpeels" layers of zipping until it reaches the graph at the bottom:
48///
49/// - If `Self` is `Zip<L, R>`, this defers to `L` (aka. `Left<Zip<L, R>`).
50/// - If `Self` is `VecGraph` or `LabeledVecGraph<_>, this does the equivalent of deferring to `VecGraph`
51///   (through [`DelabelingIterator`] because it cannot create a new `VecGraph` without copying)
52pub trait UnderlyingGraph: RandomAccessLabeling {
53    type UnlabeledSuccessors<'succ>: IntoIterator<Item = NodeId>
54    where
55        Self: 'succ;
56
57    /// Workaround for some implementations of `<Self as RandomAccessLabeling>::num_arcs`
58    /// being missing
59    ///
60    /// `Zip::num_arcs` runs `assert_eq!(self.0.num_arcs(), self.1.num_arcs());`
61    /// but `BitStreamLabeling::num_arcs` always panics as of
62    /// f460742fe0f776df2248a5f09a3425b81eb70b07, so we can't use that.
63    fn num_arcs(&self) -> u64;
64    fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool;
65    fn unlabeled_successors(&self, node_id: NodeId) -> Self::UnlabeledSuccessors<'_>;
66}
67
68impl<F: RandomAccessDecoderFactory> UnderlyingGraph for BvGraph<F> {
69    type UnlabeledSuccessors<'succ>
70        = <Self as RandomAccessLabeling>::Labels<'succ>
71    where
72        Self: 'succ;
73
74    fn num_arcs(&self) -> u64 {
75        <Self as RandomAccessLabeling>::num_arcs(self)
76    }
77    fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
78        <Self as RandomAccessGraph>::has_arc(self, src_node_id, dst_node_id)
79    }
80    fn unlabeled_successors(&self, node_id: NodeId) -> Self::UnlabeledSuccessors<'_> {
81        <Self as RandomAccessGraph>::successors(self, node_id)
82    }
83}
84
85impl<G: UnderlyingGraph, L: RandomAccessLabeling> UnderlyingGraph for Zip<G, L> {
86    type UnlabeledSuccessors<'succ>
87        = <G as UnderlyingGraph>::UnlabeledSuccessors<'succ>
88    where
89        Self: 'succ;
90
91    fn num_arcs(&self) -> u64 {
92        <G as UnderlyingGraph>::num_arcs(&self.0)
93    }
94    fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
95        self.0.has_arc(src_node_id, dst_node_id)
96    }
97    fn unlabeled_successors(&self, node_id: NodeId) -> Self::UnlabeledSuccessors<'_> {
98        self.0.unlabeled_successors(node_id)
99    }
100}
101
102impl UnderlyingGraph for VecGraph {
103    type UnlabeledSuccessors<'succ>
104        = <Self as RandomAccessLabeling>::Labels<'succ>
105    where
106        Self: 'succ;
107
108    fn num_arcs(&self) -> u64 {
109        <Self as RandomAccessLabeling>::num_arcs(self)
110    }
111    fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
112        <Self as RandomAccessGraph>::has_arc(self, src_node_id, dst_node_id)
113    }
114    fn unlabeled_successors(&self, node_id: NodeId) -> Self::UnlabeledSuccessors<'_> {
115        <Self as RandomAccessGraph>::successors(self, node_id)
116    }
117}
118
119impl<L: Clone> UnderlyingGraph for LabeledVecGraph<L> {
120    type UnlabeledSuccessors<'succ>
121        = DelabelingIterator<<Self as RandomAccessLabeling>::Labels<'succ>>
122    where
123        Self: 'succ;
124
125    fn num_arcs(&self) -> u64 {
126        <Self as RandomAccessLabeling>::num_arcs(self)
127    }
128    fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
129        for succ in self.unlabeled_successors(src_node_id) {
130            if succ == dst_node_id {
131                return true;
132            }
133        }
134        false
135    }
136    fn unlabeled_successors(&self, node_id: NodeId) -> Self::UnlabeledSuccessors<'_> {
137        DelabelingIterator {
138            successors: <Self as RandomAccessLabeling>::labels(self, node_id),
139        }
140    }
141}
142
143pub trait SwhGraph {
144    /// Return the base path of the graph
145    fn path(&self) -> &Path;
146
147    /// Returns whether the graph is in the `ori->snp->rel,rev->dir->cnt` direction
148    /// (with a few `dir->rev` arcs)
149    fn is_transposed(&self) -> bool;
150
151    /// Return the largest node id in the graph plus one.
152    ///
153    /// For graphs directly loaded from disk this is the actual number of nodes,
154    /// but it is an overestimate for some graphs (eg. [`Subgraph`](crate::views::Subgraph)).
155    ///
156    /// Use [`actual_num_nodes`](Self::actual_num_nodes) if you need the exact number of
157    /// nodes in the graph.
158    fn num_nodes(&self) -> usize;
159
160    /// Returns whether the given node id exists in the graph
161    ///
162    /// This is usually true iff `node_id < self.num_nodes()`, but may be false
163    /// when using a filtered view such as [`Subgraph`](crate::views::Subgraph).
164    fn has_node(&self, node_id: NodeId) -> bool {
165        node_id < self.num_nodes()
166    }
167
168    /// Return the number of arcs in the graph.
169    fn num_arcs(&self) -> u64;
170
171    /// Returns the number of nodes of each type, if known.
172    fn num_nodes_by_type(&self) -> Result<HashMap<NodeType, usize>> {
173        let path = self.path().with_extension("nodes.stats.txt");
174        std::fs::read_to_string(&path)
175            .with_context(|| format!("Could not read {}", path.display()))?
176            .lines()
177            .map(|line| {
178                let (type_, count) = line
179                    .split_once(' ')
180                    .with_context(|| format!("Unexpected line in {}: {}", path.display(), line))?;
181                let type_: NodeType = type_
182                    .parse()
183                    .map_err(|_| anyhow!("Unknown node type in {}: {}", path.display(), type_))?;
184                let count = count
185                    .parse()
186                    .map_err(|_| anyhow!("Invalid integer in {}: {}", path.display(), count))?;
187                Ok((type_, count))
188            })
189            .collect()
190    }
191    /// Returns the number of nodes in the graph, if known.
192    fn actual_num_nodes(&self) -> Result<usize> {
193        Ok(self.num_nodes_by_type()?.values().sum())
194    }
195    /// Returns the number of arcs of each type, if known.
196    fn num_arcs_by_type(&self) -> Result<HashMap<(NodeType, NodeType), usize>> {
197        let path = self.path().with_extension("edges.stats.txt");
198        std::fs::read_to_string(&path)
199            .with_context(|| format!("Could not read {}", path.display()))?
200            .lines()
201            .map(|line| {
202                let (type_, count) = line
203                    .split_once(' ')
204                    .with_context(|| format!("Unexpected line in {}: {}", path.display(), line))?;
205                let (src_type, dst_type) = type_
206                    .split_once(':')
207                    .with_context(|| format!("Unexpected line in {}: {}", path.display(), line))?;
208                let src_type: NodeType = src_type.parse().map_err(|_| {
209                    anyhow!("Unknown node type in {}: {}", path.display(), src_type)
210                })?;
211                let dst_type: NodeType = dst_type.parse().map_err(|_| {
212                    anyhow!("Unknown node type in {}: {}", path.display(), dst_type)
213                })?;
214                let count = count
215                    .parse()
216                    .map_err(|_| anyhow!("Invalid integer in {}: {}", path.display(), count))?;
217                Ok(((src_type, dst_type), count))
218            })
219            .collect()
220    }
221
222    /// Return whether there is an arc going from `src_node_id` to `dst_node_id`.
223    fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool;
224
225    /// Returns an iterator on all the nodes
226    ///
227    /// Order is not guaranteed.
228    ///
229    /// Updates the progress logger on every node id from 0 to `self.num_nodes()`,
230    /// even those that are filtered out.
231    fn iter_nodes<'a>(
232        &'a self,
233        mut pl: impl ProgressLog + 'a,
234    ) -> impl Iterator<Item = NodeId> + 'a {
235        (0..self.num_nodes())
236            .inspect(move |_| pl.light_update())
237            .filter(|&node_id| self.has_node(node_id))
238    }
239    /// Returns a parallel iterator on all the nodes
240    ///
241    /// Order is not guaranteed.
242    ///
243    /// Updates the progress logger on every node id from 0 to `self.num_nodes()`,
244    /// even those that are filtered out.
245    fn par_iter_nodes<'a>(
246        &'a self,
247        pl: impl ConcurrentProgressLog + 'a,
248    ) -> impl ParallelIterator<Item = NodeId> + 'a
249    where
250        Self: Sync,
251    {
252        par_iter_shuffled_range(0..self.num_nodes())
253            .map_with(pl, move |pl, node_id| {
254                pl.update();
255                node_id
256            })
257            .filter(|&node_id| self.has_node(node_id))
258    }
259}
260
261pub trait SwhForwardGraph: SwhGraph {
262    type Successors<'succ>: IntoIterator<Item = usize>
263    where
264        Self: 'succ;
265
266    /// Return an [`IntoIterator`] over the successors of a node.
267    fn successors(&self, node_id: NodeId) -> Self::Successors<'_>;
268    /// Return the number of successors of a node.
269    fn outdegree(&self, node_id: NodeId) -> usize;
270}
271
272pub trait SwhLabeledForwardGraph: SwhForwardGraph {
273    type LabeledArcs<'arc>: IntoIterator<Item = UntypedEdgeLabel>
274    where
275        Self: 'arc;
276    type LabeledSuccessors<'node>: IntoIterator<Item = (usize, Self::LabeledArcs<'node>)>
277        + IntoFlattenedLabeledArcsIterator<UntypedEdgeLabel>
278    where
279        Self: 'node;
280
281    /// Return an [`IntoIterator`] over the successors of a node along with a list of labels
282    /// of each arc
283    fn untyped_labeled_successors(&self, node_id: NodeId) -> Self::LabeledSuccessors<'_>;
284
285    /// Return an [`IntoIterator`] over the successors of a node along with a list of labels
286    /// of each arc
287    fn labeled_successors(
288        &self,
289        node_id: NodeId,
290    ) -> impl Iterator<Item = (usize, impl Iterator<Item = EdgeLabel>)>
291           + IntoFlattenedLabeledArcsIterator<EdgeLabel>
292    where
293        Self: SwhGraphWithProperties + Sized,
294        <Self as SwhGraphWithProperties>::Maps: crate::properties::Maps,
295    {
296        LabelTypingSuccessorIterator {
297            graph: self,
298            is_transposed: false,
299            src: node_id,
300            successors: self.untyped_labeled_successors(node_id).into_iter(),
301        }
302    }
303}
304
305pub trait SwhBackwardGraph: SwhGraph {
306    type Predecessors<'succ>: IntoIterator<Item = usize>
307    where
308        Self: 'succ;
309
310    /// Return an [`IntoIterator`] over the predecessors of a node.
311    fn predecessors(&self, node_id: NodeId) -> Self::Predecessors<'_>;
312    /// Return the number of predecessors of a node.
313    fn indegree(&self, node_id: NodeId) -> usize;
314}
315
316pub trait SwhLabeledBackwardGraph: SwhBackwardGraph {
317    type LabeledArcs<'arc>: IntoIterator<Item = UntypedEdgeLabel>
318    where
319        Self: 'arc;
320    type LabeledPredecessors<'node>: IntoIterator<Item = (usize, Self::LabeledArcs<'node>)>
321        + IntoFlattenedLabeledArcsIterator<UntypedEdgeLabel>
322    where
323        Self: 'node;
324
325    /// Return an [`IntoIterator`] over the predecessors of a node along with a list of labels
326    /// of each arc
327    fn untyped_labeled_predecessors(&self, node_id: NodeId) -> Self::LabeledPredecessors<'_>;
328
329    /// Return an [`IntoIterator`] over the predecessors of a node along with a list of labels
330    /// of each arc
331    fn labeled_predecessors(
332        &self,
333        node_id: NodeId,
334    ) -> impl Iterator<Item = (usize, impl Iterator<Item = crate::labels::EdgeLabel>)>
335    where
336        Self: SwhGraphWithProperties + Sized,
337        <Self as SwhGraphWithProperties>::Maps: crate::properties::Maps,
338    {
339        LabelTypingSuccessorIterator {
340            graph: self,
341            is_transposed: true,
342            src: node_id,
343            successors: self.untyped_labeled_predecessors(node_id).into_iter(),
344        }
345    }
346}
347
348pub trait SwhGraphWithProperties: SwhGraph {
349    type Maps: properties::MaybeMaps;
350    type Timestamps: properties::MaybeTimestamps;
351    type Persons: properties::MaybePersons;
352    type Contents: properties::MaybeContents;
353    type Strings: properties::MaybeStrings;
354    type LabelNames: properties::MaybeLabelNames;
355
356    fn properties(
357        &self,
358    ) -> &properties::SwhGraphProperties<
359        Self::Maps,
360        Self::Timestamps,
361        Self::Persons,
362        Self::Contents,
363        Self::Strings,
364        Self::LabelNames,
365    >;
366}
367
368/// Alias for structures representing a graph with all arcs, arc labels, and node properties
369/// loaded.
370pub trait SwhFullGraph:
371    SwhLabeledForwardGraph
372    + SwhLabeledBackwardGraph
373    + SwhGraphWithProperties<
374        Maps: crate::properties::Maps,
375        Timestamps: crate::properties::Timestamps,
376        Persons: crate::properties::Persons,
377        Contents: crate::properties::Contents,
378        Strings: crate::properties::Strings,
379        LabelNames: crate::properties::LabelNames,
380    >
381{
382}
383
384impl<
385        G: SwhLabeledForwardGraph
386            + SwhLabeledBackwardGraph
387            + SwhGraphWithProperties<
388                Maps: crate::properties::Maps,
389                Timestamps: crate::properties::Timestamps,
390                Persons: crate::properties::Persons,
391                Contents: crate::properties::Contents,
392                Strings: crate::properties::Strings,
393                LabelNames: crate::properties::LabelNames,
394            >,
395    > SwhFullGraph for G
396{
397}
398
399/// Class representing the compressed Software Heritage graph in a single direction.
400///
401/// Type parameters:
402///
403/// * `P` is either `()` or `properties::SwhGraphProperties`, manipulated using
404///   [`load_properties`](SwhUnidirectionalGraph::load_properties) and
405///   [`load_all_properties`](SwhUnidirectionalGraph::load_all_properties)
406/// * G is the forward graph (either [`BvGraph`], or `Zip<BvGraph, SwhLabeling>`
407///   [`load_labels`](SwhUnidirectionalGraph::load_labels)
408pub struct SwhUnidirectionalGraph<P, G: UnderlyingGraph = DefaultUnderlyingGraph> {
409    basepath: PathBuf,
410    graph: G,
411    properties: P,
412}
413
414impl SwhUnidirectionalGraph<()> {
415    pub fn new(basepath: impl AsRef<Path>) -> Result<Self> {
416        let basepath = basepath.as_ref().to_owned();
417        let graph = DefaultUnderlyingGraph(
418            BvGraph::with_basename(&basepath)
419                .endianness::<BE>()
420                .flags(MemoryFlags::TRANSPARENT_HUGE_PAGES | MemoryFlags::RANDOM_ACCESS)
421                .load()?,
422        );
423        Ok(Self::from_underlying_graph(basepath, graph))
424    }
425}
426
427impl<G: UnderlyingGraph> SwhUnidirectionalGraph<(), G> {
428    pub fn from_underlying_graph(basepath: PathBuf, graph: G) -> Self {
429        SwhUnidirectionalGraph {
430            basepath,
431            graph,
432            properties: (),
433        }
434    }
435}
436
437impl<P, G: UnderlyingGraph> SwhGraph for SwhUnidirectionalGraph<P, G> {
438    fn path(&self) -> &Path {
439        self.basepath.as_path()
440    }
441
442    fn is_transposed(&self) -> bool {
443        // Technically, users can load the 'graph-transposed' directly.
444        // However, unless they rename files, this will fail to load properties, because
445        // properties' file names wouldn't match the base path.
446        // As 'is_transposed' is only useful when checking node types (of an arc),
447        // this function is unlikely to be called in that scenario, so this should be fine.
448        false
449    }
450
451    fn num_nodes(&self) -> usize {
452        self.graph.num_nodes()
453    }
454
455    fn num_arcs(&self) -> u64 {
456        UnderlyingGraph::num_arcs(&self.graph)
457    }
458
459    fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
460        self.graph.has_arc(src_node_id, dst_node_id)
461    }
462}
463
464impl<P, G: UnderlyingGraph> SwhForwardGraph for SwhUnidirectionalGraph<P, G> {
465    type Successors<'succ>
466        = <G as UnderlyingGraph>::UnlabeledSuccessors<'succ>
467    where
468        Self: 'succ;
469
470    /// Return an [`IntoIterator`] over the successors of a node.
471    fn successors(&self, node_id: NodeId) -> Self::Successors<'_> {
472        self.graph.unlabeled_successors(node_id)
473    }
474
475    /// Return the number of successors of a node.
476    fn outdegree(&self, node_id: NodeId) -> usize {
477        self.graph.outdegree(node_id)
478    }
479}
480
481impl<P, G: UnderlyingGraph> SwhLabeledForwardGraph for SwhUnidirectionalGraph<P, G>
482where
483    <G as SequentialLabeling>::Label: Pair<Left = NodeId, Right: IntoIterator<Item: Borrow<u64>>>,
484    for<'succ> <G as RandomAccessLabeling>::Labels<'succ>:
485        Iterator<Item = (usize, <<G as SequentialLabeling>::Label as Pair>::Right)>,
486{
487    type LabeledArcs<'arc> = LabeledArcIterator<<<<<G as RandomAccessLabeling>::Labels<'arc> as Iterator>::Item as Pair>::Right as IntoIterator>::IntoIter> where Self: 'arc;
488    type LabeledSuccessors<'succ>
489        = LabeledSuccessorIterator<<G as RandomAccessLabeling>::Labels<'succ>>
490    where
491        Self: 'succ;
492
493    fn untyped_labeled_successors(&self, node_id: NodeId) -> Self::LabeledSuccessors<'_> {
494        LabeledSuccessorIterator {
495            successors: self.graph.labels(node_id),
496        }
497    }
498}
499
500impl<
501        M: properties::MaybeMaps,
502        T: properties::MaybeTimestamps,
503        P: properties::MaybePersons,
504        C: properties::MaybeContents,
505        S: properties::MaybeStrings,
506        N: properties::MaybeLabelNames,
507        G: UnderlyingGraph,
508    > SwhUnidirectionalGraph<properties::SwhGraphProperties<M, T, P, C, S, N>, G>
509{
510    /// Enriches the graph with more properties mmapped from disk
511    ///
512    /// # Example
513    ///
514    /// ```no_run
515    /// # use std::path::PathBuf;
516    /// use swh_graph::java_compat::mph::gov::GOVMPH;
517    ///
518    /// swh_graph::graph::SwhUnidirectionalGraph::new(PathBuf::from("./graph"))
519    ///     .expect("Could not load graph")
520    ///     .init_properties()
521    ///     .load_properties(|properties| properties.load_maps::<GOVMPH>())
522    ///     .expect("Could not load SWHID maps")
523    ///     .load_properties(|properties| properties.load_timestamps())
524    ///     .expect("Could not load timestamps");
525    /// ```
526    pub fn load_properties<
527        M2: properties::MaybeMaps,
528        T2: properties::MaybeTimestamps,
529        P2: properties::MaybePersons,
530        C2: properties::MaybeContents,
531        S2: properties::MaybeStrings,
532        N2: properties::MaybeLabelNames,
533    >(
534        self,
535        loader: impl FnOnce(
536            properties::SwhGraphProperties<M, T, P, C, S, N>,
537        ) -> Result<properties::SwhGraphProperties<M2, T2, P2, C2, S2, N2>>,
538    ) -> Result<SwhUnidirectionalGraph<properties::SwhGraphProperties<M2, T2, P2, C2, S2, N2>, G>>
539    {
540        Ok(SwhUnidirectionalGraph {
541            properties: loader(self.properties)?,
542            basepath: self.basepath,
543            graph: self.graph,
544        })
545    }
546}
547
548impl<G: UnderlyingGraph> SwhUnidirectionalGraph<(), G> {
549    /// Prerequisite for `load_properties`
550    pub fn init_properties(
551        self,
552    ) -> SwhUnidirectionalGraph<
553        properties::SwhGraphProperties<
554            properties::NoMaps,
555            properties::NoTimestamps,
556            properties::NoPersons,
557            properties::NoContents,
558            properties::NoStrings,
559            properties::NoLabelNames,
560        >,
561        G,
562    > {
563        SwhUnidirectionalGraph {
564            properties: properties::SwhGraphProperties::new(&self.basepath, self.graph.num_nodes()),
565            basepath: self.basepath,
566            graph: self.graph,
567        }
568    }
569
570    /// Enriches the graph with all properties, mmapped from disk
571    ///
572    /// # Example
573    ///
574    /// ```no_run
575    /// # use std::path::PathBuf;
576    /// use swh_graph::java_compat::mph::gov::GOVMPH;
577    ///
578    /// swh_graph::graph::SwhUnidirectionalGraph::new(PathBuf::from("./graph"))
579    ///     .expect("Could not load graph")
580    ///     .load_all_properties::<GOVMPH>()
581    ///     .expect("Could not load properties");
582    /// ```
583    pub fn load_all_properties<MPHF: LoadableSwhidMphf>(
584        self,
585    ) -> Result<
586        SwhUnidirectionalGraph<
587            properties::SwhGraphProperties<
588                properties::MappedMaps<MPHF>,
589                properties::MappedTimestamps,
590                properties::MappedPersons,
591                properties::MappedContents,
592                properties::MappedStrings,
593                properties::MappedLabelNames,
594            >,
595            G,
596        >,
597    > {
598        self.init_properties()
599            .load_properties(|properties| properties.load_all())
600    }
601}
602
603impl<
604        MAPS: properties::MaybeMaps,
605        TIMESTAMPS: properties::MaybeTimestamps,
606        PERSONS: properties::MaybePersons,
607        CONTENTS: properties::MaybeContents,
608        STRINGS: properties::MaybeStrings,
609        LABELNAMES: properties::MaybeLabelNames,
610        G: UnderlyingGraph,
611    > SwhGraphWithProperties
612    for SwhUnidirectionalGraph<
613        properties::SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>,
614        G,
615    >
616{
617    type Maps = MAPS;
618    type Timestamps = TIMESTAMPS;
619    type Persons = PERSONS;
620    type Contents = CONTENTS;
621    type Strings = STRINGS;
622    type LabelNames = LABELNAMES;
623
624    fn properties(
625        &self,
626    ) -> &properties::SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
627    {
628        &self.properties
629    }
630}
631
632impl<P, G: RandomAccessGraph + UnderlyingGraph> SwhUnidirectionalGraph<P, G> {
633    /// Consumes this graph and returns a new one that implements [`SwhLabeledForwardGraph`]
634    pub fn load_labels(self) -> Result<SwhUnidirectionalGraph<P, Zip<G, SwhLabeling>>> {
635        Ok(SwhUnidirectionalGraph {
636            // Note: keep british version of "labelled" here for compatibility with Java swh-graph
637            graph: zip_labels(self.graph, suffix_path(&self.basepath, "-labelled"))
638                .context("Could not load forward labels")?,
639            properties: self.properties,
640            basepath: self.basepath,
641        })
642    }
643}
644
645/// Class representing the compressed Software Heritage graph in both directions.
646///
647/// Type parameters:
648///
649/// * `P` is either `()` or `properties::SwhGraphProperties`, manipulated using
650///   [`load_properties`](SwhBidirectionalGraph::load_properties) and
651///   [`load_all_properties`](SwhBidirectionalGraph::load_all_properties)
652/// * FG is the forward graph (either [`BvGraph`], or `Zip<BvGraph, SwhLabeling>`
653///   after using [`load_forward_labels`](SwhBidirectionalGraph::load_forward_labels)
654/// * BG is the backward graph (either [`BvGraph`], or `Zip<BvGraph, SwhLabeling>`
655///   after using [`load_backward_labels`](SwhBidirectionalGraph::load_backward_labels)
656pub struct SwhBidirectionalGraph<
657    P,
658    FG: UnderlyingGraph = DefaultUnderlyingGraph,
659    BG: UnderlyingGraph = DefaultUnderlyingGraph,
660> {
661    basepath: PathBuf,
662    forward_graph: FG,
663    backward_graph: BG,
664    properties: P,
665}
666
667impl SwhBidirectionalGraph<()> {
668    pub fn new(basepath: impl AsRef<Path>) -> Result<Self> {
669        let basepath = basepath.as_ref().to_owned();
670        let forward_graph = DefaultUnderlyingGraph(
671            BvGraph::with_basename(&basepath)
672                .endianness::<BE>()
673                .flags(MemoryFlags::TRANSPARENT_HUGE_PAGES | MemoryFlags::RANDOM_ACCESS)
674                .load()?,
675        );
676        let backward_graph = DefaultUnderlyingGraph(
677            BvGraph::with_basename(suffix_path(&basepath, "-transposed"))
678                .endianness::<BE>()
679                .flags(MemoryFlags::TRANSPARENT_HUGE_PAGES | MemoryFlags::RANDOM_ACCESS)
680                .load()?,
681        );
682        Ok(Self::from_underlying_graphs(
683            basepath,
684            forward_graph,
685            backward_graph,
686        ))
687    }
688}
689
690impl<FG: UnderlyingGraph, BG: UnderlyingGraph> SwhBidirectionalGraph<(), FG, BG> {
691    pub fn from_underlying_graphs(
692        basepath: PathBuf,
693        forward_graph: FG,
694        backward_graph: BG,
695    ) -> Self {
696        SwhBidirectionalGraph {
697            basepath,
698            backward_graph,
699            forward_graph,
700            properties: (),
701        }
702    }
703}
704
705impl<P, FG: UnderlyingGraph, BG: UnderlyingGraph> SwhGraph for SwhBidirectionalGraph<P, FG, BG> {
706    fn path(&self) -> &Path {
707        self.basepath.as_path()
708    }
709
710    fn is_transposed(&self) -> bool {
711        // Technically, users can load the 'graph-transposed' directly.
712        // However, unless they rename files, this will fail to load properties, because
713        // properties' file names wouldn't match the base path.
714        // As 'is_transposed' is only useful when checking node types (of an arc),
715        // this function is unlikely to be called in that scenario, so this should be fine.
716        false
717    }
718
719    fn num_nodes(&self) -> usize {
720        self.forward_graph.num_nodes()
721    }
722
723    fn num_arcs(&self) -> u64 {
724        UnderlyingGraph::num_arcs(&self.forward_graph)
725    }
726
727    fn has_arc(&self, src_node_id: NodeId, dst_node_id: NodeId) -> bool {
728        self.forward_graph.has_arc(src_node_id, dst_node_id)
729    }
730}
731
732impl<P, FG: UnderlyingGraph, BG: UnderlyingGraph> SwhForwardGraph
733    for SwhBidirectionalGraph<P, FG, BG>
734{
735    type Successors<'succ>
736        = <FG as UnderlyingGraph>::UnlabeledSuccessors<'succ>
737    where
738        Self: 'succ;
739
740    fn successors(&self, node_id: NodeId) -> Self::Successors<'_> {
741        self.forward_graph.unlabeled_successors(node_id)
742    }
743    fn outdegree(&self, node_id: NodeId) -> usize {
744        self.forward_graph.outdegree(node_id)
745    }
746}
747
748impl<P, FG: UnderlyingGraph, BG: UnderlyingGraph> SwhLabeledForwardGraph
749    for SwhBidirectionalGraph<P, FG, BG>
750where
751    <FG as SequentialLabeling>::Label: Pair<Left = NodeId, Right: IntoIterator<Item: Borrow<u64>>>,
752    for<'succ> <FG as RandomAccessLabeling>::Labels<'succ>:
753        Iterator<Item = (usize, <<FG as SequentialLabeling>::Label as Pair>::Right)>,
754{
755    type LabeledArcs<'arc> = LabeledArcIterator<<<<<FG as RandomAccessLabeling>::Labels<'arc> as Iterator>::Item as Pair>::Right as IntoIterator>::IntoIter> where Self: 'arc;
756    type LabeledSuccessors<'succ>
757        = LabeledSuccessorIterator<<FG as RandomAccessLabeling>::Labels<'succ>>
758    where
759        Self: 'succ;
760
761    fn untyped_labeled_successors(&self, node_id: NodeId) -> Self::LabeledSuccessors<'_> {
762        LabeledSuccessorIterator {
763            successors: self.forward_graph.labels(node_id),
764        }
765    }
766}
767
768impl<P, FG: UnderlyingGraph, BG: UnderlyingGraph> SwhBackwardGraph
769    for SwhBidirectionalGraph<P, FG, BG>
770{
771    type Predecessors<'succ>
772        = <BG as UnderlyingGraph>::UnlabeledSuccessors<'succ>
773    where
774        Self: 'succ;
775
776    fn predecessors(&self, node_id: NodeId) -> Self::Predecessors<'_> {
777        self.backward_graph.unlabeled_successors(node_id)
778    }
779
780    fn indegree(&self, node_id: NodeId) -> usize {
781        self.backward_graph.outdegree(node_id)
782    }
783}
784
785impl<P, FG: UnderlyingGraph, BG: UnderlyingGraph> SwhLabeledBackwardGraph
786    for SwhBidirectionalGraph<P, FG, BG>
787where
788    <BG as SequentialLabeling>::Label: Pair<Left = NodeId, Right: IntoIterator<Item: Borrow<u64>>>,
789    for<'succ> <BG as RandomAccessLabeling>::Labels<'succ>:
790        Iterator<Item = (usize, <<BG as SequentialLabeling>::Label as Pair>::Right)>,
791{
792    type LabeledArcs<'arc> = LabeledArcIterator<<<<<BG as RandomAccessLabeling>::Labels<'arc> as Iterator>::Item as Pair>::Right as IntoIterator>::IntoIter> where Self: 'arc;
793    type LabeledPredecessors<'succ>
794        = LabeledSuccessorIterator<<BG as RandomAccessLabeling>::Labels<'succ>>
795    where
796        Self: 'succ;
797
798    fn untyped_labeled_predecessors(&self, node_id: NodeId) -> Self::LabeledPredecessors<'_> {
799        LabeledSuccessorIterator {
800            successors: self.backward_graph.labels(node_id),
801        }
802    }
803}
804
805impl<
806        M: properties::MaybeMaps,
807        T: properties::MaybeTimestamps,
808        P: properties::MaybePersons,
809        C: properties::MaybeContents,
810        S: properties::MaybeStrings,
811        N: properties::MaybeLabelNames,
812        BG: UnderlyingGraph,
813        FG: UnderlyingGraph,
814    > SwhBidirectionalGraph<properties::SwhGraphProperties<M, T, P, C, S, N>, FG, BG>
815{
816    /// Enriches the graph with more properties mmapped from disk
817    ///
818    /// # Example
819    ///
820    /// ```no_run
821    /// # use std::path::PathBuf;
822    /// use swh_graph::java_compat::mph::gov::GOVMPH;
823    /// use swh_graph::SwhGraphProperties;
824    ///
825    /// swh_graph::graph::SwhBidirectionalGraph::new(PathBuf::from("./graph"))
826    ///     .expect("Could not load graph")
827    ///     .init_properties()
828    ///     .load_properties(SwhGraphProperties::load_maps::<GOVMPH>)
829    ///     .expect("Could not load SWHID maps")
830    ///     .load_properties(SwhGraphProperties::load_timestamps)
831    ///     .expect("Could not load timestamps");
832    /// ```
833    pub fn load_properties<
834        M2: properties::MaybeMaps,
835        T2: properties::MaybeTimestamps,
836        P2: properties::MaybePersons,
837        C2: properties::MaybeContents,
838        S2: properties::MaybeStrings,
839        N2: properties::MaybeLabelNames,
840    >(
841        self,
842        loader: impl FnOnce(
843            properties::SwhGraphProperties<M, T, P, C, S, N>,
844        ) -> Result<properties::SwhGraphProperties<M2, T2, P2, C2, S2, N2>>,
845    ) -> Result<SwhBidirectionalGraph<properties::SwhGraphProperties<M2, T2, P2, C2, S2, N2>, FG, BG>>
846    {
847        Ok(SwhBidirectionalGraph {
848            properties: loader(self.properties)?,
849            basepath: self.basepath,
850            forward_graph: self.forward_graph,
851            backward_graph: self.backward_graph,
852        })
853    }
854}
855
856impl<FG: UnderlyingGraph, BG: UnderlyingGraph> SwhBidirectionalGraph<(), FG, BG> {
857    /// Prerequisite for `load_properties`
858    pub fn init_properties(
859        self,
860    ) -> SwhBidirectionalGraph<
861        properties::SwhGraphProperties<
862            properties::NoMaps,
863            properties::NoTimestamps,
864            properties::NoPersons,
865            properties::NoContents,
866            properties::NoStrings,
867            properties::NoLabelNames,
868        >,
869        FG,
870        BG,
871    > {
872        SwhBidirectionalGraph {
873            properties: properties::SwhGraphProperties::new(
874                &self.basepath,
875                self.forward_graph.num_nodes(),
876            ),
877            basepath: self.basepath,
878            forward_graph: self.forward_graph,
879            backward_graph: self.backward_graph,
880        }
881    }
882
883    /// Enriches the graph with more properties mmapped from disk
884    ///
885    /// # Example
886    ///
887    /// ```no_run
888    /// # use std::path::PathBuf;
889    /// use swh_graph::java_compat::mph::gov::GOVMPH;
890    ///
891    /// swh_graph::graph::SwhBidirectionalGraph::new(PathBuf::from("./graph"))
892    ///     .expect("Could not load graph")
893    ///     .load_all_properties::<GOVMPH>()
894    ///     .expect("Could not load properties");
895    /// ```
896    pub fn load_all_properties<MPHF: LoadableSwhidMphf>(
897        self,
898    ) -> Result<
899        SwhBidirectionalGraph<
900            properties::SwhGraphProperties<
901                properties::MappedMaps<MPHF>,
902                properties::MappedTimestamps,
903                properties::MappedPersons,
904                properties::MappedContents,
905                properties::MappedStrings,
906                properties::MappedLabelNames,
907            >,
908            FG,
909            BG,
910        >,
911    > {
912        self.init_properties()
913            .load_properties(|properties| properties.load_all())
914    }
915}
916
917impl<
918        MAPS: properties::MaybeMaps,
919        TIMESTAMPS: properties::MaybeTimestamps,
920        PERSONS: properties::MaybePersons,
921        CONTENTS: properties::MaybeContents,
922        STRINGS: properties::MaybeStrings,
923        LABELNAMES: properties::MaybeLabelNames,
924        BG: UnderlyingGraph,
925        FG: UnderlyingGraph,
926    > SwhGraphWithProperties
927    for SwhBidirectionalGraph<
928        properties::SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>,
929        FG,
930        BG,
931    >
932{
933    type Maps = MAPS;
934    type Timestamps = TIMESTAMPS;
935    type Persons = PERSONS;
936    type Contents = CONTENTS;
937    type Strings = STRINGS;
938    type LabelNames = LABELNAMES;
939
940    fn properties(
941        &self,
942    ) -> &properties::SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
943    {
944        &self.properties
945    }
946}
947
948impl<P, FG: RandomAccessGraph + UnderlyingGraph, BG: UnderlyingGraph>
949    SwhBidirectionalGraph<P, FG, BG>
950{
951    /// Consumes this graph and returns a new one that implements [`SwhLabeledForwardGraph`]
952    pub fn load_forward_labels(self) -> Result<SwhBidirectionalGraph<P, Zip<FG, SwhLabeling>, BG>> {
953        Ok(SwhBidirectionalGraph {
954            forward_graph: zip_labels(self.forward_graph, suffix_path(&self.basepath, "-labelled"))
955                .context("Could not load forward labels")?,
956            backward_graph: self.backward_graph,
957            properties: self.properties,
958            basepath: self.basepath,
959        })
960    }
961}
962
963impl<P, FG: UnderlyingGraph, BG: RandomAccessGraph + UnderlyingGraph>
964    SwhBidirectionalGraph<P, FG, BG>
965{
966    /// Consumes this graph and returns a new one that implements [`SwhLabeledBackwardGraph`]
967    pub fn load_backward_labels(
968        self,
969    ) -> Result<SwhBidirectionalGraph<P, FG, Zip<BG, SwhLabeling>>> {
970        Ok(SwhBidirectionalGraph {
971            forward_graph: self.forward_graph,
972            backward_graph: zip_labels(
973                self.backward_graph,
974                // Note: keep british version of "labelled" here for compatibility with Java swh-graph
975                suffix_path(&self.basepath, "-transposed-labelled"),
976            )
977            .context("Could not load backward labels")?,
978            properties: self.properties,
979            basepath: self.basepath,
980        })
981    }
982}
983
984impl<P, FG: RandomAccessGraph + UnderlyingGraph, BG: RandomAccessGraph + UnderlyingGraph>
985    SwhBidirectionalGraph<P, FG, BG>
986{
987    /// Equivalent to calling both
988    /// [`load_forward_labels`](SwhBidirectionalGraph::load_forward_labels) and
989    /// [`load_backward_labels`](SwhBidirectionalGraph::load_backward_labels)
990    pub fn load_labels(
991        self,
992    ) -> Result<SwhBidirectionalGraph<P, Zip<FG, SwhLabeling>, Zip<BG, SwhLabeling>>> {
993        self.load_forward_labels()
994            .context("Could not load forward labels")?
995            .load_backward_labels()
996            .context("Could not load backward labels")
997    }
998}
999
1000#[deprecated(
1001    since = "5.2.0",
1002    note = "please use `SwhUnidirectionalGraph::new` instead"
1003)]
1004/// Deprecated alias for [`SwhUnidirectionalGraph::new`]
1005pub fn load_unidirectional(basepath: impl AsRef<Path>) -> Result<SwhUnidirectionalGraph<()>> {
1006    SwhUnidirectionalGraph::new(basepath)
1007}
1008
1009#[deprecated(
1010    since = "5.2.0",
1011    note = "please use `SwhBidirectionalGraph::new` instead"
1012)]
1013/// Deprecated alias for [`SwhBidirectionalGraph::new`]
1014pub fn load_bidirectional(basepath: impl AsRef<Path>) -> Result<SwhBidirectionalGraph<()>> {
1015    SwhBidirectionalGraph::new(basepath)
1016}
1017
1018/// Loads a bidirectional graph, then all its properties, then its labels.
1019///
1020/// This is a shorthand for:
1021///
1022/// ```no_run
1023/// # use std::path::PathBuf;
1024/// # use anyhow::{Context, Result};
1025/// # use swh_graph::graph::SwhBidirectionalGraph;
1026/// # use swh_graph::mph::LoadableSwhidMphf;
1027/// #
1028/// # fn f<MPHF: LoadableSwhidMphf>() -> Result<()> {
1029/// # let basepath = PathBuf::from("./graph");
1030/// let graph = SwhBidirectionalGraph::new(basepath)
1031///     .context("Could not load graph")?
1032///     .load_all_properties::<MPHF>()
1033///     .context("Could not load properties")?
1034///     .load_labels()
1035///     .context("Could not load labels")?;
1036/// # Ok(())
1037/// # }
1038/// ```
1039///
1040/// When working on code made to be re-distributed, you should load only what is needed
1041/// (ie. load unidirectional if you don't need the backward graph, don't load properties
1042/// that are not needed, don't load labels unless they need to be read), so users can
1043/// run your code without a complete graph locally.
1044pub fn load_full<MPHF: LoadableSwhidMphf>(
1045    basepath: impl AsRef<Path>,
1046) -> Result<
1047    SwhBidirectionalGraph<
1048        properties::SwhGraphProperties<
1049            properties::MappedMaps<MPHF>,
1050            properties::MappedTimestamps,
1051            properties::MappedPersons,
1052            properties::MappedContents,
1053            properties::MappedStrings,
1054            properties::MappedLabelNames,
1055        >,
1056        Zip<DefaultUnderlyingGraph, SwhLabeling>,
1057        Zip<DefaultUnderlyingGraph, SwhLabeling>,
1058    >,
1059> {
1060    SwhBidirectionalGraph::new(basepath)
1061        .context("Could not load graph")?
1062        .load_all_properties()
1063        .context("Could not load properties")?
1064        .load_labels()
1065        .context("Could not load labels")
1066}
1067
1068fn zip_labels<G: RandomAccessGraph + UnderlyingGraph, P: AsRef<Path>>(
1069    graph: G,
1070    base_path: P,
1071) -> Result<Zip<G, SwhLabeling>> {
1072    let properties_path = suffix_path(&base_path, ".properties");
1073    let f = std::fs::File::open(&properties_path)
1074        .with_context(|| format!("Could not open {}", properties_path.display()))?;
1075    let map = java_properties::read(std::io::BufReader::new(f))?;
1076
1077    let graphclass = map
1078        .get("graphclass")
1079        .with_context(|| format!("Missing 'graphclass' from {}", properties_path.display()))?;
1080    // Note: keep british version of "labelled" here for compatibility with Java swh-graph
1081    if graphclass != "it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph" {
1082        bail!(
1083            "Expected graphclass in {} to be \"it.unimi.dsi.big.webgraph.labelling.BitStreamArcLabelledImmutableGraph\", got {:?}",
1084            properties_path.display(),
1085            graphclass
1086        );
1087    }
1088
1089    let labelspec = map
1090        .get("labelspec")
1091        .with_context(|| format!("Missing 'labelspec' from {}", properties_path.display()))?;
1092    let width = labelspec
1093        .strip_prefix("org.softwareheritage.graph.labels.SwhLabel(DirEntry,")
1094        .and_then(|labelspec| labelspec.strip_suffix(')'))
1095        .and_then(|labelspec| labelspec.parse::<usize>().ok());
1096    let width = match width {
1097        None =>
1098        bail!("Expected labelspec in {} to be \"org.softwareheritage.graph.labels.SwhLabel(DirEntry,<integer>)\" (where <integer> is a small integer, usually under 30), got {:?}", properties_path.display(), labelspec),
1099        Some(width) => width
1100    };
1101
1102    let labels = crate::labeling::mmap(&base_path, crate::labeling::SwhDeserializer::new(width))
1103        .with_context(|| {
1104            format!(
1105                "Could not load labeling from {}",
1106                base_path.as_ref().display()
1107            )
1108        })?;
1109
1110    debug_assert!(webgraph::prelude::Zip(&graph, &labels).verify());
1111    Ok(Zip(graph, labels))
1112}