swh_graph/properties/
mod.rs

1// Copyright (C) 2023-2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Node labels
7//!
//! [`SwhGraphProperties`] is populated by the `load_properties` and `load_all_properties`
//! methods of [`SwhUnidirectionalGraph`](crate::graph::SwhUnidirectionalGraph) and
//! [`SwhBidirectionalGraph`](crate::graph::SwhBidirectionalGraph), and is returned by
//! their `properties` method.
12//!
13//! ```no_run
14//! # use std::path::PathBuf;
15//! use swh_graph::graph::SwhGraphWithProperties;
16//! use swh_graph::mph::DynMphf;
17//! use swh_graph::SwhGraphProperties;
18//!
19//! let properties: &SwhGraphProperties<_, _, _, _, _, _> =
20//!     swh_graph::graph::SwhUnidirectionalGraph::new(PathBuf::from("./graph"))
21//!     .expect("Could not load graph")
22//!     .load_all_properties::<DynMphf>()
23//!     .expect("Could not load properties")
24//!     .properties();
25//! ```
26
27use std::path::{Path, PathBuf};
28
29use anyhow::{Context, Result};
30use byteorder::BigEndian;
31use mmap_rs::Mmap;
32
33use crate::mph::LoadableSwhidMphf;
34use crate::utils::mmap::NumberMmap;
35use crate::utils::GetIndex;
36use crate::OutOfBoundError;
37
/// File name suffixes of the on-disk files backing each property.
///
/// NOTE(review): presumably appended to the graph's path prefix by the loaders in the
/// submodules -- confirm against their call sites.
pub(crate) mod suffixes {
    // Node maps
    pub const NODE2SWHID: &str = ".node2swhid.bin";
    pub const NODE2TYPE: &str = ".node2type.bin";

    // Timestamps, each with its companion offset file
    pub const AUTHOR_TIMESTAMP: &str = ".property.author_timestamp.bin";
    pub const COMMITTER_TIMESTAMP: &str = ".property.committer_timestamp.bin";
    pub const AUTHOR_TIMESTAMP_OFFSET: &str = ".property.author_timestamp_offset.bin";
    pub const COMMITTER_TIMESTAMP_OFFSET: &str = ".property.committer_timestamp_offset.bin";

    // Persons
    pub const AUTHOR_ID: &str = ".property.author_id.bin";
    pub const COMMITTER_ID: &str = ".property.committer_id.bin";

    // Contents
    pub const CONTENT_LENGTH: &str = ".property.content.length.bin";
    pub const CONTENT_IS_SKIPPED: &str = ".property.content.is_skipped.bits";

    // Strings, each with its companion offset file
    pub const MESSAGE: &str = ".property.message.bin";
    pub const MESSAGE_OFFSET: &str = ".property.message.offset.bin";
    pub const TAG_NAME: &str = ".property.tag_name.bin";
    pub const TAG_NAME_OFFSET: &str = ".property.tag_name.offset.bin";

    // Label names
    pub const LABEL_NAME: &str = ".labels.fcl";
}
55
/// Error describing a property file that could not be loaded.
///
/// Its message has the form `{path} cannot be loaded: {source}`. References to this
/// error are what [`OptionalDataFiles`] results carry in their `Err` variant.
#[derive(thiserror::Error, Debug)]
#[error("{path} cannot be loaded: {source}")]
pub struct UnavailableProperty {
    /// Path reported in the error message
    path: PathBuf,
    /// Underlying I/O error, exposed as this error's source
    #[source]
    source: std::io::Error,
}
63
/// Wrapper for the return type of [`SwhGraphProperties`] methods.
///
/// When `B::DataFilesAvailability` is [`GuaranteedDataFiles`] (the most common case),
/// `PropertiesResult<'err, T, B>` is exactly the same type as `T`.
///
/// When `B::DataFilesAvailability` is [`OptionalDataFiles`] (which is the case when using
/// `opt_load_*` instead of `load_*` or [`load_all`](SwhGraphProperties::load_all) for example),
/// then `PropertiesResult<'err, T, B>` is exactly the same type as
/// `Result<T, &'err UnavailableProperty>`.
pub type PropertiesResult<'err, T, B> =
    <<B as PropertiesBackend>::DataFilesAvailability as DataFilesAvailability>::Result<'err, T>;
74
75/// Common trait for type parameters of [`SwhGraphProperties`]
76pub trait PropertiesBackend {
77    type DataFilesAvailability: DataFilesAvailability;
78
79    /// Applies the given function `f` to the value `v` if the value is available
80    ///
81    /// This is an alias for `<Self::DataFilesAvailability as DataFilesAvailability>::map(v, f)`,
82    /// meaning that:
83    ///
84    /// 1. if `Self::DataFilesAvailability` is `GuaranteedDataFiles`, then `map_if_available(v, f)`
85    ///    is equivalent to `f(v)` and has type `U`
86    /// 2. if `Self::DataFilesAvailability` is `OptionalDataFiles`, then `map_if_available(v, f)`
87    ///    is equivalent to `v.map(f)` and has type `Result<U, &'err UnavailableProperty>`
88    fn map_if_available<T, U>(
89        v: <Self::DataFilesAvailability as DataFilesAvailability>::Result<'_, T>,
90        f: impl FnOnce(T) -> U,
91    ) -> <Self::DataFilesAvailability as DataFilesAvailability>::Result<'_, U> {
92        <Self::DataFilesAvailability as DataFilesAvailability>::map(v, f)
93    }
94
95    /// Returns `(v1, v2)` if both are available, or an error otherwise
96    ///
97    /// This is an alias for `<Self::DataFilesAvailability as DataFilesAvailability>::zip(v, f)`,
98    /// meaning that:
99    ///
100    /// 1. if `Self::DataFilesAvailability` is `GuaranteedDataFiles`, then `zip_if_available(v1, v2)`
101    ///    is equivalent to `(v1, v2)` and has type `(T1, T2)`
102    /// 2. if `Self::DataFilesAvailability` is `OptionalDataFiles`, then `zip_if_available(v1, v2)`
103    ///    is equivalent to `v1.and_then(|v1| v2.map(|v2| (v1, v2)))` and has type
104    ///    `Result<(T1, T2), &'err UnavailableProperty>`
105    fn zip_if_available<'err, T1, T2>(
106        v1: <Self::DataFilesAvailability as DataFilesAvailability>::Result<'err, T1>,
107        v2: <Self::DataFilesAvailability as DataFilesAvailability>::Result<'err, T2>,
108    ) -> <Self::DataFilesAvailability as DataFilesAvailability>::Result<'err, (T1, T2)> {
109        <Self::DataFilesAvailability as DataFilesAvailability>::zip(v1, v2)
110    }
111}
112
/// Helper trait to work with [`PropertiesResult`]
///
/// It is implemented by:
/// * [`GuaranteedDataFiles`]: the common case, where data files are guaranteed to exist
///   once a graph is loaded, in which case `Self::Result<'err, T>` is the same type as `T`
/// * [`OptionalDataFiles`]: when they are not, in which case `Self::Result<'err, T>`
///   is the same type as `Result<T, &'err UnavailableProperty>`.
pub trait DataFilesAvailability {
    /// Shape of a property access yielding a `T`: either `T` itself, or a `Result`
    /// whose error borrows an [`UnavailableProperty`] for `'err`
    type Result<'err, T>;

    /// Applies `f` to the value held by `v`, if there is one
    fn map<T, U>(v: Self::Result<'_, T>, f: impl FnOnce(T) -> U) -> Self::Result<'_, U>;
    /// Combines `v1` and `v2` into a pair, if both hold a value
    fn zip<'err, T1, T2>(
        v1: Self::Result<'err, T1>,
        v2: Self::Result<'err, T2>,
    ) -> Self::Result<'err, (T1, T2)>;
    /// Converts `value` to a plain `Result`, whichever implementor is in use
    fn make_result<T>(value: Self::Result<'_, T>) -> Result<T, &UnavailableProperty>;
}
130
/// Helper type that implements [`DataFilesAvailability`] to signal underlying data files
/// may be missing at runtime
///
/// It is only meant to be used as a type-level marker: its single private field keeps
/// code outside this module from constructing a value.
pub struct OptionalDataFiles {
    _marker: (), // Prevents users from instantiating
}
136
/// Helper type that implements [`DataFilesAvailability`] to signal underlying data files
/// are guaranteed to be available once the graph is loaded
///
/// It is only meant to be used as a type-level marker: its single private field keeps
/// code outside this module from constructing a value.
pub struct GuaranteedDataFiles {
    _marker: (), // Prevents users from instantiating
}
142
143impl DataFilesAvailability for OptionalDataFiles {
144    type Result<'err, T> = Result<T, &'err UnavailableProperty>;
145
146    #[inline(always)]
147    fn map<T, U>(v: Self::Result<'_, T>, f: impl FnOnce(T) -> U) -> Self::Result<'_, U> {
148        v.map(f)
149    }
150
151    #[inline(always)]
152    fn zip<'err, T1, T2>(
153        v1: Self::Result<'err, T1>,
154        v2: Self::Result<'err, T2>,
155    ) -> Self::Result<'err, (T1, T2)> {
156        v1.and_then(|v1| v2.map(|v2| (v1, v2)))
157    }
158
159    #[inline(always)]
160    fn make_result<T>(value: Self::Result<'_, T>) -> Result<T, &UnavailableProperty> {
161        value
162    }
163}
164
impl DataFilesAvailability for GuaranteedDataFiles {
    /// Accesses cannot fail, so the result is the bare value; `'err` is unused
    type Result<'err, T> = T;

    /// Always applies `f`, as the value is always present
    #[inline(always)]
    fn map<T, U>(v: Self::Result<'_, T>, f: impl FnOnce(T) -> U) -> Self::Result<'_, U> {
        f(v)
    }

    /// Always returns the pair, as both values are always present
    #[inline(always)]
    fn zip<'err, T1, T2>(
        v1: Self::Result<'err, T1>,
        v2: Self::Result<'err, T2>,
    ) -> Self::Result<'err, (T1, T2)> {
        (v1, v2)
    }

    /// Wraps the value in `Ok`; the `Err` variant is never produced
    #[inline(always)]
    fn make_result<T>(value: Self::Result<'_, T>) -> Result<T, &UnavailableProperty> {
        Ok(value)
    }
}
186
/// Properties on graph nodes
///
/// This structure has many type parameters, to allow loading only some properties,
/// and checking at compile time that only loaded properties are accessed.
///
/// Extra properties can be loaded, following the builder pattern on the owning graph.
/// For example, this does not compile:
///
/// ```compile_fail
/// # use std::path::PathBuf;
/// use swh_graph::graph::SwhGraphWithProperties;
/// use swh_graph::mph::DynMphf;
/// use swh_graph::SwhGraphProperties;
///
/// swh_graph::graph::SwhUnidirectionalGraph::new(PathBuf::from("./graph"))
///     .expect("Could not load graph")
///     .init_properties()
///     .properties()
///     .author_timestamp(42);
/// ```
///
/// but this does:
///
/// ```no_run
/// # use std::path::PathBuf;
/// use swh_graph::graph::SwhGraphWithProperties;
/// use swh_graph::mph::DynMphf;
/// use swh_graph::SwhGraphProperties;
///
/// swh_graph::graph::SwhUnidirectionalGraph::new(PathBuf::from("./graph"))
///     .expect("Could not load graph")
///     .init_properties()
///     .load_properties(SwhGraphProperties::load_timestamps)
///     .expect("Could not load timestamp properties")
///     .properties()
///     .author_timestamp(42);
/// ```
pub struct SwhGraphProperties<
    MAPS: MaybeMaps,
    TIMESTAMPS: MaybeTimestamps,
    PERSONS: MaybePersons,
    CONTENTS: MaybeContents,
    STRINGS: MaybeStrings,
    LABELNAMES: MaybeLabelNames,
> {
    /// Path prefix the properties are loaded from
    pub(crate) path: PathBuf,
    /// Number of nodes, as given to [`SwhGraphProperties::new`]
    pub(crate) num_nodes: usize,
    /// Node maps backend, or [`NoMaps`] when not loaded
    pub(crate) maps: MAPS,
    /// Timestamps backend, or [`NoTimestamps`] when not loaded
    pub(crate) timestamps: TIMESTAMPS,
    /// Persons backend, or [`NoPersons`] when not loaded
    pub(crate) persons: PERSONS,
    /// Contents backend, or [`NoContents`] when not loaded
    pub(crate) contents: CONTENTS,
    /// Strings backend, or [`NoStrings`] when not loaded
    pub(crate) strings: STRINGS,
    /// Label names backend, or [`NoLabelNames`] when not loaded
    pub(crate) label_names: LABELNAMES,
    /// Hack: `Some(false)` if the graph was compressed with Rust (2023-09-06 and newer),
    /// `Some(true)` if the graph was compressed with Java (2022-12-07 and older),
    /// `None` if we don't know yet (as we compute this lazily)
    pub(crate) label_names_are_in_base64_order: once_cell::race::OnceBool,
}
245
/// [`SwhGraphProperties`] with every group of properties backed by its `Mapped*` type
///
/// This is the type returned by [`SwhGraphProperties::load_all`]; `MPHF` is the
/// parameter given to [`MappedMaps`].
pub type AllSwhGraphProperties<MPHF> = SwhGraphProperties<
    MappedMaps<MPHF>,
    MappedTimestamps,
    MappedPersons,
    MappedContents,
    MappedStrings,
    MappedLabelNames,
>;
254
/// Variant of [`AllSwhGraphProperties`] where timestamps, persons, contents and strings
/// use their `OptMapped*` types instead of the `Mapped*` ones
///
/// Maps and label names keep their `Mapped*` types.
///
/// NOTE(review): presumably the `OptMapped*` backends tolerate missing data files and
/// report them through [`PropertiesResult`] on access -- confirm in the submodules.
pub type AllSwhGraphDynProperties<MPHF> = SwhGraphProperties<
    MappedMaps<MPHF>,
    OptMappedTimestamps,
    OptMappedPersons,
    OptMappedContents,
    OptMappedStrings,
    MappedLabelNames,
>;
263
264impl SwhGraphProperties<NoMaps, NoTimestamps, NoPersons, NoContents, NoStrings, NoLabelNames> {
265    /// Creates an empty [`SwhGraphProperties`] instance, which will load properties
266    /// from the given path prefix.
267    pub fn new(path: impl AsRef<Path>, num_nodes: usize) -> Self {
268        SwhGraphProperties {
269            path: path.as_ref().to_owned(),
270            num_nodes,
271            maps: NoMaps,
272            timestamps: NoTimestamps,
273            persons: NoPersons,
274            contents: NoContents,
275            strings: NoStrings,
276            label_names: NoLabelNames,
277            label_names_are_in_base64_order: Default::default(),
278        }
279    }
280
281    /// Consumes an empty [`SwhGraphProperties`] instance and returns a new one
282    /// with all properties loaded and all methods available.
283    ///
284    /// ```no_run
285    /// # use std::path::PathBuf;
286    ///  use swh_graph::graph::SwhGraphWithProperties;
287    /// use swh_graph::mph::DynMphf;
288    /// use swh_graph::SwhGraphProperties;
289    ///
290    /// SwhGraphProperties::new(PathBuf::from("./graph"), 123)
291    ///     .load_all::<DynMphf>()
292    ///     .expect("Could not load properties");
293    /// ```
294    ///
295    /// is equivalent to:
296    ///
297    /// ```no_run
298    /// # use std::path::PathBuf;
299    /// use swh_graph::mph::DynMphf;
300    /// use swh_graph::SwhGraphProperties;
301    ///
302    /// SwhGraphProperties::new(PathBuf::from("./graph"), 123)
303    ///     .load_maps::<DynMphf>()
304    ///     .expect("Could not load node2swhid/swhid2node")
305    ///     .load_timestamps()
306    ///     .expect("Could not load timestamp properties")
307    ///     .load_persons()
308    ///     .expect("Could not load person properties")
309    ///     .load_contents()
310    ///     .expect("Could not load content properties")
311    ///     .load_strings()
312    ///     .expect("Could not load string properties");
313    /// ```
314    pub fn load_all<MPHF: LoadableSwhidMphf>(self) -> Result<AllSwhGraphProperties<MPHF>> {
315        self.load_maps()?
316            .load_timestamps()?
317            .load_persons()?
318            .load_contents()?
319            .load_strings()?
320            .load_label_names()
321    }
322}
323
324mod maps;
325pub use maps::{MappedMaps, Maps, MaybeMaps, NoMaps, NodeIdFromSwhidError, VecMaps};
326
327mod timestamps;
328pub use timestamps::{
329    MappedTimestamps, MaybeTimestamps, NoTimestamps, OptMappedTimestamps, OptTimestamps,
330    Timestamps, VecTimestamps,
331};
332
333mod persons;
334pub use persons::{
335    MappedPersons, MaybePersons, NoPersons, OptMappedPersons, OptPersons, Persons, VecPersons,
336};
337
338mod contents;
339pub use contents::{
340    Contents, MappedContents, MaybeContents, NoContents, OptContents, OptMappedContents,
341    VecContents,
342};
343
344mod strings;
345pub use strings::{
346    MappedStrings, MaybeStrings, NoStrings, OptMappedStrings, OptStrings, Strings, VecStrings,
347};
348
349mod label_names;
350pub use label_names::{
351    LabelIdFromNameError, LabelNames, MappedLabelNames, MaybeLabelNames, NoLabelNames,
352    VecLabelNames,
353};
354
355mod utils;
356use utils::*;