swh_graph/properties/mod.rs
1// Copyright (C) 2023-2024 The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6//! Node labels
7//!
//! [`SwhGraphProperties`] is populated by the `load_properties` and `load_all_properties`
//! methods of [`SwhUnidirectionalGraph`](crate::graph::SwhUnidirectionalGraph) and
//! [`SwhBidirectionalGraph`](crate::graph::SwhBidirectionalGraph), and is returned by
//! their `properties` method.
12//!
13//! ```no_run
14//! # use std::path::PathBuf;
15//! use swh_graph::graph::SwhGraphWithProperties;
16//! use swh_graph::mph::DynMphf;
17//! use swh_graph::SwhGraphProperties;
18//!
19//! let properties: &SwhGraphProperties<_, _, _, _, _, _> =
20//! swh_graph::graph::SwhUnidirectionalGraph::new(PathBuf::from("./graph"))
21//! .expect("Could not load graph")
22//! .load_all_properties::<DynMphf>()
23//! .expect("Could not load properties")
24//! .properties();
25//! ```
26
27use std::path::{Path, PathBuf};
28
29use anyhow::{Context, Result};
30use byteorder::BigEndian;
31use mmap_rs::Mmap;
32
33use crate::mph::LoadableSwhidMphf;
34use crate::utils::mmap::NumberMmap;
35use crate::utils::GetIndex;
36use crate::OutOfBoundError;
37
/// Suffixes of the data files backing each property.
///
/// The grouping comments below follow the constant names; they are only a reading aid.
pub(crate) mod suffixes {
    // Node maps
    pub const NODE2SWHID: &str = ".node2swhid.bin";
    pub const NODE2TYPE: &str = ".node2type.bin";

    // Timestamps and their offsets
    pub const AUTHOR_TIMESTAMP: &str = ".property.author_timestamp.bin";
    pub const AUTHOR_TIMESTAMP_OFFSET: &str = ".property.author_timestamp_offset.bin";
    pub const COMMITTER_TIMESTAMP: &str = ".property.committer_timestamp.bin";
    pub const COMMITTER_TIMESTAMP_OFFSET: &str = ".property.committer_timestamp_offset.bin";

    // Person identifiers
    pub const AUTHOR_ID: &str = ".property.author_id.bin";
    pub const COMMITTER_ID: &str = ".property.committer_id.bin";

    // Content metadata
    pub const CONTENT_IS_SKIPPED: &str = ".property.content.is_skipped.bits";
    pub const CONTENT_LENGTH: &str = ".property.content.length.bin";

    // Strings (data file + offsets file)
    pub const MESSAGE: &str = ".property.message.bin";
    pub const MESSAGE_OFFSET: &str = ".property.message.offset.bin";
    pub const TAG_NAME: &str = ".property.tag_name.bin";
    pub const TAG_NAME_OFFSET: &str = ".property.tag_name.offset.bin";

    // Label names
    pub const LABEL_NAME: &str = ".labels.fcl";
}
55
/// Error describing a property data file that could not be loaded.
///
/// It is displayed as `{path} cannot be loaded: {source}`.
#[derive(thiserror::Error, Debug)]
#[error("{path} cannot be loaded: {source}")]
pub struct UnavailableProperty {
    /// Path of the file that could not be loaded
    path: PathBuf,
    /// Underlying I/O error, reported as the source of this error
    #[source]
    source: std::io::Error,
}
63
/// Wrapper for the return type of [`SwhGraphProperties`] methods.
///
/// When `B` implements `GuaranteedDataFiles` (the most common case), `PropertiesResult<'err, T, B>`
/// is exactly the same type as `T`.
///
/// When `B` implements `OptionalDataFiles` (which is the case when using
/// `opt_load_*` instead of `load_*` or [`load_all`](SwhGraphProperties::load_all) for example),
/// then `PropertiesResult<'err, T, B>` is exactly the same type as `Result<T, &'err UnavailableProperty>`.
pub type PropertiesResult<'err, T, B> =
    <<B as PropertiesBackend>::DataFilesAvailability as DataFilesAvailability>::Result<'err, T>;
74
75/// Common trait for type parameters of [`SwhGraphProperties`]
76pub trait PropertiesBackend {
77 type DataFilesAvailability: DataFilesAvailability;
78
79 /// Applies the given function `f` to the value `v` if the value is available
80 ///
81 /// This is an alias for `<Self::DataFilesAvailability as DataFilesAvailability>::map(v, f)`,
82 /// meaning that:
83 ///
84 /// 1. if `Self::DataFilesAvailability` is `GuaranteedDataFiles`, then `map_if_available(v, f)`
85 /// is equivalent to `f(v)` and has type `U`
86 /// 2. if `Self::DataFilesAvailability` is `OptionalDataFiles`, then `map_if_available(v, f)`
87 /// is equivalent to `v.map(f)` and has type `Result<U, &'err UnavailableProperty>`
88 fn map_if_available<T, U>(
89 v: <Self::DataFilesAvailability as DataFilesAvailability>::Result<'_, T>,
90 f: impl FnOnce(T) -> U,
91 ) -> <Self::DataFilesAvailability as DataFilesAvailability>::Result<'_, U> {
92 <Self::DataFilesAvailability as DataFilesAvailability>::map(v, f)
93 }
94
95 /// Returns `(v1, v2)` if both are available, or an error otherwise
96 ///
97 /// This is an alias for `<Self::DataFilesAvailability as DataFilesAvailability>::zip(v, f)`,
98 /// meaning that:
99 ///
100 /// 1. if `Self::DataFilesAvailability` is `GuaranteedDataFiles`, then `zip_if_available(v1, v2)`
101 /// is equivalent to `(v1, v2)` and has type `(T1, T2)`
102 /// 2. if `Self::DataFilesAvailability` is `OptionalDataFiles`, then `zip_if_available(v1, v2)`
103 /// is equivalent to `v1.and_then(|v1| v2.map(|v2| (v1, v2)))` and has type
104 /// `Result<(T1, T2), &'err UnavailableProperty>`
105 fn zip_if_available<'err, T1, T2>(
106 v1: <Self::DataFilesAvailability as DataFilesAvailability>::Result<'err, T1>,
107 v2: <Self::DataFilesAvailability as DataFilesAvailability>::Result<'err, T2>,
108 ) -> <Self::DataFilesAvailability as DataFilesAvailability>::Result<'err, (T1, T2)> {
109 <Self::DataFilesAvailability as DataFilesAvailability>::zip(v1, v2)
110 }
111}
112
/// Helper trait to work with [`PropertiesResult`]
///
/// It is implemented by:
/// * [`GuaranteedDataFiles`]: the common case, where data files are guaranteed to exist
///   once a graph is loaded, in which case `Self::Result<'err, T>` is the same type as `T`
/// * [`OptionalDataFiles`]: when they are not, in which case `Self::Result<'err, T>`
///   is the same type as `Result<T, &'err UnavailableProperty>`.
pub trait DataFilesAvailability {
    /// Either `T` itself or `Result<T, &'err UnavailableProperty>`, depending on the implementor
    type Result<'err, T>;

    /// Applies `f` to the value carried by `v`, if there is one
    fn map<T, U>(v: Self::Result<'_, T>, f: impl FnOnce(T) -> U) -> Self::Result<'_, U>;
    /// Combines the values carried by `v1` and `v2` into a pair, if both are present
    fn zip<'err, T1, T2>(
        v1: Self::Result<'err, T1>,
        v2: Self::Result<'err, T2>,
    ) -> Self::Result<'err, (T1, T2)>;
    /// Converts `value` to a plain `Result`, whichever the implementor is
    fn make_result<T>(value: Self::Result<'_, T>) -> Result<T, &UnavailableProperty>;
}
130
/// Helper type that implements [`DataFilesAvailability`] to signal underlying data files
/// may be missing at runtime
///
/// It is only used at the type level: its single field is private, so it cannot be
/// constructed outside of this module.
pub struct OptionalDataFiles {
    _marker: (), // Private field: prevents users from instantiating this type
}
136
/// Helper type that implements [`DataFilesAvailability`] to signal underlying data files
/// are guaranteed to be available once the graph is loaded
///
/// It is only used at the type level: its single field is private, so it cannot be
/// constructed outside of this module.
pub struct GuaranteedDataFiles {
    _marker: (), // Private field: prevents users from instantiating this type
}
142
143impl DataFilesAvailability for OptionalDataFiles {
144 type Result<'err, T> = Result<T, &'err UnavailableProperty>;
145
146 fn map<T, U>(v: Self::Result<'_, T>, f: impl FnOnce(T) -> U) -> Self::Result<'_, U> {
147 v.map(f)
148 }
149
150 fn zip<'err, T1, T2>(
151 v1: Self::Result<'err, T1>,
152 v2: Self::Result<'err, T2>,
153 ) -> Self::Result<'err, (T1, T2)> {
154 v1.and_then(|v1| v2.map(|v2| (v1, v2)))
155 }
156
157 fn make_result<T>(value: Self::Result<'_, T>) -> Result<T, &UnavailableProperty> {
158 value
159 }
160}
161
impl DataFilesAvailability for GuaranteedDataFiles {
    /// The value itself: there is no error case, so the `'err` lifetime is unused
    type Result<'err, T> = T;

    /// The value is always present, so `f` is applied unconditionally
    fn map<T, U>(v: Self::Result<'_, T>, f: impl FnOnce(T) -> U) -> Self::Result<'_, U> {
        f(v)
    }

    /// Both values are always present, so they are simply paired
    fn zip<'err, T1, T2>(
        v1: Self::Result<'err, T1>,
        v2: Self::Result<'err, T2>,
    ) -> Self::Result<'err, (T1, T2)> {
        (v1, v2)
    }

    /// Wraps the value in `Ok`; this implementation never returns `Err`
    fn make_result<T>(value: Self::Result<'_, T>) -> Result<T, &UnavailableProperty> {
        Ok(value)
    }
}
180
181/// Properties on graph nodes
182///
183/// This structures has many type parameters, to allow loading only some properties,
184/// and checking at compile time that only loaded properties are accessed.
185///
186/// Extra properties can be loaded, following the builder pattern on the owning graph.
187/// For example, this does not compile:
188///
189/// ```compile_fail
190/// # use std::path::PathBuf;
191/// use swh_graph::graph::SwhGraphWithProperties;
192/// use swh_graph::mph::DynMphf;
193/// use swh_graph::SwhGraphProperties;
194///
195/// swh_graph::graph::SwhUnidirectionalGraph::new(PathBuf::from("./graph"))
196/// .expect("Could not load graph")
197/// .init_properties()
198/// .properties()
199/// .author_timestamp(42);
200/// ```
201///
202/// but this does:
203///
204/// ```no_run
205/// # use std::path::PathBuf;
206/// use swh_graph::graph::SwhGraphWithProperties;
207/// use swh_graph::mph::DynMphf;
208/// use swh_graph::SwhGraphProperties;
209///
210/// swh_graph::graph::SwhUnidirectionalGraph::new(PathBuf::from("./graph"))
211/// .expect("Could not load graph")
212/// .init_properties()
213/// .load_properties(SwhGraphProperties::load_timestamps)
214/// .expect("Could not load timestamp properties")
215/// .properties()
216/// .author_timestamp(42);
217/// ```
218pub struct SwhGraphProperties<
219 MAPS: MaybeMaps,
220 TIMESTAMPS: MaybeTimestamps,
221 PERSONS: MaybePersons,
222 CONTENTS: MaybeContents,
223 STRINGS: MaybeStrings,
224 LABELNAMES: MaybeLabelNames,
225> {
226 path: PathBuf,
227 num_nodes: usize,
228 pub(crate) maps: MAPS,
229 pub(crate) timestamps: TIMESTAMPS,
230 pub(crate) persons: PERSONS,
231 pub(crate) contents: CONTENTS,
232 pub(crate) strings: STRINGS,
233 pub(crate) label_names: LABELNAMES,
234 /// Hack: `Some(false)` if the graph was compressed with Rust (2023-09-06 and newer),
235 /// `Some(true)` if the graph was compressed with Java (2022-12-07 and older),
236 /// `None` if we don't know yet (as we compute this lazily)
237 pub(crate) label_names_are_in_base64_order: once_cell::race::OnceBool,
238}
239
/// [`SwhGraphProperties`] with every type parameter set to its `Mapped*` implementation
///
/// This is the type returned by [`SwhGraphProperties::load_all`].
pub type AllSwhGraphProperties<MPHF> = SwhGraphProperties<
    MappedMaps<MPHF>,
    MappedTimestamps,
    MappedPersons,
    MappedContents,
    MappedStrings,
    MappedLabelNames,
>;
248
/// Variant of [`AllSwhGraphProperties`] where timestamps, persons, contents and strings
/// use their `OptMapped*` implementation instead of the `Mapped*` one
///
/// NOTE(review): presumably this is the type obtained when these properties are loaded
/// with the `opt_load_*` methods, so that missing data files are reported when a property
/// is accessed (see [`PropertiesResult`]) -- confirm against the loaders.
pub type AllSwhGraphDynProperties<MPHF> = SwhGraphProperties<
    MappedMaps<MPHF>,
    OptMappedTimestamps,
    OptMappedPersons,
    OptMappedContents,
    OptMappedStrings,
    MappedLabelNames,
>;
257
258impl SwhGraphProperties<NoMaps, NoTimestamps, NoPersons, NoContents, NoStrings, NoLabelNames> {
259 /// Creates an empty [`SwhGraphProperties`] instance, which will load properties
260 /// from the given path prefix.
261 pub fn new(path: impl AsRef<Path>, num_nodes: usize) -> Self {
262 SwhGraphProperties {
263 path: path.as_ref().to_owned(),
264 num_nodes,
265 maps: NoMaps,
266 timestamps: NoTimestamps,
267 persons: NoPersons,
268 contents: NoContents,
269 strings: NoStrings,
270 label_names: NoLabelNames,
271 label_names_are_in_base64_order: Default::default(),
272 }
273 }
274
275 /// Consumes an empty [`SwhGraphProperties`] instance and returns a new one
276 /// with all properties loaded and all methods available.
277 ///
278 /// ```no_run
279 /// # use std::path::PathBuf;
280 /// use swh_graph::graph::SwhGraphWithProperties;
281 /// use swh_graph::mph::DynMphf;
282 /// use swh_graph::SwhGraphProperties;
283 ///
284 /// SwhGraphProperties::new(PathBuf::from("./graph"), 123)
285 /// .load_all::<DynMphf>()
286 /// .expect("Could not load properties");
287 /// ```
288 ///
289 /// is equivalent to:
290 ///
291 /// ```no_run
292 /// # use std::path::PathBuf;
293 /// use swh_graph::mph::DynMphf;
294 /// use swh_graph::SwhGraphProperties;
295 ///
296 /// SwhGraphProperties::new(PathBuf::from("./graph"), 123)
297 /// .load_maps::<DynMphf>()
298 /// .expect("Could not load node2swhid/swhid2node")
299 /// .load_timestamps()
300 /// .expect("Could not load timestamp properties")
301 /// .load_persons()
302 /// .expect("Could not load person properties")
303 /// .load_contents()
304 /// .expect("Could not load content properties")
305 /// .load_strings()
306 /// .expect("Could not load string properties");
307 /// ```
308 pub fn load_all<MPHF: LoadableSwhidMphf>(self) -> Result<AllSwhGraphProperties<MPHF>> {
309 self.load_maps()?
310 .load_timestamps()?
311 .load_persons()?
312 .load_contents()?
313 .load_strings()?
314 .load_label_names()
315 }
316}
317
318mod maps;
319pub use maps::{MappedMaps, Maps, MaybeMaps, NoMaps, NodeIdFromSwhidError, VecMaps};
320
321mod timestamps;
322pub use timestamps::{
323 MappedTimestamps, MaybeTimestamps, NoTimestamps, OptMappedTimestamps, OptTimestamps,
324 Timestamps, VecTimestamps,
325};
326
327mod persons;
328pub use persons::{
329 MappedPersons, MaybePersons, NoPersons, OptMappedPersons, OptPersons, Persons, VecPersons,
330};
331
332mod contents;
333pub use contents::{
334 Contents, MappedContents, MaybeContents, NoContents, OptContents, OptMappedContents,
335 VecContents,
336};
337
338mod strings;
339pub use strings::{
340 MappedStrings, MaybeStrings, NoStrings, OptMappedStrings, OptStrings, Strings, VecStrings,
341};
342
343mod label_names;
344pub use label_names::{
345 LabelIdFromNameError, LabelNames, MappedLabelNames, MaybeLabelNames, NoLabelNames,
346 VecLabelNames,
347};
348
349mod utils;
350use utils::*;