swh_graph/properties/mod.rs
1// Copyright (C) 2023-2026 The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
//! Node properties
7//!
//! [`SwhGraphProperties`] is populated by the `load_properties` and `load_all_properties`
//! methods of [`SwhUnidirectionalGraph`](crate::graph::SwhUnidirectionalGraph) and
//! [`SwhBidirectionalGraph`](crate::graph::SwhBidirectionalGraph) and returned by
//! their `properties` method.
12//!
13//! ```no_run
14//! # use std::path::PathBuf;
15//! use swh_graph::graph::SwhGraphWithProperties;
16//! use swh_graph::mph::DynMphf;
17//! use swh_graph::SwhGraphProperties;
18//!
19//! let properties: &SwhGraphProperties<_, _, _, _, _, _> =
20//! swh_graph::graph::SwhUnidirectionalGraph::new(PathBuf::from("./graph"))
21//! .expect("Could not load graph")
22//! .load_all_properties::<DynMphf>()
23//! .expect("Could not load properties")
24//! .properties();
25//! ```
26
27use std::path::{Path, PathBuf};
28
29use anyhow::{Context, Result};
30use byteorder::BigEndian;
31use mmap_rs::Mmap;
32
33use crate::mph::LoadableSwhidMphf;
34use crate::utils::mmap::NumberMmap;
35use crate::OutOfBoundError;
36use value_traits::slices::SliceByValue;
37
/// File-name suffixes of the on-disk files backing graph properties.
///
/// Constants are grouped by the property family their names suggest.
/// NOTE(review): presumably each suffix is appended to the graph's path prefix to obtain
/// the path of the corresponding file — confirm against the loaders.
pub(crate) mod suffixes {
    // Node maps
    pub const NODE2SWHID: &str = ".node2swhid.bin";
    pub const NODE2TYPE: &str = ".node2type.bin";

    // Timestamps
    pub const AUTHOR_TIMESTAMP: &str = ".property.author_timestamp.bin";
    pub const AUTHOR_TIMESTAMP_OFFSET: &str = ".property.author_timestamp_offset.bin";
    pub const COMMITTER_TIMESTAMP: &str = ".property.committer_timestamp.bin";
    pub const COMMITTER_TIMESTAMP_OFFSET: &str = ".property.committer_timestamp_offset.bin";

    // Persons
    pub const AUTHOR_ID: &str = ".property.author_id.bin";
    pub const COMMITTER_ID: &str = ".property.committer_id.bin";
    pub const PERSONS_COUNT: &str = ".persons.count.txt";

    // Contents
    pub const CONTENT_IS_SKIPPED: &str = ".property.content.is_skipped.bits";
    pub const CONTENT_LENGTH: &str = ".property.content.length.bin";

    // Strings (messages and tag names)
    pub const MESSAGE: &str = ".property.message.bin";
    pub const MESSAGE_OFFSET: &str = ".property.message.offset.bin";
    pub const TAG_NAME: &str = ".property.tag_name.bin";
    pub const TAG_NAME_OFFSET: &str = ".property.tag_name.offset.bin";

    // Label names
    pub const LABEL_NAME: &str = ".labels.fcl";
}
56
/// Error signalling that the file backing a property could not be loaded.
///
/// Its message has the form `{path} cannot be loaded: {source}`; the underlying
/// [`std::io::Error`] is additionally exposed as the error's source.
#[derive(thiserror::Error, Debug)]
#[error("{path} cannot be loaded: {source}")]
pub struct UnavailableProperty {
    /// Path reported in the error message
    path: PathBuf,
    /// I/O error explaining why loading failed
    #[source]
    source: std::io::Error,
}
64
/// Wrapper for the return type of [`SwhGraphProperties`] methods.
///
/// The actual type is selected by `B::DataFilesAvailability`:
///
/// When it is [`GuaranteedDataFiles`] (the most common case), `PropertiesResult<'err, T, B>`
/// is exactly the same type as `T`.
///
/// When it is [`OptionalDataFiles`] (which is the case when using
/// `opt_load_*` instead of `load_*` or [`load_all`](SwhGraphProperties::load_all) for example),
/// then `PropertiesResult<'err, T, B>` is exactly the same type as
/// `Result<T, &'err UnavailableProperty>`.
pub type PropertiesResult<'err, T, B> =
    <<B as PropertiesBackend>::DataFilesAvailability as DataFilesAvailability>::Result<'err, T>;
75
/// Common trait for type parameters of [`SwhGraphProperties`]
///
/// Its associated type selects the shape of [`PropertiesResult`], and its provided
/// methods let generic code transform such results without knowing that shape.
pub trait PropertiesBackend {
    /// Marker type (such as [`GuaranteedDataFiles`] or [`OptionalDataFiles`]) telling
    /// whether values read through this backend are wrapped in a `Result`
    type DataFilesAvailability: DataFilesAvailability;

    /// Applies the given function `f` to the value `v` if the value is available
    ///
    /// This is an alias for `<Self::DataFilesAvailability as DataFilesAvailability>::map(v, f)`,
    /// meaning that:
    ///
    /// 1. if `Self::DataFilesAvailability` is `GuaranteedDataFiles`, then `map_if_available(v, f)`
    ///    is equivalent to `f(v)` and has type `U`
    /// 2. if `Self::DataFilesAvailability` is `OptionalDataFiles`, then `map_if_available(v, f)`
    ///    is equivalent to `v.map(f)` and has type `Result<U, &'err UnavailableProperty>`
    fn map_if_available<T, U>(
        v: <Self::DataFilesAvailability as DataFilesAvailability>::Result<'_, T>,
        f: impl FnOnce(T) -> U,
    ) -> <Self::DataFilesAvailability as DataFilesAvailability>::Result<'_, U> {
        <Self::DataFilesAvailability as DataFilesAvailability>::map(v, f)
    }

    /// Returns `(v1, v2)` if both are available, or an error otherwise
    ///
    /// This is an alias for
    /// `<Self::DataFilesAvailability as DataFilesAvailability>::zip(v1, v2)`,
    /// meaning that:
    ///
    /// 1. if `Self::DataFilesAvailability` is `GuaranteedDataFiles`, then `zip_if_available(v1, v2)`
    ///    is equivalent to `(v1, v2)` and has type `(T1, T2)`
    /// 2. if `Self::DataFilesAvailability` is `OptionalDataFiles`, then `zip_if_available(v1, v2)`
    ///    is equivalent to `v1.and_then(|v1| v2.map(|v2| (v1, v2)))` and has type
    ///    `Result<(T1, T2), &'err UnavailableProperty>`
    fn zip_if_available<'err, T1, T2>(
        v1: <Self::DataFilesAvailability as DataFilesAvailability>::Result<'err, T1>,
        v2: <Self::DataFilesAvailability as DataFilesAvailability>::Result<'err, T2>,
    ) -> <Self::DataFilesAvailability as DataFilesAvailability>::Result<'err, (T1, T2)> {
        <Self::DataFilesAvailability as DataFilesAvailability>::zip(v1, v2)
    }
}
113
/// Helper trait to work with [`PropertiesResult`]
///
/// It is implemented by:
/// * [`GuaranteedDataFiles`]: the common case, where data files are guaranteed to exist
///   once a graph is loaded, in which case `Self::Result<'err, T>` is the same type as `T`
/// * [`OptionalDataFiles`]: when they are not, in which case `Self::Result<'err, T>`
///   is the same type as `Result<T, &'err UnavailableProperty>`.
pub trait DataFilesAvailability {
    /// Shape in which a value of type `T` is handed out: either `T` itself, or a `Result`
    /// whose error borrows an [`UnavailableProperty`] for `'err`
    type Result<'err, T>;

    /// Applies `f` to the value held by `v`, if that value is available
    fn map<T, U>(v: Self::Result<'_, T>, f: impl FnOnce(T) -> U) -> Self::Result<'_, U>;
    /// Returns `(v1, v2)` if both values are available, or an error otherwise
    fn zip<'err, T1, T2>(
        v1: Self::Result<'err, T1>,
        v2: Self::Result<'err, T2>,
    ) -> Self::Result<'err, (T1, T2)>;
    /// Converts `value` to a plain `Result`, so callers can handle both implementations
    /// uniformly
    fn make_result<T>(value: Self::Result<'_, T>) -> Result<T, &UnavailableProperty>;
}
131
/// Helper type that implements [`DataFilesAvailability`] to signal underlying data files
/// may be missing at runtime
///
/// With this marker, `DataFilesAvailability::Result<'err, T>` is
/// `Result<T, &'err UnavailableProperty>`.
pub struct OptionalDataFiles {
    // Private field: prevents users from instantiating this type
    _marker: (),
}
137
/// Helper type that implements [`DataFilesAvailability`] to signal underlying data files
/// are guaranteed to be available once the graph is loaded
///
/// With this marker, `DataFilesAvailability::Result<'err, T>` is simply `T`.
pub struct GuaranteedDataFiles {
    // Private field: prevents users from instantiating this type
    _marker: (),
}
143
144impl DataFilesAvailability for OptionalDataFiles {
145 type Result<'err, T> = Result<T, &'err UnavailableProperty>;
146
147 #[inline(always)]
148 fn map<T, U>(v: Self::Result<'_, T>, f: impl FnOnce(T) -> U) -> Self::Result<'_, U> {
149 v.map(f)
150 }
151
152 #[inline(always)]
153 fn zip<'err, T1, T2>(
154 v1: Self::Result<'err, T1>,
155 v2: Self::Result<'err, T2>,
156 ) -> Self::Result<'err, (T1, T2)> {
157 v1.and_then(|v1| v2.map(|v2| (v1, v2)))
158 }
159
160 #[inline(always)]
161 fn make_result<T>(value: Self::Result<'_, T>) -> Result<T, &UnavailableProperty> {
162 value
163 }
164}
165
/// Data files are guaranteed to be available: values are handed out bare, without any
/// `Result` wrapper.
impl DataFilesAvailability for GuaranteedDataFiles {
    type Result<'err, T> = T;

    /// The value is always available, so `f` is applied unconditionally.
    #[inline(always)]
    fn map<T, U>(v: Self::Result<'_, T>, f: impl FnOnce(T) -> U) -> Self::Result<'_, U> {
        f(v)
    }

    /// Both values are always available, so they are simply paired.
    #[inline(always)]
    fn zip<'err, T1, T2>(
        v1: Self::Result<'err, T1>,
        v2: Self::Result<'err, T2>,
    ) -> Self::Result<'err, (T1, T2)> {
        (v1, v2)
    }

    /// Always returns `Ok`; the error variant is never produced by this implementation.
    #[inline(always)]
    fn make_result<T>(value: Self::Result<'_, T>) -> Result<T, &UnavailableProperty> {
        Ok(value)
    }
}
187
/// Properties on graph nodes
///
/// This structure has many type parameters, to allow loading only some properties,
/// and checking at compile time that only loaded properties are accessed.
///
/// Extra properties can be loaded, following the builder pattern on the owning graph.
/// For example, this does not compile:
///
/// ```compile_fail
/// # use std::path::PathBuf;
/// use swh_graph::graph::SwhGraphWithProperties;
/// use swh_graph::mph::DynMphf;
/// use swh_graph::SwhGraphProperties;
///
/// swh_graph::graph::SwhUnidirectionalGraph::new(PathBuf::from("./graph"))
///     .expect("Could not load graph")
///     .init_properties()
///     .properties()
///     .author_timestamp(42);
/// ```
///
/// but this does:
///
/// ```no_run
/// # use std::path::PathBuf;
/// use swh_graph::graph::SwhGraphWithProperties;
/// use swh_graph::mph::DynMphf;
/// use swh_graph::SwhGraphProperties;
///
/// swh_graph::graph::SwhUnidirectionalGraph::new(PathBuf::from("./graph"))
///     .expect("Could not load graph")
///     .init_properties()
///     .load_properties(SwhGraphProperties::load_timestamps)
///     .expect("Could not load timestamp properties")
///     .properties()
///     .author_timestamp(42);
/// ```
#[derive(Debug)]
pub struct SwhGraphProperties<
    MAPS: MaybeMaps,
    TIMESTAMPS: MaybeTimestamps,
    PERSONS: MaybePersons,
    CONTENTS: MaybeContents,
    STRINGS: MaybeStrings,
    LABELNAMES: MaybeLabelNames,
> {
    /// Path prefix properties are loaded from
    pub(crate) path: PathBuf,
    /// Number of nodes, as given to [`SwhGraphProperties::new`]
    pub(crate) num_nodes: usize,
    /// Node maps, or the `NoMaps` placeholder while they are not loaded
    pub(crate) maps: MAPS,
    /// Timestamp properties, or the `NoTimestamps` placeholder while they are not loaded
    pub(crate) timestamps: TIMESTAMPS,
    /// Person properties, or the `NoPersons` placeholder while they are not loaded
    pub(crate) persons: PERSONS,
    /// Content properties, or the `NoContents` placeholder while they are not loaded
    pub(crate) contents: CONTENTS,
    /// String properties, or the `NoStrings` placeholder while they are not loaded
    pub(crate) strings: STRINGS,
    /// Label names, or the `NoLabelNames` placeholder while they are not loaded
    pub(crate) label_names: LABELNAMES,
    /// Hack: `Some(false)` if the graph was compressed with Rust (2023-09-06 and newer),
    /// `Some(true)` if the graph was compressed with Java (2022-12-07 and older),
    /// `None` if we don't know yet (as we compute this lazily).
    ///
    /// NOTE(review): `Some`/`None` presumably refer to what `OnceBool::get` returns,
    /// since the field itself is a `OnceBool` rather than an `Option<bool>` — confirm.
    pub(crate) label_names_are_in_base64_order: once_cell::race::OnceBool,
}
247
/// [`SwhGraphProperties`] with every property family using its `Mapped*` backend.
///
/// This is the type returned by [`SwhGraphProperties::load_all`].
pub type AllSwhGraphProperties<MPHF> = SwhGraphProperties<
    MappedMaps<MPHF>,
    MappedTimestamps,
    MappedPersons,
    MappedContents,
    MappedStrings,
    MappedLabelNames,
>;
256
/// [`SwhGraphProperties`] where timestamps, persons, contents, and strings use their
/// `OptMapped*` backends, while maps and label names use the `Mapped*` ones.
///
/// NOTE(review): presumably the `OptMapped*` backends are the ones whose data files may
/// be missing at runtime (see [`OptionalDataFiles`]) — confirm in their modules.
pub type AllSwhGraphDynProperties<MPHF> = SwhGraphProperties<
    MappedMaps<MPHF>,
    OptMappedTimestamps,
    OptMappedPersons,
    OptMappedContents,
    OptMappedStrings,
    MappedLabelNames,
>;
265
266impl SwhGraphProperties<NoMaps, NoTimestamps, NoPersons, NoContents, NoStrings, NoLabelNames> {
267 /// Creates an empty [`SwhGraphProperties`] instance, which will load properties
268 /// from the given path prefix.
269 pub fn new(path: impl AsRef<Path>, num_nodes: usize) -> Self {
270 SwhGraphProperties {
271 path: path.as_ref().to_owned(),
272 num_nodes,
273 maps: NoMaps,
274 timestamps: NoTimestamps,
275 persons: NoPersons,
276 contents: NoContents,
277 strings: NoStrings,
278 label_names: NoLabelNames,
279 label_names_are_in_base64_order: Default::default(),
280 }
281 }
282
283 /// Consumes an empty [`SwhGraphProperties`] instance and returns a new one
284 /// with all properties loaded and all methods available.
285 ///
286 /// ```no_run
287 /// # use std::path::PathBuf;
288 /// use swh_graph::graph::SwhGraphWithProperties;
289 /// use swh_graph::mph::DynMphf;
290 /// use swh_graph::SwhGraphProperties;
291 ///
292 /// SwhGraphProperties::new(PathBuf::from("./graph"), 123)
293 /// .load_all::<DynMphf>()
294 /// .expect("Could not load properties");
295 /// ```
296 ///
297 /// is equivalent to:
298 ///
299 /// ```no_run
300 /// # use std::path::PathBuf;
301 /// use swh_graph::mph::DynMphf;
302 /// use swh_graph::SwhGraphProperties;
303 ///
304 /// SwhGraphProperties::new(PathBuf::from("./graph"), 123)
305 /// .load_maps::<DynMphf>()
306 /// .expect("Could not load node2swhid/swhid2node")
307 /// .load_timestamps()
308 /// .expect("Could not load timestamp properties")
309 /// .load_persons()
310 /// .expect("Could not load person properties")
311 /// .load_contents()
312 /// .expect("Could not load content properties")
313 /// .load_strings()
314 /// .expect("Could not load string properties");
315 /// ```
316 pub fn load_all<MPHF: LoadableSwhidMphf>(self) -> Result<AllSwhGraphProperties<MPHF>> {
317 self.load_maps()?
318 .load_timestamps()?
319 .load_persons()?
320 .load_contents()?
321 .load_strings()?
322 .load_label_names()
323 }
324}
325
326mod maps;
327pub use maps::{MappedMaps, Maps, MaybeMaps, NoMaps, NodeIdFromSwhidError, VecMaps};
328
329mod timestamps;
330pub use timestamps::{
331 MappedTimestamps, MaybeTimestamps, NoTimestamps, OptMappedTimestamps, OptTimestamps,
332 Timestamps, VecTimestamps,
333};
334
335mod persons;
336pub use persons::{
337 MappedPersons, MaybePersons, NoPersons, OptMappedPersons, OptPersons, Persons, VecPersons,
338};
339
340mod contents;
341pub use contents::{
342 Contents, MappedContents, MaybeContents, NoContents, OptContents, OptMappedContents,
343 VecContents,
344};
345
346mod strings;
347pub use strings::{
348 MappedStrings, MaybeStrings, NoStrings, OptMappedStrings, OptStrings, Strings, VecStrings,
349};
350
351mod label_names;
352pub use label_names::{
353 LabelIdFromNameError, LabelNames, MappedLabelNames, MaybeLabelNames, NoLabelNames,
354 VecLabelNames,
355};
356
357mod utils;
358use utils::*;