swh_graph/properties/
label_names.rs

1// Copyright (C) 2023-2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use anyhow::{Context, Result};
7use mmap_rs::Mmap;
8use rayon::prelude::*;
9use thiserror::Error;
10
11use super::suffixes::*;
12use super::*;
13use crate::front_coded_list::FrontCodedList;
14use crate::labels::LabelNameId;
15use crate::utils::suffix_path;
16
17/// Trait implemented by both [`NoLabelNames`] and all implementors of [`LabelNames`],
18/// to allow loading label names only if needed.
19pub trait MaybeLabelNames {}
20
21pub struct MappedLabelNames {
22    label_names: FrontCodedList<Mmap, Mmap>,
23}
24impl<L: LabelNames> MaybeLabelNames for L {}
25
26/// Placeholder for when label names are not loaded.
27#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
28pub struct NoLabelNames;
29impl MaybeLabelNames for NoLabelNames {}
30
31/// Trait for backend storage of label names (either in-memory or memory-mapped)
32#[diagnostic::on_unimplemented(
33    label = "does not have label names loaded",
34    note = "Use `let graph = graph.load_properties(|props| props.load_label_names()).unwrap()` to load them",
35    note = "Or replace `graph.init_properties()` with `graph.load_all_properties::<DynMphf>().unwrap()` to load all properties"
36)]
37pub trait LabelNames {
38    type LabelNames<'a>: GetIndex<Output = Vec<u8>>
39    where
40        Self: 'a;
41
42    fn label_names(&self) -> Self::LabelNames<'_>;
43}
44
45impl LabelNames for MappedLabelNames {
46    type LabelNames<'a>
47        = &'a FrontCodedList<Mmap, Mmap>
48    where
49        Self: 'a;
50
51    #[inline(always)]
52    fn label_names(&self) -> Self::LabelNames<'_> {
53        &self.label_names
54    }
55}
56
57#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
58pub struct VecLabelNames {
59    label_names: Vec<Vec<u8>>,
60}
61impl VecLabelNames {
62    /// Returns a new `VecLabelNames` from a list of label names
63    pub fn new<S: AsRef<[u8]>>(label_names: Vec<S>) -> Result<Self> {
64        let base64 = base64_simd::STANDARD;
65        Ok(VecLabelNames {
66            label_names: label_names
67                .into_iter()
68                .map(|s| base64.encode_to_string(s).into_bytes())
69                .collect(),
70        })
71    }
72}
73
74impl LabelNames for VecLabelNames {
75    type LabelNames<'a>
76        = &'a [Vec<u8>]
77    where
78        Self: 'a;
79
80    #[inline(always)]
81    fn label_names(&self) -> Self::LabelNames<'_> {
82        self.label_names.as_slice()
83    }
84}
85
86impl<
87        MAPS: MaybeMaps,
88        TIMESTAMPS: MaybeTimestamps,
89        PERSONS: MaybePersons,
90        CONTENTS: MaybeContents,
91        STRINGS: MaybeStrings,
92    > SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, NoLabelNames>
93{
94    /// Consumes a [`SwhGraphProperties`] and returns a new one with this methods
95    /// available:
96    ///
97    /// * [`SwhGraphProperties::label_name`]
98    /// * [`SwhGraphProperties::label_name_base64`]
99    pub fn load_label_names(
100        self,
101    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, MappedLabelNames>>
102    {
103        let label_names = MappedLabelNames {
104            label_names: FrontCodedList::load(suffix_path(&self.path, LABEL_NAME))
105                .context("Could not load label names")?,
106        };
107        self.with_label_names(label_names)
108    }
109
110    /// Alternative to [`load_label_names`](Self::load_label_names) that allows using
111    /// arbitrary label_names implementations
112    pub fn with_label_names<LABELNAMES: MaybeLabelNames>(
113        self,
114        label_names: LABELNAMES,
115    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>> {
116        Ok(SwhGraphProperties {
117            maps: self.maps,
118            timestamps: self.timestamps,
119            persons: self.persons,
120            contents: self.contents,
121            strings: self.strings,
122            label_names,
123            path: self.path,
124            num_nodes: self.num_nodes,
125            label_names_are_in_base64_order: Default::default(), // do not keep the current value
126        })
127    }
128}
129
130/// Functions to access names of arc labels.
131///
132/// Only available after calling [`load_label_names`](SwhGraphProperties::load_label_names)
133/// or [`load_all_properties`](crate::graph::SwhBidirectionalGraph::load_all_properties).
134impl<
135        MAPS: MaybeMaps,
136        TIMESTAMPS: MaybeTimestamps,
137        PERSONS: MaybePersons,
138        CONTENTS: MaybeContents,
139        STRINGS: MaybeStrings,
140        LABELNAMES: LabelNames,
141    > SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
142{
143    /// Returns the total number of labels in the graph
144    pub fn num_label_names(&self) -> u64 {
145        self.label_names
146            .label_names()
147            .len()
148            .try_into()
149            .expect("label_names.len() overflowed u64")
150    }
151
152    /// Returns an iterator on [`LabelNameId`]s for this graph
153    pub fn iter_label_name_ids(&self) -> impl Iterator<Item = LabelNameId> {
154        (0..self.num_label_names()).map(LabelNameId)
155    }
156
157    /// Returns an iterator on [`LabelNameId`]s for this graph
158    pub fn par_iter_label_name_ids(&self) -> impl ParallelIterator<Item = LabelNameId> {
159        (0..self.num_label_names()).into_par_iter().map(LabelNameId)
160    }
161
162    /// Returns the base64-encoded name of an arc label
163    ///
164    /// This is the file name (resp. branch name) of a label on an arc from a directory
165    /// (resp. snapshot)
166    ///
167    /// # Panics
168    ///
169    /// If `label_name_id` does not exist
170    #[inline]
171    pub fn label_name_base64(&self, label_name_id: LabelNameId) -> Vec<u8> {
172        self.try_label_name_base64(label_name_id)
173            .unwrap_or_else(|e| panic!("Cannot get label name: {e}"))
174    }
175
176    /// Returns the base64-encoded name of an arc label
177    ///
178    /// This is the file name (resp. branch name) of a label on an arc from a directory
179    /// (resp. snapshot)
180    #[inline]
181    pub fn try_label_name_base64(
182        &self,
183        label_name_id: LabelNameId,
184    ) -> Result<Vec<u8>, OutOfBoundError> {
185        let index = label_name_id
186            .0
187            .try_into()
188            .expect("label_name_id overflowed usize");
189        self.label_names
190            .label_names()
191            .get(index)
192            .ok_or(OutOfBoundError {
193                index,
194                len: self.label_names.label_names().len(),
195            })
196    }
197
198    /// Returns the name of an arc label
199    ///
200    /// This is the file name (resp. branch name) of a label on an arc from a directory
201    /// (resp. snapshot)
202    ///
203    /// # Panics
204    ///
205    /// If `label_name_id` does not exist
206    #[inline]
207    pub fn label_name(&self, label_name_id: LabelNameId) -> Vec<u8> {
208        self.try_label_name(label_name_id)
209            .unwrap_or_else(|e| panic!("Cannot get label name: {e}"))
210    }
211
212    /// Returns the name of an arc label
213    ///
214    /// This is the file name (resp. branch name) of a label on an arc from a directory
215    /// (resp. snapshot)
216    #[inline]
217    pub fn try_label_name(&self, label_name_id: LabelNameId) -> Result<Vec<u8>, OutOfBoundError> {
218        let base64 = base64_simd::STANDARD;
219        self.try_label_name_base64(label_name_id).map(|name| {
220            base64.decode_to_vec(name).unwrap_or_else(|name| {
221                panic!(
222                    "Could not decode label_name of id {}: {:?}",
223                    label_name_id.0, name
224                )
225            })
226        })
227    }
228
229    /// Given a branch/file name, returns the label_name id used by edges with that name,
230    /// or `None` if it does not exist.
231    ///
232    /// This is the inverse function of [`label_name`](Self::label_name)
233    ///
234    /// Unlike in Java where this function is `O(1)`, this implementation is `O(log2(num_labels))`
235    /// because it uses a binary search, as the MPH function can only be read from Java.
236    #[inline]
237    pub fn label_name_id(
238        &self,
239        name: impl AsRef<[u8]>,
240    ) -> Result<LabelNameId, LabelIdFromNameError> {
241        use std::cmp::Ordering::*;
242
243        let base64 = base64_simd::STANDARD;
244        let name = name.as_ref();
245        let name_base64 = base64.encode_to_string(name.as_ref()).into_bytes();
246
247        macro_rules! bisect {
248            ($compare_pivot:expr) => {{
249                let compare_pivot = $compare_pivot;
250                let res: Result<LabelNameId, LabelIdFromNameError> = {
251                    // both inclusive
252                    let mut min = 0;
253                    let mut max = self
254                        .label_names
255                        .label_names()
256                        .len()
257                        .saturating_sub(1)
258                        .try_into()
259                        .expect("number of labels overflowed u64");
260
261                    while min <= max {
262                        let pivot = (min + max) / 2;
263                        let pivot_id = LabelNameId(pivot);
264                        let pivot_name = self.label_name_base64(pivot_id);
265                        if min == max {
266                            if pivot_name.as_slice() == name_base64 {
267                                return Ok(pivot_id);
268                            } else {
269                                break;
270                            }
271                        } else {
272                            match compare_pivot(pivot_id) {
273                                Less => min = pivot.saturating_add(1),
274                                Equal => return Ok(pivot_id),
275                                Greater => max = pivot.saturating_sub(1),
276                            }
277                        }
278                    }
279
280                    Err(LabelIdFromNameError(name_base64.to_vec()))
281                };
282                res
283            }};
284        }
285
286        let bisect_base64 = || {
287            bisect!(|pivot_id| self
288                .label_name_base64(pivot_id)
289                .as_slice()
290                .cmp(&name_base64))
291        };
292        let bisect_nonbase64 =
293            || bisect!(|pivot_id| self.label_name(pivot_id).as_slice().cmp(name));
294
295        match self.label_names_are_in_base64_order.get() {
296            Some(true) => {
297                // Graphs compressed with the Java implementation (2022-12-07 and older) sort
298                // labels lexicographically by base64(name)
299                bisect_base64()
300            }
301            Some(false) => {
302                // Graphs compressed with the Rust implementation (2023-09-06 and newer) sort
303                // labels lexicographically by name
304                bisect_nonbase64()
305            }
306            None => {
307                // We don't know yet which one this graph uses.
308                // Assume new order for now.
309                match bisect_nonbase64() {
310                    Ok(label_name_id) => {
311                        // we cannot draw any conclusion yet, because we might just have
312                        // been lucky and not hit a pivot that triggers the non-monotonicity
313                        // of base64-encoding between that pivot and `name`.
314                        // Let's check the old order.
315                        if let Err(LabelIdFromNameError(_)) = bisect_base64() {
316                            // Found with the new order but not the old order, which means
317                            // that label names are in the new order
318                            log::debug!("Labels are not in base64 order");
319                            let _ = self.label_names_are_in_base64_order.set(false);
320                        }
321                        Ok(label_name_id)
322                    }
323                    Err(LabelIdFromNameError(_)) => {
324                        // Not found using the new order, try with the old order
325                        match bisect_base64() {
326                            Ok(label_name_id) => {
327                                // Found with the old order, which means label names are in the old
328                                // order.
329                                log::debug!("Labels are in base64 order");
330                                let _ = self.label_names_are_in_base64_order.set(true);
331                                Ok(label_name_id)
332                            }
333                            Err(LabelIdFromNameError(e)) => {
334                                // Not found either, we cannot draw any conclusion
335                                Err(LabelIdFromNameError(e))
336                            }
337                        }
338                    }
339                }
340            }
341        }
342    }
343}
344
345#[derive(Clone, Debug, Eq, PartialEq, Error)]
346#[error("Unknown label name: {}", String::from_utf8_lossy(.0))]
347pub struct LabelIdFromNameError(pub Vec<u8>);