swh_graph/properties/
contents.rs

1// Copyright (C) 2023-2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use anyhow::{ensure, Context, Result};
7use mmap_rs::Mmap;
8
9use super::suffixes::*;
10use super::*;
11use crate::graph::NodeId;
12
13/// Trait implemented by both [`NoContents`] and all implementors of [`Contents`],
14/// to allow loading content properties only if needed.
15pub trait MaybeContents {}
16impl<C: OptContents> MaybeContents for C {}
17
18/// Placeholder for when "contents" properties are not loaded.
19#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
20pub struct NoContents;
21impl MaybeContents for NoContents {}
22
23/// Returns the bit at the given position, or `None` if it is out of bound
24///
25/// # Safety
26///
27/// The array must have length >= `num_bits.div_ceil(usize::BITS)`
28unsafe fn get_bit(
29    array: impl GetIndex<Output = u64>,
30    num_bits: usize,
31    bit_position: usize,
32) -> Option<bool> {
33    if bit_position >= num_bits {
34        None
35    } else {
36        let cell_id = bit_position / (u64::BITS as usize);
37        let mask = 1 << (bit_position % (u64::BITS as usize));
38
39        // safety: relies on the caller giving the right value for num_bits
40        let cell = unsafe { array.get_unchecked(cell_id) };
41
42        Some((cell & mask) != 0)
43    }
44}
45
46#[diagnostic::on_unimplemented(
47    label = "does not have Content properties loaded",
48    note = "Use `let graph = graph.load_properties(|props| props.load_contents()).unwrap()` to load them",
49    note = "Or replace `graph.init_properties()` with `graph.load_all_properties::<DynMphf>().unwrap()` to load all properties"
50)]
51/// Trait implemented by all implementors of [`MaybeContents`] but [`NoContents`]
52pub trait OptContents: MaybeContents + PropertiesBackend {
53    /// Returns whether the given node is a skipped content, or `None` if out of bounds
54    fn is_skipped_content(&self, node: NodeId) -> PropertiesResult<'_, Option<bool>, Self>;
55    /// Returns the content's length, or `None` if out of bounds, or `u64::MAX` if it
56    /// is not a content / does not not have a known length
57    fn content_length(&self, node: NodeId) -> PropertiesResult<'_, Option<u64>, Self>;
58}
59
60#[diagnostic::on_unimplemented(
61    label = "does not have Content properties loaded",
62    note = "Use `let graph = graph.load_properties(|props| props.load_contents()).unwrap()` to load them",
63    note = "Or replace `graph.init_properties()` with `graph.load_all_properties::<DynMphf>().unwrap()` to load all properties"
64)]
65/// Trait for backend storage of content properties (either in-memory or memory-mapped)
66pub trait Contents: OptContents<DataFilesAvailability = GuaranteedDataFiles> {}
67impl<S: OptContents<DataFilesAvailability = GuaranteedDataFiles>> Contents for S {}
68
69/// Variant of [`MappedStrings`] that checks at runtime that files are present every time
70/// it is accessed
71pub struct OptMappedContents {
72    num_nodes: usize,
73    is_skipped_content: Result<NumberMmap<BigEndian, u64, Mmap>, UnavailableProperty>,
74    content_length: Result<NumberMmap<BigEndian, u64, Mmap>, UnavailableProperty>,
75}
76impl PropertiesBackend for OptMappedContents {
77    type DataFilesAvailability = OptionalDataFiles;
78}
79impl OptContents for OptMappedContents {
80    #[inline(always)]
81    fn is_skipped_content(&self, node: NodeId) -> PropertiesResult<'_, Option<bool>, Self> {
82        // SAFETY: we check that num_nodes matches the mmap size when creating this struct
83        self.is_skipped_content
84            .as_ref()
85            .map(|is_skipped_content| unsafe { get_bit(is_skipped_content, self.num_nodes, node) })
86    }
87    #[inline(always)]
88    fn content_length(&self, node: NodeId) -> PropertiesResult<'_, Option<u64>, Self> {
89        self.content_length
90            .as_ref()
91            .map(|content_lengths| content_lengths.get(node))
92    }
93}
94
95pub struct MappedContents {
96    num_nodes: usize,
97    is_skipped_content: NumberMmap<BigEndian, u64, Mmap>,
98    content_length: NumberMmap<BigEndian, u64, Mmap>,
99}
100impl PropertiesBackend for MappedContents {
101    type DataFilesAvailability = GuaranteedDataFiles;
102}
103impl OptContents for MappedContents {
104    #[inline(always)]
105    fn is_skipped_content(&self, node: NodeId) -> Option<bool> {
106        // SAFETY: we check that num_nodes matches the mmap size when creating this struct
107        unsafe { get_bit(&self.is_skipped_content, self.num_nodes, node) }
108    }
109    #[inline(always)]
110    fn content_length(&self, node: NodeId) -> Option<u64> {
111        (&self.content_length).get(node)
112    }
113}
114
115#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
116pub struct VecContents {
117    num_nodes: usize,
118    is_skipped_content: Vec<u64>,
119    content_length: Vec<u64>,
120}
121
122impl VecContents {
123    pub fn new(data: Vec<(bool, Option<u64>)>) -> Result<Self> {
124        let num_nodes = data.len();
125        let bit_vec_len = num_nodes.div_ceil(64);
126        let mut is_skipped_content = vec![0; bit_vec_len];
127        let mut content_length = Vec::with_capacity(num_nodes);
128        for (node_id, (is_skipped, length)) in data.into_iter().enumerate() {
129            ensure!(
130                length != Some(u64::MAX),
131                "content length may not be {}",
132                u64::MAX
133            );
134            content_length.push(length.unwrap_or(u64::MAX));
135            if is_skipped {
136                let cell_id = node_id / (u64::BITS as usize);
137                let mask = 1 << (node_id % (u64::BITS as usize));
138                is_skipped_content[cell_id] |= mask;
139            }
140        }
141        Ok(VecContents {
142            num_nodes,
143            is_skipped_content,
144            content_length,
145        })
146    }
147}
148
149impl PropertiesBackend for VecContents {
150    type DataFilesAvailability = GuaranteedDataFiles;
151}
152impl OptContents for VecContents {
153    #[inline(always)]
154    fn is_skipped_content(&self, node: NodeId) -> Option<bool> {
155        // SAFETY: we check that num_nodes matches the mmap size when creating this struct
156        unsafe { get_bit(self.is_skipped_content.as_slice(), self.num_nodes, node) }
157    }
158    #[inline(always)]
159    fn content_length(&self, node: NodeId) -> Option<u64> {
160        self.content_length.get(node)
161    }
162}
163
164impl<
165        MAPS: MaybeMaps,
166        TIMESTAMPS: MaybeTimestamps,
167        PERSONS: MaybePersons,
168        STRINGS: MaybeStrings,
169        LABELNAMES: MaybeLabelNames,
170    > SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, NoContents, STRINGS, LABELNAMES>
171{
172    /// Consumes a [`SwhGraphProperties`] and returns a new one with these methods
173    /// available:
174    ///
175    /// * [`SwhGraphProperties::is_skipped_content`]
176    /// * [`SwhGraphProperties::content_length`]
177    pub fn load_contents(
178        self,
179    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, MappedContents, STRINGS, LABELNAMES>>
180    {
181        let OptMappedContents {
182            is_skipped_content,
183            content_length,
184            num_nodes,
185        } = self.get_contents()?;
186        let contents = MappedContents {
187            is_skipped_content: is_skipped_content?,
188            content_length: content_length?,
189            num_nodes,
190        };
191        self.with_contents(contents)
192    }
193
194    /// Equivalent to [`Self::load_contents`] that does not require all files to be present
195    pub fn opt_load_contents(
196        self,
197    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, OptMappedContents, STRINGS, LABELNAMES>>
198    {
199        let contents = self.get_contents()?;
200        self.with_contents(contents)
201    }
202
203    fn get_contents(&self) -> Result<OptMappedContents> {
204        Ok(OptMappedContents {
205            num_nodes: self.num_nodes,
206            is_skipped_content: load_if_exists(&self.path, CONTENT_IS_SKIPPED, |path| {
207                let num_bytes = self.num_nodes.div_ceil(u64::BITS.try_into().unwrap());
208                NumberMmap::new(path, num_bytes).context("Could not load is_skipped_content")
209            })?,
210            content_length: load_if_exists(&self.path, CONTENT_LENGTH, |path| {
211                NumberMmap::new(path, self.num_nodes).context("Could not load content_length")
212            })?,
213        })
214    }
215
216    /// Alternative to [`load_contents`](Self::load_contents) that allows using arbitrary
217    /// contents implementations
218    pub fn with_contents<CONTENTS: MaybeContents>(
219        self,
220        contents: CONTENTS,
221    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>> {
222        Ok(SwhGraphProperties {
223            maps: self.maps,
224            timestamps: self.timestamps,
225            persons: self.persons,
226            contents,
227            strings: self.strings,
228            label_names: self.label_names,
229            path: self.path,
230            num_nodes: self.num_nodes,
231            label_names_are_in_base64_order: self.label_names_are_in_base64_order,
232        })
233    }
234}
235
236/// Functions to access properties of `content` nodes
237///
238/// Only available after calling [`load_contents`](SwhGraphProperties::load_contents)
239/// or [`load_all_properties`](crate::graph::SwhBidirectionalGraph::load_all_properties)
240impl<
241        MAPS: MaybeMaps,
242        TIMESTAMPS: MaybeTimestamps,
243        PERSONS: MaybePersons,
244        CONTENTS: OptContents,
245        STRINGS: MaybeStrings,
246        LABELNAMES: MaybeLabelNames,
247    > SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
248{
249    /// Returns whether the node is a skipped content
250    ///
251    /// Non-content objects get a `false` value, like non-skipped contents.
252    ///
253    /// # Panics
254    ///
255    /// If the node id does not exist.
256    #[inline]
257    pub fn is_skipped_content(&self, node_id: NodeId) -> PropertiesResult<'_, bool, CONTENTS> {
258        CONTENTS::map_if_available(self.try_is_skipped_content(node_id), |is_skipped_content| {
259            is_skipped_content
260                .unwrap_or_else(|e| panic!("Cannot get is_skipped_content bit of node: {e}"))
261        })
262    }
263
264    /// Returns whether the node is a skipped content, or `Err` if the node id does not exist
265    ///
266    /// Non-content objects get a `false` value, like non-skipped contents.
267    #[inline]
268    pub fn try_is_skipped_content(
269        &self,
270        node_id: NodeId,
271    ) -> PropertiesResult<'_, Result<bool, OutOfBoundError>, CONTENTS> {
272        CONTENTS::map_if_available(
273            self.contents.is_skipped_content(node_id),
274            |is_skipped_content| match is_skipped_content {
275                None => Err(OutOfBoundError {
276                    index: node_id,
277                    len: self.num_nodes,
278                }),
279                Some(is_skipped_content) => Ok(is_skipped_content),
280            },
281        )
282    }
283
284    /// Returns the length of the given content.
285    ///
286    /// May be `None` for skipped contents
287    ///
288    /// # Panics
289    ///
290    /// If the node id does not exist.
291    #[inline]
292    pub fn content_length(&self, node_id: NodeId) -> PropertiesResult<'_, Option<u64>, CONTENTS> {
293        CONTENTS::map_if_available(self.try_content_length(node_id), |content_length| {
294            content_length.unwrap_or_else(|e| panic!("Cannot get content length: {e}"))
295        })
296    }
297
298    /// Returns the length of the given content, or `Err` if the node id does not exist
299    ///
300    /// May be `Ok(None)` for skipped contents
301    #[inline]
302    pub fn try_content_length(
303        &self,
304        node_id: NodeId,
305    ) -> PropertiesResult<'_, Result<Option<u64>, OutOfBoundError>, CONTENTS> {
306        CONTENTS::map_if_available(self.contents.content_length(node_id), |content_length| {
307            match content_length {
308                None => Err(OutOfBoundError {
309                    // id does not exist
310                    index: node_id,
311                    len: self.num_nodes,
312                }),
313                Some(u64::MAX) => Ok(None), // Skipped content with no length
314                Some(length) => Ok(Some(length)),
315            }
316        })
317    }
318}