swh_graph/properties/
strings.rs

1// Copyright (C) 2023-2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use mmap_rs::Mmap;
7
8use super::suffixes::*;
9use super::*;
10use crate::graph::NodeId;
11
/// Marker trait implemented by both [`NoStrings`] and all implementors of [`OptStrings`]
/// (and therefore of [`Strings`]), to allow loading string properties only if needed.
pub trait MaybeStrings {}
// Blanket impl: every string-properties backend is automatically a `MaybeStrings`.
impl<S: OptStrings> MaybeStrings for S {}
16
/// Placeholder for when string properties are not loaded
///
/// It implements [`MaybeStrings`] only (not [`OptStrings`]), so the accessors that
/// require an [`OptStrings`] backend are not available while it is in use.
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct NoStrings;
impl MaybeStrings for NoStrings {}
21
22#[diagnostic::on_unimplemented(
23    label = "does not have String properties loaded",
24    note = "Use `let graph = graph.load_properties(|props| props.load_string()).unwrap()` to load them",
25    note = "Or replace `graph.init_properties()` with `graph.load_all_properties::<DynMphf>().unwrap()` to load all properties"
26)]
27/// Trait implemented by all implementors of [`MaybeStrings`] but [`NoStrings`]
28pub trait OptStrings: MaybeStrings + PropertiesBackend {
29    /// Returns an array with all messages, separated by `b'\n'`
30    fn message(&self) -> PropertiesResult<'_, &[u8], Self>;
31    /// Returns the position of the first character of `node`'s message in [`Self::message`],
32    /// or `None` if it is out of bound, or `Some(u64::MAX)` if the node has no message
33    fn message_offset(&self, node: NodeId) -> PropertiesResult<'_, Option<u64>, Self>;
34    /// Returns an array with all messages, separated by `b'\n'`
35    fn tag_name(&self) -> PropertiesResult<'_, &[u8], Self>;
36    /// Returns the position of the first character of `node`'s tag_name in [`Self::tag_name`],
37    /// or `None` if it is out of bound, or `Some(u64::MAX)` if the node has no tag_name
38    fn tag_name_offset(&self, node: NodeId) -> PropertiesResult<'_, Option<u64>, Self>;
39}
40
41#[diagnostic::on_unimplemented(
42    label = "does not have String properties loaded",
43    note = "Use `let graph = graph.load_properties(|props| props.load_string()).unwrap()` to load them",
44    note = "Or replace `graph.init_properties()` with `graph.load_all_properties::<DynMphf>().unwrap()` to load all properties"
45)]
46/// Trait for backend storage of string properties (either in-memory or memory-mapped)
47pub trait Strings: OptStrings<DataFilesAvailability = GuaranteedDataFiles> {}
48impl<S: OptStrings<DataFilesAvailability = GuaranteedDataFiles>> Strings for S {}
49
50/// Variant of [`MappedStrings`] that checks at runtime that files are present every time
51/// it is accessed
52pub struct OptMappedStrings {
53    message: Result<Mmap, UnavailableProperty>,
54    message_offset: Result<NumberMmap<BigEndian, u64, Mmap>, UnavailableProperty>,
55    tag_name: Result<Mmap, UnavailableProperty>,
56    tag_name_offset: Result<NumberMmap<BigEndian, u64, Mmap>, UnavailableProperty>,
57}
58impl PropertiesBackend for OptMappedStrings {
59    type DataFilesAvailability = OptionalDataFiles;
60}
61impl OptStrings for OptMappedStrings {
62    #[inline(always)]
63    fn message(&self) -> PropertiesResult<'_, &[u8], Self> {
64        self.message.as_deref()
65    }
66    #[inline(always)]
67    fn message_offset(&self, node: NodeId) -> PropertiesResult<'_, Option<u64>, Self> {
68        self.message_offset
69            .as_ref()
70            .map(|message_offsets| message_offsets.get(node))
71    }
72    #[inline(always)]
73    fn tag_name(&self) -> PropertiesResult<'_, &[u8], Self> {
74        self.tag_name.as_deref()
75    }
76    #[inline(always)]
77    fn tag_name_offset(&self, node: NodeId) -> PropertiesResult<'_, Option<u64>, Self> {
78        self.tag_name_offset
79            .as_ref()
80            .map(|tag_name_offsets| tag_name_offsets.get(node))
81    }
82}
83
84/// [`Strings`] implementation backed by files guaranteed to be present once the graph is loaded
85pub struct MappedStrings {
86    message: Mmap,
87    message_offset: NumberMmap<BigEndian, u64, Mmap>,
88    tag_name: Mmap,
89    tag_name_offset: NumberMmap<BigEndian, u64, Mmap>,
90}
91impl PropertiesBackend for MappedStrings {
92    type DataFilesAvailability = GuaranteedDataFiles;
93}
94impl OptStrings for MappedStrings {
95    #[inline(always)]
96    fn message(&self) -> &[u8] {
97        &self.message
98    }
99    #[inline(always)]
100    fn message_offset(&self, node: NodeId) -> Option<u64> {
101        (&self.message_offset).get(node)
102    }
103    #[inline(always)]
104    fn tag_name(&self) -> &[u8] {
105        &self.tag_name
106    }
107    #[inline(always)]
108    fn tag_name_offset(&self, node: NodeId) -> Option<u64> {
109        (&self.tag_name_offset).get(node)
110    }
111}
112
/// In-memory backend for string properties, built with [`VecStrings::new`]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct VecStrings {
    // Base64-encoded messages, each followed by `b'\n'`
    message: Vec<u8>,
    // Per-node start position in `message`, or `u64::MAX` when the node has no message
    message_offset: Vec<u64>,
    // Base64-encoded tag names, each followed by `b'\n'`
    tag_name: Vec<u8>,
    // Per-node start position in `tag_name`, or `u64::MAX` when the node has no tag name
    tag_name_offset: Vec<u64>,
}
120
121impl VecStrings {
122    /// Returns [`VecStrings`] from pairs of `(message, tag_name)`
123    pub fn new<Msg: AsRef<[u8]>, TagName: AsRef<[u8]>>(
124        data: Vec<(Option<Msg>, Option<TagName>)>,
125    ) -> Result<Self> {
126        let base64 = base64_simd::STANDARD;
127
128        let mut message = Vec::new();
129        let mut message_offset = Vec::new();
130        let mut tag_name = Vec::new();
131        let mut tag_name_offset = Vec::new();
132
133        for (msg, tag) in data.into_iter() {
134            match msg {
135                Some(msg) => {
136                    let msg = base64.encode_to_string(msg);
137                    message_offset.push(
138                        message
139                            .len()
140                            .try_into()
141                            .context("total message size overflowed usize")?,
142                    );
143                    message.extend(msg.as_bytes());
144                    message.push(b'\n');
145                }
146                None => message_offset.push(u64::MAX),
147            }
148            match tag {
149                Some(tag) => {
150                    let tag = base64.encode_to_string(tag);
151                    tag_name_offset.push(
152                        tag_name
153                            .len()
154                            .try_into()
155                            .context("total tag_name size overflowed usize")?,
156                    );
157                    tag_name.extend(tag.as_bytes());
158                    tag_name.push(b'\n');
159                }
160                None => tag_name_offset.push(u64::MAX),
161            }
162        }
163
164        Ok(VecStrings {
165            message,
166            message_offset,
167            tag_name,
168            tag_name_offset,
169        })
170    }
171}
172
173impl PropertiesBackend for VecStrings {
174    type DataFilesAvailability = GuaranteedDataFiles;
175}
176impl OptStrings for VecStrings {
177    #[inline(always)]
178    fn message(&self) -> &[u8] {
179        self.message.as_slice()
180    }
181    #[inline(always)]
182    fn message_offset(&self, node: NodeId) -> Option<u64> {
183        self.message_offset.get(node)
184    }
185    #[inline(always)]
186    fn tag_name(&self) -> &[u8] {
187        self.tag_name.as_slice()
188    }
189    #[inline(always)]
190    fn tag_name_offset(&self, node: NodeId) -> Option<u64> {
191        self.tag_name_offset.get(node)
192    }
193}
194
195impl<
196        MAPS: MaybeMaps,
197        TIMESTAMPS: MaybeTimestamps,
198        PERSONS: MaybePersons,
199        CONTENTS: MaybeContents,
200        LABELNAMES: MaybeLabelNames,
201    > SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, NoStrings, LABELNAMES>
202{
203    /// Consumes a [`SwhGraphProperties`] and returns a new one with these methods
204    /// available:
205    ///
206    /// * [`SwhGraphProperties::message_base64`]
207    /// * [`SwhGraphProperties::message`]
208    /// * [`SwhGraphProperties::tag_name_base64`]
209    /// * [`SwhGraphProperties::tag_name`]
210    pub fn load_strings(
211        self,
212    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, MappedStrings, LABELNAMES>>
213    {
214        let OptMappedStrings {
215            message,
216            message_offset,
217            tag_name,
218            tag_name_offset,
219        } = self.get_strings()?;
220        let strings = MappedStrings {
221            message: message?,
222            message_offset: message_offset?,
223            tag_name: tag_name?,
224            tag_name_offset: tag_name_offset?,
225        };
226        self.with_strings(strings)
227    }
228    /// Equivalent to [`Self::load_strings`] that does not require all files to be present
229    pub fn opt_load_strings(
230        self,
231    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, OptMappedStrings, LABELNAMES>>
232    {
233        let strings = self.get_strings()?;
234        self.with_strings(strings)
235    }
236
237    fn get_strings(&self) -> Result<OptMappedStrings> {
238        Ok(OptMappedStrings {
239            message: load_if_exists(&self.path, MESSAGE, |path| mmap(path))
240                .context("Could not load message")?,
241            message_offset: load_if_exists(&self.path, MESSAGE_OFFSET, |path| {
242                NumberMmap::new(path, self.num_nodes)
243            })
244            .context("Could not load message_offset")?,
245            tag_name: load_if_exists(&self.path, TAG_NAME, |path| mmap(path))
246                .context("Could not load tag_name")?,
247            tag_name_offset: load_if_exists(&self.path, TAG_NAME_OFFSET, |path| {
248                NumberMmap::new(path, self.num_nodes)
249            })
250            .context("Could not load tag_name_offset")?,
251        })
252    }
253
254    /// Alternative to [`load_strings`](Self::load_strings) that allows using arbitrary
255    /// strings implementations
256    pub fn with_strings<STRINGS: MaybeStrings>(
257        self,
258        strings: STRINGS,
259    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>> {
260        Ok(SwhGraphProperties {
261            maps: self.maps,
262            timestamps: self.timestamps,
263            persons: self.persons,
264            contents: self.contents,
265            strings,
266            label_names: self.label_names,
267            path: self.path,
268            num_nodes: self.num_nodes,
269            label_names_are_in_base64_order: self.label_names_are_in_base64_order,
270        })
271    }
272}
273
274/// Functions to access message of `revision`/`release` nodes, and names of `release` nodes
275///
276/// Only available after calling [`load_strings`](SwhGraphProperties::load_strings)
277/// or [`load_all_properties`](crate::graph::SwhBidirectionalGraph::load_all_properties)
278impl<
279        MAPS: MaybeMaps,
280        TIMESTAMPS: MaybeTimestamps,
281        PERSONS: MaybePersons,
282        CONTENTS: MaybeContents,
283        STRINGS: OptStrings,
284        LABELNAMES: MaybeLabelNames,
285    > SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
286{
287    #[inline(always)]
288    fn message_or_tag_name_base64<'a>(
289        &self,
290        what: &'static str,
291        data: &'a [u8],
292        offset: Option<u64>,
293        node_id: NodeId,
294    ) -> Result<Option<&'a [u8]>, OutOfBoundError> {
295        match offset {
296            None => Err(OutOfBoundError {
297                // Unknown node
298                index: node_id,
299                len: self.num_nodes,
300            }),
301            Some(u64::MAX) => Ok(None), // No message
302            Some(offset) => {
303                let offset = offset as usize;
304                let slice: &[u8] = data.get(offset..).unwrap_or_else(|| {
305                    panic!("Missing {what} for node {node_id} at offset {offset}")
306                });
307                Ok(slice
308                    .iter()
309                    .position(|&c| c == b'\n')
310                    .map(|end| &slice[..end]))
311            }
312        }
313    }
314
315    /// Returns the base64-encoded message of a revision or release,
316    /// or the base64-encoded URL of an origin
317    ///
318    /// # Panics
319    ///
320    /// If the node id does not exist
321    #[inline]
322    pub fn message_base64(&self, node_id: NodeId) -> PropertiesResult<'_, Option<&[u8]>, STRINGS> {
323        STRINGS::map_if_available(
324            self.try_message_base64(node_id),
325            |message: Result<_, OutOfBoundError>| {
326                message.unwrap_or_else(|e| panic!("Cannot get node message: {e}"))
327            },
328        )
329    }
330
331    /// Returns the base64-encoded message of a revision or release
332    ///
333    /// Returns `Err` if the node id is unknown, and `Ok(None)` if the node has
334    /// no message.
335    #[inline]
336    pub fn try_message_base64(
337        &self,
338        node_id: NodeId,
339    ) -> PropertiesResult<'_, Result<Option<&[u8]>, OutOfBoundError>, STRINGS> {
340        STRINGS::map_if_available(
341            STRINGS::zip_if_available(self.strings.message(), self.strings.message_offset(node_id)),
342            |(messages, message_offset)| {
343                self.message_or_tag_name_base64("message", messages, message_offset, node_id)
344            },
345        )
346    }
347    /// Returns the message of a revision or release,
348    /// or the URL of an origin
349    ///
350    /// # Panics
351    ///
352    /// If the node id does not exist
353    #[inline]
354    pub fn message(&self, node_id: NodeId) -> PropertiesResult<'_, Option<Vec<u8>>, STRINGS> {
355        STRINGS::map_if_available(self.try_message(node_id), |message| {
356            message.unwrap_or_else(|e| panic!("Cannot get node message: {e}"))
357        })
358    }
359
360    /// Returns the message of a revision or release,
361    /// or the URL of an origin
362    ///
363    /// Returns `Err` if the node id is unknown, and `Ok(None)` if the node has
364    /// no message.
365    #[inline]
366    pub fn try_message(
367        &self,
368        node_id: NodeId,
369    ) -> PropertiesResult<'_, Result<Option<Vec<u8>>, OutOfBoundError>, STRINGS> {
370        let base64 = base64_simd::STANDARD;
371        STRINGS::map_if_available(self.try_message_base64(node_id), |message_opt_res| {
372            message_opt_res.map(|message_opt| {
373                message_opt.map(|message| {
374                    base64
375                        .decode_to_vec(message)
376                        .unwrap_or_else(|e| panic!("Could not decode node message: {e}"))
377                })
378            })
379        })
380    }
381
382    /// Returns the tag name of a release, base64-encoded
383    ///
384    /// # Panics
385    ///
386    /// If the node id does not exist
387    #[inline]
388    pub fn tag_name_base64(&self, node_id: NodeId) -> PropertiesResult<'_, Option<&[u8]>, STRINGS> {
389        STRINGS::map_if_available(self.try_tag_name_base64(node_id), |tag_name| {
390            tag_name.unwrap_or_else(|e| panic!("Cannot get node tag: {e}"))
391        })
392    }
393
394    /// Returns the tag name of a release, base64-encoded
395    ///
396    /// Returns `Err` if the node id is unknown, and `Ok(None)` if the node has
397    /// no tag name.
398    #[inline]
399    pub fn try_tag_name_base64(
400        &self,
401        node_id: NodeId,
402    ) -> PropertiesResult<'_, Result<Option<&[u8]>, OutOfBoundError>, STRINGS> {
403        STRINGS::map_if_available(
404            STRINGS::zip_if_available(
405                self.strings.tag_name(),
406                self.strings.tag_name_offset(node_id),
407            ),
408            |(tag_names, tag_name_offset)| {
409                self.message_or_tag_name_base64("tag_name", tag_names, tag_name_offset, node_id)
410            },
411        )
412    }
413
414    /// Returns the tag name of a release
415    ///
416    /// # Panics
417    ///
418    /// If the node id does not exist
419    #[inline]
420    pub fn tag_name(&self, node_id: NodeId) -> PropertiesResult<'_, Option<Vec<u8>>, STRINGS> {
421        STRINGS::map_if_available(self.try_tag_name(node_id), |tag_name| {
422            tag_name.unwrap_or_else(|e| panic!("Cannot get node tag name: {e}"))
423        })
424    }
425
426    /// Returns the tag name of a release
427    ///
428    /// Returns `Err` if the node id is unknown, and `Ok(None)` if the node has
429    /// no tag name.
430    #[inline]
431    pub fn try_tag_name(
432        &self,
433        node_id: NodeId,
434    ) -> PropertiesResult<'_, Result<Option<Vec<u8>>, OutOfBoundError>, STRINGS> {
435        let base64 = base64_simd::STANDARD;
436        STRINGS::map_if_available(self.try_tag_name_base64(node_id), |tag_name_opt_res| {
437            tag_name_opt_res.map(|tag_name_opt| {
438                tag_name_opt.map(|tag_name| {
439                    base64.decode_to_vec(tag_name).unwrap_or_else(|_| {
440                        panic!("Could not decode tag_name of node {node_id}: {tag_name:?}")
441                    })
442                })
443            })
444        })
445    }
446}