swh_graph/properties/
strings.rs

1// Copyright (C) 2023-2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use mmap_rs::Mmap;
7
8use super::suffixes::*;
9use super::*;
10use crate::graph::NodeId;
11
12/// Trait implemented by both [`NoStrings`] and all implementors of [`Strings`],
13/// to allow loading string properties only if needed.
14pub trait MaybeStrings {}
15impl<S: OptStrings> MaybeStrings for S {}
16
/// Placeholder for when string properties are not loaded
///
/// The only trait implemented for it in this module is [`MaybeStrings`], so it
/// exposes none of the accessors declared by [`OptStrings`].
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct NoStrings;
impl MaybeStrings for NoStrings {}
21
22#[diagnostic::on_unimplemented(
23    label = "does not have String properties loaded",
24    note = "Use `let graph = graph.load_properties(|props| props.load_string()).unwrap()` to load them",
25    note = "Or replace `graph.init_properties()` with `graph.load_all_properties::<DynMphf>().unwrap()` to load all properties"
26)]
27/// Trait implemented by all implementors of [`MaybeStrings`] but [`NoStrings`]
28pub trait OptStrings: MaybeStrings + PropertiesBackend {
29    type Offsets<'a>: GetIndex<Output = u64> + 'a
30    where
31        Self: 'a;
32
33    fn message(&self) -> PropertiesResult<&[u8], Self>;
34    fn message_offset(&self) -> PropertiesResult<Self::Offsets<'_>, Self>;
35    fn tag_name(&self) -> PropertiesResult<&[u8], Self>;
36    fn tag_name_offset(&self) -> PropertiesResult<Self::Offsets<'_>, Self>;
37}
38
39#[diagnostic::on_unimplemented(
40    label = "does not have String properties loaded",
41    note = "Use `let graph = graph.load_properties(|props| props.load_string()).unwrap()` to load them",
42    note = "Or replace `graph.init_properties()` with `graph.load_all_properties::<DynMphf>().unwrap()` to load all properties"
43)]
44/// Trait for backend storage of string properties (either in-memory or memory-mapped)
45pub trait Strings: OptStrings<DataFilesAvailability = GuaranteedDataFiles> {}
46impl<S: OptStrings<DataFilesAvailability = GuaranteedDataFiles>> Strings for S {}
47
/// Variant of [`MappedStrings`] that checks at runtime that files are present every time
/// it is accessed
///
/// Each field keeps either the loaded file or the [`UnavailableProperty`] explaining
/// why it could not be loaded (presumably because the file does not exist; see
/// `load_if_exists`).
pub struct OptMappedStrings {
    /// Memory map of the messages file
    message: Result<Mmap, UnavailableProperty>,
    /// Big-endian `u64` offsets into `message`
    message_offset: Result<NumberMmap<BigEndian, u64, Mmap>, UnavailableProperty>,
    /// Memory map of the tag names file
    tag_name: Result<Mmap, UnavailableProperty>,
    /// Big-endian `u64` offsets into `tag_name`
    tag_name_offset: Result<NumberMmap<BigEndian, u64, Mmap>, UnavailableProperty>,
}
56impl PropertiesBackend for OptMappedStrings {
57    type DataFilesAvailability = OptionalDataFiles;
58}
59impl OptStrings for OptMappedStrings {
60    type Offsets<'a>
61        = &'a NumberMmap<BigEndian, u64, Mmap>
62    where
63        Self: 'a;
64
65    #[inline(always)]
66    fn message(&self) -> PropertiesResult<'_, &[u8], Self> {
67        self.message.as_deref()
68    }
69    #[inline(always)]
70    fn message_offset(&self) -> PropertiesResult<'_, Self::Offsets<'_>, Self> {
71        self.message_offset.as_ref()
72    }
73    #[inline(always)]
74    fn tag_name(&self) -> PropertiesResult<'_, &[u8], Self> {
75        self.tag_name.as_deref()
76    }
77    #[inline(always)]
78    fn tag_name_offset(&self) -> PropertiesResult<'_, Self::Offsets<'_>, Self> {
79        self.tag_name_offset.as_ref()
80    }
81}
82
/// [`Strings`] implementation backed by files guaranteed to be present once the graph is loaded
pub struct MappedStrings {
    /// Memory map of the messages file
    message: Mmap,
    /// Big-endian `u64` offsets into `message`
    message_offset: NumberMmap<BigEndian, u64, Mmap>,
    /// Memory map of the tag names file
    tag_name: Mmap,
    /// Big-endian `u64` offsets into `tag_name`
    tag_name_offset: NumberMmap<BigEndian, u64, Mmap>,
}
90impl PropertiesBackend for MappedStrings {
91    type DataFilesAvailability = GuaranteedDataFiles;
92}
93impl OptStrings for MappedStrings {
94    type Offsets<'a>
95        = &'a NumberMmap<BigEndian, u64, Mmap>
96    where
97        Self: 'a;
98
99    #[inline(always)]
100    fn message(&self) -> &[u8] {
101        &self.message
102    }
103    #[inline(always)]
104    fn message_offset(&self) -> Self::Offsets<'_> {
105        &self.message_offset
106    }
107    #[inline(always)]
108    fn tag_name(&self) -> &[u8] {
109        &self.tag_name
110    }
111    #[inline(always)]
112    fn tag_name_offset(&self) -> Self::Offsets<'_> {
113        &self.tag_name_offset
114    }
115}
116
/// In-memory storage of string properties, built with [`VecStrings::new`]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct VecStrings {
    /// Base64-encoded messages, each terminated by `\n`
    message: Vec<u8>,
    /// Start of each entry in `message`, or `u64::MAX` when there is none
    message_offset: Vec<u64>,
    /// Base64-encoded tag names, each terminated by `\n`
    tag_name: Vec<u8>,
    /// Start of each entry in `tag_name`, or `u64::MAX` when there is none
    tag_name_offset: Vec<u64>,
}
124
125impl VecStrings {
126    /// Returns [`VecStrings`] from pairs of `(message, tag_name)`
127    pub fn new<Msg: AsRef<[u8]>, TagName: AsRef<[u8]>>(
128        data: Vec<(Option<Msg>, Option<TagName>)>,
129    ) -> Result<Self> {
130        let base64 = base64_simd::STANDARD;
131
132        let mut message = Vec::new();
133        let mut message_offset = Vec::new();
134        let mut tag_name = Vec::new();
135        let mut tag_name_offset = Vec::new();
136
137        for (msg, tag) in data.into_iter() {
138            match msg {
139                Some(msg) => {
140                    let msg = base64.encode_to_string(msg);
141                    message_offset.push(
142                        message
143                            .len()
144                            .try_into()
145                            .context("total message size overflowed usize")?,
146                    );
147                    message.extend(msg.as_bytes());
148                    message.push(b'\n');
149                }
150                None => message_offset.push(u64::MAX),
151            }
152            match tag {
153                Some(tag) => {
154                    let tag = base64.encode_to_string(tag);
155                    tag_name_offset.push(
156                        tag_name
157                            .len()
158                            .try_into()
159                            .context("total tag_name size overflowed usize")?,
160                    );
161                    tag_name.extend(tag.as_bytes());
162                    tag_name.push(b'\n');
163                }
164                None => tag_name_offset.push(u64::MAX),
165            }
166        }
167
168        Ok(VecStrings {
169            message,
170            message_offset,
171            tag_name,
172            tag_name_offset,
173        })
174    }
175}
176
177impl PropertiesBackend for VecStrings {
178    type DataFilesAvailability = GuaranteedDataFiles;
179}
180impl OptStrings for VecStrings {
181    type Offsets<'a>
182        = &'a [u64]
183    where
184        Self: 'a;
185
186    #[inline(always)]
187    fn message(&self) -> &[u8] {
188        self.message.as_slice()
189    }
190    #[inline(always)]
191    fn message_offset(&self) -> Self::Offsets<'_> {
192        self.message_offset.as_slice()
193    }
194    #[inline(always)]
195    fn tag_name(&self) -> &[u8] {
196        self.tag_name.as_slice()
197    }
198    #[inline(always)]
199    fn tag_name_offset(&self) -> Self::Offsets<'_> {
200        self.tag_name_offset.as_slice()
201    }
202}
203
204impl<
205        MAPS: MaybeMaps,
206        TIMESTAMPS: MaybeTimestamps,
207        PERSONS: MaybePersons,
208        CONTENTS: MaybeContents,
209        LABELNAMES: MaybeLabelNames,
210    > SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, NoStrings, LABELNAMES>
211{
212    /// Consumes a [`SwhGraphProperties`] and returns a new one with these methods
213    /// available:
214    ///
215    /// * [`SwhGraphProperties::message_base64`]
216    /// * [`SwhGraphProperties::message`]
217    /// * [`SwhGraphProperties::tag_name_base64`]
218    /// * [`SwhGraphProperties::tag_name`]
219    pub fn load_strings(
220        self,
221    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, MappedStrings, LABELNAMES>>
222    {
223        let OptMappedStrings {
224            message,
225            message_offset,
226            tag_name,
227            tag_name_offset,
228        } = self.get_strings()?;
229        let strings = MappedStrings {
230            message: message?,
231            message_offset: message_offset?,
232            tag_name: tag_name?,
233            tag_name_offset: tag_name_offset?,
234        };
235        self.with_strings(strings)
236    }
237    /// Equivalent to [`Self::load_strings`] that does not require all files to be present
238    pub fn opt_load_strings(
239        self,
240    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, OptMappedStrings, LABELNAMES>>
241    {
242        let strings = self.get_strings()?;
243        self.with_strings(strings)
244    }
245
246    fn get_strings(&self) -> Result<OptMappedStrings> {
247        Ok(OptMappedStrings {
248            message: load_if_exists(&self.path, MESSAGE, |path| mmap(path))
249                .context("Could not load message")?,
250            message_offset: load_if_exists(&self.path, MESSAGE_OFFSET, |path| {
251                NumberMmap::new(path, self.num_nodes)
252            })
253            .context("Could not load message_offset")?,
254            tag_name: load_if_exists(&self.path, TAG_NAME, |path| mmap(path))
255                .context("Could not load tag_name")?,
256            tag_name_offset: load_if_exists(&self.path, TAG_NAME_OFFSET, |path| {
257                NumberMmap::new(path, self.num_nodes)
258            })
259            .context("Could not load tag_name_offset")?,
260        })
261    }
262
263    /// Alternative to [`load_strings`](Self::load_strings) that allows using arbitrary
264    /// strings implementations
265    pub fn with_strings<STRINGS: MaybeStrings>(
266        self,
267        strings: STRINGS,
268    ) -> Result<SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>> {
269        Ok(SwhGraphProperties {
270            maps: self.maps,
271            timestamps: self.timestamps,
272            persons: self.persons,
273            contents: self.contents,
274            strings,
275            label_names: self.label_names,
276            path: self.path,
277            num_nodes: self.num_nodes,
278            label_names_are_in_base64_order: self.label_names_are_in_base64_order,
279        })
280    }
281}
282
283/// Functions to access message of `revision`/`release` nodes, and names of `release` nodes
284///
285/// Only available after calling [`load_strings`](SwhGraphProperties::load_strings)
286/// or [`load_all_properties`](crate::graph::SwhBidirectionalGraph::load_all_properties)
287impl<
288        MAPS: MaybeMaps,
289        TIMESTAMPS: MaybeTimestamps,
290        PERSONS: MaybePersons,
291        CONTENTS: MaybeContents,
292        STRINGS: OptStrings,
293        LABELNAMES: MaybeLabelNames,
294    > SwhGraphProperties<MAPS, TIMESTAMPS, PERSONS, CONTENTS, STRINGS, LABELNAMES>
295{
296    #[inline(always)]
297    fn message_or_tag_name_base64<'a>(
298        what: &'static str,
299        data: &'a [u8],
300        offsets: impl GetIndex<Output = u64>,
301        node_id: NodeId,
302    ) -> Result<Option<&'a [u8]>, OutOfBoundError> {
303        match offsets.get(node_id) {
304            None => Err(OutOfBoundError {
305                // Unknown node
306                index: node_id,
307                len: offsets.len(),
308            }),
309            Some(u64::MAX) => Ok(None), // No message
310            Some(offset) => {
311                let offset = offset as usize;
312                let slice: &[u8] = data.get(offset..).unwrap_or_else(|| {
313                    panic!("Missing {what} for node {node_id} at offset {offset}")
314                });
315                Ok(slice
316                    .iter()
317                    .position(|&c| c == b'\n')
318                    .map(|end| &slice[..end]))
319            }
320        }
321    }
322
323    /// Returns the base64-encoded message of a revision or release,
324    /// or the base64-encoded URL of an origin
325    ///
326    /// # Panics
327    ///
328    /// If the node id does not exist
329    #[inline]
330    pub fn message_base64(&self, node_id: NodeId) -> PropertiesResult<Option<&[u8]>, STRINGS> {
331        STRINGS::map_if_available(
332            self.try_message_base64(node_id),
333            |message: Result<_, OutOfBoundError>| {
334                message.unwrap_or_else(|e| panic!("Cannot get node message: {e}"))
335            },
336        )
337    }
338
339    /// Returns the base64-encoded message of a revision or release
340    ///
341    /// Returns `Err` if the node id is unknown, and `Ok(None)` if the node has
342    /// no message.
343    #[inline]
344    pub fn try_message_base64(
345        &self,
346        node_id: NodeId,
347    ) -> PropertiesResult<Result<Option<&[u8]>, OutOfBoundError>, STRINGS> {
348        STRINGS::map_if_available(
349            STRINGS::zip_if_available(self.strings.message(), self.strings.message_offset()),
350            |(messages, message_offsets)| {
351                Self::message_or_tag_name_base64("message", messages, message_offsets, node_id)
352            },
353        )
354    }
355    /// Returns the message of a revision or release,
356    /// or the URL of an origin
357    ///
358    /// # Panics
359    ///
360    /// If the node id does not exist
361    #[inline]
362    pub fn message(&self, node_id: NodeId) -> PropertiesResult<Option<Vec<u8>>, STRINGS> {
363        STRINGS::map_if_available(self.try_message(node_id), |message| {
364            message.unwrap_or_else(|e| panic!("Cannot get node message: {e}"))
365        })
366    }
367
368    /// Returns the message of a revision or release,
369    /// or the URL of an origin
370    ///
371    /// Returns `Err` if the node id is unknown, and `Ok(None)` if the node has
372    /// no message.
373    #[inline]
374    pub fn try_message(
375        &self,
376        node_id: NodeId,
377    ) -> PropertiesResult<Result<Option<Vec<u8>>, OutOfBoundError>, STRINGS> {
378        let base64 = base64_simd::STANDARD;
379        STRINGS::map_if_available(self.try_message_base64(node_id), |message_opt_res| {
380            message_opt_res.map(|message_opt| {
381                message_opt.map(|message| {
382                    base64
383                        .decode_to_vec(message)
384                        .unwrap_or_else(|e| panic!("Could not decode node message: {e}"))
385                })
386            })
387        })
388    }
389
390    /// Returns the tag name of a release, base64-encoded
391    ///
392    /// # Panics
393    ///
394    /// If the node id does not exist
395    #[inline]
396    pub fn tag_name_base64(&self, node_id: NodeId) -> PropertiesResult<Option<&[u8]>, STRINGS> {
397        STRINGS::map_if_available(self.try_tag_name_base64(node_id), |tag_name| {
398            tag_name.unwrap_or_else(|e| panic!("Cannot get node tag: {e}"))
399        })
400    }
401
402    /// Returns the tag name of a release, base64-encoded
403    ///
404    /// Returns `Err` if the node id is unknown, and `Ok(None)` if the node has
405    /// no tag name.
406    #[inline]
407    pub fn try_tag_name_base64(
408        &self,
409        node_id: NodeId,
410    ) -> PropertiesResult<Result<Option<&[u8]>, OutOfBoundError>, STRINGS> {
411        STRINGS::map_if_available(
412            STRINGS::zip_if_available(self.strings.tag_name(), self.strings.tag_name_offset()),
413            |(tag_names, tag_name_offsets)| {
414                Self::message_or_tag_name_base64("tag_name", tag_names, tag_name_offsets, node_id)
415            },
416        )
417    }
418
419    /// Returns the tag name of a release
420    ///
421    /// # Panics
422    ///
423    /// If the node id does not exist
424    #[inline]
425    pub fn tag_name(&self, node_id: NodeId) -> PropertiesResult<Option<Vec<u8>>, STRINGS> {
426        STRINGS::map_if_available(self.try_tag_name(node_id), |tag_name| {
427            tag_name.unwrap_or_else(|e| panic!("Cannot get node tag name: {e}"))
428        })
429    }
430
431    /// Returns the tag name of a release
432    ///
433    /// Returns `Err` if the node id is unknown, and `Ok(None)` if the node has
434    /// no tag name.
435    #[inline]
436    pub fn try_tag_name(
437        &self,
438        node_id: NodeId,
439    ) -> PropertiesResult<Result<Option<Vec<u8>>, OutOfBoundError>, STRINGS> {
440        let base64 = base64_simd::STANDARD;
441        STRINGS::map_if_available(self.try_tag_name_base64(node_id), |tag_name_opt_res| {
442            tag_name_opt_res.map(|tag_name_opt| {
443                tag_name_opt.map(|tag_name| {
444                    base64.decode_to_vec(tag_name).unwrap_or_else(|_| {
445                        panic!("Could not decode tag_name of node {node_id}: {tag_name:?}")
446                    })
447                })
448            })
449        })
450    }
451}