Skip to main content

swh_graph/
swhtype.rs

1// Copyright (C) 2023-2026  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::str::FromStr;
7
/// Object type of an SWHID
///
/// Every variant has an explicit discriminant, represented as a `u8`.
///
/// # Reference
/// - <https://docs.softwareheritage.org/devel/swh-model/data-model.html>
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NodeType {
    /// A content object, such as the file contents that directory entries
    /// usually point to.
    Content = 0,
    /// A list of named directory entries, each of which points to other
    /// artifacts, usually file contents or sub-directories. Directory entries
    /// are also associated with some metadata stored as permission bits.
    Directory = 1,
    /// Code “hosting places” as previously described are usually large
    /// platforms that host several unrelated software projects. For software
    /// provenance purposes it is important to be more specific than that.
    ///
    /// Software origins are fine grained references to where source code
    /// artifacts archived by Software Heritage have been retrieved from. They
    /// take the form of `(type, url)` pairs, where `url` is a canonical URL
    /// (e.g., the address at which one can `git clone` a repository or download
    /// a source tarball) and `type` the kind of software origin (e.g., git,
    /// svn, or dsc for Debian source packages).
    Origin = 2,
    /// AKA “tags”
    ///
    /// Some revisions are more equal than others and get selected by
    /// developers as denoting important project milestones known as “releases”.
    /// Each release points to the last commit in project history corresponding
    /// to the release and carries metadata: release name and version, release
    /// message, cryptographic signatures, etc.
    Release = 3,
    /// AKA commits
    ///
    /// Software development within a specific project is
    /// essentially a time-indexed series of copies of a single “root” directory
    /// that contains the entire project source code. Software evolves when a
    /// developer modifies the content of one or more files in that directory
    /// and records their changes.
    ///
    /// Each recorded copy of the root directory is known as a “revision”. It
    /// points to a fully-determined directory and is equipped with arbitrary
    /// metadata. Some of those are added manually by the developer
    /// (e.g., commit message), others are automatically synthesized
    /// (timestamps, preceding commit(s), etc).
    Revision = 4,
    /// Any kind of software origin offers multiple pointers to the “current”
    /// state of a development project. In the case of VCS this is reflected by
    /// branches (e.g., master, development, but also so called feature branches
    /// dedicated to extending the software in a specific direction); in the
    /// case of package distributions by notions such as suites that correspond
    /// to different maturity levels of individual packages (e.g., stable,
    /// development, etc.).
    ///
    /// A “snapshot” of a given software origin records all entry points found
    /// there and where each of them was pointing at the time. For example, a
    /// snapshot object might track the commit where the master branch was
    /// pointing to at any given time, as well as the most recent release of a
    /// given package in the stable suite of a FOSS distribution.
    Snapshot = 5,
}
68
69impl<'a> TryFrom<&'a [u8]> for NodeType {
70    type Error = &'a [u8];
71    fn try_from(value: &'a [u8]) -> Result<Self, Self::Error> {
72        Ok(match value {
73            b"cnt" => Self::Content,
74            b"dir" => Self::Directory,
75            b"ori" => Self::Origin,
76            b"rel" => Self::Release,
77            b"rev" => Self::Revision,
78            b"snp" => Self::Snapshot,
79            _ => return Err(value),
80        })
81    }
82}
83
84impl FromStr for NodeType {
85    type Err = String;
86
87    /// # Examples
88    ///
89    /// ```
90    /// # use swh_graph::NodeType;
91    ///
92    /// assert_eq!("dir".parse::<NodeType>(), Ok(NodeType::Directory));
93    /// assert!(matches!("xyz".parse::<NodeType>(), Err(_)));
94    /// ```
95    fn from_str(s: &str) -> Result<Self, Self::Err> {
96        Ok(match s {
97            "cnt" => Self::Content,
98            "dir" => Self::Directory,
99            "ori" => Self::Origin,
100            "rel" => Self::Release,
101            "rev" => Self::Revision,
102            "snp" => Self::Snapshot,
103            _ => return Err(s.to_owned()),
104        })
105    }
106}
107
108impl TryFrom<u8> for NodeType {
109    type Error = u8;
110    fn try_from(value: u8) -> Result<Self, Self::Error> {
111        Ok(match value {
112            0 => Self::Content,
113            1 => Self::Directory,
114            2 => Self::Origin,
115            3 => Self::Release,
116            4 => Self::Revision,
117            5 => Self::Snapshot,
118            _ => return Err(value),
119        })
120    }
121}
122
123impl NodeType {
124    /// Get the number of possible types.
125    ///
126    /// To avoid having to update this when adding a new type
127    /// we can use the unstable function `std::mem::variant_count`
128    /// or the `variant_count` crate.
129    /// But for now we just hardcode it while we decide how to
130    /// deal with this.
131    pub const NUMBER_OF_TYPES: usize = 6;
132
133    /// The number of bits needed to store the node type as integers
134    /// This is `ceil(log2(NUMBER_OF_TYPES))`  which can be arithmetized into
135    /// `floor(log2(NUMBER_OF_TYPES))` plus one if it's not a power of two.
136    pub const BITWIDTH: usize = Self::NUMBER_OF_TYPES.ilog2() as usize
137        + (!Self::NUMBER_OF_TYPES.is_power_of_two()) as usize;
138
139    /// Convert a type to the str used in the SWHID
140    #[inline(always)]
141    pub fn to_str(&self) -> &'static str {
142        match self {
143            Self::Content => "cnt",
144            Self::Directory => "dir",
145            Self::Origin => "ori",
146            Self::Release => "rel",
147            Self::Revision => "rev",
148            Self::Snapshot => "snp",
149        }
150    }
151
152    /// Convert a type to its enum discriminant value.
153    ///
154    /// In all cases using this method is both safer and more concise than
155    /// `(node_type as isize).try_into().unwrap()`.
156    #[inline(always)]
157    pub fn to_u8(&self) -> u8 {
158        match self {
159            Self::Content => 0,
160            Self::Directory => 1,
161            Self::Origin => 2,
162            Self::Release => 3,
163            Self::Revision => 4,
164            Self::Snapshot => 5,
165        }
166    }
167
168    /// Returns a vector containing all possible `NodeType` values.
169    // TODO make this return an HashSet instead, as the order does not matter
170    pub fn all() -> Vec<Self> {
171        vec![
172            NodeType::Content,
173            NodeType::Directory,
174            NodeType::Origin,
175            NodeType::Release,
176            NodeType::Revision,
177            NodeType::Snapshot,
178        ]
179    }
180}
181
impl core::fmt::Display for NodeType {
    /// Writes the three-letter tag returned by [`NodeType::to_str`]
    /// (e.g. `dir` for [`NodeType::Directory`]).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.to_str())
    }
}
187
/// Compact representation of a set of [`NodeType`]-s, as a bit array.
///
/// Counting from the least significant bit, bit `i` is set when the node type
/// whose discriminant is `i` belongs to the set.
type NodeTypeSet = u64;

/// Constraint on allowed node types, as a set of node types.
///
/// The wrapped value is the bit array of allowed types: a node type is
/// allowed when the bit at the position of its discriminant is set.
#[derive(Debug, PartialEq, Clone, Copy)]
pub struct NodeConstraint(pub NodeTypeSet);
194
195impl Default for NodeConstraint {
196    fn default() -> Self {
197        Self(0b111111)
198    }
199}
200
201impl NodeConstraint {
202    /// Builds a `NodeConstraint` that only allows the given types
203    ///
204    /// # Example
205    ///
206    /// ```
207    /// # use std::collections::HashSet;
208    /// # use swh_graph::{NodeConstraint, NodeType};
209    ///
210    /// let only_revrels = NodeConstraint::from_types([NodeType::Revision, NodeType::Release]);
211    ///
212    /// assert!(!only_revrels.matches(NodeType::Directory));
213    /// assert!(!only_revrels.matches(NodeType::Content));
214    /// assert!(only_revrels.matches(NodeType::Release));
215    /// assert!(only_revrels.matches(NodeType::Revision));
216    /// assert!(!only_revrels.matches(NodeType::Origin));
217    /// ```
218    pub fn from_types(node_types: impl IntoIterator<Item = NodeType>) -> Self {
219        let mut bits = 0;
220        for node_type in node_types.into_iter() {
221            bits |= 1 << node_type.to_u8();
222        }
223        Self(bits)
224    }
225
226    /// # Examples
227    ///
228    /// ```
229    /// # use std::collections::HashSet;
230    /// # use swh_graph::{NodeConstraint, NodeType};
231    ///
232    /// let only_dirs: NodeConstraint = "dir".parse().unwrap();
233    /// let history_nodes: NodeConstraint = "rel,rev".parse().unwrap();
234    /// let all_nodes: NodeConstraint = "*".parse().unwrap();
235    ///
236    /// assert!(only_dirs.matches(NodeType::Directory));
237    /// assert!(!only_dirs.matches(NodeType::Content));
238    /// assert!(history_nodes.matches(NodeType::Release));
239    /// assert!(history_nodes.matches(NodeType::Revision));
240    /// assert!(!history_nodes.matches(NodeType::Origin));
241    /// for node_type in NodeType::all() {
242    ///     assert!(all_nodes.matches(node_type));
243    /// }
244    /// ```
245    #[inline(always)]
246    pub fn matches(&self, node_type: NodeType) -> bool {
247        self.0 & (1 << node_type.to_u8()) != 0
248    }
249
250    pub fn to_vec(&self) -> Vec<NodeType> {
251        (0..NodeType::NUMBER_OF_TYPES as u8)
252            .filter(|type_idx| self.0 & (1 << type_idx) != 0)
253            .map(|type_idx| type_idx.try_into().unwrap())
254            .collect()
255    }
256}
257
258impl FromStr for NodeConstraint {
259    type Err = String;
260
261    /// # Examples
262    ///
263    /// ```
264    /// # use std::collections::HashSet;
265    /// # use swh_graph::{NodeConstraint, NodeType};
266    ///
267    /// assert_eq!("*".parse::<NodeConstraint>(), Ok(NodeConstraint(0b111111)));
268    /// assert_eq!("rel".parse::<NodeConstraint>(), Ok(NodeConstraint(0b001000)));
269    /// assert_eq!("dir,cnt".parse::<NodeConstraint>(), Ok(NodeConstraint(0b000011)));
270    /// assert!(matches!("xyz".parse::<NodeConstraint>(), Err(_)));
271    /// ```
272    fn from_str(s: &str) -> Result<Self, Self::Err> {
273        if s == "*" {
274            Ok(NodeConstraint::default())
275        } else {
276            Ok(Self::from_types(
277                s.split(',')
278                    .map(|s| s.parse::<NodeType>())
279                    .collect::<Result<Vec<_>, _>>()?,
280            ))
281        }
282    }
283}
284
285impl core::fmt::Display for NodeConstraint {
286    /// ```
287    /// # use std::collections::HashSet;
288    /// # use swh_graph::{NodeConstraint, NodeType};
289    ///
290    /// assert_eq!(format!("{}", NodeConstraint::default()), "*");
291    /// assert_eq!(
292    ///     format!("{}", NodeConstraint(0b000011)),
293    ///     "cnt,dir"
294    /// );
295    /// assert_eq!(
296    ///     format!("{}", NodeConstraint(0b111100)),
297    ///     "ori,rel,rev,snp"
298    /// );
299    /// ```
300    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
301        if *self == Self::default() {
302            write!(f, "*")?;
303        } else {
304            let mut type_strings: Vec<&str> = self.to_vec().iter().map(|t| t.to_str()).collect();
305            type_strings.sort();
306            write!(f, "{}", type_strings.join(","))?;
307        }
308        Ok(())
309    }
310}
311
/// Serializes a constraint as a string, using the text written by its
/// `Display` implementation (`*`, or a comma-separated list of tags).
///
/// Only available when the `serde` feature is enabled.
#[cfg(feature = "serde")]
impl serde::Serialize for NodeConstraint {
    fn serialize<S: serde::Serializer>(
        &self,
        serializer: S,
    ) -> std::result::Result<S::Ok, S::Error> {
        // `collect_str` serializes what `Display` writes for `self` as a
        // string.
        serializer.collect_str(self)
    }
}
321
/// Deserializes a constraint from a string, parsed by the `FromStr`
/// implementation of `NodeConstraint`.
///
/// Only available when the `serde` feature is enabled.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for NodeConstraint {
    fn deserialize<D: serde::Deserializer<'de>>(
        deserializer: D,
    ) -> std::result::Result<Self, D::Error> {
        // Ask the deserializer for a string; `NodeConstraintVisitor` does the
        // actual parsing.
        deserializer.deserialize_str(NodeConstraintVisitor)
    }
}
330
/// Serde visitor that builds a [`NodeConstraint`] from a string.
#[cfg(feature = "serde")]
struct NodeConstraintVisitor;

#[cfg(feature = "serde")]
impl serde::de::Visitor<'_> for NodeConstraintVisitor {
    type Value = NodeConstraint;

    /// Describes the expected input, for use in serde error messages.
    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        formatter.write_str("a node type constraint")
    }

    /// Parses `value` with the `FromStr` implementation of
    /// [`NodeConstraint`], turning a parse failure into a custom
    /// deserialization error carrying the `FromStr` error string.
    fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        value.parse().map_err(E::custom)
    }
}
349
/// Type of an arc between two nodes in the Software Heritage graph, as a pair
/// of type constraints on the source and destination nodes of the arc. When
/// one of the two is `None`, it means "any node type accepted".
// TODO remove Options from ArcType and create a (more expressive, similar to
// NodeConstraint) type called ArcConstraint
pub struct ArcType {
    /// Type required of the source node of the arc; `None` accepts any type.
    pub src: Option<NodeType>,
    /// Type required of the destination node of the arc; `None` accepts any
    /// type.
    pub dst: Option<NodeType>,
}
358}