swh_graph/
swhtype.rs

1// Copyright (C) 2023-2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::str::FromStr;
7
8use bitvec::prelude::*;
9
/// Object type of an SWHID.
///
/// Every variant has an explicit `u8` discriminant, so a node type can be
/// stored as a small integer.
///
/// # Reference
/// - <https://docs.softwareheritage.org/devel/swh-model/data-model.html>
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum NodeType {
    /// A content object (`cnt` in SWHIDs), such as the file contents that
    /// directory entries point to.
    Content = 0,
    /// A list of named directory entries, each pointing to another artifact
    /// (usually a file content or a sub-directory). Entries also carry some
    /// metadata, stored as permission bits.
    Directory = 1,
    /// Code "hosting places" are usually large platforms hosting several
    /// unrelated software projects; for software provenance purposes it is
    /// important to be more specific than that.
    ///
    /// Software origins are fine-grained references to where the source code
    /// artifacts archived by Software Heritage were retrieved from. They take
    /// the form of `(type, url)` pairs, where `url` is a canonical URL (e.g.,
    /// the address at which one can `git clone` a repository or download a
    /// source tarball) and `type` is the kind of software origin (e.g., git,
    /// svn, or dsc for Debian source packages).
    Origin = 2,
    /// AKA "tags".
    ///
    /// Some revisions are more equal than others: developers select them to
    /// denote important project milestones known as "releases". Each release
    /// points to the last commit in the project history corresponding to the
    /// release and carries metadata: release name and version, release
    /// message, cryptographic signatures, etc.
    Release = 3,
    /// AKA "commits".
    ///
    /// Software development within a specific project is essentially a
    /// time-indexed series of copies of a single "root" directory containing
    /// the entire project source code. Software evolves when a developer
    /// modifies the content of one or more files in that directory and
    /// records the changes.
    ///
    /// Each recorded copy of the root directory is known as a "revision". It
    /// points to a fully-determined directory and is equipped with arbitrary
    /// metadata. Some of it is added manually by the developer (e.g., the
    /// commit message), the rest is synthesized automatically (timestamps,
    /// preceding commit(s), etc.).
    Revision = 4,
    /// Any kind of software origin offers multiple pointers to the "current"
    /// state of a development project. For a VCS these are branches (e.g.,
    /// master, development, but also so-called feature branches dedicated to
    /// extending the software in a specific direction); for package
    /// distributions they are notions such as suites, which correspond to
    /// different maturity levels of individual packages (e.g., stable,
    /// development, etc.).
    ///
    /// A "snapshot" of a given software origin records all entry points found
    /// there and where each of them was pointing at the time. For example, a
    /// snapshot object might track the commit the master branch was pointing
    /// to at a given time, as well as the most recent release of a given
    /// package in the stable suite of a FOSS distribution.
    Snapshot = 5,
}
70
71impl<'a> TryFrom<&'a [u8]> for NodeType {
72    type Error = &'a [u8];
73    fn try_from(value: &'a [u8]) -> Result<Self, Self::Error> {
74        Ok(match value {
75            b"cnt" => Self::Content,
76            b"dir" => Self::Directory,
77            b"ori" => Self::Origin,
78            b"rel" => Self::Release,
79            b"rev" => Self::Revision,
80            b"snp" => Self::Snapshot,
81            _ => return Err(value),
82        })
83    }
84}
85
86impl FromStr for NodeType {
87    type Err = String;
88
89    /// # Examples
90    ///
91    /// ```
92    /// # use swh_graph::NodeType;
93    ///
94    /// assert_eq!("dir".parse::<NodeType>(), Ok(NodeType::Directory));
95    /// assert!(matches!("xyz".parse::<NodeType>(), Err(_)));
96    /// ```
97    fn from_str(s: &str) -> Result<Self, Self::Err> {
98        Ok(match s {
99            "cnt" => Self::Content,
100            "dir" => Self::Directory,
101            "ori" => Self::Origin,
102            "rel" => Self::Release,
103            "rev" => Self::Revision,
104            "snp" => Self::Snapshot,
105            _ => return Err(s.to_owned()),
106        })
107    }
108}
109
110impl TryFrom<u8> for NodeType {
111    type Error = u8;
112    fn try_from(value: u8) -> Result<Self, Self::Error> {
113        Ok(match value {
114            0 => Self::Content,
115            1 => Self::Directory,
116            2 => Self::Origin,
117            3 => Self::Release,
118            4 => Self::Revision,
119            5 => Self::Snapshot,
120            _ => return Err(value),
121        })
122    }
123}
124
125impl NodeType {
126    /// Get the number of possible types.
127    ///
128    /// To avoid having to update this when adding a new type
129    /// we can use the unstable function `std::mem::variant_count`
130    /// or the `variant_count` crate.
131    /// But for now we just hardcode it while we decide how to
132    /// deal with this.
133    pub const NUMBER_OF_TYPES: usize = 6;
134
135    /// The number of bits needed to store the node type as integers
136    /// This is `ceil(log2(NUMBER_OF_TYPES))`  which can be arithmetized into
137    /// `floor(log2(NUMBER_OF_TYPES))` plus one if it's not a power of two.
138    pub const BITWIDTH: usize = Self::NUMBER_OF_TYPES.ilog2() as usize
139        + (!Self::NUMBER_OF_TYPES.is_power_of_two()) as usize;
140
141    /// Convert a type to the str used in the SWHID
142    pub fn to_str(&self) -> &'static str {
143        match self {
144            Self::Content => "cnt",
145            Self::Directory => "dir",
146            Self::Origin => "ori",
147            Self::Release => "rel",
148            Self::Revision => "rev",
149            Self::Snapshot => "snp",
150        }
151    }
152
153    /// Convert a type to its enum discriminant value.
154    ///
155    /// In all cases using this method is both safer and more concise than
156    /// `(node_type as isize).try_into().unwrap()`.
157    pub fn to_u8(&self) -> u8 {
158        match self {
159            Self::Content => 0,
160            Self::Directory => 1,
161            Self::Origin => 2,
162            Self::Release => 3,
163            Self::Revision => 4,
164            Self::Snapshot => 5,
165        }
166    }
167
168    /// Returns a vector containing all possible `NodeType` values.
169    // TODO make this return an HashSet instead, as the order does not matter
170    pub fn all() -> Vec<Self> {
171        vec![
172            NodeType::Content,
173            NodeType::Directory,
174            NodeType::Origin,
175            NodeType::Release,
176            NodeType::Revision,
177            NodeType::Snapshot,
178        ]
179    }
180}
181
182impl core::fmt::Display for NodeType {
183    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
184        f.write_str(self.to_str())
185    }
186}
187
/// Compact representation of a set of [NodeType]-s, as a bit array.
///
/// The bit at index `i` stands for the [NodeType] whose discriminant is `i`.
/// Storage is `u8`-based, so the array holds more bits than there are node
/// types; the extra trailing bits do not correspond to any type.
type NodeTypeSet = BitArr!(for NodeType::NUMBER_OF_TYPES, in u8, Msb0);

/// Constraint on allowed node types, as a set of node types.
///
/// A node type is accepted when its bit is set in the wrapped bit array.
/// Equality is derived, so it compares the wrapped bit arrays directly.
#[derive(Debug, PartialEq)]
pub struct NodeConstraint(pub NodeTypeSet);
194
195impl NodeConstraint {
196    /// # Examples
197    ///
198    /// ```
199    /// # use std::collections::HashSet;
200    /// # use swh_graph::{NodeConstraint, NodeType};
201    ///
202    /// let only_dirs: NodeConstraint = "dir".parse().unwrap();
203    /// let history_nodes: NodeConstraint = "rel,rev".parse().unwrap();
204    /// let all_nodes: NodeConstraint = "*".parse().unwrap();
205    ///
206    /// assert!(only_dirs.matches(NodeType::Directory));
207    /// assert!(!only_dirs.matches(NodeType::Content));
208    /// assert!(history_nodes.matches(NodeType::Release));
209    /// assert!(history_nodes.matches(NodeType::Revision));
210    /// assert!(!history_nodes.matches(NodeType::Origin));
211    /// for node_type in NodeType::all() {
212    ///     assert!(all_nodes.matches(node_type));
213    /// }
214    /// ```
215    pub fn matches(&self, node_type: NodeType) -> bool {
216        *self.0.get(node_type.to_u8() as usize).unwrap()
217    }
218
219    pub fn to_vec(&self) -> Vec<NodeType> {
220        self.0
221            .iter_ones() // Note: this iterates on all bits of the u8
222            .filter_map(|type_idx| (type_idx as u8).try_into().ok())
223            .collect()
224    }
225}
226
227impl Default for NodeConstraint {
228    fn default() -> Self {
229        Self(bitarr!(u8, Msb0; 1; NodeType::NUMBER_OF_TYPES))
230    }
231}
232
233impl FromStr for NodeConstraint {
234    type Err = String;
235
236    /// # Examples
237    ///
238    /// ```
239    /// # use std::collections::HashSet;
240    /// # use bitvec::prelude::*;
241    /// # use swh_graph::{NodeConstraint, NodeType};
242    ///
243    /// assert_eq!("*".parse::<NodeConstraint>(), Ok(NodeConstraint(bitarr!(u8, Msb0; 1; 6))));
244    /// assert_eq!("rel".parse::<NodeConstraint>(), Ok(NodeConstraint(bitarr!(u8, Msb0; 0, 0, 0, 1, 0, 0))));
245    /// assert_eq!("dir,cnt".parse::<NodeConstraint>(), Ok(NodeConstraint(bitarr!(u8, Msb0; 1, 1, 0, 0, 0, 0))));
246    /// assert!(matches!("xyz".parse::<NodeConstraint>(), Err(_)));
247    /// ```
248    fn from_str(s: &str) -> Result<Self, Self::Err> {
249        if s == "*" {
250            Ok(NodeConstraint::default())
251        } else {
252            let mut node_types = bitarr!(u8, Msb0; 0; NodeType::NUMBER_OF_TYPES);
253            for s in s.split(',') {
254                node_types.set(s.parse::<NodeType>()?.to_u8() as usize, true);
255            }
256            Ok(NodeConstraint(node_types))
257        }
258    }
259}
260
261impl core::fmt::Display for NodeConstraint {
262    /// ```
263    /// # use std::collections::HashSet;
264    /// # use bitvec::prelude::*;
265    /// # use swh_graph::{NodeConstraint, NodeType};
266    ///
267    /// assert_eq!(format!("{}", NodeConstraint::default()), "*");
268    /// assert_eq!(
269    ///     format!("{}", NodeConstraint(bitarr!(u8, Msb0; 1, 1, 0, 0, 0, 0))),
270    ///     "cnt,dir"
271    /// );
272    /// assert_eq!(
273    ///     format!("{}", NodeConstraint(bitarr!(u8, Msb0; 0, 0, 1, 1, 1, 1))),
274    ///     "ori,rel,rev,snp"
275    /// );
276    /// ```
277    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
278        if self.0.all() {
279            write!(f, "*")?;
280        } else {
281            let mut type_strings: Vec<&str> = self.to_vec().iter().map(|t| t.to_str()).collect();
282            type_strings.sort();
283            write!(f, "{}", type_strings.join(","))?;
284        }
285        Ok(())
286    }
287}
288
/// Type of an arc between two nodes in the Software Heritage graph, as a pair
/// of type constraints on the source and destination nodes of the arc. When
/// one of the two is `None`, it means "any node type accepted".
// TODO remove Options from ArcType and create a (more expressive, similar to
// NodeConstraint) type called ArcConstraint
pub struct ArcType {
    /// Node type required for the source of the arc; `None` accepts any type.
    pub src: Option<NodeType>,
    /// Node type required for the destination of the arc; `None` accepts any
    /// type.
    pub dst: Option<NodeType>,
}