swh_graph/
swhid.rs

1// Copyright (C) 2023-2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::str::FromStr;
7
8use rdst::RadixKey;
9use sha1::{Digest, Sha1};
10use thiserror::Error;
11
12use crate::NodeType;
13
/// SoftWare Heritage persistent IDentifiers
///
/// A SWHID consists of two separate parts, a mandatory core identifier that
/// can point to any software artifact (or “object”) available in the Software
/// Heritage archive, and an optional list of qualifiers that allows to specify
/// the context where the object is meant to be seen and point to a subpart of
/// the object itself.
///
/// This struct stores only the core identifier (namespace version, node type
/// and hash); it has no field for qualifiers.
///
/// # Reference
/// - <https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html>
/// - Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. [Identifiers for Digital Objects: the Case of Software Source Code Preservation](https://hal.archives-ouvertes.fr/hal-01865790v4). In Proceedings of iPRES 2018: 15th International Conference on Digital Preservation, Boston, MA, USA, September 2018, 9 pages.
/// - Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. [Referencing Source Code Artifacts: a Separate Concern in Software Citation](https://arxiv.org/abs/2001.08647). In Computing in Science and Engineering, volume 22, issue 2, pages 33-43. ISSN 1521-9615, IEEE. March 2020.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(C)]
pub struct SWHID {
    /// Namespace version (the parsers in this file only accept version 1)
    pub namespace_version: u8,
    /// Node type
    pub node_type: NodeType,
    /// SHA1 hash of the node
    pub hash: [u8; 20],
}
36
37impl SWHID {
38    /// The size of the binary representation of a SWHID
39    pub const BYTES_SIZE: usize = 22;
40
41    /// Loads the SWHID representation for a origin uri
42    /// akin to "swh:1:ori:{}"
43    pub fn from_origin_url(origin: impl AsRef<str>) -> SWHID {
44        let mut hasher = Sha1::new();
45        hasher.update(origin.as_ref());
46
47        SWHID {
48            namespace_version: 1,
49            node_type: NodeType::Origin,
50            hash: hasher.finalize().into(),
51        }
52    }
53}
54
55impl core::fmt::Display for SWHID {
56    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
57        write!(
58            f,
59            "swh:{}:{}:",
60            self.namespace_version,
61            self.node_type.to_str(),
62        )?;
63        for byte in self.hash.iter() {
64            write!(f, "{byte:02x}")?;
65        }
66        Ok(())
67    }
68}
69
/// Error returned when decoding a [`SWHID`] from its binary form
/// (`[u8; SWHID::BYTES_SIZE]`) fails.
#[derive(Error, Debug)]
pub enum BinSWHIDDeserializationError {
    /// The first byte (namespace version) is not 1; carries the byte found.
    #[error("Unsupported SWHID version: {0}")]
    Version(u8),
    /// The second byte could not be converted to a `NodeType`; carries the
    /// `u8` reported by that conversion.
    #[error("Invalid SWHID type: {0}")]
    Type(u8),
}
77
78/// Parse a SWHID from bytes, while the SWHID struct has the exact same layout
79/// and thus it can be read directly from bytes, this function is provided for
80/// completeness and safety because we can check the namespace version is
81/// supported.
82impl TryFrom<[u8; SWHID::BYTES_SIZE]> for SWHID {
83    type Error = BinSWHIDDeserializationError;
84    fn try_from(value: [u8; SWHID::BYTES_SIZE]) -> std::result::Result<Self, Self::Error> {
85        use BinSWHIDDeserializationError::*;
86
87        let namespace_version = value[0];
88        if namespace_version != 1 {
89            return Err(Version(namespace_version));
90        }
91        let node_type = NodeType::try_from(value[1]).map_err(Type)?;
92        let mut hash = [0; 20];
93        hash.copy_from_slice(&value[2..]);
94        Ok(Self {
95            namespace_version,
96            node_type,
97            hash,
98        })
99    }
100}
101
/// Error returned when parsing a [`SWHID`] from its textual form
/// (`swh:1:<type>:<hash>`) fails.
#[derive(Error, Debug, PartialEq, Eq, Hash)]
pub enum StrSWHIDDeserializationError {
    /// A required `:`-separated field is missing; carries a description of
    /// which one.
    #[error("Invalid syntax: {0}")]
    Syntax(&'static str),
    /// The first field is not `swh`; carries the field found.
    #[error("Unsupported SWHID namespace: {0}")]
    Namespace(String),
    /// The second field is not `1`; carries the field found.
    #[error("Unsupported SWHID version: {0}")]
    Version(String),
    /// The hash field does not have the expected length, measured in bytes
    /// of the input string.
    #[error("Expected hash length to be {expected}, got {got}")]
    HashLength { expected: usize, got: usize },
    /// The third field could not be parsed as a `NodeType`; carries the
    /// message of the underlying parse error.
    #[error("Invalid SWHID type: {0}")]
    Type(String),
    /// The hash field could not be decoded as hexadecimal; carries the
    /// offending field.
    #[error("SWHID hash is not hexadecimal: {0}")]
    HashAlphabet(String),
}
117
118/// Parse a SWHID from the string representation
119impl TryFrom<&str> for SWHID {
120    type Error = StrSWHIDDeserializationError;
121    fn try_from(value: &str) -> std::result::Result<Self, Self::Error> {
122        Self::from_str(value)
123    }
124}
125
126impl FromStr for SWHID {
127    type Err = StrSWHIDDeserializationError;
128
129    fn from_str(value: &str) -> Result<Self, Self::Err> {
130        use StrSWHIDDeserializationError::*;
131
132        let mut tokens = value.splitn(4, ':');
133        let Some(namespace) = tokens.next() else {
134            return Err(Syntax("SWHID is empty"));
135        };
136        if namespace != "swh" {
137            return Err(Namespace(namespace.to_string()));
138        }
139        let Some(namespace_version) = tokens.next() else {
140            return Err(Syntax("SWHID is too short (no namespace version)"));
141        };
142        if namespace_version != "1" {
143            return Err(Version(namespace_version.to_string()));
144        }
145        let Some(node_type) = tokens.next() else {
146            return Err(Syntax("SWHID is too short (no object type)"));
147        };
148        let Some(hex_hash) = tokens.next() else {
149            return Err(Syntax("SWHID is too short (no object hash)"));
150        };
151        if hex_hash.len() != 40 {
152            return Err(HashLength {
153                expected: 40,
154                got: hex_hash.len(),
155            });
156        }
157        let node_type = node_type
158            .parse::<NodeType>()
159            .map_err(|e| Type(e.to_string()))?;
160        let mut hash = [0u8; 20];
161
162        // Miri does not support the SIMD feature-probing in faster-hex, so we have
163        // to fall back to a different crate.
164        #[cfg(all(miri, feature = "miri"))]
165        hex::decode_to_slice(hex_hash.as_bytes(), &mut hash)
166            .map_err(|_| HashAlphabet(hex_hash.to_string()))?;
167        #[cfg(all(miri, not(feature = "miri")))]
168        std::compile_error!("'miri' feature is required to compile with miri");
169        #[cfg(not(miri))]
170        faster_hex::hex_decode(hex_hash.as_bytes(), &mut hash)
171            .map_err(|_| HashAlphabet(hex_hash.to_string()))?;
172
173        Ok(Self {
174            namespace_version: 1,
175            node_type,
176            hash,
177        })
178    }
179}
180
181impl From<SWHID> for [u8; SWHID::BYTES_SIZE] {
182    fn from(value: SWHID) -> Self {
183        let mut result = [0; SWHID::BYTES_SIZE];
184        result[0] = value.namespace_version;
185        result[1] = value.node_type as u8;
186        result[2..].copy_from_slice(&value.hash);
187        result
188    }
189}
190
191impl RadixKey for SWHID {
192    const LEVELS: usize = 22;
193
194    #[inline(always)]
195    fn get_level(&self, level: usize) -> u8 {
196        assert!(level < Self::LEVELS);
197        match Self::LEVELS - level - 1 {
198            0 => self.namespace_version,
199            1 => match self.node_type {
200                // must follow alphabetical order of the 3-char abbreviation
201                NodeType::Content => 0,   // cnt
202                NodeType::Directory => 1, // dir
203                NodeType::Origin => 2,    // ori
204                NodeType::Release => 3,   // rel
205                NodeType::Revision => 4,  // rev
206                NodeType::Snapshot => 5,  // rel
207            },
208            n => self.hash[n - 2],
209        }
210    }
211}
212
213impl Ord for SWHID {
214    #[inline(always)]
215    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
216        for level in (0..Self::LEVELS).rev() {
217            let ordering = self.get_level(level).cmp(&other.get_level(level));
218            if ordering != std::cmp::Ordering::Equal {
219                return ordering;
220            }
221        }
222        std::cmp::Ordering::Equal
223    }
224}
225impl PartialOrd for SWHID {
226    #[inline(always)]
227    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
228        Some(self.cmp(other))
229    }
230}
231
/// Serializes a SWHID as the string produced by its `Display` implementation.
#[cfg(feature = "serde")]
impl serde::Serialize for SWHID {
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.collect_str(self)
    }
}
241
/// Deserializes a SWHID from a string, delegating the parsing to
/// `SwhidVisitor`.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for SWHID {
    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        deserializer.deserialize_str(SwhidVisitor)
    }
}
250
/// Serde visitor that turns a string into a [`SWHID`].
#[cfg(feature = "serde")]
struct SwhidVisitor;

#[cfg(feature = "serde")]
impl serde::de::Visitor<'_> for SwhidVisitor {
    type Value = SWHID;

    /// Describes the expected input, for serde's error messages.
    fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
        write!(formatter, "a SWHID")
    }

    /// Parses `value` as a SWHID, wrapping any parse error into the
    /// deserializer's error type.
    fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        value.parse::<SWHID>().map_err(E::custom)
    }
}
269
/// Helper function for [`swhid!()`]
///
/// Decodes `hash` as the hexadecimal form of a 20-byte hash and builds a
/// version-1 SWHID of the given node type.
///
/// # Panics
///
/// Panics with "invalid SWHID hash" if `hash` cannot be decoded into exactly
/// 20 bytes. In a `const` context this is a compilation error.
#[doc(hidden)]
#[cfg(feature = "macros")]
pub const fn __parse_swhid(node_type: crate::NodeType, hash: &'static str) -> SWHID {
    // Replace the decoder's error by a fixed message that can be formatted in
    // a const panic.
    let decoded: Result<[u8; 20], &'static str> =
        match const_hex::const_decode_to_array(hash.as_bytes()) {
            Ok(bytes) => Ok(bytes),
            Err(_) => Err("invalid SWHID hash"),
        };

    SWHID {
        namespace_version: 1,
        node_type,
        hash: const_panic::unwrap_ok!(decoded),
    }
}
284
/// A SWHID literal checked at compile time
///
/// The hash is taken as a single token and decoded in a `const` context, so
/// a malformed hash is a compilation error rather than a runtime one. The
/// hash may start with a decimal digit (it is then a literal token) or with
/// a letter (it is then an identifier token).
///
/// NOTE(review): a hash that Rust cannot lex as one token cannot be written
/// with this macro; parse such SWHIDs from a string instead. Confirm which
/// hashes are affected.
///
/// # Examples
///
/// ```
/// use swh_graph::swhid;
/// assert_eq!(
///     swhid!(swh:1:rev:0000000000000000000000000000000000000004).to_string(),
///     "swh:1:rev:0000000000000000000000000000000000000004".to_string(),
/// );
/// ```
///
/// ```
/// use swh_graph::swhid;
/// assert_eq!(
///     swhid!(swh:1:rev:a000000000000000000000000000000000000004).to_string(),
///     "swh:1:rev:a000000000000000000000000000000000000004".to_string(),
/// );
/// ```
///
/// ```compile_fail
/// use swh_graph::swhid;
/// swhid!(swh:1:rev:ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ);
/// ```
///
/// ```compile_fail
/// use swh_graph::swhid;
/// swhid!(swh:1:rev:00000000000000000000000000000000000004);
/// ```
#[cfg(feature = "macros")]
#[macro_export]
macro_rules! swhid {
    // Internal rule shared by all node types: evaluates the parser inside a
    // `const` item so that an invalid hash fails the build.
    (@build $node_type:ident, $hash:tt) => {{
        const PARSED_SWHID: $crate::SWHID =
            $crate::__parse_swhid($crate::NodeType::$node_type, stringify!($hash));
        PARSED_SWHID
    }};
    // `$hash:tt` accepts both literal tokens (hash starting with a digit) and
    // identifier tokens (hash starting with a letter).
    (swh:1:cnt:$hash:tt) => {
        $crate::swhid!(@build Content, $hash)
    };
    (swh:1:dir:$hash:tt) => {
        $crate::swhid!(@build Directory, $hash)
    };
    (swh:1:rev:$hash:tt) => {
        $crate::swhid!(@build Revision, $hash)
    };
    (swh:1:rel:$hash:tt) => {
        $crate::swhid!(@build Release, $hash)
    };
    (swh:1:snp:$hash:tt) => {
        $crate::swhid!(@build Snapshot, $hash)
    };
    (swh:1:ori:$hash:tt) => {
        $crate::swhid!(@build Origin, $hash)
    };
}