swh_graph/
swhid.rs

1// Copyright (C) 2023-2024  The Software Heritage developers
2// See the AUTHORS file at the top-level directory of this distribution
3// License: GNU General Public License version 3, or any later version
4// See top-level LICENSE file for more information
5
6use std::str::FromStr;
7
8use rdst::RadixKey;
9use sha1::{Digest, Sha1};
10use thiserror::Error;
11
12use crate::NodeType;
13
14#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
15#[repr(C)]
16/// SoftWare Heritage persistent IDentifiers
17///
18/// A SWHID consists of two separate parts, a mandatory core identifier that
19/// can point to any software artifact (or “object”) available in the Software
20/// Heritage archive, and an optional list of qualifiers that allows to specify
21/// the context where the object is meant to be seen and point to a subpart of
22/// the object itself.
23///
24/// # Reference
25/// - <https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html>
26/// - Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. [Identifiers for Digital Objects: the Case of Software Source Code Preservation](https://hal.archives-ouvertes.fr/hal-01865790v4). In Proceedings of iPRES 2018: 15th International Conference on Digital Preservation, Boston, MA, USA, September 2018, 9 pages.
27/// - Roberto Di Cosmo, Morane Gruenpeter, Stefano Zacchiroli. [Referencing Source Code Artifacts: a Separate Concern in Software Citation](https://arxiv.org/abs/2001.08647). In Computing in Science and Engineering, volume 22, issue 2, pages 33-43. ISSN 1521-9615, IEEE. March 2020.
28pub struct SWHID {
29    /// Namespace Version
30    pub namespace_version: u8,
31    /// Node type
32    pub node_type: NodeType,
33    /// SHA1 has of the node
34    pub hash: [u8; 20],
35}
36
37impl SWHID {
38    /// The size of the binary representation of a SWHID
39    pub const BYTES_SIZE: usize = 22;
40
41    /// Loads the SWHID representation for a origin uri
42    /// akin to "swh:1:ori:{}"
43    pub fn from_origin_url(origin: impl AsRef<str>) -> SWHID {
44        let mut hasher = Sha1::new();
45        hasher.update(origin.as_ref());
46
47        SWHID {
48            namespace_version: 1,
49            node_type: NodeType::Origin,
50            hash: hasher.finalize().into(),
51        }
52    }
53}
54
55impl core::fmt::Display for SWHID {
56    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
57        write!(
58            f,
59            "swh:{}:{}:",
60            self.namespace_version,
61            self.node_type.to_str(),
62        )?;
63        for byte in self.hash.iter() {
64            write!(f, "{byte:02x}")?;
65        }
66        Ok(())
67    }
68}
69
70#[derive(Error, Debug)]
71pub enum BinSWHIDDeserializationError {
72    #[error("Unsupported SWHID version: {0}")]
73    Version(u8),
74    #[error("Invalid SWHID type: {0}")]
75    Type(u8),
76}
77
78/// Parse a SWHID from bytes, while the SWHID struct has the exact same layout
79/// and thus it can be read directly from bytes, this function is provided for
80/// completeness and safety because we can check the namespace version is
81/// supported.
82impl TryFrom<[u8; SWHID::BYTES_SIZE]> for SWHID {
83    type Error = BinSWHIDDeserializationError;
84    fn try_from(value: [u8; SWHID::BYTES_SIZE]) -> std::result::Result<Self, Self::Error> {
85        use BinSWHIDDeserializationError::*;
86
87        let namespace_version = value[0];
88        if namespace_version != 1 {
89            return Err(Version(namespace_version));
90        }
91        let node_type = NodeType::try_from(value[1]).map_err(Type)?;
92        let mut hash = [0; 20];
93        hash.copy_from_slice(&value[2..]);
94        Ok(Self {
95            namespace_version,
96            node_type,
97            hash,
98        })
99    }
100}
101
102#[derive(Error, Debug, PartialEq, Eq, Hash)]
103pub enum StrSWHIDDeserializationError {
104    #[error("Invalid syntax: {0}")]
105    Syntax(&'static str),
106    #[error("Unsupported SWHID namespace: {0}")]
107    Namespace(String),
108    #[error("Unsupported SWHID version: {0}")]
109    Version(String),
110    #[error("Expected hash length to be {expected}, got {got}")]
111    HashLength { expected: usize, got: usize },
112    #[error("Invalid SWHID type: {0}")]
113    Type(String),
114    #[error("SWHID hash is not hexadecimal: {0}")]
115    HashAlphabet(String),
116}
117
118/// Parse a SWHID from the string representation
119impl TryFrom<&str> for SWHID {
120    type Error = StrSWHIDDeserializationError;
121    fn try_from(value: &str) -> std::result::Result<Self, Self::Error> {
122        Self::from_str(value)
123    }
124}
125
126impl FromStr for SWHID {
127    type Err = StrSWHIDDeserializationError;
128
129    fn from_str(value: &str) -> Result<Self, Self::Err> {
130        use StrSWHIDDeserializationError::*;
131
132        let mut tokens = value.splitn(4, ':');
133        let Some(namespace) = tokens.next() else {
134            return Err(Syntax("SWHID is empty"));
135        };
136        if namespace != "swh" {
137            return Err(Namespace(namespace.to_string()));
138        }
139        let Some(namespace_version) = tokens.next() else {
140            return Err(Syntax("SWHID is too short (no namespace version)"));
141        };
142        if namespace_version != "1" {
143            return Err(Version(namespace_version.to_string()));
144        }
145        let Some(node_type) = tokens.next() else {
146            return Err(Syntax("SWHID is too short (no object type)"));
147        };
148        let Some(hex_hash) = tokens.next() else {
149            return Err(Syntax("SWHID is too short (no object hash)"));
150        };
151        if hex_hash.len() != 40 {
152            return Err(HashLength {
153                expected: 40,
154                got: hex_hash.len(),
155            });
156        }
157        let node_type = node_type
158            .parse::<NodeType>()
159            .map_err(|e| Type(e.to_string()))?;
160        let mut hash = [0u8; 20];
161
162        // Miri does not support the SIMD feature-probing in faster-hex, so we have
163        // to fall back to a different crate.
164        #[cfg(all(miri, feature = "miri"))]
165        hex::decode_to_slice(hex_hash.as_bytes(), &mut hash)
166            .map_err(|_| HashAlphabet(hex_hash.to_string()))?;
167        #[cfg(all(miri, not(feature = "miri")))]
168        std::compile_error!("'miri' feature is required to compile with miri");
169        #[cfg(not(miri))]
170        faster_hex::hex_decode(hex_hash.as_bytes(), &mut hash)
171            .map_err(|_| HashAlphabet(hex_hash.to_string()))?;
172
173        Ok(Self {
174            namespace_version: 1,
175            node_type,
176            hash,
177        })
178    }
179}
180
181impl From<SWHID> for [u8; SWHID::BYTES_SIZE] {
182    fn from(value: SWHID) -> Self {
183        let mut result = [0; SWHID::BYTES_SIZE];
184        result[0] = value.namespace_version;
185        result[1] = value.node_type as u8;
186        result[2..].copy_from_slice(&value.hash);
187        result
188    }
189}
190
191impl RadixKey for SWHID {
192    const LEVELS: usize = 22;
193
194    #[inline(always)]
195    fn get_level(&self, level: usize) -> u8 {
196        assert!(level < Self::LEVELS);
197        match Self::LEVELS - level - 1 {
198            0 => self.namespace_version,
199            1 => match self.node_type {
200                // must follow alphabetical order of the 3-char abbreviation
201                NodeType::Content => 0,   // cnt
202                NodeType::Directory => 1, // dir
203                NodeType::Origin => 2,    // ori
204                NodeType::Release => 3,   // rel
205                NodeType::Revision => 4,  // rev
206                NodeType::Snapshot => 5,  // rel
207            },
208            n => self.hash[n - 2],
209        }
210    }
211}
212
213impl Ord for SWHID {
214    #[inline(always)]
215    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
216        for level in (0..Self::LEVELS).rev() {
217            let ordering = self.get_level(level).cmp(&other.get_level(level));
218            if ordering != std::cmp::Ordering::Equal {
219                return ordering;
220            }
221        }
222        std::cmp::Ordering::Equal
223    }
224}
225impl PartialOrd for SWHID {
226    #[inline(always)]
227    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
228        Some(self.cmp(other))
229    }
230}
231
/// Serializes the SWHID as a string, using its `Display` representation.
#[cfg(feature = "serde")]
impl serde::Serialize for SWHID {
    fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        serializer.collect_str(self)
    }
}
241
/// Deserializes a SWHID from a string, through `SwhidVisitor`.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for SWHID {
    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        deserializer.deserialize_str(SwhidVisitor)
    }
}
250
/// Serde visitor that builds a [`SWHID`] from its string representation
#[cfg(feature = "serde")]
struct SwhidVisitor;

#[cfg(feature = "serde")]
impl serde::de::Visitor<'_> for SwhidVisitor {
    type Value = SWHID;

    fn expecting(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        f.write_str("a SWHID")
    }

    /// Parses the string as a SWHID, turning a parsing failure into a custom
    /// serde error.
    fn visit_str<E>(self, text: &str) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        SWHID::try_from(text).map_err(E::custom)
    }
}
269
/// Helper function for [`swhid!()`]
///
/// Builds a version-1 SWHID of the given node type from a hexadecimal hash,
/// and panics if the hash cannot be decoded into the 20 bytes of the hash
/// field. Being a `const fn`, it can be evaluated at compile time.
#[doc(hidden)]
#[cfg(feature = "macros")]
pub const fn __parse_swhid(node_type: crate::NodeType, hash: &'static str) -> SWHID {
    let parsed: Result<SWHID, &'static str> =
        match const_hex::const_decode_to_array(hash.as_bytes()) {
            Ok(hash) => Ok(SWHID {
                namespace_version: 1,
                node_type,
                hash,
            }),
            Err(_) => Err("invalid SWHID hash"),
        };

    const_panic::unwrap_ok!(parsed)
}
284
#[cfg(feature = "macros")]
/// A SWHID literal checked at compile time
///
/// The hash is decoded in a `const` context, so a hash that is not valid
/// hexadecimal or does not have the right length is a compilation error.
///
/// # Examples
///
/// ```
/// use swh_graph::swhid;
/// assert_eq!(
///     swhid!(swh:1:rev:0000000000000000000000000000000000000004).to_string(),
///     "swh:1:rev:0000000000000000000000000000000000000004".to_string(),
/// );
/// assert_eq!(
///     swhid!(swh:1:rev:ffffffffffffffffffffffffffff000000000004).to_string(),
///     "swh:1:rev:ffffffffffffffffffffffffffff000000000004".to_string(),
/// );
/// assert_eq!(
///     swhid!(swh:1:rev:FFFFFFFFFFFFFFFFFFFFFFFFFFFF000000000004).to_string(),
///     "swh:1:rev:ffffffffffffffffffffffffffff000000000004".to_string(),
/// );
/// ```
///
/// ```compile_fail
/// use swh_graph::swhid;
/// swhid!(swh:1:rev:ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ);
/// ```
///
/// ```compile_fail
/// use swh_graph::swhid;
/// swhid!(swh:1:rev:00000000000000000000000000000000000004);
/// ```
#[macro_export]
macro_rules! swhid {
    // Internal rule shared by every public arm below (not part of the public
    // syntax): evaluates the parsing in a `const` item, so that an invalid
    // hash fails the build instead of panicking at run time.
    //
    // Paths go through `$crate` so that the expansion does not depend on the
    // name under which the calling crate imports this one.
    (@parse $node_type:ident, $hash:tt) => {{
        const PARSED: $crate::SWHID =
            $crate::__parse_swhid($crate::NodeType::$node_type, stringify!($hash));
        PARSED
    }};

    // hash starting with a decimal digit
    (swh:1:cnt:$hash:literal) => {
        $crate::swhid!(@parse Content, $hash)
    };
    (swh:1:dir:$hash:literal) => {
        $crate::swhid!(@parse Directory, $hash)
    };
    (swh:1:rev:$hash:literal) => {
        $crate::swhid!(@parse Revision, $hash)
    };
    (swh:1:rel:$hash:literal) => {
        $crate::swhid!(@parse Release, $hash)
    };
    (swh:1:snp:$hash:literal) => {
        $crate::swhid!(@parse Snapshot, $hash)
    };
    (swh:1:ori:$hash:literal) => {
        $crate::swhid!(@parse Origin, $hash)
    };

    // hash starting with a to f
    (swh:1:cnt:$hash:ident) => {
        $crate::swhid!(@parse Content, $hash)
    };
    (swh:1:dir:$hash:ident) => {
        $crate::swhid!(@parse Directory, $hash)
    };
    (swh:1:rev:$hash:ident) => {
        $crate::swhid!(@parse Revision, $hash)
    };
    (swh:1:rel:$hash:ident) => {
        $crate::swhid!(@parse Release, $hash)
    };
    (swh:1:snp:$hash:ident) => {
        $crate::swhid!(@parse Snapshot, $hash)
    };
    (swh:1:ori:$hash:ident) => {
        $crate::swhid!(@parse Origin, $hash)
    };
}