Skip to main content

doiget_core/
canonical.rs

1//! Canonical-tuple audit identity for fetched papers (ADR-0021 §1, ADR-0024).
2//!
3//! Binding spec: [`docs/DECISIONS/0021-canonical-tuple-identity.md`](../../../docs/DECISIONS/0021-canonical-tuple-identity.md)
4//! §1 (NORMATIVE shape + digest algorithm) and
5//! [`docs/DECISIONS/0024-canonical-ref-impl.md`](../../../docs/DECISIONS/0024-canonical-ref-impl.md)
6//! (implementation supersession).
7//!
8//! # Algorithm (NORMATIVE)
9//!
10//! ```text
11//! canonical_digest := SHA256( source_type | 0x00 | source_id | 0x00
12//!                           | resolver_profile | 0x00 | version_or_empty )
13//! ```
14//!
15//! where `|` is byte concatenation, `0x00` is the single-byte field
16//! separator, and `version_or_empty` is the literal empty byte sequence
17//! when `version` is `None` (ADR-0021 §1 Slice 2 clarification — NO
18//! sentinel like `"null"` / `"none"` / `"-"`).
19//!
//! The `source_type` wire tokens fed into the digest match the
//! lowercase variant names used by [`Ref`]'s tagged serde encoding:
//! currently `"doi"` and `"arxiv"`. Future variants (Pmid, Handle, ...)
//! MUST use their lowercase variant name as the wire token.
24
25use serde::{Deserialize, Serialize};
26use sha2::{Digest, Sha256};
27
28use crate::Ref;
29
30/// Identifier-class tag for a [`CanonicalRef`].
31///
32/// Marked `#[non_exhaustive]` so adding new classes (Pmid, Handle, etc.)
33/// in a future minor bump is non-breaking. The lowercase variant name is
34/// the byte string fed into the canonical-digest hash input
35/// (ADR-0021 §1) — renaming a variant is a breaking change to the
36/// digest contract.
37#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
38#[serde(rename_all = "lowercase")]
39#[non_exhaustive]
40pub enum SourceType {
41    /// DOI source class.
42    Doi,
43    /// arXiv source class.
44    Arxiv,
45}
46
47impl SourceType {
48    /// The lowercase wire token used in the canonical-digest hash input
49    /// (ADR-0021 §1) and in the JSON wire form.
50    pub fn as_wire_str(&self) -> &'static str {
51        match self {
52            SourceType::Doi => "doi",
53            SourceType::Arxiv => "arxiv",
54        }
55    }
56}
57
58/// Four-tuple audit identity of a fetched paper (ADR-0021 §1).
59///
60/// Two fetches of the same DOI through different resolvers (e.g.
61/// Crossref vs. Unpaywall) produce two distinct `CanonicalRef` values
62/// and therefore two distinct [`CanonicalRef::digest`] outputs. This is
63/// the resolver-distinction the audit log uses to separate "fetched via
64/// Crossref" from "fetched via Unpaywall" without changing the on-disk
65/// filename derivation (which remains keyed on [`Ref`] via
66/// [`crate::Safekey`]).
67///
68/// Marked `#[non_exhaustive]` per `docs/PUBLIC_API.md` §6 — adding new
69/// fields in a future minor bump is non-breaking. External callers
70/// construct values via [`Ref::promote`] or [`CanonicalRef::new`], not
71/// by struct-literal.
72#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
73#[non_exhaustive]
74pub struct CanonicalRef {
75    /// Identifier class (DOI / arXiv / future).
76    pub source_type: SourceType,
77    /// Validated identifier string (`"10.1234/foo"`, `"2401.12345"`).
78    pub source_id: String,
79    /// Resolver key that produced this audit identity. One of the
80    /// existing resolver keys (`"crossref"`, `"unpaywall"`, `"arxiv"`,
81    /// `"oa-publisher"`) plus future ones (`"openalex"`, `"s2"`, ...).
82    pub resolver_profile: String,
83    /// Optional version (arXiv `"v2"`, Crossref-snapshot date). When
84    /// `None`, the digest treats this field as the empty byte sequence
85    /// (ADR-0021 §1 Slice 2 clarification — no sentinel).
86    pub version: Option<String>,
87}
88
89impl CanonicalRef {
90    /// Construct a new `CanonicalRef` from its four fields.
91    ///
92    /// The struct is `#[non_exhaustive]`; external code MUST use this
93    /// constructor (or [`Ref::promote`]) rather than a struct literal.
94    pub fn new(
95        source_type: SourceType,
96        source_id: impl Into<String>,
97        resolver_profile: impl Into<String>,
98        version: Option<String>,
99    ) -> Self {
100        Self {
101            source_type,
102            source_id: source_id.into(),
103            resolver_profile: resolver_profile.into(),
104            version,
105        }
106    }
107
108    /// Compute the canonical-digest per ADR-0021 §1.
109    ///
110    /// ```text
111    /// SHA256( source_type | 0x00 | source_id | 0x00
112    ///       | resolver_profile | 0x00 | version_or_empty )
113    /// ```
114    ///
115    /// `version_or_empty` is the empty byte sequence when
116    /// [`Self::version`] is `None` (ADR-0021 §1 NORMATIVE clarification
117    /// — no `"null"` / `"none"` / `"-"` sentinel).
118    pub fn digest(&self) -> [u8; 32] {
119        let mut hasher = Sha256::new();
120        hasher.update(self.source_type.as_wire_str().as_bytes());
121        hasher.update([0x00]);
122        hasher.update(self.source_id.as_bytes());
123        hasher.update([0x00]);
124        hasher.update(self.resolver_profile.as_bytes());
125        hasher.update([0x00]);
126        if let Some(v) = &self.version {
127            hasher.update(v.as_bytes());
128        }
129        // When `version` is None, the trailing input is the empty byte
130        // sequence — the SHA-256 state is finalized with no further
131        // updates. This matches ADR-0021 §1 Slice 2: no sentinel.
132        hasher.finalize().into()
133    }
134
135    /// Hex-encoded [`Self::digest`] — 64 lowercase ASCII hex chars.
136    pub fn digest_hex(&self) -> String {
137        hex::encode(self.digest())
138    }
139}
140
141impl Ref {
142    /// Promote a [`Ref`] to a [`CanonicalRef`] under the given resolver
143    /// profile and optional version (ADR-0021 §1).
144    ///
145    /// `resolver_profile` MUST be the resolver key that produced (or is
146    /// about to produce) the audit identity — e.g. `"crossref"` for
147    /// Crossref metadata, `"oa-publisher"` for the publisher-side OA
148    /// PDF leg, `"arxiv"` for arXiv. The string is byte-for-byte fed
149    /// into the SHA-256 input.
150    ///
151    /// `version` is the optional source-specific version token (arXiv
152    /// `"v2"`, Crossref snapshot date). `None` selects the empty-bytes
153    /// branch of the digest algorithm.
154    pub fn promote(&self, resolver_profile: &str, version: Option<&str>) -> CanonicalRef {
155        let (source_type, source_id) = match self {
156            Ref::Doi(d) => (SourceType::Doi, d.as_str().to_string()),
157            Ref::Arxiv(a) => (SourceType::Arxiv, a.as_str().to_string()),
158        };
159        CanonicalRef {
160            source_type,
161            source_id,
162            resolver_profile: resolver_profile.to_string(),
163            version: version.map(str::to_string),
164        }
165    }
166}
167
// ---------------------------------------------------------------------------
// Tests — canonical-digest reference vectors (ADR-0021 §1, ADR-0024).
//
// Each vector fixes ONE concrete (source_type, source_id, resolver_profile,
// version) tuple and checks the SHA-256 hex digest produced for it against
// an in-test reference implementation that re-concatenates the bytes and
// runs one SHA-256 pass (no literal hex digest is hard-coded). The
// streaming digest and the reference digest must agree byte-for-byte; a
// divergence is a NORMATIVE-contract break.
// ---------------------------------------------------------------------------
177
#[cfg(test)]
// Tests signal failure by panicking, so the panic-related clippy lints
// are allowed inside this module.
#[allow(clippy::expect_used, clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;
    use crate::{ArxivId, Doi};

    /// Reference implementation: concatenate the four parts with single
    /// zero-byte separators and run one SHA-256 pass. Used to
    /// double-check the streaming impl in [`CanonicalRef::digest`].
    ///
    /// `source_type` is passed as a literal token (not derived from
    /// [`SourceType`]) so a changed wire token is caught as a mismatch.
    fn reference_digest_hex(
        source_type: &str,
        source_id: &str,
        resolver_profile: &str,
        version: Option<&str>,
    ) -> String {
        let mut buf: Vec<u8> = Vec::new();
        buf.extend_from_slice(source_type.as_bytes());
        buf.push(0x00);
        buf.extend_from_slice(source_id.as_bytes());
        buf.push(0x00);
        buf.extend_from_slice(resolver_profile.as_bytes());
        buf.push(0x00);
        if let Some(v) = version {
            buf.extend_from_slice(v.as_bytes());
        }
        let d = Sha256::digest(&buf);
        hex::encode(d)
    }

    #[test]
    fn digest_matches_reference_doi_crossref_no_version() {
        // ADR-0021 §1 — version=None MUST be the empty byte sequence,
        // not a sentinel. The reference impl and the streaming impl
        // must agree.
        let c = CanonicalRef::new(SourceType::Doi, "10.1234/foo", "crossref", None);
        let expected = reference_digest_hex("doi", "10.1234/foo", "crossref", None);
        assert_eq!(c.digest_hex(), expected);
    }

    #[test]
    fn digest_matches_reference_doi_unpaywall_no_version() {
        let c = CanonicalRef::new(SourceType::Doi, "10.1234/foo", "unpaywall", None);
        let expected = reference_digest_hex("doi", "10.1234/foo", "unpaywall", None);
        assert_eq!(c.digest_hex(), expected);
        // Crossref vs. Unpaywall MUST differ — that's the whole point
        // of the audit identity (ADR-0021 Context).
        let c_cross = CanonicalRef::new(SourceType::Doi, "10.1234/foo", "crossref", None);
        assert_ne!(c.digest_hex(), c_cross.digest_hex());
    }

    #[test]
    fn digest_matches_reference_doi_oa_publisher_no_version() {
        let c = CanonicalRef::new(SourceType::Doi, "10.1234/foo", "oa-publisher", None);
        let expected = reference_digest_hex("doi", "10.1234/foo", "oa-publisher", None);
        assert_eq!(c.digest_hex(), expected);
    }

    #[test]
    fn digest_matches_reference_arxiv_no_version() {
        let c = CanonicalRef::new(SourceType::Arxiv, "2401.12345", "arxiv", None);
        let expected = reference_digest_hex("arxiv", "2401.12345", "arxiv", None);
        assert_eq!(c.digest_hex(), expected);
    }

    #[test]
    fn digest_matches_reference_arxiv_with_version_v2() {
        let c = CanonicalRef::new(SourceType::Arxiv, "2401.12345", "arxiv", Some("v2".into()));
        let expected = reference_digest_hex("arxiv", "2401.12345", "arxiv", Some("v2"));
        assert_eq!(c.digest_hex(), expected);
        // v2 MUST differ from version=None.
        let c_none = CanonicalRef::new(SourceType::Arxiv, "2401.12345", "arxiv", None);
        assert_ne!(c.digest_hex(), c_none.digest_hex());
    }

    #[test]
    fn digest_matches_reference_arxiv_with_version_v10() {
        let c = CanonicalRef::new(SourceType::Arxiv, "2401.12345", "arxiv", Some("v10".into()));
        let expected = reference_digest_hex("arxiv", "2401.12345", "arxiv", Some("v10"));
        assert_eq!(c.digest_hex(), expected);
    }

    #[test]
    fn digest_matches_reference_doi_crossref_with_snapshot_date() {
        let c = CanonicalRef::new(
            SourceType::Doi,
            "10.1234/foo",
            "crossref",
            Some("2026-05-12".into()),
        );
        let expected = reference_digest_hex("doi", "10.1234/foo", "crossref", Some("2026-05-12"));
        assert_eq!(c.digest_hex(), expected);
    }

    #[test]
    fn digest_matches_reference_real_publisher_doi() {
        let c = CanonicalRef::new(
            SourceType::Doi,
            "10.1103/PhysRevLett.130.200601",
            "oa-publisher",
            None,
        );
        let expected = reference_digest_hex(
            "doi",
            "10.1103/PhysRevLett.130.200601",
            "oa-publisher",
            None,
        );
        assert_eq!(c.digest_hex(), expected);
    }

    #[test]
    fn digest_some_empty_string_version_equals_none_version() {
        // ADR-0021 §1: the empty-bytes trailing input is selected by
        // BOTH `None` AND `Some("")` — neither appends any bytes after
        // the third 0x00 separator. Pin the equality on the wire.
        let c_some_empty = CanonicalRef::new(
            SourceType::Doi,
            "10.1234/foo",
            "crossref",
            Some(String::new()),
        );
        let c_none = CanonicalRef::new(SourceType::Doi, "10.1234/foo", "crossref", None);
        assert_eq!(c_some_empty.digest_hex(), c_none.digest_hex());
    }

    #[test]
    fn digest_matches_reference_old_style_arxiv() {
        let c = CanonicalRef::new(SourceType::Arxiv, "cond-mat/9501001", "arxiv", None);
        let expected = reference_digest_hex("arxiv", "cond-mat/9501001", "arxiv", None);
        assert_eq!(c.digest_hex(), expected);
    }

    #[test]
    fn digest_hex_is_64_lowercase_hex_chars() {
        // The wire form is 64 lowercase hex chars (SHA-256 = 32 bytes).
        let c = CanonicalRef::new(SourceType::Doi, "10.1234/foo", "crossref", None);
        let s = c.digest_hex();
        assert_eq!(s.len(), 64);
        assert!(
            s.chars()
                .all(|ch| ch.is_ascii_hexdigit() && !ch.is_ascii_uppercase()),
            "digest_hex must be lowercase ASCII hex, got {s}"
        );
    }

    #[test]
    fn ref_promote_doi_round_trip() {
        // Spec: Ref::promote builds a CanonicalRef with the given
        // resolver / version and copies the inner id through verbatim.
        let r = Ref::Doi(Doi("10.1234/foo".into()));
        let c = r.promote("crossref", None);
        assert!(matches!(c.source_type, SourceType::Doi));
        assert_eq!(c.source_id, "10.1234/foo");
        assert_eq!(c.resolver_profile, "crossref");
        assert!(c.version.is_none());
    }

    #[test]
    fn ref_promote_arxiv_with_version_round_trip() {
        let r = Ref::Arxiv(ArxivId("2401.12345".into()));
        let c = r.promote("arxiv", Some("v2"));
        assert!(matches!(c.source_type, SourceType::Arxiv));
        assert_eq!(c.source_id, "2401.12345");
        assert_eq!(c.resolver_profile, "arxiv");
        assert_eq!(c.version.as_deref(), Some("v2"));
    }

    #[test]
    fn ref_promote_then_digest_matches_direct_construction() {
        let r = Ref::Doi(Doi("10.1234/foo".into()));
        let c_promoted = r.promote("crossref", None);
        let c_direct = CanonicalRef::new(SourceType::Doi, "10.1234/foo", "crossref", None);
        assert_eq!(c_promoted.digest_hex(), c_direct.digest_hex());
    }

    #[test]
    fn source_type_serializes_lowercase() {
        // The lowercase variant name is the wire token AND the digest
        // input. Both must agree, so each variant is checked against
        // its serde encoding, its `as_wire_str` token, and a
        // deserialize round-trip of the encoded form.
        for (variant, token) in [(SourceType::Doi, "doi"), (SourceType::Arxiv, "arxiv")] {
            let json = serde_json::to_string(&variant).expect("serialize");
            assert_eq!(json, format!("\"{token}\""));
            assert_eq!(variant.as_wire_str(), token);
            let parsed: SourceType = serde_json::from_str(&json).expect("deserialize");
            assert_eq!(parsed, variant);
        }
    }
}