Skip to main content

acdp_types/
data_ref.rs

1//! Data references — `acdp-data-ref.schema.json`.
2//!
3//! Each `DataRef` MUST contain exactly one of `location` (URI string or
4//! structured locator object) or `embedded` (inline payload, ≤ 64 KB
5//! decoded). The `type` field is a closed enum identifying the role of the
6//! reference within the context.
7
8use acdp_primitives::primitives::ContentHash;
9use serde::{Deserialize, Serialize};
10
11/// Role of a data reference. Closed enum per
12/// `acdp-data-ref.schema.json` `type`.
13#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
14#[serde(rename_all = "snake_case")]
15pub enum DataRefType {
16    /// The principal output of the context.
17    PrimaryResult,
18    /// Source data the context describes or refers back to.
19    RawData,
20    /// Auxiliary material that supports the context (notes, plots, etc.).
21    SupportingInfo,
22    /// Output computed/derived from the primary result.
23    DerivedData,
24}
25
26/// A reference to a piece of data the context describes.
27///
28/// Per `acdp-data-ref.schema.json` `oneOf`: exactly one of `location` or
29/// `embedded` MUST be present. The struct does not enforce this at
30/// construction time; runtime validation is done by `validate_data_ref`.
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct DataRef {
33    /// Role of this reference within the context.
34    #[serde(rename = "type")]
35    pub ref_type: DataRefType,
36
37    /// Human-readable description (≤ 1000 chars).
38    ///
39    /// Optional and absent-or-string in `acdp-data-ref.schema.json` — not
40    /// nullable. `de_present` rejects an explicit `"description": null`.
41    #[serde(
42        default,
43        skip_serializing_if = "Option::is_none",
44        deserialize_with = "crate::serde_helpers::de_present"
45    )]
46    pub description: Option<String>,
47
48    /// Size of the referenced or embedded data in bytes.
49    #[serde(
50        default,
51        skip_serializing_if = "Option::is_none",
52        deserialize_with = "crate::serde_helpers::de_present"
53    )]
54    pub size_bytes: Option<u64>,
55
56    /// Producer-defined format identifier (e.g. `parquet`, `csv`).
57    #[serde(
58        default,
59        skip_serializing_if = "Option::is_none",
60        deserialize_with = "crate::serde_helpers::de_present"
61    )]
62    pub format: Option<String>,
63
64    /// Producer-specific schema version for this data.
65    #[serde(
66        default,
67        skip_serializing_if = "Option::is_none",
68        deserialize_with = "crate::serde_helpers::de_present"
69    )]
70    pub schema_version: Option<String>,
71
72    /// Optional SHA-256 hash for verifying data integrity at fetch time.
73    /// For embedded data, computed over the decoded bytes per
74    /// `acdp-data-ref.schema.json`.
75    #[serde(
76        default,
77        skip_serializing_if = "Option::is_none",
78        deserialize_with = "crate::serde_helpers::de_present"
79    )]
80    pub content_hash: Option<ContentHash>,
81
82    /// Where the data resides — either a URI string or a structured
83    /// locator object with a dotted-namespace `scheme` field.
84    #[serde(
85        default,
86        skip_serializing_if = "Option::is_none",
87        deserialize_with = "crate::serde_helpers::de_present"
88    )]
89    pub location: Option<Location>,
90
91    /// Inline embedded payload. Decoded size MUST NOT exceed 64 KB.
92    #[serde(
93        default,
94        skip_serializing_if = "Option::is_none",
95        deserialize_with = "crate::serde_helpers::de_present"
96    )]
97    pub embedded: Option<EmbeddedContent>,
98
99    /// Unknown producer-controlled `DataRef` fields, preserved verbatim.
100    ///
101    /// `acdp-data-ref.schema.json` has NO `additionalProperties: false`
102    /// at its root — the object is open by design. A `DataRef` lives
103    /// inside `ProducerContent` (the `content_hash` preimage), so a
104    /// future ACDP minor version that adds a producer-controlled DataRef
105    /// field must round-trip through this map: without it an older
106    /// consumer would silently drop the new field on deserialization and
107    /// recompute a different `content_hash`, falsely failing verification.
108    /// Mirrors the [`crate::body::Body::extensions`] pattern
109    /// (RFC-ACDP-0001 §5.7, conformance fixture can-010).
110    #[serde(flatten)]
111    pub extensions: serde_json::Map<String, serde_json::Value>,
112}
113
114/// Locator for `DataRef.location` — either a URI string or a structured
115/// locator object. See `acdp-data-ref.schema.json` `location.oneOf`.
116#[derive(Debug, Clone, Serialize, Deserialize)]
117#[serde(untagged)]
118pub enum Location {
119    /// URI form: scheme + authority + path. MUST NOT contain credentials in
120    /// the userinfo component (the body is signed and immutable, so leaked
121    /// secrets cannot be redacted later).
122    Uri(String),
123    /// Structured locator: object with a required dotted-namespace `scheme`
124    /// (e.g. `kafka.offset`, `ipfs.cid`, `db.row`). Additional keys are
125    /// permitted by schema.
126    Structured(serde_json::Map<String, serde_json::Value>),
127}
128
129impl DataRef {
130    /// URI-form data reference (no integrity hash).
131    pub fn uri(ref_type: DataRefType, uri: impl Into<String>) -> Self {
132        Self {
133            ref_type,
134            description: None,
135            size_bytes: None,
136            format: None,
137            schema_version: None,
138            content_hash: None,
139            location: Some(Location::Uri(uri.into())),
140            embedded: None,
141            extensions: serde_json::Map::new(),
142        }
143    }
144
145    /// URI-form data reference with a SHA-256 integrity hash.
146    pub fn uri_verified(ref_type: DataRefType, uri: impl Into<String>, hash: ContentHash) -> Self {
147        Self {
148            ref_type,
149            description: None,
150            size_bytes: None,
151            format: None,
152            schema_version: None,
153            content_hash: Some(hash),
154            location: Some(Location::Uri(uri.into())),
155            embedded: None,
156            extensions: serde_json::Map::new(),
157        }
158    }
159
160    /// Structured-locator data reference. `scheme` MUST match
161    /// `^[a-z][a-z0-9-]*(\.[a-z][a-z0-9-]*)+$`. Additional fields go in `extra`.
162    ///
163    /// In debug builds, an invalid scheme triggers a `debug_assert!`
164    /// to surface the bug at construction time. Release builds accept
165    /// the malformed value silently — pair this constructor with
166    /// `acdp::validation::validate_data_ref` (called automatically
167    /// by `RequestBuilder::build`) for runtime rejection. For a
168    /// fallible variant, use [`Self::try_structured`].
169    pub fn structured(
170        ref_type: DataRefType,
171        scheme: impl Into<String>,
172        extra: serde_json::Map<String, serde_json::Value>,
173    ) -> Self {
174        let scheme: String = scheme.into();
175        debug_assert!(
176            is_dotted_namespace_scheme(&scheme),
177            "DataRef::structured: scheme '{scheme}' does not match \
178             ^[a-z][a-z0-9-]*(\\.[a-z][a-z0-9-]*)+$ — pass a dotted-namespace identifier \
179             like 'kafka.offset' or use try_structured for runtime checking"
180        );
181        let mut map = extra;
182        map.insert("scheme".into(), serde_json::Value::String(scheme));
183        Self {
184            ref_type,
185            description: None,
186            size_bytes: None,
187            format: None,
188            schema_version: None,
189            content_hash: None,
190            location: Some(Location::Structured(map)),
191            embedded: None,
192            extensions: serde_json::Map::new(),
193        }
194    }
195
196    /// Fallible structured-locator constructor. Returns
197    /// [`acdp_primitives::error::AcdpError::SchemaViolation`] if `scheme` does
198    /// not match the dotted-namespace pattern.
199    pub fn try_structured(
200        ref_type: DataRefType,
201        scheme: impl Into<String>,
202        extra: serde_json::Map<String, serde_json::Value>,
203    ) -> Result<Self, acdp_primitives::error::AcdpError> {
204        let scheme: String = scheme.into();
205        if !is_dotted_namespace_scheme(&scheme) {
206            return Err(acdp_primitives::error::AcdpError::SchemaViolation(format!(
207                "structured locator scheme '{scheme}' must match \
208                 ^[a-z][a-z0-9-]*(\\.[a-z][a-z0-9-]*)+$"
209            )));
210        }
211        let mut map = extra;
212        map.insert("scheme".into(), serde_json::Value::String(scheme));
213        Ok(Self {
214            ref_type,
215            description: None,
216            size_bytes: None,
217            format: None,
218            schema_version: None,
219            content_hash: None,
220            location: Some(Location::Structured(map)),
221            embedded: None,
222            extensions: serde_json::Map::new(),
223        })
224    }
225
226    /// Embedded JSON data reference.
227    pub fn embedded_json(ref_type: DataRefType, content: serde_json::Value) -> Self {
228        Self {
229            ref_type,
230            description: None,
231            size_bytes: None,
232            format: Some("application/json".into()),
233            schema_version: None,
234            content_hash: None,
235            location: None,
236            embedded: Some(EmbeddedContent {
237                encoding: EmbeddedEncoding::Json,
238                content,
239            }),
240            extensions: serde_json::Map::new(),
241        }
242    }
243
244    /// Embedded UTF-8 text data reference. The text is stored as a JSON string.
245    pub fn embedded_utf8(ref_type: DataRefType, text: impl Into<String>) -> Self {
246        Self {
247            ref_type,
248            description: None,
249            size_bytes: None,
250            format: None,
251            schema_version: None,
252            content_hash: None,
253            location: None,
254            embedded: Some(EmbeddedContent {
255                encoding: EmbeddedEncoding::Utf8,
256                content: serde_json::Value::String(text.into()),
257            }),
258            extensions: serde_json::Map::new(),
259        }
260    }
261
262    /// Embedded base64 binary data reference. `b64` is stored as a JSON string.
263    pub fn embedded_base64(ref_type: DataRefType, b64: impl Into<String>) -> Self {
264        Self {
265            ref_type,
266            description: None,
267            size_bytes: None,
268            format: None,
269            schema_version: None,
270            content_hash: None,
271            location: None,
272            embedded: Some(EmbeddedContent {
273                encoding: EmbeddedEncoding::Base64,
274                content: serde_json::Value::String(b64.into()),
275            }),
276            extensions: serde_json::Map::new(),
277        }
278    }
279
280    // ── Type-bound URI shortcuts ─────────────────────────────────────────────
281    //
282    // The four DataRefType variants come up frequently; these one-liners
283    // save a `DataRefType::PrimaryResult` mention at every call site.
284
285    /// `DataRef::uri(DataRefType::PrimaryResult, uri)`.
286    pub fn primary_result_uri(uri: impl Into<String>) -> Self {
287        Self::uri(DataRefType::PrimaryResult, uri)
288    }
289    /// `DataRef::uri(DataRefType::RawData, uri)`.
290    pub fn raw_data_uri(uri: impl Into<String>) -> Self {
291        Self::uri(DataRefType::RawData, uri)
292    }
293    /// `DataRef::uri(DataRefType::SupportingInfo, uri)`.
294    pub fn supporting_info_uri(uri: impl Into<String>) -> Self {
295        Self::uri(DataRefType::SupportingInfo, uri)
296    }
297    /// `DataRef::uri(DataRefType::DerivedData, uri)`.
298    pub fn derived_data_uri(uri: impl Into<String>) -> Self {
299        Self::uri(DataRefType::DerivedData, uri)
300    }
301
302    /// `DataRef::embedded_json(DataRefType::PrimaryResult, content)`.
303    pub fn primary_result_json(content: serde_json::Value) -> Self {
304        Self::embedded_json(DataRefType::PrimaryResult, content)
305    }
306    /// `DataRef::embedded_json(DataRefType::DerivedData, content)`.
307    pub fn derived_data_json(content: serde_json::Value) -> Self {
308        Self::embedded_json(DataRefType::DerivedData, content)
309    }
310}
311
312/// Inline embedded data payload.
313#[derive(Debug, Clone, Serialize, Deserialize)]
314#[serde(deny_unknown_fields)]
315pub struct EmbeddedContent {
316    /// Content encoding / interpretation.
317    pub encoding: EmbeddedEncoding,
318    /// The actual content. For `json` encoding this is any JSON value.
319    /// For `utf8` / `base64` it MUST be a JSON string.
320    pub content: serde_json::Value,
321}
322
/// Returns `true` when `s` is a dotted-namespace scheme, i.e. matches
/// `^[a-z][a-z0-9-]*(\.[a-z][a-z0-9-]*)+$`.
///
/// In words: at least two `.`-separated segments, each starting with an
/// ASCII lowercase letter and continuing with ASCII lowercase letters,
/// ASCII digits, or `-`.
fn is_dotted_namespace_scheme(s: &str) -> bool {
    /// One segment: non-empty, leading `[a-z]`, then `[a-z0-9-]*`.
    fn is_valid_segment(segment: &str) -> bool {
        let mut rest = segment.bytes();
        match rest.next() {
            Some(first) if first.is_ascii_lowercase() => {
                rest.all(|b| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'-')
            }
            // Empty segment, or a first byte outside `a..=z`.
            _ => false,
        }
    }

    // A separator must be present (that is what yields two or more
    // segments), and every segment around it must be well-formed.
    s.contains('.') && s.split('.').all(is_valid_segment)
}
337
338/// How the `content` field of an embedded data reference is encoded.
339#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
340#[serde(rename_all = "lowercase")]
341pub enum EmbeddedEncoding {
342    /// Any JSON value (object, array, number, …).
343    Json,
344    /// A UTF-8 text payload encoded as a JSON string.
345    Utf8,
346    /// Binary data encoded as standard base64, stored as a JSON string.
347    Base64,
348}
349
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Borrows the embedded payload, failing the test when there is none.
    fn embedded_of(dr: &DataRef) -> &EmbeddedContent {
        dr.embedded.as_ref().expect("embedded set")
    }

    // ── is_dotted_namespace_scheme ─────────────────────────────────────────

    #[test]
    fn dotted_namespace_scheme_accepts_valid() {
        let valid = [
            "kafka.offset",
            "ipfs.cid",
            "db.row",
            "a.b",
            "a1.b2.c3",
            "with-hyphen.part-two",
        ];
        for s in valid.iter().copied() {
            assert!(is_dotted_namespace_scheme(s), "should accept {s:?}");
        }
    }

    #[test]
    fn dotted_namespace_scheme_rejects_invalid() {
        let invalid = [
            "",              // empty
            "nodot",         // missing separator
            "Kafka.offset",  // uppercase first char
            "kafka.Offset",  // uppercase in second segment
            "kafka..offset", // empty middle segment
            ".leading",      // empty leading segment
            "trailing.",     // empty trailing segment
            "1kafka.offset", // segment must start with a letter
            "kafka.1offset", // second segment must start with a letter
            "kafka.off_set", // underscore not permitted
        ];
        for s in invalid.iter().copied() {
            assert!(!is_dotted_namespace_scheme(s), "should reject {s:?}");
        }
    }

    // ── try_structured / structured ────────────────────────────────────────

    #[test]
    fn try_structured_ok_inserts_scheme_and_extra() {
        let extra: serde_json::Map<String, serde_json::Value> =
            std::iter::once((String::from("offset"), json!(42))).collect();
        let dr = DataRef::try_structured(DataRefType::RawData, "kafka.offset", extra).unwrap();
        assert!(dr.embedded.is_none(), "structured locator has no embedded");
        match &dr.location {
            Some(Location::Structured(map)) => {
                assert_eq!(map["scheme"], json!("kafka.offset"));
                assert_eq!(map["offset"], json!(42));
            }
            other => panic!("expected structured location, got {other:?}"),
        }
    }

    #[test]
    fn try_structured_rejects_bad_scheme() {
        let outcome =
            DataRef::try_structured(DataRefType::RawData, "nodot", serde_json::Map::new());
        let err = outcome.unwrap_err();
        assert!(
            matches!(err, acdp_primitives::error::AcdpError::SchemaViolation(_)),
            "bad scheme must be SchemaViolation, got {err:?}"
        );
    }

    #[test]
    fn structured_inserts_scheme_for_valid_input() {
        // The scheme is valid, so the debug_assert! in `structured` stays quiet.
        let dr = DataRef::structured(DataRefType::RawData, "ipfs.cid", serde_json::Map::new());
        match &dr.location {
            Some(Location::Structured(map)) => assert_eq!(map["scheme"], json!("ipfs.cid")),
            other => panic!("expected structured location, got {other:?}"),
        }
    }

    // ── URI constructors ───────────────────────────────────────────────────

    #[test]
    fn uri_constructor_sets_location_without_hash() {
        let dr = DataRef::uri(DataRefType::PrimaryResult, "https://x.example/d");
        assert_eq!(dr.ref_type, DataRefType::PrimaryResult);
        assert!(matches!(
            &dr.location,
            Some(Location::Uri(u)) if u.as_str() == "https://x.example/d"
        ));
        assert!(dr.content_hash.is_none());
        assert!(dr.embedded.is_none());
    }

    #[test]
    fn uri_verified_carries_content_hash() {
        let hash = ContentHash(
            "sha256:f170150ddbf59d99794e7797824591b374d459782084597b644ecc57a41031b5".into(),
        );
        let expected = hash.clone();
        let dr = DataRef::uri_verified(DataRefType::RawData, "https://x/d", hash);
        assert_eq!(dr.content_hash, Some(expected));
        assert!(matches!(dr.location, Some(Location::Uri(_))));
    }

    #[test]
    fn type_bound_uri_shortcuts_pick_the_right_type() {
        let cases = [
            (DataRef::primary_result_uri("u"), DataRefType::PrimaryResult),
            (DataRef::raw_data_uri("u"), DataRefType::RawData),
            (DataRef::supporting_info_uri("u"), DataRefType::SupportingInfo),
            (DataRef::derived_data_uri("u"), DataRefType::DerivedData),
        ];
        for (dr, expected) in cases {
            assert_eq!(dr.ref_type, expected);
        }
    }

    // ── Embedded constructors ──────────────────────────────────────────────

    #[test]
    fn embedded_json_sets_json_encoding_and_format() {
        let dr = DataRef::embedded_json(DataRefType::PrimaryResult, json!({"k": 1}));
        let e = embedded_of(&dr);
        assert_eq!(e.encoding, EmbeddedEncoding::Json);
        assert_eq!(e.content, json!({"k": 1}));
        assert_eq!(dr.format.as_deref(), Some("application/json"));
        assert!(dr.location.is_none(), "embedded ref has no location");
    }

    #[test]
    fn embedded_utf8_stores_text_as_json_string() {
        let dr = DataRef::embedded_utf8(DataRefType::SupportingInfo, "hello");
        let e = embedded_of(&dr);
        assert_eq!(e.encoding, EmbeddedEncoding::Utf8);
        assert_eq!(e.content, json!("hello"));
    }

    #[test]
    fn embedded_base64_stores_payload_as_json_string() {
        let dr = DataRef::embedded_base64(DataRefType::DerivedData, "aGVsbG8=");
        let e = embedded_of(&dr);
        assert_eq!(e.encoding, EmbeddedEncoding::Base64);
        assert_eq!(e.content, json!("aGVsbG8="));
    }

    #[test]
    fn type_bound_json_shortcuts_pick_the_right_type() {
        let cases = [
            (
                DataRef::primary_result_json(json!(1)),
                DataRefType::PrimaryResult,
            ),
            (
                DataRef::derived_data_json(json!(1)),
                DataRefType::DerivedData,
            ),
        ];
        for (dr, expected) in cases {
            assert_eq!(dr.ref_type, expected);
        }
    }

    // ── Wire shape ─────────────────────────────────────────────────────────

    #[test]
    fn data_ref_type_serializes_snake_case() {
        let cases = [
            (DataRefType::PrimaryResult, "primary_result"),
            (DataRefType::RawData, "raw_data"),
            (DataRefType::SupportingInfo, "supporting_info"),
            (DataRefType::DerivedData, "derived_data"),
        ];
        for (variant, wire) in cases {
            assert_eq!(serde_json::to_value(variant).unwrap(), json!(wire));
        }
    }

    #[test]
    fn embedded_content_rejects_unknown_field() {
        // `deny_unknown_fields` on EmbeddedContent must refuse the extra key.
        let raw = json!({"encoding": "utf8", "content": "x", "surprise": 1});
        assert!(
            serde_json::from_value::<EmbeddedContent>(raw).is_err(),
            "unknown field must be rejected"
        );
    }

    #[test]
    fn constructed_uri_ref_round_trips_through_json() {
        let v = serde_json::to_value(DataRef::uri(DataRefType::PrimaryResult, "https://x/d"))
            .unwrap();
        // The `type` rename is applied and `None` fields are left out.
        assert_eq!(v["type"], json!("primary_result"));
        assert_eq!(v["location"], json!("https://x/d"));
        assert!(!v.as_object().unwrap().contains_key("embedded"));
        let back: DataRef = serde_json::from_value(v).unwrap();
        assert_eq!(back.ref_type, DataRefType::PrimaryResult);
    }
}
545}