Skip to main content

uor_addr/schema/
document.rs

//! **`uor_addr::schema::document` — Document content-addressing**
//! (ARCHITECTURE.md "Schema-pinned descendants" § `uor-addr-document`).
//!
//! Schema-pinned descendant of [`crate::json`]. **Imports
//! schema.org's `Article` type** (extending `CreativeWork`) — the
//! host-boundary parser admits only JSON-LD values conforming to
//! schema.org's published Article taxon.
//!
//! # `no_std` + `no_alloc`
//!
//! Schema admission walks the parsed [`crate::json::JsonValue`]'s
//! tagged bytes via [`crate::json::JsonValueRef`]. No `serde_json`,
//! no allocator.
//!
//! # Authoritative sources
//!
//! - **schema.org Article type** — <https://schema.org/Article>.
//! - **JSON-LD 1.1** — W3C REC — <https://www.w3.org/TR/json-ld11/>.
//!
//! # Admission predicate
//!
//! The root value must be a JSON object for which all of the following hold:
//!
//! 1. `@context` is the string `"https://schema.org"` or `"http://schema.org"`.
//! 2. `@type` is the string `"Article"` or one of its admissible subtypes
//!    (see [`ARTICLE_TYPES`]).
//! 3. `headline` — string.
//! 4. `author` — string, Person/Organization object (with a string `name`),
//!    or non-empty array of either.
//! 5. `datePublished` — non-empty string. The value is expected to be an
//!    ISO 8601 / RFC 3339 date, but only non-emptiness is checked.
28
29use prism::pipeline::{ShapeViolation, ViolationKind};
30
31use crate::json::{JsonValue, JsonValueRef};
32
33const DOC_SCHEMA_VIOLATION: ShapeViolation = ShapeViolation {
34    shape_iri: "https://schema.org/Article",
35    constraint_iri: "https://schema.org/Article/schemaOrgConformance",
36    property_iri: "https://schema.org/Article",
37    expected_range: "https://schema.org/Article",
38    min_count: 0,
39    max_count: 1,
40    kind: ViolationKind::ValueCheck,
41};
42
/// Admissible `@context` values, compared byte-for-byte against the
/// document's `@context` string (no trailing slash, no case folding).
pub const SCHEMA_ORG_CONTEXTS: &[&[u8]] = &[
    b"https://schema.org",
    b"http://schema.org",
];
44
/// `@type` values admitted by this module: `Article` itself followed by the
/// subtypes this module accepts (see <https://schema.org/Article>).
///
/// Entries are matched byte-for-byte against the document's `@type` string.
pub const ARTICLE_TYPES: &[&[u8]] = &[
    b"Article",
    b"NewsArticle",
    b"Report",
    b"ScholarlyArticle",
    b"SocialMediaPosting",
    b"TechArticle",
    b"BlogPosting",
    b"AdvertiserContentArticle",
    b"AnalysisNewsArticle",
    b"AskPublicNewsArticle",
    b"BackgroundNewsArticle",
    b"OpinionNewsArticle",
    b"ReportageNewsArticle",
    b"ReviewNewsArticle",
    b"SatiricalArticle",
];
64
/// Keys that must be present on the root object for a document to be
/// admitted, in the order the admission predicate lists them.
pub const REQUIRED_PROPERTIES: &[&[u8]] = &[
    b"@context",
    b"@type",
    b"headline",
    b"author",
    b"datePublished",
];
72
73#[derive(Clone)]
74pub struct DocumentValue {
75    inner: JsonValue,
76}
77
78impl core::fmt::Debug for DocumentValue {
79    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
80        f.debug_struct("DocumentValue").finish_non_exhaustive()
81    }
82}
83
84impl DocumentValue {
85    pub fn parse(raw: &[u8]) -> Result<Self, ShapeViolation> {
86        let inner = JsonValue::parse(raw).map_err(|_| DOC_SCHEMA_VIOLATION)?;
87        let root = JsonValueRef::root(&inner);
88        if !root.is_object() {
89            return Err(DOC_SCHEMA_VIOLATION);
90        }
91        // @context
92        let context = root
93            .get(b"@context")
94            .and_then(|v| v.as_str())
95            .ok_or(DOC_SCHEMA_VIOLATION)?;
96        if !SCHEMA_ORG_CONTEXTS.contains(&context) {
97            return Err(DOC_SCHEMA_VIOLATION);
98        }
99        // @type
100        let ty = root
101            .get(b"@type")
102            .and_then(|v| v.as_str())
103            .ok_or(DOC_SCHEMA_VIOLATION)?;
104        if !ARTICLE_TYPES.contains(&ty) {
105            return Err(DOC_SCHEMA_VIOLATION);
106        }
107        // headline
108        let _ = root
109            .get(b"headline")
110            .and_then(|v| v.as_str())
111            .ok_or(DOC_SCHEMA_VIOLATION)?;
112        // author
113        validate_author(root.get(b"author"))?;
114        // datePublished
115        let date = root
116            .get(b"datePublished")
117            .and_then(|v| v.as_str())
118            .ok_or(DOC_SCHEMA_VIOLATION)?;
119        if date.is_empty() {
120            return Err(DOC_SCHEMA_VIOLATION);
121        }
122        Ok(Self { inner })
123    }
124
125    #[must_use]
126    pub fn tagged_bytes(&self) -> &[u8] {
127        self.inner.tagged_bytes()
128    }
129}
130
131pub fn address(raw: &[u8]) -> Result<crate::AddressOutcome<71>, AddressFailure> {
132    DocumentValue::parse(raw).map_err(|_| AddressFailure::SchemaViolation)?;
133    crate::json::address(raw).map_err(|e| match e {
134        crate::json::AddressFailure::InvalidJson => AddressFailure::SchemaViolation,
135        crate::json::AddressFailure::PipelineFailure => AddressFailure::PipelineFailure,
136    })
137}
138
139/// As [`address`], but binds the `blake3` σ-axis ([`crate::hash`]). Schema
140/// admission is identical; only the κ-derivation hash differs.
141///
142/// # Errors
143///
144/// As [`address`].
145pub fn address_blake3(raw: &[u8]) -> Result<crate::AddressOutcome<71>, AddressFailure> {
146    DocumentValue::parse(raw).map_err(|_| AddressFailure::SchemaViolation)?;
147    crate::json::address_blake3(raw).map_err(|e| match e {
148        crate::json::AddressFailure::InvalidJson => AddressFailure::SchemaViolation,
149        crate::json::AddressFailure::PipelineFailure => AddressFailure::PipelineFailure,
150    })
151}
152
153/// As [`address`], but binds the `sha3_256` σ-axis ([`crate::hash`]). Schema
154/// admission is identical; only the κ-derivation hash differs.
155///
156/// # Errors
157///
158/// As [`address`].
159pub fn address_sha3_256(raw: &[u8]) -> Result<crate::AddressOutcome<73>, AddressFailure> {
160    DocumentValue::parse(raw).map_err(|_| AddressFailure::SchemaViolation)?;
161    crate::json::address_sha3_256(raw).map_err(|e| match e {
162        crate::json::AddressFailure::InvalidJson => AddressFailure::SchemaViolation,
163        crate::json::AddressFailure::PipelineFailure => AddressFailure::PipelineFailure,
164    })
165}
166
167/// As [`address`], but binds the `keccak256` σ-axis ([`crate::hash`]). Schema
168/// admission is identical; only the κ-derivation hash differs.
169///
170/// # Errors
171///
172/// As [`address`].
173pub fn address_keccak256(raw: &[u8]) -> Result<crate::AddressOutcome<74>, AddressFailure> {
174    DocumentValue::parse(raw).map_err(|_| AddressFailure::SchemaViolation)?;
175    crate::json::address_keccak256(raw).map_err(|e| match e {
176        crate::json::AddressFailure::InvalidJson => AddressFailure::SchemaViolation,
177        crate::json::AddressFailure::PipelineFailure => AddressFailure::PipelineFailure,
178    })
179}
180
181/// As [`address`], but binds the `sha512` σ-axis ([`crate::hash`]).
182///
183/// # Errors
184///
185/// As [`address`].
186pub fn address_sha512(raw: &[u8]) -> Result<crate::AddressOutcome<135, 64>, AddressFailure> {
187    DocumentValue::parse(raw).map_err(|_| AddressFailure::SchemaViolation)?;
188    crate::json::address_sha512(raw).map_err(|e| match e {
189        crate::json::AddressFailure::InvalidJson => AddressFailure::SchemaViolation,
190        crate::json::AddressFailure::PipelineFailure => AddressFailure::PipelineFailure,
191    })
192}
193
/// Reasons the document addressing entry points in this module can fail.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AddressFailure {
    /// The input was not admitted as a schema.org Article document; also
    /// used when the JSON layer reports the input as invalid JSON.
    SchemaViolation,
    /// The input was admitted, but the underlying JSON pipeline failed.
    PipelineFailure,
}
199
/// Canonicalizes `raw` after admitting it as a schema.org Article document.
///
/// **Available only under the `alloc` feature.**
///
/// # Errors
///
/// - [`AddressFailure::SchemaViolation`] if `raw` fails schema admission.
/// - [`AddressFailure::PipelineFailure`] if [`crate::json::canonicalize`]
///   fails for any reason.
#[cfg(feature = "alloc")]
pub fn canonicalize(raw: &[u8]) -> Result<alloc::vec::Vec<u8>, AddressFailure> {
    extern crate alloc;
    if DocumentValue::parse(raw).is_err() {
        return Err(AddressFailure::SchemaViolation);
    }
    match crate::json::canonicalize(raw) {
        Ok(canonical) => Ok(canonical),
        Err(_) => Err(AddressFailure::PipelineFailure),
    }
}
207
208fn validate_author(value: Option<JsonValueRef<'_>>) -> Result<(), ShapeViolation> {
209    let v = value.ok_or(DOC_SCHEMA_VIOLATION)?;
210    if v.as_str().is_some() {
211        return Ok(());
212    }
213    if v.is_object() {
214        return validate_author_object(v);
215    }
216    if let Some(iter) = v.iter_array() {
217        let mut count = 0;
218        for item in iter {
219            validate_author_item(item)?;
220            count += 1;
221        }
222        if count == 0 {
223            return Err(DOC_SCHEMA_VIOLATION);
224        }
225        return Ok(());
226    }
227    Err(DOC_SCHEMA_VIOLATION)
228}
229
230fn validate_author_item(value: JsonValueRef<'_>) -> Result<(), ShapeViolation> {
231    if value.as_str().is_some() {
232        return Ok(());
233    }
234    if value.is_object() {
235        return validate_author_object(value);
236    }
237    Err(DOC_SCHEMA_VIOLATION)
238}
239
240fn validate_author_object(value: JsonValueRef<'_>) -> Result<(), ShapeViolation> {
241    let at = value
242        .get(b"@type")
243        .and_then(|v| v.as_str())
244        .ok_or(DOC_SCHEMA_VIOLATION)?;
245    if at != b"Person" && at != b"Organization" {
246        return Err(DOC_SCHEMA_VIOLATION);
247    }
248    let _ = value
249        .get(b"name")
250        .and_then(|v| v.as_str())
251        .ok_or(DOC_SCHEMA_VIOLATION)?;
252    Ok(())
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal document that satisfies every clause of the admission
    /// predicate, with the author given as a Person object.
    const VALID_ARTICLE: &[u8] = br#"{
        "@context": "https://schema.org",
        "@type": "Article",
        "headline": "On Typed Content Addressing",
        "author": {"@type": "Person", "name": "Ada Lovelace"},
        "datePublished": "2025-01-15"
    }"#;

    /// Asserts that `raw` is refused and that the reported violation carries
    /// the module's constraint IRI. `why` is the panic message used if the
    /// document is unexpectedly admitted.
    fn assert_rejected(raw: &[u8], why: &str) {
        let err = DocumentValue::parse(raw).expect_err(why);
        assert_eq!(err.constraint_iri, DOC_SCHEMA_VIOLATION.constraint_iri);
    }

    #[test]
    fn admits_valid_schema_org_article() {
        let d = DocumentValue::parse(VALID_ARTICLE).expect("valid");
        assert!(!d.tagged_bytes().is_empty());
    }

    #[test]
    fn admits_scholarly_article_subtype() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "ScholarlyArticle",
            "headline": "P vs. NP",
            "author": "Anonymous",
            "datePublished": "2025-01-15T12:00:00Z"
        }"#;
        DocumentValue::parse(raw).expect("valid");
    }

    #[test]
    fn admits_news_article_subtype() {
        let raw = br#"{
            "@context": "http://schema.org",
            "@type": "NewsArticle",
            "headline": "Breaking news",
            "author": "Newsdesk",
            "datePublished": "2025-01-15"
        }"#;
        DocumentValue::parse(raw).expect("valid");
    }

    #[test]
    fn rejects_non_schema_org_context() {
        let raw = br#"{
            "@context": "https://example.org",
            "@type": "Article",
            "headline": "x",
            "author": "y",
            "datePublished": "2025-01-15"
        }"#;
        assert_rejected(raw, "not schema.org");
    }

    #[test]
    fn rejects_non_article_type() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "Photograph",
            "headline": "x",
            "author": "y",
            "datePublished": "2025-01-15"
        }"#;
        assert_rejected(raw, "not Article");
    }

    #[test]
    fn rejects_missing_headline() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "Article",
            "author": "y",
            "datePublished": "2025-01-15"
        }"#;
        assert_rejected(raw, "missing headline");
    }

    /// Admission must not change the address: the document address equals
    /// the address of the same bytes computed by the plain JSON layer.
    #[test]
    fn address_matches_json_realization() {
        let from_doc = address(VALID_ARTICLE).expect("κ-label").address;
        let from_json = crate::json::address(VALID_ARTICLE)
            .expect("κ-label")
            .address;
        assert_eq!(from_doc, from_json);
    }
}