1use prism::pipeline::{ShapeViolation, ViolationKind};
30
31use crate::json::{JsonValue, JsonValueRef};
32
33const DOC_SCHEMA_VIOLATION: ShapeViolation = ShapeViolation {
34 shape_iri: "https://schema.org/Article",
35 constraint_iri: "https://schema.org/Article/schemaOrgConformance",
36 property_iri: "https://schema.org/Article",
37 expected_range: "https://schema.org/Article",
38 min_count: 0,
39 max_count: 1,
40 kind: ViolationKind::ValueCheck,
41};
42
/// `@context` values accepted by [`DocumentValue::parse`].
///
/// The comparison is an exact byte match, so only these two spellings
/// (HTTPS and HTTP, no trailing slash) are admitted.
pub const SCHEMA_ORG_CONTEXTS: &[&[u8]] = &[
    b"https://schema.org",
    b"http://schema.org",
];
44
/// `@type` values accepted by [`DocumentValue::parse`].
///
/// Matching is an exact, case-sensitive byte comparison against this list;
/// any `@type` not listed here is rejected.
pub const ARTICLE_TYPES: &[&[u8]] = &[
    // Base type and its direct subtypes.
    b"Article",
    b"NewsArticle",
    b"Report",
    b"ScholarlyArticle",
    b"SocialMediaPosting",
    b"TechArticle",
    b"BlogPosting",
    b"AdvertiserContentArticle",
    // NewsArticle refinements.
    b"AnalysisNewsArticle",
    b"AskPublicNewsArticle",
    b"BackgroundNewsArticle",
    b"OpinionNewsArticle",
    b"ReportageNewsArticle",
    b"ReviewNewsArticle",
    b"SatiricalArticle",
];
64
/// Names of the top-level properties an admitted document must carry.
///
/// This list mirrors the properties that [`DocumentValue::parse`] checks;
/// `parse` itself looks each key up by literal rather than iterating here.
pub const REQUIRED_PROPERTIES: &[&[u8]] = &[
    b"@context",
    b"@type",
    b"headline",
    b"author",
    b"datePublished",
];
72
73#[derive(Clone)]
74pub struct DocumentValue {
75 inner: JsonValue,
76}
77
78impl core::fmt::Debug for DocumentValue {
79 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
80 f.debug_struct("DocumentValue").finish_non_exhaustive()
81 }
82}
83
84impl DocumentValue {
85 pub fn parse(raw: &[u8]) -> Result<Self, ShapeViolation> {
86 let inner = JsonValue::parse(raw).map_err(|_| DOC_SCHEMA_VIOLATION)?;
87 let root = JsonValueRef::root(&inner);
88 if !root.is_object() {
89 return Err(DOC_SCHEMA_VIOLATION);
90 }
91 let context = root
93 .get(b"@context")
94 .and_then(|v| v.as_str())
95 .ok_or(DOC_SCHEMA_VIOLATION)?;
96 if !SCHEMA_ORG_CONTEXTS.contains(&context) {
97 return Err(DOC_SCHEMA_VIOLATION);
98 }
99 let ty = root
101 .get(b"@type")
102 .and_then(|v| v.as_str())
103 .ok_or(DOC_SCHEMA_VIOLATION)?;
104 if !ARTICLE_TYPES.contains(&ty) {
105 return Err(DOC_SCHEMA_VIOLATION);
106 }
107 let _ = root
109 .get(b"headline")
110 .and_then(|v| v.as_str())
111 .ok_or(DOC_SCHEMA_VIOLATION)?;
112 validate_author(root.get(b"author"))?;
114 let date = root
116 .get(b"datePublished")
117 .and_then(|v| v.as_str())
118 .ok_or(DOC_SCHEMA_VIOLATION)?;
119 if date.is_empty() {
120 return Err(DOC_SCHEMA_VIOLATION);
121 }
122 Ok(Self { inner })
123 }
124
125 #[must_use]
126 pub fn tagged_bytes(&self) -> &[u8] {
127 self.inner.tagged_bytes()
128 }
129}
130
131pub fn address(raw: &[u8]) -> Result<crate::AddressOutcome<71>, AddressFailure> {
132 DocumentValue::parse(raw).map_err(|_| AddressFailure::SchemaViolation)?;
133 crate::json::address(raw).map_err(|e| match e {
134 crate::json::AddressFailure::InvalidJson => AddressFailure::SchemaViolation,
135 crate::json::AddressFailure::PipelineFailure => AddressFailure::PipelineFailure,
136 })
137}
138
139pub fn address_blake3(raw: &[u8]) -> Result<crate::AddressOutcome<71>, AddressFailure> {
146 DocumentValue::parse(raw).map_err(|_| AddressFailure::SchemaViolation)?;
147 crate::json::address_blake3(raw).map_err(|e| match e {
148 crate::json::AddressFailure::InvalidJson => AddressFailure::SchemaViolation,
149 crate::json::AddressFailure::PipelineFailure => AddressFailure::PipelineFailure,
150 })
151}
152
153pub fn address_sha3_256(raw: &[u8]) -> Result<crate::AddressOutcome<73>, AddressFailure> {
160 DocumentValue::parse(raw).map_err(|_| AddressFailure::SchemaViolation)?;
161 crate::json::address_sha3_256(raw).map_err(|e| match e {
162 crate::json::AddressFailure::InvalidJson => AddressFailure::SchemaViolation,
163 crate::json::AddressFailure::PipelineFailure => AddressFailure::PipelineFailure,
164 })
165}
166
167pub fn address_keccak256(raw: &[u8]) -> Result<crate::AddressOutcome<74>, AddressFailure> {
174 DocumentValue::parse(raw).map_err(|_| AddressFailure::SchemaViolation)?;
175 crate::json::address_keccak256(raw).map_err(|e| match e {
176 crate::json::AddressFailure::InvalidJson => AddressFailure::SchemaViolation,
177 crate::json::AddressFailure::PipelineFailure => AddressFailure::PipelineFailure,
178 })
179}
180
181pub fn address_sha512(raw: &[u8]) -> Result<crate::AddressOutcome<135, 64>, AddressFailure> {
187 DocumentValue::parse(raw).map_err(|_| AddressFailure::SchemaViolation)?;
188 crate::json::address_sha512(raw).map_err(|e| match e {
189 crate::json::AddressFailure::InvalidJson => AddressFailure::SchemaViolation,
190 crate::json::AddressFailure::PipelineFailure => AddressFailure::PipelineFailure,
191 })
192}
193
/// Why an addressing or canonicalization call in this module failed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AddressFailure {
    /// The input was rejected by [`DocumentValue::parse`], or the JSON layer
    /// reported the bytes as invalid JSON.
    SchemaViolation,
    /// The input was admitted but the underlying JSON-layer call failed.
    PipelineFailure,
}
199
/// Validates `raw` with [`DocumentValue::parse`], then returns whatever
/// `crate::json::canonicalize` produces for the same bytes.
///
/// Only compiled when the `alloc` feature is enabled.
///
/// # Errors
///
/// * [`AddressFailure::SchemaViolation`] if `raw` is rejected by
///   `DocumentValue::parse`.
/// * [`AddressFailure::PipelineFailure`] for any error from
///   `crate::json::canonicalize`.
#[cfg(feature = "alloc")]
pub fn canonicalize(raw: &[u8]) -> Result<alloc::vec::Vec<u8>, AddressFailure> {
    extern crate alloc;
    if DocumentValue::parse(raw).is_err() {
        return Err(AddressFailure::SchemaViolation);
    }
    match crate::json::canonicalize(raw) {
        Ok(bytes) => Ok(bytes),
        Err(_) => Err(AddressFailure::PipelineFailure),
    }
}
207
208fn validate_author(value: Option<JsonValueRef<'_>>) -> Result<(), ShapeViolation> {
209 let v = value.ok_or(DOC_SCHEMA_VIOLATION)?;
210 if v.as_str().is_some() {
211 return Ok(());
212 }
213 if v.is_object() {
214 return validate_author_object(v);
215 }
216 if let Some(iter) = v.iter_array() {
217 let mut count = 0;
218 for item in iter {
219 validate_author_item(item)?;
220 count += 1;
221 }
222 if count == 0 {
223 return Err(DOC_SCHEMA_VIOLATION);
224 }
225 return Ok(());
226 }
227 Err(DOC_SCHEMA_VIOLATION)
228}
229
230fn validate_author_item(value: JsonValueRef<'_>) -> Result<(), ShapeViolation> {
231 if value.as_str().is_some() {
232 return Ok(());
233 }
234 if value.is_object() {
235 return validate_author_object(value);
236 }
237 Err(DOC_SCHEMA_VIOLATION)
238}
239
240fn validate_author_object(value: JsonValueRef<'_>) -> Result<(), ShapeViolation> {
241 let at = value
242 .get(b"@type")
243 .and_then(|v| v.as_str())
244 .ok_or(DOC_SCHEMA_VIOLATION)?;
245 if at != b"Person" && at != b"Organization" {
246 return Err(DOC_SCHEMA_VIOLATION);
247 }
248 let _ = value
249 .get(b"name")
250 .and_then(|v| v.as_str())
251 .ok_or(DOC_SCHEMA_VIOLATION)?;
252 Ok(())
253}
254
/// Unit tests for document admission and for the addressing wrappers.
///
/// Besides the happy paths and the context/type/headline rejections, these
/// cover every branch of the author validators, the empty-`datePublished`
/// rejection, non-object and malformed roots, and the error mapping of
/// `address`.
#[cfg(test)]
mod tests {
    use super::*;

    const VALID_ARTICLE: &[u8] = br#"{
        "@context": "https://schema.org",
        "@type": "Article",
        "headline": "On Typed Content Addressing",
        "author": {"@type": "Person", "name": "Ada Lovelace"},
        "datePublished": "2025-01-15"
    }"#;

    #[test]
    fn admits_valid_schema_org_article() {
        let d = DocumentValue::parse(VALID_ARTICLE).expect("valid");
        assert!(!d.tagged_bytes().is_empty());
    }

    #[test]
    fn admits_scholarly_article_subtype() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "ScholarlyArticle",
            "headline": "P vs. NP",
            "author": "Anonymous",
            "datePublished": "2025-01-15T12:00:00Z"
        }"#;
        DocumentValue::parse(raw).expect("valid");
    }

    #[test]
    fn admits_news_article_subtype() {
        let raw = br#"{
            "@context": "http://schema.org",
            "@type": "NewsArticle",
            "headline": "Breaking news",
            "author": "Newsdesk",
            "datePublished": "2025-01-15"
        }"#;
        DocumentValue::parse(raw).expect("valid");
    }

    #[test]
    fn admits_author_array_of_strings_and_objects() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "Article",
            "headline": "x",
            "author": [
                "Newsdesk",
                {"@type": "Organization", "name": "Example Press"}
            ],
            "datePublished": "2025-01-15"
        }"#;
        DocumentValue::parse(raw).expect("valid");
    }

    #[test]
    fn rejects_non_schema_org_context() {
        let raw = br#"{
            "@context": "https://example.org",
            "@type": "Article",
            "headline": "x",
            "author": "y",
            "datePublished": "2025-01-15"
        }"#;
        let err = DocumentValue::parse(raw).expect_err("not schema.org");
        assert_eq!(err.constraint_iri, DOC_SCHEMA_VIOLATION.constraint_iri);
    }

    #[test]
    fn rejects_non_article_type() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "Photograph",
            "headline": "x",
            "author": "y",
            "datePublished": "2025-01-15"
        }"#;
        let err = DocumentValue::parse(raw).expect_err("not Article");
        assert_eq!(err.constraint_iri, DOC_SCHEMA_VIOLATION.constraint_iri);
    }

    #[test]
    fn rejects_missing_headline() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "Article",
            "author": "y",
            "datePublished": "2025-01-15"
        }"#;
        let err = DocumentValue::parse(raw).expect_err("missing headline");
        assert_eq!(err.constraint_iri, DOC_SCHEMA_VIOLATION.constraint_iri);
    }

    #[test]
    fn rejects_non_object_root() {
        let raw = br#"["https://schema.org"]"#;
        assert!(DocumentValue::parse(raw).is_err());
    }

    #[test]
    fn rejects_malformed_json() {
        assert!(DocumentValue::parse(b"{not json").is_err());
    }

    #[test]
    fn rejects_missing_author() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "Article",
            "headline": "x",
            "datePublished": "2025-01-15"
        }"#;
        assert!(DocumentValue::parse(raw).is_err());
    }

    #[test]
    fn rejects_empty_author_array() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "Article",
            "headline": "x",
            "author": [],
            "datePublished": "2025-01-15"
        }"#;
        assert!(DocumentValue::parse(raw).is_err());
    }

    #[test]
    fn rejects_non_string_non_object_author_item() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "Article",
            "headline": "x",
            "author": ["Newsdesk", 42],
            "datePublished": "2025-01-15"
        }"#;
        assert!(DocumentValue::parse(raw).is_err());
    }

    #[test]
    fn rejects_author_object_with_unknown_type() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "Article",
            "headline": "x",
            "author": {"@type": "Robot", "name": "R2"},
            "datePublished": "2025-01-15"
        }"#;
        assert!(DocumentValue::parse(raw).is_err());
    }

    #[test]
    fn rejects_author_object_without_name() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "Article",
            "headline": "x",
            "author": {"@type": "Person"},
            "datePublished": "2025-01-15"
        }"#;
        assert!(DocumentValue::parse(raw).is_err());
    }

    #[test]
    fn rejects_empty_date_published() {
        let raw = br#"{
            "@context": "https://schema.org",
            "@type": "Article",
            "headline": "x",
            "author": "y",
            "datePublished": ""
        }"#;
        assert!(DocumentValue::parse(raw).is_err());
    }

    #[test]
    fn address_matches_json_realization() {
        let from_doc = address(VALID_ARTICLE).expect("κ-label").address;
        let from_json = crate::json::address(VALID_ARTICLE)
            .expect("κ-label")
            .address;
        assert_eq!(from_doc, from_json);
    }

    #[test]
    fn address_reports_schema_violation_for_rejected_input() {
        // Valid JSON, but not an admissible Article.
        assert!(matches!(
            address(b"{}"),
            Err(AddressFailure::SchemaViolation)
        ));
    }
}