commonmeta/
schema_utils.rs1use std::fs;
4use std::path::{Path, PathBuf};
5use std::sync::{Arc, OnceLock};
6
7use fastxml::schema::fetcher::{FetchResult, FileFetcher, SchemaFetcher};
8use fastxml::schema::{Schema, Validator};
9use fastxml::schema::fetcher::error::FetchError;
10use serde_json::Value;
11
12use crate::error::{Error, Result};
13
14pub const SCHEMA_VERSION: &str = "commonmeta_v1.0";
15pub const DEFAULT_SCHEMA: &str = "commonmeta";
16pub const SCHEMA_JSON: &str = include_str!("../resources/commonmeta_v1.0.json");
17
18const SCHEMATA: &[&str] = &[
20 DEFAULT_SCHEMA,
21 "cff",
22 "crossref_xml",
23 "csl",
24 "datacite",
25 "inveniordm",
26 "schema_org",
27];
28
29pub fn known_schemata() -> &'static [&'static str] {
31 SCHEMATA
32}
33
34pub fn json_schema_errors(document: &[u8], schema: Option<&str>) -> Result<()> {
38 let schema_name = schema.unwrap_or(DEFAULT_SCHEMA);
39 let Some(schema_file) = schema_file_name(schema_name) else {
40 return Err(Error::UnsupportedFormat(format!(
41 "schema '{schema_name}' not found"
42 )));
43 };
44
45 let schema_text = load_schema(schema_file)?;
46
47 let schema_json: Value = serde_json::from_str(&schema_text)
48 .map_err(|_| Error::Parse(format!("invalid JSON in schema file: {schema_file}.json")))?;
49 let document_json: Value =
50 serde_json::from_slice(document).map_err(|e| Error::Parse(e.to_string()))?;
51
52 let validation_schema = effective_validation_schema(&schema_json);
53
54 let compiled =
55 jsonschema::validator_for(&validation_schema).map_err(|e| Error::Parse(e.to_string()))?;
56
57 let raw_errors: Vec<jsonschema::ValidationError<'_>> =
58 compiled.iter_errors(&document_json).collect();
59
60 if raw_errors.is_empty() {
61 return Ok(());
62 }
63
64 let messages = collect_leaf_errors(&raw_errors);
65 Err(Error::Parse(format!(
66 "json schema validation failed ({} errors): {}",
67 messages.len(),
68 messages.join("; ")
69 )))
70}
71
72pub(crate) fn collect_leaf_errors(errs: &[jsonschema::ValidationError<'_>]) -> Vec<String> {
81 use jsonschema::error::ValidationErrorKind;
82 let mut out = Vec::new();
83 for e in errs {
84 match e.kind() {
85 ValidationErrorKind::AnyOf { context }
86 | ValidationErrorKind::OneOfNotValid { context } => {
87 let useful: Vec<&Vec<jsonschema::ValidationError<'static>>> = context
88 .iter()
89 .filter(|branch| {
90 !(branch.len() == 1
91 && matches!(branch[0].kind(), ValidationErrorKind::Type { .. }))
92 })
93 .collect();
94 let branches = if useful.is_empty() {
95 context.iter().collect()
96 } else {
97 useful
98 };
99 for branch in branches {
100 out.extend(collect_leaf_errors(branch));
101 }
102 }
103 _ => {
104 let path = e.instance_path().to_string();
105 let msg = match e.kind() {
106 ValidationErrorKind::Enum { options } => {
107 format!("value {} not in enum: {options}", e.instance())
108 }
109 other => format_error_kind(other),
110 };
111 out.push(if path.is_empty() {
112 msg
113 } else {
114 format!("{path}: {msg}")
115 });
116 }
117 }
118 }
119 out
120}
121
122fn format_error_kind(kind: &jsonschema::error::ValidationErrorKind) -> String {
123 use jsonschema::error::ValidationErrorKind;
124 match kind {
125 ValidationErrorKind::AdditionalProperties { unexpected } => {
126 format!("unexpected properties: {}", unexpected.join(", "))
127 }
128 ValidationErrorKind::Required { property } => {
129 let name = if let Some(s) = property.as_str() {
130 s.to_string()
131 } else {
132 property.to_string().trim_matches('"').to_string()
133 };
134 format!("required property '{name}' is missing")
135 }
136 ValidationErrorKind::Type { kind } => format!("wrong type: expected {kind:?}"),
137 ValidationErrorKind::Format { format } => {
138 format!("value does not match format '{format}'")
139 }
140 ValidationErrorKind::Pattern { pattern } => {
141 format!("value does not match pattern '{pattern}'")
142 }
143 ValidationErrorKind::UniqueItems => "array contains duplicate items".to_string(),
144 ValidationErrorKind::MinItems { limit } => {
145 format!("array has fewer than {limit} items")
146 }
147 ValidationErrorKind::MaxItems { limit } => {
148 format!("array has more than {limit} items")
149 }
150 ValidationErrorKind::Minimum { limit } => format!("value is less than minimum {limit}"),
151 ValidationErrorKind::Maximum { limit } => format!("value exceeds maximum {limit}"),
152 ValidationErrorKind::MinLength { limit } => {
153 format!("string shorter than {limit} characters")
154 }
155 ValidationErrorKind::MaxLength { limit } => {
156 format!("string longer than {limit} characters")
157 }
158 ValidationErrorKind::Constant { expected_value } => {
159 format!("expected constant value: {expected_value}")
160 }
161 other => format!("{other:?}"),
162 }
163}
164
165pub fn xml_schema_errors(xml: &[u8], schema: Option<&str>) -> Result<()> {
171 let schema_name = schema.unwrap_or("crossref_xml");
172
173 let compiled = match schema_name {
174 "crossref_xml" | "crossref" | "crossref-v5.4.0" => crossref_xsd_schema()?,
175 "datacite_xml" | "datacite-v4.7" => datacite_xsd_schema()?,
176 other => {
177 return Err(Error::UnsupportedFormat(format!(
178 "XSD schema '{other}' not supported"
179 )));
180 }
181 };
182
183 let report = Validator::from(xml)
184 .schema(compiled)
185 .run()
186 .map_err(|e| Error::Parse(e.to_string()))?;
187
188 if report.is_valid() {
189 return Ok(());
190 }
191
192 let errors: Vec<String> = report.errors().iter().map(|e| e.to_string()).collect();
193 Err(Error::Parse(format!(
194 "XSD validation failed ({} errors): {}",
195 errors.len(),
196 errors.join("; ")
197 )))
198}
199
200fn crossref_xsd_schema() -> Result<Arc<Schema>> {
204 static SCHEMA: OnceLock<std::result::Result<Arc<Schema>, String>> = OnceLock::new();
205
206 SCHEMA
207 .get_or_init(build_crossref_schema)
208 .as_ref()
209 .map(Arc::clone)
210 .map_err(|e| Error::Parse(e.clone()))
211}
212
213fn build_crossref_schema() -> std::result::Result<Arc<Schema>, String> {
214 let base_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
215 .join("resources")
216 .join("crossref");
217
218 let main_xsd_path = base_dir.join("crossref5.4.0.xsd");
219 let main_xsd = fs::read(&main_xsd_path)
220 .map_err(|e| format!("could not read crossref5.4.0.xsd: {e}"))?;
221
222 let fetcher = SandboxFetcher { base: FileFetcher::with_base_dir(&base_dir) };
227
228 Schema::builder()
233 .add(
234 "https://www.crossref.org/schemas/crossref5.4.0.xsd",
235 main_xsd,
236 )
237 .resolve_with(&fetcher)
238 .map(Arc::new)
239 .map_err(|e| format!("failed to compile Crossref XSD schema: {e}"))
240}
241
242fn datacite_xsd_schema() -> Result<Arc<Schema>> {
244 static SCHEMA: OnceLock<std::result::Result<Arc<Schema>, String>> = OnceLock::new();
245 SCHEMA
246 .get_or_init(build_datacite_schema)
247 .as_ref()
248 .map(Arc::clone)
249 .map_err(|e| Error::Parse(e.clone()))
250}
251
252fn build_datacite_schema() -> std::result::Result<Arc<Schema>, String> {
253 let base_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
254 .join("resources")
255 .join("datacite");
256
257 let main_xsd_path = base_dir.join("datacite-v4.xsd");
258 let main_xsd = fs::read(&main_xsd_path)
259 .map_err(|e| format!("could not read datacite-v4.xsd: {e}"))?;
260
261 let fetcher = SandboxFetcher { base: FileFetcher::with_base_dir(&base_dir) };
262
263 Schema::builder()
264 .add(
265 "https://schema.datacite.org/meta/kernel-4.7/metadata.xsd",
266 main_xsd,
267 )
268 .resolve_with(&fetcher)
269 .map(Arc::new)
270 .map_err(|e| format!("failed to compile DataCite XSD schema: {e}"))
271}
272
273struct SandboxFetcher {
276 base: FileFetcher,
277}
278
279impl SchemaFetcher for SandboxFetcher {
280 fn fetch(&self, url: &str) -> fastxml::error::Result<FetchResult> {
281 if let Ok(result) = self.base.fetch(url) {
284 return Ok(result);
285 }
286
287 if url.starts_with("http://") || url.starts_with("https://") {
290 if let Some(filename) = url.rsplit('/').next() {
291 if let Ok(result) = self.base.fetch(filename) {
292 return Ok(result);
293 }
294 }
295 let stub = r#"<?xml version="1.0" encoding="UTF-8"?><xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"/>"#;
298 return Ok(FetchResult {
299 content: stub.as_bytes().to_vec(),
300 final_url: url.to_string(),
301 redirected: false,
302 });
303 }
304
305 Err(FetchError::RequestFailed {
307 url: url.to_string(),
308 message: "schema not found locally".to_string(),
309 }
310 .into())
311 }
312}
313
314fn effective_validation_schema(schema_json: &Value) -> Value {
317 let Some(commonmeta_root) = schema_json.get("commonmeta") else {
320 return schema_json.clone();
321 };
322
323 let mut merged = serde_json::Map::new();
324
325 if let Some(v) = schema_json.get("$schema") {
326 merged.insert("$schema".to_string(), v.clone());
327 }
328 if let Some(v) = schema_json.get("$id") {
329 merged.insert("$id".to_string(), v.clone());
330 }
331 if let Some(v) = schema_json.get("definitions") {
332 merged.insert("definitions".to_string(), v.clone());
333 }
334
335 if let Value::Object(obj) = commonmeta_root {
336 for (key, value) in obj {
337 merged.insert(key.clone(), value.clone());
338 }
339 return Value::Object(merged);
340 }
341
342 schema_json.clone()
343}
344
345fn schema_file_name(schema_name: &str) -> Option<&'static str> {
346 match schema_name {
347 "commonmeta" | SCHEMA_VERSION => Some(SCHEMA_VERSION),
348 "cff" | "cff_v1.2.0" => Some("cff_v1.2.0"),
349 "crossref_xml" | "crossref-v5.4.0" | "crossref-v0.2" => Some("crossref-v5.4.0"),
350 "csl" | "csl-data" => Some("csl-data"),
351 "datacite" | "datacite-v4.5" => Some("datacite-v4.5"),
352 "inveniordm" | "inveniordm-v0.1" | "invenio-rdm-v0.1" => Some("inveniordm-v0.1"),
353 "schema_org" | "schema_org-v0.1" => Some("schema_org-v0.1"),
354 _ => None,
355 }
356}
357
358fn load_schema(schema_file: &str) -> Result<String> {
359 if schema_file == SCHEMA_VERSION {
360 return Ok(include_str!("../resources/commonmeta_v1.0.json").to_string());
361 }
362
363 let path = Path::new(env!("CARGO_MANIFEST_DIR"))
364 .join("resources")
365 .join(format!("{schema_file}.json"));
366
367 fs::read_to_string(&path)
368 .map_err(|_| Error::Parse(format!("schema file not found: {}", path.display())))
369}
370
#[cfg(test)]
mod tests {
    use super::{
        DEFAULT_SCHEMA, SCHEMA_VERSION, json_schema_errors, known_schemata, schema_file_name,
        xml_schema_errors,
    };

    /// A known-good fixture passes validation against the default schema.
    #[test]
    fn validates_commonmeta_document_with_default_schema() {
        let outcome = json_schema_errors(
            include_bytes!("../tests/fixtures/commonmeta/journal_article.json"),
            None,
        );
        assert!(
            outcome.is_ok(),
            "expected schema validation to pass: {outcome:?}"
        );
    }

    /// An empty object is rejected with a validation-style message.
    #[test]
    fn rejects_invalid_commonmeta_document() {
        let outcome = json_schema_errors(b"{}", None);
        assert!(outcome.is_err(), "expected validation to fail");

        let message = outcome.expect_err("validation should fail").to_string();
        let looks_like_validation_error =
            message.contains("validation failed") || message.contains("required");
        assert!(
            looks_like_validation_error,
            "unexpected error message: {message}"
        );
    }

    /// Unknown schema names are reported as not found.
    #[test]
    fn rejects_unknown_schema_name() {
        let outcome = json_schema_errors(b"{}", Some("does-not-exist"));
        assert!(outcome.is_err(), "expected unknown schema to fail");

        let message = outcome.expect_err("unknown schema should fail").to_string();
        assert!(message.contains("schema 'does-not-exist' not found"));
    }

    /// The default schema is part of the advertised list.
    #[test]
    fn includes_default_schema_in_known_list() {
        let names = known_schemata();
        assert!(names.contains(&DEFAULT_SCHEMA));
    }

    /// Aliases resolve to the expected file stems; stale names resolve to nothing.
    #[test]
    fn supports_python_schema_aliases() {
        let expectations = [
            ("commonmeta", Some(SCHEMA_VERSION)),
            ("commonmeta_v0.18", None),
            ("datacite", Some("datacite-v4.5")),
            ("crossref_xml", Some("crossref-v5.4.0")),
        ];
        for (alias, expected) in expectations {
            assert_eq!(schema_file_name(alias), expected);
        }
    }

    /// Unknown XSD schema names are rejected before any validation runs.
    #[test]
    fn xsd_rejects_unknown_schema_name() {
        let outcome = xml_schema_errors(b"<foo/>", Some("unknown"));
        assert!(outcome.is_err());

        let msg = outcome.unwrap_err().to_string();
        assert!(msg.contains("not supported"), "unexpected: {msg}");
    }

    /// The Crossref XSD must compile; validation errors on the fixture are tolerated.
    #[test]
    fn xsd_crossref_schema_compiles() {
        let xml = include_bytes!("../tests/fixtures/crossref_xml/journal_article.xml");
        if let Err(e) = xml_schema_errors(xml, Some("crossref_xml")) {
            let compile_failed = e.to_string().contains("failed to compile");
            assert!(
                !compile_failed,
                "Crossref XSD schema failed to compile: {e}"
            );
        }
    }
}