Skip to main content

commonmeta/
schema_utils.rs

1//! JSON Schema and XSD validation utilities.
2
3use std::fs;
4use std::path::{Path, PathBuf};
5use std::sync::{Arc, OnceLock};
6
7use fastxml::schema::fetcher::{FetchResult, FileFetcher, SchemaFetcher};
8use fastxml::schema::{Schema, Validator};
9use fastxml::schema::fetcher::error::FetchError;
10use serde_json::Value;
11
12use crate::error::{Error, Result};
13
/// Identifier of the bundled commonmeta schema. `schema_file_name` resolves
/// the `commonmeta` name to this value, and `load_schema` treats it as the
/// schema that is embedded in the binary rather than read from disk.
pub const SCHEMA_VERSION: &str = "commonmeta_v1.0";
/// Schema name `json_schema_errors` falls back to when the caller passes `None`.
pub const DEFAULT_SCHEMA: &str = "commonmeta";
/// Text of `../resources/commonmeta_v1.0.json` (relative to this file),
/// embedded at compile time via `include_str!`.
pub const SCHEMA_JSON: &str = include_str!("../resources/commonmeta_v1.0.json");
17
// Public schema names aligned with commonmeta-py.
// Exposed to callers through `known_schemata`; `schema_file_name` accepts
// these names plus a number of versioned aliases that are not listed here.
const SCHEMATA: &[&str] = &[
    DEFAULT_SCHEMA,
    "cff",
    "crossref_xml",
    "csl",
    "datacite",
    "inveniordm",
    "schema_org",
];
28
/// Return the list of schema names accepted by `json_schema_errors`.
///
/// The returned slice is the static `SCHEMATA` table, so it lives for the
/// whole program and is never allocated per call.
pub fn known_schemata() -> &'static [&'static str] {
    SCHEMATA
}
33
34/// Validate a JSON document against one of the bundled schema names.
35///
36/// If `schema` is `None`, the default `DEFAULT_SCHEMA` (`commonmeta`) is used.
37pub fn json_schema_errors(document: &[u8], schema: Option<&str>) -> Result<()> {
38    let schema_name = schema.unwrap_or(DEFAULT_SCHEMA);
39    let Some(schema_file) = schema_file_name(schema_name) else {
40        return Err(Error::UnsupportedFormat(format!(
41            "schema '{schema_name}' not found"
42        )));
43    };
44
45    let schema_text = load_schema(schema_file)?;
46
47    let schema_json: Value = serde_json::from_str(&schema_text)
48        .map_err(|_| Error::Parse(format!("invalid JSON in schema file: {schema_file}.json")))?;
49    let document_json: Value =
50        serde_json::from_slice(document).map_err(|e| Error::Parse(e.to_string()))?;
51
52    let validation_schema = effective_validation_schema(&schema_json);
53
54    let compiled =
55        jsonschema::validator_for(&validation_schema).map_err(|e| Error::Parse(e.to_string()))?;
56
57    let raw_errors: Vec<jsonschema::ValidationError<'_>> =
58        compiled.iter_errors(&document_json).collect();
59
60    if raw_errors.is_empty() {
61        return Ok(());
62    }
63
64    let messages = collect_leaf_errors(&raw_errors);
65    Err(Error::Parse(format!(
66        "json schema validation failed ({} errors): {}",
67        messages.len(),
68        messages.join("; ")
69    )))
70}
71
// ── JSON Schema error reporting ────────────────────────────────────────────────
73
74/// Recursively collect leaf validation errors, drilling through `anyOf`/`oneOf` so
75/// callers see specific field-level messages rather than a full JSON blob.
76///
77/// For `anyOf` with two branches (our schema: single-object vs array), we skip the
78/// branch whose sole error is a type mismatch ("is not of type 'array'") and report
79/// the other branch's errors instead.
80pub(crate) fn collect_leaf_errors(errs: &[jsonschema::ValidationError<'_>]) -> Vec<String> {
81    use jsonschema::error::ValidationErrorKind;
82    let mut out = Vec::new();
83    for e in errs {
84        match e.kind() {
85            ValidationErrorKind::AnyOf { context }
86            | ValidationErrorKind::OneOfNotValid { context } => {
87                let useful: Vec<&Vec<jsonschema::ValidationError<'static>>> = context
88                    .iter()
89                    .filter(|branch| {
90                        !(branch.len() == 1
91                            && matches!(branch[0].kind(), ValidationErrorKind::Type { .. }))
92                    })
93                    .collect();
94                let branches = if useful.is_empty() {
95                    context.iter().collect()
96                } else {
97                    useful
98                };
99                for branch in branches {
100                    out.extend(collect_leaf_errors(branch));
101                }
102            }
103            _ => {
104                let path = e.instance_path().to_string();
105                let msg = match e.kind() {
106                    ValidationErrorKind::Enum { options } => {
107                        format!("value {} not in enum: {options}", e.instance())
108                    }
109                    other => format_error_kind(other),
110                };
111                out.push(if path.is_empty() {
112                    msg
113                } else {
114                    format!("{path}: {msg}")
115                });
116            }
117        }
118    }
119    out
120}
121
122fn format_error_kind(kind: &jsonschema::error::ValidationErrorKind) -> String {
123    use jsonschema::error::ValidationErrorKind;
124    match kind {
125        ValidationErrorKind::AdditionalProperties { unexpected } => {
126            format!("unexpected properties: {}", unexpected.join(", "))
127        }
128        ValidationErrorKind::Required { property } => {
129            let name = if let Some(s) = property.as_str() {
130                s.to_string()
131            } else {
132                property.to_string().trim_matches('"').to_string()
133            };
134            format!("required property '{name}' is missing")
135        }
136        ValidationErrorKind::Type { kind } => format!("wrong type: expected {kind:?}"),
137        ValidationErrorKind::Format { format } => {
138            format!("value does not match format '{format}'")
139        }
140        ValidationErrorKind::Pattern { pattern } => {
141            format!("value does not match pattern '{pattern}'")
142        }
143        ValidationErrorKind::UniqueItems => "array contains duplicate items".to_string(),
144        ValidationErrorKind::MinItems { limit } => {
145            format!("array has fewer than {limit} items")
146        }
147        ValidationErrorKind::MaxItems { limit } => {
148            format!("array has more than {limit} items")
149        }
150        ValidationErrorKind::Minimum { limit } => format!("value is less than minimum {limit}"),
151        ValidationErrorKind::Maximum { limit } => format!("value exceeds maximum {limit}"),
152        ValidationErrorKind::MinLength { limit } => {
153            format!("string shorter than {limit} characters")
154        }
155        ValidationErrorKind::MaxLength { limit } => {
156            format!("string longer than {limit} characters")
157        }
158        ValidationErrorKind::Constant { expected_value } => {
159            format!("expected constant value: {expected_value}")
160        }
161        other => format!("{other:?}"),
162    }
163}
164
165/// Validate an XML document against a bundled XSD schema.
166///
167/// Supported schema names: `"crossref_xml"` (aliases `"crossref"`,
168/// `"crossref-v5.4.0"`), `"datacite_xml"` (alias `"datacite-v4.7"`).
169/// The compiled schema is built once and reused across calls.
170pub fn xml_schema_errors(xml: &[u8], schema: Option<&str>) -> Result<()> {
171    let schema_name = schema.unwrap_or("crossref_xml");
172
173    let compiled = match schema_name {
174        "crossref_xml" | "crossref" | "crossref-v5.4.0" => crossref_xsd_schema()?,
175        "datacite_xml" | "datacite-v4.7"                => datacite_xsd_schema()?,
176        other => {
177            return Err(Error::UnsupportedFormat(format!(
178                "XSD schema '{other}' not supported"
179            )));
180        }
181    };
182
183    let report = Validator::from(xml)
184        .schema(compiled)
185        .run()
186        .map_err(|e| Error::Parse(e.to_string()))?;
187
188    if report.is_valid() {
189        return Ok(());
190    }
191
192    let errors: Vec<String> = report.errors().iter().map(|e| e.to_string()).collect();
193    Err(Error::Parse(format!(
194        "XSD validation failed ({} errors): {}",
195        errors.len(),
196        errors.join("; ")
197    )))
198}
199
200/// Lazy-compiled Crossref 5.4.0 XSD schema.
201///
202/// Built once per process; subsequent calls share the `Arc<Schema>`.
203fn crossref_xsd_schema() -> Result<Arc<Schema>> {
204    static SCHEMA: OnceLock<std::result::Result<Arc<Schema>, String>> = OnceLock::new();
205
206    SCHEMA
207        .get_or_init(build_crossref_schema)
208        .as_ref()
209        .map(Arc::clone)
210        .map_err(|e| Error::Parse(e.clone()))
211}
212
213fn build_crossref_schema() -> std::result::Result<Arc<Schema>, String> {
214    let base_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
215        .join("resources")
216        .join("crossref");
217
218    let main_xsd_path = base_dir.join("crossref5.4.0.xsd");
219    let main_xsd = fs::read(&main_xsd_path)
220        .map_err(|e| format!("could not read crossref5.4.0.xsd: {e}"))?;
221
222    // SandboxFetcher resolves imports from the local resources/crossref/
223    // directory.  For HTTP/HTTPS URLs that cannot be satisfied locally it
224    // returns an empty stub schema rather than making network requests — the
225    // same behaviour as xmlschema's allow="sandbox" in Python.
226    let fetcher = SandboxFetcher { base: FileFetcher::with_base_dir(&base_dir) };
227
228    // The builder requires an absolute URI as the schema's base URI so that
229    // relative imports inside the XSD can be resolved.  We use the canonical
230    // Crossref URL even though we are serving the file locally — the fetcher
231    // intercepts all import requests and rewrites them to local lookups.
232    Schema::builder()
233        .add(
234            "https://www.crossref.org/schemas/crossref5.4.0.xsd",
235            main_xsd,
236        )
237        .resolve_with(&fetcher)
238        .map(Arc::new)
239        .map_err(|e| format!("failed to compile Crossref XSD schema: {e}"))
240}
241
242/// Lazy-compiled DataCite 4.7 XSD schema.
243fn datacite_xsd_schema() -> Result<Arc<Schema>> {
244    static SCHEMA: OnceLock<std::result::Result<Arc<Schema>, String>> = OnceLock::new();
245    SCHEMA
246        .get_or_init(build_datacite_schema)
247        .as_ref()
248        .map(Arc::clone)
249        .map_err(|e| Error::Parse(e.clone()))
250}
251
252fn build_datacite_schema() -> std::result::Result<Arc<Schema>, String> {
253    let base_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
254        .join("resources")
255        .join("datacite");
256
257    let main_xsd_path = base_dir.join("datacite-v4.xsd");
258    let main_xsd = fs::read(&main_xsd_path)
259        .map_err(|e| format!("could not read datacite-v4.xsd: {e}"))?;
260
261    let fetcher = SandboxFetcher { base: FileFetcher::with_base_dir(&base_dir) };
262
263    Schema::builder()
264        .add(
265            "https://schema.datacite.org/meta/kernel-4.7/metadata.xsd",
266            main_xsd,
267        )
268        .resolve_with(&fetcher)
269        .map(Arc::new)
270        .map_err(|e| format!("failed to compile DataCite XSD schema: {e}"))
271}
272
/// A schema fetcher that resolves imports from a local directory and returns
/// empty stub schemas for remote URLs (preventing any network access).
struct SandboxFetcher {
    /// Local fetcher that every lookup is delegated to; the schema builders in
    /// this file construct it with `FileFetcher::with_base_dir`.
    base: FileFetcher,
}
278
279impl SchemaFetcher for SandboxFetcher {
280    fn fetch(&self, url: &str) -> fastxml::error::Result<FetchResult> {
281        // Try the local file fetcher first (handles relative paths and
282        // file:// URLs against the base directory).
283        if let Ok(result) = self.base.fetch(url) {
284            return Ok(result);
285        }
286
287        // For absolute HTTP/HTTPS URLs: try extracting just the filename and
288        // look for it in the base directory (e.g. xml.xsd, mathml3.xsd).
289        if url.starts_with("http://") || url.starts_with("https://") {
290            if let Some(filename) = url.rsplit('/').next() {
291                if let Ok(result) = self.base.fetch(filename) {
292                    return Ok(result);
293                }
294            }
295            // Return an empty stub schema so compilation can proceed without
296            // the remote schema (types from that namespace won't be validated).
297            let stub = r#"<?xml version="1.0" encoding="UTF-8"?><xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"/>"#;
298            return Ok(FetchResult {
299                content: stub.as_bytes().to_vec(),
300                final_url: url.to_string(),
301                redirected: false,
302            });
303        }
304
305        // All other unresolvable URLs: propagate the error.
306        Err(FetchError::RequestFailed {
307            url: url.to_string(),
308            message: "schema not found locally".to_string(),
309        }
310        .into())
311    }
312}
313
314// ── Private helpers ────────────────────────────────────────────────────────────
315
316fn effective_validation_schema(schema_json: &Value) -> Value {
317    // commonmeta schema files wrap the actual document schema under
318    // a `commonmeta` key together with shared `definitions`.
319    let Some(commonmeta_root) = schema_json.get("commonmeta") else {
320        return schema_json.clone();
321    };
322
323    let mut merged = serde_json::Map::new();
324
325    if let Some(v) = schema_json.get("$schema") {
326        merged.insert("$schema".to_string(), v.clone());
327    }
328    if let Some(v) = schema_json.get("$id") {
329        merged.insert("$id".to_string(), v.clone());
330    }
331    if let Some(v) = schema_json.get("definitions") {
332        merged.insert("definitions".to_string(), v.clone());
333    }
334
335    if let Value::Object(obj) = commonmeta_root {
336        for (key, value) in obj {
337            merged.insert(key.clone(), value.clone());
338        }
339        return Value::Object(merged);
340    }
341
342    schema_json.clone()
343}
344
345fn schema_file_name(schema_name: &str) -> Option<&'static str> {
346    match schema_name {
347        "commonmeta" | SCHEMA_VERSION => Some(SCHEMA_VERSION),
348        "cff" | "cff_v1.2.0" => Some("cff_v1.2.0"),
349        "crossref_xml" | "crossref-v5.4.0" | "crossref-v0.2" => Some("crossref-v5.4.0"),
350        "csl" | "csl-data" => Some("csl-data"),
351        "datacite" | "datacite-v4.5" => Some("datacite-v4.5"),
352        "inveniordm" | "inveniordm-v0.1" | "invenio-rdm-v0.1" => Some("inveniordm-v0.1"),
353        "schema_org" | "schema_org-v0.1" => Some("schema_org-v0.1"),
354        _ => None,
355    }
356}
357
358fn load_schema(schema_file: &str) -> Result<String> {
359    if schema_file == SCHEMA_VERSION {
360        return Ok(include_str!("../resources/commonmeta_v1.0.json").to_string());
361    }
362
363    let path = Path::new(env!("CARGO_MANIFEST_DIR"))
364        .join("resources")
365        .join(format!("{schema_file}.json"));
366
367    fs::read_to_string(&path)
368        .map_err(|_| Error::Parse(format!("schema file not found: {}", path.display())))
369}
370
#[cfg(test)]
mod tests {
    use super::{
        DEFAULT_SCHEMA, SCHEMA_VERSION, json_schema_errors, known_schemata, schema_file_name,
        xml_schema_errors,
    };

    /// A known-good commonmeta fixture validates against the default schema.
    #[test]
    fn validates_commonmeta_document_with_default_schema() {
        let doc = include_bytes!("../tests/fixtures/commonmeta/journal_article.json");
        let result = json_schema_errors(doc, None);
        assert!(
            result.is_ok(),
            "expected schema validation to pass: {result:?}"
        );
    }

    /// An empty object is rejected by the default schema.
    #[test]
    fn rejects_invalid_commonmeta_document() {
        let result = json_schema_errors(br#"{}"#, None);
        assert!(result.is_err(), "expected validation to fail");
        let message = result.expect_err("validation should fail").to_string();
        assert!(
            message.contains("validation failed") || message.contains("required"),
            "unexpected error message: {message}"
        );
    }

    /// Unknown schema names are reported before any document parsing happens.
    #[test]
    fn rejects_unknown_schema_name() {
        let result = json_schema_errors(br#"{}"#, Some("does-not-exist"));
        assert!(result.is_err(), "expected unknown schema to fail");
        let message = result.expect_err("unknown schema should fail").to_string();
        assert!(message.contains("schema 'does-not-exist' not found"));
    }

    #[test]
    fn includes_default_schema_in_known_list() {
        assert!(known_schemata().contains(&DEFAULT_SCHEMA));
    }

    #[test]
    fn supports_python_schema_aliases() {
        assert_eq!(schema_file_name("commonmeta"), Some(SCHEMA_VERSION));
        assert_eq!(schema_file_name("commonmeta_v0.18"), None);
        assert_eq!(schema_file_name("datacite"), Some("datacite-v4.5"));
        assert_eq!(schema_file_name("crossref_xml"), Some("crossref-v5.4.0"));
    }

    #[test]
    fn xsd_rejects_unknown_schema_name() {
        let result = xml_schema_errors(b"<foo/>", Some("unknown"));
        assert!(result.is_err());
        let msg = result.unwrap_err().to_string();
        assert!(msg.contains("not supported"), "unexpected: {msg}");
    }

    // fastxml 0.9.0 has a known bug: <xsd:choice minOccurs="0"> groups are
    // treated as required, causing false failures on any JATS mixed-content
    // element (title, jats:p, institution_name, etc.).  The test below only
    // verifies that the Crossref XSD schema loads and compiles successfully,
    // not that full document validation passes.
    #[test]
    fn xsd_crossref_schema_compiles() {
        // Calling xml_schema_errors forces the OnceLock schema to be built.
        // We expect either Ok (valid) or an Err that stems from document
        // validation. Both schema-load failure messages are rejected: "could
        // not read" (the XSD file is missing or unreadable) and "failed to
        // compile" (the XSD was read but could not be compiled). Checking only
        // the latter would let a missing schema file pass unnoticed.
        let xml = include_bytes!("../tests/fixtures/crossref_xml/journal_article.xml");
        let result = xml_schema_errors(xml, Some("crossref_xml"));
        if let Err(ref e) = result {
            let message = e.to_string();
            for marker in ["could not read", "failed to compile"] {
                assert!(
                    !message.contains(marker),
                    "Crossref XSD schema failed to load: {message}"
                );
            }
        }
    }
}