use std::fs;
use std::path::{Path, PathBuf};
use std::sync::{Arc, OnceLock};
use fastxml::schema::fetcher::{FetchResult, FileFetcher, SchemaFetcher};
use fastxml::schema::{Schema, Validator};
use fastxml::schema::fetcher::error::FetchError;
use serde_json::Value;
use crate::error::{Error, Result};
/// Identifier of the bundled commonmeta JSON Schema. `schema_file_name` returns it as the
/// resource file stem and `load_schema` compares against it to pick the embedded copy.
pub const SCHEMA_VERSION: &str = "commonmeta_v1.0";
/// Schema name that [`json_schema_errors`] falls back to when the caller passes `None`.
pub const DEFAULT_SCHEMA: &str = "commonmeta";
/// Contents of `../resources/commonmeta_v1.0.json`, embedded at compile time by `include_str!`.
pub const SCHEMA_JSON: &str = include_str!("../resources/commonmeta_v1.0.json");
/// Schema names exposed through [`known_schemata`]. The first entry is [`DEFAULT_SCHEMA`];
/// every name listed here has a matching arm in `schema_file_name`.
const SCHEMATA: &[&str] = &[
DEFAULT_SCHEMA,
"cff",
"crossref_xml",
"csl",
"datacite",
"inveniordm",
"orcid",
"schema_org",
];
/// Returns the static list of schema names held in `SCHEMATA`.
///
/// The slice is borrowed from a constant, so no allocation takes place.
pub fn known_schemata() -> &'static [&'static str] {
SCHEMATA
}
pub fn json_schema_errors(document: &[u8], schema: Option<&str>) -> Result<()> {
let schema_name = schema.unwrap_or(DEFAULT_SCHEMA);
let Some(schema_file) = schema_file_name(schema_name) else {
return Err(Error::UnsupportedFormat(format!(
"schema '{schema_name}' not found"
)));
};
let schema_text = load_schema(schema_file)?;
let schema_json: Value = serde_json::from_str(&schema_text)
.map_err(|_| Error::Parse(format!("invalid JSON in schema file: {schema_file}.json")))?;
let document_json: Value =
serde_json::from_slice(document).map_err(|e| Error::Parse(e.to_string()))?;
let document_json =
if schema_json.get("type").and_then(Value::as_str) == Some("array")
&& document_json.is_object()
{
Value::Array(vec![document_json])
} else {
document_json
};
let compiled =
jsonschema::validator_for(&schema_json).map_err(|e| Error::Parse(e.to_string()))?;
let raw_errors: Vec<jsonschema::ValidationError<'_>> =
compiled.iter_errors(&document_json).collect();
if raw_errors.is_empty() {
return Ok(());
}
let messages = collect_leaf_errors(&raw_errors);
Err(Error::Parse(format!(
"json schema validation failed ({} errors): {}",
messages.len(),
messages.join("; ")
)))
}
/// Flattens validation errors into human-readable messages.
///
/// `anyOf` / `oneOf` failures are replaced by the messages of their branches (recursively).
/// Branches that consist of exactly one type-mismatch error are skipped, unless every branch
/// is of that form, in which case all branches are reported. Every other error becomes one
/// message, prefixed with `<instance path>: ` when the instance path is non-empty.
pub(crate) fn collect_leaf_errors(errs: &[jsonschema::ValidationError<'_>]) -> Vec<String> {
    use jsonschema::error::ValidationErrorKind;

    let mut messages = Vec::new();
    for error in errs {
        let detail = match error.kind() {
            ValidationErrorKind::AnyOf { context }
            | ValidationErrorKind::OneOfNotValid { context } => {
                // A branch holding nothing but a single type error.
                let is_type_only = |branch: &Vec<jsonschema::ValidationError<'static>>| {
                    matches!(
                        branch.as_slice(),
                        [only] if matches!(only.kind(), ValidationErrorKind::Type { .. })
                    )
                };
                // If no branch survives the filter, fall back to reporting all of them.
                let report_all = context.iter().all(|branch| is_type_only(branch));
                for branch in context
                    .iter()
                    .filter(|branch| report_all || !is_type_only(*branch))
                {
                    messages.extend(collect_leaf_errors(branch));
                }
                continue;
            }
            ValidationErrorKind::Enum { options } => {
                format!("value {} not in enum: {options}", error.instance())
            }
            other => format_error_kind(other),
        };

        let location = error.instance_path().to_string();
        if location.is_empty() {
            messages.push(detail);
        } else {
            messages.push(format!("{location}: {detail}"));
        }
    }
    messages
}
fn format_error_kind(kind: &jsonschema::error::ValidationErrorKind) -> String {
use jsonschema::error::ValidationErrorKind;
match kind {
ValidationErrorKind::AdditionalProperties { unexpected } => {
format!("unexpected properties: {}", unexpected.join(", "))
}
ValidationErrorKind::Required { property } => {
let name = if let Some(s) = property.as_str() {
s.to_string()
} else {
property.to_string().trim_matches('"').to_string()
};
format!("required property '{name}' is missing")
}
ValidationErrorKind::Type { kind } => format!("wrong type: expected {kind:?}"),
ValidationErrorKind::Format { format } => {
format!("value does not match format '{format}'")
}
ValidationErrorKind::Pattern { pattern } => {
format!("value does not match pattern '{pattern}'")
}
ValidationErrorKind::UniqueItems => "array contains duplicate items".to_string(),
ValidationErrorKind::MinItems { limit } => {
format!("array has fewer than {limit} items")
}
ValidationErrorKind::MaxItems { limit } => {
format!("array has more than {limit} items")
}
ValidationErrorKind::Minimum { limit } => format!("value is less than minimum {limit}"),
ValidationErrorKind::Maximum { limit } => format!("value exceeds maximum {limit}"),
ValidationErrorKind::MinLength { limit } => {
format!("string shorter than {limit} characters")
}
ValidationErrorKind::MaxLength { limit } => {
format!("string longer than {limit} characters")
}
ValidationErrorKind::Constant { expected_value } => {
format!("expected constant value: {expected_value}")
}
other => format!("{other:?}"),
}
}
pub fn xml_schema_errors(xml: &[u8], schema: Option<&str>) -> Result<()> {
let schema_name = schema.unwrap_or("crossref_xml");
let compiled = match schema_name {
"crossref_xml" | "crossref" | "crossref-v5.4.0" => crossref_xsd_schema()?,
"datacite_xml" | "datacite-v4.7" => datacite_xsd_schema()?,
other => {
return Err(Error::UnsupportedFormat(format!(
"XSD schema '{other}' not supported"
)));
}
};
let report = Validator::from(xml)
.schema(compiled)
.run()
.map_err(|e| Error::Parse(e.to_string()))?;
if report.is_valid() {
return Ok(());
}
let errors: Vec<String> = report.errors().iter().map(|e| e.to_string()).collect();
Err(Error::Parse(format!(
"XSD validation failed ({} errors): {}",
errors.len(),
errors.join("; ")
)))
}
fn crossref_xsd_schema() -> Result<Arc<Schema>> {
static SCHEMA: OnceLock<std::result::Result<Arc<Schema>, String>> = OnceLock::new();
SCHEMA
.get_or_init(build_crossref_schema)
.as_ref()
.map(Arc::clone)
.map_err(|e| Error::Parse(e.clone()))
}
/// Reads and compiles the Crossref 5.4.0 XSD from `<CARGO_MANIFEST_DIR>/resources/crossref`.
///
/// Referenced schemas are resolved through a [`SandboxFetcher`] rooted at that directory.
/// Failures are returned as plain strings so the caller can cache them.
fn build_crossref_schema() -> std::result::Result<Arc<Schema>, String> {
    // `env!` is expanded at compile time, so this is the manifest directory of the build.
    let mut base_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    base_dir.push("resources");
    base_dir.push("crossref");

    let main_xsd = match fs::read(base_dir.join("crossref5.4.0.xsd")) {
        Ok(bytes) => bytes,
        Err(e) => return Err(format!("could not read crossref5.4.0.xsd: {e}")),
    };

    let fetcher = SandboxFetcher { base: FileFetcher::with_base_dir(&base_dir) };
    let resolved = Schema::builder()
        .add(
            "https://www.crossref.org/schemas/crossref5.4.0.xsd",
            main_xsd,
        )
        .resolve_with(&fetcher);

    match resolved {
        Ok(schema) => Ok(Arc::new(schema)),
        Err(e) => Err(format!("failed to compile Crossref XSD schema: {e}")),
    }
}
fn datacite_xsd_schema() -> Result<Arc<Schema>> {
static SCHEMA: OnceLock<std::result::Result<Arc<Schema>, String>> = OnceLock::new();
SCHEMA
.get_or_init(build_datacite_schema)
.as_ref()
.map(Arc::clone)
.map_err(|e| Error::Parse(e.clone()))
}
/// Reads and compiles the DataCite XSD from `<CARGO_MANIFEST_DIR>/resources/datacite`.
///
/// The file `datacite-v4.xsd` is registered under the kernel-4.7 `metadata.xsd` URL, and
/// referenced schemas are resolved through a [`SandboxFetcher`] rooted at that directory.
/// Failures are returned as plain strings so the caller can cache them.
fn build_datacite_schema() -> std::result::Result<Arc<Schema>, String> {
    // `env!` is expanded at compile time, so this is the manifest directory of the build.
    let mut base_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    base_dir.push("resources");
    base_dir.push("datacite");

    let main_xsd = match fs::read(base_dir.join("datacite-v4.xsd")) {
        Ok(bytes) => bytes,
        Err(e) => return Err(format!("could not read datacite-v4.xsd: {e}")),
    };

    let fetcher = SandboxFetcher { base: FileFetcher::with_base_dir(&base_dir) };
    let resolved = Schema::builder()
        .add(
            "https://schema.datacite.org/meta/kernel-4.7/metadata.xsd",
            main_xsd,
        )
        .resolve_with(&fetcher);

    match resolved {
        Ok(schema) => Ok(Arc::new(schema)),
        Err(e) => Err(format!("failed to compile DataCite XSD schema: {e}")),
    }
}
/// Schema fetcher that answers every request from a local [`FileFetcher`], or with an empty
/// stub schema for `http(s)` URLs it cannot find (see its `SchemaFetcher::fetch` impl).
struct SandboxFetcher {
/// Local fetcher that is tried first for every URL; the XSD builders in this file construct
/// it with `FileFetcher::with_base_dir`.
base: FileFetcher,
}
impl SchemaFetcher for SandboxFetcher {
    /// Resolves `url` using only the wrapped local fetcher.
    ///
    /// Lookup order:
    /// 1. `url` exactly as given.
    /// 2. For `http://` / `https://` URLs, the text after the last `/`.
    /// 3. For `http://` / `https://` URLs still unresolved, an empty `xs:schema` stub is
    ///    returned with `final_url` set to `url` (presumably so unresolved remote imports
    ///    do not abort schema compilation — confirm with the author).
    ///
    /// Any other unresolved URL yields `FetchError::RequestFailed`.
    fn fetch(&self, url: &str) -> fastxml::error::Result<FetchResult> {
        if let Ok(local) = self.base.fetch(url) {
            return Ok(local);
        }

        let is_remote = url.starts_with("http://") || url.starts_with("https://");
        if !is_remote {
            return Err(FetchError::RequestFailed {
                url: url.to_string(),
                message: "schema not found locally".to_string(),
            }
            .into());
        }

        // Retry with just the trailing path segment of the remote URL.
        let by_file_name = url
            .rsplit('/')
            .next()
            .map(|file_name| self.base.fetch(file_name));
        if let Some(Ok(local)) = by_file_name {
            return Ok(local);
        }

        let stub = r#"<?xml version="1.0" encoding="UTF-8"?><xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"/>"#;
        Ok(FetchResult {
            content: stub.as_bytes().to_vec(),
            final_url: url.to_string(),
            redirected: false,
        })
    }
}
fn schema_file_name(schema_name: &str) -> Option<&'static str> {
match schema_name {
"commonmeta" | SCHEMA_VERSION => Some(SCHEMA_VERSION),
"cff" | "cff_v1.2.0" => Some("cff_v1.2.0"),
"crossref_xml" | "crossref-v5.4.0" | "crossref-v0.2" => Some("crossref-v5.4.0"),
"csl" | "csl-data" => Some("csl-data"),
"datacite" | "datacite-v4.5" => Some("datacite-v4.5"),
"inveniordm" | "inveniordm-v0.1" | "invenio-rdm-v0.1" => Some("inveniordm-v0.1"),
"orcid" | "orcid_v3.0" | "orcid-v3.0" => Some("orcid_schema_v3.0"),
"schema_org" | "schema_org-v0.1" => Some("schema_org-v0.1"),
_ => None,
}
}
fn load_schema(schema_file: &str) -> Result<String> {
if schema_file == SCHEMA_VERSION {
return Ok(include_str!("../resources/commonmeta_v1.0.json").to_string());
}
let path = Path::new(env!("CARGO_MANIFEST_DIR"))
.join("resources")
.join(format!("{schema_file}.json"));
fs::read_to_string(&path)
.map_err(|_| Error::Parse(format!("schema file not found: {}", path.display())))
}
// Unit tests for schema-name resolution and for JSON Schema / XSD validation.
#[cfg(test)]
mod tests {
use super::{
DEFAULT_SCHEMA, SCHEMA_VERSION, json_schema_errors, known_schemata, schema_file_name,
xml_schema_errors,
};
// A fixture commonmeta document must validate when no schema name is given.
#[test]
fn validates_commonmeta_document_with_default_schema() {
let doc = include_bytes!("../tests/fixtures/commonmeta/journal_article.json");
let result = json_schema_errors(doc, None);
assert!(
result.is_ok(),
"expected schema validation to pass: {result:?}"
);
}
// An empty JSON object must be rejected by the default schema.
#[test]
fn rejects_invalid_commonmeta_document() {
let result = json_schema_errors(br#"{}"#, None);
assert!(result.is_err(), "expected validation to fail");
let message = result.expect_err("validation should fail").to_string();
assert!(
message.contains("validation failed") || message.contains("required"),
"unexpected error message: {message}"
);
}
// Unknown schema names are reported before any document parsing happens.
#[test]
fn rejects_unknown_schema_name() {
let result = json_schema_errors(br#"{}"#, Some("does-not-exist"));
assert!(result.is_err(), "expected unknown schema to fail");
let message = result.expect_err("unknown schema should fail").to_string();
assert!(message.contains("schema 'does-not-exist' not found"));
}
// The default schema name must be part of the advertised list.
#[test]
fn includes_default_schema_in_known_list() {
assert!(known_schemata().contains(&DEFAULT_SCHEMA));
}
// Short schema names resolve to versioned file stems; an unlisted version resolves to None.
#[test]
fn supports_python_schema_aliases() {
assert_eq!(schema_file_name("commonmeta"), Some(SCHEMA_VERSION));
assert_eq!(schema_file_name("commonmeta_v0.18"), None);
assert_eq!(schema_file_name("datacite"), Some("datacite-v4.5"));
assert_eq!(schema_file_name("crossref_xml"), Some("crossref-v5.4.0"));
}
// XSD validation rejects schema names it has no match arm for.
#[test]
fn xsd_rejects_unknown_schema_name() {
let result = xml_schema_errors(b"<foo/>", Some("unknown"));
assert!(result.is_err());
let msg = result.unwrap_err().to_string();
assert!(msg.contains("not supported"), "unexpected: {msg}");
}
// Only checks that the Crossref XSD compiles: a validation error for the fixture is
// tolerated, a "failed to compile" error is not.
#[test]
fn xsd_crossref_schema_compiles() {
let xml = include_bytes!("../tests/fixtures/crossref_xml/journal_article.xml");
let result = xml_schema_errors(xml, Some("crossref_xml"));
if let Err(ref e) = result {
assert!(
!e.to_string().contains("failed to compile"),
"Crossref XSD schema failed to compile: {e}"
);
}
}
}