pub mod resolver;
mod error;
use std::borrow::Cow;
use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet, VecDeque};
use std::fmt::Debug;
use std::io::BufRead;
use std::mem::take;
use std::path::Path;
use quick_xml::events::Event;
use resolver::{FileResolver, NoOpResolver, ResolveRequest};
use tracing::instrument;
use url::Url;
use xsd_parser_types::misc::{Namespace, NamespacePrefix};
use xsd_parser_types::quick_xml::{
DeserializeSync, Error as QuickXmlError, IoReader, SliceReader, XmlReader, XmlReaderSync,
};
use crate::models::schema::{
xs::{Import, Schema, SchemaContent},
NamespaceId, NamespaceInfo, Schemas,
};
use crate::models::schema::{Dependency, SchemaId, SchemaInfo};
use crate::pipeline::parser::resolver::ResolveRequestType;
pub use self::error::{Error, XmlErrorWithLocation};
pub use self::resolver::Resolver;
/// Collects XML schemas from strings, readers, files and URLs and turns them
/// into a [`Schemas`] model via [`Parser::finish`].
///
/// `TResolver` is the resolver that is asked to load schema locations; it
/// defaults to [`NoOpResolver`].
#[must_use]
#[derive(Default, Debug)]
pub struct Parser<TResolver = NoOpResolver> {
/// Already loaded locations, each with every temporary id that resolved to it.
cache: HashMap<Url, HashSet<TempSchemaId>>,
/// Namespaces and schemas in the order they were added.
entries: Vec<ParserEntry>,
/// Resolve requests that have been queued but not processed yet.
pending: VecDeque<(TempSchemaId, ResolveRequest)>,
/// Temporary id handed out to the next schema or resolve request.
next_temp_id: TempSchemaId,
/// Resolver used to load the content behind a resolve request.
resolver: TResolver,
/// Whether import/include/redefine/override elements are queued for resolution.
resolve_includes: bool,
/// Whether `finish` may derive a suffixed prefix for namespaces that still lack one.
generate_prefixes: bool,
/// Whether `finish` may fall back to alternative prefixes seen in the schemas.
alternative_prefixes: bool,
}
/// Identifier assigned to a schema or a pending resolve request while parsing.
///
/// `SchemasBuilder` maps these ids to final `SchemaId`s when the schemas are built.
#[derive(Default, Debug, Clone, Copy, Hash, Eq, PartialEq, Ord, PartialOrd)]
struct TempSchemaId(usize);
/// One item recorded by the parser, kept in insertion order until `finish`.
#[derive(Debug)]
#[allow(clippy::large_enum_variant)]
enum ParserEntry {
/// Request to register the namespace-less (anonymous) namespace.
AnonymousNamespace,
/// Request to register `namespace` with `prefix` as its preferred prefix.
Namespace {
prefix: NamespacePrefix,
namespace: Namespace,
},
/// A parsed schema together with the metadata collected while loading it.
Schema {
/// Temporary id the schema was registered under.
id: TempSchemaId,
/// Optional name supplied by the caller or the resolver.
name: Option<String>,
/// The deserialized schema document.
schema: Schema,
/// Location the schema was loaded from, if it came through the resolver.
location: Option<Url>,
/// Target namespace taken from `schema.target_namespace`.
target_ns: Option<Namespace>,
/// `xmlns:*` declarations collected while reading the document.
namespaces: Namespaces,
/// Queued import/include/redefine/override requests, keyed by requested location.
dependencies: BTreeMap<String, Dependency<TempSchemaId>>,
},
}
/// Helper that converts the parser's recorded entries into the final `Schemas`.
#[derive(Debug)]
struct SchemasBuilder {
/// The model being built.
schemas: Schemas,
/// Entries taken over from the parser.
entries: Vec<ParserEntry>,
/// Maps temporary ids (including aliases of the same location) to final schema ids.
id_cache: HashMap<TempSchemaId, SchemaId>,
/// Prefix candidates collected per namespace.
prefix_cache: HashMap<Option<Namespace>, PrefixEntry>,
/// The parser's location cache: location to all temporary ids resolved to it.
location_cache: HashMap<Url, HashSet<TempSchemaId>>,
/// Whether suffixed prefixes may be generated for namespaces without a prefix.
generate_prefixes: bool,
/// Whether alternative prefixes may be used for namespaces without a prefix.
alternative_prefixes: bool,
}
/// Prefix candidates known for a single namespace.
#[derive(Default, Debug)]
struct PrefixEntry {
/// Preferred prefix, tried first when prefixes are assigned.
prefix: Option<NamespacePrefix>,
/// Further prefixes that were seen for the namespace.
alt_prefixes: HashSet<NamespacePrefix>,
}
impl Parser {
    /// Creates a parser whose fields all hold their `Default` values and that
    /// uses the default resolver type parameter ([`NoOpResolver`]).
    pub fn new() -> Self {
        Default::default()
    }
}
impl<TResolver> Parser<TResolver> {
    /// Switches the parser to the [`FileResolver`].
    ///
    /// Shorthand for [`Parser::with_resolver`] with `FileResolver`.
    pub fn with_default_resolver(self) -> Parser<FileResolver> {
        self.with_resolver(FileResolver)
    }

    /// Replaces the resolver of this parser with `resolver`.
    ///
    /// Entries that were already added are kept, together with the location
    /// cache and the temporary id counter they depend on. Without the counter,
    /// schemas added after the switch would reuse the temporary ids of schemas
    /// added before it.
    ///
    /// The options `resolve_includes`, `generate_prefixes` and
    /// `alternative_prefixes` are reset to `true`, so configure them after
    /// calling this method.
    pub fn with_resolver<XResolver: Resolver + 'static>(
        self,
        resolver: XResolver,
    ) -> Parser<XResolver> {
        // The old resolver and the (empty between calls) pending queue are dropped.
        let Self {
            cache,
            entries,
            next_temp_id,
            ..
        } = self;

        Parser {
            cache,
            entries,
            pending: VecDeque::new(),
            next_temp_id,
            resolver,
            resolve_includes: true,
            generate_prefixes: true,
            alternative_prefixes: true,
        }
    }

    /// Sets whether import, include, redefine and override elements of added
    /// schemas are queued for resolution.
    pub fn resolve_includes(mut self, value: bool) -> Self {
        self.resolve_includes = value;
        self
    }

    /// Sets whether [`Parser::finish`] may generate a suffixed prefix for
    /// namespaces that could not be given one of their known prefixes.
    pub fn generate_prefixes(mut self, value: bool) -> Self {
        self.generate_prefixes = value;
        self
    }

    /// Sets whether [`Parser::finish`] may assign alternative prefixes that
    /// were seen in the schemas when the preferred prefix is unavailable.
    pub fn alternative_prefixes(mut self, value: bool) -> Self {
        self.alternative_prefixes = value;
        self
    }

    /// Consumes the parser and builds the [`Schemas`] from all recorded entries.
    pub fn finish(self) -> Schemas {
        let builder = SchemasBuilder {
            schemas: Schemas::default(),
            entries: self.entries,
            id_cache: HashMap::new(),
            prefix_cache: HashMap::new(),
            location_cache: self.cache,
            generate_prefixes: self.generate_prefixes,
            alternative_prefixes: self.alternative_prefixes,
        };

        builder.build()
    }
}
impl<TResolver> Parser<TResolver>
where
    TResolver: Resolver,
{
    /// Registers the anonymous namespace followed by the `XS`, `XML` and `XSI`
    /// namespaces with their matching prefixes.
    pub fn with_default_namespaces(self) -> Self {
        let parser = self.with_anonymous_namespace();
        let parser = parser.with_namespace(NamespacePrefix::XS, Namespace::XS);
        let parser = parser.with_namespace(NamespacePrefix::XML, Namespace::XML);

        parser.with_namespace(NamespacePrefix::XSI, Namespace::XSI)
    }

    /// Records `namespace` together with `prefix` as an entry of this parser.
    pub fn with_namespace(mut self, prefix: NamespacePrefix, namespace: Namespace) -> Self {
        let entry = ParserEntry::Namespace { prefix, namespace };
        self.entries.push(entry);

        self
    }

    /// Records the anonymous namespace as an entry of this parser.
    pub fn with_anonymous_namespace(mut self) -> Self {
        let entry = ParserEntry::AnonymousNamespace;
        self.entries.push(entry);

        self
    }
}
impl<TResolver> Parser<TResolver>
where
TResolver: Resolver,
TResolver::Buffer: BufRead,
{
/// Parses `schema` as an unnamed schema document and adds it to the parser.
///
/// # Errors
///
/// Fails if the document cannot be deserialized or a queued dependency
/// cannot be resolved.
#[instrument(err, level = "trace", skip(self, schema))]
pub fn add_schema_from_str(self, schema: &str) -> Result<Self, Error<TResolver::Error>> {
    let unnamed = None;

    self.add_named_schema_from_str_impl(unnamed, schema)
}
/// Parses `schema` as a schema document called `name` and adds it to the parser.
///
/// # Errors
///
/// Fails if the document cannot be deserialized or a queued dependency
/// cannot be resolved.
#[instrument(err, level = "trace", skip(self, schema))]
pub fn add_named_schema_from_str(
    self,
    name: String,
    schema: &str,
) -> Result<Self, Error<TResolver::Error>> {
    let name = Some(name);

    self.add_named_schema_from_str_impl(name, schema)
}
/// Shared implementation of the string based `add_*` methods.
///
/// Deserializes `schema`, registers it under a fresh temporary id without a
/// location and then processes every queued resolve request.
#[instrument(err, level = "trace", skip(self, schema))]
fn add_named_schema_from_str_impl(
    mut self,
    name: Option<String>,
    schema: &str,
) -> Result<Self, Error<TResolver::Error>> {
    // The wrapper records the `xmlns:*` declarations while the schema is read.
    let mut reader = SchemaReader::new(SliceReader::new(schema));
    let parsed = Schema::deserialize(&mut reader);
    let parsed = parsed.map_err(XmlErrorWithLocation::from)?;

    let id = self.temp_schema_id();
    let namespaces = reader.namespaces;
    self.add_schema(id, name, parsed, None, namespaces);

    self.resolve_pending()?;

    Ok(self)
}
/// Reads an unnamed schema document from `reader` and adds it to the parser.
///
/// # Errors
///
/// Fails if the document cannot be deserialized or a queued dependency
/// cannot be resolved.
pub fn add_schema_from_reader<R: BufRead>(
    self,
    reader: R,
) -> Result<Self, Error<TResolver::Error>> {
    let unnamed = None;

    self.add_named_schema_from_reader_impl(unnamed, reader)
}
/// Reads a schema document called `name` from `reader` and adds it to the parser.
///
/// # Errors
///
/// Fails if the document cannot be deserialized or a queued dependency
/// cannot be resolved.
pub fn add_named_schema_from_reader<R: BufRead>(
    self,
    name: String,
    reader: R,
) -> Result<Self, Error<TResolver::Error>> {
    let name = Some(name);

    self.add_named_schema_from_reader_impl(name, reader)
}
/// Shared implementation of the reader based `add_*` methods.
///
/// Deserializes the document from `reader`, registers it under a fresh
/// temporary id without a location and then processes every queued resolve
/// request.
#[instrument(err, level = "trace", skip(self, reader))]
fn add_named_schema_from_reader_impl<R: BufRead>(
    mut self,
    name: Option<String>,
    reader: R,
) -> Result<Self, Error<TResolver::Error>> {
    // The wrapper records the `xmlns:*` declarations while the schema is read.
    let mut reader = SchemaReader::new(IoReader::new(reader));
    let parsed = Schema::deserialize(&mut reader);
    let parsed = parsed.map_err(XmlErrorWithLocation::from)?;

    let id = self.temp_schema_id();
    let namespaces = reader.namespaces;
    self.add_schema(id, name, parsed, None, namespaces);

    self.resolve_pending()?;

    Ok(self)
}
#[instrument(err, level = "trace", skip(self))]
pub fn add_schema_from_file<P: AsRef<Path> + Debug>(
self,
path: P,
) -> Result<Self, Error<TResolver::Error>> {
let path = path.as_ref().canonicalize()?;
let url = Url::from_file_path(&path).map_err(|()| Error::InvalidFilePath(path))?;
self.add_schema_from_url(url)
}
#[instrument(err, level = "trace", skip(self))]
pub fn add_schema_from_files<I>(mut self, paths: I) -> Result<Self, Error<TResolver::Error>>
where
I: IntoIterator + Debug,
I::Item: AsRef<Path> + Debug,
{
for path in paths {
self = self.add_schema_from_file(path)?;
}
Ok(self)
}
/// Loads the schema at `url` through the resolver and adds it to the parser.
///
/// The URL is resolved as a `UserDefined` request; afterwards every queued
/// resolve request is processed.
///
/// # Errors
///
/// Fails if the URL or one of the queued requests cannot be resolved or
/// deserialized.
#[instrument(err, level = "trace", skip(self))]
pub fn add_schema_from_url(mut self, url: Url) -> Result<Self, Error<TResolver::Error>> {
    let request = ResolveRequest::new(url, ResolveRequestType::UserDefined);
    let schema_id = self.temp_schema_id();

    self.resolve_location(schema_id, request)?;
    self.resolve_pending()?;

    Ok(self)
}
fn temp_schema_id(&mut self) -> TempSchemaId {
let id = self.next_temp_id;
self.next_temp_id.0 = self.next_temp_id.0.wrapping_add(1);
id
}
/// Appends `req` to the queue of pending resolve requests and returns the
/// temporary id reserved for it.
fn add_pending(&mut self, req: ResolveRequest) -> TempSchemaId {
    tracing::debug!("Add pending resolve request: {req:#?}");

    let queued = (self.temp_schema_id(), req);
    let id = queued.0;
    self.pending.push_back(queued);

    id
}
/// Processes queued resolve requests front to back until the queue is empty.
///
/// Requests queued while a schema is being processed are handled as well.
///
/// # Errors
///
/// Returns the first error reported by `resolve_location`; the remaining
/// requests stay in the queue.
fn resolve_pending(&mut self) -> Result<(), Error<TResolver::Error>> {
    loop {
        let Some((id, req)) = self.pending.pop_front() else {
            return Ok(());
        };

        self.resolve_location(id, req)?;
    }
}
#[instrument(err, level = "trace", skip(self))]
fn resolve_location(
&mut self,
id: TempSchemaId,
req: ResolveRequest,
) -> Result<(), Error<TResolver::Error>> {
tracing::debug!("Process resolve request: {req:#?}");
let Some((name, location, buffer)) =
self.resolver.resolve(&req).map_err(Error::resolver)?
else {
return Err(Error::UnableToResolve(Box::new(req)));
};
if let Some(ids) = self.cache.get_mut(&location) {
ids.insert(id);
return Ok(());
}
let reader = IoReader::new(buffer);
let reader = SchemaReader::new(reader);
let mut reader = reader.with_error_info();
let mut schema =
Schema::deserialize(&mut reader).map_err(|error| XmlErrorWithLocation {
error,
location: Some(location.clone()),
})?;
if let Some(current_ns) = req.current_ns {
if let Some(ns) = schema.target_namespace.as_ref() {
if req.request_type == ResolveRequestType::IncludeRequest
&& ns.as_bytes() != current_ns.as_ref()
{
return Err(Error::MismatchingTargetNamespace {
location,
found: Namespace::new(ns.as_bytes().to_vec()),
expected: current_ns,
});
}
} else {
let inherited_ns = current_ns.to_string();
schema.target_namespace = Some(inherited_ns);
}
}
let reader = reader.into_inner();
self.add_schema(id, name, schema, Some(location.clone()), reader.namespaces);
self.cache.insert(location, HashSet::from([id]));
Ok(())
}
fn add_schema(
&mut self,
id: TempSchemaId,
name: Option<String>,
schema: Schema,
location: Option<Url>,
namespaces: Namespaces,
) {
tracing::debug!(
"Process schema (location={:?}, target_namespace={:?}",
location.as_ref().map(Url::as_str),
&schema.target_namespace
);
let target_ns = schema
.target_namespace
.as_deref()
.map(|ns| Namespace::from(ns.as_bytes().to_owned()));
let mut dependencies = BTreeMap::new();
if self.resolve_includes {
for content in &schema.content {
match content {
SchemaContent::Import(x) => {
if let Some(req) = import_req(x, target_ns.clone(), location.as_ref()) {
let location = req.requested_location.clone();
let id = self.add_pending(req);
dependencies.insert(location, Dependency::Import(id));
}
}
SchemaContent::Include(x) => {
let req =
include_req(&x.schema_location, target_ns.clone(), location.as_ref());
let location = req.requested_location.clone();
let id = self.add_pending(req);
dependencies.insert(location, Dependency::Include(id));
}
SchemaContent::Redefine(x) => {
let req =
include_req(&x.schema_location, target_ns.clone(), location.as_ref());
let location = req.requested_location.clone();
let id = self.add_pending(req);
dependencies.insert(location, Dependency::Redefine(id));
}
SchemaContent::Override(x) => {
let req =
include_req(&x.schema_location, target_ns.clone(), location.as_ref());
let location = req.requested_location.clone();
let id = self.add_pending(req);
dependencies.insert(location, Dependency::Override(id));
}
_ => (),
}
}
}
self.entries.push(ParserEntry::Schema {
id,
name,
schema,
location,
target_ns,
namespaces,
dependencies,
});
}
}
/// Reader wrapper that forwards to `inner` and records the `xmlns:*`
/// declarations of the elements that pass through it.
struct SchemaReader<R> {
/// The wrapped reader.
inner: R,
/// Namespaces seen so far, each with the prefixes declared for it.
namespaces: Namespaces,
}
/// Maps a namespace to the prefixes that were declared for it.
type Namespaces = BTreeMap<Option<Namespace>, Vec<NamespacePrefix>>;
impl<R> SchemaReader<R> {
fn new(inner: R) -> Self {
Self {
inner,
namespaces: BTreeMap::new(),
}
}
}
impl<R> XmlReader for SchemaReader<R>
where
    R: XmlReader,
{
    /// Delegates error extension to the wrapped reader.
    fn extend_error(&self, error: QuickXmlError) -> QuickXmlError {
        let inner = &self.inner;

        inner.extend_error(error)
    }
}
impl<'a, R> XmlReaderSync<'a> for SchemaReader<R>
where
    R: XmlReaderSync<'a>,
{
    /// Reads the next event from the wrapped reader and returns it unchanged.
    ///
    /// For start and empty elements every attribute whose key has the prefix
    /// `xmlns` is recorded: the attribute's local name is stored as a prefix
    /// of the namespace given by the attribute value.
    ///
    /// # Errors
    ///
    /// Returns errors of the wrapped reader and of the attribute iteration.
    fn read_event(&mut self) -> Result<Event<'a>, QuickXmlError> {
        let event = self.inner.read_event()?;

        if let Event::Start(tag) | Event::Empty(tag) = &event {
            for attrib in tag.attributes() {
                let attrib = attrib?;

                let declares_prefix = attrib
                    .key
                    .prefix()
                    .map_or(false, |p| p.as_ref() == b"xmlns");
                if !declares_prefix {
                    continue;
                }

                let prefix = NamespacePrefix::new(attrib.key.local_name().as_ref().to_owned());
                let namespace = Namespace::new(attrib.value.into_owned());

                let prefixes = self.namespaces.entry(Some(namespace)).or_default();
                prefixes.push(prefix);
            }
        }

        Ok(event)
    }
}
impl SchemasBuilder {
/// Builds the final `Schemas` from the recorded entries.
///
/// The id and prefix caches are filled first. Then every entry is added in
/// insertion order and finally the namespace prefixes are assigned.
fn build(mut self) -> Schemas {
    self.build_id_cache();
    self.build_prefix_cache();

    let entries = take(&mut self.entries);
    for entry in entries {
        match entry {
            ParserEntry::Schema {
                id,
                name,
                schema,
                location,
                target_ns,
                dependencies,
                ..
            } => self.add_schema(id, target_ns, name, location, schema, dependencies),
            ParserEntry::Namespace { namespace, .. } => {
                self.get_or_create_namespace_info_mut(Some(namespace));
            }
            ParserEntry::AnonymousNamespace => {
                self.get_or_create_namespace_info_mut(None);
            }
        }
    }

    self.determine_prefixes();

    self.schemas
}
/// Assigns a final schema id to every schema entry.
///
/// The id is stored for the entry's own temporary id and for every other
/// temporary id the location cache lists for the entry's location.
fn build_id_cache(&mut self) {
    for entry in &self.entries {
        let ParserEntry::Schema {
            id: temp_id,
            location,
            ..
        } = entry
        else {
            continue;
        };

        let id = self.schemas.next_schema_id();
        self.id_cache.insert(*temp_id, id);

        let aliases = location
            .as_ref()
            .and_then(|url| self.location_cache.get(url));
        if let Some(aliases) = aliases {
            self.id_cache
                .extend(aliases.iter().map(|alias| (*alias, id)));
        }
    }
}
/// Collects the prefix candidates of every namespace from the entries.
///
/// - An anonymous namespace entry only makes sure a cache entry exists.
/// - A namespace entry sets the preferred prefix of its namespace.
/// - A schema entry offers the first prefix declared for its target
///   namespace as preferred prefix (or as alternative if one is already
///   set) and adds every declared prefix as alternative of its namespace.
fn build_prefix_cache(&mut self) {
    for entry in &self.entries {
        match entry {
            ParserEntry::AnonymousNamespace => {
                self.prefix_cache.entry(None).or_default();
            }
            ParserEntry::Namespace { prefix, namespace } => {
                let slot = self.prefix_cache.entry(Some(namespace.clone())).or_default();
                slot.prefix = Some(prefix.clone());
            }
            ParserEntry::Schema {
                target_ns,
                namespaces,
                ..
            } => {
                let declared = namespaces
                    .get(target_ns)
                    .and_then(|prefixes| prefixes.first())
                    .cloned();

                let slot = self.prefix_cache.entry(target_ns.clone()).or_default();
                if let Some(declared) = declared {
                    if slot.prefix.is_some() {
                        slot.alt_prefixes.insert(declared);
                    } else {
                        slot.prefix = Some(declared);
                    }
                }

                for (namespace, prefixes) in namespaces {
                    // Do not create a cache entry when nothing was declared.
                    if prefixes.is_empty() {
                        continue;
                    }

                    self.prefix_cache
                        .entry(namespace.clone())
                        .or_default()
                        .alt_prefixes
                        .extend(prefixes.iter().cloned());
                }
            }
        }
    }
}
/// Adds one schema to the `Schemas` being built.
///
/// The schema is registered with its namespace and its dependencies are
/// translated from temporary ids to final schema ids; dependencies whose
/// temporary id is unknown to the id cache are dropped.
///
/// # Panics
///
/// Panics if `id` is missing from the id cache.
fn add_schema(
    &mut self,
    id: TempSchemaId,
    namespace: Option<Namespace>,
    name: Option<String>,
    location: Option<Url>,
    schema: Schema,
    dependencies: BTreeMap<String, Dependency<TempSchemaId>>,
) {
    // NOTE(review): `build_id_cache` already obtains ids through
    // `next_schema_id()`; confirm that this additional increment is intended.
    self.schemas.last_schema_id = self.schemas.last_schema_id.wrapping_add(1);

    let schema_id = *self.id_cache.get(&id).unwrap();

    let (namespace_id, namespace_info) = self.get_or_create_namespace_info_mut(namespace);
    namespace_info.schemas.push(schema_id);

    let dependencies = dependencies
        .into_iter()
        .filter_map(|(requested, dependency)| {
            let resolved = *self.id_cache.get(&*dependency)?;

            Some((requested, dependency.map(|_| resolved)))
        })
        .collect();

    let info = SchemaInfo {
        name,
        schema,
        location,
        namespace_id,
        dependencies,
    };

    let Entry::Vacant(slot) = self.schemas.schemas.entry(schema_id) else {
        crate::unreachable!()
    };
    slot.insert(info);
}
/// Returns the id and the info of `namespace`, creating both if needed.
///
/// A new anonymous namespace (`None`) gets `NamespaceId::ANONYMOUS`; any
/// other new namespace gets the next value of `last_namespace_id`.
fn get_or_create_namespace_info_mut(
    &mut self,
    namespace: Option<Namespace>,
) -> (NamespaceId, &mut NamespaceInfo) {
    let schemas = &mut self.schemas;

    let id = match schemas.known_namespaces.entry(namespace) {
        Entry::Occupied(known) => *known.get(),
        Entry::Vacant(unknown) => {
            let id = if unknown.key().is_some() {
                schemas.last_namespace_id = schemas.last_namespace_id.wrapping_add(1);

                NamespaceId(schemas.last_namespace_id)
            } else {
                NamespaceId::ANONYMOUS
            };

            let namespace = unknown.key().clone();
            unknown.insert(id);

            match schemas.namespace_infos.entry(id) {
                Entry::Vacant(slot) => {
                    slot.insert(NamespaceInfo::new(namespace));
                }
                Entry::Occupied(_) => crate::unreachable!(),
            }

            id
        }
    };

    // Every known namespace id has an info: it is inserted right above.
    let info = schemas.namespace_infos.get_mut(&id).unwrap();

    (id, info)
}
fn determine_prefixes(&mut self) {
for (id, info) in &mut self.schemas.namespace_infos {
if info.prefix.is_some() {
continue;
}
let entry = &mut self.prefix_cache.get(&info.namespace).unwrap();
if let Some(prefix) = &entry.prefix {
if let Entry::Vacant(e) = self.schemas.known_prefixes.entry(prefix.clone()) {
info.prefix = Some(e.key().clone());
e.insert(*id);
}
}
}
if self.alternative_prefixes {
for (id, info) in &mut self.schemas.namespace_infos {
if info.prefix.is_some() {
continue;
}
let entry = &mut self.prefix_cache.get(&info.namespace).unwrap();
for alt in &entry.alt_prefixes {
if let Entry::Vacant(e) = self.schemas.known_prefixes.entry(alt.clone()) {
info.prefix = Some(e.key().clone());
e.insert(*id);
}
}
}
}
if self.generate_prefixes {
for (id, info) in &mut self.schemas.namespace_infos {
if info.prefix.is_some() {
continue;
}
let entry = &mut self.prefix_cache.get(&info.namespace).unwrap();
let prefix = entry
.prefix
.clone()
.or_else(|| entry.alt_prefixes.iter().next().cloned());
if let Some(prefix) = prefix {
let ext = format!("_{}", id.0);
let ext = ext.as_bytes();
let mut p = prefix.0.into_owned();
p.extend_from_slice(ext);
let prefix = NamespacePrefix(Cow::Owned(p));
self.schemas.known_prefixes.insert(prefix, *id);
}
}
}
}
}
/// Builds the resolve request for an import element.
///
/// Returns `None` if the import has no schema location. Otherwise the
/// request carries the importing schema's namespace and location (if known)
/// and the namespace requested by the import (if any).
fn import_req(
    import: &Import,
    current_ns: Option<Namespace>,
    current_location: Option<&Url>,
) -> Option<ResolveRequest> {
    let location = import.schema_location.as_ref()?;
    let req = ResolveRequest::new(location, ResolveRequestType::ImportRequest);

    let req = match current_ns {
        Some(ns) => req.current_ns(ns),
        None => req,
    };
    let req = match import.namespace.as_ref() {
        Some(ns) => req.requested_ns(Namespace::from(ns.as_bytes().to_owned())),
        None => req,
    };
    let req = match current_location {
        Some(url) => req.current_location(url.clone()),
        None => req,
    };

    Some(req)
}
/// Builds the resolve request for an include-like element.
///
/// The request is of type `IncludeRequest` and carries the including
/// schema's namespace and location when they are known.
fn include_req(
    schema_location: &str,
    current_ns: Option<Namespace>,
    current_location: Option<&Url>,
) -> ResolveRequest {
    let req = ResolveRequest::new(schema_location, ResolveRequestType::IncludeRequest);

    let req = match current_ns {
        Some(ns) => req.current_ns(ns),
        None => req,
    };

    match current_location {
        Some(url) => req.current_location(url.clone()),
        None => req,
    }
}