xsd_parser/pipeline/parser/
mod.rs

1//! XML Schema (XSD) parser and resolver infrastructure.
2//!
3//! This module defines the [`Parser`] type and supporting logic for loading,
4//! resolving, and parsing XML Schema documents (`.xsd`) into structured [`Schemas`].
5//!
6//! The parser supports various input sources (e.g., files, strings, URLs) and
7//! handles `<import>` and `<include>` logic using pluggable [`Resolver`] implementations.
8//!
9//! Parsed schemas can be passed to the [`Interpreter`](crate::Interpreter) for
10//! further transformation into semantic types.
11//!
12//! # Example
13//! ```rust,ignore
14//! let schemas = Parser::new()
15//!     .with_default_resolver()
16//!     .add_schema_from_file("schema.xsd")?
17//!     .finish();
18//! ```
19
20pub mod resolver;
21
22mod error;
23
24use std::collections::{BTreeMap, HashSet, VecDeque};
25use std::fmt::Debug;
26use std::io::BufRead;
27use std::path::Path;
28
29use quick_xml::{
30    events::Event,
31    name::{LocalName, QName, ResolveResult},
32};
33use resolver::{FileResolver, NoOpResolver, ResolveRequest};
34use tracing::instrument;
35use url::Url;
36
37use crate::models::schema::{
38    xs::{Import, Include, Schema, SchemaContent},
39    Namespace, NamespacePrefix, Schemas,
40};
41use crate::pipeline::parser::resolver::ResolveRequestType;
42use crate::quick_xml::{
43    DeserializeSync, Error as QuickXmlError, IoReader, SliceReader, XmlReader, XmlReaderSync,
44};
45use crate::xml::NamespacesShared;
46
47pub use self::error::Error;
48pub use self::resolver::Resolver;
49
50/// The [`Parser`] is responsible for loading and parsing XML Schema documents into
51/// a structured [`Schemas`] representation.
52///
53/// It supports resolution of schema references such as `<import>` and `<include>`
54/// using a pluggable [`Resolver`], and can read schema content from strings, files,
55/// or URLs.
56///
57/// Internally, the parser maintains a queue of pending schema loads and a cache
58/// to prevent duplicate resolutions. Once all schemas are processed, the
59/// [`finish`](Self::finish) method returns the final [`Schemas`] collection.
60///
61/// A generic resolver type `TResolver` controls how external schemas are fetched.
62/// By default, a no-op resolver is used, but file-based or custom resolvers can
63/// be injected using [`with_resolver`](Self::with_resolver).
64#[must_use]
65#[derive(Default, Debug)]
66pub struct Parser<TResolver = NoOpResolver> {
67    cache: HashSet<Url>,
68    schemas: Schemas,
69    pending: VecDeque<ResolveRequest>,
70
71    resolver: TResolver,
72    resolve_includes: bool,
73}
74
75impl Parser {
76    /// Create a new [`Parser`] instance.
77    pub fn new() -> Self {
78        Self::default()
79    }
80}
81
82impl<TResolver> Parser<TResolver> {
83    /// Set the default resolver for this parser.
84    ///
85    /// The default resolver is just a simple [`FileResolver`].
86    pub fn with_default_resolver(self) -> Parser<FileResolver> {
87        self.with_resolver(FileResolver)
88    }
89
90    /// Set a custom defined resolver for this parser.
91    pub fn with_resolver<XResolver: Resolver + 'static>(
92        self,
93        resolver: XResolver,
94    ) -> Parser<XResolver> {
95        let Self { schemas, .. } = self;
96
97        let cache = HashSet::new();
98        let pending = VecDeque::new();
99
100        Parser {
101            cache,
102            schemas,
103            pending,
104
105            resolver,
106            resolve_includes: true,
107        }
108    }
109
110    /// Enable or disable resolving includes of parsed XML schemas.
111    pub fn resolve_includes(mut self, value: bool) -> Self {
112        self.resolve_includes = value;
113
114        self
115    }
116
117    /// Finish the parsing process by returning the generated [`Schemas`] instance
118    /// containing all parsed schemas.
119    pub fn finish(self) -> Schemas {
120        self.schemas
121    }
122}
123
124impl<TResolver> Parser<TResolver>
125where
126    TResolver: Resolver,
127{
128    /// Add the default namespaces to this parser.
129    ///
130    /// The default namespaces are:
131    /// - [`NamespacePrefix::XS`] [`Namespace::XS`]
132    /// - [`NamespacePrefix::XML`] [`Namespace::XML`]
133    ///
134    /// # Errors
135    ///
136    /// Forwards the errors from [`with_namespace`](Self::with_namespace).
137    pub fn with_default_namespaces(self) -> Self {
138        self.with_namespace(NamespacePrefix::XS, Namespace::XS)
139            .with_namespace(NamespacePrefix::XML, Namespace::XML)
140    }
141
142    /// Add a new namespace to this parser.
143    ///
144    /// This method will add a new namespace to the parser. This can be useful to
145    /// pre-heat the prefixes for known namespace, or to define namespaces for
146    /// custom defined types.
147    ///
148    /// This will not add any schema information. It's just a namespace definition.
149    ///
150    /// # Errors
151    ///
152    /// Will return an error if a problem or mismatch with the already existing
153    /// namespaces was encountered.
154    pub fn with_namespace(mut self, prefix: NamespacePrefix, namespace: Namespace) -> Self {
155        self.schemas
156            .get_or_create_namespace_info_mut(Some(prefix), Some(namespace));
157
158        self
159    }
160}
161
162impl<TResolver> Parser<TResolver>
163where
164    TResolver: Resolver,
165    TResolver::Buffer: BufRead,
166{
167    /// Add a new XML schema from the passed string.
168    ///
169    /// This will parse the XML schema represented by the provided string and add
170    /// all schema information to the resulting [`Schemas`] structure.
171    ///
172    /// # Errors
173    ///
174    /// Will return an suitable error if the parser could not parse the provided
175    /// schema.
176    #[instrument(err, level = "trace", skip(self, schema))]
177    pub fn add_schema_from_str(self, schema: &str) -> Result<Self, Error<TResolver::Error>> {
178        self.add_named_schema_from_str_impl(None, schema)
179    }
180
181    /// Add a new XML schema from the passed string.
182    ///
183    /// This will parse the XML schema represented by the provided string and add
184    /// all schema information to the resulting [`Schemas`] structure using the
185    /// passed `name` for the schema.
186    ///
187    /// # Errors
188    ///
189    /// Will return an suitable error if the parser could not parse the provided
190    /// schema.
191    #[instrument(err, level = "trace", skip(self, schema))]
192    pub fn add_named_schema_from_str(
193        self,
194        name: String,
195        schema: &str,
196    ) -> Result<Self, Error<TResolver::Error>> {
197        self.add_named_schema_from_str_impl(Some(name), schema)
198    }
199
200    #[instrument(err, level = "trace", skip(self, schema))]
201    fn add_named_schema_from_str_impl(
202        mut self,
203        name: Option<String>,
204        schema: &str,
205    ) -> Result<Self, Error<TResolver::Error>> {
206        let reader = SliceReader::new(schema);
207        let mut reader = SchemaReader::new(reader);
208
209        let schema = Schema::deserialize(&mut reader)?;
210
211        self.add_schema(name, schema, None, &reader.namespaces);
212        self.resolve_pending()?;
213
214        Ok(self)
215    }
216
217    /// Add a new XML schema from the passed `reader`.
218    ///
219    /// This will parse the XML schema represented by the provided reader and add
220    /// all schema information to the resulting [`Schemas`] structure.
221    ///
222    /// # Errors
223    ///
224    /// Will return an suitable error if the parser could not read the data from
225    /// the reader, or parse the schema provided by the reader.
226    pub fn add_schema_from_reader<R: BufRead>(
227        self,
228        reader: R,
229    ) -> Result<Self, Error<TResolver::Error>> {
230        self.add_named_schema_from_reader_impl(None, reader)
231    }
232
233    /// Add a new XML schema from the passed `reader`.
234    ///
235    /// This will parse the XML schema represented by the provided reader and add
236    /// all schema information to the resulting [`Schemas`] structure using the
237    /// passed `name` as name for the schema.
238    ///
239    /// # Errors
240    ///
241    /// Will return an suitable error if the parser could not read the data from
242    /// the reader, or parse the schema provided by the reader.
243    pub fn add_named_schema_from_reader<R: BufRead>(
244        self,
245        name: String,
246        reader: R,
247    ) -> Result<Self, Error<TResolver::Error>> {
248        self.add_named_schema_from_reader_impl(Some(name), reader)
249    }
250
251    #[instrument(err, level = "trace", skip(self, reader))]
252    fn add_named_schema_from_reader_impl<R: BufRead>(
253        mut self,
254        name: Option<String>,
255        reader: R,
256    ) -> Result<Self, Error<TResolver::Error>> {
257        let reader = IoReader::new(reader);
258        let mut reader = SchemaReader::new(reader);
259
260        let schema = Schema::deserialize(&mut reader)?;
261
262        self.add_schema(name, schema, None, &reader.namespaces);
263        self.resolve_pending()?;
264
265        Ok(self)
266    }
267
268    /// Add a new XML schema from the passed file `path`.
269    ///
270    /// This will parse the XML schema represented by the provided filepath and
271    /// add all schema information to the resulting [`Schemas`] structure.
272    ///
273    /// # Errors
274    ///
275    /// Will return an suitable error if the parser could not read the data from
276    /// the file, or parse the schema content.
277    #[instrument(err, level = "trace", skip(self))]
278    pub fn add_schema_from_file<P: AsRef<Path> + Debug>(
279        self,
280        path: P,
281    ) -> Result<Self, Error<TResolver::Error>> {
282        let path = path.as_ref().canonicalize()?;
283        let url = Url::from_file_path(&path).map_err(|()| Error::InvalidFilePath(path))?;
284
285        self.add_schema_from_url(url)
286    }
287
288    /// Add multiple XML schemas from the passed paths iterator.
289    ///
290    /// # Errors
291    ///
292    /// Will return an suitable error if the parser could not read the data from
293    /// any file, or parse the schema content.
294    #[instrument(err, level = "trace", skip(self))]
295    pub fn add_schema_from_files<I>(mut self, paths: I) -> Result<Self, Error<TResolver::Error>>
296    where
297        I: IntoIterator + Debug,
298        I::Item: AsRef<Path> + Debug,
299    {
300        for path in paths {
301            self = self.add_schema_from_file(path)?;
302        }
303
304        Ok(self)
305    }
306
307    /// Add a new XML schema from the passed file `url`.
308    ///
309    /// This will parse the XML schema represented by the provided url and
310    /// add all schema information to the resulting [`Schemas`] structure.
311    ///
312    /// # Errors
313    ///
314    /// Will return an suitable error if the parser could not resolve the URL
315    /// using the provided resolver or the data from the resolver could not be
316    /// parsed.
317    #[instrument(err, level = "trace", skip(self))]
318    pub fn add_schema_from_url(mut self, url: Url) -> Result<Self, Error<TResolver::Error>> {
319        let req = ResolveRequest::new(url, ResolveRequestType::UserDefined);
320
321        self.resolve_location(req)?;
322        self.resolve_pending()?;
323
324        Ok(self)
325    }
326
327    fn add_pending(&mut self, req: ResolveRequest) {
328        tracing::debug!("Add pending resolve request: {req:#?}");
329
330        self.pending.push_back(req);
331    }
332
333    fn resolve_pending(&mut self) -> Result<(), Error<TResolver::Error>> {
334        while let Some(req) = self.pending.pop_front() {
335            self.resolve_location(req)?;
336        }
337
338        Ok(())
339    }
340
341    #[instrument(err, level = "trace", skip(self))]
342    fn resolve_location(&mut self, req: ResolveRequest) -> Result<(), Error<TResolver::Error>> {
343        tracing::debug!("Process resolve request: {req:#?}");
344
345        let Some((name, location, buffer)) =
346            self.resolver.resolve(&req).map_err(Error::resolver)?
347        else {
348            return Err(Error::UnableToResolve(Box::new(req)));
349        };
350        if self.cache.contains(&location) {
351            return Ok(());
352        }
353
354        let reader = IoReader::new(buffer);
355        let reader = SchemaReader::new(reader);
356        let mut reader = reader.with_error_info();
357
358        let mut schema = Schema::deserialize(&mut reader)?;
359
360        if schema.target_namespace.is_none()
361            && ResolveRequestType::IncludeRequest == req.request_type
362        {
363            if let Some(current_ns) = req.current_ns {
364                let inherited_ns = current_ns.to_string();
365                schema.target_namespace = Some(inherited_ns);
366            }
367        }
368
369        let reader = reader.into_inner();
370
371        self.add_schema(name, schema, Some(location.clone()), &reader.namespaces);
372        self.cache.insert(location);
373
374        Ok(())
375    }
376
377    fn add_schema(
378        &mut self,
379        name: Option<String>,
380        schema: Schema,
381        location: Option<Url>,
382        namespaces: &Namespaces,
383    ) {
384        tracing::debug!(
385            "Process schema (location={:?}, target_namespace={:?}",
386            location.as_ref().map(Url::as_str),
387            &schema.target_namespace
388        );
389
390        let target_ns = schema
391            .target_namespace
392            .as_deref()
393            .map(|ns| Namespace::from(ns.as_bytes().to_owned()));
394        let prefix = namespaces.get(&target_ns).cloned().flatten();
395
396        if self.resolve_includes {
397            for content in &schema.content {
398                match content {
399                    SchemaContent::Import(x) => {
400                        if let Some(req) = import_req(x, target_ns.clone(), location.as_ref()) {
401                            self.add_pending(req);
402                        }
403                    }
404                    SchemaContent::Include(x) => {
405                        self.add_pending(include_req(x, target_ns.clone(), location.as_ref()));
406                    }
407                    _ => (),
408                }
409            }
410        }
411
412        self.schemas
413            .add_schema(prefix, target_ns, name, schema, location);
414    }
415}
416
417struct SchemaReader<R> {
418    inner: R,
419    namespaces: Namespaces,
420}
421
422type Namespaces = BTreeMap<Option<Namespace>, Option<NamespacePrefix>>;
423
424impl<R> SchemaReader<R> {
425    fn new(inner: R) -> Self {
426        Self {
427            inner,
428            namespaces: BTreeMap::new(),
429        }
430    }
431}
432
433impl<R> XmlReader for SchemaReader<R>
434where
435    R: XmlReader,
436{
437    fn resolve<'n>(&self, name: QName<'n>, attribute: bool) -> (ResolveResult<'_>, LocalName<'n>) {
438        self.inner.resolve(name, attribute)
439    }
440
441    fn namespaces(&self) -> NamespacesShared<'static> {
442        self.inner.namespaces()
443    }
444
445    fn current_position(&self) -> u64 {
446        self.inner.current_position()
447    }
448
449    fn error_position(&self) -> u64 {
450        self.inner.error_position()
451    }
452}
453
454impl<'a, R> XmlReaderSync<'a> for SchemaReader<R>
455where
456    R: XmlReaderSync<'a>,
457{
458    fn read_event(&mut self) -> Result<Event<'a>, QuickXmlError> {
459        let event = self.inner.read_event()?;
460
461        if let Event::Start(x) | Event::Empty(x) = &event {
462            for a in x.attributes() {
463                let a = a?;
464                if matches!(a.key.prefix(), Some(x) if x.as_ref() == b"xmlns") {
465                    let prefix = NamespacePrefix::new(a.key.local_name().as_ref().to_owned());
466                    let namespace = Namespace::new(a.value.into_owned());
467
468                    self.namespaces
469                        .entry(Some(namespace))
470                        .or_insert(Some(prefix));
471                }
472            }
473        }
474
475        Ok(event)
476    }
477}
478
479fn import_req(
480    import: &Import,
481    current_ns: Option<Namespace>,
482    current_location: Option<&Url>,
483) -> Option<ResolveRequest> {
484    let location = import.schema_location.as_ref()?;
485
486    let mut req = ResolveRequest::new(location, ResolveRequestType::ImportRequest);
487
488    if let Some(ns) = current_ns {
489        req = req.current_ns(ns);
490    }
491
492    if let Some(ns) = &import.namespace {
493        req = req.requested_ns(Namespace::from(ns.as_bytes().to_owned()));
494    }
495
496    if let Some(current_location) = current_location {
497        req = req.current_location(current_location.clone());
498    }
499
500    Some(req)
501}
502
503fn include_req(
504    include: &Include,
505    current_ns: Option<Namespace>,
506    current_location: Option<&Url>,
507) -> ResolveRequest {
508    let mut req = ResolveRequest::new(&include.schema_location, ResolveRequestType::IncludeRequest);
509
510    if let Some(ns) = current_ns {
511        req = req.current_ns(ns);
512    }
513
514    if let Some(current_location) = current_location {
515        req = req.current_location(current_location.clone());
516    }
517
518    req
519}