xsd_parser/pipeline/parser/
mod.rs

1//! XML Schema (XSD) parser and resolver infrastructure.
2//!
3//! This module defines the [`Parser`] type and supporting logic for loading,
4//! resolving, and parsing XML Schema documents (`.xsd`) into structured [`Schemas`].
5//!
6//! The parser supports various input sources (e.g., files, strings, URLs) and
7//! handles `<import>` and `<include>` logic using pluggable [`Resolver`] implementations.
8//!
9//! Parsed schemas can be passed to the [`Interpreter`](crate::Interpreter) for
10//! further transformation into semantic types.
11//!
12//! # Example
13//! ```rust,ignore
14//! let schemas = Parser::new()
15//!     .with_default_resolver()
16//!     .add_schema_from_file("schema.xsd")?
17//!     .finish();
18//! ```
19
20pub mod resolver;
21
22mod error;
23
24use std::borrow::Cow;
25use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet, VecDeque};
26use std::fmt::Debug;
27use std::io::BufRead;
28use std::path::Path;
29
30use quick_xml::events::Event;
31use resolver::{FileResolver, NoOpResolver, ResolveRequest};
32use tracing::instrument;
33use url::Url;
34
35use xsd_parser_types::misc::{Namespace, NamespacePrefix};
36use xsd_parser_types::quick_xml::{
37    DeserializeSync, Error as QuickXmlError, IoReader, SliceReader, XmlReader, XmlReaderSync,
38};
39
40use crate::models::schema::{
41    xs::{Import, Include, Schema, SchemaContent},
42    NamespaceId, NamespaceInfo, Schemas,
43};
44use crate::models::schema::{SchemaId, SchemaInfo};
45use crate::pipeline::parser::resolver::ResolveRequestType;
46
47pub use self::error::Error;
48pub use self::resolver::Resolver;
49
/// The [`Parser`] is responsible for loading and parsing XML Schema documents into
/// a structured [`Schemas`] representation.
///
/// It supports resolution of schema references such as `<import>` and `<include>`
/// using a pluggable [`Resolver`], and can read schema content from strings, files,
/// or URLs.
///
/// Internally, the parser maintains a queue of pending schema loads and a cache
/// to prevent duplicate resolutions. Once all schemas are processed, the
/// [`finish`](Self::finish) method returns the final [`Schemas`] collection.
///
/// A generic resolver type `TResolver` controls how external schemas are fetched.
/// By default, a no-op resolver is used, but file-based or custom resolvers can
/// be injected using [`with_resolver`](Self::with_resolver).
///
/// Note that the derived [`Default`] implementation leaves the three boolean
/// options at `false`, while [`with_resolver`](Self::with_resolver) (and with it
/// [`with_default_resolver`](Self::with_default_resolver)) sets all of them to
/// `true`.
#[must_use]
#[derive(Default, Debug)]
pub struct Parser<TResolver = NoOpResolver> {
    /// Locations of the schemas that were already loaded through the resolver.
    /// A resolved location that is found in here is not parsed a second time.
    cache: HashSet<Url>,
    /// Namespaces and schemas collected so far, in the order they were added.
    /// This list is turned into the final [`Schemas`] by [`finish`](Self::finish).
    entries: Vec<ParserEntry>,
    /// Resolve requests raised by `<import>` and `<include>` elements that still
    /// need to be processed. Requests are handled front to back.
    pending: VecDeque<ResolveRequest>,

    /// Resolver used to turn a [`ResolveRequest`] into readable schema content.
    resolver: TResolver,
    /// Whether `<import>` and `<include>` elements of a parsed schema are queued
    /// for resolution.
    resolve_includes: bool,
    /// Whether a unique prefix is generated for namespaces that could not get a
    /// regular prefix (forwarded to the schemas builder in `finish`).
    generate_prefixes: bool,
    /// Whether prefixes known from other schemas may be used as a fallback
    /// (forwarded to the schemas builder in `finish`).
    alternative_prefixes: bool,
}
76
/// A single item collected by the [`Parser`] before the final [`Schemas`]
/// structure is built.
#[derive(Debug)]
// NOTE(review): presumably allowed because `Schema` is much larger than
// `Namespace`; confirm before boxing the variant.
#[allow(clippy::large_enum_variant)]
enum ParserEntry {
    /// A namespace that was registered explicitly (without any schema content).
    Namespace {
        /// Prefix requested for the namespace.
        prefix: NamespacePrefix,
        /// The namespace itself.
        namespace: Namespace,
    },
    /// A schema that was parsed from a string, reader or resolved location.
    Schema {
        /// Optional name of the schema.
        name: Option<String>,
        /// The deserialized schema document.
        schema: Schema,
        /// Location the schema was loaded from; `None` for schemas that were
        /// added from a string or reader.
        location: Option<Url>,
        /// Target namespace of the schema, if it has one.
        target_ns: Option<Namespace>,
        /// All `xmlns:<prefix>` declarations seen while reading the schema.
        namespaces: Namespaces,
    },
}
92
/// Helper that converts the collected [`ParserEntry`] items into the final
/// [`Schemas`] structure and assigns the namespace prefixes.
#[derive(Debug)]
struct SchemasBuilder {
    /// The schemas structure that is filled step by step.
    schemas: Schemas,
    /// Prefix candidates (main and alternative ones) known for each namespace.
    prefix_cache: HashMap<Option<Namespace>, PrefixEntry>,

    /// Whether to generate a prefix for namespaces that are still without one
    /// after the regular and alternative prefixes were tried.
    generate_prefixes: bool,
    /// Whether alternative prefixes may be used as fallback for namespaces that
    /// could not get their main prefix.
    alternative_prefixes: bool,
}
101
/// Prefix candidates collected for a single namespace.
#[derive(Default, Debug)]
struct PrefixEntry {
    /// Preferred prefix of the namespace, if one is known.
    prefix: Option<NamespacePrefix>,
    /// Further prefixes that were seen for the namespace in any parsed schema.
    alt_prefixes: HashSet<NamespacePrefix>,
}
107
108impl Parser {
109    /// Create a new [`Parser`] instance.
110    pub fn new() -> Self {
111        Self::default()
112    }
113}
114
115impl<TResolver> Parser<TResolver> {
116    /// Set the default resolver for this parser.
117    ///
118    /// The default resolver is just a simple [`FileResolver`].
119    pub fn with_default_resolver(self) -> Parser<FileResolver> {
120        self.with_resolver(FileResolver)
121    }
122
123    /// Set a custom defined resolver for this parser.
124    pub fn with_resolver<XResolver: Resolver + 'static>(
125        self,
126        resolver: XResolver,
127    ) -> Parser<XResolver> {
128        let Self { entries, .. } = self;
129
130        let cache = HashSet::new();
131        let pending = VecDeque::new();
132
133        Parser {
134            cache,
135            entries,
136            pending,
137
138            resolver,
139            resolve_includes: true,
140            generate_prefixes: true,
141            alternative_prefixes: true,
142        }
143    }
144
145    /// Enable or disable resolving includes of parsed XML schemas.
146    pub fn resolve_includes(mut self, value: bool) -> Self {
147        self.resolve_includes = value;
148
149        self
150    }
151
152    /// Instructs the parser to generate unique prefixes for a certain namespace
153    /// if its actual prefix is already used.
154    pub fn generate_prefixes(mut self, value: bool) -> Self {
155        self.generate_prefixes = value;
156
157        self
158    }
159
160    /// Instructs the parser to use alternate prefixes known from other
161    /// schemas for a certain namespace if its actual prefix is unknown or
162    /// already used.
163    pub fn alternative_prefixes(mut self, value: bool) -> Self {
164        self.alternative_prefixes = value;
165
166        self
167    }
168
169    /// Finish the parsing process by returning the generated [`Schemas`] instance
170    /// containing all parsed schemas.
171    pub fn finish(self) -> Schemas {
172        let builder = SchemasBuilder {
173            schemas: Schemas::default(),
174            prefix_cache: HashMap::new(),
175
176            generate_prefixes: self.generate_prefixes,
177            alternative_prefixes: self.alternative_prefixes,
178        };
179
180        builder.build(self.entries)
181    }
182}
183
184impl<TResolver> Parser<TResolver>
185where
186    TResolver: Resolver,
187{
188    /// Add the default namespaces to this parser.
189    ///
190    /// The default namespaces are:
191    /// - [`NamespacePrefix::XS`] [`Namespace::XS`]
192    /// - [`NamespacePrefix::XML`] [`Namespace::XML`]
193    ///
194    /// # Errors
195    ///
196    /// Forwards the errors from [`with_namespace`](Self::with_namespace).
197    pub fn with_default_namespaces(self) -> Self {
198        self.with_namespace(NamespacePrefix::XS, Namespace::XS)
199            .with_namespace(NamespacePrefix::XML, Namespace::XML)
200    }
201
202    /// Add a new namespace to this parser.
203    ///
204    /// This method will add a new namespace to the parser. This can be useful to
205    /// pre-heat the prefixes for known namespace, or to define namespaces for
206    /// custom defined types.
207    ///
208    /// This will not add any schema information. It's just a namespace definition.
209    ///
210    /// # Errors
211    ///
212    /// Will return an error if a problem or mismatch with the already existing
213    /// namespaces was encountered.
214    pub fn with_namespace(mut self, prefix: NamespacePrefix, namespace: Namespace) -> Self {
215        self.entries
216            .push(ParserEntry::Namespace { prefix, namespace });
217
218        self
219    }
220}
221
222impl<TResolver> Parser<TResolver>
223where
224    TResolver: Resolver,
225    TResolver::Buffer: BufRead,
226{
227    /// Add a new XML schema from the passed string.
228    ///
229    /// This will parse the XML schema represented by the provided string and add
230    /// all schema information to the resulting [`Schemas`] structure.
231    ///
232    /// # Errors
233    ///
234    /// Will return an suitable error if the parser could not parse the provided
235    /// schema.
236    #[instrument(err, level = "trace", skip(self, schema))]
237    pub fn add_schema_from_str(self, schema: &str) -> Result<Self, Error<TResolver::Error>> {
238        self.add_named_schema_from_str_impl(None, schema)
239    }
240
241    /// Add a new XML schema from the passed string.
242    ///
243    /// This will parse the XML schema represented by the provided string and add
244    /// all schema information to the resulting [`Schemas`] structure using the
245    /// passed `name` for the schema.
246    ///
247    /// # Errors
248    ///
249    /// Will return an suitable error if the parser could not parse the provided
250    /// schema.
251    #[instrument(err, level = "trace", skip(self, schema))]
252    pub fn add_named_schema_from_str(
253        self,
254        name: String,
255        schema: &str,
256    ) -> Result<Self, Error<TResolver::Error>> {
257        self.add_named_schema_from_str_impl(Some(name), schema)
258    }
259
260    #[instrument(err, level = "trace", skip(self, schema))]
261    fn add_named_schema_from_str_impl(
262        mut self,
263        name: Option<String>,
264        schema: &str,
265    ) -> Result<Self, Error<TResolver::Error>> {
266        let reader = SliceReader::new(schema);
267        let mut reader = SchemaReader::new(reader);
268
269        let schema = Schema::deserialize(&mut reader)?;
270
271        self.add_schema(name, schema, None, reader.namespaces);
272        self.resolve_pending()?;
273
274        Ok(self)
275    }
276
277    /// Add a new XML schema from the passed `reader`.
278    ///
279    /// This will parse the XML schema represented by the provided reader and add
280    /// all schema information to the resulting [`Schemas`] structure.
281    ///
282    /// # Errors
283    ///
284    /// Will return an suitable error if the parser could not read the data from
285    /// the reader, or parse the schema provided by the reader.
286    pub fn add_schema_from_reader<R: BufRead>(
287        self,
288        reader: R,
289    ) -> Result<Self, Error<TResolver::Error>> {
290        self.add_named_schema_from_reader_impl(None, reader)
291    }
292
293    /// Add a new XML schema from the passed `reader`.
294    ///
295    /// This will parse the XML schema represented by the provided reader and add
296    /// all schema information to the resulting [`Schemas`] structure using the
297    /// passed `name` as name for the schema.
298    ///
299    /// # Errors
300    ///
301    /// Will return an suitable error if the parser could not read the data from
302    /// the reader, or parse the schema provided by the reader.
303    pub fn add_named_schema_from_reader<R: BufRead>(
304        self,
305        name: String,
306        reader: R,
307    ) -> Result<Self, Error<TResolver::Error>> {
308        self.add_named_schema_from_reader_impl(Some(name), reader)
309    }
310
311    #[instrument(err, level = "trace", skip(self, reader))]
312    fn add_named_schema_from_reader_impl<R: BufRead>(
313        mut self,
314        name: Option<String>,
315        reader: R,
316    ) -> Result<Self, Error<TResolver::Error>> {
317        let reader = IoReader::new(reader);
318        let mut reader = SchemaReader::new(reader);
319
320        let schema = Schema::deserialize(&mut reader)?;
321
322        self.add_schema(name, schema, None, reader.namespaces);
323        self.resolve_pending()?;
324
325        Ok(self)
326    }
327
328    /// Add a new XML schema from the passed file `path`.
329    ///
330    /// This will parse the XML schema represented by the provided filepath and
331    /// add all schema information to the resulting [`Schemas`] structure.
332    ///
333    /// # Errors
334    ///
335    /// Will return an suitable error if the parser could not read the data from
336    /// the file, or parse the schema content.
337    #[instrument(err, level = "trace", skip(self))]
338    pub fn add_schema_from_file<P: AsRef<Path> + Debug>(
339        self,
340        path: P,
341    ) -> Result<Self, Error<TResolver::Error>> {
342        let path = path.as_ref().canonicalize()?;
343        let url = Url::from_file_path(&path).map_err(|()| Error::InvalidFilePath(path))?;
344
345        self.add_schema_from_url(url)
346    }
347
348    /// Add multiple XML schemas from the passed paths iterator.
349    ///
350    /// # Errors
351    ///
352    /// Will return an suitable error if the parser could not read the data from
353    /// any file, or parse the schema content.
354    #[instrument(err, level = "trace", skip(self))]
355    pub fn add_schema_from_files<I>(mut self, paths: I) -> Result<Self, Error<TResolver::Error>>
356    where
357        I: IntoIterator + Debug,
358        I::Item: AsRef<Path> + Debug,
359    {
360        for path in paths {
361            self = self.add_schema_from_file(path)?;
362        }
363
364        Ok(self)
365    }
366
367    /// Add a new XML schema from the passed file `url`.
368    ///
369    /// This will parse the XML schema represented by the provided url and
370    /// add all schema information to the resulting [`Schemas`] structure.
371    ///
372    /// # Errors
373    ///
374    /// Will return an suitable error if the parser could not resolve the URL
375    /// using the provided resolver or the data from the resolver could not be
376    /// parsed.
377    #[instrument(err, level = "trace", skip(self))]
378    pub fn add_schema_from_url(mut self, url: Url) -> Result<Self, Error<TResolver::Error>> {
379        let req = ResolveRequest::new(url, ResolveRequestType::UserDefined);
380
381        self.resolve_location(req)?;
382        self.resolve_pending()?;
383
384        Ok(self)
385    }
386
387    fn add_pending(&mut self, req: ResolveRequest) {
388        tracing::debug!("Add pending resolve request: {req:#?}");
389
390        self.pending.push_back(req);
391    }
392
393    fn resolve_pending(&mut self) -> Result<(), Error<TResolver::Error>> {
394        while let Some(req) = self.pending.pop_front() {
395            self.resolve_location(req)?;
396        }
397
398        Ok(())
399    }
400
401    #[instrument(err, level = "trace", skip(self))]
402    fn resolve_location(&mut self, req: ResolveRequest) -> Result<(), Error<TResolver::Error>> {
403        tracing::debug!("Process resolve request: {req:#?}");
404
405        let Some((name, location, buffer)) =
406            self.resolver.resolve(&req).map_err(Error::resolver)?
407        else {
408            return Err(Error::UnableToResolve(Box::new(req)));
409        };
410        if self.cache.contains(&location) {
411            return Ok(());
412        }
413
414        let reader = IoReader::new(buffer);
415        let reader = SchemaReader::new(reader);
416        let mut reader = reader.with_error_info();
417
418        let mut schema = Schema::deserialize(&mut reader)?;
419
420        if schema.target_namespace.is_none()
421            && ResolveRequestType::IncludeRequest == req.request_type
422        {
423            if let Some(current_ns) = req.current_ns {
424                let inherited_ns = current_ns.to_string();
425                schema.target_namespace = Some(inherited_ns);
426            }
427        }
428
429        let reader = reader.into_inner();
430
431        self.add_schema(name, schema, Some(location.clone()), reader.namespaces);
432        self.cache.insert(location);
433
434        Ok(())
435    }
436
437    fn add_schema(
438        &mut self,
439        name: Option<String>,
440        schema: Schema,
441        location: Option<Url>,
442        namespaces: Namespaces,
443    ) {
444        tracing::debug!(
445            "Process schema (location={:?}, target_namespace={:?}",
446            location.as_ref().map(Url::as_str),
447            &schema.target_namespace
448        );
449
450        let target_ns = schema
451            .target_namespace
452            .as_deref()
453            .map(|ns| Namespace::from(ns.as_bytes().to_owned()));
454
455        if self.resolve_includes {
456            for content in &schema.content {
457                match content {
458                    SchemaContent::Import(x) => {
459                        if let Some(req) = import_req(x, target_ns.clone(), location.as_ref()) {
460                            self.add_pending(req);
461                        }
462                    }
463                    SchemaContent::Include(x) => {
464                        self.add_pending(include_req(x, target_ns.clone(), location.as_ref()));
465                    }
466                    _ => (),
467                }
468            }
469        }
470
471        self.entries.push(ParserEntry::Schema {
472            name,
473            schema,
474            location,
475            target_ns,
476            namespaces,
477        });
478    }
479}
480
/// Reader wrapper that forwards all events of the `inner` reader and records
/// the namespace declarations (`xmlns:<prefix>`) it sees on the way.
struct SchemaReader<R> {
    /// The wrapped reader the events are read from.
    inner: R,
    /// Namespace declarations collected so far.
    namespaces: Namespaces,
}
485
/// Maps a namespace to the prefixes that were declared for it, in the order
/// the declarations were encountered.
type Namespaces = BTreeMap<Option<Namespace>, Vec<NamespacePrefix>>;
487
488impl<R> SchemaReader<R> {
489    fn new(inner: R) -> Self {
490        Self {
491            inner,
492            namespaces: BTreeMap::new(),
493        }
494    }
495}
496
497impl<R> XmlReader for SchemaReader<R>
498where
499    R: XmlReader,
500{
501    fn extend_error(&self, error: QuickXmlError) -> QuickXmlError {
502        self.inner.extend_error(error)
503    }
504}
505
506impl<'a, R> XmlReaderSync<'a> for SchemaReader<R>
507where
508    R: XmlReaderSync<'a>,
509{
510    fn read_event(&mut self) -> Result<Event<'a>, QuickXmlError> {
511        let event = self.inner.read_event()?;
512
513        if let Event::Start(x) | Event::Empty(x) = &event {
514            for a in x.attributes() {
515                let a = a?;
516                if matches!(a.key.prefix(), Some(x) if x.as_ref() == b"xmlns") {
517                    let prefix = NamespacePrefix::new(a.key.local_name().as_ref().to_owned());
518                    let namespace = Namespace::new(a.value.into_owned());
519
520                    self.namespaces
521                        .entry(Some(namespace))
522                        .or_default()
523                        .push(prefix);
524                }
525            }
526        }
527
528        Ok(event)
529    }
530}
531
532impl SchemasBuilder {
533    fn build(mut self, entries: Vec<ParserEntry>) -> Schemas {
534        self.build_cache(&entries);
535
536        for entry in entries {
537            match entry {
538                ParserEntry::Namespace { namespace, .. } => {
539                    self.get_or_create_namespace_info_mut(Some(namespace));
540                }
541                ParserEntry::Schema {
542                    name,
543                    schema,
544                    location,
545                    target_ns,
546                    ..
547                } => {
548                    self.add_schema(target_ns, name, location, schema);
549                }
550            }
551        }
552
553        self.determine_prefixes();
554
555        self.schemas
556    }
557
558    fn build_cache(&mut self, entries: &[ParserEntry]) {
559        for entry in entries {
560            match entry {
561                ParserEntry::Namespace { prefix, namespace } => {
562                    self.prefix_cache
563                        .entry(Some(namespace.clone()))
564                        .or_default()
565                        .prefix = Some(prefix.clone());
566                }
567                ParserEntry::Schema {
568                    target_ns,
569                    namespaces,
570                    ..
571                } => {
572                    let prefix = namespaces
573                        .get(target_ns)
574                        .and_then(|prefixes| prefixes.first())
575                        .cloned();
576                    let entry = self.prefix_cache.entry(target_ns.clone()).or_default();
577
578                    if entry.prefix.is_none() {
579                        entry.prefix = prefix;
580                    } else if let Some(prefix) = prefix {
581                        entry.alt_prefixes.insert(prefix);
582                    }
583
584                    for (namespace, prefixes) in namespaces {
585                        for prefix in prefixes {
586                            self.prefix_cache
587                                .entry(namespace.clone())
588                                .or_default()
589                                .alt_prefixes
590                                .insert(prefix.clone());
591                        }
592                    }
593                }
594            }
595        }
596    }
597
598    fn add_schema(
599        &mut self,
600        namespace: Option<Namespace>,
601        name: Option<String>,
602        location: Option<Url>,
603        schema: Schema,
604    ) {
605        let schema_id = SchemaId(self.schemas.next_schema_id);
606        self.schemas.next_schema_id = self.schemas.next_schema_id.wrapping_add(1);
607
608        let (namespace_id, namespace_info) = self.get_or_create_namespace_info_mut(namespace);
609        namespace_info.schemas.push(schema_id);
610
611        match self.schemas.schemas.entry(schema_id) {
612            Entry::Vacant(e) => e.insert(SchemaInfo {
613                name,
614                schema,
615                location,
616                namespace_id,
617            }),
618            Entry::Occupied(_) => crate::unreachable!(),
619        };
620    }
621
    /// Look up the id and the info of the passed `namespace`, creating both if
    /// the namespace is not known yet.
    ///
    /// Returns the id of the namespace together with a mutable reference to its
    /// [`NamespaceInfo`].
    fn get_or_create_namespace_info_mut(
        &mut self,
        namespace: Option<Namespace>,
    ) -> (NamespaceId, &mut NamespaceInfo) {
        match self.schemas.known_namespaces.entry(namespace) {
            Entry::Occupied(e) => {
                // Known namespace: the info is looked up by the stored id and
                // is expected to exist (the lookup panics otherwise).
                let id = *e.get();
                let info = self.schemas.namespace_infos.get_mut(&id).unwrap();

                (id, info)
            }
            Entry::Vacant(e) => {
                // Unknown namespace: take the next id from the running counter
                // (wrapping around on overflow).
                let id = NamespaceId(self.schemas.next_namespace_id);
                self.schemas.next_namespace_id = self.schemas.next_namespace_id.wrapping_add(1);

                // Keep a copy of the key for the info before the entry is
                // consumed by the insert.
                let namespace = e.key().clone();
                e.insert(id);

                let info = match self.schemas.namespace_infos.entry(id) {
                    Entry::Vacant(e) => e.insert(NamespaceInfo::new(namespace)),
                    Entry::Occupied(_) => crate::unreachable!(),
                };

                (id, info)
            }
        }
    }
649
650    fn determine_prefixes(&mut self) {
651        // Insert main prefixes
652        for (id, info) in &mut self.schemas.namespace_infos {
653            if info.prefix.is_some() {
654                continue;
655            }
656
657            let entry = &mut self.prefix_cache.get(&info.namespace).unwrap();
658            if let Some(prefix) = &entry.prefix {
659                if let Entry::Vacant(e) = self.schemas.known_prefixes.entry(prefix.clone()) {
660                    info.prefix = Some(e.key().clone());
661                    e.insert(*id);
662                }
663            }
664        }
665
666        // Fallback to alternate prefixes
667        if self.alternative_prefixes {
668            for (id, info) in &mut self.schemas.namespace_infos {
669                if info.prefix.is_some() {
670                    continue;
671                }
672
673                let entry = &mut self.prefix_cache.get(&info.namespace).unwrap();
674                for alt in &entry.alt_prefixes {
675                    if let Entry::Vacant(e) = self.schemas.known_prefixes.entry(alt.clone()) {
676                        info.prefix = Some(e.key().clone());
677                        e.insert(*id);
678                    }
679                }
680            }
681        }
682
683        // Fallback to generated prefix
684        if self.generate_prefixes {
685            for (id, info) in &mut self.schemas.namespace_infos {
686                if info.prefix.is_some() {
687                    continue;
688                }
689
690                let entry = &mut self.prefix_cache.get(&info.namespace).unwrap();
691                let prefix = entry
692                    .prefix
693                    .clone()
694                    .or_else(|| entry.alt_prefixes.iter().next().cloned());
695                if let Some(prefix) = prefix {
696                    let ext = format!("_{}", id.0);
697                    let ext = ext.as_bytes();
698
699                    let mut p = prefix.0.into_owned();
700                    p.extend_from_slice(ext);
701
702                    let prefix = NamespacePrefix(Cow::Owned(p));
703                    self.schemas.known_prefixes.insert(prefix, *id);
704                }
705            }
706        }
707    }
708}
709
710fn import_req(
711    import: &Import,
712    current_ns: Option<Namespace>,
713    current_location: Option<&Url>,
714) -> Option<ResolveRequest> {
715    let location = import.schema_location.as_ref()?;
716
717    let mut req = ResolveRequest::new(location, ResolveRequestType::ImportRequest);
718
719    if let Some(ns) = current_ns {
720        req = req.current_ns(ns);
721    }
722
723    if let Some(ns) = &import.namespace {
724        req = req.requested_ns(Namespace::from(ns.as_bytes().to_owned()));
725    }
726
727    if let Some(current_location) = current_location {
728        req = req.current_location(current_location.clone());
729    }
730
731    Some(req)
732}
733
734fn include_req(
735    include: &Include,
736    current_ns: Option<Namespace>,
737    current_location: Option<&Url>,
738) -> ResolveRequest {
739    let mut req = ResolveRequest::new(&include.schema_location, ResolveRequestType::IncludeRequest);
740
741    if let Some(ns) = current_ns {
742        req = req.current_ns(ns);
743    }
744
745    if let Some(current_location) = current_location {
746        req = req.current_location(current_location.clone());
747    }
748
749    req
750}