1pub mod resolver;
21
22mod error;
23
24use std::borrow::Cow;
25use std::collections::{btree_map::Entry, BTreeMap, HashMap, HashSet, VecDeque};
26use std::fmt::Debug;
27use std::io::BufRead;
28use std::mem::take;
29use std::path::Path;
30
31use quick_xml::events::Event;
32use resolver::{FileResolver, NoOpResolver, ResolveRequest};
33use tracing::instrument;
34use url::Url;
35
36use xsd_parser_types::misc::{Namespace, NamespacePrefix};
37use xsd_parser_types::quick_xml::{
38 DeserializeSync, Error as QuickXmlError, IoReader, SliceReader, XmlReader, XmlReaderSync,
39};
40
41use crate::models::schema::{
42 xs::{Import, Schema, SchemaContent},
43 NamespaceId, NamespaceInfo, Schemas,
44};
45use crate::models::schema::{Dependency, SchemaId, SchemaInfo};
46use crate::pipeline::parser::resolver::ResolveRequestType;
47
48pub use self::error::{Error, XmlErrorWithLocation};
49pub use self::resolver::Resolver;
50
/// Collects XML schemas (from strings, readers, files or URLs) together with
/// explicitly registered namespaces, and turns them into a [`Schemas`]
/// instance once [`Parser::finish`] is called.
///
/// `TResolver` is the resolver used to load schemas by location. It defaults
/// to [`NoOpResolver`].
#[must_use]
#[derive(Default, Debug)]
pub struct Parser<TResolver = NoOpResolver> {
    /// Resolved schema locations that were already parsed, each mapped to the
    /// temporary ids that were requested for that location.
    cache: HashMap<Url, HashSet<TempSchemaId>>,
    /// Namespaces and schemas in the order they were added to the parser.
    entries: Vec<ParserEntry>,
    /// Resolve requests that were queued but not processed yet.
    pending: VecDeque<(TempSchemaId, ResolveRequest)>,
    /// Temporary schema id that is handed out next.
    next_temp_id: TempSchemaId,

    /// Resolver used to turn a resolve request into schema content.
    resolver: TResolver,
    /// Whether imports, includes, redefines and overrides found in a schema
    /// are queued for resolution.
    resolve_includes: bool,
    /// Forwarded to the schemas builder: generate a prefix for namespaces that
    /// could not get one of their declared prefixes.
    generate_prefixes: bool,
    /// Forwarded to the schemas builder: fall back to alternative prefixes
    /// when the primary prefix of a namespace is not available.
    alternative_prefixes: bool,
}
78
/// Temporary schema identifier handed out by the parser while schemas are
/// loaded. It is mapped to a final `SchemaId` when the schemas are built.
#[derive(Default, Debug, Clone, Copy, Hash, Eq, PartialEq, Ord, PartialOrd)]
struct TempSchemaId(usize);
81
/// A single item that was added to the parser, kept in insertion order.
#[derive(Debug)]
#[allow(clippy::large_enum_variant)]
enum ParserEntry {
    /// The anonymous namespace (no namespace URI) was registered.
    AnonymousNamespace,
    /// A namespace was registered explicitly with the given prefix.
    Namespace {
        prefix: NamespacePrefix,
        namespace: Namespace,
    },
    /// A schema that was parsed successfully.
    Schema {
        /// Temporary id the schema was loaded under.
        id: TempSchemaId,
        /// Optional user or resolver supplied name of the schema.
        name: Option<String>,
        /// The deserialized schema.
        schema: Schema,
        /// Resolved location of the schema; `None` for schemas added from a
        /// string or reader.
        location: Option<Url>,
        /// Target namespace of the schema, if it has one.
        target_ns: Option<Namespace>,
        /// `xmlns:<prefix>` declarations collected while reading the schema.
        namespaces: Namespaces,
        /// Schemas referenced by this schema, keyed by the requested location.
        dependencies: BTreeMap<String, Dependency<TempSchemaId>>,
    },
}
100
/// Helper that converts the entries collected by the [`Parser`] into the
/// final [`Schemas`] structure.
#[derive(Debug)]
struct SchemasBuilder {
    /// The result that is being assembled.
    schemas: Schemas,
    /// Entries taken over from the parser.
    entries: Vec<ParserEntry>,

    /// Maps temporary schema ids to the final schema ids.
    id_cache: HashMap<TempSchemaId, SchemaId>,
    /// Primary and alternative prefixes known for each namespace.
    prefix_cache: HashMap<Option<Namespace>, PrefixEntry>,
    /// The parser's location cache: resolved location to the temporary ids
    /// requested for it.
    location_cache: HashMap<Url, HashSet<TempSchemaId>>,

    /// Whether to generate a prefix for namespaces still lacking one.
    generate_prefixes: bool,
    /// Whether to fall back to alternative prefixes.
    alternative_prefixes: bool,
}
113
/// Prefix candidates collected for a single namespace.
#[derive(Default, Debug)]
struct PrefixEntry {
    /// Preferred prefix of the namespace, if one is known.
    prefix: Option<NamespacePrefix>,
    /// Further prefixes the namespace was declared with.
    alt_prefixes: HashSet<NamespacePrefix>,
}
119
120impl Parser {
121 pub fn new() -> Self {
123 Self::default()
124 }
125}
126
127impl<TResolver> Parser<TResolver> {
128 pub fn with_default_resolver(self) -> Parser<FileResolver> {
132 self.with_resolver(FileResolver)
133 }
134
135 pub fn with_resolver<XResolver: Resolver + 'static>(
137 self,
138 resolver: XResolver,
139 ) -> Parser<XResolver> {
140 let Self { entries, .. } = self;
141
142 let cache = HashMap::new();
143 let pending = VecDeque::new();
144
145 Parser {
146 cache,
147 entries,
148 pending,
149 next_temp_id: TempSchemaId(0),
150
151 resolver,
152 resolve_includes: true,
153 generate_prefixes: true,
154 alternative_prefixes: true,
155 }
156 }
157
158 pub fn resolve_includes(mut self, value: bool) -> Self {
160 self.resolve_includes = value;
161
162 self
163 }
164
165 pub fn generate_prefixes(mut self, value: bool) -> Self {
168 self.generate_prefixes = value;
169
170 self
171 }
172
173 pub fn alternative_prefixes(mut self, value: bool) -> Self {
177 self.alternative_prefixes = value;
178
179 self
180 }
181
182 pub fn finish(self) -> Schemas {
185 let builder = SchemasBuilder {
186 schemas: Schemas::default(),
187 entries: self.entries,
188
189 id_cache: HashMap::new(),
190 prefix_cache: HashMap::new(),
191 location_cache: self.cache,
192
193 generate_prefixes: self.generate_prefixes,
194 alternative_prefixes: self.alternative_prefixes,
195 };
196
197 builder.build()
198 }
199}
200
201impl<TResolver> Parser<TResolver>
202where
203 TResolver: Resolver,
204{
205 pub fn with_default_namespaces(self) -> Self {
216 self.with_anonymous_namespace()
217 .with_namespace(NamespacePrefix::XS, Namespace::XS)
218 .with_namespace(NamespacePrefix::XML, Namespace::XML)
219 .with_namespace(NamespacePrefix::XSI, Namespace::XSI)
220 }
221
222 pub fn with_namespace(mut self, prefix: NamespacePrefix, namespace: Namespace) -> Self {
235 self.entries
236 .push(ParserEntry::Namespace { prefix, namespace });
237
238 self
239 }
240
241 pub fn with_anonymous_namespace(mut self) -> Self {
247 self.entries.push(ParserEntry::AnonymousNamespace);
248
249 self
250 }
251}
252
253impl<TResolver> Parser<TResolver>
254where
255 TResolver: Resolver,
256 TResolver::Buffer: BufRead,
257{
258 #[instrument(err, level = "trace", skip(self, schema))]
268 pub fn add_schema_from_str(self, schema: &str) -> Result<Self, Error<TResolver::Error>> {
269 self.add_named_schema_from_str_impl(None, schema)
270 }
271
272 #[instrument(err, level = "trace", skip(self, schema))]
283 pub fn add_named_schema_from_str(
284 self,
285 name: String,
286 schema: &str,
287 ) -> Result<Self, Error<TResolver::Error>> {
288 self.add_named_schema_from_str_impl(Some(name), schema)
289 }
290
291 #[instrument(err, level = "trace", skip(self, schema))]
292 fn add_named_schema_from_str_impl(
293 mut self,
294 name: Option<String>,
295 schema: &str,
296 ) -> Result<Self, Error<TResolver::Error>> {
297 let reader = SliceReader::new(schema);
298 let mut reader = SchemaReader::new(reader);
299
300 let schema = Schema::deserialize(&mut reader).map_err(XmlErrorWithLocation::from)?;
301 let id = self.temp_schema_id();
302
303 self.add_schema(id, name, schema, None, reader.namespaces);
304 self.resolve_pending()?;
305
306 Ok(self)
307 }
308
309 pub fn add_schema_from_reader<R: BufRead>(
319 self,
320 reader: R,
321 ) -> Result<Self, Error<TResolver::Error>> {
322 self.add_named_schema_from_reader_impl(None, reader)
323 }
324
325 pub fn add_named_schema_from_reader<R: BufRead>(
336 self,
337 name: String,
338 reader: R,
339 ) -> Result<Self, Error<TResolver::Error>> {
340 self.add_named_schema_from_reader_impl(Some(name), reader)
341 }
342
343 #[instrument(err, level = "trace", skip(self, reader))]
344 fn add_named_schema_from_reader_impl<R: BufRead>(
345 mut self,
346 name: Option<String>,
347 reader: R,
348 ) -> Result<Self, Error<TResolver::Error>> {
349 let reader = IoReader::new(reader);
350 let mut reader = SchemaReader::new(reader);
351
352 let schema = Schema::deserialize(&mut reader).map_err(XmlErrorWithLocation::from)?;
353 let id = self.temp_schema_id();
354
355 self.add_schema(id, name, schema, None, reader.namespaces);
356 self.resolve_pending()?;
357
358 Ok(self)
359 }
360
361 #[instrument(err, level = "trace", skip(self))]
371 pub fn add_schema_from_file<P: AsRef<Path> + Debug>(
372 self,
373 path: P,
374 ) -> Result<Self, Error<TResolver::Error>> {
375 let path = path.as_ref().canonicalize()?;
376 let url = Url::from_file_path(&path).map_err(|()| Error::InvalidFilePath(path))?;
377
378 self.add_schema_from_url(url)
379 }
380
381 #[instrument(err, level = "trace", skip(self))]
388 pub fn add_schema_from_files<I>(mut self, paths: I) -> Result<Self, Error<TResolver::Error>>
389 where
390 I: IntoIterator + Debug,
391 I::Item: AsRef<Path> + Debug,
392 {
393 for path in paths {
394 self = self.add_schema_from_file(path)?;
395 }
396
397 Ok(self)
398 }
399
400 #[instrument(err, level = "trace", skip(self))]
411 pub fn add_schema_from_url(mut self, url: Url) -> Result<Self, Error<TResolver::Error>> {
412 let req = ResolveRequest::new(url, ResolveRequestType::UserDefined);
413 let id = self.temp_schema_id();
414
415 self.resolve_location(id, req)?;
416 self.resolve_pending()?;
417
418 Ok(self)
419 }
420
421 fn temp_schema_id(&mut self) -> TempSchemaId {
422 let id = self.next_temp_id;
423
424 self.next_temp_id.0 = self.next_temp_id.0.wrapping_add(1);
425
426 id
427 }
428
429 fn add_pending(&mut self, req: ResolveRequest) -> TempSchemaId {
430 tracing::debug!("Add pending resolve request: {req:#?}");
431
432 let id = self.temp_schema_id();
433
434 self.pending.push_back((id, req));
435
436 id
437 }
438
439 fn resolve_pending(&mut self) -> Result<(), Error<TResolver::Error>> {
440 while let Some((id, req)) = self.pending.pop_front() {
441 self.resolve_location(id, req)?;
442 }
443
444 Ok(())
445 }
446
447 #[instrument(err, level = "trace", skip(self))]
448 fn resolve_location(
449 &mut self,
450 id: TempSchemaId,
451 req: ResolveRequest,
452 ) -> Result<(), Error<TResolver::Error>> {
453 tracing::debug!("Process resolve request: {req:#?}");
454
455 let Some((name, location, buffer)) =
456 self.resolver.resolve(&req).map_err(Error::resolver)?
457 else {
458 return Err(Error::UnableToResolve(Box::new(req)));
459 };
460
461 if let Some(ids) = self.cache.get_mut(&location) {
462 ids.insert(id);
463
464 return Ok(());
465 }
466
467 let reader = IoReader::new(buffer);
468 let reader = SchemaReader::new(reader);
469 let mut reader = reader.with_error_info();
470
471 let mut schema =
472 Schema::deserialize(&mut reader).map_err(|error| XmlErrorWithLocation {
473 error,
474 location: Some(location.clone()),
475 })?;
476
477 if let Some(current_ns) = req.current_ns {
478 if let Some(ns) = schema.target_namespace.as_ref() {
479 if req.request_type == ResolveRequestType::IncludeRequest
480 && ns.as_bytes() != current_ns.as_ref()
481 {
482 return Err(Error::MismatchingTargetNamespace {
483 location,
484 found: Namespace::new(ns.as_bytes().to_vec()),
485 expected: current_ns,
486 });
487 }
488 } else {
489 let inherited_ns = current_ns.to_string();
490 schema.target_namespace = Some(inherited_ns);
491 }
492 }
493
494 let reader = reader.into_inner();
495
496 self.add_schema(id, name, schema, Some(location.clone()), reader.namespaces);
497 self.cache.insert(location, HashSet::from([id]));
498
499 Ok(())
500 }
501
502 fn add_schema(
503 &mut self,
504 id: TempSchemaId,
505 name: Option<String>,
506 schema: Schema,
507 location: Option<Url>,
508 namespaces: Namespaces,
509 ) {
510 tracing::debug!(
511 "Process schema (location={:?}, target_namespace={:?}",
512 location.as_ref().map(Url::as_str),
513 &schema.target_namespace
514 );
515
516 let target_ns = schema
517 .target_namespace
518 .as_deref()
519 .map(|ns| Namespace::from(ns.as_bytes().to_owned()));
520 let mut dependencies = BTreeMap::new();
521
522 if self.resolve_includes {
523 for content in &schema.content {
524 match content {
525 SchemaContent::Import(x) => {
526 if let Some(req) = import_req(x, target_ns.clone(), location.as_ref()) {
527 let location = req.requested_location.clone();
528 let id = self.add_pending(req);
529 dependencies.insert(location, Dependency::Import(id));
530 }
531 }
532 SchemaContent::Include(x) => {
533 let req =
534 include_req(&x.schema_location, target_ns.clone(), location.as_ref());
535 let location = req.requested_location.clone();
536 let id = self.add_pending(req);
537 dependencies.insert(location, Dependency::Include(id));
538 }
539 SchemaContent::Redefine(x) => {
540 let req =
541 include_req(&x.schema_location, target_ns.clone(), location.as_ref());
542 let location = req.requested_location.clone();
543 let id = self.add_pending(req);
544 dependencies.insert(location, Dependency::Redefine(id));
545 }
546 SchemaContent::Override(x) => {
547 let req =
548 include_req(&x.schema_location, target_ns.clone(), location.as_ref());
549 let location = req.requested_location.clone();
550 let id = self.add_pending(req);
551 dependencies.insert(location, Dependency::Override(id));
552 }
553 _ => (),
554 }
555 }
556 }
557
558 self.entries.push(ParserEntry::Schema {
559 id,
560 name,
561 schema,
562 location,
563 target_ns,
564 namespaces,
565 dependencies,
566 });
567 }
568}
569
/// Reader wrapper that forwards all events of the wrapped reader and records
/// the namespace declarations it sees on the way.
struct SchemaReader<R> {
    /// The wrapped reader.
    inner: R,
    /// Namespace declarations collected so far.
    namespaces: Namespaces,
}
574
/// Namespace declarations collected while reading a schema: every namespace
/// mapped to the prefixes it was declared with, in the order they were read.
type Namespaces = BTreeMap<Option<Namespace>, Vec<NamespacePrefix>>;
576
577impl<R> SchemaReader<R> {
578 fn new(inner: R) -> Self {
579 Self {
580 inner,
581 namespaces: BTreeMap::new(),
582 }
583 }
584}
585
586impl<R> XmlReader for SchemaReader<R>
587where
588 R: XmlReader,
589{
590 fn extend_error(&self, error: QuickXmlError) -> QuickXmlError {
591 self.inner.extend_error(error)
592 }
593}
594
595impl<'a, R> XmlReaderSync<'a> for SchemaReader<R>
596where
597 R: XmlReaderSync<'a>,
598{
599 fn read_event(&mut self) -> Result<Event<'a>, QuickXmlError> {
600 let event = self.inner.read_event()?;
601
602 if let Event::Start(x) | Event::Empty(x) = &event {
603 for a in x.attributes() {
604 let a = a?;
605 if matches!(a.key.prefix(), Some(x) if x.as_ref() == b"xmlns") {
606 let prefix = NamespacePrefix::new(a.key.local_name().as_ref().to_owned());
607 let namespace = Namespace::new(a.value.into_owned());
608
609 self.namespaces
610 .entry(Some(namespace))
611 .or_default()
612 .push(prefix);
613 }
614 }
615 }
616
617 Ok(event)
618 }
619}
620
621impl SchemasBuilder {
622 fn build(mut self) -> Schemas {
623 self.build_id_cache();
624 self.build_prefix_cache();
625
626 for entry in take(&mut self.entries) {
627 match entry {
628 ParserEntry::AnonymousNamespace => {
629 self.get_or_create_namespace_info_mut(None);
630 }
631 ParserEntry::Namespace { namespace, .. } => {
632 self.get_or_create_namespace_info_mut(Some(namespace));
633 }
634 ParserEntry::Schema {
635 id,
636 name,
637 schema,
638 location,
639 target_ns,
640 namespaces: _,
641 dependencies,
642 } => {
643 self.add_schema(id, target_ns, name, location, schema, dependencies);
644 }
645 }
646 }
647
648 self.determine_prefixes();
649
650 self.schemas
651 }
652
653 fn build_id_cache(&mut self) {
654 for entry in &self.entries {
655 if let ParserEntry::Schema {
656 id: temp_id,
657 location,
658 ..
659 } = entry
660 {
661 let id = self.schemas.next_schema_id();
662 self.id_cache.insert(*temp_id, id);
663
664 if let Some(alternative_ids) =
665 location.as_ref().and_then(|x| self.location_cache.get(x))
666 {
667 for temp_id in alternative_ids {
668 self.id_cache.insert(*temp_id, id);
669 }
670 }
671 }
672 }
673 }
674
675 fn build_prefix_cache(&mut self) {
676 for entry in &self.entries {
677 match entry {
678 ParserEntry::AnonymousNamespace => {
679 self.prefix_cache.entry(None).or_default();
680 }
681 ParserEntry::Namespace { prefix, namespace } => {
682 self.prefix_cache
683 .entry(Some(namespace.clone()))
684 .or_default()
685 .prefix = Some(prefix.clone());
686 }
687 ParserEntry::Schema {
688 target_ns,
689 namespaces,
690 ..
691 } => {
692 let prefix = namespaces
693 .get(target_ns)
694 .and_then(|prefixes| prefixes.first())
695 .cloned();
696 let entry = self.prefix_cache.entry(target_ns.clone()).or_default();
697
698 if entry.prefix.is_none() {
699 entry.prefix = prefix;
700 } else if let Some(prefix) = prefix {
701 entry.alt_prefixes.insert(prefix);
702 }
703
704 for (namespace, prefixes) in namespaces {
705 for prefix in prefixes {
706 self.prefix_cache
707 .entry(namespace.clone())
708 .or_default()
709 .alt_prefixes
710 .insert(prefix.clone());
711 }
712 }
713 }
714 }
715 }
716 }
717
718 fn add_schema(
719 &mut self,
720 id: TempSchemaId,
721 namespace: Option<Namespace>,
722 name: Option<String>,
723 location: Option<Url>,
724 schema: Schema,
725 dependencies: BTreeMap<String, Dependency<TempSchemaId>>,
726 ) {
727 self.schemas.last_schema_id = self.schemas.last_schema_id.wrapping_add(1);
728 let schema_id = *self.id_cache.get(&id).unwrap();
729
730 let (namespace_id, namespace_info) = self.get_or_create_namespace_info_mut(namespace);
731 namespace_info.schemas.push(schema_id);
732
733 let dependencies = dependencies
734 .into_iter()
735 .filter_map(|(location, dep)| {
736 let id = *self.id_cache.get(&*dep)?;
737 let dep = dep.map(|_| id);
738 Some((location, dep))
739 })
740 .collect();
741
742 match self.schemas.schemas.entry(schema_id) {
743 Entry::Vacant(e) => e.insert(SchemaInfo {
744 name,
745 schema,
746 location,
747 namespace_id,
748 dependencies,
749 }),
750 Entry::Occupied(_) => crate::unreachable!(),
751 };
752 }
753
754 fn get_or_create_namespace_info_mut(
755 &mut self,
756 namespace: Option<Namespace>,
757 ) -> (NamespaceId, &mut NamespaceInfo) {
758 match self.schemas.known_namespaces.entry(namespace) {
759 Entry::Occupied(e) => {
760 let id = *e.get();
761 let info = self.schemas.namespace_infos.get_mut(&id).unwrap();
762
763 (id, info)
764 }
765 Entry::Vacant(e) => {
766 let id = if e.key().is_none() {
767 NamespaceId::ANONYMOUS
768 } else {
769 self.schemas.last_namespace_id = self.schemas.last_namespace_id.wrapping_add(1);
770
771 NamespaceId(self.schemas.last_namespace_id)
772 };
773
774 let namespace = e.key().clone();
775 e.insert(id);
776
777 let info = match self.schemas.namespace_infos.entry(id) {
778 Entry::Vacant(e) => e.insert(NamespaceInfo::new(namespace)),
779 Entry::Occupied(_) => crate::unreachable!(),
780 };
781
782 (id, info)
783 }
784 }
785 }
786
787 fn determine_prefixes(&mut self) {
788 for (id, info) in &mut self.schemas.namespace_infos {
790 if info.prefix.is_some() {
791 continue;
792 }
793
794 let entry = &mut self.prefix_cache.get(&info.namespace).unwrap();
795 if let Some(prefix) = &entry.prefix {
796 if let Entry::Vacant(e) = self.schemas.known_prefixes.entry(prefix.clone()) {
797 info.prefix = Some(e.key().clone());
798 e.insert(*id);
799 }
800 }
801 }
802
803 if self.alternative_prefixes {
805 for (id, info) in &mut self.schemas.namespace_infos {
806 if info.prefix.is_some() {
807 continue;
808 }
809
810 let entry = &mut self.prefix_cache.get(&info.namespace).unwrap();
811 for alt in &entry.alt_prefixes {
812 if let Entry::Vacant(e) = self.schemas.known_prefixes.entry(alt.clone()) {
813 info.prefix = Some(e.key().clone());
814 e.insert(*id);
815 }
816 }
817 }
818 }
819
820 if self.generate_prefixes {
822 for (id, info) in &mut self.schemas.namespace_infos {
823 if info.prefix.is_some() {
824 continue;
825 }
826
827 let entry = &mut self.prefix_cache.get(&info.namespace).unwrap();
828 let prefix = entry
829 .prefix
830 .clone()
831 .or_else(|| entry.alt_prefixes.iter().next().cloned());
832 if let Some(prefix) = prefix {
833 let ext = format!("_{}", id.0);
834 let ext = ext.as_bytes();
835
836 let mut p = prefix.0.into_owned();
837 p.extend_from_slice(ext);
838
839 let prefix = NamespacePrefix(Cow::Owned(p));
840 self.schemas.known_prefixes.insert(prefix, *id);
841 }
842 }
843 }
844 }
845}
846
847fn import_req(
848 import: &Import,
849 current_ns: Option<Namespace>,
850 current_location: Option<&Url>,
851) -> Option<ResolveRequest> {
852 let location = import.schema_location.as_ref()?;
853
854 let mut req = ResolveRequest::new(location, ResolveRequestType::ImportRequest);
855
856 if let Some(ns) = current_ns {
857 req = req.current_ns(ns);
858 }
859
860 if let Some(ns) = &import.namespace {
861 req = req.requested_ns(Namespace::from(ns.as_bytes().to_owned()));
862 }
863
864 if let Some(current_location) = current_location {
865 req = req.current_location(current_location.clone());
866 }
867
868 Some(req)
869}
870
871fn include_req(
872 schema_location: &str,
873 current_ns: Option<Namespace>,
874 current_location: Option<&Url>,
875) -> ResolveRequest {
876 let mut req = ResolveRequest::new(schema_location, ResolveRequestType::IncludeRequest);
877
878 if let Some(ns) = current_ns {
879 req = req.current_ns(ns);
880 }
881
882 if let Some(current_location) = current_location {
883 req = req.current_location(current_location.clone());
884 }
885
886 req
887}