1use std::borrow::Cow;
2use std::collections::{BTreeMap, HashMap, BTreeSet};
3use std::fmt::Display;
4use std::fs::read_to_string;
5use std::path::Path;
6use std::hash::{Hash,DefaultHasher,Hasher};
7
8use roxmltree::{Document, Node, NodeId, ParsingOptions};
9use serde::Deserialize;
10use stam::*;
11use toml;
12use upon::Engine;
13
/// URI of the XML namespace (the namespace the reserved `xml:` prefix is bound to).
const NS_XML: &str = "http://www.w3.org/XML/1998/namespace";
/// URL of the W3C Web Annotation JSON-LD context.
const CONTEXT_ANNO: &str = "http://www.w3.org/ns/anno.jsonld";
16
17
/// Returns the fallback annotation dataset identifier, `urn:stam-fromxml`.
fn default_set() -> String {
    String::from("urn:stam-fromxml")
}
21
/// Top-level configuration for converting XML documents to STAM.
///
/// Can be deserialized from TOML (see `from_toml_str`) or assembled with the
/// `with_*` builder methods.
#[derive(Deserialize)]
pub struct XmlConversionConfig {
    /// Element configurations. Lookup walks this list back-to-front, so later
    /// entries take precedence over earlier ones.
    #[serde(default)]
    elements: Vec<XmlElementConfig>,

    /// Named base configurations that entries in `elements` may refer to via
    /// their `base` list; merged in by `resolve_baseelements`.
    #[serde(default)]
    baseelements: HashMap<String, XmlElementConfig>,

    /// Maps XML namespace prefixes to namespace URIs.
    #[serde(default)]
    namespaces: HashMap<String, String>,

    /// Document-wide whitespace handling; defaults to `Collapse`.
    #[serde(default = "XmlWhitespaceHandling::collapse")]
    whitespace: XmlWhitespaceHandling,

    /// Arbitrary user-supplied key/value pairs.
    /// NOTE(review): presumably exposed to templates as context variables — confirm.
    #[serde(default)]
    context: HashMap<String, toml::Value>,

    /// Metadata configurations.
    #[serde(default)]
    metadata: Vec<MetadataConfig>,

    /// A DTD to prepend to input files that do not carry a `<!DOCTYPE` themselves.
    #[serde(default)]
    inject_dtd: Option<String>,

    /// Identifier of the default annotation dataset; defaults to `urn:stam-fromxml`.
    #[serde(default = "default_set")]
    default_set: String,

    /// Optional prefix for identifiers.
    /// NOTE(review): the code applying it is outside this view — confirm exact use.
    #[serde(default)]
    id_prefix: Option<String>,

    /// Suffixes stripped from a filename when deriving an identifier from it
    /// (see `filename_to_id`).
    #[serde(default)]
    id_strip_suffix: Vec<String>,

    /// Provenance flag.
    /// NOTE(review): presumably enables provenance annotations — confirm against the converter.
    #[serde(default)]
    provenance: bool,

    /// Enables debug output on standard error. Never read from TOML; set via `with_debug`.
    #[serde(skip_deserializing)]
    debug: bool,

}
72
73impl XmlConversionConfig {
74 pub fn new() -> Self {
75 Self {
76 elements: Vec::new(),
77 baseelements: HashMap::new(),
78 namespaces: HashMap::new(),
79 context: HashMap::new(),
80 metadata: Vec::new(),
81 whitespace: XmlWhitespaceHandling::Collapse,
82 default_set: default_set(),
83 inject_dtd: None,
84 id_prefix: None,
85 id_strip_suffix: Vec::new(),
86 provenance: false,
87 debug: false,
88 }
89 }
90
91 pub fn resolve_baseelements(&mut self) -> Result<(), XmlConversionError> {
92 let mut replace: Vec<(usize, XmlElementConfig)> = Vec::new();
93 for (i, element) in self.elements.iter().enumerate() {
94 let mut newelement = None;
95 for basename in element.base.iter().rev() {
96 if let Some(baseelement) = self.baseelements.get(basename) {
97 if newelement.is_none() {
98 newelement = Some(element.clone());
99 }
100 newelement
101 .as_mut()
102 .map(|newelement| newelement.update(baseelement));
103 } else {
104 return Err(XmlConversionError::ConfigError(format!(
105 "No such base element: {}",
106 basename
107 )));
108 }
109 }
110 if let Some(newelement) = newelement {
111 replace.push((i, newelement));
112 }
113 }
114 for (i, element) in replace {
115 self.elements[i] = element;
116 }
117 Ok(())
118 }
119
120 pub fn from_toml_str(tomlstr: &str) -> Result<Self, String> {
122 let mut config: Self = toml::from_str(tomlstr).map_err(|e| format!("{}", e))?;
123 config.resolve_baseelements().map_err(|e| format!("{}", e))?;
124 Ok(config)
125 }
126
127 pub fn with_debug(mut self, value: bool) -> Self {
128 self.debug = value;
129 self
130 }
131
132 pub fn with_provenance(mut self, value: bool) -> Self {
134 self.provenance = value;
135 self
136 }
137
138 pub fn with_prefix(mut self, prefix: impl Into<String>, namespace: impl Into<String>) -> Self {
140 self.namespaces.insert(prefix.into(), namespace.into());
141 self
142 }
143
144 pub fn with_id_prefix(mut self, prefix: impl Into<String>) -> Self {
146 self.id_prefix = Some(prefix.into());
147 self
148 }
149
150 pub fn with_id_strip_suffix(mut self, suffix: impl Into<String>) -> Self {
152 self.id_strip_suffix.push(suffix.into());
153 self
154 }
155
156 pub fn with_inject_dtd(mut self, dtd: impl Into<String>) -> Self {
158 self.inject_dtd = Some(dtd.into());
159 self
160 }
161
162 pub fn with_whitespace(mut self, handling: XmlWhitespaceHandling) -> Self {
164 self.whitespace = handling;
165 self
166 }
167
168 pub fn with_element<F>(mut self, expression: &str, setup: F) -> Self
170 where
171 F: Fn(XmlElementConfig) -> XmlElementConfig,
172 {
173 let expression = XPathExpression::new(expression);
174 let element = setup(XmlElementConfig::new(expression));
175 if self.debug {
176 eprintln!("[STAM fromxml] registered {:?}", element);
177 }
178 self.elements.push(element);
179 self
180 }
181
182 fn element_config(&self, node: Node, path: &NodePath) -> Option<&XmlElementConfig> {
184 for elementconfig in self.elements.iter().rev() {
185 if elementconfig.path.test(path, node, self) {
186 return Some(elementconfig);
187 }
188 }
189 None
190 }
191
192 pub fn add_context(&mut self, key: impl Into<String>, value: toml::Value) {
193 self.context.insert(key.into(), value);
194 }
195
196 pub fn debug(&self) -> bool {
197 self.debug
198 }
199}
200
/// How whitespace in XML text is treated during conversion.
#[derive(Clone, Copy, Debug, PartialEq, Deserialize)]
pub enum XmlWhitespaceHandling {
    /// No handling specified. This is the `Default` value; element configurations
    /// in this state pick up the value of a base element in `XmlElementConfig::update`.
    Unspecified,
    /// NOTE(review): presumably "use the handling of the parent element" — confirm against the converter.
    Inherit,
    /// NOTE(review): presumably keeps whitespace as it appears in the XML — confirm.
    Preserve,
    /// NOTE(review): presumably collapses runs of whitespace — confirm.
    /// This is the default for the top-level configuration.
    Collapse,
}
213
214impl Default for XmlWhitespaceHandling {
215 fn default() -> Self {
216 XmlWhitespaceHandling::Unspecified
217 }
218}
219
220impl XmlWhitespaceHandling {
221 fn collapse() -> Self {
222 XmlWhitespaceHandling::Collapse
223 }
224}
225
/// What kind of annotation (if any) an XML element is converted to.
///
/// NOTE(review): the variant semantics are implemented by the converter outside
/// this view; the per-variant notes below are inferred from the names — confirm.
#[derive(Debug, Clone, Deserialize, PartialEq, Copy, Default)]
pub enum XmlAnnotationHandling {
    /// Nothing specified (the default); may be filled in from a base element.
    #[default]
    Unspecified,

    /// Presumably: produce no annotation for the element.
    None,

    /// Presumably: annotate the element's text span using a text selector.
    TextSelector,

    /// Presumably: annotate the text resource as a whole using a resource selector.
    ResourceSelector,

    /// Presumably: annotate the text between this marker element and the next one.
    TextSelectorBetweenMarkers,
}
244
/// Configuration describing how XML elements matching `path` are converted.
///
/// Fields left unset (`None`, `Unspecified`, or an empty list) can be filled in
/// from base elements; see `XmlElementConfig::update`.
#[derive(Debug, Clone, Deserialize)]
pub struct XmlElementConfig {
    /// Path expression selecting the elements this configuration applies to.
    /// Defaults to the match-anything expression `*`.
    #[serde(default)]
    path: XPathExpression,

    /// What kind of annotation to produce for a matching element.
    #[serde(default)]
    annotation: XmlAnnotationHandling,

    /// Annotation data to attach to the annotation for a matching element.
    #[serde(default)]
    annotationdata: Vec<XmlAnnotationDataConfig>,

    /// Template for text associated with the element as a prefix.
    /// NOTE(review): presumably emitted before the element's own text — confirm.
    #[serde(default)]
    textprefix: Option<String>,

    /// NOTE(review): presumably whether the element's text is extracted — confirm.
    #[serde(default)]
    text: Option<bool>,

    /// Template for text associated with the element as a suffix.
    /// NOTE(review): presumably emitted after the element's own text — confirm.
    #[serde(default)]
    textsuffix: Option<String>,

    /// Annotation data associated with the text prefix.
    #[serde(default)]
    annotatetextprefix: Vec<XmlAnnotationDataConfig>,

    /// Annotation data associated with the text suffix.
    #[serde(default)]
    annotatetextsuffix: Vec<XmlAnnotationDataConfig>,

    /// NOTE(review): presumably whether the element's annotation span includes the text prefix — confirm.
    #[serde(default)]
    include_textprefix: Option<bool>,

    /// NOTE(review): presumably whether the element's annotation span includes the text suffix — confirm.
    #[serde(default)]
    include_textsuffix: Option<bool>,

    /// Names of base elements (keys of `XmlConversionConfig::baseelements`) to derive from.
    #[serde(default)]
    base: Vec<String>,

    /// Template for the identifier associated with a matching element.
    #[serde(default)]
    id: Option<String>,

    /// NOTE(review): presumably stops descending into child elements when true — confirm.
    #[serde(default)]
    stop: Option<bool>,

    /// Whitespace handling for matching elements.
    #[serde(default)]
    whitespace: XmlWhitespaceHandling,
}
303
304impl XmlElementConfig {
305 fn new(expression: XPathExpression) -> Self {
306 Self {
307 path: expression,
308 stop: None,
309 whitespace: XmlWhitespaceHandling::Unspecified,
310 annotation: XmlAnnotationHandling::Unspecified,
311 annotationdata: Vec::new(),
312 base: Vec::new(),
313 id: None,
314 textprefix: None,
315 text: None,
316 textsuffix: None,
317 annotatetextprefix: Vec::new(),
318 annotatetextsuffix: Vec::new(),
319 include_textprefix: None,
320 include_textsuffix: None,
321 }
322 }
323
324 pub fn update(&mut self, base: &XmlElementConfig) {
325 if self.whitespace == XmlWhitespaceHandling::Unspecified
326 && base.whitespace != XmlWhitespaceHandling::Unspecified
327 {
328 self.whitespace = base.whitespace;
329 }
330 if self.annotation == XmlAnnotationHandling::Unspecified
331 && base.annotation != XmlAnnotationHandling::Unspecified
332 {
333 self.annotation = base.annotation;
334 }
335 if self.textprefix.is_none() && base.textprefix.is_some() {
336 self.textprefix = base.textprefix.clone();
337 }
338 if self.text.is_none() && base.text.is_some() {
339 self.text = base.text;
340 }
341 if self.textsuffix.is_none() && base.textsuffix.is_some() {
342 self.textsuffix = base.textsuffix.clone();
343 }
344 if self.id.is_none() && base.id.is_some() {
345 self.id = base.id.clone();
346 }
347 if self.stop.is_none() && base.stop.is_some() {
348 self.stop = base.stop;
349 }
350 for annotationdata in base.annotationdata.iter() {
351 if !self.annotationdata.contains(annotationdata) {
352 self.annotationdata.push(annotationdata.clone());
353 }
354 }
355 if self.annotatetextsuffix.is_empty() && !base.annotatetextsuffix.is_empty() {
356 self.annotatetextsuffix = base.annotatetextsuffix.clone();
357 }
358 if self.annotatetextprefix.is_empty() && !base.annotatetextprefix.is_empty() {
359 self.annotatetextprefix = base.annotatetextprefix.clone();
360 }
361 if self.include_textsuffix.is_none() {
362 self.include_textsuffix = base.include_textsuffix;
363 }
364 if self.include_textprefix.is_none() {
365 self.include_textprefix = base.include_textprefix;
366 }
367 }
368
369
370 pub fn with_stop(mut self, stop: bool) -> Self {
372 self.stop = Some(stop);
373 self
374 }
375
376 pub fn with_whitespace(mut self, handling: XmlWhitespaceHandling) -> Self {
378 self.whitespace = handling;
379 self
380 }
381
382 pub fn with_text(mut self, text: bool) -> Self {
383 self.text = Some(text);
384 self
385 }
386
387 pub fn with_base(mut self, iter: impl Iterator<Item = impl Into<String>>) -> Self {
388 self.base = iter.into_iter().map(|s| s.into()).collect();
389 self
390 }
391
392 pub fn without_text(mut self) -> Self {
393 self.text = None;
394 self
395 }
396
397 pub fn with_annotation(mut self, annotation: XmlAnnotationHandling) -> Self {
398 self.annotation = annotation;
399 self
400 }
401
402 fn hash(&self) -> usize {
404 self.path.0.as_ptr() as usize
405 }
406}
407
408impl PartialEq for XmlElementConfig {
409 fn eq(&self, other: &Self) -> bool {
410 self.hash() == other.hash()
411 }
412}
413
/// Configuration for one piece of annotation data.
///
/// `id`, `set` and `key` are registered as templates by the converter's `compile`
/// step; `value` is handed to `compile_value` there.
#[derive(Debug, Clone, Deserialize, PartialEq)]
pub struct XmlAnnotationDataConfig {
    /// Template for the identifier of the annotation data.
    id: Option<String>,
    /// Template for the annotation dataset.
    set: Option<String>,
    /// Template for the key.
    key: Option<String>,
    /// The value, as a TOML value.
    value: Option<toml::Value>,

    /// NOTE(review): presumably permits data whose value is empty — confirm against the converter.
    #[serde(default)]
    allow_empty_value: bool,

    /// NOTE(review): presumably skips this data when a variable it needs is missing — confirm.
    #[serde(default)]
    skip_if_missing: bool,
}
433
434impl XmlAnnotationDataConfig {
435 pub fn with_id(mut self, id: impl Into<String>) -> Self {
436 self.id = Some(id.into());
437 self
438 }
439
440 pub fn with_set(mut self, set: impl Into<String>) -> Self {
441 self.set = Some(set.into());
442 self
443 }
444
445 pub fn with_key(mut self, key: impl Into<String>) -> Self {
446 self.key = Some(key.into());
447 self
448 }
449
450 pub fn with_value(mut self, value: impl Into<toml::Value>) -> Self {
451 self.value = Some(value.into());
452 self
453 }
454}
455
/// A simplified, XPath-like path expression: `/`-separated segments of the form
/// `prefix:name[condition]`, where the prefix and the condition are optional and
/// `*` (or an empty name) matches any element.
#[derive(Debug, Clone, PartialEq, Deserialize)]
struct XPathExpression(String);
459
460impl XPathExpression {
461 pub fn new(expression: impl Into<String>) -> Self {
462 Self(expression.into())
463 }
464
465 pub fn any() -> Self {
466 Self("*".into())
467 }
468
469 pub fn iter<'a>(
470 &'a self,
471 config: &'a XmlConversionConfig,
472 ) -> impl Iterator<Item = (Option<&'a str>, &'a str, Option<&'a str>)> {
473 self.0.trim_start_matches('/').split("/").map(|segment| {
474 let (prefix, name, condition) = Self::parse_segment(segment);
476 let namespace = if let Some(prefix) = prefix {
477 if let Some(namespace) = config.namespaces.get(prefix).map(|x| x.as_str()) {
478 Some(namespace)
479 } else {
480 panic!(
481 "XML namespace prefix not known in configuration: {}",
482 prefix
483 );
484 }
485 } else {
486 None
487 };
488 (namespace, name, condition)
489 })
490 }
491
492 fn test<'a, 'b>(&self, path: &NodePath<'a, 'b>, mut node: Node<'a,'b>, config: &XmlConversionConfig) -> bool {
494 let mut pathiter = path.components.iter().rev();
495 for (refns, refname, condition) in self.iter(config).collect::<Vec<_>>().into_iter().rev() {
496 if let Some(component) = pathiter.next() {
497 if refname != "*" && refname != "" {
501 if refns.is_none() != component.namespace.is_none() || component.namespace != refns || refname != component.tagname {
502 return false;
503 }
504 }
505 if let Some(condition) = condition {
506 if !self.test_condition(condition, node, config) {
507 return false;
508 }
509 }
510 if let Some(parent) = node.parent() {
511 node = parent;
512 }
513 } else {
514 if refname != "" {
515 return false;
516 }
517 }
518 }
519 true
523 }
524
525 fn test_condition<'a,'b>(&self, condition: &'a str, node: Node<'a,'b>, config: &XmlConversionConfig) -> bool {
526 for condition in condition.split(" and ") { if let Some(pos) = condition.find("!=") {
528 let var = &condition[..pos];
529 let right = condition[pos+2..].trim_matches('"');
530 if self.get_var(var, &node, config) == Some(right) {
531 return false;
532 }
533 } else if let Some(pos) = condition.find("=") {
534 let var = &condition[..pos];
535 let right = condition[pos+1..].trim_matches('"');
536 let value = self.get_var(var, &node, config);
537 if value != Some(right) {
538 return false;
539 }
540 } else {
541 let v = self.get_var(condition, &node, config);
543 if v.is_none() || v == Some("") {
544 return false;
545 }
546 }
547 }
548 true
552 }
553
554 fn get_var<'a,'b>(&self, var: &str, node: &Node<'a,'b>, config: &XmlConversionConfig) -> Option<&'a str> {
556 if var.starts_with("@") {
557 if let Some(pos) = var.find(":") {
558 let prefix = &var[1..pos];
559 if let Some(ns) = config.namespaces.get(prefix) {
560 let var = &var[pos+1..];
561 node.attribute((ns.as_str(),var))
562 } else {
563 None
564 }
565 } else {
566 node.attribute(&var[1..])
567 }
568 } else if var == "text()" {
569 node.text().map(|s|s.trim())
570 } else {
571 None
572 }
573 }
574
575 fn parse_segment<'a>(s: &'a str) -> (Option<&'a str>, &'a str, Option<&'a str>) {
577 let (name, condition) = if let (Some(begin), Some(end)) = (s.find("["), s.rfind("]")) {
578 (&s[..begin], Some(&s[begin + 1..end]))
579 } else {
580 (s, None)
581 };
582 if let Some((prefix, name)) = name.split_once(":") {
583 (Some(prefix), name, condition)
584 } else {
585 (None, name, condition)
586 }
587 }
588}
589
590
591
592impl Default for XPathExpression {
593 fn default() -> Self {
594 Self::any()
595 }
596}
597
/// One step in a `NodePath`: a single element on the way from the root to a node.
#[derive(Clone, Debug, PartialEq)]
struct NodePathComponent<'a,'b> {
    /// Namespace URI of the element, if it has one.
    namespace: Option<&'a str>,
    /// Local tag name of the element.
    tagname: &'b str,
    /// Optional positional index, rendered as `[index]` when the path is formatted.
    index: Option<usize>,
}
605
/// The chain of elements leading from the document root to a node, outermost first.
#[derive(Clone, Debug, PartialEq, Default)]
struct NodePath<'a, 'b> {
    /// Path components in root-to-node order.
    components: Vec<NodePathComponent<'a,'b>>,
}
610
611impl<'a, 'b> Display for NodePath<'a, 'b> {
612 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
613 for component in self.components.iter() {
614 write!(f, "/")?;
615 if let Some(ns) = component.namespace {
616 if let Some(index) = component.index {
617 write!(f, "{{{}}}{}[{}]", ns, component.tagname, index)?;
618 } else {
619 write!(f, "{{{}}}{}", ns, component.tagname)?;
620 }
621 } else {
622 if let Some(index) = component.index {
623 write!(f, "{}[{}]", component.tagname, index)?;
624 } else {
625 write!(f, "{}", component.tagname)?;
626 }
627 }
628 }
629 Ok(())
630 }
631}
632
633impl<'a,'b> NodePath<'a,'b> {
634 fn add(&mut self, node: &Node<'a,'b>, index: Option<usize>) {
635 if node.tag_name().name() != "" {
636 self.components.push(
637 NodePathComponent {
638 namespace: node.tag_name().namespace(),
639 tagname: node.tag_name().name(),
640 index,
641 }
642 )
643 }
644 }
645
646 fn format_as_xpath(&self, prefixes: &HashMap<String, String>) -> String {
647 let mut out = String::new();
648 for component in self.components.iter() {
649 out.push('/');
650 if let Some(ns) = component.namespace {
651 if let Some(prefix) = prefixes.get(ns) {
652 if let Some(index) = component.index {
653 out += &format!("{}:{}[{}]", prefix, component.tagname, index);
654 } else {
655 out += &format!("{}:{}", prefix, component.tagname);
656 }
657 } else {
658 eprintln!("STAM fromxml WARNING: format_as_xpath: namespace {} not defined, no prefix found!", ns);
659 if let Some(index) = component.index {
660 out += &format!("{}[{}]", component.tagname, index);
661 } else {
662 out += &format!("{}", component.tagname);
663 }
664 }
665 } else {
666 if let Some(index) = component.index {
667 out += &format!("{}[{}]", component.tagname, index);
668 } else {
669 out += &format!("{}", component.tagname);
670 }
671 }
672 }
673 out
674 }
675}
676
677
/// Counts how many times each tag name has been seen.
/// NOTE(review): presumably one counter is used per parent element, to number siblings — confirm at the call sites.
#[derive(Default,Debug)]
struct SiblingCounter {
    /// Number of occurrences so far, keyed by the `Debug` rendering of the tag name.
    map: HashMap<String,usize>,
}
683
684impl SiblingCounter {
685 fn count<'a,'b>(&mut self, node: &Node<'a,'b>) -> usize {
686 let s = format!("{:?}", node.tag_name());
687 *self.map.entry(s).and_modify(|c| {*c += 1;}).or_insert(1)
688 }
689}
690
691
/// Configuration for a metadata annotation.
/// NOTE(review): presumably describes document-level annotations created by the
/// converter's `add_metadata` step — confirm against that method.
#[derive(Debug, Clone, Deserialize)]
pub struct MetadataConfig {
    /// What kind of annotation to produce.
    #[serde(default)]
    annotation: XmlAnnotationHandling,

    /// Annotation data to attach.
    #[serde(default)]
    annotationdata: Vec<XmlAnnotationDataConfig>,

    /// Template for the identifier of the metadata annotation.
    #[serde(default)]
    id: Option<String>,
}
706
707pub fn from_xml<'a>(
709 filename: &Path,
710 config: &XmlConversionConfig,
711 store: &'a mut AnnotationStore,
712) -> Result<(), String> {
713 if config.debug {
714 eprintln!("[STAM fromxml] parsing {}", filename.display());
715 }
716
717 let mut xmlstring = read_to_string(filename)
719 .map_err(|e| format!("Error opening XML file {}: {}", filename.display(), e))?;
720
721 if xmlstring[..100].find("<!DOCTYPE html>").is_some() && config.inject_dtd.is_some() {
723 xmlstring = xmlstring.replacen("<!DOCTYPE html>", "", 1);
724 }
725
726 if xmlstring[..100].find("<!DOCTYPE").is_none() {
728 if let Some(dtd) = config.inject_dtd.as_ref() {
729 xmlstring = dtd.to_string() + &xmlstring
730 };
731 } else if config.inject_dtd.is_some() {
732 eprintln!("[STAM fromxml] WARNING: Can not inject DTD because file already has a DOCTYPE");
733 }
734
735 let doc = Document::parse_with_options(
737 &xmlstring,
738 ParsingOptions {
739 allow_dtd: true,
740 ..ParsingOptions::default()
741 },
742 )
743 .map_err(|e| format!("Error parsing XML file {}: {}", filename.display(), e))?;
744
745 let mut converter = XmlToStamConverter::new(config);
746 converter
747 .compile()
748 .map_err(|e| format!("Error compiling templates: {}", e))?;
749
750 let textoutfilename = format!(
751 "{}.txt",
752 filename
753 .file_stem()
754 .expect("invalid filename")
755 .to_str()
756 .expect("invalid utf-8 in filename")
757 );
758
759 let mut path = NodePath::default();
761 path.add(&doc.root_element(), None);
762 converter
763 .extract_element_text(doc.root_element(), &path, converter.config.whitespace, Some(textoutfilename.as_str()), Some(&filename.to_string_lossy()), 0)
764 .map_err(|e| {
765 format!(
766 "Error extracting element text from {}: {}",
767 filename.display(),
768 e
769 )
770 })?;
771 if config.debug {
772 eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
773 }
774 let resource = TextResourceBuilder::new()
775 .with_id(filename_to_id(textoutfilename.as_str(), config).to_string())
776 .with_text(converter.text.clone())
777 .with_filename(&textoutfilename);
778
779 converter.resource_handle = Some(
780 store
781 .add_resource(resource)
782 .map_err(|e| format!("Failed to add resource {}: {}", &textoutfilename, e))?,
783 );
784
785 converter.add_metadata(store).map_err(|e| format!("Failed to add metadata {}: {}", &textoutfilename, e))?;
786
787 converter
789 .extract_element_annotation(doc.root_element(), &path, Some(&filename.to_string_lossy()),0, store)
790 .map_err(|e| {
791 format!(
792 "Error extracting element annotation from {}: {}",
793 filename.display(),
794 e
795 )
796 })?;
797
798 Ok(())
799}
800
801pub fn from_multi_xml<'a>(
803 filenames: &Vec<&Path>,
804 outputfile: Option<&Path>,
805 config: &XmlConversionConfig,
806 store: &'a mut AnnotationStore,
807) -> Result<(), String> {
808
809 let textoutfilename = if let Some(outputfile) = outputfile {
810 format!("{}",outputfile.to_str().expect("invalid utf-8 in filename"))
811 } else {
812 format!(
813 "{}.txt",
814 filenames.iter().next().expect("1 or more filename need to be provided")
815 .file_stem()
816 .expect("invalid filename")
817 .to_str()
818 .expect("invalid utf-8 in filename")
819 )
820 };
821
822 let mut xmlstrings: Vec<String> = Vec::new();
824 let mut docs: Vec<Document> = Vec::new();
825 for filename in filenames.iter() {
826 if config.debug {
827 eprintln!("[STAM fromxml] parsing {} (one of multiple)", filename.display());
828 }
829 let mut xmlstring = read_to_string(filename).map_err(|e| format!("Error opening XML file {}: {}", filename.display(), e))?;
831 if xmlstring[..100].find("<!DOCTYPE html>").is_some() && config.inject_dtd.is_some() {
832 xmlstring = xmlstring.replacen("<!DOCTYPE html>", "", 1);
833 }
834 if xmlstring[..100].find("<!DOCTYPE").is_none() {
836 if let Some(dtd) = config.inject_dtd.as_ref() {
837 xmlstring = dtd.to_string() + &xmlstring
838 };
839 } else if config.inject_dtd.is_some() {
840 eprintln!("[STAM fromxml] WARNING: Can not inject DTD because file already has a DOCTYPE");
841 }
842 xmlstrings.push(xmlstring);
843 }
844
845 for (filename, xmlstring) in filenames.iter().zip(xmlstrings.iter()) {
846 let doc = Document::parse_with_options(
848 xmlstring,
849 ParsingOptions {
850 allow_dtd: true,
851 ..ParsingOptions::default()
852 },
853 )
854 .map_err(|e| format!("Error parsing XML file {}: {}", filename.display(), e))?;
855 docs.push(doc);
856 }
857
858 let mut converter = XmlToStamConverter::new(config);
859 converter
860 .compile()
861 .map_err(|e| format!("Error compiling templates: {}", e))?;
862
863 for (i, (doc, filename)) in docs.iter().zip(filenames.iter()).enumerate() {
864 let mut path = NodePath::default();
865 path.add(&doc.root_element(), None);
866 converter
868 .extract_element_text(doc.root_element(), &path, converter.config.whitespace, Some(textoutfilename.as_str()), Some(&filename.to_string_lossy()), i)
869 .map_err(|e| {
870 format!(
871 "Error extracting element text from {}: {}",
872 filename.display(),
873 e
874 )
875 })?;
876 if config.debug {
877 eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
878 }
879 }
880
881 let resource = TextResourceBuilder::new()
882 .with_id(filename_to_id(textoutfilename.as_str(), config).to_string())
883 .with_text(converter.text.clone())
884 .with_filename(&textoutfilename);
885
886 converter.resource_handle = Some(
887 store
888 .add_resource(resource)
889 .map_err(|e| format!("Failed to add resource {}: {}", &textoutfilename, e))?,
890 );
891
892 converter.add_metadata(store).map_err(|e| format!("Failed to add metadata {}: {}", &textoutfilename, e))?;
893
894 for (i,(doc, filename)) in docs.iter().zip(filenames.iter()).enumerate() {
896 let mut path = NodePath::default();
897 path.add(&doc.root_element(), None);
898 converter
899 .extract_element_annotation(doc.root_element(), &path, Some(&filename.to_string_lossy()),i, store)
900 .map_err(|e| {
901 format!(
902 "Error extracting element annotation from {}: {}",
903 filename.display(),
904 e
905 )
906 })?;
907 }
908
909 Ok(())
910}
911
912pub fn from_xml_in_memory<'a>(
914 resource_id: &str,
915 xmlstring: &str,
916 config: &XmlConversionConfig,
917 store: &'a mut AnnotationStore,
918) -> Result<(), String> {
919 if config.debug {
920 eprintln!("[STAM fromxml] parsing XML string");
921 }
922
923 let doc = Document::parse_with_options(
925 &xmlstring,
926 ParsingOptions {
927 allow_dtd: true,
928 ..ParsingOptions::default()
929 },
930 )
931 .map_err(|e| format!("Error parsing XML string: {}", e))?;
932
933 let mut converter = XmlToStamConverter::new(config);
934 converter
935 .compile()
936 .map_err(|e| format!("Error compiling templates: {}", e))?;
937
938 let mut path = NodePath::default();
939 path.add(&doc.root_element(), None);
940 converter
942 .extract_element_text(doc.root_element(), &path, converter.config.whitespace, Some(resource_id), Some(resource_id), 0)
943 .map_err(|e| {
944 format!(
945 "Error extracting element text from {}: {}",
946 resource_id,
947 e
948 )
949 })?;
950 if config.debug {
951 eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
952 }
953 let resource = TextResourceBuilder::new()
954 .with_id(resource_id)
955 .with_text(converter.text.clone());
956
957 converter.resource_handle = Some(
958 store
959 .add_resource(resource)
960 .map_err(|e| format!("Failed to add resource {}: {}", &resource_id, e))?,
961 );
962
963 converter.add_metadata(store).map_err(|e| format!("Failed to add metadata for {}: {}", &resource_id, e))?;
964
965 converter
967 .extract_element_annotation(doc.root_element(), &path, Some(resource_id), 0, store)
968 .map_err(|e| {
969 format!(
970 "Error extracting element annotation from {}: {}",
971 resource_id,
972 e
973 )
974 })?;
975
976 Ok(())
977}
978
979pub fn filename_to_id<'a>(filename: &'a str, config: &XmlConversionConfig) -> &'a str {
980 for suffix in config.id_strip_suffix.iter() {
981 if filename.ends_with(suffix) {
982 return &filename[..filename.len() - suffix.len()];
983 }
984 }
985 return filename;
986}
987
/// Distinguishes which text span of an element a stored position refers to.
/// Used as part of the keys of the converter's position maps.
#[derive(Clone,Copy,PartialEq, Hash, Eq)]
enum PositionType {
    /// NOTE(review): presumably the element's own text — confirm.
    Body,
    /// NOTE(review): presumably the text generated by the `textprefix` template — confirm.
    TextPrefix,
    /// NOTE(review): presumably the text generated by the `textsuffix` template — confirm.
    TextSuffix,
}
994
/// Holds the state of one XML-to-STAM conversion run.
struct XmlToStamConverter<'a> {
    /// NOTE(review): presumably the current position in the extracted text — confirm unit (chars vs bytes).
    cursor: usize,

    /// The plain text extracted so far; becomes the content of the text resource.
    text: String,

    /// Template engine holding the custom template functions and the templates
    /// registered by `compile`.
    template_engine: Engine<'a>,

    /// Offsets keyed by `(usize, NodeId, PositionType)`.
    /// NOTE(review): the leading `usize` is presumably the document number — confirm.
    positionmap: HashMap<(usize,NodeId,PositionType), Offset>,

    /// Pairs of `usize` keyed like `positionmap`.
    /// NOTE(review): presumably (begin, end) byte offsets — confirm.
    bytepositionmap: HashMap<(usize,NodeId,PositionType), (usize, usize)>,

    /// Lists of `(usize, NodeId)` keyed by `usize`.
    /// NOTE(review): presumably marker elements grouped per element configuration — confirm.
    markers: HashMap<usize, Vec<(usize,NodeId)>>,

    /// Handle of the text resource once it has been added to the store; `None` before that.
    resource_handle: Option<TextResourceHandle>,

    /// NOTE(review): presumably set when a whitespace character is still to be emitted — confirm.
    pending_whitespace: bool,

    /// The conversion configuration.
    config: &'a XmlConversionConfig,

    /// Reverse namespace lookup: maps namespace URIs to prefixes.
    prefixes: HashMap<String, String>,

    /// Template context shared by all templates; populated by `set_global_context`.
    global_context: BTreeMap<String, upon::Value>,

    /// NOTE(review): presumably the variables used by each template, keyed by template — confirm.
    variables: BTreeMap<String, BTreeSet<&'a str>>,

    /// NOTE(review): presumably the indentation prefix for debug output — confirm.
    debugindent: String,
}
1034
/// Errors that can occur while configuring or running an XML-to-STAM conversion.
pub enum XmlConversionError {
    /// An error raised by the STAM library.
    StamError(StamError),
    /// A template error: a description of the context, plus the underlying
    /// template-engine error if there is one.
    TemplateError(String, Option<upon::Error>),
    /// An error in the conversion configuration.
    ConfigError(String),
}
1040
1041impl From<StamError> for XmlConversionError {
1042 fn from(error: StamError) -> Self {
1043 Self::StamError(error)
1044 }
1045}
1046
1047impl From<upon::Error> for XmlConversionError {
1048 fn from(error: upon::Error) -> Self {
1049 Self::TemplateError("".into(), Some(error))
1050 }
1051}
1052
1053impl Display for XmlConversionError {
1054 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
1055 match self {
1056 Self::StamError(e) => e.fmt(f),
1057 Self::TemplateError(s, e) => {
1058 f.write_str(s.as_str())?;
1059 f.write_str(": ")?;
1060 if let Some(e) = e {
1061 e.fmt(f)?;
1062 }
1063 f.write_str("")
1064 }
1065 Self::ConfigError(e) => e.fmt(f),
1066 }
1067 }
1068}
1069
1070impl<'a> XmlToStamConverter<'a> {
1071 fn new(config: &'a XmlConversionConfig) -> Self {
1072 let mut prefixes: HashMap<String, String> = HashMap::new();
1073 for (prefix, namespace) in config.namespaces.iter() {
1074 prefixes.insert(namespace.to_string(), prefix.to_string());
1075 }
1076 let mut template_engine = Engine::new();
1077 template_engine.add_function("capitalize", filter_capitalize);
1078 template_engine.add_function("lower", str::to_lowercase);
1079 template_engine.add_function("upper", str::to_uppercase);
1080 template_engine.add_function("trim", |s: &str| s.trim().to_string() );
1081 template_engine.add_function("add", |a: i64, b: i64| a + b);
1082 template_engine.add_function("sub", |a: i64, b: i64| a - b);
1083 template_engine.add_function("mul", |a: i64, b: i64| a * b);
1084 template_engine.add_function("div", |a: i64, b: i64| a / b);
1085 template_engine.add_function("eq", |a: &upon::Value, b: &upon::Value| a == b);
1086 template_engine.add_function("ne", |a: &upon::Value, b: &upon::Value| a != b);
1087 template_engine.add_function("gt", |a: i64, b: i64| a > b);
1088 template_engine.add_function("lt", |a: i64, b: i64| a < b);
1089 template_engine.add_function("gte", |a: i64, b: i64| a >= b);
1090 template_engine.add_function("lte", |a: i64, b: i64| a <= b);
1091 template_engine.add_function("int", |a: &upon::Value| match a {
1092 upon::Value::Integer(x) => upon::Value::Integer(*x),
1093 upon::Value::Float(x) => upon::Value::Integer(*x as i64),
1094 upon::Value::String(s) => upon::Value::Integer(s.parse().expect("int filter expects an integer value")),
1095 _ => panic!("int filter expects an integer value"), });
1097 template_engine.add_function("as_range", |a: i64| upon::Value::List(std::ops::Range { start: 0, end: a }.into_iter().map(|x| upon::Value::Integer(x+1)).collect::<Vec<_>>()) );
1098 template_engine.add_function("last", |list: &[upon::Value]| list.last().map(Clone::clone));
1099 template_engine.add_function("first", |list: &[upon::Value]| {
1100 list.first().map(Clone::clone)
1101 });
1102 template_engine.add_function("tokenize", |s: &str| {
1103 upon::Value::List(
1104 s.split(|c| c == ' ' || c == '\n').filter_map(|x|
1105 if !x.is_empty() {
1106 Some(upon::Value::String(x.to_string()))
1107 } else {
1108 None
1109 }
1110 )
1111 .collect::<Vec<upon::Value>>())
1112 });
1113 template_engine.add_function("replace", |s: &str, from: &str, to: &str| {
1114 upon::Value::String(s.replace(from,to))
1115 });
1116 template_engine.add_function("starts_with", |s: &str, prefix: &str| {
1117 s.starts_with(prefix)
1118 });
1119 template_engine.add_function("ends_with", |s: &str, suffix: &str| {
1120 s.ends_with(suffix)
1121 });
1122 template_engine.add_function("basename", |a: &upon::Value| match a {
1123 upon::Value::String(s) => upon::Value::String(s.split(|c| c == '/' || c == '\\').last().expect("splitting must work").to_string()),
1124 _ => panic!("basename filter expects a string value"), });
1126 template_engine.add_function("noext", |a: &upon::Value| match a {
1127 upon::Value::String(s) => if let Some(pos) = s.rfind('.') {
1128 s[..pos].to_string()
1129 } else {
1130 s.to_string()
1131 },
1132 _ => panic!("basename filter expects a string value"), });
1134 let mut converter = Self {
1135 cursor: 0,
1136 text: String::new(),
1137 template_engine,
1138 positionmap: HashMap::new(),
1139 bytepositionmap: HashMap::new(),
1140 markers: HashMap::new(),
1141 resource_handle: None,
1142 pending_whitespace: false,
1143 global_context: BTreeMap::new(),
1144 debugindent: String::new(),
1145 variables: BTreeMap::new(),
1146 prefixes,
1147 config,
1148 };
1149 converter.set_global_context();
1150 converter
1151 }
1152
1153 fn compile(&mut self) -> Result<(), XmlConversionError> {
1155 if self.config.debug {
1156 eprintln!("[STAM fromxml] compiling templates");
1157 }
1158 for element in self.config.elements.iter() {
1159 if let Some(textprefix) = element.textprefix.as_ref() {
1160 if self.template_engine.get_template(textprefix.as_str()).is_none() {
1161 let template = self.precompile(textprefix.as_str());
1162 self.template_engine
1163 .add_template(textprefix.clone(), template)
1164 .map_err(|e| {
1165 XmlConversionError::TemplateError(
1166 format!("element/textprefix template {}", textprefix.clone()),
1167 Some(e),
1168 )
1169 })?;
1170 }
1171 }
1172 if let Some(textsuffix) = element.textsuffix.as_ref() {
1173 if self.template_engine.get_template(textsuffix.as_str()).is_none() {
1174 let template = self.precompile(textsuffix.as_str());
1175 self.template_engine
1176 .add_template(textsuffix.clone(), template)
1177 .map_err(|e| {
1178 XmlConversionError::TemplateError(
1179 format!("element/textsuffix template {}", textsuffix.clone()),
1180 Some(e),
1181 )
1182 })?;
1183 }
1184 }
1185 if let Some(id) = element.id.as_ref() {
1186 if self.template_engine.get_template(id.as_str()).is_none() {
1187 let template = self.precompile(id.as_str());
1188 self.template_engine.add_template(id.clone(), template).map_err(|e| {
1189 XmlConversionError::TemplateError(
1190 format!("element/id template {}", id.clone()),
1191 Some(e),
1192 )
1193 })?;
1194 }
1195 }
1196 for annotationdata in element.annotationdata.iter().chain(element.annotatetextprefix.iter()).chain(element.annotatetextsuffix.iter()) {
1197 if let Some(id) = annotationdata.id.as_ref() {
1198 if self.template_engine.get_template(id.as_str()).is_none() {
1199 let template = self.precompile(id.as_str());
1200 self.template_engine.add_template(id.clone(), template).map_err(|e| {
1201 XmlConversionError::TemplateError(
1202 format!("annotationdata/id template {}", id.clone()),
1203 Some(e),
1204 )
1205 })?;
1206 }
1207 }
1208 if let Some(set) = annotationdata.set.as_ref() {
1209 if self.template_engine.get_template(set.as_str()).is_none() {
1210 let template = self.precompile(set.as_str());
1211 self.template_engine.add_template(set.clone(), template).map_err(|e| {
1213 XmlConversionError::TemplateError(
1214 format!("annotationdata/set template {}", set.clone()),
1215 Some(e),
1216 )
1217 })?;
1218 }
1219 }
1220 if let Some(key) = annotationdata.key.as_ref() {
1221 if self.template_engine.get_template(key.as_str()).is_none() {
1222 let template = self.precompile(key.as_str());
1223 self.template_engine.add_template(key.clone(), template).map_err(|e| {
1224 XmlConversionError::TemplateError(
1225 format!("annotationdata/key template {}", key.clone()),
1226 Some(e),
1227 )
1228 })?;
1229 }
1230 }
1231 if let Some(value) = annotationdata.value.as_ref() {
1232 self.compile_value(value)?;
1233 }
1234 }
1235 }
1236 for metadata in self.config.metadata.iter() {
1237 if let Some(id) = metadata.id.as_ref() {
1238 if self.template_engine.get_template(id.as_str()).is_none() {
1239 let template = self.precompile(id.as_str());
1240 self.template_engine.add_template(id.clone(), template).map_err(|e| {
1241 XmlConversionError::TemplateError(
1242 format!("metadata/id template {}", id.clone()),
1243 Some(e),
1244 )
1245 })?;
1246 }
1247 }
1248 for annotationdata in metadata.annotationdata.iter() {
1249 if let Some(id) = annotationdata.id.as_ref() {
1250 if self.template_engine.get_template(id.as_str()).is_none() {
1251 let template = self.precompile(id.as_str());
1252 self.template_engine.add_template(id.clone(), template).map_err(|e| {
1253 XmlConversionError::TemplateError(
1254 format!("annotationdata/id template {}", id.clone()),
1255 Some(e),
1256 )
1257 })?;
1258 }
1259 }
1260 if let Some(set) = annotationdata.set.as_ref() {
1261 if self.template_engine.get_template(set.as_str()).is_none() {
1262 let template = self.precompile(set.as_str());
1263 self.template_engine.add_template(set.clone(), template).map_err(|e| {
1265 XmlConversionError::TemplateError(
1266 format!("annotationdata/set template {}", set.clone()),
1267 Some(e),
1268 )
1269 })?;
1270 }
1271 }
1272 if let Some(key) = annotationdata.key.as_ref() {
1273 if self.template_engine.get_template(key.as_str()).is_none() {
1274 let template = self.precompile(key.as_str());
1275 self.template_engine.add_template(key.clone(), template).map_err(|e| {
1276 XmlConversionError::TemplateError(
1277 format!("annotationdata/key template {}", key.clone()),
1278 Some(e),
1279 )
1280 })?;
1281 }
1282 }
1283 if let Some(value) = annotationdata.value.as_ref() {
1284 self.compile_value(value)?;
1285 }
1286 }
1287 }
1288 Ok(())
1289 }
1290
1291 fn compile_value(&mut self, value: &'a toml::Value) -> Result<(), XmlConversionError> {
1293 match value {
1294 toml::Value::String(value) => {
1295 if self.template_engine.get_template(value.as_str()).is_none() {
1296 let template = self.precompile(value.as_str());
1297 self.template_engine.add_template(value.clone(), template).map_err(|e| {
1298 XmlConversionError::TemplateError(
1299 format!("annotationdata/value template {}", value.clone()),
1300 Some(e),
1301 )
1302 })?;
1303 }
1304 }
1305 toml::Value::Table(map) => {
1306 for (_key, value) in map.iter() {
1307 self.compile_value(value)?;
1308 }
1309 },
1310 toml::Value::Array(list) => {
1311 for value in list.iter() {
1312 self.compile_value(value)?;
1313 }
1314 }
1315 _ => {} }
1317 Ok(())
1318 }
1319
1320 fn extract_element_text<'b>(
1325 &mut self,
1326 node: Node<'a,'b>,
1327 path: &NodePath<'a,'b>,
1328 whitespace: XmlWhitespaceHandling,
1329 resource_id: Option<&str>,
1330 inputfile: Option<&str>,
1331 doc_num: usize,
1332 ) -> Result<(), XmlConversionError> {
1333 if self.config.debug {
1334 eprintln!("[STAM fromxml]{} extracting text for element {}", self.debugindent, path);
1335 }
1336 let mut begin = self.cursor; let mut bytebegin = self.text.len(); let mut end_discount = 0; let mut end_bytediscount = 0;
1340 let mut firsttext = true; let mut elder_siblings = SiblingCounter::default();
1343
1344 if let Some(element_config) = self.config.element_config(node, path) {
1346 if self.config.debug {
1347 eprintln!("[STAM fromxml]{} matching config: {:?}", self.debugindent, element_config);
1348 }
1349
1350 if (element_config.stop == Some(false) || element_config.stop.is_none())
1351 && element_config.annotation != XmlAnnotationHandling::TextSelectorBetweenMarkers
1352 {
1353 let whitespace = if node.has_attribute((NS_XML, "space")) {
1356 match node.attribute((NS_XML, "space")).unwrap() {
1358 "preserve" => XmlWhitespaceHandling::Preserve,
1359 "collapse" | "replace" => XmlWhitespaceHandling::Collapse,
1360 _ => whitespace,
1361 }
1362 } else if element_config.whitespace == XmlWhitespaceHandling::Inherit
1363 || element_config.whitespace == XmlWhitespaceHandling::Unspecified
1364 {
1365 whitespace } else {
1367 element_config.whitespace };
1369
1370 if let Some(textprefix) = &element_config.textprefix {
1372 self.pending_whitespace = false;
1373 if self.config.debug {
1374 eprintln!("[STAM fromxml]{} outputting textprefix: {:?}", self.debugindent, textprefix);
1375 }
1376 let result =
1377 self.render_template(textprefix, &node, Some(self.cursor), None, resource_id, inputfile, doc_num)
1378 .map_err(|e| match e {
1379 XmlConversionError::TemplateError(s, e) => {
1380 XmlConversionError::TemplateError(
1381 format!(
1382 "whilst rendering textprefix template '{}' for node '{}': {}",
1383 textprefix, node.tag_name().name(), s
1384 ),
1385 e,
1386 )
1387 }
1388 e => e,
1389 })?;
1390 let result_charlen = result.chars().count();
1391
1392 if !element_config.annotatetextprefix.is_empty() {
1393 let offset = Offset::simple(self.cursor, self.cursor + result_charlen);
1395 self.positionmap.insert((doc_num, node.id(), PositionType::TextPrefix), offset);
1396 self.bytepositionmap
1397 .insert((doc_num, node.id(), PositionType::TextPrefix), (bytebegin, bytebegin + result.len()));
1398 }
1399
1400 self.cursor += result_charlen;
1401 self.text += &result;
1402
1403 if element_config.include_textprefix != Some(true) {
1404 begin += result_charlen;
1406 bytebegin += result.len();
1407 }
1408 }
1409
1410 let textbegin = self.cursor;
1411 for child in node.children() {
1413 if self.config.debug {
1414 eprintln!("[STAM fromxml]{} child {:?}", self.debugindent, child);
1415 }
1416 if child.is_text() && element_config.text == Some(true) {
1417 let mut innertext = child.text().expect("text node must have text");
1421 let mut pending_whitespace = false;
1422 let mut leading_whitespace = false;
1423 if whitespace == XmlWhitespaceHandling::Collapse && !innertext.is_empty() {
1424 let mut all_whitespace = true;
1426 leading_whitespace = innertext.chars().next().unwrap().is_whitespace();
1427
1428 pending_whitespace = innertext
1431 .chars()
1432 .inspect(|c| {
1433 if !c.is_whitespace() {
1434 all_whitespace = false
1435 }
1436 })
1437 .last()
1438 .unwrap()
1439 .is_whitespace();
1440 if all_whitespace {
1441 self.pending_whitespace = true;
1442 if self.config.debug {
1443 eprintln!(
1444 "[STAM fromxml]{} ^- all whitespace, flag pending whitespace and skipping...",
1445 self.debugindent,
1446 );
1447 }
1448 continue;
1449 }
1450 innertext = innertext.trim();
1451 if self.config.debug {
1452 eprintln!(
1453 "[STAM fromxml]{} ^- collapsed whitespace: {:?}",
1454 self.debugindent,
1455 innertext
1456 );
1457 }
1458 }
1459 if self.pending_whitespace || leading_whitespace {
1460 if !self.text.is_empty()
1462 && !self.text.chars().rev().next().unwrap().is_whitespace()
1463 {
1464 if self.config.debug {
1465 eprintln!("[STAM fromxml]{} ^- outputting pending whitespace",self.debugindent);
1466 }
1467 self.text.push(' ');
1468 self.cursor += 1;
1469 if firsttext && self.pending_whitespace {
1470 begin += 1;
1471 bytebegin += 1;
1472 firsttext = false;
1473 }
1474 }
1475 self.pending_whitespace = false;
1476 }
1477
1478 if whitespace == XmlWhitespaceHandling::Collapse {
1480 let mut prevc = ' ';
1481 let mut innertext = innertext.replace(|c: char| c.is_whitespace(), " ");
1482 innertext.retain(|c| {
1483 let do_retain = c != ' ' || prevc != ' ';
1484 prevc = c;
1485 do_retain
1486 });
1487 self.text += &innertext;
1488 self.cursor += innertext.chars().count();
1489 if self.config.debug {
1490 eprintln!("[STAM fromxml]{} ^- outputting text child (collapsed whitespace), cursor is now {}: {}",self.debugindent, self.cursor, innertext);
1491 }
1492 } else {
1493 self.text += &innertext;
1494 self.cursor += innertext.chars().count();
1495 if self.config.debug {
1496 eprintln!("[STAM fromxml]{} ^- outputting text child, cursor is now {}: {}",self.debugindent, self.cursor, innertext);
1497 }
1498 }
1499 self.pending_whitespace = pending_whitespace;
1500 } else if child.is_element() {
1501 if self.config.debug {
1502 eprintln!("[STAM fromxml]{} \\- extracting text for this child", self.debugindent);
1503 }
1504 self.debugindent.push_str(" ");
1505 let mut path = path.clone();
1507 let count = elder_siblings.count(&child);
1508 path.add(&child, Some(count));
1509 self.extract_element_text(child, &path, whitespace, resource_id, inputfile, doc_num)?;
1510 self.debugindent.pop();
1511 self.debugindent.pop();
1512 } else {
1513 if self.config.debug {
1514 eprintln!("[STAM fromxml]{} ^- skipping this child node", self.debugindent);
1515 }
1516 continue;
1517 }
1518 }
1519
1520
1521 if let Some(textsuffix) = &element_config.textsuffix {
1523 if self.config.debug {
1524 eprintln!("[STAM fromxml]{} outputting textsuffix: {:?}", self.debugindent, textsuffix);
1525 }
1526 let result = self.render_template(
1527 textsuffix.as_str(),
1528 &node,
1529 Some(textbegin),
1530 Some(self.cursor),
1531 resource_id,
1532 inputfile,
1533 doc_num
1534 ).map_err(|e| match e {
1535 XmlConversionError::TemplateError(s, e) => {
1536 XmlConversionError::TemplateError(
1537 format!(
1538 "whilst rendering textsuffix template '{}' for node '{}': {}",
1539 textsuffix,
1540 node.tag_name().name(),
1541 s
1542 ),
1543 e,
1544 )
1545 }
1546 e => e,
1547 })?;
1548 let end_discount_tmp = result.chars().count();
1549 let end_bytediscount_tmp = result.len();
1550
1551
1552 self.text += &result;
1553
1554 if !element_config.annotatetextsuffix.is_empty() {
1555 let offset = Offset::simple(self.cursor, self.cursor + end_discount_tmp);
1557 self.positionmap.insert((doc_num, node.id(), PositionType::TextSuffix), offset);
1558 self.bytepositionmap
1559 .insert((doc_num, node.id(), PositionType::TextSuffix), (self.text.len() - end_bytediscount_tmp, self.text.len()));
1560 }
1561
1562 self.cursor += end_discount_tmp;
1563 self.pending_whitespace = false;
1564
1565 if element_config.include_textsuffix == Some(true) {
1566 end_discount = 0;
1568 end_bytediscount = 0;
1569 } else {
1570 end_discount = end_discount_tmp;
1572 end_bytediscount = end_bytediscount_tmp;
1573 }
1574
1575 }
1576 } else if element_config.annotation == XmlAnnotationHandling::TextSelectorBetweenMarkers
1577 {
1578 if self.config.debug {
1580 eprintln!("[STAM fromxml]{} adding to markers", self.debugindent);
1581 }
1582 self.markers
1583 .entry(element_config.hash())
1584 .and_modify(|v| v.push((doc_num, node.id())))
1585 .or_insert(vec![(doc_num, node.id())]);
1586 }
1587 } else if self.config.debug {
1588 eprintln!(
1589 "[STAM fromxml]{} WARNING: no match, skipping text extraction for element {}",
1590 self.debugindent,
1591 path
1592 );
1593 }
1594
1595 if begin <= (self.cursor - end_discount) {
1599 let offset = Offset::simple(begin, self.cursor - end_discount);
1600 if self.config.debug {
1601 eprintln!(
1602 "[STAM fromxml]{} extracted text for {} @{:?}: {:?}",
1603 self.debugindent,
1604 path,
1605 &offset,
1606 &self.text[bytebegin..(self.text.len() - end_bytediscount)]
1607 );
1608 }
1609 self.positionmap.insert((doc_num, node.id(), PositionType::Body), offset);
1610 self.bytepositionmap
1611 .insert((doc_num, node.id(), PositionType::Body), (bytebegin, self.text.len() - end_bytediscount));
1612 }
1613 Ok(())
1614 }
1615
1616 fn extract_element_annotation<'b>(
1621 &mut self,
1622 node: Node<'a,'b>,
1623 path: &NodePath<'a,'b>,
1624 inputfile: Option<&str>,
1625 doc_num: usize,
1626 store: &mut AnnotationStore,
1627 ) -> Result<(), XmlConversionError> {
1628 if self.config.debug {
1629 eprintln!("[STAM fromxml]{} extracting annotation from {}", self.debugindent, path);
1630 }
1631
1632 let mut elder_siblings = SiblingCounter::default();
1633
1634 if let Some(element_config) = self.config.element_config(node, &path) {
1636 if self.config.debug {
1637 eprintln!("[STAM fromxml]{} matching config: {:?}", self.debugindent, element_config);
1638 }
1639 if element_config.annotation != XmlAnnotationHandling::None
1640 && element_config.annotation != XmlAnnotationHandling::Unspecified
1641 {
1642 let mut builder = AnnotationBuilder::new();
1643
1644 let offset = self.positionmap.get(&(doc_num, node.id(), PositionType::Body));
1646 if element_config.annotation == XmlAnnotationHandling::TextSelector {
1647 if let Some((beginbyte, endbyte)) = self.bytepositionmap.get(&(doc_num, node.id(), PositionType::Body)) {
1648 if self.config.debug {
1649 eprintln!("[STAM fromxml]{} annotation covers text {:?} (bytes {}-{})", self.debugindent, offset, beginbyte, endbyte);
1650 }
1651 } else if self.text.is_empty() {
1652 return Err(XmlConversionError::ConfigError("Can't extract annotations on text if no text was extracted!".into()));
1653 }
1654 }
1655 let begin = if let Some(offset) = offset {
1656 if let Cursor::BeginAligned(begin) = offset.begin {
1657 Some(begin)
1658 } else {
1659 None
1660 }
1661 } else {
1662 None
1663 };
1664 let end = if let Some(offset) = offset {
1665 if let Cursor::BeginAligned(end) = offset.end {
1666 Some(end)
1667 } else {
1668 None
1669 }
1670 } else {
1671 None
1672 };
1673
1674 let resource_id = if let Some(resource_handle) = self.resource_handle {
1675 store.resource(resource_handle).unwrap().id()
1676 } else {
1677 None
1678 };
1679
1680 let mut have_id = false;
1681 if let Some(template) = &element_config.id {
1682 let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
1683 let compiled_template = self.template_engine.template(template.as_str());
1684 let id = compiled_template.render(&context).to_string().map_err(|e|
1685 XmlConversionError::TemplateError(
1686 format!(
1687 "whilst rendering id template '{}' for node '{}'",
1688 template,
1689 node.tag_name().name(),
1690 ),
1691 Some(e),
1692 )
1693 )?;
1694 if !id.is_empty() {
1695 builder = builder.with_id(id);
1696 have_id = true;
1697 }
1698 }
1699
1700 if !have_id {
1701 if let Some(resource_id) = resource_id {
1703 builder = builder.with_id(stam::generate_id(&format!("{}-",resource_id), ""));
1704 } else {
1705 builder = builder.with_id(stam::generate_id("", ""));
1706 }
1707 }
1708
1709 builder = self.add_annotationdata_to_builder(element_config.annotationdata.iter(), builder, node.clone(), begin, end, resource_id, inputfile, doc_num)?;
1710
1711
1712 if self.config.provenance && inputfile.is_some() {
1713 let path_string = if let Some(id) = node.attribute((NS_XML,"id")) {
1714 format!("//{}[@xml:id=\"{}\"]", self.get_node_name_for_xpath(&node), id)
1716 } else {
1717 path.format_as_xpath(&self.prefixes)
1719 };
1720 let databuilder = AnnotationDataBuilder::new().with_dataset(CONTEXT_ANNO.into()).with_key("target".into()).with_value(
1721 BTreeMap::from([
1722 ("source".to_string(),inputfile.unwrap().into()),
1723 ("selector".to_string(),
1724 BTreeMap::from([
1725 ("type".to_string(),"XPathSelector".into()),
1726 ("value".to_string(),path_string.into())
1727 ]).into()
1728 )
1729 ]).into()
1730 );
1731 builder = builder.with_data_builder(databuilder);
1732 }
1733
1734
1735 match element_config.annotation {
1737 XmlAnnotationHandling::TextSelector => {
1738 if let Some(selector) = self.textselector(node, doc_num, PositionType::Body) {
1740 builder = builder.with_target(selector);
1741 if self.config.debug {
1742 eprintln!("[STAM fromxml] builder AnnotateText: {:?}", builder);
1743 }
1744 store.annotate(builder)?;
1745 }
1746 if !element_config.annotatetextprefix.is_empty() || !element_config.annotatetextsuffix.is_empty() {
1747 self.annotate_textaffixes(node, element_config, inputfile, doc_num, store)?;
1748 }
1749 }
1750 XmlAnnotationHandling::ResourceSelector => {
1751 builder = builder.with_target(SelectorBuilder::ResourceSelector(
1753 self.resource_handle.into(),
1754 ));
1755 if self.config.debug {
1756 eprintln!("[STAM fromxml] builder AnnotateResource: {:?}", builder);
1757 }
1758 store.annotate(builder)?;
1759 }
1760 XmlAnnotationHandling::TextSelectorBetweenMarkers => {
1761 if let Some(selector) =
1763 self.textselector_for_markers(node, doc_num, store, element_config)
1764 {
1765 builder = builder.with_target(selector);
1766 if self.config.debug {
1767 eprintln!(
1768 "[STAM fromxml] builder TextSelectorBetweenMarkers: {:?}",
1769 builder
1770 );
1771 }
1772 store.annotate(builder)?;
1773 if !element_config.annotatetextprefix.is_empty() || !element_config.annotatetextsuffix.is_empty() {
1774 self.annotate_textaffixes(node, element_config, inputfile, doc_num, store)?;
1775 }
1776 }
1777 }
1778 _ => panic!(
1779 "Invalid annotationhandling: {:?}",
1780 element_config.annotation
1781 ),
1782 }
1783 }
1784
1785 if element_config.stop == Some(false) || element_config.stop.is_none() {
1787 for child in node.children() {
1788 if child.is_element() {
1789 self.debugindent.push_str(" ");
1790 let mut path = path.clone();
1791 let count = elder_siblings.count(&child);
1792 path.add(&child, Some(count));
1793 self.extract_element_annotation(child, &path, inputfile, doc_num, store)?;
1795 self.debugindent.pop();
1796 self.debugindent.pop();
1797 }
1798 }
1799 }
1800 } else {
1801 eprintln!(
1802 "[STAM fromxml]{} WARNING: no match, skipping annotation extraction for element {}",
1803 self.debugindent,
1804 path
1805 );
1806 }
1807 Ok(())
1808 }
1809
1810 fn add_annotationdata_to_builder<'input>(&self, iter: impl Iterator<Item = &'a XmlAnnotationDataConfig>,
1811 mut builder: AnnotationBuilder<'a>,
1812 node: Node<'a, 'input>,
1813 begin: Option<usize>,
1814 end: Option<usize>,
1815 resource_id: Option<&str>,
1816 inputfile: Option<&str>,
1817 doc_num: usize,
1818 ) -> Result<AnnotationBuilder<'a>, XmlConversionError> {
1819 for annotationdata in iter {
1820 let mut databuilder = AnnotationDataBuilder::new();
1821 if let Some(template) = &annotationdata.set {
1822 let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
1823 let compiled_template = self.template_engine.template(template.as_str());
1824 let dataset = compiled_template.render(&context).to_string().map_err(|e|
1825 XmlConversionError::TemplateError(
1826 format!(
1827 "whilst rendering annotationdata/dataset template '{}' for node '{}'",
1828 template,
1829 node.tag_name().name(),
1830 ),
1831 Some(e),
1832 )
1833 )?;
1834 if !dataset.is_empty() {
1835 databuilder = databuilder.with_dataset(dataset.into())
1836 }
1837 } else {
1838 databuilder =
1839 databuilder.with_dataset(self.config.default_set.as_str().into());
1840 }
1841 if let Some(template) = &annotationdata.key {
1842 let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
1843 let compiled_template = self.template_engine.template(template.as_str());
1844 match compiled_template.render(&context).to_string().map_err(|e|
1845 XmlConversionError::TemplateError(
1846 format!(
1847 "whilst rendering annotationdata/key template '{}' for node '{}'",
1848 template,
1849 node.tag_name().name(),
1850 ),
1851 Some(e),
1852 )
1853 ) {
1854 Ok(key) if !key.is_empty() =>
1855 databuilder = databuilder.with_key(key.into()) ,
1856 Ok(_) if !annotationdata.skip_if_missing => {
1857 return Err(XmlConversionError::TemplateError(
1858 format!(
1859 "whilst rendering annotationdata/key template '{}' for node '{}'",
1860 template,
1861 node.tag_name().name(),
1862 ),
1863 None
1864 ));
1865 },
1866 Err(e) if !annotationdata.skip_if_missing => {
1867 return Err(e)
1868 },
1869 _ => {
1870 continue
1872 }
1873 }
1874 }
1875 if let Some(value) = &annotationdata.value {
1876 match self.extract_value(value, node, annotationdata.allow_empty_value, annotationdata.skip_if_missing, begin, end, resource_id, inputfile, doc_num)? {
1877 Some(value) => {
1878 databuilder = databuilder.with_value(value);
1879 },
1880 None => {
1881 continue
1883 }
1884 }
1885 }
1886 builder = builder.with_data_builder(databuilder);
1887 }
1888 Ok(builder)
1889 }
1890
1891 fn annotate_textaffixes<'b>(
1893 &mut self,
1894 node: Node<'a,'b>,
1895 element_config: &XmlElementConfig,
1896 inputfile: Option<&str>,
1897 doc_num: usize,
1898 store: &mut AnnotationStore,
1899 ) -> Result<(), XmlConversionError> {
1900
1901
1902 if !element_config.annotatetextprefix.is_empty() {
1903 let mut builder = AnnotationBuilder::new().with_id(stam::generate_id("textprefix-", ""));
1904 if let Some(offset) = self.positionmap.get(&(doc_num, node.id(), PositionType::TextPrefix)) {
1905 let begin = if let Cursor::BeginAligned(begin) = offset.begin {
1906 Some(begin)
1907 } else {
1908 None
1909 };
1910 let end = if let Cursor::BeginAligned(end) = offset.end {
1911 Some(end)
1912 } else {
1913 None
1914 };
1915 builder = self.add_annotationdata_to_builder(element_config.annotatetextprefix.iter(), builder, node.clone(), begin,end, None, inputfile, doc_num)?; if let Some(selector) = self.textselector(node, doc_num, PositionType::TextPrefix) {
1917 builder = builder.with_target(selector);
1918 if self.config.debug {
1919 eprintln!("[STAM fromxml] builder AnnotateText: {:?}", builder);
1920 }
1921 store.annotate(builder)?;
1922 } else {
1923 return Err(XmlConversionError::ConfigError("Failed to create textselector to target textprefix".into()));
1924 }
1925 }
1926 }
1927
1928 if !element_config.annotatetextsuffix.is_empty() {
1929 let mut builder = AnnotationBuilder::new().with_id(stam::generate_id("textsuffix-", ""));
1930 if let Some(offset) = self.positionmap.get(&(doc_num, node.id(), PositionType::TextSuffix)) {
1931 let begin = if let Cursor::BeginAligned(begin) = offset.begin {
1932 Some(begin)
1933 } else {
1934 None
1935 };
1936 let end = if let Cursor::BeginAligned(end) = offset.end {
1937 Some(end)
1938 } else {
1939 None
1940 };
1941 builder = self.add_annotationdata_to_builder(element_config.annotatetextsuffix.iter(), builder, node.clone(), begin,end, None, inputfile, doc_num)?; if let Some(selector) = self.textselector(node, doc_num, PositionType::TextSuffix) {
1943 builder = builder.with_target(selector);
1944 if self.config.debug {
1945 eprintln!("[STAM fromxml] builder AnnotateText: {:?}", builder);
1946 }
1947 store.annotate(builder)?;
1948 } else {
1949 return Err(XmlConversionError::ConfigError("Failed to create textselector to target textprefix".into()));
1950 }
1951 }
1952 }
1953 Ok(())
1954 }
1955
1956 fn extract_value<'b>(&self, value: &'a toml::Value, node: Node<'a,'b>, allow_empty_value: bool, skip_if_missing: bool, begin: Option<usize>, end: Option<usize>, resource_id: Option<&str>, inputfile: Option<&str>, doc_num: usize) -> Result<Option<DataValue>, XmlConversionError>{
1958 match value {
1959 toml::Value::String(template) => {
1960 let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
1961 let compiled_template = self.template_engine.template(template.as_str()); match compiled_template.render(&context).to_string().map_err(|e|
1973 XmlConversionError::TemplateError(
1974 format!(
1975 "whilst rendering annotationdata/map template '{}' for node '{}'.{}",
1976 template,
1977 node.tag_name().name(),
1978 if self.config.debug() {
1979 format!("\nContext was {:?}.\nVariables are: {:?}", context, self.variables.get(template))
1980 } else {
1981 String::new()
1982 }
1983 ),
1984 Some(e),
1985 )
1986 ) {
1987 Ok(value) => {
1988 if !value.is_empty() || allow_empty_value {
1989 Ok(Some(value.into()))
1990 } else {
1991 Ok(None)
1993 }
1994 },
1995 Err(e) if !skip_if_missing => {
1996 Err(e)
1997 },
1998 Err(_) if allow_empty_value => {
1999 Ok(Some("".into()))
2000 },
2001 Err(_) => {
2002 Ok(None)
2004 }
2005 }
2006 },
2007 toml::Value::Table(map) => {
2008 let mut resultmap: BTreeMap<String,DataValue> = BTreeMap::new();
2009 for (key, value) in map.iter() {
2010 if let Some(value) = self.extract_value(value, node, false, true, begin, end, resource_id, inputfile, doc_num)? {
2011 resultmap.insert(key.clone(), value);
2012 }
2013 }
2014 Ok(Some(resultmap.into()))
2015 },
2016 toml::Value::Array(list) => {
2017 let mut resultlist: Vec<DataValue> = Vec::new();
2018 for value in list.iter() {
2019 if let Some(value) = self.extract_value(value, node, false, true, begin, end, resource_id, inputfile, doc_num)? {
2020 resultlist.push(value);
2021 }
2022 }
2023 Ok(Some(resultlist.into()))
2024 }
2025 toml::Value::Boolean(v) => Ok(Some(DataValue::Bool(*v))),
2026 toml::Value::Float(v) => Ok(Some(DataValue::Float(*v))),
2027 toml::Value::Integer(v) => Ok(Some(DataValue::Int(*v as isize))),
2028 toml::Value::Datetime(_v) => {
2029 todo!("fromxml: Datetime conversion not implemented yet");
2030 }
2031 }
2032 }
2033
2034 fn extract_value_metadata<'b>(&self, value: &'a toml::Value, context: &upon::Value, allow_empty_value: bool, skip_if_missing: bool, resource_id: Option<&str>) -> Result<Option<DataValue>, XmlConversionError>{
2036 match value {
2037 toml::Value::String(template) => {
2038 let compiled_template = self.template_engine.template(template.as_str()); match compiled_template.render(&context).to_string().map_err(|e|
2040 XmlConversionError::TemplateError(
2041 format!(
2042 "whilst rendering annotationdata/metadata template '{}' for metadata",
2043 template,
2044 ),
2045 Some(e),
2046 )
2047 ) {
2048 Ok(value) => {
2049 if !value.is_empty() || allow_empty_value {
2050 Ok(Some(value.into()))
2051 } else {
2052 Ok(None)
2054 }
2055 },
2056 Err(e) if !skip_if_missing => {
2057 Err(e)
2058 },
2059 Err(_) if allow_empty_value => {
2060 Ok(Some("".into()))
2061 },
2062 Err(_) => {
2063 Ok(None)
2065 }
2066 }
2067 },
2068 toml::Value::Table(map) => {
2069 let mut resultmap: BTreeMap<String,DataValue> = BTreeMap::new();
2070 for (key, value) in map.iter() {
2071 if let Some(value) = self.extract_value_metadata(value, context, false, true, resource_id)? {
2072 resultmap.insert(key.clone(), value);
2073 }
2074 }
2075 Ok(Some(resultmap.into()))
2076 },
2077 toml::Value::Array(list) => {
2078 let mut resultlist: Vec<DataValue> = Vec::new();
2079 for value in list.iter() {
2080 if let Some(value) = self.extract_value_metadata(value, context, false, true, resource_id)? {
2081 resultlist.push(value);
2082 }
2083 }
2084 Ok(Some(resultlist.into()))
2085 }
2086 toml::Value::Boolean(v) => Ok(Some(DataValue::Bool(*v))),
2087 toml::Value::Float(v) => Ok(Some(DataValue::Float(*v))),
2088 toml::Value::Integer(v) => Ok(Some(DataValue::Int(*v as isize))),
2089 toml::Value::Datetime(_v) => {
2090 todo!("fromxml: Datetime conversion not implemented yet");
2091 }
2092 }
2093 }
2094
2095 fn textselector<'s>(&'s self, node: Node, doc_num: usize, positiontype: PositionType) -> Option<SelectorBuilder<'s>> {
2097 let res_handle = self.resource_handle.expect("resource must be associated");
2098 if let Some(offset) = self.positionmap.get(&(doc_num, node.id(), positiontype)) {
2099 Some(SelectorBuilder::TextSelector(
2100 BuildItem::Handle(res_handle),
2101 offset.clone(),
2102 ))
2103 } else {
2104 None
2105 }
2106 }
2107
2108 fn textselector_for_markers<'b>(
2110 &self,
2111 node: Node,
2112 doc_num: usize,
2113 store: &AnnotationStore,
2114 element_config: &'b XmlElementConfig,
2115 ) -> Option<SelectorBuilder<'b>> {
2116 let resource = store
2117 .resource(
2118 self.resource_handle
2119 .expect("resource must have been created"),
2120 )
2121 .expect("resource must exist");
2122 let mut end: Option<usize> = None;
2123 if let Some(markers) = self.markers.get(&element_config.hash()) {
2124 let mut grab = false;
2125 for (d_num, n_id) in markers.iter() {
2126 if grab {
2127 end = self.positionmap.get(&(*d_num, *n_id, PositionType::Body)).map(|offset| {
2129 offset
2130 .begin
2131 .try_into()
2132 .expect("begin cursor must be beginaligned")
2133 });
2134 break;
2135 }
2136 if doc_num == *d_num && *n_id == node.id() {
2137 grab = true;
2139 }
2140 }
2141 };
2142 if end.is_none() {
2143 end = Some(resource.textlen());
2145 }
2146 if let (Some(offset), Some(end)) = (self.positionmap.get(&(doc_num, node.id(), PositionType::Body)), end) {
2147 Some(SelectorBuilder::TextSelector(
2148 BuildItem::Handle(self.resource_handle.unwrap()),
2149 Offset::simple(
2150 offset
2151 .begin
2152 .try_into()
2153 .expect("begin cursor must be beginaligned"),
2154 end,
2155 ),
2156 ))
2157 } else {
2158 None
2159 }
2160 }
2161
2162 fn set_global_context(&mut self) {
2163 self.global_context
2164 .insert("context".into(), upon::Value::Map(self.config.context.iter().map(|(k,v)| (k.clone(), map_value(v))).collect()));
2165 self.global_context
2166 .insert("namespaces".into(), self.config.namespaces.clone().into());
2167 self.global_context
2168 .insert("default_set".into(), self.config.default_set.clone().into());
2169 }
2170
2171 fn render_template<'input, 't>(
2172 &self,
2173 template: &'t str,
2174 node: &Node<'a, 'input>,
2175 begin: Option<usize>,
2176 end: Option<usize>,
2177 resource: Option<&str>,
2178 inputfile: Option<&str>,
2179 doc_num: usize,
2180 ) -> Result<Cow<'t, str>, XmlConversionError> {
2181 if template.chars().any(|c| c == '{') {
2182 let compiled_template = self.template_engine.template(template);
2184 let context = self.context_for_node(&node, begin, end, template, resource, inputfile, doc_num);
2185 let result = compiled_template.render(context).to_string()?;
2186 Ok(Cow::Owned(result))
2187 } else {
2188 Ok(Cow::Borrowed(template))
2190 }
2191 }
2192
2193 fn context_for_node<'input>(
2194 &self,
2195 node: &Node<'a, 'input>,
2196 begin: Option<usize>,
2197 end: Option<usize>,
2198 template: &str,
2199 resource: Option<&str>,
2200 inputfile: Option<&str>,
2201 doc_num: usize,
2202 ) -> upon::Value {
2203 let mut context = self.global_context.clone();
2204 let length = if let (Some(begin), Some(end)) = (begin, end) {
2205 Some(end - begin)
2206 } else {
2207 None
2208 };
2209 context.insert("localname".into(), node.tag_name().name().into());
2210 context.insert("name".into(), self.get_node_name_for_template(node).into());
2212 if let Some(namespace) = node.tag_name().namespace() {
2213 context.insert("namespace".into(), namespace.into());
2215 }
2216
2217 if let Some(begin) = begin {
2219 context.insert("begin".into(), upon::Value::Integer(begin as i64));
2220 }
2221 if let Some(end) = end {
2222 context.insert("end".into(), upon::Value::Integer(end as i64));
2223 }
2224 if let Some(length) = length {
2225 context.insert("length".into(), upon::Value::Integer(length as i64));
2226 }
2227 if let Some(resource) = resource {
2228 context.insert("resource".into(), resource.into());
2230 }
2231 if let Some(inputfile) = inputfile {
2232 context.insert("inputfile".into(), inputfile.into());
2234 }
2235 context.insert("doc_num".into(), upon::Value::Integer(doc_num as i64));
2237
2238 if let Some(vars) = self.variables.get(template) {
2239 for var in vars {
2240 let mut encodedvar = String::new();
2241 if let Some(value) = self.context_for_var(node, var, &mut encodedvar) {
2242 if self.config.debug() {
2243 eprintln!(
2244 "[STAM fromxml] Set context variable for template '{}' for node '{}': {}={:?} (encodedvar={})",
2245 template,
2246 node.tag_name().name(),
2247 var,
2248 value,
2249 encodedvar
2250 );
2251 }
2252 if value != upon::Value::None {
2253 context.insert(encodedvar, value);
2254 }
2255 } else if self.config.debug() {
2256 eprintln!(
2257 "[STAM fromxml] Missed context variable for template '{}' for node '{}': {}",
2258 template,
2259 node.tag_name().name(),
2260 var
2261 );
2262 }
2263 }
2264 }
2265 upon::Value::Map(context)
2266 }
2267
2268 fn context_for_var<'input>(
2271 &self,
2272 node: &Node<'a, 'input>,
2273 var: &str,
2274 path: &mut String,
2275 ) -> Option<upon::Value> {
2276
2277 let first = path.is_empty();
2278 let var =
2279 if var.starts_with("?.$") {
2280 if first {
2281 path.push_str("?.ELEMENT_");
2282 };
2283 &var[3..]
2284 } else if var.starts_with("$") {
2285 if first {
2286 path.push_str("ELEMENT_");
2287 };
2288 &var[1..]
2289 } else if var.starts_with("?.@") {
2290 if first {
2291 path.push_str("?.");
2292 };
2293 &var[2..]
2294 } else {
2295 var
2296 };
2297
2298 if !first && !var.is_empty() && !path.ends_with("ELEMENT_"){
2299 path.push_str("_IN_");
2300 }
2301
2302 let (component, remainder) = var.split_once("/").unwrap_or((var,""));
2304 if component.is_empty() {
2306 if first && !remainder.is_empty() {
2307 let mut n = node.clone();
2309 while let Some(parentnode) = n.parent_element() {
2311 n = parentnode;
2312 }
2313 let (rootcomponent, remainder) = remainder.split_once("/").unwrap_or((remainder,""));
2315 let (prefix, localname) = if let Some(pos) = rootcomponent.find(":") {
2316 (Some(&rootcomponent[0..pos]), &rootcomponent[pos+1..])
2317 } else {
2318 (None, rootcomponent)
2319 };
2320 if localname != n.tag_name().name() && localname != "*" {
2322 None
2323 } else {
2324 if let Some(prefix) = prefix {
2325 path.push_str(prefix);
2326 path.push_str("__");
2327 }
2328 path.push_str(localname);
2329 self.context_for_var(&n, remainder, path)
2330 }
2331 } else {
2332 Some(recursive_text(node).into())
2335 }
2336 } else if component.starts_with("@"){
2337 if let Some(pos) = component.find(":") {
2338 let prefix = &component[1..pos];
2339 if let Some(ns) = self.config.namespaces.get(prefix) {
2340 let var = &component[pos+1..];
2341 path.push_str("ATTRIB_");
2342 path.push_str(prefix);
2343 path.push_str("__");
2344 path.push_str(var);
2345 Some(
2346 node.attribute((ns.as_str(),var)).into()
2347 )
2348 } else {
2349 None
2350 }
2351 } else {
2352 let var = &component[1..];
2353 path.push_str("ATTRIB_");
2354 path.push_str(var);
2355 Some(
2356 node.attribute(var).into()
2357 )
2358 }
2359 } else if component == ".." {
2360 if let Some(parentnode) = node.parent_element().as_ref() {
2361 path.push_str("PARENT");
2363 self.context_for_var(parentnode, remainder, path)
2364 } else {
2365 None
2366 }
2367 } else if component == "." {
2368 path.push_str("THIS");
2369 if !remainder.is_empty() {
2370 self.context_for_var(node, remainder, path)
2372 } else {
2373 Some(recursive_text(node).into())
2374 }
2375 } else {
2376 let (prefix, localname) = if let Some(pos) = component.find(":") {
2377 (Some(&component[0..pos]), &component[pos+1..])
2378 } else {
2379 (None, component)
2380 };
2381 let localname_with_condition = localname;
2382 let (localname, condition_str, condition) = self.extract_condition(localname_with_condition); for child in node.children() {
2385 if child.is_element() {
2386 let namedata = child.tag_name();
2387 let mut child_matches = if let Some(namespace) = namedata.namespace() {
2388 if let Some(foundprefix) = self.prefixes.get(namespace) {
2389 Some(foundprefix.as_str()) == prefix && localname == namedata.name()
2390 } else {
2391 false
2392 }
2393 } else {
2394 namedata.name() == localname
2395 };
2396 if child_matches {
2397 if let Some((attribname, negate, attribvalue)) = condition {
2399 if let Some(pos) = attribname.find(":") {
2401 let prefix = &attribname[0..pos];
2402 if let Some(ns) = self.config.namespaces.get(prefix) {
2403 let attribname = &attribname[pos+1..];
2404 if let Some(value) = child.attribute((ns.as_str(),attribname)) {
2405 if !negate && attribvalue != Some(value) {
2406 child_matches = false;
2407 } else if negate && attribvalue == Some(value) {
2408 child_matches = false;
2409 }
2410 } else {
2411 child_matches = false;
2412 }
2413 } else {
2414 child_matches = false;
2415 }
2416 } else {
2417 if let Some(value) = child.attribute(attribname) {
2418 if !negate && attribvalue != Some(value) {
2419 child_matches = false;
2420 } else if negate && attribvalue == Some(value) {
2421 child_matches = false;
2422 }
2423 } else {
2424 child_matches = false;
2425 }
2426 }
2427 }
2428 if !child_matches && self.config.debug {
2429 eprintln!("[STAM fromxml] candidate node does not meet condition: {}", localname_with_condition);
2430 }
2431 }
2433 if child_matches {
2434 if let Some(prefix) = prefix {
2435 path.push_str(prefix);
2436 path.push_str("__");
2437 }
2438 path.push_str(localname);
2439 if condition.is_some() {
2440 let mut hasher = DefaultHasher::new();
2442 condition_str.hash(&mut hasher);
2443 let h = hasher.finish();
2444 path.push_str(&format!("_COND{}_", h));
2445 }
2446 return self.context_for_var(&child, remainder, path);
2447 }
2448 }
2449 }
2450 None
2452 }
2453 }
2454
2455 fn extract_condition<'b>(&self, localname: &'b str) -> (&'b str, &'b str, Option<(&'b str, bool, Option<&'b str>)>) { if localname.ends_with("]") {
2458 if let Some(pos) = localname.find("[") {
2459 let condition = &localname[pos+1..localname.len()-1];
2460 let (mut attrib, negation, attribvalue) = if let Some(pos) = condition.find("=") {
2461 let attrib = condition[0..pos].trim();
2462 let value = condition[pos+1..].trim();
2463 let value = &value[1..value.len() - 1]; if attrib.ends_with('!') {
2465 (attrib[..attrib.len() - 1].trim(), true, Some(value))
2467 } else {
2468 (attrib.trim(), false, Some(value))
2469 }
2470 } else {
2471 (condition, false, None)
2472 };
2473 if attrib.starts_with('@') {
2474 attrib = &attrib[1..];
2476 }
2477 return (&localname[..pos], condition, Some((attrib, negation,attribvalue )) );
2478 }
2479 }
2480 (localname, "", None)
2481 }
2482
2483
2484 fn get_node_name_for_template<'b>(&self, node: &'b Node) -> Cow<'b,str> {
2485 let extended_name = node.tag_name();
2486 match (extended_name.namespace(), extended_name.name()) {
2487 (Some(namespace), tagname) => {
2488 if let Some(prefix) = self.prefixes.get(namespace) {
2489 Cow::Owned(format!("{}__{}", prefix, tagname))
2490 } else {
2491 Cow::Borrowed(tagname)
2492 }
2493 }
2494 (None, tagname) => Cow::Borrowed(tagname),
2495 }
2496 }
2497
2498 fn get_node_name_for_xpath<'b>(&self, node: &'b Node) -> Cow<'b,str> {
2499 let extended_name = node.tag_name();
2500 match (extended_name.namespace(), extended_name.name()) {
2501 (Some(namespace), tagname) => {
2502 if let Some(prefix) = self.prefixes.get(namespace) {
2503 Cow::Owned(format!("{}:{}", prefix, tagname))
2504 } else {
2505 Cow::Borrowed(tagname)
2506 }
2507 }
2508 (None, tagname) => Cow::Borrowed(tagname),
2509 }
2510 }
2511
2512
2513 fn precompile(&mut self, template: &'a str) -> Cow<'a,str> {
2514 let mut replacement = String::new();
2515 let mut variables: BTreeSet<&'a str> = BTreeSet::new();
2516 let mut begin = 0;
2517 let mut end = 0;
2518 for i in 0..template.len() {
2519 let slice = &template[i..];
2520 if slice.starts_with("{{") || slice.starts_with("{%") {
2521 begin = i;
2522 } else if slice.starts_with("}}") || slice.starts_with("%}") {
2523 if end < begin+2 {
2524 replacement.push_str(&template[end..begin+2]);
2525 }
2526 let inner = &template[begin+2..i]; replacement.push_str(&self.precompile_inblock(inner, &mut variables));
2528 end = i;
2529 }
2530 }
2531 if end > 0 {
2532 replacement.push_str(&template[end..]);
2533 }
2534 self.variables.insert(template.into(), variables);
2535 if !replacement.is_empty() {
2538 Cow::Owned(replacement)
2539 } else {
2540 Cow::Borrowed(template)
2541 }
2542 }
2543
2544 fn precompile_inblock<'s>(&self, s: &'s str, vars: &mut BTreeSet<&'s str>) -> Cow<'s,str> {
2545 let mut quoted = false;
2546 let mut var = false;
2547 let mut begin = 0;
2548 let mut end = 0;
2549 let mut replacement = String::new();
2550 let mut in_condition = false;
2551 for (i,c) in s.char_indices() {
2552 if in_condition && c != ']' {
2553 continue;
2554 }
2555 if c == '"' {
2556 quoted = !quoted;
2557 } else if !quoted {
2558 if !var && (c == '@' || c == '$') {
2559 var = true;
2561 begin = i;
2562 } else if var && c == '[' {
2563 in_condition = true;
2564 } else if var && in_condition && c == ']' {
2565 in_condition = false;
2567 } else if var && in_condition {
2568 continue;
2570 } else if var && (!c.is_alphanumeric() && c != '.' && c != '/' && c != '_' && c != ':' && c != '@') {
2571 if end < begin {
2573 replacement.push_str(&s[end..begin]);
2574 }
2575 let varname = &s[begin..i];
2576 vars.insert(varname);
2577 let replacement_var = self.precompile_name(varname);
2578 replacement += &replacement_var;
2579 end = i;
2580 var = false;
2581 }
2582 }
2583 }
2584 if end > 0 {
2585 replacement.push_str(&s[end..]);
2586 }
2587 if var {
2588 let varname = &s[begin..];
2590 vars.insert(varname);
2591 let replacement_var = self.precompile_name(varname);
2592 replacement += &replacement_var;
2593 }
2594 if !replacement.is_empty() {
2595 Cow::Owned(replacement)
2597 } else {
2598 Cow::Borrowed(s)
2599 }
2600 }
2601
2602 fn precompile_name(&self, s: &str) -> String {
2604 let mut replacement = String::new();
2605 let mut begincondition = None;
2606 let mut skip = 0;
2607 for (i,c) in s.char_indices() {
2608 if begincondition.is_some() && c != ']' {
2609 continue;
2610 } else if skip > 0 {
2611 skip -= 1;
2612 continue;
2613 }
2614 if c == '$' {
2615 let slice = &s[i..];
2616 if slice.starts_with("$..") {
2617 replacement.push_str("ELEMENT_PARENT");
2618 skip = 2;
2619 } else if slice.starts_with("$.") {
2620 replacement.push_str("ELEMENT_THIS");
2621 skip = 1;
2622 } else if slice.starts_with("$/") {
2623 replacement.push_str("ELEMENT_");
2624 skip = 1;
2625 } else {
2626 replacement.push_str("ELEMENT_");
2627 }
2628 } else if c == '@' {
2629 replacement.push_str("ATTRIB_");
2630 } else if c == '/' {
2631 replacement.push_str("_IN_");
2632 } else if c == ':' {
2633 replacement.push_str("__");
2634 } else if c == '[' {
2635 begincondition = Some(i+1);
2636 } else if c == ']' {
2637 if let Some(begin) = begincondition {
2639 let mut hasher = DefaultHasher::new();
2640 let _ = &s[begin..i].hash(&mut hasher);
2641 let h = hasher.finish();
2642 replacement.push_str(&format!("_COND{}_", h));
2643 }
2644 begincondition = None;
2645 } else {
2646 replacement.push(c);
2647 }
2648 }
2649 replacement
2651 }
2652
    /// Adds one annotation to `store` for every entry in `self.config.metadata`.
    ///
    /// For each entry the id, dataset, key and value templates are rendered against
    /// a copy of `self.global_context` (extended with `resource` when the resource
    /// has an id). The annotation then targets either the whole text of the resource
    /// (`TextSelector`) or the resource itself (`ResourceSelector`, `None`,
    /// `Unspecified`).
    ///
    /// # Errors
    /// Returns `XmlConversionError::TemplateError` when a template fails to render,
    /// or when a key renders empty and `skip_if_missing` is not set. Errors from
    /// `extract_value_metadata` and `store.annotate` are propagated.
    ///
    /// # Panics
    /// Panics if `self.resource_handle` is set but not present in `store`, if it is
    /// unset for a `TextSelector` entry, or if the entry uses any other annotation
    /// handling than the ones listed above.
    fn add_metadata(&self, store: &mut AnnotationStore) -> Result<(), XmlConversionError> {
        for metadata in self.config.metadata.iter() {
            let mut builder = AnnotationBuilder::new();

            let resource_id = if let Some(resource_handle) = self.resource_handle {
                store.resource(resource_handle).unwrap().id()
            } else {
                None
            };

            // Per-entry rendering context: the global context plus the resource id.
            let mut context = self.global_context.clone();
            if let Some(resource_id) = resource_id {
                context.insert("resource".into(), resource_id.into());
            }

            // Optional annotation id; an id that renders empty is not set.
            if let Some(template) = &metadata.id {
                let compiled_template = self.template_engine.template(template.as_str());
                let id = compiled_template.render(&context).to_string().map_err(|e|
                        XmlConversionError::TemplateError(
                            format!(
                                "whilst rendering metadata id template '{}'",
                                template,
                            ),
                            Some(e),
                        )
                    )?;
                if !id.is_empty() {
                    builder = builder.with_id(id);
                }
            }

            for annotationdata in metadata.annotationdata.iter() {
                let mut databuilder = AnnotationDataBuilder::new();
                // Dataset: rendered from the template if given, else the default set.
                if let Some(template) = &annotationdata.set {
                    let compiled_template = self.template_engine.template(template.as_str());
                    let dataset = compiled_template.render(&context).to_string().map_err(|e|
                            XmlConversionError::TemplateError(
                                format!(
                                    "whilst rendering annotationdata/dataset template '{}' for metadata",
                                    template,
                                ),
                                Some(e),
                            )
                        )?;
                    if !dataset.is_empty() {
                        databuilder = databuilder.with_dataset(dataset.into())
                    }
                } else {
                    databuilder =
                        databuilder.with_dataset(self.config.default_set.as_str().into());
                }
                if let Some(template) = &annotationdata.key {
                    let compiled_template = self.template_engine.template(template.as_str());
                    match compiled_template.render(&context).to_string().map_err(|e|
                            XmlConversionError::TemplateError(
                                format!(
                                    "whilst rendering annotationdata/key template '{}' for metadata",
                                    template,
                                ),
                                Some(e),
                            )
                        ) {
                        Ok(key) if !key.is_empty() =>
                            databuilder = databuilder.with_key(key.into()) ,
                        // An empty key is an error unless the data may be skipped.
                        Ok(_) if !annotationdata.skip_if_missing => {
                            return Err(XmlConversionError::TemplateError(
                                format!(
                                    "whilst rendering annotationdata/key template '{}' metadata",
                                    template,
                                ),
                                None
                            ));
                        },
                        Err(e) if !annotationdata.skip_if_missing => {
                            return Err(e)
                        },
                        // skip_if_missing: drop this data item and go on with the next
                        _ => {
                            continue
                        }
                    }
                }
                if let Some(value) = &annotationdata.value {
                    match self.extract_value_metadata(value, &upon::Value::Map(context.clone()), annotationdata.allow_empty_value, annotationdata.skip_if_missing, resource_id.as_deref())? {
                        Some(value) => {
                            databuilder = databuilder.with_value(value);
                        },
                        // no value was produced: drop this data item
                        None => {
                            continue
                        }
                    }
                }
                builder = builder.with_data_builder(databuilder);
            }



            match metadata.annotation {
                // Annotate the complete text of the resource.
                XmlAnnotationHandling::TextSelector => {
                    builder = builder.with_target(SelectorBuilder::TextSelector(BuildItem::Handle(self.resource_handle.expect("resource must have handle")), Offset::whole()));
                    if self.config.debug {
                        eprintln!("[STAM fromxml] builder AnnotateText: {:?}", builder);
                    }
                    store.annotate(builder)?;
                }
                // Annotate the resource as such; also used when nothing is specified.
                XmlAnnotationHandling::ResourceSelector | XmlAnnotationHandling::None | XmlAnnotationHandling::Unspecified => {
                    builder = builder.with_target(SelectorBuilder::ResourceSelector(
                        self.resource_handle.into(),
                    ));
                    if self.config.debug {
                        eprintln!("[STAM fromxml] builder AnnotateResource: {:?}", builder);
                    }
                    store.annotate(builder)?;
                }
                _ => panic!(
                    "Invalid annotationhandling for metadata: {:?}",
                    metadata.annotation
                ),
            }
        }
        Ok(())
    }
2779}
2780
2781
2782
2783fn recursive_text(node: &Node) -> String {
2785 let mut s = String::new();
2786 for child in node.children() {
2787 if child.is_text() {
2788 s += child.text().expect("should have text");
2789 } else if child.is_element() {
2790 s += &recursive_text(&child);
2791 }
2792 }
2793 s
2794}
2795
/// Returns `s` with its first character converted to uppercase; the rest of the
/// string is copied unchanged. The uppercase form may be longer than one character.
fn filter_capitalize(s: &str) -> String {
    let mut rest = s.chars();
    match rest.next() {
        Some(initial) => {
            let mut capitalized = String::with_capacity(s.len());
            capitalized.extend(initial.to_uppercase());
            capitalized.push_str(rest.as_str());
            capitalized
        }
        None => String::new(),
    }
}
2808
2809fn map_value(value: &toml::Value) -> upon::Value {
2811 match value {
2812 toml::Value::String(s) => upon::Value::String(s.clone()),
2813 toml::Value::Integer(i) => upon::Value::Integer(*i),
2814 toml::Value::Float(i) => upon::Value::Float(*i),
2815 toml::Value::Boolean(v) => upon::Value::Bool(*v),
2816 toml::Value::Datetime(s) => upon::Value::String(s.to_string()),
2817 toml::Value::Array(v) => upon::Value::List(v.iter().map(|i| map_value(i)).collect()),
2818 toml::Value::Table(v) => upon::Value::Map(v.iter().map(|(k,i)| (k.clone(),map_value(i))).collect()),
2819 }
2820}
2821
2822#[cfg(test)]
2823mod tests {
2824 use super::*;
2825 const XMLSMALLEXAMPLE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
2828<head><title>test</title></head><body><h1>TEST</h1><p xml:id="p1">This is a <em xml:id="emphasis" style="color:green">test</em>.</p></body></html>"#;
2829
2830 const XMLEXAMPLE: &'static str = r#"<!DOCTYPE entities[<!ENTITY nbsp " ">]>
2831<html xmlns="http://www.w3.org/1999/xhtml" xmlns:my="http://example.com">
2832<head>
2833 <title>Test</title>
2834 <meta name="author" content="proycon" />
2835</head>
2836<body>
2837 <h1>Header</h1>
2838
2839 <p xml:id="par1">
2840 <span xml:id="sen1">This is a sentence.</span>
2841 <span xml:id="sen2">This is the second sentence.</span>
2842 </p>
2843 <p xml:id="par2">
2844 <strong>This</strong> is the <em>second</em> paragraph.
2845 It has a <strong>bold</strong> word and one in <em>italics</em>.<br/>
2846 Let's highlight stress in the following word: <span my:stress="secondary">re</span>pu<span my:stress="primary">ta</span>tion.
2847 </p>
2848 <p xml:space="preserve"><![CDATA[This third
2849paragraph consists
2850of CDATA and is configured to preserve whitespace, and weird &entities; ]]></p>
2851
2852 <h2>Subsection</h2>
2853
2854 <p>
2855 Have some fruits:<br/>
2856 <ul xml:id="list1" class="fruits">
2857 <li xml:id="fruit1">apple</li>
2858 <li xml:id="fruit2">banana</li>
2859 <li xml:id="fruit3">melon</li>
2860 </ul>
2861 </p>
2862
2863 Some lingering text outside of any confines...
2864</body>
2865</html>"#;
2866
2867 const XMLEXAMPLE_TEXTOUTPUT: &'static str = "Header\n\nThis is a sentence. This is the second sentence.\n\nThis is the second paragraph. It has a bold word and one in italics.\nLet's highlight stress in the following word: reputation.\n\nThis third\nparagraph consists\nof CDATA and is configured to preserve whitespace, and weird &entities; \nSubsection\n\nHave some fruits:\n* apple\n* banana\n* melon\n\nSome lingering text outside of any confines...";
2868
2869 const XMLTEISPACE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
2871<body><space dim="vertical" unit="lines" quantity="3" /></body></html>"#;
2872
2873 const CONF: &'static str = r#"#default whitespace handling (Collapse or Preserve)
2874whitespace = "Collapse"
2875default_set = "urn:stam-fromhtml"
2876
2877[namespaces]
2878#this defines the namespace prefixes you can use in this configuration
2879xml = "http://www.w3.org/XML/1998/namespace"
2880html = "http://www.w3.org/1999/xhtml"
2881xsd = "http://www.w3.org/2001/XMLSchema"
2882xlink = "http://www.w3.org/1999/xlink"
2883
2884# elements and attributes are matched in reverse-order, so put more generic statements before more specific ones
2885
2886#Define some base elements that we reuse later for actual elements (prevents unnecessary repetition)
2887[baseelements.common]
2888id = "{% if ?.@xml:id %}{{ @xml:id }}{% endif %}"
2889
2890 [[baseelements.common.annotationdata]]
2891 key = "type"
2892 value = "{{ localname }}"
2893
2894 [[baseelements.common.annotationdata]]
2895 key = "lang"
2896 value = "{{ @xml:lang }}"
2897 skip_if_missing = true
2898
2899 [[baseelements.common.annotationdata]]
2900 key = "n"
2901 value = "{{ @n }}"
2902 skip_if_missing = true
2903
2904 [[baseelements.common.annotationdata]]
2905 key = "style"
2906 value = "{{ @style }}"
2907 skip_if_missing = true
2908
2909 [[baseelements.common.annotationdata]]
2910 key = "class"
2911 value = "{{ @class }}"
2912 skip_if_missing = true
2913
2914 [[baseelements.common.annotationdata]]
2915 key = "src"
2916 value = "{{ @src }}"
2917 skip_if_missing = true
2918
2919[baseelements.text]
2920text = true
2921
2922
2923[[elements]]
2924base = [ "text", "common" ]
2925path = "*"
2926text = true
2927annotation = "TextSelector"
2928
2929# Pass through the following elements without mapping to text
2930[[elements]]
2931base = [ "common" ]
2932path = "//html:head"
2933
2934[[elements]]
2935base = [ "common" ]
2936path = "//html:head//*"
2937
2938# Map metadata like <meta name="key" content="value"> to annotations with key->value data selecting the resource (ResourceSelector)
2939[[elements]]
2940base = [ "common" ]
2941path = "//html:head//html:meta"
2942
2943[[elements.annotationdata]]
2944key = "{% if ?.@name %}{{ name }}{% endif %}"
2945value = "{% if ?.@content %}{{ @content }}{% endif %}"
2946skip_if_missing = true
2947
2948# By default, ignore any tags in the head (unless they're mentioned specifically later in the config)
2949[[elements]]
2950path = "//html:head/html:title"
2951annotation = "ResourceSelector"
2952
2953[[elements.annotationdata]]
2954key = "title"
2955value = "{{ $. | trim }}"
2956
2957
2958# Determine how various structural elements are converted to text
2959
2960[[elements]]
2961base = [ "common" ]
2962path = "//html:br"
2963textsuffix = "\n"
2964
2965[[elements]]
2966base = [ "common", "text" ]
2967path = "//html:p"
2968textprefix = "\n"
2969textsuffix = "\n"
2970
2971# Let's do headers and bulleted lists like markdown
2972[[elements]]
2973base = [ "common", "text" ]
2974path = "//html:h1"
2975textsuffix = "\n"
2976
2977[[elements]]
2978base = [ "common", "text" ]
2979path = "//html:h2"
2980textsuffix = "\n"
2981
2982#Generic, will be overriden by more specific one
2983[[elements]]
2984base = [ "common", "text" ]
2985path = "//html:li"
2986textprefix = "- "
2987textsuffix = "\n"
2988
2989[[elements]]
2990base = [ "common", "text" ]
2991path = """//html:body"""
2992annotation = "TextSelector"
2993id = "body"
2994
2995 [[elements.annotationdata]]
2996 key = "title_from_parent"
2997 value = "{{ $../html:head/html:title }}"
2998 skip_if_missing = true
2999
3000 [[elements.annotationdata]]
3001 key = "title_from_root"
3002 value = "{{ $/html:html/html:head/html:title }}"
3003 skip_if_missing = true
3004
3005#More specific one takes precendence over the above generic one
3006[[elements]]
3007base = [ "common", "text" ]
3008path = """//html:ul[@class="fruits"]/html:li"""
3009textprefix = "* "
3010textsuffix = "\n"
3011
3012#Not real HTML, test-case modelled after TEI space
3013[[elements]]
3014base = [ "common" ]
3015path = """//html:space[@dim="vertical" and @unit="lines"]"""
3016text = true
3017textsuffix = """\n{% for x in @quantity | int | as_range %}\n{% endfor %}"""
3018
3019
3020[[elements]]
3021base = [ "common", "text" ]
3022path = "//html:example"
3023annotation = "TextSelector"
3024
3025[[elements.annotationdata]]
3026key = "requiredattrib"
3027value = "{{ @requiredattrib }}"
3028
3029[[elements.annotationdata]]
3030key = "optattrib"
3031value = "{{ ?.@optattrib }}"
3032
3033[[elements]]
3034base = [ "common","text" ]
3035path = "//html:marquee"
3036annotation = "TextSelector"
3037
3038#map value, some bogus data to test parsing
3039[[elements.annotationdata]]
3040key = "map"
3041
3042[elements.annotationdata.value]
3043text = "{{ $. }}"
3044number = 42
3045bogus = true
3046
3047"#;
3048
3049 const XMLREQATTRIBEXAMPLE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
3050<body><example xml:id="ann1" requiredattrib="blah">test</example></body></html>"#;
3051
3052 const XMLREQATTRIBEXAMPLE2: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
3053<body><example xml:id="ann1">test</example></body></html>"#;
3054
3055 const XMLREQATTRIBEXAMPLE3: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
3056<body><example xml:id="ann1" requiredattrib="blah" optattrib="blah">test</example></body></html>"#;
3057
3058 const XMLMAPEXAMPLE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
3059<body><marquee xml:id="ann1">test</marquee></body></html>"#;
3060
3061 #[test]
3062 fn test_precompile_template_nochange() -> Result<(), String> {
3063 let config = XmlConversionConfig::new();
3064 let mut conv = XmlToStamConverter::new(&config);
3065 let template_in = "{{ foo }}";
3066 let template_out = conv.precompile(template_in);
3067 assert_eq!( template_out, template_in);
3068 assert!(!conv.variables.get(template_in).as_ref().unwrap().contains("foo"));
3070 Ok(())
3071 }
3072
3073 #[test]
3074 fn test_precompile_template_attrib() -> Result<(), String> {
3075 let config = XmlConversionConfig::new();
3076 let mut conv = XmlToStamConverter::new(&config);
3077 let template_in = "{{ @foo }}";
3078 let template_out = conv.precompile(template_in);
3079 assert_eq!(template_out, "{{ ATTRIB_foo }}");
3080 assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@foo"));
3082 Ok(())
3083 }
3084
3085 #[test]
3086 fn test_precompile_template_attrib_ns() -> Result<(), String> {
3087 let config = XmlConversionConfig::new();
3088 let mut conv = XmlToStamConverter::new(&config);
3089 let template_in = "{{ @bar:foo }}";
3090 let template_out = conv.precompile(template_in);
3091 assert_eq!(template_out, "{{ ATTRIB_bar__foo }}");
3092 assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@bar:foo"));
3094 Ok(())
3095 }
3096
3097 #[test]
3098 fn test_precompile_template_element() -> Result<(), String> {
3099 let config = XmlConversionConfig::new();
3100 let mut conv = XmlToStamConverter::new(&config);
3101 let template_in = "{{ $foo }}";
3102 let template_out = conv.precompile(template_in);
3103 assert_eq!(template_out, "{{ ELEMENT_foo }}");
3104 assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$foo"));
3106 Ok(())
3107 }
3108
3109 #[test]
3110 fn test_precompile_template_element_ns() -> Result<(), String> {
3111 let config = XmlConversionConfig::new();
3112 let mut conv = XmlToStamConverter::new(&config);
3113 let template_in = "{{ $bar:foo }}";
3114 let template_out = conv.precompile(template_in);
3115 assert_eq!(template_out, "{{ ELEMENT_bar__foo }}");
3116 assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$bar:foo"));
3118 Ok(())
3119 }
3120
3121 #[test]
3122 fn test_precompile_template_this_text() -> Result<(), String> {
3123 let config = XmlConversionConfig::new();
3124 let mut conv = XmlToStamConverter::new(&config);
3125 let template_in = "{{ $. }}";
3126 let template_out = conv.precompile(template_in);
3127 assert_eq!(template_out, "{{ ELEMENT_THIS }}");
3128 assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$."));
3129 Ok(())
3130 }
3131
3132 #[test]
3133 fn test_precompile_template_parent_text() -> Result<(), String> {
3134 let config = XmlConversionConfig::new();
3135 let mut conv = XmlToStamConverter::new(&config);
3136 let template_in = "{{ $.. }}";
3137 let template_out = conv.precompile(template_in);
3138 assert_eq!(template_out, "{{ ELEMENT_PARENT }}");
3139 assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$.."));
3140 Ok(())
3141 }
3142
3143
3144 #[test]
3145 fn test_precompile_template_attrib2() -> Result<(), String> {
3146 let config = XmlConversionConfig::new();
3147 let mut conv = XmlToStamConverter::new(&config);
3148 let template_in = "{% for x in @foo %}";
3149 let template_out = conv.precompile(template_in);
3150 assert_eq!(template_out, "{% for x in ATTRIB_foo %}");
3151 assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@foo"));
3153 Ok(())
3154 }
3155
3156 #[test]
3157 fn test_precompile_template_attrib3() -> Result<(), String> {
3158 let config = XmlConversionConfig::new();
3159 let mut conv = XmlToStamConverter::new(&config);
3160 let template_in = "{{ ?.@foo }}";
3161 let template_out = conv.precompile(template_in);
3162 assert_eq!(template_out, "{{ ?.ATTRIB_foo }}");
3163 assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@foo"));
3164 Ok(())
3165 }
3166
3167 #[test]
3168 fn test_precompile_template_path() -> Result<(), String> {
3169 let config = XmlConversionConfig::new();
3170 let mut conv = XmlToStamConverter::new(&config);
3171 let template_in = "{{ $x/y/z/@a }}";
3172 let template_out = conv.precompile(template_in);
3173 assert_eq!(template_out, "{{ ELEMENT_x_IN_y_IN_z_IN_ATTRIB_a }}");
3174 assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$x/y/z/@a"));
3175 Ok(())
3176 }
3177
3178 #[test]
3179 fn test_loadconfig() -> Result<(), String> {
3180 let config = XmlConversionConfig::from_toml_str(CONF)?;
3181 let mut conv = XmlToStamConverter::new(&config);
3182 conv.compile().map_err(|e| format!("{}",e))?;
3183 assert_eq!(conv.config.namespaces.len(),4 , "number of namespaces");
3184 assert_eq!(conv.config.elements.len(), 15, "number of elements");
3185 assert_eq!(conv.config.baseelements.len(), 2, "number of baseelements");
3186 assert_eq!(conv.config.elements.get(0).unwrap().annotationdata.len(), 6,"number of annotationdata under first element");
3187 assert_eq!(conv.config.baseelements.get("common").unwrap().annotationdata.len(), 6,"number of annotationdata under baseelement common");
3188 Ok(())
3189 }
3190
3191 #[test]
3192 fn test_small() -> Result<(), String> {
3193 let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
3194 let mut store = stam::AnnotationStore::new(stam::Config::new());
3195 from_xml_in_memory("test", XMLSMALLEXAMPLE, &config, &mut store)?;
3196 let res = store.resource("test").expect("resource must have been created at this point");
3197 assert_eq!(res.text(), "TEST\n\nThis is a test.\n", "resource text");
3198 assert_eq!(store.annotations_len(), 4, "number of annotations");
3199 let annotation = store.annotation("emphasis").expect("annotation must have been created at this point");
3200 assert_eq!(annotation.text_simple(), Some("test"));
3201 let key = store.key("urn:stam-fromhtml", "style").expect("key must exist");
3203 assert_eq!(annotation.data().filter_key(&key).value_as_str(), Some("color:green"));
3204 let key = store.key("urn:stam-fromhtml", "title").expect("key must exist");
3205 let annotation = res.annotations_as_metadata().next().expect("annotation");
3206 assert_eq!(annotation.data().filter_key(&key).value_as_str(), Some("test"));
3207 let bodyannotation = store.annotation("body").expect("body annotation not found");
3208 let title1 = store.key("urn:stam-fromhtml", "title_from_parent").expect("key must exist");
3209 let title2 = store.key("urn:stam-fromhtml", "title_from_root").expect("key must exist");
3210 assert_eq!(bodyannotation.data().filter_key(&title1).value_as_str(), Some("test"));
3211 assert_eq!(bodyannotation.data().filter_key(&title2).value_as_str(), Some("test"));
3212 Ok(())
3213 }
3214
3215 #[test]
3216 fn test_full() -> Result<(), String> {
3217 let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
3218 let mut store = stam::AnnotationStore::new(stam::Config::new());
3219 from_xml_in_memory("test", XMLEXAMPLE, &config, &mut store)?;
3220 let res = store.resource("test").expect("resource must have been created at this point");
3221 assert_eq!(res.text(), XMLEXAMPLE_TEXTOUTPUT, "resource text");
3222 Ok(())
3223 }
3224
3225 #[test]
3226 fn test_teispace() -> Result<(), String> {
3227 let config = XmlConversionConfig::from_toml_str(CONF)?;
3228 let mut store = stam::AnnotationStore::new(stam::Config::new());
3229 from_xml_in_memory("test", XMLTEISPACE, &config, &mut store)?;
3230 let res = store.resource("test").expect("resource must have been created at this point");
3231 assert_eq!(res.text(), "\n\n\n\n", "resource text");
3232 Ok(())
3233 }
3234
3235
3236 #[test]
3237 fn test_reqattrib() -> Result<(), String> {
3238 let config = XmlConversionConfig::from_toml_str(CONF)?;
3239 let mut store = stam::AnnotationStore::new(stam::Config::new());
3240 from_xml_in_memory("test", XMLREQATTRIBEXAMPLE, &config, &mut store)?;
3241 let res = store.resource("test").expect("resource must have been created at this point");
3242 assert_eq!(res.text(), "test", "resource text");
3243 let key = store.key("urn:stam-fromhtml", "requiredattrib").expect("key must exist");
3244 let annotation = store.annotation("ann1").expect("annotation");
3245 assert_eq!(annotation.data().filter_key(&key).value_as_str(), Some("blah"));
3246 assert!(store.key("urn:stam-fromhtml", "optattrib").is_none(), "optional attrib is unused");
3247 Ok(())
3248 }
3249
3250 #[test]
3251 fn test_reqattrib2() -> Result<(), String> {
3252 let mut config = XmlConversionConfig::from_toml_str(CONF)?;
3253 config = config.with_debug(true);
3254 let mut store = stam::AnnotationStore::new(stam::Config::new());
3255 assert!(from_xml_in_memory("test", XMLREQATTRIBEXAMPLE2, &config, &mut store).is_err(), "checking if error is returned");
3256 Ok(())
3257 }
3258
3259 #[test]
3260 fn test_reqattrib3() -> Result<(), String> {
3261 let config = XmlConversionConfig::from_toml_str(CONF)?;
3262 let mut store = stam::AnnotationStore::new(stam::Config::new());
3263 from_xml_in_memory("test", XMLREQATTRIBEXAMPLE3, &config, &mut store)?;
3264 let res = store.resource("test").expect("resource must have been created at this point");
3265 assert_eq!(res.text(), "test", "resource text");
3266 let reqkey = store.key("urn:stam-fromhtml", "requiredattrib").expect("key must exist");
3267 let optkey = store.key("urn:stam-fromhtml", "optattrib").expect("key optattrib must exist");
3268 let annotation = store.annotation("ann1").expect("annotation");
3269 assert_eq!(annotation.data().filter_key(&reqkey).value_as_str(), Some("blah"));
3270 assert_eq!(annotation.data().filter_key(&optkey).value_as_str(), Some("blah"));
3271 Ok(())
3272 }
3273
3274 #[test]
3275 fn test_map() -> Result<(), String> {
3276 let config = XmlConversionConfig::from_toml_str(CONF)?;
3277 let mut store = stam::AnnotationStore::new(stam::Config::new());
3278 from_xml_in_memory("test", XMLMAPEXAMPLE, &config, &mut store)?;
3279 let res = store.resource("test").expect("resource must have been created at this point");
3280 assert_eq!(res.text(), "test", "resource text");
3281 let key = store.key("urn:stam-fromhtml", "map").expect("key must exist");
3282 let annotation = store.annotation("ann1").expect("annotation");
3283 let data = annotation.data().filter_key(&key).value().expect("data must exist");
3284 if let DataValue::Map(data) = data {
3285 assert_eq!(data.get("text"), Some(&DataValue::String("test".into())));
3286 assert_eq!(data.get("number"), Some(&DataValue::Int(42)));
3287 assert_eq!(data.get("bogus"), Some(&DataValue::Bool(true)));
3288 assert_eq!(data.len(), 3);
3289 } else {
3290 assert!(false, "Data is supposed to be a map");
3291 }
3292 Ok(())
3293 }
3294}