stamtools/
xml.rs

1use std::borrow::Cow;
2use std::collections::{BTreeMap, HashMap, BTreeSet};
3use std::fmt::Display;
4use std::fs::read_to_string;
5use std::path::Path;
6use std::hash::{Hash,DefaultHasher,Hasher};
7
8use roxmltree::{Document, Node, NodeId, ParsingOptions};
9use serde::Deserialize;
10use stam::*;
11use toml;
12use upon::Engine;
13
/// Namespace URI that the reserved `xml` prefix is bound to (as in `xml:id`, `xml:lang`).
const NS_XML: &str = "http://www.w3.org/XML/1998/namespace";
/// URL of the W3C Web Annotation JSON-LD context document.
const CONTEXT_ANNO: &str = "http://www.w3.org/ns/anno.jsonld";
16
17
/// Serde fallback for the `default_set` field of `XmlConversionConfig`;
/// also used by `XmlConversionConfig::new()`.
fn default_set() -> String {
    String::from("urn:stam-fromxml")
}
21
22#[derive(Deserialize)]
23/// Holds the configuration for mapping a specific XML format to STAM
24pub struct XmlConversionConfig {
25    #[serde(default)]
26    /// Holds configurations for mapping specific XML elements to STAM, evaluated in reverse-order, so put more generic rules before specific ones
27    elements: Vec<XmlElementConfig>,
28
29    #[serde(default)]
30    /// Base elements are named templates, other elements can derive from this
31    baseelements: HashMap<String, XmlElementConfig>,
32
33    #[serde(default)]
34    /// Maps XML prefixes to namespace
35    namespaces: HashMap<String, String>,
36
37    #[serde(default = "XmlWhitespaceHandling::collapse")]
38    /// Default whitespace handling
39    whitespace: XmlWhitespaceHandling,
40
41    #[serde(default)]
42    /// Sets additional context variables that can be used in templates
43    context: HashMap<String, toml::Value>,
44
45    #[serde(default)]
46    /// Sets additional context variables that can be used in templates
47    metadata: Vec<MetadataConfig>,
48
49    #[serde(default)]
50    /// Inject a DTD (for XML entity resolution)
51    inject_dtd: Option<String>,
52
53    #[serde(default = "default_set")]
54    default_set: String,
55
56    #[serde(default)]
57    /// A prefix to assign when setting annotation IDs
58    id_prefix: Option<String>,
59
60    #[serde(default)]
61    /// A suffix to strip when setting annotation IDs
62    id_strip_suffix: Vec<String>,
63
64    #[serde(default)]
65    /// Add provenance information pointing each annotation to the appropriate node in the XML source files where it came from (translates into XPathSelector in Web Annotation output)
66    provenance: bool,
67
68    #[serde(skip_deserializing)]
69    debug: bool,
70
71}
72
73impl XmlConversionConfig {
74    pub fn new() -> Self {
75        Self {
76            elements: Vec::new(),
77            baseelements: HashMap::new(),
78            namespaces: HashMap::new(),
79            context: HashMap::new(),
80            metadata: Vec::new(),
81            whitespace: XmlWhitespaceHandling::Collapse,
82            default_set: default_set(),
83            inject_dtd: None,
84            id_prefix: None,
85            id_strip_suffix: Vec::new(),
86            provenance: false,
87            debug: false,
88        }
89    }
90
91    pub fn resolve_baseelements(&mut self) -> Result<(), XmlConversionError> {
92        let mut replace: Vec<(usize, XmlElementConfig)> = Vec::new();
93        for (i, element) in self.elements.iter().enumerate() {
94            let mut newelement = None;
95            for basename in element.base.iter().rev() {
96                if let Some(baseelement) = self.baseelements.get(basename) {
97                    if newelement.is_none() {
98                        newelement = Some(element.clone());
99                    }
100                    newelement
101                        .as_mut()
102                        .map(|newelement| newelement.update(baseelement));
103                } else {
104                    return Err(XmlConversionError::ConfigError(format!(
105                        "No such base element: {}",
106                        basename
107                    )));
108                }
109            }
110            if let Some(newelement) = newelement {
111                replace.push((i, newelement));
112            }
113        }
114        for (i, element) in replace {
115            self.elements[i] = element;
116        }
117        Ok(())
118    }
119
120    /// Parse the configuration from a TOML string (load the data from file yourself).
121    pub fn from_toml_str(tomlstr: &str) -> Result<Self, String> {
122        let mut config: Self = toml::from_str(tomlstr).map_err(|e| format!("{}", e))?;
123        config.resolve_baseelements().map_err(|e| format!("{}", e))?;
124        Ok(config)
125    }
126
127    pub fn with_debug(mut self, value: bool) -> Self {
128        self.debug = value;
129        self
130    }
131
132    /// Add provenance information pointing each annotation to the appropriate node in the XML source files where it came from (translates into XPathSelector in Web Annotation output)
133    pub fn with_provenance(mut self, value: bool) -> Self {
134        self.provenance = value;
135        self
136    }
137
138    /// Register an XML namespace with prefix
139    pub fn with_prefix(mut self, prefix: impl Into<String>, namespace: impl Into<String>) -> Self {
140        self.namespaces.insert(prefix.into(), namespace.into());
141        self
142    }
143
144    /// A prefix to assign when setting annotation IDs, within this string you can use the special variable `{resource}` to use the resource ID.
145    pub fn with_id_prefix(mut self, prefix: impl Into<String>) -> Self {
146        self.id_prefix = Some(prefix.into());
147        self
148    }
149
150    /// A suffix to strip when assigning annotation IDs
151    pub fn with_id_strip_suffix(mut self, suffix: impl Into<String>) -> Self {
152        self.id_strip_suffix.push(suffix.into());
153        self
154    }
155
156    /// Inject a DTD (for XML entity resolution)
157    pub fn with_inject_dtd(mut self, dtd: impl Into<String>) -> Self {
158        self.inject_dtd = Some(dtd.into());
159        self
160    }
161
162    /// Set default whitespace handling
163    pub fn with_whitespace(mut self, handling: XmlWhitespaceHandling) -> Self {
164        self.whitespace = handling;
165        self
166    }
167
168    /// Set an element configuration
169    pub fn with_element<F>(mut self, expression: &str, setup: F) -> Self
170    where
171        F: Fn(XmlElementConfig) -> XmlElementConfig,
172    {
173        let expression = XPathExpression::new(expression);
174        let element = setup(XmlElementConfig::new(expression));
175        if self.debug {
176            eprintln!("[STAM fromxml] registered {:?}", element);
177        }
178        self.elements.push(element);
179        self
180    }
181
182    /// How to handle this element?
183    fn element_config(&self, node: Node, path: &NodePath) -> Option<&XmlElementConfig> {
184        for elementconfig in self.elements.iter().rev() {
185            if elementconfig.path.test(path, node, self) {
186                return Some(elementconfig);
187            }
188        }
189        None
190    }
191
192    pub fn add_context(&mut self, key: impl Into<String>, value: toml::Value) {
193        self.context.insert(key.into(), value);
194    }
195
196    pub fn debug(&self) -> bool {
197        self.debug
198    }
199}
200
201#[derive(Clone, Copy, Debug, PartialEq, Deserialize)]
202/// Determines how to handle whitespace for an XML element
203pub enum XmlWhitespaceHandling {
204    /// Not specified (used for base templates)
205    Unspecified,
206    //Inherit from parent
207    Inherit,
208    /// Whitespace is kept as is in the XML
209    Preserve,
210    /// all whitespace becomes space, consecutive whitespace is squashed
211    Collapse,
212}
213
214impl Default for XmlWhitespaceHandling {
215    fn default() -> Self {
216        XmlWhitespaceHandling::Unspecified
217    }
218}
219
220impl XmlWhitespaceHandling {
221    fn collapse() -> Self {
222        XmlWhitespaceHandling::Collapse
223    }
224}
225
226#[derive(Debug, Clone, Deserialize, PartialEq, Copy, Default)]
227pub enum XmlAnnotationHandling {
228    /// No annotation
229    #[default]
230    Unspecified,
231
232    /// No annotation
233    None,
234
235    /// Selects the text pertaining to the current element
236    TextSelector,
237
238    /// Selects the text pertaining to the current resource
239    ResourceSelector,
240
241    /// Selects the text between the current element and the next instance of the same element type
242    TextSelectorBetweenMarkers,
243}
244
245#[derive(Debug, Clone, Deserialize)]
246/// XML Element configuration, determines how to map an XML element (identified by an XPath expression) to STAM
247pub struct XmlElementConfig {
248    /// This is XPath-like expression (just a small subset of XPath) to identify an element by its path
249
250    #[serde(default)]
251    path: XPathExpression,
252
253    #[serde(default)]
254    annotation: XmlAnnotationHandling,
255
256    #[serde(default)]
257    annotationdata: Vec<XmlAnnotationDataConfig>,
258
259    /// Template or None for no text handling, prefixes are never targeted by annotations
260    #[serde(default)]
261    textprefix: Option<String>,
262
263    /// Extract text. None means unspecified and defaults to false.
264    #[serde(default)]
265    text: Option<bool>,
266
267    /// Template or None for no text handling, suffixes are never targeted by annotations
268    #[serde(default)]
269    textsuffix: Option<String>,
270
271    // Annotation data for the text prefix
272    #[serde(default)]
273    annotatetextprefix: Vec<XmlAnnotationDataConfig>,
274
275    // Annotation data for the text suffix
276    #[serde(default)]
277    annotatetextsuffix: Vec<XmlAnnotationDataConfig>,
278
279    /// Include the text prefix in the annotation's text selector. None means unspecified and defaults to false
280    #[serde(default)]
281    include_textprefix: Option<bool>,
282
283    /// Include the text suffix in the annotation's text selector. None means unspecified and defaults to false
284    #[serde(default)]
285    include_textsuffix: Option<bool>,
286
287    /// Base elements to derive from
288    #[serde(default)]
289    base: Vec<String>,
290
291    /// Template or None for no ID extraction
292    #[serde(default)]
293    id: Option<String>,
294
295    #[serde(default)]
296    /// Descend into children (false) or not? (true). None means unspecified and defaults to false
297    stop: Option<bool>,
298
299    #[serde(default)]
300    /// Whitespace handling for this element
301    whitespace: XmlWhitespaceHandling,
302}
303
304impl XmlElementConfig {
305    fn new(expression: XPathExpression) -> Self {
306        Self {
307            path: expression,
308            stop: None,
309            whitespace: XmlWhitespaceHandling::Unspecified,
310            annotation: XmlAnnotationHandling::Unspecified,
311            annotationdata: Vec::new(),
312            base: Vec::new(),
313            id: None,
314            textprefix: None,
315            text: None,
316            textsuffix: None,
317            annotatetextprefix: Vec::new(),
318            annotatetextsuffix: Vec::new(),
319            include_textprefix: None,
320            include_textsuffix: None,
321        }
322    }
323
324    pub fn update(&mut self, base: &XmlElementConfig) {
325        if self.whitespace == XmlWhitespaceHandling::Unspecified
326            && base.whitespace != XmlWhitespaceHandling::Unspecified
327        {
328            self.whitespace = base.whitespace;
329        }
330        if self.annotation == XmlAnnotationHandling::Unspecified
331            && base.annotation != XmlAnnotationHandling::Unspecified
332        {
333            self.annotation = base.annotation;
334        }
335        if self.textprefix.is_none() && base.textprefix.is_some() {
336            self.textprefix = base.textprefix.clone();
337        }
338        if self.text.is_none() && base.text.is_some() {
339            self.text = base.text;
340        }
341        if self.textsuffix.is_none() && base.textsuffix.is_some() {
342            self.textsuffix = base.textsuffix.clone();
343        }
344        if self.id.is_none() && base.id.is_some() {
345            self.id = base.id.clone();
346        }
347        if self.stop.is_none() && base.stop.is_some() {
348            self.stop = base.stop;
349        }
350        for annotationdata in base.annotationdata.iter() {
351            if !self.annotationdata.contains(annotationdata) {
352                self.annotationdata.push(annotationdata.clone());
353            }
354        }
355        if self.annotatetextsuffix.is_empty() && !base.annotatetextsuffix.is_empty() {
356            self.annotatetextsuffix = base.annotatetextsuffix.clone();
357        }
358        if self.annotatetextprefix.is_empty() && !base.annotatetextprefix.is_empty() {
359            self.annotatetextprefix = base.annotatetextprefix.clone();
360        }
361        if self.include_textsuffix.is_none() {
362            self.include_textsuffix = base.include_textsuffix;
363        }
364        if self.include_textprefix.is_none() {
365            self.include_textprefix = base.include_textprefix;
366        }
367    }
368
369
370    /// This sets the mode that determines how the element is handledhttps://www.youtube.com/watch?v=G_BrbhRrP6g
371    pub fn with_stop(mut self, stop: bool) -> Self {
372        self.stop = Some(stop);
373        self
374    }
375
376    /// This sets the whitespace handling for this element
377    pub fn with_whitespace(mut self, handling: XmlWhitespaceHandling) -> Self {
378        self.whitespace = handling;
379        self
380    }
381
382    pub fn with_text(mut self, text: bool) -> Self {
383        self.text = Some(text);
384        self
385    }
386
387    pub fn with_base(mut self, iter: impl Iterator<Item = impl Into<String>>) -> Self {
388        self.base = iter.into_iter().map(|s| s.into()).collect();
389        self
390    }
391
392    pub fn without_text(mut self) -> Self {
393        self.text = None;
394        self
395    }
396
397    pub fn with_annotation(mut self, annotation: XmlAnnotationHandling) -> Self {
398        self.annotation = annotation;
399        self
400    }
401
402    /// Not a very safe hash function (just uses an address uniquely associated with this object) but works for our ends
403    fn hash(&self) -> usize {
404        self.path.0.as_ptr() as usize
405    }
406}
407
408impl PartialEq for XmlElementConfig {
409    fn eq(&self, other: &Self) -> bool {
410        self.hash() == other.hash()
411    }
412}
413
414#[derive(Debug, Clone, Deserialize, PartialEq)]
415pub struct XmlAnnotationDataConfig {
416    /// Template
417    id: Option<String>,
418    /// Template
419    set: Option<String>,
420    /// Template
421    key: Option<String>,
422    /// Any string values are interpreted as templates
423    value: Option<toml::Value>,
424
425    /// Allow value templates that yield an empty string?
426    #[serde(default)]
427    allow_empty_value: bool,
428
429    /// Skip this data entirely if any underlying variables in the templates are undefined
430    #[serde(default)]
431    skip_if_missing: bool,
432}
433
434impl XmlAnnotationDataConfig {
435    pub fn with_id(mut self, id: impl Into<String>) -> Self {
436        self.id = Some(id.into());
437        self
438    }
439
440    pub fn with_set(mut self, set: impl Into<String>) -> Self {
441        self.set = Some(set.into());
442        self
443    }
444
445    pub fn with_key(mut self, key: impl Into<String>) -> Self {
446        self.key = Some(key.into());
447        self
448    }
449
450    pub fn with_value(mut self, value: impl Into<toml::Value>) -> Self {
451        self.value = Some(value.into());
452        self
453    }
454}
455
456/// Not really full XPath, just a very minor subset
457#[derive(Debug, Clone, PartialEq, Deserialize)]
458struct XPathExpression(String);
459
460impl XPathExpression {
461    pub fn new(expression: impl Into<String>) -> Self {
462        Self(expression.into())
463    }
464
465    pub fn any() -> Self {
466        Self("*".into())
467    }
468
469    pub fn iter<'a>(
470        &'a self,
471        config: &'a XmlConversionConfig,
472    ) -> impl Iterator<Item = (Option<&'a str>, &'a str, Option<&'a str>)> {
473        self.0.trim_start_matches('/').split("/").map(|segment| {
474            //eprintln!("DEBUG: segment={}", segment);
475            let (prefix, name, condition) = Self::parse_segment(segment);
476            let namespace = if let Some(prefix) = prefix {
477                if let Some(namespace) = config.namespaces.get(prefix).map(|x| x.as_str()) {
478                    Some(namespace)
479                } else {
480                    panic!(
481                        "XML namespace prefix not known in configuration: {}",
482                        prefix
483                    );
484                }
485            } else {
486                None
487            };
488            (namespace, name, condition)
489        })
490    }
491
492    /// matches a node path against an XPath-like expression
493    fn test<'a, 'b>(&self, path: &NodePath<'a, 'b>, mut node: Node<'a,'b>, config: &XmlConversionConfig) -> bool {
494        let mut pathiter = path.components.iter().rev();
495        for (refns, refname, condition) in self.iter(config).collect::<Vec<_>>().into_iter().rev() {
496            if let Some(component) = pathiter.next() {
497                /*if config.debug() {
498                    eprintln!("[STAM fromxml]          testing component {:?} against refns={:?} refname={} condition={:?}", component, refns, refname, condition);
499                }*/
500                if refname != "*" && refname != "" {
501                    if refns.is_none() != component.namespace.is_none() || component.namespace != refns || refname != component.tagname {
502                        return false;
503                    }
504                }
505                if let Some(condition) = condition {
506                    if !self.test_condition(condition, node, config) {
507                        return false;
508                    }
509                }
510                if let Some(parent) = node.parent() { 
511                    node = parent;
512                }
513            } else {
514                if refname != "" {
515                    return false;
516                }
517            }
518        }
519        /* if config.debug() {
520            eprintln!("[STAM fromxml]          match");
521        }*/
522        true
523    }
524
525    fn test_condition<'a,'b>(&self, condition: &'a str, node: Node<'a,'b>, config: &XmlConversionConfig) -> bool {
526        for condition in condition.split(" and ") { //MAYBE TODO: doesn't take literals into account yet!
527            if let Some(pos) = condition.find("!=") {
528                let var = &condition[..pos];
529                let right = condition[pos+2..].trim_matches('"');
530                if self.get_var(var, &node, config) == Some(right) {
531                    return false;
532                }
533            } else if let Some(pos) = condition.find("=") {
534                let var = &condition[..pos];
535                let right = condition[pos+1..].trim_matches('"');
536                let value = self.get_var(var, &node, config);
537                if value != Some(right) {
538                    return false;
539                }
540            } else {
541                //condition is one variable and merely needs to exist
542                let v = self.get_var(condition, &node, config);
543                if v.is_none() || v == Some("") {
544                    return false;
545                }
546            }
547        }
548        /*if config.debug() {
549            eprintln!("[STAM fromxml]          condition matches");
550        }*/
551        true
552    }
553
554    /// Resolve a variable from a conditional expression, given a variable name, node and config
555    fn get_var<'a,'b>(&self, var: &str, node: &Node<'a,'b>, config: &XmlConversionConfig) -> Option<&'a str> { 
556        if var.starts_with("@") {
557            if let Some(pos) = var.find(":") {
558                let prefix = &var[1..pos];
559                if let Some(ns) = config.namespaces.get(prefix) {
560                    let var = &var[pos+1..];
561                    node.attribute((ns.as_str(),var))
562                } else {
563                    None
564                }
565            } else {
566                node.attribute(&var[1..])
567            }
568        } else if var == "text()" {
569            node.text().map(|s|s.trim())
570        } else {
571            None
572        }
573    }
574
575    /// Parses a segment into a namespace-prefix, a name and a condition
576    fn parse_segment<'a>(s: &'a str) -> (Option<&'a str>, &'a str, Option<&'a str>) {
577        let (name, condition) = if let (Some(begin), Some(end)) = (s.find("["), s.rfind("]")) {
578            (&s[..begin], Some(&s[begin + 1..end]))
579        } else {
580            (s, None)
581        };
582        if let Some((prefix, name)) = name.split_once(":") {
583            (Some(prefix), name, condition)
584        } else {
585            (None, name, condition)
586        }
587    }
588}
589
590
591
592impl Default for XPathExpression {
593    fn default() -> Self {
594        Self::any()
595    }
596}
597
/// One step in a [`NodePath`]: a single element on the way from the root to a node.
#[derive(Debug, Clone, PartialEq)]
struct NodePathComponent<'a,'b> {
    /// Namespace of the element, if it has one
    namespace: Option<&'a str>,
    /// Local tag name of the element
    tagname: &'b str,
    /// Index sequence number, 1-indexed (as specified by XPath)
    index: Option<usize>,
}
605
606#[derive(Clone, Debug, PartialEq, Default)]
607struct NodePath<'a, 'b> {
608    components: Vec<NodePathComponent<'a,'b>>,
609}
610
611impl<'a, 'b> Display for NodePath<'a, 'b> {
612    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
613        for component in self.components.iter() {
614            write!(f, "/")?;
615            if let Some(ns) = component.namespace {
616                if let Some(index) = component.index {
617                    write!(f, "{{{}}}{}[{}]", ns, component.tagname, index)?;
618                } else {
619                    write!(f, "{{{}}}{}", ns, component.tagname)?;
620                }
621            } else {
622                if let Some(index) = component.index {
623                    write!(f, "{}[{}]", component.tagname, index)?;
624                } else {
625                    write!(f, "{}", component.tagname)?;
626                }
627            }
628        }
629        Ok(())
630    }
631}
632
633impl<'a,'b> NodePath<'a,'b> {
634    fn add(&mut self, node: &Node<'a,'b>, index: Option<usize>) {
635        if node.tag_name().name() != "" {
636            self.components.push(
637                NodePathComponent {
638                    namespace: node.tag_name().namespace(),
639                    tagname: node.tag_name().name(),
640                    index,
641                }
642            )
643        }
644    }
645
646    fn format_as_xpath(&self, prefixes: &HashMap<String, String>) -> String {
647        let mut out = String::new();
648        for component in self.components.iter() {
649            out.push('/');
650            if let Some(ns) = component.namespace {
651                if let Some(prefix) = prefixes.get(ns) {
652                    if let Some(index) = component.index {
653                        out += &format!("{}:{}[{}]", prefix, component.tagname, index);
654                    } else {
655                        out += &format!("{}:{}", prefix, component.tagname);
656                    }
657                } else {
658                    eprintln!("STAM fromxml WARNING: format_as_xpath: namespace {} not defined, no prefix found!", ns);
659                    if let Some(index) = component.index {
660                        out += &format!("{}[{}]", component.tagname, index);
661                    } else {
662                        out += &format!("{}", component.tagname);
663                    }
664                }
665            } else {
666                if let Some(index) = component.index {
667                    out += &format!("{}[{}]", component.tagname, index);
668                } else {
669                    out += &format!("{}", component.tagname);
670                }
671            }
672        }
673        out
674    }
675}
676
677
/// Counts elder siblings, used to determine index values
#[derive(Debug, Default)]
struct SiblingCounter {
    /// Number of nodes counted so far, keyed by the debug rendering of their tag name
    map: HashMap<String,usize>,
}
683
684impl SiblingCounter {
685    fn count<'a,'b>(&mut self, node: &Node<'a,'b>) -> usize {
686        let s = format!("{:?}", node.tag_name());
687        *self.map.entry(s).and_modify(|c| {*c += 1;}).or_insert(1)
688    }
689}
690
691
692#[derive(Debug, Clone, Deserialize)]
693/// XML Element configuration, determines how to map an XML element (identified by an XPath expression) to STAM
694pub struct MetadataConfig {
695    /// This is XPath-like expression (just a small subset of XPath) to identify an element by its path
696    #[serde(default)]
697    annotation: XmlAnnotationHandling,
698
699    #[serde(default)]
700    annotationdata: Vec<XmlAnnotationDataConfig>,
701
702    /// Template or None for no ID extraction
703    #[serde(default)]
704    id: Option<String>,
705}
706
707/// Translate an XML file to STAM, given a particular configuration
708pub fn from_xml<'a>(
709    filename: &Path,
710    config: &XmlConversionConfig,
711    store: &'a mut AnnotationStore,
712) -> Result<(), String> {
713    if config.debug {
714        eprintln!("[STAM fromxml] parsing {}", filename.display());
715    }
716
717    // Read the raw XML data
718    let mut xmlstring = read_to_string(filename)
719        .map_err(|e| format!("Error opening XML file {}: {}", filename.display(), e))?;
720
721    //patchy: remove HTML5 doctype and inject our own
722    if xmlstring[..100].find("<!DOCTYPE html>").is_some() && config.inject_dtd.is_some() {
723        xmlstring = xmlstring.replacen("<!DOCTYPE html>", "", 1);
724    }
725
726    // we can only inject a DTD if there is no doctype
727    if xmlstring[..100].find("<!DOCTYPE").is_none() {
728        if let Some(dtd) = config.inject_dtd.as_ref() {
729            xmlstring = dtd.to_string() + &xmlstring
730        };
731    } else if config.inject_dtd.is_some() {
732        eprintln!("[STAM fromxml] WARNING: Can not inject DTD because file already has a DOCTYPE");
733    }
734
735    // parse the raw XML data into a DOM
736    let doc = Document::parse_with_options(
737        &xmlstring,
738        ParsingOptions {
739            allow_dtd: true,
740            ..ParsingOptions::default()
741        },
742    )
743    .map_err(|e| format!("Error parsing XML file {}: {}", filename.display(), e))?;
744
745    let mut converter = XmlToStamConverter::new(config);
746    converter
747        .compile()
748        .map_err(|e| format!("Error compiling templates: {}", e))?;
749
750    let textoutfilename = format!(
751        "{}.txt",
752        filename
753            .file_stem()
754            .expect("invalid filename")
755            .to_str()
756            .expect("invalid utf-8 in filename")
757    );
758
759    // extract text (first pass)
760    let mut path = NodePath::default();
761    path.add(&doc.root_element(), None);
762    converter
763        .extract_element_text(doc.root_element(), &path, converter.config.whitespace, Some(textoutfilename.as_str()), Some(&filename.to_string_lossy()), 0)
764        .map_err(|e| {
765            format!(
766                "Error extracting element text from {}: {}",
767                filename.display(),
768                e
769            )
770        })?;
771    if config.debug {
772        eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
773    }
774    let resource = TextResourceBuilder::new()
775        .with_id(filename_to_id(textoutfilename.as_str(), config).to_string())
776        .with_text(converter.text.clone())
777        .with_filename(&textoutfilename);
778
779    converter.resource_handle = Some(
780        store
781            .add_resource(resource)
782            .map_err(|e| format!("Failed to add resource {}: {}", &textoutfilename, e))?,
783    );
784
785    converter.add_metadata(store).map_err(|e| format!("Failed to add metadata {}: {}", &textoutfilename, e))?;
786
787    // extract annotations (second pass)
788    converter
789        .extract_element_annotation(doc.root_element(), &path,  Some(&filename.to_string_lossy()),0,  store)
790        .map_err(|e| {
791            format!(
792                "Error extracting element annotation from {}: {}",
793                filename.display(),
794                e
795            )
796        })?;
797
798    Ok(())
799}
800
801/// Translate an XML file to STAM, given a particular configuration. This translates multiple XML files to a single output file.
802pub fn from_multi_xml<'a>(
803    filenames: &Vec<&Path>,
804    outputfile: Option<&Path>,
805    config: &XmlConversionConfig,
806    store: &'a mut AnnotationStore,
807) -> Result<(), String> {
808
809    let textoutfilename = if let Some(outputfile) = outputfile {
810        format!("{}",outputfile.to_str().expect("invalid utf-8 in filename"))
811    } else {
812        format!(
813            "{}.txt",
814                filenames.iter().next().expect("1 or more filename need to be provided")
815                .file_stem()
816                .expect("invalid filename")
817                .to_str()
818                .expect("invalid utf-8 in filename")
819        )
820    };
821
822    // Read the raw XML data
823    let mut xmlstrings: Vec<String> = Vec::new();
824    let mut docs: Vec<Document> = Vec::new();
825    for filename in filenames.iter() {
826        if config.debug {
827            eprintln!("[STAM fromxml] parsing {} (one of multiple)", filename.display());
828        }
829        //patchy: remove HTML5 doctype and inject our own
830        let mut xmlstring = read_to_string(filename).map_err(|e| format!("Error opening XML file {}: {}", filename.display(), e))?;
831        if xmlstring[..100].find("<!DOCTYPE html>").is_some() && config.inject_dtd.is_some() {
832            xmlstring = xmlstring.replacen("<!DOCTYPE html>", "", 1);
833        }
834        // we can only inject a DTD if there is no doctype
835        if xmlstring[..100].find("<!DOCTYPE").is_none() {
836            if let Some(dtd) = config.inject_dtd.as_ref() {
837                xmlstring = dtd.to_string() + &xmlstring
838            };
839        } else if config.inject_dtd.is_some() {
840            eprintln!("[STAM fromxml] WARNING: Can not inject DTD because file already has a DOCTYPE");
841        }
842        xmlstrings.push(xmlstring);
843    }
844
845    for (filename, xmlstring) in filenames.iter().zip(xmlstrings.iter()) {
846        // parse the raw XML data into a DOM
847        let doc = Document::parse_with_options(
848            xmlstring,
849            ParsingOptions {
850                allow_dtd: true,
851                ..ParsingOptions::default()
852            },
853        )
854        .map_err(|e| format!("Error parsing XML file {}: {}", filename.display(), e))?;
855        docs.push(doc);
856    }
857
858    let mut converter = XmlToStamConverter::new(config);
859    converter
860        .compile()
861        .map_err(|e| format!("Error compiling templates: {}", e))?;
862
863    for (i, (doc, filename)) in docs.iter().zip(filenames.iter()).enumerate() {
864        let mut path = NodePath::default();
865        path.add(&doc.root_element(), None);
866        // extract text (first pass)
867        converter
868            .extract_element_text(doc.root_element(), &path, converter.config.whitespace, Some(textoutfilename.as_str()), Some(&filename.to_string_lossy()), i)
869            .map_err(|e| {
870                format!(
871                    "Error extracting element text from {}: {}",
872                    filename.display(),
873                    e
874                )
875            })?;
876        if config.debug {
877            eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
878        }
879    }
880
881    let resource = TextResourceBuilder::new()
882        .with_id(filename_to_id(textoutfilename.as_str(), config).to_string())
883        .with_text(converter.text.clone())
884        .with_filename(&textoutfilename);
885
886    converter.resource_handle = Some(
887        store
888            .add_resource(resource)
889            .map_err(|e| format!("Failed to add resource {}: {}", &textoutfilename, e))?,
890    );
891
892    converter.add_metadata(store).map_err(|e| format!("Failed to add metadata {}: {}", &textoutfilename, e))?;
893
894    // extract annotations (second pass)
895    for (i,(doc, filename)) in docs.iter().zip(filenames.iter()).enumerate() {
896        let mut path = NodePath::default();
897        path.add(&doc.root_element(), None);
898        converter
899            .extract_element_annotation(doc.root_element(), &path, Some(&filename.to_string_lossy()),i,  store)
900            .map_err(|e| {
901                format!(
902                    "Error extracting element annotation from {}: {}",
903                    filename.display(),
904                    e
905                )
906            })?;
907    }
908
909    Ok(())
910}
911
912/// Translate an XML file to STAM, given a particular configuration. Not writing output files and keeping all in memory. Does not support DTD injection.
913pub fn from_xml_in_memory<'a>(
914    resource_id: &str, 
915    xmlstring: &str,
916    config: &XmlConversionConfig,
917    store: &'a mut AnnotationStore,
918) -> Result<(), String> {
919    if config.debug {
920        eprintln!("[STAM fromxml] parsing XML string");
921    }
922
923    // parse the raw XML data into a DOM
924    let doc = Document::parse_with_options(
925        &xmlstring,
926        ParsingOptions {
927            allow_dtd: true,
928            ..ParsingOptions::default()
929        },
930    )
931    .map_err(|e| format!("Error parsing XML string: {}",  e))?;
932
933    let mut converter = XmlToStamConverter::new(config);
934    converter
935        .compile()
936        .map_err(|e| format!("Error compiling templates: {}", e))?;
937
938    let mut path = NodePath::default();
939    path.add(&doc.root_element(), None);
940    // extract text (first pass)
941    converter
942        .extract_element_text(doc.root_element(), &path, converter.config.whitespace, Some(resource_id), Some(resource_id), 0)
943        .map_err(|e| {
944            format!(
945                "Error extracting element text from {}: {}",
946                resource_id,
947                e
948            )
949        })?;
950    if config.debug {
951        eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
952    }
953    let resource = TextResourceBuilder::new()
954        .with_id(resource_id)
955        .with_text(converter.text.clone());
956
957    converter.resource_handle = Some(
958        store
959            .add_resource(resource)
960            .map_err(|e| format!("Failed to add resource {}: {}", &resource_id, e))?,
961    );
962
963    converter.add_metadata(store).map_err(|e| format!("Failed to add metadata for {}: {}", &resource_id, e))?;
964
965    // extract annotations (second pass)
966    converter
967        .extract_element_annotation(doc.root_element(), &path, Some(resource_id), 0, store)
968        .map_err(|e| {
969            format!(
970                "Error extracting element annotation from {}: {}",
971                resource_id,
972                e
973            )
974        })?;
975
976    Ok(())
977}
978
979pub fn filename_to_id<'a>(filename: &'a str, config: &XmlConversionConfig) -> &'a str {
980    for suffix in config.id_strip_suffix.iter() {
981        if filename.ends_with(suffix) {
982            return &filename[..filename.len() - suffix.len()];
983        }
984    }
985    return filename;
986}
987
/// Tells which part of an element's text output a recorded position refers to.
/// It is the third component of the keys in the converter's position maps.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
enum PositionType {
    /// The element's own text (presumably excluding rendered prefix/suffix; confirm at the use sites)
    Body,
    /// The text rendered from the element's `textprefix` template
    TextPrefix,
    /// The text rendered from the element's `textsuffix` template
    TextSuffix,
}
994
995struct XmlToStamConverter<'a> {
996    /// The current character position the conversion process is at
997    cursor: usize,
998
999    /// The extracted plain-text after/during untangling
1000    text: String,
1001
1002    /// The template engine
1003    template_engine: Engine<'a>,
1004
1005    /// Keep track of the new positions (unicode offset) where the node starts in the untangled document. The key consist of a document sequence number and a node ID.
1006    positionmap: HashMap<(usize,NodeId,PositionType), Offset>,
1007
1008    /// Keep track of the new positions (bytes offset) where the node starts in the untangled document. The key consist of a document sequence number and a node ID.
1009    bytepositionmap: HashMap<(usize,NodeId,PositionType), (usize, usize)>,
1010
1011    /// Keep track of markers (XML elements with `XmlAnnotationHandling::TextSelectorBetweenMarkers`), the key in this map is some hash of XmlElementConfig.
1012    markers: HashMap<usize, Vec<(usize,NodeId)>>,
1013
1014    /// The resource
1015    resource_handle: Option<TextResourceHandle>,
1016
1017    /// Used to keep track of whether we need to insert a whitespace before actual text
1018    pending_whitespace: bool,
1019
1020    /// The configuration
1021    config: &'a XmlConversionConfig,
1022
1023    /// Namespace to prefix map
1024    prefixes: HashMap<String, String>,
1025
1026    ///  Global context for template
1027    global_context: BTreeMap<String, upon::Value>,
1028
1029    /// Variable names per template
1030    variables: BTreeMap<String, BTreeSet<&'a str>>,
1031    
1032    debugindent: String,
1033}
1034
1035pub enum XmlConversionError {
1036    StamError(StamError),
1037    TemplateError(String, Option<upon::Error>),
1038    ConfigError(String),
1039}
1040
1041impl From<StamError> for XmlConversionError {
1042    fn from(error: StamError) -> Self {
1043        Self::StamError(error)
1044    }
1045}
1046
1047impl From<upon::Error> for XmlConversionError {
1048    fn from(error: upon::Error) -> Self {
1049        Self::TemplateError("".into(), Some(error))
1050    }
1051}
1052
1053impl Display for XmlConversionError {
1054    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
1055        match self {
1056            Self::StamError(e) => e.fmt(f),
1057            Self::TemplateError(s, e) => {
1058                f.write_str(s.as_str())?;
1059                f.write_str(": ")?;
1060                if let Some(e) = e {
1061                    e.fmt(f)?;
1062                }
1063                f.write_str("")
1064            }
1065            Self::ConfigError(e) => e.fmt(f),
1066        }
1067    }
1068}
1069
1070impl<'a> XmlToStamConverter<'a> {
1071    fn new(config: &'a XmlConversionConfig) -> Self {
1072        let mut prefixes: HashMap<String, String> = HashMap::new();
1073        for (prefix, namespace) in config.namespaces.iter() {
1074            prefixes.insert(namespace.to_string(), prefix.to_string());
1075        }
1076        let mut template_engine = Engine::new();
1077        template_engine.add_function("capitalize", filter_capitalize);
1078        template_engine.add_function("lower", str::to_lowercase);
1079        template_engine.add_function("upper", str::to_uppercase);
1080        template_engine.add_function("trim", |s: &str| s.trim().to_string() );
1081        template_engine.add_function("add", |a: i64, b: i64| a + b);
1082        template_engine.add_function("sub", |a: i64, b: i64| a - b);
1083        template_engine.add_function("mul", |a: i64, b: i64| a * b);
1084        template_engine.add_function("div", |a: i64, b: i64| a / b);
1085        template_engine.add_function("eq", |a: &upon::Value, b: &upon::Value| a == b);
1086        template_engine.add_function("ne", |a: &upon::Value, b: &upon::Value| a != b);
1087        template_engine.add_function("gt", |a: i64, b: i64| a > b);
1088        template_engine.add_function("lt", |a: i64, b: i64| a < b);
1089        template_engine.add_function("gte", |a: i64, b: i64| a >= b);
1090        template_engine.add_function("lte", |a: i64, b: i64| a <= b);
1091        template_engine.add_function("int", |a: &upon::Value| match a {
1092            upon::Value::Integer(x) => upon::Value::Integer(*x), 
1093            upon::Value::Float(x) => upon::Value::Integer(*x as i64), 
1094            upon::Value::String(s) => upon::Value::Integer(s.parse().expect("int filter expects an integer value")),
1095            _ => panic!("int filter expects an integer value"), //<< --^  TODO: PANIC IS WAY TO STRICT
1096        });
1097        template_engine.add_function("as_range", |a: i64| upon::Value::List(std::ops::Range { start: 0, end: a }.into_iter().map(|x| upon::Value::Integer(x+1)).collect::<Vec<_>>()) );
1098        template_engine.add_function("last", |list: &[upon::Value]| list.last().map(Clone::clone));
1099        template_engine.add_function("first", |list: &[upon::Value]| {
1100            list.first().map(Clone::clone)
1101        });
1102        template_engine.add_function("tokenize", |s: &str| {
1103            upon::Value::List(
1104                s.split(|c| c == ' ' || c == '\n').filter_map(|x|
1105                    if !x.is_empty() { 
1106                        Some(upon::Value::String(x.to_string())) 
1107                    } else {
1108                        None
1109                    }
1110                )
1111                .collect::<Vec<upon::Value>>())
1112        });
1113        template_engine.add_function("replace", |s: &str, from: &str, to: &str| { 
1114            upon::Value::String(s.replace(from,to))
1115        });
1116        template_engine.add_function("starts_with", |s: &str, prefix: &str| { 
1117            s.starts_with(prefix)
1118        });
1119        template_engine.add_function("ends_with", |s: &str, suffix: &str| { 
1120            s.ends_with(suffix)
1121        });
1122        template_engine.add_function("basename", |a: &upon::Value| match a {
1123            upon::Value::String(s) => upon::Value::String(s.split(|c| c == '/' || c == '\\').last().expect("splitting must work").to_string()),
1124            _ => panic!("basename filter expects a string value"), //<< --^  TODO: PANIC IS WAY TO STRICT
1125        });
1126        template_engine.add_function("noext", |a: &upon::Value| match a {
1127            upon::Value::String(s) => if let Some(pos) = s.rfind('.') {
1128                s[..pos].to_string()
1129            } else {
1130                s.to_string()
1131            },
1132            _ => panic!("basename filter expects a string value"), //<< --^  TODO: PANIC IS WAY TO STRICT
1133        });
1134        let mut converter = Self {
1135            cursor: 0,
1136            text: String::new(),
1137            template_engine,
1138            positionmap: HashMap::new(),
1139            bytepositionmap: HashMap::new(),
1140            markers: HashMap::new(),
1141            resource_handle: None,
1142            pending_whitespace: false,
1143            global_context: BTreeMap::new(),
1144            debugindent: String::new(),
1145            variables: BTreeMap::new(),
1146            prefixes,
1147            config,
1148        };
1149        converter.set_global_context();
1150        converter
1151    }
1152
1153    /// Compile templates
1154    fn compile(&mut self) -> Result<(), XmlConversionError> {
1155        if self.config.debug {
1156            eprintln!("[STAM fromxml] compiling templates");
1157        }
1158        for element in self.config.elements.iter() {
1159            if let Some(textprefix) = element.textprefix.as_ref() {
1160                if self.template_engine.get_template(textprefix.as_str()).is_none() {
1161                    let template = self.precompile(textprefix.as_str());
1162                    self.template_engine
1163                        .add_template(textprefix.clone(), template)
1164                        .map_err(|e| {
1165                            XmlConversionError::TemplateError(
1166                                format!("element/textprefix template {}", textprefix.clone()),
1167                                Some(e),
1168                            )
1169                        })?;
1170                }
1171            }
1172            if let Some(textsuffix) = element.textsuffix.as_ref() {
1173                if self.template_engine.get_template(textsuffix.as_str()).is_none() {
1174                    let template = self.precompile(textsuffix.as_str());
1175                    self.template_engine
1176                        .add_template(textsuffix.clone(), template)
1177                        .map_err(|e| {
1178                            XmlConversionError::TemplateError(
1179                                format!("element/textsuffix template {}", textsuffix.clone()),
1180                                Some(e),
1181                            )
1182                        })?;
1183                }
1184            }
1185            if let Some(id) = element.id.as_ref() {
1186                if self.template_engine.get_template(id.as_str()).is_none() {
1187                    let template = self.precompile(id.as_str());
1188                    self.template_engine.add_template(id.clone(), template).map_err(|e| {
1189                        XmlConversionError::TemplateError(
1190                            format!("element/id template {}", id.clone()),
1191                            Some(e),
1192                        )
1193                    })?;
1194                }
1195            }
1196            for annotationdata in element.annotationdata.iter().chain(element.annotatetextprefix.iter()).chain(element.annotatetextsuffix.iter()) {
1197                if let Some(id) = annotationdata.id.as_ref() {
1198                    if self.template_engine.get_template(id.as_str()).is_none() {
1199                        let template = self.precompile(id.as_str());
1200                        self.template_engine.add_template(id.clone(), template).map_err(|e| {
1201                            XmlConversionError::TemplateError(
1202                                format!("annotationdata/id template {}", id.clone()),
1203                                Some(e),
1204                            )
1205                        })?;
1206                    }
1207                }
1208                if let Some(set) = annotationdata.set.as_ref() {
1209                    if self.template_engine.get_template(set.as_str()).is_none() {
1210                        let template = self.precompile(set.as_str());
1211                        //eprintln!("------- DEBUG: {} -> {}", set.as_str(), template);
1212                        self.template_engine.add_template(set.clone(), template).map_err(|e| {
1213                            XmlConversionError::TemplateError(
1214                                format!("annotationdata/set template {}", set.clone()),
1215                                Some(e),
1216                            )
1217                        })?;
1218                    }
1219                }
1220                if let Some(key) = annotationdata.key.as_ref() {
1221                    if self.template_engine.get_template(key.as_str()).is_none() {
1222                        let template = self.precompile(key.as_str());
1223                        self.template_engine.add_template(key.clone(), template).map_err(|e| {
1224                            XmlConversionError::TemplateError(
1225                                format!("annotationdata/key template {}", key.clone()),
1226                                Some(e),
1227                            )
1228                        })?;
1229                    }
1230                }
1231                if let Some(value) = annotationdata.value.as_ref() {
1232                    self.compile_value(value)?;
1233                }
1234            }
1235        }
1236        for metadata in self.config.metadata.iter() {
1237            if let Some(id) = metadata.id.as_ref() {
1238                if self.template_engine.get_template(id.as_str()).is_none() {
1239                    let template = self.precompile(id.as_str());
1240                    self.template_engine.add_template(id.clone(), template).map_err(|e| {
1241                        XmlConversionError::TemplateError(
1242                            format!("metadata/id template {}", id.clone()),
1243                            Some(e),
1244                        )
1245                    })?;
1246                }
1247            }
1248            for annotationdata in metadata.annotationdata.iter() {
1249                if let Some(id) = annotationdata.id.as_ref() {
1250                    if self.template_engine.get_template(id.as_str()).is_none() {
1251                        let template = self.precompile(id.as_str());
1252                        self.template_engine.add_template(id.clone(), template).map_err(|e| {
1253                            XmlConversionError::TemplateError(
1254                                format!("annotationdata/id template {}", id.clone()),
1255                                Some(e),
1256                            )
1257                        })?;
1258                    }
1259                }
1260                if let Some(set) = annotationdata.set.as_ref() {
1261                    if self.template_engine.get_template(set.as_str()).is_none() {
1262                        let template = self.precompile(set.as_str());
1263                        //eprintln!("------- DEBUG: {} -> {}", set.as_str(), template);
1264                        self.template_engine.add_template(set.clone(), template).map_err(|e| {
1265                            XmlConversionError::TemplateError(
1266                                format!("annotationdata/set template {}", set.clone()),
1267                                Some(e),
1268                            )
1269                        })?;
1270                    }
1271                }
1272                if let Some(key) = annotationdata.key.as_ref() {
1273                    if self.template_engine.get_template(key.as_str()).is_none() {
1274                        let template = self.precompile(key.as_str());
1275                        self.template_engine.add_template(key.clone(), template).map_err(|e| {
1276                            XmlConversionError::TemplateError(
1277                                format!("annotationdata/key template {}", key.clone()),
1278                                Some(e),
1279                            )
1280                        })?;
1281                    }
1282                }
1283                if let Some(value) = annotationdata.value.as_ref() {
1284                    self.compile_value(value)?;
1285                }
1286            }
1287        }
1288        Ok(())
1289    }
1290
1291    /// Compile templates from a value, all strings are considered templates
1292    fn compile_value(&mut self, value: &'a toml::Value) -> Result<(), XmlConversionError> {
1293        match value {
1294            toml::Value::String(value) => {
1295                if self.template_engine.get_template(value.as_str()).is_none() {
1296                    let template = self.precompile(value.as_str());
1297                    self.template_engine.add_template(value.clone(), template).map_err(|e| {
1298                        XmlConversionError::TemplateError(
1299                            format!("annotationdata/value template {}", value.clone()),
1300                            Some(e),
1301                        )
1302                    })?;
1303                }
1304            }
1305            toml::Value::Table(map) => {
1306                for (_key, value) in map.iter() {
1307                    self.compile_value(value)?;
1308                }
1309            },
1310            toml::Value::Array(list) => {
1311                for value in list.iter() {
1312                    self.compile_value(value)?;
1313                }
1314            }
1315            _ => {} //no templates in other types
1316        }
1317        Ok(())
1318    }
1319
1320    /// untangle text, extract the text (and only the text)
1321    /// from an XML document, according to the
1322    /// mapping configuration and creates a STAM TextResource for it.
1323    /// Records exact offsets per element/node for later use during annotation extraction.
1324    fn extract_element_text<'b>(
1325        &mut self,
1326        node: Node<'a,'b>,
1327        path: &NodePath<'a,'b>,
1328        whitespace: XmlWhitespaceHandling,
1329        resource_id: Option<&str>,
1330        inputfile: Option<&str>,
1331        doc_num: usize,
1332    ) -> Result<(), XmlConversionError> {
1333        if self.config.debug {
1334            eprintln!("[STAM fromxml]{} extracting text for element {}", self.debugindent, path);
1335        }
1336        let mut begin = self.cursor; //current character pos marks the begin
1337        let mut bytebegin = self.text.len(); //current byte pos marks the begin
1338        let mut end_discount = 0; //the discount may be needed later if textsuffixes are outputted (which we do not want as part of the annotation)
1339        let mut end_bytediscount = 0;
1340        let mut firsttext = true; //tracks whether we have already outputted some text, needed for whitespace handling
1341
1342        let mut elder_siblings = SiblingCounter::default();
1343
1344        // obtain the configuration that applies to this element
1345        if let Some(element_config) = self.config.element_config(node, path) {
1346            if self.config.debug {
1347                eprintln!("[STAM fromxml]{} matching config: {:?}", self.debugindent, element_config);
1348            }
1349
1350            if (element_config.stop == Some(false) || element_config.stop.is_none())
1351                && element_config.annotation != XmlAnnotationHandling::TextSelectorBetweenMarkers
1352            {
1353                //do text extraction for this element
1354
1355                let whitespace = if node.has_attribute((NS_XML, "space")) {
1356                    // if there is an explicit xml:space attributes, it overrides whatever whitespace handling we have set:
1357                    match node.attribute((NS_XML, "space")).unwrap() {
1358                        "preserve" => XmlWhitespaceHandling::Preserve,
1359                        "collapse" | "replace" => XmlWhitespaceHandling::Collapse,
1360                        _ => whitespace,
1361                    }
1362                } else if element_config.whitespace == XmlWhitespaceHandling::Inherit
1363                    || element_config.whitespace == XmlWhitespaceHandling::Unspecified
1364                {
1365                    whitespace //from parent, i.e. passed to this (recursive) function by caller
1366                } else {
1367                    element_config.whitespace //default from the config
1368                };
1369
1370                // process the text prefix, a text template to include prior to the actual text
1371                if let Some(textprefix) = &element_config.textprefix {
1372                    self.pending_whitespace = false;
1373                    if self.config.debug {
1374                        eprintln!("[STAM fromxml]{} outputting textprefix: {:?}", self.debugindent, textprefix);
1375                    }
1376                    let result =
1377                        self.render_template(textprefix, &node, Some(self.cursor), None, resource_id, inputfile, doc_num)
1378                            .map_err(|e| match e {
1379                                XmlConversionError::TemplateError(s, e) => {
1380                                    XmlConversionError::TemplateError(
1381                                        format!(
1382                                        "whilst rendering textprefix template '{}' for node '{}': {}",
1383                                        textprefix, node.tag_name().name(), s
1384                                    ),
1385                                        e,
1386                                    )
1387                                }
1388                                e => e,
1389                            })?;
1390                    let result_charlen = result.chars().count();
1391
1392                    if !element_config.annotatetextprefix.is_empty() {
1393                        //record the offsets for textprefix annotation later
1394                        let offset = Offset::simple(self.cursor, self.cursor + result_charlen);
1395                        self.positionmap.insert((doc_num, node.id(), PositionType::TextPrefix), offset);
1396                        self.bytepositionmap
1397                            .insert((doc_num, node.id(), PositionType::TextPrefix), (bytebegin, bytebegin + result.len()));
1398                    }
1399
1400                    self.cursor += result_charlen;
1401                    self.text += &result;
1402
1403                    if element_config.include_textprefix != Some(true) {
1404                        // the textprefix will not be part of the annotation's text selection, increment the offsets:
1405                        begin += result_charlen;
1406                        bytebegin += result.len();
1407                    }
1408                }
1409
1410                let textbegin = self.cursor;
1411                // process all child elements
1412                for child in node.children() {
1413                    if self.config.debug {
1414                        eprintln!("[STAM fromxml]{} child {:?}", self.debugindent, child);
1415                    }
1416                    if child.is_text() && element_config.text == Some(true) {
1417                        // extract the actual element text
1418                        // this may trigger multiple times if the XML element (`node`) has mixed content
1419
1420                        let mut innertext = child.text().expect("text node must have text");
1421                        let mut pending_whitespace = false;
1422                        let mut leading_whitespace = false;
1423                        if whitespace == XmlWhitespaceHandling::Collapse && !innertext.is_empty() {
1424                            // analyse what kind of whitespace we are dealing with
1425                            let mut all_whitespace = true;
1426                            leading_whitespace = innertext.chars().next().unwrap().is_whitespace();
1427
1428                            // any pending whitespace after this elements is 'buffered' in this boolean
1429                            // and only written out depending on the next text's whitespace situation
1430                            pending_whitespace = innertext
1431                                .chars()
1432                                .inspect(|c| {
1433                                    if !c.is_whitespace() {
1434                                        all_whitespace = false
1435                                    }
1436                                })
1437                                .last()
1438                                .unwrap()
1439                                .is_whitespace();
1440                            if all_whitespace {
1441                                self.pending_whitespace = true;
1442                                if self.config.debug {
1443                                    eprintln!(
1444                                        "[STAM fromxml]{} ^- all whitespace, flag pending whitespace and skipping...",
1445                                        self.debugindent,
1446                                    );
1447                                }
1448                                continue;
1449                            }
1450                            innertext = innertext.trim();
1451                            if self.config.debug {
1452                                eprintln!(
1453                                    "[STAM fromxml]{} ^- collapsed whitespace: {:?}",
1454                                    self.debugindent,
1455                                    innertext
1456                                );
1457                            }
1458                        }
1459                        if self.pending_whitespace || leading_whitespace {
1460                            //output any pending whitespace
1461                            if !self.text.is_empty()
1462                                && !self.text.chars().rev().next().unwrap().is_whitespace()
1463                            {
1464                                if self.config.debug {
1465                                    eprintln!("[STAM fromxml]{} ^- outputting pending whitespace",self.debugindent);
1466                                }
1467                                self.text.push(' ');
1468                                self.cursor += 1;
1469                                if firsttext && self.pending_whitespace {
1470                                    begin += 1;
1471                                    bytebegin += 1;
1472                                    firsttext = false;
1473                                }
1474                            }
1475                            self.pending_whitespace = false;
1476                        }
1477
1478                        // finally we output the actual text, and advance the cursor
1479                        if whitespace == XmlWhitespaceHandling::Collapse {
1480                            let mut prevc = ' ';
1481                            let mut innertext = innertext.replace(|c: char| c.is_whitespace(), " ");
1482                            innertext.retain(|c| {
1483                                let do_retain = c != ' ' || prevc != ' ';
1484                                prevc = c;
1485                                do_retain
1486                            });
1487                            self.text += &innertext;
1488                            self.cursor += innertext.chars().count();
1489                            if self.config.debug {
1490                                eprintln!("[STAM fromxml]{} ^- outputting text child (collapsed whitespace), cursor is now {}: {}",self.debugindent, self.cursor, innertext);
1491                            }
1492                        } else {
1493                            self.text += &innertext;
1494                            self.cursor += innertext.chars().count();
1495                            if self.config.debug {
1496                                eprintln!("[STAM fromxml]{} ^- outputting text child, cursor is now {}: {}",self.debugindent, self.cursor, innertext);
1497                            }
1498                        }
1499                        self.pending_whitespace = pending_whitespace;
1500                    } else if child.is_element() {
1501                        if self.config.debug {
1502                            eprintln!("[STAM fromxml]{} \\- extracting text for this child", self.debugindent);
1503                        }
1504                        self.debugindent.push_str("  ");
1505                        // recursion step, process child element, pass our whitespace handling mode since it may inherit it
1506                        let mut path = path.clone();
1507                        let count = elder_siblings.count(&child);
1508                        path.add(&child, Some(count));
1509                        self.extract_element_text(child, &path, whitespace, resource_id, inputfile, doc_num)?;
1510                        self.debugindent.pop();
1511                        self.debugindent.pop();
1512                    } else {
1513                        if self.config.debug {
1514                            eprintln!("[STAM fromxml]{} ^- skipping this child node", self.debugindent);
1515                        }
1516                        continue;
1517                    }
1518                }
1519
1520
1521                // process the text suffix, a preconfigured string of text to include after to the actual text
1522                if let Some(textsuffix) = &element_config.textsuffix {
1523                    if self.config.debug {
1524                        eprintln!("[STAM fromxml]{} outputting textsuffix: {:?}", self.debugindent, textsuffix);
1525                    }
1526                    let result = self.render_template(
1527                        textsuffix.as_str(),
1528                        &node,
1529                        Some(textbegin),
1530                        Some(self.cursor),
1531                        resource_id,
1532                        inputfile,
1533                        doc_num
1534                    ).map_err(|e| match e {
1535                            XmlConversionError::TemplateError(s, e) => {
1536                                XmlConversionError::TemplateError(
1537                                    format!(
1538                                        "whilst rendering textsuffix template '{}' for node '{}': {}",
1539                                        textsuffix,
1540                                        node.tag_name().name(),
1541                                        s
1542                                    ),
1543                                    e,
1544                                )
1545                            }
1546                            e => e,
1547                    })?;
1548                    let end_discount_tmp = result.chars().count();
1549                    let end_bytediscount_tmp = result.len();
1550
1551
1552                    self.text += &result;
1553
1554                    if !element_config.annotatetextsuffix.is_empty() {
1555                        //record the offsets for textsuffix annotation later
1556                        let offset = Offset::simple(self.cursor, self.cursor + end_discount_tmp);
1557                        self.positionmap.insert((doc_num, node.id(), PositionType::TextSuffix), offset);
1558                        self.bytepositionmap
1559                            .insert((doc_num, node.id(), PositionType::TextSuffix), (self.text.len() - end_bytediscount_tmp, self.text.len()));
1560                    }
1561
1562                    self.cursor += end_discount_tmp;
1563                    self.pending_whitespace = false;
1564
1565                    if element_config.include_textsuffix == Some(true) {
1566                        // the textsuffix will be part of the annotation's text selection, no discount for later
1567                        end_discount = 0;
1568                        end_bytediscount = 0;
1569                    } else {
1570                        // the textsuffix will not be part of the annotation's text selection, set discounts for later
1571                        end_discount = end_discount_tmp;
1572                        end_bytediscount = end_bytediscount_tmp;
1573                    }
1574
1575                }
1576            } else if element_config.annotation == XmlAnnotationHandling::TextSelectorBetweenMarkers
1577            {
1578                // this is a marker, keep track of it so we can extract the span between markers in [`extract_element_annotation()`] later
1579                if self.config.debug {
1580                    eprintln!("[STAM fromxml]{} adding to markers", self.debugindent);
1581                }
1582                self.markers
1583                    .entry(element_config.hash())
1584                    .and_modify(|v| v.push((doc_num, node.id())))
1585                    .or_insert(vec![(doc_num, node.id())]);
1586            }
1587        } else if self.config.debug {
1588            eprintln!(
1589                "[STAM fromxml]{} WARNING: no match, skipping text extraction for element {}",
1590                self.debugindent,
1591                path
1592            );
1593        }
1594
1595        // Last, we store the new text offsets for this element/node so
1596        // we can use it in [`extract_element_annotation()`] to associate
1597        // actual annotations with this span.
1598        if begin <= (self.cursor - end_discount) {
1599            let offset = Offset::simple(begin, self.cursor - end_discount);
1600            if self.config.debug {
1601                eprintln!(
1602                    "[STAM fromxml]{} extracted text for {} @{:?}: {:?}",
1603                    self.debugindent,
1604                    path,
1605                    &offset,
1606                    &self.text[bytebegin..(self.text.len() - end_bytediscount)]
1607                );
1608            }
1609            self.positionmap.insert((doc_num, node.id(), PositionType::Body), offset);
1610            self.bytepositionmap
1611                .insert((doc_num, node.id(), PositionType::Body), (bytebegin, self.text.len() - end_bytediscount));
1612        }
1613        Ok(())
1614    }
1615
1616    /// extract annotations from the XML document
1617    /// according to the mapping configuration and creates a STAM TextResource for it.
1618    /// The text, for the full document, must have already been extracted earlier with [`extract_element_text()`].
1619    /// This relies on the exact offsets per element/node computed earlier during text extraction (`positionmap`).
1620    fn extract_element_annotation<'b>(
1621        &mut self,
1622        node: Node<'a,'b>,
1623        path: &NodePath<'a,'b>,
1624        inputfile: Option<&str>,
1625        doc_num: usize,
1626        store: &mut AnnotationStore,
1627    ) -> Result<(), XmlConversionError> {
1628        if self.config.debug {
1629            eprintln!("[STAM fromxml]{} extracting annotation from {}", self.debugindent, path);
1630        }
1631
1632        let mut elder_siblings = SiblingCounter::default();
1633
1634        // obtain the configuration that applies to this element
1635        if let Some(element_config) = self.config.element_config(node, &path) {
1636            if self.config.debug {
1637                eprintln!("[STAM fromxml]{} matching config: {:?}", self.debugindent, element_config);
1638            }
1639            if element_config.annotation != XmlAnnotationHandling::None
1640                && element_config.annotation != XmlAnnotationHandling::Unspecified
1641            {
1642                let mut builder = AnnotationBuilder::new();
1643
1644                //prepare variables to pass to the template context
1645                let offset = self.positionmap.get(&(doc_num, node.id(), PositionType::Body));
1646                if element_config.annotation == XmlAnnotationHandling::TextSelector {
1647                    if let Some((beginbyte, endbyte)) = self.bytepositionmap.get(&(doc_num, node.id(), PositionType::Body)) {
1648                        if self.config.debug {
1649                            eprintln!("[STAM fromxml]{} annotation covers text {:?} (bytes {}-{})", self.debugindent, offset, beginbyte, endbyte);
1650                        }
1651                    }  else if self.text.is_empty() {
1652                        return Err(XmlConversionError::ConfigError("Can't extract annotations on text if no text was extracted!".into()));
1653                    }
1654                }
1655                let begin = if let Some(offset) = offset {
1656                    if let Cursor::BeginAligned(begin) = offset.begin {
1657                        Some(begin)
1658                    } else {
1659                        None
1660                    }
1661                } else {
1662                    None
1663                };
1664                let end = if let Some(offset) = offset {
1665                    if let Cursor::BeginAligned(end) = offset.end {
1666                        Some(end)
1667                    } else {
1668                        None
1669                    }
1670                } else {
1671                    None
1672                };
1673
1674                let resource_id = if let Some(resource_handle) = self.resource_handle {
1675                    store.resource(resource_handle).unwrap().id()
1676                } else {
1677                    None
1678                };
1679
1680                let mut have_id = false;
1681                if let Some(template) = &element_config.id {
1682                    let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
1683                    let compiled_template = self.template_engine.template(template.as_str());
1684                    let id = compiled_template.render(&context).to_string().map_err(|e| 
1685                            XmlConversionError::TemplateError(
1686                                format!(
1687                                    "whilst rendering id template '{}' for node '{}'",
1688                                    template,
1689                                    node.tag_name().name(),
1690                                ),
1691                                Some(e),
1692                            )
1693                        )?;
1694                    if !id.is_empty() {
1695                        builder = builder.with_id(id);
1696                        have_id = true;
1697                    }
1698                }
1699
1700                if !have_id {
1701                    //generate a random ID if we have none
1702                    if let Some(resource_id) = resource_id {
1703                        builder = builder.with_id(stam::generate_id(&format!("{}-",resource_id), ""));
1704                    } else {
1705                        builder = builder.with_id(stam::generate_id("", ""));
1706                    }
1707                }
1708
1709                builder = self.add_annotationdata_to_builder(element_config.annotationdata.iter(), builder, node.clone(), begin, end, resource_id, inputfile, doc_num)?;
1710
1711
1712                if self.config.provenance  && inputfile.is_some() {
1713                    let path_string = if let Some(id) = node.attribute((NS_XML,"id")) {
1714                        //node has an ID, use that
1715                        format!("//{}[@xml:id=\"{}\"]", self.get_node_name_for_xpath(&node), id)
1716                    } else {
1717                        //no ID, use full XPath expression
1718                        path.format_as_xpath(&self.prefixes)
1719                    };
1720                    let databuilder = AnnotationDataBuilder::new().with_dataset(CONTEXT_ANNO.into()).with_key("target".into()).with_value(
1721                        BTreeMap::from([
1722                            ("source".to_string(),inputfile.unwrap().into()),
1723                            ("selector".to_string(), 
1724                                    BTreeMap::from([
1725                                        ("type".to_string(),"XPathSelector".into()),
1726                                        ("value".to_string(),path_string.into())
1727                                    ]).into()
1728                            )
1729                        ]).into()
1730                    );
1731                    builder = builder.with_data_builder(databuilder);
1732                }
1733
1734
1735                // Finish the builder and add the actual annotation to the store, according to its element handling
1736                match element_config.annotation {
1737                    XmlAnnotationHandling::TextSelector => {
1738                        // Annotation is on text, translates to TextSelector
1739                        if let Some(selector) = self.textselector(node, doc_num, PositionType::Body) {
1740                            builder = builder.with_target(selector);
1741                            if self.config.debug {
1742                                eprintln!("[STAM fromxml]   builder AnnotateText: {:?}", builder);
1743                            }
1744                            store.annotate(builder)?;
1745                        }
1746                        if !element_config.annotatetextprefix.is_empty() || !element_config.annotatetextsuffix.is_empty() {
1747                            self.annotate_textaffixes(node, element_config, inputfile, doc_num, store)?;
1748                        }
1749                    }
1750                    XmlAnnotationHandling::ResourceSelector => {
1751                        // Annotation is metadata, translates to ResourceSelector
1752                        builder = builder.with_target(SelectorBuilder::ResourceSelector(
1753                            self.resource_handle.into(),
1754                        ));
1755                        if self.config.debug {
1756                            eprintln!("[STAM fromxml]   builder AnnotateResource: {:?}", builder);
1757                        }
1758                        store.annotate(builder)?;
1759                    }
1760                    XmlAnnotationHandling::TextSelectorBetweenMarkers => {
1761                        // Annotation is on a text span *between* two marker elements
1762                        if let Some(selector) =
1763                            self.textselector_for_markers(node, doc_num, store, element_config)
1764                        {
1765                            builder = builder.with_target(selector);
1766                            if self.config.debug {
1767                                eprintln!(
1768                                    "[STAM fromxml]   builder TextSelectorBetweenMarkers: {:?}",
1769                                    builder
1770                                );
1771                            }
1772                            store.annotate(builder)?;
1773                            if !element_config.annotatetextprefix.is_empty() || !element_config.annotatetextsuffix.is_empty() {
1774                                self.annotate_textaffixes(node, element_config, inputfile, doc_num, store)?;
1775                            }
1776                        }
1777                    }
1778                    _ => panic!(
1779                        "Invalid annotationhandling: {:?}",
1780                        element_config.annotation
1781                    ),
1782                }
1783            }
1784
1785            // Recursion step
1786            if element_config.stop == Some(false) || element_config.stop.is_none() {
1787                for child in node.children() {
1788                    if child.is_element() {
1789                        self.debugindent.push_str("  ");
1790                        let mut path = path.clone();
1791                        let count = elder_siblings.count(&child);
1792                        path.add(&child, Some(count));
1793                        //eprintln!("DEBUG: count={}, child={:?}, parent={:?}, elder_siblings={:?}", count, child.tag_name(), node.tag_name(), elder_siblings);
1794                        self.extract_element_annotation(child, &path, inputfile, doc_num, store)?;
1795                        self.debugindent.pop();
1796                        self.debugindent.pop();
1797                    }
1798                }
1799            }
1800        } else {
1801            eprintln!(
1802                "[STAM fromxml]{} WARNING: no match, skipping annotation extraction for element {}",
1803                self.debugindent,
1804                path
1805            );
1806        }
1807        Ok(())
1808    }
1809
1810    fn add_annotationdata_to_builder<'input>(&self, iter: impl Iterator<Item = &'a XmlAnnotationDataConfig>,
1811        mut builder: AnnotationBuilder<'a>,
1812        node: Node<'a, 'input>,
1813        begin: Option<usize>,
1814        end: Option<usize>,
1815        resource_id: Option<&str>,
1816        inputfile: Option<&str>,
1817        doc_num: usize,
1818    ) -> Result<AnnotationBuilder<'a>, XmlConversionError> {
1819        for annotationdata in iter {
1820            let mut databuilder = AnnotationDataBuilder::new();
1821            if let Some(template) = &annotationdata.set {
1822                let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
1823                let compiled_template = self.template_engine.template(template.as_str());
1824                let dataset = compiled_template.render(&context).to_string().map_err(|e| 
1825                        XmlConversionError::TemplateError(
1826                            format!(
1827                                "whilst rendering annotationdata/dataset template '{}' for node '{}'",
1828                                template,
1829                                node.tag_name().name(),
1830                            ),
1831                            Some(e),
1832                        )
1833                    )?;
1834                if !dataset.is_empty() {
1835                    databuilder = databuilder.with_dataset(dataset.into())
1836                }
1837            } else {
1838                databuilder =
1839                    databuilder.with_dataset(self.config.default_set.as_str().into());
1840            }
1841            if let Some(template) = &annotationdata.key {
1842                let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
1843                let compiled_template = self.template_engine.template(template.as_str());
1844                match compiled_template.render(&context).to_string().map_err(|e| 
1845                        XmlConversionError::TemplateError(
1846                            format!(
1847                                "whilst rendering annotationdata/key template '{}' for node '{}'",
1848                                template,
1849                                node.tag_name().name(),
1850                            ),
1851                            Some(e),
1852                        )
1853                    )  {
1854                    Ok(key) if !key.is_empty() =>
1855                        databuilder = databuilder.with_key(key.into()) ,
1856                    Ok(_) if !annotationdata.skip_if_missing => {
1857                        return Err(XmlConversionError::TemplateError(
1858                            format!(
1859                                "whilst rendering annotationdata/key template '{}' for node '{}'",
1860                                template,
1861                                node.tag_name().name(),
1862                            ),
1863                            None
1864                        ));
1865                    },
1866                    Err(e) if !annotationdata.skip_if_missing => {
1867                        return Err(e)
1868                    },
1869                    _ => {
1870                        //skip whole databuilder if missing
1871                        continue
1872                    }
1873                }
1874            }
1875            if let Some(value) = &annotationdata.value {
1876                match self.extract_value(value,  node, annotationdata.allow_empty_value, annotationdata.skip_if_missing, begin, end, resource_id, inputfile, doc_num)? {
1877                    Some(value) => {
1878                        databuilder = databuilder.with_value(value);
1879                    },
1880                    None =>  {
1881                        //skip whole databuilder if missing
1882                        continue
1883                    }
1884                }
1885            }
1886            builder = builder.with_data_builder(databuilder);
1887        }
1888        Ok(builder)
1889    }
1890
1891    /// Annotates textprefix and textsuffix, if applicable
1892    fn annotate_textaffixes<'b>(
1893        &mut self,
1894        node: Node<'a,'b>,
1895        element_config: &XmlElementConfig,
1896        inputfile: Option<&str>,
1897        doc_num: usize,
1898        store: &mut AnnotationStore,
1899    ) -> Result<(), XmlConversionError> {
1900
1901
1902        if !element_config.annotatetextprefix.is_empty() {
1903            let mut builder = AnnotationBuilder::new().with_id(stam::generate_id("textprefix-", ""));
1904            if let Some(offset) = self.positionmap.get(&(doc_num, node.id(), PositionType::TextPrefix)) {
1905                let begin = if let Cursor::BeginAligned(begin) = offset.begin {
1906                        Some(begin)
1907                    } else {
1908                        None
1909                    };
1910                let end = if let Cursor::BeginAligned(end) = offset.end {
1911                        Some(end)
1912                    } else {
1913                        None
1914                    };
1915                builder = self.add_annotationdata_to_builder(element_config.annotatetextprefix.iter(), builder, node.clone(), begin,end, None, inputfile, doc_num)?; //MAYBE TODO: pass resource_id
1916                if let Some(selector) = self.textselector(node, doc_num, PositionType::TextPrefix) {
1917                    builder = builder.with_target(selector);
1918                    if self.config.debug {
1919                        eprintln!("[STAM fromxml]   builder AnnotateText: {:?}", builder);
1920                    }
1921                    store.annotate(builder)?;
1922                } else {
1923                    return Err(XmlConversionError::ConfigError("Failed to create textselector to target textprefix".into()));
1924                }
1925            }
1926        }
1927
1928        if !element_config.annotatetextsuffix.is_empty() {
1929            let mut builder = AnnotationBuilder::new().with_id(stam::generate_id("textsuffix-", ""));
1930            if let Some(offset) = self.positionmap.get(&(doc_num, node.id(), PositionType::TextSuffix)) {
1931                let begin = if let Cursor::BeginAligned(begin) = offset.begin {
1932                        Some(begin)
1933                    } else {
1934                        None
1935                    };
1936                let end = if let Cursor::BeginAligned(end) = offset.end {
1937                        Some(end)
1938                    } else {
1939                        None
1940                    };
1941                builder = self.add_annotationdata_to_builder(element_config.annotatetextsuffix.iter(), builder, node.clone(), begin,end, None, inputfile, doc_num)?; //MAYBE TODO: pass resource_id
1942                if let Some(selector) = self.textselector(node, doc_num, PositionType::TextSuffix) {
1943                    builder = builder.with_target(selector);
1944                    if self.config.debug {
1945                        eprintln!("[STAM fromxml]   builder AnnotateText: {:?}", builder);
1946                    }
1947                    store.annotate(builder)?;
1948                } else {
1949                    return Err(XmlConversionError::ConfigError("Failed to create textselector to target textprefix".into()));
1950                }
1951            }
1952        }
1953        Ok(())
1954    }
1955
1956    /// Extract values, running the templating engine in case of string values
1957    fn extract_value<'b>(&self, value: &'a toml::Value, node: Node<'a,'b>, allow_empty_value: bool, skip_if_missing: bool, begin: Option<usize>, end: Option<usize>, resource_id: Option<&str>, inputfile: Option<&str>, doc_num: usize) -> Result<Option<DataValue>, XmlConversionError>{
1958        match value {
1959            toml::Value::String(template) => {  
1960                let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
1961                /*
1962                if self.config.debug() {
1963                    eprintln!(
1964                        "[STAM fromxml]              Context for annotationdata/map template '{}' for node '{}': {:?}",
1965                        template,
1966                        node.tag_name().name(),
1967                        context
1968                    );
1969                }
1970                */
1971                let compiled_template = self.template_engine.template(template.as_str()); //panics if doesn't exist, but that can't happen
1972                match compiled_template.render(&context).to_string().map_err(|e| 
1973                        XmlConversionError::TemplateError(
1974                            format!(
1975                                "whilst rendering annotationdata/map template '{}' for node '{}'.{}",
1976                                template,
1977                                node.tag_name().name(),
1978                                if self.config.debug() {
1979                                    format!("\nContext was {:?}.\nVariables are: {:?}", context, self.variables.get(template))
1980                                } else {
1981                                    String::new()
1982                                }
1983                            ),
1984                            Some(e),
1985                        )
1986                    )  {
1987                    Ok(value) => {
1988                        if !value.is_empty() || allow_empty_value {
1989                            Ok(Some(value.into()))
1990                        } else {
1991                            //skip
1992                            Ok(None)
1993                        }
1994                    },
1995                    Err(e) if !skip_if_missing => {
1996                        Err(e)
1997                    },
1998                    Err(_) if allow_empty_value => {
1999                        Ok(Some("".into()))
2000                    },
2001                    Err(_) => {
2002                        //skip whole databuilder if missing
2003                        Ok(None)
2004                    }
2005                }
2006            },
2007            toml::Value::Table(map) => {  
2008                let mut resultmap: BTreeMap<String,DataValue> = BTreeMap::new();
2009                for (key, value) in map.iter() {
2010                    if let Some(value) = self.extract_value(value,  node, false, true, begin, end, resource_id, inputfile, doc_num)? {
2011                        resultmap.insert(key.clone(), value);
2012                    }
2013                }
2014                Ok(Some(resultmap.into()))
2015            },
2016            toml::Value::Array(list) => {  
2017                let mut resultlist: Vec<DataValue> = Vec::new();
2018                for value in list.iter() {
2019                    if let Some(value) = self.extract_value(value, node, false, true, begin, end, resource_id, inputfile, doc_num)? {
2020                        resultlist.push(value);
2021                    }
2022                }
2023                Ok(Some(resultlist.into()))
2024            }
2025            toml::Value::Boolean(v) => Ok(Some(DataValue::Bool(*v))),
2026            toml::Value::Float(v) => Ok(Some(DataValue::Float(*v))),
2027            toml::Value::Integer(v) => Ok(Some(DataValue::Int(*v as isize))),
2028            toml::Value::Datetime(_v) => {
2029                todo!("fromxml: Datetime conversion not implemented yet");
2030            }
2031        }
2032    }
2033
2034    /// Extract values for metadata (no associated node), running the templating engine in case of string values
2035    fn extract_value_metadata<'b>(&self, value: &'a toml::Value, context: &upon::Value, allow_empty_value: bool, skip_if_missing: bool, resource_id: Option<&str>) -> Result<Option<DataValue>, XmlConversionError>{
2036        match value {
2037            toml::Value::String(template) => {  
2038                let compiled_template = self.template_engine.template(template.as_str()); //panics if doesn't exist, but that can't happen
2039                match compiled_template.render(&context).to_string().map_err(|e| 
2040                        XmlConversionError::TemplateError(
2041                            format!(
2042                                "whilst rendering annotationdata/metadata template '{}' for metadata",
2043                                template,
2044                            ),
2045                            Some(e),
2046                        )
2047                    )  {
2048                    Ok(value) => {
2049                        if !value.is_empty() || allow_empty_value {
2050                            Ok(Some(value.into()))
2051                        } else {
2052                            //skip
2053                            Ok(None)
2054                        }
2055                    },
2056                    Err(e) if !skip_if_missing => {
2057                        Err(e)
2058                    },
2059                    Err(_) if allow_empty_value => {
2060                        Ok(Some("".into()))
2061                    },
2062                    Err(_) => {
2063                        //skip whole databuilder if missing
2064                        Ok(None)
2065                    }
2066                }
2067            },
2068            toml::Value::Table(map) => {  
2069                let mut resultmap: BTreeMap<String,DataValue> = BTreeMap::new();
2070                for (key, value) in map.iter() {
2071                    if let Some(value) = self.extract_value_metadata(value, context, false, true,  resource_id)? {
2072                        resultmap.insert(key.clone(), value);
2073                    }
2074                }
2075                Ok(Some(resultmap.into()))
2076            },
2077            toml::Value::Array(list) => {  
2078                let mut resultlist: Vec<DataValue> = Vec::new();
2079                for value in list.iter() {
2080                    if let Some(value) = self.extract_value_metadata(value, context, false, true, resource_id)? {
2081                        resultlist.push(value);
2082                    }
2083                }
2084                Ok(Some(resultlist.into()))
2085            }
2086            toml::Value::Boolean(v) => Ok(Some(DataValue::Bool(*v))),
2087            toml::Value::Float(v) => Ok(Some(DataValue::Float(*v))),
2088            toml::Value::Integer(v) => Ok(Some(DataValue::Int(*v as isize))),
2089            toml::Value::Datetime(_v) => {
2090                todo!("fromxml: Datetime conversion not implemented yet");
2091            }
2092        }
2093    }
2094
2095    /// Select text corresponding to the element/node and document number
2096    fn textselector<'s>(&'s self, node: Node, doc_num: usize, positiontype: PositionType) -> Option<SelectorBuilder<'s>> {
2097        let res_handle = self.resource_handle.expect("resource must be associated");
2098        if let Some(offset) = self.positionmap.get(&(doc_num, node.id(), positiontype)) {
2099            Some(SelectorBuilder::TextSelector(
2100                BuildItem::Handle(res_handle),
2101                offset.clone(),
2102            ))
2103        } else {
2104            None
2105        }
2106    }
2107
2108    /// Select text between this element/node and the next of the same type
2109    fn textselector_for_markers<'b>(
2110        &self,
2111        node: Node,
2112        doc_num: usize,
2113        store: &AnnotationStore,
2114        element_config: &'b XmlElementConfig,
2115    ) -> Option<SelectorBuilder<'b>> {
2116        let resource = store
2117            .resource(
2118                self.resource_handle
2119                    .expect("resource must have been created"),
2120            )
2121            .expect("resource must exist");
2122        let mut end: Option<usize> = None;
2123        if let Some(markers) = self.markers.get(&element_config.hash()) {
2124            let mut grab = false;
2125            for (d_num, n_id) in markers.iter() {
2126                if grab {
2127                    //this marker is the next one, it's begin position is our desired end position
2128                    end = self.positionmap.get(&(*d_num, *n_id, PositionType::Body)).map(|offset| {
2129                        offset
2130                            .begin
2131                            .try_into()
2132                            .expect("begin cursor must be beginaligned")
2133                    });
2134                    break;
2135                }
2136                if doc_num == *d_num && *n_id == node.id() {
2137                    //current node/marker found, signal grab for the next one
2138                    grab = true;
2139                }
2140            }
2141        };
2142        if end.is_none() {
2143            //no next marker found, use end of document instead
2144            end = Some(resource.textlen());
2145        }
2146        if let (Some(offset), Some(end)) = (self.positionmap.get(&(doc_num, node.id(), PositionType::Body)), end) {
2147            Some(SelectorBuilder::TextSelector(
2148                BuildItem::Handle(self.resource_handle.unwrap()),
2149                Offset::simple(
2150                    offset
2151                        .begin
2152                        .try_into()
2153                        .expect("begin cursor must be beginaligned"),
2154                    end,
2155                ),
2156            ))
2157        } else {
2158            None
2159        }
2160    }
2161
2162    fn set_global_context(&mut self) {
2163        self.global_context
2164            .insert("context".into(), upon::Value::Map(self.config.context.iter().map(|(k,v)| (k.clone(), map_value(v))).collect()));
2165        self.global_context
2166            .insert("namespaces".into(), self.config.namespaces.clone().into());
2167        self.global_context
2168            .insert("default_set".into(), self.config.default_set.clone().into());
2169    }
2170
2171    fn render_template<'input, 't>(
2172        &self,
2173        template: &'t str,
2174        node: &Node<'a, 'input>,
2175        begin: Option<usize>,
2176        end: Option<usize>,
2177        resource: Option<&str>,
2178        inputfile: Option<&str>,
2179        doc_num: usize,
2180    ) -> Result<Cow<'t, str>, XmlConversionError> {
2181        if template.chars().any(|c| c == '{') {
2182            //value is a template, templating engine probably needed
2183            let compiled_template = self.template_engine.template(template);
2184            let context = self.context_for_node(&node, begin, end, template, resource, inputfile, doc_num);
2185            let result = compiled_template.render(context).to_string()?;
2186            Ok(Cow::Owned(result))
2187        } else {
2188            //value is a literal: templating engine not needed
2189            Ok(Cow::Borrowed(template))
2190        }
2191    }
2192
2193    fn context_for_node<'input>(
2194        &self,
2195        node: &Node<'a, 'input>,
2196        begin: Option<usize>,
2197        end: Option<usize>,
2198        template: &str, 
2199        resource: Option<&str>,
2200        inputfile: Option<&str>,
2201        doc_num: usize,
2202    ) -> upon::Value {
2203        let mut context = self.global_context.clone();
2204        let length = if let (Some(begin), Some(end)) = (begin, end) {
2205            Some(end - begin)
2206        } else {
2207            None
2208        };
2209        context.insert("localname".into(), node.tag_name().name().into());
2210        //name with name prefix (if any)
2211        context.insert("name".into(), self.get_node_name_for_template(node).into());
2212        if let Some(namespace) = node.tag_name().namespace() {
2213            //the full namespace
2214            context.insert("namespace".into(), namespace.into());
2215        }
2216
2217        // Offset in the untangled plain text
2218        if let Some(begin) = begin {
2219            context.insert("begin".into(), upon::Value::Integer(begin as i64));
2220        }
2221        if let Some(end) = end {
2222            context.insert("end".into(), upon::Value::Integer(end as i64));
2223        }
2224        if let Some(length) = length {
2225            context.insert("length".into(), upon::Value::Integer(length as i64));
2226        }
2227        if let Some(resource) = resource {
2228            //the resource ID
2229            context.insert("resource".into(), resource.into());
2230        }
2231        if let Some(inputfile) = inputfile {
2232            //the input file
2233            context.insert("inputfile".into(), inputfile.into());
2234        }
2235        //document number (0-indexed), useful in case multiple input documents are cast to a single output text
2236        context.insert("doc_num".into(), upon::Value::Integer(doc_num as i64));
2237
2238        if let Some(vars) = self.variables.get(template) {
2239            for var in vars {
2240                let mut encodedvar = String::new();
2241                if let Some(value) = self.context_for_var(node, var, &mut encodedvar) {
2242                    if self.config.debug() {
2243                        eprintln!(
2244                            "[STAM fromxml]              Set context variable for template '{}' for node '{}': {}={:?}   (encodedvar={})",
2245                            template,
2246                            node.tag_name().name(),
2247                            var,
2248                            value,
2249                            encodedvar
2250                        );
2251                    }
2252                    if value != upon::Value::None {
2253                        context.insert(encodedvar, value);
2254                    }
2255                } else if self.config.debug() {
2256                    eprintln!(
2257                        "[STAM fromxml]              Missed context variable for template '{}' for node '{}': {}",
2258                        template,
2259                        node.tag_name().name(),
2260                        var
2261                    );
2262                }
2263            }
2264        }
2265        upon::Value::Map(context)
2266    }
2267
    /// Looks up a variable value (from the XML DOM) to be used in the template context.
    ///
    /// `var` is a small path expression that is consumed one `/`-separated component
    /// at a time, recursing into the node each component points to:
    ///
    /// * an optional leading `$` (element) or `?.$` / `?.@` (optional) marker,
    /// * a leading `/` (empty first component) to start at the root element,
    /// * `..` for the parent element, `.` for the current node,
    /// * `@attrib` or `@prefix:attrib` for an attribute of the current node,
    /// * `localname` or `prefix:localname`, optionally followed by a
    ///   `[@attrib="value"]`-style condition, for the first matching child element.
    ///
    /// While descending, the *encoded* variable name (the form that is safe to pass to
    /// the templating engine) is accumulated in `path`; callers pass an empty string
    /// on the initial call, which is also how the initial call is told apart from
    /// recursive calls.
    ///
    /// Returns the value found (the recursive text of the final element, or the
    /// attribute lookup result), or `None` if the path could not be resolved.
    fn context_for_var<'input>(
        &self,
        node: &Node<'a, 'input>,
        var: &str, 
        path: &mut String,
    ) -> Option<upon::Value> {

        // an empty path means this is the initial (non-recursive) call
        let first = path.is_empty();
        // strip the leading type marker and (on the initial call only) start the encoded name
        let var = 
        if var.starts_with("?.$") {
            if first {
                path.push_str("?.ELEMENT_");
            };
            &var[3..]
        } else if var.starts_with("$") {
            if first {
                path.push_str("ELEMENT_");
            };
            &var[1..]
        } else if var.starts_with("?.@") {
            if first {
                path.push_str("?.");
            };
            // only "?." is stripped here, the '@' is kept so the attribute branch below picks it up
            &var[2..]
        } else {
            var
        };

        // on recursive calls, separate this component from the previous one in the encoded name
        if !first && !var.is_empty() && !path.ends_with("ELEMENT_"){
            path.push_str("_IN_");
        }

        //get the first component of the variable
        let (component, remainder) = var.split_once("/").unwrap_or((var,""));
        //eprintln!("DEBUG: component={}, remainder={}", component, remainder);
        if component.is_empty() {
            if first && !remainder.is_empty() {
                //we're asked to start at the root node
                let mut n = node.clone();
                //find the root node
                while let Some(parentnode) = n.parent_element() {
                    n = parentnode;
                }
                //recurse from root node
                let (rootcomponent, remainder) = remainder.split_once("/").unwrap_or((remainder,""));
                let (prefix, localname)  = if let Some(pos) = rootcomponent.find(":") {
                    (Some(&rootcomponent[0..pos]),  &rootcomponent[pos+1..])
                } else {
                    (None, rootcomponent)
                };
                //test if root name corresponds with what we expected
                //(only the local name is compared here, or '*' as a wildcard; the prefix merely ends up in the encoded name)
                if localname != n.tag_name().name() && localname != "*" {
                    None
                } else {
                    if let Some(prefix) = prefix {
                        path.push_str(prefix);
                        path.push_str("__");
                    }
                    path.push_str(localname);
                    self.context_for_var(&n, remainder, path)
                }
            } else {
                //an empty component is the stop condition , this function is called recursively, stripping one
                //component at a time until nothing is left, we then take the text of that final node:
                Some(recursive_text(node).into())
            }
        } else if component.starts_with("@"){
            // attribute lookup on the current node; any remainder after the attribute is ignored
            if let Some(pos) = component.find(":") {
                // namespaced attribute: the prefix must be declared in the configuration's namespaces
                let prefix = &component[1..pos];
                if let Some(ns) = self.config.namespaces.get(prefix) {
                    let var = &component[pos+1..];
                    path.push_str("ATTRIB_");
                    path.push_str(prefix);
                    path.push_str("__");
                    path.push_str(var);
                    // note: this is Some(..) even if the attribute itself is absent; the
                    // absence is carried by the converted value (presumably upon::Value::None — TODO confirm)
                    Some(
                        node.attribute((ns.as_str(),var)).into()
                    )
                } else {
                    None
                }
            } else {
                let var = &component[1..];
                path.push_str("ATTRIB_");
                path.push_str(var);
                Some(
                    node.attribute(var).into()
                )
            }
        } else if component == ".." {
            if let Some(parentnode) = node.parent_element().as_ref() {
                //recurse with parent node
                path.push_str("PARENT");
                self.context_for_var(parentnode, remainder, path)
            } else {
                None
            }
        } else if component == "." {
            path.push_str("THIS");
            if !remainder.is_empty() {
                //a . is meaningless if not the final component
                self.context_for_var(node, remainder, path)
            } else {
                Some(recursive_text(node).into())
            }
        } else {
            // child element lookup: find the first child element that matches name (and condition, if any)
            let (prefix, localname)  = if let Some(pos) = component.find(":") {
                (Some(&component[0..pos]),  &component[pos+1..])
            } else {
                (None, component)
            };
            let localname_with_condition = localname;
            let (localname, condition_str, condition) = self.extract_condition(localname_with_condition); //extract X-Path like conditions [@attrib="value"]  (very limited!)
            //eprintln!("DEBUG: looking for {} (prefix={:?},localname={}, condition={:?}) in {:?}", localname_with_condition,  prefix, localname, condition, node.tag_name());
            for child in node.children() {
                if child.is_element() {
                    let namedata = child.tag_name();
                    // a namespaced child must match on both (known) prefix and local name,
                    // a child without namespace is matched on local name only
                    let mut child_matches = if let Some(namespace) = namedata.namespace() {
                        if let Some(foundprefix) = self.prefixes.get(namespace) {
                            Some(foundprefix.as_str()) == prefix && localname == namedata.name()
                        } else {
                            false
                        }
                    } else {
                        namedata.name() == localname
                    };
                    if child_matches {
                        //MAYBE TODO: move to separate function
                        if let Some((attribname, negate, attribvalue)) = condition {
                            //test condition: falsify child_matches
                            //(a child that lacks the attribute, or uses an undeclared attribute prefix, never matches)
                            if let Some(pos) = attribname.find(":") {
                                let prefix = &attribname[0..pos];
                                if let Some(ns) = self.config.namespaces.get(prefix) {
                                    let attribname = &attribname[pos+1..];
                                    if let Some(value) = child.attribute((ns.as_str(),attribname)) {
                                        if !negate && attribvalue != Some(value) {
                                            child_matches = false;
                                        } else if negate && attribvalue == Some(value) {
                                            child_matches = false;
                                        }
                                    } else {
                                        child_matches = false;
                                    }
                                } else {
                                    child_matches = false;
                                }
                            } else {
                                if let Some(value) = child.attribute(attribname) {
                                    if !negate && attribvalue != Some(value) {
                                        child_matches = false;
                                    } else if negate && attribvalue == Some(value) {
                                        child_matches = false;
                                    }
                                } else {
                                    child_matches = false;
                                }
                            }
                        }
                        if !child_matches && self.config.debug {
                            eprintln!("[STAM fromxml] candidate node does not meet condition: {}", localname_with_condition);
                        }
                        //end condition test
                    }
                    if child_matches {
                        if let Some(prefix) = prefix {
                            path.push_str(prefix);
                            path.push_str("__");
                        }
                        path.push_str(localname);
                        if condition.is_some() {
                            //simply encode the condition as a hash (non-decodable but that's okay)
                            let mut hasher = DefaultHasher::new();
                            condition_str.hash(&mut hasher);
                            let h = hasher.finish();
                            path.push_str(&format!("_COND{}_", h));
                        }
                        // only the first matching child is followed
                        return self.context_for_var(&child, remainder, path);
                    }
                }
            }
            //no match found for this variable
            None
        }
    }
2454
2455    fn extract_condition<'b>(&self, localname: &'b str) -> (&'b str, &'b str, Option<(&'b str, bool, Option<&'b str>)>) { //(localname, condition, Option<(attrib, negation, attribvalue)>)
2456        //simple conditional statement
2457        if localname.ends_with("]") {
2458            if let Some(pos) = localname.find("[") {
2459                let condition = &localname[pos+1..localname.len()-1];
2460                let (mut attrib, negation, attribvalue) = if let Some(pos) = condition.find("=") {
2461                     let attrib = condition[0..pos].trim();
2462                     let value = condition[pos+1..].trim();
2463                     let value = &value[1..value.len() - 1]; //strips the literal quotes (") for the value
2464                     if attrib.ends_with('!') {
2465                        //negation (!= operator)
2466                        (attrib[..attrib.len() - 1].trim(), true, Some(value))
2467                     } else {
2468                        (attrib.trim(), false, Some(value))
2469                     }
2470                } else {
2471                    (condition, false, None)
2472                };
2473                if attrib.starts_with('@') {
2474                    //this should actually be mandatory and already checked during template precompilation
2475                    attrib = &attrib[1..];
2476                }
2477                return (&localname[..pos], condition, Some((attrib,  negation,attribvalue )) );
2478            }
2479        }
2480        (localname, "", None)
2481    }
2482
2483
2484    fn get_node_name_for_template<'b>(&self, node: &'b Node) -> Cow<'b,str> {
2485        let extended_name = node.tag_name();
2486        match (extended_name.namespace(), extended_name.name()) {
2487            (Some(namespace), tagname) => {
2488                if let Some(prefix) = self.prefixes.get(namespace) {
2489                    Cow::Owned(format!("{}__{}", prefix, tagname))
2490                } else {
2491                    Cow::Borrowed(tagname)
2492                }
2493            }
2494            (None, tagname) => Cow::Borrowed(tagname),
2495        }
2496    }
2497
2498    fn get_node_name_for_xpath<'b>(&self, node: &'b Node) -> Cow<'b,str> {
2499        let extended_name = node.tag_name();
2500        match (extended_name.namespace(), extended_name.name()) {
2501            (Some(namespace), tagname) => {
2502                if let Some(prefix) = self.prefixes.get(namespace) {
2503                    Cow::Owned(format!("{}:{}", prefix, tagname))
2504                } else {
2505                    Cow::Borrowed(tagname)
2506                }
2507            }
2508            (None, tagname) => Cow::Borrowed(tagname),
2509        }
2510    }
2511
2512
2513    fn precompile(&mut self, template: &'a str) -> Cow<'a,str> {
2514        let mut replacement = String::new();
2515        let mut variables: BTreeSet<&'a str> = BTreeSet::new();
2516        let mut begin = 0;
2517        let mut end = 0;
2518        for i  in 0..template.len() {
2519            let slice = &template[i..];
2520            if slice.starts_with("{{") || slice.starts_with("{%") {
2521                begin = i;
2522            } else if slice.starts_with("}}") || slice.starts_with("%}") {
2523                if end < begin+2 {
2524                    replacement.push_str(&template[end..begin+2]);
2525                }
2526                let inner = &template[begin+2..i]; //the part without the {{  }}
2527                replacement.push_str(&self.precompile_inblock(inner, &mut variables));
2528                end = i;
2529            }
2530        }
2531        if end > 0 {
2532            replacement.push_str(&template[end..]);
2533        }
2534        self.variables.insert(template.into(), variables);
2535        //eprintln!("DEBUG: precompile({}) -> {}", template, replacement);
2536
2537        if !replacement.is_empty() {
2538            Cow::Owned(replacement)
2539        } else {
2540            Cow::Borrowed(template)
2541        }
2542    }
2543
2544    fn precompile_inblock<'s>(&self, s: &'s str, vars: &mut BTreeSet<&'s str>) -> Cow<'s,str> {
2545        let mut quoted = false;
2546        let mut var = false;
2547        let mut begin = 0;
2548        let mut end = 0;
2549        let mut replacement = String::new();
2550        let mut in_condition = false;
2551        for (i,c) in s.char_indices() {
2552            if in_condition && c != ']' {
2553                continue;
2554            }
2555            if c == '"' {
2556                quoted = !quoted;
2557            } else if !quoted {
2558                if !var && (c == '@' || c == '$') {
2559                    //token is an XML variable name, its syntax needs some changes before it can be used in the templating engine
2560                    var = true;
2561                    begin = i;
2562                } else if var && c == '[' {
2563                    in_condition = true;
2564                } else if var && in_condition && c == ']' {
2565                    //end of condition
2566                    in_condition = false;
2567                } else if var && in_condition  {
2568                    //in condition
2569                    continue;
2570                } else if var && (!c.is_alphanumeric() && c != '.' && c != '/' && c != '_' && c != ':' && c != '@') {
2571                    //end of variable (including condition if applicable)
2572                    if end < begin {
2573                        replacement.push_str(&s[end..begin]);
2574                    }
2575                    let varname = &s[begin..i];
2576                    vars.insert(varname);
2577                    let replacement_var = self.precompile_name(varname);
2578                    replacement += &replacement_var;
2579                    end = i;
2580                    var = false;
2581                }
2582            }
2583        }
2584        if end > 0 {
2585            replacement.push_str(&s[end..]);
2586        }
2587        if var {
2588            //don't forget last one
2589            let varname = &s[begin..];
2590            vars.insert(varname);
2591            let replacement_var = self.precompile_name(varname);
2592            replacement += &replacement_var;
2593        }
2594        if !replacement.is_empty() {
2595            //eprintln!("DEBUG: precompile_inblock({}) -> {}", s, replacement);
2596            Cow::Owned(replacement)
2597        } else {
2598            Cow::Borrowed(s)
2599        }
2600    }
2601
2602    /// upon's templating syntax doesn't support some of the characters we use in names, this function substitutes them for more verbose equivalents
2603    fn precompile_name(&self, s: &str) -> String {
2604        let mut replacement = String::new();
2605        let mut begincondition = None;
2606        let mut skip = 0;
2607        for (i,c) in s.char_indices() {
2608            if begincondition.is_some() && c != ']' {
2609                continue;
2610            } else if skip > 0 {
2611                skip -= 1;
2612                continue;
2613            }
2614            if c == '$' {
2615                let slice = &s[i..];
2616                if slice.starts_with("$..") {
2617                    replacement.push_str("ELEMENT_PARENT");
2618                    skip = 2;
2619                } else if slice.starts_with("$.") {
2620                    replacement.push_str("ELEMENT_THIS");
2621                    skip = 1;
2622                } else if slice.starts_with("$/") {
2623                    replacement.push_str("ELEMENT_");
2624                    skip = 1;
2625                } else {
2626                    replacement.push_str("ELEMENT_");
2627                }
2628            } else if c == '@' {
2629                replacement.push_str("ATTRIB_");
2630            } else if c == '/' {
2631                replacement.push_str("_IN_");
2632            } else if c == ':' {
2633                replacement.push_str("__");
2634            } else if c == '[' {
2635                begincondition = Some(i+1);
2636            } else if c == ']' {
2637                //conditions are just stored as hashes
2638                if let Some(begin) = begincondition {
2639                    let mut hasher = DefaultHasher::new();
2640                    let _ = &s[begin..i].hash(&mut hasher);
2641                    let h = hasher.finish();
2642                    replacement.push_str(&format!("_COND{}_", h));
2643                }
2644                begincondition = None;
2645            } else {
2646                replacement.push(c);
2647            }
2648        }
2649        //eprintln!("DEBUG: precompile_name({}) -> {}", s, replacement);
2650        replacement
2651    }
2652
    /// Creates one annotation in `store` for every metadata entry in the configuration.
    ///
    /// For each entry, the optional ID template and the set/key/value of each
    /// annotation data item are rendered against the global context (extended with
    /// `resource`, the resource ID, when available). The annotation then targets
    /// either the whole text of the resource (`TextSelector`) or the resource itself
    /// (`ResourceSelector`, also used for `None` and `Unspecified`).
    ///
    /// # Errors
    ///
    /// Returns a `TemplateError` when a template fails to render (or a key renders
    /// empty) and the data item is not marked `skip_if_missing`, and propagates
    /// errors from value extraction and from adding the annotation to the store.
    ///
    /// # Panics
    ///
    /// Panics on any other annotation handling than the ones mentioned above, and
    /// (for `TextSelector`) when no resource handle is set.
    fn add_metadata(&self, store: &mut AnnotationStore) -> Result<(), XmlConversionError> {
        for metadata in self.config.metadata.iter() {
            let mut builder = AnnotationBuilder::new();

            // looked up anew for every entry: the ID borrows from the store, which is
            // mutated (annotate) at the end of each iteration
            let resource_id = if let Some(resource_handle) = self.resource_handle {
                store.resource(resource_handle).unwrap().id()
            } else {
                None
            };

            let mut context = self.global_context.clone();
            if let Some(resource_id) = resource_id {
                context.insert("resource".into(), resource_id.into());
            }

            // optional annotation ID; an ID that renders empty is not set
            if let Some(template) = &metadata.id {
                let compiled_template = self.template_engine.template(template.as_str());
                let id = compiled_template.render(&context).to_string().map_err(|e| 
                        XmlConversionError::TemplateError(
                            format!(
                                "whilst rendering metadata id template '{}'",
                                template,
                            ),
                            Some(e),
                        )
                    )?;
                if !id.is_empty() {
                    builder = builder.with_id(id);
                }
            }

            for annotationdata in metadata.annotationdata.iter() {
                let mut databuilder = AnnotationDataBuilder::new();
                // dataset: rendered from the template if one is given (not set when it
                // renders empty), the configured default set otherwise
                if let Some(template) = &annotationdata.set {
                    let compiled_template = self.template_engine.template(template.as_str());
                    let dataset = compiled_template.render(&context).to_string().map_err(|e| 
                            XmlConversionError::TemplateError(
                                format!(
                                    "whilst rendering annotationdata/dataset template '{}' for metadata",
                                    template,
                                ),
                                Some(e),
                            )
                        )?;
                    if !dataset.is_empty() {
                        databuilder = databuilder.with_dataset(dataset.into())
                    }
                } else {
                    databuilder =
                        databuilder.with_dataset(self.config.default_set.as_str().into());
                }
                // key: an empty result or a render error is only fatal if the data item
                // is not marked skip_if_missing, otherwise the whole data item is skipped
                if let Some(template) = &annotationdata.key {
                    let compiled_template = self.template_engine.template(template.as_str());
                    match compiled_template.render(&context).to_string().map_err(|e| 
                            XmlConversionError::TemplateError(
                                format!(
                                    "whilst rendering annotationdata/key template '{}' for metadata",
                                    template,
                                ),
                                Some(e),
                            )
                        )  {
                        Ok(key) if !key.is_empty() =>
                            databuilder = databuilder.with_key(key.into()) ,
                        Ok(_) if !annotationdata.skip_if_missing => {
                            return Err(XmlConversionError::TemplateError(
                                format!(
                                    "whilst rendering annotationdata/key template '{}' metadata",
                                    template,
                                ),
                                None
                            ));
                        },
                        Err(e) if !annotationdata.skip_if_missing => {
                            return Err(e)
                        },
                        _ => {
                            //skip whole databuilder if missing
                            continue
                        }
                    }
                }
                // value: extracted (and rendered where needed) from the configured value
                if let Some(value) = &annotationdata.value {
                    match self.extract_value_metadata(value, &upon::Value::Map(context.clone()), annotationdata.allow_empty_value, annotationdata.skip_if_missing,  resource_id.as_deref())? {
                        Some(value) => {
                            databuilder = databuilder.with_value(value);
                        },
                        None =>  {
                            //skip whole databuilder if missing
                            continue
                        }
                    }
                }
                builder = builder.with_data_builder(databuilder);
            }



            // Finish the builder and add the actual annotation to the store, according to its element handling
            match metadata.annotation {
                XmlAnnotationHandling::TextSelector => {
                    // Annotation is on text, translates to TextSelector
                    builder = builder.with_target(SelectorBuilder::TextSelector(BuildItem::Handle(self.resource_handle.expect("resource must have handle")), Offset::whole()));
                    if self.config.debug {
                        eprintln!("[STAM fromxml]   builder AnnotateText: {:?}", builder);
                    }
                    store.annotate(builder)?;
                }
                XmlAnnotationHandling::ResourceSelector  | XmlAnnotationHandling::None | XmlAnnotationHandling::Unspecified => {
                    // Annotation is metadata (default), translates to ResourceSelector
                    builder = builder.with_target(SelectorBuilder::ResourceSelector(
                        self.resource_handle.into(),
                    ));
                    if self.config.debug {
                        eprintln!("[STAM fromxml]   builder AnnotateResource: {:?}", builder);
                    }
                    store.annotate(builder)?;
                }
                // any other handling makes no sense for metadata
                _ => panic!(
                    "Invalid annotationhandling for metadata: {:?}",
                    metadata.annotation
                ),
            }
        }
        Ok(())
    }
2779}
2780
2781
2782
2783/// Get recursive text without any elements
2784fn recursive_text(node: &Node) -> String {
2785    let mut s = String::new();
2786    for child in node.children() {
2787        if child.is_text() {
2788            s += child.text().expect("should have text");
2789        } else if child.is_element() {
2790            s += &recursive_text(&child);
2791        }
2792    }
2793    s
2794}
2795
2796// Filters
/// Template filter: returns a copy of `s` with its first character converted to
/// uppercase (which may expand to multiple characters); the rest is left untouched.
fn filter_capitalize(s: &str) -> String {
    let mut capitalized = String::with_capacity(s.len());
    let mut rest = s.chars();
    if let Some(first) = rest.next() {
        capitalized.extend(first.to_uppercase());
        capitalized.push_str(rest.as_str());
    }
    capitalized
}
2808
2809/// Map value between toml and upon. This makes a clone.
2810fn map_value(value: &toml::Value) -> upon::Value {
2811    match value {
2812        toml::Value::String(s) => upon::Value::String(s.clone()),
2813        toml::Value::Integer(i) => upon::Value::Integer(*i),
2814        toml::Value::Float(i) => upon::Value::Float(*i),
2815        toml::Value::Boolean(v) => upon::Value::Bool(*v),
2816        toml::Value::Datetime(s) => upon::Value::String(s.to_string()),
2817        toml::Value::Array(v) => upon::Value::List(v.iter().map(|i| map_value(i)).collect()),
2818        toml::Value::Table(v) => upon::Value::Map(v.iter().map(|(k,i)| (k.clone(),map_value(i))).collect()),
2819    }
2820}
2821
#[cfg(test)]
mod tests {
    use super::*;

    /// Annotation data set that [`CONF`] configures as `default_set`; keys are looked up in it.
    const TEST_SET: &str = "urn:stam-fromhtml";

    /// Minimal XHTML document: a title, a header and one paragraph with an emphasised word.
    const XMLSMALLEXAMPLE: &str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>test</title></head><body><h1>TEST</h1><p xml:id="p1">This  is a <em xml:id="emphasis" style="color:green">test</em>.</p></body></html>"#;

    /// Larger XHTML document with an entity declaration, nested inline markup, a CDATA
    /// paragraph marked `xml:space="preserve"`, a list and trailing text in the body.
    const XMLEXAMPLE: &str = r#"<!DOCTYPE entities[<!ENTITY nbsp "&#xA0;">]>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:my="http://example.com">
<head>
    <title>Test</title>
    <meta name="author" content="proycon" />
</head>
<body>
    <h1>Header</h1>

    <p xml:id="par1">
        <span xml:id="sen1">This is a sentence.</span>
        <span xml:id="sen2">This is the second&nbsp;sentence.</span>
    </p>
    <p xml:id="par2">
        <strong>This</strong> is    the <em>second</em> paragraph.
            It has a <strong>bold</strong> word and one in <em>italics</em>.<br/>
        Let's highlight stress in the following word: <span my:stress="secondary">re</span>pu<span my:stress="primary">ta</span>tion.
    </p>
    <p xml:space="preserve"><![CDATA[This    third
paragraph consists
of CDATA and is configured to preserve whitespace, and weird &entities; ]]></p>

    <h2>Subsection</h2>

    <p>
    Have some fruits:<br/>
    <ul xml:id="list1" class="fruits">
        <li xml:id="fruit1">apple</li>
        <li xml:id="fruit2">banana</li>
        <li xml:id="fruit3">melon</li>
    </ul>
    </p>

    Some lingering text outside of any confines...
</body>
</html>"#;

    /// Plain text that converting [`XMLEXAMPLE`] with [`CONF`] is expected to produce.
    const XMLEXAMPLE_TEXTOUTPUT: &str = "Header\n\nThis is a sentence. This is the second sentence.\n\nThis is the second paragraph. It has a bold word and one in italics.\nLet's highlight stress in the following word: reputation.\n\nThis    third\nparagraph consists\nof CDATA and is configured to preserve whitespace, and weird &entities; \nSubsection\n\nHave some fruits:\n* apple\n* banana\n* melon\n\nSome lingering text outside of any confines...";

    /// Fake example (not real HTML): exercises a TEI-like `space` element whose text suffix
    /// comes from a more complex template.
    const XMLTEISPACE: &str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body><space dim="vertical" unit="lines" quantity="3" /></body></html>"#;

    /// TOML conversion configuration shared by the tests that run an actual conversion.
    const CONF: &str = r#"#default whitespace handling (Collapse or Preserve)
whitespace = "Collapse"
default_set = "urn:stam-fromhtml" 

[namespaces]
#this defines the namespace prefixes you can use in this configuration
xml = "http://www.w3.org/XML/1998/namespace"
html = "http://www.w3.org/1999/xhtml"
xsd =  "http://www.w3.org/2001/XMLSchema"
xlink = "http://www.w3.org/1999/xlink"

# elements and attributes are matched in reverse-order, so put more generic statements before more specific ones

#Define some base elements that we reuse later for actual elements (prevents unnecessary repetition)
[baseelements.common]
id = "{% if ?.@xml:id %}{{ @xml:id }}{% endif %}"

    [[baseelements.common.annotationdata]]
    key = "type"
    value = "{{ localname }}"

    [[baseelements.common.annotationdata]]
    key = "lang"
    value = "{{ @xml:lang }}"
    skip_if_missing = true

    [[baseelements.common.annotationdata]]
    key = "n"
    value = "{{ @n }}"
    skip_if_missing = true

    [[baseelements.common.annotationdata]]
    key = "style"
    value = "{{ @style }}"
    skip_if_missing = true

    [[baseelements.common.annotationdata]]
    key = "class"
    value = "{{ @class }}"
    skip_if_missing = true

    [[baseelements.common.annotationdata]]
    key = "src"
    value = "{{ @src }}"
    skip_if_missing = true

[baseelements.text]
text = true


[[elements]]
base = [ "text", "common" ]
path = "*"
text = true
annotation = "TextSelector"

# Pass through the following elements without mapping to text
[[elements]]
base = [ "common" ]
path = "//html:head"

[[elements]]
base = [ "common" ]
path = "//html:head//*"

# Map metadata like <meta name="key" content="value"> to annotations with key->value data selecting the resource (ResourceSelector)
[[elements]]
base = [ "common" ]
path = "//html:head//html:meta"

[[elements.annotationdata]]
key = "{% if ?.@name %}{{ name }}{% endif %}"
value = "{% if ?.@content %}{{ @content }}{% endif %}"
skip_if_missing = true

# By default, ignore any tags in the head (unless they're mentioned specifically later in the config)
[[elements]]
path = "//html:head/html:title"
annotation = "ResourceSelector"

[[elements.annotationdata]]
key = "title"
value = "{{ $. | trim }}"


# Determine how various structural elements are converted to text

[[elements]]
base = [ "common" ]
path = "//html:br"
textsuffix = "\n"

[[elements]]
base = [ "common", "text" ]
path = "//html:p"
textprefix = "\n"
textsuffix = "\n"

# Let's do headers and bulleted lists like markdown
[[elements]]
base = [ "common", "text" ]
path = "//html:h1"
textsuffix = "\n"

[[elements]]
base = [ "common", "text" ]
path = "//html:h2"
textsuffix = "\n"

#Generic, will be overriden by more specific one
[[elements]]
base = [ "common", "text" ]
path = "//html:li"
textprefix = "- "
textsuffix = "\n"

[[elements]]
base = [ "common", "text" ]
path = """//html:body"""
annotation = "TextSelector"
id = "body"

    [[elements.annotationdata]]
    key = "title_from_parent"
    value = "{{ $../html:head/html:title }}"
    skip_if_missing = true

    [[elements.annotationdata]]
    key = "title_from_root"
    value = "{{ $/html:html/html:head/html:title }}"
    skip_if_missing = true

#More specific one takes precendence over the above generic one
[[elements]]
base = [ "common", "text" ]
path = """//html:ul[@class="fruits"]/html:li"""
textprefix = "* "
textsuffix = "\n"

#Not real HTML, test-case modelled after TEI space
[[elements]]
base = [ "common" ]
path = """//html:space[@dim="vertical" and @unit="lines"]"""
text = true
textsuffix = """\n{% for x in @quantity | int | as_range %}\n{% endfor %}"""


[[elements]]
base = [ "common", "text" ]
path = "//html:example"
annotation = "TextSelector"

[[elements.annotationdata]]
key = "requiredattrib"
value = "{{ @requiredattrib }}"

[[elements.annotationdata]]
key = "optattrib"
value = "{{ ?.@optattrib }}"

[[elements]]
base = [ "common","text" ]
path = "//html:marquee"
annotation = "TextSelector"

#map value, some bogus data to test parsing
[[elements.annotationdata]]
key = "map"

[elements.annotationdata.value]
text = "{{ $. }}"
number = 42
bogus = true

"#;

    /// `example` element that carries the required attribute but not the optional one.
    const XMLREQATTRIBEXAMPLE: &str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body><example xml:id="ann1" requiredattrib="blah">test</example></body></html>"#;

    /// `example` element that lacks the required attribute.
    const XMLREQATTRIBEXAMPLE2: &str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body><example xml:id="ann1">test</example></body></html>"#;

    /// `example` element that carries both the required and the optional attribute.
    const XMLREQATTRIBEXAMPLE3: &str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body><example xml:id="ann1" requiredattrib="blah" optattrib="blah">test</example></body></html>"#;

    /// `marquee` element, which [`CONF`] maps to annotation data holding a map value.
    const XMLMAPEXAMPLE: &str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body><marquee xml:id="ann1">test</marquee></body></html>"#;

    /// A template without special variables comes out of precompilation unchanged.
    #[test]
    fn test_precompile_template_nochange() -> Result<(), String> {
        let config = XmlConversionConfig::new();
        let mut converter = XmlToStamConverter::new(&config);
        let source = "{{ foo }}";
        let compiled = converter.precompile(source);
        assert_eq!(compiled, source);
        // `foo` is an ordinary template variable, not a special one
        assert!(!converter.variables.get(source).unwrap().contains("foo"));
        Ok(())
    }

    /// `@foo` is rewritten to `ATTRIB_foo` and recorded for the template.
    #[test]
    fn test_precompile_template_attrib() -> Result<(), String> {
        let config = XmlConversionConfig::new();
        let mut converter = XmlToStamConverter::new(&config);
        let source = "{{ @foo }}";
        let compiled = converter.precompile(source);
        assert_eq!(compiled, "{{ ATTRIB_foo }}");
        // the attribute reference must have been recorded
        assert!(converter.variables.get(source).unwrap().contains("@foo"));
        Ok(())
    }

    /// A namespace-prefixed attribute gets its colon replaced by a double underscore.
    #[test]
    fn test_precompile_template_attrib_ns() -> Result<(), String> {
        let config = XmlConversionConfig::new();
        let mut converter = XmlToStamConverter::new(&config);
        let source = "{{ @bar:foo }}";
        let compiled = converter.precompile(source);
        assert_eq!(compiled, "{{ ATTRIB_bar__foo }}");
        // the attribute reference must have been recorded
        assert!(converter.variables.get(source).unwrap().contains("@bar:foo"));
        Ok(())
    }

    /// `$foo` is rewritten to `ELEMENT_foo` and recorded for the template.
    #[test]
    fn test_precompile_template_element() -> Result<(), String> {
        let config = XmlConversionConfig::new();
        let mut converter = XmlToStamConverter::new(&config);
        let source = "{{ $foo }}";
        let compiled = converter.precompile(source);
        assert_eq!(compiled, "{{ ELEMENT_foo }}");
        // the element reference must have been recorded
        assert!(converter.variables.get(source).unwrap().contains("$foo"));
        Ok(())
    }

    /// A namespace-prefixed element gets its colon replaced by a double underscore.
    #[test]
    fn test_precompile_template_element_ns() -> Result<(), String> {
        let config = XmlConversionConfig::new();
        let mut converter = XmlToStamConverter::new(&config);
        let source = "{{ $bar:foo }}";
        let compiled = converter.precompile(source);
        assert_eq!(compiled, "{{ ELEMENT_bar__foo }}");
        // the element reference must have been recorded
        assert!(converter.variables.get(source).unwrap().contains("$bar:foo"));
        Ok(())
    }

    /// `$.` is rewritten to `ELEMENT_THIS`.
    #[test]
    fn test_precompile_template_this_text() -> Result<(), String> {
        let config = XmlConversionConfig::new();
        let mut converter = XmlToStamConverter::new(&config);
        let source = "{{ $. }}";
        let compiled = converter.precompile(source);
        assert_eq!(compiled, "{{ ELEMENT_THIS }}");
        assert!(converter.variables.get(source).unwrap().contains("$."));
        Ok(())
    }

    /// `$..` is rewritten to `ELEMENT_PARENT`.
    #[test]
    fn test_precompile_template_parent_text() -> Result<(), String> {
        let config = XmlConversionConfig::new();
        let mut converter = XmlToStamConverter::new(&config);
        let source = "{{ $.. }}";
        let compiled = converter.precompile(source);
        assert_eq!(compiled, "{{ ELEMENT_PARENT }}");
        assert!(converter.variables.get(source).unwrap().contains("$.."));
        Ok(())
    }

    /// Attribute references are also rewritten inside block tags such as `for`.
    #[test]
    fn test_precompile_template_attrib2() -> Result<(), String> {
        let config = XmlConversionConfig::new();
        let mut converter = XmlToStamConverter::new(&config);
        let source = "{% for x in @foo %}";
        let compiled = converter.precompile(source);
        assert_eq!(compiled, "{% for x in ATTRIB_foo %}");
        // the attribute reference must have been recorded
        assert!(converter.variables.get(source).unwrap().contains("@foo"));
        Ok(())
    }

    /// The `?.` prefix is kept in the output while the attribute is still rewritten.
    #[test]
    fn test_precompile_template_attrib3() -> Result<(), String> {
        let config = XmlConversionConfig::new();
        let mut converter = XmlToStamConverter::new(&config);
        let source = "{{ ?.@foo }}";
        let compiled = converter.precompile(source);
        assert_eq!(compiled, "{{ ?.ATTRIB_foo }}");
        assert!(converter.variables.get(source).unwrap().contains("@foo"));
        Ok(())
    }

    /// A path expression is flattened into one identifier with `_IN_` between its steps.
    #[test]
    fn test_precompile_template_path() -> Result<(), String> {
        let config = XmlConversionConfig::new();
        let mut converter = XmlToStamConverter::new(&config);
        let source = "{{ $x/y/z/@a }}";
        let compiled = converter.precompile(source);
        assert_eq!(compiled, "{{ ELEMENT_x_IN_y_IN_z_IN_ATTRIB_a }}");
        assert!(converter.variables.get(source).unwrap().contains("$x/y/z/@a"));
        Ok(())
    }

    /// [`CONF`] parses and compiles, and yields the expected numbers of configuration items.
    #[test]
    fn test_loadconfig() -> Result<(), String> {
        let config = XmlConversionConfig::from_toml_str(CONF)?;
        let mut converter = XmlToStamConverter::new(&config);
        converter.compile().map_err(|err| err.to_string())?;
        let loaded = &converter.config;
        assert_eq!(loaded.namespaces.len(), 4, "number of namespaces");
        assert_eq!(loaded.elements.len(), 15, "number of elements");
        assert_eq!(loaded.baseelements.len(), 2, "number of baseelements");
        assert_eq!(
            loaded.elements.first().unwrap().annotationdata.len(),
            6,
            "number of annotationdata under first element"
        );
        assert_eq!(
            loaded.baseelements.get("common").unwrap().annotationdata.len(),
            6,
            "number of annotationdata under baseelement common"
        );
        Ok(())
    }

    /// Converts [`XMLSMALLEXAMPLE`] and checks the text, the annotation count, the data on
    /// the `emphasis` annotation, the title metadata and the title lookups on `body`.
    #[test]
    fn test_small() -> Result<(), String> {
        let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
        let mut store = stam::AnnotationStore::new(stam::Config::new());
        from_xml_in_memory("test", XMLSMALLEXAMPLE, &config, &mut store)?;
        let resource = store
            .resource("test")
            .expect("resource must have been created at this point");
        assert_eq!(resource.text(), "TEST\n\nThis is a test.\n", "resource text");
        assert_eq!(store.annotations_len(), 4, "number of annotations");

        let emphasis = store
            .annotation("emphasis")
            .expect("annotation must have been created at this point");
        assert_eq!(emphasis.text_simple(), Some("test"));
        let style_key = store.key(TEST_SET, "style").expect("key must exist");
        assert_eq!(
            emphasis.data().filter_key(&style_key).value_as_str(),
            Some("color:green")
        );

        let title_key = store.key(TEST_SET, "title").expect("key must exist");
        let metadata = resource
            .annotations_as_metadata()
            .next()
            .expect("annotation");
        assert_eq!(
            metadata.data().filter_key(&title_key).value_as_str(),
            Some("test")
        );

        let body = store.annotation("body").expect("body annotation not found");
        let parent_title_key = store
            .key(TEST_SET, "title_from_parent")
            .expect("key must exist");
        let root_title_key = store
            .key(TEST_SET, "title_from_root")
            .expect("key must exist");
        assert_eq!(
            body.data().filter_key(&parent_title_key).value_as_str(),
            Some("test")
        );
        assert_eq!(
            body.data().filter_key(&root_title_key).value_as_str(),
            Some("test")
        );
        Ok(())
    }

    /// Converting [`XMLEXAMPLE`] yields exactly [`XMLEXAMPLE_TEXTOUTPUT`].
    #[test]
    fn test_full() -> Result<(), String> {
        let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
        let mut store = stam::AnnotationStore::new(stam::Config::new());
        from_xml_in_memory("test", XMLEXAMPLE, &config, &mut store)?;
        let resource = store
            .resource("test")
            .expect("resource must have been created at this point");
        assert_eq!(resource.text(), XMLEXAMPLE_TEXTOUTPUT, "resource text");
        Ok(())
    }

    /// The templated text suffix of the `space` element expands to four newlines.
    #[test]
    fn test_teispace() -> Result<(), String> {
        let config = XmlConversionConfig::from_toml_str(CONF)?;
        let mut store = stam::AnnotationStore::new(stam::Config::new());
        from_xml_in_memory("test", XMLTEISPACE, &config, &mut store)?;
        let resource = store
            .resource("test")
            .expect("resource must have been created at this point");
        assert_eq!(resource.text(), "\n\n\n\n", "resource text");
        Ok(())
    }

    /// With only the required attribute present, its data is set and no `optattrib` key
    /// exists in the store.
    #[test]
    fn test_reqattrib() -> Result<(), String> {
        let config = XmlConversionConfig::from_toml_str(CONF)?;
        let mut store = stam::AnnotationStore::new(stam::Config::new());
        from_xml_in_memory("test", XMLREQATTRIBEXAMPLE, &config, &mut store)?;
        let resource = store
            .resource("test")
            .expect("resource must have been created at this point");
        assert_eq!(resource.text(), "test", "resource text");
        let required_key = store
            .key(TEST_SET, "requiredattrib")
            .expect("key must exist");
        let example = store.annotation("ann1").expect("annotation");
        assert_eq!(
            example.data().filter_key(&required_key).value_as_str(),
            Some("blah")
        );
        assert!(
            store.key(TEST_SET, "optattrib").is_none(),
            "optional attrib is unused"
        );
        Ok(())
    }

    /// Conversion returns an error when the required attribute is missing.
    #[test]
    fn test_reqattrib2() -> Result<(), String> {
        let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
        let mut store = stam::AnnotationStore::new(stam::Config::new());
        let outcome = from_xml_in_memory("test", XMLREQATTRIBEXAMPLE2, &config, &mut store);
        assert!(outcome.is_err(), "checking if error is returned");
        Ok(())
    }

    /// With both attributes present, both end up as data on the annotation.
    #[test]
    fn test_reqattrib3() -> Result<(), String> {
        let config = XmlConversionConfig::from_toml_str(CONF)?;
        let mut store = stam::AnnotationStore::new(stam::Config::new());
        from_xml_in_memory("test", XMLREQATTRIBEXAMPLE3, &config, &mut store)?;
        let resource = store
            .resource("test")
            .expect("resource must have been created at this point");
        assert_eq!(resource.text(), "test", "resource text");
        let required_key = store
            .key(TEST_SET, "requiredattrib")
            .expect("key must exist");
        let optional_key = store
            .key(TEST_SET, "optattrib")
            .expect("key optattrib must exist");
        let example = store.annotation("ann1").expect("annotation");
        assert_eq!(
            example.data().filter_key(&required_key).value_as_str(),
            Some("blah")
        );
        assert_eq!(
            example.data().filter_key(&optional_key).value_as_str(),
            Some("blah")
        );
        Ok(())
    }

    /// A table-valued `annotationdata.value` becomes a map data value with three entries.
    #[test]
    fn test_map() -> Result<(), String> {
        let config = XmlConversionConfig::from_toml_str(CONF)?;
        let mut store = stam::AnnotationStore::new(stam::Config::new());
        from_xml_in_memory("test", XMLMAPEXAMPLE, &config, &mut store)?;
        let resource = store
            .resource("test")
            .expect("resource must have been created at this point");
        assert_eq!(resource.text(), "test", "resource text");
        let map_key = store.key(TEST_SET, "map").expect("key must exist");
        let marquee = store.annotation("ann1").expect("annotation");
        let value = marquee
            .data()
            .filter_key(&map_key)
            .value()
            .expect("data must exist");
        let DataValue::Map(entries) = value else {
            panic!("Data is supposed to be a map");
        };
        assert_eq!(entries.get("text"), Some(&DataValue::String("test".into())));
        assert_eq!(entries.get("number"), Some(&DataValue::Int(42)));
        assert_eq!(entries.get("bogus"), Some(&DataValue::Bool(true)));
        assert_eq!(entries.len(), 3);
        Ok(())
    }
}