Skip to main content

stamtools/
xml.rs

1use std::borrow::Cow;
2use std::collections::{BTreeMap, HashMap, BTreeSet};
3use std::fmt::Display;
4use std::fs::read_to_string;
5use std::path::Path;
6use std::hash::{Hash,DefaultHasher,Hasher};
7use std::process::{Command,  Stdio};
8use std::io::{ BufWriter, Write};
9
10use roxmltree::{Document, Node, NodeId, ParsingOptions};
11use serde::Deserialize;
12use stam::*;
13use toml;
14use upon::Engine;
15use std::fmt::Write as FmtWrite;
16use serde_json;
17
/// Namespace URI that the reserved `xml` prefix is bound to (as in `xml:id` or `xml:space`).
const NS_XML: &str = "http://www.w3.org/XML/1998/namespace";

/// URL of the W3C Web Annotation JSON-LD context document.
const CONTEXT_ANNO: &str = "http://www.w3.org/ns/anno.jsonld";
20
21
/// Fallback value for the `default_set` field of [`XmlConversionConfig`]; used both as the
/// serde default and by `XmlConversionConfig::new()`.
fn default_set() -> String {
    String::from("urn:stam-fromxml")
}
25
26#[derive(Deserialize)]
27/// Holds the configuration for mapping a specific XML format to STAM
28pub struct XmlConversionConfig {
29    #[serde(default)]
30    /// Holds configurations for mapping specific XML elements to STAM, evaluated in reverse-order, so put more generic rules before specific ones
31    elements: Vec<XmlElementConfig>,
32
33    #[serde(default)]
34    /// Base elements are named templates, other elements can derive from this
35    baseelements: HashMap<String, XmlElementConfig>,
36
37    #[serde(default)]
38    /// Maps XML prefixes to namespace
39    namespaces: HashMap<String, String>,
40
41    #[serde(default = "XmlWhitespaceHandling::collapse")]
42    /// Default whitespace handling
43    whitespace: XmlWhitespaceHandling,
44
45    #[serde(default)]
46    /// Sets additional context variables that can be used in templates
47    context: HashMap<String, toml::Value>,
48
49    #[serde(default)]
50    /// Sets additional context variables that can be used in templates
51    metadata: Vec<MetadataConfig>,
52
53    #[serde(default)]
54    /// Inject a DTD (for XML entity resolution)
55    inject_dtd: Option<String>,
56
57    #[serde(default = "default_set")]
58    default_set: String,
59
60    #[serde(default)]
61    /// A prefix to assign when setting annotation IDs
62    id_prefix: Option<String>,
63
64    #[serde(default)]
65    /// A suffix to strip when setting annotation IDs
66    id_strip_suffix: Vec<String>,
67
68    #[serde(default)]
69    /// Add provenance information pointing each annotation to the appropriate node in the XML source files where it came from (translates into XPathSelector in Web Annotation output)
70    provenance: bool,
71
72    #[serde(default)]
73    external_filters: Vec<ExternalFilter>,
74
75    #[serde(skip_deserializing)]
76    debug: bool,
77
78}
79
80impl XmlConversionConfig {
81    pub fn new() -> Self {
82        Self {
83            elements: Vec::new(),
84            baseelements: HashMap::new(),
85            namespaces: HashMap::new(),
86            context: HashMap::new(),
87            metadata: Vec::new(),
88            whitespace: XmlWhitespaceHandling::Collapse,
89            default_set: default_set(),
90            inject_dtd: None,
91            id_prefix: None,
92            id_strip_suffix: Vec::new(),
93            provenance: false,
94            external_filters: Vec::new(),
95            debug: false,
96        }
97    }
98
99    pub fn resolve_baseelements(&mut self) -> Result<(), XmlConversionError> {
100        let mut replace: Vec<(usize, XmlElementConfig)> = Vec::new();
101        for (i, element) in self.elements.iter().enumerate() {
102            let mut newelement = None;
103            for basename in element.base.iter().rev() {
104                if let Some(baseelement) = self.baseelements.get(basename) {
105                    if newelement.is_none() {
106                        newelement = Some(element.clone());
107                    }
108                    newelement
109                        .as_mut()
110                        .map(|newelement| newelement.update(baseelement));
111                } else {
112                    return Err(XmlConversionError::ConfigError(format!(
113                        "No such base element: {}",
114                        basename
115                    )));
116                }
117            }
118            if let Some(newelement) = newelement {
119                replace.push((i, newelement));
120            }
121        }
122        for (i, element) in replace {
123            self.elements[i] = element;
124        }
125        Ok(())
126    }
127
128    /// Parse the configuration from a TOML string (load the data from file yourself).
129    pub fn from_toml_str(tomlstr: &str) -> Result<Self, String> {
130        let mut config: Self = toml::from_str(tomlstr).map_err(|e| format!("{}", e))?;
131        config.resolve_baseelements().map_err(|e| format!("{}", e))?;
132        Ok(config)
133    }
134
135    pub fn with_debug(mut self, value: bool) -> Self {
136        self.debug = value;
137        self
138    }
139
140    /// Add provenance information pointing each annotation to the appropriate node in the XML source files where it came from (translates into XPathSelector in Web Annotation output)
141    pub fn with_provenance(mut self, value: bool) -> Self {
142        self.provenance = value;
143        self
144    }
145
146    /// Register an XML namespace with prefix
147    pub fn with_prefix(mut self, prefix: impl Into<String>, namespace: impl Into<String>) -> Self {
148        self.namespaces.insert(prefix.into(), namespace.into());
149        self
150    }
151
152    /// A prefix to assign when setting annotation IDs, within this string you can use the special variable `{resource}` to use the resource ID.
153    pub fn with_id_prefix(mut self, prefix: impl Into<String>) -> Self {
154        self.id_prefix = Some(prefix.into());
155        self
156    }
157
158    /// A suffix to strip when assigning annotation IDs
159    pub fn with_id_strip_suffix(mut self, suffix: impl Into<String>) -> Self {
160        self.id_strip_suffix.push(suffix.into());
161        self
162    }
163
164    /// Inject a DTD (for XML entity resolution)
165    pub fn with_inject_dtd(mut self, dtd: impl Into<String>) -> Self {
166        self.inject_dtd = Some(dtd.into());
167        self
168    }
169
170    /// Set default whitespace handling
171    pub fn with_whitespace(mut self, handling: XmlWhitespaceHandling) -> Self {
172        self.whitespace = handling;
173        self
174    }
175
176    /// Set an element configuration
177    pub fn with_element<F>(mut self, expression: &str, setup: F) -> Self
178    where
179        F: Fn(XmlElementConfig) -> XmlElementConfig,
180    {
181        let expression = XPathExpression::new(expression);
182        let element = setup(XmlElementConfig::new(expression));
183        if self.debug {
184            eprintln!("[STAM fromxml] registered {:?}", element);
185        }
186        self.elements.push(element);
187        self
188    }
189
190    /// How to handle this element?
191    fn element_config(&self, node: Node, path: &NodePath) -> Option<&XmlElementConfig> {
192        for elementconfig in self.elements.iter().rev() {
193            if elementconfig.path.test(path, node, self) {
194                return Some(elementconfig);
195            }
196        }
197        None
198    }
199
200    pub fn add_context(&mut self, key: impl Into<String>, value: toml::Value) {
201        self.context.insert(key.into(), value);
202    }
203
204    pub fn debug(&self) -> bool {
205        self.debug
206    }
207}
208
209#[derive(Clone, Copy, Debug, PartialEq, Deserialize)]
210/// Determines how to handle whitespace for an XML element
211pub enum XmlWhitespaceHandling {
212    /// Not specified (used for base templates)
213    Unspecified,
214    //Inherit from parent
215    Inherit,
216    /// Whitespace is kept as is in the XML
217    Preserve,
218    /// all whitespace becomes space, consecutive whitespace is squashed
219    Collapse,
220}
221
222impl Default for XmlWhitespaceHandling {
223    fn default() -> Self {
224        XmlWhitespaceHandling::Unspecified
225    }
226}
227
228impl XmlWhitespaceHandling {
229    fn collapse() -> Self {
230        XmlWhitespaceHandling::Collapse
231    }
232}
233
234#[derive(Debug, Clone, Deserialize, PartialEq, Copy, Default)]
235pub enum XmlAnnotationHandling {
236    /// No annotation
237    #[default]
238    Unspecified,
239
240    /// No annotation
241    None,
242
243    /// Selects the text pertaining to the current element
244    TextSelector,
245
246    /// Selects the text pertaining to the current resource
247    ResourceSelector,
248
249    /// Selects the text between the current element and the next instance of the same element type
250    TextSelectorBetweenMarkers,
251}
252
253#[derive(Debug, Clone, Deserialize)]
254/// XML Element configuration, determines how to map an XML element (identified by an XPath expression) to STAM
255pub struct XmlElementConfig {
256    /// This is XPath-like expression (just a small subset of XPath) to identify an element by its path
257
258    #[serde(default)]
259    path: XPathExpression,
260
261    #[serde(default)]
262    annotation: XmlAnnotationHandling,
263
264    #[serde(default)]
265    annotationdata: Vec<XmlAnnotationDataConfig>,
266
267    /// Template or None for no text handling, prefixes are never targeted by annotations
268    #[serde(default)]
269    textprefix: Option<String>,
270
271    /// Extract text. None means unspecified and defaults to false.
272    #[serde(default)]
273    text: Option<bool>,
274
275    /// Template or None for no text handling, suffixes are never targeted by annotations
276    #[serde(default)]
277    textsuffix: Option<String>,
278
279    // Annotation data for the text prefix
280    #[serde(default)]
281    annotatetextprefix: Vec<XmlAnnotationDataConfig>,
282
283    // Annotation data for the text suffix
284    #[serde(default)]
285    annotatetextsuffix: Vec<XmlAnnotationDataConfig>,
286
287    /// Include the text prefix in the annotation's text selector. None means unspecified and defaults to false
288    #[serde(default)]
289    include_textprefix: Option<bool>,
290
291    /// Include the text suffix in the annotation's text selector. None means unspecified and defaults to false
292    #[serde(default)]
293    include_textsuffix: Option<bool>,
294
295    /// Base elements to derive from
296    #[serde(default)]
297    base: Vec<String>,
298
299    /// Template or None for no ID extraction
300    #[serde(default)]
301    id: Option<String>,
302
303    #[serde(default)]
304    /// Descend into children (false) or not? (true). None means unspecified and defaults to false
305    stop: Option<bool>,
306
307    #[serde(default)]
308    /// Whitespace handling for this element
309    whitespace: XmlWhitespaceHandling,
310}
311
312impl XmlElementConfig {
313    fn new(expression: XPathExpression) -> Self {
314        Self {
315            path: expression,
316            stop: None,
317            whitespace: XmlWhitespaceHandling::Unspecified,
318            annotation: XmlAnnotationHandling::Unspecified,
319            annotationdata: Vec::new(),
320            base: Vec::new(),
321            id: None,
322            textprefix: None,
323            text: None,
324            textsuffix: None,
325            annotatetextprefix: Vec::new(),
326            annotatetextsuffix: Vec::new(),
327            include_textprefix: None,
328            include_textsuffix: None,
329        }
330    }
331
332    pub fn update(&mut self, base: &XmlElementConfig) {
333        if self.whitespace == XmlWhitespaceHandling::Unspecified
334            && base.whitespace != XmlWhitespaceHandling::Unspecified
335        {
336            self.whitespace = base.whitespace;
337        }
338        if self.annotation == XmlAnnotationHandling::Unspecified
339            && base.annotation != XmlAnnotationHandling::Unspecified
340        {
341            self.annotation = base.annotation;
342        }
343        if self.textprefix.is_none() && base.textprefix.is_some() {
344            self.textprefix = base.textprefix.clone();
345        }
346        if self.text.is_none() && base.text.is_some() {
347            self.text = base.text;
348        }
349        if self.textsuffix.is_none() && base.textsuffix.is_some() {
350            self.textsuffix = base.textsuffix.clone();
351        }
352        if self.id.is_none() && base.id.is_some() {
353            self.id = base.id.clone();
354        }
355        if self.stop.is_none() && base.stop.is_some() {
356            self.stop = base.stop;
357        }
358        for annotationdata in base.annotationdata.iter() {
359            if !self.annotationdata.contains(annotationdata) {
360                self.annotationdata.push(annotationdata.clone());
361            }
362        }
363        if self.annotatetextsuffix.is_empty() && !base.annotatetextsuffix.is_empty() {
364            self.annotatetextsuffix = base.annotatetextsuffix.clone();
365        }
366        if self.annotatetextprefix.is_empty() && !base.annotatetextprefix.is_empty() {
367            self.annotatetextprefix = base.annotatetextprefix.clone();
368        }
369        if self.include_textsuffix.is_none() {
370            self.include_textsuffix = base.include_textsuffix;
371        }
372        if self.include_textprefix.is_none() {
373            self.include_textprefix = base.include_textprefix;
374        }
375    }
376
377
378    /// This sets the mode that determines how the element is handledhttps://www.youtube.com/watch?v=G_BrbhRrP6g
379    pub fn with_stop(mut self, stop: bool) -> Self {
380        self.stop = Some(stop);
381        self
382    }
383
384    /// This sets the whitespace handling for this element
385    pub fn with_whitespace(mut self, handling: XmlWhitespaceHandling) -> Self {
386        self.whitespace = handling;
387        self
388    }
389
390    pub fn with_text(mut self, text: bool) -> Self {
391        self.text = Some(text);
392        self
393    }
394
395    pub fn with_base(mut self, iter: impl Iterator<Item = impl Into<String>>) -> Self {
396        self.base = iter.into_iter().map(|s| s.into()).collect();
397        self
398    }
399
400    pub fn without_text(mut self) -> Self {
401        self.text = None;
402        self
403    }
404
405    pub fn with_annotation(mut self, annotation: XmlAnnotationHandling) -> Self {
406        self.annotation = annotation;
407        self
408    }
409
410    /// Not a very safe hash function (just uses an address uniquely associated with this object) but works for our ends
411    fn hash(&self) -> usize {
412        self.path.0.as_ptr() as usize
413    }
414}
415
416impl PartialEq for XmlElementConfig {
417    fn eq(&self, other: &Self) -> bool {
418        self.hash() == other.hash()
419    }
420}
421
422#[derive(Debug, Clone, Deserialize, PartialEq)]
423pub struct XmlAnnotationDataConfig {
424    /// Template
425    id: Option<String>,
426    /// Template
427    set: Option<String>,
428    /// Template
429    key: Option<String>,
430    /// Any string values are interpreted as templates
431    value: Option<toml::Value>,
432
433    /// The type of the value, will be automatically detected if not set.
434    #[serde(default)]
435    valuetype: Option<String>,
436
437    /// Allow value templates that yield an empty string?
438    #[serde(default)]
439    allow_empty_value: bool,
440
441    /// Skip this data entirely if any underlying variables in the templates are undefined
442    #[serde(default)]
443    skip_if_missing: bool,
444
445
446    /// If the value is a list, convert it to multiple annotationdata instances with the same key, one for each of the values
447    #[serde(default)]
448    multiple: bool,
449}
450
451impl XmlAnnotationDataConfig {
452    pub fn with_id(mut self, id: impl Into<String>) -> Self {
453        self.id = Some(id.into());
454        self
455    }
456
457    pub fn with_set(mut self, set: impl Into<String>) -> Self {
458        self.set = Some(set.into());
459        self
460    }
461
462    pub fn with_key(mut self, key: impl Into<String>) -> Self {
463        self.key = Some(key.into());
464        self
465    }
466
467    pub fn with_value(mut self, value: impl Into<toml::Value>) -> Self {
468        self.value = Some(value.into());
469        self
470    }
471}
472
473/// Not really full XPath, just a very minor subset
474#[derive(Debug, Clone, PartialEq, Deserialize)]
475struct XPathExpression(String);
476
477impl XPathExpression {
478    pub fn new(expression: impl Into<String>) -> Self {
479        Self(expression.into())
480    }
481
482    pub fn any() -> Self {
483        Self("*".into())
484    }
485
486    pub fn iter<'a>(
487        &'a self,
488        config: &'a XmlConversionConfig,
489    ) -> impl Iterator<Item = (Option<&'a str>, &'a str, Option<&'a str>)> {
490        self.0.trim_start_matches('/').split("/").map(|segment| {
491            //eprintln!("DEBUG: segment={}", segment);
492            let (prefix, name, condition) = Self::parse_segment(segment);
493            let namespace = if let Some(prefix) = prefix {
494                if let Some(namespace) = config.namespaces.get(prefix).map(|x| x.as_str()) {
495                    Some(namespace)
496                } else {
497                    panic!(
498                        "XML namespace prefix not known in configuration: {}",
499                        prefix
500                    );
501                }
502            } else {
503                None
504            };
505            (namespace, name, condition)
506        })
507    }
508
509    /// matches a node path against an XPath-like expression
510    fn test<'a, 'b>(&self, path: &NodePath<'a, 'b>, mut node: Node<'a,'b>, config: &XmlConversionConfig) -> bool {
511        let mut pathiter = path.components.iter().rev();
512        for (refns, refname, condition) in self.iter(config).collect::<Vec<_>>().into_iter().rev() {
513            if let Some(component) = pathiter.next() {
514                /*if config.debug() {
515                    eprintln!("[STAM fromxml]          testing component {:?} against refns={:?} refname={} condition={:?}", component, refns, refname, condition);
516                }*/
517                if refname != "*" && refname != "" {
518                    if refns.is_none() != component.namespace.is_none() || component.namespace != refns || refname != component.tagname {
519                        return false;
520                    }
521                }
522                if let Some(condition) = condition {
523                    if !self.test_condition(condition, node, config) {
524                        return false;
525                    }
526                }
527                if let Some(parent) = node.parent() { 
528                    node = parent;
529                }
530            } else {
531                if refname != "" {
532                    return false;
533                }
534            }
535        }
536        /* if config.debug() {
537            eprintln!("[STAM fromxml]          match");
538        }*/
539        true
540    }
541
542    fn test_condition<'a,'b>(&self, condition: &'a str, node: Node<'a,'b>, config: &XmlConversionConfig) -> bool {
543        for condition in condition.split(" and ") { //MAYBE TODO: doesn't take literals into account yet!
544            if let Some(pos) = condition.find("!=") {
545                let var = &condition[..pos];
546                let right = condition[pos+2..].trim_matches('"');
547                if self.get_var(var, &node, config) == Some(right) {
548                    return false;
549                }
550            } else if let Some(pos) = condition.find("=") {
551                let var = &condition[..pos];
552                let right = condition[pos+1..].trim_matches('"');
553                let value = self.get_var(var, &node, config);
554                if value != Some(right) {
555                    return false;
556                }
557            } else {
558                //condition is one variable and merely needs to exist
559                let v = self.get_var(condition, &node, config);
560                if v.is_none() || v == Some("") {
561                    return false;
562                }
563            }
564        }
565        /*if config.debug() {
566            eprintln!("[STAM fromxml]          condition matches");
567        }*/
568        true
569    }
570
571    /// Resolve a variable from a conditional expression, given a variable name, node and config
572    fn get_var<'a,'b>(&self, var: &str, node: &Node<'a,'b>, config: &XmlConversionConfig) -> Option<&'a str> { 
573        if var.starts_with("@") {
574            if let Some(pos) = var.find(":") {
575                let prefix = &var[1..pos];
576                if let Some(ns) = config.namespaces.get(prefix) {
577                    let var = &var[pos+1..];
578                    node.attribute((ns.as_str(),var))
579                } else {
580                    None
581                }
582            } else {
583                node.attribute(&var[1..])
584            }
585        } else if var == "text()" {
586            node.text().map(|s|s.trim())
587        } else {
588            None
589        }
590    }
591
592    /// Parses a segment into a namespace-prefix, a name and a condition
593    fn parse_segment<'a>(s: &'a str) -> (Option<&'a str>, &'a str, Option<&'a str>) {
594        let (name, condition) = if let (Some(begin), Some(end)) = (s.find("["), s.rfind("]")) {
595            (&s[..begin], Some(&s[begin + 1..end]))
596        } else {
597            (s, None)
598        };
599        if let Some((prefix, name)) = name.split_once(":") {
600            (Some(prefix), name, condition)
601        } else {
602            (None, name, condition)
603        }
604    }
605}
606
607
608
609impl Default for XPathExpression {
610    fn default() -> Self {
611        Self::any()
612    }
613}
614
/// A single step in a [`NodePath`]: one element, identified by namespace and tag name.
#[derive(Clone, Debug, PartialEq)]
struct NodePathComponent<'a, 'b> {
    /// Namespace of the element, if it has one
    namespace: Option<&'a str>,

    /// Local tag name of the element
    tagname: &'b str,

    /// Index sequence number, 1-indexed (as specified by XPath)
    index: Option<usize>,
}
622
623#[derive(Clone, Debug, PartialEq, Default)]
624struct NodePath<'a, 'b> {
625    components: Vec<NodePathComponent<'a,'b>>,
626}
627
628impl<'a, 'b> Display for NodePath<'a, 'b> {
629    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
630        for component in self.components.iter() {
631            write!(f, "/")?;
632            if let Some(ns) = component.namespace {
633                if let Some(index) = component.index {
634                    write!(f, "{{{}}}{}[{}]", ns, component.tagname, index)?;
635                } else {
636                    write!(f, "{{{}}}{}", ns, component.tagname)?;
637                }
638            } else {
639                if let Some(index) = component.index {
640                    write!(f, "{}[{}]", component.tagname, index)?;
641                } else {
642                    write!(f, "{}", component.tagname)?;
643                }
644            }
645        }
646        Ok(())
647    }
648}
649
650impl<'a,'b> NodePath<'a,'b> {
651    fn add(&mut self, node: &Node<'a,'b>, index: Option<usize>) {
652        if node.tag_name().name() != "" {
653            self.components.push(
654                NodePathComponent {
655                    namespace: node.tag_name().namespace(),
656                    tagname: node.tag_name().name(),
657                    index,
658                }
659            )
660        }
661    }
662
663    fn format_as_xpath(&self, prefixes: &HashMap<String, String>) -> String {
664        let mut out = String::new();
665        for component in self.components.iter() {
666            out.push('/');
667            if let Some(ns) = component.namespace {
668                if let Some(prefix) = prefixes.get(ns) {
669                    if let Some(index) = component.index {
670                        out += &format!("{}:{}[{}]", prefix, component.tagname, index);
671                    } else {
672                        out += &format!("{}:{}", prefix, component.tagname);
673                    }
674                } else {
675                    eprintln!("STAM fromxml WARNING: format_as_xpath: namespace {} not defined, no prefix found!", ns);
676                    if let Some(index) = component.index {
677                        out += &format!("{}[{}]", component.tagname, index);
678                    } else {
679                        out += &format!("{}", component.tagname);
680                    }
681                }
682            } else {
683                if let Some(index) = component.index {
684                    out += &format!("{}[{}]", component.tagname, index);
685                } else {
686                    out += &format!("{}", component.tagname);
687                }
688            }
689        }
690        out
691    }
692}
693
694
/// Counts elder siblings, used to determine index values
#[derive(Debug, Default)]
struct SiblingCounter {
    /// Number of times each tag name (keyed by its debug representation) has been counted
    map: HashMap<String, usize>,
}
700
701impl SiblingCounter {
702    fn count<'a,'b>(&mut self, node: &Node<'a,'b>) -> usize {
703        let s = format!("{:?}", node.tag_name());
704        *self.map.entry(s).and_modify(|c| {*c += 1;}).or_insert(1)
705    }
706}
707
708
709#[derive(Debug, Clone, Deserialize)]
710/// XML Element configuration, determines how to map an XML element (identified by an XPath expression) to STAM
711pub struct MetadataConfig {
712    /// This is XPath-like expression (just a small subset of XPath) to identify an element by its path
713    #[serde(default)]
714    annotation: XmlAnnotationHandling,
715
716    #[serde(default)]
717    annotationdata: Vec<XmlAnnotationDataConfig>,
718
719    /// Template or None for no ID extraction
720    #[serde(default)]
721    id: Option<String>,
722}
723
724/// Translate an XML file to STAM, given a particular configuration
725pub fn from_xml<'a>(
726    filename: &Path,
727    config: &XmlConversionConfig,
728    store: &'a mut AnnotationStore,
729) -> Result<(), String> {
730    if config.debug {
731        eprintln!("[STAM fromxml] parsing {}", filename.display());
732    }
733
734    // Read the raw XML data
735    let mut xmlstring = read_to_string(filename)
736        .map_err(|e| format!("Error opening XML file {}: {}", filename.display(), e))?;
737
738    //patchy: remove HTML5 doctype and inject our own
739    if xmlstring[..100].find("<!DOCTYPE html>").is_some() && config.inject_dtd.is_some() {
740        xmlstring = xmlstring.replacen("<!DOCTYPE html>", "", 1);
741    }
742
743    // we can only inject a DTD if there is no doctype
744    if xmlstring[..100].find("<!DOCTYPE").is_none() {
745        if let Some(dtd) = config.inject_dtd.as_ref() {
746            xmlstring = dtd.to_string() + &xmlstring
747        };
748    } else if config.inject_dtd.is_some() {
749        eprintln!("[STAM fromxml] WARNING: Can not inject DTD because file already has a DOCTYPE");
750    }
751
752    // parse the raw XML data into a DOM
753    let doc = Document::parse_with_options(
754        &xmlstring,
755        ParsingOptions {
756            allow_dtd: true,
757            ..ParsingOptions::default()
758        },
759    )
760    .map_err(|e| format!("Error parsing XML file {}: {}", filename.display(), e))?;
761
762    let mut converter = XmlToStamConverter::new(config);
763    converter
764        .compile()
765        .map_err(|e| format!("Error compiling templates: {}", e))?;
766
767    let textoutfilename = format!(
768        "{}.txt",
769        filename
770            .file_stem()
771            .expect("invalid filename")
772            .to_str()
773            .expect("invalid utf-8 in filename")
774    );
775
776    // extract text (first pass)
777    let mut path = NodePath::default();
778    path.add(&doc.root_element(), None);
779    converter
780        .extract_element_text(doc.root_element(), &path, converter.config.whitespace, Some(textoutfilename.as_str()), Some(&filename.to_string_lossy()), 0)
781        .map_err(|e| {
782            format!(
783                "Error extracting element text from {}: {}",
784                filename.display(),
785                e
786            )
787        })?;
788    if config.debug {
789        eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
790    }
791    let resource = TextResourceBuilder::new()
792        .with_id(filename_to_id(textoutfilename.as_str(), config).to_string())
793        .with_text(converter.text.clone())
794        .with_filename(&textoutfilename);
795
796    converter.resource_handle = Some(
797        store
798            .add_resource(resource)
799            .map_err(|e| format!("Failed to add resource {}: {}", &textoutfilename, e))?,
800    );
801
802    converter.add_metadata(store).map_err(|e| format!("Failed to add metadata {}: {}", &textoutfilename, e))?;
803
804    // extract annotations (second pass)
805    converter
806        .extract_element_annotation(doc.root_element(), &path,  Some(&filename.to_string_lossy()),0,  store)
807        .map_err(|e| {
808            format!(
809                "Error extracting element annotation from {}: {}",
810                filename.display(),
811                e
812            )
813        })?;
814
815    Ok(())
816}
817
818/// Translate an XML file to STAM, given a particular configuration. This translates multiple XML files to a single output file.
819pub fn from_multi_xml<'a>(
820    filenames: &Vec<&Path>,
821    outputfile: Option<&Path>,
822    config: &XmlConversionConfig,
823    store: &'a mut AnnotationStore,
824) -> Result<(), String> {
825
826    let textoutfilename = if let Some(outputfile) = outputfile {
827        format!("{}",outputfile.to_str().expect("invalid utf-8 in filename"))
828    } else {
829        format!(
830            "{}.txt",
831                filenames.iter().next().expect("1 or more filename need to be provided")
832                .file_stem()
833                .expect("invalid filename")
834                .to_str()
835                .expect("invalid utf-8 in filename")
836        )
837    };
838
839    // Read the raw XML data
840    let mut xmlstrings: Vec<String> = Vec::new();
841    let mut docs: Vec<Document> = Vec::new();
842    for filename in filenames.iter() {
843        if config.debug {
844            eprintln!("[STAM fromxml] parsing {} (one of multiple)", filename.display());
845        }
846        //patchy: remove HTML5 doctype and inject our own
847        let mut xmlstring = read_to_string(filename).map_err(|e| format!("Error opening XML file {}: {}", filename.display(), e))?;
848        if xmlstring[..100].find("<!DOCTYPE html>").is_some() && config.inject_dtd.is_some() {
849            xmlstring = xmlstring.replacen("<!DOCTYPE html>", "", 1);
850        }
851        // we can only inject a DTD if there is no doctype
852        if xmlstring[..100].find("<!DOCTYPE").is_none() {
853            if let Some(dtd) = config.inject_dtd.as_ref() {
854                xmlstring = dtd.to_string() + &xmlstring
855            };
856        } else if config.inject_dtd.is_some() {
857            eprintln!("[STAM fromxml] WARNING: Can not inject DTD because file already has a DOCTYPE");
858        }
859        xmlstrings.push(xmlstring);
860    }
861
862    for (filename, xmlstring) in filenames.iter().zip(xmlstrings.iter()) {
863        // parse the raw XML data into a DOM
864        let doc = Document::parse_with_options(
865            xmlstring,
866            ParsingOptions {
867                allow_dtd: true,
868                ..ParsingOptions::default()
869            },
870        )
871        .map_err(|e| format!("Error parsing XML file {}: {}", filename.display(), e))?;
872        docs.push(doc);
873    }
874
875    let mut converter = XmlToStamConverter::new(config);
876    converter
877        .compile()
878        .map_err(|e| format!("Error compiling templates: {}", e))?;
879
880    for (i, (doc, filename)) in docs.iter().zip(filenames.iter()).enumerate() {
881        let mut path = NodePath::default();
882        path.add(&doc.root_element(), None);
883        // extract text (first pass)
884        converter
885            .extract_element_text(doc.root_element(), &path, converter.config.whitespace, Some(textoutfilename.as_str()), Some(&filename.to_string_lossy()), i)
886            .map_err(|e| {
887                format!(
888                    "Error extracting element text from {}: {}",
889                    filename.display(),
890                    e
891                )
892            })?;
893        if config.debug {
894            eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
895        }
896    }
897
898    let resource = TextResourceBuilder::new()
899        .with_id(filename_to_id(textoutfilename.as_str(), config).to_string())
900        .with_text(converter.text.clone())
901        .with_filename(&textoutfilename);
902
903    converter.resource_handle = Some(
904        store
905            .add_resource(resource)
906            .map_err(|e| format!("Failed to add resource {}: {}", &textoutfilename, e))?,
907    );
908
909    converter.add_metadata(store).map_err(|e| format!("Failed to add metadata {}: {}", &textoutfilename, e))?;
910
911    // extract annotations (second pass)
912    for (i,(doc, filename)) in docs.iter().zip(filenames.iter()).enumerate() {
913        let mut path = NodePath::default();
914        path.add(&doc.root_element(), None);
915        converter
916            .extract_element_annotation(doc.root_element(), &path, Some(&filename.to_string_lossy()),i,  store)
917            .map_err(|e| {
918                format!(
919                    "Error extracting element annotation from {}: {}",
920                    filename.display(),
921                    e
922                )
923            })?;
924    }
925
926    Ok(())
927}
928
929/// Translate an XML file to STAM, given a particular configuration. Not writing output files and keeping all in memory. Does not support DTD injection.
930pub fn from_xml_in_memory<'a>(
931    resource_id: &str, 
932    xmlstring: &str,
933    config: &XmlConversionConfig,
934    store: &'a mut AnnotationStore,
935) -> Result<(), String> {
936    if config.debug {
937        eprintln!("[STAM fromxml] parsing XML string");
938    }
939
940    // parse the raw XML data into a DOM
941    let doc = Document::parse_with_options(
942        &xmlstring,
943        ParsingOptions {
944            allow_dtd: true,
945            ..ParsingOptions::default()
946        },
947    )
948    .map_err(|e| format!("Error parsing XML string: {}",  e))?;
949
950    let mut converter = XmlToStamConverter::new(config);
951    converter
952        .compile()
953        .map_err(|e| format!("Error compiling templates: {}", e))?;
954
955    let mut path = NodePath::default();
956    path.add(&doc.root_element(), None);
957    // extract text (first pass)
958    converter
959        .extract_element_text(doc.root_element(), &path, converter.config.whitespace, Some(resource_id), Some(resource_id), 0)
960        .map_err(|e| {
961            format!(
962                "Error extracting element text from {}: {}",
963                resource_id,
964                e
965            )
966        })?;
967    if config.debug {
968        eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
969    }
970    let resource = TextResourceBuilder::new()
971        .with_id(resource_id)
972        .with_text(converter.text.clone());
973
974    converter.resource_handle = Some(
975        store
976            .add_resource(resource)
977            .map_err(|e| format!("Failed to add resource {}: {}", &resource_id, e))?,
978    );
979
980    converter.add_metadata(store).map_err(|e| format!("Failed to add metadata for {}: {}", &resource_id, e))?;
981
982    // extract annotations (second pass)
983    converter
984        .extract_element_annotation(doc.root_element(), &path, Some(resource_id), 0, store)
985        .map_err(|e| {
986            format!(
987                "Error extracting element annotation from {}: {}",
988                resource_id,
989                e
990            )
991        })?;
992
993    Ok(())
994}
995
996pub fn filename_to_id<'a>(filename: &'a str, config: &XmlConversionConfig) -> &'a str {
997    for suffix in config.id_strip_suffix.iter() {
998        if filename.ends_with(suffix) {
999            return &filename[..filename.len() - suffix.len()];
1000        }
1001    }
1002    return filename;
1003}
1004
/// Discriminates between the different positions that can be recorded for one and the same XML node.
/// It is the third component of the keys of the converter's position maps.
#[derive(Clone,Copy,PartialEq, Hash, Eq)]
enum PositionType {
    /// Presumably the position of the text of the element itself - TODO confirm against the code that fills the position maps
    Body,
    /// Presumably the position of the text produced by the element's text prefix template - TODO confirm
    TextPrefix,
    /// Presumably the position of the text produced by the element's text suffix template - TODO confirm
    TextSuffix,
}
1011
/// Holds all state of a conversion from XML to STAM: the text extracted so far, the offsets
/// recorded per XML node, the template engine and a reference to the configuration.
struct XmlToStamConverter<'a> {
    /// The current character position the conversion process is at
    cursor: usize,

    /// The extracted plain-text after/during untangling
    text: String,

    /// The template engine
    template_engine: Engine<'a>,

    /// Keep track of the new positions (unicode offset) where the node starts in the untangled document. The key consists of a document sequence number, a node ID and a position type.
    positionmap: HashMap<(usize,NodeId,PositionType), Offset>,

    /// Keep track of the new positions (bytes offset) where the node starts in the untangled document. The key consists of a document sequence number, a node ID and a position type.
    bytepositionmap: HashMap<(usize,NodeId,PositionType), (usize, usize)>,

    /// Keep track of markers (XML elements with `XmlAnnotationHandling::TextSelectorBetweenMarkers`), the key in this map is some hash of XmlElementConfig.
    markers: HashMap<usize, Vec<(usize,NodeId)>>,

    /// The handle of the text resource, `None` until the resource has been added to the annotation store
    resource_handle: Option<TextResourceHandle>,

    /// Used to keep track of whether we need to insert a whitespace before actual text
    pending_whitespace: bool,

    /// The configuration
    config: &'a XmlConversionConfig,

    /// Namespace to prefix map (the inverse of the prefix to namespace map in the configuration)
    prefixes: HashMap<String, String>,

    ///  Global context for template
    global_context: BTreeMap<String, upon::Value>,

    /// Variable names per template
    variables: BTreeMap<String, BTreeSet<&'a str>>,
    
    /// Indentation inserted into debug messages, two spaces are added for each level of recursion during text extraction
    debugindent: String,
}
1051
/// The errors that can occur during conversion from XML to STAM
pub enum XmlConversionError {
    /// An error raised by the STAM library
    StamError(StamError),
    /// An error related to a template: a description of the context in which it occurred (may be empty) and, if available, the underlying error from the template engine
    TemplateError(String, Option<upon::Error>),
    /// An error described by a message only; presumably signals a problem in the conversion configuration - TODO confirm at the places where it is constructed
    ConfigError(String),
}
1057
1058impl From<StamError> for XmlConversionError {
1059    fn from(error: StamError) -> Self {
1060        Self::StamError(error)
1061    }
1062}
1063
1064impl From<upon::Error> for XmlConversionError {
1065    fn from(error: upon::Error) -> Self {
1066        Self::TemplateError("".into(), Some(error))
1067    }
1068}
1069
1070impl Display for XmlConversionError {
1071    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
1072        match self {
1073            Self::StamError(e) => e.fmt(f),
1074            Self::TemplateError(s, e) => {
1075                f.write_str(s.as_str())?;
1076                f.write_str(": ")?;
1077                if let Some(e) = e {
1078                    e.fmt(f)?;
1079                }
1080                f.write_str("")
1081            }
1082            Self::ConfigError(e) => e.fmt(f),
1083        }
1084    }
1085}
1086
1087impl<'a> XmlToStamConverter<'a> {
1088    fn new(config: &'a XmlConversionConfig) -> Self {
1089        let mut prefixes: HashMap<String, String> = HashMap::new();
1090        for (prefix, namespace) in config.namespaces.iter() {
1091            prefixes.insert(namespace.to_string(), prefix.to_string());
1092        }
1093        let mut template_engine = Engine::new();
1094        template_engine.set_default_formatter(&value_formatter); //this one serializes Lists like in JSON
1095        template_engine.add_function("capitalize", filter_capitalize);
1096        template_engine.add_function("lower", str::to_lowercase);
1097        template_engine.add_function("upper", str::to_uppercase);
1098        template_engine.add_function("trim", |s: &str| s.trim().to_string() );
1099        template_engine.add_function("add", filter_add);
1100        template_engine.add_function("sub", filter_sub);
1101        template_engine.add_function("mul", filter_mul);
1102        template_engine.add_function("div", filter_div);
1103        template_engine.add_function("eq", |a: &upon::Value, b: &upon::Value| a == b);
1104        template_engine.add_function("ne", |a: &upon::Value, b: &upon::Value| a != b);
1105        template_engine.add_function("gt", filter_gt);
1106        template_engine.add_function("lt", filter_lt);
1107        template_engine.add_function("gte", filter_gte);
1108        template_engine.add_function("lte", filter_lte);
1109        template_engine.add_function("int", |a: &upon::Value| match a {
1110            upon::Value::Integer(x) => upon::Value::Integer(*x),
1111            upon::Value::Float(x) => upon::Value::Integer(*x as i64), 
1112            upon::Value::String(s) => upon::Value::Integer(s.parse().expect("int filter expects an integer value")),
1113            _ => panic!("int filter expects an integer value"), //<< --^  TODO: PANIC IS WAY TO STRICT
1114        });
1115        template_engine.add_function("float", |a: &upon::Value| match a {
1116            upon::Value::Float(_) => a.clone(),
1117            upon::Value::Integer(x) => upon::Value::Float(*x as f64),
1118            upon::Value::String(s) => upon::Value::Float(s.parse().expect("float filter expects a float value")),
1119            _ => panic!("int filter expects an integer value"), //<< --^  TODO: PANIC IS WAY TO STRICT
1120        });
1121        template_engine.add_function("str", |a: upon::Value| match a {
1122            upon::Value::Integer(x) => upon::Value::String(format!("{}",x)),
1123            upon::Value::Float(x) => upon::Value::String(format!("{}",x)),
1124            upon::Value::Bool(x) => upon::Value::String(format!("{}",x)),
1125            upon::Value::String(_) => a,
1126            upon::Value::None => upon::Value::String(String::new()),
1127            upon::Value::List(list) => { //too much cloning but it'll do for now
1128                let newlist: Vec<String> = list.iter().map(|v| match v {
1129                    upon::Value::String(s) => s.clone(),
1130                    upon::Value::Integer(d) => format!("{}",d),
1131                    upon::Value::Float(d) => format!("{}",d),
1132                    upon::Value::Bool(d) => format!("{}",d),
1133                    _ => String::new(),
1134                }).collect();
1135                upon::Value::String(newlist.join(", "))
1136            },
1137            _ => panic!("map to string not implemented"), //<< --^  TODO: PANIC IS WAY TO STRICT
1138        });
1139        template_engine.add_function("as_range", |a: i64| upon::Value::List(std::ops::Range { start: 0, end: a }.into_iter().map(|x| upon::Value::Integer(x+1)).collect::<Vec<_>>()) );
1140        template_engine.add_function("last", |list: &[upon::Value]| list.last().map(Clone::clone));
1141        template_engine.add_function("first", |list: &[upon::Value]| {
1142            list.first().map(Clone::clone)
1143        });
1144        template_engine.add_function("tokenize", |s: &str| {
1145            upon::Value::List(
1146                s.split(|c| c == ' ' || c == '\n').filter_map(|x|
1147                    if !x.is_empty() { 
1148                        Some(upon::Value::String(x.to_string())) 
1149                    } else {
1150                        None
1151                    }
1152                )
1153                .collect::<Vec<upon::Value>>())
1154        });
1155        template_engine.add_function("replace", |s: &str, from: &str, to: &str| { 
1156            upon::Value::String(s.replace(from,to))
1157        });
1158        template_engine.add_function("starts_with", |s: &str, prefix: &str| { 
1159            s.starts_with(prefix)
1160        });
1161        template_engine.add_function("ends_with", |s: &str, suffix: &str| { 
1162            s.ends_with(suffix)
1163        });
1164        template_engine.add_function("basename", |a: &upon::Value| match a {
1165            upon::Value::String(s) => upon::Value::String(s.split(|c| c == '/' || c == '\\').last().expect("splitting must work").to_string()),
1166            _ => panic!("basename filter expects a string value"), //<< --^  TODO: PANIC IS WAY TO STRICT
1167        });
1168        template_engine.add_function("noext", |a: &upon::Value| match a {
1169            upon::Value::String(s) => if let Some(pos) = s.rfind('.') {
1170                s[..pos].to_string()
1171            } else {
1172                s.to_string()
1173            },
1174            _ => panic!("basename filter expects a string value"), //<< --^  TODO: PANIC IS WAY TO STRICT
1175        });
1176        template_engine.add_function("join", |list: &upon::Value, delimiter: &str| match list {
1177            upon::Value::List(list) => { //too much cloning but it'll do for now
1178                let newlist: Vec<String> = list.iter().map(|v| match v {
1179                    upon::Value::String(s) => s.clone(),
1180                    upon::Value::Integer(d) => format!("{}",d),
1181                    upon::Value::Float(d) => format!("{}",d),
1182                    upon::Value::Bool(d) => format!("{}",d),
1183                    _ => String::new(),
1184                }).collect();
1185                upon::Value::String(newlist.join(delimiter))
1186            },
1187            _ => {
1188                list.clone() //was not really a list after all, just pass it on so we don't need to panic
1189            }
1190        });
1191        let mut converter = Self {
1192            cursor: 0,
1193            text: String::new(),
1194            template_engine,
1195            positionmap: HashMap::new(),
1196            bytepositionmap: HashMap::new(),
1197            markers: HashMap::new(),
1198            resource_handle: None,
1199            pending_whitespace: false,
1200            global_context: BTreeMap::new(),
1201            debugindent: String::new(),
1202            variables: BTreeMap::new(),
1203            prefixes,
1204            config,
1205        };
1206        converter.set_global_context();
1207        converter.add_external_filters();
1208        converter
1209    }
1210
1211    fn add_external_filters(&mut self) {
1212        for filter in self.config.external_filters.clone() {
1213            self.template_engine.add_function(filter.name.clone(), move |value: &upon::Value| filter.run(value)  );
1214        }
1215    }
1216
1217    /// Compile templates
1218    fn compile(&mut self) -> Result<(), XmlConversionError> {
1219        if self.config.debug {
1220            eprintln!("[STAM fromxml] compiling templates");
1221        }
1222        for element in self.config.elements.iter() {
1223            if let Some(textprefix) = element.textprefix.as_ref() {
1224                if self.template_engine.get_template(textprefix.as_str()).is_none() {
1225                    let template = self.precompile(textprefix.as_str());
1226                    self.template_engine
1227                        .add_template(textprefix.clone(), template)
1228                        .map_err(|e| {
1229                            XmlConversionError::TemplateError(
1230                                format!("element/textprefix template {}", textprefix.clone()),
1231                                Some(e),
1232                            )
1233                        })?;
1234                }
1235            }
1236            if let Some(textsuffix) = element.textsuffix.as_ref() {
1237                if self.template_engine.get_template(textsuffix.as_str()).is_none() {
1238                    let template = self.precompile(textsuffix.as_str());
1239                    self.template_engine
1240                        .add_template(textsuffix.clone(), template)
1241                        .map_err(|e| {
1242                            XmlConversionError::TemplateError(
1243                                format!("element/textsuffix template {}", textsuffix.clone()),
1244                                Some(e),
1245                            )
1246                        })?;
1247                }
1248            }
1249            if let Some(id) = element.id.as_ref() {
1250                if self.template_engine.get_template(id.as_str()).is_none() {
1251                    let template = self.precompile(id.as_str());
1252                    self.template_engine.add_template(id.clone(), template).map_err(|e| {
1253                        XmlConversionError::TemplateError(
1254                            format!("element/id template {}", id.clone()),
1255                            Some(e),
1256                        )
1257                    })?;
1258                }
1259            }
1260            for annotationdata in element.annotationdata.iter().chain(element.annotatetextprefix.iter()).chain(element.annotatetextsuffix.iter()) {
1261                if let Some(id) = annotationdata.id.as_ref() {
1262                    if self.template_engine.get_template(id.as_str()).is_none() {
1263                        let template = self.precompile(id.as_str());
1264                        self.template_engine.add_template(id.clone(), template).map_err(|e| {
1265                            XmlConversionError::TemplateError(
1266                                format!("annotationdata/id template {}", id.clone()),
1267                                Some(e),
1268                            )
1269                        })?;
1270                    }
1271                }
1272                if let Some(set) = annotationdata.set.as_ref() {
1273                    if self.template_engine.get_template(set.as_str()).is_none() {
1274                        let template = self.precompile(set.as_str());
1275                        //eprintln!("------- DEBUG: {} -> {}", set.as_str(), template);
1276                        self.template_engine.add_template(set.clone(), template).map_err(|e| {
1277                            XmlConversionError::TemplateError(
1278                                format!("annotationdata/set template {}", set.clone()),
1279                                Some(e),
1280                            )
1281                        })?;
1282                    }
1283                }
1284                if let Some(key) = annotationdata.key.as_ref() {
1285                    if self.template_engine.get_template(key.as_str()).is_none() {
1286                        let template = self.precompile(key.as_str());
1287                        self.template_engine.add_template(key.clone(), template).map_err(|e| {
1288                            XmlConversionError::TemplateError(
1289                                format!("annotationdata/key template {}", key.clone()),
1290                                Some(e),
1291                            )
1292                        })?;
1293                    }
1294                }
1295                if let Some(value) = annotationdata.value.as_ref() {
1296                    self.compile_value(value)?;
1297                }
1298            }
1299        }
1300        for metadata in self.config.metadata.iter() {
1301            if let Some(id) = metadata.id.as_ref() {
1302                if self.template_engine.get_template(id.as_str()).is_none() {
1303                    let template = self.precompile(id.as_str());
1304                    self.template_engine.add_template(id.clone(), template).map_err(|e| {
1305                        XmlConversionError::TemplateError(
1306                            format!("metadata/id template {}", id.clone()),
1307                            Some(e),
1308                        )
1309                    })?;
1310                }
1311            }
1312            for annotationdata in metadata.annotationdata.iter() {
1313                if let Some(id) = annotationdata.id.as_ref() {
1314                    if self.template_engine.get_template(id.as_str()).is_none() {
1315                        let template = self.precompile(id.as_str());
1316                        self.template_engine.add_template(id.clone(), template).map_err(|e| {
1317                            XmlConversionError::TemplateError(
1318                                format!("annotationdata/id template {}", id.clone()),
1319                                Some(e),
1320                            )
1321                        })?;
1322                    }
1323                }
1324                if let Some(set) = annotationdata.set.as_ref() {
1325                    if self.template_engine.get_template(set.as_str()).is_none() {
1326                        let template = self.precompile(set.as_str());
1327                        //eprintln!("------- DEBUG: {} -> {}", set.as_str(), template);
1328                        self.template_engine.add_template(set.clone(), template).map_err(|e| {
1329                            XmlConversionError::TemplateError(
1330                                format!("annotationdata/set template {}", set.clone()),
1331                                Some(e),
1332                            )
1333                        })?;
1334                    }
1335                }
1336                if let Some(key) = annotationdata.key.as_ref() {
1337                    if self.template_engine.get_template(key.as_str()).is_none() {
1338                        let template = self.precompile(key.as_str());
1339                        self.template_engine.add_template(key.clone(), template).map_err(|e| {
1340                            XmlConversionError::TemplateError(
1341                                format!("annotationdata/key template {}", key.clone()),
1342                                Some(e),
1343                            )
1344                        })?;
1345                    }
1346                }
1347                if let Some(value) = annotationdata.value.as_ref() {
1348                    self.compile_value(value)?;
1349                }
1350            }
1351        }
1352        Ok(())
1353    }
1354
1355    /// Compile templates from a value, all strings are considered templates
1356    fn compile_value(&mut self, value: &'a toml::Value) -> Result<(), XmlConversionError> {
1357        match value {
1358            toml::Value::String(value) => {
1359                if self.template_engine.get_template(value.as_str()).is_none() {
1360                    let template = self.precompile(value.as_str());
1361                    self.template_engine.add_template(value.clone(), template).map_err(|e| {
1362                        XmlConversionError::TemplateError(
1363                            format!("annotationdata/value template {}", value.clone()),
1364                            Some(e),
1365                        )
1366                    })?;
1367                }
1368            }
1369            toml::Value::Table(map) => {
1370                for (_key, value) in map.iter() {
1371                    self.compile_value(value)?;
1372                }
1373            },
1374            toml::Value::Array(list) => {
1375                for value in list.iter() {
1376                    self.compile_value(value)?;
1377                }
1378            }
1379            _ => {} //no templates in other types
1380        }
1381        Ok(())
1382    }
1383
1384    /// untangle text, extract the text (and only the text)
1385    /// from an XML document, according to the
1386    /// mapping configuration and creates a STAM TextResource for it.
1387    /// Records exact offsets per element/node for later use during annotation extraction.
1388    fn extract_element_text<'b>(
1389        &mut self,
1390        node: Node<'a,'b>,
1391        path: &NodePath<'a,'b>,
1392        whitespace: XmlWhitespaceHandling,
1393        resource_id: Option<&str>,
1394        inputfile: Option<&str>,
1395        doc_num: usize,
1396    ) -> Result<(), XmlConversionError> {
1397        if self.config.debug {
1398            eprintln!("[STAM fromxml]{} extracting text for element {}", self.debugindent, path);
1399        }
1400        let mut begin = self.cursor; //current character pos marks the begin
1401        let mut bytebegin = self.text.len(); //current byte pos marks the begin
1402        let mut end_discount = 0; //the discount may be needed later if textsuffixes are outputted (which we do not want as part of the annotation)
1403        let mut end_bytediscount = 0;
1404        let mut firsttext = true; //tracks whether we have already outputted some text, needed for whitespace handling
1405
1406        let mut elder_siblings = SiblingCounter::default();
1407
1408        // obtain the configuration that applies to this element
1409        if let Some(element_config) = self.config.element_config(node, path) {
1410            if self.config.debug {
1411                eprintln!("[STAM fromxml]{} matching config: {:?}", self.debugindent, element_config);
1412            }
1413
1414            if (element_config.stop == Some(false) || element_config.stop.is_none())
1415                && element_config.annotation != XmlAnnotationHandling::TextSelectorBetweenMarkers
1416            {
1417                //do text extraction for this element
1418
1419                let whitespace = if node.has_attribute((NS_XML, "space")) {
1420                    // if there is an explicit xml:space attributes, it overrides whatever whitespace handling we have set:
1421                    match node.attribute((NS_XML, "space")).unwrap() {
1422                        "preserve" => XmlWhitespaceHandling::Preserve,
1423                        "collapse" | "replace" => XmlWhitespaceHandling::Collapse,
1424                        _ => whitespace,
1425                    }
1426                } else if element_config.whitespace == XmlWhitespaceHandling::Inherit
1427                    || element_config.whitespace == XmlWhitespaceHandling::Unspecified
1428                {
1429                    whitespace //from parent, i.e. passed to this (recursive) function by caller
1430                } else {
1431                    element_config.whitespace //default from the config
1432                };
1433
1434                // process the text prefix, a text template to include prior to the actual text
1435                self.process_textprefix(element_config, node, resource_id, inputfile, doc_num, &mut begin, &mut bytebegin)?;
1436
1437                let textbegin = self.cursor;
1438                // process all child elements
1439                for child in node.children() {
1440                    if self.config.debug {
1441                        eprintln!("[STAM fromxml]{} child {:?}", self.debugindent, child);
1442                    }
1443                    if child.is_text() && element_config.text == Some(true) {
1444                        // extract the actual element text
1445                        // this may trigger multiple times if the XML element (`node`) has mixed content
1446
1447                        let mut innertext = child.text().expect("text node must have text");
1448                        let mut pending_whitespace = false;
1449                        let mut leading_whitespace = false;
1450                        if whitespace == XmlWhitespaceHandling::Collapse && !innertext.is_empty() {
1451                            // analyse what kind of whitespace we are dealing with
1452                            let mut all_whitespace = true;
1453                            leading_whitespace = innertext.chars().next().unwrap().is_whitespace();
1454
1455                            // any pending whitespace after this elements is 'buffered' in this boolean
1456                            // and only written out depending on the next text's whitespace situation
1457                            pending_whitespace = innertext
1458                                .chars()
1459                                .inspect(|c| {
1460                                    if !c.is_whitespace() {
1461                                        all_whitespace = false
1462                                    }
1463                                })
1464                                .last()
1465                                .unwrap()
1466                                .is_whitespace();
1467                            if all_whitespace {
1468                                self.pending_whitespace = true;
1469                                if self.config.debug {
1470                                    eprintln!(
1471                                        "[STAM fromxml]{} ^- all whitespace, flag pending whitespace and skipping...",
1472                                        self.debugindent,
1473                                    );
1474                                }
1475                                continue;
1476                            }
1477                            innertext = innertext.trim();
1478                            if self.config.debug {
1479                                eprintln!(
1480                                    "[STAM fromxml]{} ^- collapsed whitespace: {:?}",
1481                                    self.debugindent,
1482                                    innertext
1483                                );
1484                            }
1485                        }
1486                        if self.pending_whitespace || leading_whitespace {
1487                            //output any pending whitespace
1488                            if !self.text.is_empty()
1489                                && !self.text.chars().rev().next().unwrap().is_whitespace()
1490                            {
1491                                if self.config.debug {
1492                                    eprintln!("[STAM fromxml]{} ^- outputting pending whitespace",self.debugindent);
1493                                }
1494                                self.text.push(' ');
1495                                self.cursor += 1;
1496                                if firsttext && self.pending_whitespace {
1497                                    begin += 1;
1498                                    bytebegin += 1;
1499                                    firsttext = false;
1500                                }
1501                            }
1502                            self.pending_whitespace = false;
1503                        }
1504
1505                        // finally we output the actual text, and advance the cursor
1506                        if whitespace == XmlWhitespaceHandling::Collapse {
1507                            let mut prevc = ' ';
1508                            let mut innertext = innertext.replace(|c: char| c.is_whitespace(), " ");
1509                            innertext.retain(|c| {
1510                                let do_retain = c != ' ' || prevc != ' ';
1511                                prevc = c;
1512                                do_retain
1513                            });
1514                            self.text += &innertext;
1515                            self.cursor += innertext.chars().count();
1516                            if self.config.debug {
1517                                eprintln!("[STAM fromxml]{} ^- outputting text child (collapsed whitespace), cursor is now {}: {}",self.debugindent, self.cursor, innertext);
1518                            }
1519                        } else {
1520                            self.text += &innertext;
1521                            self.cursor += innertext.chars().count();
1522                            if self.config.debug {
1523                                eprintln!("[STAM fromxml]{} ^- outputting text child, cursor is now {}: {}",self.debugindent, self.cursor, innertext);
1524                            }
1525                        }
1526                        self.pending_whitespace = pending_whitespace;
1527                    } else if child.is_element() {
1528                        if self.config.debug {
1529                            eprintln!("[STAM fromxml]{} \\- extracting text for this child", self.debugindent);
1530                        }
1531                        self.debugindent.push_str("  ");
1532                        // recursion step, process child element, pass our whitespace handling mode since it may inherit it
1533                        let mut path = path.clone();
1534                        let count = elder_siblings.count(&child);
1535                        path.add(&child, Some(count));
1536                        self.extract_element_text(child, &path, whitespace, resource_id, inputfile, doc_num)?;
1537                        self.debugindent.pop();
1538                        self.debugindent.pop();
1539                    } else {
1540                        if self.config.debug {
1541                            eprintln!("[STAM fromxml]{} ^- skipping this child node", self.debugindent);
1542                        }
1543                        continue;
1544                    }
1545                }
1546
1547                // process the text suffix, a preconfigured string of text to include after to the actual text
1548                self.process_textsuffix(element_config, node, resource_id, inputfile, doc_num, &mut end_discount, &mut end_bytediscount, textbegin)?;
1549            } else if element_config.annotation == XmlAnnotationHandling::TextSelectorBetweenMarkers
1550            {
1551                // this is a marker, keep track of it so we can extract the span between markers in [`extract_element_annotation()`] later
1552                if self.config.debug {
1553                    eprintln!("[STAM fromxml]{} adding to markers (textprefix={:?}, textsuffix={:?})", self.debugindent, element_config.textprefix, element_config.textsuffix);
1554                }
1555
1556
1557                self.markers
1558                    .entry(element_config.hash())
1559                    .and_modify(|v| v.push((doc_num, node.id())))
1560                    .or_insert(vec![(doc_num, node.id())]);
1561
1562                // for markers it doesn't matter whether something text is defined as a prefix or suffix, it's functionally the same because a marker has no text itself
1563
1564                self.process_textprefix(element_config, node, resource_id, inputfile, doc_num, &mut begin, &mut bytebegin)?;
1565                self.process_textsuffix(element_config, node, resource_id, inputfile, doc_num, &mut end_discount, &mut end_bytediscount, self.cursor)?;
1566            }
1567        } else if self.config.debug {
1568            eprintln!(
1569                "[STAM fromxml]{} WARNING: no match, skipping text extraction for element {}",
1570                self.debugindent,
1571                path
1572            );
1573        }
1574
1575        // Last, we store the new text offsets for this element/node so
1576        // we can use it in [`extract_element_annotation()`] to associate
1577        // actual annotations with this span.
1578        if begin <= (self.cursor - end_discount) {
1579            let offset = Offset::simple(begin, self.cursor - end_discount);
1580            if self.config.debug {
1581                eprintln!(
1582                    "[STAM fromxml]{} extracted text for {} @{:?}: {:?}",
1583                    self.debugindent,
1584                    path,
1585                    &offset,
1586                    &self.text[bytebegin..(self.text.len() - end_bytediscount)]
1587                );
1588            }
1589            self.positionmap.insert((doc_num, node.id(), PositionType::Body), offset);
1590            self.bytepositionmap
1591                .insert((doc_num, node.id(), PositionType::Body), (bytebegin, self.text.len() - end_bytediscount));
1592        }
1593        Ok(())
1594    }
1595
1596    /// process the text prefix, a text template to include prior to the actual text
1597    fn process_textprefix<'b>(
1598        &mut self,
1599        element_config: &XmlElementConfig,
1600        node: Node<'a,'b>,
1601        resource_id: Option<&str>,
1602        inputfile: Option<&str>,
1603        doc_num: usize,
1604        begin: &mut usize,
1605        bytebegin: &mut usize
1606    ) -> Result<(), XmlConversionError> {
1607        if let Some(textprefix) = &element_config.textprefix {
1608            self.pending_whitespace = false;
1609            if self.config.debug {
1610                eprintln!("[STAM fromxml]{} outputting textprefix: {:?}", self.debugindent, textprefix);
1611            }
1612            let result =
1613                self.render_template(textprefix, &node, Some(self.cursor), None, resource_id, inputfile, doc_num)
1614                    .map_err(|e| match e {
1615                        XmlConversionError::TemplateError(s, e) => {
1616                            XmlConversionError::TemplateError(
1617                                format!(
1618                                "whilst rendering textprefix template '{}' for node '{}': {}",
1619                                textprefix, node.tag_name().name(), s
1620                            ),
1621                                e,
1622                            )
1623                        }
1624                        e => e,
1625                    })?;
1626            let result_charlen = result.chars().count();
1627
1628            if !element_config.annotatetextprefix.is_empty() {
1629                //record the offsets for textprefix annotation later
1630                let offset = Offset::simple(self.cursor, self.cursor + result_charlen);
1631                self.positionmap.insert((doc_num, node.id(), PositionType::TextPrefix), offset);
1632                self.bytepositionmap
1633                    .insert((doc_num, node.id(), PositionType::TextPrefix), (*bytebegin, *bytebegin + result.len()));
1634            }
1635
1636            self.cursor += result_charlen;
1637            self.text += &result;
1638
1639            if element_config.include_textprefix != Some(true) {
1640                // the textprefix will not be part of the annotation's text selection, increment the offsets:
1641                *begin += result_charlen;
1642                *bytebegin += result.len();
1643            }
1644        }
1645        Ok(())
1646    }
1647
1648    /// process the text suffix, a preconfigured string of text to include after to the actual text
1649    fn process_textsuffix<'b>(
1650        &mut self,
1651        element_config: &XmlElementConfig,
1652        node: Node<'a,'b>,
1653        resource_id: Option<&str>,
1654        inputfile: Option<&str>,
1655        doc_num: usize,
1656        end_discount: &mut usize,
1657        end_bytediscount: &mut usize,
1658        textbegin: usize,
1659    ) -> Result<(), XmlConversionError> {
1660        if let Some(textsuffix) = &element_config.textsuffix {
1661            if self.config.debug {
1662                eprintln!("[STAM fromxml]{} outputting textsuffix: {:?}", self.debugindent, textsuffix);
1663            }
1664            let result = self.render_template(
1665                textsuffix.as_str(),
1666                &node,
1667                Some(textbegin),
1668                Some(self.cursor),
1669                resource_id,
1670                inputfile,
1671                doc_num
1672            ).map_err(|e| match e {
1673                    XmlConversionError::TemplateError(s, e) => {
1674                        XmlConversionError::TemplateError(
1675                            format!(
1676                                "whilst rendering textsuffix template '{}' for node '{}': {}",
1677                                textsuffix,
1678                                node.tag_name().name(),
1679                                s
1680                            ),
1681                            e,
1682                        )
1683                    }
1684                    e => e,
1685            })?;
1686            let end_discount_tmp = result.chars().count();
1687            let end_bytediscount_tmp = result.len();
1688
1689
1690            self.text += &result;
1691
1692            if !element_config.annotatetextsuffix.is_empty() {
1693                //record the offsets for textsuffix annotation later
1694                let offset = Offset::simple(self.cursor, self.cursor + end_discount_tmp);
1695                self.positionmap.insert((doc_num, node.id(), PositionType::TextSuffix), offset);
1696                self.bytepositionmap
1697                    .insert((doc_num, node.id(), PositionType::TextSuffix), (self.text.len() - end_bytediscount_tmp, self.text.len()));
1698            }
1699
1700            self.cursor += end_discount_tmp;
1701            self.pending_whitespace = false;
1702
1703            if element_config.include_textsuffix == Some(true) {
1704                // the textsuffix will be part of the annotation's text selection, no discount for later
1705                *end_discount = 0;
1706                *end_bytediscount = 0;
1707            } else {
1708                // the textsuffix will not be part of the annotation's text selection, set discounts for later
1709                *end_discount = end_discount_tmp;
1710                *end_bytediscount = end_bytediscount_tmp;
1711            }
1712        }
1713        Ok(())
1714    }
1715
1716    /// extract annotations from the XML document
1717    /// according to the mapping configuration and creates a STAM TextResource for it.
1718    /// The text, for the full document, must have already been extracted earlier with [`extract_element_text()`].
1719    /// This relies on the exact offsets per element/node computed earlier during text extraction (`positionmap`).
1720    fn extract_element_annotation<'b>(
1721        &mut self,
1722        node: Node<'a,'b>,
1723        path: &NodePath<'a,'b>,
1724        inputfile: Option<&str>,
1725        doc_num: usize,
1726        store: &mut AnnotationStore,
1727    ) -> Result<(), XmlConversionError> {
1728        if self.config.debug {
1729            eprintln!("[STAM fromxml]{} extracting annotation from {}", self.debugindent, path);
1730        }
1731
1732        let mut elder_siblings = SiblingCounter::default();
1733
1734        // obtain the configuration that applies to this element
1735        if let Some(element_config) = self.config.element_config(node, &path) {
1736            if self.config.debug {
1737                eprintln!("[STAM fromxml]{} matching config: {:?}", self.debugindent, element_config);
1738            }
1739            if element_config.annotation != XmlAnnotationHandling::None
1740                && element_config.annotation != XmlAnnotationHandling::Unspecified
1741            {
1742                let mut builder = AnnotationBuilder::new();
1743
1744                //prepare variables to pass to the template context
1745                let offset = self.positionmap.get(&(doc_num, node.id(), PositionType::Body));
1746                if element_config.annotation == XmlAnnotationHandling::TextSelector {
1747                    if let Some((beginbyte, endbyte)) = self.bytepositionmap.get(&(doc_num, node.id(), PositionType::Body)) {
1748                        if self.config.debug {
1749                            eprintln!("[STAM fromxml]{} annotation covers text {:?} (bytes {}-{})", self.debugindent, offset, beginbyte, endbyte);
1750                        }
1751                    }  else if self.text.is_empty() {
1752                        return Err(XmlConversionError::ConfigError("Can't extract annotations on text if no text was extracted!".into()));
1753                    }
1754                }
1755                let begin = if let Some(offset) = offset {
1756                    if let Cursor::BeginAligned(begin) = offset.begin {
1757                        Some(begin)
1758                    } else {
1759                        None
1760                    }
1761                } else {
1762                    None
1763                };
1764                let end = if let Some(offset) = offset {
1765                    if let Cursor::BeginAligned(end) = offset.end {
1766                        Some(end)
1767                    } else {
1768                        None
1769                    }
1770                } else {
1771                    None
1772                };
1773
1774                let resource_id = if let Some(resource_handle) = self.resource_handle {
1775                    store.resource(resource_handle).unwrap().id()
1776                } else {
1777                    None
1778                };
1779
1780                let mut have_id = false;
1781                if let Some(template) = &element_config.id {
1782                    let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
1783                    let compiled_template = self.template_engine.template(template.as_str());
1784                    let id = compiled_template.render(&context).to_string().map_err(|e| 
1785                            XmlConversionError::TemplateError(
1786                                format!(
1787                                    "whilst rendering id template '{}' for node '{}'",
1788                                    template,
1789                                    node.tag_name().name(),
1790                                ),
1791                                Some(e),
1792                            )
1793                        )?;
1794                    if !id.is_empty() {
1795                        builder = builder.with_id(id);
1796                        have_id = true;
1797                    }
1798                }
1799
1800                if !have_id {
1801                    //generate a random ID if we have none
1802                    if let Some(resource_id) = resource_id {
1803                        builder = builder.with_id(stam::generate_id(&format!("{}-",resource_id), ""));
1804                    } else {
1805                        builder = builder.with_id(stam::generate_id("", ""));
1806                    }
1807                }
1808
1809                builder = self.add_annotationdata_to_builder(element_config.annotationdata.iter(), builder, node.clone(), begin, end, resource_id, inputfile, doc_num)?;
1810
1811
1812                if self.config.provenance  && inputfile.is_some() {
1813                    let path_string = if let Some(id) = node.attribute((NS_XML,"id")) {
1814                        //node has an ID, use that
1815                        format!("//{}[@xml:id=\"{}\"]", self.get_node_name_for_xpath(&node), id)
1816                    } else {
1817                        //no ID, use full XPath expression
1818                        path.format_as_xpath(&self.prefixes)
1819                    };
1820                    let databuilder = AnnotationDataBuilder::new().with_dataset(CONTEXT_ANNO.into()).with_key("target".into()).with_value(
1821                        BTreeMap::from([
1822                            ("source".to_string(),inputfile.unwrap().into()),
1823                            ("selector".to_string(), 
1824                                    BTreeMap::from([
1825                                        ("type".to_string(),"XPathSelector".into()),
1826                                        ("value".to_string(),path_string.into())
1827                                    ]).into()
1828                            )
1829                        ]).into()
1830                    );
1831                    builder = builder.with_data_builder(databuilder);
1832                }
1833
1834
1835                // Finish the builder and add the actual annotation to the store, according to its element handling
1836                match element_config.annotation {
1837                    XmlAnnotationHandling::TextSelector => {
1838                        // Annotation is on text, translates to TextSelector
1839                        if let Some(selector) = self.textselector(node, doc_num, PositionType::Body) {
1840                            builder = builder.with_target(selector);
1841                            if self.config.debug {
1842                                eprintln!("[STAM fromxml]   builder AnnotateText: {:?}", builder);
1843                            }
1844                            store.annotate(builder)?;
1845                        }
1846                        if !element_config.annotatetextprefix.is_empty() || !element_config.annotatetextsuffix.is_empty() {
1847                            self.annotate_textaffixes(node, element_config, inputfile, doc_num, store)?;
1848                        }
1849                    }
1850                    XmlAnnotationHandling::ResourceSelector => {
1851                        // Annotation is metadata, translates to ResourceSelector
1852                        builder = builder.with_target(SelectorBuilder::ResourceSelector(
1853                            self.resource_handle.into(),
1854                        ));
1855                        if self.config.debug {
1856                            eprintln!("[STAM fromxml]   builder AnnotateResource: {:?}", builder);
1857                        }
1858                        store.annotate(builder)?;
1859                    }
1860                    XmlAnnotationHandling::TextSelectorBetweenMarkers => {
1861                        // Annotation is on a text span *between* two marker elements
1862                        if let Some(selector) =
1863                            self.textselector_for_markers(node, doc_num, store, element_config)
1864                        {
1865                            builder = builder.with_target(selector);
1866                            if self.config.debug {
1867                                eprintln!(
1868                                    "[STAM fromxml]   builder TextSelectorBetweenMarkers: {:?}",
1869                                    builder
1870                                );
1871                            }
1872                            store.annotate(builder)?;
1873                            if !element_config.annotatetextprefix.is_empty() || !element_config.annotatetextsuffix.is_empty() {
1874                                self.annotate_textaffixes(node, element_config, inputfile, doc_num, store)?;
1875                            }
1876                        }
1877                    }
1878                    _ => panic!(
1879                        "Invalid annotationhandling: {:?}",
1880                        element_config.annotation
1881                    ),
1882                }
1883            }
1884
1885            // Recursion step
1886            if element_config.stop == Some(false) || element_config.stop.is_none() {
1887                for child in node.children() {
1888                    if child.is_element() {
1889                        self.debugindent.push_str("  ");
1890                        let mut path = path.clone();
1891                        let count = elder_siblings.count(&child);
1892                        path.add(&child, Some(count));
1893                        //eprintln!("DEBUG: count={}, child={:?}, parent={:?}, elder_siblings={:?}", count, child.tag_name(), node.tag_name(), elder_siblings);
1894                        self.extract_element_annotation(child, &path, inputfile, doc_num, store)?;
1895                        self.debugindent.pop();
1896                        self.debugindent.pop();
1897                    }
1898                }
1899            }
1900        } else {
1901            eprintln!(
1902                "[STAM fromxml]{} WARNING: no match, skipping annotation extraction for element {}",
1903                self.debugindent,
1904                path
1905            );
1906        }
1907        Ok(())
1908    }
1909
1910    fn add_annotationdata_to_builder<'input>(&self, iter: impl Iterator<Item = &'a XmlAnnotationDataConfig>,
1911        mut builder: AnnotationBuilder<'a>,
1912        node: Node<'a, 'input>,
1913        begin: Option<usize>,
1914        end: Option<usize>,
1915        resource_id: Option<&str>,
1916        inputfile: Option<&str>,
1917        doc_num: usize,
1918    ) -> Result<AnnotationBuilder<'a>, XmlConversionError> {
1919        for annotationdata in iter {
1920            let mut databuilder = AnnotationDataBuilder::new();
1921            if let Some(template) = &annotationdata.set {
1922                let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
1923                let compiled_template = self.template_engine.template(template.as_str());
1924                let dataset = compiled_template.render(&context).to_string().map_err(|e| 
1925                        XmlConversionError::TemplateError(
1926                            format!(
1927                                "whilst rendering annotationdata/dataset template '{}' for node '{}'",
1928                                template,
1929                                node.tag_name().name(),
1930                            ),
1931                            Some(e),
1932                        )
1933                    )?;
1934                if !dataset.is_empty() {
1935                    databuilder = databuilder.with_dataset(dataset.into())
1936                }
1937            } else {
1938                databuilder =
1939                    databuilder.with_dataset(self.config.default_set.as_str().into());
1940            }
1941            if let Some(template) = &annotationdata.key {
1942                let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
1943                let compiled_template = self.template_engine.template(template.as_str());
1944                match compiled_template.render(&context).to_string().map_err(|e| 
1945                        XmlConversionError::TemplateError(
1946                            format!(
1947                                "whilst rendering annotationdata/key template '{}' for node '{}'",
1948                                template,
1949                                node.tag_name().name(),
1950                            ),
1951                            Some(e),
1952                        )
1953                    )  {
1954                    Ok(key) if !key.is_empty() =>
1955                        databuilder = databuilder.with_key(key.into()) ,
1956                    Ok(_) if !annotationdata.skip_if_missing => {
1957                        return Err(XmlConversionError::TemplateError(
1958                            format!(
1959                                "whilst rendering annotationdata/key template '{}' for node '{}'",
1960                                template,
1961                                node.tag_name().name(),
1962                            ),
1963                            None
1964                        ));
1965                    },
1966                    Err(e) if !annotationdata.skip_if_missing => {
1967                        return Err(e)
1968                    },
1969                    _ => {
1970                        //skip whole databuilder if missing
1971                        continue
1972                    }
1973                }
1974            }
1975            if let Some(value) = &annotationdata.value {
1976                match self.extract_value(value,  node, annotationdata.allow_empty_value, annotationdata.skip_if_missing, annotationdata.valuetype.as_ref().map(|s| s.as_str()), begin, end, resource_id, inputfile, doc_num)? {
1977                    Some(DataValue::List(values)) if annotationdata.multiple => {
1978                        for value in values {
1979                            let mut databuilder_multi = databuilder.clone();
1980                            databuilder_multi = databuilder_multi.with_value(value);
1981                            builder = builder.with_data_builder(databuilder_multi);
1982                        }
1983                    },
1984                    Some(value) => {
1985                        databuilder = databuilder.with_value(value);
1986                    },
1987                    None =>  {
1988                        //skip whole databuilder if missing
1989                        continue
1990                    }
1991                }
1992            }
1993            if !annotationdata.multiple {
1994                builder = builder.with_data_builder(databuilder);
1995            }
1996        }
1997        Ok(builder)
1998    }
1999
2000    /// Annotates textprefix and textsuffix, if applicable
2001    fn annotate_textaffixes<'b>(
2002        &mut self,
2003        node: Node<'a,'b>,
2004        element_config: &XmlElementConfig,
2005        inputfile: Option<&str>,
2006        doc_num: usize,
2007        store: &mut AnnotationStore,
2008    ) -> Result<(), XmlConversionError> {
2009
2010
2011        if !element_config.annotatetextprefix.is_empty() {
2012            let mut builder = AnnotationBuilder::new().with_id(stam::generate_id("textprefix-", ""));
2013            if let Some(offset) = self.positionmap.get(&(doc_num, node.id(), PositionType::TextPrefix)) {
2014                let begin = if let Cursor::BeginAligned(begin) = offset.begin {
2015                        Some(begin)
2016                    } else {
2017                        None
2018                    };
2019                let end = if let Cursor::BeginAligned(end) = offset.end {
2020                        Some(end)
2021                    } else {
2022                        None
2023                    };
2024                builder = self.add_annotationdata_to_builder(element_config.annotatetextprefix.iter(), builder, node.clone(), begin,end, None, inputfile, doc_num)?; //MAYBE TODO: pass resource_id
2025                if let Some(selector) = self.textselector(node, doc_num, PositionType::TextPrefix) {
2026                    builder = builder.with_target(selector);
2027                    if self.config.debug {
2028                        eprintln!("[STAM fromxml]   builder AnnotateText: {:?}", builder);
2029                    }
2030                    store.annotate(builder)?;
2031                } else {
2032                    return Err(XmlConversionError::ConfigError("Failed to create textselector to target textprefix".into()));
2033                }
2034            }
2035        }
2036
2037        if !element_config.annotatetextsuffix.is_empty() {
2038            let mut builder = AnnotationBuilder::new().with_id(stam::generate_id("textsuffix-", ""));
2039            if let Some(offset) = self.positionmap.get(&(doc_num, node.id(), PositionType::TextSuffix)) {
2040                let begin = if let Cursor::BeginAligned(begin) = offset.begin {
2041                        Some(begin)
2042                    } else {
2043                        None
2044                    };
2045                let end = if let Cursor::BeginAligned(end) = offset.end {
2046                        Some(end)
2047                    } else {
2048                        None
2049                    };
2050                builder = self.add_annotationdata_to_builder(element_config.annotatetextsuffix.iter(), builder, node.clone(), begin,end, None, inputfile, doc_num)?; //MAYBE TODO: pass resource_id
2051                if let Some(selector) = self.textselector(node, doc_num, PositionType::TextSuffix) {
2052                    builder = builder.with_target(selector);
2053                    if self.config.debug {
2054                        eprintln!("[STAM fromxml]   builder AnnotateText: {:?}", builder);
2055                    }
2056                    store.annotate(builder)?;
2057                } else {
2058                    return Err(XmlConversionError::ConfigError("Failed to create textselector to target textprefix".into()));
2059                }
2060            }
2061        }
2062        Ok(())
2063    }
2064
2065    /// Extract values, running the templating engine in case of string values
2066    fn extract_value<'b>(&self, value: &'a toml::Value, node: Node<'a,'b>, allow_empty_value: bool, skip_if_missing: bool, valuetype: Option<&str>, begin: Option<usize>, end: Option<usize>, resource_id: Option<&str>, inputfile: Option<&str>, doc_num: usize) -> Result<Option<DataValue>, XmlConversionError>{
2067        match value {
2068            toml::Value::String(template) => {  
2069                let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
2070                /*
2071                if self.config.debug() {
2072                    eprintln!(
2073                        "[STAM fromxml]              Context for annotationdata/map template '{}' for node '{}': {:?}",
2074                        template,
2075                        node.tag_name().name(),
2076                        context
2077                    );
2078                }
2079                */
2080                let compiled_template = self.template_engine.template(template.as_str()); //panics if doesn't exist, but that can't happen
2081                match compiled_template.render(&context).to_string().map_err(|e| 
2082                        XmlConversionError::TemplateError(
2083                            format!(
2084                                "whilst rendering annotationdata/map template '{}' for node '{}'.{}",
2085                                template,
2086                                node.tag_name().name(),
2087                                if self.config.debug() {
2088                                    format!("\nContext was {:?}.\nVariables are: {:?}", context, self.variables.get(template))
2089                                } else {
2090                                    String::new()
2091                                }
2092                            ),
2093                            Some(e),
2094                        )
2095                    )  {
2096                    Ok(value) => {
2097                        if !value.is_empty() || allow_empty_value {
2098                            string_to_datavalue(value, valuetype).map(|v| Some(v))
2099                        } else {
2100                            //skip
2101                            Ok(None)
2102                        }
2103                    },
2104                    Err(e) if !skip_if_missing => {
2105                        Err(e)
2106                    },
2107                    Err(_) if allow_empty_value => {
2108                        Ok(Some("".into()))
2109                    },
2110                    Err(_) => {
2111                        //skip whole databuilder if missing
2112                        Ok(None)
2113                    }
2114                }
2115            },
2116            toml::Value::Table(map) => {
2117                let mut resultmap: BTreeMap<String,DataValue> = BTreeMap::new();
2118                for (key, value) in map.iter() {
2119                    if let Some(value) = self.extract_value(value,  node, false, true, None, begin, end, resource_id, inputfile, doc_num)? {
2120                        resultmap.insert(key.clone(), value);
2121                    }
2122                }
2123                Ok(Some(resultmap.into()))
2124            },
2125            toml::Value::Array(list) => {
2126                let mut resultlist: Vec<DataValue> = Vec::new();
2127                for value in list.iter() {
2128                    if let Some(value) = self.extract_value(value, node, false, true, None,  begin, end, resource_id, inputfile, doc_num)? {
2129                        resultlist.push(value);
2130                    }
2131                }
2132                Ok(Some(resultlist.into()))
2133            }
2134            toml::Value::Boolean(v) => Ok(Some(DataValue::Bool(*v))),
2135            toml::Value::Float(v) => Ok(Some(DataValue::Float(*v))),
2136            toml::Value::Integer(v) => Ok(Some(DataValue::Int(*v as isize))),
2137            toml::Value::Datetime(_v) => {
2138                todo!("fromxml: Datetime conversion not implemented yet");
2139            }
2140        }
2141    }
2142
2143    /// Extract values for metadata (no associated node), running the templating engine in case of string values
2144    fn extract_value_metadata<'b>(&self, value: &'a toml::Value, context: &upon::Value, allow_empty_value: bool, skip_if_missing: bool, resource_id: Option<&str>) -> Result<Option<DataValue>, XmlConversionError>{
2145        match value {
2146            toml::Value::String(template) => {  
2147                let compiled_template = self.template_engine.template(template.as_str()); //panics if doesn't exist, but that can't happen
2148                match compiled_template.render(&context).to_string().map_err(|e| 
2149                        XmlConversionError::TemplateError(
2150                            format!(
2151                                "whilst rendering annotationdata/metadata template '{}' for metadata",
2152                                template,
2153                            ),
2154                            Some(e),
2155                        )
2156                    )  {
2157                    Ok(value) => {
2158                        if !value.is_empty() || allow_empty_value {
2159                            Ok(Some(value.into()))
2160                        } else {
2161                            //skip
2162                            Ok(None)
2163                        }
2164                    },
2165                    Err(e) if !skip_if_missing => {
2166                        Err(e)
2167                    },
2168                    Err(_) if allow_empty_value => {
2169                        Ok(Some("".into()))
2170                    },
2171                    Err(_) => {
2172                        //skip whole databuilder if missing
2173                        Ok(None)
2174                    }
2175                }
2176            },
2177            toml::Value::Table(map) => {  
2178                let mut resultmap: BTreeMap<String,DataValue> = BTreeMap::new();
2179                for (key, value) in map.iter() {
2180                    if let Some(value) = self.extract_value_metadata(value, context, false, true,  resource_id)? {
2181                        resultmap.insert(key.clone(), value);
2182                    }
2183                }
2184                Ok(Some(resultmap.into()))
2185            },
2186            toml::Value::Array(list) => {  
2187                let mut resultlist: Vec<DataValue> = Vec::new();
2188                for value in list.iter() {
2189                    if let Some(value) = self.extract_value_metadata(value, context, false, true, resource_id)? {
2190                        resultlist.push(value);
2191                    }
2192                }
2193                Ok(Some(resultlist.into()))
2194            }
2195            toml::Value::Boolean(v) => Ok(Some(DataValue::Bool(*v))),
2196            toml::Value::Float(v) => Ok(Some(DataValue::Float(*v))),
2197            toml::Value::Integer(v) => Ok(Some(DataValue::Int(*v as isize))),
2198            toml::Value::Datetime(_v) => {
2199                todo!("fromxml: Datetime conversion not implemented yet");
2200            }
2201        }
2202    }
2203
2204    /// Select text corresponding to the element/node and document number
2205    fn textselector<'s>(&'s self, node: Node, doc_num: usize, positiontype: PositionType) -> Option<SelectorBuilder<'s>> {
2206        let res_handle = self.resource_handle.expect("resource must be associated");
2207        if let Some(offset) = self.positionmap.get(&(doc_num, node.id(), positiontype)) {
2208            Some(SelectorBuilder::TextSelector(
2209                BuildItem::Handle(res_handle),
2210                offset.clone(),
2211            ))
2212        } else {
2213            None
2214        }
2215    }
2216
2217    /// Select text between this element/node and the next of the same type
2218    fn textselector_for_markers<'b>(
2219        &self,
2220        node: Node,
2221        doc_num: usize,
2222        store: &AnnotationStore,
2223        element_config: &'b XmlElementConfig,
2224    ) -> Option<SelectorBuilder<'b>> {
2225        let resource = store
2226            .resource(
2227                self.resource_handle
2228                    .expect("resource must have been created"),
2229            )
2230            .expect("resource must exist");
2231        let mut end: Option<usize> = None;
2232        if let Some(markers) = self.markers.get(&element_config.hash()) {
2233            let mut grab = false;
2234            for (d_num, n_id) in markers.iter() {
2235                if grab {
2236                    //this marker is the next one, it's begin position is our desired end position
2237                    end = self.positionmap.get(&(*d_num, *n_id, PositionType::Body)).map(|offset| {
2238                        offset
2239                            .begin
2240                            .try_into()
2241                            .expect("begin cursor must be beginaligned")
2242                    });
2243                    break;
2244                }
2245                if doc_num == *d_num && *n_id == node.id() {
2246                    //current node/marker found, signal grab for the next one
2247                    grab = true;
2248                }
2249            }
2250        };
2251        if end.is_none() {
2252            //no next marker found, use end of document instead
2253            end = Some(resource.textlen());
2254        }
2255        if let (Some(offset), Some(end)) = (self.positionmap.get(&(doc_num, node.id(), PositionType::Body)), end) {
2256            Some(SelectorBuilder::TextSelector(
2257                BuildItem::Handle(self.resource_handle.unwrap()),
2258                Offset::simple(
2259                    offset
2260                        .begin
2261                        .try_into()
2262                        .expect("begin cursor must be beginaligned"),
2263                    end,
2264                ),
2265            ))
2266        } else {
2267            None
2268        }
2269    }
2270
2271    fn set_global_context(&mut self) {
2272        self.global_context
2273            .insert("context".into(), upon::Value::Map(self.config.context.iter().map(|(k,v)| (k.clone(), map_value(v))).collect()));
2274        self.global_context
2275            .insert("namespaces".into(), self.config.namespaces.clone().into());
2276        self.global_context
2277            .insert("default_set".into(), self.config.default_set.clone().into());
2278    }
2279
2280    fn render_template<'input, 't>(
2281        &self,
2282        template: &'t str,
2283        node: &Node<'a, 'input>,
2284        begin: Option<usize>,
2285        end: Option<usize>,
2286        resource: Option<&str>,
2287        inputfile: Option<&str>,
2288        doc_num: usize,
2289    ) -> Result<Cow<'t, str>, XmlConversionError> {
2290        if template.chars().any(|c| c == '{') {
2291            //value is a template, templating engine probably needed
2292            let compiled_template = self.template_engine.template(template);
2293            let context = self.context_for_node(&node, begin, end, template, resource, inputfile, doc_num);
2294            let result = compiled_template.render(context).to_string()?;
2295            Ok(Cow::Owned(result))
2296        } else {
2297            //value is a literal: templating engine not needed
2298            Ok(Cow::Borrowed(template))
2299        }
2300    }
2301
2302    fn context_for_node<'input>(
2303        &self,
2304        node: &Node<'a, 'input>,
2305        begin: Option<usize>,
2306        end: Option<usize>,
2307        template: &str, 
2308        resource: Option<&str>,
2309        inputfile: Option<&str>,
2310        doc_num: usize,
2311    ) -> upon::Value {
2312        let mut context = self.global_context.clone();
2313        let length = if let (Some(begin), Some(end)) = (begin, end) {
2314            Some(end - begin)
2315        } else {
2316            None
2317        };
2318        context.insert("localname".into(), node.tag_name().name().into());
2319        //name with name prefix (if any)
2320        context.insert("name".into(), self.get_node_name_for_template(node).into());
2321        if let Some(namespace) = node.tag_name().namespace() {
2322            //the full namespace
2323            context.insert("namespace".into(), namespace.into());
2324        }
2325
2326        // Offset in the untangled plain text
2327        if let Some(begin) = begin {
2328            context.insert("begin".into(), upon::Value::Integer(begin as i64));
2329        }
2330        if let Some(end) = end {
2331            context.insert("end".into(), upon::Value::Integer(end as i64));
2332        }
2333        if let Some(length) = length {
2334            context.insert("length".into(), upon::Value::Integer(length as i64));
2335        }
2336        if let Some(resource) = resource {
2337            //the resource ID
2338            context.insert("resource".into(), resource.into());
2339        }
2340        if let Some(inputfile) = inputfile {
2341            //the input file
2342            context.insert("inputfile".into(), inputfile.into());
2343        }
2344        //document number (0-indexed), useful in case multiple input documents are cast to a single output text
2345        context.insert("doc_num".into(), upon::Value::Integer(doc_num as i64));
2346
2347        if let Some(vars) = self.variables.get(template) {
2348            for var in vars {
2349                let mut encodedvar = String::new();
2350                if let Some(value) = self.context_for_var(node, var, &mut encodedvar, false) {
2351                    if self.config.debug() {
2352                        eprintln!(
2353                            "[STAM fromxml]              Set context variable for template '{}' for node '{}': {}={:?}   (encodedvar={})",
2354                            template,
2355                            node.tag_name().name(),
2356                            var,
2357                            value,
2358                            encodedvar
2359                        );
2360                    }
2361                    if value != upon::Value::None {
2362                        context.insert(encodedvar, value);
2363                    }
2364                } else if self.config.debug() {
2365                    eprintln!(
2366                        "[STAM fromxml]              Missed context variable for template '{}' for node '{}': {}",
2367                        template,
2368                        node.tag_name().name(),
2369                        var
2370                    );
2371                }
2372            }
2373        }
2374        upon::Value::Map(context)
2375    }
2376
    /// Looks up the value of a template variable in the XML DOM, relative to `node`.
    ///
    /// The variable is a path of `/`-separated components that is consumed one component per
    /// (recursive) call:
    ///
    /// * a leading `$` (or `?.$`) sigil marks an element lookup, `$$` (or `?.$$`) an element
    ///   lookup that returns *all* matches as a list, `?.@` an optional attribute lookup;
    /// * an empty first component (the path starts with `/`) restarts the lookup at the root element;
    /// * `@name` or `@prefix:name` looks up an attribute on the current node;
    /// * `..` moves to the parent element, `.` stays on the current node;
    /// * any other component selects a child element by (optionally prefixed) name, with an
    ///   optional simple condition in square brackets (see `extract_condition`);
    /// * when no component is left, the text of the node (via `recursive_text`) is the value.
    ///
    /// `path` receives the *encoded* variable name (this is safe to pass to the templating
    /// engine). The caller passes an empty string; a non-empty `path` signals a recursive call.
    ///
    /// `return_all_matches` makes child lookups collect every match in a list instead of
    /// returning the first one; it is switched on automatically for the `$$` sigils.
    ///
    /// Returns `None` if the path could not be resolved.
    fn context_for_var<'input>(
        &self,
        node: &Node<'a, 'input>,
        var: &str,
        path: &mut String,
        mut return_all_matches: bool,
    ) -> Option<upon::Value> {

        //are we the first call by the caller or are we a recursion?
        let first = path.is_empty();

        //strip the sigil from the variable; on the first call the encoded equivalent of the sigil is written to the path
        let var = if var.starts_with("?.$$") {
            if first {
                path.push_str("?.ELEMENTS_");
                return_all_matches = true;
                if self.config.debug {
                    eprintln!("[STAM fromxml]              will return all matches for {}", var);
                }
            };
            &var[4..]
        } else if var.starts_with("?.$") {
            if first {
                path.push_str("?.ELEMENT_");
            };
            &var[3..]
        } else if var.starts_with("$$") {
            if first {
                path.push_str("ELEMENTS_");
                return_all_matches = true;
                if self.config.debug {
                    eprintln!("[STAM fromxml]              will return all matches for {}", var);
                }
            };
            &var[2..]
        } else if var.starts_with("$") {
            if first {
                path.push_str("ELEMENT_");
            };
            &var[1..]
        } else if var.starts_with("?.@") {
            if first {
                path.push_str("?.");
            };
            //only the "?." is stripped, the @ is kept so the attribute branch below picks it up
            &var[2..]
        } else {
            var
        };

        //in a recursion, separate this component from the previous one (unless we directly follow the sigil)
        if !first && !var.is_empty() && !path.ends_with("ELEMENT_") && !path.ends_with("ELEMENTS_"){
            path.push_str("_IN_");
        }

        //get the first component of the variable
        let (component, remainder) = var.split_once("/").unwrap_or((var,""));
        //eprintln!("DEBUG: component={}, remainder={}, node={}, return_all_matches={}", component, remainder, node.tag_name().name(), return_all_matches);
        if component.is_empty() {
            if first && !remainder.is_empty() {
                //we're asked to start at the root node
                let mut n = node.clone();
                //find the root node
                while let Some(parentnode) = n.parent_element() {
                    n = parentnode;
                }
                //recurse from root node
                let (rootcomponent, remainder) = remainder.split_once("/").unwrap_or((remainder,""));
                let (prefix, localname)  = if let Some(pos) = rootcomponent.find(":") {
                    (Some(&rootcomponent[0..pos]),  &rootcomponent[pos+1..])
                } else {
                    (None, rootcomponent)
                };
                //test if root name corresponds with what we expected (only the local name is compared, * matches any root)
                if localname != n.tag_name().name() && localname != "*" {
                    None
                } else {
                    if let Some(prefix) = prefix {
                        path.push_str(prefix);
                        path.push_str("__");
                    }
                    path.push_str(localname);
                    self.context_for_var(&n, remainder, path, return_all_matches)
                }
            } else {
                //an empty component is the stop condition, this function is called recursively, stripping one
                //component at a time until nothing is left, we then take the text of that final node:
                Some(recursive_text(node).into())
            }
        } else if component.starts_with("@"){
            //attribute lookup: the result of the lookup is converted to a template value and
            //returned wrapped in Some, whether or not the attribute exists on the node
            if let Some(pos) = component.find(":") {
                //attribute with a namespace prefix, the prefix must be known in the configuration
                let prefix = &component[1..pos];
                if let Some(ns) = self.config.namespaces.get(prefix) {
                    let var = &component[pos+1..];
                    path.push_str("ATTRIB_");
                    path.push_str(prefix);
                    path.push_str("__");
                    path.push_str(var);
                    Some(
                        node.attribute((ns.as_str(),var)).into()
                    )
                } else {
                    None
                }
            } else {
                let var = &component[1..];
                path.push_str("ATTRIB_");
                path.push_str(var);
                Some(
                    node.attribute(var).into()
                )
            }
        } else if component == ".." {
            if let Some(parentnode) = node.parent_element().as_ref() {
                //recurse with parent node
                path.push_str("PARENT");
                self.context_for_var(parentnode, remainder, path, return_all_matches)
            } else {
                None
            }
        } else if component == "." {
            path.push_str("THIS");
            if !remainder.is_empty() {
                //a . is meaningless if not the final component
                self.context_for_var(node, remainder, path, return_all_matches)
            } else {
                Some(recursive_text(node).into())
            }
        } else {
            //child element lookup
            let (prefix, localname)  = if let Some(pos) = component.find(":") {
                (Some(&component[0..pos]),  &component[pos+1..])
            } else {
                (None, component)
            };
            let localname_with_condition = localname;
            let (localname, condition_str, condition) = self.extract_condition(localname_with_condition); //extract X-Path like conditions [@attrib="value"]  (very limited!)
            //eprintln!("DEBUG: looking for {} (prefix={:?},localname={}, condition={:?}) in {:?}", localname_with_condition,  prefix, localname, condition, node.tag_name());
            let mut multiple_value_buffer: Vec<upon::Value> = Vec::new(); //only used when return_all_matches == true
            let mut final_path: String = String::new(); //only used when return_all_matches == true
            for child in node.children() {
                if child.is_element() {
                    let namedata = child.tag_name();
                    //a child in a namespace only matches if that namespace has a known prefix and the variable uses that prefix
                    let mut child_matches = if let Some(namespace) = namedata.namespace() {
                        if let Some(foundprefix) = self.prefixes.get(namespace) {
                            Some(foundprefix.as_str()) == prefix && localname == namedata.name()
                        } else {
                            false
                        }
                    } else {
                        namedata.name() == localname
                    };
                    if child_matches {
                        //MAYBE TODO: move to separate function
                        if let Some((attribname, negate, attribvalue)) = condition {
                            //test condition: falsify child_matches
                            //(a child that lacks the attribute never matches, regardless of negation)
                            if let Some(pos) = attribname.find(":") {
                                let prefix = &attribname[0..pos];
                                if let Some(ns) = self.config.namespaces.get(prefix) {
                                    let attribname = &attribname[pos+1..];
                                    if let Some(value) = child.attribute((ns.as_str(),attribname)) {
                                        if !negate && attribvalue != Some(value) {
                                            child_matches = false;
                                        } else if negate && attribvalue == Some(value) {
                                            child_matches = false;
                                        }
                                    } else {
                                        child_matches = false;
                                    }
                                } else {
                                    child_matches = false;
                                }
                            } else {
                                if let Some(value) = child.attribute(attribname) {
                                    if !negate && attribvalue != Some(value) {
                                        child_matches = false;
                                    } else if negate && attribvalue == Some(value) {
                                        child_matches = false;
                                    }
                                } else {
                                    child_matches = false;
                                }
                            }
                        }
                        if !child_matches && self.config.debug {
                            eprintln!("[STAM fromxml] candidate node does not meet condition: {}", localname_with_condition);
                        }
                        //end condition test
                    }
                    if child_matches {
                        //remember the path length so the path can be restored if this child does not lead to a value
                        let prevpathlen = path.len();
                        //update path
                        if let Some(prefix) = prefix {
                            path.push_str(prefix);
                            path.push_str("__");
                        }
                        path.push_str(localname);
                        if condition.is_some() {
                            //simply encode the condition as a hash (non-decodable but that's okay)
                            let mut hasher = DefaultHasher::new();
                            condition_str.hash(&mut hasher);
                            let h = hasher.finish();
                            path.push_str(&format!("_COND{}_", h));
                        }
                        if let Some(value) = self.context_for_var(&child, remainder, path, return_all_matches) {
                            //success
                            if return_all_matches {
                                //a list from a deeper level is flattened into our own list
                                if let upon::Value::List(v) = value {
                                    multiple_value_buffer.extend(v.into_iter());
                                } else {
                                    multiple_value_buffer.push(value);
                                }
                                //keep the encoded path of the first match, the path itself is truncated again below
                                if final_path.is_empty() {
                                    final_path = path.clone();
                                }
                                //do not return yet, there may be more!
                            } else {
                                //normal behaviour, get first match
                                return Some(value);
                            }
                        }
                        //child didn't match (or we want multiple matches), truncate path again and continue search (a later child may match again!)
                        path.truncate(prevpathlen);
                    }
                }
            }
            if !multiple_value_buffer.is_empty() {
                //we found multiple values, return them
                if self.config.debug {
                    eprintln!("[STAM fromxml]              returning multiple matches of {} as list", var);
                }
                //we also return the path of the match
                *path = final_path;
                Some(multiple_value_buffer.into())
            } else {
                //no match found for this variable
                if self.config.debug {
                    eprintln!("[STAM fromxml]              returning with no match found for {} in {}", var, node.tag_name().name());
                }
                None
            }
        }
    }
2619
2620    fn extract_condition<'b>(&self, localname: &'b str) -> (&'b str, &'b str, Option<(&'b str, bool, Option<&'b str>)>) { //(localname, condition, Option<(attrib, negation, attribvalue)>)
2621        //simple conditional statement
2622        if localname.ends_with("]") {
2623            if let Some(pos) = localname.find("[") {
2624                let condition = &localname[pos+1..localname.len()-1];
2625                let (mut attrib, negation, attribvalue) = if let Some(pos) = condition.find("=") {
2626                     let attrib = condition[0..pos].trim();
2627                     let value = condition[pos+1..].trim();
2628                     let value = &value[1..value.len() - 1]; //strips the literal quotes (") for the value
2629                     if attrib.ends_with('!') {
2630                        //negation (!= operator)
2631                        (attrib[..attrib.len() - 1].trim(), true, Some(value))
2632                     } else {
2633                        (attrib.trim(), false, Some(value))
2634                     }
2635                } else {
2636                    (condition, false, None)
2637                };
2638                if attrib.starts_with('@') {
2639                    //this should actually be mandatory and already checked during template precompilation
2640                    attrib = &attrib[1..];
2641                }
2642                return (&localname[..pos], condition, Some((attrib,  negation,attribvalue )) );
2643            }
2644        }
2645        (localname, "", None)
2646    }
2647
2648
2649    fn get_node_name_for_template<'b>(&self, node: &'b Node) -> Cow<'b,str> {
2650        let extended_name = node.tag_name();
2651        match (extended_name.namespace(), extended_name.name()) {
2652            (Some(namespace), tagname) => {
2653                if let Some(prefix) = self.prefixes.get(namespace) {
2654                    Cow::Owned(format!("{}__{}", prefix, tagname))
2655                } else {
2656                    Cow::Borrowed(tagname)
2657                }
2658            }
2659            (None, tagname) => Cow::Borrowed(tagname),
2660        }
2661    }
2662
2663    fn get_node_name_for_xpath<'b>(&self, node: &'b Node) -> Cow<'b,str> {
2664        let extended_name = node.tag_name();
2665        match (extended_name.namespace(), extended_name.name()) {
2666            (Some(namespace), tagname) => {
2667                if let Some(prefix) = self.prefixes.get(namespace) {
2668                    Cow::Owned(format!("{}:{}", prefix, tagname))
2669                } else {
2670                    Cow::Borrowed(tagname)
2671                }
2672            }
2673            (None, tagname) => Cow::Borrowed(tagname),
2674        }
2675    }
2676
2677
2678    fn precompile(&mut self, template: &'a str) -> Cow<'a,str> {
2679        let mut replacement = String::new();
2680        let mut variables: BTreeSet<&'a str> = BTreeSet::new();
2681        let mut begin = 0;
2682        let mut end = 0;
2683        for i  in 0..template.len() {
2684            let slice = &template[i..];
2685            if slice.starts_with("{{") || slice.starts_with("{%") {
2686                begin = i;
2687            } else if slice.starts_with("}}") || slice.starts_with("%}") {
2688                if end < begin+2 {
2689                    replacement.push_str(&template[end..begin+2]);
2690                }
2691                let inner = &template[begin+2..i]; //the part without the {{  }}
2692                replacement.push_str(&self.precompile_inblock(inner, &mut variables));
2693                end = i;
2694            }
2695        }
2696        if end > 0 {
2697            replacement.push_str(&template[end..]);
2698        }
2699        self.variables.insert(template.into(), variables);
2700        //eprintln!("DEBUG: precompile({}) -> {}", template, replacement);
2701
2702        if !replacement.is_empty() {
2703            Cow::Owned(replacement)
2704        } else {
2705            Cow::Borrowed(template)
2706        }
2707    }
2708
2709    fn precompile_inblock<'s>(&self, s: &'s str, vars: &mut BTreeSet<&'s str>) -> Cow<'s,str> {
2710        let mut quoted = false;
2711        let mut var = false;
2712        let mut begin = 0;
2713        let mut end = 0;
2714        let mut replacement = String::new();
2715        let mut in_condition = false;
2716        for (i,c) in s.char_indices() {
2717            if in_condition && c != ']' {
2718                continue;
2719            }
2720            if c == '"' {
2721                quoted = !quoted;
2722            } else if !quoted {
2723                if !var && (c == '@' || c == '$') {
2724                    //token is an XML variable name, its syntax needs some changes before it can be used in the templating engine
2725                    var = true;
2726                    begin = i;
2727                } else if var && c == '[' {
2728                    in_condition = true;
2729                } else if var && in_condition && c == ']' {
2730                    //end of condition
2731                    in_condition = false;
2732                } else if var && in_condition  {
2733                    //in condition
2734                    continue;
2735                } else if var && (!c.is_alphanumeric() && c != '$' && c != '.' && c != '/' && c != '_' && c != ':' && c != '@') {
2736                    //end of variable (including condition if applicable)
2737                    if end < begin {
2738                        replacement.push_str(&s[end..begin]);
2739                    }
2740                    let varname = &s[begin..i];
2741                    vars.insert(varname);
2742                    let replacement_var = self.precompile_name(varname);
2743                    replacement += &replacement_var;
2744                    end = i;
2745                    var = false;
2746                }
2747            }
2748        }
2749        if end > 0 {
2750            replacement.push_str(&s[end..]);
2751        }
2752        if var {
2753            //don't forget last one
2754            let varname = &s[begin..];
2755            vars.insert(varname);
2756            let replacement_var = self.precompile_name(varname);
2757            replacement += &replacement_var;
2758        }
2759        if !replacement.is_empty() {
2760            //eprintln!("DEBUG: precompile_inblock({}) -> {}", s, replacement);
2761            Cow::Owned(replacement)
2762        } else {
2763            Cow::Borrowed(s)
2764        }
2765    }
2766
2767    /// upon's templating syntax doesn't support some of the characters we use in names, this function substitutes them for more verbose equivalents
2768    fn precompile_name(&self, s: &str) -> String {
2769        let mut replacement = String::new();
2770        let mut begincondition = None;
2771        let mut skip = 0;
2772        for (i,c) in s.char_indices() {
2773            if begincondition.is_some() && c != ']' {
2774                continue;
2775            } else if skip > 0 {
2776                skip -= 1;
2777                continue;
2778            }
2779            if c == '$' {
2780                let slice = &s[i..];
2781                if slice.starts_with("$$..") {
2782                    replacement.push_str("ELEMENTS_PARENT");
2783                    skip = 3;
2784                } else if slice.starts_with("$$.") {
2785                    replacement.push_str("ELEMENTS_THIS");
2786                    skip = 2;
2787                } else if slice.starts_with("$$/") {
2788                    replacement.push_str("ELEMENTS_");
2789                    skip = 2;
2790                } else if slice.starts_with("$$") {
2791                    replacement.push_str("ELEMENTS_");
2792                    skip = 1;
2793                } else if slice.starts_with("$..") {
2794                    replacement.push_str("ELEMENT_PARENT");
2795                    skip = 2;
2796                } else if slice.starts_with("$.") {
2797                    replacement.push_str("ELEMENT_THIS");
2798                    skip = 1;
2799                } else if slice.starts_with("$/") {
2800                    replacement.push_str("ELEMENT_");
2801                    skip = 1;
2802                } else {
2803                    replacement.push_str("ELEMENT_");
2804                }
2805            } else if c == '@' {
2806                replacement.push_str("ATTRIB_");
2807            } else if c == '/' {
2808                replacement.push_str("_IN_");
2809            } else if c == ':' {
2810                replacement.push_str("__");
2811            } else if c == '[' {
2812                begincondition = Some(i+1);
2813            } else if c == ']' {
2814                //conditions are just stored as hashes
2815                if let Some(begin) = begincondition {
2816                    let mut hasher = DefaultHasher::new();
2817                    let _ = &s[begin..i].hash(&mut hasher);
2818                    let h = hasher.finish();
2819                    replacement.push_str(&format!("_COND{}_", h));
2820                }
2821                begincondition = None;
2822            } else {
2823                replacement.push(c);
2824            }
2825        }
2826        //eprintln!("DEBUG: precompile_name({}) -> {}", s, replacement);
2827        replacement
2828    }
2829
    /// Adds the annotations described by the `metadata` entries of the configuration to the store.
    ///
    /// For every entry an annotation is built from the rendered `id` template (if any) and the
    /// configured annotation data, and then targeted at the converter's resource: either with a
    /// `TextSelector` over the whole text or with a `ResourceSelector`.
    /// Templates are rendered against a copy of the global context, extended with a `resource`
    /// variable when the resource has an ID.
    ///
    /// # Errors
    /// Returns `XmlConversionError::TemplateError` if an id or dataset template fails to render,
    /// or if a key template fails to render or renders to an empty string while `skip_if_missing`
    /// is not set. Errors from extracting the value and from adding the annotation to the store
    /// are propagated.
    ///
    /// # Panics
    /// Panics if looking up the resource in the store fails, if no resource handle is set when a
    /// `TextSelector` is requested, or if the configured annotation handling is none of
    /// `TextSelector`, `ResourceSelector`, `None` or `Unspecified`.
    fn add_metadata(&self, store: &mut AnnotationStore) -> Result<(), XmlConversionError> {
        for metadata in self.config.metadata.iter() {
            let mut builder = AnnotationBuilder::new();

            let resource_id = if let Some(resource_handle) = self.resource_handle {
                store.resource(resource_handle).unwrap().id()
            } else {
                None
            };

            // every metadata entry renders against its own copy of the global context
            let mut context = self.global_context.clone();
            if let Some(resource_id) = resource_id {
                context.insert("resource".into(), resource_id.into());
            }

            // the annotation ID is optional; an empty rendering leaves the builder without an ID
            if let Some(template) = &metadata.id {
                let compiled_template = self.template_engine.template(template.as_str());
                let id = compiled_template.render(&context).to_string().map_err(|e| 
                        XmlConversionError::TemplateError(
                            format!(
                                "whilst rendering metadata id template '{}'",
                                template,
                            ),
                            Some(e),
                        )
                    )?;
                if !id.is_empty() {
                    builder = builder.with_id(id);
                }
            }

            for annotationdata in metadata.annotationdata.iter() {
                let mut databuilder = AnnotationDataBuilder::new();
                // dataset: rendered from the template if there is one, the configured default set otherwise
                if let Some(template) = &annotationdata.set {
                    let compiled_template = self.template_engine.template(template.as_str());
                    let dataset = compiled_template.render(&context).to_string().map_err(|e| 
                            XmlConversionError::TemplateError(
                                format!(
                                    "whilst rendering annotationdata/dataset template '{}' for metadata",
                                    template,
                                ),
                                Some(e),
                            )
                        )?;
                    if !dataset.is_empty() {
                        databuilder = databuilder.with_dataset(dataset.into())
                    }
                } else {
                    databuilder =
                        databuilder.with_dataset(self.config.default_set.as_str().into());
                }
                // key: an empty rendering or a render error is fatal, unless skip_if_missing is set,
                // in which case this annotation data is left out altogether
                if let Some(template) = &annotationdata.key {
                    let compiled_template = self.template_engine.template(template.as_str());
                    match compiled_template.render(&context).to_string().map_err(|e| 
                            XmlConversionError::TemplateError(
                                format!(
                                    "whilst rendering annotationdata/key template '{}' for metadata",
                                    template,
                                ),
                                Some(e),
                            )
                        )  {
                        Ok(key) if !key.is_empty() =>
                            databuilder = databuilder.with_key(key.into()) ,
                        Ok(_) if !annotationdata.skip_if_missing => {
                            return Err(XmlConversionError::TemplateError(
                                format!(
                                    "whilst rendering annotationdata/key template '{}' metadata",
                                    template,
                                ),
                                None
                            ));
                        },
                        Err(e) if !annotationdata.skip_if_missing => {
                            return Err(e)
                        },
                        _ => {
                            //skip whole databuilder if missing
                            continue
                        }
                    }
                }
                // value: when the extraction yields no value, this annotation data is left out
                if let Some(value) = &annotationdata.value {
                    match self.extract_value_metadata(value, &upon::Value::Map(context.clone()), annotationdata.allow_empty_value, annotationdata.skip_if_missing,  resource_id.as_deref())? {
                        Some(value) => {
                            databuilder = databuilder.with_value(value);
                        },
                        None =>  {
                            //skip whole databuilder if missing
                            continue
                        }
                    }
                }
                builder = builder.with_data_builder(databuilder);
            }



            // Finish the builder and add the actual annotation to the store, according to its element handling
            match metadata.annotation {
                XmlAnnotationHandling::TextSelector => {
                    // Annotation is on text, translates to TextSelector
                    builder = builder.with_target(SelectorBuilder::TextSelector(BuildItem::Handle(self.resource_handle.expect("resource must have handle")), Offset::whole()));
                    if self.config.debug {
                        eprintln!("[STAM fromxml]   builder AnnotateText: {:?}", builder);
                    }
                    store.annotate(builder)?;
                }
                XmlAnnotationHandling::ResourceSelector  | XmlAnnotationHandling::None | XmlAnnotationHandling::Unspecified => {
                    // Annotation is metadata (default), translates to ResourceSelector
                    builder = builder.with_target(SelectorBuilder::ResourceSelector(
                        self.resource_handle.into(),
                    ));
                    if self.config.debug {
                        eprintln!("[STAM fromxml]   builder AnnotateResource: {:?}", builder);
                    }
                    store.annotate(builder)?;
                }
                // any other handling makes no sense for metadata
                _ => panic!(
                    "Invalid annotationhandling for metadata: {:?}",
                    metadata.annotation
                ),
            }
        }
        Ok(())
    }
2956}
2957
2958
2959
2960/// Get recursive text without any elements
2961fn recursive_text(node: &Node) -> String {
2962    let mut s = String::new();
2963    for child in node.children() {
2964        if child.is_text() {
2965            s += child.text().expect("should have text");
2966        } else if child.is_element() {
2967            s += &recursive_text(&child);
2968        }
2969    }
2970    s
2971}
2972
2973// Filters
/// Template filter: returns `s` with its first character converted to uppercase.
///
/// The remainder of the string is copied unchanged. Uppercasing follows Unicode rules, so a
/// single first character may expand to several characters.
fn filter_capitalize(s: &str) -> String {
    let mut rest = s.chars();
    let mut capitalized = String::with_capacity(s.len());
    if let Some(first) = rest.next() {
        capitalized.extend(first.to_uppercase());
        capitalized.push_str(rest.as_str());
    }
    capitalized
}
2985
2986fn filter_gt(a: &upon::Value, b: &upon::Value) -> bool {
2987    match (a, b) {
2988        (upon::Value::Integer(a), upon::Value::Integer(b)) => *a > *b,
2989        (upon::Value::Float(a), upon::Value::Float(b)) => *a > *b,
2990        (upon::Value::String(a), upon::Value::String(b)) => *a > *b,
2991        _ => false,
2992    }
2993}
2994
2995fn filter_lt(a: &upon::Value, b: &upon::Value) -> bool {
2996    match (a, b) {
2997        (upon::Value::Integer(a), upon::Value::Integer(b)) => *a < *b,
2998        (upon::Value::Float(a), upon::Value::Float(b)) => *a < *b,
2999        (upon::Value::String(a), upon::Value::String(b)) => *a < *b,
3000        _ => false,
3001    }
3002}
3003
3004fn filter_gte(a: &upon::Value, b: &upon::Value) -> bool {
3005    match (a, b) {
3006        (upon::Value::Integer(a), upon::Value::Integer(b)) => *a >= *b,
3007        (upon::Value::Float(a), upon::Value::Float(b)) => *a >= *b,
3008        (upon::Value::String(a), upon::Value::String(b)) => *a >= *b,
3009        _ => false,
3010    }
3011}
3012
3013fn filter_lte(a: &upon::Value, b: &upon::Value) -> bool {
3014    match (a, b) {
3015        (upon::Value::Integer(a), upon::Value::Integer(b)) => *a <= *b,
3016        (upon::Value::Float(a), upon::Value::Float(b)) => *a <= *b,
3017        (upon::Value::String(a), upon::Value::String(b)) => *a <= *b,
3018        _ => false,
3019    }
3020}
3021
3022fn filter_add(a: &upon::Value, b: &upon::Value) -> upon::Value {
3023    match (a, b) {
3024        (upon::Value::Integer(a), upon::Value::Integer(b)) => upon::Value::Integer(a + b),
3025        (upon::Value::Float(a), upon::Value::Float(b)) => upon::Value::Float(a + b),
3026        (upon::Value::String(a), upon::Value::String(b)) => upon::Value::String(a.clone() + b),
3027        _ => upon::Value::None,
3028    }
3029}
3030
3031fn filter_sub(a: &upon::Value, b: &upon::Value) -> upon::Value {
3032    match (a, b) {
3033        (upon::Value::Integer(a), upon::Value::Integer(b)) => upon::Value::Integer(a - b),
3034        (upon::Value::Float(a), upon::Value::Float(b)) => upon::Value::Float(a - b),
3035        _ => upon::Value::None,
3036    }
3037}
3038
3039fn filter_mul(a: &upon::Value, b: &upon::Value) -> upon::Value {
3040    match (a, b) {
3041        (upon::Value::Integer(a), upon::Value::Integer(b)) => upon::Value::Integer(a * b),
3042        (upon::Value::Float(a), upon::Value::Float(b)) => upon::Value::Float(a * b),
3043        _ => upon::Value::None,
3044    }
3045}
3046
3047fn filter_div(a: &upon::Value, b: &upon::Value) -> upon::Value {
3048    match (a, b) {
3049        (upon::Value::Integer(a), upon::Value::Integer(b)) => upon::Value::Integer(a / b),
3050        (upon::Value::Float(a), upon::Value::Float(b)) => upon::Value::Float(a / b),
3051        _ => upon::Value::None,
3052    }
3053}
3054
3055
3056/// Map value between toml and upon. This makes a clone.
3057fn map_value(value: &toml::Value) -> upon::Value {
3058    match value {
3059        toml::Value::String(s) => upon::Value::String(s.clone()),
3060        toml::Value::Integer(i) => upon::Value::Integer(*i),
3061        toml::Value::Float(i) => upon::Value::Float(*i),
3062        toml::Value::Boolean(v) => upon::Value::Bool(*v),
3063        toml::Value::Datetime(s) => upon::Value::String(s.to_string()),
3064        toml::Value::Array(v) => upon::Value::List(v.iter().map(|i| map_value(i)).collect()),
3065        toml::Value::Table(v) => upon::Value::Map(v.iter().map(|(k,i)| (k.clone(),map_value(i))).collect()),
3066    }
3067}
3068
3069/// Parse a string that is a result from the template renderer to a DataValue again
3070#[inline]
3071fn string_to_datavalue(value: String, valuetype: Option<&str>) -> Result<DataValue,XmlConversionError> {
3072    match valuetype {
3073        Some("str") | Some("string")  => Ok(DataValue::String(value)),
3074        Some("int") => {
3075            if let Ok(value) = value.parse::<isize>() {
3076                Ok(DataValue::Int(value))
3077            } else {
3078                Err(XmlConversionError::TemplateError(format!("Unable to interpret value as integer: {}", value), None))
3079            }
3080        },
3081        Some("float") => {
3082            if let Ok(value) = value.parse::<f64>() {
3083                Ok(DataValue::Float(value))
3084            } else {
3085                Err(XmlConversionError::TemplateError(format!("Unable to interpret value as integer: {}", value), None))
3086            }
3087        },
3088        Some("bool") => match value.as_str() {
3089            "yes" | "true" | "enabled" | "on" | "1" | "active"  => Ok(DataValue::Bool(true)),
3090            _ => Ok(DataValue::Bool(false))
3091        },
3092        Some(x) => {
3093                Err(XmlConversionError::TemplateError(format!("Invalid valuetype: {}", x), None))
3094        }
3095        None => {
3096            //automatically determine type
3097            if let Ok(value) =  value.parse::<isize>() {
3098                Ok(DataValue::Int(value))
3099            } else if let Ok(value) =  value.parse::<f64>() {
3100                Ok(DataValue::Float(value))
3101            } else if value.starts_with("(list) [ ") && value.ends_with(" ]") {
3102                //deserialize lists again
3103                if let Ok(serde_json::Value::Array(values)) = serde_json::from_str(&value[6..]) {
3104                    Ok(DataValue::List(values.into_iter().map(|v| {
3105                        match v {
3106                            serde_json::Value::String(s) => DataValue::String(s),
3107                            serde_json::Value::Number(n) => if let Some(n) = n.as_i64() {
3108                                DataValue::Int(n as isize)
3109                            } else if let Some(n) = n.as_f64() {
3110                                DataValue::Float(n)
3111                            } else {
3112                                unreachable!("number should always be either int or float")
3113                            },
3114                            serde_json::Value::Bool(b) => DataValue::Bool(b),
3115                            _ => DataValue::Null, //nested arrays and maps are NOT supported here!
3116                        }
3117                    }).collect()))
3118                } else {
3119                    Err(XmlConversionError::TemplateError(format!("Unable to deserialize list value: {}", value), None))
3120                }
3121            } else {
3122                Ok(value.into())
3123            }
3124        }
3125    }
3126}
3127
3128fn string_to_templatevalue(value: String) -> upon::Value {
3129    if let Ok(value) =  value.parse::<i64>() {
3130        upon::Value::Integer(value)
3131    } else if let Ok(value) =  value.parse::<f64>() {
3132        upon::Value::Float(value)
3133    } else {
3134        upon::Value::String(value)
3135    }
3136}
3137
3138/// Custom formatter for templating that can also handle lists (the default one in upon can't)
3139/// Lists will be output JSON-style prepended by the marker text "(list) ", this allows deserialisers to turn it into a list again
3140fn value_formatter(f: &mut upon::fmt::Formatter<'_>, value: &upon::Value) -> upon::fmt::Result {
3141    match value {
3142        upon::Value::List(vs) => {
3143            f.write_str("(list) [ ")?;
3144            for (i, v) in vs.iter().enumerate() {
3145                if i > 0 {
3146                    f.write_str(", ")?;
3147                }
3148                if let upon::Value::String(s) = v {
3149                    write!(f, "\"{}\"", s.replace("\"","\\\"").replace("\n"," ").split_whitespace().collect::<Vec<_>>().join(" "))?;
3150                } else {
3151                    upon::fmt::default(f, v)?;
3152                    f.write_char('"')?;
3153                }
3154            }
3155            f.write_str(" ]")?;
3156        }
3157        v => upon::fmt::default(f, v)?, // fallback to default formatter
3158    };
3159    Ok(())
3160}
3161
/// Describes an external program that is made available as a template filter.
///
/// Running the filter spawns `command` with `args`, passes the input value to it and turns
/// whatever the program writes to its standard output back into a template value.
#[derive(Clone,Debug,Deserialize)]
struct ExternalFilter {
    /// The name of the filter
    name: String,

    /// The command to run.
    command: String,

    /// The arguments to pass to the command, you can use "{{ value }}" or `$value` to represent the input value if needed. It will also be passed to stdin. No escaping needed, it is not mediated by a shell.
    args: Vec<String>
}
3173
3174impl ExternalFilter {
3175    //TODO: panic may be too strict in here:
3176    fn run(&self, input_value: &upon::Value) -> upon::Value {
3177        let process = Command::new(self.command.as_str()).args(
3178            //args are passed directly, not mediated via shell, so no escaping necessary
3179            self.args.iter().map(|x| if x == "{{value}}" || x == "{{ value }}" || x == "$value" {
3180                match input_value {
3181                    upon::Value::String(s) => s.clone(),
3182                    upon::Value::Integer(d) => format!("{}",d),
3183                    upon::Value::Float(d) => format!("{}",d),
3184                    upon::Value::Bool(d) => format!("{}",d),
3185                    upon::Value::None => String::new(),
3186                    _ => panic!("Lists and maps are not supported to be passed as parameter to  external filters yet!"), 
3187                }
3188            } else {
3189                x.clone() //too much cloning, but Cow didn't work here because it is coerced into OsStr later
3190            })
3191        ).stdin(Stdio::piped()).stdout(Stdio::piped()).spawn();
3192
3193
3194        if let Ok(mut process) = process {
3195            {
3196                let mut outstdin = process.stdin.take().expect("unable to open stdin for external filter");
3197                let mut writer = BufWriter::new(&mut outstdin);
3198                match input_value {
3199                    upon::Value::String(s) => writer.write(s.as_bytes()),
3200                    upon::Value::Integer(d) => writer.write(format!("{}",d).as_bytes()),
3201                    upon::Value::Float(d) => writer.write(format!("{}",d).as_bytes()),
3202                    upon::Value::Bool(d) => writer.write(format!("{}",d).as_bytes()),
3203                    upon::Value::None => writer.write(&[]),
3204                    _ => panic!("Lists and maps are not supported to be passed as input to external filters yet!"),
3205                }.expect("Writing to stdin for external filter failed!");
3206                //block ensures writer and outputsdin are dropped prior to waiting for output
3207            }
3208            let output = process.wait_with_output().expect("External filter wasn't running");
3209            if !output.status.success() {
3210                panic!("External filter {} failed ({:?})", self.name, output.status.code());
3211            }
3212            if let Ok(s) = String::from_utf8(output.stdout) {
3213                return string_to_templatevalue(s);
3214            } else {
3215                panic!("External filter {} produced invalid UTF-8!", self.name);
3216            }
3217        }
3218        panic!("External filter {} failed!", self.name);
3219    }
3220}
3221
3222#[cfg(test)]
3223mod tests {
3224    use super::*;
3225    //use crate::info::info;
3226
3227    const XMLSMALLEXAMPLE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
3228<head><title>test</title></head><body><h1>TEST</h1><p xml:id="p1" n="001">This  is a <em xml:id="emphasis" style="color:green">test</em>.</p></body></html>"#;
3229
3230    const XMLEXAMPLE: &'static str = r#"<!DOCTYPE entities[<!ENTITY nbsp "&#xA0;">]>
3231<html xmlns="http://www.w3.org/1999/xhtml" xmlns:my="http://example.com">
3232<head>
3233    <title>Test</title>
3234    <meta name="author" content="proycon" />
3235</head>
3236<body>
3237    <h1>Header</h1>
3238
3239    <p xml:id="par1">
3240        <span xml:id="sen1">This is a sentence.</span>
3241        <span xml:id="sen2">This is the second&nbsp;sentence.</span>
3242    </p>
3243    <p xml:id="par2">
3244        <strong>This</strong> is    the <em>second</em> paragraph.
3245            It has a <strong>bold</strong> word and one in <em>italics</em>.<br/>
3246        Let's highlight stress in the following word: <span my:stress="secondary">re</span>pu<span my:stress="primary">ta</span>tion.
3247    </p>
3248    <p xml:space="preserve"><![CDATA[This    third
3249paragraph consists
3250of CDATA and is configured to preserve whitespace, and weird &entities; ]]></p>
3251
3252    <h2>Subsection</h2>
3253
3254    <p>
3255    Have some fruits:<br/>
3256    <ul xml:id="list1" class="fruits">
3257        <li xml:id="fruit1">apple</li>
3258        <li xml:id="fruit2">banana</li>
3259        <li xml:id="fruit3">melon</li>
3260    </ul>
3261    </p>
3262
3263    Some lingering text outside of any confines...
3264</body>
3265</html>"#;
3266
3267    const XMLEXAMPLE_TEXTOUTPUT: &'static str = "Header\n\nThis is a sentence. This is the second sentence.\n\nThis is the second paragraph. It has a bold word and one in italics.\nLet's highlight stress in the following word: reputation.\n\nThis    third\nparagraph consists\nof CDATA and is configured to preserve whitespace, and weird &entities; \nSubsection\n\nHave some fruits:\n* apple\n* banana\n* melon\n\nSome lingering text outside of any confines...";
3268
3269    //fake example (not real HTML, testing TEI-like space attribute with complex template)
3270    const XMLTEISPACE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
3271<body><space dim="vertical" unit="lines" quantity="3" /></body></html>"#;
3272
3273    const CONF: &'static str = r#"#default whitespace handling (Collapse or Preserve)
3274whitespace = "Collapse"
3275default_set = "urn:stam-fromhtml"
3276
3277[namespaces]
3278#this defines the namespace prefixes you can use in this configuration
3279xml = "http://www.w3.org/XML/1998/namespace"
3280html = "http://www.w3.org/1999/xhtml"
3281xsd =  "http://www.w3.org/2001/XMLSchema"
3282xlink = "http://www.w3.org/1999/xlink"
3283
3284# elements and attributes are matched in reverse-order, so put more generic statements before more specific ones
3285
3286#Define some base elements that we reuse later for actual elements (prevents unnecessary repetition)
3287[baseelements.common]
3288id = "{% if ?.@xml:id %}{{ @xml:id }}{% endif %}"
3289
3290    [[baseelements.common.annotationdata]]
3291    key = "type"
3292    value = "{{ localname }}"
3293
3294    [[baseelements.common.annotationdata]]
3295    key = "lang"
3296    value = "{{ @xml:lang }}"
3297    skip_if_missing = true
3298
3299    [[baseelements.common.annotationdata]]
3300    key = "n"
3301    value = "{{ @n }}"
3302    skip_if_missing = true
3303    valuetype = "int"
3304
3305    [[baseelements.common.annotationdata]]
3306    key = "nstring"
3307    value = "{{ @n }}"
3308    skip_if_missing = true
3309    valuetype = "string"
3310
3311    [[baseelements.common.annotationdata]]
3312    key = "style"
3313    value = "{{ @style }}"
3314    skip_if_missing = true
3315
3316    [[baseelements.common.annotationdata]]
3317    key = "class"
3318    value = "{{ @class }}"
3319    skip_if_missing = true
3320
3321    [[baseelements.common.annotationdata]]
3322    key = "src"
3323    value = "{{ @src }}"
3324    skip_if_missing = true
3325
3326[baseelements.text]
3327text = true
3328
3329
3330[[elements]]
3331base = [ "text", "common" ]
3332path = "*"
3333text = true
3334annotation = "TextSelector"
3335
3336# Pass through the following elements without mapping to text
3337[[elements]]
3338base = [ "common" ]
3339path = "//html:head"
3340
3341[[elements]]
3342base = [ "common" ]
3343path = "//html:head//*"
3344
3345# Map metadata like <meta name="key" content="value"> to annotations with key->value data selecting the resource (ResourceSelector)
3346[[elements]]
3347base = [ "common" ]
3348path = "//html:head//html:meta"
3349
3350[[elements.annotationdata]]
3351key = "{% if ?.@name %}{{ name }}{% endif %}"
3352value = "{% if ?.@content %}{{ @content }}{% endif %}"
3353skip_if_missing = true
3354
3355# By default, ignore any tags in the head (unless they're mentioned specifically later in the config)
3356[[elements]]
3357path = "//html:head/html:title"
3358annotation = "ResourceSelector"
3359
3360[[elements.annotationdata]]
3361key = "title"
3362value = "{{ $. | trim }}"
3363
3364
3365# Determine how various structural elements are converted to text
3366
3367[[elements]]
3368base = [ "common" ]
3369path = "//html:br"
3370textsuffix = "\n"
3371
3372[[elements]]
3373base = [ "common", "text" ]
3374path = "//html:p"
3375textprefix = "\n"
3376textsuffix = "\n"
3377annotation = "TextSelector"
3378
3379# Let's do headers and bulleted lists like markdown
3380[[elements]]
3381base = [ "common", "text" ]
3382path = "//html:h1"
3383textsuffix = "\n"
3384
3385[[elements]]
3386base = [ "common", "text" ]
3387path = "//html:h2"
3388textsuffix = "\n"
3389
3390#Generic, will be overriden by more specific one
3391[[elements]]
3392base = [ "common", "text" ]
3393path = "//html:li"
3394textprefix = "- "
3395textsuffix = "\n"
3396
3397[[elements]]
3398base = [ "common", "text" ]
3399path = """//html:body"""
3400annotation = "TextSelector"
3401id = "body"
3402
3403    [[elements.annotationdata]]
3404    key = "title_from_parent"
3405    value = "{{ $../html:head/html:title }}"
3406    skip_if_missing = true
3407
3408    [[elements.annotationdata]]
3409    key = "title_from_root"
3410    value = "{{ $/html:html/html:head/html:title }}"
3411    skip_if_missing = true
3412
3413    [[elements.annotationdata]]
3414    key = "firstfruit"
3415    value = """{{ $./html:p/html:ul/html:li }}"""
3416    skip_if_missing = true
3417
3418    [[elements.annotationdata]]
3419    key = "fruits"
3420    value = """{{ $$./html:p/html:ul/html:li }}"""
3421    skip_if_missing = true
3422
3423    [[elements.annotationdata]]
3424    key = "multifruits"
3425    value = """{{ $$./html:p/html:ul/html:li }}"""
3426    skip_if_missing = true
3427    multiple = true
3428
3429#More specific one takes precendence over the above generic one
3430[[elements]]
3431base = [ "common", "text" ]
3432path = """//html:ul[@class="fruits"]/html:li"""
3433textprefix = "* "
3434textsuffix = "\n"
3435
3436#Not real HTML, test-case modelled after TEI space
3437[[elements]]
3438base = [ "common" ]
3439path = """//html:space[@dim="vertical" and @unit="lines"]"""
3440text = true
3441textsuffix = """\n{% for x in @quantity | int | as_range %}\n{% endfor %}"""
3442
3443
3444[[elements]]
3445base = [ "common", "text" ]
3446path = "//html:example"
3447annotation = "TextSelector"
3448
3449[[elements.annotationdata]]
3450key = "requiredattrib"
3451value = "{{ @requiredattrib }}"
3452
3453[[elements.annotationdata]]
3454key = "optattrib"
3455value = "{{ ?.@optattrib }}"
3456
3457[[elements]]
3458base = [ "common","text" ]
3459path = "//html:marquee"
3460annotation = "TextSelector"
3461
3462#map value, some bogus data to test parsing
3463[[elements.annotationdata]]
3464key = "map"
3465
3466[elements.annotationdata.value]
3467text = "{{ $. }}"
3468number = 42
3469bogus = true
3470
3471[[metadata]]
3472id = "metadata"
3473
3474[[metadata.annotationdata]]
3475key = "author"
3476value = "proycon"
3477"#;
3478
3479    const XMLREQATTRIBEXAMPLE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
3480<body><example xml:id="ann1" requiredattrib="blah">test</example></body></html>"#;
3481
3482    const XMLREQATTRIBEXAMPLE2: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
3483<body><example xml:id="ann1">test</example></body></html>"#;
3484
3485    const XMLREQATTRIBEXAMPLE3: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
3486<body><example xml:id="ann1" requiredattrib="blah" optattrib="blah">test</example></body></html>"#;
3487
3488    const XMLMAPEXAMPLE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
3489<body><marquee xml:id="ann1">test</marquee></body></html>"#;
3490
3491    #[test]
3492    fn test_precompile_template_nochange() -> Result<(), String> {
3493        let config = XmlConversionConfig::new();
3494        let mut conv = XmlToStamConverter::new(&config);
3495        let template_in = "{{ foo }}";
3496        let template_out = conv.precompile(template_in);
3497        assert_eq!( template_out, template_in);
3498        //foo is not a special variable
3499        assert!(!conv.variables.get(template_in).as_ref().unwrap().contains("foo"));
3500        Ok(())
3501    }
3502
3503    #[test]
3504    fn test_precompile_template_attrib() -> Result<(), String> {
3505        let config = XmlConversionConfig::new();
3506        let mut conv = XmlToStamConverter::new(&config);
3507        let template_in = "{{ @foo }}";
3508        let template_out = conv.precompile(template_in);
3509        assert_eq!(template_out, "{{ ATTRIB_foo }}");
3510        //foo is an attribute so is returned 
3511        assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@foo"));
3512        Ok(())
3513    }
3514
3515    #[test]
3516    fn test_precompile_template_attrib_ns() -> Result<(), String> {
3517        let config = XmlConversionConfig::new();
3518        let mut conv = XmlToStamConverter::new(&config);
3519        let template_in = "{{ @bar:foo }}";
3520        let template_out = conv.precompile(template_in);
3521        assert_eq!(template_out, "{{ ATTRIB_bar__foo }}");
3522        //foo is an attribute so is returned 
3523        assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@bar:foo"));
3524        Ok(())
3525    }
3526
3527    #[test]
3528    fn test_precompile_template_element() -> Result<(), String> {
3529        let config = XmlConversionConfig::new();
3530        let mut conv = XmlToStamConverter::new(&config);
3531        let template_in = "{{ $foo }}";
3532        let template_out = conv.precompile(template_in);
3533        assert_eq!(template_out, "{{ ELEMENT_foo }}");
3534        //foo is an element so is returned 
3535        assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$foo"));
3536        Ok(())
3537    }
3538
3539    #[test]
3540    fn test_precompile_template_element_ns() -> Result<(), String> {
3541        let config = XmlConversionConfig::new();
3542        let mut conv = XmlToStamConverter::new(&config);
3543        let template_in = "{{ $bar:foo }}";
3544        let template_out = conv.precompile(template_in);
3545        assert_eq!(template_out, "{{ ELEMENT_bar__foo }}");
3546        //foo is an element so is returned 
3547        assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$bar:foo"));
3548        Ok(())
3549    }
3550
3551    #[test]
3552    fn test_precompile_template_this_text() -> Result<(), String> {
3553        let config = XmlConversionConfig::new();
3554        let mut conv = XmlToStamConverter::new(&config);
3555        let template_in = "{{ $. }}";
3556        let template_out = conv.precompile(template_in);
3557        assert_eq!(template_out, "{{ ELEMENT_THIS }}");
3558        assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$."));
3559        Ok(())
3560    }
3561
3562    #[test]
3563    fn test_precompile_template_parent_text() -> Result<(), String> {
3564        let config = XmlConversionConfig::new();
3565        let mut conv = XmlToStamConverter::new(&config);
3566        let template_in = "{{ $.. }}";
3567        let template_out = conv.precompile(template_in);
3568        assert_eq!(template_out, "{{ ELEMENT_PARENT }}");
3569        assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$.."));
3570        Ok(())
3571    }
3572
3573    #[test]
3574    fn test_precompile_template_elements() -> Result<(), String> {
3575        let config = XmlConversionConfig::new();
3576        let mut conv = XmlToStamConverter::new(&config);
3577        let template_in = "{{ $$foo }}";
3578        let template_out = conv.precompile(template_in);
3579        assert_eq!(template_out, "{{ ELEMENTS_foo }}");
3580        assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$$foo"));
3581        Ok(())
3582    }
3583
3584    #[test]
3585    fn test_precompile_template_elements_ns() -> Result<(), String> {
3586        let config = XmlConversionConfig::new();
3587        let mut conv = XmlToStamConverter::new(&config);
3588        let template_in = "{{ $$bar:foo }}";
3589        let template_out = conv.precompile(template_in);
3590        assert_eq!(template_out, "{{ ELEMENTS_bar__foo }}");
3591        assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$$bar:foo"));
3592        Ok(())
3593    }
3594
3595
3596    #[test]
3597    fn test_precompile_template_attrib2() -> Result<(), String> {
3598        let config = XmlConversionConfig::new();
3599        let mut conv = XmlToStamConverter::new(&config);
3600        let template_in = "{% for x in @foo %}";
3601        let template_out = conv.precompile(template_in);
3602        assert_eq!(template_out, "{% for x in ATTRIB_foo %}");
3603        //foo is an attribute so is returned 
3604        assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@foo"));
3605        Ok(())
3606    }
3607
3608    #[test]
3609    fn test_precompile_template_attrib3() -> Result<(), String> {
3610        let config = XmlConversionConfig::new();
3611        let mut conv = XmlToStamConverter::new(&config);
3612        let template_in = "{{ ?.@foo }}";
3613        let template_out = conv.precompile(template_in);
3614        assert_eq!(template_out, "{{ ?.ATTRIB_foo }}");
3615        assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@foo"));
3616        Ok(())
3617    }
3618
3619    #[test]
3620    fn test_precompile_template_path() -> Result<(), String> {
3621        let config = XmlConversionConfig::new();
3622        let mut conv = XmlToStamConverter::new(&config);
3623        let template_in = "{{ $x/y/z/@a }}";
3624        let template_out = conv.precompile(template_in);
3625        assert_eq!(template_out, "{{ ELEMENT_x_IN_y_IN_z_IN_ATTRIB_a }}");
3626        assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$x/y/z/@a"));
3627        Ok(())
3628    }
3629
3630    #[test]
3631    fn test_loadconfig() -> Result<(), String> {
3632        let config = XmlConversionConfig::from_toml_str(CONF)?;
3633        let mut conv = XmlToStamConverter::new(&config);
3634        conv.compile().map_err(|e| format!("{}",e))?;
3635        assert_eq!(conv.config.namespaces.len(),4 , "number of namespaces");
3636        assert_eq!(conv.config.elements.len(), 15, "number of elements");
3637        assert_eq!(conv.config.baseelements.len(), 2, "number of baseelements");
3638        assert_eq!(conv.config.elements.get(0).unwrap().annotationdata.len(), 7,"number of annotationdata under first element");
3639        assert_eq!(conv.config.baseelements.get("common").unwrap().annotationdata.len(), 7,"number of annotationdata under baseelement common");
3640        Ok(())
3641    }
3642
3643    #[test]
3644    fn test_small() -> Result<(), String> {
3645        let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
3646        let mut store = stam::AnnotationStore::new(stam::Config::new());
3647        from_xml_in_memory("test", XMLSMALLEXAMPLE, &config, &mut store)?;
3648        let res = store.resource("test").expect("resource must have been created at this point");
3649        assert_eq!(res.text(), "TEST\n\nThis is a test.\n", "resource text");
3650        assert_eq!(store.annotations_len(), 6, "number of annotations");
3651        let annotation = store.annotation("emphasis").expect("annotation must have been created at this point");
3652        assert_eq!(annotation.text_simple(), Some("test"));
3653        //eprintln!("DEBUG: {:?}",annotation.data().collect::<Vec<_>>());
3654        let key = store.key("urn:stam-fromhtml", "style").expect("key must exist");
3655        assert_eq!(annotation.data().filter_key(&key).value_as_str(), Some("color:green"));
3656        let key = store.key("urn:stam-fromhtml", "title").expect("key must exist");
3657        let annotation = res.annotations_as_metadata().filter_key(&key).next().expect("annotation");
3658        assert_eq!(annotation.data().filter_key(&key).value_as_str(), Some("test"));
3659        let bodyannotation = store.annotation("body").expect("body annotation not found");
3660        let title1 = store.key("urn:stam-fromhtml", "title_from_parent").expect("key must exist");
3661        let title2 = store.key("urn:stam-fromhtml", "title_from_root").expect("key must exist");
3662        assert_eq!(bodyannotation.data().filter_key(&title1).value_as_str(), Some("test"));
3663        assert_eq!(bodyannotation.data().filter_key(&title2).value_as_str(), Some("test"));
3664        Ok(())
3665    }
3666
3667    #[test]
3668    fn test_full() -> Result<(), String> {
3669        let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
3670        let mut store = stam::AnnotationStore::new(stam::Config::new());
3671        from_xml_in_memory("test", XMLEXAMPLE, &config, &mut store)?;
3672        let res = store.resource("test").expect("resource must have been created at this point");
3673        assert_eq!(res.text(), XMLEXAMPLE_TEXTOUTPUT, "resource text");
3674        Ok(())
3675    }
3676
3677    #[test]
3678    fn test_firstfruit() -> Result<(), String> {
3679        let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
3680        let mut store = stam::AnnotationStore::new(stam::Config::new());
3681        from_xml_in_memory("test", XMLEXAMPLE, &config, &mut store)?;
3682        let bodyannotation = store.annotation("body").expect("body annotation not found");
3683        let fruit = store.key("urn:stam-fromhtml", "firstfruit").expect("key must exist");
3684        assert_eq!(bodyannotation.data().filter_key(&fruit).value_as_str(), Some("apple") );
3685        Ok(())
3686    }
3687
3688    #[test]
3689    fn test_fruits() -> Result<(), String> {
3690        let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
3691        let mut store = stam::AnnotationStore::new(stam::Config::new());
3692        from_xml_in_memory("test", XMLEXAMPLE, &config, &mut store)?;
3693        let bodyannotation = store.annotation("body").expect("body annotation not found");
3694        let fruits = store.key("urn:stam-fromhtml", "fruits").expect("key must exist");
3695        assert_eq!(bodyannotation.data().filter_key(&fruits).value(), Some(&DataValue::List(vec!("apple".into(),"banana".into(),"melon".into()) )));
3696        Ok(())
3697    }
3698
3699    #[test]
3700    fn test_multifruits() -> Result<(), String> {
3701        let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
3702        let mut store = stam::AnnotationStore::new(stam::Config::new());
3703        from_xml_in_memory("test", XMLEXAMPLE, &config, &mut store)?;
3704        let bodyannotation = store.annotation("body").expect("body annotation not found");
3705        let fruits = store.key("urn:stam-fromhtml", "multifruits").expect("key must exist");
3706        let results: Vec<_> = bodyannotation.data().filter_key(&fruits).collect();
3707        assert_eq!(results.len(), 3);
3708        assert_eq!(results.get(0).unwrap().value(),&DataValue::String("apple".to_string()) );
3709        assert_eq!(results.get(1).unwrap().value(),&DataValue::String("banana".to_string()) );
3710        assert_eq!(results.get(2).unwrap().value(),&DataValue::String("melon".to_string()) );
3711        Ok(())
3712    }
3713
3714    #[test]
3715    fn test_teispace() -> Result<(), String> {
3716        let config = XmlConversionConfig::from_toml_str(CONF)?;
3717        let mut store = stam::AnnotationStore::new(stam::Config::new());
3718        from_xml_in_memory("test", XMLTEISPACE, &config, &mut store)?;
3719        let res = store.resource("test").expect("resource must have been created at this point");
3720        assert_eq!(res.text(), "\n\n\n\n", "resource text");
3721        Ok(())
3722    }
3723
3724
3725    #[test]
3726    fn test_reqattrib() -> Result<(), String> {
3727        let config = XmlConversionConfig::from_toml_str(CONF)?;
3728        let mut store = stam::AnnotationStore::new(stam::Config::new());
3729        from_xml_in_memory("test", XMLREQATTRIBEXAMPLE, &config, &mut store)?;
3730        let res = store.resource("test").expect("resource must have been created at this point");
3731        assert_eq!(res.text(), "test", "resource text");
3732        let key = store.key("urn:stam-fromhtml", "requiredattrib").expect("key must exist");
3733        let annotation = store.annotation("ann1").expect("annotation");
3734        assert_eq!(annotation.data().filter_key(&key).value_as_str(), Some("blah"));
3735        assert!(store.key("urn:stam-fromhtml", "optattrib").is_none(), "optional attrib is unused");
3736        Ok(())
3737    }
3738
3739    #[test]
3740    fn test_reqattrib2() -> Result<(), String> {
3741        let mut config = XmlConversionConfig::from_toml_str(CONF)?;
3742        config = config.with_debug(true);
3743        let mut store = stam::AnnotationStore::new(stam::Config::new());
3744        assert!(from_xml_in_memory("test", XMLREQATTRIBEXAMPLE2, &config, &mut store).is_err(), "checking if error is returned");
3745        Ok(())
3746    }
3747
3748    #[test]
3749    fn test_reqattrib3() -> Result<(), String> {
3750        let config = XmlConversionConfig::from_toml_str(CONF)?;
3751        let mut store = stam::AnnotationStore::new(stam::Config::new());
3752        from_xml_in_memory("test", XMLREQATTRIBEXAMPLE3, &config, &mut store)?;
3753        let res = store.resource("test").expect("resource must have been created at this point");
3754        assert_eq!(res.text(), "test", "resource text");
3755        let reqkey = store.key("urn:stam-fromhtml", "requiredattrib").expect("key must exist");
3756        let optkey = store.key("urn:stam-fromhtml", "optattrib").expect("key optattrib must exist");
3757        let annotation = store.annotation("ann1").expect("annotation");
3758        assert_eq!(annotation.data().filter_key(&reqkey).value_as_str(), Some("blah"));
3759        assert_eq!(annotation.data().filter_key(&optkey).value_as_str(), Some("blah"));
3760        Ok(())
3761    }
3762
3763    #[test]
3764    fn test_map() -> Result<(), String> {
3765        let config = XmlConversionConfig::from_toml_str(CONF)?;
3766        let mut store = stam::AnnotationStore::new(stam::Config::new());
3767        from_xml_in_memory("test", XMLMAPEXAMPLE, &config, &mut store)?;
3768        let res = store.resource("test").expect("resource must have been created at this point");
3769        assert_eq!(res.text(), "test", "resource text");
3770        let key = store.key("urn:stam-fromhtml", "map").expect("key must exist");
3771        let annotation = store.annotation("ann1").expect("annotation");
3772        let data = annotation.data().filter_key(&key).value().expect("data must exist");
3773        if let DataValue::Map(data) = data {
3774            assert_eq!(data.get("text"), Some(&DataValue::String("test".into())));
3775            assert_eq!(data.get("number"), Some(&DataValue::Int(42)));
3776            assert_eq!(data.get("bogus"), Some(&DataValue::Bool(true)));
3777            assert_eq!(data.len(), 3);
3778        } else {
3779            assert!(false, "Data is supposed to be a map");
3780        }
3781        Ok(())
3782    }
3783
3784    #[test]
3785    fn test_metadata() -> Result<(), String> {
3786        let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
3787        let mut store = stam::AnnotationStore::new(stam::Config::new());
3788        from_xml_in_memory("test", XMLEXAMPLE, &config, &mut store)?;
3789        let annotation = store.annotation("metadata").expect("annotation");
3790        let key = store.key("urn:stam-fromhtml", "author").expect("key must exist");
3791        let data = annotation.data().filter_key(&key).value().expect("data must exist");
3792        assert_eq!(data, &DataValue::String("proycon".into()));
3793        Ok(())
3794    }
3795
3796    #[test]
3797    fn test_datavalue_int() -> Result<(), String> {
3798        let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
3799        let mut store = stam::AnnotationStore::new(stam::Config::new());
3800        from_xml_in_memory("test", XMLSMALLEXAMPLE, &config, &mut store)?;
3801        let annotation = store.annotation("p1").expect("annotation not found");
3802        let key = store.key("urn:stam-fromhtml", "n").expect("key must exist");
3803        assert_eq!(annotation.data().filter_key(&key).value(), Some(&DataValue::Int(1)));
3804        Ok(())
3805    }
3806
3807    #[test]
3808    fn test_datavalue_string() -> Result<(), String> {
3809        let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
3810        let mut store = stam::AnnotationStore::new(stam::Config::new());
3811        from_xml_in_memory("test", XMLSMALLEXAMPLE, &config, &mut store)?;
3812        let annotation = store.annotation("p1").expect("annotation not found");
3813        let key = store.key("urn:stam-fromhtml", "nstring").expect("key must exist");
3814        assert_eq!(annotation.data().filter_key(&key).value(), Some(&DataValue::String("001".to_string())));
3815        Ok(())
3816    }
3817
3818}