use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap, BTreeSet};
use std::fmt::Display;
use std::fs::read_to_string;
use std::path::Path;
use std::hash::{Hash,DefaultHasher,Hasher};
use std::process::{Command, Stdio};
use std::io::{ BufWriter, Write};
use roxmltree::{Document, Node, NodeId, ParsingOptions};
use serde::Deserialize;
use stam::*;
use toml;
use upon::Engine;
use std::fmt::Write as FmtWrite;
use serde_json;
/// Namespace URI bound to the reserved `xml:` prefix; used in this file to look up the `xml:space` attribute.
const NS_XML: &str = "http://www.w3.org/XML/1998/namespace";
/// URL of the Web Annotation JSON-LD context.
/// NOTE(review): not referenced in the visible part of this file; confirm where it is used.
const CONTEXT_ANNO: &str = "http://www.w3.org/ns/anno.jsonld";
/// Serde default for `XmlConversionConfig::default_set`: the identifier used
/// when the configuration does not name a set itself.
fn default_set() -> String {
    String::from("urn:stam-fromxml")
}
/// Configuration that drives the XML conversion. It can be deserialized with serde
/// (see `from_toml_str`) or built programmatically through the `with_*` methods.
#[derive(Deserialize)]
pub struct XmlConversionConfig {
/// Per-element configurations. When several match a node, the one listed last wins
/// (`element_config` searches this list back to front).
#[serde(default)]
elements: Vec<XmlElementConfig>,
/// Named base configurations that entries in `elements` can inherit from via their `base` list.
#[serde(default)]
baseelements: HashMap<String, XmlElementConfig>,
/// Maps a namespace prefix (as used in path expressions) to its namespace URI.
#[serde(default)]
namespaces: HashMap<String, String>,
/// Whitespace handling passed to text extraction for the root element; defaults to `Collapse`.
#[serde(default = "XmlWhitespaceHandling::collapse")]
whitespace: XmlWhitespaceHandling,
/// Extra key/value pairs, see `add_context`.
/// NOTE(review): presumably exposed to templates as global context; the consumer is outside this view.
#[serde(default)]
context: HashMap<String, toml::Value>,
/// Metadata configurations; their templates are compiled together with the element templates.
#[serde(default)]
metadata: Vec<MetadataConfig>,
/// Text that is prepended to an input document that has no `<!DOCTYPE` near its start.
#[serde(default)]
inject_dtd: Option<String>,
/// Defaults to `"urn:stam-fromxml"`.
/// NOTE(review): presumably the fallback annotation data set; usage is outside this view.
#[serde(default = "default_set")]
default_set: String,
/// NOTE(review): presumably a prefix for generated identifiers; usage is outside this view.
#[serde(default)]
id_prefix: Option<String>,
/// Suffixes that `filename_to_id` strips from a filename (the first matching suffix is used).
#[serde(default)]
id_strip_suffix: Vec<String>,
/// NOTE(review): presumably toggles provenance output; usage is outside this view.
#[serde(default)]
provenance: bool,
/// External filters, each registered as a template function under its own name.
#[serde(default)]
external_filters: Vec<ExternalFilter>,
/// Enables diagnostic output on standard error. Never read from the serialized configuration.
#[serde(skip_deserializing)]
debug: bool,
}
impl XmlConversionConfig {
    /// Creates an empty configuration: no elements, namespaces, context or metadata,
    /// whitespace collapsing enabled, and the default set identifier.
    pub fn new() -> Self {
        Self {
            debug: false,
            provenance: false,
            whitespace: XmlWhitespaceHandling::Collapse,
            default_set: default_set(),
            inject_dtd: None,
            id_prefix: None,
            id_strip_suffix: Vec::new(),
            elements: Vec::new(),
            baseelements: HashMap::new(),
            namespaces: HashMap::new(),
            context: HashMap::new(),
            metadata: Vec::new(),
            external_filters: Vec::new(),
        }
    }

    /// Merges the settings of all referenced base elements into each element
    /// configuration that names any in its `base` list.
    ///
    /// Bases are applied from the last listed to the first; since `update` only fills
    /// in what is still unset, bases applied earlier take precedence.
    ///
    /// # Errors
    /// Returns `XmlConversionError::ConfigError` when a base name is not present in
    /// `baseelements`.
    pub fn resolve_baseelements(&mut self) -> Result<(), XmlConversionError> {
        let mut resolved: Vec<(usize, XmlElementConfig)> = Vec::new();
        for (index, element) in self.elements.iter().enumerate() {
            // Stays `None` for elements without bases, so those are left untouched.
            let mut merged: Option<XmlElementConfig> = None;
            for basename in element.base.iter().rev() {
                match self.baseelements.get(basename) {
                    Some(baseelement) => merged
                        .get_or_insert_with(|| element.clone())
                        .update(baseelement),
                    None => {
                        return Err(XmlConversionError::ConfigError(format!(
                            "No such base element: {}",
                            basename
                        )))
                    }
                }
            }
            if let Some(merged) = merged {
                resolved.push((index, merged));
            }
        }
        // Write back only after the read-only pass over `self.elements` has finished.
        for (index, merged) in resolved {
            self.elements[index] = merged;
        }
        Ok(())
    }

    /// Parses a configuration from a TOML string and resolves its base elements.
    ///
    /// # Errors
    /// Returns the formatted parse error or base-element resolution error.
    pub fn from_toml_str(tomlstr: &str) -> Result<Self, String> {
        let mut config = toml::from_str::<Self>(tomlstr).map_err(|e| e.to_string())?;
        config.resolve_baseelements().map_err(|e| e.to_string())?;
        Ok(config)
    }

    /// Enables or disables diagnostic output on standard error.
    pub fn with_debug(self, value: bool) -> Self {
        Self {
            debug: value,
            ..self
        }
    }

    /// Sets the `provenance` flag.
    pub fn with_provenance(self, value: bool) -> Self {
        Self {
            provenance: value,
            ..self
        }
    }

    /// Registers a namespace prefix for use in path expressions, replacing an
    /// earlier binding of the same prefix.
    pub fn with_prefix(mut self, prefix: impl Into<String>, namespace: impl Into<String>) -> Self {
        let (prefix, namespace) = (prefix.into(), namespace.into());
        self.namespaces.insert(prefix, namespace);
        self
    }

    /// Sets the `id_prefix` value.
    pub fn with_id_prefix(self, prefix: impl Into<String>) -> Self {
        Self {
            id_prefix: Some(prefix.into()),
            ..self
        }
    }

    /// Adds one more suffix that `filename_to_id` strips from filenames.
    pub fn with_id_strip_suffix(mut self, suffix: impl Into<String>) -> Self {
        let suffix: String = suffix.into();
        self.id_strip_suffix.push(suffix);
        self
    }

    /// Sets the text to prepend to documents that have no DOCTYPE of their own.
    pub fn with_inject_dtd(self, dtd: impl Into<String>) -> Self {
        Self {
            inject_dtd: Some(dtd.into()),
            ..self
        }
    }

    /// Sets the whitespace handling used for the root element.
    pub fn with_whitespace(self, handling: XmlWhitespaceHandling) -> Self {
        Self {
            whitespace: handling,
            ..self
        }
    }

    /// Appends an element configuration for `expression`. `setup` receives a fresh
    /// configuration for that path and returns the customised version to register.
    pub fn with_element<F>(mut self, expression: &str, setup: F) -> Self
    where
        F: Fn(XmlElementConfig) -> XmlElementConfig,
    {
        let blank = XmlElementConfig::new(XPathExpression::new(expression));
        let element = setup(blank);
        if self.debug {
            eprintln!("[STAM fromxml] registered {:?}", element);
        }
        self.elements.push(element);
        self
    }

    /// Finds the element configuration for `node` at `path`. The list is searched
    /// back to front, so configurations registered later take precedence.
    fn element_config(&self, node: Node, path: &NodePath) -> Option<&XmlElementConfig> {
        self.elements
            .iter()
            .rev()
            .find(|candidate| candidate.path.test(path, node, self))
    }

    /// Stores a context value under `key`, replacing any previous value for that key.
    pub fn add_context(&mut self, key: impl Into<String>, value: toml::Value) {
        let key: String = key.into();
        self.context.insert(key, value);
    }

    /// Reports whether diagnostic output is enabled.
    pub fn debug(&self) -> bool {
        self.debug
    }
}
/// How whitespace in element text is treated.
#[derive(Clone, Copy, Debug, Default, PartialEq, Deserialize)]
pub enum XmlWhitespaceHandling {
    /// No explicit choice was made; this is the `Default` value.
    #[default]
    Unspecified,
    /// NOTE(review): presumably "use the parent's handling"; the consumer is outside this view.
    Inherit,
    /// Selected by `xml:space="preserve"`.
    Preserve,
    /// Selected by `xml:space="collapse"` or `"replace"`; also the configuration-level default.
    Collapse,
}

impl XmlWhitespaceHandling {
    /// Serde default provider for `XmlConversionConfig::whitespace`.
    fn collapse() -> Self {
        Self::Collapse
    }
}
/// Selects what kind of annotation (if any) is produced for a matched element.
/// NOTE(review): the exact semantics of each variant are implemented outside this view; confirm there.
#[derive(Debug, Clone, Deserialize, PartialEq, Copy, Default)]
pub enum XmlAnnotationHandling {
/// No explicit choice was made; a base element may still supply one (see `XmlElementConfig::update`).
#[default]
Unspecified,
/// NOTE(review): presumably "produce no annotation" — confirm.
None,
/// NOTE(review): presumably an annotation targeting the element's text span — confirm.
TextSelector,
/// NOTE(review): presumably an annotation targeting the resource as a whole — confirm.
ResourceSelector,
/// Elements configured with this variant are excluded from the regular text-extraction branch.
TextSelectorBetweenMarkers,
}
/// Configuration for the XML elements matched by `path`. Unset (`None`, empty or
/// `Unspecified`) fields can be filled in from base elements, see `update`.
#[derive(Debug, Clone, Deserialize)]
pub struct XmlElementConfig {
/// Path expression selecting the elements this configuration applies to; defaults to `*`.
#[serde(default)]
path: XPathExpression,
/// What kind of annotation to produce for matching elements.
#[serde(default)]
annotation: XmlAnnotationHandling,
/// Annotation data configurations for matching elements.
#[serde(default)]
annotationdata: Vec<XmlAnnotationDataConfig>,
/// Template (compiled by the converter) for a text prefix.
#[serde(default)]
textprefix: Option<String>,
/// NOTE(review): presumably whether the element's own text is extracted — confirm against the extraction code.
#[serde(default)]
text: Option<bool>,
/// Template (compiled by the converter) for a text suffix.
#[serde(default)]
textsuffix: Option<String>,
/// Annotation data configurations associated with the text prefix.
#[serde(default)]
annotatetextprefix: Vec<XmlAnnotationDataConfig>,
/// Annotation data configurations associated with the text suffix.
#[serde(default)]
annotatetextsuffix: Vec<XmlAnnotationDataConfig>,
/// NOTE(review): presumably whether the prefix is included in the annotated span — confirm.
#[serde(default)]
include_textprefix: Option<bool>,
/// NOTE(review): presumably whether the suffix is included in the annotated span — confirm.
#[serde(default)]
include_textsuffix: Option<bool>,
/// Names of base elements (keys of `XmlConversionConfig::baseelements`) to inherit from.
#[serde(default)]
base: Vec<String>,
/// Template (compiled by the converter) for the identifier.
#[serde(default)]
id: Option<String>,
/// Text extraction only takes its regular branch when this is `None` or `Some(false)`.
#[serde(default)]
stop: Option<bool>,
/// Whitespace handling for matching elements.
#[serde(default)]
whitespace: XmlWhitespaceHandling,
/// NOTE(review): usage is outside this view; not inherited by `update`.
#[serde(default)]
scope_id: Option<String>,
/// NOTE(review): usage is outside this view; not inherited by `update`.
#[serde(default)]
marker_scope: Option<String>,
}
impl XmlElementConfig {
    /// Creates a configuration for `expression` with every other setting left unset.
    fn new(expression: XPathExpression) -> Self {
        Self {
            path: expression,
            base: Vec::new(),
            id: None,
            annotation: XmlAnnotationHandling::Unspecified,
            annotationdata: Vec::new(),
            whitespace: XmlWhitespaceHandling::Unspecified,
            stop: None,
            text: None,
            textprefix: None,
            textsuffix: None,
            annotatetextprefix: Vec::new(),
            annotatetextsuffix: Vec::new(),
            include_textprefix: None,
            include_textsuffix: None,
            scope_id: None,
            marker_scope: None,
        }
    }

    /// Inherits settings from `base`: every field that is still unset on `self`
    /// takes the value from `base`; values already set on `self` are kept.
    ///
    /// `annotationdata` is merged instead: entries of `base` that are not yet
    /// present are appended. `path`, `base`, `scope_id` and `marker_scope` are
    /// never inherited.
    pub fn update(&mut self, base: &XmlElementConfig) {
        if self.whitespace == XmlWhitespaceHandling::Unspecified {
            self.whitespace = base.whitespace;
        }
        if self.annotation == XmlAnnotationHandling::Unspecified {
            self.annotation = base.annotation;
        }
        if self.textprefix.is_none() {
            self.textprefix = base.textprefix.clone();
        }
        self.text = self.text.or(base.text);
        if self.textsuffix.is_none() {
            self.textsuffix = base.textsuffix.clone();
        }
        if self.id.is_none() {
            self.id = base.id.clone();
        }
        self.stop = self.stop.or(base.stop);
        for inherited in &base.annotationdata {
            if self.annotationdata.contains(inherited) {
                continue;
            }
            self.annotationdata.push(inherited.clone());
        }
        if self.annotatetextsuffix.is_empty() {
            self.annotatetextsuffix = base.annotatetextsuffix.clone();
        }
        if self.annotatetextprefix.is_empty() {
            self.annotatetextprefix = base.annotatetextprefix.clone();
        }
        self.include_textsuffix = self.include_textsuffix.or(base.include_textsuffix);
        self.include_textprefix = self.include_textprefix.or(base.include_textprefix);
    }

    /// Sets the `stop` flag.
    pub fn with_stop(self, stop: bool) -> Self {
        Self {
            stop: Some(stop),
            ..self
        }
    }

    /// Sets the whitespace handling for matching elements.
    pub fn with_whitespace(self, handling: XmlWhitespaceHandling) -> Self {
        Self {
            whitespace: handling,
            ..self
        }
    }

    /// Sets the `text` flag.
    pub fn with_text(self, text: bool) -> Self {
        Self {
            text: Some(text),
            ..self
        }
    }

    /// Replaces the list of base element names.
    pub fn with_base(mut self, iter: impl Iterator<Item = impl Into<String>>) -> Self {
        self.base = iter.map(Into::into).collect();
        self
    }

    /// Resets the `text` flag to unset (`None`), so a base element may supply it.
    pub fn without_text(self) -> Self {
        Self { text: None, ..self }
    }

    /// Sets the kind of annotation to produce.
    pub fn with_annotation(self, annotation: XmlAnnotationHandling) -> Self {
        Self { annotation, ..self }
    }

    /// Identity value for this configuration: the address of the path string's
    /// buffer. A clone owns a different buffer and therefore gets a different value.
    fn hash(&self) -> usize {
        let buffer: *const u8 = self.path.0.as_ptr();
        buffer as usize
    }
}
/// Equality is by identity rather than by content: two configurations are equal
/// when their inherent `hash()` values (the address of the path buffer) coincide.
impl PartialEq for XmlElementConfig {
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.hash(), other.hash());
        lhs == rhs
    }
}
/// Configuration for one piece of annotation data. `id`, `set`, `key` and any string
/// inside `value` are compiled as templates by the converter.
#[derive(Debug, Clone, Deserialize, PartialEq)]
pub struct XmlAnnotationDataConfig {
/// Template for the identifier.
id: Option<String>,
/// Template for the set.
set: Option<String>,
/// Template for the key.
key: Option<String>,
/// Value; strings (also those nested in tables and arrays) are treated as templates.
value: Option<toml::Value>,
/// NOTE(review): presumably forces the type of the resulting value; usage is outside this view.
#[serde(default)]
valuetype: Option<String>,
/// NOTE(review): presumably allows empty values to be emitted; usage is outside this view.
#[serde(default)]
allow_empty_value: bool,
/// NOTE(review): presumably skips this data when a template variable is missing; usage is outside this view.
#[serde(default)]
skip_if_missing: bool,
/// NOTE(review): usage is outside this view.
#[serde(default)]
multiple: bool,
}
impl XmlAnnotationDataConfig {
    /// Sets the identifier template.
    pub fn with_id(self, id: impl Into<String>) -> Self {
        Self {
            id: Some(id.into()),
            ..self
        }
    }

    /// Sets the set template.
    pub fn with_set(self, set: impl Into<String>) -> Self {
        Self {
            set: Some(set.into()),
            ..self
        }
    }

    /// Sets the key template.
    pub fn with_key(self, key: impl Into<String>) -> Self {
        Self {
            key: Some(key.into()),
            ..self
        }
    }

    /// Sets the value.
    pub fn with_value(self, value: impl Into<toml::Value>) -> Self {
        Self {
            value: Some(value.into()),
            ..self
        }
    }
}
/// A small XPath-like path expression, stored as its source string. Segments are
/// separated by `/`; each segment is `prefix:name` with an optional `[condition]`.
#[derive(Debug, Clone, PartialEq, Deserialize)]
struct XPathExpression(String);
impl XPathExpression {
/// Wraps `expression` as a path expression; the string is not validated here.
pub fn new(expression: impl Into<String>) -> Self {
    let expression: String = expression.into();
    Self(expression)
}
/// The wildcard expression `*`.
pub fn any() -> Self {
    Self(String::from("*"))
}
/// Iterates over the segments of the expression (leading slashes are ignored),
/// yielding `(namespace, name, condition)` for each, with the namespace prefix
/// already resolved through `config.namespaces`.
///
/// # Panics
/// Panics (lazily, while iterating) when a segment uses a prefix that is not
/// declared in the configuration.
pub fn iter<'a>(
    &'a self,
    config: &'a XmlConversionConfig,
) -> impl Iterator<Item = (Option<&'a str>, &'a str, Option<&'a str>)> {
    let segments = self.0.trim_start_matches('/').split("/");
    segments.map(|segment| {
        let (prefix, name, condition) = Self::parse_segment(segment);
        let namespace = match prefix {
            None => None,
            Some(prefix) => match config.namespaces.get(prefix) {
                Some(namespace) => Some(namespace.as_str()),
                None => panic!(
                    "XML namespace prefix not known in configuration: {}",
                    prefix
                ),
            },
        };
        (namespace, name, condition)
    })
}
/// Tests whether this expression matches `node`, which is located at `path`.
/// Both the expression segments and the path components are walked from the
/// innermost element outwards.
fn test<'a, 'b>(&self, path: &NodePath<'a, 'b>, node: Node<'a,'b>, config: &XmlConversionConfig) -> bool {
    // The segments are collected first so that they can be traversed in reverse.
    let segments: Vec<_> = self.iter(config).collect();
    self.test_withiter(
        segments.into_iter().rev(),
        path.components.iter().rev(),
        node,
        config,
    )
}
/// Matches the remaining expression segments (`refiter`) against the remaining path
/// components (`pathiter`); both are expected innermost-first. `node` is the XML node
/// that corresponds to the next path component and is used to evaluate conditions.
///
/// Returns `true` once all expression segments are consumed without a mismatch;
/// path components that are left over at that point are not inspected.
fn test_withiter<'a, 'b>(&self, mut refiter: impl Iterator<Item=(Option<&'a str>, &'a str, Option<&'a str>)> + Clone, mut pathiter: impl Iterator<Item=&'a NodePathComponent<'a, 'b>> + Clone, mut node: Node<'a,'b>, config: &XmlConversionConfig) -> bool {
while let Some((refns, refname, condition)) = refiter.next() {
// An empty segment (as produced by `//` in the expression): first try to match the
// rest of the expression right here, i.e. with the empty segment consuming nothing.
if refns.is_none() && refname == "" && condition.is_none() {
if self.test_withiter(refiter.clone(), pathiter.clone(), node, config) {
return true;
}
}
// Otherwise (or if that attempt failed) this segment consumes one path component.
if let Some(component) = pathiter.next() {
// Empty and `*` segments accept any tag; all others must agree in namespace and name.
if refname != "" && refname != "*" {
if refns.is_none() != component.namespace.is_none() || component.namespace != refns || refname != component.tagname {
return false;
}
}
if let Some(condition) = condition {
if !self.test_condition(condition, node, config) {
return false;
}
}
// Step outwards; `node` is left unchanged when it has no parent.
if let Some(parent) = node.parent() {
node = parent;
}
} else {
// The path is exhausted while segments remain: only empty segments may still follow.
if refname != "" {
return false;
}
}
}
true
}
/// Evaluates a segment condition against `node`. The condition consists of clauses
/// joined by ` and `, all of which must hold. A clause is one of:
///
/// * `var!=value` - holds unless the variable equals `value`
/// * `var=value`  - holds when the variable equals `value`
/// * `var`        - holds when the variable exists and is not empty
///
/// Surrounding double quotes are stripped from `value`; see `get_var` for what
/// `var` may be.
fn test_condition<'a,'b>(&self, condition: &'a str, node: Node<'a,'b>, config: &XmlConversionConfig) -> bool {
    condition.split(" and ").all(|clause| {
        if let Some((var, right)) = clause.split_once("!=") {
            self.get_var(var, &node, config) != Some(right.trim_matches('"'))
        } else if let Some((var, right)) = clause.split_once("=") {
            self.get_var(var, &node, config) == Some(right.trim_matches('"'))
        } else {
            !matches!(self.get_var(clause, &node, config), None | Some(""))
        }
    })
}
/// Resolves a condition variable on `node`:
///
/// * `@name` - the attribute `name`
/// * `@prefix:name` - the attribute `name` in the namespace bound to `prefix`
///   (`None` when the prefix is not configured)
/// * `text()` - the node's text, trimmed
///
/// Anything else yields `None`.
fn get_var<'a,'b>(&self, var: &str, node: &Node<'a,'b>, config: &XmlConversionConfig) -> Option<&'a str> {
    match var.strip_prefix('@') {
        Some(attribute) => match attribute.split_once(':') {
            Some((prefix, local)) => match config.namespaces.get(prefix) {
                Some(ns) => node.attribute((ns.as_str(), local)),
                None => None,
            },
            None => node.attribute(attribute),
        },
        None if var == "text()" => node.text().map(|s| s.trim()),
        None => None,
    }
}
/// Splits one path segment into `(prefix, name, condition)`.
///
/// The condition is the text between the first `[` and the last `]`; the part before
/// the bracket is split at its first `:` into namespace prefix and local name.
/// A segment whose brackets are not properly ordered (a `]` before the `[`) is
/// treated as having no condition rather than panicking on an invalid slice.
fn parse_segment<'a>(s: &'a str) -> (Option<&'a str>, &'a str, Option<&'a str>) {
    let (name, condition) = match (s.find('['), s.rfind(']')) {
        // `begin < end` guarantees that `begin + 1..end` is a valid range.
        (Some(begin), Some(end)) if begin < end => (&s[..begin], Some(&s[begin + 1..end])),
        _ => (s, None),
    };
    match name.split_once(':') {
        Some((prefix, name)) => (Some(prefix), name, condition),
        None => (None, name, condition),
    }
}
}
impl Default for XPathExpression {
fn default() -> Self {
Self::any()
}
}
/// One step in a `NodePath`: an element's namespace, tag name and optional index.
#[derive(Clone, Debug, PartialEq)]
struct NodePathComponent<'a,'b> {
    namespace: Option<&'a str>,
    tagname: &'b str,
    index: Option<usize>,
}

/// The chain of elements leading to a node, outermost element first.
#[derive(Clone, Debug, PartialEq, Default)]
struct NodePath<'a, 'b> {
    components: Vec<NodePathComponent<'a,'b>>,
}

/// Formats the path as `/{namespace}tag[index]/...`; the `{namespace}` and
/// `[index]` parts are left out for components that lack them.
impl<'a, 'b> Display for NodePath<'a, 'b> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        for component in &self.components {
            f.write_str("/")?;
            if let Some(ns) = component.namespace {
                write!(f, "{{{}}}", ns)?;
            }
            f.write_str(component.tagname)?;
            if let Some(index) = component.index {
                write!(f, "[{}]", index)?;
            }
        }
        Ok(())
    }
}
impl<'a,'b> NodePath<'a,'b> {
    /// Appends `node` to the path. Nodes with an empty tag name are ignored.
    fn add(&mut self, node: &Node<'a,'b>, index: Option<usize>) {
        let tag = node.tag_name();
        if tag.name().is_empty() {
            return;
        }
        self.components.push(NodePathComponent {
            namespace: tag.namespace(),
            tagname: tag.name(),
            index,
        });
    }

    /// Renders the path as `/prefix:tag[index]/...`, where `prefixes` maps a
    /// namespace to its prefix. A component whose namespace has no known prefix
    /// is written without one, and a warning is printed to standard error.
    fn format_as_xpath(&self, prefixes: &HashMap<String, String>) -> String {
        let mut out = String::new();
        for component in &self.components {
            out.push('/');
            let prefix = match component.namespace {
                Some(ns) => {
                    let known = prefixes.get(ns);
                    if known.is_none() {
                        eprintln!("STAM fromxml WARNING: format_as_xpath: namespace {} not defined, no prefix found!", ns);
                    }
                    known
                }
                None => None,
            };
            if let Some(prefix) = prefix {
                out.push_str(prefix);
                out.push(':');
            }
            out.push_str(component.tagname);
            if let Some(index) = component.index {
                out.push_str(&format!("[{}]", index));
            }
        }
        out
    }
}
/// Counts how often each tag name has been seen, keyed by the debug
/// representation of the tag name.
#[derive(Default,Debug)]
struct SiblingCounter {
    map: HashMap<String,usize>,
}

impl SiblingCounter {
    /// Registers `node` and returns how many nodes with the same tag name have
    /// been counted so far, this one included (so the first call yields 1).
    fn count<'a,'b>(&mut self, node: &Node<'a,'b>) -> usize {
        let key = format!("{:?}", node.tag_name());
        let seen = self.map.entry(key).or_insert(0);
        *seen += 1;
        *seen
    }
}
/// Configuration for a metadata annotation. Its `id` and annotation data templates
/// are compiled by the converter alongside the element templates.
#[derive(Debug, Clone, Deserialize)]
pub struct MetadataConfig {
/// What kind of annotation to produce.
#[serde(default)]
annotation: XmlAnnotationHandling,
/// Annotation data configurations for the metadata annotation.
#[serde(default)]
annotationdata: Vec<XmlAnnotationDataConfig>,
/// Template for the identifier.
#[serde(default)]
id: Option<String>,
}
/// Converts a single XML file: extracts its text into a new text resource named
/// `<file stem>.txt`, adds that resource to `store`, and then adds the metadata and
/// element annotations.
///
/// If `config.inject_dtd` is set and the first 100 bytes of the file contain no
/// `<!DOCTYPE`, the configured text is prepended before parsing (a leading
/// `<!DOCTYPE html>` is removed first). If the file has another DOCTYPE, a warning
/// is printed and nothing is injected.
///
/// # Errors
/// Returns a descriptive message when the file cannot be read or parsed, when the
/// filename has no usable UTF-8 stem, when the templates fail to compile, or when
/// text extraction, resource creation or annotation fails.
pub fn from_xml<'a>(
    filename: &Path,
    config: &XmlConversionConfig,
    store: &'a mut AnnotationStore,
) -> Result<(), String> {
    // At most the first 100 bytes of the document, cut back to a character boundary.
    // Plain `xml[..100]` panics on shorter documents and inside multi-byte characters.
    fn document_head(xml: &str) -> &str {
        let mut end = xml.len().min(100);
        while !xml.is_char_boundary(end) {
            end -= 1;
        }
        &xml[..end]
    }

    if config.debug {
        eprintln!("[STAM fromxml] parsing {}", filename.display());
    }

    let mut xmlstring = read_to_string(filename)
        .map_err(|e| format!("Error opening XML file {}: {}", filename.display(), e))?;

    // An HTML5 doctype would block DTD injection, so drop it when a DTD is configured.
    if config.inject_dtd.is_some() && document_head(&xmlstring).contains("<!DOCTYPE html>") {
        xmlstring = xmlstring.replacen("<!DOCTYPE html>", "", 1);
    }
    if !document_head(&xmlstring).contains("<!DOCTYPE") {
        if let Some(dtd) = config.inject_dtd.as_ref() {
            xmlstring = dtd.to_string() + &xmlstring;
        }
    } else if config.inject_dtd.is_some() {
        eprintln!("[STAM fromxml] WARNING: Can not inject DTD because file already has a DOCTYPE");
    }

    let doc = Document::parse_with_options(
        &xmlstring,
        ParsingOptions {
            allow_dtd: true,
            ..ParsingOptions::default()
        },
    )
    .map_err(|e| format!("Error parsing XML file {}: {}", filename.display(), e))?;

    let mut converter = XmlToStamConverter::new(config);
    converter
        .compile()
        .map_err(|e| format!("Error compiling templates: {}", e))?;

    let stem = filename
        .file_stem()
        .ok_or_else(|| format!("Invalid filename (no file stem): {}", filename.display()))?
        .to_str()
        .ok_or_else(|| format!("Invalid UTF-8 in filename: {}", filename.display()))?;
    let textoutfilename = format!("{}.txt", stem);

    // First pass: extract the plain text.
    let mut path = NodePath::default();
    path.add(&doc.root_element(), None);
    converter
        .extract_element_text(doc.root_element(), &path, converter.config.whitespace, Some(textoutfilename.as_str()), Some(&filename.to_string_lossy()), 0)
        .map_err(|e| {
            format!(
                "Error extracting element text from {}: {}",
                filename.display(),
                e
            )
        })?;
    if config.debug {
        eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
    }

    let resource = TextResourceBuilder::new()
        .with_id(filename_to_id(textoutfilename.as_str(), config).to_string())
        .with_text(converter.text.clone())
        .with_filename(&textoutfilename);
    converter.resource_handle = Some(
        store
            .add_resource(resource)
            .map_err(|e| format!("Failed to add resource {}: {}", &textoutfilename, e))?,
    );

    converter
        .add_metadata(store)
        .map_err(|e| format!("Failed to add metadata {}: {}", &textoutfilename, e))?;

    // Second pass: create the annotations on the extracted text.
    converter
        .extract_element_annotation(doc.root_element(), &path, Some(&filename.to_string_lossy()), 0, store)
        .map_err(|e| {
            format!(
                "Error extracting element annotation from {}: {}",
                filename.display(),
                e
            )
        })?;
    Ok(())
}
/// Converts several XML files into one shared text resource. The text of all
/// documents is extracted in order into a single resource, which is named after
/// `outputfile` or, when that is `None`, `<stem of the first file>.txt`. The
/// annotations of every document are added afterwards.
///
/// DTD injection works per file as in `from_xml`: if `config.inject_dtd` is set and
/// the first 100 bytes of a file contain no `<!DOCTYPE`, the configured text is
/// prepended (a leading `<!DOCTYPE html>` is removed first).
///
/// # Errors
/// Returns a descriptive message when no output name can be derived (no input files
/// and no `outputfile`, or a name that is not valid UTF-8), when a file cannot be
/// read or parsed, when the templates fail to compile, or when text extraction,
/// resource creation or annotation fails.
pub fn from_multi_xml<'a>(
    filenames: &Vec<&Path>,
    outputfile: Option<&Path>,
    config: &XmlConversionConfig,
    store: &'a mut AnnotationStore,
) -> Result<(), String> {
    // At most the first 100 bytes of the document, cut back to a character boundary.
    // Plain `xml[..100]` panics on shorter documents and inside multi-byte characters.
    fn document_head(xml: &str) -> &str {
        let mut end = xml.len().min(100);
        while !xml.is_char_boundary(end) {
            end -= 1;
        }
        &xml[..end]
    }

    let textoutfilename = match outputfile {
        Some(outputfile) => outputfile
            .to_str()
            .ok_or_else(|| format!("Invalid UTF-8 in output filename: {}", outputfile.display()))?
            .to_string(),
        None => {
            let first = filenames
                .first()
                .ok_or_else(|| "1 or more filename need to be provided".to_string())?;
            let stem = first
                .file_stem()
                .ok_or_else(|| format!("Invalid filename (no file stem): {}", first.display()))?
                .to_str()
                .ok_or_else(|| format!("Invalid UTF-8 in filename: {}", first.display()))?;
            format!("{}.txt", stem)
        }
    };

    // All sources are read first: the parsed documents below borrow from these strings.
    let mut xmlstrings: Vec<String> = Vec::new();
    let mut docs: Vec<Document> = Vec::new();
    for filename in filenames.iter() {
        if config.debug {
            eprintln!("[STAM fromxml] parsing {} (one of multiple)", filename.display());
        }
        let mut xmlstring = read_to_string(filename)
            .map_err(|e| format!("Error opening XML file {}: {}", filename.display(), e))?;
        // An HTML5 doctype would block DTD injection, so drop it when a DTD is configured.
        if config.inject_dtd.is_some() && document_head(&xmlstring).contains("<!DOCTYPE html>") {
            xmlstring = xmlstring.replacen("<!DOCTYPE html>", "", 1);
        }
        if !document_head(&xmlstring).contains("<!DOCTYPE") {
            if let Some(dtd) = config.inject_dtd.as_ref() {
                xmlstring = dtd.to_string() + &xmlstring;
            }
        } else if config.inject_dtd.is_some() {
            eprintln!("[STAM fromxml] WARNING: Can not inject DTD because file already has a DOCTYPE");
        }
        xmlstrings.push(xmlstring);
    }

    for (filename, xmlstring) in filenames.iter().zip(xmlstrings.iter()) {
        let doc = Document::parse_with_options(
            xmlstring,
            ParsingOptions {
                allow_dtd: true,
                ..ParsingOptions::default()
            },
        )
        .map_err(|e| format!("Error parsing XML file {}: {}", filename.display(), e))?;
        docs.push(doc);
    }

    let mut converter = XmlToStamConverter::new(config);
    converter
        .compile()
        .map_err(|e| format!("Error compiling templates: {}", e))?;

    // First pass: extract the text of every document into the shared buffer.
    for (i, (doc, filename)) in docs.iter().zip(filenames.iter()).enumerate() {
        let mut path = NodePath::default();
        path.add(&doc.root_element(), None);
        converter
            .extract_element_text(doc.root_element(), &path, converter.config.whitespace, Some(textoutfilename.as_str()), Some(&filename.to_string_lossy()), i)
            .map_err(|e| {
                format!(
                    "Error extracting element text from {}: {}",
                    filename.display(),
                    e
                )
            })?;
        if config.debug {
            eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
        }
    }

    let resource = TextResourceBuilder::new()
        .with_id(filename_to_id(textoutfilename.as_str(), config).to_string())
        .with_text(converter.text.clone())
        .with_filename(&textoutfilename);
    converter.resource_handle = Some(
        store
            .add_resource(resource)
            .map_err(|e| format!("Failed to add resource {}: {}", &textoutfilename, e))?,
    );

    converter
        .add_metadata(store)
        .map_err(|e| format!("Failed to add metadata {}: {}", &textoutfilename, e))?;

    // Second pass: create the annotations of every document on the shared text.
    for (i, (doc, filename)) in docs.iter().zip(filenames.iter()).enumerate() {
        let mut path = NodePath::default();
        path.add(&doc.root_element(), None);
        converter
            .extract_element_annotation(doc.root_element(), &path, Some(&filename.to_string_lossy()), i, store)
            .map_err(|e| {
                format!(
                    "Error extracting element annotation from {}: {}",
                    filename.display(),
                    e
                )
            })?;
    }
    Ok(())
}
/// Converts an XML document held in memory. The extracted text becomes a new text
/// resource with the identifier `resource_id` in `store`; metadata and element
/// annotations are added afterwards. No DTD injection takes place here.
///
/// # Errors
/// Returns a descriptive message when the XML cannot be parsed, when the templates
/// fail to compile, or when text extraction, resource creation or annotation fails.
pub fn from_xml_in_memory<'a>(
    resource_id: &str,
    xmlstring: &str,
    config: &XmlConversionConfig,
    store: &'a mut AnnotationStore,
) -> Result<(), String> {
    if config.debug {
        eprintln!("[STAM fromxml] parsing XML string");
    }

    let options = ParsingOptions {
        allow_dtd: true,
        ..ParsingOptions::default()
    };
    let doc = Document::parse_with_options(xmlstring, options)
        .map_err(|e| format!("Error parsing XML string: {}", e))?;

    let mut converter = XmlToStamConverter::new(config);
    if let Err(e) = converter.compile() {
        return Err(format!("Error compiling templates: {}", e));
    }

    let root = doc.root_element();
    let mut path = NodePath::default();
    path.add(&root, None);

    // First pass: extract the plain text.
    if let Err(e) = converter.extract_element_text(root, &path, converter.config.whitespace, Some(resource_id), Some(resource_id), 0) {
        return Err(format!(
            "Error extracting element text from {}: {}",
            resource_id, e
        ));
    }
    if config.debug {
        eprintln!("[STAM fromxml] extracted full text: {}", &converter.text);
    }

    let resource = TextResourceBuilder::new()
        .with_id(resource_id)
        .with_text(converter.text.clone());
    let handle = store
        .add_resource(resource)
        .map_err(|e| format!("Failed to add resource {}: {}", resource_id, e))?;
    converter.resource_handle = Some(handle);

    if let Err(e) = converter.add_metadata(store) {
        return Err(format!("Failed to add metadata for {}: {}", resource_id, e));
    }

    // Second pass: create the annotations on the extracted text.
    converter
        .extract_element_annotation(root, &path, Some(resource_id), 0, store)
        .map_err(|e| {
            format!(
                "Error extracting element annotation from {}: {}",
                resource_id, e
            )
        })
}
/// Derives an identifier from `filename` by removing the first suffix from
/// `config.id_strip_suffix` that the filename ends with. The filename is returned
/// unchanged when no suffix matches.
pub fn filename_to_id<'a>(filename: &'a str, config: &XmlConversionConfig) -> &'a str {
    config
        .id_strip_suffix
        .iter()
        .find_map(|suffix| filename.strip_suffix(suffix.as_str()))
        .unwrap_or(filename)
}
/// Distinguishes the kinds of positions recorded per node; it is part of the key of
/// the converter's `positionmap` and `bytepositionmap`.
#[derive(Clone,Copy,PartialEq, Hash, Eq)]
enum PositionType {
/// NOTE(review): presumably the span of the element's own text — confirm.
Body,
/// NOTE(review): presumably the span of the element's text prefix — confirm.
TextPrefix,
/// NOTE(review): presumably the span of the element's text suffix — confirm.
TextSuffix,
}
/// Holds the state of one conversion run: the text extracted so far, the template
/// engine, and the bookkeeping that links XML nodes to positions in that text.
struct XmlToStamConverter<'a> {
/// Current position in the extracted text; starts at 0.
/// NOTE(review): presumably counted in unicode points, as byte positions are tracked separately — confirm.
cursor: usize,
/// The text extracted so far; it becomes the content of the text resource.
text: String,
/// Template engine holding the registered filter functions and compiled templates.
template_engine: Engine<'a>,
/// Offsets recorded per node and position type.
/// NOTE(review): the leading `usize` of the key is presumably the document number — confirm.
positionmap: HashMap<(usize,NodeId,PositionType), Offset>,
/// Pairs of positions recorded per node and position type.
/// NOTE(review): presumably (begin, end) byte offsets into `text` — confirm.
bytepositionmap: HashMap<(usize,NodeId,PositionType), (usize, usize)>,
/// NOTE(review): usage is outside this view.
markers: HashMap<usize, Vec<(usize,NodeId)>>,
/// NOTE(review): usage is outside this view.
scopes: HashMap<String, (usize,NodeId)>,
/// Handle of the text resource; `None` until the resource has been added to the store.
resource_handle: Option<TextResourceHandle>,
/// NOTE(review): usage is outside this view.
pending_whitespace: bool,
/// The conversion configuration.
config: &'a XmlConversionConfig,
/// Maps a namespace URI to its prefix (the inverse of `config.namespaces`).
prefixes: HashMap<String, String>,
/// NOTE(review): presumably the context shared by all templates; filled in outside this view.
global_context: BTreeMap<String, upon::Value>,
/// NOTE(review): usage is outside this view.
variables: BTreeMap<String, BTreeSet<&'a str>>,
/// Indentation string inserted into debug messages.
debugindent: String,
}
/// Errors that can occur while configuring or running the conversion.
pub enum XmlConversionError {
    /// An error reported by the STAM library.
    StamError(StamError),
    /// A template problem: a description of the template and, if available, the
    /// underlying engine error.
    TemplateError(String, Option<upon::Error>),
    /// A problem in the conversion configuration.
    ConfigError(String),
}

impl From<StamError> for XmlConversionError {
    fn from(error: StamError) -> Self {
        XmlConversionError::StamError(error)
    }
}

/// Engine errors are wrapped without a template description.
impl From<upon::Error> for XmlConversionError {
    fn from(error: upon::Error) -> Self {
        XmlConversionError::TemplateError(String::new(), Some(error))
    }
}

impl Display for XmlConversionError {
    /// Template errors are rendered as `<description>: <engine error>`; the other
    /// variants render their payload directly.
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            Self::StamError(inner) => Display::fmt(inner, f),
            Self::ConfigError(message) => Display::fmt(message, f),
            Self::TemplateError(description, cause) => {
                f.write_str(description)?;
                f.write_str(": ")?;
                match cause {
                    Some(inner) => Display::fmt(inner, f),
                    None => Ok(()),
                }
            }
        }
    }
}
impl<'a> XmlToStamConverter<'a> {
/// Creates a converter for `config`.
///
/// Sets up the template engine with the built-in filter functions
/// (`capitalize`, `lower`, `upper`, `trim`, `add`, `sub`, `mul`, `div`, `eq`, `ne`,
/// `gt`, `lt`, `gte`, `lte`, `int`, `float`, `str`, `as_range`, `last`, `first`,
/// `tokenize`, `replace`, `starts_with`, `ends_with`, `basename`, `noext`, `join`),
/// then initialises the global context and registers the configured external filters.
///
/// Several filters panic when applied to a value of an unsupported type; the panic
/// message names the filter concerned.
fn new(config: &'a XmlConversionConfig) -> Self {
    // Invert the configured prefix -> namespace map so prefixes can be found by namespace.
    let mut prefixes: HashMap<String, String> = HashMap::new();
    for (prefix, namespace) in config.namespaces.iter() {
        prefixes.insert(namespace.to_string(), prefix.to_string());
    }

    let mut template_engine = Engine::new();
    template_engine.set_default_formatter(&value_formatter);

    // String case and whitespace filters
    template_engine.add_function("capitalize", filter_capitalize);
    template_engine.add_function("lower", str::to_lowercase);
    template_engine.add_function("upper", str::to_uppercase);
    template_engine.add_function("trim", |s: &str| s.trim().to_string());

    // Arithmetic and comparison filters
    template_engine.add_function("add", filter_add);
    template_engine.add_function("sub", filter_sub);
    template_engine.add_function("mul", filter_mul);
    template_engine.add_function("div", filter_div);
    template_engine.add_function("eq", |a: &upon::Value, b: &upon::Value| a == b);
    template_engine.add_function("ne", |a: &upon::Value, b: &upon::Value| a != b);
    template_engine.add_function("gt", filter_gt);
    template_engine.add_function("lt", filter_lt);
    template_engine.add_function("gte", filter_gte);
    template_engine.add_function("lte", filter_lte);

    // Type conversion filters
    template_engine.add_function("int", |a: &upon::Value| match a {
        upon::Value::Integer(x) => upon::Value::Integer(*x),
        upon::Value::Float(x) => upon::Value::Integer(*x as i64),
        upon::Value::String(s) => upon::Value::Integer(s.parse().expect("int filter expects an integer value")),
        _ => panic!("int filter expects an integer value"),
    });
    template_engine.add_function("float", |a: &upon::Value| match a {
        upon::Value::Float(_) => a.clone(),
        upon::Value::Integer(x) => upon::Value::Float(*x as f64),
        upon::Value::String(s) => upon::Value::Float(s.parse().expect("float filter expects a float value")),
        _ => panic!("float filter expects a float value"),
    });
    template_engine.add_function("str", |a: upon::Value| match a {
        upon::Value::Integer(x) => upon::Value::String(format!("{}", x)),
        upon::Value::Float(x) => upon::Value::String(format!("{}", x)),
        upon::Value::Bool(x) => upon::Value::String(format!("{}", x)),
        upon::Value::String(_) => a,
        upon::Value::None => upon::Value::String(String::new()),
        upon::Value::List(list) => {
            // Lists are rendered as a comma-separated string of their scalar items.
            let newlist: Vec<String> = list
                .iter()
                .map(|v| match v {
                    upon::Value::String(s) => s.clone(),
                    upon::Value::Integer(d) => format!("{}", d),
                    upon::Value::Float(d) => format!("{}", d),
                    upon::Value::Bool(d) => format!("{}", d),
                    _ => String::new(),
                })
                .collect();
            upon::Value::String(newlist.join(", "))
        }
        _ => panic!("map to string not implemented"),
    });

    // List filters
    template_engine.add_function("as_range", |a: i64| {
        // Yields the list 1, 2, ..., a.
        upon::Value::List((0..a).map(|x| upon::Value::Integer(x + 1)).collect::<Vec<_>>())
    });
    template_engine.add_function("last", |list: &[upon::Value]| list.last().cloned());
    template_engine.add_function("first", |list: &[upon::Value]| list.first().cloned());
    template_engine.add_function("tokenize", |s: &str| {
        // Splits on spaces and newlines and drops empty tokens.
        upon::Value::List(
            s.split(|c| c == ' ' || c == '\n')
                .filter(|token| !token.is_empty())
                .map(|token| upon::Value::String(token.to_string()))
                .collect::<Vec<upon::Value>>(),
        )
    });

    // String inspection and path filters
    template_engine.add_function("replace", |s: &str, from: &str, to: &str| {
        upon::Value::String(s.replace(from, to))
    });
    template_engine.add_function("starts_with", |s: &str, prefix: &str| s.starts_with(prefix));
    template_engine.add_function("ends_with", |s: &str, suffix: &str| s.ends_with(suffix));
    template_engine.add_function("basename", |a: &upon::Value| match a {
        upon::Value::String(s) => upon::Value::String(
            s.split(|c| c == '/' || c == '\\')
                .last()
                .expect("splitting must work")
                .to_string(),
        ),
        _ => panic!("basename filter expects a string value"),
    });
    template_engine.add_function("noext", |a: &upon::Value| match a {
        // Cuts everything from the last `.` onwards.
        upon::Value::String(s) => {
            if let Some(pos) = s.rfind('.') {
                s[..pos].to_string()
            } else {
                s.to_string()
            }
        }
        _ => panic!("noext filter expects a string value"),
    });
    template_engine.add_function("join", |list: &upon::Value, delimiter: &str| match list {
        upon::Value::List(list) => {
            let newlist: Vec<String> = list
                .iter()
                .map(|v| match v {
                    upon::Value::String(s) => s.clone(),
                    upon::Value::Integer(d) => format!("{}", d),
                    upon::Value::Float(d) => format!("{}", d),
                    upon::Value::Bool(d) => format!("{}", d),
                    _ => String::new(),
                })
                .collect();
            upon::Value::String(newlist.join(delimiter))
        }
        // Anything that is not a list is passed through unchanged.
        _ => list.clone(),
    });

    let mut converter = Self {
        cursor: 0,
        text: String::new(),
        template_engine,
        positionmap: HashMap::new(),
        bytepositionmap: HashMap::new(),
        scopes: HashMap::new(),
        markers: HashMap::new(),
        resource_handle: None,
        pending_whitespace: false,
        global_context: BTreeMap::new(),
        debugindent: String::new(),
        variables: BTreeMap::new(),
        prefixes,
        config,
    };
    converter.set_global_context();
    converter.add_external_filters();
    converter
}
/// Registers every external filter from the configuration as a template function
/// under the filter's own name. Each function owns a clone of its filter and calls
/// `run` on the value it is applied to.
fn add_external_filters(&mut self) {
    let filters = self.config.external_filters.clone();
    for filter in filters {
        let name = filter.name.clone();
        self.template_engine
            .add_function(name, move |value: &upon::Value| filter.run(value));
    }
}
fn compile(&mut self) -> Result<(), XmlConversionError> {
if self.config.debug {
eprintln!("[STAM fromxml] compiling templates");
}
for element in self.config.elements.iter() {
if let Some(textprefix) = element.textprefix.as_ref() {
if self.template_engine.get_template(textprefix.as_str()).is_none() {
let template = self.precompile(textprefix.as_str());
self.template_engine
.add_template(textprefix.clone(), template)
.map_err(|e| {
XmlConversionError::TemplateError(
format!("element/textprefix template {}", textprefix.clone()),
Some(e),
)
})?;
}
}
if let Some(textsuffix) = element.textsuffix.as_ref() {
if self.template_engine.get_template(textsuffix.as_str()).is_none() {
let template = self.precompile(textsuffix.as_str());
self.template_engine
.add_template(textsuffix.clone(), template)
.map_err(|e| {
XmlConversionError::TemplateError(
format!("element/textsuffix template {}", textsuffix.clone()),
Some(e),
)
})?;
}
}
if let Some(id) = element.id.as_ref() {
if self.template_engine.get_template(id.as_str()).is_none() {
let template = self.precompile(id.as_str());
self.template_engine.add_template(id.clone(), template).map_err(|e| {
XmlConversionError::TemplateError(
format!("element/id template {}", id.clone()),
Some(e),
)
})?;
}
}
for annotationdata in element.annotationdata.iter().chain(element.annotatetextprefix.iter()).chain(element.annotatetextsuffix.iter()) {
if let Some(id) = annotationdata.id.as_ref() {
if self.template_engine.get_template(id.as_str()).is_none() {
let template = self.precompile(id.as_str());
self.template_engine.add_template(id.clone(), template).map_err(|e| {
XmlConversionError::TemplateError(
format!("annotationdata/id template {}", id.clone()),
Some(e),
)
})?;
}
}
if let Some(set) = annotationdata.set.as_ref() {
if self.template_engine.get_template(set.as_str()).is_none() {
let template = self.precompile(set.as_str());
self.template_engine.add_template(set.clone(), template).map_err(|e| {
XmlConversionError::TemplateError(
format!("annotationdata/set template {}", set.clone()),
Some(e),
)
})?;
}
}
if let Some(key) = annotationdata.key.as_ref() {
if self.template_engine.get_template(key.as_str()).is_none() {
let template = self.precompile(key.as_str());
self.template_engine.add_template(key.clone(), template).map_err(|e| {
XmlConversionError::TemplateError(
format!("annotationdata/key template {}", key.clone()),
Some(e),
)
})?;
}
}
if let Some(value) = annotationdata.value.as_ref() {
self.compile_value(value)?;
}
}
}
for metadata in self.config.metadata.iter() {
if let Some(id) = metadata.id.as_ref() {
if self.template_engine.get_template(id.as_str()).is_none() {
let template = self.precompile(id.as_str());
self.template_engine.add_template(id.clone(), template).map_err(|e| {
XmlConversionError::TemplateError(
format!("metadata/id template {}", id.clone()),
Some(e),
)
})?;
}
}
for annotationdata in metadata.annotationdata.iter() {
if let Some(id) = annotationdata.id.as_ref() {
if self.template_engine.get_template(id.as_str()).is_none() {
let template = self.precompile(id.as_str());
self.template_engine.add_template(id.clone(), template).map_err(|e| {
XmlConversionError::TemplateError(
format!("annotationdata/id template {}", id.clone()),
Some(e),
)
})?;
}
}
if let Some(set) = annotationdata.set.as_ref() {
if self.template_engine.get_template(set.as_str()).is_none() {
let template = self.precompile(set.as_str());
self.template_engine.add_template(set.clone(), template).map_err(|e| {
XmlConversionError::TemplateError(
format!("annotationdata/set template {}", set.clone()),
Some(e),
)
})?;
}
}
if let Some(key) = annotationdata.key.as_ref() {
if self.template_engine.get_template(key.as_str()).is_none() {
let template = self.precompile(key.as_str());
self.template_engine.add_template(key.clone(), template).map_err(|e| {
XmlConversionError::TemplateError(
format!("annotationdata/key template {}", key.clone()),
Some(e),
)
})?;
}
}
if let Some(value) = annotationdata.value.as_ref() {
self.compile_value(value)?;
}
}
}
Ok(())
}
/// Registers every template string found in `value` with the template engine.
///
/// String values are precompiled and added under their own source text as
/// template name (unless a template of that name already exists). Tables and
/// arrays are walked recursively; all other value kinds are ignored.
///
/// Returns `XmlConversionError::TemplateError` if the engine rejects a template.
fn compile_value(&mut self, value: &'a toml::Value) -> Result<(), XmlConversionError> {
    match value {
        toml::Value::String(source) => {
            let already_known = self.template_engine.get_template(source.as_str()).is_some();
            if !already_known {
                let precompiled = self.precompile(source.as_str());
                self.template_engine
                    .add_template(source.clone(), precompiled)
                    .map_err(|err| {
                        XmlConversionError::TemplateError(
                            format!("annotationdata/value template {}", source),
                            Some(err),
                        )
                    })?;
            }
        }
        toml::Value::Table(table) => {
            // Only the values can hold templates, keys are taken literally.
            for (_, entry) in table.iter() {
                self.compile_value(entry)?;
            }
        }
        toml::Value::Array(items) => {
            for item in items.iter() {
                self.compile_value(item)?;
            }
        }
        _ => {}
    }
    Ok(())
}
/// Appends the text of `node` (and, recursively, of its child elements) to
/// `self.text` and records the span it occupies.
///
/// The span is stored for `(doc_num, node.id(), PositionType::Body)` both as a
/// character offset in `self.positionmap` and as a byte range in
/// `self.bytepositionmap`. Configured text prefixes/suffixes are emitted via
/// `process_textprefix` / `process_textsuffix`.
///
/// * `whitespace` - whitespace handling inherited from the parent; it is
///   overridden by an `xml:space` attribute or by the element configuration.
/// * `resource_id`, `inputfile`, `doc_num` - passed through to template rendering.
///
/// Elements configured as `TextSelectorBetweenMarkers` are registered in
/// `self.markers` and only emit their prefix/suffix. Elements without a matching
/// configuration produce no text.
///
/// # Errors
/// Propagates errors from rendering text prefix/suffix templates and from
/// recursive calls.
fn extract_element_text<'b>(
    &mut self,
    node: Node<'a,'b>,
    path: &NodePath<'a,'b>,
    whitespace: XmlWhitespaceHandling,
    resource_id: Option<&str>,
    inputfile: Option<&str>,
    doc_num: usize,
) -> Result<(), XmlConversionError> {
    if self.config.debug {
        eprintln!("[STAM fromxml]{} extracting text for element {}", self.debugindent, path);
    }
    // Start of this element's span, in characters and in bytes.
    let mut begin = self.cursor;
    let mut bytebegin = self.text.len();
    // Trailing characters/bytes (an excluded text suffix) to leave out of the span.
    let mut end_discount = 0;
    let mut end_bytediscount = 0;
    let mut elder_siblings = SiblingCounter::default();
    if let Some(element_config) = self.config.element_config(node, path) {
        if self.config.debug {
            eprintln!("[STAM fromxml]{} matching config: {:?}", self.debugindent, element_config);
        }
        if (element_config.stop == Some(false) || element_config.stop.is_none())
            && element_config.annotation != XmlAnnotationHandling::TextSelectorBetweenMarkers
        {
            // xml:space on the element wins, then the element configuration,
            // then whatever was inherited from the parent.
            let whitespace = match node.attribute((NS_XML, "space")) {
                Some("preserve") => XmlWhitespaceHandling::Preserve,
                Some("collapse") | Some("replace") => XmlWhitespaceHandling::Collapse,
                Some(_) => whitespace,
                None => {
                    if element_config.whitespace == XmlWhitespaceHandling::Inherit
                        || element_config.whitespace == XmlWhitespaceHandling::Unspecified
                    {
                        whitespace
                    } else {
                        element_config.whitespace
                    }
                }
            };
            self.process_textprefix(element_config, node, resource_id, inputfile, doc_num, &mut begin, &mut bytebegin)?;
            let textbegin = self.cursor;
            for child in node.children() {
                if self.config.debug {
                    eprintln!("[STAM fromxml]{} child {:?}", self.debugindent, child);
                }
                if child.is_text() && element_config.text == Some(true) {
                    let mut innertext = child.text().expect("text node must have text");
                    let mut pending_whitespace = false;
                    let mut leading_whitespace = false;
                    if whitespace == XmlWhitespaceHandling::Collapse && !innertext.is_empty() {
                        leading_whitespace = innertext.starts_with(char::is_whitespace);
                        pending_whitespace = innertext.ends_with(char::is_whitespace);
                        if innertext.chars().all(char::is_whitespace) {
                            // Nothing but whitespace: remember it, a single space is
                            // emitted once real text follows.
                            self.pending_whitespace = true;
                            if self.config.debug {
                                eprintln!(
                                    "[STAM fromxml]{} ^- all whitespace, flag pending whitespace and skipping...",
                                    self.debugindent,
                                );
                            }
                            continue;
                        }
                        innertext = innertext.trim();
                        if self.config.debug {
                            eprintln!(
                                "[STAM fromxml]{} ^- collapsed whitespace: {:?}",
                                self.debugindent,
                                innertext
                            );
                        }
                    }
                    if self.pending_whitespace || leading_whitespace {
                        let needs_space = self
                            .text
                            .chars()
                            .next_back()
                            .is_some_and(|c| !c.is_whitespace());
                        if needs_space {
                            if self.config.debug {
                                eprintln!("[STAM fromxml]{} ^- outputting pending whitespace",self.debugindent);
                            }
                            // A pending space that precedes the element's very first
                            // output belongs to what came before, so the span starts
                            // after it. Once the element (or one of its children) has
                            // produced text, `begin` must stay put: shifting it then
                            // would cut off the start of the element's own text.
                            let nothing_output_yet = begin == self.cursor;
                            self.text.push(' ');
                            self.cursor += 1;
                            if nothing_output_yet && self.pending_whitespace {
                                begin += 1;
                                bytebegin += 1;
                            }
                        }
                        self.pending_whitespace = false;
                    }
                    if whitespace == XmlWhitespaceHandling::Collapse {
                        // Turn every whitespace character into a space and squeeze runs
                        // of spaces into one.
                        let mut prevc = ' ';
                        let mut innertext = innertext.replace(|c: char| c.is_whitespace(), " ");
                        innertext.retain(|c| {
                            let do_retain = c != ' ' || prevc != ' ';
                            prevc = c;
                            do_retain
                        });
                        self.text += &innertext;
                        self.cursor += innertext.chars().count();
                        if self.config.debug {
                            eprintln!("[STAM fromxml]{} ^- outputting text child (collapsed whitespace), cursor is now {}: {}",self.debugindent, self.cursor, innertext);
                        }
                    } else {
                        self.text += &innertext;
                        self.cursor += innertext.chars().count();
                        if self.config.debug {
                            eprintln!("[STAM fromxml]{} ^- outputting text child, cursor is now {}: {}",self.debugindent, self.cursor, innertext);
                        }
                    }
                    // Trailing whitespace of this text node is deferred to the next output.
                    self.pending_whitespace = pending_whitespace;
                } else if child.is_element() {
                    if self.config.debug {
                        eprintln!("[STAM fromxml]{} \\- extracting text for this child", self.debugindent);
                    }
                    self.debugindent.push_str(" ");
                    let mut path = path.clone();
                    let count = elder_siblings.count(&child);
                    path.add(&child, Some(count));
                    self.extract_element_text(child, &path, whitespace, resource_id, inputfile, doc_num)?;
                    self.debugindent.pop();
                    self.debugindent.pop();
                } else {
                    if self.config.debug {
                        eprintln!("[STAM fromxml]{} ^- skipping this child node", self.debugindent);
                    }
                    continue;
                }
            }
            self.process_textsuffix(element_config, node, resource_id, inputfile, doc_num, &mut end_discount, &mut end_bytediscount, textbegin)?;
            if let Some(scope_id) = element_config.scope_id.as_ref() {
                self.scopes.insert( scope_id.clone(), (doc_num, node.id()) );
            }
        } else if element_config.annotation == XmlAnnotationHandling::TextSelectorBetweenMarkers
        {
            if self.config.debug {
                eprintln!("[STAM fromxml]{} adding to markers (textprefix={:?}, textsuffix={:?})", self.debugindent, element_config.textprefix, element_config.textsuffix);
            }
            // Markers are collected per element configuration, in document order.
            self.markers
                .entry(element_config.hash())
                .and_modify(|v| v.push((doc_num, node.id())))
                .or_insert(vec![(doc_num, node.id())]);
            self.process_textprefix(element_config, node, resource_id, inputfile, doc_num, &mut begin, &mut bytebegin)?;
            self.process_textsuffix(element_config, node, resource_id, inputfile, doc_num, &mut end_discount, &mut end_bytediscount, self.cursor)?;
        }
    } else if self.config.debug {
        eprintln!(
            "[STAM fromxml]{} WARNING: no match, skipping text extraction for element {}",
            self.debugindent,
            path
        );
    }
    if begin <= (self.cursor - end_discount) {
        let offset = Offset::simple(begin, self.cursor - end_discount);
        if self.config.debug {
            eprintln!(
                "[STAM fromxml]{} extracted text for {} @{:?}: {:?}",
                self.debugindent,
                path,
                &offset,
                &self.text[bytebegin..(self.text.len() - end_bytediscount)]
            );
        }
        self.positionmap.insert((doc_num, node.id(), PositionType::Body), offset);
        self.bytepositionmap
            .insert((doc_num, node.id(), PositionType::Body), (bytebegin, self.text.len() - end_bytediscount));
    }
    Ok(())
}
/// Renders the configured text prefix of an element (if any) and appends it to
/// the output text.
///
/// Any pending whitespace is dropped first. When the element has
/// `annotatetextprefix` entries, the span of the prefix is recorded under
/// `PositionType::TextPrefix`. Unless `include_textprefix` is `Some(true)`,
/// `begin` and `bytebegin` are advanced past the prefix so that it falls outside
/// the element's own span.
///
/// # Errors
/// A template error is re-wrapped with the prefix template and node name; any
/// other error is passed through unchanged.
fn process_textprefix<'b>(
    &mut self,
    element_config: &XmlElementConfig,
    node: Node<'a,'b>,
    resource_id: Option<&str>,
    inputfile: Option<&str>,
    doc_num: usize,
    begin: &mut usize,
    bytebegin: &mut usize
) -> Result<(), XmlConversionError> {
    let Some(textprefix) = &element_config.textprefix else {
        return Ok(());
    };
    self.pending_whitespace = false;
    if self.config.debug {
        eprintln!("[STAM fromxml]{} outputting textprefix: {:?}", self.debugindent, textprefix);
    }
    let rendered = match self.render_template(textprefix, &node, Some(self.cursor), None, resource_id, inputfile, doc_num) {
        Ok(rendered) => rendered,
        Err(XmlConversionError::TemplateError(msg, cause)) => {
            return Err(XmlConversionError::TemplateError(
                format!(
                    "whilst rendering textprefix template '{}' for node '{}': {}",
                    textprefix, node.tag_name().name(), msg
                ),
                cause,
            ));
        }
        Err(other) => return Err(other),
    };
    let prefix_chars = rendered.chars().count();
    let prefix_bytes = rendered.len();
    if !element_config.annotatetextprefix.is_empty() {
        // The prefix gets its own span so it can be annotated separately.
        let key = (doc_num, node.id(), PositionType::TextPrefix);
        self.positionmap
            .insert(key, Offset::simple(self.cursor, self.cursor + prefix_chars));
        self.bytepositionmap.insert(
            (doc_num, node.id(), PositionType::TextPrefix),
            (*bytebegin, *bytebegin + prefix_bytes),
        );
    }
    self.text.push_str(&rendered);
    self.cursor += prefix_chars;
    if element_config.include_textprefix != Some(true) {
        *begin += prefix_chars;
        *bytebegin += prefix_bytes;
    }
    Ok(())
}
/// Renders the configured text suffix of an element (if any) and appends it to
/// the output text.
///
/// The template is rendered with `textbegin` as begin and the current cursor as
/// end. When the element has `annotatetextsuffix` entries, the span of the suffix
/// is recorded under `PositionType::TextSuffix`. Pending whitespace is cleared.
/// `end_discount` / `end_bytediscount` are set to the length of the suffix (in
/// characters / bytes), or to zero when `include_textsuffix` is `Some(true)`.
/// Without a configured suffix nothing is touched.
///
/// # Errors
/// A template error is re-wrapped with the suffix template and node name; any
/// other error is passed through unchanged.
fn process_textsuffix<'b>(
    &mut self,
    element_config: &XmlElementConfig,
    node: Node<'a,'b>,
    resource_id: Option<&str>,
    inputfile: Option<&str>,
    doc_num: usize,
    end_discount: &mut usize,
    end_bytediscount: &mut usize,
    textbegin: usize,
) -> Result<(), XmlConversionError> {
    let Some(textsuffix) = &element_config.textsuffix else {
        return Ok(());
    };
    if self.config.debug {
        eprintln!("[STAM fromxml]{} outputting textsuffix: {:?}", self.debugindent, textsuffix);
    }
    let rendered = match self.render_template(
        textsuffix.as_str(),
        &node,
        Some(textbegin),
        Some(self.cursor),
        resource_id,
        inputfile,
        doc_num
    ) {
        Ok(rendered) => rendered,
        Err(XmlConversionError::TemplateError(msg, cause)) => {
            return Err(XmlConversionError::TemplateError(
                format!(
                    "whilst rendering textsuffix template '{}' for node '{}': {}",
                    textsuffix,
                    node.tag_name().name(),
                    msg
                ),
                cause,
            ));
        }
        Err(other) => return Err(other),
    };
    let suffix_chars = rendered.chars().count();
    let suffix_bytes = rendered.len();
    self.text.push_str(&rendered);
    if !element_config.annotatetextsuffix.is_empty() {
        // The suffix was just appended, so it occupies the tail of the text.
        let text_bytes = self.text.len();
        self.positionmap.insert(
            (doc_num, node.id(), PositionType::TextSuffix),
            Offset::simple(self.cursor, self.cursor + suffix_chars),
        );
        self.bytepositionmap.insert(
            (doc_num, node.id(), PositionType::TextSuffix),
            (text_bytes - suffix_bytes, text_bytes),
        );
    }
    self.cursor += suffix_chars;
    self.pending_whitespace = false;
    let suffix_in_span = element_config.include_textsuffix == Some(true);
    *end_discount = if suffix_in_span { 0 } else { suffix_chars };
    *end_bytediscount = if suffix_in_span { 0 } else { suffix_bytes };
    Ok(())
}
/// Creates the annotation for `node` (if its element configuration asks for one)
/// and then recurses into its child elements.
///
/// The annotation's text span is looked up in `self.positionmap` under
/// `PositionType::Body`. The annotation ID comes from the configured `id`
/// template, or is generated when that is absent or renders empty. Annotation
/// data is added through `add_annotationdata_to_builder`; when provenance is
/// enabled and an input file is known, a `target` data item with an XPath
/// selector is added as well.
///
/// Child elements are visited unless the configuration has `stop` set to
/// `Some(true)`. Elements without a matching configuration are skipped with a
/// warning on stderr.
///
/// # Errors
/// Returns a `ConfigError` when a text-selecting annotation is requested but no
/// text was extracted at all, a `TemplateError` when the ID template fails to
/// render, and propagates errors from data extraction and from the store.
///
/// # Panics
/// Panics on an annotation handling other than `TextSelector`,
/// `ResourceSelector` or `TextSelectorBetweenMarkers` (after `None` and
/// `Unspecified` have been ruled out).
fn extract_element_annotation<'b>(
&mut self,
node: Node<'a,'b>,
path: &NodePath<'a,'b>,
inputfile: Option<&str>,
doc_num: usize,
store: &mut AnnotationStore,
) -> Result<(), XmlConversionError> {
if self.config.debug {
eprintln!("[STAM fromxml]{} extracting annotation from {}", self.debugindent, path);
}
let mut elder_siblings = SiblingCounter::default();
if let Some(element_config) = self.config.element_config(node, &path) {
if self.config.debug {
eprintln!("[STAM fromxml]{} matching config: {:?}", self.debugindent, element_config);
}
if element_config.annotation != XmlAnnotationHandling::None
&& element_config.annotation != XmlAnnotationHandling::Unspecified
{
let mut builder = AnnotationBuilder::new();
// Span recorded for this node during text extraction, if any.
let offset = self.positionmap.get(&(doc_num, node.id(), PositionType::Body));
if element_config.annotation == XmlAnnotationHandling::TextSelector {
if let Some((beginbyte, endbyte)) = self.bytepositionmap.get(&(doc_num, node.id(), PositionType::Body)) {
if self.config.debug {
eprintln!("[STAM fromxml]{} annotation covers text {:?} (bytes {}-{})", self.debugindent, offset, beginbyte, endbyte);
}
} else if self.text.is_empty() {
return Err(XmlConversionError::ConfigError("Can't extract annotations on text if no text was extracted!".into()));
}
}
// Begin and end as plain numbers for use in templates; only begin-aligned
// cursors are passed on.
let begin = if let Some(offset) = offset {
if let Cursor::BeginAligned(begin) = offset.begin {
Some(begin)
} else {
None
}
} else {
None
};
let end = if let Some(offset) = offset {
if let Cursor::BeginAligned(end) = offset.end {
Some(end)
} else {
None
}
} else {
None
};
let resource_id = if let Some(resource_handle) = self.resource_handle {
store.resource(resource_handle).unwrap().id()
} else {
None
};
let mut have_id = false;
if let Some(template) = &element_config.id {
let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
let compiled_template = self.template_engine.template(template.as_str());
let id = compiled_template.render(&context).to_string().map_err(|e|
XmlConversionError::TemplateError(
format!(
"whilst rendering id template '{}' for node '{}'",
template,
node.tag_name().name(),
),
Some(e),
)
)?;
// An ID template that renders to nothing counts as "no ID".
if !id.is_empty() {
builder = builder.with_id(id);
have_id = true;
}
}
if !have_id {
// Fall back to a generated ID, prefixed with the resource ID when known.
if let Some(resource_id) = resource_id {
builder = builder.with_id(stam::generate_id(&format!("{}-",resource_id), ""));
} else {
builder = builder.with_id(stam::generate_id("", ""));
}
}
builder = self.add_annotationdata_to_builder(element_config.annotationdata.iter(), builder, node.clone(), begin, end, resource_id, inputfile, doc_num)?;
if self.config.provenance && inputfile.is_some() {
// Point back at the source element: by xml:id when present, otherwise by
// the full path.
let path_string = if let Some(id) = node.attribute((NS_XML,"id")) {
format!("//{}[@xml:id=\"{}\"]", self.get_node_name_for_xpath(&node), id)
} else {
path.format_as_xpath(&self.prefixes)
};
let databuilder = AnnotationDataBuilder::new().with_dataset(CONTEXT_ANNO.into()).with_key("target".into()).with_value(
BTreeMap::from([
("source".to_string(),inputfile.unwrap().into()),
("selector".to_string(),
BTreeMap::from([
("type".to_string(),"XPathSelector".into()),
("value".to_string(),path_string.into())
]).into()
)
]).into()
);
builder = builder.with_data_builder(databuilder);
}
match element_config.annotation {
XmlAnnotationHandling::TextSelector => {
// No annotation is made when no span was recorded for this node.
if let Some(selector) = self.textselector(node, doc_num, PositionType::Body) {
builder = builder.with_target(selector);
if self.config.debug {
eprintln!("[STAM fromxml] builder AnnotateText: {:?}", builder);
}
store.annotate(builder)?;
}
if !element_config.annotatetextprefix.is_empty() || !element_config.annotatetextsuffix.is_empty() {
self.annotate_textaffixes(node, element_config, inputfile, doc_num, store)?;
}
}
XmlAnnotationHandling::ResourceSelector => {
builder = builder.with_target(SelectorBuilder::ResourceSelector(
self.resource_handle.into(),
));
if self.config.debug {
eprintln!("[STAM fromxml] builder AnnotateResource: {:?}", builder);
}
store.annotate(builder)?;
}
XmlAnnotationHandling::TextSelectorBetweenMarkers => {
// Unlike the TextSelector case, prefix/suffix annotations are only made
// here when the marker selector itself could be built.
if let Some(selector) =
self.textselector_for_markers(node, doc_num, store, element_config)
{
builder = builder.with_target(selector);
if self.config.debug {
eprintln!(
"[STAM fromxml] builder TextSelectorBetweenMarkers: {:?}",
builder
);
}
store.annotate(builder)?;
if !element_config.annotatetextprefix.is_empty() || !element_config.annotatetextsuffix.is_empty() {
self.annotate_textaffixes(node, element_config, inputfile, doc_num, store)?;
}
}
}
_ => panic!(
"Invalid annotationhandling: {:?}",
element_config.annotation
),
}
}
if element_config.stop == Some(false) || element_config.stop.is_none() {
for child in node.children() {
if child.is_element() {
self.debugindent.push_str(" ");
let mut path = path.clone();
let count = elder_siblings.count(&child);
path.add(&child, Some(count));
self.extract_element_annotation(child, &path, inputfile, doc_num, store)?;
self.debugindent.pop();
self.debugindent.pop();
}
}
}
} else {
// NOTE(review): this warning is printed regardless of the debug flag, whereas
// the equivalent warning during text extraction is debug-only; confirm that
// the difference is intended.
eprintln!(
"[STAM fromxml]{} WARNING: no match, skipping annotation extraction for element {}",
self.debugindent,
path
);
}
Ok(())
}
/// Adds one data item per annotation-data configuration to `builder`.
///
/// For every configuration in `iter` the set, key and value are rendered
/// against `node`:
/// * set: rendered from its template; without a template the configured default
///   set is used. A template that renders empty leaves the set unspecified.
/// * key: an empty or failed rendering skips the whole item when
///   `skip_if_missing` is set, and is an error otherwise.
/// * value: obtained through `extract_value`; `None` skips the whole item.
///
/// With `multiple` set, a list value yields one data item per list element.
/// Note that with `multiple` set, an item whose value is not a list is never
/// added to the builder.
///
/// # Errors
/// Returns a `TemplateError` when the set or key template fails (subject to
/// `skip_if_missing` for the key) and propagates errors from `extract_value`.
fn add_annotationdata_to_builder<'input>(&self, iter: impl Iterator<Item = &'a XmlAnnotationDataConfig>,
    mut builder: AnnotationBuilder<'a>,
    node: Node<'a, 'input>,
    begin: Option<usize>,
    end: Option<usize>,
    resource_id: Option<&str>,
    inputfile: Option<&str>,
    doc_num: usize,
) -> Result<AnnotationBuilder<'a>, XmlConversionError> {
    for annotationdata in iter {
        let mut databuilder = AnnotationDataBuilder::new();

        match &annotationdata.set {
            Some(template) => {
                let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
                let compiled_template = self.template_engine.template(template.as_str());
                let dataset = compiled_template.render(&context).to_string().map_err(|err| {
                    XmlConversionError::TemplateError(
                        format!(
                            "whilst rendering annotationdata/dataset template '{}' for node '{}'",
                            template,
                            node.tag_name().name(),
                        ),
                        Some(err),
                    )
                })?;
                if !dataset.is_empty() {
                    databuilder = databuilder.with_dataset(dataset.into())
                }
            }
            None => {
                databuilder = databuilder.with_dataset(self.config.default_set.as_str().into());
            }
        }

        if let Some(template) = &annotationdata.key {
            let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
            let compiled_template = self.template_engine.template(template.as_str());
            let describe = || {
                format!(
                    "whilst rendering annotationdata/key template '{}' for node '{}'",
                    template,
                    node.tag_name().name(),
                )
            };
            match compiled_template.render(&context).to_string() {
                Ok(key) if !key.is_empty() => databuilder = databuilder.with_key(key.into()),
                // No usable key, but the configuration allows dropping the item.
                Ok(_) | Err(_) if annotationdata.skip_if_missing => continue,
                Ok(_) => return Err(XmlConversionError::TemplateError(describe(), None)),
                Err(err) => return Err(XmlConversionError::TemplateError(describe(), Some(err))),
            }
        }

        if let Some(value) = &annotationdata.value {
            let extracted = self.extract_value(
                value,
                node,
                annotationdata.allow_empty_value,
                annotationdata.skip_if_missing,
                annotationdata.valuetype.as_deref(),
                begin,
                end,
                resource_id,
                inputfile,
                doc_num,
            )?;
            match extracted {
                Some(DataValue::List(values)) if annotationdata.multiple => {
                    // Same set and key for every element, one data item each.
                    for item in values {
                        builder = builder.with_data_builder(databuilder.clone().with_value(item));
                    }
                }
                Some(single) => databuilder = databuilder.with_value(single),
                None => continue,
            }
        }

        if !annotationdata.multiple {
            builder = builder.with_data_builder(databuilder);
        }
    }
    Ok(builder)
}
/// Creates the separate annotations on an element's text prefix and text suffix.
///
/// For each of the two affixes that has annotation data configured
/// (`annotatetextprefix` / `annotatetextsuffix`) and for which a span was
/// recorded in `self.positionmap`, an annotation with a generated ID is built
/// from that data and added to `store`, targeting the affix span. An affix
/// without a recorded span is silently skipped.
///
/// # Errors
/// Returns a `ConfigError` when no text selector can be built for a recorded
/// affix span, and propagates errors from data extraction and from the store.
fn annotate_textaffixes<'b>(
    &mut self,
    node: Node<'a,'b>,
    element_config: &XmlElementConfig,
    inputfile: Option<&str>,
    doc_num: usize,
    store: &mut AnnotationStore,
) -> Result<(), XmlConversionError> {
    if !element_config.annotatetextprefix.is_empty() {
        let mut builder = AnnotationBuilder::new().with_id(stam::generate_id("textprefix-", ""));
        if let Some(offset) = self.positionmap.get(&(doc_num, node.id(), PositionType::TextPrefix)) {
            // Only begin-aligned cursors are exposed to the templates.
            let begin = if let Cursor::BeginAligned(begin) = offset.begin {
                Some(begin)
            } else {
                None
            };
            let end = if let Cursor::BeginAligned(end) = offset.end {
                Some(end)
            } else {
                None
            };
            builder = self.add_annotationdata_to_builder(element_config.annotatetextprefix.iter(), builder, node.clone(), begin, end, None, inputfile, doc_num)?;
            if let Some(selector) = self.textselector(node, doc_num, PositionType::TextPrefix) {
                builder = builder.with_target(selector);
                if self.config.debug {
                    eprintln!("[STAM fromxml] builder AnnotateText: {:?}", builder);
                }
                store.annotate(builder)?;
            } else {
                return Err(XmlConversionError::ConfigError("Failed to create textselector to target textprefix".into()));
            }
        }
    }
    if !element_config.annotatetextsuffix.is_empty() {
        let mut builder = AnnotationBuilder::new().with_id(stam::generate_id("textsuffix-", ""));
        if let Some(offset) = self.positionmap.get(&(doc_num, node.id(), PositionType::TextSuffix)) {
            let begin = if let Cursor::BeginAligned(begin) = offset.begin {
                Some(begin)
            } else {
                None
            };
            let end = if let Cursor::BeginAligned(end) = offset.end {
                Some(end)
            } else {
                None
            };
            builder = self.add_annotationdata_to_builder(element_config.annotatetextsuffix.iter(), builder, node.clone(), begin, end, None, inputfile, doc_num)?;
            if let Some(selector) = self.textselector(node, doc_num, PositionType::TextSuffix) {
                builder = builder.with_target(selector);
                if self.config.debug {
                    eprintln!("[STAM fromxml] builder AnnotateText: {:?}", builder);
                }
                store.annotate(builder)?;
            } else {
                // This branch concerns the suffix; the message used to name the prefix.
                return Err(XmlConversionError::ConfigError("Failed to create textselector to target textsuffix".into()));
            }
        }
    }
    Ok(())
}
/// Turns a configured TOML value into a data value for `node`.
///
/// * Strings are templates: they are rendered against `node` and converted with
///   `string_to_datavalue` using `valuetype`. An empty result yields `None`
///   unless `allow_empty_value` is set.
/// * Tables and arrays are converted element by element (recursively, with
///   `allow_empty_value = false`, `skip_if_missing = true` and no value type);
///   elements that yield `None` are left out.
/// * Booleans, floats and integers are passed on as they are.
///
/// Returns `Ok(None)` when there is no value and the item should be skipped.
///
/// # Errors
/// A failed template rendering is an error unless `skip_if_missing` is set; in
/// that case it yields an empty string when `allow_empty_value` is set and
/// `None` otherwise. TOML datetime values are not supported and are reported as
/// a `ConfigError` rather than aborting the whole conversion with a panic.
fn extract_value<'b>(&self, value: &'a toml::Value, node: Node<'a,'b>, allow_empty_value: bool, skip_if_missing: bool, valuetype: Option<&str>, begin: Option<usize>, end: Option<usize>, resource_id: Option<&str>, inputfile: Option<&str>, doc_num: usize) -> Result<Option<DataValue>, XmlConversionError>{
    match value {
        toml::Value::String(template) => {
            let context = self.context_for_node(&node, begin, end, template.as_str(), resource_id, inputfile, doc_num);
            let compiled_template = self.template_engine.template(template.as_str());
            match compiled_template.render(&context).to_string().map_err(|e|
                XmlConversionError::TemplateError(
                    format!(
                        "whilst rendering annotationdata/map template '{}' for node '{}'.{}",
                        template,
                        node.tag_name().name(),
                        if self.config.debug() {
                            format!("\nContext was {:?}.\nVariables are: {:?}", context, self.variables.get(template))
                        } else {
                            String::new()
                        }
                    ),
                    Some(e),
                )
            ) {
                Ok(value) => {
                    if !value.is_empty() || allow_empty_value {
                        string_to_datavalue(value, valuetype).map(Some)
                    } else {
                        Ok(None)
                    }
                },
                Err(e) if !skip_if_missing => {
                    Err(e)
                },
                Err(_) if allow_empty_value => {
                    Ok(Some("".into()))
                },
                Err(_) => {
                    Ok(None)
                }
            }
        },
        toml::Value::Table(map) => {
            let mut resultmap: BTreeMap<String,DataValue> = BTreeMap::new();
            for (key, value) in map.iter() {
                if let Some(value) = self.extract_value(value, node, false, true, None, begin, end, resource_id, inputfile, doc_num)? {
                    resultmap.insert(key.clone(), value);
                }
            }
            Ok(Some(resultmap.into()))
        },
        toml::Value::Array(list) => {
            let mut resultlist: Vec<DataValue> = Vec::new();
            for value in list.iter() {
                if let Some(value) = self.extract_value(value, node, false, true, None, begin, end, resource_id, inputfile, doc_num)? {
                    resultlist.push(value);
                }
            }
            Ok(Some(resultlist.into()))
        }
        toml::Value::Boolean(v) => Ok(Some(DataValue::Bool(*v))),
        toml::Value::Float(v) => Ok(Some(DataValue::Float(*v))),
        toml::Value::Integer(v) => Ok(Some(DataValue::Int(*v as isize))),
        // The value comes straight from the user's configuration file, so an
        // unsupported kind is a configuration problem, not a reason to panic.
        toml::Value::Datetime(_v) => Err(XmlConversionError::ConfigError(
            "fromxml: Datetime conversion not implemented yet".into(),
        )),
    }
}
/// Turns a configured TOML value into a data value for metadata, rendering
/// templates against the given, ready-made `context`.
///
/// * Strings are templates; the rendered text becomes a string value. An empty
///   result yields `None` unless `allow_empty_value` is set.
/// * Tables and arrays are converted element by element (recursively, with
///   `allow_empty_value = false` and `skip_if_missing = true`); elements that
///   yield `None` are left out.
/// * Booleans, floats and integers are passed on as they are.
///
/// `resource_id` is only handed down to recursive calls.
///
/// # Errors
/// A failed template rendering is an error unless `skip_if_missing` is set; in
/// that case it yields an empty string when `allow_empty_value` is set and
/// `None` otherwise. TOML datetime values are not supported and are reported as
/// a `ConfigError` rather than aborting the whole conversion with a panic.
fn extract_value_metadata<'b>(&self, value: &'a toml::Value, context: &upon::Value, allow_empty_value: bool, skip_if_missing: bool, resource_id: Option<&str>) -> Result<Option<DataValue>, XmlConversionError>{
    match value {
        toml::Value::String(template) => {
            let compiled_template = self.template_engine.template(template.as_str());
            match compiled_template.render(&context).to_string().map_err(|e|
                XmlConversionError::TemplateError(
                    format!(
                        "whilst rendering annotationdata/metadata template '{}' for metadata",
                        template,
                    ),
                    Some(e),
                )
            ) {
                Ok(value) => {
                    if !value.is_empty() || allow_empty_value {
                        Ok(Some(value.into()))
                    } else {
                        Ok(None)
                    }
                },
                Err(e) if !skip_if_missing => {
                    Err(e)
                },
                Err(_) if allow_empty_value => {
                    Ok(Some("".into()))
                },
                Err(_) => {
                    Ok(None)
                }
            }
        },
        toml::Value::Table(map) => {
            let mut resultmap: BTreeMap<String,DataValue> = BTreeMap::new();
            for (key, value) in map.iter() {
                if let Some(value) = self.extract_value_metadata(value, context, false, true, resource_id)? {
                    resultmap.insert(key.clone(), value);
                }
            }
            Ok(Some(resultmap.into()))
        },
        toml::Value::Array(list) => {
            let mut resultlist: Vec<DataValue> = Vec::new();
            for value in list.iter() {
                if let Some(value) = self.extract_value_metadata(value, context, false, true, resource_id)? {
                    resultlist.push(value);
                }
            }
            Ok(Some(resultlist.into()))
        }
        toml::Value::Boolean(v) => Ok(Some(DataValue::Bool(*v))),
        toml::Value::Float(v) => Ok(Some(DataValue::Float(*v))),
        toml::Value::Integer(v) => Ok(Some(DataValue::Int(*v as isize))),
        // The value comes straight from the user's configuration file, so an
        // unsupported kind is a configuration problem, not a reason to panic.
        toml::Value::Datetime(_v) => Err(XmlConversionError::ConfigError(
            "fromxml: Datetime conversion not implemented yet".into(),
        )),
    }
}
/// Builds a text selector on the current resource for the span recorded for
/// `node` under the given position type.
///
/// Returns `None` when no span was recorded.
///
/// # Panics
/// Panics when no resource handle has been set.
fn textselector<'s>(&'s self, node: Node, doc_num: usize, positiontype: PositionType) -> Option<SelectorBuilder<'s>> {
    let res_handle = self.resource_handle.expect("resource must be associated");
    self.positionmap
        .get(&(doc_num, node.id(), positiontype))
        .map(|offset| SelectorBuilder::TextSelector(BuildItem::Handle(res_handle), offset.clone()))
}
/// Builds a text selector that runs from the marker `node` up to the next marker
/// of the same element configuration.
///
/// The end of the span is, in order of preference:
/// 1. the begin of the marker that follows `node` in `self.markers`;
/// 2. the end of the element registered under the configured `marker_scope`;
/// 3. the end of the resource text, when no `marker_scope` is configured.
///
/// Returns `None` when the scope named by `marker_scope` is unknown (a warning
/// is printed), or when no begin or end position could be determined.
///
/// # Panics
/// Panics when no resource handle is set, when the resource is not in `store`,
/// or when a recorded cursor is not begin-aligned.
fn textselector_for_markers<'b>(
    &self,
    node: Node,
    doc_num: usize,
    store: &AnnotationStore,
    element_config: &'b XmlElementConfig,
) -> Option<SelectorBuilder<'b>> {
    let handle = self
        .resource_handle
        .expect("resource must have been created");
    let resource = store.resource(handle).expect("resource must exist");

    // The marker directly after this node, if this node is a known marker.
    let next_marker = self.markers.get(&element_config.hash()).and_then(|markers| {
        markers
            .iter()
            .position(|marker| marker.0 == doc_num && marker.1 == node.id())
            .and_then(|index| markers.get(index + 1))
    });
    let mut end: Option<usize> = next_marker
        .and_then(|marker| self.positionmap.get(&(marker.0, marker.1, PositionType::Body)))
        .map(|offset| {
            offset
                .begin
                .try_into()
                .expect("begin cursor must be beginaligned")
        });

    if end.is_none() {
        end = match element_config.marker_scope.as_deref() {
            Some(scope) => {
                let Some((d_num, n_id)) = self.scopes.get(scope) else {
                    eprintln!("WARNING: Undefined scope referenced in marker_scope: {}, no matching text with this `scope_id` in this document! Skipping last marker!", scope);
                    return None;
                };
                self.positionmap
                    .get(&(*d_num, *n_id, PositionType::Body))
                    .map(|offset| {
                        offset
                            .end
                            .try_into()
                            .expect("end cursor must be beginaligned")
                    })
            }
            None => Some(resource.textlen()),
        };
    }

    let offset = self.positionmap.get(&(doc_num, node.id(), PositionType::Body))?;
    let end = end?;
    let begin = offset
        .begin
        .try_into()
        .expect("begin cursor must be beginaligned");
    Some(SelectorBuilder::TextSelector(
        BuildItem::Handle(handle),
        Offset::simple(begin, end),
    ))
}
/// Fills the global template context with the values taken from the
/// configuration: `context` (the user-defined context, converted with
/// `map_value`), `namespaces` and `default_set`.
fn set_global_context(&mut self) {
    let user_context = self
        .config
        .context
        .iter()
        .map(|(key, value)| (key.clone(), map_value(value)))
        .collect();
    let namespaces: upon::Value = self.config.namespaces.clone().into();
    let default_set: upon::Value = self.config.default_set.clone().into();

    self.global_context
        .insert("context".into(), upon::Value::Map(user_context));
    self.global_context.insert("namespaces".into(), namespaces);
    self.global_context.insert("default_set".into(), default_set);
}
/// Renders `template` for `node`.
///
/// A template without any `{` is returned as-is (borrowed), without consulting
/// the template engine. Otherwise the template registered under this exact
/// source text is rendered with the context built by `context_for_node`.
///
/// # Errors
/// Rendering errors are converted into `XmlConversionError`.
fn render_template<'input, 't>(
    &self,
    template: &'t str,
    node: &Node<'a, 'input>,
    begin: Option<usize>,
    end: Option<usize>,
    resource: Option<&str>,
    inputfile: Option<&str>,
    doc_num: usize,
) -> Result<Cow<'t, str>, XmlConversionError> {
    if !template.contains('{') {
        // Plain text, nothing to substitute.
        return Ok(Cow::Borrowed(template));
    }
    let compiled_template = self.template_engine.template(template);
    let context = self.context_for_node(node, begin, end, template, resource, inputfile, doc_num);
    let rendered = compiled_template.render(context).to_string()?;
    Ok(Cow::Owned(rendered))
}
/// Builds the template context for rendering `template` on `node`.
///
/// The context starts out as a copy of the global context and is extended with:
/// * `localname`, `name` and (if the node has one) `namespace`;
/// * `begin`, `end` and `length`, as far as the positions are given;
/// * `resource` and `inputfile`, when given;
/// * `doc_num`;
/// * one entry per variable registered for `template` in `self.variables`,
///   resolved through `context_for_var` and stored under the encoded variable
///   name. Variables that cannot be resolved, or that resolve to
///   `upon::Value::None`, are left out.
fn context_for_node<'input>(
    &self,
    node: &Node<'a, 'input>,
    begin: Option<usize>,
    end: Option<usize>,
    template: &str,
    resource: Option<&str>,
    inputfile: Option<&str>,
    doc_num: usize,
) -> upon::Value {
    let mut context = self.global_context.clone();
    let length = begin.zip(end).map(|(from, to)| to - from);

    let tag = node.tag_name();
    context.insert("localname".into(), tag.name().into());
    context.insert("name".into(), self.get_node_name_for_template(node).into());
    if let Some(namespace) = tag.namespace() {
        context.insert("namespace".into(), namespace.into());
    }

    for (key, position) in [("begin", begin), ("end", end), ("length", length)] {
        if let Some(position) = position {
            context.insert(key.into(), upon::Value::Integer(position as i64));
        }
    }
    for (key, text) in [("resource", resource), ("inputfile", inputfile)] {
        if let Some(text) = text {
            context.insert(key.into(), text.into());
        }
    }
    context.insert("doc_num".into(), upon::Value::Integer(doc_num as i64));

    let Some(vars) = self.variables.get(template) else {
        return upon::Value::Map(context);
    };
    for var in vars {
        // Receives the encoded name under which the value is stored.
        let mut encodedvar = String::new();
        match self.context_for_var(node, var, &mut encodedvar, false) {
            Some(value) => {
                if self.config.debug() {
                    eprintln!(
                        "[STAM fromxml] Set context variable for template '{}' for node '{}': {}={:?} (encodedvar={})",
                        template,
                        node.tag_name().name(),
                        var,
                        value,
                        encodedvar
                    );
                }
                if value != upon::Value::None {
                    context.insert(encodedvar, value);
                }
            }
            None => {
                if self.config.debug() {
                    eprintln!(
                        "[STAM fromxml] Missed context variable for template '{}' for node '{}': {}",
                        template,
                        node.tag_name().name(),
                        var
                    );
                }
            }
        }
    }
    upon::Value::Map(context)
}
/// Resolves the template variable `var` relative to `node`.
///
/// `var` is a slash-separated path that is consumed one component per
/// (recursive) call. While resolving, an encoded form of the variable is
/// appended to `path`; an empty `path` marks the outermost call.
///
/// Recognised components:
/// * an empty component: on the outermost call with a non-empty remainder (i.e. a
///   leading `/`) resolution restarts at the topmost ancestor element, whose
///   local name must equal the next component (or that component must be `*`);
///   otherwise the result of `recursive_text(node)` is returned;
/// * `@attr` / `@prefix:attr`: the attribute of that name on `node`;
/// * `..`: continue at the parent element;
/// * `.`: continue at `node` itself, or return `recursive_text(node)` when
///   nothing follows;
/// * anything else: a child element name, optionally with a namespace prefix and
///   a `[...]` condition (see `extract_condition`).
///
/// With `return_all_matches` set (forced on the outermost call by a `$$` or
/// `?.$$` prefix) the values of all matching children are collected into a list;
/// otherwise the first matching child that yields a value wins.
///
/// Returns `None` when the variable cannot be resolved.
fn context_for_var<'input>(
&self,
node: &Node<'a, 'input>,
var: &str,
path: &mut String,
mut return_all_matches: bool,
) -> Option<upon::Value> {
let first = path.is_empty();
// Strip the variable sigil; on the outermost call it also determines how the
// encoded name starts.
let var = if var.starts_with("?.$$") {
if first {
path.push_str("?.ELEMENTS_");
return_all_matches = true;
if self.config.debug {
eprintln!("[STAM fromxml] will return all matches for {}", var);
}
};
&var[4..]
} else if var.starts_with("?.$") {
if first {
path.push_str("?.ELEMENT_");
};
&var[3..]
} else if var.starts_with("$$") {
if first {
path.push_str("ELEMENTS_");
return_all_matches = true;
if self.config.debug {
eprintln!("[STAM fromxml] will return all matches for {}", var);
}
};
&var[2..]
} else if var.starts_with("$") {
if first {
path.push_str("ELEMENT_");
};
&var[1..]
} else if var.starts_with("?.@") {
if first {
path.push_str("?.");
};
// Only "?." is stripped here, the '@' stays so the attribute branch below applies.
&var[2..]
} else {
var
};
// Separator between the encoded components of a multi-step path.
if !first && !var.is_empty() && !path.ends_with("ELEMENT_") && !path.ends_with("ELEMENTS_"){
path.push_str("_IN_");
}
let (component, remainder) = var.split_once("/").unwrap_or((var,""));
if component.is_empty() {
if first && !remainder.is_empty() {
// Leading slash: walk up to the topmost ancestor element.
let mut n = node.clone();
while let Some(parentnode) = n.parent_element() {
n = parentnode;
}
let (rootcomponent, remainder) = remainder.split_once("/").unwrap_or((remainder,""));
let (prefix, localname) = if let Some(pos) = rootcomponent.find(":") {
(Some(&rootcomponent[0..pos]), &rootcomponent[pos+1..])
} else {
(None, rootcomponent)
};
// Only the local name is compared here; the prefix merely ends up in the
// encoded name.
if localname != n.tag_name().name() && localname != "*" {
None
} else {
if let Some(prefix) = prefix {
path.push_str(prefix);
path.push_str("__");
}
path.push_str(localname);
self.context_for_var(&n, remainder, path, return_all_matches)
}
} else {
Some(recursive_text(node).into())
}
} else if component.starts_with("@"){
if let Some(pos) = component.find(":") {
// Namespaced attribute: the prefix must be declared in the configuration.
let prefix = &component[1..pos];
if let Some(ns) = self.config.namespaces.get(prefix) {
let var = &component[pos+1..];
path.push_str("ATTRIB_");
path.push_str(prefix);
path.push_str("__");
path.push_str(var);
Some(
node.attribute((ns.as_str(),var)).into()
)
} else {
None
}
} else {
let var = &component[1..];
path.push_str("ATTRIB_");
path.push_str(var);
Some(
node.attribute(var).into()
)
}
} else if component == ".." {
if let Some(parentnode) = node.parent_element().as_ref() {
path.push_str("PARENT");
self.context_for_var(parentnode, remainder, path, return_all_matches)
} else {
None
}
} else if component == "." {
path.push_str("THIS");
if !remainder.is_empty() {
self.context_for_var(node, remainder, path, return_all_matches)
} else {
Some(recursive_text(node).into())
}
} else {
// A child element step, optionally "prefix:name" and optionally with a condition.
let (prefix, localname) = if let Some(pos) = component.find(":") {
(Some(&component[0..pos]), &component[pos+1..])
} else {
(None, component)
};
let localname_with_condition = localname;
let (localname, condition_str, condition) = self.extract_condition(localname_with_condition); let mut multiple_value_buffer: Vec<upon::Value> = Vec::new(); let mut final_path: String = String::new(); for child in node.children() {
if child.is_element() {
let namedata = child.tag_name();
// A namespaced child matches only through a known prefix for its namespace;
// a child without namespace is compared by name alone.
let mut child_matches = if let Some(namespace) = namedata.namespace() {
if let Some(foundprefix) = self.prefixes.get(namespace) {
Some(foundprefix.as_str()) == prefix && localname == namedata.name()
} else {
false
}
} else {
namedata.name() == localname
};
if child_matches {
if let Some((attribname, negate, attribvalue)) = condition {
if let Some(pos) = attribname.find(":") {
let prefix = &attribname[0..pos];
if let Some(ns) = self.config.namespaces.get(prefix) {
let attribname = &attribname[pos+1..];
if let Some(value) = child.attribute((ns.as_str(),attribname)) {
if !negate && attribvalue != Some(value) {
child_matches = false;
} else if negate && attribvalue == Some(value) {
child_matches = false;
}
} else {
// A child lacking the attribute never matches, negated or not.
child_matches = false;
}
} else {
child_matches = false;
}
} else {
if let Some(value) = child.attribute(attribname) {
if !negate && attribvalue != Some(value) {
child_matches = false;
} else if negate && attribvalue == Some(value) {
child_matches = false;
}
} else {
child_matches = false;
}
}
}
if !child_matches && self.config.debug {
eprintln!("[STAM fromxml] candidate node does not meet condition: {}", localname_with_condition);
}
}
if child_matches {
// Remember where this step starts, so it can be undone when the child
// does not lead to a value (or when more matches are collected).
let prevpathlen = path.len();
if let Some(prefix) = prefix {
path.push_str(prefix);
path.push_str("__");
}
path.push_str(localname);
if condition.is_some() {
// The condition is encoded as a hash of its text.
let mut hasher = DefaultHasher::new();
condition_str.hash(&mut hasher);
let h = hasher.finish();
path.push_str(&format!("_COND{}_", h));
}
if let Some(value) = self.context_for_var(&child, remainder, path, return_all_matches) {
if return_all_matches {
// Nested lists are flattened into the collected result.
if let upon::Value::List(v) = value {
multiple_value_buffer.extend(v.into_iter());
} else {
multiple_value_buffer.push(value);
}
// All matches share one encoded name: that of the first match.
if final_path.is_empty() {
final_path = path.clone();
}
} else {
return Some(value);
}
}
path.truncate(prevpathlen);
}
}
}
if !multiple_value_buffer.is_empty() {
if self.config.debug {
eprintln!("[STAM fromxml] returning multiple matches of {} as list", var);
}
*path = final_path;
Some(multiple_value_buffer.into())
} else {
if self.config.debug {
eprintln!("[STAM fromxml] returning with no match found for {} in {}", var, node.tag_name().name());
}
None
}
}
}
/// Splits an optional trailing `[...]` condition off a path component.
///
/// Returns a triple of:
/// * the component name without the condition,
/// * the raw condition text between the brackets (`""` when there is none),
/// * the parsed condition as `(attribute name, negated, expected value)`, or `None` when the
///   component carries no condition.
///
/// A single leading `@` is dropped from the attribute name, a `!` directly before `=` sets the
/// negation flag, and a condition without `=` has no expected value. Surrounding quotes
/// (`"` or `'`) are removed from the expected value only when they are actually present, so an
/// empty, one-character or unquoted value is passed through instead of causing a slicing panic.
fn extract_condition<'b>(&self, localname: &'b str) -> (&'b str, &'b str, Option<(&'b str, bool, Option<&'b str>)>) {
    if !localname.ends_with(']') {
        return (localname, "", None);
    }
    let Some(pos) = localname.find('[') else {
        return (localname, "", None);
    };
    // `[` precedes the final `]`, so this range is always well-formed
    let condition = &localname[pos + 1..localname.len() - 1];
    let (attrib, negation, attribvalue) = match condition.split_once('=') {
        Some((attrib, value)) => {
            let attrib = attrib.trim();
            let value = value.trim();
            let is_quote = |c: char| matches!(c, '"' | '\'');
            let value = if value.len() >= 2 && value.starts_with(is_quote) && value.ends_with(is_quote) {
                // quotes are ASCII, so cutting one byte off each side stays on char boundaries
                &value[1..value.len() - 1]
            } else {
                value
            };
            match attrib.strip_suffix('!') {
                Some(negated) => (negated.trim(), true, Some(value)),
                None => (attrib, false, Some(value)),
            }
        }
        None => (condition, false, None),
    };
    let attrib = attrib.strip_prefix('@').unwrap_or(attrib);
    (&localname[..pos], condition, Some((attrib, negation, attribvalue)))
}
/// Returns the name of `node` in the form used inside templates.
///
/// When the node's namespace has an entry in `self.prefixes`, the result is
/// `prefix__tagname`; otherwise (no namespace, or no prefix for it) it is the bare tag name.
fn get_node_name_for_template<'b>(&self, node: &'b Node) -> Cow<'b,str> {
    let expanded = node.tag_name();
    let tagname = expanded.name();
    let known_prefix = expanded
        .namespace()
        .and_then(|namespace| self.prefixes.get(namespace));
    if let Some(prefix) = known_prefix {
        Cow::Owned(format!("{}__{}", prefix, tagname))
    } else {
        Cow::Borrowed(tagname)
    }
}
/// Returns the name of `node` in XPath-like notation.
///
/// When the node's namespace has an entry in `self.prefixes`, the result is
/// `prefix:tagname`; otherwise (no namespace, or no prefix for it) it is the bare tag name.
fn get_node_name_for_xpath<'b>(&self, node: &'b Node) -> Cow<'b,str> {
    let expanded = node.tag_name();
    let tagname = expanded.name();
    let known_prefix = expanded
        .namespace()
        .and_then(|namespace| self.prefixes.get(namespace));
    if let Some(prefix) = known_prefix {
        Cow::Owned(format!("{}:{}", prefix, tagname))
    } else {
        Cow::Borrowed(tagname)
    }
}
/// Rewrites the variable syntax inside the `{{ ... }}` and `{% ... %}` blocks of `template`.
///
/// The text inside each block is passed through `precompile_inblock`; text outside the blocks
/// is copied unchanged. The set of variables found is stored in `self.variables` under the
/// original template string. Returns the rewritten template, or the borrowed input when
/// nothing was written to the replacement buffer.
///
/// The scan visits character boundaries only, so templates containing multi-byte UTF-8
/// characters no longer trigger a slicing panic. A closing delimiter that appears before any
/// block content could have started is ignored rather than producing a reversed slice range.
fn precompile(&mut self, template: &'a str) -> Cow<'a,str> {
    let mut replacement = String::new();
    let mut variables: BTreeSet<&'a str> = BTreeSet::new();
    let mut begin = 0;
    let mut end = 0;
    // The delimiters are ASCII, so they can only start on a char boundary
    for (i, _) in template.char_indices() {
        let slice = &template[i..];
        if slice.starts_with("{{") || slice.starts_with("{%") {
            begin = i;
        } else if (slice.starts_with("}}") || slice.starts_with("%}")) && i >= begin + 2 {
            if end < begin + 2 {
                // copy everything since the previous block, including this block's opener
                replacement.push_str(&template[end..begin + 2]);
            }
            // the part between the opening and closing delimiters
            let inner = &template[begin + 2..i];
            replacement.push_str(&self.precompile_inblock(inner, &mut variables));
            end = i;
        }
    }
    if end > 0 {
        // remainder of the template, starting at the last closing delimiter
        replacement.push_str(&template[end..]);
    }
    self.variables.insert(template.into(), variables);
    if !replacement.is_empty() {
        Cow::Owned(replacement)
    } else {
        Cow::Borrowed(template)
    }
}
/// Rewrites the variables inside a single template block (the text between the delimiters).
///
/// A variable starts at an unquoted `@` or `$` and extends over alphanumerics and the
/// characters `$ . / _ : @`, plus any bracketed `[...]` condition. Each variable found is added
/// to `vars` in its original spelling and replaced in the output by `precompile_name`.
/// Text inside double quotes is never treated as a variable.
///
/// Returns the rewritten block, or the borrowed input when no variable was replaced.
///
/// A variable that runs up to the very end of the block is handled like any other: the text
/// preceding it is kept and the raw variable text is not emitted alongside its replacement.
fn precompile_inblock<'s>(&self, s: &'s str, vars: &mut BTreeSet<&'s str>) -> Cow<'s,str> {
    let mut quoted = false;
    let mut var = false;
    let mut begin = 0;
    let mut end = 0;
    let mut replacement = String::new();
    let mut in_condition = false;
    for (i, c) in s.char_indices() {
        if in_condition {
            // Everything inside [...] belongs to the variable; only `]` ends the condition.
            if c == ']' {
                in_condition = false;
            }
            continue;
        }
        if c == '"' {
            quoted = !quoted;
        } else if !quoted {
            if !var && (c == '@' || c == '$') {
                var = true;
                begin = i;
            } else if var && c == '[' {
                in_condition = true;
            } else if var && (!c.is_alphanumeric() && c != '$' && c != '.' && c != '/' && c != '_' && c != ':' && c != '@') {
                // first character that cannot be part of a variable: the variable ends here
                if end < begin {
                    replacement.push_str(&s[end..begin]);
                }
                let varname = &s[begin..i];
                vars.insert(varname);
                replacement += &self.precompile_name(varname);
                end = i;
                var = false;
            }
        }
    }
    if var {
        // The last variable extends to the end of the block: keep the text before it and
        // emit only its replacement.
        if end < begin {
            replacement.push_str(&s[end..begin]);
        }
        let varname = &s[begin..];
        vars.insert(varname);
        replacement += &self.precompile_name(varname);
    } else if end > 0 {
        replacement.push_str(&s[end..]);
    }
    if !replacement.is_empty() {
        Cow::Owned(replacement)
    } else {
        Cow::Borrowed(s)
    }
}
/// Translates a variable in the configuration syntax into a plain identifier.
///
/// `$`-selectors map to `ELEMENT_…`/`ELEMENTS_…` names, `@` becomes `ATTRIB_`, `/` becomes
/// `_IN_`, `:` becomes `__`, and a bracketed condition is replaced by `_COND<hash>_` where the
/// hash is computed over the text between the brackets. All other characters are copied.
fn precompile_name(&self, s: &str) -> String {
    // `$`-prefixed selectors with their replacement, most specific first.
    const SELECTORS: [(&str, &str); 7] = [
        ("$$..", "ELEMENTS_PARENT"),
        ("$$.", "ELEMENTS_THIS"),
        ("$$/", "ELEMENTS_"),
        ("$$", "ELEMENTS_"),
        ("$..", "ELEMENT_PARENT"),
        ("$.", "ELEMENT_THIS"),
        ("$/", "ELEMENT_"),
    ];
    let mut out = String::new();
    let mut condition_start: Option<usize> = None;
    let mut pending_skip = 0;
    for (idx, ch) in s.char_indices() {
        if condition_start.is_some() && ch != ']' {
            // inside a condition: consumed when the closing bracket is reached
            continue;
        }
        if pending_skip > 0 {
            // remaining characters of a selector that was already translated
            pending_skip -= 1;
            continue;
        }
        match ch {
            '$' => {
                let rest = &s[idx..];
                match SELECTORS.iter().find(|(selector, _)| rest.starts_with(*selector)) {
                    Some((selector, name)) => {
                        out.push_str(name);
                        pending_skip = selector.len() - 1;
                    }
                    None => out.push_str("ELEMENT_"),
                }
            }
            '@' => out.push_str("ATTRIB_"),
            '/' => out.push_str("_IN_"),
            ':' => out.push_str("__"),
            '[' => condition_start = Some(idx + 1),
            ']' => {
                if let Some(start) = condition_start.take() {
                    let mut hasher = DefaultHasher::new();
                    s[start..idx].hash(&mut hasher);
                    let digest = hasher.finish();
                    out.push_str(&format!("_COND{}_", digest));
                }
            }
            other => out.push(other),
        }
    }
    out
}
/// Creates one annotation in `store` for every entry in `self.config.metadata`.
///
/// For each entry the optional id template and every annotationdata item (set, key, value)
/// are rendered against a copy of `self.global_context`, extended with a `resource` variable
/// when the resource has an id. The annotation targets either the whole text of the resource
/// or the resource itself, depending on the entry's annotation handling.
///
/// # Errors
/// Returns `XmlConversionError::TemplateError` when the id or dataset template fails to
/// render, or when a key template fails or renders empty and `skip_if_missing` is not set.
/// Errors from `extract_value_metadata` and `store.annotate` are propagated.
///
/// # Panics
/// Panics when the annotation handling is not one of `TextSelector`, `ResourceSelector`,
/// `None` or `Unspecified`, and for `TextSelector` when `self.resource_handle` is unset.
fn add_metadata(&self, store: &mut AnnotationStore) -> Result<(), XmlConversionError> {
for metadata in self.config.metadata.iter() {
let mut builder = AnnotationBuilder::new();
let resource_id = if let Some(resource_handle) = self.resource_handle {
store.resource(resource_handle).unwrap().id()
} else {
None
};
// Rendering context: the global context plus the resource id, when there is one
let mut context = self.global_context.clone();
if let Some(resource_id) = resource_id {
context.insert("resource".into(), resource_id.into());
}
if let Some(template) = &metadata.id {
// NOTE(review): the template is looked up by its source text; presumably it was registered under that name earlier — confirm
let compiled_template = self.template_engine.template(template.as_str());
let id = compiled_template.render(&context).to_string().map_err(|e|
XmlConversionError::TemplateError(
format!(
"whilst rendering metadata id template '{}'",
template,
),
Some(e),
)
)?;
// An id that renders empty leaves the annotation without an explicit id
if !id.is_empty() {
builder = builder.with_id(id);
}
}
for annotationdata in metadata.annotationdata.iter() {
let mut databuilder = AnnotationDataBuilder::new();
if let Some(template) = &annotationdata.set {
let compiled_template = self.template_engine.template(template.as_str());
let dataset = compiled_template.render(&context).to_string().map_err(|e|
XmlConversionError::TemplateError(
format!(
"whilst rendering annotationdata/dataset template '{}' for metadata",
template,
),
Some(e),
)
)?;
// A dataset that renders empty is not set on the data builder at all
if !dataset.is_empty() {
databuilder = databuilder.with_dataset(dataset.into())
}
} else {
// No set template configured: fall back to the configured default set
databuilder =
databuilder.with_dataset(self.config.default_set.as_str().into());
}
if let Some(template) = &annotationdata.key {
let compiled_template = self.template_engine.template(template.as_str());
match compiled_template.render(&context).to_string().map_err(|e|
XmlConversionError::TemplateError(
format!(
"whilst rendering annotationdata/key template '{}' for metadata",
template,
),
Some(e),
)
) {
Ok(key) if !key.is_empty() =>
databuilder = databuilder.with_key(key.into()) ,
// Empty key: an error unless this data item may be skipped
Ok(_) if !annotationdata.skip_if_missing => {
return Err(XmlConversionError::TemplateError(
format!(
"whilst rendering annotationdata/key template '{}' metadata",
template,
),
None
));
},
Err(e) if !annotationdata.skip_if_missing => {
return Err(e)
},
// skip_if_missing is set: drop this data item and move on to the next one
_ => {
continue
}
}
}
if let Some(value) = &annotationdata.value {
match self.extract_value_metadata(value, &upon::Value::Map(context.clone()), annotationdata.allow_empty_value, annotationdata.skip_if_missing, resource_id.as_deref())? {
Some(value) => {
databuilder = databuilder.with_value(value);
},
// No value extracted: this data item is not added to the annotation
None => {
continue
}
}
}
builder = builder.with_data_builder(databuilder);
}
match metadata.annotation {
XmlAnnotationHandling::TextSelector => {
// Target the whole text of the resource
builder = builder.with_target(SelectorBuilder::TextSelector(BuildItem::Handle(self.resource_handle.expect("resource must have handle")), Offset::whole()));
if self.config.debug {
eprintln!("[STAM fromxml] builder AnnotateText: {:?}", builder);
}
store.annotate(builder)?;
}
// Everything that is not an explicit text selection targets the resource itself
XmlAnnotationHandling::ResourceSelector | XmlAnnotationHandling::None | XmlAnnotationHandling::Unspecified => {
builder = builder.with_target(SelectorBuilder::ResourceSelector(
self.resource_handle.into(),
));
if self.config.debug {
eprintln!("[STAM fromxml] builder AnnotateResource: {:?}", builder);
}
store.annotate(builder)?;
}
// NOTE(review): an unsupported handling in the configuration aborts the process here rather than returning an error
_ => panic!(
"Invalid annotationhandling for metadata: {:?}",
metadata.annotation
),
}
}
Ok(())
}
}
/// Concatenates all text found below `node`, descending into child elements in document order.
///
/// Children that are neither text nor elements are ignored.
fn recursive_text(node: &Node) -> String {
    node.children().fold(String::new(), |mut collected, child| {
        if child.is_text() {
            collected.push_str(child.text().expect("should have text"));
        } else if child.is_element() {
            collected.push_str(&recursive_text(&child));
        }
        collected
    })
}
/// Template filter: uppercases the first character of `s` and leaves the rest untouched.
///
/// The first character may expand to several characters when uppercased.
fn filter_capitalize(s: &str) -> String {
    let mut capitalized = String::with_capacity(s.len());
    let mut rest = s.chars();
    if let Some(first) = rest.next() {
        capitalized.extend(first.to_uppercase());
        capitalized.push_str(rest.as_str());
    }
    capitalized
}
/// Template filter: `true` when `a` is strictly greater than `b`.
///
/// Only integer/integer, float/float and string/string pairs are compared; any other
/// combination of value types yields `false`.
fn filter_gt(a: &upon::Value, b: &upon::Value) -> bool {
    let ordering = match (a, b) {
        (upon::Value::Integer(lhs), upon::Value::Integer(rhs)) => lhs.partial_cmp(rhs),
        (upon::Value::Float(lhs), upon::Value::Float(rhs)) => lhs.partial_cmp(rhs),
        (upon::Value::String(lhs), upon::Value::String(rhs)) => lhs.partial_cmp(rhs),
        _ => None,
    };
    matches!(ordering, Some(std::cmp::Ordering::Greater))
}
/// Template filter: `true` when `a` is strictly less than `b`.
///
/// Only integer/integer, float/float and string/string pairs are compared; any other
/// combination of value types yields `false`.
fn filter_lt(a: &upon::Value, b: &upon::Value) -> bool {
    let ordering = match (a, b) {
        (upon::Value::Integer(lhs), upon::Value::Integer(rhs)) => lhs.partial_cmp(rhs),
        (upon::Value::Float(lhs), upon::Value::Float(rhs)) => lhs.partial_cmp(rhs),
        (upon::Value::String(lhs), upon::Value::String(rhs)) => lhs.partial_cmp(rhs),
        _ => None,
    };
    matches!(ordering, Some(std::cmp::Ordering::Less))
}
/// Template filter: `true` when `a` is greater than or equal to `b`.
///
/// Only integer/integer, float/float and string/string pairs are compared; any other
/// combination of value types yields `false`.
fn filter_gte(a: &upon::Value, b: &upon::Value) -> bool {
    let ordering = match (a, b) {
        (upon::Value::Integer(lhs), upon::Value::Integer(rhs)) => lhs.partial_cmp(rhs),
        (upon::Value::Float(lhs), upon::Value::Float(rhs)) => lhs.partial_cmp(rhs),
        (upon::Value::String(lhs), upon::Value::String(rhs)) => lhs.partial_cmp(rhs),
        _ => None,
    };
    matches!(
        ordering,
        Some(std::cmp::Ordering::Greater | std::cmp::Ordering::Equal)
    )
}
/// Template filter: `true` when `a` is less than or equal to `b`.
///
/// Only integer/integer, float/float and string/string pairs are compared; any other
/// combination of value types yields `false`.
fn filter_lte(a: &upon::Value, b: &upon::Value) -> bool {
    let ordering = match (a, b) {
        (upon::Value::Integer(lhs), upon::Value::Integer(rhs)) => lhs.partial_cmp(rhs),
        (upon::Value::Float(lhs), upon::Value::Float(rhs)) => lhs.partial_cmp(rhs),
        (upon::Value::String(lhs), upon::Value::String(rhs)) => lhs.partial_cmp(rhs),
        _ => None,
    };
    matches!(
        ordering,
        Some(std::cmp::Ordering::Less | std::cmp::Ordering::Equal)
    )
}
/// Template filter: adds two integers or two floats, or concatenates two strings.
///
/// Any other combination of value types yields `Value::None`. Integer addition is checked:
/// an overflowing sum also yields `Value::None` instead of panicking (debug builds) or
/// silently wrapping (release builds).
fn filter_add(a: &upon::Value, b: &upon::Value) -> upon::Value {
    match (a, b) {
        (upon::Value::Integer(a), upon::Value::Integer(b)) => a
            .checked_add(*b)
            .map_or(upon::Value::None, upon::Value::Integer),
        (upon::Value::Float(a), upon::Value::Float(b)) => upon::Value::Float(a + b),
        (upon::Value::String(a), upon::Value::String(b)) => upon::Value::String(a.clone() + b),
        _ => upon::Value::None,
    }
}
/// Template filter: subtracts `b` from `a` for two integers or two floats.
///
/// Any other combination of value types yields `Value::None`. Integer subtraction is checked:
/// an overflowing difference also yields `Value::None` instead of panicking (debug builds) or
/// silently wrapping (release builds).
fn filter_sub(a: &upon::Value, b: &upon::Value) -> upon::Value {
    match (a, b) {
        (upon::Value::Integer(a), upon::Value::Integer(b)) => a
            .checked_sub(*b)
            .map_or(upon::Value::None, upon::Value::Integer),
        (upon::Value::Float(a), upon::Value::Float(b)) => upon::Value::Float(a - b),
        _ => upon::Value::None,
    }
}
/// Template filter: multiplies two integers or two floats.
///
/// Any other combination of value types yields `Value::None`. Integer multiplication is
/// checked: an overflowing product also yields `Value::None` instead of panicking (debug
/// builds) or silently wrapping (release builds).
fn filter_mul(a: &upon::Value, b: &upon::Value) -> upon::Value {
    match (a, b) {
        (upon::Value::Integer(a), upon::Value::Integer(b)) => a
            .checked_mul(*b)
            .map_or(upon::Value::None, upon::Value::Integer),
        (upon::Value::Float(a), upon::Value::Float(b)) => upon::Value::Float(a * b),
        _ => upon::Value::None,
    }
}
/// Template filter: divides `a` by `b` for two integers or two floats.
///
/// Any other combination of value types yields `Value::None`. Integer division is checked:
/// dividing by zero (or overflowing, as in `i64::MIN / -1`) yields `Value::None` instead of
/// panicking. Float division follows IEEE 754 and may produce infinities or NaN.
fn filter_div(a: &upon::Value, b: &upon::Value) -> upon::Value {
    match (a, b) {
        (upon::Value::Integer(a), upon::Value::Integer(b)) => a
            .checked_div(*b)
            .map_or(upon::Value::None, upon::Value::Integer),
        (upon::Value::Float(a), upon::Value::Float(b)) => upon::Value::Float(a / b),
        _ => upon::Value::None,
    }
}
fn map_value(value: &toml::Value) -> upon::Value {
match value {
toml::Value::String(s) => upon::Value::String(s.clone()),
toml::Value::Integer(i) => upon::Value::Integer(*i),
toml::Value::Float(i) => upon::Value::Float(*i),
toml::Value::Boolean(v) => upon::Value::Bool(*v),
toml::Value::Datetime(s) => upon::Value::String(s.to_string()),
toml::Value::Array(v) => upon::Value::List(v.iter().map(|i| map_value(i)).collect()),
toml::Value::Table(v) => upon::Value::Map(v.iter().map(|(k,i)| (k.clone(),map_value(i))).collect()),
}
}
/// Converts a rendered string into a `DataValue`.
///
/// With an explicit `valuetype` (`str`/`string`, `int`, `float`, `bool`) the string is
/// interpreted as that type. For `bool`, the strings `yes`, `true`, `enabled`, `on`, `1` and
/// `active` are true and everything else is false.
///
/// Without a `valuetype` the type is detected: integer first, then float, then the
/// `(list) [ ... ]` form, whose bracketed part is parsed as a JSON array; anything else
/// stays a string. List items that are not strings, numbers or booleans become
/// `DataValue::Null`.
///
/// # Errors
/// Returns `XmlConversionError::TemplateError` when the string cannot be parsed as the
/// requested type, when `valuetype` is unknown, or when a list cannot be deserialized.
#[inline]
fn string_to_datavalue(value: String, valuetype: Option<&str>) -> Result<DataValue,XmlConversionError> {
    match valuetype {
        Some("str") | Some("string") => Ok(DataValue::String(value)),
        Some("int") => match value.parse::<isize>() {
            Ok(number) => Ok(DataValue::Int(number)),
            Err(_) => Err(XmlConversionError::TemplateError(format!("Unable to interpret value as integer: {}", value), None)),
        },
        Some("float") => match value.parse::<f64>() {
            Ok(number) => Ok(DataValue::Float(number)),
            // this message used to say "integer" for a failed float conversion
            Err(_) => Err(XmlConversionError::TemplateError(format!("Unable to interpret value as float: {}", value), None)),
        },
        Some("bool") => match value.as_str() {
            "yes" | "true" | "enabled" | "on" | "1" | "active" => Ok(DataValue::Bool(true)),
            _ => Ok(DataValue::Bool(false)),
        },
        Some(x) => {
            Err(XmlConversionError::TemplateError(format!("Invalid valuetype: {}", x), None))
        }
        None => {
            // NOTE(review): f64 parsing also accepts words such as "inf" and "nan", so such
            // text is detected as a float here — confirm whether that is intended
            if let Ok(number) = value.parse::<isize>() {
                Ok(DataValue::Int(number))
            } else if let Ok(number) = value.parse::<f64>() {
                Ok(DataValue::Float(number))
            } else if value.starts_with("(list) [ ") && value.ends_with(" ]") {
                // skip the 6-byte "(list)" marker; the remainder is parsed as a JSON array
                if let Ok(serde_json::Value::Array(values)) = serde_json::from_str(&value[6..]) {
                    Ok(DataValue::List(values.into_iter().map(|v| {
                        match v {
                            serde_json::Value::String(s) => DataValue::String(s),
                            serde_json::Value::Number(n) => if let Some(n) = n.as_i64() {
                                DataValue::Int(n as isize)
                            } else if let Some(n) = n.as_f64() {
                                DataValue::Float(n)
                            } else {
                                unreachable!("number should always be either int or float")
                            },
                            serde_json::Value::Bool(b) => DataValue::Bool(b),
                            // null, nested arrays and objects are not represented
                            _ => DataValue::Null,
                        }
                    }).collect()))
                } else {
                    Err(XmlConversionError::TemplateError(format!("Unable to deserialize list value: {}", value), None))
                }
            } else {
                Ok(value.into())
            }
        }
    }
}
/// Converts a string into a template value, detecting the type.
///
/// The string becomes an integer if it parses as `i64`, otherwise a float if it parses as
/// `f64`, and otherwise stays a string.
fn string_to_templatevalue(value: String) -> upon::Value {
    if let Ok(integer) = value.parse::<i64>() {
        return upon::Value::Integer(integer);
    }
    match value.parse::<f64>() {
        Ok(float) => upon::Value::Float(float),
        Err(_) => upon::Value::String(value),
    }
}
/// Custom value formatter for the template engine.
///
/// Lists are written as `(list) [ item, item ]`, the form that `string_to_datavalue` detects
/// and parses as a JSON array. String items are double-quoted with runs of whitespace
/// collapsed to a single space and backslashes and double quotes escaped; all other items, and
/// all non-list values, use the engine's default formatter.
///
/// Non-string list items are no longer followed by a stray `"`, which made the list
/// unparseable as JSON.
fn value_formatter(f: &mut upon::fmt::Formatter<'_>, value: &upon::Value) -> upon::fmt::Result {
    match value {
        upon::Value::List(vs) => {
            f.write_str("(list) [ ")?;
            for (i, v) in vs.iter().enumerate() {
                if i > 0 {
                    f.write_str(", ")?;
                }
                if let upon::Value::String(s) = v {
                    // split_whitespace also covers newlines and tabs
                    let normalized = s.split_whitespace().collect::<Vec<_>>().join(" ");
                    // escape backslashes first so the escapes added for quotes stay intact
                    let escaped = normalized.replace('\\', "\\\\").replace('"', "\\\"");
                    write!(f, "\"{}\"", escaped)?;
                } else {
                    upon::fmt::default(f, v)?;
                }
            }
            f.write_str(" ]")?;
        }
        // everything that is not a list is left to the default formatter
        v => upon::fmt::default(f, v)?,
    };
    Ok(())
}
/// An external program that can be run on a template value.
///
/// The value is written to the program's standard input and the program's standard output
/// becomes the result.
#[derive(Clone,Debug,Deserialize)]
struct ExternalFilter {
/// Name of the filter; used in failure messages.
/// NOTE(review): presumably also the name under which the filter is available in templates — confirm
name: String,
/// The program to execute.
command: String,
/// Arguments passed to the program. An argument that is exactly `{{value}}`, `{{ value }}`
/// or `$value` is replaced by the input value.
args: Vec<String>
}
impl ExternalFilter {
    /// Renders a scalar template value as text; returns `None` for lists and maps.
    fn scalar_to_text(value: &upon::Value) -> Option<String> {
        match value {
            upon::Value::String(s) => Some(s.clone()),
            upon::Value::Integer(d) => Some(d.to_string()),
            upon::Value::Float(d) => Some(d.to_string()),
            upon::Value::Bool(d) => Some(d.to_string()),
            upon::Value::None => Some(String::new()),
            _ => None,
        }
    }

    /// Runs the external command on `input_value` and returns its output as a template value.
    ///
    /// The value is written to the command's standard input, and also substituted for any
    /// argument that is exactly `{{value}}`, `{{ value }}` or `$value`. The command's standard
    /// output is converted with `string_to_templatevalue`.
    ///
    /// # Panics
    /// Panics when the input is a list or map, when the command cannot be spawned, when writing
    /// to its standard input fails, when it exits unsuccessfully, or when its output is not
    /// valid UTF-8.
    fn run(&self, input_value: &upon::Value) -> upon::Value {
        let args = self.args.iter().map(|arg| {
            if arg == "{{value}}" || arg == "{{ value }}" || arg == "$value" {
                Self::scalar_to_text(input_value).unwrap_or_else(|| {
                    panic!("Lists and maps are not supported to be passed as parameter to external filters yet!")
                })
            } else {
                arg.clone()
            }
        });
        let spawned = Command::new(self.command.as_str())
            .args(args)
            .stdin(Stdio::piped())
            .stdout(Stdio::piped())
            .spawn();
        let Ok(mut process) = spawned else {
            panic!("External filter {} failed!", self.name);
        };
        {
            // This scope ends before waiting on the child: dropping the handle closes the
            // child's stdin, so it sees end-of-input.
            let mut outstdin = process.stdin.take().expect("unable to open stdin for external filter");
            let input = Self::scalar_to_text(input_value).unwrap_or_else(|| {
                panic!("Lists and maps are not supported to be passed as input to external filters yet!")
            });
            let mut writer = BufWriter::new(&mut outstdin);
            // write_all rather than write: a single write call may accept only part of the data.
            // The final flush happens when the writer is dropped and stays best-effort, so a
            // command that exits without reading its stdin does not abort the conversion.
            writer
                .write_all(input.as_bytes())
                .expect("Writing to stdin for external filter failed!");
        }
        let output = process.wait_with_output().expect("External filter wasn't running");
        if !output.status.success() {
            panic!("External filter {} failed ({:?})", self.name, output.status.code());
        }
        match String::from_utf8(output.stdout) {
            Ok(s) => string_to_templatevalue(s),
            Err(_) => panic!("External filter {} produced invalid UTF-8!", self.name),
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
const XMLSMALLEXAMPLE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>test</title></head><body><h1>TEST</h1><p xml:id="p1" n="001">This is a <em xml:id="emphasis" style="color:green">test</em>.</p></body></html>"#;
const XMLEXAMPLE: &'static str = r#"<!DOCTYPE entities[<!ENTITY nbsp " ">]>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:my="http://example.com">
<head>
<title>Test</title>
<meta name="author" content="proycon" />
</head>
<body>
<h1>Header</h1>
<p xml:id="par1">
<span xml:id="sen1">This is a sentence.</span>
<span xml:id="sen2">This is the second sentence.</span>
</p>
<p xml:id="par2">
<strong>This</strong> is the <em>second</em> paragraph.
It has a <strong>bold</strong> word and one in <em>italics</em>.<br/>
Let's highlight stress in the following word: <span my:stress="secondary">re</span>pu<span my:stress="primary">ta</span>tion.
</p>
<p xml:space="preserve"><![CDATA[This third
paragraph consists
of CDATA and is configured to preserve whitespace, and weird &entities; ]]></p>
<h2>Subsection</h2>
<p>
Have some fruits:<br/>
<ul xml:id="list1" class="fruits">
<li xml:id="fruit1">apple</li>
<li xml:id="fruit2">banana</li>
<li xml:id="fruit3">melon</li>
</ul>
</p>
Some lingering text outside of any confines...
</body>
</html>"#;
const XMLEXAMPLE_TEXTOUTPUT: &'static str = "Header\n\nThis is a sentence. This is the second sentence.\n\nThis is the second paragraph. It has a bold word and one in italics.\nLet's highlight stress in the following word: reputation.\n\nThis third\nparagraph consists\nof CDATA and is configured to preserve whitespace, and weird &entities; \nSubsection\n\nHave some fruits:\n* apple\n* banana\n* melon\n\nSome lingering text outside of any confines...";
const XMLTEISPACE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body><space dim="vertical" unit="lines" quantity="3" /></body></html>"#;
const CONF: &'static str = r#"#default whitespace handling (Collapse or Preserve)
whitespace = "Collapse"
default_set = "urn:stam-fromhtml"
[namespaces]
#this defines the namespace prefixes you can use in this configuration
xml = "http://www.w3.org/XML/1998/namespace"
html = "http://www.w3.org/1999/xhtml"
xsd = "http://www.w3.org/2001/XMLSchema"
xlink = "http://www.w3.org/1999/xlink"
# elements and attributes are matched in reverse-order, so put more generic statements before more specific ones
#Define some base elements that we reuse later for actual elements (prevents unnecessary repetition)
[baseelements.common]
id = "{% if ?.@xml:id %}{{ @xml:id }}{% endif %}"
[[baseelements.common.annotationdata]]
key = "type"
value = "{{ localname }}"
[[baseelements.common.annotationdata]]
key = "lang"
value = "{{ @xml:lang }}"
skip_if_missing = true
[[baseelements.common.annotationdata]]
key = "n"
value = "{{ @n }}"
skip_if_missing = true
valuetype = "int"
[[baseelements.common.annotationdata]]
key = "nstring"
value = "{{ @n }}"
skip_if_missing = true
valuetype = "string"
[[baseelements.common.annotationdata]]
key = "style"
value = "{{ @style }}"
skip_if_missing = true
[[baseelements.common.annotationdata]]
key = "class"
value = "{{ @class }}"
skip_if_missing = true
[[baseelements.common.annotationdata]]
key = "src"
value = "{{ @src }}"
skip_if_missing = true
[baseelements.text]
text = true
[[elements]]
base = [ "text", "common" ]
path = "*"
text = true
annotation = "TextSelector"
# Pass through the following elements without mapping to text
[[elements]]
base = [ "common" ]
path = "//html:head"
[[elements]]
base = [ "common" ]
path = "//html:head//*"
# Map metadata like <meta name="key" content="value"> to annotations with key->value data selecting the resource (ResourceSelector)
[[elements]]
base = [ "common" ]
path = "//html:head//html:meta"
[[elements.annotationdata]]
key = "{% if ?.@name %}{{ name }}{% endif %}"
value = "{% if ?.@content %}{{ @content }}{% endif %}"
skip_if_missing = true
# By default, ignore any tags in the head (unless they're mentioned specifically later in the config)
[[elements]]
path = "//html:head/html:title"
annotation = "ResourceSelector"
[[elements.annotationdata]]
key = "title"
value = "{{ $. | trim }}"
# Determine how various structural elements are converted to text
[[elements]]
base = [ "common" ]
path = "//html:br"
textsuffix = "\n"
[[elements]]
base = [ "common", "text" ]
path = "//html:p"
textprefix = "\n"
textsuffix = "\n"
annotation = "TextSelector"
# Let's do headers and bulleted lists like markdown
[[elements]]
base = [ "common", "text" ]
path = "//html:h1"
textsuffix = "\n"
annotation = "TextSelector"
id = "h1"
[[elements]]
base = [ "common", "text" ]
path = "//html:body//html:h2"
textsuffix = "\n"
annotation = "TextSelector"
id = "h2"
#Generic, will be overriden by more specific one
[[elements]]
base = [ "common", "text" ]
path = "//html:li"
textprefix = "- "
textsuffix = "\n"
[[elements]]
base = [ "common", "text" ]
path = """//html:body"""
annotation = "TextSelector"
id = "body"
[[elements.annotationdata]]
key = "title_from_parent"
value = "{{ $../html:head/html:title }}"
skip_if_missing = true
[[elements.annotationdata]]
key = "title_from_root"
value = "{{ $/html:html/html:head/html:title }}"
skip_if_missing = true
[[elements.annotationdata]]
key = "firstfruit"
value = """{{ $./html:p/html:ul/html:li }}"""
skip_if_missing = true
[[elements.annotationdata]]
key = "fruits"
value = """{{ $$./html:p/html:ul/html:li }}"""
skip_if_missing = true
[[elements.annotationdata]]
key = "multifruits"
value = """{{ $$./html:p/html:ul/html:li }}"""
skip_if_missing = true
multiple = true
#More specific one takes precendence over the above generic one
[[elements]]
base = [ "common", "text" ]
path = """//html:ul[@class="fruits"]/html:li"""
textprefix = "* "
textsuffix = "\n"
#Not real HTML, test-case modelled after TEI space
[[elements]]
base = [ "common" ]
path = """//html:space[@dim="vertical" and @unit="lines"]"""
text = true
textsuffix = """\n{% for x in @quantity | int | as_range %}\n{% endfor %}"""
[[elements]]
base = [ "common", "text" ]
path = "//html:example"
annotation = "TextSelector"
[[elements.annotationdata]]
key = "requiredattrib"
value = "{{ @requiredattrib }}"
[[elements.annotationdata]]
key = "optattrib"
value = "{{ ?.@optattrib }}"
[[elements]]
base = [ "common","text" ]
path = "//html:marquee"
annotation = "TextSelector"
#map value, some bogus data to test parsing
[[elements.annotationdata]]
key = "map"
[elements.annotationdata.value]
text = "{{ $. }}"
number = 42
bogus = true
[[metadata]]
id = "metadata"
[[metadata.annotationdata]]
key = "author"
value = "proycon"
"#;
const XMLREQATTRIBEXAMPLE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body><example xml:id="ann1" requiredattrib="blah">test</example></body></html>"#;
const XMLREQATTRIBEXAMPLE2: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body><example xml:id="ann1">test</example></body></html>"#;
const XMLREQATTRIBEXAMPLE3: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body><example xml:id="ann1" requiredattrib="blah" optattrib="blah">test</example></body></html>"#;
const XMLMAPEXAMPLE: &'static str = r#"<html xmlns="http://www.w3.org/1999/xhtml">
<body><marquee xml:id="ann1">test</marquee></body></html>"#;
#[test]
fn test_precompile_template_nochange() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{{ foo }}";
let template_out = conv.precompile(template_in);
assert_eq!( template_out, template_in);
assert!(!conv.variables.get(template_in).as_ref().unwrap().contains("foo"));
Ok(())
}
#[test]
fn test_precompile_template_attrib() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{{ @foo }}";
let template_out = conv.precompile(template_in);
assert_eq!(template_out, "{{ ATTRIB_foo }}");
assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@foo"));
Ok(())
}
#[test]
fn test_precompile_template_attrib_ns() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{{ @bar:foo }}";
let template_out = conv.precompile(template_in);
assert_eq!(template_out, "{{ ATTRIB_bar__foo }}");
assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@bar:foo"));
Ok(())
}
#[test]
fn test_precompile_template_element() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{{ $foo }}";
let template_out = conv.precompile(template_in);
assert_eq!(template_out, "{{ ELEMENT_foo }}");
assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$foo"));
Ok(())
}
#[test]
fn test_precompile_template_element_ns() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{{ $bar:foo }}";
let template_out = conv.precompile(template_in);
assert_eq!(template_out, "{{ ELEMENT_bar__foo }}");
assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$bar:foo"));
Ok(())
}
#[test]
fn test_precompile_template_this_text() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{{ $. }}";
let template_out = conv.precompile(template_in);
assert_eq!(template_out, "{{ ELEMENT_THIS }}");
assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$."));
Ok(())
}
#[test]
fn test_precompile_template_parent_text() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{{ $.. }}";
let template_out = conv.precompile(template_in);
assert_eq!(template_out, "{{ ELEMENT_PARENT }}");
assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$.."));
Ok(())
}
#[test]
fn test_precompile_template_elements() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{{ $$foo }}";
let template_out = conv.precompile(template_in);
assert_eq!(template_out, "{{ ELEMENTS_foo }}");
assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$$foo"));
Ok(())
}
#[test]
fn test_precompile_template_elements_ns() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{{ $$bar:foo }}";
let template_out = conv.precompile(template_in);
assert_eq!(template_out, "{{ ELEMENTS_bar__foo }}");
assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$$bar:foo"));
Ok(())
}
#[test]
fn test_precompile_template_attrib2() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{% for x in @foo %}";
let template_out = conv.precompile(template_in);
assert_eq!(template_out, "{% for x in ATTRIB_foo %}");
assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@foo"));
Ok(())
}
#[test]
fn test_precompile_template_attrib3() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{{ ?.@foo }}";
let template_out = conv.precompile(template_in);
assert_eq!(template_out, "{{ ?.ATTRIB_foo }}");
assert!(conv.variables.get(template_in).as_ref().unwrap().contains("@foo"));
Ok(())
}
#[test]
fn test_precompile_template_path() -> Result<(), String> {
let config = XmlConversionConfig::new();
let mut conv = XmlToStamConverter::new(&config);
let template_in = "{{ $x/y/z/@a }}";
let template_out = conv.precompile(template_in);
assert_eq!(template_out, "{{ ELEMENT_x_IN_y_IN_z_IN_ATTRIB_a }}");
assert!(conv.variables.get(template_in).as_ref().unwrap().contains("$x/y/z/@a"));
Ok(())
}
#[test]
fn test_loadconfig() -> Result<(), String> {
let config = XmlConversionConfig::from_toml_str(CONF)?;
let mut conv = XmlToStamConverter::new(&config);
conv.compile().map_err(|e| format!("{}",e))?;
assert_eq!(conv.config.namespaces.len(),4 , "number of namespaces");
assert_eq!(conv.config.elements.len(), 15, "number of elements");
assert_eq!(conv.config.baseelements.len(), 2, "number of baseelements");
assert_eq!(conv.config.elements.get(0).unwrap().annotationdata.len(), 7,"number of annotationdata under first element");
assert_eq!(conv.config.baseelements.get("common").unwrap().annotationdata.len(), 7,"number of annotationdata under baseelement common");
Ok(())
}
#[test]
fn test_small() -> Result<(), String> {
let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
let mut store = stam::AnnotationStore::new(stam::Config::new());
from_xml_in_memory("test", XMLSMALLEXAMPLE, &config, &mut store)?;
let res = store.resource("test").expect("resource must have been created at this point");
assert_eq!(res.text(), "TEST\n\nThis is a test.\n", "resource text");
assert_eq!(store.annotations_len(), 7, "number of annotations");
let annotation = store.annotation("emphasis").expect("annotation must have been created at this point");
assert_eq!(annotation.text_simple(), Some("test"));
let key = store.key("urn:stam-fromhtml", "style").expect("key must exist");
assert_eq!(annotation.data().filter_key(&key).value_as_str(), Some("color:green"));
let key = store.key("urn:stam-fromhtml", "title").expect("key must exist");
let annotation = res.annotations_as_metadata().filter_key(&key).next().expect("annotation");
assert_eq!(annotation.data().filter_key(&key).value_as_str(), Some("test"));
let bodyannotation = store.annotation("body").expect("body annotation not found");
let title1 = store.key("urn:stam-fromhtml", "title_from_parent").expect("key must exist");
let title2 = store.key("urn:stam-fromhtml", "title_from_root").expect("key must exist");
assert_eq!(bodyannotation.data().filter_key(&title1).value_as_str(), Some("test"));
assert_eq!(bodyannotation.data().filter_key(&title2).value_as_str(), Some("test"));
Ok(())
}
#[test]
fn test_full() -> Result<(), String> {
let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
let mut store = stam::AnnotationStore::new(stam::Config::new());
from_xml_in_memory("test", XMLEXAMPLE, &config, &mut store)?;
let res = store.resource("test").expect("resource must have been created at this point");
assert_eq!(res.text(), XMLEXAMPLE_TEXTOUTPUT, "resource text");
Ok(())
}
#[test]
fn test_firstfruit() -> Result<(), String> {
let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
let mut store = stam::AnnotationStore::new(stam::Config::new());
from_xml_in_memory("test", XMLEXAMPLE, &config, &mut store)?;
let bodyannotation = store.annotation("body").expect("body annotation not found");
let fruit = store.key("urn:stam-fromhtml", "firstfruit").expect("key must exist");
assert_eq!(bodyannotation.data().filter_key(&fruit).value_as_str(), Some("apple") );
Ok(())
}
#[test]
fn test_fruits() -> Result<(), String> {
let config = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
let mut store = stam::AnnotationStore::new(stam::Config::new());
from_xml_in_memory("test", XMLEXAMPLE, &config, &mut store)?;
let bodyannotation = store.annotation("body").expect("body annotation not found");
let fruits = store.key("urn:stam-fromhtml", "fruits").expect("key must exist");
assert_eq!(bodyannotation.data().filter_key(&fruits).value(), Some(&DataValue::List(vec!("apple".into(),"banana".into(),"melon".into()) )));
Ok(())
}
/// Converts `XMLEXAMPLE` and checks that the `multifruits` key on the `body`
/// annotation yields three separate string values, in order: apple, banana,
/// melon.
#[test]
fn test_multifruits() -> Result<(), String> {
    let conversion = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
    let mut annotationstore = stam::AnnotationStore::new(stam::Config::new());
    from_xml_in_memory("test", XMLEXAMPLE, &conversion, &mut annotationstore)?;

    let body = annotationstore
        .annotation("body")
        .expect("body annotation not found");
    let multifruits_key = annotationstore
        .key("urn:stam-fromhtml", "multifruits")
        .expect("key must exist");
    let found: Vec<_> = body.data().filter_key(&multifruits_key).collect();

    // The length is checked first, so the pairwise comparison below covers
    // every returned item.
    assert_eq!(found.len(), 3);
    for (item, fruit) in found.iter().zip(["apple", "banana", "melon"]) {
        assert_eq!(item.value(), &DataValue::String(fruit.to_string()));
    }
    Ok(())
}
/// Converts `XMLTEISPACE` using `CONF` and checks that the resulting resource
/// text consists of exactly four newlines.
#[test]
fn test_teispace() -> Result<(), String> {
    let conversion = XmlConversionConfig::from_toml_str(CONF)?;
    let mut annotationstore = stam::AnnotationStore::new(stam::Config::new());
    from_xml_in_memory("test", XMLTEISPACE, &conversion, &mut annotationstore)?;

    let resource = annotationstore
        .resource("test")
        .expect("resource must have been created at this point");
    assert_eq!(resource.text(), "\n\n\n\n", "resource text");
    Ok(())
}
/// Converts `XMLREQATTRIBEXAMPLE` and checks that:
/// - the resource text is `"test"`,
/// - annotation `ann1` holds `"blah"` under the `requiredattrib` key,
/// - no `optattrib` key was created in the store.
#[test]
fn test_reqattrib() -> Result<(), String> {
    let conversion = XmlConversionConfig::from_toml_str(CONF)?;
    let mut annotationstore = stam::AnnotationStore::new(stam::Config::new());
    from_xml_in_memory("test", XMLREQATTRIBEXAMPLE, &conversion, &mut annotationstore)?;

    let resource = annotationstore
        .resource("test")
        .expect("resource must have been created at this point");
    assert_eq!(resource.text(), "test", "resource text");

    let required_key = annotationstore
        .key("urn:stam-fromhtml", "requiredattrib")
        .expect("key must exist");
    let ann1 = annotationstore.annotation("ann1").expect("annotation");
    assert_eq!(
        ann1.data().filter_key(&required_key).value_as_str(),
        Some("blah")
    );

    assert!(
        annotationstore.key("urn:stam-fromhtml", "optattrib").is_none(),
        "optional attrib is unused"
    );
    Ok(())
}
/// Converts `XMLREQATTRIBEXAMPLE2` (debug output enabled) and checks that the
/// conversion reports an error instead of succeeding.
#[test]
fn test_reqattrib2() -> Result<(), String> {
    let conversion = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
    let mut annotationstore = stam::AnnotationStore::new(stam::Config::new());

    let outcome = from_xml_in_memory(
        "test",
        XMLREQATTRIBEXAMPLE2,
        &conversion,
        &mut annotationstore,
    );
    assert!(outcome.is_err(), "checking if error is returned");
    Ok(())
}
/// Converts `XMLREQATTRIBEXAMPLE3` and checks that the resource text is
/// `"test"` and that annotation `ann1` holds `"blah"` under both the
/// `requiredattrib` and the `optattrib` key.
#[test]
fn test_reqattrib3() -> Result<(), String> {
    let conversion = XmlConversionConfig::from_toml_str(CONF)?;
    let mut annotationstore = stam::AnnotationStore::new(stam::Config::new());
    from_xml_in_memory("test", XMLREQATTRIBEXAMPLE3, &conversion, &mut annotationstore)?;

    let resource = annotationstore
        .resource("test")
        .expect("resource must have been created at this point");
    assert_eq!(resource.text(), "test", "resource text");

    let required_key = annotationstore
        .key("urn:stam-fromhtml", "requiredattrib")
        .expect("key must exist");
    let optional_key = annotationstore
        .key("urn:stam-fromhtml", "optattrib")
        .expect("key optattrib must exist");
    let ann1 = annotationstore.annotation("ann1").expect("annotation");

    assert_eq!(
        ann1.data().filter_key(&required_key).value_as_str(),
        Some("blah")
    );
    assert_eq!(
        ann1.data().filter_key(&optional_key).value_as_str(),
        Some("blah")
    );
    Ok(())
}
/// Converts `XMLMAPEXAMPLE` and checks that annotation `ann1` holds a map
/// value under the `map` key with exactly three entries:
/// `text = "test"`, `number = 42` and `bogus = true`.
///
/// Panics with "Data is supposed to be a map" if the stored value is not a
/// `DataValue::Map`.
#[test]
fn test_map() -> Result<(), String> {
    let config = XmlConversionConfig::from_toml_str(CONF)?;
    let mut store = stam::AnnotationStore::new(stam::Config::new());
    from_xml_in_memory("test", XMLMAPEXAMPLE, &config, &mut store)?;

    let res = store
        .resource("test")
        .expect("resource must have been created at this point");
    assert_eq!(res.text(), "test", "resource text");

    let key = store.key("urn:stam-fromhtml", "map").expect("key must exist");
    let annotation = store.annotation("ann1").expect("annotation");
    let data = annotation
        .data()
        .filter_key(&key)
        .value()
        .expect("data must exist");

    // Fail explicitly on a non-map value rather than asserting on a constant
    // `false`; the panic message is unchanged.
    let DataValue::Map(data) = data else {
        panic!("Data is supposed to be a map");
    };
    assert_eq!(data.get("text"), Some(&DataValue::String("test".into())));
    assert_eq!(data.get("number"), Some(&DataValue::Int(42)));
    assert_eq!(data.get("bogus"), Some(&DataValue::Bool(true)));
    assert_eq!(data.len(), 3);
    Ok(())
}
/// Converts `XMLEXAMPLE` and checks that the `metadata` annotation holds the
/// string `"proycon"` under the `author` key.
#[test]
fn test_metadata() -> Result<(), String> {
    let conversion = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
    let mut annotationstore = stam::AnnotationStore::new(stam::Config::new());
    from_xml_in_memory("test", XMLEXAMPLE, &conversion, &mut annotationstore)?;

    let metadata = annotationstore.annotation("metadata").expect("annotation");
    let author_key = annotationstore
        .key("urn:stam-fromhtml", "author")
        .expect("key must exist");
    let author = metadata
        .data()
        .filter_key(&author_key)
        .value()
        .expect("data must exist");
    assert_eq!(author, &DataValue::String("proycon".into()));
    Ok(())
}
/// Converts `XMLSMALLEXAMPLE` and checks that annotation `p1` holds the
/// integer value `1` under the `n` key.
#[test]
fn test_datavalue_int() -> Result<(), String> {
    let conversion = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
    let mut annotationstore = stam::AnnotationStore::new(stam::Config::new());
    from_xml_in_memory("test", XMLSMALLEXAMPLE, &conversion, &mut annotationstore)?;

    let paragraph = annotationstore
        .annotation("p1")
        .expect("annotation not found");
    let n_key = annotationstore
        .key("urn:stam-fromhtml", "n")
        .expect("key must exist");
    assert_eq!(
        paragraph.data().filter_key(&n_key).value(),
        Some(&DataValue::Int(1))
    );
    Ok(())
}
/// Converts `XMLSMALLEXAMPLE` and checks that annotation `p1` holds the
/// string `"001"` (not a number) under the `nstring` key.
#[test]
fn test_datavalue_string() -> Result<(), String> {
    let conversion = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
    let mut annotationstore = stam::AnnotationStore::new(stam::Config::new());
    from_xml_in_memory("test", XMLSMALLEXAMPLE, &conversion, &mut annotationstore)?;

    let paragraph = annotationstore
        .annotation("p1")
        .expect("annotation not found");
    let nstring_key = annotationstore
        .key("urn:stam-fromhtml", "nstring")
        .expect("key must exist");
    assert_eq!(
        paragraph.data().filter_key(&nstring_key).value(),
        Some(&DataValue::String("001".to_string()))
    );
    Ok(())
}
/// Converts `XMLEXAMPLE` and checks that an annotation with id `h1` exists.
///
/// NOTE(review): presumably `h1` is matched by a root-level `//` selector in
/// `CONF` — confirm against the configuration.
#[test]
fn test_doubleslash_selector_root() -> Result<(), String> {
    let conversion = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
    let mut annotationstore = stam::AnnotationStore::new(stam::Config::new());
    from_xml_in_memory("test", XMLEXAMPLE, &conversion, &mut annotationstore)?;

    assert!(annotationstore.annotation("h1").is_some());
    Ok(())
}
/// Converts `XMLEXAMPLE` and checks that an annotation with id `h2` exists.
///
/// NOTE(review): presumably `h2` is matched by an infix `//` selector in
/// `CONF` with no elements in between — confirm against the configuration.
#[test]
fn test_doubleslash_selector_infix_none() -> Result<(), String> {
    let conversion = XmlConversionConfig::from_toml_str(CONF)?.with_debug(true);
    let mut annotationstore = stam::AnnotationStore::new(stam::Config::new());
    from_xml_in_memory("test", XMLEXAMPLE, &conversion, &mut annotationstore)?;

    assert!(annotationstore.annotation("h2").is_some());
    Ok(())
}
}