use clayers_xml::ContentHash;
use xot::Xot;
use crate::error::{Error, Result};
use crate::hash;
use crate::object::{
Attribute, CommentObject, DocumentObject, ElementObject, Object, PIObject, TextObject,
};
use crate::store::ObjectStore;
/// An object produced while walking the XML tree, paired with the hash it
/// will be stored under once the import transaction runs.
struct CollectedObject {
/// Key passed to the store transaction's `put` for this object.
hash: ContentHash,
/// The object (text, comment, PI, element or document) to be stored.
object: Object,
}
pub async fn import_xml(store: &dyn ObjectStore, xml: &str) -> Result<ContentHash> {
let mut xot = Xot::new();
let doc = xot.parse(xml).map_err(xot::Error::from)?;
let root = xot
.document_element(doc)
.map_err(|e| Error::XmlParse(e.to_string()))?;
let mut objects = Vec::new();
let mut prologue_hashes = Vec::new();
let doc_children: Vec<_> = xot.children(doc).collect();
for child in doc_children {
if child == root {
break;
}
if xot.comment_str(child).is_some() || xot.processing_instruction(child).is_some() {
let h = collect_node(&mut xot, child, &mut objects)?;
prologue_hashes.push(h);
}
}
let root_hash = collect_node(&mut xot, root, &mut objects)?;
let doc_obj = DocumentObject {
root: root_hash,
prologue: prologue_hashes,
};
let doc_xml = doc_obj.to_xml();
let doc_hash = hash::hash_exclusive(&doc_xml)?;
objects.push(CollectedObject {
hash: doc_hash,
object: Object::Document(doc_obj),
});
let mut tx = store.transaction().await?;
for entry in objects {
tx.put(entry.hash, entry.object).await?;
}
tx.commit().await?;
Ok(doc_hash)
}
#[allow(clippy::too_many_lines)]
fn collect_node(
xot: &mut Xot,
node: xot::Node,
objects: &mut Vec<CollectedObject>,
) -> Result<ContentHash> {
if let Some(text) = xot.text_str(node) {
let text = text.to_string();
let h = hash::hash_text(&text);
objects.push(CollectedObject {
hash: h,
object: Object::Text(TextObject { content: text }),
});
return Ok(h);
}
if let Some(comment) = xot.comment_str(node) {
let comment = comment.to_string();
let h = hash::hash_text(&comment);
objects.push(CollectedObject {
hash: h,
object: Object::Comment(CommentObject { content: comment }),
});
return Ok(h);
}
if let Some(pi) = xot.processing_instruction(node) {
let target = xot.local_name_str(pi.target()).to_string();
let data = pi.data().map(String::from);
let h = hash::hash_pi(&target, data.as_deref());
objects.push(CollectedObject {
hash: h,
object: Object::PI(PIObject { target, data }),
});
return Ok(h);
}
if xot.is_element(node) {
let mut child_hashes = Vec::new();
let children: Vec<_> = xot.children(node).collect();
for child in children {
let child_hash = collect_node(xot, child, objects)?;
child_hashes.push(child_hash);
}
let clone = xot.clone_with_prefixes(node);
let xml_str = xot
.to_string(clone)
.map_err(|e| Error::XmlParse(e.to_string()))?;
let prefix_map: Vec<(String, String)> = xot
.namespaces(clone)
.iter()
.map(|(prefix_id, ns_id)| {
(
xot.prefix_str(prefix_id).to_string(),
xot.namespace_str(*ns_id).to_string(),
)
})
.collect();
xot.remove(clone)
.map_err(|e| Error::XmlParse(e.to_string()))?;
let (identity_hash, inclusive_hash) = hash::hash_element_xml(&xml_str)?;
let element = xot
.element(node)
.ok_or_else(|| Error::InvalidObject("expected element data".into()))?;
let name_id = element.name();
let (local_name, ns_str) = xot.name_ns_str(name_id);
let local_name = local_name.to_string();
let namespace_uri = if ns_str.is_empty() {
None
} else {
Some(ns_str.to_string())
};
let namespace_prefix = namespace_uri.as_ref().and_then(|uri| {
prefix_map.iter().find_map(|(pfx, ns)| {
if ns == uri && !pfx.is_empty() {
Some(pfx.clone())
} else {
None
}
})
});
let mut attributes = Vec::new();
for (attr_name_id, attr_value) in xot.attributes(node).iter() {
let (attr_local, attr_ns) = xot.name_ns_str(attr_name_id);
let attr_ns_uri = if attr_ns.is_empty() {
None
} else {
Some(attr_ns.to_string())
};
let attr_prefix = attr_ns_uri.as_ref().and_then(|uri| {
prefix_map.iter().find_map(|(pfx, ns)| {
if ns == uri && !pfx.is_empty() {
Some(pfx.clone())
} else {
None
}
})
});
attributes.push(Attribute {
local_name: attr_local.to_string(),
namespace_uri: attr_ns_uri,
namespace_prefix: attr_prefix,
value: attr_value.clone(),
});
}
let mut used_uris: std::collections::HashSet<&str> = std::collections::HashSet::new();
if let Some(ref uri) = namespace_uri {
used_uris.insert(uri);
}
for attr in &attributes {
if let Some(ref uri) = attr.namespace_uri {
used_uris.insert(uri);
}
}
let extra_namespaces: Vec<(String, String)> = prefix_map
.iter()
.filter(|(pfx, uri)| !pfx.is_empty() && !used_uris.contains(uri.as_str()))
.map(|(pfx, uri)| (pfx.clone(), uri.clone()))
.collect();
objects.push(CollectedObject {
hash: identity_hash,
object: Object::Element(ElementObject {
local_name,
namespace_uri,
namespace_prefix,
extra_namespaces,
attributes,
children: child_hashes,
inclusive_hash,
}),
});
return Ok(identity_hash);
}
let mut last_hash = None;
let children: Vec<_> = xot.children(node).collect();
for child in children {
last_hash = Some(collect_node(xot, child, objects)?);
}
last_hash.ok_or(Error::EmptyDocument)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::store::memory::MemoryStore;

    /// Imports `xml` into a fresh in-memory store and asserts that the
    /// returned document hash is present in that store afterwards.
    async fn assert_stored_after_import(xml: &str) {
        let store = MemoryStore::new();
        let document_hash = import_xml(&store, xml).await.unwrap();
        assert!(store.contains(&document_hash).await.unwrap());
    }

    /// A single element with text content can be imported.
    #[tokio::test]
    async fn import_simple_element() {
        assert_stored_after_import("<root>hello</root>").await;
    }

    /// Nested elements under a default namespace can be imported.
    #[tokio::test]
    async fn import_nested_elements() {
        assert_stored_after_import(r#"<root xmlns="urn:test"><child>text</child></root>"#).await;
    }

    /// Importing the same XML twice into one store yields the same hash.
    #[tokio::test]
    async fn import_deterministic() {
        let store = MemoryStore::new();
        let xml = "<root><a>1</a><b>2</b></root>";
        let first = import_xml(&store, xml).await.unwrap();
        let second = import_xml(&store, xml).await.unwrap();
        assert_eq!(first, second);
    }

    /// Text interleaved with child elements can be imported.
    #[tokio::test]
    async fn import_mixed_content() {
        assert_stored_after_import("<p>Hello <b>world</b>!</p>").await;
    }
}