use crate::core::error::{XmpError, XmpResult};
use crate::core::namespace::{ns, NamespaceMap};
use crate::core::node::{Node, StructureNode};
use crate::types::qualifier::Qualifier;
use quick_xml::escape::unescape;
use quick_xml::events::Event;
use quick_xml::Reader;
/// Parser that turns an XMP packet (RDF/XML text) into a `StructureNode` tree.
pub struct XmpParser {
/// Namespace registrations; `xmlns:<prefix>` declarations seen while
/// parsing are registered here and used to expand prefixed names.
namespaces: NamespaceMap,
}
impl XmpParser {
/// Creates a parser whose namespace map is a fresh `NamespaceMap::new()`.
pub fn new() -> Self {
    let namespaces = NamespaceMap::new();
    Self { namespaces }
}
/// Parses a complete XMP packet.
///
/// The `<?xpacket ...?>` wrapper is removed first (see
/// `extract_packet_content`), then the remaining RDF/XML is handed to
/// `parse_rdf`.
///
/// # Errors
///
/// Propagates any error returned by either of those two steps.
pub fn parse_packet(&mut self, xml: &str) -> XmpResult<StructureNode> {
    self.extract_packet_content(xml)
        .and_then(|rdf_xml| self.parse_rdf(&rdf_xml))
}
/// Returns a clone of the parser's current namespace map.
pub fn namespace_map(&self) -> NamespaceMap {
self.namespaces.clone()
}
/// Returns the text between the opening `<?xpacket ... ?>` processing
/// instruction and the following `<?xpacket end` marker, trimmed.
///
/// If the opening instruction, its closing `?>`, or the end marker cannot
/// be found, the whole input is passed to `validate_and_return_xml`
/// instead.
fn extract_packet_content(&self, xml: &str) -> XmpResult<String> {
    let wrapped_body = xml.find("<?xpacket").and_then(|begin| {
        let from_begin = &xml[begin..];
        // `+ 2` steps over the "?>" that terminates the opening instruction.
        let header_len = from_begin.find("?>")? + 2;
        let after_header = &from_begin[header_len..];
        let trailer_at = after_header.find("<?xpacket end")?;
        Some(after_header[..trailer_at].trim())
    });
    match wrapped_body {
        Some(body) => Ok(body.to_string()),
        None => self.validate_and_return_xml(xml),
    }
}
/// Accepts unwrapped XML: returns the trimmed input when it begins with `<`.
///
/// # Errors
///
/// Returns `XmpError::ParseError("Invalid XML content")` when the trimmed
/// input is empty or does not start with `<`.
fn validate_and_return_xml(&self, xml: &str) -> XmpResult<String> {
    let trimmed = xml.trim();
    // A single check suffices: an empty string does not start with '<',
    // and anything starting with "<?xml" necessarily starts with '<'.
    if !trimmed.starts_with('<') {
        return Err(XmpError::ParseError("Invalid XML content".to_string()));
    }
    Ok(trimmed.to_string())
}
/// Walks the XML event stream of `xml` and builds the property tree.
///
/// Event handling:
/// * `Start`: attributes are collected and run through
///   `process_attributes` (namespace registration, language qualifiers).
///   Description elements contribute their attributes as fields, array
///   containers create an array node for the enclosing property, `li` and
///   `RDF` elements are ignored, and any other element is pushed onto the
///   current path.
/// * `Empty`: same attribute processing; only Description elements
///   contribute fields.
/// * `Text`: non-empty text is appended to the current array when the path
///   ends in the `"__array__"` marker, otherwise stored as a simple value.
/// * `End`: undoes the path push performed by the matching `Start`.
///
/// # Errors
///
/// Returns `XmpError::ParseError` when the XML reader reports an error, and
/// propagates errors from the handler methods.
fn parse_rdf(&mut self, xml: &str) -> XmpResult<StructureNode> {
    let mut reader = Reader::from_str(xml);
    reader.config_mut().trim_text(true);
    let mut buf = Vec::new();
    let mut root = StructureNode::new();
    // Nothing in this function pushes onto `stack`; it exists because
    // `handle_simple_text_item` takes a slice of open structures.
    let mut stack: Vec<StructureNode> = Vec::new();
    let mut current_path: Vec<String> = Vec::new();
    let mut current_qualifiers: Vec<Qualifier> = Vec::new();
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(e)) => {
                let name = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                let attrs = Self::collect_attributes(&e);
                self.process_attributes(&attrs, &mut current_qualifiers);
                if self.is_description_element(&name) {
                    self.handle_description_attributes(&attrs, &mut root, &current_qualifiers)?;
                } else if self.is_array_container(&name) {
                    self.handle_array_container(&name, &mut root, &mut current_path)?;
                } else if !self.is_li_element(&name) && !self.is_rdf_element(&name) {
                    self.push_element_to_path(&name, &mut current_path);
                }
            }
            Ok(Event::Empty(e)) => {
                let name = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                let attrs = Self::collect_attributes_empty(&e);
                self.process_attributes(&attrs, &mut current_qualifiers);
                if self.is_description_element(&name) {
                    self.handle_description_attributes(&attrs, &mut root, &current_qualifiers)?;
                }
            }
            Ok(Event::Text(e)) => {
                let raw_text = String::from_utf8_lossy(e.as_ref());
                // Fall back to the raw text when unescaping fails.
                let text = match unescape(&raw_text) {
                    Ok(unescaped) => unescaped.to_string(),
                    Err(_) => raw_text.to_string(),
                };
                let text = text.trim();
                // Nested conditions rather than `continue`, so that
                // `buf.clear()` at the bottom of the loop always runs.
                if !text.is_empty() {
                    if let Some(last_path) = current_path.last() {
                        if last_path == "__array__" {
                            self.handle_array_text_item(
                                &mut root,
                                &current_path,
                                text,
                                &current_qualifiers,
                            )?;
                        } else {
                            self.handle_simple_text_item(
                                &mut root,
                                &mut stack,
                                last_path,
                                text,
                                &current_qualifiers,
                            )?;
                        }
                    }
                }
            }
            Ok(Event::End(e)) => {
                let name = String::from_utf8_lossy(e.name().as_ref()).into_owned();
                if self.is_array_container(&name) {
                    // Only pop the marker if the matching Start pushed one.
                    if current_path.last().map(String::as_str) == Some("__array__") {
                        current_path.pop();
                    }
                } else if !self.is_description_element(&name)
                    && !self.is_rdf_element(&name)
                    && !self.is_li_element(&name)
                {
                    current_path.pop();
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(XmpError::ParseError(format!("XML parsing error: {}", e)));
            }
            _ => {}
        }
        buf.clear();
    }
    Ok(root)
}
/// Processes the attributes of one element.
///
/// Every `xmlns:<prefix>` attribute is registered in the namespace map
/// (the result of `register` is discarded). `current_qualifiers` is then
/// reset and refilled with one `xml:lang` qualifier per attribute accepted
/// by `is_lang_attribute`.
fn process_attributes(
    &mut self,
    attrs: &[(String, String)],
    current_qualifiers: &mut Vec<Qualifier>,
) {
    for (key, uri) in attrs {
        // A bare "xmlns" attribute has no "xmlns:" prefix to strip, so it
        // is skipped here without a separate check.
        if let Some(prefix) = key.strip_prefix("xmlns:") {
            let _ = self.namespaces.register(uri, prefix);
        }
    }

    current_qualifiers.clear();
    current_qualifiers.extend(
        attrs
            .iter()
            .filter(|(key, _)| self.is_lang_attribute(key))
            .map(|(_, lang)| Qualifier::new(ns::XML, "lang", lang.clone())),
    );
}
/// Collects the attributes of `e` as owned `(name, value)` pairs.
///
/// Attributes the reader reports as errors are skipped. Names and values
/// are decoded lossily as UTF-8; values are unescaped, falling back to the
/// raw text when unescaping fails.
fn collect_attributes(e: &quick_xml::events::BytesStart<'_>) -> Vec<(String, String)> {
    let mut pairs = Vec::new();
    for attr in e.attributes().filter_map(Result::ok) {
        let key = String::from_utf8_lossy(attr.key.as_ref()).into_owned();
        let raw_value = String::from_utf8_lossy(attr.value.as_ref());
        let value = unescape(&raw_value)
            .map(|unescaped| unescaped.into_owned())
            .unwrap_or_else(|_| raw_value.to_string());
        pairs.push((key, value));
    }
    pairs
}
/// Collects the attributes of an empty (self-closing) element.
///
/// Delegates directly to `collect_attributes`.
fn collect_attributes_empty(e: &quick_xml::events::BytesStart<'_>) -> Vec<(String, String)> {
Self::collect_attributes(e)
}
/// Returns `true` when the attribute's local name (the part after the last
/// `:`, or the whole name when there is none) is `lang`.
fn is_lang_attribute(&self, attr_name: &str) -> bool {
    matches!(attr_name.rsplit(':').next(), Some("lang"))
}
/// Returns `true` for `Description`, with or without a namespace prefix.
fn is_description_element(&self, name: &str) -> bool {
    let local_name = name.rsplit_once(':').map_or(name, |(_, local)| local);
    local_name == "Description"
}
/// Returns `true` for `Seq`, `Bag` or `Alt`, with or without a namespace
/// prefix.
fn is_array_container(&self, name: &str) -> bool {
    let local_name = name.rsplit_once(':').map_or(name, |(_, local)| local);
    matches!(local_name, "Seq" | "Bag" | "Alt")
}
/// Returns `true` for `li`, with or without a namespace prefix.
fn is_li_element(&self, name: &str) -> bool {
    let local_name = name.rsplit_once(':').map_or(name, |(_, local)| local);
    local_name == "li"
}
/// Returns `true` for `RDF`, with or without a namespace prefix.
fn is_rdf_element(&self, name: &str) -> bool {
    let local_name = name.rsplit_once(':').map_or(name, |(_, local)| local);
    local_name == "RDF"
}
/// Stores the property attributes of a Description element on `root`.
///
/// Attributes rejected by `should_skip_attribute`, attributes without a
/// `prefix:` part, and attributes whose prefix has no registered URI are
/// ignored. Each remaining attribute becomes a simple node stored under
/// `"<namespace-uri>:<local-name>"`, carrying a copy of every qualifier in
/// `qualifiers`.
fn handle_description_attributes(
    &self,
    attrs: &[(String, String)],
    root: &mut StructureNode,
    qualifiers: &[Qualifier],
) -> XmpResult<()> {
    let property_attrs = attrs
        .iter()
        .filter(|(key, _)| !self.should_skip_attribute(key));

    for (key, value) in property_attrs {
        let Some((prefix, local_name)) = key.split_once(':') else {
            continue;
        };

        // "TC260" and "C260" are treated as aliases: when one of them is
        // not registered, the other one is tried.
        let resolved = self.namespaces.get_uri(prefix).or_else(|| match prefix {
            "TC260" => self.namespaces.get_uri("C260"),
            "C260" => self.namespaces.get_uri("TC260"),
            _ => None,
        });
        let Some(uri) = resolved else {
            continue;
        };

        let field_path = format!("{}:{}", uri, local_name);
        let mut node = Node::simple(value.clone());
        if let Node::Simple(simple) = &mut node {
            for qualifier in qualifiers {
                simple.add_qualifier(qualifier.clone());
            }
        }
        root.set_field(field_path, node);
    }
    Ok(())
}
/// Returns `true` for attributes that are not stored as properties:
/// namespace declarations (`xmlns`, `xmlns:*`), `about` (with or without a
/// prefix), and anything accepted by `is_lang_attribute`.
fn should_skip_attribute(&self, attr_name: &str) -> bool {
    let is_namespace_decl = attr_name == "xmlns" || attr_name.starts_with("xmlns:");
    let local_name = attr_name.rsplit_once(':').map_or(attr_name, |(_, local)| local);
    is_namespace_decl || local_name == "about" || self.is_lang_attribute(attr_name)
}
/// Handles the start of an array container element.
///
/// Creates an empty array node for the property at the top of
/// `current_path`, stores it on `root`, and pushes the `"__array__"`
/// marker onto the path. Does nothing when the path is empty.
///
/// The array type is chosen from the element's local name: `Seq` is
/// ordered, `Bag` is unordered, and anything else is an alternative array.
fn handle_array_container(
    &self,
    name: &str,
    root: &mut StructureNode,
    current_path: &mut Vec<String>,
) -> XmpResult<()> {
    use crate::core::node::{ArrayNode, ArrayType};

    // Without an enclosing property there is nowhere to attach the array.
    let Some(owner) = current_path.last() else {
        return Ok(());
    };

    // Match on the local name only, so a namespace prefix that happens to
    // contain "Seq" or "Bag" cannot influence the array type.
    let local_name = name.rsplit_once(':').map_or(name, |(_, local)| local);
    let array_type = match local_name {
        "Seq" => ArrayType::Ordered,
        "Bag" => ArrayType::Unordered,
        _ => ArrayType::Alternative,
    };

    let full_path = self.resolve_path_to_full_format(owner);
    root.set_field(full_path, Node::Array(ArrayNode::new(array_type)));
    current_path.push("__array__".to_string());
    Ok(())
}
/// Pushes an element name onto `current_path`.
///
/// A `prefix:local` name whose prefix has a registered URI is pushed as
/// `"<namespace-uri>:<local>"`; any other name is pushed unchanged.
fn push_element_to_path(&self, name: &str, current_path: &mut Vec<String>) {
    let entry = name
        .split_once(':')
        .and_then(|(prefix, local_name)| {
            self.namespaces
                .get_uri(prefix)
                .map(|uri| format!("{}:{}", uri, local_name))
        })
        .unwrap_or_else(|| name.to_string());
    current_path.push(entry);
}
/// Expands a `prefix:local` path to `"<namespace-uri>:<local>"`.
///
/// Paths that already start with `http://`, paths without a `:`, and paths
/// whose prefix has no registered URI are returned unchanged.
fn resolve_path_to_full_format(&self, path: &str) -> String {
    if path.starts_with("http://") {
        return path.to_string();
    }
    match path.split_once(':') {
        Some((prefix, local_name)) => match self.namespaces.get_uri(prefix) {
            Some(uri) => format!("{}:{}", uri, local_name),
            None => path.to_string(),
        },
        None => path.to_string(),
    }
}
/// Appends `text` as a simple item to the array being filled.
///
/// The array is looked up on `root` under the path entry directly below
/// the last entry of `current_path` (the caller invokes this when the last
/// entry is the `"__array__"` marker). The new item carries a copy of
/// every qualifier in `qualifiers`.
///
/// Does nothing when the path has fewer than two entries or when the field
/// at that path is missing or is not an array.
fn handle_array_text_item(
    &self,
    root: &mut StructureNode,
    current_path: &[String],
    text: &str,
    qualifiers: &[Qualifier],
) -> XmpResult<()> {
    // Second entry from the end; `None` when the path is too short.
    let Some(prop_path) = current_path.iter().rev().nth(1) else {
        return Ok(());
    };
    let Some(Node::Array(ref mut arr)) = root.get_field_mut(prop_path) else {
        return Ok(());
    };

    let mut item = Node::simple(text);
    if let Node::Simple(simple) = &mut item {
        for qualifier in qualifiers {
            simple.add_qualifier(qualifier.clone());
        }
    }
    arr.append(item);
    Ok(())
}
/// Stores `text` as a simple value for the property named by `last_path`.
///
/// The path is expanded with `resolve_path_to_full_format`. If `root`
/// already holds an array at that path the text is dropped. Otherwise a
/// simple node carrying a copy of every qualifier in `qualifiers` is set
/// on the last entry of `stack`, or on `root` when `stack` is empty.
fn handle_simple_text_item(
    &self,
    root: &mut StructureNode,
    stack: &mut [StructureNode],
    last_path: &str,
    text: &str,
    qualifiers: &[Qualifier],
) -> XmpResult<()> {
    // Same expansion rules as for array containers: one shared helper
    // instead of a second inline copy of the prefix lookup.
    let field_path = self.resolve_path_to_full_format(last_path);

    let holds_array = root
        .get_field(&field_path)
        .map_or(false, |node| node.is_array());
    if holds_array {
        return Ok(());
    }

    let mut simple_node = Node::simple(text);
    if let Node::Simple(simple) = &mut simple_node {
        for qualifier in qualifiers {
            simple.add_qualifier(qualifier.clone());
        }
    }

    if let Some(parent) = stack.last_mut() {
        parent.set_field(field_path, simple_node);
    } else {
        root.set_field(field_path, simple_node);
    }
    Ok(())
}
}
/// `XmpParser::default()` is the same as `XmpParser::new()`.
impl Default for XmpParser {
/// Builds the parser via `Self::new()`.
fn default() -> Self {
Self::new()
}
}
// Unit tests for packet extraction and RDF attribute parsing.
#[cfg(test)]
mod tests {
use super::*;
// The text between the xpacket begin/end processing instructions must be
// returned, so the extracted content still contains the RDF root element.
#[test]
fn test_extract_packet_content() {
let parser = XmpParser::new();
let xml = r#"<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
<rdf:RDF>...</rdf:RDF>
<?xpacket end="w"?>"#;
let content = parser.extract_packet_content(xml).unwrap();
assert!(content.contains("<rdf:RDF>"));
}
// A property written as an attribute of a self-closing rdf:Description
// must be stored under "<namespace-uri>:<local-name>".
#[test]
fn test_parse_simple_rdf() {
let mut parser = XmpParser::new();
let xml = r#"
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:xmp="http://ns.adobe.com/xap/1.0/">
<rdf:Description rdf:about=""
xmp:CreatorTool="MyApp"/>
</rdf:RDF>"#;
let result = parser.parse_rdf(xml);
assert!(result.is_ok());
let root = result.unwrap();
// Diagnostic output: the parsed field names and the URI registered for
// the "xmp" prefix.
for field_name in root.field_names() {
eprintln!("Field: {}", field_name);
}
eprintln!("xmp URI: {:?}", parser.namespaces.get_uri("xmp"));
assert!(root.has_field("http://ns.adobe.com/xap/1.0/:CreatorTool"));
}
}