Skip to main content

xml_disassembler/parsers/
parse_xml.rs

//! Parse XML from a file path or a string into an `XmlElement`, and recover the `xmlns`
//! attribute and XML declaration from the raw text.
2
3use quickxml_to_serde::{Config, NullValue};
4use serde_json::Value;
5use tokio::fs;
6
7use crate::parsers::strip_whitespace_text_nodes;
8use crate::types::XmlElement;
9
10/// XML parser config matching fast-xml-parser behavior:
11/// - @ prefix for attributes
12/// - #text for text nodes
13/// - leading_zero_as_string to preserve string values
14fn xml_parser_config() -> Config {
15    Config::new_with_custom_values(true, "@", "#text", NullValue::EmptyObject)
16}
17
18/// Parses an XML file from a path.
19pub async fn parse_xml(file_path: &str) -> Option<XmlElement> {
20    let content = match fs::read_to_string(file_path).await {
21        Ok(c) => c,
22        Err(e) => {
23            log::error!(
24                "{} was unable to be parsed and will not be processed. Confirm formatting and try again.",
25                file_path
26            );
27            log::debug!("Parse error: {}", e);
28            return None;
29        }
30    };
31    parse_xml_from_str(&content, file_path)
32}
33
34/// Parses XML from a string. The file_path is used for error logging only.
35pub fn parse_xml_from_str(content: &str, file_path: &str) -> Option<XmlElement> {
36    let config = xml_parser_config();
37    let parsed: Value = match quickxml_to_serde::xml_string_to_json(content.to_string(), &config) {
38        Ok(v) => v,
39        Err(e) => {
40            log::error!(
41                "{} was unable to be parsed and will not be processed. Confirm formatting and try again.",
42                file_path
43            );
44            log::debug!("Parse error: {}", e);
45            return None;
46        }
47    };
48
49    let cleaned = strip_whitespace_text_nodes(&parsed);
50    Some(cleaned)
51}
52
/// Extract the `xmlns` attribute value from raw XML (quickxml_to_serde drops it).
///
/// Scans for the first `xmlns` token that is followed by `=` and a quoted value.
/// Both `"` and `'` quotes are accepted and XML whitespace is allowed around `=`,
/// as the XML grammar permits. Prefixed declarations such as `xmlns:xsi="..."` are
/// skipped because the character after `xmlns` is `:` rather than `=`.
///
/// Returns `Some(value)` (possibly an empty string) if found, `None` otherwise.
pub fn extract_xmlns_from_raw(xml_content: &str) -> Option<String> {
    const ATTR_NAME: &str = "xmlns";

    let mut remaining = xml_content;
    while let Some(pos) = remaining.find(ATTR_NAME) {
        // `ATTR_NAME` is ASCII, so this offset is always on a char boundary.
        let after_name = &remaining[pos + ATTR_NAME.len()..];
        if let Some(value) = quoted_value_after_eq(after_name) {
            return Some(value.to_string());
        }
        // Not an `xmlns=<quoted>` occurrence; keep looking further along.
        remaining = after_name;
    }
    None
}

/// Reads `="value"` or `='value'` (with optional XML whitespace around `=`) from the
/// start of `input` and returns the text between the quotes.
///
/// Returns `None` if `input` does not start with that shape or the closing quote is missing.
fn quoted_value_after_eq(input: &str) -> Option<&str> {
    let is_xml_space = |c: char| matches!(c, ' ' | '\t' | '\r' | '\n');

    let after_eq = input
        .trim_start_matches(is_xml_space)
        .strip_prefix('=')?
        .trim_start_matches(is_xml_space);

    // The value must be closed by the same quote character that opened it.
    let quote = after_eq
        .chars()
        .next()
        .filter(|c| matches!(c, '"' | '\''))?;
    let body = &after_eq[quote.len_utf8()..];
    body.find(quote).map(|end| &body[..end])
}
59
60/// Extract XML declaration from raw XML (quickxml_to_serde drops it).
61/// Returns a Value object like {"@version": "1.0", "@encoding": "UTF-8", "@standalone": "yes"}
62/// for use in build_xml_string. None if no declaration found.
63pub fn extract_xml_declaration_from_raw(xml_content: &str) -> Option<XmlElement> {
64    let decl_re = regex::Regex::new(r#"<\?xml\s+([^?]+)\?>"#).ok()?;
65    let decl_content = decl_re.captures(xml_content)?.get(1)?.as_str();
66    let mut decl = serde_json::Map::new();
67    let version_re = regex::Regex::new(r#"version="([^"]*)""#).ok()?;
68    if let Some(cap) = version_re.captures(decl_content) {
69        decl.insert("@version".to_string(), Value::String(cap[1].to_string()));
70    } else {
71        return None;
72    }
73    let encoding_re = regex::Regex::new(r#"encoding="([^"]*)""#).ok()?;
74    if let Some(cap) = encoding_re.captures(decl_content) {
75        decl.insert("@encoding".to_string(), Value::String(cap[1].to_string()));
76    }
77    let standalone_re = regex::Regex::new(r#"standalone="([^"]*)""#).ok()?;
78    if let Some(cap) = standalone_re.captures(decl_content) {
79        decl.insert("@standalone".to_string(), Value::String(cap[1].to_string()));
80    }
81    Some(Value::Object(decl))
82}
83
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses the declaration of `xml` and returns the string stored under `key`, if any.
    fn declared(xml: &str, key: &str) -> Option<String> {
        extract_xml_declaration_from_raw(xml)?
            .as_object()?
            .get(key)?
            .as_str()
            .map(str::to_owned)
    }

    #[test]
    fn extract_xmlns_from_raw_finds_namespace() {
        let found = extract_xmlns_from_raw(
            r#"<root xmlns="http://soap.sforce.com/2006/04/metadata"><a/></root>"#,
        );
        assert_eq!(
            found.as_deref(),
            Some("http://soap.sforce.com/2006/04/metadata")
        );
    }

    #[test]
    fn extract_xmlns_from_raw_returns_none_when_absent() {
        let found = extract_xmlns_from_raw(r#"<root><a/></root>"#);
        assert_eq!(found, None);
    }

    #[test]
    fn extract_xml_declaration_from_raw_parses_version_and_encoding() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8"?><root/>"#;
        assert_eq!(declared(xml, "@version").as_deref(), Some("1.0"));
        assert_eq!(declared(xml, "@encoding").as_deref(), Some("UTF-8"));
    }

    #[test]
    fn extract_xml_declaration_from_raw_parses_standalone() {
        let xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><root/>"#;
        assert_eq!(declared(xml, "@standalone").as_deref(), Some("yes"));
    }

    #[test]
    fn extract_xml_declaration_from_raw_returns_none_without_declaration() {
        let result = extract_xml_declaration_from_raw(r#"<root/>"#);
        assert!(result.is_none());
    }
}