Skip to main content

xml_disassembler/builders/
build_disassembled_files.rs

1//! Build disassembled files from source XML file.
2
3use crate::builders::{build_disassembled_file, extract_root_attributes};
4use crate::parsers::{extract_xml_declaration_from_raw, parse_element_unified};
5use crate::types::{
6    BuildDisassembledFilesOptions, DecomposeRule, XmlElementArrayMap, XmlElementParams,
7};
8use crate::utils::normalize_path_unix;
9use serde_json::{Map, Value};
10use std::collections::HashMap;
11use tokio::fs;
12
/// Chunk length used by `disassemble_element_keys` when it walks the elements
/// stored under a single root key.
const BATCH_SIZE: usize = 20;
14
15fn get_root_info(parsed_xml: &Value) -> Option<(String, Value)> {
16    let obj = parsed_xml.as_object()?;
17    let root_element_name = obj.keys().find(|k| *k != "?xml")?.clone();
18    let root_element = obj.get(&root_element_name)?.clone();
19    Some((root_element_name, root_element))
20}
21
22fn order_xml_element_keys(content: &Map<String, Value>, key_order: &[String]) -> Value {
23    let mut ordered = Map::new();
24    for key in key_order {
25        if let Some(v) = content.get(key) {
26            ordered.insert(key.clone(), v.clone());
27        }
28    }
29    Value::Object(ordered)
30}
31
32#[allow(clippy::too_many_arguments)]
33async fn disassemble_element_keys(
34    root_element: &Value,
35    key_order: &[String],
36    disassembled_path: &str,
37    root_element_name: &str,
38    root_attributes: &Value,
39    xml_declaration: Option<&Value>,
40    unique_id_elements: Option<&str>,
41    strategy: &str,
42    format: &str,
43) -> (Map<String, Value>, XmlElementArrayMap, usize, bool) {
44    let mut leaf_content = Map::new();
45    let mut nested_groups = XmlElementArrayMap::new();
46    let mut leaf_count = 0usize;
47    let mut has_nested_elements = false;
48
49    let empty_map = Map::new();
50    let root_obj = root_element.as_object().unwrap_or(&empty_map);
51
52    // Iterate root_obj in key_order's ordering: we consume only keys that are present,
53    // which matches the caller's invariant and keeps the loop body branch-free.
54    let ordered: Vec<(&String, &Value)> = key_order
55        .iter()
56        .filter_map(|k| root_obj.get_key_value(k))
57        .collect();
58    for (key, val) in ordered {
59        let elements: Vec<Value> = match val.as_array() {
60            Some(arr) => arr.clone(),
61            None => vec![val.clone()],
62        };
63
64        for chunk in elements.chunks(BATCH_SIZE) {
65            for element in chunk {
66                let result = parse_element_unified(XmlElementParams {
67                    element: element.clone(),
68                    disassembled_path,
69                    unique_id_elements,
70                    root_element_name,
71                    root_attributes: root_attributes.clone(),
72                    key,
73                    leaf_content: Value::Object(Map::new()),
74                    leaf_count,
75                    has_nested_elements,
76                    format,
77                    xml_declaration: xml_declaration.cloned(),
78                    strategy,
79                })
80                .await;
81
82                if let Some(arr) = result.leaf_content.as_object().and_then(|o| o.get(key)) {
83                    match leaf_content.get_mut(key).and_then(|v| v.as_array_mut()) {
84                        Some(existing_arr) => {
85                            if let Some(new_arr) = arr.as_array() {
86                                existing_arr.extend(new_arr.iter().cloned());
87                            }
88                        }
89                        None => {
90                            leaf_content.insert(key.clone(), arr.clone());
91                        }
92                    }
93                }
94
95                if strategy == "grouped-by-tag" {
96                    if let Some(groups) = result.nested_groups {
97                        for (tag, arr) in groups {
98                            nested_groups.entry(tag).or_default().extend(arr);
99                        }
100                    }
101                }
102
103                leaf_count = result.leaf_count;
104                has_nested_elements = result.has_nested_elements;
105            }
106        }
107    }
108
109    (leaf_content, nested_groups, leaf_count, has_nested_elements)
110}
111
112/// Extract string from an element's field - handles direct strings and objects with #text (XML leaf elements).
113fn get_field_value(element: &Value, field: &str) -> Option<String> {
114    let v = element.as_object()?.get(field)?;
115    if let Some(s) = v.as_str() {
116        return Some(s.to_string());
117    }
118    v.as_object()
119        .and_then(|child| child.get("#text"))
120        .and_then(|t| t.as_str())
121        .map(|s| s.to_string())
122}
123
/// Group-mode key: everything before the first `.` in `s`, or all of `s` when
/// it contains no `.` (e.g. `"Account.Name"` becomes `"Account"`).
fn group_key_from_field_value(s: &str) -> &str {
    // `split` always yields at least one segment, so the fallback is never hit;
    // it is kept only to avoid an `unwrap`.
    s.split('.').next().unwrap_or(s)
}
128
/// Make `s` safe to use as a file name.
///
/// ASCII letters, ASCII digits, `-`, `_` and `.` are kept; every other
/// character (path separators, whitespace, non-ASCII, ...) becomes `_`.
fn sanitize_filename(s: &str) -> String {
    let mut cleaned = String::with_capacity(s.len());
    for ch in s.chars() {
        let keep = ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.');
        cleaned.push(if keep { ch } else { '_' });
    }
    cleaned
}
141
142async fn write_nested_groups(
143    nested_groups: &XmlElementArrayMap,
144    strategy: &str,
145    options: &WriteNestedOptions<'_>,
146) {
147    if strategy != "grouped-by-tag" {
148        return;
149    }
150    let decompose_by_tag: HashMap<&str, &DecomposeRule> = options
151        .decompose_rules
152        .map(|rules| rules.iter().map(|r| (r.tag.as_str(), r)).collect())
153        .unwrap_or_default();
154
155    for (tag, arr) in nested_groups {
156        let rule = decompose_by_tag.get(tag.as_str());
157        let path_segment = rule
158            .map(|r| {
159                if r.path_segment.is_empty() {
160                    &r.tag
161                } else {
162                    &r.path_segment
163                }
164            })
165            .unwrap_or(tag);
166
167        if let Some(r) = rule {
168            if r.mode == "split" {
169                for (idx, item) in arr.iter().enumerate() {
170                    let name = get_field_value(item, &r.field)
171                        .as_deref()
172                        .map(sanitize_filename)
173                        .filter(|s: &String| !s.is_empty())
174                        .unwrap_or_else(|| idx.to_string());
175                    let file_name = format!("{}.{}-meta.{}", name, tag, options.format);
176                    let _ = build_disassembled_file(crate::types::BuildDisassembledFileOptions {
177                        content: item.clone(),
178                        disassembled_path: options.disassembled_path,
179                        output_file_name: Some(&file_name),
180                        subdirectory: Some(path_segment),
181                        wrap_key: Some(tag),
182                        is_grouped_array: false,
183                        root_element_name: options.root_element_name,
184                        root_attributes: options.root_attributes.clone(),
185                        format: options.format,
186                        xml_declaration: options.xml_declaration.clone(),
187                        unique_id_elements: None,
188                    })
189                    .await;
190                }
191            } else if r.mode == "group" {
192                let mut by_key: HashMap<String, Vec<Value>> = HashMap::new();
193                for item in arr {
194                    let key = get_field_value(item, &r.field)
195                        .as_deref()
196                        .map(group_key_from_field_value)
197                        .map(sanitize_filename)
198                        .filter(|s: &String| !s.is_empty())
199                        .unwrap_or_else(|| "unknown".to_string());
200                    by_key.entry(key).or_default().push(item.clone());
201                }
202                // Sort keys for deterministic cross-platform output order
203                let mut sorted_keys: Vec<_> = by_key.keys().cloned().collect();
204                sorted_keys.sort();
205                for key in sorted_keys {
206                    let group = by_key.remove(&key).unwrap();
207                    let file_name = format!("{}.{}-meta.{}", key, tag, options.format);
208                    let _ = build_disassembled_file(crate::types::BuildDisassembledFileOptions {
209                        content: Value::Array(group),
210                        disassembled_path: options.disassembled_path,
211                        output_file_name: Some(&file_name),
212                        subdirectory: Some(path_segment),
213                        wrap_key: Some(tag),
214                        is_grouped_array: true,
215                        root_element_name: options.root_element_name,
216                        root_attributes: options.root_attributes.clone(),
217                        format: options.format,
218                        xml_declaration: options.xml_declaration.clone(),
219                        unique_id_elements: None,
220                    })
221                    .await;
222                }
223            } else {
224                fallback_write_one_file(tag, arr, path_segment, options).await;
225            }
226        } else {
227            fallback_write_one_file(tag, arr, path_segment, options).await;
228        }
229    }
230}
231
232async fn fallback_write_one_file(
233    tag: &str,
234    arr: &[Value],
235    _path_segment: &str,
236    options: &WriteNestedOptions<'_>,
237) {
238    let _ = build_disassembled_file(crate::types::BuildDisassembledFileOptions {
239        content: Value::Array(arr.to_vec()),
240        disassembled_path: options.disassembled_path,
241        output_file_name: Some(&format!("{}.{}", tag, options.format)),
242        subdirectory: None,
243        wrap_key: Some(tag),
244        is_grouped_array: true,
245        root_element_name: options.root_element_name,
246        root_attributes: options.root_attributes.clone(),
247        format: options.format,
248        xml_declaration: options.xml_declaration.clone(),
249        unique_id_elements: None,
250    })
251    .await;
252}
253
254struct WriteNestedOptions<'a> {
255    disassembled_path: &'a str,
256    root_element_name: &'a str,
257    root_attributes: Value,
258    xml_declaration: Option<Value>,
259    format: &'a str,
260    decompose_rules: Option<&'a [DecomposeRule]>,
261}
262
263pub async fn build_disassembled_files_unified(
264    options: BuildDisassembledFilesOptions<'_>,
265) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
266    let BuildDisassembledFilesOptions {
267        file_path,
268        disassembled_path,
269        base_name,
270        post_purge,
271        format,
272        unique_id_elements,
273        strategy,
274        decompose_rules,
275    } = options;
276
277    let file_path = normalize_path_unix(file_path);
278
279    let xml_content = match fs::read_to_string(&file_path).await {
280        Ok(c) => c,
281        Err(_) => return Ok(()),
282    };
283
284    let parsed_xml = match crate::parsers::parse_xml_from_str(&xml_content, &file_path) {
285        Some(p) => p,
286        None => return Ok(()),
287    };
288
289    let (root_element_name, root_element) = match get_root_info(&parsed_xml) {
290        Some(info) => info,
291        None => return Ok(()),
292    };
293    // The custom parser ignores <?xml ?>; always recover it from raw XML.
294    let xml_declaration = extract_xml_declaration_from_raw(&xml_content);
295
296    let root_attributes = extract_root_attributes(&root_element);
297    let key_order: Vec<String> = root_element
298        .as_object()
299        .map(|o| o.keys().filter(|k| !k.starts_with('@')).cloned().collect())
300        .unwrap_or_default();
301
302    let (leaf_content, nested_groups, leaf_count, has_nested_elements) = disassemble_element_keys(
303        &root_element,
304        &key_order,
305        disassembled_path,
306        &root_element_name,
307        &root_attributes,
308        xml_declaration.as_ref(),
309        unique_id_elements,
310        strategy,
311        format,
312    )
313    .await;
314
315    if !has_nested_elements && leaf_count > 0 {
316        log::error!(
317            "The XML file {} only has leaf elements. This file will not be disassembled.",
318            &file_path
319        );
320        return Ok(());
321    }
322
323    let write_opts = WriteNestedOptions {
324        disassembled_path,
325        root_element_name: &root_element_name,
326        root_attributes: root_attributes.clone(),
327        xml_declaration: xml_declaration.clone(),
328        format,
329        decompose_rules,
330    };
331    write_nested_groups(&nested_groups, strategy, &write_opts).await;
332
333    // Persist root key order so reassembly can match original document order.
334    // serde_json::to_string never fails for Vec<String>; writes are best-effort.
335    let key_order_path = std::path::Path::new(disassembled_path).join(".key_order.json");
336    let json = serde_json::to_string(&key_order).unwrap_or_else(|_| "[]".to_string());
337    let _ = fs::write(key_order_path, json).await;
338
339    if leaf_count > 0 {
340        let final_leaf_content = if strategy == "grouped-by-tag" {
341            order_xml_element_keys(&leaf_content, &key_order)
342        } else {
343            Value::Object(leaf_content.clone())
344        };
345
346        let _ = build_disassembled_file(crate::types::BuildDisassembledFileOptions {
347            content: final_leaf_content,
348            disassembled_path,
349            output_file_name: Some(&format!("{}.{}", base_name, format)),
350            subdirectory: None,
351            wrap_key: None,
352            is_grouped_array: false,
353            root_element_name: &root_element_name,
354            root_attributes: root_attributes.clone(),
355            format,
356            xml_declaration: xml_declaration.clone(),
357            unique_id_elements: None,
358        })
359        .await;
360    }
361
362    if post_purge {
363        // Best-effort purge; a failure here is benign (file may have been removed concurrently).
364        let _ = fs::remove_file(&file_path).await;
365    }
366
367    Ok(())
368}
369
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // --- get_field_value -------------------------------------------------

    #[test]
    fn get_field_value_returns_direct_string() {
        let element = json!({ "field": "value" });
        assert_eq!(get_field_value(&element, "field").as_deref(), Some("value"));
    }

    #[test]
    fn get_field_value_returns_nested_text() {
        let element = json!({ "field": { "#text": "value" } });
        assert_eq!(get_field_value(&element, "field").as_deref(), Some("value"));
    }

    #[test]
    fn get_field_value_returns_none_when_missing_or_non_string() {
        // `#text` one level too deep, and a key that does not exist at all.
        let nested_too_deep = json!({ "field": { "nested": { "#text": "x" } } });
        assert_eq!(get_field_value(&nested_too_deep, "field"), None);
        assert_eq!(get_field_value(&nested_too_deep, "missing"), None);

        // The element itself is not an object.
        let scalar = json!("not-an-object");
        assert_eq!(get_field_value(&scalar, "field"), None);
    }

    // --- small string helpers --------------------------------------------

    #[test]
    fn group_key_from_field_value_takes_prefix_before_dot() {
        let cases = [("Account.Name", "Account"), ("NoDot", "NoDot")];
        for (input, expected) in cases {
            assert_eq!(group_key_from_field_value(input), expected);
        }
    }

    #[test]
    fn sanitize_filename_replaces_disallowed_chars_with_underscore() {
        let cases = [("a/b c:d", "a_b_c_d"), ("ok-name_1.xml", "ok-name_1.xml")];
        for (input, expected) in cases {
            assert_eq!(sanitize_filename(input), expected);
        }
    }

    // --- order_xml_element_keys ------------------------------------------

    #[test]
    fn order_xml_element_keys_preserves_order_and_drops_absent() {
        let mut content = Map::new();
        content.insert("b".to_string(), json!(2));
        content.insert("a".to_string(), json!(1));
        let wanted = ["a".to_string(), "c".to_string(), "b".to_string()];

        let ordered = order_xml_element_keys(&content, &wanted);

        let keys: Vec<&str> = ordered
            .as_object()
            .unwrap()
            .keys()
            .map(String::as_str)
            .collect();
        assert_eq!(keys, vec!["a", "b"]);
    }

    // --- get_root_info ---------------------------------------------------

    #[test]
    fn get_root_info_returns_name_and_element() {
        let parsed = json!({ "?xml": {"@version": "1.0"}, "Root": { "child": 1 } });
        let (name, element) = get_root_info(&parsed).unwrap();
        assert_eq!(name, "Root");
        assert!(element.as_object().unwrap().contains_key("child"));
    }

    #[test]
    fn get_root_info_returns_none_for_non_object_or_decl_only() {
        let not_an_object = json!("s");
        let declaration_only = json!({ "?xml": {} });
        assert!(get_root_info(&not_an_object).is_none());
        assert!(get_root_info(&declaration_only).is_none());
    }

    // --- build_disassembled_files_unified --------------------------------

    #[tokio::test]
    async fn unified_build_returns_ok_when_source_unreadable() {
        // The source file does not exist: the build must return Ok(()) and
        // must not create the output directory.
        let workspace = tempfile::tempdir().unwrap();
        let output_dir = workspace.path().join("out");
        let source = workspace.path().join("does_not_exist.xml");

        let outcome = build_disassembled_files_unified(BuildDisassembledFilesOptions {
            file_path: source.to_str().unwrap(),
            disassembled_path: output_dir.to_str().unwrap(),
            base_name: "does_not_exist",
            post_purge: false,
            format: "xml",
            unique_id_elements: None,
            strategy: "unique-id",
            decompose_rules: None,
        })
        .await;

        outcome.unwrap();
        assert!(!output_dir.exists());
    }
}
454}