Skip to main content

config_disassembler/xml/handlers/
reassemble.rs

1//! Reassemble XML from disassembled directory.
2
3use crate::xml::builders::{build_xml_string, merge_xml_elements, reorder_root_keys};
4use crate::xml::multi_level::{ensure_segment_files_structure, load_multi_level_config};
5use crate::xml::parsers::parse_to_xml_object;
6use crate::xml::types::XmlElement;
7use crate::xml::utils::normalize_path_unix;
8use serde_json::Value;
9use std::future::Future;
10use std::path::Path;
11use std::pin::Pin;
12use tokio::fs;
13
14/// Read a `.key_order.json` file (if present) and parse it as a list of root key names.
15async fn read_key_order(path: &Path) -> Option<Vec<String>> {
16    let bytes = fs::read(path).await.ok()?;
17    serde_json::from_slice::<Vec<String>>(&bytes).ok()
18}
19
20/// Remove @xmlns from an object so the reassembled segment wrapper (e.g. programProcesses) has no xmlns.
21fn strip_xmlns_from_value(v: Value) -> Value {
22    match v {
23        Value::Object(obj) => {
24            Value::Object(obj.into_iter().filter(|(k, _)| k != "@xmlns").collect())
25        }
26        other => other,
27    }
28}
29
30type ProcessDirFuture<'a> = Pin<
31    Box<
32        dyn Future<Output = Result<Vec<XmlElement>, Box<dyn std::error::Error + Send + Sync>>>
33            + Send
34            + 'a,
35    >,
36>;
37
/// Handler that rebuilds a single file from a directory of disassembled pieces.
///
/// The type is a unit struct and holds no state, so it is cheap to copy and can be
/// printed with `{:?}` or embedded in other `Debug` types.
#[derive(Debug, Clone, Copy)]
pub struct ReassembleXmlFileHandler;
39
40impl ReassembleXmlFileHandler {
41    pub fn new() -> Self {
42        Self
43    }
44
45    pub async fn reassemble(
46        &self,
47        file_path: &str,
48        file_extension: Option<&str>,
49        post_purge: bool,
50    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
51        let file_path = normalize_path_unix(file_path);
52        if !self.validate_directory(&file_path).await? {
53            return Ok(());
54        }
55
56        let path = Path::new(&file_path);
57        let config = load_multi_level_config(path).await;
58        if let Some(ref config) = config {
59            for rule in &config.rules {
60                let segment_path = path.join(&rule.path_segment);
61                self.reassemble_multi_level_segment(&segment_path, rule)
62                    .await?;
63            }
64        }
65
66        // Build one base-segment entry per multi-level rule so the recursive walker can
67        // recognize each rule's path_segment under the disassembly root.
68        let base_segments: Vec<(String, String, bool)> = config
69            .as_ref()
70            .map(|c| {
71                c.rules
72                    .iter()
73                    .map(|r| (file_path.clone(), r.path_segment.clone(), true))
74                    .collect()
75            })
76            .unwrap_or_default();
77        // When multi-level reassembly is done, purge the entire disassembled directory
78        let post_purge_final = post_purge || config.is_some();
79        self.reassemble_plain(&file_path, file_extension, post_purge_final, &base_segments)
80            .await
81    }
82
83    /// Reassemble a single multi-level segment directory: walk each process dir, reassemble
84    /// nested segments, reassemble the process, then ensure the wrapper structure.
85    async fn reassemble_multi_level_segment(
86        &self,
87        segment_path: &Path,
88        rule: &crate::xml::types::MultiLevelRule,
89    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
90        if !segment_path.is_dir() {
91            return Ok(());
92        }
93        let mut entries = Vec::new();
94        let mut read_dir = fs::read_dir(segment_path).await?;
95        while let Some(entry) = read_dir.next_entry().await? {
96            entries.push(entry);
97        }
98        entries.sort_by_key(|e| e.file_name());
99        for entry in entries {
100            let process_path = entry.path();
101            if !process_path.is_dir() {
102                continue;
103            }
104            let process_path_str = normalize_path_unix(&process_path.to_string_lossy());
105            let mut sub_entries = Vec::new();
106            let mut sub_read = fs::read_dir(&process_path).await?;
107            while let Some(e) = sub_read.next_entry().await? {
108                sub_entries.push(e);
109            }
110            sub_entries.sort_by_key(|e| e.file_name());
111            for sub_entry in sub_entries {
112                let sub_path = sub_entry.path();
113                if sub_path.is_dir() {
114                    let sub_path_str = normalize_path_unix(&sub_path.to_string_lossy());
115                    self.reassemble_plain(&sub_path_str, Some("xml"), true, &[])
116                        .await?;
117                }
118            }
119            self.reassemble_plain(&process_path_str, Some("xml"), true, &[])
120                .await?;
121        }
122        ensure_segment_files_structure(
123            segment_path,
124            &rule.wrap_root_element,
125            &rule.path_segment,
126            &rule.wrap_xmlns,
127        )
128        .await?;
129        Ok(())
130    }
131
132    /// Merge and write reassembled XML (no multi-level pre-step). Used internally.
133    /// `base_segments` carries one tuple `(base_path, segment_name, extract_inner)` per
134    /// multi-level rule. When the recursive walker reaches `base_path` and finds a subdir
135    /// whose name matches one of the segment_names, that subdir's XML files are folded
136    /// into a single array under the segment_name key. When extract_inner is true, each
137    /// file's structure is `document_root > segment_name > content` and only the content
138    /// is collected; otherwise the whole root is kept.
139    async fn reassemble_plain(
140        &self,
141        file_path: &str,
142        file_extension: Option<&str>,
143        post_purge: bool,
144        base_segments: &[(String, String, bool)],
145    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
146        let file_path = normalize_path_unix(file_path);
147        log::debug!("Parsing directory to reassemble: {}", file_path);
148        let parsed_objects = self
149            .process_files_in_directory(file_path.to_string(), base_segments.to_vec())
150            .await?;
151
152        if parsed_objects.is_empty() {
153            log::error!(
154                "No files under {} were parsed successfully. A reassembled XML file was not created.",
155                file_path
156            );
157            return Ok(());
158        }
159
160        // merge_xml_elements only returns None when every parsed element is empty or
161        // declaration-only (no usable root). Treat that the same as "nothing parsed"
162        // rather than emitting an `<root></root>` stub.
163        let Some(mut merged) = merge_xml_elements(&parsed_objects) else {
164            log::error!(
165                "No usable root element found while merging files under {}. A reassembled XML file was not created.",
166                file_path
167            );
168            return Ok(());
169        };
170
171        // Apply stored key order so reassembled XML matches original document order.
172        let key_order_path = Path::new(&file_path).join(".key_order.json");
173        if let Some(reordered) = read_key_order(&key_order_path)
174            .await
175            .and_then(|order| reorder_root_keys(&merged, &order))
176        {
177            merged = reordered;
178        }
179
180        let final_xml = build_xml_string(&merged);
181        let output_path = self.get_output_path(&file_path, file_extension);
182
183        fs::write(&output_path, final_xml).await?;
184
185        if post_purge {
186            fs::remove_dir_all(file_path).await.ok();
187        }
188
189        Ok(())
190    }
191
192    fn process_files_in_directory<'a>(
193        &'a self,
194        dir_path: String,
195        base_segments: Vec<(String, String, bool)>,
196    ) -> ProcessDirFuture<'a> {
197        Box::pin(async move {
198            let mut parsed = Vec::new();
199            let mut entries = Vec::new();
200            let mut read_dir = fs::read_dir(&dir_path).await?;
201            while let Some(entry) = read_dir.next_entry().await? {
202                entries.push(entry);
203            }
204            // Sort by full filename for deterministic cross-platform ordering
205            entries.sort_by(|a, b| {
206                let a_name = a.file_name().to_string_lossy().to_string();
207                let b_name = b.file_name().to_string_lossy().to_string();
208                a_name.cmp(&b_name)
209            });
210
211            // We are at the disassembly root for a given rule when our dir_path matches
212            // the base_path stored on that rule. Each rule shares the same base_path in
213            // the current implementation, but tracking them per-entry keeps the door open
214            // for future per-rule base_paths without another signature change.
215            let is_base = base_segments.iter().any(|(base, _, _)| dir_path == *base);
216
217            for entry in entries {
218                let path = entry.path();
219                let file_path = normalize_path_unix(&path.to_string_lossy()).to_string();
220
221                if path.is_file() {
222                    let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
223                    if !name.starts_with('.') && self.is_parsable_file(name) {
224                        if let Some(parsed_obj) = parse_to_xml_object(&file_path).await {
225                            parsed.push(parsed_obj);
226                        }
227                    }
228                } else {
229                    // Anything not a regular file is treated as a directory; symlinks and
230                    // other exotic entries simply recurse via read_dir below.
231                    let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
232                    let matched_segment = if is_base {
233                        base_segments
234                            .iter()
235                            .find(|(_, seg_name, _)| seg_name == dir_name)
236                            .cloned()
237                    } else {
238                        None
239                    };
240                    if let Some((_, segment_name, extract_inner)) = matched_segment {
241                        let segment_element = self
242                            .collect_segment_as_array(&file_path, &segment_name, extract_inner)
243                            .await?;
244                        if let Some(el) = segment_element {
245                            parsed.push(el);
246                        }
247                    } else {
248                        let sub_parsed = self
249                            .process_files_in_directory(file_path, base_segments.clone())
250                            .await?;
251                        parsed.extend(sub_parsed);
252                    }
253                }
254            }
255
256            Ok(parsed)
257        })
258    }
259
260    /// Collect all .xml files in a directory, parse each, and build one element with
261    /// root_key and single key segment_name whose value is array of each file's content.
262    /// When extract_inner is true, each file has root > segment_name > content; we push that content.
263    async fn collect_segment_as_array(
264        &self,
265        segment_dir: &str,
266        segment_name: &str,
267        extract_inner: bool,
268    ) -> Result<Option<XmlElement>, Box<dyn std::error::Error + Send + Sync>> {
269        let mut xml_files = Vec::new();
270        let mut read_dir = fs::read_dir(segment_dir).await?;
271        while let Some(entry) = read_dir.next_entry().await? {
272            let path = entry.path();
273            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
274            if path.is_file() && !name.starts_with('.') && self.is_parsable_file(name) {
275                xml_files.push(normalize_path_unix(&path.to_string_lossy()));
276            }
277        }
278        xml_files.sort();
279
280        let mut root_contents = Vec::new();
281        let mut first_xml: Option<(String, Option<Value>)> = None;
282        for file_path in &xml_files {
283            // parse_to_xml_object always yields a JSON object on success; treat any other
284            // shape (including parse failure) as a skip without branching explicitly.
285            let Some(parsed) = parse_to_xml_object(file_path).await else {
286                continue;
287            };
288            let obj_owned = parsed.as_object().cloned().unwrap_or_default();
289            let obj = &obj_owned;
290            let Some(root_key) = obj.keys().find(|k| *k != "?xml").cloned() else {
291                continue;
292            };
293            let root_val = obj
294                .get(&root_key)
295                .cloned()
296                .unwrap_or(Value::Object(serde_json::Map::new()));
297            let mut content = if extract_inner {
298                root_val
299                    .get(segment_name)
300                    .cloned()
301                    .unwrap_or_else(|| Value::Object(serde_json::Map::new()))
302            } else {
303                root_val
304            };
305            // Inner segment element (e.g. programProcesses) should not have xmlns in output
306            if extract_inner {
307                content = strip_xmlns_from_value(content);
308            }
309            root_contents.push(content);
310            if first_xml.is_none() {
311                first_xml = Some((root_key, obj.get("?xml").cloned()));
312            }
313        }
314        if root_contents.is_empty() {
315            return Ok(None);
316        }
317        let (root_key, decl_opt) = first_xml.unwrap();
318        let mut content = serde_json::Map::new();
319        content.insert(segment_name.to_string(), Value::Array(root_contents));
320        let mut top = serde_json::Map::new();
321        if let Some(decl) = decl_opt {
322            top.insert("?xml".to_string(), decl);
323        } else {
324            let mut d = serde_json::Map::new();
325            d.insert("@version".to_string(), Value::String("1.0".to_string()));
326            d.insert("@encoding".to_string(), Value::String("UTF-8".to_string()));
327            top.insert("?xml".to_string(), Value::Object(d));
328        }
329        top.insert(root_key, Value::Object(content));
330        Ok(Some(Value::Object(top)))
331    }
332
333    fn is_parsable_file(&self, file_name: &str) -> bool {
334        let lower = file_name.to_lowercase();
335        lower.ends_with(".xml")
336            || lower.ends_with(".json")
337            || lower.ends_with(".json5")
338            || lower.ends_with(".yaml")
339            || lower.ends_with(".yml")
340    }
341
342    async fn validate_directory(
343        &self,
344        path: &str,
345    ) -> Result<bool, Box<dyn std::error::Error + Send + Sync>> {
346        let meta = fs::metadata(path).await?;
347        if !meta.is_dir() {
348            log::error!(
349                "The provided path to reassemble is not a directory: {}",
350                path
351            );
352            return Ok(false);
353        }
354        Ok(true)
355    }
356
357    fn get_output_path(&self, dir_path: &str, extension: Option<&str>) -> String {
358        let path = Path::new(dir_path);
359        let parent = path.parent().unwrap_or(Path::new("."));
360        let base_name = path
361            .file_name()
362            .and_then(|n| n.to_str())
363            .unwrap_or("output");
364        let ext = extension.unwrap_or("xml");
365        parent
366            .join(format!("{}.{}", base_name, ext))
367            .to_string_lossy()
368            .to_string()
369    }
370}
371
372impl Default for ReassembleXmlFileHandler {
373    fn default() -> Self {
374        Self::new()
375    }
376}
377
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Build a rule with the given segment name and wrapper namespace. The wrapper root
    /// element is always `Root`; every other field is left empty.
    fn rule_for(path_segment: &str, wrap_xmlns: &str) -> crate::xml::types::MultiLevelRule {
        crate::xml::types::MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: path_segment.to_string(),
            wrap_root_element: "Root".to_string(),
            wrap_xmlns: wrap_xmlns.to_string(),
        }
    }

    /// The `Default` implementation must be callable.
    #[test]
    #[allow(clippy::default_constructed_unit_structs)]
    fn reassemble_handler_default_equals_new() {
        let _handler = ReassembleXmlFileHandler::default();
    }

    /// Strings and arrays are returned untouched.
    #[test]
    fn strip_xmlns_from_value_passes_non_object_through() {
        let text = Value::String("hello".to_string());
        let expected_text = text.clone();
        assert_eq!(strip_xmlns_from_value(text), expected_text);

        let numbers = json!([1, 2]);
        assert_eq!(strip_xmlns_from_value(numbers.clone()), numbers);
    }

    /// Only the `@xmlns` entry disappears; siblings survive.
    #[test]
    fn strip_xmlns_from_value_removes_xmlns_key() {
        let cleaned = strip_xmlns_from_value(json!({ "@xmlns": "ns", "child": 1 }));
        let fields = cleaned.as_object().unwrap();
        assert!(!fields.contains_key("@xmlns"));
        assert_eq!(fields.get("child").and_then(|v| v.as_i64()), Some(1));
    }

    /// Every supported extension is accepted, case-insensitively; others are rejected.
    #[test]
    fn is_parsable_file_recognises_supported_extensions() {
        let handler = ReassembleXmlFileHandler::new();
        for accepted in ["a.xml", "a.json", "a.json5", "a.yaml", "a.yml", "A.XML"] {
            assert!(handler.is_parsable_file(accepted));
        }
        assert!(!handler.is_parsable_file("a.txt"));
    }

    /// The output lands beside the directory and carries the requested extension.
    #[test]
    fn get_output_path_appends_extension_and_uses_parent_dir() {
        let handler = ReassembleXmlFileHandler::new();

        let explicit = handler.get_output_path("/tmp/foo", Some("xml"));
        assert!(explicit.ends_with("foo.xml"));

        let defaulted = handler.get_output_path("/tmp/bar", None);
        assert!(defaulted.ends_with("bar.xml"));

        // A bare name has an empty parent, so the result is just the file name.
        assert_eq!(handler.get_output_path("only", Some("json")), "only.json");
    }

    /// Pointing the segment reassembler at a regular file is a silent no-op.
    #[tokio::test]
    async fn reassemble_multi_level_segment_noop_when_not_dir() {
        let handler = ReassembleXmlFileHandler::new();
        let workspace = tempfile::tempdir().unwrap();
        let plain_file = workspace.path().join("not_a_dir.txt");
        tokio::fs::write(&plain_file, "hi").await.unwrap();

        let rule = rule_for("", "");
        handler
            .reassemble_multi_level_segment(&plain_file, &rule)
            .await
            .unwrap();
    }

    /// Files sitting directly in the segment directory are not process directories and
    /// must be skipped.
    #[tokio::test]
    async fn reassemble_multi_level_segment_skips_files_in_segment_root() {
        let handler = ReassembleXmlFileHandler::new();
        let workspace = tempfile::tempdir().unwrap();
        let segment_dir = workspace.path().join("segment");
        tokio::fs::create_dir(&segment_dir).await.unwrap();
        tokio::fs::write(segment_dir.join("stray.txt"), "x")
            .await
            .unwrap();

        let rule = rule_for("segment", "http://example.com");
        handler
            .reassemble_multi_level_segment(&segment_dir, &rule)
            .await
            .unwrap();
    }

    /// An empty directory produces no element.
    #[tokio::test]
    async fn collect_segment_as_array_returns_none_for_empty_dir() {
        let handler = ReassembleXmlFileHandler::new();
        let workspace = tempfile::tempdir().unwrap();
        let dir = workspace.path().to_str().unwrap();

        let collected = handler
            .collect_segment_as_array(dir, "seg", true)
            .await
            .unwrap();
        assert!(collected.is_none());
    }

    /// Unparseable, empty and hidden files contribute nothing.
    #[tokio::test]
    async fn collect_segment_as_array_skips_unparseable_and_empty_roots() {
        let handler = ReassembleXmlFileHandler::new();
        let workspace = tempfile::tempdir().unwrap();
        let root = workspace.path();

        // Malformed XML.
        tokio::fs::write(root.join("bad.xml"), "<<").await.unwrap();
        // Empty file: there is no root element to collect.
        tokio::fs::write(root.join("only-decl.xml"), "")
            .await
            .unwrap();
        // Hidden files are never considered, even when well-formed.
        tokio::fs::write(root.join(".hidden.xml"), "<r/>")
            .await
            .unwrap();

        let collected = handler
            .collect_segment_as_array(root.to_str().unwrap(), "seg", false)
            .await
            .unwrap();
        assert!(collected.is_none());
    }

    /// Without `extract_inner`, the file's root is wrapped as `Root > seg > [ ... ]`.
    #[tokio::test]
    async fn collect_segment_as_array_without_extract_inner_wraps_root() {
        let handler = ReassembleXmlFileHandler::new();
        let workspace = tempfile::tempdir().unwrap();
        tokio::fs::write(
            workspace.path().join("a.xml"),
            r#"<Root><child>1</child></Root>"#,
        )
        .await
        .unwrap();

        let element = handler
            .collect_segment_as_array(workspace.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap()
            .unwrap();
        let top = element.as_object().unwrap();
        assert!(top.contains_key("?xml"));
        let wrapper = top.get("Root").and_then(|r| r.as_object()).unwrap();
        assert!(wrapper.get("seg").and_then(|v| v.as_array()).is_some());
    }
}
525}