Skip to main content

xml_disassembler/handlers/
reassemble.rs

1//! Reassemble XML from disassembled directory.
2
3use crate::builders::{build_xml_string, merge_xml_elements, reorder_root_keys};
4use crate::multi_level::{ensure_segment_files_structure, load_multi_level_config};
5use crate::parsers::parse_to_xml_object;
6use crate::types::XmlElement;
7use crate::utils::normalize_path_unix;
8use serde_json::Value;
9use std::future::Future;
10use std::path::Path;
11use std::pin::Pin;
12use tokio::fs;
13
14/// Read a `.key_order.json` file (if present) and parse it as a list of root key names.
15async fn read_key_order(path: &Path) -> Option<Vec<String>> {
16    let bytes = fs::read(path).await.ok()?;
17    serde_json::from_slice::<Vec<String>>(&bytes).ok()
18}
19
20/// Remove @xmlns from an object so the reassembled segment wrapper (e.g. programProcesses) has no xmlns.
21fn strip_xmlns_from_value(v: Value) -> Value {
22    match v {
23        Value::Object(obj) => {
24            Value::Object(obj.into_iter().filter(|(k, _)| k != "@xmlns").collect())
25        }
26        other => other,
27    }
28}
29
/// Boxed, pinned, `Send` future returned by
/// `ReassembleXmlFileHandler::process_files_in_directory`.
///
/// That method calls itself for sub-directories, so its future is boxed and
/// named here instead of being an anonymous `async fn` return type. The
/// future resolves to the parsed elements collected from a directory, or to
/// a boxed error.
type ProcessDirFuture<'a> = Pin<
    Box<
        dyn Future<Output = Result<Vec<XmlElement>, Box<dyn std::error::Error + Send + Sync>>>
            + Send
            + 'a,
    >,
>;
37
/// Stateless handler that reassembles an XML file from a disassembled directory.
///
/// The type has no fields, so it is free to copy and can be printed for
/// diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct ReassembleXmlFileHandler;
39
40impl ReassembleXmlFileHandler {
41    pub fn new() -> Self {
42        Self
43    }
44
45    pub async fn reassemble(
46        &self,
47        file_path: &str,
48        file_extension: Option<&str>,
49        post_purge: bool,
50    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
51        let file_path = normalize_path_unix(file_path);
52        if !self.validate_directory(&file_path).await? {
53            return Ok(());
54        }
55
56        let path = Path::new(&file_path);
57        let config = load_multi_level_config(path).await;
58        if let Some(ref config) = config {
59            for rule in &config.rules {
60                let segment_path = path.join(&rule.path_segment);
61                self.reassemble_multi_level_segment(&segment_path, rule)
62                    .await?;
63            }
64        }
65
66        let base_segment = config.as_ref().and_then(|c| {
67            c.rules.first().map(|r| {
68                (
69                    file_path.clone(),
70                    r.path_segment.clone(),
71                    true, // extract_inner: segment files have document_root > segment > content
72                )
73            })
74        });
75        // When multi-level reassembly is done, purge the entire disassembled directory
76        let post_purge_final = post_purge || config.is_some();
77        self.reassemble_plain(&file_path, file_extension, post_purge_final, base_segment)
78            .await
79    }
80
81    /// Reassemble a single multi-level segment directory: walk each process dir, reassemble
82    /// nested segments, reassemble the process, then ensure the wrapper structure.
83    async fn reassemble_multi_level_segment(
84        &self,
85        segment_path: &Path,
86        rule: &crate::types::MultiLevelRule,
87    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
88        if !segment_path.is_dir() {
89            return Ok(());
90        }
91        let mut entries = Vec::new();
92        let mut read_dir = fs::read_dir(segment_path).await?;
93        while let Some(entry) = read_dir.next_entry().await? {
94            entries.push(entry);
95        }
96        entries.sort_by_key(|e| e.file_name());
97        for entry in entries {
98            let process_path = entry.path();
99            if !process_path.is_dir() {
100                continue;
101            }
102            let process_path_str = normalize_path_unix(&process_path.to_string_lossy());
103            let mut sub_entries = Vec::new();
104            let mut sub_read = fs::read_dir(&process_path).await?;
105            while let Some(e) = sub_read.next_entry().await? {
106                sub_entries.push(e);
107            }
108            sub_entries.sort_by_key(|e| e.file_name());
109            for sub_entry in sub_entries {
110                let sub_path = sub_entry.path();
111                if sub_path.is_dir() {
112                    let sub_path_str = normalize_path_unix(&sub_path.to_string_lossy());
113                    self.reassemble_plain(&sub_path_str, Some("xml"), true, None)
114                        .await?;
115                }
116            }
117            self.reassemble_plain(&process_path_str, Some("xml"), true, None)
118                .await?;
119        }
120        ensure_segment_files_structure(
121            segment_path,
122            &rule.wrap_root_element,
123            &rule.path_segment,
124            &rule.wrap_xmlns,
125        )
126        .await?;
127        Ok(())
128    }
129
130    /// Merge and write reassembled XML (no multi-level pre-step). Used internally.
131    /// When base_segment is Some((base_path, segment_name, extract_inner)), processing that base path
132    /// treats the segment subdir as one key whose value is an array; when extract_inner is true,
133    /// each file's root has document_root > segment > content and we use content (not whole root).
134    async fn reassemble_plain(
135        &self,
136        file_path: &str,
137        file_extension: Option<&str>,
138        post_purge: bool,
139        base_segment: Option<(String, String, bool)>,
140    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
141        let file_path = normalize_path_unix(file_path);
142        log::debug!("Parsing directory to reassemble: {}", file_path);
143        let parsed_objects = self
144            .process_files_in_directory(file_path.to_string(), base_segment)
145            .await?;
146
147        if parsed_objects.is_empty() {
148            log::error!(
149                "No files under {} were parsed successfully. A reassembled XML file was not created.",
150                file_path
151            );
152            return Ok(());
153        }
154
155        // merge_xml_elements only returns None when every parsed element is empty or
156        // declaration-only (no usable root). Treat that the same as "nothing parsed"
157        // rather than emitting an `<root></root>` stub.
158        let Some(mut merged) = merge_xml_elements(&parsed_objects) else {
159            log::error!(
160                "No usable root element found while merging files under {}. A reassembled XML file was not created.",
161                file_path
162            );
163            return Ok(());
164        };
165
166        // Apply stored key order so reassembled XML matches original document order.
167        let key_order_path = Path::new(&file_path).join(".key_order.json");
168        if let Some(reordered) = read_key_order(&key_order_path)
169            .await
170            .and_then(|order| reorder_root_keys(&merged, &order))
171        {
172            merged = reordered;
173        }
174
175        let final_xml = build_xml_string(&merged);
176        let output_path = self.get_output_path(&file_path, file_extension);
177
178        fs::write(&output_path, final_xml).await?;
179
180        if post_purge {
181            fs::remove_dir_all(file_path).await.ok();
182        }
183
184        Ok(())
185    }
186
187    fn process_files_in_directory<'a>(
188        &'a self,
189        dir_path: String,
190        base_segment: Option<(String, String, bool)>,
191    ) -> ProcessDirFuture<'a> {
192        Box::pin(async move {
193            let mut parsed = Vec::new();
194            let mut entries = Vec::new();
195            let mut read_dir = fs::read_dir(&dir_path).await?;
196            while let Some(entry) = read_dir.next_entry().await? {
197                entries.push(entry);
198            }
199            // Sort by full filename for deterministic cross-platform ordering
200            entries.sort_by(|a, b| {
201                let a_name = a.file_name().to_string_lossy().to_string();
202                let b_name = b.file_name().to_string_lossy().to_string();
203                a_name.cmp(&b_name)
204            });
205
206            let is_base = base_segment
207                .as_ref()
208                .map(|(base, _, _)| dir_path == *base)
209                .unwrap_or(false);
210            let segment_name = base_segment.as_ref().map(|(_, name, _)| name.as_str());
211            let extract_inner = base_segment.as_ref().map(|(_, _, e)| *e).unwrap_or(false);
212
213            for entry in entries {
214                let path = entry.path();
215                let file_path = normalize_path_unix(&path.to_string_lossy()).to_string();
216
217                if path.is_file() {
218                    let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
219                    if !name.starts_with('.') && self.is_parsable_file(name) {
220                        if let Some(parsed_obj) = parse_to_xml_object(&file_path).await {
221                            parsed.push(parsed_obj);
222                        }
223                    }
224                } else {
225                    // Anything not a regular file is treated as a directory; symlinks and
226                    // other exotic entries simply recurse via read_dir below.
227                    let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
228                    if is_base && segment_name == Some(dir_name) {
229                        let segment_element = self
230                            .collect_segment_as_array(
231                                &file_path,
232                                segment_name.unwrap(),
233                                extract_inner,
234                            )
235                            .await?;
236                        if let Some(el) = segment_element {
237                            parsed.push(el);
238                        }
239                    } else {
240                        let sub_parsed = self
241                            .process_files_in_directory(file_path, base_segment.clone())
242                            .await?;
243                        parsed.extend(sub_parsed);
244                    }
245                }
246            }
247
248            Ok(parsed)
249        })
250    }
251
252    /// Collect all .xml files in a directory, parse each, and build one element with
253    /// root_key and single key segment_name whose value is array of each file's content.
254    /// When extract_inner is true, each file has root > segment_name > content; we push that content.
255    async fn collect_segment_as_array(
256        &self,
257        segment_dir: &str,
258        segment_name: &str,
259        extract_inner: bool,
260    ) -> Result<Option<XmlElement>, Box<dyn std::error::Error + Send + Sync>> {
261        let mut xml_files = Vec::new();
262        let mut read_dir = fs::read_dir(segment_dir).await?;
263        while let Some(entry) = read_dir.next_entry().await? {
264            let path = entry.path();
265            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
266            if path.is_file() && !name.starts_with('.') && self.is_parsable_file(name) {
267                xml_files.push(normalize_path_unix(&path.to_string_lossy()));
268            }
269        }
270        xml_files.sort();
271
272        let mut root_contents = Vec::new();
273        let mut first_xml: Option<(String, Option<Value>)> = None;
274        for file_path in &xml_files {
275            // parse_to_xml_object always yields a JSON object on success; treat any other
276            // shape (including parse failure) as a skip without branching explicitly.
277            let Some(parsed) = parse_to_xml_object(file_path).await else {
278                continue;
279            };
280            let obj_owned = parsed.as_object().cloned().unwrap_or_default();
281            let obj = &obj_owned;
282            let Some(root_key) = obj.keys().find(|k| *k != "?xml").cloned() else {
283                continue;
284            };
285            let root_val = obj
286                .get(&root_key)
287                .cloned()
288                .unwrap_or(Value::Object(serde_json::Map::new()));
289            let mut content = if extract_inner {
290                root_val
291                    .get(segment_name)
292                    .cloned()
293                    .unwrap_or_else(|| Value::Object(serde_json::Map::new()))
294            } else {
295                root_val
296            };
297            // Inner segment element (e.g. programProcesses) should not have xmlns in output
298            if extract_inner {
299                content = strip_xmlns_from_value(content);
300            }
301            root_contents.push(content);
302            if first_xml.is_none() {
303                first_xml = Some((root_key, obj.get("?xml").cloned()));
304            }
305        }
306        if root_contents.is_empty() {
307            return Ok(None);
308        }
309        let (root_key, decl_opt) = first_xml.unwrap();
310        let mut content = serde_json::Map::new();
311        content.insert(segment_name.to_string(), Value::Array(root_contents));
312        let mut top = serde_json::Map::new();
313        if let Some(decl) = decl_opt {
314            top.insert("?xml".to_string(), decl);
315        } else {
316            let mut d = serde_json::Map::new();
317            d.insert("@version".to_string(), Value::String("1.0".to_string()));
318            d.insert("@encoding".to_string(), Value::String("UTF-8".to_string()));
319            top.insert("?xml".to_string(), Value::Object(d));
320        }
321        top.insert(root_key, Value::Object(content));
322        Ok(Some(Value::Object(top)))
323    }
324
325    fn is_parsable_file(&self, file_name: &str) -> bool {
326        let lower = file_name.to_lowercase();
327        lower.ends_with(".xml")
328            || lower.ends_with(".json")
329            || lower.ends_with(".json5")
330            || lower.ends_with(".yaml")
331            || lower.ends_with(".yml")
332    }
333
334    async fn validate_directory(
335        &self,
336        path: &str,
337    ) -> Result<bool, Box<dyn std::error::Error + Send + Sync>> {
338        let meta = fs::metadata(path).await?;
339        if !meta.is_dir() {
340            log::error!(
341                "The provided path to reassemble is not a directory: {}",
342                path
343            );
344            return Ok(false);
345        }
346        Ok(true)
347    }
348
349    fn get_output_path(&self, dir_path: &str, extension: Option<&str>) -> String {
350        let path = Path::new(dir_path);
351        let parent = path.parent().unwrap_or(Path::new("."));
352        let base_name = path
353            .file_name()
354            .and_then(|n| n.to_str())
355            .unwrap_or("output");
356        let ext = extension.unwrap_or("xml");
357        parent
358            .join(format!("{}.{}", base_name, ext))
359            .to_string_lossy()
360            .to_string()
361    }
362}
363
364impl Default for ReassembleXmlFileHandler {
365    fn default() -> Self {
366        Self::new()
367    }
368}
369
#[cfg(test)]
mod tests {
    //! Unit tests for the private helpers and handler methods in this file.

    use super::*;
    use serde_json::json;

    /// Smoke test: the `Default` impl can be called. Nothing is asserted.
    #[test]
    #[allow(clippy::default_constructed_unit_structs)]
    fn reassemble_handler_default_equals_new() {
        let _ = ReassembleXmlFileHandler::default();
    }

    /// Strings and arrays are returned unchanged.
    #[test]
    fn strip_xmlns_from_value_passes_non_object_through() {
        let s = Value::String("hello".to_string());
        assert_eq!(
            strip_xmlns_from_value(s),
            Value::String("hello".to_string())
        );
        let arr = json!([1, 2]);
        assert_eq!(strip_xmlns_from_value(arr.clone()), arr);
    }

    /// The `@xmlns` key is removed and sibling keys are kept.
    #[test]
    fn strip_xmlns_from_value_removes_xmlns_key() {
        let obj = json!({ "@xmlns": "ns", "child": 1 });
        let stripped = strip_xmlns_from_value(obj);
        let map = stripped.as_object().unwrap();
        assert!(map.get("@xmlns").is_none());
        assert_eq!(map.get("child").and_then(|v| v.as_i64()), Some(1));
    }

    /// Every supported extension is accepted, case-insensitively; others are rejected.
    #[test]
    fn is_parsable_file_recognises_supported_extensions() {
        let h = ReassembleXmlFileHandler::new();
        assert!(h.is_parsable_file("a.xml"));
        assert!(h.is_parsable_file("a.json"));
        assert!(h.is_parsable_file("a.json5"));
        assert!(h.is_parsable_file("a.yaml"));
        assert!(h.is_parsable_file("a.yml"));
        assert!(h.is_parsable_file("A.XML"));
        assert!(!h.is_parsable_file("a.txt"));
    }

    /// The output is named after the directory, with the given or default extension.
    #[test]
    fn get_output_path_appends_extension_and_uses_parent_dir() {
        let h = ReassembleXmlFileHandler::new();
        let out = h.get_output_path("/tmp/foo", Some("xml"));
        assert!(out.ends_with("foo.xml"));
        let out_default = h.get_output_path("/tmp/bar", None);
        assert!(out_default.ends_with("bar.xml"));
        // Bare directory name without leading path components: the output
        // carries no directory prefix.
        assert_eq!(h.get_output_path("only", Some("json")), "only.json");
    }

    /// A segment path that is a regular file is accepted and returns `Ok`.
    #[tokio::test]
    async fn reassemble_multi_level_segment_noop_when_not_dir() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let file = tmp.path().join("not_a_dir.txt");
        tokio::fs::write(&file, "hi").await.unwrap();
        let rule = crate::types::MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: String::new(),
            wrap_root_element: "Root".to_string(),
            wrap_xmlns: String::new(),
        };
        h.reassemble_multi_level_segment(&file, &rule)
            .await
            .unwrap();
    }

    /// A segment directory holding only a stray file is processed without error.
    #[tokio::test]
    async fn reassemble_multi_level_segment_skips_files_in_segment_root() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let segment = tmp.path().join("segment");
        tokio::fs::create_dir(&segment).await.unwrap();
        // A bare file inside the segment dir should be skipped (not a subdir).
        tokio::fs::write(segment.join("stray.txt"), "x")
            .await
            .unwrap();
        let rule = crate::types::MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: "segment".to_string(),
            wrap_root_element: "Root".to_string(),
            wrap_xmlns: "http://example.com".to_string(),
        };
        h.reassemble_multi_level_segment(&segment, &rule)
            .await
            .unwrap();
    }

    /// An empty directory yields `None`.
    #[tokio::test]
    async fn collect_segment_as_array_returns_none_for_empty_dir() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", true)
            .await
            .unwrap();
        assert!(out.is_none());
    }

    /// Files that contribute no content leave the result as `None`.
    #[tokio::test]
    async fn collect_segment_as_array_skips_unparseable_and_empty_roots() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        // Unparseable XML
        tokio::fs::write(tmp.path().join("bad.xml"), "<<")
            .await
            .unwrap();
        // Empty file: expected to contribute no root element
        tokio::fs::write(tmp.path().join("only-decl.xml"), "")
            .await
            .unwrap();
        // Hidden file is skipped
        tokio::fs::write(tmp.path().join(".hidden.xml"), "<r/>")
            .await
            .unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap();
        assert!(out.is_none());
    }

    /// Without `extract_inner`, the result has a `?xml` entry and the file's
    /// root key wrapping a `seg` array.
    #[tokio::test]
    async fn collect_segment_as_array_without_extract_inner_wraps_root() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        tokio::fs::write(tmp.path().join("a.xml"), r#"<Root><child>1</child></Root>"#)
            .await
            .unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap()
            .unwrap();
        let obj = out.as_object().unwrap();
        assert!(obj.contains_key("?xml"));
        let root = obj.get("Root").and_then(|r| r.as_object()).unwrap();
        assert!(root.get("seg").and_then(|v| v.as_array()).is_some());
    }
}
517}