// config_disassembler/xml/handlers/reassemble.rs
//! Reassemble XML from disassembled directory.
use std::collections::HashSet;
use std::ffi::OsString;
use std::future::Future;
use std::path::{Path, PathBuf};
use std::pin::Pin;

use serde_json::Value;
use tokio::fs;

use crate::xml::builders::{build_xml_string, merge_xml_elements, reorder_root_keys};
use crate::xml::multi_level::{ensure_segment_files_structure, load_multi_level_config};
use crate::xml::parsers::parse_to_xml_object;
use crate::xml::types::{MultiLevelRule, XmlElement};
use crate::xml::utils::normalize_path_unix;

16/// Read a `.key_order.json` file (if present) and parse it as a list of root key names.
17async fn read_key_order(path: &Path) -> Option<Vec<String>> {
18    let bytes = fs::read(path).await.ok()?;
19    serde_json::from_slice::<Vec<String>>(&bytes).ok()
20}
21
22/// Remove @xmlns from an object so the reassembled segment wrapper (e.g. programProcesses) has no xmlns.
23fn strip_xmlns_from_value(v: Value) -> Value {
24    match v {
25        Value::Object(obj) => {
26            Value::Object(obj.into_iter().filter(|(k, _)| k != "@xmlns").collect())
27        }
28        other => other,
29    }
30}
31
/// Boxed future returned by the recursive directory walker
/// (`process_files_in_directory`); boxing is required because an async fn
/// cannot recurse directly.
type ProcessDirFuture<'a> = Pin<
    Box<
        dyn Future<Output = Result<Vec<XmlElement>, Box<dyn std::error::Error + Send + Sync>>>
            + Send
            + 'a,
    >,
>;

/// Boxed future for the recursive multi-level segment reassembler; same
/// boxing-for-recursion rationale as [`ProcessDirFuture`].
type SegmentFuture<'a> =
    Pin<Box<dyn Future<Output = Result<(), Box<dyn std::error::Error + Send + Sync>>> + Send + 'a>>;

/// Handler that reassembles a single XML file from a disassembled directory
/// tree. Stateless: all configuration is read from disk at reassemble time.
pub struct ReassembleXmlFileHandler;

impl ReassembleXmlFileHandler {
    /// Construct a new, stateless handler instance.
    pub fn new() -> Self {
        Self
    }

50    pub async fn reassemble(
51        &self,
52        file_path: &str,
53        file_extension: Option<&str>,
54        post_purge: bool,
55    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
56        let file_path = normalize_path_unix(file_path);
57        if !self.validate_directory(&file_path).await? {
58            return Ok(());
59        }
60
61        let path = Path::new(&file_path);
62        let config = load_multi_level_config(path).await;
63        if let Some(ref config) = config {
64            // Process each rule whose path_segment exists as a directory at the
65            // disassembly root. Inner-only rules (whose segment lives nested under another
66            // rule's item dir) are handled dynamically when the parent rule walks its
67            // items; we hand them in as `nested_rules` candidates here.
68            for (i, rule) in config.rules.iter().enumerate() {
69                let segment_path = path.join(&rule.path_segment);
70                if !segment_path.is_dir() {
71                    continue;
72                }
73                let nested: Vec<MultiLevelRule> = config
74                    .rules
75                    .iter()
76                    .enumerate()
77                    .filter(|(j, _)| *j != i)
78                    .map(|(_, r)| r.clone())
79                    .collect();
80                self.reassemble_multi_level_segment(&segment_path, rule, &nested)
81                    .await?;
82            }
83        }
84
85        // Build one base-segment entry per multi-level rule so the recursive walker can
86        // recognize each rule's path_segment under the disassembly root.
87        let base_segments: Vec<(String, String, bool)> = config
88            .as_ref()
89            .map(|c| {
90                c.rules
91                    .iter()
92                    .map(|r| (file_path.clone(), r.path_segment.clone(), true))
93                    .collect()
94            })
95            .unwrap_or_default();
96        // When multi-level reassembly is done, purge the entire disassembled directory
97        let post_purge_final = post_purge || config.is_some();
98        self.reassemble_plain(&file_path, file_extension, post_purge_final, &base_segments)
99            .await
100    }
101
102    /// Reassemble a single multi-level segment directory.
103    ///
104    /// For each item directory under `segment_path` (e.g. each `<dialog>/` under
105    /// `botDialogs/`):
106    ///
107    /// 1. **Phase 1 — nested rules first.** For every immediate sub-directory whose name
108    ///    matches a `nested_rules` candidate's `path_segment`, recursively reassemble
109    ///    that sub-directory as its own segment. This wraps each per-step file in
110    ///    `<wrap_root_element><inner_segment>...</inner_segment></wrap_root_element>` *before*
111    ///    the outer-level merge sees it, so multiple inner items survive as siblings
112    ///    rather than collapsing into a single bag of leaves.
113    ///
114    /// 2. **Phase 2 — flat sub-directories.** Any remaining sub-directory (anything not
115    ///    consumed by phase 1) is collapsed into a per-item `.xml` at the parent level
116    ///    via [`Self::reassemble_plain`], the original behaviour for things like
117    ///    decompose-rule outputs.
118    ///
119    /// 3. **Phase 3 — merge item.** Everything in the item directory (the `.xml` files
120    ///    written by phases 1 and 2 plus any leaf `.xml` already there) is merged into
121    ///    a single `.xml` at the parent level.
122    ///
123    /// Finally, [`ensure_segment_files_structure`] wraps every `.xml` in `segment_path`
124    /// in `<wrap_root_element><path_segment>...</path_segment></wrap_root_element>` so
125    /// the parent reassembly sees correctly-wrapped siblings.
126    fn reassemble_multi_level_segment<'a>(
127        &'a self,
128        segment_path: &'a Path,
129        rule: &'a MultiLevelRule,
130        nested_rules: &'a [MultiLevelRule],
131    ) -> SegmentFuture<'a> {
132        let segment_path = segment_path.to_path_buf();
133        let rule = rule.clone();
134        let nested_rules = nested_rules.to_vec();
135        Box::pin(async move {
136            self.reassemble_multi_level_segment_inner(&segment_path, &rule, &nested_rules)
137                .await
138        })
139    }
140
    /// Inner implementation of [`Self::reassemble_multi_level_segment`]; see that
    /// method's documentation for the full three-phase algorithm.
    async fn reassemble_multi_level_segment_inner(
        &self,
        segment_path: &Path,
        rule: &MultiLevelRule,
        nested_rules: &[MultiLevelRule],
    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        // A non-directory (e.g. a stray file named like a segment) is a silent no-op.
        if !segment_path.is_dir() {
            return Ok(());
        }
        let mut entries = Vec::new();
        let mut read_dir = fs::read_dir(segment_path).await?;
        while let Some(entry) = read_dir.next_entry().await? {
            entries.push(entry);
        }
        // Deterministic processing order across platforms/filesystems.
        entries.sort_by_key(|e| e.file_name());
        for entry in entries {
            let process_path = entry.path();
            // Only item directories are processed; bare files at the segment root are skipped.
            if !process_path.is_dir() {
                continue;
            }
            let process_path_str = normalize_path_unix(&process_path.to_string_lossy());
            let mut sub_entries = Vec::new();
            let mut sub_read = fs::read_dir(&process_path).await?;
            while let Some(e) = sub_read.next_entry().await? {
                sub_entries.push(e);
            }
            sub_entries.sort_by_key(|e| e.file_name());

            // Phase 1: drain any sub-directory that matches a nested rule's
            // `path_segment` so it is re-wrapped before the outer merge runs.
            let mut handled: HashSet<OsString> = HashSet::new();
            for sub_entry in &sub_entries {
                let sub_path: PathBuf = sub_entry.path();
                if !sub_path.is_dir() {
                    continue;
                }
                let sub_name = sub_path.file_name().and_then(|n| n.to_str()).unwrap_or("");
                let Some(nested_rule) = nested_rules.iter().find(|r| r.path_segment == sub_name)
                else {
                    continue;
                };
                // Pass everything *except* the rule we just matched as deeper candidates.
                // Sibling rules remain candidates further down the tree without re-entering
                // the same rule on a sub-dir that happens to share its name.
                let deeper: Vec<MultiLevelRule> = nested_rules
                    .iter()
                    .filter(|r| r.path_segment != nested_rule.path_segment)
                    .cloned()
                    .collect();
                self.reassemble_multi_level_segment(&sub_path, nested_rule, &deeper)
                    .await?;
                handled.insert(sub_entry.file_name());
            }

            // Phase 2: collapse remaining sub-directories into per-item .xml files at
            // the parent level (preserves existing behaviour for non-nested-rule subdirs).
            for sub_entry in &sub_entries {
                let sub_path = sub_entry.path();
                if !sub_path.is_dir() {
                    continue;
                }
                if handled.contains(&sub_entry.file_name()) {
                    continue;
                }
                let sub_path_str = normalize_path_unix(&sub_path.to_string_lossy());
                self.reassemble_plain(&sub_path_str, Some("xml"), true, &[])
                    .await?;
            }

            // Phase 3: merge everything in the item dir into a single .xml at the parent.
            self.reassemble_plain(&process_path_str, Some("xml"), true, &[])
                .await?;
        }
        // Wrap every .xml now sitting in segment_path so the parent reassembly
        // sees correctly-wrapped siblings.
        ensure_segment_files_structure(
            segment_path,
            &rule.wrap_root_element,
            &rule.path_segment,
            &rule.wrap_xmlns,
        )
        .await?;
        Ok(())
    }

224    /// Merge and write reassembled XML (no multi-level pre-step). Used internally.
225    /// `base_segments` carries one tuple `(base_path, segment_name, extract_inner)` per
226    /// multi-level rule. When the recursive walker reaches `base_path` and finds a subdir
227    /// whose name matches one of the segment_names, that subdir's XML files are folded
228    /// into a single array under the segment_name key. When extract_inner is true, each
229    /// file's structure is `document_root > segment_name > content` and only the content
230    /// is collected; otherwise the whole root is kept.
231    async fn reassemble_plain(
232        &self,
233        file_path: &str,
234        file_extension: Option<&str>,
235        post_purge: bool,
236        base_segments: &[(String, String, bool)],
237    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
238        let file_path = normalize_path_unix(file_path);
239        log::debug!("Parsing directory to reassemble: {}", file_path);
240        let parsed_objects = self
241            .process_files_in_directory(file_path.to_string(), base_segments.to_vec())
242            .await?;
243
244        if parsed_objects.is_empty() {
245            log::error!(
246                "No files under {} were parsed successfully. A reassembled XML file was not created.",
247                file_path
248            );
249            return Ok(());
250        }
251
252        // merge_xml_elements only returns None when every parsed element is empty or
253        // declaration-only (no usable root). Treat that the same as "nothing parsed"
254        // rather than emitting an `<root></root>` stub.
255        let Some(mut merged) = merge_xml_elements(&parsed_objects) else {
256            log::error!(
257                "No usable root element found while merging files under {}. A reassembled XML file was not created.",
258                file_path
259            );
260            return Ok(());
261        };
262
263        // Apply stored key order so reassembled XML matches original document order.
264        let key_order_path = Path::new(&file_path).join(".key_order.json");
265        if let Some(reordered) = read_key_order(&key_order_path)
266            .await
267            .and_then(|order| reorder_root_keys(&merged, &order))
268        {
269            merged = reordered;
270        }
271
272        let final_xml = build_xml_string(&merged);
273        let output_path = self.get_output_path(&file_path, file_extension);
274
275        fs::write(&output_path, final_xml).await?;
276
277        if post_purge {
278            fs::remove_dir_all(file_path).await.ok();
279        }
280
281        Ok(())
282    }
283
284    fn process_files_in_directory<'a>(
285        &'a self,
286        dir_path: String,
287        base_segments: Vec<(String, String, bool)>,
288    ) -> ProcessDirFuture<'a> {
289        Box::pin(async move {
290            let mut parsed = Vec::new();
291            let mut entries = Vec::new();
292            let mut read_dir = fs::read_dir(&dir_path).await?;
293            while let Some(entry) = read_dir.next_entry().await? {
294                entries.push(entry);
295            }
296            // Sort by full filename for deterministic cross-platform ordering
297            entries.sort_by(|a, b| {
298                let a_name = a.file_name().to_string_lossy().to_string();
299                let b_name = b.file_name().to_string_lossy().to_string();
300                a_name.cmp(&b_name)
301            });
302
303            // We are at the disassembly root for a given rule when our dir_path matches
304            // the base_path stored on that rule. Each rule shares the same base_path in
305            // the current implementation, but tracking them per-entry keeps the door open
306            // for future per-rule base_paths without another signature change.
307            let is_base = base_segments.iter().any(|(base, _, _)| dir_path == *base);
308
309            for entry in entries {
310                let path = entry.path();
311                let file_path = normalize_path_unix(&path.to_string_lossy()).to_string();
312
313                if path.is_file() {
314                    let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
315                    if !name.starts_with('.') && self.is_parsable_file(name) {
316                        if let Some(parsed_obj) = parse_to_xml_object(&file_path).await {
317                            parsed.push(parsed_obj);
318                        }
319                    }
320                } else {
321                    // Anything not a regular file is treated as a directory; symlinks and
322                    // other exotic entries simply recurse via read_dir below.
323                    let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
324                    let matched_segment = if is_base {
325                        base_segments
326                            .iter()
327                            .find(|(_, seg_name, _)| seg_name == dir_name)
328                            .cloned()
329                    } else {
330                        None
331                    };
332                    if let Some((_, segment_name, extract_inner)) = matched_segment {
333                        let segment_element = self
334                            .collect_segment_as_array(&file_path, &segment_name, extract_inner)
335                            .await?;
336                        if let Some(el) = segment_element {
337                            parsed.push(el);
338                        }
339                    } else {
340                        let sub_parsed = self
341                            .process_files_in_directory(file_path, base_segments.clone())
342                            .await?;
343                        parsed.extend(sub_parsed);
344                    }
345                }
346            }
347
348            Ok(parsed)
349        })
350    }
351
352    /// Collect all .xml files in a directory, parse each, and build one element with
353    /// root_key and single key segment_name whose value is array of each file's content.
354    /// When extract_inner is true, each file has root > segment_name > content; we push that content.
355    async fn collect_segment_as_array(
356        &self,
357        segment_dir: &str,
358        segment_name: &str,
359        extract_inner: bool,
360    ) -> Result<Option<XmlElement>, Box<dyn std::error::Error + Send + Sync>> {
361        let mut xml_files = Vec::new();
362        let mut read_dir = fs::read_dir(segment_dir).await?;
363        while let Some(entry) = read_dir.next_entry().await? {
364            let path = entry.path();
365            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
366            if path.is_file() && !name.starts_with('.') && self.is_parsable_file(name) {
367                xml_files.push(normalize_path_unix(&path.to_string_lossy()));
368            }
369        }
370        xml_files.sort();
371
372        let mut root_contents = Vec::new();
373        let mut first_xml: Option<(String, Option<Value>)> = None;
374        for file_path in &xml_files {
375            // parse_to_xml_object always yields a JSON object on success; treat any other
376            // shape (including parse failure) as a skip without branching explicitly.
377            let Some(parsed) = parse_to_xml_object(file_path).await else {
378                continue;
379            };
380            let obj_owned = parsed.as_object().cloned().unwrap_or_default();
381            let obj = &obj_owned;
382            let Some(root_key) = obj.keys().find(|k| *k != "?xml").cloned() else {
383                continue;
384            };
385            let root_val = obj
386                .get(&root_key)
387                .cloned()
388                .unwrap_or(Value::Object(serde_json::Map::new()));
389            let mut content = if extract_inner {
390                root_val
391                    .get(segment_name)
392                    .cloned()
393                    .unwrap_or_else(|| Value::Object(serde_json::Map::new()))
394            } else {
395                root_val
396            };
397            // Inner segment element (e.g. programProcesses) should not have xmlns in output
398            if extract_inner {
399                content = strip_xmlns_from_value(content);
400            }
401            root_contents.push(content);
402            if first_xml.is_none() {
403                first_xml = Some((root_key, obj.get("?xml").cloned()));
404            }
405        }
406        if root_contents.is_empty() {
407            return Ok(None);
408        }
409        let (root_key, decl_opt) = first_xml.unwrap();
410        let mut content = serde_json::Map::new();
411        content.insert(segment_name.to_string(), Value::Array(root_contents));
412        let mut top = serde_json::Map::new();
413        if let Some(decl) = decl_opt {
414            top.insert("?xml".to_string(), decl);
415        } else {
416            let mut d = serde_json::Map::new();
417            d.insert("@version".to_string(), Value::String("1.0".to_string()));
418            d.insert("@encoding".to_string(), Value::String("UTF-8".to_string()));
419            top.insert("?xml".to_string(), Value::Object(d));
420        }
421        top.insert(root_key, Value::Object(content));
422        Ok(Some(Value::Object(top)))
423    }
424
425    fn is_parsable_file(&self, file_name: &str) -> bool {
426        let lower = file_name.to_lowercase();
427        lower.ends_with(".xml")
428            || lower.ends_with(".json")
429            || lower.ends_with(".json5")
430            || lower.ends_with(".yaml")
431            || lower.ends_with(".yml")
432    }
433
434    async fn validate_directory(
435        &self,
436        path: &str,
437    ) -> Result<bool, Box<dyn std::error::Error + Send + Sync>> {
438        let meta = fs::metadata(path).await?;
439        if !meta.is_dir() {
440            log::error!(
441                "The provided path to reassemble is not a directory: {}",
442                path
443            );
444            return Ok(false);
445        }
446        Ok(true)
447    }
448
449    fn get_output_path(&self, dir_path: &str, extension: Option<&str>) -> String {
450        let path = Path::new(dir_path);
451        let parent = path.parent().unwrap_or(Path::new("."));
452        let base_name = path
453            .file_name()
454            .and_then(|n| n.to_str())
455            .unwrap_or("output");
456        let ext = extension.unwrap_or("xml");
457        parent
458            .join(format!("{}.{}", base_name, ext))
459            .to_string_lossy()
460            .to_string()
461    }
462}
463
/// `Default` simply delegates to [`ReassembleXmlFileHandler::new`].
impl Default for ReassembleXmlFileHandler {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // Smoke test: Default and new() build the same unit struct.
    #[test]
    #[allow(clippy::default_constructed_unit_structs)]
    fn reassemble_handler_default_equals_new() {
        let _ = ReassembleXmlFileHandler::default();
    }

    #[test]
    fn strip_xmlns_from_value_passes_non_object_through() {
        let s = Value::String("hello".to_string());
        assert_eq!(
            strip_xmlns_from_value(s),
            Value::String("hello".to_string())
        );
        let arr = json!([1, 2]);
        assert_eq!(strip_xmlns_from_value(arr.clone()), arr);
    }

    #[test]
    fn strip_xmlns_from_value_removes_xmlns_key() {
        let obj = json!({ "@xmlns": "ns", "child": 1 });
        let stripped = strip_xmlns_from_value(obj);
        let map = stripped.as_object().unwrap();
        assert!(map.get("@xmlns").is_none());
        // Sibling keys must survive the strip untouched.
        assert_eq!(map.get("child").and_then(|v| v.as_i64()), Some(1));
    }

    #[test]
    fn is_parsable_file_recognises_supported_extensions() {
        let h = ReassembleXmlFileHandler::new();
        assert!(h.is_parsable_file("a.xml"));
        assert!(h.is_parsable_file("a.json"));
        assert!(h.is_parsable_file("a.json5"));
        assert!(h.is_parsable_file("a.yaml"));
        assert!(h.is_parsable_file("a.yml"));
        // Matching is case-insensitive.
        assert!(h.is_parsable_file("A.XML"));
        assert!(!h.is_parsable_file("a.txt"));
    }

    #[test]
    fn get_output_path_appends_extension_and_uses_parent_dir() {
        let h = ReassembleXmlFileHandler::new();
        let out = h.get_output_path("/tmp/foo", Some("xml"));
        assert!(out.ends_with("foo.xml"));
        // A None extension falls back to "xml".
        let out_default = h.get_output_path("/tmp/bar", None);
        assert!(out_default.ends_with("bar.xml"));
        // No parent - uses "." fallback
        assert_eq!(h.get_output_path("only", Some("json")), "only.json");
    }

    #[tokio::test]
    async fn reassemble_multi_level_segment_noop_when_not_dir() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let file = tmp.path().join("not_a_dir.txt");
        tokio::fs::write(&file, "hi").await.unwrap();
        let rule = crate::xml::types::MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: String::new(),
            wrap_root_element: "Root".to_string(),
            wrap_xmlns: String::new(),
        };
        // Passing a plain file instead of a directory must succeed as a no-op.
        h.reassemble_multi_level_segment(&file, &rule, &[])
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn reassemble_multi_level_segment_skips_files_in_segment_root() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let segment = tmp.path().join("segment");
        tokio::fs::create_dir(&segment).await.unwrap();
        // A bare file inside the segment dir should be skipped (not a subdir).
        tokio::fs::write(segment.join("stray.txt"), "x")
            .await
            .unwrap();
        let rule = crate::xml::types::MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: "segment".to_string(),
            wrap_root_element: "Root".to_string(),
            wrap_xmlns: "http://example.com".to_string(),
        };
        h.reassemble_multi_level_segment(&segment, &rule, &[])
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn collect_segment_as_array_returns_none_for_empty_dir() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", true)
            .await
            .unwrap();
        assert!(out.is_none());
    }

    #[tokio::test]
    async fn collect_segment_as_array_skips_unparseable_and_empty_roots() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        // Unparseable XML
        tokio::fs::write(tmp.path().join("bad.xml"), "<<")
            .await
            .unwrap();
        // Empty file — contributes no usable root either way (parse failure or
        // declaration-only result).
        tokio::fs::write(tmp.path().join("only-decl.xml"), "")
            .await
            .unwrap();
        // Hidden file is skipped
        tokio::fs::write(tmp.path().join(".hidden.xml"), "<r/>")
            .await
            .unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap();
        assert!(out.is_none());
    }

    #[tokio::test]
    async fn collect_segment_as_array_without_extract_inner_wraps_root() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        tokio::fs::write(tmp.path().join("a.xml"), r#"<Root><child>1</child></Root>"#)
            .await
            .unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap()
            .unwrap();
        let obj = out.as_object().unwrap();
        // A declaration is synthesized when the source file had none.
        assert!(obj.contains_key("?xml"));
        let root = obj.get("Root").and_then(|r| r.as_object()).unwrap();
        assert!(root.get("seg").and_then(|v| v.as_array()).is_some());
    }
}