Skip to main content

xml_disassembler/handlers/
reassemble.rs

//! Reassemble a single XML file from a disassembled directory.
2
3use crate::builders::{build_xml_string, merge_xml_elements, reorder_root_keys};
4use crate::multi_level::{ensure_segment_files_structure, load_multi_level_config};
5use crate::parsers::parse_to_xml_object;
6use crate::types::XmlElement;
7use serde_json::{Map, Value};
8use std::future::Future;
9use std::path::Path;
10use std::pin::Pin;
11use tokio::fs;
12
13/// Remove @xmlns from an object so the reassembled segment wrapper (e.g. programProcesses) has no xmlns.
14fn strip_xmlns_from_value(v: Value) -> Value {
15    let obj = match v.as_object() {
16        Some(o) => o,
17        None => return v,
18    };
19    let mut out = Map::new();
20    for (k, val) in obj {
21        if k != "@xmlns" {
22            out.insert(k.clone(), val.clone());
23        }
24    }
25    Value::Object(out)
26}
27
28type ProcessDirFuture<'a> = Pin<
29    Box<
30        dyn Future<Output = Result<Vec<XmlElement>, Box<dyn std::error::Error + Send + Sync>>>
31            + Send
32            + 'a,
33    >,
34>;
35
/// Stateless handler that rebuilds a single file from a disassembled directory tree.
#[derive(Debug, Clone)]
pub struct ReassembleXmlFileHandler;
37
38impl ReassembleXmlFileHandler {
39    pub fn new() -> Self {
40        Self
41    }
42
43    pub async fn reassemble(
44        &self,
45        file_path: &str,
46        file_extension: Option<&str>,
47        post_purge: bool,
48    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
49        if !self.validate_directory(file_path).await? {
50            return Ok(());
51        }
52
53        let path = Path::new(file_path);
54        let config = load_multi_level_config(path).await;
55        if let Some(ref config) = config {
56            for rule in &config.rules {
57                if rule.path_segment.is_empty() {
58                    continue;
59                }
60                let segment_path = path.join(&rule.path_segment);
61                if !segment_path.is_dir() {
62                    continue;
63                }
64                let mut entries = Vec::new();
65                let mut read_dir = fs::read_dir(&segment_path).await?;
66                while let Some(entry) = read_dir.next_entry().await? {
67                    entries.push(entry);
68                }
69                // Sort for deterministic cross-platform ordering
70                entries.sort_by_key(|e| e.file_name());
71                for entry in entries {
72                    let process_path = entry.path();
73                    if !process_path.is_dir() {
74                        continue;
75                    }
76                    let process_path_str = process_path.to_string_lossy().to_string();
77                    let mut sub_entries = Vec::new();
78                    let mut sub_read = fs::read_dir(&process_path).await?;
79                    while let Some(e) = sub_read.next_entry().await? {
80                        sub_entries.push(e);
81                    }
82                    // Sort for deterministic cross-platform ordering
83                    sub_entries.sort_by_key(|e| e.file_name());
84                    for sub_entry in sub_entries {
85                        let sub_path = sub_entry.path();
86                        if sub_path.is_dir() {
87                            let sub_path_str = sub_path.to_string_lossy().to_string();
88                            self.reassemble_plain(&sub_path_str, Some("xml"), true, None)
89                                .await?;
90                        }
91                    }
92                    self.reassemble_plain(&process_path_str, Some("xml"), true, None)
93                        .await?;
94                }
95                ensure_segment_files_structure(
96                    &segment_path,
97                    &rule.wrap_root_element,
98                    &rule.path_segment,
99                    &rule.wrap_xmlns,
100                )
101                .await?;
102            }
103        }
104
105        let base_segment = config.as_ref().and_then(|c| {
106            c.rules.first().map(|r| {
107                (
108                    file_path.to_string(),
109                    r.path_segment.clone(),
110                    true, // extract_inner: segment files have document_root > segment > content
111                )
112            })
113        });
114        // When multi-level reassembly is done, purge the entire disassembled directory
115        let post_purge_final = post_purge || config.is_some();
116        self.reassemble_plain(file_path, file_extension, post_purge_final, base_segment)
117            .await
118    }
119
120    /// Merge and write reassembled XML (no multi-level pre-step). Used internally.
121    /// When base_segment is Some((base_path, segment_name, extract_inner)), processing that base path
122    /// treats the segment subdir as one key whose value is an array; when extract_inner is true,
123    /// each file's root has document_root > segment > content and we use content (not whole root).
124    async fn reassemble_plain(
125        &self,
126        file_path: &str,
127        file_extension: Option<&str>,
128        post_purge: bool,
129        base_segment: Option<(String, String, bool)>,
130    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
131        log::debug!("Parsing directory to reassemble: {}", file_path);
132        let parsed_objects = self
133            .process_files_in_directory(file_path.to_string(), base_segment)
134            .await?;
135
136        if parsed_objects.is_empty() {
137            log::error!(
138                "No files under {} were parsed successfully. A reassembled XML file was not created.",
139                file_path
140            );
141            return Ok(());
142        }
143
144        let mut merged = match merge_xml_elements(&parsed_objects) {
145            Some(m) => m,
146            None => return Ok(()),
147        };
148
149        // Apply stored key order so reassembled XML matches original document order.
150        let key_order_path = Path::new(file_path).join(".key_order.json");
151        if key_order_path.exists() {
152            if let Ok(bytes) = fs::read(&key_order_path).await {
153                if let Ok(key_order) = serde_json::from_slice::<Vec<String>>(&bytes) {
154                    if let Some(reordered) = reorder_root_keys(&merged, &key_order) {
155                        merged = reordered;
156                    }
157                }
158            }
159        }
160
161        let final_xml = build_xml_string(&merged);
162        let output_path = self.get_output_path(file_path, file_extension);
163
164        fs::write(&output_path, final_xml).await?;
165
166        if post_purge {
167            fs::remove_dir_all(file_path).await.ok();
168        }
169
170        Ok(())
171    }
172
173    fn process_files_in_directory<'a>(
174        &'a self,
175        dir_path: String,
176        base_segment: Option<(String, String, bool)>,
177    ) -> ProcessDirFuture<'a> {
178        Box::pin(async move {
179            let mut parsed = Vec::new();
180            let mut entries = Vec::new();
181            let mut read_dir = fs::read_dir(&dir_path).await?;
182            while let Some(entry) = read_dir.next_entry().await? {
183                entries.push(entry);
184            }
185            // Sort by full filename for deterministic cross-platform ordering
186            entries.sort_by(|a, b| {
187                let a_name = a.file_name().to_string_lossy().to_string();
188                let b_name = b.file_name().to_string_lossy().to_string();
189                a_name.cmp(&b_name)
190            });
191
192            let is_base = base_segment
193                .as_ref()
194                .map(|(base, _, _)| dir_path == *base)
195                .unwrap_or(false);
196            let segment_name = base_segment.as_ref().map(|(_, name, _)| name.as_str());
197            let extract_inner = base_segment.as_ref().map(|(_, _, e)| *e).unwrap_or(false);
198
199            for entry in entries {
200                let path = entry.path();
201                let file_path = path.to_string_lossy().to_string();
202
203                if path.is_file() {
204                    let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
205                    if !name.starts_with('.') && self.is_parsable_file(name) {
206                        if let Some(parsed_obj) = parse_to_xml_object(&file_path).await {
207                            parsed.push(parsed_obj);
208                        }
209                    }
210                } else if path.is_dir() {
211                    let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
212                    if is_base && segment_name == Some(dir_name) {
213                        let segment_element = self
214                            .collect_segment_as_array(
215                                &file_path,
216                                segment_name.unwrap(),
217                                extract_inner,
218                            )
219                            .await?;
220                        if let Some(el) = segment_element {
221                            parsed.push(el);
222                        }
223                    } else {
224                        let sub_parsed = self
225                            .process_files_in_directory(file_path, base_segment.clone())
226                            .await?;
227                        parsed.extend(sub_parsed);
228                    }
229                }
230            }
231
232            Ok(parsed)
233        })
234    }
235
236    /// Collect all .xml files in a directory, parse each, and build one element with
237    /// root_key and single key segment_name whose value is array of each file's content.
238    /// When extract_inner is true, each file has root > segment_name > content; we push that content.
239    async fn collect_segment_as_array(
240        &self,
241        segment_dir: &str,
242        segment_name: &str,
243        extract_inner: bool,
244    ) -> Result<Option<XmlElement>, Box<dyn std::error::Error + Send + Sync>> {
245        let mut xml_files = Vec::new();
246        let mut read_dir = fs::read_dir(segment_dir).await?;
247        while let Some(entry) = read_dir.next_entry().await? {
248            let path = entry.path();
249            if path.is_file() {
250                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
251                if !name.starts_with('.') && self.is_parsable_file(name) {
252                    xml_files.push(path.to_string_lossy().to_string());
253                }
254            }
255        }
256        xml_files.sort();
257
258        let mut root_contents = Vec::new();
259        let mut first_xml: Option<(String, Option<Value>)> = None;
260        for file_path in &xml_files {
261            let parsed = match parse_to_xml_object(file_path).await {
262                Some(p) => p,
263                None => continue,
264            };
265            let obj = match parsed.as_object() {
266                Some(o) => o,
267                None => continue,
268            };
269            let root_key = match obj.keys().find(|k| *k != "?xml").cloned() {
270                Some(k) => k,
271                None => continue,
272            };
273            let root_val = obj
274                .get(&root_key)
275                .cloned()
276                .unwrap_or(Value::Object(serde_json::Map::new()));
277            let mut content = if extract_inner {
278                root_val
279                    .get(segment_name)
280                    .cloned()
281                    .unwrap_or_else(|| Value::Object(serde_json::Map::new()))
282            } else {
283                root_val
284            };
285            // Inner segment element (e.g. programProcesses) should not have xmlns in output
286            if extract_inner {
287                content = strip_xmlns_from_value(content);
288            }
289            root_contents.push(content);
290            if first_xml.is_none() {
291                first_xml = Some((root_key, obj.get("?xml").cloned()));
292            }
293        }
294        if root_contents.is_empty() {
295            return Ok(None);
296        }
297        let (root_key, decl_opt) = first_xml.unwrap();
298        let mut content = serde_json::Map::new();
299        content.insert(segment_name.to_string(), Value::Array(root_contents));
300        let mut top = serde_json::Map::new();
301        if let Some(decl) = decl_opt {
302            top.insert("?xml".to_string(), decl);
303        } else {
304            let mut d = serde_json::Map::new();
305            d.insert("@version".to_string(), Value::String("1.0".to_string()));
306            d.insert("@encoding".to_string(), Value::String("UTF-8".to_string()));
307            top.insert("?xml".to_string(), Value::Object(d));
308        }
309        top.insert(root_key, Value::Object(content));
310        Ok(Some(Value::Object(top)))
311    }
312
313    fn is_parsable_file(&self, file_name: &str) -> bool {
314        let lower = file_name.to_lowercase();
315        lower.ends_with(".xml")
316            || lower.ends_with(".json")
317            || lower.ends_with(".json5")
318            || lower.ends_with(".yaml")
319            || lower.ends_with(".yml")
320    }
321
322    async fn validate_directory(
323        &self,
324        path: &str,
325    ) -> Result<bool, Box<dyn std::error::Error + Send + Sync>> {
326        let meta = fs::metadata(path).await?;
327        if !meta.is_dir() {
328            log::error!(
329                "The provided path to reassemble is not a directory: {}",
330                path
331            );
332            return Ok(false);
333        }
334        Ok(true)
335    }
336
337    fn get_output_path(&self, dir_path: &str, extension: Option<&str>) -> String {
338        let path = Path::new(dir_path);
339        let parent = path.parent().unwrap_or(Path::new("."));
340        let base_name = path
341            .file_name()
342            .and_then(|n| n.to_str())
343            .unwrap_or("output");
344        let ext = extension.unwrap_or("xml");
345        parent
346            .join(format!("{}.{}", base_name, ext))
347            .to_string_lossy()
348            .to_string()
349    }
350}
351
352impl Default for ReassembleXmlFileHandler {
353    fn default() -> Self {
354        Self::new()
355    }
356}