Skip to main content

xml_disassembler/handlers/
reassemble.rs

1//! Reassemble XML from disassembled directory.
2
3use crate::builders::{build_xml_string, merge_xml_elements, reorder_root_keys};
4use crate::multi_level::{ensure_segment_files_structure, load_multi_level_config};
5use crate::parsers::parse_to_xml_object;
6use crate::types::XmlElement;
7use crate::utils::normalize_path_unix;
8use serde_json::{Map, Value};
9use std::future::Future;
10use std::path::Path;
11use std::pin::Pin;
12use tokio::fs;
13
14/// Remove @xmlns from an object so the reassembled segment wrapper (e.g. programProcesses) has no xmlns.
15fn strip_xmlns_from_value(v: Value) -> Value {
16    let obj = match v.as_object() {
17        Some(o) => o,
18        None => return v,
19    };
20    let mut out = Map::new();
21    for (k, val) in obj {
22        if k != "@xmlns" {
23            out.insert(k.clone(), val.clone());
24        }
25    }
26    Value::Object(out)
27}
28
29type ProcessDirFuture<'a> = Pin<
30    Box<
31        dyn Future<Output = Result<Vec<XmlElement>, Box<dyn std::error::Error + Send + Sync>>>
32            + Send
33            + 'a,
34    >,
35>;
36
/// Stateless handler that rebuilds a single XML file from a directory of
/// disassembled files.
///
/// The type carries no data, so it is cheap to copy and can be shared freely.
#[derive(Debug, Clone, Copy)]
pub struct ReassembleXmlFileHandler;
38
39impl ReassembleXmlFileHandler {
40    pub fn new() -> Self {
41        Self
42    }
43
44    pub async fn reassemble(
45        &self,
46        file_path: &str,
47        file_extension: Option<&str>,
48        post_purge: bool,
49    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
50        let file_path = normalize_path_unix(file_path);
51        if !self.validate_directory(&file_path).await? {
52            return Ok(());
53        }
54
55        let path = Path::new(&file_path);
56        let config = load_multi_level_config(path).await;
57        if let Some(ref config) = config {
58            for rule in &config.rules {
59                if rule.path_segment.is_empty() {
60                    continue;
61                }
62                let segment_path = path.join(&rule.path_segment);
63                if !segment_path.is_dir() {
64                    continue;
65                }
66                let mut entries = Vec::new();
67                let mut read_dir = fs::read_dir(&segment_path).await?;
68                while let Some(entry) = read_dir.next_entry().await? {
69                    entries.push(entry);
70                }
71                // Sort for deterministic cross-platform ordering
72                entries.sort_by_key(|e| e.file_name());
73                for entry in entries {
74                    let process_path = entry.path();
75                    if !process_path.is_dir() {
76                        continue;
77                    }
78                    let process_path_str = normalize_path_unix(&process_path.to_string_lossy());
79                    let mut sub_entries = Vec::new();
80                    let mut sub_read = fs::read_dir(&process_path).await?;
81                    while let Some(e) = sub_read.next_entry().await? {
82                        sub_entries.push(e);
83                    }
84                    // Sort for deterministic cross-platform ordering
85                    sub_entries.sort_by_key(|e| e.file_name());
86                    for sub_entry in sub_entries {
87                        let sub_path = sub_entry.path();
88                        if sub_path.is_dir() {
89                            let sub_path_str = normalize_path_unix(&sub_path.to_string_lossy());
90                            self.reassemble_plain(&sub_path_str, Some("xml"), true, None)
91                                .await?;
92                        }
93                    }
94                    self.reassemble_plain(&process_path_str, Some("xml"), true, None)
95                        .await?;
96                }
97                ensure_segment_files_structure(
98                    &segment_path,
99                    &rule.wrap_root_element,
100                    &rule.path_segment,
101                    &rule.wrap_xmlns,
102                )
103                .await?;
104            }
105        }
106
107        let base_segment = config.as_ref().and_then(|c| {
108            c.rules.first().map(|r| {
109                (
110                    file_path.clone(),
111                    r.path_segment.clone(),
112                    true, // extract_inner: segment files have document_root > segment > content
113                )
114            })
115        });
116        // When multi-level reassembly is done, purge the entire disassembled directory
117        let post_purge_final = post_purge || config.is_some();
118        self.reassemble_plain(&file_path, file_extension, post_purge_final, base_segment)
119            .await
120    }
121
122    /// Merge and write reassembled XML (no multi-level pre-step). Used internally.
123    /// When base_segment is Some((base_path, segment_name, extract_inner)), processing that base path
124    /// treats the segment subdir as one key whose value is an array; when extract_inner is true,
125    /// each file's root has document_root > segment > content and we use content (not whole root).
126    async fn reassemble_plain(
127        &self,
128        file_path: &str,
129        file_extension: Option<&str>,
130        post_purge: bool,
131        base_segment: Option<(String, String, bool)>,
132    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
133        let file_path = normalize_path_unix(file_path);
134        log::debug!("Parsing directory to reassemble: {}", file_path);
135        let parsed_objects = self
136            .process_files_in_directory(file_path.to_string(), base_segment)
137            .await?;
138
139        if parsed_objects.is_empty() {
140            log::error!(
141                "No files under {} were parsed successfully. A reassembled XML file was not created.",
142                file_path
143            );
144            return Ok(());
145        }
146
147        let mut merged = match merge_xml_elements(&parsed_objects) {
148            Some(m) => m,
149            None => return Ok(()),
150        };
151
152        // Apply stored key order so reassembled XML matches original document order.
153        let key_order_path = Path::new(&file_path).join(".key_order.json");
154        if key_order_path.exists() {
155            if let Ok(bytes) = fs::read(&key_order_path).await {
156                if let Ok(key_order) = serde_json::from_slice::<Vec<String>>(&bytes) {
157                    if let Some(reordered) = reorder_root_keys(&merged, &key_order) {
158                        merged = reordered;
159                    }
160                }
161            }
162        }
163
164        let final_xml = build_xml_string(&merged);
165        let output_path = self.get_output_path(&file_path, file_extension);
166
167        fs::write(&output_path, final_xml).await?;
168
169        if post_purge {
170            fs::remove_dir_all(file_path).await.ok();
171        }
172
173        Ok(())
174    }
175
176    fn process_files_in_directory<'a>(
177        &'a self,
178        dir_path: String,
179        base_segment: Option<(String, String, bool)>,
180    ) -> ProcessDirFuture<'a> {
181        Box::pin(async move {
182            let mut parsed = Vec::new();
183            let mut entries = Vec::new();
184            let mut read_dir = fs::read_dir(&dir_path).await?;
185            while let Some(entry) = read_dir.next_entry().await? {
186                entries.push(entry);
187            }
188            // Sort by full filename for deterministic cross-platform ordering
189            entries.sort_by(|a, b| {
190                let a_name = a.file_name().to_string_lossy().to_string();
191                let b_name = b.file_name().to_string_lossy().to_string();
192                a_name.cmp(&b_name)
193            });
194
195            let is_base = base_segment
196                .as_ref()
197                .map(|(base, _, _)| dir_path == *base)
198                .unwrap_or(false);
199            let segment_name = base_segment.as_ref().map(|(_, name, _)| name.as_str());
200            let extract_inner = base_segment.as_ref().map(|(_, _, e)| *e).unwrap_or(false);
201
202            for entry in entries {
203                let path = entry.path();
204                let file_path = normalize_path_unix(&path.to_string_lossy()).to_string();
205
206                if path.is_file() {
207                    let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
208                    if !name.starts_with('.') && self.is_parsable_file(name) {
209                        if let Some(parsed_obj) = parse_to_xml_object(&file_path).await {
210                            parsed.push(parsed_obj);
211                        }
212                    }
213                } else if path.is_dir() {
214                    let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
215                    if is_base && segment_name == Some(dir_name) {
216                        let segment_element = self
217                            .collect_segment_as_array(
218                                &file_path,
219                                segment_name.unwrap(),
220                                extract_inner,
221                            )
222                            .await?;
223                        if let Some(el) = segment_element {
224                            parsed.push(el);
225                        }
226                    } else {
227                        let sub_parsed = self
228                            .process_files_in_directory(file_path, base_segment.clone())
229                            .await?;
230                        parsed.extend(sub_parsed);
231                    }
232                }
233            }
234
235            Ok(parsed)
236        })
237    }
238
239    /// Collect all .xml files in a directory, parse each, and build one element with
240    /// root_key and single key segment_name whose value is array of each file's content.
241    /// When extract_inner is true, each file has root > segment_name > content; we push that content.
242    async fn collect_segment_as_array(
243        &self,
244        segment_dir: &str,
245        segment_name: &str,
246        extract_inner: bool,
247    ) -> Result<Option<XmlElement>, Box<dyn std::error::Error + Send + Sync>> {
248        let mut xml_files = Vec::new();
249        let mut read_dir = fs::read_dir(segment_dir).await?;
250        while let Some(entry) = read_dir.next_entry().await? {
251            let path = entry.path();
252            if path.is_file() {
253                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
254                if !name.starts_with('.') && self.is_parsable_file(name) {
255                    xml_files.push(normalize_path_unix(&path.to_string_lossy()));
256                }
257            }
258        }
259        xml_files.sort();
260
261        let mut root_contents = Vec::new();
262        let mut first_xml: Option<(String, Option<Value>)> = None;
263        for file_path in &xml_files {
264            let parsed = match parse_to_xml_object(file_path).await {
265                Some(p) => p,
266                None => continue,
267            };
268            let obj = match parsed.as_object() {
269                Some(o) => o,
270                None => continue,
271            };
272            let root_key = match obj.keys().find(|k| *k != "?xml").cloned() {
273                Some(k) => k,
274                None => continue,
275            };
276            let root_val = obj
277                .get(&root_key)
278                .cloned()
279                .unwrap_or(Value::Object(serde_json::Map::new()));
280            let mut content = if extract_inner {
281                root_val
282                    .get(segment_name)
283                    .cloned()
284                    .unwrap_or_else(|| Value::Object(serde_json::Map::new()))
285            } else {
286                root_val
287            };
288            // Inner segment element (e.g. programProcesses) should not have xmlns in output
289            if extract_inner {
290                content = strip_xmlns_from_value(content);
291            }
292            root_contents.push(content);
293            if first_xml.is_none() {
294                first_xml = Some((root_key, obj.get("?xml").cloned()));
295            }
296        }
297        if root_contents.is_empty() {
298            return Ok(None);
299        }
300        let (root_key, decl_opt) = first_xml.unwrap();
301        let mut content = serde_json::Map::new();
302        content.insert(segment_name.to_string(), Value::Array(root_contents));
303        let mut top = serde_json::Map::new();
304        if let Some(decl) = decl_opt {
305            top.insert("?xml".to_string(), decl);
306        } else {
307            let mut d = serde_json::Map::new();
308            d.insert("@version".to_string(), Value::String("1.0".to_string()));
309            d.insert("@encoding".to_string(), Value::String("UTF-8".to_string()));
310            top.insert("?xml".to_string(), Value::Object(d));
311        }
312        top.insert(root_key, Value::Object(content));
313        Ok(Some(Value::Object(top)))
314    }
315
316    fn is_parsable_file(&self, file_name: &str) -> bool {
317        let lower = file_name.to_lowercase();
318        lower.ends_with(".xml")
319            || lower.ends_with(".json")
320            || lower.ends_with(".json5")
321            || lower.ends_with(".yaml")
322            || lower.ends_with(".yml")
323    }
324
325    async fn validate_directory(
326        &self,
327        path: &str,
328    ) -> Result<bool, Box<dyn std::error::Error + Send + Sync>> {
329        let meta = fs::metadata(path).await?;
330        if !meta.is_dir() {
331            log::error!(
332                "The provided path to reassemble is not a directory: {}",
333                path
334            );
335            return Ok(false);
336        }
337        Ok(true)
338    }
339
340    fn get_output_path(&self, dir_path: &str, extension: Option<&str>) -> String {
341        let path = Path::new(dir_path);
342        let parent = path.parent().unwrap_or(Path::new("."));
343        let base_name = path
344            .file_name()
345            .and_then(|n| n.to_str())
346            .unwrap_or("output");
347        let ext = extension.unwrap_or("xml");
348        parent
349            .join(format!("{}.{}", base_name, ext))
350            .to_string_lossy()
351            .to_string()
352    }
353}
354
355impl Default for ReassembleXmlFileHandler {
356    fn default() -> Self {
357        Self::new()
358    }
359}
360
#[cfg(test)]
mod tests {
    use super::ReassembleXmlFileHandler;

    /// Smoke test: the `Default` implementation can construct a handler.
    #[test]
    #[allow(clippy::default_constructed_unit_structs)]
    fn reassemble_handler_default_equals_new() {
        let _ = <ReassembleXmlFileHandler as Default>::default();
    }
}