Skip to main content

xml_disassembler/handlers/
reassemble.rs

//! Reassemble an XML file from a disassembled directory.
2
3use crate::builders::{build_xml_string, merge_xml_elements, reorder_root_keys};
4use crate::multi_level::{ensure_segment_files_structure, load_multi_level_config};
5use crate::parsers::parse_to_xml_object;
6use crate::types::XmlElement;
7use serde_json::{Map, Value};
8use std::future::Future;
9use std::path::Path;
10use std::pin::Pin;
11use tokio::fs;
12
13/// Remove @xmlns from an object so the reassembled segment wrapper (e.g. programProcesses) has no xmlns.
14fn strip_xmlns_from_value(v: Value) -> Value {
15    let obj = match v.as_object() {
16        Some(o) => o,
17        None => return v,
18    };
19    let mut out = Map::new();
20    for (k, val) in obj {
21        if k != "@xmlns" {
22            out.insert(k.clone(), val.clone());
23        }
24    }
25    Value::Object(out)
26}
27
28type ProcessDirFuture<'a> = Pin<
29    Box<
30        dyn Future<Output = Result<Vec<XmlElement>, Box<dyn std::error::Error + Send + Sync>>>
31            + Send
32            + 'a,
33    >,
34>;
35
/// Stateless handler that reassembles a disassembled directory back into a single file.
#[derive(Debug, Clone, Copy)]
pub struct ReassembleXmlFileHandler;
37
38impl ReassembleXmlFileHandler {
39    pub fn new() -> Self {
40        Self
41    }
42
43    pub async fn reassemble(
44        &self,
45        file_path: &str,
46        file_extension: Option<&str>,
47        post_purge: bool,
48    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
49        if !self.validate_directory(file_path).await? {
50            return Ok(());
51        }
52
53        let path = Path::new(file_path);
54        let config = load_multi_level_config(path).await;
55        if let Some(ref config) = config {
56            for rule in &config.rules {
57                if rule.path_segment.is_empty() {
58                    continue;
59                }
60                let segment_path = path.join(&rule.path_segment);
61                if !segment_path.is_dir() {
62                    continue;
63                }
64                let mut entries = Vec::new();
65                let mut read_dir = fs::read_dir(&segment_path).await?;
66                while let Some(entry) = read_dir.next_entry().await? {
67                    entries.push(entry);
68                }
69                for entry in entries {
70                    let process_path = entry.path();
71                    if !process_path.is_dir() {
72                        continue;
73                    }
74                    let process_path_str = process_path.to_string_lossy().to_string();
75                    let mut sub_entries = Vec::new();
76                    let mut sub_read = fs::read_dir(&process_path).await?;
77                    while let Some(e) = sub_read.next_entry().await? {
78                        sub_entries.push(e);
79                    }
80                    for sub_entry in sub_entries {
81                        let sub_path = sub_entry.path();
82                        if sub_path.is_dir() {
83                            let sub_path_str = sub_path.to_string_lossy().to_string();
84                            self.reassemble_plain(&sub_path_str, Some("xml"), true, None)
85                                .await?;
86                        }
87                    }
88                    self.reassemble_plain(&process_path_str, Some("xml"), true, None)
89                        .await?;
90                }
91                ensure_segment_files_structure(
92                    &segment_path,
93                    &rule.wrap_root_element,
94                    &rule.path_segment,
95                    &rule.wrap_xmlns,
96                )
97                .await?;
98            }
99        }
100
101        let base_segment = config.as_ref().and_then(|c| {
102            c.rules.first().map(|r| {
103                (
104                    file_path.to_string(),
105                    r.path_segment.clone(),
106                    true, // extract_inner: segment files have document_root > segment > content
107                )
108            })
109        });
110        // When multi-level reassembly is done, purge the entire disassembled directory
111        let post_purge_final = post_purge || config.is_some();
112        self.reassemble_plain(file_path, file_extension, post_purge_final, base_segment)
113            .await
114    }
115
116    /// Merge and write reassembled XML (no multi-level pre-step). Used internally.
117    /// When base_segment is Some((base_path, segment_name, extract_inner)), processing that base path
118    /// treats the segment subdir as one key whose value is an array; when extract_inner is true,
119    /// each file's root has document_root > segment > content and we use content (not whole root).
120    async fn reassemble_plain(
121        &self,
122        file_path: &str,
123        file_extension: Option<&str>,
124        post_purge: bool,
125        base_segment: Option<(String, String, bool)>,
126    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
127        log::debug!("Parsing directory to reassemble: {}", file_path);
128        let parsed_objects = self
129            .process_files_in_directory(file_path.to_string(), base_segment)
130            .await?;
131
132        if parsed_objects.is_empty() {
133            log::error!(
134                "No files under {} were parsed successfully. A reassembled XML file was not created.",
135                file_path
136            );
137            return Ok(());
138        }
139
140        let mut merged = match merge_xml_elements(&parsed_objects) {
141            Some(m) => m,
142            None => return Ok(()),
143        };
144
145        // Apply stored key order so reassembled XML matches original document order.
146        let key_order_path = Path::new(file_path).join(".key_order.json");
147        if key_order_path.exists() {
148            if let Ok(bytes) = fs::read(&key_order_path).await {
149                if let Ok(key_order) = serde_json::from_slice::<Vec<String>>(&bytes) {
150                    if let Some(reordered) = reorder_root_keys(&merged, &key_order) {
151                        merged = reordered;
152                    }
153                }
154            }
155        }
156
157        let final_xml = build_xml_string(&merged);
158        let output_path = self.get_output_path(file_path, file_extension);
159
160        fs::write(&output_path, final_xml).await?;
161
162        if post_purge {
163            fs::remove_dir_all(file_path).await.ok();
164        }
165
166        Ok(())
167    }
168
169    fn process_files_in_directory<'a>(
170        &'a self,
171        dir_path: String,
172        base_segment: Option<(String, String, bool)>,
173    ) -> ProcessDirFuture<'a> {
174        Box::pin(async move {
175            let mut parsed = Vec::new();
176            let mut entries = Vec::new();
177            let mut read_dir = fs::read_dir(&dir_path).await?;
178            while let Some(entry) = read_dir.next_entry().await? {
179                entries.push(entry);
180            }
181            entries.sort_by(|a, b| {
182                let a_base: String = a
183                    .file_name()
184                    .to_str()
185                    .unwrap_or("")
186                    .split('.')
187                    .next()
188                    .unwrap_or("")
189                    .to_string();
190                let b_base: String = b
191                    .file_name()
192                    .to_str()
193                    .unwrap_or("")
194                    .split('.')
195                    .next()
196                    .unwrap_or("")
197                    .to_string();
198                a_base.cmp(&b_base)
199            });
200
201            let is_base = base_segment
202                .as_ref()
203                .map(|(base, _, _)| dir_path == *base)
204                .unwrap_or(false);
205            let segment_name = base_segment.as_ref().map(|(_, name, _)| name.as_str());
206            let extract_inner = base_segment.as_ref().map(|(_, _, e)| *e).unwrap_or(false);
207
208            for entry in entries {
209                let path = entry.path();
210                let file_path = path.to_string_lossy().to_string();
211
212                if path.is_file() {
213                    let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
214                    if !name.starts_with('.') && self.is_parsable_file(name) {
215                        if let Some(parsed_obj) = parse_to_xml_object(&file_path).await {
216                            parsed.push(parsed_obj);
217                        }
218                    }
219                } else if path.is_dir() {
220                    let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
221                    if is_base && segment_name == Some(dir_name) {
222                        let segment_element = self
223                            .collect_segment_as_array(
224                                &file_path,
225                                segment_name.unwrap(),
226                                extract_inner,
227                            )
228                            .await?;
229                        if let Some(el) = segment_element {
230                            parsed.push(el);
231                        }
232                    } else {
233                        let sub_parsed = self
234                            .process_files_in_directory(file_path, base_segment.clone())
235                            .await?;
236                        parsed.extend(sub_parsed);
237                    }
238                }
239            }
240
241            Ok(parsed)
242        })
243    }
244
245    /// Collect all .xml files in a directory, parse each, and build one element with
246    /// root_key and single key segment_name whose value is array of each file's content.
247    /// When extract_inner is true, each file has root > segment_name > content; we push that content.
248    async fn collect_segment_as_array(
249        &self,
250        segment_dir: &str,
251        segment_name: &str,
252        extract_inner: bool,
253    ) -> Result<Option<XmlElement>, Box<dyn std::error::Error + Send + Sync>> {
254        let mut xml_files = Vec::new();
255        let mut read_dir = fs::read_dir(segment_dir).await?;
256        while let Some(entry) = read_dir.next_entry().await? {
257            let path = entry.path();
258            if path.is_file() {
259                let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
260                if !name.starts_with('.') && self.is_parsable_file(name) {
261                    xml_files.push(path.to_string_lossy().to_string());
262                }
263            }
264        }
265        xml_files.sort();
266
267        let mut root_contents = Vec::new();
268        let mut first_xml: Option<(String, Option<Value>)> = None;
269        for file_path in &xml_files {
270            let parsed = match parse_to_xml_object(file_path).await {
271                Some(p) => p,
272                None => continue,
273            };
274            let obj = match parsed.as_object() {
275                Some(o) => o,
276                None => continue,
277            };
278            let root_key = match obj.keys().find(|k| *k != "?xml").cloned() {
279                Some(k) => k,
280                None => continue,
281            };
282            let root_val = obj
283                .get(&root_key)
284                .cloned()
285                .unwrap_or(Value::Object(serde_json::Map::new()));
286            let mut content = if extract_inner {
287                root_val
288                    .get(segment_name)
289                    .cloned()
290                    .unwrap_or_else(|| Value::Object(serde_json::Map::new()))
291            } else {
292                root_val
293            };
294            // Inner segment element (e.g. programProcesses) should not have xmlns in output
295            if extract_inner {
296                content = strip_xmlns_from_value(content);
297            }
298            root_contents.push(content);
299            if first_xml.is_none() {
300                first_xml = Some((root_key, obj.get("?xml").cloned()));
301            }
302        }
303        if root_contents.is_empty() {
304            return Ok(None);
305        }
306        let (root_key, decl_opt) = first_xml.unwrap();
307        let mut content = serde_json::Map::new();
308        content.insert(segment_name.to_string(), Value::Array(root_contents));
309        let mut top = serde_json::Map::new();
310        if let Some(decl) = decl_opt {
311            top.insert("?xml".to_string(), decl);
312        } else {
313            let mut d = serde_json::Map::new();
314            d.insert("@version".to_string(), Value::String("1.0".to_string()));
315            d.insert("@encoding".to_string(), Value::String("UTF-8".to_string()));
316            top.insert("?xml".to_string(), Value::Object(d));
317        }
318        top.insert(root_key, Value::Object(content));
319        Ok(Some(Value::Object(top)))
320    }
321
322    fn is_parsable_file(&self, file_name: &str) -> bool {
323        let lower = file_name.to_lowercase();
324        lower.ends_with(".xml")
325            || lower.ends_with(".json")
326            || lower.ends_with(".json5")
327            || lower.ends_with(".yaml")
328            || lower.ends_with(".yml")
329    }
330
331    async fn validate_directory(
332        &self,
333        path: &str,
334    ) -> Result<bool, Box<dyn std::error::Error + Send + Sync>> {
335        let meta = fs::metadata(path).await?;
336        if !meta.is_dir() {
337            log::error!(
338                "The provided path to reassemble is not a directory: {}",
339                path
340            );
341            return Ok(false);
342        }
343        Ok(true)
344    }
345
346    fn get_output_path(&self, dir_path: &str, extension: Option<&str>) -> String {
347        let path = Path::new(dir_path);
348        let parent = path.parent().unwrap_or(Path::new("."));
349        let base_name = path
350            .file_name()
351            .and_then(|n| n.to_str())
352            .unwrap_or("output");
353        let ext = extension.unwrap_or("xml");
354        parent
355            .join(format!("{}.{}", base_name, ext))
356            .to_string_lossy()
357            .to_string()
358    }
359}
360
361impl Default for ReassembleXmlFileHandler {
362    fn default() -> Self {
363        Self::new()
364    }
365}