xml_disassembler/handlers/
reassemble.rs1use crate::builders::{build_xml_string, merge_xml_elements, reorder_root_keys};
4use crate::multi_level::{ensure_segment_files_structure, load_multi_level_config};
5use crate::parsers::parse_to_xml_object;
6use crate::types::XmlElement;
7use crate::utils::normalize_path_unix;
8use serde_json::Value;
9use std::future::Future;
10use std::path::Path;
11use std::pin::Pin;
12use tokio::fs;
13
14async fn read_key_order(path: &Path) -> Option<Vec<String>> {
16 let bytes = fs::read(path).await.ok()?;
17 serde_json::from_slice::<Vec<String>>(&bytes).ok()
18}
19
20fn strip_xmlns_from_value(v: Value) -> Value {
22 match v {
23 Value::Object(obj) => {
24 Value::Object(obj.into_iter().filter(|(k, _)| k != "@xmlns").collect())
25 }
26 other => other,
27 }
28}
29
30type ProcessDirFuture<'a> = Pin<
31 Box<
32 dyn Future<Output = Result<Vec<XmlElement>, Box<dyn std::error::Error + Send + Sync>>>
33 + Send
34 + 'a,
35 >,
36>;
37
/// Handler that reassembles a directory of disassembled files into one file.
///
/// The type is a unit struct and carries no state; all behavior lives in its
/// methods. `Debug` and `Clone` are derived so the handler can be logged and
/// stored inside other derivable types.
#[derive(Debug, Clone)]
pub struct ReassembleXmlFileHandler;
39
40impl ReassembleXmlFileHandler {
41 pub fn new() -> Self {
42 Self
43 }
44
45 pub async fn reassemble(
46 &self,
47 file_path: &str,
48 file_extension: Option<&str>,
49 post_purge: bool,
50 ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
51 let file_path = normalize_path_unix(file_path);
52 if !self.validate_directory(&file_path).await? {
53 return Ok(());
54 }
55
56 let path = Path::new(&file_path);
57 let config = load_multi_level_config(path).await;
58 if let Some(ref config) = config {
59 for rule in &config.rules {
60 let segment_path = path.join(&rule.path_segment);
61 self.reassemble_multi_level_segment(&segment_path, rule)
62 .await?;
63 }
64 }
65
66 let base_segment = config.as_ref().and_then(|c| {
67 c.rules.first().map(|r| {
68 (
69 file_path.clone(),
70 r.path_segment.clone(),
71 true, )
73 })
74 });
75 let post_purge_final = post_purge || config.is_some();
77 self.reassemble_plain(&file_path, file_extension, post_purge_final, base_segment)
78 .await
79 }
80
81 async fn reassemble_multi_level_segment(
84 &self,
85 segment_path: &Path,
86 rule: &crate::types::MultiLevelRule,
87 ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
88 if !segment_path.is_dir() {
89 return Ok(());
90 }
91 let mut entries = Vec::new();
92 let mut read_dir = fs::read_dir(segment_path).await?;
93 while let Some(entry) = read_dir.next_entry().await? {
94 entries.push(entry);
95 }
96 entries.sort_by_key(|e| e.file_name());
97 for entry in entries {
98 let process_path = entry.path();
99 if !process_path.is_dir() {
100 continue;
101 }
102 let process_path_str = normalize_path_unix(&process_path.to_string_lossy());
103 let mut sub_entries = Vec::new();
104 let mut sub_read = fs::read_dir(&process_path).await?;
105 while let Some(e) = sub_read.next_entry().await? {
106 sub_entries.push(e);
107 }
108 sub_entries.sort_by_key(|e| e.file_name());
109 for sub_entry in sub_entries {
110 let sub_path = sub_entry.path();
111 if sub_path.is_dir() {
112 let sub_path_str = normalize_path_unix(&sub_path.to_string_lossy());
113 self.reassemble_plain(&sub_path_str, Some("xml"), true, None)
114 .await?;
115 }
116 }
117 self.reassemble_plain(&process_path_str, Some("xml"), true, None)
118 .await?;
119 }
120 ensure_segment_files_structure(
121 segment_path,
122 &rule.wrap_root_element,
123 &rule.path_segment,
124 &rule.wrap_xmlns,
125 )
126 .await?;
127 Ok(())
128 }
129
130 async fn reassemble_plain(
135 &self,
136 file_path: &str,
137 file_extension: Option<&str>,
138 post_purge: bool,
139 base_segment: Option<(String, String, bool)>,
140 ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
141 let file_path = normalize_path_unix(file_path);
142 log::debug!("Parsing directory to reassemble: {}", file_path);
143 let parsed_objects = self
144 .process_files_in_directory(file_path.to_string(), base_segment)
145 .await?;
146
147 if parsed_objects.is_empty() {
148 log::error!(
149 "No files under {} were parsed successfully. A reassembled XML file was not created.",
150 file_path
151 );
152 return Ok(());
153 }
154
155 let Some(mut merged) = merge_xml_elements(&parsed_objects) else {
159 log::error!(
160 "No usable root element found while merging files under {}. A reassembled XML file was not created.",
161 file_path
162 );
163 return Ok(());
164 };
165
166 let key_order_path = Path::new(&file_path).join(".key_order.json");
168 if let Some(reordered) = read_key_order(&key_order_path)
169 .await
170 .and_then(|order| reorder_root_keys(&merged, &order))
171 {
172 merged = reordered;
173 }
174
175 let final_xml = build_xml_string(&merged);
176 let output_path = self.get_output_path(&file_path, file_extension);
177
178 fs::write(&output_path, final_xml).await?;
179
180 if post_purge {
181 fs::remove_dir_all(file_path).await.ok();
182 }
183
184 Ok(())
185 }
186
187 fn process_files_in_directory<'a>(
188 &'a self,
189 dir_path: String,
190 base_segment: Option<(String, String, bool)>,
191 ) -> ProcessDirFuture<'a> {
192 Box::pin(async move {
193 let mut parsed = Vec::new();
194 let mut entries = Vec::new();
195 let mut read_dir = fs::read_dir(&dir_path).await?;
196 while let Some(entry) = read_dir.next_entry().await? {
197 entries.push(entry);
198 }
199 entries.sort_by(|a, b| {
201 let a_name = a.file_name().to_string_lossy().to_string();
202 let b_name = b.file_name().to_string_lossy().to_string();
203 a_name.cmp(&b_name)
204 });
205
206 let is_base = base_segment
207 .as_ref()
208 .map(|(base, _, _)| dir_path == *base)
209 .unwrap_or(false);
210 let segment_name = base_segment.as_ref().map(|(_, name, _)| name.as_str());
211 let extract_inner = base_segment.as_ref().map(|(_, _, e)| *e).unwrap_or(false);
212
213 for entry in entries {
214 let path = entry.path();
215 let file_path = normalize_path_unix(&path.to_string_lossy()).to_string();
216
217 if path.is_file() {
218 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
219 if !name.starts_with('.') && self.is_parsable_file(name) {
220 if let Some(parsed_obj) = parse_to_xml_object(&file_path).await {
221 parsed.push(parsed_obj);
222 }
223 }
224 } else {
225 let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
228 if is_base && segment_name == Some(dir_name) {
229 let segment_element = self
230 .collect_segment_as_array(
231 &file_path,
232 segment_name.unwrap(),
233 extract_inner,
234 )
235 .await?;
236 if let Some(el) = segment_element {
237 parsed.push(el);
238 }
239 } else {
240 let sub_parsed = self
241 .process_files_in_directory(file_path, base_segment.clone())
242 .await?;
243 parsed.extend(sub_parsed);
244 }
245 }
246 }
247
248 Ok(parsed)
249 })
250 }
251
252 async fn collect_segment_as_array(
256 &self,
257 segment_dir: &str,
258 segment_name: &str,
259 extract_inner: bool,
260 ) -> Result<Option<XmlElement>, Box<dyn std::error::Error + Send + Sync>> {
261 let mut xml_files = Vec::new();
262 let mut read_dir = fs::read_dir(segment_dir).await?;
263 while let Some(entry) = read_dir.next_entry().await? {
264 let path = entry.path();
265 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
266 if path.is_file() && !name.starts_with('.') && self.is_parsable_file(name) {
267 xml_files.push(normalize_path_unix(&path.to_string_lossy()));
268 }
269 }
270 xml_files.sort();
271
272 let mut root_contents = Vec::new();
273 let mut first_xml: Option<(String, Option<Value>)> = None;
274 for file_path in &xml_files {
275 let Some(parsed) = parse_to_xml_object(file_path).await else {
278 continue;
279 };
280 let obj_owned = parsed.as_object().cloned().unwrap_or_default();
281 let obj = &obj_owned;
282 let Some(root_key) = obj.keys().find(|k| *k != "?xml").cloned() else {
283 continue;
284 };
285 let root_val = obj
286 .get(&root_key)
287 .cloned()
288 .unwrap_or(Value::Object(serde_json::Map::new()));
289 let mut content = if extract_inner {
290 root_val
291 .get(segment_name)
292 .cloned()
293 .unwrap_or_else(|| Value::Object(serde_json::Map::new()))
294 } else {
295 root_val
296 };
297 if extract_inner {
299 content = strip_xmlns_from_value(content);
300 }
301 root_contents.push(content);
302 if first_xml.is_none() {
303 first_xml = Some((root_key, obj.get("?xml").cloned()));
304 }
305 }
306 if root_contents.is_empty() {
307 return Ok(None);
308 }
309 let (root_key, decl_opt) = first_xml.unwrap();
310 let mut content = serde_json::Map::new();
311 content.insert(segment_name.to_string(), Value::Array(root_contents));
312 let mut top = serde_json::Map::new();
313 if let Some(decl) = decl_opt {
314 top.insert("?xml".to_string(), decl);
315 } else {
316 let mut d = serde_json::Map::new();
317 d.insert("@version".to_string(), Value::String("1.0".to_string()));
318 d.insert("@encoding".to_string(), Value::String("UTF-8".to_string()));
319 top.insert("?xml".to_string(), Value::Object(d));
320 }
321 top.insert(root_key, Value::Object(content));
322 Ok(Some(Value::Object(top)))
323 }
324
325 fn is_parsable_file(&self, file_name: &str) -> bool {
326 let lower = file_name.to_lowercase();
327 lower.ends_with(".xml")
328 || lower.ends_with(".json")
329 || lower.ends_with(".json5")
330 || lower.ends_with(".yaml")
331 || lower.ends_with(".yml")
332 }
333
334 async fn validate_directory(
335 &self,
336 path: &str,
337 ) -> Result<bool, Box<dyn std::error::Error + Send + Sync>> {
338 let meta = fs::metadata(path).await?;
339 if !meta.is_dir() {
340 log::error!(
341 "The provided path to reassemble is not a directory: {}",
342 path
343 );
344 return Ok(false);
345 }
346 Ok(true)
347 }
348
349 fn get_output_path(&self, dir_path: &str, extension: Option<&str>) -> String {
350 let path = Path::new(dir_path);
351 let parent = path.parent().unwrap_or(Path::new("."));
352 let base_name = path
353 .file_name()
354 .and_then(|n| n.to_str())
355 .unwrap_or("output");
356 let ext = extension.unwrap_or("xml");
357 parent
358 .join(format!("{}.{}", base_name, ext))
359 .to_string_lossy()
360 .to_string()
361 }
362}
363
364impl Default for ReassembleXmlFileHandler {
365 fn default() -> Self {
366 Self::new()
367 }
368}
369
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Builds a rule whose wrap root element is `"Root"`; the pattern,
    /// strip-root and unique-id fields are left empty.
    fn rule_for(path_segment: &str, wrap_xmlns: &str) -> crate::types::MultiLevelRule {
        crate::types::MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: path_segment.to_string(),
            wrap_root_element: "Root".to_string(),
            wrap_xmlns: wrap_xmlns.to_string(),
        }
    }

    #[test]
    #[allow(clippy::default_constructed_unit_structs)]
    fn reassemble_handler_default_equals_new() {
        // Only checks that the `Default` impl can be called.
        let _handler = ReassembleXmlFileHandler::default();
    }

    #[test]
    fn strip_xmlns_from_value_passes_non_object_through() {
        let untouched_inputs = [Value::String("hello".to_string()), json!([1, 2])];
        for input in untouched_inputs {
            assert_eq!(strip_xmlns_from_value(input.clone()), input);
        }
    }

    #[test]
    fn strip_xmlns_from_value_removes_xmlns_key() {
        let stripped = strip_xmlns_from_value(json!({ "@xmlns": "ns", "child": 1 }));
        let fields = stripped.as_object().unwrap();
        assert!(!fields.contains_key("@xmlns"));
        assert_eq!(fields.get("child").and_then(Value::as_i64), Some(1));
    }

    #[test]
    fn is_parsable_file_recognises_supported_extensions() {
        let handler = ReassembleXmlFileHandler::new();
        for accepted in ["a.xml", "a.json", "a.json5", "a.yaml", "a.yml", "A.XML"] {
            assert!(handler.is_parsable_file(accepted));
        }
        assert!(!handler.is_parsable_file("a.txt"));
    }

    #[test]
    fn get_output_path_appends_extension_and_uses_parent_dir() {
        let handler = ReassembleXmlFileHandler::new();

        let explicit = handler.get_output_path("/tmp/foo", Some("xml"));
        assert!(explicit.ends_with("foo.xml"));

        let defaulted = handler.get_output_path("/tmp/bar", None);
        assert!(defaulted.ends_with("bar.xml"));

        // A bare name has an empty parent, so the result is just the file name.
        assert_eq!(handler.get_output_path("only", Some("json")), "only.json");
    }

    #[tokio::test]
    async fn reassemble_multi_level_segment_noop_when_not_dir() {
        let handler = ReassembleXmlFileHandler::new();
        let scratch = tempfile::tempdir().unwrap();
        let plain_file = scratch.path().join("not_a_dir.txt");
        tokio::fs::write(&plain_file, "hi").await.unwrap();

        let rule = rule_for("", "");
        handler
            .reassemble_multi_level_segment(&plain_file, &rule)
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn reassemble_multi_level_segment_skips_files_in_segment_root() {
        let handler = ReassembleXmlFileHandler::new();
        let scratch = tempfile::tempdir().unwrap();
        let segment_dir = scratch.path().join("segment");
        tokio::fs::create_dir(&segment_dir).await.unwrap();
        // A plain file directly under the segment root must be ignored.
        tokio::fs::write(segment_dir.join("stray.txt"), "x")
            .await
            .unwrap();

        let rule = rule_for("segment", "http://example.com");
        handler
            .reassemble_multi_level_segment(&segment_dir, &rule)
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn collect_segment_as_array_returns_none_for_empty_dir() {
        let handler = ReassembleXmlFileHandler::new();
        let scratch = tempfile::tempdir().unwrap();
        let dir = scratch.path().to_str().unwrap();

        let collected = handler
            .collect_segment_as_array(dir, "seg", true)
            .await
            .unwrap();
        assert!(collected.is_none());
    }

    #[tokio::test]
    async fn collect_segment_as_array_skips_unparseable_and_empty_roots() {
        let handler = ReassembleXmlFileHandler::new();
        let scratch = tempfile::tempdir().unwrap();
        // Malformed XML, an empty file, and a dot-file.
        let fixtures = [("bad.xml", "<<"), ("only-decl.xml", ""), (".hidden.xml", "<r/>")];
        for (file_name, body) in fixtures {
            tokio::fs::write(scratch.path().join(file_name), body)
                .await
                .unwrap();
        }

        let collected = handler
            .collect_segment_as_array(scratch.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap();
        assert!(collected.is_none());
    }

    #[tokio::test]
    async fn collect_segment_as_array_without_extract_inner_wraps_root() {
        let handler = ReassembleXmlFileHandler::new();
        let scratch = tempfile::tempdir().unwrap();
        tokio::fs::write(
            scratch.path().join("a.xml"),
            r#"<Root><child>1</child></Root>"#,
        )
        .await
        .unwrap();

        let collected = handler
            .collect_segment_as_array(scratch.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap()
            .unwrap();
        let top = collected.as_object().unwrap();
        assert!(top.contains_key("?xml"));
        let root = top.get("Root").and_then(|r| r.as_object()).unwrap();
        assert!(root.get("seg").and_then(|v| v.as_array()).is_some());
    }
}