config_disassembler/xml/handlers/
reassemble.rs1use crate::xml::builders::{build_xml_string, merge_xml_elements, reorder_root_keys};
4use crate::xml::multi_level::{ensure_segment_files_structure, load_multi_level_config};
5use crate::xml::parsers::parse_to_xml_object;
6use crate::xml::types::XmlElement;
7use crate::xml::utils::normalize_path_unix;
8use serde_json::Value;
9use std::future::Future;
10use std::path::Path;
11use std::pin::Pin;
12use tokio::fs;
13
14async fn read_key_order(path: &Path) -> Option<Vec<String>> {
16 let bytes = fs::read(path).await.ok()?;
17 serde_json::from_slice::<Vec<String>>(&bytes).ok()
18}
19
20fn strip_xmlns_from_value(v: Value) -> Value {
22 match v {
23 Value::Object(obj) => {
24 Value::Object(obj.into_iter().filter(|(k, _)| k != "@xmlns").collect())
25 }
26 other => other,
27 }
28}
29
30type ProcessDirFuture<'a> = Pin<
31 Box<
32 dyn Future<Output = Result<Vec<XmlElement>, Box<dyn std::error::Error + Send + Sync>>>
33 + Send
34 + 'a,
35 >,
36>;
37
/// Stateless handler that reassembles a directory of split files back into a
/// single output file.
///
/// The type carries no data, so it is cheap to copy and can be printed with
/// `{:?}` for diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct ReassembleXmlFileHandler;
39
40impl ReassembleXmlFileHandler {
41 pub fn new() -> Self {
42 Self
43 }
44
45 pub async fn reassemble(
46 &self,
47 file_path: &str,
48 file_extension: Option<&str>,
49 post_purge: bool,
50 ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
51 let file_path = normalize_path_unix(file_path);
52 if !self.validate_directory(&file_path).await? {
53 return Ok(());
54 }
55
56 let path = Path::new(&file_path);
57 let config = load_multi_level_config(path).await;
58 if let Some(ref config) = config {
59 for rule in &config.rules {
60 let segment_path = path.join(&rule.path_segment);
61 self.reassemble_multi_level_segment(&segment_path, rule)
62 .await?;
63 }
64 }
65
66 let base_segments: Vec<(String, String, bool)> = config
69 .as_ref()
70 .map(|c| {
71 c.rules
72 .iter()
73 .map(|r| (file_path.clone(), r.path_segment.clone(), true))
74 .collect()
75 })
76 .unwrap_or_default();
77 let post_purge_final = post_purge || config.is_some();
79 self.reassemble_plain(&file_path, file_extension, post_purge_final, &base_segments)
80 .await
81 }
82
83 async fn reassemble_multi_level_segment(
86 &self,
87 segment_path: &Path,
88 rule: &crate::xml::types::MultiLevelRule,
89 ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
90 if !segment_path.is_dir() {
91 return Ok(());
92 }
93 let mut entries = Vec::new();
94 let mut read_dir = fs::read_dir(segment_path).await?;
95 while let Some(entry) = read_dir.next_entry().await? {
96 entries.push(entry);
97 }
98 entries.sort_by_key(|e| e.file_name());
99 for entry in entries {
100 let process_path = entry.path();
101 if !process_path.is_dir() {
102 continue;
103 }
104 let process_path_str = normalize_path_unix(&process_path.to_string_lossy());
105 let mut sub_entries = Vec::new();
106 let mut sub_read = fs::read_dir(&process_path).await?;
107 while let Some(e) = sub_read.next_entry().await? {
108 sub_entries.push(e);
109 }
110 sub_entries.sort_by_key(|e| e.file_name());
111 for sub_entry in sub_entries {
112 let sub_path = sub_entry.path();
113 if sub_path.is_dir() {
114 let sub_path_str = normalize_path_unix(&sub_path.to_string_lossy());
115 self.reassemble_plain(&sub_path_str, Some("xml"), true, &[])
116 .await?;
117 }
118 }
119 self.reassemble_plain(&process_path_str, Some("xml"), true, &[])
120 .await?;
121 }
122 ensure_segment_files_structure(
123 segment_path,
124 &rule.wrap_root_element,
125 &rule.path_segment,
126 &rule.wrap_xmlns,
127 )
128 .await?;
129 Ok(())
130 }
131
132 async fn reassemble_plain(
140 &self,
141 file_path: &str,
142 file_extension: Option<&str>,
143 post_purge: bool,
144 base_segments: &[(String, String, bool)],
145 ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
146 let file_path = normalize_path_unix(file_path);
147 log::debug!("Parsing directory to reassemble: {}", file_path);
148 let parsed_objects = self
149 .process_files_in_directory(file_path.to_string(), base_segments.to_vec())
150 .await?;
151
152 if parsed_objects.is_empty() {
153 log::error!(
154 "No files under {} were parsed successfully. A reassembled XML file was not created.",
155 file_path
156 );
157 return Ok(());
158 }
159
160 let Some(mut merged) = merge_xml_elements(&parsed_objects) else {
164 log::error!(
165 "No usable root element found while merging files under {}. A reassembled XML file was not created.",
166 file_path
167 );
168 return Ok(());
169 };
170
171 let key_order_path = Path::new(&file_path).join(".key_order.json");
173 if let Some(reordered) = read_key_order(&key_order_path)
174 .await
175 .and_then(|order| reorder_root_keys(&merged, &order))
176 {
177 merged = reordered;
178 }
179
180 let final_xml = build_xml_string(&merged);
181 let output_path = self.get_output_path(&file_path, file_extension);
182
183 fs::write(&output_path, final_xml).await?;
184
185 if post_purge {
186 fs::remove_dir_all(file_path).await.ok();
187 }
188
189 Ok(())
190 }
191
192 fn process_files_in_directory<'a>(
193 &'a self,
194 dir_path: String,
195 base_segments: Vec<(String, String, bool)>,
196 ) -> ProcessDirFuture<'a> {
197 Box::pin(async move {
198 let mut parsed = Vec::new();
199 let mut entries = Vec::new();
200 let mut read_dir = fs::read_dir(&dir_path).await?;
201 while let Some(entry) = read_dir.next_entry().await? {
202 entries.push(entry);
203 }
204 entries.sort_by(|a, b| {
206 let a_name = a.file_name().to_string_lossy().to_string();
207 let b_name = b.file_name().to_string_lossy().to_string();
208 a_name.cmp(&b_name)
209 });
210
211 let is_base = base_segments.iter().any(|(base, _, _)| dir_path == *base);
216
217 for entry in entries {
218 let path = entry.path();
219 let file_path = normalize_path_unix(&path.to_string_lossy()).to_string();
220
221 if path.is_file() {
222 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
223 if !name.starts_with('.') && self.is_parsable_file(name) {
224 if let Some(parsed_obj) = parse_to_xml_object(&file_path).await {
225 parsed.push(parsed_obj);
226 }
227 }
228 } else {
229 let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
232 let matched_segment = if is_base {
233 base_segments
234 .iter()
235 .find(|(_, seg_name, _)| seg_name == dir_name)
236 .cloned()
237 } else {
238 None
239 };
240 if let Some((_, segment_name, extract_inner)) = matched_segment {
241 let segment_element = self
242 .collect_segment_as_array(&file_path, &segment_name, extract_inner)
243 .await?;
244 if let Some(el) = segment_element {
245 parsed.push(el);
246 }
247 } else {
248 let sub_parsed = self
249 .process_files_in_directory(file_path, base_segments.clone())
250 .await?;
251 parsed.extend(sub_parsed);
252 }
253 }
254 }
255
256 Ok(parsed)
257 })
258 }
259
260 async fn collect_segment_as_array(
264 &self,
265 segment_dir: &str,
266 segment_name: &str,
267 extract_inner: bool,
268 ) -> Result<Option<XmlElement>, Box<dyn std::error::Error + Send + Sync>> {
269 let mut xml_files = Vec::new();
270 let mut read_dir = fs::read_dir(segment_dir).await?;
271 while let Some(entry) = read_dir.next_entry().await? {
272 let path = entry.path();
273 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
274 if path.is_file() && !name.starts_with('.') && self.is_parsable_file(name) {
275 xml_files.push(normalize_path_unix(&path.to_string_lossy()));
276 }
277 }
278 xml_files.sort();
279
280 let mut root_contents = Vec::new();
281 let mut first_xml: Option<(String, Option<Value>)> = None;
282 for file_path in &xml_files {
283 let Some(parsed) = parse_to_xml_object(file_path).await else {
286 continue;
287 };
288 let obj_owned = parsed.as_object().cloned().unwrap_or_default();
289 let obj = &obj_owned;
290 let Some(root_key) = obj.keys().find(|k| *k != "?xml").cloned() else {
291 continue;
292 };
293 let root_val = obj
294 .get(&root_key)
295 .cloned()
296 .unwrap_or(Value::Object(serde_json::Map::new()));
297 let mut content = if extract_inner {
298 root_val
299 .get(segment_name)
300 .cloned()
301 .unwrap_or_else(|| Value::Object(serde_json::Map::new()))
302 } else {
303 root_val
304 };
305 if extract_inner {
307 content = strip_xmlns_from_value(content);
308 }
309 root_contents.push(content);
310 if first_xml.is_none() {
311 first_xml = Some((root_key, obj.get("?xml").cloned()));
312 }
313 }
314 if root_contents.is_empty() {
315 return Ok(None);
316 }
317 let (root_key, decl_opt) = first_xml.unwrap();
318 let mut content = serde_json::Map::new();
319 content.insert(segment_name.to_string(), Value::Array(root_contents));
320 let mut top = serde_json::Map::new();
321 if let Some(decl) = decl_opt {
322 top.insert("?xml".to_string(), decl);
323 } else {
324 let mut d = serde_json::Map::new();
325 d.insert("@version".to_string(), Value::String("1.0".to_string()));
326 d.insert("@encoding".to_string(), Value::String("UTF-8".to_string()));
327 top.insert("?xml".to_string(), Value::Object(d));
328 }
329 top.insert(root_key, Value::Object(content));
330 Ok(Some(Value::Object(top)))
331 }
332
333 fn is_parsable_file(&self, file_name: &str) -> bool {
334 let lower = file_name.to_lowercase();
335 lower.ends_with(".xml")
336 || lower.ends_with(".json")
337 || lower.ends_with(".json5")
338 || lower.ends_with(".yaml")
339 || lower.ends_with(".yml")
340 }
341
342 async fn validate_directory(
343 &self,
344 path: &str,
345 ) -> Result<bool, Box<dyn std::error::Error + Send + Sync>> {
346 let meta = fs::metadata(path).await?;
347 if !meta.is_dir() {
348 log::error!(
349 "The provided path to reassemble is not a directory: {}",
350 path
351 );
352 return Ok(false);
353 }
354 Ok(true)
355 }
356
357 fn get_output_path(&self, dir_path: &str, extension: Option<&str>) -> String {
358 let path = Path::new(dir_path);
359 let parent = path.parent().unwrap_or(Path::new("."));
360 let base_name = path
361 .file_name()
362 .and_then(|n| n.to_str())
363 .unwrap_or("output");
364 let ext = extension.unwrap_or("xml");
365 parent
366 .join(format!("{}.{}", base_name, ext))
367 .to_string_lossy()
368 .to_string()
369 }
370}
371
372impl Default for ReassembleXmlFileHandler {
373 fn default() -> Self {
374 Self::new()
375 }
376}
377
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Builds a rule wrapping in `Root`, varying only the segment and xmlns;
    /// the remaining fields are left empty.
    fn rule_for(path_segment: &str, wrap_xmlns: &str) -> crate::xml::types::MultiLevelRule {
        crate::xml::types::MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: path_segment.to_string(),
            wrap_root_element: "Root".to_string(),
            wrap_xmlns: wrap_xmlns.to_string(),
        }
    }

    #[test]
    #[allow(clippy::default_constructed_unit_structs)]
    fn reassemble_handler_default_equals_new() {
        // Only checks that the `Default` impl can be invoked.
        let _handler = ReassembleXmlFileHandler::default();
    }

    #[test]
    fn strip_xmlns_from_value_passes_non_object_through() {
        let text = Value::String("hello".to_string());
        assert_eq!(strip_xmlns_from_value(text.clone()), text);

        let array = json!([1, 2]);
        assert_eq!(strip_xmlns_from_value(array.clone()), array);
    }

    #[test]
    fn strip_xmlns_from_value_removes_xmlns_key() {
        let stripped = strip_xmlns_from_value(json!({ "@xmlns": "ns", "child": 1 }));
        let fields = stripped.as_object().unwrap();
        assert!(!fields.contains_key("@xmlns"));
        assert_eq!(fields.get("child").and_then(Value::as_i64), Some(1));
    }

    #[test]
    fn is_parsable_file_recognises_supported_extensions() {
        let handler = ReassembleXmlFileHandler::new();
        for supported in ["a.xml", "a.json", "a.json5", "a.yaml", "a.yml", "A.XML"] {
            assert!(
                handler.is_parsable_file(supported),
                "{} should be parsable",
                supported
            );
        }
        assert!(!handler.is_parsable_file("a.txt"));
    }

    #[test]
    fn get_output_path_appends_extension_and_uses_parent_dir() {
        let handler = ReassembleXmlFileHandler::new();

        let explicit = handler.get_output_path("/tmp/foo", Some("xml"));
        assert!(explicit.ends_with("foo.xml"));

        let defaulted = handler.get_output_path("/tmp/bar", None);
        assert!(defaulted.ends_with("bar.xml"));

        // A bare name has an empty parent, so the result has no directory part.
        assert_eq!(handler.get_output_path("only", Some("json")), "only.json");
    }

    #[tokio::test]
    async fn reassemble_multi_level_segment_noop_when_not_dir() {
        let handler = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let plain_file = tmp.path().join("not_a_dir.txt");
        tokio::fs::write(&plain_file, "hi").await.unwrap();

        let rule = rule_for("", "");
        handler
            .reassemble_multi_level_segment(&plain_file, &rule)
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn reassemble_multi_level_segment_skips_files_in_segment_root() {
        let handler = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let segment = tmp.path().join("segment");
        tokio::fs::create_dir(&segment).await.unwrap();
        tokio::fs::write(segment.join("stray.txt"), "x")
            .await
            .unwrap();

        let rule = rule_for("segment", "http://example.com");
        handler
            .reassemble_multi_level_segment(&segment, &rule)
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn collect_segment_as_array_returns_none_for_empty_dir() {
        let handler = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let dir = tmp.path().to_str().unwrap();

        let collected = handler
            .collect_segment_as_array(dir, "seg", true)
            .await
            .unwrap();
        assert!(collected.is_none());
    }

    #[tokio::test]
    async fn collect_segment_as_array_skips_unparseable_and_empty_roots() {
        let handler = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let fixtures = [
            ("bad.xml", "<<"),
            ("only-decl.xml", ""),
            (".hidden.xml", "<r/>"),
        ];
        for (name, contents) in fixtures {
            tokio::fs::write(tmp.path().join(name), contents)
                .await
                .unwrap();
        }

        let collected = handler
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap();
        assert!(collected.is_none());
    }

    #[tokio::test]
    async fn collect_segment_as_array_without_extract_inner_wraps_root() {
        let handler = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        tokio::fs::write(tmp.path().join("a.xml"), r#"<Root><child>1</child></Root>"#)
            .await
            .unwrap();

        let collected = handler
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap()
            .unwrap();
        let top = collected.as_object().unwrap();
        assert!(top.contains_key("?xml"));
        let root = top.get("Root").and_then(|r| r.as_object()).unwrap();
        assert!(root.get("seg").and_then(|v| v.as_array()).is_some());
    }
}
525}