// config_disassembler/xml/handlers/reassemble.rs
use crate::xml::builders::{build_xml_string, merge_xml_elements, reorder_root_keys};
4use crate::xml::multi_level::{ensure_segment_files_structure, load_multi_level_config};
5use crate::xml::parsers::parse_to_xml_object;
6use crate::xml::types::{MultiLevelRule, XmlElement};
7use crate::xml::utils::normalize_path_unix;
8use serde_json::Value;
9use std::collections::HashSet;
10use std::ffi::OsString;
11use std::future::Future;
12use std::path::{Path, PathBuf};
13use std::pin::Pin;
14use tokio::fs;
15
16async fn read_key_order(path: &Path) -> Option<Vec<String>> {
18 let bytes = fs::read(path).await.ok()?;
19 serde_json::from_slice::<Vec<String>>(&bytes).ok()
20}
21
22fn strip_xmlns_from_value(v: Value) -> Value {
24 match v {
25 Value::Object(obj) => {
26 Value::Object(obj.into_iter().filter(|(k, _)| k != "@xmlns").collect())
27 }
28 other => other,
29 }
30}
31
/// Boxed future returned by `process_files_in_directory`.
///
/// The directory walk recurses into subdirectories; an `async fn` cannot call
/// itself without an indirection, so the recursive call goes through this
/// `Pin<Box<dyn Future>>` alias instead.
type ProcessDirFuture<'a> = Pin<
    Box<
        dyn Future<Output = Result<Vec<XmlElement>, Box<dyn std::error::Error + Send + Sync>>>
            + Send
            + 'a,
    >,
>;

/// Boxed future returned by `reassemble_multi_level_segment`; the same
/// recursion indirection as `ProcessDirFuture`, but with a unit result.
type SegmentFuture<'a> =
    Pin<Box<dyn Future<Output = Result<(), Box<dyn std::error::Error + Send + Sync>>> + Send + 'a>>;
42
/// Stateless handler that reassembles a previously disassembled XML directory
/// tree back into a single XML file. Entry point: [`ReassembleXmlFileHandler::reassemble`].
pub struct ReassembleXmlFileHandler;
44
impl ReassembleXmlFileHandler {
    /// Creates a new handler. The type carries no state; everything is passed
    /// per call to `reassemble`.
    pub fn new() -> Self {
        Self
    }

    /// Reassembles the directory at `file_path` into a single XML file next to it.
    ///
    /// When a multi-level configuration is found in the directory (via
    /// `load_multi_level_config`), every configured path segment that exists as
    /// a subdirectory is reassembled first, then the remaining plain files are
    /// merged by `reassemble_plain`.
    ///
    /// # Arguments
    /// * `file_path` - directory to reassemble; normalized to unix-style separators.
    /// * `file_extension` - extension for the output file (`None` defaults to "xml").
    /// * `post_purge` - remove the source directory after writing the output.
    ///
    /// # Errors
    /// Propagates I/O errors from traversal and the final file write.
    pub async fn reassemble(
        &self,
        file_path: &str,
        file_extension: Option<&str>,
        post_purge: bool,
    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        let file_path = normalize_path_unix(file_path);
        // Not a directory: validate_directory already logged, nothing to do.
        if !self.validate_directory(&file_path).await? {
            return Ok(());
        }

        let path = Path::new(&file_path);
        let config = load_multi_level_config(path).await;
        if let Some(ref config) = config {
            for (i, rule) in config.rules.iter().enumerate() {
                let segment_path = path.join(&rule.path_segment);
                if !segment_path.is_dir() {
                    continue;
                }
                // Every rule except the current one may still apply at deeper
                // nesting levels inside this segment.
                let nested: Vec<MultiLevelRule> = config
                    .rules
                    .iter()
                    .enumerate()
                    .filter(|(j, _)| *j != i)
                    .map(|(_, r)| r.clone())
                    .collect();
                self.reassemble_multi_level_segment(&segment_path, rule, &nested)
                    .await?;
            }
        }

        // (base dir, segment name, extract_inner) triples so the plain pass
        // can recognise configured segment directories under the base dir.
        let base_segments: Vec<(String, String, bool)> = config
            .as_ref()
            .map(|c| {
                c.rules
                    .iter()
                    .map(|r| (file_path.clone(), r.path_segment.clone(), true))
                    .collect()
            })
            .unwrap_or_default();
        // NOTE(review): a present multi-level config forces the purge even when
        // the caller passed post_purge = false — presumably the intermediate
        // directories are always disposable in that mode; confirm intent.
        let post_purge_final = post_purge || config.is_some();
        self.reassemble_plain(&file_path, file_extension, post_purge_final, &base_segments)
            .await
    }

    /// Boxed-future wrapper around `reassemble_multi_level_segment_inner`.
    ///
    /// The inner method recurses through this wrapper; cloning the arguments
    /// into the future makes it own everything it needs apart from `&self`,
    /// which is what allows the `SegmentFuture<'a>` indirection to compile.
    fn reassemble_multi_level_segment<'a>(
        &'a self,
        segment_path: &'a Path,
        rule: &'a MultiLevelRule,
        nested_rules: &'a [MultiLevelRule],
    ) -> SegmentFuture<'a> {
        let segment_path = segment_path.to_path_buf();
        let rule = rule.clone();
        let nested_rules = nested_rules.to_vec();
        Box::pin(async move {
            self.reassemble_multi_level_segment_inner(&segment_path, &rule, &nested_rules)
                .await
        })
    }

    /// Walks one configured segment directory.
    ///
    /// For each subdirectory ("process" directory) of `segment_path`:
    /// 1. child directories matching a nested rule recurse via
    ///    `reassemble_multi_level_segment` (and are remembered in `handled`);
    /// 2. remaining child directories are reassembled as plain XML;
    /// 3. the process directory itself is reassembled as plain XML.
    /// Finally `ensure_segment_files_structure` wraps the segment's files with
    /// the rule's root element / xmlns. Entries are sorted by file name so the
    /// traversal order is deterministic.
    async fn reassemble_multi_level_segment_inner(
        &self,
        segment_path: &Path,
        rule: &MultiLevelRule,
        nested_rules: &[MultiLevelRule],
    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        // A missing/non-directory segment is a silent no-op.
        if !segment_path.is_dir() {
            return Ok(());
        }
        let mut entries = Vec::new();
        let mut read_dir = fs::read_dir(segment_path).await?;
        while let Some(entry) = read_dir.next_entry().await? {
            entries.push(entry);
        }
        entries.sort_by_key(|e| e.file_name());
        for entry in entries {
            let process_path = entry.path();
            // Stray files directly under the segment root are skipped.
            if !process_path.is_dir() {
                continue;
            }
            let process_path_str = normalize_path_unix(&process_path.to_string_lossy());
            let mut sub_entries = Vec::new();
            let mut sub_read = fs::read_dir(&process_path).await?;
            while let Some(e) = sub_read.next_entry().await? {
                sub_entries.push(e);
            }
            sub_entries.sort_by_key(|e| e.file_name());

            // Pass 1: recurse into child directories that match a nested rule.
            let mut handled: HashSet<OsString> = HashSet::new();
            for sub_entry in &sub_entries {
                let sub_path: PathBuf = sub_entry.path();
                if !sub_path.is_dir() {
                    continue;
                }
                let sub_name = sub_path.file_name().and_then(|n| n.to_str()).unwrap_or("");
                let Some(nested_rule) = nested_rules.iter().find(|r| r.path_segment == sub_name)
                else {
                    continue;
                };
                // Rules other than the one just matched may apply deeper still.
                let deeper: Vec<MultiLevelRule> = nested_rules
                    .iter()
                    .filter(|r| r.path_segment != nested_rule.path_segment)
                    .cloned()
                    .collect();
                self.reassemble_multi_level_segment(&sub_path, nested_rule, &deeper)
                    .await?;
                handled.insert(sub_entry.file_name());
            }

            // Pass 2: any child directory not claimed by a nested rule is
            // reassembled as a plain XML directory (and purged afterwards).
            for sub_entry in &sub_entries {
                let sub_path = sub_entry.path();
                if !sub_path.is_dir() {
                    continue;
                }
                if handled.contains(&sub_entry.file_name()) {
                    continue;
                }
                let sub_path_str = normalize_path_unix(&sub_path.to_string_lossy());
                self.reassemble_plain(&sub_path_str, Some("xml"), true, &[])
                    .await?;
            }

            // Pass 3: reassemble the process directory itself.
            self.reassemble_plain(&process_path_str, Some("xml"), true, &[])
                .await?;
        }
        ensure_segment_files_structure(
            segment_path,
            &rule.wrap_root_element,
            &rule.path_segment,
            &rule.wrap_xmlns,
        )
        .await?;
        Ok(())
    }

    /// Parses every supported file under `file_path`, merges the results into
    /// one XML element, optionally reorders the root keys from
    /// `.key_order.json`, and writes the result next to the directory.
    ///
    /// Logs an error and returns `Ok(())` (not `Err`) when nothing could be
    /// parsed or no usable root was found, so a bad directory does not abort
    /// an enclosing multi-level run.
    async fn reassemble_plain(
        &self,
        file_path: &str,
        file_extension: Option<&str>,
        post_purge: bool,
        base_segments: &[(String, String, bool)],
    ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
        let file_path = normalize_path_unix(file_path);
        log::debug!("Parsing directory to reassemble: {}", file_path);
        let parsed_objects = self
            .process_files_in_directory(file_path.to_string(), base_segments.to_vec())
            .await?;

        if parsed_objects.is_empty() {
            log::error!(
                "No files under {} were parsed successfully. A reassembled XML file was not created.",
                file_path
            );
            return Ok(());
        }

        let Some(mut merged) = merge_xml_elements(&parsed_objects) else {
            log::error!(
                "No usable root element found while merging files under {}. A reassembled XML file was not created.",
                file_path
            );
            return Ok(());
        };

        // Restore the original root-key order if a .key_order.json snapshot
        // was written at disassembly time; otherwise keep the merged order.
        let key_order_path = Path::new(&file_path).join(".key_order.json");
        if let Some(reordered) = read_key_order(&key_order_path)
            .await
            .and_then(|order| reorder_root_keys(&merged, &order))
        {
            merged = reordered;
        }

        let final_xml = build_xml_string(&merged);
        let output_path = self.get_output_path(&file_path, file_extension);

        fs::write(&output_path, final_xml).await?;

        if post_purge {
            // Best-effort cleanup: a failed purge is deliberately ignored so a
            // successfully written output file is not reported as an error.
            fs::remove_dir_all(file_path).await.ok();
        }

        Ok(())
    }

    /// Recursively collects parsed XML objects from `dir_path`.
    ///
    /// Files: hidden files (leading '.') are skipped; parsable files (see
    /// `is_parsable_file`) are parsed, and unparseable ones silently dropped.
    /// Directories: when `dir_path` is one of the configured base directories
    /// and the child matches a segment name, the whole child is collapsed into
    /// a single array element via `collect_segment_as_array`; otherwise the
    /// walk recurses. Entries are processed in file-name order for
    /// deterministic output. Returns a boxed future to allow recursion.
    fn process_files_in_directory<'a>(
        &'a self,
        dir_path: String,
        base_segments: Vec<(String, String, bool)>,
    ) -> ProcessDirFuture<'a> {
        Box::pin(async move {
            let mut parsed = Vec::new();
            let mut entries = Vec::new();
            let mut read_dir = fs::read_dir(&dir_path).await?;
            while let Some(entry) = read_dir.next_entry().await? {
                entries.push(entry);
            }
            entries.sort_by(|a, b| {
                let a_name = a.file_name().to_string_lossy().to_string();
                let b_name = b.file_name().to_string_lossy().to_string();
                a_name.cmp(&b_name)
            });

            // Segment matching only applies directly under a base directory.
            let is_base = base_segments.iter().any(|(base, _, _)| dir_path == *base);

            for entry in entries {
                let path = entry.path();
                let file_path = normalize_path_unix(&path.to_string_lossy()).to_string();

                if path.is_file() {
                    let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
                    if !name.starts_with('.') && self.is_parsable_file(name) {
                        if let Some(parsed_obj) = parse_to_xml_object(&file_path).await {
                            parsed.push(parsed_obj);
                        }
                    }
                } else {
                    let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
                    let matched_segment = if is_base {
                        base_segments
                            .iter()
                            .find(|(_, seg_name, _)| seg_name == dir_name)
                            .cloned()
                    } else {
                        None
                    };
                    if let Some((_, segment_name, extract_inner)) = matched_segment {
                        let segment_element = self
                            .collect_segment_as_array(&file_path, &segment_name, extract_inner)
                            .await?;
                        if let Some(el) = segment_element {
                            parsed.push(el);
                        }
                    } else {
                        let sub_parsed = self
                            .process_files_in_directory(file_path, base_segments.clone())
                            .await?;
                        parsed.extend(sub_parsed);
                    }
                }
            }

            Ok(parsed)
        })
    }

    /// Collapses all parsable files directly inside `segment_dir` into one
    /// element: `{ "?xml": decl, <root_key>: { <segment_name>: [contents...] } }`.
    ///
    /// The root key and XML declaration come from the first successfully
    /// parsed file (a default 1.0/UTF-8 declaration is synthesized if that
    /// file had none). With `extract_inner`, each file contributes the value
    /// under `segment_name` inside its root (xmlns-stripped); otherwise the
    /// whole root value is used. Files that fail to parse or lack a root key
    /// are skipped; returns `Ok(None)` when nothing usable was found.
    async fn collect_segment_as_array(
        &self,
        segment_dir: &str,
        segment_name: &str,
        extract_inner: bool,
    ) -> Result<Option<XmlElement>, Box<dyn std::error::Error + Send + Sync>> {
        let mut xml_files = Vec::new();
        let mut read_dir = fs::read_dir(segment_dir).await?;
        while let Some(entry) = read_dir.next_entry().await? {
            let path = entry.path();
            let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
            if path.is_file() && !name.starts_with('.') && self.is_parsable_file(name) {
                xml_files.push(normalize_path_unix(&path.to_string_lossy()));
            }
        }
        // Deterministic element order regardless of directory listing order.
        xml_files.sort();

        let mut root_contents = Vec::new();
        let mut first_xml: Option<(String, Option<Value>)> = None;
        for file_path in &xml_files {
            let Some(parsed) = parse_to_xml_object(file_path).await else {
                continue;
            };
            let obj_owned = parsed.as_object().cloned().unwrap_or_default();
            let obj = &obj_owned;
            // The root element is the first key that is not the declaration.
            let Some(root_key) = obj.keys().find(|k| *k != "?xml").cloned() else {
                continue;
            };
            let root_val = obj
                .get(&root_key)
                .cloned()
                .unwrap_or(Value::Object(serde_json::Map::new()));
            let mut content = if extract_inner {
                root_val
                    .get(segment_name)
                    .cloned()
                    .unwrap_or_else(|| Value::Object(serde_json::Map::new()))
            } else {
                root_val
            };
            if extract_inner {
                // The wrapper supplies the namespace, so drop per-file xmlns.
                content = strip_xmlns_from_value(content);
            }
            root_contents.push(content);
            if first_xml.is_none() {
                first_xml = Some((root_key, obj.get("?xml").cloned()));
            }
        }
        if root_contents.is_empty() {
            return Ok(None);
        }
        let (root_key, decl_opt) = first_xml.unwrap();
        let mut content = serde_json::Map::new();
        content.insert(segment_name.to_string(), Value::Array(root_contents));
        let mut top = serde_json::Map::new();
        if let Some(decl) = decl_opt {
            top.insert("?xml".to_string(), decl);
        } else {
            // No declaration in the source file: synthesize a standard one.
            let mut d = serde_json::Map::new();
            d.insert("@version".to_string(), Value::String("1.0".to_string()));
            d.insert("@encoding".to_string(), Value::String("UTF-8".to_string()));
            top.insert("?xml".to_string(), Value::Object(d));
        }
        top.insert(root_key, Value::Object(content));
        Ok(Some(Value::Object(top)))
    }

    /// Returns true for file names with a supported extension
    /// (.xml/.json/.json5/.yaml/.yml, case-insensitive).
    fn is_parsable_file(&self, file_name: &str) -> bool {
        let lower = file_name.to_lowercase();
        lower.ends_with(".xml")
            || lower.ends_with(".json")
            || lower.ends_with(".json5")
            || lower.ends_with(".yaml")
            || lower.ends_with(".yml")
    }

    /// Returns `Ok(true)` when `path` is a directory, `Ok(false)` (after
    /// logging) when it exists but is not one. Errors if the path is missing
    /// or unreadable (metadata lookup fails).
    async fn validate_directory(
        &self,
        path: &str,
    ) -> Result<bool, Box<dyn std::error::Error + Send + Sync>> {
        let meta = fs::metadata(path).await?;
        if !meta.is_dir() {
            log::error!(
                "The provided path to reassemble is not a directory: {}",
                path
            );
            return Ok(false);
        }
        Ok(true)
    }

    /// Builds the output file path: `<parent>/<dir name>.<ext>`, with "xml" as
    /// the default extension and "output" as the fallback base name.
    fn get_output_path(&self, dir_path: &str, extension: Option<&str>) -> String {
        let path = Path::new(dir_path);
        let parent = path.parent().unwrap_or(Path::new("."));
        let base_name = path
            .file_name()
            .and_then(|n| n.to_str())
            .unwrap_or("output");
        let ext = extension.unwrap_or("xml");
        parent
            .join(format!("{}.{}", base_name, ext))
            .to_string_lossy()
            .to_string()
    }
}
463
// `Default` simply delegates to `new`; the handler is a unit struct.
impl Default for ReassembleXmlFileHandler {
    fn default() -> Self {
        Self::new()
    }
}
469
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // `Default` and `new` construct the same (unit) handler.
    #[test]
    #[allow(clippy::default_constructed_unit_structs)]
    fn reassemble_handler_default_equals_new() {
        let _ = ReassembleXmlFileHandler::default();
    }

    #[test]
    fn strip_xmlns_from_value_passes_non_object_through() {
        let s = Value::String("hello".to_string());
        assert_eq!(
            strip_xmlns_from_value(s),
            Value::String("hello".to_string())
        );
        // Arrays are not objects, so they must come back unchanged.
        let arr = json!([1, 2]);
        assert_eq!(strip_xmlns_from_value(arr.clone()), arr);
    }

    #[test]
    fn strip_xmlns_from_value_removes_xmlns_key() {
        let obj = json!({ "@xmlns": "ns", "child": 1 });
        let stripped = strip_xmlns_from_value(obj);
        let map = stripped.as_object().unwrap();
        assert!(map.get("@xmlns").is_none());
        // Sibling keys survive the strip.
        assert_eq!(map.get("child").and_then(|v| v.as_i64()), Some(1));
    }

    #[test]
    fn is_parsable_file_recognises_supported_extensions() {
        let h = ReassembleXmlFileHandler::new();
        assert!(h.is_parsable_file("a.xml"));
        assert!(h.is_parsable_file("a.json"));
        assert!(h.is_parsable_file("a.json5"));
        assert!(h.is_parsable_file("a.yaml"));
        assert!(h.is_parsable_file("a.yml"));
        // Matching is case-insensitive.
        assert!(h.is_parsable_file("A.XML"));
        assert!(!h.is_parsable_file("a.txt"));
    }

    #[test]
    fn get_output_path_appends_extension_and_uses_parent_dir() {
        let h = ReassembleXmlFileHandler::new();
        let out = h.get_output_path("/tmp/foo", Some("xml"));
        assert!(out.ends_with("foo.xml"));
        // None falls back to the "xml" extension.
        let out_default = h.get_output_path("/tmp/bar", None);
        assert!(out_default.ends_with("bar.xml"));
        // A bare name has an empty parent, so no separator is prepended.
        assert_eq!(h.get_output_path("only", Some("json")), "only.json");
    }

    // A non-directory segment path is a silent no-op (inner guard).
    #[tokio::test]
    async fn reassemble_multi_level_segment_noop_when_not_dir() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let file = tmp.path().join("not_a_dir.txt");
        tokio::fs::write(&file, "hi").await.unwrap();
        let rule = crate::xml::types::MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: String::new(),
            wrap_root_element: "Root".to_string(),
            wrap_xmlns: String::new(),
        };
        h.reassemble_multi_level_segment(&file, &rule, &[])
            .await
            .unwrap();
    }

    // Plain files directly under the segment root are ignored by the walk.
    #[tokio::test]
    async fn reassemble_multi_level_segment_skips_files_in_segment_root() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let segment = tmp.path().join("segment");
        tokio::fs::create_dir(&segment).await.unwrap();
        tokio::fs::write(segment.join("stray.txt"), "x")
            .await
            .unwrap();
        let rule = crate::xml::types::MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: "segment".to_string(),
            wrap_root_element: "Root".to_string(),
            wrap_xmlns: "http://example.com".to_string(),
        };
        h.reassemble_multi_level_segment(&segment, &rule, &[])
            .await
            .unwrap();
    }

    #[tokio::test]
    async fn collect_segment_as_array_returns_none_for_empty_dir() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", true)
            .await
            .unwrap();
        assert!(out.is_none());
    }

    // Unparseable content, files without a usable root, and hidden files all
    // get skipped, leaving nothing to collect.
    #[tokio::test]
    async fn collect_segment_as_array_skips_unparseable_and_empty_roots() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        tokio::fs::write(tmp.path().join("bad.xml"), "<<")
            .await
            .unwrap();
        tokio::fs::write(tmp.path().join("only-decl.xml"), "")
            .await
            .unwrap();
        tokio::fs::write(tmp.path().join(".hidden.xml"), "<r/>")
            .await
            .unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap();
        assert!(out.is_none());
    }

    // Without extract_inner the whole root value is wrapped under the segment
    // name inside the original root key.
    #[tokio::test]
    async fn collect_segment_as_array_without_extract_inner_wraps_root() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        tokio::fs::write(tmp.path().join("a.xml"), r#"<Root><child>1</child></Root>"#)
            .await
            .unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap()
            .unwrap();
        let obj = out.as_object().unwrap();
        // Declaration is synthesized when the source file had none.
        assert!(obj.contains_key("?xml"));
        let root = obj.get("Root").and_then(|r| r.as_object()).unwrap();
        assert!(root.get("seg").and_then(|v| v.as_array()).is_some());
    }
}