1use crate::xml::builders::{build_xml_string, merge_xml_elements, reorder_root_keys};
4use crate::xml::multi_level::{ensure_segment_files_structure, load_multi_level_config};
5use crate::xml::parsers::parse_to_xml_object;
6use crate::xml::types::{MultiLevelRule, XmlElement};
7use crate::xml::utils::normalize_path_unix;
8use serde_json::Value;
9use std::collections::HashSet;
10use std::ffi::OsString;
11use std::future::Future;
12use std::path::{Path, PathBuf};
13use std::pin::Pin;
14use tokio::fs;
15
16async fn read_key_order(path: &Path) -> Option<Vec<String>> {
18 let bytes = fs::read(path).await.ok()?;
19 serde_json::from_slice::<Vec<String>>(&bytes).ok()
20}
21
22fn strip_xmlns_from_value(v: Value) -> Value {
24 match v {
25 Value::Object(obj) => {
26 Value::Object(obj.into_iter().filter(|(k, _)| k != "@xmlns").collect())
27 }
28 other => other,
29 }
30}
31
32fn deeper_candidate_rules(
40 all_rules: &[MultiLevelRule],
41 exclude_path_segment: &str,
42) -> Vec<MultiLevelRule> {
43 all_rules
44 .iter()
45 .filter(|r| r.path_segment != exclude_path_segment)
46 .cloned()
47 .collect()
48}
49
/// Reports whether `dir_path` is exactly equal to the base path (first tuple
/// field) of any entry in `base_segments`.
///
/// The comparison is a plain string equality; an empty slice yields `false`.
fn is_at_base_path(dir_path: &str, base_segments: &[(String, String, bool)]) -> bool {
    for (base, _segment_name, _extract_inner) in base_segments {
        if base.as_str() == dir_path {
            return true;
        }
    }
    false
}
59
/// Boxed, pinned, `Send` future returned by `process_files_in_directory`.
///
/// It resolves to the list of parsed `XmlElement`s, or to a boxed error.
/// `process_files_in_directory` awaits itself for sub-directories, so its
/// future has to be boxed behind a trait object rather than returned as an
/// `async fn` future.
type ProcessDirFuture<'a> = Pin<
    Box<
        dyn Future<Output = Result<Vec<XmlElement>, Box<dyn std::error::Error + Send + Sync>>>
            + Send
            + 'a,
    >,
>;
67
/// Boxed, pinned, `Send` future returned by `reassemble_multi_level_segment`.
///
/// It resolves to `()` on success or to a boxed error. The segment
/// reassembly recurses through this boxed wrapper, which is why the future is
/// a trait object.
type SegmentFuture<'a> =
    Pin<Box<dyn Future<Output = Result<(), Box<dyn std::error::Error + Send + Sync>>> + Send + 'a>>;
70
/// Stateless handler that merges the parsable files found under a directory
/// into a single XML document written next to that directory.
///
/// The entry point is `reassemble`.
pub struct ReassembleXmlFileHandler;
72
73impl ReassembleXmlFileHandler {
74 pub fn new() -> Self {
75 Self
76 }
77
78 pub async fn reassemble(
79 &self,
80 file_path: &str,
81 file_extension: Option<&str>,
82 post_purge: bool,
83 ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
84 let file_path = normalize_path_unix(file_path);
85 if !self.validate_directory(&file_path).await? {
86 return Ok(());
87 }
88
89 let path = Path::new(&file_path);
90 let config = load_multi_level_config(path).await;
91 if let Some(ref config) = config {
92 for (i, rule) in config.rules.iter().enumerate() {
97 let segment_path = path.join(&rule.path_segment);
98 if !segment_path.is_dir() {
99 continue;
100 }
101 let nested: Vec<MultiLevelRule> = config
102 .rules
103 .iter()
104 .enumerate()
105 .filter(|(j, _)| *j != i)
106 .map(|(_, r)| r.clone())
107 .collect();
108 self.reassemble_multi_level_segment(&segment_path, rule, &nested)
109 .await?;
110 }
111 }
112
113 let base_segments: Vec<(String, String, bool)> = config
116 .as_ref()
117 .map(|c| {
118 c.rules
119 .iter()
120 .map(|r| (file_path.clone(), r.path_segment.clone(), true))
121 .collect()
122 })
123 .unwrap_or_default();
124 let post_purge_final = post_purge || config.is_some();
126 self.reassemble_plain(&file_path, file_extension, post_purge_final, &base_segments)
127 .await
128 }
129
130 fn reassemble_multi_level_segment<'a>(
155 &'a self,
156 segment_path: &'a Path,
157 rule: &'a MultiLevelRule,
158 nested_rules: &'a [MultiLevelRule],
159 ) -> SegmentFuture<'a> {
160 let segment_path = segment_path.to_path_buf();
161 let rule = rule.clone();
162 let nested_rules = nested_rules.to_vec();
163 Box::pin(async move {
164 self.reassemble_multi_level_segment_inner(&segment_path, &rule, &nested_rules)
165 .await
166 })
167 }
168
169 async fn reassemble_multi_level_segment_inner(
170 &self,
171 segment_path: &Path,
172 rule: &MultiLevelRule,
173 nested_rules: &[MultiLevelRule],
174 ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
175 if !segment_path.is_dir() {
176 return Ok(());
177 }
178 let mut entries = Vec::new();
179 let mut read_dir = fs::read_dir(segment_path).await?;
180 while let Some(entry) = read_dir.next_entry().await? {
181 entries.push(entry);
182 }
183 entries.sort_by_key(|e| e.file_name());
184 for entry in entries {
185 let process_path = entry.path();
186 if !process_path.is_dir() {
187 continue;
188 }
189 let process_path_str = normalize_path_unix(&process_path.to_string_lossy());
190 let mut sub_entries = Vec::new();
191 let mut sub_read = fs::read_dir(&process_path).await?;
192 while let Some(e) = sub_read.next_entry().await? {
193 sub_entries.push(e);
194 }
195 sub_entries.sort_by_key(|e| e.file_name());
196
197 let mut handled: HashSet<OsString> = HashSet::new();
200 for sub_entry in &sub_entries {
201 let sub_path: PathBuf = sub_entry.path();
202 if !sub_path.is_dir() {
203 continue;
204 }
205 let sub_name = sub_path.file_name().and_then(|n| n.to_str()).unwrap_or("");
206 let Some(nested_rule) = nested_rules.iter().find(|r| r.path_segment == sub_name)
207 else {
208 continue;
209 };
210 let deeper = deeper_candidate_rules(nested_rules, &nested_rule.path_segment);
214 self.reassemble_multi_level_segment(&sub_path, nested_rule, &deeper)
215 .await?;
216 handled.insert(sub_entry.file_name());
217 }
218
219 for sub_entry in &sub_entries {
222 let sub_path = sub_entry.path();
223 if !sub_path.is_dir() {
224 continue;
225 }
226 if handled.contains(&sub_entry.file_name()) {
227 continue;
228 }
229 let sub_path_str = normalize_path_unix(&sub_path.to_string_lossy());
230 self.reassemble_plain(&sub_path_str, Some("xml"), true, &[])
231 .await?;
232 }
233
234 self.reassemble_plain(&process_path_str, Some("xml"), true, &[])
236 .await?;
237 }
238 ensure_segment_files_structure(
239 segment_path,
240 &rule.wrap_root_element,
241 &rule.path_segment,
242 &rule.wrap_xmlns,
243 )
244 .await?;
245 Ok(())
246 }
247
248 async fn reassemble_plain(
256 &self,
257 file_path: &str,
258 file_extension: Option<&str>,
259 post_purge: bool,
260 base_segments: &[(String, String, bool)],
261 ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
262 let file_path = normalize_path_unix(file_path);
263 log::debug!("Parsing directory to reassemble: {}", file_path);
264 let parsed_objects = self
265 .process_files_in_directory(file_path.to_string(), base_segments.to_vec())
266 .await?;
267
268 if parsed_objects.is_empty() {
269 log::error!(
270 "No files under {} were parsed successfully. A reassembled XML file was not created.",
271 file_path
272 );
273 return Ok(());
274 }
275
276 let Some(mut merged) = merge_xml_elements(&parsed_objects) else {
280 log::error!(
281 "No usable root element found while merging files under {}. A reassembled XML file was not created.",
282 file_path
283 );
284 return Ok(());
285 };
286
287 let key_order_path = Path::new(&file_path).join(".key_order.json");
289 if let Some(reordered) = read_key_order(&key_order_path)
290 .await
291 .and_then(|order| reorder_root_keys(&merged, &order))
292 {
293 merged = reordered;
294 }
295
296 let final_xml = build_xml_string(&merged);
297 let output_path = self.get_output_path(&file_path, file_extension);
298
299 fs::write(&output_path, final_xml).await?;
300
301 if post_purge {
302 fs::remove_dir_all(file_path).await.ok();
303 }
304
305 Ok(())
306 }
307
308 fn process_files_in_directory<'a>(
309 &'a self,
310 dir_path: String,
311 base_segments: Vec<(String, String, bool)>,
312 ) -> ProcessDirFuture<'a> {
313 Box::pin(async move {
314 let mut parsed = Vec::new();
315 let mut entries = Vec::new();
316 let mut read_dir = fs::read_dir(&dir_path).await?;
317 while let Some(entry) = read_dir.next_entry().await? {
318 entries.push(entry);
319 }
320 entries.sort_by(|a, b| {
322 let a_name = a.file_name().to_string_lossy().to_string();
323 let b_name = b.file_name().to_string_lossy().to_string();
324 a_name.cmp(&b_name)
325 });
326
327 let is_base = is_at_base_path(&dir_path, &base_segments);
332
333 for entry in entries {
334 let path = entry.path();
335 let file_path = normalize_path_unix(&path.to_string_lossy()).to_string();
336
337 if path.is_file() {
338 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
339 if !name.starts_with('.') && self.is_parsable_file(name) {
340 if let Some(parsed_obj) = parse_to_xml_object(&file_path).await {
341 parsed.push(parsed_obj);
342 }
343 }
344 } else {
345 let dir_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
348 let matched_segment = if is_base {
349 base_segments
350 .iter()
351 .find(|(_, seg_name, _)| seg_name == dir_name)
352 .cloned()
353 } else {
354 None
355 };
356 if let Some((_, segment_name, extract_inner)) = matched_segment {
357 let segment_element = self
358 .collect_segment_as_array(&file_path, &segment_name, extract_inner)
359 .await?;
360 if let Some(el) = segment_element {
361 parsed.push(el);
362 }
363 } else {
364 let sub_parsed = self
365 .process_files_in_directory(file_path, base_segments.clone())
366 .await?;
367 parsed.extend(sub_parsed);
368 }
369 }
370 }
371
372 Ok(parsed)
373 })
374 }
375
376 async fn collect_segment_as_array(
380 &self,
381 segment_dir: &str,
382 segment_name: &str,
383 extract_inner: bool,
384 ) -> Result<Option<XmlElement>, Box<dyn std::error::Error + Send + Sync>> {
385 let mut xml_files = Vec::new();
386 let mut read_dir = fs::read_dir(segment_dir).await?;
387 while let Some(entry) = read_dir.next_entry().await? {
388 let path = entry.path();
389 let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
390 if path.is_file() && !name.starts_with('.') && self.is_parsable_file(name) {
391 xml_files.push(normalize_path_unix(&path.to_string_lossy()));
392 }
393 }
394 xml_files.sort();
395
396 let mut root_contents = Vec::new();
397 let mut first_xml: Option<(String, Option<Value>)> = None;
398 for file_path in &xml_files {
399 let Some(parsed) = parse_to_xml_object(file_path).await else {
402 continue;
403 };
404 let obj_owned = parsed.as_object().cloned().unwrap_or_default();
405 let obj = &obj_owned;
406 let Some(root_key) = obj.keys().find(|k| *k != "?xml").cloned() else {
407 continue;
408 };
409 let root_val = obj
410 .get(&root_key)
411 .cloned()
412 .unwrap_or(Value::Object(serde_json::Map::new()));
413 let mut content = if extract_inner {
414 root_val
415 .get(segment_name)
416 .cloned()
417 .unwrap_or_else(|| Value::Object(serde_json::Map::new()))
418 } else {
419 root_val
420 };
421 if extract_inner {
423 content = strip_xmlns_from_value(content);
424 }
425 root_contents.push(content);
426 if first_xml.is_none() {
427 first_xml = Some((root_key, obj.get("?xml").cloned()));
428 }
429 }
430 if root_contents.is_empty() {
431 return Ok(None);
432 }
433 let (root_key, decl_opt) = first_xml.unwrap();
434 let mut content = serde_json::Map::new();
435 content.insert(segment_name.to_string(), Value::Array(root_contents));
436 let mut top = serde_json::Map::new();
437 if let Some(decl) = decl_opt {
438 top.insert("?xml".to_string(), decl);
439 } else {
440 let mut d = serde_json::Map::new();
441 d.insert("@version".to_string(), Value::String("1.0".to_string()));
442 d.insert("@encoding".to_string(), Value::String("UTF-8".to_string()));
443 top.insert("?xml".to_string(), Value::Object(d));
444 }
445 top.insert(root_key, Value::Object(content));
446 Ok(Some(Value::Object(top)))
447 }
448
449 fn is_parsable_file(&self, file_name: &str) -> bool {
450 let lower = file_name.to_lowercase();
451 lower.ends_with(".xml")
452 || lower.ends_with(".json")
453 || lower.ends_with(".json5")
454 || lower.ends_with(".yaml")
455 || lower.ends_with(".yml")
456 }
457
458 async fn validate_directory(
459 &self,
460 path: &str,
461 ) -> Result<bool, Box<dyn std::error::Error + Send + Sync>> {
462 let meta = fs::metadata(path).await?;
463 if !meta.is_dir() {
464 log::error!(
465 "The provided path to reassemble is not a directory: {}",
466 path
467 );
468 return Ok(false);
469 }
470 Ok(true)
471 }
472
473 fn get_output_path(&self, dir_path: &str, extension: Option<&str>) -> String {
474 let path = Path::new(dir_path);
475 let parent = path.parent().unwrap_or(Path::new("."));
476 let base_name = path
477 .file_name()
478 .and_then(|n| n.to_str())
479 .unwrap_or("output");
480 let ext = extension.unwrap_or("xml");
481 parent
482 .join(format!("{}.{}", base_name, ext))
483 .to_string_lossy()
484 .to_string()
485 }
486}
487
488impl Default for ReassembleXmlFileHandler {
489 fn default() -> Self {
490 Self::new()
491 }
492}
493
#[cfg(test)]
mod tests {
    //! Unit tests for the free helpers and the handler's private methods.
    //! Filesystem tests run against throw-away `tempfile` directories.

    use super::*;
    use serde_json::json;

    // Smoke test: `Default` must be implemented and constructible. The allow
    // is needed because clippy flags `default()` on a unit struct, and calling
    // it is the whole point of this test.
    #[test]
    #[allow(clippy::default_constructed_unit_structs)]
    fn reassemble_handler_default_equals_new() {
        let _ = ReassembleXmlFileHandler::default();
    }

    // Strings and arrays are returned untouched.
    #[test]
    fn strip_xmlns_from_value_passes_non_object_through() {
        let s = Value::String("hello".to_string());
        assert_eq!(
            strip_xmlns_from_value(s),
            Value::String("hello".to_string())
        );
        let arr = json!([1, 2]);
        assert_eq!(strip_xmlns_from_value(arr.clone()), arr);
    }

    // `@xmlns` is dropped from an object while its other keys survive.
    #[test]
    fn strip_xmlns_from_value_removes_xmlns_key() {
        let obj = json!({ "@xmlns": "ns", "child": 1 });
        let stripped = strip_xmlns_from_value(obj);
        let map = stripped.as_object().unwrap();
        assert!(map.get("@xmlns").is_none());
        assert_eq!(map.get("child").and_then(|v| v.as_i64()), Some(1));
    }

    // Every supported suffix is accepted, matching is case-insensitive, and an
    // unsupported suffix is rejected.
    #[test]
    fn is_parsable_file_recognises_supported_extensions() {
        let h = ReassembleXmlFileHandler::new();
        assert!(h.is_parsable_file("a.xml"));
        assert!(h.is_parsable_file("a.json"));
        assert!(h.is_parsable_file("a.json5"));
        assert!(h.is_parsable_file("a.yaml"));
        assert!(h.is_parsable_file("a.yml"));
        assert!(h.is_parsable_file("A.XML"));
        assert!(!h.is_parsable_file("a.txt"));
    }

    // The output lives next to the directory, the extension defaults to
    // `xml`, and a bare relative name gets no directory prefix.
    #[test]
    fn get_output_path_appends_extension_and_uses_parent_dir() {
        let h = ReassembleXmlFileHandler::new();
        let out = h.get_output_path("/tmp/foo", Some("xml"));
        assert!(out.ends_with("foo.xml"));
        let out_default = h.get_output_path("/tmp/bar", None);
        assert!(out_default.ends_with("bar.xml"));
        assert_eq!(h.get_output_path("only", Some("json")), "only.json");
    }

    // Pointing the segment reassembly at a regular file must succeed without
    // doing any work.
    #[tokio::test]
    async fn reassemble_multi_level_segment_noop_when_not_dir() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let file = tmp.path().join("not_a_dir.txt");
        tokio::fs::write(&file, "hi").await.unwrap();
        let rule = crate::xml::types::MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: String::new(),
            wrap_root_element: "Root".to_string(),
            wrap_xmlns: String::new(),
        };
        h.reassemble_multi_level_segment(&file, &rule, &[])
            .await
            .unwrap();
    }

    // A plain file sitting directly in the segment directory is not an item
    // directory, so the loop skips it and the call still succeeds.
    #[tokio::test]
    async fn reassemble_multi_level_segment_skips_files_in_segment_root() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let segment = tmp.path().join("segment");
        tokio::fs::create_dir(&segment).await.unwrap();
        tokio::fs::write(segment.join("stray.txt"), "x")
            .await
            .unwrap();
        let rule = crate::xml::types::MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: "segment".to_string(),
            wrap_root_element: "Root".to_string(),
            wrap_xmlns: "http://example.com".to_string(),
        };
        h.reassemble_multi_level_segment(&segment, &rule, &[])
            .await
            .unwrap();
    }

    // An empty directory contributes no items, hence `None`.
    #[tokio::test]
    async fn collect_segment_as_array_returns_none_for_empty_dir() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", true)
            .await
            .unwrap();
        assert!(out.is_none());
    }

    // None of the three files may contribute an item: one is malformed, one
    // is empty, and one is hidden (dot-prefixed names are filtered out before
    // parsing).
    #[tokio::test]
    async fn collect_segment_as_array_skips_unparseable_and_empty_roots() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        tokio::fs::write(tmp.path().join("bad.xml"), "<<")
            .await
            .unwrap();
        tokio::fs::write(tmp.path().join("only-decl.xml"), "")
            .await
            .unwrap();
        tokio::fs::write(tmp.path().join(".hidden.xml"), "<r/>")
            .await
            .unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap();
        assert!(out.is_none());
    }

    // Without `extract_inner` the file's root key is reused as the wrapper
    // and the items are stored in an array under the segment name.
    #[tokio::test]
    async fn collect_segment_as_array_without_extract_inner_wraps_root() {
        let h = ReassembleXmlFileHandler::new();
        let tmp = tempfile::tempdir().unwrap();
        tokio::fs::write(tmp.path().join("a.xml"), r#"<Root><child>1</child></Root>"#)
            .await
            .unwrap();
        let out = h
            .collect_segment_as_array(tmp.path().to_str().unwrap(), "seg", false)
            .await
            .unwrap()
            .unwrap();
        let obj = out.as_object().unwrap();
        assert!(obj.contains_key("?xml"));
        let root = obj.get("Root").and_then(|r| r.as_object()).unwrap();
        assert!(root.get("seg").and_then(|v| v.as_array()).is_some());
    }

    // Builds a rule whose only meaningful field is `path_segment`.
    fn rule_with_segment(segment: &str) -> MultiLevelRule {
        MultiLevelRule {
            file_pattern: String::new(),
            root_to_strip: String::new(),
            unique_id_elements: String::new(),
            path_segment: segment.to_string(),
            wrap_root_element: String::new(),
            wrap_xmlns: String::new(),
        }
    }

    // The rule matching the excluded segment is removed; the other remains.
    #[test]
    fn deeper_candidate_rules_excludes_the_matched_segment() {
        let rules = vec![rule_with_segment("seg_a"), rule_with_segment("seg_b")];
        let deeper = deeper_candidate_rules(&rules, "seg_a");
        assert_eq!(deeper.len(), 1);
        assert_eq!(deeper[0].path_segment, "seg_b");
    }

    // Excluding a segment no rule uses leaves the list intact.
    #[test]
    fn deeper_candidate_rules_keeps_all_when_no_segment_matches() {
        let rules = vec![rule_with_segment("seg_a"), rule_with_segment("seg_b")];
        let deeper = deeper_candidate_rules(&rules, "missing");
        assert_eq!(deeper.len(), 2);
    }

    // An empty rule list stays empty.
    #[test]
    fn deeper_candidate_rules_returns_empty_for_empty_input() {
        let deeper: Vec<MultiLevelRule> = deeper_candidate_rules(&[], "anything");
        assert!(deeper.is_empty());
    }

    // A match on any entry's base path is enough, not only the first one.
    #[test]
    fn is_at_base_path_true_when_dir_matches_any_segment() {
        let segs = vec![
            ("/base/other".to_string(), "seg1".to_string(), false),
            ("/base/here".to_string(), "seg2".to_string(), false),
        ];
        assert!(is_at_base_path("/base/here", &segs));
    }

    // A path that differs from every base path is not a base path.
    #[test]
    fn is_at_base_path_false_when_dir_matches_nothing() {
        let segs = vec![("/base/a".to_string(), "seg".to_string(), false)];
        assert!(!is_at_base_path("/base/b", &segs));
    }

    // With no segments configured nothing can be a base path.
    #[test]
    fn is_at_base_path_false_for_empty_segments() {
        let segs: Vec<(String, String, bool)> = Vec::new();
        assert!(!is_at_base_path("/anywhere", &segs));
    }
}