Skip to main content

mig_assembly/
assembler.rs

1//! Recursive descent assembler — MIG-guided segment consumption.
2//!
3//! The assembler walks the MIG tree structure and consumes matching
4//! segments from the input. It produces a generic tree representation
5//! that can be converted to typed PID structs.
6
7use crate::cursor::SegmentCursor;
8use crate::diagnostic::{StructureDiagnostic, StructureDiagnosticKind};
9use crate::matcher;
10use crate::tokenize::OwnedSegment;
11use crate::AssemblyError;
12use mig_types::schema::mig::{MigSchema, MigSegment, MigSegmentGroup};
13use serde::{Deserialize, Serialize};
14
/// A generic assembled tree node (before PID-specific typing).
///
/// Built by `Assembler::assemble_generic`: root-level segments go into
/// `segments`, consumed top-level groups into `groups`, and anything picked
/// up between groups into `inter_group_segments`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AssembledTree {
    /// Root-level segments. Those matched before any group come first,
    /// followed by those matched after all groups; `post_group_start` marks
    /// the boundary between the two.
    pub segments: Vec<AssembledSegment>,
    /// Top-level segment groups, in the order the assembler consumed them.
    pub groups: Vec<AssembledGroup>,
    /// Index in `segments` where post-group segments start (e.g., UNT, UNZ).
    /// Segments before this index appear before groups in EDIFACT order.
    /// Defaults to `0` when absent from deserialized input.
    #[serde(default)]
    pub post_group_start: usize,
    /// Root segments consumed between groups during assembly (e.g., UNS
    /// section separator in MSCONS). Key = index into `groups` vec; value =
    /// segments that appear immediately before that group in the EDIFACT
    /// stream. Empty for messages without inter-group root segments.
    ///
    /// In skip-unknown mode the assembler also stashes skipped top-level
    /// segments here. The map is omitted from serialized output when empty.
    #[serde(default, skip_serializing_if = "std::collections::BTreeMap::is_empty")]
    pub inter_group_segments: std::collections::BTreeMap<usize, Vec<AssembledSegment>>,
}
31
/// An assembled segment with its data elements.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AssembledSegment {
    /// Segment tag (e.g., `DTM`).
    pub tag: String,
    /// `elements[i][j]` = component `j` of element `i`
    pub elements: Vec<Vec<String>>,
    /// MIG `Number` attribute identifying this segment variant.
    /// Two segments with the same tag (e.g., DTM) but different roles
    /// (DTM+92 vs DTM+93) have distinct MIG numbers.
    ///
    /// Copied from the MIG slot that consumed the segment; omitted from
    /// serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub mig_number: Option<String>,
    /// Source counter position of this segment within the input message,
    /// preserved so AHB-validator-emitted issues can populate UCS `0096`
    /// (segmentPosition) in CONTRL responses. `None` when the segment was
    /// constructed by the reverse mapper (BO4E -> EDIFACT) since it has no
    /// source position. Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub segment_number: Option<u32>,
}
51
/// An assembled segment group (may repeat).
///
/// Holds every repetition of one MIG segment group that the assembler
/// consumed at a given position in the tree.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AssembledGroup {
    /// Identifier of the segment group (presumably the MIG group id such as
    /// `SG4` — confirm against the schema).
    pub group_id: String,
    /// The group's repetitions.
    // NOTE(review): presumably kept in input order — confirm against the
    // group-consumption code, which is only partly visible here.
    pub repetitions: Vec<AssembledGroupInstance>,
}
58
/// One repetition of a segment group.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AssembledGroupInstance {
    /// Segments consumed directly by this repetition (not those belonging to
    /// nested groups).
    pub segments: Vec<AssembledSegment>,
    /// Groups nested inside this repetition.
    pub child_groups: Vec<AssembledGroup>,
    /// MIG `Number` of the entry segment that identified this group instance's variant.
    /// Omitted from serialized output when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub entry_mig_number: Option<String>,
    /// All MIG `Number`s defined for this group variant — includes segments that
    /// may be absent in the EDIFACT but are defined in the MIG for this variant.
    ///
    /// Used by the validator to determine which AHB rules belong to this instance:
    /// a rule with `mig_number` in this set applies here, even if the segment is
    /// missing (which is then a missing-field error). Without this, rules for
    /// absent-but-required segments would be incorrectly filtered out.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub variant_mig_numbers: Vec<String>,
    /// Segments that were present in the EDIFACT input but not defined in
    /// the PID-filtered MIG for this group. Only populated when the assembler
    /// runs with [`AssemblerConfig::skip_unknown_segments`] enabled.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub skipped_segments: Vec<AssembledSegment>,
    /// Input positions of each skipped segment, parallel to `skipped_segments`
    /// (entry `i` here belongs to `skipped_segments[i]`).
    /// Used by `assemble_with_diagnostics` to locate AHB-foreign content in
    /// the original EDIFACT stream for STR008 diagnostics.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub skipped_positions: Vec<usize>,
}
87
88impl AssembledGroupInstance {
89    /// Create a virtual `AssembledTree` scoped to this group instance.
90    ///
91    /// The instance's own segments become the tree's root segments,
92    /// and its child groups become the tree's groups. This enables
93    /// running `MappingEngine::map_all_forward()` on a single
94    /// transaction group as if it were a complete message.
95    pub fn as_assembled_tree(&self) -> AssembledTree {
96        AssembledTree {
97            segments: self.segments.clone(),
98            groups: self.child_groups.clone(),
99            post_group_start: self.segments.len(),
100            inter_group_segments: std::collections::BTreeMap::new(),
101        }
102    }
103}
104
/// Configuration for the assembler.
///
/// `AssemblerConfig::default()` turns every option off: strict handling of
/// unknown segments, an empty qualifier map, and no strict code matching.
#[derive(Debug, Clone, Default)]
pub struct AssemblerConfig {
    /// When `true`, the assembler skips segments inside a group instance that
    /// don't match any remaining MIG slot, nested-group entry, or the group's
    /// entry tag (next repetition). Skipped segments are preserved on
    /// [`AssembledGroupInstance::skipped_segments`] for roundtrip re-emission.
    ///
    /// Default: `false` (strict AHB — unknown segments stall the cursor).
    pub skip_unknown_segments: bool,

    /// Qualifier-aware assembly: maps MIG `Number` to `(element_index, component_index, expected_value)`.
    ///
    /// When a bounded slot has a `number` with an entry in this map,
    /// `try_consume_segment` checks the input segment's value at the
    /// specified position. If it doesn't match, the slot is skipped (segment
    /// is for a different qualifier variant). A missing element or component
    /// is compared as the empty string.
    ///
    /// Build from the PID schema JSON, or construct manually:
    /// `{ "00023" => (0, 0, "92".to_string()), "00024" => (0, 0, "93".to_string()) }`.
    ///
    /// Default: empty (positional assembly, no qualifier checking).
    pub qualifier_map: std::collections::HashMap<String, (usize, usize, String)>,

    /// Reject a slot when the input segment has a value outside the slot's
    /// allowed codes at any code-bearing position (not just the primary
    /// qualifier). Disambiguates PID-filtered slots that share a primary
    /// qualifier but differ on a secondary code (e.g. PID 55035 SG8/PIA
    /// variants all use 4347='5' but differ at C212/7143).
    ///
    /// Default: `false`. Only safe to enable on a PID-filtered MIG whose
    /// per-slot codes reflect the AHB-narrowed allowed sets — on the raw
    /// MIG, each slot's codes cover only one variant and strict matching
    /// would leave most segments unconsumed.
    pub strict_code_matching: bool,
}
141
/// MIG-guided assembler.
///
/// Takes a MIG schema and uses it as a grammar to guide consumption
/// of parsed EDIFACT segments. Produces a generic `AssembledTree`.
pub struct Assembler<'a> {
    /// Borrowed MIG schema that drives segment and group matching.
    mig: &'a MigSchema,
    /// Assembly options; `Assembler::new` uses `AssemblerConfig::default()`.
    config: AssemblerConfig,
}
150
151impl<'a> Assembler<'a> {
152    pub fn new(mig: &'a MigSchema) -> Self {
153        Self {
154            mig,
155            config: AssemblerConfig::default(),
156        }
157    }
158
159    pub fn with_config(mig: &'a MigSchema, config: AssemblerConfig) -> Self {
160        Self { mig, config }
161    }
162
163    /// Tags reachable at the top level from a group's point of view — used
164    /// as the initial enclosing scope for the skip-unknown retry logic.
165    ///
166    /// Excludes top-level segment slots that have already been matched
167    /// (they won't be consumed again) and top-level group entries that
168    /// come before the current group (those groups have already run).
169    /// What's left is: unmatched top-level segments (e.g., UNT/UNZ still
170    /// to come) and top-level group entries of siblings the assembler
171    /// hasn't reached yet. A group's retry loop should break on those
172    /// tags (structural for an upcoming step) but may skip anything else
173    /// that slipped past the earlier passes.
174    ///
175    /// Empty when skip mode is off (the skip path is never taken).
176    fn top_level_enclosing_for_group(
177        &self,
178        current_group_idx: usize,
179        matched_seg_indices: &[usize],
180    ) -> std::collections::HashSet<String> {
181        if !self.config.skip_unknown_segments {
182            return std::collections::HashSet::new();
183        }
184        let mut tags: std::collections::HashSet<String> = self
185            .mig
186            .segments
187            .iter()
188            .enumerate()
189            .filter_map(|(i, s)| {
190                if matched_seg_indices.contains(&i) {
191                    None
192                } else {
193                    Some(s.id.clone())
194                }
195            })
196            .collect();
197        for (idx, group) in self.mig.segment_groups.iter().enumerate() {
198            if idx < current_group_idx {
199                continue;
200            }
201            if let Some(entry) = group.segments.first() {
202                tags.insert(entry.id.clone());
203            }
204        }
205        tags
206    }
207
208    /// Assemble segments into a generic tree following MIG structure.
209    pub fn assemble_generic(
210        &self,
211        segments: &[OwnedSegment],
212    ) -> Result<AssembledTree, AssemblyError> {
213        let mut cursor = SegmentCursor::new(segments.len());
214        let mut tree = AssembledTree {
215            segments: Vec::new(),
216            groups: Vec::new(),
217            post_group_start: 0,
218            inter_group_segments: std::collections::BTreeMap::new(),
219        };
220
221        // Track which MIG segment indices were matched in the first pass
222        let mut matched_seg_indices = Vec::new();
223
224        // Process top-level segments (first pass — before groups)
225        for (i, mig_seg) in self.mig.segments.iter().enumerate() {
226            if cursor.is_exhausted() {
227                break;
228            }
229            if let Some(assembled) = self.try_consume_segment(segments, &mut cursor, mig_seg)? {
230                tree.segments.push(assembled);
231                matched_seg_indices.push(i);
232            }
233        }
234
235        // Process segment groups, interleaving root segment consumption.
236        // Some message types (e.g., MSCONS) have root segments like UNS
237        // between groups (SG2 and SG5). Before trying each group, consume
238        // any unmatched root segments at the current cursor position.
239        //
240        // When consecutive same-ID groups have variant_code set (e.g., 3 SG8
241        // entries for ZD7, Z98, ZF3), the assembler tries ALL variants at each
242        // cursor position to handle interleaved reps.
243        let mut group_idx = 0;
244        while group_idx < self.mig.segment_groups.len() {
245            if cursor.is_exhausted() {
246                break;
247            }
248
249            let mig_group = &self.mig.segment_groups[group_idx];
250
251            // Top-level skip-unknown: in skip mode, advance the cursor past
252            // any input segment whose tag is neither an unmatched root MIG
253            // segment nor any group entry. Without this, an AHB-foreign
254            // top-level segment (e.g. IMD in QUOTES PID 15005) stalls the
255            // assembler before the first group entry is reached.
256            //
257            // Skipped segments are stashed in `inter_group_segments[N]` so
258            // disassembly re-emits them at the same logical position.
259            if self.config.skip_unknown_segments {
260                let tree_group_idx = tree.groups.len();
261                while !cursor.is_exhausted() {
262                    let seg = &segments[cursor.position()];
263                    let tag = &seg.id;
264                    let is_unmatched_root_seg = self
265                        .mig
266                        .segments
267                        .iter()
268                        .enumerate()
269                        .any(|(i, ms)| !matched_seg_indices.contains(&i) && ms.id == *tag);
270                    let is_any_group_entry = self
271                        .mig
272                        .segment_groups
273                        .iter()
274                        .any(|g| g.segments.first().is_some_and(|s| s.id == *tag));
275                    if is_unmatched_root_seg || is_any_group_entry {
276                        break;
277                    }
278                    tree.inter_group_segments
279                        .entry(tree_group_idx)
280                        .or_default()
281                        .push(owned_to_assembled(seg));
282                    cursor.advance();
283                }
284                if cursor.is_exhausted() {
285                    break;
286                }
287            }
288
289            // Try consuming unmatched root segments before this group
290            let tree_group_idx = tree.groups.len();
291            for (i, mig_seg) in self.mig.segments.iter().enumerate() {
292                if cursor.is_exhausted() {
293                    break;
294                }
295                if matched_seg_indices.contains(&i) {
296                    continue;
297                }
298                if let Some(assembled) = self.try_consume_segment(segments, &mut cursor, mig_seg)? {
299                    tree.inter_group_segments
300                        .entry(tree_group_idx)
301                        .or_default()
302                        .push(assembled);
303                    matched_seg_indices.push(i);
304                }
305            }
306
307            // Scope visible to this group's skip-retry logic: top-level
308            // segments still to be consumed, plus entries of sibling
309            // top-level groups that haven't run yet. Built per-group so
310            // already-matched DTM/BGM slots don't incorrectly protect an
311            // AHB-foreign DTM+92 inside SG4 from being skipped.
312            let top_enclosing =
313                self.top_level_enclosing_for_group(group_idx, &matched_seg_indices);
314
315            // Check if this starts a variant set (consecutive same-ID groups with variant_code)
316            if mig_group.variant_code.is_some() {
317                let variant_count = self.mig.segment_groups[group_idx..]
318                    .iter()
319                    .take_while(|g| g.id == mig_group.id && g.variant_code.is_some())
320                    .count();
321                let variant_end = group_idx + variant_count;
322
323                let variant_groups = &self.mig.segment_groups[group_idx..variant_end];
324                if let Some(combined) = self.try_consume_variant_groups(
325                    segments,
326                    &mut cursor,
327                    variant_groups,
328                    &top_enclosing,
329                )? {
330                    tree.groups.push(combined);
331                }
332                group_idx = variant_end;
333            } else {
334                if let Some(assembled) = self.try_consume_group(
335                    segments,
336                    &mut cursor,
337                    mig_group,
338                    &top_enclosing,
339                )? {
340                    tree.groups.push(assembled);
341                }
342                group_idx += 1;
343            }
344        }
345
346        // Mark where post-group segments start
347        tree.post_group_start = tree.segments.len();
348
349        // Second pass: try unmatched top-level segments (e.g., UNT, UNZ after groups)
350        for (i, mig_seg) in self.mig.segments.iter().enumerate() {
351            if cursor.is_exhausted() {
352                break;
353            }
354            if matched_seg_indices.contains(&i) {
355                continue;
356            }
357            if let Some(assembled) = self.try_consume_segment(segments, &mut cursor, mig_seg)? {
358                tree.segments.push(assembled);
359            }
360        }
361
362        Ok(tree)
363    }
364
365    fn try_consume_segment(
366        &self,
367        segments: &[OwnedSegment],
368        cursor: &mut SegmentCursor,
369        mig_seg: &MigSegment,
370    ) -> Result<Option<AssembledSegment>, AssemblyError> {
371        if cursor.is_exhausted() {
372            return Ok(None);
373        }
374        let seg = &segments[cursor.position()];
375        if matcher::matches_segment_tag(&seg.id, &mig_seg.id) {
376            // Qualifier check: if the MIG slot has a qualifier_map entry,
377            // verify the input segment's qualifier matches before consuming.
378            if let Some(ref num) = mig_seg.number {
379                if let Some((el_idx, comp_idx, expected)) = self.config.qualifier_map.get(num) {
380                    let actual = seg
381                        .elements
382                        .get(*el_idx)
383                        .and_then(|e| e.get(*comp_idx))
384                        .map(|s| s.as_str())
385                        .unwrap_or("");
386                    if actual != expected {
387                        return Ok(None); // Wrong qualifier — skip this slot
388                    }
389                }
390            }
391            // Note: full-code-profile matching (for disambiguating merged
392            // sibling slots sharing a primary qualifier, e.g. PID 55035 PIA
393            // variants all use 4347='5' but differ at C212/7143) is handled
394            // by the caller in `try_consume_group`'s entry-run when strict
395            // mode is on and `run_len > 1`. Doing it here would reject
396            // solo-slot segments whose codes fall outside the AHB-narrowed
397            // allowed set — those belong to the validator as COD002.
398            let mut assembled = owned_to_assembled(seg);
399            assembled.mig_number = mig_seg.number.clone();
400            cursor.advance();
401            Ok(Some(assembled))
402        } else {
403            Ok(None) // Segment not present (optional)
404        }
405    }
406
407    /// Consume the entry run of a group with best-match slot selection.
408    ///
409    /// Used when `strict_code_matching` is on and `run_len > 1` — the group
410    /// has multiple sibling entry slots with the same tag (merged PID-specific
411    /// variants). For each pending segment, picks the unused slot whose full
412    /// code profile matches best. Ties broken by MIG order. When no slot's
413    /// profile matches, falls back to the first unused tag+qualifier-matching
414    /// slot so the segment is still consumed (the validator emits COD002 if
415    /// the code is truly invalid).
416    fn consume_entry_run_best_match(
417        &self,
418        segments: &[OwnedSegment],
419        cursor: &mut SegmentCursor,
420        entry_slots: &[MigSegment],
421        instance: &mut AssembledGroupInstance,
422    ) -> Result<(), AssemblyError> {
423        let mut used = vec![false; entry_slots.len()];
424        for _ in 0..entry_slots.len() {
425            if cursor.is_exhausted() {
426                break;
427            }
428            let seg = &segments[cursor.position()];
429            let mut strict_match: Option<usize> = None;
430            let mut tag_match: Option<usize> = None;
431            for (i, slot) in entry_slots.iter().enumerate() {
432                if used[i] {
433                    continue;
434                }
435                if !matcher::matches_segment_tag(&seg.id, &slot.id) {
436                    continue;
437                }
438                if !self.segment_passes_qualifier_map(seg, slot) {
439                    continue;
440                }
441                if tag_match.is_none() {
442                    tag_match = Some(i);
443                }
444                if strict_match.is_none() && segment_matches_mig_codes(seg, slot) {
445                    strict_match = Some(i);
446                }
447            }
448            let Some(i) = strict_match.or(tag_match) else {
449                break;
450            };
451            used[i] = true;
452            let slot = &entry_slots[i];
453            let mut assembled = owned_to_assembled(seg);
454            assembled.mig_number = slot.number.clone();
455            instance.segments.push(assembled);
456            cursor.advance();
457        }
458        Ok(())
459    }
460
461    fn segment_passes_qualifier_map(&self, seg: &OwnedSegment, mig_seg: &MigSegment) -> bool {
462        let Some(ref num) = mig_seg.number else {
463            return true;
464        };
465        let Some((el_idx, comp_idx, expected)) = self.config.qualifier_map.get(num) else {
466            return true;
467        };
468        let actual = seg
469            .elements
470            .get(*el_idx)
471            .and_then(|e| e.get(*comp_idx))
472            .map(|s| s.as_str())
473            .unwrap_or("");
474        actual == expected
475    }
476
477    fn try_consume_group(
478        &self,
479        segments: &[OwnedSegment],
480        cursor: &mut SegmentCursor,
481        mig_group: &MigSegmentGroup,
482        enclosing: &std::collections::HashSet<String>,
483    ) -> Result<Option<AssembledGroup>, AssemblyError> {
484        let mut repetitions = Vec::new();
485        let entry_segment = mig_group.segments.first().ok_or_else(|| {
486            AssemblyError::ParseError(format!("Group {} has no segments", mig_group.id))
487        })?;
488
489        // Scope visible to skip decisions inside this group: our own local
490        // scope (entry + slots + direct nested entries) unioned with the
491        // caller's enclosing scope. Only built when skip mode is on —
492        // otherwise the skip path is dead code and the set stays unused.
493        let nested_enclosing: std::collections::HashSet<String> =
494            if self.config.skip_unknown_segments {
495                let mut set = enclosing.clone();
496                set.extend(group_local_scope(mig_group));
497                set
498            } else {
499                std::collections::HashSet::new()
500            };
501
502        // Loop for repeating groups
503        while !cursor.is_exhausted() {
504            let iter_start = cursor.position();
505            let seg = &segments[cursor.position()];
506            if !matcher::matches_segment_tag(&seg.id, &entry_segment.id) {
507                break; // Current segment doesn't match group entry — stop repeating
508            }
509
510            // Check variant qualifier if set — tag matches but wrong variant
511            if !mig_group.variant_codes.is_empty() {
512                let (ei, ci) = mig_group.variant_qualifier_position.unwrap_or((0, 0));
513                let actual_qual = seg
514                    .elements
515                    .get(ei)
516                    .and_then(|e| e.get(ci))
517                    .map(|s| s.as_str())
518                    .unwrap_or("");
519                if !mig_group
520                    .variant_codes
521                    .iter()
522                    .any(|c| actual_qual.eq_ignore_ascii_case(c))
523                {
524                    break;
525                }
526            } else if let Some(ref expected_code) = mig_group.variant_code {
527                let (ei, ci) = mig_group.variant_qualifier_position.unwrap_or((0, 0));
528                let actual_qual = seg
529                    .elements
530                    .get(ei)
531                    .and_then(|e| e.get(ci))
532                    .map(|s| s.as_str())
533                    .unwrap_or("");
534                if !actual_qual.eq_ignore_ascii_case(expected_code) {
535                    break;
536                }
537            }
538
539            let mut instance = AssembledGroupInstance {
540                segments: Vec::new(),
541                child_groups: Vec::new(),
542                entry_mig_number: entry_segment.number.clone(),
543                variant_mig_numbers: collect_mig_numbers(mig_group),
544                skipped_segments: Vec::new(),
545                skipped_positions: Vec::new(),
546            };
547
548            // Consume segments within this group instance.
549            // Process MIG slots in tag runs: for consecutive slots with the
550            // same tag, consume ALL matching input segments — not just the
551            // defined count. This handles real-world fixtures with more
552            // repetitions than the merged MIG predicts (e.g., 6 RFFs when
553            // the schema defines max 4).
554            //
555            // The entry segment (first tag run) is consumed bounded — one per
556            // defined slot — because the outer while loop uses the entry tag
557            // to delineate group repetitions.
558            let mut slot_idx = 0;
559            let mut is_entry_run = true;
560            while slot_idx < mig_group.segments.len() {
561                if cursor.is_exhausted() {
562                    break;
563                }
564                let current_tag = &mig_group.segments[slot_idx].id;
565                let run_len = mig_group.segments[slot_idx..]
566                    .iter()
567                    .take_while(|s| s.id == *current_tag)
568                    .count();
569
570                if is_entry_run {
571                    // Entry tag: consume at most run_len (preserves group boundaries)
572                    let entry_slots = &mig_group.segments[slot_idx..slot_idx + run_len];
573                    if self.config.strict_code_matching && run_len > 1 {
574                        // Best-match: among tag-matching sibling slots, prefer
575                        // the one whose full code profile matches the segment
576                        // (disambiguates PID 55035 PIA 00108/Z12 vs 00197/SRW).
577                        // Falls back to MIG order when no profile matches —
578                        // preserves assembly for codes outside any AHB-narrowed set.
579                        self.consume_entry_run_best_match(
580                            segments,
581                            cursor,
582                            entry_slots,
583                            &mut instance,
584                        )?;
585                    } else {
586                        for slot in entry_slots {
587                            if cursor.is_exhausted() {
588                                break;
589                            }
590                            if let Some(assembled) =
591                                self.try_consume_segment(segments, cursor, slot)?
592                            {
593                                instance.segments.push(assembled);
594                            }
595                        }
596                    }
597                    is_entry_run = false;
598                } else if matcher::matches_segment_tag(current_tag, &entry_segment.id) {
599                    // Non-entry slot with SAME tag as entry (e.g., CCI appears as
600                    // both entry and non-entry in merged SG30).
601                    //
602                    // Only consume if we haven't yet consumed any NON-entry-tag
603                    // segments (i.e., we're still in a consecutive entry-tag run).
604                    // Once we've consumed a different tag (like CAV), seeing the
605                    // entry tag again means a new rep boundary.
606                    //
607                    // z35: entry CCI → CAV CAV → sees CCI → has_other=true → break ✓
608                    // z39: entry CCI → (no CAV) → sees CCI → has_other=false → consume ✓
609                    //      then CCI CCI → CAV → sees CCI → has_other=true → break
610                    //      BUT: z39 needs CCI-CAV-CCI-CAV structure
611                    //
612                    // Better heuristic: check if ALL remaining slots from here are
613                    // entry-tag + non-entry pairs. If the current slot is entry-tag
614                    // and the NEXT input segment after it would be a non-entry tag,
615                    // consume — it's a continuation. Otherwise break.
616                    if cursor.is_exhausted() {
617                        break;
618                    }
619                    let seg = &segments[cursor.position()];
620                    if !matcher::matches_segment_tag(&seg.id, current_tag) {
621                        break;
622                    }
623                    // Check: is there a non-entry segment AFTER this entry-tag?
624                    // If so, this CCI+CAV pair is part of the current rep.
625                    let has_following_non_entry = if cursor.position() + 1 < segments.len() {
626                        let next = &segments[cursor.position() + 1];
627                        !matcher::matches_segment_tag(&next.id, &entry_segment.id)
628                            && mig_group.segments.iter().any(|s| {
629                                matcher::matches_segment_tag(&next.id, &s.id)
630                                    && !matcher::matches_segment_tag(&s.id, &entry_segment.id)
631                            })
632                    } else {
633                        false
634                    };
635                    if has_following_non_entry {
636                        // CCI followed by CAV → consume as continuation pair
637                        instance.segments.push(owned_to_assembled(seg));
638                        cursor.advance();
639                    } else {
640                        // CCI followed by CCI or unknown → let outer loop decide
641                        break;
642                    }
643                } else {
644                    // Non-entry tag: consume bounded slots first (with mig_number),
645                    // then greedily consume extras (without mig_number).
646                    // The bounded slots get mig_number from the MIG definition so
647                    // the validator can distinguish same-tag segments (e.g., DTM+92
648                    // vs DTM+93 both in SG4).
649                    let slots = &mig_group.segments[slot_idx..slot_idx + run_len];
650                    if self.config.strict_code_matching && run_len > 1 {
651                        self.consume_entry_run_best_match(segments, cursor, slots, &mut instance)?;
652                    } else {
653                        for slot in slots {
654                            if cursor.is_exhausted() {
655                                break;
656                            }
657                            if let Some(assembled) =
658                                self.try_consume_segment(segments, cursor, slot)?
659                            {
660                                instance.segments.push(assembled);
661                            }
662                        }
663                    }
664                    // Greedily consume any remaining same-tag segments beyond the MIG count
665                    while !cursor.is_exhausted() {
666                        let seg = &segments[cursor.position()];
667                        if matcher::matches_segment_tag(&seg.id, current_tag) {
668                            instance.segments.push(owned_to_assembled(seg));
669                            cursor.advance();
670                        } else {
671                            break;
672                        }
673                    }
674                }
675
676                slot_idx += run_len;
677
678                // Point A: Skip unknown segments between MIG slot runs.
679                // When skip mode is ON and we just finished a slot run but the
680                // current segment doesn't match any remaining MIG slot, nested
681                // group entry, or the entry tag, skip it — unless the tag
682                // appears elsewhere in the full MIG (it belongs to an
683                // enclosing group, not this one).
684                if self.config.skip_unknown_segments {
685                    while !cursor.is_exhausted() {
686                        let seg = &segments[cursor.position()];
687                        // Stop if it matches the entry tag (next group repetition)
688                        if matcher::matches_segment_tag(&seg.id, &entry_segment.id) {
689                            break;
690                        }
691                        // Stop if it matches any remaining MIG slot
692                        if mig_group.segments[slot_idx..]
693                            .iter()
694                            .any(|s| matcher::matches_segment_tag(&seg.id, &s.id))
695                        {
696                            break;
697                        }
698                        // Stop if it matches any nested group entry
699                        if mig_group.nested_groups.iter().any(|ng| {
700                            ng.segments
701                                .first()
702                                .is_some_and(|es| matcher::matches_segment_tag(&seg.id, &es.id))
703                        }) {
704                            break;
705                        }
706                        // Stop if the tag is reachable from the enclosing
707                        // scope — it's structural for an outer group (e.g.,
708                        // an SG8 SEQ seen from inside SG10) and should
709                        // surface to the outer loop rather than be
710                        // swallowed here.
711                        if enclosing.contains(&seg.id) {
712                            break;
713                        }
714                        // Unknown segment — skip it
715                        instance.skipped_positions.push(cursor.position());
716                        instance.skipped_segments.push(owned_to_assembled(seg));
717                        cursor.advance();
718                    }
719                }
720            }
721
722            // Consume nested groups (variant-aware for same-ID groups).
723            //
724            // When `skip_unknown_segments` is on, the loop retries after
725            // stalling on an AHB-foreign segment that matches none of the
726            // nested-group entries — the orphan is recorded in
727            // `instance.skipped_segments` and subsequent legitimate reps are
728            // still assembled (PID 55035 cascade fix). Retries merge new
729            // child groups into the existing entry for the same id so a
730            // variant set assembled across two passes stays a single child.
731            loop {
732                let pass_start = cursor.position();
733                let mut nested_idx = 0;
734                while nested_idx < mig_group.nested_groups.len() {
735                    if cursor.is_exhausted() {
736                        break;
737                    }
738                    let nested = &mig_group.nested_groups[nested_idx];
739
740                    if nested.variant_code.is_some() {
741                        // Variant set: collect consecutive same-ID groups with variant_code
742                        let variant_count = mig_group.nested_groups[nested_idx..]
743                            .iter()
744                            .take_while(|g| g.id == nested.id && g.variant_code.is_some())
745                            .count();
746                        let variant_end = nested_idx + variant_count;
747                        let variant_groups = &mig_group.nested_groups[nested_idx..variant_end];
748                        if let Some(combined) = self.try_consume_variant_groups(
749                            segments,
750                            cursor,
751                            variant_groups,
752                            &nested_enclosing,
753                        )? {
754                            push_or_merge_child(&mut instance.child_groups, combined);
755                        }
756                        nested_idx = variant_end;
757                    } else {
758                        if let Some(assembled) =
759                            self.try_consume_group(segments, cursor, nested, &nested_enclosing)?
760                        {
761                            push_or_merge_child(&mut instance.child_groups, assembled);
762                        }
763                        nested_idx += 1;
764                    }
765                }
766
767                if !self.config.skip_unknown_segments || cursor.is_exhausted() {
768                    break;
769                }
770                let seg = &segments[cursor.position()];
771                // A segment is a true orphan from this group's perspective
772                // if its tag is not reachable from the enclosing scope
773                // (siblings above, upcoming top-level segments). Tags in
774                // this group's own local scope (e.g., RFF as an entry of
775                // a sibling nested SG6) get skipped here: we've already
776                // tried every nested group in the pass above and none
777                // consumed the segment, so a sibling qualifier-mismatch
778                // at this position will reject every subsequent pass
779                // too. Skipping moves the cursor past the mismatched
780                // content so legitimate reps that follow can still
781                // assemble. Progress is guaranteed: either a pass
782                // consumed something (cursor advanced) or we skip one
783                // segment per iteration until we hit an enclosing-scoped
784                // tag or exhaust the input.
785                if enclosing.contains(&seg.id) {
786                    break;
787                }
788                // Defensive: if the nested pass made no progress AND
789                // nothing to skip (impossible given the check above but
790                // kept so this loop always terminates), break.
791                if cursor.position() == pass_start && !self.config.skip_unknown_segments {
792                    break;
793                }
794
795                instance.skipped_positions.push(cursor.position());
796                instance.skipped_segments.push(owned_to_assembled(seg));
797                cursor.advance();
798            }
799
800            // Guard against infinite loops: if no progress was made this iteration
801            // (entry tag matched but the entry segment was rejected by e.g. a
802            // qualifier_map mismatch), stop. Pushing an empty rep per iteration
803            // would allocate unbounded memory (see collect_mig_numbers call in
804            // the instance constructor).
805            if cursor.position() == iter_start {
806                break;
807            }
808            repetitions.push(instance);
809        }
810
811        if repetitions.is_empty() {
812            Ok(None)
813        } else {
814            Ok(Some(AssembledGroup {
815                group_id: mig_group.id.clone(),
816                repetitions,
817            }))
818        }
819    }
820
821    /// Consume interleaved repetitions of variant groups.
822    ///
823    /// At each cursor position, tries all variant definitions to find which one
824    /// matches the entry segment's qualifier. Collects all reps into one
825    /// `AssembledGroup` with the shared group_id.
826    fn try_consume_variant_groups(
827        &self,
828        segments: &[OwnedSegment],
829        cursor: &mut SegmentCursor,
830        variants: &[MigSegmentGroup],
831        enclosing: &std::collections::HashSet<String>,
832    ) -> Result<Option<AssembledGroup>, AssemblyError> {
833        let group_id = variants[0].id.clone();
834        let entry_tag = variants[0]
835            .segments
836            .first()
837            .map(|s| s.id.as_str())
838            .unwrap_or("");
839        let mut all_reps = Vec::new();
840
841        while !cursor.is_exhausted() {
842            let seg = &segments[cursor.position()];
843            if !matcher::matches_segment_tag(&seg.id, entry_tag) {
844                break;
845            }
846
847            // Find which variant matches this segment's qualifier.
848            // Each variant may have its qualifier at a different element position
849            // (e.g., CCI+Z19 has qualifier at [0][0], but CCI+++Z15 at [2][0]).
850            let matched = variants.iter().find(|v| {
851                let (ei, ci) = v.variant_qualifier_position.unwrap_or((0, 0));
852                let actual_qual = seg
853                    .elements
854                    .get(ei)
855                    .and_then(|e| e.get(ci))
856                    .map(|s| s.as_str())
857                    .unwrap_or("");
858                if !v.variant_codes.is_empty() {
859                    v.variant_codes
860                        .iter()
861                        .any(|c| actual_qual.eq_ignore_ascii_case(c))
862                } else if let Some(ref expected_code) = v.variant_code {
863                    actual_qual.eq_ignore_ascii_case(expected_code)
864                } else {
865                    false
866                }
867            });
868
869            if let Some(variant) = matched {
870                if let Some(group) =
871                    self.try_consume_group(segments, cursor, variant, enclosing)?
872                {
873                    all_reps.extend(group.repetitions);
874                } else {
875                    break;
876                }
877            } else {
878                // No variant matches — try consuming with the first variant as
879                // fallback to avoid getting stuck. This handles edge cases where
880                // the qualifier doesn't exactly match any variant code.
881                if let Some(group) =
882                    self.try_consume_group(segments, cursor, &variants[0], enclosing)?
883                {
884                    all_reps.extend(group.repetitions);
885                } else {
886                    break;
887                }
888            }
889        }
890
891        if all_reps.is_empty() {
892            Ok(None)
893        } else {
894            Ok(Some(AssembledGroup {
895                group_id,
896                repetitions: all_reps,
897            }))
898        }
899    }
900
901    /// Assemble segments with diagnostic collection.
902    ///
903    /// Returns the assembled tree plus diagnostics for segments not consumed
904    /// by the MIG-guided assembly. Existing `assemble_generic()` is unchanged.
905    pub fn assemble_with_diagnostics(
906        &self,
907        segments: &[OwnedSegment],
908    ) -> (AssembledTree, Vec<StructureDiagnostic>) {
909        let mut diagnostics = Vec::new();
910
911        let tree = match self.assemble_generic(segments) {
912            Ok(tree) => tree,
913            Err(e) => {
914                diagnostics.push(StructureDiagnostic {
915                    kind: StructureDiagnosticKind::UnexpectedSegment,
916                    segment_id: String::new(),
917                    position: 0,
918                    message: format!("Assembly failed: {e}"),
919                });
920                return (
921                    AssembledTree {
922                        segments: Vec::new(),
923                        groups: Vec::new(),
924                        post_group_start: 0,
925                        inter_group_segments: std::collections::BTreeMap::new(),
926                    },
927                    diagnostics,
928                );
929            }
930        };
931
932        // Count consumed segments in the assembled tree (skipped segments
933        // are included in this count because they were advanced past and
934        // stored — they surface as their own diagnostics below).
935        let consumed = count_tree_segments(&tree);
936
937        // Segments beyond consumed count are unconsumed tail (cascade case
938        // when skip mode is off, or structure genuinely ran out of MIG).
939        for (i, seg) in segments.iter().enumerate().skip(consumed) {
940            diagnostics.push(StructureDiagnostic {
941                kind: StructureDiagnosticKind::UnexpectedSegment,
942                segment_id: seg.id.clone(),
943                position: i,
944                message: format!(
945                    "Segment '{}' at position {} was not consumed by MIG-guided assembly",
946                    seg.id, i
947                ),
948            });
949        }
950
951        // Walk the tree and emit a diagnostic for each AHB-foreign segment
952        // the assembler advanced past (only populated when skip mode is on).
953        // Emit in input-position order so the report reads top-to-bottom.
954        let mut skipped: Vec<(usize, String)> = Vec::new();
955        collect_skipped(&tree, &mut skipped);
956        skipped.sort_by_key(|(pos, _)| *pos);
957        for (pos, tag) in skipped {
958            diagnostics.push(StructureDiagnostic {
959                kind: StructureDiagnosticKind::SkippedUnknownSegment,
960                segment_id: tag.clone(),
961                position: pos,
962                message: format!(
963                    "Segment '{tag}' at position {pos} is not defined in the PID-filtered MIG; the assembler advanced past it",
964                ),
965            });
966        }
967
968        (tree, diagnostics)
969    }
970}
971
972fn collect_skipped(tree: &AssembledTree, out: &mut Vec<(usize, String)>) {
973    for group in &tree.groups {
974        collect_skipped_from_group(group, out);
975    }
976}
977
978fn collect_skipped_from_group(group: &AssembledGroup, out: &mut Vec<(usize, String)>) {
979    for rep in &group.repetitions {
980        for (i, seg) in rep.skipped_segments.iter().enumerate() {
981            let pos = rep.skipped_positions.get(i).copied().unwrap_or(0);
982            out.push((pos, seg.tag.clone()));
983        }
984        for child in &rep.child_groups {
985            collect_skipped_from_group(child, out);
986        }
987    }
988}
989
990fn count_tree_segments(tree: &AssembledTree) -> usize {
991    let mut count = tree.segments.len();
992    for group in &tree.groups {
993        count += count_group_segments(group);
994    }
995    // Count inter-group segments (e.g., UNS+D between groups)
996    for segs in tree.inter_group_segments.values() {
997        count += segs.len();
998    }
999    count
1000}
1001
1002fn count_group_segments(group: &AssembledGroup) -> usize {
1003    let mut count = 0;
1004    for rep in &group.repetitions {
1005        count += rep.segments.len();
1006        count += rep.skipped_segments.len();
1007        for child in &rep.child_groups {
1008            count += count_group_segments(child);
1009        }
1010    }
1011    count
1012}
1013
1014/// Collect tags reachable from the given group's local scope — its entry
1015/// segment, its own slot tags, and each direct nested group's entry tag.
1016/// Used by the skip-unknown retry logic to decide whether a stalled
1017/// segment belongs to the current group's responsibility (break, let
1018/// something structural consume it) or is a true orphan (skip).
1019fn group_local_scope(mig_group: &MigSegmentGroup) -> std::collections::HashSet<String> {
1020    let mut tags = std::collections::HashSet::new();
1021    for seg in &mig_group.segments {
1022        tags.insert(seg.id.clone());
1023    }
1024    for nested in &mig_group.nested_groups {
1025        if let Some(entry) = nested.segments.first() {
1026            tags.insert(entry.id.clone());
1027        }
1028    }
1029    tags
1030}
1031
1032/// Push a newly-assembled child group into an instance's `child_groups`,
1033/// merging its repetitions into any existing same-id entry. Used by the
1034/// skip-unknown retry loop in `try_consume_group` so a variant set whose
1035/// reps are split by an orphan segment still surfaces as one child group.
1036fn push_or_merge_child(child_groups: &mut Vec<AssembledGroup>, new: AssembledGroup) {
1037    if let Some(existing) = child_groups.iter_mut().find(|g| g.group_id == new.group_id) {
1038        existing.repetitions.extend(new.repetitions);
1039    } else {
1040        child_groups.push(new);
1041    }
1042}
1043
1044/// Collect all MIG `Number`s from a segment group definition, recursively.
1045///
1046/// This includes numbers from direct segments and from nested groups.
1047/// Used to populate `AssembledGroupInstance::variant_mig_numbers`.
1048fn collect_mig_numbers(group: &MigSegmentGroup) -> Vec<String> {
1049    let mut numbers = Vec::new();
1050    for seg in &group.segments {
1051        if let Some(ref num) = seg.number {
1052            numbers.push(num.clone());
1053        }
1054    }
1055    for nested in &group.nested_groups {
1056        numbers.extend(collect_mig_numbers(nested));
1057    }
1058    numbers
1059}
1060
1061pub fn owned_to_assembled(seg: &OwnedSegment) -> AssembledSegment {
1062    AssembledSegment {
1063        tag: seg.id.clone(),
1064        elements: seg.elements.clone(),
1065        mig_number: None,
1066        segment_number: Some(seg.segment_number),
1067    }
1068}
1069
1070/// Check every code-bearing position declared on a MIG segment against the
1071/// corresponding value on the input segment.
1072///
1073/// Used by `try_consume_segment` to disambiguate slots that share the same
1074/// primary qualifier but differ on a secondary code. Returns `true` when the
1075/// input segment's values at each declared position are either empty
1076/// (optional) or in the slot's allowed set.
1077fn segment_matches_mig_codes(seg: &OwnedSegment, mig_seg: &MigSegment) -> bool {
1078    let actual_at = |el: usize, c: usize| -> &str {
1079        seg.elements
1080            .get(el)
1081            .and_then(|e| e.get(c))
1082            .map(|s| s.as_str())
1083            .unwrap_or("")
1084    };
1085    for de in &mig_seg.data_elements {
1086        if !de.codes.is_empty() {
1087            let actual = actual_at(de.position, 0);
1088            if !actual.is_empty() && !de.codes.iter().any(|c| c.value == actual) {
1089                return false;
1090            }
1091        }
1092    }
1093    for comp in &mig_seg.composites {
1094        for de in &comp.data_elements {
1095            if !de.codes.is_empty() {
1096                let actual = actual_at(comp.position, de.position);
1097                if !actual.is_empty() && !de.codes.iter().any(|c| c.value == actual) {
1098                    return false;
1099                }
1100            }
1101        }
1102    }
1103    true
1104}
1105
1106#[cfg(test)]
1107mod tests {
1108    use super::*;
1109    use crate::test_support::{make_mig_group, make_mig_group_with_variant, make_mig_segment};
1110
1111    fn make_owned_seg(id: &str, elements: Vec<Vec<&str>>) -> OwnedSegment {
1112        OwnedSegment {
1113            id: id.to_string(),
1114            elements: elements
1115                .into_iter()
1116                .map(|e| e.into_iter().map(|c| c.to_string()).collect())
1117                .collect(),
1118            segment_number: 0,
1119        }
1120    }
1121
1122    fn make_mig_schema(segments: Vec<&str>, groups: Vec<MigSegmentGroup>) -> MigSchema {
1123        MigSchema {
1124            message_type: "UTILMD".to_string(),
1125            variant: Some("Strom".to_string()),
1126            version: "S2.1".to_string(),
1127            publication_date: "2025-03-20".to_string(),
1128            author: "BDEW".to_string(),
1129            format_version: "FV2504".to_string(),
1130            source_file: "test".to_string(),
1131            segments: segments.into_iter().map(make_mig_segment).collect(),
1132            segment_groups: groups,
1133        }
1134    }
1135
1136    #[test]
1137    fn test_assembler_top_level_segments_only() {
1138        let mig = make_mig_schema(vec!["UNH", "BGM", "DTM", "UNT"], vec![]);
1139
1140        let segments = vec![
1141            make_owned_seg("UNH", vec![vec!["001", "UTILMD:D:11A:UN:S2.1"]]),
1142            make_owned_seg("BGM", vec![vec!["E01", "DOC001"]]),
1143            make_owned_seg("DTM", vec![vec!["137", "20250101", "102"]]),
1144            make_owned_seg("UNT", vec![vec!["4", "001"]]),
1145        ];
1146
1147        let assembler = Assembler::new(&mig);
1148        let result = assembler.assemble_generic(&segments).unwrap();
1149
1150        assert_eq!(result.segments.len(), 4);
1151        assert_eq!(result.segments[0].tag, "UNH");
1152        assert_eq!(result.segments[1].tag, "BGM");
1153        assert_eq!(result.segments[2].tag, "DTM");
1154        assert_eq!(result.segments[3].tag, "UNT");
1155        assert!(result.groups.is_empty());
1156    }
1157
1158    #[test]
1159    fn test_assembler_with_segment_group() {
1160        let mig = make_mig_schema(
1161            vec!["UNH", "BGM"],
1162            vec![
1163                make_mig_group("SG2", vec!["NAD"], vec![]),
1164                make_mig_group("SG4", vec!["IDE", "STS"], vec![]),
1165            ],
1166        );
1167
1168        let segments = vec![
1169            make_owned_seg("UNH", vec![vec!["001"]]),
1170            make_owned_seg("BGM", vec![vec!["E01"]]),
1171            make_owned_seg("NAD", vec![vec!["MS", "9900123"]]),
1172            make_owned_seg("NAD", vec![vec!["MR", "9900456"]]),
1173            make_owned_seg("IDE", vec![vec!["24", "TX001"]]),
1174            make_owned_seg("STS", vec![vec!["7"], vec!["Z33"]]),
1175        ];
1176
1177        let assembler = Assembler::new(&mig);
1178        let result = assembler.assemble_generic(&segments).unwrap();
1179
1180        // Top-level: UNH, BGM
1181        assert_eq!(result.segments.len(), 2);
1182        // SG2: 2 repetitions (two NAD segments)
1183        assert_eq!(result.groups.len(), 2);
1184        assert_eq!(result.groups[0].group_id, "SG2");
1185        assert_eq!(result.groups[0].repetitions.len(), 2);
1186        assert_eq!(result.groups[0].repetitions[0].segments[0].tag, "NAD");
1187        assert_eq!(result.groups[0].repetitions[1].segments[0].tag, "NAD");
1188        // SG4: 1 repetition (IDE + STS)
1189        assert_eq!(result.groups[1].group_id, "SG4");
1190        assert_eq!(result.groups[1].repetitions.len(), 1);
1191        assert_eq!(result.groups[1].repetitions[0].segments.len(), 2);
1192    }
1193
1194    #[test]
1195    fn test_assembler_nested_groups() {
1196        let sg3 = make_mig_group("SG3", vec!["CTA", "COM"], vec![]);
1197        let mig = make_mig_schema(
1198            vec!["UNH", "BGM"],
1199            vec![make_mig_group("SG2", vec!["NAD"], vec![sg3])],
1200        );
1201
1202        let segments = vec![
1203            make_owned_seg("UNH", vec![vec!["001"]]),
1204            make_owned_seg("BGM", vec![vec!["E01"]]),
1205            make_owned_seg("NAD", vec![vec!["MS", "9900123"]]),
1206            make_owned_seg("CTA", vec![vec!["IC", "Kontakt"]]),
1207            make_owned_seg("COM", vec![vec!["040@example.com", "EM"]]),
1208        ];
1209
1210        let assembler = Assembler::new(&mig);
1211        let result = assembler.assemble_generic(&segments).unwrap();
1212
1213        // SG2 has 1 repetition
1214        let sg2 = &result.groups[0];
1215        assert_eq!(sg2.group_id, "SG2");
1216        assert_eq!(sg2.repetitions.len(), 1);
1217
1218        let sg2_inst = &sg2.repetitions[0];
1219        assert_eq!(sg2_inst.segments[0].tag, "NAD");
1220
1221        // SG3 nested inside SG2
1222        assert_eq!(sg2_inst.child_groups.len(), 1);
1223        let sg3 = &sg2_inst.child_groups[0];
1224        assert_eq!(sg3.group_id, "SG3");
1225        assert_eq!(sg3.repetitions[0].segments.len(), 2);
1226        assert_eq!(sg3.repetitions[0].segments[0].tag, "CTA");
1227        assert_eq!(sg3.repetitions[0].segments[1].tag, "COM");
1228    }
1229
1230    #[test]
1231    fn test_assembler_optional_segments_skipped() {
1232        // MIG expects UNH, BGM, DTM, UNT but input has no DTM
1233        let mig = make_mig_schema(vec!["UNH", "BGM", "DTM", "UNT"], vec![]);
1234
1235        let segments = vec![
1236            make_owned_seg("UNH", vec![vec!["001"]]),
1237            make_owned_seg("BGM", vec![vec!["E01"]]),
1238            make_owned_seg("UNT", vec![vec!["2", "001"]]),
1239        ];
1240
1241        let assembler = Assembler::new(&mig);
1242        let result = assembler.assemble_generic(&segments).unwrap();
1243
1244        // DTM is skipped (optional), UNT consumed
1245        assert_eq!(result.segments.len(), 3);
1246        assert_eq!(result.segments[0].tag, "UNH");
1247        assert_eq!(result.segments[1].tag, "BGM");
1248        assert_eq!(result.segments[2].tag, "UNT");
1249    }
1250
1251    #[test]
1252    fn test_assembler_empty_segments() {
1253        let mig = make_mig_schema(vec!["UNH"], vec![]);
1254        let assembler = Assembler::new(&mig);
1255        let result = assembler.assemble_generic(&[]).unwrap();
1256        assert!(result.segments.is_empty());
1257        assert!(result.groups.is_empty());
1258    }
1259
1260    #[test]
1261    fn test_assembler_preserves_element_data() {
1262        let mig = make_mig_schema(vec!["DTM"], vec![]);
1263
1264        let segments = vec![make_owned_seg(
1265            "DTM",
1266            vec![vec!["137", "202501010000+01", "303"]],
1267        )];
1268
1269        let assembler = Assembler::new(&mig);
1270        let result = assembler.assemble_generic(&segments).unwrap();
1271
1272        let dtm = &result.segments[0];
1273        assert_eq!(dtm.elements[0][0], "137");
1274        assert_eq!(dtm.elements[0][1], "202501010000+01");
1275        assert_eq!(dtm.elements[0][2], "303");
1276    }
1277
1278    #[test]
1279    fn test_group_instance_as_assembled_tree() {
1280        // Build an SG4 instance with root segments (IDE, STS) and child groups (SG5)
1281        let sg5 = AssembledGroup {
1282            group_id: "SG5".to_string(),
1283            repetitions: vec![AssembledGroupInstance {
1284                segments: vec![AssembledSegment {
1285                    tag: "LOC".to_string(),
1286                    elements: vec![vec!["Z16".to_string(), "DE000111222333".to_string()]],
1287                    mig_number: None,
1288                    segment_number: None,
1289                }],
1290                child_groups: vec![],
1291                entry_mig_number: None,
1292                variant_mig_numbers: vec![],
1293                skipped_segments: vec![],
1294                skipped_positions: Vec::new(),
1295            }],
1296        };
1297
1298        let sg4_instance = AssembledGroupInstance {
1299            segments: vec![
1300                AssembledSegment {
1301                    tag: "IDE".to_string(),
1302                    elements: vec![vec!["24".to_string(), "TX001".to_string()]],
1303                    mig_number: None,
1304                    segment_number: None,
1305                },
1306                AssembledSegment {
1307                    tag: "STS".to_string(),
1308                    elements: vec![vec!["7".to_string()]],
1309                    mig_number: None,
1310                    segment_number: None,
1311                },
1312            ],
1313            child_groups: vec![sg5],
1314            entry_mig_number: None,
1315            variant_mig_numbers: vec![],
1316            skipped_segments: vec![],
1317            skipped_positions: Vec::new(),
1318        };
1319
1320        let sub_tree = sg4_instance.as_assembled_tree();
1321
1322        // Root segments of sub-tree are the SG4 instance's segments
1323        assert_eq!(sub_tree.segments.len(), 2);
1324        assert_eq!(sub_tree.segments[0].tag, "IDE");
1325        assert_eq!(sub_tree.segments[1].tag, "STS");
1326
1327        // Groups of sub-tree are the SG4 instance's child groups
1328        assert_eq!(sub_tree.groups.len(), 1);
1329        assert_eq!(sub_tree.groups[0].group_id, "SG5");
1330
1331        // post_group_start marks where root segments end
1332        assert_eq!(sub_tree.post_group_start, 2);
1333    }
1334
1335    #[test]
1336    fn test_assembler_from_parsed_edifact() {
1337        // End-to-end: parse raw EDIFACT, then assemble
1338        let input = b"UNA:+.? 'UNB+UNOC:3+SENDER+RECEIVER+210101:1200+REF001'UNH+MSG001+UTILMD:D:11A:UN:S2.1'BGM+E01+DOC001+9'DTM+137:20250101:102'UNT+3+MSG001'UNZ+1+REF001'";
1339        let segments = crate::tokenize::parse_to_segments(input).unwrap();
1340
1341        let mig = make_mig_schema(vec!["UNB", "UNH", "BGM", "DTM", "UNT", "UNZ"], vec![]);
1342
1343        let assembler = Assembler::new(&mig);
1344        let result = assembler.assemble_generic(&segments).unwrap();
1345
1346        assert!(result.segments.iter().any(|s| s.tag == "UNH"));
1347        assert!(result.segments.iter().any(|s| s.tag == "BGM"));
1348        assert!(result.segments.iter().any(|s| s.tag == "DTM"));
1349    }
1350
1351    #[test]
1352    fn test_assemble_with_diagnostics_clean_input() {
1353        let mig = make_mig_schema(vec!["UNH", "BGM", "UNT"], vec![]);
1354        let segments = vec![
1355            make_owned_seg("UNH", vec![vec!["001"]]),
1356            make_owned_seg("BGM", vec![vec!["E01"]]),
1357            make_owned_seg("UNT", vec![vec!["2", "001"]]),
1358        ];
1359        let assembler = Assembler::new(&mig);
1360        let (tree, diagnostics) = assembler.assemble_with_diagnostics(&segments);
1361        assert_eq!(tree.segments.len(), 3);
1362        assert!(
1363            diagnostics.is_empty(),
1364            "Clean input should have no diagnostics"
1365        );
1366    }
1367
1368    #[test]
1369    fn test_assemble_with_diagnostics_unconsumed_segments() {
1370        let mig = make_mig_schema(vec!["UNH", "BGM"], vec![]);
1371        let segments = vec![
1372            make_owned_seg("UNH", vec![vec!["001"]]),
1373            make_owned_seg("BGM", vec![vec!["E01"]]),
1374            make_owned_seg("FTX", vec![vec!["AAA", "extra text"]]),
1375        ];
1376        let assembler = Assembler::new(&mig);
1377        let (tree, diagnostics) = assembler.assemble_with_diagnostics(&segments);
1378        assert_eq!(tree.segments.len(), 2);
1379        assert_eq!(diagnostics.len(), 1);
1380        assert_eq!(
1381            diagnostics[0].kind,
1382            StructureDiagnosticKind::UnexpectedSegment
1383        );
1384        assert_eq!(diagnostics[0].segment_id, "FTX");
1385        assert_eq!(diagnostics[0].position, 2);
1386    }
1387
#[test]
fn test_assemble_with_diagnostics_multiple_unconsumed() {
    // Only UNH is declared; each of the three trailing segments should
    // produce its own diagnostic, in input order.
    let schema = make_mig_schema(vec!["UNH"], vec![]);
    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("FOO", vec![]),
        make_owned_seg("BAR", vec![]),
        make_owned_seg("BAZ", vec![]),
    ];

    let engine = Assembler::new(&schema);
    let (tree, diagnostics) = engine.assemble_with_diagnostics(&input);

    assert_eq!(tree.segments.len(), 1);

    let expected_ids = ["FOO", "BAR", "BAZ"];
    assert_eq!(diagnostics.len(), expected_ids.len());
    for (diagnostic, expected_id) in diagnostics.iter().zip(expected_ids) {
        assert_eq!(diagnostic.segment_id, expected_id);
    }
}
1405
1406    // ── Non-entry segment mig_number assignment tests ──
1407
#[test]
fn test_non_entry_segments_get_mig_number_from_bounded_slots() {
    // Every SG4 slot — the IDE entry, both DTMs and the STS — carries a MIG
    // number. Each assembled segment is expected to report the number of the
    // slot it was matched against, including the non-entry ones.
    use crate::test_support::make_mig_segment_numbered;

    // (tag, MIG number) per slot in MIG order; the input below supplies one
    // segment per slot in the same order.
    let slots = [
        ("IDE", "00020"),
        ("DTM", "00023"),
        ("DTM", "00024"),
        ("STS", "00035"),
    ];

    let sg4 = MigSegmentGroup {
        segments: slots
            .iter()
            .map(|&(tag, number)| make_mig_segment_numbered(tag, number))
            .collect(),
        ..make_mig_group("SG4", vec![], vec![])
    };
    let schema = make_mig_schema(vec!["UNH"], vec![sg4]);

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("IDE", vec![vec!["24", "TX001"]]),
        make_owned_seg("DTM", vec![vec!["92", "202505312200+00", "303"]]),
        make_owned_seg("DTM", vec![vec!["93", "202512312300+00", "303"]]),
        make_owned_seg("STS", vec![vec!["7"], vec![], vec!["E01"]]),
    ];

    let engine = Assembler::new(&schema);
    let tree = engine.assemble_generic(&input).unwrap();
    let instance = &tree.groups[0].repetitions[0];

    // Segment i must carry the tag and MIG number of slot i
    // (IDE→00020, DTM+92→00023, DTM+93→00024, STS→00035).
    for (idx, &(tag, number)) in slots.iter().enumerate() {
        let segment = &instance.segments[idx];
        assert_eq!(segment.tag, tag);
        assert_eq!(segment.mig_number.as_deref(), Some(number));
    }

    // The instance-level variant_mig_numbers must list all four numbers.
    for (_, number) in slots {
        assert!(instance.variant_mig_numbers.contains(&number.to_string()));
    }
}
1461
#[test]
fn test_greedy_extra_segments_get_no_mig_number() {
    // SG4 declares a single DTM slot but the input carries two DTMs. The
    // first one is matched to the slot and inherits its MIG number; the
    // surplus one is still collected into the instance but has no number.
    use crate::test_support::make_mig_segment_numbered;

    let ide_slot = make_mig_segment_numbered("IDE", "00020");
    let dtm_slot = make_mig_segment_numbered("DTM", "00023");
    let sg4 = MigSegmentGroup {
        segments: vec![ide_slot, dtm_slot],
        ..make_mig_group("SG4", vec![], vec![])
    };
    let schema = make_mig_schema(vec!["UNH"], vec![sg4]);

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("IDE", vec![vec!["24"]]),
        make_owned_seg("DTM", vec![vec!["92", "20250531"]]),
        // One DTM more than the MIG declares.
        make_owned_seg("DTM", vec![vec!["93", "20251231"]]),
    ];

    let engine = Assembler::new(&schema);
    let tree = engine.assemble_generic(&input).unwrap();
    let instance = &tree.groups[0].repetitions[0];

    // IDE plus both DTMs are part of the instance.
    assert_eq!(instance.segments.len(), 3);

    let slotted_dtm = &instance.segments[1];
    let surplus_dtm = &instance.segments[2];
    assert_eq!(slotted_dtm.mig_number.as_deref(), Some("00023"));
    assert_eq!(surplus_dtm.mig_number, None);
}
1496
1497    // ── Qualifier-aware assembly tests ──
1498
#[test]
fn test_qualifier_map_prevents_wrong_slot_consumption() {
    // The MIG declares two DTM slots (00023, 00024) but the input only has
    // DTM+93. Matching on the tag alone would put it into the first slot;
    // with a qualifier map, slot 00023 (expects "92") must be passed over
    // and slot 00024 (expects "93") must take the segment.
    use crate::test_support::make_mig_segment_numbered;
    use std::collections::HashMap;

    let sg4 = MigSegmentGroup {
        segments: vec![
            make_mig_segment_numbered("IDE", "00020"),
            make_mig_segment_numbered("DTM", "00023"),
            make_mig_segment_numbered("DTM", "00024"),
        ],
        ..make_mig_group("SG4", vec![], vec![])
    };
    let schema = make_mig_schema(vec!["UNH"], vec![sg4]);

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("IDE", vec![vec!["24"]]),
        make_owned_seg("DTM", vec![vec!["93", "202512312300+00", "303"]]),
    ];

    // MIG number → (0, 0, expected qualifier); the two indices presumably
    // address element 0 / component 0 of the segment — confirm against
    // AssemblerConfig.
    let qualifier_map = HashMap::from([
        ("00023".to_string(), (0, 0, "92".to_string())),
        ("00024".to_string(), (0, 0, "93".to_string())),
    ]);

    let engine = Assembler::with_config(
        &schema,
        AssemblerConfig {
            skip_unknown_segments: false,
            qualifier_map,
            ..Default::default()
        },
    );
    let tree = engine.assemble_generic(&input).unwrap();
    let instance = &tree.groups[0].repetitions[0];

    // Exactly IDE + DTM+93 were consumed.
    assert_eq!(instance.segments.len(), 2);

    let consumed_dtm = &instance.segments[1];
    assert_eq!(consumed_dtm.tag, "DTM");
    assert_eq!(
        consumed_dtm.mig_number.as_deref(),
        Some("00024"),
        "DTM+93 should get mig_number 00024 (not 00023)"
    );
}
1548
#[test]
fn test_group_entry_qualifier_mismatch_does_not_infinite_loop() {
    // Regression: when a group's entry segment has a qualifier_map entry
    // but the input segment's qualifier does not match, the outer
    // `while !cursor.is_exhausted()` loop in try_consume_group used to
    // spin forever — entry tag matched, so the loop kept going, but
    // try_consume_segment rejected the segment on qualifier mismatch, so
    // the cursor never advanced. Each iteration allocated a fresh
    // variant_mig_numbers Vec via collect_mig_numbers, driving unbounded
    // memory growth (observed: 3.5 GB → OOM in ~3 s on staging for
    // FV2604/UTILMD_Gas/PID 44004 with LOC+172).
    use crate::test_support::make_mig_segment_numbered;
    use std::collections::HashMap;
    use std::sync::mpsc::{self, RecvTimeoutError};
    use std::time::Duration;

    let sg5 = MigSegmentGroup {
        segments: vec![make_mig_segment_numbered("LOC", "00050")],
        ..make_mig_group("SG5", vec![], vec![])
    };
    let mig = make_mig_schema(vec!["UNH"], vec![sg5]);

    let segments = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        // LOC entry tag matches, but qualifier "172" ≠ expected "Z16"
        make_owned_seg("LOC", vec![vec!["172"], vec!["92003964705"]]),
    ];

    let mut qualifier_map = HashMap::new();
    qualifier_map.insert("00050".to_string(), (0, 0, "Z16".to_string()));

    let config = AssemblerConfig {
        skip_unknown_segments: false,
        qualifier_map,
        ..Default::default()
    };

    // Run the assembly on a worker thread and wait for its answer with a
    // timeout. Measuring elapsed time *after* the call cannot catch the
    // regression: if the loop never terminates, the call never returns and
    // the check is never reached. With the worker, a hang becomes a clear
    // test failure after the timeout instead of a stuck CI job.
    let (result_tx, result_rx) = mpsc::channel();
    let worker = std::thread::spawn(move || {
        let assembler = Assembler::with_config(&mig, config);
        let tree = assembler.assemble_generic(&segments).unwrap();
        // Only the fact under test crosses the thread boundary. A send
        // error just means the receiver already gave up waiting.
        let _ = result_tx.send(tree.groups.is_empty());
    });

    match result_rx.recv_timeout(Duration::from_secs(5)) {
        // LOC+172 didn't match SG5's qualifier, so SG5 should be empty.
        // The LOC segment remains unconsumed (caller will surface it as a
        // structure diagnostic).
        Ok(no_groups_assembled) => assert!(
            no_groups_assembled,
            "SG5 must not be assembled from a LOC with a mismatching qualifier"
        ),
        // The worker is deliberately left running; it ends when the test
        // process exits.
        Err(RecvTimeoutError::Timeout) => {
            panic!("assembly still running after 5s — suspected infinite-loop regression")
        }
        // The sender was dropped without a result, i.e. the worker panicked
        // (for example on the `unwrap`). Re-raise that panic here so the
        // original failure is what the test reports.
        Err(RecvTimeoutError::Disconnected) => {
            if let Err(payload) = worker.join() {
                std::panic::resume_unwind(payload);
            }
            unreachable!("assembly worker exited without reporting a result");
        }
    }
}
1601
1602    // ── Skip-unknown-segments tests ──
1603
#[test]
fn test_skip_unknown_segment_between_slots() {
    // MIG group expects [SEQ, CCI], input has [SEQ, RFF, CCI].
    // With skip ON, RFF is skipped and CCI is consumed.
    // With skip OFF (default), CCI is lost because RFF stalls the cursor.
    let sg8 = make_mig_group("SG8", vec!["SEQ", "CCI"], vec![]);
    // `sg8` is not used again, so it is moved into the schema rather than
    // cloned.
    let mig = make_mig_schema(vec!["UNH"], vec![sg8]);

    let segments = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("RFF", vec![vec!["Z38", "CROSSREF"]]),
        make_owned_seg("CCI", vec![vec!["Z30"]]),
    ];

    // Skip OFF: CCI not consumed (RFF stalls cursor after SEQ)
    let off = Assembler::new(&mig);
    let tree_off = off.assemble_generic(&segments).unwrap();
    let sg8_off = &tree_off.groups[0];
    assert_eq!(sg8_off.repetitions[0].segments.len(), 1); // Only SEQ
    assert_eq!(sg8_off.repetitions[0].segments[0].tag, "SEQ");

    // Skip ON: RFF skipped, CCI consumed
    let on = Assembler::with_config(
        &mig,
        AssemblerConfig {
            skip_unknown_segments: true,
            ..Default::default()
        },
    );
    let tree_on = on.assemble_generic(&segments).unwrap();
    let sg8_on = &tree_on.groups[0];
    assert_eq!(sg8_on.repetitions[0].segments.len(), 2); // SEQ + CCI
    assert_eq!(sg8_on.repetitions[0].segments[0].tag, "SEQ");
    assert_eq!(sg8_on.repetitions[0].segments[1].tag, "CCI");
}
1640
#[test]
fn test_skip_preserves_on_instance() {
    // With skip mode on, segments that are passed over are not discarded:
    // they are kept, in input order, on the group instance's
    // `skipped_segments`.
    let schema = make_mig_schema(
        vec!["UNH"],
        vec![make_mig_group("SG8", vec!["SEQ", "CCI"], vec![])],
    );

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("RFF", vec![vec!["Z38", "REF1"]]),
        make_owned_seg("DTM", vec![vec!["92", "20250101"]]),
        make_owned_seg("CCI", vec![vec!["Z30"]]),
    ];

    let skip_on = AssemblerConfig {
        skip_unknown_segments: true,
        ..Default::default()
    };
    let engine = Assembler::with_config(&schema, skip_on);
    let tree = engine.assemble_generic(&input).unwrap();
    let sg8_rep = &tree.groups[0].repetitions[0];

    // SEQ and CCI were matched against the MIG slots.
    assert_eq!(sg8_rep.segments.len(), 2);

    // RFF and DTM were skipped and retained in that order.
    let expected_skipped = ["RFF", "DTM"];
    assert_eq!(sg8_rep.skipped_segments.len(), expected_skipped.len());
    for (skipped, expected_tag) in sg8_rep.skipped_segments.iter().zip(expected_skipped) {
        assert_eq!(skipped.tag, expected_tag);
    }
}
1670
#[test]
fn test_skip_mode_off_default() {
    // An assembler built via `Assembler::new` must keep skip mode disabled
    // (backwards compatibility for existing callers).
    let empty_schema = make_mig_schema(vec![], vec![]);
    let skip_enabled = Assembler::new(&empty_schema)
        .config
        .skip_unknown_segments;
    assert!(!skip_enabled);
}
1678
#[test]
fn test_skip_does_not_consume_nested_group_entry() {
    // Skipping must stop short of segments that open a nested group.
    // SG4 declares [IDE, STS] with a nested SG5 [LOC]; for the input
    // IDE, FOO, STS, LOC the unknown FOO is skipped, while LOC must still
    // be handed to SG5 rather than being swallowed as "unknown".
    let nested_sg5 = make_mig_group("SG5", vec!["LOC"], vec![]);
    let outer_sg4 = make_mig_group("SG4", vec!["IDE", "STS"], vec![nested_sg5]);
    let schema = make_mig_schema(vec!["UNH"], vec![outer_sg4]);

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("IDE", vec![vec!["24"]]),
        make_owned_seg("FOO", vec![vec!["unknown"]]),
        make_owned_seg("STS", vec![vec!["7"]]),
        make_owned_seg("LOC", vec![vec!["Z16"]]),
    ];

    let skip_on = AssemblerConfig {
        skip_unknown_segments: true,
        ..Default::default()
    };
    let engine = Assembler::with_config(&schema, skip_on);
    let tree = engine.assemble_generic(&input).unwrap();
    let sg4_rep = &tree.groups[0].repetitions[0];

    // SG4 itself holds exactly IDE and STS ...
    let expected_tags = ["IDE", "STS"];
    assert_eq!(sg4_rep.segments.len(), expected_tags.len());
    for (segment, expected_tag) in sg4_rep.segments.iter().zip(expected_tags) {
        assert_eq!(segment.tag, expected_tag);
    }

    // ... with FOO as its only skipped segment.
    assert_eq!(sg4_rep.skipped_segments.len(), 1);
    assert_eq!(sg4_rep.skipped_segments[0].tag, "FOO");

    // LOC opened the nested SG5.
    assert_eq!(sg4_rep.child_groups.len(), 1);
    let sg5 = &sg4_rep.child_groups[0];
    assert_eq!(sg5.group_id, "SG5");
    assert_eq!(sg5.repetitions[0].segments[0].tag, "LOC");
}
1719
#[test]
fn test_skip_unknown_between_nested_group_reps() {
    // PID 55035 regression: an AHB-foreign segment sitting between two
    // reps of a nested variant-aware group stalls the cursor when skip mode
    // is off, and that cascades: every subsequent valid rep is lost. With
    // skip mode on, the orphan should be recorded and the following reps
    // consumed.
    //
    // Shape:
    //   SG4 [IDE, STS]
    //     SG8 (variant ZD7) [SEQ]
    //       SG10 [CCI]
    //     SG8 (variant Z98) [SEQ]
    //     SG12 [NAD]
    //
    // Input:
    //   IDE, STS, SEQ+ZD7, CCI+Z30, FOO+<orphan>, SEQ+Z98, NAD+MS
    //
    // Expected with skip ON: SG8 has 2 reps (ZD7 + Z98), SG12 has 1 rep,
    // and the orphan FOO is recorded exactly once somewhere in the SG4
    // subtree (which instance holds it is intentionally not pinned down).
    let sg10 = make_mig_group("SG10", vec!["CCI"], vec![]);
    // `sg10` is only needed by the ZD7 variant, so it is moved, not cloned.
    let sg8_zd7 = make_mig_group_with_variant("SG8", vec!["SEQ"], vec![sg10], "ZD7");
    let sg8_z98 = make_mig_group_with_variant("SG8", vec!["SEQ"], vec![], "Z98");
    let sg12 = make_mig_group("SG12", vec!["NAD"], vec![]);
    let sg4 = make_mig_group(
        "SG4",
        vec!["IDE", "STS"],
        vec![sg8_zd7, sg8_z98, sg12],
    );
    let mig = make_mig_schema(vec!["UNH"], vec![sg4]);

    let segments = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("IDE", vec![vec!["24", "TX001"]]),
        make_owned_seg("STS", vec![vec!["7"], vec!["Z33"]]),
        make_owned_seg("SEQ", vec![vec!["ZD7"]]),
        make_owned_seg("CCI", vec![vec!["Z30"]]),
        make_owned_seg("FOO", vec![vec!["orphan"]]),
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("NAD", vec![vec!["MS", "9900123"]]),
    ];

    let assembler = Assembler::with_config(
        &mig,
        AssemblerConfig {
            skip_unknown_segments: true,
            ..Default::default()
        },
    );
    let tree = assembler.assemble_generic(&segments).unwrap();

    assert_eq!(tree.groups.len(), 1);
    let sg4_inst = &tree.groups[0].repetitions[0];
    // SG4 nested child groups: one SG8 (combined ZD7+Z98) and one SG12
    assert_eq!(sg4_inst.child_groups.len(), 2, "expected SG8 + SG12");
    let sg8_tree = &sg4_inst.child_groups[0];
    assert_eq!(sg8_tree.group_id, "SG8");
    assert_eq!(
        sg8_tree.repetitions.len(),
        2,
        "SG8 should have both ZD7 and Z98 reps after orphan skip"
    );
    assert_eq!(sg8_tree.repetitions[0].segments[0].elements[0][0], "ZD7");
    assert_eq!(sg8_tree.repetitions[1].segments[0].elements[0][0], "Z98");

    let sg12_tree = &sg4_inst.child_groups[1];
    assert_eq!(sg12_tree.group_id, "SG12");
    assert_eq!(sg12_tree.repetitions.len(), 1);

    // Orphan FOO is recorded on whichever instance was active when it
    // was encountered. It may live on SG4, on the first SG8 rep, or on
    // that rep's child SG10 — the important thing is that it's captured
    // exactly once and the subsequent valid reps were still consumed.

    /// Counts the skipped `FOO` segments on `inst` and, recursively, on
    /// every repetition of every child group below it.
    fn count_foo(inst: &AssembledGroupInstance) -> usize {
        let own = inst
            .skipped_segments
            .iter()
            .filter(|s| s.tag == "FOO")
            .count();
        let nested: usize = inst
            .child_groups
            .iter()
            .flat_map(|child| child.repetitions.iter())
            .map(count_foo)
            .sum();
        own + nested
    }
    assert_eq!(count_foo(sg4_inst), 1, "FOO should be recorded exactly once");
}
1808
#[test]
fn test_skip_off_preserves_cascade_behavior() {
    // Counterpart to the skip-ON nested-rep test: identical MIG shape
    // (SG4 [IDE, STS] → SG8/ZD7 [SEQ] → SG10 [CCI], SG8/Z98 [SEQ],
    // SG12 [NAD]) and identical input, but assembled in the default strict
    // mode. The orphan FOO must keep stalling the cursor so that callers
    // relying on strict assembly never see orphans silently swallowed.
    let sg10 = make_mig_group("SG10", vec!["CCI"], vec![]);
    let zd7_variant = make_mig_group_with_variant("SG8", vec!["SEQ"], vec![sg10], "ZD7");
    let z98_variant = make_mig_group_with_variant("SG8", vec!["SEQ"], vec![], "Z98");
    let sg12 = make_mig_group("SG12", vec!["NAD"], vec![]);
    let sg4 = make_mig_group(
        "SG4",
        vec!["IDE", "STS"],
        vec![zd7_variant, z98_variant, sg12],
    );
    let schema = make_mig_schema(vec!["UNH"], vec![sg4]);

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("IDE", vec![vec!["24", "TX001"]]),
        make_owned_seg("STS", vec![vec!["7"], vec!["Z33"]]),
        make_owned_seg("SEQ", vec![vec!["ZD7"]]),
        make_owned_seg("CCI", vec![vec!["Z30"]]),
        make_owned_seg("FOO", vec![vec!["orphan"]]),
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("NAD", vec![vec!["MS", "9900123"]]),
    ];

    let strict = Assembler::new(&schema);
    let tree = strict.assemble_generic(&input).unwrap();
    let sg4_rep = &tree.groups[0].repetitions[0];

    let sg8 = sg4_rep
        .child_groups
        .iter()
        .find(|group| group.group_id == "SG8")
        .expect("SG8 should still be present");

    // Only the ZD7 rep ahead of FOO is consumed; SEQ+Z98 and NAD stay
    // stuck behind the orphan.
    assert_eq!(sg8.repetitions.len(), 1);

    let foo_was_skipped = sg4_rep
        .skipped_segments
        .iter()
        .any(|segment| segment.tag == "FOO");
    assert!(
        !foo_was_skipped,
        "FOO must not be skipped when skip mode is off"
    );
}
1856
#[test]
fn test_roundtrip_with_skip() {
    // Roundtrip with skip mode on: assemble → disassemble → render.
    // The output must still contain every input segment, including the
    // skipped RFF. It is content-preserving rather than order-preserving:
    // the disassembler emits the MIG-guided segments of a group first and
    // the skipped ones afterwards.
    use crate::disassembler::Disassembler;
    use crate::renderer::render_edifact;

    let schema = make_mig_schema(
        vec!["UNH", "UNT"],
        vec![make_mig_group("SG8", vec!["SEQ", "CCI"], vec![])],
    );

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("RFF", vec![vec!["Z38", "REF1"]]),
        make_owned_seg("CCI", vec![vec!["Z30"]]),
        make_owned_seg("UNT", vec![vec!["4", "001"]]),
    ];

    let skip_on = AssemblerConfig {
        skip_unknown_segments: true,
        ..Default::default()
    };
    let engine = Assembler::with_config(&schema, skip_on);
    let tree = engine.assemble_generic(&input).unwrap();

    let reverse = Disassembler::new(&schema);
    let flattened = reverse.disassemble(&tree);
    let delimiters = edifact_primitives::EdifactDelimiters::default();
    let rendered = render_edifact(&flattened, &delimiters);

    // All five segments come back; RFF (skipped) follows the MIG-guided
    // SEQ and CCI instead of sitting between them as in the input.
    let expected_order = ["UNH", "SEQ", "CCI", "RFF", "UNT"];
    assert_eq!(flattened.len(), expected_order.len());
    for (segment, expected_tag) in flattened.iter().zip(expected_order) {
        assert_eq!(segment.tag, expected_tag);
    }

    // The rendered text carries each segment's payload.
    for fragment in [
        "UNH+001",
        "SEQ+Z98",
        "RFF+Z38:REF1",
        "CCI+Z30",
        "UNT+4:001",
    ] {
        assert!(rendered.contains(fragment));
    }
}
1907
1908    // ── Variant-aware assembly tests ──
1909
#[test]
fn test_variant_groups_interleaved_reps() {
    // SG8 is declared twice, once per SEQ qualifier (ZD7 → [SEQ, CCI],
    // Z98 → [SEQ, RFF]). The input alternates ZD7, Z98, ZD7, Z98; all four
    // repetitions must be gathered into a single SG8 group, in input order.
    let zd7_variant = make_mig_group_with_variant("SG8", vec!["SEQ", "CCI"], vec![], "ZD7");
    let z98_variant = make_mig_group_with_variant("SG8", vec!["SEQ", "RFF"], vec![], "Z98");
    let schema = make_mig_schema(vec!["UNH"], vec![zd7_variant, z98_variant]);

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("SEQ", vec![vec!["ZD7"]]),
        make_owned_seg("CCI", vec![vec!["Z30"]]),
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("RFF", vec![vec!["Z38", "REF1"]]),
        make_owned_seg("SEQ", vec![vec!["ZD7"]]),
        make_owned_seg("CCI", vec![vec!["Z31"]]),
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("RFF", vec![vec!["Z38", "REF2"]]),
    ];

    let engine = Assembler::new(&schema);
    let tree = engine.assemble_generic(&input).unwrap();

    // UNH is the only root segment; both variants collapse into one SG8.
    assert_eq!(tree.segments.len(), 1);
    assert_eq!(tree.groups.len(), 1);
    let sg8 = &tree.groups[0];
    assert_eq!(sg8.group_id, "SG8");

    // The entry qualifier of each repetition follows the input order.
    let expected_qualifiers = ["ZD7", "Z98", "ZD7", "Z98"];
    assert_eq!(sg8.repetitions.len(), expected_qualifiers.len());
    for (rep, qualifier) in sg8.repetitions.iter().zip(expected_qualifiers) {
        assert_eq!(rep.segments[0].elements[0][0], qualifier);
    }

    // The first ZD7 rep is followed by CCI, the first Z98 rep by RFF.
    assert_eq!(sg8.repetitions[0].segments[1].tag, "CCI");
    assert_eq!(sg8.repetitions[1].segments[1].tag, "RFF");
}
1949
#[test]
fn test_variant_groups_single_variant_type() {
    // Both SG8 variants are declared, but the input only ever uses Z98.
    // Variant matching must still collect both Z98 repetitions.
    let zd7_variant = make_mig_group_with_variant("SG8", vec!["SEQ", "CCI"], vec![], "ZD7");
    let z98_variant = make_mig_group_with_variant("SG8", vec!["SEQ", "RFF"], vec![], "Z98");
    let schema = make_mig_schema(vec!["UNH"], vec![zd7_variant, z98_variant]);

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("RFF", vec![vec!["Z38", "REF1"]]),
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("RFF", vec![vec!["Z38", "REF2"]]),
    ];

    let engine = Assembler::new(&schema);
    let tree = engine.assemble_generic(&input).unwrap();

    assert_eq!(tree.groups.len(), 1);
    let reps = &tree.groups[0].repetitions;
    assert_eq!(reps.len(), 2);

    // Each repetition starts with a SEQ whose qualifier is Z98.
    for rep in reps {
        assert_eq!(rep.segments[0].elements[0][0], "Z98");
    }
}
1980
#[test]
fn test_non_variant_groups_unchanged() {
    // Plain groups (no variant_code) must assemble as they always did:
    // SG2 takes both NADs as two repetitions, SG4 takes IDE + STS as one.
    let schema = make_mig_schema(
        vec!["UNH", "BGM"],
        vec![
            make_mig_group("SG2", vec!["NAD"], vec![]),
            make_mig_group("SG4", vec!["IDE", "STS"], vec![]),
        ],
    );

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("BGM", vec![vec!["E01"]]),
        make_owned_seg("NAD", vec![vec!["MS", "9900123"]]),
        make_owned_seg("NAD", vec![vec!["MR", "9900456"]]),
        make_owned_seg("IDE", vec![vec!["24", "TX001"]]),
        make_owned_seg("STS", vec![vec!["7"], vec!["Z33"]]),
    ];

    let engine = Assembler::new(&schema);
    let tree = engine.assemble_generic(&input).unwrap();

    // UNH + BGM at the root.
    assert_eq!(tree.segments.len(), 2);

    // (group id, repetition count) in the order the groups must appear.
    let expected_groups = [("SG2", 2), ("SG4", 1)];
    assert_eq!(tree.groups.len(), expected_groups.len());
    for (group, (id, rep_count)) in tree.groups.iter().zip(expected_groups) {
        assert_eq!(group.group_id, id);
        assert_eq!(group.repetitions.len(), rep_count);
    }
}
2008
#[test]
fn test_variant_groups_with_nested_children() {
    // Each SG8 variant (ZD7, Z98) nests its own SG10 [CCI, CAV]. Every
    // variant repetition must receive the SG10 that follows it in the input.
    let sg10 = make_mig_group("SG10", vec!["CCI", "CAV"], vec![]);
    let zd7_variant = make_mig_group_with_variant("SG8", vec!["SEQ"], vec![sg10.clone()], "ZD7");
    let z98_variant = make_mig_group_with_variant("SG8", vec!["SEQ"], vec![sg10], "Z98");
    let schema = make_mig_schema(vec!["UNH"], vec![zd7_variant, z98_variant]);

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("SEQ", vec![vec!["ZD7"]]),
        make_owned_seg("CCI", vec![vec!["Z30"]]),
        make_owned_seg("CAV", vec![vec!["Z91", "Y"]]),
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("CCI", vec![vec!["Z31"]]),
        make_owned_seg("CAV", vec![vec!["Z91", "N"]]),
    ];

    let engine = Assembler::new(&schema);
    let tree = engine.assemble_generic(&input).unwrap();

    assert_eq!(tree.groups.len(), 1);
    let sg8 = &tree.groups[0];
    assert_eq!(sg8.repetitions.len(), 2);

    // ZD7 repetition: one child group, SG10, starting with CCI+Z30.
    let zd7_rep = &sg8.repetitions[0];
    assert_eq!(zd7_rep.child_groups.len(), 1);
    let zd7_child = &zd7_rep.child_groups[0];
    assert_eq!(zd7_child.group_id, "SG10");
    assert_eq!(zd7_child.repetitions[0].segments[0].elements[0][0], "Z30");

    // Z98 repetition: one child group starting with CCI+Z31.
    let z98_rep = &sg8.repetitions[1];
    assert_eq!(z98_rep.child_groups.len(), 1);
    let z98_child = &z98_rep.child_groups[0];
    assert_eq!(z98_child.repetitions[0].segments[0].elements[0][0], "Z31");
}
2050
#[test]
fn test_variant_qualifier_check_prevents_wrong_variant_consumption() {
    // Only the ZD7 variant of SG8 is declared. A SEQ with qualifier Z98
    // shares the entry tag but not the variant code, so the group must not
    // be opened for it.
    let schema = make_mig_schema(
        vec!["UNH"],
        vec![make_mig_group_with_variant(
            "SG8",
            vec!["SEQ", "CCI"],
            vec![],
            "ZD7",
        )],
    );

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        // Entry tag matches, qualifier does not (Z98 != ZD7).
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("CCI", vec![vec!["Z30"]]),
    ];

    let engine = Assembler::new(&schema);
    let tree = engine.assemble_generic(&input).unwrap();

    // No SG8 repetition was created, hence no group at all.
    assert!(tree.groups.is_empty());
}
2071
#[test]
fn test_mixed_variant_and_non_variant_groups() {
    // Plain SG2, then the two SG8 variants, then plain SG12. SG2 and SG12
    // share the NAD entry tag, so the second NAD (after the SG8 reps) must
    // end up in SG12, and both SG8 variants must merge into one group.
    let schema = make_mig_schema(
        vec!["UNH"],
        vec![
            make_mig_group("SG2", vec!["NAD"], vec![]),
            make_mig_group_with_variant("SG8", vec!["SEQ", "CCI"], vec![], "ZD7"),
            make_mig_group_with_variant("SG8", vec!["SEQ", "RFF"], vec![], "Z98"),
            make_mig_group("SG12", vec!["NAD"], vec![]),
        ],
    );

    let input = vec![
        make_owned_seg("UNH", vec![vec!["001"]]),
        make_owned_seg("NAD", vec![vec!["MS", "9900123"]]),
        make_owned_seg("SEQ", vec![vec!["ZD7"]]),
        make_owned_seg("CCI", vec![vec!["Z30"]]),
        make_owned_seg("SEQ", vec![vec!["Z98"]]),
        make_owned_seg("RFF", vec![vec!["Z38", "REF1"]]),
        make_owned_seg("NAD", vec![vec!["Z65", "ID001"]]),
    ];

    let engine = Assembler::new(&schema);
    let tree = engine.assemble_generic(&input).unwrap();

    // (group id, repetition count) in output order: SG2, combined SG8, SG12.
    let expected_groups = [("SG2", 1), ("SG8", 2), ("SG12", 1)];
    assert_eq!(tree.groups.len(), expected_groups.len());
    for (group, (id, rep_count)) in tree.groups.iter().zip(expected_groups) {
        assert_eq!(group.group_id, id);
        assert_eq!(group.repetitions.len(), rep_count);
    }
}
2103
2104    #[test]
2105    fn test_assembler_disambiguates_shared_qualifier_by_full_code_profile() {
2106        // PID 55035 PIA variants: several mig slots share the primary qualifier
2107        // 4347='5' but differ at C212/7143 (one allows Z12, another SRW). With
2108        // only per-mig qualifier_map, the assembler consumes at the first matching
2109        // slot regardless of the composite code — the downstream validator is
2110        // then forced to second-guess the variant choice. Disambiguate at
2111        // assembly time by checking all code-bearing positions declared on the
2112        // MIG segment.
2113        use mig_types::schema::common::CodeDefinition;
2114        use mig_types::schema::mig::{MigComposite, MigDataElement};
2115        use std::collections::HashMap;
2116
2117        fn code(value: &str) -> CodeDefinition {
2118            CodeDefinition {
2119                value: value.to_string(),
2120                name: value.to_string(),
2121                description: None,
2122            }
2123        }
2124
2125        fn pia_slot(number: &str, composite_code: &str) -> MigSegment {
2126            MigSegment {
2127                id: "PIA".to_string(),
2128                name: "PIA".to_string(),
2129                description: None,
2130                counter: None,
2131                level: 1,
2132                number: Some(number.to_string()),
2133                max_rep_std: 1,
2134                max_rep_spec: 1,
2135                status_std: Some("M".to_string()),
2136                status_spec: Some("M".to_string()),
2137                example: None,
2138                data_elements: vec![MigDataElement {
2139                    id: "4347".to_string(),
2140                    name: "Produkt-ID-Funktion".to_string(),
2141                    description: None,
2142                    status_std: Some("M".to_string()),
2143                    status_spec: Some("M".to_string()),
2144                    format_std: None,
2145                    format_spec: None,
2146                    codes: vec![code("5")],
2147                    position: 0,
2148                }],
2149                composites: vec![MigComposite {
2150                    id: "C212".to_string(),
2151                    name: "Item Identifier".to_string(),
2152                    description: None,
2153                    status_std: Some("M".to_string()),
2154                    status_spec: Some("M".to_string()),
2155                    data_elements: vec![MigDataElement {
2156                        id: "7143".to_string(),
2157                        name: "Artikel/Dienstleistung-ID".to_string(),
2158                        description: None,
2159                        status_std: Some("M".to_string()),
2160                        status_spec: Some("M".to_string()),
2161                        format_std: None,
2162                        format_spec: None,
2163                        codes: vec![code(composite_code)],
2164                        position: 0,
2165                    }],
2166                    position: 1,
2167                }],
2168            }
2169        }
2170
2171        let sg4 = MigSegmentGroup {
2172            segments: vec![
2173                crate::test_support::make_mig_segment_numbered("IDE", "00020"),
2174                pia_slot("00108", "Z12"),
2175                pia_slot("00197", "SRW"),
2176            ],
2177            ..make_mig_group("SG4", vec![], vec![])
2178        };
2179        let mig = make_mig_schema(vec!["UNH"], vec![sg4]);
2180
2181        let segments = vec![
2182            make_owned_seg("UNH", vec![vec!["001"]]),
2183            make_owned_seg("IDE", vec![vec!["24"]]),
2184            // PIA+5+:::SRW — composite element 1, component 0 = "SRW"
2185            make_owned_seg("PIA", vec![vec!["5"], vec!["SRW"]]),
2186        ];
2187
2188        // Both PIA mig slots share the qualifier (0,0)='5'. Without full-profile
2189        // matching, the first slot (00108) wins and the SRW composite is
2190        // mis-assigned to the Z12 variant.
2191        let mut qualifier_map = HashMap::new();
2192        qualifier_map.insert("00108".to_string(), (0, 0, "5".to_string()));
2193        qualifier_map.insert("00197".to_string(), (0, 0, "5".to_string()));
2194
2195        let config = AssemblerConfig {
2196            skip_unknown_segments: false,
2197            qualifier_map,
2198            strict_code_matching: true,
2199        };
2200        let assembler = Assembler::with_config(&mig, config);
2201        let tree = assembler.assemble_generic(&segments).unwrap();
2202
2203        let sg4_instance = &tree.groups[0].repetitions[0];
2204        let pia = sg4_instance
2205            .segments
2206            .iter()
2207            .find(|s| s.tag == "PIA")
2208            .expect("PIA consumed into SG4");
2209        assert_eq!(
2210            pia.mig_number.as_deref(),
2211            Some("00197"),
2212            "PIA+5+:::SRW must be assigned the SRW variant (mig=00197), not the Z12 variant"
2213        );
2214    }
2215}