Skip to main content

xsd_schema/compiler/
fragment.rs

1//! NFA fragment structures and composition helpers
2//!
3//! This module implements Thompson's construction algorithm for building NFAs
4//! from content model particles. Fragments are composable building blocks that
5//! can be concatenated, alternated, or repeated.
6
7use crate::parser::location::SourceRef;
8
9use super::nfa::{CounterDef, CounterId, NfaState, NfaTable, NfaTerm, StateId, TransitionKind};
10
11/// A composable NFA fragment with single entry and exit points
12///
13/// Fragments are the building blocks for constructing complex NFAs using
14/// Thompson's construction. They maintain the invariant of having exactly
15/// one start state and one end state, which enables easy composition.
16#[derive(Debug, Clone)]
17pub struct NfaFragment {
18    /// All states in this fragment (indices are local to fragment)
19    pub states: Vec<NfaState>,
20    /// Entry point state index (into states vector)
21    pub start: usize,
22    /// Exit point state index (into states vector)
23    pub end: usize,
24    /// Counter definitions for counted loops within this fragment
25    pub counter_defs: Vec<CounterDef>,
26    /// Whether this fragment can match the empty string (end reachable from
27    /// start without consuming any input).  Tracked incrementally through
28    /// all composition operations and used to set `CounterDef::body_nullable`.
29    pub nullable: bool,
30}
31
32impl NfaFragment {
33    /// Create a new fragment from states with specified start/end (no counters, not nullable)
34    pub fn new(states: Vec<NfaState>, start: usize, end: usize) -> Self {
35        debug_assert!(start < states.len(), "start index out of bounds");
36        debug_assert!(end < states.len(), "end index out of bounds");
37        Self {
38            states,
39            start,
40            end,
41            counter_defs: Vec::new(),
42            nullable: false,
43        }
44    }
45
46    /// Create a new fragment with counter definitions
47    pub fn with_counters(
48        states: Vec<NfaState>,
49        start: usize,
50        end: usize,
51        counter_defs: Vec<CounterDef>,
52        nullable: bool,
53    ) -> Self {
54        debug_assert!(start < states.len(), "start index out of bounds");
55        debug_assert!(end < states.len(), "end index out of bounds");
56        Self {
57            states,
58            start,
59            end,
60            counter_defs,
61            nullable,
62        }
63    }
64
65    /// Verifies the position-based ID invariant: `state.id == index` for all states.
66    ///
67    /// This invariant is critical for correctness of all composition operations
68    /// (`concat`, `alternate`, etc.) which use position-based offset arithmetic.
69    /// `FragmentBuilder` guarantees it at construction; combinators preserve it.
70    /// Any new fragment creation path MUST maintain this invariant.
71    fn assert_ids_normalized(&self) {
72        for (pos, state) in self.states.iter().enumerate() {
73            debug_assert_eq!(
74                state.id, pos as StateId,
75                "Fragment ID invariant violated: state at position {} has id {}",
76                pos, state.id
77            );
78        }
79    }
80
81    /// Get the start state
82    pub fn start_state(&self) -> &NfaState {
83        &self.states[self.start]
84    }
85
86    /// Get the end state
87    pub fn end_state(&self) -> &NfaState {
88        &self.states[self.end]
89    }
90
91    /// Get a mutable reference to a state by local index
92    pub fn get_state_mut(&mut self, index: usize) -> Option<&mut NfaState> {
93        self.states.get_mut(index)
94    }
95
96    /// Concatenate two fragments: self followed by other
97    ///
98    /// Creates an epsilon transition from self's end state to other's start state.
99    /// The resulting fragment starts at self's start and ends at other's end.
100    pub fn concat(mut self, mut other: NfaFragment) -> NfaFragment {
101        let nullable = self.nullable && other.nullable;
102
103        self.assert_ids_normalized();
104        other.assert_ids_normalized();
105
106        let state_offset = self.states.len();
107        let counter_offset = self.counter_defs.len() as CounterId;
108
109        // Offset all state IDs and counter IDs in other fragment
110        for state in &mut other.states {
111            state.id += state_offset as StateId;
112            for trans in &mut state.transitions {
113                trans.target += state_offset as StateId;
114                trans.kind = trans.kind.offset_counter(counter_offset);
115            }
116        }
117
118        // Add epsilon transition from self.end to other.start
119        let other_start = other.start + state_offset;
120        self.states[self.end].add_epsilon(other_start as StateId);
121
122        // Merge states and counter defs
123        let new_end = other.end + state_offset;
124        self.states.extend(other.states);
125        self.counter_defs.extend(other.counter_defs);
126
127        NfaFragment::with_counters(
128            self.states,
129            self.start,
130            new_end,
131            self.counter_defs,
132            nullable,
133        )
134    }
135
136    /// Alternate two fragments: self | other
137    ///
138    /// Creates a new start state with epsilon transitions to both fragments,
139    /// and a new end state that both fragments converge to.
140    pub fn alternate(mut self, mut other: NfaFragment) -> NfaFragment {
141        let nullable = self.nullable || other.nullable;
142
143        self.assert_ids_normalized();
144        other.assert_ids_normalized();
145
146        // Create new start and end states
147        let new_start_id = (self.states.len() + other.states.len()) as StateId;
148        let new_end_id = new_start_id + 1;
149
150        let mut new_start = NfaState::epsilon(new_start_id, None);
151        let new_end = NfaState::epsilon(new_end_id, None);
152
153        // Offset other fragment's state IDs and counter IDs
154        let other_state_offset = self.states.len();
155        let counter_offset = self.counter_defs.len() as CounterId;
156        for state in &mut other.states {
157            state.id += other_state_offset as StateId;
158            for trans in &mut state.transitions {
159                trans.target += other_state_offset as StateId;
160                trans.kind = trans.kind.offset_counter(counter_offset);
161            }
162        }
163
164        // Add epsilon from new start to both fragment starts
165        new_start.add_epsilon(self.start as StateId);
166        new_start.add_epsilon((other.start + other_state_offset) as StateId);
167
168        // Add epsilon from both fragment ends to new end
169        self.states[self.end].add_epsilon(new_end_id);
170        other.states[other.end].add_epsilon(new_end_id);
171
172        // Merge all states and counter defs
173        let mut states = self.states;
174        states.extend(other.states);
175        states.push(new_start);
176        states.push(new_end);
177
178        let mut counter_defs = self.counter_defs;
179        counter_defs.extend(other.counter_defs);
180
181        NfaFragment::with_counters(
182            states,
183            new_start_id as usize,
184            new_end_id as usize,
185            counter_defs,
186            nullable,
187        )
188    }
189
190    /// Make fragment optional: self?
191    ///
192    /// Adds an epsilon transition from start to end, allowing the fragment
193    /// to be skipped entirely.
194    pub fn optional(mut self) -> NfaFragment {
195        self.assert_ids_normalized();
196        // Add epsilon from start to end
197        let end_id = self.end as StateId;
198        self.states[self.start].add_epsilon(end_id);
199        self.nullable = true;
200        self
201    }
202
203    /// Kleene star: self*
204    ///
205    /// Allows zero or more repetitions of the fragment.
206    /// Adds loop back from end to start, plus makes it optional.
207    pub fn repeat_star(mut self) -> NfaFragment {
208        self.assert_ids_normalized();
209        // Add epsilon loop from end back to start
210        let start_id = self.start as StateId;
211        self.states[self.end].add_epsilon(start_id);
212
213        // Make optional (zero occurrences allowed) — already normalized
214        let end_id = self.end as StateId;
215        self.states[self.start].add_epsilon(end_id);
216        self.nullable = true;
217        self
218    }
219
220    /// Plus repetition: self+
221    ///
222    /// Requires at least one occurrence, then allows more.
223    /// Adds loop back from end to start (no optional bypass).
224    pub fn repeat_plus(mut self) -> NfaFragment {
225        self.assert_ids_normalized();
226        // Add epsilon loop from end back to start
227        let start_id = self.start as StateId;
228        self.states[self.end].add_epsilon(start_id);
229        // nullable iff one occurrence can match empty
230        // (self.nullable is already set from the body)
231        self
232    }
233
234    /// Repeat exactly n times: self{n}
235    ///
236    /// Creates n concatenated copies of the fragment.
237    /// For n=0, returns an epsilon fragment.
238    pub fn repeat_exact(self, n: u32) -> NfaFragment {
239        if n == 0 {
240            return FragmentBuilder::new().epsilon_fragment();
241        }
242
243        // nullable: all n copies must be nullable → self.nullable
244        // (concat propagates: a.nullable && b.nullable)
245        let mut result = self.clone();
246        for _ in 1..n {
247            result = result.concat(self.clone());
248        }
249        result
250    }
251
252    /// Counted repeat: uses counter transitions for self{min,max}.
253    ///
254    /// Produces a compact loop structure with 3 extra states (entry, guard, exit)
255    /// regardless of min/max values. Counter tracks completed iterations.
256    ///
257    /// Structure:
258    /// ```text
259    /// entry --CounterReset(c)--> body_start
260    /// body_end --CounterIncrement(c)--> guard
261    /// guard --CounterMaxGuard(c)--> body_start   [loop if count < max]
262    /// guard --CounterMinGuard(c)--> exit          [exit if count >= min]
263    /// [if min == 0: entry --Epsilon--> exit]      [bypass]
264    /// ```
265    pub fn repeat_counted(mut self, min: u32, max: u32) -> NfaFragment {
266        debug_assert!(min <= max, "repeat_counted: min ({min}) > max ({max})");
267
268        // Capture body nullability *before* adding counter infrastructure.
269        let body_nullable = self.nullable;
270
271        self.assert_ids_normalized();
272
273        // Allocate counter
274        let counter_id = self.counter_defs.len() as CounterId;
275        self.counter_defs.push(CounterDef {
276            min,
277            max,
278            body_nullable,
279        });
280
281        // Allocate new states: entry, guard, exit
282        let entry_idx = self.states.len();
283        let guard_idx = entry_idx + 1;
284        let exit_idx = entry_idx + 2;
285
286        let entry_id = entry_idx as StateId;
287        let guard_id = guard_idx as StateId;
288        let exit_id = exit_idx as StateId;
289        let body_start_id = self.start as StateId;
290
291        // entry → CounterReset → body_start
292        let mut entry = NfaState::epsilon(entry_id, None);
293        entry.add_transition(body_start_id, TransitionKind::CounterReset(counter_id));
294
295        // Optional bypass: entry → exit (if min == 0)
296        if min == 0 {
297            entry.add_epsilon(exit_id);
298        }
299
300        // body_end → CounterIncrement → guard
301        self.states[self.end]
302            .add_transition(guard_id, TransitionKind::CounterIncrement(counter_id));
303
304        // guard → CounterMaxGuard → body_start (loop back)
305        // guard → CounterMinGuard → exit (exit loop)
306        let mut guard = NfaState::epsilon(guard_id, None);
307        guard.add_transition(body_start_id, TransitionKind::CounterMaxGuard(counter_id));
308        guard.add_transition(exit_id, TransitionKind::CounterMinGuard(counter_id));
309
310        let exit = NfaState::epsilon(exit_id, None);
311
312        self.states.push(entry);
313        self.states.push(guard);
314        self.states.push(exit);
315
316        // The counted loop is nullable if min==0 (bypass edge) or body is nullable
317        // (all min iterations can complete without consuming input).
318        let nullable = min == 0 || body_nullable;
319
320        NfaFragment::with_counters(
321            self.states,
322            entry_idx,
323            exit_idx,
324            self.counter_defs,
325            nullable,
326        )
327    }
328
329    /// Repeat between min and max times: self{min,max}
330    ///
331    /// Creates min mandatory copies followed by (max-min) optional copies.
332    /// If max is None, creates min copies followed by a star.
333    pub fn repeat_range(self, min: u32, max: Option<u32>) -> NfaFragment {
334        match (min, max) {
335            (0, Some(0)) => FragmentBuilder::new().epsilon_fragment(),
336            (0, Some(1)) => self.optional(),
337            (0, None) => self.repeat_star(),
338            (1, Some(1)) => self,
339            (1, None) => self.repeat_plus(),
340            (n, Some(m)) if n == m => self.repeat_exact(n),
341            (n, Some(m)) => {
342                // n mandatory + (m-n) optional
343                let mut result = self.clone().repeat_exact(n);
344                for _ in n..m {
345                    result = result.concat(self.clone().optional());
346                }
347                result
348            }
349            (n, None) => {
350                // n mandatory + star
351                let mandatory = self.clone().repeat_exact(n);
352                mandatory.concat(self.repeat_star())
353            }
354        }
355    }
356}
357
358/// Builder for constructing NFA fragments with fragment-local state IDs.
359///
360/// Fragments are created with position-based IDs (`state.id == index`),
361/// which is the invariant required by all composition operations.
362/// The builder is stateless — each fragment starts with IDs from 0.
363#[derive(Debug)]
364pub struct FragmentBuilder;
365
366impl FragmentBuilder {
367    /// Create a new fragment builder
368    pub fn new() -> Self {
369        Self
370    }
371
372    /// Build a single-term fragment
373    ///
374    /// Creates a fragment with one term state (id=0) and one epsilon exit
375    /// state (id=1). The term state has a consuming transition to the exit.
376    pub fn single_term(&self, term: NfaTerm, origin: Option<SourceRef>) -> NfaFragment {
377        let mut term_state = NfaState::with_term(0, term, origin);
378        let exit_state = NfaState::epsilon(1, None);
379
380        // Add consuming transition from term state to exit
381        term_state.add_consume(1);
382
383        NfaFragment::new(vec![term_state, exit_state], 0, 1)
384    }
385
386    /// Build an epsilon-only fragment
387    ///
388    /// Creates a minimal fragment that matches nothing (empty string).
389    /// Used for optional content and as base case for empty sequences.
390    pub fn epsilon_fragment(&self) -> NfaFragment {
391        let state = NfaState::epsilon(0, None);
392        let mut frag = NfaFragment::new(vec![state], 0, 0);
393        frag.nullable = true;
394        frag
395    }
396}
397
398impl Default for FragmentBuilder {
399    fn default() -> Self {
400        Self::new()
401    }
402}
403
404/// Convert a fragment to a complete NFA table.
405///
406/// Asserts the fragment's ID invariant (`state.id == position`) and
407/// wraps the states into an `NfaTable` with the fragment's start/end
408/// as start/accept states.
409pub fn fragment_to_table(fragment: NfaFragment) -> NfaTable {
410    fragment.assert_ids_normalized();
411
412    let start_state = fragment.start as StateId;
413    let accept_state = fragment.end as StateId;
414
415    NfaTable::with_counters(
416        fragment.states,
417        start_state,
418        accept_state,
419        fragment.counter_defs,
420    )
421}
422
423#[cfg(test)]
424mod tests {
425    use super::*;
426    use crate::ids::NameId;
427
428    fn make_element_term(name: u32) -> NfaTerm {
429        NfaTerm::element(NameId(name), None, None)
430    }
431
432    #[test]
433    fn test_single_term_fragment() {
434        let builder = FragmentBuilder::new();
435        let frag = builder.single_term(make_element_term(1), None);
436
437        assert_eq!(frag.states.len(), 2);
438        assert_eq!(frag.start, 0);
439        assert_eq!(frag.end, 1);
440        assert!(frag.states[0].term.is_some());
441        assert!(frag.states[1].term.is_none()); // epsilon exit
442    }
443
444    #[test]
445    fn test_epsilon_fragment() {
446        let builder = FragmentBuilder::new();
447        let frag = builder.epsilon_fragment();
448
449        assert_eq!(frag.states.len(), 1);
450        assert_eq!(frag.start, 0);
451        assert_eq!(frag.end, 0); // Same state
452        assert!(frag.states[0].term.is_none());
453    }
454
455    #[test]
456    fn test_concat() {
457        let builder = FragmentBuilder::new();
458        let a = builder.single_term(make_element_term(1), None);
459        let b = builder.single_term(make_element_term(2), None);
460
461        let concat = a.concat(b);
462
463        // a(2 states) + b(2 states) = 4 states
464        assert_eq!(concat.states.len(), 4);
465        assert_eq!(concat.start, 0); // a's start
466        assert_eq!(concat.end, 3); // b's end (offset by 2)
467
468        // Check epsilon from a's end to b's start
469        let a_end = &concat.states[1];
470        assert!(a_end.epsilon_transitions().any(|t| t == 2));
471    }
472
473    #[test]
474    fn test_alternate() {
475        let builder = FragmentBuilder::new();
476        let a = builder.single_term(make_element_term(1), None);
477        let b = builder.single_term(make_element_term(2), None);
478
479        let alt = a.alternate(b);
480
481        // a(2) + b(2) + new_start(1) + new_end(1) = 6 states
482        assert_eq!(alt.states.len(), 6);
483
484        // New start should have epsilon to both a.start and b.start
485        let new_start = &alt.states[alt.start];
486        let eps: Vec<_> = new_start.epsilon_transitions().collect();
487        assert_eq!(eps.len(), 2);
488        assert!(eps.contains(&0)); // a's start
489        assert!(eps.contains(&2)); // b's start (offset by 2)
490    }
491
492    #[test]
493    fn test_optional() {
494        let builder = FragmentBuilder::new();
495        let frag = builder.single_term(make_element_term(1), None);
496        let opt = frag.optional();
497
498        // Check epsilon from start to end (bypass)
499        let start = &opt.states[opt.start];
500        assert!(start.epsilon_transitions().any(|t| t == opt.end as StateId));
501    }
502
503    #[test]
504    fn test_repeat_star() {
505        let builder = FragmentBuilder::new();
506        let frag = builder.single_term(make_element_term(1), None);
507        let star = frag.repeat_star();
508
509        // Check epsilon loop from end to start
510        let end = &star.states[star.end];
511        assert!(end
512            .epsilon_transitions()
513            .any(|t| t == star.start as StateId));
514
515        // Check optional bypass
516        let start = &star.states[star.start];
517        assert!(start
518            .epsilon_transitions()
519            .any(|t| t == star.end as StateId));
520    }
521
522    #[test]
523    fn test_repeat_plus() {
524        let builder = FragmentBuilder::new();
525        let frag = builder.single_term(make_element_term(1), None);
526        let plus = frag.repeat_plus();
527
528        // Check epsilon loop from end to start
529        let end = &plus.states[plus.end];
530        assert!(end
531            .epsilon_transitions()
532            .any(|t| t == plus.start as StateId));
533
534        // Should NOT have optional bypass
535        let start = &plus.states[plus.start];
536        assert!(!start
537            .epsilon_transitions()
538            .any(|t| t == plus.end as StateId));
539    }
540
541    #[test]
542    fn test_repeat_exact() {
543        let builder = FragmentBuilder::new();
544        let frag = builder.single_term(make_element_term(1), None);
545        let exact = frag.repeat_exact(3);
546
547        // 3 copies of 2-state fragment connected = 6 states
548        assert_eq!(exact.states.len(), 6);
549    }
550
551    #[test]
552    fn test_repeat_range() {
553        let builder = FragmentBuilder::new();
554
555        // {0,1} = optional
556        let frag1 = builder.single_term(make_element_term(1), None);
557        let opt = frag1.repeat_range(0, Some(1));
558        let start = &opt.states[opt.start];
559        assert!(start.epsilon_transitions().any(|t| t == opt.end as StateId));
560
561        // {2,4} = 2 mandatory + 2 optional
562        let frag2 = builder.single_term(make_element_term(2), None);
563        let range = frag2.repeat_range(2, Some(4));
564        // 2*2 mandatory + 2*2 optional = 8 states
565        assert_eq!(range.states.len(), 8);
566    }
567
568    #[test]
569    fn test_fragment_to_table() {
570        let builder = FragmentBuilder::new();
571        let frag = builder.single_term(make_element_term(1), None);
572        let table = fragment_to_table(frag);
573
574        assert_eq!(table.start_state, 0);
575        assert_eq!(table.accept_state, 1);
576        assert_eq!(table.state_count(), 2);
577    }
578}