1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
use core::mem;

use alloc::{sync::Arc, vec, vec::Vec};

use crate::{
    nfa::thompson::{
        error::BuildError,
        nfa::{self, SparseTransitions, Transition, NFA},
    },
    util::{
        look::{Look, LookMatcher},
        primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
    },
};

/// An intermediate NFA state used during construction.
///
/// During construction of an NFA, it is often convenient to work with states
/// that are amenable to mutation and that carry more information than we
/// otherwise need once an NFA has been built. This type represents those
/// needs.
///
/// Once construction is finished, the builder will convert these states to a
/// [`nfa::thompson::State`](crate::nfa::thompson::State). This conversion not
/// only results in a simpler representation, but in some cases, entire classes
/// of states are completely removed (such as [`State::Empty`]).
#[derive(Clone, Debug, Eq, PartialEq)]
enum State {
    /// An empty state whose only purpose is to forward the automaton to
    /// another state via an unconditional epsilon transition.
    ///
    /// Unconditional epsilon transitions are quite useful during the
    /// construction of an NFA, as they permit the insertion of no-op
    /// placeholders that make it easier to compose NFA sub-graphs. When
    /// the Thompson NFA builder produces a final NFA, all unconditional
    /// epsilon transitions are removed, and state identifiers are remapped
    /// accordingly.
    Empty {
        /// The next state that this state should transition to.
        next: StateID,
    },
    /// A state that only transitions to another state if the current input
    /// byte is in a particular range of bytes.
    ByteRange { trans: Transition },
    /// A state with possibly many transitions, represented in a sparse
    /// fashion. Transitions must be ordered lexicographically by input range
    /// and be non-overlapping. As such, this may only be used when every
    /// transition has equal priority. (In practice, this is only used for
    /// encoding large UTF-8 automata.) In contrast, a `Union` state has each
    /// alternate in order of priority. Priority is used to implement greedy
    /// matching and also alternations themselves, e.g., `abc|a` where `abc`
    /// has priority over `a`.
    ///
    /// To clarify, it is possible to remove `Sparse` and represent all things
    /// that `Sparse` is used for via `Union`. But this creates a more bloated
    /// NFA with more epsilon transitions than is necessary in the special case
    /// of character classes.
    Sparse { transitions: Vec<Transition> },
    /// A conditional epsilon transition satisfied via some sort of
    /// look-around.
    Look { look: Look, next: StateID },
    /// An empty state that records the start of a capture location. This is an
    /// unconditional epsilon transition like `Empty`, except it can be used to
    /// record position information for a capture group when using the NFA for
    /// search.
    CaptureStart {
        /// The ID of the pattern in which this capture was defined.
        pattern_id: PatternID,
        /// The capture group index that this capture state corresponds to.
        /// The capture group index is always relative to its corresponding
        /// pattern. Therefore, in the presence of multiple patterns, both the
        /// pattern ID and the capture group index are required to uniquely
        /// identify a capturing group.
        group_index: SmallIndex,
        /// The next state that this state should transition to.
        next: StateID,
    },
    /// An empty state that records the end of a capture location. This is an
    /// unconditional epsilon transition like `Empty`, except it can be used to
    /// record position information for a capture group when using the NFA for
    /// search.
    CaptureEnd {
        /// The ID of the pattern in which this capture was defined.
        pattern_id: PatternID,
        /// The capture group index that this capture state corresponds to.
        /// The capture group index is always relative to its corresponding
        /// pattern. Therefore, in the presence of multiple patterns, both the
        /// pattern ID and the capture group index are required to uniquely
        /// identify a capturing group.
        group_index: SmallIndex,
        /// The next state that this state should transition to.
        next: StateID,
    },
    /// An alternation such that there exists an epsilon transition to all
    /// states in `alternates`, where matches found via earlier transitions
    /// are preferred over later transitions.
    ///
    /// When `alternates` contains exactly one state, `State::goto` treats
    /// this state as a plain unconditional epsilon transition to it.
    Union { alternates: Vec<StateID> },
    /// An alternation such that there exists an epsilon transition to all
    /// states in `alternates`, where matches found via later transitions are
    /// preferred over earlier transitions.
    ///
    /// This "reverse" state exists for convenience during compilation that
    /// permits easy construction of non-greedy combinations of NFA states. At
    /// the end of compilation, Union and UnionReverse states are merged into
    /// one Union type of state, where the latter has its epsilon transitions
    /// reversed to reflect the priority inversion.
    ///
    /// The "convenience" here arises from the fact that as new states are
    /// added to the list of `alternates`, we would like that add operation
    /// to be amortized constant time. But if we used a `Union`, we'd need to
    /// prepend the state, which takes O(n) time. There are other approaches we
    /// could use to solve this, but this seems simple enough.
    ///
    /// As with `Union`, a single-element `alternates` is treated by
    /// `State::goto` as a plain unconditional epsilon transition.
    UnionReverse { alternates: Vec<StateID> },
    /// A state that cannot be transitioned out of. This is useful for cases
    /// where you want to prevent matching from occurring. For example, if your
    /// regex parser permits empty character classes, then one could choose a
    /// `Fail` state to represent it.
    Fail,
    /// A match state. There is at most one such occurrence of this state in
    /// an NFA for each pattern compiled into the NFA. At time of writing, a
    /// match state is always produced for every pattern given, but in theory,
    /// if a pattern can never lead to a match, then the match state could be
    /// omitted.
    ///
    /// `pattern_id` refers to the ID of the pattern itself, which corresponds
    /// to the pattern's index (starting at 0).
    Match { pattern_id: PatternID },
}

impl State {
    /// Returns the state this one unconditionally forwards to, if any.
    ///
    /// A state forwards unconditionally when it is an `Empty` state, or when
    /// it is a `Union`/`UnionReverse` state holding exactly one alternate.
    /// Every other state yields `None`.
    fn goto(&self) -> Option<StateID> {
        match self {
            State::Empty { next } => Some(*next),
            State::Union { alternates }
            | State::UnionReverse { alternates } => {
                // Only a lone alternate is a plain forward; zero or several
                // alternates are not a single unconditional transition.
                match alternates.as_slice() {
                    [only] => Some(*only),
                    _ => None,
                }
            }
            _ => None,
        }
    }

    /// Reports how many bytes of heap memory this state uses.
    ///
    /// Only the elements currently stored in a state's vector are counted
    /// (its length, not its capacity). States without a vector report zero.
    fn memory_usage(&self) -> usize {
        match self {
            State::Sparse { transitions } => {
                mem::size_of::<Transition>() * transitions.len()
            }
            State::Union { alternates }
            | State::UnionReverse { alternates } => {
                mem::size_of::<StateID>() * alternates.len()
            }
            // Listed explicitly (rather than via `_`) so that a newly added
            // variant must be accounted for here.
            State::Empty { .. }
            | State::ByteRange { .. }
            | State::Look { .. }
            | State::CaptureStart { .. }
            | State::CaptureEnd { .. }
            | State::Fail
            | State::Match { .. } => 0,
        }
    }
}

/// An abstraction for building Thompson NFAs by hand.
///
/// A builder is what a [`thompson::Compiler`](crate::nfa::thompson::Compiler)
/// uses internally to translate a regex's high-level intermediate
/// representation into an [`NFA`].
///
/// The primary function of this builder is to abstract away the internal
/// representation of an NFA and make it difficult to produce NFAs that are
/// internally invalid or inconsistent. This builder also provides a way to
/// add "empty" states (which can be thought of as unconditional epsilon
/// transitions), despite the fact that [`thompson::State`](nfa::State) does
/// not have any "empty" representation. The advantage of "empty" states is
/// that they make the code for constructing a Thompson NFA logically simpler.
///
/// Many of the routines on this builder may panic or return errors. Generally
/// speaking, panics occur when an invalid sequence of method calls was made,
/// whereas an error occurs if things get too big. (Where "too big" might mean
/// exhausting identifier space or using up too much heap memory in accordance
/// with the configured [`size_limit`](Builder::set_size_limit).)
///
/// # Overview
///
/// ## Adding multiple patterns
///
/// Each pattern you add to an NFA should correspond to a pair of
/// [`Builder::start_pattern`] and [`Builder::finish_pattern`] calls, with
/// calls in between that add NFA states for that pattern. NFA states may be
/// added without first calling `start_pattern`, with the exception of adding
/// capturing states.
///
/// ## Adding NFA states
///
/// Here is a very brief overview of each of the methods that add NFA states.
/// Every method adds a single state.
///
/// * [`add_empty`](Builder::add_empty): Adds a state with a single
/// unconditional epsilon transition to another state.
/// * [`add_union`](Builder::add_union): Adds a state with unconditional
/// epsilon transitions to two or more states, with earlier transitions
/// preferred over later ones.
/// * [`add_union_reverse`](Builder::add_union_reverse): Adds a state with
/// unconditional epsilon transitions to two or more states, with later
/// transitions preferred over earlier ones.
/// * [`add_range`](Builder::add_range): Adds a state with a single transition
/// to another state that can only be followed if the current input byte is
/// within the range given.
/// * [`add_sparse`](Builder::add_sparse): Adds a state with two or more
/// range transitions to other states, where a transition is only followed
/// if the current input byte is within one of the ranges. All transitions
/// in this state have equal priority, and the corresponding ranges must be
/// non-overlapping.
/// * [`add_look`](Builder::add_look): Adds a state with a single *conditional*
/// epsilon transition to another state, where the condition depends on a
/// limited look-around property.
/// * [`add_capture_start`](Builder::add_capture_start): Adds a state with
/// a single unconditional epsilon transition that also instructs an NFA
/// simulation to record the current input position to a specific location in
/// memory. This is intended to represent the starting location of a capturing
/// group.
/// * [`add_capture_end`](Builder::add_capture_end): Adds a state with
/// a single unconditional epsilon transition that also instructs an NFA
/// simulation to record the current input position to a specific location in
/// memory. This is intended to represent the ending location of a capturing
/// group.
/// * [`add_fail`](Builder::add_fail): Adds a state that never transitions to
/// another state.
/// * [`add_match`](Builder::add_match): Adds a state that indicates a match
/// has been found for a particular pattern. A match state is a final state
/// with no outgoing transitions.
///
/// ## Setting transitions between NFA states
///
/// The [`Builder::patch`] method creates a transition from one state to the
/// next. If the `from` state corresponds to a state that supports multiple
/// outgoing transitions (such as "union"), then this adds the corresponding
/// transition. Otherwise, it sets the single transition. (This routine panics
/// if `from` corresponds to a state added by `add_sparse`, since sparse states
/// need more specialized handling.)
///
/// # Example
///
/// This annotated example shows how to hand construct the regex `[a-z]+`
/// (without an unanchored prefix).
///
/// ```
/// use regex_automata::{
///     nfa::thompson::{pikevm::PikeVM, Builder, Transition},
///     util::primitives::StateID,
///     Match,
/// };
///
/// let mut builder = Builder::new();
/// // Before adding NFA states for our pattern, we need to tell the builder
/// // that we are starting the pattern.
/// builder.start_pattern()?;
/// // Since we use the Pike VM below for searching, we need to add capturing
/// // states. If you're just going to build a DFA from the NFA, then capturing
/// // states do not need to be added.
/// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
/// let range = builder.add_range(Transition {
///     // We don't know the state ID of the 'next' state yet, so we just fill
///     // in a dummy 'ZERO' value.
///     start: b'a', end: b'z', next: StateID::ZERO,
/// })?;
/// // This state will point back to 'range', but also enable us to move ahead.
/// // That is, this implements the '+' repetition operator. We add 'range' and
/// // then 'end' below to this alternation.
/// let alt = builder.add_union(vec![])?;
/// // The final state before the match state, which serves to capture the
/// // end location of the match.
/// let end = builder.add_capture_end(StateID::ZERO, 0)?;
/// // The match state for our pattern.
/// let mat = builder.add_match()?;
/// // Now we fill in the transitions between states.
/// builder.patch(start, range)?;
/// builder.patch(range, alt)?;
/// // If we added 'end' before 'range', then we'd implement non-greedy
/// // matching, i.e., '+?'.
/// builder.patch(alt, range)?;
/// builder.patch(alt, end)?;
/// builder.patch(end, mat)?;
/// // We must explicitly finish pattern and provide the starting state ID for
/// // this particular pattern.
/// builder.finish_pattern(start)?;
/// // Finally, when we build the NFA, we provide the anchored and unanchored
/// // starting state IDs. Since we didn't bother with an unanchored prefix
/// // here, we only support anchored searching. Thus, both starting states are
/// // the same.
/// let nfa = builder.build(start, start)?;
///
/// // Now build a Pike VM from our NFA, and use it for searching. This shows
/// // how we can use a regex engine without ever worrying about syntax!
/// let re = PikeVM::new_from_nfa(nfa)?;
/// let mut cache = re.create_cache();
/// let mut caps = re.create_captures();
/// let expected = Some(Match::must(0, 0..3));
/// re.captures(&mut cache, "foo0", &mut caps);
/// assert_eq!(expected, caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug, Default)]
pub struct Builder {
    /// The ID of the pattern that we're currently building, or `None` when
    /// no pattern is in progress.
    ///
    /// Callers are required to set (and unset) this by calling
    /// {start,finish}_pattern. Otherwise, most methods will panic.
    pattern_id: Option<PatternID>,
    /// A sequence of intermediate NFA states. Once a state is added to this
    /// sequence, it is assigned a state ID equivalent to its index. Once a
    /// state is added, it is still expected to be mutated, e.g., to set its
    /// transition to a state that didn't exist at the time it was added.
    states: Vec<State>,
    /// The starting states for each individual pattern. Starting at any
    /// of these states will result in only an anchored search for the
    /// corresponding pattern. The vec is indexed by pattern ID. When the NFA
    /// contains a single regex, then `start_pattern[0]` and `start_anchored`
    /// are always equivalent.
    start_pattern: Vec<StateID>,
    /// A map from pattern ID to capture group index to name. (If no name
    /// exists, then a None entry is present. Thus, all capturing groups are
    /// present in this mapping.)
    ///
    /// The outer vec is indexed by pattern ID, while the inner vec is indexed
    /// by capture index offset for the corresponding pattern.
    ///
    /// The first capture group for each pattern is always unnamed and is thus
    /// always None.
    captures: Vec<Vec<Option<Arc<str>>>>,
    /// The combined memory used by each of the 'State's in 'states'. This
    /// only includes heap usage by each state, and not the size of the state
    /// itself. In other words, this tracks heap memory used that isn't
    /// captured via `size_of::<State>() * states.len()`.
    memory_states: usize,
    /// Whether this NFA only matches UTF-8 and whether regex engines using
    /// this NFA for searching should report empty matches that split a
    /// codepoint.
    utf8: bool,
    /// Whether this NFA should be matched in reverse or not.
    reverse: bool,
    /// The matcher to use for look-around assertions.
    look_matcher: LookMatcher,
    /// A size limit to respect when building an NFA. If the total heap memory
    /// of the intermediate NFA states exceeds (or would exceed) this amount,
    /// then an error is returned. `None` means no limit has been configured.
    size_limit: Option<usize>,
}

impl Builder {
    /// Create a new builder for hand-assembling NFAs.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Clear this builder.
    ///
    /// Clearing removes all state associated with building an NFA, but does
    /// not reset configuration (such as size limits and whether the NFA
    /// should only match UTF-8). After clearing, the builder can be reused to
    /// assemble an entirely new NFA.
    pub fn clear(&mut self) {
        // Drop everything that was accumulated while assembling an NFA.
        self.states.clear();
        self.memory_states = 0;
        self.captures.clear();
        self.start_pattern.clear();
        // Forget any pattern that was in the middle of being assembled.
        self.pattern_id = None;
        // Configuration fields ('utf8', 'reverse', 'look_matcher' and
        // 'size_limit') are deliberately left untouched.
    }

    /// Assemble a [`NFA`] from the states added so far.
    ///
    /// After building an NFA, more states may be added and `build` may be
    /// called again. To reuse a builder to produce an entirely new NFA from
    /// scratch, call the [`clear`](Builder::clear) method first.
    ///
    /// `start_anchored` refers to the ID of the starting state that anchored
    /// searches should use. That is, searches whose matches are limited to
    /// the starting position of the search.
    ///
    /// `start_unanchored` refers to the ID of the starting state that
    /// unanchored searches should use. This permits searches to report matches
    /// that start after the beginning of the search. In cases where unanchored
    /// searches are not supported, the unanchored starting state ID must be
    /// the same as the anchored starting state ID.
    ///
    /// # Errors
    ///
    /// This returns an error if there was a problem producing the final NFA.
    /// In particular, this might include an error if the capturing groups
    /// added to this builder violate any of the invariants documented on
    /// [`GroupInfo`](crate::util::captures::GroupInfo).
    ///
    /// # Panics
    ///
    /// If `start_pattern` was called, then `finish_pattern` must be called
    /// before `build`, otherwise this panics.
    ///
    /// This may panic for other invalid uses of a builder. For example, if
    /// a "start capture" state was added without a corresponding "end capture"
    /// state.
    pub fn build(
        &self,
        start_anchored: StateID,
        start_unanchored: StateID,
    ) -> Result<NFA, BuildError> {
        // Building in the middle of a pattern would leave that pattern's
        // start state as the placeholder written by 'start_pattern'.
        assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
        debug!(
            "intermediate NFA compilation via builder is complete, \
             intermediate NFA size: {} states, {} bytes on heap",
            self.states.len(),
            self.memory_usage(),
        );

        // Carry this builder's configuration over to the NFA being built.
        let mut nfa = nfa::Inner::default();
        nfa.set_utf8(self.utf8);
        nfa.set_reverse(self.reverse);
        nfa.set_look_matcher(self.look_matcher.clone());
        // A set of compiler internal state IDs that correspond to states
        // that are exclusively epsilon transitions, i.e., goto instructions,
        // combined with the state that they point to. This is used to
        // record said states while transforming the compiler's internal NFA
        // representation to the external form.
        let mut empties = vec![];
        // A map used to re-map state IDs when translating this builder's
        // internal NFA state representation to the final NFA representation.
        //
        // It is indexed by this builder's state ID and each entry holds the
        // ID assigned to that state in the final NFA. Every entry starts out
        // as 'StateID::ZERO' and is filled in by the two passes below.
        let mut remap = vec![];
        remap.resize(self.states.len(), StateID::ZERO);

        // NOTE(review): the start state IDs given here are still builder
        // IDs. Presumably 'nfa.remap' at the end rewrites them along with
        // the transitions; confirm against 'nfa::Inner'.
        nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
        // The capture group names must be installed before the loop below,
        // since capture states look up their slots via 'nfa.group_info()'.
        nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
        // The idea here is to convert our intermediate states to their final
        // form. The only real complexity here is the process of converting
        // transitions, which are expressed in terms of state IDs. The new
        // set of states will be smaller because of partial epsilon removal,
        // so the state IDs will not be the same.
        //
        // Note that every 'next'/'alternates' ID copied into a final state
        // in this loop is still a builder ID. They are only rewritten by the
        // call to 'nfa.remap' at the very end.
        for (sid, state) in self.states.iter().with_state_ids() {
            match *state {
                State::Empty { next } => {
                    // Since we're removing empty states, we need to handle
                    // them later since we don't yet know which new state this
                    // empty state will be mapped to.
                    empties.push((sid, next));
                }
                State::ByteRange { trans } => {
                    remap[sid] = nfa.add(nfa::State::ByteRange { trans });
                }
                State::Sparse { ref transitions } => {
                    // Pick the cheapest final representation: no transitions
                    // can never match (fail), a single transition is just a
                    // byte range and anything else is a real sparse state.
                    remap[sid] = match transitions.len() {
                        0 => nfa.add(nfa::State::Fail),
                        1 => nfa.add(nfa::State::ByteRange {
                            trans: transitions[0],
                        }),
                        _ => {
                            let transitions =
                                transitions.to_vec().into_boxed_slice();
                            let sparse = SparseTransitions { transitions };
                            nfa.add(nfa::State::Sparse(sparse))
                        }
                    }
                }
                State::Look { look, next } => {
                    remap[sid] = nfa.add(nfa::State::Look { look, next });
                }
                State::CaptureStart { pattern_id, group_index, next } => {
                    // We can't remove this empty state because of the side
                    // effect of capturing an offset for this capture slot.
                    let slot = nfa
                        .group_info()
                        .slot(pattern_id, group_index.as_usize())
                        .expect("invalid capture index");
                    let slot =
                        SmallIndex::new(slot).expect("a small enough slot");
                    remap[sid] = nfa.add(nfa::State::Capture {
                        next,
                        pattern_id,
                        group_index,
                        slot,
                    });
                }
                State::CaptureEnd { pattern_id, group_index, next } => {
                    // We can't remove this empty state because of the side
                    // effect of capturing an offset for this capture slot.
                    // Also, this always succeeds because we check that all
                    // slot indices are valid for all capture indices when they
                    // are initially added.
                    //
                    // The end slot is the one immediately following the slot
                    // looked up for this group (hence the 'checked_add(1)').
                    let slot = nfa
                        .group_info()
                        .slot(pattern_id, group_index.as_usize())
                        .expect("invalid capture index")
                        .checked_add(1)
                        .unwrap();
                    let slot =
                        SmallIndex::new(slot).expect("a small enough slot");
                    remap[sid] = nfa.add(nfa::State::Capture {
                        next,
                        pattern_id,
                        group_index,
                        slot,
                    });
                }
                State::Union { ref alternates } => {
                    if alternates.is_empty() {
                        // A union with nowhere to go can never match.
                        remap[sid] = nfa.add(nfa::State::Fail);
                    } else if alternates.len() == 1 {
                        // A union with one alternate is just an empty state,
                        // so it is removed like one. The value written to
                        // 'remap' here is provisional: every state recorded
                        // in 'empties' has its entry assigned again by the
                        // second pass below.
                        empties.push((sid, alternates[0]));
                        remap[sid] = alternates[0];
                    } else if alternates.len() == 2 {
                        remap[sid] = nfa.add(nfa::State::BinaryUnion {
                            alt1: alternates[0],
                            alt2: alternates[1],
                        });
                    } else {
                        let alternates =
                            alternates.to_vec().into_boxed_slice();
                        remap[sid] = nfa.add(nfa::State::Union { alternates });
                    }
                }
                State::UnionReverse { ref alternates } => {
                    // Same as 'Union' above, except the alternates are
                    // emitted in the opposite order, since later alternates
                    // have priority in a reverse union.
                    if alternates.is_empty() {
                        remap[sid] = nfa.add(nfa::State::Fail);
                    } else if alternates.len() == 1 {
                        empties.push((sid, alternates[0]));
                        remap[sid] = alternates[0];
                    } else if alternates.len() == 2 {
                        remap[sid] = nfa.add(nfa::State::BinaryUnion {
                            alt1: alternates[1],
                            alt2: alternates[0],
                        });
                    } else {
                        let mut alternates =
                            alternates.to_vec().into_boxed_slice();
                        alternates.reverse();
                        remap[sid] = nfa.add(nfa::State::Union { alternates });
                    }
                }
                State::Fail => {
                    remap[sid] = nfa.add(nfa::State::Fail);
                }
                State::Match { pattern_id } => {
                    remap[sid] = nfa.add(nfa::State::Match { pattern_id });
                }
            }
        }
        // Some of the new states still point to empty state IDs, so we need to
        // follow each of them and remap the empty state IDs to their non-empty
        // state IDs.
        //
        // We also keep track of which states we've already mapped. This helps
        // avoid quadratic behavior in a long chain of empty states. For
        // example, in 'a{0}{50000}'.
        let mut remapped = vec![false; self.states.len()];
        for &(empty_id, empty_next) in empties.iter() {
            // Already resolved as part of a chain walked for an earlier entry.
            if remapped[empty_id] {
                continue;
            }
            // empty states can point to other empty states, forming a chain.
            // So we must follow the chain until the end, which must end at
            // a non-empty state, and therefore, a state that is correctly
            // remapped. We are guaranteed to terminate because our compiler
            // never builds a loop among only empty states.
            let mut new_next = empty_next;
            while let Some(next) = self.states[new_next].goto() {
                new_next = next;
            }
            // 'new_next' is now the first state on the chain for which
            // 'goto()' returns 'None', i.e., the chain's real target.
            remap[empty_id] = remap[new_next];
            remapped[empty_id] = true;

            // Now that we've remapped the main 'empty_id' above, we re-follow
            // the chain from above and remap every empty state we found along
            // the way to our ultimate non-empty target. We are careful to set
            // 'remapped' to true for each such state. We thus will not need
            // to re-compute this chain for any subsequent empty states in
            // 'empties' that are part of this chain.
            let mut next2 = empty_next;
            while let Some(next) = self.states[next2].goto() {
                remap[next2] = remap[new_next];
                remapped[next2] = true;
                next2 = next;
            }
        }
        // Finally remap all of the state IDs.
        nfa.remap(&remap);
        let final_nfa = nfa.into_nfa();
        debug!(
            "NFA compilation via builder complete, \
             final NFA size: {} states, {} bytes on heap, \
             has empty? {:?}, utf8? {:?}",
            final_nfa.states().len(),
            final_nfa.memory_usage(),
            final_nfa.has_empty(),
            final_nfa.is_utf8(),
        );
        Ok(final_nfa)
    }

    /// Start the assembly of a pattern in this NFA.
    ///
    /// Upon success, this returns the identifier for the new pattern.
    /// Identifiers start at `0` and are incremented by 1 for each new pattern.
    ///
    /// It is necessary to call this routine before adding capturing states.
    /// Otherwise, any other NFA state may be added before starting a pattern.
    ///
    /// # Errors
    ///
    /// If the pattern identifier space is exhausted, then this returns an
    /// error.
    ///
    /// # Panics
    ///
    /// If this is called while assembling another pattern (i.e., before
    /// `finish_pattern` is called), then this panics.
    pub fn start_pattern(&mut self) -> Result<PatternID, BuildError> {
        assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");

        // The next pattern ID is the number of patterns started so far.
        let next_index = self.start_pattern.len();
        let pid = match PatternID::new(next_index) {
            Ok(pid) => pid,
            Err(_) => return Err(BuildError::too_many_patterns(next_index)),
        };
        // Reserve this pattern's start state entry with a placeholder. The
        // real start state is written by 'finish_pattern'.
        self.start_pattern.push(StateID::ZERO);
        self.pattern_id = Some(pid);
        Ok(pid)
    }

    /// Finish the assembly of a pattern in this NFA.
    ///
    /// Upon success, this returns the identifier for the new pattern.
    /// Identifiers start at `0` and are incremented by 1 for each new
    /// pattern. This is the same identifier returned by the corresponding
    /// `start_pattern` call.
    ///
    /// Note that `start_pattern` and `finish_pattern` pairs cannot be
    /// interleaved or nested. A correct `finish_pattern` call _always_
    /// corresponds to the most recently called `start_pattern` routine.
    ///
    /// # Errors
    ///
    /// This currently never returns an error, but this is subject to change.
    ///
    /// # Panics
    ///
    /// If this is called without a corresponding `start_pattern` call, then
    /// this panics.
    pub fn finish_pattern(
        &mut self,
        start_id: StateID,
    ) -> Result<PatternID, BuildError> {
        // Taking the ID both checks that a pattern is in progress and marks
        // that pattern as no longer being assembled.
        let pid =
            self.pattern_id.take().expect("must call 'start_pattern' first");
        // Replace the placeholder pushed by 'start_pattern' with the real
        // start state for this pattern.
        self.start_pattern[pid] = start_id;
        Ok(pid)
    }

    /// Returns the pattern identifier of the current pattern.
    ///
    /// # Panics
    ///
    /// If this doesn't occur after a `start_pattern` call and before the
    /// corresponding `finish_pattern` call, then this panics.
    pub fn current_pattern_id(&self) -> PatternID {
        // A pattern ID is only present between 'start_pattern' and
        // 'finish_pattern'.
        match self.pattern_id {
            Some(pid) => pid,
            None => panic!("must call 'start_pattern' first"),
        }
    }

    /// Returns the number of patterns added to this builder so far.
    ///
    /// This counts every pattern for which `start_pattern` has been called,
    /// including one that is still being assembled (i.e., `finish_pattern`
    /// has not yet been called for it).
    pub fn pattern_len(&self) -> usize {
        // One start state entry is recorded per pattern, so the number of
        // entries is the number of patterns.
        let starts = &self.start_pattern;
        starts.len()
    }

    /// Add an "empty" NFA state.
    ///
    /// An "empty" NFA state is a state with a single unconditional epsilon
    /// transition to another NFA state. Such empty states are removed before
    /// building the final [`NFA`] (which has no such "empty" states), but they
    /// can be quite useful in the construction process of an NFA.
    ///
    /// # Errors
    ///
    /// This returns an error if the state identifier space is exhausted, or if
    /// the configured heap size limit has been exceeded.
    pub fn add_empty(&mut self) -> Result<StateID, BuildError> {
        // The state initially points at a dummy target. Callers are expected
        // to redirect it afterwards (e.g., via 'patch').
        let placeholder = StateID::ZERO;
        self.add(State::Empty { next: placeholder })
    }

    /// Add a "union" NFA state.
    ///
    /// A "union" NFA state contains zero or more unconditional epsilon
    /// transitions to other NFA states. The order of these transitions
    /// reflects a priority order where earlier transitions are preferred over
    /// later transitions.
    ///
    /// Callers may provide an empty set of alternates to this method call, and
    /// then later add transitions via `patch`. At final build time, a "union"
    /// state with no alternates is converted to a "fail" state, and a "union"
    /// state with exactly one alternate is treated as if it were an "empty"
    /// state.
    ///
    /// # Errors
    ///
    /// This returns an error if the state identifier space is exhausted, or if
    /// the configured heap size limit has been exceeded.
    pub fn add_union(
        &mut self,
        alternates: Vec<StateID>,
    ) -> Result<StateID, BuildError> {
        // The alternates are stored exactly as given, in priority order.
        let state = State::Union { alternates };
        self.add(state)
    }

    /// Add a "reverse union" NFA state.
    ///
    /// A "reverse union" NFA state contains zero or more unconditional epsilon
    /// transitions to other NFA states. The order of these transitions
    /// reflects a priority order where later transitions are preferred
    /// over earlier transitions. This is an inverted priority order when
    /// compared to `add_union`. This is useful, for example, for implementing
    /// non-greedy repetition operators.
    ///
    /// Callers may provide an empty set of alternates to this method call, and
    /// then later add transitions via `patch`. At final build time, a "reverse
    /// union" state with no alternates is converted to a "fail" state, and a
    /// "reverse union" state with exactly one alternate is treated as if it
    /// were an "empty" state.
    ///
    /// # Errors
    ///
    /// This returns an error if the state identifier space is exhausted, or if
    /// the configured heap size limit has been exceeded.
    pub fn add_union_reverse(
        &mut self,
        alternates: Vec<StateID>,
    ) -> Result<StateID, BuildError> {
        // The alternates are stored exactly as given. The inverted priority
        // is encoded by the state's variant, not by reordering here.
        let state = State::UnionReverse { alternates };
        self.add(state)
    }

    /// Add a "range" NFA state.
    ///
    /// A "range" NFA state is a state with one outgoing transition to another
    /// state, where that transition may only be followed if the current input
    /// byte falls between a range of bytes given.
    ///
    /// # Errors
    ///
    /// This returns an error if the state identifier space is exhausted, or if
    /// the configured heap size limit has been exceeded.
    pub fn add_range(
        &mut self,
        trans: Transition,
    ) -> Result<StateID, BuildError> {
        // A range state is nothing more than its single transition.
        let state = State::ByteRange { trans };
        self.add(state)
    }

    /// Add a "sparse" NFA state.
    ///
    /// A "sparse" NFA state contains zero or more outgoing transitions, where
    /// the transition to be followed (if any) is chosen based on whether the
    /// current input byte falls in the range of one such transition. The
    /// transitions given *must* be non-overlapping and in ascending order. (A
    /// "sparse" state with no transitions is equivalent to a "fail" state.)
    ///
    /// A "sparse" state is like adding a "union" state and pointing it at a
    /// bunch of "range" states, except that the different alternates have
    /// equal priority.
    ///
    /// Note that a "sparse" state is the only state that cannot be patched.
    /// This is because a "sparse" state has many transitions, each of which
    /// may point to a different NFA state. Moreover, adding more such
    /// transitions requires more than just an NFA state ID to point to. It
    /// also requires a byte range. The `patch` routine does not support the
    /// additional information required. Therefore, callers must ensure that
    /// all outgoing transitions for this state are included when `add_sparse`
    /// is called. There is no way to add more later.
    ///
    /// # Errors
    ///
    /// This returns an error if the state identifier space is exhausted, or if
    /// the configured heap size limit has been exceeded.
    ///
    /// # Panics
    ///
    /// This routine _may_ panic if the transitions given overlap or are not
    /// in ascending order.
    pub fn add_sparse(
        &mut self,
        transitions: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        // The transitions are stored as given. This routine itself does not
        // inspect their ordering or check them for overlap.
        let state = State::Sparse { transitions };
        self.add(state)
    }

    /// Add a "look" NFA state.
    ///
    /// A "look" NFA state corresponds to a state with exactly one
    /// *conditional* epsilon transition to another NFA state. Namely, it
    /// represents one of a small set of simplistic look-around operators.
    ///
    /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
    /// and then change it later with [`patch`](Builder::patch).
    ///
    /// # Errors
    ///
    /// This returns an error if the state identifier space is exhausted, or if
    /// the configured heap size limit has been exceeded.
    pub fn add_look(
        &mut self,
        next: StateID,
        look: Look,
    ) -> Result<StateID, BuildError> {
        // 'next' is the target of the conditional epsilon transition that is
        // guarded by the 'look' assertion.
        let state = State::Look { look, next };
        self.add(state)
    }

    /// Add a "start capture" NFA state.
    ///
    /// A "start capture" NFA state corresponds to a state with exactly one
    /// outgoing unconditional epsilon transition to another state. Unlike
    /// "empty" states, a "start capture" state also carries with it an
    /// instruction for saving the current position of input to a particular
    /// location in memory. NFA simulations, like the Pike VM, may use this
    /// information to report the match locations of capturing groups in a
    /// regex pattern.
    ///
    /// If the corresponding capturing group has a name, then callers should
    /// include it here.
    ///
    /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
    /// and then change it later with [`patch`](Builder::patch).
    ///
    /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
    /// end states may be interleaved. Indeed, it is typical for many "start
    /// capture" NFA states to appear before the first "end capture" state.
    ///
    /// # Errors
    ///
    /// This returns an error if the state identifier space is exhausted, or if
    /// the configured heap size limit has been exceeded or if the given
    /// capture index overflows `usize`.
    ///
    /// While the above are the only conditions in which this routine can
    /// currently return an error, it is possible to call this method with
    /// inputs that result in the final `build()` step failing to produce an
    /// NFA. For example, if one adds two distinct capturing groups with the
    /// same name, then that will result in `build()` failing with an error.
    ///
    /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
    /// more information on what qualifies as valid capturing groups.
    ///
    /// # Example
    ///
    /// This example shows that an error occurs when one tries to add multiple
    /// capturing groups with the same name to the same pattern.
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::Builder,
    ///     util::primitives::StateID,
    /// };
    ///
    /// let name = Some(std::sync::Arc::from("foo"));
    /// let mut builder = Builder::new();
    /// builder.start_pattern()?;
    /// // 0th capture group should always be unnamed.
    /// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
    /// // OK
    /// builder.add_capture_start(StateID::ZERO, 1, name.clone())?;
    /// // This is not OK, but 'add_capture_start' still succeeds. We don't
    /// // get an error until we call 'build' below. Without this call, the
    /// // call to 'build' below would succeed.
    /// builder.add_capture_start(StateID::ZERO, 2, name.clone())?;
    /// // Finish our pattern so we can try to build the NFA.
    /// builder.finish_pattern(start)?;
    /// let result = builder.build(start, start);
    /// assert!(result.is_err());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// However, adding multiple capturing groups with the same name to
    /// distinct patterns is okay:
    ///
    /// ```
    /// use std::sync::Arc;
    ///
    /// use regex_automata::{
    ///     nfa::thompson::{pikevm::PikeVM, Builder, Transition},
    ///     util::{
    ///         captures::Captures,
    ///         primitives::{PatternID, StateID},
    ///     },
    ///     Span,
    /// };
    ///
    /// // Hand-compile the patterns '(?P<foo>[a-z])' and '(?P<foo>[A-Z])'.
    /// let mut builder = Builder::new();
    /// // We compile them to support an unanchored search, which requires
    /// // adding an implicit '(?s-u:.)*?' prefix before adding either pattern.
    /// let unanchored_prefix = builder.add_union_reverse(vec![])?;
    /// let any = builder.add_range(Transition {
    ///     start: b'\x00', end: b'\xFF', next: StateID::ZERO,
    /// })?;
    /// builder.patch(unanchored_prefix, any)?;
    /// builder.patch(any, unanchored_prefix)?;
    ///
    /// // Compile an alternation that permits matching multiple patterns.
    /// let alt = builder.add_union(vec![])?;
    /// builder.patch(unanchored_prefix, alt)?;
    ///
    /// // Compile '(?P<foo>[a-z])'.
    /// builder.start_pattern()?;
    /// let start0 = builder.add_capture_start(StateID::ZERO, 0, None)?;
    /// // N.B. 0th capture group must always be unnamed.
    /// let foo_start0 = builder.add_capture_start(
    ///     StateID::ZERO, 1, Some(Arc::from("foo")),
    /// )?;
    /// let lowercase = builder.add_range(Transition {
    ///     start: b'a', end: b'z', next: StateID::ZERO,
    /// })?;
    /// let foo_end0 = builder.add_capture_end(StateID::ZERO, 1)?;
    /// let end0 = builder.add_capture_end(StateID::ZERO, 0)?;
    /// let match0 = builder.add_match()?;
    /// builder.patch(start0, foo_start0)?;
    /// builder.patch(foo_start0, lowercase)?;
    /// builder.patch(lowercase, foo_end0)?;
    /// builder.patch(foo_end0, end0)?;
    /// builder.patch(end0, match0)?;
    /// builder.finish_pattern(start0)?;
    ///
    /// // Compile '(?P<foo>[A-Z])'.
    /// builder.start_pattern()?;
    /// let start1 = builder.add_capture_start(StateID::ZERO, 0, None)?;
    /// // N.B. 0th capture group must always be unnamed.
    /// let foo_start1 = builder.add_capture_start(
    ///     StateID::ZERO, 1, Some(Arc::from("foo")),
    /// )?;
    /// let uppercase = builder.add_range(Transition {
    ///     start: b'A', end: b'Z', next: StateID::ZERO,
    /// })?;
    /// let foo_end1 = builder.add_capture_end(StateID::ZERO, 1)?;
    /// let end1 = builder.add_capture_end(StateID::ZERO, 0)?;
    /// let match1 = builder.add_match()?;
    /// builder.patch(start1, foo_start1)?;
    /// builder.patch(foo_start1, uppercase)?;
    /// builder.patch(uppercase, foo_end1)?;
    /// builder.patch(foo_end1, end1)?;
    /// builder.patch(end1, match1)?;
    /// builder.finish_pattern(start1)?;
    ///
    /// // Now add the patterns to our alternation that we started above.
    /// builder.patch(alt, start0)?;
    /// builder.patch(alt, start1)?;
    ///
    /// // Finally build the NFA. The first argument is the anchored starting
    /// // state (the pattern alternation) whereas the second is the
    /// // unanchored starting state (the unanchored prefix).
    /// let nfa = builder.build(alt, unanchored_prefix)?;
    ///
    /// // Now build a Pike VM from our NFA and access the 'foo' capture
    /// // group regardless of which pattern matched, since it is defined
    /// // for both patterns.
    /// let vm = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = vm.create_cache();
    /// let caps: Vec<Captures> =
    ///     vm.captures_iter(&mut cache, "0123aAaAA").collect();
    /// assert_eq!(5, caps.len());
    ///
    /// assert_eq!(Some(PatternID::must(0)), caps[0].pattern());
    /// assert_eq!(Some(Span::from(4..5)), caps[0].get_group_by_name("foo"));
    ///
    /// assert_eq!(Some(PatternID::must(1)), caps[1].pattern());
    /// assert_eq!(Some(Span::from(5..6)), caps[1].get_group_by_name("foo"));
    ///
    /// assert_eq!(Some(PatternID::must(0)), caps[2].pattern());
    /// assert_eq!(Some(Span::from(6..7)), caps[2].get_group_by_name("foo"));
    ///
    /// assert_eq!(Some(PatternID::must(1)), caps[3].pattern());
    /// assert_eq!(Some(Span::from(7..8)), caps[3].get_group_by_name("foo"));
    ///
    /// assert_eq!(Some(PatternID::must(1)), caps[4].pattern());
    /// assert_eq!(Some(Span::from(8..9)), caps[4].get_group_by_name("foo"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn add_capture_start(
        &mut self,
        next: StateID,
        group_index: u32,
        name: Option<Arc<str>>,
    ) -> Result<StateID, BuildError> {
        let pid = self.current_pattern_id();
        let group_index = SmallIndex::try_from(group_index)
            .map_err(|_| BuildError::invalid_capture_index(group_index))?;
        // Grow the outer vec so that there is an entry (possibly empty) for
        // every pattern up to and including the current one. This gives us
        // somewhere to record our (pid,index)|-->name mapping.
        if self.captures.len() <= pid.as_usize() {
            self.captures.resize(pid.as_usize() + 1, Vec::new());
        }
        // When 'index' is already covered by this pattern's groups, we are
        // looking at a duplicate capture group. That is a bit odd, but it is
        // allowed since a capture group can be repeated in the syntax. For
        // example, '([a-z]){4}' produces 4 capture groups, of which only the
        // last is set at search time when a match occurs. Duplicates need
        // nothing more than the CaptureStart NFA state added below.
        let index = group_index.as_usize();
        let groups = &mut self.captures[pid];
        if index >= groups.len() {
            // Indices may be discontiguous, in which case the groups that
            // were skipped over get unnamed placeholders so that 'name' ends
            // up at position 'index'.
            groups.resize(index, None);
            groups.push(name);
        }
        self.add(State::CaptureStart { pattern_id: pid, group_index, next })
    }

    /// Add an "end capture" NFA state.
    ///
    /// An "end capture" NFA state corresponds to a state with exactly one
    /// outgoing unconditional epsilon transition to another state. Unlike
    /// "empty" states, an "end capture" state also carries with it an
    /// instruction for saving the current position of input to a particular
    /// location in memory. NFA simulations, like the Pike VM, may use this
    /// information to report the match locations of capturing groups in a
    /// regex pattern.
    ///
    /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]),
    /// and then change it later with [`patch`](Builder::patch).
    ///
    /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
    /// end states may be interleaved. Indeed, it is typical for many "start
    /// capture" NFA states to appear before the first "end capture" state.
    ///
    /// # Errors
    ///
    /// This returns an error if the state identifier space is exhausted, or if
    /// the configured heap size limit has been exceeded or if the given
    /// capture index overflows `usize`.
    ///
    /// While the above are the only conditions in which this routine can
    /// currently return an error, it is possible to call this method with
    /// inputs that result in the final `build()` step failing to produce an
    /// NFA. For example, if one adds two distinct capturing groups with the
    /// same name, then that will result in `build()` failing with an error.
    ///
    /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
    /// more information on what qualifies as valid capturing groups.
    pub fn add_capture_end(
        &mut self,
        next: StateID,
        group_index: u32,
    ) -> Result<StateID, BuildError> {
        // Capture states always belong to the pattern being assembled.
        let pattern_id = self.current_pattern_id();
        // Reject indices that cannot be represented as a 'SmallIndex'.
        let group_index = SmallIndex::try_from(group_index)
            .map_err(|_| BuildError::invalid_capture_index(group_index))?;
        let state = State::CaptureEnd { pattern_id, group_index, next };
        self.add(state)
    }

    /// Adds a "fail" NFA state.
    ///
    /// A "fail" state is simply a state that has no outgoing transitions. It
    /// acts as a way to cause a search to stop without reporting a match.
    /// For example, one way to represent an NFA with zero patterns is with a
    /// single "fail" state.
    ///
    /// # Errors
    ///
    /// This returns an error if the state identifier space is exhausted, or if
    /// the configured heap size limit has been exceeded.
    pub fn add_fail(&mut self) -> Result<StateID, BuildError> {
        // A fail state carries no data at all.
        let state = State::Fail;
        self.add(state)
    }

    /// Adds a "match" NFA state.
    ///
    /// A "match" state has no outgoing transitions (just like a "fail"
    /// state), but it has special significance in that if a search enters
    /// this state, then a match has been found. The match state that is added
    /// automatically has the current pattern ID associated with it. This is
    /// used to report the matching pattern ID at search time.
    ///
    /// # Errors
    ///
    /// This returns an error if the state identifier space is exhausted, or if
    /// the configured heap size limit has been exceeded.
    ///
    /// # Panics
    ///
    /// This must be called after a `start_pattern` call but before the
    /// corresponding `finish_pattern` call. Otherwise, it panics.
    pub fn add_match(&mut self) -> Result<StateID, BuildError> {
        // The match state is tagged with whichever pattern is current at the
        // time of the call.
        let state = State::Match { pattern_id: self.current_pattern_id() };
        // `add` already yields `Result<StateID, BuildError>`, so its result
        // is returned as-is.
        self.add(state)
    }

    /// The common implementation of "add a state." It handles the common
    /// error cases of state ID exhaustion (by owning state ID allocation) and
    /// of the configured size limit being exceeded.
    fn add(&mut self, state: State) -> Result<StateID, BuildError> {
        // The ID of the new state is its position in `states`, i.e. the
        // length of the list before the push below.
        let index = self.states.len();
        let id = match StateID::new(index) {
            Ok(id) => id,
            Err(_) => return Err(BuildError::too_many_states(index)),
        };
        // Account for the state's own heap usage before it is moved into the
        // list, then verify the running total against the size limit. Note
        // that the state stays in the list even if that check fails.
        self.memory_states += state.memory_usage();
        self.states.push(state);
        self.check_size_limit().map(|()| id)
    }

    /// Add a transition from one state to another.
    ///
    /// This routine is called "patch" since it is very common to add the
    /// states you want, typically with "dummy" state ID transitions, and then
    /// "patch" in the real state IDs later. This is because you don't always
    /// know all of the necessary state IDs to add because they might not
    /// exist yet.
    ///
    /// # Errors
    ///
    /// This may error if patching leads to an increase in heap usage beyond
    /// the configured size limit. Heap usage only grows when patching adds a
    /// new transition (as in the case of a "union" state).
    ///
    /// # Panics
    ///
    /// This panics if `from` corresponds to a "sparse" state. When "sparse"
    /// states are added, there is no way to patch them after-the-fact. (If you
    /// have a use case where this would be helpful, please file an issue. It
    /// will likely require a new API.)
    pub fn patch(
        &mut self,
        from: StateID,
        to: StateID,
    ) -> Result<(), BuildError> {
        // Snapshot the heap accounting so we can tell afterwards whether this
        // patch grew anything.
        let before = self.memory_states;
        match &mut self.states[from] {
            // States with exactly one outgoing transition: overwrite it.
            State::Empty { next }
            | State::Look { next, .. }
            | State::CaptureStart { next, .. }
            | State::CaptureEnd { next, .. } => *next = to,
            State::ByteRange { trans } => trans.next = to,
            State::Sparse { .. } => {
                panic!("cannot patch from a sparse NFA state")
            }
            // Unions accumulate alternates, which is the only case where
            // patching adds to the tracked heap usage.
            State::Union { alternates }
            | State::UnionReverse { alternates } => {
                alternates.push(to);
                self.memory_states += mem::size_of::<StateID>();
            }
            // No outgoing transitions, so patching is a no-op.
            State::Fail | State::Match { .. } => {}
        }
        // The size limit only needs re-checking if the accounting moved.
        if self.memory_states == before {
            return Ok(());
        }
        self.check_size_limit()
    }

    /// Set whether the NFA produced by this builder should only match UTF-8.
    ///
    /// This should be set when both of the following are true:
    ///
    /// 1. The caller guarantees that the NFA created by this builder will
    /// only report non-empty matches with spans that are valid UTF-8.
    /// 2. The caller desires regex engines using this NFA to avoid reporting
    /// empty matches with a span that splits a valid UTF-8 encoded codepoint.
    ///
    /// Property (1) is not checked. Instead, this requires the caller to
    /// promise that it is true. Property (2) corresponds to the behavior of
    /// regex engines using the NFA created by this builder. Namely, there
    /// is no way in the NFA's graph itself to say that empty matches found
    /// by, for example, the regex `a*` will fall on valid UTF-8 boundaries.
    /// Instead, this option is used to communicate the UTF-8 semantic to regex
    /// engines that will typically implement it as a post-processing step by
    /// filtering out empty matches that don't fall on UTF-8 boundaries.
    ///
    /// If you're building an NFA from an HIR (and not using a
    /// [`thompson::Compiler`](crate::nfa::thompson::Compiler)), then you can
    /// use the [`syntax::Config::utf8`](crate::util::syntax::Config::utf8)
    /// option to guarantee that if the HIR detects a non-empty match, then it
    /// is guaranteed to be valid UTF-8.
    ///
    /// Note that property (2) does *not* specify the behavior of executing
    /// a search on a haystack that is not valid UTF-8. Therefore, if you're
    /// *not* running this NFA on strings that are guaranteed to be valid
    /// UTF-8, you almost certainly do not want to enable this option.
    /// Similarly, if you are running the NFA on strings that *are* guaranteed
    /// to be valid UTF-8, then you almost certainly want to enable this option
    /// unless you can guarantee that your NFA will never produce a zero-width
    /// match.
    ///
    /// It is disabled by default.
    pub fn set_utf8(&mut self, yes: bool) {
        // Only the builder's flag is updated here; no previously added state
        // is inspected or rewritten by this call.
        self.utf8 = yes;
    }

    /// Returns whether UTF-8 mode is enabled for this builder.
    ///
    /// See [`Builder::set_utf8`] for more details about what "UTF-8 mode" is.
    pub fn get_utf8(&self) -> bool {
        // Plain read of the stored flag; it is returned by value.
        self.utf8
    }

    /// Sets whether the NFA produced by this builder should be matched in
    /// reverse or not. Generally speaking, when enabled, the NFA produced
    /// should be matched by moving backwards through a haystack, from a higher
    /// memory address to a lower memory address.
    ///
    /// See also [`NFA::is_reverse`] for more details.
    ///
    /// This is disabled by default, which means NFAs are by default matched
    /// in the forward direction.
    pub fn set_reverse(&mut self, yes: bool) {
        // Only the builder's flag is updated here; no previously added state
        // is inspected or rewritten by this call.
        self.reverse = yes;
    }

    /// Returns whether reverse mode is enabled for this builder.
    ///
    /// See [`Builder::set_reverse`] for more details about what "reverse mode"
    /// is.
    pub fn get_reverse(&self) -> bool {
        // Plain read of the stored flag; it is returned by value.
        self.reverse
    }

    /// Sets the look-around matcher that should be used for the resulting NFA.
    ///
    /// A look-around matcher can be used to configure how look-around
    /// assertions are matched. For example, a matcher might carry
    /// configuration that changes the line terminator used for `(?m:^)` and
    /// `(?m:$)` assertions.
    pub fn set_look_matcher(&mut self, m: LookMatcher) {
        // Takes ownership of `m`; whatever matcher was stored before is
        // replaced by this assignment.
        self.look_matcher = m;
    }

    /// Returns the look-around matcher used for this builder.
    ///
    /// If a matcher was not explicitly set, then `LookMatcher::default()` is
    /// returned.
    pub fn get_look_matcher(&self) -> &LookMatcher {
        // Hands out a borrow of the stored matcher rather than a copy, so
        // the caller decides whether a copy is needed.
        &self.look_matcher
    }

    /// Set the size limit on this builder.
    ///
    /// Setting the size limit will also check whether the NFA built so far
    /// fits within the given size limit. If it doesn't, then an error is
    /// returned.
    ///
    /// By default, there is no configured size limit.
    pub fn set_size_limit(
        &mut self,
        limit: Option<usize>,
    ) -> Result<(), BuildError> {
        // The new limit is stored *before* it is checked, so it remains in
        // effect on this builder even when the check below reports an error.
        self.size_limit = limit;
        // Validate what has been built so far against the new limit.
        self.check_size_limit()
    }

    /// Return the currently configured size limit.
    ///
    /// By default, this returns `None`, which corresponds to no configured
    /// size limit.
    pub fn get_size_limit(&self) -> Option<usize> {
        // Plain read of the stored limit; it is returned by value.
        self.size_limit
    }

    /// Returns the heap memory usage, in bytes, used by the NFA states added
    /// so far.
    ///
    /// Note that this is an approximation of how big the final NFA will be.
    /// In practice, the final NFA will likely be a bit smaller because of
    /// its simpler state representation. (For example, using things like
    /// `Box<[StateID]>` instead of `Vec<StateID>`.)
    pub fn memory_usage(&self) -> usize {
        // Space taken by the `State` values themselves, one slot per state
        // added so far.
        let inline_bytes = self.states.len() * mem::size_of::<State>();
        // Running total maintained by `add` and `patch` for memory owned by
        // individual states.
        let owned_bytes = self.memory_states;
        inline_bytes + owned_bytes
    }

    // Compares the builder's current memory usage against the configured
    // limit. Having no limit configured always succeeds, and usage exactly
    // equal to the limit is still within bounds.
    fn check_size_limit(&self) -> Result<(), BuildError> {
        match self.size_limit {
            Some(limit) if self.memory_usage() > limit => {
                Err(BuildError::exceeded_size_limit(limit))
            }
            _ => Ok(()),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // Pins the size of a builder `State`. Growing it by accident is *really*
    // easy to do, and doing so can dramatically increase the memory used by
    // the NFA builder.
    //
    // This does not mean the size can never change. It can. The assertion
    // only exists so that any change is made knowingly and intentionally.
    //
    // A builder state is unfortunately a little bigger than an NFA state,
    // because we want to support adding things to a pre-existing state. That
    // is, we use Vec<thing> instead of Box<[thing]>, which costs an extra 8
    // bytes per state. Sad, but at least it gets freed once the NFA is built.
    #[test]
    fn state_has_small_size() {
        let expected: Option<usize> = if cfg!(target_pointer_width = "64") {
            Some(32)
        } else if cfg!(target_pointer_width = "32") {
            Some(16)
        } else {
            // No size is pinned for other pointer widths.
            None
        };
        if let Some(expected) = expected {
            assert_eq!(expected, core::mem::size_of::<State>());
        }
    }
}