regexr 0.1.2

A high-performance regex engine built from scratch with JIT compilation and SIMD acceleration
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
//! UTF-8 automata compiler.
//!
//! This module converts Unicode code point ranges into sequences of byte ranges
//! that represent valid UTF-8 encodings. This is essential for matching Unicode
//! character classes like `[α-ω]` or `[\u{100}-\u{1FF}]` in a byte-oriented regex engine.
//!
//! # UTF-8 Encoding Structure
//!
//! | Code Point Range     | Byte 1     | Byte 2     | Byte 3     | Byte 4     |
//! |----------------------|------------|------------|------------|------------|
//! | U+0000..U+007F       | 0xxxxxxx   |            |            |            |
//! | U+0080..U+07FF       | 110xxxxx   | 10xxxxxx   |            |            |
//! | U+0800..U+FFFF       | 1110xxxx   | 10xxxxxx   | 10xxxxxx   |            |
//! | U+10000..U+10FFFF    | 11110xxx   | 10xxxxxx   | 10xxxxxx   | 10xxxxxx   |
//!
//! # Algorithm
//!
//! To match a range of code points, we split the range at UTF-8 encoding boundaries
//! and generate byte range sequences for each sub-range. For example, `[a-ÿ]` (U+0061-U+00FF)
//! becomes:
//! - 1-byte: `[0x61-0x7F]` (U+0061-U+007F)
//! - 2-byte: `[0xC2-0xC3][0x80-0xBF]` (U+0080-U+00FF)

/// A sequence of byte ranges representing a UTF-8 encoded character range.
///
/// Each element is a (min, max) byte range. The sequence length corresponds
/// to the UTF-8 encoding length (1-4 bytes).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Utf8Sequence {
    /// The byte ranges, one per UTF-8 byte position.
    pub ranges: Vec<(u8, u8)>,
}

impl Utf8Sequence {
    /// Creates a new UTF-8 sequence with the given byte ranges.
    pub fn new(ranges: Vec<(u8, u8)>) -> Self {
        Self { ranges }
    }

    /// Returns the number of bytes in this sequence.
    pub fn len(&self) -> usize {
        self.ranges.len()
    }

    /// Returns true if this sequence is empty.
    pub fn is_empty(&self) -> bool {
        self.ranges.is_empty()
    }
}

/// Compiles a Unicode code point range into a set of UTF-8 byte sequences.
///
/// This function takes a range of Unicode code points and returns all the
/// UTF-8 byte sequences needed to match that range. The sequences are
/// returned in order and cover the entire input range without overlap.
///
/// # Example
///
/// ```ignore
/// // Range [a-ÿ] (U+0061-U+00FF)
/// let sequences = compile_utf8_range('a' as u32, 'ÿ' as u32);
/// // Returns:
/// // - 1-byte sequence: [0x61, 0x7F]
/// // - 2-byte sequence: [0xC2, 0xC3] + [0x80, 0xBF]
/// ```
pub fn compile_utf8_range(start: u32, end: u32) -> Vec<Utf8Sequence> {
    if start > end {
        return vec![];
    }

    // Clamp to valid Unicode range
    let start = start.min(0x10FFFF);
    let end = end.min(0x10FFFF);

    let mut sequences = Vec::new();

    // Split the range at UTF-8 encoding boundaries
    let boundaries = [
        0x00,     // 1-byte start
        0x80,     // 2-byte start
        0x800,    // 3-byte start
        0x10000,  // 4-byte start
        0x110000, // End (one past max Unicode)
    ];

    let mut current = start;

    for i in 0..4 {
        let boundary_start = boundaries[i];
        let boundary_end = boundaries[i + 1] - 1;

        if current > boundary_end {
            continue;
        }
        if current > end {
            break;
        }

        let range_start = current.max(boundary_start);
        let range_end = end.min(boundary_end);

        if range_start <= range_end {
            // Generate sequences for this encoding class
            let class_sequences = compile_utf8_class(range_start, range_end, i + 1);
            sequences.extend(class_sequences);
            current = range_end + 1;
        }
    }

    sequences
}

/// Compiles a code point range within a single UTF-8 encoding class.
///
/// `encoding_len` is 1, 2, 3, or 4.
fn compile_utf8_class(start: u32, end: u32, encoding_len: usize) -> Vec<Utf8Sequence> {
    match encoding_len {
        1 => compile_1byte(start, end),
        2 => compile_2byte(start, end),
        3 => compile_3byte(start, end),
        4 => compile_4byte(start, end),
        _ => vec![],
    }
}

/// Compiles 1-byte UTF-8 sequences (U+0000..U+007F).
fn compile_1byte(start: u32, end: u32) -> Vec<Utf8Sequence> {
    debug_assert!(start <= 0x7F && end <= 0x7F);
    vec![Utf8Sequence::new(vec![(start as u8, end as u8)])]
}

/// Compiles 2-byte UTF-8 sequences (U+0080..U+07FF).
///
/// 2-byte encoding: 110xxxxx 10xxxxxx
/// - Byte 1: 0xC0 | (cp >> 6)
/// - Byte 2: 0x80 | (cp & 0x3F)
fn compile_2byte(start: u32, end: u32) -> Vec<Utf8Sequence> {
    debug_assert!(start >= 0x80 && end <= 0x7FF);

    let mut sequences = Vec::new();
    let mut current = start;

    while current <= end {
        // Find the range where byte 1 stays constant
        let byte1 = (0xC0 | (current >> 6)) as u8;

        // Calculate the max code point with the same first byte
        let max_with_same_byte1 = ((current >> 6) << 6) | 0x3F;
        let range_end = end.min(max_with_same_byte1);

        // Byte 2 range
        let byte2_start = (0x80 | (current & 0x3F)) as u8;
        let byte2_end = (0x80 | (range_end & 0x3F)) as u8;

        sequences.push(Utf8Sequence::new(vec![
            (byte1, byte1),
            (byte2_start, byte2_end),
        ]));

        current = range_end + 1;
    }

    sequences
}

/// Compiles 3-byte UTF-8 sequences (U+0800..U+FFFF).
///
/// 3-byte encoding: 1110xxxx 10xxxxxx 10xxxxxx
/// - Byte 1: 0xE0 | (cp >> 12)
/// - Byte 2: 0x80 | ((cp >> 6) & 0x3F)
/// - Byte 3: 0x80 | (cp & 0x3F)
fn compile_3byte(start: u32, end: u32) -> Vec<Utf8Sequence> {
    debug_assert!(start >= 0x800 && end <= 0xFFFF);

    // Skip surrogate range (U+D800..U+DFFF) - these are not valid Unicode scalar values
    let mut sequences = Vec::new();
    let mut current = start;

    while current <= end {
        // Skip surrogates
        if (0xD800..=0xDFFF).contains(&current) {
            current = 0xE000;
            if current > end {
                break;
            }
        }

        let byte1 = (0xE0 | (current >> 12)) as u8;

        // Find max with same byte1
        let max_with_same_byte1 = ((current >> 12) << 12) | 0xFFF;
        let range_end = end.min(max_with_same_byte1);

        // Skip if we'd land in surrogates
        let range_end = if range_end >= 0xD800 && current < 0xD800 {
            0xD7FF
        } else {
            range_end
        };

        if current <= range_end {
            // Now split by byte2
            let sub_sequences = compile_3byte_with_fixed_byte1(current, range_end, byte1);
            sequences.extend(sub_sequences);
        }

        current = range_end + 1;

        // Skip surrogates after the range
        if (0xD800..=0xDFFF).contains(&current) {
            current = 0xE000;
        }
    }

    sequences
}

/// Compiles 3-byte sequences with a fixed first byte.
fn compile_3byte_with_fixed_byte1(start: u32, end: u32, byte1: u8) -> Vec<Utf8Sequence> {
    let mut sequences = Vec::new();
    let mut current = start;

    while current <= end {
        let byte2 = (0x80 | ((current >> 6) & 0x3F)) as u8;

        // Find max with same byte2
        let max_with_same_byte2 = ((current >> 6) << 6) | 0x3F;
        let range_end = end.min(max_with_same_byte2);

        let byte3_start = (0x80 | (current & 0x3F)) as u8;
        let byte3_end = (0x80 | (range_end & 0x3F)) as u8;

        sequences.push(Utf8Sequence::new(vec![
            (byte1, byte1),
            (byte2, byte2),
            (byte3_start, byte3_end),
        ]));

        current = range_end + 1;
    }

    sequences
}

/// Compiles 4-byte UTF-8 sequences (U+10000..U+10FFFF).
///
/// 4-byte encoding: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
/// - Byte 1: 0xF0 | (cp >> 18)
/// - Byte 2: 0x80 | ((cp >> 12) & 0x3F)
/// - Byte 3: 0x80 | ((cp >> 6) & 0x3F)
/// - Byte 4: 0x80 | (cp & 0x3F)
fn compile_4byte(start: u32, end: u32) -> Vec<Utf8Sequence> {
    debug_assert!(start >= 0x10000 && end <= 0x10FFFF);

    let mut sequences = Vec::new();
    let mut current = start;

    while current <= end {
        let byte1 = (0xF0 | (current >> 18)) as u8;

        // Find max with same byte1
        let max_with_same_byte1 = ((current >> 18) << 18) | 0x3FFFF;
        let range_end = end.min(max_with_same_byte1);

        let sub_sequences = compile_4byte_with_fixed_byte1(current, range_end, byte1);
        sequences.extend(sub_sequences);

        current = range_end + 1;
    }

    sequences
}

/// Compiles 4-byte sequences with a fixed first byte.
fn compile_4byte_with_fixed_byte1(start: u32, end: u32, byte1: u8) -> Vec<Utf8Sequence> {
    let mut sequences = Vec::new();
    let mut current = start;

    while current <= end {
        let byte2 = (0x80 | ((current >> 12) & 0x3F)) as u8;

        // Find max with same byte2
        let max_with_same_byte2 = ((current >> 12) << 12) | 0xFFF;
        let range_end = end.min(max_with_same_byte2);

        let sub_sequences = compile_4byte_with_fixed_byte12(current, range_end, byte1, byte2);
        sequences.extend(sub_sequences);

        current = range_end + 1;
    }

    sequences
}

/// Compiles 4-byte sequences with fixed first and second bytes.
fn compile_4byte_with_fixed_byte12(
    start: u32,
    end: u32,
    byte1: u8,
    byte2: u8,
) -> Vec<Utf8Sequence> {
    let mut sequences = Vec::new();
    let mut current = start;

    while current <= end {
        let byte3 = (0x80 | ((current >> 6) & 0x3F)) as u8;

        // Find max with same byte3
        let max_with_same_byte3 = ((current >> 6) << 6) | 0x3F;
        let range_end = end.min(max_with_same_byte3);

        let byte4_start = (0x80 | (current & 0x3F)) as u8;
        let byte4_end = (0x80 | (range_end & 0x3F)) as u8;

        sequences.push(Utf8Sequence::new(vec![
            (byte1, byte1),
            (byte2, byte2),
            (byte3, byte3),
            (byte4_start, byte4_end),
        ]));

        current = range_end + 1;
    }

    sequences
}

/// Encodes a single code point to its UTF-8 byte sequence.
///
/// Returns `None` for invalid code points (surrogates or out of range).
pub fn encode_code_point(cp: u32) -> Option<Vec<u8>> {
    if cp > 0x10FFFF || (0xD800..=0xDFFF).contains(&cp) {
        return None;
    }

    let c = char::from_u32(cp)?;
    let mut buf = [0u8; 4];
    let s = c.encode_utf8(&mut buf);
    Some(s.as_bytes().to_vec())
}

/// Optimizes a list of UTF-8 sequences by merging adjacent ranges where possible.
///
/// This can significantly reduce the number of NFA states needed.
pub fn optimize_sequences(sequences: Vec<Utf8Sequence>) -> Vec<Utf8Sequence> {
    if sequences.len() <= 1 {
        return sequences;
    }

    let mut optimized = Vec::new();
    let mut i = 0;

    while i < sequences.len() {
        let mut current = sequences[i].clone();

        // Try to merge with subsequent sequences of the same length
        while i + 1 < sequences.len() {
            let next = &sequences[i + 1];

            if let Some(merged) = try_merge_sequences(&current, next) {
                current = merged;
                i += 1;
            } else {
                break;
            }
        }

        optimized.push(current);
        i += 1;
    }

    optimized
}

/// Computes the complement of sorted, non-overlapping code point ranges.
///
/// The complement includes all valid Unicode scalar values NOT in the input ranges.
/// Invalid code points (surrogates U+D800-U+DFFF) are automatically excluded.
///
/// # Arguments
///
/// * `ranges` - Sorted, non-overlapping code point ranges to exclude
///
/// # Returns
///
/// A vector of code point ranges representing the complement.
fn complement_code_point_ranges(ranges: &[(u32, u32)]) -> Vec<(u32, u32)> {
    if ranges.is_empty() {
        // If no ranges to exclude, return full Unicode range minus surrogates
        return vec![(0x0000, 0xD7FF), (0xE000, 0x10FFFF)];
    }

    let mut complement = Vec::new();
    let mut current = 0u32;

    for &(start, end) in ranges {
        // Clamp to valid Unicode range
        let start = start.min(0x10FFFF);
        let end = end.min(0x10FFFF);

        // Skip invalid ranges
        if start > end || start > 0x10FFFF {
            continue;
        }

        // Add gap before this range
        if current < start {
            // Need to handle surrogate split
            if current <= 0xD7FF && start > 0xD7FF {
                // Gap crosses surrogate boundary
                if current <= 0xD7FF {
                    complement.push((current, 0xD7FF.min(start.saturating_sub(1))));
                }
                if start > 0xDFFF {
                    complement.push((0xE000.max(current), start.saturating_sub(1)));
                } else if (0xD800..=0xDFFF).contains(&start) {
                    // start is in surrogate range, skip to after
                    current = 0xE000;
                    if current < start {
                        complement.push((current, start.saturating_sub(1)));
                    }
                }
            } else if (0xD800..=0xDFFF).contains(&current) {
                // Current is in surrogate range, skip to after
                current = 0xE000;
                if current < start {
                    complement.push((current, start.saturating_sub(1)));
                }
            } else if (0xD800..=0xDFFF).contains(&start) {
                // Start is in surrogate range
                if current < 0xD800 {
                    complement.push((current, 0xD7FF));
                }
                // current will be set to end+1 below, no need to set here
            } else {
                // Simple gap, no surrogate issues
                complement.push((current, start.saturating_sub(1)));
            }
        }

        // Move past this range
        current = end.saturating_add(1);

        // Skip surrogates if we land in them
        if (0xD800..=0xDFFF).contains(&current) {
            current = 0xE000;
        }
    }

    // Add final gap to end of Unicode
    if current <= 0x10FFFF {
        if current <= 0xD7FF {
            complement.push((current, 0xD7FF));
            if 0xE000 <= 0x10FFFF {
                complement.push((0xE000, 0x10FFFF));
            }
        } else if current >= 0xE000 {
            complement.push((current, 0x10FFFF));
        }
    }

    complement
}

/// Computes the UTF-8 sequences that match the complement of the given code point ranges.
///
/// The complement includes all valid Unicode scalar values NOT in the input ranges.
/// Invalid code points (surrogates) are automatically excluded.
///
/// # Arguments
///
/// * `ranges` - Code point ranges to exclude (does not need to be sorted)
///
/// # Returns
///
/// A vector of UTF-8 sequences representing all valid characters NOT in the input ranges.
///
/// # Example
///
/// ```ignore
/// // Complement of Greek lowercase [α-ω] (U+03B1-U+03C9)
/// let complement = compile_utf8_complement(&[(0x03B1, 0x03C9)]);
/// // Returns sequences matching everything except Greek lowercase
/// ```
pub fn compile_utf8_complement(ranges: &[(u32, u32)]) -> Vec<Utf8Sequence> {
    // Sort and merge input ranges
    let mut sorted_ranges: Vec<(u32, u32)> = ranges.to_vec();
    sorted_ranges.sort_by_key(|r| r.0);

    // Merge overlapping ranges
    let mut merged = Vec::new();
    for range in sorted_ranges {
        if merged.is_empty() {
            merged.push(range);
        } else {
            let last = merged.last_mut().unwrap();
            if range.0 <= last.1.saturating_add(1) {
                // Overlapping or adjacent, merge
                last.1 = last.1.max(range.1);
            } else {
                merged.push(range);
            }
        }
    }

    // Compute complement ranges
    let complement_ranges = complement_code_point_ranges(&merged);

    // Convert each complement range to UTF-8 sequences
    let mut sequences = Vec::new();
    for (start, end) in complement_ranges {
        sequences.extend(compile_utf8_range(start, end));
    }

    // Optimize the result
    optimize_sequences(sequences)
}

/// Tries to merge two UTF-8 sequences if they differ only in the last byte range
/// and those ranges are adjacent.
fn try_merge_sequences(a: &Utf8Sequence, b: &Utf8Sequence) -> Option<Utf8Sequence> {
    if a.len() != b.len() || a.is_empty() {
        return None;
    }

    let n = a.len();

    // All bytes except the last must be identical
    for i in 0..n - 1 {
        if a.ranges[i] != b.ranges[i] {
            return None;
        }
    }

    // Last byte ranges must be adjacent
    let (a_start, a_end) = a.ranges[n - 1];
    let (b_start, b_end) = b.ranges[n - 1];

    if a_end.checked_add(1) == Some(b_start) {
        let mut merged_ranges = a.ranges[..n - 1].to_vec();
        merged_ranges.push((a_start, b_end));
        Some(Utf8Sequence::new(merged_ranges))
    } else {
        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_encode_code_point_ascii() {
        assert_eq!(encode_code_point(0x41), Some(vec![0x41])); // 'A'
        assert_eq!(encode_code_point(0x7F), Some(vec![0x7F]));
    }

    #[test]
    fn test_encode_code_point_2byte() {
        // U+00E9 = 'é' = 0xC3 0xA9
        assert_eq!(encode_code_point(0xE9), Some(vec![0xC3, 0xA9]));
        // U+00FF = 'ÿ' = 0xC3 0xBF
        assert_eq!(encode_code_point(0xFF), Some(vec![0xC3, 0xBF]));
    }

    #[test]
    fn test_encode_code_point_3byte() {
        // U+3042 = 'あ' = 0xE3 0x81 0x82
        assert_eq!(encode_code_point(0x3042), Some(vec![0xE3, 0x81, 0x82]));
    }

    #[test]
    fn test_encode_code_point_4byte() {
        // U+1F600 = '😀' = 0xF0 0x9F 0x98 0x80
        assert_eq!(
            encode_code_point(0x1F600),
            Some(vec![0xF0, 0x9F, 0x98, 0x80])
        );
    }

    #[test]
    fn test_encode_code_point_invalid() {
        // Surrogates are invalid
        assert_eq!(encode_code_point(0xD800), None);
        assert_eq!(encode_code_point(0xDFFF), None);
        // Out of range
        assert_eq!(encode_code_point(0x110000), None);
    }

    #[test]
    fn test_compile_1byte_range() {
        let seqs = compile_utf8_range(0x41, 0x5A); // A-Z
        assert_eq!(seqs.len(), 1);
        assert_eq!(seqs[0].ranges, vec![(0x41, 0x5A)]);
    }

    #[test]
    fn test_compile_2byte_range() {
        // U+0080 to U+00BF (first 64 2-byte characters)
        let seqs = compile_utf8_range(0x80, 0xBF);
        // All have first byte 0xC2
        assert_eq!(seqs.len(), 1);
        assert_eq!(seqs[0].ranges[0], (0xC2, 0xC2));
        assert_eq!(seqs[0].ranges[1], (0x80, 0xBF));
    }

    #[test]
    fn test_compile_cross_boundary_1_2() {
        // Range crossing 1-byte and 2-byte boundary
        let seqs = compile_utf8_range(0x61, 0xFF); // 'a' to 'ÿ'

        // Should have 1-byte part and 2-byte part
        assert!(seqs.len() >= 2);

        // First sequence should be 1-byte (a-DEL area)
        assert_eq!(seqs[0].len(), 1);
        assert_eq!(seqs[0].ranges[0], (0x61, 0x7F));

        // Remaining sequences should be 2-byte
        for seq in &seqs[1..] {
            assert_eq!(seq.len(), 2);
        }
    }

    #[test]
    fn test_compile_greek_letters() {
        // Greek lowercase: α (U+03B1) to ω (U+03C9)
        let seqs = compile_utf8_range(0x03B1, 0x03C9);

        // All should be 2-byte sequences
        for seq in &seqs {
            assert_eq!(seq.len(), 2);
            // First byte should be 0xCE or 0xCF
            assert!(seq.ranges[0].0 >= 0xCE && seq.ranges[0].1 <= 0xCF);
        }
    }

    #[test]
    fn test_compile_cjk() {
        // CJK Unified Ideographs: U+4E00 to U+4E0F (first 16)
        let seqs = compile_utf8_range(0x4E00, 0x4E0F);

        // All should be 3-byte sequences
        for seq in &seqs {
            assert_eq!(seq.len(), 3);
        }
    }

    #[test]
    fn test_compile_emoji() {
        // Emoji range: U+1F600 to U+1F60F (first 16 emoji faces)
        let seqs = compile_utf8_range(0x1F600, 0x1F60F);

        // All should be 4-byte sequences
        for seq in &seqs {
            assert_eq!(seq.len(), 4);
        }
    }

    #[test]
    fn test_surrogate_skip() {
        // Range that includes surrogates should skip them
        let seqs = compile_utf8_range(0xD700, 0xE000);

        // Should have sequences for U+D700-U+D7FF and U+E000
        // but nothing in U+D800-U+DFFF
        for seq in &seqs {
            // Verify no sequence would match surrogate-like bytes
            // (This is implicit in the algorithm)
            assert!(seq.len() >= 2);
        }
    }

    #[test]
    fn test_optimize_sequences() {
        let seqs = vec![
            Utf8Sequence::new(vec![(0xC2, 0xC2), (0x80, 0x8F)]),
            Utf8Sequence::new(vec![(0xC2, 0xC2), (0x90, 0x9F)]),
            Utf8Sequence::new(vec![(0xC2, 0xC2), (0xA0, 0xAF)]),
        ];

        let optimized = optimize_sequences(seqs);

        // Should merge into one sequence
        assert_eq!(optimized.len(), 1);
        assert_eq!(optimized[0].ranges, vec![(0xC2, 0xC2), (0x80, 0xAF)]);
    }

    #[test]
    fn test_full_unicode_range() {
        // Test the entire valid Unicode range
        let seqs = compile_utf8_range(0, 0x10FFFF);

        // Should have sequences covering all 4 encoding lengths
        let has_1byte = seqs.iter().any(|s| s.len() == 1);
        let has_2byte = seqs.iter().any(|s| s.len() == 2);
        let has_3byte = seqs.iter().any(|s| s.len() == 3);
        let has_4byte = seqs.iter().any(|s| s.len() == 4);

        assert!(has_1byte);
        assert!(has_2byte);
        assert!(has_3byte);
        assert!(has_4byte);
    }

    #[test]
    fn test_single_code_point() {
        // Single code point should produce exactly one sequence
        let seqs = compile_utf8_range(0x03B1, 0x03B1); // Just 'α'
        assert_eq!(seqs.len(), 1);
        assert_eq!(seqs[0].len(), 2);
    }

    #[test]
    fn test_complement_ascii() {
        // [^a-z] should match everything except a-z
        let complement = compile_utf8_complement(&[(0x61, 0x7A)]);

        // Should have sequences for:
        // 1. 0x00-0x60 (single byte)
        // 2. 0x7B-0x7F (single byte)
        // 3. All 2-byte UTF-8 (U+0080-U+07FF)
        // 4. All 3-byte UTF-8 (U+0800-U+FFFF, minus surrogates)
        // 5. All 4-byte UTF-8 (U+10000-U+10FFFF)

        // Verify we have different sequence lengths
        let has_1byte = complement.iter().any(|s| s.len() == 1);
        let has_2byte = complement.iter().any(|s| s.len() == 2);
        let has_3byte = complement.iter().any(|s| s.len() == 3);
        let has_4byte = complement.iter().any(|s| s.len() == 4);

        assert!(has_1byte, "Should have 1-byte sequences");
        assert!(has_2byte, "Should have 2-byte sequences");
        assert!(has_3byte, "Should have 3-byte sequences");
        assert!(has_4byte, "Should have 4-byte sequences");
    }

    #[test]
    fn test_complement_greek() {
        // [^α-ω] should match everything except Greek lowercase (U+03B1-U+03C9)
        let complement = compile_utf8_complement(&[(0x03B1, 0x03C9)]);

        // Should include ASCII, other 2-byte, all 3-byte, all 4-byte
        let has_1byte = complement.iter().any(|s| s.len() == 1);
        let has_2byte = complement.iter().any(|s| s.len() == 2);
        let has_3byte = complement.iter().any(|s| s.len() == 3);
        let has_4byte = complement.iter().any(|s| s.len() == 4);

        assert!(has_1byte, "Should have ASCII");
        assert!(has_2byte, "Should have other 2-byte sequences");
        assert!(has_3byte, "Should have 3-byte sequences");
        assert!(has_4byte, "Should have 4-byte sequences");
    }

    #[test]
    fn test_complement_excludes_surrogates() {
        // Complement of empty set should give full Unicode minus surrogates
        let complement = compile_utf8_complement(&[]);

        // Full range is 0x0000-0xD7FF and 0xE000-0x10FFFF
        // The complement should cover all valid Unicode

        // Verify we never generate sequences for surrogates (U+D800-U+DFFF)
        // Surrogates would be 3-byte: 0xED 0xA0-0xBF 0x80-0xBF
        for seq in &complement {
            if seq.len() == 3 {
                let (b1_start, b1_end) = seq.ranges[0];
                if b1_start == 0xED && b1_end == 0xED {
                    let (b2_start, _) = seq.ranges[1];
                    // If first byte is 0xED, second byte should not start at 0xA0 or higher
                    // (which would enter surrogate territory)
                    assert!(b2_start < 0xA0, "Should not generate surrogate sequences");
                }
            }
        }
    }

    #[test]
    fn test_complement_single_char() {
        // [^α] should match everything except α (U+03B1)
        let complement = compile_utf8_complement(&[(0x03B1, 0x03B1)]);

        // Should have many sequences
        assert!(!complement.is_empty());

        // Should have all encoding lengths
        let has_1byte = complement.iter().any(|s| s.len() == 1);
        let has_2byte = complement.iter().any(|s| s.len() == 2);
        let has_3byte = complement.iter().any(|s| s.len() == 3);
        let has_4byte = complement.iter().any(|s| s.len() == 4);

        assert!(has_1byte);
        assert!(has_2byte);
        assert!(has_3byte);
        assert!(has_4byte);
    }

    #[test]
    fn test_complement_multiple_ranges() {
        // [^a-zA-Z] should match everything except ASCII letters
        let complement = compile_utf8_complement(&[(0x41, 0x5A), (0x61, 0x7A)]);

        // Should have single-byte sequences for other ASCII
        let has_1byte = complement.iter().any(|s| s.len() == 1);
        assert!(has_1byte);

        // Should also have multi-byte for all Unicode
        let has_2byte = complement.iter().any(|s| s.len() == 2);
        let has_3byte = complement.iter().any(|s| s.len() == 3);
        let has_4byte = complement.iter().any(|s| s.len() == 4);

        assert!(has_2byte);
        assert!(has_3byte);
        assert!(has_4byte);
    }

    #[test]
    fn test_complement_cjk() {
        // [^一-龥] should match everything except CJK Unified Ideographs (U+4E00-U+9FFF)
        let complement = compile_utf8_complement(&[(0x4E00, 0x9FFF)]);

        // Should include ASCII and other ranges
        let has_1byte = complement.iter().any(|s| s.len() == 1);
        let has_2byte = complement.iter().any(|s| s.len() == 2);
        let has_3byte = complement.iter().any(|s| s.len() == 3);
        let has_4byte = complement.iter().any(|s| s.len() == 4);

        assert!(has_1byte);
        assert!(has_2byte);
        assert!(has_3byte);
        assert!(has_4byte);
    }

    #[test]
    fn test_complement_emoji() {
        // [^😀-😂] should match everything except a small emoji range
        let complement = compile_utf8_complement(&[(0x1F600, 0x1F602)]);

        // Should have all encoding lengths
        let has_1byte = complement.iter().any(|s| s.len() == 1);
        let has_2byte = complement.iter().any(|s| s.len() == 2);
        let has_3byte = complement.iter().any(|s| s.len() == 3);
        let has_4byte = complement.iter().any(|s| s.len() == 4);

        assert!(has_1byte);
        assert!(has_2byte);
        assert!(has_3byte);
        assert!(has_4byte);
    }

    #[test]
    fn test_complement_boundary_cases() {
        // Test complement at encoding boundaries
        // Range crossing 1-byte/2-byte boundary
        let complement = compile_utf8_complement(&[(0x7F, 0x80)]);
        assert!(!complement.is_empty());

        // Range crossing 2-byte/3-byte boundary
        let complement = compile_utf8_complement(&[(0x7FF, 0x800)]);
        assert!(!complement.is_empty());

        // Range crossing 3-byte/4-byte boundary
        let complement = compile_utf8_complement(&[(0xFFFF, 0x10000)]);
        assert!(!complement.is_empty());
    }
}