multimatch 0.1.1

Multi-pattern matching engine — Aho-Corasick + regex with optional Hyperscan SIMD acceleration
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
//! Adversarial tests designed to expose edge cases, performance issues,
//! and correctness bugs in the multimatch engine.
//!
//! These tests intentionally push boundaries: empty inputs, massive inputs,
//! pathological patterns, encoding edge cases, and stress scenarios.

#[cfg(test)]
mod tests {
    use crate::{PatternSet, Scanner};

    // =========================================================================
    // Test 1: Empty input variations
    // =========================================================================
    
    /// An empty literal scanned against empty input is expected to produce
    /// exactly one zero-length match at offset 0.
    #[test]
    fn adversarial_empty_input_empty_pattern() {
        // The only registered pattern is the empty literal.
        let builder = PatternSet::builder().add_literal("", 0);
        let ps = builder.build().unwrap();

        let found = ps.scan(b"");
        assert_eq!(found.len(), 1, "Empty pattern should match empty input once");

        // The single hit must be the empty span [0, 0) for pattern 0.
        let only = &found[0];
        assert_eq!(only.pattern_id, 0);
        assert_eq!(only.start, 0);
        assert_eq!(only.end, 0);
    }

    /// An empty regex is expected to yield a zero-width match at every offset
    /// of the input, including the offset one past the last byte.
    #[test]
    fn adversarial_empty_regex_pattern() {
        let ps = PatternSet::builder().add_regex("", 0).build().unwrap();

        // "abc" exposes four offsets: 0, 1, 2 and 3 (end of input).
        let hit_count = ps.scan(b"abc").len();
        assert_eq!(hit_count, 4, "Empty regex should match at 4 positions in 'abc'");
    }

    // =========================================================================
    // Test 2: 1-byte input stress test
    // =========================================================================

    /// Scans a one-byte input with a mix of patterns, one of which is longer
    /// than the input itself.
    #[test]
    fn adversarial_single_byte_input() {
        let ps = PatternSet::builder()
            .add_literal("abc", 0)
            .add_literal("x", 1)
            .add_regex(r"\w+", 2)
            .add_regex(r".", 3)
            .build()
            .unwrap();

        let found = ps.scan(b"x");
        // True when at least one reported match carries the given pattern id.
        let hit = |id: usize| found.iter().any(|m| m.pattern_id == id);

        // The three-byte literal cannot fit in a one-byte input.
        assert!(!hit(0), "'abc' should not match single byte 'x'");

        // Every other pattern accepts the single byte 'x'.
        assert!(hit(1), "'x' should match 'x'");
        assert!(hit(2), r"'\w+' should match 'x'");
        assert!(hit(3), "'.' should match 'x'");
    }

    /// The input is a single `.` byte, which is itself a regex metacharacter.
    #[test]
    fn adversarial_single_byte_metachar() {
        let ps = PatternSet::builder()
            .add_regex(r"\.", 0) // escaped: a literal dot
            .add_regex(r".", 1) // unescaped: any character
            .build()
            .unwrap();

        let found = ps.scan(b".");
        let hit = |id: usize| found.iter().any(|m| m.pattern_id == id);

        // Both the escaped and the wildcard form accept a literal dot.
        assert!(hit(0), r"'\.' should match '.'");
        assert!(hit(1), "'.' should match '.'");
    }

    // =========================================================================
    // Test 3: Large input (10MB) - performance stress test
    // =========================================================================

    /// Scans 10 MiB of `a` bytes with the literal `a`, so a match is expected
    /// at every byte (a memory and throughput stress case).
    #[test]
    fn adversarial_large_input_frequent_matches() {
        const TEN_MIB: usize = 10 * 1024 * 1024;
        let haystack = vec![b'a'; TEN_MIB];

        // A one-byte literal that occurs at every offset of the haystack.
        let ps = PatternSet::builder().add_literal("a", 0).build().unwrap();

        let found = ps.scan(&haystack);

        // One match per input byte: roughly ten million results.
        assert_eq!(found.len(), haystack.len(), "Should have one match per byte");

        // Spot-check the first and last reported offsets.
        let first = &found[0];
        assert_eq!(first.start, 0);
        let last = &found[found.len() - 1];
        assert_eq!(last.start, haystack.len() - 1);
    }

    /// Scans a large input (5 MiB, kept below 10 MiB to avoid timeouts) with
    /// the match-everything regex `.*`.
    #[test]
    fn adversarial_large_input_regex_match_all() {
        let haystack = vec![b'x'; 5 * 1024 * 1024];

        let ps = PatternSet::builder().add_regex(r".*", 0).build().unwrap();

        let found = ps.scan(&haystack);
        assert!(!found.is_empty(), ".* should match");

        // The first reported match is expected to span the entire input.
        let leading = &found[0];
        assert_eq!(leading.start, 0);
        assert_eq!(leading.end, haystack.len());
    }

    // =========================================================================
    // Test 4: Regex that matches everything (pathological patterns)
    // =========================================================================

    /// Zero-width assertions (`^`, `$`, `\b`) registered as standalone patterns.
    #[test]
    fn adversarial_regex_anchors_and_boundaries() {
        let ps = PatternSet::builder()
            .add_regex(r"^", 0) // start of input
            .add_regex(r"$", 1) // end of input
            .add_regex(r"\b", 2) // word boundary
            .build()
            .unwrap();

        let found = ps.scan(b"ab");
        // True when pattern `id` reported a match beginning at byte `pos`.
        let starts_at = |id: usize, pos: usize| {
            found.iter().any(|m| m.pattern_id == id && m.start == pos)
        };

        // `^` is an empty match at offset 0.
        assert!(starts_at(0, 0), "^ should match at start");

        // `$` is an empty match at offset 2, the end of "ab".
        assert!(starts_at(1, 2), "$ should match at end");

        // "ab" has word boundaries at offsets 0 and 2; at least one must be reported.
        let saw_boundary = found.iter().any(|m| m.pattern_id == 2);
        assert!(saw_boundary, r"'\b' should find word boundaries");
    }

    /// Documents a known limitation: look-ahead syntax is not supported by the
    /// underlying regex engine, so building such a pattern must return an error.
    #[test]
    fn adversarial_regex_lookahead_not_supported() {
        // `(?=\w)` is a zero-width look-ahead.
        let build_outcome = PatternSet::builder().add_regex(r"(?=\w)", 0).build();

        // The build is expected to be rejected rather than panic or succeed.
        assert!(build_outcome.is_err(), "Lookahead regex should fail to compile");
    }

    /// A nested-quantifier regex that is exponential in naive backtracking
    /// engines must still finish quickly and report no match.
    #[test]
    fn adversarial_regex_catastrophic_backtracking() {
        let ps = PatternSet::builder()
            .add_regex(r"(a+)+b", 0)
            .build()
            .unwrap();

        // A long run of 'a' terminated by 'c': the required 'b' never appears,
        // which is the classic trigger for exponential backtracking.
        let haystack = b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaac";

        let timer = std::time::Instant::now();
        let found = ps.scan(haystack);
        let took = timer.elapsed();

        // Generous wall-clock bound; the scan is expected to be near-instant.
        assert!(
            took < std::time::Duration::from_secs(5),
            "Should not hang on backtracking pattern"
        );
        assert!(found.is_empty(), "Pattern should not match");
    }

    // =========================================================================
    // Test 5: Overlapping literal + regex patterns
    // =========================================================================

    /// The same text registered once as a literal and once as a regex must be
    /// reported by both, with identical offsets.
    #[test]
    fn adversarial_overlapping_literal_regex_same() {
        let ps = PatternSet::builder()
            .add_literal("test", 0)
            .add_regex(r"test", 1)
            .build()
            .unwrap();

        let found = ps.scan(b"this is a test string");
        // First reported match for a given pattern id, if any.
        let first_with = |id: usize| found.iter().find(|m| m.pattern_id == id);

        let literal_hit = first_with(0);
        let regex_hit = first_with(1);

        assert!(literal_hit.is_some(), "Literal should match");
        assert!(regex_hit.is_some(), "Regex should match");

        // Both engines must agree on where the occurrence starts and ends.
        let (lit, re) = (literal_hit.unwrap(), regex_hit.unwrap());
        assert_eq!(lit.start, re.start);
        assert_eq!(lit.end, re.end);
    }

    /// Patterns whose matches overlap in the input must all be reported;
    /// no pattern suppresses another.
    #[test]
    fn adversarial_overlapping_priority() {
        let ps = PatternSet::builder()
            .add_literal("password", 0)
            .add_literal("pass", 1)
            .add_regex(r"pass\w+", 2)
            .build()
            .unwrap();

        let found = ps.scan(b"mypassword123");
        let hit = |id: usize| found.iter().any(|m| m.pattern_id == id);

        // All three patterns cover overlapping parts of "password123".
        assert!(hit(0), "'password' should match inside 'mypassword123'");
        assert!(hit(1), "'pass' should match");
        assert!(hit(2), r"'pass\w+' should match");
    }

    // =========================================================================
    // Test 6: Null byte patterns and input
    // =========================================================================

    /// A literal with an embedded NUL byte must be matched like any other byte
    /// sequence.
    #[test]
    fn adversarial_null_byte_pattern() {
        // Nine bytes: p, a, s, s, NUL, w, o, r, d.
        let ps = PatternSet::builder()
            .add_literal("pass\0word", 0)
            .build()
            .unwrap();

        let found = ps.scan(b"my pass\0word is secret");
        assert_eq!(found.len(), 1, "Pattern with null byte should match");

        let hit = &found[0];
        assert_eq!(hit.start, 3); // right after "my "
        assert_eq!(hit.end, 12); // 3 + 9 pattern bytes
    }

    /// Scans 100 NUL bytes with a two-NUL literal and a `\x00+` regex.
    ///
    /// The literal is expected at every overlapping offset; the regex only has
    /// to be reported at least once.
    #[test]
    fn adversarial_null_byte_input() {
        let ps = PatternSet::builder()
            .add_literal("\0\0", 0)
            .add_regex(r"\x00+", 1)
            .build()
            .unwrap();

        let input = vec![0u8; 100];
        let matches = ps.scan(&input);

        // A 2-byte literal fits at 99 distinct (overlapping) offsets in 100 bytes.
        let literal_count = matches.iter().filter(|m| m.pattern_id == 0).count();
        // Raw string so the failure message shows the escape text rather than
        // embedding two real NUL bytes in the output.
        assert_eq!(literal_count, 99, r"'\0\0' should match 99 times with overlap");

        // The regex covers the whole run; only its presence is asserted here.
        let regex_hit = matches.iter().any(|m| m.pattern_id == 1);
        assert!(regex_hit, r"'\x00+' should match");
    }

    /// NUL bytes interleaved with ordinary bytes, registered both as a literal
    /// and as the equivalent regex.
    #[test]
    fn adversarial_mixed_null_bytes() {
        let builder = PatternSet::builder()
            .add_literal("a\0b\0c", 0)
            .add_regex(r"a\x00b\x00c", 1);
        let ps = builder.build().unwrap();

        // The haystack contains the sequence exactly once, so one hit per pattern.
        let total_hits = ps.scan(b"xyz a\0b\0c def").len();
        assert_eq!(total_hits, 2, "Both patterns should match");
    }

    // =========================================================================
    // Test 7: Case sensitivity edge cases
    // =========================================================================

    /// The Turkish "I problem": case-insensitive `I`/`i` patterns scanned
    /// against the dotted capital `İ` and against a plain capital `I`.
    #[test]
    fn adversarial_turkish_i_case() {
        let ps = PatternSet::builder()
            .add_literal_ci("I", 0)
            .add_literal_ci("i", 1)
            .build()
            .unwrap();

        // Capital I with dot above.
        let dotted_hits = ps.scan("İ".as_bytes());

        // Simple case folding is assumed here; relating 'i' to 'İ' would need
        // full/Turkic folding, so pattern 1 must stay silent.
        let lower_i_matched = dotted_hits.iter().any(|m| m.pattern_id == 1);
        assert!(!lower_i_matched, "'i' should not match 'İ' with simple case folding");

        // A plain capital I must be found by the case-insensitive "I" pattern.
        let plain_hits = ps.scan("I".as_bytes());
        let upper_i_matched = plain_hits.iter().any(|m| m.pattern_id == 0);
        assert!(upper_i_matched, "'I' should match 'I'");
    }

    /// Fullwidth Latin letters live in a different Unicode block and must not
    /// be treated as case variants of their ASCII look-alikes.
    #[test]
    fn adversarial_fullwidth_case() {
        let ps = PatternSet::builder()
            .add_literal_ci("A", 0)
            .build()
            .unwrap();

        // FULLWIDTH LATIN CAPITAL LETTER A (U+FF21), written as an escape so the
        // character cannot be silently dropped or altered by editors and tooling.
        // An empty input here would make the assertion below pass vacuously.
        let fullwidth_a = "\u{FF21}".as_bytes();
        let matches = ps.scan(fullwidth_a);

        // ASCII 'A' (case-insensitive) and fullwidth 'A' are distinct characters,
        // so no match is expected.
        assert!(
            !matches.iter().any(|m| m.pattern_id == 0),
            "ASCII 'A' should not match fullwidth 'A' even with case-insensitive"
        );
    }

    /// German eszett: `ß` upper-cases to `SS`, a one-to-many mapping that
    /// simple case folding does not perform.
    #[test]
    fn adversarial_german_eszett() {
        let ps = PatternSet::builder()
            .add_literal_ci("SS", 0)
            .add_literal_ci("ß", 1)
            .build()
            .unwrap();

        // The haystack contains a lowercase eszett.
        let found = ps.scan("Maße".as_bytes());
        let hit = |id: usize| found.iter().any(|m| m.pattern_id == id);

        // The eszett pattern finds itself.
        assert!(hit(1), "'ß' should match 'Maße'");

        // 'SS' is not expanded to 'ß': one-to-many folding is assumed to be
        // unsupported by the underlying case-insensitive matching.
        assert!(!hit(0), "'SS' does not match 'ß' in standard regex case-folding");
    }

    /// Case-insensitive literal and case-insensitive regex must each match
    /// every casing of "abc", and must agree with one another.
    #[test]
    fn adversarial_ci_consistency() {
        let ps = PatternSet::builder()
            .add_literal_ci("ABC", 0)
            .add_regex_ci("ABC", 1)
            .build()
            .unwrap();

        // (input, literal expected to match, regex expected to match)
        let test_cases = [
            ("abc", true, true),
            ("ABC", true, true),
            ("Abc", true, true),
            ("aBc", true, true),
        ];

        for (input, lit_expected, regex_expected) in test_cases {
            let matches = ps.scan(input.as_bytes());
            let lit_matches = matches.iter().any(|m| m.pattern_id == 0);
            let regex_matches = matches.iter().any(|m| m.pattern_id == 1);

            // Check each engine against its expectation, so the test cannot pass
            // merely because both engines fail in the same way.
            assert_eq!(
                lit_matches, lit_expected,
                "Case-insensitive literal gave an unexpected result for '{}'", input
            );
            assert_eq!(
                regex_matches, regex_expected,
                "Case-insensitive regex gave an unexpected result for '{}'", input
            );

            // Both should behave consistently.
            assert_eq!(
                lit_matches, regex_matches,
                "Literal and regex case-insensitive should match same inputs for '{}'", input
            );
        }
    }

    // =========================================================================
    // Test 8: 1000+ patterns stress test
    // =========================================================================

    /// Builds a set of exactly 1000 distinct literals and checks that a couple
    /// of them are found in one scan.
    #[test]
    fn adversarial_thousand_patterns() {
        // pattern_0000 ..= pattern_0999, each using its index as pattern id.
        let builder = (0..1000).fold(PatternSet::builder(), |acc, id| {
            acc.add_literal(&format!("pattern_{:04}", id), id)
        });

        let ps = builder.build().unwrap();
        assert_eq!(ps.pattern_count(), 1000);

        // The haystack mentions two of the registered literals.
        let found = ps.scan(b"contains pattern_0001 and pattern_0999 in middle");
        let hit = |id: usize| found.iter().any(|m| m.pattern_id == id);

        assert!(hit(1), "pattern_0001 should match");
        assert!(hit(999), "pattern_0999 should match");
    }

    /// Registers the same one-byte literal under 1000 different ids so that
    /// every input position matches all of them.
    #[test]
    fn adversarial_thousand_overlapping_matches() {
        // Identical text, distinct ids.
        let builder = (0..1000).fold(PatternSet::builder(), |acc, id| acc.add_literal("a", id));
        let ps = builder.build().unwrap();

        // Three input bytes.
        let found = ps.scan(b"aaa");

        // 3 positions x 1000 pattern ids = 3000 reported matches.
        assert_eq!(found.len(), 3000, "Each position should match all 1000 patterns");
    }

    /// 500 literals plus 500 regexes in one set; regex ids are offset by 500.
    #[test]
    fn adversarial_mixed_thousand_patterns() {
        let mut builder = PatternSet::builder();

        for n in 0..500 {
            let literal_text = format!("lit_{}", n);
            let regex_text = format!(r"re_{}", n);
            // Literal keeps id n; its regex sibling gets id n + 500.
            builder = builder
                .add_literal(&literal_text, n)
                .add_regex(&regex_text, n + 500);
        }

        let ps = builder.build().unwrap();

        let found = ps.scan(b"contains lit_100 and re_200 here");
        let hit = |id: usize| found.iter().any(|m| m.pattern_id == id);

        assert!(hit(100), "lit_100 should match");
        assert!(hit(700), "re_200 should match (id 500+200)");
    }

    // =========================================================================
    // Test 9: Unicode edge cases
    // =========================================================================

    /// Match offsets are byte offsets, not character offsets, for multi-byte
    /// UTF-8 text.
    #[test]
    fn adversarial_unicode_byte_offsets() {
        // Two Japanese characters, three bytes each: six bytes in total.
        let ps = PatternSet::builder().add_literal("日本", 0).build().unwrap();

        let haystack = "私は日本語を話します".as_bytes();
        let found = ps.scan(haystack);

        assert_eq!(found.len(), 1);

        // Two three-byte characters precede the pattern, so it starts at byte 6
        // and ends six bytes later.
        let hit = &found[0];
        assert_eq!(hit.start, 6);
        assert_eq!(hit.end, 12);
    }

    /// Patterns must still be found when the surrounding bytes are not valid
    /// UTF-8.
    #[test]
    fn adversarial_invalid_utf8() {
        let ps = PatternSet::builder()
            .add_literal("abc", 0)
            .add_regex(r"abc", 1)
            .build()
            .unwrap();

        // 0x80 is a lone continuation byte; 0xFF and 0xFE never occur in UTF-8.
        let haystack = b"\x80abc\xff\xfe";
        let found = ps.scan(haystack);
        let hit = |id: usize| found.iter().any(|m| m.pattern_id == id);

        // "abc" sits between the invalid bytes and must be reported by both engines.
        assert!(hit(0), "Should find literal in invalid UTF-8");
        assert!(hit(1), "Should find regex in invalid UTF-8");
    }

    /// Pre-composed and decomposed spellings of the same visible text are
    /// different byte sequences; each pattern must find its own spelling.
    #[test]
    fn adversarial_zero_width_chars() {
        let ps = PatternSet::builder()
            .add_literal("café", 0) // pre-composed é
            .add_literal("cafe\u{0301}", 1) // 'e' followed by a combining acute accent
            .build()
            .unwrap();

        // Scan the pre-composed spelling.
        let composed_hits = ps.scan("café".as_bytes());
        assert!(
            composed_hits.iter().any(|m| m.pattern_id == 0),
            "Pre-composed café should match"
        );

        // Scan the decomposed spelling.
        let decomposed_hits = ps.scan("cafe\u{0301}".as_bytes());
        assert!(
            decomposed_hits.iter().any(|m| m.pattern_id == 1),
            "Decomposed cafe+◌́ should match"
        );
    }

    /// Emoji with and without a variation selector are distinct byte
    /// sequences; each pattern must be found in its own spelling.
    ///
    /// The code points are written as `\u{...}` escapes because the variation
    /// selector is invisible and both characters are easily lost or altered by
    /// editors and tooling. A lost character turns the pattern or the input
    /// into an empty string and changes what the test checks.
    #[test]
    fn adversarial_emoji_variation() {
        let ps = PatternSet::builder()
            .add_literal("\u{2764}\u{FE0F}", 0) // heart + variation selector-16
            .add_literal("\u{2764}", 1) // bare heart, no variation selector
            .build()
            .unwrap();

        let with_vs = "\u{2764}\u{FE0F}".as_bytes(); // U+2764 U+FE0F (6 bytes)
        let without_vs = "\u{2764}".as_bytes(); // U+2764 (3 bytes)

        let matches1 = ps.scan(with_vs);
        assert!(matches1.iter().any(|m| m.pattern_id == 0), "Heart with VS should match");

        let matches2 = ps.scan(without_vs);
        assert!(matches2.iter().any(|m| m.pattern_id == 1), "Heart without VS should match");
    }

    /// Right-to-left script: offsets follow byte order in memory, not visual
    /// order.
    #[test]
    fn adversarial_rtl_text() {
        // Arabic "hello".
        let ps = PatternSet::builder().add_literal("مرحبا", 0).build().unwrap();

        // Arabic "hello world"; the pattern is the first word in memory order.
        let haystack = "مرحبا بالعالم".as_bytes();
        let found = ps.scan(haystack);

        assert_eq!(found.len(), 1);

        let hit = &found[0];
        assert_eq!(hit.start, 0);
        // Five Arabic letters at two bytes each.
        assert_eq!(hit.end, 10);
    }

    // =========================================================================
    // Test 10: Special regex metacharacters in literals
    // =========================================================================

    /// A literal made entirely of regex metacharacters must be matched
    /// verbatim, never interpreted as a regex.
    #[test]
    fn adversarial_metachar_in_literal() {
        let pattern = r".*+?^${}()|[]\";
        let ps = PatternSet::builder().add_literal(pattern, 0).build().unwrap();

        // The haystack is the pattern itself, so one full-length match is expected.
        let found = ps.scan(pattern.as_bytes());

        assert_eq!(found.len(), 1, "Literal with metachars should match exactly");

        let hit = &found[0];
        assert_eq!(hit.pattern_id, 0);
        assert_eq!(hit.start, 0);
        assert_eq!(hit.end, pattern.len());
    }

    /// The text `\d+` registered as a literal and as a regex must behave
    /// differently: the first matches the text itself, the second matches digits.
    #[test]
    fn adversarial_literal_vs_regex_metachars() {
        let ps = PatternSet::builder()
            .add_literal(r"\d+", 0) // backslash, 'd', '+' taken verbatim
            .add_regex(r"\d+", 1) // one or more digits
            .build()
            .unwrap();

        let found = ps.scan(b"123 \\d+ 456");
        // Number of reported matches for a given pattern id.
        let count_for = |id: usize| found.iter().filter(|m| m.pattern_id == id).count();

        // The regex sees the two digit runs, "123" and "456".
        assert_eq!(count_for(1), 2, r"Regex '\d+' should match digit sequences");

        // The literal sees only the verbatim "\d+" in the middle.
        assert_eq!(count_for(0), 1, r"Literal '\d+' should match literal text");
    }

    // =========================================================================
    // Test 11: Boundary value tests
    // =========================================================================

    /// Literals located at the very start and very end of the input. The `^`
    /// and `$` characters are plain bytes in a literal, not anchors.
    #[test]
    fn adversarial_boundary_positions() {
        let ps = PatternSet::builder()
            .add_literal("^start", 0)
            .add_literal("end$", 1)
            .build()
            .unwrap();

        // The haystack literally contains the '^' and '$' characters.
        let found = ps.scan(b"^start middle end$");
        let hit = |id: usize| found.iter().any(|m| m.pattern_id == id);

        assert!(hit(0), "Should find '^start' at position 0");
        assert!(hit(1), "Should find 'end$' at end");
    }

    /// The largest possible pattern id (`usize::MAX`) must be reported back
    /// unchanged.
    #[test]
    fn adversarial_max_pattern_id() {
        let builder = PatternSet::builder().add_literal("test", usize::MAX);
        let ps = builder.build().unwrap();

        let found = ps.scan(b"test");
        let first = &found[0];
        assert_eq!(first.pattern_id, usize::MAX);
    }

    /// A 10,000-byte literal embedded between a short prefix and suffix.
    #[test]
    fn adversarial_very_long_pattern() {
        const PATTERN_LEN: usize = 10000;

        let long_pattern = "a".repeat(PATTERN_LEN);
        let ps = PatternSet::builder()
            .add_literal(&long_pattern, 0)
            .build()
            .unwrap();

        let haystack = format!("prefix{}suffix", long_pattern);
        let found = ps.scan(haystack.as_bytes());

        assert_eq!(found.len(), 1);

        let hit = &found[0];
        // The match begins right after the 6-byte "prefix".
        assert_eq!(hit.start, 6);
        assert_eq!(hit.end, 6 + PATTERN_LEN);
    }

    // =========================================================================
    // Test 12: Concurrent/determinism tests
    // =========================================================================

    /// Three patterns with identical text (two literals, one regex) must all
    /// be reported with the same span; their relative order is not asserted.
    #[test]
    fn adversarial_match_ordering() {
        let ps = PatternSet::builder()
            .add_literal("abc", 0)
            .add_literal("abc", 1) // same text under a second id
            .add_regex(r"abc", 2) // same text again, as a regex
            .build()
            .unwrap();

        let found = ps.scan(b"abc");

        // One hit per registered pattern.
        assert_eq!(found.len(), 3);

        // Every hit covers the whole three-byte input.
        for hit in found.iter() {
            assert_eq!(hit.start, 0);
            assert_eq!(hit.end, 3);
        }
    }

    /// Building a pattern set with no patterns must return an error.
    #[test]
    fn adversarial_empty_builder_fails() {
        let build_outcome = PatternSet::builder().build();
        assert!(build_outcome.is_err(), "Empty pattern set should fail to build");
    }

    /// A syntactically invalid regex must surface as a build error rather than
    /// a panic.
    #[test]
    fn adversarial_invalid_regex() {
        // `[` opens a character class that is never closed.
        let build_outcome = PatternSet::builder().add_regex("[invalid(", 0).build();

        assert!(build_outcome.is_err(), "Invalid regex should fail to compile");
    }

    /// Two different literals sharing one pattern id must both be kept and
    /// both be reported under that id.
    #[test]
    fn adversarial_pattern_id_collision() {
        let ps = PatternSet::builder()
            .add_literal("foo", 0)
            .add_literal("bar", 0) // deliberately reuses id 0
            .build()
            .unwrap();

        let found = ps.scan(b"foobar");

        // "foo" and "bar" each occur once, and both report id 0.
        let id_zero_hits = found.iter().filter(|m| m.pattern_id == 0).count();
        assert_eq!(id_zero_hits, 2, "Both patterns with ID 0 should match");
    }
}