multimatch 0.1.1

//! Adversarial tests designed to expose edge cases, performance issues,
//! and correctness bugs in the santh-match engine.
//!
//! These tests intentionally push boundaries: empty inputs, massive inputs,
//! pathological patterns, encoding edge cases, and stress scenarios.

#[cfg(test)]
mod tests {
    use crate::{PatternSet, Scanner};

    // =========================================================================
    // Test 1: Empty input variations
    // =========================================================================
    
    /// Scanning empty input with an empty literal is expected to report
    /// exactly one zero-length match at offset 0.
    #[test]
    fn adversarial_empty_input_empty_pattern() {
        // A single empty literal registered under id 0.
        let builder = PatternSet::builder().add_literal("", 0);
        let set = builder.build().unwrap();

        let hits = set.scan(b"");

        assert_eq!(hits.len(), 1, "Empty pattern should match empty input once");

        // The lone match must be the empty span [0, 0) for pattern 0.
        let only = &hits[0];
        assert_eq!(only.pattern_id, 0);
        assert_eq!(only.start, 0);
        assert_eq!(only.end, 0);
    }

    /// An empty regex is expected to produce a zero-length match at every
    /// offset of the input, including the offset just past the last byte.
    #[test]
    fn adversarial_empty_regex_pattern() {
        let set = PatternSet::builder().add_regex("", 0).build().unwrap();

        // "abc" has offsets 0, 1, 2 and 3 (end of input): four positions.
        let hits = set.scan(b"abc");
        let found = hits.len();

        assert_eq!(found, 4, "Empty regex should match at 4 positions in 'abc'");
    }

    // =========================================================================
    // Test 2: 1-byte input stress test
    // =========================================================================

    /// A one-byte input scanned against a mix of literals and regexes:
    /// only patterns that can match a single `x` may report a hit.
    #[test]
    fn adversarial_single_byte_input() {
        let set = PatternSet::builder()
            .add_literal("abc", 0)
            .add_literal("x", 1)
            .add_regex(r"\w+", 2)
            .add_regex(r".", 3)
            .build()
            .unwrap();

        let hits = set.scan(b"x");
        // True when at least one reported match carries the given pattern id.
        let seen = |id: usize| hits.iter().any(|m| m.pattern_id == id);

        // The three-byte literal cannot fit in a one-byte input.
        assert!(!seen(0), "'abc' should not match single byte 'x'");
        // The exact one-byte literal matches.
        assert!(seen(1), "'x' should match 'x'");
        // A run of word characters of length one matches.
        assert!(seen(2), r"'\w+' should match 'x'");
        // The any-character regex matches.
        assert!(seen(3), "'.' should match 'x'");
    }

    /// The input is the single byte `.`, which is itself a regex
    /// metacharacter; both the escaped dot and the wildcard must hit it.
    #[test]
    fn adversarial_single_byte_metachar() {
        let set = PatternSet::builder()
            .add_regex(r"\.", 0) // escaped: a literal dot
            .add_regex(r".", 1) // unescaped: any character
            .build()
            .unwrap();

        let hits = set.scan(b".");
        let seen = |id: usize| hits.iter().any(|m| m.pattern_id == id);

        assert!(seen(0), r"'\.' should match '.'");
        assert!(seen(1), "'.' should match '.'");
    }

    // =========================================================================
    // Test 3: Large input (10MB) - performance stress test
    // =========================================================================

    /// Scans 10 MiB of `a` bytes with the one-byte literal "a".
    ///
    /// Every byte is a match, so this produces ten million results and
    /// stresses both scan throughput and result-buffer growth.
    #[test]
    fn adversarial_large_input_frequent_matches() {
        const INPUT_LEN: usize = 10 * 1024 * 1024;
        let haystack = vec![b'a'; INPUT_LEN];

        let set = PatternSet::builder().add_literal("a", 0).build().unwrap();
        let hits = set.scan(&haystack);

        // One match per input byte.
        assert_eq!(hits.len(), haystack.len(), "Should have one match per byte");

        // Spot-check the first and last reported offsets.
        let first = &hits[0];
        let last = &hits[hits.len() - 1];
        assert_eq!(first.start, 0);
        assert_eq!(last.start, haystack.len() - 1);
    }

    /// Scans 5 MiB of `x` bytes with the greedy regex `.*`.
    ///
    /// The size is kept at 5 MiB (not 10) to avoid timeouts. The first
    /// reported match is expected to span the entire input.
    #[test]
    fn adversarial_large_input_regex_match_all() {
        const INPUT_LEN: usize = 5 * 1024 * 1024;
        let haystack = vec![b'x'; INPUT_LEN];

        let set = PatternSet::builder().add_regex(r".*", 0).build().unwrap();
        let hits = set.scan(&haystack);

        assert!(!hits.is_empty(), ".* should match");

        // The leading match must cover [0, len).
        let first = &hits[0];
        assert_eq!(first.start, 0);
        assert_eq!(first.end, haystack.len());
    }

    // =========================================================================
    // Test 4: Regex that matches everything (pathological patterns)
    // =========================================================================

    /// Zero-width assertions (`^`, `$`, `\b`) must each report at least one
    /// empty match on the input "ab".
    #[test]
    fn adversarial_regex_anchors_and_boundaries() {
        let set = PatternSet::builder()
            .add_regex(r"^", 0) // start of input
            .add_regex(r"$", 1) // end of input
            .add_regex(r"\b", 2) // word boundary
            .build()
            .unwrap();

        let hits = set.scan(b"ab");

        // `^` is an empty match at offset 0.
        let start_anchor = hits.iter().any(|m| m.pattern_id == 0 && m.start == 0);
        assert!(start_anchor, "^ should match at start");

        // `$` is an empty match at offset 2, just past the last byte.
        let end_anchor = hits.iter().any(|m| m.pattern_id == 1 && m.start == 2);
        assert!(end_anchor, "$ should match at end");

        // "ab" has word boundaries at offsets 0 and 2; only presence is checked.
        let any_boundary = hits.iter().any(|m| m.pattern_id == 2);
        assert!(any_boundary, r"'\b' should find word boundaries");
    }

    /// Documents a limitation: a pattern using lookahead (`(?=...)`) is not
    /// supported by the regex crate, so building the set must return an error.
    #[test]
    fn adversarial_regex_lookahead_not_supported() {
        // Zero-width lookahead; expected to be rejected at build time.
        let builder = PatternSet::builder().add_regex(r"(?=\w)", 0);
        let outcome = builder.build();

        assert!(outcome.is_err(), "Lookahead regex should fail to compile");
    }

    /// Nested quantifiers (`(a+)+b`) against a long run of `a` ending in `c`
    /// blow up exponentially in naive backtracking engines; the scan here
    /// must finish quickly and report no match.
    #[test]
    fn adversarial_regex_catastrophic_backtracking() {
        let set = PatternSet::builder()
            .add_regex(r"(a+)+b", 0)
            .build()
            .unwrap();

        // Many 'a's followed by 'c': no 'b' is present, so no match is possible.
        let haystack = b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaac";

        let timer = std::time::Instant::now();
        let hits = set.scan(haystack);
        let took = timer.elapsed();

        // Generous bound: this only guards against a hang, it is not a benchmark.
        assert!(
            took < std::time::Duration::from_secs(5),
            "Should not hang on backtracking pattern"
        );
        assert!(hits.is_empty(), "Pattern should not match");
    }

    // =========================================================================
    // Test 5: Overlapping literal + regex patterns
    // =========================================================================

    /// The same text registered both as a literal (id 0) and as a regex
    /// (id 1) must produce a match for each, at identical byte offsets.
    #[test]
    fn adversarial_overlapping_literal_regex_same() {
        let ps = PatternSet::builder()
            .add_literal("test", 0)
            .add_regex(r"test", 1)
            .build()
            .unwrap();

        let matches = ps.scan(b"this is a test string");

        // `expect` unwraps once with a descriptive failure message instead of
        // checking `is_some()` and then calling `unwrap()` repeatedly.
        let literal_match = matches
            .iter()
            .find(|m| m.pattern_id == 0)
            .expect("Literal should match");
        let regex_match = matches
            .iter()
            .find(|m| m.pattern_id == 1)
            .expect("Regex should match");

        // Both engines must agree on the span.
        assert_eq!(literal_match.start, regex_match.start);
        assert_eq!(literal_match.end, regex_match.end);
    }

    /// Patterns whose matches overlap in the input ("password", its prefix
    /// "pass", and the regex `pass\w+`) must all be reported.
    #[test]
    fn adversarial_overlapping_priority() {
        let set = PatternSet::builder()
            .add_literal("password", 0)
            .add_literal("pass", 1)
            .add_regex(r"pass\w+", 2)
            .build()
            .unwrap();

        let hits = set.scan(b"mypassword123");
        let seen = |id: usize| hits.iter().any(|m| m.pattern_id == id);

        // Overlap is allowed: none of the three suppresses the others.
        assert!(seen(0), "'password' should match inside 'mypassword123'");
        assert!(seen(1), "'pass' should match");
        assert!(seen(2), r"'pass\w+' should match");
    }

    // =========================================================================
    // Test 6: Null byte patterns and input
    // =========================================================================

    /// A literal with an embedded NUL byte must match like any other byte
    /// sequence, with correct byte offsets.
    #[test]
    fn adversarial_null_byte_pattern() {
        // Nine bytes: p-a-s-s-\0-w-o-r-d.
        let set = PatternSet::builder()
            .add_literal("pass\0word", 0)
            .build()
            .unwrap();

        let hits = set.scan(b"my pass\0word is secret");

        assert_eq!(hits.len(), 1, "Pattern with null byte should match");
        let hit = &hits[0];
        assert_eq!(hit.start, 3); // right after "my "
        assert_eq!(hit.end, 12); // 3 + 9 pattern bytes
    }

    /// Input made only of NUL bytes, scanned with a two-NUL literal and the
    /// regex `\x00+`.
    ///
    /// The literal is expected at every overlapping position (99 in 100
    /// bytes); the regex only needs to match at least once.
    #[test]
    fn adversarial_null_byte_input() {
        let ps = PatternSet::builder()
            .add_literal("\0\0", 0)
            .add_regex(r"\x00+", 1)
            .build()
            .unwrap();

        let input = vec![0u8; 100];
        let matches = ps.scan(&input);

        // Overlapping literal matches start at offsets 0..=98.
        let literal_count = matches.iter().filter(|m| m.pattern_id == 0).count();
        // Raw string: a plain "\0\0" here would put real NUL bytes into the
        // failure message rather than the readable escape text.
        assert_eq!(literal_count, 99, r"'\0\0' should match 99 times with overlap");

        // The regex covers the whole run; only presence is asserted.
        let regex_matched = matches.iter().any(|m| m.pattern_id == 1);
        assert!(regex_matched, r"'\x00+' should match");
    }

    /// NUL bytes interleaved with ordinary bytes: the literal and its regex
    /// equivalent must each match once, for two matches in total.
    #[test]
    fn adversarial_mixed_null_bytes() {
        let set = PatternSet::builder()
            .add_literal("a\0b\0c", 0)
            .add_regex(r"a\x00b\x00c", 1)
            .build()
            .unwrap();

        let hits = set.scan(b"xyz a\0b\0c def");
        let total = hits.len();

        assert_eq!(total, 2, "Both patterns should match");
    }

    // =========================================================================
    // Test 7: Case sensitivity edge cases
    // =========================================================================

    /// The Turkish "I" problem: case-insensitive `i` must not match the
    /// dotted capital `İ`, while case-insensitive `I` must still match a
    /// plain capital `I`.
    #[test]
    fn adversarial_turkish_i_case() {
        let set = PatternSet::builder()
            .add_literal_ci("I", 0)
            .add_literal_ci("i", 1)
            .build()
            .unwrap();

        // Capital dotted I. Simple case folding does not relate it to 'i';
        // that would need full/Turkic folding.
        let dotted_hits = set.scan("İ".as_bytes());
        let i_hit_dotted = dotted_hits.iter().any(|m| m.pattern_id == 1);
        assert!(!i_hit_dotted, "'i' should not match 'İ' with simple case folding");

        // Capital I without a dot.
        let plain_hits = set.scan("I".as_bytes());
        let cap_i_hit = plain_hits.iter().any(|m| m.pattern_id == 0);
        assert!(cap_i_hit, "'I' should match 'I'");
    }

    /// Case-insensitive ASCII `A` must not match the fullwidth capital `Ａ`
    /// (U+FF21): they are distinct characters, not case variants.
    #[test]
    fn adversarial_fullwidth_case() {
        let set = PatternSet::builder().add_literal_ci("A", 0).build().unwrap();

        // Input is the single fullwidth letter.
        let hits = set.scan("Ａ".as_bytes());
        let matched = hits.iter().any(|m| m.pattern_id == 0);

        assert!(
            !matched,
            "ASCII 'A' should not match fullwidth 'Ａ' even with case-insensitive"
        );
    }

    /// German eszett: case-insensitive `ß` matches itself, but
    /// case-insensitive `SS` must not match `ß`, because one-to-many case
    /// folding is not applied.
    #[test]
    fn adversarial_german_eszett() {
        let set = PatternSet::builder()
            .add_literal_ci("SS", 0)
            .add_literal_ci("ß", 1)
            .build()
            .unwrap();

        // "Maße" contains ß. Uppercasing ß gives SS in German, but the
        // reverse mapping is not implied.
        let hits = set.scan("Maße".as_bytes());
        let seen = |id: usize| hits.iter().any(|m| m.pattern_id == id);

        assert!(seen(1), "'ß' should match 'Maße'");
        assert!(!seen(0), "'SS' does not match 'ß' in standard regex case-folding");
    }

    /// Case-insensitive literal and case-insensitive regex for "ABC" must
    /// both match every casing of the input, and must agree with each other.
    ///
    /// Each case carries the expected outcome for the literal and the regex.
    /// Asserting against those values (rather than only comparing the two
    /// results) prevents the test from passing when neither pattern matches.
    #[test]
    fn adversarial_ci_consistency() {
        let ps = PatternSet::builder()
            .add_literal_ci("ABC", 0)
            .add_regex_ci("ABC", 1)
            .build()
            .unwrap();

        // (input, literal expected to match, regex expected to match)
        let test_cases = [
            ("abc", true, true),
            ("ABC", true, true),
            ("Abc", true, true),
            ("aBc", true, true),
        ];

        for &(input, lit_expected, regex_expected) in &test_cases {
            let matches = ps.scan(input.as_bytes());
            let lit_matches = matches.iter().any(|m| m.pattern_id == 0);
            let regex_matches = matches.iter().any(|m| m.pattern_id == 1);

            assert_eq!(
                lit_matches, lit_expected,
                "Case-insensitive literal 'ABC' gave an unexpected result for '{}'",
                input
            );
            assert_eq!(
                regex_matches, regex_expected,
                "Case-insensitive regex 'ABC' gave an unexpected result for '{}'",
                input
            );

            // Both should behave consistently.
            assert_eq!(
                lit_matches, regex_matches,
                "Literal and regex case-insensitive should match same inputs for '{}'",
                input
            );
        }
    }

    // =========================================================================
    // Test 8: 1000+ patterns stress test
    // =========================================================================

    /// Builds a set of exactly 1000 distinct literals (`pattern_0000` to
    /// `pattern_0999`) and checks that the ones present in the input are found.
    #[test]
    fn adversarial_thousand_patterns() {
        // Thread the builder through a fold, adding one literal per id.
        let builder = (0..1000).fold(PatternSet::builder(), |acc, id| {
            acc.add_literal(&format!("pattern_{:04}", id), id)
        });

        let set = builder.build().unwrap();
        assert_eq!(set.pattern_count(), 1000);

        // The haystack names two of the thousand patterns.
        let hits = set.scan(b"contains pattern_0001 and pattern_0999 in middle");
        let seen = |id: usize| hits.iter().any(|m| m.pattern_id == id);

        assert!(seen(1), "pattern_0001 should match");
        assert!(seen(999), "pattern_0999 should match");
    }

    /// Registers the same one-byte literal under 1000 different ids; every
    /// input position must then report one match per id.
    #[test]
    fn adversarial_thousand_overlapping_matches() {
        // Identical pattern text, distinct id on each iteration.
        let builder = (0..1000).fold(PatternSet::builder(), |acc, id| acc.add_literal("a", id));
        let set = builder.build().unwrap();

        // Three input bytes, each matching all 1000 patterns: 3 x 1000 = 3000.
        let hits = set.scan(b"aaa");

        assert_eq!(hits.len(), 3000, "Each position should match all 1000 patterns");
    }

    /// 500 literals (`lit_N`, ids 0..500) interleaved with 500 regexes
    /// (`re_N`, ids 500..1000) in a single set.
    #[test]
    fn adversarial_mixed_thousand_patterns() {
        // Each step adds the literal first, then the regex offset by 500.
        let builder = (0..500).fold(PatternSet::builder(), |acc, n| {
            acc.add_literal(&format!("lit_{}", n), n)
                .add_regex(&format!(r"re_{}", n), n + 500)
        });

        let set = builder.build().unwrap();

        let hits = set.scan(b"contains lit_100 and re_200 here");
        let seen = |id: usize| hits.iter().any(|m| m.pattern_id == id);

        assert!(seen(100), "lit_100 should match");
        assert!(seen(700), "re_200 should match (id 500+200)");
    }

    // =========================================================================
    // Test 9: Unicode edge cases
    // =========================================================================

    /// Match offsets are byte offsets, not character offsets, even when the
    /// text is multi-byte UTF-8.
    #[test]
    fn adversarial_unicode_byte_offsets() {
        // "日本" is two characters of three bytes each: six bytes.
        let set = PatternSet::builder().add_literal("日本", 0).build().unwrap();

        // The needle follows "私は", i.e. two characters x 3 bytes = 6 bytes in.
        let haystack = "私は日本語を話します".as_bytes();
        let hits = set.scan(haystack);

        assert_eq!(hits.len(), 1);
        let hit = &hits[0];
        assert_eq!(hit.start, 6);
        assert_eq!(hit.end, 12); // 6 + 6 needle bytes
    }

    /// Input containing bytes that are not valid UTF-8 must still be
    /// scannable, and ASCII patterns embedded in it must be found.
    #[test]
    fn adversarial_invalid_utf8() {
        let set = PatternSet::builder()
            .add_literal("abc", 0)
            .add_regex(r"abc", 1)
            .build()
            .unwrap();

        // 0x80 is a continuation byte with no lead byte; 0xFF and 0xFE
        // follow the needle.
        let hits = set.scan(b"\x80abc\xff\xfe");
        let seen = |id: usize| hits.iter().any(|m| m.pattern_id == id);

        assert!(seen(0), "Should find literal in invalid UTF-8");
        assert!(seen(1), "Should find regex in invalid UTF-8");
    }

    /// Pre-composed and decomposed spellings of the same visible text are
    /// different byte sequences; each literal must match its own spelling.
    #[test]
    fn adversarial_zero_width_chars() {
        let set = PatternSet::builder()
            .add_literal("café", 0) // é as one pre-composed code point
            .add_literal("cafe\u{0301}", 1) // 'e' followed by a combining acute accent
            .build()
            .unwrap();

        // Scan the pre-composed spelling.
        let composed_hits = set.scan("café".as_bytes());
        let composed_found = composed_hits.iter().any(|m| m.pattern_id == 0);
        assert!(composed_found, "Pre-composed café should match");

        // Scan the decomposed spelling.
        let decomposed_hits = set.scan("cafe\u{0301}".as_bytes());
        let decomposed_found = decomposed_hits.iter().any(|m| m.pattern_id == 1);
        assert!(decomposed_found, "Decomposed cafe+◌́ should match");
    }

    /// A heart with and without a trailing variation selector are different
    /// byte sequences; each literal must match its own form.
    #[test]
    fn adversarial_emoji_variation() {
        let set = PatternSet::builder()
            .add_literal("❤️", 0) // U+2764 U+FE0F (6 bytes)
            .add_literal("❤", 1) // U+2764 alone (3 bytes)
            .build()
            .unwrap();

        // Input carrying the variation selector.
        let vs_hits = set.scan("❤️".as_bytes());
        let vs_found = vs_hits.iter().any(|m| m.pattern_id == 0);
        assert!(vs_found, "Heart with VS should match");

        // Input without the variation selector.
        let bare_hits = set.scan("❤".as_bytes());
        let bare_found = bare_hits.iter().any(|m| m.pattern_id == 1);
        assert!(bare_found, "Heart without VS should match");
    }

    /// Right-to-left script: offsets follow byte order in memory, not
    /// visual order on screen.
    #[test]
    fn adversarial_rtl_text() {
        // Arabic "hello".
        let set = PatternSet::builder().add_literal("مرحبا", 0).build().unwrap();

        // Arabic "hello world"; the needle is the first word in byte order.
        let haystack = "مرحبا بالعالم".as_bytes();
        let hits = set.scan(haystack);

        assert_eq!(hits.len(), 1);
        let hit = &hits[0];
        assert_eq!(hit.start, 0);
        assert_eq!(hit.end, 10); // five Arabic letters x 2 bytes each
    }

    // =========================================================================
    // Test 10: Special regex metacharacters in literals
    // =========================================================================

    /// A literal made up entirely of regex metacharacters must be matched
    /// verbatim, with no regex interpretation.
    #[test]
    fn adversarial_metachar_in_literal() {
        // Raw string, so the trailing backslash is part of the pattern.
        let needle = r".*+?^${}()|[]\";
        let set = PatternSet::builder().add_literal(needle, 0).build().unwrap();

        // The haystack is the needle itself, so the match spans all of it.
        let hits = set.scan(needle.as_bytes());

        assert_eq!(hits.len(), 1, "Literal with metachars should match exactly");
        let hit = &hits[0];
        assert_eq!(hit.pattern_id, 0);
        assert_eq!(hit.start, 0);
        assert_eq!(hit.end, needle.len());
    }

    /// The text `\d+` registered as a literal matches those three characters;
    /// registered as a regex it matches runs of digits.
    #[test]
    fn adversarial_literal_vs_regex_metachars() {
        let set = PatternSet::builder()
            .add_literal(r"\d+", 0) // backslash, 'd', '+', taken verbatim
            .add_regex(r"\d+", 1) // one or more digits
            .build()
            .unwrap();

        let hits = set.scan(b"123 \\d+ 456");
        let count_for = |id: usize| hits.iter().filter(|m| m.pattern_id == id).count();

        // The regex finds "123" and "456".
        assert_eq!(count_for(1), 2, r"Regex '\d+' should match digit sequences");

        // The literal finds the single verbatim occurrence in the middle.
        assert_eq!(count_for(0), 1, r"Literal '\d+' should match literal text");
    }

    // =========================================================================
    // Test 11: Boundary value tests
    // =========================================================================

    /// Literals located at the very start and very end of the input.
    ///
    /// `^` and `$` are ordinary bytes inside a literal, not anchors. The
    /// assertions check the byte offsets of each match, so the test confirms
    /// the positions named in the messages rather than mere presence.
    #[test]
    fn adversarial_boundary_positions() {
        let ps = PatternSet::builder()
            .add_literal("^start", 0)
            .add_literal("end$", 1)
            .build()
            .unwrap();

        // Note: ^ and $ are NOT anchors in literals!
        let input = b"^start middle end$";
        let matches = ps.scan(input);

        // "^start" occupies the first bytes of the input.
        let at_start = matches
            .iter()
            .any(|m| m.pattern_id == 0 && m.start == 0 && m.end == "^start".len());
        assert!(at_start, "Should find '^start' at position 0");

        // "end$" occupies the final bytes of the input.
        let tail_start = input.len() - "end$".len();
        let at_end = matches
            .iter()
            .any(|m| m.pattern_id == 1 && m.start == tail_start && m.end == input.len());
        assert!(at_end, "Should find 'end$' at end");
    }

    /// The largest possible pattern id (`usize::MAX`) must round-trip through
    /// build and scan unchanged.
    #[test]
    fn adversarial_max_pattern_id() {
        let ps = PatternSet::builder()
            .add_literal("test", usize::MAX)
            .build()
            .unwrap();

        let matches = ps.scan(b"test");

        // Check the count first so a missing match fails with a clear message
        // instead of an index-out-of-bounds panic.
        assert_eq!(matches.len(), 1, "Literal 'test' should match 'test' exactly once");
        assert_eq!(matches[0].pattern_id, usize::MAX);
    }

    /// A 10,000-byte literal must be found once, at the right offsets, when
    /// embedded between a short prefix and suffix.
    #[test]
    fn adversarial_very_long_pattern() {
        let needle = "a".repeat(10000);
        let set = PatternSet::builder().add_literal(&needle, 0).build().unwrap();

        // Haystack layout: "prefix" + needle + "suffix".
        let haystack = ["prefix", needle.as_str(), "suffix"].concat();
        let hits = set.scan(haystack.as_bytes());

        assert_eq!(hits.len(), 1);
        let hit = &hits[0];
        assert_eq!(hit.start, 6); // length of "prefix"
        assert_eq!(hit.end, 6 + 10000);
    }

    // =========================================================================
    // Test 12: Concurrent/determinism tests
    // =========================================================================

    /// Three patterns that match the same span must all be reported, and
    /// repeated scans of the same input must return them in the same order.
    ///
    /// The relative order of the three results is not specified; only that it
    /// is stable from one scan to the next.
    #[test]
    fn adversarial_match_ordering() {
        let ps = PatternSet::builder()
            .add_literal("abc", 0)
            .add_literal("abc", 1) // Same pattern, different ID
            .add_regex(r"abc", 2) // Same as regex
            .build()
            .unwrap();

        let matches = ps.scan(b"abc");

        // Should find all three, order may vary but should be consistent.
        assert_eq!(matches.len(), 3);

        // All should have same offsets.
        for m in &matches {
            assert_eq!(m.start, 0);
            assert_eq!(m.end, 3);
        }

        // Each id must appear, so three results cannot be one id repeated.
        for id in 0..3usize {
            assert!(
                matches.iter().any(|m| m.pattern_id == id),
                "pattern id {} should be reported",
                id
            );
        }

        // Determinism: a second scan yields the identical sequence.
        let first_order: Vec<_> = matches
            .iter()
            .map(|m| (m.pattern_id, m.start, m.end))
            .collect();
        let rescan = ps.scan(b"abc");
        let second_order: Vec<_> = rescan
            .iter()
            .map(|m| (m.pattern_id, m.start, m.end))
            .collect();
        assert_eq!(
            first_order, second_order,
            "Repeated scans should return matches in the same order"
        );
    }

    /// Building a pattern set with no patterns added must return an error.
    #[test]
    fn adversarial_empty_builder_fails() {
        let untouched = PatternSet::builder();
        let outcome = untouched.build();

        assert!(outcome.is_err(), "Empty pattern set should fail to build");
    }

    /// A syntactically invalid regex must surface as a build error rather
    /// than a panic.
    #[test]
    fn adversarial_invalid_regex() {
        // The bracket class is opened and never closed.
        let builder = PatternSet::builder().add_regex("[invalid(", 0);
        let outcome = builder.build();

        assert!(outcome.is_err(), "Invalid regex should fail to compile");
    }

    /// Two different literals sharing one id must both be kept: each
    /// reports its own match under that shared id.
    #[test]
    fn adversarial_pattern_id_collision() {
        let set = PatternSet::builder()
            .add_literal("foo", 0)
            .add_literal("bar", 0) // deliberately reuses id 0
            .build()
            .unwrap();

        let hits = set.scan(b"foobar");

        // One hit for "foo" and one for "bar", both tagged with id 0.
        let shared_id_hits = hits.iter().filter(|m| m.pattern_id == 0).count();
        assert_eq!(shared_id_hits, 2, "Both patterns with ID 0 should match");
    }
}