use super::regex;
#[test]
fn test_ascii_control_chars() {
let re = regex(r"[\x00-\x1F\x7F]");
assert!(re.is_match("\x00")); assert!(re.is_match("\x01")); assert!(re.is_match("\x1B")); assert!(re.is_match("\x7F"));
assert!(!re.is_match("a"));
assert!(!re.is_match(" "));
}
#[test]
fn test_strip_control_chars() {
let re = regex(r"[\x00-\x1F\x7F]+");
let text = "Hello\x00World\x1B";
let result = re.replace_all(text, "");
assert_eq!(result, "HelloWorld");
}
#[test]
fn test_preserve_important_control_chars() {
let re = regex(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]");
assert!(!re.is_match("\t")); assert!(!re.is_match("\n")); assert!(!re.is_match("\r"));
assert!(re.is_match("\x00")); assert!(re.is_match("\x1F")); }
#[test]
fn test_unicode_whitespace() {
let re = regex(r"\p{Whitespace}+");
assert!(re.is_match(" "));
assert!(re.is_match("\t"));
assert!(re.is_match("\n"));
assert!(re.is_match("\r"));
assert!(re.is_match("\u{00A0}")); assert!(re.is_match("\u{2000}")); assert!(re.is_match("\u{2003}")); assert!(re.is_match("\u{3000}")); }
#[test]
fn test_split_on_whitespace() {
let re = regex(r"\p{Whitespace}+");
let text = "Hello\u{2003}World\u{00A0}Test";
let parts: Vec<&str> = text
.split(|c: char| re.is_match(&c.to_string()))
.filter(|s| !s.is_empty())
.collect();
assert!(!parts.is_empty());
}
#[test]
fn test_normalize_whitespace() {
let re = regex(r"\p{Whitespace}+");
let text = "Hello\u{2003}\u{2003}World\u{00A0}\u{00A0}Test";
let normalized = re.replace_all(text, " ");
assert_eq!(normalized, "Hello World Test");
}
#[test]
fn test_word_boundaries_basic() {
let re = regex(r"\b\w+\b");
let matches: Vec<_> = re
.find_iter("hello world test")
.map(|m| m.as_str())
.collect();
assert_eq!(matches, vec!["hello", "world", "test"]);
}
#[test]
fn test_word_boundaries_with_punctuation() {
let re = regex(r"\b\w+\b");
let matches: Vec<_> = re
.find_iter("Hello, world! How's it?")
.map(|m| m.as_str())
.collect();
assert!(matches.contains(&"Hello"));
assert!(matches.contains(&"world"));
assert!(matches.contains(&"How"));
assert!(matches.contains(&"s"));
assert!(matches.contains(&"it"));
}
#[test]
fn test_word_boundaries_unicode() {
let re = regex(r"\b\w+\b");
let text = "café naïve résumé";
let count = re.find_iter(text).count();
assert!(count >= 3);
}
#[test]
fn test_alphanumeric_tokens() {
let re = regex(r"[a-zA-Z0-9]+");
let matches: Vec<_> = re
.find_iter("test123 hello456 world")
.map(|m| m.as_str())
.collect();
assert_eq!(matches, vec!["test123", "hello456", "world"]);
}
#[test]
fn test_word_followed_by_punctuation() {
let re = regex(r"\w+(?=[.,!?])");
let matches: Vec<_> = re
.find_iter("Hello, world! How are you?")
.map(|m| m.as_str())
.collect();
assert!(matches.contains(&"Hello"));
assert!(matches.contains(&"world"));
assert!(matches.contains(&"you"));
}
#[test]
fn test_word_after_whitespace() {
let re = regex(r"(?<=\s)\w+");
let matches: Vec<_> = re
.find_iter("hello world test")
.map(|m| m.as_str())
.collect();
assert!(matches.contains(&"world"));
assert!(matches.contains(&"test"));
assert!(!matches.contains(&"hello")); }
#[test]
fn test_quoted_strings() {
let re = regex(r#""[^"]+""#);
let matches: Vec<_> = re
.find_iter(r#"He said "hello" and "goodbye"."#)
.map(|m| m.as_str())
.collect();
assert_eq!(matches, vec![r#""hello""#, r#""goodbye""#]);
}
#[test]
fn test_context_aware_apostrophe() {
let re = regex(r"\b\w+'\w+\b");
let matches: Vec<_> = re
.find_iter("Don't can't won't it's")
.map(|m| m.as_str())
.collect();
assert!(matches.contains(&"Don't") || matches.contains(&"don't"));
assert!(matches.contains(&"can't"));
}
#[test]
fn test_emoji_detection() {
let re = regex(r"\p{Emoji}+");
assert!(re.is_match("😀"));
assert!(re.is_match("🎉"));
assert!(re.is_match("👍"));
assert!(re.is_match("❤️"));
assert!(!re.is_match("hello"));
}
#[test]
fn test_emoji_extraction() {
let re = regex(r"\p{Emoji}+");
let text = "Hello 😀 World 🎉 Test 👍";
let emojis: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(emojis.len() >= 3);
}
#[test]
fn test_emoji_with_modifiers() {
let re = regex(r"\p{Emoji}+");
assert!(re.is_match("👋🏻")); assert!(re.is_match("👨👩👧👦")); }
#[test]
fn test_special_symbols() {
let re = regex(r"[\p{Symbol}\p{Punctuation}]+");
assert!(re.is_match("©"));
assert!(re.is_match("®"));
assert!(re.is_match("™"));
assert!(re.is_match("€"));
assert!(re.is_match("¥"));
assert!(re.is_match("@#$%"));
}
#[test]
fn test_identifier_tokens() {
let re = regex(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b");
let code = "var foo_bar = my_function(arg1, arg2);";
let matches: Vec<_> = re.find_iter(code).map(|m| m.as_str()).collect();
assert!(matches.contains(&"var"));
assert!(matches.contains(&"foo_bar"));
assert!(matches.contains(&"my_function"));
assert!(matches.contains(&"arg1"));
assert!(matches.contains(&"arg2"));
}
#[test]
fn test_number_literals() {
let re = regex(r"\b\d+\.?\d*\b");
let code = "x = 42, y = 3.14, z = 100";
let matches: Vec<_> = re.find_iter(code).map(|m| m.as_str()).collect();
assert!(matches.contains(&"42"));
assert!(matches.contains(&"3.14"));
assert!(matches.contains(&"100"));
}
#[test]
fn test_hex_literals() {
let re = regex(r"\b0x[0-9a-fA-F]+\b");
let code = "color = 0xFF00FF, value = 0xDEADBEEF";
let matches: Vec<_> = re.find_iter(code).map(|m| m.as_str()).collect();
assert_eq!(matches, vec!["0xFF00FF", "0xDEADBEEF"]);
}
#[test]
fn test_string_literals_double() {
let re = regex(r#""(?:[^"\\]|\\.)*""#);
let code = r#"msg = "Hello \"World\"", path = "C:\\Users\\test""#;
let matches: Vec<_> = re.find_iter(code).map(|m| m.as_str()).collect();
assert_eq!(matches.len(), 2);
}
#[test]
fn test_string_literals_single() {
let re = regex(r"'(?:[^'\\]|\\.)*'");
let code = r"msg = 'Hello \'World\'', char = 'a'";
let matches: Vec<_> = re.find_iter(code).map(|m| m.as_str()).collect();
assert_eq!(matches.len(), 2);
}
#[test]
fn test_comment_single_line() {
let re = regex(r"//[^\n]*");
let code = "x = 5; // This is a comment\ny = 10; // Another comment";
let matches: Vec<_> = re.find_iter(code).map(|m| m.as_str()).collect();
assert_eq!(matches.len(), 2);
assert!(matches[0].contains("This is a comment"));
assert!(matches[1].contains("Another comment"));
}
#[test]
fn test_operators() {
let re = regex(r"[+\-*/=<>!&|]+");
let code = "x = a + b - c * d / e";
let matches: Vec<_> = re.find_iter(code).map(|m| m.as_str()).collect();
assert!(matches.contains(&"+"));
assert!(matches.contains(&"-"));
assert!(matches.contains(&"*"));
assert!(matches.contains(&"/"));
assert!(matches.contains(&"="));
}
#[test]
fn test_sentence_boundaries() {
let re = regex(r"[.!?]+");
let text = "Hello world! How are you? I'm fine.";
let matches: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(matches, vec!["!", "?", "."]);
}
#[test]
fn test_comma_separated() {
let re = regex(r"[^,]+");
let text = "apple,banana,cherry,date";
let matches: Vec<_> = re.find_iter(text).map(|m| m.as_str().trim()).collect();
assert_eq!(matches, vec!["apple", "banana", "cherry", "date"]);
}
#[test]
fn test_bracket_matching() {
let re = regex(r"[(\[{].*?[)\]}]");
let text = "array[0] func(x) dict{key}";
let count = re.find_iter(text).count();
assert!(count >= 3);
}
#[test]
fn test_delimiter_types() {
let re = regex(r"[\p{Punctuation}]");
let text = "Hello, world! How's it? (fine) {great} [awesome]";
let delimiters: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(delimiters.contains(&","));
assert!(delimiters.contains(&"!"));
assert!(delimiters.contains(&"?"));
assert!(delimiters.contains(&"("));
assert!(delimiters.contains(&")"));
}
#[test]
fn test_ascii_digits() {
let re = regex(r"\d+");
let text = "There are 42 apples and 100 oranges";
let matches: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(matches, vec!["42", "100"]);
}
#[test]
fn test_unicode_digits() {
let re = regex(r"\p{Nd}+");
assert!(re.is_match("٠١٢٣٤٥٦٧٨٩"));
assert!(re.is_match("०१२३४५६७८९"));
assert!(re.is_match("0123456789"));
}
#[test]
fn test_decimal_numbers() {
let re = regex(r"\d+\.\d+");
let text = "pi is 3.14159 and e is 2.71828";
let matches: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(matches, vec!["3.14159", "2.71828"]);
}
#[test]
fn test_scientific_notation() {
let re = regex(r"\d+\.?\d*[eE][+-]?\d+");
assert!(re.is_match("1.23e10"));
assert!(re.is_match("4.56E-5"));
assert!(re.is_match("7e8"));
}
#[test]
fn test_number_with_separators() {
let re = regex(r"\d{1,3}(?:,\d{3})+");
assert!(re.is_match("1,000"));
assert!(re.is_match("1,000,000"));
assert!(re.is_match("123,456,789"));
}
#[test]
fn test_latin_cjk_mixed() {
let re_latin = regex(r"\p{Latin}+");
let re_han = regex(r"\p{Han}+");
let text = "Hello 你好 World 世界";
let latin: Vec<_> = re_latin.find_iter(text).map(|m| m.as_str()).collect();
let han: Vec<_> = re_han.find_iter(text).map(|m| m.as_str()).collect();
assert!(latin.contains(&"Hello"));
assert!(latin.contains(&"World"));
assert!(han.contains(&"你好"));
assert!(han.contains(&"世界"));
}
#[test]
fn test_latin_arabic_mixed() {
let re_latin = regex(r"\p{Latin}+");
let re_arabic = regex(r"\p{Arabic}+");
let text = "Hello مرحبا World";
assert!(re_latin.is_match(text));
assert!(re_arabic.is_match(text));
}
#[test]
fn test_latin_cyrillic_mixed() {
let re_latin = regex(r"\p{Latin}+");
let re_cyrillic = regex(r"\p{Cyrillic}+");
let text = "Hello Привет World";
let latin: Vec<_> = re_latin.find_iter(text).map(|m| m.as_str()).collect();
let cyrillic: Vec<_> = re_cyrillic.find_iter(text).map(|m| m.as_str()).collect();
assert!(latin.contains(&"Hello"));
assert!(latin.contains(&"World"));
assert!(cyrillic.contains(&"Привет"));
}
#[test]
fn test_script_detection() {
let re_latin = regex(r"\p{Latin}+");
let re_han = regex(r"\p{Han}+");
let re_hiragana = regex(r"\p{Hiragana}+");
let re_hangul = regex(r"\p{Hangul}+");
let re_arabic = regex(r"\p{Arabic}+");
assert!(re_latin.is_match("English"));
assert!(re_han.is_match("中文"));
assert!(re_hiragana.is_match("ひらがな")); assert!(re_hangul.is_match("한글"));
assert!(re_arabic.is_match("العربية"));
}
#[test]
fn test_mixed_script_word_segmentation() {
let re = regex(r"\p{L}+");
let text = "Hello世界Test中文Mix";
let segments: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(!segments.is_empty());
}
#[test]
fn test_combining_marks() {
let re = regex(r"\p{M}+");
assert!(re.is_match("\u{0301}")); assert!(re.is_match("\u{0308}"));
assert!(!re.is_match("a"));
}
#[test]
fn test_grapheme_clusters() {
let re = regex(r"\X");
assert!(re.is_match("é")); assert!(re.is_match("ñ")); }
#[test]
fn test_zero_width_joiners() {
let text = "👨👩👧👦";
let re = regex(r"\p{Emoji}");
assert!(re.is_match(text));
}
#[test]
fn test_html_tags_removal() {
let re = regex(r"<[^>]+>");
let html = "Hello <b>world</b> <a href='test'>link</a>";
let cleaned = re.replace_all(html, "");
assert_eq!(cleaned, "Hello world link");
}
#[test]
fn test_markdown_code_blocks() {
let re = regex(r"`[^`]+`");
let text = "Use `print()` and `input()` functions";
let matches: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(matches, vec!["`print()`", "`input()`"]);
}
#[test]
fn test_camelcase_split() {
let re = regex(r"[A-Z][a-z]+");
let text = "CamelCaseExample";
let matches: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(matches, vec!["Camel", "Case", "Example"]);
}
#[test]
fn test_hashtag_extraction() {
let re = regex(r"#\w+");
let text = "Check out #regex #programming #rust";
let hashtags: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(hashtags, vec!["#regex", "#programming", "#rust"]);
}
#[test]
fn test_mention_extraction() {
let re = regex(r"@\w+");
let text = "Hey @user123 and @developer, check this out!";
let mentions: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(mentions, vec!["@user123", "@developer"]);
}
#[test]
fn test_url_in_text() {
let re = regex(r"(?:https|http)://[^ \t\n\r]+");
let text = "Visit https://example.com and http://test.org for info.";
let urls: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(urls.len() >= 2);
assert!(urls.iter().any(|u| u.contains("example.com")));
assert!(urls.iter().any(|u| u.contains("test.org")));
}
#[test]
fn test_newline_types() {
let re = regex(r"\r?\n|\r");
assert!(re.is_match("\n")); assert!(re.is_match("\r\n")); assert!(re.is_match("\r")); }
#[test]
fn test_consecutive_duplicates() {
let re = regex(r"(\w)\1+");
assert!(re.is_match("hello")); assert!(re.is_match("book")); assert!(re.is_match("aaa")); }
#[test]
fn test_acronym_detection() {
let re = regex(r"\b[A-Z]{2,}\b");
let text = "NASA and FBI work with USA on AI";
let acronyms: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(acronyms.contains(&"NASA"));
assert!(acronyms.contains(&"FBI"));
assert!(acronyms.contains(&"USA"));
assert!(acronyms.contains(&"AI"));
}
const CL100K_PATTERN: &str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
#[test]
fn test_cl100k_whitespace_tokenization() {
let re = regex(CL100K_PATTERN);
let text = "hello world";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(tokens.len(), 3, "Should have 3 tokens");
assert_eq!(tokens[0], "hello", "First token should be 'hello'");
assert_eq!(tokens[1], " ", "Second token should be 2 spaces (not 3)");
assert_eq!(
tokens[2], " world",
"Third token should be ' world' (space + word)"
);
}
#[test]
fn test_cl100k_mixed_whitespace() {
let re = regex(CL100K_PATTERN);
let text = "hello world\t\ttest\n\n\nmore text";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(tokens.len(), 9, "Should have 9 tokens");
assert_eq!(tokens[0], "hello");
assert_eq!(tokens[1], " "); assert_eq!(tokens[2], " world"); assert_eq!(tokens[3], "\t"); assert_eq!(tokens[4], "\ttest"); assert_eq!(tokens[5], "\n\n\n"); assert_eq!(tokens[6], "more");
assert_eq!(tokens[7], " "); assert_eq!(tokens[8], " text"); }
#[test]
fn test_whitespace_negative_lookahead_backtracking() {
let re = regex(r"\s+(?!\S)");
let text = " w";
let m = re.find(text);
assert!(m.is_some(), "Should find a match");
assert_eq!(m.unwrap().as_str(), " ", "Should match only 1 space");
let text2 = " word";
let m2 = re.find(text2);
assert!(m2.is_some(), "Should find a match");
assert_eq!(m2.unwrap().as_str(), " ", "Should match 2 spaces");
let text3 = " ";
let m3 = re.find(text3);
assert!(m3.is_some(), "Should find a match");
assert_eq!(m3.unwrap().as_str(), " ", "Should match all 3 spaces");
}
#[test]
fn test_cl100k_contractions() {
let re = regex(CL100K_PATTERN);
let text = "I'm you're they've we'll";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(tokens.contains(&"'m"), "Should contain 'm");
assert!(tokens.contains(&"'re"), "Should contain 're");
assert!(tokens.contains(&"'ve"), "Should contain 've");
assert!(tokens.contains(&"'ll"), "Should contain 'll");
}
#[test]
fn test_cl100k_numbers() {
let re = regex(CL100K_PATTERN);
let text = "123456789";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(tokens, vec!["123", "456", "789"]);
}
const O200K_PATTERN: &str = r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+";
#[test]
fn test_o200k_basic_tokenization() {
let re = regex(O200K_PATTERN);
let text = "Hello world";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0], "Hello");
assert_eq!(tokens[1], " world");
}
#[test]
fn test_o200k_whitespace_tokenization() {
let re = regex(O200K_PATTERN);
let text = "hello world";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(tokens.len(), 3, "Should have 3 tokens");
assert_eq!(tokens[0], "hello", "First token should be 'hello'");
assert_eq!(tokens[1], " ", "Second token should be 2 spaces");
assert_eq!(tokens[2], " world", "Third token should be ' world'");
}
#[test]
fn test_o200k_contractions() {
let re = regex(O200K_PATTERN);
let text = "I'm you're they've we'll";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(tokens.iter().any(|t| t.contains("'m")));
assert!(tokens.iter().any(|t| t.contains("'re")));
assert!(tokens.iter().any(|t| t.contains("'ve")));
assert!(tokens.iter().any(|t| t.contains("'ll")));
}
#[test]
fn test_o200k_numbers() {
let re = regex(O200K_PATTERN);
let text = "123456789";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert_eq!(tokens, vec!["123", "456", "789"]);
}
#[test]
fn test_o200k_utf8_em_dash() {
let re = regex(O200K_PATTERN);
let text = "word—word";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(!tokens.is_empty());
let reconstructed: String = tokens.concat();
assert_eq!(reconstructed, text);
}
#[test]
fn test_o200k_utf8_curly_quotes() {
let re = regex(O200K_PATTERN);
let text = "He said, \u{2018}Hello\u{2019} and \u{201c}Goodbye\u{201d}.";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(!tokens.is_empty());
let reconstructed: String = tokens.concat();
assert_eq!(reconstructed, text);
}
#[test]
fn test_o200k_utf8_multiple_em_dashes() {
let re = regex(O200K_PATTERN);
let texts = [
"word—word",
"a—b",
"test—",
"—start",
"one—two—three",
"Check your brake pads or rotors—they might be worn out.",
"I'm sorry you're hurting—breakups suck, but you'll get through it.",
"Check if you're using valid credentials—API key, token—in headers.",
];
for text in texts {
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(!tokens.is_empty(), "Should find tokens in: {}", text);
let reconstructed: String = tokens.concat();
assert_eq!(
reconstructed, text,
"Should reconstruct original text: {}",
text
);
}
}
#[test]
fn test_o200k_mixed_unicode() {
let re = regex(O200K_PATTERN);
let text = "Hello 你好 World 世界";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(!tokens.is_empty());
let reconstructed: String = tokens.concat();
assert_eq!(reconstructed, text);
}
#[test]
fn test_o200k_emoji() {
let re = regex(O200K_PATTERN);
let text = "Hello 😀 World 🎉";
let tokens: Vec<_> = re.find_iter(text).map(|m| m.as_str()).collect();
assert!(!tokens.is_empty());
let reconstructed: String = tokens.concat();
assert_eq!(reconstructed, text);
}