rumdl_lib/utils/
regex_cache.rs1use fancy_regex::Regex as FancyRegex;
23use regex::Regex;
24use std::collections::HashMap;
25use std::sync::LazyLock;
26use std::sync::{Arc, Mutex};
27
/// Cache of compiled regular expressions keyed by their pattern string.
///
/// Standard `regex` patterns and `fancy_regex` patterns are stored in separate
/// maps, while request counts for both kinds share a single statistics map.
#[derive(Debug)]
pub struct RegexCache {
    /// Compiled `regex::Regex` instances, keyed by the pattern they were built from.
    cache: HashMap<String, Arc<Regex>>,
    /// Compiled `fancy_regex::Regex` instances, keyed by the pattern they were built from.
    fancy_cache: HashMap<String, Arc<FancyRegex>>,
    /// Number of successful lookups per pattern string (both regex kinds share this map).
    usage_stats: HashMap<String, u64>,
}
35
36impl Default for RegexCache {
37 fn default() -> Self {
38 Self::new()
39 }
40}
41
42impl RegexCache {
43 pub fn new() -> Self {
44 Self {
45 cache: HashMap::new(),
46 fancy_cache: HashMap::new(),
47 usage_stats: HashMap::new(),
48 }
49 }
50
51 pub fn get_regex(&mut self, pattern: &str) -> Result<Arc<Regex>, regex::Error> {
53 if let Some(regex) = self.cache.get(pattern) {
54 *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
55 return Ok(regex.clone());
56 }
57
58 let regex = Arc::new(Regex::new(pattern)?);
59 self.cache.insert(pattern.to_string(), regex.clone());
60 *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
61 Ok(regex)
62 }
63
64 pub fn get_fancy_regex(&mut self, pattern: &str) -> Result<Arc<FancyRegex>, Box<fancy_regex::Error>> {
66 if let Some(regex) = self.fancy_cache.get(pattern) {
67 *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
68 return Ok(regex.clone());
69 }
70
71 match FancyRegex::new(pattern) {
72 Ok(regex) => {
73 let arc_regex = Arc::new(regex);
74 self.fancy_cache.insert(pattern.to_string(), arc_regex.clone());
75 *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
76 Ok(arc_regex)
77 }
78 Err(e) => Err(Box::new(e)),
79 }
80 }
81
82 pub fn get_stats(&self) -> HashMap<String, u64> {
84 self.usage_stats.clone()
85 }
86
87 pub fn clear(&mut self) {
89 self.cache.clear();
90 self.fancy_cache.clear();
91 self.usage_stats.clear();
92 }
93}
94
95static GLOBAL_REGEX_CACHE: LazyLock<Arc<Mutex<RegexCache>>> = LazyLock::new(|| Arc::new(Mutex::new(RegexCache::new())));
97
98pub fn get_cached_regex(pattern: &str) -> Result<Arc<Regex>, regex::Error> {
104 let mut cache = GLOBAL_REGEX_CACHE.lock().unwrap_or_else(|poisoned| {
105 let mut guard = poisoned.into_inner();
107 guard.clear();
108 guard
109 });
110 cache.get_regex(pattern)
111}
112
113pub fn get_cached_fancy_regex(pattern: &str) -> Result<Arc<FancyRegex>, Box<fancy_regex::Error>> {
119 let mut cache = GLOBAL_REGEX_CACHE.lock().unwrap_or_else(|poisoned| {
120 let mut guard = poisoned.into_inner();
122 guard.clear();
123 guard
124 });
125 cache.get_fancy_regex(pattern)
126}
127
128pub fn get_cache_stats() -> HashMap<String, u64> {
132 match GLOBAL_REGEX_CACHE.lock() {
133 Ok(cache) => cache.get_stats(),
134 Err(_) => HashMap::new(),
135 }
136}
137
/// Expands to a `&'static regex::Regex` compiled from `$pattern` on first use.
///
/// Each expansion site gets its own private `static`, so the pattern is
/// compiled at most once per call site.
///
/// `LazyLock` is named by its absolute path so the macro also works in modules
/// that have not imported `std::sync::LazyLock` themselves.
///
/// # Panics
///
/// Panics on first use if `$pattern` is not a valid regular expression.
#[macro_export]
macro_rules! regex_lazy {
    ($pattern:expr) => {{
        static REGEX: ::std::sync::LazyLock<regex::Regex> =
            ::std::sync::LazyLock::new(|| regex::Regex::new($pattern).unwrap());
        &*REGEX
    }};
}
162
/// Fetches the compiled `regex::Regex` for `$pattern` from the global cache
/// via `get_cached_regex`.
///
/// # Panics
///
/// Panics with "Failed to compile regex" if `$pattern` does not compile.
#[macro_export]
macro_rules! regex_cached {
    ($pattern:expr) => {{ $crate::utils::regex_cache::get_cached_regex($pattern).expect("Failed to compile regex") }};
}
173
/// Fetches the compiled `fancy_regex::Regex` for `$pattern` from the global
/// cache via `get_cached_fancy_regex`.
///
/// # Panics
///
/// Panics with "Failed to compile fancy regex" if `$pattern` does not compile.
#[macro_export]
macro_rules! fancy_regex_cached {
    ($pattern:expr) => {{ $crate::utils::regex_cache::get_cached_fancy_regex($pattern).expect("Failed to compile fancy regex") }};
}
184
185pub use crate::regex_lazy;
187
/// `http`/`https`/`ftp` URL whose body excludes whitespace, angle brackets, square
/// brackets, parentheses and quotes; the final character may also not be `.` or `,`.
pub static URL_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?:https?|ftp)://[^\s<>\[\]()'"]+[^\s<>\[\]()"'.,]"#).unwrap());
/// `http`/`https`/`ftp` URL running up to whitespace or an angle bracket; the final
/// character may not be `.`.
pub static BARE_URL_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?:https?|ftp)://[^\s<>]+[^\s<>.]").unwrap());
/// `http`/`https`/`ftp` URL captured as group 1; stops at whitespace, `)`, `<` or `>`,
/// and the final character may not be `.` or `,`.
pub static URL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"((?:https?|ftp)://[^\s\)<>]+[^\s\)<>.,])").unwrap());
195
/// ATX heading start: leading whitespace (1), one to six `#` (2), then whitespace
/// or end of input (3).
pub static ATX_HEADING_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})(\s+|$)").unwrap());
/// Closed ATX heading: indent (1), opening hashes (2), spacing (3), content (4),
/// spacing (5), closing hashes (6), trailing whitespace (7).
pub static CLOSED_ATX_HEADING_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})(\s+)(.*)(\s+)(#+)(\s*)$").unwrap());
/// Setext heading: a non-blank text line (indent in 1), a newline, then an
/// underline of `=` or `-` characters (indent in 2, underline in 3).
pub static SETEXT_HEADING_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)[^\s]+.*\n(\s*)(=+|-+)\s*$").unwrap());
/// One of `.`, `,`, `:`, `)`, `;`, `!`, `?` at the very end of the input.
pub static TRAILING_PUNCTUATION_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[.,:);!?]$").unwrap());
203
/// Unindented ATX heading: hashes (1) and heading text (2), with an optional
/// trailing run of closing hashes excluded from the text.
pub static ATX_HEADING_WITH_CAPTURE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(#{1,6})\s+(.+?)(?:\s+#*\s*)?$").unwrap());
/// Setext heading: text line (1) followed by an underline made of a single
/// repeated character (2), enforced with a backreference; the underline must be
/// at least two characters long.
pub static SETEXT_HEADING_WITH_CAPTURE: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"^([^\n]+)\n([=\-])\2+\s*$").unwrap());
209
/// Unordered list item start: indent (1), bullet `*`, `+` or `-` (2), spacing (3).
pub static UNORDERED_LIST_MARKER_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])(\s+)").unwrap());
/// Ordered list item start: indent (1), number (2), delimiter `.` or `)` (3), spacing (4).
pub static ORDERED_LIST_MARKER_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)(\d+)([.)])(\s+)").unwrap());
/// Either kind of list item start: indent (1), bullet (2) or number (3), spacing (4).
pub static LIST_MARKER_ANY_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)(?:([*+-])|(\d+)[.)])(\s+)").unwrap());
216
/// Line opening a fenced code block: indent (1), a triple-backtick or `~~~`
/// fence (2), and the rest of the line (3).
pub static FENCED_CODE_BLOCK_START_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap());
/// Line closing a fenced code block: indent (1), fence (2), trailing whitespace only (3).
pub static FENCED_CODE_BLOCK_END_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(\s*)$").unwrap());
/// Line starting with four or more whitespace characters (1) followed by its content (2).
pub static INDENTED_CODE_BLOCK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})(.*)$").unwrap());
/// Three or more backticks, or three or more tildes, at the start of the input.
pub static CODE_FENCE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(`{3,}|~{3,})").unwrap());
224
/// Emphasis span delimited by one or two `*` or `_` (2), closed by the same
/// delimiter via backreference (4). The span must be preceded by whitespace or
/// start of input (1), followed by whitespace or end of input (5), and its
/// content (3) may not begin or end with whitespace.
pub static EMPHASIS_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(\s|^)(\*{1,2}|_{1,2})(?=\S)(.+?)(?<=\S)(\2)(\s|$)").unwrap());
/// Emphasis with whitespace just inside both delimiters: delimiter (1), leading
/// whitespace (2), content (3), trailing whitespace (4), matching delimiter (5).
pub static SPACE_IN_EMPHASIS_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(\*|_)(\s+)(.+?)(\s+)(\1)").unwrap());
230
/// Single-asterisk emphasis, not adjacent to another `*`, whose content (1) has
/// whitespace right after the opening `*` or right before the closing `*`.
pub static ASTERISK_EMPHASIS: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?:^|[^*])\*(\s+[^*]+\s*|\s*[^*]+\s+)\*(?:[^*]|$)").unwrap());
/// Single-underscore emphasis, not adjacent to another `_`, whose content (1) has
/// whitespace right after the opening `_` or right before the closing `_`.
pub static UNDERSCORE_EMPHASIS: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?:^|[^_])_(\s+[^_]+\s*|\s*[^_]+\s+)_(?:[^_]|$)").unwrap());
/// Double-underscore emphasis, not adjacent to another `_`, whose content (1) has
/// whitespace right after the opening `__` or right before the closing `__`.
pub static DOUBLE_UNDERSCORE_EMPHASIS: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?:^|[^_])__(\s+[^_]+\s*|\s*[^_]+\s+)__(?:[^_]|$)").unwrap());
/// `**` emphasis with whitespace on both sides of the content (1).
pub static DOUBLE_ASTERISK_EMPHASIS: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\*\*\s+([^*]+?)\s+\*\*").unwrap());
/// `**` emphasis with whitespace directly after the opening `**`.
pub static DOUBLE_ASTERISK_SPACE_START: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\*\*\s+([^*]+?)\*\*").unwrap());
/// `**` emphasis with whitespace directly before the closing `**`.
pub static DOUBLE_ASTERISK_SPACE_END: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\*\*([^*]+?)\s+\*\*").unwrap());
246
/// Line opening a triple-backtick fence: indent (1), the fence, then an optional
/// remainder containing no backticks or line breaks.
pub static FENCED_CODE_BLOCK_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)```(?:[^`\r\n]*)$").unwrap());
/// Line closing a triple-backtick fence: indent (1), the fence, optional trailing whitespace.
pub static FENCED_CODE_BLOCK_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)```\s*$").unwrap());
/// Line opening a `~~~` fence: indent (1), the fence, then an optional remainder
/// containing no tildes or line breaks.
pub static ALTERNATE_FENCED_CODE_BLOCK_START: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)~~~(?:[^~\r\n]*)$").unwrap());
/// Line closing a `~~~` fence: indent (1), the fence, optional trailing whitespace.
pub static ALTERNATE_FENCED_CODE_BLOCK_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)~~~\s*$").unwrap());
/// Four or more whitespace characters at the start of the input (1).
pub static INDENTED_CODE_BLOCK_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})").unwrap());
254
/// Opening-style HTML tag; group 1 is everything between `<` and `>`.
pub static HTML_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<([a-zA-Z][^>]*)>").unwrap());
/// Self-closing HTML tag; group 1 is everything between `<` and `>`, including the trailing `/`.
pub static HTML_SELF_CLOSING_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<([a-zA-Z][^>]*/)>").unwrap());
/// Any opening or closing HTML tag (case-insensitive).
pub static HTML_TAG_FINDER: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?i)</?[a-zA-Z][^>]*>").unwrap());
/// Any opening HTML tag (case-insensitive); closing tags are not matched.
pub static HTML_OPENING_TAG_FINDER: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?i)<[a-zA-Z][^>]*>").unwrap());
/// Start of a tag only: `<` or `</` followed by a letter (case-insensitive).
pub static HTML_TAG_QUICK_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?i)</?[a-zA-Z]").unwrap());
261
/// Link reference definition line: label (1) in brackets, a colon, whitespace,
/// then the rest of the line (2).
pub static LINK_REFERENCE_DEFINITION_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*\[([^\]]+)\]:\s+(.+)$").unwrap());
/// Inline link with non-empty text (1) and non-empty destination (2).
pub static INLINE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
/// Bracketed text; group 1 is the (possibly empty) content between `[` and `]`.
pub static LINK_TEXT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]*)\]").unwrap());
/// Inline link, not preceded by a backslash, whose destination contains a `#`:
/// text (1), part before `#` (2), part after `#` (3).
pub static LINK_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(?<!\\)\[([^\]]*)\]\(([^)#]*)#([^)]+)\)").unwrap());
/// Input beginning with `http://`, `https://`, `ftp://`, `www.`, or a slash-free
/// prefix ending in a dot and at least two lowercase letters.
pub static EXTERNAL_URL_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"^(https?://|ftp://|www\.|[^/]+\.[a-z]{2,})").unwrap());
271
/// Inline image: alt text (1, may be empty) and non-empty destination (2).
pub static IMAGE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap());

/// One or more whitespace characters at the end of the input.
pub static TRAILING_WHITESPACE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s+$").unwrap());
/// Three or more consecutive newline characters.
pub static MULTIPLE_BLANK_LINES_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
278
279pub static FRONT_MATTER_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^---\n.*?\n---\n").unwrap());
281
/// Inline code span: non-empty, backtick-free content between single backticks.
pub static INLINE_CODE_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"`[^`]+`").unwrap());
/// Bold text delimited by `**`; group 1 is the shortest possible content.
pub static BOLD_ASTERISK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*\*(.+?)\*\*").unwrap());
/// Bold text delimited by `__`; group 1 is the shortest possible content.
pub static BOLD_UNDERSCORE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"__(.+?)__").unwrap());
/// Italic text delimited by single `*`; group 1 is the asterisk-free content.
pub static ITALIC_ASTERISK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*([^*]+?)\*").unwrap());
/// Italic text delimited by single `_`; group 1 is the underscore-free content.
pub static ITALIC_UNDERSCORE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"_([^_]+?)_").unwrap());
/// Whole inline link; group 1 is the link text, the destination is not captured.
pub static LINK_TEXT_FULL_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\[([^\]]*)\]\([^)]*\)").unwrap());
/// Strikethrough delimited by `~~`; group 1 is the shortest possible content.
pub static STRIKETHROUGH_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"~~(.+?)~~").unwrap());
/// A run of two or more hyphens.
pub static MULTIPLE_HYPHENS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"-{2,}").unwrap());
/// Heading line whose entire text is `Table of Contents`, `Contents` or `TOC`.
pub static TOC_SECTION_START: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^#+\s*(?:Table of Contents|Contents|TOC)\s*$").unwrap());
294
/// Blockquote prefix (1): optional indent, one or more `>`, and any following whitespace.
pub static BLOCKQUOTE_PREFIX_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
297
/// Input that begins with a reference-style image (`![alt][ref]`) and ends with `]`.
pub static IMAGE_REF_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^!\[.*?\]\[.*?\]$").unwrap());
/// Input consisting solely of a reference definition pointing at an `http`/`https` URL.
pub static LINK_REF_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\[.*?\]:\s*https?://\S+$").unwrap());
/// `http`/`https` URL running up to the next whitespace.
pub static URL_IN_TEXT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
/// `.`, `!` or `?` followed by whitespace and an uppercase ASCII letter.
pub static SENTENCE_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[.!?]\s+[A-Z]").unwrap());
/// One of a fixed list of abbreviations followed by `.`, whitespace and an
/// uppercase ASCII letter.
// NOTE(review): `Ph\.D` appears twice in the alternation; the duplicate is harmless.
pub static ABBREVIATION: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|i\.e|e\.g|Inc|Corp|Ltd|Co|St|Ave|Blvd|Rd|Ph\.D|M\.D|B\.A|M\.A|Ph\.D|U\.S|U\.K|U\.N|N\.Y|L\.A|D\.C)\.\s+[A-Z]").unwrap()
});
/// Digits, a dot, optional whitespace, then more digits.
pub static DECIMAL_NUMBER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d+\.\s*\d+").unwrap());
/// Start of a numbered list item: optional indent, digits, `.`, whitespace.
pub static LIST_ITEM: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\d+\.\s+").unwrap());
/// Reference-style link `[text][ref]`: text (1) and reference label (2), either may be empty.
pub static REFERENCE_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]*)\]\[([^\]]*)\]").unwrap());
309
/// Email-like token: local part, `@`, domain, and a final dot followed by at
/// least two ASCII letters.
pub static EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
313
/// Reference link `[text][ref]` not preceded by a backslash. The text (1) may
/// contain backslash escapes and one level of nested brackets; the label (2)
/// may be empty.
pub static REF_LINK_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(?<!\\)\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]\[([^\]]*)\]").unwrap());

/// Bracketed text (1) that is not preceded by `\`, `)` or `]` and is not
/// followed (after optional whitespace) by `[` or `(`.
pub static SHORTCUT_REF_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(?<![\\)\]])\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\](?!\s*[\[\(])").unwrap());

/// Inline link not preceded by a backslash: text (1) and destination (2), both non-empty.
pub static INLINE_LINK_FANCY_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(?<!\\)\[([^\]]+)\]\(([^)]+)\)").unwrap());

/// Inline image: alt text (1, may be empty) and non-empty destination (2).
pub static INLINE_IMAGE_FANCY_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap());
334
/// Inline image wrapped in an inline link, `[![alt](img)](link)`:
/// alt (1), image destination (2), link destination (3).
pub static LINKED_IMAGE_INLINE_INLINE: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\(([^)]+)\)\]\(([^)]+)\)").unwrap());

/// Reference image wrapped in an inline link, `[![alt][ref]](link)`:
/// alt (1), image reference (2), link destination (3).
pub static LINKED_IMAGE_REF_INLINE: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\[([^\]]*)\]\]\(([^)]+)\)").unwrap());

/// Inline image wrapped in a reference link, `[![alt](img)][ref]`:
/// alt (1), image destination (2), link reference (3).
pub static LINKED_IMAGE_INLINE_REF: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\(([^)]+)\)\]\[([^\]]*)\]").unwrap());

/// Reference image wrapped in a reference link, `[![alt][ref]][ref]`:
/// alt (1), image reference (2), link reference (3).
pub static LINKED_IMAGE_REF_REF: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\[([^\]]*)\]\]\[([^\]]*)\]").unwrap());

/// Reference image `![alt][ref]`. The alt text (1) may contain backslash escapes
/// and one level of nested brackets; the label (2) may be empty.
pub static REF_IMAGE_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"!\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]\[([^\]]*)\]").unwrap());
360
/// Footnote reference `[^label]`; group 1 is the non-empty label.
pub static FOOTNOTE_REF_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"\[\^([^\]]+)\]").unwrap());

/// Strikethrough `~~text~~`; group 1 is the non-empty, tilde-free content.
pub static STRIKETHROUGH_FANCY_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"~~([^~]+)~~").unwrap());

/// Wiki-style link `[[target]]`; group 1 is the non-empty content.
pub static WIKI_LINK_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"\[\[([^\]]+)\]\]").unwrap());

/// Inline math between single `$` signs that are not part of `$$`; group 1 is the content.
pub static INLINE_MATH_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(?<!\$)\$(?!\$)([^\$]+)\$(?!\$)").unwrap());
/// Display math between `$$` pairs; group 1 is the non-empty, `$`-free content.
pub static DISPLAY_MATH_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"\$\$([^\$]+)\$\$").unwrap());

/// Emoji shortcode `:name:`; group 1 is the name (ASCII letters, digits, `_`, `+`, `-`).
pub static EMOJI_SHORTCODE_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r":([a-zA-Z0-9_+-]+):").unwrap());
378
/// Opening, closing, or self-closing HTML tag.
pub static HTML_TAG_PATTERN: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"</?[a-zA-Z][^>]*>|<[a-zA-Z][^>]*/\s*>").unwrap());

/// HTML entity: named (`&name;`), decimal (`&#123;`) or hexadecimal (`&#x1F;`).
pub static HTML_ENTITY_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"&[a-zA-Z][a-zA-Z0-9]*;|&#\d+;|&#x[0-9a-fA-F]+;").unwrap());

/// HTML comment opener `<!--`.
pub static HTML_COMMENT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--").unwrap());
/// HTML comment closer `-->`.
pub static HTML_COMMENT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"-->").unwrap());
/// Complete HTML comment, possibly spanning lines, ending at the first `-->`.
pub static HTML_COMMENT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--[\s\S]*?-->").unwrap());

/// Input consisting of a single-line `<hN ...>...</hN>` element; group 1 is the
/// level digit and the closing tag must use the same digit (backreference).
pub static HTML_HEADING_PATTERN: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"^\s*<h([1-6])(?:\s[^>]*)?>.*</h\1>\s*$").unwrap());
396
/// Multi-line check: some line begins, after optional whitespace, with `#`.
pub static HEADING_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^(?:\s*)#").unwrap());
399
/// Horizontal rule of three or more `-`, with optional trailing whitespace.
pub static HR_DASH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\-{3,}\s*$").unwrap());
/// Horizontal rule of three or more `*`, with optional trailing whitespace.
pub static HR_ASTERISK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\*{3,}\s*$").unwrap());
/// Horizontal rule of three or more `_`, with optional trailing whitespace.
pub static HR_UNDERSCORE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^_{3,}\s*$").unwrap());
/// Horizontal rule of at least three `-` separated by whitespace.
pub static HR_SPACED_DASH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\-\s+){2,}\-\s*$").unwrap());
/// Horizontal rule of at least three `*` separated by whitespace.
pub static HR_SPACED_ASTERISK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\*\s+){2,}\*\s*$").unwrap());
/// Horizontal rule of at least three `_` separated by whitespace.
pub static HR_SPACED_UNDERSCORE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(_\s+){2,}_\s*$").unwrap());
407
/// Cheap pre-check: reports whether `content` contains a `#` anywhere.
pub fn has_heading_markers(content: &str) -> bool {
    content.chars().any(|ch| ch == '#')
}
413
/// Cheap pre-check for list syntax.
///
/// Returns `true` when `content` contains a bullet character (`*`, `-`, `+`),
/// or contains both a `.` and an ASCII digit somewhere in the text.
pub fn has_list_markers(content: &str) -> bool {
    let has_bullet = content.chars().any(|ch| matches!(ch, '*' | '-' | '+'));
    if has_bullet {
        return true;
    }

    let has_dot = content.contains('.');
    has_dot && content.chars().any(|ch| ch.is_ascii_digit())
}
421
/// Cheap pre-check for code blocks.
///
/// Returns `true` when `content` contains a triple-backtick fence, a `~~~`
/// fence, or a newline immediately followed by a space.
pub fn has_code_block_markers(content: &str) -> bool {
    content.contains("```") || content.contains("~~~") || content.contains("\n ")
}
427
/// Cheap pre-check: reports whether `content` contains a `*` or `_` anywhere.
pub fn has_emphasis_markers(content: &str) -> bool {
    content.chars().any(|ch| matches!(ch, '*' | '_'))
}
432
/// Cheap pre-check for HTML: reports whether `content` contains both a `<` and
/// a `>` somewhere (in any order).
///
/// A separate test for `/>` is unnecessary because any `/>` already contains `>`.
pub fn has_html_tags(content: &str) -> bool {
    content.contains('<') && content.contains('>')
}
437
/// Cheap pre-check for links.
///
/// Returns `true` when `content` contains both `[` and `]`, or contains one of
/// the scheme prefixes `http://`, `https://` or `ftp://`.
pub fn has_link_markers(content: &str) -> bool {
    let has_bracket_pair = content.contains('[') && content.contains(']');
    if has_bracket_pair {
        return true;
    }

    ["http://", "https://", "ftp://"]
        .iter()
        .any(|scheme| content.contains(*scheme))
}
445
/// Cheap pre-check: reports whether `content` contains the two-character
/// sequence `![` anywhere.
pub fn has_image_markers(content: &str) -> bool {
    // Both characters are ASCII, so a byte-level scan cannot produce a false
    // match inside a multi-byte UTF-8 sequence.
    content.as_bytes().windows(2).any(|pair| pair == b"![")
}
450
/// Heuristic, regex-free check for whether `content` contains a URL.
///
/// Returns `true` when some `://` is preceded, within the 10 characters
/// immediately before it, by the text `htt` or `ftp`. This accepts `http://`,
/// `https://` and `ftp://`, and also a few looser forms where the scheme
/// letters sit slightly earlier in that window.
///
/// The scan works on string slices directly and allocates nothing, so the cost
/// is proportional to the number of `://` occurrences rather than requiring a
/// copy of the whole input.
pub fn contains_url(content: &str) -> bool {
    /// How many characters before `://` are searched for the scheme letters.
    const LOOKBACK_CHARS: usize = 10;

    content.match_indices("://").any(|(separator_pos, _)| {
        let before = &content[..separator_pos];
        // Byte offset of the 10th-from-last character; the whole prefix is
        // used when it is shorter than the look-back window.
        let window_start = before
            .char_indices()
            .rev()
            .nth(LOOKBACK_CHARS - 1)
            .map_or(0, |(byte_idx, _)| byte_idx);
        let window = &before[window_start..];
        window.contains("htt") || window.contains("ftp")
    })
}
487
/// Returns `s` with a backslash inserted before each of the characters
/// `.`, `+`, `*`, `?`, `^`, `$`, `(`, `)`, `[`, `]`, `{`, `}`, `|` and `\`.
///
/// All other characters are copied through unchanged.
pub fn escape_regex(s: &str) -> String {
    /// The characters that receive a leading backslash.
    const ESCAPED: &str = ".+*?^$()[]{}|\\";

    // Worst case every character is escaped, doubling the length.
    let mut escaped = String::with_capacity(s.len() * 2);
    for ch in s.chars() {
        if ESCAPED.contains(ch) {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
505
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed cache holds no regexes and no statistics.
    #[test]
    fn test_regex_cache_new() {
        let cache = RegexCache::new();
        assert!(cache.cache.is_empty());
        assert!(cache.fancy_cache.is_empty());
        assert!(cache.usage_stats.is_empty());
    }

    /// `Default` produces the same empty cache as `new`.
    #[test]
    fn test_regex_cache_default() {
        let cache = RegexCache::default();
        assert!(cache.cache.is_empty());
        assert!(cache.fancy_cache.is_empty());
        assert!(cache.usage_stats.is_empty());
    }

    /// The second request for a pattern reuses the cached instance and bumps the counter.
    #[test]
    fn test_get_regex_compilation() {
        let mut cache = RegexCache::new();

        let regex1 = cache.get_regex(r"\d+").unwrap();
        assert_eq!(cache.cache.len(), 1);
        assert_eq!(cache.usage_stats.get(r"\d+"), Some(&1));

        let regex2 = cache.get_regex(r"\d+").unwrap();
        assert_eq!(cache.cache.len(), 1);
        assert_eq!(cache.usage_stats.get(r"\d+"), Some(&2));

        // Same allocation, not merely an equal regex.
        assert!(Arc::ptr_eq(&regex1, &regex2));
    }

    /// An invalid pattern yields an error and leaves the cache untouched.
    #[test]
    fn test_get_regex_invalid_pattern() {
        let mut cache = RegexCache::new();
        let result = cache.get_regex(r"[unterminated");
        assert!(result.is_err());
        assert!(cache.cache.is_empty());
    }

    /// The second request for a fancy pattern reuses the cached instance and bumps the counter.
    #[test]
    fn test_get_fancy_regex_compilation() {
        let mut cache = RegexCache::new();

        let regex1 = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();
        assert_eq!(cache.fancy_cache.len(), 1);
        assert_eq!(cache.usage_stats.get(r"(?<=foo)bar"), Some(&1));

        let regex2 = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();
        assert_eq!(cache.fancy_cache.len(), 1);
        assert_eq!(cache.usage_stats.get(r"(?<=foo)bar"), Some(&2));

        // Same allocation, not merely an equal regex.
        assert!(Arc::ptr_eq(&regex1, &regex2));
    }

    /// An invalid fancy pattern yields an error and leaves the cache untouched.
    #[test]
    fn test_get_fancy_regex_invalid_pattern() {
        let mut cache = RegexCache::new();
        let result = cache.get_fancy_regex(r"(?<=invalid");
        assert!(result.is_err());
        assert!(cache.fancy_cache.is_empty());
    }

    /// Usage counters track requests per pattern across both regex kinds.
    #[test]
    fn test_get_stats() {
        let mut cache = RegexCache::new();

        let _ = cache.get_regex(r"\d+").unwrap();
        let _ = cache.get_regex(r"\d+").unwrap();
        let _ = cache.get_regex(r"\w+").unwrap();
        let _ = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();

        let stats = cache.get_stats();
        assert_eq!(stats.get(r"\d+"), Some(&2));
        assert_eq!(stats.get(r"\w+"), Some(&1));
        assert_eq!(stats.get(r"(?<=foo)bar"), Some(&1));
    }

    /// `clear` empties both regex maps and the statistics.
    #[test]
    fn test_clear_cache() {
        let mut cache = RegexCache::new();

        let _ = cache.get_regex(r"\d+").unwrap();
        let _ = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();

        assert!(!cache.cache.is_empty());
        assert!(!cache.fancy_cache.is_empty());
        assert!(!cache.usage_stats.is_empty());

        cache.clear();

        assert!(cache.cache.is_empty());
        assert!(cache.fancy_cache.is_empty());
        assert!(cache.usage_stats.is_empty());
    }

    /// The global helpers share one cache and expose its statistics.
    #[test]
    fn test_global_cache_functions() {
        let regex1 = get_cached_regex(r"\d{3}").unwrap();
        let regex2 = get_cached_regex(r"\d{3}").unwrap();
        assert!(Arc::ptr_eq(&regex1, &regex2));

        let fancy1 = get_cached_fancy_regex(r"(?<=test)ing").unwrap();
        let fancy2 = get_cached_fancy_regex(r"(?<=test)ing").unwrap();
        assert!(Arc::ptr_eq(&fancy1, &fancy2));

        let stats = get_cache_stats();
        assert!(stats.contains_key(r"\d{3}"));
        assert!(stats.contains_key(r"(?<=test)ing"));
    }

    /// Each `regex_lazy!` call site owns an independent static regex.
    #[test]
    fn test_regex_lazy_macro() {
        let re = regex_lazy!(r"^test.*end$");
        assert!(re.is_match("test something end"));
        assert!(!re.is_match("test something"));

        let re2 = regex_lazy!(r"^start.*finish$");
        assert!(re2.is_match("start and finish"));
        assert!(!re2.is_match("start without end"));
    }

    #[test]
    fn test_has_heading_markers() {
        assert!(has_heading_markers("# Heading"));
        assert!(has_heading_markers("Text with # symbol"));
        assert!(!has_heading_markers("Text without heading marker"));
    }

    #[test]
    fn test_has_list_markers() {
        assert!(has_list_markers("* Item"));
        assert!(has_list_markers("- Item"));
        assert!(has_list_markers("+ Item"));
        assert!(has_list_markers("1. Item"));
        assert!(!has_list_markers("Text without list markers"));
    }

    #[test]
    fn test_has_code_block_markers() {
        assert!(has_code_block_markers("```code```"));
        assert!(has_code_block_markers("~~~code~~~"));
        assert!(has_code_block_markers("Text\n indented code"));
        assert!(!has_code_block_markers("Text without code blocks"));
    }

    #[test]
    fn test_has_emphasis_markers() {
        assert!(has_emphasis_markers("*emphasis*"));
        assert!(has_emphasis_markers("_emphasis_"));
        assert!(has_emphasis_markers("**bold**"));
        assert!(has_emphasis_markers("__bold__"));
        assert!(!has_emphasis_markers("no emphasis"));
    }

    #[test]
    fn test_has_html_tags() {
        assert!(has_html_tags("<div>content</div>"));
        assert!(has_html_tags("<br/>"));
        assert!(has_html_tags("<img src='test.jpg'>"));
        assert!(!has_html_tags("no html tags"));
        assert!(!has_html_tags("less than < but no tag"));
    }

    #[test]
    fn test_has_link_markers() {
        assert!(has_link_markers("[text](url)"));
        assert!(has_link_markers("[reference][1]"));
        assert!(has_link_markers("http://example.com"));
        assert!(has_link_markers("https://example.com"));
        assert!(has_link_markers("ftp://example.com"));
        assert!(!has_link_markers("no links here"));
    }

    /// Positive cases must actually contain `![`; an empty string never does.
    #[test]
    fn test_has_image_markers() {
        assert!(has_image_markers("![alt text](image.png)"));
        assert!(has_image_markers("![](image.png)"));
        assert!(!has_image_markers("[link](url)"));
        assert!(!has_image_markers("no images"));
    }

    #[test]
    fn test_contains_url() {
        assert!(contains_url("http://example.com"));
        assert!(contains_url("Text with https://example.com link"));
        assert!(contains_url("ftp://example.com"));
        assert!(!contains_url("Text without URL"));
        assert!(!contains_url("http not followed by ://"));

        assert!(!contains_url("http"));
        assert!(!contains_url("https"));
        assert!(!contains_url("://"));
        assert!(contains_url("Visit http://site.com now"));
        assert!(contains_url("See https://secure.site.com/path"));
    }

    /// Long inputs are handled both without and with a URL at the end.
    #[test]
    fn test_contains_url_performance() {
        let long_text = "a".repeat(10000);
        assert!(!contains_url(&long_text));

        let text_with_url = format!("{long_text}https://example.com");
        assert!(contains_url(&text_with_url));
    }

    #[test]
    fn test_escape_regex() {
        assert_eq!(escape_regex("a.b"), "a\\.b");
        assert_eq!(escape_regex("a+b*c"), "a\\+b\\*c");
        assert_eq!(escape_regex("(test)"), "\\(test\\)");
        assert_eq!(escape_regex("[a-z]"), "\\[a-z\\]");
        assert_eq!(escape_regex("normal text"), "normal text");

        assert_eq!(escape_regex(".$^{[(|)*+?\\"), "\\.\\$\\^\\{\\[\\(\\|\\)\\*\\+\\?\\\\");

        assert_eq!(escape_regex(""), "");

        assert_eq!(escape_regex("test.com/path?query=1"), "test\\.com/path\\?query=1");
    }

    /// Smoke-tests a representative sample of the precompiled static patterns.
    #[test]
    fn test_static_regex_patterns() {
        assert!(URL_REGEX.is_match("https://example.com"));
        assert!(URL_REGEX.is_match("http://test.org/path"));
        assert!(URL_REGEX.is_match("ftp://files.com"));
        assert!(!URL_REGEX.is_match("not a url"));

        assert!(ATX_HEADING_REGEX.is_match("# Heading"));
        assert!(ATX_HEADING_REGEX.is_match(" ## Indented"));
        assert!(ATX_HEADING_REGEX.is_match("### "));
        assert!(!ATX_HEADING_REGEX.is_match("Not a heading"));

        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("* Item"));
        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("- Item"));
        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("+ Item"));
        assert!(ORDERED_LIST_MARKER_REGEX.is_match("1. Item"));
        assert!(ORDERED_LIST_MARKER_REGEX.is_match("99. Item"));

        assert!(FENCED_CODE_BLOCK_START_REGEX.is_match("```"));
        assert!(FENCED_CODE_BLOCK_START_REGEX.is_match("```rust"));
        assert!(FENCED_CODE_BLOCK_START_REGEX.is_match("~~~"));
        assert!(FENCED_CODE_BLOCK_END_REGEX.is_match("```"));
        assert!(FENCED_CODE_BLOCK_END_REGEX.is_match("~~~"));

        assert!(BOLD_ASTERISK_REGEX.is_match("**bold**"));
        assert!(BOLD_UNDERSCORE_REGEX.is_match("__bold__"));
        assert!(ITALIC_ASTERISK_REGEX.is_match("*italic*"));
        assert!(ITALIC_UNDERSCORE_REGEX.is_match("_italic_"));

        assert!(HTML_TAG_REGEX.is_match("<div>"));
        assert!(HTML_TAG_REGEX.is_match("<span class='test'>"));
        assert!(HTML_SELF_CLOSING_TAG_REGEX.is_match("<br/>"));
        assert!(HTML_SELF_CLOSING_TAG_REGEX.is_match("<img src='test'/>"));

        assert!(TRAILING_WHITESPACE_REGEX.is_match("line with spaces "));
        assert!(TRAILING_WHITESPACE_REGEX.is_match("tabs\t\t"));
        assert!(MULTIPLE_BLANK_LINES_REGEX.is_match("\n\n\n"));
        assert!(MULTIPLE_BLANK_LINES_REGEX.is_match("\n\n\n\n"));

        assert!(BLOCKQUOTE_PREFIX_RE.is_match("> Quote"));
        assert!(BLOCKQUOTE_PREFIX_RE.is_match(" > Indented quote"));
        assert!(BLOCKQUOTE_PREFIX_RE.is_match(">> Nested"));
    }

    /// Concurrent requests through the global cache all succeed.
    #[test]
    fn test_thread_safety() {
        use std::thread;

        let handles: Vec<_> = (0..10)
            .map(|i| {
                thread::spawn(move || {
                    let pattern = format!(r"\d{{{i}}}");
                    let regex = get_cached_regex(&pattern).unwrap();
                    assert!(regex.is_match(&"1".repeat(i)));
                })
            })
            .collect();

        for handle in handles {
            handle.join().unwrap();
        }
    }
}