rumdl_lib/utils/
regex_cache.rs

//!
//! Cached Regex Patterns and Fast Content Checks for Markdown Linting
//!
//! This module provides a centralized collection of shared, cached regex patterns
//! for all major Markdown constructs (headings, lists, code blocks, links, images, etc.).
//! It also includes fast-path utility functions for quickly checking whether content
//! potentially contains certain Markdown elements, allowing rules to skip expensive
//! processing when unnecessary.
//!
//! # Performance
//!
//! Each predefined regex is compiled at most once, lazily on first use, via `lazy_static`,
//! avoiding repeated compilation and improving performance across the linter. Use these
//! shared patterns in rules instead of compiling new regexes.
//!
//! # Usage
//!
//! - Use the provided statics for common Markdown patterns.
//! - Use the `regex_lazy!` macro for ad-hoc regexes that are not predefined.
//! - Use the utility functions for fast content checks before running regexes.
21
22use fancy_regex::Regex as FancyRegex;
23use lazy_static::lazy_static;
24use regex::Regex;
25use std::collections::HashMap;
26use std::sync::{Arc, Mutex};
27
28/// Global regex cache for dynamic patterns
29#[derive(Debug)]
30pub struct RegexCache {
31    cache: HashMap<String, Arc<Regex>>,
32    fancy_cache: HashMap<String, Arc<FancyRegex>>,
33    usage_stats: HashMap<String, u64>,
34}
35
36impl Default for RegexCache {
37    fn default() -> Self {
38        Self::new()
39    }
40}
41
42impl RegexCache {
43    pub fn new() -> Self {
44        Self {
45            cache: HashMap::new(),
46            fancy_cache: HashMap::new(),
47            usage_stats: HashMap::new(),
48        }
49    }
50
51    /// Get or compile a regex pattern
52    pub fn get_regex(&mut self, pattern: &str) -> Result<Arc<Regex>, regex::Error> {
53        if let Some(regex) = self.cache.get(pattern) {
54            *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
55            return Ok(regex.clone());
56        }
57
58        let regex = Arc::new(Regex::new(pattern)?);
59        self.cache.insert(pattern.to_string(), regex.clone());
60        *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
61        Ok(regex)
62    }
63
64    /// Get or compile a fancy regex pattern
65    pub fn get_fancy_regex(&mut self, pattern: &str) -> Result<Arc<FancyRegex>, Box<fancy_regex::Error>> {
66        if let Some(regex) = self.fancy_cache.get(pattern) {
67            *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
68            return Ok(regex.clone());
69        }
70
71        match FancyRegex::new(pattern) {
72            Ok(regex) => {
73                let arc_regex = Arc::new(regex);
74                self.fancy_cache.insert(pattern.to_string(), arc_regex.clone());
75                *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
76                Ok(arc_regex)
77            }
78            Err(e) => Err(Box::new(e)),
79        }
80    }
81
82    /// Get cache statistics
83    pub fn get_stats(&self) -> HashMap<String, u64> {
84        self.usage_stats.clone()
85    }
86
87    /// Clear cache (useful for testing)
88    pub fn clear(&mut self) {
89        self.cache.clear();
90        self.fancy_cache.clear();
91        self.usage_stats.clear();
92    }
93}
94
95lazy_static! {
96    /// Global regex cache instance
97    static ref GLOBAL_REGEX_CACHE: Arc<Mutex<RegexCache>> = Arc::new(Mutex::new(RegexCache::new()));
98}
99
100/// Get a regex from the global cache
101pub fn get_cached_regex(pattern: &str) -> Result<Arc<Regex>, regex::Error> {
102    let mut cache = GLOBAL_REGEX_CACHE.lock().unwrap();
103    cache.get_regex(pattern)
104}
105
106/// Get a fancy regex from the global cache
107pub fn get_cached_fancy_regex(pattern: &str) -> Result<Arc<FancyRegex>, Box<fancy_regex::Error>> {
108    let mut cache = GLOBAL_REGEX_CACHE.lock().unwrap();
109    cache.get_fancy_regex(pattern)
110}
111
112/// Get cache usage statistics
113pub fn get_cache_stats() -> HashMap<String, u64> {
114    let cache = GLOBAL_REGEX_CACHE.lock().unwrap();
115    cache.get_stats()
116}
117
/// Macro for defining a lazily-initialized, cached regex pattern.
/// Use this for ad-hoc regexes that are not already defined in this module.
///
/// Each invocation expands to a block containing its own `lazy_static` static
/// holding the compiled `regex::Regex`, and evaluates to a reference to that
/// static. The expansion names `lazy_static::lazy_static!` and `regex::Regex`
/// directly, so both paths must resolve at the place where the macro is invoked.
///
/// # Panics
///
/// The pattern is compiled with `unwrap()`, so an invalid pattern panics.
///
/// Example:
/// ```
/// use rumdl_lib::regex_lazy;
/// let my_re = regex_lazy!(r"^foo.*bar$");
/// assert!(my_re.is_match("foobar"));
/// ```
#[macro_export]
macro_rules! regex_lazy {
    ($pattern:expr) => {{
        lazy_static::lazy_static! {
            static ref REGEX: regex::Regex = regex::Regex::new($pattern).unwrap();
        }
        // Deref the lazy static and re-borrow to hand out a plain `&Regex`.
        &*REGEX
    }};
}
135
/// Macro for getting regex from global cache
///
/// Expands to a call to `get_cached_regex` in this module and unwraps the result,
/// so it evaluates to the `Arc<Regex>` returned by that function.
///
/// # Panics
///
/// Panics with "Failed to compile regex" when `get_cached_regex` returns an error.
#[macro_export]
macro_rules! regex_cached {
    ($pattern:expr) => {{ $crate::utils::regex_cache::get_cached_regex($pattern).expect("Failed to compile regex") }};
}
141
/// Macro for getting fancy regex from global cache
///
/// Expands to a call to `get_cached_fancy_regex` in this module and unwraps the
/// result, so it evaluates to the `Arc<FancyRegex>` returned by that function.
///
/// # Panics
///
/// Panics with "Failed to compile fancy regex" when `get_cached_fancy_regex`
/// returns an error.
#[macro_export]
macro_rules! fancy_regex_cached {
    ($pattern:expr) => {{ $crate::utils::regex_cache::get_cached_fancy_regex($pattern).expect("Failed to compile fancy regex") }};
}
147
148// Also make the macro available directly from this module
149pub use crate::regex_lazy;
150
151lazy_static! {
152    // URL patterns
153    pub static ref URL_REGEX: Regex = Regex::new(r#"(?:https?|ftp)://[^\s<>\[\]()'"]+[^\s<>\[\]()"'.,]"#).unwrap();
154    pub static ref BARE_URL_REGEX: Regex = Regex::new(r"(?:https?|ftp)://[^\s<>]+[^\s<>.]").unwrap();
155    pub static ref URL_PATTERN: Regex = Regex::new(r"((?:https?|ftp)://[^\s\)<>]+[^\s\)<>.,])").unwrap();
156
157    // Heading patterns
158    pub static ref ATX_HEADING_REGEX: Regex = Regex::new(r"^(\s*)(#{1,6})(\s+|$)").unwrap();
159    pub static ref CLOSED_ATX_HEADING_REGEX: Regex = Regex::new(r"^(\s*)(#{1,6})(\s+)(.*)(\s+)(#+)(\s*)$").unwrap();
160    pub static ref SETEXT_HEADING_REGEX: Regex = Regex::new(r"^(\s*)[^\s]+.*\n(\s*)(=+|-+)\s*$").unwrap();
161    pub static ref TRAILING_PUNCTUATION_REGEX: Regex = Regex::new(r"[.,:;!?]$").unwrap();
162
163    // ATX heading patterns for MD051 and other rules
164    pub static ref ATX_HEADING_WITH_CAPTURE: Regex = Regex::new(r"^(#{1,6})\s+(.+?)(?:\s+#*\s*)?$").unwrap();
165    pub static ref SETEXT_HEADING_WITH_CAPTURE: FancyRegex = FancyRegex::new(r"^([^\n]+)\n([=\-])\2+\s*$").unwrap();
166
167    // List patterns
168    pub static ref UNORDERED_LIST_MARKER_REGEX: Regex = Regex::new(r"^(\s*)([*+-])(\s+)").unwrap();
169    pub static ref ORDERED_LIST_MARKER_REGEX: Regex = Regex::new(r"^(\s*)(\d+)([.)])(\s+)").unwrap();
170    pub static ref LIST_MARKER_ANY_REGEX: Regex = Regex::new(r"^(\s*)(?:([*+-])|(\d+)[.)])(\s+)").unwrap();
171
172    // Code block patterns
173    pub static ref FENCED_CODE_BLOCK_START_REGEX: Regex = Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap();
174    pub static ref FENCED_CODE_BLOCK_END_REGEX: Regex = Regex::new(r"^(\s*)(```|~~~)(\s*)$").unwrap();
175    pub static ref INDENTED_CODE_BLOCK_REGEX: Regex = Regex::new(r"^(\s{4,})(.*)$").unwrap();
176    pub static ref CODE_FENCE_REGEX: Regex = Regex::new(r"^(`{3,}|~{3,})").unwrap();
177
178    // Emphasis patterns
179    pub static ref EMPHASIS_REGEX: FancyRegex = FancyRegex::new(r"(\s|^)(\*{1,2}|_{1,2})(?=\S)(.+?)(?<=\S)(\2)(\s|$)").unwrap();
180    pub static ref SPACE_IN_EMPHASIS_REGEX: FancyRegex = FancyRegex::new(r"(\*|_)(\s+)(.+?)(\s+)(\1)").unwrap();
181
182    // MD037 specific emphasis patterns - improved to avoid false positives
183    // Only match emphasis with spaces that are actually complete emphasis blocks
184    // Use word boundaries and negative lookbehind/lookahead to avoid matching across emphasis boundaries
185    pub static ref ASTERISK_EMPHASIS: Regex = Regex::new(r"(?:^|[^*])\*(\s+[^*]+\s*|\s*[^*]+\s+)\*(?:[^*]|$)").unwrap();
186    pub static ref UNDERSCORE_EMPHASIS: Regex = Regex::new(r"(?:^|[^_])_(\s+[^_]+\s*|\s*[^_]+\s+)_(?:[^_]|$)").unwrap();
187    pub static ref DOUBLE_UNDERSCORE_EMPHASIS: Regex = Regex::new(r"(?:^|[^_])__(\s+[^_]+\s*|\s*[^_]+\s+)__(?:[^_]|$)").unwrap();
188    pub static ref DOUBLE_ASTERISK_EMPHASIS: FancyRegex = FancyRegex::new(r"\*\*\s+([^*]+?)\s+\*\*").unwrap();
189    pub static ref DOUBLE_ASTERISK_SPACE_START: FancyRegex = FancyRegex::new(r"\*\*\s+([^*]+?)\*\*").unwrap();
190    pub static ref DOUBLE_ASTERISK_SPACE_END: FancyRegex = FancyRegex::new(r"\*\*([^*]+?)\s+\*\*").unwrap();
191
192    // Code block patterns
193    pub static ref FENCED_CODE_BLOCK_START: Regex = Regex::new(r"^(\s*)```(?:[^`\r\n]*)$").unwrap();
194    pub static ref FENCED_CODE_BLOCK_END: Regex = Regex::new(r"^(\s*)```\s*$").unwrap();
195    pub static ref ALTERNATE_FENCED_CODE_BLOCK_START: Regex = Regex::new(r"^(\s*)~~~(?:[^~\r\n]*)$").unwrap();
196    pub static ref ALTERNATE_FENCED_CODE_BLOCK_END: Regex = Regex::new(r"^(\s*)~~~\s*$").unwrap();
197    pub static ref INDENTED_CODE_BLOCK_PATTERN: Regex = Regex::new(r"^(\s{4,})").unwrap();
198
199    // HTML patterns
200    pub static ref HTML_TAG_REGEX: Regex = Regex::new(r"<([a-zA-Z][^>]*)>").unwrap();
201    pub static ref HTML_SELF_CLOSING_TAG_REGEX: Regex = Regex::new(r"<([a-zA-Z][^>]*/)>").unwrap();
202    pub static ref HTML_TAG_FINDER: Regex = Regex::new("(?i)</?[a-zA-Z][^>]*>").unwrap();
203    pub static ref HTML_OPENING_TAG_FINDER: Regex = Regex::new("(?i)<[a-zA-Z][^>]*>").unwrap();
204    pub static ref HTML_TAG_QUICK_CHECK: Regex = Regex::new("(?i)</?[a-zA-Z]").unwrap();
205
206    // Link patterns for MD051 and other rules
207    pub static ref LINK_REFERENCE_DEFINITION_REGEX: Regex = Regex::new(r"^\s*\[([^\]]+)\]:\s+(.+)$").unwrap();
208    pub static ref INLINE_LINK_REGEX: Regex = Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap();
209    pub static ref LINK_TEXT_REGEX: Regex = Regex::new(r"\[([^\]]*)\]").unwrap();
210    pub static ref LINK_REGEX: FancyRegex = FancyRegex::new(r"(?<!\\)\[([^\]]*)\]\(([^)#]*)#([^)]+)\)").unwrap();
211    pub static ref EXTERNAL_URL_REGEX: FancyRegex = FancyRegex::new(r"^(https?://|ftp://|www\.|[^/]+\.[a-z]{2,})").unwrap();
212
213    // Image patterns
214    pub static ref IMAGE_REGEX: Regex = Regex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap();
215
216    // Whitespace patterns
217    pub static ref TRAILING_WHITESPACE_REGEX: Regex = Regex::new(r"\s+$").unwrap();
218    pub static ref MULTIPLE_BLANK_LINES_REGEX: Regex = Regex::new(r"\n{3,}").unwrap();
219
220    // Front matter patterns
221    pub static ref FRONT_MATTER_REGEX: Regex = Regex::new(r"^---\n.*?\n---\n").unwrap();
222
223    // MD051 specific patterns
224    pub static ref INLINE_CODE_REGEX: FancyRegex = FancyRegex::new(r"`[^`]+`").unwrap();
225    pub static ref BOLD_ASTERISK_REGEX: Regex = Regex::new(r"\*\*(.+?)\*\*").unwrap();
226    pub static ref BOLD_UNDERSCORE_REGEX: Regex = Regex::new(r"__(.+?)__").unwrap();
227    pub static ref ITALIC_ASTERISK_REGEX: Regex = Regex::new(r"\*([^*]+?)\*").unwrap();
228    pub static ref ITALIC_UNDERSCORE_REGEX: Regex = Regex::new(r"_([^_]+?)_").unwrap();
229    pub static ref LINK_TEXT_FULL_REGEX: FancyRegex = FancyRegex::new(r"\[([^\]]*)\]\([^)]*\)").unwrap();
230    pub static ref STRIKETHROUGH_REGEX: Regex = Regex::new(r"~~(.+?)~~").unwrap();
231    pub static ref MULTIPLE_HYPHENS: Regex = Regex::new(r"-{2,}").unwrap();
232    pub static ref TOC_SECTION_START: Regex = Regex::new(r"^#+\s*(?:Table of Contents|Contents|TOC)\s*$").unwrap();
233
234    // Blockquote patterns
235    pub static ref BLOCKQUOTE_PREFIX_RE: Regex = Regex::new(r"^(\s*>+\s*)").unwrap();
236
237    // MD013 specific patterns
238    pub static ref IMAGE_REF_PATTERN: Regex = Regex::new(r"^!\[.*?\]\[.*?\]$").unwrap();
239    pub static ref LINK_REF_PATTERN: Regex = Regex::new(r"^\[.*?\]:\s*https?://\S+$").unwrap();
240    pub static ref URL_IN_TEXT: Regex = Regex::new(r"https?://\S+").unwrap();
241    pub static ref SENTENCE_END: Regex = Regex::new(r"[.!?]\s+[A-Z]").unwrap();
242    pub static ref ABBREVIATION: Regex = Regex::new(r"\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|i\.e|e\.g|Inc|Corp|Ltd|Co|St|Ave|Blvd|Rd|Ph\.D|M\.D|B\.A|M\.A|Ph\.D|U\.S|U\.K|U\.N|N\.Y|L\.A|D\.C)\.\s+[A-Z]").unwrap();
243    pub static ref DECIMAL_NUMBER: Regex = Regex::new(r"\d+\.\s*\d+").unwrap();
244    pub static ref LIST_ITEM: Regex = Regex::new(r"^\s*\d+\.\s+").unwrap();
245    pub static ref REFERENCE_LINK: Regex = Regex::new(r"\[([^\]]*)\]\[([^\]]*)\]").unwrap();
246
247    // Email pattern
248    pub static ref EMAIL_PATTERN: Regex = Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap();
249}
250
// Third lazy_static block for link and image patterns used by MD052 and text_reflow
// All patterns here use `fancy_regex`; several of them rely on look-around
// (`(?<!…)`, `(?!…)`).
lazy_static! {
    // Reference link patterns (shared by MD052 and text_reflow)
    // Pattern to match reference links: [text][reference] or [text][]
    // The leading `(?<!\\)` skips an opening bracket that is backslash-escaped.
    pub static ref REF_LINK_REGEX: FancyRegex = FancyRegex::new(r"(?<!\\)\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]\[([^\]]*)\]").unwrap();

    // Pattern for shortcut reference links: [reference]
    // Must not be preceded by ] or ) (to avoid matching second part of [text][ref])
    // Must not be followed by [ or ( (to avoid matching first part of [text][ref] or [text](url))
    // The capturing group handles nested brackets to support cases like [`Union[T, None]`]
    pub static ref SHORTCUT_REF_REGEX: FancyRegex = FancyRegex::new(r"(?<![\\)\]])\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\](?!\s*[\[\(])").unwrap();

    // Inline link with fancy regex for better escaping handling (used by text_reflow)
    pub static ref INLINE_LINK_FANCY_REGEX: FancyRegex = FancyRegex::new(r"(?<!\\)\[([^\]]+)\]\(([^)]+)\)").unwrap();

    // Inline image with fancy regex (used by MD052 and text_reflow)
    pub static ref INLINE_IMAGE_FANCY_REGEX: FancyRegex = FancyRegex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap();

    // Reference image: ![alt][ref] or ![alt][]
    pub static ref REF_IMAGE_REGEX: FancyRegex = FancyRegex::new(r"!\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]\[([^\]]*)\]").unwrap();

    // Footnote reference: [^note]
    pub static ref FOOTNOTE_REF_REGEX: FancyRegex = FancyRegex::new(r"\[\^([^\]]+)\]").unwrap();

    // Strikethrough with fancy regex: ~~text~~
    pub static ref STRIKETHROUGH_FANCY_REGEX: FancyRegex = FancyRegex::new(r"~~([^~]+)~~").unwrap();

    // Wiki-style links: [[wiki]] or [[wiki|display text]]
    pub static ref WIKI_LINK_REGEX: FancyRegex = FancyRegex::new(r"\[\[([^\]]+)\]\]").unwrap();

    // Math formulas: $inline$ or $$display$$
    // The inline pattern uses look-around so that the `$` of a `$$` pair is not
    // taken as an inline delimiter.
    pub static ref INLINE_MATH_REGEX: FancyRegex = FancyRegex::new(r"(?<!\$)\$(?!\$)([^\$]+)\$(?!\$)").unwrap();
    pub static ref DISPLAY_MATH_REGEX: FancyRegex = FancyRegex::new(r"\$\$([^\$]+)\$\$").unwrap();

    // Emoji shortcodes: :emoji:
    pub static ref EMOJI_SHORTCODE_REGEX: FancyRegex = FancyRegex::new(r":([a-zA-Z0-9_+-]+):").unwrap();

    // HTML tags (opening, closing, self-closing)
    pub static ref HTML_TAG_PATTERN: FancyRegex = FancyRegex::new(r"</?[a-zA-Z][^>]*>|<[a-zA-Z][^>]*/\s*>").unwrap();

    // HTML entities: &nbsp; &mdash; etc
    // Covers named (`&name;`), decimal (`&#123;`) and hexadecimal (`&#x1F;`) forms.
    pub static ref HTML_ENTITY_REGEX: FancyRegex = FancyRegex::new(r"&[a-zA-Z][a-zA-Z0-9]*;|&#\d+;|&#x[0-9a-fA-F]+;").unwrap();
}
294
// Fourth lazy_static block for additional patterns
lazy_static! {
    // HTML comment patterns
    pub static ref HTML_COMMENT_START: Regex = Regex::new(r"<!--").unwrap();
    pub static ref HTML_COMMENT_END: Regex = Regex::new(r"-->").unwrap();
    // `[\s\S]*?` matches any character including newlines, non-greedily, so one
    // match covers a single (possibly multi-line) comment.
    pub static ref HTML_COMMENT_PATTERN: Regex = Regex::new(r"<!--[\s\S]*?-->").unwrap();

    // HTML heading pattern (matches <h1> through <h6> tags)
    // The `\1` back-reference requires the closing tag level to equal the opening one.
    pub static ref HTML_HEADING_PATTERN: FancyRegex = FancyRegex::new(r"^\s*<h([1-6])(?:\s[^>]*)?>.*</h\1>\s*$").unwrap();

    // Heading quick check pattern
    // `(?m)` makes `^` match at the start of every line, not only at the start of the input.
    pub static ref HEADING_CHECK: Regex = Regex::new(r"(?m)^(?:\s*)#").unwrap();

    // Horizontal rule patterns
    // Compact forms (`---`, `***`, `___`) followed by spaced forms (`- - -`, `* * *`, `_ _ _`).
    pub static ref HR_DASH: Regex = Regex::new(r"^\-{3,}\s*$").unwrap();
    pub static ref HR_ASTERISK: Regex = Regex::new(r"^\*{3,}\s*$").unwrap();
    pub static ref HR_UNDERSCORE: Regex = Regex::new(r"^_{3,}\s*$").unwrap();
    pub static ref HR_SPACED_DASH: Regex = Regex::new(r"^(\-\s+){2,}\-\s*$").unwrap();
    pub static ref HR_SPACED_ASTERISK: Regex = Regex::new(r"^(\*\s+){2,}\*\s*$").unwrap();
    pub static ref HR_SPACED_UNDERSCORE: Regex = Regex::new(r"^(_\s+){2,}_\s*$").unwrap();
}
316
/// Quick pre-check for headings: reports whether `content` contains a `#` anywhere.
///
/// This is a cheap filter to run before the heading regexes; a `true` result only
/// means a `#` character is present, not that a heading was recognised.
pub fn has_heading_markers(content: &str) -> bool {
    // `#` is ASCII, so scanning the raw bytes is equivalent to scanning chars.
    content.bytes().any(|byte| byte == b'#')
}
322
/// Check if content contains any lists (quick check before regex).
///
/// Returns `true` when `content` contains a bullet character (`*`, `-`, `+`), or
/// an ASCII digit together with an ordered-list delimiter (`.` or `)`). Both
/// delimiters are checked because `ORDERED_LIST_MARKER_REGEX` accepts `1.` as well
/// as `1)`; checking only `.` made this pre-check reject `1) item` lists.
///
/// A `true` result only means list markers may be present.
pub fn has_list_markers(content: &str) -> bool {
    if content.contains(|c: char| matches!(c, '*' | '-' | '+')) {
        return true;
    }

    content.contains(|c: char| matches!(c, '.' | ')')) && content.contains(|c: char| c.is_ascii_digit())
}
330
/// Check if content contains any code blocks (quick check before regex).
///
/// Returns `true` when `content` contains a backtick or tilde fence, or a line
/// that starts with four spaces or a tab (a potential indented code block).
/// The indentation check covers the first line as well as every later line;
/// looking only for `"\n    "` missed content that begins with an indented line.
///
/// A `true` result only means code blocks may be present.
pub fn has_code_block_markers(content: &str) -> bool {
    let has_fence = content.contains("```") || content.contains("~~~");

    // Indented code block potential: at the very start or right after a newline.
    let indented_first_line = content.starts_with("    ") || content.starts_with('\t');
    let indented_later_line = content.contains("\n    ") || content.contains("\n\t");

    has_fence || indented_first_line || indented_later_line
}
336
/// Quick pre-check for emphasis: reports whether `content` contains `*` or `_`.
///
/// Intended as a cheap filter before the emphasis regexes; a `true` result only
/// means one of the marker characters is present.
pub fn has_emphasis_markers(content: &str) -> bool {
    content.chars().any(|ch| matches!(ch, '*' | '_'))
}
341
/// Check if content contains any HTML tags (quick check before regex).
///
/// Returns `true` when `content` contains both a `<` and a `>` somewhere (in any
/// order). Self-closing tags need no separate test: any `/>` already contains `>`.
///
/// A `true` result only means HTML tags may be present.
pub fn has_html_tags(content: &str) -> bool {
    content.contains('<') && content.contains('>')
}
346
/// Quick pre-check for links: reports whether `content` might contain a link.
///
/// Returns `true` when `content` contains both `[` and `]`, or any of the
/// substrings `http://`, `https://` or `ftp://`.
pub fn has_link_markers(content: &str) -> bool {
    let has_brackets = content.contains('[') && content.contains(']');
    if has_brackets {
        return true;
    }

    ["http://", "https://", "ftp://"]
        .iter()
        .any(|&scheme| content.contains(scheme))
}
354
/// Quick pre-check for images: reports whether `content` contains the `![` opener.
///
/// Intended as a cheap filter before the image regexes.
pub fn has_image_markers(content: &str) -> bool {
    content.find("![").is_some()
}
359
/// Fast, regex-free check for whether `content` might contain a URL.
///
/// Returns `true` when `content` contains the characters `htt` or `ftp` and a
/// `://` begins at most 10 characters after the start of that prefix. Content
/// without any `://` is rejected immediately.
///
/// This is a heuristic pre-filter: `true` means a URL may be present. The scan
/// works directly on the string slice and does not allocate.
pub fn contains_url(content: &str) -> bool {
    // Fast check - if this substring isn't present, there's no URL.
    if !content.contains("://") {
        return false;
    }

    // "://" may start up to 10 characters after the prefix start, so it has to
    // lie entirely within the first 13 characters counted from the prefix.
    const WINDOW_CHARS: usize = 13;

    ["htt", "ftp"].iter().any(|&prefix| {
        content.match_indices(prefix).any(|(start, _)| {
            let tail = &content[start..];
            // Byte offset where the character window ends (whole tail if shorter).
            let window_end = tail
                .char_indices()
                .nth(WINDOW_CHARS)
                .map_or(tail.len(), |(offset, _)| offset);
            tail[..window_end].contains("://")
        })
    })
}
396
/// Returns a copy of `s` in which each regex metacharacter is preceded by a backslash.
///
/// The characters treated as special are
/// `. + * ? ^ $ ( ) [ ] { } |` and the backslash itself; every other character is
/// copied through unchanged.
pub fn escape_regex(s: &str) -> String {
    // Worst case every character gains a backslash, hence twice the input length.
    s.chars().fold(String::with_capacity(s.len() * 2), |mut escaped, ch| {
        let is_special = matches!(
            ch,
            '.' | '+' | '*' | '?' | '^' | '$' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '\\'
        );
        if is_special {
            escaped.push('\\');
        }
        escaped.push(ch);
        escaped
    })
}
411
// Unit tests for the cache type, the global cache helpers, the `regex_lazy!`
// macro, the quick-check functions and a sample of the predefined patterns.
#[cfg(test)]
mod tests {
    use super::*;

    // A fresh cache starts with all three maps empty.
    #[test]
    fn test_regex_cache_new() {
        let cache = RegexCache::new();
        assert!(cache.cache.is_empty());
        assert!(cache.fancy_cache.is_empty());
        assert!(cache.usage_stats.is_empty());
    }

    // `Default` yields the same empty state as `new`.
    #[test]
    fn test_regex_cache_default() {
        let cache = RegexCache::default();
        assert!(cache.cache.is_empty());
        assert!(cache.fancy_cache.is_empty());
        assert!(cache.usage_stats.is_empty());
    }

    #[test]
    fn test_get_regex_compilation() {
        let mut cache = RegexCache::new();

        // First call compiles and caches
        let regex1 = cache.get_regex(r"\d+").unwrap();
        assert_eq!(cache.cache.len(), 1);
        assert_eq!(cache.usage_stats.get(r"\d+"), Some(&1));

        // Second call returns cached version
        let regex2 = cache.get_regex(r"\d+").unwrap();
        assert_eq!(cache.cache.len(), 1);
        assert_eq!(cache.usage_stats.get(r"\d+"), Some(&2));

        // Both should be the same Arc
        assert!(Arc::ptr_eq(&regex1, &regex2));
    }

    // A pattern that fails to compile is reported and not cached.
    #[test]
    fn test_get_regex_invalid_pattern() {
        let mut cache = RegexCache::new();
        let result = cache.get_regex(r"[unterminated");
        assert!(result.is_err());
        assert!(cache.cache.is_empty());
    }

    #[test]
    fn test_get_fancy_regex_compilation() {
        let mut cache = RegexCache::new();

        // First call compiles and caches
        let regex1 = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();
        assert_eq!(cache.fancy_cache.len(), 1);
        assert_eq!(cache.usage_stats.get(r"(?<=foo)bar"), Some(&1));

        // Second call returns cached version
        let regex2 = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();
        assert_eq!(cache.fancy_cache.len(), 1);
        assert_eq!(cache.usage_stats.get(r"(?<=foo)bar"), Some(&2));

        // Both should be the same Arc
        assert!(Arc::ptr_eq(&regex1, &regex2));
    }

    // A fancy pattern that fails to compile is reported and not cached.
    #[test]
    fn test_get_fancy_regex_invalid_pattern() {
        let mut cache = RegexCache::new();
        let result = cache.get_fancy_regex(r"(?<=invalid");
        assert!(result.is_err());
        assert!(cache.fancy_cache.is_empty());
    }

    #[test]
    fn test_get_stats() {
        let mut cache = RegexCache::new();

        // Use some patterns
        let _ = cache.get_regex(r"\d+").unwrap();
        let _ = cache.get_regex(r"\d+").unwrap();
        let _ = cache.get_regex(r"\w+").unwrap();
        let _ = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();

        let stats = cache.get_stats();
        assert_eq!(stats.get(r"\d+"), Some(&2));
        assert_eq!(stats.get(r"\w+"), Some(&1));
        assert_eq!(stats.get(r"(?<=foo)bar"), Some(&1));
    }

    #[test]
    fn test_clear_cache() {
        let mut cache = RegexCache::new();

        // Add some patterns
        let _ = cache.get_regex(r"\d+").unwrap();
        let _ = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();

        assert!(!cache.cache.is_empty());
        assert!(!cache.fancy_cache.is_empty());
        assert!(!cache.usage_stats.is_empty());

        // Clear cache
        cache.clear();

        assert!(cache.cache.is_empty());
        assert!(cache.fancy_cache.is_empty());
        assert!(cache.usage_stats.is_empty());
    }

    #[test]
    fn test_global_cache_functions() {
        // Test get_cached_regex
        let regex1 = get_cached_regex(r"\d{3}").unwrap();
        let regex2 = get_cached_regex(r"\d{3}").unwrap();
        assert!(Arc::ptr_eq(&regex1, &regex2));

        // Test get_cached_fancy_regex
        let fancy1 = get_cached_fancy_regex(r"(?<=test)ing").unwrap();
        let fancy2 = get_cached_fancy_regex(r"(?<=test)ing").unwrap();
        assert!(Arc::ptr_eq(&fancy1, &fancy2));

        // Test stats
        let stats = get_cache_stats();
        assert!(stats.contains_key(r"\d{3}"));
        assert!(stats.contains_key(r"(?<=test)ing"));
    }

    #[test]
    fn test_regex_lazy_macro() {
        let re = regex_lazy!(r"^test.*end$");
        assert!(re.is_match("test something end"));
        assert!(!re.is_match("test something"));

        // The macro creates a new static for each invocation location,
        // so we can't test pointer equality across different invocations
        // But we can test that the regex works correctly
        let re2 = regex_lazy!(r"^start.*finish$");
        assert!(re2.is_match("start and finish"));
        assert!(!re2.is_match("start without end"));
    }

    #[test]
    fn test_has_heading_markers() {
        assert!(has_heading_markers("# Heading"));
        assert!(has_heading_markers("Text with # symbol"));
        assert!(!has_heading_markers("Text without heading marker"));
    }

    #[test]
    fn test_has_list_markers() {
        assert!(has_list_markers("* Item"));
        assert!(has_list_markers("- Item"));
        assert!(has_list_markers("+ Item"));
        assert!(has_list_markers("1. Item"));
        assert!(!has_list_markers("Text without list markers"));
    }

    #[test]
    fn test_has_code_block_markers() {
        assert!(has_code_block_markers("```code```"));
        assert!(has_code_block_markers("~~~code~~~"));
        assert!(has_code_block_markers("Text\n    indented code"));
        assert!(!has_code_block_markers("Text without code blocks"));
    }

    #[test]
    fn test_has_emphasis_markers() {
        assert!(has_emphasis_markers("*emphasis*"));
        assert!(has_emphasis_markers("_emphasis_"));
        assert!(has_emphasis_markers("**bold**"));
        assert!(has_emphasis_markers("__bold__"));
        assert!(!has_emphasis_markers("no emphasis"));
    }

    #[test]
    fn test_has_html_tags() {
        assert!(has_html_tags("<div>content</div>"));
        assert!(has_html_tags("<br/>"));
        assert!(has_html_tags("<img src='test.jpg'>"));
        assert!(!has_html_tags("no html tags"));
        assert!(!has_html_tags("less than < but no tag"));
    }

    #[test]
    fn test_has_link_markers() {
        assert!(has_link_markers("[text](url)"));
        assert!(has_link_markers("[reference][1]"));
        assert!(has_link_markers("http://example.com"));
        assert!(has_link_markers("https://example.com"));
        assert!(has_link_markers("ftp://example.com"));
        assert!(!has_link_markers("no links here"));
    }

    #[test]
    fn test_has_image_markers() {
        assert!(has_image_markers("![alt text](image.png)"));
        assert!(has_image_markers("![](image.png)"));
        assert!(!has_image_markers("[link](url)"));
        assert!(!has_image_markers("no images"));
    }

    #[test]
    fn test_contains_url() {
        assert!(contains_url("http://example.com"));
        assert!(contains_url("Text with https://example.com link"));
        assert!(contains_url("ftp://example.com"));
        assert!(!contains_url("Text without URL"));
        assert!(!contains_url("http not followed by ://"));

        // Edge cases
        assert!(!contains_url("http"));
        assert!(!contains_url("https"));
        assert!(!contains_url("://"));
        assert!(contains_url("Visit http://site.com now"));
        assert!(contains_url("See https://secure.site.com/path"));
    }

    // Despite the name, this checks results on large inputs; it measures no timing.
    #[test]
    fn test_contains_url_performance() {
        // Test early exit for strings without "://"
        let long_text = "a".repeat(10000);
        assert!(!contains_url(&long_text));

        // Test with URL at the end
        let text_with_url = format!("{long_text}https://example.com");
        assert!(contains_url(&text_with_url));
    }

    #[test]
    fn test_escape_regex() {
        assert_eq!(escape_regex("a.b"), "a\\.b");
        assert_eq!(escape_regex("a+b*c"), "a\\+b\\*c");
        assert_eq!(escape_regex("(test)"), "\\(test\\)");
        assert_eq!(escape_regex("[a-z]"), "\\[a-z\\]");
        assert_eq!(escape_regex("normal text"), "normal text");

        // Test all special characters
        assert_eq!(escape_regex(".$^{[(|)*+?\\"), "\\.\\$\\^\\{\\[\\(\\|\\)\\*\\+\\?\\\\");

        // Test empty string
        assert_eq!(escape_regex(""), "");

        // Test mixed content
        assert_eq!(escape_regex("test.com/path?query=1"), "test\\.com/path\\?query=1");
    }

    #[test]
    fn test_static_regex_patterns() {
        // Test URL patterns
        assert!(URL_REGEX.is_match("https://example.com"));
        assert!(URL_REGEX.is_match("http://test.org/path"));
        assert!(URL_REGEX.is_match("ftp://files.com"));
        assert!(!URL_REGEX.is_match("not a url"));

        // Test heading patterns
        assert!(ATX_HEADING_REGEX.is_match("# Heading"));
        assert!(ATX_HEADING_REGEX.is_match("  ## Indented"));
        assert!(ATX_HEADING_REGEX.is_match("### "));
        assert!(!ATX_HEADING_REGEX.is_match("Not a heading"));

        // Test list patterns
        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("* Item"));
        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("- Item"));
        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("+ Item"));
        assert!(ORDERED_LIST_MARKER_REGEX.is_match("1. Item"));
        assert!(ORDERED_LIST_MARKER_REGEX.is_match("99. Item"));

        // Test code block patterns
        assert!(FENCED_CODE_BLOCK_START_REGEX.is_match("```"));
        assert!(FENCED_CODE_BLOCK_START_REGEX.is_match("```rust"));
        assert!(FENCED_CODE_BLOCK_START_REGEX.is_match("~~~"));
        assert!(FENCED_CODE_BLOCK_END_REGEX.is_match("```"));
        assert!(FENCED_CODE_BLOCK_END_REGEX.is_match("~~~"));

        // Test emphasis patterns
        assert!(BOLD_ASTERISK_REGEX.is_match("**bold**"));
        assert!(BOLD_UNDERSCORE_REGEX.is_match("__bold__"));
        assert!(ITALIC_ASTERISK_REGEX.is_match("*italic*"));
        assert!(ITALIC_UNDERSCORE_REGEX.is_match("_italic_"));

        // Test HTML patterns
        assert!(HTML_TAG_REGEX.is_match("<div>"));
        assert!(HTML_TAG_REGEX.is_match("<span class='test'>"));
        assert!(HTML_SELF_CLOSING_TAG_REGEX.is_match("<br/>"));
        assert!(HTML_SELF_CLOSING_TAG_REGEX.is_match("<img src='test'/>"));

        // Test whitespace patterns
        assert!(TRAILING_WHITESPACE_REGEX.is_match("line with spaces   "));
        assert!(TRAILING_WHITESPACE_REGEX.is_match("tabs\t\t"));
        assert!(MULTIPLE_BLANK_LINES_REGEX.is_match("\n\n\n"));
        assert!(MULTIPLE_BLANK_LINES_REGEX.is_match("\n\n\n\n"));

        // Test blockquote pattern
        assert!(BLOCKQUOTE_PREFIX_RE.is_match("> Quote"));
        assert!(BLOCKQUOTE_PREFIX_RE.is_match("  > Indented quote"));
        assert!(BLOCKQUOTE_PREFIX_RE.is_match(">> Nested"));
    }

    // Ten threads hit the global cache concurrently, each with its own pattern.
    #[test]
    fn test_thread_safety() {
        use std::thread;

        let handles: Vec<_> = (0..10)
            .map(|i| {
                thread::spawn(move || {
                    // Builds `\d{i}`: exactly `i` digits, matched against `i` ones.
                    let pattern = format!(r"\d{{{i}}}");
                    let regex = get_cached_regex(&pattern).unwrap();
                    assert!(regex.is_match(&"1".repeat(i)));
                })
            })
            .collect();

        for handle in handles {
            handle.join().unwrap();
        }
    }
}
rumdl_lib/utils/regex_cache.rs

rumdl_lib/utils/
regex_cache.rs