rumdl_lib/utils/
regex_cache.rs

1//!
2//! Cached Regex Patterns and Fast Content Checks for Markdown Linting
3//!
4//! This module provides a centralized collection of pre-compiled, cached regex patterns
5//! for all major Markdown constructs (headings, lists, code blocks, links, images, etc.).
6//! It also includes fast-path utility functions for quickly checking if content
7//! potentially contains certain Markdown elements, allowing rules to skip expensive
8//! processing when unnecessary.
9//!
10//! # Performance
11//!
//! All regexes are compiled once, lazily on first use, via `std::sync::LazyLock`, avoiding
//! repeated compilation and improving performance across the linter. Use these shared patterns
//! in rules instead of compiling new regexes.
15//!
16//! # Usage
17//!
18//! - Use the provided statics for common Markdown patterns.
19//! - Use the `regex_lazy!` macro for ad-hoc regexes that are not predefined.
20//! - Use the utility functions for fast content checks before running regexes.
21
22use fancy_regex::Regex as FancyRegex;
23use regex::Regex;
24use std::collections::HashMap;
25use std::sync::LazyLock;
26use std::sync::{Arc, Mutex};
27
28/// Global regex cache for dynamic patterns
29#[derive(Debug)]
30pub struct RegexCache {
31    cache: HashMap<String, Arc<Regex>>,
32    usage_stats: HashMap<String, u64>,
33}
34
35impl Default for RegexCache {
36    fn default() -> Self {
37        Self::new()
38    }
39}
40
41impl RegexCache {
42    pub fn new() -> Self {
43        Self {
44            cache: HashMap::new(),
45            usage_stats: HashMap::new(),
46        }
47    }
48
49    /// Get or compile a regex pattern
50    pub fn get_regex(&mut self, pattern: &str) -> Result<Arc<Regex>, regex::Error> {
51        if let Some(regex) = self.cache.get(pattern) {
52            *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
53            return Ok(regex.clone());
54        }
55
56        let regex = Arc::new(Regex::new(pattern)?);
57        self.cache.insert(pattern.to_string(), regex.clone());
58        *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
59        Ok(regex)
60    }
61
62    /// Get cache statistics
63    pub fn get_stats(&self) -> HashMap<String, u64> {
64        self.usage_stats.clone()
65    }
66
67    /// Clear cache (useful for testing)
68    pub fn clear(&mut self) {
69        self.cache.clear();
70        self.usage_stats.clear();
71    }
72}
73
74/// Global regex cache instance
75static GLOBAL_REGEX_CACHE: LazyLock<Arc<Mutex<RegexCache>>> = LazyLock::new(|| Arc::new(Mutex::new(RegexCache::new())));
76
77/// Get a regex from the global cache
78///
79/// If the mutex is poisoned (another thread panicked while holding the lock),
80/// this function recovers by clearing the cache and continuing. This ensures
81/// the library never panics due to mutex poisoning.
82pub fn get_cached_regex(pattern: &str) -> Result<Arc<Regex>, regex::Error> {
83    let mut cache = GLOBAL_REGEX_CACHE.lock().unwrap_or_else(|poisoned| {
84        // Recover from poisoned mutex by clearing the cache
85        let mut guard = poisoned.into_inner();
86        guard.clear();
87        guard
88    });
89    cache.get_regex(pattern)
90}
91
92/// Get cache usage statistics
93///
94/// If the mutex is poisoned, returns an empty HashMap rather than panicking.
95pub fn get_cache_stats() -> HashMap<String, u64> {
96    match GLOBAL_REGEX_CACHE.lock() {
97        Ok(cache) => cache.get_stats(),
98        Err(_) => HashMap::new(),
99    }
100}
101
/// Defines a lazily-initialized regex at the call site and evaluates to a reference to it.
///
/// Intended for ad-hoc patterns that have no predefined static in this module. Each
/// expansion declares its own `static`, so the pattern is compiled at most once per
/// call site. The expansion names `LazyLock` and `regex::Regex` unqualified, so the
/// caller needs `std::sync::LazyLock` in scope (as in the example below).
///
/// # Panics
///
/// Panics on first use if the pattern is invalid. This is deliberate for constant
/// patterns: a bad pattern should surface during development.
///
/// # Example
///
/// ```
/// use std::sync::LazyLock;
/// use rumdl_lib::regex_lazy;
/// let my_re = regex_lazy!(r"^foo.*bar$");
/// assert!(my_re.is_match("foobar"));
/// ```
#[macro_export]
macro_rules! regex_lazy {
    ($pattern:expr) => {{
        static REGEX: LazyLock<regex::Regex> = LazyLock::new(|| {
            let compiled = regex::Regex::new($pattern);
            compiled.unwrap()
        });
        &*REGEX
    }};
}
127
/// Fetches a regex from the global cache via `get_cached_regex`.
///
/// # Panics
///
/// Panics with "Failed to compile regex" if the pattern is invalid. This is acceptable
/// for static patterns, where failing fast during development is the goal.
#[macro_export]
macro_rules! regex_cached {
    ($pattern:expr) => {{
        $crate::utils::regex_cache::get_cached_regex($pattern)
            .expect("Failed to compile regex")
    }};
}
138
139// Also make the macro available directly from this module
140pub use crate::regex_lazy;
141
142// =============================================================================
143// URL REGEX PATTERNS - Centralized URL Detection
144// =============================================================================
145//
146// ## Pattern Hierarchy (use the most specific pattern for your needs):
147//
148// | Pattern              | Use Case                                    | Parens | Trailing Punct |
149// |----------------------|---------------------------------------------|--------|----------------|
150// | URL_STANDARD_REGEX   | MD034 bare URL detection with auto-fix      | Yes    | Captured*      |
151// | URL_WWW_REGEX        | www.domain URLs without protocol            | Yes    | Captured*      |
152// | URL_IPV6_REGEX       | IPv6 URLs like https://[::1]/path           | Yes    | Captured*      |
153// | URL_QUICK_CHECK_REGEX| Fast early-exit check (contains URL?)       | N/A    | N/A            |
154// | URL_SIMPLE_REGEX     | Content detection, line length exemption    | No     | Excluded       |
155//
156// *Trailing punctuation is captured by the regex; use trim_trailing_punctuation() to clean.
157//
158// ## Design Principles:
159// 1. Parentheses in paths are allowed for Wikipedia-style URLs (Issue #240)
160// 2. Host portion excludes / so path is captured separately
161// 3. Unbalanced trailing parens are handled by trim_trailing_punctuation()
162// 4. All patterns exclude angle brackets <> to avoid matching autolinks
163//
164// ## URL Structure: protocol://host[:port][/path][?query][#fragment]
165
/// Regex source for standard `http(s)` / `ftp(s)` URLs with full path support.
///
/// Suited to bare-URL detection where the whole URL is needed, including
/// Wikipedia-style parentheses in the path. Trailing punctuation such as `,;.!?`
/// can be part of the match; callers are expected to trim it.
///
/// # Examples
/// - `https://example.com/path_(with_parens)?query#fragment`
/// - `https://en.wikipedia.org/wiki/Rust_(programming_language)`
pub const URL_STANDARD_STR: &str = concat!(
    // Scheme
    r#"(?:https?|ftps?|ftp)://"#,
    // Host: a bracketed IPv6 literal, or a run of characters that excludes
    // whitespace, `<>[]()`, backslash, quotes, backtick and `/`
    r#"(?:\[[0-9a-fA-F:%.\-a-zA-Z]+\]|[^\s<>\[\]()\\'\"`/]+)"#,
    // Optional port
    r#"(?::\d+)?"#,
    // Optional path, query and fragment (parentheses allowed in each)
    r#"(?:/[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:\?[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:#[^\s<>\[\]\\'\"`]*)?"#,
);
187
/// Regex source for `www.` URLs that carry no protocol prefix.
///
/// Such URLs should be converted to proper URLs or flagged as bare URLs.
/// Port, path, query string and fragment are supported in the same way as in
/// `URL_STANDARD_STR`.
///
/// # Examples
/// - `www.example.com`
/// - `www.example.com:8080`
/// - `www.example.com/path`
/// - `www.example.com?query=value`
/// - `www.example.com#section`
pub const URL_WWW_STR: &str = concat!(
    // Literal `www.` prefix
    r#"www\."#,
    // One or more dot-terminated labels followed by an alphabetic TLD (2+ letters)
    r#"(?:[a-zA-Z0-9][-a-zA-Z0-9]*\.)+"#,
    r#"[a-zA-Z]{2,}"#,
    // Optional port
    r#"(?::\d+)?"#,
    // Optional path, query and fragment (parentheses allowed in each)
    r#"(?:/[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:\?[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:#[^\s<>\[\]\\'\"`]*)?"#,
);
207
/// Regex source for URLs whose host is a bracketed IPv6 address.
///
/// Zone identifiers inside the brackets are accepted.
/// Examples: `https://[::1]/path`, `https://[fe80::1%eth0]:8080/`
pub const URL_IPV6_STR: &str = concat!(
    // Scheme followed by the bracketed IPv6 host
    r#"(?:https?|ftps?|ftp)://\[[0-9a-fA-F:%.\-a-zA-Z]+\]"#,
    // Optional port
    r#"(?::\d+)?"#,
    // Optional path, query and fragment
    r#"(?:/[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:\?[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:#[^\s<>\[\]\\'\"`]*)?"#,
);
220
/// Regex source for XMPP URIs as described by the GFM extended autolinks specification.
///
/// XMPP URIs have the form `xmpp:user@domain/resource` (there is no `://`).
/// Reference: <https://github.github.com/gfm/#autolinks-extension->
///
/// # Examples
/// - `xmpp:foo@bar.baz`
/// - `xmpp:foo@bar.baz/txt`
pub const XMPP_URI_STR: &str = concat!(
    // Scheme
    r#"xmpp:"#,
    // Local part and `@`
    r#"[a-zA-Z0-9._%+-]+@"#,
    // Domain ending in an alphabetic TLD (2+ letters)
    r#"[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"#,
    // Optional `/resource`
    r#"(?:/[^\s<>\[\]\\'\"`]*)?"#,
);
230
/// Regex source for a cheap "might contain a URL or email address" pre-filter.
///
/// Run this before the more expensive patterns to exit early. It matches a
/// protocol prefix, `xmpp:` (GFM extended autolinks), any `@`, or `www.`.
pub const URL_QUICK_CHECK_STR: &str = concat!(
    r#"(?:https?|ftps?|ftp|xmpp)://"#, // protocol://
    r#"|xmpp:"#,                       // xmpp: without slashes
    r#"|@"#,                           // possible email address
    r#"|www\."#,                       // www. prefix
);
237
/// Regex source for a loose URL pattern used for content detection.
///
/// The final matched character may not be `.` or `,`, so trailing sentence
/// punctuation is left out of the match. Use it for line-length exemption checks
/// or other cases where it only matters that a URL exists, not where exactly it ends.
pub const URL_SIMPLE_STR: &str = concat!(
    r#"(?:https?|ftps?|ftp)://"#, // protocol
    r#"[^\s<>]+"#,                // body: no whitespace or angle brackets
    r#"[^\s<>.,]"#,               // last character: additionally not `.` or `,`
);
244
245// Pre-compiled static patterns for performance
246
247/// Standard URL regex - primary pattern for bare URL detection (MD034).
248/// See [`URL_STANDARD_STR`] for documentation.
249pub static URL_STANDARD_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(URL_STANDARD_STR).unwrap());
250
251/// WWW URL regex - for URLs starting with www. without protocol.
252/// See [`URL_WWW_STR`] for documentation.
253pub static URL_WWW_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(URL_WWW_STR).unwrap());
254
255/// IPv6 URL regex - for URLs with IPv6 addresses.
256/// See [`URL_IPV6_STR`] for documentation.
257pub static URL_IPV6_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(URL_IPV6_STR).unwrap());
258
259/// Quick check regex - fast early-exit test.
260/// See [`URL_QUICK_CHECK_STR`] for documentation.
261pub static URL_QUICK_CHECK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(URL_QUICK_CHECK_STR).unwrap());
262
263/// Simple URL regex - for content detection and line length exemption.
264/// See [`URL_SIMPLE_STR`] for documentation.
265pub static URL_SIMPLE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(URL_SIMPLE_STR).unwrap());
266
267/// Alias for `URL_SIMPLE_REGEX`. Used by MD013 for line length exemption.
268pub static URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| URL_SIMPLE_REGEX.clone());
269
270/// XMPP URI regex - for GFM extended autolinks.
271/// See [`XMPP_URI_STR`] for documentation.
272pub static XMPP_URI_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(XMPP_URI_STR).unwrap());
273
274// Heading patterns
275pub static ATX_HEADING_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})(\s+|$)").unwrap());
276
277// List patterns
278pub static UNORDERED_LIST_MARKER_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])(\s+)").unwrap());
279pub static ORDERED_LIST_MARKER_REGEX: LazyLock<Regex> =
280    LazyLock::new(|| Regex::new(r"^(\s*)(\d+)([.)])(\s+)").unwrap());
281
282// Emphasis patterns
283
284// MD037 specific emphasis patterns - improved to avoid false positives
285// Only match emphasis with spaces that are actually complete emphasis blocks
286// Use word boundaries and negative lookbehind/lookahead to avoid matching across emphasis boundaries
287pub static ASTERISK_EMPHASIS: LazyLock<Regex> =
288    LazyLock::new(|| Regex::new(r"(?:^|[^*])\*(\s+[^*]+\s*|\s*[^*]+\s+)\*(?:[^*]|$)").unwrap());
289pub static UNDERSCORE_EMPHASIS: LazyLock<Regex> =
290    LazyLock::new(|| Regex::new(r"(?:^|[^_])_(\s+[^_]+\s*|\s*[^_]+\s+)_(?:[^_]|$)").unwrap());
291pub static DOUBLE_UNDERSCORE_EMPHASIS: LazyLock<Regex> =
292    LazyLock::new(|| Regex::new(r"(?:^|[^_])__(\s+[^_]+\s*|\s*[^_]+\s+)__(?:[^_]|$)").unwrap());
293// Code block patterns
294pub static FENCED_CODE_BLOCK_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)```(?:[^`\r\n]*)$").unwrap());
295pub static FENCED_CODE_BLOCK_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)```\s*$").unwrap());
296
297// HTML patterns
298pub static HTML_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<([a-zA-Z][^>]*)>").unwrap());
299pub static HTML_TAG_QUICK_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?i)</?[a-zA-Z]").unwrap());
300
301// Image patterns
302pub static IMAGE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap());
303
304// Blockquote patterns
305pub static BLOCKQUOTE_PREFIX_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
306
/// Check if a line is blank in the context of blockquotes.
///
/// A line is considered "blank" if:
/// - It's empty or contains only whitespace
/// - It's a blockquote continuation line with no content (e.g., ">", ">>", "> ")
///
/// Put differently: the line is blank exactly when every character in it is either
/// whitespace or a `>` marker. This covers spaced nesting such as `"> > "` too.
///
/// This is essential for rules like MD058 (blanks-around-tables), MD065 (blanks-around-horizontal-rules),
/// and any other rule that needs to detect blank lines that might be inside blockquotes.
///
/// # Examples
/// ```
/// use rumdl_lib::utils::regex_cache::is_blank_in_blockquote_context;
///
/// assert!(is_blank_in_blockquote_context(""));           // Empty line
/// assert!(is_blank_in_blockquote_context("   "));        // Whitespace only
/// assert!(is_blank_in_blockquote_context(">"));          // Blockquote continuation
/// assert!(is_blank_in_blockquote_context("> "));         // Blockquote with trailing space
/// assert!(is_blank_in_blockquote_context(">>"));         // Nested blockquote
/// assert!(is_blank_in_blockquote_context("> > "));       // Spaced nested blockquote
/// assert!(!is_blank_in_blockquote_context("> text"));    // Blockquote with content
/// assert!(!is_blank_in_blockquote_context("text"));      // Regular text
/// ```
pub fn is_blank_in_blockquote_context(line: &str) -> bool {
    // A single linear pass replaces "strip one `\s*>+\s*` prefix with a regex, then
    // recurse on the remainder". The result is the same, but there is no regex search
    // per nesting level and no recursion whose depth depends on the input.
    // `char::is_whitespace` uses the Unicode White_Space property, as `\s` does.
    line.chars().all(|c| c == '>' || c.is_whitespace())
}
343
344// MD013 specific patterns
345pub static IMAGE_REF_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^!\[.*?\]\[.*?\]$").unwrap());
346pub static LINK_REF_PATTERN: LazyLock<Regex> =
347    LazyLock::new(|| Regex::new(r#"^\[.*?\]:\s*\S+(\s+["'(].*)?\s*$"#).unwrap());
348pub static ABBREVIATION: LazyLock<Regex> = LazyLock::new(|| {
349    Regex::new(r"\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|i\.e|e\.g|Inc|Corp|Ltd|Co|St|Ave|Blvd|Rd|Ph\.D|M\.D|B\.A|M\.A|Ph\.D|U\.S|U\.K|U\.N|N\.Y|L\.A|D\.C)\.\s+[A-Z]").unwrap()
350});
351pub static LIST_ITEM: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\d+\.\s+").unwrap());
352
353// Email pattern
354pub static EMAIL_PATTERN: LazyLock<Regex> =
355    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
356
357// Third lazy_static block for link and image patterns used by MD052 and text_reflow
358// Reference link patterns (shared by MD052 and text_reflow)
359// Pattern to match reference links: [text][reference] or [text][]
360pub static REF_LINK_REGEX: LazyLock<FancyRegex> =
361    LazyLock::new(|| FancyRegex::new(r"(?<!\\)\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]\[([^\]]*)\]").unwrap());
362
363// Pattern for shortcut reference links: [reference]
364// Must not be preceded by ] or ) (to avoid matching second part of [text][ref])
365// Must not be followed by [ or ( (to avoid matching first part of [text][ref] or [text](url))
366// The capturing group handles nested brackets to support cases like [`Union[T, None]`]
367pub static SHORTCUT_REF_REGEX: LazyLock<FancyRegex> =
368    LazyLock::new(|| FancyRegex::new(r"(?<![\\)\]])\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\](?!\s*[\[\(])").unwrap());
369
370// Inline link with fancy regex for better escaping handling (used by text_reflow)
371pub static INLINE_LINK_FANCY_REGEX: LazyLock<FancyRegex> =
372    LazyLock::new(|| FancyRegex::new(r"(?<!\\)\[([^\]]+)\]\(([^)]+)\)").unwrap());
373
374// Inline image (used by MD052 and text_reflow)
375pub static INLINE_IMAGE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap());
376
377// Linked images (clickable badges) - all 4 variants
378// Must be detected before inline_image and inline_link to treat as atomic units
379//
380// Limitation: Alt text containing brackets like [![[v1.0]](img)](link) is not supported.
381// The [^\]]* pattern cannot match nested brackets. This is rare in practice.
382//
383// Pattern 1: Inline image in inline link - [![alt](img-url)](link-url)
384pub static LINKED_IMAGE_INLINE_INLINE: LazyLock<Regex> =
385    LazyLock::new(|| Regex::new(r"\[!\[([^\]]*)\]\(([^)]+)\)\]\(([^)]+)\)").unwrap());
386
387// Pattern 2: Reference image in inline link - [![alt][img-ref]](link-url)
388pub static LINKED_IMAGE_REF_INLINE: LazyLock<Regex> =
389    LazyLock::new(|| Regex::new(r"\[!\[([^\]]*)\]\[([^\]]*)\]\]\(([^)]+)\)").unwrap());
390
391// Pattern 3: Inline image in reference link - [![alt](img-url)][link-ref]
392pub static LINKED_IMAGE_INLINE_REF: LazyLock<Regex> =
393    LazyLock::new(|| Regex::new(r"\[!\[([^\]]*)\]\(([^)]+)\)\]\[([^\]]*)\]").unwrap());
394
395// Pattern 4: Reference image in reference link - [![alt][img-ref]][link-ref]
396pub static LINKED_IMAGE_REF_REF: LazyLock<Regex> =
397    LazyLock::new(|| Regex::new(r"\[!\[([^\]]*)\]\[([^\]]*)\]\]\[([^\]]*)\]").unwrap());
398
399// Reference image: ![alt][ref] or ![alt][]
400pub static REF_IMAGE_REGEX: LazyLock<Regex> =
401    LazyLock::new(|| Regex::new(r"!\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]\[([^\]]*)\]").unwrap());
402
403// Footnote reference: [^note]
404pub static FOOTNOTE_REF_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[\^([^\]]+)\]").unwrap());
405
406// Wiki-style links: [[wiki]] or [[wiki|display text]]
407pub static WIKI_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[\[([^\]]+)\]\]").unwrap());
408
409// Math formulas: $inline$ or $$display$$
410pub static INLINE_MATH_REGEX: LazyLock<FancyRegex> =
411    LazyLock::new(|| FancyRegex::new(r"(?<!\$)\$(?!\$)([^\$]+)\$(?!\$)").unwrap());
412pub static DISPLAY_MATH_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\$\$([^\$]+)\$\$").unwrap());
413
414// Emoji shortcodes: :emoji:
415pub static EMOJI_SHORTCODE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r":([a-zA-Z0-9_+-]+):").unwrap());
416
417// HTML tags (opening, closing, self-closing)
418pub static HTML_TAG_PATTERN: LazyLock<Regex> =
419    LazyLock::new(|| Regex::new(r"</?[a-zA-Z][^>]*>|<[a-zA-Z][^>]*/\s*>").unwrap());
420
421// HTML entities: &nbsp; &mdash; etc
422pub static HTML_ENTITY_REGEX: LazyLock<Regex> =
423    LazyLock::new(|| Regex::new(r"&[a-zA-Z][a-zA-Z0-9]*;|&#\d+;|&#x[0-9a-fA-F]+;").unwrap());
424
425// Hugo/Go template shortcodes: {{< figure ... >}} and {{% shortcode %}}
426// Matches both delimiters: {{< ... >}} (shortcode) and {{% ... %}} (template)
427// Handles multi-line content with embedded quotes and newlines
428pub static HUGO_SHORTCODE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{\{[<%][\s\S]*?[%>]\}\}").unwrap());
429
430// HTML comment pattern
431pub static HTML_COMMENT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--[\s\S]*?-->").unwrap());
432
433// HTML heading pattern (matches <h1> through <h6> tags)
434// Uses FancyRegex because the pattern requires a backreference (\1)
435pub static HTML_HEADING_PATTERN: LazyLock<FancyRegex> =
436    LazyLock::new(|| FancyRegex::new(r"^\s*<h([1-6])(?:\s[^>]*)?>.*</h\1>\s*$").unwrap());
437
/// Escapes `s` so it can be embedded in a regex pattern.
///
/// A backslash is inserted before each of `. + * ? ^ $ ( ) [ ] { } |` and `\`.
/// All other characters (including `-`) are copied through unchanged.
pub fn escape_regex(s: &str) -> String {
    // Worst case every character needs a backslash in front of it.
    let mut escaped = String::with_capacity(s.len() * 2);

    for ch in s.chars() {
        match ch {
            '.' | '+' | '*' | '?' | '^' | '$' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '\\' => {
                escaped.push('\\');
                escaped.push(ch);
            }
            _ => escaped.push(ch),
        }
    }

    escaped
}
455
456#[cfg(test)]
457mod tests {
458    use super::*;
459
460    #[test]
461    fn test_regex_cache_new() {
462        let cache = RegexCache::new();
463        assert!(cache.cache.is_empty());
464        assert!(cache.usage_stats.is_empty());
465    }
466
467    #[test]
468    fn test_regex_cache_default() {
469        let cache = RegexCache::default();
470        assert!(cache.cache.is_empty());
471        assert!(cache.usage_stats.is_empty());
472    }
473
474    #[test]
475    fn test_get_regex_compilation() {
476        let mut cache = RegexCache::new();
477
478        // First call compiles and caches
479        let regex1 = cache.get_regex(r"\d+").unwrap();
480        assert_eq!(cache.cache.len(), 1);
481        assert_eq!(cache.usage_stats.get(r"\d+"), Some(&1));
482
483        // Second call returns cached version
484        let regex2 = cache.get_regex(r"\d+").unwrap();
485        assert_eq!(cache.cache.len(), 1);
486        assert_eq!(cache.usage_stats.get(r"\d+"), Some(&2));
487
488        // Both should be the same Arc
489        assert!(Arc::ptr_eq(&regex1, &regex2));
490    }
491
492    #[test]
493    fn test_get_regex_invalid_pattern() {
494        let mut cache = RegexCache::new();
495        let result = cache.get_regex(r"[unterminated");
496        assert!(result.is_err());
497        assert!(cache.cache.is_empty());
498    }
499
500    #[test]
501    fn test_get_stats() {
502        let mut cache = RegexCache::new();
503
504        // Use some patterns
505        let _ = cache.get_regex(r"\d+").unwrap();
506        let _ = cache.get_regex(r"\d+").unwrap();
507        let _ = cache.get_regex(r"\w+").unwrap();
508
509        let stats = cache.get_stats();
510        assert_eq!(stats.get(r"\d+"), Some(&2));
511        assert_eq!(stats.get(r"\w+"), Some(&1));
512    }
513
514    #[test]
515    fn test_clear_cache() {
516        let mut cache = RegexCache::new();
517
518        // Add some patterns
519        let _ = cache.get_regex(r"\d+").unwrap();
520
521        assert!(!cache.cache.is_empty());
522        assert!(!cache.usage_stats.is_empty());
523
524        // Clear cache
525        cache.clear();
526
527        assert!(cache.cache.is_empty());
528        assert!(cache.usage_stats.is_empty());
529    }
530
531    #[test]
532    fn test_global_cache_functions() {
533        // Test get_cached_regex
534        let regex1 = get_cached_regex(r"\d{3}").unwrap();
535        let regex2 = get_cached_regex(r"\d{3}").unwrap();
536        assert!(Arc::ptr_eq(&regex1, &regex2));
537
538        // Test stats
539        let stats = get_cache_stats();
540        assert!(stats.contains_key(r"\d{3}"));
541    }
542
543    #[test]
544    fn test_regex_lazy_macro() {
545        let re = regex_lazy!(r"^test.*end$");
546        assert!(re.is_match("test something end"));
547        assert!(!re.is_match("test something"));
548
549        // The macro creates a new static for each invocation location,
550        // so we can't test pointer equality across different invocations
551        // But we can test that the regex works correctly
552        let re2 = regex_lazy!(r"^start.*finish$");
553        assert!(re2.is_match("start and finish"));
554        assert!(!re2.is_match("start without end"));
555    }
556
557    #[test]
558    fn test_escape_regex() {
559        assert_eq!(escape_regex("a.b"), "a\\.b");
560        assert_eq!(escape_regex("a+b*c"), "a\\+b\\*c");
561        assert_eq!(escape_regex("(test)"), "\\(test\\)");
562        assert_eq!(escape_regex("[a-z]"), "\\[a-z\\]");
563        assert_eq!(escape_regex("normal text"), "normal text");
564
565        // Test all special characters
566        assert_eq!(escape_regex(".$^{[(|)*+?\\"), "\\.\\$\\^\\{\\[\\(\\|\\)\\*\\+\\?\\\\");
567
568        // Test empty string
569        assert_eq!(escape_regex(""), "");
570
571        // Test mixed content
572        assert_eq!(escape_regex("test.com/path?query=1"), "test\\.com/path\\?query=1");
573    }
574
575    #[test]
576    fn test_static_regex_patterns() {
577        // Test URL patterns
578        assert!(URL_SIMPLE_REGEX.is_match("https://example.com"));
579        assert!(URL_SIMPLE_REGEX.is_match("http://test.org/path"));
580        assert!(URL_SIMPLE_REGEX.is_match("ftp://files.com"));
581        assert!(!URL_SIMPLE_REGEX.is_match("not a url"));
582
583        // Test heading patterns
584        assert!(ATX_HEADING_REGEX.is_match("# Heading"));
585        assert!(ATX_HEADING_REGEX.is_match("  ## Indented"));
586        assert!(ATX_HEADING_REGEX.is_match("### "));
587        assert!(!ATX_HEADING_REGEX.is_match("Not a heading"));
588
589        // Test list patterns
590        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("* Item"));
591        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("- Item"));
592        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("+ Item"));
593        assert!(ORDERED_LIST_MARKER_REGEX.is_match("1. Item"));
594        assert!(ORDERED_LIST_MARKER_REGEX.is_match("99. Item"));
595
596        // Test HTML patterns
597        assert!(HTML_TAG_REGEX.is_match("<div>"));
598        assert!(HTML_TAG_REGEX.is_match("<span class='test'>"));
599
600        // Test blockquote pattern
601        assert!(BLOCKQUOTE_PREFIX_RE.is_match("> Quote"));
602        assert!(BLOCKQUOTE_PREFIX_RE.is_match("  > Indented quote"));
603        assert!(BLOCKQUOTE_PREFIX_RE.is_match(">> Nested"));
604    }
605
606    #[test]
607    fn test_thread_safety() {
608        use std::thread;
609
610        let handles: Vec<_> = (0..10)
611            .map(|i| {
612                thread::spawn(move || {
613                    let pattern = format!(r"\d{{{i}}}");
614                    let regex = get_cached_regex(&pattern).unwrap();
615                    assert!(regex.is_match(&"1".repeat(i)));
616                })
617            })
618            .collect();
619
620        for handle in handles {
621            handle.join().unwrap();
622        }
623    }
624
625    // ==========================================================================
626    // Comprehensive URL Regex Tests
627    // ==========================================================================
628
629    #[test]
630    fn test_url_standard_basic() {
631        // Basic HTTP/HTTPS URLs
632        assert!(URL_STANDARD_REGEX.is_match("https://example.com"));
633        assert!(URL_STANDARD_REGEX.is_match("http://example.com"));
634        assert!(URL_STANDARD_REGEX.is_match("https://example.com/"));
635        assert!(URL_STANDARD_REGEX.is_match("https://example.com/path"));
636        assert!(URL_STANDARD_REGEX.is_match("ftp://files.example.com"));
637        assert!(URL_STANDARD_REGEX.is_match("ftps://secure.example.com"));
638
639        // Should not match non-URLs
640        assert!(!URL_STANDARD_REGEX.is_match("not a url"));
641        assert!(!URL_STANDARD_REGEX.is_match("example.com"));
642        assert!(!URL_STANDARD_REGEX.is_match("www.example.com"));
643    }
644
645    #[test]
646    fn test_url_standard_with_path() {
647        assert!(URL_STANDARD_REGEX.is_match("https://example.com/path/to/page"));
648        assert!(URL_STANDARD_REGEX.is_match("https://example.com/path/to/page.html"));
649        assert!(URL_STANDARD_REGEX.is_match("https://example.com/path/to/page/"));
650    }
651
652    #[test]
653    fn test_url_standard_with_query() {
654        assert!(URL_STANDARD_REGEX.is_match("https://example.com?query=value"));
655        assert!(URL_STANDARD_REGEX.is_match("https://example.com/path?query=value"));
656        assert!(URL_STANDARD_REGEX.is_match("https://example.com/path?a=1&b=2"));
657    }
658
659    #[test]
660    fn test_url_standard_with_fragment() {
661        assert!(URL_STANDARD_REGEX.is_match("https://example.com#section"));
662        assert!(URL_STANDARD_REGEX.is_match("https://example.com/path#section"));
663        assert!(URL_STANDARD_REGEX.is_match("https://example.com/path?query=value#section"));
664    }
665
666    #[test]
667    fn test_url_standard_with_port() {
668        assert!(URL_STANDARD_REGEX.is_match("https://example.com:8080"));
669        assert!(URL_STANDARD_REGEX.is_match("https://example.com:443/path"));
670        assert!(URL_STANDARD_REGEX.is_match("http://localhost:3000"));
671        assert!(URL_STANDARD_REGEX.is_match("https://192.168.1.1:8080/path"));
672    }
673
674    #[test]
675    fn test_url_standard_wikipedia_style_parentheses() {
676        // Wikipedia-style URLs with parentheses in path (Issue #240)
677        let url = "https://en.wikipedia.org/wiki/Rust_(programming_language)";
678        assert!(URL_STANDARD_REGEX.is_match(url));
679
680        // Verify the full URL is captured
681        let cap = URL_STANDARD_REGEX.find(url).unwrap();
682        assert_eq!(cap.as_str(), url);
683
684        // Multiple parentheses pairs
685        let url2 = "https://example.com/path_(foo)_(bar)";
686        let cap2 = URL_STANDARD_REGEX.find(url2).unwrap();
687        assert_eq!(cap2.as_str(), url2);
688    }
689
690    #[test]
691    fn test_url_standard_ipv6() {
692        // IPv6 addresses in URLs
693        assert!(URL_STANDARD_REGEX.is_match("https://[::1]/path"));
694        assert!(URL_STANDARD_REGEX.is_match("https://[2001:db8::1]:8080/path"));
695        assert!(URL_STANDARD_REGEX.is_match("http://[fe80::1%eth0]/"));
696    }
697
698    #[test]
699    fn test_url_www_basic() {
700        // www URLs without protocol
701        assert!(URL_WWW_REGEX.is_match("www.example.com"));
702        assert!(URL_WWW_REGEX.is_match("www.example.co.uk"));
703        assert!(URL_WWW_REGEX.is_match("www.sub.example.com"));
704
705        // Should not match plain domains without www
706        assert!(!URL_WWW_REGEX.is_match("example.com"));
707
708        // Note: https://www.example.com DOES match because it contains "www."
709        // The URL_WWW_REGEX is designed to find www. URLs that lack a protocol
710        // Use URL_STANDARD_REGEX for full URLs with protocols
711        assert!(URL_WWW_REGEX.is_match("https://www.example.com"));
712    }
713
714    #[test]
715    fn test_url_www_with_path() {
716        assert!(URL_WWW_REGEX.is_match("www.example.com/path"));
717        assert!(URL_WWW_REGEX.is_match("www.example.com/path/to/page"));
718        assert!(URL_WWW_REGEX.is_match("www.example.com/path_(with_parens)"));
719    }
720
/// `URL_IPV6_REGEX` must match bracketed IPv6 hosts under several schemes.
#[test]
fn test_url_ipv6_basic() {
    // Covers https/http/ftp, an explicit port, and an IPv4-mapped address.
    let ipv6_urls = [
        "https://[::1]/",
        "http://[2001:db8::1]/path",
        "https://[fe80::1]:8080/path",
        "ftp://[::ffff:192.168.1.1]/file",
    ];

    for candidate in ipv6_urls {
        assert!(
            URL_IPV6_REGEX.is_match(candidate),
            "URL_IPV6_REGEX should match: {candidate}"
        );
    }
}
729
/// `URL_IPV6_REGEX` must accept IPv6 hosts that carry a zone identifier.
#[test]
fn test_url_ipv6_with_zone_id() {
    // Zone id written raw (`%eth0`) and percent-encoded (`%25eth0`).
    let zoned_urls = ["https://[fe80::1%eth0]/path", "http://[fe80::1%25eth0]:8080/"];

    for candidate in zoned_urls {
        assert!(
            URL_IPV6_REGEX.is_match(candidate),
            "URL_IPV6_REGEX should match: {candidate}"
        );
    }
}
736
/// `URL_SIMPLE_REGEX` is the lightweight pattern used to detect whether content has URLs.
#[test]
fn test_url_simple_detection() {
    // Text with an http/https/ftp URL is flagged.
    let url_like = [
        "https://example.com",
        "http://test.org/path",
        "ftp://files.com/file.zip",
    ];
    for candidate in url_like {
        assert!(
            URL_SIMPLE_REGEX.is_match(candidate),
            "URL_SIMPLE_REGEX should match: {candidate}"
        );
    }

    // Ordinary prose is not.
    let prose = "not a url";
    assert!(!URL_SIMPLE_REGEX.is_match(prose));
}
745
/// `URL_QUICK_CHECK_REGEX` is the early-exit pattern: it should fire on anything URL-like.
#[test]
fn test_url_quick_check() {
    // Scheme URLs, a www host, and an email-style address all trigger it.
    let triggers = [
        "https://example.com",
        "http://example.com",
        "ftp://files.com",
        "www.example.com",
        "user@example.com",
    ];
    for candidate in triggers {
        assert!(
            URL_QUICK_CHECK_REGEX.is_match(candidate),
            "URL_QUICK_CHECK_REGEX should match: {candidate}"
        );
    }

    // Plain text with none of those markers does not.
    let plain = "just plain text";
    assert!(!URL_QUICK_CHECK_REGEX.is_match(plain));
}
756
/// Edge cases for `URL_STANDARD_REGEX`: trailing punctuation and angle-bracket delimiters.
#[test]
fn test_url_edge_cases() {
    // Baseline: a plain URL with a path is recognised.
    assert!(URL_STANDARD_REGEX.is_match("https://example.com/path"));

    // A URL followed by a comma: the raw regex match keeps the comma.
    // Stripping it is left to the rule (`trim_trailing_punctuation()`).
    let sentence = "Check https://example.com, it's great!";
    let raw_match = URL_STANDARD_REGEX
        .find(sentence)
        .expect("sentence contains a URL");
    assert!(raw_match.as_str().ends_with(','));

    // A URL wrapped in angle brackets is still found...
    let bracketed = "See <https://example.com> for more";
    assert!(URL_STANDARD_REGEX.is_match(bracketed));

    // ...and the match must end before the closing `>`.
    let inner = URL_STANDARD_REGEX
        .find(bracketed)
        .expect("bracketed text contains a URL");
    assert!(!inner.as_str().contains('>'));
}
778
/// `URL_STANDARD_REGEX` must match realistic URLs with fragments and query strings.
#[test]
fn test_url_with_complex_paths() {
    // Fragment, multi-parameter query, versioned asset, and `+`-encoded query.
    const REAL_WORLD_URLS: [&str; 4] = [
        "https://github.com/owner/repo/blob/main/src/file.rs#L123",
        "https://docs.example.com/api/v2/endpoint?format=json&page=1",
        "https://cdn.example.com/assets/images/logo.png?v=2023",
        "https://search.example.com/results?q=test+query&filter=all",
    ];

    REAL_WORLD_URLS
        .iter()
        .for_each(|url| assert!(URL_STANDARD_REGEX.is_match(url), "Should match: {url}"));
}
793
/// Smoke test: touching each URL static forces its pattern to compile and match once.
#[test]
fn test_url_pattern_strings_are_valid() {
    let plain_https = "https://example.com";

    assert!(URL_STANDARD_REGEX.is_match(plain_https));
    assert!(URL_WWW_REGEX.is_match("www.example.com"));
    assert!(URL_IPV6_REGEX.is_match("https://[::1]/"));
    assert!(URL_QUICK_CHECK_REGEX.is_match(plain_https));
    assert!(URL_SIMPLE_REGEX.is_match(plain_https));
}
803
804    // =========================================================================
805    // Tests for is_blank_in_blockquote_context
806    // This is a shared utility used by MD058, MD065, and other rules that need
807    // to detect blank lines inside blockquotes (Issue #305)
808    // =========================================================================
809
/// Empty and whitespace-only lines count as blank.
#[test]
fn test_is_blank_in_blockquote_context_regular_blanks() {
    // Empty string, spaces only, a tab, and mixed spaces/tab.
    for line in ["", "   ", "\t", "  \t  "] {
        assert!(
            is_blank_in_blockquote_context(line),
            "expected blank: {line:?}"
        );
    }
}
818
/// Blockquote marker lines that carry no content count as blank.
#[test]
fn test_is_blank_in_blockquote_context_blockquote_blanks() {
    // One to three `>` markers, each with and without trailing spaces.
    let marker_only_lines = [">", "> ", ">  ", ">>", ">> ", ">>>", ">>> "];

    for line in marker_only_lines {
        assert!(
            is_blank_in_blockquote_context(line),
            "expected blank: {line:?}"
        );
    }
}
830
/// Nested blockquote markers separated by spaces ("> > " style) count as blank.
#[test]
fn test_is_blank_in_blockquote_context_spaced_nested() {
    for line in ["> > ", "> > > ", ">  >  "] {
        assert!(
            is_blank_in_blockquote_context(line),
            "expected blank: {line:?}"
        );
    }
}
838
/// Leading whitespace before the blockquote marker does not stop a line being blank.
#[test]
fn test_is_blank_in_blockquote_context_with_leading_space() {
    for line in ["  >", "  > ", "  >>"] {
        assert!(
            is_blank_in_blockquote_context(line),
            "expected blank: {line:?}"
        );
    }
}
846
/// Lines with real content, inside or outside a blockquote, must not count as blank.
#[test]
fn test_is_blank_in_blockquote_context_not_blank() {
    let content_lines = [
        "text",
        "> text",
        ">> text",
        "> | table |",
        "| table |",
        "> # Heading",
        ">text", // no space after `>`, but text follows
    ];

    for line in content_lines {
        assert!(
            !is_blank_in_blockquote_context(line),
            "expected non-blank: {line:?}"
        );
    }
}
858
/// Boundary inputs around the marker/content split, as (line, expected blank?) pairs.
#[test]
fn test_is_blank_in_blockquote_context_edge_cases() {
    let cases = [
        (">a", false),      // content immediately after `>`
        ("> a", false),     // single-character content
        (">   ", true),     // several spaces after `>`
        (">  text", false), // several spaces before the content
    ];

    for (line, expect_blank) in cases {
        assert_eq!(
            is_blank_in_blockquote_context(line),
            expect_blank,
            "unexpected result for {line:?}"
        );
    }
}
867}
rumdl_lib/utils/regex_cache.rs

rumdl_lib/utils/
regex_cache.rs