rumdl_lib/utils/
regex_cache.rs

1//!
2//! Cached Regex Patterns and Fast Content Checks for Markdown Linting
3//!
4//! This module provides a centralized collection of pre-compiled, cached regex patterns
5//! for all major Markdown constructs (headings, lists, code blocks, links, images, etc.).
6//! It also includes fast-path utility functions for quickly checking if content
7//! potentially contains certain Markdown elements, allowing rules to skip expensive
8//! processing when unnecessary.
9//!
10//! # Performance
11//!
//! All predefined regexes are compiled lazily, on first use, via `std::sync::LazyLock`
//! and are then reused for the rest of the process, avoiding repeated compilation
//! across the linter. Use these shared patterns in rules instead of compiling new regexes.
15//!
16//! # Usage
17//!
18//! - Use the provided statics for common Markdown patterns.
19//! - Use the `regex_lazy!` macro for ad-hoc regexes that are not predefined.
20//! - Use the utility functions for fast content checks before running regexes.
21
22use fancy_regex::Regex as FancyRegex;
23use regex::Regex;
24use std::collections::HashMap;
25use std::sync::LazyLock;
26use std::sync::{Arc, Mutex};
27
28/// Global regex cache for dynamic patterns
29#[derive(Debug)]
30pub struct RegexCache {
31    cache: HashMap<String, Arc<Regex>>,
32    fancy_cache: HashMap<String, Arc<FancyRegex>>,
33    usage_stats: HashMap<String, u64>,
34}
35
36impl Default for RegexCache {
37    fn default() -> Self {
38        Self::new()
39    }
40}
41
42impl RegexCache {
43    pub fn new() -> Self {
44        Self {
45            cache: HashMap::new(),
46            fancy_cache: HashMap::new(),
47            usage_stats: HashMap::new(),
48        }
49    }
50
51    /// Get or compile a regex pattern
52    pub fn get_regex(&mut self, pattern: &str) -> Result<Arc<Regex>, regex::Error> {
53        if let Some(regex) = self.cache.get(pattern) {
54            *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
55            return Ok(regex.clone());
56        }
57
58        let regex = Arc::new(Regex::new(pattern)?);
59        self.cache.insert(pattern.to_string(), regex.clone());
60        *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
61        Ok(regex)
62    }
63
64    /// Get or compile a fancy regex pattern
65    pub fn get_fancy_regex(&mut self, pattern: &str) -> Result<Arc<FancyRegex>, Box<fancy_regex::Error>> {
66        if let Some(regex) = self.fancy_cache.get(pattern) {
67            *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
68            return Ok(regex.clone());
69        }
70
71        match FancyRegex::new(pattern) {
72            Ok(regex) => {
73                let arc_regex = Arc::new(regex);
74                self.fancy_cache.insert(pattern.to_string(), arc_regex.clone());
75                *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
76                Ok(arc_regex)
77            }
78            Err(e) => Err(Box::new(e)),
79        }
80    }
81
82    /// Get cache statistics
83    pub fn get_stats(&self) -> HashMap<String, u64> {
84        self.usage_stats.clone()
85    }
86
87    /// Clear cache (useful for testing)
88    pub fn clear(&mut self) {
89        self.cache.clear();
90        self.fancy_cache.clear();
91        self.usage_stats.clear();
92    }
93}
94
95/// Global regex cache instance
96static GLOBAL_REGEX_CACHE: LazyLock<Arc<Mutex<RegexCache>>> = LazyLock::new(|| Arc::new(Mutex::new(RegexCache::new())));
97
98/// Get a regex from the global cache
99///
100/// If the mutex is poisoned (another thread panicked while holding the lock),
101/// this function recovers by clearing the cache and continuing. This ensures
102/// the library never panics due to mutex poisoning.
103pub fn get_cached_regex(pattern: &str) -> Result<Arc<Regex>, regex::Error> {
104    let mut cache = GLOBAL_REGEX_CACHE.lock().unwrap_or_else(|poisoned| {
105        // Recover from poisoned mutex by clearing the cache
106        let mut guard = poisoned.into_inner();
107        guard.clear();
108        guard
109    });
110    cache.get_regex(pattern)
111}
112
113/// Get a fancy regex from the global cache
114///
115/// If the mutex is poisoned (another thread panicked while holding the lock),
116/// this function recovers by clearing the cache and continuing. This ensures
117/// the library never panics due to mutex poisoning.
118pub fn get_cached_fancy_regex(pattern: &str) -> Result<Arc<FancyRegex>, Box<fancy_regex::Error>> {
119    let mut cache = GLOBAL_REGEX_CACHE.lock().unwrap_or_else(|poisoned| {
120        // Recover from poisoned mutex by clearing the cache
121        let mut guard = poisoned.into_inner();
122        guard.clear();
123        guard
124    });
125    cache.get_fancy_regex(pattern)
126}
127
128/// Get cache usage statistics
129///
130/// If the mutex is poisoned, returns an empty HashMap rather than panicking.
131pub fn get_cache_stats() -> HashMap<String, u64> {
132    match GLOBAL_REGEX_CACHE.lock() {
133        Ok(cache) => cache.get_stats(),
134        Err(_) => HashMap::new(),
135    }
136}
137
/// Macro for defining a lazily-initialized, cached regex pattern.
///
/// Use this for ad-hoc regexes that are not already defined in this module.
/// The expansion declares a function-local `static`, so each call site compiles its
/// pattern at most once, on first evaluation.
///
/// `LazyLock` is referenced by its absolute path so the macro works at call sites
/// that have not imported `std::sync::LazyLock` themselves. The `regex` crate must
/// still be resolvable from the calling crate.
///
/// # Panics
///
/// This macro will panic at initialization if the regex pattern is invalid.
/// This is intentional for compile-time constant patterns - we want to catch
/// invalid patterns during development, not at runtime.
///
/// # Example
///
/// ```
/// use rumdl_lib::regex_lazy;
/// let my_re = regex_lazy!(r"^foo.*bar$");
/// assert!(my_re.is_match("foobar"));
/// ```
#[macro_export]
macro_rules! regex_lazy {
    ($pattern:expr) => {{
        static REGEX: ::std::sync::LazyLock<regex::Regex> =
            ::std::sync::LazyLock::new(|| regex::Regex::new($pattern).unwrap());
        &*REGEX
    }};
}
162
/// Fetches a compiled `regex::Regex` for the given pattern from the global cache.
///
/// Expands to a call to `get_cached_regex` followed by `expect`.
///
/// # Panics
///
/// Panics with "Failed to compile regex" if the pattern is invalid. This is
/// acceptable for static patterns where failing fast during development is desired.
#[macro_export]
macro_rules! regex_cached {
    ($pattern:expr) => {{
        let lookup = $crate::utils::regex_cache::get_cached_regex($pattern);
        lookup.expect("Failed to compile regex")
    }};
}
173
/// Fetches a compiled `fancy_regex::Regex` for the given pattern from the global cache.
///
/// Expands to a call to `get_cached_fancy_regex` followed by `expect`.
///
/// # Panics
///
/// Panics with "Failed to compile fancy regex" if the pattern is invalid. This is
/// acceptable for static patterns where failing fast during development is desired.
#[macro_export]
macro_rules! fancy_regex_cached {
    ($pattern:expr) => {{
        let lookup = $crate::utils::regex_cache::get_cached_fancy_regex($pattern);
        lookup.expect("Failed to compile fancy regex")
    }};
}
184
185// Also make the macro available directly from this module
186pub use crate::regex_lazy;
187
188// =============================================================================
189// URL REGEX PATTERNS - Centralized URL Detection
190// =============================================================================
191//
192// ## Pattern Hierarchy (use the most specific pattern for your needs):
193//
194// | Pattern              | Use Case                                    | Parens | Trailing Punct |
195// |----------------------|---------------------------------------------|--------|----------------|
196// | URL_STANDARD_REGEX   | MD034 bare URL detection with auto-fix      | Yes    | Captured*      |
197// | URL_WWW_REGEX        | www.domain URLs without protocol            | Yes    | Captured*      |
198// | URL_IPV6_REGEX       | IPv6 URLs like https://[::1]/path           | Yes    | Captured*      |
199// | URL_QUICK_CHECK_REGEX| Fast early-exit check (contains URL?)       | N/A    | N/A            |
200// | URL_SIMPLE_REGEX     | Content detection, line length exemption    | No     | Excluded       |
201//
202// *Trailing punctuation is captured by the regex; use trim_trailing_punctuation() to clean.
203//
204// ## Design Principles:
205// 1. Parentheses in paths are allowed for Wikipedia-style URLs (Issue #240)
206// 2. Host portion excludes / so path is captured separately
207// 3. Unbalanced trailing parens are handled by trim_trailing_punctuation()
208// 4. All patterns exclude angle brackets <> to avoid matching autolinks
209//
210// ## URL Structure: protocol://host[:port][/path][?query][#fragment]
211
/// Regex source for bare `http(s)` / `ftp(s)` URLs, including port, path, query and fragment.
///
/// The path, query and fragment parts allow parentheses, so Wikipedia-style URLs are
/// matched in full. Trailing punctuation such as `,;.!?` can end up inside the match
/// and is expected to be trimmed by the caller.
///
/// # Examples
/// - `https://example.com/path_(with_parens)?query#fragment`
/// - `https://en.wikipedia.org/wiki/Rust_(programming_language)`
pub const URL_STANDARD_STR: &str = concat!(
    // Scheme.
    r#"(?:https?|ftps?|ftp)://"#,
    // Host: either a bracketed IPv6 literal, or a run of characters that excludes
    // whitespace, angle/square brackets, parentheses, backslash, quotes, backtick and `/`.
    r#"(?:\[[0-9a-fA-F:%.\-a-zA-Z]+\]|[^\s<>\[\]()\\'\"`/]+)"#,
    // Optional `:port`.
    r#"(?::\d+)?"#,
    // Optional path, then optional query, then optional fragment (parentheses allowed).
    r#"(?:/[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:\?[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:#[^\s<>\[\]\\'\"`]*)?"#,
);
233
/// Regex source for protocol-less URLs that start with `www.`.
///
/// After the domain it accepts the same optional port, path, query string and
/// fragment parts as `URL_STANDARD_STR`.
///
/// # Examples
/// - `www.example.com`
/// - `www.example.com:8080`
/// - `www.example.com/path`
/// - `www.example.com?query=value`
/// - `www.example.com#section`
pub const URL_WWW_STR: &str = concat!(
    // Literal `www.` prefix.
    r#"www\."#,
    // One or more dot-terminated labels, then an alphabetic TLD of at least two letters.
    r#"(?:[a-zA-Z0-9][-a-zA-Z0-9]*\.)+"#,
    r#"[a-zA-Z]{2,}"#,
    // Optional `:port`.
    r#"(?::\d+)?"#,
    // Optional path, query and fragment (parentheses allowed).
    r#"(?:/[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:\?[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:#[^\s<>\[\]\\'\"`]*)?"#,
);
253
/// Regex source for URLs whose host is a bracketed IPv6 literal.
///
/// The bracketed part also admits `%`, `.`, `-` and letters, which covers zone
/// identifiers. Examples: `https://[::1]/path`, `https://[fe80::1%eth0]:8080/`
pub const URL_IPV6_STR: &str = concat!(
    // Scheme.
    r#"(?:https?|ftps?|ftp)://"#,
    // Bracketed IPv6 host followed by an optional `:port`.
    r#"\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?"#,
    // Optional path, query and fragment.
    r#"(?:/[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:\?[^\s<>\[\]\\'\"`]*)?"#,
    r#"(?:#[^\s<>\[\]\\'\"`]*)?"#,
);
266
/// Regex source for XMPP URIs as described by the GFM extended autolinks specification.
///
/// XMPP URIs have the form `xmpp:user@domain/resource` (no `://`).
/// Reference: <https://github.github.com/gfm/#autolinks-extension->
///
/// # Examples
/// - `xmpp:foo@bar.baz`
/// - `xmpp:foo@bar.baz/txt`
pub const XMPP_URI_STR: &str = concat!(
    // Scheme, without `//`.
    r#"xmpp:"#,
    // Local part up to and including `@`.
    r#"[a-zA-Z0-9._%+-]+@"#,
    // Domain ending in an alphabetic TLD of at least two letters.
    r#"[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"#,
    // Optional `/resource`.
    r#"(?:/[^\s<>\[\]\\'\"`]*)?"#,
);

/// Regex source for a cheap "might contain a URL or email address" test.
///
/// Intended as a pre-filter before the more expensive patterns. It matches a
/// `scheme://` prefix, a bare `xmpp:` (GFM extended autolinks), any `@`, or `www.`.
pub const URL_QUICK_CHECK_STR: &str = concat!(
    r#"(?:https?|ftps?|ftp|xmpp)://"#,
    r#"|xmpp:"#,
    r#"|@"#,
    r#"|www\."#,
);

/// Regex source for a loose URL match used for content detection.
///
/// The final character of a match cannot be `.` or `,`, so trailing sentence
/// punctuation of that kind is left out. Suited to line-length exemptions and similar
/// checks where knowing that a URL is present matters more than its exact extent.
pub const URL_SIMPLE_STR: &str = concat!(
    // Scheme.
    r#"(?:https?|ftps?|ftp)://"#,
    // Body: anything except whitespace and angle brackets ...
    r#"[^\s<>]+"#,
    // ... ending on a character that is additionally not `.` or `,`.
    r#"[^\s<>.,]"#,
);
290
291// Pre-compiled static patterns for performance
292
293/// Standard URL regex - primary pattern for bare URL detection (MD034).
294/// See [`URL_STANDARD_STR`] for documentation.
295pub static URL_STANDARD_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(URL_STANDARD_STR).unwrap());
296
297/// WWW URL regex - for URLs starting with www. without protocol.
298/// See [`URL_WWW_STR`] for documentation.
299pub static URL_WWW_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(URL_WWW_STR).unwrap());
300
301/// IPv6 URL regex - for URLs with IPv6 addresses.
302/// See [`URL_IPV6_STR`] for documentation.
303pub static URL_IPV6_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(URL_IPV6_STR).unwrap());
304
305/// Quick check regex - fast early-exit test.
306/// See [`URL_QUICK_CHECK_STR`] for documentation.
307pub static URL_QUICK_CHECK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(URL_QUICK_CHECK_STR).unwrap());
308
309/// Simple URL regex - for content detection and line length exemption.
310/// See [`URL_SIMPLE_STR`] for documentation.
311pub static URL_SIMPLE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(URL_SIMPLE_STR).unwrap());
312
313/// Alias for `URL_SIMPLE_REGEX`. Used by MD013 for line length exemption.
314pub static URL_PATTERN: LazyLock<Regex> = LazyLock::new(|| URL_SIMPLE_REGEX.clone());
315
316/// XMPP URI regex - for GFM extended autolinks.
317/// See [`XMPP_URI_STR`] for documentation.
318pub static XMPP_URI_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(XMPP_URI_STR).unwrap());
319
320// Heading patterns
321pub static ATX_HEADING_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})(\s+|$)").unwrap());
322pub static CLOSED_ATX_HEADING_REGEX: LazyLock<Regex> =
323    LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})(\s+)(.*)(\s+)(#+)(\s*)$").unwrap());
324pub static SETEXT_HEADING_REGEX: LazyLock<Regex> =
325    LazyLock::new(|| Regex::new(r"^(\s*)[^\s]+.*\n(\s*)(=+|-+)\s*$").unwrap());
326pub static TRAILING_PUNCTUATION_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[.,:);!?]$").unwrap());
327
328// ATX heading patterns for MD051 and other rules
329pub static ATX_HEADING_WITH_CAPTURE: LazyLock<Regex> =
330    LazyLock::new(|| Regex::new(r"^(#{1,6})\s+(.+?)(?:\s+#*\s*)?$").unwrap());
331pub static SETEXT_HEADING_WITH_CAPTURE: LazyLock<FancyRegex> =
332    LazyLock::new(|| FancyRegex::new(r"^([^\n]+)\n([=\-])\2+\s*$").unwrap());
333
334// List patterns
335pub static UNORDERED_LIST_MARKER_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])(\s+)").unwrap());
336pub static ORDERED_LIST_MARKER_REGEX: LazyLock<Regex> =
337    LazyLock::new(|| Regex::new(r"^(\s*)(\d+)([.)])(\s+)").unwrap());
338pub static LIST_MARKER_ANY_REGEX: LazyLock<Regex> =
339    LazyLock::new(|| Regex::new(r"^(\s*)(?:([*+-])|(\d+)[.)])(\s+)").unwrap());
340
341// Code block patterns
342pub static FENCED_CODE_BLOCK_START_REGEX: LazyLock<Regex> =
343    LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap());
344pub static FENCED_CODE_BLOCK_END_REGEX: LazyLock<Regex> =
345    LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(\s*)$").unwrap());
346pub static INDENTED_CODE_BLOCK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})(.*)$").unwrap());
347pub static CODE_FENCE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(`{3,}|~{3,})").unwrap());
348
349// Emphasis patterns
350pub static EMPHASIS_REGEX: LazyLock<FancyRegex> =
351    LazyLock::new(|| FancyRegex::new(r"(\s|^)(\*{1,2}|_{1,2})(?=\S)(.+?)(?<=\S)(\2)(\s|$)").unwrap());
352pub static SPACE_IN_EMPHASIS_REGEX: LazyLock<FancyRegex> =
353    LazyLock::new(|| FancyRegex::new(r"(\*|_)(\s+)(.+?)(\s+)(\1)").unwrap());
354
355// MD037 specific emphasis patterns - improved to avoid false positives
356// Only match emphasis with spaces that are actually complete emphasis blocks
357// Use word boundaries and negative lookbehind/lookahead to avoid matching across emphasis boundaries
358pub static ASTERISK_EMPHASIS: LazyLock<Regex> =
359    LazyLock::new(|| Regex::new(r"(?:^|[^*])\*(\s+[^*]+\s*|\s*[^*]+\s+)\*(?:[^*]|$)").unwrap());
360pub static UNDERSCORE_EMPHASIS: LazyLock<Regex> =
361    LazyLock::new(|| Regex::new(r"(?:^|[^_])_(\s+[^_]+\s*|\s*[^_]+\s+)_(?:[^_]|$)").unwrap());
362pub static DOUBLE_UNDERSCORE_EMPHASIS: LazyLock<Regex> =
363    LazyLock::new(|| Regex::new(r"(?:^|[^_])__(\s+[^_]+\s*|\s*[^_]+\s+)__(?:[^_]|$)").unwrap());
364pub static DOUBLE_ASTERISK_EMPHASIS: LazyLock<FancyRegex> =
365    LazyLock::new(|| FancyRegex::new(r"\*\*\s+([^*]+?)\s+\*\*").unwrap());
366pub static DOUBLE_ASTERISK_SPACE_START: LazyLock<FancyRegex> =
367    LazyLock::new(|| FancyRegex::new(r"\*\*\s+([^*]+?)\*\*").unwrap());
368pub static DOUBLE_ASTERISK_SPACE_END: LazyLock<FancyRegex> =
369    LazyLock::new(|| FancyRegex::new(r"\*\*([^*]+?)\s+\*\*").unwrap());
370
371// Code block patterns
372pub static FENCED_CODE_BLOCK_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)```(?:[^`\r\n]*)$").unwrap());
373pub static FENCED_CODE_BLOCK_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)```\s*$").unwrap());
374pub static ALTERNATE_FENCED_CODE_BLOCK_START: LazyLock<Regex> =
375    LazyLock::new(|| Regex::new(r"^(\s*)~~~(?:[^~\r\n]*)$").unwrap());
376pub static ALTERNATE_FENCED_CODE_BLOCK_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)~~~\s*$").unwrap());
377pub static INDENTED_CODE_BLOCK_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})").unwrap());
378
379// HTML patterns
380pub static HTML_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<([a-zA-Z][^>]*)>").unwrap());
381pub static HTML_SELF_CLOSING_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<([a-zA-Z][^>]*/)>").unwrap());
382pub static HTML_TAG_FINDER: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?i)</?[a-zA-Z][^>]*>").unwrap());
383pub static HTML_OPENING_TAG_FINDER: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?i)<[a-zA-Z][^>]*>").unwrap());
384pub static HTML_TAG_QUICK_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?i)</?[a-zA-Z]").unwrap());
385
386// Link patterns for MD051 and other rules
387pub static LINK_REFERENCE_DEFINITION_REGEX: LazyLock<Regex> =
388    LazyLock::new(|| Regex::new(r"^\s*\[([^\]]+)\]:\s+(.+)$").unwrap());
389pub static INLINE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
390pub static LINK_TEXT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]*)\]").unwrap());
391pub static LINK_REGEX: LazyLock<FancyRegex> =
392    LazyLock::new(|| FancyRegex::new(r"(?<!\\)\[([^\]]*)\]\(([^)#]*)#([^)]+)\)").unwrap());
393pub static EXTERNAL_URL_REGEX: LazyLock<FancyRegex> =
394    LazyLock::new(|| FancyRegex::new(r"^(https?://|ftp://|www\.|[^/]+\.[a-z]{2,})").unwrap());
395
396// Image patterns
397pub static IMAGE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap());
398
399// Whitespace patterns
400pub static TRAILING_WHITESPACE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s+$").unwrap());
401pub static MULTIPLE_BLANK_LINES_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
402
403// Front matter patterns
404pub static FRONT_MATTER_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^---\n.*?\n---\n").unwrap());
405
406// MD051 specific patterns
407pub static INLINE_CODE_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"`[^`]+`").unwrap());
408pub static BOLD_ASTERISK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*\*(.+?)\*\*").unwrap());
409pub static BOLD_UNDERSCORE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"__(.+?)__").unwrap());
410pub static ITALIC_ASTERISK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*([^*]+?)\*").unwrap());
411pub static ITALIC_UNDERSCORE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"_([^_]+?)_").unwrap());
412pub static LINK_TEXT_FULL_REGEX: LazyLock<FancyRegex> =
413    LazyLock::new(|| FancyRegex::new(r"\[([^\]]*)\]\([^)]*\)").unwrap());
414pub static STRIKETHROUGH_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"~~(.+?)~~").unwrap());
415pub static MULTIPLE_HYPHENS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"-{2,}").unwrap());
416pub static TOC_SECTION_START: LazyLock<Regex> =
417    LazyLock::new(|| Regex::new(r"^#+\s*(?:Table of Contents|Contents|TOC)\s*$").unwrap());
418
419// Blockquote patterns
420pub static BLOCKQUOTE_PREFIX_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
421
/// Check if a line is blank in the context of blockquotes.
///
/// A line is considered "blank" if:
/// - It's empty or contains only whitespace
/// - It's a blockquote continuation line with no content (e.g., ">", ">>", "> ")
///
/// This is essential for rules like MD058 (blanks-around-tables), MD065 (blanks-around-horizontal-rules),
/// and any other rule that needs to detect blank lines that might be inside blockquotes.
///
/// Repeatedly stripping a `\s*>+\s*` prefix and requiring the remainder to be blank is
/// the same as requiring every character to be either whitespace or `>`, so the check
/// is a single pass over the line. It uses no recursion, which keeps stack usage
/// constant even for very long runs of spaced markers such as `> > > ...`.
///
/// # Examples
/// ```
/// use rumdl_lib::utils::regex_cache::is_blank_in_blockquote_context;
///
/// assert!(is_blank_in_blockquote_context(""));           // Empty line
/// assert!(is_blank_in_blockquote_context("   "));        // Whitespace only
/// assert!(is_blank_in_blockquote_context(">"));          // Blockquote continuation
/// assert!(is_blank_in_blockquote_context("> "));         // Blockquote with trailing space
/// assert!(is_blank_in_blockquote_context(">>"));         // Nested blockquote
/// assert!(is_blank_in_blockquote_context("> > "));       // Spaced nested blockquote
/// assert!(!is_blank_in_blockquote_context("> text"));    // Blockquote with content
/// assert!(!is_blank_in_blockquote_context("text"));      // Regular text
/// ```
pub fn is_blank_in_blockquote_context(line: &str) -> bool {
    // `char::is_whitespace` and the regex class `\s` both follow the Unicode
    // White_Space property, so this accepts the same lines as the prefix regex did.
    line.chars().all(|ch| ch == '>' || ch.is_whitespace())
}
458
459// MD013 specific patterns
460pub static IMAGE_REF_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^!\[.*?\]\[.*?\]$").unwrap());
461pub static LINK_REF_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\[.*?\]:\s*https?://\S+$").unwrap());
462/// Greedy URL pattern for finding URLs in text for length calculation.
463///
464/// Pattern `https?://\S+` matches until whitespace, which may include trailing
465/// punctuation. This is intentional for MD013 line length calculation where
466/// we replace URLs with fixed-length placeholders.
467///
468/// For precise URL extraction, use `URL_STANDARD_REGEX` instead.
469pub static URL_IN_TEXT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
470pub static SENTENCE_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[.!?]\s+[A-Z]").unwrap());
471pub static ABBREVIATION: LazyLock<Regex> = LazyLock::new(|| {
472    Regex::new(r"\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|i\.e|e\.g|Inc|Corp|Ltd|Co|St|Ave|Blvd|Rd|Ph\.D|M\.D|B\.A|M\.A|Ph\.D|U\.S|U\.K|U\.N|N\.Y|L\.A|D\.C)\.\s+[A-Z]").unwrap()
473});
474pub static DECIMAL_NUMBER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d+\.\s*\d+").unwrap());
475pub static LIST_ITEM: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\d+\.\s+").unwrap());
476pub static REFERENCE_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]*)\]\[([^\]]*)\]").unwrap());
477
478// Email pattern
479pub static EMAIL_PATTERN: LazyLock<Regex> =
480    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
481
482// Third lazy_static block for link and image patterns used by MD052 and text_reflow
483// Reference link patterns (shared by MD052 and text_reflow)
484// Pattern to match reference links: [text][reference] or [text][]
485pub static REF_LINK_REGEX: LazyLock<FancyRegex> =
486    LazyLock::new(|| FancyRegex::new(r"(?<!\\)\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]\[([^\]]*)\]").unwrap());
487
488// Pattern for shortcut reference links: [reference]
489// Must not be preceded by ] or ) (to avoid matching second part of [text][ref])
490// Must not be followed by [ or ( (to avoid matching first part of [text][ref] or [text](url))
491// The capturing group handles nested brackets to support cases like [`Union[T, None]`]
492pub static SHORTCUT_REF_REGEX: LazyLock<FancyRegex> =
493    LazyLock::new(|| FancyRegex::new(r"(?<![\\)\]])\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\](?!\s*[\[\(])").unwrap());
494
495// Inline link with fancy regex for better escaping handling (used by text_reflow)
496pub static INLINE_LINK_FANCY_REGEX: LazyLock<FancyRegex> =
497    LazyLock::new(|| FancyRegex::new(r"(?<!\\)\[([^\]]+)\]\(([^)]+)\)").unwrap());
498
499// Inline image with fancy regex (used by MD052 and text_reflow)
500pub static INLINE_IMAGE_FANCY_REGEX: LazyLock<FancyRegex> =
501    LazyLock::new(|| FancyRegex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap());
502
503// Linked images (clickable badges) - all 4 variants
504// Must be detected before inline_image and inline_link to treat as atomic units
505//
506// Limitation: Alt text containing brackets like [![[v1.0]](img)](link) is not supported.
507// The [^\]]* pattern cannot match nested brackets. This is rare in practice.
508//
509// Pattern 1: Inline image in inline link - [![alt](img-url)](link-url)
510pub static LINKED_IMAGE_INLINE_INLINE: LazyLock<FancyRegex> =
511    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\(([^)]+)\)\]\(([^)]+)\)").unwrap());
512
513// Pattern 2: Reference image in inline link - [![alt][img-ref]](link-url)
514pub static LINKED_IMAGE_REF_INLINE: LazyLock<FancyRegex> =
515    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\[([^\]]*)\]\]\(([^)]+)\)").unwrap());
516
517// Pattern 3: Inline image in reference link - [![alt](img-url)][link-ref]
518pub static LINKED_IMAGE_INLINE_REF: LazyLock<FancyRegex> =
519    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\(([^)]+)\)\]\[([^\]]*)\]").unwrap());
520
521// Pattern 4: Reference image in reference link - [![alt][img-ref]][link-ref]
522pub static LINKED_IMAGE_REF_REF: LazyLock<FancyRegex> =
523    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\[([^\]]*)\]\]\[([^\]]*)\]").unwrap());
524
525// Reference image: ![alt][ref] or ![alt][]
526pub static REF_IMAGE_REGEX: LazyLock<FancyRegex> =
527    LazyLock::new(|| FancyRegex::new(r"!\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]\[([^\]]*)\]").unwrap());
528
529// Footnote reference: [^note]
530pub static FOOTNOTE_REF_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"\[\^([^\]]+)\]").unwrap());
531
532// Strikethrough with fancy regex: ~~text~~
533pub static STRIKETHROUGH_FANCY_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"~~([^~]+)~~").unwrap());
534
535// Wiki-style links: [[wiki]] or [[wiki|display text]]
536pub static WIKI_LINK_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"\[\[([^\]]+)\]\]").unwrap());
537
538// Math formulas: $inline$ or $$display$$
539pub static INLINE_MATH_REGEX: LazyLock<FancyRegex> =
540    LazyLock::new(|| FancyRegex::new(r"(?<!\$)\$(?!\$)([^\$]+)\$(?!\$)").unwrap());
541pub static DISPLAY_MATH_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"\$\$([^\$]+)\$\$").unwrap());
542
543// Emoji shortcodes: :emoji:
544pub static EMOJI_SHORTCODE_REGEX: LazyLock<FancyRegex> =
545    LazyLock::new(|| FancyRegex::new(r":([a-zA-Z0-9_+-]+):").unwrap());
546
547// HTML tags (opening, closing, self-closing)
548pub static HTML_TAG_PATTERN: LazyLock<FancyRegex> =
549    LazyLock::new(|| FancyRegex::new(r"</?[a-zA-Z][^>]*>|<[a-zA-Z][^>]*/\s*>").unwrap());
550
551// HTML entities: &nbsp; &mdash; etc
552pub static HTML_ENTITY_REGEX: LazyLock<FancyRegex> =
553    LazyLock::new(|| FancyRegex::new(r"&[a-zA-Z][a-zA-Z0-9]*;|&#\d+;|&#x[0-9a-fA-F]+;").unwrap());
554
555// Hugo/Go template shortcodes: {{< figure ... >}} and {{% shortcode %}}
556// Matches both delimiters: {{< ... >}} (shortcode) and {{% ... %}} (template)
557// Handles multi-line content with embedded quotes and newlines
558pub static HUGO_SHORTCODE_REGEX: LazyLock<FancyRegex> =
559    LazyLock::new(|| FancyRegex::new(r"\{\{[<%][\s\S]*?[%>]\}\}").unwrap());
560
561// Fourth lazy_static block for additional patterns
562// HTML comment patterns
563pub static HTML_COMMENT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--").unwrap());
564pub static HTML_COMMENT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"-->").unwrap());
565pub static HTML_COMMENT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--[\s\S]*?-->").unwrap());
566
567// HTML heading pattern (matches <h1> through <h6> tags)
568pub static HTML_HEADING_PATTERN: LazyLock<FancyRegex> =
569    LazyLock::new(|| FancyRegex::new(r"^\s*<h([1-6])(?:\s[^>]*)?>.*</h\1>\s*$").unwrap());
570
571// Heading quick check pattern
572pub static HEADING_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^(?:\s*)#").unwrap());
573
574// Horizontal rule patterns
575pub static HR_DASH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\-{3,}\s*$").unwrap());
576pub static HR_ASTERISK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\*{3,}\s*$").unwrap());
577pub static HR_UNDERSCORE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^_{3,}\s*$").unwrap());
578pub static HR_SPACED_DASH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\-\s+){2,}\-\s*$").unwrap());
579pub static HR_SPACED_ASTERISK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\*\s+){2,}\*\s*$").unwrap());
580pub static HR_SPACED_UNDERSCORE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(_\s+){2,}_\s*$").unwrap());
581
/// Quick pre-check for headings: reports whether `content` contains a `#` anywhere.
///
/// A `true` result only means a heading is possible; callers still need the heading
/// regexes to confirm it.
pub fn has_heading_markers(content: &str) -> bool {
    // `#` is ASCII, so a byte scan finds exactly the same occurrences as a char scan.
    content.as_bytes().contains(&b'#')
}
587
/// Check if content contains any lists (quick check before regex)
///
/// Returns `true` when `content` contains a bullet character (`*`, `-`, `+`), or an
/// ASCII digit together with an ordered-list delimiter. Both `.` and `)` count as
/// delimiters, matching what `ORDERED_LIST_MARKER_REGEX` accepts, so lists written as
/// `1) item` are no longer skipped by this pre-check.
///
/// This is a coarse filter: `true` does not guarantee that a list is present.
pub fn has_list_markers(content: &str) -> bool {
    let has_bullet = content.contains(['*', '-', '+']);
    has_bullet || (content.contains(['.', ')']) && content.contains(|c: char| c.is_ascii_digit()))
}
595
/// Check if content contains any code blocks (quick check before regex)
///
/// Returns `true` when `content` contains a backtick or tilde fence, or a line that
/// starts with four spaces. The very first line is checked as well as lines that
/// follow a newline, so a document that opens with an indented code block is not
/// skipped by this pre-check.
pub fn has_code_block_markers(content: &str) -> bool {
    const INDENT: &str = "    ";

    content.contains("```")
        || content.contains("~~~")
        // Potential indented code block: on the first line or after any newline.
        || content.starts_with(INDENT)
        || content.contains("\n    ")
}
601
/// Quick pre-check for emphasis: reports whether `content` contains `*` or `_`.
///
/// A `true` result only means emphasis is possible; it does not confirm it.
pub fn has_emphasis_markers(content: &str) -> bool {
    content.contains(['*', '_'])
}
606
/// Check if content contains any HTML tags (quick check before regex)
///
/// Returns `true` when `content` contains both a `<` and a `>` somewhere, in any
/// order. Self-closing tags need no separate test: any `/>` already contains `>`.
pub fn has_html_tags(content: &str) -> bool {
    content.contains('<') && content.contains('>')
}
611
/// Quick pre-check for links.
///
/// Returns `true` when `content` contains both `[` and `]` (in any order), or one of
/// the literal prefixes `http://`, `https://` or `ftp://`.
pub fn has_link_markers(content: &str) -> bool {
    const SCHEME_PREFIXES: [&str; 3] = ["http://", "https://", "ftp://"];

    let has_brackets = content.contains('[') && content.contains(']');
    has_brackets || SCHEME_PREFIXES.iter().any(|prefix| content.contains(*prefix))
}
619
/// Quick pre-check for images: reports whether `content` contains the sequence `![`.
pub fn has_image_markers(content: &str) -> bool {
    content.find("![").is_some()
}
624
/// Cheap, regex-free test for whether `content` appears to contain a URL.
///
/// Returns `true` when some position in `content` starts with `htt` or `ftp` and the
/// sequence `://` begins no more than 10 characters after that position. The scan is
/// case-sensitive and does not validate the rest of the URL.
///
/// The input is scanned in place; no intermediate character buffer is allocated.
pub fn contains_url(content: &str) -> bool {
    // A `://` may start at offsets 0..=10 from the scheme start, so it has to lie
    // entirely within the first 13 characters counted from that start.
    const WINDOW_CHARS: usize = 13;

    // Fast check - without "://" anywhere there cannot be a URL.
    if !content.contains("://") {
        return false;
    }

    content.char_indices().any(|(start, _)| {
        let rest = &content[start..];
        if !(rest.starts_with("htt") || rest.starts_with("ftp")) {
            return false;
        }
        // Limit the lookahead by character count (not bytes), then search the window.
        let window_end = rest
            .char_indices()
            .nth(WINDOW_CHARS)
            .map_or(rest.len(), |(offset, _)| offset);
        rest[..window_end].contains("://")
    })
}
661
/// Escapes `s` so it can be embedded literally in a regex pattern.
///
/// Each of the characters `.` `+` `*` `?` `^` `$` `(` `)` `[` `]` `{` `}` `|`
/// and `\` is prefixed with a backslash; every other character is copied
/// through unchanged.
pub fn escape_regex(s: &str) -> String {
    // Every escaped character is a single byte that grows to two, so twice
    // the input length is always enough capacity.
    let mut escaped = String::with_capacity(s.len() * 2);

    s.chars().for_each(|ch| {
        // A pattern match instead of searching a list of special characters.
        let is_meta = matches!(
            ch,
            '.' | '+' | '*' | '?' | '^' | '$' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '\\'
        );
        if is_meta {
            escaped.push('\\');
        }
        escaped.push(ch);
    });

    escaped
}
679
#[cfg(test)]
mod tests {
    use super::*;

    // ----- shared assertion helpers -----------------------------------------

    /// Asserts that every internal map of `cache` is empty.
    fn assert_cache_is_empty(cache: &RegexCache) {
        assert!(cache.cache.is_empty());
        assert!(cache.fancy_cache.is_empty());
        assert!(cache.usage_stats.is_empty());
    }

    /// Asserts that `regex` matches every string in `inputs`.
    fn assert_all_match(regex: &Regex, inputs: &[&str]) {
        for input in inputs {
            assert!(regex.is_match(input), "expected a match for {input:?}");
        }
    }

    /// Asserts that `regex` matches none of the strings in `inputs`.
    fn assert_none_match(regex: &Regex, inputs: &[&str]) {
        for input in inputs {
            assert!(!regex.is_match(input), "expected no match for {input:?}");
        }
    }

    // ----- RegexCache --------------------------------------------------------

    #[test]
    fn test_regex_cache_new() {
        assert_cache_is_empty(&RegexCache::new());
    }

    #[test]
    fn test_regex_cache_default() {
        assert_cache_is_empty(&RegexCache::default());
    }

    #[test]
    fn test_get_regex_compilation() {
        let pattern = r"\d+";
        let mut cache = RegexCache::new();

        // The first request compiles the pattern and stores it.
        let first = cache.get_regex(pattern).unwrap();
        assert_eq!(cache.cache.len(), 1);
        assert_eq!(cache.usage_stats.get(pattern), Some(&1));

        // The second request is served from the cache.
        let second = cache.get_regex(pattern).unwrap();
        assert_eq!(cache.cache.len(), 1);
        assert_eq!(cache.usage_stats.get(pattern), Some(&2));

        // Both handles point at the same compiled regex.
        assert!(Arc::ptr_eq(&first, &second));
    }

    #[test]
    fn test_get_regex_invalid_pattern() {
        let mut cache = RegexCache::new();
        assert!(cache.get_regex(r"[unterminated").is_err());
        assert!(cache.cache.is_empty());
    }

    #[test]
    fn test_get_fancy_regex_compilation() {
        let pattern = r"(?<=foo)bar";
        let mut cache = RegexCache::new();

        // The first request compiles the pattern and stores it.
        let first = cache.get_fancy_regex(pattern).unwrap();
        assert_eq!(cache.fancy_cache.len(), 1);
        assert_eq!(cache.usage_stats.get(pattern), Some(&1));

        // The second request is served from the cache.
        let second = cache.get_fancy_regex(pattern).unwrap();
        assert_eq!(cache.fancy_cache.len(), 1);
        assert_eq!(cache.usage_stats.get(pattern), Some(&2));

        // Both handles point at the same compiled regex.
        assert!(Arc::ptr_eq(&first, &second));
    }

    #[test]
    fn test_get_fancy_regex_invalid_pattern() {
        let mut cache = RegexCache::new();
        assert!(cache.get_fancy_regex(r"(?<=invalid").is_err());
        assert!(cache.fancy_cache.is_empty());
    }

    #[test]
    fn test_get_stats() {
        let mut cache = RegexCache::new();

        // Request `\d+` twice and the other patterns once each.
        for pattern in [r"\d+", r"\d+", r"\w+"] {
            let _ = cache.get_regex(pattern).unwrap();
        }
        let _ = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();

        let stats = cache.get_stats();
        for (pattern, uses) in [(r"\d+", 2), (r"\w+", 1), (r"(?<=foo)bar", 1)] {
            assert_eq!(stats.get(pattern), Some(&uses), "usage count for {pattern:?}");
        }
    }

    #[test]
    fn test_clear_cache() {
        let mut cache = RegexCache::new();

        // Populate both caches.
        let _ = cache.get_regex(r"\d+").unwrap();
        let _ = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();

        assert!(!cache.cache.is_empty());
        assert!(!cache.fancy_cache.is_empty());
        assert!(!cache.usage_stats.is_empty());

        // Clearing empties all three maps.
        cache.clear();
        assert_cache_is_empty(&cache);
    }

    #[test]
    fn test_global_cache_functions() {
        // get_cached_regex hands out the same Arc for the same pattern
        let plain_a = get_cached_regex(r"\d{3}").unwrap();
        let plain_b = get_cached_regex(r"\d{3}").unwrap();
        assert!(Arc::ptr_eq(&plain_a, &plain_b));

        // get_cached_fancy_regex does the same
        let fancy_a = get_cached_fancy_regex(r"(?<=test)ing").unwrap();
        let fancy_b = get_cached_fancy_regex(r"(?<=test)ing").unwrap();
        assert!(Arc::ptr_eq(&fancy_a, &fancy_b));

        // Both patterns show up in the global statistics
        let stats = get_cache_stats();
        for pattern in [r"\d{3}", r"(?<=test)ing"] {
            assert!(stats.contains_key(pattern), "missing stats for {pattern:?}");
        }
    }

    #[test]
    fn test_regex_lazy_macro() {
        let anchored = regex_lazy!(r"^test.*end$");
        assert!(anchored.is_match("test something end"));
        assert!(!anchored.is_match("test something"));

        // Each invocation site of the macro gets its own static, so pointer
        // equality across invocations cannot be tested; checking that a
        // second, independent invocation works is enough.
        let other = regex_lazy!(r"^start.*finish$");
        assert!(other.is_match("start and finish"));
        assert!(!other.is_match("start without end"));
    }

    // ----- quick content checks ---------------------------------------------

    #[test]
    fn test_has_heading_markers() {
        for text in ["# Heading", "Text with # symbol"] {
            assert!(has_heading_markers(text), "expected markers in {text:?}");
        }
        assert!(!has_heading_markers("Text without heading marker"));
    }

    #[test]
    fn test_has_list_markers() {
        for text in ["* Item", "- Item", "+ Item", "1. Item"] {
            assert!(has_list_markers(text), "expected markers in {text:?}");
        }
        assert!(!has_list_markers("Text without list markers"));
    }

    #[test]
    fn test_has_code_block_markers() {
        for text in ["```code```", "~~~code~~~", "Text\n    indented code"] {
            assert!(has_code_block_markers(text), "expected markers in {text:?}");
        }
        assert!(!has_code_block_markers("Text without code blocks"));
    }

    #[test]
    fn test_has_emphasis_markers() {
        for text in ["*emphasis*", "_emphasis_", "**bold**", "__bold__"] {
            assert!(has_emphasis_markers(text), "expected markers in {text:?}");
        }
        assert!(!has_emphasis_markers("no emphasis"));
    }

    #[test]
    fn test_has_html_tags() {
        for text in ["<div>content</div>", "<br/>", "<img src='test.jpg'>"] {
            assert!(has_html_tags(text), "expected tags in {text:?}");
        }
        for text in ["no html tags", "less than < but no tag"] {
            assert!(!has_html_tags(text), "expected no tags in {text:?}");
        }
    }

    #[test]
    fn test_has_link_markers() {
        let with_links = [
            "[text](url)",
            "[reference][1]",
            "http://example.com",
            "https://example.com",
            "ftp://example.com",
        ];
        for text in with_links {
            assert!(has_link_markers(text), "expected markers in {text:?}");
        }
        assert!(!has_link_markers("no links here"));
    }

    #[test]
    fn test_has_image_markers() {
        for text in ["![alt text](image.png)", "![](image.png)"] {
            assert!(has_image_markers(text), "expected markers in {text:?}");
        }
        for text in ["[link](url)", "no images"] {
            assert!(!has_image_markers(text), "expected no markers in {text:?}");
        }
    }

    #[test]
    fn test_contains_url() {
        let with_url = [
            "http://example.com",
            "Text with https://example.com link",
            "ftp://example.com",
            "Visit http://site.com now",
            "See https://secure.site.com/path",
        ];
        for text in with_url {
            assert!(contains_url(text), "expected a URL in {text:?}");
        }

        let without_url = [
            "Text without URL",
            "http not followed by ://",
            // Edge cases: a protocol name or a separator on its own
            "http",
            "https",
            "://",
        ];
        for text in without_url {
            assert!(!contains_url(text), "expected no URL in {text:?}");
        }
    }

    #[test]
    fn test_contains_url_performance() {
        // Long input without "://" takes the early exit
        let filler = "a".repeat(10000);
        assert!(!contains_url(&filler));

        // Same input with a URL appended at the very end
        let filler_then_url = format!("{filler}https://example.com");
        assert!(contains_url(&filler_then_url));
    }

    #[test]
    fn test_escape_regex() {
        let cases = [
            ("a.b", "a\\.b"),
            ("a+b*c", "a\\+b\\*c"),
            ("(test)", "\\(test\\)"),
            ("[a-z]", "\\[a-z\\]"),
            ("normal text", "normal text"),
            // Many special characters in a single input
            (".$^{[(|)*+?\\", "\\.\\$\\^\\{\\[\\(\\|\\)\\*\\+\\?\\\\"),
            // Empty string
            ("", ""),
            // Mixed content
            ("test.com/path?query=1", "test\\.com/path\\?query=1"),
        ];

        for (raw, expected) in cases {
            assert_eq!(escape_regex(raw), expected, "escaping {raw:?}");
        }
    }

    // ----- predefined static patterns ---------------------------------------

    #[test]
    fn test_static_regex_patterns() {
        // URL patterns
        assert_all_match(
            &URL_SIMPLE_REGEX,
            &["https://example.com", "http://test.org/path", "ftp://files.com"],
        );
        assert_none_match(&URL_SIMPLE_REGEX, &["not a url"]);

        // Heading patterns
        assert_all_match(&ATX_HEADING_REGEX, &["# Heading", "  ## Indented", "### "]);
        assert_none_match(&ATX_HEADING_REGEX, &["Not a heading"]);

        // List patterns
        assert_all_match(&UNORDERED_LIST_MARKER_REGEX, &["* Item", "- Item", "+ Item"]);
        assert_all_match(&ORDERED_LIST_MARKER_REGEX, &["1. Item", "99. Item"]);

        // Code block patterns
        assert_all_match(&FENCED_CODE_BLOCK_START_REGEX, &["```", "```rust", "~~~"]);
        assert_all_match(&FENCED_CODE_BLOCK_END_REGEX, &["```", "~~~"]);

        // Emphasis patterns
        assert_all_match(&BOLD_ASTERISK_REGEX, &["**bold**"]);
        assert_all_match(&BOLD_UNDERSCORE_REGEX, &["__bold__"]);
        assert_all_match(&ITALIC_ASTERISK_REGEX, &["*italic*"]);
        assert_all_match(&ITALIC_UNDERSCORE_REGEX, &["_italic_"]);

        // HTML patterns
        assert_all_match(&HTML_TAG_REGEX, &["<div>", "<span class='test'>"]);
        assert_all_match(&HTML_SELF_CLOSING_TAG_REGEX, &["<br/>", "<img src='test'/>"]);

        // Whitespace patterns
        assert_all_match(&TRAILING_WHITESPACE_REGEX, &["line with spaces   ", "tabs\t\t"]);
        assert_all_match(&MULTIPLE_BLANK_LINES_REGEX, &["\n\n\n", "\n\n\n\n"]);

        // Blockquote pattern
        assert_all_match(
            &BLOCKQUOTE_PREFIX_RE,
            &["> Quote", "  > Indented quote", ">> Nested"],
        );
    }

    #[test]
    fn test_thread_safety() {
        use std::thread;

        // Ten threads each request a different pattern from the global cache.
        let workers: Vec<_> = (0..10)
            .map(|i| {
                thread::spawn(move || {
                    let pattern = format!(r"\d{{{i}}}");
                    let regex = get_cached_regex(&pattern).unwrap();
                    assert!(regex.is_match(&"1".repeat(i)));
                })
            })
            .collect();

        workers
            .into_iter()
            .for_each(|worker| worker.join().unwrap());
    }

    // ==========================================================================
    // Comprehensive URL Regex Tests
    // ==========================================================================

    #[test]
    fn test_url_standard_basic() {
        // Basic HTTP/HTTPS (and FTP/FTPS) URLs
        assert_all_match(
            &URL_STANDARD_REGEX,
            &[
                "https://example.com",
                "http://example.com",
                "https://example.com/",
                "https://example.com/path",
                "ftp://files.example.com",
                "ftps://secure.example.com",
            ],
        );

        // Text without a protocol must not match
        assert_none_match(
            &URL_STANDARD_REGEX,
            &["not a url", "example.com", "www.example.com"],
        );
    }

    #[test]
    fn test_url_standard_with_path() {
        assert_all_match(
            &URL_STANDARD_REGEX,
            &[
                "https://example.com/path/to/page",
                "https://example.com/path/to/page.html",
                "https://example.com/path/to/page/",
            ],
        );
    }

    #[test]
    fn test_url_standard_with_query() {
        assert_all_match(
            &URL_STANDARD_REGEX,
            &[
                "https://example.com?query=value",
                "https://example.com/path?query=value",
                "https://example.com/path?a=1&b=2",
            ],
        );
    }

    #[test]
    fn test_url_standard_with_fragment() {
        assert_all_match(
            &URL_STANDARD_REGEX,
            &[
                "https://example.com#section",
                "https://example.com/path#section",
                "https://example.com/path?query=value#section",
            ],
        );
    }

    #[test]
    fn test_url_standard_with_port() {
        assert_all_match(
            &URL_STANDARD_REGEX,
            &[
                "https://example.com:8080",
                "https://example.com:443/path",
                "http://localhost:3000",
                "https://192.168.1.1:8080/path",
            ],
        );
    }

    #[test]
    fn test_url_standard_wikipedia_style_parentheses() {
        // Wikipedia-style URLs with parentheses in the path (Issue #240)
        let wikipedia = "https://en.wikipedia.org/wiki/Rust_(programming_language)";
        assert!(URL_STANDARD_REGEX.is_match(wikipedia));

        // The match must span the complete URL
        let whole = URL_STANDARD_REGEX.find(wikipedia).unwrap();
        assert_eq!(whole.as_str(), wikipedia);

        // Several parenthesised segments in one path
        let two_pairs = "https://example.com/path_(foo)_(bar)";
        let whole_two = URL_STANDARD_REGEX.find(two_pairs).unwrap();
        assert_eq!(whole_two.as_str(), two_pairs);
    }

    #[test]
    fn test_url_standard_ipv6() {
        // IPv6 hosts in URLs
        assert_all_match(
            &URL_STANDARD_REGEX,
            &[
                "https://[::1]/path",
                "https://[2001:db8::1]:8080/path",
                "http://[fe80::1%eth0]/",
            ],
        );
    }

    #[test]
    fn test_url_www_basic() {
        // www URLs without a protocol
        assert_all_match(
            &URL_WWW_REGEX,
            &["www.example.com", "www.example.co.uk", "www.sub.example.com"],
        );

        // A plain domain without www must not match
        assert_none_match(&URL_WWW_REGEX, &["example.com"]);

        // Note: https://www.example.com DOES match because it contains "www."
        // URL_WWW_REGEX is meant for www. URLs that lack a protocol;
        // use URL_STANDARD_REGEX for full URLs with protocols.
        assert_all_match(&URL_WWW_REGEX, &["https://www.example.com"]);
    }

    #[test]
    fn test_url_www_with_path() {
        assert_all_match(
            &URL_WWW_REGEX,
            &[
                "www.example.com/path",
                "www.example.com/path/to/page",
                "www.example.com/path_(with_parens)",
            ],
        );
    }

    #[test]
    fn test_url_ipv6_basic() {
        // IPv6-specific patterns
        assert_all_match(
            &URL_IPV6_REGEX,
            &[
                "https://[::1]/",
                "http://[2001:db8::1]/path",
                "https://[fe80::1]:8080/path",
                "ftp://[::ffff:192.168.1.1]/file",
            ],
        );
    }

    #[test]
    fn test_url_ipv6_with_zone_id() {
        // IPv6 with zone identifiers
        assert_all_match(
            &URL_IPV6_REGEX,
            &["https://[fe80::1%eth0]/path", "http://[fe80::1%25eth0]:8080/"],
        );
    }

    #[test]
    fn test_url_simple_detection() {
        // Simple pattern used for content characteristic detection
        assert_all_match(
            &URL_SIMPLE_REGEX,
            &[
                "https://example.com",
                "http://test.org/path",
                "ftp://files.com/file.zip",
            ],
        );
        assert_none_match(&URL_SIMPLE_REGEX, &["not a url"]);
    }

    #[test]
    fn test_url_quick_check() {
        // Quick-check pattern used for early exits
        assert_all_match(
            &URL_QUICK_CHECK_REGEX,
            &[
                "https://example.com",
                "http://example.com",
                "ftp://files.com",
                "www.example.com",
                "user@example.com",
            ],
        );
        assert_none_match(&URL_QUICK_CHECK_REGEX, &["just plain text"]);
    }

    #[test]
    fn test_url_edge_cases() {
        // A plain URL with a path matches
        assert!(URL_STANDARD_REGEX.is_match("https://example.com/path"));

        // URL followed by punctuation: the regex itself captures the trailing
        // comma, because trimming is done by `trim_trailing_punctuation()` in
        // the rule.
        let sentence = "Check https://example.com, it's great!";
        let in_sentence = URL_STANDARD_REGEX.find(sentence).unwrap();
        assert!(in_sentence.as_str().ends_with(','));

        // A URL wrapped in angle brackets is still found
        let bracketed = "See <https://example.com> for more";
        assert!(URL_STANDARD_REGEX.is_match(bracketed));

        // ...and the match stops before the closing `>`
        let in_brackets = URL_STANDARD_REGEX.find(bracketed).unwrap();
        assert!(!in_brackets.as_str().contains('>'));
    }

    #[test]
    fn test_url_with_complex_paths() {
        // Complex real-world URLs
        let urls = [
            "https://github.com/owner/repo/blob/main/src/file.rs#L123",
            "https://docs.example.com/api/v2/endpoint?format=json&page=1",
            "https://cdn.example.com/assets/images/logo.png?v=2023",
            "https://search.example.com/results?q=test+query&filter=all",
        ];

        urls.iter()
            .for_each(|url| assert!(URL_STANDARD_REGEX.is_match(url), "Should match: {url}"));
    }

    #[test]
    fn test_url_pattern_strings_are_valid() {
        // Touching each static forces its pattern to compile
        assert_all_match(&URL_STANDARD_REGEX, &["https://example.com"]);
        assert_all_match(&URL_WWW_REGEX, &["www.example.com"]);
        assert_all_match(&URL_IPV6_REGEX, &["https://[::1]/"]);
        assert_all_match(&URL_QUICK_CHECK_REGEX, &["https://example.com"]);
        assert_all_match(&URL_SIMPLE_REGEX, &["https://example.com"]);
    }

    // =========================================================================
    // Tests for is_blank_in_blockquote_context
    // This is a shared utility used by MD058, MD065, and other rules that need
    // to detect blank lines inside blockquotes (Issue #305)
    // =========================================================================

    #[test]
    fn test_is_blank_in_blockquote_context_regular_blanks() {
        // Ordinary blank lines
        for line in ["", "   ", "\t", "  \t  "] {
            assert!(is_blank_in_blockquote_context(line), "expected blank: {line:?}");
        }
    }

    #[test]
    fn test_is_blank_in_blockquote_context_blockquote_blanks() {
        // Blockquote continuation lines with no content count as blank
        for line in [">", "> ", ">  ", ">>", ">> ", ">>>", ">>> "] {
            assert!(is_blank_in_blockquote_context(line), "expected blank: {line:?}");
        }
    }

    #[test]
    fn test_is_blank_in_blockquote_context_spaced_nested() {
        // Spaced nested blockquotes ("> > " style)
        for line in ["> > ", "> > > ", ">  >  "] {
            assert!(is_blank_in_blockquote_context(line), "expected blank: {line:?}");
        }
    }

    #[test]
    fn test_is_blank_in_blockquote_context_with_leading_space() {
        // Blockquote markers preceded by whitespace
        for line in ["  >", "  > ", "  >>"] {
            assert!(is_blank_in_blockquote_context(line), "expected blank: {line:?}");
        }
    }

    #[test]
    fn test_is_blank_in_blockquote_context_not_blank() {
        // Lines with actual content must NOT count as blank
        let with_content = [
            "text",
            "> text",
            ">> text",
            "> | table |",
            "| table |",
            "> # Heading",
            ">text", // No space after > but has text
        ];
        for line in with_content {
            assert!(!is_blank_in_blockquote_context(line), "expected content: {line:?}");
        }
    }

    #[test]
    fn test_is_blank_in_blockquote_context_edge_cases() {
        // Content immediately after >
        assert!(!is_blank_in_blockquote_context(">a"));
        // Single-character content
        assert!(!is_blank_in_blockquote_context("> a"));
        // Multiple spaces after >
        assert!(is_blank_in_blockquote_context(">   "));
        // Multiple spaces before content
        assert!(!is_blank_in_blockquote_context(">  text"));
    }
}
rumdl_lib/utils/regex_cache.rs

rumdl_lib/utils/
regex_cache.rs