rumdl_lib/utils/
regex_cache.rs

1//!
2//! Cached Regex Patterns and Fast Content Checks for Markdown Linting
3//!
4//! This module provides a centralized collection of pre-compiled, cached regex patterns
5//! for all major Markdown constructs (headings, lists, code blocks, links, images, etc.).
6//! It also includes fast-path utility functions for quickly checking if content
7//! potentially contains certain Markdown elements, allowing rules to skip expensive
8//! processing when unnecessary.
9//!
10//! # Performance
11//!
//! All predefined regexes are `std::sync::LazyLock` statics: each one is compiled once,
//! on first use, and reused afterwards, avoiding repeated compilation across the linter.
//! Use these shared patterns in rules instead of compiling new regexes.
15//!
16//! # Usage
17//!
18//! - Use the provided statics for common Markdown patterns.
19//! - Use the `regex_lazy!` macro for ad-hoc regexes that are not predefined.
20//! - Use the utility functions for fast content checks before running regexes.
21
22use fancy_regex::Regex as FancyRegex;
23use regex::Regex;
24use std::collections::HashMap;
25use std::sync::LazyLock;
26use std::sync::{Arc, Mutex};
27
/// Global regex cache for dynamic patterns
///
/// Stores compiled `regex` and `fancy_regex` patterns keyed by their source
/// string, together with a per-pattern lookup counter.
#[derive(Debug)]
pub struct RegexCache {
    /// Compiled `regex::Regex` instances, keyed by pattern string.
    cache: HashMap<String, Arc<Regex>>,
    /// Compiled `fancy_regex::Regex` instances, keyed by pattern string.
    fancy_cache: HashMap<String, Arc<FancyRegex>>,
    /// Number of successful lookups per pattern string. Both regex flavours
    /// write to this one map, so a pattern requested through both shares a counter.
    usage_stats: HashMap<String, u64>,
}
35
36impl Default for RegexCache {
37    fn default() -> Self {
38        Self::new()
39    }
40}
41
42impl RegexCache {
43    pub fn new() -> Self {
44        Self {
45            cache: HashMap::new(),
46            fancy_cache: HashMap::new(),
47            usage_stats: HashMap::new(),
48        }
49    }
50
51    /// Get or compile a regex pattern
52    pub fn get_regex(&mut self, pattern: &str) -> Result<Arc<Regex>, regex::Error> {
53        if let Some(regex) = self.cache.get(pattern) {
54            *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
55            return Ok(regex.clone());
56        }
57
58        let regex = Arc::new(Regex::new(pattern)?);
59        self.cache.insert(pattern.to_string(), regex.clone());
60        *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
61        Ok(regex)
62    }
63
64    /// Get or compile a fancy regex pattern
65    pub fn get_fancy_regex(&mut self, pattern: &str) -> Result<Arc<FancyRegex>, Box<fancy_regex::Error>> {
66        if let Some(regex) = self.fancy_cache.get(pattern) {
67            *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
68            return Ok(regex.clone());
69        }
70
71        match FancyRegex::new(pattern) {
72            Ok(regex) => {
73                let arc_regex = Arc::new(regex);
74                self.fancy_cache.insert(pattern.to_string(), arc_regex.clone());
75                *self.usage_stats.entry(pattern.to_string()).or_insert(0) += 1;
76                Ok(arc_regex)
77            }
78            Err(e) => Err(Box::new(e)),
79        }
80    }
81
82    /// Get cache statistics
83    pub fn get_stats(&self) -> HashMap<String, u64> {
84        self.usage_stats.clone()
85    }
86
87    /// Clear cache (useful for testing)
88    pub fn clear(&mut self) {
89        self.cache.clear();
90        self.fancy_cache.clear();
91        self.usage_stats.clear();
92    }
93}
94
95/// Global regex cache instance
96static GLOBAL_REGEX_CACHE: LazyLock<Arc<Mutex<RegexCache>>> = LazyLock::new(|| Arc::new(Mutex::new(RegexCache::new())));
97
98/// Get a regex from the global cache
99///
100/// If the mutex is poisoned (another thread panicked while holding the lock),
101/// this function recovers by clearing the cache and continuing. This ensures
102/// the library never panics due to mutex poisoning.
103pub fn get_cached_regex(pattern: &str) -> Result<Arc<Regex>, regex::Error> {
104    let mut cache = GLOBAL_REGEX_CACHE.lock().unwrap_or_else(|poisoned| {
105        // Recover from poisoned mutex by clearing the cache
106        let mut guard = poisoned.into_inner();
107        guard.clear();
108        guard
109    });
110    cache.get_regex(pattern)
111}
112
113/// Get a fancy regex from the global cache
114///
115/// If the mutex is poisoned (another thread panicked while holding the lock),
116/// this function recovers by clearing the cache and continuing. This ensures
117/// the library never panics due to mutex poisoning.
118pub fn get_cached_fancy_regex(pattern: &str) -> Result<Arc<FancyRegex>, Box<fancy_regex::Error>> {
119    let mut cache = GLOBAL_REGEX_CACHE.lock().unwrap_or_else(|poisoned| {
120        // Recover from poisoned mutex by clearing the cache
121        let mut guard = poisoned.into_inner();
122        guard.clear();
123        guard
124    });
125    cache.get_fancy_regex(pattern)
126}
127
128/// Get cache usage statistics
129///
130/// If the mutex is poisoned, returns an empty HashMap rather than panicking.
131pub fn get_cache_stats() -> HashMap<String, u64> {
132    match GLOBAL_REGEX_CACHE.lock() {
133        Ok(cache) => cache.get_stats(),
134        Err(_) => HashMap::new(),
135    }
136}
137
/// Macro for defining a lazily-initialized, cached regex pattern.
///
/// Use this for ad-hoc regexes that are not already defined in this module.
/// Each invocation site gets its own hidden `static`, compiled the first time
/// that site is evaluated, and the macro evaluates to a `&'static regex::Regex`.
///
/// # Panics
///
/// This macro will panic on first use if the regex pattern is invalid.
/// This is intentional for compile-time constant patterns - we want to catch
/// invalid patterns during development, not ship them silently.
///
/// # Example
///
/// ```
/// use rumdl_lib::regex_lazy;
/// let my_re = regex_lazy!(r"^foo.*bar$");
/// assert!(my_re.is_match("foobar"));
/// ```
#[macro_export]
macro_rules! regex_lazy {
    ($pattern:expr) => {{
        // Paths in a `macro_rules!` body resolve at the call site, so spell
        // `LazyLock` out in full: callers need not import it themselves.
        static REGEX: ::std::sync::LazyLock<regex::Regex> =
            ::std::sync::LazyLock::new(|| regex::Regex::new($pattern).unwrap());
        &*REGEX
    }};
}
162
/// Fetch a compiled regex for the given pattern from the global cache.
///
/// Expands to a call to `get_cached_regex` and unwraps the result.
///
/// # Panics
///
/// Panics with "Failed to compile regex" when the pattern is invalid. Intended
/// for static patterns, where failing fast during development is desirable.
#[macro_export]
macro_rules! regex_cached {
    ($pat:expr) => {{
        $crate::utils::regex_cache::get_cached_regex($pat).expect("Failed to compile regex")
    }};
}
173
/// Fetch a compiled fancy regex for the given pattern from the global cache.
///
/// Expands to a call to `get_cached_fancy_regex` and unwraps the result.
///
/// # Panics
///
/// Panics with "Failed to compile fancy regex" when the pattern is invalid.
/// Intended for static patterns, where failing fast during development is
/// desirable.
#[macro_export]
macro_rules! fancy_regex_cached {
    ($pat:expr) => {{
        $crate::utils::regex_cache::get_cached_fancy_regex($pat).expect("Failed to compile fancy regex")
    }};
}
184
185// Also make the macro available directly from this module
186pub use crate::regex_lazy;
187
// URL patterns
// `http`, `https` or `ftp` URL. The body stops at whitespace, `<>`, `[]`, `()` and
// quotes; the final character additionally may not be `.` or `,`.
pub static URL_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"(?:https?|ftp)://[^\s<>\[\]()'"]+[^\s<>\[\]()"'.,]"#).unwrap());
// Looser variant: the body stops only at whitespace and `<>`; the final character
// may not be `.`.
pub static BARE_URL_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?:https?|ftp)://[^\s<>]+[^\s<>.]").unwrap());
// Captures the whole URL as group 1. The body stops at whitespace, `)` and `<>`;
// the final character may not be `.` or `,`.
pub static URL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"((?:https?|ftp)://[^\s\)<>]+[^\s\)<>.,])").unwrap());
195
// Heading patterns
// ATX heading opener. Groups: 1 = leading whitespace, 2 = one to six `#`,
// 3 = whitespace after the hashes (or empty at end of text).
pub static ATX_HEADING_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})(\s+|$)").unwrap());
// Closed ATX heading (`## text ##`). Groups: 1 = indent, 2 = opening hashes,
// 3 = spacing, 4 = heading text, 5 = spacing, 6 = closing hashes, 7 = trailing whitespace.
pub static CLOSED_ATX_HEADING_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})(\s+)(.*)(\s+)(#+)(\s*)$").unwrap());
// Setext heading: a text line followed by a line of `=` or `-`.
// Groups: 1 = indent of the text line, 2 = indent of the underline, 3 = the underline run.
pub static SETEXT_HEADING_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)[^\s]+.*\n(\s*)(=+|-+)\s*$").unwrap());
// A single `.`, `,`, `:`, `)`, `;`, `!` or `?` at the end of the text.
pub static TRAILING_PUNCTUATION_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[.,:);!?]$").unwrap());

// ATX heading patterns for MD051 and other rules
// Unindented ATX heading. Groups: 1 = hashes, 2 = heading text; an optional
// run of closing hashes is matched but left out of group 2.
pub static ATX_HEADING_WITH_CAPTURE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(#{1,6})\s+(.+?)(?:\s+#*\s*)?$").unwrap());
// Setext heading. Groups: 1 = text line, 2 = underline character. The backreference
// makes the underline repeat that same character, so at least two are required.
pub static SETEXT_HEADING_WITH_CAPTURE: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"^([^\n]+)\n([=\-])\2+\s*$").unwrap());
209
// List patterns
// Bullet item start. Groups: 1 = indent, 2 = marker (`*`, `+` or `-`), 3 = spacing after it.
pub static UNORDERED_LIST_MARKER_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])(\s+)").unwrap());
// Numbered item start. Groups: 1 = indent, 2 = digits, 3 = delimiter (`.` or `)`), 4 = spacing.
pub static ORDERED_LIST_MARKER_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)(\d+)([.)])(\s+)").unwrap());
// Either kind of item start. Groups: 1 = indent, 2 = bullet marker (if bulleted),
// 3 = digits (if numbered), 4 = spacing.
pub static LIST_MARKER_ANY_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)(?:([*+-])|(\d+)[.)])(\s+)").unwrap());
216
// Code block patterns
// Fence opener line. Groups: 1 = indent, 2 = the fence (three backticks or three
// tildes), 3 = everything after the fence.
pub static FENCED_CODE_BLOCK_START_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap());
// Fence closer line: same fence, followed only by whitespace (group 3).
pub static FENCED_CODE_BLOCK_END_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(\s*)$").unwrap());
// Line indented by at least four whitespace characters. Groups: 1 = indent, 2 = rest of line.
pub static INDENTED_CODE_BLOCK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})(.*)$").unwrap());
// Unindented run of three or more backticks or three or more tildes (group 1).
pub static CODE_FENCE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(`{3,}|~{3,})").unwrap());
224
// Emphasis patterns
// Emphasis span delimited by whitespace or text boundaries. Groups: 2 = opening
// delimiter (one or two `*` or `_`), 3 = content, 4 = closing delimiter, which must
// equal group 2. Lookarounds require the content to start and end with non-whitespace.
pub static EMPHASIS_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(\s|^)(\*{1,2}|_{1,2})(?=\S)(.+?)(?<=\S)(\2)(\s|$)").unwrap());
// Single-delimiter emphasis with whitespace on both inner sides.
// Groups: 1 = delimiter, 2 = leading spaces, 3 = content, 4 = trailing spaces, 5 = same delimiter.
pub static SPACE_IN_EMPHASIS_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(\*|_)(\s+)(.+?)(\s+)(\1)").unwrap());

// MD037 specific emphasis patterns - improved to avoid false positives
// Only match emphasis with spaces that are actually complete emphasis blocks.
// The first three patterns guard each side with "start/end of text or a character
// that is not the delimiter" (no lookaround is used), so the guard character is
// part of the match. Group 1 is the content, with whitespace on at least one side.
pub static ASTERISK_EMPHASIS: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?:^|[^*])\*(\s+[^*]+\s*|\s*[^*]+\s+)\*(?:[^*]|$)").unwrap());
pub static UNDERSCORE_EMPHASIS: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?:^|[^_])_(\s+[^_]+\s*|\s*[^_]+\s+)_(?:[^_]|$)").unwrap());
pub static DOUBLE_UNDERSCORE_EMPHASIS: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?:^|[^_])__(\s+[^_]+\s*|\s*[^_]+\s+)__(?:[^_]|$)").unwrap());
// `**` emphasis with whitespace inside both delimiters; group 1 = content.
pub static DOUBLE_ASTERISK_EMPHASIS: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\*\*\s+([^*]+?)\s+\*\*").unwrap());
// `**` emphasis with whitespace after the opening delimiter; group 1 = content.
pub static DOUBLE_ASTERISK_SPACE_START: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\*\*\s+([^*]+?)\*\*").unwrap());
// `**` emphasis with whitespace before the closing delimiter; group 1 = content.
pub static DOUBLE_ASTERISK_SPACE_END: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\*\*([^*]+?)\s+\*\*").unwrap());
246
// Code block patterns (per-fence-style variants)
// Backtick fence opener: indent (group 1), exactly three backticks, then an info
// string containing no backtick or line break.
pub static FENCED_CODE_BLOCK_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)```(?:[^`\r\n]*)$").unwrap());
// Backtick fence closer: indent (group 1), three backticks, optional trailing whitespace.
pub static FENCED_CODE_BLOCK_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)```\s*$").unwrap());
// Tilde fence opener: indent (group 1), three tildes, then an info string
// containing no tilde or line break.
pub static ALTERNATE_FENCED_CODE_BLOCK_START: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)~~~(?:[^~\r\n]*)$").unwrap());
// Tilde fence closer: indent (group 1), three tildes, optional trailing whitespace.
pub static ALTERNATE_FENCED_CODE_BLOCK_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)~~~\s*$").unwrap());
// Leading indent of at least four whitespace characters (group 1).
pub static INDENTED_CODE_BLOCK_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s{4,})").unwrap());
254
// HTML patterns
// Opening-style tag; group 1 = everything between `<` and `>` (must start with a letter).
pub static HTML_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<([a-zA-Z][^>]*)>").unwrap());
// Self-closing tag; group 1 = tag body including the trailing `/`.
pub static HTML_SELF_CLOSING_TAG_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<([a-zA-Z][^>]*/)>").unwrap());
// Any opening or closing tag, case-insensitive, no capture groups.
pub static HTML_TAG_FINDER: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?i)</?[a-zA-Z][^>]*>").unwrap());
// Opening tags only (no leading `/`), case-insensitive.
pub static HTML_OPENING_TAG_FINDER: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?i)<[a-zA-Z][^>]*>").unwrap());
// Cheap probe: `<` or `</` directly followed by a letter; does not require a closing `>`.
pub static HTML_TAG_QUICK_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new("(?i)</?[a-zA-Z]").unwrap());
261
// Link patterns for MD051 and other rules
// Reference definition `[label]: destination`. Groups: 1 = label, 2 = rest of the line.
pub static LINK_REFERENCE_DEFINITION_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*\[([^\]]+)\]:\s+(.+)$").unwrap());
// Inline link `[text](target)` with non-empty text (group 1) and target (group 2).
pub static INLINE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap());
// Any bracketed text; group 1 = content between `[` and `]` (may be empty).
pub static LINK_TEXT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]*)\]").unwrap());
// Inline link whose target contains a `#` fragment, not preceded by a backslash.
// Groups: 1 = text, 2 = target before `#`, 3 = fragment after `#`.
pub static LINK_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(?<!\\)\[([^\]]*)\]\(([^)#]*)#([^)]+)\)").unwrap());
// Text that starts with `http://`, `https://`, `ftp://`, `www.`, or a slash-free
// prefix ending in a dot plus two or more lowercase letters.
pub static EXTERNAL_URL_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"^(https?://|ftp://|www\.|[^/]+\.[a-z]{2,})").unwrap());

// Image patterns
// Inline image `![alt](target)`. Groups: 1 = alt text (may be empty), 2 = target.
pub static IMAGE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap());
274
// Whitespace patterns
// One or more whitespace characters at the end of the text.
pub static TRAILING_WHITESPACE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\s+$").unwrap());
// Three or more consecutive newline characters.
pub static MULTIPLE_BLANK_LINES_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
278
279// Front matter patterns
280pub static FRONT_MATTER_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^---\n.*?\n---\n").unwrap());
281
// MD051 specific patterns
// Inline code span: a backtick, one or more non-backtick characters, a backtick.
pub static INLINE_CODE_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"`[^`]+`").unwrap());
// Bold with `**`; group 1 = content (lazy).
pub static BOLD_ASTERISK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*\*(.+?)\*\*").unwrap());
// Bold with `__`; group 1 = content (lazy).
pub static BOLD_UNDERSCORE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"__(.+?)__").unwrap());
// Italic with `*`; group 1 = content without asterisks.
pub static ITALIC_ASTERISK_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\*([^*]+?)\*").unwrap());
// Italic with `_`; group 1 = content without underscores.
pub static ITALIC_UNDERSCORE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"_([^_]+?)_").unwrap());
// Whole inline link `[text](target)`; only the text is captured (group 1), and
// both text and target may be empty.
pub static LINK_TEXT_FULL_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\[([^\]]*)\]\([^)]*\)").unwrap());
// Strikethrough `~~text~~`; group 1 = content (lazy).
pub static STRIKETHROUGH_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"~~(.+?)~~").unwrap());
// Run of two or more hyphens.
pub static MULTIPLE_HYPHENS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"-{2,}").unwrap());
// Heading whose entire text is "Table of Contents", "Contents" or "TOC" (case-sensitive).
pub static TOC_SECTION_START: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^#+\s*(?:Table of Contents|Contents|TOC)\s*$").unwrap());
294
// Blockquote patterns
// Blockquote prefix, captured as group 1: optional indent, one or more `>`,
// then any whitespace that follows.
pub static BLOCKQUOTE_PREFIX_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*>+\s*)").unwrap());
297
// MD013 specific patterns
// Text consisting solely of a reference-style image `![alt][ref]`.
pub static IMAGE_REF_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^!\[.*?\]\[.*?\]$").unwrap());
// Text consisting solely of a link reference definition with an http(s) URL.
pub static LINK_REF_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\[.*?\]:\s*https?://\S+$").unwrap());
// An http(s) URL running up to the next whitespace.
pub static URL_IN_TEXT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
// Sentence boundary: `.`, `!` or `?`, whitespace, then an ASCII uppercase letter.
pub static SENTENCE_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[.!?]\s+[A-Z]").unwrap());
// A listed abbreviation followed by `.`, whitespace and an ASCII uppercase letter,
// i.e. a spot that looks like a sentence end to SENTENCE_END but follows an abbreviation.
// NOTE(review): `Ph\.D` appears twice in the alternation; the duplicate is harmless.
pub static ABBREVIATION: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"\b(?:Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|i\.e|e\.g|Inc|Corp|Ltd|Co|St|Ave|Blvd|Rd|Ph\.D|M\.D|B\.A|M\.A|Ph\.D|U\.S|U\.K|U\.N|N\.Y|L\.A|D\.C)\.\s+[A-Z]").unwrap()
});
// Digits, a dot, optional whitespace, digits.
pub static DECIMAL_NUMBER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\d+\.\s*\d+").unwrap());
// Start of a `1.`-style ordered list item (optional indent, digits, dot, whitespace).
pub static LIST_ITEM: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\s*\d+\.\s+").unwrap());
// Reference link `[text][ref]`. Groups: 1 = text, 2 = reference (either may be empty).
pub static REFERENCE_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]*)\]\[([^\]]*)\]").unwrap());
309
// Email pattern
// Simple address shape: local part, `@`, domain, a dot, and a top-level label of
// two or more ASCII letters.
pub static EMAIL_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}").unwrap());
313
// Link and image patterns used by MD052 and text_reflow
// Reference link patterns (shared by MD052 and text_reflow)
// Pattern to match reference links: [text][reference] or [text][]
// Not preceded by a backslash. Group 1 = text (may contain escaped characters and
// one level of nested brackets), group 2 = reference (may be empty).
pub static REF_LINK_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(?<!\\)\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]\[([^\]]*)\]").unwrap());

// Pattern for shortcut reference links: [reference]
// Must not be preceded by a backslash, ] or ) (to avoid matching second part of [text][ref])
// Must not be followed by [ or ( (to avoid matching first part of [text][ref] or [text](url))
// The capturing group handles nested brackets to support cases like [`Union[T, None]`]
pub static SHORTCUT_REF_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(?<![\\)\]])\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\](?!\s*[\[\(])").unwrap());

// Inline link with fancy regex for better escaping handling (used by text_reflow)
// Same shape as INLINE_LINK_REGEX, but skips a `[` preceded by a backslash.
pub static INLINE_LINK_FANCY_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(?<!\\)\[([^\]]+)\]\(([^)]+)\)").unwrap());

// Inline image with fancy regex (used by MD052 and text_reflow)
// Groups: 1 = alt text (may be empty), 2 = target.
pub static INLINE_IMAGE_FANCY_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"!\[([^\]]*)\]\(([^)]+)\)").unwrap());
334
// Linked images (clickable badges) - all 4 variants
// Must be detected before inline_image and inline_link to treat as atomic units
//
// Limitation: Alt text containing brackets like [![[v1.0]](img)](link) is not supported.
// The [^\]]* pattern cannot match nested brackets. This is rare in practice.
//
// In all four patterns: group 1 = alt text, group 2 = image URL or image reference,
// group 3 = link URL or link reference.
//
// Pattern 1: Inline image in inline link - [![alt](img-url)](link-url)
pub static LINKED_IMAGE_INLINE_INLINE: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\(([^)]+)\)\]\(([^)]+)\)").unwrap());

// Pattern 2: Reference image in inline link - [![alt][img-ref]](link-url)
pub static LINKED_IMAGE_REF_INLINE: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\[([^\]]*)\]\]\(([^)]+)\)").unwrap());

// Pattern 3: Inline image in reference link - [![alt](img-url)][link-ref]
pub static LINKED_IMAGE_INLINE_REF: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\(([^)]+)\)\]\[([^\]]*)\]").unwrap());

// Pattern 4: Reference image in reference link - [![alt][img-ref]][link-ref]
pub static LINKED_IMAGE_REF_REF: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"\[!\[([^\]]*)\]\[([^\]]*)\]\]\[([^\]]*)\]").unwrap());
356
// Reference image: ![alt][ref] or ![alt][]
// Group 1 = alt text (may contain escaped characters and one level of nested
// brackets), group 2 = reference (may be empty).
pub static REF_IMAGE_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"!\[((?:[^\[\]\\]|\\.|\[[^\]]*\])*)\]\[([^\]]*)\]").unwrap());

// Footnote reference: [^note]
// Group 1 = the note label after `^`.
pub static FOOTNOTE_REF_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"\[\^([^\]]+)\]").unwrap());

// Strikethrough with fancy regex: ~~text~~
// Group 1 = content, which may not contain `~`.
pub static STRIKETHROUGH_FANCY_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"~~([^~]+)~~").unwrap());

// Wiki-style links: [[wiki]] or [[wiki|display text]]
// Group 1 = everything between the double brackets, including any `|display text` part.
pub static WIKI_LINK_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"\[\[([^\]]+)\]\]").unwrap());
369
// Math formulas: $inline$ or $$display$$
// Inline math: single `$` delimiters that are not part of a `$$`; group 1 = formula.
pub static INLINE_MATH_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"(?<!\$)\$(?!\$)([^\$]+)\$(?!\$)").unwrap());
// Display math: `$$` delimiters; group 1 = formula, which may not contain `$`.
pub static DISPLAY_MATH_REGEX: LazyLock<FancyRegex> = LazyLock::new(|| FancyRegex::new(r"\$\$([^\$]+)\$\$").unwrap());

// Emoji shortcodes: :emoji:
// Group 1 = the name (ASCII letters, digits, `_`, `+`, `-`).
pub static EMOJI_SHORTCODE_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r":([a-zA-Z0-9_+-]+):").unwrap());

// HTML tags (opening, closing, self-closing)
// NOTE(review): the first alternative already accepts anything the second one does.
pub static HTML_TAG_PATTERN: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"</?[a-zA-Z][^>]*>|<[a-zA-Z][^>]*/\s*>").unwrap());

// HTML entities: &nbsp; &mdash; etc
// Named (`&name;`), decimal (`&#123;`) and hexadecimal (`&#x1F;`) forms.
pub static HTML_ENTITY_REGEX: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"&[a-zA-Z][a-zA-Z0-9]*;|&#\d+;|&#x[0-9a-fA-F]+;").unwrap());
386
// Additional patterns
// HTML comment patterns
// Literal comment opener `<!--`.
pub static HTML_COMMENT_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--").unwrap());
// Literal comment closer `-->`.
pub static HTML_COMMENT_END: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"-->").unwrap());
// A whole comment, possibly spanning lines (`[\s\S]` matches newlines too),
// ending at the first `-->`.
pub static HTML_COMMENT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<!--[\s\S]*?-->").unwrap());

// HTML heading pattern (matches <h1> through <h6> tags)
// Group 1 = heading level digit; the backreference requires the closing tag to
// use the same level. Opening and closing tag must be on the same line.
pub static HTML_HEADING_PATTERN: LazyLock<FancyRegex> =
    LazyLock::new(|| FancyRegex::new(r"^\s*<h([1-6])(?:\s[^>]*)?>.*</h\1>\s*$").unwrap());

// Heading quick check pattern
// Multi-line mode: any line whose first non-whitespace character is `#`.
pub static HEADING_CHECK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^(?:\s*)#").unwrap());
399
// Horizontal rule patterns
// Compact forms: three or more of the same character, optional trailing whitespace.
// None of these patterns allow leading indentation.
pub static HR_DASH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\-{3,}\s*$").unwrap());
pub static HR_ASTERISK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\*{3,}\s*$").unwrap());
pub static HR_UNDERSCORE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^_{3,}\s*$").unwrap());
// Spaced forms (e.g. `- - -`): at least three characters, each separated by whitespace.
pub static HR_SPACED_DASH: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\-\s+){2,}\-\s*$").unwrap());
pub static HR_SPACED_ASTERISK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\*\s+){2,}\*\s*$").unwrap());
pub static HR_SPACED_UNDERSCORE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(_\s+){2,}_\s*$").unwrap());
407
// Utility functions for quick content checks

/// Cheap pre-filter for heading rules: reports whether a `#` occurs anywhere
/// in `content`.
///
/// A `true` result only means a heading is possible; `false` means no ATX
/// heading can be present.
pub fn has_heading_markers(content: &str) -> bool {
    // `#` is ASCII, so a byte-level search is exact for UTF-8 text.
    content.as_bytes().contains(&b'#')
}
413
/// Cheap pre-filter for list rules.
///
/// Returns `true` when `content` contains a bullet character (`*`, `-`, `+`),
/// or contains both a `.` and an ASCII digit somewhere (not necessarily
/// adjacent), which is the minimum an ordered item needs.
pub fn has_list_markers(content: &str) -> bool {
    let has_bullet_char = content.contains(|ch: char| matches!(ch, '*' | '-' | '+'));
    if has_bullet_char {
        return true;
    }

    let has_dot = content.contains('.');
    has_dot && content.bytes().any(|byte| byte.is_ascii_digit())
}
421
/// Cheap pre-filter for code block rules.
///
/// Returns `true` when `content` contains a triple-backtick fence, a
/// triple-tilde fence, or a newline followed by four spaces (a potential
/// indented code block).
pub fn has_code_block_markers(content: &str) -> bool {
    // Last entry: line break plus four spaces, i.e. a possibly indented block.
    const MARKERS: [&str; 3] = ["```", "~~~", "\n    "];
    MARKERS.iter().any(|marker| content.contains(*marker))
}
427
/// Cheap pre-filter for emphasis rules: reports whether `content` contains an
/// asterisk or an underscore anywhere.
pub fn has_emphasis_markers(content: &str) -> bool {
    content.contains(|ch: char| matches!(ch, '*' | '_'))
}
432
/// Check if content contains any HTML tags (quick check before regex)
///
/// Returns `true` when `content` contains both a `<` and a `>` anywhere, in
/// any order. This is only a pre-filter: it can report `true` for text that
/// holds no real tag, but never `false` for text that does.
pub fn has_html_tags(content: &str) -> bool {
    // No separate test for "/>" is needed: any string containing "/>" also
    // contains '>', so that test could never change the result.
    content.contains('<') && content.contains('>')
}
437
/// Cheap pre-filter for link rules.
///
/// Returns `true` when `content` contains both `[` and `]` (in any order), or
/// one of the scheme prefixes `http://`, `https://`, `ftp://`.
pub fn has_link_markers(content: &str) -> bool {
    if content.contains('[') && content.contains(']') {
        return true;
    }

    const SCHEMES: [&str; 3] = ["http://", "https://", "ftp://"];
    SCHEMES.iter().any(|scheme| content.contains(*scheme))
}
445
/// Cheap pre-filter for image rules: reports whether the image opener `![`
/// occurs anywhere in `content`.
pub fn has_image_markers(content: &str) -> bool {
    const IMAGE_OPENER: &str = "![";
    content.contains(IMAGE_OPENER)
}
450
/// Fast heuristic check for whether `content` might contain a URL.
///
/// Much cheaper than a regex when there is no URL: without a `://` in the
/// text the answer is `false` after a single substring search, and no memory
/// is allocated in any case.
///
/// Returns `true` when some `://` is preceded, at most ten characters earlier,
/// by the start of `htt` or `ftp` (so `http://`, `https://` and `ftp://` all
/// qualify). This is a pre-filter, not a validator: it does not check that the
/// text between the prefix and `://` is a real scheme.
pub fn contains_url(content: &str) -> bool {
    // How far before "://" (in characters) a protocol prefix may start.
    const MAX_PREFIX_DISTANCE: usize = 10;

    content.match_indices("://").any(|(separator_at, _)| {
        // Walk backwards over the characters just before this separator and
        // see whether a protocol prefix starts at any of them.
        content[..separator_at]
            .char_indices()
            .rev()
            .take(MAX_PREFIX_DISTANCE)
            .any(|(start, _)| {
                let tail = &content[start..];
                tail.starts_with("htt") || tail.starts_with("ftp")
            })
    })
}
487
/// Returns a copy of `s` in which each of the characters
/// `. + * ? ^ $ ( ) [ ] { } | \` is prefixed with a backslash, so the result
/// can be embedded in a regex pattern as literal text.
///
/// All other characters are copied unchanged.
pub fn escape_regex(s: &str) -> String {
    // Worst case every character gains a backslash, hence twice the length.
    s.chars()
        .fold(String::with_capacity(s.len() * 2), |mut escaped, ch| {
            let is_special = matches!(
                ch,
                '.' | '+' | '*' | '?' | '^' | '$' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '\\'
            );
            if is_special {
                escaped.push('\\');
            }
            escaped.push(ch);
            escaped
        })
}
505
506#[cfg(test)]
507mod tests {
508    use super::*;
509
510    #[test]
511    fn test_regex_cache_new() {
512        let cache = RegexCache::new();
513        assert!(cache.cache.is_empty());
514        assert!(cache.fancy_cache.is_empty());
515        assert!(cache.usage_stats.is_empty());
516    }
517
518    #[test]
519    fn test_regex_cache_default() {
520        let cache = RegexCache::default();
521        assert!(cache.cache.is_empty());
522        assert!(cache.fancy_cache.is_empty());
523        assert!(cache.usage_stats.is_empty());
524    }
525
526    #[test]
527    fn test_get_regex_compilation() {
528        let mut cache = RegexCache::new();
529
530        // First call compiles and caches
531        let regex1 = cache.get_regex(r"\d+").unwrap();
532        assert_eq!(cache.cache.len(), 1);
533        assert_eq!(cache.usage_stats.get(r"\d+"), Some(&1));
534
535        // Second call returns cached version
536        let regex2 = cache.get_regex(r"\d+").unwrap();
537        assert_eq!(cache.cache.len(), 1);
538        assert_eq!(cache.usage_stats.get(r"\d+"), Some(&2));
539
540        // Both should be the same Arc
541        assert!(Arc::ptr_eq(&regex1, &regex2));
542    }
543
544    #[test]
545    fn test_get_regex_invalid_pattern() {
546        let mut cache = RegexCache::new();
547        let result = cache.get_regex(r"[unterminated");
548        assert!(result.is_err());
549        assert!(cache.cache.is_empty());
550    }
551
552    #[test]
553    fn test_get_fancy_regex_compilation() {
554        let mut cache = RegexCache::new();
555
556        // First call compiles and caches
557        let regex1 = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();
558        assert_eq!(cache.fancy_cache.len(), 1);
559        assert_eq!(cache.usage_stats.get(r"(?<=foo)bar"), Some(&1));
560
561        // Second call returns cached version
562        let regex2 = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();
563        assert_eq!(cache.fancy_cache.len(), 1);
564        assert_eq!(cache.usage_stats.get(r"(?<=foo)bar"), Some(&2));
565
566        // Both should be the same Arc
567        assert!(Arc::ptr_eq(&regex1, &regex2));
568    }
569
570    #[test]
571    fn test_get_fancy_regex_invalid_pattern() {
572        let mut cache = RegexCache::new();
573        let result = cache.get_fancy_regex(r"(?<=invalid");
574        assert!(result.is_err());
575        assert!(cache.fancy_cache.is_empty());
576    }
577
578    #[test]
579    fn test_get_stats() {
580        let mut cache = RegexCache::new();
581
582        // Use some patterns
583        let _ = cache.get_regex(r"\d+").unwrap();
584        let _ = cache.get_regex(r"\d+").unwrap();
585        let _ = cache.get_regex(r"\w+").unwrap();
586        let _ = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();
587
588        let stats = cache.get_stats();
589        assert_eq!(stats.get(r"\d+"), Some(&2));
590        assert_eq!(stats.get(r"\w+"), Some(&1));
591        assert_eq!(stats.get(r"(?<=foo)bar"), Some(&1));
592    }
593
594    #[test]
595    fn test_clear_cache() {
596        let mut cache = RegexCache::new();
597
598        // Add some patterns
599        let _ = cache.get_regex(r"\d+").unwrap();
600        let _ = cache.get_fancy_regex(r"(?<=foo)bar").unwrap();
601
602        assert!(!cache.cache.is_empty());
603        assert!(!cache.fancy_cache.is_empty());
604        assert!(!cache.usage_stats.is_empty());
605
606        // Clear cache
607        cache.clear();
608
609        assert!(cache.cache.is_empty());
610        assert!(cache.fancy_cache.is_empty());
611        assert!(cache.usage_stats.is_empty());
612    }
613
614    #[test]
615    fn test_global_cache_functions() {
616        // Test get_cached_regex
617        let regex1 = get_cached_regex(r"\d{3}").unwrap();
618        let regex2 = get_cached_regex(r"\d{3}").unwrap();
619        assert!(Arc::ptr_eq(&regex1, &regex2));
620
621        // Test get_cached_fancy_regex
622        let fancy1 = get_cached_fancy_regex(r"(?<=test)ing").unwrap();
623        let fancy2 = get_cached_fancy_regex(r"(?<=test)ing").unwrap();
624        assert!(Arc::ptr_eq(&fancy1, &fancy2));
625
626        // Test stats
627        let stats = get_cache_stats();
628        assert!(stats.contains_key(r"\d{3}"));
629        assert!(stats.contains_key(r"(?<=test)ing"));
630    }
631
632    #[test]
633    fn test_regex_lazy_macro() {
634        let re = regex_lazy!(r"^test.*end$");
635        assert!(re.is_match("test something end"));
636        assert!(!re.is_match("test something"));
637
638        // The macro creates a new static for each invocation location,
639        // so we can't test pointer equality across different invocations
640        // But we can test that the regex works correctly
641        let re2 = regex_lazy!(r"^start.*finish$");
642        assert!(re2.is_match("start and finish"));
643        assert!(!re2.is_match("start without end"));
644    }
645
646    #[test]
647    fn test_has_heading_markers() {
648        assert!(has_heading_markers("# Heading"));
649        assert!(has_heading_markers("Text with # symbol"));
650        assert!(!has_heading_markers("Text without heading marker"));
651    }
652
653    #[test]
654    fn test_has_list_markers() {
655        assert!(has_list_markers("* Item"));
656        assert!(has_list_markers("- Item"));
657        assert!(has_list_markers("+ Item"));
658        assert!(has_list_markers("1. Item"));
659        assert!(!has_list_markers("Text without list markers"));
660    }
661
662    #[test]
663    fn test_has_code_block_markers() {
664        assert!(has_code_block_markers("```code```"));
665        assert!(has_code_block_markers("~~~code~~~"));
666        assert!(has_code_block_markers("Text\n    indented code"));
667        assert!(!has_code_block_markers("Text without code blocks"));
668    }
669
670    #[test]
671    fn test_has_emphasis_markers() {
672        assert!(has_emphasis_markers("*emphasis*"));
673        assert!(has_emphasis_markers("_emphasis_"));
674        assert!(has_emphasis_markers("**bold**"));
675        assert!(has_emphasis_markers("__bold__"));
676        assert!(!has_emphasis_markers("no emphasis"));
677    }
678
679    #[test]
680    fn test_has_html_tags() {
681        assert!(has_html_tags("<div>content</div>"));
682        assert!(has_html_tags("<br/>"));
683        assert!(has_html_tags("<img src='test.jpg'>"));
684        assert!(!has_html_tags("no html tags"));
685        assert!(!has_html_tags("less than < but no tag"));
686    }
687
688    #[test]
689    fn test_has_link_markers() {
690        assert!(has_link_markers("[text](url)"));
691        assert!(has_link_markers("[reference][1]"));
692        assert!(has_link_markers("http://example.com"));
693        assert!(has_link_markers("https://example.com"));
694        assert!(has_link_markers("ftp://example.com"));
695        assert!(!has_link_markers("no links here"));
696    }
697
698    #[test]
699    fn test_has_image_markers() {
700        assert!(has_image_markers("![alt text](image.png)"));
701        assert!(has_image_markers("![](image.png)"));
702        assert!(!has_image_markers("[link](url)"));
703        assert!(!has_image_markers("no images"));
704    }
705
706    #[test]
707    fn test_contains_url() {
708        assert!(contains_url("http://example.com"));
709        assert!(contains_url("Text with https://example.com link"));
710        assert!(contains_url("ftp://example.com"));
711        assert!(!contains_url("Text without URL"));
712        assert!(!contains_url("http not followed by ://"));
713
714        // Edge cases
715        assert!(!contains_url("http"));
716        assert!(!contains_url("https"));
717        assert!(!contains_url("://"));
718        assert!(contains_url("Visit http://site.com now"));
719        assert!(contains_url("See https://secure.site.com/path"));
720    }
721
722    #[test]
723    fn test_contains_url_performance() {
724        // Test early exit for strings without "://"
725        let long_text = "a".repeat(10000);
726        assert!(!contains_url(&long_text));
727
728        // Test with URL at the end
729        let text_with_url = format!("{long_text}https://example.com");
730        assert!(contains_url(&text_with_url));
731    }
732
733    #[test]
734    fn test_escape_regex() {
735        assert_eq!(escape_regex("a.b"), "a\\.b");
736        assert_eq!(escape_regex("a+b*c"), "a\\+b\\*c");
737        assert_eq!(escape_regex("(test)"), "\\(test\\)");
738        assert_eq!(escape_regex("[a-z]"), "\\[a-z\\]");
739        assert_eq!(escape_regex("normal text"), "normal text");
740
741        // Test all special characters
742        assert_eq!(escape_regex(".$^{[(|)*+?\\"), "\\.\\$\\^\\{\\[\\(\\|\\)\\*\\+\\?\\\\");
743
744        // Test empty string
745        assert_eq!(escape_regex(""), "");
746
747        // Test mixed content
748        assert_eq!(escape_regex("test.com/path?query=1"), "test\\.com/path\\?query=1");
749    }
750
751    #[test]
752    fn test_static_regex_patterns() {
753        // Test URL patterns
754        assert!(URL_REGEX.is_match("https://example.com"));
755        assert!(URL_REGEX.is_match("http://test.org/path"));
756        assert!(URL_REGEX.is_match("ftp://files.com"));
757        assert!(!URL_REGEX.is_match("not a url"));
758
759        // Test heading patterns
760        assert!(ATX_HEADING_REGEX.is_match("# Heading"));
761        assert!(ATX_HEADING_REGEX.is_match("  ## Indented"));
762        assert!(ATX_HEADING_REGEX.is_match("### "));
763        assert!(!ATX_HEADING_REGEX.is_match("Not a heading"));
764
765        // Test list patterns
766        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("* Item"));
767        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("- Item"));
768        assert!(UNORDERED_LIST_MARKER_REGEX.is_match("+ Item"));
769        assert!(ORDERED_LIST_MARKER_REGEX.is_match("1. Item"));
770        assert!(ORDERED_LIST_MARKER_REGEX.is_match("99. Item"));
771
772        // Test code block patterns
773        assert!(FENCED_CODE_BLOCK_START_REGEX.is_match("```"));
774        assert!(FENCED_CODE_BLOCK_START_REGEX.is_match("```rust"));
775        assert!(FENCED_CODE_BLOCK_START_REGEX.is_match("~~~"));
776        assert!(FENCED_CODE_BLOCK_END_REGEX.is_match("```"));
777        assert!(FENCED_CODE_BLOCK_END_REGEX.is_match("~~~"));
778
779        // Test emphasis patterns
780        assert!(BOLD_ASTERISK_REGEX.is_match("**bold**"));
781        assert!(BOLD_UNDERSCORE_REGEX.is_match("__bold__"));
782        assert!(ITALIC_ASTERISK_REGEX.is_match("*italic*"));
783        assert!(ITALIC_UNDERSCORE_REGEX.is_match("_italic_"));
784
785        // Test HTML patterns
786        assert!(HTML_TAG_REGEX.is_match("<div>"));
787        assert!(HTML_TAG_REGEX.is_match("<span class='test'>"));
788        assert!(HTML_SELF_CLOSING_TAG_REGEX.is_match("<br/>"));
789        assert!(HTML_SELF_CLOSING_TAG_REGEX.is_match("<img src='test'/>"));
790
791        // Test whitespace patterns
792        assert!(TRAILING_WHITESPACE_REGEX.is_match("line with spaces   "));
793        assert!(TRAILING_WHITESPACE_REGEX.is_match("tabs\t\t"));
794        assert!(MULTIPLE_BLANK_LINES_REGEX.is_match("\n\n\n"));
795        assert!(MULTIPLE_BLANK_LINES_REGEX.is_match("\n\n\n\n"));
796
797        // Test blockquote pattern
798        assert!(BLOCKQUOTE_PREFIX_RE.is_match("> Quote"));
799        assert!(BLOCKQUOTE_PREFIX_RE.is_match("  > Indented quote"));
800        assert!(BLOCKQUOTE_PREFIX_RE.is_match(">> Nested"));
801    }
802
803    #[test]
804    fn test_thread_safety() {
805        use std::thread;
806
807        let handles: Vec<_> = (0..10)
808            .map(|i| {
809                thread::spawn(move || {
810                    let pattern = format!(r"\d{{{i}}}");
811                    let regex = get_cached_regex(&pattern).unwrap();
812                    assert!(regex.is_match(&"1".repeat(i)));
813                })
814            })
815            .collect();
816
817        for handle in handles {
818            handle.join().unwrap();
819        }
820    }
821}