rumdl_lib/rules/
md034_no_bare_urls.rs

1/// Rule MD034: No unformatted URLs
2///
3/// See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
5use crate::utils::range_utils::calculate_url_range;
6use crate::utils::regex_cache::{EMAIL_PATTERN, get_cached_regex};
7
8use crate::lint_context::LintContext;
9
// URL detection patterns
//
// Each constant is a regex source string compiled on demand through `get_cached_regex`.

// Cheap pre-filter: a line is only examined further when it contains an
// http(s)/ftp(s) scheme separator or an `@`.
const URL_QUICK_CHECK_STR: &str = r#"(?:https?|ftps?)://|@"#;
// Schemes that are never reported: a found URL matching this pattern is skipped.
const CUSTOM_PROTOCOL_PATTERN_STR: &str = r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#;
// Inline link `[text](destination "optional title")`; the text may contain one
// level of nested brackets. Matches are used as exclusion ranges.
const MARKDOWN_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
// Angle-bracket autolink: `<http(s)/ftp(s) URL>` (including bracketed IPv6 hosts)
// or `<user@host.tld>`. Matches are used as exclusion ranges.
const ANGLE_LINK_PATTERN_STR: &str =
    r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#;
// A line consisting solely of a linked image, `[![alt](img)](target)`; such lines are skipped entirely.
const BADGE_LINK_LINE_STR: &str = r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#;
// Inline image `![alt](source "optional title")`. Matches are used as exclusion ranges.
const MARKDOWN_IMAGE_PATTERN_STR: &str = r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
// Candidate bare http(s)/ftp(s) URL with optional path, query and fragment parts.
const SIMPLE_URL_REGEX_STR: &str = r#"(https?|ftps?)://(?:\[[0-9a-fA-F:%.]+\](?::\d+)?|[^\s<>\[\]()\\'\"`\]]+)(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;
// Candidate bare URL whose host is a bracketed IPv6 literal, with optional port,
// path, query and fragment parts.
const IPV6_URL_REGEX_STR: &str = r#"(https?|ftps?)://\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;
// Whole-line link reference definition, `[label]: http(s)/ftp(s)://…`; such lines are skipped.
const REFERENCE_DEF_RE_STR: &str = r"^\s*\[[^\]]+\]:\s*(?:https?|ftps?)://\S+$";
// HTML comment, matched lazily and across line breaks.
const HTML_COMMENT_PATTERN_STR: &str = r#"<!--[\s\S]*?-->"#;
// Anything between `<` and the next `>` on a line.
const HTML_TAG_PATTERN_STR: &str = r#"<[^>]*>"#;
// Line that closes a link opened on an earlier line: `…](url)` with no `[` before the `]`.
const MULTILINE_LINK_CONTINUATION_STR: &str = r#"^[^\[]*\]\(.*\)"#;
24
/// Reusable buffers for check_line to reduce allocations
///
/// `check_line` clears each vector before filling it, so one instance can be shared
/// across every line of a document.
#[derive(Default)]
struct LineCheckBuffers {
    /// `(start, end)` offsets within the current line of inline Markdown links and
    /// angle-bracket links.
    markdown_link_ranges: Vec<(usize, usize)>,
    /// `(start, end)` offsets within the current line of inline Markdown images.
    image_ranges: Vec<(usize, usize)>,
    /// `(start, end, matched text)` for every URL candidate found in the current line.
    urls_found: Vec<(usize, usize, String)>,
}
32
/// Lint rule MD034: reports URLs and email addresses that appear without angle
/// brackets or Markdown link formatting.
///
/// The type is a stateless unit struct; the logic lives in its inherent methods and
/// in its `Rule` implementation.
#[derive(Default, Clone)]
pub struct MD034NoBareUrls;
35
36impl MD034NoBareUrls {
37    #[inline]
38    pub fn should_skip_content(&self, content: &str) -> bool {
39        // Skip if content has no URLs and no email addresses
40        // Fast byte scanning for common URL/email indicators
41        let bytes = content.as_bytes();
42        !bytes.contains(&b':') && !bytes.contains(&b'@')
43    }
44
45    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
46    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
47        let mut trimmed = url;
48
49        // Check for balanced parentheses - if we have unmatched closing parens, they're likely punctuation
50        let open_parens = url.chars().filter(|&c| c == '(').count();
51        let close_parens = url.chars().filter(|&c| c == ')').count();
52
53        if close_parens > open_parens {
54            // Find the last balanced closing paren position
55            let mut balance = 0;
56            let mut last_balanced_pos = url.len();
57
58            for (i, c) in url.chars().enumerate() {
59                if c == '(' {
60                    balance += 1;
61                } else if c == ')' {
62                    balance -= 1;
63                    if balance < 0 {
64                        // Found an unmatched closing paren
65                        last_balanced_pos = i;
66                        break;
67                    }
68                }
69            }
70
71            trimmed = &trimmed[..last_balanced_pos];
72        }
73
74        // Trim specific punctuation only if not followed by more URL-like chars
75        while let Some(last_char) = trimmed.chars().last() {
76            if matches!(last_char, '.' | ',' | ';' | ':' | '!' | '?') {
77                // Check if this looks like it could be part of the URL
78                // For ':' specifically, keep it if followed by digits (port number)
79                if last_char == ':' && trimmed.len() > 1 {
80                    // Don't trim
81                    break;
82                }
83                trimmed = &trimmed[..trimmed.len() - 1];
84            } else {
85                break;
86            }
87        }
88
89        trimmed
90    }
91
92    /// Check if line is inside a reference definition
93    fn is_reference_definition(&self, line: &str) -> bool {
94        get_cached_regex(REFERENCE_DEF_RE_STR)
95            .map(|re| re.is_match(line))
96            .unwrap_or(false)
97    }
98
99    /// Check if we're inside an HTML comment
100    fn is_in_html_comment(&self, content: &str, pos: usize) -> bool {
101        // Find all HTML comments in the content
102        if let Ok(re) = get_cached_regex(HTML_COMMENT_PATTERN_STR) {
103            for mat in re.find_iter(content) {
104                if pos >= mat.start() && pos < mat.end() {
105                    return true;
106                }
107            }
108        }
109        false
110    }
111
112    /// Check if a position in a line is inside an HTML tag
113    fn is_in_html_tag(&self, line: &str, pos: usize) -> bool {
114        // Find all HTML tags in the line
115        if let Ok(re) = get_cached_regex(HTML_TAG_PATTERN_STR) {
116            for mat in re.find_iter(line) {
117                if pos >= mat.start() && pos < mat.end() {
118                    return true;
119                }
120            }
121        }
122        false
123    }
124
125    fn check_line(
126        &self,
127        line: &str,
128        content: &str,
129        line_number: usize,
130        code_spans: &[crate::lint_context::CodeSpan],
131        buffers: &mut LineCheckBuffers,
132    ) -> Vec<LintWarning> {
133        let mut warnings = Vec::new();
134
135        // Skip reference definitions
136        if self.is_reference_definition(line) {
137            return warnings;
138        }
139
140        // Skip lines that are continuations of multiline markdown links
141        // Pattern: text](url) without a leading [
142        if let Ok(re) = get_cached_regex(MULTILINE_LINK_CONTINUATION_STR)
143            && re.is_match(line)
144        {
145            return warnings;
146        }
147
148        // Quick check - does this line potentially have a URL or email?
149        if let Ok(re) = get_cached_regex(URL_QUICK_CHECK_STR)
150            && !re.is_match(line)
151            && !line.contains('@')
152        {
153            return warnings;
154        }
155
156        // Clear and reuse buffers instead of allocating new ones
157        buffers.markdown_link_ranges.clear();
158        if let Ok(re) = get_cached_regex(MARKDOWN_LINK_PATTERN_STR) {
159            for cap in re.captures_iter(line) {
160                if let Some(mat) = cap.get(0) {
161                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
162                }
163            }
164        }
165
166        if let Ok(re) = get_cached_regex(ANGLE_LINK_PATTERN_STR) {
167            for cap in re.captures_iter(line) {
168                if let Some(mat) = cap.get(0) {
169                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
170                }
171            }
172        }
173
174        // Find all markdown images for exclusion
175        buffers.image_ranges.clear();
176        if let Ok(re) = get_cached_regex(MARKDOWN_IMAGE_PATTERN_STR) {
177            for cap in re.captures_iter(line) {
178                if let Some(mat) = cap.get(0) {
179                    buffers.image_ranges.push((mat.start(), mat.end()));
180                }
181            }
182        }
183
184        // Check if this line contains only a badge link (common pattern)
185        let is_badge_line = get_cached_regex(BADGE_LINK_LINE_STR)
186            .map(|re| re.is_match(line))
187            .unwrap_or(false);
188
189        if is_badge_line {
190            return warnings;
191        }
192
193        // Find bare URLs
194        buffers.urls_found.clear();
195
196        // First, find IPv6 URLs (they need special handling)
197        if let Ok(re) = get_cached_regex(IPV6_URL_REGEX_STR) {
198            for mat in re.find_iter(line) {
199                let url_str = mat.as_str();
200                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
201            }
202        }
203
204        // Then find regular URLs
205        if let Ok(re) = get_cached_regex(SIMPLE_URL_REGEX_STR) {
206            for mat in re.find_iter(line) {
207                let url_str = mat.as_str();
208
209                // Skip if it's an IPv6 URL (already handled)
210                if url_str.contains("://[") {
211                    continue;
212                }
213
214                // Skip malformed IPv6-like URLs
215                // Check for IPv6-like patterns that are malformed
216                if let Some(host_start) = url_str.find("://") {
217                    let after_protocol = &url_str[host_start + 3..];
218                    // If it looks like IPv6 (has :: or multiple :) but no brackets, skip if followed by ]
219                    if after_protocol.contains("::") || after_protocol.chars().filter(|&c| c == ':').count() > 1 {
220                        // Check if the next character after our match is ]
221                        if let Some(char_after) = line.chars().nth(mat.end())
222                            && char_after == ']'
223                        {
224                            // This is likely a malformed IPv6 URL like "https://::1]:8080"
225                            continue;
226                        }
227                    }
228                }
229
230                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
231            }
232        }
233
234        // Process found URLs
235        for &(start, end, ref url_str) in buffers.urls_found.iter() {
236            // Skip custom protocols
237            if get_cached_regex(CUSTOM_PROTOCOL_PATTERN_STR)
238                .map(|re| re.is_match(url_str))
239                .unwrap_or(false)
240            {
241                continue;
242            }
243
244            // Check if this URL is inside a markdown link, angle bracket, or image
245            let mut is_inside_construct = false;
246            for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
247                if start >= link_start && end <= link_end {
248                    is_inside_construct = true;
249                    break;
250                }
251            }
252
253            for &(img_start, img_end) in buffers.image_ranges.iter() {
254                if start >= img_start && end <= img_end {
255                    is_inside_construct = true;
256                    break;
257                }
258            }
259
260            if is_inside_construct {
261                continue;
262            }
263
264            // Check if URL is inside an HTML tag
265            if self.is_in_html_tag(line, start) {
266                continue;
267            }
268
269            // Check if we're inside an HTML comment
270            let absolute_pos = content
271                .lines()
272                .take(line_number - 1)
273                .map(|l| l.len() + 1)
274                .sum::<usize>()
275                + start;
276            if self.is_in_html_comment(content, absolute_pos) {
277                continue;
278            }
279
280            // Clean up the URL by removing trailing punctuation
281            let trimmed_url = self.trim_trailing_punctuation(url_str);
282
283            // Only report if we have a valid URL after trimming
284            if !trimmed_url.is_empty() && trimmed_url != "//" {
285                let trimmed_len = trimmed_url.len();
286                let (start_line, start_col, end_line, end_col) =
287                    calculate_url_range(line_number, line, start, trimmed_len);
288
289                warnings.push(LintWarning {
290                    rule_name: Some("MD034"),
291                    line: start_line,
292                    column: start_col,
293                    end_line,
294                    end_column: end_col,
295                    message: format!("URL without angle brackets or link formatting: '{trimmed_url}'"),
296                    severity: Severity::Warning,
297                    fix: Some(Fix {
298                        range: {
299                            let line_start_byte = content
300                                .lines()
301                                .take(line_number - 1)
302                                .map(|l| l.len() + 1)
303                                .sum::<usize>();
304                            (line_start_byte + start)..(line_start_byte + start + trimmed_len)
305                        },
306                        replacement: format!("<{trimmed_url}>"),
307                    }),
308                });
309            }
310        }
311
312        // Check for bare email addresses
313        for cap in EMAIL_PATTERN.captures_iter(line) {
314            if let Some(mat) = cap.get(0) {
315                let email = mat.as_str();
316                let start = mat.start();
317                let end = mat.end();
318
319                // Check if email is inside angle brackets or markdown link
320                let mut is_inside_construct = false;
321                for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
322                    if start >= link_start && end <= link_end {
323                        is_inside_construct = true;
324                        break;
325                    }
326                }
327
328                if !is_inside_construct {
329                    // Check if email is inside an HTML tag
330                    if self.is_in_html_tag(line, start) {
331                        continue;
332                    }
333
334                    // Check if email is inside a code span
335                    let is_in_code_span = code_spans
336                        .iter()
337                        .any(|span| span.line == line_number && start >= span.start_col && start < span.end_col);
338
339                    if !is_in_code_span {
340                        let email_len = end - start;
341                        let (start_line, start_col, end_line, end_col) =
342                            calculate_url_range(line_number, line, start, email_len);
343
344                        warnings.push(LintWarning {
345                            rule_name: Some("MD034"),
346                            line: start_line,
347                            column: start_col,
348                            end_line,
349                            end_column: end_col,
350                            message: format!("Email address without angle brackets or link formatting: '{email}'"),
351                            severity: Severity::Warning,
352                            fix: Some(Fix {
353                                range: {
354                                    let line_start_byte = content
355                                        .lines()
356                                        .take(line_number - 1)
357                                        .map(|l| l.len() + 1)
358                                        .sum::<usize>();
359                                    (line_start_byte + start)..(line_start_byte + end)
360                                },
361                                replacement: format!("<{email}>"),
362                            }),
363                        });
364                    }
365                }
366            }
367        }
368
369        warnings
370    }
371}
372
373impl Rule for MD034NoBareUrls {
    /// Identifier of this rule: `"MD034"`.
    #[inline]
    fn name(&self) -> &'static str {
        "MD034"
    }
378
    /// Expose the rule as `&dyn Any` so callers can downcast it to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
382
    /// Build a boxed instance of the rule.
    ///
    /// `_config` is not read: the rule is a unit struct with nothing to configure here.
    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
    where
        Self: Sized,
    {
        Box::new(MD034NoBareUrls)
    }
389
    /// Category of this rule: `RuleCategory::Link`.
    #[inline]
    fn category(&self) -> RuleCategory {
        RuleCategory::Link
    }
394
395    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
396        !ctx.likely_has_links_or_images() && self.should_skip_content(ctx.content)
397    }
398
    /// One-line human-readable summary of the rule.
    #[inline]
    fn description(&self) -> &'static str {
        "No bare URLs - wrap URLs in angle brackets"
    }
403
404    fn check(&self, ctx: &LintContext) -> LintResult {
405        let mut warnings = Vec::new();
406        let content = ctx.content;
407
408        // Quick skip for content without URLs
409        if self.should_skip_content(content) {
410            return Ok(warnings);
411        }
412
413        // Get code spans for exclusion
414        let code_spans = ctx.code_spans();
415
416        // Allocate reusable buffers once instead of per-line to reduce allocations
417        let mut buffers = LineCheckBuffers::default();
418
419        // Check line by line
420        for (line_num, line) in content.lines().enumerate() {
421            // Skip lines inside code blocks
422            if ctx.is_in_code_block(line_num + 1) {
423                continue;
424            }
425
426            let mut line_warnings = self.check_line(line, content, line_num + 1, &code_spans, &mut buffers);
427
428            // Filter out warnings that are inside code spans
429            line_warnings.retain(|warning| {
430                // Check if the URL is inside a code span
431                !code_spans.iter().any(|span| {
432                    span.line == warning.line &&
433                    warning.column > 0 && // column is 1-indexed
434                    (warning.column - 1) >= span.start_col &&
435                    (warning.column - 1) < span.end_col
436                })
437            });
438
439            warnings.extend(line_warnings);
440        }
441
442        Ok(warnings)
443    }
444
445    fn fix(&self, ctx: &LintContext) -> Result<String, LintError> {
446        let mut content = ctx.content.to_string();
447        let mut warnings = self.check(ctx)?;
448
449        // Sort warnings by position to ensure consistent fix application
450        warnings.sort_by_key(|w| w.fix.as_ref().map(|f| f.range.start).unwrap_or(0));
451
452        // Apply fixes in reverse order to maintain positions
453        for warning in warnings.iter().rev() {
454            if let Some(fix) = &warning.fix {
455                let start = fix.range.start;
456                let end = fix.range.end;
457                content.replace_range(start..end, &fix.replacement);
458            }
459        }
460
461        Ok(content)
462    }
463}