rumdl_lib/rules/
md034_no_bare_urls.rs

1/// Rule MD034: No unformatted URLs
2///
3/// See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
5use crate::utils::range_utils::{LineIndex, calculate_url_range};
6use crate::utils::regex_cache::{
7    EMAIL_PATTERN, URL_IPV6_STR, URL_QUICK_CHECK_STR, URL_STANDARD_STR, URL_WWW_STR, XMPP_URI_STR,
8    get_cached_fancy_regex, get_cached_regex,
9};
10
11use crate::filtered_lines::FilteredLinesExt;
12use crate::lint_context::LintContext;
13
// MD034-specific patterns for markdown constructs
// Core URL patterns (URL_QUICK_CHECK_STR, URL_STANDARD_STR, etc.) are imported from regex_cache
// Scheme prefixes such as `grpc://` or `ssh://`. The pattern is unanchored, so a detected URL
// that contains one of these anywhere is not reported.
const CUSTOM_PROTOCOL_PATTERN_STR: &str = r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#;
// Inline link `[text](dest "title")`; the text may contain one level of nested brackets and
// capture group 1 is the destination.
const MARKDOWN_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
// Inline link with an empty destination: `[text]()`
const MARKDOWN_EMPTY_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(\)"#;
// Collapsed reference link: `[text][]`
const MARKDOWN_EMPTY_REF_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\[\]"#;
// Autolinks in angle brackets: matches `<http(s)/ftp(s) URL>`, `<xmpp:...>` and `<user@host.tld>`.
// Anything matched here is already formatted and is therefore excluded from reporting.
const ANGLE_LINK_PATTERN_STR: &str =
    r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|xmpp:[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#;
// A line that consists solely of a linked image (badge): `[![alt](image)](target)`
const BADGE_LINK_LINE_STR: &str = r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#;
// Image `![alt](src "title")`; group 1 is the alt text, group 2 the source.
const MARKDOWN_IMAGE_PATTERN_STR: &str = r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
// Start of a reference definition: `[label]:` followed by `<` or an http(s)/ftp(s) URL.
// Only this prefix is matched; the remainder of the line (URL body, title) is not inspected.
const REFERENCE_DEF_RE_STR: &str = r"^\s*\[[^\]]+\]:\s*(?:<|(?:https?|ftps?)://)";
// Line that closes a link opened on an earlier line: `](...)` appears before any `[`.
const MULTILINE_LINK_CONTINUATION_STR: &str = r#"^[^\[]*\]\(.*\)"#;
// Shortcut reference link `[text]`: bracketed text that is NOT followed by `[` or `(`.
// The negative lookahead requires fancy_regex; `[text][]` and `[text](...)` are deliberately
// rejected here and handled by the patterns above.
const SHORTCUT_REF_PATTERN_STR: &str = r#"\[([^\[\]]+)\](?!\s*[\[(])"#;
30
/// Reusable buffers for check_line to reduce allocations
///
/// One instance is created per `check` run and handed to every `check_line` call, which
/// clears each vector before refilling it for the current line.
#[derive(Default)]
struct LineCheckBuffers {
    /// `(start, end)` byte offsets within the current line of constructs that already
    /// format a URL/email: inline links, empty links, reference-style links and autolinks.
    markdown_link_ranges: Vec<(usize, usize)>,
    /// `(start, end)` byte offsets within the current line of markdown images.
    image_ranges: Vec<(usize, usize)>,
    /// Candidate bare URLs on the current line as `(start, end, matched text)`;
    /// offsets are byte offsets of the regex match.
    urls_found: Vec<(usize, usize, String)>,
}
38
/// Rule MD034: reports URLs, XMPP URIs and email addresses that appear in text without
/// angle brackets or link formatting, and offers a fix that wraps them in `<...>`.
///
/// The rule carries no configuration or state.
#[derive(Default, Clone)]
pub struct MD034NoBareUrls;
41
42impl MD034NoBareUrls {
43    #[inline]
44    pub fn should_skip_content(&self, content: &str) -> bool {
45        // Skip if content has no URLs, XMPP URIs, or email addresses
46        // Fast byte scanning for common URL/email/xmpp indicators
47        let bytes = content.as_bytes();
48        let has_colon = bytes.contains(&b':');
49        let has_at = bytes.contains(&b'@');
50        let has_www = content.contains("www.");
51        !has_colon && !has_at && !has_www
52    }
53
54    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
55    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
56        let mut trimmed = url;
57
58        // Check for balanced parentheses - if we have unmatched closing parens, they're likely punctuation
59        let open_parens = url.chars().filter(|&c| c == '(').count();
60        let close_parens = url.chars().filter(|&c| c == ')').count();
61
62        if close_parens > open_parens {
63            // Find the last balanced closing paren position
64            let mut balance = 0;
65            let mut last_balanced_pos = url.len();
66
67            for (byte_idx, c) in url.char_indices() {
68                if c == '(' {
69                    balance += 1;
70                } else if c == ')' {
71                    balance -= 1;
72                    if balance < 0 {
73                        // Found an unmatched closing paren
74                        last_balanced_pos = byte_idx;
75                        break;
76                    }
77                }
78            }
79
80            trimmed = &trimmed[..last_balanced_pos];
81        }
82
83        // Trim specific punctuation only if not followed by more URL-like chars
84        while let Some(last_char) = trimmed.chars().last() {
85            if matches!(last_char, '.' | ',' | ';' | ':' | '!' | '?') {
86                // Check if this looks like it could be part of the URL
87                // For ':' specifically, keep it if followed by digits (port number)
88                if last_char == ':' && trimmed.len() > 1 {
89                    // Don't trim
90                    break;
91                }
92                trimmed = &trimmed[..trimmed.len() - 1];
93            } else {
94                break;
95            }
96        }
97
98        trimmed
99    }
100
101    /// Check if line is inside a reference definition
102    fn is_reference_definition(&self, line: &str) -> bool {
103        get_cached_regex(REFERENCE_DEF_RE_STR)
104            .map(|re| re.is_match(line))
105            .unwrap_or(false)
106    }
107
108    fn check_line(
109        &self,
110        line: &str,
111        ctx: &LintContext,
112        line_number: usize,
113        code_spans: &[crate::lint_context::CodeSpan],
114        buffers: &mut LineCheckBuffers,
115        line_index: &LineIndex,
116    ) -> Vec<LintWarning> {
117        let mut warnings = Vec::new();
118
119        // Skip reference definitions
120        if self.is_reference_definition(line) {
121            return warnings;
122        }
123
124        // Skip lines inside HTML blocks - URLs in HTML attributes should not be linted
125        if ctx.line_info(line_number).is_some_and(|info| info.in_html_block) {
126            return warnings;
127        }
128
129        // Skip lines that are continuations of multiline markdown links
130        // Pattern: text](url) without a leading [
131        if let Ok(re) = get_cached_regex(MULTILINE_LINK_CONTINUATION_STR)
132            && re.is_match(line)
133        {
134            return warnings;
135        }
136
137        // Quick check - does this line potentially have a URL or email?
138        let has_quick_check = get_cached_regex(URL_QUICK_CHECK_STR)
139            .map(|re| re.is_match(line))
140            .unwrap_or(false);
141        let has_www = line.contains("www.");
142        let has_at = line.contains('@');
143
144        if !has_quick_check && !has_at && !has_www {
145            return warnings;
146        }
147
148        // Clear and reuse buffers instead of allocating new ones
149        buffers.markdown_link_ranges.clear();
150        if let Ok(re) = get_cached_regex(MARKDOWN_LINK_PATTERN_STR) {
151            for cap in re.captures_iter(line) {
152                if let Some(mat) = cap.get(0) {
153                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
154                }
155            }
156        }
157
158        // Also include empty link patterns like [text]() and [text][]
159        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_LINK_PATTERN_STR) {
160            for mat in re.find_iter(line) {
161                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
162            }
163        }
164
165        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_REF_PATTERN_STR) {
166            for mat in re.find_iter(line) {
167                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
168            }
169        }
170
171        // Also exclude shortcut reference links like [URL] - even if no definition exists,
172        // the brackets indicate user intent to use markdown formatting
173        // Uses fancy_regex for negative lookahead support
174        if let Ok(re) = get_cached_fancy_regex(SHORTCUT_REF_PATTERN_STR) {
175            for mat in re.find_iter(line).flatten() {
176                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
177            }
178        }
179
180        if let Ok(re) = get_cached_regex(ANGLE_LINK_PATTERN_STR) {
181            for cap in re.captures_iter(line) {
182                if let Some(mat) = cap.get(0) {
183                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
184                }
185            }
186        }
187
188        // Find all markdown images for exclusion
189        buffers.image_ranges.clear();
190        if let Ok(re) = get_cached_regex(MARKDOWN_IMAGE_PATTERN_STR) {
191            for cap in re.captures_iter(line) {
192                if let Some(mat) = cap.get(0) {
193                    buffers.image_ranges.push((mat.start(), mat.end()));
194                }
195            }
196        }
197
198        // Check if this line contains only a badge link (common pattern)
199        let is_badge_line = get_cached_regex(BADGE_LINK_LINE_STR)
200            .map(|re| re.is_match(line))
201            .unwrap_or(false);
202
203        if is_badge_line {
204            return warnings;
205        }
206
207        // Find bare URLs
208        buffers.urls_found.clear();
209
210        // First, find IPv6 URLs (they need special handling)
211        if let Ok(re) = get_cached_regex(URL_IPV6_STR) {
212            for mat in re.find_iter(line) {
213                let url_str = mat.as_str();
214                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
215            }
216        }
217
218        // Then find regular URLs
219        if let Ok(re) = get_cached_regex(URL_STANDARD_STR) {
220            for mat in re.find_iter(line) {
221                let url_str = mat.as_str();
222
223                // Skip if it's an IPv6 URL (already handled)
224                if url_str.contains("://[") {
225                    continue;
226                }
227
228                // Skip malformed IPv6-like URLs
229                // Check for IPv6-like patterns that are malformed
230                if let Some(host_start) = url_str.find("://") {
231                    let after_protocol = &url_str[host_start + 3..];
232                    // If it looks like IPv6 (has :: or multiple :) but no brackets, skip if followed by ]
233                    if after_protocol.contains("::") || after_protocol.chars().filter(|&c| c == ':').count() > 1 {
234                        // Check if the next character after our match is ]
235                        if let Some(char_after) = line.chars().nth(mat.end())
236                            && char_after == ']'
237                        {
238                            // This is likely a malformed IPv6 URL like "https://::1]:8080"
239                            continue;
240                        }
241                    }
242                }
243
244                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
245            }
246        }
247
248        // Find www URLs without protocol (e.g., www.example.com)
249        if let Ok(re) = get_cached_regex(URL_WWW_STR) {
250            for mat in re.find_iter(line) {
251                let url_str = mat.as_str();
252                let start_pos = mat.start();
253                let end_pos = mat.end();
254
255                // Skip if preceded by / or @ (likely part of a full URL)
256                if start_pos > 0 {
257                    let prev_char = line.as_bytes().get(start_pos - 1).copied();
258                    if prev_char == Some(b'/') || prev_char == Some(b'@') {
259                        continue;
260                    }
261                }
262
263                // Skip if inside angle brackets (autolink syntax like <www.example.com>)
264                if start_pos > 0 && end_pos < line.len() {
265                    let prev_char = line.as_bytes().get(start_pos - 1).copied();
266                    let next_char = line.as_bytes().get(end_pos).copied();
267                    if prev_char == Some(b'<') && next_char == Some(b'>') {
268                        continue;
269                    }
270                }
271
272                buffers.urls_found.push((start_pos, end_pos, url_str.to_string()));
273            }
274        }
275
276        // Find XMPP URIs (GFM extended autolinks: xmpp:user@domain/resource)
277        if let Ok(re) = get_cached_regex(XMPP_URI_STR) {
278            for mat in re.find_iter(line) {
279                let uri_str = mat.as_str();
280                let start_pos = mat.start();
281                let end_pos = mat.end();
282
283                // Skip if inside angle brackets (already properly formatted: <xmpp:user@domain>)
284                if start_pos > 0 && end_pos < line.len() {
285                    let prev_char = line.as_bytes().get(start_pos - 1).copied();
286                    let next_char = line.as_bytes().get(end_pos).copied();
287                    if prev_char == Some(b'<') && next_char == Some(b'>') {
288                        continue;
289                    }
290                }
291
292                buffers.urls_found.push((start_pos, end_pos, uri_str.to_string()));
293            }
294        }
295
296        // Process found URLs
297        for &(start, _end, ref url_str) in buffers.urls_found.iter() {
298            // Skip custom protocols
299            if get_cached_regex(CUSTOM_PROTOCOL_PATTERN_STR)
300                .map(|re| re.is_match(url_str))
301                .unwrap_or(false)
302            {
303                continue;
304            }
305
306            // Check if this URL is inside a markdown link, angle bracket, or image
307            // We check if the URL starts within a construct, not if it's entirely contained.
308            // This handles cases where URL detection may include trailing characters
309            // that extend past the construct boundary (e.g., parentheses).
310            let mut is_inside_construct = false;
311            for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
312                if start >= link_start && start < link_end {
313                    is_inside_construct = true;
314                    break;
315                }
316            }
317
318            for &(img_start, img_end) in buffers.image_ranges.iter() {
319                if start >= img_start && start < img_end {
320                    is_inside_construct = true;
321                    break;
322                }
323            }
324
325            if is_inside_construct {
326                continue;
327            }
328
329            // Calculate absolute byte position for context-aware checks
330            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
331            let absolute_pos = line_start_byte + start;
332
333            // Check if URL is inside an HTML tag (handles multiline tags correctly)
334            if ctx.is_in_html_tag(absolute_pos) {
335                continue;
336            }
337
338            // Check if we're inside an HTML comment
339            if ctx.is_in_html_comment(absolute_pos) {
340                continue;
341            }
342
343            // Check if we're inside a Hugo/Quarto shortcode
344            if ctx.is_in_shortcode(absolute_pos) {
345                continue;
346            }
347
348            // Clean up the URL by removing trailing punctuation
349            let trimmed_url = self.trim_trailing_punctuation(url_str);
350
351            // Only report if we have a valid URL after trimming
352            if !trimmed_url.is_empty() && trimmed_url != "//" {
353                let trimmed_len = trimmed_url.len();
354                let (start_line, start_col, end_line, end_col) =
355                    calculate_url_range(line_number, line, start, trimmed_len);
356
357                // For www URLs without protocol, add https:// prefix in the fix
358                let replacement = if trimmed_url.starts_with("www.") {
359                    format!("<https://{trimmed_url}>")
360                } else {
361                    format!("<{trimmed_url}>")
362                };
363
364                warnings.push(LintWarning {
365                    rule_name: Some("MD034".to_string()),
366                    line: start_line,
367                    column: start_col,
368                    end_line,
369                    end_column: end_col,
370                    message: format!("URL without angle brackets or link formatting: '{trimmed_url}'"),
371                    severity: Severity::Warning,
372                    fix: Some(Fix {
373                        range: {
374                            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
375                            (line_start_byte + start)..(line_start_byte + start + trimmed_len)
376                        },
377                        replacement,
378                    }),
379                });
380            }
381        }
382
383        // Check for bare email addresses
384        for cap in EMAIL_PATTERN.captures_iter(line) {
385            if let Some(mat) = cap.get(0) {
386                let email = mat.as_str();
387                let start = mat.start();
388                let end = mat.end();
389
390                // Skip if email is part of an XMPP URI (xmpp:user@domain)
391                // Check character boundary to avoid panics with multi-byte UTF-8
392                if start >= 5 && line.is_char_boundary(start - 5) && &line[start - 5..start] == "xmpp:" {
393                    continue;
394                }
395
396                // Check if email is inside angle brackets or markdown link
397                let mut is_inside_construct = false;
398                for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
399                    if start >= link_start && end <= link_end {
400                        is_inside_construct = true;
401                        break;
402                    }
403                }
404
405                if !is_inside_construct {
406                    // Calculate absolute byte position for context-aware checks
407                    let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
408                    let absolute_pos = line_start_byte + start;
409
410                    // Check if email is inside an HTML tag (handles multiline tags)
411                    if ctx.is_in_html_tag(absolute_pos) {
412                        continue;
413                    }
414
415                    // Check if email is inside a code span
416                    let is_in_code_span = code_spans
417                        .iter()
418                        .any(|span| span.line == line_number && start >= span.start_col && start < span.end_col);
419
420                    if !is_in_code_span {
421                        let email_len = end - start;
422                        let (start_line, start_col, end_line, end_col) =
423                            calculate_url_range(line_number, line, start, email_len);
424
425                        warnings.push(LintWarning {
426                            rule_name: Some("MD034".to_string()),
427                            line: start_line,
428                            column: start_col,
429                            end_line,
430                            end_column: end_col,
431                            message: format!("Email address without angle brackets or link formatting: '{email}'"),
432                            severity: Severity::Warning,
433                            fix: Some(Fix {
434                                range: (line_start_byte + start)..(line_start_byte + end),
435                                replacement: format!("<{email}>"),
436                            }),
437                        });
438                    }
439                }
440            }
441        }
442
443        warnings
444    }
445}
446
447impl Rule for MD034NoBareUrls {
448    #[inline]
449    fn name(&self) -> &'static str {
450        "MD034"
451    }
452
453    fn as_any(&self) -> &dyn std::any::Any {
454        self
455    }
456
457    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
458    where
459        Self: Sized,
460    {
461        Box::new(MD034NoBareUrls)
462    }
463
464    #[inline]
465    fn category(&self) -> RuleCategory {
466        RuleCategory::Link
467    }
468
469    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
470        !ctx.likely_has_links_or_images() && self.should_skip_content(ctx.content)
471    }
472
473    #[inline]
474    fn description(&self) -> &'static str {
475        "No bare URLs - wrap URLs in angle brackets"
476    }
477
478    fn check(&self, ctx: &LintContext) -> LintResult {
479        let mut warnings = Vec::new();
480        let content = ctx.content;
481
482        // Quick skip for content without URLs
483        if self.should_skip_content(content) {
484            return Ok(warnings);
485        }
486
487        // Create LineIndex for correct byte position calculations across all line ending types
488        let line_index = &ctx.line_index;
489
490        // Get code spans for exclusion
491        let code_spans = ctx.code_spans();
492
493        // Allocate reusable buffers once instead of per-line to reduce allocations
494        let mut buffers = LineCheckBuffers::default();
495
496        // Iterate over content lines, automatically skipping front matter and code blocks
497        // This uses the filtered iterator API which centralizes the skip logic
498        for line in ctx.filtered_lines().skip_front_matter().skip_code_blocks() {
499            let mut line_warnings =
500                self.check_line(line.content, ctx, line.line_num, &code_spans, &mut buffers, line_index);
501
502            // Filter out warnings that are inside code spans
503            line_warnings.retain(|warning| {
504                // Check if the URL is inside a code span
505                !code_spans.iter().any(|span| {
506                    span.line == warning.line &&
507                    warning.column > 0 && // column is 1-indexed
508                    (warning.column - 1) >= span.start_col &&
509                    (warning.column - 1) < span.end_col
510                })
511            });
512
513            // Filter out warnings where the URL is inside a parsed link
514            // This handles cases like [text]( https://url ) where the URL has leading whitespace
515            // pulldown-cmark correctly parses these as valid links even though our regex misses them
516            line_warnings.retain(|warning| {
517                if let Some(fix) = &warning.fix {
518                    // Check if the fix range falls inside any parsed link's byte range
519                    !ctx.links
520                        .iter()
521                        .any(|link| fix.range.start >= link.byte_offset && fix.range.end <= link.byte_end)
522                } else {
523                    true
524                }
525            });
526
527            warnings.extend(line_warnings);
528        }
529
530        Ok(warnings)
531    }
532
533    fn fix(&self, ctx: &LintContext) -> Result<String, LintError> {
534        let mut content = ctx.content.to_string();
535        let mut warnings = self.check(ctx)?;
536
537        // Sort warnings by position to ensure consistent fix application
538        warnings.sort_by_key(|w| w.fix.as_ref().map(|f| f.range.start).unwrap_or(0));
539
540        // Apply fixes in reverse order to maintain positions
541        for warning in warnings.iter().rev() {
542            if let Some(fix) = &warning.fix {
543                let start = fix.range.start;
544                let end = fix.range.end;
545                content.replace_range(start..end, &fix.replacement);
546            }
547        }
548
549        Ok(content)
550    }
551}