rumdl_lib/rules/
md034_no_bare_urls.rs

//! Rule MD034: No bare URLs
//!
//! See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
5use crate::utils::range_utils::{LineIndex, calculate_url_range};
6use crate::utils::regex_cache::{
7    EMAIL_PATTERN, URL_IPV6_STR, URL_QUICK_CHECK_STR, URL_STANDARD_STR, URL_WWW_STR, get_cached_fancy_regex,
8    get_cached_regex,
9};
10
11use crate::filtered_lines::FilteredLinesExt;
12use crate::lint_context::LintContext;
13
// MD034-specific patterns for markdown constructs.
// Core URL patterns (URL_QUICK_CHECK_STR, URL_STANDARD_STR, etc.) are imported from regex_cache.

/// `scheme://` prefixes for protocols whose URLs are never reported.
/// The pattern is unanchored, so it matches when one of these prefixes appears
/// anywhere in the text it is tested against, not only at the start.
const CUSTOM_PROTOCOL_PATTERN_STR: &str = r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#;
/// Inline link `[text](dest)` with an optional quoted title. The text may contain one
/// level of nested brackets; the destination (capture group 1) contains no whitespace or `)`.
const MARKDOWN_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
/// Inline link with an empty destination: `[text]()`.
const MARKDOWN_EMPTY_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(\)"#;
/// Collapsed reference link: `[text][]`.
const MARKDOWN_EMPTY_REF_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\[\]"#;
/// Angle-bracket autolink: `<http(s)://…>` / `<ftp(s)://…>` (including bracketed IPv6 hosts)
/// or an email-shaped `<local@host.tld>`.
const ANGLE_LINK_PATTERN_STR: &str =
    r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#;
/// A whole line that consists only of a linked image ("badge"): `[![alt](image)](target)`.
const BADGE_LINK_LINE_STR: &str = r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#;
/// Inline image `![alt](src)` with an optional quoted title.
const MARKDOWN_IMAGE_PATTERN_STR: &str = r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
// Reference definition pattern - matches the start of a definition line: `[label]:` followed by
// either `<` or an http(s)/ftp(s) URL scheme. Only this prefix is matched, not the rest of the line.
const REFERENCE_DEF_RE_STR: &str = r"^\s*\[[^\]]+\]:\s*(?:<|(?:https?|ftps?)://)";
/// Tail of a link whose text started on an earlier line: no `[` occurs before the
/// first `](`, and a `)` follows later on the line.
const MULTILINE_LINK_CONTINUATION_STR: &str = r#"^[^\[]*\]\(.*\)"#;
// Pattern to match shortcut reference links: `[text]` that is NOT followed (after optional
// whitespace) by `[` or `(`. Because of that lookahead, `[text][]` and `[text](…)` are not
// matched here. The `(?!…)` lookahead is why this pattern is compiled with fancy_regex.
const SHORTCUT_REF_PATTERN_STR: &str = r#"\[([^\[\]]+)\](?!\s*[\[(])"#;
29
/// Reusable buffers for check_line to reduce allocations.
///
/// `check_line` clears each vector before refilling it, so one instance can be
/// shared across all lines of a document.
#[derive(Default)]
struct LineCheckBuffers {
    /// `(start, end)` offsets, taken from regex matches on the current line, of link-like
    /// constructs (inline links, empty links, reference-style links, angle-bracket autolinks).
    markdown_link_ranges: Vec<(usize, usize)>,
    /// `(start, end)` offsets, taken from regex matches on the current line, of inline images.
    image_ranges: Vec<(usize, usize)>,
    /// `(start, end, matched text)` for every URL candidate found on the current line.
    urls_found: Vec<(usize, usize, String)>,
}
37
/// Rule MD034: reports URLs and email addresses that are not wrapped in angle
/// brackets or markdown link syntax.
///
/// The rule is a stateless unit struct; it has no configuration fields.
#[derive(Debug, Default, Clone)]
pub struct MD034NoBareUrls;
40
41impl MD034NoBareUrls {
42    #[inline]
43    pub fn should_skip_content(&self, content: &str) -> bool {
44        // Skip if content has no URLs and no email addresses
45        // Fast byte scanning for common URL/email indicators
46        let bytes = content.as_bytes();
47        let has_colon = bytes.contains(&b':');
48        let has_at = bytes.contains(&b'@');
49        let has_www = content.contains("www.");
50        !has_colon && !has_at && !has_www
51    }
52
53    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
54    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
55        let mut trimmed = url;
56
57        // Check for balanced parentheses - if we have unmatched closing parens, they're likely punctuation
58        let open_parens = url.chars().filter(|&c| c == '(').count();
59        let close_parens = url.chars().filter(|&c| c == ')').count();
60
61        if close_parens > open_parens {
62            // Find the last balanced closing paren position
63            let mut balance = 0;
64            let mut last_balanced_pos = url.len();
65
66            for (byte_idx, c) in url.char_indices() {
67                if c == '(' {
68                    balance += 1;
69                } else if c == ')' {
70                    balance -= 1;
71                    if balance < 0 {
72                        // Found an unmatched closing paren
73                        last_balanced_pos = byte_idx;
74                        break;
75                    }
76                }
77            }
78
79            trimmed = &trimmed[..last_balanced_pos];
80        }
81
82        // Trim specific punctuation only if not followed by more URL-like chars
83        while let Some(last_char) = trimmed.chars().last() {
84            if matches!(last_char, '.' | ',' | ';' | ':' | '!' | '?') {
85                // Check if this looks like it could be part of the URL
86                // For ':' specifically, keep it if followed by digits (port number)
87                if last_char == ':' && trimmed.len() > 1 {
88                    // Don't trim
89                    break;
90                }
91                trimmed = &trimmed[..trimmed.len() - 1];
92            } else {
93                break;
94            }
95        }
96
97        trimmed
98    }
99
100    /// Check if line is inside a reference definition
101    fn is_reference_definition(&self, line: &str) -> bool {
102        get_cached_regex(REFERENCE_DEF_RE_STR)
103            .map(|re| re.is_match(line))
104            .unwrap_or(false)
105    }
106
107    fn check_line(
108        &self,
109        line: &str,
110        ctx: &LintContext,
111        line_number: usize,
112        code_spans: &[crate::lint_context::CodeSpan],
113        buffers: &mut LineCheckBuffers,
114        line_index: &LineIndex,
115    ) -> Vec<LintWarning> {
116        let mut warnings = Vec::new();
117
118        // Skip reference definitions
119        if self.is_reference_definition(line) {
120            return warnings;
121        }
122
123        // Skip lines inside HTML blocks - URLs in HTML attributes should not be linted
124        if ctx.line_info(line_number).is_some_and(|info| info.in_html_block) {
125            return warnings;
126        }
127
128        // Skip lines that are continuations of multiline markdown links
129        // Pattern: text](url) without a leading [
130        if let Ok(re) = get_cached_regex(MULTILINE_LINK_CONTINUATION_STR)
131            && re.is_match(line)
132        {
133            return warnings;
134        }
135
136        // Quick check - does this line potentially have a URL or email?
137        let has_quick_check = get_cached_regex(URL_QUICK_CHECK_STR)
138            .map(|re| re.is_match(line))
139            .unwrap_or(false);
140        let has_www = line.contains("www.");
141        let has_at = line.contains('@');
142
143        if !has_quick_check && !has_at && !has_www {
144            return warnings;
145        }
146
147        // Clear and reuse buffers instead of allocating new ones
148        buffers.markdown_link_ranges.clear();
149        if let Ok(re) = get_cached_regex(MARKDOWN_LINK_PATTERN_STR) {
150            for cap in re.captures_iter(line) {
151                if let Some(mat) = cap.get(0) {
152                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
153                }
154            }
155        }
156
157        // Also include empty link patterns like [text]() and [text][]
158        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_LINK_PATTERN_STR) {
159            for mat in re.find_iter(line) {
160                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
161            }
162        }
163
164        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_REF_PATTERN_STR) {
165            for mat in re.find_iter(line) {
166                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
167            }
168        }
169
170        // Also exclude shortcut reference links like [URL] - even if no definition exists,
171        // the brackets indicate user intent to use markdown formatting
172        // Uses fancy_regex for negative lookahead support
173        if let Ok(re) = get_cached_fancy_regex(SHORTCUT_REF_PATTERN_STR) {
174            for mat in re.find_iter(line).flatten() {
175                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
176            }
177        }
178
179        if let Ok(re) = get_cached_regex(ANGLE_LINK_PATTERN_STR) {
180            for cap in re.captures_iter(line) {
181                if let Some(mat) = cap.get(0) {
182                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
183                }
184            }
185        }
186
187        // Find all markdown images for exclusion
188        buffers.image_ranges.clear();
189        if let Ok(re) = get_cached_regex(MARKDOWN_IMAGE_PATTERN_STR) {
190            for cap in re.captures_iter(line) {
191                if let Some(mat) = cap.get(0) {
192                    buffers.image_ranges.push((mat.start(), mat.end()));
193                }
194            }
195        }
196
197        // Check if this line contains only a badge link (common pattern)
198        let is_badge_line = get_cached_regex(BADGE_LINK_LINE_STR)
199            .map(|re| re.is_match(line))
200            .unwrap_or(false);
201
202        if is_badge_line {
203            return warnings;
204        }
205
206        // Find bare URLs
207        buffers.urls_found.clear();
208
209        // First, find IPv6 URLs (they need special handling)
210        if let Ok(re) = get_cached_regex(URL_IPV6_STR) {
211            for mat in re.find_iter(line) {
212                let url_str = mat.as_str();
213                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
214            }
215        }
216
217        // Then find regular URLs
218        if let Ok(re) = get_cached_regex(URL_STANDARD_STR) {
219            for mat in re.find_iter(line) {
220                let url_str = mat.as_str();
221
222                // Skip if it's an IPv6 URL (already handled)
223                if url_str.contains("://[") {
224                    continue;
225                }
226
227                // Skip malformed IPv6-like URLs
228                // Check for IPv6-like patterns that are malformed
229                if let Some(host_start) = url_str.find("://") {
230                    let after_protocol = &url_str[host_start + 3..];
231                    // If it looks like IPv6 (has :: or multiple :) but no brackets, skip if followed by ]
232                    if after_protocol.contains("::") || after_protocol.chars().filter(|&c| c == ':').count() > 1 {
233                        // Check if the next character after our match is ]
234                        if let Some(char_after) = line.chars().nth(mat.end())
235                            && char_after == ']'
236                        {
237                            // This is likely a malformed IPv6 URL like "https://::1]:8080"
238                            continue;
239                        }
240                    }
241                }
242
243                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
244            }
245        }
246
247        // Find www URLs without protocol (e.g., www.example.com)
248        if let Ok(re) = get_cached_regex(URL_WWW_STR) {
249            for mat in re.find_iter(line) {
250                let url_str = mat.as_str();
251                let start_pos = mat.start();
252                let end_pos = mat.end();
253
254                // Skip if preceded by / or @ (likely part of a full URL)
255                if start_pos > 0 {
256                    let prev_char = line.as_bytes().get(start_pos - 1).copied();
257                    if prev_char == Some(b'/') || prev_char == Some(b'@') {
258                        continue;
259                    }
260                }
261
262                // Skip if inside angle brackets (autolink syntax like <www.example.com>)
263                if start_pos > 0 && end_pos < line.len() {
264                    let prev_char = line.as_bytes().get(start_pos - 1).copied();
265                    let next_char = line.as_bytes().get(end_pos).copied();
266                    if prev_char == Some(b'<') && next_char == Some(b'>') {
267                        continue;
268                    }
269                }
270
271                buffers.urls_found.push((start_pos, end_pos, url_str.to_string()));
272            }
273        }
274
275        // Process found URLs
276        for &(start, _end, ref url_str) in buffers.urls_found.iter() {
277            // Skip custom protocols
278            if get_cached_regex(CUSTOM_PROTOCOL_PATTERN_STR)
279                .map(|re| re.is_match(url_str))
280                .unwrap_or(false)
281            {
282                continue;
283            }
284
285            // Check if this URL is inside a markdown link, angle bracket, or image
286            // We check if the URL starts within a construct, not if it's entirely contained.
287            // This handles cases where URL detection may include trailing characters
288            // that extend past the construct boundary (e.g., parentheses).
289            let mut is_inside_construct = false;
290            for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
291                if start >= link_start && start < link_end {
292                    is_inside_construct = true;
293                    break;
294                }
295            }
296
297            for &(img_start, img_end) in buffers.image_ranges.iter() {
298                if start >= img_start && start < img_end {
299                    is_inside_construct = true;
300                    break;
301                }
302            }
303
304            if is_inside_construct {
305                continue;
306            }
307
308            // Calculate absolute byte position for context-aware checks
309            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
310            let absolute_pos = line_start_byte + start;
311
312            // Check if URL is inside an HTML tag (handles multiline tags correctly)
313            if ctx.is_in_html_tag(absolute_pos) {
314                continue;
315            }
316
317            // Check if we're inside an HTML comment
318            if ctx.is_in_html_comment(absolute_pos) {
319                continue;
320            }
321
322            // Clean up the URL by removing trailing punctuation
323            let trimmed_url = self.trim_trailing_punctuation(url_str);
324
325            // Only report if we have a valid URL after trimming
326            if !trimmed_url.is_empty() && trimmed_url != "//" {
327                let trimmed_len = trimmed_url.len();
328                let (start_line, start_col, end_line, end_col) =
329                    calculate_url_range(line_number, line, start, trimmed_len);
330
331                warnings.push(LintWarning {
332                    rule_name: Some("MD034".to_string()),
333                    line: start_line,
334                    column: start_col,
335                    end_line,
336                    end_column: end_col,
337                    message: format!("URL without angle brackets or link formatting: '{trimmed_url}'"),
338                    severity: Severity::Warning,
339                    fix: Some(Fix {
340                        range: {
341                            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
342                            (line_start_byte + start)..(line_start_byte + start + trimmed_len)
343                        },
344                        replacement: format!("<{trimmed_url}>"),
345                    }),
346                });
347            }
348        }
349
350        // Check for bare email addresses
351        for cap in EMAIL_PATTERN.captures_iter(line) {
352            if let Some(mat) = cap.get(0) {
353                let email = mat.as_str();
354                let start = mat.start();
355                let end = mat.end();
356
357                // Check if email is inside angle brackets or markdown link
358                let mut is_inside_construct = false;
359                for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
360                    if start >= link_start && end <= link_end {
361                        is_inside_construct = true;
362                        break;
363                    }
364                }
365
366                if !is_inside_construct {
367                    // Calculate absolute byte position for context-aware checks
368                    let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
369                    let absolute_pos = line_start_byte + start;
370
371                    // Check if email is inside an HTML tag (handles multiline tags)
372                    if ctx.is_in_html_tag(absolute_pos) {
373                        continue;
374                    }
375
376                    // Check if email is inside a code span
377                    let is_in_code_span = code_spans
378                        .iter()
379                        .any(|span| span.line == line_number && start >= span.start_col && start < span.end_col);
380
381                    if !is_in_code_span {
382                        let email_len = end - start;
383                        let (start_line, start_col, end_line, end_col) =
384                            calculate_url_range(line_number, line, start, email_len);
385
386                        warnings.push(LintWarning {
387                            rule_name: Some("MD034".to_string()),
388                            line: start_line,
389                            column: start_col,
390                            end_line,
391                            end_column: end_col,
392                            message: format!("Email address without angle brackets or link formatting: '{email}'"),
393                            severity: Severity::Warning,
394                            fix: Some(Fix {
395                                range: (line_start_byte + start)..(line_start_byte + end),
396                                replacement: format!("<{email}>"),
397                            }),
398                        });
399                    }
400                }
401            }
402        }
403
404        warnings
405    }
406}
407
408impl Rule for MD034NoBareUrls {
409    #[inline]
410    fn name(&self) -> &'static str {
411        "MD034"
412    }
413
414    fn as_any(&self) -> &dyn std::any::Any {
415        self
416    }
417
418    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
419    where
420        Self: Sized,
421    {
422        Box::new(MD034NoBareUrls)
423    }
424
425    #[inline]
426    fn category(&self) -> RuleCategory {
427        RuleCategory::Link
428    }
429
430    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
431        !ctx.likely_has_links_or_images() && self.should_skip_content(ctx.content)
432    }
433
434    #[inline]
435    fn description(&self) -> &'static str {
436        "No bare URLs - wrap URLs in angle brackets"
437    }
438
439    fn check(&self, ctx: &LintContext) -> LintResult {
440        let mut warnings = Vec::new();
441        let content = ctx.content;
442
443        // Quick skip for content without URLs
444        if self.should_skip_content(content) {
445            return Ok(warnings);
446        }
447
448        // Create LineIndex for correct byte position calculations across all line ending types
449        let line_index = &ctx.line_index;
450
451        // Get code spans for exclusion
452        let code_spans = ctx.code_spans();
453
454        // Allocate reusable buffers once instead of per-line to reduce allocations
455        let mut buffers = LineCheckBuffers::default();
456
457        // Iterate over content lines, automatically skipping front matter and code blocks
458        // This uses the filtered iterator API which centralizes the skip logic
459        for line in ctx.filtered_lines().skip_front_matter().skip_code_blocks() {
460            let mut line_warnings =
461                self.check_line(line.content, ctx, line.line_num, &code_spans, &mut buffers, line_index);
462
463            // Filter out warnings that are inside code spans
464            line_warnings.retain(|warning| {
465                // Check if the URL is inside a code span
466                !code_spans.iter().any(|span| {
467                    span.line == warning.line &&
468                    warning.column > 0 && // column is 1-indexed
469                    (warning.column - 1) >= span.start_col &&
470                    (warning.column - 1) < span.end_col
471                })
472            });
473
474            // Filter out warnings where the URL is inside a parsed link
475            // This handles cases like [text]( https://url ) where the URL has leading whitespace
476            // pulldown-cmark correctly parses these as valid links even though our regex misses them
477            line_warnings.retain(|warning| {
478                if let Some(fix) = &warning.fix {
479                    // Check if the fix range falls inside any parsed link's byte range
480                    !ctx.links
481                        .iter()
482                        .any(|link| fix.range.start >= link.byte_offset && fix.range.end <= link.byte_end)
483                } else {
484                    true
485                }
486            });
487
488            warnings.extend(line_warnings);
489        }
490
491        Ok(warnings)
492    }
493
494    fn fix(&self, ctx: &LintContext) -> Result<String, LintError> {
495        let mut content = ctx.content.to_string();
496        let mut warnings = self.check(ctx)?;
497
498        // Sort warnings by position to ensure consistent fix application
499        warnings.sort_by_key(|w| w.fix.as_ref().map(|f| f.range.start).unwrap_or(0));
500
501        // Apply fixes in reverse order to maintain positions
502        for warning in warnings.iter().rev() {
503            if let Some(fix) = &warning.fix {
504                let start = fix.range.start;
505                let end = fix.range.end;
506                content.replace_range(start..end, &fix.replacement);
507            }
508        }
509
510        Ok(content)
511    }
512}