rumdl_lib/rules/
md034_no_bare_urls.rs

1/// Rule MD034: No unformatted URLs
2///
3/// See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
5use crate::utils::range_utils::{LineIndex, calculate_url_range};
6use crate::utils::regex_cache::{EMAIL_PATTERN, get_cached_fancy_regex, get_cached_regex};
7
8use crate::filtered_lines::FilteredLinesExt;
9use crate::lint_context::LintContext;
10
// URL detection patterns
// Cheap pre-filter: matches an http(s)/ftp(s) scheme, an `@`, or `www.` anywhere in the line.
const URL_QUICK_CHECK_STR: &str = r#"(?:https?|ftps?)://|@|www\."#;
// Custom `scheme://` prefixes; check_line skips any candidate URL in which one of these appears.
const CUSTOM_PROTOCOL_PATTERN_STR: &str = r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#;
// Inline link `[text](destination "optional title")`; the text may contain one level of nested
// brackets and the destination contains neither whitespace nor `)`.
const MARKDOWN_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
// Inline link with an empty destination: `[text]()`
const MARKDOWN_EMPTY_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(\)"#;
// Collapsed reference link: `[text][]`
const MARKDOWN_EMPTY_REF_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\[\]"#;
// Angle-bracket autolink `<...>` wrapping either an http(s)/ftp(s) URL (including bracketed
// IPv6 hosts) or an email-like `local@domain.tld` token.
const ANGLE_LINK_PATTERN_STR: &str =
    r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#;
// A whole line consisting only of an image wrapped in a link: `[![alt](image)](target)`
const BADGE_LINK_LINE_STR: &str = r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#;
// Image `![alt](source "optional title")`
const MARKDOWN_IMAGE_PATTERN_STR: &str = r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
// http(s)/ftp(s) URL with optional path, query and fragment. URL characters exclude whitespace,
// angle brackets, square brackets, parentheses, backslash, quotes and backticks, so a match
// never contains `(` or `)`.
const SIMPLE_URL_REGEX_STR: &str = r#"(https?|ftps?)://(?:\[[0-9a-fA-F:%.]+\](?::\d+)?|[^\s<>\[\]()\\'\"`\]]+)(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;
// Pattern to detect www URLs without protocol (e.g., www.example.com)
// Matches www. followed by one or more dot-separated labels and a final alphabetic label of
// at least two letters (the TLD position), plus an optional path
// Note: Uses standard regex. Negative lookbehind (no / or @ before www) is checked in code.
const WWW_URL_REGEX_STR: &str = r#"www\.(?:[a-zA-Z0-9][-a-zA-Z0-9]*\.)+[a-zA-Z]{2,}(?:/[^\s<>\[\]()\\'\"`]*)?"#;
// http(s)/ftp(s) URL whose host is a bracketed IPv6-style literal, with optional port, path,
// query and fragment
const IPV6_URL_REGEX_STR: &str = r#"(https?|ftps?)://\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;
// Reference definition pattern - matches the start of `[label]: URL`
// Supports: [ref]: https://... or [ref]: <https://...>; anything after the URL start
// (such as a "title") is not inspected because the pattern is only anchored at the line start
const REFERENCE_DEF_RE_STR: &str = r"^\s*\[[^\]]+\]:\s*(?:<|(?:https?|ftps?)://)";
// Continuation of a link whose `[` was on an earlier line: no `[` before the first `](...)`
const MULTILINE_LINK_CONTINUATION_STR: &str = r#"^[^\[]*\]\(.*\)"#;
// Pattern to match shortcut reference links: a bracketed `[text]` that is NOT followed
// (after optional whitespace) by `[` or `(`. Collapsed references `[text][]` are therefore not
// matched here; they are covered by MARKDOWN_EMPTY_REF_PATTERN_STR.
// This includes [URL] which should not be flagged as a bare URL
// Requires fancy_regex because of the negative lookahead.
const SHORTCUT_REF_PATTERN_STR: &str = r#"\[([^\[\]]+)\](?!\s*[\[(])"#;
34
/// Reusable buffers for check_line to reduce allocations
///
/// One instance is created per `check` call and handed to every `check_line` invocation,
/// which clears each vector before refilling it for the current line.
#[derive(Default)]
struct LineCheckBuffers {
    /// `(start, end)` regex match offsets within the current line for inline links, empty
    /// links, collapsed/shortcut references and angle-bracket autolinks.
    markdown_link_ranges: Vec<(usize, usize)>,
    /// `(start, end)` regex match offsets within the current line for markdown images.
    image_ranges: Vec<(usize, usize)>,
    /// Candidate URLs on the current line as `(start, end, matched text)`.
    urls_found: Vec<(usize, usize, String)>,
}
42
/// Rule MD034: reports URLs and email addresses that appear without angle brackets or link
/// formatting, and offers a fix that wraps them in `<...>`.
#[derive(Default, Clone)]
pub struct MD034NoBareUrls;
45
46impl MD034NoBareUrls {
47    #[inline]
48    pub fn should_skip_content(&self, content: &str) -> bool {
49        // Skip if content has no URLs and no email addresses
50        // Fast byte scanning for common URL/email indicators
51        let bytes = content.as_bytes();
52        let has_colon = bytes.contains(&b':');
53        let has_at = bytes.contains(&b'@');
54        let has_www = content.contains("www.");
55        !has_colon && !has_at && !has_www
56    }
57
58    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
59    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
60        let mut trimmed = url;
61
62        // Check for balanced parentheses - if we have unmatched closing parens, they're likely punctuation
63        let open_parens = url.chars().filter(|&c| c == '(').count();
64        let close_parens = url.chars().filter(|&c| c == ')').count();
65
66        if close_parens > open_parens {
67            // Find the last balanced closing paren position
68            let mut balance = 0;
69            let mut last_balanced_pos = url.len();
70
71            for (i, c) in url.chars().enumerate() {
72                if c == '(' {
73                    balance += 1;
74                } else if c == ')' {
75                    balance -= 1;
76                    if balance < 0 {
77                        // Found an unmatched closing paren
78                        last_balanced_pos = i;
79                        break;
80                    }
81                }
82            }
83
84            trimmed = &trimmed[..last_balanced_pos];
85        }
86
87        // Trim specific punctuation only if not followed by more URL-like chars
88        while let Some(last_char) = trimmed.chars().last() {
89            if matches!(last_char, '.' | ',' | ';' | ':' | '!' | '?') {
90                // Check if this looks like it could be part of the URL
91                // For ':' specifically, keep it if followed by digits (port number)
92                if last_char == ':' && trimmed.len() > 1 {
93                    // Don't trim
94                    break;
95                }
96                trimmed = &trimmed[..trimmed.len() - 1];
97            } else {
98                break;
99            }
100        }
101
102        trimmed
103    }
104
105    /// Check if line is inside a reference definition
106    fn is_reference_definition(&self, line: &str) -> bool {
107        get_cached_regex(REFERENCE_DEF_RE_STR)
108            .map(|re| re.is_match(line))
109            .unwrap_or(false)
110    }
111
112    fn check_line(
113        &self,
114        line: &str,
115        ctx: &LintContext,
116        line_number: usize,
117        code_spans: &[crate::lint_context::CodeSpan],
118        buffers: &mut LineCheckBuffers,
119        line_index: &LineIndex,
120    ) -> Vec<LintWarning> {
121        let mut warnings = Vec::new();
122
123        // Skip reference definitions
124        if self.is_reference_definition(line) {
125            return warnings;
126        }
127
128        // Skip lines inside HTML blocks - URLs in HTML attributes should not be linted
129        if ctx.line_info(line_number).is_some_and(|info| info.in_html_block) {
130            return warnings;
131        }
132
133        // Skip lines that are continuations of multiline markdown links
134        // Pattern: text](url) without a leading [
135        if let Ok(re) = get_cached_regex(MULTILINE_LINK_CONTINUATION_STR)
136            && re.is_match(line)
137        {
138            return warnings;
139        }
140
141        // Quick check - does this line potentially have a URL or email?
142        let has_quick_check = get_cached_regex(URL_QUICK_CHECK_STR)
143            .map(|re| re.is_match(line))
144            .unwrap_or(false);
145        let has_www = line.contains("www.");
146        let has_at = line.contains('@');
147
148        if !has_quick_check && !has_at && !has_www {
149            return warnings;
150        }
151
152        // Clear and reuse buffers instead of allocating new ones
153        buffers.markdown_link_ranges.clear();
154        if let Ok(re) = get_cached_regex(MARKDOWN_LINK_PATTERN_STR) {
155            for cap in re.captures_iter(line) {
156                if let Some(mat) = cap.get(0) {
157                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
158                }
159            }
160        }
161
162        // Also include empty link patterns like [text]() and [text][]
163        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_LINK_PATTERN_STR) {
164            for mat in re.find_iter(line) {
165                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
166            }
167        }
168
169        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_REF_PATTERN_STR) {
170            for mat in re.find_iter(line) {
171                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
172            }
173        }
174
175        // Also exclude shortcut reference links like [URL] - even if no definition exists,
176        // the brackets indicate user intent to use markdown formatting
177        // Uses fancy_regex for negative lookahead support
178        if let Ok(re) = get_cached_fancy_regex(SHORTCUT_REF_PATTERN_STR) {
179            for mat in re.find_iter(line).flatten() {
180                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
181            }
182        }
183
184        if let Ok(re) = get_cached_regex(ANGLE_LINK_PATTERN_STR) {
185            for cap in re.captures_iter(line) {
186                if let Some(mat) = cap.get(0) {
187                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
188                }
189            }
190        }
191
192        // Find all markdown images for exclusion
193        buffers.image_ranges.clear();
194        if let Ok(re) = get_cached_regex(MARKDOWN_IMAGE_PATTERN_STR) {
195            for cap in re.captures_iter(line) {
196                if let Some(mat) = cap.get(0) {
197                    buffers.image_ranges.push((mat.start(), mat.end()));
198                }
199            }
200        }
201
202        // Check if this line contains only a badge link (common pattern)
203        let is_badge_line = get_cached_regex(BADGE_LINK_LINE_STR)
204            .map(|re| re.is_match(line))
205            .unwrap_or(false);
206
207        if is_badge_line {
208            return warnings;
209        }
210
211        // Find bare URLs
212        buffers.urls_found.clear();
213
214        // First, find IPv6 URLs (they need special handling)
215        if let Ok(re) = get_cached_regex(IPV6_URL_REGEX_STR) {
216            for mat in re.find_iter(line) {
217                let url_str = mat.as_str();
218                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
219            }
220        }
221
222        // Then find regular URLs
223        if let Ok(re) = get_cached_regex(SIMPLE_URL_REGEX_STR) {
224            for mat in re.find_iter(line) {
225                let url_str = mat.as_str();
226
227                // Skip if it's an IPv6 URL (already handled)
228                if url_str.contains("://[") {
229                    continue;
230                }
231
232                // Skip malformed IPv6-like URLs
233                // Check for IPv6-like patterns that are malformed
234                if let Some(host_start) = url_str.find("://") {
235                    let after_protocol = &url_str[host_start + 3..];
236                    // If it looks like IPv6 (has :: or multiple :) but no brackets, skip if followed by ]
237                    if after_protocol.contains("::") || after_protocol.chars().filter(|&c| c == ':').count() > 1 {
238                        // Check if the next character after our match is ]
239                        if let Some(char_after) = line.chars().nth(mat.end())
240                            && char_after == ']'
241                        {
242                            // This is likely a malformed IPv6 URL like "https://::1]:8080"
243                            continue;
244                        }
245                    }
246                }
247
248                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
249            }
250        }
251
252        // Find www URLs without protocol (e.g., www.example.com)
253        if let Ok(re) = get_cached_regex(WWW_URL_REGEX_STR) {
254            for mat in re.find_iter(line) {
255                let url_str = mat.as_str();
256                let start_pos = mat.start();
257                let end_pos = mat.end();
258
259                // Skip if preceded by / or @ (likely part of a full URL)
260                if start_pos > 0 {
261                    let prev_char = line.as_bytes().get(start_pos - 1).copied();
262                    if prev_char == Some(b'/') || prev_char == Some(b'@') {
263                        continue;
264                    }
265                }
266
267                // Skip if inside angle brackets (autolink syntax like <www.example.com>)
268                if start_pos > 0 && end_pos < line.len() {
269                    let prev_char = line.as_bytes().get(start_pos - 1).copied();
270                    let next_char = line.as_bytes().get(end_pos).copied();
271                    if prev_char == Some(b'<') && next_char == Some(b'>') {
272                        continue;
273                    }
274                }
275
276                buffers.urls_found.push((start_pos, end_pos, url_str.to_string()));
277            }
278        }
279
280        // Process found URLs
281        for &(start, end, ref url_str) in buffers.urls_found.iter() {
282            // Skip custom protocols
283            if get_cached_regex(CUSTOM_PROTOCOL_PATTERN_STR)
284                .map(|re| re.is_match(url_str))
285                .unwrap_or(false)
286            {
287                continue;
288            }
289
290            // Check if this URL is inside a markdown link, angle bracket, or image
291            let mut is_inside_construct = false;
292            for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
293                if start >= link_start && end <= link_end {
294                    is_inside_construct = true;
295                    break;
296                }
297            }
298
299            for &(img_start, img_end) in buffers.image_ranges.iter() {
300                if start >= img_start && end <= img_end {
301                    is_inside_construct = true;
302                    break;
303                }
304            }
305
306            if is_inside_construct {
307                continue;
308            }
309
310            // Calculate absolute byte position for context-aware checks
311            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
312            let absolute_pos = line_start_byte + start;
313
314            // Check if URL is inside an HTML tag (handles multiline tags correctly)
315            if ctx.is_in_html_tag(absolute_pos) {
316                continue;
317            }
318
319            // Check if we're inside an HTML comment
320            if ctx.is_in_html_comment(absolute_pos) {
321                continue;
322            }
323
324            // Clean up the URL by removing trailing punctuation
325            let trimmed_url = self.trim_trailing_punctuation(url_str);
326
327            // Only report if we have a valid URL after trimming
328            if !trimmed_url.is_empty() && trimmed_url != "//" {
329                let trimmed_len = trimmed_url.len();
330                let (start_line, start_col, end_line, end_col) =
331                    calculate_url_range(line_number, line, start, trimmed_len);
332
333                warnings.push(LintWarning {
334                    rule_name: Some("MD034".to_string()),
335                    line: start_line,
336                    column: start_col,
337                    end_line,
338                    end_column: end_col,
339                    message: format!("URL without angle brackets or link formatting: '{trimmed_url}'"),
340                    severity: Severity::Warning,
341                    fix: Some(Fix {
342                        range: {
343                            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
344                            (line_start_byte + start)..(line_start_byte + start + trimmed_len)
345                        },
346                        replacement: format!("<{trimmed_url}>"),
347                    }),
348                });
349            }
350        }
351
352        // Check for bare email addresses
353        for cap in EMAIL_PATTERN.captures_iter(line) {
354            if let Some(mat) = cap.get(0) {
355                let email = mat.as_str();
356                let start = mat.start();
357                let end = mat.end();
358
359                // Check if email is inside angle brackets or markdown link
360                let mut is_inside_construct = false;
361                for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
362                    if start >= link_start && end <= link_end {
363                        is_inside_construct = true;
364                        break;
365                    }
366                }
367
368                if !is_inside_construct {
369                    // Calculate absolute byte position for context-aware checks
370                    let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
371                    let absolute_pos = line_start_byte + start;
372
373                    // Check if email is inside an HTML tag (handles multiline tags)
374                    if ctx.is_in_html_tag(absolute_pos) {
375                        continue;
376                    }
377
378                    // Check if email is inside a code span
379                    let is_in_code_span = code_spans
380                        .iter()
381                        .any(|span| span.line == line_number && start >= span.start_col && start < span.end_col);
382
383                    if !is_in_code_span {
384                        let email_len = end - start;
385                        let (start_line, start_col, end_line, end_col) =
386                            calculate_url_range(line_number, line, start, email_len);
387
388                        warnings.push(LintWarning {
389                            rule_name: Some("MD034".to_string()),
390                            line: start_line,
391                            column: start_col,
392                            end_line,
393                            end_column: end_col,
394                            message: format!("Email address without angle brackets or link formatting: '{email}'"),
395                            severity: Severity::Warning,
396                            fix: Some(Fix {
397                                range: (line_start_byte + start)..(line_start_byte + end),
398                                replacement: format!("<{email}>"),
399                            }),
400                        });
401                    }
402                }
403            }
404        }
405
406        warnings
407    }
408}
409
410impl Rule for MD034NoBareUrls {
    /// Returns the rule identifier, `"MD034"`.
    #[inline]
    fn name(&self) -> &'static str {
        "MD034"
    }
415
    /// Exposes the rule as `&dyn Any` so callers can downcast it to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
419
420    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
421    where
422        Self: Sized,
423    {
424        Box::new(MD034NoBareUrls)
425    }
426
    /// Returns the category this rule belongs to: `RuleCategory::Link`.
    #[inline]
    fn category(&self) -> RuleCategory {
        RuleCategory::Link
    }
431
432    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
433        !ctx.likely_has_links_or_images() && self.should_skip_content(ctx.content)
434    }
435
    /// Returns the short human-readable description of the rule.
    #[inline]
    fn description(&self) -> &'static str {
        "No bare URLs - wrap URLs in angle brackets"
    }
440
441    fn check(&self, ctx: &LintContext) -> LintResult {
442        let mut warnings = Vec::new();
443        let content = ctx.content;
444
445        // Quick skip for content without URLs
446        if self.should_skip_content(content) {
447            return Ok(warnings);
448        }
449
450        // Create LineIndex for correct byte position calculations across all line ending types
451        let line_index = &ctx.line_index;
452
453        // Get code spans for exclusion
454        let code_spans = ctx.code_spans();
455
456        // Allocate reusable buffers once instead of per-line to reduce allocations
457        let mut buffers = LineCheckBuffers::default();
458
459        // Iterate over content lines, automatically skipping front matter and code blocks
460        // This uses the filtered iterator API which centralizes the skip logic
461        for line in ctx.filtered_lines().skip_front_matter().skip_code_blocks() {
462            let mut line_warnings =
463                self.check_line(line.content, ctx, line.line_num, &code_spans, &mut buffers, line_index);
464
465            // Filter out warnings that are inside code spans
466            line_warnings.retain(|warning| {
467                // Check if the URL is inside a code span
468                !code_spans.iter().any(|span| {
469                    span.line == warning.line &&
470                    warning.column > 0 && // column is 1-indexed
471                    (warning.column - 1) >= span.start_col &&
472                    (warning.column - 1) < span.end_col
473                })
474            });
475
476            // Filter out warnings where the URL is inside a parsed link
477            // This handles cases like [text]( https://url ) where the URL has leading whitespace
478            // pulldown-cmark correctly parses these as valid links even though our regex misses them
479            line_warnings.retain(|warning| {
480                if let Some(fix) = &warning.fix {
481                    // Check if the fix range falls inside any parsed link's byte range
482                    !ctx.links
483                        .iter()
484                        .any(|link| fix.range.start >= link.byte_offset && fix.range.end <= link.byte_end)
485                } else {
486                    true
487                }
488            });
489
490            warnings.extend(line_warnings);
491        }
492
493        Ok(warnings)
494    }
495
496    fn fix(&self, ctx: &LintContext) -> Result<String, LintError> {
497        let mut content = ctx.content.to_string();
498        let mut warnings = self.check(ctx)?;
499
500        // Sort warnings by position to ensure consistent fix application
501        warnings.sort_by_key(|w| w.fix.as_ref().map(|f| f.range.start).unwrap_or(0));
502
503        // Apply fixes in reverse order to maintain positions
504        for warning in warnings.iter().rev() {
505            if let Some(fix) = &warning.fix {
506                let start = fix.range.start;
507                let end = fix.range.end;
508                content.replace_range(start..end, &fix.replacement);
509            }
510        }
511
512        Ok(content)
513    }
514}