rumdl_lib/rules/
md034_no_bare_urls.rs

1/// Rule MD034: No unformatted URLs
2///
3/// See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
5use crate::utils::range_utils::{LineIndex, calculate_url_range};
6use crate::utils::regex_cache::{EMAIL_PATTERN, get_cached_regex};
7
8use crate::filtered_lines::FilteredLinesExt;
9use crate::lint_context::LintContext;
10
// Regex sources used by MD034. They are compiled lazily through `get_cached_regex`.

/// Cheap prefilter: an `http(s)://` / `ftp(s)://` scheme prefix, or an `@` sign.
const URL_QUICK_CHECK_STR: &str = r#"(?:https?|ftps?)://|@"#;

/// Non-web scheme prefixes; a found URL whose text matches this pattern is not reported.
const CUSTOM_PROTOCOL_PATTERN_STR: &str = r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#;

/// Inline link `[text](target)` with an optional quoted title; the text may contain one
/// level of nested brackets.
const MARKDOWN_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;

/// Inline link with an empty target: `[text]()`.
const MARKDOWN_EMPTY_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(\)"#;

/// Collapsed reference link: `[text][]`.
const MARKDOWN_EMPTY_REF_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\[\]"#;

/// Autolink in angle brackets: `<scheme://...>` (including bracketed IPv6 hosts) or
/// `<user@host.tld>`.
const ANGLE_LINK_PATTERN_STR: &str =
    r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#;

/// A whole line consisting of a single linked image: `[![alt](img)](target)`.
const BADGE_LINK_LINE_STR: &str = r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#;

/// Image `![alt](src)` with an optional quoted title.
const MARKDOWN_IMAGE_PATTERN_STR: &str = r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;

/// General `http(s)` / `ftp(s)` URL with optional path, query and fragment parts.
const SIMPLE_URL_REGEX_STR: &str = r#"(https?|ftps?)://(?:\[[0-9a-fA-F:%.]+\](?::\d+)?|[^\s<>\[\]()\\'\"`\]]+)(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;

/// URL whose host is a bracketed IPv6 literal, with optional port, path, query and fragment.
const IPV6_URL_REGEX_STR: &str = r#"(https?|ftps?)://\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;

/// A whole line that is a reference definition pointing at a URL: `[label]: https://...`.
const REFERENCE_DEF_RE_STR: &str = r"^\s*\[[^\]]+\]:\s*(?:https?|ftps?)://\S+$";

/// Anything between `<` and the next `>`.
const HTML_TAG_PATTERN_STR: &str = r#"<[^>]*>"#;

/// Tail of a link whose opening `[` is not on this line: `...](target)`.
const MULTILINE_LINK_CONTINUATION_STR: &str = r#"^[^\[]*\]\(.*\)"#;
26
/// Scratch storage shared across `check_line` calls so the vectors are allocated once
/// per document rather than once per line.
#[derive(Default)]
struct LineCheckBuffers {
    /// Byte ranges `(start, end)` within the current line covered by link constructs.
    markdown_link_ranges: Vec<(usize, usize)>,
    /// Byte ranges `(start, end)` within the current line covered by images.
    image_ranges: Vec<(usize, usize)>,
    /// URL candidates on the current line as `(start, end, matched text)`.
    urls_found: Vec<(usize, usize, String)>,
}
34
/// Rule MD034: reports URLs and email addresses that are not wrapped in angle brackets
/// or link syntax. The rule carries no state.
#[derive(Clone, Default)]
pub struct MD034NoBareUrls;
37
38impl MD034NoBareUrls {
39    #[inline]
40    pub fn should_skip_content(&self, content: &str) -> bool {
41        // Skip if content has no URLs and no email addresses
42        // Fast byte scanning for common URL/email indicators
43        let bytes = content.as_bytes();
44        !bytes.contains(&b':') && !bytes.contains(&b'@')
45    }
46
47    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
48    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
49        let mut trimmed = url;
50
51        // Check for balanced parentheses - if we have unmatched closing parens, they're likely punctuation
52        let open_parens = url.chars().filter(|&c| c == '(').count();
53        let close_parens = url.chars().filter(|&c| c == ')').count();
54
55        if close_parens > open_parens {
56            // Find the last balanced closing paren position
57            let mut balance = 0;
58            let mut last_balanced_pos = url.len();
59
60            for (i, c) in url.chars().enumerate() {
61                if c == '(' {
62                    balance += 1;
63                } else if c == ')' {
64                    balance -= 1;
65                    if balance < 0 {
66                        // Found an unmatched closing paren
67                        last_balanced_pos = i;
68                        break;
69                    }
70                }
71            }
72
73            trimmed = &trimmed[..last_balanced_pos];
74        }
75
76        // Trim specific punctuation only if not followed by more URL-like chars
77        while let Some(last_char) = trimmed.chars().last() {
78            if matches!(last_char, '.' | ',' | ';' | ':' | '!' | '?') {
79                // Check if this looks like it could be part of the URL
80                // For ':' specifically, keep it if followed by digits (port number)
81                if last_char == ':' && trimmed.len() > 1 {
82                    // Don't trim
83                    break;
84                }
85                trimmed = &trimmed[..trimmed.len() - 1];
86            } else {
87                break;
88            }
89        }
90
91        trimmed
92    }
93
94    /// Check if line is inside a reference definition
95    fn is_reference_definition(&self, line: &str) -> bool {
96        get_cached_regex(REFERENCE_DEF_RE_STR)
97            .map(|re| re.is_match(line))
98            .unwrap_or(false)
99    }
100
101    /// Check if a position in a line is inside an HTML tag
102    fn is_in_html_tag(&self, line: &str, pos: usize) -> bool {
103        // Find all HTML tags in the line
104        if let Ok(re) = get_cached_regex(HTML_TAG_PATTERN_STR) {
105            for mat in re.find_iter(line) {
106                if pos >= mat.start() && pos < mat.end() {
107                    return true;
108                }
109            }
110        }
111        false
112    }
113
114    fn check_line(
115        &self,
116        line: &str,
117        ctx: &LintContext,
118        line_number: usize,
119        code_spans: &[crate::lint_context::CodeSpan],
120        buffers: &mut LineCheckBuffers,
121        line_index: &LineIndex,
122    ) -> Vec<LintWarning> {
123        let mut warnings = Vec::new();
124
125        // Skip reference definitions
126        if self.is_reference_definition(line) {
127            return warnings;
128        }
129
130        // Skip lines inside HTML blocks - URLs in HTML attributes should not be linted
131        if ctx.line_info(line_number).is_some_and(|info| info.in_html_block) {
132            return warnings;
133        }
134
135        // Skip lines that are continuations of multiline markdown links
136        // Pattern: text](url) without a leading [
137        if let Ok(re) = get_cached_regex(MULTILINE_LINK_CONTINUATION_STR)
138            && re.is_match(line)
139        {
140            return warnings;
141        }
142
143        // Quick check - does this line potentially have a URL or email?
144        if let Ok(re) = get_cached_regex(URL_QUICK_CHECK_STR)
145            && !re.is_match(line)
146            && !line.contains('@')
147        {
148            return warnings;
149        }
150
151        // Clear and reuse buffers instead of allocating new ones
152        buffers.markdown_link_ranges.clear();
153        if let Ok(re) = get_cached_regex(MARKDOWN_LINK_PATTERN_STR) {
154            for cap in re.captures_iter(line) {
155                if let Some(mat) = cap.get(0) {
156                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
157                }
158            }
159        }
160
161        // Also include empty link patterns like [text]() and [text][]
162        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_LINK_PATTERN_STR) {
163            for mat in re.find_iter(line) {
164                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
165            }
166        }
167
168        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_REF_PATTERN_STR) {
169            for mat in re.find_iter(line) {
170                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
171            }
172        }
173
174        if let Ok(re) = get_cached_regex(ANGLE_LINK_PATTERN_STR) {
175            for cap in re.captures_iter(line) {
176                if let Some(mat) = cap.get(0) {
177                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
178                }
179            }
180        }
181
182        // Find all markdown images for exclusion
183        buffers.image_ranges.clear();
184        if let Ok(re) = get_cached_regex(MARKDOWN_IMAGE_PATTERN_STR) {
185            for cap in re.captures_iter(line) {
186                if let Some(mat) = cap.get(0) {
187                    buffers.image_ranges.push((mat.start(), mat.end()));
188                }
189            }
190        }
191
192        // Check if this line contains only a badge link (common pattern)
193        let is_badge_line = get_cached_regex(BADGE_LINK_LINE_STR)
194            .map(|re| re.is_match(line))
195            .unwrap_or(false);
196
197        if is_badge_line {
198            return warnings;
199        }
200
201        // Find bare URLs
202        buffers.urls_found.clear();
203
204        // First, find IPv6 URLs (they need special handling)
205        if let Ok(re) = get_cached_regex(IPV6_URL_REGEX_STR) {
206            for mat in re.find_iter(line) {
207                let url_str = mat.as_str();
208                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
209            }
210        }
211
212        // Then find regular URLs
213        if let Ok(re) = get_cached_regex(SIMPLE_URL_REGEX_STR) {
214            for mat in re.find_iter(line) {
215                let url_str = mat.as_str();
216
217                // Skip if it's an IPv6 URL (already handled)
218                if url_str.contains("://[") {
219                    continue;
220                }
221
222                // Skip malformed IPv6-like URLs
223                // Check for IPv6-like patterns that are malformed
224                if let Some(host_start) = url_str.find("://") {
225                    let after_protocol = &url_str[host_start + 3..];
226                    // If it looks like IPv6 (has :: or multiple :) but no brackets, skip if followed by ]
227                    if after_protocol.contains("::") || after_protocol.chars().filter(|&c| c == ':').count() > 1 {
228                        // Check if the next character after our match is ]
229                        if let Some(char_after) = line.chars().nth(mat.end())
230                            && char_after == ']'
231                        {
232                            // This is likely a malformed IPv6 URL like "https://::1]:8080"
233                            continue;
234                        }
235                    }
236                }
237
238                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
239            }
240        }
241
242        // Process found URLs
243        for &(start, end, ref url_str) in buffers.urls_found.iter() {
244            // Skip custom protocols
245            if get_cached_regex(CUSTOM_PROTOCOL_PATTERN_STR)
246                .map(|re| re.is_match(url_str))
247                .unwrap_or(false)
248            {
249                continue;
250            }
251
252            // Check if this URL is inside a markdown link, angle bracket, or image
253            let mut is_inside_construct = false;
254            for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
255                if start >= link_start && end <= link_end {
256                    is_inside_construct = true;
257                    break;
258                }
259            }
260
261            for &(img_start, img_end) in buffers.image_ranges.iter() {
262                if start >= img_start && end <= img_end {
263                    is_inside_construct = true;
264                    break;
265                }
266            }
267
268            if is_inside_construct {
269                continue;
270            }
271
272            // Check if URL is inside an HTML tag
273            if self.is_in_html_tag(line, start) {
274                continue;
275            }
276
277            // Check if we're inside an HTML comment
278            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
279            let absolute_pos = line_start_byte + start;
280            if ctx.is_in_html_comment(absolute_pos) {
281                continue;
282            }
283
284            // Clean up the URL by removing trailing punctuation
285            let trimmed_url = self.trim_trailing_punctuation(url_str);
286
287            // Only report if we have a valid URL after trimming
288            if !trimmed_url.is_empty() && trimmed_url != "//" {
289                let trimmed_len = trimmed_url.len();
290                let (start_line, start_col, end_line, end_col) =
291                    calculate_url_range(line_number, line, start, trimmed_len);
292
293                warnings.push(LintWarning {
294                    rule_name: Some("MD034".to_string()),
295                    line: start_line,
296                    column: start_col,
297                    end_line,
298                    end_column: end_col,
299                    message: format!("URL without angle brackets or link formatting: '{trimmed_url}'"),
300                    severity: Severity::Warning,
301                    fix: Some(Fix {
302                        range: {
303                            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
304                            (line_start_byte + start)..(line_start_byte + start + trimmed_len)
305                        },
306                        replacement: format!("<{trimmed_url}>"),
307                    }),
308                });
309            }
310        }
311
312        // Check for bare email addresses
313        for cap in EMAIL_PATTERN.captures_iter(line) {
314            if let Some(mat) = cap.get(0) {
315                let email = mat.as_str();
316                let start = mat.start();
317                let end = mat.end();
318
319                // Check if email is inside angle brackets or markdown link
320                let mut is_inside_construct = false;
321                for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
322                    if start >= link_start && end <= link_end {
323                        is_inside_construct = true;
324                        break;
325                    }
326                }
327
328                if !is_inside_construct {
329                    // Check if email is inside an HTML tag
330                    if self.is_in_html_tag(line, start) {
331                        continue;
332                    }
333
334                    // Check if email is inside a code span
335                    let is_in_code_span = code_spans
336                        .iter()
337                        .any(|span| span.line == line_number && start >= span.start_col && start < span.end_col);
338
339                    if !is_in_code_span {
340                        let email_len = end - start;
341                        let (start_line, start_col, end_line, end_col) =
342                            calculate_url_range(line_number, line, start, email_len);
343
344                        warnings.push(LintWarning {
345                            rule_name: Some("MD034".to_string()),
346                            line: start_line,
347                            column: start_col,
348                            end_line,
349                            end_column: end_col,
350                            message: format!("Email address without angle brackets or link formatting: '{email}'"),
351                            severity: Severity::Warning,
352                            fix: Some(Fix {
353                                range: {
354                                    let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
355                                    (line_start_byte + start)..(line_start_byte + end)
356                                },
357                                replacement: format!("<{email}>"),
358                            }),
359                        });
360                    }
361                }
362            }
363        }
364
365        warnings
366    }
367}
368
369impl Rule for MD034NoBareUrls {
370    #[inline]
371    fn name(&self) -> &'static str {
372        "MD034"
373    }
374
375    fn as_any(&self) -> &dyn std::any::Any {
376        self
377    }
378
379    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
380    where
381        Self: Sized,
382    {
383        Box::new(MD034NoBareUrls)
384    }
385
386    #[inline]
387    fn category(&self) -> RuleCategory {
388        RuleCategory::Link
389    }
390
391    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
392        !ctx.likely_has_links_or_images() && self.should_skip_content(ctx.content)
393    }
394
395    #[inline]
396    fn description(&self) -> &'static str {
397        "No bare URLs - wrap URLs in angle brackets"
398    }
399
400    fn check(&self, ctx: &LintContext) -> LintResult {
401        let mut warnings = Vec::new();
402        let content = ctx.content;
403
404        // Quick skip for content without URLs
405        if self.should_skip_content(content) {
406            return Ok(warnings);
407        }
408
409        // Create LineIndex for correct byte position calculations across all line ending types
410        let line_index = &ctx.line_index;
411
412        // Get code spans for exclusion
413        let code_spans = ctx.code_spans();
414
415        // Allocate reusable buffers once instead of per-line to reduce allocations
416        let mut buffers = LineCheckBuffers::default();
417
418        // Iterate over content lines, automatically skipping front matter and code blocks
419        // This uses the filtered iterator API which centralizes the skip logic
420        for line in ctx.filtered_lines().skip_front_matter().skip_code_blocks() {
421            let mut line_warnings =
422                self.check_line(line.content, ctx, line.line_num, &code_spans, &mut buffers, line_index);
423
424            // Filter out warnings that are inside code spans
425            line_warnings.retain(|warning| {
426                // Check if the URL is inside a code span
427                !code_spans.iter().any(|span| {
428                    span.line == warning.line &&
429                    warning.column > 0 && // column is 1-indexed
430                    (warning.column - 1) >= span.start_col &&
431                    (warning.column - 1) < span.end_col
432                })
433            });
434
435            // Filter out warnings where the URL is inside a parsed link
436            // This handles cases like [text]( https://url ) where the URL has leading whitespace
437            // pulldown-cmark correctly parses these as valid links even though our regex misses them
438            line_warnings.retain(|warning| {
439                if let Some(fix) = &warning.fix {
440                    // Check if the fix range falls inside any parsed link's byte range
441                    !ctx.links
442                        .iter()
443                        .any(|link| fix.range.start >= link.byte_offset && fix.range.end <= link.byte_end)
444                } else {
445                    true
446                }
447            });
448
449            warnings.extend(line_warnings);
450        }
451
452        Ok(warnings)
453    }
454
455    fn fix(&self, ctx: &LintContext) -> Result<String, LintError> {
456        let mut content = ctx.content.to_string();
457        let mut warnings = self.check(ctx)?;
458
459        // Sort warnings by position to ensure consistent fix application
460        warnings.sort_by_key(|w| w.fix.as_ref().map(|f| f.range.start).unwrap_or(0));
461
462        // Apply fixes in reverse order to maintain positions
463        for warning in warnings.iter().rev() {
464            if let Some(fix) = &warning.fix {
465                let start = fix.range.start;
466                let end = fix.range.end;
467                content.replace_range(start..end, &fix.replacement);
468            }
469        }
470
471        Ok(content)
472    }
473}