rumdl_lib/rules/
md034_no_bare_urls.rs

1/// Rule MD034: No unformatted URLs
2///
3/// See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
5use crate::utils::range_utils::calculate_url_range;
6use crate::utils::regex_cache::{EMAIL_PATTERN, get_cached_regex};
7
8use crate::lint_context::LintContext;
9
// URL detection patterns
//
// All of these are regex sources that are compiled on demand through `get_cached_regex`.

/// Cheap pre-filter: an `http(s)://` / `ftp(s)://` scheme prefix, or a literal `@`.
const URL_QUICK_CHECK_STR: &str = r#"(?:https?|ftps?)://|@"#;
/// Non-web schemes followed by `://`; candidate URLs matching this pattern are not reported.
const CUSTOM_PROTOCOL_PATTERN_STR: &str = r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#;
/// Inline link `[text](destination "title")`: the text may contain one level of nested
/// brackets, and the quoted title (single or double quotes) is optional.
const MARKDOWN_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
/// Angle-bracket autolink `<...>` wrapping either an `http(s)`/`ftp(s)` URL (including a
/// bracketed IPv6 host) or an email-like `local@domain.tld` string.
const ANGLE_LINK_PATTERN_STR: &str =
    r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#;
/// A line that consists solely of an image wrapped in a link: `[![alt](image)](target)`.
const BADGE_LINK_LINE_STR: &str = r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#;
/// Inline image `![alt](source "title")`; whitespace is tolerated after `!` and before `(`.
const MARKDOWN_IMAGE_PATTERN_STR: &str = r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
/// Bare `http(s)`/`ftp(s)` URL with optional path, query and fragment. The match stops at
/// whitespace, angle brackets, square brackets, parentheses, backslashes, quotes and backticks.
const SIMPLE_URL_REGEX_STR: &str = r#"(https?|ftps?)://(?:\[[0-9a-fA-F:%.]+\](?::\d+)?|[^\s<>\[\]()\\'\"`\]]+)(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;
/// Bare URL whose host is a bracketed IPv6 literal, with optional port, path, query and fragment.
const IPV6_URL_REGEX_STR: &str = r#"(https?|ftps?)://\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;
/// Reference definition line `[label]: url` where the URL runs to the end of the line
/// (a definition followed by a title does not match).
const REFERENCE_DEF_RE_STR: &str = r"^\s*\[[^\]]+\]:\s*(?:https?|ftps?)://\S+$";
/// HTML comment `<!-- ... -->`, matched non-greedily and allowed to span multiple lines.
const HTML_COMMENT_PATTERN_STR: &str = r#"<!--[\s\S]*?-->"#;
/// Anything from a `<` up to the next `>`.
const HTML_TAG_PATTERN_STR: &str = r#"<[^>]*>"#;
/// A line containing `](...)` with no `[` before the `]`, i.e. the tail of a link whose
/// text started on an earlier line.
const MULTILINE_LINK_CONTINUATION_STR: &str = r#"^[^\[]*\]\(.*\)"#;
24
/// Stateless lint rule MD034: reports bare URLs and bare email addresses that are not
/// wrapped in angle brackets or markdown link syntax.
#[derive(Debug, Default, Clone)]
pub struct MD034NoBareUrls;
27
28impl MD034NoBareUrls {
29    #[inline]
30    pub fn should_skip(&self, content: &str) -> bool {
31        // Skip if content has no URLs and no email addresses
32        // Fast byte scanning for common URL/email indicators
33        let bytes = content.as_bytes();
34        !bytes.contains(&b':') && !bytes.contains(&b'@')
35    }
36
37    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
38    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
39        let mut trimmed = url;
40
41        // Check for balanced parentheses - if we have unmatched closing parens, they're likely punctuation
42        let open_parens = url.chars().filter(|&c| c == '(').count();
43        let close_parens = url.chars().filter(|&c| c == ')').count();
44
45        if close_parens > open_parens {
46            // Find the last balanced closing paren position
47            let mut balance = 0;
48            let mut last_balanced_pos = url.len();
49
50            for (i, c) in url.chars().enumerate() {
51                if c == '(' {
52                    balance += 1;
53                } else if c == ')' {
54                    balance -= 1;
55                    if balance < 0 {
56                        // Found an unmatched closing paren
57                        last_balanced_pos = i;
58                        break;
59                    }
60                }
61            }
62
63            trimmed = &trimmed[..last_balanced_pos];
64        }
65
66        // Trim specific punctuation only if not followed by more URL-like chars
67        while let Some(last_char) = trimmed.chars().last() {
68            if matches!(last_char, '.' | ',' | ';' | ':' | '!' | '?') {
69                // Check if this looks like it could be part of the URL
70                // For ':' specifically, keep it if followed by digits (port number)
71                if last_char == ':' && trimmed.len() > 1 {
72                    // Don't trim
73                    break;
74                }
75                trimmed = &trimmed[..trimmed.len() - 1];
76            } else {
77                break;
78            }
79        }
80
81        trimmed
82    }
83
84    /// Check if line is inside a reference definition
85    fn is_reference_definition(&self, line: &str) -> bool {
86        get_cached_regex(REFERENCE_DEF_RE_STR)
87            .map(|re| re.is_match(line))
88            .unwrap_or(false)
89    }
90
91    /// Check if we're inside an HTML comment
92    fn is_in_html_comment(&self, content: &str, pos: usize) -> bool {
93        // Find all HTML comments in the content
94        if let Ok(re) = get_cached_regex(HTML_COMMENT_PATTERN_STR) {
95            for mat in re.find_iter(content) {
96                if pos >= mat.start() && pos < mat.end() {
97                    return true;
98                }
99            }
100        }
101        false
102    }
103
104    /// Check if a position in a line is inside an HTML tag
105    fn is_in_html_tag(&self, line: &str, pos: usize) -> bool {
106        // Find all HTML tags in the line
107        if let Ok(re) = get_cached_regex(HTML_TAG_PATTERN_STR) {
108            for mat in re.find_iter(line) {
109                if pos >= mat.start() && pos < mat.end() {
110                    return true;
111                }
112            }
113        }
114        false
115    }
116
117    fn check_line(
118        &self,
119        line: &str,
120        content: &str,
121        line_number: usize,
122        code_spans: &[crate::lint_context::CodeSpan],
123    ) -> Vec<LintWarning> {
124        let mut warnings = Vec::new();
125
126        // Skip reference definitions
127        if self.is_reference_definition(line) {
128            return warnings;
129        }
130
131        // Skip lines that are continuations of multiline markdown links
132        // Pattern: text](url) without a leading [
133        if let Ok(re) = get_cached_regex(MULTILINE_LINK_CONTINUATION_STR)
134            && re.is_match(line)
135        {
136            return warnings;
137        }
138
139        // Quick check - does this line potentially have a URL or email?
140        if let Ok(re) = get_cached_regex(URL_QUICK_CHECK_STR)
141            && !re.is_match(line)
142            && !line.contains('@')
143        {
144            return warnings;
145        }
146
147        // Find all markdown links and angle bracket links for exclusion
148        let mut markdown_link_ranges = Vec::new();
149        if let Ok(re) = get_cached_regex(MARKDOWN_LINK_PATTERN_STR) {
150            for cap in re.captures_iter(line) {
151                if let Some(mat) = cap.get(0) {
152                    markdown_link_ranges.push((mat.start(), mat.end()));
153                }
154            }
155        }
156
157        if let Ok(re) = get_cached_regex(ANGLE_LINK_PATTERN_STR) {
158            for cap in re.captures_iter(line) {
159                if let Some(mat) = cap.get(0) {
160                    markdown_link_ranges.push((mat.start(), mat.end()));
161                }
162            }
163        }
164
165        // Find all markdown images for exclusion
166        let mut image_ranges = Vec::new();
167        if let Ok(re) = get_cached_regex(MARKDOWN_IMAGE_PATTERN_STR) {
168            for cap in re.captures_iter(line) {
169                if let Some(mat) = cap.get(0) {
170                    image_ranges.push((mat.start(), mat.end()));
171                }
172            }
173        }
174
175        // Check if this line contains only a badge link (common pattern)
176        let is_badge_line = get_cached_regex(BADGE_LINK_LINE_STR)
177            .map(|re| re.is_match(line))
178            .unwrap_or(false);
179
180        if is_badge_line {
181            return warnings;
182        }
183
184        // Find bare URLs
185        let mut urls_found = Vec::new();
186
187        // First, find IPv6 URLs (they need special handling)
188        if let Ok(re) = get_cached_regex(IPV6_URL_REGEX_STR) {
189            for mat in re.find_iter(line) {
190                let url_str = mat.as_str();
191                urls_found.push((mat.start(), mat.end(), url_str.to_string()));
192            }
193        }
194
195        // Then find regular URLs
196        if let Ok(re) = get_cached_regex(SIMPLE_URL_REGEX_STR) {
197            for mat in re.find_iter(line) {
198                let url_str = mat.as_str();
199
200                // Skip if it's an IPv6 URL (already handled)
201                if url_str.contains("://[") {
202                    continue;
203                }
204
205                // Skip malformed IPv6-like URLs
206                // Check for IPv6-like patterns that are malformed
207                if let Some(host_start) = url_str.find("://") {
208                    let after_protocol = &url_str[host_start + 3..];
209                    // If it looks like IPv6 (has :: or multiple :) but no brackets, skip if followed by ]
210                    if after_protocol.contains("::") || after_protocol.chars().filter(|&c| c == ':').count() > 1 {
211                        // Check if the next character after our match is ]
212                        if let Some(char_after) = line.chars().nth(mat.end())
213                            && char_after == ']'
214                        {
215                            // This is likely a malformed IPv6 URL like "https://::1]:8080"
216                            continue;
217                        }
218                    }
219                }
220
221                urls_found.push((mat.start(), mat.end(), url_str.to_string()));
222            }
223        }
224
225        // Process found URLs
226        for (start, end, url_str) in urls_found {
227            // Skip custom protocols
228            if get_cached_regex(CUSTOM_PROTOCOL_PATTERN_STR)
229                .map(|re| re.is_match(&url_str))
230                .unwrap_or(false)
231            {
232                continue;
233            }
234
235            // Check if this URL is inside a markdown link, angle bracket, or image
236            let mut is_inside_construct = false;
237            for &(link_start, link_end) in &markdown_link_ranges {
238                if start >= link_start && end <= link_end {
239                    is_inside_construct = true;
240                    break;
241                }
242            }
243
244            for &(img_start, img_end) in &image_ranges {
245                if start >= img_start && end <= img_end {
246                    is_inside_construct = true;
247                    break;
248                }
249            }
250
251            if is_inside_construct {
252                continue;
253            }
254
255            // Check if URL is inside an HTML tag
256            if self.is_in_html_tag(line, start) {
257                continue;
258            }
259
260            // Check if we're inside an HTML comment
261            let absolute_pos = content
262                .lines()
263                .take(line_number - 1)
264                .map(|l| l.len() + 1)
265                .sum::<usize>()
266                + start;
267            if self.is_in_html_comment(content, absolute_pos) {
268                continue;
269            }
270
271            // Clean up the URL by removing trailing punctuation
272            let trimmed_url = self.trim_trailing_punctuation(&url_str);
273
274            // Only report if we have a valid URL after trimming
275            if !trimmed_url.is_empty() && trimmed_url != "//" {
276                let trimmed_len = trimmed_url.len();
277                let (start_line, start_col, end_line, end_col) =
278                    calculate_url_range(line_number, line, start, trimmed_len);
279
280                warnings.push(LintWarning {
281                    rule_name: Some("MD034"),
282                    line: start_line,
283                    column: start_col,
284                    end_line,
285                    end_column: end_col,
286                    message: format!("URL without angle brackets or link formatting: '{trimmed_url}'"),
287                    severity: Severity::Warning,
288                    fix: Some(Fix {
289                        range: {
290                            let line_start_byte = content
291                                .lines()
292                                .take(line_number - 1)
293                                .map(|l| l.len() + 1)
294                                .sum::<usize>();
295                            (line_start_byte + start)..(line_start_byte + start + trimmed_len)
296                        },
297                        replacement: format!("<{trimmed_url}>"),
298                    }),
299                });
300            }
301        }
302
303        // Check for bare email addresses
304        for cap in EMAIL_PATTERN.captures_iter(line) {
305            if let Some(mat) = cap.get(0) {
306                let email = mat.as_str();
307                let start = mat.start();
308                let end = mat.end();
309
310                // Check if email is inside angle brackets or markdown link
311                let mut is_inside_construct = false;
312                for &(link_start, link_end) in &markdown_link_ranges {
313                    if start >= link_start && end <= link_end {
314                        is_inside_construct = true;
315                        break;
316                    }
317                }
318
319                if !is_inside_construct {
320                    // Check if email is inside an HTML tag
321                    if self.is_in_html_tag(line, start) {
322                        continue;
323                    }
324
325                    // Check if email is inside a code span
326                    let is_in_code_span = code_spans
327                        .iter()
328                        .any(|span| span.line == line_number && start >= span.start_col && start < span.end_col);
329
330                    if !is_in_code_span {
331                        let email_len = end - start;
332                        let (start_line, start_col, end_line, end_col) =
333                            calculate_url_range(line_number, line, start, email_len);
334
335                        warnings.push(LintWarning {
336                            rule_name: Some("MD034"),
337                            line: start_line,
338                            column: start_col,
339                            end_line,
340                            end_column: end_col,
341                            message: format!("Email address without angle brackets or link formatting: '{email}'"),
342                            severity: Severity::Warning,
343                            fix: Some(Fix {
344                                range: {
345                                    let line_start_byte = content
346                                        .lines()
347                                        .take(line_number - 1)
348                                        .map(|l| l.len() + 1)
349                                        .sum::<usize>();
350                                    (line_start_byte + start)..(line_start_byte + end)
351                                },
352                                replacement: format!("<{email}>"),
353                            }),
354                        });
355                    }
356                }
357            }
358        }
359
360        warnings
361    }
362}
363
364impl Rule for MD034NoBareUrls {
    /// Returns the rule identifier, `"MD034"`.
    #[inline]
    fn name(&self) -> &'static str {
        "MD034"
    }
369
    /// Exposes the rule as `&dyn Any` so callers can downcast to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
373
374    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
375    where
376        Self: Sized,
377    {
378        Box::new(MD034NoBareUrls)
379    }
380
    /// Returns the category this rule belongs to: `RuleCategory::Link`.
    #[inline]
    fn category(&self) -> RuleCategory {
        RuleCategory::Link
    }
385
    /// Returns the short human-readable description of the rule.
    #[inline]
    fn description(&self) -> &'static str {
        "No bare URLs - wrap URLs in angle brackets"
    }
390
391    fn check(&self, ctx: &LintContext) -> LintResult {
392        let mut warnings = Vec::new();
393        let content = ctx.content;
394
395        // Quick skip for content without URLs
396        if self.should_skip(content) {
397            return Ok(warnings);
398        }
399
400        // Get code spans for exclusion
401        let code_spans = ctx.code_spans();
402
403        // Check line by line
404        for (line_num, line) in content.lines().enumerate() {
405            // Skip lines inside code blocks
406            if ctx.is_in_code_block(line_num + 1) {
407                continue;
408            }
409
410            let mut line_warnings = self.check_line(line, content, line_num + 1, &code_spans);
411
412            // Filter out warnings that are inside code spans
413            line_warnings.retain(|warning| {
414                // Check if the URL is inside a code span
415                !code_spans.iter().any(|span| {
416                    span.line == warning.line &&
417                    warning.column > 0 && // column is 1-indexed
418                    (warning.column - 1) >= span.start_col &&
419                    (warning.column - 1) < span.end_col
420                })
421            });
422
423            warnings.extend(line_warnings);
424        }
425
426        Ok(warnings)
427    }
428
429    fn fix(&self, ctx: &LintContext) -> Result<String, LintError> {
430        let mut content = ctx.content.to_string();
431        let mut warnings = self.check(ctx)?;
432
433        // Sort warnings by position to ensure consistent fix application
434        warnings.sort_by_key(|w| w.fix.as_ref().map(|f| f.range.start).unwrap_or(0));
435
436        // Apply fixes in reverse order to maintain positions
437        for warning in warnings.iter().rev() {
438            if let Some(fix) = &warning.fix {
439                let start = fix.range.start;
440                let end = fix.range.end;
441                content.replace_range(start..end, &fix.replacement);
442            }
443        }
444
445        Ok(content)
446    }
447}