rumdl_lib/rules/
md034_no_bare_urls.rs

//! Rule MD034: No unformatted URLs
//!
//! See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
5use crate::utils::range_utils::{LineIndex, calculate_url_range};
6use crate::utils::regex_cache::{EMAIL_PATTERN, get_cached_regex};
7
8use crate::lint_context::LintContext;
9
// URL detection patterns
//
// Each constant below is a regex source string that is looked up through
// `get_cached_regex` at its point of use.

/// Cheap pre-filter: an `http(s)://` / `ftp(s)://` scheme prefix, or an `@` sign.
const URL_QUICK_CHECK_STR: &str = r#"(?:https?|ftps?)://|@"#;
/// Scheme prefixes (`grpc://`, `ws://`, `file://`, ...) whose presence in a candidate URL
/// causes that candidate to be skipped.
const CUSTOM_PROTOCOL_PATTERN_STR: &str = r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#;
/// Inline link `[text](dest "title")`; the text may contain one level of nested brackets.
/// Capture group 1 is the destination.
const MARKDOWN_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
/// Inline link with an empty destination: `[text]()`.
const MARKDOWN_EMPTY_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(\)"#;
/// Collapsed reference link: `[text][]`.
const MARKDOWN_EMPTY_REF_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\[\]"#;
/// Angle-bracket autolink `<...>` wrapping either an `http(s)`/`ftp(s)` URL (including a
/// bracketed IPv6 host) or an email-like `local@domain.tld`.
const ANGLE_LINK_PATTERN_STR: &str =
    r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#;
/// A line consisting solely of an image wrapped in a link: `[![alt](image)](target)`.
const BADGE_LINK_LINE_STR: &str = r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#;
/// Inline image `![alt](src "title")`; capture group 1 is the alt text, group 2 the source.
const MARKDOWN_IMAGE_PATTERN_STR: &str = r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
/// Bare `http(s)`/`ftp(s)` URL: a host (or bracketed IPv6 literal with optional port),
/// then optional path, query and fragment. Matching stops at whitespace, angle or square
/// brackets, parentheses, backslashes, quotes and backticks.
const SIMPLE_URL_REGEX_STR: &str = r#"(https?|ftps?)://(?:\[[0-9a-fA-F:%.]+\](?::\d+)?|[^\s<>\[\]()\\'\"`\]]+)(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;
/// URL whose host is a bracketed literal (`http://[::1]:8080/path`), with optional port,
/// path, query and fragment.
const IPV6_URL_REGEX_STR: &str = r#"(https?|ftps?)://\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;
/// A whole line of the form `[label]: <http(s)/ftp(s) URL>` (a link reference definition).
const REFERENCE_DEF_RE_STR: &str = r"^\s*\[[^\]]+\]:\s*(?:https?|ftps?)://\S+$";
/// HTML comment `<!-- ... -->`, matched non-greedily and allowed to span newlines.
const HTML_COMMENT_PATTERN_STR: &str = r#"<!--[\s\S]*?-->"#;
/// Anything between `<` and the next `>`.
const HTML_TAG_PATTERN_STR: &str = r#"<[^>]*>"#;
/// A line on which `](...)` occurs before any `[`, i.e. the tail of a link whose opening
/// bracket is on an earlier line.
const MULTILINE_LINK_CONTINUATION_STR: &str = r#"^[^\[]*\]\(.*\)"#;
26
/// Reusable buffers for check_line to reduce allocations
///
/// One instance is created per `check` call and handed to every `check_line` invocation,
/// which clears each vector before refilling it.
#[derive(Default)]
struct LineCheckBuffers {
    /// `(start, end)` byte ranges, relative to the current line, of markdown links, empty
    /// links/references and angle-bracket autolinks.
    markdown_link_ranges: Vec<(usize, usize)>,
    /// `(start, end)` byte ranges, relative to the current line, of markdown images.
    image_ranges: Vec<(usize, usize)>,
    /// `(start, end, matched text)` for every URL candidate found on the current line.
    urls_found: Vec<(usize, usize, String)>,
}
34
/// Lint rule MD034: reports URLs and email addresses that appear without angle brackets
/// or link formatting.
///
/// The rule is a stateless unit struct; it carries no configuration.
#[derive(Default, Clone)]
pub struct MD034NoBareUrls;
37
38impl MD034NoBareUrls {
39    #[inline]
40    pub fn should_skip_content(&self, content: &str) -> bool {
41        // Skip if content has no URLs and no email addresses
42        // Fast byte scanning for common URL/email indicators
43        let bytes = content.as_bytes();
44        !bytes.contains(&b':') && !bytes.contains(&b'@')
45    }
46
47    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
48    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
49        let mut trimmed = url;
50
51        // Check for balanced parentheses - if we have unmatched closing parens, they're likely punctuation
52        let open_parens = url.chars().filter(|&c| c == '(').count();
53        let close_parens = url.chars().filter(|&c| c == ')').count();
54
55        if close_parens > open_parens {
56            // Find the last balanced closing paren position
57            let mut balance = 0;
58            let mut last_balanced_pos = url.len();
59
60            for (i, c) in url.chars().enumerate() {
61                if c == '(' {
62                    balance += 1;
63                } else if c == ')' {
64                    balance -= 1;
65                    if balance < 0 {
66                        // Found an unmatched closing paren
67                        last_balanced_pos = i;
68                        break;
69                    }
70                }
71            }
72
73            trimmed = &trimmed[..last_balanced_pos];
74        }
75
76        // Trim specific punctuation only if not followed by more URL-like chars
77        while let Some(last_char) = trimmed.chars().last() {
78            if matches!(last_char, '.' | ',' | ';' | ':' | '!' | '?') {
79                // Check if this looks like it could be part of the URL
80                // For ':' specifically, keep it if followed by digits (port number)
81                if last_char == ':' && trimmed.len() > 1 {
82                    // Don't trim
83                    break;
84                }
85                trimmed = &trimmed[..trimmed.len() - 1];
86            } else {
87                break;
88            }
89        }
90
91        trimmed
92    }
93
94    /// Check if line is inside a reference definition
95    fn is_reference_definition(&self, line: &str) -> bool {
96        get_cached_regex(REFERENCE_DEF_RE_STR)
97            .map(|re| re.is_match(line))
98            .unwrap_or(false)
99    }
100
101    /// Check if we're inside an HTML comment
102    fn is_in_html_comment(&self, content: &str, pos: usize) -> bool {
103        // Find all HTML comments in the content
104        if let Ok(re) = get_cached_regex(HTML_COMMENT_PATTERN_STR) {
105            for mat in re.find_iter(content) {
106                if pos >= mat.start() && pos < mat.end() {
107                    return true;
108                }
109            }
110        }
111        false
112    }
113
114    /// Check if a position in a line is inside an HTML tag
115    fn is_in_html_tag(&self, line: &str, pos: usize) -> bool {
116        // Find all HTML tags in the line
117        if let Ok(re) = get_cached_regex(HTML_TAG_PATTERN_STR) {
118            for mat in re.find_iter(line) {
119                if pos >= mat.start() && pos < mat.end() {
120                    return true;
121                }
122            }
123        }
124        false
125    }
126
127    fn check_line(
128        &self,
129        line: &str,
130        content: &str,
131        line_number: usize,
132        code_spans: &[crate::lint_context::CodeSpan],
133        buffers: &mut LineCheckBuffers,
134        line_index: &LineIndex,
135    ) -> Vec<LintWarning> {
136        let mut warnings = Vec::new();
137
138        // Skip reference definitions
139        if self.is_reference_definition(line) {
140            return warnings;
141        }
142
143        // Skip lines that are continuations of multiline markdown links
144        // Pattern: text](url) without a leading [
145        if let Ok(re) = get_cached_regex(MULTILINE_LINK_CONTINUATION_STR)
146            && re.is_match(line)
147        {
148            return warnings;
149        }
150
151        // Quick check - does this line potentially have a URL or email?
152        if let Ok(re) = get_cached_regex(URL_QUICK_CHECK_STR)
153            && !re.is_match(line)
154            && !line.contains('@')
155        {
156            return warnings;
157        }
158
159        // Clear and reuse buffers instead of allocating new ones
160        buffers.markdown_link_ranges.clear();
161        if let Ok(re) = get_cached_regex(MARKDOWN_LINK_PATTERN_STR) {
162            for cap in re.captures_iter(line) {
163                if let Some(mat) = cap.get(0) {
164                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
165                }
166            }
167        }
168
169        // Also include empty link patterns like [text]() and [text][]
170        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_LINK_PATTERN_STR) {
171            for mat in re.find_iter(line) {
172                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
173            }
174        }
175
176        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_REF_PATTERN_STR) {
177            for mat in re.find_iter(line) {
178                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
179            }
180        }
181
182        if let Ok(re) = get_cached_regex(ANGLE_LINK_PATTERN_STR) {
183            for cap in re.captures_iter(line) {
184                if let Some(mat) = cap.get(0) {
185                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
186                }
187            }
188        }
189
190        // Find all markdown images for exclusion
191        buffers.image_ranges.clear();
192        if let Ok(re) = get_cached_regex(MARKDOWN_IMAGE_PATTERN_STR) {
193            for cap in re.captures_iter(line) {
194                if let Some(mat) = cap.get(0) {
195                    buffers.image_ranges.push((mat.start(), mat.end()));
196                }
197            }
198        }
199
200        // Check if this line contains only a badge link (common pattern)
201        let is_badge_line = get_cached_regex(BADGE_LINK_LINE_STR)
202            .map(|re| re.is_match(line))
203            .unwrap_or(false);
204
205        if is_badge_line {
206            return warnings;
207        }
208
209        // Find bare URLs
210        buffers.urls_found.clear();
211
212        // First, find IPv6 URLs (they need special handling)
213        if let Ok(re) = get_cached_regex(IPV6_URL_REGEX_STR) {
214            for mat in re.find_iter(line) {
215                let url_str = mat.as_str();
216                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
217            }
218        }
219
220        // Then find regular URLs
221        if let Ok(re) = get_cached_regex(SIMPLE_URL_REGEX_STR) {
222            for mat in re.find_iter(line) {
223                let url_str = mat.as_str();
224
225                // Skip if it's an IPv6 URL (already handled)
226                if url_str.contains("://[") {
227                    continue;
228                }
229
230                // Skip malformed IPv6-like URLs
231                // Check for IPv6-like patterns that are malformed
232                if let Some(host_start) = url_str.find("://") {
233                    let after_protocol = &url_str[host_start + 3..];
234                    // If it looks like IPv6 (has :: or multiple :) but no brackets, skip if followed by ]
235                    if after_protocol.contains("::") || after_protocol.chars().filter(|&c| c == ':').count() > 1 {
236                        // Check if the next character after our match is ]
237                        if let Some(char_after) = line.chars().nth(mat.end())
238                            && char_after == ']'
239                        {
240                            // This is likely a malformed IPv6 URL like "https://::1]:8080"
241                            continue;
242                        }
243                    }
244                }
245
246                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
247            }
248        }
249
250        // Process found URLs
251        for &(start, end, ref url_str) in buffers.urls_found.iter() {
252            // Skip custom protocols
253            if get_cached_regex(CUSTOM_PROTOCOL_PATTERN_STR)
254                .map(|re| re.is_match(url_str))
255                .unwrap_or(false)
256            {
257                continue;
258            }
259
260            // Check if this URL is inside a markdown link, angle bracket, or image
261            let mut is_inside_construct = false;
262            for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
263                if start >= link_start && end <= link_end {
264                    is_inside_construct = true;
265                    break;
266                }
267            }
268
269            for &(img_start, img_end) in buffers.image_ranges.iter() {
270                if start >= img_start && end <= img_end {
271                    is_inside_construct = true;
272                    break;
273                }
274            }
275
276            if is_inside_construct {
277                continue;
278            }
279
280            // Check if URL is inside an HTML tag
281            if self.is_in_html_tag(line, start) {
282                continue;
283            }
284
285            // Check if we're inside an HTML comment
286            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
287            let absolute_pos = line_start_byte + start;
288            if self.is_in_html_comment(content, absolute_pos) {
289                continue;
290            }
291
292            // Clean up the URL by removing trailing punctuation
293            let trimmed_url = self.trim_trailing_punctuation(url_str);
294
295            // Only report if we have a valid URL after trimming
296            if !trimmed_url.is_empty() && trimmed_url != "//" {
297                let trimmed_len = trimmed_url.len();
298                let (start_line, start_col, end_line, end_col) =
299                    calculate_url_range(line_number, line, start, trimmed_len);
300
301                warnings.push(LintWarning {
302                    rule_name: Some("MD034"),
303                    line: start_line,
304                    column: start_col,
305                    end_line,
306                    end_column: end_col,
307                    message: format!("URL without angle brackets or link formatting: '{trimmed_url}'"),
308                    severity: Severity::Warning,
309                    fix: Some(Fix {
310                        range: {
311                            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
312                            (line_start_byte + start)..(line_start_byte + start + trimmed_len)
313                        },
314                        replacement: format!("<{trimmed_url}>"),
315                    }),
316                });
317            }
318        }
319
320        // Check for bare email addresses
321        for cap in EMAIL_PATTERN.captures_iter(line) {
322            if let Some(mat) = cap.get(0) {
323                let email = mat.as_str();
324                let start = mat.start();
325                let end = mat.end();
326
327                // Check if email is inside angle brackets or markdown link
328                let mut is_inside_construct = false;
329                for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
330                    if start >= link_start && end <= link_end {
331                        is_inside_construct = true;
332                        break;
333                    }
334                }
335
336                if !is_inside_construct {
337                    // Check if email is inside an HTML tag
338                    if self.is_in_html_tag(line, start) {
339                        continue;
340                    }
341
342                    // Check if email is inside a code span
343                    let is_in_code_span = code_spans
344                        .iter()
345                        .any(|span| span.line == line_number && start >= span.start_col && start < span.end_col);
346
347                    if !is_in_code_span {
348                        let email_len = end - start;
349                        let (start_line, start_col, end_line, end_col) =
350                            calculate_url_range(line_number, line, start, email_len);
351
352                        warnings.push(LintWarning {
353                            rule_name: Some("MD034"),
354                            line: start_line,
355                            column: start_col,
356                            end_line,
357                            end_column: end_col,
358                            message: format!("Email address without angle brackets or link formatting: '{email}'"),
359                            severity: Severity::Warning,
360                            fix: Some(Fix {
361                                range: {
362                                    let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
363                                    (line_start_byte + start)..(line_start_byte + end)
364                                },
365                                replacement: format!("<{email}>"),
366                            }),
367                        });
368                    }
369                }
370            }
371        }
372
373        warnings
374    }
375}
376
377impl Rule for MD034NoBareUrls {
    /// Returns the rule identifier, `"MD034"`.
    #[inline]
    fn name(&self) -> &'static str {
        "MD034"
    }
382
    /// Exposes the rule as `&dyn Any` (presumably so callers can downcast to the concrete
    /// rule type; confirm against the `Rule` trait).
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
386
387    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
388    where
389        Self: Sized,
390    {
391        Box::new(MD034NoBareUrls)
392    }
393
    /// Returns the category this rule is filed under: `RuleCategory::Link`.
    #[inline]
    fn category(&self) -> RuleCategory {
        RuleCategory::Link
    }
398
399    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
400        !ctx.likely_has_links_or_images() && self.should_skip_content(ctx.content)
401    }
402
    /// Returns the one-line human-readable summary of the rule.
    #[inline]
    fn description(&self) -> &'static str {
        "No bare URLs - wrap URLs in angle brackets"
    }
407
408    fn check(&self, ctx: &LintContext) -> LintResult {
409        let mut warnings = Vec::new();
410        let content = ctx.content;
411
412        // Quick skip for content without URLs
413        if self.should_skip_content(content) {
414            return Ok(warnings);
415        }
416
417        // Create LineIndex for correct byte position calculations across all line ending types
418        let line_index = LineIndex::new(content.to_string());
419
420        // Get code spans for exclusion
421        let code_spans = ctx.code_spans();
422
423        // Allocate reusable buffers once instead of per-line to reduce allocations
424        let mut buffers = LineCheckBuffers::default();
425
426        // Check line by line
427        for (line_num, line) in content.lines().enumerate() {
428            // Skip lines inside code blocks
429            if ctx.is_in_code_block(line_num + 1) {
430                continue;
431            }
432
433            let mut line_warnings =
434                self.check_line(line, content, line_num + 1, &code_spans, &mut buffers, &line_index);
435
436            // Filter out warnings that are inside code spans
437            line_warnings.retain(|warning| {
438                // Check if the URL is inside a code span
439                !code_spans.iter().any(|span| {
440                    span.line == warning.line &&
441                    warning.column > 0 && // column is 1-indexed
442                    (warning.column - 1) >= span.start_col &&
443                    (warning.column - 1) < span.end_col
444                })
445            });
446
447            warnings.extend(line_warnings);
448        }
449
450        Ok(warnings)
451    }
452
453    fn fix(&self, ctx: &LintContext) -> Result<String, LintError> {
454        let mut content = ctx.content.to_string();
455        let mut warnings = self.check(ctx)?;
456
457        // Sort warnings by position to ensure consistent fix application
458        warnings.sort_by_key(|w| w.fix.as_ref().map(|f| f.range.start).unwrap_or(0));
459
460        // Apply fixes in reverse order to maintain positions
461        for warning in warnings.iter().rev() {
462            if let Some(fix) = &warning.fix {
463                let start = fix.range.start;
464                let end = fix.range.end;
465                content.replace_range(start..end, &fix.replacement);
466            }
467        }
468
469        Ok(content)
470    }
471}