rumdl_lib/rules/
md034_no_bare_urls.rs

1/// Rule MD034: No unformatted URLs
2///
3/// See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
5use crate::utils::range_utils::{LineIndex, calculate_url_range};
6use crate::utils::regex_cache::{EMAIL_PATTERN, get_cached_regex};
7
8use crate::filtered_lines::FilteredLinesExt;
9use crate::lint_context::LintContext;
10
// URL detection patterns
//
// Every constant below is regex source text. The rule never compiles these
// directly; each use site looks the pattern up through `get_cached_regex`.

/// Cheap per-line pre-filter: an `http(s)://` or `ftp(s)://` scheme prefix, or any `@`.
const URL_QUICK_CHECK_STR: &str = r#"(?:https?|ftps?)://|@"#;
/// Non-HTTP/FTP schemes followed by `://`. The pattern is unanchored, so it
/// matches when one of these schemes appears anywhere in the tested text.
const CUSTOM_PROTOCOL_PATTERN_STR: &str = r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#;
/// Inline link `[text](destination "optional title")`. The text may contain one
/// level of nested `[...]`; capture group 1 is the destination.
const MARKDOWN_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
/// Inline link with an empty destination: `[text]()`.
const MARKDOWN_EMPTY_LINK_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(\)"#;
/// Collapsed reference link with an empty label: `[text][]`.
const MARKDOWN_EMPTY_REF_PATTERN_STR: &str = r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\[\]"#;
/// Angle-bracket autolink `<...>` wrapping either an `http(s)`/`ftp(s)` URL
/// (including a bracketed IPv6 host) or an email-shaped `local@domain.tld`.
const ANGLE_LINK_PATTERN_STR: &str =
    r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#;
/// A whole line consisting only of a linked image, `[![alt](image)](target)`,
/// with optional surrounding whitespace.
const BADGE_LINK_LINE_STR: &str = r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#;
/// Image `![alt](source "optional title")`; tolerates whitespace after `!` and
/// between `]` and `(`. Group 1 is the alt text, group 2 the source.
const MARKDOWN_IMAGE_PATTERN_STR: &str = r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#;
/// Bare `http(s)`/`ftp(s)` URL candidate: a bracketed host with optional port,
/// or a run of characters excluding whitespace, `<>`, `[]`, `()`, backslash,
/// quotes and backtick; followed by optional path, query and fragment parts.
const SIMPLE_URL_REGEX_STR: &str = r#"(https?|ftps?)://(?:\[[0-9a-fA-F:%.]+\](?::\d+)?|[^\s<>\[\]()\\'\"`\]]+)(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;
/// Bare URL whose host is a bracketed literal (`[...]`), with optional port,
/// path, query and fragment. The bracket contents also admit letters and `-`.
const IPV6_URL_REGEX_STR: &str = r#"(https?|ftps?)://\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#;
/// A line that is entirely a reference definition with an `http(s)`/`ftp(s)`
/// destination and nothing after it: `[label]: https://...`.
const REFERENCE_DEF_RE_STR: &str = r"^\s*\[[^\]]+\]:\s*(?:https?|ftps?)://\S+$";
/// HTML comment `<!-- ... -->`; non-greedy and allowed to span newlines.
const HTML_COMMENT_PATTERN_STR: &str = r#"<!--[\s\S]*?-->"#;
/// Any `<...>` span containing no `>` (this also matches angle-bracket autolinks).
const HTML_TAG_PATTERN_STR: &str = r#"<[^>]*>"#;
/// A line whose first `](...)` is not preceded by any `[` on that line, i.e. the
/// closing half of a link whose opening bracket is not on this line.
const MULTILINE_LINK_CONTINUATION_STR: &str = r#"^[^\[]*\]\(.*\)"#;
27
/// A `(start, end)` pair of offsets into a single line, as reported by regex matches.
type ByteSpan = (usize, usize);

/// Scratch vectors shared across `check_line` calls so each line reuses the
/// same allocations instead of building fresh vectors.
#[derive(Default)]
struct LineCheckBuffers {
    /// Spans of link-like constructs found on the current line.
    markdown_link_ranges: Vec<ByteSpan>,
    /// Spans of image constructs found on the current line.
    image_ranges: Vec<ByteSpan>,
    /// URL candidates on the current line as `(start, end, matched text)`.
    urls_found: Vec<(usize, usize, String)>,
}
35
/// Rule MD034: flags URLs and email addresses that appear in text without
/// angle brackets or link formatting.
///
/// This is a unit struct: it holds no state, and `from_config` builds it
/// without reading anything from the supplied configuration.
#[derive(Default, Clone)]
pub struct MD034NoBareUrls;
38
39impl MD034NoBareUrls {
40    #[inline]
41    pub fn should_skip_content(&self, content: &str) -> bool {
42        // Skip if content has no URLs and no email addresses
43        // Fast byte scanning for common URL/email indicators
44        let bytes = content.as_bytes();
45        !bytes.contains(&b':') && !bytes.contains(&b'@')
46    }
47
48    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
49    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
50        let mut trimmed = url;
51
52        // Check for balanced parentheses - if we have unmatched closing parens, they're likely punctuation
53        let open_parens = url.chars().filter(|&c| c == '(').count();
54        let close_parens = url.chars().filter(|&c| c == ')').count();
55
56        if close_parens > open_parens {
57            // Find the last balanced closing paren position
58            let mut balance = 0;
59            let mut last_balanced_pos = url.len();
60
61            for (i, c) in url.chars().enumerate() {
62                if c == '(' {
63                    balance += 1;
64                } else if c == ')' {
65                    balance -= 1;
66                    if balance < 0 {
67                        // Found an unmatched closing paren
68                        last_balanced_pos = i;
69                        break;
70                    }
71                }
72            }
73
74            trimmed = &trimmed[..last_balanced_pos];
75        }
76
77        // Trim specific punctuation only if not followed by more URL-like chars
78        while let Some(last_char) = trimmed.chars().last() {
79            if matches!(last_char, '.' | ',' | ';' | ':' | '!' | '?') {
80                // Check if this looks like it could be part of the URL
81                // For ':' specifically, keep it if followed by digits (port number)
82                if last_char == ':' && trimmed.len() > 1 {
83                    // Don't trim
84                    break;
85                }
86                trimmed = &trimmed[..trimmed.len() - 1];
87            } else {
88                break;
89            }
90        }
91
92        trimmed
93    }
94
95    /// Check if line is inside a reference definition
96    fn is_reference_definition(&self, line: &str) -> bool {
97        get_cached_regex(REFERENCE_DEF_RE_STR)
98            .map(|re| re.is_match(line))
99            .unwrap_or(false)
100    }
101
102    /// Check if we're inside an HTML comment
103    fn is_in_html_comment(&self, content: &str, pos: usize) -> bool {
104        // Find all HTML comments in the content
105        if let Ok(re) = get_cached_regex(HTML_COMMENT_PATTERN_STR) {
106            for mat in re.find_iter(content) {
107                if pos >= mat.start() && pos < mat.end() {
108                    return true;
109                }
110            }
111        }
112        false
113    }
114
115    /// Check if a position in a line is inside an HTML tag
116    fn is_in_html_tag(&self, line: &str, pos: usize) -> bool {
117        // Find all HTML tags in the line
118        if let Ok(re) = get_cached_regex(HTML_TAG_PATTERN_STR) {
119            for mat in re.find_iter(line) {
120                if pos >= mat.start() && pos < mat.end() {
121                    return true;
122                }
123            }
124        }
125        false
126    }
127
128    fn check_line(
129        &self,
130        line: &str,
131        content: &str,
132        line_number: usize,
133        code_spans: &[crate::lint_context::CodeSpan],
134        buffers: &mut LineCheckBuffers,
135        line_index: &LineIndex,
136    ) -> Vec<LintWarning> {
137        let mut warnings = Vec::new();
138
139        // Skip reference definitions
140        if self.is_reference_definition(line) {
141            return warnings;
142        }
143
144        // Skip lines that are continuations of multiline markdown links
145        // Pattern: text](url) without a leading [
146        if let Ok(re) = get_cached_regex(MULTILINE_LINK_CONTINUATION_STR)
147            && re.is_match(line)
148        {
149            return warnings;
150        }
151
152        // Quick check - does this line potentially have a URL or email?
153        if let Ok(re) = get_cached_regex(URL_QUICK_CHECK_STR)
154            && !re.is_match(line)
155            && !line.contains('@')
156        {
157            return warnings;
158        }
159
160        // Clear and reuse buffers instead of allocating new ones
161        buffers.markdown_link_ranges.clear();
162        if let Ok(re) = get_cached_regex(MARKDOWN_LINK_PATTERN_STR) {
163            for cap in re.captures_iter(line) {
164                if let Some(mat) = cap.get(0) {
165                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
166                }
167            }
168        }
169
170        // Also include empty link patterns like [text]() and [text][]
171        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_LINK_PATTERN_STR) {
172            for mat in re.find_iter(line) {
173                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
174            }
175        }
176
177        if let Ok(re) = get_cached_regex(MARKDOWN_EMPTY_REF_PATTERN_STR) {
178            for mat in re.find_iter(line) {
179                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
180            }
181        }
182
183        if let Ok(re) = get_cached_regex(ANGLE_LINK_PATTERN_STR) {
184            for cap in re.captures_iter(line) {
185                if let Some(mat) = cap.get(0) {
186                    buffers.markdown_link_ranges.push((mat.start(), mat.end()));
187                }
188            }
189        }
190
191        // Find all markdown images for exclusion
192        buffers.image_ranges.clear();
193        if let Ok(re) = get_cached_regex(MARKDOWN_IMAGE_PATTERN_STR) {
194            for cap in re.captures_iter(line) {
195                if let Some(mat) = cap.get(0) {
196                    buffers.image_ranges.push((mat.start(), mat.end()));
197                }
198            }
199        }
200
201        // Check if this line contains only a badge link (common pattern)
202        let is_badge_line = get_cached_regex(BADGE_LINK_LINE_STR)
203            .map(|re| re.is_match(line))
204            .unwrap_or(false);
205
206        if is_badge_line {
207            return warnings;
208        }
209
210        // Find bare URLs
211        buffers.urls_found.clear();
212
213        // First, find IPv6 URLs (they need special handling)
214        if let Ok(re) = get_cached_regex(IPV6_URL_REGEX_STR) {
215            for mat in re.find_iter(line) {
216                let url_str = mat.as_str();
217                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
218            }
219        }
220
221        // Then find regular URLs
222        if let Ok(re) = get_cached_regex(SIMPLE_URL_REGEX_STR) {
223            for mat in re.find_iter(line) {
224                let url_str = mat.as_str();
225
226                // Skip if it's an IPv6 URL (already handled)
227                if url_str.contains("://[") {
228                    continue;
229                }
230
231                // Skip malformed IPv6-like URLs
232                // Check for IPv6-like patterns that are malformed
233                if let Some(host_start) = url_str.find("://") {
234                    let after_protocol = &url_str[host_start + 3..];
235                    // If it looks like IPv6 (has :: or multiple :) but no brackets, skip if followed by ]
236                    if after_protocol.contains("::") || after_protocol.chars().filter(|&c| c == ':').count() > 1 {
237                        // Check if the next character after our match is ]
238                        if let Some(char_after) = line.chars().nth(mat.end())
239                            && char_after == ']'
240                        {
241                            // This is likely a malformed IPv6 URL like "https://::1]:8080"
242                            continue;
243                        }
244                    }
245                }
246
247                buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
248            }
249        }
250
251        // Process found URLs
252        for &(start, end, ref url_str) in buffers.urls_found.iter() {
253            // Skip custom protocols
254            if get_cached_regex(CUSTOM_PROTOCOL_PATTERN_STR)
255                .map(|re| re.is_match(url_str))
256                .unwrap_or(false)
257            {
258                continue;
259            }
260
261            // Check if this URL is inside a markdown link, angle bracket, or image
262            let mut is_inside_construct = false;
263            for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
264                if start >= link_start && end <= link_end {
265                    is_inside_construct = true;
266                    break;
267                }
268            }
269
270            for &(img_start, img_end) in buffers.image_ranges.iter() {
271                if start >= img_start && end <= img_end {
272                    is_inside_construct = true;
273                    break;
274                }
275            }
276
277            if is_inside_construct {
278                continue;
279            }
280
281            // Check if URL is inside an HTML tag
282            if self.is_in_html_tag(line, start) {
283                continue;
284            }
285
286            // Check if we're inside an HTML comment
287            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
288            let absolute_pos = line_start_byte + start;
289            if self.is_in_html_comment(content, absolute_pos) {
290                continue;
291            }
292
293            // Clean up the URL by removing trailing punctuation
294            let trimmed_url = self.trim_trailing_punctuation(url_str);
295
296            // Only report if we have a valid URL after trimming
297            if !trimmed_url.is_empty() && trimmed_url != "//" {
298                let trimmed_len = trimmed_url.len();
299                let (start_line, start_col, end_line, end_col) =
300                    calculate_url_range(line_number, line, start, trimmed_len);
301
302                warnings.push(LintWarning {
303                    rule_name: Some("MD034".to_string()),
304                    line: start_line,
305                    column: start_col,
306                    end_line,
307                    end_column: end_col,
308                    message: format!("URL without angle brackets or link formatting: '{trimmed_url}'"),
309                    severity: Severity::Warning,
310                    fix: Some(Fix {
311                        range: {
312                            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
313                            (line_start_byte + start)..(line_start_byte + start + trimmed_len)
314                        },
315                        replacement: format!("<{trimmed_url}>"),
316                    }),
317                });
318            }
319        }
320
321        // Check for bare email addresses
322        for cap in EMAIL_PATTERN.captures_iter(line) {
323            if let Some(mat) = cap.get(0) {
324                let email = mat.as_str();
325                let start = mat.start();
326                let end = mat.end();
327
328                // Check if email is inside angle brackets or markdown link
329                let mut is_inside_construct = false;
330                for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
331                    if start >= link_start && end <= link_end {
332                        is_inside_construct = true;
333                        break;
334                    }
335                }
336
337                if !is_inside_construct {
338                    // Check if email is inside an HTML tag
339                    if self.is_in_html_tag(line, start) {
340                        continue;
341                    }
342
343                    // Check if email is inside a code span
344                    let is_in_code_span = code_spans
345                        .iter()
346                        .any(|span| span.line == line_number && start >= span.start_col && start < span.end_col);
347
348                    if !is_in_code_span {
349                        let email_len = end - start;
350                        let (start_line, start_col, end_line, end_col) =
351                            calculate_url_range(line_number, line, start, email_len);
352
353                        warnings.push(LintWarning {
354                            rule_name: Some("MD034".to_string()),
355                            line: start_line,
356                            column: start_col,
357                            end_line,
358                            end_column: end_col,
359                            message: format!("Email address without angle brackets or link formatting: '{email}'"),
360                            severity: Severity::Warning,
361                            fix: Some(Fix {
362                                range: {
363                                    let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
364                                    (line_start_byte + start)..(line_start_byte + end)
365                                },
366                                replacement: format!("<{email}>"),
367                            }),
368                        });
369                    }
370                }
371            }
372        }
373
374        warnings
375    }
376}
377
378impl Rule for MD034NoBareUrls {
379    #[inline]
380    fn name(&self) -> &'static str {
381        "MD034"
382    }
383
384    fn as_any(&self) -> &dyn std::any::Any {
385        self
386    }
387
388    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
389    where
390        Self: Sized,
391    {
392        Box::new(MD034NoBareUrls)
393    }
394
395    #[inline]
396    fn category(&self) -> RuleCategory {
397        RuleCategory::Link
398    }
399
400    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
401        !ctx.likely_has_links_or_images() && self.should_skip_content(ctx.content)
402    }
403
404    #[inline]
405    fn description(&self) -> &'static str {
406        "No bare URLs - wrap URLs in angle brackets"
407    }
408
409    fn check(&self, ctx: &LintContext) -> LintResult {
410        let mut warnings = Vec::new();
411        let content = ctx.content;
412
413        // Quick skip for content without URLs
414        if self.should_skip_content(content) {
415            return Ok(warnings);
416        }
417
418        // Create LineIndex for correct byte position calculations across all line ending types
419        let line_index = LineIndex::new(content.to_string());
420
421        // Get code spans for exclusion
422        let code_spans = ctx.code_spans();
423
424        // Allocate reusable buffers once instead of per-line to reduce allocations
425        let mut buffers = LineCheckBuffers::default();
426
427        // Iterate over content lines, automatically skipping front matter and code blocks
428        // This uses the filtered iterator API which centralizes the skip logic
429        for line in ctx.filtered_lines().skip_front_matter().skip_code_blocks() {
430            let mut line_warnings = self.check_line(
431                line.content,
432                content,
433                line.line_num,
434                &code_spans,
435                &mut buffers,
436                &line_index,
437            );
438
439            // Filter out warnings that are inside code spans
440            line_warnings.retain(|warning| {
441                // Check if the URL is inside a code span
442                !code_spans.iter().any(|span| {
443                    span.line == warning.line &&
444                    warning.column > 0 && // column is 1-indexed
445                    (warning.column - 1) >= span.start_col &&
446                    (warning.column - 1) < span.end_col
447                })
448            });
449
450            warnings.extend(line_warnings);
451        }
452
453        Ok(warnings)
454    }
455
456    fn fix(&self, ctx: &LintContext) -> Result<String, LintError> {
457        let mut content = ctx.content.to_string();
458        let mut warnings = self.check(ctx)?;
459
460        // Sort warnings by position to ensure consistent fix application
461        warnings.sort_by_key(|w| w.fix.as_ref().map(|f| f.range.start).unwrap_or(0));
462
463        // Apply fixes in reverse order to maintain positions
464        for warning in warnings.iter().rev() {
465            if let Some(fix) = &warning.fix {
466                let start = fix.range.start;
467                let end = fix.range.end;
468                content.replace_range(start..end, &fix.replacement);
469            }
470        }
471
472        Ok(content)
473    }
474}