Skip to main content

rumdl_lib/rules/
md034_no_bare_urls.rs

1/// Rule MD034: No unformatted URLs
2///
3/// See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use std::sync::LazyLock;
5
6use fancy_regex::Regex as FancyRegex;
7use regex::Regex;
8
9use crate::rule::{Fix, LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
10use crate::utils::range_utils::{LineIndex, calculate_url_range};
11use crate::utils::regex_cache::{
12    EMAIL_PATTERN, URL_IPV6_REGEX, URL_QUICK_CHECK_REGEX, URL_STANDARD_REGEX, URL_WWW_REGEX, XMPP_URI_REGEX,
13};
14
15use crate::filtered_lines::FilteredLinesExt;
16use crate::lint_context::LintContext;
17
18// MD034-specific pre-compiled regex patterns for markdown constructs
19static CUSTOM_PROTOCOL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
20    Regex::new(r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#).unwrap()
21});
22static MARKDOWN_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
23    Regex::new(r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap()
24});
25static MARKDOWN_EMPTY_LINK_REGEX: LazyLock<Regex> =
26    LazyLock::new(|| Regex::new(r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(\)"#).unwrap());
27static MARKDOWN_EMPTY_REF_REGEX: LazyLock<Regex> =
28    LazyLock::new(|| Regex::new(r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\[\]"#).unwrap());
29static ANGLE_LINK_REGEX: LazyLock<Regex> = LazyLock::new(|| {
30    Regex::new(
31        r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|xmpp:[^>]+|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#,
32    )
33    .unwrap()
34});
35static BADGE_LINK_LINE_REGEX: LazyLock<Regex> =
36    LazyLock::new(|| Regex::new(r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#).unwrap());
37static MARKDOWN_IMAGE_REGEX: LazyLock<Regex> =
38    LazyLock::new(|| Regex::new(r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap());
39static REFERENCE_DEF_REGEX: LazyLock<Regex> =
40    LazyLock::new(|| Regex::new(r"^\s*\[[^\]]+\]:\s*(?:<|(?:https?|ftps?)://)").unwrap());
41static MULTILINE_LINK_CONTINUATION_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r#"^[^\[]*\]\(.*\)"#).unwrap());
42// Uses FancyRegex for negative lookahead support
43static SHORTCUT_REF_FANCY_REGEX: LazyLock<FancyRegex> =
44    LazyLock::new(|| FancyRegex::new(r#"\[([^\[\]]+)\](?!\s*[\[(])"#).unwrap());
45
/// Scratch vectors handed to `check_line` so that every line reuses the same
/// allocations; each vector is cleared before it is refilled.
#[derive(Default)]
struct LineCheckBuffers {
    /// Byte ranges `(start, end)` within the current line covered by link-like constructs
    /// (inline links, empty links, shortcut references, angle-bracket autolinks).
    markdown_link_ranges: Vec<(usize, usize)>,
    /// Byte ranges `(start, end)` within the current line covered by markdown images.
    image_ranges: Vec<(usize, usize)>,
    /// Candidate URLs on the current line as `(start, end, matched text)`.
    urls_found: Vec<(usize, usize, String)>,
}
53
/// Rule MD034: reports URLs, XMPP URIs and email addresses that are not wrapped in
/// angle brackets or link syntax. The rule carries no configuration or state.
#[derive(Clone, Default)]
pub struct MD034NoBareUrls;
56
57impl MD034NoBareUrls {
58    #[inline]
59    pub fn should_skip_content(&self, content: &str) -> bool {
60        // Skip if content has no URLs, XMPP URIs, or email addresses
61        // Fast byte scanning for common URL/email/xmpp indicators
62        let bytes = content.as_bytes();
63        let has_colon = bytes.contains(&b':');
64        let has_at = bytes.contains(&b'@');
65        let has_www = content.contains("www.");
66        !has_colon && !has_at && !has_www
67    }
68
69    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
70    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
71        let mut trimmed = url;
72
73        // Check for balanced parentheses - if we have unmatched closing parens, they're likely punctuation
74        let open_parens = url.chars().filter(|&c| c == '(').count();
75        let close_parens = url.chars().filter(|&c| c == ')').count();
76
77        if close_parens > open_parens {
78            // Find the last balanced closing paren position
79            let mut balance = 0;
80            let mut last_balanced_pos = url.len();
81
82            for (byte_idx, c) in url.char_indices() {
83                if c == '(' {
84                    balance += 1;
85                } else if c == ')' {
86                    balance -= 1;
87                    if balance < 0 {
88                        // Found an unmatched closing paren
89                        last_balanced_pos = byte_idx;
90                        break;
91                    }
92                }
93            }
94
95            trimmed = &trimmed[..last_balanced_pos];
96        }
97
98        // Trim specific punctuation only if not followed by more URL-like chars
99        while let Some(last_char) = trimmed.chars().last() {
100            if matches!(last_char, '.' | ',' | ';' | ':' | '!' | '?') {
101                // Check if this looks like it could be part of the URL
102                // For ':' specifically, keep it if followed by digits (port number)
103                if last_char == ':' && trimmed.len() > 1 {
104                    // Don't trim
105                    break;
106                }
107                trimmed = &trimmed[..trimmed.len() - 1];
108            } else {
109                break;
110            }
111        }
112
113        trimmed
114    }
115
116    /// Check if line is inside a reference definition
117    fn is_reference_definition(&self, line: &str) -> bool {
118        REFERENCE_DEF_REGEX.is_match(line)
119    }
120
121    fn check_line(
122        &self,
123        line: &str,
124        ctx: &LintContext,
125        line_number: usize,
126        code_spans: &[crate::lint_context::CodeSpan],
127        buffers: &mut LineCheckBuffers,
128        line_index: &LineIndex,
129    ) -> Vec<LintWarning> {
130        let mut warnings = Vec::new();
131
132        // Skip reference definitions
133        if self.is_reference_definition(line) {
134            return warnings;
135        }
136
137        // Skip lines inside HTML blocks - URLs in HTML attributes should not be linted
138        if ctx.line_info(line_number).is_some_and(|info| info.in_html_block) {
139            return warnings;
140        }
141
142        // Skip lines that are continuations of multiline markdown links
143        // Pattern: text](url) without a leading [
144        if MULTILINE_LINK_CONTINUATION_REGEX.is_match(line) {
145            return warnings;
146        }
147
148        // Quick check - does this line potentially have a URL or email?
149        let has_quick_check = URL_QUICK_CHECK_REGEX.is_match(line);
150        let has_www = line.contains("www.");
151        let has_at = line.contains('@');
152
153        if !has_quick_check && !has_at && !has_www {
154            return warnings;
155        }
156
157        // Clear and reuse buffers instead of allocating new ones
158        buffers.markdown_link_ranges.clear();
159        for cap in MARKDOWN_LINK_REGEX.captures_iter(line) {
160            if let Some(mat) = cap.get(0) {
161                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
162            }
163        }
164
165        // Also include empty link patterns like [text]() and [text][]
166        for mat in MARKDOWN_EMPTY_LINK_REGEX.find_iter(line) {
167            buffers.markdown_link_ranges.push((mat.start(), mat.end()));
168        }
169
170        for mat in MARKDOWN_EMPTY_REF_REGEX.find_iter(line) {
171            buffers.markdown_link_ranges.push((mat.start(), mat.end()));
172        }
173
174        // Also exclude shortcut reference links like [URL] - even if no definition exists,
175        // the brackets indicate user intent to use markdown formatting
176        // Uses FancyRegex for negative lookahead support
177        for mat in SHORTCUT_REF_FANCY_REGEX.find_iter(line).flatten() {
178            buffers.markdown_link_ranges.push((mat.start(), mat.end()));
179        }
180
181        for cap in ANGLE_LINK_REGEX.captures_iter(line) {
182            if let Some(mat) = cap.get(0) {
183                buffers.markdown_link_ranges.push((mat.start(), mat.end()));
184            }
185        }
186
187        // Find all markdown images for exclusion
188        buffers.image_ranges.clear();
189        for cap in MARKDOWN_IMAGE_REGEX.captures_iter(line) {
190            if let Some(mat) = cap.get(0) {
191                buffers.image_ranges.push((mat.start(), mat.end()));
192            }
193        }
194
195        // Check if this line contains only a badge link (common pattern)
196        if BADGE_LINK_LINE_REGEX.is_match(line) {
197            return warnings;
198        }
199
200        // Find bare URLs
201        buffers.urls_found.clear();
202
203        // First, find IPv6 URLs (they need special handling)
204        for mat in URL_IPV6_REGEX.find_iter(line) {
205            let url_str = mat.as_str();
206            buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
207        }
208
209        // Then find regular URLs
210        for mat in URL_STANDARD_REGEX.find_iter(line) {
211            let url_str = mat.as_str();
212
213            // Skip if it's an IPv6 URL (already handled)
214            if url_str.contains("://[") {
215                continue;
216            }
217
218            // Skip malformed IPv6-like URLs
219            // Check for IPv6-like patterns that are malformed
220            if let Some(host_start) = url_str.find("://") {
221                let after_protocol = &url_str[host_start + 3..];
222                // If it looks like IPv6 (has :: or multiple :) but no brackets, skip if followed by ]
223                if after_protocol.contains("::") || after_protocol.chars().filter(|&c| c == ':').count() > 1 {
224                    // Check if the next byte after our match is ] (ASCII, so byte check is safe)
225                    if line.as_bytes().get(mat.end()) == Some(&b']') {
226                        // This is likely a malformed IPv6 URL like "https://::1]:8080"
227                        continue;
228                    }
229                }
230            }
231
232            buffers.urls_found.push((mat.start(), mat.end(), url_str.to_string()));
233        }
234
235        // Find www URLs without protocol (e.g., www.example.com)
236        for mat in URL_WWW_REGEX.find_iter(line) {
237            let url_str = mat.as_str();
238            let start_pos = mat.start();
239            let end_pos = mat.end();
240
241            // Skip if preceded by / or @ (likely part of a full URL)
242            if start_pos > 0 {
243                let prev_char = line.as_bytes().get(start_pos - 1).copied();
244                if prev_char == Some(b'/') || prev_char == Some(b'@') {
245                    continue;
246                }
247            }
248
249            // Skip if inside angle brackets (autolink syntax like <www.example.com>)
250            if start_pos > 0 && end_pos < line.len() {
251                let prev_char = line.as_bytes().get(start_pos - 1).copied();
252                let next_char = line.as_bytes().get(end_pos).copied();
253                if prev_char == Some(b'<') && next_char == Some(b'>') {
254                    continue;
255                }
256            }
257
258            buffers.urls_found.push((start_pos, end_pos, url_str.to_string()));
259        }
260
261        // Find XMPP URIs (GFM extended autolinks: xmpp:user@domain/resource)
262        for mat in XMPP_URI_REGEX.find_iter(line) {
263            let uri_str = mat.as_str();
264            let start_pos = mat.start();
265            let end_pos = mat.end();
266
267            // Skip if inside angle brackets (already properly formatted: <xmpp:user@domain>)
268            if start_pos > 0 && end_pos < line.len() {
269                let prev_char = line.as_bytes().get(start_pos - 1).copied();
270                let next_char = line.as_bytes().get(end_pos).copied();
271                if prev_char == Some(b'<') && next_char == Some(b'>') {
272                    continue;
273                }
274            }
275
276            buffers.urls_found.push((start_pos, end_pos, uri_str.to_string()));
277        }
278
279        // Process found URLs
280        for &(start, _end, ref url_str) in buffers.urls_found.iter() {
281            // Skip custom protocols
282            if CUSTOM_PROTOCOL_REGEX.is_match(url_str) {
283                continue;
284            }
285
286            // Check if this URL is inside a markdown link, angle bracket, or image
287            // We check if the URL starts within a construct, not if it's entirely contained.
288            // This handles cases where URL detection may include trailing characters
289            // that extend past the construct boundary (e.g., parentheses).
290            let mut is_inside_construct = false;
291            for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
292                if start >= link_start && start < link_end {
293                    is_inside_construct = true;
294                    break;
295                }
296            }
297
298            for &(img_start, img_end) in buffers.image_ranges.iter() {
299                if start >= img_start && start < img_end {
300                    is_inside_construct = true;
301                    break;
302                }
303            }
304
305            if is_inside_construct {
306                continue;
307            }
308
309            // Calculate absolute byte position for context-aware checks
310            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
311            let absolute_pos = line_start_byte + start;
312
313            // Check if URL is inside an HTML tag (handles multiline tags correctly)
314            if ctx.is_in_html_tag(absolute_pos) {
315                continue;
316            }
317
318            // Check if we're inside an HTML comment
319            if ctx.is_in_html_comment(absolute_pos) {
320                continue;
321            }
322
323            // Check if we're inside a Hugo/Quarto shortcode
324            if ctx.is_in_shortcode(absolute_pos) {
325                continue;
326            }
327
328            // Clean up the URL by removing trailing punctuation
329            let trimmed_url = self.trim_trailing_punctuation(url_str);
330
331            // Only report if we have a valid URL after trimming
332            if !trimmed_url.is_empty() && trimmed_url != "//" {
333                let trimmed_len = trimmed_url.len();
334                let (start_line, start_col, end_line, end_col) =
335                    calculate_url_range(line_number, line, start, trimmed_len);
336
337                // For www URLs without protocol, add https:// prefix in the fix
338                let replacement = if trimmed_url.starts_with("www.") {
339                    format!("<https://{trimmed_url}>")
340                } else {
341                    format!("<{trimmed_url}>")
342                };
343
344                warnings.push(LintWarning {
345                    rule_name: Some("MD034".to_string()),
346                    line: start_line,
347                    column: start_col,
348                    end_line,
349                    end_column: end_col,
350                    message: format!("URL without angle brackets or link formatting: '{trimmed_url}'"),
351                    severity: Severity::Warning,
352                    fix: Some(Fix {
353                        range: {
354                            let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
355                            (line_start_byte + start)..(line_start_byte + start + trimmed_len)
356                        },
357                        replacement,
358                    }),
359                });
360            }
361        }
362
363        // Check for bare email addresses
364        for cap in EMAIL_PATTERN.captures_iter(line) {
365            if let Some(mat) = cap.get(0) {
366                let email = mat.as_str();
367                let start = mat.start();
368                let end = mat.end();
369
370                // Skip if email is part of an XMPP URI (xmpp:user@domain)
371                // Check character boundary to avoid panics with multi-byte UTF-8
372                if start >= 5 && line.is_char_boundary(start - 5) && &line[start - 5..start] == "xmpp:" {
373                    continue;
374                }
375
376                // Check if email is inside angle brackets or markdown link
377                let mut is_inside_construct = false;
378                for &(link_start, link_end) in buffers.markdown_link_ranges.iter() {
379                    if start >= link_start && end <= link_end {
380                        is_inside_construct = true;
381                        break;
382                    }
383                }
384
385                if !is_inside_construct {
386                    // Calculate absolute byte position for context-aware checks
387                    let line_start_byte = line_index.get_line_start_byte(line_number).unwrap_or(0);
388                    let absolute_pos = line_start_byte + start;
389
390                    // Check if email is inside an HTML tag (handles multiline tags)
391                    if ctx.is_in_html_tag(absolute_pos) {
392                        continue;
393                    }
394
395                    // Check if email is inside a code span
396                    let is_in_code_span = code_spans
397                        .iter()
398                        .any(|span| span.line == line_number && start >= span.start_col && start < span.end_col);
399
400                    if !is_in_code_span {
401                        let email_len = end - start;
402                        let (start_line, start_col, end_line, end_col) =
403                            calculate_url_range(line_number, line, start, email_len);
404
405                        warnings.push(LintWarning {
406                            rule_name: Some("MD034".to_string()),
407                            line: start_line,
408                            column: start_col,
409                            end_line,
410                            end_column: end_col,
411                            message: format!("Email address without angle brackets or link formatting: '{email}'"),
412                            severity: Severity::Warning,
413                            fix: Some(Fix {
414                                range: (line_start_byte + start)..(line_start_byte + end),
415                                replacement: format!("<{email}>"),
416                            }),
417                        });
418                    }
419                }
420            }
421        }
422
423        warnings
424    }
425}
426
427impl Rule for MD034NoBareUrls {
428    #[inline]
429    fn name(&self) -> &'static str {
430        "MD034"
431    }
432
433    fn as_any(&self) -> &dyn std::any::Any {
434        self
435    }
436
437    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
438    where
439        Self: Sized,
440    {
441        Box::new(MD034NoBareUrls)
442    }
443
444    #[inline]
445    fn category(&self) -> RuleCategory {
446        RuleCategory::Link
447    }
448
449    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
450        !ctx.likely_has_links_or_images() && self.should_skip_content(ctx.content)
451    }
452
453    #[inline]
454    fn description(&self) -> &'static str {
455        "No bare URLs - wrap URLs in angle brackets"
456    }
457
458    fn check(&self, ctx: &LintContext) -> LintResult {
459        let mut warnings = Vec::new();
460        let content = ctx.content;
461
462        // Quick skip for content without URLs
463        if self.should_skip_content(content) {
464            return Ok(warnings);
465        }
466
467        // Create LineIndex for correct byte position calculations across all line ending types
468        let line_index = &ctx.line_index;
469
470        // Get code spans for exclusion
471        let code_spans = ctx.code_spans();
472
473        // Allocate reusable buffers once instead of per-line to reduce allocations
474        let mut buffers = LineCheckBuffers::default();
475
476        // Iterate over content lines, automatically skipping front matter, code blocks,
477        // and Obsidian comments (when in Obsidian flavor)
478        // This uses the filtered iterator API which centralizes the skip logic
479        for line in ctx
480            .filtered_lines()
481            .skip_front_matter()
482            .skip_code_blocks()
483            .skip_obsidian_comments()
484        {
485            let mut line_warnings =
486                self.check_line(line.content, ctx, line.line_num, &code_spans, &mut buffers, line_index);
487
488            // Filter out warnings that are inside code spans
489            line_warnings.retain(|warning| {
490                // Check if the URL is inside a code span
491                !code_spans.iter().any(|span| {
492                    span.line == warning.line &&
493                    warning.column > 0 && // column is 1-indexed
494                    (warning.column - 1) >= span.start_col &&
495                    (warning.column - 1) < span.end_col
496                })
497            });
498
499            // Filter out warnings where the URL is inside a parsed link
500            // This handles cases like [text]( https://url ) where the URL has leading whitespace
501            // pulldown-cmark correctly parses these as valid links even though our regex misses them
502            line_warnings.retain(|warning| {
503                if let Some(fix) = &warning.fix {
504                    // Check if the fix range falls inside any parsed link's byte range
505                    !ctx.links
506                        .iter()
507                        .any(|link| fix.range.start >= link.byte_offset && fix.range.end <= link.byte_end)
508                } else {
509                    true
510                }
511            });
512
513            // Filter out warnings where the URL is inside an Obsidian comment (%%...%%)
514            // This handles inline comments like: text %%https://hidden.com%% text
515            line_warnings.retain(|warning| !ctx.is_position_in_obsidian_comment(warning.line, warning.column));
516
517            warnings.extend(line_warnings);
518        }
519
520        Ok(warnings)
521    }
522
523    fn fix(&self, ctx: &LintContext) -> Result<String, LintError> {
524        let mut content = ctx.content.to_string();
525        let mut warnings = self.check(ctx)?;
526
527        // Sort warnings by position to ensure consistent fix application
528        warnings.sort_by_key(|w| w.fix.as_ref().map(|f| f.range.start).unwrap_or(0));
529
530        // Apply fixes in reverse order to maintain positions
531        for warning in warnings.iter().rev() {
532            if let Some(fix) = &warning.fix {
533                let start = fix.range.start;
534                let end = fix.range.end;
535                content.replace_range(start..end, &fix.replacement);
536            }
537        }
538
539        Ok(content)
540    }
541}