rumdl_lib/rules/
md034_no_bare_urls.rs

//! Rule MD034: No unformatted (bare) URLs or email addresses
//!
//! See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use crate::rule::{
5    AstExtensions, Fix, LintError, LintResult, LintWarning, MarkdownAst, MaybeAst, Rule, RuleCategory, Severity,
6};
7use crate::utils::early_returns;
8use crate::utils::range_utils::calculate_url_range;
9use crate::utils::regex_cache::EMAIL_PATTERN;
10
11use crate::lint_context::LintContext;
12use fancy_regex::Regex as FancyRegex;
13use lazy_static::lazy_static;
14use markdown::mdast::Node;
15use regex::Regex;
16
lazy_static! {
    // Cheap pre-filter: matches as soon as the text contains an http(s)/ftp(s)
    // scheme separator or an `@` (a possible email address).
    static ref URL_QUICK_CHECK: Regex = Regex::new(r#"(?:https?|ftps?)://|@"#).unwrap();

    // fancy-regex is required for the look-behind, which rejects URLs directly
    // preceded by a word character, `[`, `(` or `<`.
    // The host part may be an IPv6 literal in square brackets.
    static ref URL_REGEX: FancyRegex = FancyRegex::new(r#"(?<![\w\[\(\<])((?:https?|ftps?)://(?:\[[0-9a-fA-F:%]+\]|[^\s<>\[\]()\\'\"]+)(?::\d+)?(?:/[^\s<>\[\]()\\'\"]*)?(?:\?[^\s<>\[\]()\\'\"]*)?(?:#[^\s<>\[\]()\\'\"]*)?)"#).unwrap();
    // Same pattern as URL_REGEX.
    static ref URL_FIX_REGEX: FancyRegex = FancyRegex::new(r#"(?<![\w\[\(\<])((?:https?|ftps?)://(?:\[[0-9a-fA-F:%]+\]|[^\s<>\[\]()\\'\"]+)(?::\d+)?(?:/[^\s<>\[\]()\\'\"]*)?(?:\?[^\s<>\[\]()\\'\"]*)?(?:#[^\s<>\[\]()\\'\"]*)?)"#).unwrap();

    // Schemes treated as custom protocols rather than browsable URLs.
    // Used to skip an http(s)/ftp(s) match when one of these schemes appears
    // shortly before it on the same line.
    static ref CUSTOM_PROTOCOL_PATTERN: Regex = Regex::new(r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#).unwrap();

    // Inline Markdown link `[text](dest "title")`; group 1 captures the destination.
    // The text part tolerates one level of nested brackets, which covers badge
    // links like [![badge](img)](link).
    static ref MARKDOWN_LINK_PATTERN: Regex = Regex::new(r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap();

    // Angle-bracket autolink: `<url>` or `<email>`; group 1 is the text between
    // the brackets. An IPv6 host literal may carry a `%zone` suffix.
    static ref ANGLE_LINK_PATTERN: Regex = Regex::new(r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#).unwrap();

    // A line consisting solely of one badge link: [![alt](img)](link)
    static ref BADGE_LINK_LINE: Regex = Regex::new(r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#).unwrap();

    // Link text that is nothing but a single image: ![alt](src)
    static ref IMAGE_ONLY_LINK_TEXT_PATTERN: Regex = Regex::new(r#"^!\s*\[[^\]]*\]\s*\([^)]*\)$"#).unwrap();

    // Inline image. Group 0 = whole image, group 1 = alt text, group 2 = source.
    static ref MARKDOWN_IMAGE_PATTERN: Regex = Regex::new(r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap();

    // Candidate URL matcher without look-around (plain `regex` crate):
    //   scheme://host[:port][/path][?query][#fragment]
    // The host is either a bracketed IPv6 literal or a run of characters that
    // need not contain a dot (matching markdownlint's behavior). The regular-host
    // alternative excludes `:` and `]`, so a fragment such as "https://::1]"
    // (no opening bracket) is not matched as a host.
    // Trailing sentence punctuation is still part of the match and is removed
    // in a post-processing step.
    static ref SIMPLE_URL_REGEX: Regex = Regex::new(r#"(https?|ftps?)://(?:\[[0-9a-fA-F:%.]+\](?::\d+)?|[^\s<>\[\]()\\'\"`:\]]+(?::\d+)?)(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#).unwrap();

    // Dedicated matcher for URLs whose host is a bracketed literal.
    // Deliberately permissive (letters and `-` are accepted inside the brackets)
    // so that technically invalid IPv6 examples are matched too, as markdownlint does.
    static ref IPV6_URL_REGEX: Regex = Regex::new(r#"(https?|ftps?)://\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#).unwrap();

    // A whole line that is a reference definition: `[label]: scheme://...`
    // The destination is any run of non-whitespace after the scheme.
    static ref REFERENCE_DEF_RE: Regex = Regex::new(r"^\s*\[[^\]]+\]:\s*(?:https?|ftps?)://\S+$").unwrap();

    // HTML comment; `[\s\S]*?` lets it span line breaks and stop at the first `-->`.
    static ref HTML_COMMENT_PATTERN: Regex = Regex::new(r#"<!--[\s\S]*?-->"#).unwrap();
}
67
/// Rule MD034: reports URLs and email addresses that appear in running text
/// without angle brackets or Markdown link formatting.
///
/// The rule is a unit struct and holds no state.
#[derive(Default, Clone)]
pub struct MD034NoBareUrls;
70
71impl MD034NoBareUrls {
72    #[inline]
73    pub fn should_skip(&self, content: &str) -> bool {
74        // Skip if content has no URLs and no email addresses
75        !early_returns::has_urls(content) && !content.contains('@')
76    }
77
78    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
79    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
80        let trailing_punct = ['.', ',', ';', ':', '!', '?'];
81        let mut end = url.len();
82
83        // Remove trailing punctuation characters
84        while end > 0 {
85            if let Some(last_char) = url.chars().nth(end - 1) {
86                if trailing_punct.contains(&last_char) {
87                    end -= last_char.len_utf8();
88                } else {
89                    break;
90                }
91            } else {
92                break;
93            }
94        }
95
96        &url[..end]
97    }
98
99    // Uses DocumentStructure for code block and code span detection in check_with_structure.
100    pub fn check_with_structure(
101        &self,
102        ctx: &crate::lint_context::LintContext,
103        _structure: &crate::utils::document_structure::DocumentStructure,
104    ) -> LintResult {
105        let content = ctx.content;
106
107        // Early return: skip if no URLs or emails
108        if self.should_skip(content) {
109            return Ok(vec![]);
110        }
111
112        // Process the entire content to handle multi-line markdown links
113        let mut warnings = Vec::new();
114
115        // First, find all markdown link ranges across the entire content
116        let mut excluded_ranges: Vec<(usize, usize)> = Vec::new();
117
118        // Markdown links: [text](url) - exclude both destination and entire link text
119        for cap in MARKDOWN_LINK_PATTERN.captures_iter(content) {
120            if let Some(dest) = cap.get(1) {
121                excluded_ranges.push((dest.start(), dest.end()));
122            }
123            // Also exclude the entire link to handle URLs in link text
124            if let Some(full_match) = cap.get(0) {
125                excluded_ranges.push((full_match.start(), full_match.end()));
126            }
127        }
128
129        // Markdown images: ![alt](url)
130        for cap in MARKDOWN_IMAGE_PATTERN.captures_iter(content) {
131            if let Some(dest) = cap.get(2) {
132                excluded_ranges.push((dest.start(), dest.end()));
133            }
134        }
135
136        // Angle-bracket links: <url>
137        for cap in ANGLE_LINK_PATTERN.captures_iter(content) {
138            if let Some(m) = cap.get(1) {
139                excluded_ranges.push((m.start(), m.end()));
140            }
141        }
142
143        // HTML tags: exclude everything inside them
144        for html_tag in ctx.html_tags().iter() {
145            excluded_ranges.push((html_tag.byte_offset, html_tag.byte_end));
146        }
147
148        // HTML comments: <!-- url -->
149        for cap in HTML_COMMENT_PATTERN.captures_iter(content) {
150            if let Some(comment) = cap.get(0) {
151                excluded_ranges.push((comment.start(), comment.end()));
152            }
153        }
154
155        // Sort and merge overlapping ranges
156        excluded_ranges.sort_by_key(|r| r.0);
157        let mut merged: Vec<(usize, usize)> = Vec::new();
158        for (start, end) in excluded_ranges {
159            if let Some((_, last_end)) = merged.last_mut()
160                && *last_end >= start
161            {
162                *last_end = (*last_end).max(end);
163                continue;
164            }
165            merged.push((start, end));
166        }
167
168        // Now find all URLs and emails in the content and check if they're excluded
169        // We'll combine URL and email detection for efficiency
170        let mut all_matches: Vec<(usize, usize, bool)> = Vec::new(); // (start, end, is_email)
171
172        // Early exit if no potential URLs/emails based on quick check
173        if !content.contains("://") && !content.contains('@') {
174            return Ok(warnings);
175        }
176
177        // Use line-based processing for better cache locality
178        for line_info in ctx.lines.iter() {
179            let line_content = &line_info.content;
180
181            // Skip lines in code blocks
182            if line_info.in_code_block {
183                continue;
184            }
185
186            // Quick check if line might contain URLs or emails
187            if !line_content.contains("://") && !line_content.contains('@') {
188                continue;
189            }
190
191            // Check for URLs in this line
192            for url_match in SIMPLE_URL_REGEX.find_iter(line_content) {
193                let start_in_line = url_match.start();
194                let end_in_line = url_match.end();
195                let matched_str = &line_content[start_in_line..end_in_line];
196
197                // Skip invalid IPv6 patterns
198                if matched_str.contains("::") && !matched_str.contains('[') && matched_str.contains(']') {
199                    continue;
200                }
201
202                // Skip custom protocols that aren't standard web protocols
203                // Check if there's a custom protocol pattern before this match
204                if start_in_line > 0 {
205                    // Look back to see if this is part of a custom protocol URL
206                    let prefix_start = start_in_line.saturating_sub(20); // Look back up to 20 chars
207                    let prefix = &line_content[prefix_start..start_in_line];
208                    if CUSTOM_PROTOCOL_PATTERN.is_match(prefix) {
209                        continue;
210                    }
211                }
212
213                let global_start = line_info.byte_offset + start_in_line;
214                let global_end = line_info.byte_offset + end_in_line;
215                all_matches.push((global_start, global_end, false));
216            }
217
218            // Check for IPv6 URLs
219            for url_match in IPV6_URL_REGEX.find_iter(line_content) {
220                let global_start = line_info.byte_offset + url_match.start();
221                let global_end = line_info.byte_offset + url_match.end();
222
223                // Remove any overlapping regular URL matches
224                all_matches.retain(|(start, end, _)| !(*start < global_end && *end > global_start));
225
226                all_matches.push((global_start, global_end, false));
227            }
228
229            // Check for emails in this line
230            for email_match in EMAIL_PATTERN.find_iter(line_content) {
231                let global_start = line_info.byte_offset + email_match.start();
232                let global_end = line_info.byte_offset + email_match.end();
233                all_matches.push((global_start, global_end, true));
234            }
235        }
236
237        // Process all matches
238        for (match_start, match_end_orig, is_email) in all_matches {
239            let mut match_end = match_end_orig;
240
241            // For URLs, trim trailing punctuation
242            if !is_email {
243                let raw_url = &content[match_start..match_end];
244                let trimmed_url = self.trim_trailing_punctuation(raw_url);
245                match_end = match_start + trimmed_url.len();
246            }
247
248            // Skip if became empty after trimming
249            if match_end <= match_start {
250                continue;
251            }
252
253            // Manual boundary check: not part of a larger word
254            let before = if match_start == 0 {
255                None
256            } else {
257                content.get(match_start - 1..match_start)
258            };
259            let after = content.get(match_end..match_end + 1);
260
261            let is_valid_boundary = if is_email {
262                before.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".")
263                    && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".")
264            } else {
265                before.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
266                    && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
267            };
268
269            if !is_valid_boundary {
270                continue;
271            }
272
273            // Skip if this is within a code span (code blocks already checked)
274            if ctx.is_in_code_block_or_span(match_start) {
275                continue;
276            }
277
278            // Skip if within any excluded range (link/image dest/HTML comment)
279            let in_any_range = merged.iter().any(|(start, end)| {
280                // For HTML comments and other exclusions, check if URL overlaps the range
281                (match_start >= *start && match_start < *end)
282                    || (match_end > *start && match_end <= *end)
283                    || (match_start < *start && match_end > *end)
284            });
285            if in_any_range {
286                continue;
287            }
288
289            // Get line information efficiently
290            let (line_num, col_num) = ctx.offset_to_line_col(match_start);
291
292            // Skip reference definitions for URLs
293            if !is_email
294                && let Some(line_info) = ctx.line_info(line_num)
295                && REFERENCE_DEF_RE.is_match(&line_info.content)
296            {
297                continue;
298            }
299
300            let matched_text = &content[match_start..match_end];
301            let line_info = ctx.line_info(line_num).unwrap();
302            let (start_line, start_col, end_line, end_col) =
303                calculate_url_range(line_num, &line_info.content, col_num - 1, matched_text.len());
304
305            let message = if is_email {
306                "Email address without angle brackets or link formatting".to_string()
307            } else {
308                "URL without angle brackets or link formatting".to_string()
309            };
310
311            warnings.push(LintWarning {
312                rule_name: Some(self.name()),
313                line: start_line,
314                column: start_col,
315                end_line,
316                end_column: end_col,
317                message,
318                severity: Severity::Warning,
319                fix: Some(Fix {
320                    range: match_start..match_end,
321                    replacement: format!("<{matched_text}>"),
322                }),
323            });
324        }
325
326        Ok(warnings)
327    }
328
329    /// AST-based bare URL detection: only flag URLs in text nodes not inside links/images/code/html
330    fn find_bare_urls_in_ast(
331        &self,
332        node: &Node,
333        parent_is_link_or_image: bool,
334        _content: &str,
335        warnings: &mut Vec<LintWarning>,
336        ctx: &LintContext,
337    ) {
338        use markdown::mdast::Node::*;
339        match node {
340            Text(text) if !parent_is_link_or_image => {
341                let text_str = &text.value;
342
343                // Check for URLs
344                for url_match in SIMPLE_URL_REGEX.find_iter(text_str) {
345                    let url_start = url_match.start();
346                    let mut url_end = url_match.end();
347
348                    // Trim trailing punctuation that's likely sentence punctuation
349                    let raw_url = &text_str[url_start..url_end];
350                    let trimmed_url = self.trim_trailing_punctuation(raw_url);
351                    url_end = url_start + trimmed_url.len();
352
353                    // Skip if URL became empty after trimming
354                    if url_end <= url_start {
355                        continue;
356                    }
357
358                    let before = if url_start == 0 {
359                        None
360                    } else {
361                        text_str.get(url_start - 1..url_start)
362                    };
363                    let after = text_str.get(url_end..url_end + 1);
364                    let is_valid_boundary = before
365                        .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
366                        && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_");
367                    if !is_valid_boundary {
368                        continue;
369                    }
370                    if let Some(pos) = &text.position {
371                        let offset = pos.start.offset + url_start;
372                        let (line, column) = ctx.offset_to_line_col(offset);
373                        let url_text = &text_str[url_start..url_end];
374                        let (start_line, start_col, end_line, end_col) =
375                            (line, column, line, column + url_text.chars().count());
376                        warnings.push(LintWarning {
377                            rule_name: Some(self.name()),
378                            line: start_line,
379                            column: start_col,
380                            end_line,
381                            end_column: end_col,
382                            message: "URL without angle brackets or link formatting".to_string(),
383                            severity: Severity::Warning,
384                            fix: Some(Fix {
385                                range: offset..(offset + url_text.len()),
386                                replacement: format!("<{url_text}>"),
387                            }),
388                        });
389                    }
390                }
391
392                // Check for email addresses
393                for email_match in EMAIL_PATTERN.find_iter(text_str) {
394                    let email_start = email_match.start();
395                    let email_end = email_match.end();
396                    let before = if email_start == 0 {
397                        None
398                    } else {
399                        text_str.get(email_start - 1..email_start)
400                    };
401                    let after = text_str.get(email_end..email_end + 1);
402                    let is_valid_boundary = before
403                        .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".")
404                        && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".");
405                    if !is_valid_boundary {
406                        continue;
407                    }
408                    if let Some(pos) = &text.position {
409                        let offset = pos.start.offset + email_start;
410                        let (line, column) = ctx.offset_to_line_col(offset);
411                        let email_text = &text_str[email_start..email_end];
412                        let (start_line, start_col, end_line, end_col) =
413                            (line, column, line, column + email_text.chars().count());
414                        warnings.push(LintWarning {
415                            rule_name: Some(self.name()),
416                            line: start_line,
417                            column: start_col,
418                            end_line,
419                            end_column: end_col,
420                            message: "Email address without angle brackets or link formatting (wrap like: <email>)"
421                                .to_string(),
422                            severity: Severity::Warning,
423                            fix: Some(Fix {
424                                range: offset..(offset + email_text.len()),
425                                replacement: format!("<{email_text}>"),
426                            }),
427                        });
428                    }
429                }
430            }
431            Link(link) => {
432                for child in &link.children {
433                    self.find_bare_urls_in_ast(child, true, _content, warnings, ctx);
434                }
435            }
436            Image(image) => {
437                // Only check alt text for bare URLs (rare, but possible)
438                let alt_str = &image.alt;
439                for url_match in SIMPLE_URL_REGEX.find_iter(alt_str) {
440                    let url_start = url_match.start();
441                    let mut url_end = url_match.end();
442
443                    // Trim trailing punctuation that's likely sentence punctuation
444                    let raw_url = &alt_str[url_start..url_end];
445                    let trimmed_url = self.trim_trailing_punctuation(raw_url);
446                    url_end = url_start + trimmed_url.len();
447
448                    // Skip if URL became empty after trimming
449                    if url_end <= url_start {
450                        continue;
451                    }
452
453                    let before = if url_start == 0 {
454                        None
455                    } else {
456                        alt_str.get(url_start - 1..url_start)
457                    };
458                    let after = alt_str.get(url_end..url_end + 1);
459                    let is_valid_boundary = before
460                        .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
461                        && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_");
462                    if !is_valid_boundary {
463                        continue;
464                    }
465                    if let Some(pos) = &image.position {
466                        let offset = pos.start.offset + url_start;
467                        let (line, column) = ctx.offset_to_line_col(offset);
468                        let url_text = &alt_str[url_start..url_end];
469                        let (start_line, start_col, end_line, end_col) =
470                            (line, column, line, column + url_text.chars().count());
471                        warnings.push(LintWarning {
472                            rule_name: Some(self.name()),
473                            line: start_line,
474                            column: start_col,
475                            end_line,
476                            end_column: end_col,
477                            message: "URL without angle brackets or link formatting".to_string(),
478                            severity: Severity::Warning,
479                            fix: Some(Fix {
480                                range: offset..(offset + url_text.len()),
481                                replacement: format!("<{url_text}>"),
482                            }),
483                        });
484                    }
485                }
486            }
487            Code(_) | InlineCode(_) | Html(_) => {
488                // Skip code and HTML nodes
489            }
490            _ => {
491                if let Some(children) = node.children() {
492                    for child in children {
493                        self.find_bare_urls_in_ast(child, false, _content, warnings, ctx);
494                    }
495                }
496            }
497        }
498    }
499
500    /// AST-based check method for MD034
501    pub fn check_ast(&self, ctx: &LintContext, ast: &Node) -> LintResult {
502        let mut warnings = Vec::new();
503        self.find_bare_urls_in_ast(ast, false, ctx.content, &mut warnings, ctx);
504        Ok(warnings)
505    }
506}
507
508impl Rule for MD034NoBareUrls {
509    fn name(&self) -> &'static str {
510        "MD034"
511    }
512
513    fn description(&self) -> &'static str {
514        "URL without angle brackets or link formatting"
515    }
516
517    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
518        // Use line-based detection to properly distinguish between bare URLs and autolinks
519        // AST-based approach doesn't work because CommonMark parser converts bare URLs to links
520        let content = ctx.content;
521
522        // Fast path: Early return for empty content
523        if content.is_empty() {
524            return Ok(Vec::new());
525        }
526
527        // Fast path: Early return if no potential URLs or emails
528        if !content.contains("http://")
529            && !content.contains("https://")
530            && !content.contains("ftp://")
531            && !content.contains("ftps://")
532            && !content.contains('@')
533        {
534            return Ok(Vec::new());
535        }
536
537        // Fast path: Quick check using simple pattern
538        if !URL_QUICK_CHECK.is_match(content) {
539            return Ok(Vec::new());
540        }
541
542        // Fallback path: create structure manually (should rarely be used)
543        let structure = crate::utils::document_structure::DocumentStructure::new(content);
544        self.check_with_structure(ctx, &structure)
545    }
546
547    fn check_with_ast(&self, ctx: &LintContext, ast: &MarkdownAst) -> LintResult {
548        // Use AST-based detection for better accuracy
549        let mut warnings = Vec::new();
550        self.find_bare_urls_in_ast(ast, false, ctx.content, &mut warnings, ctx);
551        Ok(warnings)
552    }
553
554    fn uses_ast(&self) -> bool {
555        // AST-based approach doesn't work because CommonMark parser converts bare URLs to links
556        // Use document structure approach instead
557        false
558    }
559
560    fn uses_document_structure(&self) -> bool {
561        true
562    }
563
564    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
565        let content = ctx.content;
566        if self.should_skip(content) {
567            return Ok(content.to_string());
568        }
569
570        // Get all warnings first - only fix URLs that are actually flagged
571        // Use structure-based detection to match the main linting path (since uses_document_structure() returns true)
572        let structure = crate::utils::document_structure::DocumentStructure::new(content);
573        let warnings = self.check_with_structure(ctx, &structure)?;
574        if warnings.is_empty() {
575            return Ok(content.to_string());
576        }
577
578        // Sort warnings by byte offset in reverse order (rightmost first) to avoid offset issues
579        let mut sorted_warnings = warnings.clone();
580        sorted_warnings.sort_by_key(|w| std::cmp::Reverse(w.fix.as_ref().map(|f| f.range.start).unwrap_or(0)));
581
582        let mut result = content.to_string();
583        for warning in sorted_warnings {
584            if let Some(fix) = &warning.fix {
585                let start = fix.range.start;
586                let end = fix.range.end;
587
588                if start <= result.len() && end <= result.len() && start < end {
589                    result.replace_range(start..end, &fix.replacement);
590                }
591            }
592        }
593
594        Ok(result)
595    }
596
597    /// Get the category of this rule for selective processing
598    fn category(&self) -> RuleCategory {
599        RuleCategory::Link
600    }
601
602    /// Check if this rule should be skipped based on content
603    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
604        self.should_skip(ctx.content)
605    }
606
607    fn as_any(&self) -> &dyn std::any::Any {
608        self
609    }
610
611    fn as_maybe_document_structure(&self) -> Option<&dyn crate::rule::MaybeDocumentStructure> {
612        Some(self)
613    }
614
615    fn as_maybe_ast(&self) -> Option<&dyn MaybeAst> {
616        Some(self)
617    }
618
619    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
620    where
621        Self: Sized,
622    {
623        Box::new(MD034NoBareUrls)
624    }
625}
626
627impl crate::utils::document_structure::DocumentStructureExtensions for MD034NoBareUrls {
628    fn has_relevant_elements(
629        &self,
630        ctx: &crate::lint_context::LintContext,
631        _doc_structure: &crate::utils::document_structure::DocumentStructure,
632    ) -> bool {
633        // This rule is only relevant if there might be URLs or emails in the content
634        let content = ctx.content;
635        !content.is_empty()
636            && (content.contains("http://")
637                || content.contains("https://")
638                || content.contains("ftp://")
639                || content.contains("ftps://")
640                || content.contains('@'))
641    }
642}
643
644impl AstExtensions for MD034NoBareUrls {
645    fn has_relevant_ast_elements(&self, ctx: &LintContext, ast: &MarkdownAst) -> bool {
646        // Check if AST contains text nodes (where bare URLs would be)
647        use crate::utils::ast_utils::ast_contains_node_type;
648        !self.should_skip(ctx.content) && ast_contains_node_type(ast, "text")
649    }
650}
651
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;

    /// The quick-check pattern accepts text with a URL scheme and rejects
    /// text with neither a scheme nor an `@`.
    #[test]
    fn test_url_quick_check() {
        assert!(URL_QUICK_CHECK.is_match("This is a URL: https://example.com"));
        assert!(!URL_QUICK_CHECK.is_match("This has no URL"));
    }

    /// A heading line packed with a regular link and several badge links
    /// (images wrapped in links) must not yield any warning: every URL on it
    /// is a link or image destination.
    #[test]
    fn test_multiple_badges_and_links_on_one_line() {
        let rule = MD034NoBareUrls;
        // The trailing backslashes join the pieces into one single line.
        let content = "# [React](https://react.dev/) \
&middot; [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/facebook/react/blob/main/LICENSE) \
[![npm version](https://img.shields.io/npm/v/react.svg?style=flat)](https://www.npmjs.com/package/react) \
[![(Runtime) Build and Test](https://github.com/facebook/react/actions/workflows/runtime_build_and_test.yml/badge.svg)](https://github.com/facebook/react/actions/workflows/runtime_build_and_test.yml) \
[![(Compiler) TypeScript](https://github.com/facebook/react/actions/workflows/compiler_typescript.yml/badge.svg?branch=main)](https://github.com/facebook/react/actions/workflows/compiler_typescript.yml) \
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://legacy.reactjs.org/docs/how-to-contribute.html#your-first-pull-request)";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        let result = rule.check(&ctx).unwrap();
        // Log the unexpected warnings to ease debugging before the assertion fires.
        if !result.is_empty() {
            log::debug!("MD034 warnings: {result:#?}");
        }
        assert!(
            result.is_empty(),
            "Multiple badges and links on one line should not be flagged as bare URLs"
        );
    }

    /// A single bare URL in running text is reported at its line and column.
    #[test]
    fn test_bare_urls() {
        let rule = MD034NoBareUrls;
        let content = "This is a bare URL: https://example.com/foobar";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        let result = rule.check(&ctx).unwrap();
        assert_eq!(result.len(), 1, "Bare URLs should be flagged");
        assert_eq!(result[0].line, 1);
        // The URL starts right after the 20-character prefix "This is a bare URL: ".
        assert_eq!(result[0].column, 21);
    }

    /// Smoke benchmark over a generated document of 1100 lines: checks the
    /// warning count and that the average run stays below a time budget.
    #[test]
    fn test_md034_performance_baseline() {
        use std::time::Instant;

        // Generate test content with various URL patterns
        let mut content = String::with_capacity(50_000);

        // Add content with bare URLs (should be detected)
        for i in 0..250 {
            content.push_str(&format!("Line {i} with bare URL https://example{i}.com/path\n"));
        }

        // Add content with proper markdown links (should not be detected)
        for i in 0..250 {
            content.push_str(&format!(
                "Line {} with [proper link](https://example{}.com/path)\n",
                i + 250,
                i
            ));
        }

        // Add content with no URLs (should be fast)
        for i in 0..500 {
            content.push_str(&format!("Line {} with no URLs, just regular text content\n", i + 500));
        }

        // Add content with emails
        for i in 0..100 {
            content.push_str(&format!("Contact user{i}@example{i}.com for more info\n"));
        }

        println!(
            "MD034 Performance Test - Content: {} bytes, {} lines",
            content.len(),
            content.lines().count()
        );

        let rule = MD034NoBareUrls;
        let ctx = LintContext::new(&content, crate::config::MarkdownFlavor::Standard);

        // Warm up
        let _ = rule.check(&ctx).unwrap();

        // Measure check performance (more runs for accuracy)
        let mut total_duration = std::time::Duration::ZERO;
        let runs = 10;
        let mut warnings_count = 0;

        for _ in 0..runs {
            let start = Instant::now();
            let warnings = rule.check(&ctx).unwrap();
            total_duration += start.elapsed();
            warnings_count = warnings.len();
        }

        let avg_check_duration = total_duration / runs;

        println!("MD034 Optimized Performance:");
        println!(
            "- Average check time: {:?} ({:.2} ms)",
            avg_check_duration,
            avg_check_duration.as_secs_f64() * 1000.0
        );
        println!("- Found {warnings_count} warnings");
        println!(
            "- Lines per second: {:.0}",
            content.lines().count() as f64 / avg_check_duration.as_secs_f64()
        );
        println!(
            "- Microseconds per line: {:.2}",
            avg_check_duration.as_micros() as f64 / content.lines().count() as f64
        );

        // Performance assertion - should complete reasonably fast
        // NOTE(review): this is a wall-clock assertion; it may fail spuriously
        // on slow or heavily loaded machines.
        assert!(
            avg_check_duration.as_millis() < 100,
            "MD034 check should complete in under 100ms, took {}ms",
            avg_check_duration.as_millis()
        );

        // Verify we're finding the expected number of warnings
        assert_eq!(warnings_count, 350, "Should find 250 URLs + 100 emails = 350 warnings");
    }
}
rumdl_lib/rules/md034_no_bare_urls.rs

rumdl_lib/rules/
md034_no_bare_urls.rs