rumdl_lib/rules/
md034_no_bare_urls.rs

1/// Rule MD034: No unformatted URLs
2///
3/// See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use crate::rule::{
5    AstExtensions, Fix, LintError, LintResult, LintWarning, MarkdownAst, MaybeAst, Rule, RuleCategory, Severity,
6};
7use crate::utils::early_returns;
8use crate::utils::range_utils::calculate_url_range;
9use crate::utils::regex_cache::EMAIL_PATTERN;
10
11use crate::lint_context::LintContext;
12use fancy_regex::Regex as FancyRegex;
13use lazy_static::lazy_static;
14use markdown::mdast::Node;
15use regex::Regex;
16
lazy_static! {
    // Cheap pre-filter: matches an http(s)/ftp(s) scheme followed by "://", or any '@'.
    // Used to decide whether the more expensive scans are worth running at all.
    static ref URL_QUICK_CHECK: Regex = Regex::new(r#"(?:https?|ftps?)://|@"#).unwrap();

    // fancy-regex pattern (needs look-behind): an http(s)/ftp(s) URL that is NOT directly
    // preceded by a word character, '[', '(' or '<'. The host may be a bracketed IPv6 literal.
    // NOTE(review): neither URL_REGEX nor URL_FIX_REGEX is referenced by the code visible in
    // this file, and both use the same pattern text - confirm whether they can be removed.
    static ref URL_REGEX: FancyRegex = FancyRegex::new(r#"(?<![\w\[\(\<])((?:https?|ftps?)://(?:\[[0-9a-fA-F:%]+\]|[^\s<>\[\]()\\'\"]+)(?::\d+)?(?:/[^\s<>\[\]()\\'\"]*)?(?:\?[^\s<>\[\]()\\'\"]*)?(?:#[^\s<>\[\]()\\'\"]*)?)"#).unwrap();
    static ref URL_FIX_REGEX: FancyRegex = FancyRegex::new(r#"(?<![\w\[\(\<])((?:https?|ftps?)://(?:\[[0-9a-fA-F:%]+\]|[^\s<>\[\]()\\'\"]+)(?::\d+)?(?:/[^\s<>\[\]()\\'\"]*)?(?:\?[^\s<>\[\]()\\'\"]*)?(?:#[^\s<>\[\]()\\'\"]*)?)"#).unwrap();

    // Custom (non-web) protocol prefixes such as grpc:// or ssh://. The check code looks for
    // one of these shortly before a URL match and, if found, does not flag that match.
    // These are commonly used in documentation but aren't actual browsable URLs.
    static ref CUSTOM_PROTOCOL_PATTERN: Regex = Regex::new(r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#).unwrap();

    // Inline markdown link `[text](dest "optional title")`; the destination is capture group 1.
    // The link text may contain one level of nested brackets, which covers badge links
    // like [![badge](img)](link).
    static ref MARKDOWN_LINK_PATTERN: Regex = Regex::new(r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap();

    // Angle-bracket autolink `<...>`; capture group 1 is the text between the brackets, which is
    // either an http(s)/ftp(s) URL (optionally with a bracketed IPv6 host) or an email-like token.
    static ref ANGLE_LINK_PATTERN: Regex = Regex::new(r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#).unwrap();

    // A line that consists solely of a badge link: `[![alt](image)](target)`.
    // NOTE(review): not referenced by the code visible in this file.
    static ref BADGE_LINK_LINE: Regex = Regex::new(r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#).unwrap();

    // Matches link text that is *only* an image, i.e. `![alt](src)` and nothing else.
    // NOTE(review): not referenced by the code visible in this file.
    static ref IMAGE_ONLY_LINK_TEXT_PATTERN: Regex = Regex::new(r#"^!\s*\[[^\]]*\]\s*\([^)]*\)$"#).unwrap();

    // Inline image `![alt](src "optional title")`.
    // Captures full image in 0, alt text in 1, src in 2
    static ref MARKDOWN_IMAGE_PATTERN: Regex = Regex::new(r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap();

    // Candidate URL pattern for the plain `regex` crate (no look-behind/look-ahead), so word
    // boundaries are verified by the caller after matching.
    // Shape: scheme://host[:port][/path][?query][#fragment]
    //  - the host does not need to contain a dot (matches markdownlint's behavior)
    //  - the host is either a bracketed IPv6 literal or a run of characters that excludes
    //    whitespace, brackets, parentheses, quotes, backticks and ':' - so a stray "::1]"
    //    without an opening bracket is not swallowed as part of the host
    //  - trailing sentence punctuation is removed by the caller in a post-processing step
    static ref SIMPLE_URL_REGEX: Regex = Regex::new(r#"(https?|ftps?)://(?:\[[0-9a-fA-F:%.]+\](?::\d+)?|[^\s<>\[\]()\\'\"`:\]]+(?::\d+)?)(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#).unwrap();

    // URLs whose host is a bracketed IPv6-style literal; these are matched in a separate pass.
    // Note: This is permissive to match markdownlint behavior, allowing technically invalid IPv6
    // for examples (any mix of letters, digits, ':', '%', '.', '-' between the brackets).
    static ref IPV6_URL_REGEX: Regex = Regex::new(r#"(https?|ftps?)://\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#).unwrap();

    // A whole line that is a reference definition with an http(s)/ftp(s) destination:
    // `[label]: https://...`. The destination is `\S+`, so bracketed IPv6 hosts are accepted;
    // because of the trailing `$`, nothing (e.g. a title) may follow the URL.
    static ref REFERENCE_DEF_RE: Regex = Regex::new(r"^\s*\[[^\]]+\]:\s*(?:https?|ftps?)://\S+$").unwrap();

    // HTML comment `<!-- ... -->`; non-greedy and allowed to span multiple lines.
    static ref HTML_COMMENT_PATTERN: Regex = Regex::new(r#"<!--[\s\S]*?-->"#).unwrap();
}
67
/// Lint rule MD034: reports URLs and email addresses that appear in text without
/// angle brackets or markdown link formatting.
///
/// The rule carries no configuration or state, so it is a unit struct.
#[derive(Debug, Default, Clone)]
pub struct MD034NoBareUrls;
70
71impl MD034NoBareUrls {
72    #[inline]
73    pub fn should_skip(&self, content: &str) -> bool {
74        // Skip if content has no URLs and no email addresses
75        !early_returns::has_urls(content) && !content.contains('@')
76    }
77
78    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
79    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
80        let trailing_punct = ['.', ',', ';', ':', '!', '?'];
81        let mut end = url.len();
82
83        // Remove trailing punctuation characters
84        while end > 0 {
85            // Get the last character of the current substring safely
86            let current_url = &url[..end];
87            if let Some((last_char_pos, last_char)) = current_url.char_indices().next_back() {
88                if trailing_punct.contains(&last_char) {
89                    end = last_char_pos;
90                } else {
91                    break;
92                }
93            } else {
94                break;
95            }
96        }
97
98        &url[..end]
99    }
100
101    // Uses DocumentStructure for code block and code span detection in check_with_structure.
102    pub fn check_with_structure(
103        &self,
104        ctx: &crate::lint_context::LintContext,
105        _structure: &crate::utils::document_structure::DocumentStructure,
106    ) -> LintResult {
107        let content = ctx.content;
108
109        // Early return: skip if no URLs or emails
110        if self.should_skip(content) {
111            return Ok(vec![]);
112        }
113
114        // Process the entire content to handle multi-line markdown links
115        let mut warnings = Vec::new();
116
117        // First, find all markdown link ranges across the entire content
118        let mut excluded_ranges: Vec<(usize, usize)> = Vec::new();
119
120        // Markdown links: [text](url) - exclude both destination and entire link text
121        for cap in MARKDOWN_LINK_PATTERN.captures_iter(content) {
122            if let Some(dest) = cap.get(1) {
123                excluded_ranges.push((dest.start(), dest.end()));
124            }
125            // Also exclude the entire link to handle URLs in link text
126            if let Some(full_match) = cap.get(0) {
127                excluded_ranges.push((full_match.start(), full_match.end()));
128            }
129        }
130
131        // Markdown images: ![alt](url)
132        for cap in MARKDOWN_IMAGE_PATTERN.captures_iter(content) {
133            if let Some(dest) = cap.get(2) {
134                excluded_ranges.push((dest.start(), dest.end()));
135            }
136        }
137
138        // Angle-bracket links: <url>
139        for cap in ANGLE_LINK_PATTERN.captures_iter(content) {
140            if let Some(m) = cap.get(1) {
141                excluded_ranges.push((m.start(), m.end()));
142            }
143        }
144
145        // HTML tags: exclude everything inside them
146        for html_tag in ctx.html_tags().iter() {
147            excluded_ranges.push((html_tag.byte_offset, html_tag.byte_end));
148        }
149
150        // HTML comments: <!-- url -->
151        for cap in HTML_COMMENT_PATTERN.captures_iter(content) {
152            if let Some(comment) = cap.get(0) {
153                excluded_ranges.push((comment.start(), comment.end()));
154            }
155        }
156
157        // Sort and merge overlapping ranges
158        excluded_ranges.sort_by_key(|r| r.0);
159        let mut merged: Vec<(usize, usize)> = Vec::new();
160        for (start, end) in excluded_ranges {
161            if let Some((_, last_end)) = merged.last_mut()
162                && *last_end >= start
163            {
164                *last_end = (*last_end).max(end);
165                continue;
166            }
167            merged.push((start, end));
168        }
169
170        // Now find all URLs and emails in the content and check if they're excluded
171        // We'll combine URL and email detection for efficiency
172        let mut all_matches: Vec<(usize, usize, bool)> = Vec::new(); // (start, end, is_email)
173
174        // Early exit if no potential URLs/emails based on quick check
175        if !content.contains("://") && !content.contains('@') {
176            return Ok(warnings);
177        }
178
179        // Use line-based processing for better cache locality
180        for line_info in ctx.lines.iter() {
181            let line_content = &line_info.content;
182
183            // Skip lines in code blocks
184            if line_info.in_code_block {
185                continue;
186            }
187
188            // Quick check if line might contain URLs or emails
189            if !line_content.contains("://") && !line_content.contains('@') {
190                continue;
191            }
192
193            // Check for URLs in this line
194            for url_match in SIMPLE_URL_REGEX.find_iter(line_content) {
195                let start_in_line = url_match.start();
196                let end_in_line = url_match.end();
197                let matched_str = &line_content[start_in_line..end_in_line];
198
199                // Skip invalid IPv6 patterns
200                if matched_str.contains("::") && !matched_str.contains('[') && matched_str.contains(']') {
201                    continue;
202                }
203
204                // Skip custom protocols that aren't standard web protocols
205                // Check if there's a custom protocol pattern before this match
206                if start_in_line > 0 {
207                    // Look back to see if this is part of a custom protocol URL
208                    let prefix_start = start_in_line.saturating_sub(20); // Look back up to 20 chars
209                    let prefix = &line_content[prefix_start..start_in_line];
210                    if CUSTOM_PROTOCOL_PATTERN.is_match(prefix) {
211                        continue;
212                    }
213                }
214
215                let global_start = line_info.byte_offset + start_in_line;
216                let global_end = line_info.byte_offset + end_in_line;
217                all_matches.push((global_start, global_end, false));
218            }
219
220            // Check for IPv6 URLs
221            for url_match in IPV6_URL_REGEX.find_iter(line_content) {
222                let global_start = line_info.byte_offset + url_match.start();
223                let global_end = line_info.byte_offset + url_match.end();
224
225                // Remove any overlapping regular URL matches
226                all_matches.retain(|(start, end, _)| !(*start < global_end && *end > global_start));
227
228                all_matches.push((global_start, global_end, false));
229            }
230
231            // Check for emails in this line
232            for email_match in EMAIL_PATTERN.find_iter(line_content) {
233                let global_start = line_info.byte_offset + email_match.start();
234                let global_end = line_info.byte_offset + email_match.end();
235                all_matches.push((global_start, global_end, true));
236            }
237        }
238
239        // Process all matches
240        for (match_start, match_end_orig, is_email) in all_matches {
241            let mut match_end = match_end_orig;
242
243            // For URLs, trim trailing punctuation
244            if !is_email {
245                let raw_url = &content[match_start..match_end];
246                let trimmed_url = self.trim_trailing_punctuation(raw_url);
247                match_end = match_start + trimmed_url.len();
248            }
249
250            // Skip if became empty after trimming
251            if match_end <= match_start {
252                continue;
253            }
254
255            // Manual boundary check: not part of a larger word
256            // Use bytes for ASCII checks (more efficient)
257            let bytes = content.as_bytes();
258            let before_byte = if match_start == 0 {
259                None
260            } else {
261                bytes.get(match_start - 1).copied()
262            };
263            let after_byte = bytes.get(match_end).copied();
264
265            let is_valid_boundary = if is_email {
266                before_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_' && b != b'.')
267                    && after_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_' && b != b'.')
268            } else {
269                before_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_')
270                    && after_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_')
271            };
272
273            if !is_valid_boundary {
274                continue;
275            }
276
277            // Skip if this is within any skip context (code blocks, MkDocs snippets, etc.)
278            if crate::utils::skip_context::is_in_skip_context(ctx, match_start) {
279                continue;
280            }
281
282            // Skip if within any excluded range (link/image dest/HTML comment)
283            let in_any_range = merged.iter().any(|(start, end)| {
284                // For HTML comments and other exclusions, check if URL overlaps the range
285                (match_start >= *start && match_start < *end)
286                    || (match_end > *start && match_end <= *end)
287                    || (match_start < *start && match_end > *end)
288            });
289            if in_any_range {
290                continue;
291            }
292
293            // Get line information efficiently
294            let (line_num, col_num) = ctx.offset_to_line_col(match_start);
295
296            // Skip reference definitions for URLs
297            if !is_email
298                && let Some(line_info) = ctx.line_info(line_num)
299                && REFERENCE_DEF_RE.is_match(&line_info.content)
300            {
301                continue;
302            }
303
304            let matched_text = &content[match_start..match_end];
305            let line_info = ctx.line_info(line_num).unwrap();
306            let (start_line, start_col, end_line, end_col) =
307                calculate_url_range(line_num, &line_info.content, col_num - 1, matched_text.len());
308
309            let message = if is_email {
310                "Email address without angle brackets or link formatting".to_string()
311            } else {
312                "URL without angle brackets or link formatting".to_string()
313            };
314
315            warnings.push(LintWarning {
316                rule_name: Some(self.name()),
317                line: start_line,
318                column: start_col,
319                end_line,
320                end_column: end_col,
321                message,
322                severity: Severity::Warning,
323                fix: Some(Fix {
324                    range: match_start..match_end,
325                    replacement: format!("<{matched_text}>"),
326                }),
327            });
328        }
329
330        Ok(warnings)
331    }
332
333    /// AST-based bare URL detection: only flag URLs in text nodes not inside links/images/code/html
334    fn find_bare_urls_in_ast(
335        &self,
336        node: &Node,
337        parent_is_link_or_image: bool,
338        _content: &str,
339        warnings: &mut Vec<LintWarning>,
340        ctx: &LintContext,
341    ) {
342        use markdown::mdast::Node::*;
343        match node {
344            Text(text) if !parent_is_link_or_image => {
345                let text_str = &text.value;
346
347                // Check for URLs
348                for url_match in SIMPLE_URL_REGEX.find_iter(text_str) {
349                    let url_start = url_match.start();
350                    let mut url_end = url_match.end();
351
352                    // Trim trailing punctuation that's likely sentence punctuation
353                    let raw_url = &text_str[url_start..url_end];
354                    let trimmed_url = self.trim_trailing_punctuation(raw_url);
355                    url_end = url_start + trimmed_url.len();
356
357                    // Skip if URL became empty after trimming
358                    if url_end <= url_start {
359                        continue;
360                    }
361
362                    let before = if url_start == 0 {
363                        None
364                    } else {
365                        text_str.get(url_start - 1..url_start)
366                    };
367                    let after = text_str.get(url_end..url_end + 1);
368                    let is_valid_boundary = before
369                        .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
370                        && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_");
371                    if !is_valid_boundary {
372                        continue;
373                    }
374                    if let Some(pos) = &text.position {
375                        let offset = pos.start.offset + url_start;
376                        let (line, column) = ctx.offset_to_line_col(offset);
377                        let url_text = &text_str[url_start..url_end];
378                        let (start_line, start_col, end_line, end_col) =
379                            (line, column, line, column + url_text.chars().count());
380                        warnings.push(LintWarning {
381                            rule_name: Some(self.name()),
382                            line: start_line,
383                            column: start_col,
384                            end_line,
385                            end_column: end_col,
386                            message: "URL without angle brackets or link formatting".to_string(),
387                            severity: Severity::Warning,
388                            fix: Some(Fix {
389                                range: offset..(offset + url_text.len()),
390                                replacement: format!("<{url_text}>"),
391                            }),
392                        });
393                    }
394                }
395
396                // Check for email addresses
397                for email_match in EMAIL_PATTERN.find_iter(text_str) {
398                    let email_start = email_match.start();
399                    let email_end = email_match.end();
400                    let before = if email_start == 0 {
401                        None
402                    } else {
403                        text_str.get(email_start - 1..email_start)
404                    };
405                    let after = text_str.get(email_end..email_end + 1);
406                    let is_valid_boundary = before
407                        .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".")
408                        && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".");
409                    if !is_valid_boundary {
410                        continue;
411                    }
412                    if let Some(pos) = &text.position {
413                        let offset = pos.start.offset + email_start;
414                        let (line, column) = ctx.offset_to_line_col(offset);
415                        let email_text = &text_str[email_start..email_end];
416                        let (start_line, start_col, end_line, end_col) =
417                            (line, column, line, column + email_text.chars().count());
418                        warnings.push(LintWarning {
419                            rule_name: Some(self.name()),
420                            line: start_line,
421                            column: start_col,
422                            end_line,
423                            end_column: end_col,
424                            message: "Email address without angle brackets or link formatting (wrap like: <email>)"
425                                .to_string(),
426                            severity: Severity::Warning,
427                            fix: Some(Fix {
428                                range: offset..(offset + email_text.len()),
429                                replacement: format!("<{email_text}>"),
430                            }),
431                        });
432                    }
433                }
434            }
435            Link(link) => {
436                for child in &link.children {
437                    self.find_bare_urls_in_ast(child, true, _content, warnings, ctx);
438                }
439            }
440            Image(image) => {
441                // Only check alt text for bare URLs (rare, but possible)
442                let alt_str = &image.alt;
443                for url_match in SIMPLE_URL_REGEX.find_iter(alt_str) {
444                    let url_start = url_match.start();
445                    let mut url_end = url_match.end();
446
447                    // Trim trailing punctuation that's likely sentence punctuation
448                    let raw_url = &alt_str[url_start..url_end];
449                    let trimmed_url = self.trim_trailing_punctuation(raw_url);
450                    url_end = url_start + trimmed_url.len();
451
452                    // Skip if URL became empty after trimming
453                    if url_end <= url_start {
454                        continue;
455                    }
456
457                    let before = if url_start == 0 {
458                        None
459                    } else {
460                        alt_str.get(url_start - 1..url_start)
461                    };
462                    let after = alt_str.get(url_end..url_end + 1);
463                    let is_valid_boundary = before
464                        .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
465                        && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_");
466                    if !is_valid_boundary {
467                        continue;
468                    }
469                    if let Some(pos) = &image.position {
470                        let offset = pos.start.offset + url_start;
471                        let (line, column) = ctx.offset_to_line_col(offset);
472                        let url_text = &alt_str[url_start..url_end];
473                        let (start_line, start_col, end_line, end_col) =
474                            (line, column, line, column + url_text.chars().count());
475                        warnings.push(LintWarning {
476                            rule_name: Some(self.name()),
477                            line: start_line,
478                            column: start_col,
479                            end_line,
480                            end_column: end_col,
481                            message: "URL without angle brackets or link formatting".to_string(),
482                            severity: Severity::Warning,
483                            fix: Some(Fix {
484                                range: offset..(offset + url_text.len()),
485                                replacement: format!("<{url_text}>"),
486                            }),
487                        });
488                    }
489                }
490            }
491            Code(_) | InlineCode(_) | Html(_) => {
492                // Skip code and HTML nodes
493            }
494            _ => {
495                if let Some(children) = node.children() {
496                    for child in children {
497                        self.find_bare_urls_in_ast(child, false, _content, warnings, ctx);
498                    }
499                }
500            }
501        }
502    }
503
504    /// AST-based check method for MD034
505    pub fn check_ast(&self, ctx: &LintContext, ast: &Node) -> LintResult {
506        let mut warnings = Vec::new();
507        self.find_bare_urls_in_ast(ast, false, ctx.content, &mut warnings, ctx);
508        Ok(warnings)
509    }
510}
511
512impl Rule for MD034NoBareUrls {
513    fn name(&self) -> &'static str {
514        "MD034"
515    }
516
517    fn description(&self) -> &'static str {
518        "URL without angle brackets or link formatting"
519    }
520
521    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
522        // Use line-based detection to properly distinguish between bare URLs and autolinks
523        // AST-based approach doesn't work because CommonMark parser converts bare URLs to links
524        let content = ctx.content;
525
526        // Fast path: Early return for empty content
527        if content.is_empty() {
528            return Ok(Vec::new());
529        }
530
531        // Fast path: Early return if no potential URLs or emails
532        if !content.contains("http://")
533            && !content.contains("https://")
534            && !content.contains("ftp://")
535            && !content.contains("ftps://")
536            && !content.contains('@')
537        {
538            return Ok(Vec::new());
539        }
540
541        // Fast path: Quick check using simple pattern
542        if !URL_QUICK_CHECK.is_match(content) {
543            return Ok(Vec::new());
544        }
545
546        // Fallback path: create structure manually (should rarely be used)
547        let structure = crate::utils::document_structure::DocumentStructure::new(content);
548        self.check_with_structure(ctx, &structure)
549    }
550
551    fn check_with_ast(&self, ctx: &LintContext, ast: &MarkdownAst) -> LintResult {
552        // Use AST-based detection for better accuracy
553        let mut warnings = Vec::new();
554        self.find_bare_urls_in_ast(ast, false, ctx.content, &mut warnings, ctx);
555        Ok(warnings)
556    }
557
558    fn uses_ast(&self) -> bool {
559        // AST-based approach doesn't work because CommonMark parser converts bare URLs to links
560        // Use document structure approach instead
561        false
562    }
563
564    fn uses_document_structure(&self) -> bool {
565        true
566    }
567
568    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
569        let content = ctx.content;
570        if self.should_skip(content) {
571            return Ok(content.to_string());
572        }
573
574        // Get all warnings first - only fix URLs that are actually flagged
575        // Use structure-based detection to match the main linting path (since uses_document_structure() returns true)
576        let structure = crate::utils::document_structure::DocumentStructure::new(content);
577        let warnings = self.check_with_structure(ctx, &structure)?;
578        if warnings.is_empty() {
579            return Ok(content.to_string());
580        }
581
582        // Sort warnings by byte offset in reverse order (rightmost first) to avoid offset issues
583        let mut sorted_warnings = warnings.clone();
584        sorted_warnings.sort_by_key(|w| std::cmp::Reverse(w.fix.as_ref().map(|f| f.range.start).unwrap_or(0)));
585
586        let mut result = content.to_string();
587        for warning in sorted_warnings {
588            if let Some(fix) = &warning.fix {
589                let start = fix.range.start;
590                let end = fix.range.end;
591
592                if start <= result.len() && end <= result.len() && start < end {
593                    result.replace_range(start..end, &fix.replacement);
594                }
595            }
596        }
597
598        Ok(result)
599    }
600
601    /// Get the category of this rule for selective processing
602    fn category(&self) -> RuleCategory {
603        RuleCategory::Link
604    }
605
606    /// Check if this rule should be skipped based on content
607    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
608        self.should_skip(ctx.content)
609    }
610
611    fn as_any(&self) -> &dyn std::any::Any {
612        self
613    }
614
615    fn as_maybe_document_structure(&self) -> Option<&dyn crate::rule::MaybeDocumentStructure> {
616        Some(self)
617    }
618
619    fn as_maybe_ast(&self) -> Option<&dyn MaybeAst> {
620        Some(self)
621    }
622
623    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
624    where
625        Self: Sized,
626    {
627        Box::new(MD034NoBareUrls)
628    }
629}
630
631impl crate::utils::document_structure::DocumentStructureExtensions for MD034NoBareUrls {
632    fn has_relevant_elements(
633        &self,
634        ctx: &crate::lint_context::LintContext,
635        _doc_structure: &crate::utils::document_structure::DocumentStructure,
636    ) -> bool {
637        // This rule is only relevant if there might be URLs or emails in the content
638        let content = ctx.content;
639        !content.is_empty()
640            && (content.contains("http://")
641                || content.contains("https://")
642                || content.contains("ftp://")
643                || content.contains("ftps://")
644                || content.contains('@'))
645    }
646}
647
648impl AstExtensions for MD034NoBareUrls {
649    fn has_relevant_ast_elements(&self, ctx: &LintContext, ast: &MarkdownAst) -> bool {
650        // Check if AST contains text nodes (where bare URLs would be)
651        use crate::utils::ast_utils::ast_contains_node_type;
652        !self.should_skip(ctx.content) && ast_contains_node_type(ast, "text")
653    }
654}
655
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;

    /// The quick-check regex matches text containing a URL and rejects text with neither
    /// a URL scheme nor an '@'.
    #[test]
    fn test_url_quick_check() {
        assert!(URL_QUICK_CHECK.is_match("This is a URL: https://example.com"));
        assert!(!URL_QUICK_CHECK.is_match("This has no URL"));
    }

    /// A heading line made of a regular link followed by several badge links
    /// (`[![alt](img)](target)`) must not produce any warning.
    #[test]
    fn test_multiple_badges_and_links_on_one_line() {
        let rule = MD034NoBareUrls;
        // The trailing backslashes join the following lines into one single-line string.
        let content = "# [React](https://react.dev/) \
&middot; [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/facebook/react/blob/main/LICENSE) \
[![npm version](https://img.shields.io/npm/v/react.svg?style=flat)](https://www.npmjs.com/package/react) \
[![(Runtime) Build and Test](https://github.com/facebook/react/actions/workflows/runtime_build_and_test.yml/badge.svg)](https://github.com/facebook/react/actions/workflows/runtime_build_and_test.yml) \
[![(Compiler) TypeScript](https://github.com/facebook/react/actions/workflows/compiler_typescript.yml/badge.svg?branch=main)](https://github.com/facebook/react/actions/workflows/compiler_typescript.yml) \
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://legacy.reactjs.org/docs/how-to-contribute.html#your-first-pull-request)";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        let result = rule.check(&ctx).unwrap();
        // Log the unexpected warnings to make a failure easier to diagnose.
        if !result.is_empty() {
            log::debug!("MD034 warnings: {result:#?}");
        }
        assert!(
            result.is_empty(),
            "Multiple badges and links on one line should not be flagged as bare URLs"
        );
    }

    /// A single bare URL is reported once, at the line and column where the URL starts.
    #[test]
    fn test_bare_urls() {
        let rule = MD034NoBareUrls;
        let content = "This is a bare URL: https://example.com/foobar";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        let result = rule.check(&ctx).unwrap();
        assert_eq!(result.len(), 1, "Bare URLs should be flagged");
        assert_eq!(result[0].line, 1);
        // The URL follows the 20-character prefix "This is a bare URL: ".
        assert_eq!(result[0].column, 21);
    }

    /// Timing smoke test on generated content (250 bare URLs, 250 proper links,
    /// 500 plain lines, 100 emails); also verifies the total warning count.
    /// NOTE(review): the wall-clock assertion depends on the machine running the test.
    #[test]
    fn test_md034_performance_baseline() {
        use std::time::Instant;

        // Generate test content with various URL patterns
        let mut content = String::with_capacity(50_000);

        // Add content with bare URLs (should be detected)
        for i in 0..250 {
            content.push_str(&format!("Line {i} with bare URL https://example{i}.com/path\n"));
        }

        // Add content with proper markdown links (should not be detected)
        for i in 0..250 {
            content.push_str(&format!(
                "Line {} with [proper link](https://example{}.com/path)\n",
                i + 250,
                i
            ));
        }

        // Add content with no URLs (should be fast)
        for i in 0..500 {
            content.push_str(&format!("Line {} with no URLs, just regular text content\n", i + 500));
        }

        // Add content with emails
        for i in 0..100 {
            content.push_str(&format!("Contact user{i}@example{i}.com for more info\n"));
        }

        println!(
            "MD034 Performance Test - Content: {} bytes, {} lines",
            content.len(),
            content.lines().count()
        );

        let rule = MD034NoBareUrls;
        let ctx = LintContext::new(&content, crate::config::MarkdownFlavor::Standard);

        // Warm up
        let _ = rule.check(&ctx).unwrap();

        // Measure check performance (more runs for accuracy)
        let mut total_duration = std::time::Duration::ZERO;
        let runs = 10;
        let mut warnings_count = 0;

        for _ in 0..runs {
            let start = Instant::now();
            let warnings = rule.check(&ctx).unwrap();
            total_duration += start.elapsed();
            // Every run sees the same input, so keeping the last count is sufficient.
            warnings_count = warnings.len();
        }

        let avg_check_duration = total_duration / runs;

        println!("MD034 Optimized Performance:");
        println!(
            "- Average check time: {:?} ({:.2} ms)",
            avg_check_duration,
            avg_check_duration.as_secs_f64() * 1000.0
        );
        println!("- Found {warnings_count} warnings");
        println!(
            "- Lines per second: {:.0}",
            content.lines().count() as f64 / avg_check_duration.as_secs_f64()
        );
        println!(
            "- Microseconds per line: {:.2}",
            avg_check_duration.as_micros() as f64 / content.lines().count() as f64
        );

        // Performance assertion - should complete reasonably fast
        // Note: In debug builds this may take longer, so we use a higher threshold
        let max_duration_ms = if cfg!(debug_assertions) { 1000 } else { 100 };
        assert!(
            avg_check_duration.as_millis() < max_duration_ms,
            "MD034 check should complete in under {}ms, took {}ms",
            max_duration_ms,
            avg_check_duration.as_millis()
        );

        // Verify we're finding the expected number of warnings
        assert_eq!(warnings_count, 350, "Should find 250 URLs + 100 emails = 350 warnings");
    }
}
rumdl_lib/rules/md034_no_bare_urls.rs

rumdl_lib/rules/
md034_no_bare_urls.rs