rumdl_lib/rules/
md034_no_bare_urls.rs

//! Rule MD034: No unformatted URLs
//!
//! See [docs/md034.md](../../docs/md034.md) for full documentation, configuration, and examples.
4use crate::rule::{
5    AstExtensions, Fix, LintError, LintResult, LintWarning, MarkdownAst, MaybeAst, Rule, RuleCategory, Severity,
6};
7use crate::utils::range_utils::calculate_url_range;
8use crate::utils::regex_cache::EMAIL_PATTERN;
9
10use crate::lint_context::LintContext;
11use fancy_regex::Regex as FancyRegex;
12use lazy_static::lazy_static;
13use markdown::mdast::Node;
14use regex::Regex;
15
lazy_static! {
    // Cheap pre-filter: matches an http(s)/ftp(s) scheme prefix or any `@`.
    // NOTE(review): in the visible part of this file only the unit tests reference it.
    static ref URL_QUICK_CHECK: Regex = Regex::new(r#"(?:https?|ftps?)://|@"#).unwrap();

    // fancy-regex is needed for the look-behind, which rejects a URL that directly follows
    // a word character, `[`, `(` or `<`. The host may be a bracketed IPv6 literal.
    // NOTE(review): URL_REGEX and URL_FIX_REGEX hold the same pattern text and neither is
    // referenced in the visible part of this file - confirm both are still needed.
    static ref URL_REGEX: FancyRegex = FancyRegex::new(r#"(?<![\w\[\(\<])((?:https?|ftps?)://(?:\[[0-9a-fA-F:%]+\]|[^\s<>\[\]()\\'\"]+)(?::\d+)?(?:/[^\s<>\[\]()\\'\"]*)?(?:\?[^\s<>\[\]()\\'\"]*)?(?:#[^\s<>\[\]()\\'\"]*)?)"#).unwrap();
    static ref URL_FIX_REGEX: FancyRegex = FancyRegex::new(r#"(?<![\w\[\(\<])((?:https?|ftps?)://(?:\[[0-9a-fA-F:%]+\]|[^\s<>\[\]()\\'\"]+)(?::\d+)?(?:/[^\s<>\[\]()\\'\"]*)?(?:\?[^\s<>\[\]()\\'\"]*)?(?:#[^\s<>\[\]()\\'\"]*)?)"#).unwrap();

    // Pattern to detect custom protocol patterns that shouldn't be flagged.
    // These are commonly used in documentation but aren't actual browsable URLs.
    // The pattern is unanchored: it matches any listed `scheme://` anywhere in the haystack.
    static ref CUSTOM_PROTOCOL_PATTERN: Regex = Regex::new(r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#).unwrap();

    // Markdown link `[text](dest "title")` - the destination is captured in group 1.
    // The link text may contain one level of nested brackets, which covers badge links
    // such as [![badge](img)](link).
    static ref MARKDOWN_LINK_PATTERN: Regex = Regex::new(r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap();

    // Angle-bracket autolink `<...>` - group 1 is either an http(s)/ftp(s) URL
    // (optionally with a bracketed IPv6 host) or an email-shaped `local@domain.tld`.
    static ref ANGLE_LINK_PATTERN: Regex = Regex::new(r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#).unwrap();

    // A line that consists solely of one badge link: `[![alt](img)](link)`.
    // NOTE(review): not referenced in the visible part of this file.
    static ref BADGE_LINK_LINE: Regex = Regex::new(r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#).unwrap();

    // Link text that is *only* an image: `![alt](src)` and nothing else.
    // NOTE(review): not referenced in the visible part of this file.
    static ref IMAGE_ONLY_LINK_TEXT_PATTERN: Regex = Regex::new(r#"^!\s*\[[^\]]*\]\s*\([^)]*\)$"#).unwrap();

    // Markdown image: full image in group 0, alt text in group 1, src in group 2.
    static ref MARKDOWN_IMAGE_PATTERN: Regex = Regex::new(r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap();

    // Candidate URL matcher without look-around: scheme://host[:port][/path][?query][#fragment].
    // The host is either a bracketed IPv6-style literal or a run of characters that excludes
    // whitespace, brackets, parentheses, quotes, backticks, backslash and `:`; a dot in the
    // host is not required. Trailing sentence punctuation is removed by the caller afterwards.
    static ref SIMPLE_URL_REGEX: Regex = Regex::new(r#"(https?|ftps?)://(?:\[[0-9a-fA-F:%.]+\](?::\d+)?|[^\s<>\[\]()\\'\"`:\]]+(?::\d+)?)(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#).unwrap();

    // Dedicated matcher for URLs whose host is a bracketed literal. Deliberately permissive:
    // letters, `.` and `-` are accepted inside the brackets, so technically invalid IPv6
    // examples are matched too.
    static ref IPV6_URL_REGEX: Regex = Regex::new(r#"(https?|ftps?)://\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#).unwrap();

    // Reference definition line: `[label]: scheme://...` with nothing after the URL.
    static ref REFERENCE_DEF_RE: Regex = Regex::new(r"^\s*\[[^\]]+\]:\s*(?:https?|ftps?)://\S+$").unwrap();

    // HTML comment, non-greedy and allowed to span multiple lines.
    static ref HTML_COMMENT_PATTERN: Regex = Regex::new(r#"<!--[\s\S]*?-->"#).unwrap();
}
66
/// Rule MD034: reports URLs and email addresses that appear without angle brackets or
/// markdown link formatting; the suggested fix wraps the match in `<...>`.
///
/// The rule is a stateless unit struct, so it is free to construct and clone.
#[derive(Debug, Default, Clone)]
pub struct MD034NoBareUrls;
69
70impl MD034NoBareUrls {
71    #[inline]
72    pub fn should_skip(&self, content: &str) -> bool {
73        // Skip if content has no URLs and no email addresses
74        // Fast byte scanning for common URL/email indicators
75        let bytes = content.as_bytes();
76        !bytes.contains(&b':') && !bytes.contains(&b'@')
77    }
78
79    /// Remove trailing punctuation that is likely sentence punctuation, not part of the URL
80    fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
81        let trailing_punct = ['.', ',', ';', ':', '!', '?'];
82        let mut end = url.len();
83
84        // Remove trailing punctuation characters
85        while end > 0 {
86            // Get the last character of the current substring safely
87            let current_url = &url[..end];
88            if let Some((last_char_pos, last_char)) = current_url.char_indices().next_back() {
89                if trailing_punct.contains(&last_char) {
90                    end = last_char_pos;
91                } else {
92                    break;
93                }
94            } else {
95                break;
96            }
97        }
98
99        &url[..end]
100    }
101
102    // Uses DocumentStructure for code block and code span detection in check_with_structure.
103    pub fn check_with_structure(
104        &self,
105        ctx: &crate::lint_context::LintContext,
106        _structure: &crate::utils::document_structure::DocumentStructure,
107    ) -> LintResult {
108        let content = ctx.content;
109
110        // Early return: skip if no URLs or emails
111        if self.should_skip(content) {
112            return Ok(vec![]);
113        }
114
115        // Process the entire content to handle multi-line markdown links
116        let mut warnings = Vec::new();
117
118        // First, find all markdown link ranges across the entire content
119        let mut excluded_ranges: Vec<(usize, usize)> = Vec::new();
120
121        // Markdown links: [text](url) - exclude both destination and entire link text
122        for cap in MARKDOWN_LINK_PATTERN.captures_iter(content) {
123            if let Some(dest) = cap.get(1) {
124                excluded_ranges.push((dest.start(), dest.end()));
125            }
126            // Also exclude the entire link to handle URLs in link text
127            if let Some(full_match) = cap.get(0) {
128                excluded_ranges.push((full_match.start(), full_match.end()));
129            }
130        }
131
132        // Markdown images: ![alt](url)
133        for cap in MARKDOWN_IMAGE_PATTERN.captures_iter(content) {
134            if let Some(dest) = cap.get(2) {
135                excluded_ranges.push((dest.start(), dest.end()));
136            }
137        }
138
139        // Angle-bracket links: <url>
140        for cap in ANGLE_LINK_PATTERN.captures_iter(content) {
141            if let Some(m) = cap.get(1) {
142                excluded_ranges.push((m.start(), m.end()));
143            }
144        }
145
146        // HTML tags: exclude everything inside them
147        for html_tag in ctx.html_tags().iter() {
148            excluded_ranges.push((html_tag.byte_offset, html_tag.byte_end));
149        }
150
151        // HTML comments: <!-- url -->
152        for cap in HTML_COMMENT_PATTERN.captures_iter(content) {
153            if let Some(comment) = cap.get(0) {
154                excluded_ranges.push((comment.start(), comment.end()));
155            }
156        }
157
158        // Sort and merge overlapping ranges
159        excluded_ranges.sort_by_key(|r| r.0);
160        let mut merged: Vec<(usize, usize)> = Vec::new();
161        for (start, end) in excluded_ranges {
162            if let Some((_, last_end)) = merged.last_mut()
163                && *last_end >= start
164            {
165                *last_end = (*last_end).max(end);
166                continue;
167            }
168            merged.push((start, end));
169        }
170
171        // Now find all URLs and emails in the content and check if they're excluded
172        // We'll combine URL and email detection for efficiency
173        let mut all_matches: Vec<(usize, usize, bool)> = Vec::new(); // (start, end, is_email)
174
175        // Early exit if no potential URLs/emails based on quick check
176        if !content.contains("://") && !content.contains('@') {
177            return Ok(warnings);
178        }
179
180        // Pre-filter lines that might contain URLs or emails
181        let mut candidate_lines = Vec::new();
182        for (line_idx, line_info) in ctx.lines.iter().enumerate() {
183            // Skip lines in code blocks
184            if line_info.in_code_block {
185                continue;
186            }
187
188            let line_content = &line_info.content;
189            let bytes = line_content.as_bytes();
190
191            // Fast byte-level check for potential URLs/emails
192            let has_url = bytes.contains(&b':') && line_content.contains("://");
193            let has_email = bytes.contains(&b'@');
194
195            if has_url || has_email {
196                candidate_lines.push(line_idx);
197            }
198        }
199
200        // Process only candidate lines
201        for &line_idx in &candidate_lines {
202            let line_info = &ctx.lines[line_idx];
203            let line_content = &line_info.content;
204
205            // Check for URLs in this line
206            for url_match in SIMPLE_URL_REGEX.find_iter(line_content) {
207                let start_in_line = url_match.start();
208                let end_in_line = url_match.end();
209                let matched_str = &line_content[start_in_line..end_in_line];
210
211                // Skip invalid IPv6 patterns
212                if matched_str.contains("::") && !matched_str.contains('[') && matched_str.contains(']') {
213                    continue;
214                }
215
216                // Skip custom protocols that aren't standard web protocols
217                // Check if there's a custom protocol pattern before this match
218                if start_in_line > 0 {
219                    // Look back to see if this is part of a custom protocol URL
220                    let prefix_start = start_in_line.saturating_sub(20); // Look back up to 20 chars
221
222                    // Ensure we're on a character boundary
223                    let prefix_start = if prefix_start == 0 {
224                        0
225                    } else {
226                        // Find the nearest character boundary at or after prefix_start
227                        let mut adjusted_start = prefix_start;
228                        while adjusted_start < start_in_line && !line_content.is_char_boundary(adjusted_start) {
229                            adjusted_start += 1;
230                        }
231                        adjusted_start
232                    };
233
234                    let prefix = &line_content[prefix_start..start_in_line];
235                    if CUSTOM_PROTOCOL_PATTERN.is_match(prefix) {
236                        continue;
237                    }
238                }
239
240                let global_start = line_info.byte_offset + start_in_line;
241                let global_end = line_info.byte_offset + end_in_line;
242                all_matches.push((global_start, global_end, false));
243            }
244
245            // Check for IPv6 URLs
246            for url_match in IPV6_URL_REGEX.find_iter(line_content) {
247                let global_start = line_info.byte_offset + url_match.start();
248                let global_end = line_info.byte_offset + url_match.end();
249
250                // Remove any overlapping regular URL matches
251                all_matches.retain(|(start, end, _)| !(*start < global_end && *end > global_start));
252
253                all_matches.push((global_start, global_end, false));
254            }
255
256            // Check for emails in this line
257            for email_match in EMAIL_PATTERN.find_iter(line_content) {
258                let global_start = line_info.byte_offset + email_match.start();
259                let global_end = line_info.byte_offset + email_match.end();
260                all_matches.push((global_start, global_end, true));
261            }
262        }
263
264        // Process all matches
265        for (match_start, match_end_orig, is_email) in all_matches {
266            let mut match_end = match_end_orig;
267
268            // For URLs, trim trailing punctuation
269            if !is_email {
270                let raw_url = &content[match_start..match_end];
271                let trimmed_url = self.trim_trailing_punctuation(raw_url);
272                match_end = match_start + trimmed_url.len();
273            }
274
275            // Skip if became empty after trimming
276            if match_end <= match_start {
277                continue;
278            }
279
280            // Manual boundary check: not part of a larger word
281            // Use bytes for ASCII checks (more efficient)
282            let bytes = content.as_bytes();
283            let before_byte = if match_start == 0 {
284                None
285            } else {
286                bytes.get(match_start - 1).copied()
287            };
288            let after_byte = bytes.get(match_end).copied();
289
290            let is_valid_boundary = if is_email {
291                before_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_' && b != b'.')
292                    && after_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_' && b != b'.')
293            } else {
294                before_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_')
295                    && after_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_')
296            };
297
298            if !is_valid_boundary {
299                continue;
300            }
301
302            // Skip if this is within any skip context (code blocks, MkDocs snippets, etc.)
303            if crate::utils::skip_context::is_in_skip_context(ctx, match_start) {
304                continue;
305            }
306
307            // Skip if within any excluded range (link/image dest/HTML comment)
308            let in_any_range = merged.iter().any(|(start, end)| {
309                // For HTML comments and other exclusions, check if URL overlaps the range
310                (match_start >= *start && match_start < *end)
311                    || (match_end > *start && match_end <= *end)
312                    || (match_start < *start && match_end > *end)
313            });
314            if in_any_range {
315                continue;
316            }
317
318            // Get line information efficiently
319            let (line_num, col_num) = ctx.offset_to_line_col(match_start);
320
321            // Skip reference definitions for URLs
322            if !is_email
323                && let Some(line_info) = ctx.line_info(line_num)
324                && REFERENCE_DEF_RE.is_match(&line_info.content)
325            {
326                continue;
327            }
328
329            let matched_text = &content[match_start..match_end];
330            let line_info = ctx.line_info(line_num).unwrap();
331            let (start_line, start_col, end_line, end_col) =
332                calculate_url_range(line_num, &line_info.content, col_num - 1, matched_text.len());
333
334            let message = if is_email {
335                "Email address without angle brackets or link formatting".to_string()
336            } else {
337                "URL without angle brackets or link formatting".to_string()
338            };
339
340            warnings.push(LintWarning {
341                rule_name: Some(self.name()),
342                line: start_line,
343                column: start_col,
344                end_line,
345                end_column: end_col,
346                message,
347                severity: Severity::Warning,
348                fix: Some(Fix {
349                    range: match_start..match_end,
350                    replacement: format!("<{matched_text}>"),
351                }),
352            });
353        }
354
355        Ok(warnings)
356    }
357
358    /// AST-based bare URL detection: only flag URLs in text nodes not inside links/images/code/html
359    fn find_bare_urls_in_ast(
360        &self,
361        node: &Node,
362        parent_is_link_or_image: bool,
363        _content: &str,
364        warnings: &mut Vec<LintWarning>,
365        ctx: &LintContext,
366    ) {
367        use markdown::mdast::Node::*;
368        match node {
369            Text(text) if !parent_is_link_or_image => {
370                let text_str = &text.value;
371
372                // Check for URLs
373                for url_match in SIMPLE_URL_REGEX.find_iter(text_str) {
374                    let url_start = url_match.start();
375                    let mut url_end = url_match.end();
376
377                    // Trim trailing punctuation that's likely sentence punctuation
378                    let raw_url = &text_str[url_start..url_end];
379                    let trimmed_url = self.trim_trailing_punctuation(raw_url);
380                    url_end = url_start + trimmed_url.len();
381
382                    // Skip if URL became empty after trimming
383                    if url_end <= url_start {
384                        continue;
385                    }
386
387                    let before = if url_start == 0 {
388                        None
389                    } else {
390                        text_str.get(url_start - 1..url_start)
391                    };
392                    let after = text_str.get(url_end..url_end + 1);
393                    let is_valid_boundary = before
394                        .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
395                        && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_");
396                    if !is_valid_boundary {
397                        continue;
398                    }
399                    if let Some(pos) = &text.position {
400                        let offset = pos.start.offset + url_start;
401                        let (line, column) = ctx.offset_to_line_col(offset);
402                        let url_text = &text_str[url_start..url_end];
403                        let (start_line, start_col, end_line, end_col) =
404                            (line, column, line, column + url_text.chars().count());
405                        warnings.push(LintWarning {
406                            rule_name: Some(self.name()),
407                            line: start_line,
408                            column: start_col,
409                            end_line,
410                            end_column: end_col,
411                            message: "URL without angle brackets or link formatting".to_string(),
412                            severity: Severity::Warning,
413                            fix: Some(Fix {
414                                range: offset..(offset + url_text.len()),
415                                replacement: format!("<{url_text}>"),
416                            }),
417                        });
418                    }
419                }
420
421                // Check for email addresses
422                for email_match in EMAIL_PATTERN.find_iter(text_str) {
423                    let email_start = email_match.start();
424                    let email_end = email_match.end();
425                    let before = if email_start == 0 {
426                        None
427                    } else {
428                        text_str.get(email_start - 1..email_start)
429                    };
430                    let after = text_str.get(email_end..email_end + 1);
431                    let is_valid_boundary = before
432                        .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".")
433                        && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".");
434                    if !is_valid_boundary {
435                        continue;
436                    }
437                    if let Some(pos) = &text.position {
438                        let offset = pos.start.offset + email_start;
439                        let (line, column) = ctx.offset_to_line_col(offset);
440                        let email_text = &text_str[email_start..email_end];
441                        let (start_line, start_col, end_line, end_col) =
442                            (line, column, line, column + email_text.chars().count());
443                        warnings.push(LintWarning {
444                            rule_name: Some(self.name()),
445                            line: start_line,
446                            column: start_col,
447                            end_line,
448                            end_column: end_col,
449                            message: "Email address without angle brackets or link formatting (wrap like: <email>)"
450                                .to_string(),
451                            severity: Severity::Warning,
452                            fix: Some(Fix {
453                                range: offset..(offset + email_text.len()),
454                                replacement: format!("<{email_text}>"),
455                            }),
456                        });
457                    }
458                }
459            }
460            Link(link) => {
461                for child in &link.children {
462                    self.find_bare_urls_in_ast(child, true, _content, warnings, ctx);
463                }
464            }
465            Image(image) => {
466                // Only check alt text for bare URLs (rare, but possible)
467                let alt_str = &image.alt;
468                for url_match in SIMPLE_URL_REGEX.find_iter(alt_str) {
469                    let url_start = url_match.start();
470                    let mut url_end = url_match.end();
471
472                    // Trim trailing punctuation that's likely sentence punctuation
473                    let raw_url = &alt_str[url_start..url_end];
474                    let trimmed_url = self.trim_trailing_punctuation(raw_url);
475                    url_end = url_start + trimmed_url.len();
476
477                    // Skip if URL became empty after trimming
478                    if url_end <= url_start {
479                        continue;
480                    }
481
482                    let before = if url_start == 0 {
483                        None
484                    } else {
485                        alt_str.get(url_start - 1..url_start)
486                    };
487                    let after = alt_str.get(url_end..url_end + 1);
488                    let is_valid_boundary = before
489                        .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
490                        && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_");
491                    if !is_valid_boundary {
492                        continue;
493                    }
494                    if let Some(pos) = &image.position {
495                        let offset = pos.start.offset + url_start;
496                        let (line, column) = ctx.offset_to_line_col(offset);
497                        let url_text = &alt_str[url_start..url_end];
498                        let (start_line, start_col, end_line, end_col) =
499                            (line, column, line, column + url_text.chars().count());
500                        warnings.push(LintWarning {
501                            rule_name: Some(self.name()),
502                            line: start_line,
503                            column: start_col,
504                            end_line,
505                            end_column: end_col,
506                            message: "URL without angle brackets or link formatting".to_string(),
507                            severity: Severity::Warning,
508                            fix: Some(Fix {
509                                range: offset..(offset + url_text.len()),
510                                replacement: format!("<{url_text}>"),
511                            }),
512                        });
513                    }
514                }
515            }
516            Code(_) | InlineCode(_) | Html(_) => {
517                // Skip code and HTML nodes
518            }
519            _ => {
520                if let Some(children) = node.children() {
521                    for child in children {
522                        self.find_bare_urls_in_ast(child, false, _content, warnings, ctx);
523                    }
524                }
525            }
526        }
527    }
528
529    /// AST-based check method for MD034
530    pub fn check_ast(&self, ctx: &LintContext, ast: &Node) -> LintResult {
531        let mut warnings = Vec::new();
532        self.find_bare_urls_in_ast(ast, false, ctx.content, &mut warnings, ctx);
533        Ok(warnings)
534    }
535}
536
537impl Rule for MD034NoBareUrls {
538    fn name(&self) -> &'static str {
539        "MD034"
540    }
541
542    fn description(&self) -> &'static str {
543        "URL without angle brackets or link formatting"
544    }
545
546    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
547        // Use line-based detection to properly distinguish between bare URLs and autolinks
548        // AST-based approach doesn't work because CommonMark parser converts bare URLs to links
549        let content = ctx.content;
550
551        // Fast path: Early return for empty content
552        if content.is_empty() || self.should_skip(content) {
553            return Ok(Vec::new());
554        }
555
556        // Fallback path: create structure manually (should rarely be used)
557        let structure = crate::utils::document_structure::DocumentStructure::new(content);
558        self.check_with_structure(ctx, &structure)
559    }
560
561    fn check_with_ast(&self, ctx: &LintContext, ast: &MarkdownAst) -> LintResult {
562        // Use AST-based detection for better accuracy
563        let mut warnings = Vec::new();
564        self.find_bare_urls_in_ast(ast, false, ctx.content, &mut warnings, ctx);
565        Ok(warnings)
566    }
567
568    fn uses_ast(&self) -> bool {
569        // AST-based approach doesn't work because CommonMark parser converts bare URLs to links
570        // Use document structure approach instead
571        false
572    }
573
574    fn uses_document_structure(&self) -> bool {
575        true
576    }
577
578    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
579        let content = ctx.content;
580        if self.should_skip(content) {
581            return Ok(content.to_string());
582        }
583
584        // Get all warnings first - only fix URLs that are actually flagged
585        // Use structure-based detection to match the main linting path (since uses_document_structure() returns true)
586        let structure = crate::utils::document_structure::DocumentStructure::new(content);
587        let warnings = self.check_with_structure(ctx, &structure)?;
588        if warnings.is_empty() {
589            return Ok(content.to_string());
590        }
591
592        // Sort warnings by byte offset in reverse order (rightmost first) to avoid offset issues
593        let mut sorted_warnings = warnings.clone();
594        sorted_warnings.sort_by_key(|w| std::cmp::Reverse(w.fix.as_ref().map(|f| f.range.start).unwrap_or(0)));
595
596        let mut result = content.to_string();
597        for warning in sorted_warnings {
598            if let Some(fix) = &warning.fix {
599                let start = fix.range.start;
600                let end = fix.range.end;
601
602                if start <= result.len() && end <= result.len() && start < end {
603                    result.replace_range(start..end, &fix.replacement);
604                }
605            }
606        }
607
608        Ok(result)
609    }
610
611    /// Get the category of this rule for selective processing
612    fn category(&self) -> RuleCategory {
613        RuleCategory::Link
614    }
615
616    /// Check if this rule should be skipped based on content
617    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
618        self.should_skip(ctx.content)
619    }
620
621    fn as_any(&self) -> &dyn std::any::Any {
622        self
623    }
624
625    fn as_maybe_document_structure(&self) -> Option<&dyn crate::rule::MaybeDocumentStructure> {
626        Some(self)
627    }
628
629    fn as_maybe_ast(&self) -> Option<&dyn MaybeAst> {
630        Some(self)
631    }
632
633    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
634    where
635        Self: Sized,
636    {
637        Box::new(MD034NoBareUrls)
638    }
639}
640
641impl crate::utils::document_structure::DocumentStructureExtensions for MD034NoBareUrls {
642    fn has_relevant_elements(
643        &self,
644        ctx: &crate::lint_context::LintContext,
645        _doc_structure: &crate::utils::document_structure::DocumentStructure,
646    ) -> bool {
647        // This rule is only relevant if there might be URLs or emails in the content
648        !self.should_skip(ctx.content)
649    }
650}
651
652impl AstExtensions for MD034NoBareUrls {
653    fn has_relevant_ast_elements(&self, ctx: &LintContext, ast: &MarkdownAst) -> bool {
654        // Check if AST contains text nodes (where bare URLs would be)
655        use crate::utils::ast_utils::ast_contains_node_type;
656        !self.should_skip(ctx.content) && ast_contains_node_type(ast, "text")
657    }
658}
659
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;

    /// The quick pre-filter must fire on every alternative its pattern lists
    /// (`http(s)://`, `ftp(s)://`, and a bare `@` for emails) and stay quiet on
    /// text that contains none of them.
    #[test]
    fn test_url_quick_check() {
        // Scheme alternatives.
        assert!(URL_QUICK_CHECK.is_match("This is a URL: https://example.com"));
        assert!(URL_QUICK_CHECK.is_match("Plain http works too: http://example.com"));
        assert!(URL_QUICK_CHECK.is_match("FTP mirror: ftp://files.example.com"));
        assert!(URL_QUICK_CHECK.is_match("Secure FTP mirror: ftps://files.example.com"));

        // The `@` alternative is what lets lines with email addresses through.
        assert!(URL_QUICK_CHECK.is_match("Contact user@example.com"));

        // Nothing URL- or email-like: the pre-filter must reject.
        assert!(!URL_QUICK_CHECK.is_match("This has no URL"));
        assert!(!URL_QUICK_CHECK.is_match(""));
    }

    /// A heading made of one plain link followed by several badge links of the
    /// form `[![alt](image-url)](target-url)` must produce no warnings: every URL
    /// on the line is already a markdown link or image destination.
    #[test]
    fn test_multiple_badges_and_links_on_one_line() {
        let rule = MD034NoBareUrls;
        // The trailing `\` joins the following lines into one logical markdown line.
        let content = "# [React](https://react.dev/) \
&middot; [![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/facebook/react/blob/main/LICENSE) \
[![npm version](https://img.shields.io/npm/v/react.svg?style=flat)](https://www.npmjs.com/package/react) \
[![(Runtime) Build and Test](https://github.com/facebook/react/actions/workflows/runtime_build_and_test.yml/badge.svg)](https://github.com/facebook/react/actions/workflows/runtime_build_and_test.yml) \
[![(Compiler) TypeScript](https://github.com/facebook/react/actions/workflows/compiler_typescript.yml/badge.svg?branch=main)](https://github.com/facebook/react/actions/workflows/compiler_typescript.yml) \
[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://legacy.reactjs.org/docs/how-to-contribute.html#your-first-pull-request)";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        let result = rule.check(&ctx).unwrap();
        // Emit the unexpected warnings before the assertion fires, to ease debugging.
        if !result.is_empty() {
            log::debug!("MD034 warnings: {result:#?}");
        }
        assert!(
            result.is_empty(),
            "Multiple badges and links on one line should not be flagged as bare URLs"
        );
    }

    /// A single bare URL in running text yields exactly one warning, located at
    /// the URL itself.
    #[test]
    fn test_bare_urls() {
        let rule = MD034NoBareUrls;
        let content = "This is a bare URL: https://example.com/foobar";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        let result = rule.check(&ctx).unwrap();
        assert_eq!(result.len(), 1, "Bare URLs should be flagged");
        assert_eq!(result[0].line, 1);
        // The prefix "This is a bare URL: " is 20 characters, so the URL starts at column 21.
        assert_eq!(result[0].column, 21);
    }

    /// Coarse performance guard plus a count check on a synthetic 1100-line
    /// document: 250 bare URLs, 250 proper links, 500 URL-free lines and 100
    /// email lines. Expects 350 warnings and an average check time under the
    /// build-dependent threshold.
    #[test]
    fn test_md034_performance_baseline() {
        use std::time::Instant;

        // Generate test content with various URL patterns
        let mut content = String::with_capacity(50_000);

        // Add content with bare URLs (should be detected)
        for i in 0..250 {
            content.push_str(&format!("Line {i} with bare URL https://example{i}.com/path\n"));
        }

        // Add content with proper markdown links (should not be detected)
        for i in 0..250 {
            content.push_str(&format!(
                "Line {} with [proper link](https://example{}.com/path)\n",
                i + 250,
                i
            ));
        }

        // Add content with no URLs (should be fast)
        for i in 0..500 {
            content.push_str(&format!("Line {} with no URLs, just regular text content\n", i + 500));
        }

        // Add content with emails
        for i in 0..100 {
            content.push_str(&format!("Contact user{i}@example{i}.com for more info\n"));
        }

        // Count the lines once; the figure feeds several of the reports below.
        let line_count = content.lines().count();

        println!(
            "MD034 Performance Test - Content: {} bytes, {} lines",
            content.len(),
            line_count
        );

        let rule = MD034NoBareUrls;
        let ctx = LintContext::new(&content, crate::config::MarkdownFlavor::Standard);

        // Warm up
        let _ = rule.check(&ctx).unwrap();

        // Measure check performance (more runs for accuracy)
        let mut total_duration = std::time::Duration::ZERO;
        let runs = 10;
        let mut warnings_count = 0;

        for _ in 0..runs {
            // Only the `check` call is timed; bookkeeping stays outside the window.
            let start = Instant::now();
            let warnings = rule.check(&ctx).unwrap();
            total_duration += start.elapsed();
            warnings_count = warnings.len();
        }

        let avg_check_duration = total_duration / runs;

        println!("MD034 Optimized Performance:");
        println!(
            "- Average check time: {:?} ({:.2} ms)",
            avg_check_duration,
            avg_check_duration.as_secs_f64() * 1000.0
        );
        println!("- Found {warnings_count} warnings");
        println!(
            "- Lines per second: {:.0}",
            line_count as f64 / avg_check_duration.as_secs_f64()
        );
        println!(
            "- Microseconds per line: {:.2}",
            avg_check_duration.as_micros() as f64 / line_count as f64
        );

        // Performance assertion - should complete reasonably fast
        // Note: In debug builds this may take longer, so we use a higher threshold
        let max_duration_ms = if cfg!(debug_assertions) { 1000 } else { 100 };
        assert!(
            avg_check_duration.as_millis() < max_duration_ms,
            "MD034 check should complete in under {}ms, took {}ms",
            max_duration_ms,
            avg_check_duration.as_millis()
        );

        // Verify we're finding the expected number of warnings
        assert_eq!(warnings_count, 350, "Should find 250 URLs + 100 emails = 350 warnings");
    }
}
rumdl_lib/rules/md034_no_bare_urls.rs

rumdl_lib/rules/
md034_no_bare_urls.rs