1use crate::rule::{
5 AstExtensions, Fix, LintError, LintResult, LintWarning, MarkdownAst, MaybeAst, Rule, RuleCategory, Severity,
6};
7use crate::utils::range_utils::calculate_url_range;
8use crate::utils::regex_cache::EMAIL_PATTERN;
9
10use crate::lint_context::LintContext;
11use fancy_regex::Regex as FancyRegex;
12use lazy_static::lazy_static;
13use markdown::mdast::Node;
14use regex::Regex;
15
// Regexes used by MD034, compiled once on first use.
lazy_static! {
    // Cheap presence test: an `http(s)://` / `ftp(s)://` scheme prefix or a literal `@`.
    static ref URL_QUICK_CHECK: Regex = Regex::new(r#"(?:https?|ftps?)://|@"#).unwrap();

    // http/https/ftp/ftps URL that is NOT directly preceded by a word character, `[`, `(` or `<`
    // (negative look-behind, hence `fancy_regex`). The host is either a bracketed IPv6 literal or a
    // run of characters excluding whitespace, angle/square brackets, parentheses, backslash and
    // quotes; port, path, query and fragment are optional.
    static ref URL_REGEX: FancyRegex = FancyRegex::new(r#"(?<![\w\[\(\<])((?:https?|ftps?)://(?:\[[0-9a-fA-F:%]+\]|[^\s<>\[\]()\\'\"]+)(?::\d+)?(?:/[^\s<>\[\]()\\'\"]*)?(?:\?[^\s<>\[\]()\\'\"]*)?(?:#[^\s<>\[\]()\\'\"]*)?)"#).unwrap();
    // Same pattern text as `URL_REGEX`.
    static ref URL_FIX_REGEX: FancyRegex = FancyRegex::new(r#"(?<![\w\[\(\<])((?:https?|ftps?)://(?:\[[0-9a-fA-F:%]+\]|[^\s<>\[\]()\\'\"]+)(?::\d+)?(?:/[^\s<>\[\]()\\'\"]*)?(?:\?[^\s<>\[\]()\\'\"]*)?(?:#[^\s<>\[\]()\\'\"]*)?)"#).unwrap();

    // A non-HTTP/FTP scheme from a fixed list, followed by `://`.
    static ref CUSTOM_PROTOCOL_PATTERN: Regex = Regex::new(r#"(?:grpc|ws|wss|ssh|git|svn|file|data|javascript|vscode|chrome|about|slack|discord|matrix|irc|redis|mongodb|postgresql|mysql|kafka|nats|amqp|mqtt|custom|app|api|service)://"#).unwrap();

    // Inline link `[text](dest "title")`; the text may contain one level of nested brackets.
    // Capture group 1 is the destination.
    static ref MARKDOWN_LINK_PATTERN: Regex = Regex::new(r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap();

    // Angle-bracket autolink `<...>` wrapping either an http/https/ftp/ftps URL or an
    // email-like `local@domain.tld`. Capture group 1 is the text between the brackets.
    static ref ANGLE_LINK_PATTERN: Regex = Regex::new(r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#).unwrap();

    // A whole line that consists only of a linked image: `[![alt](image)](target)`.
    static ref BADGE_LINK_LINE: Regex = Regex::new(r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#).unwrap();

    // A whole string that is exactly one inline image: `![alt](src)`.
    static ref IMAGE_ONLY_LINK_TEXT_PATTERN: Regex = Regex::new(r#"^!\s*\[[^\]]*\]\s*\([^)]*\)$"#).unwrap();

    // Inline image `![alt](dest "title")`. Capture group 1 is the alt text, group 2 the destination.
    static ref MARKDOWN_IMAGE_PATTERN: Regex = Regex::new(r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap();

    // Look-around-free URL matcher (plain `regex` crate). Compared with `URL_REGEX` it also stops
    // at backticks, and the non-IPv6 host part additionally stops at `:`.
    static ref SIMPLE_URL_REGEX: Regex = Regex::new(r#"(https?|ftps?)://(?:\[[0-9a-fA-F:%.]+\](?::\d+)?|[^\s<>\[\]()\\'\"`:\]]+(?::\d+)?)(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#).unwrap();

    // URL whose host is a bracketed literal; the bracket content may hold hex digits, letters,
    // `:`, `%`, `.` and `-`. Optional port, path, query and fragment follow.
    static ref IPV6_URL_REGEX: Regex = Regex::new(r#"(https?|ftps?)://\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#).unwrap();

    // A whole line of the form `[label]: <http/https/ftp/ftps URL>` (reference definition).
    static ref REFERENCE_DEF_RE: Regex = Regex::new(r"^\s*\[[^\]]+\]:\s*(?:https?|ftps?)://\S+$").unwrap();

    // HTML comment `<!-- ... -->`, matched non-greedily and allowed to span lines.
    static ref HTML_COMMENT_PATTERN: Regex = Regex::new(r#"<!--[\s\S]*?-->"#).unwrap();
}
66
/// Lint rule MD034: reports URLs and email addresses that appear in the document
/// without angle brackets or link formatting, and offers a fix that wraps them in `<...>`.
///
/// The rule carries no configuration or state.
#[derive(Default, Clone)]
pub struct MD034NoBareUrls;
69
70impl MD034NoBareUrls {
71 #[inline]
72 pub fn should_skip(&self, content: &str) -> bool {
73 let bytes = content.as_bytes();
76 !bytes.contains(&b':') && !bytes.contains(&b'@')
77 }
78
79 fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
81 let trailing_punct = ['.', ',', ';', ':', '!', '?'];
82 let mut end = url.len();
83
84 while end > 0 {
86 let current_url = &url[..end];
88 if let Some((last_char_pos, last_char)) = current_url.char_indices().next_back() {
89 if trailing_punct.contains(&last_char) {
90 end = last_char_pos;
91 } else {
92 break;
93 }
94 } else {
95 break;
96 }
97 }
98
99 &url[..end]
100 }
101
102 fn find_bare_urls_in_ast(
104 &self,
105 node: &Node,
106 parent_is_link_or_image: bool,
107 _content: &str,
108 warnings: &mut Vec<LintWarning>,
109 ctx: &LintContext,
110 ) {
111 use markdown::mdast::Node::*;
112 match node {
113 Text(text) if !parent_is_link_or_image => {
114 let text_str = &text.value;
115
116 for url_match in SIMPLE_URL_REGEX.find_iter(text_str) {
118 let url_start = url_match.start();
119 let mut url_end = url_match.end();
120
121 let raw_url = &text_str[url_start..url_end];
123 let trimmed_url = self.trim_trailing_punctuation(raw_url);
124 url_end = url_start + trimmed_url.len();
125
126 if url_end <= url_start {
128 continue;
129 }
130
131 let before = if url_start == 0 {
132 None
133 } else {
134 text_str.get(url_start - 1..url_start)
135 };
136 let after = text_str.get(url_end..url_end + 1);
137 let is_valid_boundary = before
138 .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
139 && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_");
140 if !is_valid_boundary {
141 continue;
142 }
143 if let Some(pos) = &text.position {
144 let offset = pos.start.offset + url_start;
145 let (line, column) = ctx.offset_to_line_col(offset);
146 let url_text = &text_str[url_start..url_end];
147 let (start_line, start_col, end_line, end_col) =
148 (line, column, line, column + url_text.chars().count());
149 warnings.push(LintWarning {
150 rule_name: Some(self.name()),
151 line: start_line,
152 column: start_col,
153 end_line,
154 end_column: end_col,
155 message: "URL without angle brackets or link formatting".to_string(),
156 severity: Severity::Warning,
157 fix: Some(Fix {
158 range: offset..(offset + url_text.len()),
159 replacement: format!("<{url_text}>"),
160 }),
161 });
162 }
163 }
164
165 for email_match in EMAIL_PATTERN.find_iter(text_str) {
167 let email_start = email_match.start();
168 let email_end = email_match.end();
169 let before = if email_start == 0 {
170 None
171 } else {
172 text_str.get(email_start - 1..email_start)
173 };
174 let after = text_str.get(email_end..email_end + 1);
175 let is_valid_boundary = before
176 .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".")
177 && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".");
178 if !is_valid_boundary {
179 continue;
180 }
181 if let Some(pos) = &text.position {
182 let offset = pos.start.offset + email_start;
183 let (line, column) = ctx.offset_to_line_col(offset);
184 let email_text = &text_str[email_start..email_end];
185 let (start_line, start_col, end_line, end_col) =
186 (line, column, line, column + email_text.chars().count());
187 warnings.push(LintWarning {
188 rule_name: Some(self.name()),
189 line: start_line,
190 column: start_col,
191 end_line,
192 end_column: end_col,
193 message: "Email address without angle brackets or link formatting (wrap like: <email>)"
194 .to_string(),
195 severity: Severity::Warning,
196 fix: Some(Fix {
197 range: offset..(offset + email_text.len()),
198 replacement: format!("<{email_text}>"),
199 }),
200 });
201 }
202 }
203 }
204 Link(link) => {
205 for child in &link.children {
206 self.find_bare_urls_in_ast(child, true, _content, warnings, ctx);
207 }
208 }
209 Image(image) => {
210 let alt_str = &image.alt;
212 for url_match in SIMPLE_URL_REGEX.find_iter(alt_str) {
213 let url_start = url_match.start();
214 let mut url_end = url_match.end();
215
216 let raw_url = &alt_str[url_start..url_end];
218 let trimmed_url = self.trim_trailing_punctuation(raw_url);
219 url_end = url_start + trimmed_url.len();
220
221 if url_end <= url_start {
223 continue;
224 }
225
226 let before = if url_start == 0 {
227 None
228 } else {
229 alt_str.get(url_start - 1..url_start)
230 };
231 let after = alt_str.get(url_end..url_end + 1);
232 let is_valid_boundary = before
233 .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
234 && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_");
235 if !is_valid_boundary {
236 continue;
237 }
238 if let Some(pos) = &image.position {
239 let offset = pos.start.offset + url_start;
240 let (line, column) = ctx.offset_to_line_col(offset);
241 let url_text = &alt_str[url_start..url_end];
242 let (start_line, start_col, end_line, end_col) =
243 (line, column, line, column + url_text.chars().count());
244 warnings.push(LintWarning {
245 rule_name: Some(self.name()),
246 line: start_line,
247 column: start_col,
248 end_line,
249 end_column: end_col,
250 message: "URL without angle brackets or link formatting".to_string(),
251 severity: Severity::Warning,
252 fix: Some(Fix {
253 range: offset..(offset + url_text.len()),
254 replacement: format!("<{url_text}>"),
255 }),
256 });
257 }
258 }
259 }
260 Code(_) | InlineCode(_) | Html(_) => {
261 }
263 _ => {
264 if let Some(children) = node.children() {
265 for child in children {
266 self.find_bare_urls_in_ast(child, false, _content, warnings, ctx);
267 }
268 }
269 }
270 }
271 }
272
    /// Runs the AST-based detection over `ast` (starting outside of any link) and
    /// returns all warnings that were collected. Always returns `Ok`.
    pub fn check_ast(&self, ctx: &LintContext, ast: &Node) -> LintResult {
        let mut warnings = Vec::new();
        self.find_bare_urls_in_ast(ast, false, ctx.content, &mut warnings, ctx);
        Ok(warnings)
    }
279}
280
281impl Rule for MD034NoBareUrls {
    /// Identifier of this rule; also used as `rule_name` in emitted warnings.
    fn name(&self) -> &'static str {
        "MD034"
    }
285
    /// One-line human-readable summary of what the rule reports.
    fn description(&self) -> &'static str {
        "URL without angle brackets or link formatting"
    }
289
290 fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
291 let content = ctx.content;
294
295 if content.is_empty() || self.should_skip(content) {
297 return Ok(Vec::new());
298 }
299 let mut warnings = Vec::new();
300
301 let mut excluded_ranges: Vec<(usize, usize)> = Vec::new();
303
304 for cap in MARKDOWN_LINK_PATTERN.captures_iter(content) {
306 if let Some(dest) = cap.get(1) {
307 excluded_ranges.push((dest.start(), dest.end()));
308 }
309 if let Some(full_match) = cap.get(0) {
311 excluded_ranges.push((full_match.start(), full_match.end()));
312 }
313 }
314
315 for cap in MARKDOWN_IMAGE_PATTERN.captures_iter(content) {
317 if let Some(dest) = cap.get(2) {
318 excluded_ranges.push((dest.start(), dest.end()));
319 }
320 }
321
322 for cap in ANGLE_LINK_PATTERN.captures_iter(content) {
324 if let Some(m) = cap.get(1) {
325 excluded_ranges.push((m.start(), m.end()));
326 }
327 }
328
329 for html_tag in ctx.html_tags().iter() {
331 excluded_ranges.push((html_tag.byte_offset, html_tag.byte_end));
332 }
333
334 for cap in HTML_COMMENT_PATTERN.captures_iter(content) {
336 if let Some(comment) = cap.get(0) {
337 excluded_ranges.push((comment.start(), comment.end()));
338 }
339 }
340
341 excluded_ranges.sort_by_key(|r| r.0);
343 let mut merged: Vec<(usize, usize)> = Vec::new();
344 for (start, end) in excluded_ranges {
345 if let Some((_, last_end)) = merged.last_mut()
346 && *last_end >= start
347 {
348 *last_end = (*last_end).max(end);
349 continue;
350 }
351 merged.push((start, end));
352 }
353
354 let mut all_matches: Vec<(usize, usize, bool)> = Vec::new(); if !content.contains("://") && !content.contains('@') {
360 return Ok(warnings);
361 }
362
363 let mut candidate_lines = Vec::new();
365 for (line_idx, line_info) in ctx.lines.iter().enumerate() {
366 if line_info.in_code_block {
368 continue;
369 }
370
371 let line_content = &line_info.content;
372 let bytes = line_content.as_bytes();
373
374 let has_url = bytes.contains(&b':') && line_content.contains("://");
376 let has_email = bytes.contains(&b'@');
377
378 if has_url || has_email {
379 candidate_lines.push(line_idx);
380 }
381 }
382
383 for &line_idx in &candidate_lines {
385 let line_info = &ctx.lines[line_idx];
386 let line_content = &line_info.content;
387
388 for url_match in SIMPLE_URL_REGEX.find_iter(line_content) {
390 let start_in_line = url_match.start();
391 let end_in_line = url_match.end();
392 let matched_str = &line_content[start_in_line..end_in_line];
393
394 if matched_str.contains("::") && !matched_str.contains('[') && matched_str.contains(']') {
396 continue;
397 }
398
399 if start_in_line > 0 {
402 let prefix_start = start_in_line.saturating_sub(20); let prefix_start = if prefix_start == 0 {
407 0
408 } else {
409 let mut adjusted_start = prefix_start;
411 while adjusted_start < start_in_line && !line_content.is_char_boundary(adjusted_start) {
412 adjusted_start += 1;
413 }
414 adjusted_start
415 };
416
417 let prefix = &line_content[prefix_start..start_in_line];
418 if CUSTOM_PROTOCOL_PATTERN.is_match(prefix) {
419 continue;
420 }
421 }
422
423 let global_start = line_info.byte_offset + start_in_line;
424 let global_end = line_info.byte_offset + end_in_line;
425 all_matches.push((global_start, global_end, false));
426 }
427
428 for url_match in IPV6_URL_REGEX.find_iter(line_content) {
430 let global_start = line_info.byte_offset + url_match.start();
431 let global_end = line_info.byte_offset + url_match.end();
432
433 all_matches.retain(|(start, end, _)| !(*start < global_end && *end > global_start));
435
436 all_matches.push((global_start, global_end, false));
437 }
438
439 for email_match in EMAIL_PATTERN.find_iter(line_content) {
441 let global_start = line_info.byte_offset + email_match.start();
442 let global_end = line_info.byte_offset + email_match.end();
443 all_matches.push((global_start, global_end, true));
444 }
445 }
446
447 for (match_start, match_end_orig, is_email) in all_matches {
449 let mut match_end = match_end_orig;
450
451 if !is_email {
453 let raw_url = &content[match_start..match_end];
454 let trimmed_url = self.trim_trailing_punctuation(raw_url);
455 match_end = match_start + trimmed_url.len();
456 }
457
458 if match_end <= match_start {
460 continue;
461 }
462
463 let bytes = content.as_bytes();
466 let before_byte = if match_start == 0 {
467 None
468 } else {
469 bytes.get(match_start - 1).copied()
470 };
471 let after_byte = bytes.get(match_end).copied();
472
473 let is_valid_boundary = if is_email {
474 before_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_' && b != b'.')
475 && after_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_' && b != b'.')
476 } else {
477 before_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_')
478 && after_byte.is_none_or(|b| !b.is_ascii_alphanumeric() && b != b'_')
479 };
480
481 if !is_valid_boundary {
482 continue;
483 }
484
485 if crate::utils::skip_context::is_in_skip_context(ctx, match_start) {
487 continue;
488 }
489
490 let in_any_range = merged.iter().any(|(start, end)| {
492 (match_start >= *start && match_start < *end)
494 || (match_end > *start && match_end <= *end)
495 || (match_start < *start && match_end > *end)
496 });
497 if in_any_range {
498 continue;
499 }
500
501 let (line_num, col_num) = ctx.offset_to_line_col(match_start);
503
504 if !is_email
506 && let Some(line_info) = ctx.line_info(line_num)
507 && REFERENCE_DEF_RE.is_match(&line_info.content)
508 {
509 continue;
510 }
511
512 let matched_text = &content[match_start..match_end];
513 let line_info = ctx.line_info(line_num).unwrap();
514 let (start_line, start_col, end_line, end_col) =
515 calculate_url_range(line_num, &line_info.content, col_num - 1, matched_text.len());
516
517 let message = if is_email {
518 "Email address without angle brackets or link formatting".to_string()
519 } else {
520 "URL without angle brackets or link formatting".to_string()
521 };
522
523 warnings.push(LintWarning {
524 rule_name: Some(self.name()),
525 line: start_line,
526 column: start_col,
527 end_line,
528 end_column: end_col,
529 message,
530 severity: Severity::Warning,
531 fix: Some(Fix {
532 range: match_start..match_end,
533 replacement: format!("<{matched_text}>"),
534 }),
535 });
536 }
537
538 Ok(warnings)
539 }
540
541 fn check_with_ast(&self, ctx: &LintContext, ast: &MarkdownAst) -> LintResult {
542 let mut warnings = Vec::new();
544 self.find_bare_urls_in_ast(ast, false, ctx.content, &mut warnings, ctx);
545 Ok(warnings)
546 }
547
    /// Reports that this rule does not use the AST: returns `false`, even though
    /// `check_with_ast` is implemented.
    fn uses_ast(&self) -> bool {
        false
    }
553
554 fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
555 let content = ctx.content;
556 if self.should_skip(content) {
557 return Ok(content.to_string());
558 }
559
560 let warnings = self.check(ctx)?;
562 if warnings.is_empty() {
563 return Ok(content.to_string());
564 }
565
566 let mut sorted_warnings = warnings.clone();
568 sorted_warnings.sort_by_key(|w| std::cmp::Reverse(w.fix.as_ref().map(|f| f.range.start).unwrap_or(0)));
569
570 let mut result = content.to_string();
571 for warning in sorted_warnings {
572 if let Some(fix) = &warning.fix {
573 let start = fix.range.start;
574 let end = fix.range.end;
575
576 if start <= result.len() && end <= result.len() && start < end {
577 result.replace_range(start..end, &fix.replacement);
578 }
579 }
580 }
581
582 Ok(result)
583 }
584
    /// This rule belongs to the link category.
    fn category(&self) -> RuleCategory {
        RuleCategory::Link
    }
589
    /// Trait-level skip check: forwards the document text to the inherent
    /// `should_skip(&str)` pre-filter of `MD034NoBareUrls`.
    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
        self.should_skip(ctx.content)
    }
594
    /// Exposes the rule as `&dyn Any`, e.g. for downcasting to the concrete type.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
598
    /// Exposes the rule through its `MaybeAst` interface; always `Some`.
    fn as_maybe_ast(&self) -> Option<&dyn MaybeAst> {
        Some(self)
    }
602
    /// Creates the rule as a boxed trait object. The configuration is ignored
    /// because the rule has no options.
    fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
    where
        Self: Sized,
    {
        Box::new(MD034NoBareUrls)
    }
609}
610
611impl AstExtensions for MD034NoBareUrls {
612 fn has_relevant_ast_elements(&self, ctx: &LintContext, ast: &MarkdownAst) -> bool {
613 use crate::utils::ast_utils::ast_contains_node_type;
615 !self.should_skip(ctx.content) && ast_contains_node_type(ast, "text")
616 }
617}
618
619#[cfg(test)]
620mod tests {
621 use super::*;
622 use crate::lint_context::LintContext;
623
    /// `URL_QUICK_CHECK` must match text containing an `https://` URL and must not
    /// match text with neither a URL scheme nor an `@`.
    #[test]
    fn test_url_quick_check() {
        assert!(URL_QUICK_CHECK.is_match("This is a URL: https://example.com"));
        assert!(!URL_QUICK_CHECK.is_match("This has no URL"));
    }
629
    /// A heading line made up solely of inline links must produce no warnings:
    /// every URL on it is a link destination.
    #[test]
    fn test_multiple_badges_and_links_on_one_line() {
        let rule = MD034NoBareUrls;
        // One logical line; the trailing `\` continues the string literal.
        let content = "# [React](https://react.dev/) \
· [](https://github.com/facebook/react/blob/main/LICENSE) \
[](https://www.npmjs.com/package/react) \
[](https://github.com/facebook/react/actions/workflows/runtime_build_and_test.yml) \
[](https://github.com/facebook/react/actions/workflows/compiler_typescript.yml) \
[](https://legacy.reactjs.org/docs/how-to-contribute.html#your-first-pull-request)";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        let result = rule.check(&ctx).unwrap();
        // Log the unexpected warnings to ease debugging before the assertion fires.
        if !result.is_empty() {
            log::debug!("MD034 warnings: {result:#?}");
        }
        assert!(
            result.is_empty(),
            "Multiple badges and links on one line should not be flagged as bare URLs"
        );
    }
649
650 #[test]
651 fn test_bare_urls() {
652 let rule = MD034NoBareUrls;
653 let content = "This is a bare URL: https://example.com/foobar";
654 let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);
655 let result = rule.check(&ctx).unwrap();
656 assert_eq!(result.len(), 1, "Bare URLs should be flagged");
657 assert_eq!(result[0].line, 1);
658 assert_eq!(result[0].column, 21);
659 }
660
661 #[test]
662 fn test_md034_performance_baseline() {
663 use std::time::Instant;
664
665 let mut content = String::with_capacity(50_000);
667
668 for i in 0..250 {
670 content.push_str(&format!("Line {i} with bare URL https://example{i}.com/path\n"));
671 }
672
673 for i in 0..250 {
675 content.push_str(&format!(
676 "Line {} with [proper link](https://example{}.com/path)\n",
677 i + 250,
678 i
679 ));
680 }
681
682 for i in 0..500 {
684 content.push_str(&format!("Line {} with no URLs, just regular text content\n", i + 500));
685 }
686
687 for i in 0..100 {
689 content.push_str(&format!("Contact user{i}@example{i}.com for more info\n"));
690 }
691
692 println!(
693 "MD034 Performance Test - Content: {} bytes, {} lines",
694 content.len(),
695 content.lines().count()
696 );
697
698 let rule = MD034NoBareUrls;
699 let ctx = LintContext::new(&content, crate::config::MarkdownFlavor::Standard);
700
701 let _ = rule.check(&ctx).unwrap();
703
704 let mut total_duration = std::time::Duration::ZERO;
706 let runs = 10;
707 let mut warnings_count = 0;
708
709 for _ in 0..runs {
710 let start = Instant::now();
711 let warnings = rule.check(&ctx).unwrap();
712 total_duration += start.elapsed();
713 warnings_count = warnings.len();
714 }
715
716 let avg_check_duration = total_duration / runs;
717
718 println!("MD034 Optimized Performance:");
719 println!(
720 "- Average check time: {:?} ({:.2} ms)",
721 avg_check_duration,
722 avg_check_duration.as_secs_f64() * 1000.0
723 );
724 println!("- Found {warnings_count} warnings");
725 println!(
726 "- Lines per second: {:.0}",
727 content.lines().count() as f64 / avg_check_duration.as_secs_f64()
728 );
729 println!(
730 "- Microseconds per line: {:.2}",
731 avg_check_duration.as_micros() as f64 / content.lines().count() as f64
732 );
733
734 let max_duration_ms = if cfg!(debug_assertions) { 1000 } else { 100 };
737 assert!(
738 avg_check_duration.as_millis() < max_duration_ms,
739 "MD034 check should complete in under {}ms, took {}ms",
740 max_duration_ms,
741 avg_check_duration.as_millis()
742 );
743
744 assert_eq!(warnings_count, 350, "Should find 250 URLs + 100 emails = 350 warnings");
746 }
747}