1use crate::rule::{
5 AstExtensions, Fix, LintError, LintResult, LintWarning, MarkdownAst, MaybeAst, Rule, RuleCategory, Severity,
6};
7use crate::utils::early_returns;
8use crate::utils::range_utils::calculate_url_range;
9use crate::utils::regex_cache::EMAIL_PATTERN;
10
11use crate::lint_context::LintContext;
12use fancy_regex::Regex as FancyRegex;
13use lazy_static::lazy_static;
14use markdown::mdast::Node;
15use regex::Regex;
16
lazy_static! {
    // Cheap pre-filter: matches as soon as the text contains an http(s)/ftp(s) scheme
    // separator or an `@` sign.
    static ref URL_QUICK_CHECK: Regex = Regex::new(r#"(?:https?|ftps?)://|@"#).unwrap();

    // URL matcher with a look-behind that rejects candidates directly preceded by a word
    // character, `[`, `(` or `<`. Host is either a bracketed IPv6 literal or a run of
    // non-delimiter characters; port, path, query and fragment are optional.
    // NOTE(review): not referenced by the code visible in this file.
    static ref URL_REGEX: FancyRegex = FancyRegex::new(r#"(?<![\w\[\(\<])((?:https?|ftps?)://(?:\[[0-9a-fA-F:%]+\]|[^\s<>\[\]()\\'\"]+)(?::\d+)?(?:/[^\s<>\[\]()\\'\"]*)?(?:\?[^\s<>\[\]()\\'\"]*)?(?:#[^\s<>\[\]()\\'\"]*)?)"#).unwrap();
    // Same pattern as URL_REGEX.
    // NOTE(review): not referenced by the code visible in this file.
    static ref URL_FIX_REGEX: FancyRegex = FancyRegex::new(r#"(?<![\w\[\(\<])((?:https?|ftps?)://(?:\[[0-9a-fA-F:%]+\]|[^\s<>\[\]()\\'\"]+)(?::\d+)?(?:/[^\s<>\[\]()\\'\"]*)?(?:\?[^\s<>\[\]()\\'\"]*)?(?:#[^\s<>\[\]()\\'\"]*)?)"#).unwrap();

    // Inline link `[text](destination "optional title")`; the text may contain one level
    // of nested brackets. Capture group 1 is the destination.
    static ref MARKDOWN_LINK_PATTERN: Regex = Regex::new(r#"\[(?:[^\[\]]|\[[^\]]*\])*\]\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap();

    // Autolink in angle brackets: `<scheme://…>` or `<user@host.tld>`. Capture group 1 is
    // the text between the brackets.
    static ref ANGLE_LINK_PATTERN: Regex = Regex::new(r#"<((?:https?|ftps?)://(?:\[[0-9a-fA-F:]+(?:%[a-zA-Z0-9]+)?\]|[^>]+)|[^@\s]+@[^@\s]+\.[^@\s>]+)>"#).unwrap();

    // A line that consists solely of a linked image (badge): `[![alt](image)](target)`.
    // NOTE(review): not referenced by the code visible in this file.
    static ref BADGE_LINK_LINE: Regex = Regex::new(r#"^\s*\[!\[[^\]]*\]\([^)]*\)\]\([^)]*\)\s*$"#).unwrap();

    // A string that is exactly one image `![alt](source)`.
    // NOTE(review): not referenced by the code visible in this file.
    static ref IMAGE_ONLY_LINK_TEXT_PATTERN: Regex = Regex::new(r#"^!\s*\[[^\]]*\]\s*\([^)]*\)$"#).unwrap();

    // Image `![alt](destination "optional title")`. Capture group 1 is the alt text,
    // capture group 2 the destination.
    static ref MARKDOWN_IMAGE_PATTERN: Regex = Regex::new(r#"!\s*\[([^\]]*)\]\s*\(([^)\s]+)(?:\s+(?:\"[^\"]*\"|\'[^\']*\'))?\)"#).unwrap();

    // Candidate URL matcher without look-around (plain `regex` crate). Besides whitespace,
    // brackets, parentheses, backslash and quotes it also stops at a backtick.
    static ref SIMPLE_URL_REGEX: Regex = Regex::new(r#"(https?|ftps?)://(?:\[[0-9a-fA-F:%.]+\](?::\d+)?|[^\s<>\[\]()\\'\"`:\]]+(?::\d+)?)(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#).unwrap();

    // URL whose host is a bracketed literal (`http://[::1]:8080/path`); the bracket
    // content additionally admits letters and `-`.
    static ref IPV6_URL_REGEX: Regex = Regex::new(r#"(https?|ftps?)://\[[0-9a-fA-F:%.\-a-zA-Z]+\](?::\d+)?(?:/[^\s<>\[\]()\\'\"`]*)?(?:\?[^\s<>\[\]()\\'\"`]*)?(?:#[^\s<>\[\]()\\'\"`]*)?"#).unwrap();

    // A whole line of the form `[label]: scheme://target` (link reference definition).
    static ref REFERENCE_DEF_RE: Regex = Regex::new(r"^\s*\[[^\]]+\]:\s*(?:https?|ftps?)://\S+$").unwrap();

    // HTML comment `<!-- … -->`, non-greedy and allowed to span several lines.
    static ref HTML_COMMENT_PATTERN: Regex = Regex::new(r#"<!--[\s\S]*?-->"#).unwrap();
}
63
/// Lint rule MD034: reports URLs and e-mail addresses that appear in running text
/// without angle brackets or link formatting.
///
/// The rule is stateless, so the type is a unit struct.
#[derive(Debug, Default, Clone)]
pub struct MD034NoBareUrls;
66
67impl MD034NoBareUrls {
68 #[inline]
69 pub fn should_skip(&self, content: &str) -> bool {
70 !early_returns::has_urls(content) && !content.contains('@')
72 }
73
74 fn trim_trailing_punctuation<'a>(&self, url: &'a str) -> &'a str {
76 let trailing_punct = ['.', ',', ';', ':', '!', '?'];
77 let mut end = url.len();
78
79 while end > 0 {
81 if let Some(last_char) = url.chars().nth(end - 1) {
82 if trailing_punct.contains(&last_char) {
83 end -= last_char.len_utf8();
84 } else {
85 break;
86 }
87 } else {
88 break;
89 }
90 }
91
92 &url[..end]
93 }
94
95 pub fn check_with_structure(
97 &self,
98 ctx: &crate::lint_context::LintContext,
99 _structure: &crate::utils::document_structure::DocumentStructure,
100 ) -> LintResult {
101 let content = ctx.content;
102
103 if self.should_skip(content) {
105 return Ok(vec![]);
106 }
107
108 let mut warnings = Vec::new();
110
111 let mut excluded_ranges: Vec<(usize, usize)> = Vec::new();
113
114 for cap in MARKDOWN_LINK_PATTERN.captures_iter(content) {
116 if let Some(dest) = cap.get(1) {
117 excluded_ranges.push((dest.start(), dest.end()));
118 }
119 if let Some(full_match) = cap.get(0) {
121 excluded_ranges.push((full_match.start(), full_match.end()));
122 }
123 }
124
125 for cap in MARKDOWN_IMAGE_PATTERN.captures_iter(content) {
127 if let Some(dest) = cap.get(2) {
128 excluded_ranges.push((dest.start(), dest.end()));
129 }
130 }
131
132 for cap in ANGLE_LINK_PATTERN.captures_iter(content) {
134 if let Some(m) = cap.get(1) {
135 excluded_ranges.push((m.start(), m.end()));
136 }
137 }
138
139 for html_tag in ctx.html_tags().iter() {
141 excluded_ranges.push((html_tag.byte_offset, html_tag.byte_end));
142 }
143
144 for cap in HTML_COMMENT_PATTERN.captures_iter(content) {
146 if let Some(comment) = cap.get(0) {
147 excluded_ranges.push((comment.start(), comment.end()));
148 }
149 }
150
151 excluded_ranges.sort_by_key(|r| r.0);
153 let mut merged: Vec<(usize, usize)> = Vec::new();
154 for (start, end) in excluded_ranges {
155 if let Some((_, last_end)) = merged.last_mut()
156 && *last_end >= start
157 {
158 *last_end = (*last_end).max(end);
159 continue;
160 }
161 merged.push((start, end));
162 }
163
164 let mut all_matches: Vec<(usize, usize, bool)> = Vec::new(); if !content.contains("://") && !content.contains('@') {
170 return Ok(warnings);
171 }
172
173 for line_info in ctx.lines.iter() {
175 let line_content = &line_info.content;
176
177 if line_info.in_code_block {
179 continue;
180 }
181
182 if !line_content.contains("://") && !line_content.contains('@') {
184 continue;
185 }
186
187 for url_match in SIMPLE_URL_REGEX.find_iter(line_content) {
189 let start_in_line = url_match.start();
190 let end_in_line = url_match.end();
191 let matched_str = &line_content[start_in_line..end_in_line];
192
193 if matched_str.contains("::") && !matched_str.contains('[') && matched_str.contains(']') {
195 continue;
196 }
197
198 let global_start = line_info.byte_offset + start_in_line;
199 let global_end = line_info.byte_offset + end_in_line;
200 all_matches.push((global_start, global_end, false));
201 }
202
203 for url_match in IPV6_URL_REGEX.find_iter(line_content) {
205 let global_start = line_info.byte_offset + url_match.start();
206 let global_end = line_info.byte_offset + url_match.end();
207
208 all_matches.retain(|(start, end, _)| !(*start < global_end && *end > global_start));
210
211 all_matches.push((global_start, global_end, false));
212 }
213
214 for email_match in EMAIL_PATTERN.find_iter(line_content) {
216 let global_start = line_info.byte_offset + email_match.start();
217 let global_end = line_info.byte_offset + email_match.end();
218 all_matches.push((global_start, global_end, true));
219 }
220 }
221
222 for (match_start, match_end_orig, is_email) in all_matches {
224 let mut match_end = match_end_orig;
225
226 if !is_email {
228 let raw_url = &content[match_start..match_end];
229 let trimmed_url = self.trim_trailing_punctuation(raw_url);
230 match_end = match_start + trimmed_url.len();
231 }
232
233 if match_end <= match_start {
235 continue;
236 }
237
238 let before = if match_start == 0 {
240 None
241 } else {
242 content.get(match_start - 1..match_start)
243 };
244 let after = content.get(match_end..match_end + 1);
245
246 let is_valid_boundary = if is_email {
247 before.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".")
248 && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".")
249 } else {
250 before.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
251 && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
252 };
253
254 if !is_valid_boundary {
255 continue;
256 }
257
258 if ctx.is_in_code_block_or_span(match_start) {
260 continue;
261 }
262
263 let in_any_range = merged.iter().any(|(start, end)| {
265 (match_start >= *start && match_start < *end)
267 || (match_end > *start && match_end <= *end)
268 || (match_start < *start && match_end > *end)
269 });
270 if in_any_range {
271 continue;
272 }
273
274 let (line_num, col_num) = ctx.offset_to_line_col(match_start);
276
277 if !is_email
279 && let Some(line_info) = ctx.line_info(line_num)
280 && REFERENCE_DEF_RE.is_match(&line_info.content)
281 {
282 continue;
283 }
284
285 let matched_text = &content[match_start..match_end];
286 let line_info = ctx.line_info(line_num).unwrap();
287 let (start_line, start_col, end_line, end_col) =
288 calculate_url_range(line_num, &line_info.content, col_num - 1, matched_text.len());
289
290 let message = if is_email {
291 "Email address without angle brackets or link formatting".to_string()
292 } else {
293 "URL without angle brackets or link formatting".to_string()
294 };
295
296 warnings.push(LintWarning {
297 rule_name: Some(self.name()),
298 line: start_line,
299 column: start_col,
300 end_line,
301 end_column: end_col,
302 message,
303 severity: Severity::Warning,
304 fix: Some(Fix {
305 range: match_start..match_end,
306 replacement: format!("<{matched_text}>"),
307 }),
308 });
309 }
310
311 Ok(warnings)
312 }
313
314 fn find_bare_urls_in_ast(
316 &self,
317 node: &Node,
318 parent_is_link_or_image: bool,
319 _content: &str,
320 warnings: &mut Vec<LintWarning>,
321 ctx: &LintContext,
322 ) {
323 use markdown::mdast::Node::*;
324 match node {
325 Text(text) if !parent_is_link_or_image => {
326 let text_str = &text.value;
327
328 for url_match in SIMPLE_URL_REGEX.find_iter(text_str) {
330 let url_start = url_match.start();
331 let mut url_end = url_match.end();
332
333 let raw_url = &text_str[url_start..url_end];
335 let trimmed_url = self.trim_trailing_punctuation(raw_url);
336 url_end = url_start + trimmed_url.len();
337
338 if url_end <= url_start {
340 continue;
341 }
342
343 let before = if url_start == 0 {
344 None
345 } else {
346 text_str.get(url_start - 1..url_start)
347 };
348 let after = text_str.get(url_end..url_end + 1);
349 let is_valid_boundary = before
350 .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
351 && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_");
352 if !is_valid_boundary {
353 continue;
354 }
355 if let Some(pos) = &text.position {
356 let offset = pos.start.offset + url_start;
357 let (line, column) = ctx.offset_to_line_col(offset);
358 let url_text = &text_str[url_start..url_end];
359 let (start_line, start_col, end_line, end_col) =
360 (line, column, line, column + url_text.chars().count());
361 warnings.push(LintWarning {
362 rule_name: Some(self.name()),
363 line: start_line,
364 column: start_col,
365 end_line,
366 end_column: end_col,
367 message: "URL without angle brackets or link formatting".to_string(),
368 severity: Severity::Warning,
369 fix: Some(Fix {
370 range: offset..(offset + url_text.len()),
371 replacement: format!("<{url_text}>"),
372 }),
373 });
374 }
375 }
376
377 for email_match in EMAIL_PATTERN.find_iter(text_str) {
379 let email_start = email_match.start();
380 let email_end = email_match.end();
381 let before = if email_start == 0 {
382 None
383 } else {
384 text_str.get(email_start - 1..email_start)
385 };
386 let after = text_str.get(email_end..email_end + 1);
387 let is_valid_boundary = before
388 .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".")
389 && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_" && c != ".");
390 if !is_valid_boundary {
391 continue;
392 }
393 if let Some(pos) = &text.position {
394 let offset = pos.start.offset + email_start;
395 let (line, column) = ctx.offset_to_line_col(offset);
396 let email_text = &text_str[email_start..email_end];
397 let (start_line, start_col, end_line, end_col) =
398 (line, column, line, column + email_text.chars().count());
399 warnings.push(LintWarning {
400 rule_name: Some(self.name()),
401 line: start_line,
402 column: start_col,
403 end_line,
404 end_column: end_col,
405 message: "Email address without angle brackets or link formatting (wrap like: <email>)"
406 .to_string(),
407 severity: Severity::Warning,
408 fix: Some(Fix {
409 range: offset..(offset + email_text.len()),
410 replacement: format!("<{email_text}>"),
411 }),
412 });
413 }
414 }
415 }
416 Link(link) => {
417 for child in &link.children {
418 self.find_bare_urls_in_ast(child, true, _content, warnings, ctx);
419 }
420 }
421 Image(image) => {
422 let alt_str = &image.alt;
424 for url_match in SIMPLE_URL_REGEX.find_iter(alt_str) {
425 let url_start = url_match.start();
426 let mut url_end = url_match.end();
427
428 let raw_url = &alt_str[url_start..url_end];
430 let trimmed_url = self.trim_trailing_punctuation(raw_url);
431 url_end = url_start + trimmed_url.len();
432
433 if url_end <= url_start {
435 continue;
436 }
437
438 let before = if url_start == 0 {
439 None
440 } else {
441 alt_str.get(url_start - 1..url_start)
442 };
443 let after = alt_str.get(url_end..url_end + 1);
444 let is_valid_boundary = before
445 .is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_")
446 && after.is_none_or(|c| !c.chars().next().unwrap().is_alphanumeric() && c != "_");
447 if !is_valid_boundary {
448 continue;
449 }
450 if let Some(pos) = &image.position {
451 let offset = pos.start.offset + url_start;
452 let (line, column) = ctx.offset_to_line_col(offset);
453 let url_text = &alt_str[url_start..url_end];
454 let (start_line, start_col, end_line, end_col) =
455 (line, column, line, column + url_text.chars().count());
456 warnings.push(LintWarning {
457 rule_name: Some(self.name()),
458 line: start_line,
459 column: start_col,
460 end_line,
461 end_column: end_col,
462 message: "URL without angle brackets or link formatting".to_string(),
463 severity: Severity::Warning,
464 fix: Some(Fix {
465 range: offset..(offset + url_text.len()),
466 replacement: format!("<{url_text}>"),
467 }),
468 });
469 }
470 }
471 }
472 Code(_) | InlineCode(_) | Html(_) => {
473 }
475 _ => {
476 if let Some(children) = node.children() {
477 for child in children {
478 self.find_bare_urls_in_ast(child, false, _content, warnings, ctx);
479 }
480 }
481 }
482 }
483 }
484
485 pub fn check_ast(&self, ctx: &LintContext, ast: &Node) -> LintResult {
487 let mut warnings = Vec::new();
488 self.find_bare_urls_in_ast(ast, false, ctx.content, &mut warnings, ctx);
489 Ok(warnings)
490 }
491}
492
493impl Rule for MD034NoBareUrls {
494 fn name(&self) -> &'static str {
495 "MD034"
496 }
497
498 fn description(&self) -> &'static str {
499 "URL without angle brackets or link formatting"
500 }
501
502 fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
503 let content = ctx.content;
506
507 if content.is_empty() {
509 return Ok(Vec::new());
510 }
511
512 if !content.contains("http://")
514 && !content.contains("https://")
515 && !content.contains("ftp://")
516 && !content.contains("ftps://")
517 && !content.contains('@')
518 {
519 return Ok(Vec::new());
520 }
521
522 if !URL_QUICK_CHECK.is_match(content) {
524 return Ok(Vec::new());
525 }
526
527 let structure = crate::utils::document_structure::DocumentStructure::new(content);
529 self.check_with_structure(ctx, &structure)
530 }
531
532 fn check_with_ast(&self, ctx: &LintContext, ast: &MarkdownAst) -> LintResult {
533 let mut warnings = Vec::new();
535 self.find_bare_urls_in_ast(ast, false, ctx.content, &mut warnings, ctx);
536 Ok(warnings)
537 }
538
539 fn uses_ast(&self) -> bool {
540 false
543 }
544
545 fn uses_document_structure(&self) -> bool {
546 true
547 }
548
549 fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
550 let content = ctx.content;
551 if self.should_skip(content) {
552 return Ok(content.to_string());
553 }
554
555 let structure = crate::utils::document_structure::DocumentStructure::new(content);
558 let warnings = self.check_with_structure(ctx, &structure)?;
559 if warnings.is_empty() {
560 return Ok(content.to_string());
561 }
562
563 let mut sorted_warnings = warnings.clone();
565 sorted_warnings.sort_by_key(|w| std::cmp::Reverse(w.fix.as_ref().map(|f| f.range.start).unwrap_or(0)));
566
567 let mut result = content.to_string();
568 for warning in sorted_warnings {
569 if let Some(fix) = &warning.fix {
570 let start = fix.range.start;
571 let end = fix.range.end;
572
573 if start <= result.len() && end <= result.len() && start < end {
574 result.replace_range(start..end, &fix.replacement);
575 }
576 }
577 }
578
579 Ok(result)
580 }
581
582 fn category(&self) -> RuleCategory {
584 RuleCategory::Link
585 }
586
587 fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
589 self.should_skip(ctx.content)
590 }
591
592 fn as_any(&self) -> &dyn std::any::Any {
593 self
594 }
595
596 fn as_maybe_document_structure(&self) -> Option<&dyn crate::rule::MaybeDocumentStructure> {
597 Some(self)
598 }
599
600 fn as_maybe_ast(&self) -> Option<&dyn MaybeAst> {
601 Some(self)
602 }
603
604 fn from_config(_config: &crate::config::Config) -> Box<dyn Rule>
605 where
606 Self: Sized,
607 {
608 Box::new(MD034NoBareUrls)
609 }
610}
611
612impl crate::utils::document_structure::DocumentStructureExtensions for MD034NoBareUrls {
613 fn has_relevant_elements(
614 &self,
615 ctx: &crate::lint_context::LintContext,
616 _doc_structure: &crate::utils::document_structure::DocumentStructure,
617 ) -> bool {
618 let content = ctx.content;
620 !content.is_empty()
621 && (content.contains("http://")
622 || content.contains("https://")
623 || content.contains("ftp://")
624 || content.contains("ftps://")
625 || content.contains('@'))
626 }
627}
628
629impl AstExtensions for MD034NoBareUrls {
630 fn has_relevant_ast_elements(&self, ctx: &LintContext, ast: &MarkdownAst) -> bool {
631 use crate::utils::ast_utils::ast_contains_node_type;
633 !self.should_skip(ctx.content) && ast_contains_node_type(ast, "text")
634 }
635}
636
#[cfg(test)]
mod tests {
    use super::*;
    use crate::lint_context::LintContext;

    /// The quick-check regex accepts text with a URL and rejects text without one.
    #[test]
    fn test_url_quick_check() {
        let with_url = "This is a URL: https://example.com";
        let without_url = "This has no URL";

        assert!(URL_QUICK_CHECK.is_match(with_url));
        assert!(!URL_QUICK_CHECK.is_match(without_url));
    }

    /// A heading made only of inline links must not produce any warning.
    #[test]
    fn test_multiple_badges_and_links_on_one_line() {
        let content = "# [React](https://react.dev/) \
            · [](https://github.com/facebook/react/blob/main/LICENSE) \
            [](https://www.npmjs.com/package/react) \
            [](https://github.com/facebook/react/actions/workflows/runtime_build_and_test.yml) \
            [](https://github.com/facebook/react/actions/workflows/compiler_typescript.yml) \
            [](https://legacy.reactjs.org/docs/how-to-contribute.html#your-first-pull-request)";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        let result = MD034NoBareUrls.check(&ctx).unwrap();

        if !result.is_empty() {
            log::debug!("MD034 warnings: {result:#?}");
        }
        assert!(
            result.is_empty(),
            "Multiple badges and links on one line should not be flagged as bare URLs"
        );
    }

    /// A single bare URL is reported at its line and 1-based column.
    #[test]
    fn test_bare_urls() {
        let content = "This is a bare URL: https://example.com/foobar";
        let ctx = LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        let result = MD034NoBareUrls.check(&ctx).unwrap();

        assert_eq!(result.len(), 1, "Bare URLs should be flagged");
        let warning = &result[0];
        assert_eq!(warning.line, 1);
        assert_eq!(warning.column, 21);
    }

    /// Timing smoke test on a synthetic document: 250 bare URLs, 250 proper links,
    /// 500 plain lines and 100 bare e-mail addresses.
    #[test]
    fn test_md034_performance_baseline() {
        use std::time::{Duration, Instant};

        const RUNS: u32 = 10;

        let mut content = String::with_capacity(50_000);
        content.extend((0..250).map(|i| format!("Line {i} with bare URL https://example{i}.com/path\n")));
        content.extend((0..250).map(|i| {
            format!(
                "Line {} with [proper link](https://example{}.com/path)\n",
                i + 250,
                i
            )
        }));
        content.extend((0..500).map(|i| format!("Line {} with no URLs, just regular text content\n", i + 500)));
        content.extend((0..100).map(|i| format!("Contact user{i}@example{i}.com for more info\n")));

        let line_count = content.lines().count();
        println!(
            "MD034 Performance Test - Content: {} bytes, {} lines",
            content.len(),
            line_count
        );

        let rule = MD034NoBareUrls;
        let ctx = LintContext::new(&content, crate::config::MarkdownFlavor::Standard);

        // Warm-up run, not measured.
        let _ = rule.check(&ctx).unwrap();

        let mut total_duration = Duration::ZERO;
        let mut warnings_count = 0;
        for _ in 0..RUNS {
            let started = Instant::now();
            let warnings = rule.check(&ctx).unwrap();
            total_duration += started.elapsed();
            warnings_count = warnings.len();
        }

        let avg_check_duration = total_duration / RUNS;

        println!("MD034 Optimized Performance:");
        println!(
            "- Average check time: {:?} ({:.2} ms)",
            avg_check_duration,
            avg_check_duration.as_secs_f64() * 1000.0
        );
        println!("- Found {warnings_count} warnings");
        println!(
            "- Lines per second: {:.0}",
            line_count as f64 / avg_check_duration.as_secs_f64()
        );
        println!(
            "- Microseconds per line: {:.2}",
            avg_check_duration.as_micros() as f64 / line_count as f64
        );

        assert!(
            avg_check_duration.as_millis() < 100,
            "MD034 check should complete in under 100ms, took {}ms",
            avg_check_duration.as_millis()
        );

        assert_eq!(warnings_count, 350, "Should find 250 URLs + 100 emails = 350 warnings");
    }
}