use crate::config::MarkdownFlavor;
use crate::utils::code_block_utils::CodeBlockUtils;
use crate::utils::rumdl_parser_options;
use pulldown_cmark::{BrokenLink, Event, LinkType, Tag, TagEnd};
use regex::Regex;
use std::borrow::Cow;
use std::collections::HashSet;
use std::sync::LazyLock;
use super::types::*;
/// Regex matching a Markdown link: `[text](destination "title")` or `[text][ref]`.
///
/// Compiled with `(?s)` (dot matches newlines) and `(?x)` (verbose mode, so the
/// `# ...` fragments inside the pattern are regex comments, not literal text).
///
/// Capture groups:
/// 1. link text (escaped characters allowed, no unescaped `[` or `]`)
/// 2. destination written inside `<...>`
/// 3. bare destination
/// 4. title in double quotes
/// 5. title in single quotes
/// 6. title in parentheses
/// 7. reference id of the `[text][ref]` form (may be empty)
///
/// The pattern itself does not look at the preceding character, so callers have
/// to reject escaped (`\[`) or image (`![`) matches themselves.
///
/// Panics on first use if the pattern fails to compile.
static LINK_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r#"(?sx)
\[((?:[^\[\]\\]|\\.)*)\] # Link text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
(?:
\((?:<([^<>\n]*)>|([^\s)"']*))(?:\s+(?:"((?:[^"\\]|\\.)*)"|'((?:[^'\\]|\\.)*)'|\(((?:[^()\\]|\\.)*)\)))?\) # URL in group 2 (angle) or 3 (bare), title in 4 (dq) / 5 (sq) / 6 (paren)
|
\[([^\]]*)\] # Reference ID in group 7
)"#
).unwrap()
});
/// Regex matching a Markdown image: `![alt](destination "title")` or `![alt][ref]`.
///
/// Same shape as the link pattern but anchored on a leading `!`. Compiled with
/// `(?s)` (dot matches newlines) and `(?x)` (verbose mode, so the `# ...`
/// fragments inside the pattern are regex comments, not literal text).
///
/// Capture groups:
/// 1. alt text (escaped characters allowed, no unescaped `[` or `]`)
/// 2. destination written inside `<...>`
/// 3. bare destination
/// 4. title in double quotes
/// 5. title in single quotes
/// 6. title in parentheses
/// 7. reference id of the `![alt][ref]` form (may be empty)
///
/// Panics on first use if the pattern fails to compile.
static IMAGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r#"(?sx)
!\[((?:[^\[\]\\]|\\.)*)\] # Alt text in group 1 (optimized - no nested brackets to prevent catastrophic backtracking)
(?:
\((?:<([^<>\n]*)>|([^\s)"']*))(?:\s+(?:"((?:[^"\\]|\\.)*)"|'((?:[^'\\]|\\.)*)'|\(((?:[^()\\]|\\.)*)\)))?\) # URL in group 2 (angle) or 3 (bare), title in 4 (dq) / 5 (sq) / 6 (paren)
|
\[([^\]]*)\] # Reference ID in group 7
)"#
).unwrap()
});
/// Widens the end offset of a collapsed reference (`[text][]`) to cover the
/// trailing `[]`.
///
/// Returns `byte_end + 2` when `link_type` is `LinkType::Collapsed` and the two
/// bytes at `byte_end` are exactly `[]`; otherwise returns `byte_end` unchanged.
/// An offset at or past the end of `content` is returned as-is.
fn extend_collapsed_byte_end(content: &str, link_type: LinkType, byte_end: usize) -> usize {
    let is_collapsed = matches!(link_type, LinkType::Collapsed);
    // `get(byte_end..)` is `None` past the end, so out-of-range offsets fall through.
    let followed_by_empty_brackets = || {
        content
            .as_bytes()
            .get(byte_end..)
            .is_some_and(|tail| tail.starts_with(b"[]"))
    };

    if is_collapsed && followed_by_empty_brackets() {
        byte_end + 2
    } else {
        byte_end
    }
}
/// Regex matching a single-line link reference definition:
/// `[label]: destination "optional title"`.
///
/// Multi-line mode (`(?m)`) anchors the match to a whole line; up to three
/// leading spaces are accepted before the opening `[`.
///
/// Capture groups:
/// 1. label (non-empty, no `]`)
/// 2. destination written inside `<...>` (backslash escapes allowed)
/// 3. bare destination (does not start with `<`, contains no whitespace)
/// 4. title in double quotes
/// 5. title in single quotes
/// 6. title in parentheses
///
/// Groups 4-6 capture only the text between the title delimiters.
///
/// Panics on first use if the pattern fails to compile.
static REF_DEF_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r#"(?m)^[ ]{0,3}\[([^\]]+)\]:\s*(?:<((?:[^<>\n\\]|\\.)*)>|([^\s<][^\s]*))(?:\s+(?:"((?:[^"\\]|\\.)*)"|'((?:[^'\\]|\\.)*)'|\(((?:[^()\\]|\\.)*)\)))?$"#,
)
.unwrap()
});
/// Everything collected by `parse_links_images_pulldown` in one pass over the
/// document, handed on to `finalize_links_and_images`.
pub(super) struct PulldownParseResult<'a> {
/// `(start, end)` byte range of every link the parser reported, recorded
/// before any filtering (so it may include links absent from `links`).
pub link_byte_ranges: Vec<(usize, usize)>,
/// Links that survived the HTML-comment and MkDocs-snippet filters.
pub links: Vec<ParsedLink<'a>>,
/// Images that survived the code-block and HTML-comment filters.
pub images: Vec<ParsedImage<'a>>,
/// Reference and span of each link passed to the broken-link callback.
pub broken_links: Vec<BrokenLinkInfo>,
/// Footnote references found outside HTML comments.
pub footnote_refs: Vec<FootnoteRef>,
/// Start offsets of the entries in `links`; the regex pass uses this set to
/// avoid adding a link that starts at the same offset again.
pub link_found_positions: HashSet<usize>,
/// Start offsets of the entries pushed to `images`; used the same way by
/// the regex pass for images.
pub image_found_positions: HashSet<usize>,
}
/// Runs pulldown-cmark over `content` and gathers links, images, broken
/// reference links and footnote references together with their positions.
///
/// * `lines` - per-line metadata used to turn byte offsets into line/column values.
/// * `code_blocks` - presumably `(start, end)` byte ranges of code blocks (confirm
///   against `CodeBlockUtils::is_in_code_block`); images starting inside one are dropped.
/// * `flavor` - forwarded to `is_mkdocs_snippet_line`; links on such lines are dropped.
/// * `html_comment_ranges` - links, images and footnote references that start
///   inside one of these ranges are dropped.
///
/// Every link's byte range is pushed to `link_byte_ranges` before any filtering,
/// so that list can hold ranges with no matching entry in `links`.
pub(super) fn parse_links_images_pulldown<'a>(
content: &'a str,
lines: &[LineInfo],
code_blocks: &[(usize, usize)],
flavor: MarkdownFlavor,
html_comment_ranges: &[crate::utils::skip_context::ByteRange],
) -> PulldownParseResult<'a> {
use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
let mut link_byte_ranges = Vec::new();
// Capacity guesses: one link per 500 bytes and one image per 1000 bytes of input.
let mut links = Vec::with_capacity(content.len() / 500);
let mut images = Vec::with_capacity(content.len() / 1000);
let mut broken_links = Vec::new();
let mut footnote_refs = Vec::new();
let mut link_found_positions = HashSet::new();
let mut image_found_positions = HashSet::new();
let options = rumdl_parser_options();
// The callback only records the broken reference; returning `None` supplies
// no replacement destination for it.
let parser = pulldown_cmark::Parser::new_with_broken_link_callback(
content,
options,
Some(|link: BrokenLink<'_>| {
broken_links.push(BrokenLinkInfo {
reference: link.reference.to_string(),
span: link.span.clone(),
});
None
}),
)
.into_offset_iter();
// Pending link/image: (start offset, destination URL, link type, reference id, title).
type StackEntry<'b> = (
usize,
pulldown_cmark::CowStr<'b>,
LinkType,
pulldown_cmark::CowStr<'b>,
pulldown_cmark::CowStr<'b>,
);
let mut link_stack: Vec<StackEntry<'a>> = Vec::new();
let mut image_stack: Vec<StackEntry<'a>> = Vec::new();
// (text, start, end) of each Text/Code event seen since the most recent link
// or image start; shared by links and images and reset whenever either opens.
let mut link_text_chunks: Vec<(String, usize, usize)> = Vec::new();
for (event, range) in parser {
match event {
Event::Start(Tag::Link {
link_type,
dest_url,
title,
id,
}) => {
link_stack.push((range.start, dest_url, link_type, id, title));
link_text_chunks.clear();
}
Event::Start(Tag::Image {
link_type,
dest_url,
title,
id,
}) => {
image_stack.push((range.start, dest_url, link_type, id, title));
link_text_chunks.clear();
}
// Text is only collected while at least one link or image is open.
Event::Text(text) if !link_stack.is_empty() || !image_stack.is_empty() => {
link_text_chunks.push((text.to_string(), range.start, range.end));
}
// Inline code is stored with its backticks put back around it.
Event::Code(code) if !link_stack.is_empty() || !image_stack.is_empty() => {
let code_text = format!("`{code}`");
link_text_chunks.push((code_text, range.start, range.end));
}
Event::End(TagEnd::Link) => {
if let Some((start_pos, url, link_type, ref_id, title)) = link_stack.pop() {
// Collapsed references get their end widened over a trailing `[]`.
let span_end = extend_collapsed_byte_end(content, link_type, range.end);
// Recorded before the filters below, so skipped links still appear here.
link_byte_ranges.push((start_pos, span_end));
if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
link_text_chunks.clear();
continue;
}
let (line_idx, line_num, col_start) = super::LintContext::find_line_for_offset(lines, start_pos);
if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
link_text_chunks.clear();
continue;
}
let (_, _end_line_num, col_end) = super::LintContext::find_line_for_offset(lines, span_end);
let is_reference = matches!(
link_type,
LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
);
// Wiki links take their text from the collected events (falling back to
// the URL); every other link type slices the text out of the raw source.
let link_text = if matches!(link_type, LinkType::WikiLink { .. }) {
if !link_text_chunks.is_empty() {
let text: String = link_text_chunks.iter().map(|(t, _, _)| t.as_str()).collect();
Cow::Owned(text)
} else {
Cow::Owned(url.to_string())
}
} else if start_pos < content.len() {
let link_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
let mut close_pos = None;
// Number of currently open nested `[` brackets.
let mut depth = 0;
let mut in_code_span = false;
// Start at index 1 to step over the opening `[`.
for (i, &byte) in link_bytes.iter().enumerate().skip(1) {
// Count the run of backslashes directly before this byte; an odd
// count means the byte itself is escaped.
let mut backslash_count = 0;
let mut j = i;
while j > 0 && link_bytes[j - 1] == b'\\' {
backslash_count += 1;
j -= 1;
}
let is_escaped = backslash_count % 2 != 0;
// Each unescaped backtick toggles the code-span state.
if byte == b'`' && !is_escaped {
in_code_span = !in_code_span;
}
// Brackets are only counted outside code spans and when unescaped.
if !is_escaped && !in_code_span {
if byte == b'[' {
depth += 1;
} else if byte == b']' {
if depth == 0 {
close_pos = Some(i);
break;
} else {
depth -= 1;
}
}
}
}
// Text is everything between the opening `[` and its matching `]`;
// empty when no closing bracket was found or the slice is not valid UTF-8.
if let Some(pos) = close_pos {
Cow::Borrowed(std::str::from_utf8(&link_bytes[1..pos]).unwrap_or(""))
} else {
Cow::Borrowed("")
}
} else {
Cow::Borrowed("")
};
// Reference ids are lowercased; an empty id falls back to the link text.
let reference_id = if is_reference && !ref_id.is_empty() {
Some(Cow::Owned(ref_id.to_lowercase()))
} else if is_reference {
Some(Cow::Owned(link_text.to_lowercase()))
} else {
None
};
link_found_positions.insert(start_pos);
// The parser reports an empty title both for "no title" and for an explicit
// empty one; the source span is inspected to keep `Some("")` for the latter.
let title_field = if !title.is_empty() {
Some(Cow::Owned(title.to_string()))
} else if matches!(link_type, LinkType::Inline)
&& has_explicit_empty_inline_title(&content[start_pos..range.end.min(content.len())])
{
Some(Cow::Borrowed(""))
} else {
None
};
links.push(ParsedLink {
line: line_num,
start_col: col_start,
end_col: col_end,
byte_offset: start_pos,
byte_end: span_end,
text: link_text,
url: Cow::Owned(url.to_string()),
title: title_field,
is_reference,
reference_id,
link_type,
});
link_text_chunks.clear();
}
}
Event::End(TagEnd::Image) => {
if let Some((start_pos, url, link_type, ref_id, title)) = image_stack.pop() {
let span_end = extend_collapsed_byte_end(content, link_type, range.end);
if CodeBlockUtils::is_in_code_block(code_blocks, start_pos) {
link_text_chunks.clear();
continue;
}
if is_in_html_comment_ranges(html_comment_ranges, start_pos) {
link_text_chunks.clear();
continue;
}
let (_, line_num, col_start) = super::LintContext::find_line_for_offset(lines, start_pos);
let (_, _end_line_num, col_end) = super::LintContext::find_line_for_offset(lines, span_end);
let is_reference = matches!(
link_type,
LinkType::Reference | LinkType::Collapsed | LinkType::Shortcut
);
// NOTE(review): `has_pothole` presumably marks a wiki link with separate
// display text - confirm against the pulldown-cmark `LinkType` docs.
let alt_text = if matches!(link_type, LinkType::WikiLink { has_pothole: true }) {
if !link_text_chunks.is_empty() {
let text: String = link_text_chunks.iter().map(|(t, _, _)| t.as_str()).collect();
// Drop a trailing `]]` if the collected text ends with one.
let text = text.strip_suffix("]]").unwrap_or(&text).to_string();
Cow::Owned(text)
} else {
Cow::Borrowed("")
}
} else if matches!(link_type, LinkType::WikiLink { has_pothole: false }) {
Cow::Borrowed("")
} else if start_pos < content.len() {
let image_bytes = &content.as_bytes()[start_pos..range.end.min(content.len())];
let mut close_pos = None;
// Number of currently open nested `[` brackets.
let mut depth = 0;
if image_bytes.len() > 2 {
// Start at index 2 to step over the opening `![`. Unlike the link scan,
// this loop does not track code spans.
for (i, &byte) in image_bytes.iter().enumerate().skip(2) {
// An odd number of directly preceding backslashes escapes this byte.
let mut backslash_count = 0;
let mut j = i;
while j > 0 && image_bytes[j - 1] == b'\\' {
backslash_count += 1;
j -= 1;
}
let is_escaped = backslash_count % 2 != 0;
if !is_escaped {
if byte == b'[' {
depth += 1;
} else if byte == b']' {
if depth == 0 {
close_pos = Some(i);
break;
} else {
depth -= 1;
}
}
}
}
}
// Alt text is everything between `![` and its matching `]`.
if let Some(pos) = close_pos {
Cow::Borrowed(std::str::from_utf8(&image_bytes[2..pos]).unwrap_or(""))
} else {
Cow::Borrowed("")
}
} else {
Cow::Borrowed("")
};
let url = Cow::Owned(url.to_string());
// Reference ids are lowercased; an empty id falls back to the alt text.
let reference_id = if is_reference && !ref_id.is_empty() {
Some(Cow::Owned(ref_id.to_lowercase()))
} else if is_reference {
Some(Cow::Owned(alt_text.to_lowercase()))
} else {
None
};
image_found_positions.insert(start_pos);
// Same empty-title disambiguation as for links.
let title_field = if !title.is_empty() {
Some(Cow::Owned(title.to_string()))
} else if matches!(link_type, LinkType::Inline)
&& has_explicit_empty_inline_title(&content[start_pos..range.end.min(content.len())])
{
Some(Cow::Borrowed(""))
} else {
None
};
images.push(ParsedImage {
line: line_num,
start_col: col_start,
end_col: col_end,
byte_offset: start_pos,
byte_end: span_end,
alt_text,
url,
title: title_field,
is_reference,
reference_id,
link_type,
});
link_text_chunks.clear();
}
}
Event::FootnoteReference(footnote_id) => {
if is_in_html_comment_ranges(html_comment_ranges, range.start) {
continue;
}
let (_, line_num, _) = super::LintContext::find_line_for_offset(lines, range.start);
footnote_refs.push(FootnoteRef {
id: footnote_id.to_string(),
line: line_num,
byte_offset: range.start,
});
}
_ => {}
}
}
PulldownParseResult {
link_byte_ranges,
links,
images,
broken_links,
footnote_refs,
link_found_positions,
image_found_positions,
}
}
pub(super) fn finalize_links_and_images<'a>(
content: &'a str,
lines: &[LineInfo],
code_blocks: &[(usize, usize)],
code_spans: &[CodeSpan],
flavor: MarkdownFlavor,
html_comment_ranges: &[crate::utils::skip_context::ByteRange],
mut result: PulldownParseResult<'a>,
) -> (
Vec<ParsedLink<'a>>,
Vec<ParsedImage<'a>>,
Vec<BrokenLinkInfo>,
Vec<FootnoteRef>,
) {
use crate::utils::skip_context::{is_in_html_comment_ranges, is_mkdocs_snippet_line};
result
.images
.retain(|img| !super::LintContext::is_offset_in_code_span(code_spans, img.byte_offset));
for cap in LINK_PATTERN.captures_iter(content) {
let full_match = cap.get(0).unwrap();
let match_start = full_match.start();
let match_end = full_match.end();
if result.link_found_positions.contains(&match_start) {
continue;
}
if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
continue;
}
if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'!') {
continue;
}
if CodeBlockUtils::is_in_code_block(code_blocks, match_start) {
continue;
}
if super::LintContext::is_offset_in_code_span(code_spans, match_start) {
continue;
}
if is_in_html_comment_ranges(html_comment_ranges, match_start) {
continue;
}
let (line_idx, line_num, col_start) = super::LintContext::find_line_for_offset(lines, match_start);
if is_mkdocs_snippet_line(lines[line_idx].content(content), flavor) {
continue;
}
let (_, _end_line_num, col_end) = super::LintContext::find_line_for_offset(lines, match_end);
let text = cap.get(1).map_or("", |m| m.as_str());
if let Some(ref_id) = cap.get(7) {
let ref_id_str = ref_id.as_str();
let normalized_ref = if ref_id_str.is_empty() {
Cow::Owned(text.to_lowercase())
} else {
Cow::Owned(ref_id_str.to_lowercase())
};
result.links.push(ParsedLink {
line: line_num,
start_col: col_start,
end_col: col_end,
byte_offset: match_start,
byte_end: match_end,
text: Cow::Borrowed(text),
url: Cow::Borrowed(""),
title: None,
is_reference: true,
reference_id: Some(normalized_ref),
link_type: LinkType::Reference,
});
} else if let Some(line_info) = lines.get(line_idx)
&& line_info.in_mkdocs_container()
{
let url = cap
.get(2)
.or_else(|| cap.get(3))
.map_or(String::new(), |m| unescape_commonmark_punctuation(m.as_str().trim()));
let title = cap
.get(4)
.or_else(|| cap.get(5))
.or_else(|| cap.get(6))
.map(|m| Cow::Owned(unescape_commonmark_punctuation(m.as_str())));
result.links.push(ParsedLink {
line: line_num,
start_col: col_start,
end_col: col_end,
byte_offset: match_start,
byte_end: match_end,
text: Cow::Borrowed(text),
url: Cow::Owned(url),
title,
is_reference: false,
reference_id: None,
link_type: LinkType::Inline,
});
}
}
for cap in IMAGE_PATTERN.captures_iter(content) {
let full_match = cap.get(0).unwrap();
let match_start = full_match.start();
let match_end = full_match.end();
if result.image_found_positions.contains(&match_start) {
continue;
}
if match_start > 0 && content.as_bytes().get(match_start - 1) == Some(&b'\\') {
continue;
}
if CodeBlockUtils::is_in_code_block(code_blocks, match_start)
|| super::LintContext::is_offset_in_code_span(code_spans, match_start)
|| is_in_html_comment_ranges(html_comment_ranges, match_start)
{
continue;
}
let (line_idx, line_num, col_start) = super::LintContext::find_line_for_offset(lines, match_start);
let (_, _end_line_num, col_end) = super::LintContext::find_line_for_offset(lines, match_end);
let alt_text = cap.get(1).map_or("", |m| m.as_str());
if let Some(ref_id) = cap.get(7) {
let ref_id_str = ref_id.as_str();
let normalized_ref = if ref_id_str.is_empty() {
Cow::Owned(alt_text.to_lowercase())
} else {
Cow::Owned(ref_id_str.to_lowercase())
};
result.images.push(ParsedImage {
line: line_num,
start_col: col_start,
end_col: col_end,
byte_offset: match_start,
byte_end: match_end,
alt_text: Cow::Borrowed(alt_text),
url: Cow::Borrowed(""),
title: None,
is_reference: true,
reference_id: Some(normalized_ref),
link_type: LinkType::Reference,
});
} else if let Some(line_info) = lines.get(line_idx)
&& line_info.in_mkdocs_container()
{
let url = cap
.get(2)
.or_else(|| cap.get(3))
.map_or(String::new(), |m| unescape_commonmark_punctuation(m.as_str().trim()));
let title = cap
.get(4)
.or_else(|| cap.get(5))
.or_else(|| cap.get(6))
.map(|m| Cow::Owned(unescape_commonmark_punctuation(m.as_str())));
result.images.push(ParsedImage {
line: line_num,
start_col: col_start,
end_col: col_end,
byte_offset: match_start,
byte_end: match_end,
alt_text: Cow::Borrowed(alt_text),
url: Cow::Owned(url),
title,
is_reference: false,
reference_id: None,
link_type: LinkType::Inline,
});
}
}
result.links.sort_by_key(|l| (l.line, l.byte_offset));
result.images.sort_by_key(|i| (i.line, i.byte_offset));
(result.links, result.images, result.broken_links, result.footnote_refs)
}
/// Reports whether an inline link/image span ends with an explicitly empty
/// title: `""`, `''` or `()` just before the closing `)`.
///
/// The span must end with `)`. Spaces, tabs and line breaks between the title
/// and that `)` are ignored, and the empty title must itself be preceded by one
/// of those whitespace characters (so `(url"")` does not count).
fn has_explicit_empty_inline_title(span: &str) -> bool {
    const BLANKS: &[char] = &[' ', '\t', '\n', '\r'];
    const EMPTY_TITLES: [&str; 3] = ["\"\"", "''", "()"];

    let Some(inside) = span.strip_suffix(')') else {
        return false;
    };
    let inside = inside.trim_end_matches(BLANKS);

    EMPTY_TITLES
        .iter()
        .find_map(|&pair| inside.strip_suffix(pair))
        .is_some_and(|before_title| before_title.ends_with(BLANKS))
}
/// Removes backslash escapes in front of ASCII punctuation.
///
/// A `\` immediately followed by an ASCII punctuation byte is replaced by that
/// punctuation character; any other backslash, and every other character, is
/// copied through unchanged. Input is processed left to right, so `\\(` yields
/// `\(`.
fn unescape_commonmark_punctuation(input: &str) -> String {
    if input.find('\\').is_none() {
        return input.to_string();
    }

    let raw = input.as_bytes();
    let mut unescaped = String::with_capacity(raw.len());
    let mut pos = 0;
    while let Some(&current) = raw.get(pos) {
        let escaped_punct = match raw.get(pos + 1) {
            Some(&next) if current == b'\\' && is_ascii_punctuation(next) => Some(next),
            _ => None,
        };
        match escaped_punct {
            Some(punct) => {
                unescaped.push(char::from(punct));
                pos += 2;
            }
            None => {
                // Copy one whole UTF-8 character so `pos` stays on a char boundary.
                let next_pos = pos + utf8_char_len(current);
                unescaped.push_str(&input[pos..next_pos]);
                pos = next_pos;
            }
        }
    }
    unescaped
}
/// Reports whether `b` is an ASCII punctuation byte, i.e. one of
/// ``!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~``.
///
/// Delegates to the standard library's `u8::is_ascii_punctuation`, which
/// covers exactly these 32 characters.
#[inline]
fn is_ascii_punctuation(b: u8) -> bool {
    b.is_ascii_punctuation()
}
/// Length in bytes of the UTF-8 sequence introduced by `first_byte`.
///
/// The number of leading one bits selects the length: none means a single
/// ASCII byte, two to four mean a sequence of that many bytes. Anything else
/// (a continuation byte or an invalid lead byte) is reported as length 1.
#[inline]
fn utf8_char_len(first_byte: u8) -> usize {
    match first_byte.leading_ones() {
        2 => 2,
        3 => 3,
        4 => 4,
        _ => 1,
    }
}
/// Collects link reference definitions (`[label]: destination "title"`), one
/// line at a time.
///
/// Lines flagged `in_code_block` are ignored, as are labels starting with `^`
/// (footnote-style labels). For each remaining line matched by
/// `REF_DEF_PATTERN` the result holds the 1-based line number, the lowercased
/// label, the unescaped destination and title, and absolute byte offsets of the
/// whole definition. The title offsets are widened by one byte on each side of
/// the captured title text so that they include its delimiters.
pub(super) fn parse_reference_defs(content: &str, lines: &[LineInfo]) -> Vec<ReferenceDef> {
    let mut definitions = Vec::with_capacity(lines.len() / 20);

    let candidates = lines
        .iter()
        .enumerate()
        .filter(|(_, info)| !info.in_code_block);

    for (index, info) in candidates {
        let Some(caps) = REF_DEF_PATTERN.captures(info.content(content)) else {
            continue;
        };

        let label = caps.get(1).unwrap().as_str();
        if label.starts_with('^') {
            continue;
        }

        let destination = caps
            .get(2)
            .or_else(|| caps.get(3))
            .expect("destination alternation always matches")
            .as_str();
        let title_span = caps
            .get(4)
            .or_else(|| caps.get(5))
            .or_else(|| caps.get(6));
        let whole = caps.get(0).unwrap();

        // Regex offsets are relative to the line; shift them to document offsets.
        let line_start = info.byte_offset;
        let (title_byte_start, title_byte_end) = match title_span {
            Some(inner) => (
                Some(line_start + inner.start().saturating_sub(1)),
                Some(line_start + inner.end() + 1),
            ),
            None => (None, None),
        };

        definitions.push(ReferenceDef {
            line: index + 1,
            id: label.to_lowercase(),
            url: unescape_commonmark_punctuation(destination),
            title: title_span.map(|inner| unescape_commonmark_punctuation(inner.as_str())),
            byte_offset: line_start + whole.start(),
            byte_end: line_start + whole.end(),
            title_byte_start,
            title_byte_end,
        });
    }

    definitions
}