use crate::error::Result;
use crate::{
Document, Violation,
rule::{Rule, RuleCategory, RuleMetadata},
violation::Severity,
};
use comrak::nodes::{AstNode, NodeValue};
use std::collections::{HashMap, HashSet};
/// MD051: validates that in-document link fragments (`[text](#fragment)`)
/// point at an anchor that exists in the same document.
///
/// Derives `Debug` and `Clone` so configured rule instances can be logged and
/// duplicated like other plain configuration values.
#[derive(Debug, Clone)]
pub struct MD051 {
    /// When `true`, fragments are compared to known anchors case-insensitively.
    ignore_case: bool,
    /// Optional text; link fragments containing it as a substring are not checked.
    ignored_pattern: Option<String>,
}
impl Default for MD051 {
fn default() -> Self {
Self::new()
}
}
impl MD051 {
/// Creates the rule with case-sensitive matching and no ignored pattern.
pub fn new() -> Self {
    let ignore_case = false;
    let ignored_pattern = None;
    Self {
        ignore_case,
        ignored_pattern,
    }
}
#[allow(dead_code)]
pub fn ignore_case(mut self, ignore_case: bool) -> Self {
self.ignore_case = ignore_case;
self
}
#[allow(dead_code)]
pub fn ignored_pattern(mut self, pattern: Option<String>) -> Self {
self.ignored_pattern = pattern;
self
}
/// Returns the `(line, column)` at which `node` starts, as recorded in the
/// node's source position.
fn get_position<'a>(&self, node: &'a AstNode<'a>) -> (usize, usize) {
    let start = node.data.borrow().sourcepos.start;
    (start.line, start.column)
}
/// Turns heading text into its anchor slug.
///
/// Steps, in order: lowercase; drop every character that is not
/// alphanumeric, whitespace, `-` or `_`; turn each space into `-`; collapse
/// runs of `-`; strip leading and trailing `-`.
fn generate_heading_fragment(&self, text: &str) -> String {
    let slug_chars: String = text
        .to_lowercase()
        .chars()
        .filter(|&c| c == '-' || c == '_' || c.is_alphanumeric() || c.is_whitespace())
        // Only the plain space becomes a dash; other whitespace is kept as-is.
        .map(|c| if c == ' ' { '-' } else { c })
        .collect();
    self.consolidate_dashes(&slug_chars)
        .trim_matches('-')
        .to_string()
}
/// Collects the visible text of a heading (or of any inline container).
///
/// Text and inline-code literals are appended directly. Every other child is
/// searched recursively, so text wrapped in emphasis, strong, links,
/// strikethrough, etc. still contributes to the result. Nodes without text
/// children (line breaks, raw inline HTML, ...) contribute nothing.
fn extract_heading_text<'a>(node: &'a AstNode<'a>) -> String {
    let mut text = String::new();
    for child in node.children() {
        match &child.data.borrow().value {
            NodeValue::Text(t) => text.push_str(t),
            NodeValue::Code(code) => text.push_str(&code.literal),
            // Any other inline may wrap text (e.g. `[docs](url)` inside a
            // heading); descend instead of silently dropping it.
            _ => text.push_str(&Self::extract_heading_text(child)),
        }
    }
    text
}
/// Builds the set of fragments that links in this document may target.
///
/// The set always contains `top`; everything else is gathered by walking the
/// AST with `traverse_for_fragments`.
fn collect_valid_fragments<'a>(&self, ast: &'a AstNode<'a>) -> HashSet<String> {
    let mut seen_headings: HashMap<String, usize> = HashMap::new();
    let mut known: HashSet<String> = std::iter::once(String::from("top")).collect();
    self.traverse_for_fragments(ast, &mut known, &mut seen_headings);
    known
}
/// Recursively records every anchor that `node` and its descendants define.
///
/// * Headings contribute their generated slug. Repeated slugs are made
///   unique by suffixing the number of earlier occurrences: `test`,
///   `test-1`, `test-2`, ...
/// * Headings also contribute a custom anchor (`{#id}`) when one is present.
/// * HTML blocks and inline HTML contribute `id` values and `<a name>` values.
///
/// `heading_counts` maps a base slug to how many headings have produced it
/// so far; `fragments` receives the results.
fn traverse_for_fragments<'a>(
    &self,
    node: &'a AstNode<'a>,
    fragments: &mut HashSet<String>,
    heading_counts: &mut HashMap<String, usize>,
) {
    match &node.data.borrow().value {
        NodeValue::Heading(_) => {
            let heading_text = Self::extract_heading_text(node);
            let base = self.generate_heading_fragment(&heading_text);

            // The first heading keeps the bare slug; the n-th repeat gets
            // `-n`, so the second occurrence is `-1`, not `-2`.
            let seen = heading_counts.entry(base.clone()).or_insert(0);
            let occurrence = *seen;
            *seen += 1;
            let fragment = if occurrence == 0 {
                base
            } else {
                format!("{base}-{occurrence}")
            };
            fragments.insert(fragment);

            if let Some(anchor_id) = self.extract_custom_anchor(&heading_text) {
                fragments.insert(anchor_id);
            }
        }
        NodeValue::HtmlBlock(html) => {
            fragments.extend(self.extract_html_ids(&html.literal));
            fragments.extend(self.extract_html_names(&html.literal));
        }
        NodeValue::HtmlInline(html) => {
            fragments.extend(self.extract_html_ids(html));
            fragments.extend(self.extract_html_names(html));
        }
        _ => {}
    }
    for child in node.children() {
        self.traverse_for_fragments(child, fragments, heading_counts);
    }
}
/// Collapses every run of consecutive `-` characters into a single `-`.
fn consolidate_dashes(&self, text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for c in text.chars() {
        // A dash directly after an already-emitted dash is redundant.
        if c == '-' && collapsed.ends_with('-') {
            continue;
        }
        collapsed.push(c);
    }
    collapsed
}
/// Extracts a custom heading anchor written as `{#anchor-id}`.
///
/// Only the first `{#` in `text` is considered, paired with the first `}`
/// that follows it. The id must be non-empty and consist solely of
/// alphanumerics, `-` and `_`; otherwise `None` is returned.
fn extract_custom_anchor(&self, text: &str) -> Option<String> {
    let (_, after_marker) = text.split_once("{#")?;
    let (candidate, _) = after_marker.split_once('}')?;
    let well_formed = !candidate.is_empty()
        && candidate
            .chars()
            .all(|c| c.is_alphanumeric() || c == '-' || c == '_');
    if well_formed {
        Some(candidate.to_string())
    } else {
        None
    }
}
/// Extracts the values of `id` attributes from a snippet of raw HTML.
///
/// This is a lightweight textual scan, not an HTML parser. An occurrence of
/// `id` (any ASCII case) counts as an attribute when it is preceded by
/// whitespace or a quote and followed by `=` (whitespace around `=` is
/// allowed). The value may be double-quoted, single-quoted, or unquoted (an
/// unquoted value runs up to the next whitespace or `>`). Empty values are
/// skipped.
///
/// The scan always moves past the occurrence it just examined, so malformed
/// input such as `<div id=>`, `<span id>` or an unterminated quote cannot
/// stall the loop.
fn extract_html_ids(&self, html: &str) -> Vec<String> {
    let mut ids = Vec::new();
    // ASCII-only lowercasing keeps byte offsets identical to `html`, so
    // positions found in the lowered copy are safe to slice the original.
    let html_lower = html.to_ascii_lowercase();
    let mut pos = 0;
    while let Some(found) = html_lower[pos..].find("id") {
        let name_start = pos + found;
        let name_end = name_start + 2;
        // Guarantee forward progress whatever follows this occurrence.
        pos = name_end;

        // Reject `id` embedded in a longer word (`data-id`, `width`, `valid`).
        let at_boundary = html[..name_start]
            .chars()
            .next_back()
            .map_or(false, |c| c.is_whitespace() || c == '"' || c == '\'');
        if !at_boundary {
            continue;
        }

        let after_equals = match html[name_end..].trim_start().strip_prefix('=') {
            Some(rest) => rest.trim_start(),
            None => continue,
        };
        let value_offset = html.len() - after_equals.len();

        // `consumed` is how many bytes of `after_equals` the value occupies,
        // including its quotes when present.
        let (value, consumed) = match after_equals.chars().next() {
            Some(quote @ ('"' | '\'')) => match after_equals[1..].find(quote) {
                Some(end) => (&after_equals[1..1 + end], end + 2),
                // Unterminated quote: nothing usable here, keep scanning.
                None => continue,
            },
            Some(_) => {
                let end = after_equals
                    .find(|c: char| c.is_whitespace() || c == '>')
                    .unwrap_or(after_equals.len());
                (&after_equals[..end], end)
            }
            None => continue,
        };

        if !value.is_empty() {
            ids.push(value.to_string());
        }
        // Resume after the value so text inside it is not rescanned.
        pos = value_offset + consumed;
    }
    ids
}
/// Extracts the values of `name` attributes on `<a ...>` tags in raw HTML.
///
/// This is a lightweight textual scan, not an HTML parser. A tag is treated
/// as an anchor only when `<a` (any ASCII case) is immediately followed by
/// whitespace, which excludes `<abbr>`, `<article>` and similar. Within the
/// tag (up to the first `>`), `name` counts as an attribute when preceded by
/// whitespace or a quote and followed by `=`. The value may be quoted or
/// unquoted. At most one name is taken per tag and empty values are skipped.
///
/// Scanning stops at the first `<a` that has no closing `>`.
fn extract_html_names(&self, html: &str) -> Vec<String> {
    let mut names = Vec::new();
    // ASCII-only lowercasing keeps byte offsets identical to `html`, so
    // positions found in the lowered copy are safe to slice the original.
    let html_lower = html.to_ascii_lowercase();
    let mut pos = 0;
    while let Some(found) = html_lower[pos..].find("<a") {
        let tag_start = pos + found;
        let tag_end = match html[tag_start..].find('>') {
            Some(rel) => tag_start + rel,
            None => break,
        };
        pos = tag_end + 1;

        // Everything between `<a` and `>`.
        let attrs = &html[tag_start + 2..tag_end];
        if !attrs.starts_with(char::is_whitespace) {
            // Either a different element (`<abbr ...>`) or a bare `<a>`.
            continue;
        }
        let attrs_lower = &html_lower[tag_start + 2..tag_end];

        let mut scan = 0;
        while let Some(hit) = attrs_lower[scan..].find("name") {
            let name_start = scan + hit;
            let name_end = name_start + 4;
            scan = name_end;

            // Skip `name` inside other text, e.g. `href="rename.html"`.
            let at_boundary = attrs[..name_start]
                .chars()
                .next_back()
                .map_or(false, |c| c.is_whitespace() || c == '"' || c == '\'');
            if !at_boundary {
                continue;
            }

            let after_equals = match attrs[name_end..].trim_start().strip_prefix('=') {
                Some(rest) => rest.trim_start(),
                None => continue,
            };
            let value = match after_equals.chars().next() {
                Some(quote @ ('"' | '\'')) => match after_equals[1..].find(quote) {
                    Some(end) => &after_equals[1..1 + end],
                    // No closing quote inside this tag: not a usable value.
                    None => continue,
                },
                Some(_) => after_equals
                    .split(char::is_whitespace)
                    .next()
                    .unwrap_or(after_equals),
                None => continue,
            };

            if !value.is_empty() {
                names.push(value.to_string());
            }
            // One `name` attribute per tag is enough.
            break;
        }
    }
    names
}
/// Reports whether `fragment` is a GitHub-style source line reference.
///
/// Accepted shape: `L<digits>`, optionally `C<digits>`, optionally followed
/// by `-L<digits>` with its own optional `C<digits>`, and nothing after that
/// (for example `L20`, `L10-L20`, `L19C5-L21C11`).
fn is_github_line_reference(&self, fragment: &str) -> bool {
    let rest = match fragment.strip_prefix('L') {
        Some(rest) => rest,
        None => return false,
    };
    let mut cursor = rest.chars().peekable();

    // Start position: line number, then an optional column.
    if !self.consume_digits(&mut cursor) {
        return false;
    }
    if cursor.next_if_eq(&'C').is_some() && !self.consume_digits(&mut cursor) {
        return false;
    }

    // Optional end position: `-L<digits>` plus an optional column.
    if cursor.next_if_eq(&'-').is_some() {
        if cursor.next() != Some('L') || !self.consume_digits(&mut cursor) {
            return false;
        }
        if cursor.next_if_eq(&'C').is_some() && !self.consume_digits(&mut cursor) {
            return false;
        }
    }

    // Anything left over means this is not a line reference.
    cursor.peek().is_none()
}
/// Advances `chars` past a run of ASCII digits and reports whether at least
/// one digit was consumed. The first non-digit is left in the iterator.
fn consume_digits(&self, chars: &mut std::iter::Peekable<std::str::Chars>) -> bool {
    let mut digits_seen = 0usize;
    while chars.next_if(|c| c.is_ascii_digit()).is_some() {
        digits_seen += 1;
    }
    digits_seen > 0
}
/// Walks the AST from `ast` and returns a violation for every link whose
/// fragment is empty or absent from `valid_fragments`.
fn check_link_fragments<'a>(
    &self,
    ast: &'a AstNode<'a>,
    valid_fragments: &HashSet<String>,
) -> Vec<Violation> {
    let mut found: Vec<Violation> = Vec::new();
    self.traverse_for_links(ast, valid_fragments, &mut found);
    found
}
/// Recursively checks every link under `node` whose URL starts with `#`.
///
/// * An empty fragment (`#`) is reported as an error.
/// * Fragments containing the configured ignored pattern (substring match)
///   and GitHub line references are skipped.
/// * Any other fragment must be in `valid_fragments`; with `ignore_case`
///   set, both sides are lowercased before comparing.
///
/// A link that is reported empty or skipped is not descended into; all other
/// nodes have their children visited.
fn traverse_for_links<'a>(
    &self,
    node: &'a AstNode<'a>,
    valid_fragments: &HashSet<String>,
    violations: &mut Vec<Violation>,
) {
    if let NodeValue::Link(link) = &node.data.borrow().value {
        if let Some(fragment) = link.url.strip_prefix('#') {
            if fragment.is_empty() {
                let (line, column) = self.get_position(node);
                violations.push(self.create_violation(
                    "Link fragment is empty".to_string(),
                    line,
                    column,
                    Severity::Error,
                ));
                return;
            }

            let ignored = self
                .ignored_pattern
                .as_deref()
                .map_or(false, |pattern| fragment.contains(pattern));
            if ignored || self.is_github_line_reference(fragment) {
                return;
            }

            // Look the fragment up directly instead of cloning or rebuilding
            // the whole set for every link.
            let is_known = if self.ignore_case {
                let needle = fragment.to_lowercase();
                valid_fragments
                    .iter()
                    .any(|candidate| candidate.to_lowercase() == needle)
            } else {
                valid_fragments.contains(fragment)
            };

            if !is_known {
                let (line, column) = self.get_position(node);
                violations.push(self.create_violation(
                    format!("Link fragment '{fragment}' is not valid"),
                    line,
                    column,
                    Severity::Error,
                ));
            }
        }
    }
    for child in node.children() {
        self.traverse_for_links(child, valid_fragments, violations);
    }
}
fn check_fragments_fallback(&self, document: &Document) -> Vec<Violation> {
let mut violations = Vec::new();
for (line_num, line) in document.content.lines().enumerate() {
let line_number = line_num + 1;
let mut chars = line.char_indices().peekable();
let mut in_backticks = false;
while let Some((i, ch)) = chars.next() {
match ch {
'`' => {
in_backticks = !in_backticks;
}
'[' if !in_backticks => {
if let Some((fragment, text_end)) = self.parse_fragment_link(&line[i..]) {
if fragment.is_empty() {
violations.push(self.create_violation(
"Link fragment is empty".to_string(),
line_number,
i + 1,
Severity::Error,
));
for _ in 0..text_end - 1 {
chars.next();
}
continue;
}
if fragment == "top" {
for _ in 0..text_end - 1 {
chars.next();
}
continue;
}
let mut is_suspicious = false;
if self.is_github_line_reference(&fragment) {
for _ in 0..text_end - 1 {
chars.next();
}
continue;
}
if fragment.contains("invalid") || fragment.contains("undefined") {
is_suspicious = true;
}
if !self.ignore_case && fragment != fragment.to_lowercase() {
is_suspicious = true;
}
if is_suspicious {
violations.push(self.create_violation(
format!("Link fragment '{fragment}' may not be valid"),
line_number,
i + 1,
Severity::Warning,
));
}
for _ in 0..text_end - 1 {
chars.next();
}
}
}
_ => {}
}
}
}
violations
}
/// Parses a `[text](#fragment)` link located at the very start of `text`.
///
/// Brackets inside the link text may nest; the `]` that balances the opening
/// `[` ends the text. It must be followed immediately by `(#`, and the
/// fragment runs up to the next `)`.
///
/// Returns the fragment and the byte length of the whole link, or `None`
/// when `text` does not start with such a link.
fn parse_fragment_link(&self, text: &str) -> Option<(String, usize)> {
    if !text.starts_with('[') {
        return None;
    }

    // Byte index of the `]` that closes the opening bracket.
    let mut depth = 0;
    let close = text.char_indices().find_map(|(idx, c)| {
        match c {
            '[' => depth += 1,
            ']' => {
                depth -= 1;
                if depth == 0 {
                    return Some(idx);
                }
            }
            _ => {}
        }
        None
    })?;

    let after_hash = text[close + 1..].strip_prefix("(#")?;
    let fragment_len = after_hash.find(')')?;
    let fragment = &after_hash[..fragment_len];

    // Through `]` (close + 1), then `(#` (2), the fragment, then `)` (1).
    let total_length = close + 1 + 2 + fragment_len + 1;
    Some((fragment.to_string(), total_length))
}
}
/// Rule wiring for MD051.
impl Rule for MD051 {
    /// Stable rule identifier.
    fn id(&self) -> &'static str {
        "MD051"
    }

    /// Human-readable rule name.
    fn name(&self) -> &'static str {
        "link-fragments"
    }

    /// One-line description of what the rule enforces.
    fn description(&self) -> &'static str {
        "Link fragments should be valid"
    }

    /// Marks the rule as a stable rule in the links category.
    fn metadata(&self) -> RuleMetadata {
        RuleMetadata::stable(RuleCategory::Links)
    }

    /// Checks the document. With an AST, links are validated against the
    /// anchors collected from it; without one, the text-based fallback runs.
    fn check_with_ast<'a>(
        &self,
        document: &Document,
        ast: Option<&'a AstNode<'a>>,
    ) -> Result<Vec<Violation>> {
        let violations = match ast {
            Some(root) => {
                let known_fragments = self.collect_valid_fragments(root);
                self.check_link_fragments(root, &known_fragments)
            }
            None => self.check_fragments_fallback(document),
        };
        Ok(violations)
    }
}
// Unit tests for MD051. Each test feeds a Markdown fixture to the rule through
// the shared test helpers and checks how many violations come back.
// NOTE(review): several tests assert specific violation line numbers; confirm
// those numbers against the exact fixture text (including blank lines).
#[cfg(test)]
mod tests {
use super::*;
use crate::test_helpers::{assert_no_violations, assert_single_violation};
// Links to headings, an HTML id, an anchor name and `#top` are all accepted.
#[test]
fn test_valid_fragments() {
let content = r#"# Heading Name
[Link](#heading-name)
## Another Heading
[Another link](#another-heading)
<div id="custom-id"></div>
[Custom](#custom-id)
<a name="bookmark"></a>
[Bookmark](#bookmark)
[Top link](#top)
"#;
assert_no_violations(MD051::new(), content);
}
// A fragment that matches nothing is reported, and the message names it.
#[test]
fn test_invalid_fragments() {
let content = r#"# Heading Name
[Invalid link](#invalid-fragment)
"#;
let violation = assert_single_violation(MD051::new(), content);
assert_eq!(violation.line, 3);
assert!(violation.message.contains("invalid-fragment"));
}
// The second heading with the same text is reachable as `<slug>-1`.
#[test]
fn test_duplicate_headings() {
let content = r#"# Test
[Link 1](#test)
# Test
[Link 2](#test-1)
"#;
assert_no_violations(MD051::new(), content);
}
// GitHub-style line references are never reported.
#[test]
fn test_github_line_references() {
let content = r#"# Code
[Line 20](#L20)
[Range](#L19C5-L21C11)
"#;
assert_no_violations(MD051::new(), content);
}
// A case mismatch is reported by default and accepted with `ignore_case(true)`.
#[test]
fn test_case_sensitivity() {
let content = r#"# Heading Name
[Link](#Heading-Name)
"#;
let violation = assert_single_violation(MD051::new(), content);
assert_eq!(violation.line, 3);
assert_no_violations(MD051::new().ignore_case(true), content);
}
// A `{#custom-anchor}` on a heading can be linked to.
#[test]
fn test_custom_anchor() {
let content = r#"# Heading Name {#custom-anchor}
[Link](#custom-anchor)
"#;
assert_no_violations(MD051::new(), content);
}
// A bare `#` fragment produces exactly one violation.
#[test]
fn test_empty_fragment() {
let content = r#"# Heading
[Empty fragment](#)
"#;
let violation = assert_single_violation(MD051::new(), content);
assert_eq!(violation.line, 3);
}
// `id` attributes on HTML elements are accepted as link targets.
#[test]
fn test_html_id_attributes() {
let content = r#"# Heading
<div id="custom-id">Content</div>
<span id="another-id">Text</span>
[Link to div](#custom-id)
[Link to span](#another-id)
"#;
assert_no_violations(MD051::new(), content);
}
// Links to `name` attribute values are expected to produce no violations.
#[test]
fn test_html_name_attributes() {
let content = r#"# Heading
<a name="anchor-name"></a>
<div name="form-element">Content</div>
[Link to anchor](#anchor-name)
[Link to element](#form-element)
"#;
assert_no_violations(MD051::new(), content);
}
// Ids and names nested inside a multi-line HTML block are accepted.
#[test]
fn test_html_block_extraction() {
let content = r#"# Heading
<div class="content">
<p id="paragraph-id">Text</p>
<a name="link-name" href="/test">Link</a>
</div>
[Link to paragraph](#paragraph-id)
[Link to anchor](#link-name)
"#;
assert_no_violations(MD051::new(), content);
}
// Ids and names in inline HTML within a paragraph are accepted.
#[test]
fn test_html_inline_extraction() {
let content = r#"# Heading
This is text with <span id="inline-id">inline HTML</span> and <a name="inline-name">anchor</a>.
[Link to inline](#inline-id)
[Link to anchor](#inline-name)
"#;
assert_no_violations(MD051::new(), content);
}
// Headings with punctuation, underscores, digits and repeated spaces.
#[test]
fn test_complex_fragment_generation() {
let content = r#"# Complex Heading with (Parentheses) & Symbols!
[Link](#complex-heading-with-parentheses--symbols)
## Another_Complex-Title 123
[Another link](#another_complex-title-123)
### Multiple Spaces Between Words
[Space link](#multiple-spaces-between-words)
"#;
assert_no_violations(MD051::new(), content);
}
// Headings with runs of dashes and with leading/trailing dashes.
#[test]
fn test_dash_consolidation() {
let content = r#"# Title---With----Multiple-----Dashes
[Link](#title-with-multiple-dashes)
## --Leading-And-Trailing--
[Another link](#leading-and-trailing)
"#;
assert_no_violations(MD051::new(), content);
}
// Headings containing non-ASCII letters, an emoji, inline code and bold text.
#[test]
fn test_unicode_and_special_chars() {
let content = r#"# Heading with émojis 🚀 and ñ
[Unicode link](#heading-with-émojis--and-ñ)
## Code `inline` and **bold**
[Code link](#code-inline-and-bold)
"#;
assert_no_violations(MD051::new(), content);
}
// Only the link to the malformed custom anchor is expected to be reported.
#[test]
fn test_custom_anchor_validation() {
let content = r#"# Valid Custom {#valid-anchor}
[Link](#valid-anchor)
# Invalid Custom {#invalid anchor}
[Bad link](#invalid-anchor)
"#;
let violation = assert_single_violation(MD051::new(), content);
assert_eq!(violation.line, 7);
assert!(violation.message.contains("invalid-anchor"));
}
// Empty, invalid and nested custom-anchor syntax must not cause violations.
#[test]
fn test_custom_anchor_edge_cases() {
let content = r#"# Empty Custom {#}
# Valid Custom {#test123}
[Link](#test123)
# Invalid Chars {#test@123}
# Nested {#outer {#inner} }
"#;
assert_no_violations(MD051::new(), content);
}
// Single lines, line ranges and line+column ranges are all accepted.
#[test]
fn test_github_line_references_detailed() {
let content = r#"# Code Examples
[Line reference](#L42)
[Line range](#L10-L20)
[Complex range](#L15C3-L25C10)
[Another format](#L1C1-L1C5)
"#;
assert_no_violations(MD051::new(), content);
}
// Mixed document: only the `#Invalid-Reference` link is expected to be reported.
#[test]
fn test_multiple_document_types() {
let content = r#"# Main Heading
Regular text here.
<div id="html-id">HTML content</div>
<a name="html-name">Anchor</a>
## Sub Heading {#custom-sub}
More content.
[Link to main](#main-heading)
[Link to sub](#custom-sub)
[Link to HTML ID](#html-id)
[Link to HTML name](#html-name)
[GitHub reference](#L100)
[Invalid reference](#Invalid-Reference)
"#;
let violation = assert_single_violation(MD051::new(), content);
assert_eq!(violation.line, 18);
assert!(violation.message.contains("Invalid-Reference"));
}
// Three identical headings are reachable as `test`, `test-1` and `test-2`.
#[test]
fn test_duplicate_heading_numbering() {
let content = r#"# Test
[First link](#test)
# Test
[Second link](#test-1)
# Test
[Third link](#test-2)
# Different
[Different link](#different)
"#;
assert_no_violations(MD051::new(), content);
}
// Comments, single quotes, an unterminated quote and an unquoted id.
#[test]
fn test_html_parsing_edge_cases() {
let content = r#"# Heading
<!-- Comment with id="not-real" -->
<div id='single-quotes'>Content</div>
<span id="no-closing-quote>Broken</span>
<p id=unquoted-id>Unquoted</p>
[Single quotes](#single-quotes)
[Unquoted](#unquoted-id)
"#;
assert_no_violations(MD051::new(), content);
}
// The `ignore_case` builder option turns a case mismatch into a pass.
#[test]
fn test_configuration_options() {
let content = r#"# Test Heading
[Case mismatch](#Test-Heading)
"#;
let violation = assert_single_violation(MD051::new(), content);
assert_eq!(violation.line, 3);
assert_no_violations(MD051::new().ignore_case(true), content);
}
// With an ignored pattern configured, only `#invalid-fragment` is expected to
// be reported.
#[test]
fn test_ignored_pattern() {
let content = r#"# Heading
[External link](#external-pattern)
[Normal link](#invalid-fragment)
"#;
let rule = MD051::new().ignored_pattern(Some("external-*".to_string()));
let violation = assert_single_violation(rule, content);
assert_eq!(violation.line, 4);
assert!(violation.message.contains("invalid-fragment"));
}
// An empty document yields no violations.
#[test]
fn test_empty_document() {
let content = "";
assert_no_violations(MD051::new(), content);
}
// A fragment link in a document without headings is reported.
#[test]
fn test_no_headings_no_fragments() {
let content = r#"Just some text without headings.
[Invalid link](#Invalid-Fragment)
"#;
let violation = assert_single_violation(MD051::new(), content);
assert_eq!(violation.line, 3);
assert!(violation.message.contains("Invalid-Fragment"));
}
// `#top` is always accepted.
#[test]
fn test_top_fragment() {
let content = r#"# Heading
[Link to top](#top)
"#;
assert_no_violations(MD051::new(), content);
}
// Malformed `id` attributes must not break checking of an ordinary heading link.
#[test]
fn test_malformed_html() {
let content = r#"# Heading
<div id=>Empty value</div>
<span id>No value</span>
<p id="unclosed>Bad quote</p>
[Should still work](#heading)
"#;
assert_no_violations(MD051::new(), content);
}
// Ids and names on elements nested several levels deep.
#[test]
fn test_nested_html_elements() {
let content = r#"# Heading
<div class="outer">
<div id="nested-id">
<span name="deep-name">Content</span>
</div>
</div>
[Link to nested](#nested-id)
[Link to deep](#deep-name)
"#;
assert_no_violations(MD051::new(), content);
}
// Inline code, bold and italic text inside headings contribute to the slug.
#[test]
fn test_heading_with_code_and_emphasis() {
let content = r#"# Title with `code` and **bold** and *italic*
[Link](#title-with-code-and-bold-and-italic)
## Another `complex` **formatting** example
[Another link](#another-complex-formatting-example)
"#;
assert_no_violations(MD051::new(), content);
}
}