quickmark_core/rules/
md051.rs

1use once_cell::sync::Lazy;
2use regex::Regex;
3use serde::Deserialize;
4use std::collections::HashSet;
5use std::rc::Rc;
6
7use tree_sitter::Node;
8
9use crate::{
10    linter::{range_from_tree_sitter, RuleViolation},
11    rules::{Context, Rule, RuleLinter, RuleType},
12};
13
14// MD051-specific configuration types
15#[derive(Debug, PartialEq, Clone, Deserialize, Default)]
16pub struct MD051LinkFragmentsTable {
17    #[serde(default)]
18    pub ignore_case: bool,
19    #[serde(default)]
20    pub ignored_pattern: String,
21}
22
23#[derive(Debug, Clone)]
24struct LinkFragment {
25    fragment: String,
26    range: tree_sitter::Range,
27}
28
29// Pre-compiled regex patterns for performance
30static LINK_PATTERN: Lazy<Regex> =
31    Lazy::new(|| Regex::new(r"\[([^\]]*)\]\(([^)]*#[^)]*)\)").unwrap());
32
33static RANGE_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"^L\d+C\d+-L\d+C\d+$").unwrap());
34
35static ID_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r#"id\s*=\s*["']([^"']+)["']"#).unwrap());
36
37static NAME_PATTERN: Lazy<Regex> =
38    Lazy::new(|| Regex::new(r#"name\s*=\s*["']([^"']+)["']"#).unwrap());
39
40pub(crate) struct MD051Linter {
41    context: Rc<Context>,
42    valid_fragments: HashSet<String>,
43    valid_fragments_lowercase: HashSet<String>, // Pre-computed lowercase for case-insensitive lookups
44    link_fragments: Vec<LinkFragment>,
45}
46
47impl MD051Linter {
48    pub fn new(context: Rc<Context>) -> Self {
49        Self {
50            context,
51            valid_fragments: HashSet::new(),
52            valid_fragments_lowercase: HashSet::new(),
53            link_fragments: Vec::new(),
54        }
55    }
56
57    fn extract_heading_text(&self, node: &Node) -> Option<String> {
58        // Get the text content of the heading, excluding markers
59        let start_byte = node.start_byte();
60        let end_byte = node.end_byte();
61        let document_content = self.context.document_content.borrow();
62        let _heading_content = &document_content[start_byte..end_byte];
63
64        // For ATX headings, remove the # markers and trim
65        if node.kind() == "atx_heading" {
66            // Find the heading text by looking for inline content
67            for i in 0..node.child_count() {
68                let child = node.child(i).unwrap();
69                if child.kind() == "inline" {
70                    let child_start = child.start_byte();
71                    let child_end = child.end_byte();
72                    let text = &document_content[child_start..child_end].trim();
73                    return Some(text.to_string());
74                }
75            }
76        }
77
78        // For setext headings, look for paragraph containing inline content
79        if node.kind() == "setext_heading" {
80            for i in 0..node.child_count() {
81                let child = node.child(i).unwrap();
82                if child.kind() == "paragraph" {
83                    // Look for inline content within the paragraph
84                    for j in 0..child.child_count() {
85                        let grandchild = child.child(j).unwrap();
86                        if grandchild.kind() == "inline" {
87                            let grandchild_start = grandchild.start_byte();
88                            let grandchild_end = grandchild.end_byte();
89                            let text = &document_content[grandchild_start..grandchild_end].trim();
90                            return Some(text.to_string());
91                        }
92                    }
93                }
94            }
95        }
96
97        None
98    }
99
100    fn generate_github_fragment(&self, heading_text: &str) -> String {
101        // Implementation of GitHub's heading algorithm:
102        // 1. Convert to lowercase
103        // 2. Remove punctuation (keep alphanumeric, spaces, hyphens)
104        // 3. Replace spaces with hyphens
105        // 4. Remove multiple consecutive hyphens
106
107        let mut result = heading_text.to_lowercase();
108
109        // Remove punctuation, keeping only alphanumeric, spaces, and hyphens
110        result = result
111            .chars()
112            .filter(|c| c.is_alphanumeric() || c.is_whitespace() || *c == '-')
113            .collect();
114
115        // Replace spaces with hyphens
116        result = result.replace(' ', "-");
117
118        // Remove multiple consecutive hyphens efficiently
119        let chars: Vec<char> = result.chars().collect();
120        let mut filtered = Vec::new();
121        let mut prev_was_dash = false;
122
123        for ch in chars {
124            if ch == '-' {
125                if !prev_was_dash {
126                    filtered.push(ch);
127                    prev_was_dash = true;
128                }
129            } else {
130                filtered.push(ch);
131                prev_was_dash = false;
132            }
133        }
134        result = filtered.into_iter().collect();
135
136        // Trim leading/trailing hyphens
137        result = result.trim_matches('-').to_string();
138
139        result
140    }
141
142    fn extract_custom_anchor(&self, heading_text: &str) -> Option<String> {
143        // Look for {#custom-anchor} syntax
144        if let Some(start) = heading_text.rfind("{#") {
145            if let Some(end) = heading_text[start..].find('}') {
146                let anchor = &heading_text[start + 2..start + end];
147                return Some(anchor.to_string());
148            }
149        }
150        None
151    }
152
153    fn extract_link_fragments(&self, node: &Node) -> Vec<String> {
154        // Extract all fragments from link nodes
155        let start_byte = node.start_byte();
156        let end_byte = node.end_byte();
157        let document_content = self.context.document_content.borrow();
158        let content = &document_content[start_byte..end_byte];
159
160        let mut fragments = Vec::new();
161
162        for cap in LINK_PATTERN.captures_iter(content) {
163            if let Some(url_with_fragment) = cap.get(2) {
164                let url_text = url_with_fragment.as_str();
165                if let Some(hash_pos) = url_text.rfind('#') {
166                    let fragment = &url_text[hash_pos + 1..];
167                    // Only process non-empty fragments that don't contain spaces
168                    // Fragments with spaces are likely malformed and handled by other rules
169                    if !fragment.is_empty() && !fragment.contains(' ') {
170                        fragments.push(fragment.to_string());
171                    }
172                }
173            }
174        }
175
176        fragments
177    }
178
179    fn is_github_special_fragment(&self, fragment: &str) -> bool {
180        // GitHub special fragments according to GitHub specification
181        // Reference: https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/creating-a-permanent-link-to-a-code-snippet
182
183        if fragment == "top" {
184            return true;
185        }
186
187        // Line number patterns: L followed by one or more digits (L123, L1, etc.)
188        if fragment.starts_with('L')
189            && fragment.len() > 1
190            && fragment[1..].chars().all(|c| c.is_ascii_digit())
191        {
192            return true;
193        }
194
195        // Range patterns: L19C5-L21C11 (GitHub's official format for line ranges with column numbers)
196        if RANGE_PATTERN.is_match(fragment) {
197            return true;
198        }
199
200        // Note: L10-L20 format is NOT valid according to GitHub spec
201        // GitHub requires column numbers: L10C1-L20C5
202
203        false
204    }
205
206    fn extract_html_id_or_name(&self, node: &Node) -> Vec<String> {
207        // Extract id and name attributes from HTML elements
208        let mut ids = Vec::new();
209        let start_byte = node.start_byte();
210        let end_byte = node.end_byte();
211        let document_content = self.context.document_content.borrow();
212        let html_content = &document_content[start_byte..end_byte];
213
214        for cap in ID_PATTERN.captures_iter(html_content) {
215            if let Some(id) = cap.get(1) {
216                ids.push(id.as_str().to_string());
217            }
218        }
219
220        for cap in NAME_PATTERN.captures_iter(html_content) {
221            if let Some(name) = cap.get(1) {
222                ids.push(name.as_str().to_string());
223            }
224        }
225
226        ids
227    }
228}
229
230impl RuleLinter for MD051Linter {
231    fn feed(&mut self, node: &Node) {
232        match node.kind() {
233            "atx_heading" | "setext_heading" => {
234                if let Some(heading_text) = self.extract_heading_text(node) {
235                    // Check for custom anchor first
236                    if let Some(custom_anchor) = self.extract_custom_anchor(&heading_text) {
237                        self.valid_fragments.insert(custom_anchor.clone());
238                        self.valid_fragments_lowercase
239                            .insert(custom_anchor.to_lowercase());
240                        // Also generate the default fragment from the heading text without the anchor
241                        let clean_text = heading_text
242                            .replace(&format!("{{#{custom_anchor}}}"), "")
243                            .trim()
244                            .to_string();
245                        if !clean_text.is_empty() {
246                            let fragment = self.generate_github_fragment(&clean_text);
247                            if !fragment.is_empty() {
248                                self.valid_fragments.insert(fragment.clone());
249                                self.valid_fragments_lowercase
250                                    .insert(fragment.to_lowercase());
251                            }
252                        }
253                    } else {
254                        // Generate GitHub-style fragment
255                        let fragment = self.generate_github_fragment(&heading_text);
256                        if !fragment.is_empty() {
257                            // Handle duplicate headings by checking if fragment already exists
258                            let mut unique_fragment = fragment.clone();
259                            let mut counter = 1;
260                            while self.valid_fragments.contains(&unique_fragment) {
261                                unique_fragment = format!("{fragment}-{counter}");
262                                counter += 1;
263                            }
264                            self.valid_fragments.insert(unique_fragment.clone());
265                            self.valid_fragments_lowercase
266                                .insert(unique_fragment.to_lowercase());
267                        }
268                    }
269                }
270            }
271            "inline" | "html_block" => {
272                // Extract HTML id and name attributes
273                let ids = self.extract_html_id_or_name(node);
274                for id in ids {
275                    self.valid_fragments.insert(id.clone());
276                    self.valid_fragments_lowercase.insert(id.to_lowercase());
277                }
278
279                // Also look for links in inline content
280                let fragments = self.extract_link_fragments(node);
281                for fragment in fragments {
282                    // We need to store the node for later violation reporting
283                    // Note: This is a simplified approach. In a real implementation,
284                    // we'd need to handle the lifetime properly
285                    self.link_fragments.push(LinkFragment {
286                        fragment,
287                        range: node.range(),
288                    });
289                }
290            }
291            _ => {
292                // For other nodes, do nothing to avoid duplicates
293            }
294        }
295    }
296
297    fn finalize(&mut self) -> Vec<RuleViolation> {
298        let mut violations = Vec::new();
299        let config = &self.context.config.linters.settings.link_fragments;
300
301        // Compile ignored pattern regex if provided
302        let ignored_regex = if !config.ignored_pattern.is_empty() {
303            Regex::new(&config.ignored_pattern).ok()
304        } else {
305            None
306        };
307
308        for link_fragment in &self.link_fragments {
309            let fragment = &link_fragment.fragment;
310            let mut is_valid = false;
311
312            // Check if it's a GitHub special fragment
313            if self.is_github_special_fragment(fragment) {
314                is_valid = true;
315            }
316
317            // Check if it matches ignored pattern
318            if !is_valid {
319                if let Some(ref regex) = ignored_regex {
320                    if regex.is_match(fragment) {
321                        is_valid = true;
322                    }
323                }
324            }
325
326            // Check if it matches any valid fragment
327            if !is_valid {
328                if config.ignore_case {
329                    let fragment_lower = fragment.to_lowercase();
330                    is_valid = self.valid_fragments_lowercase.contains(&fragment_lower);
331                } else {
332                    is_valid = self.valid_fragments.contains(fragment);
333                }
334            }
335
336            if !is_valid {
337                violations.push(RuleViolation::new(
338                    &MD051,
339                    format!("Link fragment '{fragment}' does not match any heading or anchor in the document"),
340                    self.context.file_path.clone(),
341                    range_from_tree_sitter(&link_fragment.range),
342                ));
343            }
344        }
345
346        violations
347    }
348}
349
350pub const MD051: Rule = Rule {
351    id: "MD051",
352    alias: "link-fragments",
353    tags: &["links"],
354    description: "Link fragments should be valid",
355    rule_type: RuleType::Document,
356    required_nodes: &["link", "atx_heading", "setext_heading"],
357    new_linter: |context| Box::new(MD051Linter::new(context)),
358};
359
#[cfg(test)]
mod test {
    use std::path::PathBuf;

    use crate::config::{LintersSettingsTable, MD051LinkFragmentsTable, RuleSeverity};
    use crate::linter::MultiRuleLinter;
    use crate::test_utils::test_helpers::test_config_with_rules;

    /// Configuration with only the `link-fragments` rule enabled and default settings.
    fn test_config() -> crate::config::QuickmarkConfig {
        test_config_with_rules(vec![("link-fragments", RuleSeverity::Error)])
    }

    /// Configuration with only the `link-fragments` rule enabled and the given rule settings.
    fn test_config_with_settings(
        ignore_case: bool,
        ignored_pattern: String,
    ) -> crate::config::QuickmarkConfig {
        let settings = LintersSettingsTable {
            link_fragments: MD051LinkFragmentsTable {
                ignore_case,
                ignored_pattern,
            },
            ..Default::default()
        };
        crate::test_utils::test_helpers::test_config_with_settings(
            vec![("link-fragments", RuleSeverity::Error)],
            settings,
        )
    }

    /// Lints `input` as `test.md` and returns the message of every violation, in report order.
    fn lint_messages(config: crate::config::QuickmarkConfig, input: &str) -> Vec<String> {
        let mut linter = MultiRuleLinter::new_for_document(PathBuf::from("test.md"), config, input);
        let violations = linter.analyze();
        let messages: Vec<String> = violations
            .iter()
            .map(|violation| violation.message().to_string())
            .collect();
        messages
    }

    #[test]
    fn test_basic_valid_fragment() {
        let input = "# Test Heading\n\n[Valid Link](#test-heading)\n";

        // The fragment matches the heading's generated anchor.
        let messages = lint_messages(test_config(), input);
        assert_eq!(0, messages.len());
    }

    #[test]
    fn test_basic_invalid_fragment() {
        let input = "# Test Heading\n\n[Invalid Link](#nonexistent-heading)\n";

        // No heading or anchor matches the fragment.
        let messages = lint_messages(test_config(), input);
        assert_eq!(1, messages.len());
    }

    #[test]
    fn test_case_sensitive_default() {
        let input = "# Test Heading\n\n[Invalid Link](#Test-Heading)\n";

        // Matching is case-sensitive unless configured otherwise.
        let messages = lint_messages(test_config(), input);
        assert_eq!(1, messages.len());
    }

    #[test]
    fn test_ignore_case_option() {
        let input = "# Test Heading\n\n[Valid Link](#Test-Heading)\n";

        // With `ignore_case` the differently-cased fragment is accepted.
        let messages = lint_messages(test_config_with_settings(true, String::new()), input);
        assert_eq!(0, messages.len());
    }

    #[test]
    fn test_punctuation_removal() {
        let input =
            "# Test: Heading! With? Punctuation.\n\n[Valid Link](#test-heading-with-punctuation)\n";

        // Punctuation is stripped when the heading anchor is generated.
        let messages = lint_messages(test_config(), input);
        assert_eq!(0, messages.len());
    }

    #[test]
    fn test_duplicate_headings() {
        let input = "# Test Heading\n\n## Test Heading\n\n[Link 1](#test-heading)\n[Link 2](#test-heading-1)\n";

        // The second identical heading gets the `-1` suffix; both links resolve.
        let messages = lint_messages(test_config(), input);
        assert_eq!(0, messages.len());
    }

    #[test]
    fn test_custom_anchor() {
        let input = "# Test Heading {#custom-anchor}\n\n[Valid Link](#custom-anchor)\n";

        // A `{#...}` marker defines a valid anchor.
        let messages = lint_messages(test_config(), input);
        assert_eq!(0, messages.len());
    }

    #[test]
    fn test_html_id_attribute() {
        let input = "# Test Heading\n\n<div id=\"my-custom-id\">Content</div>\n\n[Valid Link](#my-custom-id)\n";

        // An HTML `id` attribute defines a valid anchor.
        let messages = lint_messages(test_config(), input);
        assert_eq!(0, messages.len());
    }

    #[test]
    fn test_html_name_attribute() {
        let input =
            "# Test Heading\n\n<a name=\"my-anchor\">Anchor</a>\n\n[Valid Link](#my-anchor)\n";

        // An HTML `name` attribute defines a valid anchor.
        let messages = lint_messages(test_config(), input);
        assert_eq!(0, messages.len());
    }

    #[test]
    fn test_ignored_pattern() {
        let input = "# Test Heading\n\n[Link to external](#external-fragment)\n";

        // The fragment matches the configured ignore pattern.
        let config = test_config_with_settings(false, "external-.*".to_string());
        let messages = lint_messages(config, input);
        assert_eq!(0, messages.len());
    }

    #[test]
    fn test_github_special_fragments() {
        let input = "# Test Heading\n\n[Link to top](#top)\n[Link to line](#L20)\n[Link to range](#L19C5-L21C11)\n[Invalid range](#L10-L20)\n";

        // Only `L10-L20` is reported: it is not a valid GitHub fragment.
        let messages = lint_messages(test_config(), input);
        assert_eq!(1, messages.len());
        assert!(messages[0].contains("Link fragment 'L10-L20'"));
    }

    #[test]
    fn test_multiple_violations() {
        let input = "# Valid Heading\n\n[Valid Link](#valid-heading)\n[Invalid Link 1](#invalid-one)\n[Invalid Link 2](#invalid-two)\n";

        // Each of the two unresolved fragments is reported.
        let messages = lint_messages(test_config(), input);
        assert_eq!(2, messages.len());
    }

    #[test]
    fn test_setext_headings() {
        let input = "Test Heading\n============\n\nAnother Heading\n---------------\n\n[Valid Link 1](#test-heading)\n[Valid Link 2](#another-heading)\n";

        // Both setext headings produce anchors.
        let messages = lint_messages(test_config(), input);
        assert_eq!(0, messages.len());
    }

    #[test]
    fn test_edge_cases_for_consistency() {
        let input = "# Test Heading\n\n[Valid link](#test-heading)\n[Fragment with spaces](#test heading)\n[Empty fragment](#)\n[Invalid single L](#L)\n[Valid L with number](#L123)\n";

        // Only `#L` is reported; fragments with spaces and empty fragments are
        // ignored (consistent with markdownlint).
        let messages = lint_messages(test_config(), input);
        assert_eq!(1, messages.len());
        assert!(messages[0].contains("Link fragment 'L'"));
    }
}