rumdl_lib/rules/
md057_existing_relative_links.rs

1//!
2//! Rule MD057: Existing relative links
3//!
4//! See [docs/md057.md](../../docs/md057.md) for full documentation, configuration, and examples.
5
6use crate::rule::{LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
7use crate::utils::element_cache::ElementCache;
8use crate::utils::range_utils::LineIndex;
9use lazy_static::lazy_static;
10use regex::Regex;
11use std::collections::HashMap;
12use std::env;
13use std::path::{Path, PathBuf};
14use std::sync::{Arc, Mutex};
15
16mod md057_config;
17use md057_config::MD057Config;
18
19// Thread-safe cache for file existence checks to avoid redundant filesystem operations
20lazy_static! {
21    static ref FILE_EXISTENCE_CACHE: Arc<Mutex<HashMap<PathBuf, bool>>> = Arc::new(Mutex::new(HashMap::new()));
22}
23
24// Reset the file existence cache (typically between rule runs)
25fn reset_file_existence_cache() {
26    let mut cache = FILE_EXISTENCE_CACHE.lock().unwrap();
27    cache.clear();
28}
29
30// Check if a file exists with caching
31fn file_exists_with_cache(path: &Path) -> bool {
32    let mut cache = FILE_EXISTENCE_CACHE.lock().unwrap();
33    *cache.entry(path.to_path_buf()).or_insert_with(|| path.exists())
34}
35
lazy_static! {
    // Matches the bracketed text that opens a link or image: an optional `!`
    // followed by `[...]` containing no `]`. Kept deliberately simple for
    // performance; the `(URL)` part is extracted separately with URL_EXTRACT_REGEX.
    static ref LINK_START_REGEX: Regex =
        Regex::new(r"!?\[[^\]]*\]").unwrap();

    /// Regex to extract the URL from a markdown link.
    /// Format: `](URL)` or `](URL "title")`; the URL may be wrapped in `<...>`.
    /// Capture group 1 is the path part (it stops at whitespace, `)`, `>` or `#`);
    /// the optional capture group 2 is a `#fragment`.
    static ref URL_EXTRACT_REGEX: Regex =
        Regex::new("\\]\\(\\s*<?([^>\\)\\s#]+)(#[^)\\s]*)?\\s*(?:\"[^\"]*\")?\\s*>?\\s*\\)").unwrap();

    /// Regex to detect the start of a fenced code block: up to three spaces of
    /// indentation followed by at least three backticks or three tildes.
    /// NOTE(review): not referenced elsewhere in this file as shown.
    static ref CODE_FENCE_REGEX: Regex =
        Regex::new(r"^( {0,3})(`{3,}|~{3,})").unwrap();

    /// Regex to detect external links by prefix: `http://`, `https://`, `ftp://`,
    /// `mailto:` or `www.` at the very start of the URL.
    static ref PROTOCOL_DOMAIN_REGEX: Regex =
        Regex::new(r"^(https?://|ftp://|mailto:|www\.)").unwrap();

    /// Regex to detect media file types by extension at the end of the URL.
    /// The match is case-sensitive (lowercase extensions only).
    static ref MEDIA_FILE_REGEX: Regex =
        Regex::new(r"\.(jpg|jpeg|png|gif|bmp|svg|webp|tiff|mp3|mp4|avi|mov|webm|wav|ogg|pdf)$").unwrap();

    /// Regex to detect fragment-only links (URLs that start with `#`).
    /// NOTE(review): not referenced elsewhere in this file as shown;
    /// `is_fragment_only_link` uses `starts_with('#')` instead.
    static ref FRAGMENT_ONLY_REGEX: Regex =
        Regex::new(r"^#").unwrap();

    // Current working directory, captured on first use; falls back to "." when
    // it cannot be determined.
    static ref CURRENT_DIR: PathBuf = env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
}
65
/// Rule MD057: Existing relative links should point to valid files or directories.
///
/// The base directory lives behind an `Arc<Mutex<..>>`, so clones of a rule share
/// the same base-path cell.
#[derive(Debug, Default, Clone)]
pub struct MD057ExistingRelativeLinks {
    /// Base directory for resolving relative links; `None` until `with_path` sets it.
    base_path: Arc<Mutex<Option<PathBuf>>>,
    /// Rule configuration (read here for its `skip_media_files` flag).
    config: MD057Config,
}
74
75impl MD057ExistingRelativeLinks {
76    /// Create a new instance with default settings
77    pub fn new() -> Self {
78        Self::default()
79    }
80
81    /// Set the base path for resolving relative links
82    pub fn with_path<P: AsRef<Path>>(self, path: P) -> Self {
83        let path = path.as_ref();
84        let dir_path = if path.is_file() {
85            path.parent().map(|p| p.to_path_buf())
86        } else {
87            Some(path.to_path_buf())
88        };
89
90        *self.base_path.lock().unwrap() = dir_path;
91        self
92    }
93
94    /// Configure whether to skip checking media files
95    pub fn with_skip_media_files(mut self, skip_media_files: bool) -> Self {
96        self.config.skip_media_files = skip_media_files;
97        self
98    }
99
100    pub fn from_config_struct(config: MD057Config) -> Self {
101        Self {
102            base_path: Arc::new(Mutex::new(None)),
103            config,
104        }
105    }
106
107    /// Check if a URL is external (optimized version)
108    #[inline]
109    fn is_external_url(&self, url: &str) -> bool {
110        if url.is_empty() {
111            return false;
112        }
113
114        // Quick checks for common external URL patterns
115        if PROTOCOL_DOMAIN_REGEX.is_match(url) || url.starts_with("www.") {
116            return true;
117        }
118
119        // More restrictive domain check using a simpler pattern
120        if !self.is_media_file(url) && url.ends_with(".com") {
121            return true;
122        }
123
124        // Absolute paths within the site are not external
125        if url.starts_with('/') {
126            return false;
127        }
128
129        // All other cases (relative paths, etc.) are not external
130        false
131    }
132
133    /// Check if the URL is a fragment-only link (internal document link)
134    #[inline]
135    fn is_fragment_only_link(&self, url: &str) -> bool {
136        url.starts_with('#')
137    }
138
139    /// Check if the URL has a media file extension (optimized with early returns)
140    #[inline]
141    fn is_media_file(&self, url: &str) -> bool {
142        // Quick check before using regex
143        if !url.contains('.') {
144            return false;
145        }
146        MEDIA_FILE_REGEX.is_match(url)
147    }
148
149    /// Determine if we should skip checking this media file
150    #[inline]
151    fn should_skip_media_file(&self, url: &str) -> bool {
152        self.config.skip_media_files && self.is_media_file(url)
153    }
154
155    /// Resolve a relative link against the base path
156    fn resolve_link_path(&self, link: &str) -> Option<PathBuf> {
157        self.base_path
158            .lock()
159            .unwrap()
160            .as_ref()
161            .map(|base_path| base_path.join(link))
162    }
163
164    /// Process a single link and check if it exists
165    fn process_link(&self, url: &str, line_num: usize, column: usize, warnings: &mut Vec<LintWarning>) {
166        // Skip empty URLs
167        if url.is_empty() {
168            return;
169        }
170
171        // Skip external URLs and fragment-only links (optimized order)
172        if self.is_external_url(url) || self.is_fragment_only_link(url) {
173            return;
174        }
175
176        // Skip media files if configured to do so
177        if self.should_skip_media_file(url) {
178            return;
179        }
180
181        // Resolve the relative link against the base path
182        if let Some(resolved_path) = self.resolve_link_path(url) {
183            // Check if the file exists (with caching to avoid filesystem calls)
184            if !file_exists_with_cache(&resolved_path) {
185                warnings.push(LintWarning {
186                    rule_name: Some(self.name()),
187                    line: line_num,
188                    column,
189                    end_line: line_num,
190                    end_column: column + url.len(),
191                    message: format!("Relative link '{url}' does not exist"),
192                    severity: Severity::Warning,
193                    fix: None, // No automatic fix for missing files
194                });
195            }
196        }
197    }
198}
199
200impl Rule for MD057ExistingRelativeLinks {
201    fn name(&self) -> &'static str {
202        "MD057"
203    }
204
205    fn description(&self) -> &'static str {
206        "Relative links should point to existing files"
207    }
208
209    fn category(&self) -> RuleCategory {
210        RuleCategory::Link
211    }
212
213    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
214        ctx.content.is_empty() || !ctx.likely_has_links_or_images()
215    }
216
217    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
218        let content = ctx.content;
219
220        // Early returns for performance
221        if content.is_empty() || !content.contains('[') {
222            return Ok(Vec::new());
223        }
224
225        // Quick check for any potential links before expensive operations
226        if !content.contains("](") {
227            return Ok(Vec::new());
228        }
229
230        // Reset the file existence cache for a fresh run
231        reset_file_existence_cache();
232
233        let mut warnings = Vec::new();
234
235        // Cache base path lookup to avoid repeated mutex operations
236        let base_path = {
237            let base_path_guard = self.base_path.lock().unwrap();
238            if base_path_guard.is_some() {
239                base_path_guard.clone()
240            } else {
241                // Try to determine the base path from the file being processed (cached)
242                static CACHED_FILE_PATH: std::sync::OnceLock<Option<PathBuf>> = std::sync::OnceLock::new();
243                CACHED_FILE_PATH
244                    .get_or_init(|| {
245                        if let Ok(file_path) = env::var("RUMDL_FILE_PATH") {
246                            let path = Path::new(&file_path);
247                            if path.exists() {
248                                path.parent()
249                                    .map(|p| p.to_path_buf())
250                                    .or_else(|| Some(CURRENT_DIR.clone()))
251                            } else {
252                                Some(CURRENT_DIR.clone())
253                            }
254                        } else {
255                            Some(CURRENT_DIR.clone())
256                        }
257                    })
258                    .clone()
259            }
260        };
261
262        // If we still don't have a base path, we can't validate relative links
263        if base_path.is_none() {
264            return Ok(warnings);
265        }
266
267        // Use LintContext links instead of expensive regex parsing
268        if !ctx.links.is_empty() {
269            // Use LineIndex for correct position calculation across all line ending types
270            let line_index = LineIndex::new(content.to_string());
271
272            // Create element cache once for all links
273            let element_cache = ElementCache::new(content);
274
275            // Pre-collect lines to avoid repeated line iteration
276            let lines: Vec<&str> = content.lines().collect();
277
278            for link in &ctx.links {
279                let line_idx = link.line - 1;
280                if line_idx >= lines.len() {
281                    continue;
282                }
283
284                let line = lines[line_idx];
285
286                // Quick check for link pattern in this line
287                if !line.contains("](") {
288                    continue;
289                }
290
291                // Find all links in this line using optimized regex
292                for link_match in LINK_START_REGEX.find_iter(line) {
293                    let start_pos = link_match.start();
294                    let end_pos = link_match.end();
295
296                    // Calculate absolute position using LineIndex
297                    let line_start_byte = line_index.get_line_start_byte(line_idx + 1).unwrap_or(0);
298                    let absolute_start_pos = line_start_byte + start_pos;
299
300                    // Skip if this link is in a code span
301                    if element_cache.is_in_code_span(absolute_start_pos) {
302                        continue;
303                    }
304
305                    // Find the URL part after the link text
306                    if let Some(caps) = URL_EXTRACT_REGEX.captures_at(line, end_pos - 1)
307                        && let Some(url_group) = caps.get(1)
308                    {
309                        let url = url_group.as_str().trim();
310
311                        // Calculate column position
312                        let column = start_pos + 1;
313
314                        // Process and validate the link
315                        self.process_link(url, link.line, column, &mut warnings);
316                    }
317                }
318            }
319        }
320
321        Ok(warnings)
322    }
323
324    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
325        Ok(ctx.content.to_string())
326    }
327
328    fn as_any(&self) -> &dyn std::any::Any {
329        self
330    }
331
332    fn default_config_section(&self) -> Option<(String, toml::Value)> {
333        let json_value = serde_json::to_value(&self.config).ok()?;
334        Some((
335            self.name().to_string(),
336            crate::rule_config_serde::json_to_toml_value(&json_value)?,
337        ))
338    }
339
340    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
341    where
342        Self: Sized,
343    {
344        let rule_config = crate::rule_config_serde::load_rule_config::<MD057Config>(config);
345        Box::new(Self::from_config_struct(rule_config))
346    }
347}
348
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Write;
    use tempfile::tempdir;

    /// Run `rule` over `content` using the standard Markdown flavor.
    fn lint(rule: &MD057ExistingRelativeLinks, content: &str) -> Vec<LintWarning> {
        let ctx = crate::lint_context::LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.check(&ctx).unwrap()
    }

    /// Create a small `exists.md` inside `dir` and return its path.
    fn create_existing_file(dir: &Path) -> PathBuf {
        let path = dir.join("exists.md");
        File::create(&path).unwrap().write_all(b"# Test File").unwrap();
        path
    }

    #[test]
    fn test_external_urls() {
        let rule = MD057ExistingRelativeLinks::new();

        let external = [
            "https://example.com",
            "http://example.com",
            "ftp://example.com",
            "www.example.com",
            "example.com",
        ];
        for url in external {
            assert!(rule.is_external_url(url));
        }

        let local = ["./relative/path.md", "relative/path.md", "../parent/path.md"];
        for url in local {
            assert!(!rule.is_external_url(url));
        }
    }

    #[test]
    fn test_media_files() {
        // Default settings (skip_media_files = true)
        let rule_default = MD057ExistingRelativeLinks::new();

        // Media file identification
        for media in ["image.jpg", "video.mp4", "document.pdf", "path/to/audio.mp3"] {
            assert!(
                rule_default.is_media_file(media),
                "{media} should be identified as a media file"
            );
        }
        for other in ["document.md", "code.rs"] {
            assert!(
                !rule_default.is_media_file(other),
                "{other} should not be identified as a media file"
            );
        }

        // Skipping with default settings (skip_media_files = true)
        assert!(
            rule_default.should_skip_media_file("image.jpg"),
            "image.jpg should be skipped with default settings"
        );
        assert!(
            !rule_default.should_skip_media_file("document.md"),
            "document.md should not be skipped"
        );

        // Skipping with skip_media_files = false
        let rule_no_skip = MD057ExistingRelativeLinks::new().with_skip_media_files(false);
        assert!(
            !rule_no_skip.should_skip_media_file("image.jpg"),
            "image.jpg should not be skipped when skip_media_files is false"
        );
    }

    #[test]
    fn test_no_warnings_without_base_path() {
        let rule = MD057ExistingRelativeLinks::new();

        let result = lint(&rule, "[Link](missing.md)");
        assert!(result.is_empty(), "Should have no warnings without base path");
    }

    #[test]
    fn test_existing_and_missing_links() {
        // Temporary directory holding one real file
        let temp_dir = tempdir().unwrap();
        let base_path = temp_dir.path();

        let exists_path = create_existing_file(base_path);
        assert!(exists_path.exists(), "exists.md should exist for this test");

        // Content mixing existing, missing, external and media links
        let content = r#"
# Test Document

[Valid Link](exists.md)
[Invalid Link](missing.md)
[External Link](https://example.com)
[Media Link](image.jpg)
        "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(base_path);

        // Only missing.md is reported; the media file is skipped by default
        let result = lint(&rule, content);
        assert_eq!(result.len(), 1);
        assert!(result[0].message.contains("missing.md"));

        // A second run must produce the same outcome
        let result_with_structure = lint(&rule, content);
        assert_eq!(result.len(), result_with_structure.len());
        assert!(result_with_structure[0].message.contains("missing.md"));
    }

    #[test]
    fn test_angle_bracket_links() {
        // Temporary directory holding one real file
        let temp_dir = tempdir().unwrap();
        let base_path = temp_dir.path();
        create_existing_file(base_path);

        // Links whose destinations are wrapped in angle brackets
        let content = r#"
# Test Document

[Valid Link](<exists.md>)
[Invalid Link](<missing.md>)
[External Link](<https://example.com>)
    "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(base_path);
        let result = lint(&rule, content);

        // Only missing.md is reported
        assert_eq!(result.len(), 1, "Should have exactly one warning");
        assert!(
            result[0].message.contains("missing.md"),
            "Warning should mention missing.md"
        );
    }

    #[test]
    fn test_media_file_handling() {
        let temp_dir = tempdir().unwrap();
        let base_path = temp_dir.path();

        // Precondition: the referenced image must not exist
        assert!(
            !base_path.join("image.jpg").exists(),
            "Test precondition failed: image.jpg should not exist"
        );

        let content = "[Media Link](image.jpg)";

        // skip_media_files = true (default): nothing is reported
        let rule_skip_media = MD057ExistingRelativeLinks::new().with_path(base_path);
        let result_skip = lint(&rule_skip_media, content);
        assert_eq!(
            result_skip.len(),
            0,
            "Should have no warnings when skip_media_files is true"
        );

        // skip_media_files = false: the missing media file is reported
        let rule_check_all = MD057ExistingRelativeLinks::new()
            .with_path(base_path)
            .with_skip_media_files(false);
        let result_all = lint(&rule_check_all, content);
        assert_eq!(
            result_all.len(),
            1,
            "Should have one warning when skip_media_files is false"
        );
        assert!(
            result_all[0].message.contains("image.jpg"),
            "Warning should mention image.jpg"
        );
    }

    #[test]
    fn test_code_span_detection() {
        let temp_dir = tempdir().unwrap();
        let rule = MD057ExistingRelativeLinks::new().with_path(temp_dir.path());

        let content = "This is a [link](nonexistent.md) and `[not a link](not-checked.md)` in code.";
        let result = lint(&rule, content);

        // Only the real link is flagged, not the one inside the code span
        assert_eq!(result.len(), 1, "Should only flag the real link");
        assert!(result[0].message.contains("nonexistent.md"));
    }

    #[test]
    fn test_inline_code_spans() {
        let temp_dir = tempdir().unwrap();
        let base_path = temp_dir.path();

        // Links both outside and inside inline code spans
        let content = r#"
# Test Document

This is a normal link: [Link](missing.md)

This is a code span with a link: `[Link](another-missing.md)`

Some more text with `inline code [Link](yet-another-missing.md) embedded`.

    "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(base_path);
        let result = lint(&rule, content);

        // Only the normal link is reported
        assert_eq!(result.len(), 1, "Should have exactly one warning");
        assert!(
            result[0].message.contains("missing.md"),
            "Warning should be for missing.md"
        );

        let mentions = |needle: &str| result.iter().any(|w| w.message.contains(needle));
        assert!(
            !mentions("another-missing.md"),
            "Should not warn about link in code span"
        );
        assert!(
            !mentions("yet-another-missing.md"),
            "Should not warn about link in inline code"
        );
    }
}
616}