broken_md_links/
lib.rs

//! A library and command-line tool for detecting broken links in Markdown files.
//!
//! By default, this tool detects broken links like "[foo](file.md)" (target file does not exist)
//! and broken header links like "[foo](file.md#header)" (target file exists but specific header does not exist).
//!
//! ## Command-line usage
//!
//! Check a single file:
//!
//! ```shell
//! broken-md-links input.md
//! ```
//!
//! Check a whole directory:
//!
//! ```shell
//! broken-md-links dir/
//! ```
//!
//! ### Output
//!
//! There are several levels of verbosity:
//!
//! * `-v silent`: display nothing (exit code will be 0 if there was no broken link)
//! * `-v errors`: display errors only
//! * `-v warn`: display errors and warnings (the default)
//! * `-v info`: display the list of analyzed files as well
//! * `-v verbose`: display detailed information
//! * `-v trace`: display debug information
30
31#![forbid(unsafe_code)]
32#![forbid(unused_must_use)]
33
34use colored::Colorize;
35use log::{debug, error, info, trace, warn};
36use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
37use regex::Regex;
38use std::collections::HashMap;
39use std::path::{Component, Path, PathBuf};
40use std::sync::LazyLock;
41
42static EMAIL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
43    Regex::new("\
44        (?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"\
45        (?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@\
46        (?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[\
47        (?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:\
48        (?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])"
49    ).unwrap()
50});
51
/// Lexically simplify a path and return it as a (lossy) string.
///
/// The filesystem is never touched, so symbolic links are not resolved:
///
/// * prefixes, root directories and normal components are kept as is;
/// * `.` components are dropped;
/// * `..` removes the previous component when that component is a normal one; otherwise it is
///   kept for relative paths (to preserve their meaning) and dropped for absolute ones;
/// * a non-empty path that collapses to nothing (e.g. `.`, `./` or `dir/..`) yields `"."`
///   instead of an empty string, so the result still designates the current directory and can
///   be handed to filesystem functions.
///
/// An empty input path yields an empty string.
fn simplify_path(path: &Path) -> String {
    // Components retained so far, in order
    let mut kept: Vec<Component> = Vec::new();

    for comp in path.components() {
        match comp {
            Component::Prefix(_) | Component::RootDir | Component::Normal(_) => kept.push(comp),

            // "." carries no information
            Component::CurDir => {}

            Component::ParentDir => {
                if matches!(kept.last(), Some(Component::Normal(_))) {
                    // "dir/.." cancels out
                    kept.pop();
                } else if path.is_relative() {
                    // Nothing left to cancel: keep the ".." so the relative path still climbs
                    kept.push(comp);
                }
                // For an absolute path, ".." right after the root stays at the root
            }
        }
    }

    // Everything cancelled out: the path designates the directory it is relative to
    if kept.is_empty() && !path.as_os_str().is_empty() {
        return String::from(".");
    }

    kept.iter()
        .collect::<PathBuf>()
        .to_string_lossy()
        .into_owned()
}
82
/// Turn the text of a Markdown header into its slug
///
/// Spaces become hyphens, ASCII letters are lowercased, ASCII digits, `-` and `_` are kept, and
/// every other character is dropped. The 'generate_slugs' function applies this to each header
/// of a Markdown file.
///
/// NOTE(review): non-ASCII letters are dropped as well (e.g. "Café" gives "caf") — confirm this
/// matches the anchors generated by the targeted Markdown renderer.
///
/// # Examples
///
/// ```
/// use broken_md_links::slugify;
///
/// assert_eq!(slugify("My super header"), "my-super-header");
/// assert_eq!(slugify("I love headers!"), "i-love-headers");
/// ```
pub fn slugify(header: &str) -> String {
    let mut slug = String::with_capacity(header.len());

    for ch in header.chars() {
        match ch {
            ' ' => slug.push('-'),
            '-' | '_' => slug.push(ch),
            _ if ch.is_ascii_alphanumeric() => slug.push(ch.to_ascii_lowercase()),
            // Punctuation, other whitespace and non-ASCII characters are left out
            _ => {}
        }
    }

    slug
}
102
103/// Get all headers of a Markdown file as slugs
104/// This function is used to check if the header specified in a link exists in the target file
105/// Returns an error message if the operation failed for any reason
106pub fn generate_slugs(path: &Path) -> Result<Vec<String>, String> {
107    // Get the canonicalized path for display
108    let canon = simplify_path(path);
109
110    debug!("Generating slugs for file: {}", canon);
111
112    // Read the input file
113    let content = std::fs::read_to_string(path)
114        .map_err(|err| format!("Failed to read file at '{}': {}", canon.green(), err))?;
115
116    trace!(
117        "In '{}': just read file, which is {} bytes long.",
118        canon,
119        content.len()
120    );
121
122    // The list of slugified headers
123    let mut headers = vec![];
124
125    // Counter of slugs for suffixes
126    let mut header_counts = HashMap::<String, usize>::new();
127
128    // When the 'pulldown_cmark' library encounters a heading, the actual title can be got between a Start() and an End() events
129    // This variable contains the pending title's content
130    let mut header: Option<String> = None;
131
132    // Create a pull-down markdown parser
133    let parser = Parser::new_ext(&content, Options::all());
134
135    for (event, range) in parser.into_offset_iter() {
136        macro_rules! format_msg {
137            ($($param: expr),*) => {{
138                // TODO: Optimize the computation of the line number
139                let line = content.chars().take(range.start).filter(|c| *c == '\n').count();
140                format!("In '{}', line {}: {}", canon.green(), (line + 1).to_string().bright_magenta(), format!($($param),*))
141            }}
142        }
143
144        // If the last event was an heading, we are now expecting to get its title
145        if let Some(ref mut header_str) = header {
146            match event {
147                // Event indicating the header is now complete
148                Event::End(TagEnd::Heading { .. }) => {
149                    // Get its slug
150                    let slug = slugify(header_str);
151                    debug!("{}", format_msg!("found header: #{}", slug));
152
153                    // Print a warning if the title is empty
154                    if header_str.trim().is_empty() {
155                        // We did not get a piece of text, which means this heading does not have a title
156                        warn!(
157                            "{}",
158                            format_msg!("heading was not directly followed by a title")
159                        );
160                        trace!("Faulty event: {:?}", event);
161                    }
162
163                    // Get the number of duplicates this slug has
164                    let duplicates = header_counts
165                        .entry(slug.clone())
166                        .and_modify(|d| *d += 1)
167                        .or_insert(0);
168
169                    // Add a suffix for duplicates
170                    if *duplicates > 0 {
171                        headers.push(format!("{}-{}", slug, duplicates));
172                    } else {
173                        headers.push(slug);
174                    }
175
176                    // Header is now complete
177                    header = None;
178                }
179
180                Event::Start(_)
181                | Event::End(_)
182                | Event::SoftBreak
183                | Event::HardBreak
184                | Event::Rule
185                | Event::TaskListMarker(_)
186                | Event::InlineMath(_)
187                | Event::DisplayMath(_)
188                | Event::InlineHtml(_) => {}
189
190                Event::Text(text)
191                | Event::Code(text)
192                | Event::Html(text)
193                | Event::FootnoteReference(text) => header_str.push_str(&text),
194            }
195        }
196        // If we encounted the beginning of a heading...
197        else if let Event::Start(Tag::Heading { .. }) = event {
198            // Expect to get the related title just after
199            header = Some(String::new())
200        }
201    }
202
203    // Everything went fine :D
204    Ok(headers)
205}
206
/// Broken links checker options
///
/// The default value has every flag turned off: header links are checked and links to
/// directories are accepted.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct CheckerOptions {
    /// When `true`, the `#header` part of a link is not verified: only the existence of the
    /// link's target path is checked.
    pub ignore_header_links: bool,

    /// When `true`, a link whose target exists but is not a file is reported as an error.
    pub disallow_dir_links: bool,
}
213
/// Checker error
///
/// Implements `Debug` so that a `Result<_, CheckerError>` can be unwrapped or printed with
/// `{:?}` by callers.
#[derive(Debug, Clone)]
pub enum CheckerError {
    /// The analysis itself could not be completed; the payload is the error message produced
    /// while reading a directory or a file.
    Io(String),

    /// The analysis completed and found the listed broken links.
    BrokenLinks(Vec<DetectedBrokenLink>),
}

/// Markdown file links cache
///
/// Maps the canonicalized path of a Markdown file to the slugs of its headers, so that a file
/// targeted by several header links only has its slugs generated once.
pub type FileLinksCache = HashMap<PathBuf, Vec<String>>;

/// Detected broken link
#[derive(Debug, Clone)]
pub struct DetectedBrokenLink {
    /// Path of the Markdown file containing the broken link
    pub file: PathBuf,

    /// Line of the link inside `file`, starting at 1
    pub line: usize,

    /// Human-readable description of the problem
    /// (NOTE(review): built with `colored`, so it presumably embeds terminal color codes)
    pub error: String,
}
229
230/// Check broken links in a Markdown file or directory
231///
232/// The input `path` will be checked recursively as a directory if `dir` is set to `true`, else as a single file.
233///
234/// By default, when a header points to a specific header (e.g. `other_file.md#some-header`), the target file will be opened and
235///  the function will check if it contains the said header. As this feature may slow down the whole process, it's possible to disable it by
236///  settings `ignore_header_links` to `true`.
237///
238/// In order to improve performances when looking at header-specific links, when a file's list of headers is made, it is stored inside a cache
239/// This cache is shared recursively through the `links_cache` argument. As it uses a specific format, it's recommanded to just pass a mutable
240///  reference to an empty HashMap to this function, and not build your own one which may cause detection problems.
241///
242/// The function returns an error is something goes wrong, or else the number of broken and invalid (without target) links.
243pub fn check_broken_links(
244    path: &Path,
245    options: CheckerOptions,
246    links_cache: &mut FileLinksCache,
247) -> Result<(), CheckerError> {
248    // Detect broken links
249    let errors = if path.is_dir() {
250        check_broken_links_in_dir(path, &options, links_cache).map_err(CheckerError::Io)?
251    } else {
252        check_file_broken_links(path, &options, links_cache).map_err(CheckerError::Io)?
253    };
254
255    if errors.is_empty() {
256        Ok(())
257    } else {
258        Err(CheckerError::BrokenLinks(errors))
259    }
260}
261
262pub fn check_broken_links_in_dir(
263    path: &Path,
264    options: &CheckerOptions,
265    links_cache: &mut FileLinksCache,
266) -> Result<Vec<DetectedBrokenLink>, String> {
267    // Get the canonicalized path for display
268    let canon = simplify_path(path);
269
270    debug!("Analyzing directory: {}", canon);
271
272    let dir_iter = path.read_dir().map_err(|err| {
273        format!(
274            "Failed to read input directory at '{}': {}",
275            canon.green(),
276            err
277        )
278    })?;
279
280    let mut errors = vec![];
281
282    for item in dir_iter {
283        let item = item.map_err(|err| {
284            format!(
285                "Failed to get item from directory at '{}': {}",
286                canon.green(),
287                err
288            )
289        })?;
290        let path = item.path();
291        let file_type = item.file_type().map_err(|err| {
292            format!(
293                "Failed to read file type of item at '{}': {}",
294                canon.green(),
295                err
296            )
297        })?;
298
299        if file_type.is_dir() {
300            // Check broken links recursively
301            errors.append(&mut check_broken_links_in_dir(&path, options, links_cache)?);
302        } else if file_type.is_file() {
303            // Only check ".md" files
304            if let Some(ext) = path.extension() {
305                if let Some(ext) = ext.to_str() {
306                    if ext.to_ascii_lowercase() == "md" {
307                        // Check this Markdown file
308                        errors.append(&mut check_file_broken_links(&path, options, links_cache)?);
309                    }
310                }
311            }
312        } else {
313            warn!(
314                "Item at path '{}' is neither a file nor a directory so it will be ignored",
315                canon
316            );
317        }
318    }
319
320    Ok(errors)
321}
322
323pub fn check_file_broken_links(
324    path: &Path,
325    options: &CheckerOptions,
326    links_cache: &mut FileLinksCache,
327) -> Result<Vec<DetectedBrokenLink>, String> {
328    // Get the canonicalized path for display
329    let canon = simplify_path(path);
330
331    info!("Analyzing: {}", canon);
332
333    let CheckerOptions {
334        ignore_header_links,
335        disallow_dir_links,
336    } = &options;
337
338    let mut errors = vec![];
339
340    let content = std::fs::read_to_string(path)
341        .map_err(|err| format!("Failed to read file at '{}': {}", canon.green(), err))?;
342
343    trace!(
344        "In '{}': just read file, which is {} bytes long.",
345        canon,
346        content.len()
347    );
348
349    // Count links without a target (like `[link name]`) as an error
350    let mut handle_broken_links = |link: BrokenLink| {
351        error!(
352            "In '{}': Missing target for link '{}'",
353            canon.green(),
354            link.reference.yellow()
355        );
356
357        None
358    };
359
360    // Create a pull-down parser
361    let parser = Parser::new_with_broken_link_callback(
362        &content,
363        Options::all(),
364        Some(&mut handle_broken_links),
365    );
366
367    for (event, range) in parser.into_offset_iter() {
368        macro_rules! make_err {
369                ($($param: expr),*) => {{
370                    // TODO: Optimize the computation of the line number
371                    let line = content.chars().take(range.start).filter(|c| *c == '\n').count();
372                    DetectedBrokenLink { file: path.to_path_buf(), line: line + 1, error: format!($($param),*) }
373                }}
374            }
375
376        // Check inline links only (not URLs or e-mail addresses in autolinks for instance)
377        if let Event::Start(Tag::Link {
378            link_type: LinkType::Inline,
379            dest_url,
380            title: _,
381            id: _,
382        }) = event
383        {
384            // Get the link's target file and optionally its header
385            let (target, header): (String, Option<String>) =
386                match dest_url.chars().position(|c| c == '#') {
387                    Some(index) => (
388                        dest_url.chars().take(index).collect(),
389                        Some(dest_url.chars().skip(index + 1).collect()),
390                    ),
391                    None => (dest_url.into_string(), None),
392                };
393
394            // Don't care about URLs
395            if target.starts_with("http://")
396                || target.starts_with("https://")
397                || target.starts_with("ftp://")
398            {
399                trace!("found link to URL: {target}");
400                continue;
401            }
402
403            if EMAIL_REGEX.is_match(&target) {
404                trace!("found link to e-mail addres: {target}");
405                continue;
406            }
407
408            let target = if !target.is_empty() {
409                path.parent().unwrap().join(Path::new(&target))
410            } else {
411                path.to_owned()
412            };
413
414            let target_canon = simplify_path(&target);
415
416            match std::fs::canonicalize(&target_canon) {
417                Ok(path) => {
418                    if *disallow_dir_links && !path.is_file() {
419                        errors.push(make_err!("invalid link found: path '{}' is a directory but only file links are allowed", target_canon.blue()));
420                        continue;
421                    }
422                }
423
424                Err(_) => {
425                    errors.push(make_err!(
426                        "broken link found: path '{}' does not exist",
427                        target_canon.green()
428                    ));
429                    continue;
430                }
431            }
432
433            trace!("valid link found: {}", target_canon);
434
435            // If header links must be checked...
436            if !ignore_header_links {
437                // If the link points to a specific header...
438                if let Some(header) = header {
439                    // Then the target must be a file
440                    if !target.is_file() {
441                        errors.push(make_err!(
442                            "invalid header link found: path '{}' exists but is not a file",
443                            target_canon.green()
444                        ));
445                    } else {
446                        debug!(
447                            "now checking link '{}' from file '{}'",
448                            header, target_canon
449                        );
450
451                        // Canonicalize properly the target path to avoid irregularities in cache's keys
452                        //  like 'dir/../file.md' and 'file.md' which are identical but do not have the same Path representation
453                        let unified_target = target.canonicalize().unwrap();
454
455                        // If the target file is not already in cache...
456                        if !links_cache.contains_key(&unified_target) {
457                            // 2. Push all slugs in the cache
458                            links_cache.insert(
459                                unified_target.clone(),
460                                // 1. Get all its headers as slugs
461                                // We do not use the fully canonicalized path to not force displaying an absolute path
462                                generate_slugs(&target).map_err(|err| {
463                                    format!(
464                                        "failed to generate slugs for file '{}': {}",
465                                        target_canon.green(),
466                                        err
467                                    )
468                                })?,
469                            );
470                        }
471
472                        // Get the file's slugs from the cache
473                        let slugs = links_cache.get(&unified_target).unwrap();
474
475                        // Ensure the link points to an existing header
476                        if !slugs.contains(&header) {
477                            errors.push(make_err!(
478                                "broken link found: header '{}' not found in '{}'",
479                                header.yellow(),
480                                target_canon.green()
481                            ));
482                        } else {
483                            trace!("valid header link found: {}", header);
484                        }
485                    }
486                }
487            }
488        }
489    }
490
491    Ok(errors)
492}