// cargo_deadlinks — lib.rs

1//! <https://tinyurl.com/rnxcavf>
2use std::{
3    fmt,
4    path::{Path, PathBuf},
5};
6
7use log::info;
8use rayon::prelude::*;
9use rayon::ThreadPoolBuilder;
10use url::Url;
11use walkdir::{DirEntry, WalkDir};
12
13use check::is_available;
14
15pub use check::{CheckError, IoError};
16
17mod check;
18mod parse;
19
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
/// What behavior should deadlinks use for HTTP links?
pub enum HttpCheck {
    /// Make an internet request to ensure the link works.
    Enabled,
    /// Do nothing when encountering a link.
    Ignored,
    /// Give an error when encountering a link.
    ///
    /// Note that even when HTTP links are forbidden, `doc.rust-lang.org` links are still assumed to
    /// be valid.
    Forbidden,
}
33
34// NOTE: this could be Copy, but we intentionally choose not to guarantee that.
35#[derive(Clone, Debug)]
36pub struct CheckContext {
37    pub verbose: bool,
38    pub check_http: HttpCheck,
39    pub check_fragments: bool,
40    pub check_intra_doc_links: bool,
41}
42
43impl Default for CheckContext {
44    fn default() -> Self {
45        CheckContext {
46            check_http: HttpCheck::Ignored,
47            verbose: false,
48            check_fragments: true,
49            check_intra_doc_links: false,
50        }
51    }
52}
53
54#[derive(Debug)]
55pub struct FileError {
56    pub path: PathBuf,
57    pub errors: Vec<CheckError>,
58}
59
60impl fmt::Display for FileError {
61    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
62        write!(f, "Found invalid urls in {}:", self.path.display())?;
63        for e in &self.errors {
64            write!(f, "\n\t{}", e)?;
65        }
66        Ok(())
67    }
68}
69
70/// Traverses a given path recursively, checking all *.html files found.
71///
72/// For each error that occurred, print an error message.
73/// Returns whether an error occurred.
74pub fn walk_dir(dir_path: &Path, ctx: &CheckContext) -> bool {
75    let pool = ThreadPoolBuilder::new()
76        .num_threads(num_cpus::get())
77        .build()
78        .unwrap();
79
80    pool.install(|| {
81        unavailable_urls(dir_path, ctx)
82            .map(|mut err| {
83                if !ctx.verbose {
84                    err.shorten_all(dir_path);
85                }
86                println!("{}", err);
87                true
88            })
89            // ||||||
90            .reduce(|| false, |initial, new| initial || new)
91    })
92}
93
94impl FileError {
95    fn shorten_all(&mut self, prefix: &Path) {
96        use check::Link;
97
98        if let Ok(shortened) = self.path.strip_prefix(&prefix) {
99            self.path = shortened.to_path_buf();
100        };
101        for mut e in &mut self.errors {
102            if let CheckError::File(epath) | CheckError::Fragment(Link::File(epath), _, _) = &mut e
103            {
104                if let Ok(shortened) = epath.strip_prefix(prefix) {
105                    *epath = shortened.to_path_buf();
106                }
107            }
108        }
109    }
110}
111
112fn is_html_file(entry: &DirEntry) -> bool {
113    match entry.path().extension() {
114        Some(e) => e.to_str().map(|ext| ext == "html").unwrap_or(false),
115        None => false,
116    }
117}
118
119pub fn unavailable_urls<'a>(
120    dir_path: &'a Path,
121    ctx: &'a CheckContext,
122) -> impl ParallelIterator<Item = FileError> + 'a {
123    let root_url = Url::from_directory_path(dir_path).unwrap();
124
125    WalkDir::new(dir_path)
126        .into_iter()
127        .par_bridge()
128        .filter_map(Result::ok)
129        .filter(|entry| entry.file_type().is_file() && is_html_file(entry))
130        .flat_map(move |entry| {
131            let path = entry.path();
132            info!("Checking doc page at {}", path.display());
133            let html = std::fs::read_to_string(path)
134                .unwrap_or_else(|e| panic!("{} did not contain valid UTF8: {}", path.display(), e));
135
136            let file_url = Url::from_file_path(path).unwrap();
137            let urls = parse::parse_a_hrefs(&html, &root_url, &file_url);
138            let broken_intra_doc_links = if ctx.check_intra_doc_links {
139                parse::broken_intra_doc_links(&html)
140            } else {
141                Vec::new()
142            };
143            let errors = urls
144                .into_iter()
145                .filter_map(|url| is_available(&url, ctx).err())
146                .chain(broken_intra_doc_links)
147                .collect::<Vec<_>>();
148
149            if errors.is_empty() {
150                None
151            } else {
152                let path = entry.path().to_owned();
153                Some(FileError { path, errors })
154            }
155        })
156}