cargo_deadlinks/
check.rs

//! Provides functionality for checking the availability of URLs.
2use std::collections::HashSet;
3use std::fmt;
4use std::fs::read_to_string;
5use std::path::{Path, PathBuf};
6
7use log::{debug, warn};
8use once_cell::sync::Lazy;
9use regex::Regex;
10use url::Url;
11
12use cached::cached_key_result;
13use cached::SizedCache;
14
15use super::CheckContext;
16
17use crate::{
18    parse::{parse_fragments, parse_redirect},
19    HttpCheck,
20};
21
/// URL prefixes that are never fetched: `check_http_url` logs a warning and
/// reports success for any HTTP(S) URL starting with one of these.
const PREFIX_BLACKLIST: [&str; 1] = ["https://doc.rust-lang.org"];
23
/// Low-level failures hit while fetching a link target, as opposed to the
/// target simply not existing.
#[derive(Debug)]
pub enum IoError {
    /// The server answered with a status that `ureq` reports as an error.
    HttpUnexpectedStatus(ureq::Response),
    /// The HTTP request failed at the transport level.
    HttpFetch(ureq::Transport),
    /// Reading the named location (first field) failed with the given I/O error.
    FileIo(String, std::io::Error),
}
30
31impl fmt::Display for IoError {
32    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
33        match self {
34            IoError::HttpUnexpectedStatus(resp) => write!(
35                f,
36                "Unexpected HTTP status fetching {}: {}",
37                resp.get_url(),
38                resp.status_text()
39            ),
40            IoError::HttpFetch(e) => write!(f, "Error fetching {}", e),
41            IoError::FileIo(url, e) => write!(f, "Error fetching {}: {}", url, e),
42        }
43    }
44}
45
/// The target of a link that is being checked.
#[derive(Debug, Clone)]
pub enum Link {
    /// A target on the local filesystem.
    File(PathBuf),
    /// A target identified by a URL.
    Http(Url),
}
51
52impl fmt::Display for Link {
53    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
54        match self {
55            Link::File(path) => write!(f, "{}", path.display()),
56            Link::Http(url) => f.write_str(url.as_str()),
57        }
58    }
59}
60
61impl Link {
62    /// Removes the fragment
63    fn without_fragment(&self) -> Link {
64        match self {
65            Link::Http(url) => {
66                let mut url = url.clone();
67                url.set_fragment(None);
68
69                Link::Http(url)
70            }
71            _ => self.clone(),
72        }
73    }
74}
75
/// Reasons a link check can fail.
#[derive(Debug)]
pub enum CheckError {
    /// An intra-doc link went unresolved by rustdoc and ended up in the final HTML
    IntraDocLink(String),
    /// A relatively linked file did not exist
    File(PathBuf),
    /// A linked HTTP URL did not exist
    Http(Url),
    /// An HTTP URL was encountered, but HTTP checking was forbidden
    HttpForbidden(Url),
    /// The linked file existed, but was missing the linked HTML anchor.
    ///
    /// Fields: the link, the requested fragment, and (for ranged `n-m`
    /// fragments only) the individual ids that were not found.
    Fragment(Link, String, Option<Vec<String>>),
    /// An error occurred while trying to find whether the file or URL existed
    Io(Box<IoError>),
}
91
92impl From<ureq::Error> for CheckError {
93    fn from(err: ureq::Error) -> Self {
94        let io_err = match err {
95            ureq::Error::Status(_, response) => IoError::HttpUnexpectedStatus(response),
96            ureq::Error::Transport(err) => IoError::HttpFetch(err),
97        };
98        CheckError::Io(Box::new(io_err))
99    }
100}
101
102impl fmt::Display for CheckError {
103    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
104        match self {
105            CheckError::IntraDocLink(text) => {
106                write!(f, "Broken intra-doc link to {}!", text)
107            }
108            CheckError::File(path) => {
109                write!(f, "Linked file at path {} does not exist!", path.display())
110            }
111            CheckError::Http(url) => write!(f, "Linked URL {} does not exist!", url),
112            CheckError::HttpForbidden(url) => write!(
113                f,
114                "Found HTTP link {}, but HTTP checking is forbidden!",
115                url
116            ),
117            CheckError::Fragment(link, fragment, missing_parts) => match missing_parts {
118                Some(missing_parts) => write!(
119                    f,
120                    "Fragments #{} as expected by ranged fragment #{} at {} do not exist!\n\
121                    This is likely a bug in rustdoc itself.",
122                    missing_parts.join(", #"),
123                    fragment,
124                    link
125                ),
126                None => write!(f, "Fragment #{} at {} does not exist!", fragment, link),
127            },
128            CheckError::Io(err) => err.fmt(f),
129        }
130    }
131}
132
133/// Check a single URL for availability. Returns `false` if it is unavailable.
134pub fn is_available(url: &Url, ctx: &CheckContext) -> Result<(), CheckError> {
135    match url.scheme() {
136        "file" => check_file_url(url, ctx),
137        "http" | "https" => check_http_url(url, ctx),
138        scheme @ "javascript" => {
139            debug!("Not checking URL scheme {:?}", scheme);
140            Ok(())
141        }
142        other => {
143            debug!("Unrecognized URL scheme {:?}", other);
144            Ok(())
145        }
146    }
147}
148
// Defines `fragments_from`, which returns the set of fragments that
// `parse_fragments` finds in the HTML produced by `fetch_html`.
//
// Results are memoized in the `CHECK_FILE` cache (created with size 100),
// keyed by the link with its fragment removed, so several fragments of the
// same document share one entry.
// NOTE(review): presumably only `Ok` results are cached, as the macro name
// suggests — confirm against the `cached` crate documentation.
cached_key_result! {
    CHECK_FILE: SizedCache<String, HashSet<String>> = SizedCache::with_size(100);
    Key = { link.without_fragment().to_string() };
    // `fetch_html` is different depending on whether the link is being
    // loaded from disk or from the network.
    fn fragments_from(
        link: &Link,
        fetch_html: impl Fn() -> Result<String, CheckError>
    ) -> Result<HashSet<String>, CheckError> = {
        fetch_html().map(|html| parse_fragments(&html))
    }
}
161
162fn is_fragment_available(
163    link: &Link,
164    fragment: &str,
165    fetch_html: impl Fn() -> Result<String, CheckError>,
166) -> Result<(), CheckError> {
167    // Empty fragments (e.g. file.html#) are commonly used to reach the top
168    // of the document, see https://html.spec.whatwg.org/multipage/browsing-the-web.html#scroll-to-fragid
169    if fragment.is_empty() {
170        return Ok(());
171    }
172
173    let fragments = fragments_from(link, fetch_html)?;
174
175    if fragments.contains(fragment) {
176        return Ok(());
177    }
178
179    // Try again with percent-decoding.
180    // NOTE: This isn't done unconditionally because it's possible the fragment it's linking to was also percent-encoded.
181    match percent_encoding::percent_decode(fragment.as_bytes()).decode_utf8() {
182        Ok(cow) => {
183            if fragments.contains(&*cow) {
184                return Ok(());
185            }
186        }
187        // If this was invalid UTF8 after percent-decoding, it can't be in the file (since we have a `String`, not opaque bytes).
188        // Assume it wasn't meant to be url-encoded.
189        Err(err) => warn!("{} url-decoded to invalid UTF8: {}", fragment, err),
190    }
191
192    // Rust documentation uses `#n-m` fragments and JavaScript to highlight
193    // a range of lines in HTML of source code, an element with `id`
194    // attribute of (literal) "#n-m" will not exist, but elements with
195    // `id`s n through m should, this parses the ranged n-m anchor and
196    // checks if elements with `id`s n through m do exist
197    static RUST_LINE_HIGLIGHT_RX: Lazy<Regex> =
198        Lazy::new(|| Regex::new(r#"^(?P<start>[0-9]+)-(?P<end>[0-9]+)$"#).unwrap());
199    match RUST_LINE_HIGLIGHT_RX.captures(fragment) {
200        Some(capture) => match (capture.name("start"), capture.name("end")) {
201            (Some(start_str), Some(end_str)) => {
202                // NOTE: assumes there are less than 2.pow(32) lines in a source file
203                let start = start_str.as_str().parse::<i32>().unwrap();
204                let end = end_str.as_str().parse::<i32>().unwrap();
205                let missing = (start..=end)
206                    .map(|i| i.to_string())
207                    .filter(|i| !fragments.contains(i))
208                    .collect::<Vec<String>>();
209                if !missing.is_empty() {
210                    Err(CheckError::Fragment(
211                        link.clone(),
212                        fragment.to_string(),
213                        Some(missing),
214                    ))
215                } else {
216                    Ok(())
217                }
218            }
219            _ => unreachable!("if the regex matches, it should have capture groups"),
220        },
221        None => Err(CheckError::Fragment(
222            link.clone(),
223            fragment.to_string(),
224            None,
225        )),
226    }
227}
228
229/// Check a URL with the "file" scheme for availability. Returns `false` if it is unavailable.
230fn check_file_url(url: &Url, ctx: &CheckContext) -> Result<(), CheckError> {
231    let path = url.to_file_path().unwrap();
232
233    // determine the full path by looking if the path points to a directory,
234    // and if so append `index.html`, this is needed as we'll try to read
235    // the file, so `expanded_path` should point to a file not a directory
236    let index_html;
237    let expanded_path = if path.is_file() {
238        &path
239    } else if path.is_dir() && path.join("index.html").is_file() {
240        index_html = path.join("index.html");
241        &index_html
242    } else {
243        debug!("Linked file at path {} does not exist!", path.display());
244        return Err(CheckError::File(path));
245    };
246
247    if !ctx.check_fragments {
248        return Ok(());
249    }
250
251    // The URL might contain a fragment. In that case we need a full GET
252    // request to check if the fragment exists.
253    match url.fragment() {
254        Some(fragment) => check_file_fragment(&path, expanded_path, fragment),
255        None => Ok(()),
256    }
257}
258
259fn check_file_fragment(
260    path: &Path,
261    expanded_path: &Path,
262    fragment: &str,
263) -> Result<(), CheckError> {
264    debug!(
265        "Checking fragment {} of file {}.",
266        fragment,
267        expanded_path.display()
268    );
269
270    fn get_html(expanded_path: &Path) -> Result<String, CheckError> {
271        read_to_string(expanded_path).map_err(|err| {
272            CheckError::Io(Box::new(IoError::FileIo(
273                expanded_path.to_string_lossy().to_string(),
274                err,
275            )))
276        })
277    }
278
279    let fetch_html = || {
280        let html = get_html(expanded_path)?;
281        if let Some(redirect) = parse_redirect(&html) {
282            get_html(&expanded_path.parent().unwrap().join(redirect))
283        } else {
284            Ok(html)
285        }
286    };
287    is_fragment_available(&Link::File(path.to_path_buf()), fragment, fetch_html)
288}
289
290/// Check a URL with "http" or "https" scheme for availability. Returns `Err` if it is unavailable.
291fn check_http_url(url: &Url, ctx: &CheckContext) -> Result<(), CheckError> {
292    if ctx.check_http == HttpCheck::Ignored {
293        warn!(
294            "Skip checking {} as checking of http URLs is turned off",
295            url
296        );
297        return Ok(());
298    }
299
300    for blacklisted_prefix in PREFIX_BLACKLIST.iter() {
301        if url.as_str().starts_with(blacklisted_prefix) {
302            warn!(
303                "Skip checking {} as URL prefix is on the builtin blacklist",
304                url
305            );
306            return Ok(());
307        }
308    }
309
310    if ctx.check_http == HttpCheck::Forbidden {
311        return Err(CheckError::HttpForbidden(url.clone()));
312    }
313
314    // The URL might contain a fragment. In that case we need a full GET
315    // request to check if the fragment exists.
316    if url.fragment().is_none() || !ctx.check_fragments {
317        match ureq::head(url.as_str()).call() {
318            Err(ureq::Error::Status(405, _)) => {
319                // If HEAD isn't allowed, try sending a GET instead
320                ureq::get(url.as_str()).call()?;
321                Ok(())
322            }
323            Err(other) => Err(other.into()),
324            Ok(_) => Ok(()),
325        }
326    } else {
327        // the URL might contain a fragment, in that case we need to check if
328        // the fragment exists, this issues a GET request
329        check_http_fragment(url, url.fragment().unwrap())
330    }
331}
332
333fn check_http_fragment(url: &Url, fragment: &str) -> Result<(), CheckError> {
334    debug!("Checking fragment {} of URL {}.", fragment, url.as_str());
335
336    fn get_html(url: &Url) -> Result<String, CheckError> {
337        let resp = ureq::get(url.as_str()).call()?;
338        Ok(resp.into_string().unwrap())
339    }
340
341    let fetch_html = || {
342        let html = get_html(url)?;
343        // NOTE: only handles one level of nesting. Maybe we should have multiple levels?
344        let redirect = parse_redirect(&html).and_then(|s| {
345            Url::parse(&s)
346                .map_err(|err| {
347                    warn!("failed to parse Rustdoc redirect: {}", err);
348                })
349                .ok()
350        });
351        if let Some(redirect) = redirect {
352            get_html(&redirect)
353        } else {
354            Ok(html)
355        }
356    };
357
358    is_fragment_available(&Link::Http(url.clone()), fragment, fetch_html)?;
359    Ok(())
360}
361
#[cfg(test)]
mod test {
    use crate::HttpCheck;

    use super::{check_file_url, is_available, CheckContext, CheckError, Link};
    use mockito::{self, mock};
    use std::env;
    use url::Url;

    /// Builds a `file://` URL for `path`, taken relative to the current
    /// working directory. An optional `#fragment` suffix becomes the URL's
    /// fragment; a trailing `/` marks the path as a directory.
    fn url_for(path: &str) -> Url {
        let cwd = env::current_dir().unwrap();
        let mut parts = path.split('#');
        let file_path = parts.next().unwrap();

        let mut url = if file_path.ends_with('/') {
            Url::from_directory_path(cwd.join(file_path))
        } else {
            Url::from_file_path(cwd.join(file_path))
        }
        .unwrap();

        url.set_fragment(parts.next());
        assert_eq!(parts.count(), 0); // make sure the anchor was valid, not `a.html#x#y`

        url
    }

    /// Runs `check_file_url` on `path` with the default context.
    fn test_check_file_url(path: &str) -> Result<(), CheckError> {
        check_file_url(&url_for(path), &CheckContext::default())
    }

    /// Default context with HTTP checking switched on.
    fn http_ctx() -> CheckContext {
        CheckContext {
            check_http: HttpCheck::Enabled,
            ..CheckContext::default()
        }
    }

    /// URL of `path` (which may include a `#fragment`) on the mock server.
    fn mock_url(path: &str) -> Url {
        Url::parse(&format!("{}{}", mockito::server_url(), path)).unwrap()
    }

    #[test]
    fn test_file_path() {
        test_check_file_url("tests/html/index.html").unwrap();
    }

    #[test]
    fn test_directory_path() {
        test_check_file_url("tests/html/").unwrap();
    }

    #[test]
    fn test_anchors() {
        test_check_file_url("tests/html/anchors.html#h1").unwrap();
    }

    #[test]
    fn test_hash_fragment() {
        test_check_file_url("tests/html/anchors.html#").unwrap();
    }

    #[test]
    fn test_missing_anchors() {
        match test_check_file_url("tests/html/anchors.html#nonexistent") {
            Err(CheckError::Fragment(Link::File(path), fragment, None)) => {
                assert!(path.ends_with("tests/html/anchors.html"));
                assert_eq!("nonexistent", fragment);
            }
            x => panic!(
                "Expected to report missing anchor (Err(CheckError::Fragment)), got {:?}",
                x
            ),
        }
    }

    #[test]
    fn test_range_anchor() {
        test_check_file_url("tests/html/range.html#2-4").unwrap();
    }

    #[test]
    fn test_missing_range_anchor() {
        match test_check_file_url("tests/html/range.html#4-6") {
            Err(CheckError::Fragment(Link::File(path), fragment, Some(missing_parts))) => {
                assert!(path.ends_with("tests/html/range.html"));
                assert_eq!("4-6", fragment);
                assert_eq!(missing_parts.len(), 1);
                assert!(missing_parts.contains(&"6".to_string()));
            }
            x => panic!(
                "Expected to report missing ranged anchor (Err(CheckError::Fragment)), got {:?}",
                x
            ),
        }
    }

    #[test]
    fn test_is_available_file_path() {
        is_available(
            &url_for("tests/html/index.html#i1"),
            &CheckContext::default(),
        )
        .unwrap();
    }

    #[test]
    fn test_is_available_directory_path() {
        is_available(&url_for("tests/html/#i1"), &CheckContext::default()).unwrap();
    }

    #[test]
    fn test_missing_dir_index_fragment() {
        match is_available(
            &url_for("tests/html/missing_index/#i1"),
            &CheckContext::default(),
        ) {
            Err(CheckError::File(path)) => assert!(path.ends_with("tests/html/missing_index")),
            x => panic!(
                "Expected to report missing file (Err(CheckError::File)), got {:?}",
                x
            ),
        }
    }

    #[test]
    fn test_http_check() {
        let root = mock("HEAD", "/test_http_check").with_status(200).create();

        is_available(&mock_url("/test_http_check"), &http_ctx()).unwrap();

        root.assert();
    }

    #[test]
    fn test_http_check_fragment() {
        let root = mock("GET", "/test_http_check_fragment")
            .with_status(200)
            .with_header("content-type", "text/html")
            .with_body(
                r#"<!DOCTYPE html>
            <html>
                <body id="r1" />
            </html>"#,
            )
            .create();

        is_available(&mock_url("/test_http_check_fragment#r1"), &http_ctx()).unwrap();

        root.assert();
    }

    #[test]
    fn test_missing_http_fragment() {
        let root = mock("GET", "/test_missing_http_fragment")
            .with_status(200)
            .with_header("content-type", "text/html")
            .with_body(
                r#"<!DOCTYPE html>
            <html />"#,
            )
            .create();

        let requested = mock_url("/test_missing_http_fragment#missing");

        match is_available(&requested, &http_ctx()) {
            Err(CheckError::Fragment(Link::Http(reported), fragment, None)) => {
                // Compare against the URL actually requested instead of a
                // hard-coded mock server address.
                assert_eq!(requested, reported);
                assert_eq!("missing", fragment);
            }
            x => panic!(
                "Expected to report missing anchor (Err(CheckError::Fragment)), got {:?}",
                x
            ),
        }

        root.assert();
    }

    #[test]
    fn test_disabling_fragment_checks_file() {
        check_file_url(
            &url_for("tests/html/anchors.html#nonexistent"),
            &CheckContext {
                check_fragments: false,
                ..CheckContext::default()
            },
        )
        .unwrap();
    }

    #[test]
    fn test_disabling_fragment_checks_http() {
        // With fragment checks off only a HEAD request is expected, even
        // though the URL carries a fragment.
        let root = mock("HEAD", "/test_disabling_fragment_checks_http")
            .with_status(200)
            .create();

        is_available(
            &mock_url("/test_disabling_fragment_checks_http#missing"),
            &CheckContext {
                check_fragments: false,
                ..http_ctx()
            },
        )
        .unwrap();

        root.assert();
    }
}