waybackmachine_client/
archivableurl.rs

1use crate::Error;
2use std::fmt;
3use url::{Host, Url};
4
5#[derive(Clone)]
6/// Validator for archivable URLs
7pub struct ArchivableUrl {
8    pub url: Url,
9}
10
/// List of domains that block wayback requests.
///
/// URLs whose host matches one of these entries are rejected with
/// `Error::ExcludedUrl` instead of being archived. The list covers the
/// archive itself plus sites known to refuse Wayback Machine requests.
const EXCLUDED_DOMAINS: &[&str] = &[
    "archive.org",
    "jstor.org",
    "diw.de",
    "youtube.com",
    "plato.stanford.edu",
    "muse.jhu.edu",
    "read.dukeupress.edu",
    "academic.oup.com",
    "onlinelibrary.wiley.com",
    "genius.com",
    "taylorfrancis.com",
    "tandfonline.com",
    "iwaponline.com",
    "link.springer.com",
    "journals.sagepub.com",
    "journals.openedition.org",
    "sciencedirect.com",
    "annualreviews.org",
    "mit.edu",
    "ucpress.edu",
];
34
35impl ArchivableUrl {
36    /// Parses and validates the URL for archiving
37    pub fn parse(url: &str) -> Result<Self, Error> {
38        let parsed_url = Url::parse(url).map_err(|_| Error::InvalidUrl(url.to_string()))?;
39        let archivable_url = Self { url: parsed_url };
40        archivable_url.validate_url()
41    }
42
43    /// Validates the URL for archiving
44    fn validate_url(self) -> Result<Self, Error> {
45        let host = match self.url.host() {
46            Some(host) => host,
47            None => return Err(Error::InvalidUrl(self.url.to_string())),
48        };
49
50        // Check if the host is excluded
51        match host {
52            Host::Domain(domain) => {
53                if domain.contains("localhost") {
54                    return Err(Error::InvalidUrl(self.url.to_string()));
55                }
56
57                for &pattern in EXCLUDED_DOMAINS {
58                    if domain.contains(pattern) {
59                        return Err(Error::ExcludedUrl(self.url.to_string()));
60                    }
61                }
62            }
63            Host::Ipv4(ipv4)
64                if ipv4.is_loopback()
65                    || ipv4.is_private()
66                    || ipv4.is_multicast()
67                    || ipv4.is_unspecified() =>
68            {
69                return Err(Error::InvalidUrl(self.url.to_string()));
70            }
71            Host::Ipv6(ipv6) if ipv6.is_loopback() || ipv6.is_multicast() => {
72                return Err(Error::InvalidUrl(self.url.to_string()));
73            }
74            _ => {}
75        }
76
77        // Check for non-HTTP(S) protocols
78        if !["http", "https"].contains(&self.url.scheme()) {
79            return Err(Error::InvalidUrl(self.url.to_string()));
80        }
81
82        // If none of the filters matched, the URL is valid for archiving
83        Ok(self)
84    }
85
86    /// Returns the URL as a string
87    pub fn as_str(&self) -> &str {
88        self.url.as_str()
89    }
90}
91
92impl fmt::Display for ArchivableUrl {
93    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
94        write!(f, "{}", self.url)
95    }
96}
97
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `url` parses successfully and round-trips unchanged.
    fn expect_archivable(url: &str) {
        let parsed = ArchivableUrl::parse(url).expect("URL should be archivable");
        assert_eq!(parsed.as_str(), url);
    }

    /// Asserts that `url` is rejected with `Error::InvalidUrl`.
    fn expect_invalid(url: &str) {
        let result = ArchivableUrl::parse(url);
        assert!(result.is_err());
        assert_eq!(result.err(), Some(Error::InvalidUrl(url.to_string())));
    }

    /// Asserts that `url` is rejected with `Error::ExcludedUrl`.
    fn expect_excluded(url: &str) {
        let result = ArchivableUrl::parse(url);
        assert!(result.is_err());
        assert_eq!(result.err(), Some(Error::ExcludedUrl(url.to_string())));
    }

    #[test]
    fn valid_http_url() {
        expect_archivable("http://example.com/");
    }

    #[test]
    fn valid_https_url() {
        expect_archivable("https://example.com/");
    }

    #[test]
    fn invalid_url() {
        expect_invalid("invalid-url");
    }

    #[test]
    fn invalid_scheme() {
        expect_invalid("ftp://example.com/");
    }

    #[test]
    fn localhost_url() {
        expect_invalid("http://localhost/");
    }

    #[test]
    fn private_ip_url() {
        expect_invalid("http://192.168.1.1/");
    }

    #[test]
    fn reserved_ip_url() {
        expect_invalid("http://0.0.0.0/");
    }

    #[test]
    fn special_localhost_alias_url() {
        expect_invalid("http://localhost.localdomain/");
    }

    #[test]
    fn wayback_url() {
        expect_excluded("https://archive.org/some-book");
    }

    #[test]
    fn jstor_url() {
        expect_excluded("https://jstor.org/some-book");
    }

    #[test]
    fn excluded_domains() {
        // Every entry in the exclusion list must be rejected outright.
        for &domain in EXCLUDED_DOMAINS {
            expect_excluded(&format!("https://{}/some-path", domain));
        }
    }
}