waybackmachine_client/
archivableurl.rs1use crate::Error;
2use std::fmt;
3use url::{Host, Url};
4
5#[derive(Clone)]
6pub struct ArchivableUrl {
8 pub url: Url,
9}
10
/// Domain names that URL validation refuses with `Error::ExcludedUrl`.
///
/// Entries are kept in alphabetical order to make duplicates and omissions
/// easy to spot; the order has no effect on validation because every match
/// produces the same error.
const EXCLUDED_DOMAINS: &[&str] = &[
    "academic.oup.com",
    "annualreviews.org",
    "archive.org",
    "diw.de",
    "genius.com",
    "iwaponline.com",
    "journals.openedition.org",
    "journals.sagepub.com",
    "jstor.org",
    "link.springer.com",
    "mit.edu",
    "muse.jhu.edu",
    "onlinelibrary.wiley.com",
    "plato.stanford.edu",
    "read.dukeupress.edu",
    "sciencedirect.com",
    "tandfonline.com",
    "taylorfrancis.com",
    "ucpress.edu",
    "youtube.com",
];
34
35impl ArchivableUrl {
36 pub fn parse(url: &str) -> Result<Self, Error> {
38 let parsed_url = Url::parse(url).map_err(|_| Error::InvalidUrl(url.to_string()))?;
39 let archivable_url = Self { url: parsed_url };
40 archivable_url.validate_url()
41 }
42
43 fn validate_url(self) -> Result<Self, Error> {
45 let host = match self.url.host() {
46 Some(host) => host,
47 None => return Err(Error::InvalidUrl(self.url.to_string())),
48 };
49
50 match host {
52 Host::Domain(domain) => {
53 if domain.contains("localhost") {
54 return Err(Error::InvalidUrl(self.url.to_string()));
55 }
56
57 for &pattern in EXCLUDED_DOMAINS {
58 if domain.contains(pattern) {
59 return Err(Error::ExcludedUrl(self.url.to_string()));
60 }
61 }
62 }
63 Host::Ipv4(ipv4)
64 if ipv4.is_loopback()
65 || ipv4.is_private()
66 || ipv4.is_multicast()
67 || ipv4.is_unspecified() =>
68 {
69 return Err(Error::InvalidUrl(self.url.to_string()));
70 }
71 Host::Ipv6(ipv6) if ipv6.is_loopback() || ipv6.is_multicast() => {
72 return Err(Error::InvalidUrl(self.url.to_string()));
73 }
74 _ => {}
75 }
76
77 if !["http", "https"].contains(&self.url.scheme()) {
79 return Err(Error::InvalidUrl(self.url.to_string()));
80 }
81
82 Ok(self)
84 }
85
86 pub fn as_str(&self) -> &str {
88 self.url.as_str()
89 }
90}
91
92impl fmt::Display for ArchivableUrl {
93 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
94 write!(f, "{}", self.url)
95 }
96}
97
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `url` is accepted and comes back unchanged from `as_str`.
    #[track_caller]
    fn assert_accepted(url: &str) {
        let outcome = ArchivableUrl::parse(url);
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap().as_str(), url);
    }

    /// Asserts that `url` is rejected with `Error::InvalidUrl` carrying `url`.
    #[track_caller]
    fn assert_invalid(url: &str) {
        let outcome = ArchivableUrl::parse(url);
        assert!(outcome.is_err());
        assert_eq!(outcome.err(), Some(Error::InvalidUrl(url.to_string())));
    }

    /// Asserts that `url` is rejected with `Error::ExcludedUrl` carrying `url`.
    #[track_caller]
    fn assert_excluded(url: &str) {
        let outcome = ArchivableUrl::parse(url);
        assert!(outcome.is_err());
        assert_eq!(outcome.err(), Some(Error::ExcludedUrl(url.to_string())));
    }

    #[test]
    fn valid_http_url() {
        assert_accepted("http://example.com/");
    }

    #[test]
    fn valid_https_url() {
        assert_accepted("https://example.com/");
    }

    #[test]
    fn invalid_url() {
        assert_invalid("invalid-url");
    }

    #[test]
    fn invalid_scheme() {
        assert_invalid("ftp://example.com/");
    }

    #[test]
    fn localhost_url() {
        assert_invalid("http://localhost/");
    }

    #[test]
    fn private_ip_url() {
        assert_invalid("http://192.168.1.1/");
    }

    #[test]
    fn reserved_ip_url() {
        assert_invalid("http://0.0.0.0/");
    }

    #[test]
    fn special_localhost_alias_url() {
        assert_invalid("http://localhost.localdomain/");
    }

    #[test]
    fn wayback_url() {
        assert_excluded("https://archive.org/some-book");
    }

    #[test]
    fn jstor_url() {
        assert_excluded("https://jstor.org/some-book");
    }

    #[test]
    fn excluded_domains() {
        // Every listed domain must be rejected when used as the exact host.
        EXCLUDED_DOMAINS
            .iter()
            .map(|domain| format!("https://{}/some-path", domain))
            .for_each(|url| assert_excluded(&url));
    }
}