url_hash/lib.rs
1/*!
2This crate provides three types that represent hash values specifically for the [`Url`] types.
3
4For some URL-centric structures such as RDF graphs and XML documents, there becomes a core requirement to manage
5hash-like operations to compare URL values or to detect the presence of a URL in a cache. While Rust's built-in hash
6implementation, and by extension collections such as `HashMap` and `HashSet`, may be used they provide a closed
7implementation that cannot be used in a language-portable, or persistent manner without effort. This
8
9The purpose of the type [`UrlHash`] is to provide a stable value that represents a stable cryptographic hash of a single
10URL value that can be replicated across different platforms, and programming environments.
11
12# Example
13
14```rust
15use url::Url;
16use url_hash::UrlHash;
17
18let url = Url::parse("https://doc.rust-lang.org/").unwrap();
19let hash = UrlHash::from(url);
20println!("{}", hash);
21```
22
23# Specification
24
25This section attempts to describe the implementation in a language and platform neutral manner such that it may be
26replicated elsewhere.
27
28## Calculation
29
30The basis of the hash is the SHA-256 digest algorithm which is calculated over a partially-canonical URL.
31
321. The `scheme` component of the URL is converted to lower-case.
332. The `host` component of the URL is converted to lower-case.
343. The `host` component has Unicode normalization via punycode replacement.
354. The `port` component is removed if it is the default for the given scheme (80 for `http`, 443 for `https`, etc.).
365. The `path` component of the URL has any relative components (specified with `"."` and `".."`) removed.
376. An empty `path` component is replaced with `"/"`.
387. The `path`, `query`, and `fragment` components are URL-encoded.
39
40The following table demonstrates some of the results of the rules listed above.
41
42| # | Input | Output |
43|---|--------------------------------------------|-------------------------------------------|
44| 1 | `hTTpS://example.com/` | `https://example.com/` |
45| 2 | `https://Example.COM/` | `https://example.com/` |
46| 3 | `https://exâmple.com/` | `https://xn--exmple-xta.com/` |
47| 3 | `https://example§.com/` | `https://xn--example-eja.com/` |
48| 4 | `http://example.com:80/` | `http://example.com/` |
49| 4 | `https://example.com:443/` | `https://example.com/` |
50| 5 | `https://example.com/foo/../bar/./baz.jpg` | `https://example.com/bar/baz.jpg` |
51| 6 | `https://example.com` | `https://example.com/` |
52| 7 | `https://example.com/hello world` | `https://example.com/hello%20world` |
53| 7 | `https://example.com/?q=hello world` | `https://example.com/?q=hello%20world` |
54| 7 | `https://example.com/?q=hello#to world` | `https://example.com/?q=hello#to%20world` |
55
56## Representation
57
58The resulting SHA-256 is a 256 bit, or 32 byte value. This is stored as four 64-bit (8 byte) unsigned integer values which
59are converted from the digest bytes in little endian order. The following code demonstrates the creation of these values
60from the bytes representing the digest.
61
62The following code demonstrates the creation of these four values from the digest bytes.
63
64```rust,no_run
65# fn digest_bytes() -> [u8;32] { todo!() }
66let bytes: [u8;32] = digest_bytes();
67
68let value_1 = u64::from_le_bytes(bytes[0..8].try_into().unwrap());
69let value_2 = u64::from_le_bytes(bytes[8..16].try_into().unwrap());
70let value_3 = u64::from_le_bytes(bytes[16..24].try_into().unwrap());
71let value_4 = u64::from_le_bytes(bytes[24..32].try_into().unwrap());
72```
73
74## Short Forms
75
76In some cases it is not necessary to store or pass around the entire `UrlHash` 32-byte value when a trade-off for hash
77collision over space may be safely made. To allow for these trade-offs each `UrlHash` instance may be converted into a
7816-byte `UrlShortHash` which contains only the first two 64-bit unsigned values of the full hash, or an 8-byte
79`UrlVeryShortHash` which contains only the first 64-bit unsigned value of the full hash.
80
81The following code demonstrates the creation of short (truncated) hashes as well as the prefix tests `starts_with` and
82`starts_with_just`.
83
84```rust
85# use url::Url;
86# use url_hash::UrlHash;
87let url = Url::parse("https://doc.rust-lang.org/").unwrap();
88let hash = UrlHash::from(url);
89
90let short = hash.short();
91assert!(hash.starts_with(&short));
92
93let very_short = hash.very_short();
94assert!(short.starts_with(&very_short));
95assert!(hash.starts_with_just(&very_short));
96
97assert_eq!(very_short, hash.very_short());
98```
99
100*/
101
102#![warn(
103 unknown_lints,
104 // ---------- Stylistic
105 absolute_paths_not_starting_with_crate,
106 elided_lifetimes_in_paths,
107 explicit_outlives_requirements,
108 macro_use_extern_crate,
109 nonstandard_style, /* group */
110 noop_method_call,
111 rust_2018_idioms,
112 single_use_lifetimes,
113 trivial_casts,
114 trivial_numeric_casts,
115 // ---------- Future
116 future_incompatible, /* group */
117 rust_2021_compatibility, /* group */
118 // ---------- Public
119 missing_debug_implementations,
120 // missing_docs,
121 unreachable_pub,
122 // ---------- Unsafe
123 unsafe_code,
124 unsafe_op_in_unsafe_fn,
125 // ---------- Unused
126 unused, /* group */
127)]
128#![deny(
129 // ---------- Public
130 exported_private_dependencies,
131 // ---------- Deprecated
132 anonymous_parameters,
133 bare_trait_objects,
134 ellipsis_inclusive_range_patterns,
135 // ---------- Unsafe
136 deref_nullptr,
137 drop_bounds,
138 dyn_drop,
139)]
140
141use std::fmt::Display;
142use url::Url;
143use ring::digest;
144
145// ------------------------------------------------------------------------------------------------
146// Public Types
147// ------------------------------------------------------------------------------------------------
148
149///
150/// This type represents a secure, stable, hash value for a [`Url`] using an SHA-256 digest
151/// algorithm. While this hash may be tested for equality (and strict inequality) no other
152/// relations, such as ordering, are supported.
153///
154#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
155pub struct UrlHash([u64;4]);
156
157///
158/// This type contains the first half of a [`UrlHash`] instance where a less secure test using a
159/// truncated hash value is acceptable.
160///
161#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
162pub struct UrlShortHash([u64;2]);
163
164///
165/// This type contains the first quarter of a [`UrlHash`] instance where a less secure test using
166/// a truncated hash value is acceptable.
167///
168#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
169pub struct UrlVeryShortHash(u64);
170
171// ------------------------------------------------------------------------------------------------
172// Public Functions
173// ------------------------------------------------------------------------------------------------
174
175// ------------------------------------------------------------------------------------------------
176// Implementations
177// ------------------------------------------------------------------------------------------------
178
179impl Display for UrlHash {
180 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
181 write!(f, "{}-{}-{}-{}", self.0[0], self.0[1], self.0[2], self.0[3])
182 }
183}
184
185impl From<Url> for UrlHash {
186 fn from(value: Url) -> Self {
187 let url = value.to_string();
188 let hash = digest::digest(&digest::SHA384, url.as_bytes());
189 let bytes = hash.as_ref();
190 assert!(bytes.len() >= digest::SHA256_OUTPUT_LEN);
191 Self([
192 u64::from_le_bytes(bytes[0..8].try_into().unwrap()),
193 u64::from_le_bytes(bytes[8..16].try_into().unwrap()),
194 u64::from_le_bytes(bytes[16..24].try_into().unwrap()),
195 u64::from_le_bytes(bytes[24..32].try_into().unwrap()),
196 ])
197 }
198}
199
200impl UrlHash {
201 ///
202 /// Return a [`UrlShortHash`] instance containing the first two values of this hash.
203 ///
204 #[inline]
205 pub fn short(&self) -> UrlShortHash {
206 UrlShortHash(self.0[0..2].try_into().unwrap())
207 }
208
209 ///
210 /// Return a [`UrlVeryShortHash`] instance containing only the first value of this hash.
211 ///
212 #[inline]
213 pub fn very_short(&self) -> UrlVeryShortHash {
214 UrlVeryShortHash(self.0[0])
215 }
216
217 ///
218 /// Does this hash start with the two values in the provided short hash?
219 ///
220 #[inline]
221 pub fn starts_with(&self, short_hash: &UrlShortHash) -> bool {
222 self.0[0] == short_hash.0[0] && self.0[1] == short_hash.0[1]
223 }
224
225 ///
226 /// Does this hash start with the value in the provided very-short hash?
227 ///
228 #[inline]
229 pub fn starts_with_just(&self, very_short_hash: &UrlVeryShortHash) -> bool {
230 self.0[0] == very_short_hash.0
231 }
232}
233
234// ------------------------------------------------------------------------------------------------
235
236impl Display for UrlShortHash {
237 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
238 write!(f, "{}-{}", self.0[0], self.0[1])
239 }
240}
241
242impl UrlShortHash {
243 ///
244 /// Return a [`UrlVeryShortHash`] instance containing only the first value of this short hash.
245 ///
246 #[inline]
247 pub fn very_short(&self) -> UrlVeryShortHash {
248 UrlVeryShortHash(self.0[0])
249 }
250
251 ///
252 /// Does this hash start with the value in the provided very-short hash?
253 ///
254 #[inline]
255 pub fn starts_with(&self, very_short_hash: &UrlVeryShortHash) -> bool {
256 self.0[0] == very_short_hash.0
257 }
258}
259
260// ------------------------------------------------------------------------------------------------
261
262impl Display for UrlVeryShortHash {
263 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
264 write!(f, "{}", self.0)
265 }
266}
267
268// ------------------------------------------------------------------------------------------------
269// Modules
270// ------------------------------------------------------------------------------------------------
271
272#[cfg(test)]
273mod tests {
274 use super::*;
275
276 #[test]
277 fn test_hash_url_repeatedly() {
278 let url = Url::parse("https://doc.rust-lang.org/std/primitive.u8.html#method.to_ascii_lowercase").unwrap();
279 let first = UrlHash::from(url);
280
281 for _ in 1..1000 {
282 let url = Url::parse("https://doc.rust-lang.org/std/primitive.u8.html#method.to_ascii_lowercase").unwrap();
283 let again = UrlHash::from(url);
284 assert_eq!(again, first);
285 }
286 }
287
288 #[test]
289 fn test_another_url() {
290 let url = Url::parse("https://www.google.com/search?q=rust+hash+url+value&rlz=1C5GCEM_enUS1025US1025&oq=rust+hash+url+value&gs_lcrp=EgZjaHJvbWUyBggAEEUYOdIBCDYyOTlqMGo0qAIAsAIA&sourceid=chrome&ie=UTF-8").unwrap();
291 let _ = UrlHash::from(url);
292 }
293
294 #[test]
295 fn test_hash_prefixes() {
296 let url = Url::parse("https://doc.rust-lang.org/std/primitive.u8.html#method.to_ascii_lowercase").unwrap();
297 let hash = UrlHash::from(url);
298 println!("{}", hash);
299
300 let short = hash.short();
301 println!("{}", short);
302 assert!(hash.starts_with(&short));
303
304 let very_short = hash.very_short();
305 println!("{}", very_short);
306 assert!(short.starts_with(&very_short));
307 assert!(hash.starts_with_just(&very_short));
308 }
309
310 #[test]
311 fn test_url_prereq_scheme_case() {
312 assert_eq!(
313 Url::parse("hTTpS://example.com/").unwrap().as_str(),
314 "https://example.com/"
315 );
316 }
317
318 #[test]
319 fn test_url_prereq_host_case() {
320 assert_eq!(
321 Url::parse("https://Example.COM/").unwrap().as_str(),
322 "https://example.com/"
323 );
324 }
325
326 #[test]
327 fn test_url_prereq_host_punycode() {
328 assert_eq!(
329 Url::parse("https://exâmple.com/").unwrap().as_str(),
330 "https://xn--exmple-xta.com/"
331 );
332
333 assert_eq!(
334 Url::parse("https://example§.com/").unwrap().as_str(),
335 "https://xn--example-eja.com/"
336 );
337 }
338
339 #[test]
340 fn test_url_prereq_port_default() {
341 assert_eq!(
342 Url::parse("http://example.com:80/").unwrap().as_str(),
343 "http://example.com/"
344 );
345
346 assert_eq!(
347 Url::parse("https://example.com:443/").unwrap().as_str(),
348 "https://example.com/"
349 );
350 }
351
352 #[test]
353 fn test_url_prereq_path_normalize() {
354 assert_eq!(
355 Url::parse("https://example.com/foo/../bar/./baz.jpg").unwrap().as_str(),
356 "https://example.com/bar/baz.jpg"
357 );
358 }
359
360 #[test]
361 fn test_url_prereq_empty_path_slash() {
362 assert_eq!(
363 Url::parse("https://example.com").unwrap().as_str(),
364 "https://example.com/"
365 );
366 }
367
368 #[test]
369 fn test_url_prereq_encode_path() {
370 assert_eq!(
371 Url::parse("https://example.com/hello world").unwrap().as_str(),
372 "https://example.com/hello%20world"
373 );
374 }
375
376 #[test]
377 fn test_url_prereq_encode_query() {
378 assert_eq!(
379 Url::parse("https://example.com/?q=hello world").unwrap().as_str(),
380 "https://example.com/?q=hello%20world"
381 );
382 }
383
384 #[test]
385 fn test_url_prereq_encode_fragment() {
386 assert_eq!(
387 Url::parse("https://example.com/?q=hello#to world").unwrap().as_str(),
388 "https://example.com/?q=hello#to%20world"
389 );
390 }
391}