url_hash/
lib.rs

1/*!
2This crate provides three types that represent hash values specifically for the [`Url`] types. 
3
4For some URL-centric structures such as RDF graphs and XML documents, there becomes a core requirement to manage
5hash-like operations to compare URL values or to detect the presence of a URL in a cache. While Rust's built-in hash
6implementation, and by extension collections such as `HashMap` and `HashSet`, may be used they provide a closed
7implementation that cannot be used in a language-portable, or persistent manner without effort. This
8
9The purpose of the type [`UrlHash`] is to provide a stable value that represents a stable cryptographic hash of a single
10URL value that can be replicated across different platforms, and programming environments. 
11
12# Example
13
14```rust
15use url::Url;
16use url_hash::UrlHash;
17
18let url = Url::parse("https://doc.rust-lang.org/").unwrap();
19let hash =  UrlHash::from(url);
20println!("{}", hash);
21```
22
23# Specification
24
25This section attempts to describe the implementation in a language and platform neutral manner such that it may be
26replicated elsewhere.
27
28## Calculation
29
30The basis of the hash is the SHA-256 digest algorithm which is calculated over a partially-canonical URL.
31
321. The `scheme` component of the URL is converted to lower-case.
332. The `host` component of the URL is converted to lower-case.
343. The `host` component has Unicode normalization via punycode replacement.
354. The `port` component is removed if it is the default for the given scheme (80 for `http`, 443 for `https`, etc.).
365. The `path` component of the URL has any relative components (specified with `"."` and `".."`) removed.
376. An empty `path` component is replaced with `"/"`.
387. The `path`, `query`, and `fragment` components are URL-encoded.
39
40The following table demonstrates some of the results of the rules listed above.
41
42| # | Input                                      | Output                                    |
43|---|--------------------------------------------|-------------------------------------------|
44| 1 | `hTTpS://example.com/`                     | `https://example.com/`                    |
45| 2 | `https://Example.COM/`                     | `https://example.com/`                    |
46| 3 | `https://exâmple.com/`                     | `https://xn--exmple-xta.com/`             |
47| 3 | `https://example§.com/`                    | `https://xn--example-eja.com/`            |
48| 4 | `http://example.com:80/`                   | `http://example.com/`                     |
49| 4 | `https://example.com:443/`                 | `https://example.com/`                    |
50| 5 | `https://example.com/foo/../bar/./baz.jpg` | `https://example.com/bar/baz.jpg`         |
51| 6 | `https://example.com`                      | `https://example.com/`                    |
52| 7 | `https://example.com/hello world`          | `https://example.com/hello%20world`       |
53| 7 | `https://example.com/?q=hello world`       | `https://example.com/?q=hello%20world`    |
54| 7 | `https://example.com/?q=hello#to world`    | `https://example.com/?q=hello#to%20world` |
55
56## Representation
57
58The resulting SHA-256 is a 256 bit, or 32 byte value. This is stored as four 64-bit (8 byte) unsigned integer values which
59are converted from the digest bytes in little endian order. The following code demonstrates the creation of these values
60from the bytes representing the digest.
61
62The following code demonstrates the creation of these four values from the digest bytes.
63
64```rust,no_run
65# fn digest_bytes() -> [u8;32] { todo!() }
66let bytes: [u8;32] = digest_bytes();
67
68let value_1 = u64::from_le_bytes(bytes[0..8].try_into().unwrap());
69let value_2 = u64::from_le_bytes(bytes[8..16].try_into().unwrap());
70let value_3 = u64::from_le_bytes(bytes[16..24].try_into().unwrap());
71let value_4 = u64::from_le_bytes(bytes[24..32].try_into().unwrap());
72```
73
74## Short Forms
75
76In some cases it is not necessary to store or pass around the entire `UrlHash` 32-byte value when a trade-off for hash
77collision over space may be safely made. To allow for these trade-offs each `UrlHash` instance may be converted into a
7816-byte `UrlShortHash` which contains only the first two 64-bit unsigned values of the full hash, or an 8-byte
79`UrlVeryShortHash` which contains only the first 64-bit unsigned value of the full hash.
80
81The following code demonstrates the creation of short (truncated) hashes as well as the prefix tests `starts_with` and
82`starts_with_just`.
83
84```rust
85# use url::Url;
86# use url_hash::UrlHash;
87let url = Url::parse("https://doc.rust-lang.org/").unwrap();
88let hash =  UrlHash::from(url);
89
90let short = hash.short();
91assert!(hash.starts_with(&short));
92
93let very_short = hash.very_short();
94assert!(short.starts_with(&very_short));
95assert!(hash.starts_with_just(&very_short));
96
97assert_eq!(very_short, hash.very_short());
98```
99
100*/
101
102#![warn(
103    unknown_lints,
104    // ---------- Stylistic
105    absolute_paths_not_starting_with_crate,
106    elided_lifetimes_in_paths,
107    explicit_outlives_requirements,
108    macro_use_extern_crate,
109    nonstandard_style, /* group */
110    noop_method_call,
111    rust_2018_idioms,
112    single_use_lifetimes,
113    trivial_casts,
114    trivial_numeric_casts,
115    // ---------- Future
116    future_incompatible, /* group */
117    rust_2021_compatibility, /* group */
118    // ---------- Public
119    missing_debug_implementations,
120    // missing_docs,
121    unreachable_pub,
122    // ---------- Unsafe
123    unsafe_code,
124    unsafe_op_in_unsafe_fn,
125    // ---------- Unused
126    unused, /* group */
127)]
128#![deny(
129    // ---------- Public
130    exported_private_dependencies,
131    // ---------- Deprecated
132    anonymous_parameters,
133    bare_trait_objects,
134    ellipsis_inclusive_range_patterns,
135    // ---------- Unsafe
136    deref_nullptr,
137    drop_bounds,
138    dyn_drop,
139)]
140
141use std::fmt::Display;
142use url::Url;
143use ring::digest;
144
145// ------------------------------------------------------------------------------------------------
146// Public Types
147// ------------------------------------------------------------------------------------------------
148
149///
150/// This type represents a secure, stable, hash value for a [`Url`] using an SHA-256 digest
151/// algorithm. While this hash may be tested for equality (and strict inequality) no other
152/// relations, such as ordering, are supported. 
153///
154#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
155pub struct UrlHash([u64;4]);
156
157///
158/// This type contains the first half of a [`UrlHash`] instance where a less secure test using a
159/// truncated hash value is acceptable.
160///
161#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
162pub struct UrlShortHash([u64;2]);
163
164///
165/// This type contains the first quarter of a [`UrlHash`] instance where a less secure test using
166/// a truncated hash value is acceptable.
167///
168#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
169pub struct UrlVeryShortHash(u64);
170
171// ------------------------------------------------------------------------------------------------
172// Public Functions
173// ------------------------------------------------------------------------------------------------
174
175// ------------------------------------------------------------------------------------------------
176// Implementations
177// ------------------------------------------------------------------------------------------------
178
179impl Display for UrlHash {
180    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
181        write!(f, "{}-{}-{}-{}", self.0[0], self.0[1], self.0[2], self.0[3])
182    }
183}
184
185impl From<Url> for UrlHash {
186    fn from(value: Url) -> Self {
187        let url = value.to_string();
188        let hash = digest::digest(&digest::SHA384, url.as_bytes());
189        let bytes = hash.as_ref();
190        assert!(bytes.len() >= digest::SHA256_OUTPUT_LEN);
191        Self([
192            u64::from_le_bytes(bytes[0..8].try_into().unwrap()),
193            u64::from_le_bytes(bytes[8..16].try_into().unwrap()),
194            u64::from_le_bytes(bytes[16..24].try_into().unwrap()),
195            u64::from_le_bytes(bytes[24..32].try_into().unwrap()),
196        ])
197    }
198}
199
200impl UrlHash {
201    ///
202    /// Return a [`UrlShortHash`] instance containing the first two values of this hash.
203    ///
204    #[inline]
205    pub fn short(&self) -> UrlShortHash {
206        UrlShortHash(self.0[0..2].try_into().unwrap())
207    }
208
209    ///
210    /// Return a [`UrlVeryShortHash`] instance containing only the first value of this hash.
211    ///
212    #[inline]
213    pub fn very_short(&self) -> UrlVeryShortHash {
214        UrlVeryShortHash(self.0[0])
215    }
216
217    ///
218    /// Does this hash start with the two values in the provided short hash?
219    ///
220    #[inline]
221    pub fn starts_with(&self, short_hash: &UrlShortHash) -> bool {
222        self.0[0] == short_hash.0[0] && self.0[1] == short_hash.0[1]
223    }
224
225    ///
226    /// Does this hash start with the value in the provided very-short hash?
227    ///
228    #[inline]
229    pub fn starts_with_just(&self, very_short_hash: &UrlVeryShortHash) -> bool {
230        self.0[0] == very_short_hash.0
231    }
232}
233
234// ------------------------------------------------------------------------------------------------
235
236impl Display for UrlShortHash {
237    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
238        write!(f, "{}-{}", self.0[0], self.0[1])
239    }
240}
241
242impl UrlShortHash {
243    ///
244    /// Return a [`UrlVeryShortHash`] instance containing only the first value of this short hash.
245    ///
246    #[inline]
247    pub fn very_short(&self) -> UrlVeryShortHash {
248        UrlVeryShortHash(self.0[0])
249    }
250
251    ///
252    /// Does this hash start with the value in the provided very-short hash?
253    ///
254    #[inline]
255    pub fn starts_with(&self, very_short_hash: &UrlVeryShortHash) -> bool {
256        self.0[0] == very_short_hash.0
257    }
258}
259
260// ------------------------------------------------------------------------------------------------
261
262impl Display for UrlVeryShortHash {
263    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
264        write!(f, "{}", self.0)
265    }
266}
267
268// ------------------------------------------------------------------------------------------------
269// Modules
270// ------------------------------------------------------------------------------------------------
271
272#[cfg(test)]
273mod tests {
274    use super::*;
275
276    #[test]
277    fn test_hash_url_repeatedly() {
278        let url = Url::parse("https://doc.rust-lang.org/std/primitive.u8.html#method.to_ascii_lowercase").unwrap();
279        let first =  UrlHash::from(url);
280
281        for _ in 1..1000 {
282            let url = Url::parse("https://doc.rust-lang.org/std/primitive.u8.html#method.to_ascii_lowercase").unwrap();
283            let again =  UrlHash::from(url);
284            assert_eq!(again, first);
285        }
286    }
287
288    #[test]
289    fn test_another_url() {
290        let url = Url::parse("https://www.google.com/search?q=rust+hash+url+value&rlz=1C5GCEM_enUS1025US1025&oq=rust+hash+url+value&gs_lcrp=EgZjaHJvbWUyBggAEEUYOdIBCDYyOTlqMGo0qAIAsAIA&sourceid=chrome&ie=UTF-8").unwrap();
291        let _ = UrlHash::from(url);
292    }
293
294    #[test]
295    fn test_hash_prefixes() {
296        let url = Url::parse("https://doc.rust-lang.org/std/primitive.u8.html#method.to_ascii_lowercase").unwrap();
297        let hash =  UrlHash::from(url);
298        println!("{}", hash);
299
300        let short = hash.short();
301        println!("{}", short);
302        assert!(hash.starts_with(&short));
303
304        let very_short = hash.very_short();
305        println!("{}", very_short);
306        assert!(short.starts_with(&very_short));
307        assert!(hash.starts_with_just(&very_short));
308    }
309
310    #[test]
311    fn test_url_prereq_scheme_case() {
312        assert_eq!(
313            Url::parse("hTTpS://example.com/").unwrap().as_str(),
314            "https://example.com/"
315        );
316    }
317
318    #[test]
319    fn test_url_prereq_host_case() {
320        assert_eq!(
321            Url::parse("https://Example.COM/").unwrap().as_str(),
322            "https://example.com/"
323        );
324    }
325
326    #[test]
327    fn test_url_prereq_host_punycode() {
328        assert_eq!(
329            Url::parse("https://exâmple.com/").unwrap().as_str(),
330            "https://xn--exmple-xta.com/"
331        );
332
333        assert_eq!(
334            Url::parse("https://example§.com/").unwrap().as_str(),
335            "https://xn--example-eja.com/"
336        );
337    }
338
339    #[test]
340    fn test_url_prereq_port_default() {
341        assert_eq!(
342            Url::parse("http://example.com:80/").unwrap().as_str(),
343            "http://example.com/"
344        );
345
346        assert_eq!(
347            Url::parse("https://example.com:443/").unwrap().as_str(),
348            "https://example.com/"
349        );
350    }
351
352    #[test]
353    fn test_url_prereq_path_normalize() {
354        assert_eq!(
355            Url::parse("https://example.com/foo/../bar/./baz.jpg").unwrap().as_str(),
356            "https://example.com/bar/baz.jpg"
357        );
358    }
359
360    #[test]
361    fn test_url_prereq_empty_path_slash() {
362        assert_eq!(
363            Url::parse("https://example.com").unwrap().as_str(),
364            "https://example.com/"
365        );
366    }
367
368    #[test]
369    fn test_url_prereq_encode_path() {
370        assert_eq!(
371            Url::parse("https://example.com/hello world").unwrap().as_str(),
372            "https://example.com/hello%20world"
373        );
374    }
375
376    #[test]
377    fn test_url_prereq_encode_query() {
378        assert_eq!(
379            Url::parse("https://example.com/?q=hello world").unwrap().as_str(),
380            "https://example.com/?q=hello%20world"
381        );
382    }
383
384    #[test]
385    fn test_url_prereq_encode_fragment() {
386        assert_eq!(
387            Url::parse("https://example.com/?q=hello#to world").unwrap().as_str(),
388            "https://example.com/?q=hello#to%20world"
389        );
390    }
391}