uv_cache_key/
canonical_url.rs

1use std::borrow::Cow;
2use std::fmt::{Debug, Formatter};
3use std::hash::{Hash, Hasher};
4use std::ops::Deref;
5
6use url::Url;
7use uv_redacted::{DisplaySafeUrl, DisplaySafeUrlError};
8
9use crate::cache_key::{CacheKey, CacheKeyHasher};
10
11/// A wrapper around `Url` which represents a "canonical" version of an original URL.
12///
13/// A "canonical" url is only intended for internal comparison purposes. It's to help paper over
14/// mistakes such as depending on `github.com/foo/bar` vs. `github.com/foo/bar.git`.
15///
16/// This is **only** for internal purposes and provides no means to actually read the underlying
17/// string value of the `Url` it contains. This is intentional, because all fetching should still
18/// happen within the context of the original URL.
19#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
20pub struct CanonicalUrl(DisplaySafeUrl);
21
22impl CanonicalUrl {
23    pub fn new(url: &DisplaySafeUrl) -> Self {
24        let mut url = url.clone();
25
26        // If the URL cannot be a base, then it's not a valid URL anyway.
27        if url.cannot_be_a_base() {
28            return Self(url);
29        }
30
31        // Strip credentials.
32        let _ = url.set_password(None);
33        let _ = url.set_username("");
34
35        // Strip a trailing slash.
36        if url.path().ends_with('/') {
37            url.path_segments_mut().unwrap().pop_if_empty();
38        }
39
40        // For GitHub URLs specifically, just lower-case everything. GitHub
41        // treats both the same, but they hash differently, and we're gonna be
42        // hashing them. This wants a more general solution, and also we're
43        // almost certainly not using the same case conversion rules that GitHub
44        // does. (See issue #84)
45        if url.host_str() == Some("github.com") {
46            let scheme = url.scheme().to_lowercase();
47            url.set_scheme(&scheme).unwrap();
48            let path = url.path().to_lowercase();
49            url.set_path(&path);
50        }
51
52        // Repos can generally be accessed with or without `.git` extension.
53        if let Some((prefix, suffix)) = url.path().rsplit_once('@') {
54            // Ex) `git+https://github.com/pypa/sample-namespace-packages.git@2.0.0`
55            let needs_chopping = std::path::Path::new(prefix)
56                .extension()
57                .is_some_and(|ext| ext.eq_ignore_ascii_case("git"));
58            if needs_chopping {
59                let prefix = &prefix[..prefix.len() - 4];
60                let path = format!("{prefix}@{suffix}");
61                url.set_path(&path);
62            }
63        } else {
64            // Ex) `git+https://github.com/pypa/sample-namespace-packages.git`
65            let needs_chopping = std::path::Path::new(url.path())
66                .extension()
67                .is_some_and(|ext| ext.eq_ignore_ascii_case("git"));
68            if needs_chopping {
69                let last = {
70                    // Unwrap safety: We checked `url.cannot_be_a_base()`, and `url.path()` having
71                    // an extension implies at least one segment.
72                    let last = url.path_segments().unwrap().next_back().unwrap();
73                    last[..last.len() - 4].to_owned()
74                };
75                url.path_segments_mut().unwrap().pop().push(&last);
76            }
77        }
78
79        // Decode any percent-encoded characters in the path.
80        if memchr::memchr(b'%', url.path().as_bytes()).is_some() {
81            // Unwrap safety: We checked `url.cannot_be_a_base()`.
82            let decoded = url
83                .path_segments()
84                .unwrap()
85                .map(|segment| {
86                    percent_encoding::percent_decode_str(segment)
87                        .decode_utf8()
88                        .unwrap_or(Cow::Borrowed(segment))
89                        .into_owned()
90                })
91                .collect::<Vec<_>>();
92
93            let mut path_segments = url.path_segments_mut().unwrap();
94            path_segments.clear();
95            path_segments.extend(decoded);
96        }
97
98        Self(url)
99    }
100
101    pub fn parse(url: &str) -> Result<Self, DisplaySafeUrlError> {
102        Ok(Self::new(&DisplaySafeUrl::parse(url)?))
103    }
104}
105
106impl CacheKey for CanonicalUrl {
107    fn cache_key(&self, state: &mut CacheKeyHasher) {
108        // `as_str` gives the serialisation of a url (which has a spec) and so insulates against
109        // possible changes in how the URL crate does hashing.
110        self.0.as_str().cache_key(state);
111    }
112}
113
114impl Hash for CanonicalUrl {
115    fn hash<H: Hasher>(&self, state: &mut H) {
116        // `as_str` gives the serialisation of a url (which has a spec) and so insulates against
117        // possible changes in how the URL crate does hashing.
118        self.0.as_str().hash(state);
119    }
120}
121
122impl From<CanonicalUrl> for DisplaySafeUrl {
123    fn from(value: CanonicalUrl) -> Self {
124        value.0
125    }
126}
127
128impl std::fmt::Display for CanonicalUrl {
129    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
130        std::fmt::Display::fmt(&self.0, f)
131    }
132}
133
134/// Like [`CanonicalUrl`], but attempts to represent an underlying source repository, abstracting
135/// away details like the specific commit or branch, or the subdirectory to build within the
136/// repository.
137///
138/// For example, `https://github.com/pypa/package.git#subdirectory=pkg_a` and
139/// `https://github.com/pypa/package.git#subdirectory=pkg_b` would map to different
140/// [`CanonicalUrl`] values, but the same [`RepositoryUrl`], since they map to the same
141/// resource.
142///
143/// The additional information it holds should only be used to discriminate between
144/// sources that hold the exact same commit in their canonical representation,
145/// but may differ in the contents such as when Git LFS is enabled.
146///
147/// A different cache key will be computed when Git LFS is enabled.
148/// When Git LFS is `false` or `None`, the cache key remains unchanged.
149#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
150pub struct RepositoryUrl {
151    repo_url: DisplaySafeUrl,
152    with_lfs: Option<bool>,
153}
154
155impl RepositoryUrl {
156    pub fn new(url: &DisplaySafeUrl) -> Self {
157        let mut url = CanonicalUrl::new(url).0;
158
159        // If a Git URL ends in a reference (like a branch, tag, or commit), remove it.
160        if url.scheme().starts_with("git+") {
161            if let Some(prefix) = url
162                .path()
163                .rsplit_once('@')
164                .map(|(prefix, _suffix)| prefix.to_string())
165            {
166                url.set_path(&prefix);
167            }
168        }
169
170        // Drop any fragments and query parameters.
171        url.set_fragment(None);
172        url.set_query(None);
173
174        Self {
175            repo_url: url,
176            with_lfs: None,
177        }
178    }
179
180    pub fn parse(url: &str) -> Result<Self, DisplaySafeUrlError> {
181        Ok(Self::new(&DisplaySafeUrl::parse(url)?))
182    }
183
184    #[must_use]
185    pub fn with_lfs(mut self, lfs: Option<bool>) -> Self {
186        self.with_lfs = lfs;
187        self
188    }
189}
190
191impl CacheKey for RepositoryUrl {
192    fn cache_key(&self, state: &mut CacheKeyHasher) {
193        // `as_str` gives the serialisation of a url (which has a spec) and so insulates against
194        // possible changes in how the URL crate does hashing.
195        self.repo_url.as_str().cache_key(state);
196        if let Some(true) = self.with_lfs {
197            1u8.cache_key(state);
198        }
199    }
200}
201
202impl Hash for RepositoryUrl {
203    fn hash<H: Hasher>(&self, state: &mut H) {
204        // `as_str` gives the serialisation of a url (which has a spec) and so insulates against
205        // possible changes in how the URL crate does hashing.
206        self.repo_url.as_str().hash(state);
207        if let Some(true) = self.with_lfs {
208            1u8.hash(state);
209        }
210    }
211}
212
213impl Deref for RepositoryUrl {
214    type Target = Url;
215
216    fn deref(&self) -> &Self::Target {
217        &self.repo_url
218    }
219}
220
221impl std::fmt::Display for RepositoryUrl {
222    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
223        std::fmt::Display::fmt(&self.repo_url, f)
224    }
225}
226
#[cfg(test)]
mod tests {
    use super::*;

    /// Feeds `value` into a fresh [`CacheKeyHasher`] and hands the hasher back, ready to be
    /// finished by the caller.
    fn hasher_for(value: &impl CacheKey) -> CacheKeyHasher {
        let mut hasher = CacheKeyHasher::new();
        value.cache_key(&mut hasher);
        hasher
    }

    /// Parses both URLs with `parse` and asserts that the results compare equal.
    #[track_caller]
    fn expect_same<T>(
        parse: impl Fn(&str) -> Result<T, DisplaySafeUrlError>,
        lhs: &str,
        rhs: &str,
    ) -> Result<(), DisplaySafeUrlError>
    where
        T: PartialEq + std::fmt::Debug,
    {
        assert_eq!(parse(lhs)?, parse(rhs)?);
        Ok(())
    }

    /// Parses both URLs with `parse` and asserts that the results compare unequal.
    #[track_caller]
    fn expect_distinct<T>(
        parse: impl Fn(&str) -> Result<T, DisplaySafeUrlError>,
        lhs: &str,
        rhs: &str,
    ) -> Result<(), DisplaySafeUrlError>
    where
        T: PartialEq + std::fmt::Debug,
    {
        assert_ne!(parse(lhs)?, parse(rhs)?);
        Ok(())
    }

    #[test]
    fn user_credential_does_not_affect_cache_key() -> Result<(), DisplaySafeUrlError> {
        let hash_without_creds = hasher_for(&CanonicalUrl::parse(
            "https://example.com/pypa/sample-namespace-packages.git@2.0.0",
        )?)
        .finish();

        // Each credential-bearing variant must produce the same key as the bare URL.
        let variants = [
            (
                "https://user:foo@example.com/pypa/sample-namespace-packages.git@2.0.0",
                "URLs with no user credentials should hash the same as URLs with different user credentials",
            ),
            (
                "https://user:bar@example.com/pypa/sample-namespace-packages.git@2.0.0",
                "URLs with different user credentials should hash the same",
            ),
            (
                "https://:bar@example.com/pypa/sample-namespace-packages.git@2.0.0",
                "URLs with no username, though with a password, should hash the same as URLs with different user credentials",
            ),
            (
                "https://user:@example.com/pypa/sample-namespace-packages.git@2.0.0",
                "URLs with no password, though with a username, should hash the same as URLs with different user credentials",
            ),
        ];

        for (url, message) in variants {
            let hash_with_creds = hasher_for(&CanonicalUrl::parse(url)?).finish();
            assert_eq!(hash_without_creds, hash_with_creds, "{message}");
        }

        Ok(())
    }

    #[test]
    fn canonical_url() -> Result<(), DisplaySafeUrlError> {
        let parse = CanonicalUrl::parse;

        // The `.git` suffix is irrelevant to equality.
        expect_same(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git",
            "git+https://github.com/pypa/sample-namespace-packages",
        )?;

        // The `.git` suffix is irrelevant to equality, also in front of a reference.
        expect_same(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git@2.0.0",
            "git+https://github.com/pypa/sample-namespace-packages@2.0.0",
        )?;

        // Different repositories are _not_ equal.
        expect_distinct(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git",
            "git+https://github.com/pypa/sample-packages.git",
        )?;

        // Different subdirectories are _not_ equal.
        expect_distinct(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a",
            "git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b",
        )?;

        // Differing Git LFS enablement is _not_ equal.
        expect_distinct(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git#lfs=true",
            "git+https://github.com/pypa/sample-namespace-packages.git",
        )?;

        // Different commit tags are _not_ equal.
        expect_distinct(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0",
            "git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0",
        )?;

        // Identical URLs that cannot be a base are equal.
        expect_same(
            parse,
            "git+https:://github.com/pypa/sample-namespace-packages.git",
            "git+https:://github.com/pypa/sample-namespace-packages.git",
        )?;

        // Percent-encoded slashes must _not_ be treated as path separators.
        expect_distinct(
            parse,
            "https://github.com/pypa/sample%2Fnamespace%2Fpackages",
            "https://github.com/pypa/sample/namespace/packages",
        )?;

        // Percent-encoding of other characters is irrelevant to equality.
        expect_same(
            parse,
            "https://github.com/pypa/sample%2Bnamespace%2Bpackages",
            "https://github.com/pypa/sample+namespace+packages",
        )?;

        // Percent-encoded slashes must _not_ be treated as path separators (file URLs).
        expect_distinct(
            parse,
            "file:///home/ferris/my_project%2Fmy_project-0.1.0-py3-none-any.whl",
            "file:///home/ferris/my_project/my_project-0.1.0-py3-none-any.whl",
        )?;

        // Percent-encoding of other characters is irrelevant to equality (file URLs).
        expect_same(
            parse,
            "file:///home/ferris/my_project/my_project-0.1.0+foo-py3-none-any.whl",
            "file:///home/ferris/my_project/my_project-0.1.0%2Bfoo-py3-none-any.whl",
        )?;

        Ok(())
    }

    #[test]
    fn repository_url() -> Result<(), DisplaySafeUrlError> {
        let parse = RepositoryUrl::parse;

        // The `.git` suffix is irrelevant to equality.
        expect_same(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git",
            "git+https://github.com/pypa/sample-namespace-packages",
        )?;

        // The `.git` suffix is irrelevant to equality, also in front of a reference.
        expect_same(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git@2.0.0",
            "git+https://github.com/pypa/sample-namespace-packages@2.0.0",
        )?;

        // Different repositories are _not_ equal.
        expect_distinct(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git",
            "git+https://github.com/pypa/sample-packages.git",
        )?;

        // The same repository is equal, even when different subdirectories are requested.
        expect_same(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a",
            "git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b",
        )?;

        // The same repository is equal, even when different commit tags are requested.
        expect_same(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0",
            "git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0",
        )?;

        // The same repository is equal, even when Git LFS enablement differs in the fragment.
        expect_same(
            parse,
            "git+https://github.com/pypa/sample-namespace-packages.git#lfs=true",
            "git+https://github.com/pypa/sample-namespace-packages.git",
        )?;

        Ok(())
    }

    #[test]
    fn repository_url_with_lfs() -> Result<(), DisplaySafeUrlError> {
        const WITH_CREDS_AND_FRAGMENT: &str =
            "https://user:foo@example.com/pypa/sample-namespace-packages.git@2.0.0#foo=bar";

        let repo_url_basic = hasher_for(&RepositoryUrl::parse(
            "https://example.com/pypa/sample-namespace-packages.git@2.0.0",
        )?)
        .finish();

        let repo_url_with_fragments =
            hasher_for(&RepositoryUrl::parse(WITH_CREDS_AND_FRAGMENT)?).finish();

        assert_eq!(
            repo_url_basic, repo_url_with_fragments,
            "repository urls should have the exact cache keys as fragments are removed",
        );

        let git_url_with_fragments =
            hasher_for(&RepositoryUrl::parse(WITH_CREDS_AND_FRAGMENT)?.with_lfs(None)).finish();

        assert_eq!(
            repo_url_with_fragments, git_url_with_fragments,
            "both structs should have the exact cache keys as fragments are still removed",
        );

        let git_url_with_fragments_and_lfs_false =
            hasher_for(&RepositoryUrl::parse(WITH_CREDS_AND_FRAGMENT)?.with_lfs(Some(false)))
                .finish();

        assert_eq!(
            git_url_with_fragments, git_url_with_fragments_and_lfs_false,
            "both structs should have the exact cache keys as lfs false should not influence them",
        );

        let git_url_with_fragments_and_lfs_true =
            hasher_for(&RepositoryUrl::parse(WITH_CREDS_AND_FRAGMENT)?.with_lfs(Some(true)))
                .finish();

        assert_ne!(
            git_url_with_fragments, git_url_with_fragments_and_lfs_true,
            "both structs should have different cache keys as one has Git LFS enabled",
        );

        Ok(())
    }
}