uv_cache_key/
canonical_url.rs

1use std::borrow::Cow;
2use std::fmt::{Debug, Formatter};
3use std::hash::{Hash, Hasher};
4use std::ops::Deref;
5
6use url::Url;
7use uv_redacted::{DisplaySafeUrl, DisplaySafeUrlError};
8
9use crate::cache_key::{CacheKey, CacheKeyHasher};
10
11/// A wrapper around `Url` which represents a "canonical" version of an original URL.
12///
13/// A "canonical" url is only intended for internal comparison purposes. It's to help paper over
14/// mistakes such as depending on `github.com/foo/bar` vs. `github.com/foo/bar.git`.
15///
16/// This is **only** for internal purposes and provides no means to actually read the underlying
17/// string value of the `Url` it contains. This is intentional, because all fetching should still
18/// happen within the context of the original URL.
19#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
20pub struct CanonicalUrl(DisplaySafeUrl);
21
22impl CanonicalUrl {
23    pub fn new(url: &DisplaySafeUrl) -> Self {
24        let mut url = url.clone();
25
26        // If the URL cannot be a base, then it's not a valid URL anyway.
27        if url.cannot_be_a_base() {
28            return Self(url);
29        }
30
31        // Strip credentials.
32        let _ = url.set_password(None);
33        let _ = url.set_username("");
34
35        // Strip a trailing slash.
36        if url.path().ends_with('/') {
37            url.path_segments_mut().unwrap().pop_if_empty();
38        }
39
40        // For GitHub URLs specifically, just lower-case everything. GitHub
41        // treats both the same, but they hash differently, and we're gonna be
42        // hashing them. This wants a more general solution, and also we're
43        // almost certainly not using the same case conversion rules that GitHub
44        // does. (See issue #84)
45        if url.host_str() == Some("github.com") {
46            let scheme = url.scheme().to_lowercase();
47            url.set_scheme(&scheme).unwrap();
48            let path = url.path().to_lowercase();
49            url.set_path(&path);
50        }
51
52        // Repos can generally be accessed with or without `.git` extension.
53        if let Some((prefix, suffix)) = url.path().rsplit_once('@') {
54            // Ex) `git+https://github.com/pypa/sample-namespace-packages.git@2.0.0`
55            let needs_chopping = std::path::Path::new(prefix)
56                .extension()
57                .is_some_and(|ext| ext.eq_ignore_ascii_case("git"));
58            if needs_chopping {
59                let prefix = &prefix[..prefix.len() - 4];
60                let path = format!("{prefix}@{suffix}");
61                url.set_path(&path);
62            }
63        } else {
64            // Ex) `git+https://github.com/pypa/sample-namespace-packages.git`
65            let needs_chopping = std::path::Path::new(url.path())
66                .extension()
67                .is_some_and(|ext| ext.eq_ignore_ascii_case("git"));
68            if needs_chopping {
69                let last = {
70                    // Unwrap safety: We checked `url.cannot_be_a_base()`, and `url.path()` having
71                    // an extension implies at least one segment.
72                    let last = url.path_segments().unwrap().next_back().unwrap();
73                    last[..last.len() - 4].to_owned()
74                };
75                url.path_segments_mut().unwrap().pop().push(&last);
76            }
77        }
78
79        // Decode any percent-encoded characters in the path.
80        if memchr::memchr(b'%', url.path().as_bytes()).is_some() {
81            // Unwrap safety: We checked `url.cannot_be_a_base()`.
82            let decoded = url
83                .path_segments()
84                .unwrap()
85                .map(|segment| {
86                    percent_encoding::percent_decode_str(segment)
87                        .decode_utf8()
88                        .unwrap_or(Cow::Borrowed(segment))
89                        .into_owned()
90                })
91                .collect::<Vec<_>>();
92
93            let mut path_segments = url.path_segments_mut().unwrap();
94            path_segments.clear();
95            path_segments.extend(decoded);
96        }
97
98        Self(url)
99    }
100
101    pub fn parse(url: &str) -> Result<Self, DisplaySafeUrlError> {
102        Ok(Self::new(&DisplaySafeUrl::parse(url)?))
103    }
104}
105
106impl CacheKey for CanonicalUrl {
107    fn cache_key(&self, state: &mut CacheKeyHasher) {
108        // `as_str` gives the serialisation of a url (which has a spec) and so insulates against
109        // possible changes in how the URL crate does hashing.
110        self.0.as_str().cache_key(state);
111    }
112}
113
114impl Hash for CanonicalUrl {
115    fn hash<H: Hasher>(&self, state: &mut H) {
116        // `as_str` gives the serialisation of a url (which has a spec) and so insulates against
117        // possible changes in how the URL crate does hashing.
118        self.0.as_str().hash(state);
119    }
120}
121
122impl From<CanonicalUrl> for DisplaySafeUrl {
123    fn from(value: CanonicalUrl) -> Self {
124        value.0
125    }
126}
127
128impl std::fmt::Display for CanonicalUrl {
129    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
130        std::fmt::Display::fmt(&self.0, f)
131    }
132}
133
134/// Like [`CanonicalUrl`], but attempts to represent an underlying source repository, abstracting
135/// away details like the specific commit or branch, or the subdirectory to build within the
136/// repository.
137///
138/// For example, `https://github.com/pypa/package.git#subdirectory=pkg_a` and
139/// `https://github.com/pypa/package.git#subdirectory=pkg_b` would map to different
140/// [`CanonicalUrl`] values, but the same [`RepositoryUrl`], since they map to the same
141/// resource.
142#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
143pub struct RepositoryUrl(DisplaySafeUrl);
144
145impl RepositoryUrl {
146    pub fn new(url: &DisplaySafeUrl) -> Self {
147        let mut url = CanonicalUrl::new(url).0;
148
149        // If a Git URL ends in a reference (like a branch, tag, or commit), remove it.
150        if url.scheme().starts_with("git+") {
151            if let Some(prefix) = url
152                .path()
153                .rsplit_once('@')
154                .map(|(prefix, _suffix)| prefix.to_string())
155            {
156                url.set_path(&prefix);
157            }
158        }
159
160        // Drop any fragments and query parameters.
161        url.set_fragment(None);
162        url.set_query(None);
163
164        Self(url)
165    }
166
167    pub fn parse(url: &str) -> Result<Self, DisplaySafeUrlError> {
168        Ok(Self::new(&DisplaySafeUrl::parse(url)?))
169    }
170}
171
172impl CacheKey for RepositoryUrl {
173    fn cache_key(&self, state: &mut CacheKeyHasher) {
174        // `as_str` gives the serialisation of a url (which has a spec) and so insulates against
175        // possible changes in how the URL crate does hashing.
176        self.0.as_str().cache_key(state);
177    }
178}
179
180impl Hash for RepositoryUrl {
181    fn hash<H: Hasher>(&self, state: &mut H) {
182        // `as_str` gives the serialisation of a url (which has a spec) and so insulates against
183        // possible changes in how the URL crate does hashing.
184        self.0.as_str().hash(state);
185    }
186}
187
188impl Deref for RepositoryUrl {
189    type Target = Url;
190
191    fn deref(&self) -> &Self::Target {
192        &self.0
193    }
194}
195
196impl std::fmt::Display for RepositoryUrl {
197    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
198        std::fmt::Display::fmt(&self.0, f)
199    }
200}
201
#[cfg(test)]
mod tests {
    use super::*;

    /// Credentials embedded in a URL must never influence the cache key of its canonical form.
    #[test]
    fn user_credential_does_not_affect_cache_key() -> Result<(), DisplaySafeUrlError> {
        // Canonicalizes `url` and returns the digest of its cache key.
        let digest = |url: &str| -> Result<_, DisplaySafeUrlError> {
            let mut hasher = CacheKeyHasher::new();
            CanonicalUrl::parse(url)?.cache_key(&mut hasher);
            Ok(hasher.finish())
        };

        let hash_without_creds =
            digest("https://example.com/pypa/sample-namespace-packages.git@2.0.0")?;

        // Each entry pairs a URL carrying some form of credentials with the message reported
        // when its digest differs from the credential-free one.
        let cases = [
            (
                "https://user:foo@example.com/pypa/sample-namespace-packages.git@2.0.0",
                "URLs with no user credentials should hash the same as URLs with different user credentials",
            ),
            (
                "https://user:bar@example.com/pypa/sample-namespace-packages.git@2.0.0",
                "URLs with different user credentials should hash the same",
            ),
            (
                "https://:bar@example.com/pypa/sample-namespace-packages.git@2.0.0",
                "URLs with no username, though with a password, should hash the same as URLs with different user credentials",
            ),
            (
                "https://user:@example.com/pypa/sample-namespace-packages.git@2.0.0",
                "URLs with no password, though with a username, should hash the same as URLs with different user credentials",
            ),
        ];

        for (url, message) in cases {
            let hash_with_creds = digest(url)?;
            assert_eq!(hash_without_creds, hash_with_creds, "{message}");
        }

        Ok(())
    }

    /// Equality rules for [`CanonicalUrl`].
    #[test]
    fn canonical_url() -> Result<(), DisplaySafeUrlError> {
        let canonical = CanonicalUrl::parse;

        // A `.git` suffix does not distinguish two URLs.
        assert_eq!(
            canonical("git+https://github.com/pypa/sample-namespace-packages.git")?,
            canonical("git+https://github.com/pypa/sample-namespace-packages")?,
        );

        // A `.git` suffix in front of a reference does not distinguish two URLs either.
        assert_eq!(
            canonical("git+https://github.com/pypa/sample-namespace-packages.git@2.0.0")?,
            canonical("git+https://github.com/pypa/sample-namespace-packages@2.0.0")?,
        );

        // Different repositories are different.
        assert_ne!(
            canonical("git+https://github.com/pypa/sample-namespace-packages.git")?,
            canonical("git+https://github.com/pypa/sample-packages.git")?,
        );

        // Different subdirectories are different.
        assert_ne!(
            canonical(
                "git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a"
            )?,
            canonical(
                "git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b"
            )?,
        );

        // Different commit tags are different.
        assert_ne!(
            canonical("git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0")?,
            canonical("git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0")?,
        );

        // Identical URLs that cannot be a base are equal.
        assert_eq!(
            canonical("git+https:://github.com/pypa/sample-namespace-packages.git")?,
            canonical("git+https:://github.com/pypa/sample-namespace-packages.git")?,
        );

        // A percent-encoded slash is not the same as a path separator.
        assert_ne!(
            canonical("https://github.com/pypa/sample%2Fnamespace%2Fpackages")?,
            canonical("https://github.com/pypa/sample/namespace/packages")?,
        );

        // Percent-encoding of other characters does not distinguish two URLs.
        assert_eq!(
            canonical("https://github.com/pypa/sample%2Bnamespace%2Bpackages")?,
            canonical("https://github.com/pypa/sample+namespace+packages")?,
        );

        // A percent-encoded slash is not the same as a path separator (file URLs).
        assert_ne!(
            canonical("file:///home/ferris/my_project%2Fmy_project-0.1.0-py3-none-any.whl")?,
            canonical("file:///home/ferris/my_project/my_project-0.1.0-py3-none-any.whl")?,
        );

        // Percent-encoding of other characters does not distinguish two file URLs.
        assert_eq!(
            canonical("file:///home/ferris/my_project/my_project-0.1.0+foo-py3-none-any.whl")?,
            canonical("file:///home/ferris/my_project/my_project-0.1.0%2Bfoo-py3-none-any.whl")?,
        );

        Ok(())
    }

    /// Equality rules for [`RepositoryUrl`].
    #[test]
    fn repository_url() -> Result<(), DisplaySafeUrlError> {
        let repository = RepositoryUrl::parse;

        // A `.git` suffix does not distinguish two URLs.
        assert_eq!(
            repository("git+https://github.com/pypa/sample-namespace-packages.git")?,
            repository("git+https://github.com/pypa/sample-namespace-packages")?,
        );

        // A `.git` suffix in front of a reference does not distinguish two URLs either.
        assert_eq!(
            repository("git+https://github.com/pypa/sample-namespace-packages.git@2.0.0")?,
            repository("git+https://github.com/pypa/sample-namespace-packages@2.0.0")?,
        );

        // Different repositories are different.
        assert_ne!(
            repository("git+https://github.com/pypa/sample-namespace-packages.git")?,
            repository("git+https://github.com/pypa/sample-packages.git")?,
        );

        // The same repository is equal even when different subdirectories are requested.
        assert_eq!(
            repository(
                "git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_a"
            )?,
            repository(
                "git+https://github.com/pypa/sample-namespace-packages.git#subdirectory=pkg_resources/pkg_b"
            )?,
        );

        // The same repository is equal even when different commit tags are requested.
        assert_eq!(
            repository("git+https://github.com/pypa/sample-namespace-packages.git@v1.0.0")?,
            repository("git+https://github.com/pypa/sample-namespace-packages.git@v2.0.0")?,
        );

        Ok(())
    }
}