crates_index/
dirs.rs

1use crate::Error;
2
3/// Get the disk location of the specified `url`, as well as its canonical form,
4/// exactly as cargo would.
5///
6/// `cargo_home` is used to root the directory at specific location, if not
7/// specified `CARGO_HOME` or else the default cargo location is used as the root.
8pub fn local_path_and_canonical_url(
9    url: &str,
10    cargo_home: Option<&std::path::Path>,
11) -> Result<(std::path::PathBuf, String), Error> {
12    local_path_and_canonical_url_with_hash_kind(url, cargo_home, &DEFAULT_HASHER_KIND)
13}
14
15/// Like [`local_path_and_canonical_url`] but accepts [`HashKind`] for determining the crate index path.
16pub fn local_path_and_canonical_url_with_hash_kind(
17    url: &str,
18    cargo_home: Option<&std::path::Path>,
19    hash_kind: &HashKind,
20) -> Result<(std::path::PathBuf, String), Error> {
21    let (dir_name, canonical_url) = url_to_local_dir(url, hash_kind)?;
22
23    let mut path = match cargo_home {
24        Some(path) => path.to_owned(),
25        None => home::cargo_home()?,
26    };
27
28    path.push("registry");
29    path.push("index");
30    path.push(dir_name);
31
32    Ok((path, canonical_url))
33}
34
/// Appends the index sharding prefix for `crate_name` to `accumulator`.
///
/// The prefix is chosen by the byte length of the name:
///
/// * 1 byte: `1`
/// * 2 bytes: `2`
/// * 3 bytes: `3`, then `separator`, then the lowercased first byte
/// * 4 or more bytes: the lowercased first two bytes, then `separator`, then
///   the lowercased third and fourth bytes
///
/// Returns `None`, leaving `accumulator` untouched, when `crate_name` is empty.
pub(crate) fn crate_prefix(accumulator: &mut String, crate_name: &str, separator: char) -> Option<()> {
    // Each byte is lowercased with ASCII rules and then widened to a `char`.
    let lower = |byte: &u8| byte.to_ascii_lowercase() as char;

    match crate_name.as_bytes() {
        [] => return None,
        [_] => accumulator.push('1'),
        [_, _] => accumulator.push('2'),
        [first, _, _] => {
            accumulator.push('3');
            accumulator.push(separator);
            accumulator.push(lower(first));
        }
        [first, second, third, fourth, ..] => {
            accumulator.push(lower(first));
            accumulator.push(lower(second));
            accumulator.push(separator);
            accumulator.push(lower(third));
            accumulator.push(lower(fourth));
        }
    }

    Some(())
}
71
72pub(crate) fn crate_name_to_relative_path(crate_name: &str, separator: Option<char>) -> Option<String> {
73    let separator = separator.unwrap_or(std::path::MAIN_SEPARATOR);
74    let mut rel_path = String::with_capacity(crate_name.len() + 6);
75    crate_prefix(&mut rel_path, crate_name, separator)?;
76    rel_path.push(separator);
77    rel_path.extend(crate_name.as_bytes().iter().map(|c| c.to_ascii_lowercase() as char));
78
79    Some(rel_path)
80}
81
/// Numeric tag for the kind of source; it is hashed together with the url in
/// `url_to_local_dir`.
///
/// Matches <https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/crates/cargo-util-schemas/src/core/source_kind.rs#L5>
type SourceKind = u64;
/// Kind used for every url whose scheme does not start with `sparse+http`.
const SOURCE_KIND_REGISTRY: SourceKind = 2;
/// Kind used when the url scheme starts with `sparse+http`.
// NOTE(review): `SPASE` looks like a typo for `SPARSE`; renaming it also
// requires updating the use inside `url_to_local_dir`.
const SOURCE_KIND_SPASE_REGISTRY: SourceKind = 3;
86
/// Determine the crate registry hashing strategy for locating local crate indexes.
///
/// This is a plain, field-less selector, so it is cheap to copy and its values
/// can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HashKind {
    /// Use the new hashing behavior introduced in Rust `1.85.0`.
    Stable,

    /// Use a hashing strategy that matches Cargo versions less than `1.85.0`.
    Legacy,
}
96
// For now, this acts as a centralized place to change the default. Ideally
// this would be compiled conditionally based on the version of rustc as
// a nice approximation of when consumers will be using the associated hash
// implementation but this behavior is not yet stable: https://github.com/rust-lang/rust/issues/64796
/// Hash kind used by `local_path_and_canonical_url`, which does not let the
/// caller choose one.
pub(crate) const DEFAULT_HASHER_KIND: HashKind = HashKind::Legacy;
102
103/// Converts a full url, eg https://github.com/rust-lang/crates.io-index, into
104/// the root directory name where cargo itself will fetch it on disk
105fn url_to_local_dir(url: &str, hash_kind: &HashKind) -> Result<(String, String), Error> {
106    #[allow(deprecated)]
107    fn legacy_hash_u64(url: &str, registry_kind: u64) -> u64 {
108        use std::hash::{Hash, Hasher, SipHasher};
109
110        let mut hasher = SipHasher::new_with_keys(0, 0);
111        // Registry
112        registry_kind.hash(&mut hasher);
113        // Url
114        url.hash(&mut hasher);
115        hasher.finish()
116    }
117
118    // Matches https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/src/cargo/util/hasher.rs#L6
119    fn stable_hash_u64(url: &str, registry_kind: u64) -> u64 {
120        use rustc_stable_hash::StableSipHasher128 as StableHasher;
121        use std::hash::{Hash, Hasher};
122
123        let mut hasher = StableHasher::new();
124
125        // Type has an impact in the `rustc_stable_hasher`.
126        (registry_kind as isize).hash(&mut hasher);
127
128        url.hash(&mut hasher);
129
130        Hasher::finish(&hasher)
131    }
132
133    fn has_path_past_base(url: &str) -> bool {
134        if let Some(protocol_end) = url.find("://") {
135            // skip past protocol
136            let base_url_end = protocol_end + 3;
137            let rest_of_url = &url[base_url_end..];
138
139            // Check if there's any path or meaningful content after the domain (ignoring any trailing slashes)
140            return rest_of_url.trim_end_matches('/').contains('/');
141        }
142        false
143    }
144
145    // Matches https://github.com/rust-lang/cargo/blob/2928e32734b04925ee51e1ae88bea9a83d2fd451/src/cargo/util/hex.rs#L6
146    fn to_hex(num: u64) -> String {
147        hex::encode(num.to_le_bytes())
148    }
149
150    let hash_u64 = match hash_kind {
151        HashKind::Stable => stable_hash_u64,
152        HashKind::Legacy => legacy_hash_u64,
153    };
154
155    let mut registry_kind = SOURCE_KIND_REGISTRY;
156
157    // Ensure we have a registry or bare url
158    let (mut url, scheme_ind) = {
159        let scheme_ind = url
160            .find("://")
161            .ok_or_else(|| Error::Url(format!("'{url}' is not a valid url")))?;
162        let scheme_str = &url[..scheme_ind];
163        if scheme_str.starts_with("sparse+http") {
164            registry_kind = SOURCE_KIND_SPASE_REGISTRY;
165            (url, scheme_ind)
166        } else if let Some(ind) = scheme_str.find('+') {
167            if &scheme_str[..ind] != "registry" {
168                return Err(Error::Url(format!("'{url}' is not a valid registry url")));
169            }
170
171            (&url[ind + 1..], scheme_ind - ind - 1)
172        } else {
173            (url, scheme_ind)
174        }
175    };
176
177    // Could use the Url crate for this, but it's simple enough and we don't
178    // need to deal with every possible url (I hope...)
179    let host = match url[scheme_ind + 3..].find('/') {
180        Some(end) => &url[scheme_ind + 3..scheme_ind + 3 + end],
181        None => &url[scheme_ind + 3..],
182    };
183
184    // if a custom url ends with a slash it messes up the
185    // hash.  But if we remove it from just a base url such as
186    // https://index.crates.io/ it messes it up
187    // as well. So we strip if it has a path
188    // past the base url
189    if has_path_past_base(url) {
190        if let Some(stripped_url) = url.strip_suffix('/') {
191            url = stripped_url;
192        }
193    }
194
195    // trim port
196    let host = host.split(':').next().unwrap();
197
198    let (ident, url) = if registry_kind == SOURCE_KIND_REGISTRY {
199        // cargo special cases github.com for reasons, so do the same
200        let mut canonical = if host == "github.com" {
201            url.to_lowercase()
202        } else {
203            url.to_owned()
204        };
205
206        let ident = match hash_kind {
207            HashKind::Stable => {
208                // Locate the the first instance of params/fragments.
209                let mut params_index = {
210                    let question = canonical.find('?');
211                    let hash = canonical.rfind('#');
212
213                    question.zip(hash).map(|(q, h)| q.min(h)).or(question).or(hash)
214                };
215
216                // Attempt to trim `.git` from the end of url paths.
217                canonical = if let Some(idx) = params_index {
218                    let base_url = &canonical[..idx];
219                    let params = &canonical[idx..];
220
221                    if let Some(sanitized) = base_url.strip_suffix(".git") {
222                        params_index = Some(idx - 4);
223                        format!("{}{}", sanitized, params)
224                    } else {
225                        canonical
226                    }
227                } else {
228                    if canonical.ends_with(".git") {
229                        canonical.truncate(canonical.len() - 4);
230                    }
231                    canonical
232                };
233
234                let ident = to_hex(hash_u64(&canonical, registry_kind));
235
236                // Strip params
237                if let Some(idx) = params_index {
238                    canonical.truncate(canonical.len() - (canonical.len() - idx));
239                }
240
241                ident
242            }
243            HashKind::Legacy => {
244                // Chop off any query params/fragments
245                if let Some(hash) = canonical.rfind('#') {
246                    canonical.truncate(hash);
247                }
248
249                if let Some(query) = canonical.rfind('?') {
250                    canonical.truncate(query);
251                }
252
253                if canonical.ends_with('/') {
254                    canonical.pop();
255                }
256
257                let ident = to_hex(hash_u64(&canonical, registry_kind));
258
259                // Only GitHub (crates.io) repositories have their .git suffix truncated
260                if canonical.contains("github.com/") && canonical.ends_with(".git") {
261                    canonical.truncate(canonical.len() - 4);
262                }
263
264                ident
265            }
266        };
267
268        (ident, canonical)
269    } else {
270        (to_hex(hash_u64(url, registry_kind)), url.to_owned())
271    };
272    Ok((format!("{host}-{ident}"), url))
273}
274
#[cfg(test)]
mod test {
    use super::{url_to_local_dir, HashKind};

    /// Builds the owned `(directory name, canonical url)` pair the assertions compare against.
    fn expected(dir_name: &str, canonical_url: &str) -> (String, String) {
        (dir_name.to_owned(), canonical_url.to_owned())
    }

    #[test]
    fn http_index_url_matches_cargo() {
        use crate::sparse::URL;

        let legacy = url_to_local_dir(URL, &HashKind::Legacy).unwrap();
        assert_eq!(legacy, expected("index.crates.io-6f17d22bba15001f", URL));

        let stable = url_to_local_dir(URL, &HashKind::Stable).unwrap();
        assert_eq!(stable, expected("index.crates.io-1949cf8c6b5b557f", URL));

        // I've confirmed this also works with a custom registry, unfortunately
        // that one includes a secret key as part of the url which would allow
        // anyone to publish to the registry, so uhh...here's a fake one instead
        let custom = "https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git";

        // The legacy strategy keeps the `.git` suffix for non-GitHub hosts ...
        let legacy = url_to_local_dir(custom, &HashKind::Legacy).unwrap();
        assert_eq!(
            legacy,
            expected(
                "dl.cloudsmith.io-ff79e51ddd2b38fd",
                "https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index.git"
            )
        );

        // ... while the stable strategy drops it.
        let stable = url_to_local_dir(custom, &HashKind::Stable).unwrap();
        assert_eq!(
            stable,
            expected(
                "dl.cloudsmith.io-5e6de3fada793d05",
                "https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index"
            )
        );
    }

    #[test]
    fn http_index_url_matches_index_slash() {
        // A trailing slash after a path must not influence the hash.
        let stable = url_to_local_dir(
            "https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index/",
            &HashKind::Stable,
        )
        .unwrap();
        assert_eq!(
            stable,
            expected(
                "dl.cloudsmith.io-5e6de3fada793d05",
                "https://dl.cloudsmith.io/aBcW1234aBcW1234/embark/rust/cargo/index"
            )
        );
    }

    #[test]
    #[cfg(feature = "git")]
    fn git_url_matches_cargo() {
        use crate::git::URL;

        let legacy = url_to_local_dir(URL, &HashKind::Legacy).unwrap();
        assert_eq!(legacy, expected("github.com-1ecc6299db9ec823", URL));

        let stable = url_to_local_dir(URL, &HashKind::Stable).unwrap();
        assert_eq!(stable, expected("github.com-25cdd57fae9f0462", URL));

        // Ensure we actually strip off the irrelevant parts of a url, note that
        // the .git suffix is not part of the canonical url, but *is* used when hashing
        let noisy = format!("registry+{}.git?one=1&two=2#fragment", URL);

        let legacy = url_to_local_dir(&noisy, &HashKind::Legacy).unwrap();
        assert_eq!(legacy, expected("github.com-c786010fb7ef2e6e", URL));

        let stable = url_to_local_dir(&noisy, &HashKind::Stable).unwrap();
        assert_eq!(stable, expected("github.com-e78ed0bbfe5f35d7", URL));
    }
}