forest/utils/net/download_file.rs

// Copyright 2019-2025 ChainSafe Systems
// SPDX-License-Identifier: Apache-2.0, MIT

use crate::utils::{RetryArgs, net::global_http_client, retry};
use anyhow::Context as _;
use backon::{ExponentialBuilder, Retryable as _};
use base64::{Engine, prelude::BASE64_STANDARD};
use md5::{Digest as _, Md5};
use std::sync::Arc;
use std::{
    ffi::OsStr,
    path::{Path, PathBuf},
    time::Duration,
};
use url::Url;

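/// Controls whether a download may be resumed after an interruption; passed
/// through to [`crate::utils::net::reader`] by [`download_http`].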
#[derive(Debug, Copy, Clone)]
pub enum DownloadFileOption {
    NonResumable,
    Resumable,
}

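/// Outcome of [`download_file_with_cache`]: the local path of the file and
/// whether the cached copy was reused instead of re-downloaded.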
#[derive(Debug, Clone)]
pub struct DownloadFileResult {
    pub path: PathBuf,
    #[allow(dead_code)]
    pub cache_hit: bool,
}

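/// Downloads the file at `url` into `cache_dir`, mirroring the URL path, and
/// reuses the cached copy when its MD5 hash matches the hash advertised by the
/// remote (see [`get_content_md5_hash_from_url`]). On a missing cache file or
/// a hash mismatch, the file is (re-)downloaded via [`download_file_with_retry`].
///
/// Illustrative sketch of the intended usage; the URL mirrors the unit tests
/// below, while the cache directory is a placeholder:
///
/// ```ignore
/// let url = Url::parse("https://filecoin-actors.chainsafe.dev/v15.0.0/builtin-actors-mainnet.car")?;
/// let result =
///     download_file_with_cache(&url, Path::new("/tmp/forest-cache"), DownloadFileOption::NonResumable)
///         .await?;
/// assert!(result.path.ends_with("v15.0.0/builtin-actors-mainnet.car"));
/// ```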
pub async fn download_file_with_cache(
    url: &Url,
    cache_dir: &Path,
    option: DownloadFileOption,
) -> anyhow::Result<DownloadFileResult> {
    let cache_file_path =
        cache_dir.join(url.path().strip_prefix('/').unwrap_or_else(|| url.path()));
    if let Some(cache_file_dir) = cache_file_path.parent()
        && !cache_file_dir.is_dir()
    {
        std::fs::create_dir_all(cache_file_dir)?;
    }

    let cache_hit = match get_file_md5_hash(&cache_file_path) {
        Some(file_md5) => match get_content_md5_hash_from_url(url.clone()).await? {
            Some(url_md5) => {
                if file_md5 == url_md5 {
                    true
                } else {
                    tracing::warn!(
                        "download again due to md5 hash mismatch, url: {url}, local cache: {}, remote: {}",
                        hex::encode(&file_md5),
                        hex::encode(&url_md5)
                    );
                    false
                }
            }
            None => {
                anyhow::bail!("failed to extract md5 content hash from remote url {url}");
            }
        },
        None => false,
    };

    if cache_hit {
        tracing::debug!(%url, "loaded from cache");
    } else {
        download_file_with_retry(
            url,
            cache_file_path.parent().unwrap_or_else(|| Path::new(".")),
            cache_file_path
                .file_name()
                .and_then(OsStr::to_str)
                .with_context(|| {
                    format!(
                        "Error getting the file name of {}",
                        cache_file_path.display()
                    )
                })?,
            option,
            None,
        )
        .await?;
    }

    Ok(DownloadFileResult {
        path: cache_file_path,
        cache_hit,
    })
}

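/// Computes the MD5 hash of the file at `path`, or returns `None` if the file
/// cannot be read (e.g. it does not exist yet).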
fn get_file_md5_hash(path: &Path) -> Option<Vec<u8>> {
    std::fs::read(path).ok().map(|bytes| {
        let mut hasher = Md5::new();
        hasher.update(bytes.as_slice());
        hasher.finalize().to_vec()
    })
}

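/// Fetches the MD5 hash the remote advertises for `url` via a `HEAD` request,
/// retrying with exponential backoff. GitHub release assets report it in the
/// `x-ms-blob-content-md5` header; for the hosts listed in
/// `HOSTS_WITH_MD5_ETAG`, the `ETag` value is interpreted as a hex-encoded MD5.
/// Any other host is rejected with an error.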
async fn get_content_md5_hash_from_url(url: Url) -> anyhow::Result<Option<Vec<u8>>> {
    const TIMEOUT: Duration = Duration::from_secs(5);
    let response = (|| {
        global_http_client()
            .head(url.clone())
            .timeout(TIMEOUT)
            .send()
    })
    .retry(ExponentialBuilder::default())
    .await?;
    let headers = response.headers();
    // GitHub release assets expose a base64-encoded MD5 in this header
    if let Some(ms_blob_md5) = headers.get("x-ms-blob-content-md5") {
        return Ok(Some(BASE64_STANDARD.decode(ms_blob_md5)?));
    }

    static HOSTS_WITH_MD5_ETAG: [&str; 2] =
        ["filecoin-actors.chainsafe.dev", ".digitaloceanspaces.com"];
    if url
        .host_str()
        .map(|h| HOSTS_WITH_MD5_ETAG.iter().any(|h_part| h.contains(h_part)))
        .unwrap_or_default()
    {
        let md5 = headers
            .get("etag")
            .and_then(|v| v.to_str().ok().map(|v| hex::decode(v.replace('"', ""))))
            .transpose()?;
        Ok(md5)
    } else {
        anyhow::bail!(
            "unsupported host, register in HOSTS_WITH_MD5_ETAG if it's known to use md5 as etag algorithm. url: {url}"
        )
    }
}

/// Download the file at `url` with a private HTTP client, returning the path to the downloaded file
pub async fn download_http(
    url: &Url,
    directory: &Path,
    filename: &str,
    option: DownloadFileOption,
    callback: Option<Arc<dyn Fn(String) + Sync + Send>>,
) -> anyhow::Result<PathBuf> {
    if !directory.is_dir() {
        std::fs::create_dir_all(directory)?;
    }
    let dst_path = directory.join(filename);
    let destination = dst_path.display();
    tracing::info!(%url, %destination, "downloading snapshot");
    let mut reader = crate::utils::net::reader(url.as_str(), option, callback).await?;
    let tmp_dst_path = {
        // like `crdownload` for the Chrome browser
        const DOWNLOAD_EXTENSION: &str = "frdownload";
        let mut path = dst_path.clone();
        if let Some(ext) = path.extension() {
            path.set_extension(format!(
                "{}.{DOWNLOAD_EXTENSION}",
                ext.to_str().unwrap_or_default()
            ));
        } else {
            path.set_extension(DOWNLOAD_EXTENSION);
        }
        path
    };
    let mut tempfile = tokio::fs::File::create(&tmp_dst_path)
        .await
        .context("couldn't create destination file")?;
    tokio::io::copy(&mut reader, &mut tempfile)
        .await
        .context("couldn't download file")?;
    std::fs::rename(&tmp_dst_path, &dst_path).context("couldn't rename file")?;

    Ok(dst_path)
}

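/// Wrapper around [`download_http`] that retries the whole download using the
/// default [`RetryArgs`] policy with no overall timeout.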
pub async fn download_file_with_retry(
    url: &Url,
    directory: &Path,
    filename: &str,
    option: DownloadFileOption,
    callback: Option<Arc<dyn Fn(String) + Sync + Send>>,
) -> anyhow::Result<PathBuf> {
    Ok(retry(
        RetryArgs {
            timeout: None,
            ..Default::default()
        },
        || download_http(url, directory, filename, option, callback.clone()),
    )
    .await?)
}

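/// Downloads the file at `url` to the exact `destination` path, creating the
/// parent directory if needed and retrying on failure via
/// [`download_file_with_retry`].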
pub async fn download_to(
    url: &Url,
    destination: &Path,
    option: DownloadFileOption,
    callback: Option<Arc<dyn Fn(String) + Sync + Send>>,
) -> anyhow::Result<()> {
    download_file_with_retry(
        url,
        destination.parent().with_context(|| {
            format!(
                "Error getting the parent directory of {}",
                destination.display()
            )
        })?,
        destination
            .file_name()
            .and_then(OsStr::to_str)
            .with_context(|| format!("Error getting the file name of {}", destination.display()))?,
        option,
        callback,
    )
    .await?;

    Ok(())
}

#[cfg(test)]
mod test {
    use super::*;

    #[tokio::test]
    async fn test_get_content_md5_hash_from_url_1() {
        let url = "https://filecoin-actors.chainsafe.dev/v15.0.0/builtin-actors-mainnet.car";
        let md5 = get_content_md5_hash_from_url(url.try_into().unwrap())
            .await
            .unwrap()
            .map(hex::encode);
        assert_eq!(md5, Some("676b41e3dd1dc94430084bde84849701".into()))
    }

    #[tokio::test]
    async fn test_get_content_md5_hash_from_url_2() {
        let url = "https://github.com/filecoin-project/builtin-actors/releases/download/v15.0.0/builtin-actors-mainnet.car";
        let md5 = get_content_md5_hash_from_url(url.try_into().unwrap())
            .await
            .unwrap()
            .map(hex::encode);
        assert_eq!(md5, Some("676b41e3dd1dc94430084bde84849701".into()))
    }

    #[tokio::test]
    async fn test_download_file_with_cache() {
        let temp_dir = tempfile::tempdir().unwrap();
        let url = "https://forest-snapshots.fra1.cdn.digitaloceanspaces.com/genesis/butterflynet-bafy2bzacecm7xklkq3hkc2kgm5wnb5shlxmffino6lzhh7lte5acytb7sssr4.car.zst".try_into().unwrap();
        let result =
            download_file_with_cache(&url, temp_dir.path(), DownloadFileOption::NonResumable)
                .await
                .unwrap();
        assert!(!result.cache_hit);
        let result =
            download_file_with_cache(&url, temp_dir.path(), DownloadFileOption::NonResumable)
                .await
                .unwrap();
        assert!(result.cache_hit);
    }
}