forest/utils/net/
download_file.rs1use crate::utils::{RetryArgs, net::global_http_client, retry};
5use anyhow::Context as _;
6use backon::{ExponentialBuilder, Retryable as _};
7use base64::{Engine, prelude::BASE64_STANDARD};
8use md5::{Digest as _, Md5};
9use std::sync::Arc;
10use std::{
11 ffi::OsStr,
12 path::{Path, PathBuf},
13 time::Duration,
14};
15use url::Url;
16
/// Selects how a download is performed.
///
/// The value is handed unchanged to `crate::utils::net::reader` by
/// [`download_http`]; the exact behaviour of each mode is defined there.
#[derive(Clone, Copy, Debug)]
pub enum DownloadFileOption {
    /// Request a non-resumable download.
    NonResumable,
    /// Request a resumable download.
    Resumable,
}
22
/// Outcome of [`download_file_with_cache`].
#[derive(Clone, Debug)]
pub struct DownloadFileResult {
    /// Location of the file inside the cache directory.
    pub path: PathBuf,
    /// `true` when the existing cached file was reused and nothing was
    /// downloaded.
    // Within this file the field is only read by the unit tests, hence the
    // lint exemption.
    #[allow(dead_code)]
    pub cache_hit: bool,
}
29
30pub async fn download_file_with_cache(
31 url: &Url,
32 cache_dir: &Path,
33 option: DownloadFileOption,
34) -> anyhow::Result<DownloadFileResult> {
35 let cache_file_path =
36 cache_dir.join(url.path().strip_prefix('/').unwrap_or_else(|| url.path()));
37 if let Some(cache_file_dir) = cache_file_path.parent()
38 && !cache_file_dir.is_dir()
39 {
40 std::fs::create_dir_all(cache_file_dir)?;
41 }
42
43 let cache_hit = match get_file_md5_hash(&cache_file_path) {
44 Some(file_md5) => match get_content_md5_hash_from_url(url.clone()).await? {
45 Some(url_md5) => {
46 if file_md5 == url_md5 {
47 true
48 } else {
49 tracing::warn!(
50 "download again due to md5 hash mismatch, url: {url}, local cache: {}, remote: {}",
51 hex::encode(&file_md5),
52 hex::encode(&url_md5)
53 );
54 false
55 }
56 }
57 None => {
58 anyhow::bail!("failed to extract md5 content hash from remote url {url}");
59 }
60 },
61 None => false,
62 };
63
64 if cache_hit {
65 tracing::debug!(%url, "loaded from cache");
66 } else {
67 download_file_with_retry(
68 url,
69 cache_file_path.parent().unwrap_or_else(|| Path::new(".")),
70 cache_file_path
71 .file_name()
72 .and_then(OsStr::to_str)
73 .with_context(|| {
74 format!(
75 "Error getting the file name of {}",
76 cache_file_path.display()
77 )
78 })?,
79 option,
80 None,
81 )
82 .await?;
83 }
84
85 Ok(DownloadFileResult {
86 path: cache_file_path,
87 cache_hit,
88 })
89}
90
91fn get_file_md5_hash(path: &Path) -> Option<Vec<u8>> {
92 std::fs::read(path).ok().map(|bytes| {
93 let mut hasher = Md5::new();
94 hasher.update(bytes.as_slice());
95 hasher.finalize().to_vec()
96 })
97}
98
99async fn get_content_md5_hash_from_url(url: Url) -> anyhow::Result<Option<Vec<u8>>> {
100 const TIMEOUT: Duration = Duration::from_secs(5);
101 let response = (|| {
102 global_http_client()
103 .head(url.clone())
104 .timeout(TIMEOUT)
105 .send()
106 })
107 .retry(ExponentialBuilder::default())
108 .await?;
109 let headers = response.headers();
110 if let Some(ms_blob_md5) = headers.get("x-ms-blob-content-md5") {
112 return Ok(Some(BASE64_STANDARD.decode(ms_blob_md5)?));
113 }
114
115 static HOSTS_WITH_MD5_ETAG: [&str; 2] =
116 ["filecoin-actors.chainsafe.dev", ".digitaloceanspaces.com"];
117 if url
118 .host_str()
119 .map(|h| HOSTS_WITH_MD5_ETAG.iter().any(|h_part| h.contains(h_part)))
120 .unwrap_or_default()
121 {
122 let md5 = headers
123 .get("etag")
124 .and_then(|v| v.to_str().ok().map(|v| hex::decode(v.replace('"', ""))))
125 .transpose()?;
126 Ok(md5)
127 } else {
128 anyhow::bail!(
129 "unsupported host, register in HOSTS_WITH_MD5_ETAG if it's known to use md5 as etag algorithm. url: {url}"
130 )
131 }
132}
133
134pub async fn download_http(
136 url: &Url,
137 directory: &Path,
138 filename: &str,
139 option: DownloadFileOption,
140 callback: Option<Arc<dyn Fn(String) + Sync + Send>>,
141) -> anyhow::Result<PathBuf> {
142 if !directory.is_dir() {
143 std::fs::create_dir_all(directory)?;
144 }
145 let dst_path = directory.join(filename);
146 let destination = dst_path.display();
147 tracing::info!(%url, %destination, "downloading snapshot");
148 let mut reader = crate::utils::net::reader(url.as_str(), option, callback).await?;
149 let tmp_dst_path = {
150 const DOWNLOAD_EXTENSION: &str = "frdownload";
152 let mut path = dst_path.clone();
153 if let Some(ext) = path.extension() {
154 path.set_extension(format!(
155 "{}.{DOWNLOAD_EXTENSION}",
156 ext.to_str().unwrap_or_default()
157 ));
158 } else {
159 path.set_extension(DOWNLOAD_EXTENSION);
160 }
161 path
162 };
163 let mut tempfile = tokio::fs::File::create(&tmp_dst_path)
164 .await
165 .context("couldn't create destination file")?;
166 tokio::io::copy(&mut reader, &mut tempfile)
167 .await
168 .context("couldn't download file")?;
169 std::fs::rename(&tmp_dst_path, &dst_path).context("couldn't rename file")?;
170
171 Ok(dst_path)
172}
173
174pub async fn download_file_with_retry(
175 url: &Url,
176 directory: &Path,
177 filename: &str,
178 option: DownloadFileOption,
179 callback: Option<Arc<dyn Fn(String) + Sync + Send>>,
180) -> anyhow::Result<PathBuf> {
181 Ok(retry(
182 RetryArgs {
183 timeout: None,
184 ..Default::default()
185 },
186 || download_http(url, directory, filename, option, callback.clone()),
187 )
188 .await?)
189}
190
191pub async fn download_to(
192 url: &Url,
193 destination: &Path,
194 option: DownloadFileOption,
195 callback: Option<Arc<dyn Fn(String) + Sync + Send>>,
196) -> anyhow::Result<()> {
197 download_file_with_retry(
198 url,
199 destination.parent().with_context(|| {
200 format!(
201 "Error getting the parent directory of {}",
202 destination.display()
203 )
204 })?,
205 destination
206 .file_name()
207 .and_then(OsStr::to_str)
208 .with_context(|| format!("Error getting the file name of {}", destination.display()))?,
209 option,
210 callback,
211 )
212 .await?;
213
214 Ok(())
215}
216
#[cfg(test)]
mod test {
    use super::*;

    /// Hex MD5 digest expected for the v15.0.0 mainnet builtin-actors bundle
    /// from both URLs checked below.
    const BUNDLE_MD5_HEX: &str = "676b41e3dd1dc94430084bde84849701";

    /// Queries the remote MD5 digest for `url` and hex-encodes it.
    async fn remote_md5_hex(url: &str) -> Option<String> {
        let url = Url::parse(url).unwrap();
        get_content_md5_hash_from_url(url)
            .await
            .unwrap()
            .map(hex::encode)
    }

    // Host registered in the etag allow-list.
    #[tokio::test]
    async fn test_get_content_md5_hash_from_url_1() {
        let md5 = remote_md5_hex(
            "https://filecoin-actors.chainsafe.dev/v15.0.0/builtin-actors-mainnet.car",
        )
        .await;
        assert_eq!(md5.as_deref(), Some(BUNDLE_MD5_HEX))
    }

    // Host that is not in the etag allow-list.
    #[tokio::test]
    async fn test_get_content_md5_hash_from_url_2() {
        let md5 = remote_md5_hex(
            "https://github.com/filecoin-project/builtin-actors/releases/download/v15.0.0/builtin-actors-mainnet.car",
        )
        .await;
        assert_eq!(md5.as_deref(), Some(BUNDLE_MD5_HEX))
    }

    // The first call must download the file, the second must hit the cache.
    #[tokio::test]
    async fn test_download_file_with_cache() {
        let temp_dir = tempfile::tempdir().unwrap();
        let url = Url::parse("https://forest-snapshots.fra1.cdn.digitaloceanspaces.com/genesis/butterflynet-bafy2bzacecm7xklkq3hkc2kgm5wnb5shlxmffino6lzhh7lte5acytb7sssr4.car.zst").unwrap();
        for expected_cache_hit in [false, true] {
            let result =
                download_file_with_cache(&url, temp_dir.path(), DownloadFileOption::NonResumable)
                    .await
                    .unwrap();
            assert_eq!(result.cache_hit, expected_cache_hit);
        }
    }
}