1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
use env_logger;
use log::*;
use std::fs;
use std::fs::File;
use std::io;
use std::path::PathBuf;
use unzip::Unzipper;
use walkdir::WalkDir;
pub struct DataSource<'a> {
pub name: &'a str,
pub url: &'a str,
}
#[derive(Debug)]
pub struct SampleData {
pub root: PathBuf,
pub num_files: u64,
pub size: u64,
pub archive: PathBuf,
pub files: Vec<PathBuf>,
}
impl SampleData {
pub fn remove(&self) -> io::Result<()> {
fs::remove_dir_all(&self.root)?;
fs::remove_file(&self.archive)
}
}
const KERNEL: DataSource = DataSource {
name: "Linux_Kernel",
url: "https://github.com/torvalds/linux/archive/v5.9.zip",
};
const CARGO: DataSource = DataSource {
name: "Cargo_sources",
url: "https://github.com/rust-lang/cargo/archive/0.47.0.zip",
};
fn download_and_unpack(ds: DataSource) -> Result<SampleData, String> {
std::fs::create_dir_all(ds.name).map_err(|e| e.to_string())?;
let archive = format!("{}.zip", ds.name);
let mut num_files = 0;
let mut size = 0;
let mut files = vec![];
if !std::path::Path::new(&archive).is_file() {
debug!("Downloading {:?}", ds.url);
let mut resp = reqwest::blocking::get(ds.url).map_err(|e| format!("{:?}", e))?;
let mut out = File::create(&archive).map_err(|e| format!("{:?}", e))?;
std::io::copy(&mut resp, &mut out).map_err(|e| format!("{:?}", e))?;
} else {
info!("Did not download, archive already present");
}
info!("Unzipping...");
Unzipper::new(File::open(&archive).unwrap(), ds.name)
.unzip()
.map_err(|e| format!("{:?}", e))?;
debug!("Sample data ready. Gathering stats...");
for entry in WalkDir::new(ds.name).into_iter().filter_map(|e| e.ok()) {
num_files += 1;
if let Ok(meta) = entry.metadata() {
size += meta.len();
}
files.push(entry.path().to_path_buf());
}
data_path(ds.name)
.ok_or("Could not get data dir".to_string())
.map(|x| SampleData {
root: x,
archive: std::path::PathBuf::from(&archive),
num_files,
size,
files,
})
}
fn setup() {
std::env::set_var("RUST_LOG", "INFO");
let _ = env_logger::builder().try_init();
}
fn data_path(data_dir: &str) -> Option<PathBuf> {
PathBuf::from(file!())
.parent()
.map(|p| p.parent())
.flatten()
.map(|p| p.join(data_dir))
}
pub fn linux_kernel() -> Result<SampleData, String> {
setup();
download_and_unpack(KERNEL)
}
pub fn cargo_sources() -> Result<SampleData, String> {
setup();
download_and_unpack(CARGO)
}
pub fn from_url(zipfile_url: &str) -> Result<SampleData, String> {
setup();
download_and_unpack(DataSource {
url: zipfile_url,
name: "Custom url",
})
}
#[test]
fn test_kernel() {
std::env::set_var("RUST_LOG", "INFO");
let _ = env_logger::builder().try_init();
info!("{:?}", cargo_sources());
}