1use flate2::read::GzDecoder;
2use std::{
3 collections::HashMap,
4 fs::{create_dir_all, File},
5 io,
6 path::{Path, PathBuf},
7};
8use thiserror::Error;
9
10use cached_path::{Cache, CacheBuilder, Error as CachedError};
11use rusqlite::{Connection, Error as SqliteError};
12
13pub use cached_path;
14pub use rusqlite;
15
16#[derive(Error, Debug)]
17pub enum Error {
18 #[error("dump not found")]
19 NotFound(#[from] CachedError),
20
21 #[error("failed to load db")]
22 RusqliteError(#[from] SqliteError),
23
24 #[error("failed to unpack dump")]
25 IOError(#[from] io::Error),
26}
27
28pub struct CratesIODumpLoader {
29 pub resource: String,
30 pub files: Vec<PathBuf>,
31 pub cache: Cache,
32 pub target_path: PathBuf,
33 pub preload: bool,
34
35 table_schema: HashMap<String, String>,
36}
37
38impl Default for CratesIODumpLoader {
39 fn default() -> Self {
40 Self {
41 resource: "https://static.crates.io/db-dump.tar.gz".to_string(),
42 files: tables_to_files(&[
43 "badges",
44 "categories",
45 "crate_owners",
46 "crates",
47 "crates_categories",
48 "crates_keywords",
49 "dependencies",
50 "keywords",
51 "metadata",
52 "reserved_crate_names",
53 "teams",
54 "users",
55 "version_authors",
56 "version_downloads",
57 "versions",
58 ]),
59 cache: Cache::new().unwrap(), target_path: Path::new("data").to_path_buf(),
61 table_schema: HashMap::new(),
62 preload: false,
63 }
64 }
65}
66
67impl CratesIODumpLoader {
68 pub fn resource(&mut self, path: &str) -> &mut Self {
69 self.resource = path.to_owned();
70 self
71 }
72
73 pub fn files(&mut self, files: Vec<PathBuf>) -> &mut Self {
74 self.files = files;
75 self
76 }
77
78 pub fn tables(&mut self, tables: &[&str]) -> &mut Self {
79 self.files = tables_to_files(tables);
80 self
81 }
82
83 pub fn table_schema(&mut self, table: &str, schema: &str) -> &mut Self {
84 self.table_schema
85 .insert(table.to_string(), schema.to_string());
86 self
87 }
88
89 pub fn target_path(&mut self, path: &Path) -> &mut Self {
90 self.target_path = path.to_path_buf();
91 self
92 }
93
94 pub fn cache(&mut self, builder: CacheBuilder) -> Result<&mut Self, Error> {
95 self.cache = builder.build()?;
96 Ok(self)
97 }
98
99 pub fn preload(&mut self, should: bool) -> &mut Self {
100 self.preload = should;
101 self
102 }
103
104 pub fn minimal(&mut self) -> &mut Self {
105 self.tables(&["crates", "dependencies", "versions"])
106 }
107
108 pub fn update(&mut self) -> Result<&mut Self, Error> {
109 let path = self.cache.cached_path(&self.resource)?;
110
111 let first_local_file = self.target_path.join(self.files.first().unwrap());
112 if first_local_file.exists()
113 && path.metadata()?.created()? <= first_local_file.metadata()?.created()?
114 {
115 return Ok(self);
117 }
118
119 let tar_gz = File::open(path)?;
121 let tar = GzDecoder::new(tar_gz);
122 let mut archive = tar::Archive::new(tar);
123
124 create_dir_all(&self.target_path)?;
125 for file in archive.entries().unwrap() {
126 let mut f = file.unwrap();
127 let aname = match f.path().unwrap_or_default().file_name() {
128 Some(p) => PathBuf::from(p),
129 None => PathBuf::default(),
130 };
131 if self.files.contains(&aname) {
132 f.unpack(self.target_path.join(aname))?;
133 }
134 }
135 Ok(self)
136 }
137
138 pub fn sqlite_path(&self) -> PathBuf {
139 self.target_path.join(Path::new("db.sqlite"))
140 }
141
142 pub fn open_db(&mut self) -> Result<Connection, Error> {
143 let path = self.sqlite_path();
144
145 let mut should_load = false;
146 let first_local_file = self.target_path.join(self.files.first().unwrap());
147 if !path.exists() {
148 should_load = true;
149 } else if !first_local_file.exists()
150 && path.exists()
151 && path.metadata()?.created()? <= first_local_file.metadata()?.created()?
152 {
153 should_load = true;
154 std::fs::remove_file(&path)?;
155 }
156
157 let db = Connection::open(&path)?;
158 rusqlite::vtab::csvtab::load_module(&db)?;
159
160 if should_load {
161 self.load_dump_into(&db)?;
162 }
163 Ok(db)
164 }
165
166 pub fn load_dump_into(&mut self, db: &Connection) -> Result<(), Error> {
167 let schema = self
168 .files
169 .iter()
170 .map(|f| self.file_to_query(f))
171 .fold(String::new(), |a, b| a + b.as_str() + "\n");
172 db.execute_batch(schema.as_str())?;
173 Ok(())
174 }
175
176 fn file_to_query(&self, path: &PathBuf) -> String {
177 let actual_file = self.target_path.join(path);
178 let table = path.file_stem().unwrap_or_default().to_string_lossy();
179 let vtable = match self.preload {
180 true => format!("temp_{}", table),
181 false => table.to_string(),
182 };
183
184 let vtab = match self.table_schema.get(&table.to_string()) {
185 Some(schema) => format!(
186 r#"
187 DROP TABLE IF EXISTS {0};
188 CREATE VIRTUAL TABLE {0} USING csv(filename='{1}',header=yes,schema='{2}');
189 "#,
190 vtable,
191 actual_file.display(),
192 schema,
193 ),
194 None => format!(
195 r#"
196 DROP TABLE IF EXISTS {0};
197 CREATE VIRTUAL TABLE {0} USING csv(filename='{1}',header=yes);
198 "#,
199 vtable,
200 actual_file.display(),
201 ),
202 };
203
204 if self.preload {
205 let ptab = format!(
206 r#"
207 DROP TABLE IF EXISTS {0};
208 CREATE TABLE {0} AS SELECT * FROM {1};
209 DROP TABLE {1};
210 "#,
211 table, vtable,
212 );
213
214 return format!("{}\n{}", vtab, ptab);
215 }
216
217 vtab
218 }
219}
220
/// Converts table names into the CSV file names used for them
/// (`<table>.csv`).
///
/// The extension is set rather than appended, so a name that already has an
/// extension gets it replaced (`foo.bar` becomes `foo.csv`).
fn tables_to_files(tables: &[&str]) -> Vec<PathBuf> {
    let mut files = Vec::with_capacity(tables.len());
    for &table in tables {
        files.push(PathBuf::from(table).with_extension("csv"));
    }
    files
}
232
/// Extracts the test archive and loads it into an in-memory database that
/// already has the csv module registered, then queries the preloaded table.
// NOTE(review): this test and `test_basic_csvtab_open` both extract into
// `testdata/extracted`; confirm they cannot interfere when run in parallel.
#[test]
fn test_basic_csvtab() -> Result<(), Error> {
    let cache_builder = Cache::builder().progress_bar(None);

    let db = Connection::open_in_memory().unwrap();
    rusqlite::vtab::csvtab::load_module(&db).unwrap();

    let mut loader = CratesIODumpLoader::default();
    loader
        .preload(true)
        .resource("testdata/test.tar.gz")
        .target_path(Path::new("testdata/extracted"))
        .tables(&["test"])
        .table_schema("test", "CREATE TABLE x(renamed_id INT, name TEXT);")
        .cache(cache_builder)?
        .update()?
        .load_dump_into(&db)?;

    // The custom schema renames the first column, so it is queried by its
    // new name.
    let mut stmt = db.prepare("SELECT renamed_id FROM test WHERE name = ?")?;
    let renamed_id = stmt.query_row(["awooo"], |row| row.get::<_, i64>(0))?;
    assert_eq!(3, renamed_id);
    Ok(())
}
258
/// Same scenario as the in-memory test, but lets the loader open (and, when
/// needed, populate) the on-disk database through `open_db`.
#[test]
fn test_basic_csvtab_open() -> Result<(), Error> {
    let cache_builder = Cache::builder().progress_bar(None);

    let mut loader = CratesIODumpLoader::default();
    let db = loader
        .preload(true)
        .resource("testdata/test.tar.gz")
        .target_path(Path::new("testdata/extracted"))
        .tables(&["test"])
        .table_schema("test", "CREATE TABLE x(renamed_id INT, name TEXT);")
        .cache(cache_builder)?
        .update()?
        .open_db()?;

    // The custom schema renames the first column, so it is queried by its
    // new name.
    let mut stmt = db.prepare("SELECT renamed_id FROM test WHERE name = ?")?;
    let renamed_id = stmt.query_row(["awooo"], |row| row.get::<_, i64>(0))?;
    assert_eq!(3, renamed_id);
    Ok(())
}
280
281