cratesio_dbdump_csvtab/
lib.rs

1use flate2::read::GzDecoder;
2use std::{
3    collections::HashMap,
4    fs::{create_dir_all, File},
5    io,
6    path::{Path, PathBuf},
7};
8use thiserror::Error;
9
10use cached_path::{Cache, CacheBuilder, Error as CachedError};
11use rusqlite::{Connection, Error as SqliteError};
12
13pub use cached_path;
14pub use rusqlite;
15
16#[derive(Error, Debug)]
17pub enum Error {
18    #[error("dump not found")]
19    NotFound(#[from] CachedError),
20
21    #[error("failed to load db")]
22    RusqliteError(#[from] SqliteError),
23
24    #[error("failed to unpack dump")]
25    IOError(#[from] io::Error),
26}
27
28pub struct CratesIODumpLoader {
29    pub resource: String,
30    pub files: Vec<PathBuf>,
31    pub cache: Cache,
32    pub target_path: PathBuf,
33    pub preload: bool,
34
35    table_schema: HashMap<String, String>,
36}
37
38impl Default for CratesIODumpLoader {
39    fn default() -> Self {
40        Self {
41            resource: "https://static.crates.io/db-dump.tar.gz".to_string(),
42            files: tables_to_files(&[
43                "badges",
44                "categories",
45                "crate_owners",
46                "crates",
47                "crates_categories",
48                "crates_keywords",
49                "dependencies",
50                "keywords",
51                "metadata",
52                "reserved_crate_names",
53                "teams",
54                "users",
55                "version_authors",
56                "version_downloads",
57                "versions",
58            ]),
59            cache: Cache::new().unwrap(), // TODO: Maybe just store the builder instead... idk...
60            target_path: Path::new("data").to_path_buf(),
61            table_schema: HashMap::new(),
62            preload: false,
63        }
64    }
65}
66
67impl CratesIODumpLoader {
68    pub fn resource(&mut self, path: &str) -> &mut Self {
69        self.resource = path.to_owned();
70        self
71    }
72
73    pub fn files(&mut self, files: Vec<PathBuf>) -> &mut Self {
74        self.files = files;
75        self
76    }
77
78    pub fn tables(&mut self, tables: &[&str]) -> &mut Self {
79        self.files = tables_to_files(tables);
80        self
81    }
82
83    pub fn table_schema(&mut self, table: &str, schema: &str) -> &mut Self {
84        self.table_schema
85            .insert(table.to_string(), schema.to_string());
86        self
87    }
88
89    pub fn target_path(&mut self, path: &Path) -> &mut Self {
90        self.target_path = path.to_path_buf();
91        self
92    }
93
94    pub fn cache(&mut self, builder: CacheBuilder) -> Result<&mut Self, Error> {
95        self.cache = builder.build()?;
96        Ok(self)
97    }
98
99    pub fn preload(&mut self, should: bool) -> &mut Self {
100        self.preload = should;
101        self
102    }
103
104    pub fn minimal(&mut self) -> &mut Self {
105        self.tables(&["crates", "dependencies", "versions"])
106    }
107
108    pub fn update(&mut self) -> Result<&mut Self, Error> {
109        let path = self.cache.cached_path(&self.resource)?;
110
111        let first_local_file = self.target_path.join(self.files.first().unwrap());
112        if first_local_file.exists()
113            && path.metadata()?.created()? <= first_local_file.metadata()?.created()?
114        {
115            // TODO: Improve change-detection later, this is just to prevent re-extracting existing obsurdity.
116            return Ok(self);
117        }
118
119        // Extract files manually instead of letting cached_path do it so we don't have to worry about {date} folder.
120        let tar_gz = File::open(path)?;
121        let tar = GzDecoder::new(tar_gz);
122        let mut archive = tar::Archive::new(tar);
123
124        create_dir_all(&self.target_path)?;
125        for file in archive.entries().unwrap() {
126            let mut f = file.unwrap();
127            let aname = match f.path().unwrap_or_default().file_name() {
128                Some(p) => PathBuf::from(p),
129                None => PathBuf::default(),
130            };
131            if self.files.contains(&aname) {
132                f.unpack(self.target_path.join(aname))?;
133            }
134        }
135        Ok(self)
136    }
137
138    pub fn sqlite_path(&self) -> PathBuf {
139        self.target_path.join(Path::new("db.sqlite"))
140    }
141
142    pub fn open_db(&mut self) -> Result<Connection, Error> {
143        let path = self.sqlite_path();
144
145        let mut should_load = false;
146        let first_local_file = self.target_path.join(self.files.first().unwrap());
147        if !path.exists() {
148            should_load = true;
149        } else if !first_local_file.exists()
150            && path.exists()
151            && path.metadata()?.created()? <= first_local_file.metadata()?.created()?
152        {
153            should_load = true;
154            std::fs::remove_file(&path)?;
155        }
156
157        let db = Connection::open(&path)?;
158        rusqlite::vtab::csvtab::load_module(&db)?;
159
160        if should_load {
161            self.load_dump_into(&db)?;
162        }
163        Ok(db)
164    }
165
166    pub fn load_dump_into(&mut self, db: &Connection) -> Result<(), Error> {
167        let schema = self
168            .files
169            .iter()
170            .map(|f| self.file_to_query(f))
171            .fold(String::new(), |a, b| a + b.as_str() + "\n");
172        db.execute_batch(schema.as_str())?;
173        Ok(())
174    }
175
176    fn file_to_query(&self, path: &PathBuf) -> String {
177        let actual_file = self.target_path.join(path);
178        let table = path.file_stem().unwrap_or_default().to_string_lossy();
179        let vtable = match self.preload {
180            true => format!("temp_{}", table),
181            false => table.to_string(),
182        };
183
184        let vtab = match self.table_schema.get(&table.to_string()) {
185            Some(schema) => format!(
186                r#"
187                    DROP TABLE IF EXISTS {0};
188                    CREATE VIRTUAL TABLE {0} USING csv(filename='{1}',header=yes,schema='{2}');
189                "#,
190                vtable,
191                actual_file.display(),
192                schema,
193            ),
194            None => format!(
195                r#"
196                    DROP TABLE IF EXISTS {0};
197                    CREATE VIRTUAL TABLE {0} USING csv(filename='{1}',header=yes);
198                "#,
199                vtable,
200                actual_file.display(),
201            ),
202        };
203
204        if self.preload {
205            let ptab = format!(
206                r#"
207                    DROP TABLE IF EXISTS {0};
208                    CREATE TABLE {0} AS SELECT * FROM {1};
209                    DROP TABLE {1};
210                "#,
211                table, vtable,
212            );
213
214            return format!("{}\n{}", vtab, ptab);
215        }
216
217        vtab
218    }
219}
220
/// Maps each table name to its CSV file name by setting the extension to `csv`.
///
/// Any extension already present in a name is replaced (`a.b` becomes `a.csv`).
fn tables_to_files(tables: &[&str]) -> Vec<PathBuf> {
    let mut files = Vec::with_capacity(tables.len());
    for table in tables {
        files.push(Path::new(table).with_extension("csv"));
    }
    files
}
232
/// Loads the test archive into a caller-provided in-memory database and checks
/// that the renamed column from the schema override is queryable.
#[test]
fn test_basic_csvtab() -> Result<(), Error> {
    // In-memory database with the csvtab module registered.
    let conn = Connection::open_in_memory().unwrap();
    rusqlite::vtab::csvtab::load_module(&conn).unwrap();

    // Cache without a progress bar.
    let cache_builder = Cache::builder().progress_bar(None);

    // Configure the loader step by step, then extract and load the dump.
    let mut loader = CratesIODumpLoader::default();
    loader
        .preload(true)
        .resource("testdata/test.tar.gz")
        .target_path(Path::new("testdata/extracted"))
        .tables(&["test"])
        .table_schema("test", "CREATE TABLE x(renamed_id INT, name TEXT);");
    loader.cache(cache_builder)?;
    loader.update()?;
    loader.load_dump_into(&conn)?;

    let mut stmt = conn.prepare("SELECT renamed_id FROM test WHERE name = ?")?;
    let found = stmt.query_row(["awooo"], |row| row.get::<_, i64>(0))?;
    assert_eq!(3, found);
    Ok(())
}
258
/// Same scenario as the in-memory test, but lets the loader open and populate
/// its own on-disk database through `open_db`.
#[test]
fn test_basic_csvtab_open() -> Result<(), Error> {
    // Cache without a progress bar.
    let cache_builder = Cache::builder().progress_bar(None);

    // Configure the loader, extract the archive, and open the database.
    let mut loader = CratesIODumpLoader::default();
    loader
        .preload(true)
        .resource("testdata/test.tar.gz")
        .target_path(Path::new("testdata/extracted"))
        .tables(&["test"])
        .table_schema("test", "CREATE TABLE x(renamed_id INT, name TEXT);");
    loader.cache(cache_builder)?;
    loader.update()?;
    let conn = loader.open_db()?;

    let mut stmt = conn.prepare("SELECT renamed_id FROM test WHERE name = ?")?;
    let found = stmt.query_row(["awooo"], |row| row.get::<_, i64>(0))?;
    assert_eq!(3, found);
    Ok(())
}
280
281