// db_dump/load.rs

1use crate::DbDump;
2use crate::error::{Result, err};
3use csv::StringRecord;
4use flate2::read::GzDecoder;
5use indicatif::{ProgressBar, ProgressDrawTarget, ProgressStyle};
6use memmap::Mmap;
7use serde::de::DeserializeOwned;
8use std::borrow::Cow;
9use std::fs::File;
10use std::io::Read;
11use std::path::Path;
12use tar::Archive;
13
14/// Perform a streaming load of only relevant database tables.
15///
16/// # Example
17///
18/// This example loads just the version_downloads.csv table, in which each row
19/// is the download count for a single version of a single crate on a single
20/// day. We do not store the rows individually in memory but instead stream from
21/// the csv to accumulate just a total count per day across all crates, which
22/// requires far less memory.
23///
24/// ```no_run
25/// use chrono::Utc;
26/// use db_dump::Date;
27/// use std::collections::BTreeMap as Map;
28///
29/// fn main() -> db_dump::Result<()> {
30///     let mut downloads = Map::<Date<Utc>, u64>::new();
31///     db_dump::Loader::new()
32///         .version_downloads(|row| {
33///             *downloads.entry(row.date).or_default() += row.downloads;
34///         })
35///         .load("./db-dump.tar.gz")?;
36///
37///     for (date, count) in downloads {
38///         println!("{},{}", date, count);
39///     }
40///
41///     Ok(())
42/// }
43/// ```
44#[derive(#[automatically_derived]
impl<'a> ::core::default::Default for Loader<'a> {
    #[inline]
    fn default() -> Loader<'a> {
        Loader {
            categories: ::core::default::Default::default(),
            crate_downloads: ::core::default::Default::default(),
            crate_owners: ::core::default::Default::default(),
            crates: ::core::default::Default::default(),
            crates_categories: ::core::default::Default::default(),
            crates_keywords: ::core::default::Default::default(),
            default_versions: ::core::default::Default::default(),
            deleted_crates: ::core::default::Default::default(),
            dependencies: ::core::default::Default::default(),
            keywords: ::core::default::Default::default(),
            metadata: ::core::default::Default::default(),
            oauth_github: ::core::default::Default::default(),
            reserved_crate_names: ::core::default::Default::default(),
            reserved_usernames: ::core::default::Default::default(),
            teams: ::core::default::Default::default(),
            users: ::core::default::Default::default(),
            version_downloads: ::core::default::Default::default(),
            versions: ::core::default::Default::default(),
        }
    }
}Default)]
45pub struct Loader<'a> {
46    categories: Option<Callback<'a, crate::categories::Row>>,
47    crate_downloads: Option<Callback<'a, crate::crate_downloads::Row>>,
48    crate_owners: Option<Callback<'a, crate::crate_owners::Row>>,
49    crates: Option<Callback<'a, crate::crates::Row>>,
50    crates_categories: Option<Callback<'a, crate::crates_categories::Row>>,
51    crates_keywords: Option<Callback<'a, crate::crates_keywords::Row>>,
52    default_versions: Option<Callback<'a, crate::default_versions::Row>>,
53    deleted_crates: Option<Callback<'a, crate::deleted_crates::Row>>,
54    dependencies: Option<Callback<'a, crate::dependencies::Row>>,
55    keywords: Option<Callback<'a, crate::keywords::Row>>,
56    metadata: Option<Callback<'a, crate::metadata::Row>>,
57    oauth_github: Option<Callback<'a, crate::oauth_github::Row>>,
58    reserved_crate_names: Option<Callback<'a, crate::reserved_crate_names::Row>>,
59    reserved_usernames: Option<Callback<'a, crate::reserved_usernames::Row>>,
60    teams: Option<Callback<'a, crate::teams::Row>>,
61    users: Option<Callback<'a, crate::users::Row>>,
62    version_downloads: Option<Callback<'a, crate::version_downloads::Row>>,
63    versions: Option<Callback<'a, crate::versions::Row>>,
64}
65
/// A user-supplied row handler together with a flag recording whether the
/// table it belongs to has already been read to the end.
struct Callback<'a, T> {
    /// Invoked once per deserialized row.
    f: Box<dyn FnMut(T) + 'a>,
    /// Set once the whole table has been streamed through `f`.
    done: bool,
}
70
71impl<'a> Loader<'a> {
72    pub fn new() -> Self {
73        Loader::default()
74    }
75
76    pub fn categories(&mut self, f: impl FnMut(crate::categories::Row) + 'a) -> &mut Self {
77        self.categories = Some(Callback::new(f));
78        self
79    }
80
81    pub fn crate_downloads(
82        &mut self,
83        f: impl FnMut(crate::crate_downloads::Row) + 'a,
84    ) -> &mut Self {
85        self.crate_downloads = Some(Callback::new(f));
86        self
87    }
88
89    pub fn crate_owners(&mut self, f: impl FnMut(crate::crate_owners::Row) + 'a) -> &mut Self {
90        self.crate_owners = Some(Callback::new(f));
91        self
92    }
93
94    pub fn crates(&mut self, f: impl FnMut(crate::crates::Row) + 'a) -> &mut Self {
95        self.crates = Some(Callback::new(f));
96        self
97    }
98
99    pub fn crates_categories(
100        &mut self,
101        f: impl FnMut(crate::crates_categories::Row) + 'a,
102    ) -> &mut Self {
103        self.crates_categories = Some(Callback::new(f));
104        self
105    }
106
107    pub fn crates_keywords(
108        &mut self,
109        f: impl FnMut(crate::crates_keywords::Row) + 'a,
110    ) -> &mut Self {
111        self.crates_keywords = Some(Callback::new(f));
112        self
113    }
114
115    pub fn default_versions(
116        &mut self,
117        f: impl FnMut(crate::default_versions::Row) + 'a,
118    ) -> &mut Self {
119        self.default_versions = Some(Callback::new(f));
120        self
121    }
122
123    pub fn deleted_crates(&mut self, f: impl FnMut(crate::deleted_crates::Row) + 'a) -> &mut Self {
124        self.deleted_crates = Some(Callback::new(f));
125        self
126    }
127
128    pub fn dependencies(&mut self, f: impl FnMut(crate::dependencies::Row) + 'a) -> &mut Self {
129        self.dependencies = Some(Callback::new(f));
130        self
131    }
132
133    pub fn keywords(&mut self, f: impl FnMut(crate::keywords::Row) + 'a) -> &mut Self {
134        self.keywords = Some(Callback::new(f));
135        self
136    }
137
138    pub fn metadata(&mut self, f: impl FnMut(crate::metadata::Row) + 'a) -> &mut Self {
139        self.metadata = Some(Callback::new(f));
140        self
141    }
142
143    pub fn oauth_github(&mut self, f: impl FnMut(crate::oauth_github::Row) + 'a) -> &mut Self {
144        self.oauth_github = Some(Callback::new(f));
145        self
146    }
147
148    pub fn reserved_crate_names(
149        &mut self,
150        f: impl FnMut(crate::reserved_crate_names::Row) + 'a,
151    ) -> &mut Self {
152        self.reserved_crate_names = Some(Callback::new(f));
153        self
154    }
155
156    pub fn reserved_usernames(
157        &mut self,
158        f: impl FnMut(crate::reserved_usernames::Row) + 'a,
159    ) -> &mut Self {
160        self.reserved_usernames = Some(Callback::new(f));
161        self
162    }
163
164    pub fn teams(&mut self, f: impl FnMut(crate::teams::Row) + 'a) -> &mut Self {
165        self.teams = Some(Callback::new(f));
166        self
167    }
168
169    pub fn users(&mut self, f: impl FnMut(crate::users::Row) + 'a) -> &mut Self {
170        self.users = Some(Callback::new(f));
171        self
172    }
173
174    pub fn version_downloads(
175        &mut self,
176        f: impl FnMut(crate::version_downloads::Row) + 'a,
177    ) -> &mut Self {
178        self.version_downloads = Some(Callback::new(f));
179        self
180    }
181
182    pub fn versions(&mut self, f: impl FnMut(crate::versions::Row) + 'a) -> &mut Self {
183        self.versions = Some(Callback::new(f));
184        self
185    }
186
187    pub fn load(&mut self, path: impl AsRef<Path>) -> Result<()> {
188        do_load(path.as_ref(), self)
189    }
190}
191
192impl<'a, T> Callback<'a, T> {
193    fn new(f: impl FnMut(T) + 'a) -> Self {
194        Callback {
195            f: Box::new(f),
196            done: false,
197        }
198    }
199
200    fn done(&self) -> bool {
201        self.done
202    }
203}
204
205fn do_load(path: &Path, loader: &mut Loader) -> Result<()> {
206    let file = File::open(path)?;
207    let mmap = unsafe { Mmap::map(&file) }?;
208
209    let pb = ProgressBar::hidden();
210    pb.set_length(mmap.len() as u64);
211    pb.set_style(
212        ProgressStyle::default_bar()
213            .template("[{wide_bar:.cyan/blue}] {percent}% {msg:>24}")
214            .unwrap()
215            .progress_chars(". "),
216    );
217    pb.set_draw_target(ProgressDrawTarget::stderr());
218    let input = pb.wrap_read(&*mmap);
219
220    let mut archive = Archive::new(GzDecoder::new(input));
221    for entry in archive.entries()? {
222        #[deny(unused_variables)]
223        let Loader {
224            categories,
225            crate_downloads,
226            crate_owners,
227            crates,
228            crates_categories,
229            crates_keywords,
230            default_versions,
231            deleted_crates,
232            dependencies,
233            keywords,
234            metadata,
235            oauth_github,
236            reserved_crate_names,
237            reserved_usernames,
238            teams,
239            users,
240            version_downloads,
241            versions,
242        } = loader;
243
244        if categories.as_ref().map_or(true, Callback::done)
245            && crate_downloads.as_ref().map_or(true, Callback::done)
246            && crate_owners.as_ref().map_or(true, Callback::done)
247            && crates.as_ref().map_or(true, Callback::done)
248            && crates_categories.as_ref().map_or(true, Callback::done)
249            && crates_keywords.as_ref().map_or(true, Callback::done)
250            && default_versions.as_ref().map_or(true, Callback::done)
251            && deleted_crates.as_ref().map_or(true, Callback::done)
252            && dependencies.as_ref().map_or(true, Callback::done)
253            && keywords.as_ref().map_or(true, Callback::done)
254            && metadata.as_ref().map_or(true, Callback::done)
255            && oauth_github.as_ref().map_or(true, Callback::done)
256            && reserved_crate_names.as_ref().map_or(true, Callback::done)
257            && reserved_usernames.as_ref().map_or(true, Callback::done)
258            && teams.as_ref().map_or(true, Callback::done)
259            && users.as_ref().map_or(true, Callback::done)
260            && version_downloads.as_ref().map_or(true, Callback::done)
261            && versions.as_ref().map_or(true, Callback::done)
262        {
263            break;
264        }
265
266        let entry = entry?;
267        let path = entry.path()?;
268        if path.extension().map_or(true, |ext| ext != "csv") {
269            continue;
270        }
271
272        pb.set_message(match path.file_name() {
273            Some(file_name) => Cow::Owned(file_name.to_string_lossy().into_owned()),
274            None => Cow::Borrowed(""),
275        });
276
277        #[deny(unused_variables)]
278        let Loader {
279            categories,
280            crate_downloads,
281            crate_owners,
282            crates,
283            crates_categories,
284            crates_keywords,
285            default_versions,
286            deleted_crates,
287            dependencies,
288            keywords,
289            metadata,
290            oauth_github,
291            reserved_crate_names,
292            reserved_usernames,
293            teams,
294            users,
295            version_downloads,
296            versions,
297        } = loader;
298
299        let (path, result) = if path.ends_with("badges.csv") {
300            continue; // https://github.com/rust-lang/crates.io/pull/8155
301        } else if path.ends_with("categories.csv") {
302            ("categories", read(categories, entry))
303        } else if path.ends_with("crate_downloads.csv") {
304            ("crate_downloads", read(crate_downloads, entry))
305        } else if path.ends_with("crate_owners.csv") {
306            ("crate_owners", read(crate_owners, entry))
307        } else if path.ends_with("crates.csv") {
308            ("crates", read(crates, entry))
309        } else if path.ends_with("crates_categories.csv") {
310            ("crates_categories", read(crates_categories, entry))
311        } else if path.ends_with("crates_keywords.csv") {
312            ("crates_keywords", read(crates_keywords, entry))
313        } else if path.ends_with("default_versions.csv") {
314            ("default_versions", read(default_versions, entry))
315        } else if path.ends_with("deleted_crates.csv") {
316            ("deleted_crates", read(deleted_crates, entry))
317        } else if path.ends_with("dependencies.csv") {
318            ("dependencies", read(dependencies, entry))
319        } else if path.ends_with("keywords.csv") {
320            ("keywords", read(keywords, entry))
321        } else if path.ends_with("metadata.csv") {
322            ("metadata", read(metadata, entry))
323        } else if path.ends_with("oauth_github.csv") {
324            ("oauth_github", read(oauth_github, entry))
325        } else if path.ends_with("reserved_crate_names.csv") {
326            ("reserved_crate_names", read(reserved_crate_names, entry))
327        } else if path.ends_with("reserved_usernames.csv") {
328            ("reserved_usernames", read(reserved_usernames, entry))
329        } else if path.ends_with("teams.csv") {
330            ("teams", read(teams, entry))
331        } else if path.ends_with("users.csv") {
332            ("users", read(users, entry))
333        } else if path.ends_with("version_authors.csv") {
334            continue; // https://github.com/rust-lang/crates.io/pull/3549
335        } else if path.ends_with("version_downloads.csv") {
336            ("version_downloads", read(version_downloads, entry))
337        } else if path.ends_with("versions.csv") {
338            ("versions", read(versions, entry))
339        } else {
340            if falsecfg!(db_dump_panic_on_unrecognized_csv) {
341                {
    ::core::panicking::panic_fmt(format_args!("unimplemented: {0}",
            path.display()));
};panic!("unimplemented: {}", path.display());
342            } else {
343                { ::std::io::_eprint(format_args!("unimplemented: {0}\n", path.display())); };eprintln!("unimplemented: {}", path.display());
344            }
345            continue;
346        };
347
348        if let Err(mut err) = result {
349            err.e.path = Some(Path::new(path));
350            return Err(err);
351        }
352    }
353
354    Ok(())
355}
356
357pub(crate) trait FromRecord: Sized {
358    fn from_record(record: &StringRecord, headers: &StringRecord) -> Result<Self>;
359}
360
361impl<T> FromRecord for T
362where
363    T: DeserializeOwned,
364{
365    fn from_record(record: &StringRecord, headers: &StringRecord) -> Result<Self> {
366        record.deserialize(Some(headers)).map_err(err)
367    }
368}
369
370fn read<T>(loader: &mut Option<Callback<T>>, entry: impl Read) -> Result<()>
371where
372    T: FromRecord,
373{
374    if let Some(loader) = loader {
375        let mut csv = csv::Reader::from_reader(entry);
376        let headers = csv.headers().map_err(err)?.clone();
377        let mut record = StringRecord::new();
378        while csv.read_record(&mut record).map_err(err)? {
379            let record = T::from_record(&record, &headers)?;
380            (loader.f)(record);
381        }
382        loader.done = true;
383    }
384    Ok(())
385}
386
387/// Deserialize *everything* in a crates.io DB dump into memory.
388///
389/// This function is equivalent to the following [`Loader`]-based invocation:
390///
391/// ```
392/// # use std::path::Path;
393/// # use db_dump::Result;
394/// #
395/// # struct DbDump {
396/// #     categories: Vec<db_dump::categories::Row>,
397/// #     crate_owners: Vec<db_dump::crate_owners::Row>,
398/// #     versions: Vec<db_dump::versions::Row>,
399/// # }
400/// #
401/// # pub fn load_all(path: impl AsRef<Path>) -> Result<DbDump> {
402/// #     let path = path.as_ref();
403/// let mut categories = Vec::new();
404/// let mut crate_owners = Vec::new();
405/// /* ... */
406/// let mut versions = Vec::new();
407///
408/// db_dump::Loader::new()
409///     .categories(|row| categories.push(row))
410///     .crate_owners(|row| crate_owners.push(row))
411///     /* ... */
412///     .versions(|row| versions.push(row))
413///     .load(path)?;
414///
415/// Ok(DbDump {
416///     categories,
417///     crate_owners,
418///     /* ... */
419///     versions,
420/// })
421/// # }
422/// ```
423///
424/// Usually whatever you are doing will not require *all* of the information in
425/// a dump, in which case utilizing `Loader` to load just what you need can be
426/// significantly more efficient.
427pub fn load_all(path: impl AsRef<Path>) -> Result<DbDump> {
428    do_load_all(path.as_ref())
429}
430
431fn do_load_all(path: &Path) -> Result<DbDump> {
432    let mut categories = Vec::new();
433    let mut crate_downloads = Vec::new();
434    let mut crate_owners = Vec::new();
435    let mut crates = Vec::new();
436    let mut crates_categories = Vec::new();
437    let mut crates_keywords = Vec::new();
438    let mut default_versions = Vec::new();
439    let mut deleted_crates = Vec::new();
440    let mut dependencies = Vec::new();
441    let mut keywords = Vec::new();
442    let mut metadata = crate::metadata::Row { total_downloads: 0 };
443    let mut oauth_github = Vec::new();
444    let mut reserved_crate_names = Vec::new();
445    let mut reserved_usernames = Vec::new();
446    let mut teams = Vec::new();
447    let mut users = Vec::new();
448    let mut version_downloads = Vec::new();
449    let mut versions = Vec::new();
450
451    let mut loader = Loader {
452        categories: Some(Callback::new(|row| categories.push(row))),
453        crate_downloads: Some(Callback::new(|row| crate_downloads.push(row))),
454        crate_owners: Some(Callback::new(|row| crate_owners.push(row))),
455        crates: Some(Callback::new(|row| crates.push(row))),
456        crates_categories: Some(Callback::new(|row| crates_categories.push(row))),
457        crates_keywords: Some(Callback::new(|row| crates_keywords.push(row))),
458        default_versions: Some(Callback::new(|row| default_versions.push(row))),
459        deleted_crates: Some(Callback::new(|row| deleted_crates.push(row))),
460        dependencies: Some(Callback::new(|row| dependencies.push(row))),
461        keywords: Some(Callback::new(|row| keywords.push(row))),
462        metadata: Some(Callback::new(|row| metadata = row)),
463        oauth_github: Some(Callback::new(|row| oauth_github.push(row))),
464        reserved_crate_names: Some(Callback::new(|row| reserved_crate_names.push(row))),
465        reserved_usernames: Some(Callback::new(|row| reserved_usernames.push(row))),
466        teams: Some(Callback::new(|row| teams.push(row))),
467        users: Some(Callback::new(|row| users.push(row))),
468        version_downloads: Some(Callback::new(|row| version_downloads.push(row))),
469        versions: Some(Callback::new(|row| versions.push(row))),
470    };
471
472    loader.load(path)?;
473    drop(loader);
474
475    Ok(DbDump {
476        categories,
477        crate_downloads,
478        crate_owners,
479        crates,
480        crates_categories,
481        crates_keywords,
482        default_versions,
483        deleted_crates,
484        dependencies,
485        keywords,
486        metadata,
487        oauth_github,
488        reserved_crate_names,
489        reserved_usernames,
490        teams,
491        users,
492        version_downloads,
493        versions,
494    })
495}