// crates-enum 0.0.10
//
// Process crates.io metadata CSV
// Documentation
use anyhow::Result;
use url::Url;
use std::path::PathBuf;
use serde::Deserialize;
use crate::git::Git;

/// Default directory that repositories are cloned into when the caller does
/// not provide an output directory.
const REPO_DIR: &str = "mined";

/// A crate's repository metadata paired with the location of its checkout on
/// the local disk.
struct LocalGit {
    /// Path of the local checkout. The field is not optional: an entry is only
    /// built once a checkout path has been obtained for the repository.
    path: PathBuf,

    /// Info about the repo
    info: RepoInfo,
}

/// One row of the crates.io metadata CSV.
///
/// Rows are deserialized against the CSV header record, so every field name
/// here must equal the name of the column it is read from.
// In this file only `downloads`, `name` and `repository` are read after
// deserialization; the remaining columns are kept so the struct mirrors the
// CSV layout, hence the `dead_code` allowance.
#[allow(dead_code)]
#[derive(Debug, Deserialize)]
struct RepoInfo {
    created_at: String,
    description: String,
    documentation: String,
    /// Download count; crates are ordered by this value, highest first.
    downloads: u64,
    homepage: String,
    id: u64,
    max_upload_size: Option<u64>,
    /// Crate name; used for subset filtering and as the checkout directory name.
    name: String,
    readme: String,
    /// Repository URL; probed over HTTP and then cloned.
    repository: String,
    updated_at: String,
}

/// The list of rows read from the crates.io metadata CSV.
type CratesInfo = Vec<RepoInfo>;

/// A set of crates.io repositories that are available as local git checkouts.
pub struct CratesIO {
    /// One entry per repository with a usable local checkout.
    repos: Vec<LocalGit>,
}

impl CratesIO {
    pub fn apply<F>(&self, f: F) -> Result<()>
        where F: Fn(&PathBuf) -> Result<()> {
        for repo in &self.repos {
            f(&repo.path)?;
        }
        Ok(())
    }

    /// Params:
    /// * csv: String
    ///   Location of crates.io csv, from urls:
    ///   https://crates.io/data-access
    ///   https://static.crates.io/db-dump.tar.gz
    /// * names: Option<Vec<String>>
    ///   The rates to process. If none then all crates will be processed
    /// * username: String,
    ///   SSH username
    /// * out_dir: Option<String>
    ///   Output directory, default "mined" if not specified
    pub fn init_local_repos(csv: String, names: Option<Vec<String>>, username: String, out_dir: Option<String>) -> Result<CratesIO> {
        let crates = Self::init(csv, names)?;
        Self::download_local(crates, &username, out_dir)
    }

    fn init(csv: String, subset_repos: Option<Vec<String>>) -> Result<CratesInfo> {
        let mut crates = Self::read_cratesio_csv(csv)?;
        // sort in descending order
        crates.sort_by(|a,b| b.downloads.cmp(&a.downloads));
        // filter to subset asked
        if let Some(allowed) = subset_repos {
            let crates = crates
                .into_iter()
                .filter(|c| allowed.contains(&c.name))
                .collect();
            return Ok(crates)
        }
        Ok(crates)
    }

    fn download_local(crates: CratesInfo, ssh_username: &str, dir: Option<String>) -> Result<Self> {
        let mut git = Git::init(ssh_username);
        let dir = dir.unwrap_or(REPO_DIR.to_string());
        let git_exists = |info: &RepoInfo| -> bool {
            reqwest::blocking::get(&info.repository)
                .map(|g| g.status().is_success()) // did we successfully get data
                .unwrap_or(false) // if get building failed return false
        };
        let mut repos = vec![];
        for info in crates.into_iter() {
            // stale repos do exist in crates.io, where github links are invalid
            if !git_exists(&info) {
                println!("Ignoring stale repo [{}] for [{}]", &info.repository, &info.name);
                continue;
            }
            // clone crate
            let clone = Self::download(&info, &mut git, dir.as_str());
            if let Ok(path) = clone {
                repos.push(LocalGit { path, info });
            } else {
                println!("!! Git clone failed [{}]", &info.repository);
            }
        }
        Ok(CratesIO { repos })
    }

    /// check that the path that exists contains a valid repo
    fn already_local_git(path: &PathBuf) -> bool {
        path.exists() && Git::is_repo(path)
    }

    fn download(info: &RepoInfo, git: &mut Git, dir: &str) -> Result<PathBuf> {
        let mut location = PathBuf::new();
        location.push(dir);
        location.push(info.name.clone());

        if Self::already_local_git(&location) {
            println!("Already local: [{}] at [{:?}]", info.repository, location);
            return Ok(location);
        }
        println!("Cloning: [{}] to [{:?}]", info.repository, location);

        // clone repo
        let url = Url::parse(&info.repository)?;
        git.clone(&url, &location)?;
        assert!(Self::already_local_git(&location));

        Ok(location)
    }

    fn read_cratesio_csv(path: String) -> Result<CratesInfo> {
        // created_at,description,documentation,downloads,homepage,id,max_upload_size,name,readme,repository,updated_at
        // pick subset we care about:
        let all_fields = vec![
            "created_at",
            "description",
            "documentation",
            "downloads",
            "homepage",
            "id",
            "max_upload_size",
            "name",
            "readme",
            "repository",
            "updated_at",
        ];

        let mut records = vec![];
        let mut rdr = csv::Reader::from_path(path)?;
        let headers = rdr.headers()?.clone();
        let fields = csv::StringRecord::from(all_fields);
        assert!(fields == headers);
        for r in rdr.records() {
            let row: RepoInfo = r?.deserialize(Some(&fields))?;
            records.push(row)
        }
        Ok(records)
    }
}