use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Write;
use std::iter::FromIterator as _;
use std::path::Path;
use crate::utils::csv::*;
use crate::utils::error::*;
use crate::utils::fs::*;
use crate::utils::github::*;
use crate::utils::github_api::Github;
use crate::utils::json::*;
use crate::utils::logger::Logger;
use clap::ArgAction;
use clap::{Arg, Command};
use indicatif::ProgressBar;
use json::JsonValue;
use polars::frame::DataFrame;
use polars::prelude::*;
use rand::rngs::StdRng;
use rand::seq::SliceRandom as _;
use rand::SeedableRng;
/// Builds the `metadata` subcommand definition.
///
/// Declares every CLI flag consumed by [`run`]: input/output/token/cache
/// paths, the shuffle seed, the `--force` overwrite switch, the id/name
/// column names, and the optional `--sub` sample size.
pub fn cli() -> Command {
    Command::new("metadata")
        .about("Collect the metadata of GitHub projects")
        .long_about(
            "Collect metadata of GitHub projects. The input file must be a valid CSV file where one of the columns (\"name\") contains the full names of the projects, and another one contains their ids.\n\
            The program sends requests to the GitHub API to collect metadata about each project.\n\
            Projects are chosen randomly without replacement. The metadata is saved in a new CSV file.\nIf the program is interrupted, it \
            can be restarted and will continue from where it left off.\nThe program can also optionally use a cache file to save requests.\n\
            By default, the name of the output file is the same as the input file with the suffix '.metadata.csv'.\n"
        )
        .author("Andrea Gilot <andrea.gilot@it.uu.se>")
        .disable_version_flag(true)
        .arg(
            Arg::new("output")
                .short('o')
                .long("output")
                .value_name("OUTPUT_FILE.csv")
                .help("Path to the output csv file to store the metadata. \
                By default, the name of the output file is the same as the input file with the suffix '.metadata.csv'.")
                .required(false)
        )
        .arg(
            Arg::new("input")
                .short('i')
                .long("input")
                .value_name("INPUT_FILE.csv")
                .help("Path to the input csv file to use. One of the columns must contain the full names of the projects. ")
                .required(true)
        )
        .arg(
            Arg::new("tokens")
                .short('t')
                .long("tokens")
                .value_name("TOKENS_FILE.csv")
                .help("Path to the file containing the GitHub tokens to use. It must be a valid CSV file with one column named 'token' and where every line is a \
                valid GitHub token (e.g ghp_Ab0C1D2eFg3hIjk4LM56oPqRsTuvWX7yZa8B).")
                .required(true)
        )
        .arg(
            Arg::new("cache")
                .short('c')
                .long("cache")
                .value_name("CACHE.csv")
                .help("Path to the cache file to use. Must have been generated by a previous run of this program.")
                .required(false)
        )
        .arg(
            Arg::new("seed")
                .short('s')
                .long("seed")
                .value_name("SEED")
                .help("Seed used to randomly shuffle the input data.")
                .default_value("2955615809866670875")
                .value_parser(clap::value_parser!(u64)),
        )
        .arg(
            Arg::new("force")
                .short('f')
                .long("force")
                .help("Override the output file if it already exists.")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("ids")
                .long("ids")
                .help("Name of the column containing the ids of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("id")
        )
        .arg(
            Arg::new("names")
                .long("names")
                .help("Name of the column containing the full names of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("name")
        )
        .arg(
            Arg::new("sub")
                .long("sub")
                .value_name("NUMBER_OF_PROJECTS")
                // BUGFIX: `run` receives this value as Option<usize>; without a
                // typed value parser clap stores a String and a later
                // `get_one::<usize>("sub")` panics. Mirrors the `seed` arg.
                .value_parser(clap::value_parser!(usize))
                .help("Number of projects to sample from the input file. \
                If not specified, all remaining projects in the input file are used.")
        )
}
/// Queries the GitHub API for the metadata of the projects listed in the
/// input CSV and appends one CSV row per project to the output file.
///
/// Projects are visited in a seed-determined random order so an interrupted
/// run can be resumed: ids already present in the output file are skipped.
/// An optional cache file (previous run of this program) short-circuits
/// API requests.
///
/// * `input_path` - CSV with the project ids (`ids` column) and full names (`names` column).
/// * `output_path` - destination CSV; defaults to `<input_path>.metadata.csv`.
/// * `tokens` - CSV of GitHub API tokens.
/// * `cache_opt` - optional cache CSV keyed by project id.
/// * `seed` - shuffle seed (same seed => same visit order, needed for resuming).
/// * `force` - overwrite the output file instead of resuming.
/// * `sub` - number of projects to process; all remaining ones if `None`.
pub fn run(
    input_path: &str,
    output_path: Option<&String>,
    tokens: &str,
    cache_opt: Option<&String>,
    seed: u64,
    force: bool,
    ids: &str,
    names: &str,
    sub: Option<usize>,
    logger: &mut Logger,
) -> Result<(), Error> {
    // Column index of the project id in the cache file.
    const ID_COL: usize = 0;
    logger.log_tokens(tokens)?;
    // Only the id and name columns of the input are needed.
    let input_file: DataFrame = logger.log_completion("Loading input file", || {
        open_csv(
            input_path,
            Some(Schema::from_iter(vec![
                Field::new(ids.into(), DataType::UInt32),
                Field::new(names.into(), DataType::String),
            ])),
            Some(vec![ids, names]),
        )
    })?;
    logger.log_seed(seed)?;
    // Deterministic shuffle of the row indices: identical seeds replay the
    // same visit order, which is what makes resuming correct.
    let mut shuffled_idx: Vec<usize> = (0..input_file.height()).collect();
    logger.log_completion("Loading project IDs in random order", || {
        let mut rng: StdRng = SeedableRng::seed_from_u64(seed);
        shuffled_idx.shuffle(&mut rng);
        Ok(())
    })?;
    // Lazily materialize (id, name) pairs; a malformed row yields Err(idx).
    let shuffled_rows = shuffled_idx.into_iter().map(|idx| {
        let row = input_file.get_row(idx).unwrap().0;
        match (row[0].clone(), row[1].clone()) {
            (AnyValue::UInt32(id), AnyValue::String(name)) => Ok((id, name)),
            _ => Err(idx),
        }
    });
    let n_proj: usize = input_file.height();
    logger.log(&format!(" {} projects found.", n_proj))?;
    let default_output_path: String = format!("{}.metadata.csv", &input_path);
    let output_file_path: &str = output_path.unwrap_or(&default_output_path);
    // Ids already written by a previous (interrupted) run of this program.
    let previous_results: HashSet<u32> = if force {
        HashSet::new()
    } else {
        logger.log_completion("Resuming progress", || {
            Ok(if Path::new(&output_file_path).exists() {
                map_err(
                    map_err(
                        // BUGFIX: read the ids back from the *output* file.
                        // Previously this re-opened `input_path`, which put every
                        // input id into `previous_results` and made a resumed run
                        // skip all remaining projects. The output header always
                        // names this column "id" (see ProjectMetadata::header),
                        // regardless of the input's `ids` column name.
                        open_csv(
                            output_file_path,
                            Some(Schema::from_iter(vec![Field::new(
                                "id".into(),
                                DataType::UInt32,
                            )])),
                            Some(vec!["id"]),
                        )?
                        .column("id"),
                        "Could not extract the ids from the output file",
                    )?
                    .u32(),
                    "Could not convert the ids to u32",
                )?
                .iter()
                .map(|x| x.unwrap())
                .collect()
            } else {
                HashSet::new()
            })
        })?
    };
    if !previous_results.is_empty() {
        logger.log(&format!(
            " the metadata of {} projects have already been queried",
            previous_results.len()
        ))?;
    }
    let mut output_file: CSVFile = CSVFile::new(
        output_file_path,
        if force {
            FileMode::Overwrite
        } else {
            FileMode::Append
        },
    )?;
    output_file.write_header(ProjectMetadata::header())?;
    // Cache of raw CSV rows from a previous run, keyed by project id.
    let cache: HashMap<u32, String> = logger.log_completion("Loading cache", || {
        Ok(match cache_opt {
            Some(cache_path) => {
                let cache = CSVFile::new(cache_path, FileMode::Read)?;
                cache.indexed_lines(ID_COL)?
            }
            None => HashMap::new(),
        })
    })?;
    logger.log(&format!(" {} projects found in the cache.", cache.len()))?;
    let mut request_from_cache: usize = 0;
    let gh = Github::new(tokens);
    logger.log("Starting to query the GitHub API...")?;
    // Remaining work: either the requested sample size, or everything that
    // is not already in the output file. saturating_sub guards against an
    // output file containing more ids than the input (e.g. stale output).
    let mut n: usize = sub.unwrap_or_else(|| n_proj.saturating_sub(previous_results.len()));
    // The bar length is the number of projects actually processed this run,
    // so it reaches 100% when the loop finishes.
    let progress_bar: ProgressBar = ProgressBar::new(n as u64);
    progress_bar.set_style(
        indicatif::ProgressStyle::default_bar()
            .template("{elapsed} {wide_bar} {percent}% | Requests from cache: {msg}")
            .unwrap(),
    );
    for row in shuffled_rows {
        if n == 0 {
            break;
        }
        match row {
            Ok((id, full_name)) => {
                if !previous_results.contains(&id) {
                    // Prefer the cache; fall back to a live API request. A
                    // failed request still produces a row (default metadata
                    // plus the error text) so the project is not retried.
                    let csv_row: String = if cache.contains_key(&id) {
                        request_from_cache += 1;
                        cache.get(&id).unwrap().clone()
                    } else {
                        match gh.request(&format!("https://api.github.com/repos/{}", full_name)) {
                            Ok(json) => ProjectMetadata::parse_json(&json, ())?
                                .to_csv((id, full_name.to_string())),
                            Err(e) => ProjectMetadata::default().to_csv((id, e.to_string())),
                        }
                    };
                    map_err(
                        writeln!(&mut output_file, "{}", csv_row),
                        &format!("Could not write to file {}", &output_file_path),
                    )?;
                    progress_bar.inc(1);
                    progress_bar.set_message(request_from_cache.to_string());
                    n -= 1;
                }
            }
            Err(idx) => {
                map_err(
                    row,
                    &format!("Could not parse row {} in the input file", idx),
                )?;
            }
        }
    }
    Ok(())
}
/// Metadata of a single GitHub repository, as returned by the
/// `GET /repos/{owner}/{repo}` endpoint (see `FromGitHub::parse_json` below
/// for the exact JSON fields each member is read from).
struct ProjectMetadata {
    // Primary language reported by GitHub ("language" field).
    language: String,
    // Timestamps parsed from "created_at" / "pushed_at" / "updated_at"
    // via Self::parse_date_time (numeric representation).
    created: i64,
    pushed: i64,
    updated: i64,
    // Repository status flags ("fork", "disabled", "archived").
    fork: bool,
    disabled: bool,
    archived: bool,
    // Counters: "stargazers_count", "forks_count", "open_issues_count".
    stars: u32,
    forks: u32,
    issues: u32,
    // Whether the issue tracker is enabled ("has_issues").
    has_issues: bool,
    // "watchers_count" and "subscribers_count" respectively.
    watchers_count: u32,
    subscribers: u32,
    // Repository size ("size" field; reported by GitHub in kilobytes —
    // TODO confirm unit against the API docs).
    size: u64,
    // License name ("license.name"), or "unknown" when absent.
    license: String,
}
impl Default for ProjectMetadata {
fn default() -> Self {
Self {
language: String::new(),
created: 0,
pushed: 0,
updated: 0,
fork: false,
disabled: false,
archived: false,
stars: 0,
forks: 0,
issues: 0,
has_issues: false,
watchers_count: 0,
subscribers: 0,
size: 0,
license: String::new(),
}
}
}
impl ToCSV for ProjectMetadata {
    /// Row key: (project id, project full name) — or, for failed requests,
    /// (project id, error message). Prepended to every CSV row.
    type Key = (u32, String);

    /// Column names, in the exact order `to_csv` emits its fields.
    fn header() -> &'static [&'static str] {
        &[
            "id",
            "name",
            "language",
            "created",
            "pushed",
            "updated",
            "fork",
            "disabled",
            "archived",
            "stars",
            "forks",
            "issues",
            "has_issues",
            "watchers_count",
            "subscribers",
            "size",
            "license",
        ]
    }

    /// Serializes the metadata as one comma-separated line matching `header`.
    fn to_csv(&self, key: Self::Key) -> String {
        // Booleans are written as 0/1 so the columns stay numeric.
        let flag = |b: bool| u8::from(b).to_string();
        let (id, name) = key;
        [
            id.to_string(),
            name,
            self.language.clone(),
            self.created.to_string(),
            self.pushed.to_string(),
            self.updated.to_string(),
            flag(self.fork),
            flag(self.disabled),
            flag(self.archived),
            self.stars.to_string(),
            self.forks.to_string(),
            self.issues.to_string(),
            flag(self.has_issues),
            self.watchers_count.to_string(),
            self.subscribers.to_string(),
            self.size.to_string(),
            self.license.clone(),
        ]
        .join(",")
    }
}
impl FromGitHub for ProjectMetadata {
    type Complement = ();

    /// Builds a `ProjectMetadata` from the JSON body of a
    /// `GET /repos/{owner}/{repo}` response.
    ///
    /// Returns the first field-extraction error encountered, in declaration
    /// order; the fields below are evaluated top-to-bottom.
    fn parse_json(json: &JsonValue, _complement: ()) -> Result<Self, Error> {
        Ok(Self {
            language: get_field::<String>(json, "language")?,
            created: Self::parse_date_time(json, "created_at")?,
            pushed: Self::parse_date_time(json, "pushed_at")?,
            updated: Self::parse_date_time(json, "updated_at")?,
            fork: get_field::<bool>(json, "fork")?,
            disabled: get_field::<bool>(json, "disabled")?,
            archived: get_field::<bool>(json, "archived")?,
            stars: get_field::<u32>(json, "stargazers_count")?,
            forks: get_field::<u32>(json, "forks_count")?,
            issues: get_field::<u32>(json, "open_issues_count")?,
            has_issues: get_field::<bool>(json, "has_issues")?,
            watchers_count: get_field::<u32>(json, "watchers_count")?,
            subscribers: get_field::<u32>(json, "subscribers_count")?,
            size: get_field::<u64>(json, "size")?,
            // "license" is null for repositories without a declared license.
            license: if json["license"].is_null() {
                String::from("unknown")
            } else {
                clean_string_to_csv(&get_field::<String>(&json["license"], "name")?)
            },
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::dataframes::has_column;
    // Directory holding the fixture CSVs for this phase.
    const TEST_DATA: &str = "tests/data/phases/metadata";
    // End-to-end test: runs the full `run` pipeline against the fixture
    // input and compares the produced CSV with a pre-recorded expected file.
    // NOTE(review): this performs live GitHub API requests and requires a
    // local `ghtokens.csv` token file, so it is not hermetic.
    #[test]
    fn test_language_scraper() {
        let input_file: String = format!("{}/repos.csv", TEST_DATA);
        let output_file: String = format!("{}.metadata.csv", input_file);
        assert!(std::path::Path::new(&input_file).exists());
        // Remove any stale output from a previous test run (ignore-if-missing).
        assert!(delete_file(&output_file, true).is_ok());
        let tokens_file: String = "ghtokens.csv".to_string();
        // Seed 0, no cache, no sub-sampling, default id/name column names.
        let run_scraper = run(
            &input_file,
            None,
            &tokens_file,
            None,
            0,
            false,
            "id",
            "name",
            None,
            &mut Logger::new(),
        );
        assert!(run_scraper.is_ok());
        let output_df = open_csv(&output_file, None, None);
        assert!(output_df.is_ok());
        let output_df = output_df.unwrap();
        assert!(has_column(&output_df, "name"));
        // Sort both frames by name so the comparison is order-independent
        // (the scraper visits projects in shuffled order).
        let sorted_output_df = output_df
            .sort(vec!["name"], SortMultipleOptions::new())
            .unwrap();
        let expected_df = open_csv(&format!("{}.expected", output_file), None, None);
        assert!(expected_df.is_ok());
        let expected_df = expected_df.unwrap();
        assert!(has_column(&expected_df, "name"));
        let sorted_expected_df = expected_df
            .sort(vec!["name"], SortMultipleOptions::new())
            .unwrap();
        assert_eq!(sorted_expected_df, sorted_output_df);
        // Clean up; this time a missing file is an error.
        assert!(delete_file(&output_file, false).is_ok());
    }
}