#![doc = include_str!("../docs/metadata.md")]
use anyhow::{bail, Result};
use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Write;
use std::iter::FromIterator as _;
use std::path::Path;
use crate::utils::csv::*;
use crate::utils::dataframes;
use crate::utils::fs::*;
use crate::utils::github::*;
use crate::utils::github_api::Github;
use crate::utils::json::*;
use crate::utils::logger::{log_seed, Logger};
use clap::ArgAction;
use clap::{Arg, Command};
use indicatif::ProgressBar;
use json::JsonValue;
use polars::frame::DataFrame;
use polars::prelude::*;
use rand::rngs::StdRng;
use rand::seq::SliceRandom as _;
use rand::SeedableRng;
use tracing::info;
/// Build the `metadata` subcommand definition for clap.
///
/// Declares the command-line interface used to collect the metadata of
/// GitHub projects; the actual work is done by [`run`].
pub fn cli() -> Command {
    Command::new("metadata")
        .about("Collect the metadata of GitHub projects")
        // The long help shares the same markdown file as the module docs.
        .long_about(include_str!("../docs/metadata.md"))
        .author("Andrea Gilot <andrea.gilot@it.uu.se>")
        .disable_version_flag(true)
        // -o/--output: destination CSV (defaults to "<input>.metadata.csv").
        .arg(
            Arg::new("output")
                .short('o')
                .long("output")
                .value_name("OUTPUT_FILE.csv")
                .help("Path to the output csv file to store the metadata. \
            By default, the name of the output file is the same as the input file with the suffix '.metadata.csv'.")
                .required(false)
        )
        // -i/--input: CSV listing the projects to query (required).
        .arg(
            Arg::new("input")
                .short('i')
                .long("input")
                .value_name("INPUT_FILE.csv")
                .help("Path to the input csv file to use. One of the columns must contain the full names of the projects. ")
                .required(true)
        )
        // -t/--tokens: CSV of GitHub API tokens (required).
        .arg(
            Arg::new("tokens")
                .short('t')
                .long("tokens")
                .value_name("TOKENS_FILE.csv")
                .help("Path to the file containing the GitHub tokens to use. It must be a valid CSV file with one column named 'token' and where every line is a \
            valid GitHub token (e.g ghp_Ab0C1D2eFg3hIjk4LM56oPqRsTuvWX7yZa8B).")
                .required(true)
        )
        // -c/--cache: optional cache produced by a previous run.
        .arg(
            Arg::new("cache")
                .short('c')
                .long("cache")
                .value_name("CACHE.csv")
                .help("Path to the cache file to use. Must have been generated by a previous run of this program.")
                .required(false)
        )
        // -s/--seed: shuffle seed; fixed default keeps runs reproducible.
        .arg(
            Arg::new("seed")
                .short('s')
                .long("seed")
                .value_name("SEED")
                .help("Seed used to randomly shuffle the input data.")
                .default_value("2955615809866670875")
                .value_parser(clap::value_parser!(u64)),
        )
        // -f/--force: start over instead of resuming from an existing output.
        .arg(
            Arg::new("force")
                .short('f')
                .long("force")
                .help("Override the output file if it already exists.")
                .action(ArgAction::SetTrue),
        )
        // --ids / --names: column names inside the input CSV.
        .arg(
            Arg::new("ids")
                .long("ids")
                .help("Name of the column containing the ids of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("id")
        )
        .arg(
            Arg::new("names")
                .long("names")
                .help("Name of the column containing the full names of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("name")
        )
        // --sub: cap on the number of projects to query in this run.
        .arg(
            Arg::new("sub")
                .long("sub")
                .value_name("NUMBER_OF_PROJECTS")
                .help("Number of projects to sample from the input file. \
            If not specified, all remaining projects in the input file are used.")
        )
}
/// Query the GitHub API for the metadata of every project listed in
/// `input_path` and append one CSV row per project to the output file.
///
/// * `input_path` - input CSV with a project-id column and a full-name column.
/// * `output_path` - output CSV path; defaults to `<input_path>.metadata.csv`.
/// * `tokens` - CSV file of GitHub API tokens (one `token` column).
/// * `cache_opt` - optional cache file from a previous run, keyed by project id.
/// * `seed` - RNG seed used to shuffle the input rows deterministically.
/// * `force` - overwrite the output file instead of resuming from it.
/// * `ids` / `names` - names of the id / full-name columns in the input file.
/// * `sub` - if set, stop after querying this many projects.
/// * `logger` - progress/task logger.
///
/// # Errors
/// Fails if a file cannot be read or written, if an input row cannot be
/// parsed as `(u32, String)`, or if a GitHub response cannot be decoded.
pub fn run(
    input_path: &str,
    output_path: Option<&String>,
    tokens: &str,
    cache_opt: Option<&String>,
    seed: u64,
    force: bool,
    ids: &str,
    names: &str,
    sub: Option<usize>,
    logger: &Logger,
) -> Result<()> {
    // Column index of the project id within a cache line.
    const ID_COL: usize = 0;
    logger.log_tokens(tokens)?;
    let input_file: DataFrame = logger.run_task("Loading input file", || {
        open_csv(
            input_path,
            Some(Schema::from_iter(vec![
                Field::new(ids.into(), DataType::UInt32),
                Field::new(names.into(), DataType::String),
            ])),
            Some(vec![ids, names]),
        )
    })?;
    log_seed(seed);
    // Shuffle deterministically so an interrupted run resumes over the same order.
    let mut shuffled_idx: Vec<usize> = (0..input_file.height()).collect();
    logger.run_task("Loading project IDs in random order", || {
        let mut rng: StdRng = SeedableRng::seed_from_u64(seed);
        shuffled_idx.shuffle(&mut rng);
        Ok(())
    })?;
    // Lazily decode each shuffled row into (id, full_name); a row that does
    // not match the expected types yields Err(row_index).
    let shuffled_rows = shuffled_idx.into_iter().map(|idx| {
        let row = input_file.get_row(idx).unwrap().0;
        match (row[0].clone(), row[1].clone()) {
            (AnyValue::UInt32(id), AnyValue::String(name)) => Ok((id, name)),
            _ => Err(idx),
        }
    });
    let n_proj: usize = input_file.height();
    info!(" {} projects found.", n_proj);
    let default_output_path: String = format!("{}.metadata.csv", &input_path);
    let output_file_path: &str = output_path.unwrap_or(&default_output_path);
    // Ids already present in the output file; those projects are skipped
    // unless --force was given.
    let previous_results: HashSet<u32> = if force {
        HashSet::new()
    } else {
        logger.run_task("Resuming progress", || {
            Ok(if Path::new(&output_file_path).exists() {
                // BUGFIX: read the already-processed ids back from the OUTPUT
                // file. Previously the INPUT file was re-read here, which put
                // every input id into `previous_results` and made a resumed run
                // skip all remaining projects. The output schema is fixed by
                // `ProjectMetadata::header()`, so its id column is always "id"
                // regardless of the `ids` column name of the input file.
                dataframes::u32(
                    &open_csv(
                        output_file_path,
                        Some(Schema::from_iter(vec![Field::new(
                            "id".into(),
                            DataType::UInt32,
                        )])),
                        Some(vec!["id"]),
                    )?,
                    "id",
                )?
                .into_iter()
                .collect()
            } else {
                HashSet::new()
            })
        })?
    };
    if !previous_results.is_empty() {
        info!(
            " the metadata of {} projects have already been queried",
            previous_results.len()
        );
    }
    let mut output_file: CSVFile = CSVFile::new(
        output_file_path,
        if force {
            FileMode::Overwrite
        } else {
            FileMode::Append
        },
    )?;
    output_file.write_header(ProjectMetadata::header())?;
    // Cache maps project id -> pre-computed CSV row from a previous run.
    let cache: HashMap<u32, String> = logger.run_task("Loading cache", || {
        Ok(match cache_opt {
            Some(cache_path) => {
                let cache = CSVFile::new(cache_path, FileMode::Read)?;
                cache.indexed_lines(ID_COL)?
            }
            None => HashMap::new(),
        })
    })?;
    info!(" {} projects found in the cache.", cache.len());
    let mut request_from_cache: usize = 0;
    let gh = Github::new(tokens);
    info!("Starting to query the GitHub API...");
    // Number of projects left to process in this run.
    let mut n: usize = match sub {
        Some(m) => m,
        None => n_proj - previous_results.len(),
    };
    let progress_bar: ProgressBar = ProgressBar::new(n_proj as u64);
    progress_bar.set_style(
        indicatif::ProgressStyle::default_bar()
            .template("{elapsed} {wide_bar} {percent}% | Requests from cache: {msg}")?,
    );
    // With --sub, the bar length reflects the requested sample size.
    if sub.is_some() {
        progress_bar.set_length(n as u64);
    }
    for row in shuffled_rows {
        if n == 0 {
            break;
        }
        match row {
            Ok((id, full_name)) => {
                if !previous_results.contains(&id) {
                    // Prefer the cache; fall back to a live API request. A
                    // failed request still produces a row (default metadata
                    // with the error message in the name column).
                    let csv_row: String = if cache.contains_key(&id) {
                        request_from_cache += 1;
                        cache.get(&id).unwrap().clone()
                    } else {
                        match gh.request(&format!("https://api.github.com/repos/{full_name}")) {
                            Ok(json) => { ProjectMetadata::parse_json(&json, ())? }
                                .to_csv((id, full_name.to_string())),
                            Err(e) => ProjectMetadata::default()
                                .to_csv((id, e.to_string().trim().to_string())),
                        }
                    };
                    writeln!(&mut output_file, "{csv_row}")?;
                    progress_bar.inc(1);
                    progress_bar.set_message(request_from_cache.to_string());
                    n -= 1;
                }
            }
            Err(idx) => {
                bail!("Could not parse row {idx} in the input file")
            }
        }
    }
    Ok(())
}
/// Subset of the repository metadata returned by the GitHub
/// `GET /repos/{owner}/{repo}` endpoint, as decoded by `parse_json`.
struct ProjectMetadata {
    // Primary language; empty string when GitHub reports null.
    language: String,
    // Timestamps parsed from "created_at" / "pushed_at" / "updated_at" via
    // Self::parse_date_time — presumably epoch-based; confirm in that helper.
    created: i64,
    pushed: i64,
    updated: i64,
    // Repository status flags.
    fork: bool,
    disabled: bool,
    archived: bool,
    // Counters: "stargazers_count", "forks_count", "open_issues_count".
    stars: u32,
    forks: u32,
    issues: u32,
    has_issues: bool,
    watchers_count: u32,
    // From "subscribers_count".
    subscribers: u32,
    // Repository size as reported by the API.
    size: u64,
    // License name sanitized for CSV, or "unknown" when GitHub reports null.
    license: String,
}
impl Default for ProjectMetadata {
fn default() -> Self {
Self {
language: String::new(),
created: 0,
pushed: 0,
updated: 0,
fork: false,
disabled: false,
archived: false,
stars: 0,
forks: 0,
issues: 0,
has_issues: false,
watchers_count: 0,
subscribers: 0,
size: 0,
license: String::new(),
}
}
}
impl ToCSV for ProjectMetadata {
    type Key = (u32, String);

    /// CSV column names, in the exact order produced by `to_csv`.
    fn header() -> &'static [&'static str] {
        &[
            "id", "name", "language", "created", "pushed", "updated", "fork",
            "disabled", "archived", "stars", "forks", "issues", "has_issues",
            "watchers_count", "subscribers", "size", "license",
        ]
    }

    /// Render one metadata record as a CSV line; `key` supplies the project
    /// id and full name. Booleans are serialized as 0/1.
    fn to_csv(&self, key: Self::Key) -> String {
        let (id, name) = key;
        format!(
            "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}",
            id,
            name,
            self.language,
            self.created,
            self.pushed,
            self.updated,
            u8::from(self.fork),
            u8::from(self.disabled),
            u8::from(self.archived),
            self.stars,
            self.forks,
            self.issues,
            u8::from(self.has_issues),
            self.watchers_count,
            self.subscribers,
            self.size,
            self.license,
        )
    }
}
impl FromGitHub for ProjectMetadata {
    type Complement = ();

    /// Decode the JSON body of a GitHub `GET /repos/{owner}/{repo}` response
    /// into a `ProjectMetadata`.
    ///
    /// Nullable fields get fallbacks: `language` becomes the empty string and
    /// `license` becomes `"unknown"` when GitHub reports them as `null`.
    /// Fields are extracted in struct-literal order, which matches the
    /// original top-to-bottom extraction order.
    fn parse_json(json: &JsonValue, _complement: ()) -> Result<Self> {
        Ok(Self {
            language: if json["language"].is_null() {
                String::new()
            } else {
                get_field::<String>(json, "language")?
            },
            created: Self::parse_date_time(json, "created_at")?,
            pushed: Self::parse_date_time(json, "pushed_at")?,
            updated: Self::parse_date_time(json, "updated_at")?,
            fork: get_field::<bool>(json, "fork")?,
            disabled: get_field::<bool>(json, "disabled")?,
            archived: get_field::<bool>(json, "archived")?,
            stars: get_field::<u32>(json, "stargazers_count")?,
            forks: get_field::<u32>(json, "forks_count")?,
            issues: get_field::<u32>(json, "open_issues_count")?,
            has_issues: get_field::<bool>(json, "has_issues")?,
            watchers_count: get_field::<u32>(json, "watchers_count")?,
            subscribers: get_field::<u32>(json, "subscribers_count")?,
            size: get_field::<u64>(json, "size")?,
            // The license name is sanitized so it cannot break the CSV row.
            license: if json["license"].is_null() {
                "unknown".to_string()
            } else {
                clean_string_to_csv(&get_field::<String>(&json["license"], "name")?)
            },
        })
    }
}
#[cfg(test)]
mod tests {
    use anyhow::ensure;
    use super::*;
    use crate::utils::{dataframes::has_column, logger::test_logger};

    // Directory holding the input fixture and the expected output file.
    const TEST_DATA: &str = "tests/data/phases/metadata";

    /// End-to-end test: run the metadata phase on the `repos.csv` fixture and
    /// compare the produced CSV against `repos.csv.metadata.csv.expected`.
    ///
    /// NOTE(review): this issues real GitHub API requests using tokens from
    /// "ghtokens.csv" — it needs network access and a valid token file in the
    /// working directory. TODO confirm this is intended to run in CI.
    #[test]
    fn test_language_scraper() -> Result<()> {
        let input_file: String = format!("{TEST_DATA}/repos.csv");
        // Default output name derived from the input path (no --output given).
        let output_file: String = format!("{input_file}.metadata.csv");
        ensure!(
            std::path::Path::new(&input_file).exists(),
            "Input file does not exist"
        );
        // Start from a clean slate; second argument presumably tolerates a
        // missing file — confirm against `delete_file`'s definition.
        delete_file(&output_file, true)?;
        let tokens_file: String = "ghtokens.csv".to_string();
        run(
            &input_file,
            None,
            &tokens_file,
            None,
            0,
            false,
            "id",
            "name",
            None,
            test_logger(),
        )?;
        let output_df = open_csv(&output_file, None, None)?;
        ensure!(
            has_column(&output_df, "name"),
            "Output does not have 'name' column"
        );
        // Rows are written in shuffled order, so sort both frames by name
        // before comparing.
        let sorted_output_df = output_df.sort(vec!["name"], SortMultipleOptions::new())?;
        let expected_df = open_csv(&format!("{output_file}.expected"), None, None)?;
        ensure!(
            has_column(&expected_df, "name"),
            "Expected output does not have 'name' column"
        );
        let sorted_expected_df = expected_df.sort(vec!["name"], SortMultipleOptions::new())?;
        assert_eq!(sorted_expected_df, sorted_output_df);
        // Clean up the generated output (must exist at this point).
        delete_file(&output_file, false)
    }
}