use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Write;
use std::iter::FromIterator as _;
use std::path::Path;
use crate::utils::csv::*;
use crate::utils::error::*;
use crate::utils::fs::*;
use crate::utils::github::*;
use crate::utils::github_api::Github;
use crate::utils::json::*;
use crate::utils::logger::*;
use clap::ArgAction;
use clap::{Arg, Command};
use indicatif::ProgressBar;
use polars::frame::DataFrame;
use polars::prelude::*;
use rand::rngs::StdRng;
use rand::seq::SliceRandom as _;
use rand::SeedableRng;
/// Build the clap `Command` for the `languages` subcommand.
///
/// Declares all flags/options consumed by [`run`]: input/output/tokens/cache
/// paths, the shuffle seed, the id/name column names, an optional sample
/// size, and a `--force` overwrite switch.
pub fn cli() -> Command {
Command::new("languages")
.about("Collect all the languages of GitHub projects along with the hash of their latest commit.")
.long_about(
"Collect all the languages of GitHub projects. A list of ids and full names of projects must be provided \
in a CSV file.\nThe program will then make requests to the GitHub API to collect the list of languages of each project and the \
number of bytes for each of these languages. In addition the SHA of the latest commit is also saved.\n\
Projects are chosen randomly without replacement. The data are saved in a new CSV file.\nIf the program is interrupted, it \
can be restarted and will continue from where it left off.\n The program can also optionally use a cache file to save requests.\n\
The name of the output file is the same as the input file with the suffix '.languages.csv'.\n"
)
.author("Andrea Gilot <andrea.gilot@it.uu.se>")
.disable_version_flag(true)
.arg(
Arg::new("output")
.short('o')
.long("output")
.value_name("OUTPUT_FILE.csv")
.help("Path to the output csv file to store the metadata. \
By default, the name of the output file is the same as the input file with the suffix '.languages.csv'.")
.required(false)
)
.arg(
Arg::new("input")
.short('i')
.long("input")
.value_name("INPUT_FILE.csv")
.help("Path to the input csv file to use. It must be a valid CSV file where the first column is the id of the project, \
the second column is the full name of the project and the third column is the hash of the latest commit. Other columns are ignored.")
.required(true)
)
.arg(
Arg::new("tokens")
.short('t')
.long("tokens")
.value_name("TOKENS_FILE.csv")
.help("Path to the file containing the GitHub tokens to use. It must be a valid CSV file with one column named 'token' and where every line is a \
valid GitHub token (e.g ghp_Ab0C1D2eFg3hIjk4LM56oPqRsTuvWX7yZa8B).")
.required(true)
)
.arg(
Arg::new("cache")
.short('c')
.long("cache")
.value_name("CACHE.csv")
.help("Path to the cache file to use. Must have been generated by a previous run of this program.")
.required(false)
)
.arg(
Arg::new("seed")
.short('s')
.long("seed")
.value_name("SEED")
.help("Seed used to randomly shuffle the input data.")
.default_value("2955615809866670875")
.value_parser(clap::value_parser!(u64)),
)
.arg(
Arg::new("force")
.short('f')
.long("force")
.help("Override the output file if it already exists.")
.action(ArgAction::SetTrue),
)
.arg(
Arg::new("ids")
.long("ids")
.help("Name of the column containing the ids of the projects.")
.value_name("COLUMN_NAME")
.default_value("id")
)
.arg(
Arg::new("names")
.long("names")
.help("Name of the column containing the full names of the projects.")
.value_name("COLUMN_NAME")
.default_value("name")
)
.arg(
Arg::new("sub")
.long("sub")
.value_name("NUMBER_OF_PROJECTS")
.help("Number of projects to sample from the input file. \
If not specified, all remaining projects in the input file are used.")
// Without a typed parser, clap stores the value as a String and a
// later `get_one::<usize>("sub")` panics at runtime (cf. the "seed"
// arg above, which already follows this convention).
.value_parser(clap::value_parser!(usize)),
)
}
pub fn run(
input_path: &str,
output_path: Option<&str>,
tokens: &str,
cache_opt: Option<&String>,
seed: u64,
force: bool,
ids: &str,
names: &str,
sub: Option<usize>,
logger: &mut Logger,
) -> Result<(), Error> {
const ID_COL: usize = 0;
logger.log_tokens(tokens)?;
let input_file: DataFrame = logger.log_completion("Loading input file", || {
open_csv(
input_path,
Some(Schema::from_iter(vec![
Field::new(ids.into(), DataType::UInt32),
Field::new(names.into(), DataType::String),
])),
Some(vec![ids, names]),
)
})?;
logger.log_seed(seed)?;
let mut shuffled_idx: Vec<usize> = (0..input_file.height()).collect();
logger.log_completion("Loading project IDs in random order", || {
let mut rng: StdRng = SeedableRng::seed_from_u64(seed);
shuffled_idx.shuffle(&mut rng);
Ok(())
})?;
let shuffled_rows = shuffled_idx.into_iter().map(|idx| {
let row = input_file.get_row(idx).unwrap().0;
match (row[0].clone(), row[1].clone()) {
(AnyValue::UInt32(id), AnyValue::String(name)) => Ok((id, name)),
_ => Err(idx),
}
});
let n_proj: usize = input_file.height();
logger.log(&format!(" {} projects found.", n_proj))?;
let default_output_path: String = format!("{}.languages.csv", &input_path);
let output_file_path: &str = output_path.unwrap_or(&default_output_path);
let previous_results: HashSet<u32> = if force {
HashSet::new()
} else {
logger.log_completion("Resuming progress", || {
Ok(if Path::new(&output_file_path).exists() {
map_err(
map_err(
open_csv(
input_path,
Some(Schema::from_iter(vec![Field::new(
ids.into(),
DataType::UInt32,
)])),
Some(vec![ids]),
)?
.column(ids),
"Could not extract the ids from the output file",
)?
.u32(),
"Could not convert the ids to u32",
)?
.iter()
.map(|x| x.unwrap())
.collect()
} else {
HashSet::new()
})
})?
};
if !previous_results.is_empty() {
logger.log(&format!(
" the languages of {} projects have already been queried",
previous_results.len()
))?;
}
let mut output_file: CSVFile = CSVFile::new(
output_file_path,
if force {
FileMode::Overwrite
} else {
FileMode::Append
},
)?;
output_file.write_header(ProjectInfo::header())?;
let cache: HashMap<u32, String> = logger.log_completion("Loading cache", || {
Ok(match cache_opt {
Some(cache_path) => {
let cache = CSVFile::new(cache_path, FileMode::Read)?;
cache.indexed_lines(ID_COL)?
}
None => HashMap::new(),
})
})?;
logger.log(&format!(" {} projects found in the cache.", cache.len()))?;
let mut request_from_cache: usize = 0;
let gh = Github::new(tokens);
logger.log("Starting to query the GitHub API...")?;
let mut n: usize = match sub {
Some(m) => m,
None => n_proj - previous_results.len(),
};
let progress_bar: ProgressBar = ProgressBar::new(n_proj as u64);
progress_bar.set_style(
indicatif::ProgressStyle::default_bar()
.template("{elapsed} {wide_bar} {percent}% | Requests from cache: {msg}")
.unwrap(),
);
if sub.is_some() {
progress_bar.set_length(n as u64);
}
for row in shuffled_rows {
if n == 0 {
break;
}
match row {
Ok((id, full_name)) => {
if !previous_results.contains(&id) {
let csv_row: String = if cache.contains_key(&id) {
request_from_cache += 1;
cache.get(&id).unwrap().clone()
} else {
let request1 = gh.request(&format!(
"https://api.github.com/repos/{}/languages",
full_name
));
let request2 = gh.request(&format!(
"https://api.github.com/repos/{}/commits",
full_name
));
match (request1, request2) {
(Ok(json_lang), Ok(json_commits)) => {
ProjectInfo::from_json(&json_lang, &json_commits)?
.to_csv((id, full_name.to_string()))
}
(Err(e), _) => ProjectInfo::default().to_csv((id, e.to_string())),
(_, Err(e)) => ProjectInfo::default().to_csv((id, e.to_string())),
}
};
map_err(
writeln!(&mut output_file, "{}", csv_row),
&format!("Could not write to file {}", &output_file_path),
)?;
progress_bar.inc(1);
progress_bar.set_message(request_from_cache.to_string());
n -= 1;
}
}
Err(idx) => {
map_err(
row,
&format!("Could not parse row {} in the input file", idx),
)?;
}
}
}
Ok(())
}
/// Per-project data scraped from the GitHub API.
///
/// `Default` produces an empty language map and an empty SHA; it is used as
/// the placeholder row when an API request fails (see `run`).
#[derive(Default)]
struct ProjectInfo {
    // Language name -> number of bytes written in that language.
    languages: HashMap<String, i64>,
    // SHA of the project's most recent commit.
    latest_commit: String,
}
impl ToCSV for ProjectInfo {
    /// Rows are keyed by the project's numeric id and full name.
    type Key = (u32, String);

    /// Serialize this project as one CSV row: `id,name,languages,latest_commit`.
    fn to_csv(&self, key: Self::Key) -> String {
        let (id, name) = key;
        [
            id.to_string(),
            name,
            Self::print_languages(&self.languages),
            self.latest_commit.clone(),
        ]
        .join(",")
    }

    /// Column names matching the field order produced by `to_csv`.
    fn header() -> &'static [&'static str] {
        &["id", "name", "languages", "latest_commit"]
    }
}
impl ProjectInfo {
    /// Build a `ProjectInfo` from the two GitHub API responses: the
    /// `/languages` JSON object (language -> byte count) and the `/commits`
    /// JSON array, whose first element is the latest commit.
    ///
    /// # Errors
    /// Fails if a language size is not an integer or if the first commit
    /// entry has no `"sha"` field.
    fn from_json(
        json_lang: &json::JsonValue,
        json_commit: &json::JsonValue,
    ) -> Result<Self, Error> {
        let mut languages: HashMap<String, i64> = HashMap::new();
        for (lan, size) in json_lang.entries() {
            languages.insert(
                lan.to_owned(),
                ok_or_else(
                    size.as_i64(),
                    &format!("Could not parse the size of the language {}", lan),
                )?,
            );
        }
        // The commits endpoint returns newest-first, so index 0 is the
        // latest commit.
        let sha = get_field::<String>(&json_commit[0], "sha")?;
        Ok(Self {
            languages,
            latest_commit: sha,
        })
    }

    /// Render the language map as `"name:bytes"` pairs joined by `';'`.
    fn print_languages(languages: &HashMap<String, i64>) -> String {
        // BUG FIX: sort the entries before joining. `HashMap` iteration
        // order is randomized per process, so the emitted field was
        // nondeterministic, making output files impossible to diff or cache
        // reliably across runs.
        let mut entries: Vec<String> = languages
            .iter()
            .map(|(k, v)| format!("{}:{}", k, v))
            .collect();
        entries.sort_unstable();
        entries.join(";")
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Directory holding the fixture CSV files for this phase.
    const TEST_DATA: &str = "tests/data/phases/languages";

    /// End-to-end smoke test: run the scraper on the fixture repo list and
    /// check that it completes without error, cleaning up its output.
    #[test]
    fn test_language_scraper() {
        let input_file = format!("{}/repos.csv", TEST_DATA);
        let output_file = format!("{}.languages.csv", input_file);
        assert!(std::path::Path::new(&input_file).exists());
        // Start from a clean slate so a stale output file cannot make the
        // scraper skip every project.
        assert!(delete_file(&output_file, true).is_ok());
        let tokens_file = String::from("ghtokens.csv");
        let result = run(
            &input_file,
            None,
            &tokens_file,
            None,
            0,
            false,
            "id",
            "name",
            None,
            &mut Logger::new(),
        );
        assert!(result.is_ok());
        assert!(delete_file(&output_file, false).is_ok());
    }
}