#![doc = include_str!("../docs/languages.md")]
use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Write;
use std::iter::FromIterator as _;
use std::path::Path;
use crate::utils::csv::*;
use crate::utils::dataframes;
use crate::utils::fs::*;
use crate::utils::github::*;
use crate::utils::github_api::Github;
use crate::utils::json::*;
use crate::utils::logger::*;
use anyhow::{anyhow, bail, Context, Result};
use clap::ArgAction;
use clap::{Arg, Command};
use indicatif::ProgressBar;
use polars::frame::DataFrame;
use polars::prelude::*;
use rand::rngs::StdRng;
use rand::seq::SliceRandom as _;
use rand::SeedableRng;
use tracing::info;
/// Builds the clap `Command` describing the `languages` subcommand and all
/// of its arguments. Parsing and dispatch happen elsewhere; this function
/// only declares the interface.
pub fn cli() -> Command {
    Command::new("languages")
        .about("Collect all the languages of GitHub projects along with the hash of their latest commit.")
        .long_about(include_str!("../docs/languages.md"))
        .author("Andrea Gilot <andrea.gilot@it.uu.se>")
        .disable_version_flag(true)
        .arg(
            Arg::new("output")
                .short('o')
                .long("output")
                .value_name("OUTPUT_FILE.csv")
                .help("Path to the output csv file to store the metadata. \
                By default, the name of the output file is the same as the input file with the suffix '.languages.csv'.")
                .required(false)
        )
        .arg(
            Arg::new("input")
                .short('i')
                .long("input")
                .value_name("INPUT_FILE.csv")
                .help("Path to the input csv file to use. The file must contain at least two columns: one with the id of the projects and another one with their full name. \
                By default, the column containing the ids is named 'id' and the column containing the full names is named 'name'.")
                .required(true)
        )
        .arg(
            Arg::new("tokens")
                .short('t')
                .long("tokens")
                .value_name("TOKENS_FILE.csv")
                .help("Path to the file containing the GitHub tokens to use. It must be a valid CSV file with one column named 'token' and where every line is a \
                valid GitHub token (e.g ghp_Ab0C1D2eFg3hIjk4LM56oPqRsTuvWX7yZa8B).")
                .required(true)
        )
        .arg(
            Arg::new("cache")
                .short('c')
                .long("cache")
                .value_name("CACHE.csv")
                .help("Path to the cache file to use. Must have been generated by a previous run of this program.")
                .required(false)
        )
        .arg(
            Arg::new("seed")
                .short('s')
                .long("seed")
                .value_name("SEED")
                // Typo fix: "ramdomly" -> "randomly".
                .help("Seed used to randomly shuffle the input data.")
                .default_value("2955615809866670875")
                .value_parser(clap::value_parser!(u64)),
        )
        .arg(
            Arg::new("force")
                .short('f')
                .long("force")
                .help("Override the output file if it already exists.")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("ids")
                .long("ids")
                .help("Name of the column containing the ids of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("id")
        )
        .arg(
            Arg::new("names")
                .long("names")
                .help("Name of the column containing the full names of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("name")
        )
        .arg(
            Arg::new("sub")
                .long("sub")
                .value_name("NUMBER_OF_PROJECTS")
                .help("Number of projects to sample from the input file. \
                If not specified, all remaining projects in the input file are used.")
        )
}
/// Queries the GitHub API for the languages and latest-commit hash of every
/// project listed in `input_path`, appending one CSV row per project to the
/// output file.
///
/// * `input_path` — CSV with at least the `ids` and `names` columns.
/// * `output_path` — destination CSV; defaults to `<input>.languages.csv`.
/// * `tokens` — CSV file of GitHub tokens handed to the API client.
/// * `cache_opt` — optional CSV produced by a previous run, used to skip
///   API calls for already-known projects.
/// * `seed` — seed of the deterministic shuffle of the input rows.
/// * `force` — overwrite the output file instead of resuming from it.
/// * `ids` / `names` — column names of the project id / full name.
/// * `sub` — process at most this many projects.
///
/// # Errors
/// Fails on I/O errors, on malformed input rows, or when a successful API
/// answer cannot be parsed.
pub fn run(
    input_path: &str,
    output_path: Option<&str>,
    tokens: &str,
    cache_opt: Option<&String>,
    seed: u64,
    force: bool,
    ids: &str,
    names: &str,
    sub: Option<usize>,
    logger: &Logger,
) -> Result<()> {
    // Index of the id column within a cached CSV line.
    const ID_COL: usize = 0;
    logger.log_tokens(tokens)?;

    // Load only the id and full-name columns of the input CSV.
    let input_file: DataFrame = logger.run_task("Loading input file", || {
        open_csv(
            input_path,
            Some(Schema::from_iter(vec![
                Field::new(ids.into(), DataType::UInt32),
                Field::new(names.into(), DataType::String),
            ])),
            Some(vec![ids, names]),
        )
    })?;

    log_seed(seed);
    // Shuffle the row indices deterministically so interrupted runs resume
    // over the same ordering.
    let mut shuffled_idx: Vec<usize> = (0..input_file.height()).collect();
    logger.run_task("Loading project IDs in random order", || {
        let mut rng: StdRng = SeedableRng::seed_from_u64(seed);
        shuffled_idx.shuffle(&mut rng);
        Ok(())
    })?;
    // Lazily resolve each shuffled index into an (id, full_name) pair; a row
    // that does not match the expected types yields its index as an error.
    let shuffled_rows = shuffled_idx.into_iter().map(|idx| {
        let row = input_file.get_row(idx).unwrap().0;
        match (row[0].clone(), row[1].clone()) {
            (AnyValue::UInt32(id), AnyValue::String(name)) => Ok((id, name)),
            _ => Err(idx),
        }
    });

    let n_proj: usize = input_file.height();
    info!(" {} projects found.", n_proj);

    let default_output_path: String = format!("{}.languages.csv", &input_path);
    let output_file_path: &str = output_path.unwrap_or(&default_output_path);

    // Ids of the projects already present in the output file from a previous
    // (interrupted) run; they are skipped in the main loop below.
    let previous_results: HashSet<u32> = if force {
        HashSet::new()
    } else {
        logger.run_task("Resuming progress", || {
            Ok(if Path::new(&output_file_path).exists() {
                // BUGFIX: read the already-processed ids back from the
                // *output* file, not from the input file — reading the input
                // here marked every project as done and turned resumed runs
                // into no-ops. Note the output id column is always named
                // "id" (see ProjectInfo::header), independently of the
                // `--ids` column name of the input file.
                dataframes::u32(
                    &open_csv(
                        output_file_path,
                        Some(Schema::from_iter(vec![Field::new(
                            "id".into(),
                            DataType::UInt32,
                        )])),
                        Some(vec!["id"]),
                    )?,
                    "id",
                )?
                .into_iter()
                .collect()
            } else {
                HashSet::new()
            })
        })?
    };
    if !previous_results.is_empty() {
        info!(
            " the languages of {} projects have already been queried",
            previous_results.len()
        );
    }

    let mut output_file: CSVFile = CSVFile::new(
        output_file_path,
        if force {
            FileMode::Overwrite
        } else {
            FileMode::Append
        },
    )?;
    output_file.write_header(ProjectInfo::header())?;

    // Optional id -> csv-line cache produced by a previous run; a cache hit
    // skips the GitHub API entirely.
    let cache: HashMap<u32, String> = logger.run_task("Loading cache", || {
        Ok(match cache_opt {
            Some(cache_path) => {
                let cache = CSVFile::new(cache_path, FileMode::Read)?;
                cache.indexed_lines(ID_COL)?
            }
            None => HashMap::new(),
        })
    })?;
    info!(" {} projects found in the cache.", cache.len());

    let mut request_from_cache: usize = 0;
    let gh = Github::new(tokens);
    info!("Starting to query the GitHub API...");
    // Number of projects left to process. Saturate the subtraction so a
    // stale or oversized output file cannot underflow and panic.
    let mut n: usize = sub.unwrap_or_else(|| n_proj.saturating_sub(previous_results.len()));
    let progress_bar: ProgressBar = ProgressBar::new(n_proj as u64);
    progress_bar.set_style(
        indicatif::ProgressStyle::default_bar()
            .template("{elapsed} {wide_bar} {percent}% | Requests from cache: {msg}")?,
    );
    if sub.is_some() {
        progress_bar.set_length(n as u64);
    }

    for row in shuffled_rows {
        if n == 0 {
            break;
        }
        match row {
            Ok((id, full_name)) => {
                if !previous_results.contains(&id) {
                    // Single map lookup instead of contains_key + get/unwrap.
                    let csv_row: String = if let Some(cached) = cache.get(&id) {
                        request_from_cache += 1;
                        cached.clone()
                    } else {
                        // Query the language stats and the commit list; on
                        // any API error, emit a default row carrying the
                        // error text in the name column.
                        let request1 = gh.request(&format!(
                            "https://api.github.com/repos/{full_name}/languages"
                        ));
                        let request2 = gh
                            .request(&format!("https://api.github.com/repos/{full_name}/commits"));
                        match (request1, request2) {
                            (Ok(json_lang), Ok(json_commits)) => {
                                ProjectInfo::from_json(&json_lang, &json_commits)?
                                    .to_csv((id, full_name.to_string()))
                            }
                            (Err(e), _) => ProjectInfo::default().to_csv((id, e.to_string())),
                            (_, Err(e)) => ProjectInfo::default().to_csv((id, e.to_string())),
                        }
                    };
                    writeln!(&mut output_file, "{csv_row}")?;
                    progress_bar.inc(1);
                    progress_bar.set_message(request_from_cache.to_string());
                    n -= 1;
                }
            }
            Err(idx) => {
                bail!("Could not parse row {idx} in the input file");
            }
        }
    }
    Ok(())
}
/// Language statistics and latest-commit hash of a single GitHub project,
/// as assembled from the `languages` and `commits` API answers.
#[derive(Default)]
struct ProjectInfo {
    // Maps a language name to its size as reported by the API.
    // NOTE(review): presumably the size is in bytes of code — confirm
    // against the GitHub `languages` endpoint documentation.
    languages: HashMap<String, i64>,
    // Hash of the project's latest commit; empty string in the Default case
    // (used for rows recording an API error).
    latest_commit: String,
}
impl ToCSV for ProjectInfo {
    type Key = (u32, String);

    /// Renders one output row: `<id>,<name>,<lang1:size1;...>,<sha>`.
    fn to_csv(&self, key: Self::Key) -> String {
        let (id, name) = key;
        let langs = Self::print_languages(&self.languages);
        format!("{id},{name},{langs},{}", self.latest_commit)
    }

    /// Column names of the output CSV, matching `to_csv`'s field order.
    fn header() -> &'static [&'static str] {
        &["id", "name", "languages", "latest_commit"]
    }
}
impl ProjectInfo {
    /// Builds a `ProjectInfo` from the JSON answers of the GitHub
    /// `languages` and `commits` endpoints.
    ///
    /// # Errors
    /// Fails when a language size is not an integer, or when the first
    /// commit entry carries no `sha` field.
    fn from_json(json_lang: &json::JsonValue, json_commit: &json::JsonValue) -> Result<Self> {
        let mut languages: HashMap<String, i64> = HashMap::new();
        for (lan, size) in json_lang.entries() {
            languages.insert(
                lan.to_owned(),
                size.as_i64()
                    .with_context(|| anyhow!("Could not parse the size of the language {lan}"))?,
            );
        }
        // The commits endpoint returns newest-first, so entry 0 is the
        // latest commit.
        let sha = get_field::<String>(&json_commit[0], "sha")?;
        Ok(Self {
            languages,
            latest_commit: sha,
        })
    }

    /// Serializes the language map as `name:size` pairs joined by ';'.
    ///
    /// BUGFIX: the pairs are now sorted before joining. Iterating a
    /// `HashMap` directly yields a nondeterministic order, so the same
    /// project produced differently-ordered language columns on every run,
    /// defeating diffing/caching of the output.
    fn print_languages(languages: &HashMap<String, i64>) -> String {
        let mut pairs: Vec<String> = languages
            .iter()
            .map(|(name, size)| format!("{name}:{size}"))
            .collect();
        // Keys are unique, so sorting the "name:size" strings orders by name.
        pairs.sort_unstable();
        pairs.join(";")
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::ensure;

    // Directory holding the CSV fixtures for this phase.
    const TEST_DATA: &str = "tests/data/phases/languages";

    /// End-to-end run of the language scraper over a small fixture file.
    /// Requires a valid `ghtokens.csv` token file in the working directory
    /// and network access to the GitHub API.
    #[test]
    fn test_language_scraper() -> Result<()> {
        let input: String = format!("{TEST_DATA}/repos.csv");
        let expected_output: String = format!("{input}.languages.csv");
        ensure!(std::path::Path::new(&input).exists());
        // Start from a clean slate; tolerate a missing previous output.
        delete_file(&expected_output, true)?;
        let tokens: String = "ghtokens.csv".to_string();
        run(
            &input,
            None,
            &tokens,
            None,
            0,
            false,
            "id",
            "name",
            None,
            test_logger(),
        )?;
        // The run must have produced the output file; clean it up, failing
        // if it does not exist.
        delete_file(&expected_output, false)
    }
}