use std::collections::HashSet;
use std::fmt::Write as _;
use std::io::Write;
use std::iter::FromIterator as _;
use std::path::Path;
use crate::utils::csv::*;
use crate::utils::dataframes::u32;
use crate::utils::error::*;
use crate::utils::fs::*;
use crate::utils::github::*;
use crate::utils::github_api::*;
use crate::utils::json::*;
use crate::utils::logger::Logger;
use clap::ArgAction;
use clap::{Arg, Command};
use indicatif::ProgressBar;
use json::JsonValue;
use polars::frame::DataFrame;
use polars::prelude::*;
use rand::rngs::StdRng;
use rand::seq::SliceRandom as _;
use rand::SeedableRng;
/// Builds the `pr` subcommand, which collects the pull requests of GitHub projects.
///
/// The command declares the following arguments, in this order: `output`, `input`, `tokens`,
/// `dest`, `seed`, `force`, `ids`, `names` and `sub`. Only `input`, `tokens` and `dest` are
/// required; `seed`, `ids` and `names` carry default values.
pub fn cli() -> Command {
    // Long description shown by `--help`. The trailing backslashes join the source lines
    // without inserting the indentation into the text.
    const LONG_ABOUT: &str =
        "Collect pull requests of GitHub projects. The input file must be a valid CSV file where one of the columns (\"name\") contains the full names of the projects, and another one (\"id\") contains their ids.\n\
        The program sends requests to the GitHub API to collect metadata about the pull requests of the projects in the input file, as well as the comments in each pull request.\n\
        Projects are chosen randomly without replacement. The metadata of the pull requests is saved in a new CSV file.\nIf the program is interrupted, it \
        can be restarted and will continue from where it left off.\n The comments of each pull request are saved in separate CSV files in the target directory.\n\
        By default, the name of the output file is the same as the input file with the suffix '.pulls.csv'.\n";

    // Optional path of the metadata CSV file.
    let output_arg = Arg::new("output")
        .short('o')
        .long("output")
        .value_name("OUTPUT_FILE.csv")
        .help("Path to the output csv file to store the metadata. \
            By default, the name of the output file is the same as the input file with the suffix '.pulls.csv'.")
        .required(false);

    // Mandatory path of the CSV file listing the projects.
    let input_arg = Arg::new("input")
        .short('i')
        .long("input")
        .value_name("INPUT_FILE.csv")
        .help("Path to the input csv file to use. One of the columns must contain the full names of the projects. ")
        .required(true);

    // Mandatory path of the CSV file listing the GitHub tokens.
    let tokens_arg = Arg::new("tokens")
        .short('t')
        .long("tokens")
        .value_name("TOKENS_FILE.csv")
        .help("Path to the file containing the GitHub tokens to use. It must be a valid CSV file with one column named 'token' and where every line is a \
            valid GitHub token (e.g ghp_Ab0C1D2eFg3hIjk4LM56oPqRsTuvWX7yZa8B).")
        .required(true);

    // Mandatory directory receiving one comments file per pull request.
    let dest_arg = Arg::new("dest")
        .short('d')
        .long("dest")
        .aliases(["target", "destination"])
        .value_name("DESTINATION")
        .help("Path to the directory where to store the pull request comments.")
        .required(true);

    // Seed of the shuffle, parsed as a `u64`.
    let seed_arg = Arg::new("seed")
        .short('s')
        .long("seed")
        .value_name("SEED")
        .help("Seed used to randomly shuffle the input data.")
        .default_value("9990520807055774474")
        .value_parser(clap::value_parser!(u64));

    // Boolean flag: set to true when present.
    let force_arg = Arg::new("force")
        .short('f')
        .long("force")
        .help("Override the output file if it already exists.")
        .action(ArgAction::SetTrue);

    // Names of the input columns holding the project ids and full names.
    let ids_arg = Arg::new("ids")
        .long("ids")
        .help("Name of the column containing the ids of the projects.")
        .value_name("COLUMN_NAME")
        .default_value("id");
    let names_arg = Arg::new("names")
        .long("names")
        .help("Name of the column containing the full names of the projects.")
        .value_name("COLUMN_NAME")
        .default_value("name");

    // Optional cap on the number of projects to process.
    let sub_arg = Arg::new("sub")
        .long("sub")
        .value_name("NUMBER_OF_PROJECTS")
        .help("Number of projects to sample from the input file. \
            If not specified, all remaining projects in the input file are used.");

    Command::new("pr")
        .about("Collect pull requests of GitHub projects")
        .long_about(LONG_ABOUT)
        .author("Andrea Gilot <andrea.gilot@it.uu.se>")
        .disable_version_flag(true)
        .arg(output_arg)
        .arg(input_arg)
        .arg(tokens_arg)
        .arg(dest_arg)
        .arg(seed_arg)
        .arg(force_arg)
        .arg(ids_arg)
        .arg(names_arg)
        .arg(sub_arg)
}
/// Collects the pull requests (and their comments) of the projects listed in a CSV file.
///
/// The projects of the input file are visited in a pseudo-random order derived from `seed`.
/// For every project that is not already present in the output file, all pull requests are
/// fetched from the GitHub API, their comments are stored under `target`, and one CSV row per
/// pull request is appended to the output file.
///
/// # Arguments
///
/// * `input_path` - Path to the input CSV file.
/// * `output_path` - Path to the output CSV file; defaults to `<input_path>.pulls.csv`.
/// * `tokens` - Path to the file containing the GitHub tokens.
/// * `seed` - Seed of the random generator used to shuffle the projects.
/// * `force` - When true, previous results are ignored and the output file is overwritten.
/// * `ids` - Name of the input column containing the project ids.
/// * `names` - Name of the input column containing the project full names.
/// * `target` - Directory under which the pull request comments are stored.
/// * `sub` - Maximum number of projects to process; all remaining projects when `None`.
/// * `logger` - Logger used to report progress.
///
/// # Errors
///
/// Returns an error if the input or output file cannot be read or written, if a row of the
/// input file does not hold a `u32` id and a string name, or if a GitHub request fails.
pub fn run(
    input_path: &str,
    output_path: Option<&String>,
    tokens: &str,
    seed: u64,
    force: bool,
    ids: &str,
    names: &str,
    target: &str,
    sub: Option<usize>,
    logger: &mut Logger,
) -> Result<(), Error> {
    logger.log_tokens(tokens)?;

    // Only the id and name columns of the input file are loaded.
    let input_file: DataFrame = logger.log_completion("Loading input file", || {
        open_csv(
            input_path,
            Some(Schema::from_iter(vec![
                Field::new(ids.into(), DataType::UInt32),
                Field::new(names.into(), DataType::String),
            ])),
            Some(vec![ids, names]),
        )
    })?;

    logger.log_seed(seed)?;

    // Row indices are shuffled with a seeded generator so that the order is reproducible.
    let mut shuffled_idx: Vec<usize> = (0..input_file.height()).collect();
    logger.log_completion("Loading project IDs in random order", || {
        let mut rng: StdRng = SeedableRng::seed_from_u64(seed);
        shuffled_idx.shuffle(&mut rng);
        Ok(())
    })?;

    // Lazily turns each shuffled index into an (id, name) pair, or into the offending index
    // when the row does not have the expected types.
    let shuffled_rows = shuffled_idx.into_iter().map(|idx| {
        let row = input_file.get_row(idx).unwrap().0;
        match (row[0].clone(), row[1].clone()) {
            (AnyValue::UInt32(id), AnyValue::String(name)) => Ok((id, name)),
            _ => Err(idx),
        }
    });

    let n_pr: usize = input_file.height();
    logger.log(&format!("  {} PR found.", n_pr))?;

    let default_output_path: String = format!("{}.pulls.csv", &input_path);
    let output_file_path: &str = output_path.unwrap_or(&default_output_path);

    // Ids of the projects whose pull requests were written by a previous run.
    let previous_results: HashSet<u32> = if force {
        HashSet::new()
    } else {
        logger.log_completion("Resuming progress", || {
            Ok(if Path::new(output_file_path).exists() {
                // Progress is recorded in the OUTPUT file. Its id column is named after the
                // first entry of the header written below, not after the input column `ids`.
                let done_column: &str = PRMetadata::header()[0];
                let df_res: DataFrame = open_csv(
                    output_file_path,
                    Some(Schema::from_iter(vec![Field::new(
                        done_column.into(),
                        DataType::UInt32,
                    )])),
                    Some(vec![done_column]),
                )?;
                u32(&df_res, done_column)?.into_iter().collect()
            } else {
                HashSet::new()
            })
        })?
    };

    if !previous_results.is_empty() {
        logger.log(&format!(
            "  the metadata of {} projects have already been queried",
            previous_results.len()
        ))?;
    }

    let mut output_file: CSVFile = CSVFile::new(
        output_file_path,
        if force {
            FileMode::Overwrite
        } else {
            FileMode::Append
        },
    )?;
    output_file.write_header(PRMetadata::header())?;

    let gh = Github::new(tokens);

    logger.log("Starting to query the GitHub API...")?;

    // Number of projects still to process. The subtraction saturates because the output
    // file may contain ids that are not part of the current input file.
    let mut n: usize = match sub {
        Some(m) => m,
        None => n_pr.saturating_sub(previous_results.len()),
    };

    let progress_bar: ProgressBar = ProgressBar::new(n_pr as u64);
    progress_bar.set_style(
        indicatif::ProgressStyle::default_bar()
            .template("{elapsed} {wide_bar} {percent}%")
            .unwrap(),
    );
    if sub.is_some() {
        progress_bar.set_length(n as u64);
    }

    for row in shuffled_rows {
        if n == 0 {
            break;
        }
        match row {
            Ok((id, full_name)) => {
                if !previous_results.contains(&id) {
                    // All rows of a project are buffered and written at once.
                    let mut pull_requests: String = String::new();
                    for pr_res in scrape_pages(
                        &gh,
                        &|per_page, page| {
                            format!("https://api.github.com/repositories/{}/pulls?state=all&per_page={}&page={}", id, per_page, page)
                        },
                        &|json| {
                            let mut pr_metadata: PRMetadata =
                                PRMetadata::parse_json(&json, (id, target.to_string()))?;
                            // Best effort: when the comments cannot be scraped, the row is
                            // still written but without a path to a comments file.
                            scrape_pr_comments(&gh, id, &pr_metadata).unwrap_or_else(|_| {
                                pr_metadata.file_path = String::new();
                            });
                            Ok(pr_metadata)
                        },
                    )? {
                        // A pull request that could not be parsed is recorded as a default row.
                        let obj: PRMetadata = pr_res.unwrap_or_default();
                        map_err(
                            writeln!(
                                &mut pull_requests,
                                "{}",
                                obj.to_csv((id, full_name.to_string()))
                            ),
                            "Could not write to string builder",
                        )?;
                    }
                    map_err(
                        write!(&mut output_file, "{}", pull_requests),
                        &format!("Could not write to file {}", &output_file_path),
                    )?;
                    progress_bar.inc(1);
                    n -= 1;
                }
            }
            Err(idx) => {
                map_err(
                    row,
                    &format!("Could not parse row {} in the input file", idx),
                )?;
            }
        }
    }
    Ok(())
}
/// Metadata of a single pull request, as parsed from the GitHub API.
///
/// The timestamp fields are set to 0 when the corresponding JSON field is null.
/// NOTE(review): the timestamps come from `parse_date_time`, presumably as Unix timestamps —
/// confirm against the `FromGitHub` trait.
#[derive(Debug, Clone, Eq, PartialEq, Default)]
struct PRMetadata {
/// Value of the `number` field of the pull request.
pr_number: u32,
/// Path of the CSV file storing the comments of the pull request; emptied by `run` when the
/// comments could not be scraped.
file_path: String,
/// Login of the user who opened the pull request.
user: String,
/// Id of the user who opened the pull request.
user_id: u64,
/// Value parsed from the `created_at` field.
created_at: u64,
/// Value parsed from the `updated_at` field.
updated_at: u64,
/// Value parsed from the `closed_at` field.
closed_at: u64,
/// Value parsed from the `merged_at` field.
merged_at: u64,
/// Value of the `draft` field.
draft: bool,
/// Value of the `state` field.
state: String,
/// Body of the pull request after `clean_string_to_csv`, or an empty string when null.
/// It is not part of the metadata CSV row; it is written to the comments file instead.
body: String,
}
/// CSV serialization of the pull request metadata.
///
/// The key is the `(id, full name)` pair of the project owning the pull request; it fills the
/// first two columns of every row. The `body` field is not serialized.
impl ToCSV for PRMetadata {
    type Key = (u32, String);

    /// Column names of the metadata file, in the order used by `to_csv`.
    fn header() -> &'static [&'static str] {
        static COLUMNS: [&str; 12] = [
            "id",
            "name",
            "pr_number",
            "file_path",
            "user",
            "user_id",
            "created_at",
            "updated_at",
            "closed_at",
            "merged_at",
            "draft",
            "state",
        ];
        &COLUMNS
    }

    /// Formats the pull request as one comma-separated row (without trailing newline).
    fn to_csv(&self, key: Self::Key) -> String {
        let (project_id, project_name) = key;
        let Self {
            pr_number,
            file_path,
            user,
            user_id,
            created_at,
            updated_at,
            closed_at,
            merged_at,
            draft,
            state,
            ..
        } = self;
        // The draft flag is written as 1 or 0.
        let draft_flag = u8::from(*draft);
        format!(
            "{project_id},{project_name},{pr_number},{file_path},{user},{user_id},{created_at},{updated_at},{closed_at},{merged_at},{draft_flag},{state}"
        )
    }
}
impl FromGitHub for PRMetadata {
type Complement = (u32, String);
fn parse_json(json: &JsonValue, complement: Self::Complement) -> Result<Self, Error> {
let pr_number: u32 = get_field::<u32>(json, "number")?;
let created_at: i64 = if field_is_null(json, "created_at")? {
0
} else {
Self::parse_date_time(json, "created_at")?
};
let updated_at: i64 = if field_is_null(json, "updated_at")? {
0
} else {
Self::parse_date_time(json, "updated_at")?
};
let closed_at: i64 = if field_is_null(json, "closed_at")? {
0
} else {
Self::parse_date_time(json, "closed_at")?
};
let merged_at: i64 = if field_is_null(json, "merged_at")? {
0
} else {
Self::parse_date_time(json, "merged_at")?
};
let draft: bool = get_field::<bool>(json, "draft")?;
let state: String = get_field::<String>(json, "state")?;
let user_json: &JsonValue = &json["user"];
let user: String = get_field::<String>(user_json, "login")?;
let user_id: u64 = get_field::<u64>(user_json, "id")?;
let path: String = format!(
"{}/{}/{}/{}_{}.csv",
complement.1,
complement.0 % 10000,
complement.0,
complement.0,
pr_number
);
let body: String = if field_is_null(json, "body")? {
"".to_string()
} else {
clean_string_to_csv(&get_field::<String>(json, "body")?)
};
Ok(Self {
file_path: path,
pr_number,
created_at: created_at as u64,
updated_at: updated_at as u64,
closed_at: closed_at as u64,
merged_at: merged_at as u64,
draft,
state,
user,
user_id,
body,
})
}
}
fn scrape_pages<T>(
gh: &Github,
request: &dyn Fn(usize, usize) -> String,
func: &dyn Fn(JsonValue) -> Result<T, Error>,
) -> Result<Vec<Result<T, Error>>, Error> {
let mut page: usize = 1;
const PER_PAGE: usize = 100;
let mut is_null: bool = false;
let mut items: Vec<Result<T, Error>> = Vec::new();
while !is_null {
match gh.request(&request(PER_PAGE, page)) {
Ok(json) => {
if json.is_empty() {
is_null = true;
} else {
items.extend(json.members().map(|item| func(item.clone())));
if items.is_empty() {
is_null = true;
} else {
page += 1;
}
}
}
Err(e) => map(
e,
&format!("Error during GitHub request {}", &request(PER_PAGE, page)),
)
.to_res()?,
}
}
Ok(items)
}
/// Origin of a row in the comments file of a pull request.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum PRCommentType {
/// Entry fetched from the `pulls/<number>/reviews` endpoint.
Review,
/// Entry fetched from the `pulls/<number>/comments` endpoint.
Code,
/// Entry fetched from the `issues/<number>/comments` endpoint.
Discussion,
/// The body of the pull request itself.
Body,
/// Placeholder used for an entry that could not be parsed.
Error,
}
/// A single row of the comments file of a pull request.
#[derive(Debug)]
struct PRComment {
/// Id of the comment; 0 for the pull request body and -1 for the error placeholder.
id: i64,
/// Login of the author.
user: String,
/// Id of the author.
user_id: u64,
/// Origin of the comment.
comment_type: PRCommentType,
/// Value parsed from `submitted_at` for reviews and from `created_at` otherwise; 0 when null.
created_at: u64,
/// Text of the comment; passed through `clean_string_to_csv` when serialized.
body: String,
}
/// CSV serialization of a pull request comment. No key is needed to build a row.
impl ToCSV for PRComment {
    type Key = ();

    /// Column names of a comments file, in the order used by `to_csv`.
    fn header() -> &'static [&'static str] {
        static COLUMNS: [&str; 6] = ["id", "user", "user_id", "type", "created_at", "body"];
        &COLUMNS
    }

    /// Formats the comment as one comma-separated row (without trailing newline).
    ///
    /// The body is cleaned with `clean_string_to_csv` and wrapped in double quotes.
    fn to_csv(&self, _key: Self::Key) -> String {
        let type_label: &str = match self.comment_type {
            PRCommentType::Review => "review",
            PRCommentType::Code => "code",
            PRCommentType::Discussion => "discussion",
            PRCommentType::Body => "body",
            PRCommentType::Error => "error",
        };
        let cleaned_body = clean_string_to_csv(&self.body);
        format!(
            "{},{},{},{},{},\"{}\"",
            self.id, self.user, self.user_id, type_label, self.created_at, cleaned_body
        )
    }
}
impl Default for PRComment {
fn default() -> Self {
Self {
id: -1,
user: String::new(),
user_id: 0,
comment_type: PRCommentType::Error,
created_at: 0,
body: String::new(),
}
}
}
/// Construction of a comment from the JSON returned by the GitHub API.
///
/// The complement is the type of comment being parsed, which selects the timestamp field.
impl FromGitHub for PRComment {
    type Complement = PRCommentType;

    /// Parses one comment or review object.
    ///
    /// Returns an error when `id`, `user.login` or `user.id` cannot be read. A null timestamp
    /// is mapped to 0 and a null body to an empty string.
    fn parse_json(json: &JsonValue, complement: PRCommentType) -> Result<Self, Error> {
        let raw_id: u64 = get_field::<u64>(json, "id")?;

        let author = &json["user"];
        let user: String = get_field::<String>(author, "login")?;
        let user_id: u64 = get_field::<u64>(author, "id")?;

        // Reviews carry their date in `submitted_at`; every other type uses `created_at`.
        let date_field = if complement == PRCommentType::Review {
            "submitted_at"
        } else {
            "created_at"
        };
        let created_at: i64 = if field_is_null(json, date_field)? {
            0
        } else {
            PRMetadata::parse_date_time(json, date_field)?
        };

        let body: String = if field_is_null(json, "body")? {
            String::new()
        } else {
            get_field::<String>(json, "body")?
        };

        Ok(Self {
            id: raw_id as i64,
            user,
            user_id,
            comment_type: complement,
            created_at: created_at as u64,
            body,
        })
    }
}
/// Scrapes every comment of a pull request and stores them in the file `pr.file_path`.
///
/// The file starts with the header, followed by a row holding the body of the pull request,
/// then the discussion comments, the code comments and the reviews, in that order. An entry
/// that cannot be parsed is replaced by the placeholder row of `PRComment::default()`.
///
/// # Arguments
///
/// * `gh` - Client used to send the requests.
/// * `repo_id` - Id of the repository owning the pull request.
/// * `pr` - Metadata of the pull request.
///
/// # Errors
///
/// Returns an error if the output file cannot be created or written, or if a request fails.
fn scrape_pr_comments(gh: &Github, repo_id: u32, pr: &PRMetadata) -> Result<(), Error> {
    // Rows are accumulated in memory and written to the file in a single call at the end.
    let mut file_content: String = String::new();
    let mut output_file: CSVFile = CSVFile::new(&pr.file_path, FileMode::Overwrite)?;

    map_err(
        writeln!(&mut file_content, "{}", PRComment::header().join(",")),
        "Could not write headers to string builder",
    )?;

    // The body of the pull request is stored as the first row, with id 0.
    let pr_body: PRComment = PRComment {
        id: 0,
        user: pr.user.clone(),
        user_id: pr.user_id,
        comment_type: PRCommentType::Body,
        created_at: pr.created_at,
        body: pr.body.clone(),
    };
    map_err(
        writeln!(&mut file_content, "{}", pr_body.to_csv(())),
        "Could not write PR comments to string builder",
    )?;

    // (type of comment, API resource, API sub-collection)
    for (comment_type, resource, collection) in [
        (PRCommentType::Discussion, "issues", "comments"),
        (PRCommentType::Code, "pulls", "comments"),
        (PRCommentType::Review, "pulls", "reviews"),
    ] {
        for row_res in scrape_pages(
            gh,
            &|per_page, page| {
                format!(
                    "https://api.github.com/repositories/{}/{}/{}/{}?per_page={}&page={}",
                    repo_id, resource, pr.pr_number, collection, per_page, page
                )
            },
            &|json| Ok(PRComment::parse_json(&json, comment_type)?.to_csv(())),
        )? {
            // Unparsable entries are kept as placeholder rows; both kinds of rows go through
            // the same error-reporting path instead of unwrapping the fallback write.
            let row: String = row_res.unwrap_or_else(|_| PRComment::default().to_csv(()));
            map_err(
                writeln!(&mut file_content, "{}", row),
                "Could not write PR comments to string builder",
            )?;
        }
    }

    map_err(
        write!(&mut output_file, "{}", file_content),
        &format!("Could not write to file {}", &pr.file_path),
    )
}
// Integration tests of the pull request phase. They call `run`, which queries the GitHub API
// and reads the tokens from `ghtokens.csv`, so they need network access and valid tokens.
#[cfg(test)]
mod tests {
use super::*;
// Directory containing the input files and the expected results of the tests.
const TEST_DATA: &str = "tests/data/phases/pull_request";
// Runs the phase on `repos.csv` and compares the produced metadata file with the expected one.
#[test]
fn test_language_scraper() {
let input_file: String = format!("{}/repos.csv", TEST_DATA);
let output_file: String = format!("{}.pulls.csv", input_file);
assert!(std::path::Path::new(&input_file).exists());
// Start from a clean state so that no previous progress is resumed.
assert!(delete_file(&output_file, true).is_ok());
let tokens_file: String = "ghtokens.csv".to_string();
let target: String = format!("{}/prs", TEST_DATA);
let run_scraper: Result<(), Error> = run(
&input_file,
None,
&tokens_file,
0,
false,
"id",
"name",
&target,
None,
&mut Logger::new(),
);
assert!(run_scraper.is_ok());
let pr_discussion_path: String = format!("{}/5983/1128315983/1128315983_1.csv", target);
// NOTE(review): despite its name, `pr_discussion` is loaded from `output_file`, not from
// `pr_discussion_path`, so this block repeats the metadata comparison below and the content
// of the comments file is never checked — confirm whether an expected fixture exists for it.
let pr_discussion = open_csv(&output_file, None, None);
assert!(pr_discussion.is_ok());
let pr_discussion = pr_discussion.unwrap();
let pr_discussion_expected = open_csv(&format!("{}.expected", &output_file), None, None);
assert!(pr_discussion_expected.is_ok());
let pr_discussion_expected = pr_discussion_expected.unwrap();
assert_eq!(pr_discussion, pr_discussion_expected);
// The metadata file must be equal to the expected one.
let output_df = open_csv(&output_file, None, None);
assert!(output_df.is_ok());
let output_df = output_df.unwrap();
let expected_df = open_csv(&format!("{}.expected", output_file), None, None);
assert!(expected_df.is_ok());
let expected_df = expected_df.unwrap();
assert!(expected_df.equals(&output_df));
// Clean up the files produced by the run.
assert!(delete_file(&pr_discussion_path, false).is_ok());
assert!(delete_file(&output_file, false).is_ok());
}
// Runs the phase on `invalid.csv` and expects it to fail.
#[test]
fn test_language_scraper_inexistent() {
let input_file: String = format!("{}/invalid.csv", TEST_DATA);
let output_file: String = format!("{}.pulls.csv", input_file);
assert!(std::path::Path::new(&input_file).exists());
assert!(delete_file(&output_file, true).is_ok());
let tokens_file: String = "ghtokens.csv".to_string();
let run_scraper: Result<(), Error> = run(
&input_file,
None,
&tokens_file,
0,
false,
"id",
"name",
&format!("{}/target", TEST_DATA),
None,
&mut Logger::new(),
);
assert!(run_scraper.is_err());
assert!(delete_file(&output_file, true).is_ok());
}
}