#![doc = include_str!("../docs/ids.md")]
use anyhow::{anyhow, bail, Context, Result};
use clap::ArgAction;
use clap::{Arg, Command};
use indicatif::ProgressBar;
use json::JsonValue;
use polars::prelude::DataFrame;
use polars::prelude::DataType;
use polars::prelude::Field;
use polars::prelude::Schema;
use std::fmt::Write as _;
use std::io::Write;
use std::iter::FromIterator as _;
use std::path::Path;
use tracing::info;
use crate::utils::csv::*;
use crate::utils::dataframes;
use crate::utils::fs::*;
use crate::utils::github::*;
use crate::utils::github_api::Github;
use crate::utils::json::*;
use crate::utils::logger::{log_seed, Logger};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
/// Builds the `ids` subcommand definition.
///
/// Declares the CLI surface for sampling public GitHub repository ids:
/// output and token files, RNG seed, id range, sample count, sampling
/// mode and force-overwrite flag. Long help is pulled from `docs/ids.md`.
pub fn cli() -> Command {
    Command::new("ids")
        .about("Collects random ids of public projects on GitHub, along with their name and whether the project is a fork.")
        .long_about(include_str!("../docs/ids.md"))
        .disable_version_flag(true)
        .arg(
            Arg::new("output")
                .short('o')
                .long("output")
                .value_name("OUTPUT_FILE.csv")
                .help("Path to the output CSV file storing the ids and the names of the projects. If the file does not exist, it will be created.\nIf the file already exists, new data will be appended to the end of the file.")
                .required(true),
        )
        .arg(
            Arg::new("tokens")
                .short('t')
                .long("tokens")
                .value_name("TOKENS_FILE.csv")
                // Typo fixed: "querrying" -> "querying".
                .help("Path to the file containing the GitHub tokens to use for querying GitHub REST API.\nIt must be a valid CSV file, with a column named 'token' where every entry is a valid GitHub token.")
                .required(true),
        )
        .arg(
            Arg::new("seed")
                .short('s')
                .long("seed")
                .value_name("SEED")
                .help("Seed used to generate random ids.")
                .default_value("11372246557183969657")
                .value_parser(clap::value_parser!(u64)),
        )
        .arg(
            Arg::new("min")
                .long("min")
                .value_name("MIN_ID")
                .help("Minimum id to sample.")
                .default_value("0")
                .value_parser(clap::value_parser!(u32)),
        )
        .arg(
            Arg::new("max")
                .long("max")
                .value_name("MAX_ID")
                // Grammar fixed: "Default to 2026/01/05." -> full sentence.
                .help("Maximum id to sample. Defaults to the latest id as of 2026/01/05.")
                .default_value("1128315983")
                .value_parser(clap::value_parser!(u32)),
        )
        .arg(
            Arg::new("number")
                .short('n')
                .long("number")
                .value_name("NUMBER_OF_IDS")
                .help("Number of ids to sample. If the number is not a multiple of 100, it is rounded to the least multiple of 100 greater than the value.\nIf not specified the program will run indefinitely.")
                .value_parser(clap::value_parser!(usize)),
        )
        .arg(
            Arg::new("mode")
                .long("mode")
                .value_name("MODE")
                .help("Sampling mode. 'linear' to sample ids in sequential order, 'random' to sample ids at random.")
                .default_value("random")
                .value_parser(["linear", "random"]),
        )
        .arg(
            Arg::new("force")
                .short('f')
                .long("force")
                .help("Override the output file if it already exists.")
                // `SetTrue` already implies a `false` default, so the
                // redundant `.default_value("false")` was dropped.
                .action(ArgAction::SetTrue),
        )
}
/// Samples public GitHub repository ids and appends them to a CSV file.
///
/// Each iteration queries `GET /repositories?since=<id>`, which returns a
/// page of repositories with ids greater than `<id>`; each repository is
/// written as a `id,name,fork,request_number` CSV row.
///
/// * `output_path` — destination CSV; resumed (appended) unless `force`.
/// * `tokens` — CSV file with a `token` column of GitHub API tokens.
/// * `seed` — RNG seed used when `mode == "random"`.
/// * `min_id` / `max_id` — sampling range. NOTE(review): random sampling
///   uses `gen_range(min_id..max_id)`, which excludes `max_id`, while the
///   CLI help calls it the "maximum id to sample" — confirm intended
///   inclusivity.
/// * `n` — approximate number of ids to collect (page-granular); `None`
///   runs indefinitely.
/// * `mode` — `"linear"` (sequential from the last sampled id) or
///   `"random"` (seeded random anchors).
/// * `force` — overwrite any existing output instead of resuming it.
///
/// # Errors
/// Fails if previous results cannot be loaded, if the API keeps failing
/// after the retry budget, if a response has an unexpected shape, or on
/// any write error to `output_path`.
pub fn run(
    output_path: &str,
    tokens: &str,
    seed: u64,
    min_id: u32,
    max_id: u32,
    n: Option<usize>,
    mode: &str,
    force: bool,
    logger: &Logger,
) -> Result<()> {
    logger.log_tokens(tokens)?;
    // Resume state: the last id written so far and how many API requests
    // were already issued (used to replay the RNG and to tag new rows).
    let (mut last_id, mut requests): (u32, usize) = if force {
        info!("Overwriting previous results");
        (min_id, 0)
    } else if Path::new(output_path).exists() {
        let input_df: DataFrame = logger.run_task("Loading previous results", || {
            open_csv(
                output_path,
                Some(Schema::from_iter(vec![
                    Field::new("id".into(), DataType::UInt32),
                    Field::new("name".into(), DataType::String),
                    Field::new("fork".into(), DataType::UInt32),
                    Field::new("request_number".into(), DataType::UInt32),
                ])),
                Some(ProjectInfo::header().to_vec()),
            )
        })?;
        // Pick up where the previous run stopped: the last recorded id and
        // the next request number.
        let last_id: u32 = dataframes::u32(&input_df, "id")?
            .into_iter()
            .last()
            .with_context(|| "Could not get last id")?;
        let last_request_number: u32 = dataframes::u32(&input_df, "request_number")?
            .into_iter()
            .last()
            .with_context(|| "Could not get last request number")?;
        info!(" {} ids already sampled.", input_df.height());
        (last_id, last_request_number as usize + 1)
    } else {
        info!("No previous data found");
        (min_id, 0)
    };
    match n {
        Some(n) => info!("Sampling {} ids...", n),
        None => info!("Sampling ids..."),
    }
    info!("Range: [{}, {}]", min_id, max_id);
    let mut output_file = CSVFile::new(
        output_path,
        if force {
            FileMode::Overwrite
        } else {
            FileMode::Append
        },
    )?;
    output_file.write_header(ProjectInfo::header())?;
    let gh = Github::new(tokens);
    // Bounded progress bar when a target count is known, spinner otherwise.
    let progress_bar: ProgressBar = match n {
        Some(n) => ProgressBar::new(n as u64),
        None => ProgressBar::new_spinner(),
    };
    if n.is_some() {
        progress_bar.set_style(
            indicatif::ProgressStyle::default_bar().template("{elapsed} {wide_bar} {percent}%")?,
        )
    }
    let mut rng: StdRng = SeedableRng::seed_from_u64(seed);
    if mode == "random" {
        log_seed(seed);
        // Burn one sample per request already made so that a resumed run
        // continues the exact same deterministic id sequence.
        for _ in 0..requests {
            rng.gen_range(min_id..max_id);
        }
    }
    // Ids still to collect; `None` means run until `max_id` is reached in
    // linear mode, or forever in random mode.
    let mut remaining: Option<usize> = n;
    while remaining
        .map(|x| x > 0)
        .unwrap_or(mode == "random" || last_id < max_id)
    {
        // Page anchor passed as `since=`: a random draw, or the last id
        // seen so far in linear mode.
        let first_id: u32 = if mode == "random" {
            rng.gen_range(min_id..max_id)
        } else {
            last_id
        };
        const MAX_RETRIES: usize = 3;
        // Retry transient API failures up to MAX_RETRIES times before
        // aborting the whole run.
        let request: JsonValue = {
            let mut attempts = 0;
            let mut request: Result<JsonValue> =
                Err(anyhow!("Did not send any request yet: ID {first_id}"));
            while request.is_err() && attempts < MAX_RETRIES {
                request = gh
                    .request(&format!(
                        "https://api.github.com/repositories?since={first_id}"
                    ))
                    .with_context(|| {
                        format!("Could not send the request to the Github API: ID {first_id}")
                    });
                attempts += 1;
            }
            request.with_context(|| "Maximum number of retries reached")
        }?;
        match request {
            json::JsonValue::Array(repos) => {
                // Buffer the whole page as CSV text, then write it in one go.
                let mut builder: String = String::new();
                let response_size = repos.len();
                let mut skipped: usize = 0;
                for repo in repos.iter() {
                    if repo.is_null() {
                        skipped += 1;
                    } else {
                        let project_info: ProjectInfo = ProjectInfo::parse_json(repo, ())?;
                        last_id = project_info.id as u32;
                        writeln!(&mut builder, "{}", project_info.to_csv(requests))?;
                    }
                }
                // Progress advances by non-null repos only.
                match remaining {
                    Some(_) => progress_bar.inc((response_size - skipped) as u64),
                    None => progress_bar.tick(),
                }
                remaining = remaining.map(|x| x.saturating_sub(response_size - skipped));
                write!(&mut output_file, "{builder}")
                    .with_context(|| format!("Could not write to file {output_path}"))?;
            }
            _ => {
                // A {"message": "Not Found"} object just means `since` was
                // past the last repository; any other payload is an error.
                if !request.has_key("message")
                    || request["message"].as_str().with_context(|| {
                        format!("Could not parse message as string in {request}")
                    })? != "Not Found"
                {
                    bail!("Unknown response format: {request} ")
                }
            }
        }
        requests += 1;
    }
    Ok(())
}
/// One output CSV row: a repository id, its name and its fork status.
struct ProjectInfo {
    // Repository id. Stored as i32 so `Default` can use -1 as a sentinel;
    // NOTE(review): ids above i32::MAX would wrap through the `as` casts in
    // `parse_json`/`run` — confirm the sampled range stays below 2^31.
    id: i32,
    // Value of the repository's "full_name" JSON field.
    name: String,
    // 1 if the repository is a fork, 0 otherwise (parsed from the "fork"
    // boolean field).
    fork: u32,
}
/// CSV serialization for [`ProjectInfo`]; the number of the request that
/// produced the row is appended as the last column.
impl ToCSV for ProjectInfo {
    type Key = usize;

    fn header() -> &'static [&'static str] {
        &["id", "name", "fork", "request_number"]
    }

    fn to_csv(&self, request_number: Self::Key) -> String {
        let columns = [
            self.id.to_string(),
            self.name.clone(),
            self.fork.to_string(),
            request_number.to_string(),
        ];
        columns.join(",")
    }
}
impl Default for ProjectInfo {
fn default() -> Self {
Self {
id: -1,
name: String::new(),
fork: 0,
}
}
}
/// Builds a [`ProjectInfo`] from one repository object of a
/// `GET /repositories` response ("id", "full_name" and "fork" fields).
impl FromGitHub for ProjectInfo {
    type Complement = ();

    fn parse_json(json: &json::JsonValue, _complement: ()) -> Result<Self>
    where
        Self: Sized,
    {
        // Fields are read in the same order as before: id, full_name, fork.
        Ok(Self {
            id: get_field::<u32>(json, "id")? as i32,
            name: get_field::<String>(json, "full_name")?,
            fork: u32::from(get_field::<bool>(json, "fork")?),
        })
    }
}
#[cfg(test)]
mod tests {
    use std::fs;

    use super::*;
    use crate::utils::logger::test_logger;

    const TEST_DATA: &str = "tests/data/phases/ids";
    const TOKENS: &str = "ghtokens.csv";
    const SEED: u64 = 113722657;
    // Upper bound of the id range used by every test run.
    const TEST_MAX_ID: u32 = 871212690;

    /// Shared scenario for both sampling modes, checking resumability and
    /// force-overwrite semantics:
    ///   1. two appended runs of 280 ids into file `_1`,
    ///   2. one run of 500 ids into file `_2`,
    ///   3. one forced run of 1000 ids into file `_3`,
    ///   4. one forced run of 500 ids overwriting file `_1`.
    /// A fresh 500-id run and a forced 500-id overwrite must produce
    /// identical output, and must differ from the 1000-id run.
    fn check_mode(mode: &str) -> Result<()> {
        let id_half = format!("{TEST_DATA}/id_{mode}_1.csv");
        let id_full = format!("{TEST_DATA}/id_{mode}_2.csv");
        let id_force = format!("{TEST_DATA}/id_{mode}_3.csv");
        // Start from a clean slate; missing files are tolerated.
        delete_file(&id_half, true)?;
        delete_file(&id_full, true)?;
        delete_file(&id_force, true)?;
        // (output, n, force) per run; all runs share seed, range and mode.
        let runs: [(&str, usize, bool); 5] = [
            (&id_half, 280, false),
            (&id_half, 280, false),
            (&id_full, 500, false),
            (&id_force, 1000, true),
            (&id_half, 500, true),
        ];
        for (output, n, force) in runs {
            run(
                output,
                TOKENS,
                SEED,
                0,
                TEST_MAX_ID,
                Some(n),
                mode,
                force,
                test_logger(),
            )?;
        }
        assert_eq!(fs::read_to_string(&id_half)?, fs::read_to_string(&id_full)?);
        assert_ne!(
            fs::read_to_string(&id_half)?,
            fs::read_to_string(&id_force)?
        );
        // Clean up; here a missing file is an error.
        delete_file(&id_half, false)?;
        delete_file(&id_full, false)?;
        delete_file(&id_force, false)
    }

    #[test]
    fn test_random_ids() -> Result<()> {
        check_mode("random")
    }

    #[test]
    fn test_linear_ids() -> Result<()> {
        check_mode("linear")
    }
}