// scyros 0.2.2
//
// A framework to design sound, reproducible and scalable repository mining studies on GitHub.
// Copyright 2025 Andrea Gilot
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#![doc = include_str!("../docs/metadata.md")]

use anyhow::{bail, Result};
use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Write;
use std::iter::FromIterator as _;
use std::path::Path;

use crate::utils::csv::*;
use crate::utils::dataframes;
use crate::utils::fs::*;
use crate::utils::github::*;
use crate::utils::github_api::Github;
use crate::utils::json::*;
use crate::utils::logger::{log_seed, Logger};
use clap::ArgAction;
use clap::{Arg, Command};
use indicatif::ProgressBar;
use json::JsonValue;
use polars::frame::DataFrame;
use polars::prelude::*;
use rand::rngs::StdRng;
use rand::seq::SliceRandom as _;
use rand::SeedableRng;
use tracing::info;

/// Command line arguments parsing.
///
/// Builds the `clap` [`Command`] for the `metadata` subcommand; the matches are
/// consumed by [`run`].
pub fn cli() -> Command {
    Command::new("metadata")
        .about("Collect the metadata of GitHub projects")
        .long_about(include_str!("../docs/metadata.md"))
        .author("Andrea Gilot <andrea.gilot@it.uu.se>")
        .disable_version_flag(true)
        .arg(
            Arg::new("output")
                .short('o')
                .long("output")
                .value_name("OUTPUT_FILE.csv")
                .help("Path to the output csv file to store the metadata. \
                       By default, the name of the output file is the same as the input file with the suffix '.metadata.csv'.")
                .required(false)
        )
        .arg(
            Arg::new("input")
                .short('i')
                .long("input")
                .value_name("INPUT_FILE.csv")
                .help("Path to the input csv file to use. One of the columns must contain the full names of the projects. ")
                .required(true)
        )
        .arg(
            Arg::new("tokens")
                .short('t')
                .long("tokens")
                .value_name("TOKENS_FILE.csv")
                .help("Path to the file containing the GitHub tokens to use. It must be a valid CSV file with one column named 'token' and where every line is a \
                       valid GitHub token (e.g ghp_Ab0C1D2eFg3hIjk4LM56oPqRsTuvWX7yZa8B).")
                .required(true)
        )
        .arg(
            Arg::new("cache")
                .short('c')
                .long("cache")
                .value_name("CACHE.csv")
                .help("Path to the cache file to use. Must have been generated by a previous run of this program.")
                .required(false)
        )
        .arg(
            Arg::new("seed")
                .short('s')
                .long("seed")
                .value_name("SEED")
                .help("Seed used to randomly shuffle the input data.")
                .default_value("2955615809866670875")
                .value_parser(clap::value_parser!(u64)),
        )
        .arg(
            Arg::new("force")
                .short('f')
                .long("force")
                .help("Override the output file if it already exists.")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("ids")
                .long("ids")
                .help("Name of the column containing the ids of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("id")
        )
        .arg(
            Arg::new("names")
                .long("names")
                .help("Name of the column containing the full names of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("name")
        )
        .arg(
            Arg::new("sub")
                .long("sub")
                .value_name("NUMBER_OF_PROJECTS")
                .help("Number of projects to sample from the input file. \
                       If not specified, all remaining projects in the input file are used.")
                // BUG FIX: `run` takes `Option<usize>` for this argument; without a
                // value parser, clap 4 stores the value as a String and
                // `matches.get_one::<usize>("sub")` panics at runtime whenever
                // `--sub` is given.
                .value_parser(clap::value_parser!(usize)),
        )
}

/// Collects metadata about GitHub projects.
///
/// The input must be a valid CSV file containing one column with the ids of the projects and one
/// with their full names (the column names are configurable through `ids` and `names`).
/// Other columns are ignored. Such a file can be obtained by running the random-id-sampling program.
/// Projects are processed in a random order derived from `seed`.
/// The cache file must have been generated by a previous run of this program (possibly with a different input file or seed).
/// The results are saved in a CSV file whose name defaults to the input file name with the suffix ".metadata.csv".
/// The output has the following columns:
/// * id: The id of the project.
/// * name: The full name of the project.
/// * language: The main language of the project.
/// * created: The timestamp of the creation of the project.
/// * pushed: The timestamp of the last push to the project.
/// * updated: The timestamp of the last update of the project.
/// * fork: Whether the project is a fork.
/// * disabled: Whether the project is disabled.
/// * archived: Whether the project is archived.
/// * stars: The number of stars of the project.
/// * forks: The number of times the project has been forked.
/// * issues: The number of open issues of the project.
/// * has_issues: Whether the project has issue tracking enabled (different from having 0 issues).
/// * watchers_count: The number of watchers of the project.
/// * subscribers: The number of subscribers of the project.
/// * size: The size of the project in LOC.
/// * license: The name of the license of the project.
///
///
/// # Arguments
///
/// * `input_path` - The path to the input file.
/// * `output_path` - Optional path of the output file; defaults to `<input_path>.metadata.csv`.
/// * `tokens` - The path to the file containing the GitHub tokens.
/// * `cache_opt` - The path to the cache file. If not provided, the program will not use a cache.
/// * `seed` - The seed to use for the random number generator.
/// * `force` - Overwrite the output file instead of resuming from it.
/// * `ids` - Name of the input column holding the project ids.
/// * `names` - Name of the input column holding the project full names.
/// * `sub` - Number of projects to process; all remaining projects when `None`.
/// * `logger` - Logger used to report progress.
///
///
/// # Returns
///
/// * Unit if the program finished successfully or an error message if an error occurred.
///
pub fn run(
    input_path: &str,
    output_path: Option<&String>,
    tokens: &str,
    cache_opt: Option<&String>,
    seed: u64,
    force: bool,
    ids: &str,
    names: &str,
    sub: Option<usize>,
    logger: &Logger,
) -> Result<()> {
    // Column index of the id in the cache file.
    const ID_COL: usize = 0;

    // Check if the token file is valid.
    logger.log_tokens(tokens)?;

    // Load input file
    let input_file: DataFrame = logger.run_task("Loading input file", || {
        open_csv(
            input_path,
            Some(Schema::from_iter(vec![
                Field::new(ids.into(), DataType::UInt32),
                Field::new(names.into(), DataType::String),
            ])),
            Some(vec![ids, names]),
        )
    })?;

    log_seed(seed);

    let mut shuffled_idx: Vec<usize> = (0..input_file.height()).collect();

    // Shuffle the row indices so projects are visited in a reproducible random order.
    logger.run_task("Loading project IDs in random order", || {
        let mut rng: StdRng = SeedableRng::seed_from_u64(seed);
        shuffled_idx.shuffle(&mut rng);
        Ok(())
    })?;

    // Lazily materialize (id, name) pairs; a row that does not match the expected
    // (UInt32, String) layout surfaces as `Err(row_index)`.
    let shuffled_rows = shuffled_idx.into_iter().map(|idx| {
        // Safe unwrap: `idx` comes from `0..input_file.height()`.
        let row = input_file.get_row(idx).unwrap().0;

        match (row[0].clone(), row[1].clone()) {
            (AnyValue::UInt32(id), AnyValue::String(name)) => Ok((id, name)),
            _ => Err(idx),
        }
    });

    let n_proj: usize = input_file.height();

    info!("  {} projects found.", n_proj);

    // Name of the output file.
    let default_output_path: String = format!("{}.metadata.csv", &input_path);
    let output_file_path: &str = output_path.unwrap_or(&default_output_path);

    // Load the ids already written to the output file by a previous run, so that
    // resuming does not query them again.
    let previous_results: HashSet<u32> = if force {
        HashSet::new()
    } else {
        logger.run_task("Resuming progress", || {
            Ok(if Path::new(&output_file_path).exists() {
                dataframes::u32(
                    // BUG FIX: read the ids back from the *output* file, not from the
                    // input file. Reading the input made every project look as if it
                    // had already been processed. The output column is always named
                    // "id" (see `ProjectMetadata::header`), regardless of `ids`.
                    &open_csv(
                        output_file_path,
                        Some(Schema::from_iter(vec![Field::new(
                            "id".into(),
                            DataType::UInt32,
                        )])),
                        Some(vec!["id"]),
                    )?,
                    "id",
                )?
                .into_iter()
                .collect()
            } else {
                HashSet::new()
            })
        })?
    };

    if !previous_results.is_empty() {
        info!(
            "  the metadata of {} projects have already been queried",
            previous_results.len()
        );
    }

    let mut output_file: CSVFile = CSVFile::new(
        output_file_path,
        if force {
            FileMode::Overwrite
        } else {
            FileMode::Append
        },
    )?;

    output_file.write_header(ProjectMetadata::header())?;

    // Load the cache mapping project id -> pre-rendered CSV row.
    let cache: HashMap<u32, String> = logger.run_task("Loading cache", || {
        Ok(match cache_opt {
            Some(cache_path) => {
                let cache = CSVFile::new(cache_path, FileMode::Read)?;
                cache.indexed_lines(ID_COL)?
            }
            None => HashMap::new(),
        })
    })?;

    info!("  {} projects found in the cache.", cache.len());

    // Number of requests that were saved by using the cache.
    let mut request_from_cache: usize = 0;

    let gh = Github::new(tokens);

    info!("Starting to query the GitHub API...");

    // Number of projects to process. `saturating_sub` guards against a panic when
    // the resumed output already contains more ids than the current input
    // (e.g. when resuming against a different input file).
    let mut n: usize = match sub {
        Some(m) => m,
        None => n_proj.saturating_sub(previous_results.len()),
    };

    // Create a progress bar
    let progress_bar: ProgressBar = ProgressBar::new(n_proj as u64);

    progress_bar.set_style(
        indicatif::ProgressStyle::default_bar()
            .template("{elapsed} {wide_bar} {percent}% | Requests from cache: {msg}")?,
    );

    if sub.is_some() {
        progress_bar.set_length(n as u64);
    }

    for row in shuffled_rows {
        if n == 0 {
            break;
        }
        match row {
            Ok((id, full_name)) => {
                // Skip projects already present in the output, then prefer the cache
                // over a live API request.
                if !previous_results.contains(&id) {
                    // Row to write in the output file. Single map lookup instead of
                    // `contains_key` + `unwrap`.
                    let csv_row: String = if let Some(cached) = cache.get(&id) {
                        request_from_cache += 1;
                        cached.clone()
                    } else {
                        match gh.request(&format!("https://api.github.com/repos/{full_name}")) {
                            Ok(json) => ProjectMetadata::parse_json(&json, ())?
                                .to_csv((id, full_name.to_string())),
                            // On request failure, write a default row carrying the
                            // trimmed error message in place of the project name.
                            Err(e) => ProjectMetadata::default()
                                .to_csv((id, e.to_string().trim().to_string())),
                        }
                    };

                    writeln!(&mut output_file, "{csv_row}")?;

                    progress_bar.inc(1);
                    progress_bar.set_message(request_from_cache.to_string());
                    n -= 1;
                }
            }
            Err(idx) => {
                bail!("Could not parse row {idx} in the input file")
            }
        }
    }
    Ok(())
}

/// Represents the metadata of a GitHub project, populated from the GitHub
/// repository API response by [`FromGitHub::parse_json`] and rendered to a CSV
/// row by [`ToCSV::to_csv`].
/// The description of the project and the homepage are omitted as they can produce errors in the CSV file.
struct ProjectMetadata {
    /// The main language of the project (empty when GitHub reports none).
    language: String,
    /// The timestamp of the creation of the project.
    created: i64,
    /// The timestamp of the last push to the project.
    pushed: i64,
    /// The timestamp of the last update of the project.
    updated: i64,
    /// Whether the project is a fork.
    fork: bool,
    /// Whether the project is disabled.
    disabled: bool,
    /// Whether the project is archived.
    archived: bool,
    /// The number of stars of the project.
    stars: u32,
    /// The number of times the project has been forked.
    forks: u32,
    /// The number of open issues of the project.
    issues: u32,
    /// Whether the project has issue tracking enabled (different from having 0 issues).
    has_issues: bool,
    /// The number of watchers of the project.
    watchers_count: u32,
    /// The number of subscribers of the project.
    subscribers: u32,
    /// The size of the project in LOC.
    size: u64,
    /// The name of the license of the project ("unknown" when absent).
    license: String,
}

/// Default implementation for ProjectMetadata.
/// Sets all the dates to 1970-01-01T00:00:00Z, the booleans to false, the numbers to 0 and the strings to empty.
impl Default for ProjectMetadata {
    fn default() -> Self {
        Self {
            language: String::new(),
            created: 0,
            pushed: 0,
            updated: 0,
            fork: false,
            disabled: false,
            archived: false,
            stars: 0,
            forks: 0,
            issues: 0,
            has_issues: false,
            watchers_count: 0,
            subscribers: 0,
            size: 0,
            license: String::new(),
        }
    }
}

impl ToCSV for ProjectMetadata {
    /// Key = (project id, project full name — or an error message when the
    /// API request failed).
    type Key = (u32, String);

    fn header() -> &'static [&'static str] {
        &[
            "id",
            "name",
            "language",
            "created",
            "pushed",
            "updated",
            "fork",
            "disabled",
            "archived",
            "stars",
            "forks",
            "issues",
            "has_issues",
            "watchers_count",
            "subscribers",
            "size",
            "license",
        ]
    }

    /// Renders one CSV row, in the same column order as [`Self::header`].
    /// Booleans are encoded as 1/0.
    fn to_csv(&self, key: Self::Key) -> String {
        // Boolean -> "1"/"0" encoding expected by the rest of the pipeline.
        fn flag(b: bool) -> u8 {
            u8::from(b)
        }

        let (id, name) = key;
        [
            id.to_string(),
            name,
            self.language.clone(),
            self.created.to_string(),
            self.pushed.to_string(),
            self.updated.to_string(),
            flag(self.fork).to_string(),
            flag(self.disabled).to_string(),
            flag(self.archived).to_string(),
            self.stars.to_string(),
            self.forks.to_string(),
            self.issues.to_string(),
            flag(self.has_issues).to_string(),
            self.watchers_count.to_string(),
            self.subscribers.to_string(),
            self.size.to_string(),
            self.license.clone(),
        ]
        .join(",")
    }
}

impl FromGitHub for ProjectMetadata {
    type Complement = ();

    /// Builds a [`ProjectMetadata`] from the JSON body returned by the GitHub
    /// "get a repository" endpoint.
    ///
    /// # Errors
    ///
    /// Returns an error if a mandatory field is missing or has an unexpected type.
    fn parse_json(json: &JsonValue, _complement: ()) -> Result<Self> {
        // `language` is null for repositories without any detected language.
        // CONSISTENCY FIX: run it through `clean_string_to_csv` like `license`
        // below, so it can never corrupt the comma-separated output row.
        let language: String = if !json["language"].is_null() {
            clean_string_to_csv(&get_field::<String>(json, "language")?)
        } else {
            String::new()
        };
        // Timestamps are ISO-8601 strings in the API response; parsed to Unix time.
        let created: i64 = Self::parse_date_time(json, "created_at")?;
        let pushed: i64 = Self::parse_date_time(json, "pushed_at")?;
        let updated: i64 = Self::parse_date_time(json, "updated_at")?;
        let fork = get_field::<bool>(json, "fork")?;
        let disabled = get_field::<bool>(json, "disabled")?;
        let archived = get_field::<bool>(json, "archived")?;
        let stars = get_field::<u32>(json, "stargazers_count")?;
        let forks = get_field::<u32>(json, "forks_count")?;
        let issues = get_field::<u32>(json, "open_issues_count")?;
        let has_issues = get_field::<bool>(json, "has_issues")?;
        let watchers_count = get_field::<u32>(json, "watchers_count")?;
        let subscribers = get_field::<u32>(json, "subscribers_count")?;
        let size = get_field::<u64>(json, "size")?;

        // Parse the license field, defaulting to "unknown" if not present.
        let license = if !json["license"].is_null() {
            clean_string_to_csv(&get_field::<String>(&json["license"], "name")?)
        } else {
            "unknown".to_string()
        };

        // Return a new ProjectMetadata instance with the parsed data.
        Ok(Self {
            language,
            created,
            pushed,
            updated,
            fork,
            disabled,
            archived,
            stars,
            forks,
            issues,
            has_issues,
            watchers_count,
            subscribers,
            size,
            license,
        })
    }
}

#[cfg(test)]
mod tests {
    use anyhow::ensure;

    use super::*;
    use crate::utils::{dataframes::has_column, logger::test_logger};

    const TEST_DATA: &str = "tests/data/phases/metadata";

    /// End-to-end test: runs the metadata phase on the checked-in fixture and
    /// compares the produced CSV to the expected one, after normalizing the row
    /// order by project name.
    #[test]
    fn test_language_scraper() -> Result<()> {
        let input_file = format!("{TEST_DATA}/repos.csv");
        let output_file = format!("{input_file}.metadata.csv");
        let tokens_file = "ghtokens.csv".to_string();

        ensure!(
            std::path::Path::new(&input_file).exists(),
            "Input file does not exist"
        );
        // Start from a clean slate so `run` does not resume a previous output.
        delete_file(&output_file, true)?;

        run(
            &input_file,
            None,
            &tokens_file,
            None,
            0,
            false,
            "id",
            "name",
            None,
            test_logger(),
        )?;

        // Loads a CSV, checks that the 'name' column exists (failing with the
        // given message otherwise) and returns the frame sorted by name.
        let load_sorted = |path: &str, missing_msg: &str| -> Result<DataFrame> {
            let df = open_csv(path, None, None)?;
            ensure!(has_column(&df, "name"), "{}", missing_msg);
            Ok(df.sort(vec!["name"], SortMultipleOptions::new())?)
        };

        let sorted_output_df = load_sorted(&output_file, "Output does not have 'name' column")?;
        let sorted_expected_df = load_sorted(
            &format!("{output_file}.expected"),
            "Expected output does not have 'name' column",
        )?;

        assert_eq!(sorted_expected_df, sorted_output_df);

        delete_file(&output_file, false)
    }
}