#![doc = include_str!("../docs/languages.md")]
use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Write;
use std::iter::FromIterator as _;
use std::path::Path;
use crate::utils::csv::*;
use crate::utils::dataframes;
use crate::utils::fs::*;
use crate::utils::github::*;
use crate::utils::github_api::Github;
use crate::utils::json::*;
use crate::utils::logger::*;
use anyhow::{anyhow, bail, Context, Result};
use clap::ArgAction;
use clap::{Arg, Command};
use indicatif::ProgressBar;
use polars::frame::DataFrame;
use polars::prelude::*;
use rand::rngs::StdRng;
use rand::seq::SliceRandom as _;
use rand::SeedableRng;
use tracing::info;
/// Builds the clap `Command` describing the `languages` subcommand and all
/// of its arguments. Parsing and dispatch happen elsewhere; this function
/// only declares the interface.
pub fn cli() -> Command {
    Command::new("languages")
        .about("Collect all the languages of GitHub projects along with the hash of their latest commit.")
        .long_about(include_str!("../docs/languages.md"))
        .author("Andrea Gilot <andrea.gilot@it.uu.se>")
        .disable_version_flag(true)
        .arg(
            Arg::new("output")
                .short('o')
                .long("output")
                .value_name("OUTPUT_FILE.csv")
                .help("Path to the output csv file to store the metadata. \
                By default, the name of the output file is the same as the input file with the suffix '.languages.csv'.")
                .required(false)
        )
        .arg(
            Arg::new("input")
                .short('i')
                .long("input")
                .value_name("INPUT_FILE.csv")
                .help("Path to the input csv file to use. The file must contain at least two columns: one with the id of the projects and another one with their full name. \
                By default, the column containing the ids is named 'id' and the column containing the full names is named 'name'.")
                .required(true)
        )
        .arg(
            Arg::new("tokens")
                .short('t')
                .long("tokens")
                .value_name("TOKENS_FILE.csv")
                .help("Path to the file containing the GitHub tokens to use. It must be a valid CSV file with one column named 'token' and where every line is a \
                valid GitHub token (e.g ghp_Ab0C1D2eFg3hIjk4LM56oPqRsTuvWX7yZa8B).")
                .required(true)
        )
        .arg(
            Arg::new("cache")
                .short('c')
                .long("cache")
                .value_name("CACHE.csv")
                .help("Path to the cache file to use. Must have been generated by a previous run of this program.")
                .required(false)
        )
        .arg(
            Arg::new("seed")
                .short('s')
                .long("seed")
                .value_name("SEED")
                // Typo fix: "ramdomly" -> "randomly".
                .help("Seed used to randomly shuffle the input data.")
                .default_value("2955615809866670875")
                .value_parser(clap::value_parser!(u64)),
        )
        .arg(
            Arg::new("force")
                .short('f')
                .long("force")
                .help("Override the output file if it already exists.")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("ids")
                .long("ids")
                .help("Name of the column containing the ids of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("id")
        )
        .arg(
            Arg::new("names")
                .long("names")
                .help("Name of the column containing the full names of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("name")
        )
        .arg(
            Arg::new("sub")
                .long("sub")
                .value_name("NUMBER_OF_PROJECTS")
                .help("Number of projects to sample from the input file. \
                If not specified, all remaining projects in the input file are used.")
        )
}
/// Queries the GitHub API for the languages and latest-commit hash of every
/// project listed in `input_path`, appending one CSV row per project to the
/// output file.
///
/// * `input_path` — CSV with at least the `ids` and `names` columns.
/// * `output_path` — destination CSV; defaults to `<input>.languages.csv`.
/// * `tokens` — CSV file of GitHub tokens handed to the API client.
/// * `cache_opt` — optional CSV produced by a previous run, used to skip
///   API calls for already-known projects.
/// * `seed` — seed of the deterministic shuffle of the input rows.
/// * `force` — overwrite the output file instead of resuming from it.
/// * `ids` / `names` — column names of the project id / full name.
/// * `sub` — process at most this many projects.
///
/// # Errors
/// Fails on I/O errors, on malformed input rows, or when a successful API
/// answer cannot be parsed.
pub fn run(
    input_path: &str,
    output_path: Option<&str>,
    tokens: &str,
    cache_opt: Option<&String>,
    seed: u64,
    force: bool,
    ids: &str,
    names: &str,
    sub: Option<usize>,
    logger: &Logger,
) -> Result<()> {
    // Index of the id column within a cached CSV line.
    const ID_COL: usize = 0;
    logger.log_tokens(tokens)?;

    // Load only the id and full-name columns of the input CSV.
    let input_file: DataFrame = logger.run_task("Loading input file", || {
        open_csv(
            input_path,
            Some(Schema::from_iter(vec![
                Field::new(ids.into(), DataType::UInt32),
                Field::new(names.into(), DataType::String),
            ])),
            Some(vec![ids, names]),
        )
    })?;

    log_seed(seed);
    // Shuffle the row indices deterministically so interrupted runs resume
    // over the same ordering.
    let mut shuffled_idx: Vec<usize> = (0..input_file.height()).collect();
    logger.run_task("Loading project IDs in random order", || {
        let mut rng: StdRng = SeedableRng::seed_from_u64(seed);
        shuffled_idx.shuffle(&mut rng);
        Ok(())
    })?;
    // Lazily resolve each shuffled index into an (id, full_name) pair; a row
    // that does not match the expected types yields its index as an error.
    let shuffled_rows = shuffled_idx.into_iter().map(|idx| {
        let row = input_file.get_row(idx).unwrap().0;
        match (row[0].clone(), row[1].clone()) {
            (AnyValue::UInt32(id), AnyValue::String(name)) => Ok((id, name)),
            _ => Err(idx),
        }
    });

    let n_proj: usize = input_file.height();
    info!(" {} projects found.", n_proj);

    let default_output_path: String = format!("{}.languages.csv", &input_path);
    let output_file_path: &str = output_path.unwrap_or(&default_output_path);

    // Ids of the projects already present in the output file from a previous
    // (interrupted) run; they are skipped in the main loop below.
    let previous_results: HashSet<u32> = if force {
        HashSet::new()
    } else {
        logger.run_task("Resuming progress", || {
            Ok(if Path::new(&output_file_path).exists() {
                // BUGFIX: read the already-processed ids back from the
                // *output* file, not from the input file — reading the input
                // here marked every project as done and turned resumed runs
                // into no-ops. Note the output id column is always named
                // "id" (see ProjectInfo::header), independently of the
                // `--ids` column name of the input file.
                dataframes::u32(
                    &open_csv(
                        output_file_path,
                        Some(Schema::from_iter(vec![Field::new(
                            "id".into(),
                            DataType::UInt32,
                        )])),
                        Some(vec!["id"]),
                    )?,
                    "id",
                )?
                .into_iter()
                .collect()
            } else {
                HashSet::new()
            })
        })?
    };
    if !previous_results.is_empty() {
        info!(
            " the languages of {} projects have already been queried",
            previous_results.len()
        );
    }

    let mut output_file: CSVFile = CSVFile::new(
        output_file_path,
        if force {
            FileMode::Overwrite
        } else {
            FileMode::Append
        },
    )?;
    output_file.write_header(ProjectInfo::header())?;

    // Optional id -> csv-line cache produced by a previous run; a cache hit
    // skips the GitHub API entirely.
    let cache: HashMap<u32, String> = logger.run_task("Loading cache", || {
        Ok(match cache_opt {
            Some(cache_path) => {
                let cache = CSVFile::new(cache_path, FileMode::Read)?;
                cache.indexed_lines(ID_COL)?
            }
            None => HashMap::new(),
        })
    })?;
    info!(" {} projects found in the cache.", cache.len());

    let mut request_from_cache: usize = 0;
    let gh = Github::new(tokens);
    info!("Starting to query the GitHub API...");
    // Number of projects left to process. Saturate the subtraction so a
    // stale or oversized output file cannot underflow and panic.
    let mut n: usize = sub.unwrap_or_else(|| n_proj.saturating_sub(previous_results.len()));
    let progress_bar: ProgressBar = ProgressBar::new(n_proj as u64);
    progress_bar.set_style(
        indicatif::ProgressStyle::default_bar()
            .template("{elapsed} {wide_bar} {percent}% | Requests from cache: {msg}")?,
    );
    if sub.is_some() {
        progress_bar.set_length(n as u64);
    }

    for row in shuffled_rows {
        if n == 0 {
            break;
        }
        match row {
            Ok((id, full_name)) => {
                if !previous_results.contains(&id) {
                    // Single map lookup instead of contains_key + get/unwrap.
                    let csv_row: String = if let Some(cached) = cache.get(&id) {
                        request_from_cache += 1;
                        cached.clone()
                    } else {
                        // Query the language stats and the commit list; on
                        // any API error, emit a default row carrying the
                        // error text in the name column.
                        let request1 = gh.request(&format!(
                            "https://api.github.com/repos/{full_name}/languages"
                        ));
                        let request2 = gh
                            .request(&format!("https://api.github.com/repos/{full_name}/commits"));
                        match (request1, request2) {
                            (Ok(json_lang), Ok(json_commits)) => {
                                ProjectInfo::from_json(&json_lang, &json_commits)?
                                    .to_csv((id, full_name.to_string()))
                            }
                            (Err(e), _) => ProjectInfo::default().to_csv((id, e.to_string())),
                            (_, Err(e)) => ProjectInfo::default().to_csv((id, e.to_string())),
                        }
                    };
                    writeln!(&mut output_file, "{csv_row}")?;
                    progress_bar.inc(1);
                    progress_bar.set_message(request_from_cache.to_string());
                    n -= 1;
                }
            }
            Err(idx) => {
                bail!("Could not parse row {idx} in the input file");
            }
        }
    }
    Ok(())
}
/// Language statistics and latest-commit hash of a single GitHub project,
/// as assembled from the `languages` and `commits` API answers.
#[derive(Default)]
struct ProjectInfo {
    // Maps a language name to its size as reported by the API.
    // NOTE(review): presumably the size is in bytes of code — confirm
    // against the GitHub `languages` endpoint documentation.
    languages: HashMap<String, i64>,
    // Hash of the project's latest commit; empty string in the Default case
    // (used for rows recording an API error).
    latest_commit: String,
}
impl ToCSV for ProjectInfo {
    type Key = (u32, String);

    /// Renders one output row: `<id>,<name>,<lang1:size1;...>,<sha>`.
    fn to_csv(&self, key: Self::Key) -> String {
        let (id, name) = key;
        let langs = Self::print_languages(&self.languages);
        format!("{id},{name},{langs},{}", self.latest_commit)
    }

    /// Column names of the output CSV, matching `to_csv`'s field order.
    fn header() -> &'static [&'static str] {
        &["id", "name", "languages", "latest_commit"]
    }
}
impl ProjectInfo {
    /// Builds a `ProjectInfo` from the JSON answers of the GitHub
    /// `languages` and `commits` endpoints.
    ///
    /// # Errors
    /// Fails when a language size is not an integer, or when the first
    /// commit entry carries no `sha` field.
    fn from_json(json_lang: &json::JsonValue, json_commit: &json::JsonValue) -> Result<Self> {
        let mut languages: HashMap<String, i64> = HashMap::new();
        for (lan, size) in json_lang.entries() {
            languages.insert(
                lan.to_owned(),
                size.as_i64()
                    .with_context(|| anyhow!("Could not parse the size of the language {lan}"))?,
            );
        }
        // The commits endpoint returns newest-first, so entry 0 is the
        // latest commit.
        let sha = get_field::<String>(&json_commit[0], "sha")?;
        Ok(Self {
            languages,
            latest_commit: sha,
        })
    }

    /// Serializes the language map as `name:size` pairs joined by ';'.
    ///
    /// BUGFIX: the pairs are now sorted before joining. Iterating a
    /// `HashMap` directly yields a nondeterministic order, so the same
    /// project produced differently-ordered language columns on every run,
    /// defeating diffing/caching of the output.
    fn print_languages(languages: &HashMap<String, i64>) -> String {
        let mut pairs: Vec<String> = languages
            .iter()
            .map(|(name, size)| format!("{name}:{size}"))
            .collect();
        // Keys are unique, so sorting the "name:size" strings orders by name.
        pairs.sort_unstable();
        pairs.join(";")
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::ensure;

    // Directory holding the CSV fixtures for this phase.
    const TEST_DATA: &str = "tests/data/phases/languages";

    /// End-to-end run of the language scraper over a small fixture file.
    /// Requires a valid `ghtokens.csv` token file in the working directory
    /// and network access to the GitHub API.
    #[test]
    fn test_language_scraper() -> Result<()> {
        let input: String = format!("{TEST_DATA}/repos.csv");
        let expected_output: String = format!("{input}.languages.csv");
        ensure!(std::path::Path::new(&input).exists());
        // Start from a clean slate; tolerate a missing previous output.
        delete_file(&expected_output, true)?;
        let tokens: String = "ghtokens.csv".to_string();
        run(
            &input,
            None,
            &tokens,
            None,
            0,
            false,
            "id",
            "name",
            None,
            test_logger(),
        )?;
        // The run must have produced the output file; clean it up, failing
        // if it does not exist.
        delete_file(&expected_output, false)
    }
}