// scyros 0.2.2
//
// A framework to design sound, reproducible and scalable repository mining studies on GitHub.
// Copyright 2025 Andrea Gilot
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#![doc = include_str!("../docs/metadata.md")]

use anyhow::{bail, Result};
use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Write;
use std::iter::FromIterator as _;
use std::path::Path;

use crate::utils::csv::*;
use crate::utils::dataframes;
use crate::utils::fs::*;
use crate::utils::github::*;
use crate::utils::github_api::Github;
use crate::utils::json::*;
use crate::utils::logger::{log_seed, Logger};
use clap::ArgAction;
use clap::{Arg, Command};
use indicatif::ProgressBar;
use json::JsonValue;
use polars::frame::DataFrame;
use polars::prelude::*;
use rand::rngs::StdRng;
use rand::seq::SliceRandom as _;
use rand::SeedableRng;
use tracing::info;

/// Command line arguments parsing.
///
/// Builds the `clap` [`Command`] for the `metadata` subcommand; the matches are
/// consumed by [`run`].
pub fn cli() -> Command {
    Command::new("metadata")
        .about("Collect the metadata of GitHub projects")
        .long_about(include_str!("../docs/metadata.md"))
        .author("Andrea Gilot <andrea.gilot@it.uu.se>")
        .disable_version_flag(true)
        .arg(
            Arg::new("output")
                .short('o')
                .long("output")
                .value_name("OUTPUT_FILE.csv")
                .help("Path to the output csv file to store the metadata. \
                       By default, the name of the output file is the same as the input file with the suffix '.metadata.csv'.")
                .required(false)
        )
        .arg(
            Arg::new("input")
                .short('i')
                .long("input")
                .value_name("INPUT_FILE.csv")
                .help("Path to the input csv file to use. One of the columns must contain the full names of the projects. ")
                .required(true)
        )
        .arg(
            Arg::new("tokens")
                .short('t')
                .long("tokens")
                .value_name("TOKENS_FILE.csv")
                .help("Path to the file containing the GitHub tokens to use. It must be a valid CSV file with one column named 'token' and where every line is a \
                       valid GitHub token (e.g ghp_Ab0C1D2eFg3hIjk4LM56oPqRsTuvWX7yZa8B).")
                .required(true)
        )
        .arg(
            Arg::new("cache")
                .short('c')
                .long("cache")
                .value_name("CACHE.csv")
                .help("Path to the cache file to use. Must have been generated by a previous run of this program.")
                .required(false)
        )
        .arg(
            Arg::new("seed")
                .short('s')
                .long("seed")
                .value_name("SEED")
                .help("Seed used to randomly shuffle the input data.")
                .default_value("2955615809866670875")
                .value_parser(clap::value_parser!(u64)),
        )
        .arg(
            Arg::new("force")
                .short('f')
                .long("force")
                .help("Override the output file if it already exists.")
                .action(ArgAction::SetTrue),
        )
        .arg(
            Arg::new("ids")
                .long("ids")
                .help("Name of the column containing the ids of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("id")
        )
        .arg(
            Arg::new("names")
                .long("names")
                .help("Name of the column containing the full names of the projects.")
                .value_name("COLUMN_NAME")
                .default_value("name")
        )
        .arg(
            Arg::new("sub")
                .long("sub")
                .value_name("NUMBER_OF_PROJECTS")
                .help("Number of projects to sample from the input file. \
                       If not specified, all remaining projects in the input file are used.")
                // BUG FIX: `run` takes `Option<usize>` for this argument; without a
                // value parser, clap 4 stores the value as a String and
                // `matches.get_one::<usize>("sub")` panics at runtime whenever
                // `--sub` is given.
                .value_parser(clap::value_parser!(usize)),
        )
}

/// Collects metadata about GitHub projects.
///
/// The input must be a valid CSV file containing one column with the ids of the projects and one
/// with their full names (the column names are configurable through `ids` and `names`).
/// Other columns are ignored. Such a file can be obtained by running the random-id-sampling program.
/// Projects are processed in a random order derived from `seed`.
/// The cache file must have been generated by a previous run of this program (possibly with a different input file or seed).
/// The results are saved in a CSV file whose name defaults to the input file name with the suffix ".metadata.csv".
/// The output has the following columns:
/// * id: The id of the project.
/// * name: The full name of the project.
/// * language: The main language of the project.
/// * created: The timestamp of the creation of the project.
/// * pushed: The timestamp of the last push to the project.
/// * updated: The timestamp of the last update of the project.
/// * fork: Whether the project is a fork.
/// * disabled: Whether the project is disabled.
/// * archived: Whether the project is archived.
/// * stars: The number of stars of the project.
/// * forks: The number of times the project has been forked.
/// * issues: The number of open issues of the project.
/// * has_issues: Whether the project has issue tracking enabled (different from having 0 issues).
/// * watchers_count: The number of watchers of the project.
/// * subscribers: The number of subscribers of the project.
/// * size: The size of the project in LOC.
/// * license: The name of the license of the project.
///
///
/// # Arguments
///
/// * `input_path` - The path to the input file.
/// * `output_path` - Optional path of the output file; defaults to `<input_path>.metadata.csv`.
/// * `tokens` - The path to the file containing the GitHub tokens.
/// * `cache_opt` - The path to the cache file. If not provided, the program will not use a cache.
/// * `seed` - The seed to use for the random number generator.
/// * `force` - Overwrite the output file instead of resuming from it.
/// * `ids` - Name of the input column holding the project ids.
/// * `names` - Name of the input column holding the project full names.
/// * `sub` - Number of projects to process; all remaining projects when `None`.
/// * `logger` - Logger used to report progress.
///
///
/// # Returns
///
/// * Unit if the program finished successfully or an error message if an error occurred.
///
pub fn run(
    input_path: &str,
    output_path: Option<&String>,
    tokens: &str,
    cache_opt: Option<&String>,
    seed: u64,
    force: bool,
    ids: &str,
    names: &str,
    sub: Option<usize>,
    logger: &Logger,
) -> Result<()> {
    // Column index of the id in the cache file.
    const ID_COL: usize = 0;

    // Check if the token file is valid.
    logger.log_tokens(tokens)?;

    // Load input file
    let input_file: DataFrame = logger.run_task("Loading input file", || {
        open_csv(
            input_path,
            Some(Schema::from_iter(vec![
                Field::new(ids.into(), DataType::UInt32),
                Field::new(names.into(), DataType::String),
            ])),
            Some(vec![ids, names]),
        )
    })?;

    log_seed(seed);

    let mut shuffled_idx: Vec<usize> = (0..input_file.height()).collect();

    // Shuffle the row indices so projects are visited in a reproducible random order.
    logger.run_task("Loading project IDs in random order", || {
        let mut rng: StdRng = SeedableRng::seed_from_u64(seed);
        shuffled_idx.shuffle(&mut rng);
        Ok(())
    })?;

    // Lazily materialize (id, name) pairs; a row that does not match the expected
    // (UInt32, String) layout surfaces as `Err(row_index)`.
    let shuffled_rows = shuffled_idx.into_iter().map(|idx| {
        // Safe unwrap: `idx` comes from `0..input_file.height()`.
        let row = input_file.get_row(idx).unwrap().0;

        match (row[0].clone(), row[1].clone()) {
            (AnyValue::UInt32(id), AnyValue::String(name)) => Ok((id, name)),
            _ => Err(idx),
        }
    });

    let n_proj: usize = input_file.height();

    info!("  {} projects found.", n_proj);

    // Name of the output file.
    let default_output_path: String = format!("{}.metadata.csv", &input_path);
    let output_file_path: &str = output_path.unwrap_or(&default_output_path);

    // Load the ids already written to the output file by a previous run, so that
    // resuming does not query them again.
    let previous_results: HashSet<u32> = if force {
        HashSet::new()
    } else {
        logger.run_task("Resuming progress", || {
            Ok(if Path::new(&output_file_path).exists() {
                dataframes::u32(
                    // BUG FIX: read the ids back from the *output* file, not from the
                    // input file. Reading the input made every project look as if it
                    // had already been processed. The output column is always named
                    // "id" (see `ProjectMetadata::header`), regardless of `ids`.
                    &open_csv(
                        output_file_path,
                        Some(Schema::from_iter(vec![Field::new(
                            "id".into(),
                            DataType::UInt32,
                        )])),
                        Some(vec!["id"]),
                    )?,
                    "id",
                )?
                .into_iter()
                .collect()
            } else {
                HashSet::new()
            })
        })?
    };

    if !previous_results.is_empty() {
        info!(
            "  the metadata of {} projects have already been queried",
            previous_results.len()
        );
    }

    let mut output_file: CSVFile = CSVFile::new(
        output_file_path,
        if force {
            FileMode::Overwrite
        } else {
            FileMode::Append
        },
    )?;

    output_file.write_header(ProjectMetadata::header())?;

    // Load the cache mapping project id -> pre-rendered CSV row.
    let cache: HashMap<u32, String> = logger.run_task("Loading cache", || {
        Ok(match cache_opt {
            Some(cache_path) => {
                let cache = CSVFile::new(cache_path, FileMode::Read)?;
                cache.indexed_lines(ID_COL)?
            }
            None => HashMap::new(),
        })
    })?;

    info!("  {} projects found in the cache.", cache.len());

    // Number of requests that were saved by using the cache.
    let mut request_from_cache: usize = 0;

    let gh = Github::new(tokens);

    info!("Starting to query the GitHub API...");

    // Number of projects to process. `saturating_sub` guards against a panic when
    // the resumed output already contains more ids than the current input
    // (e.g. when resuming against a different input file).
    let mut n: usize = match sub {
        Some(m) => m,
        None => n_proj.saturating_sub(previous_results.len()),
    };

    // Create a progress bar
    let progress_bar: ProgressBar = ProgressBar::new(n_proj as u64);

    progress_bar.set_style(
        indicatif::ProgressStyle::default_bar()
            .template("{elapsed} {wide_bar} {percent}% | Requests from cache: {msg}")?,
    );

    if sub.is_some() {
        progress_bar.set_length(n as u64);
    }

    for row in shuffled_rows {
        if n == 0 {
            break;
        }
        match row {
            Ok((id, full_name)) => {
                // Skip projects already present in the output, then prefer the cache
                // over a live API request.
                if !previous_results.contains(&id) {
                    // Row to write in the output file. Single map lookup instead of
                    // `contains_key` + `unwrap`.
                    let csv_row: String = if let Some(cached) = cache.get(&id) {
                        request_from_cache += 1;
                        cached.clone()
                    } else {
                        match gh.request(&format!("https://api.github.com/repos/{full_name}")) {
                            Ok(json) => ProjectMetadata::parse_json(&json, ())?
                                .to_csv((id, full_name.to_string())),
                            // On request failure, write a default row carrying the
                            // trimmed error message in place of the project name.
                            Err(e) => ProjectMetadata::default()
                                .to_csv((id, e.to_string().trim().to_string())),
                        }
                    };

                    writeln!(&mut output_file, "{csv_row}")?;

                    progress_bar.inc(1);
                    progress_bar.set_message(request_from_cache.to_string());
                    n -= 1;
                }
            }
            Err(idx) => {
                bail!("Could not parse row {idx} in the input file")
            }
        }
    }
    Ok(())
}

/// Represents the metadata of a GitHub project, populated from the GitHub
/// repository API response by [`FromGitHub::parse_json`] and rendered to a CSV
/// row by [`ToCSV::to_csv`].
/// The description of the project and the homepage are omitted as they can produce errors in the CSV file.
struct ProjectMetadata {
    /// The main language of the project (empty when GitHub reports none).
    language: String,
    /// The timestamp of the creation of the project.
    created: i64,
    /// The timestamp of the last push to the project.
    pushed: i64,
    /// The timestamp of the last update of the project.
    updated: i64,
    /// Whether the project is a fork.
    fork: bool,
    /// Whether the project is disabled.
    disabled: bool,
    /// Whether the project is archived.
    archived: bool,
    /// The number of stars of the project.
    stars: u32,
    /// The number of times the project has been forked.
    forks: u32,
    /// The number of open issues of the project.
    issues: u32,
    /// Whether the project has issue tracking enabled (different from having 0 issues).
    has_issues: bool,
    /// The number of watchers of the project.
    watchers_count: u32,
    /// The number of subscribers of the project.
    subscribers: u32,
    /// The size of the project in LOC.
    size: u64,
    /// The name of the license of the project ("unknown" when absent).
    license: String,
}

/// Default implementation for ProjectMetadata.
/// Sets all the dates to 1970-01-01T00:00:00Z, the booleans to false, the numbers to 0 and the strings to empty.
impl Default for ProjectMetadata {
    fn default() -> Self {
        Self {
            language: String::new(),
            created: 0,
            pushed: 0,
            updated: 0,
            fork: false,
            disabled: false,
            archived: false,
            stars: 0,
            forks: 0,
            issues: 0,
            has_issues: false,
            watchers_count: 0,
            subscribers: 0,
            size: 0,
            license: String::new(),
        }
    }
}

impl ToCSV for ProjectMetadata {
    /// Key = (project id, project full name — or an error message when the
    /// API request failed).
    type Key = (u32, String);

    fn header() -> &'static [&'static str] {
        &[
            "id",
            "name",
            "language",
            "created",
            "pushed",
            "updated",
            "fork",
            "disabled",
            "archived",
            "stars",
            "forks",
            "issues",
            "has_issues",
            "watchers_count",
            "subscribers",
            "size",
            "license",
        ]
    }

    /// Renders one CSV row, in the same column order as [`Self::header`].
    /// Booleans are encoded as 1/0.
    fn to_csv(&self, key: Self::Key) -> String {
        // Boolean -> "1"/"0" encoding expected by the rest of the pipeline.
        fn flag(b: bool) -> u8 {
            u8::from(b)
        }

        let (id, name) = key;
        [
            id.to_string(),
            name,
            self.language.clone(),
            self.created.to_string(),
            self.pushed.to_string(),
            self.updated.to_string(),
            flag(self.fork).to_string(),
            flag(self.disabled).to_string(),
            flag(self.archived).to_string(),
            self.stars.to_string(),
            self.forks.to_string(),
            self.issues.to_string(),
            flag(self.has_issues).to_string(),
            self.watchers_count.to_string(),
            self.subscribers.to_string(),
            self.size.to_string(),
            self.license.clone(),
        ]
        .join(",")
    }
}

impl FromGitHub for ProjectMetadata {
    type Complement = ();

    /// Builds a [`ProjectMetadata`] from the JSON body returned by the GitHub
    /// "get a repository" endpoint.
    ///
    /// # Errors
    ///
    /// Returns an error if a mandatory field is missing or has an unexpected type.
    fn parse_json(json: &JsonValue, _complement: ()) -> Result<Self> {
        // `language` is null for repositories without any detected language.
        // CONSISTENCY FIX: run it through `clean_string_to_csv` like `license`
        // below, so it can never corrupt the comma-separated output row.
        let language: String = if !json["language"].is_null() {
            clean_string_to_csv(&get_field::<String>(json, "language")?)
        } else {
            String::new()
        };
        // Timestamps are ISO-8601 strings in the API response; parsed to Unix time.
        let created: i64 = Self::parse_date_time(json, "created_at")?;
        let pushed: i64 = Self::parse_date_time(json, "pushed_at")?;
        let updated: i64 = Self::parse_date_time(json, "updated_at")?;
        let fork = get_field::<bool>(json, "fork")?;
        let disabled = get_field::<bool>(json, "disabled")?;
        let archived = get_field::<bool>(json, "archived")?;
        let stars = get_field::<u32>(json, "stargazers_count")?;
        let forks = get_field::<u32>(json, "forks_count")?;
        let issues = get_field::<u32>(json, "open_issues_count")?;
        let has_issues = get_field::<bool>(json, "has_issues")?;
        let watchers_count = get_field::<u32>(json, "watchers_count")?;
        let subscribers = get_field::<u32>(json, "subscribers_count")?;
        let size = get_field::<u64>(json, "size")?;

        // Parse the license field, defaulting to "unknown" if not present.
        let license = if !json["license"].is_null() {
            clean_string_to_csv(&get_field::<String>(&json["license"], "name")?)
        } else {
            "unknown".to_string()
        };

        // Return a new ProjectMetadata instance with the parsed data.
        Ok(Self {
            language,
            created,
            pushed,
            updated,
            fork,
            disabled,
            archived,
            stars,
            forks,
            issues,
            has_issues,
            watchers_count,
            subscribers,
            size,
            license,
        })
    }
}

#[cfg(test)]
mod tests {
    use anyhow::ensure;

    use super::*;
    use crate::utils::{dataframes::has_column, logger::test_logger};

    const TEST_DATA: &str = "tests/data/phases/metadata";

    /// End-to-end test: runs the metadata phase on the checked-in fixture and
    /// compares the produced CSV to the expected one, after normalizing the row
    /// order by project name.
    #[test]
    fn test_language_scraper() -> Result<()> {
        let input_file = format!("{TEST_DATA}/repos.csv");
        let output_file = format!("{input_file}.metadata.csv");
        let tokens_file = "ghtokens.csv".to_string();

        ensure!(
            std::path::Path::new(&input_file).exists(),
            "Input file does not exist"
        );
        // Start from a clean slate so `run` does not resume a previous output.
        delete_file(&output_file, true)?;

        run(
            &input_file,
            None,
            &tokens_file,
            None,
            0,
            false,
            "id",
            "name",
            None,
            test_logger(),
        )?;

        // Loads a CSV, checks that the 'name' column exists (failing with the
        // given message otherwise) and returns the frame sorted by name.
        let load_sorted = |path: &str, missing_msg: &str| -> Result<DataFrame> {
            let df = open_csv(path, None, None)?;
            ensure!(has_column(&df, "name"), "{}", missing_msg);
            Ok(df.sort(vec!["name"], SortMultipleOptions::new())?)
        };

        let sorted_output_df = load_sorted(&output_file, "Output does not have 'name' column")?;
        let sorted_expected_df = load_sorted(
            &format!("{output_file}.expected"),
            "Expected output does not have 'name' column",
        )?;

        assert_eq!(sorted_expected_df, sorted_output_df);

        delete_file(&output_file, false)
    }
}