use std::collections::BTreeMap;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use crate::api::github::{StargazerEntry, UserProfile};
use crate::collectors::stars::StarsRawData;
/// Star-related features derived from raw repository data by [`compute`].
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
pub struct StarsFeatures {
/// Stargazer count copied from the repository metadata.
pub total_stars: u64,
/// Fork count copied from the repository metadata.
pub forks_count: u64,
/// Watcher count copied from the repository metadata.
pub watchers_count: u64,
/// `forks_count / total_stars`, or `0.0` when the repository has no stars
/// (passed through `round6` by [`compute`]).
pub fork_to_star_ratio: f64,
/// `watchers_count / total_stars`, or `0.0` when the repository has no stars
/// (passed through `round6` by [`compute`]).
pub watcher_to_star_ratio: f64,
/// Fraction of sampled profiles for which [`matches_low_activity_profile`]
/// returned `true`; `None` when no profiles were sampled.
pub low_activity_share: Option<f64>,
/// Result of [`lockstep_z_score`] over the dated entries in the sample;
/// `None` when that function could not produce a score.
pub lockstep_z_score: Option<f64>,
/// Number of sampled profiles the share above was computed over.
pub sample_size: usize,
/// Language reported by the repository metadata, if any.
pub primary_language: Option<String>,
/// Whole days between the repository's creation time and the `now` passed
/// to [`compute`], clamped to zero.
pub repo_age_days: u64,
/// `archived` flag copied from the repository metadata.
pub archived: bool,
}
/// Derives [`StarsFeatures`] from the collected star data.
///
/// * `raw` - repository metadata plus the sampled `(stargazer, profile)` pairs.
/// * `now` - reference instant used to compute the repository age.
///
/// Both ratios fall back to `0.0` for a repository without stars, and
/// `low_activity_share` is `None` when the sample is empty. Ratios and the
/// share are passed through `crate::utils::time::round6` before being stored.
#[must_use]
pub fn compute(raw: &StarsRawData, now: OffsetDateTime) -> StarsFeatures {
    let meta = &raw.repo_metadata;
    let total = meta.stargazers_count;
    let forks = meta.forks_count;
    let watchers = meta.watchers_count;

    // Shared guard for both ratios: never divide by a zero star count.
    let per_star = |count: u64| -> f64 {
        if total > 0 {
            count as f64 / total as f64
        } else {
            0.0
        }
    };
    let fork_to_star_ratio = per_star(forks);
    let watcher_to_star_ratio = per_star(watchers);

    let sample = &raw.sampled_profiles;
    let sample_size = sample.len();
    let low_activity_share = (!sample.is_empty()).then(|| {
        let flagged = sample
            .iter()
            .filter(|(entry, profile)| matches_low_activity_profile(entry, profile))
            .count();
        flagged as f64 / sample_size as f64
    });

    // Negative ages (creation time after `now`) are clamped to zero.
    let age = now - meta.created_at;
    let repo_age_days = age.whole_days().max(0) as u64;

    // Only entries that carry a star timestamp can feed the lockstep score.
    let mut starred_dates: Vec<OffsetDateTime> = Vec::new();
    for (entry, _) in sample.iter() {
        if let StargazerEntry::WithDate { starred_at, .. } = entry {
            starred_dates.push(*starred_at);
        }
    }
    let lockstep = lockstep_z_score(&starred_dates);

    StarsFeatures {
        total_stars: total,
        forks_count: forks,
        watchers_count: watchers,
        fork_to_star_ratio: crate::utils::time::round6(fork_to_star_ratio),
        watcher_to_star_ratio: crate::utils::time::round6(watcher_to_star_ratio),
        low_activity_share: low_activity_share.map(crate::utils::time::round6),
        lockstep_z_score: lockstep,
        sample_size,
        primary_language: meta.language.clone(),
        repo_age_days,
        archived: meta.archived,
    }
}
/// Number of consecutive days in the baseline window whose mean and standard
/// deviation [`lockstep_z_score`] compares each scored day against.
const LOCKSTEP_BASELINE_DAYS: usize = 28;
/// Number of days skipped between the end of the baseline window and the day
/// being scored in [`lockstep_z_score`].
const LOCKSTEP_LAG_DAYS: usize = 7;
/// Minimum span, in whole days between the first and last star date, below
/// which [`lockstep_z_score`] returns `None`.
const LOCKSTEP_MIN_SERIES_DAYS: i64 = (LOCKSTEP_BASELINE_DAYS + LOCKSTEP_LAG_DAYS) as i64;
/// Computes the largest single-day z-score of star counts against a lagged
/// baseline window.
///
/// The timestamps are bucketed by calendar date (`OffsetDateTime::date`) into
/// a dense per-day series running from the earliest to the latest date, with
/// missing days counted as zero. Every day that has a full baseline of
/// `LOCKSTEP_BASELINE_DAYS` days ending `LOCKSTEP_LAG_DAYS` days before it is
/// scored as `(count - mean) / max(std, 1.0)`, where `mean` and `std` are the
/// population statistics of that baseline. The maximum score is returned after
/// being passed through `crate::utils::time::round6`.
///
/// Returns `None` when `starred_dates` is empty, when the series spans fewer
/// than `LOCKSTEP_MIN_SERIES_DAYS` days, or when no finite score was produced.
#[must_use]
pub fn lockstep_z_score(starred_dates: &[OffsetDateTime]) -> Option<f64> {
    if starred_dates.is_empty() {
        return None;
    }

    let mut stars_per_day: BTreeMap<time::Date, u32> = BTreeMap::new();
    for stamp in starred_dates {
        *stars_per_day.entry(stamp.date()).or_insert(0) += 1;
    }

    // The map is ordered, so its first and last keys bound the series.
    let first_day = *stars_per_day.keys().next()?;
    let last_day = *stars_per_day.keys().next_back()?;
    let span_days = (last_day - first_day).whole_days();
    if span_days < LOCKSTEP_MIN_SERIES_DAYS {
        return None;
    }

    // Dense series: one slot per calendar day, zero where nobody starred.
    let mut daily_counts: Vec<u32> = Vec::with_capacity(span_days.unsigned_abs() as usize + 1);
    let mut day = first_day;
    while day <= last_day {
        daily_counts.push(stars_per_day.get(&day).copied().unwrap_or(0));
        let Some(following) = day.next_day() else {
            break;
        };
        day = following;
    }

    // Distance from the first baseline day to the day being scored.
    let scored_offset = LOCKSTEP_BASELINE_DAYS + LOCKSTEP_LAG_DAYS;
    if scored_offset >= daily_counts.len() {
        return None;
    }

    let baseline_len = f64::from(LOCKSTEP_BASELINE_DAYS as u32);
    let mut peak = f64::NEG_INFINITY;
    // Each frame holds the baseline, the lag gap, and finally the scored day.
    for frame in daily_counts.windows(scored_offset + 1) {
        let baseline = &frame[..LOCKSTEP_BASELINE_DAYS];
        let observed = f64::from(frame[scored_offset]);

        let mean = baseline.iter().map(|count| f64::from(*count)).sum::<f64>() / baseline_len;
        let variance = baseline
            .iter()
            .map(|count| {
                let deviation = f64::from(*count) - mean;
                deviation * deviation
            })
            .sum::<f64>()
            / baseline_len;
        // Floor the deviation at 1.0 so quiet baselines do not inflate the score.
        let spread = variance.sqrt().max(1.0);

        let z = (observed - mean) / spread;
        if z > peak {
            peak = z;
        }
    }

    peak.is_finite()
        .then(|| crate::utils::time::round6(peak))
}
/// Reports whether a stargazer's profile matches every low-activity signal.
///
/// A profile matches only when all of the following hold:
/// * the account was created at or after Unix timestamp `1_640_995_200`
///   (2022-01-01T00:00:00Z);
/// * it has at most one follower and follows at most one account;
/// * it has no public gists and at most four public repositories;
/// * its bio, blog and email are all absent or whitespace-only;
/// * when the stargazer entry carries a star timestamp, the star was given on
///   the same calendar date the account was created.
///
/// Entries without a star timestamp skip the last check.
#[must_use]
pub fn matches_low_activity_profile(stargazer: &StargazerEntry, profile: &UserProfile) -> bool {
    let cutoff = OffsetDateTime::from_unix_timestamp(1_640_995_200).expect("static unix timestamp");

    // Any single disqualifier rules the profile out; evaluation short-circuits
    // in the order listed.
    let disqualified = profile.created_at < cutoff
        || profile.followers > 1
        || profile.following > 1
        || profile.public_gists > 0
        || profile.public_repos > 4
        || non_empty(&profile.bio)
        || non_empty(&profile.blog)
        || non_empty(&profile.email);
    if disqualified {
        return false;
    }

    match stargazer {
        StargazerEntry::WithDate { starred_at, .. } => {
            starred_at.date() == profile.created_at.date()
        }
        StargazerEntry::Plain(_) => true,
    }
}
/// Returns `true` when `s` holds text that is not blank after trimming
/// whitespace; `None` and whitespace-only strings yield `false`.
fn non_empty(s: &Option<String>) -> bool {
    matches!(s, Some(text) if !text.trim().is_empty())
}
/// Looks up the pair of multipliers associated with a repository language.
///
/// Language names are matched exactly (case-sensitive). `None` and any
/// language without an entry yield the neutral pair `(1.0, 1.0)`.
///
/// NOTE(review): what each component of the pair scales is not visible in
/// this function — confirm against the callers.
#[must_use]
pub fn ecosystem_multipliers(language: Option<&str>) -> (f64, f64) {
    const NEUTRAL: (f64, f64) = (1.0, 1.0);
    const TABLE: [(&str, (f64, f64)); 7] = [
        ("TypeScript", (0.7, 0.8)),
        ("JavaScript", (0.7, 0.8)),
        ("Python", NEUTRAL),
        ("Go", (1.1, 1.0)),
        ("Rust", (0.9, 0.9)),
        ("Java", NEUTRAL),
        ("Kotlin", NEUTRAL),
    ];

    let Some(wanted) = language else {
        return NEUTRAL;
    };
    TABLE
        .iter()
        .find(|(name, _)| *name == wanted)
        .map_or(NEUTRAL, |&(_, pair)| pair)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::api::github::UserStub;

    /// Midnight UTC at the start of the given calendar day.
    fn ts(year: i32, month: u8, day: u8) -> OffsetDateTime {
        let month = time::Month::try_from(month).unwrap();
        let date = time::Date::from_calendar_date(year, month, day).unwrap();
        date.midnight().assume_utc()
    }

    /// A profile that satisfies every low-activity check on its own.
    fn flagged_profile() -> UserProfile {
        UserProfile {
            login: "low_a".into(),
            created_at: ts(2024, 6, 1),
            followers: 0,
            following: 0,
            public_repos: 1,
            public_gists: 0,
            bio: None,
            blog: None,
            email: None,
            user_type: Some("User".into()),
        }
    }

    /// Stargazer entry without a star timestamp.
    fn plain_entry(login: &str, user_type: Option<&str>) -> StargazerEntry {
        StargazerEntry::Plain(UserStub {
            login: login.into(),
            user_type: user_type.map(Into::into),
        })
    }

    /// Undated entry for tests where the stargazer identity is irrelevant.
    fn anonymous_entry() -> StargazerEntry {
        plain_entry("x", None)
    }

    /// Stargazer entry that records when the star was given.
    fn entry_with_date(login: &str, day: OffsetDateTime) -> StargazerEntry {
        StargazerEntry::WithDate {
            starred_at: day,
            user: UserStub {
                login: login.into(),
                user_type: Some("User".into()),
            },
        }
    }

    #[test]
    fn low_activity_profile_flagged_with_all_8_signals() {
        let stargazer = plain_entry("low_a", Some("User"));
        assert!(matches_low_activity_profile(&stargazer, &flagged_profile()));
    }

    #[test]
    fn pre_2022_account_not_flagged() {
        let profile = UserProfile {
            created_at: ts(2021, 12, 31),
            ..flagged_profile()
        };
        assert!(!matches_low_activity_profile(&anonymous_entry(), &profile));
    }

    #[test]
    fn high_followers_not_flagged() {
        let profile = UserProfile {
            followers: 5,
            ..flagged_profile()
        };
        assert!(!matches_low_activity_profile(&anonymous_entry(), &profile));
    }

    #[test]
    fn non_empty_bio_not_flagged() {
        let profile = UserProfile {
            bio: Some("hello".into()),
            ..flagged_profile()
        };
        assert!(!matches_low_activity_profile(&anonymous_entry(), &profile));
    }

    #[test]
    fn whitespace_bio_treated_as_empty() {
        let profile = UserProfile {
            bio: Some(" ".into()),
            ..flagged_profile()
        };
        assert!(matches_low_activity_profile(&anonymous_entry(), &profile));
    }

    #[test]
    fn starred_at_same_day_as_created_flagged() {
        let stargazer = entry_with_date("low_a", ts(2024, 6, 1));
        assert!(matches_low_activity_profile(&stargazer, &flagged_profile()));
    }

    #[test]
    fn starred_at_different_day_not_flagged() {
        let stargazer = entry_with_date("low_a", ts(2024, 6, 5));
        assert!(!matches_low_activity_profile(&stargazer, &flagged_profile()));
    }

    #[test]
    fn ecosystem_multipliers_table() {
        let cases = [
            (Some("TypeScript"), (0.7, 0.8)),
            (Some("JavaScript"), (0.7, 0.8)),
            (Some("Go"), (1.1, 1.0)),
            (Some("Rust"), (0.9, 0.9)),
            (Some("Python"), (1.0, 1.0)),
            (None, (1.0, 1.0)),
            (Some("Cobol"), (1.0, 1.0)),
        ];
        for (language, expected) in cases {
            assert_eq!(ecosystem_multipliers(language), expected);
        }
    }
}