#![forbid(unsafe_code)]
use std::collections::HashMap;
use std::fs;
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};

use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use clap::Parser;

use pi::extension_license::{ScreeningReport, VerdictStatus};
use pi::extension_popularity::{CandidateItem, CandidatePool};
use pi::extension_scoring::{
    CandidateInput, CompatStatus, Compatibility, Gates, LicenseInfo, MarketplaceSignals, Recency,
    Redistribution, RiskInfo, Signals, Tags, score_candidates,
};
use pi::extension_validation::{ValidationReport, ValidationStatus};
// Command-line arguments for the tiered-corpus builder.
//
// NOTE(review): plain `//` comments are used deliberately — `///` doc
// comments on clap-derived fields become `--help` text and would change the
// CLI's runtime output.
#[derive(Debug, Parser)]
#[command(name = "ext_tiered_corpus")]
#[command(about = "Build tiered extension corpus from merged research signals")]
struct Args {
    // Path to the validation report (required input).
    #[arg(long)]
    validated: PathBuf,
    // Optional candidate pool with popularity/marketplace signals.
    #[arg(long)]
    candidate_pool: Option<PathBuf>,
    // Optional license screening report keyed by canonical id.
    #[arg(long)]
    license_report: Option<PathBuf>,
    // Destination path for the full scoring report (JSON).
    #[arg(long)]
    out: PathBuf,
    // Optional destination for just the report summary (JSON).
    #[arg(long)]
    summary_out: Option<PathBuf>,
    // Optional destination for a per-item JSONL log.
    #[arg(long)]
    log_out: Option<PathBuf>,
    // Scoring reference time as an RFC 3339 timestamp; defaults to "now".
    #[arg(long)]
    as_of: Option<String>,
    // How many top candidates the scorer should select.
    #[arg(long, default_value_t = 20)]
    top_n: usize,
    // Task identifier echoed in the stderr summary header.
    #[arg(long, default_value = "bd-34io")]
    task_id: String,
}
/// One validated true-extension candidate joined with its optional
/// popularity-pool entry and license-screening verdict.
struct MergedCandidate {
    /// Stable identifier used to join records across the input reports.
    canonical_id: String,
    /// Human-readable extension name (also used as a fallback pool key).
    name: String,
    /// Source tier from validation, e.g. "official-pi-mono".
    source_tier: Option<String>,
    /// Raw `register*` call names observed in the extension's code.
    registrations: Vec<String>,
    /// Repository URL, if known (used to detect badlogic gists).
    repository_url: Option<String>,
    /// npm package name, if the extension is published as a package.
    npm_package: Option<String>,
    /// Matching candidate-pool entry, when a pool was supplied.
    pool_item: Option<CandidateItem>,
    /// License verdict — currently carried along but unused downstream.
    _license_verdict: Option<VerdictStatus>,
    /// SPDX license expression from the screening report, if any.
    license_spdx: Option<String>,
}
#[allow(clippy::too_many_lines)]
/// Entry point: merges validation, popularity, and license signals into a
/// scored, tiered extension corpus, then writes the report plus optional
/// summary and JSONL log outputs and prints a selection summary to stderr.
fn main() -> Result<()> {
    let args = Args::parse();

    // Required input: the validation report listing confirmed extensions.
    let validated_text = fs::read_to_string(&args.validated)
        .with_context(|| format!("reading validated report from {}", args.validated.display()))?;
    let validated: ValidationReport = serde_json::from_str(&validated_text)
        .with_context(|| format!("parsing validated report from {}", args.validated.display()))?;

    // Optional inputs; absent files simply yield empty lookup maps.
    let pool_map = args
        .candidate_pool
        .as_deref()
        .map(load_pool_map)
        .transpose()?
        .unwrap_or_default();
    let license_map = args
        .license_report
        .as_deref()
        .map(load_license_map)
        .transpose()?
        .unwrap_or_default();

    let merged = merge_candidates(&validated, &pool_map, &license_map);
    eprintln!("Merged {} true extension candidates", merged.len());

    let inputs: Vec<CandidateInput> = merged.iter().map(build_candidate_input).collect();
    // `--as-of` must be RFC 3339 when supplied; otherwise score "as of now".
    let as_of = args
        .as_of
        .as_ref()
        .map(|s| DateTime::parse_from_rfc3339(s).context("parse as_of"))
        .transpose()?
        .map_or_else(Utc::now, |d| d.with_timezone(&Utc));
    let report = score_candidates(&inputs, as_of, as_of, args.top_n);

    // Full report, always written; summary and log only on request.
    let json = serde_json::to_string_pretty(&report).context("serializing scoring report")?;
    fs::write(&args.out, format!("{json}\n"))
        .with_context(|| format!("writing output to {}", args.out.display()))?;
    if let Some(summary_path) = &args.summary_out {
        let summary_json =
            serde_json::to_string_pretty(&report.summary).context("serialize summary")?;
        fs::write(summary_path, format!("{summary_json}\n"))
            .with_context(|| format!("write {}", summary_path.display()))?;
    }
    if let Some(log_path) = &args.log_out {
        // Buffer the JSONL writes — one syscall per line is needlessly slow.
        let file = fs::File::create(log_path)
            .with_context(|| format!("creating log file {}", log_path.display()))?;
        let mut log_file = BufWriter::new(file);
        for item in &report.items {
            let line = serde_json::to_string(item).context("serializing log entry")?;
            writeln!(log_file, "{line}").context("writing log entry")?;
        }
        // Flush explicitly: Drop would silently swallow any write error.
        log_file.flush().context("flushing log file")?;
    }

    // Count every tier in a single pass instead of four scans of the items.
    let (mut tier0, mut tier1, mut tier2, mut excluded) = (0usize, 0usize, 0usize, 0usize);
    for item in &report.items {
        if item.tier == "tier-0" {
            tier0 += 1;
        } else if item.tier == "tier-1" {
            tier1 += 1;
        } else if item.tier == "tier-2" {
            tier2 += 1;
        } else if item.tier == "excluded" {
            excluded += 1;
        }
    }
    eprintln!("=== Tiered Corpus Selection ({}) ===", args.task_id);
    eprintln!("Total scored: {}", report.items.len());
    eprintln!(" Tier-0: {tier0} (official baseline)");
    eprintln!(" Tier-1: {tier1} (must-pass, score ≥ 70)");
    eprintln!(" Tier-2: {tier2} (stretch, score 50-69)");
    eprintln!(" Excluded: {excluded} (score < 50 or gate fail)");
    eprintln!();

    // Tally registration-type coverage across the merged candidates.
    let mut type_counts: HashMap<String, usize> = HashMap::new();
    for m in &merged {
        if m.registrations.is_empty() {
            *type_counts
                .entry("(no registrations)".to_string())
                .or_insert(0) += 1;
        }
        for reg in &m.registrations {
            *type_counts.entry(reg.clone()).or_insert(0) += 1;
        }
    }
    let mut type_list: Vec<_> = type_counts.iter().collect();
    // Descending count, then name — ties now print deterministically.
    type_list.sort_unstable_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
    eprintln!("Extension type coverage:");
    for (ext_type, count) in &type_list {
        eprintln!(" {ext_type:<25} {count}");
    }
    eprintln!("\nOutput written to: {}", args.out.display());
    Ok(())
}

/// Loads the candidate pool and indexes each item under both its id and its
/// name, so later lookups can match either key.
fn load_pool_map(p: &Path) -> Result<HashMap<String, CandidateItem>> {
    let text = fs::read_to_string(p)
        .with_context(|| format!("reading candidate pool from {}", p.display()))?;
    let pool: CandidatePool = serde_json::from_str(&text)
        .with_context(|| format!("parsing candidate pool from {}", p.display()))?;
    let mut map = HashMap::new();
    for item in pool.items {
        map.insert(item.id.clone(), item.clone());
        map.insert(item.name.clone(), item);
    }
    Ok(map)
}

/// Loads the license screening report, keyed by canonical extension id.
fn load_license_map(p: &Path) -> Result<HashMap<String, (VerdictStatus, String)>> {
    let text = fs::read_to_string(p)
        .with_context(|| format!("reading license report from {}", p.display()))?;
    let report: ScreeningReport = serde_json::from_str(&text)
        .with_context(|| format!("parsing license report from {}", p.display()))?;
    let mut map = HashMap::new();
    for v in report.verdicts {
        map.insert(v.canonical_id, (v.verdict, v.license));
    }
    Ok(map)
}

/// Joins validated true-extension candidates with their optional popularity
/// pool entries (matched by canonical id, falling back to name) and license
/// verdicts (matched by canonical id).
fn merge_candidates(
    validated: &ValidationReport,
    pool_map: &HashMap<String, CandidateItem>,
    license_map: &HashMap<String, (VerdictStatus, String)>,
) -> Vec<MergedCandidate> {
    validated
        .candidates
        .iter()
        .filter(|c| c.status == ValidationStatus::TrueExtension)
        .map(|c| {
            let pool_item = pool_map
                .get(&c.canonical_id)
                .or_else(|| pool_map.get(&c.name))
                .cloned();
            let (license_verdict, license_spdx) = license_map
                .get(&c.canonical_id)
                .map_or((None, None), |(v, s)| (Some(*v), Some(s.clone())));
            MergedCandidate {
                canonical_id: c.canonical_id.clone(),
                name: c.name.clone(),
                source_tier: c.source_tier.clone(),
                registrations: c.evidence.registrations.clone(),
                repository_url: c.repository_url.clone(),
                npm_package: c.npm_package.clone(),
                pool_item,
                _license_verdict: license_verdict,
                license_spdx,
            }
        })
        .collect()
}
fn build_candidate_input(m: &MergedCandidate) -> CandidateInput {
let pool = m.pool_item.as_ref();
let signals = pool.map_or_else(Signals::default, |item| {
let is_official = m
.source_tier
.as_deref()
.is_some_and(|t| t == "official-pi-mono");
Signals {
official_listing: Some(is_official),
pi_mono_example: Some(is_official),
badlogic_gist: m
.repository_url
.as_deref()
.map(|url| url.contains("gist.github.com") && url.contains("badlogic"))
.or(Some(false)),
github_stars: item.popularity.github_stars,
github_forks: item.popularity.github_forks,
npm_downloads_month: item.popularity.npm_downloads_monthly,
references: item.popularity.mentions_sources.clone().unwrap_or_default(),
marketplace: Some(MarketplaceSignals {
rank: item.popularity.marketplace_rank,
installs_month: item.popularity.marketplace_installs_monthly,
featured: item.popularity.marketplace_featured,
}),
}
});
let interaction = registrations_to_interactions(&m.registrations);
let capabilities = registrations_to_capabilities(&m.registrations);
let runtime = Some(infer_runtime(m));
let tags = Tags {
runtime,
interaction,
capabilities,
};
let recency = pool.map_or_else(Recency::default, |item| Recency {
updated_at: item
.popularity
.github_last_commit
.clone()
.or_else(|| item.popularity.npm_last_publish.clone())
.or_else(|| item.retrieved.clone()),
});
let compat_status = pool.map_or(Some(CompatStatus::RequiresShims), |item| {
match item.status.as_str() {
"vendored" => Some(CompatStatus::Unmodified),
"unvendored" | "excluded" => Some(CompatStatus::Blocked),
_ => Some(CompatStatus::RequiresShims),
}
});
let compat = Compatibility {
status: compat_status,
..Compatibility::default()
};
let spdx = m
.license_spdx
.clone()
.or_else(|| pool.map(|item| item.license.clone()));
let redistribution = spdx
.as_deref()
.map_or(Redistribution::Unknown, infer_redistribution);
let license = LicenseInfo {
spdx,
redistribution: Some(redistribution),
notes: None,
};
let provenance_pinned = pool.map(|item| item.checksum.is_some());
let deterministic = pool.map(|item| item.status != "unvendored");
let gates = Gates {
provenance_pinned,
deterministic,
};
let risk = RiskInfo::default();
CandidateInput {
id: m.canonical_id.clone(),
name: Some(m.name.clone()),
source_tier: m.source_tier.clone(),
signals,
tags,
recency,
compat,
license,
gates,
risk,
manual_override: None,
}
}
/// Maps raw `register*` call names onto the scoring taxonomy's interaction
/// tags, deduplicating while preserving first-seen order.
///
/// Unknown registrations are skipped; when nothing maps, the list defaults
/// to `"tool_only"` so every candidate carries at least one interaction.
fn registrations_to_interactions(registrations: &[String]) -> Vec<String> {
    // Collect &'static str tags first: the original deduped by allocating a
    // fresh String per `contains` check; this version allocates only once per
    // emitted tag, at the final conversion.
    let mut tags: Vec<&str> = Vec::new();
    for reg in registrations {
        let tag = match reg.as_str() {
            "registerProvider" => "provider",
            "registerTool" => "tool_only",
            "registerCommand" | "registerSlashCommand" | "registerFlag" | "registerShortcut" => {
                "slash_command"
            }
            "registerEvent" | "registerEventHook" => "event_hook",
            "registerMessageRenderer" => "ui_integration",
            // Unknown registration kinds carry no interaction information.
            _ => continue,
        };
        if !tags.contains(&tag) {
            tags.push(tag);
        }
    }
    if tags.is_empty() {
        tags.push("tool_only");
    }
    tags.into_iter().map(str::to_string).collect()
}
/// Maps raw `register*` call names onto capability tags, deduplicating while
/// preserving first-seen order.
///
/// Unlike `registrations_to_interactions`, an empty result is legitimate
/// here — a candidate may declare no recognized capabilities.
fn registrations_to_capabilities(registrations: &[String]) -> Vec<String> {
    // Same allocation fix as the interactions mapper: dedup over &'static str
    // and convert to owned Strings once at the end.
    let mut tags: Vec<&str> = Vec::new();
    for reg in registrations {
        let tag = match reg.as_str() {
            "registerTool" => "exec",
            "registerProvider" => "http",
            "registerMessageRenderer" => "ui",
            "registerCommand" | "registerSlashCommand" => "session",
            // Unknown registration kinds imply no capability.
            _ => continue,
        };
        if !tags.contains(&tag) {
            tags.push(tag);
        }
    }
    tags.into_iter().map(str::to_string).collect()
}
/// Infers a runtime classification for a merged candidate: published npm
/// packages beat provider extensions, which beat the legacy-JS fallback.
fn infer_runtime(m: &MergedCandidate) -> String {
    let has_provider = m.registrations.iter().any(|r| r == "registerProvider");
    let runtime = if m.npm_package.is_some() {
        "pkg-with-deps"
    } else if has_provider {
        "provider-ext"
    } else {
        "legacy-js"
    };
    runtime.to_string()
}
/// Classifies an SPDX license string for redistribution purposes: empty or
/// unknown/unlicensed strings are `Unknown`, any GPL-family license is
/// `Restricted`, everything else is `Ok`.
fn infer_redistribution(spdx: &str) -> Redistribution {
    let normalized = spdx.trim().to_ascii_uppercase();
    match normalized.as_str() {
        "" | "UNKNOWN" | "UNLICENSED" => Redistribution::Unknown,
        // "GPL" as a substring also covers AGPL (and LGPL) identifiers.
        s if s.contains("GPL") => Redistribution::Restricted,
        _ => Redistribution::Ok,
    }
}