// pi_agent_rust 0.1.7

#![forbid(unsafe_code)]

use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::fmt::Write as _;
use std::fs;
use std::path::PathBuf;

use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use clap::Parser;
use pi::extension_popularity::{CandidateItem, CandidatePool, CandidateSource};
use serde::Serialize;
use url::Url;

// Command-line options for the onboarding-queue builder.
//
// NOTE: plain `//` comments are used here on purpose. clap's derive macro turns
// `///` doc comments into `--help` text, so doc comments would change the
// program's user-visible output.
#[derive(Debug, Parser)]
#[command(name = "ext_onboarding_queue")]
#[command(about = "Build a de-biased extension onboarding queue from candidate pool")]
struct Args {
    // Path of the candidate-pool JSON file that is read and parsed as input.
    #[arg(long, default_value = "docs/extension-candidate-pool.json")]
    candidate_pool: PathBuf,
    // Destination of the pretty-printed JSON queue document.
    #[arg(long, default_value = "docs/extension-onboarding-queue.json")]
    json_out: PathBuf,
    // Destination of the Markdown rendering of the queue.
    #[arg(long, default_value = "docs/extension-onboarding-queue.md")]
    md_out: PathBuf,
    // Optional RFC 3339 timestamp used as the generation time; when omitted the
    // current UTC time is used instead.
    #[arg(long)]
    generated_at: Option<String>,
}

/// Human-readable description of the scoring policy, embedded in the output
/// document. Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
struct SelectionPolicy {
    /// Free-text statement of what the queue is optimizing for.
    goal: String,
    /// Textual form of the raw impact formula.
    base_formula: String,
    /// Textual form of the repo-level de-bias and final score formula.
    debias_formula: String,
    /// Textual summary of the Pi-relevance heuristic.
    relevance_formula: String,
    /// Bonus points listed per source tier name.
    source_bonus: BTreeMap<String, i64>,
    /// Bonus points listed per recency band (band label -> points).
    recency_bands_days: BTreeMap<String, i64>,
    /// Multiplier reported for candidates that are not Pi-relevant.
    non_relevant_multiplier: f64,
}

/// Sizes of the leading slices of the ranked queue. Each value is capped by the
/// number of eligible entries, so it can be smaller than the name suggests.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
#[allow(clippy::struct_field_names)]
struct QueueSummary {
    /// Number of entries in the top-100 slice (at most 100).
    top_100: usize,
    /// Number of entries in the top-300 slice (at most 300).
    top_300: usize,
    /// Number of entries that fall within the first 500 ranks (at most 500).
    top_500: usize,
}

/// One ranked candidate in the onboarding queue. Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
struct QueueEntry {
    /// Candidate identifier, copied from the pool item.
    id: String,
    /// Candidate display name, copied from the pool item.
    name: String,
    /// Source tier label of the pool item (drives the source bonus).
    source_tier: String,
    /// Kind of source: `"git"`, `"npm"` or `"url"`.
    source_type: String,
    /// npm package name; only set for npm-sourced candidates.
    package: Option<String>,
    /// Repository URL as recorded in the pool item, if any.
    repository_url: Option<String>,
    /// Normalized key used to group candidates that come from the same repository.
    repo_key: String,
    /// How many eligible candidates share this `repo_key`.
    repo_candidate_count: usize,
    /// 1-based position of this candidate within its repository group,
    /// ordered by raw impact (highest first).
    repo_rank: usize,
    /// Whether `pi_relevance_score` reached the relevance threshold.
    pi_relevant: bool,
    /// Heuristic relevance score, capped at 100.
    pi_relevance_score: u32,
    /// Final ranking score, rounded to three decimals.
    impact_score: f64,
    /// Impact before de-biasing, rounded to three decimals.
    raw_impact_score: f64,
    /// Impact after the repository multiplicity and rank penalties, rounded to
    /// three decimals.
    debiased_impact_score: f64,
    /// GitHub star count (0 when the pool has no value).
    github_stars: u64,
    /// GitHub fork count (0 when the pool has no value).
    github_forks: u64,
    /// GitHub watcher count (0 when the pool has no value).
    github_watchers: u64,
    /// Monthly npm downloads (0 when the pool has no value).
    npm_downloads_monthly: u64,
    /// Timestamp used for recency: last GitHub commit, else last npm publish,
    /// else the item's retrieval time.
    updated_at: Option<String>,
    /// Whole days between `updated_at` and the generation time (never negative);
    /// `None` when there is no timestamp or it is not valid RFC 3339.
    days_since_update: Option<i64>,
    /// Free-form notes copied from the pool item.
    notes: Option<String>,
    /// 1-based overall position in the sorted queue (0 until ranks are assigned).
    rank: usize,
}

/// Top-level JSON document written by this tool. Serialized with camelCase keys.
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
struct QueueDocument {
    /// Schema identifier of the document format.
    schema: String,
    /// Generation time as an RFC 3339 string.
    generated_at: String,
    /// Path of the candidate pool file the queue was built from.
    source_pool_path: String,
    /// Total number of items in the candidate pool (before filtering).
    source_pool_total: usize,
    /// Number of pool items that passed the "unvendored" filter.
    eligible_unvendored: usize,
    /// Number of eligible items flagged as Pi-relevant.
    eligible_pi_relevant: usize,
    /// Description of the scoring policy used.
    selection_policy: SelectionPolicy,
    /// Sizes of the leading queue slices.
    summary: QueueSummary,
    /// Copies of the first 100 ranked entries (fewer if the queue is shorter).
    top_100: Vec<QueueEntry>,
    /// Copies of the first 300 ranked entries (fewer if the queue is shorter).
    top_300: Vec<QueueEntry>,
    /// Every eligible entry in rank order.
    all: Vec<QueueEntry>,
}

/// Work-in-progress queue entry paired with the unrounded scores used for sorting.
#[derive(Debug, Clone)]
struct QueueDraft {
    /// The entry that will eventually be emitted (holds rounded scores).
    entry: QueueEntry,
    /// Full-precision scores, kept separately so rounding never affects ordering.
    score_key: ScoreKey,
}

/// Full-precision scores used to order queue drafts.
#[derive(Debug, Clone, Copy)]
struct ScoreKey {
    /// Impact before any de-biasing.
    raw_impact: f64,
    /// Final impact; seeded with the raw impact until the de-bias pass overwrites it.
    impact: f64,
}

impl ScoreKey {
    /// Orders two keys so that the higher `impact` comes first, breaking ties
    /// with the higher `raw_impact`.
    ///
    /// The result is a proper total order even if a score is NaN: NaN values are
    /// treated as equal to each other and sort after every real number. For
    /// non-NaN inputs the result is identical to a plain descending comparison.
    fn cmp_desc(self, other: Self) -> Ordering {
        Self::desc_nan_last(self.impact, other.impact)
            .then_with(|| Self::desc_nan_last(self.raw_impact, other.raw_impact))
    }

    /// Descending comparison of two floats with NaN placed last.
    fn desc_nan_last(left: f64, right: f64) -> Ordering {
        match right.partial_cmp(&left) {
            Some(ordering) => ordering,
            // At least one side is NaN. `false < true`, so the NaN side compares
            // as "greater", i.e. it ends up later in an ascending sort.
            None => left.is_nan().cmp(&right.is_nan()),
        }
    }
}

fn main() -> Result<()> {
    let args = Args::parse();
    let pool_bytes = fs::read(&args.candidate_pool)
        .with_context(|| format!("read {}", args.candidate_pool.display()))?;
    let pool: CandidatePool =
        serde_json::from_slice(&pool_bytes).context("parse extension candidate pool")?;

    let now = parse_generated_at(args.generated_at)?;
    let unvendored = pool
        .items
        .iter()
        .filter(|item| item.status == "unvendored")
        .collect::<Vec<_>>();

    let repo_counts = build_repo_counts(&unvendored);
    let mut drafts = unvendored
        .iter()
        .map(|item| draft_from_item(item, &repo_counts, now))
        .collect::<Vec<_>>();

    assign_repo_rank(&mut drafts);
    recompute_debiased_scores(&mut drafts);
    drafts.sort_by(|left, right| {
        left.score_key
            .cmp_desc(right.score_key)
            .then_with(|| right.entry.pi_relevant.cmp(&left.entry.pi_relevant))
            .then_with(|| left.entry.id.cmp(&right.entry.id))
    });

    for (index, draft) in drafts.iter_mut().enumerate() {
        draft.entry.rank = index + 1;
    }

    let all = drafts
        .into_iter()
        .map(|draft| draft.entry)
        .collect::<Vec<_>>();
    let top_100 = all.iter().take(100).cloned().collect::<Vec<_>>();
    let top_300 = all.iter().take(300).cloned().collect::<Vec<_>>();
    let eligible_pi_relevant = all.iter().filter(|entry| entry.pi_relevant).count();

    let selection_policy = build_selection_policy();
    let summary = QueueSummary {
        top_100: top_100.len(),
        top_300: top_300.len(),
        top_500: all.len().min(500),
    };

    let document = QueueDocument {
        schema: "pi.ext.onboarding_queue.v2".to_string(),
        generated_at: now.to_rfc3339(),
        source_pool_path: args.candidate_pool.display().to_string(),
        source_pool_total: pool.items.len(),
        eligible_unvendored: all.len(),
        eligible_pi_relevant,
        selection_policy,
        summary,
        top_100,
        top_300,
        all,
    };

    let json =
        serde_json::to_string_pretty(&document).context("serialize onboarding queue json")?;
    fs::write(&args.json_out, format!("{json}\n"))
        .with_context(|| format!("write {}", args.json_out.display()))?;

    let markdown = render_markdown(&document);
    fs::write(&args.md_out, markdown)
        .with_context(|| format!("write {}", args.md_out.display()))?;

    eprintln!(
        "Wrote queue: {} entries ({} pi-relevant) -> {}, {}",
        document.eligible_unvendored,
        document.eligible_pi_relevant,
        args.json_out.display(),
        args.md_out.display()
    );

    Ok(())
}

/// Resolves the generation timestamp.
///
/// Returns the current UTC time when no value was supplied; otherwise parses
/// the value as RFC 3339 and converts it to UTC.
///
/// # Errors
/// Fails when the supplied value is not a valid RFC 3339 timestamp.
fn parse_generated_at(generated_at: Option<String>) -> Result<DateTime<Utc>> {
    match generated_at {
        None => Ok(Utc::now()),
        Some(raw) => DateTime::parse_from_rfc3339(&raw)
            .map(|stamp| stamp.with_timezone(&Utc))
            .with_context(|| format!("parse --generated-at timestamp: {raw}")),
    }
}

/// Builds the static policy description that is embedded in the output document.
fn build_selection_policy() -> SelectionPolicy {
    // Small helper: turn a table of `(label, points)` pairs into an owned map.
    fn to_map(rows: &[(&str, i64)]) -> BTreeMap<String, i64> {
        rows.iter()
            .map(|&(label, points)| (label.to_owned(), points))
            .collect()
    }

    SelectionPolicy {
        goal: String::from(
            "Prioritize high-impact, Pi-relevant unvendored candidates for vendor+conformance onboarding while preventing single-repo popularity inflation.",
        ),
        base_formula: String::from(
            "22*log10(1+stars)+8*log10(1+forks)+4*log10(1+watchers)+22*log10(1+npm_monthly)+recency+source_bonus+signal_bonus",
        ),
        debias_formula: String::from(
            "debiased=raw*(1/(1+ln(repo_count)))*(0.85^(repo_rank-1)); final=debiased*non_relevant_multiplier_if_needed + pi_relevance_score",
        ),
        relevance_formula: String::from(
            "Token and notes heuristic over id/name/package/repo/notes; strong boosts for Pi API imports and registration markers.",
        ),
        source_bonus: to_map(&[
            ("npm-registry", 8),
            ("third-party-github", 6),
            ("community", 5),
            ("official-pi-mono", 1),
        ]),
        recency_bands_days: to_map(&[
            ("<=30", 20),
            ("<=90", 16),
            ("<=180", 12),
            ("<=365", 8),
            ("<=730", 4),
            (">730", 0),
        ]),
        non_relevant_multiplier: 0.45,
    }
}

/// Counts how many of the given candidates map to each repository key.
fn build_repo_counts(items: &[&CandidateItem]) -> BTreeMap<String, usize> {
    items
        .iter()
        .fold(BTreeMap::<String, usize>::new(), |mut tally, item| {
            *tally.entry(repo_key(item)).or_insert(0) += 1;
            tally
        })
}

/// Builds the initial draft for one candidate.
///
/// Computes the raw impact and the Pi-relevance score. The repo rank, the
/// de-biased score, the final impact and the overall rank are left at
/// placeholder values for later passes to fill in.
fn draft_from_item(
    item: &CandidateItem,
    repo_counts: &BTreeMap<String, usize>,
    now: DateTime<Utc>,
) -> QueueDraft {
    let (source_type, package) = source_type_and_package(&item.source);
    let key = repo_key(item);
    // A key missing from the tally is treated as a repository with one candidate.
    let repo_candidate_count = repo_counts.get(&key).copied().unwrap_or(1);

    // Popularity signals; absent metrics count as zero.
    let popularity = &item.popularity;
    let github_stars = popularity.github_stars.unwrap_or(0);
    let github_forks = popularity.github_forks.unwrap_or(0);
    let github_watchers = popularity.github_watchers.unwrap_or(0);
    let npm_downloads_monthly = popularity.npm_downloads_monthly.unwrap_or(0);

    // Freshness: last commit, else last npm publish, else retrieval time.
    let updated_at = popularity
        .github_last_commit
        .clone()
        .or_else(|| popularity.npm_last_publish.clone())
        .or_else(|| item.retrieved.clone());
    let days_since_update = match updated_at.as_deref() {
        Some(stamp) => days_since(stamp, now),
        None => None,
    };

    let freshness_points = recency_bonus(days_since_update);
    let tier_points = source_bonus(&item.source_tier);
    let notes_points = signal_bonus(item.notes.as_deref());
    let raw_impact = weighted_log(22.0, github_stars)
        + weighted_log(8.0, github_forks)
        + weighted_log(4.0, github_watchers)
        + weighted_log(22.0, npm_downloads_monthly)
        + freshness_points
        + tier_points
        + notes_points;

    let relevance = pi_relevance_score(item, package.as_deref());

    QueueDraft {
        entry: QueueEntry {
            id: item.id.clone(),
            name: item.name.clone(),
            source_tier: item.source_tier.clone(),
            source_type: source_type.to_string(),
            package,
            repository_url: item.repository_url.clone(),
            repo_key: key,
            repo_candidate_count,
            repo_rank: 1,
            pi_relevant: relevance >= 18,
            pi_relevance_score: relevance,
            impact_score: 0.0,
            raw_impact_score: round3(raw_impact),
            debiased_impact_score: 0.0,
            github_stars,
            github_forks,
            github_watchers,
            npm_downloads_monthly,
            updated_at,
            days_since_update,
            notes: item.notes.clone(),
            rank: 0,
        },
        // The final impact starts out equal to the raw impact.
        score_key: ScoreKey {
            raw_impact,
            impact: raw_impact,
        },
    }
}

/// Assigns each draft its 1-based rank within its repository group.
///
/// Within a group, drafts are ordered by raw impact (highest first) and then by
/// id, so the strongest candidate of every repository gets rank 1.
fn assign_repo_rank(drafts: &mut [QueueDraft]) {
    // Group draft positions by repository key.
    let mut groups: BTreeMap<String, Vec<usize>> = BTreeMap::new();
    for (position, draft) in drafts.iter().enumerate() {
        groups
            .entry(draft.entry.repo_key.clone())
            .or_default()
            .push(position);
    }

    for mut members in groups.into_values() {
        members.sort_by(|&a, &b| {
            let (first, second) = (&drafts[a], &drafts[b]);
            second
                .score_key
                .raw_impact
                .partial_cmp(&first.score_key.raw_impact)
                .unwrap_or(Ordering::Equal)
                .then_with(|| first.entry.id.cmp(&second.entry.id))
        });
        for (offset, position) in members.into_iter().enumerate() {
            drafts[position].entry.repo_rank = offset + 1;
        }
    }
}

#[allow(clippy::cast_precision_loss)]
fn recompute_debiased_scores(drafts: &mut [QueueDraft]) {
    for draft in drafts.iter_mut() {
        let repo_count = draft.entry.repo_candidate_count as f64;
        let rank_exponent =
            i32::try_from(draft.entry.repo_rank.saturating_sub(1)).unwrap_or(i32::MAX);
        let rank_penalty = 0.85_f64.powi(rank_exponent);
        let multiplicity_penalty = 1.0 / (1.0 + repo_count.ln());
        let relevance_multiplier = if draft.entry.pi_relevant { 1.0 } else { 0.45 };

        let raw = draft.score_key.raw_impact;
        let debiased = raw * multiplicity_penalty * rank_penalty;
        let impact = debiased * relevance_multiplier + f64::from(draft.entry.pi_relevance_score);

        draft.entry.debiased_impact_score = round3(debiased);
        draft.entry.impact_score = round3(impact);
        draft.score_key.impact = impact;
    }
}

/// Returns the source kind label (`"git"`, `"npm"` or `"url"`) together with
/// the npm package name, which is only present for npm sources.
fn source_type_and_package(source: &CandidateSource) -> (&'static str, Option<String>) {
    let kind = match source {
        CandidateSource::Git { .. } => "git",
        CandidateSource::Npm { .. } => "npm",
        CandidateSource::Url { .. } => "url",
    };
    let package = match source {
        CandidateSource::Npm { package, .. } => Some(package.clone()),
        CandidateSource::Git { .. } | CandidateSource::Url { .. } => None,
    };
    (kind, package)
}

fn repo_key(item: &CandidateItem) -> String {
    if let Some(repo_url) = item.repository_url.as_deref() {
        if let Some(key) = canonical_repo_url(repo_url) {
            return key;
        }
    }

    match &item.source {
        CandidateSource::Git { repo, .. } => format!("git:{}", repo.trim().to_ascii_lowercase()),
        CandidateSource::Npm { package, .. } => {
            format!("npm:{}", package.trim().to_ascii_lowercase())
        }
        CandidateSource::Url { url } => format!("url:{}", url.trim().to_ascii_lowercase()),
    }
}

fn canonical_repo_url(repo_url: &str) -> Option<String> {
    let raw = repo_url.trim();
    if raw.is_empty() {
        return None;
    }

    if let Some(path) = raw.strip_prefix("git@github.com:") {
        let normalized = path.trim_end_matches(".git").trim_matches('/');
        return Some(format!("github.com/{}", normalized.to_ascii_lowercase()));
    }

    let no_prefix = raw.strip_prefix("git+").unwrap_or(raw);
    let url_str = if no_prefix.contains("://") {
        no_prefix.to_string()
    } else {
        format!("https://{no_prefix}")
    };
    let parsed = Url::parse(&url_str).ok()?;
    let host = parsed.host_str()?.to_ascii_lowercase();
    let segments = parsed
        .path_segments()
        .map(|it| it.filter(|seg| !seg.is_empty()).collect::<Vec<_>>())?;
    if segments.is_empty() {
        return None;
    }

    if host == "github.com" && segments.len() >= 2 {
        let owner = segments[0].to_ascii_lowercase();
        let repo = segments[1].trim_end_matches(".git").to_ascii_lowercase();
        return Some(format!("github.com/{owner}/{repo}"));
    }

    if segments.len() >= 2 {
        return Some(format!(
            "{}/{}/{}",
            host,
            segments[0].to_ascii_lowercase(),
            segments[1].trim_end_matches(".git").to_ascii_lowercase()
        ));
    }

    Some(format!("{}/{}", host, segments[0].to_ascii_lowercase()))
}

/// Returns `weight * log10(1 + value)`, so a zero metric contributes nothing.
#[allow(clippy::cast_precision_loss)]
fn weighted_log(weight: f64, value: u64) -> f64 {
    let shifted = 1.0 + value as f64;
    weight * shifted.log10()
}

/// Maps the age of the last update (in days) to bonus points.
///
/// Bands: up to 30 days -> 20, up to 90 -> 16, up to 180 -> 12, up to 365 -> 8,
/// up to 730 -> 4, anything older or unknown -> 0.
const fn recency_bonus(days_since_update: Option<i64>) -> f64 {
    match days_since_update {
        None => 0.0,
        Some(age) => {
            if age <= 30 {
                20.0
            } else if age <= 90 {
                16.0
            } else if age <= 180 {
                12.0
            } else if age <= 365 {
                8.0
            } else if age <= 730 {
                4.0
            } else {
                0.0
            }
        }
    }
}

/// Returns the bonus points for a source tier; unknown tiers get no bonus.
fn source_bonus(source_tier: &str) -> f64 {
    const TIER_POINTS: [(&str, f64); 4] = [
        ("npm-registry", 8.0),
        ("third-party-github", 6.0),
        ("community", 5.0),
        ("official-pi-mono", 1.0),
    ];

    TIER_POINTS
        .iter()
        .find(|(tier, _)| *tier == source_tier)
        .map_or(0.0, |(_, points)| *points)
}

/// Sums the bonus points for marker phrases found in the notes.
///
/// Matching is case-insensitive for ASCII. Missing notes yield zero.
fn signal_bonus(notes: Option<&str>) -> f64 {
    const MARKERS: [(&str, f64); 3] = [
        ("registration calls detected", 8.0),
        ("pi api import found", 6.0),
        ("export default present", 4.0),
    ];

    let lowered = match notes {
        Some(text) => text.to_ascii_lowercase(),
        None => return 0.0,
    };

    MARKERS
        .iter()
        .filter(|(marker, _)| lowered.contains(*marker))
        .fold(0.0, |total, (_, points)| total + points)
}

/// Heuristic score (0..=100) for how likely a candidate is a Pi extension.
///
/// The id, name, package, repository URL and notes are lower-cased and joined
/// into one text. Points are awarded for word tokens found in that text, for
/// marker phrases found in the notes, and for selected source tiers. The sum is
/// capped at 100.
///
/// `package` is the npm package name, if the candidate has one.
fn pi_relevance_score(item: &CandidateItem, package: Option<&str>) -> u32 {
    // Lower-case the notes once; a missing note behaves like an empty one.
    let notes = item.notes.as_deref().unwrap_or("").to_ascii_lowercase();

    let mut text = String::new();
    text.push_str(&item.id.to_ascii_lowercase());
    text.push(' ');
    text.push_str(&item.name.to_ascii_lowercase());
    text.push(' ');
    if let Some(pkg) = package {
        text.push_str(&pkg.to_ascii_lowercase());
        text.push(' ');
    }
    if let Some(repo) = item.repository_url.as_deref() {
        text.push_str(&repo.to_ascii_lowercase());
        text.push(' ');
    }
    text.push_str(&notes);

    let has_token = |token: &str| contains_word_token(&text, token);
    let in_notes = |marker: &str| notes.contains(marker);

    let signals: [(bool, u32); 10] = [
        // Word-token match only. A raw substring test for "pi-" adds nothing but
        // false positives such as "api-" or "openapi-": every genuine "pi-"
        // prefix (start of text, "/pi-", "-pi-", "@scope/pi-") already yields a
        // standalone "pi" token.
        (has_token("pi"), 14),
        (
            text.contains("openclaw") || text.contains("clawdbot") || has_token("clawd"),
            14,
        ),
        (has_token("extension") || has_token("extensions"), 8),
        (has_token("skill") || has_token("skills"), 6),
        (has_token("mcp"), 6),
        (has_token("agent") || has_token("agents"), 4),
        // Marker phrases recorded in the notes.
        (in_notes("pi api import found"), 30),
        (in_notes("registration calls detected"), 20),
        (in_notes("export default present"), 8),
        // Tier-based boost.
        (
            matches!(item.source_tier.as_str(), "official-pi-mono" | "community"),
            6,
        ),
    ];

    let total: u32 = signals
        .iter()
        .filter(|(hit, _)| *hit)
        .map(|(_, points)| *points)
        .sum();
    total.min(100)
}

/// Reports whether `token` appears in `text` as a whole word.
///
/// Words are the pieces left after splitting on every character that is not an
/// ASCII letter or digit. The comparison is exact and case-sensitive.
fn contains_word_token(text: &str, token: &str) -> bool {
    for word in text.split(|symbol: char| !symbol.is_ascii_alphanumeric()) {
        if word == token {
            return true;
        }
    }
    false
}

/// Returns the number of whole days from `timestamp` to `now`.
///
/// `timestamp` must be RFC 3339; otherwise `None` is returned. Timestamps that
/// lie after `now` are reported as 0 days rather than a negative number.
fn days_since(timestamp: &str, now: DateTime<Utc>) -> Option<i64> {
    let then = match DateTime::parse_from_rfc3339(timestamp) {
        Ok(parsed) => parsed.with_timezone(&Utc),
        Err(_) => return None,
    };
    let elapsed_days = now.signed_duration_since(then).num_days();
    Some(if elapsed_days < 0 { 0 } else { elapsed_days })
}

/// Rounds `value` to three decimal places.
#[allow(clippy::cast_precision_loss)]
fn round3(value: f64) -> f64 {
    let thousandths = (value * 1000.0).round();
    thousandths / 1000.0
}

fn render_markdown(document: &QueueDocument) -> String {
    let mut out = String::new();
    let _ = writeln!(out, "# Extension Onboarding Queue");
    let _ = writeln!(out);
    let _ = writeln!(out, "Generated: {}", document.generated_at);
    let _ = writeln!(
        out,
        "Source pool: `{}` ({} total)",
        document.source_pool_path, document.source_pool_total
    );
    let _ = writeln!(
        out,
        "Eligible (unvendored): **{}**",
        document.eligible_unvendored
    );
    let _ = writeln!(
        out,
        "Pi-relevant (heuristic): **{}**",
        document.eligible_pi_relevant
    );
    let _ = writeln!(
        out,
        "Schema: `{}` (repo-level de-bias enabled)",
        document.schema
    );
    let _ = writeln!(out);
    let _ = writeln!(out, "## Top 100 (Start Here)");
    let _ = writeln!(out);
    let _ = writeln!(
        out,
        "| Rank | ID | Tier | Repo n | Rel | Stars | npm/mo | Impact |"
    );
    let _ = writeln!(out, "|---:|---|---|---:|---:|---:|---:|---:|");
    for entry in &document.top_100 {
        let _ = writeln!(
            out,
            "| {} | `{}` | `{}` | {} | {} | {} | {} | {:.3} |",
            entry.rank,
            entry.id,
            entry.source_tier,
            entry.repo_candidate_count,
            entry.pi_relevance_score,
            format_u64(entry.github_stars),
            format_u64(entry.npm_downloads_monthly),
            entry.impact_score
        );
    }

    let _ = writeln!(out);
    let _ = writeln!(out, "## Next 200 (Parallel Agents)");
    let _ = writeln!(out);
    let _ = writeln!(
        out,
        "| Rank | ID | Tier | Repo n | Rel | Stars | npm/mo | Impact |"
    );
    let _ = writeln!(out, "|---:|---|---|---:|---:|---:|---:|---:|");
    for entry in document
        .all
        .iter()
        .filter(|entry| (101..=300).contains(&entry.rank))
    {
        let _ = writeln!(
            out,
            "| {} | `{}` | `{}` | {} | {} | {} | {} | {:.3} |",
            entry.rank,
            entry.id,
            entry.source_tier,
            entry.repo_candidate_count,
            entry.pi_relevance_score,
            format_u64(entry.github_stars),
            format_u64(entry.npm_downloads_monthly),
            entry.impact_score
        );
    }
    out
}

/// Formats an integer with `,` as the thousands separator (e.g. `1,234,567`).
fn format_u64(value: u64) -> String {
    let digits = value.to_string();
    let len = digits.len();
    let mut grouped = String::with_capacity(len + len / 3);
    for (pos, digit) in digits.chars().enumerate() {
        // A separator goes in front of every digit that starts a full group of
        // three, except at the very beginning of the number.
        if pos > 0 && (len - pos) % 3 == 0 {
            grouped.push(',');
        }
        grouped.push(digit);
    }
    grouped
}