Skip to main content

mcp_methods/
screen.rs

1//! Stargazer screening — bulk-fetch a repo's stargazers and their public
2//! repo portfolios over cheap REST, classify each person into an
3//! archetype, and render a compact overview the agent can drill into.
4//!
5//! Cost model (the whole point): one REST request per stargazer-page +
6//! one per stargazer for their owned-repos list. No GraphQL, no
7//! per-repo calls, no READMEs in the bulk pass. The raw payloads
8//! (~550 KB for a prolific user) are projected down to a handful of
9//! fields before anything leaves Rust; the agent-facing overview stays
10//! ~2 KB regardless of how many people starred the repo.
11//!
12//! The value-bearing logic (`project_repo`, `profile_user`,
13//! `build_overview`) is pure over already-fetched JSON, so it unit-tests
14//! without the network. `screen_repo` is the orchestrator that fetches.
15
16use serde::{Deserialize, Serialize};
17use serde_json::Value;
18
19use crate::github;
20
21// ---------------------------------------------------------------------------
22// Config
23// ---------------------------------------------------------------------------
24
/// Tuning knobs for a single stargazer screen.
///
/// Defaults are chosen for the kglite-scale case (dozens to low hundreds of
/// stargazers).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScreenConfig {
    /// Max stargazers to screen (most-recent first). `None` = all.
    pub max_stargazers: Option<usize>,
    /// Max owned repos to pull per user (sorted by most-recently pushed).
    pub max_repos_per_user: usize,
    /// Lowercase keywords for the relevance gate. Each keyword is matched as
    /// a whole word (optionally with a trailing `s`) against the repo name,
    /// description and topics, and by exact equality against the lowercased
    /// repo language.
    pub relevance_keywords: Vec<String>,
    /// Languages that define the seed project's stack (e.g. Rust + Python
    /// for a PyO3 lib). Stargazers who have at least one original repo in
    /// *each* of them are flagged as a "stack match" — an architectural
    /// signal that needs no keywords and catches relevant devs whose repo
    /// names don't contain the keywords. Compared ASCII case-insensitively
    /// against each repo's primary language.
    pub stack_languages: Vec<String>,
}
42
43impl Default for ScreenConfig {
44    fn default() -> Self {
45        Self {
46            max_stargazers: None,
47            max_repos_per_user: 100,
48            relevance_keywords: Vec::new(),
49            stack_languages: Vec::new(),
50        }
51    }
52}
53
54// ---------------------------------------------------------------------------
55// Projected types
56// ---------------------------------------------------------------------------
57
/// The cheap fields we keep from a repo object — everything else (URLs,
/// owner blob, permissions) is dropped at projection time.
///
/// Built by `project_repo`; missing or mistyped JSON fields fall back to
/// empty / zero / `false` / `None` rather than failing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepoLite {
    /// Repository name (`name`); empty when absent.
    pub name: String,
    /// Whether the repo is a fork (`fork`).
    pub fork: bool,
    /// Whether the repo is archived (`archived`).
    pub archived: bool,
    /// Primary language as reported by the API (`language`), if any.
    pub lang: Option<String>,
    /// Star count (`stargazers_count`).
    pub stars: u64,
    /// Fork count (`forks_count`).
    pub forks: u64,
    /// First ten characters of `pushed_at`, i.e. YYYY-MM-DD; empty if absent.
    pub pushed: String,
    /// First ten characters of `created_at`, i.e. YYYY-MM-DD; empty if absent.
    pub created: String,
    /// Repository topics; non-string entries are skipped.
    pub topics: Vec<String>,
    /// Trimmed description; `None` when missing or blank.
    pub desc: Option<String>,
    /// Repo size in KB — a cheap proxy for code volume / effort.
    #[serde(default)]
    pub size: u64,
}
76
/// Normalized 0–1 metric vector for a person, percentile-ranked within the
/// screened set. The basis axes for ranking and fan-out gating: they
/// disagree with each other (effort ≠ popularity ≠ recency ≠ relatedness),
/// which is what makes each a real axis rather than a derived view.
///
/// `Default` yields all zeros; `normalize_scores` overwrites every field.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct Scores {
    /// Percentile of the best repo's keyword relevance score (`hit_score`).
    pub relatedness: f64,
    /// Percentile of `ln(total_stars + followers + 1)`.
    pub popularity: f64,
    /// Percentile of `ln(total size of original repos + 1) + original_count`.
    pub effort: f64,
    /// Percentile of the `last_active` date.
    pub recency: f64,
}
88
/// One classified stargazer.
///
/// `profile_user` fills the classification fields from the owned-repo list;
/// the shortlist enrichments start out empty and `scores` starts at zero.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct UserProfile {
    /// The stargazer's login.
    pub login: String,
    /// Cohort assigned by `profile_user`.
    pub archetype: Archetype,
    /// Number of projected repos examined (forks included).
    pub repos_seen: usize,
    /// Hit the per-user page cap (has more repos).
    pub capped: bool,
    /// Number of non-fork repos.
    pub original_count: usize,
    /// Number of forks.
    pub fork_count: usize,
    /// Highest star count among the non-fork repos (0 if none).
    pub max_stars: u64,
    /// Sum of stars across the non-fork repos.
    pub total_stars: u64,
    /// Up to three most frequent primary languages among the non-fork
    /// repos, most frequent first (ties broken alphabetically).
    pub top_langs: Vec<String>,
    /// Latest `pushed` date (YYYY-MM-DD) across all repos, forks included;
    /// empty when there are no repos.
    pub last_active: String,
    /// The most-starred non-fork repo, kept only if it has ≥ 10 stars.
    pub flagship: Option<RepoLite>,
    /// Repos (non-fork) that matched the relevance keywords, best-first:
    /// most distinct keywords, then highest weighted score, then most stars.
    pub relevant: Vec<RepoLite>,
    /// True when the best relevant repo matched at least two distinct
    /// keywords. Single-keyword matches are treated as weak.
    pub strong_hit: bool,
    /// Relevance score of the best matching repo (match density).
    pub hit_score: u32,
    /// Keywords that matched the best repo (for the legibility annotation).
    pub hit_terms: Vec<String>,
    /// Uses every language in the seed project's stack (cheap PyO3-style
    /// architectural signal, independent of keywords).
    pub stack_match: bool,
    /// Per-stack-language count of original repos, in `stack_languages`
    /// order. The depth of stack commitment — a serial Rust+Python builder
    /// (the kglite/maturin pattern) ranks above an incidental one. This is
    /// the signal that catches keyword-invisible architectural peers.
    #[serde(default)]
    pub stack_lang_counts: Vec<(String, usize)>,
    // ── Shortlist enrichments (filled only for leads + stack candidates) ──
    /// Follower count (reach), if enriched. [item 6]
    #[serde(default)]
    pub followers: Option<u64>,
    /// This stargazer's repo(s) declare the seed package as a dependency —
    /// an actual adopter, not just a watcher. [item 2]
    #[serde(default)]
    pub adopter: bool,
    /// Evidence line for the adoption flag (the manifest match).
    #[serde(default)]
    pub adoption_evidence: Option<String>,
    /// Count of original repos that combine *all* stack languages in one
    /// repo — the true PyO3/maturin co-location signal. [item 4]
    #[serde(default)]
    pub colocated_repos: Option<usize>,
    /// Repos this person contributes to but does not own (from public
    /// events) — surfaces relevance the owned-repo list can't see. [item 7]
    #[serde(default)]
    pub contributes_to: Vec<String>,
    /// Normalized metric vector (filled by `normalize_scores` after the
    /// whole set is classified + enriched).
    #[serde(default)]
    pub scores: Scores,
    /// All projected repos, kept for drill-down.
    pub repos: Vec<RepoLite>,
}
147
// Cohort a stargazer is sorted into by `profile_user`. The checks run in
// this order and the first match wins: no originals → Consumer, then
// Established, SingleProject, Dormant, Prolific, fork-heavy Consumer, and
// finally Casual.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Archetype {
    /// High-traction author: a very popular repo (≥ 500 stars) or at least
    /// two notable ones (≥ 50 stars each).
    Established,
    /// One repo stands far above the rest: ≥ 25 stars and at least three
    /// times the runner-up.
    SingleProject,
    /// Many original repos (≥ 6), low traction — a builder/tinkerer.
    Prolific,
    /// A few modest original repos.
    Casual,
    /// Mostly forks / nothing original — a consumer. Either no original
    /// repos at all, or forks outnumber originals and no original has
    /// 5 or more stars.
    Consumer,
    /// Has original work but no public push since before 2024-01-01.
    Dormant,
}
163
164impl Archetype {
165    pub fn label(self) -> &'static str {
166        match self {
167            Archetype::Established => "Established authors",
168            Archetype::SingleProject => "Single-project devs",
169            Archetype::Prolific => "Prolific builders",
170            Archetype::Casual => "Casual devs",
171            Archetype::Consumer => "Consumers / lurkers",
172            Archetype::Dormant => "Dormant",
173        }
174    }
175    /// Display ordering for the overview (most interesting first).
176    fn rank(self) -> u8 {
177        match self {
178            Archetype::Established => 0,
179            Archetype::SingleProject => 1,
180            Archetype::Prolific => 2,
181            Archetype::Casual => 3,
182            Archetype::Dormant => 4,
183            Archetype::Consumer => 5,
184        }
185    }
186}
187
188// ---------------------------------------------------------------------------
189// Projection
190// ---------------------------------------------------------------------------
191
192fn date10(v: &Value, key: &str) -> String {
193    v.get(key)
194        .and_then(Value::as_str)
195        .map(|s| s.chars().take(10).collect())
196        .unwrap_or_default()
197}
198
199/// Project a raw GitHub repo object down to the cheap fields.
200pub fn project_repo(raw: &Value) -> RepoLite {
201    let topics = raw
202        .get("topics")
203        .and_then(Value::as_array)
204        .map(|a| {
205            a.iter()
206                .filter_map(|t| t.as_str().map(String::from))
207                .collect()
208        })
209        .unwrap_or_default();
210    RepoLite {
211        name: raw
212            .get("name")
213            .and_then(Value::as_str)
214            .unwrap_or("")
215            .to_string(),
216        fork: raw.get("fork").and_then(Value::as_bool).unwrap_or(false),
217        archived: raw
218            .get("archived")
219            .and_then(Value::as_bool)
220            .unwrap_or(false),
221        lang: raw
222            .get("language")
223            .and_then(Value::as_str)
224            .map(String::from),
225        stars: raw
226            .get("stargazers_count")
227            .and_then(Value::as_u64)
228            .unwrap_or(0),
229        forks: raw.get("forks_count").and_then(Value::as_u64).unwrap_or(0),
230        pushed: date10(raw, "pushed_at"),
231        created: date10(raw, "created_at"),
232        topics,
233        desc: raw
234            .get("description")
235            .and_then(Value::as_str)
236            .map(|s| s.trim().to_string())
237            .filter(|s| !s.is_empty()),
238        size: raw.get("size").and_then(Value::as_u64).unwrap_or(0),
239    }
240}
241
242// ---------------------------------------------------------------------------
243// Classification
244// ---------------------------------------------------------------------------
245
/// Lowercase `s` and collect its maximal runs of alphanumeric characters
/// as a set of word tokens. Every other character acts as a separator.
fn tokenize(s: &str) -> std::collections::HashSet<String> {
    let mut words = std::collections::HashSet::new();
    let mut current = String::new();
    for ch in s.to_lowercase().chars() {
        if ch.is_alphanumeric() {
            current.push(ch);
        } else if !current.is_empty() {
            // Separator reached: the pending run is a complete token.
            words.insert(std::mem::take(&mut current));
        }
    }
    if !current.is_empty() {
        words.insert(current);
    }
    words
}
254
/// Whole-word (with naive plural stemming) match: `graph` matches `graph`,
/// `graphs`, and the `graph` token inside `knowledge-graph` — but NOT
/// `graphql` (no boundary) and NOT `storage` for `rag`. Kills the
/// substring false positives the first tester flagged.
///
/// A keyword matches when the set contains it verbatim or with a single
/// trailing `s`. Both are exact-token questions, so they are answered with
/// two hash lookups instead of scanning every token.
fn token_set_has(tokens: &std::collections::HashSet<String>, kw: &str) -> bool {
    tokens.contains(kw) || tokens.contains(format!("{kw}s").as_str())
}
264
265/// Weighted relevance of a repo against the keyword list, plus the distinct
266/// keywords that matched (for the legibility annotation). Name + topic are
267/// the strongest "about-ness" signal; description medium; a bare language
268/// match is weakest (matching "rust"-the-language alone would flag every
269/// Rust repo, so it scores 1 and the overview filters those out unless the
270/// repo earns traction).
271fn repo_relevance(r: &RepoLite, kws: &[String]) -> (u32, Vec<String>) {
272    if kws.is_empty() {
273        return (0, Vec::new());
274    }
275    let name = tokenize(&r.name);
276    let desc = tokenize(r.desc.as_deref().unwrap_or(""));
277    let topics: std::collections::HashSet<String> =
278        r.topics.iter().flat_map(|t| tokenize(t)).collect();
279    let lang = r.lang.as_deref().unwrap_or("").to_lowercase();
280
281    let mut score = 0u32;
282    let mut matched = Vec::new();
283    for k in kws {
284        let mut hit = false;
285        if token_set_has(&name, k) {
286            score += 3;
287            hit = true;
288        }
289        if token_set_has(&topics, k) {
290            score += 3;
291            hit = true;
292        }
293        if token_set_has(&desc, k) {
294            score += 2;
295            hit = true;
296        }
297        if &lang == k {
298            score += 1;
299            hit = true;
300        }
301        if hit {
302            matched.push(k.clone());
303        }
304    }
305    (score, matched)
306}
307
308/// Classify a user from their (already projected) owned repos.
309pub fn profile_user(
310    login: &str,
311    repos: Vec<RepoLite>,
312    capped: bool,
313    cfg: &ScreenConfig,
314) -> UserProfile {
315    let originals: Vec<&RepoLite> = repos.iter().filter(|r| !r.fork).collect();
316    let original_count = originals.len();
317    let fork_count = repos.len() - original_count;
318
319    let max_stars = originals.iter().map(|r| r.stars).max().unwrap_or(0);
320    let total_stars: u64 = originals.iter().map(|r| r.stars).sum();
321    let last_active = repos
322        .iter()
323        .map(|r| r.pushed.clone())
324        .max()
325        .unwrap_or_default();
326
327    // Top languages over original repos.
328    let mut lang_counts: std::collections::HashMap<String, usize> =
329        std::collections::HashMap::new();
330    for r in &originals {
331        if let Some(l) = &r.lang {
332            *lang_counts.entry(l.clone()).or_default() += 1;
333        }
334    }
335    let mut langs: Vec<(String, usize)> = lang_counts.into_iter().collect();
336    langs.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
337    let top_langs: Vec<String> = langs.into_iter().take(3).map(|(l, _)| l).collect();
338
339    // Flagship = the most-starred original, if it stands out.
340    let mut by_stars: Vec<&RepoLite> = originals.clone();
341    by_stars.sort_by_key(|r| std::cmp::Reverse(r.stars));
342    let second = by_stars.get(1).map(|r| r.stars).unwrap_or(0);
343    let flagship = by_stars.first().copied().filter(|r| r.stars >= 10).cloned();
344    let flagship_dominant = flagship
345        .as_ref()
346        .map(|f| f.stars >= 25 && f.stars >= second.saturating_mul(3).max(25))
347        .unwrap_or(false);
348
349    let notable = originals.iter().filter(|r| r.stars >= 50).count();
350    let established = max_stars >= 500 || notable >= 2;
351
352    // Recency: dormant if the newest push is old. Compare lexically on
353    // YYYY-MM-DD against a rough "2 years before the freshest seen" is
354    // overkill here; use a fixed cutoff the caller can revisit.
355    let dormant = !last_active.is_empty() && last_active.as_str() < "2024-01-01";
356
357    let archetype = if original_count == 0 {
358        Archetype::Consumer
359    } else if established {
360        Archetype::Established
361    } else if flagship_dominant {
362        Archetype::SingleProject
363    } else if dormant {
364        Archetype::Dormant
365    } else if original_count >= 6 {
366        Archetype::Prolific
367    } else if fork_count > original_count && max_stars < 5 {
368        Archetype::Consumer
369    } else {
370        Archetype::Casual
371    };
372
373    // Relevant repos, scored and sorted best-first (highest score, then
374    // most stars). `relevant[0]` is the repo to show as the dev's hit.
375    let mut scored: Vec<(u32, Vec<String>, &&RepoLite)> = originals
376        .iter()
377        .map(|r| {
378            let (s, terms) = repo_relevance(r, &cfg.relevance_keywords);
379            (s, terms, r)
380        })
381        .filter(|(s, _, _)| *s > 0)
382        .collect();
383    // Rank by *breadth* first — how many distinct keywords the repo hits —
384    // then by weighted score, then traction. Breadth is the quality signal:
385    // a repo matching graph+knowledge+database is genuinely on-domain; one
386    // matching only "database" is almost always a coincidence.
387    scored.sort_by(|a, b| {
388        b.1.len()
389            .cmp(&a.1.len())
390            .then(b.0.cmp(&a.0))
391            .then(b.2.stars.cmp(&a.2.stars))
392    });
393    let relevant: Vec<RepoLite> = scored.iter().map(|(_, _, r)| (**r).clone()).collect();
394    let best_relevant_score = scored.first().map(|(s, _, _)| *s).unwrap_or(0);
395    let hit_terms: Vec<String> = scored
396        .first()
397        .map(|(_, t, _)| t.clone())
398        .unwrap_or_default();
399
400    // Stack match: count original repos in each of the seed project's
401    // languages. A dev is a stack match if they have ≥1 repo in *every*
402    // stack language; the counts give the depth to rank by.
403    let stack_lang_counts: Vec<(String, usize)> = cfg
404        .stack_languages
405        .iter()
406        .map(|sl| {
407            let n = originals
408                .iter()
409                .filter(|r| {
410                    r.lang
411                        .as_deref()
412                        .map(|l| l.eq_ignore_ascii_case(sl))
413                        .unwrap_or(false)
414                })
415                .count();
416            (sl.clone(), n)
417        })
418        .collect();
419    let stack_match =
420        !stack_lang_counts.is_empty() && stack_lang_counts.iter().all(|(_, n)| *n >= 1);
421
422    UserProfile {
423        login: login.to_string(),
424        archetype,
425        repos_seen: repos.len(),
426        capped,
427        original_count,
428        fork_count,
429        max_stars,
430        total_stars,
431        top_langs,
432        last_active,
433        flagship,
434        // A lead must hit ≥2 distinct keywords. Single-keyword matches
435        // ("database" on a file-manager plugin) are demoted to a weak-match
436        // footnote — they were the dominant noise the testers flagged.
437        strong_hit: hit_terms.len() >= 2,
438        hit_score: best_relevant_score,
439        hit_terms,
440        stack_match,
441        stack_lang_counts,
442        followers: None,
443        adopter: false,
444        adoption_evidence: None,
445        colocated_repos: None,
446        contributes_to: Vec::new(),
447        scores: Scores::default(),
448        relevant,
449        repos,
450    }
451}
452
/// Turn a "YYYY-MM-DD" string into a sortable number (higher = more
/// recent) using `year * 372 + month * 31 + day`. Any component that is
/// missing or does not parse counts as 0, so an empty string gives 0.
fn date_ordinal(d: &str) -> f64 {
    let mut fields = d.split('-').map(|part| part.parse::<f64>().ok());
    let mut next_field = || fields.next().flatten().unwrap_or(0.0);
    let year = next_field();
    let month = next_field();
    let day = next_field();
    year * 372.0 + month * 31.0 + day
}
462
/// Map each value to its percentile rank in [0, 1]; tied values share the
/// average of their rank positions. This is the normalization that lets
/// power-law axes (stars, size) be blended and compared with each other.
///
/// Inputs of length 0 or 1 have nothing to rank against and return all 1.0.
fn percentile_ranks(values: &[f64]) -> Vec<f64> {
    let n = values.len();
    if n <= 1 {
        return vec![1.0; n];
    }

    // Indices of `values` in ascending order.
    let mut order: Vec<usize> = (0..n).collect();
    order.sort_by(|&a, &b| values[a].total_cmp(&values[b]));

    let denom = (n - 1) as f64;
    let mut ranks = vec![0.0; n];
    let mut start = 0;
    while start < n {
        let head = values[order[start]];
        // Extend the group over the following entries equal to its first
        // value; the first entry always belongs to its own group.
        let ties = order[start + 1..]
            .iter()
            .take_while(|&&ix| values[ix] == head)
            .count();
        let last = start + ties;
        let pct = (start + last) as f64 / 2.0 / denom;
        for &ix in &order[start..=last] {
            ranks[ix] = pct;
        }
        start = last + 1;
    }
    ranks
}
490
491/// Compute the normalized score vector for every profile from cheap raw
492/// signals: relatedness (keyword score), popularity (stars+followers),
493/// effort (repo size + breadth), recency (latest push). Percentile-ranked
494/// across the set so axes are comparable. Call after enrichment.
495pub fn normalize_scores(profiles: &mut [UserProfile]) {
496    let n = profiles.len();
497    if n == 0 {
498        return;
499    }
500    let rel: Vec<f64> = profiles.iter().map(|p| p.hit_score as f64).collect();
501    let pop: Vec<f64> = profiles
502        .iter()
503        .map(|p| ((p.total_stars + p.followers.unwrap_or(0)) as f64 + 1.0).ln())
504        .collect();
505    let eff: Vec<f64> = profiles
506        .iter()
507        .map(|p| {
508            let size: u64 = p.repos.iter().filter(|r| !r.fork).map(|r| r.size).sum();
509            (size as f64 + 1.0).ln() + p.original_count as f64
510        })
511        .collect();
512    let rec: Vec<f64> = profiles
513        .iter()
514        .map(|p| date_ordinal(&p.last_active))
515        .collect();
516
517    let (rel, pop, eff, rec) = (
518        percentile_ranks(&rel),
519        percentile_ranks(&pop),
520        percentile_ranks(&eff),
521        percentile_ranks(&rec),
522    );
523    for (i, p) in profiles.iter_mut().enumerate() {
524        p.scores = Scores {
525            relatedness: rel[i],
526            popularity: pop[i],
527            effort: eff[i],
528            recency: rec[i],
529        };
530    }
531}
532
533// ---------------------------------------------------------------------------
534// Rendering
535// ---------------------------------------------------------------------------
536
/// Clip `s` to at most `n` characters (not bytes). A string that already
/// fits is returned unchanged; a clipped one keeps its first `n - 1`
/// characters and ends in `…`. With `n == 0` the result is empty, since
/// even the ellipsis would exceed the requested width.
fn trunc(s: &str, n: usize) -> String {
    // `nth(n)` looks for an (n+1)-th character and stops there, so long
    // inputs are not walked to the end just to be counted.
    if s.chars().nth(n).is_none() {
        return s.to_string();
    }
    if n == 0 {
        return String::new();
    }
    let mut out: String = s.chars().take(n - 1).collect();
    out.push('…');
    out
}
546
547fn repo_line(r: &RepoLite) -> String {
548    let lang = r.lang.as_deref().unwrap_or("—");
549    let desc = r.desc.as_deref().map(|d| trunc(d, 70)).unwrap_or_default();
550    let topics = if r.topics.is_empty() {
551        String::new()
552    } else {
553        format!(
554            " [{}]",
555            r.topics
556                .iter()
557                .take(4)
558                .cloned()
559                .collect::<Vec<_>>()
560                .join(",")
561        )
562    };
563    format!("{} {}★ ({}) \"{}\"{}", r.name, r.stars, lang, desc, topics)
564}
565
566/// One-line summary of a user for the overview.
567fn user_overview_line(p: &UserProfile) -> String {
568    match p.archetype {
569        Archetype::Established | Archetype::SingleProject => {
570            let f = p
571                .flagship
572                .as_ref()
573                .map(repo_line)
574                .unwrap_or_else(|| "—".into());
575            format!("{} — {}", p.login, f)
576        }
577        _ => {
578            let langs = if p.top_langs.is_empty() {
579                "—".into()
580            } else {
581                p.top_langs.join("/")
582            };
583            let more = if p.capped { "+" } else { "" };
584            format!(
585                "{} — {}{} repos · {} · active {} · {}★ max",
586                p.login, p.original_count, more, langs, p.last_active, p.max_stars
587            )
588        }
589    }
590}
591
/// Disclosure scale — borrowed from kglite's `GraphScale`. The level of
/// detail follows the stargazer count: a small set shows every notable
/// person inline, a huge set collapses to statistics plus drill-down.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Scale {
    /// ≤40 — full inline detail.
    Small,
    /// ≤150 — established + single full, prolific sampled.
    Medium,
    /// ≤1000 — standouts only, the rest as counts.
    Large,
    /// >1000 — statistics, drill required.
    Extreme,
}

/// Pick the disclosure scale for `n` stargazers.
fn scale_of(n: usize) -> Scale {
    if n <= 40 {
        Scale::Small
    } else if n <= 150 {
        Scale::Medium
    } else if n <= 1000 {
        Scale::Large
    } else {
        Scale::Extreme
    }
}
611
612/// Total original repos a dev has in the seed project's stack languages —
613/// the depth of stack commitment used to rank stack matches.
614fn stack_depth(p: &UserProfile) -> usize {
615    p.stack_lang_counts.iter().map(|(_, n)| n).sum()
616}
617
618/// " · 142 followers" reach suffix [item 6], empty when not enriched.
619fn reach_tag(p: &UserProfile) -> String {
620    match p.followers {
621        Some(f) => format!(" · {f} followers"),
622        None => String::new(),
623    }
624}
625
626/// " · contributes: a/b, c/d" suffix [item 7], empty when none.
627fn contrib_tag(p: &UserProfile) -> String {
628    if p.contributes_to.is_empty() {
629        String::new()
630    } else {
631        format!(
632            " · contributes: {}",
633            p.contributes_to
634                .iter()
635                .take(3)
636                .cloned()
637                .collect::<Vec<_>>()
638                .join(", ")
639        )
640    }
641}
642
643/// Lowercase single-word cohort handle used in `cohort:<x>` drills.
644fn cohort_key(a: Archetype) -> &'static str {
645    match a {
646        Archetype::Established => "established",
647        Archetype::SingleProject => "single",
648        Archetype::Prolific => "prolific",
649        Archetype::Casual => "casual",
650        Archetype::Dormant => "dormant",
651        Archetype::Consumer => "consumers",
652    }
653}
654
655// ---------------------------------------------------------------------------
656// Selection: filter → rank → take (and the fan-out gate)
657// ---------------------------------------------------------------------------
658
/// Conjunctive (AND) filters on the metric axes. All set predicates must
/// pass. Absolute thresholds where the agent has intuition (keywords,
/// stars, dates) + percentile thresholds on the normalized axes.
///
/// `None` / `false` means the corresponding gate is off, so the default
/// value lets every profile through.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Filters {
    /// Relatedness gate: minimum number of distinct keywords matched by the
    /// best repo (`hit_terms.len()`).
    pub min_keywords: Option<usize>,
    /// Popularity gate: minimum stars on the best original repo (`max_stars`).
    pub min_stars: Option<u64>,
    /// Recency gate (YYYY-MM-DD): `last_active` must not be earlier,
    /// compared as text.
    pub active_since: Option<String>,
    /// Decisive: keep only actual users (`adopter`).
    pub adopters_only: bool,
    /// Keep only architectural peers (`stack_match`).
    pub stack_only: bool,
    /// Percentile gate, 0–1, on `scores.relatedness`.
    pub min_relatedness_pct: Option<f64>,
    /// Percentile gate, 0–1, on `scores.effort`.
    pub min_effort_pct: Option<f64>,
}
672
// The normalized axis a selection is ordered by; each variant reads the
// field of the same name in `Scores`.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub enum RankBy {
    /// Order by `scores.relatedness`.
    Relatedness,
    /// Order by `scores.popularity`.
    Popularity,
    /// Order by `scores.effort`.
    Effort,
    /// Order by `scores.recency`.
    Recency,
}
680
681impl RankBy {
682    fn value(self, p: &UserProfile) -> f64 {
683        match self {
684            RankBy::Relatedness => p.scores.relatedness,
685            RankBy::Popularity => p.scores.popularity,
686            RankBy::Effort => p.scores.effort,
687            RankBy::Recency => p.scores.recency,
688        }
689    }
690    pub fn label(self) -> &'static str {
691        match self {
692            RankBy::Relatedness => "relatedness",
693            RankBy::Popularity => "popularity",
694            RankBy::Effort => "effort",
695            RankBy::Recency => "recency",
696        }
697    }
698    pub fn parse(s: &str) -> Option<RankBy> {
699        Some(match s.to_lowercase().as_str() {
700            "relatedness" | "related" | "relevance" => RankBy::Relatedness,
701            "popularity" | "reach" | "popular" => RankBy::Popularity,
702            "effort" | "substance" => RankBy::Effort,
703            "recency" | "active" | "recent" => RankBy::Recency,
704            _ => return None,
705        })
706    }
707}
708
/// A complete selection: the filter predicate set, the ranking axis, and a
/// human label. Drives both a focused overview view and the fan-out gate.
#[derive(Debug, Clone)]
pub struct Selection {
    /// Predicates a profile must satisfy to be considered.
    pub filters: Filters,
    /// Axis the survivors are sorted by, highest first.
    pub rank: RankBy,
    /// Heading shown above the rendered selection.
    pub label: String,
    /// Maximum number of profiles to keep; `select` treats 0 as 1.
    pub take: usize,
}
718
719fn passes(p: &UserProfile, f: &Filters) -> bool {
720    if let Some(k) = f.min_keywords {
721        if p.hit_terms.len() < k {
722            return false;
723        }
724    }
725    if let Some(s) = f.min_stars {
726        if p.max_stars < s {
727            return false;
728        }
729    }
730    if let Some(d) = &f.active_since {
731        if p.last_active.as_str() < d.as_str() {
732            return false;
733        }
734    }
735    if f.adopters_only && !p.adopter {
736        return false;
737    }
738    if f.stack_only && !p.stack_match {
739        return false;
740    }
741    if let Some(pct) = f.min_relatedness_pct {
742        if p.scores.relatedness < pct {
743            return false;
744        }
745    }
746    if let Some(pct) = f.min_effort_pct {
747        if p.scores.effort < pct {
748            return false;
749        }
750    }
751    true
752}
753
754/// Filter → rank → take. The selection function, reused for the overview
755/// view and the fan-out gate (expand only these top-K).
756pub fn select<'a>(profiles: &'a [UserProfile], sel: &Selection) -> Vec<&'a UserProfile> {
757    let mut v: Vec<&UserProfile> = profiles
758        .iter()
759        .filter(|p| passes(p, &sel.filters))
760        .collect();
761    v.sort_by(|a, b| sel.rank.value(b).total_cmp(&sel.rank.value(a)));
762    v.truncate(sel.take.max(1));
763    v
764}
765
766/// Named preset = a bundled (filters, rank) for a common goal. Keeps the
767/// pipeline usable cold; raw filters are the power layer.
768pub fn preset(name: &str, take: usize) -> Option<Selection> {
769    let recent = "2025-01-01".to_string();
770    Some(match name.to_lowercase().as_str() {
771        // relevant + active, ranked by reach — who to email
772        "outreach" => Selection {
773            filters: Filters {
774                min_keywords: Some(2),
775                active_since: Some(recent),
776                ..Default::default()
777            },
778            rank: RankBy::Popularity,
779            label: "OUTREACH (relevant + active, by reach)".into(),
780            take,
781        },
782        // architectural peers, ranked by substance
783        "peers" => Selection {
784            filters: Filters {
785                stack_only: true,
786                active_since: Some(recent),
787                ..Default::default()
788            },
789            rank: RankBy::Effort,
790            label: "PEERS (build in your stack, by effort)".into(),
791            take,
792        },
793        // biggest audience, any domain — coding legends
794        "legends" => Selection {
795            filters: Filters::default(),
796            rank: RankBy::Popularity,
797            label: "LEGENDS (biggest reach, any domain)".into(),
798            take,
799        },
800        // anything on-domain, by popularity — competitive intel
801        "intel" => Selection {
802            filters: Filters {
803                min_keywords: Some(1),
804                ..Default::default()
805            },
806            rank: RankBy::Popularity,
807            label: "INTEL (on-domain, by popularity)".into(),
808            take,
809        },
810        // people who already depend on you
811        "adopters" => Selection {
812            filters: Filters {
813                adopters_only: true,
814                ..Default::default()
815            },
816            rank: RankBy::Popularity,
817            label: "ADOPTERS (actual users)".into(),
818            take,
819        },
820        _ => return None,
821    })
822}
823
824/// One selection-view line: the score vector inline (legible ranking) +
825/// flags + the representative repo.
826fn selection_line(p: &UserProfile) -> String {
827    let s = &p.scores;
828    let mut flags = String::new();
829    if p.adopter {
830        flags.push_str(" ✅adopter");
831    }
832    if is_legend(p) {
833        flags.push_str(" 🏆legend");
834    }
835    if p.stack_match {
836        flags.push_str(" ⚙stack");
837    }
838    let best = p
839        .relevant
840        .first()
841        .or(p.flagship.as_ref())
842        .or_else(|| p.repos.iter().filter(|r| !r.fork).max_by_key(|r| r.stars));
843    let bestline = best
844        .map(|r| {
845            format!(
846                "{} {}★ \"{}\"",
847                r.name,
848                r.stars,
849                r.desc.as_deref().map(|d| trunc(d, 50)).unwrap_or_default()
850            )
851        })
852        .unwrap_or_default();
853    format!(
854        "{} [rel {:.0} pop {:.0} eff {:.0} rec {:.0}]{}{} · {}",
855        p.login,
856        s.relatedness * 100.0,
857        s.popularity * 100.0,
858        s.effort * 100.0,
859        s.recency * 100.0,
860        reach_tag(p),
861        flags,
862        bestline
863    )
864}
865
866/// Render a focused selection view (filter → rank → take), with explicit
867/// empty-result guidance rather than a silent empty section.
868pub fn render_selection(profiles: &[UserProfile], sel: &Selection) -> String {
869    let chosen = select(profiles, sel);
870    if chosen.is_empty() {
871        let matched = profiles.iter().filter(|p| passes(p, &sel.filters)).count();
872        return format!(
873            "\n▶ {} — 0 of {} people passed the filter. Loosen it (lower min_keywords, earlier active_since, drop adopters_only/stack_only).\n",
874            sel.label, profiles.len()
875        ) + &format!("  (matched filter: {matched})\n");
876    }
877    let mut out = format!(
878        "\n▶ {} (top {}) — scores are 0–100 percentile ranks *within this set*, so they're relative, not comparable across calls:\n",
879        sel.label, sel.take
880    );
881    for p in &chosen {
882        out.push_str(&format!("  • {}\n", selection_line(p)));
883    }
884    out
885}
886
887/// Build the compact overview — the agent-facing digest. Scale-adaptive:
888/// inventory first (the cohort counts), then detail whose depth shrinks as
889/// the stargazer count grows.
890pub fn build_overview(
891    repo: &str,
892    profiles: &[UserProfile],
893    meta: &ScreenMeta,
894    cfg: &ScreenConfig,
895    selection: Option<&Selection>,
896) -> String {
897    let total = profiles.len();
898    let scale = scale_of(total);
899    let mut out = String::new();
900    let scale_note = match scale {
901        Scale::Small => "full detail",
902        Scale::Medium => "standouts inline, rest sampled",
903        Scale::Large => "standouts only, rest by count",
904        Scale::Extreme => "statistics — drill cohorts",
905    };
906    let noun = if meta.noun.is_empty() {
907        "people"
908    } else {
909        meta.noun.as_str()
910    };
911    let scope = if meta.partial && meta.total_stargazers > 0 {
912        format!("{total} of {} {noun}", meta.total_stargazers)
913    } else {
914        format!("{total} {noun}")
915    };
916    out.push_str(&format!(
917        "{repo} — {scope} · {} REST requests · 0 READMEs  [scale: {scale:?} → {scale_note}]\n",
918        meta.requests
919    ));
920    out.push_str(
921        "(Leads are description+language signals — descriptions often overstate; drill to confirm.)\n",
922    );
923    // [item 1] Report auto-derived config so the agent can refine it.
924    if let Some(kw) = &meta.derived_keywords {
925        out.push_str(&format!("(auto-keywords from repo: {})\n", kw.join(",")));
926    }
927    if let Some(st) = &meta.derived_stack {
928        out.push_str(&format!("(auto-stack from repo: {})\n", st.join("+")));
929    }
930    if let Some(reason) = &meta.partial_reason {
931        out.push_str(&format!("(partial: {reason})\n"));
932    }
933
934    // [item 2] Adopters first — they already depend on you. The single
935    // highest-value signal: not "might be interested", but "is a user".
936    let adopters: Vec<&UserProfile> = profiles.iter().filter(|p| p.adopter).collect();
937    if !adopters.is_empty() {
938        out.push_str(&format!(
939            "\n✅ ADOPTERS — already depend on you ({}):\n",
940            adopters.len()
941        ));
942        for p in &adopters {
943            out.push_str(&format!(
944                "  • {}{} — {}\n",
945                p.login,
946                reach_tag(p),
947                p.adoption_evidence
948                    .as_deref()
949                    .unwrap_or("(dependency declared)")
950            ));
951        }
952    } else if meta.noun != "users" {
953        // Only meaningful for a repo seed (there's a package to depend on).
954        out.push_str(
955            "\n✅ ADOPTERS: none found — no screened stargazer's repo declares the package as a dependency.\n",
956        );
957    }
958
959    // Group by archetype, sorted by traction within each.
960    let mut buckets: std::collections::BTreeMap<u8, (Archetype, Vec<&UserProfile>)> =
961        std::collections::BTreeMap::new();
962    for p in profiles {
963        buckets
964            .entry(p.archetype.rank())
965            .or_insert_with(|| (p.archetype, Vec::new()))
966            .1
967            .push(p);
968    }
969    for (_, (_, members)) in buckets.iter_mut() {
970        members.sort_by(|a, b| {
971            b.max_stars
972                .cmp(&a.max_stars)
973                .then(b.total_stars.cmp(&a.total_stars))
974        });
975    }
976
977    // Focused selection (filter → rank → take) when requested; otherwise
978    // the full multi-lens browse. Cohorts render in both modes.
979    if let Some(sel) = selection {
980        out.push_str(&render_selection(profiles, sel));
981    }
982
983    if selection.is_none() {
984        // ── Most relevant devs first (the answer to "who matters") ──
985        // The screening goal is relevance, not traction, so this leads — and
986        // it cross-cuts cohorts, surfacing on-domain repos that the star-ranked
987        // cohort lines bury.
988        let mut hits: Vec<&UserProfile> = profiles.iter().filter(|p| p.strong_hit).collect();
989        // Rank by distinct-keyword breadth (the visible `Nkw` prefix), then
990        // traction — once a repo clears the ≥2-keyword on-domain bar, stars are
991        // a credibility signal, so a 40★ builder outranks a 0★ name-match at the
992        // same breadth. Score only breaks remaining ties.
993        hits.sort_by(|a, b| {
994            b.hit_terms
995                .len()
996                .cmp(&a.hit_terms.len())
997                .then(b.relevant[0].stars.cmp(&a.relevant[0].stars))
998                .then(b.hit_score.cmp(&a.hit_score))
999        });
1000        if !cfg.relevance_keywords.is_empty() {
1001            let cap = match scale {
1002                Scale::Small | Scale::Medium => 12,
1003                Scale::Large => 8,
1004                Scale::Extreme => 5,
1005            };
1006            out.push_str(&format!(
1007            "\n★ MOST RELEVANT — on-domain repo per dev (≥2 keyword hits), ranked by `Nkw` then traction · keys=[{}] ({} devs):\n",
1008            cfg.relevance_keywords.join(","),
1009            hits.len()
1010        ));
1011            for p in hits.iter().take(cap) {
1012                out.push_str(&format!("  • {}\n", relevance_line(p)));
1013            }
1014            if hits.len() > cap {
1015                out.push_str(&format!("  …+{} more\n", hits.len() - cap));
1016            }
1017
1018            // Weak single-keyword matches: kept as a one-line footnote so they
1019            // are available without competing with the real leads.
1020            let weak: Vec<&UserProfile> = profiles
1021                .iter()
1022                .filter(|p| !p.strong_hit && !p.relevant.is_empty())
1023                .collect();
1024            if !weak.is_empty() {
1025                let listed: Vec<String> = weak
1026                    .iter()
1027                    .take(10)
1028                    .map(|p| {
1029                        format!(
1030                            "{}/{}({})",
1031                            p.login,
1032                            p.relevant[0].name,
1033                            p.hit_terms.join("/")
1034                        )
1035                    })
1036                    .collect();
1037                out.push_str(&format!(
1038                    "  ~ weak 1-keyword matches ({}): {}{}\n",
1039                    weak.len(),
1040                    listed.join(", "),
1041                    if weak.len() > 10 { ", …" } else { "" }
1042                ));
1043            }
1044        }
1045
1046        // ── Popularity / reach lens — coding legends, regardless of domain ──
1047        let mut notable: Vec<&UserProfile> = profiles
1048            .iter()
1049            .filter(|p| p.max_stars >= 50 || p.followers.map(|f| f >= 200).unwrap_or(false))
1050            .collect();
1051        notable.sort_by(|a, b| {
1052            b.followers
1053                .unwrap_or(0)
1054                .cmp(&a.followers.unwrap_or(0))
1055                .then(b.max_stars.cmp(&a.max_stars))
1056                .then(b.total_stars.cmp(&a.total_stars))
1057        });
1058        if !notable.is_empty() {
1059            out.push_str("\n🏆 NOTABLE — biggest reach/traction (popularity lens):\n");
1060            for p in notable.iter().take(6) {
1061                let legend = if is_legend(p) { " ⟵ LEGEND" } else { "" };
1062                let top = p
1063                    .repos
1064                    .iter()
1065                    .filter(|r| !r.fork)
1066                    .max_by_key(|r| r.stars)
1067                    .map(|r| format!("{} {}★", r.name, r.stars))
1068                    .unwrap_or_default();
1069                out.push_str(&format!(
1070                    "  • {}{} — {}★ total · top: {}{}\n",
1071                    p.login,
1072                    reach_tag(p),
1073                    p.total_stars,
1074                    top,
1075                    legend
1076                ));
1077            }
1078        }
1079
1080        // ── Quality lens — best-kept serious projects (maintenance + stars) ──
1081        let mut byq: Vec<&UserProfile> = profiles
1082            .iter()
1083            .filter(|p| quality_score(p) >= 3.5)
1084            .collect();
1085        byq.sort_by(|a, b| quality_score(b).total_cmp(&quality_score(a)));
1086        if !byq.is_empty() {
1087            out.push_str(
1088                "\n✦ QUALITY — best-kept projects (maintained + topic-tagged + active):\n",
1089            );
1090            for p in byq.iter().take(5) {
1091                let best = p
1092                    .repos
1093                    .iter()
1094                    .filter(|r| !r.fork)
1095                    .max_by(|a, b| repo_quality(a).total_cmp(&repo_quality(b)));
1096                if let Some(r) = best {
1097                    let topics = if r.topics.is_empty() {
1098                        String::new()
1099                    } else {
1100                        format!(
1101                            " [{}]",
1102                            r.topics
1103                                .iter()
1104                                .take(3)
1105                                .cloned()
1106                                .collect::<Vec<_>>()
1107                                .join(",")
1108                        )
1109                    };
1110                    out.push_str(&format!(
1111                        "  • {} — {} {}★ ({}) active {}{}\n",
1112                        p.login,
1113                        r.name,
1114                        r.stars,
1115                        r.lang.as_deref().unwrap_or("—"),
1116                        r.pushed,
1117                        topics
1118                    ));
1119                }
1120            }
1121        }
1122
1123        // ── Stack match: shares the seed project's languages (PyO3 signal) ──
1124        // Catches relevant devs whose repo names don't contain the keywords.
1125        if !cfg.stack_languages.is_empty() {
1126            let already: std::collections::HashSet<&str> =
1127                hits.iter().take(20).map(|p| p.login.as_str()).collect();
1128            let mut stack: Vec<&UserProfile> = profiles
1129                .iter()
1130                .filter(|p| p.stack_match && !already.contains(p.login.as_str()))
1131                .collect();
1132            // Rank by depth of stack commitment (total in-stack repos): a serial
1133            // Rust+Python builder — the kglite/maturin pattern — outranks an
1134            // incidental one. This is the signal that catches architectural
1135            // peers keywords structurally cannot (descriptions don't name the
1136            // toolchain). We show the *counts*, not a sample repo: the most-
1137            // starred in-stack repo is often off-domain, so the verifiable
1138            // signal is "builds N things in your exact stack" — then drill.
1139            stack.sort_by(|a, b| {
1140                stack_depth(b)
1141                    .cmp(&stack_depth(a))
1142                    .then(b.total_stars.cmp(&a.total_stars))
1143            });
1144            if !stack.is_empty() {
1145                out.push_str(&format!(
1146                "\n⚙ STACK MATCH — builds in your stack ({}), ranked by depth · drill to confirm ({} devs):\n",
1147                cfg.stack_languages.join("+"),
1148                stack.len()
1149            ));
1150                for p in stack.iter().take(8) {
1151                    let breakdown = p
1152                        .stack_lang_counts
1153                        .iter()
1154                        .map(|(l, n)| format!("{n} {l}"))
1155                        .collect::<Vec<_>>()
1156                        .join(" + ");
1157                    // [item 4] co-location: repos combining all stack langs (PyO3).
1158                    let coloc = match p.colocated_repos {
1159                        Some(n) if n > 0 => format!(" · {n} combine both (PyO3-style)"),
1160                        Some(_) => " · none combine both".to_string(),
1161                        None => String::new(),
1162                    };
1163                    out.push_str(&format!(
1164                        "  • {} — {} ({} in-stack of {} repos){}{}{}\n",
1165                        p.login,
1166                        breakdown,
1167                        stack_depth(p),
1168                        p.original_count,
1169                        coloc,
1170                        reach_tag(p),
1171                        contrib_tag(p),
1172                    ));
1173                }
1174                if stack.len() > 8 {
1175                    out.push_str(&format!("  …+{} more\n", stack.len() - 8));
1176                }
1177            }
1178        }
1179    } // end multi-lens browse (selection.is_none())
1180
1181    // Devs already surfaced as leads above — skip them in the cohort detail
1182    // so it shows "the rest of the audience", not a re-list of the leads.
1183    let shown: std::collections::HashSet<&str> = profiles
1184        .iter()
1185        .filter(|p| p.strong_hit || p.stack_match)
1186        .map(|p| p.login.as_str())
1187        .collect();
1188
1189    // ── Inventory: cohort counts (the audience shape) ──
1190    out.push_str("\nCOHORTS (drill 'cohort:<key>' for the full list):\n");
1191    for (arch, members) in buckets.values() {
1192        out.push_str(&format!(
1193            "  {:<20} {:>3}  {:<18} {}\n",
1194            arch.label(),
1195            members.len(),
1196            format!("cohort:{}", cohort_key(*arch)),
1197            cohort_blurb(*arch)
1198        ));
1199    }
1200
1201    // ── Per-cohort detail, depth by scale, leads excluded ──
1202    for (arch, members) in buckets.values() {
1203        let n = members.len();
1204        let show = inline_quota(*arch, scale, n);
1205        if show == 0 {
1206            continue;
1207        }
1208        // Only members not already surfaced as leads above.
1209        let rest: Vec<&&UserProfile> = members
1210            .iter()
1211            .filter(|p| !shown.contains(p.login.as_str()))
1212            .collect();
1213        let lead_count = n - rest.len();
1214        if rest.is_empty() {
1215            continue;
1216        }
1217        out.push_str(&format!("\n▍{} ({}):\n", arch.label(), n));
1218        for p in rest.iter().take(show) {
1219            out.push_str(&format!("  • {}\n", user_overview_line(p)));
1220        }
1221        let mut tail = String::new();
1222        if rest.len() > show {
1223            tail.push_str(&format!("+{} more", rest.len() - show));
1224        }
1225        if lead_count > 0 {
1226            if !tail.is_empty() {
1227                tail.push_str(", ");
1228            }
1229            tail.push_str(&format!("{} shown as leads above", lead_count));
1230        }
1231        if !tail.is_empty() {
1232            out.push_str(&format!(
1233                "  …{} — drill 'cohort:{}'\n",
1234                tail,
1235                cohort_key(*arch)
1236            ));
1237        }
1238    }
1239
1240    out.push_str(
1241        "\nDRILL: 'user:<login>' → portfolio · 'user:<login>/repo:<name>' → repo · '…/readme' → README gist · 'cohort:<name>' → full list\n",
1242    );
1243    out
1244}
1245
1246/// Relevance line: on-domain repo, its language, a description (or topics
1247/// when the description is empty — the highest-signal leads often have no
1248/// description), and the keywords that matched.
1249fn relevance_line(p: &UserProfile) -> String {
1250    let r = &p.relevant[0];
1251    let lang = r.lang.as_deref().unwrap_or("—");
1252    let about = match r.desc.as_deref() {
1253        Some(d) => format!("\"{}\"", trunc(d, 85)),
1254        None if !r.topics.is_empty() => format!(
1255            "topics:[{}]",
1256            r.topics
1257                .iter()
1258                .take(5)
1259                .cloned()
1260                .collect::<Vec<_>>()
1261                .join(",")
1262        ),
1263        None => "(no description)".into(),
1264    };
1265    format!(
1266        "{}kw  {}/{} {}★ [{}] active {} — {} · matched: {}{}{}",
1267        p.hit_terms.len(),
1268        p.login,
1269        r.name,
1270        r.stars,
1271        lang,
1272        r.pushed,
1273        about,
1274        p.hit_terms.join(","),
1275        reach_tag(p),
1276        contrib_tag(p),
1277    )
1278}
1279
1280fn cohort_blurb(a: Archetype) -> &'static str {
1281    match a {
1282        Archetype::Established => "high-traction repos",
1283        Archetype::SingleProject => "one standout repo",
1284        Archetype::Prolific => "many repos, low traction",
1285        Archetype::Casual => "a few modest repos",
1286        Archetype::Dormant => "no recent public pushes",
1287        Archetype::Consumer => "mostly forks / no original work",
1288    }
1289}
1290
1291/// How many members of a cohort to show inline, by disclosure scale.
1292fn inline_quota(arch: Archetype, scale: Scale, n: usize) -> usize {
1293    let q = match (arch, scale) {
1294        // Established + single-project: the standouts — keep inline longest.
1295        (Archetype::Established | Archetype::SingleProject, Scale::Small) => n,
1296        (Archetype::Established | Archetype::SingleProject, Scale::Medium) => 8,
1297        (Archetype::Established | Archetype::SingleProject, Scale::Large) => 5,
1298        (Archetype::Established | Archetype::SingleProject, Scale::Extreme) => 0,
1299        // Prolific: sample a few, collapse the tail.
1300        (Archetype::Prolific, Scale::Small) => 6,
1301        (Archetype::Prolific, Scale::Medium) => 4,
1302        (Archetype::Prolific, _) => 0,
1303        // Casual: only at small scale.
1304        (Archetype::Casual, Scale::Small) => 3,
1305        (Archetype::Casual, _) => 0,
1306        // Dormant + consumers: always collapsed to the inventory count.
1307        (Archetype::Dormant | Archetype::Consumer, _) => 0,
1308    };
1309    q.min(n)
1310}
1311
1312/// Resolve a `cohort:<key>` handle back to its archetype.
1313pub fn archetype_from_key(key: &str) -> Option<Archetype> {
1314    Some(match key {
1315        "established" => Archetype::Established,
1316        "single" => Archetype::SingleProject,
1317        "prolific" => Archetype::Prolific,
1318        "casual" => Archetype::Casual,
1319        "dormant" => Archetype::Dormant,
1320        "consumers" => Archetype::Consumer,
1321        _ => return None,
1322    })
1323}
1324
1325/// Members of a cohort, rendered in full (the `cohort:<name>` drill).
1326pub fn render_cohort(arch: Archetype, profiles: &[UserProfile]) -> String {
1327    let mut members: Vec<&UserProfile> = profiles.iter().filter(|p| p.archetype == arch).collect();
1328    members.sort_by(|a, b| {
1329        b.max_stars
1330            .cmp(&a.max_stars)
1331            .then(b.total_stars.cmp(&a.total_stars))
1332    });
1333    let mut out = format!(
1334        "{} ({}) — {}\n\n",
1335        arch.label(),
1336        members.len(),
1337        cohort_blurb(arch)
1338    );
1339    for p in &members {
1340        out.push_str(&format!("  • {}\n", user_overview_line(p)));
1341    }
1342    out
1343}
1344
1345/// Render a single user's full portfolio (drill-down level 1).
1346pub fn render_user(p: &UserProfile) -> String {
1347    let mut out = String::new();
1348    out.push_str(&format!(
1349        "{} — {} · {} original repos ({} forks) · {}★ total · active {}\n  languages: {}\n\n",
1350        p.login,
1351        p.archetype.label(),
1352        p.original_count,
1353        p.fork_count,
1354        p.total_stars,
1355        p.last_active,
1356        if p.top_langs.is_empty() {
1357            "—".into()
1358        } else {
1359            p.top_langs.join(", ")
1360        },
1361    ));
1362    let mut originals: Vec<&RepoLite> = p.repos.iter().filter(|r| !r.fork).collect();
1363    originals.sort_by(|a, b| b.stars.cmp(&a.stars).then(b.pushed.cmp(&a.pushed)));
1364    for r in originals.iter().take(30) {
1365        let arch = if r.archived { " [archived]" } else { "" };
1366        out.push_str(&format!(
1367            "  • {}{}  (pushed {})\n",
1368            repo_line(r),
1369            arch,
1370            r.pushed
1371        ));
1372    }
1373    if originals.len() > 30 {
1374        out.push_str(&format!(
1375            "  …+{} more original repos\n",
1376            originals.len() - 30
1377        ));
1378    }
1379    out
1380}
1381
1382/// Render a single repo profile (drill-down level 2). The cheap version —
1383/// everything here is already in the cache, no new request.
1384pub fn render_repo(login: &str, r: &RepoLite) -> String {
1385    let topics = if r.topics.is_empty() {
1386        "—".into()
1387    } else {
1388        r.topics.join(", ")
1389    };
1390    format!(
1391        "{login}/{name}\n  {stars}★ · {forks} forks · {lang} · pushed {pushed} · created {created}{arch}\n  topics: {topics}\n  {desc}\n\n  (drill '…/repo:{name}/readme' for the README headline — 1 REST request)\n",
1392        name = r.name,
1393        stars = r.stars,
1394        forks = r.forks,
1395        lang = r.lang.as_deref().unwrap_or("—"),
1396        pushed = r.pushed,
1397        created = r.created,
1398        arch = if r.archived { " · ARCHIVED" } else { "" },
1399        desc = r.desc.as_deref().unwrap_or("(no description)"),
1400    )
1401}
1402
1403/// Fetch a repo's README and compact it to a short headline gist. This is
1404/// the only drill that costs a request, and it's shortlist-only by design.
1405pub fn fetch_readme(repo: &str) -> Result<String, String> {
1406    let meta = github::gh_get(&format!("repos/{repo}/readme"))?;
1407    let url = meta
1408        .get("download_url")
1409        .and_then(Value::as_str)
1410        .ok_or("no README found")?;
1411    let body = ureq::get(url)
1412        .set("User-Agent", "mcp-methods")
1413        .call()
1414        .map_err(|e| format!("README fetch error: {e}"))?
1415        .into_string()
1416        .map_err(|e| format!("README decode error: {e}"))?;
1417    Ok(compact_readme(&body))
1418}
1419
/// Strip badges / HTML / boilerplate and keep the first lines of real prose.
///
/// A line is dropped when, after trimming, it is empty, starts with `<`
/// (HTML tags and comments), starts with `![` (image refs), or mentions
/// `shields.io` or `badge`. Kept lines are emitted untrimmed, joined by
/// newlines.
///
/// Collection stops once 25 lines have been kept or the kept lines exceed
/// 1200 bytes in total. The limit is checked after a line is added, so the
/// last kept line may push the total past 1200 bytes. A trailing
/// `… (README truncated)` marker is added only when at least one further
/// keepable line was actually left out.
fn compact_readme(md: &str) -> String {
    const MAX_BYTES: usize = 1200;
    const MAX_LINES: usize = 25;

    // Lazily yields only the lines worth keeping.
    let mut prose = md.lines().filter(|line| {
        let t = line.trim();
        let noise = t.is_empty()
            || t.starts_with('<')
            || t.starts_with("![")
            || t.contains("shields.io")
            || t.contains("badge");
        !noise
    });

    let mut kept: Vec<&str> = Vec::new();
    let mut bytes = 0usize;
    let mut budget_spent = false;
    for line in prose.by_ref() {
        kept.push(line);
        bytes += line.len();
        if bytes > MAX_BYTES || kept.len() >= MAX_LINES {
            budget_spent = true;
            break;
        }
    }
    // Only claim truncation when real content remains past the cut-off.
    if budget_spent && prose.next().is_some() {
        kept.push("… (README truncated)");
    }
    kept.join("\n")
}
1447
1448// ---------------------------------------------------------------------------
1449// Seed-config derivation [item 1] + shortlist enrichment [items 2/4/6/7]
1450// ---------------------------------------------------------------------------
1451
/// The subject of a screen: either a repository, whose stargazers are
/// screened, or an explicit list of user logins screened directly. Only
/// the people-set differs; projection, classification, enrichment,
/// scoring and selection are the same downstream.
#[derive(Debug, Clone)]
pub enum Seed {
    Repo(String),
    Users(Vec<String>),
}

impl Seed {
    /// Cache key + header label. A repo seed is its own key; a user seed
    /// is `users:` followed by the logins sorted and comma-joined, so the
    /// key does not depend on the order the logins were given in.
    pub fn key(&self) -> String {
        let logins = match self {
            Seed::Repo(repo) => return repo.clone(),
            Seed::Users(logins) => logins,
        };
        let mut sorted: Vec<&str> = logins.iter().map(String::as_str).collect();
        sorted.sort();
        format!("users:{}", sorted.join(","))
    }

    /// Auto-detect from a free-form target. A trimmed target that contains
    /// a `/` and no `,` is taken as `owner/repo`; anything else is split on
    /// commas into user logins, each trimmed, with empty entries dropped.
    pub fn detect(target: &str) -> Seed {
        let t = target.trim();
        let looks_like_repo = t.contains('/') && !t.contains(',');
        if looks_like_repo {
            return Seed::Repo(t.to_owned());
        }
        let logins: Vec<String> = t
            .split(',')
            .map(str::trim)
            .filter(|login| !login.is_empty())
            .map(str::to_owned)
            .collect();
        Seed::Users(logins)
    }
}
1490
/// Run metadata returned alongside the profiles — what was auto-derived,
/// how much was screened, and whether the result is partial. Drives the
/// overview header and the scale/cost story.
///
/// `Default` yields an all-empty value (no derived config, zero counts,
/// not partial, empty noun).
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ScreenMeta {
    /// Relevance keywords that were auto-derived; when present the overview
    /// prints them as `(auto-keywords from repo: …)`.
    pub derived_keywords: Option<Vec<String>>,
    /// Stack languages that were auto-derived; when present the overview
    /// prints them as `(auto-stack from repo: …)`.
    pub derived_stack: Option<Vec<String>>,
    /// Total stargazer count; the overview shows "N of total" only when
    /// `partial` is set and this is greater than zero.
    pub total_stargazers: usize,
    /// NOTE(review): presumably the number of people actually profiled —
    /// not read by the rendering code in this part of the file; confirm
    /// against the orchestrator.
    pub screened: usize,
    /// Whether the result covers only part of the audience.
    pub partial: bool,
    /// Explanation printed in the header as `(partial: …)` when present.
    pub partial_reason: Option<String>,
    /// Number of REST requests, reported in the overview header.
    pub requests: usize,
    /// "stargazers" or "users" — the noun for the header. Defaults to an
    /// empty string when absent from serialized data; the overview then
    /// falls back to "people".
    #[serde(default)]
    pub noun: String,
}
1507
/// Words too generic to be useful relevance keywords: articles,
/// prepositions and pronouns, plus project-description filler such as
/// "library", "framework", "fast" or "lightweight".
///
/// `derive_config` rejects any token that exactly equals an entry here.
/// It also enforces a minimum token length (3 for topic tokens, 4 for
/// description words), so the one- and two-letter entries only act as a
/// backstop. Entries are lowercase — presumably matching the case of the
/// tokens `tokenize` emits; confirm there.
const STOPWORDS: &[&str] = &[
    "a",
    "an",
    "the",
    "and",
    "or",
    "of",
    "to",
    "in",
    "on",
    "for",
    "with",
    "is",
    "it",
    "its",
    "via",
    "using",
    "used",
    "based",
    "written",
    "powered",
    "support",
    "library",
    "lib",
    "tool",
    "tools",
    "framework",
    "simple",
    "lightweight",
    "fast",
    "high",
    "performance",
    "easy",
    "small",
    "modern",
    "minimal",
    "your",
    "you",
    "this",
    "that",
    "from",
    "by",
    "as",
    "at",
    "be",
    "are",
];
1556
1557/// Auto-derive relevance keywords + stack from the seed repo itself, so a
1558/// caller can `screen_stargazers(repo)` with no hand-tuned config. Topics
1559/// and significant description words become keywords; the top languages
1560/// (by bytes) become the stack. Returns (keywords, stack); empty on error.
1561pub fn derive_config(repo: &str) -> (Vec<String>, Vec<String>) {
1562    let meta = match github::gh_get(&format!("repos/{repo}")) {
1563        Ok(m) => m,
1564        Err(_) => return (Vec::new(), Vec::new()),
1565    };
1566    let mut kw: Vec<String> = Vec::new();
1567    // Topics first — curated, high-signal.
1568    if let Some(topics) = meta.get("topics").and_then(Value::as_array) {
1569        for t in topics {
1570            if let Some(s) = t.as_str() {
1571                for tok in tokenize(s) {
1572                    if tok.len() >= 3 && !STOPWORDS.contains(&tok.as_str()) && !kw.contains(&tok) {
1573                        kw.push(tok);
1574                    }
1575                }
1576            }
1577        }
1578    }
1579    // Then significant description words.
1580    if let Some(desc) = meta.get("description").and_then(Value::as_str) {
1581        for tok in tokenize(desc) {
1582            if tok.len() >= 4 && !STOPWORDS.contains(&tok.as_str()) && !kw.contains(&tok) {
1583                kw.push(tok);
1584            }
1585        }
1586    }
1587    kw.truncate(14);
1588
1589    // Stack: top languages by bytes.
1590    let stack = match github::gh_get(&format!("repos/{repo}/languages")) {
1591        Ok(langs) => {
1592            let mut v: Vec<(String, u64)> = langs
1593                .as_object()
1594                .map(|o| {
1595                    o.iter()
1596                        .filter_map(|(k, n)| n.as_u64().map(|b| (k.clone(), b)))
1597                        .collect()
1598                })
1599                .unwrap_or_default();
1600            v.sort_by_key(|x| std::cmp::Reverse(x.1));
1601            v.into_iter().take(2).map(|(l, _)| l).collect()
1602        }
1603        Err(_) => Vec::new(),
1604    };
1605    (kw, stack)
1606}
1607
1608/// [item 6] Follower count for one user — reach signal for outreach.
1609pub fn fetch_user_reach(login: &str) -> Option<u64> {
1610    let u = github::gh_get(&format!("users/{login}")).ok()?;
1611    u.get("followers").and_then(Value::as_u64)
1612}
1613
1614/// [item 7] Repos a user contributes to but does not own, from their recent
1615/// public events — surfaces relevance the owned-repo list can't see.
1616pub fn fetch_contributions(login: &str) -> Vec<String> {
1617    let events = match github::gh_get(&format!("users/{login}/events/public?per_page=100")) {
1618        Ok(Value::Array(a)) => a,
1619        _ => return Vec::new(),
1620    };
1621    let mut seen = std::collections::HashSet::new();
1622    let mut out = Vec::new();
1623    let login_lc = login.to_lowercase();
1624    for ev in &events {
1625        let kind = ev.get("type").and_then(Value::as_str).unwrap_or("");
1626        if !matches!(
1627            kind,
1628            "PushEvent" | "PullRequestEvent" | "IssuesEvent" | "CommitCommentEvent"
1629        ) {
1630            continue;
1631        }
1632        if let Some(full) = ev
1633            .get("repo")
1634            .and_then(|r| r.get("name"))
1635            .and_then(Value::as_str)
1636        {
1637            let owner = full.split('/').next().unwrap_or("");
1638            if owner.to_lowercase() != login_lc && seen.insert(full.to_string()) {
1639                out.push(full.to_string());
1640            }
1641        }
1642    }
1643    out.truncate(6);
1644    out
1645}
1646
1647/// [item 4] Count a dev's original repos that combine *all* stack languages
1648/// in one repo (the true PyO3/maturin co-location signal). Probes
1649/// `/languages` on stack-language repos, bounded to keep it cheap.
1650pub fn probe_colocation(p: &UserProfile, stack: &[String], max_probes: usize) -> usize {
1651    let in_stack = |l: &str| stack.iter().any(|s| s.eq_ignore_ascii_case(l));
1652    let mut probed = 0;
1653    let mut colocated = 0;
1654    for r in p.repos.iter().filter(|r| !r.fork) {
1655        if probed >= max_probes {
1656            break;
1657        }
1658        // Only probe repos whose primary language is in-stack (likeliest
1659        // to also contain the other stack language).
1660        if !r.lang.as_deref().map(in_stack).unwrap_or(false) {
1661            continue;
1662        }
1663        probed += 1;
1664        if let Ok(langs) = github::gh_get(&format!("repos/{}/{}/languages", p.login, r.name)) {
1665            if let Some(obj) = langs.as_object() {
1666                let present: std::collections::HashSet<String> =
1667                    obj.keys().map(|k| k.to_lowercase()).collect();
1668                if stack.iter().all(|s| present.contains(&s.to_lowercase())) {
1669                    colocated += 1;
1670                }
1671            }
1672        }
1673    }
1674    colocated
1675}
1676
1677/// [item 2] Find which of `logins` actually depend on the seed package.
1678/// Code-searches dependency manifests for the package name, intersects the
1679/// owners with the stargazer set, and verifies a bounded (whole-word) match
1680/// against the real manifest line to reject substring collisions
1681/// (e.g. `pkglite` ⊃ `kglite`). Returns login → evidence line.
1682pub fn find_adopters(
1683    pkg: &str,
1684    logins: &std::collections::HashSet<String>,
1685) -> std::collections::HashMap<String, String> {
1686    let mut found = std::collections::HashMap::new();
1687    if pkg.is_empty() {
1688        return found;
1689    }
1690    let logins_lc: std::collections::HashSet<String> =
1691        logins.iter().map(|l| l.to_lowercase()).collect();
1692    let manifests = [
1693        "Cargo.toml",
1694        "pyproject.toml",
1695        "requirements.txt",
1696        "package.json",
1697    ];
1698    for mf in manifests {
1699        let q = format!("{pkg}+filename:{mf}");
1700        let results = match github::gh_get(&format!("search/code?q={q}&per_page=30")) {
1701            Ok(v) => v,
1702            Err(_) => continue,
1703        };
1704        let items = match results.get("items").and_then(Value::as_array) {
1705            Some(a) => a,
1706            None => continue,
1707        };
1708        for it in items {
1709            let full = it
1710                .get("repository")
1711                .and_then(|r| r.get("full_name"))
1712                .and_then(Value::as_str)
1713                .unwrap_or("");
1714            let path = it.get("path").and_then(Value::as_str).unwrap_or("");
1715            let owner = full.split('/').next().unwrap_or("");
1716            if owner.is_empty() || !logins_lc.contains(&owner.to_lowercase()) {
1717                continue;
1718            }
1719            if found.contains_key(&owner.to_lowercase()) {
1720                continue;
1721            }
1722            // Verify a bounded match against the actual manifest line.
1723            if let Some(line) = verify_dependency(full, path, pkg) {
1724                found.insert(owner.to_lowercase(), format!("{full}/{path}: {line}"));
1725            }
1726        }
1727    }
1728    found
1729}
1730
1731/// Fetch a manifest and return the line that declares `pkg` as a bounded
1732/// token (rejecting substring collisions), or None.
1733fn verify_dependency(full_name: &str, path: &str, pkg: &str) -> Option<String> {
1734    let url = format!("https://raw.githubusercontent.com/{full_name}/HEAD/{path}");
1735    let body = ureq::get(&url)
1736        .set("User-Agent", "mcp-methods")
1737        .call()
1738        .ok()?
1739        .into_string()
1740        .ok()?;
1741    let pkg_lc = pkg.to_lowercase();
1742    for line in body.lines() {
1743        let l = line.to_lowercase();
1744        if let Some(idx) = l.find(&pkg_lc) {
1745            let before = l[..idx].chars().next_back();
1746            let after = l[idx + pkg_lc.len()..].chars().next();
1747            let boundary = |c: Option<char>| {
1748                c.map(|c| !(c.is_alphanumeric() || c == '_' || c == '-'))
1749                    .unwrap_or(true)
1750            };
1751            if boundary(before) && boundary(after) {
1752                return Some(line.trim().chars().take(80).collect());
1753            }
1754        }
1755    }
1756    None
1757}
1758
// ---------------------------------------------------------------------------
// Orchestration (fetch)
// ---------------------------------------------------------------------------
1762
1763/// Fetch a repo's stargazer logins, most-recent first (owner excluded).
1764/// Uncapped — the caller samples [item 9].
1765pub fn fetch_stargazer_logins(repo: &str) -> Result<Vec<String>, String> {
1766    if let Some(err) = crate::git_refs::validate_repo(repo) {
1767        return Err(err);
1768    }
1769    // STARRED_AT ordering isn't available on REST stargazers without a
1770    // preview media type; the default order is ascending by starred date,
1771    // so the *last* pages are the most recent — reverse to get most-recent
1772    // first for sampling.
1773    let endpoint = format!("repos/{repo}/stargazers?per_page=100");
1774    let pages = github::gh_get_paginated(&endpoint)?;
1775    let owner = repo.split('/').next().unwrap_or("").to_lowercase();
1776    let mut logins: Vec<String> = pages
1777        .iter()
1778        .filter_map(|u| u.get("login").and_then(Value::as_str).map(String::from))
1779        .filter(|l| l.to_lowercase() != owner)
1780        .collect();
1781    logins.reverse();
1782    Ok(logins)
1783}
1784
1785/// Fetch + project one user's owned repos. Returns (repos, capped).
1786pub fn fetch_portfolio(login: &str, cfg: &ScreenConfig) -> Result<(Vec<RepoLite>, bool), String> {
1787    let endpoint = format!("users/{login}/repos?sort=pushed&per_page=100");
1788    let raw = github::gh_get_paginated(&endpoint)?;
1789    let capped = raw.len() >= cfg.max_repos_per_user;
1790    let repos: Vec<RepoLite> = raw
1791        .iter()
1792        .take(cfg.max_repos_per_user)
1793        .map(project_repo)
1794        .collect();
1795    Ok((repos, capped))
1796}
1797
1798/// Full screen: derive config if needed [item 1], sample stargazers [item 9],
1799/// fetch portfolios, classify, then enrich the shortlist [items 2/4/6/7].
1800/// Returns the profiles, run metadata, and the *effective* config (so a
1801/// cached re-rank can reuse the derived keywords/stack).
1802pub fn run_screen(
1803    seed: &Seed,
1804    cfg_in: &ScreenConfig,
1805) -> Result<(Vec<UserProfile>, ScreenMeta, ScreenConfig), String> {
1806    let mut cfg = cfg_in.clone();
1807    let mut meta = ScreenMeta::default();
1808
1809    // The seed package (for adoption) and the people-set differ by seed type;
1810    // everything downstream is identical.
1811    let (pkg, all): (String, Vec<String>) = match seed {
1812        Seed::Repo(repo) => {
1813            meta.noun = "stargazers".into();
1814            // [item 1] Auto-derive missing config from the seed repo itself.
1815            if cfg.relevance_keywords.is_empty() || cfg.stack_languages.is_empty() {
1816                let (kw, st) = derive_config(repo);
1817                meta.requests += 2;
1818                if cfg.relevance_keywords.is_empty() && !kw.is_empty() {
1819                    cfg.relevance_keywords = kw.clone();
1820                    meta.derived_keywords = Some(kw);
1821                }
1822                if cfg.stack_languages.is_empty() && !st.is_empty() {
1823                    cfg.stack_languages = st.clone();
1824                    meta.derived_stack = Some(st);
1825                }
1826            }
1827            let pkg = repo.rsplit('/').next().unwrap_or("").to_string();
1828            // [item 9] Sample: most-recent N.
1829            let all = fetch_stargazer_logins(repo)?;
1830            meta.requests += all.len() / 100 + 1;
1831            (pkg, all)
1832        }
1833        Seed::Users(users) => {
1834            meta.noun = "users".into();
1835            // Explicit user set: no seed repo, so no auto-derive and no
1836            // adoption package. Relatedness needs `keywords` to be useful.
1837            (String::new(), users.clone())
1838        }
1839    };
1840    meta.total_stargazers = all.len();
1841    let logins: Vec<String> = match cfg.max_stargazers {
1842        Some(c) => all.into_iter().take(c).collect(),
1843        None => all,
1844    };
1845
1846    // Portfolios + classify, with graceful rate-limit handling [item 9].
1847    let mut profiles = Vec::with_capacity(logins.len());
1848    for login in &logins {
1849        match fetch_portfolio(login, &cfg) {
1850            Ok((repos, capped)) => profiles.push(profile_user(login, repos, capped, &cfg)),
1851            Err(e) if e.to_lowercase().contains("rate limit") => {
1852                meta.partial = true;
1853                meta.partial_reason = Some(format!(
1854                    "GitHub rate limit hit after {} of {} stargazers — retry later or set max_stargazers",
1855                    profiles.len(),
1856                    logins.len()
1857                ));
1858                break;
1859            }
1860            Err(_) => profiles.push(profile_user(login, Vec::new(), false, &cfg)),
1861        }
1862    }
1863    meta.screened = profiles.len();
1864    meta.requests += profiles.len();
1865    if !meta.partial && meta.screened < meta.total_stargazers {
1866        meta.partial = true;
1867        meta.partial_reason = Some(format!(
1868            "screened {} most-recent of {} stargazers (max_stargazers cap)",
1869            meta.screened, meta.total_stargazers
1870        ));
1871    }
1872
1873    enrich(&pkg, &mut profiles, &cfg, &mut meta);
1874    normalize_scores(&mut profiles);
1875    Ok((profiles, meta, cfg))
1876}
1877
1878/// Bounded shortlist enrichment: adoption [item 2] across all screened,
1879/// then reach [item 6] + contributions [item 7] + co-location [item 4] for
1880/// the highest-priority candidates. Keeps the per-call cost bounded so the
1881/// bulk pass stays cheap.
1882fn enrich(pkg: &str, profiles: &mut [UserProfile], cfg: &ScreenConfig, meta: &mut ScreenMeta) {
1883    // [item 2] Adoption — who actually depends on the seed package. Skipped
1884    // for a Users seed (no package).
1885    let logins: std::collections::HashSet<String> =
1886        profiles.iter().map(|p| p.login.clone()).collect();
1887    let adopters = if pkg.is_empty() {
1888        std::collections::HashMap::new()
1889    } else {
1890        meta.requests += 4;
1891        find_adopters(pkg, &logins)
1892    };
1893    for p in profiles.iter_mut() {
1894        if let Some(ev) = adopters.get(&p.login.to_lowercase()) {
1895            p.adopter = true;
1896            p.adoption_evidence = Some(ev.clone());
1897        }
1898    }
1899
1900    // Shortlist = adopters ∪ keyword leads ∪ stack matches, priority-ordered,
1901    // bounded so enrichment cost stays modest.
1902    let mut idx: Vec<usize> = (0..profiles.len())
1903        .filter(|&i| profiles[i].adopter || profiles[i].strong_hit || profiles[i].stack_match)
1904        .collect();
1905    idx.sort_by(|&a, &b| {
1906        let key = |p: &UserProfile| (p.adopter, p.strong_hit, stack_depth(p), p.total_stars);
1907        let (ka, kb) = (key(&profiles[a]), key(&profiles[b]));
1908        kb.cmp(&ka)
1909    });
1910    idx.truncate(15);
1911
1912    // Legend candidates: the highest-traction stargazers overall (by total
1913    // stars) — may be off-domain, but we want follower counts to answer
1914    // "is one of my stargazers a coding legend?".
1915    let mut legend: Vec<usize> = (0..profiles.len()).collect();
1916    legend.sort_by(|&a, &b| profiles[b].total_stars.cmp(&profiles[a].total_stars));
1917    legend.truncate(8);
1918
1919    // Reach (followers) for shortlist ∪ legend candidates.
1920    let mut reach_idx = idx.clone();
1921    for l in legend {
1922        if !reach_idx.contains(&l) {
1923            reach_idx.push(l);
1924        }
1925    }
1926    reach_idx.truncate(20);
1927    for &i in &reach_idx {
1928        if profiles[i].followers.is_none() {
1929            if let Some(f) = fetch_user_reach(&profiles[i].login) {
1930                profiles[i].followers = Some(f);
1931            }
1932            meta.requests += 1;
1933        }
1934    }
1935
1936    // [item 7] Contributions for the domain shortlist.
1937    for &i in &idx {
1938        let c = fetch_contributions(&profiles[i].login);
1939        meta.requests += 1;
1940        if !c.is_empty() {
1941            profiles[i].contributes_to = c;
1942        }
1943    }
1944
1945    // [item 4] Co-location for the top stack matches by depth (the ranking
1946    // shown in the overview), independent of the lead ordering.
1947    if !cfg.stack_languages.is_empty() {
1948        let mut stack_idx: Vec<usize> = (0..profiles.len())
1949            .filter(|&i| profiles[i].stack_match)
1950            .collect();
1951        stack_idx.sort_by(|&a, &b| stack_depth(&profiles[b]).cmp(&stack_depth(&profiles[a])));
1952        for &i in stack_idx.iter().take(5) {
1953            let n = probe_colocation(&profiles[i], &cfg.stack_languages, 6);
1954            profiles[i].colocated_repos = Some(n);
1955            meta.requests += 6;
1956        }
1957    }
1958}
1959
1960/// Per-repo "quality" proxy from cheap signals. Maintenance dominates so
1961/// this lens stays *distinct* from raw popularity: stars only validate (and
1962/// are capped, so a mega-hit can't buy a quality slot), while recency and
1963/// curation drive the score and dormancy is penalised — a tidy, active,
1964/// topic-tagged repo outranks an abandoned star-magnet.
1965fn repo_quality(r: &RepoLite) -> f64 {
1966    let mut q = 0.0;
1967    if r.desc.is_some() {
1968        q += 1.0;
1969    }
1970    if !r.topics.is_empty() {
1971        q += 1.0;
1972    }
1973    // Recency is the strongest maintenance signal.
1974    q += if r.pushed.as_str() >= "2026-01" {
1975        2.0
1976    } else if r.pushed.as_str() >= "2025-01" {
1977        1.0
1978    } else if r.pushed.as_str() < "2023-01" {
1979        -2.0
1980    } else {
1981        0.0
1982    };
1983    if r.archived {
1984        q -= 2.0;
1985    }
1986    // Stars validate but can't dominate — capped at ~ln(20).
1987    q += ((r.stars + 1) as f64).ln().min(3.0);
1988    q
1989}
1990
1991/// Best repo-quality across a dev's original repos.
1992fn quality_score(p: &UserProfile) -> f64 {
1993    p.repos
1994        .iter()
1995        .filter(|r| !r.fork)
1996        .map(repo_quality)
1997        .fold(0.0_f64, f64::max)
1998}
1999
2000/// A "coding legend": large audience or a genuinely popular project.
2001fn is_legend(p: &UserProfile) -> bool {
2002    p.followers.map(|f| f >= 1000).unwrap_or(false) || p.max_stars >= 1000 || p.total_stars >= 2500
2003}
2004
// ---------------------------------------------------------------------------
// Drill-down + in-memory session store (the MCP tool surface)
// ---------------------------------------------------------------------------
2008
2009/// Resolve a drill `element_id` against an already-screened profile set.
2010/// Mirrors `github_issues`' `element_id` convention:
2011///   `cohort:<key>` · `user:<login>` · `user:<login>/repo:<name>` ·
2012///   `user:<login>/repo:<name>/readme` (the only drill that costs a request).
2013pub fn drill(profiles: &[UserProfile], element_id: &str) -> String {
2014    if let Some(key) = element_id.strip_prefix("cohort:") {
2015        return match archetype_from_key(key) {
2016            Some(a) => render_cohort(a, profiles),
2017            None => format!(
2018                "unknown cohort '{key}'. Try: established, single, prolific, casual, dormant, consumers"
2019            ),
2020        };
2021    }
2022    let rest = element_id.strip_prefix("user:").unwrap_or(element_id);
2023    let mut parts = rest.splitn(2, "/repo:");
2024    let login = parts.next().unwrap_or("");
2025    let prof = match profiles
2026        .iter()
2027        .find(|p| p.login.eq_ignore_ascii_case(login))
2028    {
2029        Some(p) => p,
2030        None => return format!("no such stargazer in this screen: '{login}'"),
2031    };
2032    match parts.next() {
2033        None => render_user(prof),
2034        Some(repo_part) => {
2035            let (rname, want_readme) = match repo_part.strip_suffix("/readme") {
2036                Some(n) => (n, true),
2037                None => (repo_part, false),
2038            };
2039            let r = match prof
2040                .repos
2041                .iter()
2042                .find(|r| r.name.eq_ignore_ascii_case(rname))
2043            {
2044                Some(r) => r,
2045                None => return format!("{login} has no repo '{rname}' in this screen"),
2046            };
2047            if want_readme {
2048                let full = format!("{login}/{rname}");
2049                match fetch_readme(&full) {
2050                    Ok(gist) => format!("README — {full}\n\n{gist}"),
2051                    Err(e) => e,
2052                }
2053            } else {
2054                render_repo(login, r)
2055            }
2056        }
2057    }
2058}
2059
/// One cached screen: the enriched profiles, run metadata, and the
/// *effective* config (with any auto-derived keywords/stack) so a re-rank
/// can reuse them.
///
/// `screen_dispatch` stores one of these per seed key after a successful
/// `run_screen`, and reads it back for drills and for free re-ranks.
#[derive(Clone, Serialize, Deserialize)]
pub struct CachedScreen {
    /// Classified and enriched profiles, as returned by `run_screen`.
    pub profiles: Vec<UserProfile>,
    /// Metadata of the run that produced `profiles`.
    pub meta: ScreenMeta,
    /// Config the run actually used, including auto-derived keywords/stack.
    pub cfg: ScreenConfig,
}
2069
2070/// In-memory store of screened profile sets, keyed by seed repo. Lives for
2071/// the server's lifetime — the stargazer-screen analogue of `ElementCache`.
2072#[derive(Default)]
2073pub struct ScreenStore {
2074    store: std::collections::HashMap<String, CachedScreen>,
2075}
2076
2077impl ScreenStore {
2078    pub fn new() -> Self {
2079        Self::default()
2080    }
2081}
2082
2083/// Carry the (keyword-independent) enrichment fields from a cached profile
2084/// onto a freshly re-classified one, so a free re-rank keeps the expensive
2085/// follower/adoption/co-location/contribution data.
2086fn carry_enrichment(fresh: &mut UserProfile, cached: &UserProfile) {
2087    fresh.followers = cached.followers;
2088    fresh.adopter = cached.adopter;
2089    fresh.adoption_evidence = cached.adoption_evidence.clone();
2090    fresh.colocated_repos = cached.colocated_repos;
2091    fresh.contributes_to = cached.contributes_to.clone();
2092}
2093
2094/// One-call entry point for the `screen_stargazers` MCP tool. Without
2095/// `element_id`: build (or reuse) the screen and return the overview —
2096/// re-classified against the current keywords/stack so the agent can
2097/// re-key a cached fetch for free. With `element_id`: drill the cached set.
2098pub fn screen_dispatch(
2099    store: &std::sync::Mutex<ScreenStore>,
2100    seed: &Seed,
2101    cfg: &ScreenConfig,
2102    selection: Option<&Selection>,
2103    element_id: Option<&str>,
2104    refresh: bool,
2105) -> String {
2106    let key = seed.key();
2107    // Drill path — pure cache hit, no network (README excepted).
2108    if let Some(eid) = element_id {
2109        let guard = store.lock().unwrap();
2110        return match guard.store.get(&key) {
2111            Some(c) => drill(&c.profiles, eid),
2112            None => format!(
2113                "No screen cached for {key}. Call screen(target=\"{key}\") first (no element_id) to build it."
2114            ),
2115        };
2116    }
2117
2118    // Overview path — reuse the cached fetch unless refreshing, re-classifying
2119    // with the live cfg (free re-rank) while carrying enrichment over.
2120    if !refresh {
2121        let guard = store.lock().unwrap();
2122        if let Some(c) = guard.store.get(&key) {
2123            // Incoming cfg overrides cached, else keep cached (auto-derived).
2124            let eff = ScreenConfig {
2125                max_repos_per_user: cfg.max_repos_per_user,
2126                max_stargazers: cfg.max_stargazers.or(c.cfg.max_stargazers),
2127                relevance_keywords: if cfg.relevance_keywords.is_empty() {
2128                    c.cfg.relevance_keywords.clone()
2129                } else {
2130                    cfg.relevance_keywords.clone()
2131                },
2132                stack_languages: if cfg.stack_languages.is_empty() {
2133                    c.cfg.stack_languages.clone()
2134                } else {
2135                    cfg.stack_languages.clone()
2136                },
2137            };
2138            let mut reclassified: Vec<UserProfile> = c
2139                .profiles
2140                .iter()
2141                .map(|u| {
2142                    let mut p = profile_user(&u.login, u.repos.clone(), u.capped, &eff);
2143                    carry_enrichment(&mut p, u);
2144                    p
2145                })
2146                .collect();
2147            normalize_scores(&mut reclassified);
2148            return build_overview(&key, &reclassified, &c.meta, &eff, selection);
2149        }
2150    }
2151
2152    match run_screen(seed, cfg) {
2153        Ok((profiles, meta, eff)) => {
2154            let out = build_overview(&key, &profiles, &meta, &eff, selection);
2155            store.lock().unwrap().store.insert(
2156                key,
2157                CachedScreen {
2158                    profiles,
2159                    meta,
2160                    cfg: eff,
2161                },
2162            );
2163            out
2164        }
2165        Err(e) => e,
2166    }
2167}
2168
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Minimal GitHub repo payload: not archived, no topics, no forks, with
    /// `pushed_at` and `created_at` both set to midnight UTC on `pushed`.
    fn repo(name: &str, fork: bool, stars: u64, lang: &str, desc: &str, pushed: &str) -> Value {
        let stamp = format!("{pushed}T00:00:00Z");
        json!({
            "name": name,
            "fork": fork,
            "archived": false,
            "language": lang,
            "stargazers_count": stars,
            "forks_count": 0,
            "pushed_at": &stamp,
            "created_at": &stamp,
            "topics": [],
            "description": desc,
        })
    }

    /// Default config with the single relevance keyword "graph".
    fn graph_cfg() -> ScreenConfig {
        ScreenConfig {
            relevance_keywords: vec!["graph".into()],
            ..Default::default()
        }
    }

    /// Project raw payloads and classify them as the user "tester".
    fn profile_of(repos: Vec<Value>) -> UserProfile {
        let lite = repos.iter().map(project_repo).collect::<Vec<RepoLite>>();
        profile_user("tester", lite, false, &graph_cfg())
    }

    #[test]
    fn single_project_dev_detected() {
        let flagship = repo("flagship", false, 179, "Lua", "graph preview plugin", "2025-05-29");
        let misc1 = repo("misc1", false, 1, "Python", "small thing", "2025-04-01");
        let misc2 = repo("misc2", false, 0, "Lua", "another", "2025-03-01");

        let p = profile_of(vec![flagship, misc1, misc2]);

        assert_eq!(p.archetype, Archetype::SingleProject);
        assert_eq!(p.flagship.as_ref().unwrap().name, "flagship");
        assert_eq!(p.relevant.len(), 1); // matched "graph"
    }

    #[test]
    fn prolific_builder_detected() {
        let repos: Vec<Value> = (0..20)
            .map(|i| repo(&format!("r{i}"), false, 0, "Rust", "experiment", "2026-05-01"))
            .collect();

        let p = profile_of(repos);

        assert_eq!(p.archetype, Archetype::Prolific);
        assert_eq!(p.top_langs, vec!["Rust".to_string()]);
    }

    #[test]
    fn consumer_detected() {
        let forks = vec![
            repo("fork1", true, 0, "JS", "someone elses", "2025-01-01"),
            repo("fork2", true, 0, "JS", "another fork", "2025-01-01"),
        ];
        assert_eq!(profile_of(forks).archetype, Archetype::Consumer);
    }

    #[test]
    fn projection_drops_empty_desc() {
        let raw = repo("x", false, 0, "Rust", "", "2025-01-01");
        assert!(project_repo(&raw).desc.is_none());
    }

    /// Two screened users: "solo" (one 99★ graph repo) and "builder" (eight
    /// zero-star Go repos).
    fn two_profiles() -> Vec<UserProfile> {
        let cfg = graph_cfg();

        let solo_repos = vec![project_repo(&repo(
            "flag",
            false,
            99,
            "Rust",
            "graph engine",
            "2025-05-01",
        ))];
        let solo = profile_user("solo", solo_repos, false, &cfg);

        let builder_repos: Vec<RepoLite> = (0..8)
            .map(|i| project_repo(&repo(&format!("r{i}"), false, 0, "Go", "x", "2026-01-01")))
            .collect();
        let builder = profile_user("builder", builder_repos, false, &cfg);

        vec![solo, builder]
    }

    #[test]
    fn drill_user_and_repo() {
        let p = two_profiles();
        let cases = [
            ("user:solo", "Single-project"),
            ("user:solo/repo:flag", "99★"),
            ("user:nobody", "no such stargazer"),
            ("user:solo/repo:ghost", "no repo"),
        ];
        for (element_id, expected) in cases {
            assert!(drill(&p, element_id).contains(expected));
        }
    }

    #[test]
    fn drill_cohort() {
        let p = two_profiles();
        let prolific = drill(&p, "cohort:prolific");
        assert!(prolific.contains("Prolific builders"));
        assert!(prolific.contains("builder"));
        assert!(drill(&p, "cohort:bogus").contains("unknown cohort"));
    }

    #[test]
    fn dispatch_caches_then_drills_without_refetch() {
        // No network: pre-seed the store, then a drill must hit the cache.
        let store = std::sync::Mutex::new(ScreenStore::new());
        let seeded = CachedScreen {
            profiles: two_profiles(),
            meta: ScreenMeta::default(),
            cfg: ScreenConfig::default(),
        };
        store.lock().unwrap().store.insert("a/b".into(), seeded);

        let cfg = ScreenConfig::default();
        let drill_solo = |target: &str| {
            screen_dispatch(
                &store,
                &Seed::Repo(target.into()),
                &cfg,
                None,
                Some("user:solo"),
                false,
            )
        };

        assert!(drill_solo("a/b").contains("Single-project"));
        // Missing repo → friendly "build it first".
        assert!(drill_solo("x/y").contains("No screen cached"));
    }
}