Skip to main content

pi/
extension_validation.rs

1//! Deterministic classifier and deduplication engine for Pi extension candidates.
2//!
3//! This module takes mixed-source research data (GitHub code search, repo search,
4//! npm scan, curated lists) and produces a validated, deduplicated candidate set.
5//!
6//! Each candidate gets:
7//! - A `ValidationStatus` (true-extension, mention-only, unknown)
8//! - `ValidationEvidence` (which signals matched)
9//! - A canonical identity key for deduplication
10//!
11//! The classifier is intentionally conservative: a candidate must show clear Pi
12//! extension API usage to be classified as `TrueExtension`.
13
14use crate::extension_popularity::{
15    CandidateItem, CandidatePool, GitHubRepoCandidate, github_repo_candidate_from_url,
16};
17use serde::{Deserialize, Serialize};
18use std::collections::HashMap;
19
20// ────────────────────────────────────────────────────────────────────────────
21// Classification types
22// ────────────────────────────────────────────────────────────────────────────
23
/// Validation status for a candidate.
///
/// Serialized in `snake_case` (e.g. `true_extension`) via the serde attribute
/// below, so JSON consumers see stable lowercase tags.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ValidationStatus {
    /// Confirmed Pi extension: has API import + export default or registration calls.
    TrueExtension,
    /// Mentions Pi but does not implement the extension protocol.
    MentionOnly,
    /// Insufficient evidence to classify.
    Unknown,
}
35
36/// Evidence supporting a validation decision.
/// Evidence supporting a validation decision.
///
/// During deduplication (`merge_into`) evidence from multiple sources is
/// unioned: the booleans are OR-ed together and the vectors accumulate
/// unique entries.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ValidationEvidence {
    /// Has `@mariozechner/pi-coding-agent` or `@mariozechner/pi-ai` import.
    pub has_api_import: bool,
    /// Has `export default` in entrypoint.
    pub has_export_default: bool,
    /// Registration API calls found (e.g. `registerTool`, `registerCommand`).
    pub registrations: Vec<String>,
    /// Sources that contributed to this candidate (e.g. "code_search", "npm_scan").
    pub sources: Vec<String>,
    /// Human-readable reason for the classification decision.
    pub reason: String,
}
50
/// A fully validated candidate with classification and dedup info.
///
/// Produced by `run_validation_pipeline` after all sources have been merged
/// under one canonical identity.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidatedCandidate {
    /// Canonical identity key (e.g. "nicobailon/pi-messenger" or "npm:@oh-my-pi/lsp").
    pub canonical_id: String,
    /// Display name (typically the repo or package name without the owner).
    pub name: String,
    /// Validation status.
    pub status: ValidationStatus,
    /// Evidence for classification.
    pub evidence: ValidationEvidence,
    /// Aliases — other identifiers that map to this canonical entry.
    pub aliases: Vec<String>,
    /// Source tier (official-pi-mono, community, npm-registry, third-party-github).
    pub source_tier: Option<String>,
    /// Repository URL (if known).
    pub repository_url: Option<String>,
    /// npm package name (if known).
    pub npm_package: Option<String>,
}
71
/// Output of the full validation + dedup pipeline.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidationReport {
    /// ISO-8601 UTC timestamp of report generation (see `chrono_now_iso`).
    pub generated_at: String,
    /// Task ID copied from `ValidationConfig` for provenance tracking.
    pub task: String,
    /// Aggregate counts over `candidates`.
    pub stats: ValidationStats,
    /// Deduplicated, classified candidates, sorted by canonical ID.
    pub candidates: Vec<ValidatedCandidate>,
}

/// Aggregate statistics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ValidationStats {
    /// Entries ingested across all sources, counted before deduplication.
    pub total_input_candidates: usize,
    /// Distinct candidates remaining after merging by canonical ID.
    pub after_dedup: usize,
    /// Candidates classified as `TrueExtension`.
    pub true_extension: usize,
    /// Candidates classified as `MentionOnly`.
    pub mention_only: usize,
    /// Candidates classified as `Unknown`.
    pub unknown: usize,
    /// Candidates whose evidence was contributed by more than one source.
    pub sources_merged: usize,
}
91
92// ────────────────────────────────────────────────────────────────────────────
93// Research source input types (deserialized from JSON files)
94// ────────────────────────────────────────────────────────────────────────────
95
/// A candidate from the GitHub code search inventory.
///
/// Fields marked `#[serde(default)]` tolerate missing keys in the input JSON.
/// Note: only `repo`, `has_api_import`, `has_export_default`, and
/// `registrations` are consumed by `run_validation_pipeline`; the remaining
/// fields are carried for completeness of the deserialized record.
#[derive(Debug, Clone, Deserialize)]
pub struct CodeSearchEntry {
    /// GitHub slug, e.g. "owner/repo".
    pub repo: String,
    /// Path of the primary matched file within the repo.
    pub path: String,
    #[serde(default)]
    pub all_paths: Vec<String>,
    #[serde(default)]
    pub is_valid_extension: bool,
    #[serde(default)]
    pub has_api_import: bool,
    #[serde(default)]
    pub has_export_default: bool,
    #[serde(default)]
    pub registrations: Vec<String>,
    #[serde(default)]
    pub file_count: usize,
}

/// Wrapper for code search inventory JSON.
#[derive(Debug, Clone, Deserialize)]
pub struct CodeSearchInventory {
    /// Free-form metadata blob; not interpreted by the pipeline.
    pub meta: serde_json::Value,
    pub extensions: Vec<CodeSearchEntry>,
}
121
/// A candidate from the GitHub repo search.
///
/// Repo-search results are treated by the pipeline as pre-validated
/// (API import and export-default are assumed true on ingest).
#[derive(Debug, Clone, Deserialize)]
pub struct RepoSearchEntry {
    /// GitHub slug, e.g. "owner/repo".
    pub repo: String,
    #[serde(default)]
    pub entrypoint: Option<String>,
    #[serde(default)]
    pub stars: Option<u64>,
    #[serde(default)]
    pub description: Option<String>,
    #[serde(default)]
    pub registrations: Vec<String>,
}

/// Wrapper for repo search summary JSON.
#[derive(Debug, Clone, Deserialize)]
pub struct RepoSearchSummary {
    pub repos: Vec<RepoSearchEntry>,
}
141
/// A candidate from the npm scan.
///
/// npm metadata cannot show `export default`, so only `has_pi_dep` feeds
/// the classifier; the `repository` URL is used to link the package to a
/// GitHub canonical ID during dedup.
#[derive(Debug, Clone, Deserialize)]
pub struct NpmScanEntry {
    /// Package name, possibly scoped (e.g. "@oh-my-pi/lsp").
    pub name: String,
    #[serde(default)]
    pub version: Option<String>,
    #[serde(default)]
    pub description: Option<String>,
    /// Repository URL from package metadata (any form: https, git+https, ssh).
    #[serde(default)]
    pub repository: Option<String>,
    /// Whether the package declares a Pi dependency.
    #[serde(default)]
    pub has_pi_dep: bool,
}

/// Wrapper for npm scan summary JSON.
#[derive(Debug, Clone, Deserialize)]
pub struct NpmScanSummary {
    pub packages: Vec<NpmScanEntry>,
}
161
/// A candidate from the curated list sweep.
///
/// `name` may be a GitHub slug ("owner/repo") or a bare package name; the
/// pipeline branches on the presence of `/` to decide the canonical ID form.
#[derive(Debug, Clone, Deserialize)]
pub struct CuratedListEntry {
    pub name: String,
    /// The curated list this entry came from (used for source attribution).
    #[serde(default)]
    pub source: Option<String>,
    /// Category within the list (e.g. "extensions", "providers", "skills").
    #[serde(default)]
    pub category: Option<String>,
    #[serde(default)]
    pub status: Option<String>,
}

/// Wrapper for curated list summary JSON.
#[derive(Debug, Clone, Deserialize)]
pub struct CuratedListSummary {
    pub candidates: Vec<CuratedListEntry>,
}
179
180// ────────────────────────────────────────────────────────────────────────────
181// Canonical ID generation
182// ────────────────────────────────────────────────────────────────────────────
183
/// Normalize a GitHub repo slug to lowercase `owner/repo`.
///
/// Trims surrounding whitespace, lowercases, and drops a trailing `.git`.
#[must_use]
pub fn normalize_github_repo(repo: &str) -> String {
    let lowered = repo.trim().to_lowercase();
    match lowered.strip_suffix(".git") {
        Some(stripped) => stripped.to_string(),
        None => lowered,
    }
}
191
192/// Extract a canonical ID from a GitHub repository URL.
193/// Returns `owner/repo` in lowercase, or None if not a GitHub URL.
194#[must_use]
195pub fn canonical_id_from_repo_url(url: &str) -> Option<String> {
196    match github_repo_candidate_from_url(url)? {
197        GitHubRepoCandidate::Repo(r) => Some(format!(
198            "{}/{}",
199            r.owner.to_lowercase(),
200            r.repo.to_lowercase()
201        )),
202        GitHubRepoCandidate::Slug(_) => None,
203    }
204}
205
/// Generate a canonical ID from an npm package name.
/// Prefixed with `npm:` to distinguish from GitHub repos.
#[must_use]
pub fn canonical_id_from_npm(package: &str) -> String {
    let normalized = package.trim().to_lowercase();
    format!("npm:{normalized}")
}
212
/// Generate a canonical ID from a GitHub repo slug (e.g. "owner/repo").
///
/// Thin alias over [`normalize_github_repo`] (trim, lowercase, strip `.git`),
/// kept as a separate name so call sites read as "make a canonical ID".
#[must_use]
pub fn canonical_id_from_repo_slug(slug: &str) -> String {
    normalize_github_repo(slug)
}
218
219// ────────────────────────────────────────────────────────────────────────────
220// Classification logic
221// ────────────────────────────────────────────────────────────────────────────
222
/// Known Pi extension API registration methods.
///
/// `classify_source_content` searches raw source text for `<method>(` for
/// each entry here; extend this list when the extension API gains a new hook.
const REGISTRATION_METHODS: &[&str] = &[
    "registerTool",
    "registerCommand",
    "registerProvider",
    "registerShortcut",
    "registerFlag",
    "registerMessageRenderer",
];
232
233/// Classify a candidate based on code-level evidence.
234///
235/// A candidate is `TrueExtension` if it has:
236/// - An API import (`@mariozechner/pi-coding-agent`, `@mariozechner/pi-ai`, or `ExtensionAPI`)
237/// - AND either `export default` or at least one registration call.
238///
239/// It is `MentionOnly` if it references Pi but lacks the protocol implementation.
240/// Otherwise it is `Unknown`.
241#[must_use]
242pub fn classify_from_evidence(evidence: &ValidationEvidence) -> ValidationStatus {
243    let has_registrations = !evidence.registrations.is_empty();
244
245    if evidence.has_api_import && (evidence.has_export_default || has_registrations) {
246        ValidationStatus::TrueExtension
247    } else if evidence.has_api_import || has_registrations || evidence.has_export_default {
248        // Has some signal but not enough for full classification.
249        ValidationStatus::MentionOnly
250    } else {
251        ValidationStatus::Unknown
252    }
253}
254
255/// Classify extension source content (raw TypeScript/JavaScript).
256#[must_use]
257pub fn classify_source_content(content: &str) -> (ValidationStatus, ValidationEvidence) {
258    let has_api_import = content.contains("@mariozechner/pi-coding-agent")
259        || content.contains("@mariozechner/pi-ai")
260        || content.contains("ExtensionAPI");
261
262    let has_export_default = content.contains("export default");
263
264    let mut registrations = Vec::new();
265    for method in REGISTRATION_METHODS {
266        let pattern = format!("{method}(");
267        if content.contains(&pattern) {
268            registrations.push((*method).to_string());
269        }
270    }
271
272    let evidence = ValidationEvidence {
273        has_api_import,
274        has_export_default,
275        registrations: registrations.clone(),
276        sources: vec!["source_content".to_string()],
277        reason: build_classification_reason(has_api_import, has_export_default, &registrations),
278    };
279
280    let status = classify_from_evidence(&evidence);
281    (status, evidence)
282}
283
/// Build a human-readable reason string.
///
/// Joins one label per present signal with "; ", or returns a fixed
/// "no signals" message when nothing matched.
fn build_classification_reason(
    has_api_import: bool,
    has_export_default: bool,
    registrations: &[String],
) -> String {
    // Pair each signal with its label, then keep only the labels that apply.
    let labels: Vec<&str> = [
        (has_api_import, "Pi API import found"),
        (has_export_default, "export default present"),
        (!registrations.is_empty(), "registration calls detected"),
    ]
    .iter()
    .filter(|(present, _)| *present)
    .map(|(_, label)| *label)
    .collect();

    if labels.is_empty() {
        "no Pi extension signals detected".to_string()
    } else {
        labels.join("; ")
    }
}
306
307// ────────────────────────────────────────────────────────────────────────────
308// Deduplication engine
309// ────────────────────────────────────────────────────────────────────────────
310
/// Intermediate merge record used during dedup.
///
/// Mirrors `ValidatedCandidate` minus the computed `status`, plus the
/// `is_vendored` flag that later forces a `TrueExtension` promotion.
#[derive(Debug, Clone)]
struct MergeRecord {
    canonical_id: String,
    name: String,
    evidence: ValidationEvidence,
    aliases: Vec<String>,
    source_tier: Option<String>,
    repository_url: Option<String>,
    npm_package: Option<String>,
    /// Whether this candidate has a vendored artifact (already confirmed as extension).
    is_vendored: bool,
}

/// Merge map keyed by canonical ID.
type MergeMap = HashMap<String, MergeRecord>;
327
/// Merge a new candidate into the merge map.
/// If the canonical ID already exists, merge evidence and aliases.
///
/// All call sites pass `canonical_id == record.canonical_id`, so an existing
/// entry's identity never changes — an incoming record only enriches it.
/// First-writer wins for `repository_url` and `npm_package`; vectors keep
/// insertion order and stay duplicate-free.
fn merge_into(map: &mut MergeMap, canonical_id: String, record: MergeRecord) {
    if let Some(existing) = map.get_mut(&canonical_id) {
        // Merge evidence: take the union of signals.
        existing.evidence.has_api_import |= record.evidence.has_api_import;
        existing.evidence.has_export_default |= record.evidence.has_export_default;
        for reg in &record.evidence.registrations {
            if !existing.evidence.registrations.contains(reg) {
                existing.evidence.registrations.push(reg.clone());
            }
        }
        for src in &record.evidence.sources {
            if !existing.evidence.sources.contains(src) {
                existing.evidence.sources.push(src.clone());
            }
        }
        // Merge aliases: add any new ones, but never record the entry's own
        // canonical ID as an alias of itself.
        for alias in &record.aliases {
            if !existing.aliases.contains(alias) && *alias != existing.canonical_id {
                existing.aliases.push(alias.clone());
            }
        }
        // Merge vendored status (sticky: once vendored, always vendored).
        existing.is_vendored |= record.is_vendored;
        // Prefer more specific/curated source tiers over generic ones.
        // Candidate pool tiers (official-pi-mono, community, npm-registry) are more
        // accurate than research-derived "third-party-github".
        match (&existing.source_tier, &record.source_tier) {
            (None, _) => existing.source_tier = record.source_tier,
            (Some(existing_tier), Some(new_tier))
                if existing_tier == "third-party-github" && is_curated_tier(new_tier) =>
            {
                existing.source_tier = record.source_tier;
            }
            _ => {}
        }
        if existing.repository_url.is_none() {
            existing.repository_url = record.repository_url;
        }
        if existing.npm_package.is_none() {
            existing.npm_package = record.npm_package;
        }
    } else {
        map.insert(canonical_id, record);
    }
}
375
/// Returns true if the tier name indicates a curated/hand-verified classification
/// (as opposed to the generic "third-party-github" from research).
fn is_curated_tier(tier: &str) -> bool {
    const CURATED_TIERS: [&str; 4] = [
        "official-pi-mono",
        "community",
        "npm-registry",
        "agents-mikeastock",
    ];
    CURATED_TIERS.contains(&tier)
}
384
/// Link an npm package to a GitHub repo canonical ID (via repository URL).
/// Returns the GitHub canonical ID if the npm package's repo URL matches.
///
/// Currently a pure alias for [`canonical_id_from_repo_url`]; kept as its own
/// name so the npm-linking intent is explicit at call sites.
fn npm_to_github_canonical(npm_repo_url: &str) -> Option<String> {
    canonical_id_from_repo_url(npm_repo_url)
}
390
391// ────────────────────────────────────────────────────────────────────────────
392// Full pipeline
393// ────────────────────────────────────────────────────────────────────────────
394
/// Configuration for the validation pipeline.
///
/// Currently only carries provenance; the task ID is copied verbatim into
/// `ValidationReport::task`.
pub struct ValidationConfig {
    /// Task ID for provenance tracking.
    pub task_id: String,
}
400
/// Run the full validation + dedup pipeline on all research sources.
///
/// Phases (mirroring the numbered sections in the body):
/// 1. Ingest GitHub code-search results (highest-signal source).
/// 2. Ingest GitHub repo-search results (treated as pre-validated).
/// 3. Ingest npm scan results, linking packages to GitHub repos via their
///    repository URLs where possible.
/// 4. Ingest curated-list entries (category determines the signal level).
/// 5. Ingest the existing candidate pool (vendor/tier enrichment).
/// 6. Classify every merged candidate and assemble the report.
///
/// Every source is optional and is simply skipped when `None`. The returned
/// candidate list is sorted by canonical ID, so output is deterministic for
/// a given set of inputs (apart from the timestamp).
#[allow(clippy::too_many_lines)]
pub fn run_validation_pipeline(
    code_search: Option<&CodeSearchInventory>,
    repo_search: Option<&RepoSearchSummary>,
    npm_scan: Option<&NpmScanSummary>,
    curated_list: Option<&CuratedListSummary>,
    existing_pool: Option<&CandidatePool>,
    config: &ValidationConfig,
) -> ValidationReport {
    let mut merge_map: MergeMap = HashMap::new();
    // Counts every ingested entry across all sources, before dedup.
    let mut total_input = 0usize;

    // Phase 1: Ingest code search results (highest-signal source).
    if let Some(cs) = code_search {
        for entry in &cs.extensions {
            total_input += 1;
            let canonical_id = canonical_id_from_repo_slug(&entry.repo);
            let record = MergeRecord {
                canonical_id: canonical_id.clone(),
                // Display name: the repo half of "owner/repo".
                name: entry
                    .repo
                    .split('/')
                    .next_back()
                    .unwrap_or(&entry.repo)
                    .to_string(),
                evidence: ValidationEvidence {
                    has_api_import: entry.has_api_import,
                    has_export_default: entry.has_export_default,
                    registrations: entry.registrations.clone(),
                    sources: vec!["code_search".to_string()],
                    reason: String::new(), // Will be computed later.
                },
                aliases: Vec::new(),
                source_tier: Some("third-party-github".to_string()),
                repository_url: Some(format!("https://github.com/{}", entry.repo)),
                npm_package: None,
                is_vendored: false,
            };
            merge_into(&mut merge_map, canonical_id, record);
        }
    }

    // Phase 2: Ingest repo search results.
    if let Some(rs) = repo_search {
        for entry in &rs.repos {
            total_input += 1;
            let canonical_id = canonical_id_from_repo_slug(&entry.repo);
            let record = MergeRecord {
                canonical_id: canonical_id.clone(),
                name: entry
                    .repo
                    .split('/')
                    .next_back()
                    .unwrap_or(&entry.repo)
                    .to_string(),
                evidence: ValidationEvidence {
                    has_api_import: true, // Repo search already validated these.
                    has_export_default: true,
                    registrations: entry.registrations.clone(),
                    sources: vec!["repo_search".to_string()],
                    reason: String::new(),
                },
                aliases: Vec::new(),
                source_tier: Some("third-party-github".to_string()),
                repository_url: Some(format!("https://github.com/{}", entry.repo)),
                npm_package: None,
                is_vendored: false,
            };
            merge_into(&mut merge_map, canonical_id, record);
        }
    }

    // Phase 3: Ingest npm scan results.
    // First pass: try to link to existing GitHub repo entries.
    if let Some(ns) = npm_scan {
        for entry in &ns.packages {
            total_input += 1;
            let npm_canonical = canonical_id_from_npm(&entry.name);

            // Try to link to GitHub repo via repository URL.
            let github_canonical = entry
                .repository
                .as_deref()
                .and_then(npm_to_github_canonical);

            // Prefer the GitHub identity when available, so the npm entry
            // merges with any repo-derived entry for the same project.
            let target_id = github_canonical
                .clone()
                .unwrap_or_else(|| npm_canonical.clone());

            let mut aliases = vec![npm_canonical.clone()];
            // NOTE(review): when `github_canonical` is Some, `target_id` equals it
            // by construction, so this push can never fire — dead guard code.
            if let Some(ref gc) = github_canonical {
                if *gc != target_id {
                    aliases.push(gc.clone());
                }
            }
            // Remove duplicates with target_id.
            aliases.retain(|a| *a != target_id);

            let record = MergeRecord {
                canonical_id: target_id.clone(),
                name: entry.name.clone(),
                evidence: ValidationEvidence {
                    has_api_import: entry.has_pi_dep,
                    has_export_default: false, // npm metadata doesn't tell us this.
                    registrations: Vec::new(),
                    sources: vec!["npm_scan".to_string()],
                    reason: String::new(),
                },
                aliases,
                source_tier: Some("npm-registry".to_string()),
                // Normalize arbitrary repo URL forms (git+https, ssh, …) to a
                // plain https://github.com/owner/repo URL.
                repository_url: entry.repository.as_deref().and_then(|u| {
                    canonical_id_from_repo_url(u).map(|slug| format!("https://github.com/{slug}"))
                }),
                npm_package: Some(entry.name.clone()),
                is_vendored: false,
            };
            merge_into(&mut merge_map, target_id, record);
        }
    }

    // Phase 4: Ingest curated list results.
    if let Some(cl) = curated_list {
        for entry in &cl.candidates {
            total_input += 1;
            // Names containing '/' are treated as GitHub slugs; otherwise the
            // lowercased bare name is the identity.
            let canonical_id = if entry.name.contains('/') {
                canonical_id_from_repo_slug(&entry.name)
            } else {
                entry.name.to_lowercase()
            };

            // Curated list entries: use category to determine signal level.
            // "extensions" and "providers" categories indicate human-curated Pi extensions.
            let cat = entry.category.as_deref().unwrap_or("");
            let is_extension_category =
                cat == "extensions" || cat == "providers" || cat == "skills";
            let record = MergeRecord {
                canonical_id: canonical_id.clone(),
                name: entry
                    .name
                    .split('/')
                    .next_back()
                    .unwrap_or(&entry.name)
                    .to_string(),
                evidence: ValidationEvidence {
                    // Human curation stands in for code-level signals here.
                    has_api_import: is_extension_category,
                    has_export_default: is_extension_category,
                    registrations: Vec::new(),
                    sources: vec![format!(
                        "curated_list:{}",
                        entry.source.as_deref().unwrap_or("unknown")
                    )],
                    reason: String::new(),
                },
                aliases: Vec::new(),
                // NOTE(review): this stores the curated *category* (e.g. "extensions")
                // in source_tier, unlike the tier names used elsewhere — confirm intended.
                source_tier: entry.category.clone(),
                repository_url: if entry.name.contains('/') {
                    Some(format!("https://github.com/{}", entry.name))
                } else {
                    None
                },
                npm_package: None,
                is_vendored: false,
            };
            merge_into(&mut merge_map, canonical_id, record);
        }
    }

    // Phase 5: Ingest existing candidate pool (enriches with vendor/tier info).
    if let Some(pool) = existing_pool {
        for item in &pool.items {
            total_input += 1;
            let canonical_id = item.id.to_lowercase();

            // For candidate pool items, prefer the item's own ID as canonical.
            // Only use the GitHub URL as canonical if the item ID already matches
            // an owner/repo pattern. This prevents monorepo collapse (e.g., all 60
            // official pi-mono extensions sharing badlogic/pi-mono URL).
            let github_canonical = item
                .repository_url
                .as_deref()
                .and_then(canonical_id_from_repo_url);

            let target_id = if canonical_id.contains('/') {
                // Item ID already looks like owner/repo — try GitHub canonical.
                github_canonical.unwrap_or_else(|| canonical_id.clone())
            } else {
                // Item ID is a package name (e.g., "antigravity-image-gen") — keep it.
                canonical_id.clone()
            };

            let mut aliases = vec![canonical_id.clone()];
            for a in &item.aliases {
                aliases.push(a.to_lowercase());
            }
            aliases.retain(|a| *a != target_id);
            aliases.sort();
            aliases.dedup();

            let record = MergeRecord {
                canonical_id: target_id.clone(),
                name: item.name.clone(),
                evidence: ValidationEvidence {
                    // The pool carries no code-level signals; the vendored flag
                    // below is what promotes these in Phase 6.
                    has_api_import: false,
                    has_export_default: false,
                    registrations: Vec::new(),
                    sources: vec![format!("candidate_pool:{}", item.source_tier)],
                    reason: String::new(),
                },
                aliases,
                source_tier: Some(item.source_tier.clone()),
                repository_url: item.repository_url.clone(),
                npm_package: extract_npm_package(item),
                is_vendored: item.status == "vendored",
            };
            merge_into(&mut merge_map, target_id, record);
        }
    }

    // Phase 6: Classify all merged candidates.
    let mut candidates: Vec<ValidatedCandidate> = merge_map
        .into_values()
        .map(|mut rec| {
            // Compute the reason string from the merged (unioned) evidence.
            rec.evidence.reason = build_classification_reason(
                rec.evidence.has_api_import,
                rec.evidence.has_export_default,
                &rec.evidence.registrations,
            );
            let mut status = classify_from_evidence(&rec.evidence);
            // Promote vendored candidates: they were already artifact-validated.
            if rec.is_vendored && status != ValidationStatus::TrueExtension {
                status = ValidationStatus::TrueExtension;
                if !rec.evidence.reason.is_empty() {
                    rec.evidence.reason.push_str("; ");
                }
                rec.evidence
                    .reason
                    .push_str("vendored artifact (pre-validated)");
            }
            ValidatedCandidate {
                canonical_id: rec.canonical_id,
                name: rec.name,
                status,
                evidence: rec.evidence,
                aliases: rec.aliases,
                source_tier: rec.source_tier,
                repository_url: rec.repository_url,
                npm_package: rec.npm_package,
            }
        })
        .collect();

    // Sort by canonical_id for stable output (HashMap iteration order is random).
    candidates.sort_by(|a, b| a.canonical_id.cmp(&b.canonical_id));

    // Compute stats.
    let true_ext = candidates
        .iter()
        .filter(|c| c.status == ValidationStatus::TrueExtension)
        .count();
    let mention = candidates
        .iter()
        .filter(|c| c.status == ValidationStatus::MentionOnly)
        .count();
    let unknown = candidates
        .iter()
        .filter(|c| c.status == ValidationStatus::Unknown)
        .count();

    // Count sources merged (candidates that have >1 source).
    let sources_merged = candidates
        .iter()
        .filter(|c| c.evidence.sources.len() > 1)
        .count();

    ValidationReport {
        generated_at: chrono_now_iso(),
        task: config.task_id.clone(),
        stats: ValidationStats {
            total_input_candidates: total_input,
            after_dedup: candidates.len(),
            true_extension: true_ext,
            mention_only: mention,
            unknown,
            sources_merged,
        },
        candidates,
    }
}
697
698/// Extract npm package name from a `CandidateItem` if its source is npm.
699fn extract_npm_package(item: &CandidateItem) -> Option<String> {
700    match &item.source {
701        crate::extension_popularity::CandidateSource::Npm { package, .. } => Some(package.clone()),
702        _ => None,
703    }
704}
705
706/// Simple ISO timestamp (avoids pulling in chrono).
707pub fn chrono_now_iso() -> String {
708    // Use a fixed format for determinism in tests, real timestamp in production.
709    let now = std::time::SystemTime::now();
710    let secs = now
711        .duration_since(std::time::UNIX_EPOCH)
712        .unwrap_or_default()
713        .as_secs();
714    // Simple UTC approximation.
715    let days = secs / 86400;
716    let rem = secs % 86400;
717    let hours = rem / 3600;
718    let mins = (rem % 3600) / 60;
719    let s = rem % 60;
720    // Approximate year/month/day from days since epoch.
721    // Good enough for a timestamp string.
722    let (year, month, day) = days_to_ymd(days);
723    format!("{year:04}-{month:02}-{day:02}T{hours:02}:{mins:02}:{s:02}Z")
724}
725
/// Convert days since Unix epoch to (year, month, day).
///
/// Simple year-by-year walk from 1970 — exact for the proleptic Gregorian
/// calendar, fast enough for a once-per-report timestamp.
fn days_to_ymd(days: u64) -> (u64, u64, u64) {
    let mut remaining = days;

    // Walk forward one year at a time until fewer than a full year remains.
    let mut year: u64 = 1970;
    loop {
        let year_len = if is_leap(year) { 366 } else { 365 };
        if remaining < year_len {
            break;
        }
        remaining -= year_len;
        year += 1;
    }

    // Walk the month table for the resolved year.
    let month_lengths: [u64; 12] = if is_leap(year) {
        [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    } else {
        [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    };
    let mut month: u64 = 1;
    for len in month_lengths {
        if remaining < len {
            break;
        }
        remaining -= len;
        month += 1;
    }

    // Day-of-month is 1-based.
    (year, month, remaining + 1)
}

/// Gregorian leap-year rule: divisible by 4, except centuries unless
/// divisible by 400.
const fn is_leap(y: u64) -> bool {
    if y % 400 == 0 {
        true
    } else if y % 100 == 0 {
        false
    } else {
        y % 4 == 0
    }
}
757
758// ────────────────────────────────────────────────────────────────────────────
759// Tests
760// ────────────────────────────────────────────────────────────────────────────
761
762#[cfg(test)]
763mod tests {
764    use super::*;
765
766    // ====================================================================
767    // Canonical ID generation
768    // ====================================================================
769
770    #[test]
771    fn canonical_id_from_repo_url_standard() {
772        assert_eq!(
773            canonical_id_from_repo_url("https://github.com/Owner/Repo"),
774            Some("owner/repo".to_string())
775        );
776    }
777
778    #[test]
779    fn canonical_id_from_repo_url_git_plus() {
780        assert_eq!(
781            canonical_id_from_repo_url("git+https://github.com/Can1357/oh-my-pi.git"),
782            Some("can1357/oh-my-pi".to_string())
783        );
784    }
785
786    #[test]
787    fn canonical_id_from_repo_url_ssh() {
788        assert_eq!(
789            canonical_id_from_repo_url("git@github.com:zenobi-us/pi-rose-pine.git"),
790            Some("zenobi-us/pi-rose-pine".to_string())
791        );
792    }
793
794    #[test]
795    fn canonical_id_from_repo_url_non_github() {
796        assert_eq!(canonical_id_from_repo_url("https://gitlab.com/a/b"), None);
797    }
798
799    #[test]
800    fn canonical_id_from_npm_scoped() {
801        assert_eq!(canonical_id_from_npm("@oh-my-pi/lsp"), "npm:@oh-my-pi/lsp");
802    }
803
804    #[test]
805    fn canonical_id_from_npm_unscoped() {
806        assert_eq!(canonical_id_from_npm("mitsupi"), "npm:mitsupi");
807    }
808
809    // ====================================================================
810    // Classification
811    // ====================================================================
812
813    #[test]
814    fn classify_true_extension_import_plus_export() {
815        let ev = ValidationEvidence {
816            has_api_import: true,
817            has_export_default: true,
818            registrations: Vec::new(),
819            ..Default::default()
820        };
821        assert_eq!(classify_from_evidence(&ev), ValidationStatus::TrueExtension);
822    }
823
824    #[test]
825    fn classify_true_extension_import_plus_registration() {
826        let ev = ValidationEvidence {
827            has_api_import: true,
828            has_export_default: false,
829            registrations: vec!["registerTool".to_string()],
830            ..Default::default()
831        };
832        assert_eq!(classify_from_evidence(&ev), ValidationStatus::TrueExtension);
833    }
834
835    #[test]
836    fn classify_mention_only_import_only() {
837        let ev = ValidationEvidence {
838            has_api_import: true,
839            has_export_default: false,
840            registrations: Vec::new(),
841            ..Default::default()
842        };
843        assert_eq!(classify_from_evidence(&ev), ValidationStatus::MentionOnly);
844    }
845
846    #[test]
847    fn classify_mention_only_export_only() {
848        let ev = ValidationEvidence {
849            has_api_import: false,
850            has_export_default: true,
851            registrations: Vec::new(),
852            ..Default::default()
853        };
854        assert_eq!(classify_from_evidence(&ev), ValidationStatus::MentionOnly);
855    }
856
857    #[test]
858    fn classify_mention_only_registration_only() {
859        let ev = ValidationEvidence {
860            has_api_import: false,
861            has_export_default: false,
862            registrations: vec!["registerCommand".to_string()],
863            ..Default::default()
864        };
865        assert_eq!(classify_from_evidence(&ev), ValidationStatus::MentionOnly);
866    }
867
868    #[test]
869    fn classify_unknown_no_signals() {
870        let ev = ValidationEvidence::default();
871        assert_eq!(classify_from_evidence(&ev), ValidationStatus::Unknown);
872    }
873
874    // ====================================================================
875    // Source content classification
876    // ====================================================================
877
878    #[test]
879    fn classify_source_basic_extension() {
880        let content = r#"
881import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
882export default function init(api: ExtensionAPI) {
883    api.registerTool({ name: "mytool", description: "test", handler: () => {} });
884}
885"#;
886        let (status, ev) = classify_source_content(content);
887        assert_eq!(status, ValidationStatus::TrueExtension);
888        assert!(ev.has_api_import);
889        assert!(ev.has_export_default);
890        assert!(ev.registrations.contains(&"registerTool".to_string()));
891    }
892
893    #[test]
894    fn classify_source_pi_ai_import() {
895        let content = r#"
896import { ExtensionAPI } from "@mariozechner/pi-ai";
897export default (api: ExtensionAPI) => { api.registerCommand({ name: "/test" }); };
898"#;
899        let (status, _ev) = classify_source_content(content);
900        assert_eq!(status, ValidationStatus::TrueExtension);
901    }
902
903    #[test]
904    fn classify_source_mention_only_readme() {
905        let content = "This extension works with @mariozechner/pi-coding-agent to provide...";
906        let (status, _ev) = classify_source_content(content);
907        assert_eq!(status, ValidationStatus::MentionOnly);
908    }
909
910    #[test]
911    fn classify_source_no_signals() {
912        let content = "function hello() { console.log('world'); }";
913        let (status, _ev) = classify_source_content(content);
914        assert_eq!(status, ValidationStatus::Unknown);
915    }
916
917    // ====================================================================
918    // Dedup / merge
919    // ====================================================================
920
921    #[test]
922    fn merge_same_repo_via_code_search_and_npm() {
923        let code_search = CodeSearchInventory {
924            meta: serde_json::json!({}),
925            extensions: vec![CodeSearchEntry {
926                repo: "can1357/oh-my-pi".to_string(),
927                path: "packages/lsp/src/index.ts".to_string(),
928                all_paths: vec![],
929                is_valid_extension: true,
930                has_api_import: true,
931                has_export_default: true,
932                registrations: vec!["registerTool".to_string()],
933                file_count: 1,
934            }],
935        };
936
937        let npm_scan = NpmScanSummary {
938            packages: vec![NpmScanEntry {
939                name: "@oh-my-pi/lsp".to_string(),
940                version: Some("1.3.3710".to_string()),
941                description: None,
942                repository: Some("git+https://github.com/can1357/oh-my-pi.git".to_string()),
943                has_pi_dep: false,
944            }],
945        };
946
947        let config = ValidationConfig {
948            task_id: "test".to_string(),
949        };
950
951        let report = run_validation_pipeline(
952            Some(&code_search),
953            None,
954            Some(&npm_scan),
955            None,
956            None,
957            &config,
958        );
959
960        // Should merge into one entry, not two.
961        let matching: Vec<_> = report
962            .candidates
963            .iter()
964            .filter(|c| c.canonical_id.contains("oh-my-pi"))
965            .collect();
966        assert_eq!(matching.len(), 1, "should merge repo + npm into one");
967        assert_eq!(matching[0].status, ValidationStatus::TrueExtension);
968        assert!(
969            matching[0]
970                .evidence
971                .sources
972                .contains(&"code_search".to_string())
973        );
974        assert!(
975            matching[0]
976                .evidence
977                .sources
978                .contains(&"npm_scan".to_string())
979        );
980    }
981
982    #[test]
983    fn merge_different_repos_stay_separate() {
984        let code_search = CodeSearchInventory {
985            meta: serde_json::json!({}),
986            extensions: vec![
987                CodeSearchEntry {
988                    repo: "alice/ext-a".to_string(),
989                    path: "index.ts".to_string(),
990                    all_paths: vec![],
991                    is_valid_extension: true,
992                    has_api_import: true,
993                    has_export_default: true,
994                    registrations: vec![],
995                    file_count: 1,
996                },
997                CodeSearchEntry {
998                    repo: "bob/ext-b".to_string(),
999                    path: "index.ts".to_string(),
1000                    all_paths: vec![],
1001                    is_valid_extension: true,
1002                    has_api_import: true,
1003                    has_export_default: true,
1004                    registrations: vec![],
1005                    file_count: 1,
1006                },
1007            ],
1008        };
1009
1010        let config = ValidationConfig {
1011            task_id: "test".to_string(),
1012        };
1013
1014        let report = run_validation_pipeline(Some(&code_search), None, None, None, None, &config);
1015
1016        assert_eq!(report.candidates.len(), 2);
1017    }
1018
1019    #[test]
1020    fn merge_preserves_aliases() {
1021        let npm_scan = NpmScanSummary {
1022            packages: vec![NpmScanEntry {
1023                name: "@oh-my-pi/lsp".to_string(),
1024                version: Some("1.0.0".to_string()),
1025                description: None,
1026                repository: Some("https://github.com/can1357/oh-my-pi".to_string()),
1027                has_pi_dep: true,
1028            }],
1029        };
1030
1031        let config = ValidationConfig {
1032            task_id: "test".to_string(),
1033        };
1034
1035        let report = run_validation_pipeline(None, None, Some(&npm_scan), None, None, &config);
1036
1037        let candidate = report
1038            .candidates
1039            .iter()
1040            .find(|c| c.canonical_id == "can1357/oh-my-pi")
1041            .expect("should use github canonical");
1042
1043        assert!(
1044            candidate.aliases.contains(&"npm:@oh-my-pi/lsp".to_string()),
1045            "npm name should be alias: {:?}",
1046            candidate.aliases
1047        );
1048    }
1049
1050    // ====================================================================
1051    // Pipeline stats
1052    // ====================================================================
1053
1054    #[test]
1055    fn pipeline_stats_correct() {
1056        let code_search = CodeSearchInventory {
1057            meta: serde_json::json!({}),
1058            extensions: vec![
1059                CodeSearchEntry {
1060                    repo: "a/ext1".to_string(),
1061                    path: "index.ts".to_string(),
1062                    all_paths: vec![],
1063                    is_valid_extension: true,
1064                    has_api_import: true,
1065                    has_export_default: true,
1066                    registrations: vec![],
1067                    file_count: 1,
1068                },
1069                CodeSearchEntry {
1070                    repo: "b/ext2".to_string(),
1071                    path: "index.ts".to_string(),
1072                    all_paths: vec![],
1073                    is_valid_extension: true,
1074                    has_api_import: true,
1075                    has_export_default: false,
1076                    registrations: vec![],
1077                    file_count: 1,
1078                },
1079            ],
1080        };
1081
1082        let config = ValidationConfig {
1083            task_id: "test".to_string(),
1084        };
1085
1086        let report = run_validation_pipeline(Some(&code_search), None, None, None, None, &config);
1087
1088        assert_eq!(report.stats.total_input_candidates, 2);
1089        assert_eq!(report.stats.after_dedup, 2);
1090        assert_eq!(report.stats.true_extension, 1);
1091        assert_eq!(report.stats.mention_only, 1);
1092    }
1093
1094    // ====================================================================
1095    // Serialization round-trip
1096    // ====================================================================
1097
1098    #[test]
1099    fn validation_status_serde_round_trip() {
1100        let statuses = [
1101            ValidationStatus::TrueExtension,
1102            ValidationStatus::MentionOnly,
1103            ValidationStatus::Unknown,
1104        ];
1105        for status in &statuses {
1106            let json = serde_json::to_string(status).unwrap();
1107            let back: ValidationStatus = serde_json::from_str(&json).unwrap();
1108            assert_eq!(*status, back);
1109        }
1110    }
1111
1112    #[test]
1113    fn validated_candidate_serde_round_trip() {
1114        let c = ValidatedCandidate {
1115            canonical_id: "owner/repo".to_string(),
1116            name: "repo".to_string(),
1117            status: ValidationStatus::TrueExtension,
1118            evidence: ValidationEvidence {
1119                has_api_import: true,
1120                has_export_default: true,
1121                registrations: vec!["registerTool".to_string()],
1122                sources: vec!["code_search".to_string()],
1123                reason: "Pi API import found; export default present".to_string(),
1124            },
1125            aliases: vec!["npm:@scope/repo".to_string()],
1126            source_tier: Some("community".to_string()),
1127            repository_url: Some("https://github.com/owner/repo".to_string()),
1128            npm_package: Some("@scope/repo".to_string()),
1129        };
1130        let json = serde_json::to_string_pretty(&c).unwrap();
1131        let back: ValidatedCandidate = serde_json::from_str(&json).unwrap();
1132        assert_eq!(back.canonical_id, "owner/repo");
1133        assert_eq!(back.status, ValidationStatus::TrueExtension);
1134        assert_eq!(back.aliases, vec!["npm:@scope/repo"]);
1135    }
1136
1137    // ====================================================================
1138    // Timestamp helpers
1139    // ====================================================================
1140
1141    #[test]
1142    fn days_to_ymd_epoch() {
1143        assert_eq!(days_to_ymd(0), (1970, 1, 1));
1144    }
1145
1146    #[test]
1147    fn days_to_ymd_known_date() {
1148        // 2026-01-01 = 20454 days since epoch.
1149        let (y, m, d) = days_to_ymd(20454);
1150        assert_eq!(y, 2026);
1151        assert_eq!(m, 1);
1152        assert_eq!(d, 1);
1153    }
1154
1155    // ====================================================================
1156    // normalize_github_repo
1157    // ====================================================================
1158
1159    #[test]
1160    fn normalize_lowercases_and_strips_git() {
1161        assert_eq!(normalize_github_repo("Owner/Repo.git"), "owner/repo");
1162    }
1163
1164    #[test]
1165    fn normalize_trims_whitespace() {
1166        assert_eq!(normalize_github_repo("  owner/repo  "), "owner/repo");
1167    }
1168
    // Property-based tests. Each entry in the `proptest!` block below runs many
    // randomized cases drawn from the regex-style strategies in its signature;
    // the doc comment on each test states the invariant being checked.
    mod proptest_extension_validation {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            /// `normalize_github_repo` never panics.
            #[test]
            fn normalize_never_panics(s in ".{0,100}") {
                let _ = normalize_github_repo(&s);
            }

            /// `normalize_github_repo` output is always lowercase.
            #[test]
            fn normalize_is_lowercase(s in "[a-zA-Z0-9_/-]{1,30}") {
                let out = normalize_github_repo(&s);
                assert_eq!(out, out.to_lowercase());
            }

            /// `normalize_github_repo` is idempotent.
            #[test]
            fn normalize_idempotent(s in "[a-zA-Z0-9_/-]{1,30}") {
                let once = normalize_github_repo(&s);
                let twice = normalize_github_repo(&once);
                assert_eq!(once, twice);
            }

            /// `normalize_github_repo` strips `.git` suffix.
            #[test]
            fn normalize_strips_git_suffix(s in "[a-z]{1,10}/[a-z]{1,10}") {
                let with_git = format!("{s}.git");
                assert_eq!(normalize_github_repo(&with_git), normalize_github_repo(&s));
            }

            /// `normalize_github_repo` trims whitespace.
            #[test]
            fn normalize_trims(s in "[a-z]{1,10}/[a-z]{1,10}", ws in "[ \\t]{0,5}") {
                let padded = format!("{ws}{s}{ws}");
                assert_eq!(normalize_github_repo(&padded), normalize_github_repo(&s));
            }

            /// `canonical_id_from_npm` always starts with "npm:".
            #[test]
            fn npm_canonical_prefix(pkg in "[a-zA-Z@/-]{1,30}") {
                let id = canonical_id_from_npm(&pkg);
                assert!(id.starts_with("npm:"));
            }

            /// `canonical_id_from_npm` output after prefix is lowercase.
            #[test]
            fn npm_canonical_lowercase(pkg in "[a-zA-Z]{1,20}") {
                let id = canonical_id_from_npm(&pkg);
                // Skip the 4-byte ASCII "npm:" prefix (see npm_canonical_prefix).
                let after_prefix = &id[4..];
                assert_eq!(after_prefix, after_prefix.to_lowercase());
            }

            /// `canonical_id_from_repo_url` returns lowercase when Some.
            #[test]
            fn repo_url_canonical_lowercase(
                owner in "[a-zA-Z0-9]{1,10}",
                repo in "[a-zA-Z0-9]{1,10}"
            ) {
                let url = format!("https://github.com/{owner}/{repo}");
                if let Some(id) = canonical_id_from_repo_url(&url) {
                    assert_eq!(id, id.to_lowercase());
                }
            }

            /// `canonical_id_from_repo_url` matches normalized owner/repo slugs
            /// for standard GitHub URLs, including optional `.git` suffix.
            #[test]
            fn repo_url_canonical_matches_normalized_slug(
                owner in "[a-zA-Z0-9][a-zA-Z0-9-]{0,10}",
                repo in "[a-zA-Z0-9][a-zA-Z0-9._-]{0,14}",
                with_git in proptest::bool::ANY
            ) {
                let mut url = format!("https://github.com/{owner}/{repo}");
                if with_git {
                    url.push_str(".git");
                }
                let expected = normalize_github_repo(&format!("{owner}/{repo}"));
                assert_eq!(canonical_id_from_repo_url(&url), Some(expected));
            }

            /// `canonical_id_from_repo_url` rejects non-GitHub hosts.
            #[test]
            fn repo_url_non_github_hosts_return_none(
                owner in "[a-zA-Z0-9]{1,10}",
                repo in "[a-zA-Z0-9]{1,10}",
                host in prop_oneof![
                    Just("gitlab.com"),
                    Just("bitbucket.org"),
                    Just("example.com"),
                ]
            ) {
                let url = format!("https://{host}/{owner}/{repo}");
                assert_eq!(canonical_id_from_repo_url(&url), None);
            }

            /// `classify_from_evidence` — full signals → `TrueExtension`.
            #[test]
            fn classify_true_extension(
                has_export in proptest::bool::ANY,
                reg_count in 0..3usize
            ) {
                let evidence = ValidationEvidence {
                    has_api_import: true,
                    // Force export_default on when there are no registrations,
                    // so at least one secondary signal is always present.
                    has_export_default: has_export || reg_count == 0,
                    registrations: (0..reg_count).map(|i| format!("reg{i}")).collect(),
                    sources: vec![],
                    reason: String::new(),
                };
                // api_import + (export_default OR registrations) → TrueExtension
                if evidence.has_export_default || !evidence.registrations.is_empty() {
                    assert_eq!(classify_from_evidence(&evidence), ValidationStatus::TrueExtension);
                }
            }

            /// `classify_from_evidence` — no signals → Unknown.
            #[test]
            fn classify_no_signals_unknown(_dummy in 0..1u8) {
                let evidence = ValidationEvidence::default();
                assert_eq!(classify_from_evidence(&evidence), ValidationStatus::Unknown);
            }

            /// `classify_source_content` never panics.
            #[test]
            fn classify_content_never_panics(content in "(?s).{0,200}") {
                let _ = classify_source_content(&content);
            }

            /// `classify_source_content` always includes "source_content" in sources.
            #[test]
            fn classify_content_has_source(content in ".{0,100}") {
                let (_, evidence) = classify_source_content(&content);
                assert!(evidence.sources.contains(&"source_content".to_string()));
            }

            /// Content with API import + export default → `TrueExtension`.
            #[test]
            fn classify_content_true_ext(prefix in "[a-z ]{0,20}") {
                let content = format!(
                    r#"{prefix}import {{ ExtensionAPI }} from "@mariozechner/pi-coding-agent"; export default"#
                );
                let (status, _) = classify_source_content(&content);
                assert_eq!(status, ValidationStatus::TrueExtension);
            }

            /// `build_classification_reason` — no signals → specific message.
            #[test]
            fn reason_no_signals(_dummy in 0..1u8) {
                let reason = build_classification_reason(false, false, &[]);
                assert_eq!(reason, "no Pi extension signals detected");
            }

            /// `build_classification_reason` includes "import" when has_api_import.
            #[test]
            fn reason_mentions_import(_dummy in 0..1u8) {
                let reason = build_classification_reason(true, false, &[]);
                assert!(reason.contains("import"));
            }

            /// `build_classification_reason` includes "export" when has_export_default.
            #[test]
            fn reason_mentions_export(_dummy in 0..1u8) {
                let reason = build_classification_reason(false, true, &[]);
                assert!(reason.contains("export"));
            }

            /// `build_classification_reason` mentions registrations when present.
            #[test]
            fn reason_mentions_registrations(n in 1..5usize) {
                let regs: Vec<String> = (0..n).map(|i| format!("reg{i}")).collect();
                let reason = build_classification_reason(false, false, &regs);
                assert!(reason.contains("registration"));
            }

            /// `ValidationStatus` serde roundtrip.
            #[test]
            fn validation_status_serde(idx in 0..3usize) {
                let statuses = [
                    ValidationStatus::TrueExtension,
                    ValidationStatus::MentionOnly,
                    ValidationStatus::Unknown,
                ];
                let s = statuses[idx];
                let json = serde_json::to_string(&s).unwrap();
                let back: ValidationStatus = serde_json::from_str(&json).unwrap();
                assert_eq!(s, back);
            }

            /// `days_to_ymd` produces valid month/day ranges.
            #[test]
            fn days_to_ymd_valid_ranges(days in 0u64..40000) {
                let (y, m, d) = days_to_ymd(days);
                assert!(y >= 1970);
                assert!((1..=12).contains(&m), "month {m} out of range");
                assert!((1..=31).contains(&d), "day {d} out of range");
            }

            /// `is_leap` follows standard rules.
            #[test]
            fn leap_year_rules(y in 1900u64..2200) {
                let expected = (y % 4 == 0 && y % 100 != 0) || y % 400 == 0;
                assert_eq!(is_leap(y), expected);
            }

            /// `chrono_now_iso` format matches ISO 8601 pattern.
            #[test]
            fn chrono_now_format(_dummy in 0..1u8) {
                let ts = chrono_now_iso();
                assert!(ts.ends_with('Z'));
                assert!(ts.contains('T'));
                assert_eq!(ts.len(), 20); // YYYY-MM-DDTHH:MM:SSZ
            }
        }
    }
1385}