pleme-doc-gen 0.1.45

Rust replacement for the M0 Python _gen-patterns.py and _gen-docs.py scripts in pleme-io/actions. Walks every action.yml and emits the substrate's patterns-full.nix, a per-action README.md, and the root catalog, per the NO-SHELL prime directive.
//! search-and-consume — GitHub-search-driven mass-absorb.
//!
//! Composes `gh search repos <query>` with the existing consume-gh-org
//! per-repo pipeline (clone + reverse + optional render + optional
//! measure). Operator-facing:
//!
//!   pleme-doc-gen search-and-consume \
//!     --query "language:rust stars:>1000 cli" \
//!     --limit 30 \
//!     --measure-fidelity
//!
//! Opens external territory — the substrate stops being constrained
//! to pleme-io's own corpus and surfaces typed-quality against any
//! GitHub-searchable repo set. Per the operator's "leverage GitHub
//! search to feed new territories to consume" directive.
//!
//! The result is the substrate's typed-quality signal cross-applied
//! to third-party shapes: if the score is high, our extractors
//! generalize; if low, we know exactly which shapes to invest in.

use anyhow::{anyhow, Result};
use std::path::Path;

/// List repo slugs matching a GitHub search query via the `gh` CLI.
///
/// Runs `gh search repos --limit <limit> --json fullName --jq '.[].fullName' -- <tokens…>`
/// and returns the `owner/repo` strings in result order.
///
/// `query` is split on whitespace and each token is handed to `gh` as its
/// own positional argument, because `gh search repos` expects qualifiers
/// (`language:`, `topic:`, `stars:`) as separate args rather than as one
/// quoted string.
///
/// # Errors
///
/// Returns an error when `gh` cannot be spawned, or when it exits with a
/// non-zero status (its stderr is included in the message).
pub fn search_repos(query: &str, limit: usize) -> Result<Vec<String>> {
    let limit_s = limit.to_string();
    let mut cmd = std::process::Command::new("gh");
    // Flags go first so that everything after `--` is treated as a search
    // term. Without the terminator an exclusion qualifier such as
    // `-topic:linux` would be parsed as a (bogus) command-line flag.
    cmd.args([
        "search", "repos",
        "--limit", &limit_s,
        "--json", "fullName",
        "--jq", ".[].fullName",
        "--",
    ]);
    cmd.args(query.split_whitespace());

    let out = cmd.output()
        .map_err(|e| anyhow!("gh search repos failed: {e}"))?;
    if !out.status.success() {
        let stderr = String::from_utf8_lossy(&out.stderr);
        return Err(anyhow!("gh search repos non-zero: {stderr}"));
    }

    // One slug per line; trim first so the emptiness check also catches
    // whitespace-only lines, then keep only `owner/repo`-shaped entries.
    let text = String::from_utf8_lossy(&out.stdout);
    Ok(text.lines()
        .map(str::trim)
        .filter(|l| !l.is_empty() && l.contains('/'))
        .map(str::to_owned)
        .collect())
}

/// Top-level search-and-consume operation. Lists matching repos via
/// `gh search`, then runs the same per-repo pipeline that
/// consume_gh_org uses. Reuses the existing OrgReport shape (with
/// org = `search:<query>` for identification).
///
/// Per repo the pipeline is: shallow clone into `work_dir`, ecosystem
/// detection, reverse into `<out>/<name>.caixa.lisp`, and — when
/// `render_too` or `measure` is set — render into `<out>/<name>-rendered`.
/// With `measure` set, fidelity is measured between the clone and the
/// rendered tree and folded into the report totals.
///
/// `<name>` is the bare repo name for the first hit that uses it. Search
/// results routinely contain several repos with the same name under
/// different owners; later hits with an already-claimed name fall back to
/// `owner__repo` so they do not overwrite the earlier repo's artifacts.
///
/// # Errors
///
/// Fails only when the output/work directories cannot be created or the
/// search itself fails. Per-repo failures are recorded on that repo's
/// outcome (`error`) and counted in the report instead of aborting the run.
pub fn search_and_consume(
    query: &str,
    out: &Path,
    work_dir: &Path,
    limit: usize,
    render_too: bool,
    measure: bool,
) -> Result<crate::consume_gh_org::OrgReport> {
    use crate::ast::Render;
    std::fs::create_dir_all(out)?;
    std::fs::create_dir_all(work_dir)?;

    let slugs = search_repos(query, limit)?;
    let mut report = crate::consume_gh_org::OrgReport {
        org: format!("search:{query}"),
        listed: slugs.len(),
        ..Default::default()
    };

    // Bare repo names already used for output artifacts in this run.
    let mut claimed_names: std::collections::HashSet<&str> =
        std::collections::HashSet::new();

    for slug in &slugs {
        let mut outcome = crate::consume_gh_org::RepoOutcome {
            slug: slug.clone(),
            ecosystem: None,
            caixa_path: None,
            rendered_path: None,
            artifact_count: 0,
            error: None,
            fidelity: None,
            render_health: None,
        };

        // Step 1 — shallow clone. If the clone directory already exists,
        // the helper returns it without cloning again.
        let clone_path = match shallow_clone(slug, work_dir) {
            Ok(p) => p,
            Err(e) => {
                outcome.error = Some(format!("clone: {e}"));
                report.failed += 1;
                report.outcomes.push(outcome);
                continue;
            }
        };

        // Step 2 — discover. No detected ecosystem counts as skipped,
        // not failed.
        let detected = crate::discover::detect(&clone_path);
        outcome.ecosystem = detected.as_ref().map(|d| d.ecosystem.to_string());
        if detected.is_none() {
            outcome.error = Some("no ecosystem detected".into());
            report.skipped += 1;
            report.outcomes.push(outcome);
            continue;
        }

        // Step 3 — reverse + write .caixa.lisp.
        let forms = match crate::reverse::reverse_from_path(&clone_path) {
            Ok(f) => f,
            Err(e) => {
                outcome.error = Some(format!("reverse: {e}"));
                report.failed += 1;
                report.outcomes.push(outcome);
                continue;
            }
        };
        let repo_name = slug.rsplit('/').next().unwrap_or(slug);
        // First repo to use a name keeps the short form; a later repo with
        // the same name under another owner gets an owner-qualified stem.
        let stem = if claimed_names.insert(repo_name) {
            repo_name.to_string()
        } else {
            slug.replace('/', "__")
        };
        // Render once; the same text is written to disk and, if requested,
        // fed to the renderer below.
        let src = forms.render();
        let caixa_path = out.join(format!("{stem}.caixa.lisp"));
        if let Err(e) = std::fs::write(&caixa_path, &src) {
            outcome.error = Some(format!("write caixa: {e}"));
            report.failed += 1;
            report.outcomes.push(outcome);
            continue;
        }
        outcome.caixa_path = Some(caixa_path);

        // Step 4 (optional) — render + optionally measure fidelity.
        if render_too || measure {
            let rendered = out.join(format!("{stem}-rendered"));
            if let Err(e) = std::fs::create_dir_all(&rendered) {
                outcome.error = Some(format!("mkdir rendered: {e}"));
                report.failed += 1;
                report.outcomes.push(outcome);
                continue;
            }
            match crate::caixa::render(&src, &rendered, true) {
                Ok(files) => {
                    outcome.rendered_path = Some(rendered.clone());
                    outcome.artifact_count = files.len();
                    if measure {
                        match crate::fidelity::measure(&clone_path, &rendered) {
                            Ok(f) => {
                                report.fidelity_perfect_total += f.perfect_count;
                                report.fidelity_lossy_total += f.lossy_count;
                                report.fidelity_gap_total += f.gap_count;
                                report.fidelity_measured += 1;
                                if let Some(eco) = &f.ecosystem {
                                    let row = report.fidelity_by_ecosystem
                                        .entry(eco.clone()).or_default();
                                    row.measured += 1;
                                    row.perfect += f.perfect_count;
                                    row.lossy += f.lossy_count;
                                    row.gap += f.gap_count;
                                }
                                outcome.fidelity = Some(f);
                            }
                            Err(e) => {
                                // A measurement failure is recorded on the
                                // outcome, but the repo still counts as
                                // consumed: reverse + render succeeded.
                                outcome.error = Some(format!("measure: {e}"));
                            }
                        }
                    }
                }
                Err(e) => {
                    outcome.error = Some(format!("render: {e}"));
                    report.failed += 1;
                    report.outcomes.push(outcome);
                    continue;
                }
            }
        }

        report.consumed += 1;
        report.outcomes.push(outcome);
    }

    Ok(report)
}

/// Mirror of consume_gh_org::shallow_clone — the function is private
/// in that module; rather than expose it, we duplicate the small
/// helper here. (Triple-use threshold would justify extracting; we
/// re-evaluate after the next consumer appears.)
///
/// Clones `https://github.com/<slug>.git` at depth 1 into
/// `<work_dir>/<owner>__<repo>/<repo>` and returns that path. If the
/// directory already exists it is returned as-is without invoking git.
///
/// # Errors
///
/// Returns an error when the parent directory cannot be created, when
/// `git` cannot be spawned, or when `git clone` exits non-zero.
fn shallow_clone(slug: &str, work_dir: &Path) -> Result<std::path::PathBuf> {
    let repo_name = slug.rsplit('/').next().unwrap_or(slug);
    // Search results can have owner collisions across the work dir; key
    // by the full slug to avoid one search clobbering another.
    let parent = work_dir.join(slug.replace('/', "__"));
    let target = parent.join(repo_name);
    if target.is_dir() {
        return Ok(target);
    }
    std::fs::create_dir_all(&parent)?;
    let url = format!("https://github.com/{slug}.git");
    // Pass the destination as an OS string rather than via `to_str()`, so a
    // non-UTF-8 work dir is handed to git unchanged instead of panicking.
    let st = std::process::Command::new("git")
        .args(["clone", "--depth", "1", "--quiet"])
        .arg(&url)
        .arg(&target)
        .status()
        .map_err(|e| anyhow!("git clone {slug}: {e}"))?;
    if !st.success() {
        return Err(anyhow!("git clone {slug} returned non-zero"));
    }
    Ok(target)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn search_repos_validates_slug_shape() {
        // Pure unit (no network) — verify the result filter rejects
        // non-slug lines (could appear in error output).
        // We can't actually call gh in tests; this is a smoke check on
        // the filter logic by hand-constructing a result.
        let lines = vec!["", "owner/repo", "noslash", "a/b", "junk"];
        let filtered: Vec<&str> = lines.into_iter()
            .filter(|l| !l.is_empty() && l.contains('/'))
            .collect();
        assert_eq!(filtered, vec!["owner/repo", "a/b"]);
    }
}