use std::path::PathBuf;
use std::time::Duration;
use zeph_common::math::cosine_similarity;
use zeph_llm::any::AnyProvider;
use zeph_llm::provider::{LlmProvider, Message, Role};
use zeph_common::secret::Secret;
use crate::error::SkillError;
use crate::generator::{GeneratedSkill, SkillGenerator};
use crate::loader::SkillMeta;
/// Upper bound, in bytes, on how much of a downloaded README is kept.
/// `fetch_readme` cuts the response body to this length before converting it to text.
const MAX_README_BYTES: usize = 32_768;
/// System prompt sent as the `Role::System` message by `generate_from_repo`.
///
/// It instructs the model to turn a repository name and README into raw
/// SKILL.md content (no explanation, no code fences) and lists the naming,
/// description, tool, and body-size rules the output should follow.
const MINING_SYSTEM_PROMPT: &str = "\
You are an expert at creating SKILL.md files for the Zeph AI agent. \
Given a GitHub repository name and its README, generate a SKILL.md that captures \
the repository's primary use case as an agent skill. \
\n\nRules:\n\
- name: lowercase letters, digits, and hyphens only (1-64 chars); derive from the repo name\n\
- description: one or two sentences describing what the tool does and when to use it\n\
- Use only tools that are appropriate for the skill (e.g. bash for CLI tools)\n\
- Body: max 3 ## sections, practical examples from the README\n\
- Body size: keep under 15000 bytes\n\
- Output ONLY the raw SKILL.md content, no explanation, no code fences\n";
/// A GitHub repository returned by a search, together with its README text.
///
/// Values are produced by `SkillMiner::search_repos` from the GitHub search
/// response and a follow-up README request.
#[derive(Debug, Clone)]
pub struct RepoCandidate {
    /// Repository identifier in `owner/name` form (GitHub's `full_name` field).
    pub full_name: String,
    /// Repository description; empty when GitHub reports none.
    pub description: String,
    /// README text, cut to at most `MAX_README_BYTES` bytes.
    pub readme_content: String,
    /// Stargazer count; saturates at `u32::MAX` if the reported value is larger.
    pub stars: u32,
}
/// Result of successfully mining one repository, as returned by `SkillMiner::run`.
pub struct MinedSkill {
/// `full_name` of the repository the skill was generated from.
pub repo: String,
/// The skill generated and validated from the repository's README.
pub skill: GeneratedSkill,
/// Highest cosine similarity between this skill's description embedding and
/// the existing skills' embeddings, as computed by `SkillMiner::is_novel`
/// (`0.0` when there are no existing embeddings).
pub nearest_similarity: f32,
}
/// Settings that control a mining run.
#[derive(Debug, Clone)]
pub struct MiningConfig {
    /// GitHub repository search queries; each one is searched in turn.
    pub queries: Vec<String>,
    /// Number of repositories requested per query (`per_page`); capped at 100
    /// when the search request is built.
    pub max_repos_per_query: usize,
    /// A generated skill counts as novel only when its highest cosine
    /// similarity to an existing skill is strictly below this value.
    pub dedup_threshold: f32,
    /// Directory handed to the `SkillGenerator` that saves approved skills.
    pub output_dir: PathBuf,
    /// Requests per minute used to derive the pause between requests;
    /// `0` is treated as `1`.
    pub rate_limit_rpm: u32,
    /// When `true`, novel skills are reported but not saved.
    pub dry_run: bool,
}
/// Mines GitHub repositories for candidate skills: searches repositories,
/// generates a SKILL.md per repository through an LLM, drops near-duplicates
/// of existing skills, and saves the rest unless running in dry-run mode.
pub struct SkillMiner {
/// Supplies the chat provider used for generation and saves approved skills.
generator: SkillGenerator,
/// Provider used to embed skill descriptions for duplicate detection.
embed_provider: AnyProvider,
/// GitHub token, sent as a `Bearer` authorization header on every request.
github_token: Secret,
/// Queries, limits, and output settings for the run.
config: MiningConfig,
/// HTTP client used for all GitHub API requests.
http: reqwest::Client,
}
impl SkillMiner {
/// Creates a miner that generates skills with `generation_provider`, embeds
/// descriptions with `embed_provider`, and authenticates to GitHub with
/// `github_token`.
///
/// The HTTP client is configured with the `zeph-skills-miner/1.0` user agent
/// and a 30-second timeout.
///
/// # Errors
///
/// Returns `SkillError::Other` when the HTTP client cannot be built.
pub fn new(
    generation_provider: AnyProvider,
    embed_provider: AnyProvider,
    github_token: impl Into<String>,
    config: MiningConfig,
) -> Result<Self, SkillError> {
    let built = reqwest::Client::builder()
        .user_agent("zeph-skills-miner/1.0")
        .timeout(Duration::from_secs(30))
        .build();
    let http = match built {
        Ok(client) => client,
        Err(e) => {
            return Err(SkillError::Other(format!("HTTP client build failed: {e}")));
        }
    };

    // The generator writes into the same directory the config names.
    let output_dir = config.output_dir.clone();
    let generator = SkillGenerator::new(generation_provider, output_dir);
    let github_token = Secret::new(github_token);

    Ok(Self {
        generator,
        embed_provider,
        github_token,
        config,
        http,
    })
}
pub async fn run(&self, existing_skills: &[SkillMeta]) -> Result<Vec<MinedSkill>, SkillError> {
let existing_embeddings = self.embed_existing(existing_skills).await;
let delay_between_requests = self.request_delay();
let mut results: Vec<MinedSkill> = Vec::new();
for query in &self.config.queries {
tracing::info!(query = %query, "mining: searching GitHub");
let repos = match self.search_repos(query).await {
Ok(r) => r,
Err(e) => {
tracing::warn!(query = %query, error = %e, "GitHub search failed, skipping");
continue;
}
};
tokio::time::sleep(delay_between_requests).await;
for repo in repos {
match self
.process_repo(&repo, &existing_embeddings, self.config.dry_run)
.await
{
Ok(Some(mined)) => {
results.push(mined);
}
Ok(None) => {} Err(e) => {
tracing::warn!(repo = %repo.full_name, error = %e, "failed to process repo");
}
}
tokio::time::sleep(delay_between_requests).await;
}
}
Ok(results)
}
/// Embeds the description of every existing skill for duplicate detection.
///
/// Returns `(skill name, embedding)` pairs. A skill whose embedding request
/// fails is logged as a warning and left out of the result.
async fn embed_existing(&self, skills: &[SkillMeta]) -> Vec<(String, Vec<f32>)> {
    let mut known = Vec::with_capacity(skills.len());
    for meta in skills {
        let outcome = self.embed_provider.embed(&meta.description).await;
        match outcome {
            Err(e) => {
                tracing::warn!(
                    skill = %meta.name,
                    error = %e,
                    "failed to embed existing skill for dedup"
                );
            }
            Ok(vector) => known.push((meta.name.clone(), vector)),
        }
    }
    known
}
/// Generates a skill for `repo`, checks it against `existing_embeddings`,
/// and saves it unless `is_dry_run` is set.
///
/// Returns `Ok(None)` when generation fails (logged as a warning) or when
/// the skill is judged a duplicate; otherwise returns the mined skill.
///
/// # Errors
///
/// Propagates errors from the novelty check and from saving the skill.
async fn process_repo(
    &self,
    repo: &RepoCandidate,
    existing_embeddings: &[(String, Vec<f32>)],
    is_dry_run: bool,
) -> Result<Option<MinedSkill>, SkillError> {
    let skill = match self.generate_from_repo(repo).await {
        Err(e) => {
            tracing::warn!(repo = %repo.full_name, error = %e, "skill generation failed");
            return Ok(None);
        }
        Ok(generated) => generated,
    };

    let (novel, nearest_similarity) = self.is_novel(&skill, existing_embeddings).await?;
    if !novel {
        tracing::info!(
            repo = %repo.full_name,
            skill = %skill.name,
            similarity = nearest_similarity,
            "skipping duplicate skill"
        );
        return Ok(None);
    }

    // NOTE(review): `skill.has_injection_patterns` is not consulted here;
    // confirm that `approve_and_save` (or a caller) enforces it.
    if is_dry_run {
        tracing::info!(
            repo = %repo.full_name,
            skill = %skill.name,
            "dry-run: would write skill"
        );
    } else {
        self.generator.approve_and_save(&skill).await?;
    }

    Ok(Some(MinedSkill {
        repo: repo.full_name.clone(),
        skill,
        nearest_similarity,
    }))
}
/// Pause to insert between requests, derived from `rate_limit_rpm`.
///
/// Computed as `60_000 / rpm` milliseconds using integer division; a
/// configured rate of `0` is treated as `1` to avoid dividing by zero.
fn request_delay(&self) -> Duration {
    let per_minute = match self.config.rate_limit_rpm {
        0 => 1,
        rpm => rpm,
    };
    let millis = 60_000 / per_minute;
    Duration::from_millis(u64::from(millis))
}
pub async fn search_repos(&self, query: &str) -> Result<Vec<RepoCandidate>, SkillError> {
let per_page = self.config.max_repos_per_query.min(100);
let encoded_query: String = query
.chars()
.flat_map(|c| {
if c.is_ascii_alphanumeric() || c == '-' || c == '_' || c == '.' || c == '~' {
vec![c]
} else {
format!("%{:02X}", c as u32).chars().collect()
}
})
.collect();
let url = format!(
"https://api.github.com/search/repositories?q={encoded_query}&sort=stars&per_page={per_page}"
);
let resp = self
.http
.get(&url)
.header(
"Authorization",
format!("Bearer {}", self.github_token.expose()),
)
.header("Accept", "application/vnd.github.v3+json")
.send()
.await
.map_err(|e| SkillError::Other(format!("GitHub search request failed: {e}")))?;
if resp.status() == reqwest::StatusCode::FORBIDDEN
|| resp.status() == reqwest::StatusCode::TOO_MANY_REQUESTS
{
return Err(SkillError::Other(format!(
"GitHub rate limit exceeded ({})",
resp.status()
)));
}
if !resp.status().is_success() {
return Err(SkillError::Other(format!(
"GitHub search returned {}: {}",
resp.status(),
resp.text().await.unwrap_or_default()
)));
}
let json: serde_json::Value = resp
.json()
.await
.map_err(|e| SkillError::Other(format!("GitHub search JSON parse failed: {e}")))?;
let items = json["items"].as_array().cloned().unwrap_or_default();
let mut candidates = Vec::with_capacity(items.len());
for item in &items {
let full_name = item["full_name"].as_str().unwrap_or_default().to_string();
let description = item["description"].as_str().unwrap_or_default().to_string();
let stars =
u32::try_from(item["stargazers_count"].as_u64().unwrap_or(0)).unwrap_or(u32::MAX);
if full_name.is_empty() {
continue;
}
let readme = match self.fetch_readme(&full_name).await {
Ok(r) => r,
Err(e) => {
tracing::debug!(repo = %full_name, error = %e, "README fetch failed, skipping");
continue;
}
};
candidates.push(RepoCandidate {
full_name,
description,
readme_content: readme,
stars,
});
}
Ok(candidates)
}
async fn fetch_readme(&self, repo: &str) -> Result<String, SkillError> {
{
let mut parts = repo.splitn(2, '/');
let owner = parts.next().unwrap_or("");
let name = parts.next().unwrap_or("");
let is_safe = |s: &str| {
!s.is_empty()
&& s.chars()
.all(|c| c.is_ascii_alphanumeric() || c == '.' || c == '-' || c == '_')
};
if !is_safe(owner) || !is_safe(name) || parts.next().is_some() {
return Err(SkillError::Invalid(format!(
"invalid repository name: {repo:?}"
)));
}
}
let url = format!("https://api.github.com/repos/{repo}/readme");
let resp = self
.http
.get(&url)
.header(
"Authorization",
format!("Bearer {}", self.github_token.expose()),
)
.header("Accept", "application/vnd.github.raw")
.send()
.await
.map_err(|e| SkillError::Other(format!("README fetch failed: {e}")))?;
if resp.status() == reqwest::StatusCode::NOT_FOUND {
return Err(SkillError::NotFound(format!("no README for {repo}")));
}
if !resp.status().is_success() {
return Err(SkillError::Other(format!(
"README fetch returned {}",
resp.status()
)));
}
let bytes = resp
.bytes()
.await
.map_err(|e| SkillError::Other(format!("README read failed: {e}")))?;
let text =
String::from_utf8_lossy(&bytes[..bytes.len().min(MAX_README_BYTES)]).into_owned();
Ok(text)
}
/// Asks the generation provider for a SKILL.md describing `repo` and
/// validates the reply.
///
/// The request is a system message carrying `MINING_SYSTEM_PROMPT` followed
/// by a user message with the repository name, description, and README.
///
/// # Errors
///
/// Returns `SkillError::Other` when the chat call fails, or whatever error
/// `parse_and_validate_pub` reports for the extracted content.
async fn generate_from_repo(&self, repo: &RepoCandidate) -> Result<GeneratedSkill, SkillError> {
    let user_prompt = format!(
        "Repository: {}\nDescription: {}\n\nREADME (truncated to 32KB):\n\n{}",
        repo.full_name, repo.description, repo.readme_content
    );
    let system = Message::from_legacy(Role::System, MINING_SYSTEM_PROMPT);
    let user = Message::from_legacy(Role::User, &user_prompt);
    let messages = vec![system, user];

    let reply = match self.generator.provider.chat(&messages).await {
        Ok(text) => text,
        Err(e) => {
            return Err(SkillError::Other(format!("LLM generation failed: {e}")));
        }
    };

    let skill_md = crate::generator::extract_skill_md_pub(&reply);
    crate::generator::parse_and_validate_pub(&skill_md)
}
/// Decides whether `candidate` is sufficiently different from the existing
/// skills.
///
/// Returns `(is_novel, nearest_similarity)`. With no existing embeddings the
/// result is `(true, 0.0)` and nothing is embedded. Otherwise the
/// candidate's description is embedded and compared to every existing
/// embedding by cosine similarity. The running maximum starts at `0.0`, so
/// negative similarities are reported as `0.0`. The candidate is novel when
/// that maximum is strictly below `dedup_threshold`.
///
/// # Errors
///
/// Returns `SkillError::Other` when embedding the candidate fails.
pub async fn is_novel(
    &self,
    candidate: &GeneratedSkill,
    existing_embeddings: &[(String, Vec<f32>)],
) -> Result<(bool, f32), SkillError> {
    if existing_embeddings.is_empty() {
        return Ok((true, 0.0));
    }

    let embedded = self.embed_provider.embed(&candidate.meta.description).await;
    let candidate_emb = match embedded {
        Ok(vector) => vector,
        Err(e) => return Err(SkillError::Other(format!("embed failed: {e}"))),
    };

    let mut nearest = 0.0_f32;
    for (_, known) in existing_embeddings {
        nearest = nearest.max(cosine_similarity(&candidate_emb, known));
    }

    let novel = nearest < self.config.dedup_threshold;
    Ok((novel, nearest))
}
}
#[cfg(test)]
mod tests {
use super::*;
/// 25 requests per minute must yield a pause of 60_000 / 25 = 2_400 ms.
#[test]
fn request_delay_25rpm() {
    let mut miner = make_test_miner("/tmp");
    // Set the rate explicitly so the test does not depend on the helper's default.
    miner.config.rate_limit_rpm = 25;
    assert_eq!(miner.request_delay(), Duration::from_millis(2400));
}
/// Checks that a `MiningConfig` literal keeps the values it was built with.
#[test]
fn mining_config_defaults() {
    let cfg = MiningConfig {
        queries: Vec::new(),
        max_repos_per_query: 20,
        dedup_threshold: 0.85,
        output_dir: PathBuf::from("/tmp"),
        rate_limit_rpm: 25,
        dry_run: false,
    };
    let threshold_gap = (cfg.dedup_threshold - 0.85_f32).abs();
    assert!(threshold_gap < f32::EPSILON);
    assert_eq!(cfg.max_repos_per_query, 20);
    assert_eq!(cfg.rate_limit_rpm, 25);
}
/// With no existing embeddings every candidate is novel with similarity 0.
#[tokio::test]
async fn is_novel_empty_existing() {
    let miner = make_test_miner("/tmp");
    let candidate = make_test_skill();
    let (is_novel, nearest) = miner.is_novel(&candidate, &[]).await.unwrap();
    assert!(is_novel);
    assert!(nearest.abs() < f32::EPSILON);
}
/// Builds a `SkillMiner` backed by mock providers, an empty GitHub token,
/// and a default HTTP client, writing to `output_dir`.
fn make_test_miner(output_dir: &str) -> SkillMiner {
    let provider = zeph_llm::any::AnyProvider::Mock(zeph_llm::mock::MockProvider::default());
    let out_dir = PathBuf::from(output_dir);
    let generator = SkillGenerator::new(provider.clone(), out_dir.clone());
    SkillMiner {
        generator,
        embed_provider: provider,
        github_token: Secret::new(String::new()),
        config: MiningConfig {
            queries: vec![],
            max_repos_per_query: 20,
            dedup_threshold: 0.85,
            output_dir: out_dir,
            rate_limit_rpm: 25,
            dry_run: false,
        },
        http: reqwest::Client::new(),
    }
}
/// Builds a minimal `GeneratedSkill` named `test-skill` by parsing a small
/// SKILL.md document with front matter and one `## Usage` section.
fn make_test_skill() -> GeneratedSkill {
    use crate::loader::load_skill_meta_from_str;

    let content =
        "---\nname: test-skill\ndescription: A test skill.\n---\n\n## Usage\n\nDo stuff.\n";
    // Only the parsed metadata is needed; the second tuple element is discarded.
    let meta = load_skill_meta_from_str(content).unwrap().0;
    GeneratedSkill {
        name: "test-skill".into(),
        content: content.into(),
        meta,
        warnings: Vec::new(),
        has_injection_patterns: false,
    }
}
/// Test helper that mirrors the similarity logic of `SkillMiner::is_novel`
/// on a caller-supplied embedding, so no embedding provider is needed.
struct FixedEmbedMiner {
    /// Embedding used as the candidate vector by the tests.
    embed_vec: Vec<f32>,
    /// Similarity threshold below which a candidate counts as novel.
    threshold: f32,
}

impl FixedEmbedMiner {
    /// Returns `(is_novel, nearest_similarity)` for `candidate_emb` against
    /// `existing`; `(true, 0.0)` when `existing` is empty.
    fn is_novel_direct(
        &self,
        candidate_emb: &[f32],
        existing: &[(String, Vec<f32>)],
    ) -> (bool, f32) {
        if existing.is_empty() {
            return (true, 0.0);
        }
        let mut nearest = 0.0_f32;
        for (_, known) in existing {
            nearest = nearest.max(cosine_similarity(candidate_emb, known));
        }
        (nearest < self.threshold, nearest)
    }
}
/// An embedding identical to an existing one has similarity ~1.0 and is rejected.
#[test]
fn is_novel_rejects_similar_skill() {
    let helper = FixedEmbedMiner {
        embed_vec: vec![1.0, 0.0, 0.0],
        threshold: 0.85,
    };
    let existing = vec![(String::from("existing"), vec![1.0, 0.0, 0.0])];

    let (novel, sim) = helper.is_novel_direct(&helper.embed_vec, &existing);

    assert!(!novel, "identical vectors should not be novel");
    let gap = (sim - 1.0_f32).abs();
    assert!(gap < 1e-5, "expected similarity ~1.0, got {sim}");
}
/// An embedding orthogonal to the existing one stays below the threshold and is accepted.
#[test]
fn is_novel_accepts_dissimilar_skill() {
    let helper = FixedEmbedMiner {
        embed_vec: vec![1.0, 0.0, 0.0],
        threshold: 0.85,
    };
    let existing = vec![(String::from("other"), vec![0.0, 1.0, 0.0])];

    let (novel, sim) = helper.is_novel_direct(&helper.embed_vec, &existing);

    assert!(novel, "orthogonal vectors should be novel, sim={sim}");
    assert!(sim < 0.85);
}
/// Processing a repository in dry-run mode must leave the output directory empty.
///
/// The outcome of `process_repo` itself is deliberately ignored; only the
/// file system is inspected.
#[tokio::test]
async fn dry_run_does_not_write_files() {
    let dir = tempfile::tempdir().unwrap();
    let out_dir = dir.path().to_path_buf();
    let provider = zeph_llm::any::AnyProvider::Mock(zeph_llm::mock::MockProvider::default());
    let miner = SkillMiner {
        generator: SkillGenerator::new(provider.clone(), out_dir.clone()),
        embed_provider: provider,
        github_token: Secret::new(String::new()),
        config: MiningConfig {
            queries: vec![],
            max_repos_per_query: 20,
            dedup_threshold: 0.85,
            output_dir: out_dir,
            rate_limit_rpm: 25,
            dry_run: true,
        },
        http: reqwest::Client::new(),
    };
    let skill = make_test_skill();
    let repo = RepoCandidate {
        full_name: "test/repo".into(),
        description: "A test repo.".into(),
        readme_content: String::new(),
        stars: 100,
    };

    let _outcome = miner.process_repo(&repo, &[], true).await;

    let skill_dir = dir.path().join(&skill.name);
    assert!(
        !skill_dir.exists(),
        "dry-run must not write skill directory to disk"
    );

    let leftover_names: Vec<_> = std::fs::read_dir(dir.path())
        .unwrap()
        .filter_map(std::result::Result::ok)
        .map(|entry| entry.file_name())
        .collect();
    assert!(
        leftover_names.is_empty(),
        "dry-run must not create any files in output_dir, found: {:?}",
        leftover_names
    );
}
/// A configured rate of 0 rpm is clamped to 1 rpm, giving a 60-second pause.
#[test]
fn request_delay_zero_rpm_uses_minimum() {
    let mut miner = make_test_miner("/tmp");
    miner.config.rate_limit_rpm = 0;
    assert_eq!(miner.request_delay(), Duration::from_millis(60_000));
}
}