use sqlx::SqlitePool;
use crate::errors::CoreError;
/// A rule loaded from the `skills` table and flattened into a text document.
///
/// Instances are produced by the `From<RuleRow>` conversion in this file.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct RuleDocument {
/// The `id` column of the originating `skills` row.
pub skill_id: String,
/// The `name` column of the originating row.
pub title: String,
/// Text block combining rule id, name, type, optional source repo, tags and description.
pub content: String,
/// The row's `confidence_score`, copied unchanged.
pub confidence: f64,
/// Raw `file_patterns` column; parsed elsewhere in this file as a JSON array of strings.
pub file_patterns: Option<String>,
/// Canonical language derived from the tags, falling back to the file patterns.
pub language: Option<String>,
/// Lower-cased `owner/name` derived from the row's `source_repo`, when it has that shape.
pub repo_scope: Option<String>,
}
/// Snapshot of the active rule set, comparable with `==`.
///
/// NOTE(review): presumably compared against a stored snapshot to decide whether an index
/// is stale — confirm against callers.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RuleIndexState {
/// Number of rows in `skills` with `status = 'active'`.
pub rule_count: i64,
/// Largest `updated_at` among active rows; `None` when the aggregate is NULL.
pub max_updated_at: Option<String>,
/// Value returned by `active_embedding_profile()` when the snapshot was taken.
pub embedding_profile: String,
/// Optional signature of a restricted rule subset; `load_rule_index_state` always sets `None`.
/// NOTE(review): presumably filled from `scope_signature_from_skill_ids` by callers — confirm.
pub scope_signature: Option<String>,
}
/// Builds an order-independent signature for a set of skill ids.
///
/// The ids are de-duplicated and sorted, then each id followed by a NUL byte is fed into
/// SHA-1. The digest is returned as lower-case hex.
///
/// Returns `None` when `skill_ids` yields no items.
pub fn scope_signature_from_skill_ids<'a>(
    skill_ids: impl IntoIterator<Item = &'a str>,
) -> Option<String> {
    use sha1::{Digest, Sha1};

    // A BTreeSet yields the ids sorted and without duplicates, so the signature depends
    // only on set membership.
    let members: std::collections::BTreeSet<&str> = skill_ids.into_iter().collect();
    if members.is_empty() {
        return None;
    }

    let mut hasher = Sha1::new();
    for member in &members {
        hasher.update(member.as_bytes());
        // NUL terminator keeps ("ab", "c") distinct from ("a", "bc").
        hasher.update(b"\0");
    }

    let digest = hasher.finalize();
    let signature: String = digest.iter().map(|byte| format!("{byte:02x}")).collect();
    Some(signature)
}
/// Raw projection of a `skills` row, matching the column list selected by
/// `load_rules_from_db_for_engine`.
#[derive(sqlx::FromRow)]
struct RuleRow {
id: String,
name: String,
description: String,
// Selected as `type as "type"`; needs the raw identifier because `type` is a keyword.
r#type: String,
// Parsed by `language_from_tags` as a JSON array of strings.
tags: String,
confidence_score: f64,
// Parsed by `language_from_file_patterns` as a JSON array of strings.
file_patterns: Option<String>,
// Fed to `repo_scope_from_source_repo`, which expects an `owner/name` shape.
source_repo: Option<String>,
}
/// Lower-case tag values that `language_from_tags` recognises as a programming language.
///
/// `"c++"` and `"c#"` are accepted here and rewritten to `"cpp"` / `"csharp"` by
/// `language_from_tags`.
const LANGUAGE_TAGS: &[&str] = &[
"rust",
"typescript",
"javascript",
"python",
"go",
"java",
"kotlin",
"swift",
"ruby",
"php",
"cpp",
"c++",
"csharp",
"c#",
"c",
];
/// Extracts the first recognised language from a JSON array of tag strings.
///
/// Each tag is trimmed and ASCII-lower-cased before being compared with
/// [`LANGUAGE_TAGS`]. The aliases `"c++"` and `"c#"` are returned as `"cpp"` and
/// `"csharp"`.
///
/// Returns `None` for blank input, input that is not a JSON array of strings, or when
/// no tag names a known language.
pub fn language_from_tags(tags_json: &str) -> Option<String> {
    let json = tags_json.trim();
    if json.is_empty() {
        return None;
    }
    let tags: Vec<String> = serde_json::from_str(json).ok()?;

    tags.iter()
        .map(|tag| tag.trim().to_ascii_lowercase())
        .find(|candidate| LANGUAGE_TAGS.contains(&candidate.as_str()))
        .map(|found| {
            match found.as_str() {
                "c++" => "cpp",
                "c#" => "csharp",
                canonical => canonical,
            }
            .to_owned()
        })
}
/// Derives a confidence score from `cluster-size:<n>` and `severity:<level>` tags.
///
/// `tags_json` must be a JSON array of strings; tags are trimmed and lower-cased first.
/// When a tag kind occurs more than once, the last occurrence wins.
///
/// Base score by cluster size: 0 or 1 → 0.55, 2 → 0.7, 3–4 → 0.8, larger → 0.9.
/// When only a severity tag is present the base is 0.7.
/// Severity `error` adds 0.05 (capped at 0.95), `info` subtracts 0.05 (floored at 0.4);
/// any other severity leaves the base unchanged.
///
/// Returns `None` for blank or unparsable input, or when neither a parsable cluster size
/// nor a severity tag is present.
pub fn confidence_from_tags(tags_json: &str) -> Option<f64> {
    let json = tags_json.trim();
    if json.is_empty() {
        return None;
    }
    let tags: Vec<String> = serde_json::from_str(json).ok()?;

    let mut cluster_size: Option<u32> = None;
    let mut severity: Option<String> = None;
    for raw in &tags {
        let tag = raw.trim().to_ascii_lowercase();
        if let Some(value) = tag.strip_prefix("cluster-size:") {
            // An unparsable size is ignored rather than clearing an earlier valid one.
            if let Ok(size) = value.parse::<u32>() {
                cluster_size = Some(size);
            }
        } else if let Some(value) = tag.strip_prefix("severity:") {
            severity = Some(value.to_owned());
        }
    }

    let base: f64 = match (cluster_size, severity.is_some()) {
        (None, false) => return None,
        (None, true) => 0.7,
        (Some(0 | 1), _) => 0.55,
        (Some(2), _) => 0.7,
        (Some(3..=4), _) => 0.8,
        (Some(_), _) => 0.9,
    };

    let adjusted = match severity.as_deref() {
        Some("error") => (base + 0.05).min(0.95),
        Some("info") => (base - 0.05).max(0.4),
        _ => base,
    };
    Some(adjusted)
}
/// Maps a single glob pattern to a canonical language name based on its file extension.
///
/// The extension is whatever follows the last `.` in the pattern, compared
/// case-insensitively. Patterns without a dot, and patterns whose tail is not a known
/// extension (for example `**/*` or `docs/v1.2/**`), yield `None`.
///
/// The returned names match the canonical values produced by `language_from_tags`,
/// including `"c"` for `.c` sources. `.h` is left unmapped because it is shared by C and
/// C++ and therefore does not identify a single language.
fn language_from_pattern(p: &str) -> Option<&'static str> {
    let lower = p.to_ascii_lowercase();
    // No dot means no extension. Tails containing glob metacharacters or path separators
    // simply fail to match any entry below.
    let (_, ext) = lower.rsplit_once('.')?;
    let language = match ext {
        "rs" => "rust",
        "ts" | "tsx" | "mts" | "cts" => "typescript",
        "js" | "jsx" | "mjs" | "cjs" => "javascript",
        "py" | "pyi" => "python",
        "go" => "go",
        "java" => "java",
        "kt" | "kts" => "kotlin",
        "swift" => "swift",
        "rb" => "ruby",
        "php" => "php",
        "c" => "c",
        "cpp" | "cc" | "cxx" | "hpp" | "hh" | "hxx" => "cpp",
        "cs" => "csharp",
        _ => return None,
    };
    Some(language)
}
/// Infers a single language from a JSON array of glob patterns.
///
/// Patterns that do not map to a language are ignored. The result is `Some` only when at
/// least one pattern maps to a language and every mapped pattern agrees on it.
///
/// Returns `None` for missing or blank input, input that is not a JSON array of strings,
/// patterns with no recognisable extension, or patterns spanning several languages.
pub fn language_from_file_patterns(file_patterns_json: Option<&str>) -> Option<String> {
    let json = file_patterns_json?.trim();
    if json.is_empty() {
        return None;
    }
    let globs: Vec<String> = serde_json::from_str(json).ok()?;

    let mut detected = globs.iter().filter_map(|glob| language_from_pattern(glob));
    let first = detected.next()?;
    if detected.all(|language| language == first) {
        Some(String::from(first))
    } else {
        // Mixed languages: the rule is not specific to any one of them.
        None
    }
}
/// Normalises a `source_repo` value into a lower-case `owner/name` scope.
///
/// The input is trimmed and split at the first `/`; both halves are trimmed again and
/// must be non-empty. Anything after the first `/` (including further slashes) is kept
/// as part of the name.
///
/// Returns `None` when the input is absent, has no `/`, or has a blank owner or name.
pub fn repo_scope_from_source_repo(source_repo: Option<&str>) -> Option<String> {
    let (owner, name) = source_repo?.trim().split_once('/')?;
    let owner = owner.trim();
    let name = name.trim();
    if owner.is_empty() || name.is_empty() {
        return None;
    }
    let scope = format!("{}/{}", owner, name);
    Some(scope.to_ascii_lowercase())
}
impl From<RuleRow> for RuleDocument {
fn from(r: RuleRow) -> Self {
let language = language_from_tags(&r.tags)
.or_else(|| language_from_file_patterns(r.file_patterns.as_deref()));
let repo_scope = repo_scope_from_source_repo(r.source_repo.as_deref());
let content = match repo_scope.as_deref() {
Some(scope) => format!(
"Rule ID: {}\nRule Name: {}\nType: {}\nSource: {}\nTags: {}\n\n{}",
r.id, r.name, r.r#type, scope, r.tags, r.description
),
None => format!(
"Rule ID: {}\nRule Name: {}\nType: {}\nTags: {}\n\n{}",
r.id, r.name, r.r#type, r.tags, r.description
),
};
Self {
skill_id: r.id,
title: r.name,
content,
confidence: r.confidence_score,
file_patterns: r.file_patterns,
language,
repo_scope,
}
}
}
/// Loads every active rule, without restricting to a particular engine.
///
/// Equivalent to calling [`load_rules_from_db_for_engine`] with `None`.
///
/// # Errors
/// Propagates any error from the underlying query.
pub async fn load_rules_from_db(pool: &SqlitePool) -> Result<Vec<RuleDocument>, CoreError> {
    let no_engine_filter: Option<&str> = None;
    load_rules_from_db_for_engine(pool, no_engine_filter).await
}
/// Captures the current state of the active rule set.
///
/// Reads the count and the latest `updated_at` of rows with `status = 'active'`, then
/// records the active embedding profile. `scope_signature` is always `None` here.
///
/// # Errors
/// Propagates any error from the aggregate query.
pub async fn load_rule_index_state(pool: &SqlitePool) -> Result<RuleIndexState, CoreError> {
    let totals = sqlx::query!(
        "SELECT COUNT(*) AS rule_count, MAX(updated_at) AS max_updated_at FROM skills WHERE status = 'active'"
    )
    .fetch_one(pool)
    .await?;

    let embedding_profile = crate::context::embedding::active_embedding_profile().await;

    let state = RuleIndexState {
        rule_count: totals.rule_count,
        max_updated_at: totals.max_updated_at,
        embedding_profile,
        scope_signature: None,
    };
    Ok(state)
}
/// Loads active rules, optionally restricted to those enabled for one engine.
///
/// `engine` values `"codex"`, `"claude"`, `"gemini"` and `"cursor"` add the matching
/// `enabled_for_<engine> = 1` filter. Any other value, including `None`, loads every
/// active rule.
///
/// # Errors
/// Propagates any error from the underlying query.
pub async fn load_rules_from_db_for_engine(
    pool: &SqlitePool,
    engine: Option<&str>,
) -> Result<Vec<RuleDocument>, CoreError> {
    // Only fixed, static statements are used; the engine name never reaches the SQL text.
    let statement: &'static str = match engine {
        Some("codex") => {
            "SELECT id, name, description, type as \"type\", tags, confidence_score, \
             file_patterns, source_repo FROM skills \
             WHERE enabled_for_codex = 1 AND status = 'active'"
        }
        Some("claude") => {
            "SELECT id, name, description, type as \"type\", tags, confidence_score, \
             file_patterns, source_repo FROM skills \
             WHERE enabled_for_claude = 1 AND status = 'active'"
        }
        Some("gemini") => {
            "SELECT id, name, description, type as \"type\", tags, confidence_score, \
             file_patterns, source_repo FROM skills \
             WHERE enabled_for_gemini = 1 AND status = 'active'"
        }
        Some("cursor") => {
            "SELECT id, name, description, type as \"type\", tags, confidence_score, \
             file_patterns, source_repo FROM skills \
             WHERE enabled_for_cursor = 1 AND status = 'active'"
        }
        _ => {
            "SELECT id, name, description, type as \"type\", tags, confidence_score, \
             file_patterns, source_repo FROM skills \
             WHERE status = 'active'"
        }
    };

    let fetched = sqlx::query_as::<_, RuleRow>(statement)
        .fetch_all(pool)
        .await?;

    let mut documents = Vec::with_capacity(fetched.len());
    documents.extend(fetched.into_iter().map(RuleDocument::from));
    Ok(documents)
}
/// Loads `id → confidence_score` for every active rule.
///
/// # Errors
/// Propagates any error from the underlying query.
pub async fn load_rule_confidence_map(
    pool: &SqlitePool,
) -> Result<std::collections::HashMap<String, f64>, CoreError> {
    let records = sqlx::query!("SELECT id, confidence_score FROM skills WHERE status = 'active'")
        .fetch_all(pool)
        .await?;

    let mut scores = std::collections::HashMap::with_capacity(records.len());
    for record in records {
        scores.insert(record.id, record.confidence_score);
    }
    Ok(scores)
}
/// Optional per-rule signals gathered by `load_rule_ranking_inputs`.
///
/// Each map is `None` when its loader returned an error.
#[derive(Debug, Clone, Default)]
pub struct RuleRankingInputs {
/// Rule id → `confidence_score`, as returned by `load_rule_confidence_map`.
pub confidence_map: Option<std::collections::HashMap<String, f64>>,
/// Rule id → age in days, as returned by `load_rule_age_days_map`.
pub age_days_map: Option<std::collections::HashMap<String, f32>>,
}
/// Gathers the ranking signals on a best-effort basis.
///
/// The confidence map is loaded first, then the age map. A failure in either loader is
/// swallowed and leaves the corresponding field as `None`; this function never fails.
pub async fn load_rule_ranking_inputs(pool: &SqlitePool) -> RuleRankingInputs {
    let confidence_map = load_rule_confidence_map(pool).await.ok();
    let age_days_map = load_rule_age_days_map(pool).await.ok();
    RuleRankingInputs {
        confidence_map,
        age_days_map,
    }
}
/// Loads `id → age in days` for every active rule.
///
/// The age is measured from `created_at`, falling back to `updated_at` when
/// `created_at` is NULL, up to the current UTC time. Ages that would be negative are
/// clamped to zero.
///
/// Timestamps are accepted as RFC 3339, as `YYYY-MM-DD HH:MM:SS`, or as
/// `YYYY-MM-DDTHH:MM:SS` with an optional fractional part; the last two are treated as
/// UTC. Rows with an unreadable or empty id, or with a missing or unparsable timestamp,
/// are skipped silently.
///
/// # Errors
/// Propagates any error from the underlying query.
pub async fn load_rule_age_days_map(
    pool: &SqlitePool,
) -> Result<std::collections::HashMap<String, f32>, CoreError> {
    use sqlx::Row;

    // Zone-less layouts tried, in order, after RFC 3339 parsing fails.
    const NAIVE_LAYOUTS: [&str; 2] = ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S%.f"];

    let records = sqlx::query(
        "SELECT id, COALESCE(created_at, updated_at) AS ts \
         FROM skills WHERE status = 'active'",
    )
    .fetch_all(pool)
    .await?;

    let reference = chrono::Utc::now();
    let mut ages = std::collections::HashMap::with_capacity(records.len());

    for record in records {
        let skill_id: String = record.try_get("id").unwrap_or_default();
        if skill_id.is_empty() {
            continue;
        }

        // A NULL or undecodable timestamp fails to decode as `String` and skips the row.
        let Ok(raw) = record.try_get::<String, _>("ts") else {
            continue;
        };

        let created = chrono::DateTime::parse_from_rfc3339(&raw)
            .map(|stamp| stamp.with_timezone(&chrono::Utc))
            .ok()
            .or_else(|| {
                NAIVE_LAYOUTS
                    .iter()
                    .find_map(|layout| chrono::NaiveDateTime::parse_from_str(&raw, layout).ok())
                    .map(|naive| naive.and_utc())
            });
        let Some(created) = created else {
            continue;
        };

        let elapsed_secs = (reference - created).num_seconds().max(0);
        ages.insert(skill_id, elapsed_secs as f32 / 86_400.0);
    }

    Ok(ages)
}
/// Loads up to three examples for one rule, newest first by `created_at`.
///
/// # Errors
/// Propagates any error from the underlying query.
pub async fn load_rule_examples(
    pool: &SqlitePool,
    skill_id: &str,
) -> Result<Vec<RuleExample>, CoreError> {
    let fetched = sqlx::query_as!(
        RuleExampleRow,
        "SELECT id, skill_id, bad_code, good_code, description, source \
         FROM rule_examples WHERE skill_id = ?1 ORDER BY created_at DESC LIMIT 3",
        skill_id
    )
    .fetch_all(pool)
    .await?;

    let examples: Vec<RuleExample> = fetched.into_iter().map(Into::into).collect();
    Ok(examples)
}
/// Loads up to three examples per rule for several rules in a single query.
///
/// The ids are passed to SQLite as one JSON array and expanded with `json_each`. Rows
/// arrive newest first by `created_at`, and only the first three per rule are kept.
/// Rules without any example have no entry in the returned map.
///
/// Returns an empty map without querying when `skill_ids` is empty.
///
/// # Errors
/// Returns `CoreError::Internal` when the ids cannot be serialised, and propagates any
/// error from the underlying query.
pub async fn load_rule_examples_batch(
    pool: &SqlitePool,
    skill_ids: &[String],
) -> Result<std::collections::HashMap<String, Vec<RuleExample>>, CoreError> {
    const MAX_EXAMPLES_PER_RULE: usize = 3;

    if skill_ids.is_empty() {
        return Ok(std::collections::HashMap::new());
    }

    let ids_json = serde_json::to_string(skill_ids)
        .map_err(|e| CoreError::Internal(format!("serialize skill_ids: {e}")))?;

    let fetched = sqlx::query_as!(
        RuleExampleRow,
        "SELECT id, skill_id, bad_code, good_code, description, source \
         FROM rule_examples \
         WHERE skill_id IN (SELECT value FROM json_each(?1)) \
         ORDER BY created_at DESC",
        ids_json,
    )
    .fetch_all(pool)
    .await?;

    let mut grouped: std::collections::HashMap<String, Vec<RuleExample>> =
        std::collections::HashMap::new();
    for record in fetched {
        let bucket = grouped.entry(record.skill_id.clone()).or_default();
        // Rows are already newest-first, so keeping the first few keeps the newest.
        if bucket.len() < MAX_EXAMPLES_PER_RULE {
            bucket.push(RuleExample::from(record));
        }
    }
    Ok(grouped)
}
/// A bad/good code pair attached to a rule, loaded from the `rule_examples` table.
///
/// Fields carry the same-named columns unchanged.
#[derive(Debug, Clone)]
pub struct RuleExample {
pub id: String,
/// Id of the rule this example belongs to.
pub skill_id: String,
pub bad_code: String,
pub good_code: String,
pub description: Option<String>,
pub source: String,
}
/// Raw projection of a `rule_examples` row, matching the column list used by
/// `load_rule_examples` and `load_rule_examples_batch`.
#[derive(sqlx::FromRow)]
struct RuleExampleRow {
id: String,
skill_id: String,
bad_code: String,
good_code: String,
description: Option<String>,
source: String,
}
impl From<RuleExampleRow> for RuleExample {
fn from(r: RuleExampleRow) -> Self {
Self {
id: r.id,
skill_id: r.skill_id,
bad_code: r.bad_code,
good_code: r.good_code,
description: r.description,
source: r.source,
}
}
}
// Unit tests for the pure helpers in this file; none of them touch the database.
#[cfg(test)]
mod tests {
use super::*;
// A cluster of one maps to the lowest base score, 0.55.
#[test]
fn confidence_from_tags_singleton_downweighted() {
let c = confidence_from_tags(r#"["auto-from-extractions","cluster-size:1"]"#).unwrap();
assert!((c - 0.55).abs() < 1e-9, "got {c}");
}
// Cluster sizes above 4 map to 0.9; severity "warning" leaves the base unchanged.
#[test]
fn confidence_from_tags_large_cluster_strongest() {
let c = confidence_from_tags(r#"["cluster-size:8","severity:warning"]"#).unwrap();
assert!((c - 0.9).abs() < 1e-9, "got {c}");
}
// Base 0.8 (cluster size 3) plus the 0.05 boost for severity "error".
#[test]
fn confidence_from_tags_severity_error_boosts() {
let c = confidence_from_tags(r#"["cluster-size:3","severity:error"]"#).unwrap();
assert!((c - 0.85).abs() < 1e-9, "got {c}");
}
// Base 0.55 (cluster size 1) minus the 0.05 penalty for severity "info".
#[test]
fn confidence_from_tags_severity_info_dampens() {
let c = confidence_from_tags(r#"["cluster-size:1","severity:info"]"#).unwrap();
assert!((c - 0.50).abs() < 1e-9, "got {c}");
}
// Without a cluster-size or severity tag, or with blank/invalid input, there is no score.
#[test]
fn confidence_from_tags_missing_evidence_returns_none() {
assert_eq!(
confidence_from_tags(r#"["auto-from-extractions","origin:review-extraction"]"#),
None
);
assert_eq!(confidence_from_tags("[]"), None);
assert_eq!(confidence_from_tags(""), None);
assert_eq!(confidence_from_tags("not-json"), None);
}
// Table-driven check: first known language wins, aliases are canonicalised,
// matching is case-insensitive, and bad input yields None.
#[test]
fn language_from_tags_table() {
let cases: &[(&str, Option<&str>)] = &[
(r#"["async", "rust", "concurrency"]"#, Some("rust")),
(r#"["typescript", "react"]"#, Some("typescript")),
(r#"["c++"]"#, Some("cpp")),
(r#"["C#"]"#, Some("csharp")),
("[]", None),
("", None),
("not-json", None),
(r#"["lint", "performance"]"#, None),
];
for (input, expected) in cases {
assert_eq!(
language_from_tags(input).as_deref(),
*expected,
"input: {input}"
);
}
}
// Patterns that all map to the same language resolve to that language.
#[test]
fn language_from_file_patterns_resolves_single_language() {
assert_eq!(
language_from_file_patterns(Some(r#"["**/*.rs"]"#)).as_deref(),
Some("rust")
);
assert_eq!(
language_from_file_patterns(Some(r#"["**/*.ts","**/*.tsx"]"#)).as_deref(),
Some("typescript")
);
assert_eq!(
language_from_file_patterns(Some(r#"["src/**/*.go","tests/**/*.go"]"#)).as_deref(),
Some("go")
);
}
// Conflicting languages, or patterns without a recognisable extension, give None.
#[test]
fn language_from_file_patterns_returns_none_for_mixed_or_universal() {
assert_eq!(
language_from_file_patterns(Some(r#"["**/*.rs","**/*.go"]"#)),
None
);
assert_eq!(language_from_file_patterns(Some(r#"["**/*"]"#)), None);
assert_eq!(language_from_file_patterns(Some(r#"["**/*test*"]"#)), None);
}
// Absent, blank, empty-array and non-JSON inputs all give None.
#[test]
fn language_from_file_patterns_handles_missing_or_empty_input() {
assert_eq!(language_from_file_patterns(None), None);
assert_eq!(language_from_file_patterns(Some("")), None);
assert_eq!(language_from_file_patterns(Some("[]")), None);
assert_eq!(language_from_file_patterns(Some("not-json")), None);
}
// Only values shaped like `owner/name` with non-blank halves produce a scope.
#[test]
fn repo_scope_uses_canonical_source_repo_only() {
assert_eq!(
repo_scope_from_source_repo(Some("vitejs/vite")).as_deref(),
Some("vitejs/vite")
);
assert!(repo_scope_from_source_repo(None).is_none());
assert!(repo_scope_from_source_repo(Some("vitejs")).is_none());
assert!(repo_scope_from_source_repo(Some(" /vite")).is_none());
}
// The signature ignores ordering and duplicates, is None for an empty set,
// and differs when membership differs.
#[test]
fn scope_signature_depends_only_on_membership() {
assert_eq!(
scope_signature_from_skill_ids(["a", "b", "c"]),
scope_signature_from_skill_ids(["c", "a", "b"]),
);
assert_eq!(
scope_signature_from_skill_ids(["a", "a", "b"]),
scope_signature_from_skill_ids(["a", "b"]),
);
assert_eq!(scope_signature_from_skill_ids(Vec::<&str>::new()), None);
assert_ne!(
scope_signature_from_skill_ids(["a", "b"]),
scope_signature_from_skill_ids(["a", "c"]),
);
}
// Ids whose concatenation is identical must still hash differently.
// NOTE(review): despite the test name, the implementation separates ids with a NUL
// byte rather than a length prefix.
#[test]
fn scope_signature_length_delimits_to_avoid_collision() {
assert_ne!(
scope_signature_from_skill_ids(["ab", "c"]),
scope_signature_from_skill_ids(["a", "bc"]),
);
}
}