use std::collections::{BTreeMap, BTreeSet};
use std::env;
use std::fs;
use std::path::{Path, PathBuf};
// Upper bound (5 MiB) on the size of rules.json; the build panics on anything
// larger to avoid accidentally embedding a huge file into the generated source.
const MAX_RULES_FILE_SIZE: u64 = 5 * 1024 * 1024;
/// Walks up from `start` and returns the first ancestor directory whose
/// `Cargo.toml` declares a workspace (contains `[workspace]` or a
/// `[workspace.` sub-table heading).
///
/// Returns `None` when no ancestor has such a `Cargo.toml`.
///
/// Note: the original probed `Cargo.toml` with `exists()` and then read it,
/// which was both a redundant extra filesystem access and a TOCTOU window.
/// `read_to_string` already fails cleanly for a missing file, so a single
/// read suffices.
fn find_workspace_root(start: &Path) -> Option<PathBuf> {
    start
        .ancestors()
        .find(|path| {
            // A failed read (missing file, permission error) simply means
            // "this ancestor is not the workspace root".
            fs::read_to_string(path.join("Cargo.toml")).is_ok_and(|content| {
                content.contains("[workspace]") || content.contains("[workspace.")
            })
        })
        .map(Path::to_path_buf)
}
// Build-script entry point: locates rules.json, validates its contents, and
// generates Rust constants (RULES_DATA, RULES_METADATA, VALID_TOOLS,
// TOOL_RULE_PREFIXES, AUTHORING_*) into $OUT_DIR/rules_data.rs.
fn main() {
    // CARGO_MANIFEST_DIR is always set by cargo for build scripts.
    let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
    let manifest_path = Path::new(&manifest_dir);
    // Prefer a crate-local rules.json; otherwise fall back to the workspace
    // knowledge base at <workspace root>/knowledge-base/rules.json.
    let crate_rules = manifest_path.join("rules.json");
    let workspace_rules =
        find_workspace_root(manifest_path).map(|root| root.join("knowledge-base/rules.json"));
    // Watch the crate-local path even when it does not exist yet, so creating
    // it later triggers a rebuild.
    println!("cargo:rerun-if-changed={}", crate_rules.display());
    let rules_path = if crate_rules.exists() {
        crate_rules
    } else if let Some(ws_rules) = workspace_rules {
        if ws_rules.exists() {
            println!("cargo:rerun-if-changed={}", ws_rules.display());
            ws_rules
        } else {
            panic!(
                "Could not find rules.json at {} or {}",
                manifest_path.join("rules.json").display(),
                ws_rules.display()
            );
        }
    } else {
        panic!(
            "Could not find rules.json at {} (no workspace root found)",
            manifest_path.join("rules.json").display()
        );
    };
    // Size check happens before reading so an oversized file is rejected
    // without ever loading it into memory (see MAX_RULES_FILE_SIZE).
    let file_size = fs::metadata(&rules_path)
        .unwrap_or_else(|e| panic!("Failed to get metadata for {}: {}", rules_path.display(), e))
        .len();
    if file_size > MAX_RULES_FILE_SIZE {
        panic!(
            "rules.json at {} is too large ({} bytes, max {} bytes)",
            rules_path.display(),
            file_size,
            MAX_RULES_FILE_SIZE
        );
    }
    let rules_json = fs::read_to_string(&rules_path).unwrap_or_else(|e| {
        panic!(
            "Failed to read rules.json at {}: {}",
            rules_path.display(),
            e
        )
    });
    let rules: serde_json::Value = serde_json::from_str(&rules_json).unwrap_or_else(|e| {
        panic!(
            "Failed to parse rules.json at {}: {}",
            rules_path.display(),
            e
        )
    });
    // The top-level "rules" array drives all generated constants below.
    let rules_array = rules["rules"]
        .as_array()
        .expect("rules.json must have a 'rules' array");
    let mut generated_code = String::new();
    generated_code.push_str("// Auto-generated from rules.json by build.rs\n");
    generated_code.push_str("// Do not edit manually!\n\n");
    generated_code.push_str("/// Rule data as (id, name) tuples.\n");
    generated_code.push_str("/// \n");
    generated_code.push_str(
        "/// This is the complete list of validation rules from knowledge-base/rules.json.\n",
    );
    generated_code.push_str("pub const RULES_DATA: &[(&str, &str)] = &[\n");
    // Escape a value so it is safe inside a double-quoted Rust string literal
    // in the generated source. Backslashes must be replaced first, or the
    // later replacements would double-escape their own output.
    let escape_str = |s: &str| {
        s.replace('\\', "\\\\")
            .replace('"', "\\\"")
            .replace('\n', "\\n")
            .replace('\r', "\\r")
            .replace('\t', "\\t")
    };
    // Rule IDs: 1-20 ASCII alphanumeric or hyphen characters.
    let is_valid_id = |id: &str| -> bool {
        !id.is_empty()
            && id.len() <= 20
            && id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-')
    };
    // Rule names: 1-200 chars (byte length), no control characters.
    // NOTE(review): ' ' is never a control character, so the `c != ' '` guard
    // is dead — the predicate simply rejects every control char. If the
    // intent was to permit tabs, the exemption should be `c != '\t'`.
    let is_valid_name = |name: &str| -> bool {
        !name.is_empty() && name.len() <= 200 && !name.chars().any(|c| c.is_control() && c != ' ')
    };
    // Pass 1: emit RULES_DATA entries, validating id/name along the way.
    for (idx, rule) in rules_array.iter().enumerate() {
        let id = rule["id"]
            .as_str()
            .unwrap_or_else(|| panic!("rule[{}] must have string 'id' field", idx));
        let name = rule["name"]
            .as_str()
            .unwrap_or_else(|| panic!("rule[{}] must have string 'name' field", idx));
        if !is_valid_id(id) {
            panic!(
                "rule[{}] has invalid id '{}': must be 1-20 alphanumeric/hyphen characters",
                idx, id
            );
        }
        if !is_valid_name(name) {
            panic!(
                "rule[{}] '{}' has invalid name: must be 1-200 chars, no control characters",
                idx, id
            );
        }
        let escaped_id = escape_str(id);
        let escaped_name = escape_str(name);
        generated_code.push_str(&format!(
            "    (\"{}\", \"{}\"),\n",
            escaped_id, escaped_name
        ));
    }
    generated_code.push_str("];\n\n");
    generated_code.push_str("/// Rule metadata as (id, category, severity, tool) tuples.\n");
    generated_code.push_str("/// \n");
    generated_code.push_str(
        "/// Provides structured metadata for each rule from knowledge-base/rules.json.\n",
    );
    generated_code.push_str("/// The tool field is empty when the rule applies generically.\n");
    generated_code.push_str("pub const RULES_METADATA: &[(&str, &str, &str, &str)] = &[\n");
    // Pass 2: emit RULES_METADATA. category/severity are mandatory and must be
    // non-blank; the tool (evidence.applies_to.tool) is optional and defaults
    // to "" for generic rules.
    for (idx, rule) in rules_array.iter().enumerate() {
        let id = rule["id"]
            .as_str()
            .unwrap_or_else(|| panic!("rule[{}] must have string 'id' field", idx));
        let category = rule
            .get("category")
            .and_then(|c| c.as_str())
            .filter(|s| !s.trim().is_empty())
            .unwrap_or_else(|| {
                panic!(
                    "rule[{}] with id '{}' must have non-empty string 'category' field",
                    idx, id
                )
            });
        let severity = rule
            .get("severity")
            .and_then(|s| s.as_str())
            .filter(|s| !s.trim().is_empty())
            .unwrap_or_else(|| {
                panic!(
                    "rule[{}] with id '{}' must have non-empty string 'severity' field",
                    idx, id
                )
            });
        let tool = rule
            .get("evidence")
            .and_then(|e| e.get("applies_to"))
            .and_then(|a| a.get("tool"))
            .and_then(|t| t.as_str())
            .unwrap_or("");
        generated_code.push_str(&format!(
            "    (\"{}\", \"{}\", \"{}\", \"{}\"),\n",
            escape_str(id),
            escape_str(category),
            escape_str(severity),
            escape_str(tool)
        ));
    }
    generated_code.push_str("];\n\n");
    // Collect the distinct non-empty tool names; BTreeSet gives deterministic
    // (sorted) output so the generated file is stable across builds.
    let mut tools: BTreeSet<String> = BTreeSet::new();
    for rule in rules_array {
        if let Some(tool) = rule
            .get("evidence")
            .and_then(|e| e.get("applies_to"))
            .and_then(|a| a.get("tool"))
            .and_then(|t| t.as_str())
            .filter(|t| !t.is_empty())
        {
            tools.insert(tool.to_string());
        }
    }
    generated_code
        .push_str("/// Valid tool names derived from rules.json evidence.applies_to.tool.\n");
    generated_code.push_str("/// \n");
    generated_code
        .push_str("/// These are the tools that have at least one rule specifically for them.\n");
    generated_code.push_str("pub const VALID_TOOLS: &[&str] = &[\n");
    for tool in &tools {
        generated_code.push_str(&format!("    \"{}\",\n", escape_str(tool)));
    }
    generated_code.push_str("];\n\n");
    // Per-prefix accumulator: which tools the prefix's rules name, and whether
    // any rule under the prefix is generic (no tool / empty tool).
    #[derive(Default)]
    struct PrefixInfo {
        tools: BTreeSet<String>,
        has_generic: bool,
    }
    // BTreeMap again for deterministic iteration order in the output.
    let mut prefix_info: BTreeMap<String, PrefixInfo> = BTreeMap::new();
    for rule in rules_array {
        // id validity was already enforced in pass 1, so "" here is unreachable
        // in practice; extract_rule_prefix("") returns None anyway.
        let id = rule["id"].as_str().unwrap_or("");
        let tool = rule
            .get("evidence")
            .and_then(|e| e.get("applies_to"))
            .and_then(|a| a.get("tool"))
            .and_then(|t| t.as_str());
        if let Some(prefix) = extract_rule_prefix(id) {
            let info = prefix_info.entry(prefix).or_default();
            if let Some(tool_name) = tool {
                if !tool_name.is_empty() {
                    info.tools.insert(tool_name.to_string());
                } else {
                    // Empty-string tool counts as generic, same as no tool at all.
                    info.has_generic = true;
                }
            } else {
                info.has_generic = true;
            }
        }
    }
    generated_code.push_str("/// Mapping of rule ID prefixes to their associated tools.\n");
    generated_code.push_str("/// \n");
    generated_code.push_str(
        "/// Derived from rules.json: for each prefix, this is the tool that all rules\n",
    );
    generated_code
        .push_str("/// with that prefix apply to. Only includes prefixes where ALL rules\n");
    generated_code
        .push_str("/// consistently specify the same tool (excludes generic prefixes).\n");
    generated_code.push_str("pub const TOOL_RULE_PREFIXES: &[(&str, &str)] = &[\n");
    for (prefix, info) in &prefix_info {
        // Only emit unambiguous prefixes: exactly one tool and no generic rules.
        if info.tools.len() == 1 && !info.has_generic {
            let tool = info.tools.iter().next().unwrap();
            generated_code.push_str(&format!(
                "    (\"{}\", \"{}\"),\n",
                escape_str(prefix),
                escape_str(tool)
            ));
        }
    }
    generated_code.push_str("];\n");
    // The optional top-level `authoring` section is embedded wholesale; a
    // missing section becomes JSON null with version "0.0.0" and no families.
    let authoring = rules
        .get("authoring")
        .cloned()
        .unwrap_or(serde_json::Value::Null);
    let authoring_version = authoring
        .get("version")
        .and_then(|v| v.as_str())
        .unwrap_or("0.0.0");
    // Loose semver-ish check: digits, dots, hyphens, ASCII letters, <= 32 chars.
    let is_valid_version = |version: &str| -> bool {
        !version.is_empty()
            && version.len() <= 32
            && version
                .chars()
                .all(|c| c.is_ascii_digit() || c == '.' || c == '-' || c.is_ascii_alphabetic())
    };
    if !is_valid_version(authoring_version) {
        panic!(
            "authoring.version '{}' is invalid: expected a short semver-like string",
            authoring_version
        );
    }
    // Validate and collect authoring family IDs (kebab-case, <= 64 chars);
    // BTreeSet keeps the generated list sorted and deduplicated.
    let mut authoring_families: BTreeSet<String> = BTreeSet::new();
    if let Some(families) = authoring.get("families").and_then(|f| f.as_array()) {
        for (idx, family) in families.iter().enumerate() {
            let id = family
                .get("id")
                .and_then(|v| v.as_str())
                .unwrap_or_else(|| {
                    panic!(
                        "authoring.families[{}].id must be a string in rules.json",
                        idx
                    )
                });
            let valid_family = !id.is_empty()
                && id.len() <= 64
                && id
                    .chars()
                    .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-');
            if !valid_family {
                panic!(
                    "authoring.families[{}].id '{}' is invalid: use lowercase letters, digits, and hyphens",
                    idx, id
                );
            }
            authoring_families.insert(id.to_string());
        }
    }
    // Serialize the whole authoring section back to compact JSON so consumers
    // can re-parse it at runtime; rules.json stays the single source of truth.
    let authoring_json_str = serde_json::to_string(&authoring)
        .expect("BUG: failed to serialize authoring catalog to JSON string");
    generated_code.push_str("\n/// Authoring catalog schema version.\n");
    generated_code.push_str(&format!(
        "pub const AUTHORING_VERSION: &str = \"{}\";\n\n",
        escape_str(authoring_version)
    ));
    generated_code
        .push_str("/// Authoring family IDs derived from rules.json authoring.families.\n");
    generated_code.push_str("pub const AUTHORING_FAMILIES: &[&str] = &[\n");
    for family in &authoring_families {
        generated_code.push_str(&format!("    \"{}\",\n", escape_str(family)));
    }
    generated_code.push_str("];\n\n");
    generated_code.push_str(
        "/// Raw authoring catalog JSON (top-level `authoring` section from rules.json).\n",
    );
    generated_code.push_str(
        "/// This is generated at build time to keep rules.json as the source of truth.\n",
    );
    generated_code.push_str(&format!(
        "pub const AUTHORING_CATALOG_JSON: &str = \"{}\";\n",
        escape_str(&authoring_json_str)
    ));
    // Write the generated module where the crate includes it via include!().
    let out_dir = env::var("OUT_DIR").unwrap();
    let dest_path = Path::new(&out_dir).join("rules_data.rs");
    fs::write(&dest_path, generated_code).expect("Failed to write generated rules");
}
/// Returns the prefix of a rule ID, hyphen included, when the ID ends in a
/// purely numeric suffix after its last hyphen.
///
/// `"foo-bar-12"` yields `Some("foo-bar-")`; IDs with no hyphen, an empty
/// suffix, or a non-numeric suffix yield `None`.
fn extract_rule_prefix(rule_id: &str) -> Option<String> {
    let hyphen = rule_id.rfind('-')?;
    let suffix = &rule_id[hyphen + 1..];
    if suffix.is_empty() || !suffix.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }
    // `..=hyphen` keeps the trailing '-' as part of the prefix.
    Some(rule_id[..=hyphen].to_string())
}