use std::collections::{BTreeMap, BTreeSet};
use index_capture::validate_capture_bundle;
use serde::{Deserialize, Serialize};
use serde_json::json;
/// Identifies which corpus matrix a `LabRow` was parsed from.
///
/// Variant order is significant for the derived `Ord`: `Top100` compares
/// less than `Forum`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum CorpusSource {
    /// Row produced by `parse_top100_matrix`.
    Top100,
    /// Row produced by `parse_forum_matrix`.
    Forum,
}

impl CorpusSource {
    /// Returns the lowercase label of this source: `"top100"` or `"forum"`.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            CorpusSource::Forum => "forum",
            CorpusSource::Top100 => "top100",
        }
    }
}
/// One parsed matrix row describing a corpus entry.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LabRow {
/// Matrix the row was parsed from.
pub source: CorpusSource,
/// Trimmed first column of the row; may carry a `/path` suffix, which
/// `domain_to_host_path_prefix` splits off.
pub domain: String,
/// Family label after `canonical_family` normalization.
pub family: String,
/// Trimmed intent column: `Some` for top100 rows, `None` for forum rows.
pub intent: Option<String>,
/// Tier value accepted by `parse_tier` (0..=5).
pub current_tier: u8,
/// Trimmed known-limit column; the value `"none"` makes the row eligible
/// in `synthesize_quality`.
pub known_limit: String,
}
/// Result of `ingest_summary`: the combined rows plus aggregate counts.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IngestSummary {
/// Top100 and forum rows combined, in the sorted order produced by
/// `ingest_summary`.
pub rows: Vec<LabRow>,
/// Number of capture artifacts that passed validation.
pub captures_total: usize,
/// `(family, row count)` pairs in ascending family order.
pub family_counts: Vec<(String, usize)>,
}
/// A host/path pair proposed as a pack rule by `synthesize_rules`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PackRuleSuggestion {
/// Lowercased host part of a row's domain.
pub host: String,
/// Path prefix derived from the row's domain; `/` when the domain has no path.
pub path_prefix: String,
}
/// Findings collected by `lint_pack_json`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PackLintReport {
    /// Problems that make the pack fail the lint.
    pub errors: Vec<String>,
    /// Advisory findings that do not affect [`PackLintReport::passed`].
    pub warnings: Vec<String>,
}

impl PackLintReport {
    /// Returns `true` when no errors were recorded; warnings are ignored.
    #[must_use]
    pub fn passed(&self) -> bool {
        let Self { errors, .. } = self;
        errors.is_empty()
    }
}
/// Coverage score computed by `synthesize_quality` for one family.
#[derive(Debug, Clone, PartialEq)]
pub struct SynthesisQuality {
/// Canonicalized family the score applies to.
pub family: String,
/// Rows of the family whose `known_limit` is `"none"`.
pub eligible_rows: usize,
/// Eligible rows whose host/path pair matches one of the supplied rules.
pub covered_rows: usize,
/// `covered_rows / eligible_rows`, or `1.0` when there are no eligible rows.
pub score: f64,
/// Human-readable explanation of the score.
pub reasons: Vec<String>,
}
/// Serialized shape of a pack document as read by `lint_pack_json` and
/// `merge_pack_overrides`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PackFile {
/// Pack format version; `lint_pack_json` accepts only `index.pack/v1`.
version: String,
/// Pack identifier; `lint_pack_json` rejects a blank value.
id: String,
/// Rules of the pack; an absent key deserializes to an empty list.
#[serde(default)]
rules: Vec<PackRule>,
}
/// One rule inside a `PackFile`, keyed by `(host, path_prefix)` when merging.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PackRule {
/// Host the rule applies to.
host: String,
/// Path prefix the rule applies to.
path_prefix: String,
/// Free-form JSON manifest; `lint_pack_json` inspects its `version`,
/// `content.main_selector`, `fields` and `dates` keys.
manifest: serde_json::Value,
}
/// Parses the tab-separated top100 matrix into [`LabRow`]s.
///
/// Each line is trimmed first; blank lines and lines starting with `#` are
/// skipped. Of the remaining columns, index 0 is the domain, 1 the family,
/// 2 the intent, 4 the current tier and 8 the known limit. Columns beyond
/// the ninth are ignored.
///
/// Because the whole line is trimmed before splitting, trailing empty
/// columns are dropped and count as missing.
///
/// # Errors
///
/// Returns a message naming the 1-based line number when a data row has
/// fewer than 9 fields or when its tier column is rejected by `parse_tier`.
pub fn parse_top100_matrix(input: &str) -> Result<Vec<LabRow>, String> {
    let mut parsed = Vec::new();
    for (index, raw_line) in input.lines().enumerate() {
        let line_no = index + 1;
        let record = raw_line.trim();
        if record.is_empty() || record.starts_with('#') {
            continue;
        }
        let columns: Vec<&str> = record.split('\t').collect();
        let [domain, family, intent, _, tier, _, _, _, known_limit, ..] = columns.as_slice()
        else {
            return Err(format!(
                "invalid top100 row at line {}: expected 9 fields, got {}",
                line_no,
                columns.len()
            ));
        };
        let current_tier = parse_tier(tier, "top100", line_no)?;
        parsed.push(LabRow {
            source: CorpusSource::Top100,
            domain: domain.trim().to_owned(),
            family: canonical_family(family),
            intent: Some(intent.trim().to_owned()),
            current_tier,
            known_limit: known_limit.trim().to_owned(),
        });
    }
    Ok(parsed)
}
/// Parses the tab-separated forum matrix into [`LabRow`]s.
///
/// Each line is trimmed first; blank lines and lines starting with `#` are
/// skipped. Of the remaining columns, index 0 is the domain, 1 the family,
/// 3 the current tier and 7 the known limit. Forum rows carry no intent,
/// so `intent` is always `None`. Columns beyond the eighth are ignored.
///
/// Because the whole line is trimmed before splitting, trailing empty
/// columns are dropped and count as missing.
///
/// # Errors
///
/// Returns a message naming the 1-based line number when a data row has
/// fewer than 8 fields or when its tier column is rejected by `parse_tier`.
pub fn parse_forum_matrix(input: &str) -> Result<Vec<LabRow>, String> {
    let mut parsed = Vec::new();
    for (index, raw_line) in input.lines().enumerate() {
        let line_no = index + 1;
        let record = raw_line.trim();
        if record.is_empty() || record.starts_with('#') {
            continue;
        }
        let columns: Vec<&str> = record.split('\t').collect();
        let [domain, family, _, tier, _, _, _, known_limit, ..] = columns.as_slice() else {
            return Err(format!(
                "invalid forum row at line {}: expected 8 fields, got {}",
                line_no,
                columns.len()
            ));
        };
        let current_tier = parse_tier(tier, "forum", line_no)?;
        parsed.push(LabRow {
            source: CorpusSource::Forum,
            domain: domain.trim().to_owned(),
            family: canonical_family(family),
            intent: None,
            current_tier,
            known_limit: known_limit.trim().to_owned(),
        });
    }
    Ok(parsed)
}
/// Parses both matrices, validates the capture artifacts and summarizes the
/// combined corpus.
///
/// Rows are ordered by family, domain, source label and intent (a missing
/// intent sorts as the empty string), then by `current_tier` and
/// `known_limit`. The last two keys are tiebreakers: the sort is stable, so
/// without them rows that agree on the first four keys would keep their
/// input order and the result would depend on how the matrix was arranged.
///
/// `family_counts` lists every family with its row count in ascending
/// family order.
///
/// # Errors
///
/// Returns the parser's message when either matrix is malformed, or the
/// stringified validation error of the first capture artifact that fails
/// `validate_capture_bundle`. Matrices are parsed before any artifact is
/// validated.
pub fn ingest_summary(
    top100_matrix: &str,
    forum_matrix: &str,
    capture_artifacts: &[String],
) -> Result<IngestSummary, String> {
    let mut rows = parse_top100_matrix(top100_matrix)?;
    rows.extend(parse_forum_matrix(forum_matrix)?);
    rows.sort_by(|left, right| {
        left.family
            .cmp(&right.family)
            .then_with(|| left.domain.cmp(&right.domain))
            // Compare by label, not by enum order, so "forum" precedes "top100".
            .then_with(|| left.source.as_str().cmp(right.source.as_str()))
            .then_with(|| {
                left.intent
                    .as_deref()
                    .unwrap_or("")
                    .cmp(right.intent.as_deref().unwrap_or(""))
            })
            // Tiebreakers that make the order independent of input order.
            .then_with(|| left.current_tier.cmp(&right.current_tier))
            .then_with(|| left.known_limit.cmp(&right.known_limit))
    });

    // Every artifact must validate; the first failure aborts the ingest.
    for artifact in capture_artifacts {
        validate_capture_bundle(artifact).map_err(|error| error.to_string())?;
    }
    let captures_total = capture_artifacts.len();

    let mut counts = BTreeMap::<String, usize>::new();
    for row in &rows {
        let counter = counts.entry(row.family.clone()).or_default();
        *counter = counter.saturating_add(1);
    }
    let family_counts = counts.into_iter().collect::<Vec<_>>();

    Ok(IngestSummary {
        rows,
        captures_total,
        family_counts,
    })
}
/// Proposes one rule per distinct host/path pair among the rows of `family`.
///
/// `family` is canonicalized before comparison. Each matching row's domain
/// is split with `domain_to_host_path_prefix`; duplicates collapse, and the
/// suggestions come back in ascending `(host, path_prefix)` order.
pub fn synthesize_rules(rows: &[LabRow], family: &str) -> Vec<PackRuleSuggestion> {
    let wanted = canonical_family(family);
    let unique: BTreeSet<(String, String)> = rows
        .iter()
        .filter(|row| row.family == wanted)
        .map(|row| domain_to_host_path_prefix(&row.domain))
        .collect();
    let mut suggestions = Vec::with_capacity(unique.len());
    for (host, path_prefix) in unique {
        suggestions.push(PackRuleSuggestion { host, path_prefix });
    }
    suggestions
}
/// Scores how many eligible rows of `family` are covered by `rules`.
///
/// A row is eligible when its family equals the canonicalized `family` and
/// its `known_limit` is exactly `"none"`. It is covered when some rule has
/// the same host and path prefix as the row's domain split by
/// `domain_to_host_path_prefix`.
///
/// The score is `covered / eligible`, or `1.0` when nothing is eligible.
/// Exactly one reason string is produced to explain the outcome.
pub fn synthesize_quality(
    rows: &[LabRow],
    family: &str,
    rules: &[PackRuleSuggestion],
) -> SynthesisQuality {
    let wanted = canonical_family(family);
    let mut eligible_rows = 0usize;
    let mut covered_rows = 0usize;
    for row in rows {
        if row.family != wanted || row.known_limit != "none" {
            continue;
        }
        eligible_rows += 1;
        let (host, path_prefix) = domain_to_host_path_prefix(&row.domain);
        let is_covered = rules
            .iter()
            .any(|rule| rule.host == host && rule.path_prefix == path_prefix);
        if is_covered {
            covered_rows += 1;
        }
    }

    let (score, reason) = if eligible_rows == 0 {
        (
            1.0,
            "no eligible rows (known_limit=none) for family".to_owned(),
        )
    } else {
        let ratio = covered_rows as f64 / eligible_rows as f64;
        let explanation = if covered_rows == eligible_rows {
            "all eligible rows map to synthesized host/path rules".to_owned()
        } else {
            format!("{covered_rows}/{eligible_rows} eligible rows covered by synthesized rules")
        };
        (ratio, explanation)
    };

    SynthesisQuality {
        family: wanted,
        eligible_rows,
        covered_rows,
        score,
        reasons: vec![reason],
    }
}
pub fn scaffold_pack_json(rows: &[LabRow], family: &str) -> Result<String, String> {
let rules = synthesize_rules(rows, family);
let canonical_family = canonical_family(family);
let id = format!("family.{canonical_family}");
let pack_rules = rules
.into_iter()
.map(|rule| {
json!({
"host": rule.host,
"path_prefix": rule.path_prefix,
"manifest": {
"version": "index.idx/v1",
"scope": "/",
"content": {
"main_selector": "main, article, [role='main']"
},
"regions": [],
"fields": [],
"forms": [],
"dates": []
}
})
})
.collect::<Vec<_>>();
let output = json!({
"version": "index.pack/v1",
"id": id,
"rules": pack_rules
});
serde_json::to_string_pretty(&output).map_err(|error| error.to_string())
}
/// Lints a pack JSON document and collects errors and warnings.
///
/// Pack-level errors:
/// - the version is not `index.pack/v1`;
/// - the id is blank.
///
/// Per-rule errors (rules are numbered from 1 in messages):
/// - the host contains `*` or a space, starts with `.`, or has no `.`;
/// - the path prefix does not start with `/` or contains `*`;
/// - the manifest has a string `version` other than `index.idx/v1`
///   (a manifest without a string `version` is not flagged);
/// - `content.main_selector` mentions `script` or `iframe`, ignoring ASCII case;
/// - a field `name` mentions `password`, `token` or `cookie`, ignoring ASCII case.
///
/// A rule whose manifest has no `dates` key produces a warning.
///
/// # Errors
///
/// Returns `Err` only when `input` does not deserialize into a pack
/// document. Lint findings are reported through the returned
/// [`PackLintReport`], never through `Err`.
pub fn lint_pack_json(input: &str) -> Result<PackLintReport, String> {
    let pack = serde_json::from_str::<PackFile>(input)
        .map_err(|error| format!("pack JSON is invalid: {error}"))?;
    let mut errors = Vec::new();
    let mut warnings = Vec::new();

    if pack.version != "index.pack/v1" {
        errors.push(format!("unsupported pack version: {}", pack.version));
    }
    if pack.id.trim().is_empty() {
        errors.push("pack id must not be empty".to_owned());
    }

    for (index, rule) in pack.rules.iter().enumerate() {
        let rule_number = index + 1;
        let manifest = &rule.manifest;

        let host = rule.host.as_str();
        let host_is_invalid = host.contains('*')
            || host.starts_with('.')
            || host.contains(' ')
            || !host.contains('.');
        if host_is_invalid {
            errors.push(format!("rule {rule_number} host is invalid: {host}"));
        }

        let path_prefix = rule.path_prefix.as_str();
        if !path_prefix.starts_with('/') || path_prefix.contains('*') {
            errors.push(format!(
                "rule {rule_number} has invalid path_prefix {path_prefix}"
            ));
        }

        // Only a present, string-typed version is checked.
        if let Some(version) = manifest.get("version").and_then(|value| value.as_str()) {
            if version != "index.idx/v1" {
                errors.push(format!(
                    "rule {rule_number} has unsupported manifest version: {version}"
                ));
            }
        }

        if let Some(selector) = manifest
            .get("content")
            .and_then(|content| content.get("main_selector"))
            .and_then(|selector| selector.as_str())
        {
            let selector_lower = selector.to_ascii_lowercase();
            if selector_lower.contains("script") || selector_lower.contains("iframe") {
                errors.push(format!(
                    "rule {rule_number} has unsafe main_selector: {selector}"
                ));
            }
        }

        // Borrow the `fields` array instead of cloning it: the entries are only read.
        let fields = manifest.get("fields").and_then(|fields| fields.as_array());
        for field in fields.into_iter().flatten() {
            let Some(name) = field.get("name").and_then(|value| value.as_str()) else {
                continue;
            };
            let lower = name.to_ascii_lowercase();
            if lower.contains("password") || lower.contains("token") || lower.contains("cookie") {
                errors.push(format!(
                    "rule {rule_number} field hint is sensitive and unsupported: {name}"
                ));
            }
        }

        if manifest.get("dates").is_none() {
            warnings.push(format!(
                "rule {rule_number} does not define date hints; output may be less consistent"
            ));
        }
    }

    Ok(PackLintReport { errors, warnings })
}
/// Merges override rules into a generated pack and returns pretty JSON.
///
/// Rules are keyed by `(host, path_prefix)`. An override rule replaces the
/// generated rule with the same key, and override rules with new keys are
/// added. The merged rules are emitted in ascending key order. The version
/// and id of the generated pack are kept.
///
/// # Errors
///
/// Returns a message when either input fails to deserialize, when the two
/// pack versions differ, or when the merged pack cannot be serialized.
pub fn merge_pack_overrides(generated: &str, overrides: &str) -> Result<String, String> {
    let mut base: PackFile = serde_json::from_str(generated)
        .map_err(|error| format!("generated pack JSON is invalid: {error}"))?;
    let override_pack: PackFile = serde_json::from_str(overrides)
        .map_err(|error| format!("override pack JSON is invalid: {error}"))?;
    if base.version != override_pack.version {
        return Err("override version must match generated pack version".to_owned());
    }

    // Generated rules are inserted first so that overrides inserted later win.
    let mut merged = BTreeMap::new();
    for rule in base.rules.drain(..).chain(override_pack.rules) {
        let key = (rule.host.clone(), rule.path_prefix.clone());
        merged.insert(key, rule);
    }
    base.rules = merged.into_values().collect();

    serde_json::to_string_pretty(&base).map_err(|error| error.to_string())
}
/// Parses a tier column into a `u8` in the range `0..=5`.
///
/// Surrounding whitespace is ignored. `source` and `line_number` are used
/// only to build the error message.
///
/// # Errors
///
/// Returns a message when the trimmed value is not a valid `u8` or when it
/// is greater than 5.
fn parse_tier(value: &str, source: &str, line_number: usize) -> Result<u8, String> {
    match value.trim().parse::<u8>() {
        Ok(tier) if tier <= 5 => Ok(tier),
        Ok(tier) => Err(format!(
            "invalid {source} tier at line {line_number}: {tier} (expected 0..=5)"
        )),
        Err(error) => Err(format!(
            "invalid {source} tier at line {line_number}: {error}"
        )),
    }
}
/// Normalizes a family label.
///
/// The value is trimmed and ASCII-lowercased. The aliases `reddit` and
/// `generic-forum` both map to `social-community`; any other label is
/// returned in its normalized form.
fn canonical_family(value: &str) -> String {
    let normalized = value.trim().to_ascii_lowercase();
    if matches!(normalized.as_str(), "reddit" | "generic-forum") {
        "social-community".to_owned()
    } else {
        normalized
    }
}
/// Splits a matrix domain entry into a lowercase host and a path prefix.
///
/// The entry is trimmed and split at the first `/`. The part before it is
/// trimmed and ASCII-lowercased to form the host. The remainder, with any
/// leading slashes removed, is prefixed with a single `/` to form the path.
/// An entry without `/` yields the path `/`.
///
/// NOTE(review): scheme-prefixed input such as `https://host/x` is split at
/// the first `/` like any other entry; callers appear to pass bare
/// `host[/path]` values — confirm.
fn domain_to_host_path_prefix(domain: &str) -> (String, String) {
    let entry = domain.trim();
    match entry.split_once('/') {
        Some((host, rest)) => (
            host.trim().to_ascii_lowercase(),
            format!("/{}", rest.trim_start_matches('/')),
        ),
        None => (entry.to_ascii_lowercase(), "/".to_owned()),
    }
}
// Unit tests for matrix parsing, ingest ordering, rule synthesis, pack
// scaffolding, linting and override merging. Matrix fixtures are inline
// tab-separated strings whose first line is a `#` header comment.
#[cfg(test)]
mod tests {
use super::{
ingest_summary, lint_pack_json, merge_pack_overrides, parse_forum_matrix,
parse_top100_matrix, scaffold_pack_json, synthesize_quality, synthesize_rules,
};
// The `reddit` and `generic-forum` family labels must both be canonicalized
// to `social-community`, and the `#` header line must be skipped.
#[test]
fn parses_matrix_rows_and_canonicalizes_families() -> Result<(), Box<dyn std::error::Error>> {
let top100 = "# domain\tfamily\tprimary_intent\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nreddit.example\treddit\tfeed-or-thread\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
let forum = "# domain\tfamily\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nforum.example\tgeneric-forum\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
let parsed_top = parse_top100_matrix(top100)?;
let parsed_forum = parse_forum_matrix(forum)?;
assert_eq!(parsed_top.len(), 1);
assert_eq!(parsed_forum.len(), 1);
assert_eq!(parsed_top[0].family, "social-community");
assert_eq!(parsed_forum[0].family, "social-community");
Ok(())
}
// Swapping the two top100 data rows must not change the summary rows or
// the per-family counts.
#[test]
fn ingest_summary_is_deterministic_for_row_order() -> Result<(), Box<dyn std::error::Error>> {
let top100_a = "# domain\tfamily\tprimary_intent\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nb.example\tsearch-portal\tsearch-results\t1\t1\ta.html\tgeneric\tpartial\tnone\na.example\tsearch-portal\tsearch-results\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
let top100_b = "# domain\tfamily\tprimary_intent\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\na.example\tsearch-portal\tsearch-results\t1\t1\ta.html\tgeneric\tpartial\tnone\nb.example\tsearch-portal\tsearch-results\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
let forum = "# domain\tfamily\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nforum.example\tlegacy-forum\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
let summary_a = ingest_summary(top100_a, forum, &[])?;
let summary_b = ingest_summary(top100_b, forum, &[])?;
assert_eq!(summary_a.rows, summary_b.rows);
assert_eq!(summary_a.family_counts, summary_b.family_counts);
Ok(())
}
// Two distinct paths on the same host yield two rules, and scaffolding the
// same rows twice produces identical JSON.
#[test]
fn synthesize_and_scaffold_are_deterministic() -> Result<(), Box<dyn std::error::Error>> {
let top100 = "# domain\tfamily\tprimary_intent\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nexample.org/docs\tknowledge-reference\tarticle\t1\t1\ta.html\tgeneric\tpartial\tnone\nexample.org/help\tknowledge-reference\tarticle\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
let forum = "# domain\tfamily\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nforum.example\tlegacy-forum\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
let summary = ingest_summary(top100, forum, &[])?;
let rules = synthesize_rules(&summary.rows, "knowledge-reference");
assert_eq!(rules.len(), 2);
let scaffold_a = scaffold_pack_json(&summary.rows, "knowledge-reference")?;
let scaffold_b = scaffold_pack_json(&summary.rows, "knowledge-reference")?;
assert_eq!(scaffold_a, scaffold_b);
assert!(scaffold_a.contains("\"version\": \"index.pack/v1\""));
Ok(())
}
// A wildcard path prefix, a selector mentioning `script` and a field named
// `auth_token` must each produce a lint error.
#[test]
fn lint_rejects_unsafe_selectors_and_sensitive_fields() -> Result<(), Box<dyn std::error::Error>>
{
let report = lint_pack_json(
r#"{
"version": "index.pack/v1",
"id": "unsafe-pack",
"rules": [
{
"host": "example.org",
"path_prefix": "/docs*",
"manifest": {
"content": { "main_selector": "main script" },
"fields": [{ "name": "auth_token" }]
}
}
]
}"#,
)?;
assert!(!report.passed());
assert!(
report
.errors
.iter()
.any(|error| error.contains("invalid path_prefix"))
);
assert!(
report
.errors
.iter()
.any(|error| error.contains("unsafe main_selector"))
);
assert!(
report
.errors
.iter()
.any(|error| error.contains("sensitive"))
);
Ok(())
}
// A `*.` wildcard host and a manifest version other than `index.idx/v1`
// must each produce a lint error.
#[test]
fn lint_rejects_wildcard_hosts_and_manifest_version_mismatch()
-> Result<(), Box<dyn std::error::Error>> {
let report = lint_pack_json(
r#"{
"version": "index.pack/v1",
"id": "unsafe-hosts",
"rules": [
{
"host": "*.example.org",
"path_prefix": "/docs",
"manifest": {
"version": "index.idx/v2",
"content": { "main_selector": "main article" }
}
}
]
}"#,
)?;
assert!(!report.passed());
assert!(
report
.errors
.iter()
.any(|error| error.contains("host is invalid"))
);
assert!(
report
.errors
.iter()
.any(|error| error.contains("unsupported manifest version"))
);
Ok(())
}
// Rules synthesized from the same rows cover every eligible row, so the
// score is 1.0 and the reason reports full coverage.
#[test]
fn synthesis_quality_scores_eligible_row_coverage() -> Result<(), Box<dyn std::error::Error>> {
let top100 = "# domain\tfamily\tprimary_intent\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nexample.org/docs\tknowledge-reference\tarticle\t1\t1\ta.html\tgeneric\tpartial\tnone\nexample.org/help\tknowledge-reference\tarticle\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
let forum = "# domain\tfamily\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nforum.example\tlegacy-forum\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
let summary = ingest_summary(top100, forum, &[])?;
let rules = synthesize_rules(&summary.rows, "knowledge-reference");
let quality = synthesize_quality(&summary.rows, "knowledge-reference", &rules);
assert_eq!(quality.eligible_rows, 2);
assert_eq!(quality.covered_rows, 2);
assert!((quality.score - 1.0).abs() < f64::EPSILON);
assert!(
quality
.reasons
.iter()
.any(|reason| reason.contains("all eligible rows"))
);
Ok(())
}
// The override for `example.org` + `/docs` replaces the generated rule (the
// `article` selector survives), and the new `example.net` rule is added.
#[test]
fn merge_overrides_replaces_matching_rules() -> Result<(), Box<dyn std::error::Error>> {
let generated = r#"{
"version": "index.pack/v1",
"id": "family.docs",
"rules": [
{ "host": "example.org", "path_prefix": "/docs", "manifest": {"content":{"main_selector":"main"}} }
]
}"#;
let overrides = r#"{
"version": "index.pack/v1",
"id": "family.docs",
"rules": [
{ "host": "example.org", "path_prefix": "/docs", "manifest": {"content":{"main_selector":"article"}} },
{ "host": "example.net", "path_prefix": "/", "manifest": {"content":{"main_selector":"main"}} }
]
}"#;
let merged = merge_pack_overrides(generated, overrides)?;
assert!(merged.contains("\"example.org\""));
assert!(merged.contains("\"example.net\""));
assert!(merged.contains("\"article\""));
Ok(())
}
}