1use std::collections::{BTreeMap, BTreeSet};
4
5use index_capture::validate_capture_bundle;
6use serde::{Deserialize, Serialize};
7use serde_json::json;
8
/// Identifies which input matrix a `LabRow` was parsed from.
///
/// The derived ordering follows declaration order, so `Top100` sorts before `Forum`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum CorpusSource {
    /// The row was parsed from the top-100 matrix.
    Top100,
    /// The row was parsed from the forum matrix.
    Forum,
}

impl CorpusSource {
    /// Returns the lowercase label of this source: `"top100"` or `"forum"`.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            CorpusSource::Top100 => "top100",
            CorpusSource::Forum => "forum",
        }
    }
}
28
29#[derive(Debug, Clone, PartialEq, Eq)]
31pub struct LabRow {
32 pub source: CorpusSource,
34 pub domain: String,
36 pub family: String,
38 pub intent: Option<String>,
40 pub current_tier: u8,
42 pub known_limit: String,
44}
45
46#[derive(Debug, Clone, PartialEq, Eq)]
48pub struct IngestSummary {
49 pub rows: Vec<LabRow>,
51 pub captures_total: usize,
53 pub family_counts: Vec<(String, usize)>,
55}
56
57#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
59pub struct PackRuleSuggestion {
60 pub host: String,
62 pub path_prefix: String,
64}
65
/// Findings produced by linting a pack document.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct PackLintReport {
    /// Problems that make the pack fail the lint.
    pub errors: Vec<String>,
    /// Advisory findings that do not fail the lint.
    pub warnings: Vec<String>,
}

impl PackLintReport {
    /// Returns `true` when no errors were recorded; warnings are not considered.
    #[must_use]
    pub fn passed(&self) -> bool {
        let Self { errors, .. } = self;
        errors.is_empty()
    }
}
82
/// Coverage score describing how well synthesized rules cover a family's eligible rows.
#[derive(Clone, Debug, PartialEq)]
pub struct SynthesisQuality {
    /// Canonical family name the score applies to.
    pub family: String,
    /// Number of rows in the family whose known limit is `"none"`.
    pub eligible_rows: usize,
    /// Number of eligible rows matched by at least one rule.
    pub covered_rows: usize,
    /// `covered_rows / eligible_rows`, or `1.0` when there are no eligible rows.
    pub score: f64,
    /// Human-readable explanation of the score.
    pub reasons: Vec<String>,
}
97
98#[derive(Debug, Clone, Serialize, Deserialize)]
99struct PackFile {
100 version: String,
101 id: String,
102 #[serde(default)]
103 rules: Vec<PackRule>,
104}
105
106#[derive(Debug, Clone, Serialize, Deserialize)]
107struct PackRule {
108 host: String,
109 path_prefix: String,
110 manifest: serde_json::Value,
111}
112
113pub fn parse_top100_matrix(input: &str) -> Result<Vec<LabRow>, String> {
115 let mut rows = Vec::new();
116 for (line_number, line) in input.lines().enumerate() {
117 let trimmed = line.trim();
118 if trimmed.is_empty() || trimmed.starts_with('#') {
119 continue;
120 }
121 let fields = trimmed.split('\t').collect::<Vec<_>>();
122 if fields.len() < 9 {
123 return Err(format!(
124 "invalid top100 row at line {}: expected 9 fields, got {}",
125 line_number + 1,
126 fields.len()
127 ));
128 }
129 let current_tier = parse_tier(fields[4], "top100", line_number + 1)?;
130 rows.push(LabRow {
131 source: CorpusSource::Top100,
132 domain: fields[0].trim().to_owned(),
133 family: canonical_family(fields[1]),
134 intent: Some(fields[2].trim().to_owned()),
135 current_tier,
136 known_limit: fields[8].trim().to_owned(),
137 });
138 }
139 Ok(rows)
140}
141
142pub fn parse_forum_matrix(input: &str) -> Result<Vec<LabRow>, String> {
144 let mut rows = Vec::new();
145 for (line_number, line) in input.lines().enumerate() {
146 let trimmed = line.trim();
147 if trimmed.is_empty() || trimmed.starts_with('#') {
148 continue;
149 }
150 let fields = trimmed.split('\t').collect::<Vec<_>>();
151 if fields.len() < 8 {
152 return Err(format!(
153 "invalid forum row at line {}: expected 8 fields, got {}",
154 line_number + 1,
155 fields.len()
156 ));
157 }
158 let current_tier = parse_tier(fields[3], "forum", line_number + 1)?;
159 rows.push(LabRow {
160 source: CorpusSource::Forum,
161 domain: fields[0].trim().to_owned(),
162 family: canonical_family(fields[1]),
163 intent: None,
164 current_tier,
165 known_limit: fields[7].trim().to_owned(),
166 });
167 }
168 Ok(rows)
169}
170
171pub fn ingest_summary(
173 top100_matrix: &str,
174 forum_matrix: &str,
175 capture_artifacts: &[String],
176) -> Result<IngestSummary, String> {
177 let mut rows = parse_top100_matrix(top100_matrix)?;
178 rows.extend(parse_forum_matrix(forum_matrix)?);
179 rows.sort_by(|left, right| {
180 (
181 left.family.as_str(),
182 left.domain.as_str(),
183 left.source.as_str(),
184 left.intent.as_deref().unwrap_or(""),
185 )
186 .cmp(&(
187 right.family.as_str(),
188 right.domain.as_str(),
189 right.source.as_str(),
190 right.intent.as_deref().unwrap_or(""),
191 ))
192 });
193
194 let mut captures_total = 0usize;
195 for artifact in capture_artifacts {
196 validate_capture_bundle(artifact).map_err(|error| error.to_string())?;
197 captures_total = captures_total.saturating_add(1);
198 }
199
200 let mut counts = BTreeMap::<String, usize>::new();
201 for row in &rows {
202 let counter = counts.entry(row.family.clone()).or_default();
203 *counter = counter.saturating_add(1);
204 }
205 let family_counts = counts.into_iter().collect::<Vec<_>>();
206
207 Ok(IngestSummary {
208 rows,
209 captures_total,
210 family_counts,
211 })
212}
213
214pub fn synthesize_rules(rows: &[LabRow], family: &str) -> Vec<PackRuleSuggestion> {
216 let family = canonical_family(family);
217 let mut entries = BTreeSet::new();
218 for row in rows {
219 if row.family != family {
220 continue;
221 }
222 let (host, path_prefix) = domain_to_host_path_prefix(&row.domain);
223 entries.insert((host, path_prefix));
224 }
225 entries
226 .into_iter()
227 .map(|(host, path_prefix)| PackRuleSuggestion { host, path_prefix })
228 .collect()
229}
230
231pub fn synthesize_quality(
233 rows: &[LabRow],
234 family: &str,
235 rules: &[PackRuleSuggestion],
236) -> SynthesisQuality {
237 let family = canonical_family(family);
238 let eligible = rows
239 .iter()
240 .filter(|row| row.family == family && row.known_limit == "none")
241 .collect::<Vec<_>>();
242
243 let covered_rows = eligible
244 .iter()
245 .filter(|row| {
246 let (host, path_prefix) = domain_to_host_path_prefix(&row.domain);
247 rules
248 .iter()
249 .any(|rule| rule.host == host && rule.path_prefix == path_prefix)
250 })
251 .count();
252 let eligible_rows = eligible.len();
253 let score = if eligible_rows == 0 {
254 1.0
255 } else {
256 covered_rows as f64 / eligible_rows as f64
257 };
258 let mut reasons = Vec::new();
259 if eligible_rows == 0 {
260 reasons.push("no eligible rows (known_limit=none) for family".to_owned());
261 } else if covered_rows == eligible_rows {
262 reasons.push("all eligible rows map to synthesized host/path rules".to_owned());
263 } else {
264 reasons.push(format!(
265 "{}/{} eligible rows covered by synthesized rules",
266 covered_rows, eligible_rows
267 ));
268 }
269 SynthesisQuality {
270 family,
271 eligible_rows,
272 covered_rows,
273 score,
274 reasons,
275 }
276}
277
278pub fn scaffold_pack_json(rows: &[LabRow], family: &str) -> Result<String, String> {
280 let rules = synthesize_rules(rows, family);
281 let canonical_family = canonical_family(family);
282 let id = format!("family.{canonical_family}");
283 let pack_rules = rules
284 .into_iter()
285 .map(|rule| {
286 json!({
287 "host": rule.host,
288 "path_prefix": rule.path_prefix,
289 "manifest": {
290 "version": "index.idx/v1",
291 "scope": "/",
292 "content": {
293 "main_selector": "main, article, [role='main']"
294 },
295 "regions": [],
296 "fields": [],
297 "forms": [],
298 "dates": []
299 }
300 })
301 })
302 .collect::<Vec<_>>();
303 let output = json!({
304 "version": "index.pack/v1",
305 "id": id,
306 "rules": pack_rules
307 });
308 serde_json::to_string_pretty(&output).map_err(|error| error.to_string())
309}
310
311pub fn lint_pack_json(input: &str) -> Result<PackLintReport, String> {
313 let pack = serde_json::from_str::<PackFile>(input)
314 .map_err(|error| format!("pack JSON is invalid: {error}"))?;
315 let mut errors = Vec::new();
316 let mut warnings = Vec::new();
317 if pack.version != "index.pack/v1" {
318 errors.push(format!("unsupported pack version: {}", pack.version));
319 }
320 if pack.id.trim().is_empty() {
321 errors.push("pack id must not be empty".to_owned());
322 }
323
324 for (index, rule) in pack.rules.iter().enumerate() {
325 if rule.host.contains('*')
326 || rule.host.starts_with('.')
327 || rule.host.contains(' ')
328 || !rule.host.contains('.')
329 {
330 errors.push(format!("rule {} host is invalid: {}", index + 1, rule.host));
331 }
332 if !rule.path_prefix.starts_with('/') || rule.path_prefix.contains('*') {
333 errors.push(format!(
334 "rule {} has invalid path_prefix {}",
335 index + 1,
336 rule.path_prefix
337 ));
338 }
339 if let Some(version) = rule
340 .manifest
341 .get("version")
342 .and_then(|value| value.as_str())
343 {
344 if version != "index.idx/v1" {
345 errors.push(format!(
346 "rule {} has unsupported manifest version: {}",
347 index + 1,
348 version
349 ));
350 }
351 }
352 if let Some(selector) = rule
353 .manifest
354 .get("content")
355 .and_then(|content| content.get("main_selector"))
356 .and_then(|selector| selector.as_str())
357 {
358 let selector_lower = selector.to_ascii_lowercase();
359 if selector_lower.contains("script") || selector_lower.contains("iframe") {
360 errors.push(format!(
361 "rule {} has unsafe main_selector: {}",
362 index + 1,
363 selector
364 ));
365 }
366 }
367 let field_names = rule
368 .manifest
369 .get("fields")
370 .and_then(|fields| fields.as_array())
371 .cloned()
372 .unwrap_or_default();
373 for field in field_names {
374 if let Some(name) = field.get("name").and_then(|value| value.as_str()) {
375 let lower = name.to_ascii_lowercase();
376 if lower.contains("password") || lower.contains("token") || lower.contains("cookie")
377 {
378 errors.push(format!(
379 "rule {} field hint is sensitive and unsupported: {}",
380 index + 1,
381 name
382 ));
383 }
384 }
385 }
386 if rule.manifest.get("dates").is_none() {
387 warnings.push(format!(
388 "rule {} does not define date hints; output may be less consistent",
389 index + 1
390 ));
391 }
392 }
393
394 Ok(PackLintReport { errors, warnings })
395}
396
397pub fn merge_pack_overrides(generated: &str, overrides: &str) -> Result<String, String> {
399 let mut base = serde_json::from_str::<PackFile>(generated)
400 .map_err(|error| format!("generated pack JSON is invalid: {error}"))?;
401 let override_pack = serde_json::from_str::<PackFile>(overrides)
402 .map_err(|error| format!("override pack JSON is invalid: {error}"))?;
403
404 if base.version != override_pack.version {
405 return Err("override version must match generated pack version".to_owned());
406 }
407
408 let mut by_key = BTreeMap::new();
409 for rule in base.rules {
410 by_key.insert((rule.host.clone(), rule.path_prefix.clone()), rule);
411 }
412 for rule in override_pack.rules {
413 by_key.insert((rule.host.clone(), rule.path_prefix.clone()), rule);
414 }
415 base.rules = by_key.into_values().collect();
416
417 serde_json::to_string_pretty(&base).map_err(|error| error.to_string())
418}
419
/// Parses a tier column and checks that it lies in `0..=5`.
///
/// `value` is trimmed before parsing. `source` and `line_number` are used only to build
/// the error message.
///
/// # Errors
///
/// Returns a message when the value is not a valid `u8` or is greater than 5.
fn parse_tier(value: &str, source: &str, line_number: usize) -> Result<u8, String> {
    match value.trim().parse::<u8>() {
        Ok(parsed) if parsed <= 5 => Ok(parsed),
        Ok(parsed) => Err(format!(
            "invalid {source} tier at line {line_number}: {parsed} (expected 0..=5)"
        )),
        Err(error) => Err(format!(
            "invalid {source} tier at line {line_number}: {error}"
        )),
    }
}
432
/// Normalizes a family name: trims it, lowercases ASCII letters, and folds the aliases
/// `reddit` and `generic-forum` into `social-community`.
fn canonical_family(value: &str) -> String {
    let normalized = value.trim().to_ascii_lowercase();
    if matches!(normalized.as_str(), "reddit" | "generic-forum") {
        String::from("social-community")
    } else {
        normalized
    }
}
439
/// Splits a domain column into a lowercased host and a path prefix.
///
/// The input is trimmed and split at the first `/`. The host part is trimmed and
/// ASCII-lowercased; the remainder has its leading slashes collapsed into a single `/`
/// and keeps its case. Without any `/`, the whole value is the host and the prefix is `/`.
fn domain_to_host_path_prefix(domain: &str) -> (String, String) {
    let trimmed = domain.trim();
    match trimmed.split_once('/') {
        Some((host, path)) => (
            host.trim().to_ascii_lowercase(),
            // An empty remainder naturally yields "/".
            format!("/{}", path.trim_start_matches('/')),
        ),
        None => (trimmed.to_ascii_lowercase(), String::from("/")),
    }
}
455
#[cfg(test)]
mod tests {
    use super::{
        ingest_summary, lint_pack_json, merge_pack_overrides, parse_forum_matrix,
        parse_top100_matrix, scaffold_pack_json, synthesize_quality, synthesize_rules,
    };

    /// Result type shared by all tests so that `?` can be used on fallible calls.
    type TestResult = Result<(), Box<dyn std::error::Error>>;

    /// Comment header line of the top-100 matrix fixtures.
    const TOP100_HEADER: &str = "# domain\tfamily\tprimary_intent\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit";
    /// Comment header line of the forum matrix fixtures.
    const FORUM_HEADER: &str =
        "# domain\tfamily\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit";

    const DOCS_ROW: &str =
        "example.org/docs\tknowledge-reference\tarticle\t1\t1\ta.html\tgeneric\tpartial\tnone";
    const HELP_ROW: &str =
        "example.org/help\tknowledge-reference\tarticle\t1\t1\ta.html\tgeneric\tpartial\tnone";
    const LEGACY_FORUM_ROW: &str =
        "forum.example\tlegacy-forum\t1\t1\ta.html\tgeneric\tpartial\tnone";

    /// Builds a matrix text from a header and data rows, one newline-terminated line each.
    fn matrix(header: &str, rows: &[&str]) -> String {
        let mut text = String::new();
        for line in std::iter::once(&header).chain(rows) {
            text.push_str(line);
            text.push('\n');
        }
        text
    }

    /// Reports whether any lint error message contains `needle`.
    fn has_error(errors: &[String], needle: &str) -> bool {
        errors.iter().any(|error| error.contains(needle))
    }

    #[test]
    fn parses_matrix_rows_and_canonicalizes_families() -> TestResult {
        let top100 = matrix(
            TOP100_HEADER,
            &["reddit.example\treddit\tfeed-or-thread\t1\t1\ta.html\tgeneric\tpartial\tnone"],
        );
        let forum = matrix(
            FORUM_HEADER,
            &["forum.example\tgeneric-forum\t1\t1\ta.html\tgeneric\tpartial\tnone"],
        );

        let parsed_top = parse_top100_matrix(&top100)?;
        let parsed_forum = parse_forum_matrix(&forum)?;

        assert_eq!(parsed_top.len(), 1);
        assert_eq!(parsed_forum.len(), 1);
        assert_eq!(parsed_top[0].family, "social-community");
        assert_eq!(parsed_forum[0].family, "social-community");
        Ok(())
    }

    #[test]
    fn ingest_summary_is_deterministic_for_row_order() -> TestResult {
        let row_a = "a.example\tsearch-portal\tsearch-results\t1\t1\ta.html\tgeneric\tpartial\tnone";
        let row_b = "b.example\tsearch-portal\tsearch-results\t1\t1\ta.html\tgeneric\tpartial\tnone";
        let top100_a = matrix(TOP100_HEADER, &[row_b, row_a]);
        let top100_b = matrix(TOP100_HEADER, &[row_a, row_b]);
        let forum = matrix(FORUM_HEADER, &[LEGACY_FORUM_ROW]);

        let summary_a = ingest_summary(&top100_a, &forum, &[])?;
        let summary_b = ingest_summary(&top100_b, &forum, &[])?;

        assert_eq!(summary_a.rows, summary_b.rows);
        assert_eq!(summary_a.family_counts, summary_b.family_counts);
        Ok(())
    }

    #[test]
    fn synthesize_and_scaffold_are_deterministic() -> TestResult {
        let top100 = matrix(TOP100_HEADER, &[DOCS_ROW, HELP_ROW]);
        let forum = matrix(FORUM_HEADER, &[LEGACY_FORUM_ROW]);
        let summary = ingest_summary(&top100, &forum, &[])?;

        let rules = synthesize_rules(&summary.rows, "knowledge-reference");
        assert_eq!(rules.len(), 2);

        let scaffold_a = scaffold_pack_json(&summary.rows, "knowledge-reference")?;
        let scaffold_b = scaffold_pack_json(&summary.rows, "knowledge-reference")?;
        assert_eq!(scaffold_a, scaffold_b);
        assert!(scaffold_a.contains("\"version\": \"index.pack/v1\""));
        Ok(())
    }

    #[test]
    fn lint_rejects_unsafe_selectors_and_sensitive_fields() -> TestResult {
        let report = lint_pack_json(
            r#"{
  "version": "index.pack/v1",
  "id": "unsafe-pack",
  "rules": [
    {
      "host": "example.org",
      "path_prefix": "/docs*",
      "manifest": {
        "content": { "main_selector": "main script" },
        "fields": [{ "name": "auth_token" }]
      }
    }
  ]
}"#,
        )?;

        assert!(!report.passed());
        assert!(has_error(&report.errors, "invalid path_prefix"));
        assert!(has_error(&report.errors, "unsafe main_selector"));
        assert!(has_error(&report.errors, "sensitive"));
        Ok(())
    }

    #[test]
    fn lint_rejects_wildcard_hosts_and_manifest_version_mismatch() -> TestResult {
        let report = lint_pack_json(
            r#"{
  "version": "index.pack/v1",
  "id": "unsafe-hosts",
  "rules": [
    {
      "host": "*.example.org",
      "path_prefix": "/docs",
      "manifest": {
        "version": "index.idx/v2",
        "content": { "main_selector": "main article" }
      }
    }
  ]
}"#,
        )?;

        assert!(!report.passed());
        assert!(has_error(&report.errors, "host is invalid"));
        assert!(has_error(&report.errors, "unsupported manifest version"));
        Ok(())
    }

    #[test]
    fn synthesis_quality_scores_eligible_row_coverage() -> TestResult {
        let top100 = matrix(TOP100_HEADER, &[DOCS_ROW, HELP_ROW]);
        let forum = matrix(FORUM_HEADER, &[LEGACY_FORUM_ROW]);
        let summary = ingest_summary(&top100, &forum, &[])?;

        let rules = synthesize_rules(&summary.rows, "knowledge-reference");
        let quality = synthesize_quality(&summary.rows, "knowledge-reference", &rules);

        assert_eq!(quality.eligible_rows, 2);
        assert_eq!(quality.covered_rows, 2);
        assert!((quality.score - 1.0).abs() < f64::EPSILON);
        assert!(
            quality
                .reasons
                .iter()
                .any(|reason| reason.contains("all eligible rows"))
        );
        Ok(())
    }

    #[test]
    fn merge_overrides_replaces_matching_rules() -> TestResult {
        let generated = r#"{
  "version": "index.pack/v1",
  "id": "family.docs",
  "rules": [
    { "host": "example.org", "path_prefix": "/docs", "manifest": {"content":{"main_selector":"main"}} }
  ]
}"#;
        let overrides = r#"{
  "version": "index.pack/v1",
  "id": "family.docs",
  "rules": [
    { "host": "example.org", "path_prefix": "/docs", "manifest": {"content":{"main_selector":"article"}} },
    { "host": "example.net", "path_prefix": "/", "manifest": {"content":{"main_selector":"main"}} }
  ]
}"#;

        let merged = merge_pack_overrides(generated, overrides)?;

        for expected in ["\"example.org\"", "\"example.net\"", "\"article\""] {
            assert!(merged.contains(expected));
        }
        Ok(())
    }
}