Skip to main content

index_compat_lab/
lib.rs

1//! Offline compatibility artifact synthesis and lint helpers.
2
3use std::collections::{BTreeMap, BTreeSet};
4
5use index_capture::validate_capture_bundle;
6use serde::{Deserialize, Serialize};
7use serde_json::json;
8
/// Identifies which corpus matrix a compatibility row was read from.
///
/// The derived ordering follows declaration order (`Top100` before `Forum`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum CorpusSource {
    /// Row originates from the top-100 corpus matrix.
    Top100,
    /// Row originates from the forum corpus matrix.
    Forum,
}

impl CorpusSource {
    /// Returns the fixed lowercase label for this source.
    ///
    /// The label is a `'static` string, so it can be used as a sort key or
    /// embedded in messages without allocating.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            CorpusSource::Top100 => "top100",
            CorpusSource::Forum => "forum",
        }
    }
}
28
/// Ingested compatibility row.
///
/// One value is produced per non-blank, non-comment matrix line by
/// `parse_top100_matrix` and `parse_forum_matrix`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LabRow {
    /// Row source.
    pub source: CorpusSource,
    /// Domain identifier from matrix (whitespace-trimmed by the parsers; may
    /// carry a `/path` suffix, which `domain_to_host_path_prefix` splits off).
    pub domain: String,
    /// Family identifier from matrix, after `canonical_family` normalization.
    pub family: String,
    /// Optional intent identifier (`Some` for top-100 rows, `None` for forum
    /// rows as produced by the parsers in this file).
    pub intent: Option<String>,
    /// Current support tier (the parsers accept only `0..=5`).
    pub current_tier: u8,
    /// Known limit category; the literal `"none"` marks a row as eligible in
    /// `synthesize_quality`.
    pub known_limit: String,
}
45
/// Ingest summary.
///
/// Returned by `ingest_summary` after both matrices are parsed and every
/// capture artifact has been validated.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IngestSummary {
    /// Rows parsed from corpus matrices (top-100 and forum combined, sorted by
    /// `ingest_summary`).
    pub rows: Vec<LabRow>,
    /// Validated capture artifact count.
    pub captures_total: usize,
    /// Rows grouped by family, as `(family, row count)` pairs in ascending
    /// family order.
    pub family_counts: Vec<(String, usize)>,
}
56
/// Pack-rule suggestion.
///
/// A `(host, path_prefix)` pair; `synthesize_rules` emits one per distinct
/// pair derived from the matching rows' domains.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct PackRuleSuggestion {
    /// Host matcher.
    pub host: String,
    /// Path-prefix matcher (`"/"` when the source domain had no path part).
    pub path_prefix: String,
}
65
/// Outcome of linting a pack document.
///
/// Errors decide whether the lint passed; warnings are informational only.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PackLintReport {
    /// Problems that make the pack fail the lint.
    pub errors: Vec<String>,
    /// Non-fatal observations collected during the lint.
    pub warnings: Vec<String>,
}

impl PackLintReport {
    /// Reports whether the lint finished without any errors.
    ///
    /// Warnings never affect the result.
    #[must_use]
    pub fn passed(&self) -> bool {
        // Destructure explicitly so it is visible that warnings are ignored.
        let Self { errors, .. } = self;
        errors.is_empty()
    }
}
82
/// Deterministic synthesis quality report.
///
/// Produced by `synthesize_quality` for a single family. Only `PartialEq` is
/// derived (not `Eq`) because `score` is a floating-point value.
#[derive(Debug, Clone, PartialEq)]
pub struct SynthesisQuality {
    /// Family evaluated (in canonical form).
    pub family: String,
    /// Rows eligible for deterministic pack coverage scoring (family matches
    /// and `known_limit` is `"none"`).
    pub eligible_rows: usize,
    /// Eligible rows covered by generated rules.
    pub covered_rows: usize,
    /// Coverage ratio in the range `[0.0, 1.0]`; `1.0` when there are no
    /// eligible rows.
    pub score: f64,
    /// Human-readable deterministic reasons.
    pub reasons: Vec<String>,
}
97
/// Serde model of a pack document, used by `lint_pack_json` and
/// `merge_pack_overrides` to read (and, for merging, re-serialize) pack JSON.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PackFile {
    /// Pack format version string (the lint accepts only `index.pack/v1`).
    version: String,
    /// Pack identifier (the lint rejects blank values).
    id: String,
    /// Rules in the pack; a missing `rules` key deserializes to an empty list.
    #[serde(default)]
    rules: Vec<PackRule>,
}
105
/// Serde model of one rule inside a pack document.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PackRule {
    /// Host matcher.
    host: String,
    /// Path-prefix matcher.
    path_prefix: String,
    /// Manifest payload, kept as untyped JSON; `lint_pack_json` inspects its
    /// `version`, `content.main_selector`, `fields`, and `dates` entries.
    manifest: serde_json::Value,
}
112
113/// Parses top-100 matrix rows relevant to compatibility lab tasks.
114pub fn parse_top100_matrix(input: &str) -> Result<Vec<LabRow>, String> {
115    let mut rows = Vec::new();
116    for (line_number, line) in input.lines().enumerate() {
117        let trimmed = line.trim();
118        if trimmed.is_empty() || trimmed.starts_with('#') {
119            continue;
120        }
121        let fields = trimmed.split('\t').collect::<Vec<_>>();
122        if fields.len() < 9 {
123            return Err(format!(
124                "invalid top100 row at line {}: expected 9 fields, got {}",
125                line_number + 1,
126                fields.len()
127            ));
128        }
129        let current_tier = parse_tier(fields[4], "top100", line_number + 1)?;
130        rows.push(LabRow {
131            source: CorpusSource::Top100,
132            domain: fields[0].trim().to_owned(),
133            family: canonical_family(fields[1]),
134            intent: Some(fields[2].trim().to_owned()),
135            current_tier,
136            known_limit: fields[8].trim().to_owned(),
137        });
138    }
139    Ok(rows)
140}
141
142/// Parses forum matrix rows relevant to compatibility lab tasks.
143pub fn parse_forum_matrix(input: &str) -> Result<Vec<LabRow>, String> {
144    let mut rows = Vec::new();
145    for (line_number, line) in input.lines().enumerate() {
146        let trimmed = line.trim();
147        if trimmed.is_empty() || trimmed.starts_with('#') {
148            continue;
149        }
150        let fields = trimmed.split('\t').collect::<Vec<_>>();
151        if fields.len() < 8 {
152            return Err(format!(
153                "invalid forum row at line {}: expected 8 fields, got {}",
154                line_number + 1,
155                fields.len()
156            ));
157        }
158        let current_tier = parse_tier(fields[3], "forum", line_number + 1)?;
159        rows.push(LabRow {
160            source: CorpusSource::Forum,
161            domain: fields[0].trim().to_owned(),
162            family: canonical_family(fields[1]),
163            intent: None,
164            current_tier,
165            known_limit: fields[7].trim().to_owned(),
166        });
167    }
168    Ok(rows)
169}
170
171/// Ingests corpus rows and optional capture artifacts.
172pub fn ingest_summary(
173    top100_matrix: &str,
174    forum_matrix: &str,
175    capture_artifacts: &[String],
176) -> Result<IngestSummary, String> {
177    let mut rows = parse_top100_matrix(top100_matrix)?;
178    rows.extend(parse_forum_matrix(forum_matrix)?);
179    rows.sort_by(|left, right| {
180        (
181            left.family.as_str(),
182            left.domain.as_str(),
183            left.source.as_str(),
184            left.intent.as_deref().unwrap_or(""),
185        )
186            .cmp(&(
187                right.family.as_str(),
188                right.domain.as_str(),
189                right.source.as_str(),
190                right.intent.as_deref().unwrap_or(""),
191            ))
192    });
193
194    let mut captures_total = 0usize;
195    for artifact in capture_artifacts {
196        validate_capture_bundle(artifact).map_err(|error| error.to_string())?;
197        captures_total = captures_total.saturating_add(1);
198    }
199
200    let mut counts = BTreeMap::<String, usize>::new();
201    for row in &rows {
202        let counter = counts.entry(row.family.clone()).or_default();
203        *counter = counter.saturating_add(1);
204    }
205    let family_counts = counts.into_iter().collect::<Vec<_>>();
206
207    Ok(IngestSummary {
208        rows,
209        captures_total,
210        family_counts,
211    })
212}
213
214/// Suggests deterministic host/path rules for a family.
215pub fn synthesize_rules(rows: &[LabRow], family: &str) -> Vec<PackRuleSuggestion> {
216    let family = canonical_family(family);
217    let mut entries = BTreeSet::new();
218    for row in rows {
219        if row.family != family {
220            continue;
221        }
222        let (host, path_prefix) = domain_to_host_path_prefix(&row.domain);
223        entries.insert((host, path_prefix));
224    }
225    entries
226        .into_iter()
227        .map(|(host, path_prefix)| PackRuleSuggestion { host, path_prefix })
228        .collect()
229}
230
231/// Computes deterministic synthesis quality for one family.
232pub fn synthesize_quality(
233    rows: &[LabRow],
234    family: &str,
235    rules: &[PackRuleSuggestion],
236) -> SynthesisQuality {
237    let family = canonical_family(family);
238    let eligible = rows
239        .iter()
240        .filter(|row| row.family == family && row.known_limit == "none")
241        .collect::<Vec<_>>();
242
243    let covered_rows = eligible
244        .iter()
245        .filter(|row| {
246            let (host, path_prefix) = domain_to_host_path_prefix(&row.domain);
247            rules
248                .iter()
249                .any(|rule| rule.host == host && rule.path_prefix == path_prefix)
250        })
251        .count();
252    let eligible_rows = eligible.len();
253    let score = if eligible_rows == 0 {
254        1.0
255    } else {
256        covered_rows as f64 / eligible_rows as f64
257    };
258    let mut reasons = Vec::new();
259    if eligible_rows == 0 {
260        reasons.push("no eligible rows (known_limit=none) for family".to_owned());
261    } else if covered_rows == eligible_rows {
262        reasons.push("all eligible rows map to synthesized host/path rules".to_owned());
263    } else {
264        reasons.push(format!(
265            "{}/{} eligible rows covered by synthesized rules",
266            covered_rows, eligible_rows
267        ));
268    }
269    SynthesisQuality {
270        family,
271        eligible_rows,
272        covered_rows,
273        score,
274        reasons,
275    }
276}
277
278/// Creates a deterministic scaffold pack JSON.
279pub fn scaffold_pack_json(rows: &[LabRow], family: &str) -> Result<String, String> {
280    let rules = synthesize_rules(rows, family);
281    let canonical_family = canonical_family(family);
282    let id = format!("family.{canonical_family}");
283    let pack_rules = rules
284        .into_iter()
285        .map(|rule| {
286            json!({
287                "host": rule.host,
288                "path_prefix": rule.path_prefix,
289                "manifest": {
290                    "version": "index.idx/v1",
291                    "scope": "/",
292                    "content": {
293                        "main_selector": "main, article, [role='main']"
294                    },
295                    "regions": [],
296                    "fields": [],
297                    "forms": [],
298                    "dates": []
299                }
300            })
301        })
302        .collect::<Vec<_>>();
303    let output = json!({
304        "version": "index.pack/v1",
305        "id": id,
306        "rules": pack_rules
307    });
308    serde_json::to_string_pretty(&output).map_err(|error| error.to_string())
309}
310
311/// Lints synthesized or edited pack JSON for common unsafe patterns.
312pub fn lint_pack_json(input: &str) -> Result<PackLintReport, String> {
313    let pack = serde_json::from_str::<PackFile>(input)
314        .map_err(|error| format!("pack JSON is invalid: {error}"))?;
315    let mut errors = Vec::new();
316    let mut warnings = Vec::new();
317    if pack.version != "index.pack/v1" {
318        errors.push(format!("unsupported pack version: {}", pack.version));
319    }
320    if pack.id.trim().is_empty() {
321        errors.push("pack id must not be empty".to_owned());
322    }
323
324    for (index, rule) in pack.rules.iter().enumerate() {
325        if rule.host.contains('*')
326            || rule.host.starts_with('.')
327            || rule.host.contains(' ')
328            || !rule.host.contains('.')
329        {
330            errors.push(format!("rule {} host is invalid: {}", index + 1, rule.host));
331        }
332        if !rule.path_prefix.starts_with('/') || rule.path_prefix.contains('*') {
333            errors.push(format!(
334                "rule {} has invalid path_prefix {}",
335                index + 1,
336                rule.path_prefix
337            ));
338        }
339        if let Some(version) = rule
340            .manifest
341            .get("version")
342            .and_then(|value| value.as_str())
343        {
344            if version != "index.idx/v1" {
345                errors.push(format!(
346                    "rule {} has unsupported manifest version: {}",
347                    index + 1,
348                    version
349                ));
350            }
351        }
352        if let Some(selector) = rule
353            .manifest
354            .get("content")
355            .and_then(|content| content.get("main_selector"))
356            .and_then(|selector| selector.as_str())
357        {
358            let selector_lower = selector.to_ascii_lowercase();
359            if selector_lower.contains("script") || selector_lower.contains("iframe") {
360                errors.push(format!(
361                    "rule {} has unsafe main_selector: {}",
362                    index + 1,
363                    selector
364                ));
365            }
366        }
367        let field_names = rule
368            .manifest
369            .get("fields")
370            .and_then(|fields| fields.as_array())
371            .cloned()
372            .unwrap_or_default();
373        for field in field_names {
374            if let Some(name) = field.get("name").and_then(|value| value.as_str()) {
375                let lower = name.to_ascii_lowercase();
376                if lower.contains("password") || lower.contains("token") || lower.contains("cookie")
377                {
378                    errors.push(format!(
379                        "rule {} field hint is sensitive and unsupported: {}",
380                        index + 1,
381                        name
382                    ));
383                }
384            }
385        }
386        if rule.manifest.get("dates").is_none() {
387            warnings.push(format!(
388                "rule {} does not define date hints; output may be less consistent",
389                index + 1
390            ));
391        }
392    }
393
394    Ok(PackLintReport { errors, warnings })
395}
396
397/// Merges override pack rules into a generated pack.
398pub fn merge_pack_overrides(generated: &str, overrides: &str) -> Result<String, String> {
399    let mut base = serde_json::from_str::<PackFile>(generated)
400        .map_err(|error| format!("generated pack JSON is invalid: {error}"))?;
401    let override_pack = serde_json::from_str::<PackFile>(overrides)
402        .map_err(|error| format!("override pack JSON is invalid: {error}"))?;
403
404    if base.version != override_pack.version {
405        return Err("override version must match generated pack version".to_owned());
406    }
407
408    let mut by_key = BTreeMap::new();
409    for rule in base.rules {
410        by_key.insert((rule.host.clone(), rule.path_prefix.clone()), rule);
411    }
412    for rule in override_pack.rules {
413        by_key.insert((rule.host.clone(), rule.path_prefix.clone()), rule);
414    }
415    base.rules = by_key.into_values().collect();
416
417    serde_json::to_string_pretty(&base).map_err(|error| error.to_string())
418}
419
/// Parses a tier column into a `u8` and checks it lies in `0..=5`.
///
/// `value` is whitespace-trimmed before parsing. `source` and `line_number`
/// are only used to build the error message.
///
/// # Errors
///
/// Returns a message when the text is not a valid `u8` or when the parsed
/// value is greater than 5.
fn parse_tier(value: &str, source: &str, line_number: usize) -> Result<u8, String> {
    match value.trim().parse::<u8>() {
        Ok(tier) if tier <= 5 => Ok(tier),
        Ok(tier) => Err(format!(
            "invalid {source} tier at line {line_number}: {tier} (expected 0..=5)"
        )),
        Err(error) => Err(format!(
            "invalid {source} tier at line {line_number}: {error}"
        )),
    }
}
432
/// Normalizes a family label: trims whitespace, lowercases ASCII letters,
/// and folds the aliases `reddit` and `generic-forum` into
/// `social-community`. Any other label is returned in its normalized form.
fn canonical_family(value: &str) -> String {
    let normalized = value.trim().to_ascii_lowercase();
    if matches!(normalized.as_str(), "reddit" | "generic-forum") {
        return String::from("social-community");
    }
    normalized
}
439
/// Splits a matrix domain entry into a lowercase host and a path prefix.
///
/// The input is trimmed and split at the first `/`. The part before it is
/// trimmed and ASCII-lowercased to form the host. The part after it has any
/// further leading `/` characters removed and is prefixed with a single `/`;
/// its case is left untouched. Without a `/` the whole entry is the host and
/// the prefix is `/`.
fn domain_to_host_path_prefix(domain: &str) -> (String, String) {
    let cleaned = domain.trim();
    match cleaned.split_once('/') {
        Some((host, rest)) => (
            host.trim().to_ascii_lowercase(),
            // An empty remainder naturally yields the root prefix "/".
            format!("/{}", rest.trim_start_matches('/')),
        ),
        None => (cleaned.to_ascii_lowercase(), String::from("/")),
    }
}
455
// Unit tests for the public parsing, synthesis, lint, and merge helpers.
// Matrix fixtures are inline tab-separated strings whose first line is a `#`
// header comment, which the parsers skip.
#[cfg(test)]
mod tests {
    use super::{
        ingest_summary, lint_pack_json, merge_pack_overrides, parse_forum_matrix,
        parse_top100_matrix, scaffold_pack_json, synthesize_quality, synthesize_rules,
    };

    // Both parsers skip the header comment and fold the `reddit` and
    // `generic-forum` aliases into `social-community`.
    #[test]
    fn parses_matrix_rows_and_canonicalizes_families() -> Result<(), Box<dyn std::error::Error>> {
        let top100 = "# domain\tfamily\tprimary_intent\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nreddit.example\treddit\tfeed-or-thread\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
        let forum = "# domain\tfamily\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nforum.example\tgeneric-forum\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
        let parsed_top = parse_top100_matrix(top100)?;
        let parsed_forum = parse_forum_matrix(forum)?;
        assert_eq!(parsed_top.len(), 1);
        assert_eq!(parsed_forum.len(), 1);
        assert_eq!(parsed_top[0].family, "social-community");
        assert_eq!(parsed_forum[0].family, "social-community");
        Ok(())
    }

    // The same rows supplied in a different line order must produce the same
    // sorted rows and the same family counts.
    #[test]
    fn ingest_summary_is_deterministic_for_row_order() -> Result<(), Box<dyn std::error::Error>> {
        let top100_a = "# domain\tfamily\tprimary_intent\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nb.example\tsearch-portal\tsearch-results\t1\t1\ta.html\tgeneric\tpartial\tnone\na.example\tsearch-portal\tsearch-results\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
        let top100_b = "# domain\tfamily\tprimary_intent\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\na.example\tsearch-portal\tsearch-results\t1\t1\ta.html\tgeneric\tpartial\tnone\nb.example\tsearch-portal\tsearch-results\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
        let forum = "# domain\tfamily\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nforum.example\tlegacy-forum\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
        let summary_a = ingest_summary(top100_a, forum, &[])?;
        let summary_b = ingest_summary(top100_b, forum, &[])?;
        assert_eq!(summary_a.rows, summary_b.rows);
        assert_eq!(summary_a.family_counts, summary_b.family_counts);
        Ok(())
    }

    // Two domains on the same host with different paths yield two rules, and
    // scaffolding the same rows twice gives byte-identical JSON.
    #[test]
    fn synthesize_and_scaffold_are_deterministic() -> Result<(), Box<dyn std::error::Error>> {
        let top100 = "# domain\tfamily\tprimary_intent\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nexample.org/docs\tknowledge-reference\tarticle\t1\t1\ta.html\tgeneric\tpartial\tnone\nexample.org/help\tknowledge-reference\tarticle\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
        let forum = "# domain\tfamily\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nforum.example\tlegacy-forum\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
        let summary = ingest_summary(top100, forum, &[])?;
        let rules = synthesize_rules(&summary.rows, "knowledge-reference");
        assert_eq!(rules.len(), 2);
        let scaffold_a = scaffold_pack_json(&summary.rows, "knowledge-reference")?;
        let scaffold_b = scaffold_pack_json(&summary.rows, "knowledge-reference")?;
        assert_eq!(scaffold_a, scaffold_b);
        assert!(scaffold_a.contains("\"version\": \"index.pack/v1\""));
        Ok(())
    }

    // One rule that trips three checks at once: wildcard in the path prefix,
    // `script` in the main selector, and a `token`-bearing field name.
    #[test]
    fn lint_rejects_unsafe_selectors_and_sensitive_fields() -> Result<(), Box<dyn std::error::Error>>
    {
        let report = lint_pack_json(
            r#"{
  "version": "index.pack/v1",
  "id": "unsafe-pack",
  "rules": [
    {
      "host": "example.org",
      "path_prefix": "/docs*",
      "manifest": {
        "content": { "main_selector": "main script" },
        "fields": [{ "name": "auth_token" }]
      }
    }
  ]
}"#,
        )?;
        assert!(!report.passed());
        assert!(
            report
                .errors
                .iter()
                .any(|error| error.contains("invalid path_prefix"))
        );
        assert!(
            report
                .errors
                .iter()
                .any(|error| error.contains("unsafe main_selector"))
        );
        assert!(
            report
                .errors
                .iter()
                .any(|error| error.contains("sensitive"))
        );
        Ok(())
    }

    // A wildcard host and a manifest version other than `index.idx/v1` are
    // both reported as errors.
    #[test]
    fn lint_rejects_wildcard_hosts_and_manifest_version_mismatch()
    -> Result<(), Box<dyn std::error::Error>> {
        let report = lint_pack_json(
            r#"{
  "version": "index.pack/v1",
  "id": "unsafe-hosts",
  "rules": [
    {
      "host": "*.example.org",
      "path_prefix": "/docs",
      "manifest": {
        "version": "index.idx/v2",
        "content": { "main_selector": "main article" }
      }
    }
  ]
}"#,
        )?;
        assert!(!report.passed());
        assert!(
            report
                .errors
                .iter()
                .any(|error| error.contains("host is invalid"))
        );
        assert!(
            report
                .errors
                .iter()
                .any(|error| error.contains("unsupported manifest version"))
        );
        Ok(())
    }

    // Rules synthesized from the rows themselves cover every eligible row, so
    // the score is 1.0 and the "all eligible rows" reason is attached.
    #[test]
    fn synthesis_quality_scores_eligible_row_coverage() -> Result<(), Box<dyn std::error::Error>> {
        let top100 = "# domain\tfamily\tprimary_intent\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nexample.org/docs\tknowledge-reference\tarticle\t1\t1\ta.html\tgeneric\tpartial\tnone\nexample.org/help\tknowledge-reference\tarticle\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
        let forum = "# domain\tfamily\tmin_tier\tcurrent_tier\tfixture\texpected_path\tstatus\tknown_limit\nforum.example\tlegacy-forum\t1\t1\ta.html\tgeneric\tpartial\tnone\n";
        let summary = ingest_summary(top100, forum, &[])?;
        let rules = synthesize_rules(&summary.rows, "knowledge-reference");
        let quality = synthesize_quality(&summary.rows, "knowledge-reference", &rules);
        assert_eq!(quality.eligible_rows, 2);
        assert_eq!(quality.covered_rows, 2);
        assert!((quality.score - 1.0).abs() < f64::EPSILON);
        assert!(
            quality
                .reasons
                .iter()
                .any(|reason| reason.contains("all eligible rows"))
        );
        Ok(())
    }

    // The override for example.org/docs replaces the generated rule (its
    // `article` selector appears in the output) and the new example.net rule
    // is added alongside it.
    #[test]
    fn merge_overrides_replaces_matching_rules() -> Result<(), Box<dyn std::error::Error>> {
        let generated = r#"{
  "version": "index.pack/v1",
  "id": "family.docs",
  "rules": [
    { "host": "example.org", "path_prefix": "/docs", "manifest": {"content":{"main_selector":"main"}} }
  ]
}"#;
        let overrides = r#"{
  "version": "index.pack/v1",
  "id": "family.docs",
  "rules": [
    { "host": "example.org", "path_prefix": "/docs", "manifest": {"content":{"main_selector":"article"}} },
    { "host": "example.net", "path_prefix": "/", "manifest": {"content":{"main_selector":"main"}} }
  ]
}"#;
        let merged = merge_pack_overrides(generated, overrides)?;
        assert!(merged.contains("\"example.org\""));
        assert!(merged.contains("\"example.net\""));
        assert!(merged.contains("\"article\""));
        Ok(())
    }
}