Skip to main content

difflore_core/context/
rule_source.rs

1use sqlx::SqlitePool;
2
3use crate::errors::CoreError;
4
/// A single rule ("skill") in the shape consumed by indexing and retrieval.
///
/// Built from a `RuleRow` via `From<RuleRow>`: `content` is assembled from
/// several columns, while `language` and `repo_scope` are derived metadata.
#[derive(Debug, Clone)]
// NOTE(review): presumably silences warnings for fields that not every
// consumer reads — confirm which fields are unused before removing this.
#[allow(dead_code)]
pub struct RuleDocument {
    /// The rule's row `id`.
    pub skill_id: String,
    /// The rule's row `name`.
    pub title: String,
    /// Indexed text: header lines (rule id, name, type, optional source
    /// repo, tags) followed by a blank line and the rule description.
    pub content: String,
    /// The row's `confidence_score`, copied unchanged.
    pub confidence: f64,
    /// JSON-serialised glob list (e.g. `["**/*.rs", "tokio/src/io/**"]`).
    /// Empty / NULL = universal rule (cascade treats as always-matching).
    pub file_patterns: Option<String>,
    /// Derived from `tags` JSON. NULL means no language hint; SQL filters keep
    /// NULL rows eligible unless an exact language match is required.
    pub language: Option<String>,
    /// Derived from canonical `source_repo`. NULL is unattributed metadata, not
    /// a runtime global rule; recall must exact-match the current repo/project.
    pub repo_scope: Option<String>,
}
22
/// Snapshot of the rule corpus used to decide whether the rule index is
/// still fresh. Two states compare equal only when every field matches
/// (derived `PartialEq`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RuleIndexState {
    /// Number of rules covered by this state; `load_rule_index_state` fills
    /// it with the count of `status = 'active'` skills.
    pub rule_count: i64,
    /// Largest `updated_at` among the counted rules, or `None` when the
    /// aggregate yields NULL (e.g. no matching rows).
    pub max_updated_at: Option<String>,
    /// Identifier of the embedding profile the index was built with.
    pub embedding_profile: String,
    /// Stable identity of the in-scope rule SET actually served for the
    /// current git repo scope. `rule_count` + `max_updated_at` alone cannot
    /// detect a scope swap (e.g. the git remote changes so a different but
    /// equally-sized set of rules becomes in-scope with the same max
    /// timestamp); the freshness check would then wrongly skip a re-index and
    /// serve the wrong scope's chunks. `None` means "scope-agnostic" — used by
    /// `load_rule_index_state`, which describes the whole active corpus rather
    /// than a filtered scope — and is ignored by the freshness comparison.
    pub scope_signature: Option<String>,
}
38
39/// Derive a stable signature for an in-scope rule SET from its skill ids.
40/// Order-independent: ids are sorted before hashing so the signature depends
41/// only on membership, not on retrieval/iteration order. Returns `None` for
42/// an empty set so the freshness check stays scope-agnostic when nothing is
43/// filtered.
44pub fn scope_signature_from_skill_ids<'a>(
45    skill_ids: impl IntoIterator<Item = &'a str>,
46) -> Option<String> {
47    use sha1::{Digest, Sha1};
48    let mut ids: Vec<&str> = skill_ids.into_iter().collect();
49    if ids.is_empty() {
50        return None;
51    }
52    ids.sort_unstable();
53    ids.dedup();
54    let mut hasher = Sha1::new();
55    for id in ids {
56        hasher.update(id.as_bytes());
57        // Length-delimit so ["ab", "c"] and ["a", "bc"] cannot collide.
58        hasher.update(b"\0");
59    }
60    let digest = hasher.finalize();
61    let mut hex = String::with_capacity(digest.len() * 2);
62    for byte in digest {
63        hex.push_str(&format!("{byte:02x}"));
64    }
65    Some(hex)
66}
67
/// Raw row shape for rule queries against the `skills` table. Field names
/// match the column names (or aliases) in the SELECT list used by
/// `load_rules_from_db_for_engine`; converted into `RuleDocument` via `From`.
#[derive(sqlx::FromRow)]
struct RuleRow {
    // Becomes `RuleDocument::skill_id`.
    id: String,
    // Becomes `RuleDocument::title`.
    name: String,
    // Rule body; appended after the header lines in `RuleDocument::content`.
    description: String,
    // Raw identifier because `type` is a Rust keyword; the query aliases the
    // column as "type".
    r#type: String,
    // Stringified JSON array of tags; the source of the language hint.
    tags: String,
    confidence_score: f64,
    // Stringified JSON array of globs, or NULL.
    file_patterns: Option<String>,
    // Expected as "owner/name"; normalised by `repo_scope_from_source_repo`.
    source_repo: Option<String>,
}
79
/// Well-known language tags we recognise inside a skill's `tags` JSON.
/// `language_from_tags` lower-cases each tag and tests it for membership in
/// this list, returning on the first *tag* that matches. Priority between
/// languages therefore comes from the order of the rule's own tags, not from
/// the order below; the list order only affects how far the linear
/// membership scan runs before it finds a match. The aliases `"c++"` and
/// `"c#"` are listed so they are recognised; `language_from_tags` maps them
/// to `"cpp"` and `"csharp"`.
const LANGUAGE_TAGS: &[&str] = &[
    "rust",
    "typescript",
    "javascript",
    "python",
    "go",
    "java",
    "kotlin",
    "swift",
    "ruby",
    "php",
    "cpp",
    "c++",
    "csharp",
    "c#",
    "c",
];
101
102/// Extract a language string from a skill's `tags` JSON (a stringified
103/// array like `["rust", "async"]`). Returns the first recognised language
104/// tag (normalised to lower-case), or `None` if the tags are unparseable,
105/// empty, or carry no language hint.
106///
107/// Kept public so tests can exercise the mapping without building a full
108/// `SqliteRow`. Conservative-by-default: unknown tags are ignored rather
109/// than guessed at — a false `language` hint would silently drop real hits
110/// at retrieval time.
111pub fn language_from_tags(tags_json: &str) -> Option<String> {
112    let trimmed = tags_json.trim();
113    if trimmed.is_empty() {
114        return None;
115    }
116    let tags: Vec<String> = serde_json::from_str(trimmed).ok()?;
117    for tag in tags {
118        let lower = tag.trim().to_ascii_lowercase();
119        if LANGUAGE_TAGS.iter().any(|known| *known == lower) {
120            // Normalise "c++" -> "cpp", "c#" -> "csharp" so downstream
121            // filters can match on a single canonical spelling.
122            let canonical = match lower.as_str() {
123                "c++" => "cpp".to_owned(),
124                "c#" => "csharp".to_owned(),
125                other => other.to_owned(),
126            };
127            return Some(canonical);
128        }
129    }
130    None
131}
132
133/// Derive a confidence multiplier from a rule's tags. Tags carry two
134/// orthogonal evidence signals on extracted rules:
135///   - `cluster-size:N` — how many distinct review extractions clustered
136///     into this rule. N=1 is a singleton (weakest evidence: one
137///     reviewer wrote it once); N>=3 is corroborated.
138///   - `severity:{error,warning,info}` — informational severity
139///     attached during extraction.
140///
141/// Returns a value in `[0.4, 0.95]`, intended to multiply against the
142/// retrieval score in `retrieve_rules_with_confidence`. Returns `None`
143/// when neither tag is present so the caller can keep the default 0.7
144/// default.
145pub fn confidence_from_tags(tags_json: &str) -> Option<f64> {
146    let trimmed = tags_json.trim();
147    if trimmed.is_empty() {
148        return None;
149    }
150    let tags: Vec<String> = serde_json::from_str(trimmed).ok()?;
151    let mut cluster_size: Option<u32> = None;
152    let mut severity: Option<String> = None;
153    for tag in &tags {
154        let lower = tag.trim().to_ascii_lowercase();
155        if let Some(rest) = lower.strip_prefix("cluster-size:") {
156            if let Ok(n) = rest.parse::<u32>() {
157                cluster_size = Some(n);
158            }
159        } else if let Some(rest) = lower.strip_prefix("severity:") {
160            severity = Some(rest.to_owned());
161        }
162    }
163    if cluster_size.is_none() && severity.is_none() {
164        return None;
165    }
166    let base_score = if let Some(n) = cluster_size {
167        match n {
168            0 | 1 => 0.55, // singleton — downweight, but not under the 0.2 floor
169            2 => 0.7,
170            3..=4 => 0.8,
171            _ => 0.9, // 5+ corroborating extractions — strongest evidence
172        }
173    } else {
174        0.7
175    };
176    let score = if let Some(sev) = severity.as_deref() {
177        match sev {
178            "error" => f64::min(base_score + 0.05, 0.95),
179            "info" => f64::max(base_score - 0.05, 0.4),
180            _ => base_score, // warning is the neutral default
181        }
182    } else {
183        base_score
184    };
185    Some(score)
186}
187
/// Resolve one glob pattern to a canonical language tag when its trailing
/// file extension identifies exactly one language.
///
/// The extension is whatever follows the last `.` in the lower-cased
/// pattern. Patterns without a `.`, or whose "extension" still contains a
/// path separator or a `*` wildcard (`"**/*"`, `"**/test*"`,
/// `"src.d/**"`), yield `None`, as do extensions not in the table below.
fn language_from_pattern(p: &str) -> Option<&'static str> {
    // Extension groups and the language each group maps to.
    const EXTENSION_LANGUAGES: &[(&[&str], &str)] = &[
        (&["rs"], "rust"),
        (&["ts", "tsx"], "typescript"),
        (&["js", "jsx", "mjs", "cjs"], "javascript"),
        (&["py", "pyi"], "python"),
        (&["go"], "go"),
        (&["java"], "java"),
        (&["kt", "kts"], "kotlin"),
        (&["swift"], "swift"),
        (&["rb"], "ruby"),
        (&["php"], "php"),
        (&["cpp", "cc", "cxx", "hpp"], "cpp"),
        (&["cs"], "csharp"),
    ];

    let lowered = p.to_ascii_lowercase();
    // No dot at all means there is no extension to inspect.
    let (_, ext) = lowered.rsplit_once('.')?;
    if ext.contains(|c: char| matches!(c, '/' | '*')) {
        return None;
    }
    EXTENSION_LANGUAGES
        .iter()
        .find(|(extensions, _)| extensions.iter().any(|known| *known == ext))
        .map(|(_, language)| *language)
}
214
215/// Fallback for `language_from_tags` when tags carry no language hint:
216/// scan a rule's `file_patterns`. If every parseable language-bearing
217/// pattern resolves to the same canonical language, return it.
218/// Mixed-language pattern lists (`["**/*.rs", "**/*.go"]`) and universal
219/// patterns (`["**/*"]`) return None, matching `language_from_tags`.
220pub fn language_from_file_patterns(file_patterns_json: Option<&str>) -> Option<String> {
221    let raw = file_patterns_json?.trim();
222    if raw.is_empty() {
223        return None;
224    }
225    let patterns: Vec<String> = serde_json::from_str(raw).ok()?;
226    let mut seen: Option<&'static str> = None;
227    for p in &patterns {
228        if let Some(lang) = language_from_pattern(p) {
229            match seen {
230                None => seen = Some(lang),
231                Some(existing) if existing == lang => {}
232                Some(_) => return None,
233            }
234        }
235    }
236    seen.map(String::from)
237}
238
/// Normalise a `source_repo` value of the form `owner/name` into a
/// lower-case repo scope.
///
/// The value is split at the first `/`; surrounding whitespace is stripped
/// from the whole value and from both halves. Returns `None` when the value
/// is missing, has no `/`, or either half is empty after trimming.
pub fn repo_scope_from_source_repo(source_repo: Option<&str>) -> Option<String> {
    let (owner, name) = source_repo?.trim().split_once('/')?;
    let (owner, name) = (owner.trim(), name.trim());
    if owner.is_empty() || name.is_empty() {
        return None;
    }
    Some(format!("{}/{}", owner, name).to_ascii_lowercase())
}
249
250impl From<RuleRow> for RuleDocument {
251    fn from(r: RuleRow) -> Self {
252        let language = language_from_tags(&r.tags)
253            .or_else(|| language_from_file_patterns(r.file_patterns.as_deref()));
254        let repo_scope = repo_scope_from_source_repo(r.source_repo.as_deref());
255        // Include source repo attribution in indexed content so displayed rule
256        // bodies can cite it and repo-specific queries get a small embedding
257        // bias. Universal rules skip the line.
258        let content = match repo_scope.as_deref() {
259            Some(scope) => format!(
260                "Rule ID: {}\nRule Name: {}\nType: {}\nSource: {}\nTags: {}\n\n{}",
261                r.id, r.name, r.r#type, scope, r.tags, r.description
262            ),
263            None => format!(
264                "Rule ID: {}\nRule Name: {}\nType: {}\nTags: {}\n\n{}",
265                r.id, r.name, r.r#type, r.tags, r.description
266            ),
267        };
268        Self {
269            skill_id: r.id,
270            title: r.name,
271            content,
272            confidence: r.confidence_score,
273            file_patterns: r.file_patterns,
274            language,
275            repo_scope,
276        }
277    }
278}
279
/// Load every active rule without any per-engine filter.
///
/// Convenience wrapper around `load_rules_from_db_for_engine` with
/// `engine = None`.
///
/// # Errors
/// Propagates whatever error `load_rules_from_db_for_engine` returns.
pub async fn load_rules_from_db(pool: &SqlitePool) -> Result<Vec<RuleDocument>, CoreError> {
    load_rules_from_db_for_engine(pool, None).await
}
283
/// Describe the whole active rule corpus for index-freshness checks.
///
/// Reads the number of `status = 'active'` skills and their largest
/// `updated_at`, and pairs them with the currently active embedding
/// profile. The returned state has no `scope_signature`.
///
/// # Errors
/// Returns the query error (converted into `CoreError` by `?`) when the
/// aggregate query fails.
pub async fn load_rule_index_state(pool: &SqlitePool) -> Result<RuleIndexState, CoreError> {
    // Mirror `load_rules_from_db_for_engine`: pending candidates are not
    // served, so the index-state hash must ignore them too. Otherwise a
    // pending insert would invalidate the rule index without changing
    // any served document.
    let row = sqlx::query!(
        "SELECT COUNT(*) AS rule_count, MAX(updated_at) AS max_updated_at FROM skills WHERE status = 'active'"
    )
    .fetch_one(pool)
    .await?;
    Ok(RuleIndexState {
        rule_count: row.rule_count,
        max_updated_at: row.max_updated_at,
        embedding_profile: crate::context::embedding::active_embedding_profile().await,
        // Base state describes the whole active corpus, not a repo-filtered
        // scope, so it carries no scope signature; the orchestrator fills this
        // in after filtering for the current git repo scope.
        scope_signature: None,
    })
}
304
305pub async fn load_rules_from_db_for_engine(
306    pool: &SqlitePool,
307    engine: Option<&str>,
308) -> Result<Vec<RuleDocument>, CoreError> {
309    // SELECT list pulls canonical `source_repo` so `RuleDocument` can carry the
310    // denormalised `repo_scope` used by retrieval filters.
311    // Boundary: pending candidates (e.g. ingested agent memory) MUST NOT
312    // surface here — they exist for team review on the dashboard, not for
313    // injection into agent context.
314    let rows = match engine {
315        Some("codex") => {
316            sqlx::query_as::<_, RuleRow>(
317                "SELECT id, name, description, type as \"type\", tags, confidence_score, \
318             file_patterns, source_repo FROM skills \
319             WHERE enabled_for_codex = 1 AND status = 'active'",
320            )
321            .fetch_all(pool)
322            .await?
323        }
324        Some("claude") => {
325            sqlx::query_as::<_, RuleRow>(
326                "SELECT id, name, description, type as \"type\", tags, confidence_score, \
327             file_patterns, source_repo FROM skills \
328             WHERE enabled_for_claude = 1 AND status = 'active'",
329            )
330            .fetch_all(pool)
331            .await?
332        }
333        Some("gemini") => {
334            sqlx::query_as::<_, RuleRow>(
335                "SELECT id, name, description, type as \"type\", tags, confidence_score, \
336             file_patterns, source_repo FROM skills \
337             WHERE enabled_for_gemini = 1 AND status = 'active'",
338            )
339            .fetch_all(pool)
340            .await?
341        }
342        Some("cursor") => {
343            sqlx::query_as::<_, RuleRow>(
344                "SELECT id, name, description, type as \"type\", tags, confidence_score, \
345             file_patterns, source_repo FROM skills \
346             WHERE enabled_for_cursor = 1 AND status = 'active'",
347            )
348            .fetch_all(pool)
349            .await?
350        }
351        _ => {
352            sqlx::query_as::<_, RuleRow>(
353                "SELECT id, name, description, type as \"type\", tags, confidence_score, \
354             file_patterns, source_repo FROM skills \
355             WHERE status = 'active'",
356            )
357            .fetch_all(pool)
358            .await?
359        }
360    };
361
362    Ok(rows.into_iter().map(RuleDocument::from).collect())
363}
364
365/// Build a `skill_id -> confidence_score` map used by retrieval to
366/// boost or dampen rules at ranking time. Mirrors the SELECT shape of
367/// `load_rules_from_db` but skips the heavy text columns since the
368/// caller only needs the score; this stays a tight key/value query
369/// so calling it on every hook fire stays cheap.
370///
371/// Confidence semantics (from the local rule-ranking contract and
372/// `skills.confidence_score` defaults):
373///   - manual / cloud-extracted base: 0.7
374///   - conversation-channel base: 0.6 (fidelity discount on agent
375///     transcription)
376///   - dedup-bump: +0.05 per re-capture
377///   - feedback dismiss: -0.10
378///
379/// Retrieval uses these to weight RRF scores so a high-confidence
380/// rule outranks a fresh capture with the same lexical signal.
381pub async fn load_rule_confidence_map(
382    pool: &SqlitePool,
383) -> Result<std::collections::HashMap<String, f64>, CoreError> {
384    let rows = sqlx::query!("SELECT id, confidence_score FROM skills WHERE status = 'active'")
385        .fetch_all(pool)
386        .await?;
387    Ok(rows
388        .into_iter()
389        .map(|row| (row.id, row.confidence_score))
390        .collect())
391}
392
/// Best-effort ranking metadata shared by CLI search/recall and MCP
/// runtime recall. Keeping confidence and age maps behind one loader makes
/// it harder for a callsite to apply confidence boosts while accidentally
/// skipping the half-life decay input.
///
/// Each map is independently optional: `load_rule_ranking_inputs` leaves a
/// field as `None` when the corresponding loader returned an error.
#[derive(Debug, Clone, Default)]
pub struct RuleRankingInputs {
    /// `skill_id -> confidence_score`, as produced by
    /// `load_rule_confidence_map`.
    pub confidence_map: Option<std::collections::HashMap<String, f64>>,
    /// `skill_id -> age in days`, as produced by `load_rule_age_days_map`.
    pub age_days_map: Option<std::collections::HashMap<String, f32>>,
}
402
403pub async fn load_rule_ranking_inputs(pool: &SqlitePool) -> RuleRankingInputs {
404    RuleRankingInputs {
405        confidence_map: load_rule_confidence_map(pool).await.ok(),
406        age_days_map: load_rule_age_days_map(pool).await.ok(),
407    }
408}
409
410/// Build `skill_id -> age_in_days` map for the half-life decay applied at
411/// retrieval time (`effective_confidence`). Age uses `created_at` when
412/// present, falls back to `updated_at` so rules backfilled without a
413/// `created_at` still get a sane age. Skills with neither column set
414/// (shouldn't happen in practice — both have NOT NULL defaults) are
415/// omitted; the retrieval path treats absence as `age_days = 0` so
416/// behaviour matches the pre-plumbing default.
417///
418/// Uses runtime `sqlx::query()` because this query is optional ranking metadata
419/// and should not depend on offline SQLx metadata.
420pub async fn load_rule_age_days_map(
421    pool: &SqlitePool,
422) -> Result<std::collections::HashMap<String, f32>, CoreError> {
423    use sqlx::Row;
424    let rows = sqlx::query(
425        "SELECT id, COALESCE(created_at, updated_at) AS ts \
426         FROM skills WHERE status = 'active'",
427    )
428    .fetch_all(pool)
429    .await?;
430    let now = chrono::Utc::now();
431    let mut out = std::collections::HashMap::with_capacity(rows.len());
432    for row in rows {
433        let id: String = row.try_get("id").unwrap_or_default();
434        if id.is_empty() {
435            continue;
436        }
437        let ts: Option<String> = row.try_get("ts").ok();
438        let Some(ts) = ts else { continue };
439        // SQLite stores timestamps as ISO-8601 strings. Try RFC3339 first
440        // (the canonical write path), then a few common SQLite shapes
441        // before giving up. A parse failure means we omit the entry —
442        // retrieval defaults age to 0, so a malformed timestamp degrades
443        // to "no decay" rather than mis-aging the rule.
444        let parsed = chrono::DateTime::parse_from_rfc3339(&ts)
445            .map(|dt| dt.with_timezone(&chrono::Utc))
446            .ok()
447            .or_else(|| {
448                chrono::NaiveDateTime::parse_from_str(&ts, "%Y-%m-%d %H:%M:%S")
449                    .ok()
450                    .map(|n| n.and_utc())
451            })
452            .or_else(|| {
453                chrono::NaiveDateTime::parse_from_str(&ts, "%Y-%m-%dT%H:%M:%S%.f")
454                    .ok()
455                    .map(|n| n.and_utc())
456            });
457        if let Some(created) = parsed {
458            let age_days = (now - created).num_seconds().max(0) as f32 / 86_400.0;
459            out.insert(id, age_days);
460        }
461    }
462    Ok(out)
463}
464
/// Load few-shot code examples for a given skill.
///
/// Returns at most three rows from `rule_examples` for `skill_id`, newest
/// first by `created_at`.
///
/// # Errors
/// Returns the query error (converted into `CoreError` by `?`) when the
/// query fails.
pub async fn load_rule_examples(
    pool: &SqlitePool,
    skill_id: &str,
) -> Result<Vec<RuleExample>, CoreError> {
    let rows = sqlx::query_as!(
        RuleExampleRow,
        "SELECT id, skill_id, bad_code, good_code, description, source \
         FROM rule_examples WHERE skill_id = ?1 ORDER BY created_at DESC LIMIT 3",
        skill_id
    )
    .fetch_all(pool)
    .await?;
    Ok(rows.into_iter().map(RuleExample::from).collect())
}
480
481/// Load examples for multiple skills in one batch
482pub async fn load_rule_examples_batch(
483    pool: &SqlitePool,
484    skill_ids: &[String],
485) -> Result<std::collections::HashMap<String, Vec<RuleExample>>, CoreError> {
486    if skill_ids.is_empty() {
487        return Ok(std::collections::HashMap::new());
488    }
489    let ids_json = serde_json::to_string(skill_ids)
490        .map_err(|e| CoreError::Internal(format!("serialize skill_ids: {e}")))?;
491    let rows = sqlx::query_as!(
492        RuleExampleRow,
493        "SELECT id, skill_id, bad_code, good_code, description, source \
494         FROM rule_examples \
495         WHERE skill_id IN (SELECT value FROM json_each(?1)) \
496         ORDER BY created_at DESC",
497        ids_json,
498    )
499    .fetch_all(pool)
500    .await?;
501
502    let mut map: std::collections::HashMap<String, Vec<RuleExample>> =
503        std::collections::HashMap::new();
504    for row in rows {
505        let skill_id = row.skill_id.clone();
506        let example = RuleExample::from(row);
507        map.entry(skill_id).or_default().push(example);
508    }
509    // Limit to 3 examples per skill
510    for examples in map.values_mut() {
511        examples.truncate(3);
512    }
513    Ok(map)
514}
515
/// A few-shot code example attached to a rule, as returned by
/// `load_rule_examples` and `load_rule_examples_batch`. Fields mirror the
/// selected `rule_examples` columns one-to-one.
#[derive(Debug, Clone)]
pub struct RuleExample {
    /// The example row's own `id`.
    pub id: String,
    /// Id of the skill (rule) this example belongs to.
    pub skill_id: String,
    /// `bad_code` column — presumably the snippet that violates the rule.
    pub bad_code: String,
    /// `good_code` column — presumably the corrected counterpart.
    pub good_code: String,
    /// Optional free-text `description` column.
    pub description: Option<String>,
    /// `source` column; NOTE(review): its value set is not visible here —
    /// confirm against the writer.
    pub source: String,
}
525
/// Raw row shape for queries against `rule_examples`; field names match the
/// selected columns. Converted into the public `RuleExample` via `From`.
#[derive(sqlx::FromRow)]
struct RuleExampleRow {
    id: String,
    skill_id: String,
    bad_code: String,
    good_code: String,
    // The only nullable column in the selection.
    description: Option<String>,
    source: String,
}
535
536impl From<RuleExampleRow> for RuleExample {
537    fn from(r: RuleExampleRow) -> Self {
538        Self {
539            id: r.id,
540            skill_id: r.skill_id,
541            bad_code: r.bad_code,
542            good_code: r.good_code,
543            description: r.description,
544            source: r.source,
545        }
546    }
547}
548
// Unit tests for the pure helpers in this module: tag/pattern parsing, repo
// scope normalisation and scope signatures. None of them touch a database.
#[cfg(test)]
mod tests {
    use super::*;

    // `cluster-size:1` alone yields the singleton base score.
    #[test]
    fn confidence_from_tags_singleton_downweighted() {
        let c = confidence_from_tags(r#"["auto-from-extractions","cluster-size:1"]"#).unwrap();
        assert!((c - 0.55).abs() < 1e-9, "got {c}");
    }

    // A cluster of 5+ gets the top base score; `severity:warning` is neutral.
    #[test]
    fn confidence_from_tags_large_cluster_strongest() {
        let c = confidence_from_tags(r#"["cluster-size:8","severity:warning"]"#).unwrap();
        assert!((c - 0.9).abs() < 1e-9, "got {c}");
    }

    // `severity:error` adds 0.05 on top of the cluster-size base (0.8).
    #[test]
    fn confidence_from_tags_severity_error_boosts() {
        let c = confidence_from_tags(r#"["cluster-size:3","severity:error"]"#).unwrap();
        assert!((c - 0.85).abs() < 1e-9, "got {c}");
    }

    // `severity:info` subtracts 0.05 from the cluster-size base (0.55).
    #[test]
    fn confidence_from_tags_severity_info_dampens() {
        let c = confidence_from_tags(r#"["cluster-size:1","severity:info"]"#).unwrap();
        assert!((c - 0.50).abs() < 1e-9, "got {c}");
    }

    // Without either evidence tag — or with unusable input — there is no
    // score to report.
    #[test]
    fn confidence_from_tags_missing_evidence_returns_none() {
        assert_eq!(
            confidence_from_tags(r#"["auto-from-extractions","origin:review-extraction"]"#),
            None
        );
        assert_eq!(confidence_from_tags("[]"), None);
        assert_eq!(confidence_from_tags(""), None);
        assert_eq!(confidence_from_tags("not-json"), None);
    }

    // Table-driven check of tag recognition, alias normalisation and the
    // `None` fall-through cases.
    #[test]
    fn language_from_tags_table() {
        let cases: &[(&str, Option<&str>)] = &[
            (r#"["async", "rust", "concurrency"]"#, Some("rust")),
            (r#"["typescript", "react"]"#, Some("typescript")),
            // Normalised aliases.
            (r#"["c++"]"#, Some("cpp")),
            (r#"["C#"]"#, Some("csharp")),
            // No known language tag — fall through to None.
            ("[]", None),
            ("", None),
            ("not-json", None),
            (r#"["lint", "performance"]"#, None),
        ];
        for (input, expected) in cases {
            assert_eq!(
                language_from_tags(input).as_deref(),
                *expected,
                "input: {input}"
            );
        }
    }

    // Pattern lists whose extensions all map to one language resolve to it.
    #[test]
    fn language_from_file_patterns_resolves_single_language() {
        assert_eq!(
            language_from_file_patterns(Some(r#"["**/*.rs"]"#)).as_deref(),
            Some("rust")
        );
        assert_eq!(
            language_from_file_patterns(Some(r#"["**/*.ts","**/*.tsx"]"#)).as_deref(),
            Some("typescript")
        );
        assert_eq!(
            language_from_file_patterns(Some(r#"["src/**/*.go","tests/**/*.go"]"#)).as_deref(),
            Some("go")
        );
    }

    #[test]
    fn language_from_file_patterns_returns_none_for_mixed_or_universal() {
        // Mixed languages → can't pick one without guessing.
        assert_eq!(
            language_from_file_patterns(Some(r#"["**/*.rs","**/*.go"]"#)),
            None
        );
        // Universal pattern → applies everywhere.
        assert_eq!(language_from_file_patterns(Some(r#"["**/*"]"#)), None);
        // Test glob without language extension.
        assert_eq!(language_from_file_patterns(Some(r#"["**/*test*"]"#)), None);
    }

    // Missing, blank, empty-array and non-JSON inputs all yield `None`.
    #[test]
    fn language_from_file_patterns_handles_missing_or_empty_input() {
        assert_eq!(language_from_file_patterns(None), None);
        assert_eq!(language_from_file_patterns(Some("")), None);
        assert_eq!(language_from_file_patterns(Some("[]")), None);
        assert_eq!(language_from_file_patterns(Some("not-json")), None);
    }

    // Only a complete `owner/name` value produces a scope.
    #[test]
    fn repo_scope_uses_canonical_source_repo_only() {
        assert_eq!(
            repo_scope_from_source_repo(Some("vitejs/vite")).as_deref(),
            Some("vitejs/vite")
        );
        assert!(repo_scope_from_source_repo(None).is_none());
        assert!(repo_scope_from_source_repo(Some("vitejs")).is_none());
        assert!(repo_scope_from_source_repo(Some(" /vite")).is_none());
    }

    #[test]
    fn scope_signature_depends_only_on_membership() {
        // Order-independent: same set, different iteration order → same sig.
        // If this broke, recall with a different rule iteration order would
        // spuriously invalidate freshness and re-embed the whole corpus on
        // every call.
        assert_eq!(
            scope_signature_from_skill_ids(["a", "b", "c"]),
            scope_signature_from_skill_ids(["c", "a", "b"]),
        );
        // Dedup: repeated ids do not change the signature (membership-only).
        assert_eq!(
            scope_signature_from_skill_ids(["a", "a", "b"]),
            scope_signature_from_skill_ids(["a", "b"]),
        );
        // Empty set → scope-agnostic `None` (freshness check ignores scope).
        assert_eq!(scope_signature_from_skill_ids(Vec::<&str>::new()), None);
        // A genuine membership change MUST change the signature — this is what
        // lets the index-freshness check catch a scope change even when the
        // rule count is unchanged.
        assert_ne!(
            scope_signature_from_skill_ids(["a", "b"]),
            scope_signature_from_skill_ids(["a", "c"]),
        );
    }

    #[test]
    fn scope_signature_length_delimits_to_avoid_collision() {
        // Without the NUL terminator written after each id, ["ab","c"] and
        // ["a","bc"] would hash the same concatenated bytes ("abc") and
        // collide — a real scope change would then be silently missed and
        // stale chunks served.
        assert_ne!(
            scope_signature_from_skill_ids(["ab", "c"]),
            scope_signature_from_skill_ids(["a", "bc"]),
        );
    }
}