// tga 2.7.1 - Docs.rs

//! Cascade orchestrator combining the classification tiers.

use std::collections::HashMap;
use std::sync::{Arc, Mutex};

use rayon::prelude::*;
use rusqlite::Connection;

use crate::classify::errors::Result;
use crate::classify::rules::RuleSet;
use crate::classify::taxonomy::{SubcategoryDef, TaxonomyRegistry};
use crate::classify::tiers::exact::ExactMatcher;
use crate::classify::tiers::fuzzy::FuzzyClassifier;
use crate::classify::tiers::issue_type_tier::IssueTypeTier;
use crate::classify::tiers::jira_project_tier::JiraProjectTier;
use crate::classify::tiers::llm::LlmClassifier;
use crate::classify::tiers::override_tier::OverrideTier;
use crate::classify::tiers::regex_tier::RegexMatcher;
use crate::classify::tiers::weighted_sum::WeightedSumClassifier;
use crate::classify::tiers::ClassificationResult;
use crate::core::models::ClassificationMethod;

/// Runtime configuration for the [`ClassificationEngine`].
///
/// Why: classification tiers (especially LLM) need provider / model /
/// threshold tuning; bundling those knobs into a config struct keeps the
/// engine constructor signature stable as new tiers are added.
/// What: holds LLM toggles (`use_llm`, `llm_model`, `llm_provider`,
/// `openrouter_api_key`), a `confidence_threshold`, and the `weighted_sum`
/// config for Tier 2.5 (added in 1.3.0). The engine stores the config
/// verbatim and exposes it through [`ClassificationEngine::config`].
/// Test: every classifier test in `classify::tests` builds one via
/// `Default::default()` or with explicit overrides.
#[derive(Debug, Clone)]
pub struct ClassificationEngineConfig {
    /// Whether to build the LLM tier when the engine is constructed.
    ///
    /// When `true`, the engine constructor initialises an `LlmClassifier`
    /// from `llm_provider` / `llm_model` / `openrouter_api_key` and fails if
    /// that initialisation returns an error. The LLM tier is only consulted
    /// after every synchronous tier has declined to produce a verdict.
    pub use_llm: bool,
    /// LLM model identifier (provider-specific).
    pub llm_model: String,
    /// LLM provider: `"openrouter"`, `"openai"`, or `"auto"`.
    pub llm_provider: String,
    /// Optional OpenRouter API key. If `None`, the env var
    /// `OPENROUTER_API_KEY` is consulted at engine-build time.
    // NOTE(review): the env-var fallback lives inside `LlmClassifier`, which
    // is not visible from this module — confirm the variable name there.
    pub openrouter_api_key: Option<String>,
    /// Minimum confidence required to accept a verdict.
    ///
    /// Verdicts below this threshold are returned as-is (so the caller
    /// can still inspect them), but their `confidence` informs filtering
    /// in downstream reports. The engine code in this module never reads
    /// this field itself; consumers obtain it via
    /// [`ClassificationEngine::config`].
    pub confidence_threshold: f64,
    /// Configuration for the weighted-sum tier (Tier 2.5). Added in 1.3.0.
    ///
    /// Why: the pipeline builds the engine from `ClassificationEngineConfig`;
    /// threading the weighted-sum config here keeps all per-run tier knobs in
    /// one place without changing the engine constructor's arity.
    /// What: a clone is forwarded to [`WeightedSumClassifier::new`] at
    /// engine-build time.
    /// Test: covered indirectly by `classify_sync` tests that exercise Tier 2.5.
    pub weighted_sum: crate::classify::tiers::weighted_sum::WeightedSumConfig,
}

impl Default for ClassificationEngineConfig {
    fn default() -> Self {
        Self {
            use_llm: false,
            llm_model: "gpt-4o-mini".to_string(),
            llm_provider: "auto".to_string(),
            openrouter_api_key: None,
            confidence_threshold: 0.7,
            weighted_sum: crate::classify::tiers::weighted_sum::WeightedSumConfig::default(),
        }
    }
}

/// Combined classification cascade.
///
/// Why: a single engine orchestrates the multi-tier cascade
/// (override → exact → issue-type → JIRA-project → regex → weighted-sum →
/// fuzzy → LLM) so callers don't reimplement the precedence rules.
/// What: holds one classifier per tier plus the shared taxonomy and
/// config. The synchronous tiers are walked in the order listed above by
/// `classify_sync_with_context`; the first tier to produce a verdict wins.
/// The async `classify` adds the LLM fallback after the synchronous tiers.
/// The fuzzy tier is gated by `extend_defaults`: when `false`, the fuzzy
/// tier is suppressed because it emits hardcoded built-in category strings
/// (`"merge"`, `"feature"`, `"chore"`) that conflict with user-defined
/// taxonomies, violating the principle that `extend_defaults: false` means
/// "no built-in classification of any kind".
/// Test: covered by `classify::tests::engine_classify_batch_does_not_panic`
/// and the cascade-coverage `corpus_uncategorized_below_1_percent` test.
pub struct ClassificationEngine {
    /// Tier 0: manual-override lookup. `None` when the engine was built
    /// without an override DB connection.
    override_tier: Option<OverrideTier>,
    /// Tier 1: exact keyword matcher compiled from the ruleset.
    exact: ExactMatcher,
    /// Tier 1.5: PM-system issue-type mapping.
    issue_type: IssueTypeTier,
    /// Tier 2: regex matcher compiled from the ruleset.
    regex: RegexMatcher,
    /// Tier 1.6: JIRA project-key mapping; consulted before the regex tier
    /// and skipped entirely when no mappings are configured.
    jira_project: JiraProjectTier,
    /// Tier 2.5: weighted-sum classifier (active by default; see the
    /// `enabled` flag on
    /// [`crate::classify::tiers::weighted_sum::WeightedSumConfig`] for the
    /// per-instance toggle).
    ///
    /// Why: unlike the fuzzy tier, the weighted-sum tier does not emit
    /// hardcoded built-in category strings — it picks winners based on signal
    /// scores and maps them via `to_verdict()`. It is therefore safe to leave
    /// active even when `extend_defaults: false`. If a user's custom taxonomy
    /// does not include the voted category the verdict's `top_level` will be
    /// `None`; the category string itself is still written to the DB.
    weighted_sum: WeightedSumClassifier,
    /// `None` when `extend_defaults == false` in the loaded ruleset.
    ///
    /// Why: the fuzzy tier is built-in by definition — it emits hardcoded
    /// category strings (`"merge"`, `"feature"`, `"chore"`) regardless of
    /// user taxonomy. Gating it behind `extend_defaults` ensures it never
    /// fires when the user has elected a fully custom taxonomy.
    fuzzy: Option<FuzzyClassifier>,
    /// Async LLM fallback tier. `None` unless `config.use_llm` was set at
    /// build time or a classifier was attached afterwards via `attach_llm`.
    llm: Option<LlmClassifier>,
    /// Registry used to resolve a category string to its top-level parent.
    taxonomy: TaxonomyRegistry,
    /// Configuration the engine was built with (see `config()`).
    config: ClassificationEngineConfig,
}

impl ClassificationEngine {
    /// Build a new engine from a [`RuleSet`] and configuration.
    ///
    /// Why: most callers want the default behaviour — built-in taxonomy,
    /// no JIRA mappings, no override tier. This constructor is the shortest
    /// path.
    /// What: delegates to [`Self::with_taxonomy`] with an empty custom
    /// taxonomy vec.
    /// Test: covered by `classify::tests::engine_classify_batch_does_not_panic`.
    ///
    /// The LLM tier is constructed only if `config.use_llm` is true; it is
    /// invoked solely by the async entry points, never by the synchronous
    /// tiers.
    ///
    /// NOTE(review): an earlier revision of this comment said the API key is
    /// read from `OPENAI_API_KEY`, while the `openrouter_api_key` field docs
    /// name `OPENROUTER_API_KEY`. Key resolution happens inside
    /// `LlmClassifier::from_provider`, which is not visible here — confirm
    /// which variable(s) apply for each provider.
    ///
    /// Uses the built-in taxonomy registry only. To extend it with
    /// user-defined subcategories, use [`Self::with_taxonomy`].
    ///
    /// # Errors
    ///
    /// Returns an error if the rules fail to compile (e.g. invalid regex),
    /// or — when `config.use_llm` is true — if LLM provider initialisation
    /// fails.
    pub fn new(ruleset: RuleSet, config: ClassificationEngineConfig) -> Result<Self> {
        Self::with_taxonomy(ruleset, config, Vec::new())
    }

    /// Build an engine with user-defined subcategory definitions merged into
    /// the built-in taxonomy registry.
    ///
    /// Why: organisations often need custom subcategories
    /// (e.g. `"payments"`, `"auth"`) without forking the binary.
    /// What: delegates to [`Self::with_taxonomy_and_mappings`] with empty
    /// JIRA mappings and no override connection, so the JIRA project tier
    /// and the manual-override tier stay inactive.
    /// Test: covered by `classify::tests::registry_merges_user_defined`.
    ///
    /// # Errors
    ///
    /// Returns an error if the rules fail to compile, or — when
    /// `config.use_llm` is true — if LLM provider initialisation fails.
    pub fn with_taxonomy(
        ruleset: RuleSet,
        config: ClassificationEngineConfig,
        custom_taxonomy: Vec<SubcategoryDef>,
    ) -> Result<Self> {
        // Empty mapping table + `None` connection: no JIRA tier hits, no Tier 0.
        Self::with_taxonomy_and_mappings(ruleset, config, custom_taxonomy, HashMap::new(), None)
    }

    /// Full builder allowing JIRA project-key mappings and an optional DB
    /// connection for the manual-override tier.
    ///
    /// Why: operators sometimes need to seed both JIRA project keys (so
    /// `PROJ-123` knows it lives under "Project") and an override database
    /// (for human-corrected verdicts). This builder accepts both.
    /// What: delegates to [`Self::with_taxonomy_mappings_and_confidence`]
    /// with `jira_confidence = None` (use the default 0.88).
    /// Test: covered by `jira_project_mapping_*` tests.
    ///
    /// # Errors
    ///
    /// Returns an error if the rules fail to compile, or — when
    /// `config.use_llm` is true — if LLM provider initialisation fails.
    pub fn with_taxonomy_and_mappings(
        ruleset: RuleSet,
        config: ClassificationEngineConfig,
        custom_taxonomy: Vec<SubcategoryDef>,
        jira_project_mappings: HashMap<String, String>,
        override_conn: Option<Arc<Mutex<Connection>>>,
    ) -> Result<Self> {
        // The `None` argument is `jira_confidence`: keep the tier's default.
        Self::with_taxonomy_mappings_and_confidence(
            ruleset,
            config,
            custom_taxonomy,
            jira_project_mappings,
            None,
            override_conn,
        )
    }

    /// Full builder with the JIRA project-key mapping confidence override.
    ///
    /// Why: issue #206 — operators need to tune how aggressively the
    /// project-key mapping overrides downstream regex/fuzzy verdicts.
    /// Passing `None` keeps the default
    /// [`crate::classify::tiers::jira_project_tier::DEFAULT_PROJECT_MAPPING_CONFIDENCE`]
    /// (0.88).
    /// What: same as [`Self::with_taxonomy_and_mappings`] but takes an
    /// extra `jira_confidence` parameter. This is the constructor every
    /// other builder funnels into; it compiles the rule matchers, builds
    /// the optional tiers, and shares one [`TaxonomyRegistry`] (by clone)
    /// with every tier that resolves categories.
    /// Test: covered by `jira_project_mapping_*` tests in this module.
    ///
    /// # Errors
    ///
    /// Returns an error if the exact or regex rules fail to compile, or —
    /// when `config.use_llm` is true — a `ClassifyError::Config` if the LLM
    /// provider cannot be initialised.
    pub fn with_taxonomy_mappings_and_confidence(
        ruleset: RuleSet,
        config: ClassificationEngineConfig,
        custom_taxonomy: Vec<SubcategoryDef>,
        jira_project_mappings: HashMap<String, String>,
        jira_confidence: Option<f64>,
        override_conn: Option<Arc<Mutex<Connection>>>,
    ) -> Result<Self> {
        let exact = ExactMatcher::new(&ruleset.rules)?;
        let regex = RegexMatcher::new(&ruleset.rules)?;
        // Tier 2.5: weighted-sum classifier. Active regardless of
        // extend_defaults because it composes signals; it does NOT emit
        // hardcoded built-in category strings. The per-instance `enabled`
        // flag in WeightedSumConfig lets operators opt out.
        let weighted_sum = WeightedSumClassifier::new(config.weighted_sum.clone());
        // Gate the fuzzy tier on extend_defaults. The fuzzy tier is built-in
        // by definition: it emits hardcoded category strings ("merge",
        // "feature", "chore") that conflict with user-defined taxonomies.
        // When extend_defaults is false the user has elected a fully custom
        // taxonomy, so the fuzzy tier must be suppressed entirely.
        let fuzzy = ruleset.extend_defaults.then_some(FuzzyClassifier);
        // The LLM tier is only built on request; an init failure aborts
        // construction rather than silently running without the tier.
        let llm = if config.use_llm {
            let classifier = LlmClassifier::from_provider(
                &config.llm_provider,
                &config.llm_model,
                config.openrouter_api_key.clone(),
            )
            .map_err(|e| {
                crate::classify::errors::ClassifyError::Config(format!(
                    "LLM provider init failed: {e}"
                ))
            })?;
            Some(classifier)
        } else {
            None
        };
        let taxonomy = TaxonomyRegistry::new(custom_taxonomy);
        let issue_type = IssueTypeTier::with_taxonomy(taxonomy.clone());
        let jira_project = JiraProjectTier::with_taxonomy_and_confidence(
            jira_project_mappings,
            taxonomy.clone(),
            jira_confidence.unwrap_or(
                crate::classify::tiers::jira_project_tier::DEFAULT_PROJECT_MAPPING_CONFIDENCE,
            ),
        );
        // Tier 0 exists only when the caller supplied an override database.
        let override_tier = override_conn.map(|c| OverrideTier::with_taxonomy(c, taxonomy.clone()));
        Ok(Self {
            override_tier,
            exact,
            issue_type,
            regex,
            jira_project,
            weighted_sum,
            fuzzy,
            llm,
            taxonomy,
            config,
        })
    }

    /// Borrow the engine's taxonomy registry.
    ///
    /// Why: report formatters that surface top-level categories need to
    /// resolve subcategory strings; sharing the engine's registry keeps
    /// the resolution consistent across the run.
    /// What: returns a shared reference to the internal
    /// [`TaxonomyRegistry`] — the same registry the engine uses to fill
    /// `top_level` on the verdicts it builds.
    /// Test: covered indirectly — every callable engine test holds a
    /// taxonomy reference.
    pub fn taxonomy(&self) -> &TaxonomyRegistry {
        &self.taxonomy
    }

    /// Install an already-constructed LLM classifier on this engine.
    ///
    /// Why: the pipeline's `build_engine` constructs the engine synchronously
    /// (without an LLM tier) and attaches the classifier afterwards, once the
    /// async SDK init (Bedrock credential resolution) has finished. That
    /// keeps the constructor synchronous — required by Rayon-batch callers —
    /// while still letting the async LLM init run at the right moment.
    /// What: stores `classifier` as the engine's LLM tier (replacing any
    /// previous one) and marks `config.use_llm` as `true`.
    /// Test: exercised indirectly by all pipeline integration tests that use
    /// `use_llm: true`.
    pub fn attach_llm(&mut self, classifier: LlmClassifier) {
        self.llm = Some(classifier);
        // Keep the reported config in step with the tier that is now present.
        self.config.use_llm = true;
    }

    /// Test-only seam: swap in an LLM tier that talks to an explicit
    /// endpoint using a fixed API key.
    ///
    /// Why: integration tests that exercise the LLM tier (e.g. complexity
    /// backfill) must target a `wiremock` server instead of a real provider.
    /// Production code never calls this.
    /// What: builds an `LlmClassifier` for the configured `llm_model` with
    /// the key `"sk-test"`, points it at `endpoint`, and stores it in
    /// `self.llm`. `config.use_llm` is left untouched.
    /// Test: used by the pipeline complexity-backfill integration tests.
    #[cfg(test)]
    pub(crate) fn with_test_llm_endpoint(mut self, endpoint: &str) -> Self {
        let fixed_key = Some("sk-test".to_string());
        let stub_llm =
            LlmClassifier::new(&self.config.llm_model, fixed_key).with_endpoint(endpoint);
        self.llm = Some(stub_llm);
        self
    }

    /// Borrow the engine's effective configuration.
    ///
    /// What: returns a shared reference to the stored
    /// [`ClassificationEngineConfig`]. This is the config passed to the
    /// constructor, except that [`Self::attach_llm`] sets `use_llm` to
    /// `true`, so the value may differ from what the caller originally
    /// supplied.
    pub fn config(&self) -> &ClassificationEngineConfig {
        &self.config
    }

    /// Run the synchronous tiers for a single message, without commit or
    /// issue-tracker context.
    ///
    /// What: delegates to [`Self::classify_sync_with_context`] with
    /// `commit_sha`, `repo_path` and `issue_type` all `None`. Consequently
    /// the manual-override tier (Tier 0) and the issue-type tier (Tier 1.5)
    /// are never consulted through this entry point; the exact, JIRA
    /// project, regex, weighted-sum and fuzzy tiers run as usual. Use the
    /// context-aware variant when commit identity or a PM issue type is
    /// available.
    ///
    /// Returns `None` if no tier matched; callers may then invoke the
    /// async [`ClassificationEngine::classify`] for the LLM fallback.
    pub fn classify_sync(&self, message: &str, is_merge: bool) -> Option<ClassificationResult> {
        self.classify_sync_with_context(message, is_merge, None, None, None)
    }

    /// Context-aware variant of [`Self::classify_sync`] that supplies
    /// optional commit identity (for Tier 0) and PM-system issue type
    /// (for Tier 1.5).
    ///
    /// What: walks the synchronous tiers in precedence order and returns the
    /// first verdict produced: manual override → exact keywords → issue-type
    /// mapping → JIRA project-key mapping → regex → weighted-sum → fuzzy.
    /// Tier 0 needs the override tier *and* both `commit_sha` and
    /// `repo_path`; Tier 1.5 needs `issue_type`; the fuzzy tier only exists
    /// when the ruleset had `extend_defaults: true`.
    ///
    /// Returns `None` when no tier matched.
    pub fn classify_sync_with_context(
        &self,
        message: &str,
        is_merge: bool,
        commit_sha: Option<&str>,
        repo_path: Option<&str>,
        issue_type: Option<&str>,
    ) -> Option<ClassificationResult> {
        // Tier 0: manual override (DB lookup, short-circuits everything).
        // Requires the tier itself plus both halves of the commit identity.
        let manual_override = match (self.override_tier.as_ref(), commit_sha, repo_path) {
            (Some(tier), Some(sha), Some(repo)) => tier.lookup(sha, repo),
            _ => None,
        };
        if manual_override.is_some() {
            return manual_override;
        }

        // Tier 1: exact keywords.
        if let Some(exact_rule) = self.exact.classify(message) {
            let verdict = ClassificationResult {
                top_level: self.taxonomy.resolve(&exact_rule.category),
                category: exact_rule.category.clone(),
                subcategory: exact_rule.subcategory.clone(),
                confidence: exact_rule.confidence,
                method: ClassificationMethod::ExactRule,
                ticket_id: RegexMatcher::extract_ticket_id(message),
                complexity: None,
            };
            return Some(verdict);
        }

        // Tier 1.5: PM issue-type mapping. The tier only sees the issue
        // type, so the ticket ID is always taken from the message text.
        if let Some(mut verdict) = issue_type.and_then(|kind| self.issue_type.classify(kind)) {
            verdict.ticket_id = RegexMatcher::extract_ticket_id(message);
            return Some(verdict);
        }

        // Tier 1.6: JIRA project-key mapping (if configured).
        //
        // Issue #206 — JIRA project codes (e.g. `TQL-1234`) carry semantic
        // meaning that message parsing cannot reproduce: `TQL` is an
        // existing-product bug tracker, `INFRA` is platform work, and so on.
        // This tier runs *before* the regex tier so the project mapping
        // outranks the generic `[A-Z]+-\d+` `jira-ticket` rule (which routes
        // everything to "feature/ticketed" at confidence 0.7). Tier-0 manual
        // overrides still win because they short-circuit above.
        if !self.jira_project.is_empty() {
            let mapped = self.jira_project.classify(message);
            if mapped.is_some() {
                return mapped;
            }
        }

        // Tier 2: regex.
        if let Some(regex_rule) = self.regex.classify(message) {
            let verdict = ClassificationResult {
                top_level: self.taxonomy.resolve(&regex_rule.category),
                category: regex_rule.category.clone(),
                subcategory: regex_rule.subcategory.clone(),
                confidence: regex_rule.confidence,
                method: ClassificationMethod::RegexRule,
                ticket_id: RegexMatcher::extract_ticket_id(message),
                complexity: None,
            };
            return Some(verdict);
        }

        // Tier 2.5: weighted-sum classifier.
        //
        // Sits between the regex tier and the fuzzy tier. Active even when
        // `extend_defaults: false` because it composes signals rather than
        // emitting hardcoded built-in category strings. Disabled
        // per-instance via `WeightedSumConfig { enabled: false }`.
        //
        // File paths are not available in the synchronous path (they would
        // require a DB join); pass an empty slice so the signal contributes
        // zero rather than penalising the commit.
        if let Some(scored) = self.weighted_sum.classify(message, is_merge, &[]) {
            return Some(self.backfill_heuristic_verdict(scored, message));
        }

        // Final synchronous tier: fuzzy heuristics (only when
        // extend_defaults is true). The fuzzy tier emits hardcoded built-in
        // category strings ("merge", "feature", "chore"). When the user's
        // ruleset has extend_defaults: false the fuzzy field is None, and
        // nothing fires here.
        self.fuzzy
            .as_ref()
            .and_then(|fuzzy| fuzzy.classify(message, is_merge))
            .map(|guess| self.backfill_heuristic_verdict(guess, message))
    }

    /// Shared post-processing for weighted-sum and fuzzy verdicts.
    ///
    /// What: fills `ticket_id` from the message when the tier left it empty,
    /// then re-resolves `top_level` through the engine's registry in case
    /// the user has overridden the default parent for the verdict's
    /// category. A failed resolution leaves the tier's own `top_level`
    /// untouched.
    fn backfill_heuristic_verdict(
        &self,
        mut verdict: ClassificationResult,
        message: &str,
    ) -> ClassificationResult {
        if verdict.ticket_id.is_none() {
            verdict.ticket_id = RegexMatcher::extract_ticket_id(message);
        }
        if let Some(top) = self.taxonomy.resolve(&verdict.category) {
            verdict.top_level = Some(top);
        }
        verdict
    }

    /// Run the full cascade including the optional LLM fallback.
    ///
    /// What: tries the synchronous tiers first (via [`Self::classify_sync`],
    /// i.e. without commit or issue context); if none matched, asks the LLM
    /// tier; if that also yields nothing, returns an unclassified verdict
    /// whose `ticket_id` is still extracted from the message text.
    pub async fn classify(&self, message: &str, is_merge: bool) -> ClassificationResult {
        if let Some(sync_verdict) = self.classify_sync(message, is_merge) {
            return sync_verdict;
        }

        match self.llm_classify_only(message).await {
            Some(llm_verdict) => llm_verdict,
            None => {
                // Nothing matched anywhere: keep the ticket reference so it
                // is not lost along with the category.
                let mut unmatched = ClassificationResult::unclassified();
                unmatched.ticket_id = RegexMatcher::extract_ticket_id(message);
                unmatched
            }
        }
    }

    /// Invoke the LLM tier directly, bypassing every synchronous tier.
    ///
    /// Returns `None` when no LLM tier is attached to the engine or when the
    /// classifier itself yields no verdict (presumably because no API key is
    /// reachable or the underlying request failed — that logic lives in
    /// `LlmClassifier`). The pipeline-level LLM fallback uses this to route
    /// low-confidence catch-all verdicts to the LLM without re-running
    /// `classify_sync` (which would short-circuit on the same low-confidence
    /// verdict that triggered the fallback).
    ///
    /// `top_level` is always re-resolved through the engine's taxonomy, and
    /// `ticket_id` is backfilled from the message text when the LLM verdict
    /// carries none — without this the pipeline's overwrite-guard would drop
    /// a ticket reference carried by the original tier verdict when the LLM
    /// result wins.
    pub async fn llm_classify_only(&self, message: &str) -> Option<ClassificationResult> {
        let mut verdict = self.llm.as_ref()?.classify(message).await?;
        verdict.ticket_id = verdict
            .ticket_id
            .take()
            .or_else(|| RegexMatcher::extract_ticket_id(message));
        verdict.top_level = self.taxonomy.resolve(&verdict.category);
        Some(verdict)
    }

    /// `Some(true)` when the LLM tier is enabled and has a reachable API
    /// key, `Some(false)` when it is enabled but unconfigured, `None` when
    /// the tier is disabled entirely. Callers can warn at startup when the
    /// middle case occurs to avoid silent misconfiguration.
    pub fn llm_has_api_key(&self) -> Option<bool> {
        self.llm.as_ref().map(LlmClassifier::has_api_key)
    }

    /// Classify a batch of `(message, is_merge)` pairs in parallel using
    /// Rayon (tiers 1–3 only). Entries where no tier matched are returned
    /// as [`ClassificationResult::unclassified`].
    pub fn classify_batch(&self, messages: &[(&str, bool)]) -> Vec<ClassificationResult> {
        messages
            .par_iter()
            .map(|(msg, is_merge)| {
                self.classify_sync(msg, *is_merge)
                    .unwrap_or_else(ClassificationResult::unclassified)
            })
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::classify::rules::default_rules;

    /// Why: issue #206 requires the JIRA project mapping to act as a
    /// Tier-1.6 tiebreaker, sitting between the exact-keyword and regex
    /// tiers. Should the ordering regress, the generic `jira-ticket` regex
    /// rule (confidence 0.7, category "feature/ticketed") would claim the
    /// verdict and operators would never see their mapping take effect.
    /// What: build an engine over the default ruleset with a single mapping
    /// (`TQL → bug_fix`), classify a message matching both the generic
    /// ticket pattern and the mapping, and assert the mapping wins.
    /// Test: pure cascade exercise, no DB.
    #[test]
    fn jira_project_mapping_outranks_generic_ticket_regex() {
        let project_map = HashMap::from([("TQL".to_string(), "bug_fix".to_string())]);
        let engine = ClassificationEngine::with_taxonomy_and_mappings(
            default_rules(),
            ClassificationEngineConfig::default(),
            Vec::new(),
            project_map,
            None,
        )
        .expect("engine builds");

        // Without the mapping, the catch-all / `jira-ticket` rules would
        // yield "feature/ticketed" at 0.7. With it we expect "bug_fix" at
        // the JIRA tier's default confidence (0.88).
        let verdict = engine
            .classify_sync("TQL-1234 fix null pointer", false)
            .expect("verdict");
        assert_eq!(verdict.category, "bug_fix");
        let confidence_gap = (verdict.confidence - 0.88).abs();
        assert!(confidence_gap < 1e-6);
        assert_eq!(verdict.ticket_id.as_deref(), Some("TQL-1234"));
    }

    /// Why: an operator-supplied confidence for the JIRA project tier must
    /// actually reach the verdict; otherwise the tuning knob exposed by
    /// `with_taxonomy_mappings_and_confidence` would be silently ignored.
    /// What: build an engine with `jira_confidence = Some(0.5)` and check
    /// the verdict for a mapped project key carries exactly that value.
    /// Test: pure constructor + classify exercise.
    #[test]
    fn jira_project_mapping_confidence_threads_through_engine_builder() {
        let project_map = HashMap::from([("INFRA".to_string(), "platform".to_string())]);
        let custom_confidence = Some(0.5);
        let engine = ClassificationEngine::with_taxonomy_mappings_and_confidence(
            default_rules(),
            ClassificationEngineConfig::default(),
            Vec::new(),
            project_map,
            custom_confidence,
            None,
        )
        .expect("engine builds");

        let verdict = engine
            .classify_sync("INFRA-7 patch", false)
            .expect("verdict");
        let confidence_gap = (verdict.confidence - 0.5).abs();
        assert!(confidence_gap < 1e-6);
    }

    /// Why: the fuzzy tier emits hardcoded built-in category strings
    /// ("merge", "feature", "chore") that clash with user-defined
    /// taxonomies, so it must stay silent when `extend_defaults: false`.
    /// The weighted-sum tier (Tier 2.5) intentionally remains active in
    /// that mode, which means a verdict may still be produced for the
    /// message used here.
    /// What: build an engine from a minimal `extend_defaults: false`
    /// ruleset, classify a merge-commit message, and assert that any
    /// verdict returned does not carry `ClassificationMethod::FuzzyMatch`.
    /// A `None` result also passes: the test checks only that the fuzzy
    /// tier was not the source, not which tier (if any) fired.
    /// Test: pure cascade exercise, no DB or HTTP.
    #[test]
    fn fuzzy_tier_suppressed_when_extend_defaults_false() {
        use crate::classify::rules::{Rule, RuleSet};
        let deploy_rule = Rule {
            id: "my-deploy".to_string(),
            category: "deployment".to_string(),
            subcategory: None,
            keywords: vec!["deploy:".to_string()],
            patterns: vec![],
            priority: 110,
            confidence: 0.9,
        };
        let ruleset = RuleSet {
            version: None,
            extend_defaults: false, // user has a custom-only taxonomy
            rules: vec![deploy_rule],
        };
        let engine = ClassificationEngine::new(ruleset, ClassificationEngineConfig::default())
            .expect("engine builds");

        // A merge-commit message that would normally reach the fuzzy tier.
        // With extend_defaults: false only the weighted-sum tier may still
        // answer, so whatever comes back must not be a FuzzyMatch.
        let result = engine.classify_sync("Merge pull request #42 from main", true);
        if let Some(verdict) = result.as_ref() {
            assert_ne!(
                verdict.method,
                ClassificationMethod::FuzzyMatch,
                "fuzzy tier must not fire when extend_defaults is false; got: {result:?}"
            );
        }
    }

    /// Why: with `extend_defaults: true` the built-in behaviour must be
    /// preserved — a merge commit has to come out of the cascade classified
    /// as "merge" for users who opt in to the built-in ruleset.
    /// What: build an engine from the default ruleset with
    /// `extend_defaults` forced to `true`, classify a merge commit, and
    /// assert a verdict exists with category "merge".
    /// NOTE(review): this does not pin *which* tier produced the verdict.
    /// Tiers ahead of the fuzzy tier (regex, weighted-sum) can also emit
    /// "merge", so the test guards the end-to-end outcome rather than
    /// proving the fuzzy tier itself fired. Asserting on `method` would be
    /// needed for that, once the expected tier is confirmed.
    /// Test: pure cascade exercise.
    #[test]
    fn fuzzy_tier_active_when_extend_defaults_true() {
        let mut ruleset = default_rules();
        ruleset.extend_defaults = true;
        let engine = ClassificationEngine::new(ruleset, ClassificationEngineConfig::default())
            .expect("engine builds");

        let verdict = engine
            .classify_sync("Merge pull request #42 from main", true)
            .expect("a merge commit must be classified when extend_defaults is true");
        assert_eq!(verdict.category, "merge");
    }

    /// Why: conventional-commit prefixes matched by the exact-keyword tier
    /// (`fix:`, `feat:`) must keep precedence over the JIRA mapping — they
    /// encode developer intent at much higher confidence than a project
    /// key. The cascade consults Tier 1 (ExactRule) before Tier 1.6
    /// (JiraProjectTier), so the exact verdict is returned before the JIRA
    /// tier is ever asked.
    /// What: classify a `fix: TQL-1 ...` message with a TQL mapping in
    /// place; assert the exact rule wins and the method is `ExactRule`.
    /// Test: pure cascade exercise.
    #[test]
    fn exact_rule_still_beats_jira_project_mapping() {
        let project_map = HashMap::from([("TQL".to_string(), "platform".to_string())]);
        let engine = ClassificationEngine::with_taxonomy_and_mappings(
            default_rules(),
            ClassificationEngineConfig::default(),
            Vec::new(),
            project_map,
            None,
        )
        .expect("engine builds");

        let verdict = engine
            .classify_sync("fix: TQL-1 handle null user", false)
            .expect("verdict");
        assert_eq!(verdict.category, "bugfix");
        assert_eq!(verdict.method, ClassificationMethod::ExactRule);
    }

    /// Why: verdicts from the JIRA project-key tier (Tier 1.6) must be
    /// tagged `ExternalSource`, not `RegexRule`, so analytics dashboards
    /// attribute them to the external-source bucket. Before tga 1.5.3 the
    /// wrong method made JIRA contribution show up as ~1% (`regex_rule`)
    /// when it should have been 5–10% (`external_source`) (issue #319).
    /// What: classify a bare JIRA-key message with a project mapping in
    /// place and assert the verdict's method is `ExternalSource`, along
    /// with the mapped category and extracted ticket ID.
    /// Test: pure cascade exercise, no DB.
    #[test]
    fn jira_project_tier_reports_external_source_method() {
        let project_map = HashMap::from([("TQL".to_string(), "bug_fix".to_string())]);
        let engine = ClassificationEngine::with_taxonomy_and_mappings(
            default_rules(),
            ClassificationEngineConfig::default(),
            Vec::new(),
            project_map,
            None,
        )
        .expect("engine builds");

        // No conventional-commit prefix here, so the exact tier stays quiet
        // and the JIRA project tier is the one that answers.
        let verdict = engine
            .classify_sync("TQL-1234 fix null pointer", false)
            .expect("verdict");
        assert_eq!(
            verdict.method,
            ClassificationMethod::ExternalSource,
            "JIRA project-key mapping must report ExternalSource, not RegexRule"
        );
        assert_eq!(verdict.category, "bug_fix");
        assert_eq!(verdict.ticket_id.as_deref(), Some("TQL-1234"));
    }
}