octoroute 1.0.0

//! LLM-based router that uses an LLM to make intelligent routing decisions
//!
//! Uses a configurable tier (Fast/Balanced/Deep) via config.routing.router_tier to analyze
//! requests and choose the optimal target model. This is a pure LLM routing strategy that
//! always uses LLM analysis (not a fallback - see HybridRouter for rule+LLM fallback).
//!
//! ## Tier Selection for Routing
//!
//! See [`TierSelector`] documentation for tier comparison,
//! latency characteristics, and trade-offs when choosing a router tier.

use crate::error::{AppError, AppResult};
use crate::models::endpoint_name::ExclusionSet;
use crate::models::{ModelSelector, TierSelector};
use crate::router::{RouteMetadata, RoutingDecision, RoutingStrategy, TargetModel};
use async_trait::async_trait;
use std::sync::Arc;

/// Trait for LLM-based routing
///
/// Abstracts "ask an LLM which tier should serve this request" behind a single
/// async method so that callers can be handed any implementation. This allows
/// dependency injection of different LLM router implementations, enabling
/// testing with mock routers that don't make real network calls.
///
/// Implementors must be `Send + Sync`.
#[async_trait]
pub trait LlmRouter: Send + Sync {
    /// Route a request based on LLM analysis
    ///
    /// # Arguments
    ///
    /// * `user_prompt` - The user's original prompt
    /// * `meta` - Request metadata (token estimate, importance, task type)
    ///
    /// # Returns
    ///
    /// A routing decision indicating which tier to use.
    ///
    /// # Errors
    ///
    /// Returns an error if routing fails (no healthy endpoints, LLM
    /// malfunction, etc.).
    async fn route(&self, user_prompt: &str, meta: &RouteMetadata) -> AppResult<RoutingDecision>;
}

/// Errors specific to LLM-based routing decisions
///
/// Categorizes errors into systemic (LLM malfunction) vs transient (network/endpoint issues).
/// This allows the retry logic to distinguish between errors that should fail fast vs errors
/// that can be resolved by trying a different endpoint.
#[derive(Debug, thiserror::Error)]
pub enum LlmRouterError {
    /// Empty response from LLM router (systemic - no retry)
    #[error(
        "LLM router returned empty response from {endpoint}. \
         Expected response containing one of: FAST, BALANCED, or DEEP. \
         Possible causes: safety filter activation, API failure, streaming error, or LLM misconfiguration."
    )]
    EmptyResponse { endpoint: String },

    /// Unparseable response from LLM router (systemic - no retry)
    #[error("LLM router returned unparseable response ({response_length} bytes): {response}")]
    UnparseableResponse {
        endpoint: String,
        response: String,
        response_length: usize,
    },

    /// Refusal or error from LLM router (systemic - no retry)
    #[error("LLM router refused or returned error: {message}")]
    Refusal { endpoint: String, message: String },

    /// Response size limit exceeded (systemic - no retry)
    #[error(
        "Router response exceeded {max_size} bytes (got {size} bytes). LLM not following instructions."
    )]
    SizeExceeded {
        endpoint: String,
        size: usize,
        max_size: usize,
    },

    /// AgentOptions configuration failure (systemic - no retry)
    #[error("Failed to configure AgentOptions for router: {details}")]
    AgentOptionsConfigError { endpoint: String, details: String },

    /// Stream error during router query (transient - retry allowed)
    #[error("Stream error after {bytes_received} bytes received: {error_message}")]
    StreamError {
        endpoint: String,
        bytes_received: usize,
        error_message: String,
    },

    /// Router query timeout (transient - retry allowed)
    #[error(
        "Router query to {endpoint} timed out after {timeout_seconds}s (tier: {router_tier:?}, attempt {attempt}/{max_attempts})"
    )]
    Timeout {
        endpoint: String,
        timeout_seconds: u64,
        attempt: usize,
        max_attempts: usize,
        router_tier: crate::router::TargetModel,
    },
}

impl LlmRouterError {
    /// Returns true if this error is retryable (transient network/endpoint issue)
    ///
    /// Retryable errors:
    /// - StreamError: Network interruption, may succeed with different endpoint
    /// - Timeout: Endpoint overloaded, may succeed with different endpoint
    ///
    /// Non-retryable (systemic) errors:
    /// - EmptyResponse: LLM malfunction
    /// - UnparseableResponse: LLM not following instructions
    /// - Refusal: Safety filter or LLM refusing request
    /// - SizeExceeded: LLM generating invalid output
    /// - AgentOptionsConfigError: Configuration problem
    pub fn is_retryable(&self) -> bool {
        matches!(
            self,
            LlmRouterError::StreamError { .. } | LlmRouterError::Timeout { .. }
        )
    }
}

// From<LlmRouterError> for AppError is auto-generated by the #[from] attribute
// on AppError::LlmRouting variant. This preserves type information instead of
// losing it by converting to AppError::ModelQueryFailed.

/// Upper bound, in bytes, on the text collected from the router LLM (1 KiB)
///
/// Caps memory use when the router LLM malfunctions (runaway generation,
/// hallucination loops, prompt injection).
///
/// **Expected response**: "FAST", "BALANCED", or "DEEP" with optional explanation (~10-200 bytes).
/// **Safety margin**: 1024 bytes is roughly 5x the largest expected legitimate response (~200 bytes).
///
/// A response that would grow past this limit indicates LLM misconfiguration and is
/// rejected while the stream is still being read.
const MAX_ROUTER_RESPONSE: usize = 1 << 10;

/// LLM-powered router that uses a model to make routing decisions
///
/// Uses the configured tier to analyze requests and choose the optimal target.
/// This router always consults the LLM; it is not a fallback layered on top of
/// rule-based routing (see `HybridRouter` for the rule+LLM combination).
///
/// # Construction-Time Validation
///
/// Uses `TierSelector` to validate that the specified tier has available endpoints.
/// The tier is chosen via `config.routing.router_tier` at construction time.
pub struct LlmBasedRouter {
    /// Tier-bound endpoint selector; also the access point for the health checker
    selector: TierSelector,
    /// Tier whose endpoints answer routing queries (reported by `tier()` and in timeout errors)
    router_tier: TargetModel,
    /// Timeout, in seconds, applied to each router query including stream consumption
    router_timeout_secs: u64,
    /// Metrics collector; used to record health-tracking failures
    metrics: Arc<crate::metrics::Metrics>,
}

impl LlmBasedRouter {
    /// Build an LLM-based router bound to a single tier
    ///
    /// # Arguments
    /// * `selector` - The underlying ModelSelector
    /// * `tier` - Which tier (Fast, Balanced, Deep) to use for routing decisions
    /// * `router_timeout_secs` - Timeout for router queries in seconds
    /// * `metrics` - Metrics collector for observability
    ///
    /// # Tier Selection
    ///
    /// - **Fast**: Lowest latency (~50-200ms) but may misroute complex requests
    /// - **Balanced**: Recommended default (~100-500ms) with good accuracy
    /// - **Deep**: Highest accuracy (~2-5s) but rarely worth the latency overhead
    ///
    /// # Errors
    ///
    /// Propagates the error from `TierSelector::new`, which validates tier
    /// availability at construction (at least one endpoint must exist for `tier`).
    pub fn new(
        selector: Arc<ModelSelector>,
        tier: TargetModel,
        router_timeout_secs: u64,
        metrics: Arc<crate::metrics::Metrics>,
    ) -> AppResult<Self> {
        // Replace the general selector with its tier-bound wrapper; an unusable
        // tier is rejected right here instead of on the first routed request.
        let selector = TierSelector::new(selector, tier)?;

        let router = Self {
            selector,
            router_tier: tier,
            router_timeout_secs,
            metrics,
        };
        Ok(router)
    }

    /// Returns the configured router tier
    ///
    /// This is the `tier` value that was passed to [`LlmBasedRouter::new`], i.e.
    /// the tier whose endpoints are queried to obtain routing decisions.
    pub fn tier(&self) -> TargetModel {
        self.router_tier
    }

    /// Classify error as retryable (transient) or non-retryable (systemic)
    ///
    /// Systemic errors indicate problems that won't be fixed by retrying with
    /// a different endpoint:
    /// - Parse failures (unparseable, empty, or oversized responses)
    /// - Refusal/error responses from LLM
    /// - Configuration errors
    ///
    /// Transient errors may be resolved by retrying with a different endpoint:
    /// - Network timeouts
    /// - Connection failures
    /// - Stream errors
    ///
    /// # Implementation
    ///
    /// Uses type-safe error classification via LlmRouterError::is_retryable()
    /// and ModelQueryError::is_retryable().
    fn is_retryable_error(error: &AppError) -> bool {
        match error {
            // Type-safe error classification - no string matching!
            AppError::LlmRouting(e) => e.is_retryable(),
            AppError::ModelQuery(e) => e.is_retryable(),

            // Config errors are systemic, never retryable
            AppError::Config(_)
            | AppError::ConfigFileRead { .. }
            | AppError::ConfigParseFailed { .. }
            | AppError::ConfigValidationFailed { .. } => false,

            // Default: assume transient for unknown error types
            // Conservative approach - retry unless we know it's systemic
            _ => true,
        }
    }

    /// Route request using LLM analysis
    ///
    /// # Async Behavior
    /// This method is async because it:
    /// - **Waits for LLM inference**: ~100-500ms for 30B model routing decision (dominant latency)
    /// - Makes HTTP requests to LLM endpoints (network I/O, ~10-100ms connection overhead)
    /// - Awaits endpoint selection from ModelSelector (async lock acquisition, <1ms)
    /// - Performs health tracking mark_success/mark_failure (async lock, <1ms)
    ///
    /// Total typical latency: ~110-600ms (dominated by LLM inference)
    ///
    /// # Retry Logic & Failure Tracking (Dual-Level)
    /// Implements sophisticated retry with TWO failure tracking mechanisms:
    /// 1. **Request-Scoped Exclusion** (`failed_endpoints`): Prevents retrying
    ///    the same endpoint within THIS request. Clears when function returns.
    /// 2. **Global Health Tracking**: Marks endpoints unhealthy after 3 consecutive
    ///    failures across ALL requests. Persists via ModelSelector's health_checker.
    ///
    /// # Cancellation Safety
    /// If the returned Future is dropped (cancelled), in-flight LLM queries will be
    /// aborted but endpoint health state remains consistent (mark_success/mark_failure
    /// only called after query completes).
    pub async fn route(
        &self,
        user_prompt: &str,
        meta: &RouteMetadata,
    ) -> AppResult<RoutingDecision> {
        // Build router prompt
        let router_prompt = Self::build_router_prompt(user_prompt, meta);

        tracing::debug!(
            prompt_length = router_prompt.len(),
            user_prompt_length = user_prompt.len(),
            "Built router prompt for LLM analysis"
        );

        // Retry loop with request-scoped exclusion (similar to chat handler)
        //
        // SCOPE: The `failed_endpoints` exclusion set is request-scoped - it exists only
        // for the duration of this function call and is discarded when the function returns.
        // This means endpoints excluded during this request's retries will be available again
        // for the next request.
        //
        // WHY REQUEST-SCOPED: If we permanently excluded endpoints after failures, a single
        // transient network glitch could permanently remove a healthy endpoint from rotation.
        // Request-scoped exclusions allow the health checker to independently track endpoint
        // health and recover failed endpoints, while still preventing retry loops from
        // hitting the same failed endpoint repeatedly within a single request.
        const MAX_ROUTER_RETRIES: usize = 2;
        const RETRY_BACKOFF_MS: u64 = 100; // Base backoff: 100ms, doubles each retry
        let mut last_error = None;
        let mut failed_endpoints = ExclusionSet::new();

        for attempt in 1..=MAX_ROUTER_RETRIES {
            // Select endpoint from router tier (with health filtering + exclusions)
            let endpoint = match self.selector.select(&failed_endpoints).await {
                Some(ep) => ep.clone(),
                None => {
                    let total_configured = self.selector.endpoint_count();
                    let excluded_count = failed_endpoints.len();
                    let router_tier = self.selector.tier();

                    // Categorize failure type for better diagnostics and actionable guidance
                    if total_configured == 0 {
                        // CONFIGURATION ERROR: No endpoints configured for this tier
                        // This should have been caught by Config::validate() but check defensively
                        tracing::error!(
                            tier = ?router_tier,
                            attempt = attempt,
                            max_retries = MAX_ROUTER_RETRIES,
                            "CONFIGURATION ERROR: No endpoints configured for {:?} tier. \
                            Check config.toml: [[models.{:?}]] section must have at least one endpoint. \
                            This should have been caught by validation.",
                            router_tier, router_tier
                        );
                        last_error = Some(AppError::Config(format!(
                            "No endpoints configured for {:?} tier (router_tier setting). \
                            Add at least one endpoint to [[models.{:?}]] in config.toml.",
                            router_tier, router_tier
                        )));

                        // Add exponential backoff before retry
                        if attempt < MAX_ROUTER_RETRIES {
                            let backoff_ms = RETRY_BACKOFF_MS.saturating_mul(
                                2_u64.saturating_pow((attempt as u32).saturating_sub(1)),
                            );
                            tokio::time::sleep(tokio::time::Duration::from_millis(backoff_ms))
                                .await;
                        }
                        continue;
                    } else if excluded_count == total_configured {
                        // COMPLETE EXHAUSTION: All configured endpoints tried and failed
                        // This is an error condition - we tried everything and nothing worked

                        // Collect failed endpoint names for debugging
                        let failed_names: Vec<&str> =
                            failed_endpoints.iter().map(|ep| ep.as_str()).collect();
                        let failed_names_str = failed_names.join(", ");

                        tracing::error!(
                            tier = ?router_tier,
                            attempt = attempt,
                            max_retries = MAX_ROUTER_RETRIES,
                            total_configured_endpoints = total_configured,
                            failed_endpoints = ?failed_endpoints,
                            last_error = ?last_error,
                            "COMPLETE EXHAUSTION: All {} {:?} tier endpoints failed for routing. \
                            All endpoints tried in this request returned errors. Check endpoint health.",
                            total_configured, router_tier
                        );

                        // Preserve the last error's details (timeout, etc.) in the exhaustion message
                        // This ensures operators see WHY endpoints failed, not just that they failed
                        let detailed_cause = if let Some(ref err) = last_error {
                            format!("Last failure: {}", err)
                        } else {
                            "No error details available".to_string()
                        };

                        last_error = Some(AppError::RoutingFailed(format!(
                            "All {} {:?} tier endpoints exhausted for routing (attempt {}/{}). \
                            Failed endpoints: {}. {}. \
                            Check endpoint connectivity and health.",
                            total_configured,
                            router_tier,
                            attempt,
                            MAX_ROUTER_RETRIES,
                            failed_names_str,
                            detailed_cause
                        )));

                        // Add exponential backoff before retry
                        if attempt < MAX_ROUTER_RETRIES {
                            let backoff_ms = RETRY_BACKOFF_MS.saturating_mul(
                                2_u64.saturating_pow((attempt as u32).saturating_sub(1)),
                            );
                            tokio::time::sleep(tokio::time::Duration::from_millis(backoff_ms))
                                .await;
                        }
                        continue;
                    } else {
                        // TRANSIENT FAILURE: Some endpoints exist but are unhealthy, waiting for recovery
                        // This is a warning, not an error - endpoints may recover soon
                        let healthy_count = total_configured - excluded_count;
                        tracing::warn!(
                            tier = ?router_tier,
                            attempt = attempt,
                            max_retries = MAX_ROUTER_RETRIES,
                            total_configured_endpoints = total_configured,
                            failed_endpoints_count = excluded_count,
                            healthy_but_unavailable_count = healthy_count,
                            failed_endpoints = ?failed_endpoints,
                            last_error = ?last_error,
                            "TRANSIENT: No available {:?} tier endpoints (configured: {}, failed: {}, \
                            healthy but unavailable: {}). Endpoints may be recovering from failures. \
                            Waiting for health checker recovery.",
                            router_tier, total_configured, excluded_count, healthy_count
                        );

                        // Preserve the last error's details (timeout, etc.) in the transient failure message
                        let detailed_cause = if let Some(ref err) = last_error {
                            format!("Last failure: {}", err)
                        } else {
                            "No error details available".to_string()
                        };

                        last_error = Some(AppError::RoutingFailed(format!(
                            "No available {:?} tier endpoints (configured: {}, failed: {}, \
                            healthy but temporarily unavailable: {}, attempt {}/{}). \
                            {}. Endpoints may recover shortly.",
                            router_tier,
                            total_configured,
                            excluded_count,
                            healthy_count,
                            attempt,
                            MAX_ROUTER_RETRIES,
                            detailed_cause
                        )));

                        // Add exponential backoff before retry
                        if attempt < MAX_ROUTER_RETRIES {
                            let backoff_ms = RETRY_BACKOFF_MS.saturating_mul(
                                2_u64.saturating_pow((attempt as u32).saturating_sub(1)),
                            );
                            tokio::time::sleep(tokio::time::Duration::from_millis(backoff_ms))
                                .await;
                        }
                        continue;
                    }
                }
            };

            tracing::debug!(
                endpoint_name = %endpoint.name(),
                endpoint_url = %endpoint.base_url(),
                tier = ?self.selector.tier(),
                attempt = attempt,
                max_retries = MAX_ROUTER_RETRIES,
                "Selected {:?} tier endpoint for routing decision",
                self.selector.tier()
            );

            // Try to query this endpoint
            let query_result = self
                .try_router_query(&endpoint, &router_prompt, attempt, MAX_ROUTER_RETRIES)
                .await;

            match query_result {
                Ok(target_model) => {
                    // Success! Mark endpoint healthy for immediate recovery
                    //
                    // Health tracking is observability infrastructure, not core functionality.
                    // If mark_success fails, log warning but DON'T fail the request - we have
                    // a valid routing decision from the LLM. This matches chat handler behavior
                    // (warn and continue, don't propagate health tracking errors).
                    //
                    // Potential failures: UnknownEndpoint (config reload race), HttpClientCreationFailed
                    // (TLS issues), InvalidEndpointUrl (config error). All are observability issues,
                    // not reasons to discard a successful routing decision.
                    if let Err(e) = self
                        .selector
                        .health_checker()
                        .mark_success(endpoint.name())
                        .await
                    {
                        self.metrics
                            .health_tracking_failure(endpoint.name(), e.error_type());

                        tracing::warn!(
                            endpoint_name = %endpoint.name(),
                            error = %e,
                            target_model = ?target_model,
                            attempt = attempt,
                            "Health tracking skipped: {} (router continues with successful routing decision)",
                            e
                        );
                    }

                    tracing::info!(
                        endpoint_name = %endpoint.name(),
                        target_model = ?target_model,
                        attempt = attempt,
                        "Router LLM successfully determined target model"
                    );

                    // Return routing decision (no warnings - health tracking errors now fail fast)
                    return Ok(RoutingDecision::new(target_model, RoutingStrategy::Llm));
                }
                Err(e) => {
                    // Classify error as retryable or systemic
                    let is_retryable = Self::is_retryable_error(&e);

                    if !is_retryable {
                        // Systemic error - fail fast without retrying
                        // Examples: parse failures, config errors, unparseable responses
                        tracing::error!(
                            endpoint_name = %endpoint.name(),
                            attempt = attempt,
                            error = %e,
                            "Router query failed with systemic error - failing fast (no retry)"
                        );
                        return Err(e);
                    }

                    // Transient error - mark endpoint unhealthy and retry with different endpoint
                    tracing::warn!(
                        endpoint_name = %endpoint.name(),
                        attempt = attempt,
                        max_retries = MAX_ROUTER_RETRIES,
                        error = %e,
                        "Router query failed with transient error, marking endpoint and retrying"
                    );

                    // Health tracking is observability infrastructure, not core functionality.
                    // If mark_failure fails, log warning but DON'T fail the request - we can still
                    // retry with other endpoints. This matches chat handler behavior (warn and continue,
                    // don't propagate health tracking errors).
                    //
                    // Potential failures: UnknownEndpoint (config reload race), HttpClientCreationFailed
                    // (TLS issues), InvalidEndpointUrl (config error). All are observability issues,
                    // not reasons to block retry logic.
                    if let Err(e) = self
                        .selector
                        .health_checker()
                        .mark_failure(endpoint.name())
                        .await
                    {
                        self.metrics
                            .health_tracking_failure(endpoint.name(), e.error_type());

                        tracing::warn!(
                            endpoint_name = %endpoint.name(),
                            error = %e,
                            attempt = attempt,
                            "Health tracking skipped: {} (router continues with retry logic)",
                            e
                        );
                    }

                    // Add to exclusion set to prevent retry on same endpoint
                    use crate::models::EndpointName;
                    failed_endpoints.insert(EndpointName::from(&endpoint));
                    last_error = Some(e);

                    // Add exponential backoff before retry
                    if attempt < MAX_ROUTER_RETRIES {
                        let backoff_ms = RETRY_BACKOFF_MS * (2_u64.pow(attempt as u32 - 1));
                        tokio::time::sleep(tokio::time::Duration::from_millis(backoff_ms)).await;
                    }
                    continue; // Try next endpoint
                }
            }
        }

        // All retries exhausted
        tracing::error!(
            tier = ?self.selector.tier(),
            max_retries = MAX_ROUTER_RETRIES,
            "All router retry attempts exhausted"
        );

        Err(last_error.unwrap_or_else(|| {
            // DEFENSIVE: This check should be unreachable if retry logic is correct.
            // All failure paths (endpoint selection, query errors) set last_error before continue.
            // This fallback exists for future-proofing - if someone adds a new failure path
            // without setting last_error, we catch it here instead of panicking.
            tracing::error!(
                tier = ?self.router_tier,
                max_retries = MAX_ROUTER_RETRIES,
                "DEFENSIVE BUG: Retry loop exhausted but last_error is None. \
                The retry loop has a missing error assignment path."
            );

            AppError::Internal(format!(
                "DEFENSIVE: All {} router retry attempts exhausted but no error recorded. \
                Indicates missing error assignment in retry logic. Please report this bug.",
                MAX_ROUTER_RETRIES
            ))
        }))
    }

    /// Helper to attempt a single router query (extracted for retry logic)
    async fn try_router_query(
        &self,
        endpoint: &crate::config::ModelEndpoint,
        router_prompt: &str,
        attempt: usize,
        max_retries: usize,
    ) -> AppResult<TargetModel> {
        // Build AgentOptions from endpoint
        let options = open_agent::AgentOptions::builder()
            .model(endpoint.name())
            .base_url(endpoint.base_url())
            .max_tokens(endpoint.max_tokens() as u32)
            .temperature(endpoint.temperature() as f32)
            .build()
            .map_err(|e| {
                tracing::error!(
                    endpoint_name = %endpoint.name(),
                    endpoint_url = %endpoint.base_url(),
                    model = %endpoint.name(),
                    max_tokens = endpoint.max_tokens(),
                    temperature = endpoint.temperature(),
                    error = %e,
                    attempt = attempt,
                    max_retries = max_retries,
                    "Failed to build AgentOptions for router query"
                );
                AppError::LlmRouting(LlmRouterError::AgentOptionsConfigError {
                    endpoint: endpoint.base_url().to_string(),
                    details: format!(
                        "{}. Check configuration: model='{}' (must be non-empty), \
                         max_tokens={} (must be > 0), base_url='{}' (must end with /v1)",
                        e,
                        endpoint.name(),
                        endpoint.max_tokens(),
                        endpoint.base_url()
                    ),
                })
            })?;

        // Query the router model with timeout protection
        // IMPORTANT: The timeout must wrap BOTH the initial connection AND stream consumption.
        // Previously, only open_agent::query() was wrapped, but wiremock delays happen during
        // stream.next().await, causing the timeout to be ineffective. This was a bug.
        use futures::StreamExt;
        use tokio::time::{Duration, timeout};

        let timeout_duration = Duration::from_secs(self.router_timeout_secs);
        let endpoint_url = endpoint.base_url().to_string();
        let endpoint_name = endpoint.name().to_string();

        // Wrap the entire query + stream consumption in a single timeout
        let query_result = timeout(timeout_duration, async {
            // Start the query and get the stream
            let mut stream = open_agent::query(router_prompt, &options)
                .await
                .map_err(|e| {
                    AppError::LlmRouting(LlmRouterError::StreamError {
                        endpoint: endpoint_url.clone(),
                        bytes_received: 0,
                        error_message: format!("Router query failed: {}", e),
                    })
                })?;

            // Collect response from stream with size limit
            let mut response_text = String::new();
            while let Some(result) = stream.next().await {
                match result {
                    Ok(block) => {
                        use open_agent::ContentBlock;
                        if let ContentBlock::Text(text_block) = block {
                            // Check size limit before accumulating
                            if response_text.len() + text_block.text.len() > MAX_ROUTER_RESPONSE {
                                return Err(AppError::LlmRouting(LlmRouterError::SizeExceeded {
                                    endpoint: endpoint_url.clone(),
                                    size: response_text.len() + text_block.text.len(),
                                    max_size: MAX_ROUTER_RESPONSE,
                                }));
                            }
                            response_text.push_str(&text_block.text);
                        }
                    }
                    Err(e) => {
                        return Err(AppError::LlmRouting(LlmRouterError::StreamError {
                            endpoint: endpoint_url.clone(),
                            bytes_received: response_text.len(),
                            error_message: format!("{}", e),
                        }));
                    }
                }
            }

            Ok::<String, AppError>(response_text)
        })
        .await;

        // Handle timeout vs inner errors
        let response_text = match query_result {
            Ok(Ok(text)) => text,
            Ok(Err(inner_error)) => {
                // Log the inner error with context
                tracing::error!(
                    endpoint_name = %endpoint_name,
                    endpoint_url = %endpoint_url,
                    error = %inner_error,
                    attempt = attempt,
                    max_retries = max_retries,
                    "Router query failed (attempt {}/{})",
                    attempt, max_retries
                );
                return Err(inner_error);
            }
            Err(_elapsed) => {
                tracing::error!(
                    endpoint_name = %endpoint_name,
                    endpoint_url = %endpoint_url,
                    timeout_seconds = self.router_timeout_secs,
                    router_tier = ?self.router_tier,
                    attempt = attempt,
                    max_retries = max_retries,
                    "Router query timeout - endpoint did not respond within {} seconds (attempt {}/{})",
                    self.router_timeout_secs, attempt, max_retries
                );
                return Err(AppError::LlmRouting(LlmRouterError::Timeout {
                    endpoint: endpoint_url,
                    timeout_seconds: self.router_timeout_secs,
                    attempt,
                    max_attempts: max_retries,
                    router_tier: self.router_tier,
                }));
            }
        };

        // Early empty response detection
        if response_text.trim().is_empty() {
            tracing::error!(
                endpoint_name = %endpoint_name,
                endpoint_url = %endpoint_url,
                attempt = attempt,
                max_retries = max_retries,
                "LLM router returned empty response (0 text blocks received) - \
                 cannot determine routing decision (attempt {}/{})",
                attempt, max_retries
            );
            return Err(AppError::LlmRouting(LlmRouterError::EmptyResponse {
                endpoint: endpoint_url,
            }));
        }

        tracing::debug!(
            endpoint_name = %endpoint_name,
            response_length = response_text.len(),
            response = %response_text,
            attempt = attempt,
            "Received router decision from LLM"
        );

        // Parse routing decision
        Self::parse_routing_decision(&response_text)
    }

    /// Build router prompt from user request + metadata
    ///
    /// Creates a structured prompt that asks the LLM to choose between
    /// FAST, BALANCED, or DEEP based on the user's request and metadata.
    ///
    /// Includes prompt injection protection:
    /// - Truncates long user prompts to prevent context overflow
    /// - Adds reinforcement instructions after user input
    fn build_router_prompt(user_prompt: &str, meta: &RouteMetadata) -> String {
        // Upper bound on how many characters of the caller's prompt are embedded
        // in the router prompt; anything past it is dropped and flagged.
        const MAX_USER_PROMPT_CHARS: usize = 500;

        // Look up the byte offset of the first character beyond the cap.
        // `char_indices` only yields offsets that sit on UTF-8 boundaries, so the
        // slice below cannot panic on multi-byte input. `None` means the prompt
        // has at most `MAX_USER_PROMPT_CHARS` characters and is used as-is.
        let request_section = match user_prompt.char_indices().nth(MAX_USER_PROMPT_CHARS) {
            Some((cut, _)) => format!("{}... [truncated]", &user_prompt[..cut]),
            None => user_prompt.to_string(),
        };

        // The single-word instruction is placed after the user text so that it is
        // the last thing the router model reads.
        format!(
            "You are a router that chooses which LLM to use.\n\n\
             Available models:\n\
             - FAST: Quick (small params), for simple chat, short Q&A, casual tasks.\n\
             - BALANCED: Good reasoning (medium params), coding, document summaries, explanations.\n\
             - DEEP: Deep reasoning (large params), creative writing, complex analysis, research.\n\n\
             User request:\n{request}\n\n\
             Metadata:\n\
             - Estimated tokens: {tokens}\n\
             - Importance: {importance:?}\n\
             - Task type: {task:?}\n\n\
             Based on the above, respond with ONLY one word: FAST, BALANCED, or DEEP.\n\
             Do not include explanations or other text.",
            request = request_section,
            tokens = meta.token_estimate,
            importance = meta.importance,
            task = meta.task_type,
        )
    }

    /// Find a word at word boundaries in text (prevents false positives)
    ///
    /// Returns the position of the first occurrence of `word` that is surrounded
    /// by word boundaries (whitespace, punctuation, or start/end of string).
    ///
    /// Prevents false positives like matching "FAST" in "BREAKFAST" or "STEADFAST".
    fn find_word_boundary(text: &str, word: &str) -> Option<usize> {
        // A "word byte" is an ASCII letter, an ASCII digit, or an underscore.
        // Underscore counts as part of a word (as in identifiers), so
        // "SUPER_FAST" does not yield "FAST". Every other byte is a boundary,
        // including any byte of a multi-byte UTF-8 character, which is why
        // "你FAST好" matches while "steadFAST" and "FAST2" do not.
        let is_word_byte = |b: u8| b == b'_' || b.is_ascii_alphanumeric();
        let bytes = text.as_bytes();

        // Walk the candidate occurrences left to right and keep the first one
        // whose neighbours on both sides are boundaries.
        text.match_indices(word)
            .map(|(start, _)| start)
            .find(|&start| {
                // Left side: start of string, or a non-word byte just before.
                let left_ok = start
                    .checked_sub(1)
                    .map_or(true, |i| !is_word_byte(bytes[i]));

                // Right side: end of string, or a non-word byte just after.
                let right_ok = bytes
                    .get(start + word.len())
                    .map_or(true, |&b| !is_word_byte(b));

                left_ok && right_ok
            })
    }

    /// Parse LLM response to extract routing decision
    ///
    /// Uses **word-boundary-aware fuzzy matching** with refusal detection to extract
    /// FAST, BALANCED, or DEEP. Word boundaries are critical because:
    /// - Without them, "BREAKFAST" would match "FAST" (substring match)
    /// - Without them, "STEADFAST" would match "FAST" (substring match)
    /// - With boundaries, only whole-word matches succeed
    ///
    /// Prevents false positives like "FAST" in "BREAKFAST" by requiring keywords
    /// to be surrounded by word boundaries (whitespace, punctuation, or start/end of string).
    /// See `find_word_boundary()` for matching logic.
    ///
    /// Returns an error if response is empty, unparseable, or indicates refusal/error.
    ///
    /// Algorithm:
    /// 1. Check for refusal/error patterns (CANNOT, CAN'T, UNABLE, ERROR, SORRY, REFUSE,
    ///    FAILED, TIMEOUT) as plain substrings - return error
    /// 2. Find leftmost routing keyword (FAST, BALANCED, DEEP) at word boundary - return that tier
    /// 3. If no keyword found at word boundaries - return error (unparseable)
    ///
    /// Examples:
    /// - "FAST" → Fast (exact match)
    /// - "I recommend FAST for this" → Fast (word boundary match)
    /// - "FAST-TRACK" → Fast (punctuation counts as word boundary)
    /// - "BREAKFAST" → Error (no word boundary, substring ignored)
    /// - "FAST or BALANCED" → Fast (leftmost at word boundary wins)
    ///
    /// Errors indicate serious problems:
    /// - LLM misconfiguration (wrong model/prompt)
    /// - Safety filter activation
    /// - API failures or rate limiting
    /// - Prompt injection bypass
    fn parse_routing_decision(response: &str) -> AppResult<TargetModel> {
        // Maximum number of characters of the raw response echoed into error messages.
        const PREVIEW_CHARS: usize = 500;

        // Returns `text` unchanged when it has at most `PREVIEW_CHARS` characters;
        // otherwise returns its first `PREVIEW_CHARS` characters followed by `marker`.
        //
        // The decision is made on characters, not bytes: `str::len` counts bytes, so
        // a byte-length test would append the marker to multi-byte responses that
        // were never actually shortened. `char_indices` also guarantees the cut is
        // on a UTF-8 boundary.
        fn preview(text: &str, marker: &str) -> String {
            match text.char_indices().nth(PREVIEW_CHARS) {
                Some((cut, _)) => format!("{}{}", &text[..cut], marker),
                None => text.to_string(),
            }
        }

        // Matching is case-insensitive: everything below works on the trimmed,
        // upper-cased copy, while logs and error previews keep the raw response.
        let normalized = response.trim().to_uppercase();

        // Empty (or whitespace-only) response: nothing to parse.
        // This associated function has no endpoint in scope, so errors carry the
        // placeholder endpoint name "router".
        if normalized.is_empty() {
            tracing::error!(
                response = %response,
                "LLM router returned empty response - cannot determine routing decision"
            );
            return Err(AppError::LlmRouting(LlmRouterError::EmptyResponse {
                endpoint: "router".to_string(),
            }));
        }

        // Refusal/error detection runs BEFORE keyword matching.
        //
        // These are plain substring checks, so false positives are possible
        // (e.g., "The FAST tier CANNOT be used"). That is accepted: the prompt asks
        // for a single word, so a multi-word answer containing a refusal keyword
        // is treated as a malfunction.
        const REFUSAL_PATTERNS: &[&str] = &[
            "CANNOT", "CAN'T", "UNABLE", "ERROR", "SORRY", "REFUSE", "FAILED", "TIMEOUT",
        ];

        // The first pattern in list order that occurs anywhere in the response wins.
        if let Some(pattern) = REFUSAL_PATTERNS
            .iter()
            .copied()
            .find(|&p| normalized.contains(p))
        {
            tracing::error!(
                response = %response,
                refusal_pattern = pattern,
                "Router LLM returned refusal or error response"
            );

            return Err(AppError::LlmRouting(LlmRouterError::Refusal {
                endpoint: "router".to_string(),
                message: format!(
                    "Router LLM returned refusal/error response (contains '{}'): '{}'",
                    pattern,
                    preview(response, "...")
                ),
            }));
        }

        // Locate each routing keyword at a word boundary and pick the leftmost one,
        // so "FAST or BALANCED" yields FAST while "BREAKFAST" yields nothing.
        // Chaining the three `Option`s avoids allocating a temporary collection.
        let fast = Self::find_word_boundary(&normalized, "FAST").map(|p| (p, TargetModel::Fast));
        let balanced =
            Self::find_word_boundary(&normalized, "BALANCED").map(|p| (p, TargetModel::Balanced));
        let deep = Self::find_word_boundary(&normalized, "DEEP").map(|p| (p, TargetModel::Deep));

        let leftmost = fast
            .into_iter()
            .chain(balanced)
            .chain(deep)
            .min_by_key(|(pos, _)| *pos);

        if let Some((_, model)) = leftmost {
            return Ok(model);
        }

        // No keyword found: fail loudly instead of silently falling back. This
        // indicates serious problems:
        // - LLM misconfiguration (wrong model, wrong prompt)
        // - Safety filter activation (LLM refusing to answer)
        // - API failures or rate limiting
        // - Prompt injection successful bypass
        tracing::error!(
            response = %response,
            response_length = response.len(),
            "Router LLM returned unparseable response - cannot extract FAST, BALANCED, or DEEP"
        );

        // `response_length` stays the byte length of the full raw response; only
        // the echoed preview is capped.
        Err(AppError::LlmRouting(LlmRouterError::UnparseableResponse {
            endpoint: "router".to_string(),
            response: preview(response, "... [truncated]"),
            response_length: response.len(),
        }))
    }
}

/// Implementation of LlmRouter trait for LlmBasedRouter
///
/// This allows LlmBasedRouter to be used as a trait object for dependency injection in tests.
#[async_trait]
impl LlmRouter for LlmBasedRouter {
    /// Trait entry point for routing.
    ///
    /// Forwards `user_prompt` and `meta` unchanged and returns the resulting
    /// `AppResult<RoutingDecision>` as-is; no extra logic is added here.
    async fn route(&self, user_prompt: &str, meta: &RouteMetadata) -> AppResult<RoutingDecision> {
        // Delegate to the existing route method
        //
        // NOTE(review): this relies on `self.route(..)` resolving to an inherent
        // `LlmBasedRouter::route` defined elsewhere in this file (method-call
        // resolution prefers inherent methods over trait methods). If that inherent
        // method were removed or renamed, this call would presumably resolve to
        // this trait method and recurse - confirm the inherent method exists.
        self.route(user_prompt, meta).await
    }
}

// Test modules
#[cfg(test)]
mod parsing_tests;

#[cfg(test)]
mod prompt_tests;

#[cfg(test)]
mod size_limit_tests;

#[cfg(test)]
mod utf8_safety_tests;

#[cfg(test)]
mod constructor_tests;

#[cfg(test)]
mod error_classification_tests;

#[cfg(test)]
mod error_type_tests;