zagens-cli 0.8.3

Zagens headless CLI + HTTP/SSE runtime sidecar (`zagens`, `zagens-runtime` binaries)
Documentation
use std::time::Duration;

use anyhow::Result;

use crate::models::{ContentBlock, Message, MessageRequest, SystemPrompt};

use super::runtime::SubAgentRuntime;

/// Model and reasoning-effort choice resolved for a single sub-agent assignment.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct SubAgentResolvedRoute {
    /// Name of the model the sub-agent should run on.
    pub(crate) model: String,
    /// Reasoning-effort setting string, or `None` when no effort was selected
    /// or configured for this assignment.
    pub(crate) reasoning_effort: Option<String>,
}

/// Resolves the model and reasoning effort for a sub-agent assignment.
///
/// The route starts from [`fallback_subagent_assignment_route`], which is
/// computed locally. If the runtime has automatic model selection or automatic
/// reasoning effort enabled, the flash router is then consulted and its
/// recommendation overrides the fallback values it is allowed to touch:
///
/// * the model is replaced only when `runtime.auto_model` is set and the
///   caller did not pass an explicit `configured_model`;
/// * the reasoning effort is replaced only when `runtime.reasoning_effort_auto`
///   is set and the recommendation actually carries an effort.
///
/// The router is best-effort: any error (including a timeout) or an empty
/// recommendation leaves the fallback route untouched.
///
/// The router is only called when at least one of the two overrides could be
/// applied, so an explicitly configured model with a fixed reasoning effort
/// never pays for a round trip whose answer would be discarded.
pub async fn resolve_subagent_assignment_route(
    runtime: &SubAgentRuntime,
    configured_model: Option<String>,
    prompt: &str,
) -> SubAgentResolvedRoute {
    // An explicitly configured model always wins over the router's suggestion.
    let may_override_model = runtime.auto_model && configured_model.is_none();
    let may_override_effort = runtime.reasoning_effort_auto;

    let mut route = fallback_subagent_assignment_route(runtime, configured_model, prompt);

    if (may_override_model || may_override_effort)
        && let Ok(Some(recommendation)) = subagent_flash_router(runtime, prompt).await
    {
        if may_override_model {
            route.model = recommendation.model;
        }
        // Keep the fallback effort when the router did not recommend one.
        if may_override_effort
            && let Some(effort) = recommendation.reasoning_effort
        {
            route.reasoning_effort = Some(effort.as_setting().to_string());
        }
    }

    route
}

pub(crate) fn fallback_subagent_assignment_route(
    runtime: &SubAgentRuntime,
    configured_model: Option<String>,
    prompt: &str,
) -> SubAgentResolvedRoute {
    let model = if let Some(model) = configured_model {
        model
    } else if runtime.auto_model {
        crate::auto_route::auto_model_heuristic(prompt, &runtime.model)
    } else {
        runtime.model.clone()
    };

    let reasoning_effort = if runtime.reasoning_effort_auto {
        let effort = match crate::auto_reasoning::select(false, prompt) {
            crate::agent_surface::ReasoningEffort::Low
            | crate::agent_surface::ReasoningEffort::Medium => {
                crate::agent_surface::ReasoningEffort::High
            }
            other => other,
        };
        Some(effort.as_setting().to_string())
    } else {
        runtime.reasoning_effort.clone()
    };

    SubAgentResolvedRoute {
        model,
        reasoning_effort,
    }
}

/// Asks the flash model for a routing recommendation for `prompt`.
///
/// Sends a single non-streaming request (system prompt
/// `SUBAGENT_ROUTER_SYSTEM_PROMPT`, user turn built by
/// [`subagent_router_prompt`]) through `runtime.client`, bounded by a
/// four-second timeout, and parses the reply text with
/// `auto_route::parse_auto_route_recommendation`.
///
/// # Errors
///
/// Returns an error when the timeout elapses or when the client call fails.
/// In test builds the function short-circuits to `Ok(None)` without issuing
/// a request.
pub(crate) async fn subagent_flash_router(
    runtime: &SubAgentRuntime,
    prompt: &str,
) -> Result<Option<crate::auto_route::AutoRouteRecommendation>> {
    if cfg!(test) {
        return Ok(None);
    }

    let user_turn = Message {
        role: "user".to_string(),
        content: vec![ContentBlock::Text {
            text: subagent_router_prompt(runtime, prompt),
            cache_control: None,
        }],
    };
    let system_prompt = SystemPrompt::Text(SUBAGENT_ROUTER_SYSTEM_PROMPT.to_string());

    let request = MessageRequest {
        model: "deepseek-v4-flash".to_string(),
        messages: vec![user_turn],
        max_tokens: 96,
        system: Some(system_prompt),
        tools: None,
        tool_choice: None,
        metadata: None,
        thinking: None,
        reasoning_effort: Some("off".to_string()),
        stream: Some(false),
        temperature: Some(0.0),
        top_p: None,
    };

    let deadline = Duration::from_secs(4);
    let pending = runtime.client.create_message(request);
    // First `?` surfaces the timeout, second `?` surfaces the client error.
    let response = tokio::time::timeout(deadline, pending).await??;

    let reply = message_response_text(&response.content);
    Ok(crate::auto_route::parse_auto_route_recommendation(&reply))
}

/// System prompt sent with the routing request built in `subagent_flash_router`.
///
/// It instructs the model to answer with compact JSON only, carrying a
/// `model` field (`deepseek-v4-flash` or `deepseek-v4-pro`) and a `thinking`
/// field (`off`, `high` or `max`), and describes when each value applies.
const SUBAGENT_ROUTER_SYSTEM_PROMPT: &str = "\
You are the Zagens sub-agent routing manager. Return only compact JSON: \
{\"model\":\"deepseek-v4-flash|deepseek-v4-pro\",\"thinking\":\"off|high|max\"}. \
Treat each child assignment like a customer request entering a team queue: decide the least \
sufficient worker and thinking budget for that assignment. Do not treat being a sub-agent as \
important by itself. Use Flash for trivial, read-only, status, lookup, or single-step work. \
Use Pro for coding, debugging, release work, multi-file changes, security, architecture, \
high-risk decisions, ambiguous requests, or work likely to need tool-call judgment. Use thinking \
off for trivial no-tool work, high for ordinary reasoning, and max only for hard, risky, \
multi-step, uncertain, or tool-heavy work.";

/// Builds the user-turn text sent to the sub-agent router.
///
/// The text reports the parent's model mode (`auto` or `fixed`), the parent's
/// thinking mode (`auto`, the configured effort, or `provider-default` when
/// none is configured), and the assignment itself, cut to at most 4,000
/// characters by [`truncate_subagent_router_prompt`].
pub(crate) fn subagent_router_prompt(runtime: &SubAgentRuntime, prompt: &str) -> String {
    let model_mode = if runtime.auto_model { "auto" } else { "fixed" };

    let configured_effort = runtime.reasoning_effort.as_deref();
    let thinking_mode = match (runtime.reasoning_effort_auto, configured_effort) {
        (true, _) => "auto",
        (false, Some(effort)) => effort,
        (false, None) => "provider-default",
    };

    format!(
        "Parent selected model mode: {}\nParent selected thinking mode: {}\n\nSub-agent assignment:\n{}\n\nReturn JSON only.",
        model_mode,
        thinking_mode,
        truncate_subagent_router_prompt(prompt, 4_000)
    )
}

/// Limits `text` to at most `max_chars` characters (Unicode scalar values).
///
/// Text that already fits is returned unchanged. Longer text is cut after
/// `max_chars` characters and suffixed with a `"\n[truncated]"` marker line.
pub(crate) fn truncate_subagent_router_prompt(text: &str, max_chars: usize) -> String {
    // The byte offset of the first character past the limit, if there is one,
    // is exactly where the kept prefix ends.
    match text.char_indices().nth(max_chars) {
        None => text.to_string(),
        Some((cut, _)) => {
            let mut clipped = String::from(&text[..cut]);
            clipped.push_str("\n[truncated]");
            clipped
        }
    }
}

/// Flattens the textual parts of a response into a single string.
///
/// `Text` and `Thinking` blocks are appended in order; every other block kind
/// is ignored. A newline is inserted before a fragment only when the
/// accumulated output is non-empty, so leading empty fragments do not produce
/// a leading newline.
pub(crate) fn message_response_text(blocks: &[ContentBlock]) -> String {
    /// Appends `fragment`, preceded by a newline when `buffer` already has content.
    fn append_fragment(buffer: &mut String, fragment: &str) {
        if !buffer.is_empty() {
            buffer.push('\n');
        }
        buffer.push_str(fragment);
    }

    let mut joined = String::new();
    for block in blocks {
        match block {
            ContentBlock::Text { text, .. } => append_fragment(&mut joined, text),
            ContentBlock::Thinking { thinking } => append_fragment(&mut joined, thinking),
            _ => {}
        }
    }
    joined
}