use futures_core::Stream;
use indexmap::IndexMap;
use serde_json::Value;
use std::fmt;
use std::pin::Pin;
use std::sync::Arc;
use crate::core::tool_spec::ToolSpec;
use crate::error::{BackendError, ContextDiscoveryError, StreamError};
/// Tag identifying which kind of chunk a [`StreamChunk`] carries.
///
/// Each variant has a fixed lowercase label, available through
/// [`ChunkType::as_str`] and the `Display` implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ChunkType {
    /// Labelled `"text_delta"`.
    TextDelta,
    /// Labelled `"tool_call_delta"`.
    ToolCallDelta,
    /// Labelled `"provider_event"`.
    ProviderEvent,
    /// Labelled `"final"`.
    Final,
    /// Labelled `"retry"`.
    Retry,
}

impl ChunkType {
    /// Returns the static snake_case label associated with this variant.
    pub fn as_str(&self) -> &'static str {
        match *self {
            ChunkType::TextDelta => "text_delta",
            ChunkType::ToolCallDelta => "tool_call_delta",
            ChunkType::ProviderEvent => "provider_event",
            ChunkType::Final => "final",
            ChunkType::Retry => "retry",
        }
    }
}

impl fmt::Display for ChunkType {
    /// Writes exactly the label returned by [`ChunkType::as_str`].
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = self.as_str();
        f.write_str(label)
    }
}
/// A single tool invocation: a tool identifier string plus its arguments.
#[derive(Debug, Clone, PartialEq)]
pub struct ToolCall {
    /// Optional call identifier; `None` unless set (e.g. via [`ToolCall::with_id`]).
    pub id: Option<String>,
    /// Tool identifier string (presumably the tool's name; confirm against callers).
    pub tool: String,
    /// Arguments keyed by string, stored in an `IndexMap`.
    pub args: IndexMap<String, Value>,
    /// Optional free-form reasoning text; `None` unless set
    /// (e.g. via [`ToolCall::with_reasoning`]).
    pub reasoning: Option<String>,
}

impl ToolCall {
    /// Creates a call for `tool` with the given `args`; `id` and `reasoning`
    /// start out as `None`.
    pub fn new(tool: impl Into<String>, args: IndexMap<String, Value>) -> Self {
        let tool = tool.into();
        ToolCall {
            id: None,
            tool,
            args,
            reasoning: None,
        }
    }

    /// Returns the call with `id` set to `Some(id)`, replacing any previous value.
    pub fn with_id(self, id: impl Into<String>) -> Self {
        ToolCall {
            id: Some(id.into()),
            ..self
        }
    }

    /// Returns the call with `reasoning` set to `Some(reasoning)`, replacing any
    /// previous value.
    pub fn with_reasoning(self, reasoning: impl Into<String>) -> Self {
        ToolCall {
            reasoning: Some(reasoning.into()),
            ..self
        }
    }
}
/// A plain-text reply, wrapping the text in an owned `String`.
#[derive(Debug, Clone, PartialEq)]
pub struct TextResponse {
    /// The reply text.
    pub content: String,
}

impl TextResponse {
    /// Builds a response from anything convertible into a `String`.
    pub fn new(content: impl Into<String>) -> Self {
        let content: String = content.into();
        TextResponse { content }
    }
}
/// A response value that is either a list of tool calls or a text reply.
#[derive(Debug, Clone, PartialEq)]
pub enum LLMResponse {
/// Zero or more [`ToolCall`]s, in the order held by the vector.
ToolCalls(Vec<ToolCall>),
/// A text reply wrapped in a [`TextResponse`].
Text(TextResponse),
}
/// One item of a chunked stream: a kind tag, a text payload, and optional
/// response / usage / call metadata / provider-event attachments.
#[derive(Debug, Clone, PartialEq)]
pub struct StreamChunk {
    /// Kind tag for this chunk.
    pub chunk_type: ChunkType,
    /// Text payload; the empty string when freshly built by [`StreamChunk::new`].
    pub content: String,
    /// Optional response attached to the chunk.
    pub response: Option<LLMResponse>,
    /// Optional token counts.
    pub usage: Option<TokenUsage>,
    /// Optional detailed usage counters.
    pub usage_details: Option<LLMUsageDetails>,
    /// Optional call metadata.
    pub call_info: Option<LLMCallInfo>,
    /// Optional JSON value supplied via [`StreamChunk::with_provider_event`].
    pub provider_event: Option<Value>,
}

impl StreamChunk {
    /// Creates a chunk of the given kind with empty `content` and every
    /// optional field set to `None`.
    pub fn new(chunk_type: ChunkType) -> Self {
        let content = String::new();
        StreamChunk {
            chunk_type,
            content,
            response: None,
            usage: None,
            usage_details: None,
            call_info: None,
            provider_event: None,
        }
    }

    /// Returns the chunk with `content` replaced.
    pub fn with_content(self, content: impl Into<String>) -> Self {
        StreamChunk {
            content: content.into(),
            ..self
        }
    }

    /// Returns the chunk with `response` set to `Some(response)`.
    pub fn with_response(self, response: LLMResponse) -> Self {
        StreamChunk {
            response: Some(response),
            ..self
        }
    }

    /// Returns the chunk with all three metadata fields overwritten.
    ///
    /// Passing `None` for an argument clears the corresponding field.
    pub fn with_metadata(
        self,
        usage: Option<TokenUsage>,
        usage_details: Option<LLMUsageDetails>,
        call_info: Option<LLMCallInfo>,
    ) -> Self {
        StreamChunk {
            usage,
            usage_details,
            call_info,
            ..self
        }
    }

    /// Returns the chunk with `provider_event` set to `Some(event)`.
    pub fn with_provider_event(self, event: Value) -> Self {
        StreamChunk {
            provider_event: Some(event),
            ..self
        }
    }
}
/// Token counts for a call, stored as three independent `i64` values.
#[derive(Debug, Clone, PartialEq)]
pub struct TokenUsage {
    /// Prompt token count.
    pub prompt_tokens: i64,
    /// Completion token count.
    pub completion_tokens: i64,
    /// Total token count, stored exactly as supplied (never derived here).
    pub total_tokens: i64,
}

impl TokenUsage {
    /// Stores the three counts verbatim; no validation or summation is performed.
    pub fn new(prompt_tokens: i64, completion_tokens: i64, total_tokens: i64) -> Self {
        TokenUsage {
            prompt_tokens,
            completion_tokens,
            total_tokens,
        }
    }

    /// Returns a usage record whose three counts are all zero.
    pub fn empty() -> Self {
        Self::new(0, 0, 0)
    }
}
/// Rate-limit related values, each kept as an optional, unparsed string.
///
/// `Default` yields every field as `None`.
/// NOTE(review): presumably populated from provider response headers — confirm
/// against the code that fills this struct.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct LLMRateLimitInfo {
// Request-count values (limit / remaining / reset), as strings.
pub requests_limit: Option<String>,
pub requests_remaining: Option<String>,
pub requests_reset: Option<String>,
// Token-count values (limit / remaining / reset), as strings.
pub tokens_limit: Option<String>,
pub tokens_remaining: Option<String>,
pub tokens_reset: Option<String>,
// Stored as a string; units/format are not established in this file.
pub retry_after: Option<String>,
pub organization_id: Option<String>,
}
/// Metadata describing a single call: model/backend identifiers, flags,
/// rate-limit values and an optional cost estimate.
///
/// `Default` yields `None` for every optional field, `false` for
/// `used_responses_api`, and a default [`LLMRateLimitInfo`].
/// The `f64` cost field is why this type derives `PartialEq` but not `Eq`.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct LLMCallInfo {
// Model / backend identification strings; all optional.
pub requested_model: Option<String>,
pub response_model: Option<String>,
pub selected_backend: Option<String>,
pub mapped_model: Option<String>,
pub backend_kind: Option<String>,
pub provider_id: Option<String>,
// Plain flag; `false` by default.
pub used_responses_api: bool,
// Held as a single optional string (not a list).
pub degradation_warnings: Option<String>,
pub cache_status: Option<String>,
// Always present (not optional); defaults to all-`None` fields.
pub rate_limits: LLMRateLimitInfo,
// Optional cost estimate; the field name indicates US dollars.
pub estimated_cost_usd: Option<f64>,
}
/// Optional fine-grained token counters.
///
/// Every counter is an independent `Option<i64>`; `Default` yields all `None`.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct LLMUsageDetails {
    pub cached_prompt_tokens: Option<i64>,
    pub cache_creation_prompt_tokens: Option<i64>,
    pub cache_miss_prompt_tokens: Option<i64>,
    pub cache_read_input_tokens: Option<i64>,
    pub cache_creation_input_tokens: Option<i64>,
    pub prompt_cache_hit_tokens: Option<i64>,
    pub prompt_cache_miss_tokens: Option<i64>,
    pub anthropic_thinking_output_tokens: Option<i64>,
}

impl LLMUsageDetails {
    /// Returns `true` only when every counter is `None`.
    ///
    /// A counter holding `Some(0)` still counts as present, so the result is
    /// `false` in that case.
    pub fn is_empty(&self) -> bool {
        // Exhaustive destructuring: adding a field without listing it here
        // becomes a compile error instead of a silently ignored counter.
        let Self {
            cached_prompt_tokens,
            cache_creation_prompt_tokens,
            cache_miss_prompt_tokens,
            cache_read_input_tokens,
            cache_creation_input_tokens,
            prompt_cache_hit_tokens,
            prompt_cache_miss_tokens,
            anthropic_thinking_output_tokens,
        } = self;

        let counters = [
            cached_prompt_tokens,
            cache_creation_prompt_tokens,
            cache_miss_prompt_tokens,
            cache_read_input_tokens,
            cache_creation_input_tokens,
            prompt_cache_hit_tokens,
            prompt_cache_miss_tokens,
            anthropic_thinking_output_tokens,
        ];
        counters.iter().all(|counter| counter.is_none())
    }
}
/// An [`LLMResponse`] bundled with optional usage, call metadata and an
/// optional JSON value in `provider_response`.
#[derive(Debug, Clone, PartialEq)]
pub struct LLMResponseEnvelope {
    /// The wrapped response.
    pub response: LLMResponse,
    /// Optional token counts.
    pub usage: Option<TokenUsage>,
    /// Optional detailed usage counters.
    pub usage_details: Option<LLMUsageDetails>,
    /// Optional call metadata.
    pub call_info: Option<LLMCallInfo>,
    /// Optional JSON value supplied via
    /// [`LLMResponseEnvelope::with_provider_response`].
    pub provider_response: Option<Value>,
}

impl LLMResponseEnvelope {
    /// Wraps `response` with every optional field set to `None`.
    pub fn from_response(response: LLMResponse) -> Self {
        LLMResponseEnvelope {
            response,
            usage: None,
            usage_details: None,
            call_info: None,
            provider_response: None,
        }
    }

    /// Returns the envelope with all three metadata fields overwritten.
    ///
    /// Passing `None` for an argument clears the corresponding field.
    pub fn with_metadata(
        self,
        usage: Option<TokenUsage>,
        usage_details: Option<LLMUsageDetails>,
        call_info: Option<LLMCallInfo>,
    ) -> Self {
        LLMResponseEnvelope {
            usage,
            usage_details,
            call_info,
            ..self
        }
    }

    /// Returns the envelope with `provider_response` set to `Some(response)`.
    pub fn with_provider_response(self, response: Value) -> Self {
        LLMResponseEnvelope {
            provider_response: Some(response),
            ..self
        }
    }
}
/// Alias for a JSON object map (`String` keys to `serde_json::Value`), used
/// as the type of the `sampling` request argument/option in this file.
pub type SamplingParams = serde_json::Map<String, serde_json::Value>;
/// Per-request options.
///
/// `Default` yields every optional field as `None` and
/// `preserve_provider_response` as `false`.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct LLMRequestOptions {
    /// Optional sampling parameters (a JSON object map).
    pub sampling: Option<SamplingParams>,
    /// Optional JSON object map of additional values.
    pub passthrough: Option<serde_json::Map<String, serde_json::Value>>,
    /// Optional shared (`Arc`) JSON value.
    pub inbound_anthropic_body: Option<Arc<Value>>,
    /// Optional shared (`Arc`) slice of JSON values.
    pub initial_openai_messages: Option<Arc<[Value]>>,
    /// Optional list of `(name, value)` string pairs.
    pub anthropic_headers: Option<Vec<(String, String)>>,
    /// Plain flag; `false` by default.
    pub preserve_provider_response: bool,
}

impl LLMRequestOptions {
    /// Builds options carrying only `sampling`; every other field takes its
    /// default (`None` / `false`).
    pub fn from_sampling(sampling: Option<SamplingParams>) -> Self {
        LLMRequestOptions {
            sampling,
            ..Self::default()
        }
    }

    /// Returns `true` when every optional field is `None` and
    /// `preserve_provider_response` is `false`.
    pub fn is_empty(&self) -> bool {
        // Exhaustive destructuring: a newly added field must be handled here
        // or the code stops compiling.
        let Self {
            sampling,
            passthrough,
            inbound_anthropic_body,
            initial_openai_messages,
            anthropic_headers,
            preserve_provider_response,
        } = self;

        let any_payload = sampling.is_some()
            || passthrough.is_some()
            || inbound_anthropic_body.is_some()
            || initial_openai_messages.is_some()
            || anthropic_headers.is_some();

        !(any_payload || *preserve_provider_response)
    }
}
/// API format reported by an [`LLMClient`] through `api_format`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ApiFormat {
    /// Labelled `"openai"`.
    OpenAI,
    /// Labelled `"ollama"`.
    Ollama,
}

impl ApiFormat {
    /// Returns the static lowercase label associated with this format.
    pub fn as_str(&self) -> &'static str {
        match *self {
            ApiFormat::OpenAI => "openai",
            ApiFormat::Ollama => "ollama",
        }
    }
}
/// Pinned, boxed, `Send` stream whose items are
/// `Result<StreamChunk, StreamError>`.
pub type ChunkStream = Pin<Box<dyn Stream<Item = Result<StreamChunk, StreamError>> + Send>>;
// NOTE(review): no `async fn` is declared in the visible trait body — every
// asynchronous method returns `impl Future<...> + Send` explicitly — so this
// `allow` looks unnecessary. Confirm before removing.
#[allow(async_fn_in_trait)]
/// Client abstraction for sending message lists (optionally with tools) and
/// receiving either a complete response or a stream of chunks.
///
/// Implementors must provide `api_format`, `send`, `send_stream` and
/// `get_context_length`; every other method has a default implementation.
/// All returned futures are required to be `Send`.
pub trait LLMClient: Send + Sync {
/// Returns the [`ApiFormat`] of this client.
fn api_format(&self) -> ApiFormat;
/// Token usage accessor; the default implementation always returns `None`.
fn last_usage(&self) -> Option<TokenUsage> {
None
}
/// Call-metadata accessor; the default implementation always returns `None`.
fn last_call_info(&self) -> Option<LLMCallInfo> {
None
}
/// Detailed-usage accessor; the default implementation always returns `None`.
fn last_usage_details(&self) -> Option<LLMUsageDetails> {
None
}
/// Sends `messages` with optional `tools` and optional `sampling`
/// parameters; the future resolves to an [`LLMResponse`] or a
/// `BackendError`. Required method.
fn send(
&self,
messages: Vec<Value>,
tools: Option<Vec<ToolSpec>>,
sampling: Option<SamplingParams>,
) -> impl std::future::Future<Output = Result<LLMResponse, BackendError>> + Send;
/// Like `send`, but takes a full [`LLMRequestOptions`].
///
/// The default implementation forwards to `send` using only
/// `options.sampling`; every other option field is ignored by it.
fn send_with_options(
&self,
messages: Vec<Value>,
tools: Option<Vec<ToolSpec>>,
options: LLMRequestOptions,
) -> impl std::future::Future<Output = Result<LLMResponse, BackendError>> + Send {
async move { self.send(messages, tools, options.sampling).await }
}
/// Sends via `send_with_options` and wraps the result in an
/// [`LLMResponseEnvelope`].
///
/// The default implementation fills the envelope's metadata from
/// `last_usage`, `last_usage_details` and `last_call_info`, read after the
/// send has completed, and leaves `provider_response` as `None`. Errors
/// from `send_with_options` are propagated unchanged.
///
/// NOTE(review): because the metadata is read through the `last_*`
/// accessors rather than returned by the send itself, an implementation
/// that shares that state between concurrent calls could report another
/// call's values here — confirm against implementors.
fn send_envelope_with_options(
&self,
messages: Vec<Value>,
tools: Option<Vec<ToolSpec>>,
options: LLMRequestOptions,
) -> impl std::future::Future<Output = Result<LLMResponseEnvelope, BackendError>> + Send {
async move {
let response = self.send_with_options(messages, tools, options).await?;
Ok(LLMResponseEnvelope::from_response(response).with_metadata(
self.last_usage(),
self.last_usage_details(),
self.last_call_info(),
))
}
}
/// Streaming counterpart of `send`; the future resolves to a
/// [`ChunkStream`] or a `StreamError`. Required method.
fn send_stream(
&self,
messages: Vec<Value>,
tools: Option<Vec<ToolSpec>>,
sampling: Option<SamplingParams>,
) -> impl std::future::Future<Output = Result<ChunkStream, StreamError>> + Send;
/// Like `send_stream`, but takes a full [`LLMRequestOptions`].
///
/// The default implementation forwards to `send_stream` using only
/// `options.sampling`; every other option field is ignored by it.
fn send_stream_with_options(
&self,
messages: Vec<Value>,
tools: Option<Vec<ToolSpec>>,
options: LLMRequestOptions,
) -> impl std::future::Future<Output = Result<ChunkStream, StreamError>> + Send {
async move { self.send_stream(messages, tools, options.sampling).await }
}
/// Resolves to an optional context length (`Option<i64>`) or a
/// `ContextDiscoveryError`. Required method.
/// NOTE(review): `None` presumably means the length is unknown — confirm.
fn get_context_length(
&self,
) -> impl std::future::Future<Output = Result<Option<i64>, ContextDiscoveryError>> + Send;
}
/// Renders a [`ToolSpec`] as a JSON object of the form
/// `{"type": "function", "function": {"name", "description", "parameters"}}`.
///
/// `name` and `description` are cloned from `spec`; `parameters` is whatever
/// `spec.get_json_schema()` returns. Keys are inserted in the order shown.
/// NOTE(review): this looks like the OpenAI-style function-tool shape —
/// confirm against the consuming backend.
pub fn format_tool(spec: &ToolSpec) -> Value {
    // Inner "function" object.
    let mut function = serde_json::Map::new();
    function.extend([
        (String::from("name"), Value::String(spec.name.clone())),
        (
            String::from("description"),
            Value::String(spec.description.clone()),
        ),
        (String::from("parameters"), spec.get_json_schema()),
    ]);

    // Outer wrapper tagging the entry as a function tool.
    let mut wrapper = serde_json::Map::new();
    wrapper.extend([
        (String::from("type"), Value::String(String::from("function"))),
        (String::from("function"), Value::Object(function)),
    ]);

    Value::Object(wrapper)
}