// yomo 0.6.5
//
// A QUIC-based runtime for AI-LLM tool routing and serverless execution.
// Documentation
use async_stream::try_stream;
use async_trait::async_trait;
use futures_core::Stream;
use futures_util::StreamExt;
use serde_json::Value;
use std::collections::HashMap;
use std::pin::Pin;

use crate::llm_provider::openai_compatible::{client, mapper};
use crate::llm_provider::{Provider, ProviderError, UnifiedEvent, UnifiedResponse};
use crate::openai_http_mapping::validate_openai_request;
use crate::openai_types::{ChatCompletionRequest, ClientError, Content, ContentPart, Role};
use crate::serve_config::ConfigError;

/// Provider that sends chat completion requests through the OpenAI-compatible
/// [`client::Client`].
#[derive(Clone)]
pub struct VllmDeepseekProvider {
    /// Client used for both the non-streaming and the streaming calls.
    client: client::Client,
    /// Optional model name; when set it overwrites `request.model` before the
    /// request is normalized and sent.
    model_id: Option<String>,
}

impl VllmDeepseekProvider {
    pub fn new(client: client::Client, model_id: Option<String>) -> Self {
        Self { client, model_id }
    }
}

#[async_trait]
impl Provider for VllmDeepseekProvider {
    fn model_id(&self) -> &str {
        "deepseek-v4-flash"
    }

    async fn complete(
        &self,
        mut request: ChatCompletionRequest,
    ) -> Result<UnifiedResponse, ProviderError> {
        if let Some(model_id) = &self.model_id {
            request.model = model_id.clone();
        }
        let request = normalize_request(request)?;
        validate_request(&request)?;
        let response = self
            .client
            .chat_completions(request)
            .await
            .map_err(map_openai_error)?;
        mapper::map_response(response)
    }

    async fn stream<'a>(
        &'a self,
        mut request: ChatCompletionRequest,
    ) -> Result<
        Pin<Box<dyn Stream<Item = Result<UnifiedEvent, ProviderError>> + Send + 'a>>,
        ProviderError,
    > {
        if let Some(model_id) = &self.model_id {
            request.model = model_id.clone();
        }
        let request = normalize_request(request)?;
        validate_request(&request)?;
        let stream = self
            .client
            .chat_completions_stream(request)
            .await
            .map_err(map_openai_error)?;

        let output = try_stream! {
            futures_util::pin_mut!(stream);
            let mut state = mapper::StreamMapState::default();

            while let Some(item) = stream.next().await {
                let chunk = item.map_err(map_openai_error)?;
                for event in mapper::map_stream_chunk(chunk, &mut state) {
                    yield event;
                }
            }
        };

        Ok(Box::pin(output))
    }
}

/// Builds a [`VllmDeepseekProvider`] from string configuration parameters.
///
/// Recognized keys:
/// - `api_key` (required): passed to the client configuration.
/// - `base_url` (optional): overrides the client's base URL.
/// - `model` (optional): becomes the provider's model override.
///
/// # Errors
///
/// Returns [`ConfigError::InvalidProvider`] when `api_key` is missing or when
/// the client cannot be constructed from the resulting configuration.
pub fn build_vllm_deepseek_provider(
    params: &std::collections::HashMap<String, String>,
) -> Result<VllmDeepseekProvider, ConfigError> {
    let Some(api_key) = params.get("api_key") else {
        return Err(ConfigError::InvalidProvider(
            "api_key is required".to_string(),
        ));
    };

    let base_config = client::Config::new(api_key.to_string());
    let config = match params.get("base_url") {
        Some(base_url) => base_config.base_url(base_url.to_string()),
        None => base_config,
    };

    let http_client = match client::Client::new(config) {
        Ok(built) => built,
        Err(err) => return Err(ConfigError::InvalidProvider(err.to_string())),
    };

    let model_override = params.get("model").cloned();
    Ok(VllmDeepseekProvider::new(http_client, model_override))
}

/// Runs the shared OpenAI request validation and wraps any failure in
/// [`ProviderError::Internal`].
fn validate_request(request: &ChatCompletionRequest) -> Result<(), ProviderError> {
    match validate_openai_request(request) {
        Ok(()) => Ok(()),
        Err(reason) => Err(ProviderError::Internal(reason)),
    }
}

/// Applies the compatibility rewrites to a request before it is sent.
///
/// Image parts are rejected first; only when that check passes are assistant
/// text parts flattened and `reasoning_effort` translated.
///
/// # Errors
///
/// Returns the error from [`ensure_no_image_parts`] when the request contains
/// an image part.
fn normalize_request(
    mut request: ChatCompletionRequest,
) -> Result<ChatCompletionRequest, ProviderError> {
    ensure_no_image_parts(&request).map(move |()| {
        flatten_assistant_text_content(&mut request);
        normalize_reasoning_effort(&mut request);
        request
    })
}

/// Rejects requests in which any message carries an image content part.
///
/// # Errors
///
/// Returns [`ProviderError::Internal`] as soon as one `ContentPart::Image` is
/// found in a multi-part message.
fn ensure_no_image_parts(request: &ChatCompletionRequest) -> Result<(), ProviderError> {
    let contains_image = request
        .messages
        .iter()
        .any(|message| match &message.content {
            Content::Parts(parts) => parts
                .iter()
                .any(|part| matches!(part, ContentPart::Image { .. })),
            _ => false,
        });

    if contains_image {
        Err(ProviderError::Internal(
            "deepseek-v4-flash does not support image_url messages".to_string(),
        ))
    } else {
        Ok(())
    }
}

/// Collapses multi-part assistant content into a single text string.
///
/// Only assistant messages whose content is `Content::Parts` made up entirely
/// of text parts are rewritten; the texts are concatenated in order with no
/// separator. A message containing any non-text part is left untouched, and an
/// empty part list becomes an empty text string.
fn flatten_assistant_text_content(request: &mut ChatCompletionRequest) {
    let assistant_messages = request
        .messages
        .iter_mut()
        .filter(|message| message.role == Role::Assistant);

    for message in assistant_messages {
        let Content::Parts(parts) = &message.content else {
            continue;
        };

        // Collecting into `Option<String>` yields `None` at the first non-text
        // part and the concatenation of all texts otherwise.
        let joined: Option<String> = parts
            .iter()
            .map(|part| match part {
                ContentPart::Text { text } => Some(text.as_str()),
                _ => None,
            })
            .collect();

        if let Some(text) = joined {
            message.content = Content::Text(text);
        }
    }
}

/// Translates the top-level `reasoning_effort` field into `chat_template_kwargs`.
///
/// The `reasoning_effort` field is always cleared from the request. Then:
///
/// - If `chat_template_kwargs` already contains a `"thinking"` key, the caller's
///   kwargs are left exactly as they are.
/// - Otherwise, when the effort is one of `"low"`, `"medium"`, `"high"` or
///   `"max"`, `"thinking": true` and `"reasoning_effort": <effort>` are inserted
///   into `chat_template_kwargs`. Any other kwargs the caller supplied are kept;
///   the map is created only when it was absent.
/// - Any other effort value is dropped without touching the kwargs.
fn normalize_reasoning_effort(request: &mut ChatCompletionRequest) {
    let Some(effort) = request.reasoning_effort.take() else {
        return;
    };

    // An explicit caller-provided `thinking` setting takes precedence.
    let has_thinking = request
        .chat_template_kwargs
        .as_ref()
        .map_or(false, |kwargs| kwargs.contains_key("thinking"));
    if has_thinking {
        return;
    }

    if matches!(effort.as_str(), "low" | "medium" | "high" | "max") {
        // Merge into the existing map instead of replacing it, so unrelated
        // caller-supplied template kwargs are not silently discarded.
        let kwargs = request
            .chat_template_kwargs
            .get_or_insert_with(HashMap::new);
        kwargs.insert("thinking".to_string(), Value::Bool(true));
        kwargs.insert("reasoning_effort".to_string(), Value::String(effort));
    }
}

/// Converts a client error into [`ProviderError::Internal`], keeping only its
/// display text.
fn map_openai_error(err: ClientError) -> ProviderError {
    let message = err.to_string();
    ProviderError::Internal(message)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::openai_types::{
        Content, ImageUrl, Message, ThinkingConfig, ThinkingType, ToolCall, ToolCallFunction,
    };
    use serde_json::json;

    /// Request for `deepseek-v4-flash` carrying `messages`, with every optional
    /// field left unset.
    fn request_with_messages(messages: Vec<Message>) -> ChatCompletionRequest {
        ChatCompletionRequest {
            model: "deepseek-v4-flash".to_string(),
            messages,
            n: None,
            temperature: None,
            top_p: None,
            presence_penalty: None,
            frequency_penalty: None,
            logprobs: None,
            top_logprobs: None,
            modalities: None,
            audio: None,
            max_completion_tokens: None,
            stop: None,
            response_format: None,
            thinking: None,
            reasoning_effort: None,
            chat_template_kwargs: None,
            prediction: None,
            verbosity: None,
            tools: None,
            tool_choice: None,
            allowed_tools: None,
            parallel_tool_calls: None,
            service_tier: None,
            seed: None,
            stream: None,
            stream_options: None,
            metadata: None,
            agent_context: None,
        }
    }

    /// Message with the given role and content and no reasoning or tool data.
    fn plain_message(role: Role, content: Content) -> Message {
        Message {
            role,
            content,
            reasoning_content: None,
            tool_call_id: None,
            tool_calls: None,
        }
    }

    /// User message whose content is a single text string.
    fn user_text_message(text: &str) -> Message {
        plain_message(Role::User, Content::Text(text.to_string()))
    }

    /// Text content part holding `text`.
    fn text_part(text: &str) -> ContentPart {
        ContentPart::Text {
            text: text.to_string(),
        }
    }

    #[test]
    fn normalize_reasoning_effort_sets_chat_template_kwargs() {
        let mut req = request_with_messages(vec![user_text_message("hello")]);
        req.reasoning_effort = Some("high".to_string());

        normalize_reasoning_effort(&mut req);

        let kwargs = req.chat_template_kwargs.expect("chat_template_kwargs");
        assert_eq!(kwargs.get("thinking"), Some(&json!(true)));
        assert_eq!(kwargs.get("reasoning_effort"), Some(&json!("high")));
    }

    #[test]
    fn normalize_reasoning_effort_keeps_existing_thinking_kwargs() {
        let mut req = request_with_messages(vec![user_text_message("hello")]);
        req.reasoning_effort = Some("medium".to_string());

        // The caller has already decided on `thinking`; it must win.
        let mut preset = HashMap::new();
        preset.insert("thinking".to_string(), Value::Bool(false));
        req.chat_template_kwargs = Some(preset);

        normalize_reasoning_effort(&mut req);

        let kwargs = req.chat_template_kwargs.expect("chat_template_kwargs");
        assert_eq!(kwargs.get("thinking"), Some(&json!(false)));
        assert!(kwargs.get("reasoning_effort").is_none());
    }

    #[test]
    fn flatten_assistant_text_content_merges_text_parts() {
        let noop_call = ToolCall {
            id: Some("call_1".to_string()),
            r#type: Some("function".to_string()),
            function: ToolCallFunction {
                name: "noop".to_string(),
                arguments: "{}".to_string(),
                description: None,
            },
        };
        let assistant = Message {
            tool_calls: Some(vec![noop_call]),
            ..plain_message(
                Role::Assistant,
                Content::Parts(vec![text_part("hello "), text_part("world")]),
            )
        };
        let mut req = request_with_messages(vec![assistant]);

        flatten_assistant_text_content(&mut req);

        let Content::Text(text) = &req.messages[0].content else {
            panic!("assistant content should be flattened to text");
        };
        assert_eq!(text, "hello world");
    }

    #[test]
    fn ensure_no_image_parts_returns_error() {
        let image = ContentPart::Image {
            image_url: ImageUrl {
                url: "https://example.com/a.png".to_string(),
                detail: None,
            },
        };
        let req = request_with_messages(vec![plain_message(
            Role::User,
            Content::Parts(vec![image]),
        )]);

        let err = ensure_no_image_parts(&req).expect_err("should reject image parts");
        let rendered = err.to_string();
        assert!(rendered.contains("deepseek-v4-flash does not support image_url messages"));
    }

    #[test]
    fn normalize_request_combines_compatibility_steps() {
        let assistant = plain_message(Role::Assistant, Content::Parts(vec![text_part("ok")]));
        let mut req = request_with_messages(vec![assistant]);
        req.reasoning_effort = Some("max".to_string());
        req.thinking = Some(ThinkingConfig {
            kind: ThinkingType::Disabled,
        });

        let mut preset = HashMap::new();
        preset.insert(
            "temperature_hint".to_string(),
            json!("ignored by normalize_reasoning_effort overwrite"),
        );
        req.chat_template_kwargs = Some(preset);

        let normalized = normalize_request(req).expect("normalize request");

        assert!(matches!(&normalized.messages[0].content, Content::Text(text) if text == "ok"));
        let kwargs = normalized
            .chat_template_kwargs
            .expect("chat_template_kwargs set by reasoning effort");
        assert_eq!(kwargs.get("thinking"), Some(&json!(true)));
        assert_eq!(kwargs.get("reasoning_effort"), Some(&json!("max")));
    }
}