aha 0.2.5

The aha model inference library; it currently supports Qwen (2.5VL/3/3VL/3.5/ASR/3Embedding/3Reranker), MiniCPM4, VoxCPM/1.5, DeepSeek-OCR/2, Hunyuan-OCR, PaddleOCR-VL/1.5, RMBG2.0, GLM (ASR-Nano-2512/OCR), Fun-ASR-Nano-2512, and LFM (2/2.5/2VL/2.5VL).
Documentation
use serde::{Deserialize, Serialize};

/// Token and timing statistics for a single inference request.
/// Optional fields are omitted from the serialized output when `None`.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct Usage {
    /// Number of tokens in the prompt.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt_tokens: Option<u32>,
    /// Time spent processing the prompt, in seconds.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt_secs: Option<f64>,
    /// Number of tokens in the completion.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub completion_tokens: Option<u32>,
    /// Time spent generating the completion, in seconds.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub completion_secs: Option<f64>,
    /// Average generation time per completion token, in seconds.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub completion_per_token_secs: Option<f64>,
    /// Completion throughput, in tokens per second.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub completion_tps: Option<f64>,
    /// Number of tokens in the entire response.
    pub total_tokens: u32,
    /// Breakdown of tokens used in the prompt.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt_tokens_details: Option<PromptTokensDetails>,
    /// Breakdown of tokens used in a completion.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub completion_tokens_details: Option<CompletionTokensDetails>,
}

/// Cache-related breakdown of input tokens.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct InputTokensDetails {
    /// The number of tokens that were retrieved from the cache.
    pub cached_tokens: u32,
}

/// Reasoning-related breakdown of output tokens.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct OutputTokensDetails {
    /// The number of reasoning tokens.
    pub reasoning_tokens: u32,
}

/// Breakdown of the tokens that made up the prompt.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct PromptTokensDetails {
    /// Audio input tokens present in the prompt.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_tokens: Option<u32>,
    /// Cached tokens present in the prompt.
    pub cached_tokens: u32,
}

/// Breakdown of the tokens that made up the completion.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct CompletionTokensDetails {
    /// Tokens generated by the model for reasoning.
    pub reasoning_tokens: u32,
    /// Audio tokens generated by the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_tokens: Option<u32>,
    /// When using Predicted Outputs, the number of tokens in the prediction that appeared in the completion.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub accepted_prediction_tokens: Option<u32>,
    /// When using Predicted Outputs, the number of tokens in the prediction that did not appear in the completion.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub rejected_prediction_tokens: Option<u32>,
}

/// Pairs a deserialized response body with selected response headers.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct ResponseWrapper<T> {
    /// The deserialized response payload.
    pub data: T,
    /// Rate-limit headers captured from the HTTP response.
    pub headers: Headers,
}

/// Rate-limit information parsed from `x-ratelimit-*` HTTP response headers.
/// All fields are optional because a server may omit any of these headers.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct Headers {
    /// The maximum number of requests that are permitted before exhausting the rate limit.
    #[serde(rename = "x-ratelimit-limit-requests")]
    pub x_ratelimit_limit_requests: Option<u32>,
    /// The maximum number of tokens that are permitted before exhausting the rate limit.
    #[serde(rename = "x-ratelimit-limit-tokens")]
    pub x_ratelimit_limit_tokens: Option<u32>,
    /// The remaining number of requests that are permitted before exhausting the rate limit.
    #[serde(rename = "x-ratelimit-remaining-requests")]
    pub x_ratelimit_remaining_requests: Option<u32>,
    /// The remaining number of tokens that are permitted before exhausting the rate limit.
    #[serde(rename = "x-ratelimit-remaining-tokens")]
    pub x_ratelimit_remaining_tokens: Option<u32>,
    /// The time until the rate limit (based on requests) resets to its initial state.
    #[serde(rename = "x-ratelimit-reset-requests")]
    pub x_ratelimit_reset_requests: Option<String>,
    /// The time until the rate limit (based on tokens) resets to its initial state.
    #[serde(rename = "x-ratelimit-reset-tokens")]
    pub x_ratelimit_reset_tokens: Option<String>,
}

#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct SimpleListParameters {
    /// Identifier for the last object from the previous pagination request.
    pub after: Option<String>,
    /// Number of objects to retrieve.
    pub limit: Option<u32>,
}

/// Full cursor-based pagination parameters (limit, order, and both cursors).
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct ListParameters {
    /// A limit on the number of objects to be returned. Limit can range between 1 and 100, and the default is 20.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub limit: Option<u32>,
    /// Sort order by the created_at timestamp of the objects. asc for ascending order and desc for descending order.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub order: Option<String>,
    /// A cursor for use in pagination. after is an object ID that defines your place in the list.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub after: Option<String>,
    /// A cursor for use in pagination. before is an object ID that defines your place in the list.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub before: Option<String>,
}

/// A paginated list of objects returned by a list endpoint.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct ListResponse<T> {
    /// The object type, which is always "list".
    pub object: String,
    /// The list of objects.
    pub data: Vec<T>,
    /// The ID of the first object in the list.
    pub first_id: Option<String>,
    /// The ID of the last object in the list.
    pub last_id: Option<String>,
    /// Indicates whether there are more objects to retrieve.
    pub has_more: bool,
}

/// Confirmation payload returned after a delete operation.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct DeletedObject {
    /// ID of the deleted object.
    pub id: String,
    /// The object type.
    pub object: String,
    /// Indicates whether the object was successfully deleted.
    pub deleted: bool,
}

/// The most recent error associated with an object.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct LastError {
    /// One of 'server_error' or 'rate_limit_exceeded'.
    pub code: LastErrorCode,
    /// A human-readable description of the error.
    pub message: String,
}

/// Error category for [`LastError`]; serialized in snake_case
/// ("server_error" / "rate_limit_exceeded").
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum LastErrorCode {
    ServerError,
    RateLimitExceeded,
}

/// Why the model stopped generating, normalized across providers.
/// Canonical (serialized) names follow the OpenAI lowercase convention;
/// uppercase `alias`es accept Gemini-style values on deserialization.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum FinishReason {
    /// API returned complete message, or a message terminated by one of the stop sequences provided via the stop parameter.
    #[serde(rename = "stop", alias = "STOP")]
    StopSequenceReached,
    /// Incomplete model output due to max_tokens parameter or token limit.
    #[serde(rename = "length", alias = "MAX_TOKENS")]
    TokenLimitReached,
    /// Omitted content due to a flag from our content filters.
    #[serde(
        rename = "content_filter",
        alias = "SAFETY",
        alias = "SPII",
        alias = "PROHIBITED_CONTENT",
        alias = "BLOCKLIST",
        alias = "RECITATION"
    )]
    ContentFilterFlagged,
    /// The model decided to call one or more tools.
    ToolCalls,
    /// The model reached a natural stopping point. [Claude]
    EndTurn,
    /// The finish reason is unspecified. [Gemini]
    // Fix: the rename string previously ended with a stray embedded TAB
    // ("FINISH_REASON_UNSPECIFIED\t"), so the actual upstream value
    // "FINISH_REASON_UNSPECIFIED" never matched on deserialization and
    // serialization emitted a trailing tab character.
    #[serde(rename = "FINISH_REASON_UNSPECIFIED")]
    Unspecified,
    /// The model emitted a function call that could not be parsed. [Gemini]
    #[serde(rename = "MALFORMED_FUNCTION_CALL")]
    MalformedFunctionCall,
    /// Any other, unlisted finish reason. [Gemini]
    #[serde(rename = "OTHER")]
    Other,
}

/// Requested reasoning effort level; serialized in snake_case
/// ("high" / "medium" / "low" / "minimal").
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum ReasoningEffort {
    High,
    Medium,
    Low,
    Minimal,
}

/// Amount of context retrieved for a web search.
// NOTE(review): `Large` serializes as "large" under `rename_all = "snake_case"`,
// but OpenAI's `search_context_size` parameter documents "low" | "medium" |
// "high" — confirm which value the target API actually expects before
// relying on this variant.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum WebSearchContextSize {
    Low,
    Medium,
    Large,
}

/// Stop sequence(s) for generation: either a single string or a list of
/// strings. `untagged` lets either JSON shape ("..." or ["...", ...])
/// deserialize without a wrapper key.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
#[serde(untagged)]
pub enum StopToken {
    String(String),
    Array(Vec<String>),
}