// dynamo_llm/protocols/common/llm_backend.rs
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use serde::{Deserialize, Serialize};

use crate::protocols::TokenIdType;

pub type TokenType = Option<String>;
pub type LogProbs = Vec<f64>;

pub use super::preprocessor::PreprocessedRequest;
pub use super::FinishReason;

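/// Output returned by the Backend after post-processing the raw
/// [`LLMEngineOutput`] from the engine.
///
/// Illustrative example of a single output chunk (hypothetical values; marked
/// `ignore` so it is not compiled as a doctest):
///
/// ```ignore
/// let chunk = BackendOutput {
///     token_ids: vec![42, 7],
///     tokens: vec![Some("Hel".to_string()), Some("lo".to_string())],
///     text: Some("Hello".to_string()),
///     cum_log_probs: Some(-1.2),
///     log_probs: Some(vec![-0.7, -0.5]),
///     finish_reason: None,
/// };
/// ```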
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct BackendOutput {
    /// New token_ids generated by the LLM Engine
    pub token_ids: Vec<TokenIdType>,

    /// Unlike [`LLMEngineOutput::tokens`], this is a vector of tokens, not an optional.
    /// The size of this vector should be the same as the size of `token_ids`.
    pub tokens: Vec<TokenType>,

    /// Decoded text from the list of tokens.
    pub text: Option<String>,

    /// Optional cumulative log probability
    pub cum_log_probs: Option<f64>,

    /// Optional log probabilities
    pub log_probs: Option<LogProbs>,

    // TODO: Enrich this with more information as we apply our first-level postprocessing
    // logic and return more detailed information
    pub finish_reason: Option<FinishReason>,
    // Model Deployment Card checksum
    //pub mdcsum: String,
}

/// The LLM engine and backend will manage their own state, specifically translating how a
/// given request/slot is managed on that particular backend.
///
/// For nvLLM's purposes, there is a single traceable request_id as part of its context that
/// has propagated through the service pipeline to the backend.
///
/// This is the minimal raw output from the LLM engine. The Backend may then apply multiple
/// levels of post-processing before the BackendOutput is returned.
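///
/// Illustrative example (hypothetical values; marked `ignore` so it is not
/// compiled as a doctest): an engine that does not detokenize leaves `tokens`
/// and `text` as `None`, and the Backend fills them in later.
///
/// ```ignore
/// let raw = LLMEngineOutput {
///     token_ids: vec![42, 7],
///     tokens: None, // Backend is responsible for detokenization
///     text: None,
///     cum_log_probs: Some(-1.2),
///     log_probs: Some(vec![-0.7, -0.5]),
///     finish_reason: None,
/// };
/// ```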
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
pub struct LLMEngineOutput {
    /// New token_ids generated by the LLM Engine
    pub token_ids: Vec<TokenIdType>,

    /// If the LLM Engine performs the detokenization, then this will have a Some of the detokenized
    /// text/tokens. If this value is None, then the Backend is responsible for detokenization.
    pub tokens: Option<Vec<TokenType>>,

    /// Decoded text, if the engine provides it
    pub text: Option<String>,

    /// Optional cumulative log probability
    pub cum_log_probs: Option<f64>,

    /// Optional log probabilities
    pub log_probs: Option<LogProbs>,

    // TODO: Enrich this with more information as we apply our first-level postprocessing
    // logic and return more detailed information
    pub finish_reason: Option<FinishReason>,
}

impl LLMEngineOutput {
    /// Terminal output signalling that the request was cancelled.
    pub fn cancelled() -> Self {
        LLMEngineOutput {
            token_ids: vec![],
            tokens: None,
            text: None,
            cum_log_probs: None,
            log_probs: None,
            finish_reason: Some(FinishReason::Cancelled),
        }
    }

    /// Terminal output signalling that the engine hit a stop condition.
    pub fn stop() -> Self {
        LLMEngineOutput {
            token_ids: vec![],
            tokens: None,
            text: None,
            cum_log_probs: None,
            log_probs: None,
            finish_reason: Some(FinishReason::Stop),
        }
    }

    /// Terminal output signalling that the generation length limit was reached.
    pub fn length() -> Self {
        LLMEngineOutput {
            token_ids: vec![],
            tokens: None,
            text: None,
            cum_log_probs: None,
            log_probs: None,
            finish_reason: Some(FinishReason::Length),
        }
    }

    /// Terminal output signalling that the engine produced an error.
    pub fn error(err_msg: String) -> Self {
        LLMEngineOutput {
            token_ids: vec![],
            tokens: None,
            text: None,
            cum_log_probs: None,
            log_probs: None,
            finish_reason: Some(FinishReason::Error(err_msg)),
        }
    }
}
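
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal sanity tests sketching the expected behavior of the convenience
    // constructors above. The error message used here is illustrative only.
    // Assumes `FinishReason` implements `PartialEq` and `Debug`, which is
    // already implied by the derives on `LLMEngineOutput`.
    #[test]
    fn terminal_constructors_set_finish_reason() {
        assert_eq!(
            LLMEngineOutput::stop().finish_reason,
            Some(FinishReason::Stop)
        );
        assert_eq!(
            LLMEngineOutput::cancelled().finish_reason,
            Some(FinishReason::Cancelled)
        );
        assert_eq!(
            LLMEngineOutput::length().finish_reason,
            Some(FinishReason::Length)
        );
        assert_eq!(
            LLMEngineOutput::error("engine fault".to_string()).finish_reason,
            Some(FinishReason::Error("engine fault".to_string()))
        );
    }

    // Terminal outputs carry no tokens, text, or log probabilities; the
    // finish_reason alone signals the end of the stream.
    #[test]
    fn terminal_constructors_carry_no_payload() {
        let out = LLMEngineOutput::stop();
        assert!(out.token_ids.is_empty());
        assert!(out.tokens.is_none());
        assert!(out.text.is_none());
        assert!(out.cum_log_probs.is_none());
        assert!(out.log_probs.is_none());
    }
}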