1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
use crate::protocols;
pub use protocols::{Annotated, TokenIdType};
pub mod openai {
use super::*;
use dynamo_runtime::pipeline::{ServerStreamingEngine, UnaryEngine};
/// Engine type aliases and protocol re-exports for the OpenAI Completions API.
pub mod completions {
    use super::*;

    // Re-export the protocol types so callers can name them via this module.
    pub use protocols::openai::completions::{
        NvCreateCompletionRequest, NvCreateCompletionResponse,
    };

    /// [`UnaryEngine`] alias for the OpenAI Completions API.
    ///
    /// Instantiated with [`NvCreateCompletionRequest`] and
    /// [`NvCreateCompletionResponse`].
    pub type OpenAICompletionsUnaryEngine = UnaryEngine<
        NvCreateCompletionRequest,
        NvCreateCompletionResponse,
    >;

    /// [`ServerStreamingEngine`] alias for the OpenAI Completions API.
    ///
    /// Instantiated with [`NvCreateCompletionRequest`] and an [`Annotated`]
    /// wrapper around [`NvCreateCompletionResponse`].
    pub type OpenAICompletionsStreamingEngine = ServerStreamingEngine<
        NvCreateCompletionRequest,
        Annotated<NvCreateCompletionResponse>,
    >;
}
/// Engine type aliases and protocol re-exports for the OpenAI Chat Completions API.
pub mod chat_completions {
    use super::*;

    // Re-export the protocol types so callers can name them via this module.
    pub use protocols::openai::chat_completions::{
        NvCreateChatCompletionRequest, NvCreateChatCompletionResponse,
        NvCreateChatCompletionStreamResponse,
    };

    /// [`UnaryEngine`] alias for the OpenAI Chat Completions API.
    ///
    /// Instantiated with [`NvCreateChatCompletionRequest`] and
    /// [`NvCreateChatCompletionResponse`].
    pub type OpenAIChatCompletionsUnaryEngine = UnaryEngine<
        NvCreateChatCompletionRequest,
        NvCreateChatCompletionResponse,
    >;

    /// [`ServerStreamingEngine`] alias for the OpenAI Chat Completions API.
    ///
    /// Instantiated with [`NvCreateChatCompletionRequest`] and an [`Annotated`]
    /// wrapper around [`NvCreateChatCompletionStreamResponse`] (note: the
    /// stream-specific response type, not the unary one).
    pub type OpenAIChatCompletionsStreamingEngine = ServerStreamingEngine<
        NvCreateChatCompletionRequest,
        Annotated<NvCreateChatCompletionStreamResponse>,
    >;
}
/// Engine type aliases and protocol re-exports for the OpenAI Embeddings API.
pub mod embeddings {
    use super::*;

    // Re-export the protocol types so callers can name them via this module.
    pub use protocols::openai::embeddings::{
        NvCreateEmbeddingRequest, NvCreateEmbeddingResponse,
    };

    /// [`UnaryEngine`] alias for the OpenAI Embeddings API.
    ///
    /// Instantiated with [`NvCreateEmbeddingRequest`] and
    /// [`NvCreateEmbeddingResponse`].
    pub type OpenAIEmbeddingsUnaryEngine = UnaryEngine<
        NvCreateEmbeddingRequest,
        NvCreateEmbeddingResponse,
    >;

    /// [`ServerStreamingEngine`] alias for the OpenAI Embeddings API.
    ///
    /// Instantiated with [`NvCreateEmbeddingRequest`] and an [`Annotated`]
    /// wrapper around [`NvCreateEmbeddingResponse`].
    pub type OpenAIEmbeddingsStreamingEngine = ServerStreamingEngine<
        NvCreateEmbeddingRequest,
        Annotated<NvCreateEmbeddingResponse>,
    >;
}
/// Engine type aliases and protocol re-exports for the OpenAI Images API.
pub mod images {
    use super::*;

    // Re-export the protocol types so callers can name them via this module.
    pub use protocols::openai::images::{NvCreateImageRequest, NvImagesResponse};

    /// [`UnaryEngine`] alias for the OpenAI Images API.
    ///
    /// Instantiated with [`NvCreateImageRequest`] and [`NvImagesResponse`].
    pub type OpenAIImagesUnaryEngine = UnaryEngine<
        NvCreateImageRequest,
        NvImagesResponse,
    >;

    /// [`ServerStreamingEngine`] alias for the OpenAI Images API.
    ///
    /// **Note**: "streaming" here describes the internal routing/distribution
    /// architecture, NOT client-facing Server-Sent Events (SSE) streaming.
    /// Image generation does not stream progressively to clients: images are
    /// generated completely and returned as finished artifacts (URLs or
    /// base64).
    ///
    /// The HTTP endpoint folds this stream into a single response before
    /// returning to clients, similar to how embeddings work. The streaming
    /// infrastructure is used for:
    /// - Consistent routing architecture across all model types
    /// - Request distribution via PushRouter
    /// - Worker fault detection and load balancing
    pub type OpenAIImagesStreamingEngine = ServerStreamingEngine<
        NvCreateImageRequest,
        Annotated<NvImagesResponse>,
    >;
}
/// Engine type aliases and protocol re-exports for the OpenAI Videos API.
pub mod videos {
    use super::*;

    // Re-export the protocol types so callers can name them via this module.
    pub use protocols::openai::videos::{NvCreateVideoRequest, NvVideosResponse};

    /// [`UnaryEngine`] alias for the OpenAI Videos API.
    ///
    /// Instantiated with [`NvCreateVideoRequest`] and [`NvVideosResponse`].
    pub type OpenAIVideosUnaryEngine = UnaryEngine<
        NvCreateVideoRequest,
        NvVideosResponse,
    >;

    /// [`ServerStreamingEngine`] alias for the OpenAI Videos API.
    ///
    /// Instantiated with [`NvCreateVideoRequest`] and an [`Annotated`]
    /// wrapper around [`NvVideosResponse`].
    pub type OpenAIVideosStreamingEngine = ServerStreamingEngine<
        NvCreateVideoRequest,
        Annotated<NvVideosResponse>,
    >;
}
/// Engine type aliases and protocol re-exports for the Audio Speech API.
pub mod audios {
    use super::*;

    // Re-export the protocol types so callers can name them via this module.
    pub use protocols::openai::audios::{NvAudioSpeechResponse, NvCreateAudioSpeechRequest};

    /// [`UnaryEngine`] alias for the Audio Speech API.
    ///
    /// Instantiated with [`NvCreateAudioSpeechRequest`] and
    /// [`NvAudioSpeechResponse`].
    pub type OpenAIAudiosUnaryEngine = UnaryEngine<
        NvCreateAudioSpeechRequest,
        NvAudioSpeechResponse,
    >;

    /// [`ServerStreamingEngine`] alias for the Audio Speech API.
    ///
    /// Instantiated with [`NvCreateAudioSpeechRequest`] and an [`Annotated`]
    /// wrapper around [`NvAudioSpeechResponse`].
    pub type OpenAIAudiosStreamingEngine = ServerStreamingEngine<
        NvCreateAudioSpeechRequest,
        Annotated<NvAudioSpeechResponse>,
    >;
}
}
pub mod generic {
use super::*;
use dynamo_runtime::pipeline::{
BidirectionalStreamingEngine, ServerStreamingEngine, UnaryEngine,
};
/// Engine type aliases and protocol re-exports for the generic Tensor API.
pub mod tensor {
    use super::*;

    // Re-export the protocol types so callers can name them via this module.
    pub use protocols::tensor::{NvCreateTensorRequest, NvCreateTensorResponse};

    /// [`UnaryEngine`] alias for the generic Tensor API.
    ///
    /// Instantiated with [`NvCreateTensorRequest`] and
    /// [`NvCreateTensorResponse`].
    pub type TensorUnaryEngine = UnaryEngine<
        NvCreateTensorRequest,
        NvCreateTensorResponse,
    >;

    /// [`ServerStreamingEngine`] alias for the generic Tensor API.
    ///
    /// Instantiated with [`NvCreateTensorRequest`] and an [`Annotated`]
    /// wrapper around [`NvCreateTensorResponse`].
    pub type TensorStreamingEngine = ServerStreamingEngine<
        NvCreateTensorRequest,
        Annotated<NvCreateTensorResponse>,
    >;
}
/// Engine type alias and protocol re-exports for the realtime
/// (bidirectional streaming) API.
pub mod realtime {
    use super::*;

    // The realtime engine currently borrows the chat-completion protocol
    // types; they are re-exported here so callers can name them via this module.
    pub use protocols::openai::chat_completions::{
        NvCreateChatCompletionRequest, NvCreateChatCompletionStreamResponse,
    };

    /// A [`BidirectionalStreamingEngine`] alias typed with the OpenAI Chat
    /// Completions request and stream-response types.
    ///
    /// Many-in / many-out: the client streams a sequence of
    /// `NvCreateChatCompletionRequest` chunks for the same logical session and
    /// receives a stream of `NvCreateChatCompletionStreamResponse` chunks
    /// back. Used by the experimental `/v1/realtime` WebSocket endpoint. The
    /// canonical concrete implementor of the input side is
    /// [`dynamo_runtime::pipeline::RequestStream`].
    ///
    /// TODO (#9175): the chat-completion request/response types are reused
    /// for now, as the request can carry audio data on the request side and
    /// text data on the response side. Will move to a dedicated realtime-API
    /// protocol type.
    pub type RealtimeBidirectionalEngine = BidirectionalStreamingEngine<
        NvCreateChatCompletionRequest,
        Annotated<NvCreateChatCompletionStreamResponse>,
    >;
}
}