runpod_sdk/model/endpoint.rs
use serde::{Deserialize, Serialize};
#[cfg(feature = "strum")]
use strum::{Display, EnumString};

use super::common::*;
use super::pod::Pod;
use super::template::Template;
#[cfg(feature = "serverless")]
use crate::RunpodClient;
#[cfg(feature = "serverless")]
use crate::serverless::{ServerlessEndpoint, ServerlessJob};

/// Scaling strategy for serverless endpoint worker management.
///
/// Determines how the serverless infrastructure responds to incoming request load
/// by automatically scaling the number of active workers up or down.
///
/// # Strategies
///
/// ## Queue Delay (`QueueDelay`)
/// **Latency-optimized scaling** that prioritizes response time consistency.
/// - Scales up when requests wait longer than the configured threshold
/// - Maintains responsive service with predictable latency characteristics
/// - Best for interactive applications, real-time inference, SLA-sensitive workloads
/// - Higher baseline costs to ensure responsiveness
///
/// ## Request Count (`RequestCount`)
/// **Throughput-optimized scaling** that balances load efficiently across workers.
/// - Maintains approximately `queue_size / scaler_value` workers
/// - Optimizes for cost efficiency and overall throughput
/// - Best for batch processing, background tasks, cost-sensitive workloads
/// - May have higher latency during traffic spikes
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::ScalerType;
///
/// // For real-time AI inference requiring <3s response times
/// let latency_optimized = ScalerType::QueueDelay;
/// // scaler_value = 2 means scale up if any request waits >2 seconds
///
/// // For batch image processing where cost matters more than speed
/// let cost_optimized = ScalerType::RequestCount;
/// // scaler_value = 10 means maintain 1 worker per 10 queued requests
/// ```
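///
/// The snippet below is a small sketch of the wire format, assuming `serde_json`
/// is available (it is already used by other doc examples in this crate). With
/// `rename_all = "SCREAMING_SNAKE_CASE"`, the variants serialize as `"QUEUE_DELAY"`
/// and `"REQUEST_COUNT"`; the optional `strum` feature applies the same casing to
/// `Display`/`FromStr`.
///
/// ```rust
/// use runpod_sdk::model::ScalerType;
///
/// // Serialized form sent to the REST API (SCREAMING_SNAKE_CASE).
/// let json = serde_json::to_string(&ScalerType::QueueDelay).unwrap();
/// assert_eq!(json, "\"QUEUE_DELAY\"");
///
/// // Round-trip back into the enum.
/// let parsed: ScalerType = serde_json::from_str(&json).unwrap();
/// assert_eq!(parsed, ScalerType::QueueDelay);
/// ```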
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[cfg_attr(feature = "strum", derive(Display, EnumString))]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[cfg_attr(feature = "strum", strum(serialize_all = "SCREAMING_SNAKE_CASE"))]
pub enum ScalerType {
    /// Queue delay-based scaling - prioritizes response time consistency.
    /// Scales up when requests wait longer than the threshold.
    #[default]
    QueueDelay,
    /// Request count-based scaling - optimizes for throughput and cost.
    /// Maintains workers proportional to queue depth.
    RequestCount,
}

/// Serverless endpoint resource providing auto-scaling compute infrastructure.
///
/// Represents a fully configured serverless endpoint with all deployment settings,
/// scaling configuration, and runtime status. Endpoints automatically manage
/// worker lifecycle based on request load and configured policies.
///
/// # Key Properties
///
/// - **Auto-scaling**: Workers spin up/down based on request queue and scaling policy
/// - **Template-driven**: Consistent runtime environment from pre-configured templates
/// - **Multi-region**: Distributed deployment across multiple data centers
/// - **Cost-optimized**: Pay-per-use billing with idle timeout management
/// - **High-availability**: Automatic failover and redundancy
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::Endpoint;
///
/// // Endpoint instances are typically obtained from API responses
/// // when listing, creating, or retrieving serverless endpoints
/// ```
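///
/// Once an endpoint has been retrieved, its public fields can be inspected directly.
/// A minimal sketch (the helper function below is hypothetical, not part of the SDK):
///
/// ```rust
/// use runpod_sdk::model::{Endpoint, ScalerType};
///
/// // Summarize the scaling configuration of an endpoint returned by the API.
/// fn describe(endpoint: &Endpoint) -> String {
///     let policy = match endpoint.scaler_type {
///         ScalerType::QueueDelay => "latency-optimized",
///         ScalerType::RequestCount => "throughput-optimized",
///     };
///     format!(
///         "{} scales {}-{} workers ({}, scaler_value = {})",
///         endpoint.id, endpoint.workers_min, endpoint.workers_max, policy, endpoint.scaler_value
///     )
/// }
/// ```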
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Endpoint {
    /// A unique string identifying the serverless endpoint.
    pub id: String,

    /// A user-defined name for the endpoint. The name does not need to be unique.
    ///
    /// Used for organization and identification in dashboards and monitoring.
    /// Can be updated without affecting endpoint functionality.
    pub name: Option<String>,

    /// A unique string identifying the RunPod user who created the endpoint.
    pub user_id: String,

    /// The unique string identifying the template used to create the endpoint.
    ///
    /// Templates define the container image, environment, and resource configuration
    /// that will be deployed across all workers for this endpoint.
    pub template_id: String,

    /// The current version of the endpoint configuration.
    ///
    /// Incremented whenever the template or environment variables are changed,
    /// triggering a rolling update of all workers.
    pub version: i32,

    /// The type of compute used by workers on this endpoint.
    ///
    /// Determines whether workers will have GPU or CPU compute resources attached.
    /// This setting affects pricing, available hardware types, and performance characteristics.
    pub compute_type: ComputeType,

    /// The UTC timestamp when the endpoint was created.
    ///
    /// ISO 8601 format string representing the endpoint creation time.
    pub created_at: String,

    /// List of RunPod data center IDs where workers can be located.
    ///
    /// Workers are distributed across these data centers for availability and performance.
    /// The system automatically selects the best available data center based on
    /// resource availability and proximity to users.
    pub data_center_ids: Vec<DataCenterId>,

    /// Environment variables for the endpoint's container runtime.
    ///
    /// These variables are injected into all worker containers and can be used
    /// for configuration, API keys, feature flags, and other runtime settings.
    pub env: Option<EnvVars>,

    /// The maximum execution time in milliseconds for individual requests.
    ///
    /// If a request exceeds this timeout, the worker is stopped and the request
    /// is marked as failed. This prevents runaway processes and ensures
    /// predictable resource usage.
    ///
    /// **Common values:**
    /// - Web APIs: 30,000ms (30 seconds)
    /// - AI inference: 300,000ms (5 minutes)
    /// - Batch processing: 3,600,000ms (1 hour)
    pub execution_timeout_ms: i32,

    /// The number of GPUs attached to each worker (GPU endpoints only).
    ///
    /// Only relevant when `compute_type` is `GPU`. Determines the GPU resources
    /// allocated to each worker instance for parallel processing workloads.
    pub gpu_count: Option<i32>,

    /// List of RunPod GPU types that can be attached to workers (GPU endpoints only).
    ///
    /// The system tries to allocate GPUs in the order specified, falling back
    /// to subsequent types if the preferred options are unavailable.
    /// Only relevant when `compute_type` is `GPU`.
    pub gpu_type_ids: Option<Vec<GpuTypeId>>,

    /// List of CPU instance IDs that can be attached to workers (CPU endpoints only).
    ///
    /// For CPU endpoints, specifies the available instance types that workers
    /// can use, allowing the system to choose based on availability and cost.
    pub instance_ids: Option<Vec<String>>,

    /// The number of seconds a worker can be idle before being scaled down.
    ///
    /// Workers that haven't processed requests for this duration are automatically
    /// terminated to reduce costs. Shorter timeouts reduce costs but may increase
    /// cold start latency for subsequent requests.
    ///
    /// **Typical values:**
    /// - Cost-optimized: 1-5 seconds
    /// - Balanced: 5-15 seconds
    /// - Performance-optimized: 30-60 seconds
    pub idle_timeout: i32,

    /// The unique ID of the network volume attached to workers, if any.
    ///
    /// Network volumes provide persistent, shared storage across all workers,
    /// useful for model weights, datasets, and other shared assets.
    pub network_volume_id: Option<String>,

    /// The scaling strategy used to manage worker count.
    ///
    /// Determines how the system responds to request load by scaling workers
    /// up or down automatically.
    pub scaler_type: ScalerType,

    /// The scaling sensitivity parameter.
    ///
    /// **For `QueueDelay` scaling:**
    /// - Seconds a request can wait in queue before scaling up
    /// - Lower values = more responsive but potentially higher costs
    ///
    /// **For `RequestCount` scaling:**
    /// - Target requests per worker (queue_size / scaler_value = worker_count)
    /// - Higher values = fewer workers, more cost-efficient
    pub scaler_value: i32,

    /// The maximum number of workers that can run simultaneously.
    ///
    /// Hard limit preventing runaway scaling and controlling maximum costs.
    /// Set based on expected peak load and budget constraints.
    pub workers_max: i32,

    /// The minimum number of workers that always remain running.
    ///
    /// Reserved capacity that's always available, even during idle periods.
    /// These workers are billed at a lower rate but provide immediate availability.
    /// Set to 0 for maximum cost efficiency, or >0 for better responsiveness.
    pub workers_min: i32,

    /// List of acceptable CUDA versions for GPU workers.
    ///
    /// If specified, only workers with compatible CUDA runtimes will be used.
    /// Useful for ensuring compatibility with specific AI/ML frameworks.
    /// Only relevant for GPU endpoints.
    pub allowed_cuda_versions: Option<Vec<CudaVersion>>,

    /// Detailed template information (included when `include_template` is true).
    ///
    /// Contains the full template configuration including container image,
    /// environment setup, and resource requirements.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub template: Option<Template>,

    /// Current worker instances (included when `include_workers` is true).
    ///
    /// List of active worker pods with their current status, resource allocation,
    /// and performance metrics.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workers: Option<Vec<Pod>>,
}

/// List of serverless endpoints.
///
/// A collection type representing multiple endpoints, typically returned
/// by API operations that list the endpoints for an account.
pub type Endpoints = Vec<Endpoint>;

/// Input parameters for creating a new serverless endpoint.
///
/// This struct contains all the configuration options available when creating an endpoint,
/// including compute specifications, scaling policies, and deployment preferences.
/// Most fields are optional and will use RunPod defaults if not specified.
///
/// # Required Fields
///
/// Only `template_id` is required - all other configuration uses sensible defaults
/// that can be customized based on your specific workload requirements.
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::{EndpointCreateInput, ScalerType};
/// use runpod_sdk::model::{ComputeType, CudaVersion, GpuTypeId};
///
/// // High-performance GPU endpoint for real-time AI inference
/// let inference_endpoint = EndpointCreateInput {
///     template_id: "pytorch-inference-template".to_string(),
///     name: Some("ai-inference-prod".to_string()),
///     compute_type: Some(ComputeType::Gpu),
///     gpu_count: Some(1),
///     gpu_type_ids: Some(vec![GpuTypeId::NvidiaA100_80GbPcie]),
///     allowed_cuda_versions: Some(vec![CudaVersion::V12_1]),
///     scaler_type: Some(ScalerType::QueueDelay),
///     scaler_value: Some(3),               // Scale if requests wait >3 seconds
///     workers_min: Some(1),                // Keep 1 worker always ready
///     workers_max: Some(5),                // Burst up to 5 workers
///     flashboot: Some(true),               // Fast cold starts
///     idle_timeout: Some(30),              // Scale down after 30s idle
///     execution_timeout_ms: Some(300000),  // 5 minute timeout
///     ..Default::default()
/// };
///
/// // Cost-optimized CPU endpoint for batch processing
/// let batch_endpoint = EndpointCreateInput {
///     template_id: "batch-processor-template".to_string(),
///     name: Some("data-batch-processor".to_string()),
///     compute_type: Some(ComputeType::Cpu),
///     vcpu_count: Some(8),
///     scaler_type: Some(ScalerType::RequestCount),
///     scaler_value: Some(10),              // 1 worker per 10 requests
///     workers_min: Some(0),                // No reserved capacity
///     workers_max: Some(20),               // Allow large bursts
///     flashboot: Some(false),              // Standard startup (cheaper)
///     idle_timeout: Some(120),             // Longer idle time for batches
///     execution_timeout_ms: Some(1800000), // 30 minute timeout
///     ..Default::default()
/// };
/// ```
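///
/// Serialization follows the REST wire format (camelCase keys, unset `Option`
/// fields omitted); a minimal sketch assuming `serde_json` is available:
///
/// ```rust
/// use runpod_sdk::model::EndpointCreateInput;
///
/// let input = EndpointCreateInput {
///     template_id: "my-template".to_string(),
///     workers_max: Some(3),
///     ..Default::default()
/// };
///
/// let json = serde_json::to_value(&input).unwrap();
/// // `rename_all = "camelCase"` maps `template_id` to `templateId`, and
/// // optional fields left as `None` are skipped entirely.
/// assert_eq!(json["templateId"], "my-template");
/// assert_eq!(json["workersMax"], 3);
/// assert!(json.get("gpuCount").is_none());
/// ```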
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct EndpointCreateInput {
    /// The unique string identifying the template used to create the endpoint.
    ///
    /// **Required field** - specifies the container image, environment, and
    /// resource configuration that will be deployed across all workers.
    ///
    /// Templates ensure consistent runtime environments and can be shared
    /// across multiple endpoints for standardized deployments.
    pub template_id: String,

    /// If the endpoint is a GPU endpoint, acceptable CUDA versions for workers.
    ///
    /// Constrains worker allocation to machines with compatible CUDA runtimes.
    /// Useful for ensuring compatibility with specific AI/ML framework versions
    /// that require particular CUDA versions.
    ///
    /// **Default**: Any CUDA version is acceptable
    /// **GPU endpoints only**: Ignored for CPU endpoints
    ///
    /// **Example**: `[CudaVersion::V12_1, CudaVersion::V11_8]`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub allowed_cuda_versions: Option<Vec<CudaVersion>>,

    /// Set to `GPU` for GPU-accelerated workers, `CPU` for CPU-only workers.
    ///
    /// Determines the type of compute resources allocated to workers:
    /// - `GPU`: Workers get GPU acceleration for AI/ML workloads
    /// - `CPU`: Workers get high-performance CPUs for general compute
    ///
    /// **Default**: `GPU`
    /// **Impact**: Affects available hardware types, pricing, and performance
    #[serde(skip_serializing_if = "Option::is_none")]
    pub compute_type: Option<ComputeType>,

    /// If creating a CPU endpoint, list of CPU flavors for workers.
    ///
    /// Specifies the CPU configurations that can be used for workers.
    /// The order determines rental priority - preferred flavors first.
    ///
    /// **CPU endpoints only**: Ignored for GPU endpoints
    /// **Default**: All available CPU flavors
    ///
    /// **Available flavors**: `cpu3c`, `cpu3g`, `cpu5c`, `cpu5g`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cpu_flavor_ids: Option<Vec<CpuFlavorId>>,

    /// List of data center IDs where workers can be located.
    ///
    /// Workers are distributed across these data centers for availability,
    /// performance, and proximity to users. The system automatically
    /// selects the best available data center for each worker.
    ///
    /// **Default**: All available data centers globally
    /// **Strategy**: Choose data centers close to your users and data sources
    ///
    /// **Common choices:**
    /// - Global: `["US-CA-1", "EU-RO-1", "AP-JP-1"]`
    /// - Regional: `["US-TX-1", "US-CA-2"]` for US-only
    /// - Single DC: `["EU-RO-1"]` for data residency requirements
    #[serde(skip_serializing_if = "Option::is_none")]
    pub data_center_ids: Option<Vec<DataCenterId>>,

    /// Maximum execution time in milliseconds for individual requests.
    ///
    /// Requests exceeding this timeout are terminated and marked as failed.
    /// Prevents runaway processes and ensures predictable resource usage.
    ///
    /// **Default**: 600,000ms (10 minutes)
    /// **Range**: 1,000ms to 3,600,000ms (1 second to 1 hour)
    ///
    /// **Guidelines:**
    /// - Web APIs: 30,000ms (30 seconds)
    /// - AI inference: 300,000ms (5 minutes)
    /// - Image processing: 600,000ms (10 minutes)
    /// - Batch jobs: 3,600,000ms (1 hour)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub execution_timeout_ms: Option<i32>,

    /// Whether to enable flash boot for faster worker startup.
    ///
    /// Flash boot dramatically reduces cold start time by using pre-warmed
    /// container images with cached dependencies and optimized initialization.
    ///
    /// **Default**: `false`
    /// **Trade-off**: Higher per-request cost for much faster startup
    /// **Best for**: Interactive applications, real-time inference, low-latency requirements
    /// **Startup time**: ~5-10 seconds with flash boot vs 30-60 seconds without
    #[serde(skip_serializing_if = "Option::is_none")]
    pub flashboot: Option<bool>,

    /// If creating a GPU endpoint, number of GPUs per worker.
    ///
    /// Determines GPU resources allocated to each worker for parallel processing.
    /// More GPUs enable larger models and higher throughput but increase costs.
    ///
    /// **Default**: 1
    /// **GPU endpoints only**: Ignored for CPU endpoints
    /// **Range**: 1-8 depending on GPU type availability
    ///
    /// **Use cases:**
    /// - Single GPU: Most inference workloads, small models
    /// - Multi-GPU: Large language models, distributed training, high-throughput inference
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_count: Option<i32>,

    /// If creating a GPU endpoint, list of GPU types for workers.
    ///
    /// Specifies GPU hardware that can be used for workers. The order
    /// determines rental priority - the system tries preferred types first.
    ///
    /// **GPU endpoints only**: Ignored for CPU endpoints
    /// **Default**: All available GPU types
    ///
    /// **Performance tiers:**
    /// - High-end: `"NVIDIA H100 80GB HBM3"`, `"NVIDIA A100 80GB PCIe"`
    /// - Mid-range: `"NVIDIA RTX A6000"`, `"NVIDIA A40"`
    /// - Budget: `"NVIDIA RTX 4090"`, `"NVIDIA RTX 3090"`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_type_ids: Option<Vec<GpuTypeId>>,

    /// Number of seconds workers can be idle before scaling down.
    ///
    /// Workers that haven't processed requests for this duration are
    /// automatically terminated to reduce costs. Balance between cost
    /// optimization and cold start latency.
    ///
    /// **Default**: 5 seconds
    /// **Range**: 1-3600 seconds (1 second to 1 hour)
    ///
    /// **Strategy:**
    /// - Aggressive (cost-focused): 1-5 seconds
    /// - Balanced: 5-15 seconds
    /// - Responsive (latency-focused): 30-60 seconds
    #[serde(skip_serializing_if = "Option::is_none")]
    pub idle_timeout: Option<i32>,

    /// A user-defined name for the endpoint.
    ///
    /// Used for organization and identification in dashboards, monitoring,
    /// and API responses. The name does not need to be unique across your account.
    ///
    /// **Default**: Auto-generated based on template name
    /// **Max length**: 191 characters
    /// **Best practices**: Use descriptive names like "prod-image-classifier" or "staging-api-v2"
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,

    /// The unique ID of a network volume to attach to workers.
    ///
    /// Network volumes provide persistent, shared storage across all workers,
    /// useful for model weights, datasets, cached data, and other shared assets.
    ///
    /// **Default**: No network volume attached
    /// **Requirements**: Volume must exist in same data centers as workers
    /// **Use cases**: Model storage, dataset access, shared caching, persistent logs
    #[serde(skip_serializing_if = "Option::is_none")]
    pub network_volume_id: Option<String>,

    /// The scaling strategy for managing worker count.
    ///
    /// Determines how the system automatically scales workers up/down based
    /// on request load and queue depth.
    ///
    /// **Default**: `QueueDelay`
    ///
    /// **Strategies:**
    /// - `QueueDelay`: Scale based on request wait time (latency-optimized)
    /// - `RequestCount`: Scale based on queue depth (throughput-optimized)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scaler_type: Option<ScalerType>,

    /// The scaling sensitivity parameter.
    ///
    /// Meaning depends on the `scaler_type`:
    ///
    /// **For `QueueDelay`**: Maximum seconds requests can wait before scaling up
    /// - Lower values = more responsive scaling, higher costs
    /// - Higher values = slower scaling, lower costs
    ///
    /// **For `RequestCount`**: Target requests per worker
    /// - `queue_size / scaler_value = target_worker_count`
    /// - Lower values = more workers, lower latency
    /// - Higher values = fewer workers, higher latency
    ///
    /// **Default**: 4
    /// **Range**: 1-3600
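    ///
    /// For example, with `RequestCount` scaling and `scaler_value = 10`, a queue of
    /// 40 pending requests targets roughly `40 / 10 = 4` workers.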
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scaler_value: Option<i32>,

    /// If creating a CPU endpoint, number of vCPUs per worker.
    ///
    /// Determines CPU resources allocated to each worker. More vCPUs enable
    /// higher parallelism and throughput for CPU-intensive workloads.
    ///
    /// **Default**: 2 vCPUs
    /// **CPU endpoints only**: Ignored for GPU endpoints
    /// **Range**: 1-32 vCPUs depending on CPU flavor
    ///
    /// **Guidelines:**
    /// - Light workloads: 1-2 vCPUs
    /// - Web APIs: 2-4 vCPUs
    /// - Data processing: 4-16 vCPUs
    /// - Heavy computation: 16+ vCPUs
    #[serde(skip_serializing_if = "Option::is_none")]
    pub vcpu_count: Option<i32>,

    /// Maximum number of workers that can run simultaneously.
    ///
    /// Hard limit preventing runaway scaling and controlling maximum costs.
    /// Set based on expected peak load, budget constraints, and infrastructure limits.
    ///
    /// **Default**: No limit (subject to account quotas)
    /// **Range**: 0-1000+ depending on account limits
    ///
    /// **Strategy**: Set 2-3x expected peak load for safety margin
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workers_max: Option<i32>,

    /// Minimum number of workers that always remain running.
    ///
    /// Reserved capacity providing immediate availability even during idle
    /// periods. These workers are billed at a reduced rate but ensure
    /// zero cold start latency for the first few requests.
    ///
    /// **Default**: 0 (no reserved capacity)
    /// **Range**: 0-100 depending on account limits
    ///
    /// **Trade-offs:**
    /// - 0: Maximum cost efficiency, but cold starts for first requests
    /// - 1+: Immediate availability, continuous billing for reserved workers
    ///
    /// **Strategy**: Set to 1 for production endpoints requiring <1s response time
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workers_min: Option<i32>,
}

/// Input parameters for updating an existing serverless endpoint.
///
/// This struct allows you to modify endpoint configuration and trigger a rolling
/// release that updates all workers with the new settings. All fields are optional,
/// allowing you to update only the properties you want to change.
///
/// # Rolling Release Process
///
/// When an endpoint is updated:
/// 1. **Validation**: New configuration is validated for compatibility
/// 2. **Version Increment**: Endpoint version number is incremented
/// 3. **Rolling Update**: Workers are gradually replaced with new configuration
/// 4. **Traffic Migration**: Requests are routed to updated workers as they become available
/// 5. **Cleanup**: Old workers are terminated once traffic migration is complete
///
/// # Important Notes
///
/// - **Zero Downtime**: Updates are performed without service interruption
/// - **Gradual Rollout**: Workers are updated in batches to maintain availability
/// - **Rollback**: Previous versions can be restored if issues are detected
/// - **Template Changes**: Updating `template_id` deploys new container images
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::{EndpointUpdateInput, ScalerType};
///
/// // Scale up for increased traffic
/// let scale_up = EndpointUpdateInput {
///     workers_max: Some(20),   // Double capacity
///     scaler_value: Some(2),   // More aggressive scaling
///     idle_timeout: Some(10),  // Keep workers longer
///     ..Default::default()
/// };
///
/// // Enable flash boot for better performance
/// let performance_upgrade = EndpointUpdateInput {
///     flashboot: Some(true),
///     execution_timeout_ms: Some(60000), // Reduce timeout
///     ..Default::default()
/// };
///
/// // Switch to cost-optimized scaling
/// let cost_optimization = EndpointUpdateInput {
///     scaler_type: Some(ScalerType::RequestCount),
///     scaler_value: Some(10),  // 1 worker per 10 requests
///     workers_min: Some(0),    // No reserved capacity
///     flashboot: Some(false),  // Standard startup
///     ..Default::default()
/// };
/// ```
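///
/// Only the fields you set are serialized into the update payload (camelCase keys,
/// `None` fields omitted, so unset properties stay unchanged); a minimal sketch
/// assuming `serde_json` is available:
///
/// ```rust
/// use runpod_sdk::model::EndpointUpdateInput;
///
/// let update = EndpointUpdateInput {
///     workers_max: Some(20),
///     idle_timeout: Some(10),
///     ..Default::default()
/// };
///
/// let json = serde_json::to_value(&update).unwrap();
/// assert_eq!(json["workersMax"], 20);
/// assert_eq!(json["idleTimeout"], 10);
/// // Fields left as `None` are not sent at all.
/// assert!(json.get("templateId").is_none());
/// ```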
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct EndpointUpdateInput {
    /// If the endpoint is a GPU endpoint, acceptable CUDA versions for workers.
    ///
    /// Updates the CUDA version constraints for worker allocation.
    /// Triggers rolling release to ensure all workers use compatible CUDA versions.
    ///
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub allowed_cuda_versions: Option<Vec<CudaVersion>>,

    /// If the endpoint is a CPU endpoint, list of CPU flavors for workers.
    ///
    /// Updates the available CPU configurations for workers.
    /// The order determines rental priority for new workers.
    ///
    /// **CPU endpoints only**: Ignored for GPU endpoints
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cpu_flavor_ids: Option<Vec<CpuFlavorId>>,

    /// List of data center IDs where workers can be located.
    ///
    /// Updates the geographic distribution of workers.
    /// Existing workers in removed data centers will be gradually replaced.
    ///
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub data_center_ids: Option<Vec<DataCenterId>>,

    /// Maximum execution time in milliseconds for individual requests.
    ///
    /// Updates the timeout for request processing. Affects new requests
    /// immediately, existing requests continue with previous timeout.
    ///
    /// **Range**: 1,000ms to 3,600,000ms (1 second to 1 hour)
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub execution_timeout_ms: Option<i32>,

    /// Whether to enable flash boot for faster worker startup.
    ///
    /// Updates the startup optimization for new workers.
    /// Affects cold start performance and per-request costs.
    ///
    /// **Trade-off**: Higher per-request cost for faster startup
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub flashboot: Option<bool>,

    /// If the endpoint is a GPU endpoint, number of GPUs per worker.
    ///
    /// Updates GPU allocation for new workers. Triggers rolling release
    /// to deploy workers with the new GPU configuration.
    ///
    /// **GPU endpoints only**: Ignored for CPU endpoints
    /// **Range**: 1-8 depending on GPU type availability
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_count: Option<i32>,

    /// If the endpoint is a GPU endpoint, list of GPU types for workers.
    ///
    /// Updates available GPU hardware types for workers.
    /// The order determines rental priority for new workers.
    ///
    /// **GPU endpoints only**: Ignored for CPU endpoints
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_type_ids: Option<Vec<GpuTypeId>>,

    /// Number of seconds workers can be idle before scaling down.
    ///
    /// Updates the idle timeout for worker lifecycle management.
    /// Affects cost optimization and cold start frequency.
    ///
    /// **Range**: 1-3600 seconds (1 second to 1 hour)
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub idle_timeout: Option<i32>,

    /// A user-defined name for the endpoint.
    ///
    /// Updates the display name used in dashboards and API responses.
    /// This change is applied immediately without triggering a rolling release.
    ///
    /// **Max length**: 191 characters
    /// **Note**: Set to `None` to keep current name unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,

    /// The unique ID of a network volume to attach to workers.
    ///
    /// Updates the persistent storage attached to workers.
    /// Triggers rolling release to mount/unmount volumes on all workers.
    ///
    /// **Requirements**: Volume must exist in same data centers as workers
    /// **Note**: Set to `None` to keep current volume unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub network_volume_id: Option<String>,

    /// The scaling strategy for managing worker count.
    ///
    /// Updates the auto-scaling algorithm used for worker management.
    /// Change takes effect immediately for new scaling decisions.
    ///
    /// **Strategies:**
    /// - `QueueDelay`: Scale based on request wait time
    /// - `RequestCount`: Scale based on queue depth
    ///
    /// **Note**: Set to `None` to keep current strategy unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scaler_type: Option<ScalerType>,

    /// The scaling sensitivity parameter.
    ///
    /// Updates the scaling behavior sensitivity.
    /// Change takes effect immediately for new scaling decisions.
    ///
    /// **For QueueDelay**: Maximum seconds requests can wait
    /// **For RequestCount**: Target requests per worker
    /// **Range**: 1-3600
    /// **Note**: Set to `None` to keep current value unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scaler_value: Option<i32>,

    /// The unique ID of the template used to create the endpoint.
    ///
    /// Updates the container image and environment configuration.
    /// Triggers rolling release to deploy all workers with the new template.
    ///
    /// **Impact**: Changes container image, environment, resource allocation
    /// **Rolling Release**: All workers are gradually replaced
    /// **Note**: Set to `None` to keep current template unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub template_id: Option<String>,

    /// If the endpoint is a CPU endpoint, number of vCPUs per worker.
    ///
    /// Updates CPU allocation for new workers. Triggers rolling release
    /// to deploy workers with the new CPU configuration.
    ///
    /// **CPU endpoints only**: Ignored for GPU endpoints
    /// **Range**: 1-32 vCPUs depending on CPU flavor
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub vcpu_count: Option<i32>,

    /// Maximum number of workers that can run simultaneously.
    ///
    /// Updates the scaling limit for worker count.
    /// Change takes effect immediately for new scaling decisions.
    ///
    /// **Range**: 0-1000+ depending on account limits
    /// **Note**: Set to `None` to keep current limit unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workers_max: Option<i32>,

    /// Minimum number of workers that always remain running.
    ///
    /// Updates the reserved capacity for immediate availability.
    /// Change triggers immediate scaling to meet the new minimum.
    ///
    /// **Range**: 0-100 depending on account limits
    /// **Billing**: Reserved workers are always charged (at reduced rate)
    /// **Note**: Set to `None` to keep current minimum unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workers_min: Option<i32>,
}

/// Query parameters for listing serverless endpoints.
///
/// Controls which additional data is included in the response when retrieving
/// multiple endpoints. Including additional data provides more detailed information
/// but increases response size and latency.
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::ListEndpointsQuery;
///
/// // Basic listing (endpoints only)
/// let basic_query = ListEndpointsQuery::default();
///
/// // Include template details for each endpoint
/// let with_templates = ListEndpointsQuery {
///     include_template: Some(true),
///     include_workers: Some(false),
/// };
///
/// // Include both template and worker information
/// let full_details = ListEndpointsQuery {
///     include_template: Some(true),
///     include_workers: Some(true),
/// };
/// ```
#[derive(Debug, Clone, Default, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ListEndpointsQuery {
    /// Whether to include template information for each endpoint.
    ///
    /// When `true`, the response includes detailed template configuration
    /// including container image, environment variables, resource requirements,
    /// and deployment settings for each endpoint.
    ///
    /// **Default**: `false`
    /// **Impact**: Increases response size and latency
    /// **Useful for**: Deployment auditing, configuration comparison, debugging
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_template: Option<bool>,

    /// Whether to include current worker information for each endpoint.
    ///
    /// When `true`, the response includes detailed information about active
    /// workers including their status, resource allocation, performance metrics,
    /// and current workload for each endpoint.
    ///
    /// **Default**: `false`
    /// **Impact**: Significantly increases response size and latency
    /// **Useful for**: Capacity monitoring, performance analysis, troubleshooting
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_workers: Option<bool>,
}

/// Query parameters for retrieving a single serverless endpoint.
///
/// Controls which additional data is included in the response when retrieving
/// a specific endpoint. Including additional data provides more detailed information
/// but increases response size and latency.
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::GetEndpointQuery;
///
/// // Basic endpoint information only
/// let basic_query = GetEndpointQuery::default();
///
/// // Include template configuration
/// let with_template = GetEndpointQuery {
///     include_template: Some(true),
///     include_workers: Some(false),
/// };
///
/// // Include complete details for monitoring
/// let monitoring_query = GetEndpointQuery {
///     include_template: Some(true),
///     include_workers: Some(true),
/// };
/// ```
#[derive(Debug, Clone, Default, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct GetEndpointQuery {
    /// Whether to include template information in the response.
    ///
    /// When `true`, the response includes detailed template configuration
    /// including container image, environment variables, resource requirements,
    /// and deployment settings.
    ///
    /// **Default**: `false`
    /// **Impact**: Increases response size and latency
    /// **Useful for**: Configuration review, deployment debugging, audit trails
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_template: Option<bool>,

    /// Whether to include current worker information in the response.
    ///
    /// When `true`, the response includes detailed information about active
    /// workers including their status, resource allocation, performance metrics,
    /// machine details, and current workload.
    ///
    /// **Default**: `false`
    /// **Impact**: Significantly increases response size and latency
    /// **Useful for**: Real-time monitoring, performance optimization, troubleshooting
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_workers: Option<bool>,
}

#[cfg(feature = "serverless")]
#[cfg_attr(docsrs, doc(cfg(feature = "serverless")))]
impl Endpoint {
    /// Creates an endpoint runner from this endpoint.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use runpod_sdk::{RunpodClient, Result};
    /// # use runpod_sdk::service::EndpointsService;
    /// # use runpod_sdk::model::GetEndpointQuery;
    /// # async fn example() -> Result<()> {
    /// let client = RunpodClient::from_env()?;
    /// let serverless_endpoint = client.get_endpoint("endpoint_id", GetEndpointQuery::default()).await?;
    ///
    /// let runner = serverless_endpoint.to_runner(client);
    /// # Ok(())
    /// # }
    /// ```
    pub fn to_runner(&self, client: RunpodClient) -> ServerlessEndpoint {
        ServerlessEndpoint::new(&self.id, client)
    }

    /// Runs a job on this endpoint.
    ///
    /// This is a convenience method that creates a runner and submits a job in one call.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use runpod_sdk::{RunpodClient, Result};
    /// # use runpod_sdk::service::EndpointsService;
    /// # use runpod_sdk::model::GetEndpointQuery;
    /// # use serde_json::json;
    /// # async fn example() -> Result<()> {
    /// let client = RunpodClient::from_env()?;
    /// let serverless_endpoint = client.get_endpoint("endpoint_id", GetEndpointQuery::default()).await?;
    ///
    /// let job = serverless_endpoint.run(client, &json!({"prompt": "Hello"}))?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn run<I>(&self, client: RunpodClient, input: &I) -> crate::Result<ServerlessJob>
    where
        I: Serialize,
    {
        let runner = self.to_runner(client);
        runner.run(input)
    }
}
909}