runpod_sdk/model/
endpoint.rs

use serde::{Deserialize, Serialize};
#[cfg(feature = "strum")]
use strum::{Display, EnumString};

use super::common::*;
use super::pod::Pod;
use super::template::Template;
#[cfg(feature = "serverless")]
use crate::RunpodClient;
#[cfg(feature = "serverless")]
use crate::serverless::{ServerlessEndpoint, ServerlessJob};

/// Scaling strategy for serverless endpoint worker management.
///
/// Determines how the serverless infrastructure responds to incoming request load
/// by automatically scaling the number of active workers up or down.
///
/// # Strategies
///
/// ## Queue Delay (`QueueDelay`)
/// **Latency-optimized scaling** that prioritizes response time consistency.
/// - Scales up when requests wait longer than the configured threshold
/// - Maintains responsive service with predictable latency characteristics
/// - Best for interactive applications, real-time inference, SLA-sensitive workloads
/// - Higher baseline costs to ensure responsiveness
///
/// ## Request Count (`RequestCount`)
/// **Throughput-optimized scaling** that balances load efficiently across workers.
/// - Maintains approximately `queue_size / scaler_value` workers
/// - Optimizes for cost efficiency and overall throughput
/// - Best for batch processing, background tasks, cost-sensitive workloads
/// - May have higher latency during traffic spikes
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::ScalerType;
///
/// // For real-time AI inference requiring <3s response times
/// let latency_optimized = ScalerType::QueueDelay;
/// // scaler_value = 2 means scale up if any request waits >2 seconds
///
/// // For batch image processing where cost matters more than speed
/// let cost_optimized = ScalerType::RequestCount;
/// // scaler_value = 10 means maintain 1 worker per 10 queued requests
/// ```
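///
/// The `serde` and optional `strum` derives below expose the variants in
/// `SCREAMING_SNAKE_CASE` on the wire. A minimal round-trip sketch (assuming
/// `serde_json` is available in doctests, as it is for the examples further down):
///
/// ```rust
/// use runpod_sdk::model::ScalerType;
///
/// // `QueueDelay` serializes as "QUEUE_DELAY"; `RequestCount` as "REQUEST_COUNT".
/// let wire = serde_json::to_string(&ScalerType::QueueDelay).unwrap();
/// assert_eq!(wire, "\"QUEUE_DELAY\"");
///
/// let parsed: ScalerType = serde_json::from_str("\"REQUEST_COUNT\"").unwrap();
/// assert_eq!(parsed, ScalerType::RequestCount);
/// ```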
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[cfg_attr(feature = "strum", derive(Display, EnumString))]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
#[cfg_attr(feature = "strum", strum(serialize_all = "SCREAMING_SNAKE_CASE"))]
pub enum ScalerType {
    /// Queue delay-based scaling - prioritizes response time consistency.
    /// Scales up when requests wait longer than the threshold.
    #[default]
    QueueDelay,
    /// Request count-based scaling - optimizes for throughput and cost.
    /// Maintains workers proportional to queue depth.
    RequestCount,
}

/// Serverless endpoint resource providing auto-scaling compute infrastructure.
///
/// Represents a fully configured serverless endpoint with all deployment settings,
/// scaling configuration, and runtime status. Endpoints automatically manage
/// worker lifecycle based on request load and configured policies.
///
/// # Key Properties
///
/// - **Auto-scaling**: Workers spin up/down based on request queue and scaling policy
/// - **Template-driven**: Consistent runtime environment from pre-configured templates
/// - **Multi-region**: Distributed deployment across multiple data centers
/// - **Cost-optimized**: Pay-per-use billing with idle timeout management
/// - **High-availability**: Automatic failover and redundancy
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::Endpoint;
///
/// // Endpoint instances are typically obtained from API responses
/// // when listing, creating, or retrieving serverless endpoints
/// ```
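///
/// A sketch of retrieving an endpoint and reading its scaling configuration,
/// using the `get_endpoint` call from `EndpointsService` shown in the method
/// docs at the bottom of this module (`no_run` because it needs a live API key):
///
/// ```no_run
/// # use runpod_sdk::{RunpodClient, Result};
/// # use runpod_sdk::service::EndpointsService;
/// # use runpod_sdk::model::GetEndpointQuery;
/// # async fn example() -> Result<()> {
/// let client = RunpodClient::from_env()?;
/// let endpoint = client.get_endpoint("endpoint_id", GetEndpointQuery::default()).await?;
/// println!(
///     "{} scales between {} and {} workers",
///     endpoint.id, endpoint.workers_min, endpoint.workers_max
/// );
/// # Ok(())
/// # }
/// ```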
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Endpoint {
    /// A unique string identifying the serverless endpoint.
    pub id: String,

    /// A user-defined name for the endpoint. The name does not need to be unique.
    ///
    /// Used for organization and identification in dashboards and monitoring.
    /// Can be updated without affecting endpoint functionality.
    pub name: Option<String>,

    /// A unique string identifying the RunPod user who created the endpoint.
    pub user_id: String,

    /// The unique string identifying the template used to create the endpoint.
    ///
    /// Templates define the container image, environment, and resource configuration
    /// that will be deployed across all workers for this endpoint.
    pub template_id: String,

    /// The current version of the endpoint configuration.
    ///
    /// Incremented whenever the template or environment variables are changed,
    /// triggering a rolling update of all workers.
    pub version: i32,

    /// The type of compute used by workers on this endpoint.
    ///
    /// Determines whether workers will have GPU or CPU compute resources attached.
    /// This setting affects pricing, available hardware types, and performance characteristics.
    pub compute_type: ComputeType,

    /// The UTC timestamp when the endpoint was created.
    ///
    /// ISO 8601 format string representing the endpoint creation time.
    pub created_at: String,

    /// List of RunPod data center IDs where workers can be located.
    ///
    /// Workers are distributed across these data centers for availability and performance.
    /// The system automatically selects the best available data center based on
    /// resource availability and proximity to users.
    pub data_center_ids: Vec<DataCenterId>,

    /// Environment variables for the endpoint's container runtime.
    ///
    /// These variables are injected into all worker containers and can be used
    /// for configuration, API keys, feature flags, and other runtime settings.
    pub env: Option<EnvVars>,

    /// The maximum execution time in milliseconds for individual requests.
    ///
    /// If a request exceeds this timeout, the worker is stopped and the request
    /// is marked as failed. This prevents runaway processes and ensures
    /// predictable resource usage.
    ///
    /// **Common values:**
    /// - Web APIs: 30,000ms (30 seconds)
    /// - AI inference: 300,000ms (5 minutes)
    /// - Batch processing: 3,600,000ms (1 hour)
    pub execution_timeout_ms: i32,

    /// The number of GPUs attached to each worker (GPU endpoints only).
    ///
    /// Only relevant when `compute_type` is `GPU`. Determines the GPU resources
    /// allocated to each worker instance for parallel processing workloads.
    pub gpu_count: Option<i32>,

    /// List of RunPod GPU types that can be attached to workers (GPU endpoints only).
    ///
    /// The system tries to allocate GPUs in the order specified, falling back
    /// to subsequent types if the preferred options are unavailable.
    /// Only relevant when `compute_type` is `GPU`.
    pub gpu_type_ids: Option<Vec<GpuTypeId>>,

    /// List of CPU instance IDs that can be attached to workers (CPU endpoints only).
    ///
    /// For CPU endpoints, specifies the available instance types that workers
    /// can use, allowing the system to choose based on availability and cost.
    pub instance_ids: Option<Vec<String>>,

    /// The number of seconds a worker can be idle before being scaled down.
    ///
    /// Workers that haven't processed requests for this duration are automatically
    /// terminated to reduce costs. Shorter timeouts reduce costs but may increase
    /// cold start latency for subsequent requests.
    ///
    /// **Typical values:**
    /// - Cost-optimized: 1-5 seconds
    /// - Balanced: 5-15 seconds
    /// - Performance-optimized: 30-60 seconds
    pub idle_timeout: i32,

    /// The unique ID of the network volume attached to workers, if any.
    ///
    /// Network volumes provide persistent, shared storage across all workers,
    /// useful for model weights, datasets, and other shared assets.
    pub network_volume_id: Option<String>,

    /// The scaling strategy used to manage worker count.
    ///
    /// Determines how the system responds to request load by scaling workers
    /// up or down automatically.
    pub scaler_type: ScalerType,

    /// The scaling sensitivity parameter.
    ///
    /// **For `QueueDelay` scaling:**
    /// - Seconds a request can wait in queue before scaling up
    /// - Lower values = more responsive but potentially higher costs
    ///
    /// **For `RequestCount` scaling:**
    /// - Target requests per worker (queue_size / scaler_value = worker_count)
    /// - Higher values = fewer workers, more cost-efficient
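    ///
    /// Worked example (illustrative): with `RequestCount` scaling, 40 queued
    /// requests and `scaler_value = 10` target roughly `40 / 10 = 4` workers,
    /// bounded by `workers_min` and `workers_max` below.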
    pub scaler_value: i32,

    /// The maximum number of workers that can run simultaneously.
    ///
    /// Hard limit preventing runaway scaling and controlling maximum costs.
    /// Set based on expected peak load and budget constraints.
    pub workers_max: i32,

    /// The minimum number of workers that always remain running.
    ///
    /// Reserved capacity that's always available, even during idle periods.
    /// These workers are billed at a lower rate but provide immediate availability.
    /// Set to 0 for maximum cost efficiency, or >0 for better responsiveness.
    pub workers_min: i32,

    /// List of acceptable CUDA versions for GPU workers.
    ///
    /// If specified, only workers with compatible CUDA runtimes will be used.
    /// Useful for ensuring compatibility with specific AI/ML frameworks.
    /// Only relevant for GPU endpoints.
    pub allowed_cuda_versions: Option<Vec<CudaVersion>>,

    /// Detailed template information (included when `include_template` is true).
    ///
    /// Contains the full template configuration including container image,
    /// environment setup, and resource requirements.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub template: Option<Template>,

    /// Current worker instances (included when `include_workers` is true).
    ///
    /// List of active worker pods with their current status, resource allocation,
    /// and performance metrics.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workers: Option<Vec<Pod>>,
}

/// List of serverless endpoints.
///
/// A collection type representing multiple endpoints, typically returned by
/// API operations that list the endpoints for an account.
pub type Endpoints = Vec<Endpoint>;

/// Input parameters for creating a new serverless endpoint.
///
/// This struct contains all the configuration options available when creating an endpoint,
/// including compute specifications, scaling policies, and deployment preferences.
/// Most fields are optional and will use RunPod defaults if not specified.
///
/// # Required Fields
///
/// Only `template_id` is required - all other configuration uses sensible defaults
/// that can be customized based on your specific workload requirements.
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::{EndpointCreateInput, ScalerType};
/// use runpod_sdk::model::{ComputeType, CudaVersion, GpuTypeId};
///
/// // High-performance GPU endpoint for real-time AI inference
/// let inference_endpoint = EndpointCreateInput {
///     template_id: "pytorch-inference-template".to_string(),
///     name: Some("ai-inference-prod".to_string()),
///     compute_type: Some(ComputeType::Gpu),
///     gpu_count: Some(1),
///     gpu_type_ids: Some(vec![GpuTypeId::NvidiaA100_80GbPcie]),
///     allowed_cuda_versions: Some(vec![CudaVersion::V12_1]),
///     scaler_type: Some(ScalerType::QueueDelay),
///     scaler_value: Some(3), // Scale if requests wait >3 seconds
///     workers_min: Some(1),  // Keep 1 worker always ready
///     workers_max: Some(5),  // Burst up to 5 workers
///     flashboot: Some(true), // Fast cold starts
///     idle_timeout: Some(30), // Scale down after 30s idle
///     execution_timeout_ms: Some(300000), // 5 minute timeout
///     ..Default::default()
/// };
///
/// // Cost-optimized CPU endpoint for batch processing
/// let batch_endpoint = EndpointCreateInput {
///     template_id: "batch-processor-template".to_string(),
///     name: Some("data-batch-processor".to_string()),
///     compute_type: Some(ComputeType::Cpu),
///     vcpu_count: Some(8),
///     scaler_type: Some(ScalerType::RequestCount),
///     scaler_value: Some(10), // 1 worker per 10 requests
///     workers_min: Some(0),   // No reserved capacity
///     workers_max: Some(20),  // Allow large bursts
///     flashboot: Some(false), // Standard startup (cheaper)
///     idle_timeout: Some(120), // Longer idle time for batches
///     execution_timeout_ms: Some(1800000), // 30 minute timeout
///     ..Default::default()
/// };
/// ```
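///
/// Every optional field below carries `skip_serializing_if = "Option::is_none"`,
/// so only the settings you actually set are serialized into the request body.
/// A minimal sketch of that behavior (assuming `serde_json` is available in
/// doctests; the template ID is a placeholder):
///
/// ```rust
/// use runpod_sdk::model::EndpointCreateInput;
///
/// let input = EndpointCreateInput {
///     template_id: "my-template-id".to_string(),
///     workers_max: Some(3),
///     ..Default::default()
/// };
///
/// let body = serde_json::to_value(&input).unwrap();
/// // Field names are camelCase on the wire; unset options are omitted entirely.
/// assert_eq!(body["templateId"], "my-template-id");
/// assert_eq!(body["workersMax"], 3);
/// assert!(body.get("gpuCount").is_none());
/// ```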
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct EndpointCreateInput {
    /// The unique string identifying the template used to create the endpoint.
    ///
    /// **Required field** - specifies the container image, environment, and
    /// resource configuration that will be deployed across all workers.
    ///
    /// Templates ensure consistent runtime environments and can be shared
    /// across multiple endpoints for standardized deployments.
    pub template_id: String,

    /// If the endpoint is a GPU endpoint, acceptable CUDA versions for workers.
    ///
    /// Constrains worker allocation to machines with compatible CUDA runtimes.
    /// Useful for ensuring compatibility with specific AI/ML framework versions
    /// that require particular CUDA versions.
    ///
    /// **Default**: Any CUDA version is acceptable
    /// **GPU endpoints only**: Ignored for CPU endpoints
    ///
    /// **Example**: `[CudaVersion::V12_1, CudaVersion::V11_8]`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub allowed_cuda_versions: Option<Vec<CudaVersion>>,

    /// Set to `GPU` for GPU-accelerated workers, `CPU` for CPU-only workers.
    ///
    /// Determines the type of compute resources allocated to workers:
    /// - `GPU`: Workers get GPU acceleration for AI/ML workloads
    /// - `CPU`: Workers get high-performance CPUs for general compute
    ///
    /// **Default**: `GPU`
    /// **Impact**: Affects available hardware types, pricing, and performance
    #[serde(skip_serializing_if = "Option::is_none")]
    pub compute_type: Option<ComputeType>,

    /// If creating a CPU endpoint, list of CPU flavors for workers.
    ///
    /// Specifies the CPU configurations that can be used for workers.
    /// The order determines rental priority - preferred flavors first.
    ///
    /// **CPU endpoints only**: Ignored for GPU endpoints
    /// **Default**: All available CPU flavors
    ///
    /// **Available flavors**: `cpu3c`, `cpu3g`, `cpu5c`, `cpu5g`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cpu_flavor_ids: Option<Vec<CpuFlavorId>>,

    /// List of data center IDs where workers can be located.
    ///
    /// Workers are distributed across these data centers for availability,
    /// performance, and proximity to users. The system automatically
    /// selects the best available data center for each worker.
    ///
    /// **Default**: All available data centers globally
    /// **Strategy**: Choose data centers close to your users and data sources
    ///
    /// **Common choices:**
    /// - Global: `["US-CA-1", "EU-RO-1", "AP-JP-1"]`
    /// - Regional: `["US-TX-1", "US-CA-2"]` for US-only
    /// - Single DC: `["EU-RO-1"]` for data residency requirements
    #[serde(skip_serializing_if = "Option::is_none")]
    pub data_center_ids: Option<Vec<DataCenterId>>,

    /// Maximum execution time in milliseconds for individual requests.
    ///
    /// Requests exceeding this timeout are terminated and marked as failed.
    /// Prevents runaway processes and ensures predictable resource usage.
    ///
    /// **Default**: 600,000ms (10 minutes)
    /// **Range**: 1,000ms to 3,600,000ms (1 second to 1 hour)
    ///
    /// **Guidelines:**
    /// - Web APIs: 30,000ms (30 seconds)
    /// - AI inference: 300,000ms (5 minutes)
    /// - Image processing: 600,000ms (10 minutes)
    /// - Batch jobs: 3,600,000ms (1 hour)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub execution_timeout_ms: Option<i32>,

    /// Whether to enable flash boot for faster worker startup.
    ///
    /// Flash boot dramatically reduces cold start time by using pre-warmed
    /// container images with cached dependencies and optimized initialization.
    ///
    /// **Default**: `false`
    /// **Trade-off**: Higher per-request cost for much faster startup
    /// **Best for**: Interactive applications, real-time inference, low-latency requirements
    /// **Startup time**: ~5-10 seconds with flash boot vs 30-60 seconds without
    #[serde(skip_serializing_if = "Option::is_none")]
    pub flashboot: Option<bool>,

    /// If creating a GPU endpoint, number of GPUs per worker.
    ///
    /// Determines GPU resources allocated to each worker for parallel processing.
    /// More GPUs enable larger models and higher throughput but increase costs.
    ///
    /// **Default**: 1
    /// **GPU endpoints only**: Ignored for CPU endpoints
    /// **Range**: 1-8 depending on GPU type availability
    ///
    /// **Use cases:**
    /// - Single GPU: Most inference workloads, small models
    /// - Multi-GPU: Large language models, distributed training, high-throughput inference
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_count: Option<i32>,

    /// If creating a GPU endpoint, list of GPU types for workers.
    ///
    /// Specifies GPU hardware that can be used for workers. The order
    /// determines rental priority - the system tries preferred types first.
    ///
    /// **GPU endpoints only**: Ignored for CPU endpoints
    /// **Default**: All available GPU types
    ///
    /// **Performance tiers:**
    /// - High-end: `"NVIDIA H100 80GB HBM3"`, `"NVIDIA A100 80GB PCIe"`
    /// - Mid-range: `"NVIDIA RTX A6000"`, `"NVIDIA A40"`
    /// - Budget: `"NVIDIA RTX 4090"`, `"NVIDIA RTX 3090"`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_type_ids: Option<Vec<GpuTypeId>>,

    /// Number of seconds workers can be idle before scaling down.
    ///
    /// Workers that haven't processed requests for this duration are
    /// automatically terminated to reduce costs. Balance between cost
    /// optimization and cold start latency.
    ///
    /// **Default**: 5 seconds
    /// **Range**: 1-3600 seconds (1 second to 1 hour)
    ///
    /// **Strategy:**
    /// - Aggressive (cost-focused): 1-5 seconds
    /// - Balanced: 5-15 seconds
    /// - Responsive (latency-focused): 30-60 seconds
    #[serde(skip_serializing_if = "Option::is_none")]
    pub idle_timeout: Option<i32>,

    /// A user-defined name for the endpoint.
    ///
    /// Used for organization and identification in dashboards, monitoring,
    /// and API responses. The name does not need to be unique across your account.
    ///
    /// **Default**: Auto-generated based on template name
    /// **Max length**: 191 characters
    /// **Best practices**: Use descriptive names like "prod-image-classifier" or "staging-api-v2"
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,

    /// The unique ID of a network volume to attach to workers.
    ///
    /// Network volumes provide persistent, shared storage across all workers,
    /// useful for model weights, datasets, cached data, and other shared assets.
    ///
    /// **Default**: No network volume attached
    /// **Requirements**: Volume must exist in same data centers as workers
    /// **Use cases**: Model storage, dataset access, shared caching, persistent logs
    #[serde(skip_serializing_if = "Option::is_none")]
    pub network_volume_id: Option<String>,

    /// The scaling strategy for managing worker count.
    ///
    /// Determines how the system automatically scales workers up/down based
    /// on request load and queue depth.
    ///
    /// **Default**: `QueueDelay`
    ///
    /// **Strategies:**
    /// - `QueueDelay`: Scale based on request wait time (latency-optimized)
    /// - `RequestCount`: Scale based on queue depth (throughput-optimized)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scaler_type: Option<ScalerType>,

    /// The scaling sensitivity parameter.
    ///
    /// Meaning depends on the `scaler_type`:
    ///
    /// **For `QueueDelay`**: Maximum seconds requests can wait before scaling up
    /// - Lower values = more responsive scaling, higher costs
    /// - Higher values = slower scaling, lower costs
    ///
    /// **For `RequestCount`**: Target requests per worker
    /// - `queue_size / scaler_value = target_worker_count`
    /// - Lower values = more workers, lower latency
    /// - Higher values = fewer workers, higher latency
    ///
    /// **Default**: 4
    /// **Range**: 1-3600
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scaler_value: Option<i32>,

    /// If creating a CPU endpoint, number of vCPUs per worker.
    ///
    /// Determines CPU resources allocated to each worker. More vCPUs enable
    /// higher parallelism and throughput for CPU-intensive workloads.
    ///
    /// **Default**: 2 vCPUs
    /// **CPU endpoints only**: Ignored for GPU endpoints
    /// **Range**: 1-32 vCPUs depending on CPU flavor
    ///
    /// **Guidelines:**
    /// - Light workloads: 1-2 vCPUs
    /// - Web APIs: 2-4 vCPUs
    /// - Data processing: 4-16 vCPUs
    /// - Heavy computation: 16+ vCPUs
    #[serde(skip_serializing_if = "Option::is_none")]
    pub vcpu_count: Option<i32>,

    /// Maximum number of workers that can run simultaneously.
    ///
    /// Hard limit preventing runaway scaling and controlling maximum costs.
    /// Set based on expected peak load, budget constraints, and infrastructure limits.
    ///
    /// **Default**: No limit (subject to account quotas)
    /// **Range**: 0-1000+ depending on account limits
    ///
    /// **Strategy**: Set 2-3x expected peak load for safety margin
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workers_max: Option<i32>,

    /// Minimum number of workers that always remain running.
    ///
    /// Reserved capacity providing immediate availability even during idle
    /// periods. These workers are billed at a reduced rate but ensure
    /// zero cold start latency for the first few requests.
    ///
    /// **Default**: 0 (no reserved capacity)
    /// **Range**: 0-100 depending on account limits
    ///
    /// **Trade-offs:**
    /// - 0: Maximum cost efficiency, but cold starts for first requests
    /// - 1+: Immediate availability, continuous billing for reserved workers
    ///
    /// **Strategy**: Set to 1 for production endpoints requiring <1s response time
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workers_min: Option<i32>,
}

/// Input parameters for updating an existing serverless endpoint.
///
/// This struct allows you to modify endpoint configuration and trigger a rolling
/// release that updates all workers with the new settings. All fields are optional,
/// allowing you to update only the properties you want to change.
///
/// # Rolling Release Process
///
/// When an endpoint is updated:
/// 1. **Validation**: New configuration is validated for compatibility
/// 2. **Version Increment**: Endpoint version number is incremented
/// 3. **Rolling Update**: Workers are gradually replaced with new configuration
/// 4. **Traffic Migration**: Requests are routed to updated workers as they become available
/// 5. **Cleanup**: Old workers are terminated once traffic migration is complete
///
/// # Important Notes
///
/// - **Zero Downtime**: Updates are performed without service interruption
/// - **Gradual Rollout**: Workers are updated in batches to maintain availability
/// - **Rollback**: Previous versions can be restored if issues are detected
/// - **Template Changes**: Updating `template_id` deploys new container images
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::{EndpointUpdateInput, ScalerType};
///
/// // Scale up for increased traffic
/// let scale_up = EndpointUpdateInput {
///     workers_max: Some(20),      // Double capacity
///     scaler_value: Some(2),      // More aggressive scaling
///     idle_timeout: Some(10),     // Keep workers longer
///     ..Default::default()
/// };
///
/// // Enable flash boot for better performance
/// let performance_upgrade = EndpointUpdateInput {
///     flashboot: Some(true),
///     execution_timeout_ms: Some(60000), // Reduce timeout
///     ..Default::default()
/// };
///
/// // Switch to cost-optimized scaling
/// let cost_optimization = EndpointUpdateInput {
///     scaler_type: Some(ScalerType::RequestCount),
///     scaler_value: Some(10),     // 1 worker per 10 requests
///     workers_min: Some(0),       // No reserved capacity
///     flashboot: Some(false),     // Standard startup
///     ..Default::default()
/// };
/// ```
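///
/// Because every field is an `Option` skipped when `None`, only the settings you
/// specify are serialized; everything else is left untouched on the endpoint.
/// A minimal sketch of that behavior (assuming `serde_json` is available in doctests):
///
/// ```rust
/// use runpod_sdk::model::EndpointUpdateInput;
///
/// let patch = EndpointUpdateInput {
///     workers_max: Some(20),
///     ..Default::default()
/// };
///
/// let body = serde_json::to_value(&patch).unwrap();
/// // Only the field that was set appears in the request body, in camelCase.
/// assert_eq!(body.as_object().unwrap().len(), 1);
/// assert_eq!(body["workersMax"], 20);
/// ```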
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct EndpointUpdateInput {
    /// If the endpoint is a GPU endpoint, acceptable CUDA versions for workers.
    ///
    /// Updates the CUDA version constraints for worker allocation.
    /// Triggers rolling release to ensure all workers use compatible CUDA versions.
    ///
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub allowed_cuda_versions: Option<Vec<CudaVersion>>,

    /// If the endpoint is a CPU endpoint, list of CPU flavors for workers.
    ///
    /// Updates the available CPU configurations for workers.
    /// The order determines rental priority for new workers.
    ///
    /// **CPU endpoints only**: Ignored for GPU endpoints
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cpu_flavor_ids: Option<Vec<CpuFlavorId>>,

    /// List of data center IDs where workers can be located.
    ///
    /// Updates the geographic distribution of workers.
    /// Existing workers in removed data centers will be gradually replaced.
    ///
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub data_center_ids: Option<Vec<DataCenterId>>,

    /// Maximum execution time in milliseconds for individual requests.
    ///
    /// Updates the timeout for request processing. Affects new requests
    /// immediately; existing requests continue with the previous timeout.
    ///
    /// **Range**: 1,000ms to 3,600,000ms (1 second to 1 hour)
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub execution_timeout_ms: Option<i32>,

    /// Whether to enable flash boot for faster worker startup.
    ///
    /// Updates the startup optimization for new workers.
    /// Affects cold start performance and per-request costs.
    ///
    /// **Trade-off**: Higher per-request cost for faster startup
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub flashboot: Option<bool>,

    /// If the endpoint is a GPU endpoint, number of GPUs per worker.
    ///
    /// Updates GPU allocation for new workers. Triggers rolling release
    /// to deploy workers with the new GPU configuration.
    ///
    /// **GPU endpoints only**: Ignored for CPU endpoints
    /// **Range**: 1-8 depending on GPU type availability
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_count: Option<i32>,

    /// If the endpoint is a GPU endpoint, list of GPU types for workers.
    ///
    /// Updates available GPU hardware types for workers.
    /// The order determines rental priority for new workers.
    ///
    /// **GPU endpoints only**: Ignored for CPU endpoints
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gpu_type_ids: Option<Vec<GpuTypeId>>,

    /// Number of seconds workers can be idle before scaling down.
    ///
    /// Updates the idle timeout for worker lifecycle management.
    /// Affects cost optimization and cold start frequency.
    ///
    /// **Range**: 1-3600 seconds (1 second to 1 hour)
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub idle_timeout: Option<i32>,

    /// A user-defined name for the endpoint.
    ///
    /// Updates the display name used in dashboards and API responses.
    /// This change is applied immediately without triggering a rolling release.
    ///
    /// **Max length**: 191 characters
    /// **Note**: Set to `None` to keep current name unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub name: Option<String>,

    /// The unique ID of a network volume to attach to workers.
    ///
    /// Updates the persistent storage attached to workers.
    /// Triggers rolling release to mount/unmount volumes on all workers.
    ///
    /// **Requirements**: Volume must exist in same data centers as workers
    /// **Note**: Set to `None` to keep current volume unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub network_volume_id: Option<String>,

    /// The scaling strategy for managing worker count.
    ///
    /// Updates the auto-scaling algorithm used for worker management.
    /// Change takes effect immediately for new scaling decisions.
    ///
    /// **Strategies:**
    /// - `QueueDelay`: Scale based on request wait time
    /// - `RequestCount`: Scale based on queue depth
    ///
    /// **Note**: Set to `None` to keep current strategy unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scaler_type: Option<ScalerType>,

    /// The scaling sensitivity parameter.
    ///
    /// Updates the scaling behavior sensitivity.
    /// Change takes effect immediately for new scaling decisions.
    ///
    /// **For QueueDelay**: Maximum seconds requests can wait
    /// **For RequestCount**: Target requests per worker
    /// **Range**: 1-3600
    /// **Note**: Set to `None` to keep current value unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scaler_value: Option<i32>,

    /// The unique ID of the template used to create the endpoint.
    ///
    /// Updates the container image and environment configuration.
    /// Triggers rolling release to deploy all workers with the new template.
    ///
    /// **Impact**: Changes container image, environment, resource allocation
    /// **Rolling Release**: All workers are gradually replaced
    /// **Note**: Set to `None` to keep current template unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub template_id: Option<String>,

    /// If the endpoint is a CPU endpoint, number of vCPUs per worker.
    ///
    /// Updates CPU allocation for new workers. Triggers rolling release
    /// to deploy workers with the new CPU configuration.
    ///
    /// **CPU endpoints only**: Ignored for GPU endpoints
    /// **Range**: 1-32 vCPUs depending on CPU flavor
    /// **Note**: Set to `None` to keep current setting unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub vcpu_count: Option<i32>,

    /// Maximum number of workers that can run simultaneously.
    ///
    /// Updates the scaling limit for worker count.
    /// Change takes effect immediately for new scaling decisions.
    ///
    /// **Range**: 0-1000+ depending on account limits
    /// **Note**: Set to `None` to keep current limit unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workers_max: Option<i32>,

    /// Minimum number of workers that always remain running.
    ///
    /// Updates the reserved capacity for immediate availability.
    /// Change triggers immediate scaling to meet the new minimum.
    ///
    /// **Range**: 0-100 depending on account limits
    /// **Billing**: Reserved workers are always charged (at reduced rate)
    /// **Note**: Set to `None` to keep current minimum unchanged.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workers_min: Option<i32>,
}

/// Query parameters for listing serverless endpoints.
///
/// Controls which additional data is included in the response when retrieving
/// multiple endpoints. Including additional data provides more detailed information
/// but increases response size and latency.
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::ListEndpointsQuery;
///
/// // Basic listing (endpoints only)
/// let basic_query = ListEndpointsQuery::default();
///
/// // Include template details for each endpoint
/// let with_templates = ListEndpointsQuery {
///     include_template: Some(true),
///     include_workers: Some(false),
/// };
///
/// // Include both template and worker information
/// let full_details = ListEndpointsQuery {
///     include_template: Some(true),
///     include_workers: Some(true),
/// };
/// ```
#[derive(Debug, Clone, Default, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct ListEndpointsQuery {
    /// Whether to include template information for each endpoint.
    ///
    /// When `true`, the response includes detailed template configuration
    /// including container image, environment variables, resource requirements,
    /// and deployment settings for each endpoint.
    ///
    /// **Default**: `false`
    /// **Impact**: Increases response size and latency
    /// **Useful for**: Deployment auditing, configuration comparison, debugging
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_template: Option<bool>,

    /// Whether to include current worker information for each endpoint.
    ///
    /// When `true`, the response includes detailed information about active
    /// workers including their status, resource allocation, performance metrics,
    /// and current workload for each endpoint.
    ///
    /// **Default**: `false`
    /// **Impact**: Significantly increases response size and latency
    /// **Useful for**: Capacity monitoring, performance analysis, troubleshooting
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_workers: Option<bool>,
}

/// Query parameters for retrieving a single serverless endpoint.
///
/// Controls which additional data is included in the response when retrieving
/// a specific endpoint. Including additional data provides more detailed information
/// but increases response size and latency.
///
/// # Examples
///
/// ```rust
/// use runpod_sdk::model::GetEndpointQuery;
///
/// // Basic endpoint information only
/// let basic_query = GetEndpointQuery::default();
///
/// // Include template configuration
/// let with_template = GetEndpointQuery {
///     include_template: Some(true),
///     include_workers: Some(false),
/// };
///
/// // Include complete details for monitoring
/// let monitoring_query = GetEndpointQuery {
///     include_template: Some(true),
///     include_workers: Some(true),
/// };
/// ```
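///
/// A query with worker details enabled can be passed directly to the
/// `get_endpoint` call on `EndpointsService`, as in the sketch below
/// (`no_run` because it needs a live API key):
///
/// ```no_run
/// # use runpod_sdk::{RunpodClient, Result};
/// # use runpod_sdk::service::EndpointsService;
/// # use runpod_sdk::model::GetEndpointQuery;
/// # async fn example() -> Result<()> {
/// let client = RunpodClient::from_env()?;
/// let query = GetEndpointQuery {
///     include_template: Some(true),
///     include_workers: Some(true),
/// };
/// let endpoint = client.get_endpoint("endpoint_id", query).await?;
/// if let Some(workers) = &endpoint.workers {
///     println!("{} active workers", workers.len());
/// }
/// # Ok(())
/// # }
/// ```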
#[derive(Debug, Clone, Default, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct GetEndpointQuery {
    /// Whether to include template information in the response.
    ///
    /// When `true`, the response includes detailed template configuration
    /// including container image, environment variables, resource requirements,
    /// and deployment settings.
    ///
    /// **Default**: `false`
    /// **Impact**: Increases response size and latency
    /// **Useful for**: Configuration review, deployment debugging, audit trails
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_template: Option<bool>,

    /// Whether to include current worker information in the response.
    ///
    /// When `true`, the response includes detailed information about active
    /// workers including their status, resource allocation, performance metrics,
    /// machine details, and current workload.
    ///
    /// **Default**: `false`
    /// **Impact**: Significantly increases response size and latency
    /// **Useful for**: Real-time monitoring, performance optimization, troubleshooting
    #[serde(skip_serializing_if = "Option::is_none")]
    pub include_workers: Option<bool>,
}

#[cfg(feature = "serverless")]
#[cfg_attr(docsrs, doc(cfg(feature = "serverless")))]
impl Endpoint {
    /// Creates an endpoint runner from this endpoint.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use runpod_sdk::{RunpodClient, Result};
    /// # use runpod_sdk::service::EndpointsService;
    /// # use runpod_sdk::model::GetEndpointQuery;
    /// # async fn example() -> Result<()> {
    /// let client = RunpodClient::from_env()?;
    /// let serverless_endpoint = client.get_endpoint("endpoint_id", GetEndpointQuery::default()).await?;
    ///
    /// let runner = serverless_endpoint.to_runner(client);
    /// # Ok(())
    /// # }
    /// ```
    pub fn to_runner(&self, client: RunpodClient) -> ServerlessEndpoint {
        ServerlessEndpoint::new(&self.id, client)
    }

    /// Runs a job on this endpoint.
    ///
    /// This is a convenience method that creates a runner and submits a job in one call.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use runpod_sdk::{RunpodClient, Result};
    /// # use runpod_sdk::service::EndpointsService;
    /// # use runpod_sdk::model::GetEndpointQuery;
    /// # use serde_json::json;
    /// # async fn example() -> Result<()> {
    /// let client = RunpodClient::from_env()?;
    /// let serverless_endpoint = client.get_endpoint("endpoint_id", GetEndpointQuery::default()).await?;
    ///
    /// let job = serverless_endpoint.run(client, &json!({"prompt": "Hello"}))?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn run<I>(&self, client: RunpodClient, input: &I) -> crate::Result<ServerlessJob>
    where
        I: Serialize,
    {
        let runner = self.to_runner(client);
        runner.run(input)
    }
}