dynamo_runtime/
lib.rs

1// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0
3
4//! Dynamo
5
6#![allow(dead_code)]
7#![allow(unused_imports)]
8
9use std::{
10    collections::HashMap,
11    sync::{Arc, OnceLock, Weak},
12};
13
14pub use anyhow::{
15    Context as ErrorContext, Error, Ok as OK, Result, anyhow as error, bail as raise,
16};
17
18use async_once_cell::OnceCell;
19
20mod config;
21pub use config::RuntimeConfig;
22
23pub mod component;
24pub mod compute;
25pub mod discovery;
26pub mod engine;
27pub mod health_check;
28pub mod system_status_server;
29pub use system_status_server::SystemStatusServerInfo;
30pub mod instances;
31pub mod logging;
32pub mod metrics;
33pub mod pipeline;
34pub mod prelude;
35pub mod protocols;
36pub mod runnable;
37pub mod runtime;
38pub mod service;
39pub mod slug;
40pub mod storage;
41pub mod system_health;
42pub mod traits;
43pub mod transports;
44pub mod utils;
45pub mod worker;
46
47pub mod distributed;
48pub use distributed::distributed_test_utils;
49pub use futures::stream;
50pub use metrics::MetricsRegistry;
51pub use system_health::{HealthCheckTarget, SystemHealth};
52pub use tokio_util::sync::CancellationToken;
53pub use worker::Worker;
54
55use crate::{
56    metrics::prometheus_names::distributed_runtime,
57    storage::key_value_store::{KeyValueStore, KeyValueStoreManager},
58};
59
60use component::{Endpoint, InstanceSource};
61use utils::GracefulShutdownTracker;
62
63use config::HealthStatus;
64
65/// Types of Tokio runtimes that can be used to construct a Dynamo [Runtime].
///
/// The enum derives `Clone`; cloning the `Shared` variant clones the [`Arc`], so all clones refer
/// to the same underlying `tokio::runtime::Runtime` rather than creating a new one.
66#[derive(Clone)]
67enum RuntimeType {
    // A full Tokio runtime held behind an `Arc`.
68    Shared(Arc<tokio::runtime::Runtime>),
    // Only a `tokio::runtime::Handle` is stored here, not the runtime itself.
    // NOTE(review): presumably the runtime was created and is owned outside of Dynamo —
    // confirm against the constructors that build this variant.
69    External(tokio::runtime::Handle),
70}
71
72/// Local [Runtime] which provides access to shared resources local to the physical node/machine.
///
/// The struct derives `Clone`. The `Arc`-wrapped fields (`id`, `graceful_shutdown_tracker`, and the
/// optional `compute_pool` / `block_in_place_permits`) are shared between clones, not duplicated.
73#[derive(Debug, Clone)]
74pub struct Runtime {
    // Identifier of this runtime, shared between clones via `Arc`.
75    id: Arc<String>,
    // Two Tokio runtime slots (see `RuntimeType`).
    // NOTE(review): how work is split between `primary` and `secondary` is not visible in this
    // declaration — confirm against the constructors and spawn sites.
76    primary: RuntimeType,
77    secondary: RuntimeType,
    // Two distinct cancellation tokens.
    // NOTE(review): by name, `endpoint_shutdown_token` looks like it lets endpoints be stopped
    // separately from the runtime-wide `cancellation_token` — verify against the shutdown path.
78    cancellation_token: CancellationToken,
79    endpoint_shutdown_token: CancellationToken,
    // Shared tracker from `utils`, used for graceful shutdown (semantics defined in that module).
80    graceful_shutdown_tracker: Arc<GracefulShutdownTracker>,
    // Optional shared compute pool (see the `compute` module); absent when `None`.
81    compute_pool: Option<Arc<compute::ComputePool>>,
    // Optional shared Tokio semaphore.
    // NOTE(review): by name, presumably bounds concurrent `block_in_place` usage — confirm where
    // permits are acquired.
82    block_in_place_permits: Option<Arc<tokio::sync::Semaphore>>,
83}
84
85/// Distributed [Runtime] which provides access to shared resources across the cluster. This includes
86/// communication protocols and transports.
87#[derive(Clone)]
88pub struct DistributedRuntime {
89    // local runtime
90    runtime: Runtime,
91
92    // we might consider a unified transport manager here
    // Both transport clients are optional; either may be `None`.
93    etcd_client: Option<transports::etcd::Client>,
94    nats_client: Option<transports::nats::Client>,
    // Manager for the key-value store backend (see `storage::key_value_store`).
95    store: KeyValueStoreManager,
    // Set-once cells shared between clones via `Arc`: `tcp_server` uses the async
    // `async_once_cell::OnceCell`, while `system_status_server` uses `std::sync::OnceLock`.
96    tcp_server: Arc<OnceCell<Arc<transports::tcp::server::TcpStreamServer>>>,
97    system_status_server: Arc<OnceLock<Arc<system_status_server::SystemStatusServerInfo>>>,
98
99    // local registry for components
100    // the registry allows us to share runtime resources across instances of the same component object.
101    // take for example two instances of a client to the same remote component. The registry allows us to use
102    // a single endpoint watcher for both clients; this keeps the number of background tasks watching specific
103    // paths in etcd to a minimum.
104    component_registry: component::Registry,
105
106    // Will only have static components that are not discoverable via etcd; they must be known at
107    // startup. Will not start etcd.
108    is_static: bool,
109
    // Map from `Endpoint` to its `InstanceSource`, guarded by a Tokio async mutex. Entries are
    // `Weak` references, so the map itself does not keep an `InstanceSource` alive.
110    instance_sources: Arc<tokio::sync::Mutex<HashMap<Endpoint, Weak<InstanceSource>>>>,
111
112    // Health status, shared between clones and guarded by a `parking_lot` mutex
113    system_health: Arc<parking_lot::Mutex<SystemHealth>>,
114
115    // This hierarchy's own metrics registry
116    metrics_registry: MetricsRegistry,
117}