runctl/
lib.rs

1//! runctl library
2//!
3//! This library provides the core functionality for the runctl CLI, a unified tool for
4//! ML training orchestration across multiple cloud providers (AWS, RunPod, Lyceum AI).
5//!
6//! ## Architecture
7//!
8//! The library follows industry patterns from Terraform (plugin registry), Pulumi (component model),
9//! and Kubernetes (CRD extensibility). See `docs/ARCHITECTURE.md` for details.
10//!
11//! ## Key Modules
12//!
13//! - **Provider System**: `provider` and `providers` modules for multi-cloud abstraction
14//! - **Error Handling**: `error` module with structured error types and retry awareness
15//! - **Resource Tracking**: `resource_tracking` for cost awareness and lifecycle management
16//! - **Retry Logic**: `retry` module with exponential backoff for cloud API calls
17//!
18//! ## Usage
19//!
20//! ### Basic Example
21//!
22//! ```rust,no_run
23//! use runctl::{Config, ResourceTracker};
24//!
25//! # async fn example() -> runctl::error::Result<()> {
26//! // Load configuration
27//! let config = Config::load(None)?;
28//!
29//! // Track resources
30//! let tracker = ResourceTracker::new();
31//! let running = tracker.get_running().await;
32//! # Ok(())
33//! # }
34//! ```
35//!
36//! ### Using Convenience Re-exports
37//!
38//! Common types are re-exported at the crate root for convenience:
39//!
40//! ```rust,no_run
41//! use runctl::{Config, Result, TrainctlError};
42//! use runctl::{CreateInstanceOptions, TrainInstanceOptions};
43//!
44//! # async fn example() -> runctl::Result<()> {
45//! let config = Config::load(None)?;
46//! // Use re-exported types directly
47//! # Ok(())
48//! # }
49//! ```
50//!
51//! ### Provider Trait (Future)
52//!
53//! The provider trait system is defined but not yet used by the CLI. Once
54//! multi-cloud support is enabled, usage is expected to look like this:
55//!
56//! ```rust,no_run
57//! use runctl::{Config, TrainingProvider};
58//!
59//! # async fn example() -> runctl::error::Result<()> {
60//! let config = Config::load(None)?;
61//! // let provider = config.get_provider("aws")?;
62//! // let resource_id = provider.create_resource("g4dn.xlarge", options).await?;
63//! # Ok(())
64//! # }
65//! ```
66
67pub mod aws;
68pub mod aws_utils;
69pub mod checkpoint;
70pub mod config;
71pub mod dashboard;
72pub mod data_transfer;
73pub mod diagnostics;
74pub mod docker;
75pub mod ebs;
76pub mod ebs_optimization;
77pub mod error;
78pub mod error_helpers;
79pub mod fast_data_loading;
80pub mod local;
81pub mod monitor;
82pub mod provider;
83pub mod providers;
84pub mod resource_tracking;
85pub mod resources;
86pub mod retry;
87pub mod runpod;
88pub mod s3;
89pub mod safe_cleanup;
90pub mod ssh_sync;
91pub mod training;
92pub mod utils;
93pub mod validation;
94pub mod workflow;
95
96// Re-export commonly used items from the core modules at the crate root
97pub use error::{ConfigError, IsRetryable, Result, TrainctlError};
98pub use provider::{
99    CreateResourceOptions, ResourceState, ResourceStatus, TrainingJob, TrainingProvider,
100};
101pub use providers::ProviderRegistry;
102pub use resource_tracking::{ResourceTracker, ResourceUsage, TrackedResource};
103pub use retry::{ExponentialBackoffPolicy, RetryPolicy};
104pub use safe_cleanup::{safe_cleanup, CleanupResult, CleanupSafety};
105pub use training::{TrainingSession, TrainingStatus};
106pub use validation::{validate_path, validate_path_path};
107
108// Convenience re-exports: AWS instance options, `Config`, and `estimate_instance_cost`
109pub use aws::{CreateInstanceOptions, TrainInstanceOptions};
110pub use config::Config;
111pub use resources::estimate_instance_cost;