commonware_deployer/ec2/
mod.rs

1//! AWS EC2 deployer
2//!
3//! Deploy a custom binary (and configuration) to any number of EC2 instances across multiple regions. View metrics and logs
4//! from all instances with Grafana.
5//!
6//! # Features
7//!
8//! * Automated creation, update, and destruction of EC2 instances across multiple regions
9//! * Provide a unique name, instance type, region, binary, and configuration for each deployed instance
10//! * Collect metrics, profiles (when enabled), and logs from all deployed instances on a long-lived monitoring instance
11//!   (accessible only to the deployer's IP)
12//!
13//! # Architecture
14//!
15//! ```txt
16//!                    Deployer's Machine (Public IP)
17//!                                  |
18//!                                  |
19//!                                  v
20//!               +-----------------------------------+
21//!               | Monitoring VPC (us-east-1)        |
22//!               |  - Monitoring Instance            |
23//!               |    - Prometheus                   |
24//!               |    - Loki                         |
25//!               |    - Pyroscope                    |
26//!               |    - Tempo                        |
27//!               |    - Grafana                      |
28//!               |  - Security Group                 |
29//!               |    - All: Deployer IP             |
30//!               |    - 3100: Binary VPCs            |
31//!               |    - 4040: Binary VPCs            |
32//!               |    - 4318: Binary VPCs            |
33//!               +-----------------------------------+
34//!                     ^                       ^
35//!                (Telemetry)             (Telemetry)
36//!                     |                       |
37//!                     |                       |
38//! +------------------------------+  +------------------------------+
39//! | Binary VPC 1                 |  | Binary VPC 2                 |
40//! |  - Binary Instance           |  |  - Binary Instance           |
41//! |    - Binary A                |  |    - Binary B                |
42//! |    - Promtail                |  |    - Promtail                |
43//! |    - Node Exporter           |  |    - Node Exporter           |
44//! |    - Pyroscope Agent         |  |    - Pyroscope Agent         |
45//! |  - Security Group            |  |  - Security Group            |
46//! |    - All: Deployer IP        |  |    - All: Deployer IP        |
47//! |    - 9090: Monitoring IP     |  |    - 9090: Monitoring IP     |
48//! |    - 9100: Monitoring IP     |  |    - 9100: Monitoring IP     |
49//! |    - 8012: 0.0.0.0/0         |  |    - 8765: 12.3.7.9/32       |
50//! +------------------------------+  +------------------------------+
51//! ```
52//!
53//! ## Instances
54//!
55//! ### Monitoring
56//!
57//! * Deployed in `us-east-1` with a configurable ARM64 instance type (e.g., `t4g.small`) and storage (e.g., 10GB gp2).
58//! * Runs:
59//!     * **Prometheus**: Scrapes binary metrics from all instances at `:9090` and system metrics from all instances at `:9100`.
60//!     * **Loki**: Listens at `:3100`, storing logs in `/loki/chunks` with a TSDB index at `/loki/index`.
61//!     * **Pyroscope**: Listens at `:4040`, storing profiles in `/var/lib/pyroscope`.
62//!     * **Tempo**: Listens at `:4318`, storing traces in `/var/lib/tempo`.
63//!     * **Grafana**: Hosted at `:3000`, provisioned with Prometheus, Loki, and Tempo datasources and a custom dashboard.
64//! * Ingress:
65//!     * Allows deployer IP access (TCP 0-65535).
//!     * Binary instance traffic to Loki (TCP 3100), Pyroscope (TCP 4040), and Tempo (TCP 4318).
67//!
68//! ### Binary
69//!
70//! * Deployed in user-specified regions with configurable ARM64 instance types and storage.
71//! * Run:
72//!     * **Custom Binary**: Executes with `--hosts=/home/ubuntu/hosts.yaml --config=/home/ubuntu/config.conf`, exposing metrics at `:9090`.
73//!     * **Promtail**: Forwards `/var/log/binary.log` to Loki on the monitoring instance.
74//!     * **Node Exporter**: Exposes system metrics at `:9100`.
75//!     * **Pyroscope Agent**: Forwards `perf` profiles to Pyroscope on the monitoring instance.
76//! * Ingress:
77//!     * Deployer IP access (TCP 0-65535).
78//!     * Monitoring IP access to `:9090` and `:9100` for Prometheus.
79//!     * User-defined ports from the configuration.
80//!
81//! ## Networking
82//!
83//! ### VPCs
84//!
85//! One per region with CIDR `10.<region-index>.0.0/16` (e.g., `10.0.0.0/16` for `us-east-1`).
86//!
87//! ### Subnets
88//!
89//! Single subnet per VPC (e.g., `10.<region-index>.1.0/24`), linked to a route table with an internet gateway.
90//!
91//! ### VPC Peering
92//!
93//! Connects the monitoring VPC to each binary VPC, with routes added to route tables for private communication.
94//!
95//! ### Security Groups
96//!
//! Separate for monitoring (`{tag}`) and binary instances (`{tag}-binary`), dynamically configured for deployer and inter-instance traffic.
98//!
99//! # Workflow
100//!
101//! ## `ec2 create`
102//!
103//! 1. Validates configuration and generates an SSH key pair, stored in `$HOME/.commonware_deployer/{tag}/id_rsa_{tag}`.
104//! 2. Creates VPCs, subnets, internet gateways, route tables, and security groups per region.
105//! 3. Establishes VPC peering between the monitoring region and binary regions.
106//! 4. Launches the monitoring instance, uploads service files, and installs Prometheus, Grafana, Loki, Pyroscope, and Tempo.
107//! 5. Launches binary instances, uploads binaries, configurations, and hosts.yaml, and installs Promtail and the binary.
108//! 6. Configures BBR on all instances and updates the monitoring security group for Loki traffic.
109//! 7. Marks completion with `$HOME/.commonware_deployer/{tag}/created`.
110//!
111//! ## `ec2 update`
112//!
113//! 1. Stops the `binary` service on each binary instance.
114//! 2. Uploads the latest binary and configuration from the YAML config.
115//! 3. Restarts the `binary` service, ensuring minimal downtime.
116//!
117//! ## `ec2 authorize`
118//!
119//! 1. Obtains the deployer's current public IP address (or parses the one provided).
120//! 2. For each security group in the deployment, adds an ingress rule for the IP (if it doesn't already exist).
121//!
122//! ## `ec2 destroy`
123//!
124//! 1. Terminates all instances across regions.
125//! 2. Deletes security groups, subnets, route tables, VPC peering connections, internet gateways, key pairs, and VPCs in dependency order.
126//! 3. Marks destruction with `$HOME/.commonware_deployer/{tag}/destroyed`, retaining the directory to prevent tag reuse.
127//!
128//! # Persistence
129//!
130//! * A directory `$HOME/.commonware_deployer/{tag}` stores the SSH private key, service files, and status files (`created`, `destroyed`).
131//! * The deployment state is tracked via these files, ensuring operations respect prior create/destroy actions.
132//!
133//! # Example Configuration
134//!
135//! ```yaml
136//! tag: ffa638a0-991c-442c-8ec4-aa4e418213a5
137//! monitoring:
138//!   instance_type: t4g.small
139//!   storage_size: 10
140//!   storage_class: gp2
141//!   dashboard: /path/to/dashboard.json
142//! instances:
143//!   - name: node1
144//!     region: us-east-1
145//!     instance_type: t4g.small
146//!     storage_size: 10
147//!     storage_class: gp2
148//!     binary: /path/to/binary
149//!     config: /path/to/config.conf
150//!     profiling: true
151//!   - name: node2
152//!     region: us-west-2
153//!     instance_type: t4g.small
154//!     storage_size: 10
155//!     storage_class: gp2
156//!     binary: /path/to/binary2
157//!     config: /path/to/config2.conf
158//!     profiling: false
159//! ports:
160//!   - protocol: tcp
161//!     port: 4545
162//!     cidr: 0.0.0.0/0
163//! ```
164
165use serde::{Deserialize, Serialize};
166use std::net::IpAddr;
167
168cfg_if::cfg_if! {
169    if #[cfg(feature="aws")] {
170        use thiserror::Error;
171        use std::path::PathBuf;
172
173        pub mod aws;
174        mod create;
175        pub mod services;
176        pub use create::create;
177        mod update;
178        pub use update::update;
179        mod authorize;
180        pub use authorize::authorize;
181        mod destroy;
182        pub use destroy::destroy;
183        pub mod utils;
184
/// Name given to the monitoring instance.
const MONITORING_NAME: &str = "monitoring";

/// AWS region that hosts the monitoring instance.
const MONITORING_REGION: &str = "us-east-1";

/// Name of the marker file recording that the deployment completed.
const CREATED_FILE_NAME: &str = "created";

/// Name of the marker file recording that the deployment was destroyed.
const DESTROYED_FILE_NAME: &str = "destroyed";

/// Port on each instance where system metrics are exposed.
const SYSTEM_PORT: u16 = 9100;

/// Port on the monitoring instance to which logs are pushed.
const LOGS_PORT: u16 = 3100;

/// Port on the monitoring instance to which profiles are pushed.
const PROFILES_PORT: u16 = 4040;

/// Port on the monitoring instance to which traces are pushed.
const TRACES_PORT: u16 = 4318;

/// Name of the top-level subcommand.
pub const CMD: &str = "ec2";

/// Name of the `create` subcommand.
pub const CREATE_CMD: &str = "create";

/// Name of the `update` subcommand.
pub const UPDATE_CMD: &str = "update";

/// Name of the `authorize` subcommand.
pub const AUTHORIZE_CMD: &str = "authorize";

/// Name of the `destroy` subcommand.
pub const DESTROY_CMD: &str = "destroy";
223
/// Returns the directory where deployer files for `tag` are stored:
/// `$HOME/.commonware_deployer/{tag}`.
///
/// # Panics
///
/// Panics if the `HOME` environment variable is not set.
fn deployer_directory(tag: &str) -> PathBuf {
    // `var_os` instead of `var`: a `$HOME` that is set but not valid Unicode is
    // used as-is rather than being reported as "not configured".
    let home = std::env::var_os("HOME").expect("$HOME is not configured");

    // Join a relative tail onto the home directory instead of formatting the
    // whole path as a string. The tail always starts with the fixed directory
    // name, so `join` can never discard `home`, whatever `tag` contains.
    PathBuf::from(home).join(format!(".commonware_deployer/{tag}"))
}
229
230        /// Errors that can occur when deploying infrastructure on AWS
231        #[derive(Error, Debug)]
232        pub enum Error {
233            #[error("AWS EC2 error: {0}")]
234            AwsEc2(#[from] aws_sdk_ec2::Error),
235            #[error("AWS security group ingress error: {0}")]
236            AwsSecurityGroupIngress(#[from] aws_sdk_ec2::operation::authorize_security_group_ingress::AuthorizeSecurityGroupIngressError),
237            #[error("AWS describe instances error: {0}")]
238            AwsDescribeInstances(#[from] aws_sdk_ec2::operation::describe_instances::DescribeInstancesError),
239            #[error("IO error: {0}")]
240            Io(#[from] std::io::Error),
241            #[error("YAML error: {0}")]
242            Yaml(#[from] serde_yaml::Error),
243            #[error("creation already attempted")]
244            CreationAttempted,
245            #[error("invalid instance name: {0}")]
246            InvalidInstanceName(String),
247            #[error("reqwest error: {0}")]
248            Reqwest(#[from] reqwest::Error),
249            #[error("SCP failed")]
250            ScpFailed,
251            #[error("SSH failed")]
252            SshFailed,
253            #[error("keygen failed")]
254            KeygenFailed,
255            #[error("service timeout({0}): {1}")]
256            ServiceTimeout(String, String),
257            #[error("deployment does not exist: {0}")]
258            DeploymentDoesNotExist(String),
259            #[error("deployment is not complete: {0}")]
260            DeploymentNotComplete(String),
261            #[error("deployment already destroyed: {0}")]
262            DeploymentAlreadyDestroyed(String),
263            #[error("private key not found")]
264            PrivateKeyNotFound,
265            #[error("invalid IP address: {0}")]
266            InvalidIpAddress(String),
267        }
268    }
269}
270
/// Port on which each deployed binary exposes its metrics.
pub const METRICS_PORT: u16 = 9090;
273
274/// Host deployment information
275#[derive(Serialize, Deserialize, Clone)]
276pub struct Host {
277    /// Name of the host
278    pub name: String,
279
280    /// Region where the host is deployed
281    pub region: String,
282
283    /// Public IP address of the host
284    pub ip: IpAddr,
285}
286
287/// List of hosts
288#[derive(Serialize, Deserialize, Clone)]
289pub struct Hosts {
290    /// Private IP address of the monitoring instance
291    pub monitoring: IpAddr,
292
293    /// Hosts deployed across all regions
294    pub hosts: Vec<Host>,
295}
296
297/// Port configuration
298#[derive(Serialize, Deserialize, Clone)]
299pub struct PortConfig {
300    /// Protocol (e.g., "tcp")
301    pub protocol: String,
302
303    /// Port number
304    pub port: u16,
305
306    /// CIDR block
307    pub cidr: String,
308}
309
310/// Instance configuration
311#[derive(Serialize, Deserialize, Clone)]
312pub struct InstanceConfig {
313    /// Name of the instance
314    pub name: String,
315
316    /// AWS region where the instance is deployed
317    pub region: String,
318
319    /// Instance type (only ARM-based instances are supported)
320    pub instance_type: String,
321
322    /// Storage size in GB
323    pub storage_size: i32,
324
325    /// Storage class (e.g., "gp2")
326    pub storage_class: String,
327
328    /// Path to the binary to deploy
329    pub binary: String,
330
331    /// Path to the binary configuration file
332    pub config: String,
333
334    /// Whether to enable profiling
335    pub profiling: bool,
336}
337
338/// Monitoring configuration
339#[derive(Serialize, Deserialize, Clone)]
340pub struct MonitoringConfig {
341    /// Instance type (only ARM-based instances are supported)
342    pub instance_type: String,
343
344    /// Storage size in GB
345    pub storage_size: i32,
346
347    /// Storage class (e.g., "gp2")
348    pub storage_class: String,
349
350    /// Path to a custom dashboard file that is automatically
351    /// uploaded to grafana
352    pub dashboard: String,
353}
354
355/// Deployer configuration
356#[derive(Serialize, Deserialize, Clone)]
357pub struct Config {
358    /// Unique tag for the deployment
359    pub tag: String,
360
361    /// Monitoring instance configuration
362    pub monitoring: MonitoringConfig,
363
364    /// Instance configurations
365    pub instances: Vec<InstanceConfig>,
366
367    /// Ports open on all instances
368    pub ports: Vec<PortConfig>,
369}