Skip to main content

paygress/
docker.rs

1// Docker compute backend.
2//
3// Implements `ComputeBackend` by shelling out to the `docker` CLI.
4// Used for the killer-templates path (#31) where workloads are real
5// public Docker images (strfry, ollama, browserless, bitcoind) and
6// LXD's image surface doesn't fit.
7//
8// Container naming is `paygress-<vmid>`. We persist no state — the
9// container's existence on the host IS the state; `find_available_id`
10// scans for `paygress-<n>` containers.
11//
12// Caveats
13// - Subprocess-shelling is intentionally simple; a long-running
14//   provider would benefit from talking to the docker daemon socket
15//   directly via the `bollard` crate. Keeping it CLI-shelled today
16//   means zero runtime dependencies and easy debuggability via
17//   `docker ps`.
18// - `cpu_cores` and `memory_mb` are passed via `--cpus` and
19//   `--memory`; Docker enforces them via cgroups.
20// - `host_port` is the SSH forwarding port the spawn handler
21//   already calculates; for templates it's irrelevant (no SSH into
22//   a template container) but kept for symmetry with other
23//   backends.
24
25use std::process::Stdio;
26
27use anyhow::{Context, Result};
28use async_trait::async_trait;
29use tokio::process::Command;
30
31use crate::compute::{ComputeBackend, ContainerConfig, NodeStatus};
32use crate::luks::{create_encrypted_volume, destroy_encrypted_volume};
33
/// Docker backend. Stateless wrapper around the `docker` CLI.
///
/// The only configuration it carries is the optional network name, so
/// it is cheap to clone and safe to print in logs.
#[derive(Debug, Clone)]
pub struct DockerBackend {
    /// Optional Docker network to attach containers to. Defaults to
    /// the host's default `bridge` network when None.
    network: Option<String>,
}
40
41impl DockerBackend {
42    pub fn new() -> Self {
43        Self { network: None }
44    }
45
46    pub fn with_network(network: impl Into<String>) -> Self {
47        Self {
48            network: Some(network.into()),
49        }
50    }
51
52    fn name_for(id: u32) -> String {
53        format!("paygress-{}", id)
54    }
55
56    async fn docker(&self, args: &[&str]) -> Result<std::process::Output> {
57        let output = Command::new("docker")
58            .args(args)
59            .stdout(Stdio::piped())
60            .stderr(Stdio::piped())
61            .output()
62            .await
63            .context("invoke docker CLI")?;
64        Ok(output)
65    }
66}
67
68impl Default for DockerBackend {
69    fn default() -> Self {
70        Self::new()
71    }
72}
73
74#[async_trait]
75impl ComputeBackend for DockerBackend {
76    async fn find_available_id(&self, range_start: u32, range_end: u32) -> Result<u32> {
77        let output = self
78            .docker(&[
79                "ps",
80                "-a",
81                "--format",
82                "{{.Names}}",
83                "--filter",
84                "name=paygress-",
85            ])
86            .await?;
87        let names = String::from_utf8_lossy(&output.stdout);
88        let used: std::collections::HashSet<u32> = names
89            .lines()
90            .filter_map(|n| n.strip_prefix("paygress-")?.parse().ok())
91            .collect();
92        for id in range_start..=range_end {
93            if !used.contains(&id) {
94                return Ok(id);
95            }
96        }
97        anyhow::bail!(
98            "no available container id in range {}..={}",
99            range_start,
100            range_end
101        );
102    }
103
104    async fn create_container(&self, config: &ContainerConfig) -> Result<String> {
105        let name = Self::name_for(config.id);
106
107        let mut args: Vec<String> = vec![
108            "run".into(),
109            "-d".into(),
110            "--name".into(),
111            name.clone(),
112            "--restart".into(),
113            "unless-stopped".into(),
114            "--cpus".into(),
115            config.cpu_cores.to_string(),
116            "--memory".into(),
117            format!("{}m", config.memory_mb),
118        ];
119
120        if let Some(net) = &self.network {
121            args.push("--network".into());
122            args.push(net.clone());
123        }
124
125        for port in &config.template_ports {
126            args.push("-p".into());
127            args.push(format!(
128                "{}:{}/{}",
129                port.host_port, port.container_port, port.protocol
130            ));
131        }
132
133        for (k, v) in &config.template_env {
134            args.push("-e".into());
135            args.push(format!("{}={}", k, v));
136        }
137
138        // Per-template extra flags (ulimits, sysctls, caps).
139        for arg in &config.extra_runtime_args {
140            args.push(arg.clone());
141        }
142
143        // Persistent state volume (vmid-scoped so two instances of
144        // the same template don't share state). Two paths:
145        //   - encrypted: provision a LUKS-on-loop file, mount its
146        //     ext4 on the host, bind-mount the mountpoint into the
147        //     container at `data_path`. Host operator's
148        //     post-eviction `tar` reveals only ciphertext.
149        //   - plain: use a Docker named volume (the historical
150        //     default; host operator can `tar` /var/lib/docker/...).
151        if let Some(path) = &config.data_path {
152            match config.volume_encryption_key.as_ref() {
153                Some(key) => {
154                    let vol = create_encrypted_volume(config.id, config.storage_gb, key)
155                        .await
156                        .with_context(|| {
157                            format!("create LUKS-encrypted volume for id={}", config.id)
158                        })?;
159                    args.push("-v".into());
160                    args.push(format!("{}:{}", vol.mount_path.display(), path));
161                }
162                None => {
163                    args.push("-v".into());
164                    args.push(format!("paygress-{}-data:{}", config.id, path));
165                }
166            }
167        }
168
169        // Image must be the last positional arg so docker treats
170        // anything after it as the container's CMD (which we don't
171        // override — image's default CMD runs).
172        args.push(config.image.clone());
173
174        let arg_refs: Vec<&str> = args.iter().map(|s| s.as_str()).collect();
175        let output = self.docker(&arg_refs).await?;
176        if !output.status.success() {
177            anyhow::bail!(
178                "docker run failed: {}",
179                String::from_utf8_lossy(&output.stderr)
180            );
181        }
182        let container_id = String::from_utf8_lossy(&output.stdout).trim().to_string();
183        Ok(container_id)
184    }
185
186    async fn start_container(&self, id: u32) -> Result<()> {
187        let name = Self::name_for(id);
188        let output = self.docker(&["start", &name]).await?;
189        if !output.status.success() {
190            anyhow::bail!(
191                "docker start {} failed: {}",
192                name,
193                String::from_utf8_lossy(&output.stderr)
194            );
195        }
196        Ok(())
197    }
198
199    async fn stop_container(&self, id: u32) -> Result<()> {
200        let name = Self::name_for(id);
201        // Best-effort — if the container is already gone, don't
202        // bubble up an error to cleanup_loop.
203        let _ = self.docker(&["stop", &name]).await;
204        Ok(())
205    }
206
207    async fn delete_container(&self, id: u32) -> Result<()> {
208        let name = Self::name_for(id);
209        let _ = self.docker(&["rm", "-f", &name]).await;
210        // Best-effort: also remove BOTH possible volume backings so a
211        // re-spawn doesn't inherit stale state.
212        //   1. The Docker named volume (used when volume_encryption_key
213        //      is None — historical default path).
214        //   2. The LUKS-on-loop volume (used when the consumer set
215        //      --encrypt-volume). Idempotent: the destroy helper
216        //      treats "no LUKS file at the expected id" as a no-op.
217        //      luksErase inside destroy is the load-bearing step —
218        //      it overwrites the LUKS header so the keyslots are
219        //      unrecoverable even if the operator extracted the
220        //      file before this ran.
221        let volume = format!("paygress-{}-data", id);
222        let _ = self.docker(&["volume", "rm", "-f", &volume]).await;
223        if let Err(e) = destroy_encrypted_volume(id).await {
224            // Never propagate: cleanup_loop relies on this being
225            // best-effort. A leaked mapper entry (e.g. provider
226            // process killed mid-cleanup) gets re-cleaned on the
227            // next delete attempt at the same id, since the helper
228            // is idempotent.
229            tracing::warn!("destroy_encrypted_volume(id={}) non-fatal: {}", id, e);
230        }
231        Ok(())
232    }
233
234    async fn get_node_status(&self) -> Result<NodeStatus> {
235        // `docker info` could give us system stats; for now report
236        // zeros so the heartbeat publishes a valid payload. The
237        // host's true CPU/memory could come from a sysinfo crate
238        // in a follow-up.
239        Ok(NodeStatus {
240            cpu_usage: 0.0,
241            memory_used: 0,
242            memory_total: 0,
243            disk_used: 0,
244            disk_total: 0,
245        })
246    }
247
248    async fn get_container_ip(&self, id: u32) -> Result<Option<String>> {
249        let name = Self::name_for(id);
250        let output = self
251            .docker(&["inspect", "-f", "{{.NetworkSettings.IPAddress}}", &name])
252            .await?;
253        let ip = String::from_utf8_lossy(&output.stdout).trim().to_string();
254        if ip.is_empty() {
255            Ok(None)
256        } else {
257            Ok(Some(ip))
258        }
259    }
260}
261
#[cfg(test)]
mod tests {
    use super::*;

    /// Container names are derived purely from the numeric id.
    #[test]
    fn name_for_is_deterministic() {
        assert_eq!(DockerBackend::name_for(1234), "paygress-1234");
    }

    /// Rebuilds the `docker run` argument list the way
    /// `create_container` does (plain-volume path, no custom network)
    /// and checks that every configured piece is present as a
    /// flag/value pair.
    #[test]
    fn run_args_include_all_pieces() {
        // We can't actually shell out to docker in unit tests, but
        // we can verify the argument-building logic by
        // reconstructing what `create_container` would build.
        let cfg = ContainerConfig {
            id: 42,
            name: "paygress-42".to_string(),
            image: "alpine:latest".to_string(),
            cpu_cores: 1,
            memory_mb: 256,
            storage_gb: 1,
            password: "x".to_string(),
            ssh_key: None,
            host_port: Some(30042),
            template_ports: vec![crate::compute::PortMapping {
                host_port: 17777,
                container_port: 7777,
                protocol: "tcp",
            }],
            template_env: {
                let mut m = std::collections::HashMap::new();
                m.insert("FOO".to_string(), "bar".to_string());
                m
            },
            extra_runtime_args: vec!["--ulimit".to_string(), "nofile=1024:1024".to_string()],
            data_path: Some("/var/data".to_string()),
            volume_encryption_key: None,
        };

        // Mirror the logic in `create_container` for assertion.
        let mut args: Vec<String> = vec![
            "run".into(),
            "-d".into(),
            "--name".into(),
            DockerBackend::name_for(cfg.id),
            "--restart".into(),
            "unless-stopped".into(),
            "--cpus".into(),
            cfg.cpu_cores.to_string(),
            "--memory".into(),
            format!("{}m", cfg.memory_mb),
        ];
        for port in &cfg.template_ports {
            args.push("-p".into());
            args.push(format!(
                "{}:{}/{}",
                port.host_port, port.container_port, port.protocol
            ));
        }
        for (k, v) in &cfg.template_env {
            args.push("-e".into());
            args.push(format!("{}={}", k, v));
        }
        args.extend(cfg.extra_runtime_args.iter().cloned());
        // `volume_encryption_key` is None, so the plain named-volume
        // path applies.
        if let Some(path) = &cfg.data_path {
            args.push("-v".into());
            args.push(format!("paygress-{}-data:{}", cfg.id, path));
        }
        args.push(cfg.image.clone());

        // Sanity: name resolves; image is last; ports show up as
        // expected.
        assert!(args.contains(&"paygress-42".to_string()));
        assert!(args.contains(&"17777:7777/tcp".to_string()));
        assert!(args.contains(&"FOO=bar".to_string()));
        assert_eq!(args.last().map(|s| s.as_str()), Some("alpine:latest"));

        // Every value must directly follow the flag it belongs to.
        let has_pair = |flag: &str, value: &str| {
            args.windows(2)
                .any(|w| w[0].as_str() == flag && w[1].as_str() == value)
        };
        assert!(has_pair("--name", "paygress-42"));
        assert!(has_pair("--cpus", "1"));
        assert!(has_pair("--memory", "256m"));
        assert!(has_pair("-p", "17777:7777/tcp"));
        assert!(has_pair("-e", "FOO=bar"));
        assert!(has_pair("--ulimit", "nofile=1024:1024"));
        assert!(has_pair("-v", "paygress-42-data:/var/data"));
    }
}
334}