// s4_server/acme.rs

//! ACME (Let's Encrypt) auto-cert support (v0.3 #11).
//!
//! Wraps `rustls-acme` for the TLS-ALPN-01 challenge path. Operators
//! enable this by passing `--acme <domain>` to the binary; certificate
//! acquisition + renewal happens transparently in the background, and
//! the listening port handles both real TLS traffic AND the ACME
//! challenge handshake on the same socket (the TLS-ALPN-01 selling
//! point — no separate port-80 HTTP listener needed).
//!
//! ## Skipped scope (intentional)
//!
//! - **HTTP-01 challenge**: requires a separate port-80 listener and
//!   coordinated routing. TLS-ALPN-01 covers the same use case for
//!   anyone serving on port 443 without that complexity.
//! - **DNS-01 challenge**: requires a DNS provider integration. Not
//!   on the v0.3 roadmap; reopen if a customer needs wildcard certs.
//! - **Custom ACME directory**: the binary hard-codes Let's Encrypt
//!   (production / staging selectable via `--acme-staging`). Add a
//!   `--acme-endpoint` flag if ZeroSSL / internal CA support is asked
//!   for.

use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;

use rustls_acme::caches::DirCache;
use rustls_acme::{AcmeConfig, is_tls_alpn_challenge};
use tokio_rustls::LazyConfigAcceptor;
use tokio_rustls::rustls::ServerConfig;

/// v0.8.4 #80: per-poll timeout on the background ACME renewal stream.
///
/// The renewal task in [`bootstrap`] wraps every `state.next().await`
/// in `tokio::time::timeout` with this duration, so the bound applies
/// to one poll of the event stream as a whole rather than to an
/// individual HTTP request.
///
/// Without this, a hung Let's Encrypt API (network partition, slow
/// response, transparent-proxy black hole) would wedge the renewal
/// task on a single `state.next().await` indefinitely — the existing
/// cert keeps serving traffic, but renewal silently stops and the
/// cert ages out 90 days later. The timeout fires per-iteration so
/// the loop just retries on the next tick instead of dying.
///
/// 60s is comfortably longer than any healthy LE round-trip
/// (typically < 5s) but short enough that an operator's Prometheus
/// alert on `s4_acme_renewal_total{result="timeout"}` rate fires
/// within a single scrape window when LE goes dark.
const ACME_POLL_TIMEOUT: Duration = Duration::from_secs(60);

/// Inputs to [`bootstrap`]: the operator-supplied flags from main.
#[derive(Debug, Clone)]
pub struct AcmeOptions {
    /// Domain names to request a certificate for. Handed to
    /// `AcmeConfig::new`; also comma-joined into the `domains` field of
    /// the renewal task's log events.
    pub domains: Vec<String>,
    /// Optional ACME account contact as a bare address — [`bootstrap`]
    /// prepends the `mailto:` scheme itself. `None` registers no contact.
    pub contact: Option<String>,
    /// Directory backing the `DirCache`. [`bootstrap`] tries to create
    /// it (including parents) and only logs a warning if that fails.
    pub cache_dir: PathBuf,
    /// `true` selects the Let's Encrypt staging directory, `false` the
    /// production one.
    pub staging: bool,
}

54/// What [`bootstrap`] returns: two rustls configs the per-connection
55/// handler picks between based on whether the incoming ClientHello is a
56/// TLS-ALPN-01 challenge or a real TLS request.
57pub struct AcmeAcceptors {
58    /// Rustls config used for the TLS-ALPN-01 challenge response.
59    /// Hand to `LazyConfigAcceptor::into_stream` when
60    /// `is_tls_alpn_challenge(&client_hello)` is true.
61    pub challenge: Arc<ServerConfig>,
62    /// Rustls config used for ordinary TLS traffic. Carries the
63    /// currently-issued certificate; `rustls-acme` swaps the inner
64    /// `Arc<ServerConfig>` automatically on each successful renewal,
65    /// so this `Arc` always points at the latest cert.
66    pub default: Arc<ServerConfig>,
67}
68
69/// Build ACME state, kick off the background renewal loop, and return
70/// the two rustls configs the accept loop needs. Spawns one tokio task
71/// for the renewal driver; that task lives for the lifetime of the
72/// process and shouldn't normally exit.
73pub fn bootstrap(opts: AcmeOptions) -> AcmeAcceptors {
74    if let Err(e) = std::fs::create_dir_all(&opts.cache_dir) {
75        tracing::warn!(
76            "could not create ACME cache directory {}: {e}",
77            opts.cache_dir.display()
78        );
79    }
80
81    let mut state = AcmeConfig::new(opts.domains.clone())
82        .contact(
83            opts.contact
84                .iter()
85                .map(|e| format!("mailto:{e}"))
86                .collect::<Vec<_>>(),
87        )
88        .cache(DirCache::new(opts.cache_dir.clone()))
89        // rustls-acme uses `directory_lets_encrypt(production: bool)` —
90        // i.e. `true` selects the production directory. We invert here
91        // because the user-facing `--acme-staging` flag is the safer
92        // default to surface in CLI help.
93        .directory_lets_encrypt(!opts.staging)
94        .state();
95
96    let challenge = state.challenge_rustls_config();
97    let default = state.default_rustls_config();
98
99    // Background driver: rustls-acme runs renewal + challenge handling
100    // through this stream. Bumping the renewal counter on every event
101    // surfaces failures to operators via the s4_acme_renewal_total
102    // Prometheus metric. We never break out of this loop — failures
103    // just retry on the next poll.
104    //
105    // v0.8.4 #80: each `state.next().await` is wrapped in a
106    // `tokio::time::timeout(ACME_POLL_TIMEOUT, …)`. A hung Let's
107    // Encrypt API (audit L1) used to wedge the task forever without
108    // this guard; now we log + bump the "timeout" label and continue
109    // looping so the next iteration retries.
110    let domains = opts.domains.join(",");
111    tokio::spawn(async move {
112        use futures::StreamExt;
113        loop {
114            match tokio::time::timeout(ACME_POLL_TIMEOUT, state.next()).await {
115                Ok(Some(Ok(ok))) => {
116                    tracing::info!(target: "s4_acme", domains = %domains, "ACME event: {ok:?}");
117                    crate::metrics::record_acme_renewal("ok");
118                }
119                Ok(Some(Err(err))) => {
120                    tracing::warn!(target: "s4_acme", domains = %domains, "ACME error: {err:?}");
121                    crate::metrics::record_acme_renewal("err");
122                }
123                Ok(None) => {
124                    tracing::warn!(target: "s4_acme", "ACME state stream ended unexpectedly");
125                    break;
126                }
127                Err(_elapsed) => {
128                    tracing::warn!(
129                        target: "s4_acme",
130                        domains = %domains,
131                        timeout_secs = ACME_POLL_TIMEOUT.as_secs(),
132                        "ACME renewal poll timeout; will retry on next iteration"
133                    );
134                    crate::metrics::record_acme_renewal_timeout();
135                    // Fall through to next loop iteration — `state` is
136                    // still owned, so the next poll picks up where the
137                    // hung future left off (or its successor, since the
138                    // timed-out future is dropped here at scope exit).
139                }
140            }
141        }
142    });
143
144    AcmeAcceptors { challenge, default }
145}
146
147/// Per-connection accept entry point. Inspect the ClientHello via
148/// `LazyConfigAcceptor`, then route to either the challenge config
149/// (TLS-ALPN-01 ack) or the default cert config (real traffic).
150///
151/// Returns `Ok(Some(stream))` for a finished real TLS handshake — the
152/// caller serves HTTP on it. Returns `Ok(None)` when a challenge was
153/// answered and the caller should just close the connection. `Err(_)`
154/// is logged at WARN by the caller.
155pub async fn accept_one<IO>(
156    sock: IO,
157    acceptors: &AcmeAcceptors,
158) -> Result<Option<tokio_rustls::server::TlsStream<IO>>, Box<dyn std::error::Error + Send + Sync>>
159where
160    IO: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin,
161{
162    let start = LazyConfigAcceptor::new(Default::default(), sock).await?;
163    if is_tls_alpn_challenge(&start.client_hello()) {
164        let mut tls = start.into_stream(acceptors.challenge.clone()).await?;
165        use tokio::io::AsyncWriteExt;
166        let _ = tls.shutdown().await;
167        Ok(None)
168    } else {
169        let tls = start.into_stream(acceptors.default.clone()).await?;
170        Ok(Some(tls))
171    }
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// Bootstrap returns two distinct rustls configs. We never reach
    /// the Let's Encrypt servers in this unit test (the background
    /// renewal task will retry forever without test-side observation),
    /// so we just verify the synchronous return path.
    #[tokio::test]
    async fn bootstrap_returns_challenge_and_default_configs() {
        crate::tls::install_default_crypto_provider();
        let dir = tempfile::tempdir().unwrap();
        let acceptors = bootstrap(AcmeOptions {
            domains: vec!["example.test".into()],
            contact: Some("ops@example.test".into()),
            cache_dir: dir.path().to_path_buf(),
            staging: true,
        });
        // Both configs must exist; they're distinct (challenge serves the
        // TLS-ALPN-01 magic cert, default serves the real cert).
        assert!(!Arc::ptr_eq(&acceptors.challenge, &acceptors.default));
    }

    /// v0.8.4 #80: the renewal driver wraps each `state.next().await`
    /// in `tokio::time::timeout(ACME_POLL_TIMEOUT, …)`. We can't drive
    /// the real `rustls-acme` stream here without reaching Let's
    /// Encrypt, and the workspace doesn't enable tokio's `test-util`
    /// feature so `tokio::time::pause()` is unavailable. Instead we
    /// assert the same `timeout(_, pending)` shape against an
    /// always-pending future with a tiny deadline: if the wrapper
    /// returns `Err(Elapsed)`, the production loop's "timeout" arm is
    /// reachable. Combined with the metric-label assertion in
    /// `metrics::tests::install_and_render_basic_counters` (which
    /// scrapes for `result="timeout"`), this nails down both halves
    /// of the fix.
    #[tokio::test]
    async fn renewal_poll_timeout_arm_fires_when_inner_future_hangs() {
        // Sanity: the production constant is 60s — long enough to
        // dwarf any healthy LE round-trip but short enough that an
        // alert window (typically 5 minutes) catches a wedge fast.
        assert_eq!(ACME_POLL_TIMEOUT, Duration::from_secs(60));

        // Demonstrate the same wrapper shape used in `bootstrap`.
        // `pending` never resolves, so `timeout` MUST take the
        // `Err(Elapsed)` path. A tiny deadline keeps the test wall
        // time near zero.
        let pending = futures::future::pending::<()>();
        let res = tokio::time::timeout(Duration::from_millis(20), pending).await;
        assert!(
            res.is_err(),
            "tokio::time::timeout must surface Elapsed for a never-ready future; \
             this is the same branch that bumps record_acme_renewal_timeout in \
             the production loop"
        );

        // Also call the recorder helper directly, exactly as the
        // production loop does (no arguments), so renaming it or
        // changing its signature breaks this test at compile time.
        crate::metrics::record_acme_renewal_timeout();
    }
}