s4_server/acme.rs
1//! ACME (Let's Encrypt) auto-cert support (v0.3 #11).
2//!
3//! Wraps `rustls-acme` for the TLS-ALPN-01 challenge path. Operators
4//! enable this by passing `--acme <domain>` to the binary; certificate
5//! acquisition + renewal happens transparently in the background, and
6//! the listening port handles both real TLS traffic AND the ACME
7//! challenge handshake on the same socket (the TLS-ALPN-01 selling
8//! point — no separate port-80 HTTP listener needed).
9//!
10//! ## Skipped scope (intentional)
11//!
12//! - **HTTP-01 challenge**: requires a separate port-80 listener and
13//! coordinated routing. TLS-ALPN-01 covers the same use case for
14//! anyone serving on port 443 without that complexity.
15//! - **DNS-01 challenge**: requires a DNS provider integration. Not
16//! on the v0.3 roadmap; reopen if a customer needs wildcard certs.
17//! - **Custom ACME directory**: the binary hard-codes Let's Encrypt
18//! (production / staging selectable via `--acme-staging`). Add a
19//! `--acme-endpoint` flag if ZeroSSL / internal CA support is asked
20//! for.
21
22use std::path::PathBuf;
23use std::sync::Arc;
24use std::time::Duration;
25
26use rustls_acme::caches::DirCache;
27use rustls_acme::{AcmeConfig, is_tls_alpn_challenge};
28use tokio_rustls::LazyConfigAcceptor;
29use tokio_rustls::rustls::ServerConfig;
30
/// Upper bound on a single poll of the background ACME renewal stream
/// (v0.8.4 #80).
///
/// The renewal driver in [`bootstrap`] wraps every `state.next().await`
/// in `tokio::time::timeout` with this deadline. The motivation: if the
/// Let's Encrypt API hangs (network partition, slow response,
/// transparent-proxy black hole), an unbounded await would park the
/// renewal task forever. The already-issued cert would keep serving
/// traffic, but renewal would silently stop and the cert would age out
/// 90 days later. Because the deadline applies per iteration, a timeout
/// only ends the current poll; the loop goes around again rather than
/// terminating.
///
/// Why 60 seconds: it is far above a healthy LE round-trip (typically
/// under 5s), yet short enough that an operator's Prometheus alert on
/// the rate of `s4_acme_renewal_total{result="timeout"}` fires within a
/// single scrape window once LE goes dark.
const ACME_POLL_TIMEOUT: Duration = Duration::from_secs(60);
45
/// Inputs to [`bootstrap`]: the operator-supplied flags from main.
///
/// Derives `Debug` so the effective configuration can be logged, and
/// `Clone` so callers can keep a copy after handing one to
/// [`bootstrap`] (which takes the options by value).
#[derive(Debug, Clone)]
pub struct AcmeOptions {
    /// Domain names passed to `AcmeConfig::new`. [`bootstrap`] also joins
    /// them with `,` to label its log lines.
    pub domains: Vec<String>,
    /// Optional ACME account contact. [`bootstrap`] prepends `mailto:`
    /// itself, so this should be a bare address (e.g. `ops@example.com`).
    pub contact: Option<String>,
    /// Directory backing the `DirCache`. [`bootstrap`] attempts to create
    /// it (including parents) and only logs a warning if that fails.
    pub cache_dir: PathBuf,
    /// `true` selects the Let's Encrypt staging directory, `false` the
    /// production one.
    pub staging: bool,
}
53
54/// What [`bootstrap`] returns: two rustls configs the per-connection
55/// handler picks between based on whether the incoming ClientHello is a
56/// TLS-ALPN-01 challenge or a real TLS request.
57pub struct AcmeAcceptors {
58 /// Rustls config used for the TLS-ALPN-01 challenge response.
59 /// Hand to `LazyConfigAcceptor::into_stream` when
60 /// `is_tls_alpn_challenge(&client_hello)` is true.
61 pub challenge: Arc<ServerConfig>,
62 /// Rustls config used for ordinary TLS traffic. Carries the
63 /// currently-issued certificate; `rustls-acme` swaps the inner
64 /// `Arc<ServerConfig>` automatically on each successful renewal,
65 /// so this `Arc` always points at the latest cert.
66 pub default: Arc<ServerConfig>,
67}
68
69/// Build ACME state, kick off the background renewal loop, and return
70/// the two rustls configs the accept loop needs. Spawns one tokio task
71/// for the renewal driver; that task lives for the lifetime of the
72/// process and shouldn't normally exit.
73pub fn bootstrap(opts: AcmeOptions) -> AcmeAcceptors {
74 if let Err(e) = std::fs::create_dir_all(&opts.cache_dir) {
75 tracing::warn!(
76 "could not create ACME cache directory {}: {e}",
77 opts.cache_dir.display()
78 );
79 }
80
81 let mut state = AcmeConfig::new(opts.domains.clone())
82 .contact(
83 opts.contact
84 .iter()
85 .map(|e| format!("mailto:{e}"))
86 .collect::<Vec<_>>(),
87 )
88 .cache(DirCache::new(opts.cache_dir.clone()))
89 // rustls-acme uses `directory_lets_encrypt(production: bool)` —
90 // i.e. `true` selects the production directory. We invert here
91 // because the user-facing `--acme-staging` flag is the safer
92 // default to surface in CLI help.
93 .directory_lets_encrypt(!opts.staging)
94 .state();
95
96 let challenge = state.challenge_rustls_config();
97 let default = state.default_rustls_config();
98
99 // Background driver: rustls-acme runs renewal + challenge handling
100 // through this stream. Bumping the renewal counter on every event
101 // surfaces failures to operators via the s4_acme_renewal_total
102 // Prometheus metric. We never break out of this loop — failures
103 // just retry on the next poll.
104 //
105 // v0.8.4 #80: each `state.next().await` is wrapped in a
106 // `tokio::time::timeout(ACME_POLL_TIMEOUT, …)`. A hung Let's
107 // Encrypt API (audit L1) used to wedge the task forever without
108 // this guard; now we log + bump the "timeout" label and continue
109 // looping so the next iteration retries.
110 let domains = opts.domains.join(",");
111 tokio::spawn(async move {
112 use futures::StreamExt;
113 loop {
114 match tokio::time::timeout(ACME_POLL_TIMEOUT, state.next()).await {
115 Ok(Some(Ok(ok))) => {
116 tracing::info!(target: "s4_acme", domains = %domains, "ACME event: {ok:?}");
117 crate::metrics::record_acme_renewal("ok");
118 }
119 Ok(Some(Err(err))) => {
120 tracing::warn!(target: "s4_acme", domains = %domains, "ACME error: {err:?}");
121 crate::metrics::record_acme_renewal("err");
122 }
123 Ok(None) => {
124 tracing::warn!(target: "s4_acme", "ACME state stream ended unexpectedly");
125 break;
126 }
127 Err(_elapsed) => {
128 tracing::warn!(
129 target: "s4_acme",
130 domains = %domains,
131 timeout_secs = ACME_POLL_TIMEOUT.as_secs(),
132 "ACME renewal poll timeout; will retry on next iteration"
133 );
134 crate::metrics::record_acme_renewal_timeout();
135 // Fall through to next loop iteration — `state` is
136 // still owned, so the next poll picks up where the
137 // hung future left off (or its successor, since the
138 // timed-out future is dropped here at scope exit).
139 }
140 }
141 }
142 });
143
144 AcmeAcceptors { challenge, default }
145}
146
147/// Per-connection accept entry point. Inspect the ClientHello via
148/// `LazyConfigAcceptor`, then route to either the challenge config
149/// (TLS-ALPN-01 ack) or the default cert config (real traffic).
150///
151/// Returns `Ok(Some(stream))` for a finished real TLS handshake — the
152/// caller serves HTTP on it. Returns `Ok(None)` when a challenge was
153/// answered and the caller should just close the connection. `Err(_)`
154/// is logged at WARN by the caller.
155pub async fn accept_one<IO>(
156 sock: IO,
157 acceptors: &AcmeAcceptors,
158) -> Result<Option<tokio_rustls::server::TlsStream<IO>>, Box<dyn std::error::Error + Send + Sync>>
159where
160 IO: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin,
161{
162 let start = LazyConfigAcceptor::new(Default::default(), sock).await?;
163 if is_tls_alpn_challenge(&start.client_hello()) {
164 let mut tls = start.into_stream(acceptors.challenge.clone()).await?;
165 use tokio::io::AsyncWriteExt;
166 let _ = tls.shutdown().await;
167 Ok(None)
168 } else {
169 let tls = start.into_stream(acceptors.default.clone()).await?;
170 Ok(Some(tls))
171 }
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// `bootstrap` hands back two different rustls configs. This test
    /// only checks the synchronous return value: it never waits on the
    /// background renewal task, which keeps retrying on its own without
    /// the test observing it.
    #[tokio::test]
    async fn bootstrap_returns_challenge_and_default_configs() {
        crate::tls::install_default_crypto_provider();
        let cache = tempfile::tempdir().unwrap();
        let options = AcmeOptions {
            domains: vec!["example.test".into()],
            contact: Some("ops@example.test".into()),
            cache_dir: cache.path().to_path_buf(),
            staging: true,
        };

        let pair = bootstrap(options);

        // The challenge config (TLS-ALPN-01 magic cert) and the default
        // config (real cert) must be separate allocations.
        assert!(!Arc::ptr_eq(&pair.challenge, &pair.default));
    }

    /// v0.8.4 #80: the renewal driver wraps each `state.next().await` in
    /// `tokio::time::timeout(ACME_POLL_TIMEOUT, …)`.
    ///
    /// Driving the real `rustls-acme` stream would require reaching Let's
    /// Encrypt, and `tokio::time::pause()` is unavailable because the
    /// workspace does not enable tokio's `test-util` feature. So this
    /// test reproduces the wrapper shape instead: `timeout(_, pending)`
    /// with a tiny deadline around a future that never completes. Getting
    /// `Err(Elapsed)` back shows the production loop's "timeout" arm is
    /// reachable. The metric label itself is covered separately by
    /// `metrics::tests::install_and_render_basic_counters`, which scrapes
    /// for `result="timeout"`.
    #[tokio::test]
    async fn renewal_poll_timeout_arm_fires_when_inner_future_hangs() {
        // Pin the production value: 60s dwarfs a healthy LE round-trip
        // but is short enough for an alert window (typically 5 minutes)
        // to catch a wedge quickly.
        assert_eq!(ACME_POLL_TIMEOUT, Duration::from_secs(60));

        // Same wrapper shape as in `bootstrap`. The inner future never
        // resolves, so the only possible outcome is `Err(Elapsed)`; the
        // 20ms deadline keeps wall time negligible.
        let never_ready = futures::future::pending::<()>();
        let outcome = tokio::time::timeout(Duration::from_millis(20), never_ready).await;
        assert!(
            outcome.is_err(),
            "tokio::time::timeout must surface Elapsed for a never-ready future; \
             this is the same branch that bumps record_acme_renewal_timeout in \
             the production loop"
        );

        // Call the recorder helper directly so that renaming it or
        // changing its signature breaks compilation of this test.
        crate::metrics::record_acme_renewal_timeout();
    }
}
234}