sozu_lib/protocol/mux/
router.rs

1//! Backend routing and connection reuse for the mux layer.
2//!
3//! [`Router`] owns the map of token -> backend [`Connection`] and centralises
4//! the logic for picking (or opening) the right backend for an incoming
5//! request. The H2 reuse strategy prefers the least-loaded non-draining
6//! connection of the target cluster; H1 falls back to keep-alive reuse.
7
8use std::{cell::RefCell, collections::HashMap, rc::Rc, time::Duration};
9
10use mio::{Interest, Token, net::TcpStream};
11use sozu_command::{
12    logging::ansi_palette,
13    proto::command::{ListenerType, RedirectPolicy, RedirectScheme},
14};
15
16#[cfg(debug_assertions)]
17use super::DebugEvent;
18use super::{BackendStatus, Connection, Context, GlobalStreamId, Position, StreamState};
19use crate::{
20    BackendConnectionError, L7ListenerHandler, L7Proxy, ListenerHandler, ProxySession, Readiness,
21    RetrieveClusterError,
22    backends::{Backend, BackendError},
23    protocol::http::editor::{HeaderEditMode, HeaderEditSnapshot, HttpContext},
24    router::{HeaderEdit, RouteResult},
25    server::CONN_RETRIES,
26    socket::SessionTcpStream,
27    timer::TimeoutContainer,
28};
29
30use crate::metrics::names;
31
32/// Module-level prefix used on every log line emitted from the router.
33///
34/// Two arms:
35/// * `log_module_context!()` — zero-arg, legacy `MUX-ROUTER\t >>>` output.
36///   Kept for sites without an `HttpContext` in scope. No call site in this
37///   module currently uses this arm (every one has an `HttpContext` reachable
38///   via [`Context::http_context`] or a direct `&mut HttpContext`
39///   parameter), but the arm is retained so the macro name stays stable for
40///   future sessionless callers.
41/// * `log_module_context!($http_context)` — rich form. `$http_context` must be
42///   `&HttpContext` (or coerce to one). Produces the same
43///   `[session req cluster backend]` bracket as RUSTLS/PIPE/TCP followed by a
44///   `Session(frontend=..., method=..., authority=...)` block, so router
45///   lines are filterable by session ULID or request ULID. `cluster_id` is
46///   already carried by the bracket's third slot — not duplicated inside
47///   `Session(...)`.
48macro_rules! log_module_context {
49    () => {{
50        let (open, reset, _, _, _) = ansi_palette();
51        format!("{open}MUX-ROUTER{reset}\t >>>", open = open, reset = reset)
52    }};
53    ($http_context:expr) => {{
54        let (open, reset, grey, gray, white) = ansi_palette();
55        let http_ctx: &HttpContext = &$http_context;
56        let ctx = http_ctx.log_context();
57        format!(
58            "{gray}{ctx}{reset}\t{open}MUX-ROUTER{reset}\t{grey}Session{reset}({gray}frontend{reset}={white}{frontend:?}{reset}, {gray}method{reset}={white}{method:?}{reset}, {gray}authority{reset}={white}{authority:?}{reset})\t >>>",
59            open = open,
60            reset = reset,
61            grey = grey,
62            gray = gray,
63            white = white,
64            ctx = ctx,
65            frontend = http_ctx.session_address,
66            method = http_ctx.method,
67            authority = http_ctx.authority,
68        )
69    }};
70}
71
72#[derive(Debug)]
73pub struct Router {
74    pub backends: HashMap<Token, Connection<SessionTcpStream>>,
75    pub configured_backend_timeout: Duration,
76    pub configured_connect_timeout: Duration,
77    /// Fallback readiness used when a backend token is missing from the map.
78    /// This prevents panicking in the Endpoint trait methods that return references.
79    pub(super) fallback_readiness: Readiness,
80}
81
82impl Router {
83    pub fn new(configured_backend_timeout: Duration, configured_connect_timeout: Duration) -> Self {
84        Self {
85            backends: HashMap::new(),
86            configured_backend_timeout,
87            configured_connect_timeout,
88            fallback_readiness: Readiness::new(),
89        }
90    }
91
92    pub(super) fn connect<L: ListenerHandler + L7ListenerHandler>(
93        &mut self,
94        stream_id: GlobalStreamId,
95        context: &mut Context<L>,
96        session: Rc<RefCell<dyn ProxySession>>,
97        proxy: Rc<RefCell<dyn L7Proxy>>,
98        // Frontend session token, threaded in from `Mux::ready` so the
99        // per-(cluster, source-IP) accounting can key on it without
100        // re-borrowing `session` — the outer event-loop call chain
101        // already holds a mutable borrow of that cell.
102        frontend_token: Token,
103    ) -> Result<(), BackendConnectionError> {
104        let stream = &mut context.streams[stream_id];
105        // when reused, a stream should be detached from its old connection, if not we could end
106        // with concurrent connections on a single endpoint
107        if !matches!(stream.state, StreamState::Link) {
108            error!(
109                "{} stream {} expected to be in Link state, got {:?}",
110                log_module_context!(stream.context),
111                stream_id,
112                stream.state
113            );
114            return Err(BackendConnectionError::MaxSessionsMemory);
115        }
116        #[cfg(debug_assertions)]
117        context
118            .debug
119            .push(DebugEvent::Str(stream.context.get_route()));
120        if stream.attempts >= CONN_RETRIES {
121            incr!(
122                "backend.connect.retries_exhausted",
123                stream.context.cluster_id.as_deref(),
124                stream.context.backend_id.as_deref()
125            );
126            return Err(BackendConnectionError::MaxConnectionRetries(
127                stream.context.cluster_id.clone(),
128            ));
129        }
130        stream.attempts += 1;
131
132        // Borrow front mutably (so route_from_request can rewrite the request
133        // line authority/path and inject request-side header edits before we
134        // forward to the backend) plus context mutably (so it can stash
135        // redirect_location / www_authenticate / original_authority /
136        // headers_response). We split-borrow manually to keep the rest of
137        // `connect` working with `stream_context` aliasing `stream.context`.
138        let (front_ref, stream_context_ref) = {
139            let stream_split = &mut *stream;
140            (&mut stream_split.front, &mut stream_split.context)
141        };
142        let cluster_id = self
143            .route_from_request(stream_context_ref, front_ref, &context.listener, &proxy)
144            .map_err(BackendConnectionError::RetrieveClusterError)?;
145        let stream_context = &mut stream.context;
146        stream_context.cluster_id = Some(cluster_id.to_owned());
147
148        let (
149            frontend_should_stick,
150            frontend_should_redirect_https,
151            h2,
152            cluster_max_connections_per_ip,
153            cluster_retry_after,
154        ) = proxy
155            .borrow()
156            .clusters()
157            .get(&cluster_id)
158            .map(|cluster| {
159                (
160                    cluster.sticky_session,
161                    cluster.https_redirect,
162                    cluster.http2.unwrap_or(false),
163                    cluster.max_connections_per_ip,
164                    cluster.retry_after,
165                )
166            })
167            .unwrap_or((false, false, false, None, None));
168
169        // ── Legacy `cluster.https_redirect` short-circuit ──
170        //
171        // Resolve the legacy HTTP→HTTPS redirect BEFORE per-(cluster,
172        // source-IP) accounting so a redirect-only request never
173        // consumes an IP slot. Otherwise a same-IP client iterating an
174        // HTTP→HTTPS hop could trip 429 ahead of the 301 even though no
175        // backend would have been opened. A duplicate guard that lived
176        // here previously (rebase artefact — two identical
177        // `if frontend_should_redirect_https && …` blocks back-to-back)
178        // is folded into this single early-return.
179        // Frontend-scoped `RedirectPolicy::PERMANENT` already returns
180        // from `route_from_request` with the same error, so this only
181        // handles the legacy cluster-level path that doesn't surface
182        // from `route_from_request`.
183        if frontend_should_redirect_https && matches!(proxy.borrow().kind(), ListenerType::Http) {
184            return Err(BackendConnectionError::RetrieveClusterError(
185                RetrieveClusterError::HttpsRedirect,
186            ));
187        }
188
189        // Per-(cluster, source-IP) connection limit gate. Runs AFTER cluster
190        // resolution AND legacy redirect emission (so a 401/421/redirect
191        // frontend never trips the limit) and BEFORE any backend selection
192        // (so a rejection consumes neither a backend pool slot nor a retry
193        // budget). The check uses the source IP from the per-stream
194        // `HttpContext.session_address`, which is the proxy-protocol-aware
195        // client address when present, falling back to `peer_addr`. The
196        // limit governs distinct **frontend connections** per
197        // `(cluster, ip)`: an H2 session multiplexing N streams to the same
198        // cluster from the same IP still consumes a single slot.
199        let session_ip = stream_context.session_address.map(|sa| sa.ip());
200        if let Some(ip) = session_ip {
201            // The frontend session is mutably borrowed up the call stack
202            // (`HttpSession::ready` -> `state.ready` -> `Mux::ready` ->
203            // here), so we cannot reach `session.borrow().frontend_token()`.
204            // The token is threaded in by the caller instead.
205            let sessions_rc = proxy.borrow().sessions();
206            let at_limit = sessions_rc.borrow().cluster_ip_at_limit(
207                frontend_token,
208                &cluster_id,
209                &ip,
210                cluster_max_connections_per_ip,
211            );
212            if at_limit {
213                let retry_after = sessions_rc
214                    .borrow()
215                    .effective_retry_after(cluster_retry_after);
216                // Stash the resolved retry value on the stream so the
217                // mux's BackendConnectionError → 429 mapping can render
218                // (or elide) the `Retry-After` header without
219                // re-deriving the override chain.
220                stream_context.retry_after_seconds = Some(retry_after).filter(|v| *v > 0);
221                return Err(BackendConnectionError::TooManyConnectionsPerIp {
222                    cluster_id: cluster_id.to_owned(),
223                });
224            }
225            // Idempotent track — H2 streams to the same `(cluster, ip)`
226            // share a single slot in the per-token set. Decrement happens
227            // wholesale on session close via `untrack_all_cluster_ip`.
228            sessions_rc
229                .borrow_mut()
230                .track_cluster_ip(frontend_token, cluster_id.clone(), ip);
231        }
232
233        /*
234        H2 connecting strategy (least-loaded):
235        - look at every backend connection
236        - among connected backends for this cluster, pick the one with the fewest active streams
237        - fall back to a connecting backend if no connected one exists
238        - if no backend is to reuse, ask the router for a socket to the "next in line" backend
239
240        H1 strategy: reuse the first KeepAlive backend for this cluster.
241         */
242
243        let mut reuse_token = None;
244        let mut best_h2_stream_count = usize::MAX;
245        for (token, backend) in &self.backends {
246            match (h2, backend.position()) {
247                (_, Position::Server) => {
248                    error!(
249                        "{} Backend connection unexpectedly behaves like a server",
250                        log_module_context!(stream_context)
251                    );
252                    continue;
253                }
254                (_, Position::Client(_, _, BackendStatus::Disconnecting)) => {}
255
256                (true, Position::Client(other_cluster_id, _, BackendStatus::Connected)) => {
257                    if *other_cluster_id == cluster_id && !backend.is_draining() {
258                        // Pick the non-draining H2 connection with the fewest active streams
259                        let Connection::H2(h2c) = backend else {
260                            continue;
261                        };
262                        let stream_count = h2c.streams.len();
263                        if stream_count
264                            >= h2c.peer_settings.settings_max_concurrent_streams as usize
265                        {
266                            continue;
267                        }
268                        if stream_count < best_h2_stream_count {
269                            best_h2_stream_count = stream_count;
270                            reuse_token = Some(*token);
271                        }
272                    }
273                }
274                (true, Position::Client(other_cluster_id, _, BackendStatus::Connecting(_))) => {
275                    // Only use a connecting backend if no connected one was found
276                    if *other_cluster_id == cluster_id
277                        && best_h2_stream_count == usize::MAX
278                        && matches!(backend, Connection::H2(_))
279                    {
280                        reuse_token = Some(*token)
281                    }
282                }
283                (true, Position::Client(other_cluster_id, _, BackendStatus::KeepAlive)) => {
284                    if *other_cluster_id == cluster_id && matches!(backend, Connection::H2(_)) {
285                        error!(
286                            "{} ConnectionH2 unexpectedly behaves like H1 with KeepAlive",
287                            log_module_context!(stream_context)
288                        );
289                    }
290                }
291
292                (false, Position::Client(old_cluster_id, _, BackendStatus::KeepAlive)) => {
293                    if *old_cluster_id == cluster_id {
294                        reuse_token = Some(*token);
295                        break;
296                    }
297                }
298                // can't bundle H1 streams together
299                (false, Position::Client(_, _, BackendStatus::Connected))
300                | (false, Position::Client(_, _, BackendStatus::Connecting(_))) => {}
301            }
302        }
303        trace!(
304            "{} connect: (stick={}, h2={}) -> (reuse={:?})",
305            log_module_context!(stream_context),
306            frontend_should_stick,
307            h2,
308            reuse_token
309        );
310
311        if let Some(token) = reuse_token {
312            // Pool reuse: an existing backend connection (H2 multiplex slot or
313            // H1 keep-alive socket) is being reattached to this stream. Pair
314            // with `backend.pool.miss` below — together they describe the
315            // pool's hit/miss ratio. Counted before any commit so the metric
316            // is consistent with the trace log.
317            incr!(names::backend::POOL_HIT);
318            trace!(
319                "{} reused backend: {:#?}",
320                log_module_context!(stream_context),
321                self.backends.get(&token)
322            );
323            // Link backend to stream for the reused connection path. We check
324            // that the backend can accept a new stream before committing any
325            // per-stream state.
326            let Some(backend_conn) = self.backends.get_mut(&token) else {
327                error!(
328                    "{} reused backend token {:?} missing from backends map",
329                    log_module_context!(stream_context),
330                    token
331                );
332                return Err(BackendConnectionError::MaxSessionsMemory);
333            };
334            if !backend_conn.start_stream(stream_id, context) {
335                // Use `context.http_context(stream_id)` instead of reusing
336                // `stream_context`: `start_stream` above takes `&mut
337                // context`, which reborrows the slab mutably and ends any
338                // outstanding `stream_context` reference. A fresh shared
339                // borrow via the accessor is borrow-check clean.
340                error!(
341                    "{} Backend rejected stream start (max concurrent streams reached)",
342                    log_module_context!(context.http_context(stream_id))
343                );
344                return Err(BackendConnectionError::MaxSessionsMemory);
345            }
346            // For reused backends: set context fields and metrics lifecycle
347            if let Some(backend_conn) = self.backends.get(&token) {
348                if let Position::Client(_, backend_ref, _) = backend_conn.position() {
349                    let backend = backend_ref.borrow();
350                    let stream = &mut context.streams[stream_id];
351                    stream.context.backend_id = Some(backend.backend_id.to_owned());
352                    stream.context.backend_address = Some(backend.address);
353                    stream.metrics.backend_id = Some(backend.backend_id.to_owned());
354                    stream.metrics.backend_start();
355                    stream.metrics.backend_connected();
356                }
357            }
358            context.link_stream(stream_id, token);
359            return Ok(());
360        }
361
362        // New-backend path: fall through.
363        //
364        // Pool miss: no reusable connection was found (no live H2 multiplex
365        // slot for this cluster, no H1 keep-alive socket). A fresh TCP dial
366        // and full backend handshake will follow. Pair with `backend.pool.hit`
367        // above. The metric is incremented BEFORE `backend_from_request` so
368        // the count includes attempts that fail at backend selection
369        // (BackendError::NoBackendForCluster, etc.) — every miss is a slot
370        // we did not save. The dial itself may still fail
371        // (BackendConnectionError::*), in which case `backend.pool.size` is
372        // never bumped (see the gauge below) but the miss is already counted.
373        incr!(names::backend::POOL_MISS);
374        let token = {
375            //
376            // SECURITY (CWE-400): defer every stateful side-effect
377            // (backend.connections / connections_per_backend gauges, slab
378            // add_session, mio register_socket, self.backends.insert,
379            // stream.metrics.backend_start) until AFTER `new_h2_client` AND
380            // `start_stream` have both succeeded. If either fails we must
381            // return Err without leaking a slab entry, an epoll registration,
382            // a gauge counter, or a router-map entry.
383            //
384            // The TcpStream lives on the stack here and is moved into the
385            // Connection by `new_h2_client`/`new_h1_client`; on failure the
386            // Connection (or the raw TcpStream, for the pool-exhaustion
387            // branch that drops inside `new_h2_client`) is dropped, closing
388            // the fd. No token is ever allocated, so there is nothing to
389            // roll back.
390            let (socket, backend) = self.backend_from_request(
391                &cluster_id,
392                frontend_should_stick,
393                stream_context,
394                proxy.clone(),
395                &context.listener,
396            )?;
397
398            if let Err(e) = socket.set_nodelay(true) {
399                error!(
400                    "{} error setting nodelay on back socket({:?}): {:?}",
401                    log_module_context!(context.http_context(stream_id)),
402                    socket,
403                    e
404                );
405            }
406
407            // Cache the backend's configured address so SOCKET log lines
408            // fired on ECONNREFUSED (or any failed async `connect()`) can
409            // still render `peer=<backend>` — `getpeername(2)` returns
410            // ENOTCONN in that state, so the live lookup path would show
411            // `peer=None` exactly when the operator needs the backend id.
412            let backend_peer = Some(backend.borrow().address);
413            let socket = SessionTcpStream::new(socket, context.session_ulid, backend_peer);
414
415            // Build an un-armed timeout: we can't call `TimeoutContainer::new`
416            // yet because that requires the slab token, and we only allocate
417            // the token on the happy path. `.set(token)` below arms it.
418            let timeout_container = TimeoutContainer::new_empty(self.configured_connect_timeout);
419            let flood_config = context.listener.borrow().get_h2_flood_config();
420            let connection_config = context.listener.borrow().get_h2_connection_config();
421            let stream_idle_timeout = context.listener.borrow().get_h2_stream_idle_timeout();
422            let graceful_shutdown_deadline = context
423                .listener
424                .borrow()
425                .get_h2_graceful_shutdown_deadline();
426            let backend_id_for_gauge = backend.borrow().backend_id.to_owned();
427            let mut connection = if h2 {
428                match Connection::new_h2_client(
429                    context.session_ulid,
430                    socket,
431                    cluster_id.to_owned(),
432                    backend,
433                    context.pool.clone(),
434                    timeout_container,
435                    flood_config,
436                    connection_config,
437                    stream_idle_timeout,
438                    graceful_shutdown_deadline,
439                ) {
440                    Some(connection) => connection,
441                    // pool exhaustion: socket already dropped by new_h2_client,
442                    // no side-effects were committed.
443                    None => return Err(BackendConnectionError::MaxBuffers),
444                }
445            } else {
446                Connection::new_h1_client(
447                    context.session_ulid,
448                    socket,
449                    cluster_id.to_owned(),
450                    backend,
451                    timeout_container,
452                )
453            };
454
455            // Check the backend can accept a new stream BEFORE committing any
456            // registry state. `start_stream` increments `active_requests` via
457            // `pre_start_stream_client_bookkeeping` and undoes it itself on
458            // failure (see `Connection::start_stream`), so dropping the
459            // connection on a false return leaves backend accounting clean.
460            if !connection.start_stream(stream_id, context) {
461                error!(
462                    "{} Backend rejected stream start (max concurrent streams reached)",
463                    log_module_context!(context.http_context(stream_id))
464                );
465                // `connection` (socket + timeout_container) drops here.
466                return Err(BackendConnectionError::MaxSessionsMemory);
467            }
468
469            // --- Happy path: commit side-effects in one atomic-ish block ---
470            let stream = &mut context.streams[stream_id];
471            stream.metrics.backend_start();
472            stream.metrics.backend_id = stream.context.backend_id.to_owned();
473            gauge_add!(names::backend::CONNECTIONS, 1);
474            // `backend.pool.size` mirrors `backend.connections` exactly: one
475            // entry per `Router::backends` token. The `-1` partner lives in
476            // `connection.rs::pre_close_client_bookkeeping` (graceful close)
477            // and `mod.rs::close_backend` (session teardown). Symmetric
478            // pairing with both decrement sites is the only defence against
479            // the gauge underflow class of bug fixed by a650ad69 / d2f01ed4.
480            gauge_add!(names::backend::POOL_SIZE, 1);
481            gauge_add!(
482                names::backend::CONNECTIONS_PER_BACKEND,
483                1,
484                Some(&cluster_id),
485                Some(&backend_id_for_gauge)
486            );
487
488            let token = proxy.borrow().add_session(session);
489
490            {
491                let socket_ref = connection.socket_mut();
492                if let Err(e) = proxy.borrow().register_socket(
493                    socket_ref,
494                    token,
495                    Interest::READABLE | Interest::WRITABLE,
496                ) {
497                    // SECURITY (CWE-400): treat mio registration failure as a
498                    // hard connect failure. Without this rollback the gauges
499                    // (`backend.connections`, `backend.pool.size`,
500                    // `connections_per_backend`), the slab session, and the
501                    // already-incremented `Backend.active_requests` counter
502                    // (bumped in `Connection::start_stream` ->
503                    // `pre_start_stream_client_bookkeeping`) all leak until
504                    // the connect timeout fires. Under fd pressure
505                    // (EMFILE/ENFILE) this can occur in tight bursts and
506                    // poison capacity dashboards.
507                    error!(
508                        "{} error registering back socket: {:?} — rolling back",
509                        log_module_context!(context.http_context(stream_id)),
510                        e
511                    );
512                    // Undo the gauge increments committed above.
513                    gauge_add!(names::backend::CONNECTIONS, -1);
514                    gauge_add!(names::backend::POOL_SIZE, -1);
515                    gauge_add!(
516                        names::backend::CONNECTIONS_PER_BACKEND,
517                        -1,
518                        Some(&cluster_id),
519                        Some(&backend_id_for_gauge)
520                    );
521                    // Drop the slab session and the connection. The connection
522                    // is local to this scope; dropping it here also closes the
523                    // underlying TcpStream and releases the
524                    // `Backend.active_requests` increment via the regular
525                    // session drop path (`pre_close_client_bookkeeping`).
526                    proxy.borrow().remove_session(token);
527                    return Err(BackendConnectionError::MaxSessionsMemory);
528                }
529            }
530
531            // Arm the connect timeout now that we own a real token.
532            connection.timeout_container().set(token);
533
534            self.backends.insert(token, connection);
535            token
536        };
537
538        context.link_stream(stream_id, token);
539        Ok(())
540    }
541
542    fn route_from_request<L: ListenerHandler + L7ListenerHandler>(
543        &mut self,
544        context: &mut HttpContext,
545        front: &mut super::GenericHttpStream,
546        listener: &Rc<RefCell<L>>,
547        proxy: &Rc<RefCell<dyn L7Proxy>>,
548    ) -> Result<String, RetrieveClusterError> {
549        let (host, uri, method) = match context.extract_route() {
550            Ok(tuple) => tuple,
551            Err(cluster_error) => {
552                // we are past kawa parsing if it succeeded this can't fail
553                // if the request was malformed it was caught by kawa and we sent a 400
554                error!(
555                    "{} Malformed request in connect (should be caught at parsing) {:?}: {}",
556                    log_module_context!(context),
557                    context,
558                    cluster_error
559                );
560                return Err(cluster_error);
561            }
562        };
563        // Snapshot the pre-rewrite authority into an owned string so we
564        // can later stash it on `context.original_authority` without
565        // mutably aliasing the immutable borrow that `host: &str` still
566        // holds on `context`.
567        let captured_authority = host.to_owned();
568
569        // ── TLS cert SAN ↔ HTTP :authority binding ────────────────────────
570        // Reject any request whose `:authority` is not covered by a SAN of
571        // the certificate Sōzu actually served at the TLS handshake, with
572        // RFC 6125 §6.4.3 wildcard handling. Without this binding, an
573        // attacker holding a valid certificate for tenant A could open TLS
574        // with SNI=A then send an H2 stream with `:authority=tenantB.…` and
575        // reach tenant B's backend, crossing the TLS trust boundary
576        // (CWE-346 / CWE-444). The H2 spec explicitly allows browsers to
577        // coalesce streams onto a connection whenever the server is
578        // authoritative for the new origin (RFC 7540 §9.1.1 / RFC 9113
579        // §9.1.1), which "authoritative" means "covered by a SAN of the
580        // served cert"; rejecting coalesced streams as 421 caused the
581        // user-visible bug this predicate fixes (RFC 9110 §15.5.20).
582        //
583        // Plaintext listeners bypass the check (SNI is always `None`).
584        // Connections where SNI was sent but no cert matched (rustls served
585        // the default cert) carry `Some(empty)` SAN snapshot, so every
586        // authority is rejected — Sōzu is not authoritative for any name.
587        // Connections with no SNI fall back to the legacy exact-SNI match
588        // predicate (`authority_matches_sni`) for parity with pre-fix
589        // behaviour on the pathological "no SNI" case.
590        // Operators may opt out per-listener via
591        // `HttpsListenerConfig::strict_sni_binding = false`.
592        if let Some(sni) = context
593            .tls_server_name
594            .as_deref()
595            .filter(|_| context.strict_sni_binding)
596        {
597            let matched: Option<&str> = match context.tls_cert_names.as_deref() {
598                Some(cert_names) => authority_matched_cert_name(host, cert_names),
599                None => {
600                    if authority_matches_sni(host, sni) {
601                        Some(sni)
602                    } else {
603                        None
604                    }
605                }
606            };
607            match matched {
608                Some(matched_name) => {
609                    // Real coalescing = matched SAN differs from the SNI's
610                    // value after the matcher's port-strip + ASCII case
611                    // folding. Same-name requests are the common
612                    // non-coalesced path; do not pollute the counter or
613                    // logs with them. The ALPN=`h2` gate is a defensive
614                    // guard, not load-bearing under current invariants —
615                    // every request reaching `route_from_request` on an
616                    // HTTPS listener with `tls_cert_names` populated has
617                    // already gone through the H2 mux (ALPN=h2 by
618                    // construction). Kept explicit so a future routing
619                    // refactor that funnels H1 keep-alive through the
620                    // same predicate doesn't silently double-count
621                    // sequential `Host:` reuse as "coalescing".
622                    if !authority_matches_sni(host, sni) && context.tls_alpn == Some("h2") {
623                        incr!(names::h2::COALESCING_ACCEPTED);
624                        debug!(
625                            "{} accepted coalesced authority {:?} (SNI {:?}, matched SAN {:?})",
626                            log_module_context!(context),
627                            host,
628                            sni,
629                            matched_name,
630                        );
631                    }
632                }
633                None => {
634                    incr!(names::http::SNI_AUTHORITY_MISMATCH);
635                    warn!(
636                        "{} rejecting request: TLS cert SANs do not cover :authority {:?} (SNI {:?})",
637                        log_module_context!(context),
638                        host,
639                        sni,
640                    );
641                    return Err(RetrieveClusterError::SniAuthorityMismatch {
642                        sni: sni.to_owned(),
643                        authority: host.to_owned(),
644                    });
645                }
646            }
647        }
648
649        let route_result = listener.borrow().frontend_from_request(host, uri, method);
650
651        let route = match route_result {
652            Ok(route) => route,
653            Err(frontend_error) => {
654                trace!("{} {}", log_module_context!(context), frontend_error);
655                return Err(RetrieveClusterError::RetrieveFrontend(frontend_error));
656            }
657        };
658
659        // Stash the pre-rewrite authority unconditionally so log lines,
660        // access logs, and audit records that fire on ANY downstream
661        // path (denial, redirect, basic-auth 401, backend-connect
662        // failure, successful forward) carry the value the client
663        // actually sent. Capturing inside the rewrite helper alone would
664        // lose it on every branch where the rewrite is not applied.
665        context.original_authority = Some(captured_authority);
666
667        // ── Resolve the routing decision ──────────────────────────────────
668        // Snapshot the policy fields we need before consuming `route`, then
669        // map each policy outcome to either an early-error variant (which
670        // the caller turns into a default answer) or a cluster_id (which
671        // proceeds to backend connect).
672        let RouteResult {
673            cluster_id,
674            redirect,
675            redirect_scheme,
676            redirect_template,
677            rewritten_host,
678            rewritten_path,
679            rewritten_port,
680            headers_request,
681            headers_response,
682            required_auth: frontend_required_auth,
683            ..
684        } = route;
685
686        // ── HSTS (RFC 6797) snapshot hoist for HTTPS ──────────────────────
687        // The response snapshot is built in two passes so HSTS reaches
688        // every HTTPS response code (RFC 6797 §8.1 — including
689        // proxy-generated 3xx / 401 / 5xx default answers) WITHOUT
690        // changing the pre-PR scope of operator-defined `Append`
691        // response headers (which only apply on the regular forward
692        // path).
693        //
694        // Pass 1 (here, before any early return): for HTTPS only, copy
695        // ONLY the HSTS-class typed edits (`SetIfAbsent | Set`). These
696        // need to land on default answers — `set_default_answer_with_retry_after`
697        // bypasses the post-forward copy below.
698        //
699        // Pass 2 (post-forward, end of function): copy EVERY edit
700        // (including operator `Append` headers). Runs only on the
701        // regular forward path because the early returns short-circuit
702        // before reaching it.
703        //
704        // Plain-HTTP listeners are skipped here per RFC 6797 §7.2 (no
705        // STS over plaintext) — defense in depth on top of the
706        // TOML-time `ConfigError::HstsOnPlainHttp` and the worker IPC
707        // `ProxyError::HstsOnPlainHttp` rejects.
708        if matches!(context.protocol, crate::Protocol::HTTPS) {
709            snapshot_response_edits(&mut context.headers_response, &headers_response, |e| {
710                matches!(e.mode, HeaderEditMode::SetIfAbsent | HeaderEditMode::Set)
711            });
712        }
713
714        // Look up cluster-side policy knobs once. The values we need are:
715        //  - `https_redirect` (legacy) and `https_redirect_port` for the 301 location URL
716        //  - `authorized_hashes` and `www_authenticate` for the 401 path
717        let (legacy_https_redirect, https_redirect_port, authorized_hashes, www_authenticate) =
718            match cluster_id.as_deref() {
719                Some(id) => proxy
720                    .borrow()
721                    .clusters()
722                    .get(id)
723                    .map(|c| {
724                        (
725                            c.https_redirect,
726                            c.https_redirect_port,
727                            c.authorized_hashes.clone(),
728                            c.www_authenticate.clone(),
729                        )
730                    })
731                    .unwrap_or((false, None, Vec::new(), None)),
732                None => (false, None, Vec::new(), None),
733            };
734
735        // ── 1. Explicit redirect policies (PERMANENT / FOUND / PERMANENT_REDIRECT) ──
736        // Resolved BEFORE the clusterless-deny branch so a frontend that
737        // declares `redirect = permanent | found | permanent_redirect`
738        // emits the matching 3xx even when no cluster is bound. This is
739        // the canonical "moved" shape from the original proposal in
740        // #1161 and is the only way to express "this hostname has moved"
741        // without standing up a dummy cluster. The block does not read
742        // `cluster_id`; per-cluster values (`https_redirect_port`,
743        // `www_authenticate`, …) default to safe sentinels at the cluster
744        // lookup above when `cluster_id` is `None`, so the reorder is
745        // data-flow-safe.
746        //
747        // Status code mapping (closes #1009):
748        //   Permanent          → 301 (RFC 9110 §15.4.2)
749        //   Found              → 302 (RFC 9110 §15.4.3) — UA may rewrite POST→GET
750        //   PermanentRedirect  → 308 (RFC 9110 §15.4.9) — method MUST be preserved
751        let redirect_status = match redirect {
752            RedirectPolicy::Permanent => Some(301u16),
753            RedirectPolicy::Found => Some(302u16),
754            RedirectPolicy::PermanentRedirect => Some(308u16),
755            // Forward / Unauthorized are handled by other branches
756            // below; keeping them named here forces an exhaustive
757            // match so a future RedirectPolicy variant doesn't
758            // silently fall through to `None`.
759            RedirectPolicy::Forward | RedirectPolicy::Unauthorized => None,
760        };
761        if let Some(status_code) = redirect_status {
762            let scheme = resolve_redirect_scheme(redirect_scheme, context);
763            let port = rewritten_port.map(|p| p as u32).or(https_redirect_port);
764            // Feed the rewritten host AND path into the `Location` URL
765            // when the frontend's RewriteParts populated them. Without
766            // this, a `redirect = permanent` frontend with
767            // `rewrite_host = "new.example.com"` would serve clients
768            // back to the original `Host:` header, defeating the
769            // documented `old → new` shape.
770            // The host_override path also keeps `:port` stripping
771            // intact: `build_redirect_location` removes any `:port` on
772            // the override before reapplying `port_suffix`.
773            context.redirect_location = Some(build_redirect_location(
774                scheme,
775                context,
776                port,
777                rewritten_host.as_deref(),
778                rewritten_path.as_deref(),
779            ));
780            // Stash the frontend's `redirect_template` (when set) so the
781            // 3xx default-answer path can render it via
782            // `HttpAnswers::render_inline_redirect` instead of the
783            // listener / cluster default. Without this stash the field
784            // flows into `RouteResult` only to be dropped by the
785            // wildcard destructure below, so the operator-supplied
786            // template has no observable effect on the rendered
787            // redirect.
788            context.frontend_redirect_template = redirect_template;
789            // Stash the resolved status so the answer engine picks the
790            // matching default template (`http.301.redirection` /
791            // `http.302.redirection` / `http.308.redirection`).
792            context.redirect_status = Some(status_code);
793            return Err(RetrieveClusterError::HttpsRedirect);
794        }
795
796        // ── 2. Explicit `RedirectPolicy::UNAUTHORIZED` or clusterless deny ─
797        // Reached when the frontend either explicitly asks for 401 or has
798        // no backing cluster and no `Permanent` redirect to honour. The
799        // `Forward + cluster_id == None` combination collapses here so
800        // legacy clusterless frontends still emit 401 by default.
801        if matches!(redirect, RedirectPolicy::Unauthorized) || cluster_id.is_none() {
802            context.www_authenticate = www_authenticate.clone();
803            trace!("{} RouteResult::deny", log_module_context!(context));
804            return Err(RetrieveClusterError::UnauthorizedRoute);
805        }
806
807        let Some(cluster_id) = cluster_id else {
808            // Guarded by the clusterless-deny branch immediately above;
809            // the `is_none()` arm has already returned `UnauthorizedRoute`
810            // by the time control reaches here.
811            unreachable!("cluster_id was checked Some above")
812        };
813
814        // ── 3. Legacy `cluster.https_redirect` (HTTP-only listeners) ───────
815        // The caller (`Router::connect`) emits the actual 301 only on
816        // `ListenerType::Http`; gate the URL stash on the same predicate
817        // so an HTTPS listener never carries a stale `redirect_location`
818        // into a downstream default-answer path.
819        if legacy_https_redirect && matches!(proxy.borrow().kind(), ListenerType::Http) {
820            let port = https_redirect_port;
821            context.redirect_location =
822                Some(build_redirect_location("https", context, port, None, None));
823        }
824
825        // ── 4. Basic auth check (only when `required_auth` was set) ────────
826        // The check iterates the full hash list in constant time (see
827        // `crate::protocol::mux::auth::check_basic`) so the time spent
828        // does not leak which hash matched, or whether any did at all.
829        // On failure, stash the cluster's `www_authenticate` realm so the
830        // 401 default-answer can render the matching `WWW-Authenticate`
831        // header. An empty realm causes the template engine to elide the
832        // header entirely (`or_elide_header = true`).
833        if frontend_required_auth
834            && !crate::protocol::mux::auth::check_basic(front, &authorized_hashes)
835        {
836            context.www_authenticate = www_authenticate.clone();
837            trace!(
838                "{} basic-auth check failed; emitting 401",
839                log_module_context!(context)
840            );
841            return Err(RetrieveClusterError::UnauthorizedRoute);
842        }
843
844        // ── 5. Request-side mutations on the front kawa ────────────────────
845        // From here on the route is a Forward — apply the frontend's
846        // rewrite + header policy to the request kawa so the backend
847        // wire carries the operator-configured shape.
848        apply_request_rewrites_and_headers(
849            front,
850            context,
851            rewritten_host.as_deref(),
852            rewritten_path.as_deref(),
853            &headers_request,
854        );
855
856        // Pass 2 of the response-snapshot copy (see the HSTS hoist
857        // above). Runs unconditionally on the regular forward path
858        // (the early returns above bypass this site, which keeps the
859        // default-answer scope as HSTS-only). Copies EVERY edit so
860        // operator-defined `Append` response headers reach
861        // backend-served responses on both HTTP and HTTPS listeners,
862        // preserving their pre-PR scope.
863        snapshot_response_edits(&mut context.headers_response, &headers_response, |_| true);
864
865        Ok(cluster_id)
866    }
867
868    pub fn backend_from_request<L: ListenerHandler + L7ListenerHandler>(
869        &mut self,
870        cluster_id: &str,
871        frontend_should_stick: bool,
872        context: &mut HttpContext,
873        proxy: Rc<RefCell<dyn L7Proxy>>,
874        listener: &Rc<RefCell<L>>,
875    ) -> Result<(TcpStream, Rc<RefCell<Backend>>), BackendConnectionError> {
876        let (backend, conn) = self
877            .get_backend_for_sticky_session(
878                cluster_id,
879                frontend_should_stick,
880                context.sticky_session_found.as_deref(),
881                proxy,
882            )
883            .map_err(|backend_error| {
884                trace!("{} {}", log_module_context!(context), backend_error);
885                BackendConnectionError::Backend(backend_error)
886            })?;
887
888        if frontend_should_stick {
889            // update sticky name in case it changed I guess?
890            context.sticky_name = listener.borrow().get_sticky_name().to_string();
891
892            context.sticky_session = Some(
893                backend
894                    .borrow()
895                    .sticky_id
896                    .clone()
897                    .unwrap_or_else(|| backend.borrow().backend_id.to_owned()),
898            );
899        }
900
901        context.backend_id = Some(backend.borrow().backend_id.to_owned());
902        context.backend_address = Some(backend.borrow().address);
903
904        Ok((conn, backend))
905    }
906
907    fn get_backend_for_sticky_session(
908        &self,
909        cluster_id: &str,
910        frontend_should_stick: bool,
911        sticky_session: Option<&str>,
912        proxy: Rc<RefCell<dyn L7Proxy>>,
913    ) -> Result<(Rc<RefCell<Backend>>, TcpStream), BackendError> {
914        match (frontend_should_stick, sticky_session) {
915            (true, Some(sticky_session)) => proxy
916                .borrow()
917                .backends()
918                .borrow_mut()
919                .backend_from_sticky_session(cluster_id, sticky_session),
920            _ => proxy
921                .borrow()
922                .backends()
923                .borrow_mut()
924                .backend_from_cluster_id(cluster_id),
925        }
926    }
927}
928
929/// Apply the frontend's request-side rewrite + header policy to the
930/// request kawa. Mutations land before backend connect so the backend
931/// wire carries the rewritten shape:
932///
933/// 1. If `rewritten_host` is set, replace the request-line authority
934///    with the rewritten value, replace any existing `Host` request
935///    header (so H1 backends see the same value the H2 `:authority`
936///    would carry), and inject `X-Forwarded-Host` carrying the
937///    pre-rewrite authority. The X-Forwarded-Host injection ONLY fires
938///    when `rewritten_host` is set — without a rewrite there is no host
939///    swap to disclose, and HAProxy's `option forwardfor` style
940///    headers (`X-Forwarded-For`, `X-Forwarded-Proto`) still flow from
941///    the kawa parser. The pre-rewrite authority itself is captured by
942///    the caller (`route_from_request`) into `context.original_authority`
943///    on every routed request so it survives every downstream code path
944///    (audit, deny, redirect, basic-auth 401, backend-connect failure).
945///    Dedup rule: the synthetic Host AND any pre-existing Host header
946///    are dropped in the retain pass below before the rewritten Host is
947///    appended, so the wire never carries two `Host:` headers.
948/// 2. If `rewritten_path` is set, replace both the abstract path
949///    (consumed by H2 `:path`) and the request-line URI (consumed by
950///    the H1 converter) so cardinality H1↔H1, H1↔H2, H2↔H1, H2↔H2 all
951///    propagate the rewritten target.
952/// 3. For every `headers_request` edit:
953///    - empty `val` → remove every existing header with the matching
954///      name from `kawa.blocks` (HAProxy `del-header` parity);
955///    - non-empty `val` → append the header before the `end_header`
956///      flag block. Set/replace semantics: callers that want to replace
957///      a header pass two edits (one delete with empty val, one set
958///      with the new value).
959fn apply_request_rewrites_and_headers(
960    kawa: &mut super::GenericHttpStream,
961    context: &mut HttpContext,
962    rewritten_host: Option<&str>,
963    rewritten_path: Option<&str>,
964    headers_request: &[HeaderEdit],
965) {
966    use kawa::{Block, Pair, Store};
967
968    if rewritten_host.is_none() && rewritten_path.is_none() && headers_request.is_empty() {
969        return;
970    }
971
972    // `route_from_request` already captured the pre-rewrite authority
973    // into `context.original_authority`. Re-borrow it here for the
974    // optional X-Forwarded-Host injection rather than re-parsing the
975    // kawa Store. Cloning a short header value (typically `host:port`)
976    // is cheaper than another UTF-8 decode of the request-line slice.
977    let original_authority: Option<String> = if rewritten_host.is_some() {
978        context.original_authority.clone()
979    } else {
980        None
981    };
982
983    // ── status-line authority / path rewrites ─────────────────────────
984    // The kawa request status line carries both `path` and `uri` —
985    // `path` is the abstract path (consumed by the H2 converter to
986    // emit `:path`) while `uri` is the request-line URI (consumed by
987    // the H1 converter at `kawa::protocol::h1::converter`). Both must
988    // be mutated so an H1 frontend forwarding to an H1 backend AND an
989    // H2 frontend forwarding to an H1 backend (or vice versa) see the
990    // rewritten target on the wire.
991    if rewritten_host.is_some() || rewritten_path.is_some() {
992        if let kawa::StatusLine::Request {
993            authority,
994            path,
995            uri,
996            ..
997        } = &mut kawa.detached.status_line
998        {
999            if let Some(new_host) = rewritten_host {
1000                *authority = Store::from_string(new_host.to_owned());
1001            }
1002            if let Some(new_path) = rewritten_path {
1003                *path = Store::from_string(new_path.to_owned());
1004                *uri = Store::from_string(new_path.to_owned());
1005            }
1006        }
1007    }
1008
1009    // ── single-pass split: deletes vs. sets ───────────────────────────
1010    // Walk `headers_request` once and separate each edit into either the
1011    // delete list (empty val) or the insert list (non-empty val). Two
1012    // passes was wasteful when an operator stacks many `--header` flags;
1013    // one pass keeps the allocation profile flat.
1014    let host_lower = b"host";
1015    let xfh_lower = b"x-forwarded-host";
1016    let rewriting_host = rewritten_host.is_some();
1017    let mut keys_to_drop: Vec<Vec<u8>> = Vec::with_capacity(headers_request.len() + 2);
1018    let mut to_insert: Vec<Block> = Vec::with_capacity(headers_request.len() + 2);
1019    // Track whether any operator-supplied edit names Host or
1020    // X-Forwarded-Host so we always dedup the existing kawa Host header
1021    // before inserting the operator's value. Without this, an operator
1022    // who sets `--header request=Host=evil` on a frontend WITHOUT
1023    // `--rewrite-host` lands TWO `Host:` headers on the backend wire —
1024    // a request-smuggling primitive on backends that pick last-Host
1025    // (CWE-444 cousin).
1026    let mut operator_overrides_host = false;
1027    let mut operator_overrides_xfh = false;
1028    for edit in headers_request {
1029        let key_is_host = edit.key.eq_ignore_ascii_case(host_lower);
1030        let key_is_xfh = edit.key.eq_ignore_ascii_case(xfh_lower);
1031        operator_overrides_host |= key_is_host;
1032        operator_overrides_xfh |= key_is_xfh;
1033        if edit.val.is_empty() {
1034            keys_to_drop.push(edit.key.iter().map(u8::to_ascii_lowercase).collect());
1035        } else {
1036            to_insert.push(Block::Header(Pair {
1037                key: Store::from_slice(&edit.key),
1038                val: Store::from_slice(&edit.val),
1039            }));
1040        }
1041    }
1042    if rewriting_host || operator_overrides_host {
1043        keys_to_drop.push(host_lower.to_vec());
1044    }
1045    if rewriting_host || operator_overrides_xfh {
1046        keys_to_drop.push(xfh_lower.to_vec());
1047    }
1048
1049    // ── delete pass on existing blocks ────────────────────────────────
1050    let buf_ptr = kawa.storage.buffer();
1051    if !keys_to_drop.is_empty() {
1052        // Read `key.data(buf_ptr)` only on non-elided headers — kawa's
1053        // earlier passes (HPACK decoder, H1 header parser) tag suppressed
1054        // headers with `Store::Empty` rather than removing them, and
1055        // calling `.data()` on `Store::Empty` panics in
1056        // `kawa-0.6.8/src/storage/repr.rs`. Pinning the guard explicitly
1057        // until kawa changes its policy.
1058        let buf = buf_ptr;
1059        kawa.blocks.retain(|block| {
1060            if let Block::Header(Pair { key, val: _ }) = block {
1061                if matches!(key, Store::Empty) {
1062                    return true;
1063                }
1064                let key_bytes = key.data(buf);
1065                // Both `keys_to_drop` and `key_lower` are pre-lowercased,
1066                // so a byte-equality compare is sufficient — a second
1067                // ASCII-fold pass via `compare_no_case` would just burn
1068                // cycles re-folding bytes that are already canonical.
1069                let key_lower: Vec<u8> = key_bytes.iter().map(u8::to_ascii_lowercase).collect();
1070                !keys_to_drop
1071                    .iter()
1072                    .any(|k| k.as_slice() == key_lower.as_slice())
1073            } else {
1074                true
1075            }
1076        });
1077    }
1078
1079    // ── insertion before the end-of-headers flag ──────────────────────
1080    // Every header we add (rewritten Host, X-Forwarded-Host,
1081    // operator-supplied set/append edits) must land before
1082    // `Block::Flags { end_header: true }` so the converter emits them
1083    // as part of the request header block. Synthetic Host/X-Forwarded-Host
1084    // are prepended (they describe the rewrite, not an operator policy).
1085    let end_header_idx = super::shared::end_of_headers_index(kawa);
1086
1087    if rewriting_host {
1088        let mut synth: Vec<Block> = Vec::with_capacity(2);
1089        if let Some(new_host) = rewritten_host {
1090            synth.push(Block::Header(Pair {
1091                key: Store::Static(b"Host"),
1092                val: Store::from_string(new_host.to_owned()),
1093            }));
1094        }
1095        if let Some(orig) = original_authority.as_deref() {
1096            synth.push(Block::Header(Pair {
1097                key: Store::Static(b"X-Forwarded-Host"),
1098                val: Store::from_string(orig.to_owned()),
1099            }));
1100        }
1101        synth.append(&mut to_insert);
1102        to_insert = synth;
1103    }
1104    if !to_insert.is_empty() {
1105        let insert_at = end_header_idx.unwrap_or(kawa.blocks.len());
1106        for (offset, block) in to_insert.into_iter().enumerate() {
1107            kawa.blocks.insert(insert_at + offset, block);
1108        }
1109    }
1110}
1111
1112/// Copy a per-frontend response-edit slice into the per-stream
1113/// `HttpContext.headers_response` snapshot, applying `filter` to each
1114/// edit. The snapshot is cleared before the copy so a second pass on
1115/// the same context (the HSTS hoist + post-forward pattern in
1116/// `route_from_request`) overrides any earlier partial copy.
1117fn snapshot_response_edits<F>(target: &mut Vec<HeaderEditSnapshot>, src: &[HeaderEdit], filter: F)
1118where
1119    F: Fn(&HeaderEdit) -> bool,
1120{
1121    target.clear();
1122    for edit in src.iter().filter(|e| filter(e)) {
1123        target.push(HeaderEditSnapshot {
1124            key: edit.key.to_vec(),
1125            val: edit.val.to_vec(),
1126            mode: edit.mode,
1127        });
1128    }
1129}
1130
1131/// Resolve the protocol scheme to use when emitting a redirect's `Location`
1132/// header. Maps the proto enum onto `"http"` / `"https"`, with `USE_SAME`
1133/// preserving the request's scheme (HTTPS for TLS listeners, HTTP otherwise).
1134fn resolve_redirect_scheme(scheme: RedirectScheme, context: &HttpContext) -> &'static str {
1135    match scheme {
1136        RedirectScheme::UseHttps => "https",
1137        RedirectScheme::UseHttp => "http",
1138        RedirectScheme::UseSame => {
1139            if context.tls_server_name.is_some() {
1140                "https"
1141            } else {
1142                "http"
1143            }
1144        }
1145    }
1146}
1147
1148/// Build the `Location` URL for a redirect response. Defaults the port
1149/// suffix only when the operator provided one or when scheme defaults
1150/// would mismatch (port 80 on https / 443 on http stays implicit).
1151///
1152/// `host_override` and `path_override` carry the frontend's
1153/// `RewriteParts::run` output for `RedirectPolicy::PERMANENT` flows so
1154/// the 301 `Location` reflects `rewrite_host` / `rewrite_path` instead
1155/// of the original `:authority` / `:path`. The legacy
1156/// `cluster.https_redirect` path passes `None` for both — it has no
1157/// per-frontend rewrite knobs.
1158fn build_redirect_location(
1159    scheme: &str,
1160    context: &HttpContext,
1161    port: Option<u32>,
1162    host_override: Option<&str>,
1163    path_override: Option<&str>,
1164) -> String {
1165    let authority = host_override
1166        .or(context.authority.as_deref())
1167        .unwrap_or_default();
1168    let path = path_override.or(context.path.as_deref()).unwrap_or("/");
1169    // Strip an existing `:port` from the authority — operators typically
1170    // configure `https_redirect_port` precisely because the listener's
1171    // port differs from the redirect target. Bracketed IPv6 literals
1172    // like `[::1]` survive intact: `rsplit_once(':')` only triggers when
1173    // the suffix after the final `:` is entirely ASCII digits.
1174    let host_only = match authority.rsplit_once(':') {
1175        Some((host, port_part))
1176            if !port_part.is_empty() && port_part.bytes().all(|b| b.is_ascii_digit()) =>
1177        {
1178            host
1179        }
1180        _ => authority,
1181    };
1182    let port_suffix = match port {
1183        Some(80) if scheme == "http" => String::new(),
1184        Some(443) if scheme == "https" => String::new(),
1185        Some(p) => format!(":{p}"),
1186        None => String::new(),
1187    };
1188    format!("{scheme}://{host_only}{port_suffix}{path}")
1189}
1190
1191/// Exact-match test between an HTTP `:authority` / `Host` value and a TLS SNI.
1192///
1193/// Matching rules:
1194///   * The authority is stripped of its optional `:port` suffix. RFC 6066 §3
1195///     forbids a port in the SNI extension, so the SNI is compared against
1196///     the host component only.
1197///   * The comparison is case-insensitive (RFC 9110 §4.2.3 — hosts are
1198///     case-insensitive). The SNI is assumed to be already lowercased by
1199///     the caller (see `https.rs::upgrade_handshake`); only the authority
1200///     side needs on-the-fly `to_ascii_lowercase`.
1201///   * No wildcard logic: if the operator serves a wildcard certificate,
1202///     the SNI negotiated by the client is still the specific name that
1203///     client sent, and the request `:authority` must equal that specific
1204///     name exactly. This is the tightest possible TLS trust boundary.
1205///
1206/// The `:port` suffix is only stripped when the suffix is non-empty and
1207/// entirely ASCII digits. This keeps bracketed IPv6 literals like `[::1]`
1208/// intact: `rsplit_once(':')` would otherwise mis-split them.
1209pub(crate) fn authority_matches_sni(authority: &str, sni_lowercased: &str) -> bool {
1210    let host = strip_authority_port(authority);
1211    if host.len() != sni_lowercased.len() {
1212        return false;
1213    }
1214    host.as_bytes()
1215        .iter()
1216        .zip(sni_lowercased.as_bytes())
1217        .all(|(a, b)| a.to_ascii_lowercase() == *b)
1218}
1219
1220/// Strip the optional `:port` suffix from an authority value. Bracketed
1221/// IPv6 literals (`[::1]`, `[::1]:8443`) keep their inner colons intact:
1222/// the suffix is only stripped when the tail after the last `:` is
1223/// non-empty and entirely ASCII digits.
1224fn strip_authority_port(authority: &str) -> &str {
1225    match authority.rsplit_once(':') {
1226        Some((h, port)) if !port.is_empty() && port.bytes().all(|b| b.is_ascii_digit()) => h,
1227        _ => authority,
1228    }
1229}
1230
1231/// RFC 6125 §6.4.3 wildcard-aware match of `:authority` against a SAN set
1232/// snapshot taken at TLS handshake.
1233///
1234/// Returns the matched SAN entry on success so the caller can log it.
1235///
1236/// Matching rules:
1237///   * Port suffix on the authority is stripped (same logic as
1238///     [`authority_matches_sni`], IPv6-bracket safe).
1239///   * Compare is ASCII case-insensitive (`:authority` is ASCII per
1240///     RFC 9113 §8.3.1; SAN entries are stored pre-lowercased by
1241///     `https.rs::upgrade_handshake`).
1242///   * `*.suffix` matches exactly one DNS label at the leftmost position
1243///     and only when that label is non-empty: it does NOT match the apex,
1244///     does NOT cross dots, and embedded wildcards (`foo.*.example.com`,
1245///     `*foo.example.com`) are forbidden.
1246///   * Empty `names` ⇒ `None` (default-cert path — Sōzu is not
1247///     authoritative for any name).
1248pub(crate) fn authority_matched_cert_name<'a>(
1249    authority: &str,
1250    names: &'a [String],
1251) -> Option<&'a str> {
1252    let mut host = strip_authority_port(authority);
1253    // RFC 1034 §3.1 absolute-form: `example.com.` and `example.com` name
1254    // the same host. The SAN snapshot already strips trailing dots at
1255    // `https.rs::upgrade_handshake`, and the SNI side strips them at the
1256    // same site; strip on the authority side so a client emitting
1257    // absolute-form `:authority` (or H1 `Host`) does not get a false 421.
1258    // Only one trailing dot is removed because RFC 1034 forbids multiple
1259    // trailing dots on a domain literal.
1260    if let Some(trimmed) = host.strip_suffix('.') {
1261        host = trimmed;
1262    }
1263    if host.is_empty() {
1264        return None;
1265    }
1266    for entry in names {
1267        if let Some(suffix) = entry.strip_prefix("*.") {
1268            // RFC 6125 §6.4.3: the wildcard label is the *entire* left-most
1269            // label. Embedded wildcards (`f*.example.com`, `*f.example.com`)
1270            // are rejected because we reach this branch only when the entry
1271            // starts with the exact two bytes `*.`. We still must reject
1272            // wildcards anywhere else in the entry by requiring no further
1273            // `*` in `suffix`.
1274            if suffix.contains('*') {
1275                continue;
1276            }
1277            // Authority has the form `<left-most-label>.<rest>`; the
1278            // wildcard substitutes for exactly that left-most label, which
1279            // must be non-empty and contain no dot.
1280            let Some((leftmost, rest)) = host.split_once('.') else {
1281                continue;
1282            };
1283            if leftmost.is_empty() {
1284                continue;
1285            }
1286            if rest.eq_ignore_ascii_case(suffix) {
1287                return Some(entry);
1288            }
1289            continue;
1290        }
1291        if entry.contains('*') {
1292            // Internal wildcards (`foo.*.example.com`) are not RFC 6125-
1293            // valid. Skip rather than mis-match.
1294            continue;
1295        }
1296        if host.eq_ignore_ascii_case(entry) {
1297            return Some(entry);
1298        }
1299    }
1300    None
1301}
1302
1303#[cfg(test)]
1304mod tests {
1305    use super::authority_matches_sni;
1306
1307    #[test]
1308    fn match_exact() {
1309        assert!(authority_matches_sni("example.com", "example.com"));
1310    }
1311
1312    #[test]
1313    fn match_different_case() {
1314        assert!(authority_matches_sni("Example.COM", "example.com"));
1315    }
1316
1317    #[test]
1318    fn match_authority_with_port() {
1319        assert!(authority_matches_sni("example.com:8443", "example.com"));
1320    }
1321
1322    #[test]
1323    fn reject_different_host() {
1324        assert!(!authority_matches_sni(
1325            "tenant-b.example.com",
1326            "tenant-a.example.com"
1327        ));
1328    }
1329
1330    #[test]
1331    fn reject_substring_attack() {
1332        // Length check guards against an authority that is a prefix or
1333        // suffix of the SNI (or vice versa).
1334        assert!(!authority_matches_sni("example.co", "example.com"));
1335        assert!(!authority_matches_sni("example.commons", "example.com"));
1336    }
1337
1338    #[test]
1339    fn reject_wildcard_not_expanded() {
1340        // Wildcard cert selection happens at the cert-resolver layer; the SNI
1341        // we see here is the concrete name the client sent. Do not silently
1342        // accept `*.example.com` as matching `foo.example.com`.
1343        assert!(!authority_matches_sni("foo.example.com", "*.example.com"));
1344    }
1345
1346    #[test]
1347    fn ipv6_bracketed_literal_with_port() {
1348        // `[::1]:8443` must still match the SNI `[::1]`; only the trailing
1349        // `:8443` is a port (all digits → stripped).
1350        assert!(authority_matches_sni("[::1]:8443", "[::1]"));
1351    }
1352
1353    #[test]
1354    fn ipv6_bracketed_without_port() {
1355        // The `:` characters inside the brackets must not be mistaken for a
1356        // port separator: the tail after the last `:` is `1]`, not all
1357        // digits, so it is NOT stripped and the whole string compares.
1358        assert!(authority_matches_sni("[::1]", "[::1]"));
1359    }
1360}
1361
1362#[cfg(test)]
1363mod authority_matched_cert_name_tests {
1364    use super::authority_matched_cert_name;
1365
1366    #[test]
1367    fn cert_name_match_exact_single_san() {
1368        let names = vec!["example.com".to_owned()];
1369        assert_eq!(
1370            authority_matched_cert_name("example.com", &names),
1371            Some("example.com"),
1372        );
1373    }
1374
1375    #[test]
1376    fn cert_name_match_wildcard_left_most() {
1377        let names = vec!["*.cleverapps.io".to_owned()];
1378        assert_eq!(
1379            authority_matched_cert_name("staging-3.cleverapps.io", &names),
1380            Some("*.cleverapps.io"),
1381        );
1382    }
1383
1384    #[test]
1385    fn cert_name_reject_wildcard_apex() {
1386        // RFC 6125 §6.4.3: `*.example.com` does NOT cover the apex
1387        // `example.com` — the wildcard label must consume exactly one
1388        // non-empty label.
1389        let names = vec!["*.example.com".to_owned()];
1390        assert_eq!(authority_matched_cert_name("example.com", &names), None);
1391    }
1392
1393    #[test]
1394    fn cert_name_reject_wildcard_two_labels() {
1395        // `*.example.com` cannot cross dots: `a.b.example.com` has two
1396        // labels before `example.com` and must be rejected.
1397        let names = vec!["*.example.com".to_owned()];
1398        assert_eq!(authority_matched_cert_name("a.b.example.com", &names), None,);
1399    }
1400
1401    #[test]
1402    fn cert_name_reject_wildcard_not_left_most() {
1403        // Embedded wildcards (`foo.*.example.com`) are not RFC 6125-valid
1404        // and must be skipped, not mis-matched.
1405        let names = vec!["foo.*.example.com".to_owned()];
1406        assert_eq!(
1407            authority_matched_cert_name("foo.bar.example.com", &names),
1408            None,
1409        );
1410    }
1411
1412    #[test]
1413    fn cert_name_match_case_insensitive() {
1414        // ASCII case folding only — `:authority` is ASCII per RFC 9113
1415        // §8.3.1 and the snapshot is pre-lowercased at handshake.
1416        let names = vec!["EXAMPLE.com".to_owned()];
1417        assert!(authority_matched_cert_name("Example.COM", &names).is_some());
1418    }
1419
1420    #[test]
1421    fn cert_name_match_with_port() {
1422        // The port suffix on `:authority` must be stripped before the
1423        // SAN compare.
1424        let names = vec!["example.com".to_owned()];
1425        assert!(authority_matched_cert_name("example.com:8443", &names).is_some());
1426    }
1427
1428    #[test]
1429    fn cert_name_match_absolute_form_trailing_dot() {
1430        // RFC 1034 §3.1: an absolute-form domain literal carries one
1431        // trailing dot (`example.com.`) and resolves to the same host as
1432        // the relative form. The SAN snapshot stores the relative form
1433        // (https.rs strips the trailing dot at handshake), so the matcher
1434        // must strip it on the authority side too — otherwise a client
1435        // emitting an absolute-form `:authority` gets a false 421.
1436        let names = vec!["example.com".to_owned()];
1437        assert!(authority_matched_cert_name("example.com.", &names).is_some());
1438        // And with both port and trailing dot.
1439        assert!(authority_matched_cert_name("example.com.:8443", &names).is_some());
1440        // The wildcard branch must also accept the absolute form.
1441        let wildcard = vec!["*.example.com".to_owned()];
1442        assert!(authority_matched_cert_name("foo.example.com.", &wildcard).is_some());
1443    }
1444
1445    #[test]
1446    fn cert_name_match_idn_a_label() {
1447        // IDNA A-labels (xn--…) are ASCII and compare byte-for-byte once
1448        // the snapshot is lowercased.
1449        let names = vec!["xn--bcher-kva.example.com".to_owned()];
1450        assert!(authority_matched_cert_name("xn--bcher-kva.example.com", &names).is_some());
1451    }
1452
1453    #[test]
1454    fn cert_name_reject_empty_names() {
1455        // Empty snapshot = default cert served = Sōzu is not
1456        // authoritative for any name; every authority must miss.
1457        assert_eq!(authority_matched_cert_name("example.com", &[]), None);
1458    }
1459
1460    #[test]
1461    fn cert_name_match_multi_san_one_hit() {
1462        let names = vec!["foo.com".to_owned(), "*.example.org".to_owned()];
1463        assert_eq!(
1464            authority_matched_cert_name("bar.example.org", &names),
1465            Some("*.example.org"),
1466        );
1467    }
1468
1469    #[test]
1470    fn cert_name_reject_substring_attack() {
1471        // `*.example.com` must not match `example.commons` — the suffix
1472        // after the first label is `commons`, not `example.com`.
1473        let names = vec!["*.example.com".to_owned()];
1474        assert_eq!(authority_matched_cert_name("example.commons", &names), None,);
1475    }
1476
1477    #[test]
1478    fn cert_name_ipv6_bracketed_literal_with_port() {
1479        // The `:` characters inside the brackets must not be mistaken for
1480        // a port separator: only the trailing `:8443` is stripped, and
1481        // `[::1]` compares equal to `[::1]`.
1482        let names = vec!["[::1]".to_owned()];
1483        assert!(authority_matched_cert_name("[::1]:8443", &names).is_some());
1484    }
1485}
sozu_lib/protocol/mux/router.rs

sozu_lib/protocol/mux/
router.rs