tarweb 0.1.1

io-uring based static file web server, with SNI router
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
//! # SNI router.
//!
//! TCP terminating server that snoops on TLS SNI, and then passes the FD on to
//! another server, like tarweb. Or if the other server doesn't support FD
//! passing, it proxies the connection via the PROXY v1 protocol.
//!
//! The idea here is to actually make different routing decisions based on SNI,
//! and depending on the match, either pass the FD, or do TCP level proxying.
//!
//! Optionally, the SNI router can also do the TLS handshake, and set up kTLS,
//! so that the other server can just treat the connection as plaintext.
//!
//! If you enable *both* proxying (i.e. not FD passing) and TLS handshaking,
//! then make sure the path to the other server is not going over an unencrypted
//! channel, such as plain ethernet. You'll want it to be localhost, or over
//! some VPN, since the connection to the backend will not be encrypted.
//!
//! ## Notable
//!
//! * Under extremely heavy fd passing, `net.unix.max_dgram_qlen` could possibly
//!   become a factor.
//!
//! ## TODO
//!
//! * Add max connection idle time.
//! * Think more about how to best degrade if `sendmsg()` passing the FD fails
//!   with `EMSGSIZE`. Queue? Drop?
//! * Maybe leave the unix socket connected, and only try to reconnect on error?
//! * Add a bunch of tests.
// Disable overly pedantic clippy lints.
#![allow(clippy::similar_names)]

use std::net::ToSocketAddrs;
use std::os::unix::io::AsRawFd;
use std::pin::Pin;
use std::sync::Arc;

use anyhow::anyhow;
use anyhow::{Context, Result, bail};
use clap::Parser;
use tokio::io::AsyncReadExt;
use tokio::io::AsyncWriteExt;
use tokio::net::UnixDatagram;
use tracing::{debug, error, info, trace, warn};

use tarweb::sock;

/// Rust types for the config protos.
///
/// The module body is textually included from `tarweb.rs` in the build's
/// `OUT_DIR` (presumably emitted by the build script's protobuf code
/// generation; confirm in `build.rs`).
pub mod protos {
    include!(concat!(env!("OUT_DIR"), "/tarweb.rs"));
}

// Initial capacity, in bytes, of the buffers used while reading the
// ClientHello. It is only a preallocation hint handed to
// `Vec::with_capacity`; the buffers still grow if more data arrives.
const BUF_CAPACITY: usize = 2048;

// Command line options for the SNI router.
//
// NOTE: the `///` comments on the fields are picked up by clap's derive and
// shown as `--help` text, so they are user-facing strings. This header is a
// plain `//` comment on purpose: a `///` comment on the struct itself would
// become the program's "about" text.
#[derive(clap::Parser)]
struct Opt {
    /// Verbosity level. Can be error, warn, info, debug, or trace.
    #[arg(long, short, default_value = "info")]
    verbose: String,

    /// Address to listen to.
    #[arg(long, short, default_value = "[::]:443")]
    listen: std::net::SocketAddr,

    /// Restrict router to only be able to read under this directory.
    #[arg(long, default_value = "/")]
    restrict_dirs: Vec<std::path::PathBuf>,

    /// Asciiproto config.
    #[arg(long, short)]
    config: String,
}

/// Build the rustls server config for a backend's frontend TLS, if any.
///
/// Returns `Ok(None)` when no TLS section is configured. Otherwise the
/// certificate chain and the private key are read from the files named in the
/// proto (certificates first, then the key), and a TLS 1.3-only config without
/// client authentication is created.
///
/// Secret extraction is switched on for the config; `tls_handshake` calls
/// `dangerous_extract_secrets()` on connections created from it.
///
/// # Errors
///
/// Fails if either file cannot be loaded or rustls rejects the cert/key pair.
#[allow(clippy::unnecessary_wraps)]
fn load_tls(cfg: Option<&protos::backend::Tls>) -> Result<Option<Arc<rustls::ServerConfig>>> {
    cfg.map(|tls| -> Result<Arc<rustls::ServerConfig>> {
        let chain = tarweb::load_certs(&tls.cert_file)?;
        let private_key = tarweb::load_private_key(&tls.key_file)?;
        let mut server_cfg =
            rustls::ServerConfig::builder_with_protocol_versions(&[&rustls::version::TLS13])
                .with_no_client_auth()
                .with_single_cert(chain, private_key)?;
        server_cfg.enable_secret_extraction = true;
        // For debugging, a key log written to the file named by the
        // SSLKEYLOGFILE environment variable can be enabled with:
        // server_cfg.key_log = Arc::new(rustls::KeyLogFile::new());
        Ok(Arc::new(server_cfg))
    })
    .transpose()
}

/// Load backend config from the parsed proto.
///
/// This includes loading the TLS cert/key, so it's not just proto data
/// transformation.
///
/// * `be`: the concrete backend type (null, proxy or pass).
/// * `frontend_tls`: optional TLS settings; when present the cert/key files
///   are loaded via `load_tls`.
/// * `sorry`: optional fallback backend. It is loaded recursively, and must
///   not itself have a sorry server.
///
/// # Errors
///
/// Returns an error if
/// * the sorry server has its own sorry server,
/// * the sorry server has no backend type set,
/// * a null backend is combined with a sorry server, or
/// * any TLS material fails to load.
fn load_backend(
    be: &protos::backend::BackendType,
    frontend_tls: Option<&protos::backend::Tls>,
    sorry: Option<&protos::Backend>,
) -> Result<Backend> {
    if sorry.is_some_and(|s| s.sorry.is_some()) {
        return Err(anyhow!("sorry servers can't have sorry servers"));
    }
    let sorry = sorry
        .map(|s| -> Result<Backend> {
            // The config is user input: a sorry backend without a backend
            // type is a config error, not a reason to panic.
            let sorry_type = s
                .backend_type
                .as_ref()
                .ok_or_else(|| anyhow!("sorry backend missing actual backend"))?;
            load_backend(sorry_type, s.frontend_tls.as_ref(), None)
        })
        .transpose()?
        .map(Box::new);
    Ok(match be {
        protos::backend::BackendType::Null(_) => {
            // A null backend just closes the connection, so there is nothing
            // that could fail over to a sorry server.
            if sorry.is_some() {
                return Err(anyhow!("null backend with sorry server not allowed"));
            }
            Backend::Null
        }
        protos::backend::BackendType::Proxy(p) => Backend::Proxy {
            addr: p.addr.clone(),
            proxy_header: p.proxy_header,
            frontend_tls: load_tls(frontend_tls)?,
            sorry,
        },
        protos::backend::BackendType::Pass(p) => Backend::Pass {
            path: p.path.clone().into(),
            frontend_tls: load_tls(frontend_tls)?,
            sorry,
        },
    })
}

/// Attempt to load the config from file. This transitively loads any TLS
/// cert/key too.
fn load_config(filename: &str) -> Result<Config> {
    let pool = prost_reflect::DescriptorPool::decode(PROTO_DESCRIPTOR)?;
    let md = pool
        .get_message_by_name("tarweb.SNIConfig")
        .ok_or(anyhow!("Unable to reflect SNIConfig"))?;
    let cwd = std::env::current_dir()
        .map(|c| c.display().to_string())
        .unwrap_or("<unknown>".to_string());
    let txt = std::fs::read_to_string(filename)
        .context(anyhow!("opening {filename:?} from cwd {cwd:?}"))?;
    let dyn_msg = prost_reflect::DynamicMessage::parse_text_format(md, &txt)?;

    let protocfg: protos::SniConfig = dyn_msg.transcode_to()?;

    let mut config = Config {
        max_lifetime: if protocfg.max_lifetime_ms > 0 {
            Some(tokio::time::Duration::from_millis(protocfg.max_lifetime_ms))
        } else {
            None
        },
        handshake_timeout: if protocfg.handshake_timeout_ms > 0 {
            Some(tokio::time::Duration::from_millis(
                protocfg.handshake_timeout_ms,
            ))
        } else {
            None
        },
        rules: vec![],
        default_backend: {
            let (be, frontend_tls, sorry) = protocfg
                .default_backend
                .as_ref()
                .map(|d| (&d.backend_type, d.frontend_tls.as_ref(), d.sorry.as_deref()))
                .ok_or(anyhow!("Config missing default backend"))?;
            load_backend(
                be.as_ref()
                    .ok_or(anyhow!("default backend missing an actual backend"))?,
                frontend_tls,
                sorry,
            )?
        },
        default_backend_timeout: protocfg.default_backend.and_then(|b| {
            let t = b.max_lifetime_ms;
            if t > 0 {
                Some(tokio::time::Duration::from_millis(b.max_lifetime_ms))
            } else {
                None
            }
        }),
    };
    for rule in protocfg.rules {
        config.rules.push(Rule {
            re: regex::Regex::new(&rule.regex)?,
            timeout: rule.backend.as_ref().and_then(|b| {
                let t = b.max_lifetime_ms;
                if t > 0 {
                    Some(tokio::time::Duration::from_millis(b.max_lifetime_ms))
                } else {
                    None
                }
            }),
            backend: {
                let (be, frontend_tls, sorry) = rule
                    .backend
                    .as_ref()
                    .map(|d| (&d.backend_type, d.frontend_tls.as_ref(), d.sorry.as_deref()))
                    .ok_or(anyhow!("rule missing backend"))?;
                load_backend(
                    be.as_ref()
                        .ok_or(anyhow!("backend missing actual backend"))?,
                    frontend_tls,
                    sorry,
                )?
            },
        });
    }
    Ok(config)
}

/// Read enough bytes from `stream` to cover the entire TLS `ClientHello` handshake
/// (which may span multiple records).
///
/// TLS record format:
///   - 5B header: `content_type(1)=22`, `legacy_version(2)`, length(2)
///   - payload: one or more handshake messages
///
/// Handshake header:
///   - `msg_type(1)=1(ClientHello)`
///   - length(3) = `body_len`
///
/// # Returns
///
/// A tuple of
/// * every byte consumed from the socket so far (record headers included), so
///   that the caller can replay them to whoever takes over the connection, and
/// * the reassembled `ClientHello` (handshake type + length + body, record
///   headers stripped), or an error describing why the data read is not an
///   acceptable `ClientHello`.
///
/// # Errors
///
/// The outer `Result` fails only on socket read errors (including EOF before a
/// full record was received). Protocol-level problems are reported through the
/// inner `Result` so that the bytes read so far are not lost.
///
/// This function is mostly AI coded for the parsing parts. Seems to work, and
/// reviewing it it seems safe.
async fn read_tls_clienthello(
    stream: &mut tokio::net::TcpStream,
) -> Result<(Vec<u8>, Result<Vec<u8>>)> {
    const REC_HDR_LEN: usize = 5;
    const HANDSHAKE_HDR_LEN: usize = 4;
    // Upper bound on the ClientHello body we are willing to buffer. The 3-byte
    // length field is controlled by the (untrusted) peer and could otherwise
    // make us buffer up to 16 MiB, twice, per connection. 64 KiB is far more
    // than real-world ClientHellos need.
    const MAX_CLIENTHELLO_BODY: usize = 0xffff;

    let mut hello = Vec::with_capacity(BUF_CAPACITY);
    let mut bytes = Vec::with_capacity(BUF_CAPACITY);

    // We need at least first record to see handshake header (type + 3-byte len).
    // Loop records until we have full ClientHello bytes (4 + body_len).
    let mut needed: Option<usize> = None;

    while needed.is_none_or(|n| hello.len() < n) {
        // Read record header.
        let mut rec_hdr = [0u8; REC_HDR_LEN];
        stream
            .read_exact(&mut rec_hdr)
            .await
            .context("read TLS record header")?;
        bytes.extend(rec_hdr);

        // Parse header. Bytes 1..3 are legacy_version, which is ignored.
        let content_type = rec_hdr[0];
        let rec_len = usize::from(u16::from_be_bytes([rec_hdr[3], rec_hdr[4]]));

        // Confirm it's Handshake.
        if content_type != 22 {
            return Ok((
                bytes,
                Err(anyhow!(
                    "unexpected TLS content_type {content_type}, want 22 (handshake)"
                )),
            ));
        }
        // A zero-length record would let the peer keep us looping without
        // ever making progress.
        if rec_len == 0 {
            return Ok((bytes, Err(anyhow!("zero-length TLS record"))));
        }

        // Read whole record.
        let mut rec_payload = vec![0u8; rec_len];
        stream
            .read_exact(&mut rec_payload)
            .await
            .context("read TLS record payload")?;

        // Append to handshake buffer (could contain partial or full ClientHello).
        hello.extend(&rec_payload);
        bytes.extend(&rec_payload);

        // If we haven't established how many bytes we need, try now.
        if needed.is_none() {
            if hello.len() < HANDSHAKE_HDR_LEN {
                // Not enough to read handshake header yet; continue.
                continue;
            }
            let msg_type = hello[0];
            if msg_type != 1 {
                return Ok((
                    bytes,
                    Err(anyhow!(
                        "first handshake msg is type {msg_type}, expected 1 (ClientHello)"
                    )),
                ));
            }
            let body_len = (usize::from(hello[1]) << 16)
                | (usize::from(hello[2]) << 8)
                | usize::from(hello[3]);
            if body_len > MAX_CLIENTHELLO_BODY {
                return Ok((
                    bytes,
                    Err(anyhow!(
                        "ClientHello body of {body_len} bytes exceeds limit of {MAX_CLIENTHELLO_BODY}"
                    )),
                ));
            }
            needed = Some(HANDSHAKE_HDR_LEN + body_len);
        }
    }

    // Truncate to exactly the ClientHello. The last record read may carry
    // more handshake data than the ClientHello's declared length, since a
    // record can contain more than one handshake message. `bytes` still holds
    // everything that was read.
    let n = needed.expect("loop only exits once the needed length is known");
    hello.truncate(n);
    Ok((bytes, Ok(hello)))
}

/// Hand the client connection over to another process.
///
/// Sends one datagram on `sock` whose payload is `bytes` (everything read from
/// the client so far) and whose ancillary data carries the file descriptor of
/// `stream` as `SCM_RIGHTS`.
///
/// `stream` is owned by this function and dropped when it returns, whether the
/// send succeeded or not.
///
/// # Errors
///
/// * Waiting for writability or the `sendmsg()` call itself failed. Nothing
///   was handed over, so a caller may still fall back to a sorry server.
/// * `sendmsg()` reported fewer bytes sent than `bytes.len()`. This is not
///   safe to fall back from.
async fn pass_fd_over_uds(
    stream: tokio::net::TcpStream,
    sock: UnixDatagram,
    bytes: Vec<u8>,
) -> Result<()> {
    use nix::sys::socket::{ControlMessage, MsgFlags, sendmsg};

    // Wait asynchronously until the socket claims to be writable. The send
    // below is a plain syscall, but after this wait, and with `MSG_DONTWAIT`
    // set, it should neither block nor normally fail with "would block".
    sock.writable().await?;

    let fds = [stream.as_raw_fd()];
    let payload = [std::io::IoSlice::new(&bytes)];
    let ancillary = [ControlMessage::ScmRights(&fds)];
    let flags = MsgFlags::MSG_NOSIGNAL | MsgFlags::MSG_DONTWAIT;

    // A failure of the call as a whole is sorryable.
    let sent = sendmsg::<()>(sock.as_raw_fd(), &payload, &ancillary, flags, None)
        .context("sendmsg SCM_RIGHTS")?;

    if sent == bytes.len() {
        return Ok(());
    }
    // A short send is not sorryable.
    Err(anyhow!(
        "sendmsg: expected to send {} bytes, sent {sent}",
        bytes.len()
    ))
}

/// Extract SNI `host_name` from a TLS `ClientHello` (handshake header + body).
/// Returns Ok(Some(host)) if found, Ok(None) if no SNI extension exists.
///
/// This function is mostly jipptycoded. Seems to work, and reviewing it it seems
/// safe.
fn extract_sni(clienthello: &[u8]) -> Result<Option<String>> {
    // Handshake header: type(1)=1, len(3)
    if clienthello.len() < 4 {
        bail!("ClientHello too short for handshake header");
    }
    if clienthello[0] != 1 {
        bail!("not a ClientHello (handshake type {})", clienthello[0]);
    }
    let body_len = ((clienthello[1] as usize) << 16)
        | ((clienthello[2] as usize) << 8)
        | (clienthello[3] as usize);
    if clienthello.len() < 4 + body_len {
        bail!("truncated ClientHello body");
    }
    let body = &clienthello[4..4 + body_len];

    let mut i = 0usize;
    // legacy_version(2) + random(32) + session_id_len(1)
    if body.len() < 35 {
        bail!("ClientHello body too short");
    }
    i += 2 + 32;
    let sid_len = body[i] as usize;
    i += 1;
    if body.len() < i + sid_len {
        bail!("truncated session_id");
    }
    i += sid_len;

    // cipher_suites: len(2) + entries (each 2 bytes)
    if body.len() < i + 2 {
        bail!("missing cipher_suites length");
    }
    let cs_len = u16::from_be_bytes([body[i], body[i + 1]]) as usize;
    i += 2;
    if body.len() < i + cs_len || !cs_len.is_multiple_of(2) {
        bail!("invalid cipher_suites vector");
    }
    i += cs_len;

    // compression_methods: len(1) + values
    if body.len() < i + 1 {
        bail!("missing compression_methods length");
    }
    let cmethod_len = body[i] as usize;
    i += 1;
    if body.len() < i + cmethod_len {
        bail!("invalid compression_methods vector");
    }
    i += cmethod_len;

    // optional extensions: len(2) + vector
    if i == body.len() {
        return Ok(None); // no extensions -> no SNI
    }
    if body.len() < i + 2 {
        bail!("missing extensions length");
    }
    let ext_total = u16::from_be_bytes([body[i], body[i + 1]]) as usize;
    i += 2;
    if body.len() < i + ext_total {
        bail!("truncated extensions block");
    }

    let mut j = i;
    while j + 4 <= i + ext_total {
        let etype = u16::from_be_bytes([body[j], body[j + 1]]);
        let elen = u16::from_be_bytes([body[j + 2], body[j + 3]]) as usize;
        j += 4;
        if j + elen > i + ext_total {
            bail!("truncated extension body");
        }
        if etype == 0x0000 {
            // server_name ext
            let ext = &body[j..j + elen];
            if ext.len() < 2 {
                bail!("server_name: missing list length");
            }
            let list_len = u16::from_be_bytes([ext[0], ext[1]]) as usize;
            if ext.len() < 2 + list_len {
                bail!("server_name: truncated list");
            }
            let mut k = 2usize;
            while k + 3 <= 2 + list_len {
                let name_type = ext[k];
                let host_len = u16::from_be_bytes([ext[k + 1], ext[k + 2]]) as usize;
                k += 3;
                if k + host_len > 2 + list_len {
                    bail!("server_name: truncated host entry");
                }
                if name_type == 0 {
                    let host_bytes = &ext[k..k + host_len];
                    // RFC 6066: ASCII, no trailing dot, no NULs. We’ll do a lossy UTF-8 just in case.
                    let host = String::from_utf8_lossy(host_bytes).to_string();
                    return Ok(Some(host));
                }
                k += host_len;
            }
            // SNI ext present but no host_name item
            return Ok(None);
        }
        j += elen;
    }

    Ok(None)
}

/// In process backend config.
///
/// This is not just the proto because TLS configs are loaded too, and it
/// includes other TLS server configs set.
///
/// Built by `load_backend`. The `sorry` fields hold an optional fallback
/// backend; `load_backend` rejects a fallback that has its own fallback, so
/// the nesting is at most one level deep.
#[derive(Debug)]
enum Backend {
    /// Just close the connection.
    Null,

    /// Connect to a unix socket and pass in bytes read so far, and the file
    /// descriptor to continue.
    Pass {
        // Filesystem path of the unix datagram socket to connect to.
        path: std::path::PathBuf,
        // TLS server config loaded by `load_tls`, if the proto had a frontend
        // TLS section.
        frontend_tls: Option<Arc<rustls::ServerConfig>>,
        // Fallback backend, if any.
        sorry: Option<Box<Backend>>,
    },

    /// Proxy string. DNS resolved on every new connection.
    ///
    /// If a TlsConfig is provided then the handshake and kTLS setup is done by
    /// the SNI router.
    Proxy {
        // Backend address, resolved in `connect_for_proxy`.
        addr: String,
        // Whether to send a `PROXY ...` line to the backend before any other
        // data.
        proxy_header: bool,
        // TLS server config loaded by `load_tls`, if the proto had a frontend
        // TLS section.
        frontend_tls: Option<Arc<rustls::ServerConfig>>,
        // Fallback backend, if any.
        sorry: Option<Box<Backend>>,
    },
}

/// One routing rule, built by `load_config` from a rule in the proto config.
#[derive(Debug)]
struct Rule {
    // Compiled from the rule's `regex` string. Presumably matched against the
    // SNI host name; the matching code is not in this part of the file.
    re: regex::Regex,
    // Backend loaded for this rule.
    backend: Backend,
    // The rule backend's `max_lifetime_ms`; `None` if that was zero.
    timeout: Option<tokio::time::Duration>,
}

/// In-process form of the router config, produced by `load_config`.
///
/// All durations are `None` when the corresponding `*_ms` proto field is zero.
#[derive(Debug)]
struct Config {
    // From the top-level `max_lifetime_ms`.
    max_lifetime: Option<tokio::time::Duration>,
    // From the top-level `handshake_timeout_ms`.
    handshake_timeout: Option<tokio::time::Duration>,
    // Rules, in the order they appear in the config file.
    rules: Vec<Rule>,
    // Mandatory default backend; presumably used when no rule applies
    // (verify against the routing code).
    default_backend: Backend,
    // From the default backend's own `max_lifetime_ms`.
    default_backend_timeout: Option<tokio::time::Duration>,
}

/// After going through rules, sorries and backups, we have finally found and
/// connected to the backend we're going to use.
///
/// The timeout is either the global config max lifetime or a per rule maximum.
///
/// The thing that actually connects to a backend doesn't know what the timeout
/// is, nor does the connection loop need to know, so `ConnectedBackend` doesn't
/// contain the timeout.
struct RoutedConnection {
    // The backend that was connected to, or handed off to.
    backend: ConnectedBackend,
    // Maximum lifetime to apply to this connection, if any.
    timeout: Option<tokio::time::Duration>,
}

/// A successful connect has happened, and just needs to do its thing.
///
/// Consumed by `handle_connected_backend`.
enum ConnectedBackend {
    /// File descriptor passed to another process. Nothing more to do.
    Done,

    /// All the data needed to handshake with the backend and proxy the
    /// connection.
    ///
    /// Timeout is already applied to the reader of this at call time.
    Proxy {
        // The client-side connection. Its peer address is what ends up as the
        // source in the PROXY header.
        stream: tokio::net::TcpStream,
        // Bytes already read from the client. They are fed to the TLS
        // handshake when `frontend_tls` is set, otherwise written to the
        // backend ahead of any further data.
        bytes: Vec<u8>,
        // The established connection to the backend.
        conn: tokio::net::TcpStream,
        // Whether to send a `PROXY ...` line to the backend first.
        proxy_header: bool,
        // If set, a TLS handshake is performed on `stream` before proxying.
        frontend_tls: Option<Arc<rustls::ServerConfig>>,
    },
}

/// Perform TLS handshake and setsockopt with kTLS.
///
/// `bytes` is what has already been read from `stream` (normally the
/// `ClientHello`). It is fed to rustls first, and more data is read from the
/// socket as needed until rustls reports the handshake as complete. Then the
/// `tls` upper layer protocol is enabled on the socket and the negotiated
/// RX/TX secrets are handed to the kernel.
///
/// Returns the new stream and the new initial bytes, i.e. the plaintext that
/// rustls had already decrypted by the time the handshake finished.
///
/// NOTE(review): raw bytes still sitting in `bytes` when the handshake
/// finishes are not returned and are dropped. Confirm that a client can't
/// have sent more data at that point, or that this is acceptable.
///
/// # Errors
///
/// Fails on TLS protocol errors, socket I/O errors, EOF during the handshake,
/// more than 8192 bytes of unprocessed handshake data, or if the kernel
/// rejects any of the kTLS socket options.
///
/// # Panics
///
/// Panics if rustls hands out fewer plaintext bytes than it announced.
async fn tls_handshake(
    mut stream: tokio::net::TcpStream,
    mut bytes: Vec<u8>,
    cfg: Arc<rustls::ServerConfig>,
) -> Result<(tokio::net::TcpStream, Vec<u8>)> {
    use std::io::Read;
    use tokio::io::AsyncWriteExt;

    debug!("Handshaking…");

    // If this fails, we could actually still continue with a sorry server in
    // the caller, but it seems like a very unlikely case, so let's just fail.
    //
    // Anything after creating the config is unsafe to go to sorry-server.
    let mut tls = rustls::ServerConnection::new(cfg)
        .context("creating TLS server config: This is sorry-able, but is not implemented")?;
    loop {
        // Give bytes we have to rustls, and drop what it consumed.
        {
            let mut cur = std::io::Cursor::new(&bytes);
            let n = tls.read_tls(&mut cur)?;
            bytes.drain(0..n);
        }
        let io = tls.process_new_packets()?;

        // Send rustls bytes to the peer.
        let bytes_to_write = io.tls_bytes_to_write();
        if bytes_to_write > 0 {
            let mut buf = vec![0u8; bytes_to_write];
            let mut cur = std::io::Cursor::new(&mut buf);
            let n = tls.write_tls(&mut cur)?;
            // TODO: can we assume remote side will not be overwhelmed?
            // If it is, and insists on writing, then we deadlock (time out).
            stream.write_all(&buf[..n]).await?;
        }
        let still_handshaking = tls.is_handshaking();
        if !still_handshaking {
            // Collect any plaintext rustls already decrypted; the caller has
            // to treat it as the start of the plaintext stream.
            let plain_n = io.plaintext_bytes_to_read();
            let mut buf = vec![0u8; plain_n];
            let n = tls.reader().read(&mut buf[..plain_n])?;
            assert_eq!(plain_n, n);

            // Enable initial TLS option.
            let ulp_name = b"tls\0";
            // SAFETY: `ulp_name` is a live buffer, and the length passed is
            // exactly its size. The fd is valid as long as `stream` is.
            let rc = unsafe {
                libc::setsockopt(
                    stream.as_raw_fd(),
                    libc::SOL_TCP,
                    libc::TCP_ULP,
                    ulp_name.as_ptr().cast(),
                    ulp_name.len().try_into()?,
                )
            };
            if rc < 0 {
                // setsockopt() returns -1 on failure; the reason is in errno,
                // not in the return value.
                let err = std::io::Error::last_os_error();
                return Err(anyhow!("setsockopt()=>{rc}: {err}"));
            }

            // Hand over keys.
            let suite = tls
                .negotiated_cipher_suite()
                .ok_or_else(|| anyhow!("no cipher suite negotiated after TLS handshake"))?;
            let keys = tls.dangerous_extract_secrets()?;
            let tls_rx = ktls::CryptoInfo::from_rustls(suite, keys.rx)?;
            let tls_tx = ktls::CryptoInfo::from_rustls(suite, keys.tx)?;
            for (name, s) in [(libc::TLS_RX, tls_rx), (libc::TLS_TX, tls_tx)] {
                // SAFETY: relies on `ktls::CryptoInfo` returning a pointer and
                // a size that describe the same live object `s`, which
                // outlives the call.
                let rc = unsafe {
                    libc::setsockopt(
                        stream.as_raw_fd(),
                        libc::SOL_TLS,
                        name,
                        s.as_ptr(),
                        s.size().try_into()?,
                    )
                };
                if rc < 0 {
                    // As above: the error is in errno, `rc` is just -1.
                    let err = std::io::Error::last_os_error();
                    return Err(anyhow!("setsockopt()=>{rc}: {err}"));
                }
            }
            return Ok((stream, buf));
        }

        // Handshake still going.
        let mut buf = [0u8; 4096];
        let n = stream.read(&mut buf).await?;
        if n == 0 {
            return Err(std::io::Error::new(
                std::io::ErrorKind::UnexpectedEof,
                "EOF during handshake",
            )
            .into());
        }
        bytes.extend(&buf[..n]);

        // Bound how much unprocessed handshake data a peer can make us hold.
        // TODO: what should this magic value be?
        if bytes.len() > 8192 {
            return Err(anyhow!("max TLS outstanding size exceeded"));
        }
    }
}

/// Do a connect for proxied connections.
///
/// `addr` is resolved to socket addresses, which are tried in order until one
/// connect succeeds. `id` is only used to tag log lines.
///
/// This is called under handshake timeout, and failure will fall back to sorry
/// server.
///
/// # Errors
///
/// Fails if `addr` can't be resolved, or if none of the resolved addresses
/// accepted the connection.
async fn connect_for_proxy(id: usize, addr: &str) -> Result<tokio::net::TcpStream> {
    // `to_socket_addrs()` may do a blocking DNS lookup. Run it on the blocking
    // thread pool so that a slow resolver doesn't stall an async worker thread
    // and every connection scheduled on it.
    let target = addr.to_owned();
    let addrs = tokio::task::spawn_blocking(move || {
        target
            .to_socket_addrs()
            .map(|resolved| resolved.collect::<Vec<_>>())
    })
    .await
    .context("address resolution task failed")??;

    for candidate in addrs {
        match tokio::net::TcpStream::connect(candidate).await {
            Ok(conn) => {
                trace!("id={id} Connected to backend {candidate}");
                return Ok(conn);
            }
            Err(e) => {
                debug!("id={id} Failed to connect to backend {candidate:?}: {e}");
            }
        }
    }
    Err(anyhow!(
        "failed to connect to any backend with address {addr}"
    ))
}

/// Run whatever is left to do once a backend has been settled on and the
/// handshake timeout no longer applies.
///
/// For a passed file descriptor that is nothing; for a proxied connection it
/// is the proxying itself.
///
/// Failures here do NOT fall back to a sorry server: the backend is already
/// connected.
async fn handle_connected_backend(id: usize, backend: ConnectedBackend) -> Result<()> {
    match backend {
        ConnectedBackend::Proxy {
            stream: client,
            bytes: preread,
            conn: upstream,
            proxy_header,
            frontend_tls,
        } => {
            handle_connected_proxy(id, client, preread, upstream, proxy_header, frontend_tls).await
        }

        // The fd went to another process, so there is nothing to proxy.
        ConnectedBackend::Done => Ok(()),
    }
}

/// Proxy between the client (`stream`) and the already connected backend
/// (`conn`), after an optional frontend TLS handshake.
///
/// With `tls` set, the handshake is performed on `stream` first, and the
/// plaintext it yields replaces `bytes`. Then, towards the backend, an
/// optional PROXY v1 line is written, followed by `bytes`, followed by
/// everything the client sends. Data from the backend is copied to the client
/// concurrently. Each direction shuts down its write side once its source hits
/// EOF.
///
/// Failures here do NOT fall back to a sorry server: the backend is already
/// connected.
async fn handle_connected_proxy(
    id: usize,
    stream: tokio::net::TcpStream,
    bytes: Vec<u8>,
    mut conn: tokio::net::TcpStream,
    proxy_header: bool,
    tls: Option<Arc<rustls::ServerConfig>>,
) -> Result<()> {
    let (mut stream, bytes) = match tls {
        Some(cfg) => tls_handshake(stream, bytes, cfg).await?,
        None => (stream, bytes),
    };
    let (mut backend_rx, mut backend_tx) = conn.split();
    let (mut client_rx, mut client_tx) = stream.split();

    let client_to_backend = async {
        if proxy_header {
            let local = client_rx.local_addr()?;
            let remote = client_rx.peer_addr()?;
            let family = match remote {
                std::net::SocketAddr::V4(_) => "TCP4",
                std::net::SocketAddr::V6(_) => "TCP6",
            };
            let header = format!(
                "PROXY {family} {} {} {} {}\r\n",
                remote.ip(),
                local.ip(),
                remote.port(),
                local.port()
            );
            backend_tx.write_all(header.as_bytes()).await?;
        }
        // Replay what was read before the backend was connected: the
        // ClientHello, or the first plaintext after a frontend handshake.
        backend_tx.write_all(&bytes).await?;
        tokio::io::copy(&mut client_rx, &mut backend_tx).await?;
        backend_tx.shutdown().await?;
        trace!("id={id} Upstream write completed");
        Ok::<_, anyhow::Error>(())
    };
    let backend_to_client = async {
        tokio::io::copy(&mut backend_rx, &mut client_tx).await?;
        client_tx.shutdown().await?;
        trace!("id={id} Downstream write completed");
        Ok::<_, anyhow::Error>(())
    };

    // Run both directions concurrently; the first error ends both.
    tokio::try_join!(client_to_backend, backend_to_client)?;
    Ok(())
}

/// Connect to `backend`, or hand the client connection over to it.
///
/// Called once a matching backend config has been found. When connecting to
/// the primary backend fails and that backend has a `sorry` backend
/// configured, the same procedure is retried against the sorry backend.
///
/// Per backend kind:
/// * `Null`: nothing is contacted; the result is `ConnectedBackend::Done`.
/// * `Pass`: after an optional frontend TLS handshake, the client stream and
///   pre-read bytes are given to `pass_fd_over_uds` together with a connected
///   Unix datagram socket; the result is `ConnectedBackend::Done`.
/// * `Proxy`: a connection is opened with `connect_for_proxy` and returned,
///   along with the client stream, as `ConnectedBackend::Proxy`.
///
/// This returns a boxed, pinned future instead of being an `async fn` because
/// the sorry fallback makes the function call itself. A directly recursive
/// `async fn` would need a future type that contains itself, so the recursion
/// has to go through a `Box`.
fn connect_or_handoff_backend<'a>(
    id: usize,
    stream: tokio::net::TcpStream,
    bytes: Vec<u8>,
    backend: &'a Backend,
) -> Pin<Box<dyn std::future::Future<Output = Result<ConnectedBackend>> + Send + 'a>> {
    Box::pin(async move {
        match backend {
            Backend::Null => {
                trace!("id={id} Null backend. Closing");
                Ok(ConnectedBackend::Done)
            }
            Backend::Pass {
                path,
                frontend_tls,
                sorry,
            } => {
                // Connecting a UnixDatagram should be cheap and invisible to
                // the backend; only the later SendMsg can cause load. So the
                // connect comes first, to avoid doing a handshake for a
                // backend that then turns out to be unreachable.
                //
                // Besides, perhaps the sorry server doesn't have frontend TLS
                // enabled.
                let sock = tokio::net::UnixDatagram::unbound().context("create UnixDatagram")?;
                let connected = sock
                    .connect(path)
                    .with_context(|| format!("connect to {:?}", path.display()));
                if let Err(e) = connected {
                    info!("Primary backend connect failure: {e}");
                    return match sorry {
                        Some(s) => connect_or_handoff_backend(id, stream, bytes, s).await,
                        None => Err(e),
                    };
                }
                // This doesn't work, because we're using DGRAM. Maybe it works with
                // SEQPACKET?
                if false {
                    // A failure here could in principle fall back to the sorry
                    // server, but since this doesn't work anyway, shrug.
                    let ucred = nix::sys::socket::getsockopt(
                        &sock,
                        nix::sys::socket::sockopt::PeerCredentials,
                    )?;
                    debug!(
                        "id={id} peer pid={} uid={} gid={}",
                        ucred.pid(),
                        ucred.uid(),
                        ucred.gid()
                    );
                }
                // From here on failures are final: no sorry fallback.
                let (stream, bytes) = match frontend_tls {
                    Some(tls) => tls_handshake(stream, bytes, tls.clone()).await?,
                    None => (stream, bytes),
                };
                pass_fd_over_uds(stream, sock, bytes).await?;
                Ok(ConnectedBackend::Done)
            }
            Backend::Proxy {
                addr,
                proxy_header,
                frontend_tls,
                sorry,
            } => match connect_for_proxy(id, addr).await {
                Ok(conn) => Ok(ConnectedBackend::Proxy {
                    stream,
                    bytes,
                    conn,
                    proxy_header: *proxy_header,
                    frontend_tls: frontend_tls.clone(),
                }),
                Err(e) => {
                    info!("Primary backend connect failure: {e}");
                    if let Some(s) = sorry {
                        return connect_or_handoff_backend(id, stream, bytes, s).await;
                    }
                    Err(e)
                }
            },
        }
    })
}

/// Run `connect_or_handoff_backend` for `backend`, bounded by `timeout`.
///
/// `timeout` is the per rule limit for connecting (or handing off) to that
/// backend. `None` means no limit is applied at this level. When the limit is
/// exceeded the result is a "backend connect/handoff timeout" error.
///
/// It's also running under the global `max_lifetime_ms`, like everything else.
async fn connect_or_handoff_backend_with_timeout(
    id: usize,
    stream: tokio::net::TcpStream,
    bytes: Vec<u8>,
    backend: &Backend,
    timeout: Option<tokio::time::Duration>,
) -> Result<ConnectedBackend> {
    let fut = connect_or_handoff_backend(id, stream, bytes, backend);
    match timeout {
        None => fut.await,
        Some(timeout) => tokio::time::timeout(timeout, fut)
            .await
            .unwrap_or_else(|e| Err(anyhow!("backend connect/handoff timeout: {e}"))),
    }
}

/// Report whether `re` matches `text` in its entirety.
///
/// The match reported by `re.find` must start at offset 0 and end at
/// `text.len()`. No match at all gives `false`.
///
/// NOTE(review): only the single match returned by `find` is inspected. If
/// the regex engine prefers a shorter alternative (e.g. `foo|foobar` against
/// `foobar`), this presumably returns `false` even though the pattern could
/// cover the whole string. Confirm whether rule patterns need explicit
/// anchoring.
fn is_full_match(re: &regex::Regex, text: &str) -> bool {
    matches!(re.find(text), Some(m) if m.start() == 0 && m.end() == text.len())
}

/// Pick the backend for a new client and connect to it.
///
/// The SNI from the client's TLS ClientHello is compared against
/// `config.rules` in order, and the first rule whose regex fully matches
/// decides the backend and its timeout. `config.default_backend` is used when
/// there is no usable ClientHello, no SNI, or no matching rule.
///
/// A failure to connect to the chosen backend is returned as an error; other
/// rules are not tried afterwards.
///
/// This is called under global `max_lifetime_ms` and `handshake_timeout_ms`
/// timeout.
async fn route_and_connect(
    id: usize,
    mut stream: tokio::net::TcpStream,
    config: &Config,
) -> Result<RoutedConnection> {
    // Read and validate a full TLS ClientHello. `bytes` is everything read so
    // far, and is handed on to the backend whichever one is chosen.
    let (bytes, clienthello) = read_tls_clienthello(&mut stream).await?;
    match clienthello {
        Ok(hello) => {
            debug!("id={id} ClientHello len={} bytes", hello.len());
            if let Some(sni) = extract_sni(&hello)? {
                debug!("id={id} SNI: {sni:?}");

                let matched = config
                    .rules
                    .iter()
                    .find(|rule| is_full_match(&rule.re, &sni));
                if let Some(rule) = matched {
                    trace!("id={id} SNI {sni} matched rule {rule:?}");
                    let backend = connect_or_handoff_backend_with_timeout(
                        id,
                        stream,
                        bytes,
                        &rule.backend,
                        rule.timeout,
                    )
                    .await?;
                    return Ok(RoutedConnection {
                        backend,
                        timeout: rule.timeout,
                    });
                }
            } else {
                warn!("id={id} Failed to extract SNI");
            }
        }
        Err(e) => {
            warn!("id={id} Using default backend because no clienthello: {e}");
        }
    }

    // Nothing matched: fall back to the default backend.
    let backend = connect_or_handoff_backend_with_timeout(
        id,
        stream,
        bytes,
        &config.default_backend,
        config.default_backend_timeout,
    )
    .await?;
    Ok(RoutedConnection {
        backend,
        timeout: config.default_backend_timeout,
    })
}

/// Handle one accepted client connection from start to finish.
///
/// First the connection is routed and its backend connected
/// (`route_and_connect`), bounded by `config.handshake_timeout` if set. Then
/// the connected backend is served (`handle_connected_backend`), bounded by
/// the timeout that came with the chosen route, if any.
///
/// Called under `max_lifetime_ms` timeout.
///
/// # Errors
///
/// Errors from routing or from the backend are returned unchanged. When one
/// of the two timeouts expires, the error message names the phase that timed
/// out.
async fn handle_conn(id: usize, stream: tokio::net::TcpStream, config: &Config) -> Result<()> {
    let fut = route_and_connect(id, stream, config);
    let routed = if let Some(timeout) = config.handshake_timeout {
        match tokio::time::timeout(timeout, fut).await {
            Ok(r) => r?,
            Err(e) => return Err(anyhow!("handshake timeout: {e}")),
        }
    } else {
        fut.await?
    };

    let fut = handle_connected_backend(id, routed.backend);
    if let Some(timeout) = routed.timeout {
        // Label this timeout the same way the handshake timeout above is
        // labelled. A bare `?` would surface only the elapsed-deadline
        // message, with no hint of which limit fired.
        match tokio::time::timeout(timeout, fut).await {
            Ok(r) => r,
            Err(e) => Err(anyhow!("connected backend timeout: {e}")),
        }
    } else {
        fut.await
    }
}

/// Accept connections on `listener` forever, handling each in its own task.
///
/// Every accepted connection gets the next sequential `id` and is handled by
/// `handle_conn` in a spawned task, using the config that was current when
/// the connection was accepted. If `max_lifetime` is set in that config, the
/// whole handling is bounded by it. Failures of a single connection are
/// logged and do not affect the loop.
///
/// On SIGHUP the config is reloaded from `config_filename`. If loading fails,
/// the error is logged and the previous config stays in effect.
///
/// # Errors
///
/// Returns an error if the SIGHUP handler cannot be registered, or if
/// `accept` on the listener fails. An `accept` error ends the loop.
async fn mainloop(
    mut config: Arc<Config>,
    config_filename: &str,
    listener: tokio::net::TcpListener,
) -> Result<()> {
    let mut id = 0;
    // Registration can fail for environmental reasons, so report it to the
    // caller instead of panicking.
    let mut hups = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::hangup())
        .context("Registering SIGHUP")?;
    loop {
        let (stream, peer) = tokio::select! {
            r = listener.accept() => r,
            _ = hups.recv() => {
                // The working directory is logged only to help make sense of
                // a relative config path.
                let cwd = std::env::current_dir()
                    .map(|c| c.display().to_string())
                    .unwrap_or_else(|_| "<unknown>".to_string());
                info!("Got SIGHUP. Loading new config {config_filename:?} in cwd {cwd:?}");
                match load_config(config_filename) {
                    Ok(c) => config = Arc::new(c),
                    Err(e) => error!(
                        "Failed to load config {config_filename:?}, staying with old config: {e}"
                    ),
                }
                continue;
            }
        }?;
        debug!("id={id} fd={} Accepted {}", stream.as_raw_fd(), peer);
        // Each task keeps the config it started with, so a later reload only
        // affects connections accepted after it.
        let config = config.clone();
        tokio::spawn(async move {
            let fut = handle_conn(id, stream, &config);
            let res = if let Some(timeout) = config.max_lifetime {
                match tokio::time::timeout(timeout, fut).await {
                    Ok(o) => o,
                    Err(e) => Err(anyhow!("connection timeout for peer {peer}: {e}")),
                }
            } else {
                fut.await
            };
            if let Err(e) = res {
                warn!("id={id} Handling connection to {peer}: {e:#}");
            }
            debug!("id={id} Done");
        });
        id += 1;
    }
}

/// Contents of `descriptor.bin`, embedded into the binary at compile time.
///
/// NOTE(review): presumably the serialized protobuf descriptor for the config
/// format. Confirm against its users elsewhere in this file.
const PROTO_DESCRIPTOR: &[u8] = include_bytes!("../../descriptor.bin");

/// Entry point of the SNI router.
///
/// In order: parse the command line, install the rustls crypto provider, set
/// up logging to stderr, bind the listening socket, call
/// `tarweb::privs::sni_drop` with the configured `restrict_dirs`, set nodelay
/// on the listener, load the config file, and run `mainloop`.
#[tokio::main]
async fn main() -> Result<()> {
    let opt = Opt::parse();

    // This is only needed for integration tests, that get multiple crypto
    // implementation features turned on, so we have to pick one.
    rustls::crypto::aws_lc_rs::default_provider()
        .install_default()
        .unwrap();

    tracing_subscriber::fmt()
        .with_env_filter(&opt.verbose)
        .with_writer(std::io::stderr)
        .init();
    info!("SNI Router");

    let listener = tokio::net::TcpListener::bind(&opt.listen)
        .await
        .with_context(|| format!("listening to {}", opt.listen))?;
    debug!("Listening on {}", listener.local_addr()?);

    let restrict_dirs: Vec<&std::path::Path> =
        opt.restrict_dirs.iter().map(|dir| dir.as_path()).collect();
    tarweb::privs::sni_drop(&restrict_dirs)?;
    sock::set_nodelay(listener.as_raw_fd())?;

    // Config.
    let config =
        load_config(&opt.config).with_context(|| format!("Loading config {:?}", opt.config))?;
    mainloop(Arc::new(config), &opt.config, listener).await
}

#[cfg(test)]
mod tests {
    #![allow(clippy::too_many_lines)]
    use super::*;
    use std::net::SocketAddr;
    use std::sync::atomic::Ordering;

    /// Upper bound for any single test connection, and for the router's
    /// `max_lifetime` in these tests.
    const MAX_TEST_CONNECTION_TIME: tokio::time::Duration = tokio::time::Duration::from_secs(5);

    /// `handshake_timeout_ms` in the config file ends up in
    /// `Config::handshake_timeout`.
    #[test]
    fn config_loads_handshake_timeout() -> Result<()> {
        let tmp_dir = tempfile::TempDir::new()?;
        let config_file = tmp_dir.path().join("config.cfg");
        std::fs::write(
            &config_file,
            r#"
default_backend: <
        null: <>
>
handshake_timeout_ms: 1234
"#,
        )?;

        let config = load_config(config_file.to_str().unwrap())?;
        assert_eq!(
            config.handshake_timeout,
            Some(tokio::time::Duration::from_millis(1234))
        );
        Ok(())
    }

    /// Drive real `curl` clients through the router and check that each SNI
    /// reaches the backend its rule selects.
    ///
    /// Every combination of minimum TLS version option and SNI gets a fresh
    /// router and fresh backends. The backends only need to see a connection
    /// (or a datagram, for the socket backend); curl itself is expected to
    /// fail.
    #[tokio::test]
    async fn default_client() -> Result<()> {
        // Flip to `true` for trace logging while debugging this test.
        if false {
            tracing_subscriber::fmt()
                .with_env_filter("trace")
                .with_writer(std::io::stderr)
                .init();
        }
        // These are curl's minimum-TLS-version options. Each one is passed to
        // curl below, so every iteration exercises a different ClientHello
        // setup.
        for curl_opt in ["--tlsv1", "--tlsv1.1", "--tlsv1.2", "--tlsv1.3"] {
            for sni in ["foo", "bar", "bar2", "socket"] {
                info!("TESTING: sni={sni} opt={curl_opt}");

                let tmp_dir = tempfile::TempDir::new()?;
                let hit_something = std::sync::atomic::AtomicBool::new(false);
                let listener =
                    tokio::net::TcpListener::bind("[::1]:0".parse::<SocketAddr>()?).await?;
                let listener_port = listener.local_addr()?.port();

                // Backends.
                let backend_bar =
                    tokio::net::TcpListener::bind("[::1]:0".parse::<SocketAddr>()?).await?;
                let backend_bar_port = backend_bar.local_addr()?.port();
                let backend_baz =
                    tokio::net::TcpListener::bind("[::1]:0".parse::<SocketAddr>()?).await?;
                let backend_baz_port = backend_baz.local_addr()?.port();

                let sockfile = tmp_dir.path().join("tarweb-testing.sock");
                let backend_sock = tokio::net::UnixDatagram::bind(&sockfile)?;

                // Test config. Rules are full matches, so "bar2" matches no
                // rule and ends up at the default backend.
                #[allow(clippy::regex_creation_in_loops)]
                let config = Config {
                    max_lifetime: Some(MAX_TEST_CONNECTION_TIME),
                    handshake_timeout: None,
                    rules: vec![
                        Rule {
                            re: regex::Regex::new("foo")?,
                            backend: Backend::Null,
                            timeout: None,
                        },
                        Rule {
                            re: regex::Regex::new("socket")?,
                            backend: Backend::Pass {
                                path: sockfile.clone(),
                                frontend_tls: None,
                                sorry: None,
                            },
                            timeout: None,
                        },
                        Rule {
                            re: regex::Regex::new("bar")?,
                            backend: Backend::Proxy {
                                addr: format!("[::1]:{backend_bar_port}"),
                                proxy_header: false,
                                frontend_tls: None,
                                sorry: None,
                            },
                            timeout: None,
                        },
                    ],
                    default_backend: Backend::Proxy {
                        addr: format!("[::1]:{backend_baz_port}"),
                        proxy_header: false,
                        frontend_tls: None,
                        sorry: None,
                    },
                    default_backend_timeout: None,
                };
                let _main =
                    tokio::task::spawn(
                        async move { mainloop(Arc::new(config), "", listener).await },
                    );

                // Dropping the senders once curl has exited wakes up any
                // backend that is still waiting for a connection.
                let (done_tx1, mut done_rx_bar) = tokio::sync::mpsc::channel::<()>(1);
                let (done_tx2, mut done_rx_baz) = tokio::sync::mpsc::channel::<()>(1);
                let (done_tx3, mut done_rx_sock) = tokio::sync::mpsc::channel::<()>(1);
                let client = async {
                    // Expect failure because our backend immediately disconnects.
                    let _status = tokio::process::Command::new("curl")
                        .arg("-S")
                        .arg("--no-progress-meter")
                        .arg(curl_opt)
                        .arg("--connect-to")
                        .arg(format!("foo:443:[::1]:{listener_port}"))
                        .arg("--connect-to")
                        .arg(format!("bar:443:[::1]:{listener_port}"))
                        .arg("--connect-to")
                        .arg(format!("socket:443:[::1]:{listener_port}"))
                        .arg("--connect-to")
                        .arg(format!("bar2:443:[::1]:{listener_port}"))
                        .arg(format!("https://{sni}/"))
                        .spawn()?
                        .wait()
                        .await?;
                    drop(done_tx1);
                    drop(done_tx2);
                    drop(done_tx3);
                    Ok::<(), anyhow::Error>(())
                };
                let backend_bar = async {
                    if sni == "bar" {
                        info!("COVERED: bar");
                        hit_something.store(true, Ordering::Relaxed);
                        tokio::select! {
                            _ = backend_bar.accept() => Ok(()),
                            _ = done_rx_bar.recv() => Err(anyhow!("nobody connected to backend")),
                        }
                    } else {
                        Ok(())
                    }
                };
                let backend_baz = async {
                    if sni == "bar2" {
                        info!("COVERED: default");
                        hit_something.store(true, Ordering::Relaxed);
                        tokio::select! {
                            _ = backend_baz.accept() => Ok(()),
                            _ = done_rx_baz.recv() => Err(anyhow!("nobody connected to backend")),
                        }
                    } else {
                        Ok(())
                    }
                };
                let backend_sock = async {
                    if sni == "socket" {
                        info!("COVERED: socket");
                        hit_something.store(true, Ordering::Relaxed);
                        let mut buf = [0u8; 2048];
                        tokio::select! {
                            _ = backend_sock.recv(&mut buf) => Ok(()),
                            _ = done_rx_sock.recv() => Err(anyhow!("nobody connected to backend")),
                        }
                    } else {
                        Ok(())
                    }
                };
                if sni == "foo" {
                    // Connected to nothing.
                    hit_something.store(true, Ordering::Relaxed);
                }
                tokio::time::timeout(MAX_TEST_CONNECTION_TIME, async {
                    tokio::try_join!(client, backend_bar, backend_baz, backend_sock,)
                })
                .await??;
                assert!(
                    hit_something.load(Ordering::Relaxed),
                    "SNI {sni:?} and opts {curl_opt:?} did not do anything"
                );
            }
        }
        Ok(())
    }

    /// A client that connects and then sends nothing is disconnected once the
    /// handshake timeout expires.
    #[tokio::test]
    async fn handshake_timeout_closes_idle_preroute_client() -> Result<()> {
        let listener = tokio::net::TcpListener::bind("[::1]:0".parse::<SocketAddr>()?).await?;
        let listener_port = listener.local_addr()?.port();
        let config = Config {
            max_lifetime: Some(MAX_TEST_CONNECTION_TIME),
            handshake_timeout: Some(tokio::time::Duration::from_millis(50)),
            rules: vec![],
            default_backend: Backend::Null,
            default_backend_timeout: None,
        };
        let _main =
            tokio::task::spawn(async move { mainloop(Arc::new(config), "", listener).await });

        let mut stream = tokio::net::TcpStream::connect(format!("[::1]:{listener_port}")).await?;
        let mut buf = [0u8; 1];
        let read = tokio::time::timeout(MAX_TEST_CONNECTION_TIME, stream.read(&mut buf)).await?;
        // Either a clean EOF or a connection reset counts as "closed".
        match read {
            Ok(0) => Ok(()),
            Ok(n) => Err(anyhow!("idle preroute client read unexpected {n} bytes")),
            Err(e) if e.kind() == std::io::ErrorKind::ConnectionReset => Ok(()),
            Err(e) => Err(e.into()),
        }
    }

    /// Once a proxy backend is connected, the handshake timeout no longer
    /// applies: the backend answers 150ms in, well past the 50ms handshake
    /// timeout, and the client must still get the reply.
    #[tokio::test]
    async fn handshake_timeout_stops_after_proxy_backend_connects() -> Result<()> {
        let listener = tokio::net::TcpListener::bind("[::1]:0".parse::<SocketAddr>()?).await?;
        let listener_port = listener.local_addr()?.port();
        let backend = tokio::net::TcpListener::bind("[::1]:0".parse::<SocketAddr>()?).await?;
        let backend_port = backend.local_addr()?.port();
        let config = Config {
            max_lifetime: Some(MAX_TEST_CONNECTION_TIME),
            handshake_timeout: Some(tokio::time::Duration::from_millis(50)),
            rules: vec![],
            default_backend: Backend::Proxy {
                addr: format!("[::1]:{backend_port}"),
                proxy_header: false,
                frontend_tls: None,
                sorry: None,
            },
            default_backend_timeout: None,
        };
        let _main =
            tokio::task::spawn(async move { mainloop(Arc::new(config), "", listener).await });

        let backend = tokio::spawn(async move {
            let (mut stream, _) = backend.accept().await?;
            let mut got = [0u8; 5];
            stream.read_exact(&mut got).await?;
            if got != *b"abcde" {
                return Err(anyhow!("backend got unexpected bytes: {got:?}"));
            }
            // Answer only after the handshake timeout would have fired.
            tokio::time::sleep(tokio::time::Duration::from_millis(150)).await;
            stream.write_all(b"ok").await?;
            stream.shutdown().await?;
            Ok::<(), anyhow::Error>(())
        });

        let mut stream = tokio::net::TcpStream::connect(format!("[::1]:{listener_port}")).await?;

        // Write invalid TLS records, forcing router to pick the default
        // backend.
        stream.write_all(b"abcde").await?;

        let mut got = Vec::new();
        tokio::time::timeout(MAX_TEST_CONNECTION_TIME, stream.read_to_end(&mut got)).await??;
        backend.await??;
        assert_eq!(got, b"ok");
        Ok(())
    }
}