pitchfork-cli 2.13.1

Daemons with DX
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
//! Supervisor module - daemon process supervisor
//!
//! This module is split into focused submodules:
//! - `state`: State access layer (get/set operations)
//! - `lifecycle`: Daemon start/stop operations
//! - `autostop`: Autostop logic and boot daemon startup
//! - `retry`: Retry logic with backoff
//! - `watchers`: Background tasks (interval, cron, file watching)
//! - `ipc_handlers`: IPC request dispatch

mod autostop;
mod hooks;
mod ipc_handlers;
mod lifecycle;
#[cfg(unix)]
mod pty;
mod retry;
mod state;
mod watchers;

use crate::daemon_id::DaemonId;
use crate::daemon_status::DaemonStatus;
use crate::deps::compute_reverse_stop_order;
use crate::ipc::server::{IpcServer, IpcServerHandle};

use crate::procs::PROCS;
use crate::settings::settings;
use crate::state_file::StateFile;
use crate::{Result, env};
use duct::cmd;
use miette::IntoDiagnostic;
use once_cell::sync::Lazy;
use std::collections::HashMap;
#[cfg(unix)]
use std::collections::HashSet;
use std::fs;
#[cfg(unix)]
use std::os::unix::fs::PermissionsExt;
use std::process::exit;
use std::sync::atomic;
use std::sync::atomic::{AtomicBool, AtomicU32};
use std::time::Duration;
#[cfg(unix)]
use tokio::signal::unix::SignalKind;
use tokio::sync::{Mutex, Notify};
use tokio::task::JoinHandle;
use tokio::{signal, time};

/// Exit statuses reaped by the container-mode zombie reaper for managed daemon
/// PIDs. On non-Linux Unix platforms where `waitid(WNOWAIT)` is unavailable,
/// `waitpid(None, WNOHANG)` may race with Tokio's `child.wait()`. When the
/// zombie reaper wins, the exit status is stashed here so the monitoring task
/// in lifecycle.rs can recover it instead of treating the ECHILD as a failure.
///
/// The map is keyed by PID. The stored value follows the encoding used by the
/// reaper when it stashes a status: the plain exit code for a normal exit, the
/// negated signal number when the process was killed by a signal, and `-1`
/// for any other wait status.
///
/// On Linux this map is unused because the reaper uses `waitid` with `WNOWAIT`
/// to peek before reaping, which avoids the race entirely.
#[cfg(all(unix, not(target_os = "linux")))]
pub(crate) static REAPED_STATUSES: Lazy<Mutex<HashMap<u32, i32>>> =
    Lazy::new(|| Mutex::new(HashMap::new()));

// Re-export types needed by other modules
pub(crate) use state::UpsertDaemonOpts;

/// Shared state of the daemon supervisor.
///
/// Every field is wrapped in a lock, atomic, or notifier so that the struct can
/// be used through a shared reference (see the global `SUPERVISOR` instance)
/// from the background tasks spawned during startup.
pub struct Supervisor {
    /// The supervisor's state file. Background tasks lock it to read daemon
    /// PIDs and to write it to disk when it has been marked dirty.
    pub(crate) state_file: Mutex<StateFile>,
    /// Queued `(level, message)` notifications.
    /// NOTE(review): presumably filled by `add_notification` and drained by
    /// IPC clients — confirm against the submodules.
    pub(crate) pending_notifications: Mutex<Vec<(log::LevelFilter, String)>>,
    /// Instant of the most recent `refresh()` call (initialised to the
    /// construction time).
    pub(crate) last_refreshed_at: Mutex<time::Instant>,
    /// Map of daemon ID to scheduled autostop time
    pub(crate) pending_autostops: Mutex<HashMap<DaemonId, time::Instant>>,
    /// Handle for graceful IPC server shutdown
    pub(crate) ipc_shutdown: Mutex<Option<IpcServerHandle>>,
    /// Tracks in-flight hook tasks so shutdown can wait for them to complete
    pub(crate) hook_tasks: Mutex<Vec<JoinHandle<()>>>,
    /// Number of monitoring tasks that are still running (between process exit
    /// and hook registration completion). Used by `close()` to know when it is
    /// safe to drain `hook_tasks`.
    pub(crate) active_monitors: AtomicU32,
    /// Signalled by each monitoring task after it finishes registering hooks
    /// (or decides it has nothing to register). `close()` waits on this.
    pub(crate) monitor_done: Notify,
    /// Cancellation token for the proxy server — cancelled on shutdown to
    /// stop accepting new connections and drain in-flight ones.
    pub(crate) proxy_cancel: Mutex<Option<tokio_util::sync::CancellationToken>>,
    /// Join handle for the proxy task so shutdown can wait for cleanup.
    pub(crate) proxy_task: Mutex<Option<JoinHandle<()>>>,
    /// mDNS publisher for LAN mode (None if LAN mode is disabled).
    /// Shared with the LAN IP monitor task so it can re-publish on IP change.
    pub(crate) mdns_publisher:
        Mutex<Option<std::sync::Arc<tokio::sync::Mutex<crate::proxy::mdns::MdnsPublisher>>>>,
    /// Join handle for the LAN IP monitor task.
    pub(crate) lan_monitor_task: Mutex<Option<JoinHandle<()>>>,
    /// Cancellation token for the background state flush task.
    /// Uses a `std::sync::Mutex` (not the async one) because it is set from
    /// the synchronous `start_state_flush_task`.
    pub(crate) flush_cancel: std::sync::Mutex<Option<tokio_util::sync::CancellationToken>>,
}

/// Returns the `general_interval` value from the current settings.
pub(crate) fn interval_duration() -> Duration {
    let current = settings();
    current.general_interval()
}

/// Process-wide supervisor instance, constructed lazily on first access.
///
/// # Panics
///
/// The first access panics with "Error creating supervisor" if
/// `Supervisor::new` returns an error.
pub static SUPERVISOR: Lazy<Supervisor> =
    Lazy::new(|| Supervisor::new().expect("Error creating supervisor"));

/// Starts the supervisor in the background unless one is already running.
///
/// The state file is consulted for the PID recorded under the `pitchfork`
/// daemon entry. If that PID exists and the process table reports it as
/// running, nothing is done; otherwise [`start_in_background`] is invoked.
///
/// # Errors
///
/// Returns whatever error [`start_in_background`] produces.
pub fn start_if_not_running() -> Result<()> {
    let state = StateFile::get();
    let recorded_pid = state
        .daemons
        .get(&DaemonId::pitchfork())
        .and_then(|daemon| daemon.pid);

    if let Some(pid) = recorded_pid {
        // Refresh only this PID so the liveness check sees current data.
        PROCS.refresh_pids(&[pid]);
        if PROCS.is_running(pid) {
            return Ok(());
        }
    }

    start_in_background()
}

/// Launches `<pitchfork binary> supervisor run` as a background process.
///
/// Stdout of the child is discarded and stderr is appended to the pitchfork
/// log file, so panics and other fatal errors from the background supervisor
/// are not silently lost. The function does not wait for the child.
///
/// # Errors
///
/// Fails if the log file cannot be opened for appending or if the child
/// process cannot be started.
pub fn start_in_background() -> Result<()> {
    debug!("starting supervisor in background");

    let log_path = &*env::PITCHFORK_LOG_FILE;
    // Best effort: if the directory cannot be created, opening the log file
    // below reports the failure.
    if let Some(log_dir) = log_path.parent() {
        let _ = fs::create_dir_all(log_dir);
    }

    let mut open_opts = fs::OpenOptions::new();
    open_opts.create(true).append(true);
    let stderr_sink = open_opts.open(log_path).into_diagnostic()?;

    #[cfg(unix)]
    fix_state_dir_permissions();

    let supervisor_cmd = cmd!(&*env::PITCHFORK_BIN, "supervisor", "run")
        .stdout_null()
        .stderr_file(stderr_sink);
    supervisor_cmd.start().into_diagnostic()?;
    Ok(())
}

impl Supervisor {
    /// Creates a supervisor with empty bookkeeping.
    ///
    /// The state file is bound to the path in `PITCHFORK_STATE_FILE`, the
    /// last-refresh timestamp is set to "now", and every optional handle
    /// (IPC, proxy, mDNS, LAN monitor, flush task) starts out as `None`.
    pub fn new() -> Result<Self> {
        let state_file = StateFile::new(env::PITCHFORK_STATE_FILE.clone());
        let created_at = time::Instant::now();

        let supervisor = Self {
            state_file: Mutex::new(state_file),
            pending_notifications: Mutex::new(Vec::new()),
            last_refreshed_at: Mutex::new(created_at),
            pending_autostops: Mutex::new(HashMap::new()),
            ipc_shutdown: Mutex::new(None),
            hook_tasks: Mutex::new(Vec::new()),
            active_monitors: AtomicU32::new(0),
            monitor_done: Notify::new(),
            proxy_cancel: Mutex::new(None),
            proxy_task: Mutex::new(None),
            mdns_publisher: Mutex::new(None),
            lan_monitor_task: Mutex::new(None),
            flush_cancel: std::sync::Mutex::new(None),
        };
        Ok(supervisor)
    }

    /// Runs the supervisor startup sequence and then serves IPC connections.
    ///
    /// In order, this:
    /// 1. fixes state-directory permissions (Unix only),
    /// 2. cleans up daemons orphaned by a previous supervisor,
    /// 3. records this process as the `pitchfork` daemon with status `Running`,
    /// 4. starts boot daemons when `is_boot` is set,
    /// 5. installs the interval, cron, signal and daemon-file watchers, plus
    ///    the zombie reaper in container mode (Unix only),
    /// 6. spawns the web UI, the standalone API server and the reverse proxy
    ///    as configured,
    /// 7. spawns a task that pre-warms the proxy slug cache,
    /// 8. creates the IPC server, starts the state flush task and finally
    ///    awaits `conn_watch`, whose result is returned.
    ///
    /// # Parameters
    ///
    /// * `is_boot` - when `true`, `start_boot_daemons` is run.
    /// * `container` - enables container/PID1 mode; the mode is also enabled
    ///   when `settings().supervisor.container` is set.
    /// * `web_port` - web UI port; when `None`, `settings.web.bind_port` is
    ///   used if `settings.web.auto_start` is enabled.
    /// * `web_path` - web UI base path; when `None`, a non-empty
    ///   `settings.web.base_path` is used.
    ///
    /// # Errors
    ///
    /// Propagates errors from registering the supervisor daemon, starting boot
    /// daemons, installing the watchers/reaper, creating the IPC server and
    /// from `conn_watch`. Web, API and proxy failures are logged rather than
    /// returned.
    pub async fn start(
        &self,
        is_boot: bool,
        container: bool,
        web_port: Option<u16>,
        web_path: Option<String>,
    ) -> Result<()> {
        // Ensure the state directory and its contents are accessible by non-root
        // users. This is needed when the supervisor is started with `sudo` — all
        // files it creates are owned by root, which prevents normal CLI clients
        // from reading/writing state or connecting to the IPC socket.
        #[cfg(unix)]
        fix_state_dir_permissions();

        let pid = std::process::id();
        // Ensure PROCS has data for the supervisor PID before upsert_daemon reads title()
        PROCS.refresh_pids(&[pid]);
        // Determine container mode: enabled when either the CLI flag or the
        // setting requests it (the setting cannot be overridden to "off" here).
        let container_mode = container || settings().supervisor.container;
        if container_mode {
            info!("Starting supervisor in container/PID1 mode with pid {pid}");
        } else {
            info!("Starting supervisor with pid {pid}");
        }

        // If the previous supervisor died uncleanly, its daemon child processes
        // may still be alive (orphaned, re-parented to init).  Terminate them
        // before we take over so we don't end up with duplicate processes
        // holding the same ports.
        cleanup_orphaned_daemons(self).await;

        self.upsert_daemon(
            UpsertDaemonOpts::builder(DaemonId::pitchfork())
                .set(|o| {
                    o.pid = Some(pid);
                    o.status = DaemonStatus::Running;
                })
                .build(),
        )
        .await?;
        // Run the permission fix again now that the upsert above has touched
        // the state.
        #[cfg(unix)]
        fix_state_dir_permissions();

        // If this is a boot start, automatically start boot_start daemons
        if is_boot {
            info!("Boot start mode enabled, starting boot_start daemons");
            self.start_boot_daemons().await?;
        }

        self.interval_watch()?;
        self.cron_watch()?;
        self.signals()?;
        self.daemon_file_watch()?;

        // In container mode, install SIGCHLD handler to reap orphaned/zombie processes
        #[cfg(unix)]
        if container_mode {
            self.reap_zombies()?;
        }

        // Start web server: CLI --web-port takes priority, then settings.web.auto_start + bind_port
        let s = settings();
        let effective_port = web_port.or_else(|| {
            if s.web.auto_start {
                match u16::try_from(s.web.bind_port).ok().filter(|&p| p > 0) {
                    Some(p) => Some(p),
                    None => {
                        error!(
                            "web.bind_port {} is out of valid port range (1-65535), web UI disabled",
                            s.web.bind_port
                        );
                        None
                    }
                }
            } else {
                None
            }
        });
        // CLI --web-path takes priority, then settings.web.base_path
        let effective_path = web_path.or_else(|| {
            let bp = s.web.base_path.clone();
            if bp.is_empty() { None } else { Some(bp) }
        });
        if let Some(port) = effective_port {
            tokio::spawn(async move {
                if let Err(e) = crate::web::serve(port, effective_path).await {
                    error!("Web server error: {e}");
                }
            });
        }

        // Start standalone API server if configured
        let api_port = if s.api.auto_start {
            match u16::try_from(s.api.bind_port).ok().filter(|&p| p > 0) {
                Some(p) => Some(p),
                None => {
                    error!(
                        "api.bind_port {} is out of valid port range (1-65535), API server disabled",
                        s.api.bind_port
                    );
                    None
                }
            }
        } else {
            None
        };
        if let Some(port) = api_port {
            tokio::spawn(async move {
                if let Err(e) = crate::web::serve_api(port, None).await {
                    error!("API server error: {e}");
                }
            });
        }

        // Start reverse proxy server if enabled
        if s.proxy.enable {
            // Pre-generate the TLS certificate synchronously before spawning the proxy
            // task. This ensures the cert exists immediately after `sup start` returns,
            // so `proxy trust` can be run right away without waiting for the async task.
            #[cfg(feature = "proxy-tls")]
            if s.proxy.https {
                let proxy_dir = crate::env::PITCHFORK_STATE_DIR.join("proxy");
                let ca_cert_path = proxy_dir.join("ca.pem");
                let ca_key_path = proxy_dir.join("ca-key.pem");
                if !ca_cert_path.exists() || !ca_key_path.exists() {
                    match crate::proxy::server::generate_ca(&ca_cert_path, &ca_key_path) {
                        Ok(()) => {
                            info!(
                                "Generated local CA certificate at {}",
                                ca_cert_path.display()
                            );
                        }
                        Err(e) => {
                            error!("Failed to generate CA certificate: {e}");
                        }
                    }
                }

                // Auto-trust: attempt to install the CA certificate into the
                // system trust store. May fail silently due to permissions;
                // user can run `pitchfork proxy trust` manually.
                if s.proxy.auto_trust && ca_cert_path.exists() {
                    use crate::proxy::trust::{AutoTrustResult, auto_trust};
                    match auto_trust(&ca_cert_path) {
                        AutoTrustResult::AlreadyTrusted => {}
                        AutoTrustResult::Trusted => {
                            info!("CA certificate auto-trusted in system store");
                        }
                        AutoTrustResult::NotTrusted { reason } => {
                            warn!("Auto-trust skipped: {reason}");
                            warn!("Run `pitchfork proxy trust` to install manually");
                        }
                    }
                }
            }
            // Spawn the proxy server and wait for its bind result via a oneshot
            // channel.  This avoids the TOCTOU race of a pre-flight bind check
            // while still surfacing binding failures immediately.
            let (bind_tx, bind_rx) = tokio::sync::oneshot::channel();
            let proxy_cancel = tokio_util::sync::CancellationToken::new();
            let proxy_cancel_clone = proxy_cancel.clone();
            *self.proxy_cancel.lock().await = Some(proxy_cancel);
            let proxy_task = tokio::spawn(async move {
                if let Err(e) = crate::proxy::server::serve(bind_tx, proxy_cancel_clone).await {
                    error!("Proxy server error: {e}");
                }
            });
            *self.proxy_task.lock().await = Some(proxy_task);
            match bind_rx.await {
                Ok(Ok(())) => {
                    info!("Proxy server bound successfully");
                    self.start_mdns().await;
                }
                Ok(Err(msg)) => {
                    error!("{msg}");
                    self.add_notification(log::LevelFilter::Error, msg).await;
                }
                Err(_) => {
                    // Sender dropped without sending — serve() panicked or
                    // returned before signalling.  Already logged by the
                    // spawn error handler above.
                }
            }
        }

        // Pre-warm slug cache so the first /api/proxies request is fast.
        // Spawned as a background task so it does not block startup.
        tokio::spawn(async {
            crate::proxy::server::get_cached_slugs().await;
        });

        let (ipc, ipc_handle) = IpcServer::new()?;
        // Keep the handle so shutdown can stop the IPC server gracefully.
        *self.ipc_shutdown.lock().await = Some(ipc_handle);
        self.start_state_flush_task();
        self.conn_watch(ipc).await
    }

    /// Start mDNS publishing for LAN mode (called after the proxy binds successfully).
    async fn start_mdns(&self) {
        let s = crate::settings::settings();
        let lan_enabled = s.proxy.lan || !s.proxy.lan_ip.is_empty();
        if !s.proxy.enable || !lan_enabled {
            return;
        }

        let lan_ip = if !s.proxy.lan_ip.is_empty() {
            match s.proxy.lan_ip.parse::<std::net::Ipv4Addr>() {
                Ok(ip) => Some(ip),
                Err(e) => {
                    error!(
                        "proxy.lan_ip {:?} is not a valid IPv4 address: {e}",
                        s.proxy.lan_ip
                    );
                    return;
                }
            }
        } else {
            match crate::proxy::lan_ip::detect_lan_ip().await {
                Some(ip) => Some(ip),
                None => {
                    error!(
                        "LAN mode is enabled but no LAN IP address could be detected. \
                         Set proxy.lan_ip to a specific address, or ensure you are connected to a network."
                    );
                    return;
                }
            }
        };

        let Some(lan_ip) = lan_ip else { return };
        let port = u16::try_from(s.proxy.port).unwrap_or(443);

        let Some(mut publisher) = crate::proxy::mdns::MdnsPublisher::new(lan_ip) else {
            error!("Failed to start mDNS publisher. Is Avahi (Linux) or Bonjour (macOS) running?");
            return;
        };

        // Publish all registered slugs.
        let slugs = crate::pitchfork_toml::PitchforkToml::read_global_slugs();
        for slug in slugs.keys() {
            let hostname = format!("{slug}.local");
            publisher.publish(&hostname, port);
        }

        log::info!(
            "LAN mode: mDNS publishing on {lan_ip}, {} slug(s) registered",
            slugs.len()
        );

        let publisher = std::sync::Arc::new(tokio::sync::Mutex::new(publisher));

        // Start the IP monitor (only when IP is auto-detected, not pinned).
        let ip_pinned = !s.proxy.lan_ip.is_empty();
        if !ip_pinned {
            let monitor_cancel = self.proxy_cancel.lock().await.clone();
            let publisher_clone = publisher.clone();
            let task = tokio::spawn(async move {
                let mut last_ip = lan_ip;
                let interval = std::time::Duration::from_secs(5);
                let mut ticker = tokio::time::interval(interval);
                ticker.tick().await; // first tick is immediate
                loop {
                    ticker.tick().await;
                    if let Some(cancel) = monitor_cancel.as_ref() {
                        if cancel.is_cancelled() {
                            break;
                        }
                    }
                    if let Some(new_ip) =
                        crate::proxy::lan_ip::detect_lan_ip_if_changed(last_ip).await
                    {
                        log::info!("LAN IP changed: {last_ip} → {new_ip}");
                        last_ip = new_ip;
                        let mut pub_guard = publisher_clone.lock().await;
                        pub_guard.republish_all(new_ip, port);
                    }
                }
            });
            *self.lan_monitor_task.lock().await = Some(task);
        }

        *self.mdns_publisher.lock().await = Some(publisher);
    }

    /// Re-read slugs from config and update mDNS records.
    ///
    /// Publishes new slugs and unpublishes removed ones. Called via IPC when
    /// `proxy add` or `proxy remove` modifies the slug registry.
    async fn sync_mdns(&self) {
        // Clone the Arc and release the outer lock immediately so we don't
        // block close() from taking the publisher during shutdown.
        let publisher = {
            let guard = self.mdns_publisher.lock().await;
            match guard.as_ref() {
                Some(p) => p.clone(),
                None => {
                    debug!("sync_mdns: mDNS publisher not active, skipping");
                    return;
                }
            }
        };

        let s = crate::settings::settings();
        let port = u16::try_from(s.proxy.port).unwrap_or(443);

        let slugs = crate::pitchfork_toml::PitchforkToml::read_global_slugs();
        let mut pub_guard = publisher.lock().await;

        // Unpublish slugs that no longer exist in config.
        let current_keys: Vec<&String> = slugs.keys().collect();
        let registered: Vec<String> = pub_guard.registered_hostnames();
        for hostname in &registered {
            // hostname is "slug.local" — extract slug part.
            let slug = hostname.strip_suffix(".local").unwrap_or(hostname);
            if !current_keys.iter().any(|k| k.as_str() == slug) {
                log::info!("mDNS: unpublishing removed slug {slug}");
                pub_guard.unpublish(hostname);
            }
        }

        // Publish new slugs that aren't yet registered.
        for slug in slugs.keys() {
            let hostname = format!("{slug}.local");
            if !pub_guard.is_published(&hostname) {
                log::info!("mDNS: publishing new slug {slug}");
                pub_guard.publish(&hostname, port);
            }
        }
    }

    /// Spawn a background task that periodically flushes the state file to
    /// disk if it has been marked dirty.  Uses debouncing (1s interval) to
    /// batch rapid state changes.
    fn start_state_flush_task(&self) {
        let cancel = tokio_util::sync::CancellationToken::new();
        *self.flush_cancel.lock().unwrap() = Some(cancel.clone());
        tokio::spawn(async move {
            let mut interval = time::interval(Duration::from_secs(1));
            interval.set_missed_tick_behavior(time::MissedTickBehavior::Skip);
            loop {
                tokio::select! {
                    _ = interval.tick() => {}
                    _ = cancel.cancelled() => {
                        debug!("state flush task received shutdown signal");
                        break;
                    }
                }
                let state = SUPERVISOR.state_file.lock().await;
                if state.is_dirty() {
                    if let Err(e) = state.write() {
                        warn!("failed to flush state file: {e}");
                    }
                }
            }
            debug!("state flush task exiting");
        });
    }

    /// Writes the state file to disk right away if it is marked dirty.
    ///
    /// A failed write is logged as a warning and otherwise ignored.
    pub(crate) async fn flush_state(&self) {
        let state = self.state_file.lock().await;
        if !state.is_dirty() {
            return;
        }
        match state.write() {
            Ok(_) => {}
            Err(e) => warn!("failed to flush state file: {e}"),
        }
    }

    /// Performs one supervisor housekeeping pass.
    ///
    /// Refreshes process information for the tracked shell PIDs only, forgets
    /// shell PIDs that are no longer running, leaves a directory once all of
    /// its shell PIDs are gone, and then runs the retry check and any pending
    /// autostops.
    ///
    /// # Errors
    ///
    /// Propagates the first error from removing a shell PID, leaving a
    /// directory, the retry check, or autostop processing.
    pub(crate) async fn refresh(&self) -> Result<()> {
        trace!("refreshing");

        // Only the shell PIDs are inspected; this is cheaper than refreshing
        // every process on the system.
        let dirs_with_pids = self.get_dirs_with_shell_pids().await;
        let pids_to_check: Vec<u32> = dirs_with_pids.values().flatten().copied().collect();

        if !pids_to_check.is_empty() {
            PROCS.refresh_pids(&pids_to_check);
        } else {
            // Nothing tracked, so the expensive refresh can be skipped.
            trace!("no shell PIDs to check, skipping process refresh");
        }

        // This guard stays alive until the function returns, exactly as the
        // binding's scope dictates.
        let mut refreshed_guard = self.last_refreshed_at.lock().await;
        *refreshed_guard = time::Instant::now();

        for (dir, pids) in dirs_with_pids {
            let dead: Vec<u32> = pids
                .iter()
                .copied()
                .filter(|&pid| !PROCS.is_running(pid))
                .collect();
            for &pid in &dead {
                self.remove_shell_pid(pid).await?;
            }
            // Every shell in this directory has exited.
            if dead.len() == pids.len() {
                self.leave_dir(&dir).await?;
            }
        }

        self.check_retry().await?;
        self.process_pending_autostops().await?;

        Ok(())
    }

    /// Install a SIGCHLD handler that reaps orphaned zombie child processes.
    ///
    /// When running as PID 1 inside a container, orphaned processes are
    /// re-parented to PID 1. Without explicit reaping, they accumulate
    /// as zombies in the process table indefinitely.
    ///
    /// Only reaps processes that are NOT managed by the supervisor (i.e.
    /// not tracked in the state file). Managed daemon processes are reaped
    /// by their monitoring tasks via `child.wait()`.
    ///
    /// ## Strategy
    ///
    /// **Linux**: Uses `waitid(Id::All, WNOHANG | WNOWAIT | WEXITED)` to
    /// *peek* at the next zombie without consuming its status. If the PID
    /// belongs to a managed daemon, the reaper skips it so Tokio's
    /// `child.wait()` can collect the status normally. Only unmanaged
    /// orphans are actually reaped (via `waitpid(Pid, WNOHANG)`). This
    /// eliminates the race entirely.
    ///
    /// **Non-Linux Unix** (e.g. macOS — mainly for local development;
    /// container mode targets Linux): `waitid` is unavailable, so we fall
    /// back to `waitpid(None, WNOHANG)`. If the reaper accidentally
    /// consumes a managed PID's status, it stashes the exit code in
    /// [`REAPED_STATUSES`] for the monitoring task to recover.
    #[cfg(unix)]
    fn reap_zombies(&self) -> Result<()> {
        let mut stream = signal::unix::signal(SignalKind::child())
            .map_err(|e| miette::miette!("Failed to register SIGCHLD handler: {e}"))?;
        tokio::spawn(async move {
            loop {
                stream.recv().await;
                // Collect PIDs of managed daemons so we don't steal their exit status
                let managed_pids: HashSet<u32> = SUPERVISOR
                    .state_file
                    .lock()
                    .await
                    .daemons
                    .values()
                    .filter_map(|d| d.pid)
                    .collect();
                // Reap all available zombie children that are NOT managed
                Self::reap_unmanaged_zombies(&managed_pids).await;
            }
        });
        info!("container mode: SIGCHLD zombie reaper installed");
        Ok(())
    }

    /// Linux reaper: inspect each zombie with `waitid(WNOWAIT)` and reap it
    /// only when it is not a managed daemon.
    ///
    /// Because `WNOWAIT` leaves the zombie in the process table, its PID can
    /// be examined without consuming the exit status. Unmanaged PIDs are then
    /// collected with `waitpid(Pid, WNOHANG)`. The loop stops at the first
    /// managed zombie, when no zombie is pending, or on any wait error.
    #[cfg(target_os = "linux")]
    async fn reap_unmanaged_zombies(managed_pids: &HashSet<u32>) {
        use nix::errno::Errno;
        use nix::sys::wait::{Id, WaitPidFlag, WaitStatus, waitid, waitpid};
        use nix::unistd::Pid;

        loop {
            // Peek only: the zombie stays in the table after this call.
            let peek_flags = WaitPidFlag::WNOHANG | WaitPidFlag::WNOWAIT | WaitPidFlag::WEXITED;
            let status = match waitid(Id::All, peek_flags) {
                // Nothing to reap right now, or no children at all.
                Ok(WaitStatus::StillAlive) | Err(Errno::ECHILD) => break,
                Ok(status) => status,
                Err(e) => {
                    trace!("waitid error in zombie reaper: {e}");
                    break;
                }
            };

            let Some(pid_raw) = status.pid().map(|p| p.as_raw() as u32) else {
                break;
            };

            if managed_pids.contains(&pid_raw) {
                // Managed daemons are left for Tokio's child.wait(). Since the
                // zombie is not consumed, waitid(Id::All) would hand it back
                // again, so the loop has to end here.
                trace!(
                    "zombie reaper: skipping managed daemon pid {pid_raw}, \
                     leaving for Tokio to reap"
                );
                break;
            }

            // Unmanaged orphan: consume its status for real.
            match waitpid(Pid::from_raw(pid_raw as i32), Some(WaitPidFlag::WNOHANG)) {
                Ok(s) => trace!("reaped orphaned zombie child: {s:?}"),
                Err(Errno::ECHILD) => break,
                Err(e) => {
                    trace!("waitpid error reaping pid {pid_raw}: {e}");
                    break;
                }
            }
        }
    }

    /// Fallback reaper for non-Linux Unix: reap with `waitpid(None, WNOHANG)`
    /// and stash the status of any managed daemon reaped by accident.
    ///
    /// There is no way to peek here, so a managed PID may be consumed before
    /// its monitoring task sees it. In that case the exit code (or the
    /// negated signal number, or `-1` for other statuses) is stored in
    /// [`REAPED_STATUSES`] for later recovery.
    #[cfg(all(unix, not(target_os = "linux")))]
    async fn reap_unmanaged_zombies(managed_pids: &HashSet<u32>) {
        use nix::errno::Errno;
        use nix::sys::wait::{WaitPidFlag, WaitStatus, waitpid};

        loop {
            let status = match waitpid(None, Some(WaitPidFlag::WNOHANG)) {
                // Nothing left to reap, or no more children.
                Ok(WaitStatus::StillAlive) | Err(Errno::ECHILD) => break,
                Ok(status) => status,
                Err(e) => {
                    trace!("waitpid error in zombie reaper: {e}");
                    break;
                }
            };

            let Some(pid) = status.pid().map(|p| p.as_raw() as u32) else {
                continue;
            };

            if !managed_pids.contains(&pid) {
                trace!("reaped orphaned zombie child: {status:?}");
                continue;
            }

            // Race lost against the monitoring task: keep the exit code so
            // lifecycle handling can recover it.
            let exit_code = match status {
                WaitStatus::Exited(_, code) => code,
                WaitStatus::Signaled(_, sig, _) => -(sig as i32),
                _ => -1,
            };
            warn!(
                "zombie reaper reaped managed daemon pid {pid} \
                 (exit_code={exit_code}); stashing status for recovery"
            );
            REAPED_STATUSES.lock().await.insert(pid, exit_code);
        }
    }

    /// Installs one background task per handled Unix signal.
    ///
    /// The first signal received (of any kind listed below) triggers
    /// `SUPERVISOR.handle_signal()`. Any signal that arrives after the shared
    /// flag has been set makes the process exit immediately with status 1.
    /// A signal whose handler cannot be registered is logged and skipped; the
    /// function itself always returns `Ok(())`.
    #[cfg(unix)]
    fn signals(&self) -> Result<()> {
        // Shared by every spawned task, so the "second signal" may be of a
        // different kind than the first.
        static RECEIVED_SIGNAL: AtomicBool = AtomicBool::new(false);

        for signal in [
            SignalKind::terminate(),
            SignalKind::alarm(),
            SignalKind::interrupt(),
            SignalKind::quit(),
            SignalKind::hangup(),
            SignalKind::user_defined1(),
            SignalKind::user_defined2(),
        ] {
            let mut stream = match signal::unix::signal(signal) {
                Ok(stream) => stream,
                Err(e) => {
                    warn!("Failed to register signal handler for {signal:?}: {e}");
                    continue;
                }
            };
            tokio::spawn(async move {
                loop {
                    stream.recv().await;
                    let already_signalled = RECEIVED_SIGNAL.swap(true, atomic::Ordering::SeqCst);
                    if already_signalled {
                        exit(1);
                    }
                    SUPERVISOR.handle_signal().await;
                }
            });
        }
        Ok(())
    }

    /// Installs the Ctrl-C handler task on Windows.
    ///
    /// The first Ctrl-C triggers `SUPERVISOR.handle_signal()`; a Ctrl-C that
    /// arrives after the flag has been set exits the process with status 1.
    /// If waiting for Ctrl-C fails, the error is logged and the task ends.
    #[cfg(windows)]
    fn signals(&self) -> Result<()> {
        static RECEIVED_SIGNAL: AtomicBool = AtomicBool::new(false);

        tokio::spawn(async move {
            loop {
                match signal::ctrl_c().await {
                    Ok(_) => {}
                    Err(e) => {
                        error!("Failed to wait for ctrl-c: {}", e);
                        return;
                    }
                }
                let already_signalled = RECEIVED_SIGNAL.swap(true, atomic::Ordering::SeqCst);
                if already_signalled {
                    exit(1);
                }
                SUPERVISOR.handle_signal().await;
            }
        });
        Ok(())
    }

    /// Logs that a signal was received, runs the full [`Self::close`] shutdown
    /// sequence, and then terminates the process with exit status 0.
    async fn handle_signal(&self) {
        info!("received signal, stopping");
        self.close().await;
        exit(0);
    }

    /// Runs the supervisor's shutdown sequence.
    ///
    /// Steps, in order:
    /// 1. Cancel the proxy, abort the LAN monitor task, shut down the mDNS
    ///    publisher, and wait up to 12 seconds for the proxy task to finish.
    /// 2. Clean the hosts file when both `proxy.enable` and `proxy.sync_hosts`
    ///    are set.
    /// 3. Stop every active daemon except the pitchfork entry, one stop level
    ///    at a time, then remove the pitchfork entry itself.
    /// 4. Cancel the state flush task and write the state file if it is dirty.
    /// 5. Shut down the IPC server.
    /// 6. Wait up to 5 seconds for `active_monitors` to reach zero, then wait
    ///    up to 30 seconds for each collected hook task.
    /// 7. Remove the IPC socket directory.
    ///
    /// Failures along the way are logged or ignored; this function does not
    /// return an error.
    pub(crate) async fn close(&self) {
        // Signal the proxy server to stop accepting new connections
        // and drain in-flight ones, *before* stopping daemons so the
        // proxy has time to finish forwarding active requests.
        if let Some(cancel) = self.proxy_cancel.lock().await.take() {
            cancel.cancel();
        }

        // Stop the LAN IP monitor task.
        if let Some(monitor_task) = self.lan_monitor_task.lock().await.take() {
            monitor_task.abort();
        }

        // Shutdown the mDNS publisher (sends goodbye packets).
        if let Some(publisher) = self.mdns_publisher.lock().await.take() {
            publisher.lock().await.shutdown();
        }

        // Give the proxy task a bounded amount of time to finish; both a
        // timeout and the task's own result are ignored.
        if let Some(proxy_task) = self.proxy_task.lock().await.take() {
            let _ = tokio::time::timeout(Duration::from_secs(12), proxy_task).await;
        }

        // Clean up /etc/hosts entries managed by pitchfork
        let s = settings();
        if s.proxy.enable && s.proxy.sync_hosts {
            crate::proxy::hosts::clean_hosts_file();
        }

        // Collect the IDs of all active daemons other than the pitchfork
        // entry; that one is removed separately after the daemons are stopped.
        let pitchfork_id = DaemonId::pitchfork();
        let active = self.active_daemons().await;
        let active_ids: Vec<DaemonId> = active
            .iter()
            .filter(|d| d.id != pitchfork_id)
            .map(|d| d.id.clone())
            .collect();

        // Stop daemons in reverse dependency order.
        // If dependency resolution fails (e.g. config changed), fall back to
        // stopping in arbitrary order so we still shut down cleanly.
        // Daemons within the same level are stopped concurrently.
        let stop_levels = compute_reverse_stop_order(&active_ids);
        for level in &stop_levels {
            let mut tasks = Vec::new();
            for id in level {
                let id = id.clone();
                tasks.push(tokio::spawn(async move {
                    if let Err(err) = SUPERVISOR.stop(&id).await {
                        error!("failed to stop daemon {id}: {err}");
                    }
                }));
            }
            // Every task of this level is awaited before the next level
            // starts; the join result itself is ignored.
            for task in tasks {
                let _ = task.await;
            }
        }
        let _ = self.remove_daemon(&pitchfork_id).await;

        // Signal the background state flush task to exit so it doesn't
        // keep waking up and acquiring the state mutex after shutdown.
        // NOTE(review): unlike the other fields here this lock is taken with
        // `.lock().unwrap()` rather than `.lock().await` — presumably a
        // blocking mutex; the `unwrap` panics if `lock()` returns an error.
        if let Some(cancel) = self.flush_cancel.lock().unwrap().take() {
            cancel.cancel();
        }

        // Force-flush state to disk before shutting down IPC so no
        // in-memory-only changes are lost.
        {
            let state = self.state_file.lock().await;
            if state.is_dirty() {
                if let Err(e) = state.write() {
                    warn!("failed to flush state file during shutdown: {e}");
                }
            }
        }

        // Signal IPC server to shut down gracefully
        if let Some(mut handle) = self.ipc_shutdown.lock().await.take() {
            handle.shutdown();
        }

        // Wait for all in-flight monitoring tasks to finish registering their
        // hook handles. Each monitoring task increments `active_monitors` when
        // its process exits, and decrements it (+ notifies `monitor_done`)
        // after all fire_hook() calls complete. This replaces the old
        // yield_now() approach which had a race window.
        // The sleep is created once and pinned, so the 5 seconds bound the
        // whole loop rather than each individual iteration.
        let drain_timeout = time::sleep(Duration::from_secs(5));
        tokio::pin!(drain_timeout);
        loop {
            if self.active_monitors.load(atomic::Ordering::Acquire) == 0 {
                break;
            }
            tokio::select! {
                _ = self.monitor_done.notified() => {}
                _ = &mut drain_timeout => {
                    warn!("timed out waiting for monitoring tasks to register hooks, proceeding with shutdown");
                    break;
                }
            }
        }
        // Take ownership of all hook task handles, leaving the shared list
        // empty, then wait for each one with its own timeout.
        let handles: Vec<JoinHandle<()>> = std::mem::take(&mut *self.hook_tasks.lock().await);
        let hook_timeout = Duration::from_secs(30);
        for handle in handles {
            match time::timeout(hook_timeout, handle).await {
                Ok(_) => {} // Hook completed (success or error, doesn't matter)
                Err(_) => {
                    warn!(
                        "hook task did not complete within {hook_timeout:?} during shutdown, skipping"
                    );
                }
            }
        }

        // Best-effort removal of the IPC socket directory; errors are ignored.
        let _ = fs::remove_dir_all(&*env::IPC_SOCK_DIR);
    }

    /// Appends a `(level, message)` pair to the supervisor's pending
    /// notification list.
    pub(crate) async fn add_notification(&self, level: log::LevelFilter, message: String) {
        let mut pending = self.pending_notifications.lock().await;
        pending.push((level, message));
    }
}

/// Make the state directory usable by non-root users after the supervisor was
/// started through `sudo`.
///
/// Two strategies, chosen by whether [`state_owner_ids`] yields a target
/// owner (from `[settings.supervisor] user` or `SUDO_UID`/`SUDO_GID`):
///
/// * **Owner known** — the state directory is created if missing and then
///   `chown`ed recursively to that user. The permission bits are not widened:
///   files stay owner-only (0o600/0o700), only the *owner* changes to the
///   user that daemon processes and CLI clients share. This is strictly
///   better than a `chmod 0o666`.
/// * **Owner unknown** (e.g. direct root login) — if the directory exists,
///   permissions are relaxed on `sock/`, `logs/` and `state.toml` only, so
///   CLI clients can still function.
///
/// **Security**: the `proxy/` subtree is deliberately left alone in both
/// paths. It holds `ca-key.pem`, which must remain `0o600` and owned by the
/// process that generated it; changing its ownership or permissions would
/// expose the CA private key to other local users.
#[cfg(unix)]
fn fix_state_dir_permissions() {
    let state_dir = &*env::PITCHFORK_STATE_DIR;

    let Some((uid, gid)) = state_owner_ids() else {
        // Fallback: relax permissions on safe subdirectories only.
        // proxy/ is never touched. Nothing to do if the directory is absent.
        if state_dir.exists() {
            chmod_safe_subtrees(state_dir);
            debug!(
                "relaxed permissions on safe subtrees at {}",
                state_dir.display()
            );
        }
        return;
    };

    if !state_dir.exists()
        && let Err(err) = fs::create_dir_all(state_dir)
    {
        warn!(
            "failed to create state directory for ownership fix at {}: {err}",
            state_dir.display()
        );
        return;
    }

    // Preferred path: hand the tree to the runtime user, keeping modes tight.
    chown_recursive(state_dir, uid, gid, true);
    debug!(
        "chowned state directory to uid={uid} gid={gid} at {}",
        state_dir.display()
    );
}

/// Determine which `(uid, gid)` should own the state directory.
///
/// Returns `None` unless the effective UID is root. When `supervisor.user`
/// is set (non-empty after trimming) it is resolved first; if that fails a
/// warning is logged and `SUDO_UID`/`SUDO_GID` are used instead. With no
/// configured user, `SUDO_UID`/`SUDO_GID` are consulted directly.
#[cfg(unix)]
pub(crate) fn state_owner_ids() -> Option<(u32, u32)> {
    if !nix::unistd::Uid::effective().is_root() {
        return None;
    }

    let s = settings();
    let user = s.supervisor.user.trim();
    if user.is_empty() {
        return parse_sudo_ids();
    }

    resolve_supervisor_user_ids(user).or_else(|| {
        warn!(
            "failed to resolve supervisor.user '{user}' for state ownership; falling back to SUDO_UID/SUDO_GID"
        );
        parse_sudo_ids()
    })
}

/// Look up the `(uid, gid)` of the user named by `user`.
///
/// A value made up solely of ASCII digits is parsed as a numeric UID and
/// looked up by UID; anything else is looked up by name. Returns `None` when
/// the number does not fit in a `u32`, the lookup fails, or no such user
/// record exists.
#[cfg(unix)]
fn resolve_supervisor_user_ids(user: &str) -> Option<(u32, u32)> {
    let is_numeric = user.bytes().all(|b| b.is_ascii_digit());

    let lookup = if is_numeric {
        let uid = nix::unistd::Uid::from_raw(user.parse::<u32>().ok()?);
        nix::unistd::User::from_uid(uid)
    } else {
        nix::unistd::User::from_name(user)
    };

    lookup
        .ok()
        .flatten()
        .map(|record| (record.uid.as_raw(), record.gid.as_raw()))
}

/// Read the `SUDO_UID` and `SUDO_GID` environment variables as numeric IDs.
///
/// Yields `None` when the effective UID is not 0 (root), so that stale
/// `SUDO_UID`/`SUDO_GID` values inherited into a non-sudo environment can
/// never trigger an incorrect `chown`. Also yields `None` when either
/// variable is missing or is not a valid `u32`.
#[cfg(unix)]
fn parse_sudo_ids() -> Option<(u32, u32)> {
    if !nix::unistd::Uid::effective().is_root() {
        return None;
    }

    let read_id = |name: &str| -> Option<u32> { std::env::var(name).ok()?.parse().ok() };

    let uid = read_id("SUDO_UID")?;
    let gid = read_id("SUDO_GID")?;
    Some((uid, gid))
}

/// Recursively `chown` a directory tree to `uid`/`gid`.
///
/// `dir` itself is chowned first, then every entry below it. Individual
/// failures are ignored (best effort), and an unreadable directory ends the
/// walk for that subtree.
///
/// If `skip_proxy` is true, a subdirectory named `proxy` directly inside
/// `dir` is skipped entirely to protect the CA private key. The flag applies
/// to this level only; recursive calls pass `false`.
///
/// **Security**: symbolic links are never followed and never chowned. Entry
/// types come from [`std::fs::DirEntry::file_type`], which does not traverse
/// symlinks. This walk runs as root over a tree that is handed to a non-root
/// user, so following a link planted in that tree would let that user have
/// arbitrary files outside the state directory chowned to them, or reach the
/// `proxy/` tree through a differently named link.
#[cfg(unix)]
fn chown_recursive(dir: &std::path::Path, uid: u32, gid: u32, skip_proxy: bool) {
    // chown the directory itself
    let _ = chown_path(dir, uid, gid);

    let entries = match std::fs::read_dir(dir) {
        Ok(e) => e,
        Err(_) => return,
    };
    for entry in entries.flatten() {
        // If the type cannot be determined we cannot rule out a symlink, so
        // the entry is left untouched.
        let Ok(file_type) = entry.file_type() else {
            continue;
        };
        if file_type.is_symlink() {
            continue;
        }

        let path = entry.path();
        if file_type.is_dir() {
            // Skip proxy/ at the top level of the state directory
            if skip_proxy && entry.file_name().to_str() == Some("proxy") {
                continue;
            }
            chown_recursive(&path, uid, gid, false);
        } else {
            let _ = chown_path(&path, uid, gid);
        }
    }
}

/// `chown` a single path to `uid`/`gid`.
///
/// Uses the standard library's safe wrapper around `chown(2)`, so no `unsafe`
/// block or manual C-string conversion is needed. As with the raw call, a
/// symbolic link is followed and its target is what gets re-owned.
///
/// # Errors
///
/// Returns the underlying OS error when the call fails (for example
/// `NotFound` or `PermissionDenied`), and an `InvalidInput` error when `path`
/// contains an interior NUL byte.
#[cfg(unix)]
fn chown_path(path: &std::path::Path, uid: u32, gid: u32) -> std::io::Result<()> {
    std::os::unix::fs::chown(path, Some(uid), Some(gid))
}

/// Fallback used when no target owner is known: widen permissions only where
/// it is safe to do so.
///
/// * the state directory itself → `0o755`, so it can be traversed;
/// * `state.toml`, if present → `0o644`, so CLI clients can read it;
/// * the `sock/` and `logs/` subdirectories, if present → recursively relaxed.
///
/// The `proxy/` subtree is never touched. All failures are ignored.
#[cfg(unix)]
fn chmod_safe_subtrees(state_dir: &std::path::Path) {
    let _ = fs::set_permissions(state_dir, fs::Permissions::from_mode(0o755));

    let state_file = state_dir.join("state.toml");
    if state_file.exists() {
        let _ = fs::set_permissions(&state_file, fs::Permissions::from_mode(0o644));
    }

    // Only these two subdirectories are considered safe to open up.
    ["sock", "logs"]
        .iter()
        .map(|subdir_name| state_dir.join(subdir_name))
        .filter(|subdir| subdir.is_dir())
        .for_each(|subdir| chmod_recursive(&subdir));
}

/// Startup sweep for daemon processes that outlived a previous supervisor
/// which was terminated unexpectedly (e.g. `kill -9`).
///
/// Every state-file entry other than the pitchfork entry that still records
/// a PID is examined:
///
/// * **PID not running** — nothing is killed.
/// * **PID running, but its current title differs from the recorded
///   `title`** — the PID is assumed to have been recycled by an unrelated
///   process; a warning is logged and nothing is killed.
/// * **Otherwise** — the process is treated as an orphan and its process
///   group is signalled with the daemon's configured stop signal and timeout
///   (defaults when none is recorded).
///
/// In all three cases the entry is then reset to `Stopped` with no PID and no
/// active port, so a stale PID is never tracked.
///
/// Gated by the `supervisor.cleanup_orphans` setting (default: true).
async fn cleanup_orphaned_daemons(supervisor: &Supervisor) {
    if !settings().supervisor.cleanup_orphans {
        return;
    }

    // Snapshot the candidates; the state guard is dropped at the end of this
    // block, before any per-daemon work starts.
    let candidates: Vec<_> = {
        let state = supervisor.state_file.lock().await;
        let pitchfork_id = DaemonId::pitchfork();
        state
            .daemons
            .values()
            .filter(|d| d.pid.is_some() && d.id != pitchfork_id)
            .cloned()
            .collect()
    };

    if candidates.is_empty() {
        return;
    }

    info!(
        "checking {} daemon(s) for orphaned processes",
        candidates.len()
    );

    for daemon in candidates {
        let Some(pid) = daemon.pid else { continue };

        PROCS.refresh_pids(&[pid]);

        if PROCS.is_running(pid) {
            // Safety check against PID recycling: only a positive mismatch
            // between two known titles blocks the kill. If either title is
            // unavailable the kill is allowed (degraded but functional — e.g.
            // the process predates title tracking, or the state was reset).
            let current_title = PROCS.title(pid);
            let title_matches =
                if let (Some(current), Some(expected)) = (&current_title, &daemon.title) {
                    current == expected
                } else {
                    true
                };

            if title_matches {
                info!("terminating orphaned daemon {} (pid {pid})", daemon.id);

                let stop_cfg = daemon.stop_signal.unwrap_or_default();
                let _ = PROCS
                    .kill_process_group_async(pid, stop_cfg.signal.into(), stop_cfg.timeout)
                    .await;
            } else {
                warn!(
                    "pid {pid} for daemon {} has changed name (expected '{}', found '{}'); skipping orphan cleanup",
                    daemon.id,
                    daemon.title.as_deref().unwrap_or("?"),
                    current_title.as_deref().unwrap_or("?")
                );
            }
        }

        // Whatever happened above, forget the recorded PID.
        let _ = supervisor
            .upsert_daemon(
                UpsertDaemonOpts::builder(daemon.id.clone())
                    .set(|o| {
                        o.pid = None;
                        o.status = DaemonStatus::Stopped;
                        o.active_port = None;
                    })
                    .build(),
            )
            .await;
    }
}

/// Recursively chmod: directories → 0o755, files → 0o644.
///
/// `dir` itself is changed first, then every entry below it. Failures are
/// ignored (best effort), and an unreadable directory ends the walk for that
/// subtree.
///
/// Symbolic links are skipped: they are neither followed into nor used as a
/// `chmod` target. Entry types come from [`std::fs::DirEntry::file_type`],
/// which does not traverse symlinks, so a link inside the tree cannot cause
/// permissions to be relaxed on files that live outside it.
#[cfg(unix)]
fn chmod_recursive(dir: &std::path::Path) {
    let _ = fs::set_permissions(dir, fs::Permissions::from_mode(0o755));
    let entries = match fs::read_dir(dir) {
        Ok(e) => e,
        Err(_) => return,
    };
    for entry in entries.flatten() {
        // If the type cannot be determined we cannot rule out a symlink, so
        // the entry is left untouched.
        let Ok(file_type) = entry.file_type() else {
            continue;
        };
        if file_type.is_symlink() {
            continue;
        }

        let path = entry.path();
        if file_type.is_dir() {
            chmod_recursive(&path);
        } else {
            let _ = fs::set_permissions(&path, fs::Permissions::from_mode(0o644));
        }
    }
}