Skip to main content

agent_sim/daemon/
lifecycle.rs

1use crate::daemon::error::DaemonError;
2use crate::load::{LoadSpec, write_load_spec};
3use std::io::Read;
4use std::path::{Path, PathBuf};
5use std::process::Stdio;
6use std::time::Duration;
7use tokio::net::UnixStream;
8use tokio::time::sleep;
9
/// Snapshot of a single daemon instance as discovered on disk.
///
/// Values of this type are produced by [`InstanceRegistry::list_instances`],
/// one per `*.sock` file found under [`session_root`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InstanceRuntimeInfo {
    /// Instance name, taken from the socket file's stem.
    pub name: String,
    /// Full path of the instance's Unix socket file.
    pub socket_path: PathBuf,
    /// Whether a connection to `socket_path` succeeded when the snapshot was taken.
    pub running: bool,
    /// Environment tag read from the instance's `.meta` file, if one is present
    /// and non-empty.
    pub env: Option<String>,
}
17
/// Stateless handle for locating, starting, and listing daemon instances.
///
/// The type carries no data; everything it reports is derived from the files
/// under [`session_root`] at call time.
#[derive(Debug, Clone, Copy, Default)]
pub struct InstanceRegistry;
20
21pub fn session_root() -> PathBuf {
22    if let Some(path) = std::env::var_os("AGENT_SIM_HOME") {
23        return PathBuf::from(path);
24    }
25
26    dirs::home_dir()
27        .unwrap_or_else(|| PathBuf::from("."))
28        .join(".agent-sim")
29}
30
31pub fn socket_path(session: &str) -> PathBuf {
32    session_root().join(format!("{session}.sock"))
33}
34
35pub fn pid_path(session: &str) -> PathBuf {
36    session_root().join(format!("{session}.pid"))
37}
38
39pub fn meta_path(session: &str) -> PathBuf {
40    session_root().join(format!("{session}.meta"))
41}
42
43pub fn bootstrap_dir() -> PathBuf {
44    session_root().join("bootstrap")
45}
46
47pub async fn ensure_daemon_running(session: &str) -> Result<(), DaemonError> {
48    InstanceRegistry.ensure_running(session).await
49}
50
51pub async fn bootstrap_daemon(session: &str, load_spec: &LoadSpec) -> Result<(), DaemonError> {
52    InstanceRegistry.bootstrap(session, load_spec).await
53}
54
55impl InstanceRegistry {
56    pub async fn ensure_running(self, session: &str) -> Result<(), DaemonError> {
57        std::fs::create_dir_all(session_root())?;
58        let socket = socket_path(session);
59        if can_connect(&socket).await {
60            return Ok(());
61        }
62        Err(DaemonError::NotRunning(session.to_string()))
63    }
64
65    pub async fn bootstrap(self, session: &str, load_spec: &LoadSpec) -> Result<(), DaemonError> {
66        std::fs::create_dir_all(session_root())?;
67        std::fs::create_dir_all(bootstrap_dir())?;
68        let socket = socket_path(session);
69        if can_connect(&socket).await {
70            return Err(DaemonError::AlreadyRunning(session.to_string()));
71        }
72        let bootstrap_path =
73            bootstrap_dir().join(format!("{session}-{}.json", uuid::Uuid::new_v4()));
74        write_load_spec(&bootstrap_path, load_spec)
75            .map_err(|err| DaemonError::Request(err.to_string()))?;
76        let mut child = self.spawn_daemon(session, &bootstrap_path)?;
77
78        let timeout = Duration::from_secs(5);
79        let mut waited = Duration::from_millis(0);
80        while waited < timeout {
81            if can_connect(&socket).await {
82                let _ = std::fs::remove_file(&bootstrap_path);
83                return Ok(());
84            }
85            if let Some(status) = child.try_wait()? {
86                let stderr = if let Some(mut pipe) = child.stderr.take() {
87                    tokio::task::spawn_blocking(move || {
88                        let mut stderr = String::new();
89                        let _ = pipe.read_to_string(&mut stderr);
90                        stderr
91                    })
92                    .await
93                    .unwrap_or_else(|_| String::new())
94                } else {
95                    String::new()
96                };
97                let _ = std::fs::remove_file(&bootstrap_path);
98                let details = stderr.trim();
99                let message = if details.is_empty() {
100                    format!("daemon exited with status {status}")
101                } else {
102                    details.to_string()
103                };
104                return Err(DaemonError::StartupFailed(message));
105            }
106            sleep(Duration::from_millis(100)).await;
107            waited += Duration::from_millis(100);
108        }
109        cleanup_bootstrap_timeout(&mut child, &bootstrap_path);
110        Err(DaemonError::StartupTimeout)
111    }
112
113    fn spawn_daemon(
114        self,
115        session: &str,
116        bootstrap_path: &Path,
117    ) -> Result<std::process::Child, DaemonError> {
118        let exe = std::env::current_exe()?;
119        let mut command = std::process::Command::new(exe);
120        command
121            .arg("__internal")
122            .arg("instance-daemon")
123            .arg("--instance")
124            .arg(session)
125            .arg("--load-spec-path")
126            .arg(bootstrap_path)
127            .stdout(Stdio::null())
128            .stderr(Stdio::piped());
129        let child = command.spawn()?;
130        Ok(child)
131    }
132
133    pub async fn list_instances(self) -> Result<Vec<InstanceRuntimeInfo>, DaemonError> {
134        let root = session_root();
135        std::fs::create_dir_all(&root)?;
136        let mut out = Vec::new();
137        for entry in std::fs::read_dir(root)? {
138            let entry = entry?;
139            let path = entry.path();
140            if path.extension().and_then(|v| v.to_str()) != Some("sock") {
141                continue;
142            }
143            let Some(stem) = path.file_stem().and_then(|s| s.to_str()) else {
144                continue;
145            };
146            let running = can_connect(&path).await;
147            let env = read_env_tag(stem);
148            out.push(InstanceRuntimeInfo {
149                name: stem.to_string(),
150                socket_path: path,
151                running,
152                env,
153            });
154        }
155        out.sort_by(|a, b| a.name.cmp(&b.name));
156        Ok(out)
157    }
158}
159
160pub async fn list_instances() -> Result<Vec<InstanceRuntimeInfo>, DaemonError> {
161    InstanceRegistry.list_instances().await
162}
163
/// Tears down a daemon start attempt that did not come up in time.
///
/// Removes the bootstrap file, kills the child, and then waits on it so the
/// process is reaped. Every step is best-effort: failures are ignored so the
/// remaining steps still run.
fn cleanup_bootstrap_timeout(child: &mut std::process::Child, bootstrap_path: &Path) {
    std::fs::remove_file(bootstrap_path).ok();
    child.kill().ok();
    // Waiting after the kill collects the exit status so no zombie is left.
    child.wait().ok();
}
169
170async fn can_connect(socket: &Path) -> bool {
171    UnixStream::connect(socket).await.is_ok()
172}
173
174pub fn write_env_tag(session: &str, env: Option<&str>) -> Result<(), DaemonError> {
175    let path = meta_path(session);
176    if let Some(env) = env {
177        std::fs::write(path, env)?;
178    } else if path.exists() {
179        let _ = std::fs::remove_file(path);
180    }
181    Ok(())
182}
183
184pub fn read_env_tag(session: &str) -> Option<String> {
185    let path = meta_path(session);
186    std::fs::read_to_string(path)
187        .ok()
188        .map(|value| value.trim().to_string())
189        .filter(|value| !value.is_empty())
190}
191
192pub fn remove_env_tag(session: &str) {
193    let path = meta_path(session);
194    if path.exists() {
195        let _ = std::fs::remove_file(path);
196    }
197}
198
199pub fn read_pid(session: &str) -> Option<u32> {
200    std::fs::read_to_string(pid_path(session))
201        .ok()
202        .and_then(|value| value.trim().parse::<u32>().ok())
203}
204
205pub fn kill_pid(pid: u32) -> Result<(), DaemonError> {
206    #[cfg(unix)]
207    {
208        let result = unsafe { libc::kill(pid as i32, libc::SIGKILL) };
209        if result != 0 {
210            return Err(DaemonError::Request(format!(
211                "failed to kill pid {pid}: {}",
212                std::io::Error::last_os_error()
213            )));
214        }
215        Ok(())
216    }
217    #[cfg(not(unix))]
218    {
219        let _ = pid;
220        Err(DaemonError::Request(
221            "pid kill fallback is not supported on this platform".to_string(),
222        ))
223    }
224}
225
#[cfg(test)]
mod tests {
    use super::cleanup_bootstrap_timeout;
    use std::process::{Command, Stdio};

    /// The timeout cleanup must delete the bootstrap file and leave the child
    /// process killed and reaped.
    #[cfg(unix)]
    #[test]
    fn timeout_cleanup_kills_child_and_removes_bootstrap_file() {
        // Keep the temp-file handle alive for the whole test; only its path is
        // handed to the function under test.
        let bootstrap_file = tempfile::NamedTempFile::new().expect("temp bootstrap file");
        let path = bootstrap_file.path().to_path_buf();

        // A long-running child stands in for a daemon that never came up.
        let mut sleeper = Command::new("sh")
            .args(["-c", "sleep 30"])
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .spawn()
            .expect("sleep child should spawn");

        cleanup_bootstrap_timeout(&mut sleeper, &path);

        let file_still_present = path.exists();
        assert!(
            !file_still_present,
            "bootstrap file should be removed during timeout cleanup"
        );

        let exit_status = sleeper
            .try_wait()
            .expect("child status should be queryable");
        assert!(exit_status.is_some(), "timed-out child should be reaped");
    }
}