Skip to main content

astrid_kernel/
socket.rs

1use std::path::PathBuf;
2
3use astrid_core::session_token::SessionToken;
4use tokio::net::UnixListener;
5use tracing::warn;
6
7/// Path to the local Unix Domain Socket for the kernel.
8#[must_use]
9pub(crate) fn kernel_socket_path() -> PathBuf {
10    use astrid_core::dirs::AstridHome;
11    match AstridHome::resolve() {
12        Ok(home) => home.socket_path(),
13        Err(e) => {
14            warn!(error = %e, "Failed to resolve ASTRID_HOME; falling back to /tmp/.astrid/run/system.sock");
15            PathBuf::from("/tmp/.astrid/run/system.sock")
16        },
17    }
18}
19
/// Upper bound, in bytes, on a Unix domain socket path for the target platform.
///
/// `sockaddr_un.sun_path` is 104 bytes on macOS, FreeBSD and OpenBSD, and 108
/// bytes on Linux. Every target not listed here uses the 108-byte value.
const MAX_SOCKET_PATH_LEN: usize =
    if cfg!(any(target_os = "macos", target_os = "freebsd", target_os = "openbsd")) {
        104
    } else {
        108
    };
26
27/// Binds a local Unix Domain Socket for the OS and acquires the singleton lock.
28/// Returns the bound listener (for the WASM execution context) plus the lock
29/// file, which the caller MUST keep alive for the process lifetime.
30///
31/// Takes the already-resolved [`AstridHome`](astrid_core::dirs::AstridHome) so
32/// the path is resolved exactly once, by the caller. There is intentionally no
33/// `/tmp` fallback: the caller resolves `ASTRID_HOME` strictly and a daemon
34/// that can't resolve it refuses to boot, rather than binding a divergent
35/// `/tmp` path and running side by side with another instance (split-brain).
36///
37/// # Errors
38/// Returns an error if the socket cannot be bound, the path exceeds the
39/// platform's `sun_path` limit, the singleton lock is already held by another
40/// kernel instance, or another kernel is already listening on the socket.
41pub(crate) fn bind_session_socket(
42    home: &astrid_core::dirs::AstridHome,
43) -> Result<(UnixListener, std::fs::File), std::io::Error> {
44    let path = home.socket_path();
45
46    // Create the run directory first — both the lockfile and the socket live
47    // in it. Enforce 0o700: AstridHome::ensure() does this at boot, but if the
48    // directory was just created here it would inherit the process umask
49    // (commonly 0o755, making the socket listable by other users).
50    if let Some(parent) = path.parent() {
51        std::fs::create_dir_all(parent).map_err(|e| {
52            std::io::Error::other(format!(
53                "Failed to create socket parent directory {}: {e}",
54                parent.display()
55            ))
56        })?;
57
58        #[cfg(unix)]
59        {
60            use std::os::unix::fs::PermissionsExt;
61            std::fs::set_permissions(parent, std::fs::Permissions::from_mode(0o700))?;
62        }
63    }
64
65    // Singleton guard: hold an exclusive advisory lock on a lockfile next to
66    // the socket for the daemon's lifetime. This closes the connect-probe ->
67    // bind TOCTOU window in `prepare_socket_path` deterministically — a second
68    // daemon fails to acquire the lock and exits before touching the socket.
69    // The OS releases the lock when the process dies, so a crashed daemon
70    // never wedges a restart. The caller MUST keep the returned file alive for
71    // the process lifetime (dropping it releases the lock).
72    let lock = acquire_singleton_lock(&path.with_file_name("system.lock"))?;
73
74    prepare_socket_path(&path)?;
75
76    // Also clean stale readiness file as defense-in-depth for daemon
77    // crashes that bypassed graceful shutdown.
78    remove_readiness_file();
79
80    let listener = UnixListener::bind(&path)?;
81    Ok((listener, lock))
82}
83
84/// Acquire an exclusive, non-blocking advisory lock on `lock_path`, returning
85/// the open file handle. The lock is held for as long as the returned `File`
86/// is alive — the caller stores it for the daemon's lifetime, and the OS
87/// releases it on process exit (so a crash can't wedge a restart). The
88/// lockfile itself is intentionally left in place between runs.
89fn acquire_singleton_lock(lock_path: &std::path::Path) -> Result<std::fs::File, std::io::Error> {
90    use std::fs::OpenOptions;
91
92    let mut opts = OpenOptions::new();
93    opts.read(true).write(true).create(true);
94    #[cfg(unix)]
95    {
96        use std::os::unix::fs::OpenOptionsExt;
97        opts.mode(0o600);
98    }
99    let file = opts.open(lock_path).map_err(|e| {
100        std::io::Error::other(format!(
101            "Failed to open singleton lockfile {}: {e}",
102            lock_path.display()
103        ))
104    })?;
105
106    file.try_lock().map_err(|e| match e {
107        std::fs::TryLockError::WouldBlock => std::io::Error::other(format!(
108            "Another kernel instance is already running (singleton lock held): {}",
109            lock_path.display()
110        )),
111        std::fs::TryLockError::Error(err) => std::io::Error::other(format!(
112            "Failed to acquire singleton lock {}: {err}",
113            lock_path.display()
114        )),
115    })?;
116
117    Ok(file)
118}
119
120/// Generate a random session token and write it to the token file.
121///
122/// Returns both the token and the path it was written to. The caller should
123/// store the path so that the exact same path is used for cleanup at shutdown
124/// (avoids fallback mismatch if the env changes between boot and shutdown).
125///
126/// The token is written with 0o600 permissions so only the owning user
127/// can read it. The CLI reads this token at connect time and sends it
128/// as part of the handshake.
129///
130/// # Errors
131/// Returns an error if `ASTRID_HOME` cannot be resolved or the token file
132/// cannot be written. Unlike socket/CLI paths, there is no `/tmp` fallback
133/// because writing a secret token under a world-listable directory would
134/// undermine the authentication it provides.
135pub(crate) fn generate_session_token() -> Result<(SessionToken, PathBuf), std::io::Error> {
136    use astrid_core::dirs::AstridHome;
137
138    let token = SessionToken::generate();
139
140    let home = AstridHome::resolve().map_err(|e| {
141        std::io::Error::other(format!(
142            "Cannot generate session token: failed to resolve ASTRID_HOME: {e}"
143        ))
144    })?;
145
146    let path = home.token_path();
147    token.write_to_file(&path)?;
148    Ok((token, path))
149}
150
151/// Validate a socket path and handle stale/live socket detection.
152///
153/// Extracted from `bind_session_socket` for testability. Returns `Ok(())`
154/// if the path is safe to bind (stale socket removed or no socket exists).
155/// Returns `Err` if the path is too long or another kernel is listening.
156fn prepare_socket_path(path: &std::path::Path) -> Result<(), std::io::Error> {
157    let path_len = path.as_os_str().as_encoded_bytes().len();
158    if path_len >= MAX_SOCKET_PATH_LEN {
159        return Err(std::io::Error::other(format!(
160            "Socket path is {path_len} bytes, exceeding the platform limit of {MAX_SOCKET_PATH_LEN} bytes: {}",
161            path.display()
162        )));
163    }
164
165    if path.is_symlink() {
166        warn!(path = %path.display(), "Removing unexpected symlink at socket path");
167        std::fs::remove_file(path).map_err(|e| {
168            std::io::Error::other(format!(
169                "Failed to remove symlink at socket path {}: {e}",
170                path.display()
171            ))
172        })?;
173    } else if path.exists() {
174        match std::os::unix::net::UnixStream::connect(path) {
175            Ok(_stream) => {
176                return Err(std::io::Error::other(format!(
177                    "Another kernel instance is already running on this socket: {}",
178                    path.display()
179                )));
180            },
181            Err(e) if e.kind() == std::io::ErrorKind::ConnectionRefused => {
182                // No listener attached: stale socket, safe to remove.
183                std::fs::remove_file(path).map_err(|e| {
184                    std::io::Error::other(format!(
185                        "Failed to remove stale socket {}: {e}",
186                        path.display()
187                    ))
188                })?;
189            },
190            Err(e) => {
191                // Other errors (EACCES, etc.) may indicate a live kernel
192                // under a different user or transient issue. Don't delete.
193                return Err(std::io::Error::other(format!(
194                    "Failed to probe existing socket {}: {e}",
195                    path.display()
196                )));
197            },
198        }
199    }
200
201    Ok(())
202}
203
204/// Path to the daemon readiness sentinel file.
205///
206/// NOTE: This is intentionally duplicated in `astrid-cli/src/socket_client.rs`
207/// because the CLI cannot depend on `astrid-kernel`. The canonical path
208/// definition is `AstridHome::ready_path()` in `astrid-core`.
209#[must_use]
210pub fn readiness_path() -> PathBuf {
211    use astrid_core::dirs::AstridHome;
212    match AstridHome::resolve() {
213        Ok(home) => home.ready_path(),
214        Err(e) => {
215            warn!(
216                error = %e,
217                "Failed to resolve ASTRID_HOME; falling back to /tmp/.astrid/run/system.ready"
218            );
219            PathBuf::from("/tmp/.astrid/run/system.ready")
220        },
221    }
222}
223
224/// Write the readiness sentinel file to signal that the daemon is fully
225/// initialized and accepting connections.
226///
227/// This must be called **after** `load_all_capsules()` completes (which
228/// includes `await_capsule_readiness()`). The CLI polls for this file
229/// instead of the socket file to avoid connecting before the accept loop
230/// is running.
231///
232/// # Errors
233/// Returns an error if the file cannot be written. The caller should treat
234/// this as a fatal boot failure - without the sentinel, the CLI will never
235/// detect that the daemon is ready.
236pub fn write_readiness_file() -> Result<(), std::io::Error> {
237    use std::fs::OpenOptions;
238
239    let path = readiness_path();
240
241    // Ensure the parent directory exists (defense-in-depth for contexts
242    // where bind_session_socket() has not run first).
243    if let Some(parent) = path.parent() {
244        std::fs::create_dir_all(parent)?;
245    }
246
247    // Create the sentinel file with owner-only permissions set atomically
248    // via OpenOptions::mode() to avoid a TOCTOU window where the file exists
249    // with default permissions before chmod.
250    let mut opts = OpenOptions::new();
251    opts.write(true).create(true).truncate(true);
252
253    #[cfg(unix)]
254    {
255        use std::os::unix::fs::OpenOptionsExt;
256        opts.mode(0o600);
257    }
258
259    opts.open(&path)?;
260    Ok(())
261}
262
263/// Remove the readiness sentinel file (best-effort).
264///
265/// Called during shutdown and stale-file cleanup. Errors are silently
266/// ignored - a missing file is not an error, and if removal fails the
267/// CLI's pre-spawn cleanup will handle it on next boot.
268pub fn remove_readiness_file() {
269    let _ = std::fs::remove_file(readiness_path());
270}
271
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::unix::net::UnixListener as StdUnixListener;

    /// Fresh temporary directory plus a path named `file_name` inside it.
    /// The directory guard must stay alive for as long as the path is used.
    fn scratch(file_name: &str) -> (tempfile::TempDir, PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join(file_name);
        (dir, path)
    }

    #[test]
    fn path_too_long_is_rejected() {
        // A file name alone that is longer than the platform limit.
        let long_name = "a".repeat(MAX_SOCKET_PATH_LEN + 10);
        let path = PathBuf::from(format!("/tmp/{long_name}.sock"));

        let err = prepare_socket_path(&path).unwrap_err();
        assert!(
            err.to_string().contains("exceeding the platform limit"),
            "unexpected error: {err}"
        );
    }

    #[test]
    fn stale_socket_is_removed() {
        let (_dir, sock) = scratch("test.sock");

        // Binding and immediately dropping the listener leaves the socket
        // file behind with nothing listening on it.
        drop(StdUnixListener::bind(&sock).unwrap());

        assert!(sock.exists(), "socket file should exist after bind");
        prepare_socket_path(&sock).unwrap();
        assert!(!sock.exists(), "stale socket should have been removed");
    }

    #[test]
    fn live_socket_is_rejected() {
        let (_dir, sock) = scratch("test.sock");

        // The listener stays in scope, so the connect probe succeeds.
        let _listener = StdUnixListener::bind(&sock).unwrap();

        let err = prepare_socket_path(&sock).unwrap_err();
        assert!(
            err.to_string().contains("already running"),
            "unexpected error: {err}"
        );
    }

    #[test]
    fn symlink_is_removed() {
        let (dir, sock) = scratch("test.sock");

        let target = dir.path().join("target");
        std::fs::write(&target, "not a socket").unwrap();
        std::os::unix::fs::symlink(&target, &sock).unwrap();
        assert!(sock.is_symlink());

        prepare_socket_path(&sock).unwrap();
        assert!(!sock.exists(), "symlink should have been removed");
        assert!(target.exists(), "target should be untouched");
    }

    #[test]
    fn nonexistent_path_succeeds() {
        let (_dir, sock) = scratch("does_not_exist.sock");
        prepare_socket_path(&sock).unwrap();
    }

    #[test]
    fn singleton_lock_is_exclusive() {
        let (_dir, lock) = scratch("system.lock");

        // The lock is held for as long as `_first` is alive.
        let _first = acquire_singleton_lock(&lock).expect("first acquisition succeeds");

        // While it is held, a second attempt must fail — this is the
        // "another kernel is already running" guard.
        let err = acquire_singleton_lock(&lock).unwrap_err();
        assert!(
            err.to_string().contains("already running"),
            "unexpected error: {err}"
        );
    }

    #[test]
    fn singleton_lock_is_released_on_drop() {
        let (_dir, lock) = scratch("system.lock");

        // Acquire and drop right away — mirrors a daemon exiting and
        // releasing the lock.
        drop(acquire_singleton_lock(&lock).expect("first acquisition succeeds"));

        // A fresh daemon can now take the same lock (no wedged restart).
        let _second =
            acquire_singleton_lock(&lock).expect("lock should be re-acquirable after release");
    }
}