innisfree 0.4.3

//! High-level controller logic for managing
//! service proxies, i.e. [TunnelManager].

use crate::config::{Protocol, ServicePort};
use crate::proxy::{proxy_handler, udp_handler};
use crate::server::{InnisfreeServer, Provider, ServerSpec};
use crate::ssh::SshKeypair;
use crate::state::{
    remove_state_for_service, TunnelConfig, TunnelIdentity, TunnelStateDir, TunnelStatus,
};
// `remove_state_for_service` is intentionally retained: clean shutdown
// (`Self::clean`) still wipes the dir on the way out, but the wipe-then-
// recreate that used to live in `Self::new` is gone — that hid leftover
// state from crashed prior runs. The CLI now opts in via `--force`.
use crate::wg::{LocalWg, WireguardManager};
use anyhow::{anyhow, bail, Context, Result};
use futures::future::{join_all, BoxFuture};
use std::net::{IpAddr, SocketAddr};
use std::time::Duration;
use tokio::net::TcpStream;
use tokio::signal::unix::{signal, SignalKind};

/// Controller for a single innisfree tunnel.
///
/// Bundles everything one tunnel needs: the handle to the remote server,
/// the WireGuard configuration, the SSH keypairs used to reach and verify
/// that server, and the on-disk state directory. Construct it with
/// [`Self::new`], bring the tunnel online with [`Self::up`], and tear
/// everything down again with [`Self::clean`].
///
/// Fields are `pub(crate)` so the layout can change freely without
/// breaking external callers — reach in via the inherent accessors
/// ([`Self::server_ipv4`], [`Self::local_wg_address`], [`Self::services`])
/// instead.
pub struct TunnelManager {
    /// List of `ServicePort`s to manage connections for.
    pub(crate) services: Vec<ServicePort>,
    /// Remote server handling public ingress. Boxed trait object, so the
    /// concrete cloud backend is not visible from this type.
    pub(crate) server: Box<dyn InnisfreeServer>,
    /// Human-readable name for this service manager. Retained alongside
    /// `state` because it's also used in user-facing log messages; it is
    /// also the key [`Self::clean`] passes to `remove_state_for_service`.
    pub(crate) name: String,
    /// On-disk per-tunnel state dir (config files, ip marker, etc.).
    pub(crate) state: TunnelStateDir,
    /// Controller for Wireguard tunnels. Holds both the local and the
    /// remote device configuration.
    pub(crate) wg: WireguardManager,
    /// SSH keypair for managing client-side SSH connections. Written to
    /// the state dir and handed to `ssh -i` whenever a remote command runs.
    pub(crate) ssh_client_keypair: SshKeypair,
    /// SSH keypair for identifying remote SSH server identity. Its public
    /// half is what ends up in the generated `known_hosts` file.
    pub(crate) ssh_server_keypair: SshKeypair,
    /// Live local Wireguard interface, populated by [`Self::up`].
    /// Dropping it tears the interface down. `None` before `up` and
    /// again after [`Self::clean`].
    local_wg: Option<LocalWg>,
}

impl TunnelManager {
    /// Public IPv4 address of the remote cloud server. Errors if the
    /// underlying provider has no public v4 network on the server (in
    /// practice: not yet booted). Named explicitly `_ipv4` to mirror
    /// [`InnisfreeServer::ipv4_address`] — IPv6 will arrive as a sibling
    /// method, not as a generalization of this one.
    pub fn server_ipv4(&self) -> Result<IpAddr> {
        self.server.ipv4_address()
    }

    /// Address of the local Wireguard interface — what local services
    /// must bind to in order to receive forwarded traffic.
    pub fn local_wg_address(&self) -> IpAddr {
        self.wg.local_device.interface.address
    }

    /// Service ports being proxied through this tunnel.
    pub fn services(&self) -> &[ServicePort] {
        &self.services
    }
}

impl TunnelManager {
    /// Build a controller for the tunnel described by `config` and create
    /// its remote server. Call [`Self::up`] afterwards to actually bring
    /// the tunnel online.
    ///
    /// `provider` selects the cloud backend; it is only used through the
    /// [`Provider`] trait, so adding another backend does not touch this
    /// function. `config` carries the operator's intent (the same fields
    /// the `up` CLI exposes).
    ///
    /// Order matters here: the config and the WireGuard identity are
    /// written to the state dir *before* the first cloud call, so a
    /// bringup that dies half-way still leaves its intent on disk for
    /// diagnosis, and the next `up` for the same name is pushed towards
    /// `--force` / `innisfree clean`.
    ///
    /// # Errors
    /// Propagates failures from opening the state dir, generating keys,
    /// persisting config/identity, creating the server, or assigning the
    /// floating IP.
    pub async fn new(provider: Box<dyn Provider>, config: TunnelConfig) -> Result<TunnelManager> {
        // Strict on purpose: `for_service` takes the per-tunnel lock and
        // refuses when a crashed prior run left state behind. Callers that
        // want wipe-first semantics (CLI: `innisfree up --force`) must call
        // `remove_state_for_service` themselves before getting here, so the
        // library never silently overwrites half-written keys.
        let state = TunnelStateDir::for_service(&config.name)?;
        let wg = WireguardManager::new(&config.name)?;

        // Fresh, ephemeral SSH keypairs: one for us, one for the server.
        let ssh_client_keypair = SshKeypair::new()?;
        let ssh_server_keypair = SshKeypair::new()?;

        // Persist user intent + crypto identity ahead of any cloud call.
        // Should `provider.create` fail below, the lock is released on
        // return and the next `innisfree up --name <svc>` trips the
        // leftover-state check — same UX as a crash mid-bringup.
        state
            .write_config(&config)
            .context("persisting tunnel config")?;
        state
            .write_identity(&TunnelIdentity {
                wireguard: wg.clone(),
            })
            .context("persisting tunnel identity")?;

        let server_spec = ServerSpec {
            name: config.name.clone(),
            services: config.services.clone(),
            wg_mgr: wg.clone(),
            ssh_client_keypair: ssh_client_keypair.clone(),
            ssh_server_keypair: ssh_server_keypair.clone(),
        };
        let server = provider.create(&server_spec).await?;

        // NOTE(review): if this fails the function returns early while the
        // freshly created server still exists; nothing here calls
        // `destroy`. Confirm that `clean` / `--force` covers that case.
        if let Some(floating_ip) = config.floating_ip {
            server.assign_floating_ip(floating_ip).await?;
        }

        Ok(TunnelManager {
            services: config.services,
            server,
            name: config.name,
            state,
            wg,
            ssh_client_keypair,
            ssh_server_keypair,
            local_wg: None,
        })
    }
    /// Bring the tunnel online end to end.
    ///
    /// The remote server itself was already created by [`Self::new`]; this
    /// method configures it and connects to it. In order, it:
    ///
    /// 1. waits for SSH to answer, then for cloud-init to finish;
    /// 2. points the local WireGuard peer at the server's IPv4 address and
    ///    writes the rendered config into the state dir;
    /// 3. brings up the remote WireGuard interface, then the local one;
    /// 4. pings across the tunnel and, only then, writes the ready marker.
    ///
    /// # Errors
    /// Stops at the first failing step and returns its error, with context
    /// naming the step where one is attached.
    pub async fn up(&mut self) -> Result<()> {
        self.wait_for_ssh().await?;
        tracing::debug!("Configuring remote proxy...");
        self.wait_for_cloudinit()
            .await
            .context("failed while waiting for cloudinit")?;

        let remote_ip = self.server.ipv4_address()?;
        tracing::debug!("Configuring tunnel...");
        // The local peer cannot connect until it knows where the remote
        // end lives, and that is only known once the server has an address.
        self.wg.local_device.peer.endpoint = Some(remote_ip);

        // The rendered wg0.conf is kept for debugging / SSH inspection;
        // the local runtime does not read it back.
        let wg_conf_path = self.state.wg_conf();
        self.wg
            .local_device
            .write_to(&wg_conf_path, &self.services)
            .context("failed to write wireguard configs")?;

        tracing::debug!("Bringing up remote Wireguard interface");
        self.bring_up_remote_wg()
            .await
            .context("failed to bring up remote wg interface")?;

        tracing::debug!("Bringing up local Wireguard interface");
        self.local_wg = Some(
            LocalWg::start(&self.wg.local_device)
                .await
                .context("failed to bring up local wg interface")?,
        );

        self.test_connection().await?;

        // The ready marker doubles as the readiness signal for
        // `innisfree ip` (and anything polling it, like the integration
        // test). It is written last, after `test_connection`, so that a
        // successful `innisfree ip` means the tunnel is usable and not
        // merely that a known_hosts file appeared as a side effect of the
        // cloud-init wait.
        self.write_ready_marker()
            .context("writing ready marker after tunnel came up")
    }

    /// Record on disk that the tunnel is ready.
    ///
    /// Builds a [`TunnelStatus`] from the server's IPv4 address, its
    /// server id and the current UTC time, and hands it to
    /// `TunnelStateDir::write_status`. [`get_server_ip`] (and thus
    /// `innisfree ip` / `innisfree ssh`) reads that record back as both
    /// the IP source and the readiness signal.
    ///
    /// NOTE(review): older docs say a bare `ip` file is written as well
    /// for back-compat; this function only calls `write_status`, so that
    /// would have to happen inside it — confirm.
    ///
    /// # Errors
    /// Fails if the server has no IPv4 address or the status cannot be
    /// written.
    fn write_ready_marker(&self) -> Result<()> {
        let ip = self.server.ipv4_address()?;
        let droplet_id = self.server.server_id();
        let ready_at = time::OffsetDateTime::now_utc();
        let status = TunnelStatus {
            ip,
            droplet_id,
            ready_at,
        };
        self.state
            .write_status(&status)
            .context("writing tunnel status")?;
        Ok(())
    }
    /// Waits for cloud-init on the server to finish by running
    /// `cloud-init status --long --wait` over SSH. Any failure of that
    /// remote command is returned as the error.
    async fn wait_for_cloudinit(&self) -> Result<()> {
        self.run_ssh_cmd(vec!["cloud-init", "status", "--long", "--wait"])
            .await
    }
    /// Waits until 22/TCP on the server accepts a connection.
    ///
    /// Every attempt is capped at 5 s so an unreachable host cannot pin
    /// the loop on the kernel's much longer default connect timeout; after
    /// a failed or timed-out attempt the loop sleeps 10 s and tries again.
    /// There is no overall deadline: the only error this returns is a
    /// failure to determine the server's IPv4 address up front.
    async fn wait_for_ssh(&self) -> Result<()> {
        const CONNECT_TIMEOUT: Duration = Duration::from_secs(5);
        const RETRY_DELAY: Duration = Duration::from_secs(10);

        let ssh_addr = SocketAddr::new(self.server.ipv4_address()?, 22);
        loop {
            let attempt =
                tokio::time::timeout(CONNECT_TIMEOUT, TcpStream::connect(ssh_addr)).await;
            // Outer `Ok` = finished before the timeout, inner `Ok` = the
            // TCP handshake itself succeeded.
            if let Ok(Ok(_stream)) = attempt {
                tracing::debug!("SSH port is open, proceeding");
                return Ok(());
            }
            tracing::debug!("Waiting for ssh...");
            tokio::time::sleep(RETRY_DELAY).await;
        }
    }
    /// Park until the process receives SIGTERM or SIGINT, then tear the
    /// tunnel down via [`Self::clean`].
    ///
    /// Returns `Ok(())` once cleanup has finished; what happens next is
    /// the caller's call (typically: fall out of `main` and let the
    /// process exit). Nothing in here calls `process::exit`, so exit
    /// codes stay under the caller's control.
    ///
    /// SIGTERM is handled alongside SIGINT so that systemd's default
    /// `KillSignal=SIGTERM` still results in the cloud node being cleaned
    /// up, which lets the unit file drop the old `KillSignal=SIGINT` hack.
    ///
    /// # Errors
    /// Fails if either signal handler cannot be registered, or if cleanup
    /// fails.
    pub async fn block(&mut self) -> Result<()> {
        let mut on_sigterm =
            signal(SignalKind::terminate()).context("registering SIGTERM handler")?;
        let mut on_sigint =
            signal(SignalKind::interrupt()).context("registering SIGINT handler")?;

        // Whichever signal arrives first wins; the other stream is dropped.
        tokio::select! {
            _ = on_sigterm.recv() => {
                tracing::warn!("Received SIGTERM, exiting gracefully");
            }
            _ = on_sigint.recv() => {
                tracing::warn!("Received SIGINT, exiting gracefully");
            }
        }

        self.clean().await?;
        tracing::info!("Clean up complete");
        Ok(())
    }
    /// Ping remote remote Wireguard IP from local Wireguard device.
    /// Ensures connectivity is established between remote and local interfaces.
    async fn test_connection(&self) -> Result<()> {
        let ip = self.wg.remote_device.interface.address;
        tokio::process::Command::new("ping")
            .arg("-c1")
            .arg("-w5")
            .arg(ip.to_string())
            .stdout(std::process::Stdio::null())
            .stderr(std::process::Stdio::null())
            .status()
            .await
            .context("Failed to ping remote Wireguard interface, tunnel broken")?;
        tracing::debug!("Confirmed tunnel is established, able to ping across it");
        Ok(())
    }
    /// Activates the server's Wireguard interface by running
    /// `wg-quick up /tmp/innisfree.conf` over SSH.
    ///
    /// The remote node is still Debian + cloud-init + wg-quick (Phase 2
    /// will migrate it to NixOS); only the local side uses the in-process
    /// runtime.
    async fn bring_up_remote_wg(&self) -> Result<()> {
        tracing::trace!("Activating remote wg interface");
        self.run_ssh_cmd(vec!["wg-quick", "up", "/tmp/innisfree.conf"])
            .await
    }
    /// Writes the SSH `known_hosts` file for the remote server and returns
    /// its path as a string.
    ///
    /// The file holds a single `<ipv4> <public host key>` entry built from
    /// the server host keypair this controller generated, which lets the
    /// very first SSH connection be verified. The file is rewritten on
    /// every call.
    ///
    /// # Errors
    /// Fails if the server has no IPv4 address or the file cannot be
    /// written.
    fn known_hosts(&self) -> Result<String> {
        let server_ip = self.server.ipv4_address()?;
        let entry = format!("{} {}", server_ip, self.ssh_server_keypair.public);
        let known_hosts_path = self.state.known_hosts();
        std::fs::write(&known_hosts_path, entry).context("Failed to create known_hosts")?;
        Ok(known_hosts_path.display().to_string())
    }
    /// Run `cmd` on the remote server through the system `ssh` binary.
    ///
    /// Before connecting, the client private key and the `known_hosts`
    /// file are (re)written into the state dir so `ssh` can use them. The
    /// remote command's stdout and stderr are captured: on success they
    /// are logged at debug level when non-blank, on failure they are
    /// embedded in the returned error.
    ///
    /// # Errors
    /// Fails if the key or `known_hosts` cannot be written, if `ssh`
    /// cannot be invoked, or if it exits non-zero. The exit status is
    /// checked explicitly because a non-zero exit is not an `Err` at the
    /// `Command` level — relying on `?` alone would silently swallow
    /// remote-side failures.
    async fn run_ssh_cmd(&self, cmd: Vec<&str>) -> Result<()> {
        let key_path = self.state.client_key();
        self.ssh_client_keypair.write_to(&key_path)?;
        let known_hosts = self.known_hosts()?;
        let ip = self.server.ipv4_address()?.to_string();
        let pretty_cmd = cmd.join(" ");

        // Connection options first, then the remote command's own words.
        let ssh_args: Vec<String> =
            ssh_base_args(&key_path.display().to_string(), &known_hosts, &ip)
                .into_iter()
                .chain(cmd.iter().map(|word| String::from(*word)))
                .collect();

        let output = tokio::process::Command::new("ssh")
            .args(&ssh_args)
            .output()
            .await
            .context("invoking ssh")?;

        let stdout = String::from_utf8_lossy(&output.stdout);
        let stderr = String::from_utf8_lossy(&output.stderr);
        let remote_stdout = stdout.trim_end();
        let remote_stderr = stderr.trim_end();

        if !output.status.success() {
            bail!(
                "remote command `{pretty_cmd}` failed (ssh exit {}):\n\
                 --- stderr ---\n{}\n--- stdout ---\n{}",
                output.status,
                remote_stderr,
                remote_stdout,
            );
        }

        // Whitespace-only output is treated as no output at all.
        if !remote_stdout.is_empty() {
            tracing::debug!("remote `{pretty_cmd}` stdout:\n{}", remote_stdout);
        }
        if !remote_stderr.is_empty() {
            tracing::debug!("remote `{pretty_cmd}` stderr:\n{}", remote_stderr);
        }
        Ok(())
    }
    /// Tear everything down: the local Wireguard interface, the remote
    /// server, and the local state dir.
    ///
    /// Server destruction and state removal are both attempted no matter
    /// which one fails — a leaked droplet costs money, but stale local
    /// config is worth removing too.
    ///
    /// # Errors
    /// If destroying the server failed, that error is returned (after
    /// being logged loudly so the operator knows manual cleanup may be
    /// needed). Otherwise the outcome of removing the state dir is
    /// returned.
    pub async fn clean(&mut self) -> Result<()> {
        tracing::debug!("removing local Wireguard interface");
        // Dropping the runtime is the teardown: boringtun's
        // DeviceHandle::Drop removes the TUN device and joins its workers.
        drop(self.local_wg.take());

        let server_outcome = self
            .server
            .destroy()
            .await
            .context("destroying remote server");
        if let Err(e) = &server_outcome {
            tracing::error!(
                "failed to destroy remote server (manual cleanup may be required): {e:#}"
            );
        }

        // Evaluated unconditionally, before either result is propagated.
        let state_outcome =
            remove_state_for_service(&self.name).context("removing local state dir");

        // A destroy failure takes precedence over a state-removal failure.
        server_outcome.and(state_outcome)
    }
}

/// Look up the IPv4 address for the remote server. Reads the
/// structured `status.json` written by
/// [`TunnelManager::write_ready_marker`] after the tunnel comes up
/// successfully — its presence doubles as the readiness signal, so
/// callers polling for readiness don't need a separate channel.
///
/// Falls back to the legacy bare `ip` file when `status.json` is
/// absent (a tunnel brought up by 0.4.x or earlier). Drop the
/// fallback in 0.6 alongside `ip_marker`.
pub fn get_server_ip(service_name: &str) -> Result<IpAddr> {
    tracing::trace!("Looking up server IP from ready marker");
    let state = TunnelStateDir::open(service_name)?;
    if let Some(status) = state.read_status().context("reading status.json")? {
        return Ok(status.ip);
    }
    // Pre-0.5 fallback: bare `ip` marker.
    let fpath = state.ip_marker();
    let content = std::fs::read_to_string(&fpath)
        .with_context(|| format!("reading {} (tunnel may not be ready yet)", fpath.display()))?;
    let ip: IpAddr = content
        .trim()
        .parse()
        .with_context(|| format!("parsing IP from {}", fpath.display()))?;
    Ok(ip)
}

/// Open an interactive SSH session on the remote server for
/// `service_name`.
///
/// Uses the client key and `known_hosts` file already present in the
/// tunnel's state dir together with the IP from [`get_server_ip`], and
/// runs the system `ssh` with inherited stdio until the session ends.
///
/// # Errors
/// Fails if the state dir cannot be opened, the server IP cannot be
/// determined, or `ssh` cannot be run. The session's exit status is not
/// inspected: a session that ends with a non-zero code still yields
/// `Ok(())`.
pub async fn open_shell(service_name: &str) -> Result<()> {
    let state = TunnelStateDir::open(service_name)?;
    let identity_file = state.client_key().display().to_string();
    let known_hosts_file = state.known_hosts().display().to_string();
    let server_ip = get_server_ip(service_name)?.to_string();

    let _exit_status = tokio::process::Command::new("ssh")
        .args(ssh_base_args(&identity_file, &known_hosts_file, &server_ip))
        .status()
        .await
        .context("SSH interactive session failed")?;
    Ok(())
}

/// Assemble the argument vector shared by every `ssh` invocation:
/// `-l innisfree -i <key> -o UserKnownHostsFile=<known_hosts>
/// -o ConnectTimeout=5 <ip>`.
///
/// The destination is always the last element, so a remote command can
/// be appended to run it non-interactively; used as-is, the arguments
/// start an interactive shell.
fn ssh_base_args(key_path: &str, known_hosts_path: &str, ip: &str) -> Vec<String> {
    let known_hosts_opt = format!("UserKnownHostsFile={known_hosts_path}");
    [
        "-l",
        "innisfree",
        "-i",
        key_path,
        "-o",
        known_hosts_opt.as_str(),
        "-o",
        "ConnectTimeout=5",
        ip,
    ]
    .iter()
    .copied()
    .map(String::from)
    .collect()
}

/// Spin up local network proxy to handle passing traffic
/// between the local service(s) and the remote server.
pub async fn run_proxy(
    local_ip: IpAddr,
    dest_ip: IpAddr,
    services: Vec<ServicePort>,
) -> Result<()> {
    // We'll kick off a dedicated proxy for each service, dispatching
    // on protocol, and collect them to await concurrently. Box the
    // futures so TCP and UDP handlers (different concrete types) can
    // share a single Vec without spawning each onto its own task.
    let mut tasks: Vec<BoxFuture<'static, Result<()>>> = vec![];
    for s in services {
        let listen_addr: SocketAddr = format!("{}:{}", local_ip, &s.local_port).parse()?;
        let dest_addr: SocketAddr = format!("{}:{}", dest_ip, &s.port).parse()?;
        let h: BoxFuture<'static, Result<()>> = match s.protocol {
            Protocol::Tcp => Box::pin(proxy_handler(listen_addr, dest_addr)),
            Protocol::Udp => Box::pin(udp_handler(listen_addr, dest_addr)),
        };
        tasks.push(h);
    }
    // We expect the proxies to block indefinitely, except ctrl+c.
    // If they return earlier, we'll be able to inspect the errors.
    let proxy_tasks = join_all(tasks).await;
    tracing::warn!("Proxy stopped unexpectedly, no longer forwarding traffic");
    for t in proxy_tasks {
        match t {
            Ok(t) => {
                // I don't expect to see this
                tracing::debug!("Service proxy returned ok: {:?}", t);
            }
            Err(e) => {
                return Err(anyhow!("Service proxy failed: {}", e));
            }
        }
    }
    Ok(())
}