Skip to main content

mockforge_registry_server/deployment/
cleanup.rs

//! Deployment cleanup worker
//!
//! Runs hourly to hard-delete rows that were soft-deleted more than 30 days ago,
//! mark deployments stuck in `deploying` as failed, and retry deletions stuck in `deleting`.
4
5use anyhow::{Context, Result};
6use sqlx::PgPool;
7use std::sync::Arc;
8use tokio::time::{interval, Duration};
9use tracing::{error, info, warn};
10
11use crate::deployment::flyio::FlyioClient;
12use crate::models::DeploymentStatus;
13
14/// Background worker that handles deployment lifecycle cleanup
15pub struct DeploymentCleanup {
16    db: Arc<PgPool>,
17    flyio_client: Option<FlyioClient>,
18}
19
20impl DeploymentCleanup {
21    pub fn new(db: Arc<PgPool>, flyio_client: Option<FlyioClient>) -> Self {
22        Self { db, flyio_client }
23    }
24
25    /// Start the cleanup worker (runs every hour)
26    pub fn start(self: Arc<Self>) -> tokio::task::JoinHandle<()> {
27        tokio::spawn(async move {
28            let mut interval = interval(Duration::from_secs(3600)); // Every 1 hour
29
30            loop {
31                interval.tick().await;
32
33                if let Err(e) = self.run_cleanup().await {
34                    error!("Error during deployment cleanup: {}", e);
35                }
36            }
37        })
38    }
39
40    /// Run all cleanup tasks
41    async fn run_cleanup(&self) -> Result<()> {
42        self.hard_delete_old_records().await?;
43        self.mark_stuck_deployments().await?;
44        self.retry_stuck_deletions().await?;
45        Ok(())
46    }
47
48    /// Hard-delete rows that were soft-deleted more than 30 days ago
49    async fn hard_delete_old_records(&self) -> Result<()> {
50        let pool = self.db.as_ref();
51
52        let result = sqlx::query(
53            r#"
54            DELETE FROM hosted_mocks
55            WHERE deleted_at IS NOT NULL
56            AND deleted_at < NOW() - INTERVAL '30 days'
57            "#,
58        )
59        .execute(pool)
60        .await
61        .context("Failed to hard-delete old records")?;
62
63        let count = result.rows_affected();
64        if count > 0 {
65            info!("Hard-deleted {} old soft-deleted deployment records", count);
66        }
67
68        Ok(())
69    }
70
71    /// Mark deployments stuck in 'deploying' status for >1 hour as failed
72    async fn mark_stuck_deployments(&self) -> Result<()> {
73        let pool = self.db.as_ref();
74
75        let result = sqlx::query(
76            r#"
77            UPDATE hosted_mocks
78            SET status = $1, error_message = 'Deployment timed out', updated_at = NOW()
79            WHERE status = 'deploying'
80            AND updated_at < NOW() - INTERVAL '1 hour'
81            AND deleted_at IS NULL
82            "#,
83        )
84        .bind(DeploymentStatus::Failed.to_string())
85        .execute(pool)
86        .await
87        .context("Failed to mark stuck deployments")?;
88
89        let count = result.rows_affected();
90        if count > 0 {
91            warn!("Marked {} stuck deployments as failed", count);
92        }
93
94        Ok(())
95    }
96
97    /// Retry deletions stuck in 'deleting' status for >1 hour
98    async fn retry_stuck_deletions(&self) -> Result<()> {
99        let pool = self.db.as_ref();
100
101        let stuck = sqlx::query_as::<_, crate::models::HostedMock>(
102            r#"
103            SELECT * FROM hosted_mocks
104            WHERE status = 'deleting'
105            AND updated_at < NOW() - INTERVAL '1 hour'
106            AND deleted_at IS NULL
107            "#,
108        )
109        .fetch_all(pool)
110        .await
111        .context("Failed to fetch stuck deletions")?;
112
113        if stuck.is_empty() {
114            return Ok(());
115        }
116
117        warn!("Found {} deployments stuck in deleting state", stuck.len());
118
119        for deployment in &stuck {
120            // Try to clean up Fly.io resources if possible
121            if let Some(ref flyio_client) = self.flyio_client {
122                let app_name = format!(
123                    "mockforge-{}-{}",
124                    deployment
125                        .org_id
126                        .to_string()
127                        .replace('-', "")
128                        .chars()
129                        .take(8)
130                        .collect::<String>(),
131                    deployment.slug
132                );
133
134                // Try to delete any remaining machines
135                if let Ok(machines) = flyio_client.list_machines(&app_name).await {
136                    for machine in machines {
137                        if let Err(e) = flyio_client.delete_machine(&app_name, &machine.id).await {
138                            warn!("Cleanup: failed to delete machine {}: {}", machine.id, e);
139                        }
140                    }
141                }
142
143                // Try to delete the app
144                if let Err(e) = flyio_client.delete_app(&app_name).await {
145                    warn!("Cleanup: failed to delete app {}: {}", app_name, e);
146                }
147            }
148
149            // Soft-delete the record regardless
150            sqlx::query(
151                r#"
152                UPDATE hosted_mocks
153                SET deleted_at = NOW(), updated_at = NOW()
154                WHERE id = $1
155                "#,
156            )
157            .bind(deployment.id)
158            .execute(pool)
159            .await
160            .ok();
161
162            info!("Cleanup: completed stuck deletion for deployment {}", deployment.id);
163        }
164
165        Ok(())
166    }
167}