// mockforge_registry_server/deployment/cleanup.rs

use anyhow::{Context, Result};
use sqlx::PgPool;
use std::sync::Arc;
use tokio::time::{interval, Duration, MissedTickBehavior};
use tracing::{error, info, warn};

use crate::deployment::flyio::FlyioClient;
use crate::models::DeploymentStatus;
13
14pub struct DeploymentCleanup {
16 db: Arc<PgPool>,
17 flyio_client: Option<FlyioClient>,
18}
19
20impl DeploymentCleanup {
21 pub fn new(db: Arc<PgPool>, flyio_client: Option<FlyioClient>) -> Self {
22 Self { db, flyio_client }
23 }
24
25 pub fn start(self: Arc<Self>) -> tokio::task::JoinHandle<()> {
27 tokio::spawn(async move {
28 let mut interval = interval(Duration::from_secs(3600)); loop {
31 interval.tick().await;
32
33 if let Err(e) = self.run_cleanup().await {
34 error!("Error during deployment cleanup: {}", e);
35 }
36 }
37 })
38 }
39
40 async fn run_cleanup(&self) -> Result<()> {
42 self.hard_delete_old_records().await?;
43 self.mark_stuck_deployments().await?;
44 self.retry_stuck_deletions().await?;
45 Ok(())
46 }
47
48 async fn hard_delete_old_records(&self) -> Result<()> {
50 let pool = self.db.as_ref();
51
52 let result = sqlx::query(
53 r#"
54 DELETE FROM hosted_mocks
55 WHERE deleted_at IS NOT NULL
56 AND deleted_at < NOW() - INTERVAL '30 days'
57 "#,
58 )
59 .execute(pool)
60 .await
61 .context("Failed to hard-delete old records")?;
62
63 let count = result.rows_affected();
64 if count > 0 {
65 info!("Hard-deleted {} old soft-deleted deployment records", count);
66 }
67
68 Ok(())
69 }
70
71 async fn mark_stuck_deployments(&self) -> Result<()> {
73 let pool = self.db.as_ref();
74
75 let result = sqlx::query(
76 r#"
77 UPDATE hosted_mocks
78 SET status = $1, error_message = 'Deployment timed out', updated_at = NOW()
79 WHERE status = 'deploying'
80 AND updated_at < NOW() - INTERVAL '1 hour'
81 AND deleted_at IS NULL
82 "#,
83 )
84 .bind(DeploymentStatus::Failed.to_string())
85 .execute(pool)
86 .await
87 .context("Failed to mark stuck deployments")?;
88
89 let count = result.rows_affected();
90 if count > 0 {
91 warn!("Marked {} stuck deployments as failed", count);
92 }
93
94 Ok(())
95 }
96
97 async fn retry_stuck_deletions(&self) -> Result<()> {
99 let pool = self.db.as_ref();
100
101 let stuck = sqlx::query_as::<_, crate::models::HostedMock>(
102 r#"
103 SELECT * FROM hosted_mocks
104 WHERE status = 'deleting'
105 AND updated_at < NOW() - INTERVAL '1 hour'
106 AND deleted_at IS NULL
107 "#,
108 )
109 .fetch_all(pool)
110 .await
111 .context("Failed to fetch stuck deletions")?;
112
113 if stuck.is_empty() {
114 return Ok(());
115 }
116
117 warn!("Found {} deployments stuck in deleting state", stuck.len());
118
119 for deployment in &stuck {
120 if let Some(ref flyio_client) = self.flyio_client {
122 let app_name = format!(
123 "mockforge-{}-{}",
124 deployment
125 .org_id
126 .to_string()
127 .replace('-', "")
128 .chars()
129 .take(8)
130 .collect::<String>(),
131 deployment.slug
132 );
133
134 if let Ok(machines) = flyio_client.list_machines(&app_name).await {
136 for machine in machines {
137 if let Err(e) = flyio_client.delete_machine(&app_name, &machine.id).await {
138 warn!("Cleanup: failed to delete machine {}: {}", machine.id, e);
139 }
140 }
141 }
142
143 if let Err(e) = flyio_client.delete_app(&app_name).await {
145 warn!("Cleanup: failed to delete app {}: {}", app_name, e);
146 }
147 }
148
149 sqlx::query(
151 r#"
152 UPDATE hosted_mocks
153 SET deleted_at = NOW(), updated_at = NOW()
154 WHERE id = $1
155 "#,
156 )
157 .bind(deployment.id)
158 .execute(pool)
159 .await
160 .ok();
161
162 info!("Cleanup: completed stuck deletion for deployment {}", deployment.id);
163 }
164
165 Ok(())
166 }
167}