llm-orchestrator-state 0.1.1

State persistence and recovery for LLM workflow orchestrator
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
// Copyright (c) 2025 LLM DevOps
// SPDX-License-Identifier: MIT OR Apache-2.0

//! PostgreSQL implementation of the StateStore trait.

use crate::models::{Checkpoint, StepState, WorkflowState, WorkflowStatus};
use crate::traits::{StateStore, StateStoreError, StateStoreResult};
use async_trait::async_trait;
use chrono::{DateTime, Utc};
use sqlx::postgres::{PgConnectOptions, PgPoolOptions};
use sqlx::{ConnectOptions, Executor, PgPool, Row};
use std::str::FromStr;
use std::time::Duration;
use tracing::{debug, info, warn};
use uuid::Uuid;

/// PostgreSQL state store implementation.
///
/// Wraps a `sqlx` connection pool; every trait method issues its queries
/// through this shared pool. Construct via [`PostgresStateStore::new`],
/// which also configures pooling and runs migrations.
pub struct PostgresStateStore {
    // Shared connection pool; exposed read-only via `pool()` for advanced use.
    pool: PgPool,
}

impl PostgresStateStore {
    /// Create a new PostgreSQL state store with connection pooling.
    ///
    /// Parses `database_url`, builds a pool with the given bounds
    /// (acquire timeout 5s, idle timeout 300s, max lifetime 1800s),
    /// and runs the embedded migrations before returning.
    ///
    /// # Arguments
    /// * `database_url` - PostgreSQL connection string
    /// * `min_connections` - Minimum number of connections in pool (default: 5)
    /// * `max_connections` - Maximum number of connections in pool (default: 20)
    ///
    /// # Errors
    /// Returns `StateStoreError::Configuration` for an invalid URL,
    /// `StateStoreError::Connection` if the pool cannot be created, and
    /// `StateStoreError::Database` if a migration fails.
    ///
    /// # Example
    /// ```no_run
    /// # use llm_orchestrator_state::postgres::PostgresStateStore;
    /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
    /// let store = PostgresStateStore::new(
    ///     "postgresql://user:pass@localhost/workflows",
    ///     Some(5),
    ///     Some(20),
    /// ).await?;
    /// # Ok(())
    /// # }
    /// ```
    pub async fn new(
        database_url: impl AsRef<str>,
        min_connections: Option<u32>,
        max_connections: Option<u32>,
    ) -> StateStoreResult<Self> {
        let min_conn = min_connections.unwrap_or(5);
        let max_conn = max_connections.unwrap_or(20);

        info!(
            "Initializing PostgreSQL state store (min_connections={}, max_connections={})",
            min_conn, max_conn
        );

        // Parse connection options
        let mut connect_opts = PgConnectOptions::from_str(database_url.as_ref())
            .map_err(|e| StateStoreError::Configuration(format!("Invalid database URL: {}", e)))?;

        // Log every statement at DEBUG so query traffic is visible in traces.
        connect_opts = connect_opts.log_statements(tracing::log::LevelFilter::Debug);

        // Build connection pool
        let pool = PgPoolOptions::new()
            .min_connections(min_conn)
            .max_connections(max_conn)
            .acquire_timeout(Duration::from_secs(5))
            .idle_timeout(Some(Duration::from_secs(300)))
            .max_lifetime(Some(Duration::from_secs(1800)))
            .connect_with(connect_opts)
            .await
            .map_err(|e| StateStoreError::Connection(format!("Failed to create connection pool: {}", e)))?;

        info!("PostgreSQL connection pool established");

        let store = Self { pool };

        // Run migrations before handing the store to callers.
        store.run_migrations().await?;

        Ok(store)
    }

    /// Run database migrations embedded at compile time.
    ///
    /// FIX: the previous implementation executed each migration file via
    /// `sqlx::query(...)`, which sends the text as a *prepared* statement.
    /// PostgreSQL rejects prepared strings containing more than one SQL
    /// statement, so any migration file with multiple statements (the
    /// normal case: CREATE TABLE + CREATE INDEX + ...) would fail at
    /// startup. `Executor::execute` on a plain `&str` uses the simple
    /// query protocol instead, which permits multi-statement scripts.
    /// Single-statement files continue to work unchanged.
    async fn run_migrations(&self) -> StateStoreResult<()> {
        info!("Running database migrations");

        // Migration SQL is compiled into the binary; no runtime file I/O.
        let migration_001 = include_str!("../migrations/001_initial_schema.sql");
        let migration_002 = include_str!("../migrations/002_checkpoints.sql");

        self.pool
            .execute(migration_001)
            .await
            .map_err(|e| StateStoreError::Database(format!("Migration 001 failed: {}", e)))?;

        self.pool
            .execute(migration_002)
            .await
            .map_err(|e| StateStoreError::Database(format!("Migration 002 failed: {}", e)))?;

        info!("Database migrations completed successfully");
        Ok(())
    }

    /// Get the connection pool (for advanced use cases).
    pub fn pool(&self) -> &PgPool {
        &self.pool
    }
}

#[async_trait]
impl StateStore for PostgresStateStore {
    /// Persist a workflow state and all of its step states.
    ///
    /// The workflow row and every step row are written inside one
    /// transaction, so either everything commits or nothing does.
    /// Both levels use `ON CONFLICT ... DO UPDATE` upserts, so repeated
    /// saves of the same `state.id` update in place.
    ///
    /// NOTE(review): only upserts are issued — steps removed from
    /// `state.steps` since a previous save are not deleted from
    /// `step_states`, so stale rows can remain. Confirm whether that is
    /// intended.
    async fn save_workflow_state(&self, state: &WorkflowState) -> StateStoreResult<()> {
        debug!("Saving workflow state: id={}, workflow_id={}", state.id, state.workflow_id);

        let mut tx = self.pool.begin().await?;

        // Serialize context to JSON string (stored as text in the `context` column).
        let context_json = serde_json::to_string(&state.context)?;

        // Upsert workflow state; immutable columns (workflow_id, name,
        // user_id, started_at) are only set on first insert.
        sqlx::query(
            r#"
            INSERT INTO workflow_states (
                id, workflow_id, workflow_name, status, user_id,
                started_at, updated_at, completed_at, context, error
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)
            ON CONFLICT (id) DO UPDATE SET
                status = EXCLUDED.status,
                updated_at = EXCLUDED.updated_at,
                completed_at = EXCLUDED.completed_at,
                context = EXCLUDED.context,
                error = EXCLUDED.error
            "#
        )
        .bind(state.id)
        .bind(&state.workflow_id)
        .bind(&state.workflow_name)
        .bind(state.status.to_string())
        .bind(&state.user_id)
        .bind(state.started_at)
        .bind(state.updated_at)
        .bind(state.completed_at)
        .bind(context_json)
        .bind(&state.error)
        .execute(&mut *tx)
        .await?;

        // Save step states, keyed by (workflow_state_id, step_id).
        for (step_id, step_state) in &state.steps {
            // Step outputs are serialized to JSON text per step.
            let outputs_json = serde_json::to_string(&step_state.outputs)?;

            sqlx::query(
                r#"
                INSERT INTO step_states (
                    workflow_state_id, step_id, status, started_at, completed_at,
                    outputs, error, retry_count
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                ON CONFLICT (workflow_state_id, step_id) DO UPDATE SET
                    status = EXCLUDED.status,
                    started_at = EXCLUDED.started_at,
                    completed_at = EXCLUDED.completed_at,
                    outputs = EXCLUDED.outputs,
                    error = EXCLUDED.error,
                    retry_count = EXCLUDED.retry_count
                "#
            )
            .bind(state.id)
            .bind(step_id)
            .bind(step_state.status.to_string())
            .bind(step_state.started_at)
            .bind(step_state.completed_at)
            .bind(outputs_json)
            .bind(&step_state.error)
            .bind(step_state.retry_count)
            .execute(&mut *tx)
            .await?;
        }

        // Commit atomically; on error the transaction rolls back on drop.
        tx.commit().await?;

        debug!("Workflow state saved successfully: id={}", state.id);
        Ok(())
    }

    /// Load a workflow state by its primary-key `id`, including all of
    /// its step states.
    ///
    /// Uses `fetch_one`, so a missing id surfaces as a sqlx
    /// row-not-found error propagated through `?` (converted by the
    /// `From` impl behind `StateStoreResult`).
    async fn load_workflow_state(&self, id: &Uuid) -> StateStoreResult<WorkflowState> {
        debug!("Loading workflow state: id={}", id);

        // Load workflow state (the parent row).
        let row = sqlx::query(
            r#"
            SELECT id, workflow_id, workflow_name, status, user_id,
                   started_at, updated_at, completed_at, context, error
            FROM workflow_states
            WHERE id = $1
            "#
        )
        .bind(id)
        .fetch_one(&self.pool)
        .await?;

        // Note: despite the name, this is the state row's primary key
        // (`id` column), reused below as the step-state foreign key.
        let workflow_id: Uuid = row.get("id");
        // Status is stored as text and parsed back into the enum.
        let status_str: String = row.get("status");
        let status = WorkflowStatus::from_str(&status_str)
            .map_err(StateStoreError::InvalidState)?;

        // Context was stored as serialized JSON text; deserialize it.
        let context_str: String = row.get("context");
        let context = serde_json::from_str(&context_str)?;

        let mut state = WorkflowState {
            id: workflow_id,
            workflow_id: row.get("workflow_id"),
            workflow_name: row.get("workflow_name"),
            status,
            user_id: row.get("user_id"),
            started_at: row.get("started_at"),
            updated_at: row.get("updated_at"),
            completed_at: row.get("completed_at"),
            context,
            error: row.get("error"),
            steps: Default::default(),
        };

        // Load step states (children of this workflow state).
        let step_rows = sqlx::query(
            r#"
            SELECT step_id, status, started_at, completed_at,
                   outputs, error, retry_count
            FROM step_states
            WHERE workflow_state_id = $1
            "#
        )
        .bind(workflow_id)
        .fetch_all(&self.pool)
        .await?;

        for step_row in step_rows {
            let step_id: String = step_row.get("step_id");
            let status_str: String = step_row.get("status");
            let status = crate::models::StepStatus::from_str(&status_str)
                .map_err(StateStoreError::InvalidState)?;

            // `outputs` may be NULL in the database; map NULL to JSON null.
            let outputs_str: Option<String> = step_row.get("outputs");
            let outputs = if let Some(json_str) = outputs_str {
                serde_json::from_str(&json_str)?
            } else {
                serde_json::Value::Null
            };

            let step_state = StepState {
                step_id: step_id.clone(),
                status,
                started_at: step_row.get("started_at"),
                completed_at: step_row.get("completed_at"),
                outputs,
                error: step_row.get("error"),
                retry_count: step_row.get("retry_count"),
            };

            state.steps.insert(step_id, step_state);
        }

        debug!("Workflow state loaded successfully: id={}", id);
        Ok(state)
    }

    /// Load the most recently updated state row for a logical
    /// `workflow_id`, then delegate to [`load_workflow_state`] for the
    /// full load (steps included).
    async fn load_workflow_state_by_workflow_id(&self, workflow_id: &str) -> StateStoreResult<WorkflowState> {
        debug!("Loading workflow state by workflow_id: {}", workflow_id);

        // Get the most recent state for this workflow_id (by updated_at).
        let row = sqlx::query(
            r#"
            SELECT id
            FROM workflow_states
            WHERE workflow_id = $1
            ORDER BY updated_at DESC
            LIMIT 1
            "#
        )
        .bind(workflow_id)
        .fetch_one(&self.pool)
        .await?;

        let id: Uuid = row.get("id");
        self.load_workflow_state(&id).await
    }

    /// List all workflows whose status is 'running', 'pending', or
    /// 'paused', newest-updated first.
    ///
    /// Individual load failures are logged with `warn!` and skipped
    /// rather than failing the whole listing.
    ///
    /// NOTE(review): this issues one full `load_workflow_state` per id
    /// (N+1 queries); acceptable for small active sets, worth batching
    /// if the active count grows.
    async fn list_active_workflows(&self) -> StateStoreResult<Vec<WorkflowState>> {
        debug!("Listing active workflows");

        let rows = sqlx::query(
            r#"
            SELECT id
            FROM workflow_states
            WHERE status IN ('running', 'pending', 'paused')
            ORDER BY updated_at DESC
            "#
        )
        .fetch_all(&self.pool)
        .await?;

        let mut workflows = Vec::new();
        for row in rows {
            let id: Uuid = row.get("id");
            match self.load_workflow_state(&id).await {
                Ok(state) => workflows.push(state),
                Err(e) => {
                    // Best-effort: skip unloadable states instead of failing the list.
                    warn!("Failed to load workflow state {}: {}", id, e);
                }
            }
        }

        debug!("Found {} active workflows", workflows.len());
        Ok(workflows)
    }

    /// Insert a checkpoint row, then prune so at most the 10 newest
    /// checkpoints remain for that workflow state.
    ///
    /// NOTE(review): the insert and the cleanup run as separate
    /// statements (no shared transaction); a failure between them leaves
    /// the extra checkpoint until the next successful create.
    async fn create_checkpoint(&self, checkpoint: &Checkpoint) -> StateStoreResult<()> {
        debug!("Creating checkpoint: id={}, workflow_state_id={}", checkpoint.id, checkpoint.workflow_state_id);

        // Snapshot is the full serialized state, stored as JSON text.
        let snapshot_json = serde_json::to_string(&checkpoint.snapshot)?;

        sqlx::query(
            r#"
            INSERT INTO checkpoints (id, workflow_state_id, step_id, timestamp, snapshot)
            VALUES ($1, $2, $3, $4, $5)
            "#
        )
        .bind(checkpoint.id)
        .bind(checkpoint.workflow_state_id)
        .bind(&checkpoint.step_id)
        .bind(checkpoint.timestamp)
        .bind(snapshot_json)
        .execute(&self.pool)
        .await?;

        // Cleanup old checkpoints (keep last 10)
        self.cleanup_old_checkpoints(&checkpoint.workflow_state_id, 10).await?;

        debug!("Checkpoint created successfully: id={}", checkpoint.id);
        Ok(())
    }

    /// Fetch the newest checkpoint (by `timestamp`) for a workflow
    /// state, or `None` if it has no checkpoints.
    async fn get_latest_checkpoint(&self, workflow_state_id: &Uuid) -> StateStoreResult<Option<Checkpoint>> {
        debug!("Getting latest checkpoint for workflow_state_id={}", workflow_state_id);

        let row_opt = sqlx::query(
            r#"
            SELECT id, workflow_state_id, step_id, timestamp, snapshot
            FROM checkpoints
            WHERE workflow_state_id = $1
            ORDER BY timestamp DESC
            LIMIT 1
            "#
        )
        .bind(workflow_state_id)
        .fetch_optional(&self.pool)
        .await?;

        if let Some(row) = row_opt {
            // Deserialize the stored snapshot back into its JSON value.
            let snapshot_str: String = row.get("snapshot");
            let snapshot = serde_json::from_str(&snapshot_str)?;

            let checkpoint = Checkpoint {
                id: row.get("id"),
                workflow_state_id: row.get("workflow_state_id"),
                step_id: row.get("step_id"),
                timestamp: row.get("timestamp"),
                snapshot,
            };

            debug!("Found latest checkpoint: id={}", checkpoint.id);
            Ok(Some(checkpoint))
        } else {
            debug!("No checkpoints found for workflow_state_id={}", workflow_state_id);
            Ok(None)
        }
    }

    /// Rebuild a full `WorkflowState` from a checkpoint's stored
    /// snapshot. Reads only the checkpoint row; does not touch
    /// `workflow_states` or `step_states`.
    async fn restore_from_checkpoint(&self, checkpoint_id: &Uuid) -> StateStoreResult<WorkflowState> {
        debug!("Restoring from checkpoint: id={}", checkpoint_id);

        let row = sqlx::query(
            r#"
            SELECT snapshot
            FROM checkpoints
            WHERE id = $1
            "#
        )
        .bind(checkpoint_id)
        .fetch_one(&self.pool)
        .await?;

        // The snapshot column holds a complete serialized WorkflowState.
        let snapshot_str: String = row.get("snapshot");
        let state: WorkflowState = serde_json::from_str(&snapshot_str)?;

        debug!("Successfully restored state from checkpoint: id={}", checkpoint_id);
        Ok(state)
    }

    /// Delete terminal ('completed' or 'failed') workflow states whose
    /// `updated_at` is before `older_than`; returns the number of rows
    /// deleted. Active states are never touched.
    async fn delete_old_states(&self, older_than: DateTime<Utc>) -> StateStoreResult<u64> {
        debug!("Deleting states older than: {}", older_than);

        let result = sqlx::query(
            r#"
            DELETE FROM workflow_states
            WHERE updated_at < $1
              AND status IN ('completed', 'failed')
            "#
        )
        .bind(older_than)
        .execute(&self.pool)
        .await?;

        let deleted = result.rows_affected();
        debug!("Deleted {} old workflow states", deleted);
        Ok(deleted)
    }

    /// Delete all but the `keep_count` newest checkpoints (by
    /// `timestamp`) for a workflow state; returns the number deleted.
    async fn cleanup_old_checkpoints(&self, workflow_state_id: &Uuid, keep_count: usize) -> StateStoreResult<u64> {
        debug!("Cleaning up old checkpoints for workflow_state_id={}, keeping last {}", workflow_state_id, keep_count);

        // PostgreSQL approach: delete checkpoints not in the top N.
        // The subquery selects the ids to keep; NOT IN is safe here
        // because `id` is the primary key and never NULL.
        let result = sqlx::query(
            r#"
            DELETE FROM checkpoints
            WHERE workflow_state_id = $1
              AND id NOT IN (
                SELECT id FROM checkpoints
                WHERE workflow_state_id = $1
                ORDER BY timestamp DESC
                LIMIT $2
              )
            "#
        )
        .bind(workflow_state_id)
        // usize has no Postgres binding; widen to i64 for the LIMIT.
        .bind(keep_count as i64)
        .execute(&self.pool)
        .await?;

        let deleted = result.rows_affected();
        if deleted > 0 {
            debug!("Cleaned up {} old checkpoints", deleted);
        }
        Ok(deleted)
    }

    /// Verify database connectivity with a trivial `SELECT 1`.
    async fn health_check(&self) -> StateStoreResult<()> {
        debug!("Performing health check");

        // Simple query to verify database connectivity
        sqlx::query("SELECT 1")
            .fetch_one(&self.pool)
            .await
            .map_err(|e| StateStoreError::Connection(format!("Health check failed: {}", e)))?;

        debug!("Health check passed");
        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::models::WorkflowState;
    use serde_json::json;

    // Integration tests require a running PostgreSQL instance
    // These are disabled by default - run with:
    // TEST_DATABASE_URL=postgresql://... cargo test -- --ignored

    #[tokio::test]
    #[ignore]
    async fn test_postgres_state_store_integration() {
        // Prefer an explicitly configured database; fall back to the
        // local default when TEST_DATABASE_URL is unset.
        let url = match std::env::var("TEST_DATABASE_URL") {
            Ok(configured) => configured,
            Err(_) => "postgresql://postgres:postgres@localhost/test_workflows".to_string(),
        };

        let store = PostgresStateStore::new(&url, Some(2), Some(5))
            .await
            .expect("Failed to create state store");

        // Verify connectivity before exercising the store.
        store.health_check().await.expect("Health check failed");

        // Build a workflow state and move it into the running status.
        let mut workflow = WorkflowState::new(
            "test-workflow-1",
            "Test Workflow",
            Some("user-123".to_string()),
            json!({"inputs": {"test": "value"}}),
        );
        workflow.mark_running();

        // Round-trip: persist, then read back and compare.
        store.save_workflow_state(&workflow).await.expect("Failed to save state");

        let reloaded = store
            .load_workflow_state(&workflow.id)
            .await
            .expect("Failed to load state");
        assert_eq!(reloaded.workflow_id, workflow.workflow_id);
        assert_eq!(reloaded.status, WorkflowStatus::Running);

        // The running workflow must appear in the active listing.
        let active = store.list_active_workflows().await.expect("Failed to list active workflows");
        assert!(!active.is_empty());

        println!("✅ PostgreSQL integration test passed");
    }
}