Skip to main content

forge_runtime/gateway/
server.rs

1use std::sync::Arc;
2use std::time::Duration;
3
4use axum::{
5    Extension, Json, Router,
6    error_handling::HandleErrorLayer,
7    extract::DefaultBodyLimit,
8    http::StatusCode,
9    middleware,
10    response::IntoResponse,
11    routing::{get, post},
12};
13use serde::Serialize;
14use tower::BoxError;
15use tower::ServiceBuilder;
16use tower::limit::ConcurrencyLimitLayer;
17use tower::timeout::TimeoutLayer;
18use tower_http::cors::{Any, CorsLayer};
19
20use forge_core::cluster::NodeId;
21use forge_core::config::McpConfig;
22use forge_core::function::{JobDispatch, WorkflowDispatch};
23use opentelemetry::global;
24use opentelemetry::propagation::Extractor;
25use tracing::Instrument;
26use tracing_opentelemetry::OpenTelemetrySpanExt;
27
28use super::auth::{AuthConfig, AuthMiddleware, HmacTokenIssuer, auth_middleware};
29use super::mcp::{McpState, mcp_get_handler, mcp_post_handler};
30use super::multipart::{MultipartConfig, rpc_multipart_handler};
31use super::response::{RpcError, RpcResponse};
32use super::rpc::{RpcHandler, rpc_batch_handler, rpc_function_handler, rpc_handler};
33use super::sse::{
34    SseState, sse_handler, sse_job_subscribe_handler, sse_subscribe_handler,
35    sse_unsubscribe_handler, sse_workflow_subscribe_handler,
36};
37use super::tracing::{REQUEST_ID_HEADER, SPAN_ID_HEADER, TRACE_ID_HEADER, TracingState};
38use crate::db::Database;
39use crate::function::FunctionRegistry;
40use crate::mcp::McpToolRegistry;
41use crate::realtime::{Reactor, ReactorConfig};
42
/// Body-size ceiling applied to the JSON RPC routes (1 MiB).
const DEFAULT_MAX_JSON_BODY_SIZE: usize = 1 << 20;
/// Default body-size ceiling for multipart uploads (20 MiB).
const DEFAULT_MAX_MULTIPART_BODY_SIZE: usize = 20 << 20;
/// Number of multipart upload requests allowed in flight at the same time.
const MAX_MULTIPART_CONCURRENCY: usize = 32;
/// Fallback for visitor ID hashing when no JWT secret is configured (dev only).
const DEFAULT_SIGNAL_SECRET: &str = "forge-default-signal-secret";
48
49/// Gateway server configuration.
50#[derive(Debug, Clone)]
51pub struct GatewayConfig {
52    /// Port to listen on.
53    pub port: u16,
54    /// Maximum number of connections.
55    pub max_connections: usize,
56    /// Maximum number of active SSE sessions.
57    pub sse_max_sessions: usize,
58    /// Request timeout in seconds.
59    pub request_timeout_secs: u64,
60    /// Enable CORS.
61    pub cors_enabled: bool,
62    /// Allowed CORS origins.
63    pub cors_origins: Vec<String>,
64    /// Authentication configuration.
65    pub auth: AuthConfig,
66    /// MCP configuration.
67    pub mcp: McpConfig,
68    /// Routes excluded from request logs, metrics, and traces.
69    pub quiet_routes: Vec<String>,
70    /// Token TTL configuration for refresh token management.
71    pub token_ttl: forge_core::AuthTokenTtl,
72    /// Project name (displayed on OAuth consent page).
73    pub project_name: String,
74    /// Maximum body size in bytes for uploads. Defaults to 20 MB.
75    pub max_body_size_bytes: usize,
76}
77
78impl Default for GatewayConfig {
79    fn default() -> Self {
80        Self {
81            port: 9081,
82            max_connections: 512,
83            sse_max_sessions: 10_000,
84            request_timeout_secs: 30,
85            cors_enabled: false,
86            cors_origins: Vec::new(),
87            auth: AuthConfig::default(),
88            mcp: McpConfig::default(),
89            quiet_routes: Vec::new(),
90            token_ttl: forge_core::AuthTokenTtl::default(),
91            project_name: "forge-app".to_string(),
92            max_body_size_bytes: DEFAULT_MAX_MULTIPART_BODY_SIZE,
93        }
94    }
95}
96
97/// Health check response.
98#[derive(Debug, Serialize)]
99pub struct HealthResponse {
100    pub status: String,
101    pub version: String,
102}
103
104/// Readiness check response.
105#[derive(Debug, Serialize)]
106pub struct ReadinessResponse {
107    pub ready: bool,
108    pub database: bool,
109    pub reactor: bool,
110    pub workflows: bool,
111    #[serde(skip_serializing_if = "Option::is_none")]
112    pub blocked_workflow_runs: Option<i64>,
113    pub version: String,
114}
115
116/// State for readiness check.
117#[derive(Clone)]
118pub struct ReadinessState {
119    db_pool: sqlx::PgPool,
120    reactor: Arc<Reactor>,
121}
122
123/// Gateway HTTP server.
124pub struct GatewayServer {
125    config: GatewayConfig,
126    registry: FunctionRegistry,
127    db: Database,
128    reactor: Arc<Reactor>,
129    job_dispatcher: Option<Arc<dyn JobDispatch>>,
130    workflow_dispatcher: Option<Arc<dyn WorkflowDispatch>>,
131    mcp_registry: Option<McpToolRegistry>,
132    token_ttl: forge_core::AuthTokenTtl,
133    signals_collector: Option<crate::signals::SignalsCollector>,
134}
135
136impl GatewayServer {
137    /// Create a new gateway server.
138    pub fn new(config: GatewayConfig, registry: FunctionRegistry, db: Database) -> Self {
139        let node_id = NodeId::new();
140        let reactor = Arc::new(Reactor::new(
141            node_id,
142            db.primary().clone(),
143            registry.clone(),
144            ReactorConfig::default(),
145        ));
146
147        let token_ttl = config.token_ttl.clone();
148        Self {
149            config,
150            registry,
151            db,
152            reactor,
153            job_dispatcher: None,
154            workflow_dispatcher: None,
155            mcp_registry: None,
156            token_ttl,
157            signals_collector: None,
158        }
159    }
160
161    /// Set the job dispatcher.
162    pub fn with_job_dispatcher(mut self, dispatcher: Arc<dyn JobDispatch>) -> Self {
163        self.job_dispatcher = Some(dispatcher);
164        self
165    }
166
167    /// Set the workflow dispatcher.
168    pub fn with_workflow_dispatcher(mut self, dispatcher: Arc<dyn WorkflowDispatch>) -> Self {
169        self.workflow_dispatcher = Some(dispatcher);
170        self
171    }
172
173    /// Set the MCP tool registry.
174    pub fn with_mcp_registry(mut self, registry: McpToolRegistry) -> Self {
175        self.mcp_registry = Some(registry);
176        self
177    }
178
179    /// Set the signals collector for auto-capturing RPC events and
180    /// registering client signal ingestion endpoints.
181    pub fn with_signals_collector(mut self, collector: crate::signals::SignalsCollector) -> Self {
182        self.signals_collector = Some(collector);
183        self
184    }
185
186    /// Get a reference to the reactor.
187    pub fn reactor(&self) -> Arc<Reactor> {
188        self.reactor.clone()
189    }
190
191    /// Build an OAuth router (bypasses auth middleware). Returns None if OAuth is disabled.
192    pub fn oauth_router(&self) -> Option<(Router, Arc<super::oauth::OAuthState>)> {
193        if !self.config.mcp.oauth {
194            return None;
195        }
196
197        let token_issuer = HmacTokenIssuer::from_config(&self.config.auth)
198            .map(|issuer| Arc::new(issuer) as Arc<dyn forge_core::TokenIssuer>)?;
199
200        let auth_middleware_state = Arc::new(AuthMiddleware::new(self.config.auth.clone()));
201
202        let jwt_secret = self.config.auth.jwt_secret.clone().unwrap_or_default();
203
204        let oauth_state = Arc::new(super::oauth::OAuthState::new(
205            self.db.primary().clone(),
206            auth_middleware_state,
207            token_issuer,
208            self.token_ttl.access_token_secs,
209            self.token_ttl.refresh_token_days,
210            self.config.auth.is_hmac(),
211            self.config.project_name.clone(),
212            jwt_secret,
213        ));
214
215        let router = Router::new()
216            .route(
217                "/oauth/authorize",
218                get(super::oauth::oauth_authorize_get).post(super::oauth::oauth_authorize_post),
219            )
220            .route("/oauth/token", post(super::oauth::oauth_token))
221            .route("/oauth/register", post(super::oauth::oauth_register))
222            .with_state(oauth_state.clone());
223
224        Some((router, oauth_state))
225    }
226
227    /// Build the Axum router.
228    pub fn router(&self) -> Router {
229        let token_issuer = HmacTokenIssuer::from_config(&self.config.auth)
230            .map(|issuer| Arc::new(issuer) as Arc<dyn forge_core::TokenIssuer>);
231
232        let mut rpc = RpcHandler::with_dispatch_and_issuer(
233            self.registry.clone(),
234            self.db.clone(),
235            self.job_dispatcher.clone(),
236            self.workflow_dispatcher.clone(),
237            token_issuer,
238        );
239        rpc.set_token_ttl(self.token_ttl.clone());
240        if let Some(collector) = &self.signals_collector {
241            let secret = self
242                .config
243                .auth
244                .jwt_secret
245                .clone()
246                .unwrap_or_else(|| DEFAULT_SIGNAL_SECRET.to_string());
247            rpc.set_signals_collector(collector.clone(), secret);
248        }
249        let rpc_handler_state = Arc::new(rpc);
250
251        let auth_middleware_state = Arc::new(AuthMiddleware::new(self.config.auth.clone()));
252
253        // Build CORS layer. When specific origins are configured, allow
254        // credentials so the browser accepts cross-origin API responses
255        // (the forge-svelte client sends `credentials: "include"` for
256        // the SSE session cookie). Wildcard methods/headers are incompatible
257        // with credentials per the CORS spec, so we enumerate them.
258        let cors = if self.config.cors_enabled {
259            if self.config.cors_origins.iter().any(|o| o == "*") {
260                // Wildcard origin can't use credentials
261                CorsLayer::new()
262                    .allow_origin(Any)
263                    .allow_methods(Any)
264                    .allow_headers(Any)
265            } else {
266                use axum::http::Method;
267                let origins: Vec<_> = self
268                    .config
269                    .cors_origins
270                    .iter()
271                    .filter_map(|o| o.parse().ok())
272                    .collect();
273                CorsLayer::new()
274                    .allow_origin(origins)
275                    .allow_methods([
276                        Method::GET,
277                        Method::POST,
278                        Method::PUT,
279                        Method::DELETE,
280                        Method::PATCH,
281                        Method::OPTIONS,
282                    ])
283                    .allow_headers([
284                        axum::http::header::CONTENT_TYPE,
285                        axum::http::header::AUTHORIZATION,
286                        axum::http::header::ACCEPT,
287                        axum::http::HeaderName::from_static("x-webhook-signature"),
288                        axum::http::HeaderName::from_static("x-idempotency-key"),
289                        axum::http::HeaderName::from_static("x-correlation-id"),
290                        axum::http::HeaderName::from_static("x-session-id"),
291                        axum::http::HeaderName::from_static("x-forge-platform"),
292                    ])
293                    .allow_credentials(true)
294            }
295        } else {
296            CorsLayer::new()
297        };
298
299        // SSE state for Server-Sent Events
300        let sse_state = Arc::new(SseState::with_config(
301            self.reactor.clone(),
302            auth_middleware_state.clone(),
303            super::sse::SseConfig {
304                max_sessions: self.config.sse_max_sessions,
305                ..Default::default()
306            },
307        ));
308
309        // Readiness state for DB + reactor health check
310        let readiness_state = Arc::new(ReadinessState {
311            db_pool: self.db.primary().clone(),
312            reactor: self.reactor.clone(),
313        });
314
315        // Build the main router with middleware
316        let mut main_router = Router::new()
317            // Health check endpoint (liveness)
318            .route("/health", get(health_handler))
319            // Readiness check endpoint (checks DB)
320            .route("/ready", get(readiness_handler).with_state(readiness_state))
321            // RPC endpoint
322            .route("/rpc", post(rpc_handler))
323            // Batch RPC endpoint
324            .route("/rpc/batch", post(rpc_batch_handler))
325            // REST-style function endpoint (JSON)
326            .route("/rpc/{function}", post(rpc_function_handler))
327            // Prevent oversized JSON payloads from exhausting memory.
328            .layer(DefaultBodyLimit::max(DEFAULT_MAX_JSON_BODY_SIZE))
329            // Add state
330            .with_state(rpc_handler_state.clone());
331
332        // Multipart RPC router. The Axum layer limit is set to the highest
333        // configured size (global or any per-mutation override) so that
334        // per-mutation max_size values aren't rejected at the HTTP layer.
335        // The handler still enforces per-function limits chunk-by-chunk.
336        let max_per_mutation = self
337            .registry
338            .functions()
339            .filter_map(|(_, entry)| entry.info().max_upload_size_bytes)
340            .max()
341            .unwrap_or(0);
342        let layer_limit = self.config.max_body_size_bytes.max(max_per_mutation);
343        let mp_config = MultipartConfig {
344            max_body_size_bytes: self.config.max_body_size_bytes,
345        };
346        let multipart_router = Router::new()
347            .route("/rpc/{function}/upload", post(rpc_multipart_handler))
348            .layer(DefaultBodyLimit::max(layer_limit))
349            .layer(Extension(mp_config))
350            // Cap upload fan-out; each request buffers data in memory.
351            .layer(ConcurrencyLimitLayer::new(MAX_MULTIPART_CONCURRENCY))
352            .with_state(rpc_handler_state);
353
354        // SSE router
355        let sse_router = Router::new()
356            .route("/events", get(sse_handler))
357            .route("/subscribe", post(sse_subscribe_handler))
358            .route("/unsubscribe", post(sse_unsubscribe_handler))
359            .route("/subscribe-job", post(sse_job_subscribe_handler))
360            .route("/subscribe-workflow", post(sse_workflow_subscribe_handler))
361            .with_state(sse_state);
362
363        let mut mcp_router = Router::new();
364        if self.config.mcp.enabled {
365            let path = self.config.mcp.path.clone();
366            let mcp_state = Arc::new(McpState::new(
367                self.config.mcp.clone(),
368                self.mcp_registry.clone().unwrap_or_default(),
369                self.db.primary().clone(),
370                self.job_dispatcher.clone(),
371                self.workflow_dispatcher.clone(),
372            ));
373            mcp_router = mcp_router.route(
374                &path,
375                post(mcp_post_handler)
376                    .get(mcp_get_handler)
377                    .with_state(mcp_state),
378            );
379        }
380
381        // Signal ingestion endpoints (product analytics + diagnostics)
382        let mut signals_router = Router::new();
383        if let Some(collector) = &self.signals_collector {
384            let signals_state = Arc::new(crate::signals::endpoints::SignalsState {
385                collector: collector.clone(),
386                pool: self.db.analytics_pool().clone(),
387                server_secret: self
388                    .config
389                    .auth
390                    .jwt_secret
391                    .clone()
392                    .unwrap_or_else(|| DEFAULT_SIGNAL_SECRET.to_string()),
393            });
394            signals_router = Router::new()
395                .route(
396                    "/signal/event",
397                    post(crate::signals::endpoints::event_handler),
398                )
399                .route(
400                    "/signal/view",
401                    post(crate::signals::endpoints::view_handler),
402                )
403                .route(
404                    "/signal/user",
405                    post(crate::signals::endpoints::user_handler),
406                )
407                .route(
408                    "/signal/report",
409                    post(crate::signals::endpoints::report_handler),
410                )
411                .with_state(signals_state);
412        }
413
414        main_router = main_router
415            .merge(multipart_router)
416            .merge(sse_router)
417            .merge(mcp_router)
418            .merge(signals_router);
419
420        // Build middleware stack
421        let service_builder = ServiceBuilder::new()
422            .layer(HandleErrorLayer::new(handle_middleware_error))
423            .layer(ConcurrencyLimitLayer::new(self.config.max_connections))
424            .layer(TimeoutLayer::new(Duration::from_secs(
425                self.config.request_timeout_secs,
426            )))
427            .layer(cors.clone())
428            .layer(middleware::from_fn_with_state(
429                auth_middleware_state,
430                auth_middleware,
431            ))
432            .layer(middleware::from_fn_with_state(
433                Arc::new(self.config.quiet_routes.clone()),
434                tracing_middleware,
435            ));
436
437        // Apply the remaining middleware layers
438        main_router.layer(service_builder)
439    }
440
441    /// Get the socket address to bind to.
442    pub fn addr(&self) -> std::net::SocketAddr {
443        std::net::SocketAddr::from(([0, 0, 0, 0], self.config.port))
444    }
445
446    /// Run the server (blocking).
447    pub async fn run(self) -> Result<(), std::io::Error> {
448        let addr = self.addr();
449        let router = self.router();
450
451        // Start the reactor for real-time updates
452        self.reactor
453            .start()
454            .await
455            .map_err(|e| std::io::Error::other(format!("Failed to start reactor: {}", e)))?;
456        tracing::info!("Reactor started for real-time updates");
457
458        tracing::info!("Gateway server listening on {}", addr);
459
460        let listener = tokio::net::TcpListener::bind(addr).await?;
461        axum::serve(listener, router.into_make_service()).await
462    }
463}
464
465/// Health check handler (liveness probe).
466async fn health_handler() -> Json<HealthResponse> {
467    Json(HealthResponse {
468        status: "healthy".to_string(),
469        version: env!("CARGO_PKG_VERSION").to_string(),
470    })
471}
472
473/// Readiness check handler (readiness probe).
474async fn readiness_handler(
475    axum::extract::State(state): axum::extract::State<Arc<ReadinessState>>,
476) -> (axum::http::StatusCode, Json<ReadinessResponse>) {
477    // Check database connectivity
478    let db_ok = sqlx::query_scalar!("SELECT 1 as \"v!\"")
479        .fetch_one(&state.db_pool)
480        .await
481        .is_ok();
482
483    // Check reactor health (change listener must be running for real-time updates)
484    let reactor_stats = state.reactor.stats().await;
485    let reactor_ok = reactor_stats.listener_running;
486
487    // Check for blocked workflow runs (strict mode: unhealthy if any runs are blocked)
488    let (workflows_ok, blocked_count) = if db_ok {
489        match sqlx::query_scalar!(
490            r#"SELECT COUNT(*) as "count!" FROM forge_workflow_runs WHERE status LIKE 'blocked_%'"#,
491        )
492        .fetch_one(&state.db_pool)
493        .await
494        {
495            Ok(count) => (count == 0, if count > 0 { Some(count) } else { None }),
496            Err(_) => (true, None), // if query fails, don't block on this check
497        }
498    } else {
499        (true, None)
500    };
501
502    let ready = db_ok && reactor_ok && workflows_ok;
503    let status_code = if ready {
504        axum::http::StatusCode::OK
505    } else {
506        axum::http::StatusCode::SERVICE_UNAVAILABLE
507    };
508
509    (
510        status_code,
511        Json(ReadinessResponse {
512            ready,
513            database: db_ok,
514            reactor: reactor_ok,
515            workflows: workflows_ok,
516            blocked_workflow_runs: blocked_count,
517            version: env!("CARGO_PKG_VERSION").to_string(),
518        }),
519    )
520}
521
522async fn handle_middleware_error(err: BoxError) -> axum::response::Response {
523    let (status, code, message) = if err.is::<tower::timeout::error::Elapsed>() {
524        (StatusCode::REQUEST_TIMEOUT, "TIMEOUT", "Request timed out")
525    } else {
526        (
527            StatusCode::SERVICE_UNAVAILABLE,
528            "SERVICE_UNAVAILABLE",
529            "Server overloaded",
530        )
531    };
532    (
533        status,
534        Json(RpcResponse::error(RpcError::new(code, message))),
535    )
536        .into_response()
537}
538
539fn set_tracing_headers(response: &mut axum::response::Response, trace_id: &str, request_id: &str) {
540    if let Ok(val) = trace_id.parse() {
541        response.headers_mut().insert(TRACE_ID_HEADER, val);
542    }
543    if let Ok(val) = request_id.parse() {
544        response.headers_mut().insert(REQUEST_ID_HEADER, val);
545    }
546}
547
548/// Extracts W3C traceparent context from HTTP headers.
549struct HeaderExtractor<'a>(&'a axum::http::HeaderMap);
550
551impl<'a> Extractor for HeaderExtractor<'a> {
552    fn get(&self, key: &str) -> Option<&str> {
553        self.0.get(key).and_then(|v| v.to_str().ok())
554    }
555
556    fn keys(&self) -> Vec<&str> {
557        self.0.keys().map(|k| k.as_str()).collect()
558    }
559}
560
561/// Wraps each request in a span with HTTP semantics and OpenTelemetry
562/// context propagation. Incoming `traceparent` headers are extracted so
563/// that spans join the caller's distributed trace.
564/// Quiet routes skip spans, logs, and metrics to avoid noise from
565/// probes or high-frequency internal endpoints.
566async fn tracing_middleware(
567    axum::extract::State(quiet_routes): axum::extract::State<Arc<Vec<String>>>,
568    req: axum::extract::Request,
569    next: axum::middleware::Next,
570) -> axum::response::Response {
571    let headers = req.headers();
572
573    // Extract W3C traceparent from incoming headers for distributed tracing
574    let parent_cx =
575        global::get_text_map_propagator(|propagator| propagator.extract(&HeaderExtractor(headers)));
576
577    let trace_id = headers
578        .get(TRACE_ID_HEADER)
579        .and_then(|v| v.to_str().ok())
580        .map(String::from)
581        .unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
582
583    let parent_span_id = headers
584        .get(SPAN_ID_HEADER)
585        .and_then(|v| v.to_str().ok())
586        .map(String::from);
587
588    let method = req.method().to_string();
589    let path = req.uri().path().to_string();
590
591    let mut tracing_state = TracingState::with_trace_id(trace_id.clone());
592    if let Some(span_id) = parent_span_id {
593        tracing_state = tracing_state.with_parent_span(span_id);
594    }
595
596    let mut req = req;
597    req.extensions_mut().insert(tracing_state.clone());
598
599    if req
600        .extensions()
601        .get::<forge_core::function::AuthContext>()
602        .is_none()
603    {
604        req.extensions_mut()
605            .insert(forge_core::function::AuthContext::unauthenticated());
606    }
607
608    // Config uses full paths (/_api/health) but axum strips the prefix
609    // for nested routers, so the middleware sees /health not /_api/health.
610    let full_path = format!("/_api{}", path);
611    let is_quiet = quiet_routes.iter().any(|r| *r == full_path || *r == path);
612
613    if is_quiet {
614        let mut response = next.run(req).await;
615        set_tracing_headers(&mut response, &trace_id, &tracing_state.request_id);
616        return response;
617    }
618
619    let span = tracing::info_span!(
620        "http.request",
621        http.method = %method,
622        http.route = %path,
623        http.status_code = tracing::field::Empty,
624        trace_id = %trace_id,
625        request_id = %tracing_state.request_id,
626    );
627
628    // Link this span to the incoming distributed trace context so
629    // fn.execute and all downstream spans share the caller's trace ID
630    span.set_parent(parent_cx);
631
632    let mut response = next.run(req).instrument(span.clone()).await;
633
634    let status = response.status().as_u16();
635    let elapsed = tracing_state.elapsed();
636
637    span.record("http.status_code", status);
638    let duration_ms = elapsed.as_millis() as u64;
639    match status {
640        500..=599 => tracing::error!(parent: &span, duration_ms, "Request failed"),
641        400..=499 => tracing::warn!(parent: &span, duration_ms, "Request rejected"),
642        200..=299 => tracing::info!(parent: &span, duration_ms, "Request completed"),
643        _ => tracing::trace!(parent: &span, duration_ms, "Request completed"),
644    }
645    crate::observability::record_http_request(&method, &path, status, elapsed.as_secs_f64());
646
647    set_tracing_headers(&mut response, &trace_id, &tracing_state.request_id);
648    response
649}
650
#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::indexing_slicing, clippy::panic)]
mod tests {
    use super::*;

    /// The default configuration keeps its documented port, concurrency
    /// limit and CORS setting.
    #[test]
    fn test_gateway_config_default() {
        let GatewayConfig {
            port,
            max_connections,
            cors_enabled,
            ..
        } = GatewayConfig::default();

        assert_eq!(port, 9081);
        assert_eq!(max_connections, 512);
        assert!(!cors_enabled);
    }

    /// A health response serializes to JSON that carries its status text.
    #[test]
    fn test_health_response_serialization() {
        let payload = HealthResponse {
            status: String::from("healthy"),
            version: String::from("0.1.0"),
        };

        let encoded = serde_json::to_string(&payload).unwrap();

        assert!(encoded.contains("healthy"));
    }
}
673}