roboticus-api 0.11.3

//! SSE streaming endpoint for agent message inference.
//!
//! Thin connector: parse → `run_pipeline()` → match `StreamReady` → SSE loop → `finalize()`.

use std::convert::Infallible;

use axum::extract::State;
use axum::response::sse::{Event, KeepAlive, Sse};
use futures_util::StreamExt;
use serde_json::json;

use super::pipeline::{
    PipelineConfig, PipelineError, PipelineOutcome, PipelineRequest, resolve_web_scope_hint,
    run_pipeline,
};
use super::{AgentMessageRequest, AppState};

/// Multiplier applied to the per-provider timeout to obtain the stream deadline.
///
/// Streams may produce tokens for much longer than the initial connection timeout,
/// so the streaming budget is this multiple of
/// `config.models.routing.per_provider_timeout_seconds` rather than the raw value.
/// In this file the resulting deadline is only used to emit a `stream_warning`
/// event once more than half of it has elapsed.
const STREAM_TIMEOUT_MULTIPLIER: u32 = 3;

/// Streaming version of `agent_message`.
///
/// Runs the request through `run_pipeline()` with `PipelineConfig::streaming()` and
/// returns an SSE stream of JSON events. Each SSE `data:` payload carries a `type`
/// field; the types emitted here are `stream_start`, `chunk`, `stream_warning`,
/// `stream_retry`, `stream_replace`, `stream_blocked`, `error` and `stream_end`.
/// Progress is mirrored onto the event bus as `agent_working`, `stream_chunk` and
/// `agent_idle` messages.
///
/// On the provider-streaming path the first chunks are buffered and checked by the
/// guard chain before anything reaches the client; once the stream finishes, the
/// full accumulated response is checked again and post-stream bookkeeping is
/// delegated to `finalize_stream()`. If the stream is aborted (guard failure while
/// buffering, or an upstream provider error at any point), finalization is skipped
/// and a `stream_end` event with `"aborted": true` is emitted instead.
///
/// # Errors
///
/// Returns a `PipelineError` if scope resolution or the pipeline itself fails, or
/// `PipelineError::Internal` if the pipeline yields a non-streaming outcome.
#[tracing::instrument(skip_all, fields(channel = "api-stream"))]
pub async fn agent_message_stream(
    State(state): State<AppState>,
    axum::Json(body): axum::Json<AgentMessageRequest>,
) -> Result<Sse<impl futures_util::Stream<Item = Result<Event, Infallible>>>, PipelineError> {
    tracing::info!(channel = "api-stream", session_id = ?body.session_id, "Processing streaming agent message");

    let scope_hint = resolve_web_scope_hint(&state, &body).await?;

    let request = PipelineRequest {
        state: &state,
        config: PipelineConfig::streaming(),
        raw_content: &body.content,
        session_id_hint: body.session_id.as_deref(),
        scope_hint,
        is_correction_turn: false,
        channel_context: None,
        content_parts: None,
    };

    let outcome = run_pipeline(request).await?;

    // PipelineConfig::streaming() uses InferenceMode::Streaming, so we always
    // get StreamReady. SpecialistProposal and Complete are unreachable but
    // handled defensively.
    let ctx = match outcome {
        PipelineOutcome::StreamReady(ctx) => *ctx,
        PipelineOutcome::SpecialistProposal { .. } | PipelineOutcome::Complete { .. } => {
            return Err(PipelineError::Internal(
                "unexpected non-streaming outcome on streaming endpoint".into(),
            ));
        }
    };

    // ── SSE stream generation ───────────────────────────────────────
    // Read the timeout inside a block so the config read guard is released
    // before the stream is built.
    let per_provider_timeout = {
        let config = state.config.read().await;
        std::time::Duration::from_secs(config.models.routing.per_provider_timeout_seconds)
    };

    // Destructure ctx — move all fields into the stream closure.
    // Provider resolution already happened in the pipeline; the connector
    // only consumes the pre-resolved stream.
    let super::pipeline::StreamContext {
        session_id,
        turn_id,
        user_content,
        prepared: ctx_prepared,
        shortcut_result,
        dedup_guard,
        agent_id,
        resolved_stream,
        guard_set: stream_guard_set,
    } = ctx;

    // Extract provider metadata from the resolved stream context.
    let (chunk_stream_opt, selected_model, provider_prefix, cost_in, cost_out) =
        if let Some(resolved) = resolved_stream {
            (
                Some(resolved.stream),
                resolved.selected_model,
                resolved.provider_prefix,
                resolved.cost_in,
                resolved.cost_out,
            )
        } else {
            // Shortcut path — no stream needed; synthetic SSE emitted below.
            (None, ctx_prepared.model.clone(), String::new(), 0.0, 0.0)
        };

    // Owned copies moved into the stream body below.
    let model_clone = selected_model.clone();
    let cache_hash = ctx_prepared.cache_hash.clone();
    let prepared_model = ctx_prepared.model.clone();
    let system_prompt_hash = ctx_prepared.system_prompt_hash.clone();
    let event_bus = state.event_bus.clone();
    let state_clone = state.clone();

    let sse_stream = async_stream::stream! {
        // Drop the dedup guard before emitting anything.
        drop(dedup_guard);

        let open = json!({
            "type": "stream_start",
            "session_id": session_id,
            "turn_id": turn_id,
            "model": model_clone,
        });
        yield Ok(Event::default().data(open.to_string()));
        event_bus.publish(
            json!({
                "type": "agent_working",
                "agent_id": agent_id,
                "workstation": "llm",
                "activity": "inference",
                "session_id": session_id,
                "model": model_clone,
            })
            .to_string(),
        );

        if let Some(sc) = shortcut_result {
            // ── Shortcut fast-path: emit pre-computed result as synthetic chunks ──
            let chunk_data = json!({
                "type": "chunk",
                "delta": sc.content,
                "model": sc.model,
                "finish_reason": "stop",
            });
            yield Ok(Event::default().data(chunk_data.to_string()));
            event_bus.publish(
                json!({
                    "type": "agent_idle",
                    "agent_id": agent_id,
                    "workstation": "llm",
                    "session_id": session_id,
                })
                .to_string(),
            );
            let done_data = json!({
                "type": "stream_end",
                "session_id": session_id,
                "model": sc.model,
                "tokens_in": sc.tokens_in,
                "tokens_out": sc.tokens_out,
                "cost": sc.cost,
            });
            yield Ok(Event::default().data(done_data.to_string()));
        } else if let Some(chunk_stream) = chunk_stream_opt {
            // ── Normal provider-streaming path with buffer-first guard check ──
            //
            // Phase 1 (BUFFER): Accumulate chunks until we have enough content to
            // run high-value guards (~100 chars or a sentence boundary). Nothing
            // is sent to the client during this phase.
            //
            // Phase 2 (FLUSH+STREAM): If guards pass, flush the buffered chunks
            // to the client, then stream remaining chunks normally.
            //
            // If guards fail during the buffer phase, discard everything and
            // yield a retry event instructing the client to re-request.

            const BUFFER_MIN_CHARS: usize = 100;
            let mut accumulator = roboticus_llm::format::StreamAccumulator::default();
            let stream_start = std::time::Instant::now();
            // NOTE(review): `stream_timeout` only drives the `stream_warning`
            // event below; no hard cutoff is enforced in this function.
            let stream_timeout = per_provider_timeout * STREAM_TIMEOUT_MULTIPLIER;
            let mut warned_timeout = false;
            let mut stream = std::pin::pin!(chunk_stream);

            // Phase 1: Buffer initial chunks
            let mut buffered_chunks: Vec<roboticus_llm::format::StreamChunk> = Vec::new();
            let mut buffer_text = String::new();
            let mut buffer_passed = false;
            // Set on any abort (guard failure while buffering, or an upstream
            // error in either phase); selects the aborted epilogue below.
            let mut stream_errored = false;

            while !buffer_passed {
                match stream.next().await {
                    Some(Ok(chunk)) => {
                        buffer_text.push_str(&chunk.delta);
                        accumulator.push(&chunk);
                        buffered_chunks.push(chunk);

                        // Check if we have enough to evaluate. `len()` counts bytes,
                        // so the size threshold is approximate for non-ASCII text.
                        let has_enough = buffer_text.len() >= BUFFER_MIN_CHARS
                            || buffer_text.contains('.')
                            || buffer_text.contains('!')
                            || buffer_text.contains('?')
                            || buffer_text.contains('\n');

                        if has_enough {
                            // Run high-value guards on buffer content
                            let guard_scores =
                                super::guard_registry::precompute_guard_scores(
                                    &state_clone.semantic_classifier,
                                    &buffer_text,
                                )
                                .await;
                            let sub_names: Vec<String> =
                                roboticus_db::agents::list_sub_agents(&state_clone.db)
                                    .unwrap_or_default()
                                    .iter()
                                    .map(|a| a.name.to_ascii_lowercase())
                                    .collect();
                            let empty_provenance =
                                super::decomposition::DelegationProvenance::default();
                            let guard_ctx =
                                super::guard_registry::GuardContext::for_streaming(
                                    &user_content,
                                    &ctx_prepared.intents,
                                    &agent_id,
                                    &selected_model,
                                    &empty_provenance,
                                    guard_scores,
                                    sub_names,
                                );
                            let chain = stream_guard_set.resolve();
                            let result = chain.apply(buffer_text.clone(), &guard_ctx);

                            if result.retry.is_some() {
                                // Guard failed — discard buffered content and signal retry
                                let reason = result
                                    .retry
                                    .as_ref()
                                    .map(|r| r.reason.as_str())
                                    .unwrap_or("guard check failed");
                                tracing::warn!(
                                    guard = ?result.retry.as_ref().map(|r| r.guard_id),
                                    reason,
                                    "streaming buffer guard check failed — discarding"
                                );
                                let retry_event = json!({
                                    "type": "stream_retry",
                                    "reason": reason,
                                    "session_id": session_id,
                                });
                                yield Ok(Event::default().data(retry_event.to_string()));
                                stream_errored = true;
                                break;
                            }
                            buffer_passed = true;
                        }
                    }
                    Some(Err(e)) => {
                        tracing::error!(error = %e, "streaming chunk error during buffer phase");
                        {
                            // Scoped so the write lock is released before yielding.
                            let mut llm = state_clone.llm.write().await;
                            llm.breakers.record_failure(&provider_prefix);
                            llm.breakers.set_capacity_pressure(&provider_prefix, false);
                        }
                        let err_data = json!({"type": "error", "error": "upstream provider error"});
                        yield Ok(Event::default().data(err_data.to_string()));
                        stream_errored = true;
                        break;
                    }
                    None => {
                        // Stream ended during buffer phase — short response, run guards anyway
                        let guard_scores =
                            super::guard_registry::precompute_guard_scores(
                                &state_clone.semantic_classifier,
                                &buffer_text,
                            )
                            .await;
                        let sub_names: Vec<String> =
                            roboticus_db::agents::list_sub_agents(&state_clone.db)
                                .unwrap_or_default()
                                .iter()
                                .map(|a| a.name.to_ascii_lowercase())
                                .collect();
                        let empty_provenance =
                            super::decomposition::DelegationProvenance::default();
                        let guard_ctx =
                            super::guard_registry::GuardContext::for_streaming(
                                &user_content,
                                &ctx_prepared.intents,
                                &agent_id,
                                &selected_model,
                                &empty_provenance,
                                guard_scores,
                                sub_names,
                            );
                        let chain = stream_guard_set.resolve();
                        let result = chain.apply(buffer_text.clone(), &guard_ctx);
                        if result.retry.is_some() {
                            let reason = result
                                .retry
                                .as_ref()
                                .map(|r| r.reason.as_str())
                                .unwrap_or("guard check failed");
                            // Same structured fields as the in-stream guard failure log.
                            tracing::warn!(
                                guard = ?result.retry.as_ref().map(|r| r.guard_id),
                                reason,
                                "streaming buffer guard check failed on short response — discarding"
                            );
                            let retry_event = json!({
                                "type": "stream_retry",
                                "reason": reason,
                                "session_id": session_id,
                            });
                            yield Ok(Event::default().data(retry_event.to_string()));
                            stream_errored = true;
                        }
                        // buffer_passed is implicitly true since the stream ended —
                        // the break exits the buffer phase loop.
                        break;
                    }
                }
            }

            // Phase 2: Flush buffered chunks + stream remaining
            if !stream_errored {
                // Flush buffered chunks
                for chunk in &buffered_chunks {
                    let chunk_event = json!({
                        "type": "stream_chunk",
                        "delta": chunk.delta,
                        "done": false,
                        "session_id": session_id,
                    });
                    event_bus.publish(chunk_event.to_string());

                    let sse_data = json!({
                        "type": "chunk",
                        "delta": chunk.delta,
                        "model": chunk.model,
                        "finish_reason": chunk.finish_reason,
                    });
                    yield Ok(Event::default().data(sse_data.to_string()));
                }

                // Continue streaming remaining chunks
                while let Some(item) = stream.next().await {
                    match item {
                        Ok(chunk) => {
                            // Warn once, after more than half of the stream budget is used.
                            if !warned_timeout && stream_start.elapsed() > stream_timeout / 2 {
                                warned_timeout = true;
                                let remaining = stream_timeout.saturating_sub(stream_start.elapsed());
                                let warn_event = json!({
                                    "type": "stream_warning",
                                    "message": format!("This request is taking longer than anticipated. Timeout in {}s.", remaining.as_secs()),
                                    "session_id": session_id,
                                });
                                yield Ok(Event::default().data(warn_event.to_string()));
                            }

                            accumulator.push(&chunk);

                            let chunk_event = json!({
                                "type": "stream_chunk",
                                "delta": chunk.delta,
                                "done": false,
                                "session_id": session_id,
                            });
                            event_bus.publish(chunk_event.to_string());

                            let sse_data = json!({
                                "type": "chunk",
                                "delta": chunk.delta,
                                "model": chunk.model,
                                "finish_reason": chunk.finish_reason,
                            });
                            yield Ok(Event::default().data(sse_data.to_string()));
                        }
                        Err(e) => {
                            tracing::error!(error = %e, "streaming chunk error from provider");
                            {
                                // Scoped so the write lock is released before yielding.
                                let mut llm = state_clone.llm.write().await;
                                llm.breakers.record_failure(&provider_prefix);
                                llm.breakers.set_capacity_pressure(&provider_prefix, false);
                            }
                            let err_data = json!({"type": "error", "error": "upstream provider error"});
                            yield Ok(Event::default().data(err_data.to_string()));
                            // Mark the stream as aborted so the partial response is not
                            // finalized or persisted, matching the buffer-phase error path.
                            stream_errored = true;
                            break;
                        }
                    }
                }
            }

            // If the stream was aborted (buffer guard failure or upstream error),
            // skip finalization entirely — do not persist intercepted/partial content.
            // Emit stream_end (not "done") so the dashboard clears _liveStreamTurn
            // and restores the send button.
            if stream_errored {
                // Emit the standard done chunk to close the event bus stream
                let done_chunk = json!({
                    "type": "stream_chunk",
                    "delta": "",
                    "done": true,
                    "session_id": session_id,
                });
                event_bus.publish(done_chunk.to_string());
                event_bus.publish(
                    json!({
                        "type": "agent_idle",
                        "agent_id": agent_id,
                        "workstation": "llm",
                        "session_id": session_id,
                    })
                    .to_string(),
                );
                // Emit stream_end so the dashboard finalizes the message bubble
                let end_event = json!({
                    "type": "stream_end",
                    "session_id": session_id,
                    "turn_id": turn_id,
                    "aborted": true,
                    "content_blocked": false,
                });
                yield Ok(Event::default().data(end_event.to_string()));
            } else {
                // Read the truncation flag first: `finalize()` is called right after.
                let truncated = accumulator.is_truncated();
                let mut unified_resp = accumulator.finalize();

                if truncated {
                    tracing::warn!(session_id = %session_id, "streaming response truncated at size limit");
                    let trunc_event = json!({
                        "type": "stream_warning",
                        "message": "Response truncated: exceeded maximum response size",
                        "session_id": session_id,
                    });
                    yield Ok(Event::default().data(trunc_event.to_string()));
                }

                // ── Post-stream full guard check ──
                // The buffer-phase check only covers the first ~100 chars. This runs
                // the full guard chain on the COMPLETE accumulated response. If a guard
                // fires, we emit stream_replace with the rewritten content so the client
                // can swap out the streamed text the user already saw.
                if !unified_resp.content.is_empty() {
                    let final_guard_scores =
                        super::guard_registry::precompute_guard_scores(
                            &state_clone.semantic_classifier,
                            &unified_resp.content,
                        )
                        .await;
                    let sub_names: Vec<String> =
                        roboticus_db::agents::list_sub_agents(&state_clone.db)
                            .unwrap_or_default()
                            .iter()
                            .map(|a| a.name.to_ascii_lowercase())
                            .collect();
                    let empty_provenance =
                        super::decomposition::DelegationProvenance::default();
                    let guard_ctx = super::guard_registry::GuardContext::for_streaming(
                        &user_content,
                        &ctx_prepared.intents,
                        &agent_id,
                        &selected_model,
                        &empty_provenance,
                        final_guard_scores,
                        sub_names,
                    );
                    let chain = stream_guard_set.resolve();
                    let result = chain.apply(unified_resp.content.clone(), &guard_ctx);

                    if result.retry.is_some() {
                        // Guard caught a late leak — RetryRequested means "re-infer"
                        // but we can't re-run inference on the streaming path (stream
                        // is done). Replace with a generic acknowledgment and persist
                        // the cleaned version. The user can re-send their message.
                        let reason = result
                            .retry
                            .as_ref()
                            .map(|r| r.reason.as_str())
                            .unwrap_or("guard check failed");
                        tracing::warn!(
                            guard = ?result.retry.as_ref().map(|r| r.guard_id),
                            reason,
                            "post-stream guard check failed — replacing streamed content"
                        );
                        let fallback = "I need to rephrase my response. Please send your message again \
                             and I'll answer properly."
                            .to_string();
                        let replace_event = json!({
                            "type": "stream_replace",
                            "reason": reason,
                            "replacement": fallback,
                            "session_id": session_id,
                        });
                        yield Ok(Event::default().data(replace_event.to_string()));
                        // Persist the fallback, not the leaked content
                        unified_resp.content = fallback;
                    } else if result.content != unified_resp.content {
                        // Guard rewrote content (Rewritten verdict, not retry)
                        let replace_event = json!({
                            "type": "stream_replace",
                            "reason": "content cleaned by output filter",
                            "replacement": result.content,
                            "session_id": session_id,
                        });
                        yield Ok(Event::default().data(replace_event.to_string()));
                        unified_resp.content = result.content;
                    }
                }

                // Delegate all post-stream bookkeeping to finalize_stream()
                let fin = super::pipeline::finalize_stream(
                    &state_clone,
                    &session_id,
                    &turn_id,
                    &user_content,
                    &prepared_model,
                    &cache_hash,
                    &unified_resp,
                    &provider_prefix,
                    cost_in,
                    cost_out,
                    stream_start,
                    &system_prompt_hash,
                ).await;

                if fin.content_blocked {
                    let blocked_event = json!({
                        "type": "stream_blocked",
                        "reason": "output safety filter triggered",
                        "session_id": session_id,
                    });
                    yield Ok(Event::default().data(blocked_event.to_string()));
                }

                let done_event = json!({
                    "type": "stream_chunk",
                    "delta": "",
                    "done": true,
                    "session_id": session_id,
                });
                event_bus.publish(done_event.to_string());
                event_bus.publish(
                    json!({
                        "type": "agent_idle",
                        "agent_id": agent_id,
                        "workstation": "llm",
                        "session_id": session_id,
                    })
                    .to_string(),
                );

                let final_event = json!({
                    "type": "stream_end",
                    "session_id": session_id,
                    "turn_id": turn_id,
                    "model": fin.model,
                    "tokens_in": fin.tokens_in,
                    "tokens_out": fin.tokens_out,
                    "content_length": fin.assistant_content.len(),
                    "content_blocked": fin.content_blocked,
                });
                yield Ok(Event::default().data(final_event.to_string()));
            }
        } else {
            // Defensive: neither a shortcut result nor a provider stream is available.
            // Without this branch the stream would end right after `stream_start`,
            // never publishing `agent_idle` or a terminal `stream_end`.
            tracing::error!(
                session_id = %session_id,
                "streaming context has neither a shortcut result nor a provider stream"
            );
            let err_data = json!({"type": "error", "error": "no response stream available"});
            yield Ok(Event::default().data(err_data.to_string()));
            event_bus.publish(
                json!({
                    "type": "agent_idle",
                    "agent_id": agent_id,
                    "workstation": "llm",
                    "session_id": session_id,
                })
                .to_string(),
            );
            let end_event = json!({
                "type": "stream_end",
                "session_id": session_id,
                "turn_id": turn_id,
                "aborted": true,
                "content_blocked": false,
            });
            yield Ok(Event::default().data(end_event.to_string()));
        }
    };

    Ok(Sse::new(sse_stream).keep_alive(KeepAlive::default()))
}